diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,15543 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9983779399837793, + "eval_steps": 154, + "global_step": 1540, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "batch_num_effect_tokens": 8022, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.0013, + "grad_norm": 0.31844592094421387, + "learning_rate": 6.493506493506495e-08, + "loss": 6.3418, + "step": 1 + }, + { + "batch_num_effect_tokens": 8055, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.0026, + "grad_norm": 0.26869267225265503, + "learning_rate": 1.298701298701299e-07, + "loss": 6.3926, + "step": 2 + }, + { + "batch_num_effect_tokens": 8015, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 0.00389, + "grad_norm": 0.2948790490627289, + "learning_rate": 1.948051948051948e-07, + "loss": 6.2676, + "step": 3 + }, + { + "batch_num_effect_tokens": 8021, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 0.00519, + "grad_norm": 0.3147197663784027, + "learning_rate": 2.597402597402598e-07, + "loss": 6.4707, + "step": 4 + }, + { + "batch_num_effect_tokens": 8064, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.00649, + "grad_norm": 0.3039565086364746, + "learning_rate": 3.2467532467532465e-07, + "loss": 7.0977, + "step": 5 + }, + { + "batch_num_effect_tokens": 8046, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.00779, + "grad_norm": 0.28105345368385315, + "learning_rate": 3.896103896103896e-07, + "loss": 6.8184, + "step": 6 + }, + { + "batch_num_effect_tokens": 8037, + "batch_num_samples": 16, + "batch_num_tokens": 8189, + "epoch": 0.00908, + "grad_norm": 0.2680772840976715, + "learning_rate": 4.5454545454545457e-07, + "loss": 6.5508, + "step": 7 + }, + { + "batch_num_effect_tokens": 7887, + "batch_num_samples": 23, + "batch_num_tokens": 8105, + "epoch": 0.01038, + "grad_norm": 0.3874300718307495, + "learning_rate": 5.194805194805196e-07, + "loss": 6.1758, + "step": 8 + }, + { + "batch_num_effect_tokens": 8027, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.01168, + "grad_norm": 0.29857560992240906, + "learning_rate": 5.844155844155845e-07, + "loss": 6.7598, + "step": 9 + }, + { + "batch_num_effect_tokens": 7892, + "batch_num_samples": 19, + "batch_num_tokens": 8119, + "epoch": 0.01298, + "grad_norm": 0.3115284740924835, + "learning_rate": 6.493506493506493e-07, + "loss": 6.3301, + "step": 10 + }, + { + "batch_num_effect_tokens": 7943, + "batch_num_samples": 23, + "batch_num_tokens": 8192, + "epoch": 0.01427, + "grad_norm": 0.38096851110458374, + "learning_rate": 7.142857142857143e-07, + "loss": 6.2168, + "step": 11 + }, + { + "batch_num_effect_tokens": 8074, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.01557, + "grad_norm": 0.24052490293979645, + "learning_rate": 7.792207792207792e-07, + "loss": 6.0957, + "step": 12 + }, + { + "batch_num_effect_tokens": 7983, + "batch_num_samples": 14, + "batch_num_tokens": 8146, + "epoch": 0.01687, + "grad_norm": 0.2261405885219574, + "learning_rate": 8.441558441558442e-07, + "loss": 5.8008, + "step": 13 + }, + { + "batch_num_effect_tokens": 8038, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.01817, + "grad_norm": 0.25014105439186096, + "learning_rate": 9.090909090909091e-07, + "loss": 6.459, + "step": 14 + }, + { + "batch_num_effect_tokens": 8011, + "batch_num_samples": 14, + "batch_num_tokens": 8177, + "epoch": 0.01946, + "grad_norm": 0.2601844370365143, + "learning_rate": 9.740259740259742e-07, + "loss": 6.2109, + "step": 15 + }, + { + "batch_num_effect_tokens": 8040, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.02076, + "grad_norm": 0.2764773368835449, + "learning_rate": 1.0389610389610392e-06, + "loss": 6.0508, + "step": 16 + }, + { + "batch_num_effect_tokens": 7991, + "batch_num_samples": 14, + "batch_num_tokens": 8128, + "epoch": 0.02206, + "grad_norm": 0.23783260583877563, + "learning_rate": 1.103896103896104e-06, + "loss": 6.3242, + "step": 17 + }, + { + "batch_num_effect_tokens": 8013, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.02336, + "grad_norm": 0.21356581151485443, + "learning_rate": 1.168831168831169e-06, + "loss": 6.1641, + "step": 18 + }, + { + "batch_num_effect_tokens": 7974, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.02466, + "grad_norm": 0.2238253653049469, + "learning_rate": 1.2337662337662338e-06, + "loss": 5.8047, + "step": 19 + }, + { + "batch_num_effect_tokens": 8033, + "batch_num_samples": 14, + "batch_num_tokens": 8170, + "epoch": 0.02595, + "grad_norm": 0.21132396161556244, + "learning_rate": 1.2987012987012986e-06, + "loss": 5.7354, + "step": 20 + }, + { + "batch_num_effect_tokens": 7901, + "batch_num_samples": 22, + "batch_num_tokens": 8128, + "epoch": 0.02725, + "grad_norm": 0.2627267837524414, + "learning_rate": 1.3636363636363636e-06, + "loss": 6.2051, + "step": 21 + }, + { + "batch_num_effect_tokens": 7997, + "batch_num_samples": 17, + "batch_num_tokens": 8165, + "epoch": 0.02855, + "grad_norm": 0.23212876915931702, + "learning_rate": 1.4285714285714286e-06, + "loss": 6.1699, + "step": 22 + }, + { + "batch_num_effect_tokens": 8045, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.02985, + "grad_norm": 0.20811595022678375, + "learning_rate": 1.4935064935064936e-06, + "loss": 5.6621, + "step": 23 + }, + { + "batch_num_effect_tokens": 8066, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.03114, + "grad_norm": 0.205363467335701, + "learning_rate": 1.5584415584415584e-06, + "loss": 5.7109, + "step": 24 + }, + { + "batch_num_effect_tokens": 7906, + "batch_num_samples": 20, + "batch_num_tokens": 8091, + "epoch": 0.03244, + "grad_norm": 0.248266339302063, + "learning_rate": 1.6233766233766235e-06, + "loss": 5.8008, + "step": 25 + }, + { + "batch_num_effect_tokens": 8072, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.03374, + "grad_norm": 0.22567051649093628, + "learning_rate": 1.6883116883116885e-06, + "loss": 5.668, + "step": 26 + }, + { + "batch_num_effect_tokens": 8014, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.03504, + "grad_norm": 0.21363011002540588, + "learning_rate": 1.7532467532467535e-06, + "loss": 5.9082, + "step": 27 + }, + { + "batch_num_effect_tokens": 7982, + "batch_num_samples": 14, + "batch_num_tokens": 8142, + "epoch": 0.03633, + "grad_norm": 0.17314466834068298, + "learning_rate": 1.8181818181818183e-06, + "loss": 5.8809, + "step": 28 + }, + { + "batch_num_effect_tokens": 8052, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.03763, + "grad_norm": 0.1672496497631073, + "learning_rate": 1.8831168831168833e-06, + "loss": 5.2207, + "step": 29 + }, + { + "batch_num_effect_tokens": 7987, + "batch_num_samples": 16, + "batch_num_tokens": 8155, + "epoch": 0.03893, + "grad_norm": 0.17449834942817688, + "learning_rate": 1.9480519480519483e-06, + "loss": 5.7324, + "step": 30 + }, + { + "batch_num_effect_tokens": 8046, + "batch_num_samples": 14, + "batch_num_tokens": 8191, + "epoch": 0.04023, + "grad_norm": 0.1806560754776001, + "learning_rate": 2.012987012987013e-06, + "loss": 5.2949, + "step": 31 + }, + { + "batch_num_effect_tokens": 7998, + "batch_num_samples": 20, + "batch_num_tokens": 8192, + "epoch": 0.04152, + "grad_norm": 0.1780816614627838, + "learning_rate": 2.0779220779220784e-06, + "loss": 5.6357, + "step": 32 + }, + { + "batch_num_effect_tokens": 7915, + "batch_num_samples": 18, + "batch_num_tokens": 8090, + "epoch": 0.04282, + "grad_norm": 0.1659129112958908, + "learning_rate": 2.1428571428571427e-06, + "loss": 5.2832, + "step": 33 + }, + { + "batch_num_effect_tokens": 8050, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.04412, + "grad_norm": 0.18337897956371307, + "learning_rate": 2.207792207792208e-06, + "loss": 5.4307, + "step": 34 + }, + { + "batch_num_effect_tokens": 7927, + "batch_num_samples": 25, + "batch_num_tokens": 8154, + "epoch": 0.04542, + "grad_norm": 0.20109063386917114, + "learning_rate": 2.2727272727272728e-06, + "loss": 5.876, + "step": 35 + }, + { + "batch_num_effect_tokens": 7992, + "batch_num_samples": 17, + "batch_num_tokens": 8176, + "epoch": 0.04672, + "grad_norm": 0.15896430611610413, + "learning_rate": 2.337662337662338e-06, + "loss": 5.251, + "step": 36 + }, + { + "batch_num_effect_tokens": 8000, + "batch_num_samples": 14, + "batch_num_tokens": 8182, + "epoch": 0.04801, + "grad_norm": 0.15734650194644928, + "learning_rate": 2.402597402597403e-06, + "loss": 4.8984, + "step": 37 + }, + { + "batch_num_effect_tokens": 8032, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.04931, + "grad_norm": 0.17897239327430725, + "learning_rate": 2.4675324675324676e-06, + "loss": 5.3438, + "step": 38 + }, + { + "batch_num_effect_tokens": 8032, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.05061, + "grad_norm": 0.1797408014535904, + "learning_rate": 2.5324675324675324e-06, + "loss": 5.3223, + "step": 39 + }, + { + "batch_num_effect_tokens": 7991, + "batch_num_samples": 15, + "batch_num_tokens": 8156, + "epoch": 0.05191, + "grad_norm": 0.16180779039859772, + "learning_rate": 2.597402597402597e-06, + "loss": 5.0254, + "step": 40 + }, + { + "batch_num_effect_tokens": 7865, + "batch_num_samples": 17, + "batch_num_tokens": 8029, + "epoch": 0.0532, + "grad_norm": 0.1604710966348648, + "learning_rate": 2.6623376623376624e-06, + "loss": 5.3867, + "step": 41 + }, + { + "batch_num_effect_tokens": 7967, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.0545, + "grad_norm": 0.15748171508312225, + "learning_rate": 2.7272727272727272e-06, + "loss": 4.9678, + "step": 42 + }, + { + "batch_num_effect_tokens": 8047, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.0558, + "grad_norm": 0.163799449801445, + "learning_rate": 2.7922077922077925e-06, + "loss": 5.1172, + "step": 43 + }, + { + "batch_num_effect_tokens": 7880, + "batch_num_samples": 26, + "batch_num_tokens": 8102, + "epoch": 0.0571, + "grad_norm": 0.1661587953567505, + "learning_rate": 2.8571428571428573e-06, + "loss": 4.8545, + "step": 44 + }, + { + "batch_num_effect_tokens": 7964, + "batch_num_samples": 23, + "batch_num_tokens": 8143, + "epoch": 0.05839, + "grad_norm": 0.1683725267648697, + "learning_rate": 2.922077922077922e-06, + "loss": 5.2148, + "step": 45 + }, + { + "batch_num_effect_tokens": 8010, + "batch_num_samples": 15, + "batch_num_tokens": 8164, + "epoch": 0.05969, + "grad_norm": 0.15301677584648132, + "learning_rate": 2.9870129870129873e-06, + "loss": 5.0928, + "step": 46 + }, + { + "batch_num_effect_tokens": 7915, + "batch_num_samples": 17, + "batch_num_tokens": 8087, + "epoch": 0.06099, + "grad_norm": 0.14993946254253387, + "learning_rate": 3.051948051948052e-06, + "loss": 5.0557, + "step": 47 + }, + { + "batch_num_effect_tokens": 7994, + "batch_num_samples": 14, + "batch_num_tokens": 8128, + "epoch": 0.06229, + "grad_norm": 0.15920305252075195, + "learning_rate": 3.116883116883117e-06, + "loss": 5.209, + "step": 48 + }, + { + "batch_num_effect_tokens": 8016, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 0.06358, + "grad_norm": 0.1530810445547104, + "learning_rate": 3.181818181818182e-06, + "loss": 5.3164, + "step": 49 + }, + { + "batch_num_effect_tokens": 7979, + "batch_num_samples": 14, + "batch_num_tokens": 8121, + "epoch": 0.06488, + "grad_norm": 0.14603291451931, + "learning_rate": 3.246753246753247e-06, + "loss": 5.1729, + "step": 50 + }, + { + "batch_num_effect_tokens": 8043, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.06618, + "grad_norm": 0.14708639681339264, + "learning_rate": 3.311688311688312e-06, + "loss": 4.7744, + "step": 51 + }, + { + "batch_num_effect_tokens": 8067, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.06748, + "grad_norm": 0.14484994113445282, + "learning_rate": 3.376623376623377e-06, + "loss": 4.9619, + "step": 52 + }, + { + "batch_num_effect_tokens": 7904, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 0.06878, + "grad_norm": 0.1400662362575531, + "learning_rate": 3.4415584415584418e-06, + "loss": 4.7002, + "step": 53 + }, + { + "batch_num_effect_tokens": 8053, + "batch_num_samples": 14, + "batch_num_tokens": 8191, + "epoch": 0.07007, + "grad_norm": 0.14303331077098846, + "learning_rate": 3.506493506493507e-06, + "loss": 4.6895, + "step": 54 + }, + { + "batch_num_effect_tokens": 8059, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.07137, + "grad_norm": 0.14038386940956116, + "learning_rate": 3.5714285714285718e-06, + "loss": 5.3848, + "step": 55 + }, + { + "batch_num_effect_tokens": 7905, + "batch_num_samples": 17, + "batch_num_tokens": 8075, + "epoch": 0.07267, + "grad_norm": 0.1472426950931549, + "learning_rate": 3.6363636363636366e-06, + "loss": 4.9775, + "step": 56 + }, + { + "batch_num_effect_tokens": 8021, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 0.07397, + "grad_norm": 0.15343016386032104, + "learning_rate": 3.701298701298702e-06, + "loss": 5.2734, + "step": 57 + }, + { + "batch_num_effect_tokens": 8057, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.07526, + "grad_norm": 0.1423100382089615, + "learning_rate": 3.7662337662337666e-06, + "loss": 5.3379, + "step": 58 + }, + { + "batch_num_effect_tokens": 8052, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.07656, + "grad_norm": 0.14533978700637817, + "learning_rate": 3.831168831168831e-06, + "loss": 4.8135, + "step": 59 + }, + { + "batch_num_effect_tokens": 8007, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.07786, + "grad_norm": 0.14846143126487732, + "learning_rate": 3.896103896103897e-06, + "loss": 4.9902, + "step": 60 + }, + { + "batch_num_effect_tokens": 7928, + "batch_num_samples": 17, + "batch_num_tokens": 8113, + "epoch": 0.07916, + "grad_norm": 0.1590896099805832, + "learning_rate": 3.961038961038962e-06, + "loss": 5.0049, + "step": 61 + }, + { + "batch_num_effect_tokens": 8023, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.08045, + "grad_norm": 0.1540524810552597, + "learning_rate": 4.025974025974026e-06, + "loss": 4.8965, + "step": 62 + }, + { + "batch_num_effect_tokens": 8055, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.08175, + "grad_norm": 0.13902273774147034, + "learning_rate": 4.0909090909090915e-06, + "loss": 4.9961, + "step": 63 + }, + { + "batch_num_effect_tokens": 7977, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.08305, + "grad_norm": 0.14425402879714966, + "learning_rate": 4.155844155844157e-06, + "loss": 5.0908, + "step": 64 + }, + { + "batch_num_effect_tokens": 8034, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.08435, + "grad_norm": 0.15884140133857727, + "learning_rate": 4.220779220779221e-06, + "loss": 5.2842, + "step": 65 + }, + { + "batch_num_effect_tokens": 8002, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.08564, + "grad_norm": 0.14033250510692596, + "learning_rate": 4.2857142857142855e-06, + "loss": 5.0371, + "step": 66 + }, + { + "batch_num_effect_tokens": 7999, + "batch_num_samples": 19, + "batch_num_tokens": 8192, + "epoch": 0.08694, + "grad_norm": 0.15526610612869263, + "learning_rate": 4.350649350649351e-06, + "loss": 4.6426, + "step": 67 + }, + { + "batch_num_effect_tokens": 7806, + "batch_num_samples": 17, + "batch_num_tokens": 7994, + "epoch": 0.08824, + "grad_norm": 0.1523507982492447, + "learning_rate": 4.415584415584416e-06, + "loss": 5.1045, + "step": 68 + }, + { + "batch_num_effect_tokens": 8065, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.08954, + "grad_norm": 0.15373018383979797, + "learning_rate": 4.48051948051948e-06, + "loss": 5.3213, + "step": 69 + }, + { + "batch_num_effect_tokens": 7953, + "batch_num_samples": 14, + "batch_num_tokens": 8076, + "epoch": 0.09084, + "grad_norm": 0.16291280090808868, + "learning_rate": 4.5454545454545455e-06, + "loss": 4.9736, + "step": 70 + }, + { + "batch_num_effect_tokens": 8030, + "batch_num_samples": 19, + "batch_num_tokens": 8192, + "epoch": 0.09213, + "grad_norm": 0.135183647274971, + "learning_rate": 4.610389610389611e-06, + "loss": 5.1475, + "step": 71 + }, + { + "batch_num_effect_tokens": 8022, + "batch_num_samples": 17, + "batch_num_tokens": 8181, + "epoch": 0.09343, + "grad_norm": 0.13779762387275696, + "learning_rate": 4.675324675324676e-06, + "loss": 4.999, + "step": 72 + }, + { + "batch_num_effect_tokens": 7968, + "batch_num_samples": 14, + "batch_num_tokens": 8152, + "epoch": 0.09473, + "grad_norm": 0.1394745260477066, + "learning_rate": 4.74025974025974e-06, + "loss": 4.9697, + "step": 73 + }, + { + "batch_num_effect_tokens": 7997, + "batch_num_samples": 14, + "batch_num_tokens": 8149, + "epoch": 0.09603, + "grad_norm": 0.14722809195518494, + "learning_rate": 4.805194805194806e-06, + "loss": 5.4229, + "step": 74 + }, + { + "batch_num_effect_tokens": 8051, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.09732, + "grad_norm": 0.1443023979663849, + "learning_rate": 4.870129870129871e-06, + "loss": 4.8057, + "step": 75 + }, + { + "batch_num_effect_tokens": 7998, + "batch_num_samples": 17, + "batch_num_tokens": 8171, + "epoch": 0.09862, + "grad_norm": 0.1304166615009308, + "learning_rate": 4.935064935064935e-06, + "loss": 5.1406, + "step": 76 + }, + { + "batch_num_effect_tokens": 8004, + "batch_num_samples": 19, + "batch_num_tokens": 8176, + "epoch": 0.09992, + "grad_norm": 0.14469240605831146, + "learning_rate": 5e-06, + "loss": 4.9717, + "step": 77 + }, + { + "batch_num_effect_tokens": 7992, + "batch_num_samples": 19, + "batch_num_tokens": 8192, + "epoch": 0.10122, + "grad_norm": 0.14036931097507477, + "learning_rate": 5.064935064935065e-06, + "loss": 4.6973, + "step": 78 + }, + { + "batch_num_effect_tokens": 7970, + "batch_num_samples": 20, + "batch_num_tokens": 8176, + "epoch": 0.10251, + "grad_norm": 0.1444014012813568, + "learning_rate": 5.12987012987013e-06, + "loss": 4.8516, + "step": 79 + }, + { + "batch_num_effect_tokens": 7878, + "batch_num_samples": 27, + "batch_num_tokens": 8114, + "epoch": 0.10381, + "grad_norm": 0.15033836662769318, + "learning_rate": 5.194805194805194e-06, + "loss": 4.6299, + "step": 80 + }, + { + "batch_num_effect_tokens": 8023, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 0.10511, + "grad_norm": 0.1547461301088333, + "learning_rate": 5.2597402597402605e-06, + "loss": 5.0889, + "step": 81 + }, + { + "batch_num_effect_tokens": 8025, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.10641, + "grad_norm": 0.14421503245830536, + "learning_rate": 5.324675324675325e-06, + "loss": 4.9414, + "step": 82 + }, + { + "batch_num_effect_tokens": 8060, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.1077, + "grad_norm": 0.13420124351978302, + "learning_rate": 5.38961038961039e-06, + "loss": 4.8047, + "step": 83 + }, + { + "batch_num_effect_tokens": 8065, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.109, + "grad_norm": 0.1364876627922058, + "learning_rate": 5.4545454545454545e-06, + "loss": 5.0352, + "step": 84 + }, + { + "batch_num_effect_tokens": 8031, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.1103, + "grad_norm": 0.13382357358932495, + "learning_rate": 5.5194805194805205e-06, + "loss": 4.8984, + "step": 85 + }, + { + "batch_num_effect_tokens": 7784, + "batch_num_samples": 28, + "batch_num_tokens": 8032, + "epoch": 0.1116, + "grad_norm": 0.1488446742296219, + "learning_rate": 5.584415584415585e-06, + "loss": 5.2764, + "step": 86 + }, + { + "batch_num_effect_tokens": 8057, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.1129, + "grad_norm": 0.15475524961948395, + "learning_rate": 5.64935064935065e-06, + "loss": 5.1055, + "step": 87 + }, + { + "batch_num_effect_tokens": 8030, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.11419, + "grad_norm": 0.1377478688955307, + "learning_rate": 5.7142857142857145e-06, + "loss": 4.835, + "step": 88 + }, + { + "batch_num_effect_tokens": 7766, + "batch_num_samples": 26, + "batch_num_tokens": 8030, + "epoch": 0.11549, + "grad_norm": 0.1429220736026764, + "learning_rate": 5.77922077922078e-06, + "loss": 4.8867, + "step": 89 + }, + { + "batch_num_effect_tokens": 7998, + "batch_num_samples": 16, + "batch_num_tokens": 8144, + "epoch": 0.11679, + "grad_norm": 0.14639155566692352, + "learning_rate": 5.844155844155844e-06, + "loss": 5.1807, + "step": 90 + }, + { + "batch_num_effect_tokens": 8019, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 0.11809, + "grad_norm": 0.1477179378271103, + "learning_rate": 5.90909090909091e-06, + "loss": 4.6084, + "step": 91 + }, + { + "batch_num_effect_tokens": 7969, + "batch_num_samples": 14, + "batch_num_tokens": 8136, + "epoch": 0.11938, + "grad_norm": 0.1457316130399704, + "learning_rate": 5.9740259740259746e-06, + "loss": 5.335, + "step": 92 + }, + { + "batch_num_effect_tokens": 7885, + "batch_num_samples": 20, + "batch_num_tokens": 8192, + "epoch": 0.12068, + "grad_norm": 0.1372513622045517, + "learning_rate": 6.03896103896104e-06, + "loss": 4.6279, + "step": 93 + }, + { + "batch_num_effect_tokens": 7953, + "batch_num_samples": 14, + "batch_num_tokens": 8107, + "epoch": 0.12198, + "grad_norm": 0.14941509068012238, + "learning_rate": 6.103896103896104e-06, + "loss": 4.7344, + "step": 94 + }, + { + "batch_num_effect_tokens": 8040, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.12328, + "grad_norm": 0.14506636559963226, + "learning_rate": 6.168831168831169e-06, + "loss": 5.2168, + "step": 95 + }, + { + "batch_num_effect_tokens": 8001, + "batch_num_samples": 19, + "batch_num_tokens": 8192, + "epoch": 0.12457, + "grad_norm": 0.15786777436733246, + "learning_rate": 6.233766233766234e-06, + "loss": 5.1094, + "step": 96 + }, + { + "batch_num_effect_tokens": 8032, + "batch_num_samples": 14, + "batch_num_tokens": 8149, + "epoch": 0.12587, + "grad_norm": 0.14335079491138458, + "learning_rate": 6.2987012987013e-06, + "loss": 4.7344, + "step": 97 + }, + { + "batch_num_effect_tokens": 8031, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.12717, + "grad_norm": 0.1454756110906601, + "learning_rate": 6.363636363636364e-06, + "loss": 4.9893, + "step": 98 + }, + { + "batch_num_effect_tokens": 8021, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 0.12847, + "grad_norm": 0.13855211436748505, + "learning_rate": 6.4285714285714295e-06, + "loss": 4.6758, + "step": 99 + }, + { + "batch_num_effect_tokens": 8052, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.12976, + "grad_norm": 0.1334666609764099, + "learning_rate": 6.493506493506494e-06, + "loss": 5.1504, + "step": 100 + }, + { + "batch_num_effect_tokens": 8061, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.13106, + "grad_norm": 0.14907817542552948, + "learning_rate": 6.55844155844156e-06, + "loss": 4.8926, + "step": 101 + }, + { + "batch_num_effect_tokens": 8050, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.13236, + "grad_norm": 0.134397953748703, + "learning_rate": 6.623376623376624e-06, + "loss": 4.7324, + "step": 102 + }, + { + "batch_num_effect_tokens": 7971, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.13366, + "grad_norm": 0.13027189671993256, + "learning_rate": 6.688311688311689e-06, + "loss": 4.7451, + "step": 103 + }, + { + "batch_num_effect_tokens": 8018, + "batch_num_samples": 14, + "batch_num_tokens": 8144, + "epoch": 0.13496, + "grad_norm": 0.1361909955739975, + "learning_rate": 6.753246753246754e-06, + "loss": 5.1758, + "step": 104 + }, + { + "batch_num_effect_tokens": 8053, + "batch_num_samples": 14, + "batch_num_tokens": 8190, + "epoch": 0.13625, + "grad_norm": 0.14071358740329742, + "learning_rate": 6.818181818181818e-06, + "loss": 4.8379, + "step": 105 + }, + { + "batch_num_effect_tokens": 8037, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.13755, + "grad_norm": 0.14475062489509583, + "learning_rate": 6.8831168831168835e-06, + "loss": 4.9668, + "step": 106 + }, + { + "batch_num_effect_tokens": 8044, + "batch_num_samples": 14, + "batch_num_tokens": 8189, + "epoch": 0.13885, + "grad_norm": 0.1445104032754898, + "learning_rate": 6.948051948051948e-06, + "loss": 4.8447, + "step": 107 + }, + { + "batch_num_effect_tokens": 8018, + "batch_num_samples": 20, + "batch_num_tokens": 8192, + "epoch": 0.14015, + "grad_norm": 0.14156053960323334, + "learning_rate": 7.012987012987014e-06, + "loss": 5.1055, + "step": 108 + }, + { + "batch_num_effect_tokens": 8036, + "batch_num_samples": 18, + "batch_num_tokens": 8191, + "epoch": 0.14144, + "grad_norm": 0.1461244523525238, + "learning_rate": 7.077922077922078e-06, + "loss": 5.0107, + "step": 109 + }, + { + "batch_num_effect_tokens": 8061, + "batch_num_samples": 14, + "batch_num_tokens": 8184, + "epoch": 0.14274, + "grad_norm": 0.14059850573539734, + "learning_rate": 7.1428571428571436e-06, + "loss": 5.1133, + "step": 110 + }, + { + "batch_num_effect_tokens": 7994, + "batch_num_samples": 15, + "batch_num_tokens": 8140, + "epoch": 0.14404, + "grad_norm": 0.14305299520492554, + "learning_rate": 7.207792207792208e-06, + "loss": 4.6953, + "step": 111 + }, + { + "batch_num_effect_tokens": 7981, + "batch_num_samples": 14, + "batch_num_tokens": 8144, + "epoch": 0.14534, + "grad_norm": 0.1341462880373001, + "learning_rate": 7.272727272727273e-06, + "loss": 4.8037, + "step": 112 + }, + { + "batch_num_effect_tokens": 7913, + "batch_num_samples": 15, + "batch_num_tokens": 8098, + "epoch": 0.14663, + "grad_norm": 0.14198477566242218, + "learning_rate": 7.3376623376623375e-06, + "loss": 4.8252, + "step": 113 + }, + { + "batch_num_effect_tokens": 8045, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.14793, + "grad_norm": 0.1437690556049347, + "learning_rate": 7.402597402597404e-06, + "loss": 5.0996, + "step": 114 + }, + { + "batch_num_effect_tokens": 7999, + "batch_num_samples": 15, + "batch_num_tokens": 8152, + "epoch": 0.14923, + "grad_norm": 0.13746197521686554, + "learning_rate": 7.467532467532468e-06, + "loss": 4.8242, + "step": 115 + }, + { + "batch_num_effect_tokens": 8036, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.15053, + "grad_norm": 0.1328958421945572, + "learning_rate": 7.532467532467533e-06, + "loss": 5.4316, + "step": 116 + }, + { + "batch_num_effect_tokens": 8047, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.15182, + "grad_norm": 0.14948073029518127, + "learning_rate": 7.597402597402598e-06, + "loss": 5.1084, + "step": 117 + }, + { + "batch_num_effect_tokens": 7937, + "batch_num_samples": 17, + "batch_num_tokens": 8139, + "epoch": 0.15312, + "grad_norm": 0.14273382723331451, + "learning_rate": 7.662337662337663e-06, + "loss": 5.0645, + "step": 118 + }, + { + "batch_num_effect_tokens": 8062, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.15442, + "grad_norm": 0.13271141052246094, + "learning_rate": 7.727272727272727e-06, + "loss": 4.7783, + "step": 119 + }, + { + "batch_num_effect_tokens": 8065, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.15572, + "grad_norm": 0.14579838514328003, + "learning_rate": 7.792207792207793e-06, + "loss": 5.0703, + "step": 120 + }, + { + "batch_num_effect_tokens": 7983, + "batch_num_samples": 21, + "batch_num_tokens": 8192, + "epoch": 0.15702, + "grad_norm": 0.1460658609867096, + "learning_rate": 7.857142857142858e-06, + "loss": 4.7344, + "step": 121 + }, + { + "batch_num_effect_tokens": 8014, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.15831, + "grad_norm": 0.1394878476858139, + "learning_rate": 7.922077922077924e-06, + "loss": 5.0918, + "step": 122 + }, + { + "batch_num_effect_tokens": 8001, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.15961, + "grad_norm": 0.1350386142730713, + "learning_rate": 7.987012987012988e-06, + "loss": 5.291, + "step": 123 + }, + { + "batch_num_effect_tokens": 7925, + "batch_num_samples": 15, + "batch_num_tokens": 8094, + "epoch": 0.16091, + "grad_norm": 0.1350218653678894, + "learning_rate": 8.051948051948052e-06, + "loss": 5.0156, + "step": 124 + }, + { + "batch_num_effect_tokens": 8070, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.16221, + "grad_norm": 0.1472414880990982, + "learning_rate": 8.116883116883117e-06, + "loss": 4.998, + "step": 125 + }, + { + "batch_num_effect_tokens": 8035, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.1635, + "grad_norm": 0.14445587992668152, + "learning_rate": 8.181818181818183e-06, + "loss": 5.0547, + "step": 126 + }, + { + "batch_num_effect_tokens": 8012, + "batch_num_samples": 14, + "batch_num_tokens": 8177, + "epoch": 0.1648, + "grad_norm": 0.14494451880455017, + "learning_rate": 8.246753246753247e-06, + "loss": 4.6123, + "step": 127 + }, + { + "batch_num_effect_tokens": 8032, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.1661, + "grad_norm": 0.14212098717689514, + "learning_rate": 8.311688311688313e-06, + "loss": 4.6992, + "step": 128 + }, + { + "batch_num_effect_tokens": 8029, + "batch_num_samples": 19, + "batch_num_tokens": 8192, + "epoch": 0.1674, + "grad_norm": 0.1644868403673172, + "learning_rate": 8.376623376623378e-06, + "loss": 5.2734, + "step": 129 + }, + { + "batch_num_effect_tokens": 7962, + "batch_num_samples": 15, + "batch_num_tokens": 8158, + "epoch": 0.16869, + "grad_norm": 0.13638760149478912, + "learning_rate": 8.441558441558442e-06, + "loss": 4.7773, + "step": 130 + }, + { + "batch_num_effect_tokens": 7940, + "batch_num_samples": 14, + "batch_num_tokens": 8101, + "epoch": 0.16999, + "grad_norm": 0.14820170402526855, + "learning_rate": 8.506493506493507e-06, + "loss": 4.8438, + "step": 131 + }, + { + "batch_num_effect_tokens": 8035, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.17129, + "grad_norm": 0.1421528309583664, + "learning_rate": 8.571428571428571e-06, + "loss": 5.0254, + "step": 132 + }, + { + "batch_num_effect_tokens": 7936, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.17259, + "grad_norm": 0.13923408091068268, + "learning_rate": 8.636363636363637e-06, + "loss": 4.7812, + "step": 133 + }, + { + "batch_num_effect_tokens": 7989, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.17388, + "grad_norm": 0.15315015614032745, + "learning_rate": 8.701298701298701e-06, + "loss": 4.834, + "step": 134 + }, + { + "batch_num_effect_tokens": 7984, + "batch_num_samples": 21, + "batch_num_tokens": 8192, + "epoch": 0.17518, + "grad_norm": 0.1589311957359314, + "learning_rate": 8.766233766233767e-06, + "loss": 4.9219, + "step": 135 + }, + { + "batch_num_effect_tokens": 8006, + "batch_num_samples": 14, + "batch_num_tokens": 8160, + "epoch": 0.17648, + "grad_norm": 0.14829252660274506, + "learning_rate": 8.831168831168832e-06, + "loss": 4.9746, + "step": 136 + }, + { + "batch_num_effect_tokens": 7960, + "batch_num_samples": 22, + "batch_num_tokens": 8192, + "epoch": 0.17778, + "grad_norm": 0.14853787422180176, + "learning_rate": 8.896103896103896e-06, + "loss": 5.0918, + "step": 137 + }, + { + "batch_num_effect_tokens": 8031, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.17908, + "grad_norm": 0.13866592943668365, + "learning_rate": 8.96103896103896e-06, + "loss": 4.8525, + "step": 138 + }, + { + "batch_num_effect_tokens": 8002, + "batch_num_samples": 17, + "batch_num_tokens": 8179, + "epoch": 0.18037, + "grad_norm": 0.1399109810590744, + "learning_rate": 9.025974025974027e-06, + "loss": 4.9492, + "step": 139 + }, + { + "batch_num_effect_tokens": 8039, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.18167, + "grad_norm": 0.14804702997207642, + "learning_rate": 9.090909090909091e-06, + "loss": 4.8105, + "step": 140 + }, + { + "batch_num_effect_tokens": 8007, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 0.18297, + "grad_norm": 0.13978514075279236, + "learning_rate": 9.155844155844157e-06, + "loss": 4.6553, + "step": 141 + }, + { + "batch_num_effect_tokens": 7962, + "batch_num_samples": 14, + "batch_num_tokens": 8119, + "epoch": 0.18427, + "grad_norm": 0.1482185572385788, + "learning_rate": 9.220779220779221e-06, + "loss": 4.8184, + "step": 142 + }, + { + "batch_num_effect_tokens": 7988, + "batch_num_samples": 22, + "batch_num_tokens": 8189, + "epoch": 0.18556, + "grad_norm": 0.1415347456932068, + "learning_rate": 9.285714285714288e-06, + "loss": 4.8008, + "step": 143 + }, + { + "batch_num_effect_tokens": 7989, + "batch_num_samples": 23, + "batch_num_tokens": 8192, + "epoch": 0.18686, + "grad_norm": 0.15562182664871216, + "learning_rate": 9.350649350649352e-06, + "loss": 5.4648, + "step": 144 + }, + { + "batch_num_effect_tokens": 8026, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.18816, + "grad_norm": 0.14722460508346558, + "learning_rate": 9.415584415584416e-06, + "loss": 4.7441, + "step": 145 + }, + { + "batch_num_effect_tokens": 7974, + "batch_num_samples": 15, + "batch_num_tokens": 8146, + "epoch": 0.18946, + "grad_norm": 0.13538284599781036, + "learning_rate": 9.48051948051948e-06, + "loss": 4.5645, + "step": 146 + }, + { + "batch_num_effect_tokens": 7967, + "batch_num_samples": 19, + "batch_num_tokens": 8176, + "epoch": 0.19075, + "grad_norm": 0.1557544320821762, + "learning_rate": 9.545454545454547e-06, + "loss": 4.9785, + "step": 147 + }, + { + "batch_num_effect_tokens": 8031, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.19205, + "grad_norm": 0.15202026069164276, + "learning_rate": 9.610389610389611e-06, + "loss": 4.8027, + "step": 148 + }, + { + "batch_num_effect_tokens": 8032, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.19335, + "grad_norm": 0.14528276026248932, + "learning_rate": 9.675324675324677e-06, + "loss": 4.7441, + "step": 149 + }, + { + "batch_num_effect_tokens": 8048, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.19465, + "grad_norm": 0.15638510882854462, + "learning_rate": 9.740259740259742e-06, + "loss": 4.9463, + "step": 150 + }, + { + "batch_num_effect_tokens": 8070, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.19594, + "grad_norm": 0.14898745715618134, + "learning_rate": 9.805194805194806e-06, + "loss": 4.9248, + "step": 151 + }, + { + "batch_num_effect_tokens": 8027, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.19724, + "grad_norm": 0.13369755446910858, + "learning_rate": 9.87012987012987e-06, + "loss": 4.9199, + "step": 152 + }, + { + "batch_num_effect_tokens": 8070, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.19854, + "grad_norm": 0.14535720646381378, + "learning_rate": 9.935064935064936e-06, + "loss": 4.9004, + "step": 153 + }, + { + "batch_num_effect_tokens": 8061, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.19984, + "grad_norm": 0.1457190215587616, + "learning_rate": 1e-05, + "loss": 5.2598, + "step": 154 + }, + { + "batch_num_effect_tokens": 8061, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.19984, + "eval_eval_loss": 0.6176656484603882, + "eval_eval_runtime": 115.0266, + "eval_eval_samples_per_second": 43.468, + "eval_eval_steps_per_second": 2.721, + "step": 154 + }, + { + "batch_num_effect_tokens": 7946, + "batch_num_samples": 15, + "batch_num_tokens": 8126, + "epoch": 0.20114, + "grad_norm": 0.14440381526947021, + "learning_rate": 9.999987155621127e-06, + "loss": 5.0732, + "step": 155 + }, + { + "batch_num_effect_tokens": 7991, + "batch_num_samples": 18, + "batch_num_tokens": 8170, + "epoch": 0.20243, + "grad_norm": 0.14386983215808868, + "learning_rate": 9.999948622550497e-06, + "loss": 4.9297, + "step": 156 + }, + { + "batch_num_effect_tokens": 7997, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.20373, + "grad_norm": 0.14097128808498383, + "learning_rate": 9.999884400986087e-06, + "loss": 5.0283, + "step": 157 + }, + { + "batch_num_effect_tokens": 8033, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.20503, + "grad_norm": 0.14975176751613617, + "learning_rate": 9.999794491257846e-06, + "loss": 4.6611, + "step": 158 + }, + { + "batch_num_effect_tokens": 8053, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.20633, + "grad_norm": 0.13971513509750366, + "learning_rate": 9.999678893827711e-06, + "loss": 4.9727, + "step": 159 + }, + { + "batch_num_effect_tokens": 8046, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.20762, + "grad_norm": 0.12843751907348633, + "learning_rate": 9.999537609289592e-06, + "loss": 4.9268, + "step": 160 + }, + { + "batch_num_effect_tokens": 8050, + "batch_num_samples": 14, + "batch_num_tokens": 8184, + "epoch": 0.20892, + "grad_norm": 0.13939853012561798, + "learning_rate": 9.999370638369377e-06, + "loss": 4.709, + "step": 161 + }, + { + "batch_num_effect_tokens": 8040, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.21022, + "grad_norm": 0.14971594512462616, + "learning_rate": 9.999177981924915e-06, + "loss": 4.7676, + "step": 162 + }, + { + "batch_num_effect_tokens": 7874, + "batch_num_samples": 21, + "batch_num_tokens": 8091, + "epoch": 0.21152, + "grad_norm": 0.14274117350578308, + "learning_rate": 9.998959640946033e-06, + "loss": 4.8418, + "step": 163 + }, + { + "batch_num_effect_tokens": 8054, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.21281, + "grad_norm": 0.14646124839782715, + "learning_rate": 9.998715616554509e-06, + "loss": 4.6113, + "step": 164 + }, + { + "batch_num_effect_tokens": 8051, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.21411, + "grad_norm": 0.13392624258995056, + "learning_rate": 9.998445910004082e-06, + "loss": 4.6914, + "step": 165 + }, + { + "batch_num_effect_tokens": 7927, + "batch_num_samples": 20, + "batch_num_tokens": 8144, + "epoch": 0.21541, + "grad_norm": 0.14332374930381775, + "learning_rate": 9.998150522680437e-06, + "loss": 5.0967, + "step": 166 + }, + { + "batch_num_effect_tokens": 8034, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.21671, + "grad_norm": 0.15254738926887512, + "learning_rate": 9.997829456101196e-06, + "loss": 4.7773, + "step": 167 + }, + { + "batch_num_effect_tokens": 7932, + "batch_num_samples": 15, + "batch_num_tokens": 8092, + "epoch": 0.218, + "grad_norm": 0.14264167845249176, + "learning_rate": 9.997482711915926e-06, + "loss": 4.7695, + "step": 168 + }, + { + "batch_num_effect_tokens": 8044, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.2193, + "grad_norm": 0.1465001106262207, + "learning_rate": 9.997110291906109e-06, + "loss": 4.7012, + "step": 169 + }, + { + "batch_num_effect_tokens": 7952, + "batch_num_samples": 15, + "batch_num_tokens": 8110, + "epoch": 0.2206, + "grad_norm": 0.14456188678741455, + "learning_rate": 9.996712197985147e-06, + "loss": 4.7178, + "step": 170 + }, + { + "batch_num_effect_tokens": 7862, + "batch_num_samples": 26, + "batch_num_tokens": 8106, + "epoch": 0.2219, + "grad_norm": 0.1434151530265808, + "learning_rate": 9.99628843219835e-06, + "loss": 5.1191, + "step": 171 + }, + { + "batch_num_effect_tokens": 8044, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.2232, + "grad_norm": 0.14123517274856567, + "learning_rate": 9.995838996722916e-06, + "loss": 4.9141, + "step": 172 + }, + { + "batch_num_effect_tokens": 8063, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.22449, + "grad_norm": 0.1379905641078949, + "learning_rate": 9.995363893867935e-06, + "loss": 4.8369, + "step": 173 + }, + { + "batch_num_effect_tokens": 8079, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.22579, + "grad_norm": 0.13364200294017792, + "learning_rate": 9.994863126074371e-06, + "loss": 5.0586, + "step": 174 + }, + { + "batch_num_effect_tokens": 8015, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.22709, + "grad_norm": 0.13714580237865448, + "learning_rate": 9.994336695915041e-06, + "loss": 4.9736, + "step": 175 + }, + { + "batch_num_effect_tokens": 8030, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.22839, + "grad_norm": 0.13936279714107513, + "learning_rate": 9.993784606094612e-06, + "loss": 5.0059, + "step": 176 + }, + { + "batch_num_effect_tokens": 8007, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 0.22968, + "grad_norm": 0.1406031847000122, + "learning_rate": 9.993206859449587e-06, + "loss": 4.8916, + "step": 177 + }, + { + "batch_num_effect_tokens": 7963, + "batch_num_samples": 14, + "batch_num_tokens": 8128, + "epoch": 0.23098, + "grad_norm": 0.1409338116645813, + "learning_rate": 9.992603458948282e-06, + "loss": 5.2207, + "step": 178 + }, + { + "batch_num_effect_tokens": 8050, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.23228, + "grad_norm": 0.16277164220809937, + "learning_rate": 9.99197440769082e-06, + "loss": 4.9805, + "step": 179 + }, + { + "batch_num_effect_tokens": 8032, + "batch_num_samples": 19, + "batch_num_tokens": 8191, + "epoch": 0.23358, + "grad_norm": 0.1462642252445221, + "learning_rate": 9.991319708909113e-06, + "loss": 4.6992, + "step": 180 + }, + { + "batch_num_effect_tokens": 8037, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.23487, + "grad_norm": 0.14574594795703888, + "learning_rate": 9.990639365966835e-06, + "loss": 5.0459, + "step": 181 + }, + { + "batch_num_effect_tokens": 7967, + "batch_num_samples": 14, + "batch_num_tokens": 8112, + "epoch": 0.23617, + "grad_norm": 0.14711208641529083, + "learning_rate": 9.989933382359423e-06, + "loss": 5.1992, + "step": 182 + }, + { + "batch_num_effect_tokens": 7931, + "batch_num_samples": 15, + "batch_num_tokens": 8086, + "epoch": 0.23747, + "grad_norm": 0.1444520354270935, + "learning_rate": 9.989201761714043e-06, + "loss": 5.2109, + "step": 183 + }, + { + "batch_num_effect_tokens": 8040, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.23877, + "grad_norm": 0.13259008526802063, + "learning_rate": 9.988444507789584e-06, + "loss": 4.9014, + "step": 184 + }, + { + "batch_num_effect_tokens": 8014, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.24006, + "grad_norm": 0.1382356435060501, + "learning_rate": 9.987661624476624e-06, + "loss": 4.876, + "step": 185 + }, + { + "batch_num_effect_tokens": 8060, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.24136, + "grad_norm": 0.13415783643722534, + "learning_rate": 9.986853115797424e-06, + "loss": 4.7227, + "step": 186 + }, + { + "batch_num_effect_tokens": 8009, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.24266, + "grad_norm": 0.13997548818588257, + "learning_rate": 9.986018985905901e-06, + "loss": 5.1807, + "step": 187 + }, + { + "batch_num_effect_tokens": 7933, + "batch_num_samples": 14, + "batch_num_tokens": 8086, + "epoch": 0.24396, + "grad_norm": 0.14471964538097382, + "learning_rate": 9.98515923908761e-06, + "loss": 4.7754, + "step": 188 + }, + { + "batch_num_effect_tokens": 8058, + "batch_num_samples": 14, + "batch_num_tokens": 8191, + "epoch": 0.24526, + "grad_norm": 0.15303927659988403, + "learning_rate": 9.984273879759713e-06, + "loss": 4.8975, + "step": 189 + }, + { + "batch_num_effect_tokens": 7912, + "batch_num_samples": 14, + "batch_num_tokens": 8101, + "epoch": 0.24655, + "grad_norm": 0.13627475500106812, + "learning_rate": 9.983362912470967e-06, + "loss": 4.9385, + "step": 190 + }, + { + "batch_num_effect_tokens": 7985, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 0.24785, + "grad_norm": 0.1349112093448639, + "learning_rate": 9.982426341901697e-06, + "loss": 5.1465, + "step": 191 + }, + { + "batch_num_effect_tokens": 8033, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.24915, + "grad_norm": 0.13799749314785004, + "learning_rate": 9.981464172863769e-06, + "loss": 5.0508, + "step": 192 + }, + { + "batch_num_effect_tokens": 7856, + "batch_num_samples": 17, + "batch_num_tokens": 8040, + "epoch": 0.25045, + "grad_norm": 0.14057736098766327, + "learning_rate": 9.980476410300567e-06, + "loss": 4.9756, + "step": 193 + }, + { + "batch_num_effect_tokens": 7958, + "batch_num_samples": 21, + "batch_num_tokens": 8176, + "epoch": 0.25174, + "grad_norm": 0.14482036232948303, + "learning_rate": 9.979463059286972e-06, + "loss": 4.8223, + "step": 194 + }, + { + "batch_num_effect_tokens": 7984, + "batch_num_samples": 21, + "batch_num_tokens": 8176, + "epoch": 0.25304, + "grad_norm": 0.1419980227947235, + "learning_rate": 9.978424125029329e-06, + "loss": 4.875, + "step": 195 + }, + { + "batch_num_effect_tokens": 8000, + "batch_num_samples": 19, + "batch_num_tokens": 8192, + "epoch": 0.25434, + "grad_norm": 0.1462613195180893, + "learning_rate": 9.977359612865424e-06, + "loss": 4.9316, + "step": 196 + }, + { + "batch_num_effect_tokens": 8061, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.25564, + "grad_norm": 0.13534201681613922, + "learning_rate": 9.976269528264456e-06, + "loss": 4.7822, + "step": 197 + }, + { + "batch_num_effect_tokens": 8047, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.25693, + "grad_norm": 0.13299311697483063, + "learning_rate": 9.975153876827008e-06, + "loss": 4.9941, + "step": 198 + }, + { + "batch_num_effect_tokens": 7979, + "batch_num_samples": 15, + "batch_num_tokens": 8116, + "epoch": 0.25823, + "grad_norm": 0.13966509699821472, + "learning_rate": 9.97401266428502e-06, + "loss": 5.2793, + "step": 199 + }, + { + "batch_num_effect_tokens": 8054, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.25953, + "grad_norm": 0.14264234900474548, + "learning_rate": 9.972845896501762e-06, + "loss": 4.8848, + "step": 200 + }, + { + "batch_num_effect_tokens": 8000, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.26083, + "grad_norm": 0.1481478363275528, + "learning_rate": 9.971653579471791e-06, + "loss": 5.0264, + "step": 201 + }, + { + "batch_num_effect_tokens": 7957, + "batch_num_samples": 14, + "batch_num_tokens": 8119, + "epoch": 0.26212, + "grad_norm": 0.14331968128681183, + "learning_rate": 9.97043571932094e-06, + "loss": 4.7031, + "step": 202 + }, + { + "batch_num_effect_tokens": 7925, + "batch_num_samples": 17, + "batch_num_tokens": 8123, + "epoch": 0.26342, + "grad_norm": 0.15035419166088104, + "learning_rate": 9.969192322306271e-06, + "loss": 4.6455, + "step": 203 + }, + { + "batch_num_effect_tokens": 8041, + "batch_num_samples": 16, + "batch_num_tokens": 8188, + "epoch": 0.26472, + "grad_norm": 0.1354558765888214, + "learning_rate": 9.96792339481605e-06, + "loss": 4.6113, + "step": 204 + }, + { + "batch_num_effect_tokens": 7979, + "batch_num_samples": 14, + "batch_num_tokens": 8119, + "epoch": 0.26602, + "grad_norm": 0.14034751057624817, + "learning_rate": 9.966628943369708e-06, + "loss": 5.1328, + "step": 205 + }, + { + "batch_num_effect_tokens": 8073, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.26732, + "grad_norm": 0.13551993668079376, + "learning_rate": 9.965308974617816e-06, + "loss": 5.0332, + "step": 206 + }, + { + "batch_num_effect_tokens": 7989, + "batch_num_samples": 17, + "batch_num_tokens": 8171, + "epoch": 0.26861, + "grad_norm": 0.14565272629261017, + "learning_rate": 9.963963495342049e-06, + "loss": 4.8906, + "step": 207 + }, + { + "batch_num_effect_tokens": 8052, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.26991, + "grad_norm": 0.1394621729850769, + "learning_rate": 9.96259251245514e-06, + "loss": 4.835, + "step": 208 + }, + { + "batch_num_effect_tokens": 8034, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.27121, + "grad_norm": 0.13364410400390625, + "learning_rate": 9.961196033000862e-06, + "loss": 4.9238, + "step": 209 + }, + { + "batch_num_effect_tokens": 7996, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.27251, + "grad_norm": 0.13798867166042328, + "learning_rate": 9.959774064153977e-06, + "loss": 4.9531, + "step": 210 + }, + { + "batch_num_effect_tokens": 8048, + "batch_num_samples": 14, + "batch_num_tokens": 8185, + "epoch": 0.2738, + "grad_norm": 0.13652759790420532, + "learning_rate": 9.95832661322021e-06, + "loss": 4.7188, + "step": 211 + }, + { + "batch_num_effect_tokens": 8050, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.2751, + "grad_norm": 0.14416304230690002, + "learning_rate": 9.956853687636203e-06, + "loss": 5.21, + "step": 212 + }, + { + "batch_num_effect_tokens": 8009, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.2764, + "grad_norm": 0.15409010648727417, + "learning_rate": 9.955355294969483e-06, + "loss": 4.8691, + "step": 213 + }, + { + "batch_num_effect_tokens": 7921, + "batch_num_samples": 18, + "batch_num_tokens": 8085, + "epoch": 0.2777, + "grad_norm": 0.13492988049983978, + "learning_rate": 9.953831442918418e-06, + "loss": 4.9668, + "step": 214 + }, + { + "batch_num_effect_tokens": 8033, + "batch_num_samples": 16, + "batch_num_tokens": 8190, + "epoch": 0.27899, + "grad_norm": 0.13695424795150757, + "learning_rate": 9.952282139312182e-06, + "loss": 4.749, + "step": 215 + }, + { + "batch_num_effect_tokens": 7980, + "batch_num_samples": 16, + "batch_num_tokens": 8155, + "epoch": 0.28029, + "grad_norm": 0.14915227890014648, + "learning_rate": 9.95070739211071e-06, + "loss": 5.1152, + "step": 216 + }, + { + "batch_num_effect_tokens": 7958, + "batch_num_samples": 14, + "batch_num_tokens": 8114, + "epoch": 0.28159, + "grad_norm": 0.13804367184638977, + "learning_rate": 9.949107209404664e-06, + "loss": 4.7793, + "step": 217 + }, + { + "batch_num_effect_tokens": 8072, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.28289, + "grad_norm": 0.1392471045255661, + "learning_rate": 9.947481599415385e-06, + "loss": 5.0469, + "step": 218 + }, + { + "batch_num_effect_tokens": 8003, + "batch_num_samples": 19, + "batch_num_tokens": 8192, + "epoch": 0.28418, + "grad_norm": 0.14468149840831757, + "learning_rate": 9.945830570494851e-06, + "loss": 4.8887, + "step": 219 + }, + { + "batch_num_effect_tokens": 8032, + "batch_num_samples": 14, + "batch_num_tokens": 8184, + "epoch": 0.28548, + "grad_norm": 0.1316855400800705, + "learning_rate": 9.944154131125643e-06, + "loss": 4.9443, + "step": 220 + }, + { + "batch_num_effect_tokens": 8045, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.28678, + "grad_norm": 0.15862536430358887, + "learning_rate": 9.942452289920886e-06, + "loss": 4.8623, + "step": 221 + }, + { + "batch_num_effect_tokens": 7995, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.28808, + "grad_norm": 0.13182777166366577, + "learning_rate": 9.940725055624218e-06, + "loss": 5.0381, + "step": 222 + }, + { + "batch_num_effect_tokens": 8025, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.28938, + "grad_norm": 0.1340331733226776, + "learning_rate": 9.938972437109742e-06, + "loss": 4.7461, + "step": 223 + }, + { + "batch_num_effect_tokens": 7929, + "batch_num_samples": 15, + "batch_num_tokens": 8086, + "epoch": 0.29067, + "grad_norm": 0.1430303007364273, + "learning_rate": 9.937194443381972e-06, + "loss": 4.8057, + "step": 224 + }, + { + "batch_num_effect_tokens": 8016, + "batch_num_samples": 18, + "batch_num_tokens": 8191, + "epoch": 0.29197, + "grad_norm": 0.14733895659446716, + "learning_rate": 9.935391083575803e-06, + "loss": 4.7725, + "step": 225 + }, + { + "batch_num_effect_tokens": 8056, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.29327, + "grad_norm": 0.13456237316131592, + "learning_rate": 9.933562366956445e-06, + "loss": 4.5049, + "step": 226 + }, + { + "batch_num_effect_tokens": 7997, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.29457, + "grad_norm": 0.13491599261760712, + "learning_rate": 9.931708302919394e-06, + "loss": 5.0586, + "step": 227 + }, + { + "batch_num_effect_tokens": 8015, + "batch_num_samples": 19, + "batch_num_tokens": 8192, + "epoch": 0.29586, + "grad_norm": 0.13626770675182343, + "learning_rate": 9.929828900990367e-06, + "loss": 4.7988, + "step": 228 + }, + { + "batch_num_effect_tokens": 8057, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.29716, + "grad_norm": 0.14051003754138947, + "learning_rate": 9.927924170825266e-06, + "loss": 5.0586, + "step": 229 + }, + { + "batch_num_effect_tokens": 8036, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.29846, + "grad_norm": 0.13720320165157318, + "learning_rate": 9.92599412221012e-06, + "loss": 4.8613, + "step": 230 + }, + { + "batch_num_effect_tokens": 7980, + "batch_num_samples": 14, + "batch_num_tokens": 8113, + "epoch": 0.29976, + "grad_norm": 0.15138697624206543, + "learning_rate": 9.924038765061042e-06, + "loss": 4.7715, + "step": 231 + }, + { + "batch_num_effect_tokens": 8023, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 0.30105, + "grad_norm": 0.1236131489276886, + "learning_rate": 9.922058109424168e-06, + "loss": 4.916, + "step": 232 + }, + { + "batch_num_effect_tokens": 7912, + "batch_num_samples": 23, + "batch_num_tokens": 8105, + "epoch": 0.30235, + "grad_norm": 0.1621561050415039, + "learning_rate": 9.920052165475615e-06, + "loss": 5.0439, + "step": 233 + }, + { + "batch_num_effect_tokens": 8058, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.30365, + "grad_norm": 0.14631430804729462, + "learning_rate": 9.918020943521427e-06, + "loss": 4.79, + "step": 234 + }, + { + "batch_num_effect_tokens": 7846, + "batch_num_samples": 26, + "batch_num_tokens": 8080, + "epoch": 0.30495, + "grad_norm": 0.13547104597091675, + "learning_rate": 9.915964453997516e-06, + "loss": 4.9248, + "step": 235 + }, + { + "batch_num_effect_tokens": 7915, + "batch_num_samples": 15, + "batch_num_tokens": 8086, + "epoch": 0.30624, + "grad_norm": 0.13369937241077423, + "learning_rate": 9.913882707469615e-06, + "loss": 4.9131, + "step": 236 + }, + { + "batch_num_effect_tokens": 8041, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.30754, + "grad_norm": 0.14017365872859955, + "learning_rate": 9.911775714633218e-06, + "loss": 4.5908, + "step": 237 + }, + { + "batch_num_effect_tokens": 7915, + "batch_num_samples": 16, + "batch_num_tokens": 8078, + "epoch": 0.30884, + "grad_norm": 0.13819383084774017, + "learning_rate": 9.909643486313533e-06, + "loss": 4.9268, + "step": 238 + }, + { + "batch_num_effect_tokens": 8068, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.31014, + "grad_norm": 0.14381656050682068, + "learning_rate": 9.907486033465421e-06, + "loss": 4.8018, + "step": 239 + }, + { + "batch_num_effect_tokens": 7998, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.31144, + "grad_norm": 0.12868614494800568, + "learning_rate": 9.905303367173336e-06, + "loss": 4.8428, + "step": 240 + }, + { + "batch_num_effect_tokens": 7997, + "batch_num_samples": 15, + "batch_num_tokens": 8166, + "epoch": 0.31273, + "grad_norm": 0.13400490581989288, + "learning_rate": 9.903095498651276e-06, + "loss": 4.8477, + "step": 241 + }, + { + "batch_num_effect_tokens": 8032, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.31403, + "grad_norm": 0.14183661341667175, + "learning_rate": 9.900862439242719e-06, + "loss": 4.709, + "step": 242 + }, + { + "batch_num_effect_tokens": 8036, + "batch_num_samples": 14, + "batch_num_tokens": 8191, + "epoch": 0.31533, + "grad_norm": 0.1373617798089981, + "learning_rate": 9.898604200420573e-06, + "loss": 5.0449, + "step": 243 + }, + { + "batch_num_effect_tokens": 8013, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.31663, + "grad_norm": 0.1389341652393341, + "learning_rate": 9.896320793787106e-06, + "loss": 4.9932, + "step": 244 + }, + { + "batch_num_effect_tokens": 7900, + "batch_num_samples": 15, + "batch_num_tokens": 8092, + "epoch": 0.31792, + "grad_norm": 0.1446686089038849, + "learning_rate": 9.894012231073895e-06, + "loss": 4.9473, + "step": 245 + }, + { + "batch_num_effect_tokens": 7948, + "batch_num_samples": 15, + "batch_num_tokens": 8080, + "epoch": 0.31922, + "grad_norm": 0.1249702200293541, + "learning_rate": 9.891678524141759e-06, + "loss": 4.7959, + "step": 246 + }, + { + "batch_num_effect_tokens": 7985, + "batch_num_samples": 19, + "batch_num_tokens": 8161, + "epoch": 0.32052, + "grad_norm": 0.14360341429710388, + "learning_rate": 9.889319684980707e-06, + "loss": 5.1543, + "step": 247 + }, + { + "batch_num_effect_tokens": 7880, + "batch_num_samples": 17, + "batch_num_tokens": 8062, + "epoch": 0.32182, + "grad_norm": 0.1445484608411789, + "learning_rate": 9.886935725709868e-06, + "loss": 4.9531, + "step": 248 + }, + { + "batch_num_effect_tokens": 7734, + "batch_num_samples": 28, + "batch_num_tokens": 8008, + "epoch": 0.32311, + "grad_norm": 0.1418876349925995, + "learning_rate": 9.884526658577433e-06, + "loss": 4.9629, + "step": 249 + }, + { + "batch_num_effect_tokens": 8059, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.32441, + "grad_norm": 0.14063134789466858, + "learning_rate": 9.882092495960589e-06, + "loss": 5.0117, + "step": 250 + }, + { + "batch_num_effect_tokens": 7890, + "batch_num_samples": 23, + "batch_num_tokens": 8086, + "epoch": 0.32571, + "grad_norm": 0.13051624596118927, + "learning_rate": 9.87963325036546e-06, + "loss": 4.5283, + "step": 251 + }, + { + "batch_num_effect_tokens": 8060, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.32701, + "grad_norm": 0.14672017097473145, + "learning_rate": 9.877148934427037e-06, + "loss": 4.4834, + "step": 252 + }, + { + "batch_num_effect_tokens": 7976, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.3283, + "grad_norm": 0.13878877460956573, + "learning_rate": 9.874639560909118e-06, + "loss": 4.6523, + "step": 253 + }, + { + "batch_num_effect_tokens": 7886, + "batch_num_samples": 22, + "batch_num_tokens": 8100, + "epoch": 0.3296, + "grad_norm": 0.13505133986473083, + "learning_rate": 9.872105142704245e-06, + "loss": 4.8672, + "step": 254 + }, + { + "batch_num_effect_tokens": 8046, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.3309, + "grad_norm": 0.1461716592311859, + "learning_rate": 9.869545692833624e-06, + "loss": 4.5898, + "step": 255 + }, + { + "batch_num_effect_tokens": 8030, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 0.3322, + "grad_norm": 0.16102413833141327, + "learning_rate": 9.866961224447076e-06, + "loss": 4.7529, + "step": 256 + }, + { + "batch_num_effect_tokens": 8072, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.3335, + "grad_norm": 0.13654085993766785, + "learning_rate": 9.864351750822957e-06, + "loss": 4.6143, + "step": 257 + }, + { + "batch_num_effect_tokens": 8020, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.33479, + "grad_norm": 0.13663263618946075, + "learning_rate": 9.86171728536809e-06, + "loss": 4.9082, + "step": 258 + }, + { + "batch_num_effect_tokens": 8034, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.33609, + "grad_norm": 0.13321144878864288, + "learning_rate": 9.859057841617709e-06, + "loss": 5.084, + "step": 259 + }, + { + "batch_num_effect_tokens": 8063, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.33739, + "grad_norm": 0.13456861674785614, + "learning_rate": 9.856373433235373e-06, + "loss": 4.8818, + "step": 260 + }, + { + "batch_num_effect_tokens": 8026, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.33869, + "grad_norm": 0.13602375984191895, + "learning_rate": 9.853664074012907e-06, + "loss": 5.0449, + "step": 261 + }, + { + "batch_num_effect_tokens": 8032, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.33998, + "grad_norm": 0.13534726202487946, + "learning_rate": 9.850929777870324e-06, + "loss": 4.9688, + "step": 262 + }, + { + "batch_num_effect_tokens": 8016, + "batch_num_samples": 14, + "batch_num_tokens": 8160, + "epoch": 0.34128, + "grad_norm": 0.14672520756721497, + "learning_rate": 9.848170558855757e-06, + "loss": 4.6787, + "step": 263 + }, + { + "batch_num_effect_tokens": 8041, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.34258, + "grad_norm": 0.12731723487377167, + "learning_rate": 9.84538643114539e-06, + "loss": 4.998, + "step": 264 + }, + { + "batch_num_effect_tokens": 8032, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.34388, + "grad_norm": 0.12880627810955048, + "learning_rate": 9.84257740904338e-06, + "loss": 4.9824, + "step": 265 + }, + { + "batch_num_effect_tokens": 7977, + "batch_num_samples": 14, + "batch_num_tokens": 8128, + "epoch": 0.34517, + "grad_norm": 0.13919463753700256, + "learning_rate": 9.839743506981783e-06, + "loss": 4.5137, + "step": 266 + }, + { + "batch_num_effect_tokens": 8070, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.34647, + "grad_norm": 0.14044621586799622, + "learning_rate": 9.836884739520482e-06, + "loss": 4.8906, + "step": 267 + }, + { + "batch_num_effect_tokens": 8033, + "batch_num_samples": 17, + "batch_num_tokens": 8191, + "epoch": 0.34777, + "grad_norm": 0.14072751998901367, + "learning_rate": 9.83400112134712e-06, + "loss": 5.084, + "step": 268 + }, + { + "batch_num_effect_tokens": 8052, + "batch_num_samples": 14, + "batch_num_tokens": 8184, + "epoch": 0.34907, + "grad_norm": 0.13198843598365784, + "learning_rate": 9.831092667277002e-06, + "loss": 4.7402, + "step": 269 + }, + { + "batch_num_effect_tokens": 8046, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.35036, + "grad_norm": 0.14157812297344208, + "learning_rate": 9.828159392253051e-06, + "loss": 4.8887, + "step": 270 + }, + { + "batch_num_effect_tokens": 8047, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.35166, + "grad_norm": 0.13811397552490234, + "learning_rate": 9.8252013113457e-06, + "loss": 4.8408, + "step": 271 + }, + { + "batch_num_effect_tokens": 8023, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.35296, + "grad_norm": 0.141897514462471, + "learning_rate": 9.822218439752835e-06, + "loss": 4.8301, + "step": 272 + }, + { + "batch_num_effect_tokens": 7977, + "batch_num_samples": 14, + "batch_num_tokens": 8149, + "epoch": 0.35426, + "grad_norm": 0.1375928372144699, + "learning_rate": 9.819210792799711e-06, + "loss": 4.8154, + "step": 273 + }, + { + "batch_num_effect_tokens": 7896, + "batch_num_samples": 20, + "batch_num_tokens": 8096, + "epoch": 0.35556, + "grad_norm": 0.14474721252918243, + "learning_rate": 9.816178385938867e-06, + "loss": 4.959, + "step": 274 + }, + { + "batch_num_effect_tokens": 7988, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.35685, + "grad_norm": 0.14300163090229034, + "learning_rate": 9.81312123475006e-06, + "loss": 4.9746, + "step": 275 + }, + { + "batch_num_effect_tokens": 8009, + "batch_num_samples": 14, + "batch_num_tokens": 8173, + "epoch": 0.35815, + "grad_norm": 0.1514219492673874, + "learning_rate": 9.810039354940172e-06, + "loss": 4.9414, + "step": 276 + }, + { + "batch_num_effect_tokens": 8001, + "batch_num_samples": 15, + "batch_num_tokens": 8145, + "epoch": 0.35945, + "grad_norm": 0.13827340304851532, + "learning_rate": 9.806932762343136e-06, + "loss": 4.9424, + "step": 277 + }, + { + "batch_num_effect_tokens": 7975, + "batch_num_samples": 17, + "batch_num_tokens": 8152, + "epoch": 0.36075, + "grad_norm": 0.14099182188510895, + "learning_rate": 9.80380147291985e-06, + "loss": 5.1465, + "step": 278 + }, + { + "batch_num_effect_tokens": 8024, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.36204, + "grad_norm": 0.12410301715135574, + "learning_rate": 9.800645502758104e-06, + "loss": 4.9053, + "step": 279 + }, + { + "batch_num_effect_tokens": 7958, + "batch_num_samples": 14, + "batch_num_tokens": 8101, + "epoch": 0.36334, + "grad_norm": 0.1367730349302292, + "learning_rate": 9.797464868072489e-06, + "loss": 4.6543, + "step": 280 + }, + { + "batch_num_effect_tokens": 7939, + "batch_num_samples": 22, + "batch_num_tokens": 8146, + "epoch": 0.36464, + "grad_norm": 0.1414947360754013, + "learning_rate": 9.794259585204313e-06, + "loss": 4.9229, + "step": 281 + }, + { + "batch_num_effect_tokens": 8007, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.36594, + "grad_norm": 0.12790058553218842, + "learning_rate": 9.791029670621525e-06, + "loss": 4.9121, + "step": 282 + }, + { + "batch_num_effect_tokens": 8015, + "batch_num_samples": 21, + "batch_num_tokens": 8192, + "epoch": 0.36723, + "grad_norm": 0.13167431950569153, + "learning_rate": 9.787775140918625e-06, + "loss": 4.918, + "step": 283 + }, + { + "batch_num_effect_tokens": 8028, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 0.36853, + "grad_norm": 0.14066706597805023, + "learning_rate": 9.784496012816574e-06, + "loss": 4.8828, + "step": 284 + }, + { + "batch_num_effect_tokens": 7989, + "batch_num_samples": 23, + "batch_num_tokens": 8192, + "epoch": 0.36983, + "grad_norm": 0.14358888566493988, + "learning_rate": 9.781192303162721e-06, + "loss": 4.7529, + "step": 285 + }, + { + "batch_num_effect_tokens": 8052, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.37113, + "grad_norm": 0.14802835881710052, + "learning_rate": 9.777864028930705e-06, + "loss": 4.8633, + "step": 286 + }, + { + "batch_num_effect_tokens": 7988, + "batch_num_samples": 15, + "batch_num_tokens": 8146, + "epoch": 0.37242, + "grad_norm": 0.14124155044555664, + "learning_rate": 9.774511207220369e-06, + "loss": 4.8584, + "step": 287 + }, + { + "batch_num_effect_tokens": 7869, + "batch_num_samples": 15, + "batch_num_tokens": 8086, + "epoch": 0.37372, + "grad_norm": 0.13983023166656494, + "learning_rate": 9.771133855257684e-06, + "loss": 5.127, + "step": 288 + }, + { + "batch_num_effect_tokens": 8036, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.37502, + "grad_norm": 0.12876510620117188, + "learning_rate": 9.767731990394638e-06, + "loss": 4.8506, + "step": 289 + }, + { + "batch_num_effect_tokens": 8044, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.37632, + "grad_norm": 0.13428495824337006, + "learning_rate": 9.764305630109174e-06, + "loss": 4.8955, + "step": 290 + }, + { + "batch_num_effect_tokens": 7997, + "batch_num_samples": 16, + "batch_num_tokens": 8155, + "epoch": 0.37762, + "grad_norm": 0.12863335013389587, + "learning_rate": 9.760854792005075e-06, + "loss": 5.1689, + "step": 291 + }, + { + "batch_num_effect_tokens": 8068, + "batch_num_samples": 14, + "batch_num_tokens": 8184, + "epoch": 0.37891, + "grad_norm": 0.13183218240737915, + "learning_rate": 9.757379493811892e-06, + "loss": 4.8193, + "step": 292 + }, + { + "batch_num_effect_tokens": 8035, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.38021, + "grad_norm": 0.1384708732366562, + "learning_rate": 9.753879753384845e-06, + "loss": 4.8105, + "step": 293 + }, + { + "batch_num_effect_tokens": 8034, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 0.38151, + "grad_norm": 0.1356097161769867, + "learning_rate": 9.750355588704728e-06, + "loss": 4.9473, + "step": 294 + }, + { + "batch_num_effect_tokens": 8032, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.38281, + "grad_norm": 0.13910697400569916, + "learning_rate": 9.746807017877823e-06, + "loss": 4.9854, + "step": 295 + }, + { + "batch_num_effect_tokens": 7997, + "batch_num_samples": 17, + "batch_num_tokens": 8191, + "epoch": 0.3841, + "grad_norm": 0.13383249938488007, + "learning_rate": 9.743234059135812e-06, + "loss": 4.8418, + "step": 296 + }, + { + "batch_num_effect_tokens": 8049, + "batch_num_samples": 14, + "batch_num_tokens": 8164, + "epoch": 0.3854, + "grad_norm": 0.1411924511194229, + "learning_rate": 9.73963673083566e-06, + "loss": 4.9199, + "step": 297 + }, + { + "batch_num_effect_tokens": 8025, + "batch_num_samples": 19, + "batch_num_tokens": 8192, + "epoch": 0.3867, + "grad_norm": 0.1354569047689438, + "learning_rate": 9.736015051459551e-06, + "loss": 4.6748, + "step": 298 + }, + { + "batch_num_effect_tokens": 8065, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.388, + "grad_norm": 0.138069286942482, + "learning_rate": 9.732369039614774e-06, + "loss": 4.8672, + "step": 299 + }, + { + "batch_num_effect_tokens": 8056, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.38929, + "grad_norm": 0.137836754322052, + "learning_rate": 9.728698714033631e-06, + "loss": 5.0059, + "step": 300 + }, + { + "batch_num_effect_tokens": 8037, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.39059, + "grad_norm": 0.14079798758029938, + "learning_rate": 9.725004093573343e-06, + "loss": 5.0039, + "step": 301 + }, + { + "batch_num_effect_tokens": 7842, + "batch_num_samples": 17, + "batch_num_tokens": 7989, + "epoch": 0.39189, + "grad_norm": 0.13341772556304932, + "learning_rate": 9.721285197215954e-06, + "loss": 4.8281, + "step": 302 + }, + { + "batch_num_effect_tokens": 7995, + "batch_num_samples": 19, + "batch_num_tokens": 8191, + "epoch": 0.39319, + "grad_norm": 0.14130514860153198, + "learning_rate": 9.717542044068224e-06, + "loss": 4.6729, + "step": 303 + }, + { + "batch_num_effect_tokens": 7882, + "batch_num_samples": 25, + "batch_num_tokens": 8105, + "epoch": 0.39448, + "grad_norm": 0.1398962289094925, + "learning_rate": 9.71377465336155e-06, + "loss": 4.6582, + "step": 304 + }, + { + "batch_num_effect_tokens": 7844, + "batch_num_samples": 20, + "batch_num_tokens": 8080, + "epoch": 0.39578, + "grad_norm": 0.1334947943687439, + "learning_rate": 9.709983044451847e-06, + "loss": 4.6211, + "step": 305 + }, + { + "batch_num_effect_tokens": 7944, + "batch_num_samples": 14, + "batch_num_tokens": 8088, + "epoch": 0.39708, + "grad_norm": 0.1320602148771286, + "learning_rate": 9.70616723681946e-06, + "loss": 4.7031, + "step": 306 + }, + { + "batch_num_effect_tokens": 8048, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.39838, + "grad_norm": 0.13198183476924896, + "learning_rate": 9.702327250069058e-06, + "loss": 4.5635, + "step": 307 + }, + { + "batch_num_effect_tokens": 8024, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.39968, + "grad_norm": 0.13476605713367462, + "learning_rate": 9.698463103929542e-06, + "loss": 4.9883, + "step": 308 + }, + { + "batch_num_effect_tokens": 8024, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.39968, + "eval_eval_loss": 0.6101124882698059, + "eval_eval_runtime": 115.2852, + "eval_eval_samples_per_second": 43.371, + "eval_eval_steps_per_second": 2.715, + "step": 308 + }, + { + "batch_num_effect_tokens": 7840, + "batch_num_samples": 28, + "batch_num_tokens": 8104, + "epoch": 0.40097, + "grad_norm": 0.13941439986228943, + "learning_rate": 9.694574818253935e-06, + "loss": 4.916, + "step": 309 + }, + { + "batch_num_effect_tokens": 8024, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.40227, + "grad_norm": 0.13381004333496094, + "learning_rate": 9.69066241301928e-06, + "loss": 5.0781, + "step": 310 + }, + { + "batch_num_effect_tokens": 8042, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.40357, + "grad_norm": 0.14835673570632935, + "learning_rate": 9.686725908326547e-06, + "loss": 4.8125, + "step": 311 + }, + { + "batch_num_effect_tokens": 8038, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 0.40487, + "grad_norm": 0.1386537253856659, + "learning_rate": 9.682765324400514e-06, + "loss": 4.7021, + "step": 312 + }, + { + "batch_num_effect_tokens": 7999, + "batch_num_samples": 14, + "batch_num_tokens": 8137, + "epoch": 0.40616, + "grad_norm": 0.13303305208683014, + "learning_rate": 9.67878068158968e-06, + "loss": 4.6924, + "step": 313 + }, + { + "batch_num_effect_tokens": 8070, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.40746, + "grad_norm": 0.13811469078063965, + "learning_rate": 9.674772000366151e-06, + "loss": 4.8867, + "step": 314 + }, + { + "batch_num_effect_tokens": 8025, + "batch_num_samples": 15, + "batch_num_tokens": 8176, + "epoch": 0.40876, + "grad_norm": 0.13623467087745667, + "learning_rate": 9.670739301325534e-06, + "loss": 4.7764, + "step": 315 + }, + { + "batch_num_effect_tokens": 8006, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 0.41006, + "grad_norm": 0.13184677064418793, + "learning_rate": 9.666682605186834e-06, + "loss": 4.6846, + "step": 316 + }, + { + "batch_num_effect_tokens": 8058, + "batch_num_samples": 14, + "batch_num_tokens": 8190, + "epoch": 0.41135, + "grad_norm": 0.13243718445301056, + "learning_rate": 9.662601932792349e-06, + "loss": 4.5635, + "step": 317 + }, + { + "batch_num_effect_tokens": 7975, + "batch_num_samples": 14, + "batch_num_tokens": 8104, + "epoch": 0.41265, + "grad_norm": 0.13839715719223022, + "learning_rate": 9.658497305107559e-06, + "loss": 4.8477, + "step": 318 + }, + { + "batch_num_effect_tokens": 7927, + "batch_num_samples": 14, + "batch_num_tokens": 8086, + "epoch": 0.41395, + "grad_norm": 0.1342511624097824, + "learning_rate": 9.654368743221022e-06, + "loss": 4.8398, + "step": 319 + }, + { + "batch_num_effect_tokens": 8039, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.41525, + "grad_norm": 0.1273842304944992, + "learning_rate": 9.650216268344263e-06, + "loss": 4.4043, + "step": 320 + }, + { + "batch_num_effect_tokens": 7888, + "batch_num_samples": 23, + "batch_num_tokens": 8086, + "epoch": 0.41655, + "grad_norm": 0.12917739152908325, + "learning_rate": 9.646039901811666e-06, + "loss": 4.6807, + "step": 321 + }, + { + "batch_num_effect_tokens": 8011, + "batch_num_samples": 14, + "batch_num_tokens": 8142, + "epoch": 0.41784, + "grad_norm": 0.13067729771137238, + "learning_rate": 9.641839665080363e-06, + "loss": 4.6602, + "step": 322 + }, + { + "batch_num_effect_tokens": 8013, + "batch_num_samples": 17, + "batch_num_tokens": 8181, + "epoch": 0.41914, + "grad_norm": 0.13833428919315338, + "learning_rate": 9.63761557973013e-06, + "loss": 4.6309, + "step": 323 + }, + { + "batch_num_effect_tokens": 8008, + "batch_num_samples": 19, + "batch_num_tokens": 8176, + "epoch": 0.42044, + "grad_norm": 0.14830631017684937, + "learning_rate": 9.633367667463267e-06, + "loss": 5.5645, + "step": 324 + }, + { + "batch_num_effect_tokens": 7975, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 0.42174, + "grad_norm": 0.13653573393821716, + "learning_rate": 9.62909595010449e-06, + "loss": 5.4238, + "step": 325 + }, + { + "batch_num_effect_tokens": 7962, + "batch_num_samples": 14, + "batch_num_tokens": 8120, + "epoch": 0.42303, + "grad_norm": 0.13599801063537598, + "learning_rate": 9.624800449600826e-06, + "loss": 5.1523, + "step": 326 + }, + { + "batch_num_effect_tokens": 7994, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.42433, + "grad_norm": 0.1389356106519699, + "learning_rate": 9.620481188021484e-06, + "loss": 4.9199, + "step": 327 + }, + { + "batch_num_effect_tokens": 7996, + "batch_num_samples": 16, + "batch_num_tokens": 8166, + "epoch": 0.42563, + "grad_norm": 0.1340194195508957, + "learning_rate": 9.616138187557758e-06, + "loss": 4.7656, + "step": 328 + }, + { + "batch_num_effect_tokens": 8002, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.42693, + "grad_norm": 0.1351163685321808, + "learning_rate": 9.611771470522908e-06, + "loss": 5.2266, + "step": 329 + }, + { + "batch_num_effect_tokens": 7892, + "batch_num_samples": 26, + "batch_num_tokens": 8137, + "epoch": 0.42822, + "grad_norm": 0.13661961257457733, + "learning_rate": 9.60738105935204e-06, + "loss": 5.0176, + "step": 330 + }, + { + "batch_num_effect_tokens": 8043, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.42952, + "grad_norm": 0.1298064887523651, + "learning_rate": 9.602966976601995e-06, + "loss": 4.6572, + "step": 331 + }, + { + "batch_num_effect_tokens": 7974, + "batch_num_samples": 18, + "batch_num_tokens": 8128, + "epoch": 0.43082, + "grad_norm": 0.13714361190795898, + "learning_rate": 9.598529244951233e-06, + "loss": 4.959, + "step": 332 + }, + { + "batch_num_effect_tokens": 8021, + "batch_num_samples": 17, + "batch_num_tokens": 8186, + "epoch": 0.43212, + "grad_norm": 0.13826780021190643, + "learning_rate": 9.594067887199719e-06, + "loss": 4.875, + "step": 333 + }, + { + "batch_num_effect_tokens": 7877, + "batch_num_samples": 18, + "batch_num_tokens": 8100, + "epoch": 0.43341, + "grad_norm": 0.1306610256433487, + "learning_rate": 9.589582926268798e-06, + "loss": 4.2568, + "step": 334 + }, + { + "batch_num_effect_tokens": 7949, + "batch_num_samples": 14, + "batch_num_tokens": 8079, + "epoch": 0.43471, + "grad_norm": 0.12845724821090698, + "learning_rate": 9.585074385201087e-06, + "loss": 4.8184, + "step": 335 + }, + { + "batch_num_effect_tokens": 8046, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.43601, + "grad_norm": 0.13526244461536407, + "learning_rate": 9.580542287160348e-06, + "loss": 4.8398, + "step": 336 + }, + { + "batch_num_effect_tokens": 7987, + "batch_num_samples": 20, + "batch_num_tokens": 8186, + "epoch": 0.43731, + "grad_norm": 0.1280571073293686, + "learning_rate": 9.575986655431377e-06, + "loss": 4.7578, + "step": 337 + }, + { + "batch_num_effect_tokens": 8017, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.43861, + "grad_norm": 0.12609054148197174, + "learning_rate": 9.571407513419878e-06, + "loss": 4.6699, + "step": 338 + }, + { + "batch_num_effect_tokens": 7953, + "batch_num_samples": 15, + "batch_num_tokens": 8104, + "epoch": 0.4399, + "grad_norm": 0.12861526012420654, + "learning_rate": 9.566804884652342e-06, + "loss": 4.9395, + "step": 339 + }, + { + "batch_num_effect_tokens": 8027, + "batch_num_samples": 18, + "batch_num_tokens": 8191, + "epoch": 0.4412, + "grad_norm": 0.15593406558036804, + "learning_rate": 9.562178792775936e-06, + "loss": 4.7197, + "step": 340 + }, + { + "batch_num_effect_tokens": 7989, + "batch_num_samples": 14, + "batch_num_tokens": 8142, + "epoch": 0.4425, + "grad_norm": 0.1305861622095108, + "learning_rate": 9.557529261558367e-06, + "loss": 4.8428, + "step": 341 + }, + { + "batch_num_effect_tokens": 8042, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.4438, + "grad_norm": 0.13278287649154663, + "learning_rate": 9.552856314887772e-06, + "loss": 4.7871, + "step": 342 + }, + { + "batch_num_effect_tokens": 8030, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.44509, + "grad_norm": 0.12788629531860352, + "learning_rate": 9.548159976772593e-06, + "loss": 4.709, + "step": 343 + }, + { + "batch_num_effect_tokens": 8030, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 0.44639, + "grad_norm": 0.12882810831069946, + "learning_rate": 9.543440271341445e-06, + "loss": 4.9229, + "step": 344 + }, + { + "batch_num_effect_tokens": 8059, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.44769, + "grad_norm": 0.1309846192598343, + "learning_rate": 9.538697222843004e-06, + "loss": 4.8623, + "step": 345 + }, + { + "batch_num_effect_tokens": 7976, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.44899, + "grad_norm": 0.12217242270708084, + "learning_rate": 9.533930855645872e-06, + "loss": 4.7715, + "step": 346 + }, + { + "batch_num_effect_tokens": 7944, + "batch_num_samples": 14, + "batch_num_tokens": 8112, + "epoch": 0.45028, + "grad_norm": 0.13333797454833984, + "learning_rate": 9.529141194238462e-06, + "loss": 4.8975, + "step": 347 + }, + { + "batch_num_effect_tokens": 7923, + "batch_num_samples": 15, + "batch_num_tokens": 8110, + "epoch": 0.45158, + "grad_norm": 0.12796379625797272, + "learning_rate": 9.524328263228866e-06, + "loss": 4.8311, + "step": 348 + }, + { + "batch_num_effect_tokens": 8016, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.45288, + "grad_norm": 0.13038370013237, + "learning_rate": 9.519492087344724e-06, + "loss": 4.708, + "step": 349 + }, + { + "batch_num_effect_tokens": 8035, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.45418, + "grad_norm": 0.14246924221515656, + "learning_rate": 9.514632691433108e-06, + "loss": 4.5479, + "step": 350 + }, + { + "batch_num_effect_tokens": 8040, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.45547, + "grad_norm": 0.13306821882724762, + "learning_rate": 9.509750100460384e-06, + "loss": 4.7334, + "step": 351 + }, + { + "batch_num_effect_tokens": 8036, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.45677, + "grad_norm": 0.12792479991912842, + "learning_rate": 9.504844339512096e-06, + "loss": 4.626, + "step": 352 + }, + { + "batch_num_effect_tokens": 8073, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.45807, + "grad_norm": 0.13247136771678925, + "learning_rate": 9.499915433792823e-06, + "loss": 4.9121, + "step": 353 + }, + { + "batch_num_effect_tokens": 8003, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.45937, + "grad_norm": 0.13608035445213318, + "learning_rate": 9.494963408626056e-06, + "loss": 5.0977, + "step": 354 + }, + { + "batch_num_effect_tokens": 8065, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.46067, + "grad_norm": 0.1288149058818817, + "learning_rate": 9.489988289454073e-06, + "loss": 4.7832, + "step": 355 + }, + { + "batch_num_effect_tokens": 8038, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.46196, + "grad_norm": 0.13947491347789764, + "learning_rate": 9.484990101837798e-06, + "loss": 4.625, + "step": 356 + }, + { + "batch_num_effect_tokens": 8042, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.46326, + "grad_norm": 0.13044162094593048, + "learning_rate": 9.47996887145668e-06, + "loss": 4.6738, + "step": 357 + }, + { + "batch_num_effect_tokens": 7940, + "batch_num_samples": 20, + "batch_num_tokens": 8144, + "epoch": 0.46456, + "grad_norm": 0.13908232748508453, + "learning_rate": 9.47492462410855e-06, + "loss": 4.6367, + "step": 358 + }, + { + "batch_num_effect_tokens": 7940, + "batch_num_samples": 15, + "batch_num_tokens": 8116, + "epoch": 0.46586, + "grad_norm": 0.13168592751026154, + "learning_rate": 9.469857385709498e-06, + "loss": 4.7568, + "step": 359 + }, + { + "batch_num_effect_tokens": 7968, + "batch_num_samples": 14, + "batch_num_tokens": 8107, + "epoch": 0.46715, + "grad_norm": 0.13281431794166565, + "learning_rate": 9.46476718229374e-06, + "loss": 4.833, + "step": 360 + }, + { + "batch_num_effect_tokens": 8036, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.46845, + "grad_norm": 0.13974054157733917, + "learning_rate": 9.45965404001347e-06, + "loss": 4.6826, + "step": 361 + }, + { + "batch_num_effect_tokens": 8018, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.46975, + "grad_norm": 0.1310376226902008, + "learning_rate": 9.454517985138748e-06, + "loss": 4.626, + "step": 362 + }, + { + "batch_num_effect_tokens": 7976, + "batch_num_samples": 15, + "batch_num_tokens": 8128, + "epoch": 0.47105, + "grad_norm": 0.12379533797502518, + "learning_rate": 9.449359044057344e-06, + "loss": 4.4814, + "step": 363 + }, + { + "batch_num_effect_tokens": 7984, + "batch_num_samples": 14, + "batch_num_tokens": 8128, + "epoch": 0.47234, + "grad_norm": 0.13945648074150085, + "learning_rate": 9.444177243274619e-06, + "loss": 4.6367, + "step": 364 + }, + { + "batch_num_effect_tokens": 8009, + "batch_num_samples": 19, + "batch_num_tokens": 8192, + "epoch": 0.47364, + "grad_norm": 0.12774516642093658, + "learning_rate": 9.438972609413376e-06, + "loss": 4.7061, + "step": 365 + }, + { + "batch_num_effect_tokens": 7941, + "batch_num_samples": 23, + "batch_num_tokens": 8162, + "epoch": 0.47494, + "grad_norm": 0.12300989776849747, + "learning_rate": 9.433745169213729e-06, + "loss": 4.6963, + "step": 366 + }, + { + "batch_num_effect_tokens": 7997, + "batch_num_samples": 20, + "batch_num_tokens": 8176, + "epoch": 0.47624, + "grad_norm": 0.1422237604856491, + "learning_rate": 9.428494949532972e-06, + "loss": 5.0645, + "step": 367 + }, + { + "batch_num_effect_tokens": 8039, + "batch_num_samples": 15, + "batch_num_tokens": 8186, + "epoch": 0.47753, + "grad_norm": 0.14680571854114532, + "learning_rate": 9.423221977345425e-06, + "loss": 4.583, + "step": 368 + }, + { + "batch_num_effect_tokens": 8044, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.47883, + "grad_norm": 0.14597126841545105, + "learning_rate": 9.41792627974231e-06, + "loss": 4.7686, + "step": 369 + }, + { + "batch_num_effect_tokens": 8035, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.48013, + "grad_norm": 0.13670004904270172, + "learning_rate": 9.412607883931608e-06, + "loss": 4.7676, + "step": 370 + }, + { + "batch_num_effect_tokens": 8010, + "batch_num_samples": 21, + "batch_num_tokens": 8192, + "epoch": 0.48143, + "grad_norm": 0.1419769525527954, + "learning_rate": 9.40726681723791e-06, + "loss": 4.9912, + "step": 371 + }, + { + "batch_num_effect_tokens": 7992, + "batch_num_samples": 18, + "batch_num_tokens": 8184, + "epoch": 0.48273, + "grad_norm": 0.13378414511680603, + "learning_rate": 9.401903107102295e-06, + "loss": 4.8301, + "step": 372 + }, + { + "batch_num_effect_tokens": 8038, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.48402, + "grad_norm": 0.12032909691333771, + "learning_rate": 9.396516781082172e-06, + "loss": 4.9736, + "step": 373 + }, + { + "batch_num_effect_tokens": 7962, + "batch_num_samples": 23, + "batch_num_tokens": 8181, + "epoch": 0.48532, + "grad_norm": 0.14288964867591858, + "learning_rate": 9.391107866851143e-06, + "loss": 5.4004, + "step": 374 + }, + { + "batch_num_effect_tokens": 7887, + "batch_num_samples": 18, + "batch_num_tokens": 8086, + "epoch": 0.48662, + "grad_norm": 0.13049355149269104, + "learning_rate": 9.385676392198869e-06, + "loss": 4.8486, + "step": 375 + }, + { + "batch_num_effect_tokens": 7965, + "batch_num_samples": 22, + "batch_num_tokens": 8177, + "epoch": 0.48792, + "grad_norm": 0.12765397131443024, + "learning_rate": 9.380222385030916e-06, + "loss": 4.8682, + "step": 376 + }, + { + "batch_num_effect_tokens": 7958, + "batch_num_samples": 16, + "batch_num_tokens": 8133, + "epoch": 0.48921, + "grad_norm": 0.12337980419397354, + "learning_rate": 9.374745873368614e-06, + "loss": 4.6826, + "step": 377 + }, + { + "batch_num_effect_tokens": 8071, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.49051, + "grad_norm": 0.12957154214382172, + "learning_rate": 9.369246885348926e-06, + "loss": 5.0645, + "step": 378 + }, + { + "batch_num_effect_tokens": 7992, + "batch_num_samples": 14, + "batch_num_tokens": 8155, + "epoch": 0.49181, + "grad_norm": 0.13127748668193817, + "learning_rate": 9.363725449224281e-06, + "loss": 4.9531, + "step": 379 + }, + { + "batch_num_effect_tokens": 8056, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.49311, + "grad_norm": 0.13518795371055603, + "learning_rate": 9.35818159336245e-06, + "loss": 4.9785, + "step": 380 + }, + { + "batch_num_effect_tokens": 7961, + "batch_num_samples": 17, + "batch_num_tokens": 8108, + "epoch": 0.4944, + "grad_norm": 0.13270984590053558, + "learning_rate": 9.352615346246383e-06, + "loss": 4.8457, + "step": 381 + }, + { + "batch_num_effect_tokens": 8028, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.4957, + "grad_norm": 0.13498768210411072, + "learning_rate": 9.347026736474077e-06, + "loss": 4.6934, + "step": 382 + }, + { + "batch_num_effect_tokens": 8029, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.497, + "grad_norm": 0.1299462616443634, + "learning_rate": 9.341415792758421e-06, + "loss": 5.041, + "step": 383 + }, + { + "batch_num_effect_tokens": 8015, + "batch_num_samples": 17, + "batch_num_tokens": 8178, + "epoch": 0.4983, + "grad_norm": 0.14259278774261475, + "learning_rate": 9.33578254392705e-06, + "loss": 4.8867, + "step": 384 + }, + { + "batch_num_effect_tokens": 8068, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.49959, + "grad_norm": 0.142298623919487, + "learning_rate": 9.330127018922195e-06, + "loss": 5.1357, + "step": 385 + }, + { + "batch_num_effect_tokens": 8053, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.50089, + "grad_norm": 0.1336406022310257, + "learning_rate": 9.324449246800538e-06, + "loss": 4.9541, + "step": 386 + }, + { + "batch_num_effect_tokens": 8067, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.50219, + "grad_norm": 0.13553744554519653, + "learning_rate": 9.318749256733064e-06, + "loss": 5.0166, + "step": 387 + }, + { + "batch_num_effect_tokens": 8046, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.50349, + "grad_norm": 0.12613889575004578, + "learning_rate": 9.313027078004903e-06, + "loss": 4.8721, + "step": 388 + }, + { + "batch_num_effect_tokens": 7974, + "batch_num_samples": 15, + "batch_num_tokens": 8126, + "epoch": 0.50479, + "grad_norm": 0.1368468850851059, + "learning_rate": 9.307282740015192e-06, + "loss": 5.2559, + "step": 389 + }, + { + "batch_num_effect_tokens": 8008, + "batch_num_samples": 19, + "batch_num_tokens": 8191, + "epoch": 0.50608, + "grad_norm": 0.14101892709732056, + "learning_rate": 9.301516272276907e-06, + "loss": 4.7598, + "step": 390 + }, + { + "batch_num_effect_tokens": 7966, + "batch_num_samples": 17, + "batch_num_tokens": 8101, + "epoch": 0.50738, + "grad_norm": 0.1324855089187622, + "learning_rate": 9.295727704416731e-06, + "loss": 5.0908, + "step": 391 + }, + { + "batch_num_effect_tokens": 7998, + "batch_num_samples": 19, + "batch_num_tokens": 8192, + "epoch": 0.50868, + "grad_norm": 0.13836555182933807, + "learning_rate": 9.289917066174887e-06, + "loss": 4.9092, + "step": 392 + }, + { + "batch_num_effect_tokens": 8033, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.50998, + "grad_norm": 0.1290932595729828, + "learning_rate": 9.284084387404985e-06, + "loss": 5.0156, + "step": 393 + }, + { + "batch_num_effect_tokens": 8009, + "batch_num_samples": 17, + "batch_num_tokens": 8191, + "epoch": 0.51127, + "grad_norm": 0.13208125531673431, + "learning_rate": 9.278229698073889e-06, + "loss": 4.7783, + "step": 394 + }, + { + "batch_num_effect_tokens": 8040, + "batch_num_samples": 16, + "batch_num_tokens": 8188, + "epoch": 0.51257, + "grad_norm": 0.12734545767307281, + "learning_rate": 9.27235302826153e-06, + "loss": 4.6992, + "step": 395 + }, + { + "batch_num_effect_tokens": 8025, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.51387, + "grad_norm": 0.1279255449771881, + "learning_rate": 9.266454408160779e-06, + "loss": 4.3008, + "step": 396 + }, + { + "batch_num_effect_tokens": 8048, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.51517, + "grad_norm": 0.13789525628089905, + "learning_rate": 9.260533868077283e-06, + "loss": 4.7852, + "step": 397 + }, + { + "batch_num_effect_tokens": 7919, + "batch_num_samples": 15, + "batch_num_tokens": 8076, + "epoch": 0.51646, + "grad_norm": 0.12028060853481293, + "learning_rate": 9.254591438429305e-06, + "loss": 4.7539, + "step": 398 + }, + { + "batch_num_effect_tokens": 8069, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.51776, + "grad_norm": 0.12482751905918121, + "learning_rate": 9.248627149747573e-06, + "loss": 4.7402, + "step": 399 + }, + { + "batch_num_effect_tokens": 8016, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.51906, + "grad_norm": 0.13144247233867645, + "learning_rate": 9.242641032675118e-06, + "loss": 4.7803, + "step": 400 + }, + { + "batch_num_effect_tokens": 8056, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.52036, + "grad_norm": 0.13110798597335815, + "learning_rate": 9.236633117967125e-06, + "loss": 4.6787, + "step": 401 + }, + { + "batch_num_effect_tokens": 7975, + "batch_num_samples": 17, + "batch_num_tokens": 8136, + "epoch": 0.52165, + "grad_norm": 0.13168026506900787, + "learning_rate": 9.230603436490764e-06, + "loss": 4.8691, + "step": 402 + }, + { + "batch_num_effect_tokens": 8011, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.52295, + "grad_norm": 0.14049184322357178, + "learning_rate": 9.224552019225044e-06, + "loss": 4.9766, + "step": 403 + }, + { + "batch_num_effect_tokens": 7976, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.52425, + "grad_norm": 0.13541923463344574, + "learning_rate": 9.21847889726064e-06, + "loss": 4.6348, + "step": 404 + }, + { + "batch_num_effect_tokens": 8024, + "batch_num_samples": 15, + "batch_num_tokens": 8158, + "epoch": 0.52555, + "grad_norm": 0.13659295439720154, + "learning_rate": 9.212384101799748e-06, + "loss": 5.1406, + "step": 405 + }, + { + "batch_num_effect_tokens": 8017, + "batch_num_samples": 14, + "batch_num_tokens": 8188, + "epoch": 0.52685, + "grad_norm": 0.1369631290435791, + "learning_rate": 9.206267664155906e-06, + "loss": 4.7207, + "step": 406 + }, + { + "batch_num_effect_tokens": 7976, + "batch_num_samples": 15, + "batch_num_tokens": 8128, + "epoch": 0.52814, + "grad_norm": 0.12341079860925674, + "learning_rate": 9.200129615753858e-06, + "loss": 4.6543, + "step": 407 + }, + { + "batch_num_effect_tokens": 7884, + "batch_num_samples": 20, + "batch_num_tokens": 8080, + "epoch": 0.52944, + "grad_norm": 0.1462087631225586, + "learning_rate": 9.193969988129367e-06, + "loss": 4.8408, + "step": 408 + }, + { + "batch_num_effect_tokens": 7955, + "batch_num_samples": 14, + "batch_num_tokens": 8119, + "epoch": 0.53074, + "grad_norm": 0.136996328830719, + "learning_rate": 9.187788812929074e-06, + "loss": 4.9297, + "step": 409 + }, + { + "batch_num_effect_tokens": 7812, + "batch_num_samples": 32, + "batch_num_tokens": 8076, + "epoch": 0.53204, + "grad_norm": 0.1477639228105545, + "learning_rate": 9.181586121910317e-06, + "loss": 4.9512, + "step": 410 + }, + { + "batch_num_effect_tokens": 7994, + "batch_num_samples": 14, + "batch_num_tokens": 8136, + "epoch": 0.53333, + "grad_norm": 0.13376633822917938, + "learning_rate": 9.175361946940983e-06, + "loss": 4.9346, + "step": 411 + }, + { + "batch_num_effect_tokens": 7993, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.53463, + "grad_norm": 0.12200411409139633, + "learning_rate": 9.169116319999336e-06, + "loss": 4.5762, + "step": 412 + }, + { + "batch_num_effect_tokens": 8054, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.53593, + "grad_norm": 0.14485883712768555, + "learning_rate": 9.162849273173857e-06, + "loss": 4.7148, + "step": 413 + }, + { + "batch_num_effect_tokens": 8020, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.53723, + "grad_norm": 0.12444700300693512, + "learning_rate": 9.156560838663076e-06, + "loss": 4.5879, + "step": 414 + }, + { + "batch_num_effect_tokens": 8043, + "batch_num_samples": 14, + "batch_num_tokens": 8184, + "epoch": 0.53852, + "grad_norm": 0.13118426501750946, + "learning_rate": 9.150251048775403e-06, + "loss": 4.6113, + "step": 415 + }, + { + "batch_num_effect_tokens": 8042, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.53982, + "grad_norm": 0.12771986424922943, + "learning_rate": 9.143919935928975e-06, + "loss": 4.8223, + "step": 416 + }, + { + "batch_num_effect_tokens": 8040, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.54112, + "grad_norm": 0.13158395886421204, + "learning_rate": 9.137567532651477e-06, + "loss": 4.6729, + "step": 417 + }, + { + "batch_num_effect_tokens": 8023, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.54242, + "grad_norm": 0.13526186347007751, + "learning_rate": 9.131193871579975e-06, + "loss": 4.4736, + "step": 418 + }, + { + "batch_num_effect_tokens": 7949, + "batch_num_samples": 17, + "batch_num_tokens": 8100, + "epoch": 0.54371, + "grad_norm": 0.14715400338172913, + "learning_rate": 9.124798985460759e-06, + "loss": 4.917, + "step": 419 + }, + { + "batch_num_effect_tokens": 7918, + "batch_num_samples": 19, + "batch_num_tokens": 8101, + "epoch": 0.54501, + "grad_norm": 0.12297821789979935, + "learning_rate": 9.118382907149164e-06, + "loss": 4.8252, + "step": 420 + }, + { + "batch_num_effect_tokens": 8039, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.54631, + "grad_norm": 0.14057192206382751, + "learning_rate": 9.111945669609408e-06, + "loss": 4.5547, + "step": 421 + }, + { + "batch_num_effect_tokens": 8020, + "batch_num_samples": 17, + "batch_num_tokens": 8181, + "epoch": 0.54761, + "grad_norm": 0.1264130175113678, + "learning_rate": 9.105487305914415e-06, + "loss": 4.6621, + "step": 422 + }, + { + "batch_num_effect_tokens": 7898, + "batch_num_samples": 17, + "batch_num_tokens": 8126, + "epoch": 0.54891, + "grad_norm": 0.12750910222530365, + "learning_rate": 9.099007849245656e-06, + "loss": 4.7354, + "step": 423 + }, + { + "batch_num_effect_tokens": 8061, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.5502, + "grad_norm": 0.12850888073444366, + "learning_rate": 9.092507332892968e-06, + "loss": 4.5928, + "step": 424 + }, + { + "batch_num_effect_tokens": 8037, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.5515, + "grad_norm": 0.1398119032382965, + "learning_rate": 9.08598579025439e-06, + "loss": 5.082, + "step": 425 + }, + { + "batch_num_effect_tokens": 8018, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.5528, + "grad_norm": 0.12561438977718353, + "learning_rate": 9.079443254835987e-06, + "loss": 4.8418, + "step": 426 + }, + { + "batch_num_effect_tokens": 7994, + "batch_num_samples": 16, + "batch_num_tokens": 8177, + "epoch": 0.5541, + "grad_norm": 0.13627532124519348, + "learning_rate": 9.07287976025168e-06, + "loss": 4.748, + "step": 427 + }, + { + "batch_num_effect_tokens": 8048, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.55539, + "grad_norm": 0.1425987184047699, + "learning_rate": 9.066295340223073e-06, + "loss": 4.8652, + "step": 428 + }, + { + "batch_num_effect_tokens": 8053, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.55669, + "grad_norm": 0.12989814579486847, + "learning_rate": 9.059690028579285e-06, + "loss": 4.5225, + "step": 429 + }, + { + "batch_num_effect_tokens": 8028, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.55799, + "grad_norm": 0.12423403561115265, + "learning_rate": 9.05306385925676e-06, + "loss": 4.8164, + "step": 430 + }, + { + "batch_num_effect_tokens": 8042, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.55929, + "grad_norm": 0.1329265534877777, + "learning_rate": 9.04641686629911e-06, + "loss": 4.667, + "step": 431 + }, + { + "batch_num_effect_tokens": 7906, + "batch_num_samples": 14, + "batch_num_tokens": 8086, + "epoch": 0.56058, + "grad_norm": 0.13709376752376556, + "learning_rate": 9.039749083856938e-06, + "loss": 4.6504, + "step": 432 + }, + { + "batch_num_effect_tokens": 8054, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.56188, + "grad_norm": 0.1327197253704071, + "learning_rate": 9.033060546187651e-06, + "loss": 4.9004, + "step": 433 + }, + { + "batch_num_effect_tokens": 7986, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.56318, + "grad_norm": 0.12089107930660248, + "learning_rate": 9.026351287655294e-06, + "loss": 4.6582, + "step": 434 + }, + { + "batch_num_effect_tokens": 8072, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.56448, + "grad_norm": 0.12628640234470367, + "learning_rate": 9.019621342730369e-06, + "loss": 4.7559, + "step": 435 + }, + { + "batch_num_effect_tokens": 8072, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.56577, + "grad_norm": 0.12535719573497772, + "learning_rate": 9.012870745989663e-06, + "loss": 4.7764, + "step": 436 + }, + { + "batch_num_effect_tokens": 8059, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.56707, + "grad_norm": 0.12927192449569702, + "learning_rate": 9.006099532116066e-06, + "loss": 4.6074, + "step": 437 + }, + { + "batch_num_effect_tokens": 7960, + "batch_num_samples": 19, + "batch_num_tokens": 8146, + "epoch": 0.56837, + "grad_norm": 0.12784621119499207, + "learning_rate": 8.999307735898389e-06, + "loss": 4.3076, + "step": 438 + }, + { + "batch_num_effect_tokens": 7991, + "batch_num_samples": 17, + "batch_num_tokens": 8186, + "epoch": 0.56967, + "grad_norm": 0.12837977707386017, + "learning_rate": 8.992495392231195e-06, + "loss": 4.6992, + "step": 439 + }, + { + "batch_num_effect_tokens": 7883, + "batch_num_samples": 21, + "batch_num_tokens": 8091, + "epoch": 0.57097, + "grad_norm": 0.1314186155796051, + "learning_rate": 8.985662536114614e-06, + "loss": 4.6367, + "step": 440 + }, + { + "batch_num_effect_tokens": 7975, + "batch_num_samples": 20, + "batch_num_tokens": 8144, + "epoch": 0.57226, + "grad_norm": 0.13375988602638245, + "learning_rate": 8.978809202654161e-06, + "loss": 4.8691, + "step": 441 + }, + { + "batch_num_effect_tokens": 8024, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.57356, + "grad_norm": 0.1275053322315216, + "learning_rate": 8.971935427060563e-06, + "loss": 4.6514, + "step": 442 + }, + { + "batch_num_effect_tokens": 8027, + "batch_num_samples": 14, + "batch_num_tokens": 8169, + "epoch": 0.57486, + "grad_norm": 0.14429806172847748, + "learning_rate": 8.965041244649572e-06, + "loss": 5.0264, + "step": 443 + }, + { + "batch_num_effect_tokens": 7937, + "batch_num_samples": 18, + "batch_num_tokens": 8093, + "epoch": 0.57616, + "grad_norm": 0.13653282821178436, + "learning_rate": 8.95812669084178e-06, + "loss": 4.5127, + "step": 444 + }, + { + "batch_num_effect_tokens": 7938, + "batch_num_samples": 14, + "batch_num_tokens": 8093, + "epoch": 0.57745, + "grad_norm": 0.1339128315448761, + "learning_rate": 8.951191801162453e-06, + "loss": 4.4707, + "step": 445 + }, + { + "batch_num_effect_tokens": 7966, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.57875, + "grad_norm": 0.12954694032669067, + "learning_rate": 8.944236611241323e-06, + "loss": 4.8291, + "step": 446 + }, + { + "batch_num_effect_tokens": 8035, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.58005, + "grad_norm": 0.13166210055351257, + "learning_rate": 8.937261156812436e-06, + "loss": 4.7471, + "step": 447 + }, + { + "batch_num_effect_tokens": 8030, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.58135, + "grad_norm": 0.12804020941257477, + "learning_rate": 8.930265473713939e-06, + "loss": 4.7012, + "step": 448 + }, + { + "batch_num_effect_tokens": 8041, + "batch_num_samples": 14, + "batch_num_tokens": 8189, + "epoch": 0.58264, + "grad_norm": 0.13141286373138428, + "learning_rate": 8.923249597887913e-06, + "loss": 4.7891, + "step": 449 + }, + { + "batch_num_effect_tokens": 8058, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.58394, + "grad_norm": 0.12595658004283905, + "learning_rate": 8.916213565380188e-06, + "loss": 5.0732, + "step": 450 + }, + { + "batch_num_effect_tokens": 8005, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 0.58524, + "grad_norm": 0.1271679848432541, + "learning_rate": 8.90915741234015e-06, + "loss": 4.5, + "step": 451 + }, + { + "batch_num_effect_tokens": 8034, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.58654, + "grad_norm": 0.13062655925750732, + "learning_rate": 8.902081175020558e-06, + "loss": 4.8711, + "step": 452 + }, + { + "batch_num_effect_tokens": 7999, + "batch_num_samples": 17, + "batch_num_tokens": 8165, + "epoch": 0.58783, + "grad_norm": 0.12481129914522171, + "learning_rate": 8.894984889777365e-06, + "loss": 4.8623, + "step": 453 + }, + { + "batch_num_effect_tokens": 8018, + "batch_num_samples": 14, + "batch_num_tokens": 8146, + "epoch": 0.58913, + "grad_norm": 0.1350601315498352, + "learning_rate": 8.88786859306952e-06, + "loss": 4.916, + "step": 454 + }, + { + "batch_num_effect_tokens": 8020, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.59043, + "grad_norm": 0.12822680175304413, + "learning_rate": 8.880732321458785e-06, + "loss": 4.4541, + "step": 455 + }, + { + "batch_num_effect_tokens": 7981, + "batch_num_samples": 20, + "batch_num_tokens": 8160, + "epoch": 0.59173, + "grad_norm": 0.1304287612438202, + "learning_rate": 8.873576111609552e-06, + "loss": 4.8018, + "step": 456 + }, + { + "batch_num_effect_tokens": 8011, + "batch_num_samples": 14, + "batch_num_tokens": 8155, + "epoch": 0.59303, + "grad_norm": 0.13626275956630707, + "learning_rate": 8.866400000288652e-06, + "loss": 4.9375, + "step": 457 + }, + { + "batch_num_effect_tokens": 8009, + "batch_num_samples": 17, + "batch_num_tokens": 8173, + "epoch": 0.59432, + "grad_norm": 0.13052628934383392, + "learning_rate": 8.85920402436516e-06, + "loss": 4.9404, + "step": 458 + }, + { + "batch_num_effect_tokens": 7932, + "batch_num_samples": 16, + "batch_num_tokens": 8111, + "epoch": 0.59562, + "grad_norm": 0.12803995609283447, + "learning_rate": 8.85198822081021e-06, + "loss": 4.7695, + "step": 459 + }, + { + "batch_num_effect_tokens": 7900, + "batch_num_samples": 15, + "batch_num_tokens": 8074, + "epoch": 0.59692, + "grad_norm": 0.12911297380924225, + "learning_rate": 8.84475262669681e-06, + "loss": 4.6729, + "step": 460 + }, + { + "batch_num_effect_tokens": 8024, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.59822, + "grad_norm": 0.12205401062965393, + "learning_rate": 8.837497279199647e-06, + "loss": 4.5557, + "step": 461 + }, + { + "batch_num_effect_tokens": 7937, + "batch_num_samples": 23, + "batch_num_tokens": 8143, + "epoch": 0.59951, + "grad_norm": 0.1380755603313446, + "learning_rate": 8.83022221559489e-06, + "loss": 5.0176, + "step": 462 + }, + { + "batch_num_effect_tokens": 7937, + "batch_num_samples": 23, + "batch_num_tokens": 8143, + "epoch": 0.59951, + "eval_eval_loss": 0.597096860408783, + "eval_eval_runtime": 114.9903, + "eval_eval_samples_per_second": 43.482, + "eval_eval_steps_per_second": 2.722, + "step": 462 + }, + { + "batch_num_effect_tokens": 8013, + "batch_num_samples": 20, + "batch_num_tokens": 8192, + "epoch": 0.60081, + "grad_norm": 0.12681666016578674, + "learning_rate": 8.822927473260012e-06, + "loss": 4.998, + "step": 463 + }, + { + "batch_num_effect_tokens": 8070, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.60211, + "grad_norm": 0.1322617530822754, + "learning_rate": 8.815613089673584e-06, + "loss": 4.9268, + "step": 464 + }, + { + "batch_num_effect_tokens": 7983, + "batch_num_samples": 14, + "batch_num_tokens": 8135, + "epoch": 0.60341, + "grad_norm": 0.13114838302135468, + "learning_rate": 8.808279102415093e-06, + "loss": 4.6543, + "step": 465 + }, + { + "batch_num_effect_tokens": 7985, + "batch_num_samples": 15, + "batch_num_tokens": 8158, + "epoch": 0.6047, + "grad_norm": 0.12885698676109314, + "learning_rate": 8.800925549164742e-06, + "loss": 4.6309, + "step": 466 + }, + { + "batch_num_effect_tokens": 7946, + "batch_num_samples": 17, + "batch_num_tokens": 8123, + "epoch": 0.606, + "grad_norm": 0.14466021955013275, + "learning_rate": 8.79355246770326e-06, + "loss": 4.5459, + "step": 467 + }, + { + "batch_num_effect_tokens": 8068, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.6073, + "grad_norm": 0.1357675939798355, + "learning_rate": 8.786159895911712e-06, + "loss": 4.873, + "step": 468 + }, + { + "batch_num_effect_tokens": 8008, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.6086, + "grad_norm": 0.1205708235502243, + "learning_rate": 8.778747871771293e-06, + "loss": 4.8271, + "step": 469 + }, + { + "batch_num_effect_tokens": 8034, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.60989, + "grad_norm": 0.137266144156456, + "learning_rate": 8.771316433363139e-06, + "loss": 4.9111, + "step": 470 + }, + { + "batch_num_effect_tokens": 8014, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.61119, + "grad_norm": 0.1284547597169876, + "learning_rate": 8.763865618868136e-06, + "loss": 4.7998, + "step": 471 + }, + { + "batch_num_effect_tokens": 8026, + "batch_num_samples": 14, + "batch_num_tokens": 8191, + "epoch": 0.61249, + "grad_norm": 0.12678979337215424, + "learning_rate": 8.756395466566718e-06, + "loss": 4.707, + "step": 472 + }, + { + "batch_num_effect_tokens": 8008, + "batch_num_samples": 20, + "batch_num_tokens": 8192, + "epoch": 0.61379, + "grad_norm": 0.1220792606472969, + "learning_rate": 8.748906014838672e-06, + "loss": 4.6953, + "step": 473 + }, + { + "batch_num_effect_tokens": 7918, + "batch_num_samples": 19, + "batch_num_tokens": 8086, + "epoch": 0.61509, + "grad_norm": 0.1242067739367485, + "learning_rate": 8.74139730216294e-06, + "loss": 4.7539, + "step": 474 + }, + { + "batch_num_effect_tokens": 8079, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.61638, + "grad_norm": 0.1328115165233612, + "learning_rate": 8.73386936711742e-06, + "loss": 4.8418, + "step": 475 + }, + { + "batch_num_effect_tokens": 7996, + "batch_num_samples": 19, + "batch_num_tokens": 8180, + "epoch": 0.61768, + "grad_norm": 0.1341410130262375, + "learning_rate": 8.726322248378775e-06, + "loss": 4.6699, + "step": 476 + }, + { + "batch_num_effect_tokens": 8045, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.61898, + "grad_norm": 0.13784657418727875, + "learning_rate": 8.718755984722224e-06, + "loss": 4.7334, + "step": 477 + }, + { + "batch_num_effect_tokens": 7959, + "batch_num_samples": 24, + "batch_num_tokens": 8168, + "epoch": 0.62028, + "grad_norm": 0.1370074301958084, + "learning_rate": 8.71117061502135e-06, + "loss": 4.7334, + "step": 478 + }, + { + "batch_num_effect_tokens": 7899, + "batch_num_samples": 16, + "batch_num_tokens": 8078, + "epoch": 0.62157, + "grad_norm": 0.12483939528465271, + "learning_rate": 8.7035661782479e-06, + "loss": 4.6084, + "step": 479 + }, + { + "batch_num_effect_tokens": 8033, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.62287, + "grad_norm": 0.14714112877845764, + "learning_rate": 8.695942713471578e-06, + "loss": 5.0137, + "step": 480 + }, + { + "batch_num_effect_tokens": 8063, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.62417, + "grad_norm": 0.12480289489030838, + "learning_rate": 8.688300259859855e-06, + "loss": 4.5625, + "step": 481 + }, + { + "batch_num_effect_tokens": 7996, + "batch_num_samples": 16, + "batch_num_tokens": 8177, + "epoch": 0.62547, + "grad_norm": 0.132341668009758, + "learning_rate": 8.680638856677754e-06, + "loss": 4.8096, + "step": 482 + }, + { + "batch_num_effect_tokens": 7995, + "batch_num_samples": 14, + "batch_num_tokens": 8190, + "epoch": 0.62676, + "grad_norm": 0.1296975016593933, + "learning_rate": 8.672958543287666e-06, + "loss": 4.499, + "step": 483 + }, + { + "batch_num_effect_tokens": 8000, + "batch_num_samples": 15, + "batch_num_tokens": 8146, + "epoch": 0.62806, + "grad_norm": 0.12684115767478943, + "learning_rate": 8.665259359149132e-06, + "loss": 4.9092, + "step": 484 + }, + { + "batch_num_effect_tokens": 7810, + "batch_num_samples": 17, + "batch_num_tokens": 7989, + "epoch": 0.62936, + "grad_norm": 0.12434987723827362, + "learning_rate": 8.657541343818646e-06, + "loss": 4.5098, + "step": 485 + }, + { + "batch_num_effect_tokens": 8021, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.63066, + "grad_norm": 0.13358746469020844, + "learning_rate": 8.649804536949453e-06, + "loss": 4.875, + "step": 486 + }, + { + "batch_num_effect_tokens": 7944, + "batch_num_samples": 25, + "batch_num_tokens": 8156, + "epoch": 0.63195, + "grad_norm": 0.13076664507389069, + "learning_rate": 8.642048978291347e-06, + "loss": 4.8301, + "step": 487 + }, + { + "batch_num_effect_tokens": 8046, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.63325, + "grad_norm": 0.12482704967260361, + "learning_rate": 8.634274707690458e-06, + "loss": 5.208, + "step": 488 + }, + { + "batch_num_effect_tokens": 7996, + "batch_num_samples": 23, + "batch_num_tokens": 8192, + "epoch": 0.63455, + "grad_norm": 0.13764898478984833, + "learning_rate": 8.626481765089058e-06, + "loss": 5.4395, + "step": 489 + }, + { + "batch_num_effect_tokens": 8015, + "batch_num_samples": 14, + "batch_num_tokens": 8155, + "epoch": 0.63585, + "grad_norm": 0.12434025853872299, + "learning_rate": 8.61867019052535e-06, + "loss": 4.8242, + "step": 490 + }, + { + "batch_num_effect_tokens": 8027, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.63715, + "grad_norm": 0.13372331857681274, + "learning_rate": 8.610840024133266e-06, + "loss": 4.9395, + "step": 491 + }, + { + "batch_num_effect_tokens": 8045, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.63844, + "grad_norm": 0.12320306152105331, + "learning_rate": 8.602991306142252e-06, + "loss": 4.4512, + "step": 492 + }, + { + "batch_num_effect_tokens": 7924, + "batch_num_samples": 16, + "batch_num_tokens": 8100, + "epoch": 0.63974, + "grad_norm": 0.13152460753917694, + "learning_rate": 8.595124076877074e-06, + "loss": 4.8301, + "step": 493 + }, + { + "batch_num_effect_tokens": 8029, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.64104, + "grad_norm": 0.1355685591697693, + "learning_rate": 8.587238376757597e-06, + "loss": 4.7451, + "step": 494 + }, + { + "batch_num_effect_tokens": 7968, + "batch_num_samples": 15, + "batch_num_tokens": 8100, + "epoch": 0.64234, + "grad_norm": 0.1284026801586151, + "learning_rate": 8.579334246298593e-06, + "loss": 4.9092, + "step": 495 + }, + { + "batch_num_effect_tokens": 7918, + "batch_num_samples": 25, + "batch_num_tokens": 8130, + "epoch": 0.64363, + "grad_norm": 0.13228504359722137, + "learning_rate": 8.571411726109518e-06, + "loss": 4.5225, + "step": 496 + }, + { + "batch_num_effect_tokens": 8025, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.64493, + "grad_norm": 0.14529111981391907, + "learning_rate": 8.563470856894316e-06, + "loss": 4.9707, + "step": 497 + }, + { + "batch_num_effect_tokens": 7998, + "batch_num_samples": 21, + "batch_num_tokens": 8191, + "epoch": 0.64623, + "grad_norm": 0.13738293945789337, + "learning_rate": 8.555511679451197e-06, + "loss": 4.6738, + "step": 498 + }, + { + "batch_num_effect_tokens": 8020, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.64753, + "grad_norm": 0.13121268153190613, + "learning_rate": 8.547534234672435e-06, + "loss": 4.4697, + "step": 499 + }, + { + "batch_num_effect_tokens": 8012, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.64882, + "grad_norm": 0.1251576542854309, + "learning_rate": 8.539538563544165e-06, + "loss": 4.7764, + "step": 500 + }, + { + "batch_num_effect_tokens": 8033, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.65012, + "grad_norm": 0.13326169550418854, + "learning_rate": 8.531524707146154e-06, + "loss": 4.6836, + "step": 501 + }, + { + "batch_num_effect_tokens": 7931, + "batch_num_samples": 24, + "batch_num_tokens": 8144, + "epoch": 0.65142, + "grad_norm": 0.13999317586421967, + "learning_rate": 8.523492706651607e-06, + "loss": 4.9355, + "step": 502 + }, + { + "batch_num_effect_tokens": 8051, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.65272, + "grad_norm": 0.12973648309707642, + "learning_rate": 8.515442603326948e-06, + "loss": 4.7969, + "step": 503 + }, + { + "batch_num_effect_tokens": 8028, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.65401, + "grad_norm": 0.1382308453321457, + "learning_rate": 8.507374438531606e-06, + "loss": 5.1699, + "step": 504 + }, + { + "batch_num_effect_tokens": 7991, + "batch_num_samples": 15, + "batch_num_tokens": 8152, + "epoch": 0.65531, + "grad_norm": 0.12315783649682999, + "learning_rate": 8.49928825371781e-06, + "loss": 4.8672, + "step": 505 + }, + { + "batch_num_effect_tokens": 8040, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.65661, + "grad_norm": 0.14307758212089539, + "learning_rate": 8.491184090430365e-06, + "loss": 4.5527, + "step": 506 + }, + { + "batch_num_effect_tokens": 7951, + "batch_num_samples": 14, + "batch_num_tokens": 8091, + "epoch": 0.65791, + "grad_norm": 0.12455622851848602, + "learning_rate": 8.483061990306451e-06, + "loss": 4.9229, + "step": 507 + }, + { + "batch_num_effect_tokens": 7904, + "batch_num_samples": 18, + "batch_num_tokens": 8086, + "epoch": 0.65921, + "grad_norm": 0.13688203692436218, + "learning_rate": 8.474921995075399e-06, + "loss": 4.5957, + "step": 508 + }, + { + "batch_num_effect_tokens": 7943, + "batch_num_samples": 16, + "batch_num_tokens": 8100, + "epoch": 0.6605, + "grad_norm": 0.1287027895450592, + "learning_rate": 8.466764146558482e-06, + "loss": 4.9189, + "step": 509 + }, + { + "batch_num_effect_tokens": 8075, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.6618, + "grad_norm": 0.12565134465694427, + "learning_rate": 8.4585884866687e-06, + "loss": 5.2285, + "step": 510 + }, + { + "batch_num_effect_tokens": 7930, + "batch_num_samples": 16, + "batch_num_tokens": 8089, + "epoch": 0.6631, + "grad_norm": 0.11928825825452805, + "learning_rate": 8.450395057410561e-06, + "loss": 4.7051, + "step": 511 + }, + { + "batch_num_effect_tokens": 8075, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.6644, + "grad_norm": 0.1257006675004959, + "learning_rate": 8.44218390087987e-06, + "loss": 4.7148, + "step": 512 + }, + { + "batch_num_effect_tokens": 8003, + "batch_num_samples": 19, + "batch_num_tokens": 8192, + "epoch": 0.66569, + "grad_norm": 0.14077560603618622, + "learning_rate": 8.433955059263508e-06, + "loss": 4.8691, + "step": 513 + }, + { + "batch_num_effect_tokens": 8065, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.66699, + "grad_norm": 0.1343725621700287, + "learning_rate": 8.425708574839221e-06, + "loss": 4.8486, + "step": 514 + }, + { + "batch_num_effect_tokens": 8054, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.66829, + "grad_norm": 0.1511705070734024, + "learning_rate": 8.417444489975396e-06, + "loss": 4.584, + "step": 515 + }, + { + "batch_num_effect_tokens": 8077, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.66959, + "grad_norm": 0.15018604695796967, + "learning_rate": 8.409162847130847e-06, + "loss": 5.0859, + "step": 516 + }, + { + "batch_num_effect_tokens": 8006, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.67088, + "grad_norm": 0.13522522151470184, + "learning_rate": 8.400863688854598e-06, + "loss": 4.9492, + "step": 517 + }, + { + "batch_num_effect_tokens": 7935, + "batch_num_samples": 14, + "batch_num_tokens": 8092, + "epoch": 0.67218, + "grad_norm": 0.1357102394104004, + "learning_rate": 8.392547057785662e-06, + "loss": 4.4229, + "step": 518 + }, + { + "batch_num_effect_tokens": 7896, + "batch_num_samples": 26, + "batch_num_tokens": 8140, + "epoch": 0.67348, + "grad_norm": 0.12707726657390594, + "learning_rate": 8.384212996652823e-06, + "loss": 4.5303, + "step": 519 + }, + { + "batch_num_effect_tokens": 7996, + "batch_num_samples": 14, + "batch_num_tokens": 8135, + "epoch": 0.67478, + "grad_norm": 0.127869114279747, + "learning_rate": 8.375861548274417e-06, + "loss": 4.6426, + "step": 520 + }, + { + "batch_num_effect_tokens": 7982, + "batch_num_samples": 24, + "batch_num_tokens": 8192, + "epoch": 0.67607, + "grad_norm": 0.13051429390907288, + "learning_rate": 8.367492755558111e-06, + "loss": 4.9316, + "step": 521 + }, + { + "batch_num_effect_tokens": 7906, + "batch_num_samples": 20, + "batch_num_tokens": 8112, + "epoch": 0.67737, + "grad_norm": 0.13462385535240173, + "learning_rate": 8.359106661500683e-06, + "loss": 4.7568, + "step": 522 + }, + { + "batch_num_effect_tokens": 8046, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.67867, + "grad_norm": 0.1249893382191658, + "learning_rate": 8.3507033091878e-06, + "loss": 4.5117, + "step": 523 + }, + { + "batch_num_effect_tokens": 8012, + "batch_num_samples": 14, + "batch_num_tokens": 8168, + "epoch": 0.67997, + "grad_norm": 0.13060161471366882, + "learning_rate": 8.342282741793797e-06, + "loss": 4.8574, + "step": 524 + }, + { + "batch_num_effect_tokens": 7949, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 0.68127, + "grad_norm": 0.11890089511871338, + "learning_rate": 8.33384500258146e-06, + "loss": 4.6885, + "step": 525 + }, + { + "batch_num_effect_tokens": 7939, + "batch_num_samples": 14, + "batch_num_tokens": 8076, + "epoch": 0.68256, + "grad_norm": 0.12186378985643387, + "learning_rate": 8.325390134901794e-06, + "loss": 4.4736, + "step": 526 + }, + { + "batch_num_effect_tokens": 7961, + "batch_num_samples": 17, + "batch_num_tokens": 8118, + "epoch": 0.68386, + "grad_norm": 0.140080064535141, + "learning_rate": 8.316918182193811e-06, + "loss": 4.8838, + "step": 527 + }, + { + "batch_num_effect_tokens": 7969, + "batch_num_samples": 21, + "batch_num_tokens": 8176, + "epoch": 0.68516, + "grad_norm": 0.1309884935617447, + "learning_rate": 8.308429187984298e-06, + "loss": 4.8018, + "step": 528 + }, + { + "batch_num_effect_tokens": 7935, + "batch_num_samples": 14, + "batch_num_tokens": 8149, + "epoch": 0.68646, + "grad_norm": 0.1284397542476654, + "learning_rate": 8.299923195887599e-06, + "loss": 4.4141, + "step": 529 + }, + { + "batch_num_effect_tokens": 7875, + "batch_num_samples": 15, + "batch_num_tokens": 8074, + "epoch": 0.68775, + "grad_norm": 0.12711189687252045, + "learning_rate": 8.291400249605387e-06, + "loss": 4.6455, + "step": 530 + }, + { + "batch_num_effect_tokens": 8020, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.68905, + "grad_norm": 0.13765956461429596, + "learning_rate": 8.282860392926442e-06, + "loss": 4.4688, + "step": 531 + }, + { + "batch_num_effect_tokens": 7934, + "batch_num_samples": 22, + "batch_num_tokens": 8146, + "epoch": 0.69035, + "grad_norm": 0.13526172935962677, + "learning_rate": 8.274303669726427e-06, + "loss": 4.6934, + "step": 532 + }, + { + "batch_num_effect_tokens": 7921, + "batch_num_samples": 20, + "batch_num_tokens": 8160, + "epoch": 0.69165, + "grad_norm": 0.13791659474372864, + "learning_rate": 8.26573012396766e-06, + "loss": 4.8594, + "step": 533 + }, + { + "batch_num_effect_tokens": 8051, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.69294, + "grad_norm": 0.1339079588651657, + "learning_rate": 8.257139799698887e-06, + "loss": 5.1318, + "step": 534 + }, + { + "batch_num_effect_tokens": 7988, + "batch_num_samples": 14, + "batch_num_tokens": 8120, + "epoch": 0.69424, + "grad_norm": 0.13170425593852997, + "learning_rate": 8.248532741055061e-06, + "loss": 4.5645, + "step": 535 + }, + { + "batch_num_effect_tokens": 7932, + "batch_num_samples": 18, + "batch_num_tokens": 8100, + "epoch": 0.69554, + "grad_norm": 0.1507280021905899, + "learning_rate": 8.239908992257114e-06, + "loss": 4.7578, + "step": 536 + }, + { + "batch_num_effect_tokens": 8014, + "batch_num_samples": 19, + "batch_num_tokens": 8192, + "epoch": 0.69684, + "grad_norm": 0.13211563229560852, + "learning_rate": 8.231268597611722e-06, + "loss": 5.0664, + "step": 537 + }, + { + "batch_num_effect_tokens": 8060, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.69813, + "grad_norm": 0.135163813829422, + "learning_rate": 8.222611601511084e-06, + "loss": 4.5693, + "step": 538 + }, + { + "batch_num_effect_tokens": 7743, + "batch_num_samples": 29, + "batch_num_tokens": 8007, + "epoch": 0.69943, + "grad_norm": 0.13044171035289764, + "learning_rate": 8.213938048432697e-06, + "loss": 4.8115, + "step": 539 + }, + { + "batch_num_effect_tokens": 8056, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.70073, + "grad_norm": 0.13288696110248566, + "learning_rate": 8.205247982939124e-06, + "loss": 4.7236, + "step": 540 + }, + { + "batch_num_effect_tokens": 8062, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.70203, + "grad_norm": 0.12817522883415222, + "learning_rate": 8.196541449677758e-06, + "loss": 4.7334, + "step": 541 + }, + { + "batch_num_effect_tokens": 8032, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.70333, + "grad_norm": 0.1278115212917328, + "learning_rate": 8.187818493380607e-06, + "loss": 4.6318, + "step": 542 + }, + { + "batch_num_effect_tokens": 8031, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.70462, + "grad_norm": 0.12950873374938965, + "learning_rate": 8.179079158864053e-06, + "loss": 4.8809, + "step": 543 + }, + { + "batch_num_effect_tokens": 7919, + "batch_num_samples": 14, + "batch_num_tokens": 8083, + "epoch": 0.70592, + "grad_norm": 0.1247512623667717, + "learning_rate": 8.170323491028625e-06, + "loss": 4.9658, + "step": 544 + }, + { + "batch_num_effect_tokens": 8047, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.70722, + "grad_norm": 0.12909561395645142, + "learning_rate": 8.161551534858767e-06, + "loss": 4.7041, + "step": 545 + }, + { + "batch_num_effect_tokens": 7979, + "batch_num_samples": 14, + "batch_num_tokens": 8146, + "epoch": 0.70852, + "grad_norm": 0.12214743345975876, + "learning_rate": 8.152763335422612e-06, + "loss": 5.0234, + "step": 546 + }, + { + "batch_num_effect_tokens": 8029, + "batch_num_samples": 19, + "batch_num_tokens": 8192, + "epoch": 0.70981, + "grad_norm": 0.12689444422721863, + "learning_rate": 8.143958937871748e-06, + "loss": 4.8711, + "step": 547 + }, + { + "batch_num_effect_tokens": 7902, + "batch_num_samples": 15, + "batch_num_tokens": 8080, + "epoch": 0.71111, + "grad_norm": 0.12359411269426346, + "learning_rate": 8.135138387440978e-06, + "loss": 4.877, + "step": 548 + }, + { + "batch_num_effect_tokens": 7921, + "batch_num_samples": 19, + "batch_num_tokens": 8086, + "epoch": 0.71241, + "grad_norm": 0.1347743421792984, + "learning_rate": 8.126301729448101e-06, + "loss": 4.8076, + "step": 549 + }, + { + "batch_num_effect_tokens": 8059, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.71371, + "grad_norm": 0.12478054314851761, + "learning_rate": 8.117449009293668e-06, + "loss": 4.6523, + "step": 550 + }, + { + "batch_num_effect_tokens": 8038, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.715, + "grad_norm": 0.12902334332466125, + "learning_rate": 8.108580272460759e-06, + "loss": 4.6719, + "step": 551 + }, + { + "batch_num_effect_tokens": 8047, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.7163, + "grad_norm": 0.1194981262087822, + "learning_rate": 8.099695564514738e-06, + "loss": 4.6465, + "step": 552 + }, + { + "batch_num_effect_tokens": 8058, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.7176, + "grad_norm": 0.12877826392650604, + "learning_rate": 8.090794931103026e-06, + "loss": 4.7939, + "step": 553 + }, + { + "batch_num_effect_tokens": 8069, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.7189, + "grad_norm": 0.13682805001735687, + "learning_rate": 8.08187841795487e-06, + "loss": 4.7344, + "step": 554 + }, + { + "batch_num_effect_tokens": 8019, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.72019, + "grad_norm": 0.13331077992916107, + "learning_rate": 8.072946070881095e-06, + "loss": 5.084, + "step": 555 + }, + { + "batch_num_effect_tokens": 7942, + "batch_num_samples": 14, + "batch_num_tokens": 8149, + "epoch": 0.72149, + "grad_norm": 0.13391903042793274, + "learning_rate": 8.063997935773885e-06, + "loss": 4.6699, + "step": 556 + }, + { + "batch_num_effect_tokens": 7917, + "batch_num_samples": 16, + "batch_num_tokens": 8078, + "epoch": 0.72279, + "grad_norm": 0.1353442519903183, + "learning_rate": 8.055034058606533e-06, + "loss": 4.7354, + "step": 557 + }, + { + "batch_num_effect_tokens": 8003, + "batch_num_samples": 14, + "batch_num_tokens": 8146, + "epoch": 0.72409, + "grad_norm": 0.13279181718826294, + "learning_rate": 8.046054485433211e-06, + "loss": 4.7617, + "step": 558 + }, + { + "batch_num_effect_tokens": 8015, + "batch_num_samples": 17, + "batch_num_tokens": 8165, + "epoch": 0.72539, + "grad_norm": 0.13407373428344727, + "learning_rate": 8.03705926238874e-06, + "loss": 4.667, + "step": 559 + }, + { + "batch_num_effect_tokens": 8059, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.72668, + "grad_norm": 0.12700553238391876, + "learning_rate": 8.028048435688333e-06, + "loss": 4.4395, + "step": 560 + }, + { + "batch_num_effect_tokens": 8022, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.72798, + "grad_norm": 0.1360039860010147, + "learning_rate": 8.019022051627387e-06, + "loss": 4.7686, + "step": 561 + }, + { + "batch_num_effect_tokens": 7992, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.72928, + "grad_norm": 0.121078722178936, + "learning_rate": 8.009980156581218e-06, + "loss": 4.6289, + "step": 562 + }, + { + "batch_num_effect_tokens": 7946, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.73058, + "grad_norm": 0.12270597368478775, + "learning_rate": 8.000922797004835e-06, + "loss": 4.5605, + "step": 563 + }, + { + "batch_num_effect_tokens": 8048, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.73187, + "grad_norm": 0.13452237844467163, + "learning_rate": 7.991850019432701e-06, + "loss": 4.6885, + "step": 564 + }, + { + "batch_num_effect_tokens": 8020, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.73317, + "grad_norm": 0.12567798793315887, + "learning_rate": 7.982761870478495e-06, + "loss": 4.8379, + "step": 565 + }, + { + "batch_num_effect_tokens": 7995, + "batch_num_samples": 22, + "batch_num_tokens": 8192, + "epoch": 0.73447, + "grad_norm": 0.1310405284166336, + "learning_rate": 7.973658396834868e-06, + "loss": 4.6504, + "step": 566 + }, + { + "batch_num_effect_tokens": 7962, + "batch_num_samples": 22, + "batch_num_tokens": 8192, + "epoch": 0.73577, + "grad_norm": 0.13182175159454346, + "learning_rate": 7.964539645273204e-06, + "loss": 4.7881, + "step": 567 + }, + { + "batch_num_effect_tokens": 8022, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.73706, + "grad_norm": 0.12408516556024551, + "learning_rate": 7.955405662643384e-06, + "loss": 4.458, + "step": 568 + }, + { + "batch_num_effect_tokens": 8038, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.73836, + "grad_norm": 0.12731651961803436, + "learning_rate": 7.946256495873542e-06, + "loss": 5.0205, + "step": 569 + }, + { + "batch_num_effect_tokens": 8067, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.73966, + "grad_norm": 0.13854657113552094, + "learning_rate": 7.937092191969821e-06, + "loss": 4.6074, + "step": 570 + }, + { + "batch_num_effect_tokens": 7958, + "batch_num_samples": 21, + "batch_num_tokens": 8192, + "epoch": 0.74096, + "grad_norm": 0.127966970205307, + "learning_rate": 7.927912798016144e-06, + "loss": 5.0039, + "step": 571 + }, + { + "batch_num_effect_tokens": 8051, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.74225, + "grad_norm": 0.1232946589589119, + "learning_rate": 7.918718361173951e-06, + "loss": 4.749, + "step": 572 + }, + { + "batch_num_effect_tokens": 7932, + "batch_num_samples": 23, + "batch_num_tokens": 8181, + "epoch": 0.74355, + "grad_norm": 0.12766174972057343, + "learning_rate": 7.909508928681975e-06, + "loss": 5.0156, + "step": 573 + }, + { + "batch_num_effect_tokens": 8053, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.74485, + "grad_norm": 0.1258310079574585, + "learning_rate": 7.900284547855992e-06, + "loss": 4.4893, + "step": 574 + }, + { + "batch_num_effect_tokens": 8059, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.74615, + "grad_norm": 0.13721963763237, + "learning_rate": 7.89104526608858e-06, + "loss": 4.7783, + "step": 575 + }, + { + "batch_num_effect_tokens": 8019, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.74745, + "grad_norm": 0.11561396718025208, + "learning_rate": 7.881791130848872e-06, + "loss": 4.6162, + "step": 576 + }, + { + "batch_num_effect_tokens": 8033, + "batch_num_samples": 17, + "batch_num_tokens": 8191, + "epoch": 0.74874, + "grad_norm": 0.11618711799383163, + "learning_rate": 7.872522189682318e-06, + "loss": 4.541, + "step": 577 + }, + { + "batch_num_effect_tokens": 8046, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.75004, + "grad_norm": 0.1265118569135666, + "learning_rate": 7.863238490210432e-06, + "loss": 4.6934, + "step": 578 + }, + { + "batch_num_effect_tokens": 8046, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.75134, + "grad_norm": 0.11919250339269638, + "learning_rate": 7.853940080130556e-06, + "loss": 4.4326, + "step": 579 + }, + { + "batch_num_effect_tokens": 7886, + "batch_num_samples": 27, + "batch_num_tokens": 8110, + "epoch": 0.75264, + "grad_norm": 0.13348767161369324, + "learning_rate": 7.844627007215613e-06, + "loss": 4.9668, + "step": 580 + }, + { + "batch_num_effect_tokens": 8026, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.75393, + "grad_norm": 0.12527728080749512, + "learning_rate": 7.835299319313854e-06, + "loss": 4.8496, + "step": 581 + }, + { + "batch_num_effect_tokens": 7946, + "batch_num_samples": 15, + "batch_num_tokens": 8076, + "epoch": 0.75523, + "grad_norm": 0.12182778120040894, + "learning_rate": 7.825957064348625e-06, + "loss": 4.6016, + "step": 582 + }, + { + "batch_num_effect_tokens": 8000, + "batch_num_samples": 17, + "batch_num_tokens": 8165, + "epoch": 0.75653, + "grad_norm": 0.11762472242116928, + "learning_rate": 7.81660029031811e-06, + "loss": 4.6768, + "step": 583 + }, + { + "batch_num_effect_tokens": 8014, + "batch_num_samples": 15, + "batch_num_tokens": 8176, + "epoch": 0.75783, + "grad_norm": 0.1293763965368271, + "learning_rate": 7.80722904529509e-06, + "loss": 4.7266, + "step": 584 + }, + { + "batch_num_effect_tokens": 7942, + "batch_num_samples": 23, + "batch_num_tokens": 8191, + "epoch": 0.75912, + "grad_norm": 0.12573941051959991, + "learning_rate": 7.797843377426693e-06, + "loss": 4.6338, + "step": 585 + }, + { + "batch_num_effect_tokens": 8037, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.76042, + "grad_norm": 0.1293158084154129, + "learning_rate": 7.788443334934148e-06, + "loss": 4.5762, + "step": 586 + }, + { + "batch_num_effect_tokens": 7907, + "batch_num_samples": 15, + "batch_num_tokens": 8098, + "epoch": 0.76172, + "grad_norm": 0.14639928936958313, + "learning_rate": 7.779028966112538e-06, + "loss": 5.0459, + "step": 587 + }, + { + "batch_num_effect_tokens": 7981, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.76302, + "grad_norm": 0.1278136819601059, + "learning_rate": 7.769600319330553e-06, + "loss": 4.585, + "step": 588 + }, + { + "batch_num_effect_tokens": 8017, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.76431, + "grad_norm": 0.12693235278129578, + "learning_rate": 7.760157443030234e-06, + "loss": 4.7744, + "step": 589 + }, + { + "batch_num_effect_tokens": 8044, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.76561, + "grad_norm": 0.11962468177080154, + "learning_rate": 7.750700385726736e-06, + "loss": 4.6338, + "step": 590 + }, + { + "batch_num_effect_tokens": 8036, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.76691, + "grad_norm": 0.13058899343013763, + "learning_rate": 7.741229196008068e-06, + "loss": 4.9893, + "step": 591 + }, + { + "batch_num_effect_tokens": 7970, + "batch_num_samples": 14, + "batch_num_tokens": 8123, + "epoch": 0.76821, + "grad_norm": 0.1270352005958557, + "learning_rate": 7.731743922534854e-06, + "loss": 4.5371, + "step": 592 + }, + { + "batch_num_effect_tokens": 8059, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.76951, + "grad_norm": 0.1201217994093895, + "learning_rate": 7.722244614040068e-06, + "loss": 4.3867, + "step": 593 + }, + { + "batch_num_effect_tokens": 7950, + "batch_num_samples": 16, + "batch_num_tokens": 8122, + "epoch": 0.7708, + "grad_norm": 0.11798641830682755, + "learning_rate": 7.712731319328798e-06, + "loss": 4.873, + "step": 594 + }, + { + "batch_num_effect_tokens": 8027, + "batch_num_samples": 17, + "batch_num_tokens": 8187, + "epoch": 0.7721, + "grad_norm": 0.1351042240858078, + "learning_rate": 7.703204087277989e-06, + "loss": 4.4766, + "step": 595 + }, + { + "batch_num_effect_tokens": 8027, + "batch_num_samples": 14, + "batch_num_tokens": 8189, + "epoch": 0.7734, + "grad_norm": 0.12383313477039337, + "learning_rate": 7.693662966836191e-06, + "loss": 4.6631, + "step": 596 + }, + { + "batch_num_effect_tokens": 7880, + "batch_num_samples": 17, + "batch_num_tokens": 8062, + "epoch": 0.7747, + "grad_norm": 0.12391753494739532, + "learning_rate": 7.684108007023313e-06, + "loss": 4.5283, + "step": 597 + }, + { + "batch_num_effect_tokens": 7995, + "batch_num_samples": 22, + "batch_num_tokens": 8192, + "epoch": 0.77599, + "grad_norm": 0.12056957185268402, + "learning_rate": 7.674539256930364e-06, + "loss": 4.5322, + "step": 598 + }, + { + "batch_num_effect_tokens": 7999, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.77729, + "grad_norm": 0.11704017966985703, + "learning_rate": 7.6649567657192e-06, + "loss": 4.6699, + "step": 599 + }, + { + "batch_num_effect_tokens": 8011, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.77859, + "grad_norm": 0.12692949175834656, + "learning_rate": 7.655360582622287e-06, + "loss": 4.5049, + "step": 600 + }, + { + "batch_num_effect_tokens": 8021, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.77989, + "grad_norm": 0.1289125233888626, + "learning_rate": 7.645750756942425e-06, + "loss": 4.8818, + "step": 601 + }, + { + "batch_num_effect_tokens": 7924, + "batch_num_samples": 16, + "batch_num_tokens": 8078, + "epoch": 0.78118, + "grad_norm": 0.12068326771259308, + "learning_rate": 7.636127338052513e-06, + "loss": 4.7988, + "step": 602 + }, + { + "batch_num_effect_tokens": 8053, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.78248, + "grad_norm": 0.1243966817855835, + "learning_rate": 7.626490375395286e-06, + "loss": 4.6328, + "step": 603 + }, + { + "batch_num_effect_tokens": 8045, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.78378, + "grad_norm": 0.12233472615480423, + "learning_rate": 7.616839918483061e-06, + "loss": 4.5869, + "step": 604 + }, + { + "batch_num_effect_tokens": 8008, + "batch_num_samples": 16, + "batch_num_tokens": 8166, + "epoch": 0.78508, + "grad_norm": 0.11933887749910355, + "learning_rate": 7.607176016897491e-06, + "loss": 4.7559, + "step": 605 + }, + { + "batch_num_effect_tokens": 7950, + "batch_num_samples": 15, + "batch_num_tokens": 8116, + "epoch": 0.78637, + "grad_norm": 0.11429915577173233, + "learning_rate": 7.597498720289302e-06, + "loss": 4.4414, + "step": 606 + }, + { + "batch_num_effect_tokens": 8041, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.78767, + "grad_norm": 0.12575142085552216, + "learning_rate": 7.587808078378036e-06, + "loss": 4.5176, + "step": 607 + }, + { + "batch_num_effect_tokens": 7824, + "batch_num_samples": 29, + "batch_num_tokens": 8092, + "epoch": 0.78897, + "grad_norm": 0.1377941220998764, + "learning_rate": 7.578104140951806e-06, + "loss": 4.6582, + "step": 608 + }, + { + "batch_num_effect_tokens": 7923, + "batch_num_samples": 18, + "batch_num_tokens": 8081, + "epoch": 0.79027, + "grad_norm": 0.12251248210668564, + "learning_rate": 7.568386957867033e-06, + "loss": 4.5859, + "step": 609 + }, + { + "batch_num_effect_tokens": 8047, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.79157, + "grad_norm": 0.12789161503314972, + "learning_rate": 7.5586565790481855e-06, + "loss": 4.7432, + "step": 610 + }, + { + "batch_num_effect_tokens": 8042, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.79286, + "grad_norm": 0.14461970329284668, + "learning_rate": 7.548913054487537e-06, + "loss": 4.7646, + "step": 611 + }, + { + "batch_num_effect_tokens": 8057, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.79416, + "grad_norm": 0.12491682916879654, + "learning_rate": 7.539156434244892e-06, + "loss": 4.6553, + "step": 612 + }, + { + "batch_num_effect_tokens": 7984, + "batch_num_samples": 17, + "batch_num_tokens": 8178, + "epoch": 0.79546, + "grad_norm": 0.12910866737365723, + "learning_rate": 7.529386768447342e-06, + "loss": 4.9033, + "step": 613 + }, + { + "batch_num_effect_tokens": 8033, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.79676, + "grad_norm": 0.1255904883146286, + "learning_rate": 7.519604107289004e-06, + "loss": 4.7559, + "step": 614 + }, + { + "batch_num_effect_tokens": 8073, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.79805, + "grad_norm": 0.12721942365169525, + "learning_rate": 7.50980850103076e-06, + "loss": 4.5977, + "step": 615 + }, + { + "batch_num_effect_tokens": 8052, + "batch_num_samples": 14, + "batch_num_tokens": 8191, + "epoch": 0.79935, + "grad_norm": 0.12462284415960312, + "learning_rate": 7.500000000000001e-06, + "loss": 4.3994, + "step": 616 + }, + { + "batch_num_effect_tokens": 8052, + "batch_num_samples": 14, + "batch_num_tokens": 8191, + "epoch": 0.79935, + "eval_eval_loss": 0.5898093581199646, + "eval_eval_runtime": 115.3418, + "eval_eval_samples_per_second": 43.349, + "eval_eval_steps_per_second": 2.714, + "step": 616 + }, + { + "batch_num_effect_tokens": 8079, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.80065, + "grad_norm": 0.12710371613502502, + "learning_rate": 7.490178654590367e-06, + "loss": 4.9082, + "step": 617 + }, + { + "batch_num_effect_tokens": 8061, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.80195, + "grad_norm": 0.12795433402061462, + "learning_rate": 7.480344515261495e-06, + "loss": 4.6973, + "step": 618 + }, + { + "batch_num_effect_tokens": 8038, + "batch_num_samples": 14, + "batch_num_tokens": 8184, + "epoch": 0.80324, + "grad_norm": 0.13104230165481567, + "learning_rate": 7.470497632538743e-06, + "loss": 4.9326, + "step": 619 + }, + { + "batch_num_effect_tokens": 8043, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.80454, + "grad_norm": 0.12019386142492294, + "learning_rate": 7.460638057012956e-06, + "loss": 4.665, + "step": 620 + }, + { + "batch_num_effect_tokens": 8012, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.80584, + "grad_norm": 0.1315028965473175, + "learning_rate": 7.450765839340175e-06, + "loss": 4.9375, + "step": 621 + }, + { + "batch_num_effect_tokens": 8059, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.80714, + "grad_norm": 0.13341425359249115, + "learning_rate": 7.440881030241407e-06, + "loss": 4.7939, + "step": 622 + }, + { + "batch_num_effect_tokens": 8028, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.80843, + "grad_norm": 0.13055460155010223, + "learning_rate": 7.430983680502344e-06, + "loss": 4.9736, + "step": 623 + }, + { + "batch_num_effect_tokens": 8033, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.80973, + "grad_norm": 0.12852461636066437, + "learning_rate": 7.4210738409731095e-06, + "loss": 4.6982, + "step": 624 + }, + { + "batch_num_effect_tokens": 7932, + "batch_num_samples": 22, + "batch_num_tokens": 8164, + "epoch": 0.81103, + "grad_norm": 0.13344988226890564, + "learning_rate": 7.411151562567999e-06, + "loss": 4.7471, + "step": 625 + }, + { + "batch_num_effect_tokens": 7929, + "batch_num_samples": 26, + "batch_num_tokens": 8138, + "epoch": 0.81233, + "grad_norm": 0.13381238281726837, + "learning_rate": 7.401216896265208e-06, + "loss": 4.6709, + "step": 626 + }, + { + "batch_num_effect_tokens": 7977, + "batch_num_samples": 23, + "batch_num_tokens": 8181, + "epoch": 0.81363, + "grad_norm": 0.13180480897426605, + "learning_rate": 7.391269893106592e-06, + "loss": 4.8457, + "step": 627 + }, + { + "batch_num_effect_tokens": 8007, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.81492, + "grad_norm": 0.12136835604906082, + "learning_rate": 7.381310604197375e-06, + "loss": 4.8252, + "step": 628 + }, + { + "batch_num_effect_tokens": 7929, + "batch_num_samples": 17, + "batch_num_tokens": 8118, + "epoch": 0.81622, + "grad_norm": 0.1308746188879013, + "learning_rate": 7.371339080705913e-06, + "loss": 4.5479, + "step": 629 + }, + { + "batch_num_effect_tokens": 8059, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.81752, + "grad_norm": 0.13940449059009552, + "learning_rate": 7.361355373863415e-06, + "loss": 4.7676, + "step": 630 + }, + { + "batch_num_effect_tokens": 8076, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.81882, + "grad_norm": 0.1301494687795639, + "learning_rate": 7.351359534963684e-06, + "loss": 4.4824, + "step": 631 + }, + { + "batch_num_effect_tokens": 8070, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.82011, + "grad_norm": 0.12944510579109192, + "learning_rate": 7.3413516153628605e-06, + "loss": 4.8672, + "step": 632 + }, + { + "batch_num_effect_tokens": 8009, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.82141, + "grad_norm": 0.13300953805446625, + "learning_rate": 7.331331666479149e-06, + "loss": 4.9189, + "step": 633 + }, + { + "batch_num_effect_tokens": 8053, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.82271, + "grad_norm": 0.12441015988588333, + "learning_rate": 7.321299739792553e-06, + "loss": 4.4316, + "step": 634 + }, + { + "batch_num_effect_tokens": 7972, + "batch_num_samples": 20, + "batch_num_tokens": 8192, + "epoch": 0.82401, + "grad_norm": 0.12126115709543228, + "learning_rate": 7.311255886844624e-06, + "loss": 4.5771, + "step": 635 + }, + { + "batch_num_effect_tokens": 7986, + "batch_num_samples": 14, + "batch_num_tokens": 8135, + "epoch": 0.8253, + "grad_norm": 0.12674327194690704, + "learning_rate": 7.30120015923818e-06, + "loss": 4.8574, + "step": 636 + }, + { + "batch_num_effect_tokens": 8045, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.8266, + "grad_norm": 0.12705481052398682, + "learning_rate": 7.291132608637053e-06, + "loss": 4.334, + "step": 637 + }, + { + "batch_num_effect_tokens": 8010, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.8279, + "grad_norm": 0.12920741736888885, + "learning_rate": 7.281053286765816e-06, + "loss": 4.6611, + "step": 638 + }, + { + "batch_num_effect_tokens": 7761, + "batch_num_samples": 25, + "batch_num_tokens": 8034, + "epoch": 0.8292, + "grad_norm": 0.1230921670794487, + "learning_rate": 7.27096224540952e-06, + "loss": 4.5332, + "step": 639 + }, + { + "batch_num_effect_tokens": 8042, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.83049, + "grad_norm": 0.12442672997713089, + "learning_rate": 7.260859536413429e-06, + "loss": 4.7666, + "step": 640 + }, + { + "batch_num_effect_tokens": 7965, + "batch_num_samples": 14, + "batch_num_tokens": 8120, + "epoch": 0.83179, + "grad_norm": 0.13749492168426514, + "learning_rate": 7.250745211682752e-06, + "loss": 4.9414, + "step": 641 + }, + { + "batch_num_effect_tokens": 8023, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.83309, + "grad_norm": 0.1354299634695053, + "learning_rate": 7.240619323182378e-06, + "loss": 4.9287, + "step": 642 + }, + { + "batch_num_effect_tokens": 7934, + "batch_num_samples": 16, + "batch_num_tokens": 8100, + "epoch": 0.83439, + "grad_norm": 0.1304435133934021, + "learning_rate": 7.2304819229366015e-06, + "loss": 4.4697, + "step": 643 + }, + { + "batch_num_effect_tokens": 7976, + "batch_num_samples": 23, + "batch_num_tokens": 8192, + "epoch": 0.83569, + "grad_norm": 0.12707528471946716, + "learning_rate": 7.2203330630288714e-06, + "loss": 5.0391, + "step": 644 + }, + { + "batch_num_effect_tokens": 7918, + "batch_num_samples": 18, + "batch_num_tokens": 8100, + "epoch": 0.83698, + "grad_norm": 0.12164284288883209, + "learning_rate": 7.210172795601506e-06, + "loss": 4.2236, + "step": 645 + }, + { + "batch_num_effect_tokens": 8054, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.83828, + "grad_norm": 0.13518215715885162, + "learning_rate": 7.200001172855436e-06, + "loss": 4.7686, + "step": 646 + }, + { + "batch_num_effect_tokens": 8036, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.83958, + "grad_norm": 0.12766359746456146, + "learning_rate": 7.189818247049931e-06, + "loss": 4.5146, + "step": 647 + }, + { + "batch_num_effect_tokens": 7961, + "batch_num_samples": 15, + "batch_num_tokens": 8116, + "epoch": 0.84088, + "grad_norm": 0.12365400791168213, + "learning_rate": 7.179624070502334e-06, + "loss": 4.9824, + "step": 648 + }, + { + "batch_num_effect_tokens": 7946, + "batch_num_samples": 17, + "batch_num_tokens": 8123, + "epoch": 0.84217, + "grad_norm": 0.13238734006881714, + "learning_rate": 7.169418695587791e-06, + "loss": 4.9043, + "step": 649 + }, + { + "batch_num_effect_tokens": 7933, + "batch_num_samples": 14, + "batch_num_tokens": 8122, + "epoch": 0.84347, + "grad_norm": 0.11986955255270004, + "learning_rate": 7.159202174738984e-06, + "loss": 4.3682, + "step": 650 + }, + { + "batch_num_effect_tokens": 7899, + "batch_num_samples": 18, + "batch_num_tokens": 8085, + "epoch": 0.84477, + "grad_norm": 0.1329929381608963, + "learning_rate": 7.148974560445859e-06, + "loss": 4.6943, + "step": 651 + }, + { + "batch_num_effect_tokens": 8054, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.84607, + "grad_norm": 0.1225806325674057, + "learning_rate": 7.138735905255355e-06, + "loss": 4.8477, + "step": 652 + }, + { + "batch_num_effect_tokens": 7858, + "batch_num_samples": 21, + "batch_num_tokens": 8108, + "epoch": 0.84736, + "grad_norm": 0.12441520392894745, + "learning_rate": 7.128486261771142e-06, + "loss": 4.5928, + "step": 653 + }, + { + "batch_num_effect_tokens": 7978, + "batch_num_samples": 16, + "batch_num_tokens": 8133, + "epoch": 0.84866, + "grad_norm": 0.11729971319437027, + "learning_rate": 7.1182256826533365e-06, + "loss": 4.8398, + "step": 654 + }, + { + "batch_num_effect_tokens": 7973, + "batch_num_samples": 21, + "batch_num_tokens": 8159, + "epoch": 0.84996, + "grad_norm": 0.1261633038520813, + "learning_rate": 7.107954220618251e-06, + "loss": 4.9746, + "step": 655 + }, + { + "batch_num_effect_tokens": 8008, + "batch_num_samples": 19, + "batch_num_tokens": 8192, + "epoch": 0.85126, + "grad_norm": 0.13062052428722382, + "learning_rate": 7.097671928438101e-06, + "loss": 4.6182, + "step": 656 + }, + { + "batch_num_effect_tokens": 8062, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.85255, + "grad_norm": 0.12817202508449554, + "learning_rate": 7.08737885894075e-06, + "loss": 4.6768, + "step": 657 + }, + { + "batch_num_effect_tokens": 7970, + "batch_num_samples": 17, + "batch_num_tokens": 8139, + "epoch": 0.85385, + "grad_norm": 0.13156555593013763, + "learning_rate": 7.0770750650094335e-06, + "loss": 4.7109, + "step": 658 + }, + { + "batch_num_effect_tokens": 8025, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.85515, + "grad_norm": 0.13155829906463623, + "learning_rate": 7.066760599582481e-06, + "loss": 4.9395, + "step": 659 + }, + { + "batch_num_effect_tokens": 7992, + "batch_num_samples": 22, + "batch_num_tokens": 8182, + "epoch": 0.85645, + "grad_norm": 0.12426735460758209, + "learning_rate": 7.056435515653059e-06, + "loss": 4.5059, + "step": 660 + }, + { + "batch_num_effect_tokens": 8042, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.85775, + "grad_norm": 0.11923466622829437, + "learning_rate": 7.046099866268878e-06, + "loss": 4.6162, + "step": 661 + }, + { + "batch_num_effect_tokens": 8047, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.85904, + "grad_norm": 0.12440145760774612, + "learning_rate": 7.03575370453194e-06, + "loss": 4.6377, + "step": 662 + }, + { + "batch_num_effect_tokens": 7967, + "batch_num_samples": 23, + "batch_num_tokens": 8181, + "epoch": 0.86034, + "grad_norm": 0.12746423482894897, + "learning_rate": 7.025397083598251e-06, + "loss": 4.7217, + "step": 663 + }, + { + "batch_num_effect_tokens": 8011, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.86164, + "grad_norm": 0.1230856403708458, + "learning_rate": 7.015030056677559e-06, + "loss": 4.3516, + "step": 664 + }, + { + "batch_num_effect_tokens": 7945, + "batch_num_samples": 20, + "batch_num_tokens": 8128, + "epoch": 0.86294, + "grad_norm": 0.14013619720935822, + "learning_rate": 7.004652677033069e-06, + "loss": 4.9609, + "step": 665 + }, + { + "batch_num_effect_tokens": 7995, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.86423, + "grad_norm": 0.1289435476064682, + "learning_rate": 6.9942649979811836e-06, + "loss": 4.7041, + "step": 666 + }, + { + "batch_num_effect_tokens": 7973, + "batch_num_samples": 22, + "batch_num_tokens": 8192, + "epoch": 0.86553, + "grad_norm": 0.11872459203004837, + "learning_rate": 6.983867072891213e-06, + "loss": 4.5059, + "step": 667 + }, + { + "batch_num_effect_tokens": 7942, + "batch_num_samples": 14, + "batch_num_tokens": 8074, + "epoch": 0.86683, + "grad_norm": 0.11841870099306107, + "learning_rate": 6.973458955185116e-06, + "loss": 4.834, + "step": 668 + }, + { + "batch_num_effect_tokens": 8062, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.86813, + "grad_norm": 0.13256226480007172, + "learning_rate": 6.963040698337215e-06, + "loss": 4.7764, + "step": 669 + }, + { + "batch_num_effect_tokens": 8065, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.86942, + "grad_norm": 0.12503276765346527, + "learning_rate": 6.952612355873922e-06, + "loss": 4.5635, + "step": 670 + }, + { + "batch_num_effect_tokens": 8057, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.87072, + "grad_norm": 0.13926607370376587, + "learning_rate": 6.942173981373474e-06, + "loss": 4.9668, + "step": 671 + }, + { + "batch_num_effect_tokens": 8012, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 0.87202, + "grad_norm": 0.1426723152399063, + "learning_rate": 6.931725628465643e-06, + "loss": 4.9531, + "step": 672 + }, + { + "batch_num_effect_tokens": 7934, + "batch_num_samples": 14, + "batch_num_tokens": 8079, + "epoch": 0.87332, + "grad_norm": 0.12457671016454697, + "learning_rate": 6.9212673508314734e-06, + "loss": 4.6992, + "step": 673 + }, + { + "batch_num_effect_tokens": 7773, + "batch_num_samples": 30, + "batch_num_tokens": 8057, + "epoch": 0.87461, + "grad_norm": 0.1329599916934967, + "learning_rate": 6.910799202202993e-06, + "loss": 4.7793, + "step": 674 + }, + { + "batch_num_effect_tokens": 8075, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.87591, + "grad_norm": 0.1251561939716339, + "learning_rate": 6.900321236362952e-06, + "loss": 4.7969, + "step": 675 + }, + { + "batch_num_effect_tokens": 7908, + "batch_num_samples": 15, + "batch_num_tokens": 8086, + "epoch": 0.87721, + "grad_norm": 0.12417805939912796, + "learning_rate": 6.889833507144534e-06, + "loss": 4.9121, + "step": 676 + }, + { + "batch_num_effect_tokens": 8005, + "batch_num_samples": 14, + "batch_num_tokens": 8156, + "epoch": 0.87851, + "grad_norm": 0.1217103824019432, + "learning_rate": 6.879336068431086e-06, + "loss": 4.8389, + "step": 677 + }, + { + "batch_num_effect_tokens": 8015, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.87981, + "grad_norm": 0.1301388144493103, + "learning_rate": 6.868828974155841e-06, + "loss": 5.5527, + "step": 678 + }, + { + "batch_num_effect_tokens": 8023, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.8811, + "grad_norm": 0.12873168289661407, + "learning_rate": 6.858312278301638e-06, + "loss": 4.6826, + "step": 679 + }, + { + "batch_num_effect_tokens": 8045, + "batch_num_samples": 14, + "batch_num_tokens": 8163, + "epoch": 0.8824, + "grad_norm": 0.12014926224946976, + "learning_rate": 6.847786034900648e-06, + "loss": 4.4951, + "step": 680 + }, + { + "batch_num_effect_tokens": 7913, + "batch_num_samples": 14, + "batch_num_tokens": 8088, + "epoch": 0.8837, + "grad_norm": 0.12555824220180511, + "learning_rate": 6.837250298034095e-06, + "loss": 4.5303, + "step": 681 + }, + { + "batch_num_effect_tokens": 8012, + "batch_num_samples": 15, + "batch_num_tokens": 8158, + "epoch": 0.885, + "grad_norm": 0.12343177944421768, + "learning_rate": 6.8267051218319766e-06, + "loss": 4.4834, + "step": 682 + }, + { + "batch_num_effect_tokens": 8055, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.88629, + "grad_norm": 0.12695080041885376, + "learning_rate": 6.816150560472787e-06, + "loss": 4.4951, + "step": 683 + }, + { + "batch_num_effect_tokens": 7893, + "batch_num_samples": 18, + "batch_num_tokens": 8085, + "epoch": 0.88759, + "grad_norm": 0.12869073450565338, + "learning_rate": 6.805586668183242e-06, + "loss": 4.7705, + "step": 684 + }, + { + "batch_num_effect_tokens": 8022, + "batch_num_samples": 14, + "batch_num_tokens": 8142, + "epoch": 0.88889, + "grad_norm": 0.12758734822273254, + "learning_rate": 6.7950134992379935e-06, + "loss": 4.4922, + "step": 685 + }, + { + "batch_num_effect_tokens": 7884, + "batch_num_samples": 17, + "batch_num_tokens": 8087, + "epoch": 0.89019, + "grad_norm": 0.12447866797447205, + "learning_rate": 6.78443110795936e-06, + "loss": 4.5645, + "step": 686 + }, + { + "batch_num_effect_tokens": 8066, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.89148, + "grad_norm": 0.1174447238445282, + "learning_rate": 6.773839548717036e-06, + "loss": 4.6055, + "step": 687 + }, + { + "batch_num_effect_tokens": 8047, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.89278, + "grad_norm": 0.1315164715051651, + "learning_rate": 6.7632388759278225e-06, + "loss": 4.4893, + "step": 688 + }, + { + "batch_num_effect_tokens": 7998, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.89408, + "grad_norm": 0.119374580681324, + "learning_rate": 6.752629144055342e-06, + "loss": 4.4414, + "step": 689 + }, + { + "batch_num_effect_tokens": 8038, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 0.89538, + "grad_norm": 0.12017811089754105, + "learning_rate": 6.742010407609759e-06, + "loss": 4.8516, + "step": 690 + }, + { + "batch_num_effect_tokens": 8016, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.89667, + "grad_norm": 0.11169980466365814, + "learning_rate": 6.731382721147509e-06, + "loss": 4.6357, + "step": 691 + }, + { + "batch_num_effect_tokens": 8004, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.89797, + "grad_norm": 0.12770824134349823, + "learning_rate": 6.720746139270997e-06, + "loss": 4.7705, + "step": 692 + }, + { + "batch_num_effect_tokens": 8032, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.89927, + "grad_norm": 0.12186800688505173, + "learning_rate": 6.710100716628345e-06, + "loss": 4.2812, + "step": 693 + }, + { + "batch_num_effect_tokens": 8063, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.90057, + "grad_norm": 0.12909391522407532, + "learning_rate": 6.699446507913083e-06, + "loss": 4.7236, + "step": 694 + }, + { + "batch_num_effect_tokens": 7907, + "batch_num_samples": 26, + "batch_num_tokens": 8128, + "epoch": 0.90187, + "grad_norm": 0.1295677274465561, + "learning_rate": 6.6887835678638944e-06, + "loss": 4.6318, + "step": 695 + }, + { + "batch_num_effect_tokens": 8058, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.90316, + "grad_norm": 0.12642303109169006, + "learning_rate": 6.6781119512643136e-06, + "loss": 4.3809, + "step": 696 + }, + { + "batch_num_effect_tokens": 7971, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 0.90446, + "grad_norm": 0.12434201687574387, + "learning_rate": 6.6674317129424535e-06, + "loss": 4.5703, + "step": 697 + }, + { + "batch_num_effect_tokens": 8052, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.90576, + "grad_norm": 0.11948221176862717, + "learning_rate": 6.656742907770728e-06, + "loss": 4.6201, + "step": 698 + }, + { + "batch_num_effect_tokens": 7950, + "batch_num_samples": 19, + "batch_num_tokens": 8131, + "epoch": 0.90706, + "grad_norm": 0.12394808977842331, + "learning_rate": 6.6460455906655595e-06, + "loss": 4.4463, + "step": 699 + }, + { + "batch_num_effect_tokens": 8064, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.90835, + "grad_norm": 0.1280088871717453, + "learning_rate": 6.635339816587109e-06, + "loss": 4.7422, + "step": 700 + }, + { + "batch_num_effect_tokens": 7911, + "batch_num_samples": 15, + "batch_num_tokens": 8086, + "epoch": 0.90965, + "grad_norm": 0.1304391473531723, + "learning_rate": 6.6246256405389805e-06, + "loss": 4.2695, + "step": 701 + }, + { + "batch_num_effect_tokens": 7914, + "batch_num_samples": 25, + "batch_num_tokens": 8164, + "epoch": 0.91095, + "grad_norm": 0.1250067502260208, + "learning_rate": 6.613903117567951e-06, + "loss": 4.5664, + "step": 702 + }, + { + "batch_num_effect_tokens": 8039, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.91225, + "grad_norm": 0.12943719327449799, + "learning_rate": 6.6031723027636775e-06, + "loss": 4.5186, + "step": 703 + }, + { + "batch_num_effect_tokens": 8023, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.91354, + "grad_norm": 0.11747249215841293, + "learning_rate": 6.592433251258423e-06, + "loss": 4.7568, + "step": 704 + }, + { + "batch_num_effect_tokens": 8066, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.91484, + "grad_norm": 0.13342788815498352, + "learning_rate": 6.581686018226764e-06, + "loss": 4.6963, + "step": 705 + }, + { + "batch_num_effect_tokens": 7763, + "batch_num_samples": 28, + "batch_num_tokens": 8008, + "epoch": 0.91614, + "grad_norm": 0.12844344973564148, + "learning_rate": 6.570930658885314e-06, + "loss": 4.6582, + "step": 706 + }, + { + "batch_num_effect_tokens": 8019, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.91744, + "grad_norm": 0.14021001756191254, + "learning_rate": 6.560167228492436e-06, + "loss": 4.8984, + "step": 707 + }, + { + "batch_num_effect_tokens": 7956, + "batch_num_samples": 14, + "batch_num_tokens": 8114, + "epoch": 0.91873, + "grad_norm": 0.1323830634355545, + "learning_rate": 6.549395782347963e-06, + "loss": 4.7314, + "step": 708 + }, + { + "batch_num_effect_tokens": 7965, + "batch_num_samples": 17, + "batch_num_tokens": 8157, + "epoch": 0.92003, + "grad_norm": 0.12167170643806458, + "learning_rate": 6.53861637579291e-06, + "loss": 4.9531, + "step": 709 + }, + { + "batch_num_effect_tokens": 8009, + "batch_num_samples": 15, + "batch_num_tokens": 8156, + "epoch": 0.92133, + "grad_norm": 0.12227673083543777, + "learning_rate": 6.527829064209187e-06, + "loss": 4.8438, + "step": 710 + }, + { + "batch_num_effect_tokens": 8019, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.92263, + "grad_norm": 0.12005326896905899, + "learning_rate": 6.517033903019323e-06, + "loss": 5.04, + "step": 711 + }, + { + "batch_num_effect_tokens": 8053, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.92393, + "grad_norm": 0.11777986586093903, + "learning_rate": 6.5062309476861714e-06, + "loss": 4.9141, + "step": 712 + }, + { + "batch_num_effect_tokens": 7997, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.92522, + "grad_norm": 0.12427664548158646, + "learning_rate": 6.495420253712636e-06, + "loss": 5.0312, + "step": 713 + }, + { + "batch_num_effect_tokens": 8026, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 0.92652, + "grad_norm": 0.12834517657756805, + "learning_rate": 6.484601876641375e-06, + "loss": 4.7354, + "step": 714 + }, + { + "batch_num_effect_tokens": 7978, + "batch_num_samples": 15, + "batch_num_tokens": 8116, + "epoch": 0.92782, + "grad_norm": 0.13113918900489807, + "learning_rate": 6.473775872054522e-06, + "loss": 4.6543, + "step": 715 + }, + { + "batch_num_effect_tokens": 7916, + "batch_num_samples": 17, + "batch_num_tokens": 8087, + "epoch": 0.92912, + "grad_norm": 0.1328577995300293, + "learning_rate": 6.4629422955733975e-06, + "loss": 4.9062, + "step": 716 + }, + { + "batch_num_effect_tokens": 8018, + "batch_num_samples": 14, + "batch_num_tokens": 8191, + "epoch": 0.93041, + "grad_norm": 0.11206276714801788, + "learning_rate": 6.452101202858229e-06, + "loss": 4.6455, + "step": 717 + }, + { + "batch_num_effect_tokens": 7956, + "batch_num_samples": 22, + "batch_num_tokens": 8164, + "epoch": 0.93171, + "grad_norm": 0.12403670698404312, + "learning_rate": 6.4412526496078555e-06, + "loss": 4.5957, + "step": 718 + }, + { + "batch_num_effect_tokens": 7954, + "batch_num_samples": 21, + "batch_num_tokens": 8125, + "epoch": 0.93301, + "grad_norm": 0.142649307847023, + "learning_rate": 6.430396691559446e-06, + "loss": 4.876, + "step": 719 + }, + { + "batch_num_effect_tokens": 7999, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.93431, + "grad_norm": 0.12546393275260925, + "learning_rate": 6.419533384488221e-06, + "loss": 4.5439, + "step": 720 + }, + { + "batch_num_effect_tokens": 7889, + "batch_num_samples": 21, + "batch_num_tokens": 8108, + "epoch": 0.9356, + "grad_norm": 0.12057095021009445, + "learning_rate": 6.408662784207149e-06, + "loss": 4.6611, + "step": 721 + }, + { + "batch_num_effect_tokens": 8021, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.9369, + "grad_norm": 0.1193976029753685, + "learning_rate": 6.397784946566676e-06, + "loss": 4.7529, + "step": 722 + }, + { + "batch_num_effect_tokens": 7955, + "batch_num_samples": 14, + "batch_num_tokens": 8121, + "epoch": 0.9382, + "grad_norm": 0.12544573843479156, + "learning_rate": 6.3868999274544264e-06, + "loss": 4.9453, + "step": 723 + }, + { + "batch_num_effect_tokens": 7982, + "batch_num_samples": 24, + "batch_num_tokens": 8192, + "epoch": 0.9395, + "grad_norm": 0.1267952173948288, + "learning_rate": 6.376007782794926e-06, + "loss": 4.7207, + "step": 724 + }, + { + "batch_num_effect_tokens": 7882, + "batch_num_samples": 24, + "batch_num_tokens": 8096, + "epoch": 0.94079, + "grad_norm": 0.14192216098308563, + "learning_rate": 6.365108568549308e-06, + "loss": 5.0576, + "step": 725 + }, + { + "batch_num_effect_tokens": 7956, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.94209, + "grad_norm": 0.12729580700397491, + "learning_rate": 6.354202340715027e-06, + "loss": 4.6826, + "step": 726 + }, + { + "batch_num_effect_tokens": 8062, + "batch_num_samples": 14, + "batch_num_tokens": 8188, + "epoch": 0.94339, + "grad_norm": 0.12701718509197235, + "learning_rate": 6.34328915532557e-06, + "loss": 4.8945, + "step": 727 + }, + { + "batch_num_effect_tokens": 8051, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.94469, + "grad_norm": 0.13189218938350677, + "learning_rate": 6.332369068450175e-06, + "loss": 4.8848, + "step": 728 + }, + { + "batch_num_effect_tokens": 8031, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.94599, + "grad_norm": 0.12214604765176773, + "learning_rate": 6.321442136193535e-06, + "loss": 4.6484, + "step": 729 + }, + { + "batch_num_effect_tokens": 7778, + "batch_num_samples": 17, + "batch_num_tokens": 7989, + "epoch": 0.94728, + "grad_norm": 0.12111053615808487, + "learning_rate": 6.310508414695511e-06, + "loss": 4.7383, + "step": 730 + }, + { + "batch_num_effect_tokens": 7963, + "batch_num_samples": 21, + "batch_num_tokens": 8192, + "epoch": 0.94858, + "grad_norm": 0.12662427127361298, + "learning_rate": 6.29956796013085e-06, + "loss": 4.6602, + "step": 731 + }, + { + "batch_num_effect_tokens": 8073, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.94988, + "grad_norm": 0.11730318516492844, + "learning_rate": 6.288620828708888e-06, + "loss": 4.5, + "step": 732 + }, + { + "batch_num_effect_tokens": 8058, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.95118, + "grad_norm": 0.12332963943481445, + "learning_rate": 6.277667076673266e-06, + "loss": 4.7041, + "step": 733 + }, + { + "batch_num_effect_tokens": 8024, + "batch_num_samples": 15, + "batch_num_tokens": 8167, + "epoch": 0.95247, + "grad_norm": 0.12476927042007446, + "learning_rate": 6.266706760301641e-06, + "loss": 4.5742, + "step": 734 + }, + { + "batch_num_effect_tokens": 8054, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.95377, + "grad_norm": 0.13098326325416565, + "learning_rate": 6.255739935905396e-06, + "loss": 4.4307, + "step": 735 + }, + { + "batch_num_effect_tokens": 8045, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.95507, + "grad_norm": 0.13105835020542145, + "learning_rate": 6.244766659829351e-06, + "loss": 4.8428, + "step": 736 + }, + { + "batch_num_effect_tokens": 8011, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.95637, + "grad_norm": 0.1244327500462532, + "learning_rate": 6.233786988451468e-06, + "loss": 4.3555, + "step": 737 + }, + { + "batch_num_effect_tokens": 7967, + "batch_num_samples": 22, + "batch_num_tokens": 8192, + "epoch": 0.95766, + "grad_norm": 0.12690576910972595, + "learning_rate": 6.222800978182576e-06, + "loss": 4.7607, + "step": 738 + }, + { + "batch_num_effect_tokens": 8024, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 0.95896, + "grad_norm": 0.12604181468486786, + "learning_rate": 6.211808685466063e-06, + "loss": 4.9053, + "step": 739 + }, + { + "batch_num_effect_tokens": 8039, + "batch_num_samples": 15, + "batch_num_tokens": 8176, + "epoch": 0.96026, + "grad_norm": 0.11882328987121582, + "learning_rate": 6.200810166777598e-06, + "loss": 4.167, + "step": 740 + }, + { + "batch_num_effect_tokens": 8009, + "batch_num_samples": 19, + "batch_num_tokens": 8192, + "epoch": 0.96156, + "grad_norm": 0.12758396565914154, + "learning_rate": 6.189805478624838e-06, + "loss": 4.5254, + "step": 741 + }, + { + "batch_num_effect_tokens": 7998, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.96285, + "grad_norm": 0.12387480586767197, + "learning_rate": 6.178794677547138e-06, + "loss": 4.5957, + "step": 742 + }, + { + "batch_num_effect_tokens": 8029, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.96415, + "grad_norm": 0.1272263377904892, + "learning_rate": 6.167777820115254e-06, + "loss": 4.5576, + "step": 743 + }, + { + "batch_num_effect_tokens": 8024, + "batch_num_samples": 17, + "batch_num_tokens": 8191, + "epoch": 0.96545, + "grad_norm": 0.1208052784204483, + "learning_rate": 6.156754962931069e-06, + "loss": 4.4629, + "step": 744 + }, + { + "batch_num_effect_tokens": 7940, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.96675, + "grad_norm": 0.11877097934484482, + "learning_rate": 6.145726162627278e-06, + "loss": 4.6768, + "step": 745 + }, + { + "batch_num_effect_tokens": 7949, + "batch_num_samples": 18, + "batch_num_tokens": 8114, + "epoch": 0.96805, + "grad_norm": 0.12087776511907578, + "learning_rate": 6.134691475867122e-06, + "loss": 4.6719, + "step": 746 + }, + { + "batch_num_effect_tokens": 7937, + "batch_num_samples": 14, + "batch_num_tokens": 8079, + "epoch": 0.96934, + "grad_norm": 0.1275898814201355, + "learning_rate": 6.123650959344075e-06, + "loss": 4.666, + "step": 747 + }, + { + "batch_num_effect_tokens": 8026, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.97064, + "grad_norm": 0.11717364192008972, + "learning_rate": 6.112604669781572e-06, + "loss": 4.6748, + "step": 748 + }, + { + "batch_num_effect_tokens": 8079, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.97194, + "grad_norm": 0.1246257945895195, + "learning_rate": 6.101552663932704e-06, + "loss": 4.5859, + "step": 749 + }, + { + "batch_num_effect_tokens": 8062, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.97324, + "grad_norm": 0.12895013391971588, + "learning_rate": 6.090494998579929e-06, + "loss": 4.7861, + "step": 750 + }, + { + "batch_num_effect_tokens": 7823, + "batch_num_samples": 27, + "batch_num_tokens": 8091, + "epoch": 0.97453, + "grad_norm": 0.13118872046470642, + "learning_rate": 6.079431730534786e-06, + "loss": 4.7031, + "step": 751 + }, + { + "batch_num_effect_tokens": 8030, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 0.97583, + "grad_norm": 0.1227671355009079, + "learning_rate": 6.0683629166375955e-06, + "loss": 4.5049, + "step": 752 + }, + { + "batch_num_effect_tokens": 8047, + "batch_num_samples": 14, + "batch_num_tokens": 8189, + "epoch": 0.97713, + "grad_norm": 0.12133854627609253, + "learning_rate": 6.057288613757178e-06, + "loss": 4.7334, + "step": 753 + }, + { + "batch_num_effect_tokens": 8074, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.97843, + "grad_norm": 0.12346214056015015, + "learning_rate": 6.046208878790543e-06, + "loss": 4.7197, + "step": 754 + }, + { + "batch_num_effect_tokens": 8002, + "batch_num_samples": 18, + "batch_num_tokens": 8191, + "epoch": 0.97972, + "grad_norm": 0.11389555037021637, + "learning_rate": 6.035123768662622e-06, + "loss": 4.7832, + "step": 755 + }, + { + "batch_num_effect_tokens": 8000, + "batch_num_samples": 14, + "batch_num_tokens": 8149, + "epoch": 0.98102, + "grad_norm": 0.1279493272304535, + "learning_rate": 6.024033340325954e-06, + "loss": 4.7656, + "step": 756 + }, + { + "batch_num_effect_tokens": 8020, + "batch_num_samples": 21, + "batch_num_tokens": 8192, + "epoch": 0.98232, + "grad_norm": 0.12431464344263077, + "learning_rate": 6.012937650760406e-06, + "loss": 4.96, + "step": 757 + }, + { + "batch_num_effect_tokens": 8076, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 0.98362, + "grad_norm": 0.126203715801239, + "learning_rate": 6.001836756972873e-06, + "loss": 4.752, + "step": 758 + }, + { + "batch_num_effect_tokens": 7894, + "batch_num_samples": 23, + "batch_num_tokens": 8143, + "epoch": 0.98491, + "grad_norm": 0.12350256741046906, + "learning_rate": 5.990730715996989e-06, + "loss": 4.6533, + "step": 759 + }, + { + "batch_num_effect_tokens": 7952, + "batch_num_samples": 20, + "batch_num_tokens": 8192, + "epoch": 0.98621, + "grad_norm": 0.12190555781126022, + "learning_rate": 5.979619584892834e-06, + "loss": 4.6904, + "step": 760 + }, + { + "batch_num_effect_tokens": 7993, + "batch_num_samples": 20, + "batch_num_tokens": 8192, + "epoch": 0.98751, + "grad_norm": 0.12734700739383698, + "learning_rate": 5.968503420746638e-06, + "loss": 5.4248, + "step": 761 + }, + { + "batch_num_effect_tokens": 8041, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.98881, + "grad_norm": 0.12115947902202606, + "learning_rate": 5.957382280670494e-06, + "loss": 4.6416, + "step": 762 + }, + { + "batch_num_effect_tokens": 8021, + "batch_num_samples": 14, + "batch_num_tokens": 8152, + "epoch": 0.99011, + "grad_norm": 0.12552732229232788, + "learning_rate": 5.946256221802052e-06, + "loss": 4.4473, + "step": 763 + }, + { + "batch_num_effect_tokens": 8026, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 0.9914, + "grad_norm": 0.129593163728714, + "learning_rate": 5.935125301304241e-06, + "loss": 4.9512, + "step": 764 + }, + { + "batch_num_effect_tokens": 7873, + "batch_num_samples": 27, + "batch_num_tokens": 8120, + "epoch": 0.9927, + "grad_norm": 0.1321675032377243, + "learning_rate": 5.9239895763649635e-06, + "loss": 4.8701, + "step": 765 + }, + { + "batch_num_effect_tokens": 8043, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 0.994, + "grad_norm": 0.12967081367969513, + "learning_rate": 5.91284910419681e-06, + "loss": 4.4424, + "step": 766 + }, + { + "batch_num_effect_tokens": 7949, + "batch_num_samples": 14, + "batch_num_tokens": 8088, + "epoch": 0.9953, + "grad_norm": 0.12642012536525726, + "learning_rate": 5.901703942036755e-06, + "loss": 5.1172, + "step": 767 + }, + { + "batch_num_effect_tokens": 7940, + "batch_num_samples": 21, + "batch_num_tokens": 8142, + "epoch": 0.99659, + "grad_norm": 0.13420720398426056, + "learning_rate": 5.890554147145875e-06, + "loss": 4.7734, + "step": 768 + }, + { + "batch_num_effect_tokens": 7937, + "batch_num_samples": 18, + "batch_num_tokens": 8090, + "epoch": 0.99789, + "grad_norm": 0.12632231414318085, + "learning_rate": 5.879399776809047e-06, + "loss": 4.4863, + "step": 769 + }, + { + "batch_num_effect_tokens": 8011, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.99919, + "grad_norm": 0.12234952300786972, + "learning_rate": 5.8682408883346535e-06, + "loss": 4.6025, + "step": 770 + }, + { + "batch_num_effect_tokens": 8011, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 0.99919, + "eval_eval_loss": 0.5797469019889832, + "eval_eval_runtime": 114.9751, + "eval_eval_samples_per_second": 43.488, + "eval_eval_steps_per_second": 2.722, + "step": 770 + }, + { + "batch_num_effect_tokens": 8059, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.00049, + "grad_norm": 0.13404805958271027, + "learning_rate": 5.857077539054289e-06, + "loss": 4.9434, + "step": 771 + }, + { + "batch_num_effect_tokens": 7897, + "batch_num_samples": 18, + "batch_num_tokens": 8142, + "epoch": 1.00178, + "grad_norm": 0.1201479434967041, + "learning_rate": 5.8459097863224705e-06, + "loss": 4.8154, + "step": 772 + }, + { + "batch_num_effect_tokens": 7988, + "batch_num_samples": 20, + "batch_num_tokens": 8192, + "epoch": 1.00308, + "grad_norm": 0.12970799207687378, + "learning_rate": 5.834737687516336e-06, + "loss": 4.8105, + "step": 773 + }, + { + "batch_num_effect_tokens": 8028, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.00438, + "grad_norm": 0.12445977330207825, + "learning_rate": 5.823561300035355e-06, + "loss": 4.2812, + "step": 774 + }, + { + "batch_num_effect_tokens": 8019, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.00568, + "grad_norm": 0.14487802982330322, + "learning_rate": 5.812380681301031e-06, + "loss": 4.6328, + "step": 775 + }, + { + "batch_num_effect_tokens": 8008, + "batch_num_samples": 15, + "batch_num_tokens": 8166, + "epoch": 1.00697, + "grad_norm": 0.12720316648483276, + "learning_rate": 5.8011958887565986e-06, + "loss": 4.6025, + "step": 776 + }, + { + "batch_num_effect_tokens": 8029, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 1.00827, + "grad_norm": 0.13001519441604614, + "learning_rate": 5.79000697986675e-06, + "loss": 4.4268, + "step": 777 + }, + { + "batch_num_effect_tokens": 7976, + "batch_num_samples": 23, + "batch_num_tokens": 8192, + "epoch": 1.00957, + "grad_norm": 0.12651684880256653, + "learning_rate": 5.778814012117315e-06, + "loss": 4.4297, + "step": 778 + }, + { + "batch_num_effect_tokens": 7949, + "batch_num_samples": 18, + "batch_num_tokens": 8128, + "epoch": 1.01087, + "grad_norm": 0.13454923033714294, + "learning_rate": 5.767617043014985e-06, + "loss": 4.3477, + "step": 779 + }, + { + "batch_num_effect_tokens": 7983, + "batch_num_samples": 21, + "batch_num_tokens": 8192, + "epoch": 1.01217, + "grad_norm": 0.14765796065330505, + "learning_rate": 5.756416130087002e-06, + "loss": 4.8281, + "step": 780 + }, + { + "batch_num_effect_tokens": 8000, + "batch_num_samples": 20, + "batch_num_tokens": 8186, + "epoch": 1.01346, + "grad_norm": 0.14546914398670197, + "learning_rate": 5.745211330880872e-06, + "loss": 4.3789, + "step": 781 + }, + { + "batch_num_effect_tokens": 8026, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.01476, + "grad_norm": 0.11960723996162415, + "learning_rate": 5.7340027029640755e-06, + "loss": 4.3809, + "step": 782 + }, + { + "batch_num_effect_tokens": 8010, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.01606, + "grad_norm": 0.14267292618751526, + "learning_rate": 5.7227903039237535e-06, + "loss": 4.5361, + "step": 783 + }, + { + "batch_num_effect_tokens": 7992, + "batch_num_samples": 14, + "batch_num_tokens": 8155, + "epoch": 1.01736, + "grad_norm": 0.1270764023065567, + "learning_rate": 5.711574191366427e-06, + "loss": 4.5635, + "step": 784 + }, + { + "batch_num_effect_tokens": 8066, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.01865, + "grad_norm": 0.14421948790550232, + "learning_rate": 5.7003544229176955e-06, + "loss": 5.0654, + "step": 785 + }, + { + "batch_num_effect_tokens": 8016, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.01995, + "grad_norm": 0.12795937061309814, + "learning_rate": 5.689131056221944e-06, + "loss": 4.3809, + "step": 786 + }, + { + "batch_num_effect_tokens": 8012, + "batch_num_samples": 14, + "batch_num_tokens": 8184, + "epoch": 1.02125, + "grad_norm": 0.1386982649564743, + "learning_rate": 5.677904148942039e-06, + "loss": 4.5127, + "step": 787 + }, + { + "batch_num_effect_tokens": 8040, + "batch_num_samples": 15, + "batch_num_tokens": 8190, + "epoch": 1.02255, + "grad_norm": 0.1462188959121704, + "learning_rate": 5.666673758759045e-06, + "loss": 4.3438, + "step": 788 + }, + { + "batch_num_effect_tokens": 8048, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.02384, + "grad_norm": 0.16186738014221191, + "learning_rate": 5.655439943371912e-06, + "loss": 4.7744, + "step": 789 + }, + { + "batch_num_effect_tokens": 8051, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.02514, + "grad_norm": 0.12250373512506485, + "learning_rate": 5.644202760497195e-06, + "loss": 4.2549, + "step": 790 + }, + { + "batch_num_effect_tokens": 7968, + "batch_num_samples": 22, + "batch_num_tokens": 8146, + "epoch": 1.02644, + "grad_norm": 0.17847490310668945, + "learning_rate": 5.632962267868747e-06, + "loss": 4.7627, + "step": 791 + }, + { + "batch_num_effect_tokens": 8061, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.02774, + "grad_norm": 0.1615590900182724, + "learning_rate": 5.621718523237427e-06, + "loss": 4.8848, + "step": 792 + }, + { + "batch_num_effect_tokens": 8056, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.02903, + "grad_norm": 0.13113276660442352, + "learning_rate": 5.6104715843708e-06, + "loss": 4.4883, + "step": 793 + }, + { + "batch_num_effect_tokens": 8022, + "batch_num_samples": 14, + "batch_num_tokens": 8184, + "epoch": 1.03033, + "grad_norm": 0.13856875896453857, + "learning_rate": 5.599221509052844e-06, + "loss": 4.5146, + "step": 794 + }, + { + "batch_num_effect_tokens": 8062, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.03163, + "grad_norm": 0.13490043580532074, + "learning_rate": 5.587968355083654e-06, + "loss": 4.5322, + "step": 795 + }, + { + "batch_num_effect_tokens": 8043, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.03293, + "grad_norm": 0.12667182087898254, + "learning_rate": 5.576712180279134e-06, + "loss": 4.5234, + "step": 796 + }, + { + "batch_num_effect_tokens": 8051, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.03423, + "grad_norm": 0.12577715516090393, + "learning_rate": 5.565453042470717e-06, + "loss": 4.5273, + "step": 797 + }, + { + "batch_num_effect_tokens": 8031, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.03552, + "grad_norm": 0.1268448680639267, + "learning_rate": 5.5541909995050554e-06, + "loss": 4.7002, + "step": 798 + }, + { + "batch_num_effect_tokens": 7978, + "batch_num_samples": 15, + "batch_num_tokens": 8140, + "epoch": 1.03682, + "grad_norm": 0.12386433035135269, + "learning_rate": 5.542926109243727e-06, + "loss": 4.5459, + "step": 799 + }, + { + "batch_num_effect_tokens": 8010, + "batch_num_samples": 19, + "batch_num_tokens": 8192, + "epoch": 1.03812, + "grad_norm": 0.13546870648860931, + "learning_rate": 5.53165842956294e-06, + "loss": 4.6445, + "step": 800 + }, + { + "batch_num_effect_tokens": 8041, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.03942, + "grad_norm": 0.12320155650377274, + "learning_rate": 5.520388018353233e-06, + "loss": 4.2441, + "step": 801 + }, + { + "batch_num_effect_tokens": 7989, + "batch_num_samples": 23, + "batch_num_tokens": 8192, + "epoch": 1.04071, + "grad_norm": 0.14361971616744995, + "learning_rate": 5.509114933519179e-06, + "loss": 4.4756, + "step": 802 + }, + { + "batch_num_effect_tokens": 8043, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.04201, + "grad_norm": 0.13132131099700928, + "learning_rate": 5.497839232979084e-06, + "loss": 4.2627, + "step": 803 + }, + { + "batch_num_effect_tokens": 8047, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.04331, + "grad_norm": 0.1350114941596985, + "learning_rate": 5.4865609746647e-06, + "loss": 4.957, + "step": 804 + }, + { + "batch_num_effect_tokens": 8043, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.04461, + "grad_norm": 0.12061459571123123, + "learning_rate": 5.475280216520913e-06, + "loss": 4.4453, + "step": 805 + }, + { + "batch_num_effect_tokens": 8024, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.0459, + "grad_norm": 0.12335465103387833, + "learning_rate": 5.463997016505459e-06, + "loss": 4.1699, + "step": 806 + }, + { + "batch_num_effect_tokens": 8044, + "batch_num_samples": 14, + "batch_num_tokens": 8189, + "epoch": 1.0472, + "grad_norm": 0.12723498046398163, + "learning_rate": 5.4527114325886145e-06, + "loss": 4.1455, + "step": 807 + }, + { + "batch_num_effect_tokens": 7996, + "batch_num_samples": 22, + "batch_num_tokens": 8192, + "epoch": 1.0485, + "grad_norm": 0.13448134064674377, + "learning_rate": 5.441423522752904e-06, + "loss": 4.625, + "step": 808 + }, + { + "batch_num_effect_tokens": 8014, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.0498, + "grad_norm": 0.12482011318206787, + "learning_rate": 5.430133344992807e-06, + "loss": 4.5391, + "step": 809 + }, + { + "batch_num_effect_tokens": 8003, + "batch_num_samples": 16, + "batch_num_tokens": 8190, + "epoch": 1.05109, + "grad_norm": 0.12930616736412048, + "learning_rate": 5.418840957314451e-06, + "loss": 4.1719, + "step": 810 + }, + { + "batch_num_effect_tokens": 7976, + "batch_num_samples": 16, + "batch_num_tokens": 8189, + "epoch": 1.05239, + "grad_norm": 0.13999834656715393, + "learning_rate": 5.4075464177353165e-06, + "loss": 4.7783, + "step": 811 + }, + { + "batch_num_effect_tokens": 8026, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.05369, + "grad_norm": 0.13308821618556976, + "learning_rate": 5.396249784283943e-06, + "loss": 4.3184, + "step": 812 + }, + { + "batch_num_effect_tokens": 7997, + "batch_num_samples": 14, + "batch_num_tokens": 8149, + "epoch": 1.05499, + "grad_norm": 0.12184661626815796, + "learning_rate": 5.3849511149996255e-06, + "loss": 4.5986, + "step": 813 + }, + { + "batch_num_effect_tokens": 8004, + "batch_num_samples": 19, + "batch_num_tokens": 8191, + "epoch": 1.05629, + "grad_norm": 0.13552594184875488, + "learning_rate": 5.373650467932122e-06, + "loss": 4.4873, + "step": 814 + }, + { + "batch_num_effect_tokens": 8017, + "batch_num_samples": 20, + "batch_num_tokens": 8192, + "epoch": 1.05758, + "grad_norm": 0.12386645376682281, + "learning_rate": 5.362347901141348e-06, + "loss": 4.4834, + "step": 815 + }, + { + "batch_num_effect_tokens": 8005, + "batch_num_samples": 19, + "batch_num_tokens": 8192, + "epoch": 1.05888, + "grad_norm": 0.12549902498722076, + "learning_rate": 5.351043472697082e-06, + "loss": 4.9111, + "step": 816 + }, + { + "batch_num_effect_tokens": 7901, + "batch_num_samples": 22, + "batch_num_tokens": 8128, + "epoch": 1.06018, + "grad_norm": 0.12777559459209442, + "learning_rate": 5.339737240678671e-06, + "loss": 4.4678, + "step": 817 + }, + { + "batch_num_effect_tokens": 7908, + "batch_num_samples": 20, + "batch_num_tokens": 8112, + "epoch": 1.06148, + "grad_norm": 0.14045743644237518, + "learning_rate": 5.328429263174725e-06, + "loss": 4.4395, + "step": 818 + }, + { + "batch_num_effect_tokens": 7903, + "batch_num_samples": 21, + "batch_num_tokens": 8125, + "epoch": 1.06277, + "grad_norm": 0.13065417110919952, + "learning_rate": 5.317119598282823e-06, + "loss": 4.668, + "step": 819 + }, + { + "batch_num_effect_tokens": 7937, + "batch_num_samples": 14, + "batch_num_tokens": 8074, + "epoch": 1.06407, + "grad_norm": 0.1203104555606842, + "learning_rate": 5.3058083041092145e-06, + "loss": 4.0527, + "step": 820 + }, + { + "batch_num_effect_tokens": 7985, + "batch_num_samples": 24, + "batch_num_tokens": 8192, + "epoch": 1.06537, + "grad_norm": 0.1304715871810913, + "learning_rate": 5.294495438768517e-06, + "loss": 4.2881, + "step": 821 + }, + { + "batch_num_effect_tokens": 7990, + "batch_num_samples": 14, + "batch_num_tokens": 8155, + "epoch": 1.06667, + "grad_norm": 0.1247849240899086, + "learning_rate": 5.283181060383423e-06, + "loss": 4.2393, + "step": 822 + }, + { + "batch_num_effect_tokens": 8026, + "batch_num_samples": 15, + "batch_num_tokens": 8166, + "epoch": 1.06796, + "grad_norm": 0.11411383748054504, + "learning_rate": 5.271865227084397e-06, + "loss": 4.7168, + "step": 823 + }, + { + "batch_num_effect_tokens": 7985, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.06926, + "grad_norm": 0.1270046830177307, + "learning_rate": 5.260547997009379e-06, + "loss": 4.5264, + "step": 824 + }, + { + "batch_num_effect_tokens": 7917, + "batch_num_samples": 17, + "batch_num_tokens": 8079, + "epoch": 1.07056, + "grad_norm": 0.12649409472942352, + "learning_rate": 5.249229428303486e-06, + "loss": 4.5293, + "step": 825 + }, + { + "batch_num_effect_tokens": 8027, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.07186, + "grad_norm": 0.1309773325920105, + "learning_rate": 5.237909579118713e-06, + "loss": 4.2207, + "step": 826 + }, + { + "batch_num_effect_tokens": 8064, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.07315, + "grad_norm": 0.12407130748033524, + "learning_rate": 5.226588507613629e-06, + "loss": 4.4414, + "step": 827 + }, + { + "batch_num_effect_tokens": 7999, + "batch_num_samples": 15, + "batch_num_tokens": 8152, + "epoch": 1.07445, + "grad_norm": 0.12215977907180786, + "learning_rate": 5.21526627195309e-06, + "loss": 4.9326, + "step": 828 + }, + { + "batch_num_effect_tokens": 7995, + "batch_num_samples": 15, + "batch_num_tokens": 8126, + "epoch": 1.07575, + "grad_norm": 0.12580524384975433, + "learning_rate": 5.2039429303079294e-06, + "loss": 4.4629, + "step": 829 + }, + { + "batch_num_effect_tokens": 8062, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.07705, + "grad_norm": 0.12904728949069977, + "learning_rate": 5.1926185408546604e-06, + "loss": 4.3467, + "step": 830 + }, + { + "batch_num_effect_tokens": 7957, + "batch_num_samples": 15, + "batch_num_tokens": 8086, + "epoch": 1.07835, + "grad_norm": 0.13490159809589386, + "learning_rate": 5.181293161775186e-06, + "loss": 4.4609, + "step": 831 + }, + { + "batch_num_effect_tokens": 7975, + "batch_num_samples": 16, + "batch_num_tokens": 8133, + "epoch": 1.07964, + "grad_norm": 0.13504654169082642, + "learning_rate": 5.169966851256489e-06, + "loss": 4.2334, + "step": 832 + }, + { + "batch_num_effect_tokens": 8011, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.08094, + "grad_norm": 0.1325363963842392, + "learning_rate": 5.15863966749034e-06, + "loss": 4.5322, + "step": 833 + }, + { + "batch_num_effect_tokens": 8060, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.08224, + "grad_norm": 0.13651366531848907, + "learning_rate": 5.147311668672991e-06, + "loss": 4.6211, + "step": 834 + }, + { + "batch_num_effect_tokens": 7933, + "batch_num_samples": 14, + "batch_num_tokens": 8092, + "epoch": 1.08354, + "grad_norm": 0.15273743867874146, + "learning_rate": 5.135982913004889e-06, + "loss": 4.9326, + "step": 835 + }, + { + "batch_num_effect_tokens": 8011, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.08483, + "grad_norm": 0.12992307543754578, + "learning_rate": 5.1246534586903655e-06, + "loss": 4.623, + "step": 836 + }, + { + "batch_num_effect_tokens": 8059, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.08613, + "grad_norm": 0.13165108859539032, + "learning_rate": 5.11332336393734e-06, + "loss": 4.5762, + "step": 837 + }, + { + "batch_num_effect_tokens": 8051, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.08743, + "grad_norm": 0.125563845038414, + "learning_rate": 5.101992686957028e-06, + "loss": 4.0518, + "step": 838 + }, + { + "batch_num_effect_tokens": 8055, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.08873, + "grad_norm": 0.12698976695537567, + "learning_rate": 5.090661485963628e-06, + "loss": 4.3701, + "step": 839 + }, + { + "batch_num_effect_tokens": 7922, + "batch_num_samples": 15, + "batch_num_tokens": 8080, + "epoch": 1.09002, + "grad_norm": 0.12403866648674011, + "learning_rate": 5.07932981917404e-06, + "loss": 4.1211, + "step": 840 + }, + { + "batch_num_effect_tokens": 8022, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 1.09132, + "grad_norm": 0.12252990156412125, + "learning_rate": 5.06799774480755e-06, + "loss": 4.2109, + "step": 841 + }, + { + "batch_num_effect_tokens": 8020, + "batch_num_samples": 16, + "batch_num_tokens": 8189, + "epoch": 1.09262, + "grad_norm": 0.11591480672359467, + "learning_rate": 5.056665321085542e-06, + "loss": 4.6582, + "step": 842 + }, + { + "batch_num_effect_tokens": 8047, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.09392, + "grad_norm": 0.12528832256793976, + "learning_rate": 5.045332606231191e-06, + "loss": 4.5166, + "step": 843 + }, + { + "batch_num_effect_tokens": 8025, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.09521, + "grad_norm": 0.1252938210964203, + "learning_rate": 5.033999658469174e-06, + "loss": 4.1709, + "step": 844 + }, + { + "batch_num_effect_tokens": 7936, + "batch_num_samples": 14, + "batch_num_tokens": 8146, + "epoch": 1.09651, + "grad_norm": 0.12733793258666992, + "learning_rate": 5.022666536025359e-06, + "loss": 4.4521, + "step": 845 + }, + { + "batch_num_effect_tokens": 8024, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.09781, + "grad_norm": 0.12023656815290451, + "learning_rate": 5.011333297126513e-06, + "loss": 4.3408, + "step": 846 + }, + { + "batch_num_effect_tokens": 7998, + "batch_num_samples": 17, + "batch_num_tokens": 8171, + "epoch": 1.09911, + "grad_norm": 0.11743319034576416, + "learning_rate": 5e-06, + "loss": 4.5605, + "step": 847 + }, + { + "batch_num_effect_tokens": 8019, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.10041, + "grad_norm": 0.12908881902694702, + "learning_rate": 4.98866670287349e-06, + "loss": 4.5234, + "step": 848 + }, + { + "batch_num_effect_tokens": 8004, + "batch_num_samples": 17, + "batch_num_tokens": 8152, + "epoch": 1.1017, + "grad_norm": 0.1330942064523697, + "learning_rate": 4.977333463974643e-06, + "loss": 4.6748, + "step": 849 + }, + { + "batch_num_effect_tokens": 7978, + "batch_num_samples": 15, + "batch_num_tokens": 8116, + "epoch": 1.103, + "grad_norm": 0.13068662583827972, + "learning_rate": 4.966000341530827e-06, + "loss": 4.543, + "step": 850 + }, + { + "batch_num_effect_tokens": 7926, + "batch_num_samples": 17, + "batch_num_tokens": 8074, + "epoch": 1.1043, + "grad_norm": 0.12920166552066803, + "learning_rate": 4.9546673937688086e-06, + "loss": 4.1533, + "step": 851 + }, + { + "batch_num_effect_tokens": 7905, + "batch_num_samples": 14, + "batch_num_tokens": 8083, + "epoch": 1.1056, + "grad_norm": 0.12737612426280975, + "learning_rate": 4.94333467891446e-06, + "loss": 4.5225, + "step": 852 + }, + { + "batch_num_effect_tokens": 8035, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.10689, + "grad_norm": 0.11825668811798096, + "learning_rate": 4.932002255192452e-06, + "loss": 4.542, + "step": 853 + }, + { + "batch_num_effect_tokens": 7872, + "batch_num_samples": 23, + "batch_num_tokens": 8086, + "epoch": 1.10819, + "grad_norm": 0.1468936651945114, + "learning_rate": 4.9206701808259605e-06, + "loss": 5.1279, + "step": 854 + }, + { + "batch_num_effect_tokens": 7952, + "batch_num_samples": 15, + "batch_num_tokens": 8116, + "epoch": 1.10949, + "grad_norm": 0.1266355961561203, + "learning_rate": 4.909338514036373e-06, + "loss": 4.3125, + "step": 855 + }, + { + "batch_num_effect_tokens": 7974, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.11079, + "grad_norm": 0.13342653214931488, + "learning_rate": 4.898007313042975e-06, + "loss": 4.3066, + "step": 856 + }, + { + "batch_num_effect_tokens": 8038, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.11208, + "grad_norm": 0.12773488461971283, + "learning_rate": 4.8866766360626615e-06, + "loss": 4.4775, + "step": 857 + }, + { + "batch_num_effect_tokens": 8026, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.11338, + "grad_norm": 0.13515350222587585, + "learning_rate": 4.875346541309637e-06, + "loss": 4.3096, + "step": 858 + }, + { + "batch_num_effect_tokens": 7949, + "batch_num_samples": 17, + "batch_num_tokens": 8139, + "epoch": 1.11468, + "grad_norm": 0.13417378067970276, + "learning_rate": 4.864017086995112e-06, + "loss": 4.7119, + "step": 859 + }, + { + "batch_num_effect_tokens": 8003, + "batch_num_samples": 21, + "batch_num_tokens": 8192, + "epoch": 1.11598, + "grad_norm": 0.13580955564975739, + "learning_rate": 4.852688331327011e-06, + "loss": 4.8125, + "step": 860 + }, + { + "batch_num_effect_tokens": 7911, + "batch_num_samples": 16, + "batch_num_tokens": 8144, + "epoch": 1.11727, + "grad_norm": 0.12650568783283234, + "learning_rate": 4.841360332509663e-06, + "loss": 4.5361, + "step": 861 + }, + { + "batch_num_effect_tokens": 8036, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.11857, + "grad_norm": 0.1183309480547905, + "learning_rate": 4.830033148743512e-06, + "loss": 4.0146, + "step": 862 + }, + { + "batch_num_effect_tokens": 8064, + "batch_num_samples": 14, + "batch_num_tokens": 8189, + "epoch": 1.11987, + "grad_norm": 0.126793771982193, + "learning_rate": 4.818706838224815e-06, + "loss": 4.502, + "step": 863 + }, + { + "batch_num_effect_tokens": 7941, + "batch_num_samples": 14, + "batch_num_tokens": 8073, + "epoch": 1.12117, + "grad_norm": 0.125539630651474, + "learning_rate": 4.8073814591453395e-06, + "loss": 4.2773, + "step": 864 + }, + { + "batch_num_effect_tokens": 8039, + "batch_num_samples": 15, + "batch_num_tokens": 8176, + "epoch": 1.12247, + "grad_norm": 0.13130030035972595, + "learning_rate": 4.796057069692073e-06, + "loss": 4.6611, + "step": 865 + }, + { + "batch_num_effect_tokens": 7915, + "batch_num_samples": 18, + "batch_num_tokens": 8090, + "epoch": 1.12376, + "grad_norm": 0.12117452919483185, + "learning_rate": 4.784733728046912e-06, + "loss": 4.3535, + "step": 866 + }, + { + "batch_num_effect_tokens": 7944, + "batch_num_samples": 15, + "batch_num_tokens": 8096, + "epoch": 1.12506, + "grad_norm": 0.12501753866672516, + "learning_rate": 4.773411492386372e-06, + "loss": 4.7051, + "step": 867 + }, + { + "batch_num_effect_tokens": 8019, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.12636, + "grad_norm": 0.12537001073360443, + "learning_rate": 4.762090420881289e-06, + "loss": 4.7852, + "step": 868 + }, + { + "batch_num_effect_tokens": 8001, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.12766, + "grad_norm": 0.11610813438892365, + "learning_rate": 4.750770571696514e-06, + "loss": 4.1914, + "step": 869 + }, + { + "batch_num_effect_tokens": 8023, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.12895, + "grad_norm": 0.12504693865776062, + "learning_rate": 4.739452002990621e-06, + "loss": 3.9814, + "step": 870 + }, + { + "batch_num_effect_tokens": 7964, + "batch_num_samples": 23, + "batch_num_tokens": 8143, + "epoch": 1.13025, + "grad_norm": 0.122451052069664, + "learning_rate": 4.728134772915605e-06, + "loss": 4.2246, + "step": 871 + }, + { + "batch_num_effect_tokens": 7910, + "batch_num_samples": 15, + "batch_num_tokens": 8089, + "epoch": 1.13155, + "grad_norm": 0.1333150565624237, + "learning_rate": 4.716818939616578e-06, + "loss": 4.5938, + "step": 872 + }, + { + "batch_num_effect_tokens": 8021, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.13285, + "grad_norm": 0.13619881868362427, + "learning_rate": 4.705504561231485e-06, + "loss": 4.6348, + "step": 873 + }, + { + "batch_num_effect_tokens": 8014, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.13414, + "grad_norm": 0.1358458697795868, + "learning_rate": 4.694191695890788e-06, + "loss": 4.4453, + "step": 874 + }, + { + "batch_num_effect_tokens": 7844, + "batch_num_samples": 20, + "batch_num_tokens": 8080, + "epoch": 1.13544, + "grad_norm": 0.12052467465400696, + "learning_rate": 4.682880401717178e-06, + "loss": 4.4482, + "step": 875 + }, + { + "batch_num_effect_tokens": 8046, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.13674, + "grad_norm": 0.11595940589904785, + "learning_rate": 4.671570736825277e-06, + "loss": 4.1816, + "step": 876 + }, + { + "batch_num_effect_tokens": 8025, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.13804, + "grad_norm": 0.1310449242591858, + "learning_rate": 4.660262759321331e-06, + "loss": 4.4414, + "step": 877 + }, + { + "batch_num_effect_tokens": 7966, + "batch_num_samples": 14, + "batch_num_tokens": 8092, + "epoch": 1.13933, + "grad_norm": 0.1393718123435974, + "learning_rate": 4.6489565273029196e-06, + "loss": 4.3857, + "step": 878 + }, + { + "batch_num_effect_tokens": 8039, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.14063, + "grad_norm": 0.13982267677783966, + "learning_rate": 4.637652098858655e-06, + "loss": 4.4727, + "step": 879 + }, + { + "batch_num_effect_tokens": 7863, + "batch_num_samples": 24, + "batch_num_tokens": 8073, + "epoch": 1.14193, + "grad_norm": 0.14085061848163605, + "learning_rate": 4.626349532067879e-06, + "loss": 4.707, + "step": 880 + }, + { + "batch_num_effect_tokens": 8078, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.14323, + "grad_norm": 0.13555698096752167, + "learning_rate": 4.615048885000375e-06, + "loss": 4.4424, + "step": 881 + }, + { + "batch_num_effect_tokens": 8024, + "batch_num_samples": 16, + "batch_num_tokens": 8188, + "epoch": 1.14453, + "grad_norm": 0.13564865291118622, + "learning_rate": 4.603750215716057e-06, + "loss": 4.2158, + "step": 882 + }, + { + "batch_num_effect_tokens": 8066, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.14582, + "grad_norm": 0.13119758665561676, + "learning_rate": 4.592453582264684e-06, + "loss": 4.6748, + "step": 883 + }, + { + "batch_num_effect_tokens": 7983, + "batch_num_samples": 14, + "batch_num_tokens": 8137, + "epoch": 1.14712, + "grad_norm": 0.1268366128206253, + "learning_rate": 4.581159042685552e-06, + "loss": 4.3467, + "step": 884 + }, + { + "batch_num_effect_tokens": 7963, + "batch_num_samples": 19, + "batch_num_tokens": 8161, + "epoch": 1.14842, + "grad_norm": 0.12466870993375778, + "learning_rate": 4.569866655007193e-06, + "loss": 4.7676, + "step": 885 + }, + { + "batch_num_effect_tokens": 8051, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.14972, + "grad_norm": 0.13537685573101044, + "learning_rate": 4.558576477247097e-06, + "loss": 4.1719, + "step": 886 + }, + { + "batch_num_effect_tokens": 7946, + "batch_num_samples": 18, + "batch_num_tokens": 8114, + "epoch": 1.15101, + "grad_norm": 0.12638984620571136, + "learning_rate": 4.547288567411388e-06, + "loss": 4.3047, + "step": 887 + }, + { + "batch_num_effect_tokens": 8062, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.15231, + "grad_norm": 0.12655675411224365, + "learning_rate": 4.5360029834945425e-06, + "loss": 4.5449, + "step": 888 + }, + { + "batch_num_effect_tokens": 8071, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.15361, + "grad_norm": 0.13120026886463165, + "learning_rate": 4.524719783479088e-06, + "loss": 4.3848, + "step": 889 + }, + { + "batch_num_effect_tokens": 8036, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.15491, + "grad_norm": 0.12158391624689102, + "learning_rate": 4.513439025335302e-06, + "loss": 4.4844, + "step": 890 + }, + { + "batch_num_effect_tokens": 8050, + "batch_num_samples": 17, + "batch_num_tokens": 8191, + "epoch": 1.1562, + "grad_norm": 0.13134336471557617, + "learning_rate": 4.502160767020918e-06, + "loss": 5.5039, + "step": 891 + }, + { + "batch_num_effect_tokens": 8046, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.1575, + "grad_norm": 0.12917788326740265, + "learning_rate": 4.4908850664808245e-06, + "loss": 4.6094, + "step": 892 + }, + { + "batch_num_effect_tokens": 8052, + "batch_num_samples": 13, + "batch_num_tokens": 8188, + "epoch": 1.1588, + "grad_norm": 0.13100962340831757, + "learning_rate": 4.4796119816467685e-06, + "loss": 4.3867, + "step": 893 + }, + { + "batch_num_effect_tokens": 8045, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.1601, + "grad_norm": 0.13368336856365204, + "learning_rate": 4.468341570437061e-06, + "loss": 4.4697, + "step": 894 + }, + { + "batch_num_effect_tokens": 8007, + "batch_num_samples": 20, + "batch_num_tokens": 8192, + "epoch": 1.16139, + "grad_norm": 0.12571601569652557, + "learning_rate": 4.457073890756273e-06, + "loss": 4.249, + "step": 895 + }, + { + "batch_num_effect_tokens": 8035, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.16269, + "grad_norm": 0.1370954066514969, + "learning_rate": 4.445809000494945e-06, + "loss": 4.5898, + "step": 896 + }, + { + "batch_num_effect_tokens": 8022, + "batch_num_samples": 14, + "batch_num_tokens": 8163, + "epoch": 1.16399, + "grad_norm": 0.12560419738292694, + "learning_rate": 4.434546957529283e-06, + "loss": 4.3018, + "step": 897 + }, + { + "batch_num_effect_tokens": 8063, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.16529, + "grad_norm": 0.1281665414571762, + "learning_rate": 4.423287819720866e-06, + "loss": 4.2051, + "step": 898 + }, + { + "batch_num_effect_tokens": 8076, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.16659, + "grad_norm": 0.13026204705238342, + "learning_rate": 4.412031644916348e-06, + "loss": 4.3184, + "step": 899 + }, + { + "batch_num_effect_tokens": 7923, + "batch_num_samples": 23, + "batch_num_tokens": 8124, + "epoch": 1.16788, + "grad_norm": 0.1349964737892151, + "learning_rate": 4.400778490947157e-06, + "loss": 4.7354, + "step": 900 + }, + { + "batch_num_effect_tokens": 8023, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.16918, + "grad_norm": 0.13438257575035095, + "learning_rate": 4.389528415629201e-06, + "loss": 4.5127, + "step": 901 + }, + { + "batch_num_effect_tokens": 8048, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.17048, + "grad_norm": 0.12244999408721924, + "learning_rate": 4.3782814767625755e-06, + "loss": 4.752, + "step": 902 + }, + { + "batch_num_effect_tokens": 8041, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.17178, + "grad_norm": 0.12824036180973053, + "learning_rate": 4.367037732131254e-06, + "loss": 4.1963, + "step": 903 + }, + { + "batch_num_effect_tokens": 8052, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.17307, + "grad_norm": 0.13308383524417877, + "learning_rate": 4.355797239502807e-06, + "loss": 4.3115, + "step": 904 + }, + { + "batch_num_effect_tokens": 8033, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.17437, + "grad_norm": 0.12866966426372528, + "learning_rate": 4.34456005662809e-06, + "loss": 4.7246, + "step": 905 + }, + { + "batch_num_effect_tokens": 8045, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.17567, + "grad_norm": 0.12427198141813278, + "learning_rate": 4.3333262412409575e-06, + "loss": 4.4072, + "step": 906 + }, + { + "batch_num_effect_tokens": 8016, + "batch_num_samples": 20, + "batch_num_tokens": 8192, + "epoch": 1.17697, + "grad_norm": 0.12111925333738327, + "learning_rate": 4.322095851057962e-06, + "loss": 4.4688, + "step": 907 + }, + { + "batch_num_effect_tokens": 8005, + "batch_num_samples": 18, + "batch_num_tokens": 8170, + "epoch": 1.17826, + "grad_norm": 0.13822191953659058, + "learning_rate": 4.310868943778057e-06, + "loss": 4.6992, + "step": 908 + }, + { + "batch_num_effect_tokens": 7912, + "batch_num_samples": 17, + "batch_num_tokens": 8100, + "epoch": 1.17956, + "grad_norm": 0.13831211626529694, + "learning_rate": 4.299645577082305e-06, + "loss": 4.749, + "step": 909 + }, + { + "batch_num_effect_tokens": 8018, + "batch_num_samples": 14, + "batch_num_tokens": 8191, + "epoch": 1.18086, + "grad_norm": 0.11694054305553436, + "learning_rate": 4.2884258086335755e-06, + "loss": 4.2002, + "step": 910 + }, + { + "batch_num_effect_tokens": 7919, + "batch_num_samples": 15, + "batch_num_tokens": 8086, + "epoch": 1.18216, + "grad_norm": 0.13560707867145538, + "learning_rate": 4.277209696076248e-06, + "loss": 4.751, + "step": 911 + }, + { + "batch_num_effect_tokens": 8032, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.18345, + "grad_norm": 0.11765341460704803, + "learning_rate": 4.265997297035926e-06, + "loss": 4.2842, + "step": 912 + }, + { + "batch_num_effect_tokens": 8042, + "batch_num_samples": 16, + "batch_num_tokens": 8190, + "epoch": 1.18475, + "grad_norm": 0.13359101116657257, + "learning_rate": 4.254788669119127e-06, + "loss": 4.832, + "step": 913 + }, + { + "batch_num_effect_tokens": 8069, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.18605, + "grad_norm": 0.13347217440605164, + "learning_rate": 4.243583869913e-06, + "loss": 4.5137, + "step": 914 + }, + { + "batch_num_effect_tokens": 8029, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.18735, + "grad_norm": 0.12483620643615723, + "learning_rate": 4.232382956985017e-06, + "loss": 4.1416, + "step": 915 + }, + { + "batch_num_effect_tokens": 7913, + "batch_num_samples": 16, + "batch_num_tokens": 8089, + "epoch": 1.18865, + "grad_norm": 0.1381087601184845, + "learning_rate": 4.221185987882684e-06, + "loss": 4.5977, + "step": 916 + }, + { + "batch_num_effect_tokens": 8033, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.18994, + "grad_norm": 0.15423673391342163, + "learning_rate": 4.209993020133251e-06, + "loss": 4.1279, + "step": 917 + }, + { + "batch_num_effect_tokens": 7968, + "batch_num_samples": 16, + "batch_num_tokens": 8122, + "epoch": 1.19124, + "grad_norm": 0.14219102263450623, + "learning_rate": 4.198804111243403e-06, + "loss": 4.1191, + "step": 918 + }, + { + "batch_num_effect_tokens": 7946, + "batch_num_samples": 20, + "batch_num_tokens": 8144, + "epoch": 1.19254, + "grad_norm": 0.1339540034532547, + "learning_rate": 4.187619318698971e-06, + "loss": 4.3799, + "step": 919 + }, + { + "batch_num_effect_tokens": 8050, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.19384, + "grad_norm": 0.1272207647562027, + "learning_rate": 4.176438699964646e-06, + "loss": 4.2256, + "step": 920 + }, + { + "batch_num_effect_tokens": 7961, + "batch_num_samples": 16, + "batch_num_tokens": 8122, + "epoch": 1.19513, + "grad_norm": 0.1126057505607605, + "learning_rate": 4.165262312483664e-06, + "loss": 4.7441, + "step": 921 + }, + { + "batch_num_effect_tokens": 8026, + "batch_num_samples": 17, + "batch_num_tokens": 8184, + "epoch": 1.19643, + "grad_norm": 0.13080507516860962, + "learning_rate": 4.154090213677531e-06, + "loss": 4.5449, + "step": 922 + }, + { + "batch_num_effect_tokens": 7840, + "batch_num_samples": 17, + "batch_num_tokens": 8006, + "epoch": 1.19773, + "grad_norm": 0.13032877445220947, + "learning_rate": 4.1429224609457135e-06, + "loss": 4.7578, + "step": 923 + }, + { + "batch_num_effect_tokens": 7991, + "batch_num_samples": 14, + "batch_num_tokens": 8114, + "epoch": 1.19903, + "grad_norm": 0.12347196787595749, + "learning_rate": 4.131759111665349e-06, + "loss": 4.6602, + "step": 924 + }, + { + "batch_num_effect_tokens": 7991, + "batch_num_samples": 14, + "batch_num_tokens": 8114, + "epoch": 1.19903, + "eval_eval_loss": 0.5730312466621399, + "eval_eval_runtime": 115.3354, + "eval_eval_samples_per_second": 43.352, + "eval_eval_steps_per_second": 2.714, + "step": 924 + }, + { + "batch_num_effect_tokens": 7754, + "batch_num_samples": 30, + "batch_num_tokens": 8011, + "epoch": 1.20032, + "grad_norm": 0.12765365839004517, + "learning_rate": 4.120600223190955e-06, + "loss": 4.1592, + "step": 925 + }, + { + "batch_num_effect_tokens": 8045, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.20162, + "grad_norm": 0.13900645077228546, + "learning_rate": 4.109445852854125e-06, + "loss": 4.457, + "step": 926 + }, + { + "batch_num_effect_tokens": 7980, + "batch_num_samples": 15, + "batch_num_tokens": 8134, + "epoch": 1.20292, + "grad_norm": 0.12310691922903061, + "learning_rate": 4.098296057963246e-06, + "loss": 4.6924, + "step": 927 + }, + { + "batch_num_effect_tokens": 7909, + "batch_num_samples": 17, + "batch_num_tokens": 8087, + "epoch": 1.20422, + "grad_norm": 0.1377318799495697, + "learning_rate": 4.087150895803192e-06, + "loss": 4.085, + "step": 928 + }, + { + "batch_num_effect_tokens": 7991, + "batch_num_samples": 24, + "batch_num_tokens": 8188, + "epoch": 1.20552, + "grad_norm": 0.14090469479560852, + "learning_rate": 4.076010423635037e-06, + "loss": 4.8594, + "step": 929 + }, + { + "batch_num_effect_tokens": 8018, + "batch_num_samples": 19, + "batch_num_tokens": 8192, + "epoch": 1.20681, + "grad_norm": 0.12155922502279282, + "learning_rate": 4.064874698695761e-06, + "loss": 4.3281, + "step": 930 + }, + { + "batch_num_effect_tokens": 8050, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.20811, + "grad_norm": 0.13325005769729614, + "learning_rate": 4.053743778197951e-06, + "loss": 4.668, + "step": 931 + }, + { + "batch_num_effect_tokens": 8045, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.20941, + "grad_norm": 0.1313610076904297, + "learning_rate": 4.042617719329507e-06, + "loss": 4.7402, + "step": 932 + }, + { + "batch_num_effect_tokens": 8037, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.21071, + "grad_norm": 0.1211569532752037, + "learning_rate": 4.0314965792533635e-06, + "loss": 4.4873, + "step": 933 + }, + { + "batch_num_effect_tokens": 8048, + "batch_num_samples": 13, + "batch_num_tokens": 8191, + "epoch": 1.212, + "grad_norm": 0.12477646768093109, + "learning_rate": 4.020380415107167e-06, + "loss": 4.4355, + "step": 934 + }, + { + "batch_num_effect_tokens": 8056, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.2133, + "grad_norm": 0.1258799135684967, + "learning_rate": 4.009269284003014e-06, + "loss": 4.6689, + "step": 935 + }, + { + "batch_num_effect_tokens": 7976, + "batch_num_samples": 15, + "batch_num_tokens": 8116, + "epoch": 1.2146, + "grad_norm": 0.1279737502336502, + "learning_rate": 3.99816324302713e-06, + "loss": 4.0469, + "step": 936 + }, + { + "batch_num_effect_tokens": 7933, + "batch_num_samples": 14, + "batch_num_tokens": 8122, + "epoch": 1.2159, + "grad_norm": 0.12206988781690598, + "learning_rate": 3.987062349239596e-06, + "loss": 4.1738, + "step": 937 + }, + { + "batch_num_effect_tokens": 8052, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.21719, + "grad_norm": 0.1263066828250885, + "learning_rate": 3.975966659674048e-06, + "loss": 4.3438, + "step": 938 + }, + { + "batch_num_effect_tokens": 8066, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.21849, + "grad_norm": 0.1253357082605362, + "learning_rate": 3.964876231337379e-06, + "loss": 4.2725, + "step": 939 + }, + { + "batch_num_effect_tokens": 7945, + "batch_num_samples": 20, + "batch_num_tokens": 8128, + "epoch": 1.21979, + "grad_norm": 0.146712064743042, + "learning_rate": 3.953791121209458e-06, + "loss": 4.1543, + "step": 940 + }, + { + "batch_num_effect_tokens": 7998, + "batch_num_samples": 16, + "batch_num_tokens": 8155, + "epoch": 1.22109, + "grad_norm": 0.13425204157829285, + "learning_rate": 3.942711386242826e-06, + "loss": 4.8652, + "step": 941 + }, + { + "batch_num_effect_tokens": 7906, + "batch_num_samples": 14, + "batch_num_tokens": 8079, + "epoch": 1.22238, + "grad_norm": 0.13137395679950714, + "learning_rate": 3.931637083362405e-06, + "loss": 4.0664, + "step": 942 + }, + { + "batch_num_effect_tokens": 8020, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.22368, + "grad_norm": 0.12695470452308655, + "learning_rate": 3.920568269465216e-06, + "loss": 4.3643, + "step": 943 + }, + { + "batch_num_effect_tokens": 8007, + "batch_num_samples": 14, + "batch_num_tokens": 8144, + "epoch": 1.22498, + "grad_norm": 0.12561850249767303, + "learning_rate": 3.909505001420072e-06, + "loss": 4.4287, + "step": 944 + }, + { + "batch_num_effect_tokens": 8026, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.22628, + "grad_norm": 0.13399529457092285, + "learning_rate": 3.898447336067297e-06, + "loss": 4.2666, + "step": 945 + }, + { + "batch_num_effect_tokens": 8055, + "batch_num_samples": 14, + "batch_num_tokens": 8182, + "epoch": 1.22758, + "grad_norm": 0.1406509280204773, + "learning_rate": 3.887395330218429e-06, + "loss": 4.4023, + "step": 946 + }, + { + "batch_num_effect_tokens": 8033, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.22887, + "grad_norm": 0.12866802513599396, + "learning_rate": 3.876349040655925e-06, + "loss": 4.2334, + "step": 947 + }, + { + "batch_num_effect_tokens": 8035, + "batch_num_samples": 14, + "batch_num_tokens": 8163, + "epoch": 1.23017, + "grad_norm": 0.1247694343328476, + "learning_rate": 3.86530852413288e-06, + "loss": 4.5361, + "step": 948 + }, + { + "batch_num_effect_tokens": 8037, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.23147, + "grad_norm": 0.12325099110603333, + "learning_rate": 3.854273837372724e-06, + "loss": 4.2549, + "step": 949 + }, + { + "batch_num_effect_tokens": 8037, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.23277, + "grad_norm": 0.1377268135547638, + "learning_rate": 3.843245037068932e-06, + "loss": 4.5361, + "step": 950 + }, + { + "batch_num_effect_tokens": 8050, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.23406, + "grad_norm": 0.1305629014968872, + "learning_rate": 3.832222179884747e-06, + "loss": 4.4932, + "step": 951 + }, + { + "batch_num_effect_tokens": 8009, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.23536, + "grad_norm": 0.1270604133605957, + "learning_rate": 3.821205322452863e-06, + "loss": 4.877, + "step": 952 + }, + { + "batch_num_effect_tokens": 7940, + "batch_num_samples": 15, + "batch_num_tokens": 8106, + "epoch": 1.23666, + "grad_norm": 0.1398427039384842, + "learning_rate": 3.8101945213751635e-06, + "loss": 4.583, + "step": 953 + }, + { + "batch_num_effect_tokens": 8043, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.23796, + "grad_norm": 0.12297014147043228, + "learning_rate": 3.799189833222404e-06, + "loss": 4.4424, + "step": 954 + }, + { + "batch_num_effect_tokens": 7968, + "batch_num_samples": 14, + "batch_num_tokens": 8107, + "epoch": 1.23925, + "grad_norm": 0.1305890530347824, + "learning_rate": 3.7881913145339387e-06, + "loss": 4.5752, + "step": 955 + }, + { + "batch_num_effect_tokens": 8052, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.24055, + "grad_norm": 0.1272619217634201, + "learning_rate": 3.777199021817426e-06, + "loss": 4.9912, + "step": 956 + }, + { + "batch_num_effect_tokens": 8012, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.24185, + "grad_norm": 0.13338087499141693, + "learning_rate": 3.7662130115485317e-06, + "loss": 4.6572, + "step": 957 + }, + { + "batch_num_effect_tokens": 8052, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.24315, + "grad_norm": 0.13084988296031952, + "learning_rate": 3.7552333401706508e-06, + "loss": 4.0723, + "step": 958 + }, + { + "batch_num_effect_tokens": 8036, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.24444, + "grad_norm": 0.12794189155101776, + "learning_rate": 3.7442600640946045e-06, + "loss": 4.2676, + "step": 959 + }, + { + "batch_num_effect_tokens": 8053, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.24574, + "grad_norm": 0.1372474730014801, + "learning_rate": 3.733293239698359e-06, + "loss": 4.6777, + "step": 960 + }, + { + "batch_num_effect_tokens": 7919, + "batch_num_samples": 18, + "batch_num_tokens": 8081, + "epoch": 1.24704, + "grad_norm": 0.14294537901878357, + "learning_rate": 3.7223329233267354e-06, + "loss": 4.585, + "step": 961 + }, + { + "batch_num_effect_tokens": 7930, + "batch_num_samples": 14, + "batch_num_tokens": 8086, + "epoch": 1.24834, + "grad_norm": 0.13273997604846954, + "learning_rate": 3.711379171291115e-06, + "loss": 4.2793, + "step": 962 + }, + { + "batch_num_effect_tokens": 8065, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.24964, + "grad_norm": 0.13120055198669434, + "learning_rate": 3.7004320398691507e-06, + "loss": 4.5068, + "step": 963 + }, + { + "batch_num_effect_tokens": 8053, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.25093, + "grad_norm": 0.12945452332496643, + "learning_rate": 3.689491585304491e-06, + "loss": 4.873, + "step": 964 + }, + { + "batch_num_effect_tokens": 7897, + "batch_num_samples": 19, + "batch_num_tokens": 8112, + "epoch": 1.25223, + "grad_norm": 0.13530410826206207, + "learning_rate": 3.6785578638064655e-06, + "loss": 4.6064, + "step": 965 + }, + { + "batch_num_effect_tokens": 8075, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.25353, + "grad_norm": 0.12716589868068695, + "learning_rate": 3.667630931549826e-06, + "loss": 4.0967, + "step": 966 + }, + { + "batch_num_effect_tokens": 8007, + "batch_num_samples": 19, + "batch_num_tokens": 8192, + "epoch": 1.25483, + "grad_norm": 0.12516002357006073, + "learning_rate": 3.6567108446744314e-06, + "loss": 4.5811, + "step": 967 + }, + { + "batch_num_effect_tokens": 8063, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.25612, + "grad_norm": 0.13172359764575958, + "learning_rate": 3.6457976592849753e-06, + "loss": 4.4248, + "step": 968 + }, + { + "batch_num_effect_tokens": 7955, + "batch_num_samples": 17, + "batch_num_tokens": 8167, + "epoch": 1.25742, + "grad_norm": 0.12710091471672058, + "learning_rate": 3.6348914314506944e-06, + "loss": 4.167, + "step": 969 + }, + { + "batch_num_effect_tokens": 8003, + "batch_num_samples": 20, + "batch_num_tokens": 8192, + "epoch": 1.25872, + "grad_norm": 0.1424860805273056, + "learning_rate": 3.623992217205075e-06, + "loss": 4.3926, + "step": 970 + }, + { + "batch_num_effect_tokens": 8048, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.26002, + "grad_norm": 0.12454716116189957, + "learning_rate": 3.6131000725455756e-06, + "loss": 4.5156, + "step": 971 + }, + { + "batch_num_effect_tokens": 7839, + "batch_num_samples": 26, + "batch_num_tokens": 8080, + "epoch": 1.26131, + "grad_norm": 0.13083425164222717, + "learning_rate": 3.6022150534333267e-06, + "loss": 4.667, + "step": 972 + }, + { + "batch_num_effect_tokens": 7960, + "batch_num_samples": 20, + "batch_num_tokens": 8160, + "epoch": 1.26261, + "grad_norm": 0.13552185893058777, + "learning_rate": 3.5913372157928515e-06, + "loss": 4.8428, + "step": 973 + }, + { + "batch_num_effect_tokens": 7976, + "batch_num_samples": 15, + "batch_num_tokens": 8128, + "epoch": 1.26391, + "grad_norm": 0.119549959897995, + "learning_rate": 3.5804666155117807e-06, + "loss": 4.6348, + "step": 974 + }, + { + "batch_num_effect_tokens": 7979, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.26521, + "grad_norm": 0.12790077924728394, + "learning_rate": 3.5696033084405535e-06, + "loss": 4.1328, + "step": 975 + }, + { + "batch_num_effect_tokens": 8011, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.2665, + "grad_norm": 0.12213937938213348, + "learning_rate": 3.558747350392146e-06, + "loss": 4.584, + "step": 976 + }, + { + "batch_num_effect_tokens": 7982, + "batch_num_samples": 20, + "batch_num_tokens": 8192, + "epoch": 1.2678, + "grad_norm": 0.12294816225767136, + "learning_rate": 3.5478987971417723e-06, + "loss": 4.5674, + "step": 977 + }, + { + "batch_num_effect_tokens": 7989, + "batch_num_samples": 20, + "batch_num_tokens": 8184, + "epoch": 1.2691, + "grad_norm": 0.12941895425319672, + "learning_rate": 3.537057704426602e-06, + "loss": 4.6514, + "step": 978 + }, + { + "batch_num_effect_tokens": 8059, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.2704, + "grad_norm": 0.11868005990982056, + "learning_rate": 3.526224127945479e-06, + "loss": 4.3252, + "step": 979 + }, + { + "batch_num_effect_tokens": 7976, + "batch_num_samples": 15, + "batch_num_tokens": 8116, + "epoch": 1.2717, + "grad_norm": 0.13335098326206207, + "learning_rate": 3.5153981233586277e-06, + "loss": 4.7188, + "step": 980 + }, + { + "batch_num_effect_tokens": 7880, + "batch_num_samples": 17, + "batch_num_tokens": 8062, + "epoch": 1.27299, + "grad_norm": 0.11686256527900696, + "learning_rate": 3.5045797462873643e-06, + "loss": 4.2773, + "step": 981 + }, + { + "batch_num_effect_tokens": 7808, + "batch_num_samples": 30, + "batch_num_tokens": 8082, + "epoch": 1.27429, + "grad_norm": 0.13790954649448395, + "learning_rate": 3.4937690523138302e-06, + "loss": 4.5752, + "step": 982 + }, + { + "batch_num_effect_tokens": 8003, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 1.27559, + "grad_norm": 0.1367255300283432, + "learning_rate": 3.4829660969806776e-06, + "loss": 4.6543, + "step": 983 + }, + { + "batch_num_effect_tokens": 7947, + "batch_num_samples": 15, + "batch_num_tokens": 8116, + "epoch": 1.27689, + "grad_norm": 0.14571502804756165, + "learning_rate": 3.4721709357908146e-06, + "loss": 4.248, + "step": 984 + }, + { + "batch_num_effect_tokens": 7961, + "batch_num_samples": 17, + "batch_num_tokens": 8108, + "epoch": 1.27818, + "grad_norm": 0.12180175632238388, + "learning_rate": 3.461383624207092e-06, + "loss": 4.6895, + "step": 985 + }, + { + "batch_num_effect_tokens": 8017, + "batch_num_samples": 17, + "batch_num_tokens": 8176, + "epoch": 1.27948, + "grad_norm": 0.1343916952610016, + "learning_rate": 3.4506042176520375e-06, + "loss": 4.3574, + "step": 986 + }, + { + "batch_num_effect_tokens": 8063, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.28078, + "grad_norm": 0.13950027525424957, + "learning_rate": 3.439832771507565e-06, + "loss": 4.165, + "step": 987 + }, + { + "batch_num_effect_tokens": 8016, + "batch_num_samples": 15, + "batch_num_tokens": 8146, + "epoch": 1.28208, + "grad_norm": 0.1476665586233139, + "learning_rate": 3.4290693411146882e-06, + "loss": 4.8047, + "step": 988 + }, + { + "batch_num_effect_tokens": 7985, + "batch_num_samples": 17, + "batch_num_tokens": 8186, + "epoch": 1.28337, + "grad_norm": 0.13402019441127777, + "learning_rate": 3.418313981773238e-06, + "loss": 4.5215, + "step": 989 + }, + { + "batch_num_effect_tokens": 7940, + "batch_num_samples": 14, + "batch_num_tokens": 8101, + "epoch": 1.28467, + "grad_norm": 0.1343563050031662, + "learning_rate": 3.4075667487415785e-06, + "loss": 4.3604, + "step": 990 + }, + { + "batch_num_effect_tokens": 8053, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.28597, + "grad_norm": 0.13799680769443512, + "learning_rate": 3.3968276972363224e-06, + "loss": 4.3262, + "step": 991 + }, + { + "batch_num_effect_tokens": 7924, + "batch_num_samples": 24, + "batch_num_tokens": 8192, + "epoch": 1.28727, + "grad_norm": 0.13754983246326447, + "learning_rate": 3.3860968824320507e-06, + "loss": 4.3242, + "step": 992 + }, + { + "batch_num_effect_tokens": 8043, + "batch_num_samples": 14, + "batch_num_tokens": 8188, + "epoch": 1.28856, + "grad_norm": 0.13641497492790222, + "learning_rate": 3.3753743594610216e-06, + "loss": 4.498, + "step": 993 + }, + { + "batch_num_effect_tokens": 8067, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.28986, + "grad_norm": 0.1407681405544281, + "learning_rate": 3.3646601834128924e-06, + "loss": 4.7393, + "step": 994 + }, + { + "batch_num_effect_tokens": 7816, + "batch_num_samples": 27, + "batch_num_tokens": 8041, + "epoch": 1.29116, + "grad_norm": 0.14411447942256927, + "learning_rate": 3.353954409334442e-06, + "loss": 4.3535, + "step": 995 + }, + { + "batch_num_effect_tokens": 7919, + "batch_num_samples": 14, + "batch_num_tokens": 8119, + "epoch": 1.29246, + "grad_norm": 0.13188710808753967, + "learning_rate": 3.3432570922292728e-06, + "loss": 4.2559, + "step": 996 + }, + { + "batch_num_effect_tokens": 7998, + "batch_num_samples": 21, + "batch_num_tokens": 8186, + "epoch": 1.29376, + "grad_norm": 0.12662333250045776, + "learning_rate": 3.3325682870575478e-06, + "loss": 4.0684, + "step": 997 + }, + { + "batch_num_effect_tokens": 8002, + "batch_num_samples": 17, + "batch_num_tokens": 8179, + "epoch": 1.29505, + "grad_norm": 0.12670519948005676, + "learning_rate": 3.3218880487356885e-06, + "loss": 4.5381, + "step": 998 + }, + { + "batch_num_effect_tokens": 8002, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.29635, + "grad_norm": 0.13219594955444336, + "learning_rate": 3.3112164321361064e-06, + "loss": 4.7129, + "step": 999 + }, + { + "batch_num_effect_tokens": 8010, + "batch_num_samples": 20, + "batch_num_tokens": 8192, + "epoch": 1.29765, + "grad_norm": 0.12541687488555908, + "learning_rate": 3.3005534920869175e-06, + "loss": 4.333, + "step": 1000 + }, + { + "batch_num_effect_tokens": 7940, + "batch_num_samples": 25, + "batch_num_tokens": 8160, + "epoch": 1.29895, + "grad_norm": 0.12586721777915955, + "learning_rate": 3.289899283371657e-06, + "loss": 4.4082, + "step": 1001 + }, + { + "batch_num_effect_tokens": 8032, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.30024, + "grad_norm": 0.11721961200237274, + "learning_rate": 3.2792538607290036e-06, + "loss": 4.3291, + "step": 1002 + }, + { + "batch_num_effect_tokens": 7975, + "batch_num_samples": 14, + "batch_num_tokens": 8130, + "epoch": 1.30154, + "grad_norm": 0.14944490790367126, + "learning_rate": 3.268617278852494e-06, + "loss": 4.0322, + "step": 1003 + }, + { + "batch_num_effect_tokens": 8009, + "batch_num_samples": 18, + "batch_num_tokens": 8180, + "epoch": 1.30284, + "grad_norm": 0.13059593737125397, + "learning_rate": 3.257989592390241e-06, + "loss": 4.1982, + "step": 1004 + }, + { + "batch_num_effect_tokens": 7998, + "batch_num_samples": 21, + "batch_num_tokens": 8192, + "epoch": 1.30414, + "grad_norm": 0.12026825547218323, + "learning_rate": 3.2473708559446606e-06, + "loss": 4.377, + "step": 1005 + }, + { + "batch_num_effect_tokens": 8051, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.30543, + "grad_norm": 0.1340772956609726, + "learning_rate": 3.2367611240721796e-06, + "loss": 4.5195, + "step": 1006 + }, + { + "batch_num_effect_tokens": 8044, + "batch_num_samples": 14, + "batch_num_tokens": 8184, + "epoch": 1.30673, + "grad_norm": 0.13199454545974731, + "learning_rate": 3.226160451282965e-06, + "loss": 4.3545, + "step": 1007 + }, + { + "batch_num_effect_tokens": 7976, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.30803, + "grad_norm": 0.11274772882461548, + "learning_rate": 3.2155688920406415e-06, + "loss": 4.1318, + "step": 1008 + }, + { + "batch_num_effect_tokens": 8059, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.30933, + "grad_norm": 0.13221722841262817, + "learning_rate": 3.204986500762006e-06, + "loss": 4.499, + "step": 1009 + }, + { + "batch_num_effect_tokens": 7972, + "batch_num_samples": 17, + "batch_num_tokens": 8126, + "epoch": 1.31062, + "grad_norm": 0.13470391929149628, + "learning_rate": 3.194413331816759e-06, + "loss": 4.7568, + "step": 1010 + }, + { + "batch_num_effect_tokens": 7829, + "batch_num_samples": 20, + "batch_num_tokens": 8112, + "epoch": 1.31192, + "grad_norm": 0.1261770874261856, + "learning_rate": 3.1838494395272155e-06, + "loss": 4.3809, + "step": 1011 + }, + { + "batch_num_effect_tokens": 8017, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 1.31322, + "grad_norm": 0.13255248963832855, + "learning_rate": 3.173294878168025e-06, + "loss": 4.3428, + "step": 1012 + }, + { + "batch_num_effect_tokens": 7932, + "batch_num_samples": 15, + "batch_num_tokens": 8076, + "epoch": 1.31452, + "grad_norm": 0.13333898782730103, + "learning_rate": 3.162749701965907e-06, + "loss": 4.5859, + "step": 1013 + }, + { + "batch_num_effect_tokens": 8034, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.31582, + "grad_norm": 0.13349847495555878, + "learning_rate": 3.152213965099352e-06, + "loss": 4.8818, + "step": 1014 + }, + { + "batch_num_effect_tokens": 7997, + "batch_num_samples": 14, + "batch_num_tokens": 8184, + "epoch": 1.31711, + "grad_norm": 0.12797629833221436, + "learning_rate": 3.141687721698363e-06, + "loss": 4.6836, + "step": 1015 + }, + { + "batch_num_effect_tokens": 7989, + "batch_num_samples": 23, + "batch_num_tokens": 8192, + "epoch": 1.31841, + "grad_norm": 0.1437963992357254, + "learning_rate": 3.1311710258441607e-06, + "loss": 4.3955, + "step": 1016 + }, + { + "batch_num_effect_tokens": 8036, + "batch_num_samples": 17, + "batch_num_tokens": 8191, + "epoch": 1.31971, + "grad_norm": 0.12287195771932602, + "learning_rate": 3.1206639315689154e-06, + "loss": 3.877, + "step": 1017 + }, + { + "batch_num_effect_tokens": 8046, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.32101, + "grad_norm": 0.1235748901963234, + "learning_rate": 3.110166492855468e-06, + "loss": 4.1572, + "step": 1018 + }, + { + "batch_num_effect_tokens": 8071, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.3223, + "grad_norm": 0.1256420761346817, + "learning_rate": 3.0996787636370495e-06, + "loss": 4.1611, + "step": 1019 + }, + { + "batch_num_effect_tokens": 8071, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.3236, + "grad_norm": 0.13278773427009583, + "learning_rate": 3.0892007977970083e-06, + "loss": 4.5078, + "step": 1020 + }, + { + "batch_num_effect_tokens": 8029, + "batch_num_samples": 14, + "batch_num_tokens": 8188, + "epoch": 1.3249, + "grad_norm": 0.130946084856987, + "learning_rate": 3.0787326491685287e-06, + "loss": 4.6865, + "step": 1021 + }, + { + "batch_num_effect_tokens": 7960, + "batch_num_samples": 22, + "batch_num_tokens": 8192, + "epoch": 1.3262, + "grad_norm": 0.130752831697464, + "learning_rate": 3.0682743715343565e-06, + "loss": 4.5918, + "step": 1022 + }, + { + "batch_num_effect_tokens": 7974, + "batch_num_samples": 24, + "batch_num_tokens": 8192, + "epoch": 1.32749, + "grad_norm": 0.13391391932964325, + "learning_rate": 3.057826018626527e-06, + "loss": 4.5469, + "step": 1023 + }, + { + "batch_num_effect_tokens": 8031, + "batch_num_samples": 14, + "batch_num_tokens": 8191, + "epoch": 1.32879, + "grad_norm": 0.13457229733467102, + "learning_rate": 3.0473876441260786e-06, + "loss": 4.7383, + "step": 1024 + }, + { + "batch_num_effect_tokens": 7991, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.33009, + "grad_norm": 0.13133271038532257, + "learning_rate": 3.0369593016627867e-06, + "loss": 4.1289, + "step": 1025 + }, + { + "batch_num_effect_tokens": 7963, + "batch_num_samples": 15, + "batch_num_tokens": 8098, + "epoch": 1.33139, + "grad_norm": 0.13519789278507233, + "learning_rate": 3.026541044814885e-06, + "loss": 4.1523, + "step": 1026 + }, + { + "batch_num_effect_tokens": 7929, + "batch_num_samples": 14, + "batch_num_tokens": 8081, + "epoch": 1.33268, + "grad_norm": 0.137874037027359, + "learning_rate": 3.016132927108787e-06, + "loss": 4.8057, + "step": 1027 + }, + { + "batch_num_effect_tokens": 8019, + "batch_num_samples": 14, + "batch_num_tokens": 8163, + "epoch": 1.33398, + "grad_norm": 0.11607379466295242, + "learning_rate": 3.005735002018818e-06, + "loss": 4.1748, + "step": 1028 + }, + { + "batch_num_effect_tokens": 8040, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.33528, + "grad_norm": 0.12126877903938293, + "learning_rate": 2.995347322966933e-06, + "loss": 4.4434, + "step": 1029 + }, + { + "batch_num_effect_tokens": 8010, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.33658, + "grad_norm": 0.12571828067302704, + "learning_rate": 2.9849699433224423e-06, + "loss": 4.2344, + "step": 1030 + }, + { + "batch_num_effect_tokens": 8043, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.33788, + "grad_norm": 0.134991854429245, + "learning_rate": 2.974602916401751e-06, + "loss": 4.3867, + "step": 1031 + }, + { + "batch_num_effect_tokens": 8030, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.33917, + "grad_norm": 0.13775356113910675, + "learning_rate": 2.9642462954680605e-06, + "loss": 4.7285, + "step": 1032 + }, + { + "batch_num_effect_tokens": 7844, + "batch_num_samples": 17, + "batch_num_tokens": 7989, + "epoch": 1.34047, + "grad_norm": 0.12610451877117157, + "learning_rate": 2.9539001337311234e-06, + "loss": 4.1611, + "step": 1033 + }, + { + "batch_num_effect_tokens": 7988, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.34177, + "grad_norm": 0.13606472313404083, + "learning_rate": 2.9435644843469434e-06, + "loss": 4.2061, + "step": 1034 + }, + { + "batch_num_effect_tokens": 7909, + "batch_num_samples": 15, + "batch_num_tokens": 8080, + "epoch": 1.34307, + "grad_norm": 0.12449389696121216, + "learning_rate": 2.933239400417519e-06, + "loss": 4.1045, + "step": 1035 + }, + { + "batch_num_effect_tokens": 8043, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.34436, + "grad_norm": 0.12599188089370728, + "learning_rate": 2.9229249349905686e-06, + "loss": 4.1924, + "step": 1036 + }, + { + "batch_num_effect_tokens": 7968, + "batch_num_samples": 14, + "batch_num_tokens": 8097, + "epoch": 1.34566, + "grad_norm": 0.1278277039527893, + "learning_rate": 2.9126211410592527e-06, + "loss": 4.5273, + "step": 1037 + }, + { + "batch_num_effect_tokens": 7967, + "batch_num_samples": 15, + "batch_num_tokens": 8126, + "epoch": 1.34696, + "grad_norm": 0.13065114617347717, + "learning_rate": 2.9023280715619005e-06, + "loss": 4.5469, + "step": 1038 + }, + { + "batch_num_effect_tokens": 8077, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.34826, + "grad_norm": 0.14062908291816711, + "learning_rate": 2.8920457793817507e-06, + "loss": 4.5078, + "step": 1039 + }, + { + "batch_num_effect_tokens": 7943, + "batch_num_samples": 14, + "batch_num_tokens": 8101, + "epoch": 1.34955, + "grad_norm": 0.13684336841106415, + "learning_rate": 2.881774317346664e-06, + "loss": 4.6104, + "step": 1040 + }, + { + "batch_num_effect_tokens": 7958, + "batch_num_samples": 14, + "batch_num_tokens": 8114, + "epoch": 1.35085, + "grad_norm": 0.12485930323600769, + "learning_rate": 2.871513738228861e-06, + "loss": 4.4727, + "step": 1041 + }, + { + "batch_num_effect_tokens": 8064, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.35215, + "grad_norm": 0.13105593621730804, + "learning_rate": 2.861264094744647e-06, + "loss": 4.6553, + "step": 1042 + }, + { + "batch_num_effect_tokens": 8039, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.35345, + "grad_norm": 0.14259664714336395, + "learning_rate": 2.851025439554142e-06, + "loss": 4.5693, + "step": 1043 + }, + { + "batch_num_effect_tokens": 8008, + "batch_num_samples": 19, + "batch_num_tokens": 8191, + "epoch": 1.35474, + "grad_norm": 0.12127009779214859, + "learning_rate": 2.840797825261017e-06, + "loss": 4.0859, + "step": 1044 + }, + { + "batch_num_effect_tokens": 8059, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.35604, + "grad_norm": 0.13202065229415894, + "learning_rate": 2.83058130441221e-06, + "loss": 4.2617, + "step": 1045 + }, + { + "batch_num_effect_tokens": 8000, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.35734, + "grad_norm": 0.13531279563903809, + "learning_rate": 2.8203759294976687e-06, + "loss": 4.2031, + "step": 1046 + }, + { + "batch_num_effect_tokens": 8048, + "batch_num_samples": 14, + "batch_num_tokens": 8182, + "epoch": 1.35864, + "grad_norm": 0.13501350581645966, + "learning_rate": 2.810181752950072e-06, + "loss": 4.1162, + "step": 1047 + }, + { + "batch_num_effect_tokens": 8042, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.35994, + "grad_norm": 0.1449299156665802, + "learning_rate": 2.7999988271445643e-06, + "loss": 4.2139, + "step": 1048 + }, + { + "batch_num_effect_tokens": 7937, + "batch_num_samples": 17, + "batch_num_tokens": 8101, + "epoch": 1.36123, + "grad_norm": 0.15028077363967896, + "learning_rate": 2.7898272043984947e-06, + "loss": 4.8975, + "step": 1049 + }, + { + "batch_num_effect_tokens": 7871, + "batch_num_samples": 15, + "batch_num_tokens": 8074, + "epoch": 1.36253, + "grad_norm": 0.1480277180671692, + "learning_rate": 2.7796669369711294e-06, + "loss": 4.4062, + "step": 1050 + }, + { + "batch_num_effect_tokens": 7916, + "batch_num_samples": 26, + "batch_num_tokens": 8126, + "epoch": 1.36383, + "grad_norm": 0.14486616849899292, + "learning_rate": 2.7695180770633993e-06, + "loss": 4.582, + "step": 1051 + }, + { + "batch_num_effect_tokens": 8000, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.36513, + "grad_norm": 0.13963328301906586, + "learning_rate": 2.7593806768176244e-06, + "loss": 4.0654, + "step": 1052 + }, + { + "batch_num_effect_tokens": 7940, + "batch_num_samples": 16, + "batch_num_tokens": 8078, + "epoch": 1.36642, + "grad_norm": 0.1411132663488388, + "learning_rate": 2.7492547883172473e-06, + "loss": 4.2812, + "step": 1053 + }, + { + "batch_num_effect_tokens": 7897, + "batch_num_samples": 20, + "batch_num_tokens": 8080, + "epoch": 1.36772, + "grad_norm": 0.14485670626163483, + "learning_rate": 2.7391404635865725e-06, + "loss": 4.6309, + "step": 1054 + }, + { + "batch_num_effect_tokens": 8053, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.36902, + "grad_norm": 0.13264106214046478, + "learning_rate": 2.7290377545904823e-06, + "loss": 4.4268, + "step": 1055 + }, + { + "batch_num_effect_tokens": 8024, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.37032, + "grad_norm": 0.12862573564052582, + "learning_rate": 2.718946713234185e-06, + "loss": 4.1699, + "step": 1056 + }, + { + "batch_num_effect_tokens": 8041, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.37161, + "grad_norm": 0.12804825603961945, + "learning_rate": 2.708867391362948e-06, + "loss": 4.5527, + "step": 1057 + }, + { + "batch_num_effect_tokens": 7860, + "batch_num_samples": 17, + "batch_num_tokens": 8062, + "epoch": 1.37291, + "grad_norm": 0.14360636472702026, + "learning_rate": 2.6987998407618216e-06, + "loss": 4.084, + "step": 1058 + }, + { + "batch_num_effect_tokens": 8016, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.37421, + "grad_norm": 0.13846245408058167, + "learning_rate": 2.688744113155378e-06, + "loss": 4.0527, + "step": 1059 + }, + { + "batch_num_effect_tokens": 8052, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.37551, + "grad_norm": 0.1346784085035324, + "learning_rate": 2.678700260207449e-06, + "loss": 4.7793, + "step": 1060 + }, + { + "batch_num_effect_tokens": 7994, + "batch_num_samples": 23, + "batch_num_tokens": 8192, + "epoch": 1.3768, + "grad_norm": 0.14536811411380768, + "learning_rate": 2.6686683335208526e-06, + "loss": 4.9219, + "step": 1061 + }, + { + "batch_num_effect_tokens": 8031, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.3781, + "grad_norm": 0.13124246895313263, + "learning_rate": 2.65864838463714e-06, + "loss": 4.498, + "step": 1062 + }, + { + "batch_num_effect_tokens": 7892, + "batch_num_samples": 21, + "batch_num_tokens": 8091, + "epoch": 1.3794, + "grad_norm": 0.12533849477767944, + "learning_rate": 2.648640465036316e-06, + "loss": 4.3086, + "step": 1063 + }, + { + "batch_num_effect_tokens": 8028, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.3807, + "grad_norm": 0.13352860510349274, + "learning_rate": 2.6386446261365874e-06, + "loss": 4.4902, + "step": 1064 + }, + { + "batch_num_effect_tokens": 7728, + "batch_num_samples": 33, + "batch_num_tokens": 7999, + "epoch": 1.382, + "grad_norm": 0.141183003783226, + "learning_rate": 2.6286609192940887e-06, + "loss": 4.1797, + "step": 1065 + }, + { + "batch_num_effect_tokens": 8064, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.38329, + "grad_norm": 0.12944729626178741, + "learning_rate": 2.6186893958026245e-06, + "loss": 4.6572, + "step": 1066 + }, + { + "batch_num_effect_tokens": 8028, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.38459, + "grad_norm": 0.1253744512796402, + "learning_rate": 2.608730106893411e-06, + "loss": 4.2881, + "step": 1067 + }, + { + "batch_num_effect_tokens": 8019, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.38589, + "grad_norm": 0.14108753204345703, + "learning_rate": 2.5987831037347933e-06, + "loss": 4.2412, + "step": 1068 + }, + { + "batch_num_effect_tokens": 8004, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.38719, + "grad_norm": 0.12252593785524368, + "learning_rate": 2.5888484374320033e-06, + "loss": 4.2715, + "step": 1069 + }, + { + "batch_num_effect_tokens": 8038, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.38848, + "grad_norm": 0.1265387237071991, + "learning_rate": 2.578926159026891e-06, + "loss": 4.2881, + "step": 1070 + }, + { + "batch_num_effect_tokens": 8027, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.38978, + "grad_norm": 0.13131491839885712, + "learning_rate": 2.5690163194976576e-06, + "loss": 4.3721, + "step": 1071 + }, + { + "batch_num_effect_tokens": 8022, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.39108, + "grad_norm": 0.13168731331825256, + "learning_rate": 2.559118969758595e-06, + "loss": 4.291, + "step": 1072 + }, + { + "batch_num_effect_tokens": 8012, + "batch_num_samples": 17, + "batch_num_tokens": 8171, + "epoch": 1.39238, + "grad_norm": 0.1283009946346283, + "learning_rate": 2.549234160659827e-06, + "loss": 4.3809, + "step": 1073 + }, + { + "batch_num_effect_tokens": 8004, + "batch_num_samples": 21, + "batch_num_tokens": 8192, + "epoch": 1.39367, + "grad_norm": 0.13365747034549713, + "learning_rate": 2.539361942987046e-06, + "loss": 4.8643, + "step": 1074 + }, + { + "batch_num_effect_tokens": 7982, + "batch_num_samples": 14, + "batch_num_tokens": 8142, + "epoch": 1.39497, + "grad_norm": 0.13202007114887238, + "learning_rate": 2.5295023674612568e-06, + "loss": 4.3945, + "step": 1075 + }, + { + "batch_num_effect_tokens": 8032, + "batch_num_samples": 14, + "batch_num_tokens": 8156, + "epoch": 1.39627, + "grad_norm": 0.13004064559936523, + "learning_rate": 2.519655484738507e-06, + "loss": 4.4951, + "step": 1076 + }, + { + "batch_num_effect_tokens": 8031, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.39757, + "grad_norm": 0.1316070407629013, + "learning_rate": 2.509821345409633e-06, + "loss": 4.7695, + "step": 1077 + }, + { + "batch_num_effect_tokens": 8050, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.39886, + "grad_norm": 0.12465286254882812, + "learning_rate": 2.5000000000000015e-06, + "loss": 4.4688, + "step": 1078 + }, + { + "batch_num_effect_tokens": 8050, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.39886, + "eval_eval_loss": 0.5671281218528748, + "eval_eval_runtime": 115.3241, + "eval_eval_samples_per_second": 43.356, + "eval_eval_steps_per_second": 2.714, + "step": 1078 + }, + { + "batch_num_effect_tokens": 7894, + "batch_num_samples": 19, + "batch_num_tokens": 8192, + "epoch": 1.40016, + "grad_norm": 0.1325971782207489, + "learning_rate": 2.4901914989692405e-06, + "loss": 4.2979, + "step": 1079 + }, + { + "batch_num_effect_tokens": 8049, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.40146, + "grad_norm": 0.12942376732826233, + "learning_rate": 2.480395892710997e-06, + "loss": 4.3652, + "step": 1080 + }, + { + "batch_num_effect_tokens": 8032, + "batch_num_samples": 19, + "batch_num_tokens": 8191, + "epoch": 1.40276, + "grad_norm": 0.12413739413022995, + "learning_rate": 2.470613231552661e-06, + "loss": 4.5146, + "step": 1081 + }, + { + "batch_num_effect_tokens": 7966, + "batch_num_samples": 15, + "batch_num_tokens": 8086, + "epoch": 1.40406, + "grad_norm": 0.129132479429245, + "learning_rate": 2.46084356575511e-06, + "loss": 3.9941, + "step": 1082 + }, + { + "batch_num_effect_tokens": 7873, + "batch_num_samples": 26, + "batch_num_tokens": 8106, + "epoch": 1.40535, + "grad_norm": 0.126277893781662, + "learning_rate": 2.451086945512465e-06, + "loss": 4.3438, + "step": 1083 + }, + { + "batch_num_effect_tokens": 8022, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.40665, + "grad_norm": 0.1285809427499771, + "learning_rate": 2.4413434209518137e-06, + "loss": 4.2676, + "step": 1084 + }, + { + "batch_num_effect_tokens": 8000, + "batch_num_samples": 14, + "batch_num_tokens": 8149, + "epoch": 1.40795, + "grad_norm": 0.1211712658405304, + "learning_rate": 2.4316130421329696e-06, + "loss": 4.6133, + "step": 1085 + }, + { + "batch_num_effect_tokens": 8012, + "batch_num_samples": 14, + "batch_num_tokens": 8177, + "epoch": 1.40925, + "grad_norm": 0.12398537993431091, + "learning_rate": 2.421895859048196e-06, + "loss": 4.0205, + "step": 1086 + }, + { + "batch_num_effect_tokens": 8049, + "batch_num_samples": 14, + "batch_num_tokens": 8189, + "epoch": 1.41054, + "grad_norm": 0.12769639492034912, + "learning_rate": 2.4121919216219646e-06, + "loss": 4.2617, + "step": 1087 + }, + { + "batch_num_effect_tokens": 8050, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.41184, + "grad_norm": 0.1281779408454895, + "learning_rate": 2.4025012797107e-06, + "loss": 4.3457, + "step": 1088 + }, + { + "batch_num_effect_tokens": 8023, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.41314, + "grad_norm": 0.1288890838623047, + "learning_rate": 2.39282398310251e-06, + "loss": 4.0811, + "step": 1089 + }, + { + "batch_num_effect_tokens": 8052, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.41444, + "grad_norm": 0.1370052844285965, + "learning_rate": 2.383160081516941e-06, + "loss": 4.4746, + "step": 1090 + }, + { + "batch_num_effect_tokens": 8009, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.41573, + "grad_norm": 0.11726196855306625, + "learning_rate": 2.373509624604717e-06, + "loss": 4.4619, + "step": 1091 + }, + { + "batch_num_effect_tokens": 7985, + "batch_num_samples": 15, + "batch_num_tokens": 8158, + "epoch": 1.41703, + "grad_norm": 0.12008494138717651, + "learning_rate": 2.363872661947488e-06, + "loss": 4.2627, + "step": 1092 + }, + { + "batch_num_effect_tokens": 7961, + "batch_num_samples": 14, + "batch_num_tokens": 8137, + "epoch": 1.41833, + "grad_norm": 0.12784817814826965, + "learning_rate": 2.3542492430575752e-06, + "loss": 4.4648, + "step": 1093 + }, + { + "batch_num_effect_tokens": 7973, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.41963, + "grad_norm": 0.14390228688716888, + "learning_rate": 2.344639417377714e-06, + "loss": 4.4814, + "step": 1094 + }, + { + "batch_num_effect_tokens": 7863, + "batch_num_samples": 17, + "batch_num_tokens": 8023, + "epoch": 1.42092, + "grad_norm": 0.13291668891906738, + "learning_rate": 2.3350432342808003e-06, + "loss": 4.4844, + "step": 1095 + }, + { + "batch_num_effect_tokens": 8021, + "batch_num_samples": 16, + "batch_num_tokens": 8188, + "epoch": 1.42222, + "grad_norm": 0.13262400031089783, + "learning_rate": 2.3254607430696393e-06, + "loss": 4.3721, + "step": 1096 + }, + { + "batch_num_effect_tokens": 8017, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.42352, + "grad_norm": 0.12368138134479523, + "learning_rate": 2.315891992976687e-06, + "loss": 4.29, + "step": 1097 + }, + { + "batch_num_effect_tokens": 8052, + "batch_num_samples": 14, + "batch_num_tokens": 8173, + "epoch": 1.42482, + "grad_norm": 0.13449449837207794, + "learning_rate": 2.3063370331638084e-06, + "loss": 4.1885, + "step": 1098 + }, + { + "batch_num_effect_tokens": 7931, + "batch_num_samples": 14, + "batch_num_tokens": 8114, + "epoch": 1.42612, + "grad_norm": 0.12767821550369263, + "learning_rate": 2.296795912722014e-06, + "loss": 4.3281, + "step": 1099 + }, + { + "batch_num_effect_tokens": 7935, + "batch_num_samples": 14, + "batch_num_tokens": 8104, + "epoch": 1.42741, + "grad_norm": 0.13633497059345245, + "learning_rate": 2.2872686806712037e-06, + "loss": 4.5225, + "step": 1100 + }, + { + "batch_num_effect_tokens": 7834, + "batch_num_samples": 24, + "batch_num_tokens": 8072, + "epoch": 1.42871, + "grad_norm": 0.13452018797397614, + "learning_rate": 2.277755385959934e-06, + "loss": 4.4902, + "step": 1101 + }, + { + "batch_num_effect_tokens": 8018, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.43001, + "grad_norm": 0.1304135024547577, + "learning_rate": 2.2682560774651458e-06, + "loss": 4.3936, + "step": 1102 + }, + { + "batch_num_effect_tokens": 8042, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.43131, + "grad_norm": 0.1268252283334732, + "learning_rate": 2.258770803991932e-06, + "loss": 4.2588, + "step": 1103 + }, + { + "batch_num_effect_tokens": 7986, + "batch_num_samples": 24, + "batch_num_tokens": 8188, + "epoch": 1.4326, + "grad_norm": 0.1350371092557907, + "learning_rate": 2.249299614273266e-06, + "loss": 4.2285, + "step": 1104 + }, + { + "batch_num_effect_tokens": 7901, + "batch_num_samples": 17, + "batch_num_tokens": 8074, + "epoch": 1.4339, + "grad_norm": 0.12892520427703857, + "learning_rate": 2.2398425569697667e-06, + "loss": 4.3389, + "step": 1105 + }, + { + "batch_num_effect_tokens": 7979, + "batch_num_samples": 19, + "batch_num_tokens": 8192, + "epoch": 1.4352, + "grad_norm": 0.11750482767820358, + "learning_rate": 2.230399680669449e-06, + "loss": 4.2197, + "step": 1106 + }, + { + "batch_num_effect_tokens": 7980, + "batch_num_samples": 14, + "batch_num_tokens": 8128, + "epoch": 1.4365, + "grad_norm": 0.12220905721187592, + "learning_rate": 2.220971033887463e-06, + "loss": 4.3984, + "step": 1107 + }, + { + "batch_num_effect_tokens": 8001, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.43779, + "grad_norm": 0.12369371205568314, + "learning_rate": 2.211556665065854e-06, + "loss": 4.5498, + "step": 1108 + }, + { + "batch_num_effect_tokens": 7932, + "batch_num_samples": 14, + "batch_num_tokens": 8076, + "epoch": 1.43909, + "grad_norm": 0.13513018190860748, + "learning_rate": 2.2021566225733094e-06, + "loss": 4.3604, + "step": 1109 + }, + { + "batch_num_effect_tokens": 8075, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.44039, + "grad_norm": 0.12401560693979263, + "learning_rate": 2.1927709547049096e-06, + "loss": 4.2617, + "step": 1110 + }, + { + "batch_num_effect_tokens": 7918, + "batch_num_samples": 19, + "batch_num_tokens": 8086, + "epoch": 1.44169, + "grad_norm": 0.12758344411849976, + "learning_rate": 2.1833997096818897e-06, + "loss": 4.7422, + "step": 1111 + }, + { + "batch_num_effect_tokens": 7989, + "batch_num_samples": 19, + "batch_num_tokens": 8191, + "epoch": 1.44298, + "grad_norm": 0.12552611529827118, + "learning_rate": 2.174042935651377e-06, + "loss": 4.3584, + "step": 1112 + }, + { + "batch_num_effect_tokens": 8035, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.44428, + "grad_norm": 0.12584245204925537, + "learning_rate": 2.1647006806861472e-06, + "loss": 4.2412, + "step": 1113 + }, + { + "batch_num_effect_tokens": 8033, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.44558, + "grad_norm": 0.13394629955291748, + "learning_rate": 2.1553729927843894e-06, + "loss": 4.0078, + "step": 1114 + }, + { + "batch_num_effect_tokens": 8032, + "batch_num_samples": 15, + "batch_num_tokens": 8186, + "epoch": 1.44688, + "grad_norm": 0.12964493036270142, + "learning_rate": 2.146059919869444e-06, + "loss": 4.3604, + "step": 1115 + }, + { + "batch_num_effect_tokens": 8009, + "batch_num_samples": 14, + "batch_num_tokens": 8173, + "epoch": 1.44818, + "grad_norm": 0.12681354582309723, + "learning_rate": 2.1367615097895707e-06, + "loss": 3.9531, + "step": 1116 + }, + { + "batch_num_effect_tokens": 8055, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.44947, + "grad_norm": 0.13806065917015076, + "learning_rate": 2.1274778103176854e-06, + "loss": 4.4346, + "step": 1117 + }, + { + "batch_num_effect_tokens": 8012, + "batch_num_samples": 14, + "batch_num_tokens": 8156, + "epoch": 1.45077, + "grad_norm": 0.14222683012485504, + "learning_rate": 2.1182088691511287e-06, + "loss": 4.541, + "step": 1118 + }, + { + "batch_num_effect_tokens": 7946, + "batch_num_samples": 21, + "batch_num_tokens": 8142, + "epoch": 1.45207, + "grad_norm": 0.14069655537605286, + "learning_rate": 2.1089547339114215e-06, + "loss": 4.3574, + "step": 1119 + }, + { + "batch_num_effect_tokens": 8012, + "batch_num_samples": 15, + "batch_num_tokens": 8164, + "epoch": 1.45337, + "grad_norm": 0.13768544793128967, + "learning_rate": 2.09971545214401e-06, + "loss": 4.7539, + "step": 1120 + }, + { + "batch_num_effect_tokens": 7943, + "batch_num_samples": 14, + "batch_num_tokens": 8101, + "epoch": 1.45466, + "grad_norm": 0.13390763103961945, + "learning_rate": 2.0904910713180275e-06, + "loss": 4.2021, + "step": 1121 + }, + { + "batch_num_effect_tokens": 8015, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 1.45596, + "grad_norm": 0.1425563544034958, + "learning_rate": 2.081281638826052e-06, + "loss": 4.1191, + "step": 1122 + }, + { + "batch_num_effect_tokens": 8046, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.45726, + "grad_norm": 0.13213439285755157, + "learning_rate": 2.072087201983857e-06, + "loss": 4.6445, + "step": 1123 + }, + { + "batch_num_effect_tokens": 8026, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.45856, + "grad_norm": 0.13721303641796112, + "learning_rate": 2.0629078080301782e-06, + "loss": 4.1318, + "step": 1124 + }, + { + "batch_num_effect_tokens": 7984, + "batch_num_samples": 14, + "batch_num_tokens": 8188, + "epoch": 1.45985, + "grad_norm": 0.1321757435798645, + "learning_rate": 2.0537435041264597e-06, + "loss": 4.1426, + "step": 1125 + }, + { + "batch_num_effect_tokens": 7970, + "batch_num_samples": 15, + "batch_num_tokens": 8126, + "epoch": 1.46115, + "grad_norm": 0.13200123608112335, + "learning_rate": 2.0445943373566178e-06, + "loss": 4.5459, + "step": 1126 + }, + { + "batch_num_effect_tokens": 8049, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.46245, + "grad_norm": 0.1399904489517212, + "learning_rate": 2.0354603547267985e-06, + "loss": 4.2383, + "step": 1127 + }, + { + "batch_num_effect_tokens": 8012, + "batch_num_samples": 20, + "batch_num_tokens": 8192, + "epoch": 1.46375, + "grad_norm": 0.14394307136535645, + "learning_rate": 2.0263416031651335e-06, + "loss": 5.0156, + "step": 1128 + }, + { + "batch_num_effect_tokens": 8035, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.46504, + "grad_norm": 0.12676626443862915, + "learning_rate": 2.017238129521506e-06, + "loss": 4.1953, + "step": 1129 + }, + { + "batch_num_effect_tokens": 7999, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.46634, + "grad_norm": 0.12323788553476334, + "learning_rate": 2.0081499805673015e-06, + "loss": 4.3027, + "step": 1130 + }, + { + "batch_num_effect_tokens": 8029, + "batch_num_samples": 14, + "batch_num_tokens": 8170, + "epoch": 1.46764, + "grad_norm": 0.12607906758785248, + "learning_rate": 1.9990772029951665e-06, + "loss": 3.9326, + "step": 1131 + }, + { + "batch_num_effect_tokens": 8022, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.46894, + "grad_norm": 0.12812362611293793, + "learning_rate": 1.9900198434187838e-06, + "loss": 4.7461, + "step": 1132 + }, + { + "batch_num_effect_tokens": 7995, + "batch_num_samples": 20, + "batch_num_tokens": 8192, + "epoch": 1.47024, + "grad_norm": 0.13349904119968414, + "learning_rate": 1.980977948372612e-06, + "loss": 4.3418, + "step": 1133 + }, + { + "batch_num_effect_tokens": 8013, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.47153, + "grad_norm": 0.12421982735395432, + "learning_rate": 1.971951564311668e-06, + "loss": 4.1846, + "step": 1134 + }, + { + "batch_num_effect_tokens": 8064, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.47283, + "grad_norm": 0.1404682993888855, + "learning_rate": 1.962940737611264e-06, + "loss": 4.3447, + "step": 1135 + }, + { + "batch_num_effect_tokens": 7949, + "batch_num_samples": 14, + "batch_num_tokens": 8088, + "epoch": 1.47413, + "grad_norm": 0.1217743456363678, + "learning_rate": 1.953945514566789e-06, + "loss": 3.9229, + "step": 1136 + }, + { + "batch_num_effect_tokens": 8047, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.47543, + "grad_norm": 0.13491998612880707, + "learning_rate": 1.9449659413934684e-06, + "loss": 4.1543, + "step": 1137 + }, + { + "batch_num_effect_tokens": 8078, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.47672, + "grad_norm": 0.12323690205812454, + "learning_rate": 1.9360020642261155e-06, + "loss": 4.2627, + "step": 1138 + }, + { + "batch_num_effect_tokens": 7875, + "batch_num_samples": 21, + "batch_num_tokens": 8108, + "epoch": 1.47802, + "grad_norm": 0.13586825132369995, + "learning_rate": 1.9270539291189054e-06, + "loss": 4.5137, + "step": 1139 + }, + { + "batch_num_effect_tokens": 8032, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.47932, + "grad_norm": 0.12245763093233109, + "learning_rate": 1.918121582045132e-06, + "loss": 4.5752, + "step": 1140 + }, + { + "batch_num_effect_tokens": 7842, + "batch_num_samples": 24, + "batch_num_tokens": 8048, + "epoch": 1.48062, + "grad_norm": 0.12825018167495728, + "learning_rate": 1.9092050688969736e-06, + "loss": 4.5674, + "step": 1141 + }, + { + "batch_num_effect_tokens": 7974, + "batch_num_samples": 19, + "batch_num_tokens": 8192, + "epoch": 1.48191, + "grad_norm": 0.12709911167621613, + "learning_rate": 1.9003044354852634e-06, + "loss": 4.3623, + "step": 1142 + }, + { + "batch_num_effect_tokens": 8006, + "batch_num_samples": 14, + "batch_num_tokens": 8191, + "epoch": 1.48321, + "grad_norm": 0.11626652628183365, + "learning_rate": 1.8914197275392444e-06, + "loss": 4.0283, + "step": 1143 + }, + { + "batch_num_effect_tokens": 7904, + "batch_num_samples": 18, + "batch_num_tokens": 8088, + "epoch": 1.48451, + "grad_norm": 0.12491626292467117, + "learning_rate": 1.8825509907063328e-06, + "loss": 4.376, + "step": 1144 + }, + { + "batch_num_effect_tokens": 8040, + "batch_num_samples": 16, + "batch_num_tokens": 8188, + "epoch": 1.48581, + "grad_norm": 0.1228693351149559, + "learning_rate": 1.8736982705519013e-06, + "loss": 4.1221, + "step": 1145 + }, + { + "batch_num_effect_tokens": 8032, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.4871, + "grad_norm": 0.13609446585178375, + "learning_rate": 1.8648616125590218e-06, + "loss": 3.9092, + "step": 1146 + }, + { + "batch_num_effect_tokens": 8029, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.4884, + "grad_norm": 0.13068844377994537, + "learning_rate": 1.8560410621282543e-06, + "loss": 4.2646, + "step": 1147 + }, + { + "batch_num_effect_tokens": 7931, + "batch_num_samples": 18, + "batch_num_tokens": 8086, + "epoch": 1.4897, + "grad_norm": 0.12820830941200256, + "learning_rate": 1.8472366645773892e-06, + "loss": 4.3457, + "step": 1148 + }, + { + "batch_num_effect_tokens": 8061, + "batch_num_samples": 13, + "batch_num_tokens": 8188, + "epoch": 1.491, + "grad_norm": 0.12200096249580383, + "learning_rate": 1.8384484651412338e-06, + "loss": 4.3672, + "step": 1149 + }, + { + "batch_num_effect_tokens": 8011, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.4923, + "grad_norm": 0.13038086891174316, + "learning_rate": 1.829676508971377e-06, + "loss": 4.2734, + "step": 1150 + }, + { + "batch_num_effect_tokens": 7874, + "batch_num_samples": 21, + "batch_num_tokens": 8091, + "epoch": 1.49359, + "grad_norm": 0.11553595215082169, + "learning_rate": 1.8209208411359485e-06, + "loss": 4.3574, + "step": 1151 + }, + { + "batch_num_effect_tokens": 8062, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.49489, + "grad_norm": 0.12026538699865341, + "learning_rate": 1.8121815066193944e-06, + "loss": 4.4014, + "step": 1152 + }, + { + "batch_num_effect_tokens": 8058, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.49619, + "grad_norm": 0.12895485758781433, + "learning_rate": 1.8034585503222441e-06, + "loss": 4.7461, + "step": 1153 + }, + { + "batch_num_effect_tokens": 8057, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.49749, + "grad_norm": 0.1211390346288681, + "learning_rate": 1.7947520170608774e-06, + "loss": 4.3555, + "step": 1154 + }, + { + "batch_num_effect_tokens": 8015, + "batch_num_samples": 14, + "batch_num_tokens": 8182, + "epoch": 1.49878, + "grad_norm": 0.1284831017255783, + "learning_rate": 1.7860619515673034e-06, + "loss": 4.2139, + "step": 1155 + }, + { + "batch_num_effect_tokens": 7974, + "batch_num_samples": 15, + "batch_num_tokens": 8098, + "epoch": 1.50008, + "grad_norm": 0.12839344143867493, + "learning_rate": 1.7773883984889178e-06, + "loss": 4.4795, + "step": 1156 + }, + { + "batch_num_effect_tokens": 7808, + "batch_num_samples": 24, + "batch_num_tokens": 8052, + "epoch": 1.50138, + "grad_norm": 0.13333484530448914, + "learning_rate": 1.7687314023882806e-06, + "loss": 4.5332, + "step": 1157 + }, + { + "batch_num_effect_tokens": 7969, + "batch_num_samples": 15, + "batch_num_tokens": 8156, + "epoch": 1.50268, + "grad_norm": 0.1345791220664978, + "learning_rate": 1.760091007742888e-06, + "loss": 4.5273, + "step": 1158 + }, + { + "batch_num_effect_tokens": 8054, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.50397, + "grad_norm": 0.12442679703235626, + "learning_rate": 1.7514672589449378e-06, + "loss": 4.248, + "step": 1159 + }, + { + "batch_num_effect_tokens": 7963, + "batch_num_samples": 14, + "batch_num_tokens": 8128, + "epoch": 1.50527, + "grad_norm": 0.1204453706741333, + "learning_rate": 1.7428602003011136e-06, + "loss": 4.6221, + "step": 1160 + }, + { + "batch_num_effect_tokens": 8049, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.50657, + "grad_norm": 0.13553136587142944, + "learning_rate": 1.734269876032344e-06, + "loss": 4.1123, + "step": 1161 + }, + { + "batch_num_effect_tokens": 8033, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.50787, + "grad_norm": 0.13123448193073273, + "learning_rate": 1.7256963302735752e-06, + "loss": 4.3574, + "step": 1162 + }, + { + "batch_num_effect_tokens": 8039, + "batch_num_samples": 14, + "batch_num_tokens": 8173, + "epoch": 1.50916, + "grad_norm": 0.12561143934726715, + "learning_rate": 1.7171396070735602e-06, + "loss": 4.0078, + "step": 1163 + }, + { + "batch_num_effect_tokens": 8063, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.51046, + "grad_norm": 0.13120131194591522, + "learning_rate": 1.7085997503946144e-06, + "loss": 4.377, + "step": 1164 + }, + { + "batch_num_effect_tokens": 8056, + "batch_num_samples": 14, + "batch_num_tokens": 8191, + "epoch": 1.51176, + "grad_norm": 0.1411786675453186, + "learning_rate": 1.7000768041124038e-06, + "loss": 4.5732, + "step": 1165 + }, + { + "batch_num_effect_tokens": 7977, + "batch_num_samples": 14, + "batch_num_tokens": 8152, + "epoch": 1.51306, + "grad_norm": 0.12541553378105164, + "learning_rate": 1.6915708120157042e-06, + "loss": 4.3467, + "step": 1166 + }, + { + "batch_num_effect_tokens": 7889, + "batch_num_samples": 25, + "batch_num_tokens": 8163, + "epoch": 1.51436, + "grad_norm": 0.12473280727863312, + "learning_rate": 1.6830818178061897e-06, + "loss": 4.1572, + "step": 1167 + }, + { + "batch_num_effect_tokens": 8015, + "batch_num_samples": 19, + "batch_num_tokens": 8192, + "epoch": 1.51565, + "grad_norm": 0.14233461022377014, + "learning_rate": 1.6746098650982072e-06, + "loss": 4.6309, + "step": 1168 + }, + { + "batch_num_effect_tokens": 8038, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.51695, + "grad_norm": 0.12509174644947052, + "learning_rate": 1.6661549974185426e-06, + "loss": 4.5986, + "step": 1169 + }, + { + "batch_num_effect_tokens": 8063, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.51825, + "grad_norm": 0.13657966256141663, + "learning_rate": 1.657717258206205e-06, + "loss": 4.9082, + "step": 1170 + }, + { + "batch_num_effect_tokens": 8046, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.51955, + "grad_norm": 0.12476794421672821, + "learning_rate": 1.6492966908122033e-06, + "loss": 4.1396, + "step": 1171 + }, + { + "batch_num_effect_tokens": 7912, + "batch_num_samples": 26, + "batch_num_tokens": 8139, + "epoch": 1.52084, + "grad_norm": 0.12963926792144775, + "learning_rate": 1.6408933384993187e-06, + "loss": 4.4238, + "step": 1172 + }, + { + "batch_num_effect_tokens": 8027, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.52214, + "grad_norm": 0.12490490823984146, + "learning_rate": 1.63250724444189e-06, + "loss": 4.6895, + "step": 1173 + }, + { + "batch_num_effect_tokens": 7991, + "batch_num_samples": 14, + "batch_num_tokens": 8121, + "epoch": 1.52344, + "grad_norm": 0.1316630095243454, + "learning_rate": 1.6241384517255854e-06, + "loss": 4.5205, + "step": 1174 + }, + { + "batch_num_effect_tokens": 7946, + "batch_num_samples": 15, + "batch_num_tokens": 8128, + "epoch": 1.52474, + "grad_norm": 0.1253131479024887, + "learning_rate": 1.6157870033471785e-06, + "loss": 4.4424, + "step": 1175 + }, + { + "batch_num_effect_tokens": 7782, + "batch_num_samples": 26, + "batch_num_tokens": 8010, + "epoch": 1.52603, + "grad_norm": 0.12522853910923004, + "learning_rate": 1.6074529422143398e-06, + "loss": 4.3564, + "step": 1176 + }, + { + "batch_num_effect_tokens": 8050, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.52733, + "grad_norm": 0.12250207364559174, + "learning_rate": 1.5991363111454023e-06, + "loss": 4.7217, + "step": 1177 + }, + { + "batch_num_effect_tokens": 8074, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.52863, + "grad_norm": 0.12559637427330017, + "learning_rate": 1.5908371528691553e-06, + "loss": 4.4688, + "step": 1178 + }, + { + "batch_num_effect_tokens": 7855, + "batch_num_samples": 17, + "batch_num_tokens": 8023, + "epoch": 1.52993, + "grad_norm": 0.13404156267642975, + "learning_rate": 1.5825555100246066e-06, + "loss": 4.5176, + "step": 1179 + }, + { + "batch_num_effect_tokens": 8030, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.53122, + "grad_norm": 0.12473981827497482, + "learning_rate": 1.5742914251607794e-06, + "loss": 4.2764, + "step": 1180 + }, + { + "batch_num_effect_tokens": 7951, + "batch_num_samples": 14, + "batch_num_tokens": 8092, + "epoch": 1.53252, + "grad_norm": 0.1322951763868332, + "learning_rate": 1.5660449407364919e-06, + "loss": 4.5439, + "step": 1181 + }, + { + "batch_num_effect_tokens": 7907, + "batch_num_samples": 20, + "batch_num_tokens": 8128, + "epoch": 1.53382, + "grad_norm": 0.13409925997257233, + "learning_rate": 1.5578160991201313e-06, + "loss": 4.3096, + "step": 1182 + }, + { + "batch_num_effect_tokens": 8038, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.53512, + "grad_norm": 0.11892995983362198, + "learning_rate": 1.549604942589441e-06, + "loss": 4.3701, + "step": 1183 + }, + { + "batch_num_effect_tokens": 8046, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.53642, + "grad_norm": 0.13523681461811066, + "learning_rate": 1.5414115133313029e-06, + "loss": 4.2051, + "step": 1184 + }, + { + "batch_num_effect_tokens": 8057, + "batch_num_samples": 17, + "batch_num_tokens": 8191, + "epoch": 1.53771, + "grad_norm": 0.12341441959142685, + "learning_rate": 1.5332358534415192e-06, + "loss": 4.4072, + "step": 1185 + }, + { + "batch_num_effect_tokens": 8033, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.53901, + "grad_norm": 0.12092790752649307, + "learning_rate": 1.5250780049246028e-06, + "loss": 4.498, + "step": 1186 + }, + { + "batch_num_effect_tokens": 8049, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.54031, + "grad_norm": 0.12938234210014343, + "learning_rate": 1.516938009693551e-06, + "loss": 4.6123, + "step": 1187 + }, + { + "batch_num_effect_tokens": 7905, + "batch_num_samples": 14, + "batch_num_tokens": 8088, + "epoch": 1.54161, + "grad_norm": 0.1272503286600113, + "learning_rate": 1.5088159095696365e-06, + "loss": 4.2402, + "step": 1188 + }, + { + "batch_num_effect_tokens": 7971, + "batch_num_samples": 21, + "batch_num_tokens": 8192, + "epoch": 1.5429, + "grad_norm": 0.14063680171966553, + "learning_rate": 1.500711746282192e-06, + "loss": 4.2334, + "step": 1189 + }, + { + "batch_num_effect_tokens": 8040, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.5442, + "grad_norm": 0.11090775579214096, + "learning_rate": 1.4926255614683931e-06, + "loss": 4.2617, + "step": 1190 + }, + { + "batch_num_effect_tokens": 7936, + "batch_num_samples": 25, + "batch_num_tokens": 8162, + "epoch": 1.5455, + "grad_norm": 0.12592093646526337, + "learning_rate": 1.484557396673052e-06, + "loss": 4.1523, + "step": 1191 + }, + { + "batch_num_effect_tokens": 8044, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.5468, + "grad_norm": 0.12234609574079514, + "learning_rate": 1.4765072933483949e-06, + "loss": 4.2314, + "step": 1192 + }, + { + "batch_num_effect_tokens": 8007, + "batch_num_samples": 15, + "batch_num_tokens": 8168, + "epoch": 1.54809, + "grad_norm": 0.11718329787254333, + "learning_rate": 1.468475292853847e-06, + "loss": 4.0645, + "step": 1193 + }, + { + "batch_num_effect_tokens": 7944, + "batch_num_samples": 16, + "batch_num_tokens": 8133, + "epoch": 1.54939, + "grad_norm": 0.13274738192558289, + "learning_rate": 1.4604614364558372e-06, + "loss": 4.4199, + "step": 1194 + }, + { + "batch_num_effect_tokens": 8022, + "batch_num_samples": 17, + "batch_num_tokens": 8166, + "epoch": 1.55069, + "grad_norm": 0.12475960701704025, + "learning_rate": 1.4524657653275653e-06, + "loss": 4.043, + "step": 1195 + }, + { + "batch_num_effect_tokens": 8041, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.55199, + "grad_norm": 0.12034016847610474, + "learning_rate": 1.444488320548807e-06, + "loss": 4.0186, + "step": 1196 + }, + { + "batch_num_effect_tokens": 8056, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.55328, + "grad_norm": 0.12400320917367935, + "learning_rate": 1.4365291431056871e-06, + "loss": 4.248, + "step": 1197 + }, + { + "batch_num_effect_tokens": 8045, + "batch_num_samples": 14, + "batch_num_tokens": 8173, + "epoch": 1.55458, + "grad_norm": 0.12639890611171722, + "learning_rate": 1.4285882738904822e-06, + "loss": 4.6426, + "step": 1198 + }, + { + "batch_num_effect_tokens": 7946, + "batch_num_samples": 17, + "batch_num_tokens": 8123, + "epoch": 1.55588, + "grad_norm": 0.12510213255882263, + "learning_rate": 1.4206657537014078e-06, + "loss": 4.248, + "step": 1199 + }, + { + "batch_num_effect_tokens": 8031, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.55718, + "grad_norm": 0.11327057331800461, + "learning_rate": 1.4127616232424042e-06, + "loss": 3.8843, + "step": 1200 + }, + { + "batch_num_effect_tokens": 8006, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.55848, + "grad_norm": 0.13065361976623535, + "learning_rate": 1.404875923122928e-06, + "loss": 4.3848, + "step": 1201 + }, + { + "batch_num_effect_tokens": 8008, + "batch_num_samples": 18, + "batch_num_tokens": 8170, + "epoch": 1.55977, + "grad_norm": 0.1324455589056015, + "learning_rate": 1.3970086938577492e-06, + "loss": 4.3975, + "step": 1202 + }, + { + "batch_num_effect_tokens": 8044, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.56107, + "grad_norm": 0.12078768759965897, + "learning_rate": 1.389159975866734e-06, + "loss": 4.0654, + "step": 1203 + }, + { + "batch_num_effect_tokens": 8037, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.56237, + "grad_norm": 0.1465671807527542, + "learning_rate": 1.3813298094746491e-06, + "loss": 4.2598, + "step": 1204 + }, + { + "batch_num_effect_tokens": 7993, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.56367, + "grad_norm": 0.13225287199020386, + "learning_rate": 1.3735182349109428e-06, + "loss": 4.4785, + "step": 1205 + }, + { + "batch_num_effect_tokens": 8042, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.56496, + "grad_norm": 0.1285356879234314, + "learning_rate": 1.3657252923095437e-06, + "loss": 4.54, + "step": 1206 + }, + { + "batch_num_effect_tokens": 7879, + "batch_num_samples": 18, + "batch_num_tokens": 8090, + "epoch": 1.56626, + "grad_norm": 0.13167142868041992, + "learning_rate": 1.357951021708655e-06, + "loss": 4.0176, + "step": 1207 + }, + { + "batch_num_effect_tokens": 8026, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.56756, + "grad_norm": 0.13281382620334625, + "learning_rate": 1.3501954630505464e-06, + "loss": 4.2588, + "step": 1208 + }, + { + "batch_num_effect_tokens": 7945, + "batch_num_samples": 14, + "batch_num_tokens": 8079, + "epoch": 1.56886, + "grad_norm": 0.12919877469539642, + "learning_rate": 1.342458656181354e-06, + "loss": 4.5977, + "step": 1209 + }, + { + "batch_num_effect_tokens": 8021, + "batch_num_samples": 14, + "batch_num_tokens": 8191, + "epoch": 1.57015, + "grad_norm": 0.13256850838661194, + "learning_rate": 1.3347406408508695e-06, + "loss": 4.1484, + "step": 1210 + }, + { + "batch_num_effect_tokens": 8055, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.57145, + "grad_norm": 0.13196738064289093, + "learning_rate": 1.3270414567123342e-06, + "loss": 4.1807, + "step": 1211 + }, + { + "batch_num_effect_tokens": 8025, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 1.57275, + "grad_norm": 0.1331268548965454, + "learning_rate": 1.3193611433222465e-06, + "loss": 4.4814, + "step": 1212 + }, + { + "batch_num_effect_tokens": 8024, + "batch_num_samples": 14, + "batch_num_tokens": 8182, + "epoch": 1.57405, + "grad_norm": 0.1267508864402771, + "learning_rate": 1.311699740140146e-06, + "loss": 4.3477, + "step": 1213 + }, + { + "batch_num_effect_tokens": 8057, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.57534, + "grad_norm": 0.1376553326845169, + "learning_rate": 1.3040572865284234e-06, + "loss": 4.5098, + "step": 1214 + }, + { + "batch_num_effect_tokens": 7972, + "batch_num_samples": 14, + "batch_num_tokens": 8100, + "epoch": 1.57664, + "grad_norm": 0.13548849523067474, + "learning_rate": 1.2964338217521021e-06, + "loss": 4.3359, + "step": 1215 + }, + { + "batch_num_effect_tokens": 8069, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.57794, + "grad_norm": 0.13228566944599152, + "learning_rate": 1.2888293849786503e-06, + "loss": 4.4883, + "step": 1216 + }, + { + "batch_num_effect_tokens": 8041, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.57924, + "grad_norm": 0.1392965018749237, + "learning_rate": 1.2812440152777773e-06, + "loss": 4.5889, + "step": 1217 + }, + { + "batch_num_effect_tokens": 8027, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.58054, + "grad_norm": 0.12365755438804626, + "learning_rate": 1.2736777516212267e-06, + "loss": 4.541, + "step": 1218 + }, + { + "batch_num_effect_tokens": 7996, + "batch_num_samples": 20, + "batch_num_tokens": 8192, + "epoch": 1.58183, + "grad_norm": 0.12618915736675262, + "learning_rate": 1.2661306328825818e-06, + "loss": 4.1309, + "step": 1219 + }, + { + "batch_num_effect_tokens": 8035, + "batch_num_samples": 14, + "batch_num_tokens": 8163, + "epoch": 1.58313, + "grad_norm": 0.12724487483501434, + "learning_rate": 1.258602697837063e-06, + "loss": 4.7012, + "step": 1220 + }, + { + "batch_num_effect_tokens": 8034, + "batch_num_samples": 14, + "batch_num_tokens": 8189, + "epoch": 1.58443, + "grad_norm": 0.12507867813110352, + "learning_rate": 1.2510939851613285e-06, + "loss": 4.6133, + "step": 1221 + }, + { + "batch_num_effect_tokens": 7940, + "batch_num_samples": 14, + "batch_num_tokens": 8190, + "epoch": 1.58573, + "grad_norm": 0.124232716858387, + "learning_rate": 1.2436045334332824e-06, + "loss": 4.6475, + "step": 1222 + }, + { + "batch_num_effect_tokens": 7957, + "batch_num_samples": 22, + "batch_num_tokens": 8192, + "epoch": 1.58702, + "grad_norm": 0.13959231972694397, + "learning_rate": 1.2361343811318665e-06, + "loss": 4.6484, + "step": 1223 + }, + { + "batch_num_effect_tokens": 8044, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.58832, + "grad_norm": 0.12538942694664001, + "learning_rate": 1.2286835666368623e-06, + "loss": 4.4883, + "step": 1224 + }, + { + "batch_num_effect_tokens": 8027, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.58962, + "grad_norm": 0.12452027201652527, + "learning_rate": 1.2212521282287093e-06, + "loss": 4.0713, + "step": 1225 + }, + { + "batch_num_effect_tokens": 8045, + "batch_num_samples": 14, + "batch_num_tokens": 8191, + "epoch": 1.59092, + "grad_norm": 0.13691964745521545, + "learning_rate": 1.2138401040882874e-06, + "loss": 4.5684, + "step": 1226 + }, + { + "batch_num_effect_tokens": 7894, + "batch_num_samples": 24, + "batch_num_tokens": 8096, + "epoch": 1.59221, + "grad_norm": 0.12278582155704498, + "learning_rate": 1.20644753229674e-06, + "loss": 4.1816, + "step": 1227 + }, + { + "batch_num_effect_tokens": 8032, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.59351, + "grad_norm": 0.12262444198131561, + "learning_rate": 1.1990744508352604e-06, + "loss": 4.0322, + "step": 1228 + }, + { + "batch_num_effect_tokens": 7945, + "batch_num_samples": 15, + "batch_num_tokens": 8106, + "epoch": 1.59481, + "grad_norm": 0.13073332607746124, + "learning_rate": 1.191720897584908e-06, + "loss": 4.5508, + "step": 1229 + }, + { + "batch_num_effect_tokens": 7928, + "batch_num_samples": 17, + "batch_num_tokens": 8113, + "epoch": 1.59611, + "grad_norm": 0.13038553297519684, + "learning_rate": 1.1843869103264173e-06, + "loss": 4.6455, + "step": 1230 + }, + { + "batch_num_effect_tokens": 8026, + "batch_num_samples": 17, + "batch_num_tokens": 8187, + "epoch": 1.5974, + "grad_norm": 0.11809444427490234, + "learning_rate": 1.1770725267399892e-06, + "loss": 4.1328, + "step": 1231 + }, + { + "batch_num_effect_tokens": 7988, + "batch_num_samples": 22, + "batch_num_tokens": 8189, + "epoch": 1.5987, + "grad_norm": 0.11818146705627441, + "learning_rate": 1.1697777844051105e-06, + "loss": 4.2168, + "step": 1232 + }, + { + "batch_num_effect_tokens": 7988, + "batch_num_samples": 22, + "batch_num_tokens": 8189, + "epoch": 1.5987, + "eval_eval_loss": 0.5610187649726868, + "eval_eval_runtime": 115.0388, + "eval_eval_samples_per_second": 43.464, + "eval_eval_steps_per_second": 2.721, + "step": 1232 + }, + { + "batch_num_effect_tokens": 8012, + "batch_num_samples": 21, + "batch_num_tokens": 8192, + "epoch": 1.6, + "grad_norm": 0.12877851724624634, + "learning_rate": 1.1625027208003547e-06, + "loss": 4.4346, + "step": 1233 + }, + { + "batch_num_effect_tokens": 7934, + "batch_num_samples": 15, + "batch_num_tokens": 8104, + "epoch": 1.6013, + "grad_norm": 0.13281628489494324, + "learning_rate": 1.1552473733031893e-06, + "loss": 4.0088, + "step": 1234 + }, + { + "batch_num_effect_tokens": 7990, + "batch_num_samples": 15, + "batch_num_tokens": 8182, + "epoch": 1.6026, + "grad_norm": 0.133657768368721, + "learning_rate": 1.148011779189791e-06, + "loss": 4.5449, + "step": 1235 + }, + { + "batch_num_effect_tokens": 8052, + "batch_num_samples": 14, + "batch_num_tokens": 8191, + "epoch": 1.60389, + "grad_norm": 0.11349672079086304, + "learning_rate": 1.1407959756348424e-06, + "loss": 4.1943, + "step": 1236 + }, + { + "batch_num_effect_tokens": 7980, + "batch_num_samples": 21, + "batch_num_tokens": 8192, + "epoch": 1.60519, + "grad_norm": 0.14057059586048126, + "learning_rate": 1.133599999711349e-06, + "loss": 4.209, + "step": 1237 + }, + { + "batch_num_effect_tokens": 8055, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.60649, + "grad_norm": 0.13591386377811432, + "learning_rate": 1.1264238883904483e-06, + "loss": 4.25, + "step": 1238 + }, + { + "batch_num_effect_tokens": 7949, + "batch_num_samples": 17, + "batch_num_tokens": 8096, + "epoch": 1.60779, + "grad_norm": 0.1403154879808426, + "learning_rate": 1.1192676785412154e-06, + "loss": 4.5283, + "step": 1239 + }, + { + "batch_num_effect_tokens": 8032, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.60908, + "grad_norm": 0.12656299769878387, + "learning_rate": 1.112131406930481e-06, + "loss": 4.4707, + "step": 1240 + }, + { + "batch_num_effect_tokens": 8030, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.61038, + "grad_norm": 0.1420130729675293, + "learning_rate": 1.1050151102226369e-06, + "loss": 4.4678, + "step": 1241 + }, + { + "batch_num_effect_tokens": 8002, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.61168, + "grad_norm": 0.12833444774150848, + "learning_rate": 1.097918824979442e-06, + "loss": 4.2129, + "step": 1242 + }, + { + "batch_num_effect_tokens": 7962, + "batch_num_samples": 20, + "batch_num_tokens": 8192, + "epoch": 1.61298, + "grad_norm": 0.13786719739437103, + "learning_rate": 1.0908425876598512e-06, + "loss": 4.3408, + "step": 1243 + }, + { + "batch_num_effect_tokens": 7948, + "batch_num_samples": 14, + "batch_num_tokens": 8116, + "epoch": 1.61427, + "grad_norm": 0.13015086948871613, + "learning_rate": 1.0837864346198117e-06, + "loss": 4.4014, + "step": 1244 + }, + { + "batch_num_effect_tokens": 8040, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.61557, + "grad_norm": 0.1163618341088295, + "learning_rate": 1.0767504021120884e-06, + "loss": 3.9277, + "step": 1245 + }, + { + "batch_num_effect_tokens": 7856, + "batch_num_samples": 17, + "batch_num_tokens": 8040, + "epoch": 1.61687, + "grad_norm": 0.12296216189861298, + "learning_rate": 1.0697345262860638e-06, + "loss": 4.5225, + "step": 1246 + }, + { + "batch_num_effect_tokens": 7929, + "batch_num_samples": 23, + "batch_num_tokens": 8136, + "epoch": 1.61817, + "grad_norm": 0.1348690241575241, + "learning_rate": 1.062738843187565e-06, + "loss": 4.3125, + "step": 1247 + }, + { + "batch_num_effect_tokens": 8043, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.61946, + "grad_norm": 0.12651218473911285, + "learning_rate": 1.0557633887586765e-06, + "loss": 4.2842, + "step": 1248 + }, + { + "batch_num_effect_tokens": 8025, + "batch_num_samples": 14, + "batch_num_tokens": 8191, + "epoch": 1.62076, + "grad_norm": 0.12578874826431274, + "learning_rate": 1.0488081988375493e-06, + "loss": 4.0039, + "step": 1249 + }, + { + "batch_num_effect_tokens": 8064, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.62206, + "grad_norm": 0.1441895216703415, + "learning_rate": 1.04187330915822e-06, + "loss": 4.1748, + "step": 1250 + }, + { + "batch_num_effect_tokens": 7945, + "batch_num_samples": 19, + "batch_num_tokens": 8146, + "epoch": 1.62336, + "grad_norm": 0.13080251216888428, + "learning_rate": 1.0349587553504298e-06, + "loss": 4.6592, + "step": 1251 + }, + { + "batch_num_effect_tokens": 7926, + "batch_num_samples": 15, + "batch_num_tokens": 8086, + "epoch": 1.62466, + "grad_norm": 0.1321391612291336, + "learning_rate": 1.0280645729394368e-06, + "loss": 4.3711, + "step": 1252 + }, + { + "batch_num_effect_tokens": 7936, + "batch_num_samples": 14, + "batch_num_tokens": 8083, + "epoch": 1.62595, + "grad_norm": 0.14101214706897736, + "learning_rate": 1.0211907973458391e-06, + "loss": 4.248, + "step": 1253 + }, + { + "batch_num_effect_tokens": 8074, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.62725, + "grad_norm": 0.13918721675872803, + "learning_rate": 1.0143374638853892e-06, + "loss": 4.5293, + "step": 1254 + }, + { + "batch_num_effect_tokens": 8058, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.62855, + "grad_norm": 0.1319819986820221, + "learning_rate": 1.0075046077688067e-06, + "loss": 4.3057, + "step": 1255 + }, + { + "batch_num_effect_tokens": 7918, + "batch_num_samples": 19, + "batch_num_tokens": 8101, + "epoch": 1.62985, + "grad_norm": 0.11569049954414368, + "learning_rate": 1.0006922641016131e-06, + "loss": 4.0488, + "step": 1256 + }, + { + "batch_num_effect_tokens": 8029, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.63114, + "grad_norm": 0.13643518090248108, + "learning_rate": 9.939004678839348e-07, + "loss": 4.96, + "step": 1257 + }, + { + "batch_num_effect_tokens": 8010, + "batch_num_samples": 21, + "batch_num_tokens": 8192, + "epoch": 1.63244, + "grad_norm": 0.1374160200357437, + "learning_rate": 9.871292540103377e-07, + "loss": 4.2441, + "step": 1258 + }, + { + "batch_num_effect_tokens": 8060, + "batch_num_samples": 14, + "batch_num_tokens": 8191, + "epoch": 1.63374, + "grad_norm": 0.12853796780109406, + "learning_rate": 9.803786572696321e-07, + "loss": 4.377, + "step": 1259 + }, + { + "batch_num_effect_tokens": 7908, + "batch_num_samples": 19, + "batch_num_tokens": 8086, + "epoch": 1.63504, + "grad_norm": 0.14067308604717255, + "learning_rate": 9.73648712344707e-07, + "loss": 4.4121, + "step": 1260 + }, + { + "batch_num_effect_tokens": 8050, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.63633, + "grad_norm": 0.1275281310081482, + "learning_rate": 9.6693945381235e-07, + "loss": 4.5293, + "step": 1261 + }, + { + "batch_num_effect_tokens": 8034, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 1.63763, + "grad_norm": 0.12822289764881134, + "learning_rate": 9.602509161430628e-07, + "loss": 4.0166, + "step": 1262 + }, + { + "batch_num_effect_tokens": 7985, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.63893, + "grad_norm": 0.13135388493537903, + "learning_rate": 9.53583133700891e-07, + "loss": 4.1982, + "step": 1263 + }, + { + "batch_num_effect_tokens": 7980, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.64023, + "grad_norm": 0.13726817071437836, + "learning_rate": 9.469361407432431e-07, + "loss": 4.4258, + "step": 1264 + }, + { + "batch_num_effect_tokens": 8024, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.64152, + "grad_norm": 0.1434670388698578, + "learning_rate": 9.403099714207175e-07, + "loss": 4.5791, + "step": 1265 + }, + { + "batch_num_effect_tokens": 7957, + "batch_num_samples": 14, + "batch_num_tokens": 8110, + "epoch": 1.64282, + "grad_norm": 0.12421970069408417, + "learning_rate": 9.337046597769272e-07, + "loss": 4.0225, + "step": 1266 + }, + { + "batch_num_effect_tokens": 8022, + "batch_num_samples": 14, + "batch_num_tokens": 8142, + "epoch": 1.64412, + "grad_norm": 0.13223737478256226, + "learning_rate": 9.271202397483214e-07, + "loss": 3.9521, + "step": 1267 + }, + { + "batch_num_effect_tokens": 8023, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 1.64542, + "grad_norm": 0.12750263512134552, + "learning_rate": 9.205567451640151e-07, + "loss": 4.0049, + "step": 1268 + }, + { + "batch_num_effect_tokens": 8065, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.64672, + "grad_norm": 0.12239693850278854, + "learning_rate": 9.140142097456117e-07, + "loss": 4.2539, + "step": 1269 + }, + { + "batch_num_effect_tokens": 8007, + "batch_num_samples": 19, + "batch_num_tokens": 8179, + "epoch": 1.64801, + "grad_norm": 0.1410730630159378, + "learning_rate": 9.074926671070322e-07, + "loss": 4.3662, + "step": 1270 + }, + { + "batch_num_effect_tokens": 7938, + "batch_num_samples": 18, + "batch_num_tokens": 8085, + "epoch": 1.64931, + "grad_norm": 0.12114191055297852, + "learning_rate": 9.009921507543445e-07, + "loss": 4.6211, + "step": 1271 + }, + { + "batch_num_effect_tokens": 7922, + "batch_num_samples": 14, + "batch_num_tokens": 8092, + "epoch": 1.65061, + "grad_norm": 0.12979581952095032, + "learning_rate": 8.945126940855864e-07, + "loss": 4.6465, + "step": 1272 + }, + { + "batch_num_effect_tokens": 8024, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.65191, + "grad_norm": 0.12733100354671478, + "learning_rate": 8.880543303905931e-07, + "loss": 4.082, + "step": 1273 + }, + { + "batch_num_effect_tokens": 7917, + "batch_num_samples": 16, + "batch_num_tokens": 8089, + "epoch": 1.6532, + "grad_norm": 0.11588294804096222, + "learning_rate": 8.816170928508367e-07, + "loss": 4.1299, + "step": 1274 + }, + { + "batch_num_effect_tokens": 8001, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.6545, + "grad_norm": 0.11835164576768875, + "learning_rate": 8.752010145392408e-07, + "loss": 4.1543, + "step": 1275 + }, + { + "batch_num_effect_tokens": 8027, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.6558, + "grad_norm": 0.13759027421474457, + "learning_rate": 8.688061284200266e-07, + "loss": 4.5654, + "step": 1276 + }, + { + "batch_num_effect_tokens": 8030, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.6571, + "grad_norm": 0.13586406409740448, + "learning_rate": 8.624324673485252e-07, + "loss": 4.3682, + "step": 1277 + }, + { + "batch_num_effect_tokens": 8036, + "batch_num_samples": 14, + "batch_num_tokens": 8164, + "epoch": 1.65839, + "grad_norm": 0.13176386058330536, + "learning_rate": 8.560800640710248e-07, + "loss": 4.3643, + "step": 1278 + }, + { + "batch_num_effect_tokens": 8035, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.65969, + "grad_norm": 0.1281612068414688, + "learning_rate": 8.497489512245971e-07, + "loss": 4.0146, + "step": 1279 + }, + { + "batch_num_effect_tokens": 7869, + "batch_num_samples": 15, + "batch_num_tokens": 8086, + "epoch": 1.66099, + "grad_norm": 0.11786817759275436, + "learning_rate": 8.434391613369258e-07, + "loss": 3.9717, + "step": 1280 + }, + { + "batch_num_effect_tokens": 8016, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 1.66229, + "grad_norm": 0.13192172348499298, + "learning_rate": 8.371507268261436e-07, + "loss": 4.3145, + "step": 1281 + }, + { + "batch_num_effect_tokens": 8052, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.66358, + "grad_norm": 0.13585922122001648, + "learning_rate": 8.308836800006648e-07, + "loss": 4.2051, + "step": 1282 + }, + { + "batch_num_effect_tokens": 8044, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.66488, + "grad_norm": 0.13544043898582458, + "learning_rate": 8.246380530590175e-07, + "loss": 4.2764, + "step": 1283 + }, + { + "batch_num_effect_tokens": 7965, + "batch_num_samples": 15, + "batch_num_tokens": 8110, + "epoch": 1.66618, + "grad_norm": 0.1333230584859848, + "learning_rate": 8.184138780896839e-07, + "loss": 4.2051, + "step": 1284 + }, + { + "batch_num_effect_tokens": 7949, + "batch_num_samples": 14, + "batch_num_tokens": 8088, + "epoch": 1.66748, + "grad_norm": 0.1322176456451416, + "learning_rate": 8.122111870709287e-07, + "loss": 4.2715, + "step": 1285 + }, + { + "batch_num_effect_tokens": 7749, + "batch_num_samples": 30, + "batch_num_tokens": 8018, + "epoch": 1.66878, + "grad_norm": 0.13512325286865234, + "learning_rate": 8.060300118706327e-07, + "loss": 3.9844, + "step": 1286 + }, + { + "batch_num_effect_tokens": 7993, + "batch_num_samples": 14, + "batch_num_tokens": 8119, + "epoch": 1.67007, + "grad_norm": 0.13298512995243073, + "learning_rate": 7.99870384246143e-07, + "loss": 4.3496, + "step": 1287 + }, + { + "batch_num_effect_tokens": 8030, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.67137, + "grad_norm": 0.134169802069664, + "learning_rate": 7.937323358440935e-07, + "loss": 4.5732, + "step": 1288 + }, + { + "batch_num_effect_tokens": 8047, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.67267, + "grad_norm": 0.139683797955513, + "learning_rate": 7.876158982002552e-07, + "loss": 4.2637, + "step": 1289 + }, + { + "batch_num_effect_tokens": 8037, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.67397, + "grad_norm": 0.13284529745578766, + "learning_rate": 7.815211027393616e-07, + "loss": 4.3613, + "step": 1290 + }, + { + "batch_num_effect_tokens": 8020, + "batch_num_samples": 17, + "batch_num_tokens": 8165, + "epoch": 1.67526, + "grad_norm": 0.1360265165567398, + "learning_rate": 7.754479807749571e-07, + "loss": 3.9727, + "step": 1291 + }, + { + "batch_num_effect_tokens": 7988, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.67656, + "grad_norm": 0.13193875551223755, + "learning_rate": 7.693965635092365e-07, + "loss": 4.5273, + "step": 1292 + }, + { + "batch_num_effect_tokens": 8002, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.67786, + "grad_norm": 0.1406845897436142, + "learning_rate": 7.633668820328765e-07, + "loss": 4.4824, + "step": 1293 + }, + { + "batch_num_effect_tokens": 7877, + "batch_num_samples": 18, + "batch_num_tokens": 8100, + "epoch": 1.67916, + "grad_norm": 0.12547780573368073, + "learning_rate": 7.573589673248833e-07, + "loss": 4.3389, + "step": 1294 + }, + { + "batch_num_effect_tokens": 8022, + "batch_num_samples": 15, + "batch_num_tokens": 8175, + "epoch": 1.68045, + "grad_norm": 0.12817974388599396, + "learning_rate": 7.513728502524286e-07, + "loss": 4.0254, + "step": 1295 + }, + { + "batch_num_effect_tokens": 8007, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.68175, + "grad_norm": 0.1354372501373291, + "learning_rate": 7.454085615706951e-07, + "loss": 4.4277, + "step": 1296 + }, + { + "batch_num_effect_tokens": 8031, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.68305, + "grad_norm": 0.12303349375724792, + "learning_rate": 7.394661319227175e-07, + "loss": 4.6855, + "step": 1297 + }, + { + "batch_num_effect_tokens": 8019, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.68435, + "grad_norm": 0.11604474484920502, + "learning_rate": 7.33545591839222e-07, + "loss": 4.2285, + "step": 1298 + }, + { + "batch_num_effect_tokens": 7995, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 1.68564, + "grad_norm": 0.12446524202823639, + "learning_rate": 7.276469717384726e-07, + "loss": 4.1304, + "step": 1299 + }, + { + "batch_num_effect_tokens": 8042, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.68694, + "grad_norm": 0.1320941001176834, + "learning_rate": 7.217703019261135e-07, + "loss": 4.4512, + "step": 1300 + }, + { + "batch_num_effect_tokens": 8004, + "batch_num_samples": 19, + "batch_num_tokens": 8192, + "epoch": 1.68824, + "grad_norm": 0.13994361460208893, + "learning_rate": 7.15915612595014e-07, + "loss": 4.5732, + "step": 1301 + }, + { + "batch_num_effect_tokens": 7981, + "batch_num_samples": 14, + "batch_num_tokens": 8121, + "epoch": 1.68954, + "grad_norm": 0.1323327273130417, + "learning_rate": 7.100829338251147e-07, + "loss": 4.2119, + "step": 1302 + }, + { + "batch_num_effect_tokens": 8014, + "batch_num_samples": 15, + "batch_num_tokens": 8152, + "epoch": 1.69084, + "grad_norm": 0.12983821332454681, + "learning_rate": 7.042722955832703e-07, + "loss": 4.5225, + "step": 1303 + }, + { + "batch_num_effect_tokens": 7942, + "batch_num_samples": 19, + "batch_num_tokens": 8116, + "epoch": 1.69213, + "grad_norm": 0.13648320734500885, + "learning_rate": 6.984837277230927e-07, + "loss": 4.3262, + "step": 1304 + }, + { + "batch_num_effect_tokens": 8002, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.69343, + "grad_norm": 0.13606522977352142, + "learning_rate": 6.927172599848092e-07, + "loss": 4.4053, + "step": 1305 + }, + { + "batch_num_effect_tokens": 8028, + "batch_num_samples": 14, + "batch_num_tokens": 8185, + "epoch": 1.69473, + "grad_norm": 0.1341053992509842, + "learning_rate": 6.86972921995096e-07, + "loss": 4.0566, + "step": 1306 + }, + { + "batch_num_effect_tokens": 8042, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.69603, + "grad_norm": 0.13478976488113403, + "learning_rate": 6.812507432669374e-07, + "loss": 4.1006, + "step": 1307 + }, + { + "batch_num_effect_tokens": 8054, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.69732, + "grad_norm": 0.12561902403831482, + "learning_rate": 6.755507531994637e-07, + "loss": 4.1797, + "step": 1308 + }, + { + "batch_num_effect_tokens": 7813, + "batch_num_samples": 17, + "batch_num_tokens": 7989, + "epoch": 1.69862, + "grad_norm": 0.13221710920333862, + "learning_rate": 6.698729810778065e-07, + "loss": 4.3721, + "step": 1309 + }, + { + "batch_num_effect_tokens": 7902, + "batch_num_samples": 18, + "batch_num_tokens": 8090, + "epoch": 1.69992, + "grad_norm": 0.1371319591999054, + "learning_rate": 6.642174560729514e-07, + "loss": 4.084, + "step": 1310 + }, + { + "batch_num_effect_tokens": 7862, + "batch_num_samples": 29, + "batch_num_tokens": 8092, + "epoch": 1.70122, + "grad_norm": 0.12322220206260681, + "learning_rate": 6.585842072415799e-07, + "loss": 4.2988, + "step": 1311 + }, + { + "batch_num_effect_tokens": 7981, + "batch_num_samples": 14, + "batch_num_tokens": 8110, + "epoch": 1.70251, + "grad_norm": 0.13426773250102997, + "learning_rate": 6.529732635259234e-07, + "loss": 4.7168, + "step": 1312 + }, + { + "batch_num_effect_tokens": 8023, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.70381, + "grad_norm": 0.12802381813526154, + "learning_rate": 6.473846537536183e-07, + "loss": 4.4238, + "step": 1313 + }, + { + "batch_num_effect_tokens": 8022, + "batch_num_samples": 19, + "batch_num_tokens": 8189, + "epoch": 1.70511, + "grad_norm": 0.1425420641899109, + "learning_rate": 6.41818406637551e-07, + "loss": 4.29, + "step": 1314 + }, + { + "batch_num_effect_tokens": 8036, + "batch_num_samples": 15, + "batch_num_tokens": 8190, + "epoch": 1.70641, + "grad_norm": 0.13316959142684937, + "learning_rate": 6.36274550775719e-07, + "loss": 4.2227, + "step": 1315 + }, + { + "batch_num_effect_tokens": 8000, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 1.7077, + "grad_norm": 0.1335574984550476, + "learning_rate": 6.307531146510754e-07, + "loss": 4.5928, + "step": 1316 + }, + { + "batch_num_effect_tokens": 8053, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.709, + "grad_norm": 0.1302870213985443, + "learning_rate": 6.252541266313866e-07, + "loss": 4.0986, + "step": 1317 + }, + { + "batch_num_effect_tokens": 8042, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.7103, + "grad_norm": 0.13204161822795868, + "learning_rate": 6.197776149690871e-07, + "loss": 4.5967, + "step": 1318 + }, + { + "batch_num_effect_tokens": 8058, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.7116, + "grad_norm": 0.12694908678531647, + "learning_rate": 6.143236078011317e-07, + "loss": 4.2031, + "step": 1319 + }, + { + "batch_num_effect_tokens": 7929, + "batch_num_samples": 15, + "batch_num_tokens": 8086, + "epoch": 1.7129, + "grad_norm": 0.12293847650289536, + "learning_rate": 6.088921331488568e-07, + "loss": 3.9863, + "step": 1320 + }, + { + "batch_num_effect_tokens": 8007, + "batch_num_samples": 22, + "batch_num_tokens": 8192, + "epoch": 1.71419, + "grad_norm": 0.12406273186206818, + "learning_rate": 6.034832189178302e-07, + "loss": 4.2266, + "step": 1321 + }, + { + "batch_num_effect_tokens": 8033, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.71549, + "grad_norm": 0.12104199081659317, + "learning_rate": 5.980968928977049e-07, + "loss": 4.2158, + "step": 1322 + }, + { + "batch_num_effect_tokens": 8025, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.71679, + "grad_norm": 0.13361838459968567, + "learning_rate": 5.927331827620902e-07, + "loss": 4.3867, + "step": 1323 + }, + { + "batch_num_effect_tokens": 8051, + "batch_num_samples": 14, + "batch_num_tokens": 8191, + "epoch": 1.71809, + "grad_norm": 0.1261344850063324, + "learning_rate": 5.873921160683943e-07, + "loss": 4.332, + "step": 1324 + }, + { + "batch_num_effect_tokens": 8053, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.71938, + "grad_norm": 0.127448171377182, + "learning_rate": 5.820737202576909e-07, + "loss": 4.499, + "step": 1325 + }, + { + "batch_num_effect_tokens": 7944, + "batch_num_samples": 14, + "batch_num_tokens": 8112, + "epoch": 1.72068, + "grad_norm": 0.12296268343925476, + "learning_rate": 5.767780226545766e-07, + "loss": 4.0928, + "step": 1326 + }, + { + "batch_num_effect_tokens": 8039, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.72198, + "grad_norm": 0.1200428232550621, + "learning_rate": 5.715050504670288e-07, + "loss": 4.5107, + "step": 1327 + }, + { + "batch_num_effect_tokens": 8043, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.72328, + "grad_norm": 0.1229812353849411, + "learning_rate": 5.662548307862714e-07, + "loss": 4.6191, + "step": 1328 + }, + { + "batch_num_effect_tokens": 8028, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.72457, + "grad_norm": 0.1287735551595688, + "learning_rate": 5.61027390586626e-07, + "loss": 4.6602, + "step": 1329 + }, + { + "batch_num_effect_tokens": 8022, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.72587, + "grad_norm": 0.12394028156995773, + "learning_rate": 5.558227567253832e-07, + "loss": 4.3076, + "step": 1330 + }, + { + "batch_num_effect_tokens": 8012, + "batch_num_samples": 14, + "batch_num_tokens": 8149, + "epoch": 1.72717, + "grad_norm": 0.12008914351463318, + "learning_rate": 5.506409559426573e-07, + "loss": 4.373, + "step": 1331 + }, + { + "batch_num_effect_tokens": 8036, + "batch_num_samples": 14, + "batch_num_tokens": 8191, + "epoch": 1.72847, + "grad_norm": 0.12099087238311768, + "learning_rate": 5.454820148612533e-07, + "loss": 4.0195, + "step": 1332 + }, + { + "batch_num_effect_tokens": 7910, + "batch_num_samples": 20, + "batch_num_tokens": 8096, + "epoch": 1.72976, + "grad_norm": 0.13755109906196594, + "learning_rate": 5.403459599865307e-07, + "loss": 4.459, + "step": 1333 + }, + { + "batch_num_effect_tokens": 8041, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.73106, + "grad_norm": 0.12216237932443619, + "learning_rate": 5.352328177062626e-07, + "loss": 4.0791, + "step": 1334 + }, + { + "batch_num_effect_tokens": 7912, + "batch_num_samples": 23, + "batch_num_tokens": 8105, + "epoch": 1.73236, + "grad_norm": 0.14445650577545166, + "learning_rate": 5.301426142905019e-07, + "loss": 4.2441, + "step": 1335 + }, + { + "batch_num_effect_tokens": 8031, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.73366, + "grad_norm": 0.12957431375980377, + "learning_rate": 5.250753758914506e-07, + "loss": 4.7578, + "step": 1336 + }, + { + "batch_num_effect_tokens": 7838, + "batch_num_samples": 25, + "batch_num_tokens": 8114, + "epoch": 1.73496, + "grad_norm": 0.12466391175985336, + "learning_rate": 5.200311285433213e-07, + "loss": 4.2861, + "step": 1337 + }, + { + "batch_num_effect_tokens": 8027, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 1.73625, + "grad_norm": 0.12449593842029572, + "learning_rate": 5.15009898162202e-07, + "loss": 4.5078, + "step": 1338 + }, + { + "batch_num_effect_tokens": 8008, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.73755, + "grad_norm": 0.12438397109508514, + "learning_rate": 5.100117105459279e-07, + "loss": 4.2246, + "step": 1339 + }, + { + "batch_num_effect_tokens": 8036, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.73885, + "grad_norm": 0.1266915500164032, + "learning_rate": 5.050365913739441e-07, + "loss": 3.8174, + "step": 1340 + }, + { + "batch_num_effect_tokens": 8032, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.74015, + "grad_norm": 0.12431465089321136, + "learning_rate": 5.000845662071779e-07, + "loss": 4.334, + "step": 1341 + }, + { + "batch_num_effect_tokens": 8017, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.74144, + "grad_norm": 0.12137281149625778, + "learning_rate": 4.951556604879049e-07, + "loss": 4.001, + "step": 1342 + }, + { + "batch_num_effect_tokens": 8057, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.74274, + "grad_norm": 0.12117177993059158, + "learning_rate": 4.902498995396166e-07, + "loss": 4.2422, + "step": 1343 + }, + { + "batch_num_effect_tokens": 8002, + "batch_num_samples": 19, + "batch_num_tokens": 8192, + "epoch": 1.74404, + "grad_norm": 0.12428870052099228, + "learning_rate": 4.853673085668947e-07, + "loss": 4.1943, + "step": 1344 + }, + { + "batch_num_effect_tokens": 8014, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.74534, + "grad_norm": 0.12221790850162506, + "learning_rate": 4.80507912655277e-07, + "loss": 4.5078, + "step": 1345 + }, + { + "batch_num_effect_tokens": 7997, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.74663, + "grad_norm": 0.12552009522914886, + "learning_rate": 4.75671736771135e-07, + "loss": 4.2559, + "step": 1346 + }, + { + "batch_num_effect_tokens": 8049, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.74793, + "grad_norm": 0.11707472056150436, + "learning_rate": 4.7085880576153765e-07, + "loss": 3.9307, + "step": 1347 + }, + { + "batch_num_effect_tokens": 8031, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.74923, + "grad_norm": 0.11228827387094498, + "learning_rate": 4.660691443541282e-07, + "loss": 4.1211, + "step": 1348 + }, + { + "batch_num_effect_tokens": 8019, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.75053, + "grad_norm": 0.12909752130508423, + "learning_rate": 4.6130277715699777e-07, + "loss": 4.2334, + "step": 1349 + }, + { + "batch_num_effect_tokens": 8030, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.75182, + "grad_norm": 0.1351223587989807, + "learning_rate": 4.565597286585555e-07, + "loss": 4.5566, + "step": 1350 + }, + { + "batch_num_effect_tokens": 7985, + "batch_num_samples": 15, + "batch_num_tokens": 8167, + "epoch": 1.75312, + "grad_norm": 0.13169695436954498, + "learning_rate": 4.5184002322740784e-07, + "loss": 4.4697, + "step": 1351 + }, + { + "batch_num_effect_tokens": 8046, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.75442, + "grad_norm": 0.12474015355110168, + "learning_rate": 4.4714368511222905e-07, + "loss": 3.9141, + "step": 1352 + }, + { + "batch_num_effect_tokens": 8058, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.75572, + "grad_norm": 0.1359989494085312, + "learning_rate": 4.4247073844163434e-07, + "loss": 4.4863, + "step": 1353 + }, + { + "batch_num_effect_tokens": 8040, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.75702, + "grad_norm": 0.13669313490390778, + "learning_rate": 4.3782120722406565e-07, + "loss": 4.3809, + "step": 1354 + }, + { + "batch_num_effect_tokens": 8029, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.75831, + "grad_norm": 0.13616794347763062, + "learning_rate": 4.331951153476588e-07, + "loss": 4.0215, + "step": 1355 + }, + { + "batch_num_effect_tokens": 8004, + "batch_num_samples": 17, + "batch_num_tokens": 8191, + "epoch": 1.75961, + "grad_norm": 0.13854430615901947, + "learning_rate": 4.285924865801233e-07, + "loss": 4.5312, + "step": 1356 + }, + { + "batch_num_effect_tokens": 8053, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.76091, + "grad_norm": 0.13246670365333557, + "learning_rate": 4.2401334456862344e-07, + "loss": 4.4092, + "step": 1357 + }, + { + "batch_num_effect_tokens": 8025, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.76221, + "grad_norm": 0.12861448526382446, + "learning_rate": 4.194577128396521e-07, + "loss": 4.2188, + "step": 1358 + }, + { + "batch_num_effect_tokens": 8045, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.7635, + "grad_norm": 0.1204322949051857, + "learning_rate": 4.149256147989139e-07, + "loss": 4.2617, + "step": 1359 + }, + { + "batch_num_effect_tokens": 8039, + "batch_num_samples": 14, + "batch_num_tokens": 8189, + "epoch": 1.7648, + "grad_norm": 0.14009417593479156, + "learning_rate": 4.1041707373120354e-07, + "loss": 4.0459, + "step": 1360 + }, + { + "batch_num_effect_tokens": 8015, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.7661, + "grad_norm": 0.13464143872261047, + "learning_rate": 4.05932112800283e-07, + "loss": 4.4492, + "step": 1361 + }, + { + "batch_num_effect_tokens": 7937, + "batch_num_samples": 23, + "batch_num_tokens": 8192, + "epoch": 1.7674, + "grad_norm": 0.12960170209407806, + "learning_rate": 4.0147075504876844e-07, + "loss": 4.3623, + "step": 1362 + }, + { + "batch_num_effect_tokens": 8084, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.76869, + "grad_norm": 0.1406746506690979, + "learning_rate": 3.9703302339800687e-07, + "loss": 4.4395, + "step": 1363 + }, + { + "batch_num_effect_tokens": 8039, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.76999, + "grad_norm": 0.12615273892879486, + "learning_rate": 3.9261894064796136e-07, + "loss": 4.4746, + "step": 1364 + }, + { + "batch_num_effect_tokens": 8056, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.77129, + "grad_norm": 0.13323578238487244, + "learning_rate": 3.882285294770938e-07, + "loss": 4.1318, + "step": 1365 + }, + { + "batch_num_effect_tokens": 7836, + "batch_num_samples": 17, + "batch_num_tokens": 8011, + "epoch": 1.77259, + "grad_norm": 0.12410898506641388, + "learning_rate": 3.8386181244224274e-07, + "loss": 4.2305, + "step": 1366 + }, + { + "batch_num_effect_tokens": 8017, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.77388, + "grad_norm": 0.1302991360425949, + "learning_rate": 3.7951881197851816e-07, + "loss": 4.3486, + "step": 1367 + }, + { + "batch_num_effect_tokens": 7912, + "batch_num_samples": 18, + "batch_num_tokens": 8081, + "epoch": 1.77518, + "grad_norm": 0.13446044921875, + "learning_rate": 3.751995503991762e-07, + "loss": 4.4795, + "step": 1368 + }, + { + "batch_num_effect_tokens": 8090, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.77648, + "grad_norm": 0.1244499534368515, + "learning_rate": 3.709040498955102e-07, + "loss": 4.1152, + "step": 1369 + }, + { + "batch_num_effect_tokens": 7669, + "batch_num_samples": 32, + "batch_num_tokens": 7981, + "epoch": 1.77778, + "grad_norm": 0.13350388407707214, + "learning_rate": 3.666323325367344e-07, + "loss": 4.3936, + "step": 1370 + }, + { + "batch_num_effect_tokens": 7937, + "batch_num_samples": 16, + "batch_num_tokens": 8108, + "epoch": 1.77908, + "grad_norm": 0.1298922747373581, + "learning_rate": 3.623844202698701e-07, + "loss": 4.291, + "step": 1371 + }, + { + "batch_num_effect_tokens": 8017, + "batch_num_samples": 16, + "batch_num_tokens": 8166, + "epoch": 1.78037, + "grad_norm": 0.12909585237503052, + "learning_rate": 3.581603349196372e-07, + "loss": 4.3447, + "step": 1372 + }, + { + "batch_num_effect_tokens": 8046, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.78167, + "grad_norm": 0.1274796426296234, + "learning_rate": 3.5396009818833567e-07, + "loss": 4.3516, + "step": 1373 + }, + { + "batch_num_effect_tokens": 8005, + "batch_num_samples": 14, + "batch_num_tokens": 8184, + "epoch": 1.78297, + "grad_norm": 0.11704126745462418, + "learning_rate": 3.497837316557384e-07, + "loss": 4.2861, + "step": 1374 + }, + { + "batch_num_effect_tokens": 7885, + "batch_num_samples": 20, + "batch_num_tokens": 8192, + "epoch": 1.78427, + "grad_norm": 0.11343449354171753, + "learning_rate": 3.4563125677897936e-07, + "loss": 4.2314, + "step": 1375 + }, + { + "batch_num_effect_tokens": 8037, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.78556, + "grad_norm": 0.12782780826091766, + "learning_rate": 3.41502694892441e-07, + "loss": 4.335, + "step": 1376 + }, + { + "batch_num_effect_tokens": 8075, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.78686, + "grad_norm": 0.12656031548976898, + "learning_rate": 3.373980672076516e-07, + "loss": 4.3848, + "step": 1377 + }, + { + "batch_num_effect_tokens": 8057, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.78816, + "grad_norm": 0.12621362507343292, + "learning_rate": 3.333173948131663e-07, + "loss": 4.1943, + "step": 1378 + }, + { + "batch_num_effect_tokens": 8012, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.78946, + "grad_norm": 0.12268788367509842, + "learning_rate": 3.2926069867446673e-07, + "loss": 4.7314, + "step": 1379 + }, + { + "batch_num_effect_tokens": 7885, + "batch_num_samples": 23, + "batch_num_tokens": 8124, + "epoch": 1.79075, + "grad_norm": 0.12487363070249557, + "learning_rate": 3.252279996338492e-07, + "loss": 4.0225, + "step": 1380 + }, + { + "batch_num_effect_tokens": 8007, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.79205, + "grad_norm": 0.13526642322540283, + "learning_rate": 3.212193184103196e-07, + "loss": 4.2959, + "step": 1381 + }, + { + "batch_num_effect_tokens": 7931, + "batch_num_samples": 19, + "batch_num_tokens": 8146, + "epoch": 1.79335, + "grad_norm": 0.13416649401187897, + "learning_rate": 3.172346755994865e-07, + "loss": 4.4658, + "step": 1382 + }, + { + "batch_num_effect_tokens": 7978, + "batch_num_samples": 16, + "batch_num_tokens": 8133, + "epoch": 1.79465, + "grad_norm": 0.11787126213312149, + "learning_rate": 3.132740916734556e-07, + "loss": 4.0879, + "step": 1383 + }, + { + "batch_num_effect_tokens": 7981, + "batch_num_samples": 21, + "batch_num_tokens": 8192, + "epoch": 1.79594, + "grad_norm": 0.13434545695781708, + "learning_rate": 3.0933758698072023e-07, + "loss": 4.5879, + "step": 1384 + }, + { + "batch_num_effect_tokens": 8062, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.79724, + "grad_norm": 0.13060152530670166, + "learning_rate": 3.054251817460663e-07, + "loss": 4.4268, + "step": 1385 + }, + { + "batch_num_effect_tokens": 8028, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.79854, + "grad_norm": 0.12576042115688324, + "learning_rate": 3.015368960704584e-07, + "loss": 4.0508, + "step": 1386 + }, + { + "batch_num_effect_tokens": 8028, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.79854, + "eval_eval_loss": 0.5592343807220459, + "eval_eval_runtime": 115.3013, + "eval_eval_samples_per_second": 43.365, + "eval_eval_steps_per_second": 2.715, + "step": 1386 + }, + { + "batch_num_effect_tokens": 8063, + "batch_num_samples": 13, + "batch_num_tokens": 8189, + "epoch": 1.79984, + "grad_norm": 0.13307738304138184, + "learning_rate": 2.9767274993094285e-07, + "loss": 4.0938, + "step": 1387 + }, + { + "batch_num_effect_tokens": 7991, + "batch_num_samples": 15, + "batch_num_tokens": 8168, + "epoch": 1.80114, + "grad_norm": 0.12784725427627563, + "learning_rate": 2.938327631805421e-07, + "loss": 4.2842, + "step": 1388 + }, + { + "batch_num_effect_tokens": 8072, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.80243, + "grad_norm": 0.12473003566265106, + "learning_rate": 2.900169555481536e-07, + "loss": 4.373, + "step": 1389 + }, + { + "batch_num_effect_tokens": 8038, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.80373, + "grad_norm": 0.12554599344730377, + "learning_rate": 2.862253466384507e-07, + "loss": 4.249, + "step": 1390 + }, + { + "batch_num_effect_tokens": 8049, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.80503, + "grad_norm": 0.12785503268241882, + "learning_rate": 2.8245795593177637e-07, + "loss": 4.376, + "step": 1391 + }, + { + "batch_num_effect_tokens": 8068, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.80633, + "grad_norm": 0.13512638211250305, + "learning_rate": 2.787148027840486e-07, + "loss": 4.3232, + "step": 1392 + }, + { + "batch_num_effect_tokens": 8052, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.80762, + "grad_norm": 0.1361304223537445, + "learning_rate": 2.7499590642665773e-07, + "loss": 4.4668, + "step": 1393 + }, + { + "batch_num_effect_tokens": 8033, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.80892, + "grad_norm": 0.1287328451871872, + "learning_rate": 2.713012859663694e-07, + "loss": 4.2832, + "step": 1394 + }, + { + "batch_num_effect_tokens": 8080, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.81022, + "grad_norm": 0.13318751752376556, + "learning_rate": 2.6763096038522673e-07, + "loss": 4.5, + "step": 1395 + }, + { + "batch_num_effect_tokens": 7937, + "batch_num_samples": 14, + "batch_num_tokens": 8107, + "epoch": 1.81152, + "grad_norm": 0.12792308628559113, + "learning_rate": 2.6398494854045055e-07, + "loss": 4.0791, + "step": 1396 + }, + { + "batch_num_effect_tokens": 8067, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.81281, + "grad_norm": 0.13105538487434387, + "learning_rate": 2.6036326916434153e-07, + "loss": 4.4668, + "step": 1397 + }, + { + "batch_num_effect_tokens": 8045, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.81411, + "grad_norm": 0.1282477080821991, + "learning_rate": 2.5676594086419037e-07, + "loss": 4.124, + "step": 1398 + }, + { + "batch_num_effect_tokens": 7992, + "batch_num_samples": 14, + "batch_num_tokens": 8144, + "epoch": 1.81541, + "grad_norm": 0.13009123504161835, + "learning_rate": 2.531929821221768e-07, + "loss": 4.4424, + "step": 1399 + }, + { + "batch_num_effect_tokens": 8046, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.81671, + "grad_norm": 0.14188528060913086, + "learning_rate": 2.4964441129527337e-07, + "loss": 4.3438, + "step": 1400 + }, + { + "batch_num_effect_tokens": 8068, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.818, + "grad_norm": 0.13717369735240936, + "learning_rate": 2.4612024661515686e-07, + "loss": 4.4883, + "step": 1401 + }, + { + "batch_num_effect_tokens": 8076, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.8193, + "grad_norm": 0.13617396354675293, + "learning_rate": 2.426205061881082e-07, + "loss": 5.0674, + "step": 1402 + }, + { + "batch_num_effect_tokens": 7924, + "batch_num_samples": 25, + "batch_num_tokens": 8156, + "epoch": 1.8206, + "grad_norm": 0.1338592916727066, + "learning_rate": 2.3914520799492527e-07, + "loss": 4.501, + "step": 1403 + }, + { + "batch_num_effect_tokens": 8038, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.8219, + "grad_norm": 0.11929450929164886, + "learning_rate": 2.3569436989082705e-07, + "loss": 4.0938, + "step": 1404 + }, + { + "batch_num_effect_tokens": 7994, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.8232, + "grad_norm": 0.1299748420715332, + "learning_rate": 2.32268009605362e-07, + "loss": 4.1816, + "step": 1405 + }, + { + "batch_num_effect_tokens": 8020, + "batch_num_samples": 17, + "batch_num_tokens": 8187, + "epoch": 1.82449, + "grad_norm": 0.13757820427417755, + "learning_rate": 2.2886614474231794e-07, + "loss": 4.4873, + "step": 1406 + }, + { + "batch_num_effect_tokens": 8046, + "batch_num_samples": 14, + "batch_num_tokens": 8164, + "epoch": 1.82579, + "grad_norm": 0.1272607445716858, + "learning_rate": 2.2548879277963065e-07, + "loss": 4.7734, + "step": 1407 + }, + { + "batch_num_effect_tokens": 7982, + "batch_num_samples": 14, + "batch_num_tokens": 8142, + "epoch": 1.82709, + "grad_norm": 0.1257350742816925, + "learning_rate": 2.2213597106929608e-07, + "loss": 4.124, + "step": 1408 + }, + { + "batch_num_effect_tokens": 7921, + "batch_num_samples": 17, + "batch_num_tokens": 8087, + "epoch": 1.82839, + "grad_norm": 0.12541215121746063, + "learning_rate": 2.1880769683727986e-07, + "loss": 3.9297, + "step": 1409 + }, + { + "batch_num_effect_tokens": 7907, + "batch_num_samples": 21, + "batch_num_tokens": 8074, + "epoch": 1.82968, + "grad_norm": 0.11980535089969635, + "learning_rate": 2.1550398718342692e-07, + "loss": 3.9902, + "step": 1410 + }, + { + "batch_num_effect_tokens": 8036, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.83098, + "grad_norm": 0.13398943841457367, + "learning_rate": 2.1222485908137747e-07, + "loss": 3.9873, + "step": 1411 + }, + { + "batch_num_effect_tokens": 8078, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.83228, + "grad_norm": 0.12076178193092346, + "learning_rate": 2.0897032937847616e-07, + "loss": 4.5732, + "step": 1412 + }, + { + "batch_num_effect_tokens": 8048, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.83358, + "grad_norm": 0.12311708927154541, + "learning_rate": 2.0574041479568817e-07, + "loss": 4.1377, + "step": 1413 + }, + { + "batch_num_effect_tokens": 8063, + "batch_num_samples": 14, + "batch_num_tokens": 8182, + "epoch": 1.83487, + "grad_norm": 0.13371671736240387, + "learning_rate": 2.0253513192751374e-07, + "loss": 4.7109, + "step": 1414 + }, + { + "batch_num_effect_tokens": 7978, + "batch_num_samples": 22, + "batch_num_tokens": 8192, + "epoch": 1.83617, + "grad_norm": 0.12499105930328369, + "learning_rate": 1.9935449724189705e-07, + "loss": 4.2949, + "step": 1415 + }, + { + "batch_num_effect_tokens": 7971, + "batch_num_samples": 17, + "batch_num_tokens": 8157, + "epoch": 1.83747, + "grad_norm": 0.12765094637870789, + "learning_rate": 1.9619852708015142e-07, + "loss": 4.6221, + "step": 1416 + }, + { + "batch_num_effect_tokens": 7899, + "batch_num_samples": 18, + "batch_num_tokens": 8081, + "epoch": 1.83877, + "grad_norm": 0.12610025703907013, + "learning_rate": 1.9306723765686598e-07, + "loss": 4.502, + "step": 1417 + }, + { + "batch_num_effect_tokens": 7882, + "batch_num_samples": 27, + "batch_num_tokens": 8120, + "epoch": 1.84006, + "grad_norm": 0.14963027834892273, + "learning_rate": 1.8996064505982903e-07, + "loss": 4.8271, + "step": 1418 + }, + { + "batch_num_effect_tokens": 7996, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.84136, + "grad_norm": 0.12004819512367249, + "learning_rate": 1.8687876524993987e-07, + "loss": 4.5586, + "step": 1419 + }, + { + "batch_num_effect_tokens": 7978, + "batch_num_samples": 16, + "batch_num_tokens": 8191, + "epoch": 1.84266, + "grad_norm": 0.1406659036874771, + "learning_rate": 1.8382161406113208e-07, + "loss": 4.3506, + "step": 1420 + }, + { + "batch_num_effect_tokens": 8050, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.84396, + "grad_norm": 0.1230742409825325, + "learning_rate": 1.807892072002898e-07, + "loss": 4.8135, + "step": 1421 + }, + { + "batch_num_effect_tokens": 8047, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.84526, + "grad_norm": 0.1338808238506317, + "learning_rate": 1.7778156024716497e-07, + "loss": 4.292, + "step": 1422 + }, + { + "batch_num_effect_tokens": 8011, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.84655, + "grad_norm": 0.11327774822711945, + "learning_rate": 1.7479868865430072e-07, + "loss": 4.3232, + "step": 1423 + }, + { + "batch_num_effect_tokens": 8029, + "batch_num_samples": 14, + "batch_num_tokens": 8163, + "epoch": 1.84785, + "grad_norm": 0.13712045550346375, + "learning_rate": 1.7184060774695033e-07, + "loss": 4.5557, + "step": 1424 + }, + { + "batch_num_effect_tokens": 7921, + "batch_num_samples": 14, + "batch_num_tokens": 8079, + "epoch": 1.84915, + "grad_norm": 0.12802965939044952, + "learning_rate": 1.689073327229973e-07, + "loss": 3.8936, + "step": 1425 + }, + { + "batch_num_effect_tokens": 7866, + "batch_num_samples": 17, + "batch_num_tokens": 8028, + "epoch": 1.85045, + "grad_norm": 0.14027731120586395, + "learning_rate": 1.659988786528821e-07, + "loss": 4.3535, + "step": 1426 + }, + { + "batch_num_effect_tokens": 8047, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.85174, + "grad_norm": 0.1289585828781128, + "learning_rate": 1.6311526047951774e-07, + "loss": 4.3008, + "step": 1427 + }, + { + "batch_num_effect_tokens": 8002, + "batch_num_samples": 20, + "batch_num_tokens": 8192, + "epoch": 1.85304, + "grad_norm": 0.12682278454303741, + "learning_rate": 1.6025649301821877e-07, + "loss": 4.6836, + "step": 1428 + }, + { + "batch_num_effect_tokens": 8006, + "batch_num_samples": 14, + "batch_num_tokens": 8189, + "epoch": 1.85434, + "grad_norm": 0.1304275095462799, + "learning_rate": 1.5742259095662126e-07, + "loss": 4.1318, + "step": 1429 + }, + { + "batch_num_effect_tokens": 8063, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.85564, + "grad_norm": 0.1194472685456276, + "learning_rate": 1.5461356885461077e-07, + "loss": 4.4521, + "step": 1430 + }, + { + "batch_num_effect_tokens": 8049, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.85693, + "grad_norm": 0.12798550724983215, + "learning_rate": 1.5182944114424337e-07, + "loss": 4.1016, + "step": 1431 + }, + { + "batch_num_effect_tokens": 8059, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.85823, + "grad_norm": 0.12709662318229675, + "learning_rate": 1.4907022212967803e-07, + "loss": 4.1875, + "step": 1432 + }, + { + "batch_num_effect_tokens": 8031, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.85953, + "grad_norm": 0.11923374235630035, + "learning_rate": 1.463359259870939e-07, + "loss": 3.9727, + "step": 1433 + }, + { + "batch_num_effect_tokens": 8025, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.86083, + "grad_norm": 0.11731646209955215, + "learning_rate": 1.436265667646275e-07, + "loss": 4.1377, + "step": 1434 + }, + { + "batch_num_effect_tokens": 7870, + "batch_num_samples": 14, + "batch_num_tokens": 8076, + "epoch": 1.86212, + "grad_norm": 0.12564802169799805, + "learning_rate": 1.4094215838229176e-07, + "loss": 4.3174, + "step": 1435 + }, + { + "batch_num_effect_tokens": 7994, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.86342, + "grad_norm": 0.1299310177564621, + "learning_rate": 1.38282714631911e-07, + "loss": 4.2109, + "step": 1436 + }, + { + "batch_num_effect_tokens": 8068, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.86472, + "grad_norm": 0.1304285228252411, + "learning_rate": 1.3564824917704556e-07, + "loss": 4.2119, + "step": 1437 + }, + { + "batch_num_effect_tokens": 8061, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.86602, + "grad_norm": 0.1279984563589096, + "learning_rate": 1.3303877555292443e-07, + "loss": 4.4756, + "step": 1438 + }, + { + "batch_num_effect_tokens": 7857, + "batch_num_samples": 19, + "batch_num_tokens": 8086, + "epoch": 1.86732, + "grad_norm": 0.12695564329624176, + "learning_rate": 1.3045430716637608e-07, + "loss": 4.6621, + "step": 1439 + }, + { + "batch_num_effect_tokens": 7988, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.86861, + "grad_norm": 0.13486185669898987, + "learning_rate": 1.2789485729575612e-07, + "loss": 4.252, + "step": 1440 + }, + { + "batch_num_effect_tokens": 8022, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.86991, + "grad_norm": 0.11354608088731766, + "learning_rate": 1.253604390908819e-07, + "loss": 3.9609, + "step": 1441 + }, + { + "batch_num_effect_tokens": 8041, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.87121, + "grad_norm": 0.13337519764900208, + "learning_rate": 1.2285106557296479e-07, + "loss": 3.9141, + "step": 1442 + }, + { + "batch_num_effect_tokens": 8017, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.87251, + "grad_norm": 0.13385063409805298, + "learning_rate": 1.2036674963454232e-07, + "loss": 4.4512, + "step": 1443 + }, + { + "batch_num_effect_tokens": 8033, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.8738, + "grad_norm": 0.12899687886238098, + "learning_rate": 1.1790750403941231e-07, + "loss": 4.5166, + "step": 1444 + }, + { + "batch_num_effect_tokens": 8034, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.8751, + "grad_norm": 0.1389113813638687, + "learning_rate": 1.1547334142256895e-07, + "loss": 4.4268, + "step": 1445 + }, + { + "batch_num_effect_tokens": 8048, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.8764, + "grad_norm": 0.13730958104133606, + "learning_rate": 1.1306427429013222e-07, + "loss": 4.4258, + "step": 1446 + }, + { + "batch_num_effect_tokens": 8025, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.8777, + "grad_norm": 0.1304236501455307, + "learning_rate": 1.1068031501929366e-07, + "loss": 4.334, + "step": 1447 + }, + { + "batch_num_effect_tokens": 8015, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.87899, + "grad_norm": 0.12509453296661377, + "learning_rate": 1.0832147585824182e-07, + "loss": 3.9365, + "step": 1448 + }, + { + "batch_num_effect_tokens": 7965, + "batch_num_samples": 15, + "batch_num_tokens": 8140, + "epoch": 1.88029, + "grad_norm": 0.1231050118803978, + "learning_rate": 1.0598776892610685e-07, + "loss": 4.208, + "step": 1449 + }, + { + "batch_num_effect_tokens": 8034, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 1.88159, + "grad_norm": 0.12538541853427887, + "learning_rate": 1.0367920621289496e-07, + "loss": 4.3555, + "step": 1450 + }, + { + "batch_num_effect_tokens": 7939, + "batch_num_samples": 17, + "batch_num_tokens": 8087, + "epoch": 1.88289, + "grad_norm": 0.1338019073009491, + "learning_rate": 1.0139579957942736e-07, + "loss": 3.7793, + "step": 1451 + }, + { + "batch_num_effect_tokens": 7940, + "batch_num_samples": 18, + "batch_num_tokens": 8128, + "epoch": 1.88418, + "grad_norm": 0.13684529066085815, + "learning_rate": 9.913756075728088e-08, + "loss": 4.415, + "step": 1452 + }, + { + "batch_num_effect_tokens": 7970, + "batch_num_samples": 15, + "batch_num_tokens": 8120, + "epoch": 1.88548, + "grad_norm": 0.1501617431640625, + "learning_rate": 9.69045013487252e-08, + "loss": 4.5879, + "step": 1453 + }, + { + "batch_num_effect_tokens": 7784, + "batch_num_samples": 28, + "batch_num_tokens": 8032, + "epoch": 1.88678, + "grad_norm": 0.1272551417350769, + "learning_rate": 9.469663282666519e-08, + "loss": 4.4873, + "step": 1454 + }, + { + "batch_num_effect_tokens": 7994, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.88808, + "grad_norm": 0.13146163523197174, + "learning_rate": 9.251396653457978e-08, + "loss": 4.5342, + "step": 1455 + }, + { + "batch_num_effect_tokens": 8006, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.88938, + "grad_norm": 0.12587164342403412, + "learning_rate": 9.035651368646647e-08, + "loss": 4.1934, + "step": 1456 + }, + { + "batch_num_effect_tokens": 8075, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.89067, + "grad_norm": 0.12512744963169098, + "learning_rate": 8.822428536678251e-08, + "loss": 4.3193, + "step": 1457 + }, + { + "batch_num_effect_tokens": 7942, + "batch_num_samples": 14, + "batch_num_tokens": 8088, + "epoch": 1.89197, + "grad_norm": 0.13346132636070251, + "learning_rate": 8.611729253038658e-08, + "loss": 4.7422, + "step": 1458 + }, + { + "batch_num_effect_tokens": 8043, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.89327, + "grad_norm": 0.13034431636333466, + "learning_rate": 8.403554600248498e-08, + "loss": 4.293, + "step": 1459 + }, + { + "batch_num_effect_tokens": 7982, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.89457, + "grad_norm": 0.12772700190544128, + "learning_rate": 8.197905647857385e-08, + "loss": 4.1699, + "step": 1460 + }, + { + "batch_num_effect_tokens": 8013, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.89586, + "grad_norm": 0.12720619142055511, + "learning_rate": 7.994783452438592e-08, + "loss": 4.1992, + "step": 1461 + }, + { + "batch_num_effect_tokens": 8031, + "batch_num_samples": 14, + "batch_num_tokens": 8152, + "epoch": 1.89716, + "grad_norm": 0.12452313303947449, + "learning_rate": 7.794189057583335e-08, + "loss": 4.1182, + "step": 1462 + }, + { + "batch_num_effect_tokens": 7979, + "batch_num_samples": 19, + "batch_num_tokens": 8178, + "epoch": 1.89846, + "grad_norm": 0.1336333006620407, + "learning_rate": 7.59612349389599e-08, + "loss": 4.4717, + "step": 1463 + }, + { + "batch_num_effect_tokens": 7935, + "batch_num_samples": 14, + "batch_num_tokens": 8074, + "epoch": 1.89976, + "grad_norm": 0.134145587682724, + "learning_rate": 7.400587778988055e-08, + "loss": 4.3926, + "step": 1464 + }, + { + "batch_num_effect_tokens": 7802, + "batch_num_samples": 27, + "batch_num_tokens": 8039, + "epoch": 1.90105, + "grad_norm": 0.12880048155784607, + "learning_rate": 7.207582917473532e-08, + "loss": 4.9883, + "step": 1465 + }, + { + "batch_num_effect_tokens": 7901, + "batch_num_samples": 18, + "batch_num_tokens": 8076, + "epoch": 1.90235, + "grad_norm": 0.13317479193210602, + "learning_rate": 7.017109900963437e-08, + "loss": 3.8604, + "step": 1466 + }, + { + "batch_num_effect_tokens": 7849, + "batch_num_samples": 17, + "batch_num_tokens": 8023, + "epoch": 1.90365, + "grad_norm": 0.13144908845424652, + "learning_rate": 6.829169708060745e-08, + "loss": 4.3184, + "step": 1467 + }, + { + "batch_num_effect_tokens": 7972, + "batch_num_samples": 19, + "batch_num_tokens": 8146, + "epoch": 1.90495, + "grad_norm": 0.1278463453054428, + "learning_rate": 6.643763304355566e-08, + "loss": 4.335, + "step": 1468 + }, + { + "batch_num_effect_tokens": 8047, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.90624, + "grad_norm": 0.12541644275188446, + "learning_rate": 6.460891642419865e-08, + "loss": 4.5146, + "step": 1469 + }, + { + "batch_num_effect_tokens": 7816, + "batch_num_samples": 26, + "batch_num_tokens": 8032, + "epoch": 1.90754, + "grad_norm": 0.13894987106323242, + "learning_rate": 6.280555661802857e-08, + "loss": 4.9238, + "step": 1470 + }, + { + "batch_num_effect_tokens": 8050, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.90884, + "grad_norm": 0.13302992284297943, + "learning_rate": 6.102756289025957e-08, + "loss": 3.9033, + "step": 1471 + }, + { + "batch_num_effect_tokens": 7917, + "batch_num_samples": 22, + "batch_num_tokens": 8110, + "epoch": 1.91014, + "grad_norm": 0.12157834321260452, + "learning_rate": 5.92749443757823e-08, + "loss": 4.4756, + "step": 1472 + }, + { + "batch_num_effect_tokens": 8001, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.91144, + "grad_norm": 0.12770642340183258, + "learning_rate": 5.754771007911441e-08, + "loss": 4.5713, + "step": 1473 + }, + { + "batch_num_effect_tokens": 8016, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.91273, + "grad_norm": 0.12481579184532166, + "learning_rate": 5.584586887435739e-08, + "loss": 4.3623, + "step": 1474 + }, + { + "batch_num_effect_tokens": 7933, + "batch_num_samples": 18, + "batch_num_tokens": 8086, + "epoch": 1.91403, + "grad_norm": 0.12227432429790497, + "learning_rate": 5.4169429505148144e-08, + "loss": 4.3359, + "step": 1475 + }, + { + "batch_num_effect_tokens": 8028, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 1.91533, + "grad_norm": 0.12159877270460129, + "learning_rate": 5.251840058461577e-08, + "loss": 4.0273, + "step": 1476 + }, + { + "batch_num_effect_tokens": 7994, + "batch_num_samples": 14, + "batch_num_tokens": 8128, + "epoch": 1.91663, + "grad_norm": 0.1284315288066864, + "learning_rate": 5.089279059533658e-08, + "loss": 4.3184, + "step": 1477 + }, + { + "batch_num_effect_tokens": 8019, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.91792, + "grad_norm": 0.135638028383255, + "learning_rate": 4.92926078892908e-08, + "loss": 4.3555, + "step": 1478 + }, + { + "batch_num_effect_tokens": 8018, + "batch_num_samples": 14, + "batch_num_tokens": 8146, + "epoch": 1.91922, + "grad_norm": 0.14031581580638885, + "learning_rate": 4.7717860687819254e-08, + "loss": 4.165, + "step": 1479 + }, + { + "batch_num_effect_tokens": 8045, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.92052, + "grad_norm": 0.13316749036312103, + "learning_rate": 4.6168557081582854e-08, + "loss": 4.5801, + "step": 1480 + }, + { + "batch_num_effect_tokens": 8013, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.92182, + "grad_norm": 0.12714573740959167, + "learning_rate": 4.464470503051765e-08, + "loss": 3.9043, + "step": 1481 + }, + { + "batch_num_effect_tokens": 7984, + "batch_num_samples": 22, + "batch_num_tokens": 8192, + "epoch": 1.92311, + "grad_norm": 0.13386176526546478, + "learning_rate": 4.314631236379707e-08, + "loss": 4.0039, + "step": 1482 + }, + { + "batch_num_effect_tokens": 8034, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.92441, + "grad_norm": 0.13148388266563416, + "learning_rate": 4.167338677979027e-08, + "loss": 4.3115, + "step": 1483 + }, + { + "batch_num_effect_tokens": 8013, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.92571, + "grad_norm": 0.13195869326591492, + "learning_rate": 4.02259358460233e-08, + "loss": 4.5586, + "step": 1484 + }, + { + "batch_num_effect_tokens": 7984, + "batch_num_samples": 18, + "batch_num_tokens": 8156, + "epoch": 1.92701, + "grad_norm": 0.14345306158065796, + "learning_rate": 3.8803966999139686e-08, + "loss": 4.5898, + "step": 1485 + }, + { + "batch_num_effect_tokens": 8005, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.9283, + "grad_norm": 0.13300980627536774, + "learning_rate": 3.7407487544861565e-08, + "loss": 4.5, + "step": 1486 + }, + { + "batch_num_effect_tokens": 7975, + "batch_num_samples": 19, + "batch_num_tokens": 8131, + "epoch": 1.9296, + "grad_norm": 0.1455434411764145, + "learning_rate": 3.603650465795305e-08, + "loss": 4.6602, + "step": 1487 + }, + { + "batch_num_effect_tokens": 8035, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.9309, + "grad_norm": 0.12875740230083466, + "learning_rate": 3.4691025382184165e-08, + "loss": 4.6846, + "step": 1488 + }, + { + "batch_num_effect_tokens": 8058, + "batch_num_samples": 14, + "batch_num_tokens": 8177, + "epoch": 1.9322, + "grad_norm": 0.12339173257350922, + "learning_rate": 3.337105663029361e-08, + "loss": 4.2891, + "step": 1489 + }, + { + "batch_num_effect_tokens": 8038, + "batch_num_samples": 14, + "batch_num_tokens": 8152, + "epoch": 1.9335, + "grad_norm": 0.12480738013982773, + "learning_rate": 3.2076605183951614e-08, + "loss": 4.4785, + "step": 1490 + }, + { + "batch_num_effect_tokens": 8013, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.93479, + "grad_norm": 0.13804928958415985, + "learning_rate": 3.080767769372939e-08, + "loss": 4.7588, + "step": 1491 + }, + { + "batch_num_effect_tokens": 8016, + "batch_num_samples": 20, + "batch_num_tokens": 8192, + "epoch": 1.93609, + "grad_norm": 0.14307373762130737, + "learning_rate": 2.9564280679060255e-08, + "loss": 4.8125, + "step": 1492 + }, + { + "batch_num_effect_tokens": 8027, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.93739, + "grad_norm": 0.12223173677921295, + "learning_rate": 2.834642052820913e-08, + "loss": 4.1455, + "step": 1493 + }, + { + "batch_num_effect_tokens": 8027, + "batch_num_samples": 14, + "batch_num_tokens": 8168, + "epoch": 1.93869, + "grad_norm": 0.13116061687469482, + "learning_rate": 2.715410349823977e-08, + "loss": 3.9893, + "step": 1494 + }, + { + "batch_num_effect_tokens": 8014, + "batch_num_samples": 20, + "batch_num_tokens": 8191, + "epoch": 1.93998, + "grad_norm": 0.1298002302646637, + "learning_rate": 2.59873357149798e-08, + "loss": 4.1182, + "step": 1495 + }, + { + "batch_num_effect_tokens": 8043, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.94128, + "grad_norm": 0.12619996070861816, + "learning_rate": 2.4846123172992953e-08, + "loss": 4.2354, + "step": 1496 + }, + { + "batch_num_effect_tokens": 7887, + "batch_num_samples": 23, + "batch_num_tokens": 8105, + "epoch": 1.94258, + "grad_norm": 0.12969645857810974, + "learning_rate": 2.3730471735545213e-08, + "loss": 4.1484, + "step": 1497 + }, + { + "batch_num_effect_tokens": 8024, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.94388, + "grad_norm": 0.12418043613433838, + "learning_rate": 2.264038713457706e-08, + "loss": 4.3389, + "step": 1498 + }, + { + "batch_num_effect_tokens": 7918, + "batch_num_samples": 21, + "batch_num_tokens": 8125, + "epoch": 1.94517, + "grad_norm": 0.122939832508564, + "learning_rate": 2.157587497067182e-08, + "loss": 3.9756, + "step": 1499 + }, + { + "batch_num_effect_tokens": 8040, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.94647, + "grad_norm": 0.12949828803539276, + "learning_rate": 2.0536940713028475e-08, + "loss": 4.4385, + "step": 1500 + }, + { + "batch_num_effect_tokens": 8024, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.94777, + "grad_norm": 0.11821475625038147, + "learning_rate": 1.9523589699433355e-08, + "loss": 4.0107, + "step": 1501 + }, + { + "batch_num_effect_tokens": 8072, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.94907, + "grad_norm": 0.1258937120437622, + "learning_rate": 1.8535827136232365e-08, + "loss": 4.6211, + "step": 1502 + }, + { + "batch_num_effect_tokens": 7988, + "batch_num_samples": 17, + "batch_num_tokens": 8166, + "epoch": 1.95036, + "grad_norm": 0.1313227117061615, + "learning_rate": 1.7573658098304357e-08, + "loss": 4.418, + "step": 1503 + }, + { + "batch_num_effect_tokens": 8036, + "batch_num_samples": 13, + "batch_num_tokens": 8189, + "epoch": 1.95166, + "grad_norm": 0.12842047214508057, + "learning_rate": 1.6637087529033925e-08, + "loss": 4.1191, + "step": 1504 + }, + { + "batch_num_effect_tokens": 7982, + "batch_num_samples": 20, + "batch_num_tokens": 8192, + "epoch": 1.95296, + "grad_norm": 0.1315157413482666, + "learning_rate": 1.5726120240288632e-08, + "loss": 4.5, + "step": 1505 + }, + { + "batch_num_effect_tokens": 8009, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.95426, + "grad_norm": 0.1246904730796814, + "learning_rate": 1.4840760912391283e-08, + "loss": 4.2578, + "step": 1506 + }, + { + "batch_num_effect_tokens": 8031, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.95556, + "grad_norm": 0.125263512134552, + "learning_rate": 1.3981014094099354e-08, + "loss": 4.165, + "step": 1507 + }, + { + "batch_num_effect_tokens": 7982, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.95685, + "grad_norm": 0.12067491561174393, + "learning_rate": 1.314688420257726e-08, + "loss": 4.4043, + "step": 1508 + }, + { + "batch_num_effect_tokens": 8055, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.95815, + "grad_norm": 0.12801378965377808, + "learning_rate": 1.2338375523378022e-08, + "loss": 4.1875, + "step": 1509 + }, + { + "batch_num_effect_tokens": 8025, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.95945, + "grad_norm": 0.14885924756526947, + "learning_rate": 1.1555492210418295e-08, + "loss": 4.7236, + "step": 1510 + }, + { + "batch_num_effect_tokens": 8043, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.96075, + "grad_norm": 0.14124317467212677, + "learning_rate": 1.0798238285957274e-08, + "loss": 4.7051, + "step": 1511 + }, + { + "batch_num_effect_tokens": 8041, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.96204, + "grad_norm": 0.11698544025421143, + "learning_rate": 1.006661764057837e-08, + "loss": 4.5605, + "step": 1512 + }, + { + "batch_num_effect_tokens": 7831, + "batch_num_samples": 31, + "batch_num_tokens": 8080, + "epoch": 1.96334, + "grad_norm": 0.1408989280462265, + "learning_rate": 9.36063403316534e-09, + "loss": 4.9922, + "step": 1513 + }, + { + "batch_num_effect_tokens": 7983, + "batch_num_samples": 20, + "batch_num_tokens": 8192, + "epoch": 1.96464, + "grad_norm": 0.12344139814376831, + "learning_rate": 8.680291090888416e-09, + "loss": 4.3076, + "step": 1514 + }, + { + "batch_num_effect_tokens": 8007, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.96594, + "grad_norm": 0.1171666607260704, + "learning_rate": 8.02559230917932e-09, + "loss": 4.1758, + "step": 1515 + }, + { + "batch_num_effect_tokens": 7977, + "batch_num_samples": 22, + "batch_num_tokens": 8192, + "epoch": 1.96723, + "grad_norm": 0.12780164182186127, + "learning_rate": 7.3965410517179426e-09, + "loss": 4.2393, + "step": 1516 + }, + { + "batch_num_effect_tokens": 7998, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.96853, + "grad_norm": 0.12426239252090454, + "learning_rate": 6.793140550414024e-09, + "loss": 3.9375, + "step": 1517 + }, + { + "batch_num_effect_tokens": 8071, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.96983, + "grad_norm": 0.13120593130588531, + "learning_rate": 6.215393905388278e-09, + "loss": 4.624, + "step": 1518 + }, + { + "batch_num_effect_tokens": 8038, + "batch_num_samples": 15, + "batch_num_tokens": 8167, + "epoch": 1.97113, + "grad_norm": 0.12499607354402542, + "learning_rate": 5.6633040849601865e-09, + "loss": 4.4346, + "step": 1519 + }, + { + "batch_num_effect_tokens": 8036, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.97242, + "grad_norm": 0.12693850696086884, + "learning_rate": 5.1368739256296704e-09, + "loss": 4.0693, + "step": 1520 + }, + { + "batch_num_effect_tokens": 8030, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.97372, + "grad_norm": 0.13233652710914612, + "learning_rate": 4.636106132064888e-09, + "loss": 4.4072, + "step": 1521 + }, + { + "batch_num_effect_tokens": 8037, + "batch_num_samples": 15, + "batch_num_tokens": 8192, + "epoch": 1.97502, + "grad_norm": 0.11534731835126877, + "learning_rate": 4.161003277085574e-09, + "loss": 3.8291, + "step": 1522 + }, + { + "batch_num_effect_tokens": 8010, + "batch_num_samples": 17, + "batch_num_tokens": 8192, + "epoch": 1.97632, + "grad_norm": 0.13354112207889557, + "learning_rate": 3.711567801652494e-09, + "loss": 4.1992, + "step": 1523 + }, + { + "batch_num_effect_tokens": 7938, + "batch_num_samples": 21, + "batch_num_tokens": 8142, + "epoch": 1.97762, + "grad_norm": 0.13712389767169952, + "learning_rate": 3.2878020148530143e-09, + "loss": 4.1602, + "step": 1524 + }, + { + "batch_num_effect_tokens": 7910, + "batch_num_samples": 18, + "batch_num_tokens": 8090, + "epoch": 1.97891, + "grad_norm": 0.12684784829616547, + "learning_rate": 2.8897080938916634e-09, + "loss": 4.5029, + "step": 1525 + }, + { + "batch_num_effect_tokens": 8033, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.98021, + "grad_norm": 0.13627074658870697, + "learning_rate": 2.5172880840745873e-09, + "loss": 4.7266, + "step": 1526 + }, + { + "batch_num_effect_tokens": 7935, + "batch_num_samples": 16, + "batch_num_tokens": 8122, + "epoch": 1.98151, + "grad_norm": 0.13185909390449524, + "learning_rate": 2.1705438988040005e-09, + "loss": 4.0479, + "step": 1527 + }, + { + "batch_num_effect_tokens": 8065, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.98281, + "grad_norm": 0.12739528715610504, + "learning_rate": 1.849477319564863e-09, + "loss": 4.4717, + "step": 1528 + }, + { + "batch_num_effect_tokens": 8036, + "batch_num_samples": 14, + "batch_num_tokens": 8191, + "epoch": 1.9841, + "grad_norm": 0.12966281175613403, + "learning_rate": 1.5540899959187727e-09, + "loss": 4.1494, + "step": 1529 + }, + { + "batch_num_effect_tokens": 7937, + "batch_num_samples": 17, + "batch_num_tokens": 8139, + "epoch": 1.9854, + "grad_norm": 0.1280137449502945, + "learning_rate": 1.2843834454911997e-09, + "loss": 4.377, + "step": 1530 + }, + { + "batch_num_effect_tokens": 7968, + "batch_num_samples": 24, + "batch_num_tokens": 8192, + "epoch": 1.9867, + "grad_norm": 0.13096709549427032, + "learning_rate": 1.040359053967599e-09, + "loss": 4.127, + "step": 1531 + }, + { + "batch_num_effect_tokens": 7984, + "batch_num_samples": 14, + "batch_num_tokens": 8128, + "epoch": 1.988, + "grad_norm": 0.12772879004478455, + "learning_rate": 8.220180750850848e-10, + "loss": 4.3184, + "step": 1532 + }, + { + "batch_num_effect_tokens": 8040, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.98929, + "grad_norm": 0.1290796995162964, + "learning_rate": 6.293616306246586e-10, + "loss": 4.8291, + "step": 1533 + }, + { + "batch_num_effect_tokens": 7978, + "batch_num_samples": 14, + "batch_num_tokens": 8146, + "epoch": 1.99059, + "grad_norm": 0.12888257205486298, + "learning_rate": 4.623907104084335e-10, + "loss": 4.251, + "step": 1534 + }, + { + "batch_num_effect_tokens": 8063, + "batch_num_samples": 14, + "batch_num_tokens": 8192, + "epoch": 1.99189, + "grad_norm": 0.13058820366859436, + "learning_rate": 3.211061722901976e-10, + "loss": 4.3525, + "step": 1535 + }, + { + "batch_num_effect_tokens": 7978, + "batch_num_samples": 17, + "batch_num_tokens": 8139, + "epoch": 1.99319, + "grad_norm": 0.12405020743608475, + "learning_rate": 2.0550874215541362e-10, + "loss": 4.4014, + "step": 1536 + }, + { + "batch_num_effect_tokens": 8015, + "batch_num_samples": 13, + "batch_num_tokens": 8192, + "epoch": 1.99448, + "grad_norm": 0.1255870759487152, + "learning_rate": 1.1559901391511308e-10, + "loss": 4.2578, + "step": 1537 + }, + { + "batch_num_effect_tokens": 8054, + "batch_num_samples": 16, + "batch_num_tokens": 8192, + "epoch": 1.99578, + "grad_norm": 0.12560389935970306, + "learning_rate": 5.137744950312051e-11, + "loss": 4.3672, + "step": 1538 + }, + { + "batch_num_effect_tokens": 8007, + "batch_num_samples": 18, + "batch_num_tokens": 8184, + "epoch": 1.99708, + "grad_norm": 0.1317049115896225, + "learning_rate": 1.2844378873833053e-11, + "loss": 4.2676, + "step": 1539 + }, + { + "batch_num_effect_tokens": 8033, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 1.99838, + "grad_norm": 0.12680941820144653, + "learning_rate": 0.0, + "loss": 4.6318, + "step": 1540 + }, + { + "batch_num_effect_tokens": 8033, + "batch_num_samples": 18, + "batch_num_tokens": 8192, + "epoch": 1.99838, + "eval_eval_loss": 0.5588093996047974, + "eval_eval_runtime": 115.4397, + "eval_eval_samples_per_second": 43.313, + "eval_eval_steps_per_second": 2.711, + "step": 1540 + } + ], + "logging_steps": 1.0, + "max_steps": 1540, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}