{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9983779399837793, "eval_steps": 154, "global_step": 1540, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "batch_num_effect_tokens": 8022, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.0013, "grad_norm": 0.31844592094421387, "learning_rate": 6.493506493506495e-08, "loss": 6.3418, "step": 1 }, { "batch_num_effect_tokens": 8055, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.0026, "grad_norm": 0.26869267225265503, "learning_rate": 1.298701298701299e-07, "loss": 6.3926, "step": 2 }, { "batch_num_effect_tokens": 8015, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 0.00389, "grad_norm": 0.2948790490627289, "learning_rate": 1.948051948051948e-07, "loss": 6.2676, "step": 3 }, { "batch_num_effect_tokens": 8021, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 0.00519, "grad_norm": 0.3147197663784027, "learning_rate": 2.597402597402598e-07, "loss": 6.4707, "step": 4 }, { "batch_num_effect_tokens": 8064, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.00649, "grad_norm": 0.3039565086364746, "learning_rate": 3.2467532467532465e-07, "loss": 7.0977, "step": 5 }, { "batch_num_effect_tokens": 8046, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.00779, "grad_norm": 0.28105345368385315, "learning_rate": 3.896103896103896e-07, "loss": 6.8184, "step": 6 }, { "batch_num_effect_tokens": 8037, "batch_num_samples": 16, "batch_num_tokens": 8189, "epoch": 0.00908, "grad_norm": 0.2680772840976715, "learning_rate": 4.5454545454545457e-07, "loss": 6.5508, "step": 7 }, { "batch_num_effect_tokens": 7887, "batch_num_samples": 23, "batch_num_tokens": 8105, "epoch": 0.01038, "grad_norm": 0.3874300718307495, "learning_rate": 5.194805194805196e-07, "loss": 6.1758, "step": 8 }, { "batch_num_effect_tokens": 8027, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.01168, "grad_norm": 0.29857560992240906, "learning_rate": 5.844155844155845e-07, "loss": 6.7598, "step": 9 }, { "batch_num_effect_tokens": 7892, "batch_num_samples": 19, "batch_num_tokens": 8119, "epoch": 0.01298, "grad_norm": 0.3115284740924835, "learning_rate": 6.493506493506493e-07, "loss": 6.3301, "step": 10 }, { "batch_num_effect_tokens": 7943, "batch_num_samples": 23, "batch_num_tokens": 8192, "epoch": 0.01427, "grad_norm": 0.38096851110458374, "learning_rate": 7.142857142857143e-07, "loss": 6.2168, "step": 11 }, { "batch_num_effect_tokens": 8074, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.01557, "grad_norm": 0.24052490293979645, "learning_rate": 7.792207792207792e-07, "loss": 6.0957, "step": 12 }, { "batch_num_effect_tokens": 7983, "batch_num_samples": 14, "batch_num_tokens": 8146, "epoch": 0.01687, "grad_norm": 0.2261405885219574, "learning_rate": 8.441558441558442e-07, "loss": 5.8008, "step": 13 }, { "batch_num_effect_tokens": 8038, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.01817, "grad_norm": 0.25014105439186096, "learning_rate": 9.090909090909091e-07, "loss": 6.459, "step": 14 }, { "batch_num_effect_tokens": 8011, "batch_num_samples": 14, "batch_num_tokens": 8177, "epoch": 0.01946, "grad_norm": 0.2601844370365143, "learning_rate": 9.740259740259742e-07, "loss": 6.2109, "step": 15 }, { "batch_num_effect_tokens": 8040, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.02076, "grad_norm": 0.2764773368835449, "learning_rate": 1.0389610389610392e-06, "loss": 6.0508, "step": 16 }, { "batch_num_effect_tokens": 7991, "batch_num_samples": 14, "batch_num_tokens": 8128, "epoch": 0.02206, "grad_norm": 0.23783260583877563, "learning_rate": 1.103896103896104e-06, "loss": 6.3242, "step": 17 }, { "batch_num_effect_tokens": 8013, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.02336, "grad_norm": 0.21356581151485443, "learning_rate": 1.168831168831169e-06, "loss": 6.1641, "step": 18 }, { "batch_num_effect_tokens": 7974, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.02466, "grad_norm": 0.2238253653049469, "learning_rate": 1.2337662337662338e-06, "loss": 5.8047, "step": 19 }, { "batch_num_effect_tokens": 8033, "batch_num_samples": 14, "batch_num_tokens": 8170, "epoch": 0.02595, "grad_norm": 0.21132396161556244, "learning_rate": 1.2987012987012986e-06, "loss": 5.7354, "step": 20 }, { "batch_num_effect_tokens": 7901, "batch_num_samples": 22, "batch_num_tokens": 8128, "epoch": 0.02725, "grad_norm": 0.2627267837524414, "learning_rate": 1.3636363636363636e-06, "loss": 6.2051, "step": 21 }, { "batch_num_effect_tokens": 7997, "batch_num_samples": 17, "batch_num_tokens": 8165, "epoch": 0.02855, "grad_norm": 0.23212876915931702, "learning_rate": 1.4285714285714286e-06, "loss": 6.1699, "step": 22 }, { "batch_num_effect_tokens": 8045, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.02985, "grad_norm": 0.20811595022678375, "learning_rate": 1.4935064935064936e-06, "loss": 5.6621, "step": 23 }, { "batch_num_effect_tokens": 8066, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.03114, "grad_norm": 0.205363467335701, "learning_rate": 1.5584415584415584e-06, "loss": 5.7109, "step": 24 }, { "batch_num_effect_tokens": 7906, "batch_num_samples": 20, "batch_num_tokens": 8091, "epoch": 0.03244, "grad_norm": 0.248266339302063, "learning_rate": 1.6233766233766235e-06, "loss": 5.8008, "step": 25 }, { "batch_num_effect_tokens": 8072, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.03374, "grad_norm": 0.22567051649093628, "learning_rate": 1.6883116883116885e-06, "loss": 5.668, "step": 26 }, { "batch_num_effect_tokens": 8014, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.03504, "grad_norm": 0.21363011002540588, "learning_rate": 1.7532467532467535e-06, "loss": 5.9082, "step": 27 }, { "batch_num_effect_tokens": 7982, "batch_num_samples": 14, "batch_num_tokens": 8142, "epoch": 0.03633, "grad_norm": 0.17314466834068298, "learning_rate": 1.8181818181818183e-06, "loss": 5.8809, "step": 28 }, { "batch_num_effect_tokens": 8052, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.03763, "grad_norm": 0.1672496497631073, "learning_rate": 1.8831168831168833e-06, "loss": 5.2207, "step": 29 }, { "batch_num_effect_tokens": 7987, "batch_num_samples": 16, "batch_num_tokens": 8155, "epoch": 0.03893, "grad_norm": 0.17449834942817688, "learning_rate": 1.9480519480519483e-06, "loss": 5.7324, "step": 30 }, { "batch_num_effect_tokens": 8046, "batch_num_samples": 14, "batch_num_tokens": 8191, "epoch": 0.04023, "grad_norm": 0.1806560754776001, "learning_rate": 2.012987012987013e-06, "loss": 5.2949, "step": 31 }, { "batch_num_effect_tokens": 7998, "batch_num_samples": 20, "batch_num_tokens": 8192, "epoch": 0.04152, "grad_norm": 0.1780816614627838, "learning_rate": 2.0779220779220784e-06, "loss": 5.6357, "step": 32 }, { "batch_num_effect_tokens": 7915, "batch_num_samples": 18, "batch_num_tokens": 8090, "epoch": 0.04282, "grad_norm": 0.1659129112958908, "learning_rate": 2.1428571428571427e-06, "loss": 5.2832, "step": 33 }, { "batch_num_effect_tokens": 8050, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.04412, "grad_norm": 0.18337897956371307, "learning_rate": 2.207792207792208e-06, "loss": 5.4307, "step": 34 }, { "batch_num_effect_tokens": 7927, "batch_num_samples": 25, "batch_num_tokens": 8154, "epoch": 0.04542, "grad_norm": 0.20109063386917114, "learning_rate": 2.2727272727272728e-06, "loss": 5.876, "step": 35 }, { "batch_num_effect_tokens": 7992, "batch_num_samples": 17, "batch_num_tokens": 8176, "epoch": 0.04672, "grad_norm": 0.15896430611610413, "learning_rate": 2.337662337662338e-06, "loss": 5.251, "step": 36 }, { "batch_num_effect_tokens": 8000, "batch_num_samples": 14, "batch_num_tokens": 8182, "epoch": 0.04801, "grad_norm": 0.15734650194644928, "learning_rate": 2.402597402597403e-06, "loss": 4.8984, "step": 37 }, { "batch_num_effect_tokens": 8032, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.04931, "grad_norm": 0.17897239327430725, "learning_rate": 2.4675324675324676e-06, "loss": 5.3438, "step": 38 }, { "batch_num_effect_tokens": 8032, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.05061, "grad_norm": 0.1797408014535904, "learning_rate": 2.5324675324675324e-06, "loss": 5.3223, "step": 39 }, { "batch_num_effect_tokens": 7991, "batch_num_samples": 15, "batch_num_tokens": 8156, "epoch": 0.05191, "grad_norm": 0.16180779039859772, "learning_rate": 2.597402597402597e-06, "loss": 5.0254, "step": 40 }, { "batch_num_effect_tokens": 7865, "batch_num_samples": 17, "batch_num_tokens": 8029, "epoch": 0.0532, "grad_norm": 0.1604710966348648, "learning_rate": 2.6623376623376624e-06, "loss": 5.3867, "step": 41 }, { "batch_num_effect_tokens": 7967, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.0545, "grad_norm": 0.15748171508312225, "learning_rate": 2.7272727272727272e-06, "loss": 4.9678, "step": 42 }, { "batch_num_effect_tokens": 8047, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.0558, "grad_norm": 0.163799449801445, "learning_rate": 2.7922077922077925e-06, "loss": 5.1172, "step": 43 }, { "batch_num_effect_tokens": 7880, "batch_num_samples": 26, "batch_num_tokens": 8102, "epoch": 0.0571, "grad_norm": 0.1661587953567505, "learning_rate": 2.8571428571428573e-06, "loss": 4.8545, "step": 44 }, { "batch_num_effect_tokens": 7964, "batch_num_samples": 23, "batch_num_tokens": 8143, "epoch": 0.05839, "grad_norm": 0.1683725267648697, "learning_rate": 2.922077922077922e-06, "loss": 5.2148, "step": 45 }, { "batch_num_effect_tokens": 8010, "batch_num_samples": 15, "batch_num_tokens": 8164, "epoch": 0.05969, "grad_norm": 0.15301677584648132, "learning_rate": 2.9870129870129873e-06, "loss": 5.0928, "step": 46 }, { "batch_num_effect_tokens": 7915, "batch_num_samples": 17, "batch_num_tokens": 8087, "epoch": 0.06099, "grad_norm": 0.14993946254253387, "learning_rate": 3.051948051948052e-06, "loss": 5.0557, "step": 47 }, { "batch_num_effect_tokens": 7994, "batch_num_samples": 14, "batch_num_tokens": 8128, "epoch": 0.06229, "grad_norm": 0.15920305252075195, "learning_rate": 3.116883116883117e-06, "loss": 5.209, "step": 48 }, { "batch_num_effect_tokens": 8016, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 0.06358, "grad_norm": 0.1530810445547104, "learning_rate": 3.181818181818182e-06, "loss": 5.3164, "step": 49 }, { "batch_num_effect_tokens": 7979, "batch_num_samples": 14, "batch_num_tokens": 8121, "epoch": 0.06488, "grad_norm": 0.14603291451931, "learning_rate": 3.246753246753247e-06, "loss": 5.1729, "step": 50 }, { "batch_num_effect_tokens": 8043, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.06618, "grad_norm": 0.14708639681339264, "learning_rate": 3.311688311688312e-06, "loss": 4.7744, "step": 51 }, { "batch_num_effect_tokens": 8067, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.06748, "grad_norm": 0.14484994113445282, "learning_rate": 3.376623376623377e-06, "loss": 4.9619, "step": 52 }, { "batch_num_effect_tokens": 7904, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 0.06878, "grad_norm": 0.1400662362575531, "learning_rate": 3.4415584415584418e-06, "loss": 4.7002, "step": 53 }, { "batch_num_effect_tokens": 8053, "batch_num_samples": 14, "batch_num_tokens": 8191, "epoch": 0.07007, "grad_norm": 0.14303331077098846, "learning_rate": 3.506493506493507e-06, "loss": 4.6895, "step": 54 }, { "batch_num_effect_tokens": 8059, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.07137, "grad_norm": 0.14038386940956116, "learning_rate": 3.5714285714285718e-06, "loss": 5.3848, "step": 55 }, { "batch_num_effect_tokens": 7905, "batch_num_samples": 17, "batch_num_tokens": 8075, "epoch": 0.07267, "grad_norm": 0.1472426950931549, "learning_rate": 3.6363636363636366e-06, "loss": 4.9775, "step": 56 }, { "batch_num_effect_tokens": 8021, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 0.07397, "grad_norm": 0.15343016386032104, "learning_rate": 3.701298701298702e-06, "loss": 5.2734, "step": 57 }, { "batch_num_effect_tokens": 8057, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.07526, "grad_norm": 0.1423100382089615, "learning_rate": 3.7662337662337666e-06, "loss": 5.3379, "step": 58 }, { "batch_num_effect_tokens": 8052, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.07656, "grad_norm": 0.14533978700637817, "learning_rate": 3.831168831168831e-06, "loss": 4.8135, "step": 59 }, { "batch_num_effect_tokens": 8007, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.07786, "grad_norm": 0.14846143126487732, "learning_rate": 3.896103896103897e-06, "loss": 4.9902, "step": 60 }, { "batch_num_effect_tokens": 7928, "batch_num_samples": 17, "batch_num_tokens": 8113, "epoch": 0.07916, "grad_norm": 0.1590896099805832, "learning_rate": 3.961038961038962e-06, "loss": 5.0049, "step": 61 }, { "batch_num_effect_tokens": 8023, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.08045, "grad_norm": 0.1540524810552597, "learning_rate": 4.025974025974026e-06, "loss": 4.8965, "step": 62 }, { "batch_num_effect_tokens": 8055, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.08175, "grad_norm": 0.13902273774147034, "learning_rate": 4.0909090909090915e-06, "loss": 4.9961, "step": 63 }, { "batch_num_effect_tokens": 7977, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.08305, "grad_norm": 0.14425402879714966, "learning_rate": 4.155844155844157e-06, "loss": 5.0908, "step": 64 }, { "batch_num_effect_tokens": 8034, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.08435, "grad_norm": 0.15884140133857727, "learning_rate": 4.220779220779221e-06, "loss": 5.2842, "step": 65 }, { "batch_num_effect_tokens": 8002, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.08564, "grad_norm": 0.14033250510692596, "learning_rate": 4.2857142857142855e-06, "loss": 5.0371, "step": 66 }, { "batch_num_effect_tokens": 7999, "batch_num_samples": 19, "batch_num_tokens": 8192, "epoch": 0.08694, "grad_norm": 0.15526610612869263, "learning_rate": 4.350649350649351e-06, "loss": 4.6426, "step": 67 }, { "batch_num_effect_tokens": 7806, "batch_num_samples": 17, "batch_num_tokens": 7994, "epoch": 0.08824, "grad_norm": 0.1523507982492447, "learning_rate": 4.415584415584416e-06, "loss": 5.1045, "step": 68 }, { "batch_num_effect_tokens": 8065, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.08954, "grad_norm": 0.15373018383979797, "learning_rate": 4.48051948051948e-06, "loss": 5.3213, "step": 69 }, { "batch_num_effect_tokens": 7953, "batch_num_samples": 14, "batch_num_tokens": 8076, "epoch": 0.09084, "grad_norm": 0.16291280090808868, "learning_rate": 4.5454545454545455e-06, "loss": 4.9736, "step": 70 }, { "batch_num_effect_tokens": 8030, "batch_num_samples": 19, "batch_num_tokens": 8192, "epoch": 0.09213, "grad_norm": 0.135183647274971, "learning_rate": 4.610389610389611e-06, "loss": 5.1475, "step": 71 }, { "batch_num_effect_tokens": 8022, "batch_num_samples": 17, "batch_num_tokens": 8181, "epoch": 0.09343, "grad_norm": 0.13779762387275696, "learning_rate": 4.675324675324676e-06, "loss": 4.999, "step": 72 }, { "batch_num_effect_tokens": 7968, "batch_num_samples": 14, "batch_num_tokens": 8152, "epoch": 0.09473, "grad_norm": 0.1394745260477066, "learning_rate": 4.74025974025974e-06, "loss": 4.9697, "step": 73 }, { "batch_num_effect_tokens": 7997, "batch_num_samples": 14, "batch_num_tokens": 8149, "epoch": 0.09603, "grad_norm": 0.14722809195518494, "learning_rate": 4.805194805194806e-06, "loss": 5.4229, "step": 74 }, { "batch_num_effect_tokens": 8051, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.09732, "grad_norm": 0.1443023979663849, "learning_rate": 4.870129870129871e-06, "loss": 4.8057, "step": 75 }, { "batch_num_effect_tokens": 7998, "batch_num_samples": 17, "batch_num_tokens": 8171, "epoch": 0.09862, "grad_norm": 0.1304166615009308, "learning_rate": 4.935064935064935e-06, "loss": 5.1406, "step": 76 }, { "batch_num_effect_tokens": 8004, "batch_num_samples": 19, "batch_num_tokens": 8176, "epoch": 0.09992, "grad_norm": 0.14469240605831146, "learning_rate": 5e-06, "loss": 4.9717, "step": 77 }, { "batch_num_effect_tokens": 7992, "batch_num_samples": 19, "batch_num_tokens": 8192, "epoch": 0.10122, "grad_norm": 0.14036931097507477, "learning_rate": 5.064935064935065e-06, "loss": 4.6973, "step": 78 }, { "batch_num_effect_tokens": 7970, "batch_num_samples": 20, "batch_num_tokens": 8176, "epoch": 0.10251, "grad_norm": 0.1444014012813568, "learning_rate": 5.12987012987013e-06, "loss": 4.8516, "step": 79 }, { "batch_num_effect_tokens": 7878, "batch_num_samples": 27, "batch_num_tokens": 8114, "epoch": 0.10381, "grad_norm": 0.15033836662769318, "learning_rate": 5.194805194805194e-06, "loss": 4.6299, "step": 80 }, { "batch_num_effect_tokens": 8023, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 0.10511, "grad_norm": 0.1547461301088333, "learning_rate": 5.2597402597402605e-06, "loss": 5.0889, "step": 81 }, { "batch_num_effect_tokens": 8025, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.10641, "grad_norm": 0.14421503245830536, "learning_rate": 5.324675324675325e-06, "loss": 4.9414, "step": 82 }, { "batch_num_effect_tokens": 8060, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.1077, "grad_norm": 0.13420124351978302, "learning_rate": 5.38961038961039e-06, "loss": 4.8047, "step": 83 }, { "batch_num_effect_tokens": 8065, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.109, "grad_norm": 0.1364876627922058, "learning_rate": 5.4545454545454545e-06, "loss": 5.0352, "step": 84 }, { "batch_num_effect_tokens": 8031, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.1103, "grad_norm": 0.13382357358932495, "learning_rate": 5.5194805194805205e-06, "loss": 4.8984, "step": 85 }, { "batch_num_effect_tokens": 7784, "batch_num_samples": 28, "batch_num_tokens": 8032, "epoch": 0.1116, "grad_norm": 0.1488446742296219, "learning_rate": 5.584415584415585e-06, "loss": 5.2764, "step": 86 }, { "batch_num_effect_tokens": 8057, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.1129, "grad_norm": 0.15475524961948395, "learning_rate": 5.64935064935065e-06, "loss": 5.1055, "step": 87 }, { "batch_num_effect_tokens": 8030, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.11419, "grad_norm": 0.1377478688955307, "learning_rate": 5.7142857142857145e-06, "loss": 4.835, "step": 88 }, { "batch_num_effect_tokens": 7766, "batch_num_samples": 26, "batch_num_tokens": 8030, "epoch": 0.11549, "grad_norm": 0.1429220736026764, "learning_rate": 5.77922077922078e-06, "loss": 4.8867, "step": 89 }, { "batch_num_effect_tokens": 7998, "batch_num_samples": 16, "batch_num_tokens": 8144, "epoch": 0.11679, "grad_norm": 0.14639155566692352, "learning_rate": 5.844155844155844e-06, "loss": 5.1807, "step": 90 }, { "batch_num_effect_tokens": 8019, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 0.11809, "grad_norm": 0.1477179378271103, "learning_rate": 5.90909090909091e-06, "loss": 4.6084, "step": 91 }, { "batch_num_effect_tokens": 7969, "batch_num_samples": 14, "batch_num_tokens": 8136, "epoch": 0.11938, "grad_norm": 0.1457316130399704, "learning_rate": 5.9740259740259746e-06, "loss": 5.335, "step": 92 }, { "batch_num_effect_tokens": 7885, "batch_num_samples": 20, "batch_num_tokens": 8192, "epoch": 0.12068, "grad_norm": 0.1372513622045517, "learning_rate": 6.03896103896104e-06, "loss": 4.6279, "step": 93 }, { "batch_num_effect_tokens": 7953, "batch_num_samples": 14, "batch_num_tokens": 8107, "epoch": 0.12198, "grad_norm": 0.14941509068012238, "learning_rate": 6.103896103896104e-06, "loss": 4.7344, "step": 94 }, { "batch_num_effect_tokens": 8040, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.12328, "grad_norm": 0.14506636559963226, "learning_rate": 6.168831168831169e-06, "loss": 5.2168, "step": 95 }, { "batch_num_effect_tokens": 8001, "batch_num_samples": 19, "batch_num_tokens": 8192, "epoch": 0.12457, "grad_norm": 0.15786777436733246, "learning_rate": 6.233766233766234e-06, "loss": 5.1094, "step": 96 }, { "batch_num_effect_tokens": 8032, "batch_num_samples": 14, "batch_num_tokens": 8149, "epoch": 0.12587, "grad_norm": 0.14335079491138458, "learning_rate": 6.2987012987013e-06, "loss": 4.7344, "step": 97 }, { "batch_num_effect_tokens": 8031, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.12717, "grad_norm": 0.1454756110906601, "learning_rate": 6.363636363636364e-06, "loss": 4.9893, "step": 98 }, { "batch_num_effect_tokens": 8021, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 0.12847, "grad_norm": 0.13855211436748505, "learning_rate": 6.4285714285714295e-06, "loss": 4.6758, "step": 99 }, { "batch_num_effect_tokens": 8052, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.12976, "grad_norm": 0.1334666609764099, "learning_rate": 6.493506493506494e-06, "loss": 5.1504, "step": 100 }, { "batch_num_effect_tokens": 8061, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.13106, "grad_norm": 0.14907817542552948, "learning_rate": 6.55844155844156e-06, "loss": 4.8926, "step": 101 }, { "batch_num_effect_tokens": 8050, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.13236, "grad_norm": 0.134397953748703, "learning_rate": 6.623376623376624e-06, "loss": 4.7324, "step": 102 }, { "batch_num_effect_tokens": 7971, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.13366, "grad_norm": 0.13027189671993256, "learning_rate": 6.688311688311689e-06, "loss": 4.7451, "step": 103 }, { "batch_num_effect_tokens": 8018, "batch_num_samples": 14, "batch_num_tokens": 8144, "epoch": 0.13496, "grad_norm": 0.1361909955739975, "learning_rate": 6.753246753246754e-06, "loss": 5.1758, "step": 104 }, { "batch_num_effect_tokens": 8053, "batch_num_samples": 14, "batch_num_tokens": 8190, "epoch": 0.13625, "grad_norm": 0.14071358740329742, "learning_rate": 6.818181818181818e-06, "loss": 4.8379, "step": 105 }, { "batch_num_effect_tokens": 8037, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.13755, "grad_norm": 0.14475062489509583, "learning_rate": 6.8831168831168835e-06, "loss": 4.9668, "step": 106 }, { "batch_num_effect_tokens": 8044, "batch_num_samples": 14, "batch_num_tokens": 8189, "epoch": 0.13885, "grad_norm": 0.1445104032754898, "learning_rate": 6.948051948051948e-06, "loss": 4.8447, "step": 107 }, { "batch_num_effect_tokens": 8018, "batch_num_samples": 20, "batch_num_tokens": 8192, "epoch": 0.14015, "grad_norm": 0.14156053960323334, "learning_rate": 7.012987012987014e-06, "loss": 5.1055, "step": 108 }, { "batch_num_effect_tokens": 8036, "batch_num_samples": 18, "batch_num_tokens": 8191, "epoch": 0.14144, "grad_norm": 0.1461244523525238, "learning_rate": 7.077922077922078e-06, "loss": 5.0107, "step": 109 }, { "batch_num_effect_tokens": 8061, "batch_num_samples": 14, "batch_num_tokens": 8184, "epoch": 0.14274, "grad_norm": 0.14059850573539734, "learning_rate": 7.1428571428571436e-06, "loss": 5.1133, "step": 110 }, { "batch_num_effect_tokens": 7994, "batch_num_samples": 15, "batch_num_tokens": 8140, "epoch": 0.14404, "grad_norm": 0.14305299520492554, "learning_rate": 7.207792207792208e-06, "loss": 4.6953, "step": 111 }, { "batch_num_effect_tokens": 7981, "batch_num_samples": 14, "batch_num_tokens": 8144, "epoch": 0.14534, "grad_norm": 0.1341462880373001, "learning_rate": 7.272727272727273e-06, "loss": 4.8037, "step": 112 }, { "batch_num_effect_tokens": 7913, "batch_num_samples": 15, "batch_num_tokens": 8098, "epoch": 0.14663, "grad_norm": 0.14198477566242218, "learning_rate": 7.3376623376623375e-06, "loss": 4.8252, "step": 113 }, { "batch_num_effect_tokens": 8045, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.14793, "grad_norm": 0.1437690556049347, "learning_rate": 7.402597402597404e-06, "loss": 5.0996, "step": 114 }, { "batch_num_effect_tokens": 7999, "batch_num_samples": 15, "batch_num_tokens": 8152, "epoch": 0.14923, "grad_norm": 0.13746197521686554, "learning_rate": 7.467532467532468e-06, "loss": 4.8242, "step": 115 }, { "batch_num_effect_tokens": 8036, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.15053, "grad_norm": 0.1328958421945572, "learning_rate": 7.532467532467533e-06, "loss": 5.4316, "step": 116 }, { "batch_num_effect_tokens": 8047, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.15182, "grad_norm": 0.14948073029518127, "learning_rate": 7.597402597402598e-06, "loss": 5.1084, "step": 117 }, { "batch_num_effect_tokens": 7937, "batch_num_samples": 17, "batch_num_tokens": 8139, "epoch": 0.15312, "grad_norm": 0.14273382723331451, "learning_rate": 7.662337662337663e-06, "loss": 5.0645, "step": 118 }, { "batch_num_effect_tokens": 8062, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.15442, "grad_norm": 0.13271141052246094, "learning_rate": 7.727272727272727e-06, "loss": 4.7783, "step": 119 }, { "batch_num_effect_tokens": 8065, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.15572, "grad_norm": 0.14579838514328003, "learning_rate": 7.792207792207793e-06, "loss": 5.0703, "step": 120 }, { "batch_num_effect_tokens": 7983, "batch_num_samples": 21, "batch_num_tokens": 8192, "epoch": 0.15702, "grad_norm": 0.1460658609867096, "learning_rate": 7.857142857142858e-06, "loss": 4.7344, "step": 121 }, { "batch_num_effect_tokens": 8014, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.15831, "grad_norm": 0.1394878476858139, "learning_rate": 7.922077922077924e-06, "loss": 5.0918, "step": 122 }, { "batch_num_effect_tokens": 8001, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.15961, "grad_norm": 0.1350386142730713, "learning_rate": 7.987012987012988e-06, "loss": 5.291, "step": 123 }, { "batch_num_effect_tokens": 7925, "batch_num_samples": 15, "batch_num_tokens": 8094, "epoch": 0.16091, "grad_norm": 0.1350218653678894, "learning_rate": 8.051948051948052e-06, "loss": 5.0156, "step": 124 }, { "batch_num_effect_tokens": 8070, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.16221, "grad_norm": 0.1472414880990982, "learning_rate": 8.116883116883117e-06, "loss": 4.998, "step": 125 }, { "batch_num_effect_tokens": 8035, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.1635, "grad_norm": 0.14445587992668152, "learning_rate": 8.181818181818183e-06, "loss": 5.0547, "step": 126 }, { "batch_num_effect_tokens": 8012, "batch_num_samples": 14, "batch_num_tokens": 8177, "epoch": 0.1648, "grad_norm": 0.14494451880455017, "learning_rate": 8.246753246753247e-06, "loss": 4.6123, "step": 127 }, { "batch_num_effect_tokens": 8032, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.1661, "grad_norm": 0.14212098717689514, "learning_rate": 8.311688311688313e-06, "loss": 4.6992, "step": 128 }, { "batch_num_effect_tokens": 8029, "batch_num_samples": 19, "batch_num_tokens": 8192, "epoch": 0.1674, "grad_norm": 0.1644868403673172, "learning_rate": 8.376623376623378e-06, "loss": 5.2734, "step": 129 }, { "batch_num_effect_tokens": 7962, "batch_num_samples": 15, "batch_num_tokens": 8158, "epoch": 0.16869, "grad_norm": 0.13638760149478912, "learning_rate": 8.441558441558442e-06, "loss": 4.7773, "step": 130 }, { "batch_num_effect_tokens": 7940, "batch_num_samples": 14, "batch_num_tokens": 8101, "epoch": 0.16999, "grad_norm": 0.14820170402526855, "learning_rate": 8.506493506493507e-06, "loss": 4.8438, "step": 131 }, { "batch_num_effect_tokens": 8035, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.17129, "grad_norm": 0.1421528309583664, "learning_rate": 8.571428571428571e-06, "loss": 5.0254, "step": 132 }, { "batch_num_effect_tokens": 7936, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.17259, "grad_norm": 0.13923408091068268, "learning_rate": 8.636363636363637e-06, "loss": 4.7812, "step": 133 }, { "batch_num_effect_tokens": 7989, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.17388, "grad_norm": 0.15315015614032745, "learning_rate": 8.701298701298701e-06, "loss": 4.834, "step": 134 }, { "batch_num_effect_tokens": 7984, "batch_num_samples": 21, "batch_num_tokens": 8192, "epoch": 0.17518, "grad_norm": 0.1589311957359314, "learning_rate": 8.766233766233767e-06, "loss": 4.9219, "step": 135 }, { "batch_num_effect_tokens": 8006, "batch_num_samples": 14, "batch_num_tokens": 8160, "epoch": 0.17648, "grad_norm": 0.14829252660274506, "learning_rate": 8.831168831168832e-06, "loss": 4.9746, "step": 136 }, { "batch_num_effect_tokens": 7960, "batch_num_samples": 22, "batch_num_tokens": 8192, "epoch": 0.17778, "grad_norm": 0.14853787422180176, "learning_rate": 8.896103896103896e-06, "loss": 5.0918, "step": 137 }, { "batch_num_effect_tokens": 8031, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.17908, "grad_norm": 0.13866592943668365, "learning_rate": 8.96103896103896e-06, "loss": 4.8525, "step": 138 }, { "batch_num_effect_tokens": 8002, "batch_num_samples": 17, "batch_num_tokens": 8179, "epoch": 0.18037, "grad_norm": 0.1399109810590744, "learning_rate": 9.025974025974027e-06, "loss": 4.9492, "step": 139 }, { "batch_num_effect_tokens": 8039, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.18167, "grad_norm": 0.14804702997207642, "learning_rate": 9.090909090909091e-06, "loss": 4.8105, "step": 140 }, { "batch_num_effect_tokens": 8007, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 0.18297, "grad_norm": 0.13978514075279236, "learning_rate": 9.155844155844157e-06, "loss": 4.6553, "step": 141 }, { "batch_num_effect_tokens": 7962, "batch_num_samples": 14, "batch_num_tokens": 8119, "epoch": 0.18427, "grad_norm": 0.1482185572385788, "learning_rate": 9.220779220779221e-06, "loss": 4.8184, "step": 142 }, { "batch_num_effect_tokens": 7988, "batch_num_samples": 22, "batch_num_tokens": 8189, "epoch": 0.18556, "grad_norm": 0.1415347456932068, "learning_rate": 9.285714285714288e-06, "loss": 4.8008, "step": 143 }, { "batch_num_effect_tokens": 7989, "batch_num_samples": 23, "batch_num_tokens": 8192, "epoch": 0.18686, "grad_norm": 0.15562182664871216, "learning_rate": 9.350649350649352e-06, "loss": 5.4648, "step": 144 }, { "batch_num_effect_tokens": 8026, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.18816, "grad_norm": 0.14722460508346558, "learning_rate": 9.415584415584416e-06, "loss": 4.7441, "step": 145 }, { "batch_num_effect_tokens": 7974, "batch_num_samples": 15, "batch_num_tokens": 8146, "epoch": 0.18946, "grad_norm": 0.13538284599781036, "learning_rate": 9.48051948051948e-06, "loss": 4.5645, "step": 146 }, { "batch_num_effect_tokens": 7967, "batch_num_samples": 19, "batch_num_tokens": 8176, "epoch": 0.19075, "grad_norm": 0.1557544320821762, "learning_rate": 9.545454545454547e-06, "loss": 4.9785, "step": 147 }, { "batch_num_effect_tokens": 8031, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.19205, "grad_norm": 0.15202026069164276, "learning_rate": 9.610389610389611e-06, "loss": 4.8027, "step": 148 }, { "batch_num_effect_tokens": 8032, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.19335, "grad_norm": 0.14528276026248932, "learning_rate": 9.675324675324677e-06, "loss": 4.7441, "step": 149 }, { "batch_num_effect_tokens": 8048, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.19465, "grad_norm": 0.15638510882854462, "learning_rate": 9.740259740259742e-06, "loss": 4.9463, "step": 150 }, { "batch_num_effect_tokens": 8070, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.19594, "grad_norm": 0.14898745715618134, "learning_rate": 9.805194805194806e-06, "loss": 4.9248, "step": 151 }, { "batch_num_effect_tokens": 8027, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.19724, "grad_norm": 0.13369755446910858, "learning_rate": 9.87012987012987e-06, "loss": 4.9199, "step": 152 }, { "batch_num_effect_tokens": 8070, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.19854, "grad_norm": 0.14535720646381378, "learning_rate": 9.935064935064936e-06, "loss": 4.9004, "step": 153 }, { "batch_num_effect_tokens": 8061, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.19984, "grad_norm": 0.1457190215587616, "learning_rate": 1e-05, "loss": 5.2598, "step": 154 }, { "batch_num_effect_tokens": 8061, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.19984, "eval_eval_loss": 0.6176656484603882, "eval_eval_runtime": 115.0266, "eval_eval_samples_per_second": 43.468, "eval_eval_steps_per_second": 2.721, "step": 154 }, { "batch_num_effect_tokens": 7946, "batch_num_samples": 15, "batch_num_tokens": 8126, "epoch": 0.20114, "grad_norm": 0.14440381526947021, "learning_rate": 9.999987155621127e-06, "loss": 5.0732, "step": 155 }, { "batch_num_effect_tokens": 7991, "batch_num_samples": 18, "batch_num_tokens": 8170, "epoch": 0.20243, "grad_norm": 0.14386983215808868, "learning_rate": 9.999948622550497e-06, "loss": 4.9297, "step": 156 }, { "batch_num_effect_tokens": 7997, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.20373, "grad_norm": 0.14097128808498383, "learning_rate": 9.999884400986087e-06, "loss": 5.0283, "step": 157 }, { "batch_num_effect_tokens": 8033, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.20503, "grad_norm": 0.14975176751613617, "learning_rate": 9.999794491257846e-06, "loss": 4.6611, "step": 158 }, { "batch_num_effect_tokens": 8053, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.20633, "grad_norm": 0.13971513509750366, "learning_rate": 9.999678893827711e-06, "loss": 4.9727, "step": 159 }, { "batch_num_effect_tokens": 8046, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.20762, "grad_norm": 0.12843751907348633, "learning_rate": 9.999537609289592e-06, "loss": 4.9268, "step": 160 }, { "batch_num_effect_tokens": 8050, "batch_num_samples": 14, "batch_num_tokens": 8184, "epoch": 0.20892, "grad_norm": 0.13939853012561798, "learning_rate": 9.999370638369377e-06, "loss": 4.709, "step": 161 }, { "batch_num_effect_tokens": 8040, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.21022, "grad_norm": 0.14971594512462616, "learning_rate": 9.999177981924915e-06, "loss": 4.7676, "step": 162 }, { "batch_num_effect_tokens": 7874, "batch_num_samples": 21, "batch_num_tokens": 8091, "epoch": 0.21152, "grad_norm": 0.14274117350578308, "learning_rate": 9.998959640946033e-06, "loss": 4.8418, "step": 163 }, { "batch_num_effect_tokens": 8054, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.21281, "grad_norm": 0.14646124839782715, "learning_rate": 9.998715616554509e-06, "loss": 4.6113, "step": 164 }, { "batch_num_effect_tokens": 8051, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.21411, "grad_norm": 0.13392624258995056, "learning_rate": 9.998445910004082e-06, "loss": 4.6914, "step": 165 }, { "batch_num_effect_tokens": 7927, "batch_num_samples": 20, "batch_num_tokens": 8144, "epoch": 0.21541, "grad_norm": 0.14332374930381775, "learning_rate": 9.998150522680437e-06, "loss": 5.0967, "step": 166 }, { "batch_num_effect_tokens": 8034, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.21671, "grad_norm": 0.15254738926887512, "learning_rate": 9.997829456101196e-06, "loss": 4.7773, "step": 167 }, { "batch_num_effect_tokens": 7932, "batch_num_samples": 15, "batch_num_tokens": 8092, "epoch": 0.218, "grad_norm": 0.14264167845249176, "learning_rate": 9.997482711915926e-06, "loss": 4.7695, "step": 168 }, { "batch_num_effect_tokens": 8044, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.2193, "grad_norm": 0.1465001106262207, "learning_rate": 9.997110291906109e-06, "loss": 4.7012, "step": 169 }, { "batch_num_effect_tokens": 7952, "batch_num_samples": 15, "batch_num_tokens": 8110, "epoch": 0.2206, "grad_norm": 0.14456188678741455, "learning_rate": 9.996712197985147e-06, "loss": 4.7178, "step": 170 }, { "batch_num_effect_tokens": 7862, "batch_num_samples": 26, "batch_num_tokens": 8106, "epoch": 0.2219, "grad_norm": 0.1434151530265808, "learning_rate": 9.99628843219835e-06, "loss": 5.1191, "step": 171 }, { "batch_num_effect_tokens": 8044, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.2232, "grad_norm": 0.14123517274856567, "learning_rate": 9.995838996722916e-06, "loss": 4.9141, "step": 172 }, { "batch_num_effect_tokens": 8063, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.22449, "grad_norm": 0.1379905641078949, "learning_rate": 9.995363893867935e-06, "loss": 4.8369, "step": 173 }, { "batch_num_effect_tokens": 8079, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.22579, "grad_norm": 0.13364200294017792, "learning_rate": 9.994863126074371e-06, "loss": 5.0586, "step": 174 }, { "batch_num_effect_tokens": 8015, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.22709, "grad_norm": 0.13714580237865448, "learning_rate": 9.994336695915041e-06, "loss": 4.9736, "step": 175 }, { "batch_num_effect_tokens": 8030, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.22839, "grad_norm": 0.13936279714107513, "learning_rate": 9.993784606094612e-06, "loss": 5.0059, "step": 176 }, { "batch_num_effect_tokens": 8007, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 0.22968, "grad_norm": 0.1406031847000122, "learning_rate": 9.993206859449587e-06, "loss": 4.8916, "step": 177 }, { "batch_num_effect_tokens": 7963, "batch_num_samples": 14, "batch_num_tokens": 8128, "epoch": 0.23098, "grad_norm": 0.1409338116645813, "learning_rate": 9.992603458948282e-06, "loss": 5.2207, "step": 178 }, { "batch_num_effect_tokens": 8050, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.23228, "grad_norm": 0.16277164220809937, "learning_rate": 9.99197440769082e-06, "loss": 4.9805, "step": 179 }, { "batch_num_effect_tokens": 8032, "batch_num_samples": 19, "batch_num_tokens": 8191, "epoch": 0.23358, "grad_norm": 0.1462642252445221, "learning_rate": 9.991319708909113e-06, "loss": 4.6992, "step": 180 }, { "batch_num_effect_tokens": 8037, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.23487, "grad_norm": 0.14574594795703888, "learning_rate": 9.990639365966835e-06, "loss": 5.0459, "step": 181 }, { "batch_num_effect_tokens": 7967, "batch_num_samples": 14, "batch_num_tokens": 8112, "epoch": 0.23617, "grad_norm": 0.14711208641529083, "learning_rate": 9.989933382359423e-06, "loss": 5.1992, "step": 182 }, { "batch_num_effect_tokens": 7931, "batch_num_samples": 15, "batch_num_tokens": 8086, "epoch": 0.23747, "grad_norm": 0.1444520354270935, "learning_rate": 9.989201761714043e-06, "loss": 5.2109, "step": 183 }, { "batch_num_effect_tokens": 8040, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.23877, "grad_norm": 0.13259008526802063, "learning_rate": 9.988444507789584e-06, "loss": 4.9014, "step": 184 }, { "batch_num_effect_tokens": 8014, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.24006, "grad_norm": 0.1382356435060501, "learning_rate": 9.987661624476624e-06, "loss": 4.876, "step": 185 }, { "batch_num_effect_tokens": 8060, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.24136, "grad_norm": 0.13415783643722534, "learning_rate": 9.986853115797424e-06, "loss": 4.7227, "step": 186 }, { "batch_num_effect_tokens": 8009, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.24266, "grad_norm": 0.13997548818588257, "learning_rate": 9.986018985905901e-06, "loss": 5.1807, "step": 187 }, { "batch_num_effect_tokens": 7933, "batch_num_samples": 14, "batch_num_tokens": 8086, "epoch": 0.24396, "grad_norm": 0.14471964538097382, "learning_rate": 9.98515923908761e-06, "loss": 4.7754, "step": 188 }, { "batch_num_effect_tokens": 8058, "batch_num_samples": 14, "batch_num_tokens": 8191, "epoch": 0.24526, "grad_norm": 0.15303927659988403, "learning_rate": 9.984273879759713e-06, "loss": 4.8975, "step": 189 }, { "batch_num_effect_tokens": 7912, "batch_num_samples": 14, "batch_num_tokens": 8101, "epoch": 0.24655, "grad_norm": 0.13627475500106812, "learning_rate": 9.983362912470967e-06, "loss": 4.9385, "step": 190 }, { "batch_num_effect_tokens": 7985, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 0.24785, "grad_norm": 0.1349112093448639, "learning_rate": 9.982426341901697e-06, "loss": 5.1465, "step": 191 }, { "batch_num_effect_tokens": 8033, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.24915, "grad_norm": 0.13799749314785004, "learning_rate": 9.981464172863769e-06, "loss": 5.0508, "step": 192 }, { "batch_num_effect_tokens": 7856, "batch_num_samples": 17, "batch_num_tokens": 8040, "epoch": 0.25045, "grad_norm": 0.14057736098766327, "learning_rate": 9.980476410300567e-06, "loss": 4.9756, "step": 193 }, { "batch_num_effect_tokens": 7958, "batch_num_samples": 21, "batch_num_tokens": 8176, "epoch": 0.25174, "grad_norm": 0.14482036232948303, "learning_rate": 9.979463059286972e-06, "loss": 4.8223, "step": 194 }, { "batch_num_effect_tokens": 7984, "batch_num_samples": 21, "batch_num_tokens": 8176, "epoch": 0.25304, "grad_norm": 0.1419980227947235, "learning_rate": 9.978424125029329e-06, "loss": 4.875, "step": 195 }, { "batch_num_effect_tokens": 8000, "batch_num_samples": 19, "batch_num_tokens": 8192, "epoch": 0.25434, "grad_norm": 0.1462613195180893, "learning_rate": 9.977359612865424e-06, "loss": 4.9316, "step": 196 }, { "batch_num_effect_tokens": 8061, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.25564, "grad_norm": 0.13534201681613922, "learning_rate": 9.976269528264456e-06, "loss": 4.7822, "step": 197 }, { "batch_num_effect_tokens": 8047, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.25693, "grad_norm": 0.13299311697483063, "learning_rate": 9.975153876827008e-06, "loss": 4.9941, "step": 198 }, { "batch_num_effect_tokens": 7979, "batch_num_samples": 15, "batch_num_tokens": 8116, "epoch": 0.25823, "grad_norm": 0.13966509699821472, "learning_rate": 9.97401266428502e-06, "loss": 5.2793, "step": 199 }, { "batch_num_effect_tokens": 8054, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.25953, "grad_norm": 0.14264234900474548, "learning_rate": 9.972845896501762e-06, "loss": 4.8848, "step": 200 }, { "batch_num_effect_tokens": 8000, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.26083, "grad_norm": 0.1481478363275528, "learning_rate": 9.971653579471791e-06, "loss": 5.0264, "step": 201 }, { "batch_num_effect_tokens": 7957, "batch_num_samples": 14, "batch_num_tokens": 8119, "epoch": 0.26212, "grad_norm": 0.14331968128681183, "learning_rate": 9.97043571932094e-06, "loss": 4.7031, "step": 202 }, { "batch_num_effect_tokens": 7925, "batch_num_samples": 17, "batch_num_tokens": 8123, "epoch": 0.26342, "grad_norm": 0.15035419166088104, "learning_rate": 9.969192322306271e-06, "loss": 4.6455, "step": 203 }, { "batch_num_effect_tokens": 8041, "batch_num_samples": 16, "batch_num_tokens": 8188, "epoch": 0.26472, "grad_norm": 0.1354558765888214, "learning_rate": 9.96792339481605e-06, "loss": 4.6113, "step": 204 }, { "batch_num_effect_tokens": 7979, "batch_num_samples": 14, "batch_num_tokens": 8119, "epoch": 0.26602, "grad_norm": 0.14034751057624817, "learning_rate": 9.966628943369708e-06, "loss": 5.1328, "step": 205 }, { "batch_num_effect_tokens": 8073, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.26732, "grad_norm": 0.13551993668079376, "learning_rate": 9.965308974617816e-06, "loss": 5.0332, "step": 206 }, { "batch_num_effect_tokens": 7989, "batch_num_samples": 17, "batch_num_tokens": 8171, "epoch": 0.26861, "grad_norm": 0.14565272629261017, "learning_rate": 9.963963495342049e-06, "loss": 4.8906, "step": 207 }, { "batch_num_effect_tokens": 8052, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.26991, "grad_norm": 0.1394621729850769, "learning_rate": 9.96259251245514e-06, "loss": 4.835, "step": 208 }, { "batch_num_effect_tokens": 8034, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.27121, "grad_norm": 0.13364410400390625, "learning_rate": 9.961196033000862e-06, "loss": 4.9238, "step": 209 }, { "batch_num_effect_tokens": 7996, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.27251, "grad_norm": 0.13798867166042328, "learning_rate": 9.959774064153977e-06, "loss": 4.9531, "step": 210 }, { "batch_num_effect_tokens": 8048, "batch_num_samples": 14, "batch_num_tokens": 8185, "epoch": 0.2738, "grad_norm": 0.13652759790420532, "learning_rate": 9.95832661322021e-06, "loss": 4.7188, "step": 211 }, { "batch_num_effect_tokens": 8050, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.2751, "grad_norm": 0.14416304230690002, "learning_rate": 9.956853687636203e-06, "loss": 5.21, "step": 212 }, { "batch_num_effect_tokens": 8009, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.2764, "grad_norm": 0.15409010648727417, "learning_rate": 9.955355294969483e-06, "loss": 4.8691, "step": 213 }, { "batch_num_effect_tokens": 7921, "batch_num_samples": 18, "batch_num_tokens": 8085, "epoch": 0.2777, "grad_norm": 0.13492988049983978, "learning_rate": 9.953831442918418e-06, "loss": 4.9668, "step": 214 }, { "batch_num_effect_tokens": 8033, "batch_num_samples": 16, "batch_num_tokens": 8190, "epoch": 0.27899, "grad_norm": 0.13695424795150757, "learning_rate": 9.952282139312182e-06, "loss": 4.749, "step": 215 }, { "batch_num_effect_tokens": 7980, "batch_num_samples": 16, "batch_num_tokens": 8155, "epoch": 0.28029, "grad_norm": 0.14915227890014648, "learning_rate": 9.95070739211071e-06, "loss": 5.1152, "step": 216 }, { "batch_num_effect_tokens": 7958, "batch_num_samples": 14, "batch_num_tokens": 8114, "epoch": 0.28159, "grad_norm": 0.13804367184638977, "learning_rate": 9.949107209404664e-06, "loss": 4.7793, "step": 217 }, { "batch_num_effect_tokens": 8072, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.28289, "grad_norm": 0.1392471045255661, "learning_rate": 9.947481599415385e-06, "loss": 5.0469, "step": 218 }, { "batch_num_effect_tokens": 8003, "batch_num_samples": 19, "batch_num_tokens": 8192, "epoch": 0.28418, "grad_norm": 0.14468149840831757, "learning_rate": 9.945830570494851e-06, "loss": 4.8887, "step": 219 }, { "batch_num_effect_tokens": 8032, "batch_num_samples": 14, "batch_num_tokens": 8184, "epoch": 0.28548, "grad_norm": 0.1316855400800705, "learning_rate": 9.944154131125643e-06, "loss": 4.9443, "step": 220 }, { "batch_num_effect_tokens": 8045, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.28678, "grad_norm": 0.15862536430358887, "learning_rate": 9.942452289920886e-06, "loss": 4.8623, "step": 221 }, { "batch_num_effect_tokens": 7995, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.28808, "grad_norm": 0.13182777166366577, "learning_rate": 9.940725055624218e-06, "loss": 5.0381, "step": 222 }, { "batch_num_effect_tokens": 8025, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.28938, "grad_norm": 0.1340331733226776, "learning_rate": 9.938972437109742e-06, "loss": 4.7461, "step": 223 }, { "batch_num_effect_tokens": 7929, "batch_num_samples": 15, "batch_num_tokens": 8086, "epoch": 0.29067, "grad_norm": 0.1430303007364273, "learning_rate": 9.937194443381972e-06, "loss": 4.8057, "step": 224 }, { "batch_num_effect_tokens": 8016, "batch_num_samples": 18, "batch_num_tokens": 8191, "epoch": 0.29197, "grad_norm": 0.14733895659446716, "learning_rate": 9.935391083575803e-06, "loss": 4.7725, "step": 225 }, { "batch_num_effect_tokens": 8056, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.29327, "grad_norm": 0.13456237316131592, "learning_rate": 9.933562366956445e-06, "loss": 4.5049, "step": 226 }, { "batch_num_effect_tokens": 7997, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.29457, "grad_norm": 0.13491599261760712, "learning_rate": 9.931708302919394e-06, "loss": 5.0586, "step": 227 }, { "batch_num_effect_tokens": 8015, "batch_num_samples": 19, "batch_num_tokens": 8192, "epoch": 0.29586, "grad_norm": 0.13626770675182343, "learning_rate": 9.929828900990367e-06, "loss": 4.7988, "step": 228 }, { "batch_num_effect_tokens": 8057, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.29716, "grad_norm": 0.14051003754138947, "learning_rate": 9.927924170825266e-06, "loss": 5.0586, "step": 229 }, { "batch_num_effect_tokens": 8036, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.29846, "grad_norm": 0.13720320165157318, "learning_rate": 9.92599412221012e-06, "loss": 4.8613, "step": 230 }, { "batch_num_effect_tokens": 7980, "batch_num_samples": 14, "batch_num_tokens": 8113, "epoch": 0.29976, "grad_norm": 0.15138697624206543, "learning_rate": 9.924038765061042e-06, "loss": 4.7715, "step": 231 }, { "batch_num_effect_tokens": 8023, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 0.30105, "grad_norm": 0.1236131489276886, "learning_rate": 9.922058109424168e-06, "loss": 4.916, "step": 232 }, { "batch_num_effect_tokens": 7912, "batch_num_samples": 23, "batch_num_tokens": 8105, "epoch": 0.30235, "grad_norm": 0.1621561050415039, "learning_rate": 9.920052165475615e-06, "loss": 5.0439, "step": 233 }, { "batch_num_effect_tokens": 8058, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.30365, "grad_norm": 0.14631430804729462, "learning_rate": 9.918020943521427e-06, "loss": 4.79, "step": 234 }, { "batch_num_effect_tokens": 7846, "batch_num_samples": 26, "batch_num_tokens": 8080, "epoch": 0.30495, "grad_norm": 0.13547104597091675, "learning_rate": 9.915964453997516e-06, "loss": 4.9248, "step": 235 }, { "batch_num_effect_tokens": 7915, "batch_num_samples": 15, "batch_num_tokens": 8086, "epoch": 0.30624, "grad_norm": 0.13369937241077423, "learning_rate": 9.913882707469615e-06, "loss": 4.9131, "step": 236 }, { "batch_num_effect_tokens": 8041, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.30754, "grad_norm": 0.14017365872859955, "learning_rate": 9.911775714633218e-06, "loss": 4.5908, "step": 237 }, { "batch_num_effect_tokens": 7915, "batch_num_samples": 16, "batch_num_tokens": 8078, "epoch": 0.30884, "grad_norm": 0.13819383084774017, "learning_rate": 9.909643486313533e-06, "loss": 4.9268, "step": 238 }, { "batch_num_effect_tokens": 8068, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.31014, "grad_norm": 0.14381656050682068, "learning_rate": 9.907486033465421e-06, "loss": 4.8018, "step": 239 }, { "batch_num_effect_tokens": 7998, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.31144, "grad_norm": 0.12868614494800568, "learning_rate": 9.905303367173336e-06, "loss": 4.8428, "step": 240 }, { "batch_num_effect_tokens": 7997, "batch_num_samples": 15, "batch_num_tokens": 8166, "epoch": 0.31273, "grad_norm": 0.13400490581989288, "learning_rate": 9.903095498651276e-06, "loss": 4.8477, "step": 241 }, { "batch_num_effect_tokens": 8032, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.31403, "grad_norm": 0.14183661341667175, "learning_rate": 9.900862439242719e-06, "loss": 4.709, "step": 242 }, { "batch_num_effect_tokens": 8036, "batch_num_samples": 14, "batch_num_tokens": 8191, "epoch": 0.31533, "grad_norm": 0.1373617798089981, "learning_rate": 9.898604200420573e-06, "loss": 5.0449, "step": 243 }, { "batch_num_effect_tokens": 8013, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.31663, "grad_norm": 0.1389341652393341, "learning_rate": 9.896320793787106e-06, "loss": 4.9932, "step": 244 }, { "batch_num_effect_tokens": 7900, "batch_num_samples": 15, "batch_num_tokens": 8092, "epoch": 0.31792, "grad_norm": 0.1446686089038849, "learning_rate": 9.894012231073895e-06, "loss": 4.9473, "step": 245 }, { "batch_num_effect_tokens": 7948, "batch_num_samples": 15, "batch_num_tokens": 8080, "epoch": 0.31922, "grad_norm": 0.1249702200293541, "learning_rate": 9.891678524141759e-06, "loss": 4.7959, "step": 246 }, { "batch_num_effect_tokens": 7985, "batch_num_samples": 19, "batch_num_tokens": 8161, "epoch": 0.32052, "grad_norm": 0.14360341429710388, "learning_rate": 9.889319684980707e-06, "loss": 5.1543, "step": 247 }, { "batch_num_effect_tokens": 7880, "batch_num_samples": 17, "batch_num_tokens": 8062, "epoch": 0.32182, "grad_norm": 0.1445484608411789, "learning_rate": 9.886935725709868e-06, "loss": 4.9531, "step": 248 }, { "batch_num_effect_tokens": 7734, "batch_num_samples": 28, "batch_num_tokens": 8008, "epoch": 0.32311, "grad_norm": 0.1418876349925995, "learning_rate": 9.884526658577433e-06, "loss": 4.9629, "step": 249 }, { "batch_num_effect_tokens": 8059, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.32441, "grad_norm": 0.14063134789466858, "learning_rate": 9.882092495960589e-06, "loss": 5.0117, "step": 250 }, { "batch_num_effect_tokens": 7890, "batch_num_samples": 23, "batch_num_tokens": 8086, "epoch": 0.32571, "grad_norm": 0.13051624596118927, "learning_rate": 9.87963325036546e-06, "loss": 4.5283, "step": 251 }, { "batch_num_effect_tokens": 8060, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.32701, "grad_norm": 0.14672017097473145, "learning_rate": 9.877148934427037e-06, "loss": 4.4834, "step": 252 }, { "batch_num_effect_tokens": 7976, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.3283, "grad_norm": 0.13878877460956573, "learning_rate": 9.874639560909118e-06, "loss": 4.6523, "step": 253 }, { "batch_num_effect_tokens": 7886, "batch_num_samples": 22, "batch_num_tokens": 8100, "epoch": 0.3296, "grad_norm": 0.13505133986473083, "learning_rate": 9.872105142704245e-06, "loss": 4.8672, "step": 254 }, { "batch_num_effect_tokens": 8046, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.3309, "grad_norm": 0.1461716592311859, "learning_rate": 9.869545692833624e-06, "loss": 4.5898, "step": 255 }, { "batch_num_effect_tokens": 8030, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 0.3322, "grad_norm": 0.16102413833141327, "learning_rate": 9.866961224447076e-06, "loss": 4.7529, "step": 256 }, { "batch_num_effect_tokens": 8072, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.3335, "grad_norm": 0.13654085993766785, "learning_rate": 9.864351750822957e-06, "loss": 4.6143, "step": 257 }, { "batch_num_effect_tokens": 8020, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.33479, "grad_norm": 0.13663263618946075, "learning_rate": 9.86171728536809e-06, "loss": 4.9082, "step": 258 }, { "batch_num_effect_tokens": 8034, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.33609, "grad_norm": 0.13321144878864288, "learning_rate": 9.859057841617709e-06, "loss": 5.084, "step": 259 }, { "batch_num_effect_tokens": 8063, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.33739, "grad_norm": 0.13456861674785614, "learning_rate": 9.856373433235373e-06, "loss": 4.8818, "step": 260 }, { "batch_num_effect_tokens": 8026, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.33869, "grad_norm": 0.13602375984191895, "learning_rate": 9.853664074012907e-06, "loss": 5.0449, "step": 261 }, { "batch_num_effect_tokens": 8032, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.33998, "grad_norm": 0.13534726202487946, "learning_rate": 9.850929777870324e-06, "loss": 4.9688, "step": 262 }, { "batch_num_effect_tokens": 8016, "batch_num_samples": 14, "batch_num_tokens": 8160, "epoch": 0.34128, "grad_norm": 0.14672520756721497, "learning_rate": 9.848170558855757e-06, "loss": 4.6787, "step": 263 }, { "batch_num_effect_tokens": 8041, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.34258, "grad_norm": 0.12731723487377167, "learning_rate": 9.84538643114539e-06, "loss": 4.998, "step": 264 }, { "batch_num_effect_tokens": 8032, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.34388, "grad_norm": 0.12880627810955048, "learning_rate": 9.84257740904338e-06, "loss": 4.9824, "step": 265 }, { "batch_num_effect_tokens": 7977, "batch_num_samples": 14, "batch_num_tokens": 8128, "epoch": 0.34517, "grad_norm": 0.13919463753700256, "learning_rate": 9.839743506981783e-06, "loss": 4.5137, "step": 266 }, { "batch_num_effect_tokens": 8070, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.34647, "grad_norm": 0.14044621586799622, "learning_rate": 9.836884739520482e-06, "loss": 4.8906, "step": 267 }, { "batch_num_effect_tokens": 8033, "batch_num_samples": 17, "batch_num_tokens": 8191, "epoch": 0.34777, "grad_norm": 0.14072751998901367, "learning_rate": 9.83400112134712e-06, "loss": 5.084, "step": 268 }, { "batch_num_effect_tokens": 8052, "batch_num_samples": 14, "batch_num_tokens": 8184, "epoch": 0.34907, "grad_norm": 0.13198843598365784, "learning_rate": 9.831092667277002e-06, "loss": 4.7402, "step": 269 }, { "batch_num_effect_tokens": 8046, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.35036, "grad_norm": 0.14157812297344208, "learning_rate": 9.828159392253051e-06, "loss": 4.8887, "step": 270 }, { "batch_num_effect_tokens": 8047, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.35166, "grad_norm": 0.13811397552490234, "learning_rate": 9.8252013113457e-06, "loss": 4.8408, "step": 271 }, { "batch_num_effect_tokens": 8023, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.35296, "grad_norm": 0.141897514462471, "learning_rate": 9.822218439752835e-06, "loss": 4.8301, "step": 272 }, { "batch_num_effect_tokens": 7977, "batch_num_samples": 14, "batch_num_tokens": 8149, "epoch": 0.35426, "grad_norm": 0.1375928372144699, "learning_rate": 9.819210792799711e-06, "loss": 4.8154, "step": 273 }, { "batch_num_effect_tokens": 7896, "batch_num_samples": 20, "batch_num_tokens": 8096, "epoch": 0.35556, "grad_norm": 0.14474721252918243, "learning_rate": 9.816178385938867e-06, "loss": 4.959, "step": 274 }, { "batch_num_effect_tokens": 7988, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.35685, "grad_norm": 0.14300163090229034, "learning_rate": 9.81312123475006e-06, "loss": 4.9746, "step": 275 }, { "batch_num_effect_tokens": 8009, "batch_num_samples": 14, "batch_num_tokens": 8173, "epoch": 0.35815, "grad_norm": 0.1514219492673874, "learning_rate": 9.810039354940172e-06, "loss": 4.9414, "step": 276 }, { "batch_num_effect_tokens": 8001, "batch_num_samples": 15, "batch_num_tokens": 8145, "epoch": 0.35945, "grad_norm": 0.13827340304851532, "learning_rate": 9.806932762343136e-06, "loss": 4.9424, "step": 277 }, { "batch_num_effect_tokens": 7975, "batch_num_samples": 17, "batch_num_tokens": 8152, "epoch": 0.36075, "grad_norm": 0.14099182188510895, "learning_rate": 9.80380147291985e-06, "loss": 5.1465, "step": 278 }, { "batch_num_effect_tokens": 8024, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.36204, "grad_norm": 0.12410301715135574, "learning_rate": 9.800645502758104e-06, "loss": 4.9053, "step": 279 }, { "batch_num_effect_tokens": 7958, "batch_num_samples": 14, "batch_num_tokens": 8101, "epoch": 0.36334, "grad_norm": 0.1367730349302292, "learning_rate": 9.797464868072489e-06, "loss": 4.6543, "step": 280 }, { "batch_num_effect_tokens": 7939, "batch_num_samples": 22, "batch_num_tokens": 8146, "epoch": 0.36464, "grad_norm": 0.1414947360754013, "learning_rate": 9.794259585204313e-06, "loss": 4.9229, "step": 281 }, { "batch_num_effect_tokens": 8007, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.36594, "grad_norm": 0.12790058553218842, "learning_rate": 9.791029670621525e-06, "loss": 4.9121, "step": 282 }, { "batch_num_effect_tokens": 8015, "batch_num_samples": 21, "batch_num_tokens": 8192, "epoch": 0.36723, "grad_norm": 0.13167431950569153, "learning_rate": 9.787775140918625e-06, "loss": 4.918, "step": 283 }, { "batch_num_effect_tokens": 8028, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 0.36853, "grad_norm": 0.14066706597805023, "learning_rate": 9.784496012816574e-06, "loss": 4.8828, "step": 284 }, { "batch_num_effect_tokens": 7989, "batch_num_samples": 23, "batch_num_tokens": 8192, "epoch": 0.36983, "grad_norm": 0.14358888566493988, "learning_rate": 9.781192303162721e-06, "loss": 4.7529, "step": 285 }, { "batch_num_effect_tokens": 8052, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.37113, "grad_norm": 0.14802835881710052, "learning_rate": 9.777864028930705e-06, "loss": 4.8633, "step": 286 }, { "batch_num_effect_tokens": 7988, "batch_num_samples": 15, "batch_num_tokens": 8146, "epoch": 0.37242, "grad_norm": 0.14124155044555664, "learning_rate": 9.774511207220369e-06, "loss": 4.8584, "step": 287 }, { "batch_num_effect_tokens": 7869, "batch_num_samples": 15, "batch_num_tokens": 8086, "epoch": 0.37372, "grad_norm": 0.13983023166656494, "learning_rate": 9.771133855257684e-06, "loss": 5.127, "step": 288 }, { "batch_num_effect_tokens": 8036, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.37502, "grad_norm": 0.12876510620117188, "learning_rate": 9.767731990394638e-06, "loss": 4.8506, "step": 289 }, { "batch_num_effect_tokens": 8044, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.37632, "grad_norm": 0.13428495824337006, "learning_rate": 9.764305630109174e-06, "loss": 4.8955, "step": 290 }, { "batch_num_effect_tokens": 7997, "batch_num_samples": 16, "batch_num_tokens": 8155, "epoch": 0.37762, "grad_norm": 0.12863335013389587, "learning_rate": 9.760854792005075e-06, "loss": 5.1689, "step": 291 }, { "batch_num_effect_tokens": 8068, "batch_num_samples": 14, "batch_num_tokens": 8184, "epoch": 0.37891, "grad_norm": 0.13183218240737915, "learning_rate": 9.757379493811892e-06, "loss": 4.8193, "step": 292 }, { "batch_num_effect_tokens": 8035, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.38021, "grad_norm": 0.1384708732366562, "learning_rate": 9.753879753384845e-06, "loss": 4.8105, "step": 293 }, { "batch_num_effect_tokens": 8034, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 0.38151, "grad_norm": 0.1356097161769867, "learning_rate": 9.750355588704728e-06, "loss": 4.9473, "step": 294 }, { "batch_num_effect_tokens": 8032, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.38281, "grad_norm": 0.13910697400569916, "learning_rate": 9.746807017877823e-06, "loss": 4.9854, "step": 295 }, { "batch_num_effect_tokens": 7997, "batch_num_samples": 17, "batch_num_tokens": 8191, "epoch": 0.3841, "grad_norm": 0.13383249938488007, "learning_rate": 9.743234059135812e-06, "loss": 4.8418, "step": 296 }, { "batch_num_effect_tokens": 8049, "batch_num_samples": 14, "batch_num_tokens": 8164, "epoch": 0.3854, "grad_norm": 0.1411924511194229, "learning_rate": 9.73963673083566e-06, "loss": 4.9199, "step": 297 }, { "batch_num_effect_tokens": 8025, "batch_num_samples": 19, "batch_num_tokens": 8192, "epoch": 0.3867, "grad_norm": 0.1354569047689438, "learning_rate": 9.736015051459551e-06, "loss": 4.6748, "step": 298 }, { "batch_num_effect_tokens": 8065, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.388, "grad_norm": 0.138069286942482, "learning_rate": 9.732369039614774e-06, "loss": 4.8672, "step": 299 }, { "batch_num_effect_tokens": 8056, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.38929, "grad_norm": 0.137836754322052, "learning_rate": 9.728698714033631e-06, "loss": 5.0059, "step": 300 }, { "batch_num_effect_tokens": 8037, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.39059, "grad_norm": 0.14079798758029938, "learning_rate": 9.725004093573343e-06, "loss": 5.0039, "step": 301 }, { "batch_num_effect_tokens": 7842, "batch_num_samples": 17, "batch_num_tokens": 7989, "epoch": 0.39189, "grad_norm": 0.13341772556304932, "learning_rate": 9.721285197215954e-06, "loss": 4.8281, "step": 302 }, { "batch_num_effect_tokens": 7995, "batch_num_samples": 19, "batch_num_tokens": 8191, "epoch": 0.39319, "grad_norm": 0.14130514860153198, "learning_rate": 9.717542044068224e-06, "loss": 4.6729, "step": 303 }, { "batch_num_effect_tokens": 7882, "batch_num_samples": 25, "batch_num_tokens": 8105, "epoch": 0.39448, "grad_norm": 0.1398962289094925, "learning_rate": 9.71377465336155e-06, "loss": 4.6582, "step": 304 }, { "batch_num_effect_tokens": 7844, "batch_num_samples": 20, "batch_num_tokens": 8080, "epoch": 0.39578, "grad_norm": 0.1334947943687439, "learning_rate": 9.709983044451847e-06, "loss": 4.6211, "step": 305 }, { "batch_num_effect_tokens": 7944, "batch_num_samples": 14, "batch_num_tokens": 8088, "epoch": 0.39708, "grad_norm": 0.1320602148771286, "learning_rate": 9.70616723681946e-06, "loss": 4.7031, "step": 306 }, { "batch_num_effect_tokens": 8048, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.39838, "grad_norm": 0.13198183476924896, "learning_rate": 9.702327250069058e-06, "loss": 4.5635, "step": 307 }, { "batch_num_effect_tokens": 8024, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.39968, "grad_norm": 0.13476605713367462, "learning_rate": 9.698463103929542e-06, "loss": 4.9883, "step": 308 }, { "batch_num_effect_tokens": 8024, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.39968, "eval_eval_loss": 0.6101124882698059, "eval_eval_runtime": 115.2852, "eval_eval_samples_per_second": 43.371, "eval_eval_steps_per_second": 2.715, "step": 308 }, { "batch_num_effect_tokens": 7840, "batch_num_samples": 28, "batch_num_tokens": 8104, "epoch": 0.40097, "grad_norm": 0.13941439986228943, "learning_rate": 9.694574818253935e-06, "loss": 4.916, "step": 309 }, { "batch_num_effect_tokens": 8024, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.40227, "grad_norm": 0.13381004333496094, "learning_rate": 9.69066241301928e-06, "loss": 5.0781, "step": 310 }, { "batch_num_effect_tokens": 8042, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.40357, "grad_norm": 0.14835673570632935, "learning_rate": 9.686725908326547e-06, "loss": 4.8125, "step": 311 }, { "batch_num_effect_tokens": 8038, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 0.40487, "grad_norm": 0.1386537253856659, "learning_rate": 9.682765324400514e-06, "loss": 4.7021, "step": 312 }, { "batch_num_effect_tokens": 7999, "batch_num_samples": 14, "batch_num_tokens": 8137, "epoch": 0.40616, "grad_norm": 0.13303305208683014, "learning_rate": 9.67878068158968e-06, "loss": 4.6924, "step": 313 }, { "batch_num_effect_tokens": 8070, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.40746, "grad_norm": 0.13811469078063965, "learning_rate": 9.674772000366151e-06, "loss": 4.8867, "step": 314 }, { "batch_num_effect_tokens": 8025, "batch_num_samples": 15, "batch_num_tokens": 8176, "epoch": 0.40876, "grad_norm": 0.13623467087745667, "learning_rate": 9.670739301325534e-06, "loss": 4.7764, "step": 315 }, { "batch_num_effect_tokens": 8006, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 0.41006, "grad_norm": 0.13184677064418793, "learning_rate": 9.666682605186834e-06, "loss": 4.6846, "step": 316 }, { "batch_num_effect_tokens": 8058, "batch_num_samples": 14, "batch_num_tokens": 8190, "epoch": 0.41135, "grad_norm": 0.13243718445301056, "learning_rate": 9.662601932792349e-06, "loss": 4.5635, "step": 317 }, { "batch_num_effect_tokens": 7975, "batch_num_samples": 14, "batch_num_tokens": 8104, "epoch": 0.41265, "grad_norm": 0.13839715719223022, "learning_rate": 9.658497305107559e-06, "loss": 4.8477, "step": 318 }, { "batch_num_effect_tokens": 7927, "batch_num_samples": 14, "batch_num_tokens": 8086, "epoch": 0.41395, "grad_norm": 0.1342511624097824, "learning_rate": 9.654368743221022e-06, "loss": 4.8398, "step": 319 }, { "batch_num_effect_tokens": 8039, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.41525, "grad_norm": 0.1273842304944992, "learning_rate": 9.650216268344263e-06, "loss": 4.4043, "step": 320 }, { "batch_num_effect_tokens": 7888, "batch_num_samples": 23, "batch_num_tokens": 8086, "epoch": 0.41655, "grad_norm": 0.12917739152908325, "learning_rate": 9.646039901811666e-06, "loss": 4.6807, "step": 321 }, { "batch_num_effect_tokens": 8011, "batch_num_samples": 14, "batch_num_tokens": 8142, "epoch": 0.41784, "grad_norm": 0.13067729771137238, "learning_rate": 9.641839665080363e-06, "loss": 4.6602, "step": 322 }, { "batch_num_effect_tokens": 8013, "batch_num_samples": 17, "batch_num_tokens": 8181, "epoch": 0.41914, "grad_norm": 0.13833428919315338, "learning_rate": 9.63761557973013e-06, "loss": 4.6309, "step": 323 }, { "batch_num_effect_tokens": 8008, "batch_num_samples": 19, "batch_num_tokens": 8176, "epoch": 0.42044, "grad_norm": 0.14830631017684937, "learning_rate": 9.633367667463267e-06, "loss": 5.5645, "step": 324 }, { "batch_num_effect_tokens": 7975, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 0.42174, "grad_norm": 0.13653573393821716, "learning_rate": 9.62909595010449e-06, "loss": 5.4238, "step": 325 }, { "batch_num_effect_tokens": 7962, "batch_num_samples": 14, "batch_num_tokens": 8120, "epoch": 0.42303, "grad_norm": 0.13599801063537598, "learning_rate": 9.624800449600826e-06, "loss": 5.1523, "step": 326 }, { "batch_num_effect_tokens": 7994, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.42433, "grad_norm": 0.1389356106519699, "learning_rate": 9.620481188021484e-06, "loss": 4.9199, "step": 327 }, { "batch_num_effect_tokens": 7996, "batch_num_samples": 16, "batch_num_tokens": 8166, "epoch": 0.42563, "grad_norm": 0.1340194195508957, "learning_rate": 9.616138187557758e-06, "loss": 4.7656, "step": 328 }, { "batch_num_effect_tokens": 8002, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.42693, "grad_norm": 0.1351163685321808, "learning_rate": 9.611771470522908e-06, "loss": 5.2266, "step": 329 }, { "batch_num_effect_tokens": 7892, "batch_num_samples": 26, "batch_num_tokens": 8137, "epoch": 0.42822, "grad_norm": 0.13661961257457733, "learning_rate": 9.60738105935204e-06, "loss": 5.0176, "step": 330 }, { "batch_num_effect_tokens": 8043, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.42952, "grad_norm": 0.1298064887523651, "learning_rate": 9.602966976601995e-06, "loss": 4.6572, "step": 331 }, { "batch_num_effect_tokens": 7974, "batch_num_samples": 18, "batch_num_tokens": 8128, "epoch": 0.43082, "grad_norm": 0.13714361190795898, "learning_rate": 9.598529244951233e-06, "loss": 4.959, "step": 332 }, { "batch_num_effect_tokens": 8021, "batch_num_samples": 17, "batch_num_tokens": 8186, "epoch": 0.43212, "grad_norm": 0.13826780021190643, "learning_rate": 9.594067887199719e-06, "loss": 4.875, "step": 333 }, { "batch_num_effect_tokens": 7877, "batch_num_samples": 18, "batch_num_tokens": 8100, "epoch": 0.43341, "grad_norm": 0.1306610256433487, "learning_rate": 9.589582926268798e-06, "loss": 4.2568, "step": 334 }, { "batch_num_effect_tokens": 7949, "batch_num_samples": 14, "batch_num_tokens": 8079, "epoch": 0.43471, "grad_norm": 0.12845724821090698, "learning_rate": 9.585074385201087e-06, "loss": 4.8184, "step": 335 }, { "batch_num_effect_tokens": 8046, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.43601, "grad_norm": 0.13526244461536407, "learning_rate": 9.580542287160348e-06, "loss": 4.8398, "step": 336 }, { "batch_num_effect_tokens": 7987, "batch_num_samples": 20, "batch_num_tokens": 8186, "epoch": 0.43731, "grad_norm": 0.1280571073293686, "learning_rate": 9.575986655431377e-06, "loss": 4.7578, "step": 337 }, { "batch_num_effect_tokens": 8017, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.43861, "grad_norm": 0.12609054148197174, "learning_rate": 9.571407513419878e-06, "loss": 4.6699, "step": 338 }, { "batch_num_effect_tokens": 7953, "batch_num_samples": 15, "batch_num_tokens": 8104, "epoch": 0.4399, "grad_norm": 0.12861526012420654, "learning_rate": 9.566804884652342e-06, "loss": 4.9395, "step": 339 }, { "batch_num_effect_tokens": 8027, "batch_num_samples": 18, "batch_num_tokens": 8191, "epoch": 0.4412, "grad_norm": 0.15593406558036804, "learning_rate": 9.562178792775936e-06, "loss": 4.7197, "step": 340 }, { "batch_num_effect_tokens": 7989, "batch_num_samples": 14, "batch_num_tokens": 8142, "epoch": 0.4425, "grad_norm": 0.1305861622095108, "learning_rate": 9.557529261558367e-06, "loss": 4.8428, "step": 341 }, { "batch_num_effect_tokens": 8042, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.4438, "grad_norm": 0.13278287649154663, "learning_rate": 9.552856314887772e-06, "loss": 4.7871, "step": 342 }, { "batch_num_effect_tokens": 8030, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.44509, "grad_norm": 0.12788629531860352, "learning_rate": 9.548159976772593e-06, "loss": 4.709, "step": 343 }, { "batch_num_effect_tokens": 8030, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 0.44639, "grad_norm": 0.12882810831069946, "learning_rate": 9.543440271341445e-06, "loss": 4.9229, "step": 344 }, { "batch_num_effect_tokens": 8059, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.44769, "grad_norm": 0.1309846192598343, "learning_rate": 9.538697222843004e-06, "loss": 4.8623, "step": 345 }, { "batch_num_effect_tokens": 7976, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.44899, "grad_norm": 0.12217242270708084, "learning_rate": 9.533930855645872e-06, "loss": 4.7715, "step": 346 }, { "batch_num_effect_tokens": 7944, "batch_num_samples": 14, "batch_num_tokens": 8112, "epoch": 0.45028, "grad_norm": 0.13333797454833984, "learning_rate": 9.529141194238462e-06, "loss": 4.8975, "step": 347 }, { "batch_num_effect_tokens": 7923, "batch_num_samples": 15, "batch_num_tokens": 8110, "epoch": 0.45158, "grad_norm": 0.12796379625797272, "learning_rate": 9.524328263228866e-06, "loss": 4.8311, "step": 348 }, { "batch_num_effect_tokens": 8016, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.45288, "grad_norm": 0.13038370013237, "learning_rate": 9.519492087344724e-06, "loss": 4.708, "step": 349 }, { "batch_num_effect_tokens": 8035, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.45418, "grad_norm": 0.14246924221515656, "learning_rate": 9.514632691433108e-06, "loss": 4.5479, "step": 350 }, { "batch_num_effect_tokens": 8040, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.45547, "grad_norm": 0.13306821882724762, "learning_rate": 9.509750100460384e-06, "loss": 4.7334, "step": 351 }, { "batch_num_effect_tokens": 8036, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.45677, "grad_norm": 0.12792479991912842, "learning_rate": 9.504844339512096e-06, "loss": 4.626, "step": 352 }, { "batch_num_effect_tokens": 8073, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.45807, "grad_norm": 0.13247136771678925, "learning_rate": 9.499915433792823e-06, "loss": 4.9121, "step": 353 }, { "batch_num_effect_tokens": 8003, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.45937, "grad_norm": 0.13608035445213318, "learning_rate": 9.494963408626056e-06, "loss": 5.0977, "step": 354 }, { "batch_num_effect_tokens": 8065, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.46067, "grad_norm": 0.1288149058818817, "learning_rate": 9.489988289454073e-06, "loss": 4.7832, "step": 355 }, { "batch_num_effect_tokens": 8038, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.46196, "grad_norm": 0.13947491347789764, "learning_rate": 9.484990101837798e-06, "loss": 4.625, "step": 356 }, { "batch_num_effect_tokens": 8042, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.46326, "grad_norm": 0.13044162094593048, "learning_rate": 9.47996887145668e-06, "loss": 4.6738, "step": 357 }, { "batch_num_effect_tokens": 7940, "batch_num_samples": 20, "batch_num_tokens": 8144, "epoch": 0.46456, "grad_norm": 0.13908232748508453, "learning_rate": 9.47492462410855e-06, "loss": 4.6367, "step": 358 }, { "batch_num_effect_tokens": 7940, "batch_num_samples": 15, "batch_num_tokens": 8116, "epoch": 0.46586, "grad_norm": 0.13168592751026154, "learning_rate": 9.469857385709498e-06, "loss": 4.7568, "step": 359 }, { "batch_num_effect_tokens": 7968, "batch_num_samples": 14, "batch_num_tokens": 8107, "epoch": 0.46715, "grad_norm": 0.13281431794166565, "learning_rate": 9.46476718229374e-06, "loss": 4.833, "step": 360 }, { "batch_num_effect_tokens": 8036, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.46845, "grad_norm": 0.13974054157733917, "learning_rate": 9.45965404001347e-06, "loss": 4.6826, "step": 361 }, { "batch_num_effect_tokens": 8018, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.46975, "grad_norm": 0.1310376226902008, "learning_rate": 9.454517985138748e-06, "loss": 4.626, "step": 362 }, { "batch_num_effect_tokens": 7976, "batch_num_samples": 15, "batch_num_tokens": 8128, "epoch": 0.47105, "grad_norm": 0.12379533797502518, "learning_rate": 9.449359044057344e-06, "loss": 4.4814, "step": 363 }, { "batch_num_effect_tokens": 7984, "batch_num_samples": 14, "batch_num_tokens": 8128, "epoch": 0.47234, "grad_norm": 0.13945648074150085, "learning_rate": 9.444177243274619e-06, "loss": 4.6367, "step": 364 }, { "batch_num_effect_tokens": 8009, "batch_num_samples": 19, "batch_num_tokens": 8192, "epoch": 0.47364, "grad_norm": 0.12774516642093658, "learning_rate": 9.438972609413376e-06, "loss": 4.7061, "step": 365 }, { "batch_num_effect_tokens": 7941, "batch_num_samples": 23, "batch_num_tokens": 8162, "epoch": 0.47494, "grad_norm": 0.12300989776849747, "learning_rate": 9.433745169213729e-06, "loss": 4.6963, "step": 366 }, { "batch_num_effect_tokens": 7997, "batch_num_samples": 20, "batch_num_tokens": 8176, "epoch": 0.47624, "grad_norm": 0.1422237604856491, "learning_rate": 9.428494949532972e-06, "loss": 5.0645, "step": 367 }, { "batch_num_effect_tokens": 8039, "batch_num_samples": 15, "batch_num_tokens": 8186, "epoch": 0.47753, "grad_norm": 0.14680571854114532, "learning_rate": 9.423221977345425e-06, "loss": 4.583, "step": 368 }, { "batch_num_effect_tokens": 8044, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.47883, "grad_norm": 0.14597126841545105, "learning_rate": 9.41792627974231e-06, "loss": 4.7686, "step": 369 }, { "batch_num_effect_tokens": 8035, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.48013, "grad_norm": 0.13670004904270172, "learning_rate": 9.412607883931608e-06, "loss": 4.7676, "step": 370 }, { "batch_num_effect_tokens": 8010, "batch_num_samples": 21, "batch_num_tokens": 8192, "epoch": 0.48143, "grad_norm": 0.1419769525527954, "learning_rate": 9.40726681723791e-06, "loss": 4.9912, "step": 371 }, { "batch_num_effect_tokens": 7992, "batch_num_samples": 18, "batch_num_tokens": 8184, "epoch": 0.48273, "grad_norm": 0.13378414511680603, "learning_rate": 9.401903107102295e-06, "loss": 4.8301, "step": 372 }, { "batch_num_effect_tokens": 8038, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.48402, "grad_norm": 0.12032909691333771, "learning_rate": 9.396516781082172e-06, "loss": 4.9736, "step": 373 }, { "batch_num_effect_tokens": 7962, "batch_num_samples": 23, "batch_num_tokens": 8181, "epoch": 0.48532, "grad_norm": 0.14288964867591858, "learning_rate": 9.391107866851143e-06, "loss": 5.4004, "step": 374 }, { "batch_num_effect_tokens": 7887, "batch_num_samples": 18, "batch_num_tokens": 8086, "epoch": 0.48662, "grad_norm": 0.13049355149269104, "learning_rate": 9.385676392198869e-06, "loss": 4.8486, "step": 375 }, { "batch_num_effect_tokens": 7965, "batch_num_samples": 22, "batch_num_tokens": 8177, "epoch": 0.48792, "grad_norm": 0.12765397131443024, "learning_rate": 9.380222385030916e-06, "loss": 4.8682, "step": 376 }, { "batch_num_effect_tokens": 7958, "batch_num_samples": 16, "batch_num_tokens": 8133, "epoch": 0.48921, "grad_norm": 0.12337980419397354, "learning_rate": 9.374745873368614e-06, "loss": 4.6826, "step": 377 }, { "batch_num_effect_tokens": 8071, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.49051, "grad_norm": 0.12957154214382172, "learning_rate": 9.369246885348926e-06, "loss": 5.0645, "step": 378 }, { "batch_num_effect_tokens": 7992, "batch_num_samples": 14, "batch_num_tokens": 8155, "epoch": 0.49181, "grad_norm": 0.13127748668193817, "learning_rate": 9.363725449224281e-06, "loss": 4.9531, "step": 379 }, { "batch_num_effect_tokens": 8056, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.49311, "grad_norm": 0.13518795371055603, "learning_rate": 9.35818159336245e-06, "loss": 4.9785, "step": 380 }, { "batch_num_effect_tokens": 7961, "batch_num_samples": 17, "batch_num_tokens": 8108, "epoch": 0.4944, "grad_norm": 0.13270984590053558, "learning_rate": 9.352615346246383e-06, "loss": 4.8457, "step": 381 }, { "batch_num_effect_tokens": 8028, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.4957, "grad_norm": 0.13498768210411072, "learning_rate": 9.347026736474077e-06, "loss": 4.6934, "step": 382 }, { "batch_num_effect_tokens": 8029, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.497, "grad_norm": 0.1299462616443634, "learning_rate": 9.341415792758421e-06, "loss": 5.041, "step": 383 }, { "batch_num_effect_tokens": 8015, "batch_num_samples": 17, "batch_num_tokens": 8178, "epoch": 0.4983, "grad_norm": 0.14259278774261475, "learning_rate": 9.33578254392705e-06, "loss": 4.8867, "step": 384 }, { "batch_num_effect_tokens": 8068, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.49959, "grad_norm": 0.142298623919487, "learning_rate": 9.330127018922195e-06, "loss": 5.1357, "step": 385 }, { "batch_num_effect_tokens": 8053, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.50089, "grad_norm": 0.1336406022310257, "learning_rate": 9.324449246800538e-06, "loss": 4.9541, "step": 386 }, { "batch_num_effect_tokens": 8067, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.50219, "grad_norm": 0.13553744554519653, "learning_rate": 9.318749256733064e-06, "loss": 5.0166, "step": 387 }, { "batch_num_effect_tokens": 8046, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.50349, "grad_norm": 0.12613889575004578, "learning_rate": 9.313027078004903e-06, "loss": 4.8721, "step": 388 }, { "batch_num_effect_tokens": 7974, "batch_num_samples": 15, "batch_num_tokens": 8126, "epoch": 0.50479, "grad_norm": 0.1368468850851059, "learning_rate": 9.307282740015192e-06, "loss": 5.2559, "step": 389 }, { "batch_num_effect_tokens": 8008, "batch_num_samples": 19, "batch_num_tokens": 8191, "epoch": 0.50608, "grad_norm": 0.14101892709732056, "learning_rate": 9.301516272276907e-06, "loss": 4.7598, "step": 390 }, { "batch_num_effect_tokens": 7966, "batch_num_samples": 17, "batch_num_tokens": 8101, "epoch": 0.50738, "grad_norm": 0.1324855089187622, "learning_rate": 9.295727704416731e-06, "loss": 5.0908, "step": 391 }, { "batch_num_effect_tokens": 7998, "batch_num_samples": 19, "batch_num_tokens": 8192, "epoch": 0.50868, "grad_norm": 0.13836555182933807, "learning_rate": 9.289917066174887e-06, "loss": 4.9092, "step": 392 }, { "batch_num_effect_tokens": 8033, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.50998, "grad_norm": 0.1290932595729828, "learning_rate": 9.284084387404985e-06, "loss": 5.0156, "step": 393 }, { "batch_num_effect_tokens": 8009, "batch_num_samples": 17, "batch_num_tokens": 8191, "epoch": 0.51127, "grad_norm": 0.13208125531673431, "learning_rate": 9.278229698073889e-06, "loss": 4.7783, "step": 394 }, { "batch_num_effect_tokens": 8040, "batch_num_samples": 16, "batch_num_tokens": 8188, "epoch": 0.51257, "grad_norm": 0.12734545767307281, "learning_rate": 9.27235302826153e-06, "loss": 4.6992, "step": 395 }, { "batch_num_effect_tokens": 8025, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.51387, "grad_norm": 0.1279255449771881, "learning_rate": 9.266454408160779e-06, "loss": 4.3008, "step": 396 }, { "batch_num_effect_tokens": 8048, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.51517, "grad_norm": 0.13789525628089905, "learning_rate": 9.260533868077283e-06, "loss": 4.7852, "step": 397 }, { "batch_num_effect_tokens": 7919, "batch_num_samples": 15, "batch_num_tokens": 8076, "epoch": 0.51646, "grad_norm": 0.12028060853481293, "learning_rate": 9.254591438429305e-06, "loss": 4.7539, "step": 398 }, { "batch_num_effect_tokens": 8069, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.51776, "grad_norm": 0.12482751905918121, "learning_rate": 9.248627149747573e-06, "loss": 4.7402, "step": 399 }, { "batch_num_effect_tokens": 8016, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.51906, "grad_norm": 0.13144247233867645, "learning_rate": 9.242641032675118e-06, "loss": 4.7803, "step": 400 }, { "batch_num_effect_tokens": 8056, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.52036, "grad_norm": 0.13110798597335815, "learning_rate": 9.236633117967125e-06, "loss": 4.6787, "step": 401 }, { "batch_num_effect_tokens": 7975, "batch_num_samples": 17, "batch_num_tokens": 8136, "epoch": 0.52165, "grad_norm": 0.13168026506900787, "learning_rate": 9.230603436490764e-06, "loss": 4.8691, "step": 402 }, { "batch_num_effect_tokens": 8011, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.52295, "grad_norm": 0.14049184322357178, "learning_rate": 9.224552019225044e-06, "loss": 4.9766, "step": 403 }, { "batch_num_effect_tokens": 7976, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.52425, "grad_norm": 0.13541923463344574, "learning_rate": 9.21847889726064e-06, "loss": 4.6348, "step": 404 }, { "batch_num_effect_tokens": 8024, "batch_num_samples": 15, "batch_num_tokens": 8158, "epoch": 0.52555, "grad_norm": 0.13659295439720154, "learning_rate": 9.212384101799748e-06, "loss": 5.1406, "step": 405 }, { "batch_num_effect_tokens": 8017, "batch_num_samples": 14, "batch_num_tokens": 8188, "epoch": 0.52685, "grad_norm": 0.1369631290435791, "learning_rate": 9.206267664155906e-06, "loss": 4.7207, "step": 406 }, { "batch_num_effect_tokens": 7976, "batch_num_samples": 15, "batch_num_tokens": 8128, "epoch": 0.52814, "grad_norm": 0.12341079860925674, "learning_rate": 9.200129615753858e-06, "loss": 4.6543, "step": 407 }, { "batch_num_effect_tokens": 7884, "batch_num_samples": 20, "batch_num_tokens": 8080, "epoch": 0.52944, "grad_norm": 0.1462087631225586, "learning_rate": 9.193969988129367e-06, "loss": 4.8408, "step": 408 }, { "batch_num_effect_tokens": 7955, "batch_num_samples": 14, "batch_num_tokens": 8119, "epoch": 0.53074, "grad_norm": 0.136996328830719, "learning_rate": 9.187788812929074e-06, "loss": 4.9297, "step": 409 }, { "batch_num_effect_tokens": 7812, "batch_num_samples": 32, "batch_num_tokens": 8076, "epoch": 0.53204, "grad_norm": 0.1477639228105545, "learning_rate": 9.181586121910317e-06, "loss": 4.9512, "step": 410 }, { "batch_num_effect_tokens": 7994, "batch_num_samples": 14, "batch_num_tokens": 8136, "epoch": 0.53333, "grad_norm": 0.13376633822917938, "learning_rate": 9.175361946940983e-06, "loss": 4.9346, "step": 411 }, { "batch_num_effect_tokens": 7993, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.53463, "grad_norm": 0.12200411409139633, "learning_rate": 9.169116319999336e-06, "loss": 4.5762, "step": 412 }, { "batch_num_effect_tokens": 8054, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.53593, "grad_norm": 0.14485883712768555, "learning_rate": 9.162849273173857e-06, "loss": 4.7148, "step": 413 }, { "batch_num_effect_tokens": 8020, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.53723, "grad_norm": 0.12444700300693512, "learning_rate": 9.156560838663076e-06, "loss": 4.5879, "step": 414 }, { "batch_num_effect_tokens": 8043, "batch_num_samples": 14, "batch_num_tokens": 8184, "epoch": 0.53852, "grad_norm": 0.13118426501750946, "learning_rate": 9.150251048775403e-06, "loss": 4.6113, "step": 415 }, { "batch_num_effect_tokens": 8042, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.53982, "grad_norm": 0.12771986424922943, "learning_rate": 9.143919935928975e-06, "loss": 4.8223, "step": 416 }, { "batch_num_effect_tokens": 8040, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.54112, "grad_norm": 0.13158395886421204, "learning_rate": 9.137567532651477e-06, "loss": 4.6729, "step": 417 }, { "batch_num_effect_tokens": 8023, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.54242, "grad_norm": 0.13526186347007751, "learning_rate": 9.131193871579975e-06, "loss": 4.4736, "step": 418 }, { "batch_num_effect_tokens": 7949, "batch_num_samples": 17, "batch_num_tokens": 8100, "epoch": 0.54371, "grad_norm": 0.14715400338172913, "learning_rate": 9.124798985460759e-06, "loss": 4.917, "step": 419 }, { "batch_num_effect_tokens": 7918, "batch_num_samples": 19, "batch_num_tokens": 8101, "epoch": 0.54501, "grad_norm": 0.12297821789979935, "learning_rate": 9.118382907149164e-06, "loss": 4.8252, "step": 420 }, { "batch_num_effect_tokens": 8039, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.54631, "grad_norm": 0.14057192206382751, "learning_rate": 9.111945669609408e-06, "loss": 4.5547, "step": 421 }, { "batch_num_effect_tokens": 8020, "batch_num_samples": 17, "batch_num_tokens": 8181, "epoch": 0.54761, "grad_norm": 0.1264130175113678, "learning_rate": 9.105487305914415e-06, "loss": 4.6621, "step": 422 }, { "batch_num_effect_tokens": 7898, "batch_num_samples": 17, "batch_num_tokens": 8126, "epoch": 0.54891, "grad_norm": 0.12750910222530365, "learning_rate": 9.099007849245656e-06, "loss": 4.7354, "step": 423 }, { "batch_num_effect_tokens": 8061, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.5502, "grad_norm": 0.12850888073444366, "learning_rate": 9.092507332892968e-06, "loss": 4.5928, "step": 424 }, { "batch_num_effect_tokens": 8037, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.5515, "grad_norm": 0.1398119032382965, "learning_rate": 9.08598579025439e-06, "loss": 5.082, "step": 425 }, { "batch_num_effect_tokens": 8018, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.5528, "grad_norm": 0.12561438977718353, "learning_rate": 9.079443254835987e-06, "loss": 4.8418, "step": 426 }, { "batch_num_effect_tokens": 7994, "batch_num_samples": 16, "batch_num_tokens": 8177, "epoch": 0.5541, "grad_norm": 0.13627532124519348, "learning_rate": 9.07287976025168e-06, "loss": 4.748, "step": 427 }, { "batch_num_effect_tokens": 8048, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.55539, "grad_norm": 0.1425987184047699, "learning_rate": 9.066295340223073e-06, "loss": 4.8652, "step": 428 }, { "batch_num_effect_tokens": 8053, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.55669, "grad_norm": 0.12989814579486847, "learning_rate": 9.059690028579285e-06, "loss": 4.5225, "step": 429 }, { "batch_num_effect_tokens": 8028, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.55799, "grad_norm": 0.12423403561115265, "learning_rate": 9.05306385925676e-06, "loss": 4.8164, "step": 430 }, { "batch_num_effect_tokens": 8042, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.55929, "grad_norm": 0.1329265534877777, "learning_rate": 9.04641686629911e-06, "loss": 4.667, "step": 431 }, { "batch_num_effect_tokens": 7906, "batch_num_samples": 14, "batch_num_tokens": 8086, "epoch": 0.56058, "grad_norm": 0.13709376752376556, "learning_rate": 9.039749083856938e-06, "loss": 4.6504, "step": 432 }, { "batch_num_effect_tokens": 8054, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.56188, "grad_norm": 0.1327197253704071, "learning_rate": 9.033060546187651e-06, "loss": 4.9004, "step": 433 }, { "batch_num_effect_tokens": 7986, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.56318, "grad_norm": 0.12089107930660248, "learning_rate": 9.026351287655294e-06, "loss": 4.6582, "step": 434 }, { "batch_num_effect_tokens": 8072, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.56448, "grad_norm": 0.12628640234470367, "learning_rate": 9.019621342730369e-06, "loss": 4.7559, "step": 435 }, { "batch_num_effect_tokens": 8072, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.56577, "grad_norm": 0.12535719573497772, "learning_rate": 9.012870745989663e-06, "loss": 4.7764, "step": 436 }, { "batch_num_effect_tokens": 8059, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.56707, "grad_norm": 0.12927192449569702, "learning_rate": 9.006099532116066e-06, "loss": 4.6074, "step": 437 }, { "batch_num_effect_tokens": 7960, "batch_num_samples": 19, "batch_num_tokens": 8146, "epoch": 0.56837, "grad_norm": 0.12784621119499207, "learning_rate": 8.999307735898389e-06, "loss": 4.3076, "step": 438 }, { "batch_num_effect_tokens": 7991, "batch_num_samples": 17, "batch_num_tokens": 8186, "epoch": 0.56967, "grad_norm": 0.12837977707386017, "learning_rate": 8.992495392231195e-06, "loss": 4.6992, "step": 439 }, { "batch_num_effect_tokens": 7883, "batch_num_samples": 21, "batch_num_tokens": 8091, "epoch": 0.57097, "grad_norm": 0.1314186155796051, "learning_rate": 8.985662536114614e-06, "loss": 4.6367, "step": 440 }, { "batch_num_effect_tokens": 7975, "batch_num_samples": 20, "batch_num_tokens": 8144, "epoch": 0.57226, "grad_norm": 0.13375988602638245, "learning_rate": 8.978809202654161e-06, "loss": 4.8691, "step": 441 }, { "batch_num_effect_tokens": 8024, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.57356, "grad_norm": 0.1275053322315216, "learning_rate": 8.971935427060563e-06, "loss": 4.6514, "step": 442 }, { "batch_num_effect_tokens": 8027, "batch_num_samples": 14, "batch_num_tokens": 8169, "epoch": 0.57486, "grad_norm": 0.14429806172847748, "learning_rate": 8.965041244649572e-06, "loss": 5.0264, "step": 443 }, { "batch_num_effect_tokens": 7937, "batch_num_samples": 18, "batch_num_tokens": 8093, "epoch": 0.57616, "grad_norm": 0.13653282821178436, "learning_rate": 8.95812669084178e-06, "loss": 4.5127, "step": 444 }, { "batch_num_effect_tokens": 7938, "batch_num_samples": 14, "batch_num_tokens": 8093, "epoch": 0.57745, "grad_norm": 0.1339128315448761, "learning_rate": 8.951191801162453e-06, "loss": 4.4707, "step": 445 }, { "batch_num_effect_tokens": 7966, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.57875, "grad_norm": 0.12954694032669067, "learning_rate": 8.944236611241323e-06, "loss": 4.8291, "step": 446 }, { "batch_num_effect_tokens": 8035, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.58005, "grad_norm": 0.13166210055351257, "learning_rate": 8.937261156812436e-06, "loss": 4.7471, "step": 447 }, { "batch_num_effect_tokens": 8030, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.58135, "grad_norm": 0.12804020941257477, "learning_rate": 8.930265473713939e-06, "loss": 4.7012, "step": 448 }, { "batch_num_effect_tokens": 8041, "batch_num_samples": 14, "batch_num_tokens": 8189, "epoch": 0.58264, "grad_norm": 0.13141286373138428, "learning_rate": 8.923249597887913e-06, "loss": 4.7891, "step": 449 }, { "batch_num_effect_tokens": 8058, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.58394, "grad_norm": 0.12595658004283905, "learning_rate": 8.916213565380188e-06, "loss": 5.0732, "step": 450 }, { "batch_num_effect_tokens": 8005, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 0.58524, "grad_norm": 0.1271679848432541, "learning_rate": 8.90915741234015e-06, "loss": 4.5, "step": 451 }, { "batch_num_effect_tokens": 8034, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.58654, "grad_norm": 0.13062655925750732, "learning_rate": 8.902081175020558e-06, "loss": 4.8711, "step": 452 }, { "batch_num_effect_tokens": 7999, "batch_num_samples": 17, "batch_num_tokens": 8165, "epoch": 0.58783, "grad_norm": 0.12481129914522171, "learning_rate": 8.894984889777365e-06, "loss": 4.8623, "step": 453 }, { "batch_num_effect_tokens": 8018, "batch_num_samples": 14, "batch_num_tokens": 8146, "epoch": 0.58913, "grad_norm": 0.1350601315498352, "learning_rate": 8.88786859306952e-06, "loss": 4.916, "step": 454 }, { "batch_num_effect_tokens": 8020, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.59043, "grad_norm": 0.12822680175304413, "learning_rate": 8.880732321458785e-06, "loss": 4.4541, "step": 455 }, { "batch_num_effect_tokens": 7981, "batch_num_samples": 20, "batch_num_tokens": 8160, "epoch": 0.59173, "grad_norm": 0.1304287612438202, "learning_rate": 8.873576111609552e-06, "loss": 4.8018, "step": 456 }, { "batch_num_effect_tokens": 8011, "batch_num_samples": 14, "batch_num_tokens": 8155, "epoch": 0.59303, "grad_norm": 0.13626275956630707, "learning_rate": 8.866400000288652e-06, "loss": 4.9375, "step": 457 }, { "batch_num_effect_tokens": 8009, "batch_num_samples": 17, "batch_num_tokens": 8173, "epoch": 0.59432, "grad_norm": 0.13052628934383392, "learning_rate": 8.85920402436516e-06, "loss": 4.9404, "step": 458 }, { "batch_num_effect_tokens": 7932, "batch_num_samples": 16, "batch_num_tokens": 8111, "epoch": 0.59562, "grad_norm": 0.12803995609283447, "learning_rate": 8.85198822081021e-06, "loss": 4.7695, "step": 459 }, { "batch_num_effect_tokens": 7900, "batch_num_samples": 15, "batch_num_tokens": 8074, "epoch": 0.59692, "grad_norm": 0.12911297380924225, "learning_rate": 8.84475262669681e-06, "loss": 4.6729, "step": 460 }, { "batch_num_effect_tokens": 8024, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.59822, "grad_norm": 0.12205401062965393, "learning_rate": 8.837497279199647e-06, "loss": 4.5557, "step": 461 }, { "batch_num_effect_tokens": 7937, "batch_num_samples": 23, "batch_num_tokens": 8143, "epoch": 0.59951, "grad_norm": 0.1380755603313446, "learning_rate": 8.83022221559489e-06, "loss": 5.0176, "step": 462 }, { "batch_num_effect_tokens": 7937, "batch_num_samples": 23, "batch_num_tokens": 8143, "epoch": 0.59951, "eval_eval_loss": 0.597096860408783, "eval_eval_runtime": 114.9903, "eval_eval_samples_per_second": 43.482, "eval_eval_steps_per_second": 2.722, "step": 462 }, { "batch_num_effect_tokens": 8013, "batch_num_samples": 20, "batch_num_tokens": 8192, "epoch": 0.60081, "grad_norm": 0.12681666016578674, "learning_rate": 8.822927473260012e-06, "loss": 4.998, "step": 463 }, { "batch_num_effect_tokens": 8070, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.60211, "grad_norm": 0.1322617530822754, "learning_rate": 8.815613089673584e-06, "loss": 4.9268, "step": 464 }, { "batch_num_effect_tokens": 7983, "batch_num_samples": 14, "batch_num_tokens": 8135, "epoch": 0.60341, "grad_norm": 0.13114838302135468, "learning_rate": 8.808279102415093e-06, "loss": 4.6543, "step": 465 }, { "batch_num_effect_tokens": 7985, "batch_num_samples": 15, "batch_num_tokens": 8158, "epoch": 0.6047, "grad_norm": 0.12885698676109314, "learning_rate": 8.800925549164742e-06, "loss": 4.6309, "step": 466 }, { "batch_num_effect_tokens": 7946, "batch_num_samples": 17, "batch_num_tokens": 8123, "epoch": 0.606, "grad_norm": 0.14466021955013275, "learning_rate": 8.79355246770326e-06, "loss": 4.5459, "step": 467 }, { "batch_num_effect_tokens": 8068, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.6073, "grad_norm": 0.1357675939798355, "learning_rate": 8.786159895911712e-06, "loss": 4.873, "step": 468 }, { "batch_num_effect_tokens": 8008, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.6086, "grad_norm": 0.1205708235502243, "learning_rate": 8.778747871771293e-06, "loss": 4.8271, "step": 469 }, { "batch_num_effect_tokens": 8034, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.60989, "grad_norm": 0.137266144156456, "learning_rate": 8.771316433363139e-06, "loss": 4.9111, "step": 470 }, { "batch_num_effect_tokens": 8014, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.61119, "grad_norm": 0.1284547597169876, "learning_rate": 8.763865618868136e-06, "loss": 4.7998, "step": 471 }, { "batch_num_effect_tokens": 8026, "batch_num_samples": 14, "batch_num_tokens": 8191, "epoch": 0.61249, "grad_norm": 0.12678979337215424, "learning_rate": 8.756395466566718e-06, "loss": 4.707, "step": 472 }, { "batch_num_effect_tokens": 8008, "batch_num_samples": 20, "batch_num_tokens": 8192, "epoch": 0.61379, "grad_norm": 0.1220792606472969, "learning_rate": 8.748906014838672e-06, "loss": 4.6953, "step": 473 }, { "batch_num_effect_tokens": 7918, "batch_num_samples": 19, "batch_num_tokens": 8086, "epoch": 0.61509, "grad_norm": 0.1242067739367485, "learning_rate": 8.74139730216294e-06, "loss": 4.7539, "step": 474 }, { "batch_num_effect_tokens": 8079, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.61638, "grad_norm": 0.1328115165233612, "learning_rate": 8.73386936711742e-06, "loss": 4.8418, "step": 475 }, { "batch_num_effect_tokens": 7996, "batch_num_samples": 19, "batch_num_tokens": 8180, "epoch": 0.61768, "grad_norm": 0.1341410130262375, "learning_rate": 8.726322248378775e-06, "loss": 4.6699, "step": 476 }, { "batch_num_effect_tokens": 8045, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.61898, "grad_norm": 0.13784657418727875, "learning_rate": 8.718755984722224e-06, "loss": 4.7334, "step": 477 }, { "batch_num_effect_tokens": 7959, "batch_num_samples": 24, "batch_num_tokens": 8168, "epoch": 0.62028, "grad_norm": 0.1370074301958084, "learning_rate": 8.71117061502135e-06, "loss": 4.7334, "step": 478 }, { "batch_num_effect_tokens": 7899, "batch_num_samples": 16, "batch_num_tokens": 8078, "epoch": 0.62157, "grad_norm": 0.12483939528465271, "learning_rate": 8.7035661782479e-06, "loss": 4.6084, "step": 479 }, { "batch_num_effect_tokens": 8033, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.62287, "grad_norm": 0.14714112877845764, "learning_rate": 8.695942713471578e-06, "loss": 5.0137, "step": 480 }, { "batch_num_effect_tokens": 8063, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.62417, "grad_norm": 0.12480289489030838, "learning_rate": 8.688300259859855e-06, "loss": 4.5625, "step": 481 }, { "batch_num_effect_tokens": 7996, "batch_num_samples": 16, "batch_num_tokens": 8177, "epoch": 0.62547, "grad_norm": 0.132341668009758, "learning_rate": 8.680638856677754e-06, "loss": 4.8096, "step": 482 }, { "batch_num_effect_tokens": 7995, "batch_num_samples": 14, "batch_num_tokens": 8190, "epoch": 0.62676, "grad_norm": 0.1296975016593933, "learning_rate": 8.672958543287666e-06, "loss": 4.499, "step": 483 }, { "batch_num_effect_tokens": 8000, "batch_num_samples": 15, "batch_num_tokens": 8146, "epoch": 0.62806, "grad_norm": 0.12684115767478943, "learning_rate": 8.665259359149132e-06, "loss": 4.9092, "step": 484 }, { "batch_num_effect_tokens": 7810, "batch_num_samples": 17, "batch_num_tokens": 7989, "epoch": 0.62936, "grad_norm": 0.12434987723827362, "learning_rate": 8.657541343818646e-06, "loss": 4.5098, "step": 485 }, { "batch_num_effect_tokens": 8021, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.63066, "grad_norm": 0.13358746469020844, "learning_rate": 8.649804536949453e-06, "loss": 4.875, "step": 486 }, { "batch_num_effect_tokens": 7944, "batch_num_samples": 25, "batch_num_tokens": 8156, "epoch": 0.63195, "grad_norm": 0.13076664507389069, "learning_rate": 8.642048978291347e-06, "loss": 4.8301, "step": 487 }, { "batch_num_effect_tokens": 8046, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.63325, "grad_norm": 0.12482704967260361, "learning_rate": 8.634274707690458e-06, "loss": 5.208, "step": 488 }, { "batch_num_effect_tokens": 7996, "batch_num_samples": 23, "batch_num_tokens": 8192, "epoch": 0.63455, "grad_norm": 0.13764898478984833, "learning_rate": 8.626481765089058e-06, "loss": 5.4395, "step": 489 }, { "batch_num_effect_tokens": 8015, "batch_num_samples": 14, "batch_num_tokens": 8155, "epoch": 0.63585, "grad_norm": 0.12434025853872299, "learning_rate": 8.61867019052535e-06, "loss": 4.8242, "step": 490 }, { "batch_num_effect_tokens": 8027, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.63715, "grad_norm": 0.13372331857681274, "learning_rate": 8.610840024133266e-06, "loss": 4.9395, "step": 491 }, { "batch_num_effect_tokens": 8045, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.63844, "grad_norm": 0.12320306152105331, "learning_rate": 8.602991306142252e-06, "loss": 4.4512, "step": 492 }, { "batch_num_effect_tokens": 7924, "batch_num_samples": 16, "batch_num_tokens": 8100, "epoch": 0.63974, "grad_norm": 0.13152460753917694, "learning_rate": 8.595124076877074e-06, "loss": 4.8301, "step": 493 }, { "batch_num_effect_tokens": 8029, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.64104, "grad_norm": 0.1355685591697693, "learning_rate": 8.587238376757597e-06, "loss": 4.7451, "step": 494 }, { "batch_num_effect_tokens": 7968, "batch_num_samples": 15, "batch_num_tokens": 8100, "epoch": 0.64234, "grad_norm": 0.1284026801586151, "learning_rate": 8.579334246298593e-06, "loss": 4.9092, "step": 495 }, { "batch_num_effect_tokens": 7918, "batch_num_samples": 25, "batch_num_tokens": 8130, "epoch": 0.64363, "grad_norm": 0.13228504359722137, "learning_rate": 8.571411726109518e-06, "loss": 4.5225, "step": 496 }, { "batch_num_effect_tokens": 8025, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.64493, "grad_norm": 0.14529111981391907, "learning_rate": 8.563470856894316e-06, "loss": 4.9707, "step": 497 }, { "batch_num_effect_tokens": 7998, "batch_num_samples": 21, "batch_num_tokens": 8191, "epoch": 0.64623, "grad_norm": 0.13738293945789337, "learning_rate": 8.555511679451197e-06, "loss": 4.6738, "step": 498 }, { "batch_num_effect_tokens": 8020, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.64753, "grad_norm": 0.13121268153190613, "learning_rate": 8.547534234672435e-06, "loss": 4.4697, "step": 499 }, { "batch_num_effect_tokens": 8012, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.64882, "grad_norm": 0.1251576542854309, "learning_rate": 8.539538563544165e-06, "loss": 4.7764, "step": 500 }, { "batch_num_effect_tokens": 8033, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.65012, "grad_norm": 0.13326169550418854, "learning_rate": 8.531524707146154e-06, "loss": 4.6836, "step": 501 }, { "batch_num_effect_tokens": 7931, "batch_num_samples": 24, "batch_num_tokens": 8144, "epoch": 0.65142, "grad_norm": 0.13999317586421967, "learning_rate": 8.523492706651607e-06, "loss": 4.9355, "step": 502 }, { "batch_num_effect_tokens": 8051, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.65272, "grad_norm": 0.12973648309707642, "learning_rate": 8.515442603326948e-06, "loss": 4.7969, "step": 503 }, { "batch_num_effect_tokens": 8028, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.65401, "grad_norm": 0.1382308453321457, "learning_rate": 8.507374438531606e-06, "loss": 5.1699, "step": 504 }, { "batch_num_effect_tokens": 7991, "batch_num_samples": 15, "batch_num_tokens": 8152, "epoch": 0.65531, "grad_norm": 0.12315783649682999, "learning_rate": 8.49928825371781e-06, "loss": 4.8672, "step": 505 }, { "batch_num_effect_tokens": 8040, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.65661, "grad_norm": 0.14307758212089539, "learning_rate": 8.491184090430365e-06, "loss": 4.5527, "step": 506 }, { "batch_num_effect_tokens": 7951, "batch_num_samples": 14, "batch_num_tokens": 8091, "epoch": 0.65791, "grad_norm": 0.12455622851848602, "learning_rate": 8.483061990306451e-06, "loss": 4.9229, "step": 507 }, { "batch_num_effect_tokens": 7904, "batch_num_samples": 18, "batch_num_tokens": 8086, "epoch": 0.65921, "grad_norm": 0.13688203692436218, "learning_rate": 8.474921995075399e-06, "loss": 4.5957, "step": 508 }, { "batch_num_effect_tokens": 7943, "batch_num_samples": 16, "batch_num_tokens": 8100, "epoch": 0.6605, "grad_norm": 0.1287027895450592, "learning_rate": 8.466764146558482e-06, "loss": 4.9189, "step": 509 }, { "batch_num_effect_tokens": 8075, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.6618, "grad_norm": 0.12565134465694427, "learning_rate": 8.4585884866687e-06, "loss": 5.2285, "step": 510 }, { "batch_num_effect_tokens": 7930, "batch_num_samples": 16, "batch_num_tokens": 8089, "epoch": 0.6631, "grad_norm": 0.11928825825452805, "learning_rate": 8.450395057410561e-06, "loss": 4.7051, "step": 511 }, { "batch_num_effect_tokens": 8075, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.6644, "grad_norm": 0.1257006675004959, "learning_rate": 8.44218390087987e-06, "loss": 4.7148, "step": 512 }, { "batch_num_effect_tokens": 8003, "batch_num_samples": 19, "batch_num_tokens": 8192, "epoch": 0.66569, "grad_norm": 0.14077560603618622, "learning_rate": 8.433955059263508e-06, "loss": 4.8691, "step": 513 }, { "batch_num_effect_tokens": 8065, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.66699, "grad_norm": 0.1343725621700287, "learning_rate": 8.425708574839221e-06, "loss": 4.8486, "step": 514 }, { "batch_num_effect_tokens": 8054, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.66829, "grad_norm": 0.1511705070734024, "learning_rate": 8.417444489975396e-06, "loss": 4.584, "step": 515 }, { "batch_num_effect_tokens": 8077, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.66959, "grad_norm": 0.15018604695796967, "learning_rate": 8.409162847130847e-06, "loss": 5.0859, "step": 516 }, { "batch_num_effect_tokens": 8006, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.67088, "grad_norm": 0.13522522151470184, "learning_rate": 8.400863688854598e-06, "loss": 4.9492, "step": 517 }, { "batch_num_effect_tokens": 7935, "batch_num_samples": 14, "batch_num_tokens": 8092, "epoch": 0.67218, "grad_norm": 0.1357102394104004, "learning_rate": 8.392547057785662e-06, "loss": 4.4229, "step": 518 }, { "batch_num_effect_tokens": 7896, "batch_num_samples": 26, "batch_num_tokens": 8140, "epoch": 0.67348, "grad_norm": 0.12707726657390594, "learning_rate": 8.384212996652823e-06, "loss": 4.5303, "step": 519 }, { "batch_num_effect_tokens": 7996, "batch_num_samples": 14, "batch_num_tokens": 8135, "epoch": 0.67478, "grad_norm": 0.127869114279747, "learning_rate": 8.375861548274417e-06, "loss": 4.6426, "step": 520 }, { "batch_num_effect_tokens": 7982, "batch_num_samples": 24, "batch_num_tokens": 8192, "epoch": 0.67607, "grad_norm": 0.13051429390907288, "learning_rate": 8.367492755558111e-06, "loss": 4.9316, "step": 521 }, { "batch_num_effect_tokens": 7906, "batch_num_samples": 20, "batch_num_tokens": 8112, "epoch": 0.67737, "grad_norm": 0.13462385535240173, "learning_rate": 8.359106661500683e-06, "loss": 4.7568, "step": 522 }, { "batch_num_effect_tokens": 8046, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.67867, "grad_norm": 0.1249893382191658, "learning_rate": 8.3507033091878e-06, "loss": 4.5117, "step": 523 }, { "batch_num_effect_tokens": 8012, "batch_num_samples": 14, "batch_num_tokens": 8168, "epoch": 0.67997, "grad_norm": 0.13060161471366882, "learning_rate": 8.342282741793797e-06, "loss": 4.8574, "step": 524 }, { "batch_num_effect_tokens": 7949, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 0.68127, "grad_norm": 0.11890089511871338, "learning_rate": 8.33384500258146e-06, "loss": 4.6885, "step": 525 }, { "batch_num_effect_tokens": 7939, "batch_num_samples": 14, "batch_num_tokens": 8076, "epoch": 0.68256, "grad_norm": 0.12186378985643387, "learning_rate": 8.325390134901794e-06, "loss": 4.4736, "step": 526 }, { "batch_num_effect_tokens": 7961, "batch_num_samples": 17, "batch_num_tokens": 8118, "epoch": 0.68386, "grad_norm": 0.140080064535141, "learning_rate": 8.316918182193811e-06, "loss": 4.8838, "step": 527 }, { "batch_num_effect_tokens": 7969, "batch_num_samples": 21, "batch_num_tokens": 8176, "epoch": 0.68516, "grad_norm": 0.1309884935617447, "learning_rate": 8.308429187984298e-06, "loss": 4.8018, "step": 528 }, { "batch_num_effect_tokens": 7935, "batch_num_samples": 14, "batch_num_tokens": 8149, "epoch": 0.68646, "grad_norm": 0.1284397542476654, "learning_rate": 8.299923195887599e-06, "loss": 4.4141, "step": 529 }, { "batch_num_effect_tokens": 7875, "batch_num_samples": 15, "batch_num_tokens": 8074, "epoch": 0.68775, "grad_norm": 0.12711189687252045, "learning_rate": 8.291400249605387e-06, "loss": 4.6455, "step": 530 }, { "batch_num_effect_tokens": 8020, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.68905, "grad_norm": 0.13765956461429596, "learning_rate": 8.282860392926442e-06, "loss": 4.4688, "step": 531 }, { "batch_num_effect_tokens": 7934, "batch_num_samples": 22, "batch_num_tokens": 8146, "epoch": 0.69035, "grad_norm": 0.13526172935962677, "learning_rate": 8.274303669726427e-06, "loss": 4.6934, "step": 532 }, { "batch_num_effect_tokens": 7921, "batch_num_samples": 20, "batch_num_tokens": 8160, "epoch": 0.69165, "grad_norm": 0.13791659474372864, "learning_rate": 8.26573012396766e-06, "loss": 4.8594, "step": 533 }, { "batch_num_effect_tokens": 8051, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.69294, "grad_norm": 0.1339079588651657, "learning_rate": 8.257139799698887e-06, "loss": 5.1318, "step": 534 }, { "batch_num_effect_tokens": 7988, "batch_num_samples": 14, "batch_num_tokens": 8120, "epoch": 0.69424, "grad_norm": 0.13170425593852997, "learning_rate": 8.248532741055061e-06, "loss": 4.5645, "step": 535 }, { "batch_num_effect_tokens": 7932, "batch_num_samples": 18, "batch_num_tokens": 8100, "epoch": 0.69554, "grad_norm": 0.1507280021905899, "learning_rate": 8.239908992257114e-06, "loss": 4.7578, "step": 536 }, { "batch_num_effect_tokens": 8014, "batch_num_samples": 19, "batch_num_tokens": 8192, "epoch": 0.69684, "grad_norm": 0.13211563229560852, "learning_rate": 8.231268597611722e-06, "loss": 5.0664, "step": 537 }, { "batch_num_effect_tokens": 8060, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.69813, "grad_norm": 0.135163813829422, "learning_rate": 8.222611601511084e-06, "loss": 4.5693, "step": 538 }, { "batch_num_effect_tokens": 7743, "batch_num_samples": 29, "batch_num_tokens": 8007, "epoch": 0.69943, "grad_norm": 0.13044171035289764, "learning_rate": 8.213938048432697e-06, "loss": 4.8115, "step": 539 }, { "batch_num_effect_tokens": 8056, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.70073, "grad_norm": 0.13288696110248566, "learning_rate": 8.205247982939124e-06, "loss": 4.7236, "step": 540 }, { "batch_num_effect_tokens": 8062, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.70203, "grad_norm": 0.12817522883415222, "learning_rate": 8.196541449677758e-06, "loss": 4.7334, "step": 541 }, { "batch_num_effect_tokens": 8032, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.70333, "grad_norm": 0.1278115212917328, "learning_rate": 8.187818493380607e-06, "loss": 4.6318, "step": 542 }, { "batch_num_effect_tokens": 8031, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.70462, "grad_norm": 0.12950873374938965, "learning_rate": 8.179079158864053e-06, "loss": 4.8809, "step": 543 }, { "batch_num_effect_tokens": 7919, "batch_num_samples": 14, "batch_num_tokens": 8083, "epoch": 0.70592, "grad_norm": 0.1247512623667717, "learning_rate": 8.170323491028625e-06, "loss": 4.9658, "step": 544 }, { "batch_num_effect_tokens": 8047, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.70722, "grad_norm": 0.12909561395645142, "learning_rate": 8.161551534858767e-06, "loss": 4.7041, "step": 545 }, { "batch_num_effect_tokens": 7979, "batch_num_samples": 14, "batch_num_tokens": 8146, "epoch": 0.70852, "grad_norm": 0.12214743345975876, "learning_rate": 8.152763335422612e-06, "loss": 5.0234, "step": 546 }, { "batch_num_effect_tokens": 8029, "batch_num_samples": 19, "batch_num_tokens": 8192, "epoch": 0.70981, "grad_norm": 0.12689444422721863, "learning_rate": 8.143958937871748e-06, "loss": 4.8711, "step": 547 }, { "batch_num_effect_tokens": 7902, "batch_num_samples": 15, "batch_num_tokens": 8080, "epoch": 0.71111, "grad_norm": 0.12359411269426346, "learning_rate": 8.135138387440978e-06, "loss": 4.877, "step": 548 }, { "batch_num_effect_tokens": 7921, "batch_num_samples": 19, "batch_num_tokens": 8086, "epoch": 0.71241, "grad_norm": 0.1347743421792984, "learning_rate": 8.126301729448101e-06, "loss": 4.8076, "step": 549 }, { "batch_num_effect_tokens": 8059, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.71371, "grad_norm": 0.12478054314851761, "learning_rate": 8.117449009293668e-06, "loss": 4.6523, "step": 550 }, { "batch_num_effect_tokens": 8038, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.715, "grad_norm": 0.12902334332466125, "learning_rate": 8.108580272460759e-06, "loss": 4.6719, "step": 551 }, { "batch_num_effect_tokens": 8047, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.7163, "grad_norm": 0.1194981262087822, "learning_rate": 8.099695564514738e-06, "loss": 4.6465, "step": 552 }, { "batch_num_effect_tokens": 8058, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.7176, "grad_norm": 0.12877826392650604, "learning_rate": 8.090794931103026e-06, "loss": 4.7939, "step": 553 }, { "batch_num_effect_tokens": 8069, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.7189, "grad_norm": 0.13682805001735687, "learning_rate": 8.08187841795487e-06, "loss": 4.7344, "step": 554 }, { "batch_num_effect_tokens": 8019, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.72019, "grad_norm": 0.13331077992916107, "learning_rate": 8.072946070881095e-06, "loss": 5.084, "step": 555 }, { "batch_num_effect_tokens": 7942, "batch_num_samples": 14, "batch_num_tokens": 8149, "epoch": 0.72149, "grad_norm": 0.13391903042793274, "learning_rate": 8.063997935773885e-06, "loss": 4.6699, "step": 556 }, { "batch_num_effect_tokens": 7917, "batch_num_samples": 16, "batch_num_tokens": 8078, "epoch": 0.72279, "grad_norm": 0.1353442519903183, "learning_rate": 8.055034058606533e-06, "loss": 4.7354, "step": 557 }, { "batch_num_effect_tokens": 8003, "batch_num_samples": 14, "batch_num_tokens": 8146, "epoch": 0.72409, "grad_norm": 0.13279181718826294, "learning_rate": 8.046054485433211e-06, "loss": 4.7617, "step": 558 }, { "batch_num_effect_tokens": 8015, "batch_num_samples": 17, "batch_num_tokens": 8165, "epoch": 0.72539, "grad_norm": 0.13407373428344727, "learning_rate": 8.03705926238874e-06, "loss": 4.667, "step": 559 }, { "batch_num_effect_tokens": 8059, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.72668, "grad_norm": 0.12700553238391876, "learning_rate": 8.028048435688333e-06, "loss": 4.4395, "step": 560 }, { "batch_num_effect_tokens": 8022, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.72798, "grad_norm": 0.1360039860010147, "learning_rate": 8.019022051627387e-06, "loss": 4.7686, "step": 561 }, { "batch_num_effect_tokens": 7992, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.72928, "grad_norm": 0.121078722178936, "learning_rate": 8.009980156581218e-06, "loss": 4.6289, "step": 562 }, { "batch_num_effect_tokens": 7946, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.73058, "grad_norm": 0.12270597368478775, "learning_rate": 8.000922797004835e-06, "loss": 4.5605, "step": 563 }, { "batch_num_effect_tokens": 8048, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.73187, "grad_norm": 0.13452237844467163, "learning_rate": 7.991850019432701e-06, "loss": 4.6885, "step": 564 }, { "batch_num_effect_tokens": 8020, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.73317, "grad_norm": 0.12567798793315887, "learning_rate": 7.982761870478495e-06, "loss": 4.8379, "step": 565 }, { "batch_num_effect_tokens": 7995, "batch_num_samples": 22, "batch_num_tokens": 8192, "epoch": 0.73447, "grad_norm": 0.1310405284166336, "learning_rate": 7.973658396834868e-06, "loss": 4.6504, "step": 566 }, { "batch_num_effect_tokens": 7962, "batch_num_samples": 22, "batch_num_tokens": 8192, "epoch": 0.73577, "grad_norm": 0.13182175159454346, "learning_rate": 7.964539645273204e-06, "loss": 4.7881, "step": 567 }, { "batch_num_effect_tokens": 8022, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.73706, "grad_norm": 0.12408516556024551, "learning_rate": 7.955405662643384e-06, "loss": 4.458, "step": 568 }, { "batch_num_effect_tokens": 8038, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.73836, "grad_norm": 0.12731651961803436, "learning_rate": 7.946256495873542e-06, "loss": 5.0205, "step": 569 }, { "batch_num_effect_tokens": 8067, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.73966, "grad_norm": 0.13854657113552094, "learning_rate": 7.937092191969821e-06, "loss": 4.6074, "step": 570 }, { "batch_num_effect_tokens": 7958, "batch_num_samples": 21, "batch_num_tokens": 8192, "epoch": 0.74096, "grad_norm": 0.127966970205307, "learning_rate": 7.927912798016144e-06, "loss": 5.0039, "step": 571 }, { "batch_num_effect_tokens": 8051, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.74225, "grad_norm": 0.1232946589589119, "learning_rate": 7.918718361173951e-06, "loss": 4.749, "step": 572 }, { "batch_num_effect_tokens": 7932, "batch_num_samples": 23, "batch_num_tokens": 8181, "epoch": 0.74355, "grad_norm": 0.12766174972057343, "learning_rate": 7.909508928681975e-06, "loss": 5.0156, "step": 573 }, { "batch_num_effect_tokens": 8053, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.74485, "grad_norm": 0.1258310079574585, "learning_rate": 7.900284547855992e-06, "loss": 4.4893, "step": 574 }, { "batch_num_effect_tokens": 8059, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.74615, "grad_norm": 0.13721963763237, "learning_rate": 7.89104526608858e-06, "loss": 4.7783, "step": 575 }, { "batch_num_effect_tokens": 8019, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.74745, "grad_norm": 0.11561396718025208, "learning_rate": 7.881791130848872e-06, "loss": 4.6162, "step": 576 }, { "batch_num_effect_tokens": 8033, "batch_num_samples": 17, "batch_num_tokens": 8191, "epoch": 0.74874, "grad_norm": 0.11618711799383163, "learning_rate": 7.872522189682318e-06, "loss": 4.541, "step": 577 }, { "batch_num_effect_tokens": 8046, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.75004, "grad_norm": 0.1265118569135666, "learning_rate": 7.863238490210432e-06, "loss": 4.6934, "step": 578 }, { "batch_num_effect_tokens": 8046, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.75134, "grad_norm": 0.11919250339269638, "learning_rate": 7.853940080130556e-06, "loss": 4.4326, "step": 579 }, { "batch_num_effect_tokens": 7886, "batch_num_samples": 27, "batch_num_tokens": 8110, "epoch": 0.75264, "grad_norm": 0.13348767161369324, "learning_rate": 7.844627007215613e-06, "loss": 4.9668, "step": 580 }, { "batch_num_effect_tokens": 8026, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.75393, "grad_norm": 0.12527728080749512, "learning_rate": 7.835299319313854e-06, "loss": 4.8496, "step": 581 }, { "batch_num_effect_tokens": 7946, "batch_num_samples": 15, "batch_num_tokens": 8076, "epoch": 0.75523, "grad_norm": 0.12182778120040894, "learning_rate": 7.825957064348625e-06, "loss": 4.6016, "step": 582 }, { "batch_num_effect_tokens": 8000, "batch_num_samples": 17, "batch_num_tokens": 8165, "epoch": 0.75653, "grad_norm": 0.11762472242116928, "learning_rate": 7.81660029031811e-06, "loss": 4.6768, "step": 583 }, { "batch_num_effect_tokens": 8014, "batch_num_samples": 15, "batch_num_tokens": 8176, "epoch": 0.75783, "grad_norm": 0.1293763965368271, "learning_rate": 7.80722904529509e-06, "loss": 4.7266, "step": 584 }, { "batch_num_effect_tokens": 7942, "batch_num_samples": 23, "batch_num_tokens": 8191, "epoch": 0.75912, "grad_norm": 0.12573941051959991, "learning_rate": 7.797843377426693e-06, "loss": 4.6338, "step": 585 }, { "batch_num_effect_tokens": 8037, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.76042, "grad_norm": 0.1293158084154129, "learning_rate": 7.788443334934148e-06, "loss": 4.5762, "step": 586 }, { "batch_num_effect_tokens": 7907, "batch_num_samples": 15, "batch_num_tokens": 8098, "epoch": 0.76172, "grad_norm": 0.14639928936958313, "learning_rate": 7.779028966112538e-06, "loss": 5.0459, "step": 587 }, { "batch_num_effect_tokens": 7981, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.76302, "grad_norm": 0.1278136819601059, "learning_rate": 7.769600319330553e-06, "loss": 4.585, "step": 588 }, { "batch_num_effect_tokens": 8017, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.76431, "grad_norm": 0.12693235278129578, "learning_rate": 7.760157443030234e-06, "loss": 4.7744, "step": 589 }, { "batch_num_effect_tokens": 8044, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.76561, "grad_norm": 0.11962468177080154, "learning_rate": 7.750700385726736e-06, "loss": 4.6338, "step": 590 }, { "batch_num_effect_tokens": 8036, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.76691, "grad_norm": 0.13058899343013763, "learning_rate": 7.741229196008068e-06, "loss": 4.9893, "step": 591 }, { "batch_num_effect_tokens": 7970, "batch_num_samples": 14, "batch_num_tokens": 8123, "epoch": 0.76821, "grad_norm": 0.1270352005958557, "learning_rate": 7.731743922534854e-06, "loss": 4.5371, "step": 592 }, { "batch_num_effect_tokens": 8059, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.76951, "grad_norm": 0.1201217994093895, "learning_rate": 7.722244614040068e-06, "loss": 4.3867, "step": 593 }, { "batch_num_effect_tokens": 7950, "batch_num_samples": 16, "batch_num_tokens": 8122, "epoch": 0.7708, "grad_norm": 0.11798641830682755, "learning_rate": 7.712731319328798e-06, "loss": 4.873, "step": 594 }, { "batch_num_effect_tokens": 8027, "batch_num_samples": 17, "batch_num_tokens": 8187, "epoch": 0.7721, "grad_norm": 0.1351042240858078, "learning_rate": 7.703204087277989e-06, "loss": 4.4766, "step": 595 }, { "batch_num_effect_tokens": 8027, "batch_num_samples": 14, "batch_num_tokens": 8189, "epoch": 0.7734, "grad_norm": 0.12383313477039337, "learning_rate": 7.693662966836191e-06, "loss": 4.6631, "step": 596 }, { "batch_num_effect_tokens": 7880, "batch_num_samples": 17, "batch_num_tokens": 8062, "epoch": 0.7747, "grad_norm": 0.12391753494739532, "learning_rate": 7.684108007023313e-06, "loss": 4.5283, "step": 597 }, { "batch_num_effect_tokens": 7995, "batch_num_samples": 22, "batch_num_tokens": 8192, "epoch": 0.77599, "grad_norm": 0.12056957185268402, "learning_rate": 7.674539256930364e-06, "loss": 4.5322, "step": 598 }, { "batch_num_effect_tokens": 7999, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.77729, "grad_norm": 0.11704017966985703, "learning_rate": 7.6649567657192e-06, "loss": 4.6699, "step": 599 }, { "batch_num_effect_tokens": 8011, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.77859, "grad_norm": 0.12692949175834656, "learning_rate": 7.655360582622287e-06, "loss": 4.5049, "step": 600 }, { "batch_num_effect_tokens": 8021, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.77989, "grad_norm": 0.1289125233888626, "learning_rate": 7.645750756942425e-06, "loss": 4.8818, "step": 601 }, { "batch_num_effect_tokens": 7924, "batch_num_samples": 16, "batch_num_tokens": 8078, "epoch": 0.78118, "grad_norm": 0.12068326771259308, "learning_rate": 7.636127338052513e-06, "loss": 4.7988, "step": 602 }, { "batch_num_effect_tokens": 8053, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.78248, "grad_norm": 0.1243966817855835, "learning_rate": 7.626490375395286e-06, "loss": 4.6328, "step": 603 }, { "batch_num_effect_tokens": 8045, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.78378, "grad_norm": 0.12233472615480423, "learning_rate": 7.616839918483061e-06, "loss": 4.5869, "step": 604 }, { "batch_num_effect_tokens": 8008, "batch_num_samples": 16, "batch_num_tokens": 8166, "epoch": 0.78508, "grad_norm": 0.11933887749910355, "learning_rate": 7.607176016897491e-06, "loss": 4.7559, "step": 605 }, { "batch_num_effect_tokens": 7950, "batch_num_samples": 15, "batch_num_tokens": 8116, "epoch": 0.78637, "grad_norm": 0.11429915577173233, "learning_rate": 7.597498720289302e-06, "loss": 4.4414, "step": 606 }, { "batch_num_effect_tokens": 8041, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.78767, "grad_norm": 0.12575142085552216, "learning_rate": 7.587808078378036e-06, "loss": 4.5176, "step": 607 }, { "batch_num_effect_tokens": 7824, "batch_num_samples": 29, "batch_num_tokens": 8092, "epoch": 0.78897, "grad_norm": 0.1377941220998764, "learning_rate": 7.578104140951806e-06, "loss": 4.6582, "step": 608 }, { "batch_num_effect_tokens": 7923, "batch_num_samples": 18, "batch_num_tokens": 8081, "epoch": 0.79027, "grad_norm": 0.12251248210668564, "learning_rate": 7.568386957867033e-06, "loss": 4.5859, "step": 609 }, { "batch_num_effect_tokens": 8047, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.79157, "grad_norm": 0.12789161503314972, "learning_rate": 7.5586565790481855e-06, "loss": 4.7432, "step": 610 }, { "batch_num_effect_tokens": 8042, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.79286, "grad_norm": 0.14461970329284668, "learning_rate": 7.548913054487537e-06, "loss": 4.7646, "step": 611 }, { "batch_num_effect_tokens": 8057, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.79416, "grad_norm": 0.12491682916879654, "learning_rate": 7.539156434244892e-06, "loss": 4.6553, "step": 612 }, { "batch_num_effect_tokens": 7984, "batch_num_samples": 17, "batch_num_tokens": 8178, "epoch": 0.79546, "grad_norm": 0.12910866737365723, "learning_rate": 7.529386768447342e-06, "loss": 4.9033, "step": 613 }, { "batch_num_effect_tokens": 8033, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.79676, "grad_norm": 0.1255904883146286, "learning_rate": 7.519604107289004e-06, "loss": 4.7559, "step": 614 }, { "batch_num_effect_tokens": 8073, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.79805, "grad_norm": 0.12721942365169525, "learning_rate": 7.50980850103076e-06, "loss": 4.5977, "step": 615 }, { "batch_num_effect_tokens": 8052, "batch_num_samples": 14, "batch_num_tokens": 8191, "epoch": 0.79935, "grad_norm": 0.12462284415960312, "learning_rate": 7.500000000000001e-06, "loss": 4.3994, "step": 616 }, { "batch_num_effect_tokens": 8052, "batch_num_samples": 14, "batch_num_tokens": 8191, "epoch": 0.79935, "eval_eval_loss": 0.5898093581199646, "eval_eval_runtime": 115.3418, "eval_eval_samples_per_second": 43.349, "eval_eval_steps_per_second": 2.714, "step": 616 }, { "batch_num_effect_tokens": 8079, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.80065, "grad_norm": 0.12710371613502502, "learning_rate": 7.490178654590367e-06, "loss": 4.9082, "step": 617 }, { "batch_num_effect_tokens": 8061, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.80195, "grad_norm": 0.12795433402061462, "learning_rate": 7.480344515261495e-06, "loss": 4.6973, "step": 618 }, { "batch_num_effect_tokens": 8038, "batch_num_samples": 14, "batch_num_tokens": 8184, "epoch": 0.80324, "grad_norm": 0.13104230165481567, "learning_rate": 7.470497632538743e-06, "loss": 4.9326, "step": 619 }, { "batch_num_effect_tokens": 8043, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.80454, "grad_norm": 0.12019386142492294, "learning_rate": 7.460638057012956e-06, "loss": 4.665, "step": 620 }, { "batch_num_effect_tokens": 8012, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.80584, "grad_norm": 0.1315028965473175, "learning_rate": 7.450765839340175e-06, "loss": 4.9375, "step": 621 }, { "batch_num_effect_tokens": 8059, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.80714, "grad_norm": 0.13341425359249115, "learning_rate": 7.440881030241407e-06, "loss": 4.7939, "step": 622 }, { "batch_num_effect_tokens": 8028, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.80843, "grad_norm": 0.13055460155010223, "learning_rate": 7.430983680502344e-06, "loss": 4.9736, "step": 623 }, { "batch_num_effect_tokens": 8033, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.80973, "grad_norm": 0.12852461636066437, "learning_rate": 7.4210738409731095e-06, "loss": 4.6982, "step": 624 }, { "batch_num_effect_tokens": 7932, "batch_num_samples": 22, "batch_num_tokens": 8164, "epoch": 0.81103, "grad_norm": 0.13344988226890564, "learning_rate": 7.411151562567999e-06, "loss": 4.7471, "step": 625 }, { "batch_num_effect_tokens": 7929, "batch_num_samples": 26, "batch_num_tokens": 8138, "epoch": 0.81233, "grad_norm": 0.13381238281726837, "learning_rate": 7.401216896265208e-06, "loss": 4.6709, "step": 626 }, { "batch_num_effect_tokens": 7977, "batch_num_samples": 23, "batch_num_tokens": 8181, "epoch": 0.81363, "grad_norm": 0.13180480897426605, "learning_rate": 7.391269893106592e-06, "loss": 4.8457, "step": 627 }, { "batch_num_effect_tokens": 8007, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.81492, "grad_norm": 0.12136835604906082, "learning_rate": 7.381310604197375e-06, "loss": 4.8252, "step": 628 }, { "batch_num_effect_tokens": 7929, "batch_num_samples": 17, "batch_num_tokens": 8118, "epoch": 0.81622, "grad_norm": 0.1308746188879013, "learning_rate": 7.371339080705913e-06, "loss": 4.5479, "step": 629 }, { "batch_num_effect_tokens": 8059, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.81752, "grad_norm": 0.13940449059009552, "learning_rate": 7.361355373863415e-06, "loss": 4.7676, "step": 630 }, { "batch_num_effect_tokens": 8076, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.81882, "grad_norm": 0.1301494687795639, "learning_rate": 7.351359534963684e-06, "loss": 4.4824, "step": 631 }, { "batch_num_effect_tokens": 8070, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.82011, "grad_norm": 0.12944510579109192, "learning_rate": 7.3413516153628605e-06, "loss": 4.8672, "step": 632 }, { "batch_num_effect_tokens": 8009, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.82141, "grad_norm": 0.13300953805446625, "learning_rate": 7.331331666479149e-06, "loss": 4.9189, "step": 633 }, { "batch_num_effect_tokens": 8053, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.82271, "grad_norm": 0.12441015988588333, "learning_rate": 7.321299739792553e-06, "loss": 4.4316, "step": 634 }, { "batch_num_effect_tokens": 7972, "batch_num_samples": 20, "batch_num_tokens": 8192, "epoch": 0.82401, "grad_norm": 0.12126115709543228, "learning_rate": 7.311255886844624e-06, "loss": 4.5771, "step": 635 }, { "batch_num_effect_tokens": 7986, "batch_num_samples": 14, "batch_num_tokens": 8135, "epoch": 0.8253, "grad_norm": 0.12674327194690704, "learning_rate": 7.30120015923818e-06, "loss": 4.8574, "step": 636 }, { "batch_num_effect_tokens": 8045, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.8266, "grad_norm": 0.12705481052398682, "learning_rate": 7.291132608637053e-06, "loss": 4.334, "step": 637 }, { "batch_num_effect_tokens": 8010, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.8279, "grad_norm": 0.12920741736888885, "learning_rate": 7.281053286765816e-06, "loss": 4.6611, "step": 638 }, { "batch_num_effect_tokens": 7761, "batch_num_samples": 25, "batch_num_tokens": 8034, "epoch": 0.8292, "grad_norm": 0.1230921670794487, "learning_rate": 7.27096224540952e-06, "loss": 4.5332, "step": 639 }, { "batch_num_effect_tokens": 8042, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.83049, "grad_norm": 0.12442672997713089, "learning_rate": 7.260859536413429e-06, "loss": 4.7666, "step": 640 }, { "batch_num_effect_tokens": 7965, "batch_num_samples": 14, "batch_num_tokens": 8120, "epoch": 0.83179, "grad_norm": 0.13749492168426514, "learning_rate": 7.250745211682752e-06, "loss": 4.9414, "step": 641 }, { "batch_num_effect_tokens": 8023, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.83309, "grad_norm": 0.1354299634695053, "learning_rate": 7.240619323182378e-06, "loss": 4.9287, "step": 642 }, { "batch_num_effect_tokens": 7934, "batch_num_samples": 16, "batch_num_tokens": 8100, "epoch": 0.83439, "grad_norm": 0.1304435133934021, "learning_rate": 7.2304819229366015e-06, "loss": 4.4697, "step": 643 }, { "batch_num_effect_tokens": 7976, "batch_num_samples": 23, "batch_num_tokens": 8192, "epoch": 0.83569, "grad_norm": 0.12707528471946716, "learning_rate": 7.2203330630288714e-06, "loss": 5.0391, "step": 644 }, { "batch_num_effect_tokens": 7918, "batch_num_samples": 18, "batch_num_tokens": 8100, "epoch": 0.83698, "grad_norm": 0.12164284288883209, "learning_rate": 7.210172795601506e-06, "loss": 4.2236, "step": 645 }, { "batch_num_effect_tokens": 8054, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.83828, "grad_norm": 0.13518215715885162, "learning_rate": 7.200001172855436e-06, "loss": 4.7686, "step": 646 }, { "batch_num_effect_tokens": 8036, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.83958, "grad_norm": 0.12766359746456146, "learning_rate": 7.189818247049931e-06, "loss": 4.5146, "step": 647 }, { "batch_num_effect_tokens": 7961, "batch_num_samples": 15, "batch_num_tokens": 8116, "epoch": 0.84088, "grad_norm": 0.12365400791168213, "learning_rate": 7.179624070502334e-06, "loss": 4.9824, "step": 648 }, { "batch_num_effect_tokens": 7946, "batch_num_samples": 17, "batch_num_tokens": 8123, "epoch": 0.84217, "grad_norm": 0.13238734006881714, "learning_rate": 7.169418695587791e-06, "loss": 4.9043, "step": 649 }, { "batch_num_effect_tokens": 7933, "batch_num_samples": 14, "batch_num_tokens": 8122, "epoch": 0.84347, "grad_norm": 0.11986955255270004, "learning_rate": 7.159202174738984e-06, "loss": 4.3682, "step": 650 }, { "batch_num_effect_tokens": 7899, "batch_num_samples": 18, "batch_num_tokens": 8085, "epoch": 0.84477, "grad_norm": 0.1329929381608963, "learning_rate": 7.148974560445859e-06, "loss": 4.6943, "step": 651 }, { "batch_num_effect_tokens": 8054, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.84607, "grad_norm": 0.1225806325674057, "learning_rate": 7.138735905255355e-06, "loss": 4.8477, "step": 652 }, { "batch_num_effect_tokens": 7858, "batch_num_samples": 21, "batch_num_tokens": 8108, "epoch": 0.84736, "grad_norm": 0.12441520392894745, "learning_rate": 7.128486261771142e-06, "loss": 4.5928, "step": 653 }, { "batch_num_effect_tokens": 7978, "batch_num_samples": 16, "batch_num_tokens": 8133, "epoch": 0.84866, "grad_norm": 0.11729971319437027, "learning_rate": 7.1182256826533365e-06, "loss": 4.8398, "step": 654 }, { "batch_num_effect_tokens": 7973, "batch_num_samples": 21, "batch_num_tokens": 8159, "epoch": 0.84996, "grad_norm": 0.1261633038520813, "learning_rate": 7.107954220618251e-06, "loss": 4.9746, "step": 655 }, { "batch_num_effect_tokens": 8008, "batch_num_samples": 19, "batch_num_tokens": 8192, "epoch": 0.85126, "grad_norm": 0.13062052428722382, "learning_rate": 7.097671928438101e-06, "loss": 4.6182, "step": 656 }, { "batch_num_effect_tokens": 8062, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.85255, "grad_norm": 0.12817202508449554, "learning_rate": 7.08737885894075e-06, "loss": 4.6768, "step": 657 }, { "batch_num_effect_tokens": 7970, "batch_num_samples": 17, "batch_num_tokens": 8139, "epoch": 0.85385, "grad_norm": 0.13156555593013763, "learning_rate": 7.0770750650094335e-06, "loss": 4.7109, "step": 658 }, { "batch_num_effect_tokens": 8025, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.85515, "grad_norm": 0.13155829906463623, "learning_rate": 7.066760599582481e-06, "loss": 4.9395, "step": 659 }, { "batch_num_effect_tokens": 7992, "batch_num_samples": 22, "batch_num_tokens": 8182, "epoch": 0.85645, "grad_norm": 0.12426735460758209, "learning_rate": 7.056435515653059e-06, "loss": 4.5059, "step": 660 }, { "batch_num_effect_tokens": 8042, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.85775, "grad_norm": 0.11923466622829437, "learning_rate": 7.046099866268878e-06, "loss": 4.6162, "step": 661 }, { "batch_num_effect_tokens": 8047, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.85904, "grad_norm": 0.12440145760774612, "learning_rate": 7.03575370453194e-06, "loss": 4.6377, "step": 662 }, { "batch_num_effect_tokens": 7967, "batch_num_samples": 23, "batch_num_tokens": 8181, "epoch": 0.86034, "grad_norm": 0.12746423482894897, "learning_rate": 7.025397083598251e-06, "loss": 4.7217, "step": 663 }, { "batch_num_effect_tokens": 8011, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.86164, "grad_norm": 0.1230856403708458, "learning_rate": 7.015030056677559e-06, "loss": 4.3516, "step": 664 }, { "batch_num_effect_tokens": 7945, "batch_num_samples": 20, "batch_num_tokens": 8128, "epoch": 0.86294, "grad_norm": 0.14013619720935822, "learning_rate": 7.004652677033069e-06, "loss": 4.9609, "step": 665 }, { "batch_num_effect_tokens": 7995, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.86423, "grad_norm": 0.1289435476064682, "learning_rate": 6.9942649979811836e-06, "loss": 4.7041, "step": 666 }, { "batch_num_effect_tokens": 7973, "batch_num_samples": 22, "batch_num_tokens": 8192, "epoch": 0.86553, "grad_norm": 0.11872459203004837, "learning_rate": 6.983867072891213e-06, "loss": 4.5059, "step": 667 }, { "batch_num_effect_tokens": 7942, "batch_num_samples": 14, "batch_num_tokens": 8074, "epoch": 0.86683, "grad_norm": 0.11841870099306107, "learning_rate": 6.973458955185116e-06, "loss": 4.834, "step": 668 }, { "batch_num_effect_tokens": 8062, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.86813, "grad_norm": 0.13256226480007172, "learning_rate": 6.963040698337215e-06, "loss": 4.7764, "step": 669 }, { "batch_num_effect_tokens": 8065, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.86942, "grad_norm": 0.12503276765346527, "learning_rate": 6.952612355873922e-06, "loss": 4.5635, "step": 670 }, { "batch_num_effect_tokens": 8057, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.87072, "grad_norm": 0.13926607370376587, "learning_rate": 6.942173981373474e-06, "loss": 4.9668, "step": 671 }, { "batch_num_effect_tokens": 8012, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 0.87202, "grad_norm": 0.1426723152399063, "learning_rate": 6.931725628465643e-06, "loss": 4.9531, "step": 672 }, { "batch_num_effect_tokens": 7934, "batch_num_samples": 14, "batch_num_tokens": 8079, "epoch": 0.87332, "grad_norm": 0.12457671016454697, "learning_rate": 6.9212673508314734e-06, "loss": 4.6992, "step": 673 }, { "batch_num_effect_tokens": 7773, "batch_num_samples": 30, "batch_num_tokens": 8057, "epoch": 0.87461, "grad_norm": 0.1329599916934967, "learning_rate": 6.910799202202993e-06, "loss": 4.7793, "step": 674 }, { "batch_num_effect_tokens": 8075, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.87591, "grad_norm": 0.1251561939716339, "learning_rate": 6.900321236362952e-06, "loss": 4.7969, "step": 675 }, { "batch_num_effect_tokens": 7908, "batch_num_samples": 15, "batch_num_tokens": 8086, "epoch": 0.87721, "grad_norm": 0.12417805939912796, "learning_rate": 6.889833507144534e-06, "loss": 4.9121, "step": 676 }, { "batch_num_effect_tokens": 8005, "batch_num_samples": 14, "batch_num_tokens": 8156, "epoch": 0.87851, "grad_norm": 0.1217103824019432, "learning_rate": 6.879336068431086e-06, "loss": 4.8389, "step": 677 }, { "batch_num_effect_tokens": 8015, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.87981, "grad_norm": 0.1301388144493103, "learning_rate": 6.868828974155841e-06, "loss": 5.5527, "step": 678 }, { "batch_num_effect_tokens": 8023, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.8811, "grad_norm": 0.12873168289661407, "learning_rate": 6.858312278301638e-06, "loss": 4.6826, "step": 679 }, { "batch_num_effect_tokens": 8045, "batch_num_samples": 14, "batch_num_tokens": 8163, "epoch": 0.8824, "grad_norm": 0.12014926224946976, "learning_rate": 6.847786034900648e-06, "loss": 4.4951, "step": 680 }, { "batch_num_effect_tokens": 7913, "batch_num_samples": 14, "batch_num_tokens": 8088, "epoch": 0.8837, "grad_norm": 0.12555824220180511, "learning_rate": 6.837250298034095e-06, "loss": 4.5303, "step": 681 }, { "batch_num_effect_tokens": 8012, "batch_num_samples": 15, "batch_num_tokens": 8158, "epoch": 0.885, "grad_norm": 0.12343177944421768, "learning_rate": 6.8267051218319766e-06, "loss": 4.4834, "step": 682 }, { "batch_num_effect_tokens": 8055, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.88629, "grad_norm": 0.12695080041885376, "learning_rate": 6.816150560472787e-06, "loss": 4.4951, "step": 683 }, { "batch_num_effect_tokens": 7893, "batch_num_samples": 18, "batch_num_tokens": 8085, "epoch": 0.88759, "grad_norm": 0.12869073450565338, "learning_rate": 6.805586668183242e-06, "loss": 4.7705, "step": 684 }, { "batch_num_effect_tokens": 8022, "batch_num_samples": 14, "batch_num_tokens": 8142, "epoch": 0.88889, "grad_norm": 0.12758734822273254, "learning_rate": 6.7950134992379935e-06, "loss": 4.4922, "step": 685 }, { "batch_num_effect_tokens": 7884, "batch_num_samples": 17, "batch_num_tokens": 8087, "epoch": 0.89019, "grad_norm": 0.12447866797447205, "learning_rate": 6.78443110795936e-06, "loss": 4.5645, "step": 686 }, { "batch_num_effect_tokens": 8066, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.89148, "grad_norm": 0.1174447238445282, "learning_rate": 6.773839548717036e-06, "loss": 4.6055, "step": 687 }, { "batch_num_effect_tokens": 8047, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.89278, "grad_norm": 0.1315164715051651, "learning_rate": 6.7632388759278225e-06, "loss": 4.4893, "step": 688 }, { "batch_num_effect_tokens": 7998, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.89408, "grad_norm": 0.119374580681324, "learning_rate": 6.752629144055342e-06, "loss": 4.4414, "step": 689 }, { "batch_num_effect_tokens": 8038, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 0.89538, "grad_norm": 0.12017811089754105, "learning_rate": 6.742010407609759e-06, "loss": 4.8516, "step": 690 }, { "batch_num_effect_tokens": 8016, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.89667, "grad_norm": 0.11169980466365814, "learning_rate": 6.731382721147509e-06, "loss": 4.6357, "step": 691 }, { "batch_num_effect_tokens": 8004, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.89797, "grad_norm": 0.12770824134349823, "learning_rate": 6.720746139270997e-06, "loss": 4.7705, "step": 692 }, { "batch_num_effect_tokens": 8032, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.89927, "grad_norm": 0.12186800688505173, "learning_rate": 6.710100716628345e-06, "loss": 4.2812, "step": 693 }, { "batch_num_effect_tokens": 8063, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.90057, "grad_norm": 0.12909391522407532, "learning_rate": 6.699446507913083e-06, "loss": 4.7236, "step": 694 }, { "batch_num_effect_tokens": 7907, "batch_num_samples": 26, "batch_num_tokens": 8128, "epoch": 0.90187, "grad_norm": 0.1295677274465561, "learning_rate": 6.6887835678638944e-06, "loss": 4.6318, "step": 695 }, { "batch_num_effect_tokens": 8058, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.90316, "grad_norm": 0.12642303109169006, "learning_rate": 6.6781119512643136e-06, "loss": 4.3809, "step": 696 }, { "batch_num_effect_tokens": 7971, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 0.90446, "grad_norm": 0.12434201687574387, "learning_rate": 6.6674317129424535e-06, "loss": 4.5703, "step": 697 }, { "batch_num_effect_tokens": 8052, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.90576, "grad_norm": 0.11948221176862717, "learning_rate": 6.656742907770728e-06, "loss": 4.6201, "step": 698 }, { "batch_num_effect_tokens": 7950, "batch_num_samples": 19, "batch_num_tokens": 8131, "epoch": 0.90706, "grad_norm": 0.12394808977842331, "learning_rate": 6.6460455906655595e-06, "loss": 4.4463, "step": 699 }, { "batch_num_effect_tokens": 8064, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.90835, "grad_norm": 0.1280088871717453, "learning_rate": 6.635339816587109e-06, "loss": 4.7422, "step": 700 }, { "batch_num_effect_tokens": 7911, "batch_num_samples": 15, "batch_num_tokens": 8086, "epoch": 0.90965, "grad_norm": 0.1304391473531723, "learning_rate": 6.6246256405389805e-06, "loss": 4.2695, "step": 701 }, { "batch_num_effect_tokens": 7914, "batch_num_samples": 25, "batch_num_tokens": 8164, "epoch": 0.91095, "grad_norm": 0.1250067502260208, "learning_rate": 6.613903117567951e-06, "loss": 4.5664, "step": 702 }, { "batch_num_effect_tokens": 8039, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.91225, "grad_norm": 0.12943719327449799, "learning_rate": 6.6031723027636775e-06, "loss": 4.5186, "step": 703 }, { "batch_num_effect_tokens": 8023, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.91354, "grad_norm": 0.11747249215841293, "learning_rate": 6.592433251258423e-06, "loss": 4.7568, "step": 704 }, { "batch_num_effect_tokens": 8066, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.91484, "grad_norm": 0.13342788815498352, "learning_rate": 6.581686018226764e-06, "loss": 4.6963, "step": 705 }, { "batch_num_effect_tokens": 7763, "batch_num_samples": 28, "batch_num_tokens": 8008, "epoch": 0.91614, "grad_norm": 0.12844344973564148, "learning_rate": 6.570930658885314e-06, "loss": 4.6582, "step": 706 }, { "batch_num_effect_tokens": 8019, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.91744, "grad_norm": 0.14021001756191254, "learning_rate": 6.560167228492436e-06, "loss": 4.8984, "step": 707 }, { "batch_num_effect_tokens": 7956, "batch_num_samples": 14, "batch_num_tokens": 8114, "epoch": 0.91873, "grad_norm": 0.1323830634355545, "learning_rate": 6.549395782347963e-06, "loss": 4.7314, "step": 708 }, { "batch_num_effect_tokens": 7965, "batch_num_samples": 17, "batch_num_tokens": 8157, "epoch": 0.92003, "grad_norm": 0.12167170643806458, "learning_rate": 6.53861637579291e-06, "loss": 4.9531, "step": 709 }, { "batch_num_effect_tokens": 8009, "batch_num_samples": 15, "batch_num_tokens": 8156, "epoch": 0.92133, "grad_norm": 0.12227673083543777, "learning_rate": 6.527829064209187e-06, "loss": 4.8438, "step": 710 }, { "batch_num_effect_tokens": 8019, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.92263, "grad_norm": 0.12005326896905899, "learning_rate": 6.517033903019323e-06, "loss": 5.04, "step": 711 }, { "batch_num_effect_tokens": 8053, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.92393, "grad_norm": 0.11777986586093903, "learning_rate": 6.5062309476861714e-06, "loss": 4.9141, "step": 712 }, { "batch_num_effect_tokens": 7997, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.92522, "grad_norm": 0.12427664548158646, "learning_rate": 6.495420253712636e-06, "loss": 5.0312, "step": 713 }, { "batch_num_effect_tokens": 8026, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 0.92652, "grad_norm": 0.12834517657756805, "learning_rate": 6.484601876641375e-06, "loss": 4.7354, "step": 714 }, { "batch_num_effect_tokens": 7978, "batch_num_samples": 15, "batch_num_tokens": 8116, "epoch": 0.92782, "grad_norm": 0.13113918900489807, "learning_rate": 6.473775872054522e-06, "loss": 4.6543, "step": 715 }, { "batch_num_effect_tokens": 7916, "batch_num_samples": 17, "batch_num_tokens": 8087, "epoch": 0.92912, "grad_norm": 0.1328577995300293, "learning_rate": 6.4629422955733975e-06, "loss": 4.9062, "step": 716 }, { "batch_num_effect_tokens": 8018, "batch_num_samples": 14, "batch_num_tokens": 8191, "epoch": 0.93041, "grad_norm": 0.11206276714801788, "learning_rate": 6.452101202858229e-06, "loss": 4.6455, "step": 717 }, { "batch_num_effect_tokens": 7956, "batch_num_samples": 22, "batch_num_tokens": 8164, "epoch": 0.93171, "grad_norm": 0.12403670698404312, "learning_rate": 6.4412526496078555e-06, "loss": 4.5957, "step": 718 }, { "batch_num_effect_tokens": 7954, "batch_num_samples": 21, "batch_num_tokens": 8125, "epoch": 0.93301, "grad_norm": 0.142649307847023, "learning_rate": 6.430396691559446e-06, "loss": 4.876, "step": 719 }, { "batch_num_effect_tokens": 7999, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.93431, "grad_norm": 0.12546393275260925, "learning_rate": 6.419533384488221e-06, "loss": 4.5439, "step": 720 }, { "batch_num_effect_tokens": 7889, "batch_num_samples": 21, "batch_num_tokens": 8108, "epoch": 0.9356, "grad_norm": 0.12057095021009445, "learning_rate": 6.408662784207149e-06, "loss": 4.6611, "step": 721 }, { "batch_num_effect_tokens": 8021, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.9369, "grad_norm": 0.1193976029753685, "learning_rate": 6.397784946566676e-06, "loss": 4.7529, "step": 722 }, { "batch_num_effect_tokens": 7955, "batch_num_samples": 14, "batch_num_tokens": 8121, "epoch": 0.9382, "grad_norm": 0.12544573843479156, "learning_rate": 6.3868999274544264e-06, "loss": 4.9453, "step": 723 }, { "batch_num_effect_tokens": 7982, "batch_num_samples": 24, "batch_num_tokens": 8192, "epoch": 0.9395, "grad_norm": 0.1267952173948288, "learning_rate": 6.376007782794926e-06, "loss": 4.7207, "step": 724 }, { "batch_num_effect_tokens": 7882, "batch_num_samples": 24, "batch_num_tokens": 8096, "epoch": 0.94079, "grad_norm": 0.14192216098308563, "learning_rate": 6.365108568549308e-06, "loss": 5.0576, "step": 725 }, { "batch_num_effect_tokens": 7956, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.94209, "grad_norm": 0.12729580700397491, "learning_rate": 6.354202340715027e-06, "loss": 4.6826, "step": 726 }, { "batch_num_effect_tokens": 8062, "batch_num_samples": 14, "batch_num_tokens": 8188, "epoch": 0.94339, "grad_norm": 0.12701718509197235, "learning_rate": 6.34328915532557e-06, "loss": 4.8945, "step": 727 }, { "batch_num_effect_tokens": 8051, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.94469, "grad_norm": 0.13189218938350677, "learning_rate": 6.332369068450175e-06, "loss": 4.8848, "step": 728 }, { "batch_num_effect_tokens": 8031, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.94599, "grad_norm": 0.12214604765176773, "learning_rate": 6.321442136193535e-06, "loss": 4.6484, "step": 729 }, { "batch_num_effect_tokens": 7778, "batch_num_samples": 17, "batch_num_tokens": 7989, "epoch": 0.94728, "grad_norm": 0.12111053615808487, "learning_rate": 6.310508414695511e-06, "loss": 4.7383, "step": 730 }, { "batch_num_effect_tokens": 7963, "batch_num_samples": 21, "batch_num_tokens": 8192, "epoch": 0.94858, "grad_norm": 0.12662427127361298, "learning_rate": 6.29956796013085e-06, "loss": 4.6602, "step": 731 }, { "batch_num_effect_tokens": 8073, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.94988, "grad_norm": 0.11730318516492844, "learning_rate": 6.288620828708888e-06, "loss": 4.5, "step": 732 }, { "batch_num_effect_tokens": 8058, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.95118, "grad_norm": 0.12332963943481445, "learning_rate": 6.277667076673266e-06, "loss": 4.7041, "step": 733 }, { "batch_num_effect_tokens": 8024, "batch_num_samples": 15, "batch_num_tokens": 8167, "epoch": 0.95247, "grad_norm": 0.12476927042007446, "learning_rate": 6.266706760301641e-06, "loss": 4.5742, "step": 734 }, { "batch_num_effect_tokens": 8054, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.95377, "grad_norm": 0.13098326325416565, "learning_rate": 6.255739935905396e-06, "loss": 4.4307, "step": 735 }, { "batch_num_effect_tokens": 8045, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.95507, "grad_norm": 0.13105835020542145, "learning_rate": 6.244766659829351e-06, "loss": 4.8428, "step": 736 }, { "batch_num_effect_tokens": 8011, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.95637, "grad_norm": 0.1244327500462532, "learning_rate": 6.233786988451468e-06, "loss": 4.3555, "step": 737 }, { "batch_num_effect_tokens": 7967, "batch_num_samples": 22, "batch_num_tokens": 8192, "epoch": 0.95766, "grad_norm": 0.12690576910972595, "learning_rate": 6.222800978182576e-06, "loss": 4.7607, "step": 738 }, { "batch_num_effect_tokens": 8024, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 0.95896, "grad_norm": 0.12604181468486786, "learning_rate": 6.211808685466063e-06, "loss": 4.9053, "step": 739 }, { "batch_num_effect_tokens": 8039, "batch_num_samples": 15, "batch_num_tokens": 8176, "epoch": 0.96026, "grad_norm": 0.11882328987121582, "learning_rate": 6.200810166777598e-06, "loss": 4.167, "step": 740 }, { "batch_num_effect_tokens": 8009, "batch_num_samples": 19, "batch_num_tokens": 8192, "epoch": 0.96156, "grad_norm": 0.12758396565914154, "learning_rate": 6.189805478624838e-06, "loss": 4.5254, "step": 741 }, { "batch_num_effect_tokens": 7998, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.96285, "grad_norm": 0.12387480586767197, "learning_rate": 6.178794677547138e-06, "loss": 4.5957, "step": 742 }, { "batch_num_effect_tokens": 8029, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.96415, "grad_norm": 0.1272263377904892, "learning_rate": 6.167777820115254e-06, "loss": 4.5576, "step": 743 }, { "batch_num_effect_tokens": 8024, "batch_num_samples": 17, "batch_num_tokens": 8191, "epoch": 0.96545, "grad_norm": 0.1208052784204483, "learning_rate": 6.156754962931069e-06, "loss": 4.4629, "step": 744 }, { "batch_num_effect_tokens": 7940, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.96675, "grad_norm": 0.11877097934484482, "learning_rate": 6.145726162627278e-06, "loss": 4.6768, "step": 745 }, { "batch_num_effect_tokens": 7949, "batch_num_samples": 18, "batch_num_tokens": 8114, "epoch": 0.96805, "grad_norm": 0.12087776511907578, "learning_rate": 6.134691475867122e-06, "loss": 4.6719, "step": 746 }, { "batch_num_effect_tokens": 7937, "batch_num_samples": 14, "batch_num_tokens": 8079, "epoch": 0.96934, "grad_norm": 0.1275898814201355, "learning_rate": 6.123650959344075e-06, "loss": 4.666, "step": 747 }, { "batch_num_effect_tokens": 8026, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.97064, "grad_norm": 0.11717364192008972, "learning_rate": 6.112604669781572e-06, "loss": 4.6748, "step": 748 }, { "batch_num_effect_tokens": 8079, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.97194, "grad_norm": 0.1246257945895195, "learning_rate": 6.101552663932704e-06, "loss": 4.5859, "step": 749 }, { "batch_num_effect_tokens": 8062, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.97324, "grad_norm": 0.12895013391971588, "learning_rate": 6.090494998579929e-06, "loss": 4.7861, "step": 750 }, { "batch_num_effect_tokens": 7823, "batch_num_samples": 27, "batch_num_tokens": 8091, "epoch": 0.97453, "grad_norm": 0.13118872046470642, "learning_rate": 6.079431730534786e-06, "loss": 4.7031, "step": 751 }, { "batch_num_effect_tokens": 8030, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 0.97583, "grad_norm": 0.1227671355009079, "learning_rate": 6.0683629166375955e-06, "loss": 4.5049, "step": 752 }, { "batch_num_effect_tokens": 8047, "batch_num_samples": 14, "batch_num_tokens": 8189, "epoch": 0.97713, "grad_norm": 0.12133854627609253, "learning_rate": 6.057288613757178e-06, "loss": 4.7334, "step": 753 }, { "batch_num_effect_tokens": 8074, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.97843, "grad_norm": 0.12346214056015015, "learning_rate": 6.046208878790543e-06, "loss": 4.7197, "step": 754 }, { "batch_num_effect_tokens": 8002, "batch_num_samples": 18, "batch_num_tokens": 8191, "epoch": 0.97972, "grad_norm": 0.11389555037021637, "learning_rate": 6.035123768662622e-06, "loss": 4.7832, "step": 755 }, { "batch_num_effect_tokens": 8000, "batch_num_samples": 14, "batch_num_tokens": 8149, "epoch": 0.98102, "grad_norm": 0.1279493272304535, "learning_rate": 6.024033340325954e-06, "loss": 4.7656, "step": 756 }, { "batch_num_effect_tokens": 8020, "batch_num_samples": 21, "batch_num_tokens": 8192, "epoch": 0.98232, "grad_norm": 0.12431464344263077, "learning_rate": 6.012937650760406e-06, "loss": 4.96, "step": 757 }, { "batch_num_effect_tokens": 8076, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 0.98362, "grad_norm": 0.126203715801239, "learning_rate": 6.001836756972873e-06, "loss": 4.752, "step": 758 }, { "batch_num_effect_tokens": 7894, "batch_num_samples": 23, "batch_num_tokens": 8143, "epoch": 0.98491, "grad_norm": 0.12350256741046906, "learning_rate": 5.990730715996989e-06, "loss": 4.6533, "step": 759 }, { "batch_num_effect_tokens": 7952, "batch_num_samples": 20, "batch_num_tokens": 8192, "epoch": 0.98621, "grad_norm": 0.12190555781126022, "learning_rate": 5.979619584892834e-06, "loss": 4.6904, "step": 760 }, { "batch_num_effect_tokens": 7993, "batch_num_samples": 20, "batch_num_tokens": 8192, "epoch": 0.98751, "grad_norm": 0.12734700739383698, "learning_rate": 5.968503420746638e-06, "loss": 5.4248, "step": 761 }, { "batch_num_effect_tokens": 8041, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.98881, "grad_norm": 0.12115947902202606, "learning_rate": 5.957382280670494e-06, "loss": 4.6416, "step": 762 }, { "batch_num_effect_tokens": 8021, "batch_num_samples": 14, "batch_num_tokens": 8152, "epoch": 0.99011, "grad_norm": 0.12552732229232788, "learning_rate": 5.946256221802052e-06, "loss": 4.4473, "step": 763 }, { "batch_num_effect_tokens": 8026, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 0.9914, "grad_norm": 0.129593163728714, "learning_rate": 5.935125301304241e-06, "loss": 4.9512, "step": 764 }, { "batch_num_effect_tokens": 7873, "batch_num_samples": 27, "batch_num_tokens": 8120, "epoch": 0.9927, "grad_norm": 0.1321675032377243, "learning_rate": 5.9239895763649635e-06, "loss": 4.8701, "step": 765 }, { "batch_num_effect_tokens": 8043, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 0.994, "grad_norm": 0.12967081367969513, "learning_rate": 5.91284910419681e-06, "loss": 4.4424, "step": 766 }, { "batch_num_effect_tokens": 7949, "batch_num_samples": 14, "batch_num_tokens": 8088, "epoch": 0.9953, "grad_norm": 0.12642012536525726, "learning_rate": 5.901703942036755e-06, "loss": 5.1172, "step": 767 }, { "batch_num_effect_tokens": 7940, "batch_num_samples": 21, "batch_num_tokens": 8142, "epoch": 0.99659, "grad_norm": 0.13420720398426056, "learning_rate": 5.890554147145875e-06, "loss": 4.7734, "step": 768 }, { "batch_num_effect_tokens": 7937, "batch_num_samples": 18, "batch_num_tokens": 8090, "epoch": 0.99789, "grad_norm": 0.12632231414318085, "learning_rate": 5.879399776809047e-06, "loss": 4.4863, "step": 769 }, { "batch_num_effect_tokens": 8011, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.99919, "grad_norm": 0.12234952300786972, "learning_rate": 5.8682408883346535e-06, "loss": 4.6025, "step": 770 }, { "batch_num_effect_tokens": 8011, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 0.99919, "eval_eval_loss": 0.5797469019889832, "eval_eval_runtime": 114.9751, "eval_eval_samples_per_second": 43.488, "eval_eval_steps_per_second": 2.722, "step": 770 }, { "batch_num_effect_tokens": 8059, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.00049, "grad_norm": 0.13404805958271027, "learning_rate": 5.857077539054289e-06, "loss": 4.9434, "step": 771 }, { "batch_num_effect_tokens": 7897, "batch_num_samples": 18, "batch_num_tokens": 8142, "epoch": 1.00178, "grad_norm": 0.1201479434967041, "learning_rate": 5.8459097863224705e-06, "loss": 4.8154, "step": 772 }, { "batch_num_effect_tokens": 7988, "batch_num_samples": 20, "batch_num_tokens": 8192, "epoch": 1.00308, "grad_norm": 0.12970799207687378, "learning_rate": 5.834737687516336e-06, "loss": 4.8105, "step": 773 }, { "batch_num_effect_tokens": 8028, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.00438, "grad_norm": 0.12445977330207825, "learning_rate": 5.823561300035355e-06, "loss": 4.2812, "step": 774 }, { "batch_num_effect_tokens": 8019, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.00568, "grad_norm": 0.14487802982330322, "learning_rate": 5.812380681301031e-06, "loss": 4.6328, "step": 775 }, { "batch_num_effect_tokens": 8008, "batch_num_samples": 15, "batch_num_tokens": 8166, "epoch": 1.00697, "grad_norm": 0.12720316648483276, "learning_rate": 5.8011958887565986e-06, "loss": 4.6025, "step": 776 }, { "batch_num_effect_tokens": 8029, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 1.00827, "grad_norm": 0.13001519441604614, "learning_rate": 5.79000697986675e-06, "loss": 4.4268, "step": 777 }, { "batch_num_effect_tokens": 7976, "batch_num_samples": 23, "batch_num_tokens": 8192, "epoch": 1.00957, "grad_norm": 0.12651684880256653, "learning_rate": 5.778814012117315e-06, "loss": 4.4297, "step": 778 }, { "batch_num_effect_tokens": 7949, "batch_num_samples": 18, "batch_num_tokens": 8128, "epoch": 1.01087, "grad_norm": 0.13454923033714294, "learning_rate": 5.767617043014985e-06, "loss": 4.3477, "step": 779 }, { "batch_num_effect_tokens": 7983, "batch_num_samples": 21, "batch_num_tokens": 8192, "epoch": 1.01217, "grad_norm": 0.14765796065330505, "learning_rate": 5.756416130087002e-06, "loss": 4.8281, "step": 780 }, { "batch_num_effect_tokens": 8000, "batch_num_samples": 20, "batch_num_tokens": 8186, "epoch": 1.01346, "grad_norm": 0.14546914398670197, "learning_rate": 5.745211330880872e-06, "loss": 4.3789, "step": 781 }, { "batch_num_effect_tokens": 8026, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.01476, "grad_norm": 0.11960723996162415, "learning_rate": 5.7340027029640755e-06, "loss": 4.3809, "step": 782 }, { "batch_num_effect_tokens": 8010, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.01606, "grad_norm": 0.14267292618751526, "learning_rate": 5.7227903039237535e-06, "loss": 4.5361, "step": 783 }, { "batch_num_effect_tokens": 7992, "batch_num_samples": 14, "batch_num_tokens": 8155, "epoch": 1.01736, "grad_norm": 0.1270764023065567, "learning_rate": 5.711574191366427e-06, "loss": 4.5635, "step": 784 }, { "batch_num_effect_tokens": 8066, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.01865, "grad_norm": 0.14421948790550232, "learning_rate": 5.7003544229176955e-06, "loss": 5.0654, "step": 785 }, { "batch_num_effect_tokens": 8016, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.01995, "grad_norm": 0.12795937061309814, "learning_rate": 5.689131056221944e-06, "loss": 4.3809, "step": 786 }, { "batch_num_effect_tokens": 8012, "batch_num_samples": 14, "batch_num_tokens": 8184, "epoch": 1.02125, "grad_norm": 0.1386982649564743, "learning_rate": 5.677904148942039e-06, "loss": 4.5127, "step": 787 }, { "batch_num_effect_tokens": 8040, "batch_num_samples": 15, "batch_num_tokens": 8190, "epoch": 1.02255, "grad_norm": 0.1462188959121704, "learning_rate": 5.666673758759045e-06, "loss": 4.3438, "step": 788 }, { "batch_num_effect_tokens": 8048, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.02384, "grad_norm": 0.16186738014221191, "learning_rate": 5.655439943371912e-06, "loss": 4.7744, "step": 789 }, { "batch_num_effect_tokens": 8051, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.02514, "grad_norm": 0.12250373512506485, "learning_rate": 5.644202760497195e-06, "loss": 4.2549, "step": 790 }, { "batch_num_effect_tokens": 7968, "batch_num_samples": 22, "batch_num_tokens": 8146, "epoch": 1.02644, "grad_norm": 0.17847490310668945, "learning_rate": 5.632962267868747e-06, "loss": 4.7627, "step": 791 }, { "batch_num_effect_tokens": 8061, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.02774, "grad_norm": 0.1615590900182724, "learning_rate": 5.621718523237427e-06, "loss": 4.8848, "step": 792 }, { "batch_num_effect_tokens": 8056, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.02903, "grad_norm": 0.13113276660442352, "learning_rate": 5.6104715843708e-06, "loss": 4.4883, "step": 793 }, { "batch_num_effect_tokens": 8022, "batch_num_samples": 14, "batch_num_tokens": 8184, "epoch": 1.03033, "grad_norm": 0.13856875896453857, "learning_rate": 5.599221509052844e-06, "loss": 4.5146, "step": 794 }, { "batch_num_effect_tokens": 8062, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.03163, "grad_norm": 0.13490043580532074, "learning_rate": 5.587968355083654e-06, "loss": 4.5322, "step": 795 }, { "batch_num_effect_tokens": 8043, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.03293, "grad_norm": 0.12667182087898254, "learning_rate": 5.576712180279134e-06, "loss": 4.5234, "step": 796 }, { "batch_num_effect_tokens": 8051, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.03423, "grad_norm": 0.12577715516090393, "learning_rate": 5.565453042470717e-06, "loss": 4.5273, "step": 797 }, { "batch_num_effect_tokens": 8031, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.03552, "grad_norm": 0.1268448680639267, "learning_rate": 5.5541909995050554e-06, "loss": 4.7002, "step": 798 }, { "batch_num_effect_tokens": 7978, "batch_num_samples": 15, "batch_num_tokens": 8140, "epoch": 1.03682, "grad_norm": 0.12386433035135269, "learning_rate": 5.542926109243727e-06, "loss": 4.5459, "step": 799 }, { "batch_num_effect_tokens": 8010, "batch_num_samples": 19, "batch_num_tokens": 8192, "epoch": 1.03812, "grad_norm": 0.13546870648860931, "learning_rate": 5.53165842956294e-06, "loss": 4.6445, "step": 800 }, { "batch_num_effect_tokens": 8041, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.03942, "grad_norm": 0.12320155650377274, "learning_rate": 5.520388018353233e-06, "loss": 4.2441, "step": 801 }, { "batch_num_effect_tokens": 7989, "batch_num_samples": 23, "batch_num_tokens": 8192, "epoch": 1.04071, "grad_norm": 0.14361971616744995, "learning_rate": 5.509114933519179e-06, "loss": 4.4756, "step": 802 }, { "batch_num_effect_tokens": 8043, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.04201, "grad_norm": 0.13132131099700928, "learning_rate": 5.497839232979084e-06, "loss": 4.2627, "step": 803 }, { "batch_num_effect_tokens": 8047, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.04331, "grad_norm": 0.1350114941596985, "learning_rate": 5.4865609746647e-06, "loss": 4.957, "step": 804 }, { "batch_num_effect_tokens": 8043, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.04461, "grad_norm": 0.12061459571123123, "learning_rate": 5.475280216520913e-06, "loss": 4.4453, "step": 805 }, { "batch_num_effect_tokens": 8024, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.0459, "grad_norm": 0.12335465103387833, "learning_rate": 5.463997016505459e-06, "loss": 4.1699, "step": 806 }, { "batch_num_effect_tokens": 8044, "batch_num_samples": 14, "batch_num_tokens": 8189, "epoch": 1.0472, "grad_norm": 0.12723498046398163, "learning_rate": 5.4527114325886145e-06, "loss": 4.1455, "step": 807 }, { "batch_num_effect_tokens": 7996, "batch_num_samples": 22, "batch_num_tokens": 8192, "epoch": 1.0485, "grad_norm": 0.13448134064674377, "learning_rate": 5.441423522752904e-06, "loss": 4.625, "step": 808 }, { "batch_num_effect_tokens": 8014, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.0498, "grad_norm": 0.12482011318206787, "learning_rate": 5.430133344992807e-06, "loss": 4.5391, "step": 809 }, { "batch_num_effect_tokens": 8003, "batch_num_samples": 16, "batch_num_tokens": 8190, "epoch": 1.05109, "grad_norm": 0.12930616736412048, "learning_rate": 5.418840957314451e-06, "loss": 4.1719, "step": 810 }, { "batch_num_effect_tokens": 7976, "batch_num_samples": 16, "batch_num_tokens": 8189, "epoch": 1.05239, "grad_norm": 0.13999834656715393, "learning_rate": 5.4075464177353165e-06, "loss": 4.7783, "step": 811 }, { "batch_num_effect_tokens": 8026, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.05369, "grad_norm": 0.13308821618556976, "learning_rate": 5.396249784283943e-06, "loss": 4.3184, "step": 812 }, { "batch_num_effect_tokens": 7997, "batch_num_samples": 14, "batch_num_tokens": 8149, "epoch": 1.05499, "grad_norm": 0.12184661626815796, "learning_rate": 5.3849511149996255e-06, "loss": 4.5986, "step": 813 }, { "batch_num_effect_tokens": 8004, "batch_num_samples": 19, "batch_num_tokens": 8191, "epoch": 1.05629, "grad_norm": 0.13552594184875488, "learning_rate": 5.373650467932122e-06, "loss": 4.4873, "step": 814 }, { "batch_num_effect_tokens": 8017, "batch_num_samples": 20, "batch_num_tokens": 8192, "epoch": 1.05758, "grad_norm": 0.12386645376682281, "learning_rate": 5.362347901141348e-06, "loss": 4.4834, "step": 815 }, { "batch_num_effect_tokens": 8005, "batch_num_samples": 19, "batch_num_tokens": 8192, "epoch": 1.05888, "grad_norm": 0.12549902498722076, "learning_rate": 5.351043472697082e-06, "loss": 4.9111, "step": 816 }, { "batch_num_effect_tokens": 7901, "batch_num_samples": 22, "batch_num_tokens": 8128, "epoch": 1.06018, "grad_norm": 0.12777559459209442, "learning_rate": 5.339737240678671e-06, "loss": 4.4678, "step": 817 }, { "batch_num_effect_tokens": 7908, "batch_num_samples": 20, "batch_num_tokens": 8112, "epoch": 1.06148, "grad_norm": 0.14045743644237518, "learning_rate": 5.328429263174725e-06, "loss": 4.4395, "step": 818 }, { "batch_num_effect_tokens": 7903, "batch_num_samples": 21, "batch_num_tokens": 8125, "epoch": 1.06277, "grad_norm": 0.13065417110919952, "learning_rate": 5.317119598282823e-06, "loss": 4.668, "step": 819 }, { "batch_num_effect_tokens": 7937, "batch_num_samples": 14, "batch_num_tokens": 8074, "epoch": 1.06407, "grad_norm": 0.1203104555606842, "learning_rate": 5.3058083041092145e-06, "loss": 4.0527, "step": 820 }, { "batch_num_effect_tokens": 7985, "batch_num_samples": 24, "batch_num_tokens": 8192, "epoch": 1.06537, "grad_norm": 0.1304715871810913, "learning_rate": 5.294495438768517e-06, "loss": 4.2881, "step": 821 }, { "batch_num_effect_tokens": 7990, "batch_num_samples": 14, "batch_num_tokens": 8155, "epoch": 1.06667, "grad_norm": 0.1247849240899086, "learning_rate": 5.283181060383423e-06, "loss": 4.2393, "step": 822 }, { "batch_num_effect_tokens": 8026, "batch_num_samples": 15, "batch_num_tokens": 8166, "epoch": 1.06796, "grad_norm": 0.11411383748054504, "learning_rate": 5.271865227084397e-06, "loss": 4.7168, "step": 823 }, { "batch_num_effect_tokens": 7985, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.06926, "grad_norm": 0.1270046830177307, "learning_rate": 5.260547997009379e-06, "loss": 4.5264, "step": 824 }, { "batch_num_effect_tokens": 7917, "batch_num_samples": 17, "batch_num_tokens": 8079, "epoch": 1.07056, "grad_norm": 0.12649409472942352, "learning_rate": 5.249229428303486e-06, "loss": 4.5293, "step": 825 }, { "batch_num_effect_tokens": 8027, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.07186, "grad_norm": 0.1309773325920105, "learning_rate": 5.237909579118713e-06, "loss": 4.2207, "step": 826 }, { "batch_num_effect_tokens": 8064, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.07315, "grad_norm": 0.12407130748033524, "learning_rate": 5.226588507613629e-06, "loss": 4.4414, "step": 827 }, { "batch_num_effect_tokens": 7999, "batch_num_samples": 15, "batch_num_tokens": 8152, "epoch": 1.07445, "grad_norm": 0.12215977907180786, "learning_rate": 5.21526627195309e-06, "loss": 4.9326, "step": 828 }, { "batch_num_effect_tokens": 7995, "batch_num_samples": 15, "batch_num_tokens": 8126, "epoch": 1.07575, "grad_norm": 0.12580524384975433, "learning_rate": 5.2039429303079294e-06, "loss": 4.4629, "step": 829 }, { "batch_num_effect_tokens": 8062, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.07705, "grad_norm": 0.12904728949069977, "learning_rate": 5.1926185408546604e-06, "loss": 4.3467, "step": 830 }, { "batch_num_effect_tokens": 7957, "batch_num_samples": 15, "batch_num_tokens": 8086, "epoch": 1.07835, "grad_norm": 0.13490159809589386, "learning_rate": 5.181293161775186e-06, "loss": 4.4609, "step": 831 }, { "batch_num_effect_tokens": 7975, "batch_num_samples": 16, "batch_num_tokens": 8133, "epoch": 1.07964, "grad_norm": 0.13504654169082642, "learning_rate": 5.169966851256489e-06, "loss": 4.2334, "step": 832 }, { "batch_num_effect_tokens": 8011, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.08094, "grad_norm": 0.1325363963842392, "learning_rate": 5.15863966749034e-06, "loss": 4.5322, "step": 833 }, { "batch_num_effect_tokens": 8060, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.08224, "grad_norm": 0.13651366531848907, "learning_rate": 5.147311668672991e-06, "loss": 4.6211, "step": 834 }, { "batch_num_effect_tokens": 7933, "batch_num_samples": 14, "batch_num_tokens": 8092, "epoch": 1.08354, "grad_norm": 0.15273743867874146, "learning_rate": 5.135982913004889e-06, "loss": 4.9326, "step": 835 }, { "batch_num_effect_tokens": 8011, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.08483, "grad_norm": 0.12992307543754578, "learning_rate": 5.1246534586903655e-06, "loss": 4.623, "step": 836 }, { "batch_num_effect_tokens": 8059, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.08613, "grad_norm": 0.13165108859539032, "learning_rate": 5.11332336393734e-06, "loss": 4.5762, "step": 837 }, { "batch_num_effect_tokens": 8051, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.08743, "grad_norm": 0.125563845038414, "learning_rate": 5.101992686957028e-06, "loss": 4.0518, "step": 838 }, { "batch_num_effect_tokens": 8055, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.08873, "grad_norm": 0.12698976695537567, "learning_rate": 5.090661485963628e-06, "loss": 4.3701, "step": 839 }, { "batch_num_effect_tokens": 7922, "batch_num_samples": 15, "batch_num_tokens": 8080, "epoch": 1.09002, "grad_norm": 0.12403866648674011, "learning_rate": 5.07932981917404e-06, "loss": 4.1211, "step": 840 }, { "batch_num_effect_tokens": 8022, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 1.09132, "grad_norm": 0.12252990156412125, "learning_rate": 5.06799774480755e-06, "loss": 4.2109, "step": 841 }, { "batch_num_effect_tokens": 8020, "batch_num_samples": 16, "batch_num_tokens": 8189, "epoch": 1.09262, "grad_norm": 0.11591480672359467, "learning_rate": 5.056665321085542e-06, "loss": 4.6582, "step": 842 }, { "batch_num_effect_tokens": 8047, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.09392, "grad_norm": 0.12528832256793976, "learning_rate": 5.045332606231191e-06, "loss": 4.5166, "step": 843 }, { "batch_num_effect_tokens": 8025, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.09521, "grad_norm": 0.1252938210964203, "learning_rate": 5.033999658469174e-06, "loss": 4.1709, "step": 844 }, { "batch_num_effect_tokens": 7936, "batch_num_samples": 14, "batch_num_tokens": 8146, "epoch": 1.09651, "grad_norm": 0.12733793258666992, "learning_rate": 5.022666536025359e-06, "loss": 4.4521, "step": 845 }, { "batch_num_effect_tokens": 8024, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.09781, "grad_norm": 0.12023656815290451, "learning_rate": 5.011333297126513e-06, "loss": 4.3408, "step": 846 }, { "batch_num_effect_tokens": 7998, "batch_num_samples": 17, "batch_num_tokens": 8171, "epoch": 1.09911, "grad_norm": 0.11743319034576416, "learning_rate": 5e-06, "loss": 4.5605, "step": 847 }, { "batch_num_effect_tokens": 8019, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.10041, "grad_norm": 0.12908881902694702, "learning_rate": 4.98866670287349e-06, "loss": 4.5234, "step": 848 }, { "batch_num_effect_tokens": 8004, "batch_num_samples": 17, "batch_num_tokens": 8152, "epoch": 1.1017, "grad_norm": 0.1330942064523697, "learning_rate": 4.977333463974643e-06, "loss": 4.6748, "step": 849 }, { "batch_num_effect_tokens": 7978, "batch_num_samples": 15, "batch_num_tokens": 8116, "epoch": 1.103, "grad_norm": 0.13068662583827972, "learning_rate": 4.966000341530827e-06, "loss": 4.543, "step": 850 }, { "batch_num_effect_tokens": 7926, "batch_num_samples": 17, "batch_num_tokens": 8074, "epoch": 1.1043, "grad_norm": 0.12920166552066803, "learning_rate": 4.9546673937688086e-06, "loss": 4.1533, "step": 851 }, { "batch_num_effect_tokens": 7905, "batch_num_samples": 14, "batch_num_tokens": 8083, "epoch": 1.1056, "grad_norm": 0.12737612426280975, "learning_rate": 4.94333467891446e-06, "loss": 4.5225, "step": 852 }, { "batch_num_effect_tokens": 8035, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.10689, "grad_norm": 0.11825668811798096, "learning_rate": 4.932002255192452e-06, "loss": 4.542, "step": 853 }, { "batch_num_effect_tokens": 7872, "batch_num_samples": 23, "batch_num_tokens": 8086, "epoch": 1.10819, "grad_norm": 0.1468936651945114, "learning_rate": 4.9206701808259605e-06, "loss": 5.1279, "step": 854 }, { "batch_num_effect_tokens": 7952, "batch_num_samples": 15, "batch_num_tokens": 8116, "epoch": 1.10949, "grad_norm": 0.1266355961561203, "learning_rate": 4.909338514036373e-06, "loss": 4.3125, "step": 855 }, { "batch_num_effect_tokens": 7974, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.11079, "grad_norm": 0.13342653214931488, "learning_rate": 4.898007313042975e-06, "loss": 4.3066, "step": 856 }, { "batch_num_effect_tokens": 8038, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.11208, "grad_norm": 0.12773488461971283, "learning_rate": 4.8866766360626615e-06, "loss": 4.4775, "step": 857 }, { "batch_num_effect_tokens": 8026, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.11338, "grad_norm": 0.13515350222587585, "learning_rate": 4.875346541309637e-06, "loss": 4.3096, "step": 858 }, { "batch_num_effect_tokens": 7949, "batch_num_samples": 17, "batch_num_tokens": 8139, "epoch": 1.11468, "grad_norm": 0.13417378067970276, "learning_rate": 4.864017086995112e-06, "loss": 4.7119, "step": 859 }, { "batch_num_effect_tokens": 8003, "batch_num_samples": 21, "batch_num_tokens": 8192, "epoch": 1.11598, "grad_norm": 0.13580955564975739, "learning_rate": 4.852688331327011e-06, "loss": 4.8125, "step": 860 }, { "batch_num_effect_tokens": 7911, "batch_num_samples": 16, "batch_num_tokens": 8144, "epoch": 1.11727, "grad_norm": 0.12650568783283234, "learning_rate": 4.841360332509663e-06, "loss": 4.5361, "step": 861 }, { "batch_num_effect_tokens": 8036, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.11857, "grad_norm": 0.1183309480547905, "learning_rate": 4.830033148743512e-06, "loss": 4.0146, "step": 862 }, { "batch_num_effect_tokens": 8064, "batch_num_samples": 14, "batch_num_tokens": 8189, "epoch": 1.11987, "grad_norm": 0.126793771982193, "learning_rate": 4.818706838224815e-06, "loss": 4.502, "step": 863 }, { "batch_num_effect_tokens": 7941, "batch_num_samples": 14, "batch_num_tokens": 8073, "epoch": 1.12117, "grad_norm": 0.125539630651474, "learning_rate": 4.8073814591453395e-06, "loss": 4.2773, "step": 864 }, { "batch_num_effect_tokens": 8039, "batch_num_samples": 15, "batch_num_tokens": 8176, "epoch": 1.12247, "grad_norm": 0.13130030035972595, "learning_rate": 4.796057069692073e-06, "loss": 4.6611, "step": 865 }, { "batch_num_effect_tokens": 7915, "batch_num_samples": 18, "batch_num_tokens": 8090, "epoch": 1.12376, "grad_norm": 0.12117452919483185, "learning_rate": 4.784733728046912e-06, "loss": 4.3535, "step": 866 }, { "batch_num_effect_tokens": 7944, "batch_num_samples": 15, "batch_num_tokens": 8096, "epoch": 1.12506, "grad_norm": 0.12501753866672516, "learning_rate": 4.773411492386372e-06, "loss": 4.7051, "step": 867 }, { "batch_num_effect_tokens": 8019, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.12636, "grad_norm": 0.12537001073360443, "learning_rate": 4.762090420881289e-06, "loss": 4.7852, "step": 868 }, { "batch_num_effect_tokens": 8001, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.12766, "grad_norm": 0.11610813438892365, "learning_rate": 4.750770571696514e-06, "loss": 4.1914, "step": 869 }, { "batch_num_effect_tokens": 8023, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.12895, "grad_norm": 0.12504693865776062, "learning_rate": 4.739452002990621e-06, "loss": 3.9814, "step": 870 }, { "batch_num_effect_tokens": 7964, "batch_num_samples": 23, "batch_num_tokens": 8143, "epoch": 1.13025, "grad_norm": 0.122451052069664, "learning_rate": 4.728134772915605e-06, "loss": 4.2246, "step": 871 }, { "batch_num_effect_tokens": 7910, "batch_num_samples": 15, "batch_num_tokens": 8089, "epoch": 1.13155, "grad_norm": 0.1333150565624237, "learning_rate": 4.716818939616578e-06, "loss": 4.5938, "step": 872 }, { "batch_num_effect_tokens": 8021, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.13285, "grad_norm": 0.13619881868362427, "learning_rate": 4.705504561231485e-06, "loss": 4.6348, "step": 873 }, { "batch_num_effect_tokens": 8014, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.13414, "grad_norm": 0.1358458697795868, "learning_rate": 4.694191695890788e-06, "loss": 4.4453, "step": 874 }, { "batch_num_effect_tokens": 7844, "batch_num_samples": 20, "batch_num_tokens": 8080, "epoch": 1.13544, "grad_norm": 0.12052467465400696, "learning_rate": 4.682880401717178e-06, "loss": 4.4482, "step": 875 }, { "batch_num_effect_tokens": 8046, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.13674, "grad_norm": 0.11595940589904785, "learning_rate": 4.671570736825277e-06, "loss": 4.1816, "step": 876 }, { "batch_num_effect_tokens": 8025, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.13804, "grad_norm": 0.1310449242591858, "learning_rate": 4.660262759321331e-06, "loss": 4.4414, "step": 877 }, { "batch_num_effect_tokens": 7966, "batch_num_samples": 14, "batch_num_tokens": 8092, "epoch": 1.13933, "grad_norm": 0.1393718123435974, "learning_rate": 4.6489565273029196e-06, "loss": 4.3857, "step": 878 }, { "batch_num_effect_tokens": 8039, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.14063, "grad_norm": 0.13982267677783966, "learning_rate": 4.637652098858655e-06, "loss": 4.4727, "step": 879 }, { "batch_num_effect_tokens": 7863, "batch_num_samples": 24, "batch_num_tokens": 8073, "epoch": 1.14193, "grad_norm": 0.14085061848163605, "learning_rate": 4.626349532067879e-06, "loss": 4.707, "step": 880 }, { "batch_num_effect_tokens": 8078, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.14323, "grad_norm": 0.13555698096752167, "learning_rate": 4.615048885000375e-06, "loss": 4.4424, "step": 881 }, { "batch_num_effect_tokens": 8024, "batch_num_samples": 16, "batch_num_tokens": 8188, "epoch": 1.14453, "grad_norm": 0.13564865291118622, "learning_rate": 4.603750215716057e-06, "loss": 4.2158, "step": 882 }, { "batch_num_effect_tokens": 8066, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.14582, "grad_norm": 0.13119758665561676, "learning_rate": 4.592453582264684e-06, "loss": 4.6748, "step": 883 }, { "batch_num_effect_tokens": 7983, "batch_num_samples": 14, "batch_num_tokens": 8137, "epoch": 1.14712, "grad_norm": 0.1268366128206253, "learning_rate": 4.581159042685552e-06, "loss": 4.3467, "step": 884 }, { "batch_num_effect_tokens": 7963, "batch_num_samples": 19, "batch_num_tokens": 8161, "epoch": 1.14842, "grad_norm": 0.12466870993375778, "learning_rate": 4.569866655007193e-06, "loss": 4.7676, "step": 885 }, { "batch_num_effect_tokens": 8051, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.14972, "grad_norm": 0.13537685573101044, "learning_rate": 4.558576477247097e-06, "loss": 4.1719, "step": 886 }, { "batch_num_effect_tokens": 7946, "batch_num_samples": 18, "batch_num_tokens": 8114, "epoch": 1.15101, "grad_norm": 0.12638984620571136, "learning_rate": 4.547288567411388e-06, "loss": 4.3047, "step": 887 }, { "batch_num_effect_tokens": 8062, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.15231, "grad_norm": 0.12655675411224365, "learning_rate": 4.5360029834945425e-06, "loss": 4.5449, "step": 888 }, { "batch_num_effect_tokens": 8071, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.15361, "grad_norm": 0.13120026886463165, "learning_rate": 4.524719783479088e-06, "loss": 4.3848, "step": 889 }, { "batch_num_effect_tokens": 8036, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.15491, "grad_norm": 0.12158391624689102, "learning_rate": 4.513439025335302e-06, "loss": 4.4844, "step": 890 }, { "batch_num_effect_tokens": 8050, "batch_num_samples": 17, "batch_num_tokens": 8191, "epoch": 1.1562, "grad_norm": 0.13134336471557617, "learning_rate": 4.502160767020918e-06, "loss": 5.5039, "step": 891 }, { "batch_num_effect_tokens": 8046, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.1575, "grad_norm": 0.12917788326740265, "learning_rate": 4.4908850664808245e-06, "loss": 4.6094, "step": 892 }, { "batch_num_effect_tokens": 8052, "batch_num_samples": 13, "batch_num_tokens": 8188, "epoch": 1.1588, "grad_norm": 0.13100962340831757, "learning_rate": 4.4796119816467685e-06, "loss": 4.3867, "step": 893 }, { "batch_num_effect_tokens": 8045, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.1601, "grad_norm": 0.13368336856365204, "learning_rate": 4.468341570437061e-06, "loss": 4.4697, "step": 894 }, { "batch_num_effect_tokens": 8007, "batch_num_samples": 20, "batch_num_tokens": 8192, "epoch": 1.16139, "grad_norm": 0.12571601569652557, "learning_rate": 4.457073890756273e-06, "loss": 4.249, "step": 895 }, { "batch_num_effect_tokens": 8035, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.16269, "grad_norm": 0.1370954066514969, "learning_rate": 4.445809000494945e-06, "loss": 4.5898, "step": 896 }, { "batch_num_effect_tokens": 8022, "batch_num_samples": 14, "batch_num_tokens": 8163, "epoch": 1.16399, "grad_norm": 0.12560419738292694, "learning_rate": 4.434546957529283e-06, "loss": 4.3018, "step": 897 }, { "batch_num_effect_tokens": 8063, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.16529, "grad_norm": 0.1281665414571762, "learning_rate": 4.423287819720866e-06, "loss": 4.2051, "step": 898 }, { "batch_num_effect_tokens": 8076, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.16659, "grad_norm": 0.13026204705238342, "learning_rate": 4.412031644916348e-06, "loss": 4.3184, "step": 899 }, { "batch_num_effect_tokens": 7923, "batch_num_samples": 23, "batch_num_tokens": 8124, "epoch": 1.16788, "grad_norm": 0.1349964737892151, "learning_rate": 4.400778490947157e-06, "loss": 4.7354, "step": 900 }, { "batch_num_effect_tokens": 8023, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.16918, "grad_norm": 0.13438257575035095, "learning_rate": 4.389528415629201e-06, "loss": 4.5127, "step": 901 }, { "batch_num_effect_tokens": 8048, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.17048, "grad_norm": 0.12244999408721924, "learning_rate": 4.3782814767625755e-06, "loss": 4.752, "step": 902 }, { "batch_num_effect_tokens": 8041, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.17178, "grad_norm": 0.12824036180973053, "learning_rate": 4.367037732131254e-06, "loss": 4.1963, "step": 903 }, { "batch_num_effect_tokens": 8052, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.17307, "grad_norm": 0.13308383524417877, "learning_rate": 4.355797239502807e-06, "loss": 4.3115, "step": 904 }, { "batch_num_effect_tokens": 8033, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.17437, "grad_norm": 0.12866966426372528, "learning_rate": 4.34456005662809e-06, "loss": 4.7246, "step": 905 }, { "batch_num_effect_tokens": 8045, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.17567, "grad_norm": 0.12427198141813278, "learning_rate": 4.3333262412409575e-06, "loss": 4.4072, "step": 906 }, { "batch_num_effect_tokens": 8016, "batch_num_samples": 20, "batch_num_tokens": 8192, "epoch": 1.17697, "grad_norm": 0.12111925333738327, "learning_rate": 4.322095851057962e-06, "loss": 4.4688, "step": 907 }, { "batch_num_effect_tokens": 8005, "batch_num_samples": 18, "batch_num_tokens": 8170, "epoch": 1.17826, "grad_norm": 0.13822191953659058, "learning_rate": 4.310868943778057e-06, "loss": 4.6992, "step": 908 }, { "batch_num_effect_tokens": 7912, "batch_num_samples": 17, "batch_num_tokens": 8100, "epoch": 1.17956, "grad_norm": 0.13831211626529694, "learning_rate": 4.299645577082305e-06, "loss": 4.749, "step": 909 }, { "batch_num_effect_tokens": 8018, "batch_num_samples": 14, "batch_num_tokens": 8191, "epoch": 1.18086, "grad_norm": 0.11694054305553436, "learning_rate": 4.2884258086335755e-06, "loss": 4.2002, "step": 910 }, { "batch_num_effect_tokens": 7919, "batch_num_samples": 15, "batch_num_tokens": 8086, "epoch": 1.18216, "grad_norm": 0.13560707867145538, "learning_rate": 4.277209696076248e-06, "loss": 4.751, "step": 911 }, { "batch_num_effect_tokens": 8032, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.18345, "grad_norm": 0.11765341460704803, "learning_rate": 4.265997297035926e-06, "loss": 4.2842, "step": 912 }, { "batch_num_effect_tokens": 8042, "batch_num_samples": 16, "batch_num_tokens": 8190, "epoch": 1.18475, "grad_norm": 0.13359101116657257, "learning_rate": 4.254788669119127e-06, "loss": 4.832, "step": 913 }, { "batch_num_effect_tokens": 8069, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.18605, "grad_norm": 0.13347217440605164, "learning_rate": 4.243583869913e-06, "loss": 4.5137, "step": 914 }, { "batch_num_effect_tokens": 8029, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.18735, "grad_norm": 0.12483620643615723, "learning_rate": 4.232382956985017e-06, "loss": 4.1416, "step": 915 }, { "batch_num_effect_tokens": 7913, "batch_num_samples": 16, "batch_num_tokens": 8089, "epoch": 1.18865, "grad_norm": 0.1381087601184845, "learning_rate": 4.221185987882684e-06, "loss": 4.5977, "step": 916 }, { "batch_num_effect_tokens": 8033, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.18994, "grad_norm": 0.15423673391342163, "learning_rate": 4.209993020133251e-06, "loss": 4.1279, "step": 917 }, { "batch_num_effect_tokens": 7968, "batch_num_samples": 16, "batch_num_tokens": 8122, "epoch": 1.19124, "grad_norm": 0.14219102263450623, "learning_rate": 4.198804111243403e-06, "loss": 4.1191, "step": 918 }, { "batch_num_effect_tokens": 7946, "batch_num_samples": 20, "batch_num_tokens": 8144, "epoch": 1.19254, "grad_norm": 0.1339540034532547, "learning_rate": 4.187619318698971e-06, "loss": 4.3799, "step": 919 }, { "batch_num_effect_tokens": 8050, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.19384, "grad_norm": 0.1272207647562027, "learning_rate": 4.176438699964646e-06, "loss": 4.2256, "step": 920 }, { "batch_num_effect_tokens": 7961, "batch_num_samples": 16, "batch_num_tokens": 8122, "epoch": 1.19513, "grad_norm": 0.1126057505607605, "learning_rate": 4.165262312483664e-06, "loss": 4.7441, "step": 921 }, { "batch_num_effect_tokens": 8026, "batch_num_samples": 17, "batch_num_tokens": 8184, "epoch": 1.19643, "grad_norm": 0.13080507516860962, "learning_rate": 4.154090213677531e-06, "loss": 4.5449, "step": 922 }, { "batch_num_effect_tokens": 7840, "batch_num_samples": 17, "batch_num_tokens": 8006, "epoch": 1.19773, "grad_norm": 0.13032877445220947, "learning_rate": 4.1429224609457135e-06, "loss": 4.7578, "step": 923 }, { "batch_num_effect_tokens": 7991, "batch_num_samples": 14, "batch_num_tokens": 8114, "epoch": 1.19903, "grad_norm": 0.12347196787595749, "learning_rate": 4.131759111665349e-06, "loss": 4.6602, "step": 924 }, { "batch_num_effect_tokens": 7991, "batch_num_samples": 14, "batch_num_tokens": 8114, "epoch": 1.19903, "eval_eval_loss": 0.5730312466621399, "eval_eval_runtime": 115.3354, "eval_eval_samples_per_second": 43.352, "eval_eval_steps_per_second": 2.714, "step": 924 }, { "batch_num_effect_tokens": 7754, "batch_num_samples": 30, "batch_num_tokens": 8011, "epoch": 1.20032, "grad_norm": 0.12765365839004517, "learning_rate": 4.120600223190955e-06, "loss": 4.1592, "step": 925 }, { "batch_num_effect_tokens": 8045, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.20162, "grad_norm": 0.13900645077228546, "learning_rate": 4.109445852854125e-06, "loss": 4.457, "step": 926 }, { "batch_num_effect_tokens": 7980, "batch_num_samples": 15, "batch_num_tokens": 8134, "epoch": 1.20292, "grad_norm": 0.12310691922903061, "learning_rate": 4.098296057963246e-06, "loss": 4.6924, "step": 927 }, { "batch_num_effect_tokens": 7909, "batch_num_samples": 17, "batch_num_tokens": 8087, "epoch": 1.20422, "grad_norm": 0.1377318799495697, "learning_rate": 4.087150895803192e-06, "loss": 4.085, "step": 928 }, { "batch_num_effect_tokens": 7991, "batch_num_samples": 24, "batch_num_tokens": 8188, "epoch": 1.20552, "grad_norm": 0.14090469479560852, "learning_rate": 4.076010423635037e-06, "loss": 4.8594, "step": 929 }, { "batch_num_effect_tokens": 8018, "batch_num_samples": 19, "batch_num_tokens": 8192, "epoch": 1.20681, "grad_norm": 0.12155922502279282, "learning_rate": 4.064874698695761e-06, "loss": 4.3281, "step": 930 }, { "batch_num_effect_tokens": 8050, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.20811, "grad_norm": 0.13325005769729614, "learning_rate": 4.053743778197951e-06, "loss": 4.668, "step": 931 }, { "batch_num_effect_tokens": 8045, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.20941, "grad_norm": 0.1313610076904297, "learning_rate": 4.042617719329507e-06, "loss": 4.7402, "step": 932 }, { "batch_num_effect_tokens": 8037, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.21071, "grad_norm": 0.1211569532752037, "learning_rate": 4.0314965792533635e-06, "loss": 4.4873, "step": 933 }, { "batch_num_effect_tokens": 8048, "batch_num_samples": 13, "batch_num_tokens": 8191, "epoch": 1.212, "grad_norm": 0.12477646768093109, "learning_rate": 4.020380415107167e-06, "loss": 4.4355, "step": 934 }, { "batch_num_effect_tokens": 8056, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.2133, "grad_norm": 0.1258799135684967, "learning_rate": 4.009269284003014e-06, "loss": 4.6689, "step": 935 }, { "batch_num_effect_tokens": 7976, "batch_num_samples": 15, "batch_num_tokens": 8116, "epoch": 1.2146, "grad_norm": 0.1279737502336502, "learning_rate": 3.99816324302713e-06, "loss": 4.0469, "step": 936 }, { "batch_num_effect_tokens": 7933, "batch_num_samples": 14, "batch_num_tokens": 8122, "epoch": 1.2159, "grad_norm": 0.12206988781690598, "learning_rate": 3.987062349239596e-06, "loss": 4.1738, "step": 937 }, { "batch_num_effect_tokens": 8052, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.21719, "grad_norm": 0.1263066828250885, "learning_rate": 3.975966659674048e-06, "loss": 4.3438, "step": 938 }, { "batch_num_effect_tokens": 8066, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.21849, "grad_norm": 0.1253357082605362, "learning_rate": 3.964876231337379e-06, "loss": 4.2725, "step": 939 }, { "batch_num_effect_tokens": 7945, "batch_num_samples": 20, "batch_num_tokens": 8128, "epoch": 1.21979, "grad_norm": 0.146712064743042, "learning_rate": 3.953791121209458e-06, "loss": 4.1543, "step": 940 }, { "batch_num_effect_tokens": 7998, "batch_num_samples": 16, "batch_num_tokens": 8155, "epoch": 1.22109, "grad_norm": 0.13425204157829285, "learning_rate": 3.942711386242826e-06, "loss": 4.8652, "step": 941 }, { "batch_num_effect_tokens": 7906, "batch_num_samples": 14, "batch_num_tokens": 8079, "epoch": 1.22238, "grad_norm": 0.13137395679950714, "learning_rate": 3.931637083362405e-06, "loss": 4.0664, "step": 942 }, { "batch_num_effect_tokens": 8020, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.22368, "grad_norm": 0.12695470452308655, "learning_rate": 3.920568269465216e-06, "loss": 4.3643, "step": 943 }, { "batch_num_effect_tokens": 8007, "batch_num_samples": 14, "batch_num_tokens": 8144, "epoch": 1.22498, "grad_norm": 0.12561850249767303, "learning_rate": 3.909505001420072e-06, "loss": 4.4287, "step": 944 }, { "batch_num_effect_tokens": 8026, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.22628, "grad_norm": 0.13399529457092285, "learning_rate": 3.898447336067297e-06, "loss": 4.2666, "step": 945 }, { "batch_num_effect_tokens": 8055, "batch_num_samples": 14, "batch_num_tokens": 8182, "epoch": 1.22758, "grad_norm": 0.1406509280204773, "learning_rate": 3.887395330218429e-06, "loss": 4.4023, "step": 946 }, { "batch_num_effect_tokens": 8033, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.22887, "grad_norm": 0.12866802513599396, "learning_rate": 3.876349040655925e-06, "loss": 4.2334, "step": 947 }, { "batch_num_effect_tokens": 8035, "batch_num_samples": 14, "batch_num_tokens": 8163, "epoch": 1.23017, "grad_norm": 0.1247694343328476, "learning_rate": 3.86530852413288e-06, "loss": 4.5361, "step": 948 }, { "batch_num_effect_tokens": 8037, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.23147, "grad_norm": 0.12325099110603333, "learning_rate": 3.854273837372724e-06, "loss": 4.2549, "step": 949 }, { "batch_num_effect_tokens": 8037, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.23277, "grad_norm": 0.1377268135547638, "learning_rate": 3.843245037068932e-06, "loss": 4.5361, "step": 950 }, { "batch_num_effect_tokens": 8050, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.23406, "grad_norm": 0.1305629014968872, "learning_rate": 3.832222179884747e-06, "loss": 4.4932, "step": 951 }, { "batch_num_effect_tokens": 8009, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.23536, "grad_norm": 0.1270604133605957, "learning_rate": 3.821205322452863e-06, "loss": 4.877, "step": 952 }, { "batch_num_effect_tokens": 7940, "batch_num_samples": 15, "batch_num_tokens": 8106, "epoch": 1.23666, "grad_norm": 0.1398427039384842, "learning_rate": 3.8101945213751635e-06, "loss": 4.583, "step": 953 }, { "batch_num_effect_tokens": 8043, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.23796, "grad_norm": 0.12297014147043228, "learning_rate": 3.799189833222404e-06, "loss": 4.4424, "step": 954 }, { "batch_num_effect_tokens": 7968, "batch_num_samples": 14, "batch_num_tokens": 8107, "epoch": 1.23925, "grad_norm": 0.1305890530347824, "learning_rate": 3.7881913145339387e-06, "loss": 4.5752, "step": 955 }, { "batch_num_effect_tokens": 8052, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.24055, "grad_norm": 0.1272619217634201, "learning_rate": 3.777199021817426e-06, "loss": 4.9912, "step": 956 }, { "batch_num_effect_tokens": 8012, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.24185, "grad_norm": 0.13338087499141693, "learning_rate": 3.7662130115485317e-06, "loss": 4.6572, "step": 957 }, { "batch_num_effect_tokens": 8052, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.24315, "grad_norm": 0.13084988296031952, "learning_rate": 3.7552333401706508e-06, "loss": 4.0723, "step": 958 }, { "batch_num_effect_tokens": 8036, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.24444, "grad_norm": 0.12794189155101776, "learning_rate": 3.7442600640946045e-06, "loss": 4.2676, "step": 959 }, { "batch_num_effect_tokens": 8053, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.24574, "grad_norm": 0.1372474730014801, "learning_rate": 3.733293239698359e-06, "loss": 4.6777, "step": 960 }, { "batch_num_effect_tokens": 7919, "batch_num_samples": 18, "batch_num_tokens": 8081, "epoch": 1.24704, "grad_norm": 0.14294537901878357, "learning_rate": 3.7223329233267354e-06, "loss": 4.585, "step": 961 }, { "batch_num_effect_tokens": 7930, "batch_num_samples": 14, "batch_num_tokens": 8086, "epoch": 1.24834, "grad_norm": 0.13273997604846954, "learning_rate": 3.711379171291115e-06, "loss": 4.2793, "step": 962 }, { "batch_num_effect_tokens": 8065, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.24964, "grad_norm": 0.13120055198669434, "learning_rate": 3.7004320398691507e-06, "loss": 4.5068, "step": 963 }, { "batch_num_effect_tokens": 8053, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.25093, "grad_norm": 0.12945452332496643, "learning_rate": 3.689491585304491e-06, "loss": 4.873, "step": 964 }, { "batch_num_effect_tokens": 7897, "batch_num_samples": 19, "batch_num_tokens": 8112, "epoch": 1.25223, "grad_norm": 0.13530410826206207, "learning_rate": 3.6785578638064655e-06, "loss": 4.6064, "step": 965 }, { "batch_num_effect_tokens": 8075, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.25353, "grad_norm": 0.12716589868068695, "learning_rate": 3.667630931549826e-06, "loss": 4.0967, "step": 966 }, { "batch_num_effect_tokens": 8007, "batch_num_samples": 19, "batch_num_tokens": 8192, "epoch": 1.25483, "grad_norm": 0.12516002357006073, "learning_rate": 3.6567108446744314e-06, "loss": 4.5811, "step": 967 }, { "batch_num_effect_tokens": 8063, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.25612, "grad_norm": 0.13172359764575958, "learning_rate": 3.6457976592849753e-06, "loss": 4.4248, "step": 968 }, { "batch_num_effect_tokens": 7955, "batch_num_samples": 17, "batch_num_tokens": 8167, "epoch": 1.25742, "grad_norm": 0.12710091471672058, "learning_rate": 3.6348914314506944e-06, "loss": 4.167, "step": 969 }, { "batch_num_effect_tokens": 8003, "batch_num_samples": 20, "batch_num_tokens": 8192, "epoch": 1.25872, "grad_norm": 0.1424860805273056, "learning_rate": 3.623992217205075e-06, "loss": 4.3926, "step": 970 }, { "batch_num_effect_tokens": 8048, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.26002, "grad_norm": 0.12454716116189957, "learning_rate": 3.6131000725455756e-06, "loss": 4.5156, "step": 971 }, { "batch_num_effect_tokens": 7839, "batch_num_samples": 26, "batch_num_tokens": 8080, "epoch": 1.26131, "grad_norm": 0.13083425164222717, "learning_rate": 3.6022150534333267e-06, "loss": 4.667, "step": 972 }, { "batch_num_effect_tokens": 7960, "batch_num_samples": 20, "batch_num_tokens": 8160, "epoch": 1.26261, "grad_norm": 0.13552185893058777, "learning_rate": 3.5913372157928515e-06, "loss": 4.8428, "step": 973 }, { "batch_num_effect_tokens": 7976, "batch_num_samples": 15, "batch_num_tokens": 8128, "epoch": 1.26391, "grad_norm": 0.119549959897995, "learning_rate": 3.5804666155117807e-06, "loss": 4.6348, "step": 974 }, { "batch_num_effect_tokens": 7979, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.26521, "grad_norm": 0.12790077924728394, "learning_rate": 3.5696033084405535e-06, "loss": 4.1328, "step": 975 }, { "batch_num_effect_tokens": 8011, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.2665, "grad_norm": 0.12213937938213348, "learning_rate": 3.558747350392146e-06, "loss": 4.584, "step": 976 }, { "batch_num_effect_tokens": 7982, "batch_num_samples": 20, "batch_num_tokens": 8192, "epoch": 1.2678, "grad_norm": 0.12294816225767136, "learning_rate": 3.5478987971417723e-06, "loss": 4.5674, "step": 977 }, { "batch_num_effect_tokens": 7989, "batch_num_samples": 20, "batch_num_tokens": 8184, "epoch": 1.2691, "grad_norm": 0.12941895425319672, "learning_rate": 3.537057704426602e-06, "loss": 4.6514, "step": 978 }, { "batch_num_effect_tokens": 8059, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.2704, "grad_norm": 0.11868005990982056, "learning_rate": 3.526224127945479e-06, "loss": 4.3252, "step": 979 }, { "batch_num_effect_tokens": 7976, "batch_num_samples": 15, "batch_num_tokens": 8116, "epoch": 1.2717, "grad_norm": 0.13335098326206207, "learning_rate": 3.5153981233586277e-06, "loss": 4.7188, "step": 980 }, { "batch_num_effect_tokens": 7880, "batch_num_samples": 17, "batch_num_tokens": 8062, "epoch": 1.27299, "grad_norm": 0.11686256527900696, "learning_rate": 3.5045797462873643e-06, "loss": 4.2773, "step": 981 }, { "batch_num_effect_tokens": 7808, "batch_num_samples": 30, "batch_num_tokens": 8082, "epoch": 1.27429, "grad_norm": 0.13790954649448395, "learning_rate": 3.4937690523138302e-06, "loss": 4.5752, "step": 982 }, { "batch_num_effect_tokens": 8003, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 1.27559, "grad_norm": 0.1367255300283432, "learning_rate": 3.4829660969806776e-06, "loss": 4.6543, "step": 983 }, { "batch_num_effect_tokens": 7947, "batch_num_samples": 15, "batch_num_tokens": 8116, "epoch": 1.27689, "grad_norm": 0.14571502804756165, "learning_rate": 3.4721709357908146e-06, "loss": 4.248, "step": 984 }, { "batch_num_effect_tokens": 7961, "batch_num_samples": 17, "batch_num_tokens": 8108, "epoch": 1.27818, "grad_norm": 0.12180175632238388, "learning_rate": 3.461383624207092e-06, "loss": 4.6895, "step": 985 }, { "batch_num_effect_tokens": 8017, "batch_num_samples": 17, "batch_num_tokens": 8176, "epoch": 1.27948, "grad_norm": 0.1343916952610016, "learning_rate": 3.4506042176520375e-06, "loss": 4.3574, "step": 986 }, { "batch_num_effect_tokens": 8063, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.28078, "grad_norm": 0.13950027525424957, "learning_rate": 3.439832771507565e-06, "loss": 4.165, "step": 987 }, { "batch_num_effect_tokens": 8016, "batch_num_samples": 15, "batch_num_tokens": 8146, "epoch": 1.28208, "grad_norm": 0.1476665586233139, "learning_rate": 3.4290693411146882e-06, "loss": 4.8047, "step": 988 }, { "batch_num_effect_tokens": 7985, "batch_num_samples": 17, "batch_num_tokens": 8186, "epoch": 1.28337, "grad_norm": 0.13402019441127777, "learning_rate": 3.418313981773238e-06, "loss": 4.5215, "step": 989 }, { "batch_num_effect_tokens": 7940, "batch_num_samples": 14, "batch_num_tokens": 8101, "epoch": 1.28467, "grad_norm": 0.1343563050031662, "learning_rate": 3.4075667487415785e-06, "loss": 4.3604, "step": 990 }, { "batch_num_effect_tokens": 8053, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.28597, "grad_norm": 0.13799680769443512, "learning_rate": 3.3968276972363224e-06, "loss": 4.3262, "step": 991 }, { "batch_num_effect_tokens": 7924, "batch_num_samples": 24, "batch_num_tokens": 8192, "epoch": 1.28727, "grad_norm": 0.13754983246326447, "learning_rate": 3.3860968824320507e-06, "loss": 4.3242, "step": 992 }, { "batch_num_effect_tokens": 8043, "batch_num_samples": 14, "batch_num_tokens": 8188, "epoch": 1.28856, "grad_norm": 0.13641497492790222, "learning_rate": 3.3753743594610216e-06, "loss": 4.498, "step": 993 }, { "batch_num_effect_tokens": 8067, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.28986, "grad_norm": 0.1407681405544281, "learning_rate": 3.3646601834128924e-06, "loss": 4.7393, "step": 994 }, { "batch_num_effect_tokens": 7816, "batch_num_samples": 27, "batch_num_tokens": 8041, "epoch": 1.29116, "grad_norm": 0.14411447942256927, "learning_rate": 3.353954409334442e-06, "loss": 4.3535, "step": 995 }, { "batch_num_effect_tokens": 7919, "batch_num_samples": 14, "batch_num_tokens": 8119, "epoch": 1.29246, "grad_norm": 0.13188710808753967, "learning_rate": 3.3432570922292728e-06, "loss": 4.2559, "step": 996 }, { "batch_num_effect_tokens": 7998, "batch_num_samples": 21, "batch_num_tokens": 8186, "epoch": 1.29376, "grad_norm": 0.12662333250045776, "learning_rate": 3.3325682870575478e-06, "loss": 4.0684, "step": 997 }, { "batch_num_effect_tokens": 8002, "batch_num_samples": 17, "batch_num_tokens": 8179, "epoch": 1.29505, "grad_norm": 0.12670519948005676, "learning_rate": 3.3218880487356885e-06, "loss": 4.5381, "step": 998 }, { "batch_num_effect_tokens": 8002, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.29635, "grad_norm": 0.13219594955444336, "learning_rate": 3.3112164321361064e-06, "loss": 4.7129, "step": 999 }, { "batch_num_effect_tokens": 8010, "batch_num_samples": 20, "batch_num_tokens": 8192, "epoch": 1.29765, "grad_norm": 0.12541687488555908, "learning_rate": 3.3005534920869175e-06, "loss": 4.333, "step": 1000 }, { "batch_num_effect_tokens": 7940, "batch_num_samples": 25, "batch_num_tokens": 8160, "epoch": 1.29895, "grad_norm": 0.12586721777915955, "learning_rate": 3.289899283371657e-06, "loss": 4.4082, "step": 1001 }, { "batch_num_effect_tokens": 8032, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.30024, "grad_norm": 0.11721961200237274, "learning_rate": 3.2792538607290036e-06, "loss": 4.3291, "step": 1002 }, { "batch_num_effect_tokens": 7975, "batch_num_samples": 14, "batch_num_tokens": 8130, "epoch": 1.30154, "grad_norm": 0.14944490790367126, "learning_rate": 3.268617278852494e-06, "loss": 4.0322, "step": 1003 }, { "batch_num_effect_tokens": 8009, "batch_num_samples": 18, "batch_num_tokens": 8180, "epoch": 1.30284, "grad_norm": 0.13059593737125397, "learning_rate": 3.257989592390241e-06, "loss": 4.1982, "step": 1004 }, { "batch_num_effect_tokens": 7998, "batch_num_samples": 21, "batch_num_tokens": 8192, "epoch": 1.30414, "grad_norm": 0.12026825547218323, "learning_rate": 3.2473708559446606e-06, "loss": 4.377, "step": 1005 }, { "batch_num_effect_tokens": 8051, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.30543, "grad_norm": 0.1340772956609726, "learning_rate": 3.2367611240721796e-06, "loss": 4.5195, "step": 1006 }, { "batch_num_effect_tokens": 8044, "batch_num_samples": 14, "batch_num_tokens": 8184, "epoch": 1.30673, "grad_norm": 0.13199454545974731, "learning_rate": 3.226160451282965e-06, "loss": 4.3545, "step": 1007 }, { "batch_num_effect_tokens": 7976, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.30803, "grad_norm": 0.11274772882461548, "learning_rate": 3.2155688920406415e-06, "loss": 4.1318, "step": 1008 }, { "batch_num_effect_tokens": 8059, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.30933, "grad_norm": 0.13221722841262817, "learning_rate": 3.204986500762006e-06, "loss": 4.499, "step": 1009 }, { "batch_num_effect_tokens": 7972, "batch_num_samples": 17, "batch_num_tokens": 8126, "epoch": 1.31062, "grad_norm": 0.13470391929149628, "learning_rate": 3.194413331816759e-06, "loss": 4.7568, "step": 1010 }, { "batch_num_effect_tokens": 7829, "batch_num_samples": 20, "batch_num_tokens": 8112, "epoch": 1.31192, "grad_norm": 0.1261770874261856, "learning_rate": 3.1838494395272155e-06, "loss": 4.3809, "step": 1011 }, { "batch_num_effect_tokens": 8017, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 1.31322, "grad_norm": 0.13255248963832855, "learning_rate": 3.173294878168025e-06, "loss": 4.3428, "step": 1012 }, { "batch_num_effect_tokens": 7932, "batch_num_samples": 15, "batch_num_tokens": 8076, "epoch": 1.31452, "grad_norm": 0.13333898782730103, "learning_rate": 3.162749701965907e-06, "loss": 4.5859, "step": 1013 }, { "batch_num_effect_tokens": 8034, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.31582, "grad_norm": 0.13349847495555878, "learning_rate": 3.152213965099352e-06, "loss": 4.8818, "step": 1014 }, { "batch_num_effect_tokens": 7997, "batch_num_samples": 14, "batch_num_tokens": 8184, "epoch": 1.31711, "grad_norm": 0.12797629833221436, "learning_rate": 3.141687721698363e-06, "loss": 4.6836, "step": 1015 }, { "batch_num_effect_tokens": 7989, "batch_num_samples": 23, "batch_num_tokens": 8192, "epoch": 1.31841, "grad_norm": 0.1437963992357254, "learning_rate": 3.1311710258441607e-06, "loss": 4.3955, "step": 1016 }, { "batch_num_effect_tokens": 8036, "batch_num_samples": 17, "batch_num_tokens": 8191, "epoch": 1.31971, "grad_norm": 0.12287195771932602, "learning_rate": 3.1206639315689154e-06, "loss": 3.877, "step": 1017 }, { "batch_num_effect_tokens": 8046, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.32101, "grad_norm": 0.1235748901963234, "learning_rate": 3.110166492855468e-06, "loss": 4.1572, "step": 1018 }, { "batch_num_effect_tokens": 8071, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.3223, "grad_norm": 0.1256420761346817, "learning_rate": 3.0996787636370495e-06, "loss": 4.1611, "step": 1019 }, { "batch_num_effect_tokens": 8071, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.3236, "grad_norm": 0.13278773427009583, "learning_rate": 3.0892007977970083e-06, "loss": 4.5078, "step": 1020 }, { "batch_num_effect_tokens": 8029, "batch_num_samples": 14, "batch_num_tokens": 8188, "epoch": 1.3249, "grad_norm": 0.130946084856987, "learning_rate": 3.0787326491685287e-06, "loss": 4.6865, "step": 1021 }, { "batch_num_effect_tokens": 7960, "batch_num_samples": 22, "batch_num_tokens": 8192, "epoch": 1.3262, "grad_norm": 0.130752831697464, "learning_rate": 3.0682743715343565e-06, "loss": 4.5918, "step": 1022 }, { "batch_num_effect_tokens": 7974, "batch_num_samples": 24, "batch_num_tokens": 8192, "epoch": 1.32749, "grad_norm": 0.13391391932964325, "learning_rate": 3.057826018626527e-06, "loss": 4.5469, "step": 1023 }, { "batch_num_effect_tokens": 8031, "batch_num_samples": 14, "batch_num_tokens": 8191, "epoch": 1.32879, "grad_norm": 0.13457229733467102, "learning_rate": 3.0473876441260786e-06, "loss": 4.7383, "step": 1024 }, { "batch_num_effect_tokens": 7991, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.33009, "grad_norm": 0.13133271038532257, "learning_rate": 3.0369593016627867e-06, "loss": 4.1289, "step": 1025 }, { "batch_num_effect_tokens": 7963, "batch_num_samples": 15, "batch_num_tokens": 8098, "epoch": 1.33139, "grad_norm": 0.13519789278507233, "learning_rate": 3.026541044814885e-06, "loss": 4.1523, "step": 1026 }, { "batch_num_effect_tokens": 7929, "batch_num_samples": 14, "batch_num_tokens": 8081, "epoch": 1.33268, "grad_norm": 0.137874037027359, "learning_rate": 3.016132927108787e-06, "loss": 4.8057, "step": 1027 }, { "batch_num_effect_tokens": 8019, "batch_num_samples": 14, "batch_num_tokens": 8163, "epoch": 1.33398, "grad_norm": 0.11607379466295242, "learning_rate": 3.005735002018818e-06, "loss": 4.1748, "step": 1028 }, { "batch_num_effect_tokens": 8040, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.33528, "grad_norm": 0.12126877903938293, "learning_rate": 2.995347322966933e-06, "loss": 4.4434, "step": 1029 }, { "batch_num_effect_tokens": 8010, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.33658, "grad_norm": 0.12571828067302704, "learning_rate": 2.9849699433224423e-06, "loss": 4.2344, "step": 1030 }, { "batch_num_effect_tokens": 8043, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.33788, "grad_norm": 0.134991854429245, "learning_rate": 2.974602916401751e-06, "loss": 4.3867, "step": 1031 }, { "batch_num_effect_tokens": 8030, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.33917, "grad_norm": 0.13775356113910675, "learning_rate": 2.9642462954680605e-06, "loss": 4.7285, "step": 1032 }, { "batch_num_effect_tokens": 7844, "batch_num_samples": 17, "batch_num_tokens": 7989, "epoch": 1.34047, "grad_norm": 0.12610451877117157, "learning_rate": 2.9539001337311234e-06, "loss": 4.1611, "step": 1033 }, { "batch_num_effect_tokens": 7988, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.34177, "grad_norm": 0.13606472313404083, "learning_rate": 2.9435644843469434e-06, "loss": 4.2061, "step": 1034 }, { "batch_num_effect_tokens": 7909, "batch_num_samples": 15, "batch_num_tokens": 8080, "epoch": 1.34307, "grad_norm": 0.12449389696121216, "learning_rate": 2.933239400417519e-06, "loss": 4.1045, "step": 1035 }, { "batch_num_effect_tokens": 8043, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.34436, "grad_norm": 0.12599188089370728, "learning_rate": 2.9229249349905686e-06, "loss": 4.1924, "step": 1036 }, { "batch_num_effect_tokens": 7968, "batch_num_samples": 14, "batch_num_tokens": 8097, "epoch": 1.34566, "grad_norm": 0.1278277039527893, "learning_rate": 2.9126211410592527e-06, "loss": 4.5273, "step": 1037 }, { "batch_num_effect_tokens": 7967, "batch_num_samples": 15, "batch_num_tokens": 8126, "epoch": 1.34696, "grad_norm": 0.13065114617347717, "learning_rate": 2.9023280715619005e-06, "loss": 4.5469, "step": 1038 }, { "batch_num_effect_tokens": 8077, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.34826, "grad_norm": 0.14062908291816711, "learning_rate": 2.8920457793817507e-06, "loss": 4.5078, "step": 1039 }, { "batch_num_effect_tokens": 7943, "batch_num_samples": 14, "batch_num_tokens": 8101, "epoch": 1.34955, "grad_norm": 0.13684336841106415, "learning_rate": 2.881774317346664e-06, "loss": 4.6104, "step": 1040 }, { "batch_num_effect_tokens": 7958, "batch_num_samples": 14, "batch_num_tokens": 8114, "epoch": 1.35085, "grad_norm": 0.12485930323600769, "learning_rate": 2.871513738228861e-06, "loss": 4.4727, "step": 1041 }, { "batch_num_effect_tokens": 8064, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.35215, "grad_norm": 0.13105593621730804, "learning_rate": 2.861264094744647e-06, "loss": 4.6553, "step": 1042 }, { "batch_num_effect_tokens": 8039, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.35345, "grad_norm": 0.14259664714336395, "learning_rate": 2.851025439554142e-06, "loss": 4.5693, "step": 1043 }, { "batch_num_effect_tokens": 8008, "batch_num_samples": 19, "batch_num_tokens": 8191, "epoch": 1.35474, "grad_norm": 0.12127009779214859, "learning_rate": 2.840797825261017e-06, "loss": 4.0859, "step": 1044 }, { "batch_num_effect_tokens": 8059, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.35604, "grad_norm": 0.13202065229415894, "learning_rate": 2.83058130441221e-06, "loss": 4.2617, "step": 1045 }, { "batch_num_effect_tokens": 8000, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.35734, "grad_norm": 0.13531279563903809, "learning_rate": 2.8203759294976687e-06, "loss": 4.2031, "step": 1046 }, { "batch_num_effect_tokens": 8048, "batch_num_samples": 14, "batch_num_tokens": 8182, "epoch": 1.35864, "grad_norm": 0.13501350581645966, "learning_rate": 2.810181752950072e-06, "loss": 4.1162, "step": 1047 }, { "batch_num_effect_tokens": 8042, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.35994, "grad_norm": 0.1449299156665802, "learning_rate": 2.7999988271445643e-06, "loss": 4.2139, "step": 1048 }, { "batch_num_effect_tokens": 7937, "batch_num_samples": 17, "batch_num_tokens": 8101, "epoch": 1.36123, "grad_norm": 0.15028077363967896, "learning_rate": 2.7898272043984947e-06, "loss": 4.8975, "step": 1049 }, { "batch_num_effect_tokens": 7871, "batch_num_samples": 15, "batch_num_tokens": 8074, "epoch": 1.36253, "grad_norm": 0.1480277180671692, "learning_rate": 2.7796669369711294e-06, "loss": 4.4062, "step": 1050 }, { "batch_num_effect_tokens": 7916, "batch_num_samples": 26, "batch_num_tokens": 8126, "epoch": 1.36383, "grad_norm": 0.14486616849899292, "learning_rate": 2.7695180770633993e-06, "loss": 4.582, "step": 1051 }, { "batch_num_effect_tokens": 8000, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.36513, "grad_norm": 0.13963328301906586, "learning_rate": 2.7593806768176244e-06, "loss": 4.0654, "step": 1052 }, { "batch_num_effect_tokens": 7940, "batch_num_samples": 16, "batch_num_tokens": 8078, "epoch": 1.36642, "grad_norm": 0.1411132663488388, "learning_rate": 2.7492547883172473e-06, "loss": 4.2812, "step": 1053 }, { "batch_num_effect_tokens": 7897, "batch_num_samples": 20, "batch_num_tokens": 8080, "epoch": 1.36772, "grad_norm": 0.14485670626163483, "learning_rate": 2.7391404635865725e-06, "loss": 4.6309, "step": 1054 }, { "batch_num_effect_tokens": 8053, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.36902, "grad_norm": 0.13264106214046478, "learning_rate": 2.7290377545904823e-06, "loss": 4.4268, "step": 1055 }, { "batch_num_effect_tokens": 8024, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.37032, "grad_norm": 0.12862573564052582, "learning_rate": 2.718946713234185e-06, "loss": 4.1699, "step": 1056 }, { "batch_num_effect_tokens": 8041, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.37161, "grad_norm": 0.12804825603961945, "learning_rate": 2.708867391362948e-06, "loss": 4.5527, "step": 1057 }, { "batch_num_effect_tokens": 7860, "batch_num_samples": 17, "batch_num_tokens": 8062, "epoch": 1.37291, "grad_norm": 0.14360636472702026, "learning_rate": 2.6987998407618216e-06, "loss": 4.084, "step": 1058 }, { "batch_num_effect_tokens": 8016, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.37421, "grad_norm": 0.13846245408058167, "learning_rate": 2.688744113155378e-06, "loss": 4.0527, "step": 1059 }, { "batch_num_effect_tokens": 8052, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.37551, "grad_norm": 0.1346784085035324, "learning_rate": 2.678700260207449e-06, "loss": 4.7793, "step": 1060 }, { "batch_num_effect_tokens": 7994, "batch_num_samples": 23, "batch_num_tokens": 8192, "epoch": 1.3768, "grad_norm": 0.14536811411380768, "learning_rate": 2.6686683335208526e-06, "loss": 4.9219, "step": 1061 }, { "batch_num_effect_tokens": 8031, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.3781, "grad_norm": 0.13124246895313263, "learning_rate": 2.65864838463714e-06, "loss": 4.498, "step": 1062 }, { "batch_num_effect_tokens": 7892, "batch_num_samples": 21, "batch_num_tokens": 8091, "epoch": 1.3794, "grad_norm": 0.12533849477767944, "learning_rate": 2.648640465036316e-06, "loss": 4.3086, "step": 1063 }, { "batch_num_effect_tokens": 8028, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.3807, "grad_norm": 0.13352860510349274, "learning_rate": 2.6386446261365874e-06, "loss": 4.4902, "step": 1064 }, { "batch_num_effect_tokens": 7728, "batch_num_samples": 33, "batch_num_tokens": 7999, "epoch": 1.382, "grad_norm": 0.141183003783226, "learning_rate": 2.6286609192940887e-06, "loss": 4.1797, "step": 1065 }, { "batch_num_effect_tokens": 8064, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.38329, "grad_norm": 0.12944729626178741, "learning_rate": 2.6186893958026245e-06, "loss": 4.6572, "step": 1066 }, { "batch_num_effect_tokens": 8028, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.38459, "grad_norm": 0.1253744512796402, "learning_rate": 2.608730106893411e-06, "loss": 4.2881, "step": 1067 }, { "batch_num_effect_tokens": 8019, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.38589, "grad_norm": 0.14108753204345703, "learning_rate": 2.5987831037347933e-06, "loss": 4.2412, "step": 1068 }, { "batch_num_effect_tokens": 8004, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.38719, "grad_norm": 0.12252593785524368, "learning_rate": 2.5888484374320033e-06, "loss": 4.2715, "step": 1069 }, { "batch_num_effect_tokens": 8038, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.38848, "grad_norm": 0.1265387237071991, "learning_rate": 2.578926159026891e-06, "loss": 4.2881, "step": 1070 }, { "batch_num_effect_tokens": 8027, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.38978, "grad_norm": 0.13131491839885712, "learning_rate": 2.5690163194976576e-06, "loss": 4.3721, "step": 1071 }, { "batch_num_effect_tokens": 8022, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.39108, "grad_norm": 0.13168731331825256, "learning_rate": 2.559118969758595e-06, "loss": 4.291, "step": 1072 }, { "batch_num_effect_tokens": 8012, "batch_num_samples": 17, "batch_num_tokens": 8171, "epoch": 1.39238, "grad_norm": 0.1283009946346283, "learning_rate": 2.549234160659827e-06, "loss": 4.3809, "step": 1073 }, { "batch_num_effect_tokens": 8004, "batch_num_samples": 21, "batch_num_tokens": 8192, "epoch": 1.39367, "grad_norm": 0.13365747034549713, "learning_rate": 2.539361942987046e-06, "loss": 4.8643, "step": 1074 }, { "batch_num_effect_tokens": 7982, "batch_num_samples": 14, "batch_num_tokens": 8142, "epoch": 1.39497, "grad_norm": 0.13202007114887238, "learning_rate": 2.5295023674612568e-06, "loss": 4.3945, "step": 1075 }, { "batch_num_effect_tokens": 8032, "batch_num_samples": 14, "batch_num_tokens": 8156, "epoch": 1.39627, "grad_norm": 0.13004064559936523, "learning_rate": 2.519655484738507e-06, "loss": 4.4951, "step": 1076 }, { "batch_num_effect_tokens": 8031, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.39757, "grad_norm": 0.1316070407629013, "learning_rate": 2.509821345409633e-06, "loss": 4.7695, "step": 1077 }, { "batch_num_effect_tokens": 8050, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.39886, "grad_norm": 0.12465286254882812, "learning_rate": 2.5000000000000015e-06, "loss": 4.4688, "step": 1078 }, { "batch_num_effect_tokens": 8050, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.39886, "eval_eval_loss": 0.5671281218528748, "eval_eval_runtime": 115.3241, "eval_eval_samples_per_second": 43.356, "eval_eval_steps_per_second": 2.714, "step": 1078 }, { "batch_num_effect_tokens": 7894, "batch_num_samples": 19, "batch_num_tokens": 8192, "epoch": 1.40016, "grad_norm": 0.1325971782207489, "learning_rate": 2.4901914989692405e-06, "loss": 4.2979, "step": 1079 }, { "batch_num_effect_tokens": 8049, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.40146, "grad_norm": 0.12942376732826233, "learning_rate": 2.480395892710997e-06, "loss": 4.3652, "step": 1080 }, { "batch_num_effect_tokens": 8032, "batch_num_samples": 19, "batch_num_tokens": 8191, "epoch": 1.40276, "grad_norm": 0.12413739413022995, "learning_rate": 2.470613231552661e-06, "loss": 4.5146, "step": 1081 }, { "batch_num_effect_tokens": 7966, "batch_num_samples": 15, "batch_num_tokens": 8086, "epoch": 1.40406, "grad_norm": 0.129132479429245, "learning_rate": 2.46084356575511e-06, "loss": 3.9941, "step": 1082 }, { "batch_num_effect_tokens": 7873, "batch_num_samples": 26, "batch_num_tokens": 8106, "epoch": 1.40535, "grad_norm": 0.126277893781662, "learning_rate": 2.451086945512465e-06, "loss": 4.3438, "step": 1083 }, { "batch_num_effect_tokens": 8022, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.40665, "grad_norm": 0.1285809427499771, "learning_rate": 2.4413434209518137e-06, "loss": 4.2676, "step": 1084 }, { "batch_num_effect_tokens": 8000, "batch_num_samples": 14, "batch_num_tokens": 8149, "epoch": 1.40795, "grad_norm": 0.1211712658405304, "learning_rate": 2.4316130421329696e-06, "loss": 4.6133, "step": 1085 }, { "batch_num_effect_tokens": 8012, "batch_num_samples": 14, "batch_num_tokens": 8177, "epoch": 1.40925, "grad_norm": 0.12398537993431091, "learning_rate": 2.421895859048196e-06, "loss": 4.0205, "step": 1086 }, { "batch_num_effect_tokens": 8049, "batch_num_samples": 14, "batch_num_tokens": 8189, "epoch": 1.41054, "grad_norm": 0.12769639492034912, "learning_rate": 2.4121919216219646e-06, "loss": 4.2617, "step": 1087 }, { "batch_num_effect_tokens": 8050, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.41184, "grad_norm": 0.1281779408454895, "learning_rate": 2.4025012797107e-06, "loss": 4.3457, "step": 1088 }, { "batch_num_effect_tokens": 8023, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.41314, "grad_norm": 0.1288890838623047, "learning_rate": 2.39282398310251e-06, "loss": 4.0811, "step": 1089 }, { "batch_num_effect_tokens": 8052, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.41444, "grad_norm": 0.1370052844285965, "learning_rate": 2.383160081516941e-06, "loss": 4.4746, "step": 1090 }, { "batch_num_effect_tokens": 8009, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.41573, "grad_norm": 0.11726196855306625, "learning_rate": 2.373509624604717e-06, "loss": 4.4619, "step": 1091 }, { "batch_num_effect_tokens": 7985, "batch_num_samples": 15, "batch_num_tokens": 8158, "epoch": 1.41703, "grad_norm": 0.12008494138717651, "learning_rate": 2.363872661947488e-06, "loss": 4.2627, "step": 1092 }, { "batch_num_effect_tokens": 7961, "batch_num_samples": 14, "batch_num_tokens": 8137, "epoch": 1.41833, "grad_norm": 0.12784817814826965, "learning_rate": 2.3542492430575752e-06, "loss": 4.4648, "step": 1093 }, { "batch_num_effect_tokens": 7973, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.41963, "grad_norm": 0.14390228688716888, "learning_rate": 2.344639417377714e-06, "loss": 4.4814, "step": 1094 }, { "batch_num_effect_tokens": 7863, "batch_num_samples": 17, "batch_num_tokens": 8023, "epoch": 1.42092, "grad_norm": 0.13291668891906738, "learning_rate": 2.3350432342808003e-06, "loss": 4.4844, "step": 1095 }, { "batch_num_effect_tokens": 8021, "batch_num_samples": 16, "batch_num_tokens": 8188, "epoch": 1.42222, "grad_norm": 0.13262400031089783, "learning_rate": 2.3254607430696393e-06, "loss": 4.3721, "step": 1096 }, { "batch_num_effect_tokens": 8017, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.42352, "grad_norm": 0.12368138134479523, "learning_rate": 2.315891992976687e-06, "loss": 4.29, "step": 1097 }, { "batch_num_effect_tokens": 8052, "batch_num_samples": 14, "batch_num_tokens": 8173, "epoch": 1.42482, "grad_norm": 0.13449449837207794, "learning_rate": 2.3063370331638084e-06, "loss": 4.1885, "step": 1098 }, { "batch_num_effect_tokens": 7931, "batch_num_samples": 14, "batch_num_tokens": 8114, "epoch": 1.42612, "grad_norm": 0.12767821550369263, "learning_rate": 2.296795912722014e-06, "loss": 4.3281, "step": 1099 }, { "batch_num_effect_tokens": 7935, "batch_num_samples": 14, "batch_num_tokens": 8104, "epoch": 1.42741, "grad_norm": 0.13633497059345245, "learning_rate": 2.2872686806712037e-06, "loss": 4.5225, "step": 1100 }, { "batch_num_effect_tokens": 7834, "batch_num_samples": 24, "batch_num_tokens": 8072, "epoch": 1.42871, "grad_norm": 0.13452018797397614, "learning_rate": 2.277755385959934e-06, "loss": 4.4902, "step": 1101 }, { "batch_num_effect_tokens": 8018, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.43001, "grad_norm": 0.1304135024547577, "learning_rate": 2.2682560774651458e-06, "loss": 4.3936, "step": 1102 }, { "batch_num_effect_tokens": 8042, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.43131, "grad_norm": 0.1268252283334732, "learning_rate": 2.258770803991932e-06, "loss": 4.2588, "step": 1103 }, { "batch_num_effect_tokens": 7986, "batch_num_samples": 24, "batch_num_tokens": 8188, "epoch": 1.4326, "grad_norm": 0.1350371092557907, "learning_rate": 2.249299614273266e-06, "loss": 4.2285, "step": 1104 }, { "batch_num_effect_tokens": 7901, "batch_num_samples": 17, "batch_num_tokens": 8074, "epoch": 1.4339, "grad_norm": 0.12892520427703857, "learning_rate": 2.2398425569697667e-06, "loss": 4.3389, "step": 1105 }, { "batch_num_effect_tokens": 7979, "batch_num_samples": 19, "batch_num_tokens": 8192, "epoch": 1.4352, "grad_norm": 0.11750482767820358, "learning_rate": 2.230399680669449e-06, "loss": 4.2197, "step": 1106 }, { "batch_num_effect_tokens": 7980, "batch_num_samples": 14, "batch_num_tokens": 8128, "epoch": 1.4365, "grad_norm": 0.12220905721187592, "learning_rate": 2.220971033887463e-06, "loss": 4.3984, "step": 1107 }, { "batch_num_effect_tokens": 8001, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.43779, "grad_norm": 0.12369371205568314, "learning_rate": 2.211556665065854e-06, "loss": 4.5498, "step": 1108 }, { "batch_num_effect_tokens": 7932, "batch_num_samples": 14, "batch_num_tokens": 8076, "epoch": 1.43909, "grad_norm": 0.13513018190860748, "learning_rate": 2.2021566225733094e-06, "loss": 4.3604, "step": 1109 }, { "batch_num_effect_tokens": 8075, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.44039, "grad_norm": 0.12401560693979263, "learning_rate": 2.1927709547049096e-06, "loss": 4.2617, "step": 1110 }, { "batch_num_effect_tokens": 7918, "batch_num_samples": 19, "batch_num_tokens": 8086, "epoch": 1.44169, "grad_norm": 0.12758344411849976, "learning_rate": 2.1833997096818897e-06, "loss": 4.7422, "step": 1111 }, { "batch_num_effect_tokens": 7989, "batch_num_samples": 19, "batch_num_tokens": 8191, "epoch": 1.44298, "grad_norm": 0.12552611529827118, "learning_rate": 2.174042935651377e-06, "loss": 4.3584, "step": 1112 }, { "batch_num_effect_tokens": 8035, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.44428, "grad_norm": 0.12584245204925537, "learning_rate": 2.1647006806861472e-06, "loss": 4.2412, "step": 1113 }, { "batch_num_effect_tokens": 8033, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.44558, "grad_norm": 0.13394629955291748, "learning_rate": 2.1553729927843894e-06, "loss": 4.0078, "step": 1114 }, { "batch_num_effect_tokens": 8032, "batch_num_samples": 15, "batch_num_tokens": 8186, "epoch": 1.44688, "grad_norm": 0.12964493036270142, "learning_rate": 2.146059919869444e-06, "loss": 4.3604, "step": 1115 }, { "batch_num_effect_tokens": 8009, "batch_num_samples": 14, "batch_num_tokens": 8173, "epoch": 1.44818, "grad_norm": 0.12681354582309723, "learning_rate": 2.1367615097895707e-06, "loss": 3.9531, "step": 1116 }, { "batch_num_effect_tokens": 8055, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.44947, "grad_norm": 0.13806065917015076, "learning_rate": 2.1274778103176854e-06, "loss": 4.4346, "step": 1117 }, { "batch_num_effect_tokens": 8012, "batch_num_samples": 14, "batch_num_tokens": 8156, "epoch": 1.45077, "grad_norm": 0.14222683012485504, "learning_rate": 2.1182088691511287e-06, "loss": 4.541, "step": 1118 }, { "batch_num_effect_tokens": 7946, "batch_num_samples": 21, "batch_num_tokens": 8142, "epoch": 1.45207, "grad_norm": 0.14069655537605286, "learning_rate": 2.1089547339114215e-06, "loss": 4.3574, "step": 1119 }, { "batch_num_effect_tokens": 8012, "batch_num_samples": 15, "batch_num_tokens": 8164, "epoch": 1.45337, "grad_norm": 0.13768544793128967, "learning_rate": 2.09971545214401e-06, "loss": 4.7539, "step": 1120 }, { "batch_num_effect_tokens": 7943, "batch_num_samples": 14, "batch_num_tokens": 8101, "epoch": 1.45466, "grad_norm": 0.13390763103961945, "learning_rate": 2.0904910713180275e-06, "loss": 4.2021, "step": 1121 }, { "batch_num_effect_tokens": 8015, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 1.45596, "grad_norm": 0.1425563544034958, "learning_rate": 2.081281638826052e-06, "loss": 4.1191, "step": 1122 }, { "batch_num_effect_tokens": 8046, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.45726, "grad_norm": 0.13213439285755157, "learning_rate": 2.072087201983857e-06, "loss": 4.6445, "step": 1123 }, { "batch_num_effect_tokens": 8026, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.45856, "grad_norm": 0.13721303641796112, "learning_rate": 2.0629078080301782e-06, "loss": 4.1318, "step": 1124 }, { "batch_num_effect_tokens": 7984, "batch_num_samples": 14, "batch_num_tokens": 8188, "epoch": 1.45985, "grad_norm": 0.1321757435798645, "learning_rate": 2.0537435041264597e-06, "loss": 4.1426, "step": 1125 }, { "batch_num_effect_tokens": 7970, "batch_num_samples": 15, "batch_num_tokens": 8126, "epoch": 1.46115, "grad_norm": 0.13200123608112335, "learning_rate": 2.0445943373566178e-06, "loss": 4.5459, "step": 1126 }, { "batch_num_effect_tokens": 8049, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.46245, "grad_norm": 0.1399904489517212, "learning_rate": 2.0354603547267985e-06, "loss": 4.2383, "step": 1127 }, { "batch_num_effect_tokens": 8012, "batch_num_samples": 20, "batch_num_tokens": 8192, "epoch": 1.46375, "grad_norm": 0.14394307136535645, "learning_rate": 2.0263416031651335e-06, "loss": 5.0156, "step": 1128 }, { "batch_num_effect_tokens": 8035, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.46504, "grad_norm": 0.12676626443862915, "learning_rate": 2.017238129521506e-06, "loss": 4.1953, "step": 1129 }, { "batch_num_effect_tokens": 7999, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.46634, "grad_norm": 0.12323788553476334, "learning_rate": 2.0081499805673015e-06, "loss": 4.3027, "step": 1130 }, { "batch_num_effect_tokens": 8029, "batch_num_samples": 14, "batch_num_tokens": 8170, "epoch": 1.46764, "grad_norm": 0.12607906758785248, "learning_rate": 1.9990772029951665e-06, "loss": 3.9326, "step": 1131 }, { "batch_num_effect_tokens": 8022, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.46894, "grad_norm": 0.12812362611293793, "learning_rate": 1.9900198434187838e-06, "loss": 4.7461, "step": 1132 }, { "batch_num_effect_tokens": 7995, "batch_num_samples": 20, "batch_num_tokens": 8192, "epoch": 1.47024, "grad_norm": 0.13349904119968414, "learning_rate": 1.980977948372612e-06, "loss": 4.3418, "step": 1133 }, { "batch_num_effect_tokens": 8013, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.47153, "grad_norm": 0.12421982735395432, "learning_rate": 1.971951564311668e-06, "loss": 4.1846, "step": 1134 }, { "batch_num_effect_tokens": 8064, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.47283, "grad_norm": 0.1404682993888855, "learning_rate": 1.962940737611264e-06, "loss": 4.3447, "step": 1135 }, { "batch_num_effect_tokens": 7949, "batch_num_samples": 14, "batch_num_tokens": 8088, "epoch": 1.47413, "grad_norm": 0.1217743456363678, "learning_rate": 1.953945514566789e-06, "loss": 3.9229, "step": 1136 }, { "batch_num_effect_tokens": 8047, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.47543, "grad_norm": 0.13491998612880707, "learning_rate": 1.9449659413934684e-06, "loss": 4.1543, "step": 1137 }, { "batch_num_effect_tokens": 8078, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.47672, "grad_norm": 0.12323690205812454, "learning_rate": 1.9360020642261155e-06, "loss": 4.2627, "step": 1138 }, { "batch_num_effect_tokens": 7875, "batch_num_samples": 21, "batch_num_tokens": 8108, "epoch": 1.47802, "grad_norm": 0.13586825132369995, "learning_rate": 1.9270539291189054e-06, "loss": 4.5137, "step": 1139 }, { "batch_num_effect_tokens": 8032, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.47932, "grad_norm": 0.12245763093233109, "learning_rate": 1.918121582045132e-06, "loss": 4.5752, "step": 1140 }, { "batch_num_effect_tokens": 7842, "batch_num_samples": 24, "batch_num_tokens": 8048, "epoch": 1.48062, "grad_norm": 0.12825018167495728, "learning_rate": 1.9092050688969736e-06, "loss": 4.5674, "step": 1141 }, { "batch_num_effect_tokens": 7974, "batch_num_samples": 19, "batch_num_tokens": 8192, "epoch": 1.48191, "grad_norm": 0.12709911167621613, "learning_rate": 1.9003044354852634e-06, "loss": 4.3623, "step": 1142 }, { "batch_num_effect_tokens": 8006, "batch_num_samples": 14, "batch_num_tokens": 8191, "epoch": 1.48321, "grad_norm": 0.11626652628183365, "learning_rate": 1.8914197275392444e-06, "loss": 4.0283, "step": 1143 }, { "batch_num_effect_tokens": 7904, "batch_num_samples": 18, "batch_num_tokens": 8088, "epoch": 1.48451, "grad_norm": 0.12491626292467117, "learning_rate": 1.8825509907063328e-06, "loss": 4.376, "step": 1144 }, { "batch_num_effect_tokens": 8040, "batch_num_samples": 16, "batch_num_tokens": 8188, "epoch": 1.48581, "grad_norm": 0.1228693351149559, "learning_rate": 1.8736982705519013e-06, "loss": 4.1221, "step": 1145 }, { "batch_num_effect_tokens": 8032, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.4871, "grad_norm": 0.13609446585178375, "learning_rate": 1.8648616125590218e-06, "loss": 3.9092, "step": 1146 }, { "batch_num_effect_tokens": 8029, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.4884, "grad_norm": 0.13068844377994537, "learning_rate": 1.8560410621282543e-06, "loss": 4.2646, "step": 1147 }, { "batch_num_effect_tokens": 7931, "batch_num_samples": 18, "batch_num_tokens": 8086, "epoch": 1.4897, "grad_norm": 0.12820830941200256, "learning_rate": 1.8472366645773892e-06, "loss": 4.3457, "step": 1148 }, { "batch_num_effect_tokens": 8061, "batch_num_samples": 13, "batch_num_tokens": 8188, "epoch": 1.491, "grad_norm": 0.12200096249580383, "learning_rate": 1.8384484651412338e-06, "loss": 4.3672, "step": 1149 }, { "batch_num_effect_tokens": 8011, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.4923, "grad_norm": 0.13038086891174316, "learning_rate": 1.829676508971377e-06, "loss": 4.2734, "step": 1150 }, { "batch_num_effect_tokens": 7874, "batch_num_samples": 21, "batch_num_tokens": 8091, "epoch": 1.49359, "grad_norm": 0.11553595215082169, "learning_rate": 1.8209208411359485e-06, "loss": 4.3574, "step": 1151 }, { "batch_num_effect_tokens": 8062, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.49489, "grad_norm": 0.12026538699865341, "learning_rate": 1.8121815066193944e-06, "loss": 4.4014, "step": 1152 }, { "batch_num_effect_tokens": 8058, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.49619, "grad_norm": 0.12895485758781433, "learning_rate": 1.8034585503222441e-06, "loss": 4.7461, "step": 1153 }, { "batch_num_effect_tokens": 8057, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.49749, "grad_norm": 0.1211390346288681, "learning_rate": 1.7947520170608774e-06, "loss": 4.3555, "step": 1154 }, { "batch_num_effect_tokens": 8015, "batch_num_samples": 14, "batch_num_tokens": 8182, "epoch": 1.49878, "grad_norm": 0.1284831017255783, "learning_rate": 1.7860619515673034e-06, "loss": 4.2139, "step": 1155 }, { "batch_num_effect_tokens": 7974, "batch_num_samples": 15, "batch_num_tokens": 8098, "epoch": 1.50008, "grad_norm": 0.12839344143867493, "learning_rate": 1.7773883984889178e-06, "loss": 4.4795, "step": 1156 }, { "batch_num_effect_tokens": 7808, "batch_num_samples": 24, "batch_num_tokens": 8052, "epoch": 1.50138, "grad_norm": 0.13333484530448914, "learning_rate": 1.7687314023882806e-06, "loss": 4.5332, "step": 1157 }, { "batch_num_effect_tokens": 7969, "batch_num_samples": 15, "batch_num_tokens": 8156, "epoch": 1.50268, "grad_norm": 0.1345791220664978, "learning_rate": 1.760091007742888e-06, "loss": 4.5273, "step": 1158 }, { "batch_num_effect_tokens": 8054, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.50397, "grad_norm": 0.12442679703235626, "learning_rate": 1.7514672589449378e-06, "loss": 4.248, "step": 1159 }, { "batch_num_effect_tokens": 7963, "batch_num_samples": 14, "batch_num_tokens": 8128, "epoch": 1.50527, "grad_norm": 0.1204453706741333, "learning_rate": 1.7428602003011136e-06, "loss": 4.6221, "step": 1160 }, { "batch_num_effect_tokens": 8049, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.50657, "grad_norm": 0.13553136587142944, "learning_rate": 1.734269876032344e-06, "loss": 4.1123, "step": 1161 }, { "batch_num_effect_tokens": 8033, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.50787, "grad_norm": 0.13123448193073273, "learning_rate": 1.7256963302735752e-06, "loss": 4.3574, "step": 1162 }, { "batch_num_effect_tokens": 8039, "batch_num_samples": 14, "batch_num_tokens": 8173, "epoch": 1.50916, "grad_norm": 0.12561143934726715, "learning_rate": 1.7171396070735602e-06, "loss": 4.0078, "step": 1163 }, { "batch_num_effect_tokens": 8063, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.51046, "grad_norm": 0.13120131194591522, "learning_rate": 1.7085997503946144e-06, "loss": 4.377, "step": 1164 }, { "batch_num_effect_tokens": 8056, "batch_num_samples": 14, "batch_num_tokens": 8191, "epoch": 1.51176, "grad_norm": 0.1411786675453186, "learning_rate": 1.7000768041124038e-06, "loss": 4.5732, "step": 1165 }, { "batch_num_effect_tokens": 7977, "batch_num_samples": 14, "batch_num_tokens": 8152, "epoch": 1.51306, "grad_norm": 0.12541553378105164, "learning_rate": 1.6915708120157042e-06, "loss": 4.3467, "step": 1166 }, { "batch_num_effect_tokens": 7889, "batch_num_samples": 25, "batch_num_tokens": 8163, "epoch": 1.51436, "grad_norm": 0.12473280727863312, "learning_rate": 1.6830818178061897e-06, "loss": 4.1572, "step": 1167 }, { "batch_num_effect_tokens": 8015, "batch_num_samples": 19, "batch_num_tokens": 8192, "epoch": 1.51565, "grad_norm": 0.14233461022377014, "learning_rate": 1.6746098650982072e-06, "loss": 4.6309, "step": 1168 }, { "batch_num_effect_tokens": 8038, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.51695, "grad_norm": 0.12509174644947052, "learning_rate": 1.6661549974185426e-06, "loss": 4.5986, "step": 1169 }, { "batch_num_effect_tokens": 8063, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.51825, "grad_norm": 0.13657966256141663, "learning_rate": 1.657717258206205e-06, "loss": 4.9082, "step": 1170 }, { "batch_num_effect_tokens": 8046, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.51955, "grad_norm": 0.12476794421672821, "learning_rate": 1.6492966908122033e-06, "loss": 4.1396, "step": 1171 }, { "batch_num_effect_tokens": 7912, "batch_num_samples": 26, "batch_num_tokens": 8139, "epoch": 1.52084, "grad_norm": 0.12963926792144775, "learning_rate": 1.6408933384993187e-06, "loss": 4.4238, "step": 1172 }, { "batch_num_effect_tokens": 8027, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.52214, "grad_norm": 0.12490490823984146, "learning_rate": 1.63250724444189e-06, "loss": 4.6895, "step": 1173 }, { "batch_num_effect_tokens": 7991, "batch_num_samples": 14, "batch_num_tokens": 8121, "epoch": 1.52344, "grad_norm": 0.1316630095243454, "learning_rate": 1.6241384517255854e-06, "loss": 4.5205, "step": 1174 }, { "batch_num_effect_tokens": 7946, "batch_num_samples": 15, "batch_num_tokens": 8128, "epoch": 1.52474, "grad_norm": 0.1253131479024887, "learning_rate": 1.6157870033471785e-06, "loss": 4.4424, "step": 1175 }, { "batch_num_effect_tokens": 7782, "batch_num_samples": 26, "batch_num_tokens": 8010, "epoch": 1.52603, "grad_norm": 0.12522853910923004, "learning_rate": 1.6074529422143398e-06, "loss": 4.3564, "step": 1176 }, { "batch_num_effect_tokens": 8050, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.52733, "grad_norm": 0.12250207364559174, "learning_rate": 1.5991363111454023e-06, "loss": 4.7217, "step": 1177 }, { "batch_num_effect_tokens": 8074, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.52863, "grad_norm": 0.12559637427330017, "learning_rate": 1.5908371528691553e-06, "loss": 4.4688, "step": 1178 }, { "batch_num_effect_tokens": 7855, "batch_num_samples": 17, "batch_num_tokens": 8023, "epoch": 1.52993, "grad_norm": 0.13404156267642975, "learning_rate": 1.5825555100246066e-06, "loss": 4.5176, "step": 1179 }, { "batch_num_effect_tokens": 8030, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.53122, "grad_norm": 0.12473981827497482, "learning_rate": 1.5742914251607794e-06, "loss": 4.2764, "step": 1180 }, { "batch_num_effect_tokens": 7951, "batch_num_samples": 14, "batch_num_tokens": 8092, "epoch": 1.53252, "grad_norm": 0.1322951763868332, "learning_rate": 1.5660449407364919e-06, "loss": 4.5439, "step": 1181 }, { "batch_num_effect_tokens": 7907, "batch_num_samples": 20, "batch_num_tokens": 8128, "epoch": 1.53382, "grad_norm": 0.13409925997257233, "learning_rate": 1.5578160991201313e-06, "loss": 4.3096, "step": 1182 }, { "batch_num_effect_tokens": 8038, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.53512, "grad_norm": 0.11892995983362198, "learning_rate": 1.549604942589441e-06, "loss": 4.3701, "step": 1183 }, { "batch_num_effect_tokens": 8046, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.53642, "grad_norm": 0.13523681461811066, "learning_rate": 1.5414115133313029e-06, "loss": 4.2051, "step": 1184 }, { "batch_num_effect_tokens": 8057, "batch_num_samples": 17, "batch_num_tokens": 8191, "epoch": 1.53771, "grad_norm": 0.12341441959142685, "learning_rate": 1.5332358534415192e-06, "loss": 4.4072, "step": 1185 }, { "batch_num_effect_tokens": 8033, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.53901, "grad_norm": 0.12092790752649307, "learning_rate": 1.5250780049246028e-06, "loss": 4.498, "step": 1186 }, { "batch_num_effect_tokens": 8049, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.54031, "grad_norm": 0.12938234210014343, "learning_rate": 1.516938009693551e-06, "loss": 4.6123, "step": 1187 }, { "batch_num_effect_tokens": 7905, "batch_num_samples": 14, "batch_num_tokens": 8088, "epoch": 1.54161, "grad_norm": 0.1272503286600113, "learning_rate": 1.5088159095696365e-06, "loss": 4.2402, "step": 1188 }, { "batch_num_effect_tokens": 7971, "batch_num_samples": 21, "batch_num_tokens": 8192, "epoch": 1.5429, "grad_norm": 0.14063680171966553, "learning_rate": 1.500711746282192e-06, "loss": 4.2334, "step": 1189 }, { "batch_num_effect_tokens": 8040, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.5442, "grad_norm": 0.11090775579214096, "learning_rate": 1.4926255614683931e-06, "loss": 4.2617, "step": 1190 }, { "batch_num_effect_tokens": 7936, "batch_num_samples": 25, "batch_num_tokens": 8162, "epoch": 1.5455, "grad_norm": 0.12592093646526337, "learning_rate": 1.484557396673052e-06, "loss": 4.1523, "step": 1191 }, { "batch_num_effect_tokens": 8044, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.5468, "grad_norm": 0.12234609574079514, "learning_rate": 1.4765072933483949e-06, "loss": 4.2314, "step": 1192 }, { "batch_num_effect_tokens": 8007, "batch_num_samples": 15, "batch_num_tokens": 8168, "epoch": 1.54809, "grad_norm": 0.11718329787254333, "learning_rate": 1.468475292853847e-06, "loss": 4.0645, "step": 1193 }, { "batch_num_effect_tokens": 7944, "batch_num_samples": 16, "batch_num_tokens": 8133, "epoch": 1.54939, "grad_norm": 0.13274738192558289, "learning_rate": 1.4604614364558372e-06, "loss": 4.4199, "step": 1194 }, { "batch_num_effect_tokens": 8022, "batch_num_samples": 17, "batch_num_tokens": 8166, "epoch": 1.55069, "grad_norm": 0.12475960701704025, "learning_rate": 1.4524657653275653e-06, "loss": 4.043, "step": 1195 }, { "batch_num_effect_tokens": 8041, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.55199, "grad_norm": 0.12034016847610474, "learning_rate": 1.444488320548807e-06, "loss": 4.0186, "step": 1196 }, { "batch_num_effect_tokens": 8056, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.55328, "grad_norm": 0.12400320917367935, "learning_rate": 1.4365291431056871e-06, "loss": 4.248, "step": 1197 }, { "batch_num_effect_tokens": 8045, "batch_num_samples": 14, "batch_num_tokens": 8173, "epoch": 1.55458, "grad_norm": 0.12639890611171722, "learning_rate": 1.4285882738904822e-06, "loss": 4.6426, "step": 1198 }, { "batch_num_effect_tokens": 7946, "batch_num_samples": 17, "batch_num_tokens": 8123, "epoch": 1.55588, "grad_norm": 0.12510213255882263, "learning_rate": 1.4206657537014078e-06, "loss": 4.248, "step": 1199 }, { "batch_num_effect_tokens": 8031, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.55718, "grad_norm": 0.11327057331800461, "learning_rate": 1.4127616232424042e-06, "loss": 3.8843, "step": 1200 }, { "batch_num_effect_tokens": 8006, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.55848, "grad_norm": 0.13065361976623535, "learning_rate": 1.404875923122928e-06, "loss": 4.3848, "step": 1201 }, { "batch_num_effect_tokens": 8008, "batch_num_samples": 18, "batch_num_tokens": 8170, "epoch": 1.55977, "grad_norm": 0.1324455589056015, "learning_rate": 1.3970086938577492e-06, "loss": 4.3975, "step": 1202 }, { "batch_num_effect_tokens": 8044, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.56107, "grad_norm": 0.12078768759965897, "learning_rate": 1.389159975866734e-06, "loss": 4.0654, "step": 1203 }, { "batch_num_effect_tokens": 8037, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.56237, "grad_norm": 0.1465671807527542, "learning_rate": 1.3813298094746491e-06, "loss": 4.2598, "step": 1204 }, { "batch_num_effect_tokens": 7993, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.56367, "grad_norm": 0.13225287199020386, "learning_rate": 1.3735182349109428e-06, "loss": 4.4785, "step": 1205 }, { "batch_num_effect_tokens": 8042, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.56496, "grad_norm": 0.1285356879234314, "learning_rate": 1.3657252923095437e-06, "loss": 4.54, "step": 1206 }, { "batch_num_effect_tokens": 7879, "batch_num_samples": 18, "batch_num_tokens": 8090, "epoch": 1.56626, "grad_norm": 0.13167142868041992, "learning_rate": 1.357951021708655e-06, "loss": 4.0176, "step": 1207 }, { "batch_num_effect_tokens": 8026, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.56756, "grad_norm": 0.13281382620334625, "learning_rate": 1.3501954630505464e-06, "loss": 4.2588, "step": 1208 }, { "batch_num_effect_tokens": 7945, "batch_num_samples": 14, "batch_num_tokens": 8079, "epoch": 1.56886, "grad_norm": 0.12919877469539642, "learning_rate": 1.342458656181354e-06, "loss": 4.5977, "step": 1209 }, { "batch_num_effect_tokens": 8021, "batch_num_samples": 14, "batch_num_tokens": 8191, "epoch": 1.57015, "grad_norm": 0.13256850838661194, "learning_rate": 1.3347406408508695e-06, "loss": 4.1484, "step": 1210 }, { "batch_num_effect_tokens": 8055, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.57145, "grad_norm": 0.13196738064289093, "learning_rate": 1.3270414567123342e-06, "loss": 4.1807, "step": 1211 }, { "batch_num_effect_tokens": 8025, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 1.57275, "grad_norm": 0.1331268548965454, "learning_rate": 1.3193611433222465e-06, "loss": 4.4814, "step": 1212 }, { "batch_num_effect_tokens": 8024, "batch_num_samples": 14, "batch_num_tokens": 8182, "epoch": 1.57405, "grad_norm": 0.1267508864402771, "learning_rate": 1.311699740140146e-06, "loss": 4.3477, "step": 1213 }, { "batch_num_effect_tokens": 8057, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.57534, "grad_norm": 0.1376553326845169, "learning_rate": 1.3040572865284234e-06, "loss": 4.5098, "step": 1214 }, { "batch_num_effect_tokens": 7972, "batch_num_samples": 14, "batch_num_tokens": 8100, "epoch": 1.57664, "grad_norm": 0.13548849523067474, "learning_rate": 1.2964338217521021e-06, "loss": 4.3359, "step": 1215 }, { "batch_num_effect_tokens": 8069, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.57794, "grad_norm": 0.13228566944599152, "learning_rate": 1.2888293849786503e-06, "loss": 4.4883, "step": 1216 }, { "batch_num_effect_tokens": 8041, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.57924, "grad_norm": 0.1392965018749237, "learning_rate": 1.2812440152777773e-06, "loss": 4.5889, "step": 1217 }, { "batch_num_effect_tokens": 8027, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.58054, "grad_norm": 0.12365755438804626, "learning_rate": 1.2736777516212267e-06, "loss": 4.541, "step": 1218 }, { "batch_num_effect_tokens": 7996, "batch_num_samples": 20, "batch_num_tokens": 8192, "epoch": 1.58183, "grad_norm": 0.12618915736675262, "learning_rate": 1.2661306328825818e-06, "loss": 4.1309, "step": 1219 }, { "batch_num_effect_tokens": 8035, "batch_num_samples": 14, "batch_num_tokens": 8163, "epoch": 1.58313, "grad_norm": 0.12724487483501434, "learning_rate": 1.258602697837063e-06, "loss": 4.7012, "step": 1220 }, { "batch_num_effect_tokens": 8034, "batch_num_samples": 14, "batch_num_tokens": 8189, "epoch": 1.58443, "grad_norm": 0.12507867813110352, "learning_rate": 1.2510939851613285e-06, "loss": 4.6133, "step": 1221 }, { "batch_num_effect_tokens": 7940, "batch_num_samples": 14, "batch_num_tokens": 8190, "epoch": 1.58573, "grad_norm": 0.124232716858387, "learning_rate": 1.2436045334332824e-06, "loss": 4.6475, "step": 1222 }, { "batch_num_effect_tokens": 7957, "batch_num_samples": 22, "batch_num_tokens": 8192, "epoch": 1.58702, "grad_norm": 0.13959231972694397, "learning_rate": 1.2361343811318665e-06, "loss": 4.6484, "step": 1223 }, { "batch_num_effect_tokens": 8044, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.58832, "grad_norm": 0.12538942694664001, "learning_rate": 1.2286835666368623e-06, "loss": 4.4883, "step": 1224 }, { "batch_num_effect_tokens": 8027, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.58962, "grad_norm": 0.12452027201652527, "learning_rate": 1.2212521282287093e-06, "loss": 4.0713, "step": 1225 }, { "batch_num_effect_tokens": 8045, "batch_num_samples": 14, "batch_num_tokens": 8191, "epoch": 1.59092, "grad_norm": 0.13691964745521545, "learning_rate": 1.2138401040882874e-06, "loss": 4.5684, "step": 1226 }, { "batch_num_effect_tokens": 7894, "batch_num_samples": 24, "batch_num_tokens": 8096, "epoch": 1.59221, "grad_norm": 0.12278582155704498, "learning_rate": 1.20644753229674e-06, "loss": 4.1816, "step": 1227 }, { "batch_num_effect_tokens": 8032, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.59351, "grad_norm": 0.12262444198131561, "learning_rate": 1.1990744508352604e-06, "loss": 4.0322, "step": 1228 }, { "batch_num_effect_tokens": 7945, "batch_num_samples": 15, "batch_num_tokens": 8106, "epoch": 1.59481, "grad_norm": 0.13073332607746124, "learning_rate": 1.191720897584908e-06, "loss": 4.5508, "step": 1229 }, { "batch_num_effect_tokens": 7928, "batch_num_samples": 17, "batch_num_tokens": 8113, "epoch": 1.59611, "grad_norm": 0.13038553297519684, "learning_rate": 1.1843869103264173e-06, "loss": 4.6455, "step": 1230 }, { "batch_num_effect_tokens": 8026, "batch_num_samples": 17, "batch_num_tokens": 8187, "epoch": 1.5974, "grad_norm": 0.11809444427490234, "learning_rate": 1.1770725267399892e-06, "loss": 4.1328, "step": 1231 }, { "batch_num_effect_tokens": 7988, "batch_num_samples": 22, "batch_num_tokens": 8189, "epoch": 1.5987, "grad_norm": 0.11818146705627441, "learning_rate": 1.1697777844051105e-06, "loss": 4.2168, "step": 1232 }, { "batch_num_effect_tokens": 7988, "batch_num_samples": 22, "batch_num_tokens": 8189, "epoch": 1.5987, "eval_eval_loss": 0.5610187649726868, "eval_eval_runtime": 115.0388, "eval_eval_samples_per_second": 43.464, "eval_eval_steps_per_second": 2.721, "step": 1232 }, { "batch_num_effect_tokens": 8012, "batch_num_samples": 21, "batch_num_tokens": 8192, "epoch": 1.6, "grad_norm": 0.12877851724624634, "learning_rate": 1.1625027208003547e-06, "loss": 4.4346, "step": 1233 }, { "batch_num_effect_tokens": 7934, "batch_num_samples": 15, "batch_num_tokens": 8104, "epoch": 1.6013, "grad_norm": 0.13281628489494324, "learning_rate": 1.1552473733031893e-06, "loss": 4.0088, "step": 1234 }, { "batch_num_effect_tokens": 7990, "batch_num_samples": 15, "batch_num_tokens": 8182, "epoch": 1.6026, "grad_norm": 0.133657768368721, "learning_rate": 1.148011779189791e-06, "loss": 4.5449, "step": 1235 }, { "batch_num_effect_tokens": 8052, "batch_num_samples": 14, "batch_num_tokens": 8191, "epoch": 1.60389, "grad_norm": 0.11349672079086304, "learning_rate": 1.1407959756348424e-06, "loss": 4.1943, "step": 1236 }, { "batch_num_effect_tokens": 7980, "batch_num_samples": 21, "batch_num_tokens": 8192, "epoch": 1.60519, "grad_norm": 0.14057059586048126, "learning_rate": 1.133599999711349e-06, "loss": 4.209, "step": 1237 }, { "batch_num_effect_tokens": 8055, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.60649, "grad_norm": 0.13591386377811432, "learning_rate": 1.1264238883904483e-06, "loss": 4.25, "step": 1238 }, { "batch_num_effect_tokens": 7949, "batch_num_samples": 17, "batch_num_tokens": 8096, "epoch": 1.60779, "grad_norm": 0.1403154879808426, "learning_rate": 1.1192676785412154e-06, "loss": 4.5283, "step": 1239 }, { "batch_num_effect_tokens": 8032, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.60908, "grad_norm": 0.12656299769878387, "learning_rate": 1.112131406930481e-06, "loss": 4.4707, "step": 1240 }, { "batch_num_effect_tokens": 8030, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.61038, "grad_norm": 0.1420130729675293, "learning_rate": 1.1050151102226369e-06, "loss": 4.4678, "step": 1241 }, { "batch_num_effect_tokens": 8002, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.61168, "grad_norm": 0.12833444774150848, "learning_rate": 1.097918824979442e-06, "loss": 4.2129, "step": 1242 }, { "batch_num_effect_tokens": 7962, "batch_num_samples": 20, "batch_num_tokens": 8192, "epoch": 1.61298, "grad_norm": 0.13786719739437103, "learning_rate": 1.0908425876598512e-06, "loss": 4.3408, "step": 1243 }, { "batch_num_effect_tokens": 7948, "batch_num_samples": 14, "batch_num_tokens": 8116, "epoch": 1.61427, "grad_norm": 0.13015086948871613, "learning_rate": 1.0837864346198117e-06, "loss": 4.4014, "step": 1244 }, { "batch_num_effect_tokens": 8040, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.61557, "grad_norm": 0.1163618341088295, "learning_rate": 1.0767504021120884e-06, "loss": 3.9277, "step": 1245 }, { "batch_num_effect_tokens": 7856, "batch_num_samples": 17, "batch_num_tokens": 8040, "epoch": 1.61687, "grad_norm": 0.12296216189861298, "learning_rate": 1.0697345262860638e-06, "loss": 4.5225, "step": 1246 }, { "batch_num_effect_tokens": 7929, "batch_num_samples": 23, "batch_num_tokens": 8136, "epoch": 1.61817, "grad_norm": 0.1348690241575241, "learning_rate": 1.062738843187565e-06, "loss": 4.3125, "step": 1247 }, { "batch_num_effect_tokens": 8043, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.61946, "grad_norm": 0.12651218473911285, "learning_rate": 1.0557633887586765e-06, "loss": 4.2842, "step": 1248 }, { "batch_num_effect_tokens": 8025, "batch_num_samples": 14, "batch_num_tokens": 8191, "epoch": 1.62076, "grad_norm": 0.12578874826431274, "learning_rate": 1.0488081988375493e-06, "loss": 4.0039, "step": 1249 }, { "batch_num_effect_tokens": 8064, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.62206, "grad_norm": 0.1441895216703415, "learning_rate": 1.04187330915822e-06, "loss": 4.1748, "step": 1250 }, { "batch_num_effect_tokens": 7945, "batch_num_samples": 19, "batch_num_tokens": 8146, "epoch": 1.62336, "grad_norm": 0.13080251216888428, "learning_rate": 1.0349587553504298e-06, "loss": 4.6592, "step": 1251 }, { "batch_num_effect_tokens": 7926, "batch_num_samples": 15, "batch_num_tokens": 8086, "epoch": 1.62466, "grad_norm": 0.1321391612291336, "learning_rate": 1.0280645729394368e-06, "loss": 4.3711, "step": 1252 }, { "batch_num_effect_tokens": 7936, "batch_num_samples": 14, "batch_num_tokens": 8083, "epoch": 1.62595, "grad_norm": 0.14101214706897736, "learning_rate": 1.0211907973458391e-06, "loss": 4.248, "step": 1253 }, { "batch_num_effect_tokens": 8074, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.62725, "grad_norm": 0.13918721675872803, "learning_rate": 1.0143374638853892e-06, "loss": 4.5293, "step": 1254 }, { "batch_num_effect_tokens": 8058, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.62855, "grad_norm": 0.1319819986820221, "learning_rate": 1.0075046077688067e-06, "loss": 4.3057, "step": 1255 }, { "batch_num_effect_tokens": 7918, "batch_num_samples": 19, "batch_num_tokens": 8101, "epoch": 1.62985, "grad_norm": 0.11569049954414368, "learning_rate": 1.0006922641016131e-06, "loss": 4.0488, "step": 1256 }, { "batch_num_effect_tokens": 8029, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.63114, "grad_norm": 0.13643518090248108, "learning_rate": 9.939004678839348e-07, "loss": 4.96, "step": 1257 }, { "batch_num_effect_tokens": 8010, "batch_num_samples": 21, "batch_num_tokens": 8192, "epoch": 1.63244, "grad_norm": 0.1374160200357437, "learning_rate": 9.871292540103377e-07, "loss": 4.2441, "step": 1258 }, { "batch_num_effect_tokens": 8060, "batch_num_samples": 14, "batch_num_tokens": 8191, "epoch": 1.63374, "grad_norm": 0.12853796780109406, "learning_rate": 9.803786572696321e-07, "loss": 4.377, "step": 1259 }, { "batch_num_effect_tokens": 7908, "batch_num_samples": 19, "batch_num_tokens": 8086, "epoch": 1.63504, "grad_norm": 0.14067308604717255, "learning_rate": 9.73648712344707e-07, "loss": 4.4121, "step": 1260 }, { "batch_num_effect_tokens": 8050, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.63633, "grad_norm": 0.1275281310081482, "learning_rate": 9.6693945381235e-07, "loss": 4.5293, "step": 1261 }, { "batch_num_effect_tokens": 8034, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 1.63763, "grad_norm": 0.12822289764881134, "learning_rate": 9.602509161430628e-07, "loss": 4.0166, "step": 1262 }, { "batch_num_effect_tokens": 7985, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.63893, "grad_norm": 0.13135388493537903, "learning_rate": 9.53583133700891e-07, "loss": 4.1982, "step": 1263 }, { "batch_num_effect_tokens": 7980, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.64023, "grad_norm": 0.13726817071437836, "learning_rate": 9.469361407432431e-07, "loss": 4.4258, "step": 1264 }, { "batch_num_effect_tokens": 8024, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.64152, "grad_norm": 0.1434670388698578, "learning_rate": 9.403099714207175e-07, "loss": 4.5791, "step": 1265 }, { "batch_num_effect_tokens": 7957, "batch_num_samples": 14, "batch_num_tokens": 8110, "epoch": 1.64282, "grad_norm": 0.12421970069408417, "learning_rate": 9.337046597769272e-07, "loss": 4.0225, "step": 1266 }, { "batch_num_effect_tokens": 8022, "batch_num_samples": 14, "batch_num_tokens": 8142, "epoch": 1.64412, "grad_norm": 0.13223737478256226, "learning_rate": 9.271202397483214e-07, "loss": 3.9521, "step": 1267 }, { "batch_num_effect_tokens": 8023, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 1.64542, "grad_norm": 0.12750263512134552, "learning_rate": 9.205567451640151e-07, "loss": 4.0049, "step": 1268 }, { "batch_num_effect_tokens": 8065, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.64672, "grad_norm": 0.12239693850278854, "learning_rate": 9.140142097456117e-07, "loss": 4.2539, "step": 1269 }, { "batch_num_effect_tokens": 8007, "batch_num_samples": 19, "batch_num_tokens": 8179, "epoch": 1.64801, "grad_norm": 0.1410730630159378, "learning_rate": 9.074926671070322e-07, "loss": 4.3662, "step": 1270 }, { "batch_num_effect_tokens": 7938, "batch_num_samples": 18, "batch_num_tokens": 8085, "epoch": 1.64931, "grad_norm": 0.12114191055297852, "learning_rate": 9.009921507543445e-07, "loss": 4.6211, "step": 1271 }, { "batch_num_effect_tokens": 7922, "batch_num_samples": 14, "batch_num_tokens": 8092, "epoch": 1.65061, "grad_norm": 0.12979581952095032, "learning_rate": 8.945126940855864e-07, "loss": 4.6465, "step": 1272 }, { "batch_num_effect_tokens": 8024, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.65191, "grad_norm": 0.12733100354671478, "learning_rate": 8.880543303905931e-07, "loss": 4.082, "step": 1273 }, { "batch_num_effect_tokens": 7917, "batch_num_samples": 16, "batch_num_tokens": 8089, "epoch": 1.6532, "grad_norm": 0.11588294804096222, "learning_rate": 8.816170928508367e-07, "loss": 4.1299, "step": 1274 }, { "batch_num_effect_tokens": 8001, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.6545, "grad_norm": 0.11835164576768875, "learning_rate": 8.752010145392408e-07, "loss": 4.1543, "step": 1275 }, { "batch_num_effect_tokens": 8027, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.6558, "grad_norm": 0.13759027421474457, "learning_rate": 8.688061284200266e-07, "loss": 4.5654, "step": 1276 }, { "batch_num_effect_tokens": 8030, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.6571, "grad_norm": 0.13586406409740448, "learning_rate": 8.624324673485252e-07, "loss": 4.3682, "step": 1277 }, { "batch_num_effect_tokens": 8036, "batch_num_samples": 14, "batch_num_tokens": 8164, "epoch": 1.65839, "grad_norm": 0.13176386058330536, "learning_rate": 8.560800640710248e-07, "loss": 4.3643, "step": 1278 }, { "batch_num_effect_tokens": 8035, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.65969, "grad_norm": 0.1281612068414688, "learning_rate": 8.497489512245971e-07, "loss": 4.0146, "step": 1279 }, { "batch_num_effect_tokens": 7869, "batch_num_samples": 15, "batch_num_tokens": 8086, "epoch": 1.66099, "grad_norm": 0.11786817759275436, "learning_rate": 8.434391613369258e-07, "loss": 3.9717, "step": 1280 }, { "batch_num_effect_tokens": 8016, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 1.66229, "grad_norm": 0.13192172348499298, "learning_rate": 8.371507268261436e-07, "loss": 4.3145, "step": 1281 }, { "batch_num_effect_tokens": 8052, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.66358, "grad_norm": 0.13585922122001648, "learning_rate": 8.308836800006648e-07, "loss": 4.2051, "step": 1282 }, { "batch_num_effect_tokens": 8044, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.66488, "grad_norm": 0.13544043898582458, "learning_rate": 8.246380530590175e-07, "loss": 4.2764, "step": 1283 }, { "batch_num_effect_tokens": 7965, "batch_num_samples": 15, "batch_num_tokens": 8110, "epoch": 1.66618, "grad_norm": 0.1333230584859848, "learning_rate": 8.184138780896839e-07, "loss": 4.2051, "step": 1284 }, { "batch_num_effect_tokens": 7949, "batch_num_samples": 14, "batch_num_tokens": 8088, "epoch": 1.66748, "grad_norm": 0.1322176456451416, "learning_rate": 8.122111870709287e-07, "loss": 4.2715, "step": 1285 }, { "batch_num_effect_tokens": 7749, "batch_num_samples": 30, "batch_num_tokens": 8018, "epoch": 1.66878, "grad_norm": 0.13512325286865234, "learning_rate": 8.060300118706327e-07, "loss": 3.9844, "step": 1286 }, { "batch_num_effect_tokens": 7993, "batch_num_samples": 14, "batch_num_tokens": 8119, "epoch": 1.67007, "grad_norm": 0.13298512995243073, "learning_rate": 7.99870384246143e-07, "loss": 4.3496, "step": 1287 }, { "batch_num_effect_tokens": 8030, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.67137, "grad_norm": 0.134169802069664, "learning_rate": 7.937323358440935e-07, "loss": 4.5732, "step": 1288 }, { "batch_num_effect_tokens": 8047, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.67267, "grad_norm": 0.139683797955513, "learning_rate": 7.876158982002552e-07, "loss": 4.2637, "step": 1289 }, { "batch_num_effect_tokens": 8037, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.67397, "grad_norm": 0.13284529745578766, "learning_rate": 7.815211027393616e-07, "loss": 4.3613, "step": 1290 }, { "batch_num_effect_tokens": 8020, "batch_num_samples": 17, "batch_num_tokens": 8165, "epoch": 1.67526, "grad_norm": 0.1360265165567398, "learning_rate": 7.754479807749571e-07, "loss": 3.9727, "step": 1291 }, { "batch_num_effect_tokens": 7988, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.67656, "grad_norm": 0.13193875551223755, "learning_rate": 7.693965635092365e-07, "loss": 4.5273, "step": 1292 }, { "batch_num_effect_tokens": 8002, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.67786, "grad_norm": 0.1406845897436142, "learning_rate": 7.633668820328765e-07, "loss": 4.4824, "step": 1293 }, { "batch_num_effect_tokens": 7877, "batch_num_samples": 18, "batch_num_tokens": 8100, "epoch": 1.67916, "grad_norm": 0.12547780573368073, "learning_rate": 7.573589673248833e-07, "loss": 4.3389, "step": 1294 }, { "batch_num_effect_tokens": 8022, "batch_num_samples": 15, "batch_num_tokens": 8175, "epoch": 1.68045, "grad_norm": 0.12817974388599396, "learning_rate": 7.513728502524286e-07, "loss": 4.0254, "step": 1295 }, { "batch_num_effect_tokens": 8007, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.68175, "grad_norm": 0.1354372501373291, "learning_rate": 7.454085615706951e-07, "loss": 4.4277, "step": 1296 }, { "batch_num_effect_tokens": 8031, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.68305, "grad_norm": 0.12303349375724792, "learning_rate": 7.394661319227175e-07, "loss": 4.6855, "step": 1297 }, { "batch_num_effect_tokens": 8019, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.68435, "grad_norm": 0.11604474484920502, "learning_rate": 7.33545591839222e-07, "loss": 4.2285, "step": 1298 }, { "batch_num_effect_tokens": 7995, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 1.68564, "grad_norm": 0.12446524202823639, "learning_rate": 7.276469717384726e-07, "loss": 4.1304, "step": 1299 }, { "batch_num_effect_tokens": 8042, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.68694, "grad_norm": 0.1320941001176834, "learning_rate": 7.217703019261135e-07, "loss": 4.4512, "step": 1300 }, { "batch_num_effect_tokens": 8004, "batch_num_samples": 19, "batch_num_tokens": 8192, "epoch": 1.68824, "grad_norm": 0.13994361460208893, "learning_rate": 7.15915612595014e-07, "loss": 4.5732, "step": 1301 }, { "batch_num_effect_tokens": 7981, "batch_num_samples": 14, "batch_num_tokens": 8121, "epoch": 1.68954, "grad_norm": 0.1323327273130417, "learning_rate": 7.100829338251147e-07, "loss": 4.2119, "step": 1302 }, { "batch_num_effect_tokens": 8014, "batch_num_samples": 15, "batch_num_tokens": 8152, "epoch": 1.69084, "grad_norm": 0.12983821332454681, "learning_rate": 7.042722955832703e-07, "loss": 4.5225, "step": 1303 }, { "batch_num_effect_tokens": 7942, "batch_num_samples": 19, "batch_num_tokens": 8116, "epoch": 1.69213, "grad_norm": 0.13648320734500885, "learning_rate": 6.984837277230927e-07, "loss": 4.3262, "step": 1304 }, { "batch_num_effect_tokens": 8002, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.69343, "grad_norm": 0.13606522977352142, "learning_rate": 6.927172599848092e-07, "loss": 4.4053, "step": 1305 }, { "batch_num_effect_tokens": 8028, "batch_num_samples": 14, "batch_num_tokens": 8185, "epoch": 1.69473, "grad_norm": 0.1341053992509842, "learning_rate": 6.86972921995096e-07, "loss": 4.0566, "step": 1306 }, { "batch_num_effect_tokens": 8042, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.69603, "grad_norm": 0.13478976488113403, "learning_rate": 6.812507432669374e-07, "loss": 4.1006, "step": 1307 }, { "batch_num_effect_tokens": 8054, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.69732, "grad_norm": 0.12561902403831482, "learning_rate": 6.755507531994637e-07, "loss": 4.1797, "step": 1308 }, { "batch_num_effect_tokens": 7813, "batch_num_samples": 17, "batch_num_tokens": 7989, "epoch": 1.69862, "grad_norm": 0.13221710920333862, "learning_rate": 6.698729810778065e-07, "loss": 4.3721, "step": 1309 }, { "batch_num_effect_tokens": 7902, "batch_num_samples": 18, "batch_num_tokens": 8090, "epoch": 1.69992, "grad_norm": 0.1371319591999054, "learning_rate": 6.642174560729514e-07, "loss": 4.084, "step": 1310 }, { "batch_num_effect_tokens": 7862, "batch_num_samples": 29, "batch_num_tokens": 8092, "epoch": 1.70122, "grad_norm": 0.12322220206260681, "learning_rate": 6.585842072415799e-07, "loss": 4.2988, "step": 1311 }, { "batch_num_effect_tokens": 7981, "batch_num_samples": 14, "batch_num_tokens": 8110, "epoch": 1.70251, "grad_norm": 0.13426773250102997, "learning_rate": 6.529732635259234e-07, "loss": 4.7168, "step": 1312 }, { "batch_num_effect_tokens": 8023, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.70381, "grad_norm": 0.12802381813526154, "learning_rate": 6.473846537536183e-07, "loss": 4.4238, "step": 1313 }, { "batch_num_effect_tokens": 8022, "batch_num_samples": 19, "batch_num_tokens": 8189, "epoch": 1.70511, "grad_norm": 0.1425420641899109, "learning_rate": 6.41818406637551e-07, "loss": 4.29, "step": 1314 }, { "batch_num_effect_tokens": 8036, "batch_num_samples": 15, "batch_num_tokens": 8190, "epoch": 1.70641, "grad_norm": 0.13316959142684937, "learning_rate": 6.36274550775719e-07, "loss": 4.2227, "step": 1315 }, { "batch_num_effect_tokens": 8000, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 1.7077, "grad_norm": 0.1335574984550476, "learning_rate": 6.307531146510754e-07, "loss": 4.5928, "step": 1316 }, { "batch_num_effect_tokens": 8053, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.709, "grad_norm": 0.1302870213985443, "learning_rate": 6.252541266313866e-07, "loss": 4.0986, "step": 1317 }, { "batch_num_effect_tokens": 8042, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.7103, "grad_norm": 0.13204161822795868, "learning_rate": 6.197776149690871e-07, "loss": 4.5967, "step": 1318 }, { "batch_num_effect_tokens": 8058, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.7116, "grad_norm": 0.12694908678531647, "learning_rate": 6.143236078011317e-07, "loss": 4.2031, "step": 1319 }, { "batch_num_effect_tokens": 7929, "batch_num_samples": 15, "batch_num_tokens": 8086, "epoch": 1.7129, "grad_norm": 0.12293847650289536, "learning_rate": 6.088921331488568e-07, "loss": 3.9863, "step": 1320 }, { "batch_num_effect_tokens": 8007, "batch_num_samples": 22, "batch_num_tokens": 8192, "epoch": 1.71419, "grad_norm": 0.12406273186206818, "learning_rate": 6.034832189178302e-07, "loss": 4.2266, "step": 1321 }, { "batch_num_effect_tokens": 8033, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.71549, "grad_norm": 0.12104199081659317, "learning_rate": 5.980968928977049e-07, "loss": 4.2158, "step": 1322 }, { "batch_num_effect_tokens": 8025, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.71679, "grad_norm": 0.13361838459968567, "learning_rate": 5.927331827620902e-07, "loss": 4.3867, "step": 1323 }, { "batch_num_effect_tokens": 8051, "batch_num_samples": 14, "batch_num_tokens": 8191, "epoch": 1.71809, "grad_norm": 0.1261344850063324, "learning_rate": 5.873921160683943e-07, "loss": 4.332, "step": 1324 }, { "batch_num_effect_tokens": 8053, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.71938, "grad_norm": 0.127448171377182, "learning_rate": 5.820737202576909e-07, "loss": 4.499, "step": 1325 }, { "batch_num_effect_tokens": 7944, "batch_num_samples": 14, "batch_num_tokens": 8112, "epoch": 1.72068, "grad_norm": 0.12296268343925476, "learning_rate": 5.767780226545766e-07, "loss": 4.0928, "step": 1326 }, { "batch_num_effect_tokens": 8039, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.72198, "grad_norm": 0.1200428232550621, "learning_rate": 5.715050504670288e-07, "loss": 4.5107, "step": 1327 }, { "batch_num_effect_tokens": 8043, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.72328, "grad_norm": 0.1229812353849411, "learning_rate": 5.662548307862714e-07, "loss": 4.6191, "step": 1328 }, { "batch_num_effect_tokens": 8028, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.72457, "grad_norm": 0.1287735551595688, "learning_rate": 5.61027390586626e-07, "loss": 4.6602, "step": 1329 }, { "batch_num_effect_tokens": 8022, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.72587, "grad_norm": 0.12394028156995773, "learning_rate": 5.558227567253832e-07, "loss": 4.3076, "step": 1330 }, { "batch_num_effect_tokens": 8012, "batch_num_samples": 14, "batch_num_tokens": 8149, "epoch": 1.72717, "grad_norm": 0.12008914351463318, "learning_rate": 5.506409559426573e-07, "loss": 4.373, "step": 1331 }, { "batch_num_effect_tokens": 8036, "batch_num_samples": 14, "batch_num_tokens": 8191, "epoch": 1.72847, "grad_norm": 0.12099087238311768, "learning_rate": 5.454820148612533e-07, "loss": 4.0195, "step": 1332 }, { "batch_num_effect_tokens": 7910, "batch_num_samples": 20, "batch_num_tokens": 8096, "epoch": 1.72976, "grad_norm": 0.13755109906196594, "learning_rate": 5.403459599865307e-07, "loss": 4.459, "step": 1333 }, { "batch_num_effect_tokens": 8041, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.73106, "grad_norm": 0.12216237932443619, "learning_rate": 5.352328177062626e-07, "loss": 4.0791, "step": 1334 }, { "batch_num_effect_tokens": 7912, "batch_num_samples": 23, "batch_num_tokens": 8105, "epoch": 1.73236, "grad_norm": 0.14445650577545166, "learning_rate": 5.301426142905019e-07, "loss": 4.2441, "step": 1335 }, { "batch_num_effect_tokens": 8031, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.73366, "grad_norm": 0.12957431375980377, "learning_rate": 5.250753758914506e-07, "loss": 4.7578, "step": 1336 }, { "batch_num_effect_tokens": 7838, "batch_num_samples": 25, "batch_num_tokens": 8114, "epoch": 1.73496, "grad_norm": 0.12466391175985336, "learning_rate": 5.200311285433213e-07, "loss": 4.2861, "step": 1337 }, { "batch_num_effect_tokens": 8027, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 1.73625, "grad_norm": 0.12449593842029572, "learning_rate": 5.15009898162202e-07, "loss": 4.5078, "step": 1338 }, { "batch_num_effect_tokens": 8008, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.73755, "grad_norm": 0.12438397109508514, "learning_rate": 5.100117105459279e-07, "loss": 4.2246, "step": 1339 }, { "batch_num_effect_tokens": 8036, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.73885, "grad_norm": 0.1266915500164032, "learning_rate": 5.050365913739441e-07, "loss": 3.8174, "step": 1340 }, { "batch_num_effect_tokens": 8032, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.74015, "grad_norm": 0.12431465089321136, "learning_rate": 5.000845662071779e-07, "loss": 4.334, "step": 1341 }, { "batch_num_effect_tokens": 8017, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.74144, "grad_norm": 0.12137281149625778, "learning_rate": 4.951556604879049e-07, "loss": 4.001, "step": 1342 }, { "batch_num_effect_tokens": 8057, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.74274, "grad_norm": 0.12117177993059158, "learning_rate": 4.902498995396166e-07, "loss": 4.2422, "step": 1343 }, { "batch_num_effect_tokens": 8002, "batch_num_samples": 19, "batch_num_tokens": 8192, "epoch": 1.74404, "grad_norm": 0.12428870052099228, "learning_rate": 4.853673085668947e-07, "loss": 4.1943, "step": 1344 }, { "batch_num_effect_tokens": 8014, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.74534, "grad_norm": 0.12221790850162506, "learning_rate": 4.80507912655277e-07, "loss": 4.5078, "step": 1345 }, { "batch_num_effect_tokens": 7997, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.74663, "grad_norm": 0.12552009522914886, "learning_rate": 4.75671736771135e-07, "loss": 4.2559, "step": 1346 }, { "batch_num_effect_tokens": 8049, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.74793, "grad_norm": 0.11707472056150436, "learning_rate": 4.7085880576153765e-07, "loss": 3.9307, "step": 1347 }, { "batch_num_effect_tokens": 8031, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.74923, "grad_norm": 0.11228827387094498, "learning_rate": 4.660691443541282e-07, "loss": 4.1211, "step": 1348 }, { "batch_num_effect_tokens": 8019, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.75053, "grad_norm": 0.12909752130508423, "learning_rate": 4.6130277715699777e-07, "loss": 4.2334, "step": 1349 }, { "batch_num_effect_tokens": 8030, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.75182, "grad_norm": 0.1351223587989807, "learning_rate": 4.565597286585555e-07, "loss": 4.5566, "step": 1350 }, { "batch_num_effect_tokens": 7985, "batch_num_samples": 15, "batch_num_tokens": 8167, "epoch": 1.75312, "grad_norm": 0.13169695436954498, "learning_rate": 4.5184002322740784e-07, "loss": 4.4697, "step": 1351 }, { "batch_num_effect_tokens": 8046, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.75442, "grad_norm": 0.12474015355110168, "learning_rate": 4.4714368511222905e-07, "loss": 3.9141, "step": 1352 }, { "batch_num_effect_tokens": 8058, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.75572, "grad_norm": 0.1359989494085312, "learning_rate": 4.4247073844163434e-07, "loss": 4.4863, "step": 1353 }, { "batch_num_effect_tokens": 8040, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.75702, "grad_norm": 0.13669313490390778, "learning_rate": 4.3782120722406565e-07, "loss": 4.3809, "step": 1354 }, { "batch_num_effect_tokens": 8029, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.75831, "grad_norm": 0.13616794347763062, "learning_rate": 4.331951153476588e-07, "loss": 4.0215, "step": 1355 }, { "batch_num_effect_tokens": 8004, "batch_num_samples": 17, "batch_num_tokens": 8191, "epoch": 1.75961, "grad_norm": 0.13854430615901947, "learning_rate": 4.285924865801233e-07, "loss": 4.5312, "step": 1356 }, { "batch_num_effect_tokens": 8053, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.76091, "grad_norm": 0.13246670365333557, "learning_rate": 4.2401334456862344e-07, "loss": 4.4092, "step": 1357 }, { "batch_num_effect_tokens": 8025, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.76221, "grad_norm": 0.12861448526382446, "learning_rate": 4.194577128396521e-07, "loss": 4.2188, "step": 1358 }, { "batch_num_effect_tokens": 8045, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.7635, "grad_norm": 0.1204322949051857, "learning_rate": 4.149256147989139e-07, "loss": 4.2617, "step": 1359 }, { "batch_num_effect_tokens": 8039, "batch_num_samples": 14, "batch_num_tokens": 8189, "epoch": 1.7648, "grad_norm": 0.14009417593479156, "learning_rate": 4.1041707373120354e-07, "loss": 4.0459, "step": 1360 }, { "batch_num_effect_tokens": 8015, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.7661, "grad_norm": 0.13464143872261047, "learning_rate": 4.05932112800283e-07, "loss": 4.4492, "step": 1361 }, { "batch_num_effect_tokens": 7937, "batch_num_samples": 23, "batch_num_tokens": 8192, "epoch": 1.7674, "grad_norm": 0.12960170209407806, "learning_rate": 4.0147075504876844e-07, "loss": 4.3623, "step": 1362 }, { "batch_num_effect_tokens": 8084, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.76869, "grad_norm": 0.1406746506690979, "learning_rate": 3.9703302339800687e-07, "loss": 4.4395, "step": 1363 }, { "batch_num_effect_tokens": 8039, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.76999, "grad_norm": 0.12615273892879486, "learning_rate": 3.9261894064796136e-07, "loss": 4.4746, "step": 1364 }, { "batch_num_effect_tokens": 8056, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.77129, "grad_norm": 0.13323578238487244, "learning_rate": 3.882285294770938e-07, "loss": 4.1318, "step": 1365 }, { "batch_num_effect_tokens": 7836, "batch_num_samples": 17, "batch_num_tokens": 8011, "epoch": 1.77259, "grad_norm": 0.12410898506641388, "learning_rate": 3.8386181244224274e-07, "loss": 4.2305, "step": 1366 }, { "batch_num_effect_tokens": 8017, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.77388, "grad_norm": 0.1302991360425949, "learning_rate": 3.7951881197851816e-07, "loss": 4.3486, "step": 1367 }, { "batch_num_effect_tokens": 7912, "batch_num_samples": 18, "batch_num_tokens": 8081, "epoch": 1.77518, "grad_norm": 0.13446044921875, "learning_rate": 3.751995503991762e-07, "loss": 4.4795, "step": 1368 }, { "batch_num_effect_tokens": 8090, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.77648, "grad_norm": 0.1244499534368515, "learning_rate": 3.709040498955102e-07, "loss": 4.1152, "step": 1369 }, { "batch_num_effect_tokens": 7669, "batch_num_samples": 32, "batch_num_tokens": 7981, "epoch": 1.77778, "grad_norm": 0.13350388407707214, "learning_rate": 3.666323325367344e-07, "loss": 4.3936, "step": 1370 }, { "batch_num_effect_tokens": 7937, "batch_num_samples": 16, "batch_num_tokens": 8108, "epoch": 1.77908, "grad_norm": 0.1298922747373581, "learning_rate": 3.623844202698701e-07, "loss": 4.291, "step": 1371 }, { "batch_num_effect_tokens": 8017, "batch_num_samples": 16, "batch_num_tokens": 8166, "epoch": 1.78037, "grad_norm": 0.12909585237503052, "learning_rate": 3.581603349196372e-07, "loss": 4.3447, "step": 1372 }, { "batch_num_effect_tokens": 8046, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.78167, "grad_norm": 0.1274796426296234, "learning_rate": 3.5396009818833567e-07, "loss": 4.3516, "step": 1373 }, { "batch_num_effect_tokens": 8005, "batch_num_samples": 14, "batch_num_tokens": 8184, "epoch": 1.78297, "grad_norm": 0.11704126745462418, "learning_rate": 3.497837316557384e-07, "loss": 4.2861, "step": 1374 }, { "batch_num_effect_tokens": 7885, "batch_num_samples": 20, "batch_num_tokens": 8192, "epoch": 1.78427, "grad_norm": 0.11343449354171753, "learning_rate": 3.4563125677897936e-07, "loss": 4.2314, "step": 1375 }, { "batch_num_effect_tokens": 8037, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.78556, "grad_norm": 0.12782780826091766, "learning_rate": 3.41502694892441e-07, "loss": 4.335, "step": 1376 }, { "batch_num_effect_tokens": 8075, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.78686, "grad_norm": 0.12656031548976898, "learning_rate": 3.373980672076516e-07, "loss": 4.3848, "step": 1377 }, { "batch_num_effect_tokens": 8057, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.78816, "grad_norm": 0.12621362507343292, "learning_rate": 3.333173948131663e-07, "loss": 4.1943, "step": 1378 }, { "batch_num_effect_tokens": 8012, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.78946, "grad_norm": 0.12268788367509842, "learning_rate": 3.2926069867446673e-07, "loss": 4.7314, "step": 1379 }, { "batch_num_effect_tokens": 7885, "batch_num_samples": 23, "batch_num_tokens": 8124, "epoch": 1.79075, "grad_norm": 0.12487363070249557, "learning_rate": 3.252279996338492e-07, "loss": 4.0225, "step": 1380 }, { "batch_num_effect_tokens": 8007, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.79205, "grad_norm": 0.13526642322540283, "learning_rate": 3.212193184103196e-07, "loss": 4.2959, "step": 1381 }, { "batch_num_effect_tokens": 7931, "batch_num_samples": 19, "batch_num_tokens": 8146, "epoch": 1.79335, "grad_norm": 0.13416649401187897, "learning_rate": 3.172346755994865e-07, "loss": 4.4658, "step": 1382 }, { "batch_num_effect_tokens": 7978, "batch_num_samples": 16, "batch_num_tokens": 8133, "epoch": 1.79465, "grad_norm": 0.11787126213312149, "learning_rate": 3.132740916734556e-07, "loss": 4.0879, "step": 1383 }, { "batch_num_effect_tokens": 7981, "batch_num_samples": 21, "batch_num_tokens": 8192, "epoch": 1.79594, "grad_norm": 0.13434545695781708, "learning_rate": 3.0933758698072023e-07, "loss": 4.5879, "step": 1384 }, { "batch_num_effect_tokens": 8062, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.79724, "grad_norm": 0.13060152530670166, "learning_rate": 3.054251817460663e-07, "loss": 4.4268, "step": 1385 }, { "batch_num_effect_tokens": 8028, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.79854, "grad_norm": 0.12576042115688324, "learning_rate": 3.015368960704584e-07, "loss": 4.0508, "step": 1386 }, { "batch_num_effect_tokens": 8028, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.79854, "eval_eval_loss": 0.5592343807220459, "eval_eval_runtime": 115.3013, "eval_eval_samples_per_second": 43.365, "eval_eval_steps_per_second": 2.715, "step": 1386 }, { "batch_num_effect_tokens": 8063, "batch_num_samples": 13, "batch_num_tokens": 8189, "epoch": 1.79984, "grad_norm": 0.13307738304138184, "learning_rate": 2.9767274993094285e-07, "loss": 4.0938, "step": 1387 }, { "batch_num_effect_tokens": 7991, "batch_num_samples": 15, "batch_num_tokens": 8168, "epoch": 1.80114, "grad_norm": 0.12784725427627563, "learning_rate": 2.938327631805421e-07, "loss": 4.2842, "step": 1388 }, { "batch_num_effect_tokens": 8072, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.80243, "grad_norm": 0.12473003566265106, "learning_rate": 2.900169555481536e-07, "loss": 4.373, "step": 1389 }, { "batch_num_effect_tokens": 8038, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.80373, "grad_norm": 0.12554599344730377, "learning_rate": 2.862253466384507e-07, "loss": 4.249, "step": 1390 }, { "batch_num_effect_tokens": 8049, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.80503, "grad_norm": 0.12785503268241882, "learning_rate": 2.8245795593177637e-07, "loss": 4.376, "step": 1391 }, { "batch_num_effect_tokens": 8068, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.80633, "grad_norm": 0.13512638211250305, "learning_rate": 2.787148027840486e-07, "loss": 4.3232, "step": 1392 }, { "batch_num_effect_tokens": 8052, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.80762, "grad_norm": 0.1361304223537445, "learning_rate": 2.7499590642665773e-07, "loss": 4.4668, "step": 1393 }, { "batch_num_effect_tokens": 8033, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.80892, "grad_norm": 0.1287328451871872, "learning_rate": 2.713012859663694e-07, "loss": 4.2832, "step": 1394 }, { "batch_num_effect_tokens": 8080, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.81022, "grad_norm": 0.13318751752376556, "learning_rate": 2.6763096038522673e-07, "loss": 4.5, "step": 1395 }, { "batch_num_effect_tokens": 7937, "batch_num_samples": 14, "batch_num_tokens": 8107, "epoch": 1.81152, "grad_norm": 0.12792308628559113, "learning_rate": 2.6398494854045055e-07, "loss": 4.0791, "step": 1396 }, { "batch_num_effect_tokens": 8067, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.81281, "grad_norm": 0.13105538487434387, "learning_rate": 2.6036326916434153e-07, "loss": 4.4668, "step": 1397 }, { "batch_num_effect_tokens": 8045, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.81411, "grad_norm": 0.1282477080821991, "learning_rate": 2.5676594086419037e-07, "loss": 4.124, "step": 1398 }, { "batch_num_effect_tokens": 7992, "batch_num_samples": 14, "batch_num_tokens": 8144, "epoch": 1.81541, "grad_norm": 0.13009123504161835, "learning_rate": 2.531929821221768e-07, "loss": 4.4424, "step": 1399 }, { "batch_num_effect_tokens": 8046, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.81671, "grad_norm": 0.14188528060913086, "learning_rate": 2.4964441129527337e-07, "loss": 4.3438, "step": 1400 }, { "batch_num_effect_tokens": 8068, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.818, "grad_norm": 0.13717369735240936, "learning_rate": 2.4612024661515686e-07, "loss": 4.4883, "step": 1401 }, { "batch_num_effect_tokens": 8076, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.8193, "grad_norm": 0.13617396354675293, "learning_rate": 2.426205061881082e-07, "loss": 5.0674, "step": 1402 }, { "batch_num_effect_tokens": 7924, "batch_num_samples": 25, "batch_num_tokens": 8156, "epoch": 1.8206, "grad_norm": 0.1338592916727066, "learning_rate": 2.3914520799492527e-07, "loss": 4.501, "step": 1403 }, { "batch_num_effect_tokens": 8038, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.8219, "grad_norm": 0.11929450929164886, "learning_rate": 2.3569436989082705e-07, "loss": 4.0938, "step": 1404 }, { "batch_num_effect_tokens": 7994, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.8232, "grad_norm": 0.1299748420715332, "learning_rate": 2.32268009605362e-07, "loss": 4.1816, "step": 1405 }, { "batch_num_effect_tokens": 8020, "batch_num_samples": 17, "batch_num_tokens": 8187, "epoch": 1.82449, "grad_norm": 0.13757820427417755, "learning_rate": 2.2886614474231794e-07, "loss": 4.4873, "step": 1406 }, { "batch_num_effect_tokens": 8046, "batch_num_samples": 14, "batch_num_tokens": 8164, "epoch": 1.82579, "grad_norm": 0.1272607445716858, "learning_rate": 2.2548879277963065e-07, "loss": 4.7734, "step": 1407 }, { "batch_num_effect_tokens": 7982, "batch_num_samples": 14, "batch_num_tokens": 8142, "epoch": 1.82709, "grad_norm": 0.1257350742816925, "learning_rate": 2.2213597106929608e-07, "loss": 4.124, "step": 1408 }, { "batch_num_effect_tokens": 7921, "batch_num_samples": 17, "batch_num_tokens": 8087, "epoch": 1.82839, "grad_norm": 0.12541215121746063, "learning_rate": 2.1880769683727986e-07, "loss": 3.9297, "step": 1409 }, { "batch_num_effect_tokens": 7907, "batch_num_samples": 21, "batch_num_tokens": 8074, "epoch": 1.82968, "grad_norm": 0.11980535089969635, "learning_rate": 2.1550398718342692e-07, "loss": 3.9902, "step": 1410 }, { "batch_num_effect_tokens": 8036, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.83098, "grad_norm": 0.13398943841457367, "learning_rate": 2.1222485908137747e-07, "loss": 3.9873, "step": 1411 }, { "batch_num_effect_tokens": 8078, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.83228, "grad_norm": 0.12076178193092346, "learning_rate": 2.0897032937847616e-07, "loss": 4.5732, "step": 1412 }, { "batch_num_effect_tokens": 8048, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.83358, "grad_norm": 0.12311708927154541, "learning_rate": 2.0574041479568817e-07, "loss": 4.1377, "step": 1413 }, { "batch_num_effect_tokens": 8063, "batch_num_samples": 14, "batch_num_tokens": 8182, "epoch": 1.83487, "grad_norm": 0.13371671736240387, "learning_rate": 2.0253513192751374e-07, "loss": 4.7109, "step": 1414 }, { "batch_num_effect_tokens": 7978, "batch_num_samples": 22, "batch_num_tokens": 8192, "epoch": 1.83617, "grad_norm": 0.12499105930328369, "learning_rate": 1.9935449724189705e-07, "loss": 4.2949, "step": 1415 }, { "batch_num_effect_tokens": 7971, "batch_num_samples": 17, "batch_num_tokens": 8157, "epoch": 1.83747, "grad_norm": 0.12765094637870789, "learning_rate": 1.9619852708015142e-07, "loss": 4.6221, "step": 1416 }, { "batch_num_effect_tokens": 7899, "batch_num_samples": 18, "batch_num_tokens": 8081, "epoch": 1.83877, "grad_norm": 0.12610025703907013, "learning_rate": 1.9306723765686598e-07, "loss": 4.502, "step": 1417 }, { "batch_num_effect_tokens": 7882, "batch_num_samples": 27, "batch_num_tokens": 8120, "epoch": 1.84006, "grad_norm": 0.14963027834892273, "learning_rate": 1.8996064505982903e-07, "loss": 4.8271, "step": 1418 }, { "batch_num_effect_tokens": 7996, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.84136, "grad_norm": 0.12004819512367249, "learning_rate": 1.8687876524993987e-07, "loss": 4.5586, "step": 1419 }, { "batch_num_effect_tokens": 7978, "batch_num_samples": 16, "batch_num_tokens": 8191, "epoch": 1.84266, "grad_norm": 0.1406659036874771, "learning_rate": 1.8382161406113208e-07, "loss": 4.3506, "step": 1420 }, { "batch_num_effect_tokens": 8050, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.84396, "grad_norm": 0.1230742409825325, "learning_rate": 1.807892072002898e-07, "loss": 4.8135, "step": 1421 }, { "batch_num_effect_tokens": 8047, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.84526, "grad_norm": 0.1338808238506317, "learning_rate": 1.7778156024716497e-07, "loss": 4.292, "step": 1422 }, { "batch_num_effect_tokens": 8011, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.84655, "grad_norm": 0.11327774822711945, "learning_rate": 1.7479868865430072e-07, "loss": 4.3232, "step": 1423 }, { "batch_num_effect_tokens": 8029, "batch_num_samples": 14, "batch_num_tokens": 8163, "epoch": 1.84785, "grad_norm": 0.13712045550346375, "learning_rate": 1.7184060774695033e-07, "loss": 4.5557, "step": 1424 }, { "batch_num_effect_tokens": 7921, "batch_num_samples": 14, "batch_num_tokens": 8079, "epoch": 1.84915, "grad_norm": 0.12802965939044952, "learning_rate": 1.689073327229973e-07, "loss": 3.8936, "step": 1425 }, { "batch_num_effect_tokens": 7866, "batch_num_samples": 17, "batch_num_tokens": 8028, "epoch": 1.85045, "grad_norm": 0.14027731120586395, "learning_rate": 1.659988786528821e-07, "loss": 4.3535, "step": 1426 }, { "batch_num_effect_tokens": 8047, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.85174, "grad_norm": 0.1289585828781128, "learning_rate": 1.6311526047951774e-07, "loss": 4.3008, "step": 1427 }, { "batch_num_effect_tokens": 8002, "batch_num_samples": 20, "batch_num_tokens": 8192, "epoch": 1.85304, "grad_norm": 0.12682278454303741, "learning_rate": 1.6025649301821877e-07, "loss": 4.6836, "step": 1428 }, { "batch_num_effect_tokens": 8006, "batch_num_samples": 14, "batch_num_tokens": 8189, "epoch": 1.85434, "grad_norm": 0.1304275095462799, "learning_rate": 1.5742259095662126e-07, "loss": 4.1318, "step": 1429 }, { "batch_num_effect_tokens": 8063, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.85564, "grad_norm": 0.1194472685456276, "learning_rate": 1.5461356885461077e-07, "loss": 4.4521, "step": 1430 }, { "batch_num_effect_tokens": 8049, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.85693, "grad_norm": 0.12798550724983215, "learning_rate": 1.5182944114424337e-07, "loss": 4.1016, "step": 1431 }, { "batch_num_effect_tokens": 8059, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.85823, "grad_norm": 0.12709662318229675, "learning_rate": 1.4907022212967803e-07, "loss": 4.1875, "step": 1432 }, { "batch_num_effect_tokens": 8031, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.85953, "grad_norm": 0.11923374235630035, "learning_rate": 1.463359259870939e-07, "loss": 3.9727, "step": 1433 }, { "batch_num_effect_tokens": 8025, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.86083, "grad_norm": 0.11731646209955215, "learning_rate": 1.436265667646275e-07, "loss": 4.1377, "step": 1434 }, { "batch_num_effect_tokens": 7870, "batch_num_samples": 14, "batch_num_tokens": 8076, "epoch": 1.86212, "grad_norm": 0.12564802169799805, "learning_rate": 1.4094215838229176e-07, "loss": 4.3174, "step": 1435 }, { "batch_num_effect_tokens": 7994, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.86342, "grad_norm": 0.1299310177564621, "learning_rate": 1.38282714631911e-07, "loss": 4.2109, "step": 1436 }, { "batch_num_effect_tokens": 8068, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.86472, "grad_norm": 0.1304285228252411, "learning_rate": 1.3564824917704556e-07, "loss": 4.2119, "step": 1437 }, { "batch_num_effect_tokens": 8061, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.86602, "grad_norm": 0.1279984563589096, "learning_rate": 1.3303877555292443e-07, "loss": 4.4756, "step": 1438 }, { "batch_num_effect_tokens": 7857, "batch_num_samples": 19, "batch_num_tokens": 8086, "epoch": 1.86732, "grad_norm": 0.12695564329624176, "learning_rate": 1.3045430716637608e-07, "loss": 4.6621, "step": 1439 }, { "batch_num_effect_tokens": 7988, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.86861, "grad_norm": 0.13486185669898987, "learning_rate": 1.2789485729575612e-07, "loss": 4.252, "step": 1440 }, { "batch_num_effect_tokens": 8022, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.86991, "grad_norm": 0.11354608088731766, "learning_rate": 1.253604390908819e-07, "loss": 3.9609, "step": 1441 }, { "batch_num_effect_tokens": 8041, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.87121, "grad_norm": 0.13337519764900208, "learning_rate": 1.2285106557296479e-07, "loss": 3.9141, "step": 1442 }, { "batch_num_effect_tokens": 8017, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.87251, "grad_norm": 0.13385063409805298, "learning_rate": 1.2036674963454232e-07, "loss": 4.4512, "step": 1443 }, { "batch_num_effect_tokens": 8033, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.8738, "grad_norm": 0.12899687886238098, "learning_rate": 1.1790750403941231e-07, "loss": 4.5166, "step": 1444 }, { "batch_num_effect_tokens": 8034, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.8751, "grad_norm": 0.1389113813638687, "learning_rate": 1.1547334142256895e-07, "loss": 4.4268, "step": 1445 }, { "batch_num_effect_tokens": 8048, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.8764, "grad_norm": 0.13730958104133606, "learning_rate": 1.1306427429013222e-07, "loss": 4.4258, "step": 1446 }, { "batch_num_effect_tokens": 8025, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.8777, "grad_norm": 0.1304236501455307, "learning_rate": 1.1068031501929366e-07, "loss": 4.334, "step": 1447 }, { "batch_num_effect_tokens": 8015, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.87899, "grad_norm": 0.12509453296661377, "learning_rate": 1.0832147585824182e-07, "loss": 3.9365, "step": 1448 }, { "batch_num_effect_tokens": 7965, "batch_num_samples": 15, "batch_num_tokens": 8140, "epoch": 1.88029, "grad_norm": 0.1231050118803978, "learning_rate": 1.0598776892610685e-07, "loss": 4.208, "step": 1449 }, { "batch_num_effect_tokens": 8034, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 1.88159, "grad_norm": 0.12538541853427887, "learning_rate": 1.0367920621289496e-07, "loss": 4.3555, "step": 1450 }, { "batch_num_effect_tokens": 7939, "batch_num_samples": 17, "batch_num_tokens": 8087, "epoch": 1.88289, "grad_norm": 0.1338019073009491, "learning_rate": 1.0139579957942736e-07, "loss": 3.7793, "step": 1451 }, { "batch_num_effect_tokens": 7940, "batch_num_samples": 18, "batch_num_tokens": 8128, "epoch": 1.88418, "grad_norm": 0.13684529066085815, "learning_rate": 9.913756075728088e-08, "loss": 4.415, "step": 1452 }, { "batch_num_effect_tokens": 7970, "batch_num_samples": 15, "batch_num_tokens": 8120, "epoch": 1.88548, "grad_norm": 0.1501617431640625, "learning_rate": 9.69045013487252e-08, "loss": 4.5879, "step": 1453 }, { "batch_num_effect_tokens": 7784, "batch_num_samples": 28, "batch_num_tokens": 8032, "epoch": 1.88678, "grad_norm": 0.1272551417350769, "learning_rate": 9.469663282666519e-08, "loss": 4.4873, "step": 1454 }, { "batch_num_effect_tokens": 7994, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.88808, "grad_norm": 0.13146163523197174, "learning_rate": 9.251396653457978e-08, "loss": 4.5342, "step": 1455 }, { "batch_num_effect_tokens": 8006, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.88938, "grad_norm": 0.12587164342403412, "learning_rate": 9.035651368646647e-08, "loss": 4.1934, "step": 1456 }, { "batch_num_effect_tokens": 8075, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.89067, "grad_norm": 0.12512744963169098, "learning_rate": 8.822428536678251e-08, "loss": 4.3193, "step": 1457 }, { "batch_num_effect_tokens": 7942, "batch_num_samples": 14, "batch_num_tokens": 8088, "epoch": 1.89197, "grad_norm": 0.13346132636070251, "learning_rate": 8.611729253038658e-08, "loss": 4.7422, "step": 1458 }, { "batch_num_effect_tokens": 8043, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.89327, "grad_norm": 0.13034431636333466, "learning_rate": 8.403554600248498e-08, "loss": 4.293, "step": 1459 }, { "batch_num_effect_tokens": 7982, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.89457, "grad_norm": 0.12772700190544128, "learning_rate": 8.197905647857385e-08, "loss": 4.1699, "step": 1460 }, { "batch_num_effect_tokens": 8013, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.89586, "grad_norm": 0.12720619142055511, "learning_rate": 7.994783452438592e-08, "loss": 4.1992, "step": 1461 }, { "batch_num_effect_tokens": 8031, "batch_num_samples": 14, "batch_num_tokens": 8152, "epoch": 1.89716, "grad_norm": 0.12452313303947449, "learning_rate": 7.794189057583335e-08, "loss": 4.1182, "step": 1462 }, { "batch_num_effect_tokens": 7979, "batch_num_samples": 19, "batch_num_tokens": 8178, "epoch": 1.89846, "grad_norm": 0.1336333006620407, "learning_rate": 7.59612349389599e-08, "loss": 4.4717, "step": 1463 }, { "batch_num_effect_tokens": 7935, "batch_num_samples": 14, "batch_num_tokens": 8074, "epoch": 1.89976, "grad_norm": 0.134145587682724, "learning_rate": 7.400587778988055e-08, "loss": 4.3926, "step": 1464 }, { "batch_num_effect_tokens": 7802, "batch_num_samples": 27, "batch_num_tokens": 8039, "epoch": 1.90105, "grad_norm": 0.12880048155784607, "learning_rate": 7.207582917473532e-08, "loss": 4.9883, "step": 1465 }, { "batch_num_effect_tokens": 7901, "batch_num_samples": 18, "batch_num_tokens": 8076, "epoch": 1.90235, "grad_norm": 0.13317479193210602, "learning_rate": 7.017109900963437e-08, "loss": 3.8604, "step": 1466 }, { "batch_num_effect_tokens": 7849, "batch_num_samples": 17, "batch_num_tokens": 8023, "epoch": 1.90365, "grad_norm": 0.13144908845424652, "learning_rate": 6.829169708060745e-08, "loss": 4.3184, "step": 1467 }, { "batch_num_effect_tokens": 7972, "batch_num_samples": 19, "batch_num_tokens": 8146, "epoch": 1.90495, "grad_norm": 0.1278463453054428, "learning_rate": 6.643763304355566e-08, "loss": 4.335, "step": 1468 }, { "batch_num_effect_tokens": 8047, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.90624, "grad_norm": 0.12541644275188446, "learning_rate": 6.460891642419865e-08, "loss": 4.5146, "step": 1469 }, { "batch_num_effect_tokens": 7816, "batch_num_samples": 26, "batch_num_tokens": 8032, "epoch": 1.90754, "grad_norm": 0.13894987106323242, "learning_rate": 6.280555661802857e-08, "loss": 4.9238, "step": 1470 }, { "batch_num_effect_tokens": 8050, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.90884, "grad_norm": 0.13302992284297943, "learning_rate": 6.102756289025957e-08, "loss": 3.9033, "step": 1471 }, { "batch_num_effect_tokens": 7917, "batch_num_samples": 22, "batch_num_tokens": 8110, "epoch": 1.91014, "grad_norm": 0.12157834321260452, "learning_rate": 5.92749443757823e-08, "loss": 4.4756, "step": 1472 }, { "batch_num_effect_tokens": 8001, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.91144, "grad_norm": 0.12770642340183258, "learning_rate": 5.754771007911441e-08, "loss": 4.5713, "step": 1473 }, { "batch_num_effect_tokens": 8016, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.91273, "grad_norm": 0.12481579184532166, "learning_rate": 5.584586887435739e-08, "loss": 4.3623, "step": 1474 }, { "batch_num_effect_tokens": 7933, "batch_num_samples": 18, "batch_num_tokens": 8086, "epoch": 1.91403, "grad_norm": 0.12227432429790497, "learning_rate": 5.4169429505148144e-08, "loss": 4.3359, "step": 1475 }, { "batch_num_effect_tokens": 8028, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 1.91533, "grad_norm": 0.12159877270460129, "learning_rate": 5.251840058461577e-08, "loss": 4.0273, "step": 1476 }, { "batch_num_effect_tokens": 7994, "batch_num_samples": 14, "batch_num_tokens": 8128, "epoch": 1.91663, "grad_norm": 0.1284315288066864, "learning_rate": 5.089279059533658e-08, "loss": 4.3184, "step": 1477 }, { "batch_num_effect_tokens": 8019, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.91792, "grad_norm": 0.135638028383255, "learning_rate": 4.92926078892908e-08, "loss": 4.3555, "step": 1478 }, { "batch_num_effect_tokens": 8018, "batch_num_samples": 14, "batch_num_tokens": 8146, "epoch": 1.91922, "grad_norm": 0.14031581580638885, "learning_rate": 4.7717860687819254e-08, "loss": 4.165, "step": 1479 }, { "batch_num_effect_tokens": 8045, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.92052, "grad_norm": 0.13316749036312103, "learning_rate": 4.6168557081582854e-08, "loss": 4.5801, "step": 1480 }, { "batch_num_effect_tokens": 8013, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.92182, "grad_norm": 0.12714573740959167, "learning_rate": 4.464470503051765e-08, "loss": 3.9043, "step": 1481 }, { "batch_num_effect_tokens": 7984, "batch_num_samples": 22, "batch_num_tokens": 8192, "epoch": 1.92311, "grad_norm": 0.13386176526546478, "learning_rate": 4.314631236379707e-08, "loss": 4.0039, "step": 1482 }, { "batch_num_effect_tokens": 8034, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.92441, "grad_norm": 0.13148388266563416, "learning_rate": 4.167338677979027e-08, "loss": 4.3115, "step": 1483 }, { "batch_num_effect_tokens": 8013, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.92571, "grad_norm": 0.13195869326591492, "learning_rate": 4.02259358460233e-08, "loss": 4.5586, "step": 1484 }, { "batch_num_effect_tokens": 7984, "batch_num_samples": 18, "batch_num_tokens": 8156, "epoch": 1.92701, "grad_norm": 0.14345306158065796, "learning_rate": 3.8803966999139686e-08, "loss": 4.5898, "step": 1485 }, { "batch_num_effect_tokens": 8005, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.9283, "grad_norm": 0.13300980627536774, "learning_rate": 3.7407487544861565e-08, "loss": 4.5, "step": 1486 }, { "batch_num_effect_tokens": 7975, "batch_num_samples": 19, "batch_num_tokens": 8131, "epoch": 1.9296, "grad_norm": 0.1455434411764145, "learning_rate": 3.603650465795305e-08, "loss": 4.6602, "step": 1487 }, { "batch_num_effect_tokens": 8035, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.9309, "grad_norm": 0.12875740230083466, "learning_rate": 3.4691025382184165e-08, "loss": 4.6846, "step": 1488 }, { "batch_num_effect_tokens": 8058, "batch_num_samples": 14, "batch_num_tokens": 8177, "epoch": 1.9322, "grad_norm": 0.12339173257350922, "learning_rate": 3.337105663029361e-08, "loss": 4.2891, "step": 1489 }, { "batch_num_effect_tokens": 8038, "batch_num_samples": 14, "batch_num_tokens": 8152, "epoch": 1.9335, "grad_norm": 0.12480738013982773, "learning_rate": 3.2076605183951614e-08, "loss": 4.4785, "step": 1490 }, { "batch_num_effect_tokens": 8013, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.93479, "grad_norm": 0.13804928958415985, "learning_rate": 3.080767769372939e-08, "loss": 4.7588, "step": 1491 }, { "batch_num_effect_tokens": 8016, "batch_num_samples": 20, "batch_num_tokens": 8192, "epoch": 1.93609, "grad_norm": 0.14307373762130737, "learning_rate": 2.9564280679060255e-08, "loss": 4.8125, "step": 1492 }, { "batch_num_effect_tokens": 8027, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.93739, "grad_norm": 0.12223173677921295, "learning_rate": 2.834642052820913e-08, "loss": 4.1455, "step": 1493 }, { "batch_num_effect_tokens": 8027, "batch_num_samples": 14, "batch_num_tokens": 8168, "epoch": 1.93869, "grad_norm": 0.13116061687469482, "learning_rate": 2.715410349823977e-08, "loss": 3.9893, "step": 1494 }, { "batch_num_effect_tokens": 8014, "batch_num_samples": 20, "batch_num_tokens": 8191, "epoch": 1.93998, "grad_norm": 0.1298002302646637, "learning_rate": 2.59873357149798e-08, "loss": 4.1182, "step": 1495 }, { "batch_num_effect_tokens": 8043, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.94128, "grad_norm": 0.12619996070861816, "learning_rate": 2.4846123172992953e-08, "loss": 4.2354, "step": 1496 }, { "batch_num_effect_tokens": 7887, "batch_num_samples": 23, "batch_num_tokens": 8105, "epoch": 1.94258, "grad_norm": 0.12969645857810974, "learning_rate": 2.3730471735545213e-08, "loss": 4.1484, "step": 1497 }, { "batch_num_effect_tokens": 8024, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.94388, "grad_norm": 0.12418043613433838, "learning_rate": 2.264038713457706e-08, "loss": 4.3389, "step": 1498 }, { "batch_num_effect_tokens": 7918, "batch_num_samples": 21, "batch_num_tokens": 8125, "epoch": 1.94517, "grad_norm": 0.122939832508564, "learning_rate": 2.157587497067182e-08, "loss": 3.9756, "step": 1499 }, { "batch_num_effect_tokens": 8040, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.94647, "grad_norm": 0.12949828803539276, "learning_rate": 2.0536940713028475e-08, "loss": 4.4385, "step": 1500 }, { "batch_num_effect_tokens": 8024, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.94777, "grad_norm": 0.11821475625038147, "learning_rate": 1.9523589699433355e-08, "loss": 4.0107, "step": 1501 }, { "batch_num_effect_tokens": 8072, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.94907, "grad_norm": 0.1258937120437622, "learning_rate": 1.8535827136232365e-08, "loss": 4.6211, "step": 1502 }, { "batch_num_effect_tokens": 7988, "batch_num_samples": 17, "batch_num_tokens": 8166, "epoch": 1.95036, "grad_norm": 0.1313227117061615, "learning_rate": 1.7573658098304357e-08, "loss": 4.418, "step": 1503 }, { "batch_num_effect_tokens": 8036, "batch_num_samples": 13, "batch_num_tokens": 8189, "epoch": 1.95166, "grad_norm": 0.12842047214508057, "learning_rate": 1.6637087529033925e-08, "loss": 4.1191, "step": 1504 }, { "batch_num_effect_tokens": 7982, "batch_num_samples": 20, "batch_num_tokens": 8192, "epoch": 1.95296, "grad_norm": 0.1315157413482666, "learning_rate": 1.5726120240288632e-08, "loss": 4.5, "step": 1505 }, { "batch_num_effect_tokens": 8009, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.95426, "grad_norm": 0.1246904730796814, "learning_rate": 1.4840760912391283e-08, "loss": 4.2578, "step": 1506 }, { "batch_num_effect_tokens": 8031, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.95556, "grad_norm": 0.125263512134552, "learning_rate": 1.3981014094099354e-08, "loss": 4.165, "step": 1507 }, { "batch_num_effect_tokens": 7982, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.95685, "grad_norm": 0.12067491561174393, "learning_rate": 1.314688420257726e-08, "loss": 4.4043, "step": 1508 }, { "batch_num_effect_tokens": 8055, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.95815, "grad_norm": 0.12801378965377808, "learning_rate": 1.2338375523378022e-08, "loss": 4.1875, "step": 1509 }, { "batch_num_effect_tokens": 8025, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.95945, "grad_norm": 0.14885924756526947, "learning_rate": 1.1555492210418295e-08, "loss": 4.7236, "step": 1510 }, { "batch_num_effect_tokens": 8043, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.96075, "grad_norm": 0.14124317467212677, "learning_rate": 1.0798238285957274e-08, "loss": 4.7051, "step": 1511 }, { "batch_num_effect_tokens": 8041, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.96204, "grad_norm": 0.11698544025421143, "learning_rate": 1.006661764057837e-08, "loss": 4.5605, "step": 1512 }, { "batch_num_effect_tokens": 7831, "batch_num_samples": 31, "batch_num_tokens": 8080, "epoch": 1.96334, "grad_norm": 0.1408989280462265, "learning_rate": 9.36063403316534e-09, "loss": 4.9922, "step": 1513 }, { "batch_num_effect_tokens": 7983, "batch_num_samples": 20, "batch_num_tokens": 8192, "epoch": 1.96464, "grad_norm": 0.12344139814376831, "learning_rate": 8.680291090888416e-09, "loss": 4.3076, "step": 1514 }, { "batch_num_effect_tokens": 8007, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.96594, "grad_norm": 0.1171666607260704, "learning_rate": 8.02559230917932e-09, "loss": 4.1758, "step": 1515 }, { "batch_num_effect_tokens": 7977, "batch_num_samples": 22, "batch_num_tokens": 8192, "epoch": 1.96723, "grad_norm": 0.12780164182186127, "learning_rate": 7.3965410517179426e-09, "loss": 4.2393, "step": 1516 }, { "batch_num_effect_tokens": 7998, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.96853, "grad_norm": 0.12426239252090454, "learning_rate": 6.793140550414024e-09, "loss": 3.9375, "step": 1517 }, { "batch_num_effect_tokens": 8071, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.96983, "grad_norm": 0.13120593130588531, "learning_rate": 6.215393905388278e-09, "loss": 4.624, "step": 1518 }, { "batch_num_effect_tokens": 8038, "batch_num_samples": 15, "batch_num_tokens": 8167, "epoch": 1.97113, "grad_norm": 0.12499607354402542, "learning_rate": 5.6633040849601865e-09, "loss": 4.4346, "step": 1519 }, { "batch_num_effect_tokens": 8036, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.97242, "grad_norm": 0.12693850696086884, "learning_rate": 5.1368739256296704e-09, "loss": 4.0693, "step": 1520 }, { "batch_num_effect_tokens": 8030, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.97372, "grad_norm": 0.13233652710914612, "learning_rate": 4.636106132064888e-09, "loss": 4.4072, "step": 1521 }, { "batch_num_effect_tokens": 8037, "batch_num_samples": 15, "batch_num_tokens": 8192, "epoch": 1.97502, "grad_norm": 0.11534731835126877, "learning_rate": 4.161003277085574e-09, "loss": 3.8291, "step": 1522 }, { "batch_num_effect_tokens": 8010, "batch_num_samples": 17, "batch_num_tokens": 8192, "epoch": 1.97632, "grad_norm": 0.13354112207889557, "learning_rate": 3.711567801652494e-09, "loss": 4.1992, "step": 1523 }, { "batch_num_effect_tokens": 7938, "batch_num_samples": 21, "batch_num_tokens": 8142, "epoch": 1.97762, "grad_norm": 0.13712389767169952, "learning_rate": 3.2878020148530143e-09, "loss": 4.1602, "step": 1524 }, { "batch_num_effect_tokens": 7910, "batch_num_samples": 18, "batch_num_tokens": 8090, "epoch": 1.97891, "grad_norm": 0.12684784829616547, "learning_rate": 2.8897080938916634e-09, "loss": 4.5029, "step": 1525 }, { "batch_num_effect_tokens": 8033, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.98021, "grad_norm": 0.13627074658870697, "learning_rate": 2.5172880840745873e-09, "loss": 4.7266, "step": 1526 }, { "batch_num_effect_tokens": 7935, "batch_num_samples": 16, "batch_num_tokens": 8122, "epoch": 1.98151, "grad_norm": 0.13185909390449524, "learning_rate": 2.1705438988040005e-09, "loss": 4.0479, "step": 1527 }, { "batch_num_effect_tokens": 8065, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.98281, "grad_norm": 0.12739528715610504, "learning_rate": 1.849477319564863e-09, "loss": 4.4717, "step": 1528 }, { "batch_num_effect_tokens": 8036, "batch_num_samples": 14, "batch_num_tokens": 8191, "epoch": 1.9841, "grad_norm": 0.12966281175613403, "learning_rate": 1.5540899959187727e-09, "loss": 4.1494, "step": 1529 }, { "batch_num_effect_tokens": 7937, "batch_num_samples": 17, "batch_num_tokens": 8139, "epoch": 1.9854, "grad_norm": 0.1280137449502945, "learning_rate": 1.2843834454911997e-09, "loss": 4.377, "step": 1530 }, { "batch_num_effect_tokens": 7968, "batch_num_samples": 24, "batch_num_tokens": 8192, "epoch": 1.9867, "grad_norm": 0.13096709549427032, "learning_rate": 1.040359053967599e-09, "loss": 4.127, "step": 1531 }, { "batch_num_effect_tokens": 7984, "batch_num_samples": 14, "batch_num_tokens": 8128, "epoch": 1.988, "grad_norm": 0.12772879004478455, "learning_rate": 8.220180750850848e-10, "loss": 4.3184, "step": 1532 }, { "batch_num_effect_tokens": 8040, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.98929, "grad_norm": 0.1290796995162964, "learning_rate": 6.293616306246586e-10, "loss": 4.8291, "step": 1533 }, { "batch_num_effect_tokens": 7978, "batch_num_samples": 14, "batch_num_tokens": 8146, "epoch": 1.99059, "grad_norm": 0.12888257205486298, "learning_rate": 4.623907104084335e-10, "loss": 4.251, "step": 1534 }, { "batch_num_effect_tokens": 8063, "batch_num_samples": 14, "batch_num_tokens": 8192, "epoch": 1.99189, "grad_norm": 0.13058820366859436, "learning_rate": 3.211061722901976e-10, "loss": 4.3525, "step": 1535 }, { "batch_num_effect_tokens": 7978, "batch_num_samples": 17, "batch_num_tokens": 8139, "epoch": 1.99319, "grad_norm": 0.12405020743608475, "learning_rate": 2.0550874215541362e-10, "loss": 4.4014, "step": 1536 }, { "batch_num_effect_tokens": 8015, "batch_num_samples": 13, "batch_num_tokens": 8192, "epoch": 1.99448, "grad_norm": 0.1255870759487152, "learning_rate": 1.1559901391511308e-10, "loss": 4.2578, "step": 1537 }, { "batch_num_effect_tokens": 8054, "batch_num_samples": 16, "batch_num_tokens": 8192, "epoch": 1.99578, "grad_norm": 0.12560389935970306, "learning_rate": 5.137744950312051e-11, "loss": 4.3672, "step": 1538 }, { "batch_num_effect_tokens": 8007, "batch_num_samples": 18, "batch_num_tokens": 8184, "epoch": 1.99708, "grad_norm": 0.1317049115896225, "learning_rate": 1.2844378873833053e-11, "loss": 4.2676, "step": 1539 }, { "batch_num_effect_tokens": 8033, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 1.99838, "grad_norm": 0.12680941820144653, "learning_rate": 0.0, "loss": 4.6318, "step": 1540 }, { "batch_num_effect_tokens": 8033, "batch_num_samples": 18, "batch_num_tokens": 8192, "epoch": 1.99838, "eval_eval_loss": 0.5588093996047974, "eval_eval_runtime": 115.4397, "eval_eval_samples_per_second": 43.313, "eval_eval_steps_per_second": 2.711, "step": 1540 } ], "logging_steps": 1.0, "max_steps": 1540, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }