diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4588 @@ +{ + "best_global_step": 20000, + "best_metric": 0.8734950423240662, + "best_model_checkpoint": "/scratch-shared/gwijngaard/outputs/qwen2_5-sft-nolora-nonewtoken-nofreeze_20250930_231031/checkpoint-20000", + "epoch": 1.0, + "eval_steps": 5000, + "global_step": 22500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "entropy": 2.9206753075122833, + "epoch": 4.4444444444444447e-05, + "grad_norm": 8.4375, + "learning_rate": 0.0, + "loss": 1.1975, + "mean_token_accuracy": 0.7345979571342468, + "num_tokens": 29463751.0, + "step": 1 + }, + { + "entropy": 2.8330495211542868, + "epoch": 0.0022222222222222222, + "grad_norm": 8.5625, + "learning_rate": 1.451851851851852e-06, + "loss": 0.8656, + "mean_token_accuracy": 0.7418258153662389, + "num_tokens": 29529779.0, + "step": 50 + }, + { + "entropy": 2.836770453453064, + "epoch": 0.0044444444444444444, + "grad_norm": 6.625, + "learning_rate": 2.9333333333333338e-06, + "loss": 0.8785, + "mean_token_accuracy": 0.7382896327972412, + "num_tokens": 29596644.0, + "step": 100 + }, + { + "entropy": 2.9122318506240843, + "epoch": 0.006666666666666667, + "grad_norm": 8.3125, + "learning_rate": 4.4148148148148154e-06, + "loss": 0.8733, + "mean_token_accuracy": 0.7413022458553314, + "num_tokens": 29660760.0, + "step": 150 + }, + { + "entropy": 2.900973844528198, + "epoch": 0.008888888888888889, + "grad_norm": 7.65625, + "learning_rate": 5.896296296296296e-06, + "loss": 0.897, + "mean_token_accuracy": 0.737481083869934, + "num_tokens": 29728064.0, + "step": 200 + }, + { + "entropy": 2.8783673429489136, + "epoch": 0.011111111111111112, + "grad_norm": 7.53125, + "learning_rate": 7.377777777777778e-06, + "loss": 0.8858, + "mean_token_accuracy": 0.7386987507343292, + "num_tokens": 29793159.0, + "step": 250 + }, + { + "entropy": 2.855666689872742, + "epoch": 0.013333333333333334, + "grad_norm": 6.71875, + "learning_rate": 8.85925925925926e-06, + "loss": 0.8958, + "mean_token_accuracy": 0.736550339460373, + "num_tokens": 29858642.0, + "step": 300 + }, + { + "entropy": 2.7782880878448486, + "epoch": 0.015555555555555555, + "grad_norm": 9.125, + "learning_rate": 1.0340740740740743e-05, + "loss": 0.8976, + "mean_token_accuracy": 0.7359859848022461, + "num_tokens": 29927577.0, + "step": 350 + }, + { + "entropy": 2.747315707206726, + "epoch": 0.017777777777777778, + "grad_norm": 9.125, + "learning_rate": 1.1822222222222225e-05, + "loss": 0.851, + "mean_token_accuracy": 0.7458417963981628, + "num_tokens": 29993260.0, + "step": 400 + }, + { + "entropy": 2.777502555847168, + "epoch": 0.02, + "grad_norm": 6.28125, + "learning_rate": 1.3303703703703705e-05, + "loss": 0.8894, + "mean_token_accuracy": 0.7404855847358703, + "num_tokens": 30061485.0, + "step": 450 + }, + { + "entropy": 2.879872035980225, + "epoch": 0.022222222222222223, + "grad_norm": 8.4375, + "learning_rate": 1.4785185185185186e-05, + "loss": 0.9081, + "mean_token_accuracy": 0.7336488461494446, + "num_tokens": 30126648.0, + "step": 500 + }, + { + "entropy": 2.7876138353347777, + "epoch": 0.024444444444444446, + "grad_norm": 9.6875, + "learning_rate": 1.6266666666666668e-05, + "loss": 0.8461, + "mean_token_accuracy": 0.7486303246021271, + "num_tokens": 30192879.0, + "step": 550 + }, + { + "entropy": 2.9612947702407837, + "epoch": 0.02666666666666667, + "grad_norm": 8.0, + "learning_rate": 1.774814814814815e-05, + "loss": 0.8981, + "mean_token_accuracy": 0.7390279281139374, + "num_tokens": 30257537.0, + "step": 600 + }, + { + "entropy": 2.7916600465774537, + "epoch": 0.028888888888888888, + "grad_norm": 7.9375, + "learning_rate": 1.922962962962963e-05, + "loss": 0.8675, + "mean_token_accuracy": 0.7437423765659332, + "num_tokens": 30323879.0, + "step": 650 + }, + { + "entropy": 2.8081023025512697, + "epoch": 0.03111111111111111, + "grad_norm": 7.28125, + "learning_rate": 1.9978006872852237e-05, + "loss": 0.8574, + "mean_token_accuracy": 0.7459037184715271, + "num_tokens": 30393385.0, + "step": 700 + }, + { + "entropy": 2.9191632890701293, + "epoch": 0.03333333333333333, + "grad_norm": 8.3125, + "learning_rate": 1.9932187857961058e-05, + "loss": 0.9081, + "mean_token_accuracy": 0.7324011015892029, + "num_tokens": 30458516.0, + "step": 750 + }, + { + "entropy": 2.783015651702881, + "epoch": 0.035555555555555556, + "grad_norm": 6.875, + "learning_rate": 1.9886368843069876e-05, + "loss": 0.8403, + "mean_token_accuracy": 0.7504636573791504, + "num_tokens": 30522811.0, + "step": 800 + }, + { + "entropy": 2.782901654243469, + "epoch": 0.03777777777777778, + "grad_norm": 7.90625, + "learning_rate": 1.9840549828178697e-05, + "loss": 0.8633, + "mean_token_accuracy": 0.7460731649398804, + "num_tokens": 30588800.0, + "step": 850 + }, + { + "entropy": 2.852466220855713, + "epoch": 0.04, + "grad_norm": 7.875, + "learning_rate": 1.9794730813287515e-05, + "loss": 0.8939, + "mean_token_accuracy": 0.7356749868392944, + "num_tokens": 30654866.0, + "step": 900 + }, + { + "entropy": 2.8751544713974, + "epoch": 0.042222222222222223, + "grad_norm": 6.5625, + "learning_rate": 1.9748911798396336e-05, + "loss": 0.8526, + "mean_token_accuracy": 0.7440717363357544, + "num_tokens": 30718125.0, + "step": 950 + }, + { + "entropy": 2.791564016342163, + "epoch": 0.044444444444444446, + "grad_norm": 8.5625, + "learning_rate": 1.9703092783505157e-05, + "loss": 0.8528, + "mean_token_accuracy": 0.743047387599945, + "num_tokens": 30784326.0, + "step": 1000 + }, + { + "entropy": 2.7872579193115232, + "epoch": 0.04666666666666667, + "grad_norm": 10.3125, + "learning_rate": 1.9657273768613975e-05, + "loss": 0.8516, + "mean_token_accuracy": 0.747126750946045, + "num_tokens": 30851638.0, + "step": 1050 + }, + { + "entropy": 2.7552728176116945, + "epoch": 0.04888888888888889, + "grad_norm": 8.4375, + "learning_rate": 1.9611454753722796e-05, + "loss": 0.8552, + "mean_token_accuracy": 0.7450509345531464, + "num_tokens": 30917218.0, + "step": 1100 + }, + { + "entropy": 2.7746379947662354, + "epoch": 0.051111111111111114, + "grad_norm": 9.3125, + "learning_rate": 1.9565635738831617e-05, + "loss": 0.8364, + "mean_token_accuracy": 0.754111316204071, + "num_tokens": 30981770.0, + "step": 1150 + }, + { + "entropy": 2.6211288261413572, + "epoch": 0.05333333333333334, + "grad_norm": 7.03125, + "learning_rate": 1.9519816723940438e-05, + "loss": 0.8351, + "mean_token_accuracy": 0.7522010815143585, + "num_tokens": 31048479.0, + "step": 1200 + }, + { + "entropy": 2.6207118034362793, + "epoch": 0.05555555555555555, + "grad_norm": 8.5625, + "learning_rate": 1.9473997709049256e-05, + "loss": 0.8428, + "mean_token_accuracy": 0.7495422863960266, + "num_tokens": 31116595.0, + "step": 1250 + }, + { + "entropy": 2.6855200052261354, + "epoch": 0.057777777777777775, + "grad_norm": 9.1875, + "learning_rate": 1.9428178694158077e-05, + "loss": 0.8629, + "mean_token_accuracy": 0.7421471869945526, + "num_tokens": 31183980.0, + "step": 1300 + }, + { + "entropy": 2.528144361972809, + "epoch": 0.06, + "grad_norm": 6.0625, + "learning_rate": 1.93823596792669e-05, + "loss": 0.7948, + "mean_token_accuracy": 0.7594469666481019, + "num_tokens": 31250261.0, + "step": 1350 + }, + { + "entropy": 2.5050185775756835, + "epoch": 0.06222222222222222, + "grad_norm": 8.5, + "learning_rate": 1.933654066437572e-05, + "loss": 0.8032, + "mean_token_accuracy": 0.7555232620239258, + "num_tokens": 31316852.0, + "step": 1400 + }, + { + "entropy": 2.511464171409607, + "epoch": 0.06444444444444444, + "grad_norm": 8.4375, + "learning_rate": 1.9290721649484537e-05, + "loss": 0.8122, + "mean_token_accuracy": 0.7525858426094055, + "num_tokens": 31381166.0, + "step": 1450 + }, + { + "entropy": 2.47177143573761, + "epoch": 0.06666666666666667, + "grad_norm": 7.59375, + "learning_rate": 1.924490263459336e-05, + "loss": 0.8022, + "mean_token_accuracy": 0.7576224994659424, + "num_tokens": 31450722.0, + "step": 1500 + }, + { + "entropy": 2.451930871009827, + "epoch": 0.06888888888888889, + "grad_norm": 12.3125, + "learning_rate": 1.919908361970218e-05, + "loss": 0.8025, + "mean_token_accuracy": 0.7531751859188079, + "num_tokens": 31517729.0, + "step": 1550 + }, + { + "entropy": 2.46038672208786, + "epoch": 0.07111111111111111, + "grad_norm": 6.84375, + "learning_rate": 1.9153264604810998e-05, + "loss": 0.8188, + "mean_token_accuracy": 0.7543287444114685, + "num_tokens": 31584935.0, + "step": 1600 + }, + { + "entropy": 2.5046206855773927, + "epoch": 0.07333333333333333, + "grad_norm": 7.40625, + "learning_rate": 1.910744558991982e-05, + "loss": 0.8076, + "mean_token_accuracy": 0.7573912596702576, + "num_tokens": 31651819.0, + "step": 1650 + }, + { + "entropy": 2.587780613899231, + "epoch": 0.07555555555555556, + "grad_norm": 6.0625, + "learning_rate": 1.9061626575028637e-05, + "loss": 0.7887, + "mean_token_accuracy": 0.7603560221195221, + "num_tokens": 31716593.0, + "step": 1700 + }, + { + "entropy": 2.471900417804718, + "epoch": 0.07777777777777778, + "grad_norm": 7.0, + "learning_rate": 1.9015807560137458e-05, + "loss": 0.7814, + "mean_token_accuracy": 0.7607843494415283, + "num_tokens": 31783153.0, + "step": 1750 + }, + { + "entropy": 2.4517657709121705, + "epoch": 0.08, + "grad_norm": 7.1875, + "learning_rate": 1.896998854524628e-05, + "loss": 0.8497, + "mean_token_accuracy": 0.7433587527275085, + "num_tokens": 31851501.0, + "step": 1800 + }, + { + "entropy": 2.4117108368873597, + "epoch": 0.08222222222222222, + "grad_norm": 7.0625, + "learning_rate": 1.89241695303551e-05, + "loss": 0.799, + "mean_token_accuracy": 0.7592541599273681, + "num_tokens": 31916822.0, + "step": 1850 + }, + { + "entropy": 2.35961368560791, + "epoch": 0.08444444444444445, + "grad_norm": 12.4375, + "learning_rate": 1.8878350515463918e-05, + "loss": 0.8093, + "mean_token_accuracy": 0.7549574375152588, + "num_tokens": 31984311.0, + "step": 1900 + }, + { + "entropy": 2.444758574962616, + "epoch": 0.08666666666666667, + "grad_norm": 7.0, + "learning_rate": 1.883253150057274e-05, + "loss": 0.7778, + "mean_token_accuracy": 0.7624431896209717, + "num_tokens": 32049197.0, + "step": 1950 + }, + { + "entropy": 2.4109366965293884, + "epoch": 0.08888888888888889, + "grad_norm": 8.6875, + "learning_rate": 1.878671248568156e-05, + "loss": 0.8078, + "mean_token_accuracy": 0.7562625980377198, + "num_tokens": 32114585.0, + "step": 2000 + }, + { + "entropy": 2.338375616073608, + "epoch": 0.09111111111111111, + "grad_norm": 7.8125, + "learning_rate": 1.874089347079038e-05, + "loss": 0.8051, + "mean_token_accuracy": 0.7559153795242309, + "num_tokens": 32182189.0, + "step": 2050 + }, + { + "entropy": 2.3647463655471803, + "epoch": 0.09333333333333334, + "grad_norm": 8.5625, + "learning_rate": 1.86950744558992e-05, + "loss": 0.7964, + "mean_token_accuracy": 0.7597098231315613, + "num_tokens": 32246491.0, + "step": 2100 + }, + { + "entropy": 2.3017068338394164, + "epoch": 0.09555555555555556, + "grad_norm": 11.0625, + "learning_rate": 1.864925544100802e-05, + "loss": 0.8237, + "mean_token_accuracy": 0.749286116361618, + "num_tokens": 32313665.0, + "step": 2150 + }, + { + "entropy": 2.3421465373039245, + "epoch": 0.09777777777777778, + "grad_norm": 8.6875, + "learning_rate": 1.860343642611684e-05, + "loss": 0.7969, + "mean_token_accuracy": 0.757558046579361, + "num_tokens": 32377681.0, + "step": 2200 + }, + { + "entropy": 2.2409315156936644, + "epoch": 0.1, + "grad_norm": 8.25, + "learning_rate": 1.855761741122566e-05, + "loss": 0.782, + "mean_token_accuracy": 0.7633419442176819, + "num_tokens": 32445533.0, + "step": 2250 + }, + { + "entropy": 2.2817729759216308, + "epoch": 0.10222222222222223, + "grad_norm": 8.5625, + "learning_rate": 1.851179839633448e-05, + "loss": 0.803, + "mean_token_accuracy": 0.7561051642894745, + "num_tokens": 32514116.0, + "step": 2300 + }, + { + "entropy": 2.151771836280823, + "epoch": 0.10444444444444445, + "grad_norm": 7.125, + "learning_rate": 1.84659793814433e-05, + "loss": 0.7906, + "mean_token_accuracy": 0.7589080274105072, + "num_tokens": 32583284.0, + "step": 2350 + }, + { + "entropy": 2.184410240650177, + "epoch": 0.10666666666666667, + "grad_norm": 10.4375, + "learning_rate": 1.842016036655212e-05, + "loss": 0.8398, + "mean_token_accuracy": 0.7480292344093322, + "num_tokens": 32650513.0, + "step": 2400 + }, + { + "entropy": 2.18844452381134, + "epoch": 0.10888888888888888, + "grad_norm": 10.5, + "learning_rate": 1.837434135166094e-05, + "loss": 0.801, + "mean_token_accuracy": 0.7543882429599762, + "num_tokens": 32716407.0, + "step": 2450 + }, + { + "entropy": 2.172247538566589, + "epoch": 0.1111111111111111, + "grad_norm": 10.5625, + "learning_rate": 1.8328522336769762e-05, + "loss": 0.7963, + "mean_token_accuracy": 0.7586643397808075, + "num_tokens": 32781494.0, + "step": 2500 + }, + { + "entropy": 2.241757538318634, + "epoch": 0.11333333333333333, + "grad_norm": 9.9375, + "learning_rate": 1.828270332187858e-05, + "loss": 0.8141, + "mean_token_accuracy": 0.7552792716026306, + "num_tokens": 32846884.0, + "step": 2550 + }, + { + "entropy": 2.2868755722045897, + "epoch": 0.11555555555555555, + "grad_norm": 7.8125, + "learning_rate": 1.82368843069874e-05, + "loss": 0.8164, + "mean_token_accuracy": 0.7541338610649109, + "num_tokens": 32912538.0, + "step": 2600 + }, + { + "entropy": 2.2125061202049254, + "epoch": 0.11777777777777777, + "grad_norm": 7.1875, + "learning_rate": 1.8191065292096222e-05, + "loss": 0.7896, + "mean_token_accuracy": 0.7596218919754029, + "num_tokens": 32980319.0, + "step": 2650 + }, + { + "entropy": 2.155822920799255, + "epoch": 0.12, + "grad_norm": 6.78125, + "learning_rate": 1.8145246277205043e-05, + "loss": 0.8207, + "mean_token_accuracy": 0.754329582452774, + "num_tokens": 33049961.0, + "step": 2700 + }, + { + "entropy": 2.1824621748924256, + "epoch": 0.12222222222222222, + "grad_norm": 7.59375, + "learning_rate": 1.809942726231386e-05, + "loss": 0.7661, + "mean_token_accuracy": 0.7682196378707886, + "num_tokens": 33116366.0, + "step": 2750 + }, + { + "entropy": 2.231255145072937, + "epoch": 0.12444444444444444, + "grad_norm": 8.875, + "learning_rate": 1.8053608247422682e-05, + "loss": 0.8048, + "mean_token_accuracy": 0.7537202823162079, + "num_tokens": 33185374.0, + "step": 2800 + }, + { + "entropy": 2.2291252851486205, + "epoch": 0.12666666666666668, + "grad_norm": 8.125, + "learning_rate": 1.8007789232531504e-05, + "loss": 0.7598, + "mean_token_accuracy": 0.766055703163147, + "num_tokens": 33252044.0, + "step": 2850 + }, + { + "entropy": 2.1579408121109007, + "epoch": 0.1288888888888889, + "grad_norm": 8.3125, + "learning_rate": 1.7961970217640325e-05, + "loss": 0.7862, + "mean_token_accuracy": 0.7643947994709015, + "num_tokens": 33319654.0, + "step": 2900 + }, + { + "entropy": 2.1827985763549806, + "epoch": 0.13111111111111112, + "grad_norm": 7.375, + "learning_rate": 1.7916151202749143e-05, + "loss": 0.8002, + "mean_token_accuracy": 0.7550258612632752, + "num_tokens": 33388120.0, + "step": 2950 + }, + { + "entropy": 2.2125383615493774, + "epoch": 0.13333333333333333, + "grad_norm": 8.0, + "learning_rate": 1.787033218785796e-05, + "loss": 0.7784, + "mean_token_accuracy": 0.7626236033439636, + "num_tokens": 33454419.0, + "step": 3000 + }, + { + "entropy": 2.1113911986351015, + "epoch": 0.13555555555555557, + "grad_norm": 6.6875, + "learning_rate": 1.782451317296678e-05, + "loss": 0.7674, + "mean_token_accuracy": 0.7665451729297638, + "num_tokens": 33520682.0, + "step": 3050 + }, + { + "entropy": 2.1803041291236878, + "epoch": 0.13777777777777778, + "grad_norm": 7.34375, + "learning_rate": 1.7778694158075603e-05, + "loss": 0.7898, + "mean_token_accuracy": 0.7653110408782959, + "num_tokens": 33588492.0, + "step": 3100 + }, + { + "entropy": 2.2334672379493714, + "epoch": 0.14, + "grad_norm": 7.46875, + "learning_rate": 1.7732875143184424e-05, + "loss": 0.7731, + "mean_token_accuracy": 0.7676819574832916, + "num_tokens": 33654548.0, + "step": 3150 + }, + { + "entropy": 2.1736029314994814, + "epoch": 0.14222222222222222, + "grad_norm": 9.3125, + "learning_rate": 1.768705612829324e-05, + "loss": 0.8031, + "mean_token_accuracy": 0.7566261160373687, + "num_tokens": 33722789.0, + "step": 3200 + }, + { + "entropy": 2.2151904988288877, + "epoch": 0.14444444444444443, + "grad_norm": 7.625, + "learning_rate": 1.7641237113402063e-05, + "loss": 0.7979, + "mean_token_accuracy": 0.7582576811313629, + "num_tokens": 33789866.0, + "step": 3250 + }, + { + "entropy": 2.213924946784973, + "epoch": 0.14666666666666667, + "grad_norm": 9.3125, + "learning_rate": 1.7595418098510884e-05, + "loss": 0.7889, + "mean_token_accuracy": 0.7583772194385529, + "num_tokens": 33857885.0, + "step": 3300 + }, + { + "entropy": 2.144984345436096, + "epoch": 0.14888888888888888, + "grad_norm": 7.78125, + "learning_rate": 1.7549599083619705e-05, + "loss": 0.8019, + "mean_token_accuracy": 0.7535349237918854, + "num_tokens": 33923487.0, + "step": 3350 + }, + { + "entropy": 2.1649972796440125, + "epoch": 0.1511111111111111, + "grad_norm": 8.125, + "learning_rate": 1.7503780068728523e-05, + "loss": 0.7989, + "mean_token_accuracy": 0.757453374862671, + "num_tokens": 33989482.0, + "step": 3400 + }, + { + "entropy": 2.2169123339653014, + "epoch": 0.15333333333333332, + "grad_norm": 6.5625, + "learning_rate": 1.7457961053837344e-05, + "loss": 0.8161, + "mean_token_accuracy": 0.7528200805187225, + "num_tokens": 34059798.0, + "step": 3450 + }, + { + "entropy": 2.2231992268562317, + "epoch": 0.15555555555555556, + "grad_norm": 7.78125, + "learning_rate": 1.7412142038946165e-05, + "loss": 0.8242, + "mean_token_accuracy": 0.7535739564895629, + "num_tokens": 34125686.0, + "step": 3500 + }, + { + "entropy": 2.1861121082305908, + "epoch": 0.15777777777777777, + "grad_norm": 7.125, + "learning_rate": 1.7366323024054987e-05, + "loss": 0.777, + "mean_token_accuracy": 0.7635022902488708, + "num_tokens": 34192858.0, + "step": 3550 + }, + { + "entropy": 2.1724994444847106, + "epoch": 0.16, + "grad_norm": 8.0625, + "learning_rate": 1.7320504009163804e-05, + "loss": 0.7834, + "mean_token_accuracy": 0.76077321767807, + "num_tokens": 34259641.0, + "step": 3600 + }, + { + "entropy": 2.182099552154541, + "epoch": 0.1622222222222222, + "grad_norm": 7.09375, + "learning_rate": 1.7274684994272622e-05, + "loss": 0.7983, + "mean_token_accuracy": 0.7584855699539185, + "num_tokens": 34328061.0, + "step": 3650 + }, + { + "entropy": 2.1422414350509644, + "epoch": 0.16444444444444445, + "grad_norm": 8.375, + "learning_rate": 1.7228865979381443e-05, + "loss": 0.7817, + "mean_token_accuracy": 0.7643873155117035, + "num_tokens": 34397172.0, + "step": 3700 + }, + { + "entropy": 2.1676920294761657, + "epoch": 0.16666666666666666, + "grad_norm": 7.34375, + "learning_rate": 1.7183046964490265e-05, + "loss": 0.8121, + "mean_token_accuracy": 0.7518685030937194, + "num_tokens": 34462090.0, + "step": 3750 + }, + { + "entropy": 2.0866295003890993, + "epoch": 0.1688888888888889, + "grad_norm": 7.5, + "learning_rate": 1.7137227949599086e-05, + "loss": 0.7967, + "mean_token_accuracy": 0.7572396492958069, + "num_tokens": 34531839.0, + "step": 3800 + }, + { + "entropy": 2.1308164954185487, + "epoch": 0.1711111111111111, + "grad_norm": 6.9375, + "learning_rate": 1.7091408934707904e-05, + "loss": 0.7656, + "mean_token_accuracy": 0.7690932631492615, + "num_tokens": 34597073.0, + "step": 3850 + }, + { + "entropy": 2.1494863390922547, + "epoch": 0.17333333333333334, + "grad_norm": 8.5625, + "learning_rate": 1.7045589919816725e-05, + "loss": 0.8129, + "mean_token_accuracy": 0.7566492486000062, + "num_tokens": 34667098.0, + "step": 3900 + }, + { + "entropy": 2.098915767669678, + "epoch": 0.17555555555555555, + "grad_norm": 7.9375, + "learning_rate": 1.6999770904925546e-05, + "loss": 0.7681, + "mean_token_accuracy": 0.764438933134079, + "num_tokens": 34734238.0, + "step": 3950 + }, + { + "entropy": 2.2060401248931885, + "epoch": 0.17777777777777778, + "grad_norm": 6.6875, + "learning_rate": 1.6953951890034367e-05, + "loss": 0.7993, + "mean_token_accuracy": 0.7594318461418151, + "num_tokens": 34801484.0, + "step": 4000 + }, + { + "entropy": 2.1855486106872557, + "epoch": 0.18, + "grad_norm": 7.96875, + "learning_rate": 1.6908132875143185e-05, + "loss": 0.792, + "mean_token_accuracy": 0.7597147989273071, + "num_tokens": 34869674.0, + "step": 4050 + }, + { + "entropy": 2.0849170541763304, + "epoch": 0.18222222222222223, + "grad_norm": 7.21875, + "learning_rate": 1.6862313860252006e-05, + "loss": 0.8095, + "mean_token_accuracy": 0.7556380236148834, + "num_tokens": 34939372.0, + "step": 4100 + }, + { + "entropy": 2.1427531123161314, + "epoch": 0.18444444444444444, + "grad_norm": 7.4375, + "learning_rate": 1.6816494845360827e-05, + "loss": 0.8103, + "mean_token_accuracy": 0.7562667608261109, + "num_tokens": 35007892.0, + "step": 4150 + }, + { + "entropy": 2.0929103469848633, + "epoch": 0.18666666666666668, + "grad_norm": 10.3125, + "learning_rate": 1.677067583046965e-05, + "loss": 0.8292, + "mean_token_accuracy": 0.7510438573360443, + "num_tokens": 35078997.0, + "step": 4200 + }, + { + "entropy": 2.049334568977356, + "epoch": 0.18888888888888888, + "grad_norm": 7.875, + "learning_rate": 1.6724856815578466e-05, + "loss": 0.7729, + "mean_token_accuracy": 0.7638446021080018, + "num_tokens": 35148350.0, + "step": 4250 + }, + { + "entropy": 2.099260985851288, + "epoch": 0.19111111111111112, + "grad_norm": 6.8125, + "learning_rate": 1.6679037800687284e-05, + "loss": 0.7974, + "mean_token_accuracy": 0.7584287643432617, + "num_tokens": 35216731.0, + "step": 4300 + }, + { + "entropy": 2.112381303310394, + "epoch": 0.19333333333333333, + "grad_norm": 7.0, + "learning_rate": 1.6633218785796105e-05, + "loss": 0.743, + "mean_token_accuracy": 0.7701405155658722, + "num_tokens": 35282400.0, + "step": 4350 + }, + { + "entropy": 2.152976577281952, + "epoch": 0.19555555555555557, + "grad_norm": 7.21875, + "learning_rate": 1.6587399770904926e-05, + "loss": 0.79, + "mean_token_accuracy": 0.7612162494659424, + "num_tokens": 35348623.0, + "step": 4400 + }, + { + "entropy": 2.1815468096733093, + "epoch": 0.19777777777777777, + "grad_norm": 7.125, + "learning_rate": 1.6541580756013748e-05, + "loss": 0.8279, + "mean_token_accuracy": 0.7511774361133575, + "num_tokens": 35417105.0, + "step": 4450 + }, + { + "entropy": 2.2018524050712585, + "epoch": 0.2, + "grad_norm": 8.8125, + "learning_rate": 1.6495761741122565e-05, + "loss": 0.7781, + "mean_token_accuracy": 0.763760222196579, + "num_tokens": 35483468.0, + "step": 4500 + }, + { + "entropy": 2.1301321840286254, + "epoch": 0.20222222222222222, + "grad_norm": 7.28125, + "learning_rate": 1.6449942726231387e-05, + "loss": 0.8019, + "mean_token_accuracy": 0.7569781160354614, + "num_tokens": 35550517.0, + "step": 4550 + }, + { + "entropy": 2.187158074378967, + "epoch": 0.20444444444444446, + "grad_norm": 8.25, + "learning_rate": 1.6404123711340208e-05, + "loss": 0.8104, + "mean_token_accuracy": 0.7544570553302765, + "num_tokens": 35617223.0, + "step": 4600 + }, + { + "entropy": 2.106666340827942, + "epoch": 0.20666666666666667, + "grad_norm": 7.53125, + "learning_rate": 1.635830469644903e-05, + "loss": 0.755, + "mean_token_accuracy": 0.7678434896469116, + "num_tokens": 35682799.0, + "step": 4650 + }, + { + "entropy": 2.109342801570892, + "epoch": 0.2088888888888889, + "grad_norm": 7.09375, + "learning_rate": 1.6312485681557847e-05, + "loss": 0.809, + "mean_token_accuracy": 0.7529355835914612, + "num_tokens": 35749281.0, + "step": 4700 + }, + { + "entropy": 2.0876813530921936, + "epoch": 0.2111111111111111, + "grad_norm": 8.5625, + "learning_rate": 1.6266666666666668e-05, + "loss": 0.7859, + "mean_token_accuracy": 0.760038149356842, + "num_tokens": 35817971.0, + "step": 4750 + }, + { + "entropy": 2.109708137512207, + "epoch": 0.21333333333333335, + "grad_norm": 8.875, + "learning_rate": 1.622084765177549e-05, + "loss": 0.8217, + "mean_token_accuracy": 0.7479185128211975, + "num_tokens": 35884686.0, + "step": 4800 + }, + { + "entropy": 2.181152358055115, + "epoch": 0.21555555555555556, + "grad_norm": 11.375, + "learning_rate": 1.617502863688431e-05, + "loss": 0.8565, + "mean_token_accuracy": 0.7458688330650329, + "num_tokens": 35950287.0, + "step": 4850 + }, + { + "entropy": 2.1443709397315978, + "epoch": 0.21777777777777776, + "grad_norm": 9.4375, + "learning_rate": 1.6129209621993128e-05, + "loss": 0.8138, + "mean_token_accuracy": 0.7534062623977661, + "num_tokens": 36015777.0, + "step": 4900 + }, + { + "entropy": 2.0954318118095396, + "epoch": 0.22, + "grad_norm": 7.84375, + "learning_rate": 1.608339060710195e-05, + "loss": 0.7995, + "mean_token_accuracy": 0.7556386387348175, + "num_tokens": 36084969.0, + "step": 4950 + }, + { + "entropy": 2.1733604526519774, + "epoch": 0.2222222222222222, + "grad_norm": 7.3125, + "learning_rate": 1.6037571592210767e-05, + "loss": 0.7857, + "mean_token_accuracy": 0.7591150319576263, + "num_tokens": 36150062.0, + "step": 5000 + }, + { + "epoch": 0.2222222222222222, + "eval_entropy": 2.060102254152298, + "eval_loss": 0.9120854735374451, + "eval_mean_token_accuracy": 0.7374376058578491, + "eval_num_tokens": 36150062.0, + "eval_runtime": 5.8568, + "eval_samples_per_second": 2.22, + "eval_steps_per_second": 0.683, + "step": 5000 + }, + { + "entropy": 2.12566153049469, + "epoch": 0.22444444444444445, + "grad_norm": 6.28125, + "learning_rate": 1.5991752577319588e-05, + "loss": 0.7834, + "mean_token_accuracy": 0.7630481898784638, + "num_tokens": 36218190.0, + "step": 5050 + }, + { + "entropy": 2.1597506642341613, + "epoch": 0.22666666666666666, + "grad_norm": 7.96875, + "learning_rate": 1.594593356242841e-05, + "loss": 0.8013, + "mean_token_accuracy": 0.7572294700145722, + "num_tokens": 36284871.0, + "step": 5100 + }, + { + "entropy": 2.2783447027206423, + "epoch": 0.2288888888888889, + "grad_norm": 6.90625, + "learning_rate": 1.5900114547537227e-05, + "loss": 0.8446, + "mean_token_accuracy": 0.7509990322589875, + "num_tokens": 36352201.0, + "step": 5150 + }, + { + "entropy": 2.0995934796333313, + "epoch": 0.2311111111111111, + "grad_norm": 7.21875, + "learning_rate": 1.585429553264605e-05, + "loss": 0.7758, + "mean_token_accuracy": 0.7649268102645874, + "num_tokens": 36422636.0, + "step": 5200 + }, + { + "entropy": 2.2191508650779723, + "epoch": 0.23333333333333334, + "grad_norm": 8.9375, + "learning_rate": 1.580847651775487e-05, + "loss": 0.8183, + "mean_token_accuracy": 0.7507635414600372, + "num_tokens": 36489811.0, + "step": 5250 + }, + { + "entropy": 2.158704442977905, + "epoch": 0.23555555555555555, + "grad_norm": 11.625, + "learning_rate": 1.576265750286369e-05, + "loss": 0.7919, + "mean_token_accuracy": 0.7578798067569733, + "num_tokens": 36558093.0, + "step": 5300 + }, + { + "entropy": 2.2435457158088683, + "epoch": 0.23777777777777778, + "grad_norm": 10.1875, + "learning_rate": 1.571683848797251e-05, + "loss": 0.8026, + "mean_token_accuracy": 0.7596748018264771, + "num_tokens": 36623942.0, + "step": 5350 + }, + { + "entropy": 2.26217898607254, + "epoch": 0.24, + "grad_norm": 6.6875, + "learning_rate": 1.567101947308133e-05, + "loss": 0.7801, + "mean_token_accuracy": 0.7630024456977844, + "num_tokens": 36693576.0, + "step": 5400 + }, + { + "entropy": 2.1895520281791687, + "epoch": 0.24222222222222223, + "grad_norm": 6.96875, + "learning_rate": 1.562520045819015e-05, + "loss": 0.8052, + "mean_token_accuracy": 0.7553786933422089, + "num_tokens": 36761921.0, + "step": 5450 + }, + { + "entropy": 2.242999081611633, + "epoch": 0.24444444444444444, + "grad_norm": 7.90625, + "learning_rate": 1.5579381443298972e-05, + "loss": 0.7577, + "mean_token_accuracy": 0.7669945132732391, + "num_tokens": 36829085.0, + "step": 5500 + }, + { + "entropy": 2.168731119632721, + "epoch": 0.24666666666666667, + "grad_norm": 7.71875, + "learning_rate": 1.553356242840779e-05, + "loss": 0.7417, + "mean_token_accuracy": 0.7682712721824646, + "num_tokens": 36894333.0, + "step": 5550 + }, + { + "entropy": 2.209793448448181, + "epoch": 0.24888888888888888, + "grad_norm": 10.875, + "learning_rate": 1.548774341351661e-05, + "loss": 0.787, + "mean_token_accuracy": 0.7617891025543213, + "num_tokens": 36964522.0, + "step": 5600 + }, + { + "entropy": 2.2578547739982606, + "epoch": 0.2511111111111111, + "grad_norm": 7.59375, + "learning_rate": 1.5441924398625432e-05, + "loss": 0.8494, + "mean_token_accuracy": 0.7462784695625305, + "num_tokens": 37031831.0, + "step": 5650 + }, + { + "entropy": 2.2453298950195313, + "epoch": 0.25333333333333335, + "grad_norm": 7.09375, + "learning_rate": 1.539610538373425e-05, + "loss": 0.7898, + "mean_token_accuracy": 0.7597798085212708, + "num_tokens": 37096360.0, + "step": 5700 + }, + { + "entropy": 2.1777192115783692, + "epoch": 0.25555555555555554, + "grad_norm": 10.0, + "learning_rate": 1.535028636884307e-05, + "loss": 0.8147, + "mean_token_accuracy": 0.7537575364112854, + "num_tokens": 37161633.0, + "step": 5750 + }, + { + "entropy": 2.171867892742157, + "epoch": 0.2577777777777778, + "grad_norm": 7.71875, + "learning_rate": 1.530446735395189e-05, + "loss": 0.8001, + "mean_token_accuracy": 0.7575827407836914, + "num_tokens": 37228323.0, + "step": 5800 + }, + { + "entropy": 2.1830092740058897, + "epoch": 0.26, + "grad_norm": 8.8125, + "learning_rate": 1.525864833906071e-05, + "loss": 0.7722, + "mean_token_accuracy": 0.7659056377410889, + "num_tokens": 37295036.0, + "step": 5850 + }, + { + "entropy": 2.2247348022460938, + "epoch": 0.26222222222222225, + "grad_norm": 7.75, + "learning_rate": 1.5212829324169531e-05, + "loss": 0.786, + "mean_token_accuracy": 0.7639349389076233, + "num_tokens": 37363275.0, + "step": 5900 + }, + { + "entropy": 2.1976384329795837, + "epoch": 0.2644444444444444, + "grad_norm": 7.21875, + "learning_rate": 1.5167010309278351e-05, + "loss": 0.7832, + "mean_token_accuracy": 0.7637232232093811, + "num_tokens": 37429362.0, + "step": 5950 + }, + { + "entropy": 2.1915416312217713, + "epoch": 0.26666666666666666, + "grad_norm": 7.53125, + "learning_rate": 1.5121191294387172e-05, + "loss": 0.7911, + "mean_token_accuracy": 0.7595293188095092, + "num_tokens": 37496221.0, + "step": 6000 + }, + { + "entropy": 2.1550126123428344, + "epoch": 0.2688888888888889, + "grad_norm": 7.90625, + "learning_rate": 1.5075372279495992e-05, + "loss": 0.7866, + "mean_token_accuracy": 0.7628938972949981, + "num_tokens": 37563946.0, + "step": 6050 + }, + { + "entropy": 2.1578181266784666, + "epoch": 0.27111111111111114, + "grad_norm": 11.625, + "learning_rate": 1.5029553264604813e-05, + "loss": 0.8002, + "mean_token_accuracy": 0.7602377796173095, + "num_tokens": 37632154.0, + "step": 6100 + }, + { + "entropy": 2.1704964351654055, + "epoch": 0.2733333333333333, + "grad_norm": 7.09375, + "learning_rate": 1.4983734249713632e-05, + "loss": 0.7968, + "mean_token_accuracy": 0.7589374840259552, + "num_tokens": 37700403.0, + "step": 6150 + }, + { + "entropy": 2.1958404278755186, + "epoch": 0.27555555555555555, + "grad_norm": 7.71875, + "learning_rate": 1.4937915234822453e-05, + "loss": 0.8021, + "mean_token_accuracy": 0.7585455322265625, + "num_tokens": 37763144.0, + "step": 6200 + }, + { + "entropy": 2.2267961406707766, + "epoch": 0.2777777777777778, + "grad_norm": 9.375, + "learning_rate": 1.4892096219931273e-05, + "loss": 0.8, + "mean_token_accuracy": 0.756108273267746, + "num_tokens": 37828258.0, + "step": 6250 + }, + { + "entropy": 2.2654725027084353, + "epoch": 0.28, + "grad_norm": 8.875, + "learning_rate": 1.4846277205040094e-05, + "loss": 0.8398, + "mean_token_accuracy": 0.7474746882915497, + "num_tokens": 37894712.0, + "step": 6300 + }, + { + "entropy": 2.2119923210144044, + "epoch": 0.2822222222222222, + "grad_norm": 7.125, + "learning_rate": 1.4800458190148912e-05, + "loss": 0.8204, + "mean_token_accuracy": 0.7523449230194091, + "num_tokens": 37961686.0, + "step": 6350 + }, + { + "entropy": 2.176643466949463, + "epoch": 0.28444444444444444, + "grad_norm": 8.5625, + "learning_rate": 1.4754639175257731e-05, + "loss": 0.7743, + "mean_token_accuracy": 0.7683572423458099, + "num_tokens": 38028734.0, + "step": 6400 + }, + { + "entropy": 2.175390179157257, + "epoch": 0.2866666666666667, + "grad_norm": 7.03125, + "learning_rate": 1.4708820160366553e-05, + "loss": 0.8181, + "mean_token_accuracy": 0.7544300401210785, + "num_tokens": 38093844.0, + "step": 6450 + }, + { + "entropy": 2.2051888942718505, + "epoch": 0.28888888888888886, + "grad_norm": 6.5, + "learning_rate": 1.4663001145475372e-05, + "loss": 0.8126, + "mean_token_accuracy": 0.7564982330799103, + "num_tokens": 38159450.0, + "step": 6500 + }, + { + "entropy": 2.217521970272064, + "epoch": 0.2911111111111111, + "grad_norm": 7.90625, + "learning_rate": 1.4617182130584193e-05, + "loss": 0.816, + "mean_token_accuracy": 0.7553452157974243, + "num_tokens": 38224878.0, + "step": 6550 + }, + { + "entropy": 2.190767750740051, + "epoch": 0.29333333333333333, + "grad_norm": 8.875, + "learning_rate": 1.4571363115693013e-05, + "loss": 0.8144, + "mean_token_accuracy": 0.7556489539146424, + "num_tokens": 38291865.0, + "step": 6600 + }, + { + "entropy": 2.2169429779052736, + "epoch": 0.29555555555555557, + "grad_norm": 7.0, + "learning_rate": 1.4525544100801834e-05, + "loss": 0.8167, + "mean_token_accuracy": 0.7550825190544128, + "num_tokens": 38360809.0, + "step": 6650 + }, + { + "entropy": 2.2201628613471986, + "epoch": 0.29777777777777775, + "grad_norm": 8.375, + "learning_rate": 1.4479725085910653e-05, + "loss": 0.8098, + "mean_token_accuracy": 0.7543143463134766, + "num_tokens": 38428799.0, + "step": 6700 + }, + { + "entropy": 2.198468723297119, + "epoch": 0.3, + "grad_norm": 7.59375, + "learning_rate": 1.4433906071019475e-05, + "loss": 0.841, + "mean_token_accuracy": 0.7505165827274323, + "num_tokens": 38496550.0, + "step": 6750 + }, + { + "entropy": 2.1353413486480712, + "epoch": 0.3022222222222222, + "grad_norm": 7.625, + "learning_rate": 1.4388087056128294e-05, + "loss": 0.794, + "mean_token_accuracy": 0.7610986518859864, + "num_tokens": 38564172.0, + "step": 6800 + }, + { + "entropy": 2.249865219593048, + "epoch": 0.30444444444444446, + "grad_norm": 7.5, + "learning_rate": 1.4342268041237115e-05, + "loss": 0.8153, + "mean_token_accuracy": 0.7510960531234742, + "num_tokens": 38630875.0, + "step": 6850 + }, + { + "entropy": 2.1702062487602234, + "epoch": 0.30666666666666664, + "grad_norm": 10.6875, + "learning_rate": 1.4296449026345935e-05, + "loss": 0.7737, + "mean_token_accuracy": 0.763483716249466, + "num_tokens": 38694435.0, + "step": 6900 + }, + { + "entropy": 2.2601536536216735, + "epoch": 0.3088888888888889, + "grad_norm": 10.375, + "learning_rate": 1.4250630011454756e-05, + "loss": 0.8578, + "mean_token_accuracy": 0.7417992842197418, + "num_tokens": 38759789.0, + "step": 6950 + }, + { + "entropy": 2.276434664726257, + "epoch": 0.3111111111111111, + "grad_norm": 6.8125, + "learning_rate": 1.4204810996563575e-05, + "loss": 0.7678, + "mean_token_accuracy": 0.7661231434345246, + "num_tokens": 38826171.0, + "step": 7000 + }, + { + "entropy": 2.2015094637870787, + "epoch": 0.31333333333333335, + "grad_norm": 7.9375, + "learning_rate": 1.4158991981672395e-05, + "loss": 0.8136, + "mean_token_accuracy": 0.7545084583759308, + "num_tokens": 38890825.0, + "step": 7050 + }, + { + "entropy": 2.1954299902915952, + "epoch": 0.31555555555555553, + "grad_norm": 7.875, + "learning_rate": 1.4113172966781214e-05, + "loss": 0.8102, + "mean_token_accuracy": 0.7536708974838257, + "num_tokens": 38957597.0, + "step": 7100 + }, + { + "entropy": 2.1553618788719175, + "epoch": 0.31777777777777777, + "grad_norm": 10.625, + "learning_rate": 1.4067353951890036e-05, + "loss": 0.806, + "mean_token_accuracy": 0.756889488697052, + "num_tokens": 39024094.0, + "step": 7150 + }, + { + "entropy": 2.1930714321136473, + "epoch": 0.32, + "grad_norm": 7.34375, + "learning_rate": 1.4021534936998855e-05, + "loss": 0.8191, + "mean_token_accuracy": 0.7526458752155304, + "num_tokens": 39088926.0, + "step": 7200 + }, + { + "entropy": 2.1648390030860902, + "epoch": 0.32222222222222224, + "grad_norm": 6.40625, + "learning_rate": 1.3975715922107675e-05, + "loss": 0.8018, + "mean_token_accuracy": 0.7559799265861511, + "num_tokens": 39155958.0, + "step": 7250 + }, + { + "entropy": 2.163375446796417, + "epoch": 0.3244444444444444, + "grad_norm": 7.8125, + "learning_rate": 1.3929896907216496e-05, + "loss": 0.8065, + "mean_token_accuracy": 0.7592847204208374, + "num_tokens": 39224450.0, + "step": 7300 + }, + { + "entropy": 2.1846578884124757, + "epoch": 0.32666666666666666, + "grad_norm": 8.0625, + "learning_rate": 1.3884077892325315e-05, + "loss": 0.7933, + "mean_token_accuracy": 0.761990624666214, + "num_tokens": 39288896.0, + "step": 7350 + }, + { + "entropy": 2.2050635480880736, + "epoch": 0.3288888888888889, + "grad_norm": 6.84375, + "learning_rate": 1.3838258877434137e-05, + "loss": 0.8317, + "mean_token_accuracy": 0.7510676419734955, + "num_tokens": 39354824.0, + "step": 7400 + }, + { + "entropy": 2.1733724021911622, + "epoch": 0.33111111111111113, + "grad_norm": 10.5, + "learning_rate": 1.3792439862542956e-05, + "loss": 0.8164, + "mean_token_accuracy": 0.755161405801773, + "num_tokens": 39423233.0, + "step": 7450 + }, + { + "entropy": 2.1624359107017517, + "epoch": 0.3333333333333333, + "grad_norm": 8.875, + "learning_rate": 1.3746620847651777e-05, + "loss": 0.8105, + "mean_token_accuracy": 0.7563036870956421, + "num_tokens": 39488844.0, + "step": 7500 + }, + { + "entropy": 2.2275446701049804, + "epoch": 0.33555555555555555, + "grad_norm": 8.625, + "learning_rate": 1.3700801832760597e-05, + "loss": 0.8569, + "mean_token_accuracy": 0.7444409322738648, + "num_tokens": 39557151.0, + "step": 7550 + }, + { + "entropy": 2.160377633571625, + "epoch": 0.3377777777777778, + "grad_norm": 8.0, + "learning_rate": 1.3654982817869418e-05, + "loss": 0.7943, + "mean_token_accuracy": 0.7591172540187836, + "num_tokens": 39622145.0, + "step": 7600 + }, + { + "entropy": 2.1659918451309204, + "epoch": 0.34, + "grad_norm": 8.8125, + "learning_rate": 1.3609163802978237e-05, + "loss": 0.8312, + "mean_token_accuracy": 0.7525255751609802, + "num_tokens": 39692633.0, + "step": 7650 + }, + { + "entropy": 2.15194406747818, + "epoch": 0.3422222222222222, + "grad_norm": 9.8125, + "learning_rate": 1.3563344788087059e-05, + "loss": 0.7778, + "mean_token_accuracy": 0.7636078727245331, + "num_tokens": 39759159.0, + "step": 7700 + }, + { + "entropy": 2.217858202457428, + "epoch": 0.34444444444444444, + "grad_norm": 7.09375, + "learning_rate": 1.3517525773195876e-05, + "loss": 0.8241, + "mean_token_accuracy": 0.7522653472423554, + "num_tokens": 39826283.0, + "step": 7750 + }, + { + "entropy": 2.1696537256240847, + "epoch": 0.3466666666666667, + "grad_norm": 9.0625, + "learning_rate": 1.3471706758304698e-05, + "loss": 0.815, + "mean_token_accuracy": 0.7559839642047882, + "num_tokens": 39895721.0, + "step": 7800 + }, + { + "entropy": 2.1727401876449584, + "epoch": 0.3488888888888889, + "grad_norm": 8.375, + "learning_rate": 1.3425887743413517e-05, + "loss": 0.8366, + "mean_token_accuracy": 0.7508484077453613, + "num_tokens": 39963266.0, + "step": 7850 + }, + { + "entropy": 2.1629985857009886, + "epoch": 0.3511111111111111, + "grad_norm": 7.25, + "learning_rate": 1.3380068728522338e-05, + "loss": 0.782, + "mean_token_accuracy": 0.761427184343338, + "num_tokens": 40031913.0, + "step": 7900 + }, + { + "entropy": 2.157064917087555, + "epoch": 0.35333333333333333, + "grad_norm": 14.125, + "learning_rate": 1.3334249713631158e-05, + "loss": 0.7747, + "mean_token_accuracy": 0.7661045718193055, + "num_tokens": 40098930.0, + "step": 7950 + }, + { + "entropy": 2.0971979546546935, + "epoch": 0.35555555555555557, + "grad_norm": 8.75, + "learning_rate": 1.3288430698739979e-05, + "loss": 0.8186, + "mean_token_accuracy": 0.7531020665168762, + "num_tokens": 40165756.0, + "step": 8000 + }, + { + "entropy": 2.1666273212432863, + "epoch": 0.35777777777777775, + "grad_norm": 7.03125, + "learning_rate": 1.3242611683848798e-05, + "loss": 0.842, + "mean_token_accuracy": 0.7474488174915314, + "num_tokens": 40233071.0, + "step": 8050 + }, + { + "entropy": 2.2938923120498655, + "epoch": 0.36, + "grad_norm": 11.875, + "learning_rate": 1.3196792668957618e-05, + "loss": 0.8804, + "mean_token_accuracy": 0.7407302105426788, + "num_tokens": 40297749.0, + "step": 8100 + }, + { + "entropy": 2.2303052496910096, + "epoch": 0.3622222222222222, + "grad_norm": 8.125, + "learning_rate": 1.3150973654066439e-05, + "loss": 0.8033, + "mean_token_accuracy": 0.7578864073753357, + "num_tokens": 40366213.0, + "step": 8150 + }, + { + "entropy": 2.2161930847167968, + "epoch": 0.36444444444444446, + "grad_norm": 9.6875, + "learning_rate": 1.3105154639175259e-05, + "loss": 0.7947, + "mean_token_accuracy": 0.7593982243537902, + "num_tokens": 40433288.0, + "step": 8200 + }, + { + "entropy": 2.1040697479248047, + "epoch": 0.36666666666666664, + "grad_norm": 7.21875, + "learning_rate": 1.305933562428408e-05, + "loss": 0.7959, + "mean_token_accuracy": 0.7588922154903411, + "num_tokens": 40499720.0, + "step": 8250 + }, + { + "entropy": 2.1772580099105836, + "epoch": 0.3688888888888889, + "grad_norm": 7.25, + "learning_rate": 1.30135166093929e-05, + "loss": 0.8284, + "mean_token_accuracy": 0.7538638985157013, + "num_tokens": 40565376.0, + "step": 8300 + }, + { + "entropy": 2.128792498111725, + "epoch": 0.3711111111111111, + "grad_norm": 8.6875, + "learning_rate": 1.296769759450172e-05, + "loss": 0.7672, + "mean_token_accuracy": 0.7641964781284333, + "num_tokens": 40631675.0, + "step": 8350 + }, + { + "entropy": 2.1719995403289794, + "epoch": 0.37333333333333335, + "grad_norm": 9.25, + "learning_rate": 1.2921878579610538e-05, + "loss": 0.801, + "mean_token_accuracy": 0.76089714884758, + "num_tokens": 40697335.0, + "step": 8400 + }, + { + "entropy": 2.156131479740143, + "epoch": 0.37555555555555553, + "grad_norm": 8.75, + "learning_rate": 1.287605956471936e-05, + "loss": 0.8384, + "mean_token_accuracy": 0.7503017449378967, + "num_tokens": 40764922.0, + "step": 8450 + }, + { + "entropy": 2.284397015571594, + "epoch": 0.37777777777777777, + "grad_norm": 9.25, + "learning_rate": 1.2830240549828179e-05, + "loss": 0.8418, + "mean_token_accuracy": 0.7493084251880646, + "num_tokens": 40828443.0, + "step": 8500 + }, + { + "entropy": 2.147591190338135, + "epoch": 0.38, + "grad_norm": 8.625, + "learning_rate": 1.2784421534937e-05, + "loss": 0.8173, + "mean_token_accuracy": 0.756924353837967, + "num_tokens": 40894414.0, + "step": 8550 + }, + { + "entropy": 2.142817313671112, + "epoch": 0.38222222222222224, + "grad_norm": 8.5625, + "learning_rate": 1.273860252004582e-05, + "loss": 0.7962, + "mean_token_accuracy": 0.7598996949195862, + "num_tokens": 40962274.0, + "step": 8600 + }, + { + "entropy": 2.1330949759483335, + "epoch": 0.3844444444444444, + "grad_norm": 7.0625, + "learning_rate": 1.269278350515464e-05, + "loss": 0.7939, + "mean_token_accuracy": 0.7612218356132507, + "num_tokens": 41030193.0, + "step": 8650 + }, + { + "entropy": 2.166394736766815, + "epoch": 0.38666666666666666, + "grad_norm": 7.40625, + "learning_rate": 1.264696449026346e-05, + "loss": 0.8336, + "mean_token_accuracy": 0.7500250363349914, + "num_tokens": 41099766.0, + "step": 8700 + }, + { + "entropy": 2.2078594040870665, + "epoch": 0.3888888888888889, + "grad_norm": 8.625, + "learning_rate": 1.2601145475372281e-05, + "loss": 0.803, + "mean_token_accuracy": 0.7575648665428162, + "num_tokens": 41164744.0, + "step": 8750 + }, + { + "entropy": 2.2065504598617554, + "epoch": 0.39111111111111113, + "grad_norm": 6.9375, + "learning_rate": 1.2555326460481101e-05, + "loss": 0.816, + "mean_token_accuracy": 0.7564821767807007, + "num_tokens": 41232194.0, + "step": 8800 + }, + { + "entropy": 2.1662226915359497, + "epoch": 0.3933333333333333, + "grad_norm": 8.0625, + "learning_rate": 1.2509507445589922e-05, + "loss": 0.8317, + "mean_token_accuracy": 0.7503245520591736, + "num_tokens": 41297353.0, + "step": 8850 + }, + { + "entropy": 2.236915967464447, + "epoch": 0.39555555555555555, + "grad_norm": 9.875, + "learning_rate": 1.2463688430698742e-05, + "loss": 0.7977, + "mean_token_accuracy": 0.759481954574585, + "num_tokens": 41361134.0, + "step": 8900 + }, + { + "entropy": 2.1871555137634275, + "epoch": 0.3977777777777778, + "grad_norm": 7.90625, + "learning_rate": 1.2417869415807561e-05, + "loss": 0.8034, + "mean_token_accuracy": 0.757278825044632, + "num_tokens": 41427995.0, + "step": 8950 + }, + { + "entropy": 2.2592362785339355, + "epoch": 0.4, + "grad_norm": 9.875, + "learning_rate": 1.2372050400916382e-05, + "loss": 0.8186, + "mean_token_accuracy": 0.7548012447357177, + "num_tokens": 41494201.0, + "step": 9000 + }, + { + "entropy": 2.196383099555969, + "epoch": 0.4022222222222222, + "grad_norm": 7.625, + "learning_rate": 1.2326231386025202e-05, + "loss": 0.8138, + "mean_token_accuracy": 0.7561362779140473, + "num_tokens": 41559989.0, + "step": 9050 + }, + { + "entropy": 2.1516851663589476, + "epoch": 0.40444444444444444, + "grad_norm": 13.75, + "learning_rate": 1.2280412371134021e-05, + "loss": 0.7951, + "mean_token_accuracy": 0.7594835031032562, + "num_tokens": 41626379.0, + "step": 9100 + }, + { + "entropy": 2.1745046091079714, + "epoch": 0.4066666666666667, + "grad_norm": 8.0, + "learning_rate": 1.223459335624284e-05, + "loss": 0.8048, + "mean_token_accuracy": 0.7599712920188904, + "num_tokens": 41692143.0, + "step": 9150 + }, + { + "entropy": 2.154158115386963, + "epoch": 0.4088888888888889, + "grad_norm": 8.25, + "learning_rate": 1.2188774341351662e-05, + "loss": 0.7877, + "mean_token_accuracy": 0.7622714912891388, + "num_tokens": 41759805.0, + "step": 9200 + }, + { + "entropy": 2.2480538749694823, + "epoch": 0.4111111111111111, + "grad_norm": 8.4375, + "learning_rate": 1.2142955326460481e-05, + "loss": 0.8052, + "mean_token_accuracy": 0.7560270345211029, + "num_tokens": 41823593.0, + "step": 9250 + }, + { + "entropy": 2.2491649985313416, + "epoch": 0.41333333333333333, + "grad_norm": 9.125, + "learning_rate": 1.2097136311569303e-05, + "loss": 0.8696, + "mean_token_accuracy": 0.7419601881504059, + "num_tokens": 41888809.0, + "step": 9300 + }, + { + "entropy": 2.258921926021576, + "epoch": 0.41555555555555557, + "grad_norm": 10.0625, + "learning_rate": 1.2051317296678122e-05, + "loss": 0.8289, + "mean_token_accuracy": 0.75118199467659, + "num_tokens": 41952730.0, + "step": 9350 + }, + { + "entropy": 2.163720915317535, + "epoch": 0.4177777777777778, + "grad_norm": 9.25, + "learning_rate": 1.2005498281786943e-05, + "loss": 0.8323, + "mean_token_accuracy": 0.7530957090854645, + "num_tokens": 42019573.0, + "step": 9400 + }, + { + "entropy": 2.2212659883499146, + "epoch": 0.42, + "grad_norm": 7.40625, + "learning_rate": 1.1959679266895763e-05, + "loss": 0.8361, + "mean_token_accuracy": 0.7518267011642457, + "num_tokens": 42088291.0, + "step": 9450 + }, + { + "entropy": 2.2050712847709657, + "epoch": 0.4222222222222222, + "grad_norm": 9.75, + "learning_rate": 1.1913860252004584e-05, + "loss": 0.8456, + "mean_token_accuracy": 0.7448002827167511, + "num_tokens": 42158120.0, + "step": 9500 + }, + { + "entropy": 2.216916351318359, + "epoch": 0.42444444444444446, + "grad_norm": 6.8125, + "learning_rate": 1.1868041237113403e-05, + "loss": 0.8415, + "mean_token_accuracy": 0.7452654683589935, + "num_tokens": 42226742.0, + "step": 9550 + }, + { + "entropy": 2.1955419325828553, + "epoch": 0.4266666666666667, + "grad_norm": 8.1875, + "learning_rate": 1.1822222222222225e-05, + "loss": 0.8166, + "mean_token_accuracy": 0.7562698805332184, + "num_tokens": 42292822.0, + "step": 9600 + }, + { + "entropy": 2.2368349695205687, + "epoch": 0.4288888888888889, + "grad_norm": 7.34375, + "learning_rate": 1.1776403207331044e-05, + "loss": 0.8089, + "mean_token_accuracy": 0.757049810886383, + "num_tokens": 42356424.0, + "step": 9650 + }, + { + "entropy": 2.178975234031677, + "epoch": 0.4311111111111111, + "grad_norm": 8.375, + "learning_rate": 1.1730584192439865e-05, + "loss": 0.8438, + "mean_token_accuracy": 0.7456352376937866, + "num_tokens": 42422979.0, + "step": 9700 + }, + { + "entropy": 2.0868096995353698, + "epoch": 0.43333333333333335, + "grad_norm": 7.53125, + "learning_rate": 1.1684765177548683e-05, + "loss": 0.8235, + "mean_token_accuracy": 0.7529381263256073, + "num_tokens": 42490375.0, + "step": 9750 + }, + { + "entropy": 2.147622332572937, + "epoch": 0.43555555555555553, + "grad_norm": 9.75, + "learning_rate": 1.1638946162657503e-05, + "loss": 0.7728, + "mean_token_accuracy": 0.7673224699497223, + "num_tokens": 42557143.0, + "step": 9800 + }, + { + "entropy": 2.114510886669159, + "epoch": 0.43777777777777777, + "grad_norm": 9.375, + "learning_rate": 1.1593127147766324e-05, + "loss": 0.8238, + "mean_token_accuracy": 0.7501462066173553, + "num_tokens": 42626026.0, + "step": 9850 + }, + { + "entropy": 2.133998863697052, + "epoch": 0.44, + "grad_norm": 7.4375, + "learning_rate": 1.1547308132875143e-05, + "loss": 0.8005, + "mean_token_accuracy": 0.7565454721450806, + "num_tokens": 42688911.0, + "step": 9900 + }, + { + "entropy": 2.146713092327118, + "epoch": 0.44222222222222224, + "grad_norm": 8.625, + "learning_rate": 1.1501489117983964e-05, + "loss": 0.801, + "mean_token_accuracy": 0.7562527394294739, + "num_tokens": 42756274.0, + "step": 9950 + }, + { + "entropy": 2.205234091281891, + "epoch": 0.4444444444444444, + "grad_norm": 6.875, + "learning_rate": 1.1455670103092784e-05, + "loss": 0.8242, + "mean_token_accuracy": 0.7537857472896576, + "num_tokens": 42825342.0, + "step": 10000 + }, + { + "epoch": 0.4444444444444444, + "eval_entropy": 2.0749987959861755, + "eval_loss": 0.8938310146331787, + "eval_mean_token_accuracy": 0.7446183115243912, + "eval_num_tokens": 42825342.0, + "eval_runtime": 7.8855, + "eval_samples_per_second": 1.649, + "eval_steps_per_second": 0.507, + "step": 10000 + }, + { + "entropy": 2.232636868953705, + "epoch": 0.44666666666666666, + "grad_norm": 6.3125, + "learning_rate": 1.1409851088201605e-05, + "loss": 0.8088, + "mean_token_accuracy": 0.7571802771091461, + "num_tokens": 42892615.0, + "step": 10050 + }, + { + "entropy": 2.1724118828773498, + "epoch": 0.4488888888888889, + "grad_norm": 7.4375, + "learning_rate": 1.1364032073310425e-05, + "loss": 0.7938, + "mean_token_accuracy": 0.7575406277179718, + "num_tokens": 42961116.0, + "step": 10100 + }, + { + "entropy": 2.2688467741012572, + "epoch": 0.45111111111111113, + "grad_norm": 8.25, + "learning_rate": 1.1318213058419246e-05, + "loss": 0.8109, + "mean_token_accuracy": 0.7592162156105041, + "num_tokens": 43026002.0, + "step": 10150 + }, + { + "entropy": 2.189846224784851, + "epoch": 0.4533333333333333, + "grad_norm": 7.4375, + "learning_rate": 1.1272394043528065e-05, + "loss": 0.8393, + "mean_token_accuracy": 0.750606085062027, + "num_tokens": 43096450.0, + "step": 10200 + }, + { + "entropy": 2.1485534167289733, + "epoch": 0.45555555555555555, + "grad_norm": 6.96875, + "learning_rate": 1.1226575028636886e-05, + "loss": 0.8005, + "mean_token_accuracy": 0.7586869549751282, + "num_tokens": 43164974.0, + "step": 10250 + }, + { + "entropy": 2.114973647594452, + "epoch": 0.4577777777777778, + "grad_norm": 7.71875, + "learning_rate": 1.1180756013745706e-05, + "loss": 0.81, + "mean_token_accuracy": 0.7530654954910279, + "num_tokens": 43230676.0, + "step": 10300 + }, + { + "entropy": 2.210615482330322, + "epoch": 0.46, + "grad_norm": 11.6875, + "learning_rate": 1.1134936998854527e-05, + "loss": 0.8133, + "mean_token_accuracy": 0.7568694865703582, + "num_tokens": 43295393.0, + "step": 10350 + }, + { + "entropy": 2.1622642707824706, + "epoch": 0.4622222222222222, + "grad_norm": 7.03125, + "learning_rate": 1.1089117983963347e-05, + "loss": 0.7942, + "mean_token_accuracy": 0.7580956876277923, + "num_tokens": 43361602.0, + "step": 10400 + }, + { + "entropy": 2.164878299236298, + "epoch": 0.46444444444444444, + "grad_norm": 6.6875, + "learning_rate": 1.1043298969072164e-05, + "loss": 0.7759, + "mean_token_accuracy": 0.7644217216968536, + "num_tokens": 43427335.0, + "step": 10450 + }, + { + "entropy": 2.1177299284934996, + "epoch": 0.4666666666666667, + "grad_norm": 10.0, + "learning_rate": 1.0997479954180986e-05, + "loss": 0.85, + "mean_token_accuracy": 0.7469760966300965, + "num_tokens": 43494368.0, + "step": 10500 + }, + { + "entropy": 2.151867859363556, + "epoch": 0.4688888888888889, + "grad_norm": 9.25, + "learning_rate": 1.0951660939289805e-05, + "loss": 0.8053, + "mean_token_accuracy": 0.7571006786823272, + "num_tokens": 43560267.0, + "step": 10550 + }, + { + "entropy": 2.1284614849090575, + "epoch": 0.4711111111111111, + "grad_norm": 6.6875, + "learning_rate": 1.0905841924398626e-05, + "loss": 0.8309, + "mean_token_accuracy": 0.752720388174057, + "num_tokens": 43626812.0, + "step": 10600 + }, + { + "entropy": 2.1522640681266783, + "epoch": 0.47333333333333333, + "grad_norm": 8.4375, + "learning_rate": 1.0860022909507446e-05, + "loss": 0.8633, + "mean_token_accuracy": 0.7454240775108337, + "num_tokens": 43693459.0, + "step": 10650 + }, + { + "entropy": 2.1508279252052307, + "epoch": 0.47555555555555556, + "grad_norm": 7.78125, + "learning_rate": 1.0814203894616267e-05, + "loss": 0.7701, + "mean_token_accuracy": 0.767644681930542, + "num_tokens": 43758441.0, + "step": 10700 + }, + { + "entropy": 2.1406164813041686, + "epoch": 0.4777777777777778, + "grad_norm": 8.3125, + "learning_rate": 1.0768384879725086e-05, + "loss": 0.8299, + "mean_token_accuracy": 0.7507808887958527, + "num_tokens": 43827587.0, + "step": 10750 + }, + { + "entropy": 2.0773234295845033, + "epoch": 0.48, + "grad_norm": 6.65625, + "learning_rate": 1.0722565864833908e-05, + "loss": 0.8059, + "mean_token_accuracy": 0.7575164914131165, + "num_tokens": 43893454.0, + "step": 10800 + }, + { + "entropy": 2.0964699006080627, + "epoch": 0.4822222222222222, + "grad_norm": 7.09375, + "learning_rate": 1.0676746849942727e-05, + "loss": 0.8538, + "mean_token_accuracy": 0.7474709522724151, + "num_tokens": 43961221.0, + "step": 10850 + }, + { + "entropy": 2.127323613166809, + "epoch": 0.48444444444444446, + "grad_norm": 8.25, + "learning_rate": 1.0630927835051548e-05, + "loss": 0.823, + "mean_token_accuracy": 0.7566704392433167, + "num_tokens": 44030179.0, + "step": 10900 + }, + { + "entropy": 2.1277793073654174, + "epoch": 0.4866666666666667, + "grad_norm": 8.0, + "learning_rate": 1.0585108820160368e-05, + "loss": 0.7951, + "mean_token_accuracy": 0.7619584739208222, + "num_tokens": 44096190.0, + "step": 10950 + }, + { + "entropy": 2.100730609893799, + "epoch": 0.4888888888888889, + "grad_norm": 7.5625, + "learning_rate": 1.0539289805269189e-05, + "loss": 0.8355, + "mean_token_accuracy": 0.7507687473297119, + "num_tokens": 44164149.0, + "step": 11000 + }, + { + "entropy": 2.1630342602729797, + "epoch": 0.4911111111111111, + "grad_norm": 9.0, + "learning_rate": 1.0493470790378008e-05, + "loss": 0.8394, + "mean_token_accuracy": 0.7504077112674713, + "num_tokens": 44232826.0, + "step": 11050 + }, + { + "entropy": 2.135989320278168, + "epoch": 0.49333333333333335, + "grad_norm": 7.25, + "learning_rate": 1.044765177548683e-05, + "loss": 0.826, + "mean_token_accuracy": 0.7503847754001618, + "num_tokens": 44300785.0, + "step": 11100 + }, + { + "entropy": 2.1245554232597352, + "epoch": 0.4955555555555556, + "grad_norm": 13.125, + "learning_rate": 1.0401832760595647e-05, + "loss": 0.8276, + "mean_token_accuracy": 0.7498899948596954, + "num_tokens": 44366272.0, + "step": 11150 + }, + { + "entropy": 2.0881606268882753, + "epoch": 0.49777777777777776, + "grad_norm": 8.5625, + "learning_rate": 1.0356013745704467e-05, + "loss": 0.8272, + "mean_token_accuracy": 0.7556144452095032, + "num_tokens": 44432287.0, + "step": 11200 + }, + { + "entropy": 2.1587840700149536, + "epoch": 0.5, + "grad_norm": 22.625, + "learning_rate": 1.0310194730813288e-05, + "loss": 0.8169, + "mean_token_accuracy": 0.7587198996543885, + "num_tokens": 44497099.0, + "step": 11250 + }, + { + "entropy": 2.059075405597687, + "epoch": 0.5022222222222222, + "grad_norm": 6.15625, + "learning_rate": 1.0264375715922108e-05, + "loss": 0.8128, + "mean_token_accuracy": 0.7540943920612335, + "num_tokens": 44568074.0, + "step": 11300 + }, + { + "entropy": 2.132368493080139, + "epoch": 0.5044444444444445, + "grad_norm": 5.65625, + "learning_rate": 1.0218556701030929e-05, + "loss": 0.7915, + "mean_token_accuracy": 0.7632870030403137, + "num_tokens": 44634710.0, + "step": 11350 + }, + { + "entropy": 2.075038847923279, + "epoch": 0.5066666666666667, + "grad_norm": 11.3125, + "learning_rate": 1.0172737686139748e-05, + "loss": 0.8615, + "mean_token_accuracy": 0.7465237331390381, + "num_tokens": 44705095.0, + "step": 11400 + }, + { + "entropy": 2.11560346364975, + "epoch": 0.5088888888888888, + "grad_norm": 7.3125, + "learning_rate": 1.012691867124857e-05, + "loss": 0.8016, + "mean_token_accuracy": 0.758952580690384, + "num_tokens": 44772217.0, + "step": 11450 + }, + { + "entropy": 2.1024736380577087, + "epoch": 0.5111111111111111, + "grad_norm": 13.5625, + "learning_rate": 1.0081099656357389e-05, + "loss": 0.7976, + "mean_token_accuracy": 0.7605179595947266, + "num_tokens": 44840575.0, + "step": 11500 + }, + { + "entropy": 2.145506045818329, + "epoch": 0.5133333333333333, + "grad_norm": 7.03125, + "learning_rate": 1.003528064146621e-05, + "loss": 0.8572, + "mean_token_accuracy": 0.7433122813701629, + "num_tokens": 44906721.0, + "step": 11550 + }, + { + "entropy": 2.1501906871795655, + "epoch": 0.5155555555555555, + "grad_norm": 7.9375, + "learning_rate": 9.98946162657503e-06, + "loss": 0.8199, + "mean_token_accuracy": 0.7566520750522614, + "num_tokens": 44973401.0, + "step": 11600 + }, + { + "entropy": 2.180145356655121, + "epoch": 0.5177777777777778, + "grad_norm": 7.96875, + "learning_rate": 9.943642611683849e-06, + "loss": 0.8497, + "mean_token_accuracy": 0.747704974412918, + "num_tokens": 45038114.0, + "step": 11650 + }, + { + "entropy": 2.1444980192184446, + "epoch": 0.52, + "grad_norm": 7.3125, + "learning_rate": 9.897823596792669e-06, + "loss": 0.8054, + "mean_token_accuracy": 0.7571915662288666, + "num_tokens": 45105740.0, + "step": 11700 + }, + { + "entropy": 2.1226493763923644, + "epoch": 0.5222222222222223, + "grad_norm": 8.6875, + "learning_rate": 9.85200458190149e-06, + "loss": 0.8543, + "mean_token_accuracy": 0.74732057929039, + "num_tokens": 45173497.0, + "step": 11750 + }, + { + "entropy": 2.105964336395264, + "epoch": 0.5244444444444445, + "grad_norm": 6.90625, + "learning_rate": 9.80618556701031e-06, + "loss": 0.8664, + "mean_token_accuracy": 0.7433609414100647, + "num_tokens": 45240881.0, + "step": 11800 + }, + { + "entropy": 2.1655562567710875, + "epoch": 0.5266666666666666, + "grad_norm": 7.78125, + "learning_rate": 9.76036655211913e-06, + "loss": 0.8134, + "mean_token_accuracy": 0.7605512762069702, + "num_tokens": 45306709.0, + "step": 11850 + }, + { + "entropy": 2.115200798511505, + "epoch": 0.5288888888888889, + "grad_norm": 8.125, + "learning_rate": 9.71454753722795e-06, + "loss": 0.7942, + "mean_token_accuracy": 0.76005859375, + "num_tokens": 45373791.0, + "step": 11900 + }, + { + "entropy": 2.1086077737808226, + "epoch": 0.5311111111111111, + "grad_norm": 8.375, + "learning_rate": 9.668728522336771e-06, + "loss": 0.832, + "mean_token_accuracy": 0.7541840577125549, + "num_tokens": 45442329.0, + "step": 11950 + }, + { + "entropy": 2.1453971552848814, + "epoch": 0.5333333333333333, + "grad_norm": 9.625, + "learning_rate": 9.62290950744559e-06, + "loss": 0.8618, + "mean_token_accuracy": 0.74840935587883, + "num_tokens": 45509133.0, + "step": 12000 + }, + { + "entropy": 2.185004472732544, + "epoch": 0.5355555555555556, + "grad_norm": 6.46875, + "learning_rate": 9.57709049255441e-06, + "loss": 0.8269, + "mean_token_accuracy": 0.75218052983284, + "num_tokens": 45574414.0, + "step": 12050 + }, + { + "entropy": 2.0952596497535705, + "epoch": 0.5377777777777778, + "grad_norm": 7.59375, + "learning_rate": 9.531271477663231e-06, + "loss": 0.8312, + "mean_token_accuracy": 0.7515332329273224, + "num_tokens": 45645759.0, + "step": 12100 + }, + { + "entropy": 2.1197858357429507, + "epoch": 0.54, + "grad_norm": 8.0, + "learning_rate": 9.48545246277205e-06, + "loss": 0.8165, + "mean_token_accuracy": 0.7548327159881592, + "num_tokens": 45711167.0, + "step": 12150 + }, + { + "entropy": 2.0839271712303162, + "epoch": 0.5422222222222223, + "grad_norm": 7.34375, + "learning_rate": 9.439633447880872e-06, + "loss": 0.8007, + "mean_token_accuracy": 0.7602275168895721, + "num_tokens": 45781410.0, + "step": 12200 + }, + { + "entropy": 2.1347982597351076, + "epoch": 0.5444444444444444, + "grad_norm": 7.09375, + "learning_rate": 9.393814432989692e-06, + "loss": 0.8163, + "mean_token_accuracy": 0.752629029750824, + "num_tokens": 45845816.0, + "step": 12250 + }, + { + "entropy": 2.124971535205841, + "epoch": 0.5466666666666666, + "grad_norm": 8.1875, + "learning_rate": 9.347995418098513e-06, + "loss": 0.8166, + "mean_token_accuracy": 0.7571777141094208, + "num_tokens": 45911093.0, + "step": 12300 + }, + { + "entropy": 2.133582751750946, + "epoch": 0.5488888888888889, + "grad_norm": 7.8125, + "learning_rate": 9.302176403207332e-06, + "loss": 0.8724, + "mean_token_accuracy": 0.7431828391551971, + "num_tokens": 45980429.0, + "step": 12350 + }, + { + "entropy": 2.102145965099335, + "epoch": 0.5511111111111111, + "grad_norm": 6.8125, + "learning_rate": 9.256357388316152e-06, + "loss": 0.799, + "mean_token_accuracy": 0.7613983142375946, + "num_tokens": 46047747.0, + "step": 12400 + }, + { + "entropy": 2.19835401058197, + "epoch": 0.5533333333333333, + "grad_norm": 8.25, + "learning_rate": 9.210538373424973e-06, + "loss": 0.8433, + "mean_token_accuracy": 0.750613443851471, + "num_tokens": 46109196.0, + "step": 12450 + }, + { + "entropy": 2.192390067577362, + "epoch": 0.5555555555555556, + "grad_norm": 6.6875, + "learning_rate": 9.164719358533792e-06, + "loss": 0.8233, + "mean_token_accuracy": 0.7548783671855926, + "num_tokens": 46175945.0, + "step": 12500 + }, + { + "entropy": 2.1229864621162413, + "epoch": 0.5577777777777778, + "grad_norm": 7.0625, + "learning_rate": 9.118900343642612e-06, + "loss": 0.8412, + "mean_token_accuracy": 0.7494572842121124, + "num_tokens": 46241375.0, + "step": 12550 + }, + { + "entropy": 2.1183197784423826, + "epoch": 0.56, + "grad_norm": 6.875, + "learning_rate": 9.073081328751433e-06, + "loss": 0.7941, + "mean_token_accuracy": 0.7617311882972717, + "num_tokens": 46309203.0, + "step": 12600 + }, + { + "entropy": 2.1563652181625366, + "epoch": 0.5622222222222222, + "grad_norm": 7.34375, + "learning_rate": 9.027262313860253e-06, + "loss": 0.8333, + "mean_token_accuracy": 0.7494297361373902, + "num_tokens": 46377337.0, + "step": 12650 + }, + { + "entropy": 2.149436049461365, + "epoch": 0.5644444444444444, + "grad_norm": 9.0, + "learning_rate": 8.981443298969072e-06, + "loss": 0.8053, + "mean_token_accuracy": 0.7584555840492249, + "num_tokens": 46444416.0, + "step": 12700 + }, + { + "entropy": 2.1461888194084167, + "epoch": 0.5666666666666667, + "grad_norm": 6.6875, + "learning_rate": 8.935624284077893e-06, + "loss": 0.8286, + "mean_token_accuracy": 0.7490329504013061, + "num_tokens": 46513786.0, + "step": 12750 + }, + { + "entropy": 2.149775302410126, + "epoch": 0.5688888888888889, + "grad_norm": 7.5625, + "learning_rate": 8.889805269186713e-06, + "loss": 0.8296, + "mean_token_accuracy": 0.7498440980911255, + "num_tokens": 46580058.0, + "step": 12800 + }, + { + "entropy": 2.1283646035194397, + "epoch": 0.5711111111111111, + "grad_norm": 6.96875, + "learning_rate": 8.843986254295534e-06, + "loss": 0.8538, + "mean_token_accuracy": 0.7479376924037934, + "num_tokens": 46645685.0, + "step": 12850 + }, + { + "entropy": 2.136440978050232, + "epoch": 0.5733333333333334, + "grad_norm": 10.1875, + "learning_rate": 8.798167239404353e-06, + "loss": 0.8558, + "mean_token_accuracy": 0.7462117183208465, + "num_tokens": 46713622.0, + "step": 12900 + }, + { + "entropy": 2.044099681377411, + "epoch": 0.5755555555555556, + "grad_norm": 7.65625, + "learning_rate": 8.752348224513175e-06, + "loss": 0.8078, + "mean_token_accuracy": 0.755551530122757, + "num_tokens": 46784415.0, + "step": 12950 + }, + { + "entropy": 2.1400314664840696, + "epoch": 0.5777777777777777, + "grad_norm": 9.1875, + "learning_rate": 8.706529209621994e-06, + "loss": 0.7706, + "mean_token_accuracy": 0.7651760971546173, + "num_tokens": 46849509.0, + "step": 13000 + }, + { + "entropy": 2.1189517760276795, + "epoch": 0.58, + "grad_norm": 8.0, + "learning_rate": 8.660710194730814e-06, + "loss": 0.8118, + "mean_token_accuracy": 0.7566381883621216, + "num_tokens": 46916191.0, + "step": 13050 + }, + { + "entropy": 2.107829716205597, + "epoch": 0.5822222222222222, + "grad_norm": 6.5625, + "learning_rate": 8.614891179839635e-06, + "loss": 0.7543, + "mean_token_accuracy": 0.7710553121566772, + "num_tokens": 46984133.0, + "step": 13100 + }, + { + "entropy": 2.2209577679634096, + "epoch": 0.5844444444444444, + "grad_norm": 10.4375, + "learning_rate": 8.569072164948454e-06, + "loss": 0.8554, + "mean_token_accuracy": 0.7479790794849396, + "num_tokens": 47046937.0, + "step": 13150 + }, + { + "entropy": 2.1490628361701964, + "epoch": 0.5866666666666667, + "grad_norm": 7.40625, + "learning_rate": 8.523253150057275e-06, + "loss": 0.8488, + "mean_token_accuracy": 0.7476764714717865, + "num_tokens": 47115124.0, + "step": 13200 + }, + { + "entropy": 2.1350867557525635, + "epoch": 0.5888888888888889, + "grad_norm": 7.4375, + "learning_rate": 8.477434135166095e-06, + "loss": 0.8266, + "mean_token_accuracy": 0.7515107154846191, + "num_tokens": 47183436.0, + "step": 13250 + }, + { + "entropy": 2.203930015563965, + "epoch": 0.5911111111111111, + "grad_norm": 7.375, + "learning_rate": 8.431615120274916e-06, + "loss": 0.8206, + "mean_token_accuracy": 0.758274735212326, + "num_tokens": 47250973.0, + "step": 13300 + }, + { + "entropy": 2.17663028717041, + "epoch": 0.5933333333333334, + "grad_norm": 7.125, + "learning_rate": 8.385796105383734e-06, + "loss": 0.8627, + "mean_token_accuracy": 0.7431224703788757, + "num_tokens": 47319629.0, + "step": 13350 + }, + { + "entropy": 2.1186418867111207, + "epoch": 0.5955555555555555, + "grad_norm": 8.0625, + "learning_rate": 8.339977090492555e-06, + "loss": 0.8079, + "mean_token_accuracy": 0.7619412040710449, + "num_tokens": 47387168.0, + "step": 13400 + }, + { + "entropy": 2.1040466976165773, + "epoch": 0.5977777777777777, + "grad_norm": 8.0625, + "learning_rate": 8.294158075601375e-06, + "loss": 0.8375, + "mean_token_accuracy": 0.7491306090354919, + "num_tokens": 47458980.0, + "step": 13450 + }, + { + "entropy": 2.1847296571731567, + "epoch": 0.6, + "grad_norm": 7.75, + "learning_rate": 8.248339060710196e-06, + "loss": 0.8656, + "mean_token_accuracy": 0.7442897510528564, + "num_tokens": 47527776.0, + "step": 13500 + }, + { + "entropy": 2.1727059721946715, + "epoch": 0.6022222222222222, + "grad_norm": 8.875, + "learning_rate": 8.202520045819015e-06, + "loss": 0.8416, + "mean_token_accuracy": 0.7498923110961914, + "num_tokens": 47592067.0, + "step": 13550 + }, + { + "entropy": 2.146069450378418, + "epoch": 0.6044444444444445, + "grad_norm": 8.1875, + "learning_rate": 8.156701030927836e-06, + "loss": 0.8304, + "mean_token_accuracy": 0.7521639728546142, + "num_tokens": 47660520.0, + "step": 13600 + }, + { + "entropy": 2.1906869626045227, + "epoch": 0.6066666666666667, + "grad_norm": 8.0625, + "learning_rate": 8.110882016036656e-06, + "loss": 0.8126, + "mean_token_accuracy": 0.7562307071685791, + "num_tokens": 47726291.0, + "step": 13650 + }, + { + "entropy": 2.256641490459442, + "epoch": 0.6088888888888889, + "grad_norm": 8.6875, + "learning_rate": 8.065063001145475e-06, + "loss": 0.8229, + "mean_token_accuracy": 0.7553273499011993, + "num_tokens": 47789796.0, + "step": 13700 + }, + { + "entropy": 2.15241749048233, + "epoch": 0.6111111111111112, + "grad_norm": 7.625, + "learning_rate": 8.019243986254297e-06, + "loss": 0.7972, + "mean_token_accuracy": 0.7611013460159302, + "num_tokens": 47854791.0, + "step": 13750 + }, + { + "entropy": 2.1587498426437377, + "epoch": 0.6133333333333333, + "grad_norm": 10.1875, + "learning_rate": 7.973424971363116e-06, + "loss": 0.8219, + "mean_token_accuracy": 0.7551308906078339, + "num_tokens": 47922964.0, + "step": 13800 + }, + { + "entropy": 2.141474585533142, + "epoch": 0.6155555555555555, + "grad_norm": 8.875, + "learning_rate": 7.927605956471937e-06, + "loss": 0.8584, + "mean_token_accuracy": 0.7452862620353699, + "num_tokens": 47989985.0, + "step": 13850 + }, + { + "entropy": 2.116554036140442, + "epoch": 0.6177777777777778, + "grad_norm": 7.0625, + "learning_rate": 7.881786941580757e-06, + "loss": 0.7575, + "mean_token_accuracy": 0.7697462677955628, + "num_tokens": 48055485.0, + "step": 13900 + }, + { + "entropy": 2.145723969936371, + "epoch": 0.62, + "grad_norm": 7.21875, + "learning_rate": 7.835967926689578e-06, + "loss": 0.8424, + "mean_token_accuracy": 0.7527009093761444, + "num_tokens": 48123791.0, + "step": 13950 + }, + { + "entropy": 2.228086581230164, + "epoch": 0.6222222222222222, + "grad_norm": 16.25, + "learning_rate": 7.790148911798397e-06, + "loss": 0.8609, + "mean_token_accuracy": 0.7481016480922699, + "num_tokens": 48188322.0, + "step": 14000 + }, + { + "entropy": 2.1463445258140563, + "epoch": 0.6244444444444445, + "grad_norm": 8.0, + "learning_rate": 7.744329896907217e-06, + "loss": 0.7909, + "mean_token_accuracy": 0.7623829674720765, + "num_tokens": 48256913.0, + "step": 14050 + }, + { + "entropy": 2.1033449459075926, + "epoch": 0.6266666666666667, + "grad_norm": 6.5, + "learning_rate": 7.698510882016036e-06, + "loss": 0.8107, + "mean_token_accuracy": 0.7558232533931732, + "num_tokens": 48322383.0, + "step": 14100 + }, + { + "entropy": 2.1505328583717347, + "epoch": 0.6288888888888889, + "grad_norm": 8.0, + "learning_rate": 7.652691867124858e-06, + "loss": 0.8591, + "mean_token_accuracy": 0.7444049680233001, + "num_tokens": 48390728.0, + "step": 14150 + }, + { + "entropy": 2.1623160433769226, + "epoch": 0.6311111111111111, + "grad_norm": 7.875, + "learning_rate": 7.606872852233678e-06, + "loss": 0.783, + "mean_token_accuracy": 0.7643996751308442, + "num_tokens": 48457254.0, + "step": 14200 + }, + { + "entropy": 2.1291307163238526, + "epoch": 0.6333333333333333, + "grad_norm": 9.125, + "learning_rate": 7.561053837342498e-06, + "loss": 0.8297, + "mean_token_accuracy": 0.755406551361084, + "num_tokens": 48526165.0, + "step": 14250 + }, + { + "entropy": 2.0992329573631285, + "epoch": 0.6355555555555555, + "grad_norm": 9.5625, + "learning_rate": 7.515234822451319e-06, + "loss": 0.8357, + "mean_token_accuracy": 0.7475246119499207, + "num_tokens": 48594409.0, + "step": 14300 + }, + { + "entropy": 2.096374764442444, + "epoch": 0.6377777777777778, + "grad_norm": 7.5, + "learning_rate": 7.469415807560139e-06, + "loss": 0.7929, + "mean_token_accuracy": 0.7609642744064331, + "num_tokens": 48662016.0, + "step": 14350 + }, + { + "entropy": 2.1556865262985228, + "epoch": 0.64, + "grad_norm": 7.96875, + "learning_rate": 7.4235967926689576e-06, + "loss": 0.8432, + "mean_token_accuracy": 0.7489691793918609, + "num_tokens": 48727783.0, + "step": 14400 + }, + { + "entropy": 2.2297734808921814, + "epoch": 0.6422222222222222, + "grad_norm": 7.25, + "learning_rate": 7.377777777777778e-06, + "loss": 0.845, + "mean_token_accuracy": 0.7471547079086304, + "num_tokens": 48795872.0, + "step": 14450 + }, + { + "entropy": 2.093806471824646, + "epoch": 0.6444444444444445, + "grad_norm": 9.875, + "learning_rate": 7.331958762886598e-06, + "loss": 0.7843, + "mean_token_accuracy": 0.764157919883728, + "num_tokens": 48864940.0, + "step": 14500 + }, + { + "entropy": 2.1970014452934263, + "epoch": 0.6466666666666666, + "grad_norm": 6.96875, + "learning_rate": 7.286139747995419e-06, + "loss": 0.8726, + "mean_token_accuracy": 0.7447559666633606, + "num_tokens": 48932155.0, + "step": 14550 + }, + { + "entropy": 2.173217701911926, + "epoch": 0.6488888888888888, + "grad_norm": 7.0, + "learning_rate": 7.240320733104239e-06, + "loss": 0.8209, + "mean_token_accuracy": 0.7533725011348724, + "num_tokens": 48996865.0, + "step": 14600 + }, + { + "entropy": 2.1182384943962096, + "epoch": 0.6511111111111111, + "grad_norm": 7.25, + "learning_rate": 7.194501718213059e-06, + "loss": 0.8433, + "mean_token_accuracy": 0.743918879032135, + "num_tokens": 49062177.0, + "step": 14650 + }, + { + "entropy": 2.182834794521332, + "epoch": 0.6533333333333333, + "grad_norm": 15.5, + "learning_rate": 7.148682703321879e-06, + "loss": 0.8163, + "mean_token_accuracy": 0.7540112996101379, + "num_tokens": 49126666.0, + "step": 14700 + }, + { + "entropy": 2.1658547282218934, + "epoch": 0.6555555555555556, + "grad_norm": 7.0625, + "learning_rate": 7.102863688430699e-06, + "loss": 0.8318, + "mean_token_accuracy": 0.7548858106136322, + "num_tokens": 49193802.0, + "step": 14750 + }, + { + "entropy": 2.22178261756897, + "epoch": 0.6577777777777778, + "grad_norm": 7.34375, + "learning_rate": 7.0570446735395194e-06, + "loss": 0.8171, + "mean_token_accuracy": 0.755473929643631, + "num_tokens": 49256927.0, + "step": 14800 + }, + { + "entropy": 2.1972863483428955, + "epoch": 0.66, + "grad_norm": 17.25, + "learning_rate": 7.01122565864834e-06, + "loss": 0.868, + "mean_token_accuracy": 0.7451527345180512, + "num_tokens": 49324296.0, + "step": 14850 + }, + { + "entropy": 2.13280175447464, + "epoch": 0.6622222222222223, + "grad_norm": 7.28125, + "learning_rate": 6.96540664375716e-06, + "loss": 0.8565, + "mean_token_accuracy": 0.7481217682361603, + "num_tokens": 49393359.0, + "step": 14900 + }, + { + "entropy": 2.223068025112152, + "epoch": 0.6644444444444444, + "grad_norm": 9.9375, + "learning_rate": 6.9195876288659804e-06, + "loss": 0.8344, + "mean_token_accuracy": 0.7528722262382508, + "num_tokens": 49455849.0, + "step": 14950 + }, + { + "entropy": 2.1984812426567077, + "epoch": 0.6666666666666666, + "grad_norm": 7.875, + "learning_rate": 6.873768613974801e-06, + "loss": 0.8534, + "mean_token_accuracy": 0.7472201347351074, + "num_tokens": 49521825.0, + "step": 15000 + }, + { + "epoch": 0.6666666666666666, + "eval_entropy": 2.0471889674663544, + "eval_loss": 0.8776370286941528, + "eval_mean_token_accuracy": 0.7463443726301193, + "eval_num_tokens": 49521825.0, + "eval_runtime": 5.8695, + "eval_samples_per_second": 2.215, + "eval_steps_per_second": 0.681, + "step": 15000 + }, + { + "entropy": 2.1753278255462645, + "epoch": 0.6688888888888889, + "grad_norm": 8.6875, + "learning_rate": 6.8279495990836194e-06, + "loss": 0.8559, + "mean_token_accuracy": 0.7446865785121918, + "num_tokens": 49586773.0, + "step": 15050 + }, + { + "entropy": 2.137886700630188, + "epoch": 0.6711111111111111, + "grad_norm": 8.6875, + "learning_rate": 6.78213058419244e-06, + "loss": 0.8128, + "mean_token_accuracy": 0.757747368812561, + "num_tokens": 49654785.0, + "step": 15100 + }, + { + "entropy": 2.1542887043952943, + "epoch": 0.6733333333333333, + "grad_norm": 9.5625, + "learning_rate": 6.73631156930126e-06, + "loss": 0.8498, + "mean_token_accuracy": 0.7489660286903381, + "num_tokens": 49720666.0, + "step": 15150 + }, + { + "entropy": 2.1714869332313538, + "epoch": 0.6755555555555556, + "grad_norm": 8.25, + "learning_rate": 6.6904925544100804e-06, + "loss": 0.8183, + "mean_token_accuracy": 0.7538860237598419, + "num_tokens": 49790606.0, + "step": 15200 + }, + { + "entropy": 2.1753248810768127, + "epoch": 0.6777777777777778, + "grad_norm": 7.96875, + "learning_rate": 6.644673539518901e-06, + "loss": 0.8017, + "mean_token_accuracy": 0.7590576016902923, + "num_tokens": 49855120.0, + "step": 15250 + }, + { + "entropy": 2.1427893018722535, + "epoch": 0.68, + "grad_norm": 10.25, + "learning_rate": 6.598854524627721e-06, + "loss": 0.8308, + "mean_token_accuracy": 0.7554869890213013, + "num_tokens": 49923395.0, + "step": 15300 + }, + { + "entropy": 2.0930906391143798, + "epoch": 0.6822222222222222, + "grad_norm": 8.1875, + "learning_rate": 6.5530355097365415e-06, + "loss": 0.8226, + "mean_token_accuracy": 0.7525417697429657, + "num_tokens": 49988632.0, + "step": 15350 + }, + { + "entropy": 2.183457748889923, + "epoch": 0.6844444444444444, + "grad_norm": 7.34375, + "learning_rate": 6.507216494845361e-06, + "loss": 0.8382, + "mean_token_accuracy": 0.7489526784420013, + "num_tokens": 50055473.0, + "step": 15400 + }, + { + "entropy": 2.1905418968200685, + "epoch": 0.6866666666666666, + "grad_norm": 8.25, + "learning_rate": 6.461397479954181e-06, + "loss": 0.8143, + "mean_token_accuracy": 0.7581993770599366, + "num_tokens": 50122557.0, + "step": 15450 + }, + { + "entropy": 2.1051335525512695, + "epoch": 0.6888888888888889, + "grad_norm": 8.25, + "learning_rate": 6.415578465063002e-06, + "loss": 0.8459, + "mean_token_accuracy": 0.7460039758682251, + "num_tokens": 50190091.0, + "step": 15500 + }, + { + "entropy": 2.191064128875732, + "epoch": 0.6911111111111111, + "grad_norm": 6.96875, + "learning_rate": 6.369759450171822e-06, + "loss": 0.8458, + "mean_token_accuracy": 0.7485956978797913, + "num_tokens": 50255412.0, + "step": 15550 + }, + { + "entropy": 2.1825055122375487, + "epoch": 0.6933333333333334, + "grad_norm": 8.25, + "learning_rate": 6.323940435280642e-06, + "loss": 0.8686, + "mean_token_accuracy": 0.7439832353591919, + "num_tokens": 50323487.0, + "step": 15600 + }, + { + "entropy": 2.2044737410545348, + "epoch": 0.6955555555555556, + "grad_norm": 6.71875, + "learning_rate": 6.278121420389463e-06, + "loss": 0.8422, + "mean_token_accuracy": 0.7480260360240937, + "num_tokens": 50389804.0, + "step": 15650 + }, + { + "entropy": 2.194586501121521, + "epoch": 0.6977777777777778, + "grad_norm": 7.65625, + "learning_rate": 6.232302405498283e-06, + "loss": 0.8351, + "mean_token_accuracy": 0.7537850439548492, + "num_tokens": 50452603.0, + "step": 15700 + }, + { + "entropy": 2.1760614275932313, + "epoch": 0.7, + "grad_norm": 7.90625, + "learning_rate": 6.186483390607102e-06, + "loss": 0.8462, + "mean_token_accuracy": 0.7481569695472717, + "num_tokens": 50522865.0, + "step": 15750 + }, + { + "entropy": 2.1524909257888796, + "epoch": 0.7022222222222222, + "grad_norm": 15.375, + "learning_rate": 6.140664375715922e-06, + "loss": 0.8255, + "mean_token_accuracy": 0.7523048520088196, + "num_tokens": 50592440.0, + "step": 15800 + }, + { + "entropy": 2.160615997314453, + "epoch": 0.7044444444444444, + "grad_norm": 8.1875, + "learning_rate": 6.094845360824742e-06, + "loss": 0.8195, + "mean_token_accuracy": 0.7542084169387817, + "num_tokens": 50661169.0, + "step": 15850 + }, + { + "entropy": 2.1588108134269715, + "epoch": 0.7066666666666667, + "grad_norm": 7.28125, + "learning_rate": 6.049026345933563e-06, + "loss": 0.8343, + "mean_token_accuracy": 0.7516414785385132, + "num_tokens": 50730313.0, + "step": 15900 + }, + { + "entropy": 2.2916067910194395, + "epoch": 0.7088888888888889, + "grad_norm": 8.9375, + "learning_rate": 6.003207331042383e-06, + "loss": 0.8779, + "mean_token_accuracy": 0.7434639298915863, + "num_tokens": 50793896.0, + "step": 15950 + }, + { + "entropy": 2.165027015209198, + "epoch": 0.7111111111111111, + "grad_norm": 9.75, + "learning_rate": 5.957388316151203e-06, + "loss": 0.8546, + "mean_token_accuracy": 0.7454061865806579, + "num_tokens": 50861288.0, + "step": 16000 + }, + { + "entropy": 2.0912844824790953, + "epoch": 0.7133333333333334, + "grad_norm": 8.375, + "learning_rate": 5.911569301260024e-06, + "loss": 0.7921, + "mean_token_accuracy": 0.7610292685031891, + "num_tokens": 50928175.0, + "step": 16050 + }, + { + "entropy": 2.193835806846619, + "epoch": 0.7155555555555555, + "grad_norm": 9.375, + "learning_rate": 5.865750286368843e-06, + "loss": 0.8429, + "mean_token_accuracy": 0.748592312335968, + "num_tokens": 50991843.0, + "step": 16100 + }, + { + "entropy": 2.150286679267883, + "epoch": 0.7177777777777777, + "grad_norm": 8.0, + "learning_rate": 5.8199312714776635e-06, + "loss": 0.8731, + "mean_token_accuracy": 0.7444507765769959, + "num_tokens": 51057738.0, + "step": 16150 + }, + { + "entropy": 2.2080190443992613, + "epoch": 0.72, + "grad_norm": 6.59375, + "learning_rate": 5.774112256586484e-06, + "loss": 0.8394, + "mean_token_accuracy": 0.7506636822223663, + "num_tokens": 51122791.0, + "step": 16200 + }, + { + "entropy": 2.1831669211387634, + "epoch": 0.7222222222222222, + "grad_norm": 6.78125, + "learning_rate": 5.728293241695304e-06, + "loss": 0.8655, + "mean_token_accuracy": 0.7454936730861664, + "num_tokens": 51189529.0, + "step": 16250 + }, + { + "entropy": 2.187736349105835, + "epoch": 0.7244444444444444, + "grad_norm": 7.15625, + "learning_rate": 5.6824742268041245e-06, + "loss": 0.8503, + "mean_token_accuracy": 0.7467346620559693, + "num_tokens": 51255571.0, + "step": 16300 + }, + { + "entropy": 2.228825159072876, + "epoch": 0.7266666666666667, + "grad_norm": 7.09375, + "learning_rate": 5.636655211912945e-06, + "loss": 0.8646, + "mean_token_accuracy": 0.7439834475517273, + "num_tokens": 51321902.0, + "step": 16350 + }, + { + "entropy": 2.1766190052032472, + "epoch": 0.7288888888888889, + "grad_norm": 8.0, + "learning_rate": 5.590836197021764e-06, + "loss": 0.8582, + "mean_token_accuracy": 0.7422134637832641, + "num_tokens": 51387236.0, + "step": 16400 + }, + { + "entropy": 2.1550874876976014, + "epoch": 0.7311111111111112, + "grad_norm": 6.84375, + "learning_rate": 5.545017182130585e-06, + "loss": 0.803, + "mean_token_accuracy": 0.7592871415615082, + "num_tokens": 51454340.0, + "step": 16450 + }, + { + "entropy": 2.1886205792427065, + "epoch": 0.7333333333333333, + "grad_norm": 8.5625, + "learning_rate": 5.499198167239405e-06, + "loss": 0.8312, + "mean_token_accuracy": 0.7522400307655335, + "num_tokens": 51521289.0, + "step": 16500 + }, + { + "entropy": 2.1630620193481445, + "epoch": 0.7355555555555555, + "grad_norm": 11.125, + "learning_rate": 5.4533791523482245e-06, + "loss": 0.8429, + "mean_token_accuracy": 0.7494550979137421, + "num_tokens": 51590453.0, + "step": 16550 + }, + { + "entropy": 2.2003714513778685, + "epoch": 0.7377777777777778, + "grad_norm": 7.90625, + "learning_rate": 5.407560137457045e-06, + "loss": 0.8594, + "mean_token_accuracy": 0.7431814324855804, + "num_tokens": 51659077.0, + "step": 16600 + }, + { + "entropy": 2.159868106842041, + "epoch": 0.74, + "grad_norm": 7.09375, + "learning_rate": 5.361741122565865e-06, + "loss": 0.8511, + "mean_token_accuracy": 0.7491097986698151, + "num_tokens": 51724327.0, + "step": 16650 + }, + { + "entropy": 2.1559503626823426, + "epoch": 0.7422222222222222, + "grad_norm": 7.03125, + "learning_rate": 5.3159221076746855e-06, + "loss": 0.866, + "mean_token_accuracy": 0.7423543095588684, + "num_tokens": 51793571.0, + "step": 16700 + }, + { + "entropy": 2.1431314539909363, + "epoch": 0.7444444444444445, + "grad_norm": 6.96875, + "learning_rate": 5.270103092783505e-06, + "loss": 0.8175, + "mean_token_accuracy": 0.7550452971458435, + "num_tokens": 51862535.0, + "step": 16750 + }, + { + "entropy": 2.1364932513237, + "epoch": 0.7466666666666667, + "grad_norm": 9.5, + "learning_rate": 5.224284077892325e-06, + "loss": 0.8622, + "mean_token_accuracy": 0.7470258843898773, + "num_tokens": 51930936.0, + "step": 16800 + }, + { + "entropy": 2.213224956989288, + "epoch": 0.7488888888888889, + "grad_norm": 7.0, + "learning_rate": 5.178465063001146e-06, + "loss": 0.8664, + "mean_token_accuracy": 0.7476742577552795, + "num_tokens": 51996312.0, + "step": 16850 + }, + { + "entropy": 2.3157504415512085, + "epoch": 0.7511111111111111, + "grad_norm": 7.53125, + "learning_rate": 5.132646048109966e-06, + "loss": 0.821, + "mean_token_accuracy": 0.7552366864681244, + "num_tokens": 52061146.0, + "step": 16900 + }, + { + "entropy": 2.258919379711151, + "epoch": 0.7533333333333333, + "grad_norm": 7.1875, + "learning_rate": 5.086827033218786e-06, + "loss": 0.8122, + "mean_token_accuracy": 0.7566414999961854, + "num_tokens": 52126822.0, + "step": 16950 + }, + { + "entropy": 2.2003828430175782, + "epoch": 0.7555555555555555, + "grad_norm": 8.6875, + "learning_rate": 5.041008018327607e-06, + "loss": 0.8321, + "mean_token_accuracy": 0.7529778003692627, + "num_tokens": 52192915.0, + "step": 17000 + }, + { + "entropy": 2.2184296226501465, + "epoch": 0.7577777777777778, + "grad_norm": 9.1875, + "learning_rate": 4.995189003436426e-06, + "loss": 0.8577, + "mean_token_accuracy": 0.7449932956695556, + "num_tokens": 52261692.0, + "step": 17050 + }, + { + "entropy": 2.171595447063446, + "epoch": 0.76, + "grad_norm": 9.125, + "learning_rate": 4.9493699885452465e-06, + "loss": 0.8545, + "mean_token_accuracy": 0.7480556511878967, + "num_tokens": 52329194.0, + "step": 17100 + }, + { + "entropy": 2.2223637771606444, + "epoch": 0.7622222222222222, + "grad_norm": 7.375, + "learning_rate": 4.903550973654067e-06, + "loss": 0.8367, + "mean_token_accuracy": 0.7503479218482971, + "num_tokens": 52395383.0, + "step": 17150 + }, + { + "entropy": 2.2046579623222353, + "epoch": 0.7644444444444445, + "grad_norm": 7.53125, + "learning_rate": 4.857731958762887e-06, + "loss": 0.8635, + "mean_token_accuracy": 0.7457800447940827, + "num_tokens": 52463232.0, + "step": 17200 + }, + { + "entropy": 2.173711452484131, + "epoch": 0.7666666666666667, + "grad_norm": 6.625, + "learning_rate": 4.8119129438717075e-06, + "loss": 0.8209, + "mean_token_accuracy": 0.7542583394050598, + "num_tokens": 52531656.0, + "step": 17250 + }, + { + "entropy": 2.2189766001701354, + "epoch": 0.7688888888888888, + "grad_norm": 10.6875, + "learning_rate": 4.766093928980528e-06, + "loss": 0.8685, + "mean_token_accuracy": 0.7441653323173523, + "num_tokens": 52595389.0, + "step": 17300 + }, + { + "entropy": 2.1452325272560118, + "epoch": 0.7711111111111111, + "grad_norm": 9.625, + "learning_rate": 4.720274914089347e-06, + "loss": 0.8348, + "mean_token_accuracy": 0.7506163120269775, + "num_tokens": 52663028.0, + "step": 17350 + }, + { + "entropy": 2.2454160952568056, + "epoch": 0.7733333333333333, + "grad_norm": 7.75, + "learning_rate": 4.674455899198168e-06, + "loss": 0.8597, + "mean_token_accuracy": 0.7458804631233216, + "num_tokens": 52728656.0, + "step": 17400 + }, + { + "entropy": 2.292165369987488, + "epoch": 0.7755555555555556, + "grad_norm": 6.65625, + "learning_rate": 4.628636884306988e-06, + "loss": 0.8405, + "mean_token_accuracy": 0.7525258159637451, + "num_tokens": 52794230.0, + "step": 17450 + }, + { + "entropy": 2.1589057970047, + "epoch": 0.7777777777777778, + "grad_norm": 7.9375, + "learning_rate": 4.582817869415808e-06, + "loss": 0.8481, + "mean_token_accuracy": 0.7463726258277893, + "num_tokens": 52862165.0, + "step": 17500 + }, + { + "entropy": 2.169076681137085, + "epoch": 0.78, + "grad_norm": 7.78125, + "learning_rate": 4.536998854524628e-06, + "loss": 0.8224, + "mean_token_accuracy": 0.7554120934009552, + "num_tokens": 52930052.0, + "step": 17550 + }, + { + "entropy": 2.21035982131958, + "epoch": 0.7822222222222223, + "grad_norm": 8.625, + "learning_rate": 4.491179839633448e-06, + "loss": 0.7947, + "mean_token_accuracy": 0.7617706823348999, + "num_tokens": 52997539.0, + "step": 17600 + }, + { + "entropy": 2.285029878616333, + "epoch": 0.7844444444444445, + "grad_norm": 6.90625, + "learning_rate": 4.4453608247422685e-06, + "loss": 0.8251, + "mean_token_accuracy": 0.7533777153491974, + "num_tokens": 53062000.0, + "step": 17650 + }, + { + "entropy": 2.169939410686493, + "epoch": 0.7866666666666666, + "grad_norm": 6.875, + "learning_rate": 4.399541809851088e-06, + "loss": 0.8256, + "mean_token_accuracy": 0.7538401031494141, + "num_tokens": 53128912.0, + "step": 17700 + }, + { + "entropy": 2.2041222214698792, + "epoch": 0.7888888888888889, + "grad_norm": 10.375, + "learning_rate": 4.353722794959908e-06, + "loss": 0.7965, + "mean_token_accuracy": 0.7599800097942352, + "num_tokens": 53195055.0, + "step": 17750 + }, + { + "entropy": 2.2029996418952944, + "epoch": 0.7911111111111111, + "grad_norm": 8.25, + "learning_rate": 4.307903780068729e-06, + "loss": 0.8447, + "mean_token_accuracy": 0.748561098575592, + "num_tokens": 53259948.0, + "step": 17800 + }, + { + "entropy": 2.139230773448944, + "epoch": 0.7933333333333333, + "grad_norm": 6.15625, + "learning_rate": 4.262084765177549e-06, + "loss": 0.8634, + "mean_token_accuracy": 0.7443192017078399, + "num_tokens": 53330653.0, + "step": 17850 + }, + { + "entropy": 2.1335135221481325, + "epoch": 0.7955555555555556, + "grad_norm": 7.09375, + "learning_rate": 4.216265750286369e-06, + "loss": 0.8332, + "mean_token_accuracy": 0.7517373490333558, + "num_tokens": 53398724.0, + "step": 17900 + }, + { + "entropy": 2.164382312297821, + "epoch": 0.7977777777777778, + "grad_norm": 8.375, + "learning_rate": 4.17044673539519e-06, + "loss": 0.8511, + "mean_token_accuracy": 0.7499736166000366, + "num_tokens": 53466863.0, + "step": 17950 + }, + { + "entropy": 2.190858428478241, + "epoch": 0.8, + "grad_norm": 6.59375, + "learning_rate": 4.12462772050401e-06, + "loss": 0.8523, + "mean_token_accuracy": 0.7486334586143494, + "num_tokens": 53532021.0, + "step": 18000 + }, + { + "entropy": 2.22938738822937, + "epoch": 0.8022222222222222, + "grad_norm": 8.3125, + "learning_rate": 4.0788087056128295e-06, + "loss": 0.8667, + "mean_token_accuracy": 0.7432076168060303, + "num_tokens": 53600120.0, + "step": 18050 + }, + { + "entropy": 2.233106544017792, + "epoch": 0.8044444444444444, + "grad_norm": 9.375, + "learning_rate": 4.03298969072165e-06, + "loss": 0.8267, + "mean_token_accuracy": 0.7550076794624329, + "num_tokens": 53665356.0, + "step": 18100 + }, + { + "entropy": 2.214742834568024, + "epoch": 0.8066666666666666, + "grad_norm": 10.375, + "learning_rate": 3.98717067583047e-06, + "loss": 0.8315, + "mean_token_accuracy": 0.752135591506958, + "num_tokens": 53732051.0, + "step": 18150 + }, + { + "entropy": 2.1795053052902222, + "epoch": 0.8088888888888889, + "grad_norm": 8.0625, + "learning_rate": 3.94135166093929e-06, + "loss": 0.8391, + "mean_token_accuracy": 0.7515090310573578, + "num_tokens": 53798222.0, + "step": 18200 + }, + { + "entropy": 2.1936265587806703, + "epoch": 0.8111111111111111, + "grad_norm": 7.875, + "learning_rate": 3.89553264604811e-06, + "loss": 0.8642, + "mean_token_accuracy": 0.7441390895843506, + "num_tokens": 53861370.0, + "step": 18250 + }, + { + "entropy": 2.261586802005768, + "epoch": 0.8133333333333334, + "grad_norm": 8.0625, + "learning_rate": 3.84971363115693e-06, + "loss": 0.8219, + "mean_token_accuracy": 0.7518996393680573, + "num_tokens": 53927168.0, + "step": 18300 + }, + { + "entropy": 2.1711619758605956, + "epoch": 0.8155555555555556, + "grad_norm": 7.125, + "learning_rate": 3.8038946162657507e-06, + "loss": 0.854, + "mean_token_accuracy": 0.7472757256031036, + "num_tokens": 53995523.0, + "step": 18350 + }, + { + "entropy": 2.1286602544784547, + "epoch": 0.8177777777777778, + "grad_norm": 10.0625, + "learning_rate": 3.7580756013745706e-06, + "loss": 0.8391, + "mean_token_accuracy": 0.7513591229915619, + "num_tokens": 54064144.0, + "step": 18400 + }, + { + "entropy": 2.227732105255127, + "epoch": 0.82, + "grad_norm": 8.5, + "learning_rate": 3.712256586483391e-06, + "loss": 0.8569, + "mean_token_accuracy": 0.7463644111156463, + "num_tokens": 54130199.0, + "step": 18450 + }, + { + "entropy": 2.242250292301178, + "epoch": 0.8222222222222222, + "grad_norm": 7.4375, + "learning_rate": 3.6664375715922113e-06, + "loss": 0.8458, + "mean_token_accuracy": 0.7489724898338318, + "num_tokens": 54193967.0, + "step": 18500 + }, + { + "entropy": 2.2216247129440307, + "epoch": 0.8244444444444444, + "grad_norm": 9.5, + "learning_rate": 3.620618556701031e-06, + "loss": 0.8631, + "mean_token_accuracy": 0.7474298918247223, + "num_tokens": 54258735.0, + "step": 18550 + }, + { + "entropy": 2.19566166639328, + "epoch": 0.8266666666666667, + "grad_norm": 8.875, + "learning_rate": 3.574799541809851e-06, + "loss": 0.8383, + "mean_token_accuracy": 0.7505941188335419, + "num_tokens": 54325869.0, + "step": 18600 + }, + { + "entropy": 2.2029261493682863, + "epoch": 0.8288888888888889, + "grad_norm": 6.71875, + "learning_rate": 3.5289805269186715e-06, + "loss": 0.8685, + "mean_token_accuracy": 0.7453620088100433, + "num_tokens": 54391495.0, + "step": 18650 + }, + { + "entropy": 2.321768162250519, + "epoch": 0.8311111111111111, + "grad_norm": 8.1875, + "learning_rate": 3.4831615120274914e-06, + "loss": 0.8268, + "mean_token_accuracy": 0.7540508365631103, + "num_tokens": 54457662.0, + "step": 18700 + }, + { + "entropy": 2.204611828327179, + "epoch": 0.8333333333333334, + "grad_norm": 7.03125, + "learning_rate": 3.4373424971363117e-06, + "loss": 0.8534, + "mean_token_accuracy": 0.7491276848316193, + "num_tokens": 54522465.0, + "step": 18750 + }, + { + "entropy": 2.180729539394379, + "epoch": 0.8355555555555556, + "grad_norm": 7.96875, + "learning_rate": 3.391523482245132e-06, + "loss": 0.8335, + "mean_token_accuracy": 0.7529357576370239, + "num_tokens": 54590053.0, + "step": 18800 + }, + { + "entropy": 2.205647897720337, + "epoch": 0.8377777777777777, + "grad_norm": 7.53125, + "learning_rate": 3.3457044673539524e-06, + "loss": 0.8733, + "mean_token_accuracy": 0.7421969878673553, + "num_tokens": 54656959.0, + "step": 18850 + }, + { + "entropy": 2.22350492477417, + "epoch": 0.84, + "grad_norm": 7.96875, + "learning_rate": 3.2998854524627723e-06, + "loss": 0.8657, + "mean_token_accuracy": 0.7474096655845642, + "num_tokens": 54722156.0, + "step": 18900 + }, + { + "entropy": 2.1877153444290163, + "epoch": 0.8422222222222222, + "grad_norm": 6.15625, + "learning_rate": 3.2540664375715927e-06, + "loss": 0.8894, + "mean_token_accuracy": 0.7431522738933564, + "num_tokens": 54789152.0, + "step": 18950 + }, + { + "entropy": 2.225923342704773, + "epoch": 0.8444444444444444, + "grad_norm": 11.125, + "learning_rate": 3.2082474226804126e-06, + "loss": 0.8555, + "mean_token_accuracy": 0.7505220258235932, + "num_tokens": 54858257.0, + "step": 19000 + }, + { + "entropy": 2.1716776871681214, + "epoch": 0.8466666666666667, + "grad_norm": 7.75, + "learning_rate": 3.1624284077892325e-06, + "loss": 0.8272, + "mean_token_accuracy": 0.7512453496456146, + "num_tokens": 54927221.0, + "step": 19050 + }, + { + "entropy": 2.2240468645095826, + "epoch": 0.8488888888888889, + "grad_norm": 12.5, + "learning_rate": 3.116609392898053e-06, + "loss": 0.8363, + "mean_token_accuracy": 0.7510754656791687, + "num_tokens": 54995938.0, + "step": 19100 + }, + { + "entropy": 2.1904780864715576, + "epoch": 0.8511111111111112, + "grad_norm": 9.75, + "learning_rate": 3.070790378006873e-06, + "loss": 0.834, + "mean_token_accuracy": 0.75088552236557, + "num_tokens": 55058163.0, + "step": 19150 + }, + { + "entropy": 2.215365264415741, + "epoch": 0.8533333333333334, + "grad_norm": 8.25, + "learning_rate": 3.0249713631156935e-06, + "loss": 0.8597, + "mean_token_accuracy": 0.7446792745590209, + "num_tokens": 55124026.0, + "step": 19200 + }, + { + "entropy": 2.202007007598877, + "epoch": 0.8555555555555555, + "grad_norm": 9.0625, + "learning_rate": 2.9791523482245134e-06, + "loss": 0.8144, + "mean_token_accuracy": 0.7558258211612702, + "num_tokens": 55192976.0, + "step": 19250 + }, + { + "entropy": 2.2235484766960143, + "epoch": 0.8577777777777778, + "grad_norm": 8.3125, + "learning_rate": 2.9333333333333338e-06, + "loss": 0.8662, + "mean_token_accuracy": 0.7427694439888001, + "num_tokens": 55260374.0, + "step": 19300 + }, + { + "entropy": 2.256188449859619, + "epoch": 0.86, + "grad_norm": 8.5625, + "learning_rate": 2.887514318442154e-06, + "loss": 0.8496, + "mean_token_accuracy": 0.7467195224761963, + "num_tokens": 55326060.0, + "step": 19350 + }, + { + "entropy": 2.2128705596923828, + "epoch": 0.8622222222222222, + "grad_norm": 11.75, + "learning_rate": 2.8416953035509736e-06, + "loss": 0.8645, + "mean_token_accuracy": 0.7435001564025879, + "num_tokens": 55391552.0, + "step": 19400 + }, + { + "entropy": 2.1930198001861574, + "epoch": 0.8644444444444445, + "grad_norm": 8.125, + "learning_rate": 2.795876288659794e-06, + "loss": 0.8781, + "mean_token_accuracy": 0.7410417079925538, + "num_tokens": 55461963.0, + "step": 19450 + }, + { + "entropy": 2.1576480197906496, + "epoch": 0.8666666666666667, + "grad_norm": 7.75, + "learning_rate": 2.7500572737686143e-06, + "loss": 0.832, + "mean_token_accuracy": 0.7539917016029358, + "num_tokens": 55529650.0, + "step": 19500 + }, + { + "entropy": 2.1880473136901855, + "epoch": 0.8688888888888889, + "grad_norm": 7.75, + "learning_rate": 2.7042382588774346e-06, + "loss": 0.868, + "mean_token_accuracy": 0.7416273355484009, + "num_tokens": 55596036.0, + "step": 19550 + }, + { + "entropy": 2.2236318159103394, + "epoch": 0.8711111111111111, + "grad_norm": 8.1875, + "learning_rate": 2.6584192439862545e-06, + "loss": 0.8535, + "mean_token_accuracy": 0.7471865510940552, + "num_tokens": 55662244.0, + "step": 19600 + }, + { + "entropy": 2.2074507117271422, + "epoch": 0.8733333333333333, + "grad_norm": 8.125, + "learning_rate": 2.612600229095075e-06, + "loss": 0.857, + "mean_token_accuracy": 0.746492406129837, + "num_tokens": 55730056.0, + "step": 19650 + }, + { + "entropy": 2.175764639377594, + "epoch": 0.8755555555555555, + "grad_norm": 7.875, + "learning_rate": 2.566781214203895e-06, + "loss": 0.8402, + "mean_token_accuracy": 0.7526404368877411, + "num_tokens": 55796368.0, + "step": 19700 + }, + { + "entropy": 2.2133652138710023, + "epoch": 0.8777777777777778, + "grad_norm": 7.40625, + "learning_rate": 2.5209621993127147e-06, + "loss": 0.8401, + "mean_token_accuracy": 0.7528079390525818, + "num_tokens": 55862412.0, + "step": 19750 + }, + { + "entropy": 2.16952513217926, + "epoch": 0.88, + "grad_norm": 11.125, + "learning_rate": 2.475143184421535e-06, + "loss": 0.8162, + "mean_token_accuracy": 0.7567255461215973, + "num_tokens": 55929780.0, + "step": 19800 + }, + { + "entropy": 2.2304827785491943, + "epoch": 0.8822222222222222, + "grad_norm": 9.125, + "learning_rate": 2.4293241695303554e-06, + "loss": 0.8747, + "mean_token_accuracy": 0.7438322114944458, + "num_tokens": 55997356.0, + "step": 19850 + }, + { + "entropy": 2.1662241196632386, + "epoch": 0.8844444444444445, + "grad_norm": 7.71875, + "learning_rate": 2.3835051546391753e-06, + "loss": 0.8738, + "mean_token_accuracy": 0.7397397947311402, + "num_tokens": 56066573.0, + "step": 19900 + }, + { + "entropy": 2.1665696811676027, + "epoch": 0.8866666666666667, + "grad_norm": 7.15625, + "learning_rate": 2.3376861397479956e-06, + "loss": 0.8558, + "mean_token_accuracy": 0.7482295572757721, + "num_tokens": 56136956.0, + "step": 19950 + }, + { + "entropy": 2.2371082639694215, + "epoch": 0.8888888888888888, + "grad_norm": 8.75, + "learning_rate": 2.291867124856816e-06, + "loss": 0.8136, + "mean_token_accuracy": 0.7589144480228424, + "num_tokens": 56202618.0, + "step": 20000 + }, + { + "epoch": 0.8888888888888888, + "eval_entropy": 2.108439266681671, + "eval_loss": 0.8734950423240662, + "eval_mean_token_accuracy": 0.7479927390813828, + "eval_num_tokens": 56202618.0, + "eval_runtime": 5.499, + "eval_samples_per_second": 2.364, + "eval_steps_per_second": 0.727, + "step": 20000 + }, + { + "entropy": 2.2510220575332642, + "epoch": 0.8911111111111111, + "grad_norm": 9.5, + "learning_rate": 2.246048109965636e-06, + "loss": 0.8871, + "mean_token_accuracy": 0.7370022451877594, + "num_tokens": 56266792.0, + "step": 20050 + }, + { + "entropy": 2.1528950190544127, + "epoch": 0.8933333333333333, + "grad_norm": 9.75, + "learning_rate": 2.200229095074456e-06, + "loss": 0.8586, + "mean_token_accuracy": 0.7451235044002533, + "num_tokens": 56334124.0, + "step": 20100 + }, + { + "entropy": 2.1923975205421447, + "epoch": 0.8955555555555555, + "grad_norm": 6.71875, + "learning_rate": 2.154410080183276e-06, + "loss": 0.874, + "mean_token_accuracy": 0.7444364356994629, + "num_tokens": 56401545.0, + "step": 20150 + }, + { + "entropy": 2.222883083820343, + "epoch": 0.8977777777777778, + "grad_norm": 9.1875, + "learning_rate": 2.1085910652920965e-06, + "loss": 0.8376, + "mean_token_accuracy": 0.7498513388633729, + "num_tokens": 56471261.0, + "step": 20200 + }, + { + "entropy": 2.228672001361847, + "epoch": 0.9, + "grad_norm": 7.21875, + "learning_rate": 2.062772050400917e-06, + "loss": 0.8231, + "mean_token_accuracy": 0.755576502084732, + "num_tokens": 56535034.0, + "step": 20250 + }, + { + "entropy": 2.196291310787201, + "epoch": 0.9022222222222223, + "grad_norm": 7.21875, + "learning_rate": 2.0169530355097367e-06, + "loss": 0.8711, + "mean_token_accuracy": 0.7452339708805085, + "num_tokens": 56605038.0, + "step": 20300 + }, + { + "entropy": 2.206197905540466, + "epoch": 0.9044444444444445, + "grad_norm": 9.0, + "learning_rate": 1.971134020618557e-06, + "loss": 0.8707, + "mean_token_accuracy": 0.7420957183837891, + "num_tokens": 56670283.0, + "step": 20350 + }, + { + "entropy": 2.1762799286842345, + "epoch": 0.9066666666666666, + "grad_norm": 11.0, + "learning_rate": 1.925315005727377e-06, + "loss": 0.8375, + "mean_token_accuracy": 0.751966392993927, + "num_tokens": 56735920.0, + "step": 20400 + }, + { + "entropy": 2.218523108959198, + "epoch": 0.9088888888888889, + "grad_norm": 9.25, + "learning_rate": 1.879495990836197e-06, + "loss": 0.8468, + "mean_token_accuracy": 0.7500053834915161, + "num_tokens": 56801907.0, + "step": 20450 + }, + { + "entropy": 2.289003756046295, + "epoch": 0.9111111111111111, + "grad_norm": 11.75, + "learning_rate": 1.8336769759450174e-06, + "loss": 0.8643, + "mean_token_accuracy": 0.7460965967178345, + "num_tokens": 56867853.0, + "step": 20500 + }, + { + "entropy": 2.1768972492218017, + "epoch": 0.9133333333333333, + "grad_norm": 9.1875, + "learning_rate": 1.7878579610538373e-06, + "loss": 0.8613, + "mean_token_accuracy": 0.7483590936660767, + "num_tokens": 56936042.0, + "step": 20550 + }, + { + "entropy": 2.211531710624695, + "epoch": 0.9155555555555556, + "grad_norm": 12.125, + "learning_rate": 1.7420389461626577e-06, + "loss": 0.8518, + "mean_token_accuracy": 0.7475169038772583, + "num_tokens": 57003712.0, + "step": 20600 + }, + { + "entropy": 2.2285032725334166, + "epoch": 0.9177777777777778, + "grad_norm": 9.0, + "learning_rate": 1.6962199312714778e-06, + "loss": 0.8805, + "mean_token_accuracy": 0.7423696160316468, + "num_tokens": 57071218.0, + "step": 20650 + }, + { + "entropy": 2.2252227783203127, + "epoch": 0.92, + "grad_norm": 13.1875, + "learning_rate": 1.6504009163802981e-06, + "loss": 0.8401, + "mean_token_accuracy": 0.751958976984024, + "num_tokens": 57136636.0, + "step": 20700 + }, + { + "entropy": 2.220773038864136, + "epoch": 0.9222222222222223, + "grad_norm": 6.9375, + "learning_rate": 1.604581901489118e-06, + "loss": 0.8293, + "mean_token_accuracy": 0.7526602661609649, + "num_tokens": 57201653.0, + "step": 20750 + }, + { + "entropy": 2.2865464878082276, + "epoch": 0.9244444444444444, + "grad_norm": 7.3125, + "learning_rate": 1.5587628865979382e-06, + "loss": 0.8497, + "mean_token_accuracy": 0.7494020068645477, + "num_tokens": 57266565.0, + "step": 20800 + }, + { + "entropy": 2.2102810382843017, + "epoch": 0.9266666666666666, + "grad_norm": 6.8125, + "learning_rate": 1.5129438717067585e-06, + "loss": 0.8608, + "mean_token_accuracy": 0.748868852853775, + "num_tokens": 57334744.0, + "step": 20850 + }, + { + "entropy": 2.1809313702583313, + "epoch": 0.9288888888888889, + "grad_norm": 15.625, + "learning_rate": 1.4671248568155784e-06, + "loss": 0.8679, + "mean_token_accuracy": 0.7432131195068359, + "num_tokens": 57403405.0, + "step": 20900 + }, + { + "entropy": 2.2084882354736326, + "epoch": 0.9311111111111111, + "grad_norm": 6.90625, + "learning_rate": 1.4213058419243988e-06, + "loss": 0.8544, + "mean_token_accuracy": 0.7461783969402314, + "num_tokens": 57470110.0, + "step": 20950 + }, + { + "entropy": 2.143686933517456, + "epoch": 0.9333333333333333, + "grad_norm": 7.375, + "learning_rate": 1.375486827033219e-06, + "loss": 0.8695, + "mean_token_accuracy": 0.7416430413722992, + "num_tokens": 57537171.0, + "step": 21000 + }, + { + "entropy": 2.210377869606018, + "epoch": 0.9355555555555556, + "grad_norm": 6.75, + "learning_rate": 1.329667812142039e-06, + "loss": 0.8413, + "mean_token_accuracy": 0.7495397543907165, + "num_tokens": 57602216.0, + "step": 21050 + }, + { + "entropy": 2.186688220500946, + "epoch": 0.9377777777777778, + "grad_norm": 6.96875, + "learning_rate": 1.2838487972508592e-06, + "loss": 0.8411, + "mean_token_accuracy": 0.7513405895233154, + "num_tokens": 57669645.0, + "step": 21100 + }, + { + "entropy": 2.1919109773635865, + "epoch": 0.94, + "grad_norm": 6.96875, + "learning_rate": 1.2380297823596793e-06, + "loss": 0.8498, + "mean_token_accuracy": 0.7503270518779754, + "num_tokens": 57738922.0, + "step": 21150 + }, + { + "entropy": 2.2736938905715944, + "epoch": 0.9422222222222222, + "grad_norm": 9.0625, + "learning_rate": 1.1922107674684994e-06, + "loss": 0.8467, + "mean_token_accuracy": 0.7496572816371918, + "num_tokens": 57805121.0, + "step": 21200 + }, + { + "entropy": 2.2604011583328245, + "epoch": 0.9444444444444444, + "grad_norm": 7.28125, + "learning_rate": 1.1463917525773197e-06, + "loss": 0.8553, + "mean_token_accuracy": 0.7471484684944153, + "num_tokens": 57869045.0, + "step": 21250 + }, + { + "entropy": 2.236326413154602, + "epoch": 0.9466666666666667, + "grad_norm": 8.625, + "learning_rate": 1.1005727376861399e-06, + "loss": 0.8684, + "mean_token_accuracy": 0.7434519910812378, + "num_tokens": 57937773.0, + "step": 21300 + }, + { + "entropy": 2.2119676208496095, + "epoch": 0.9488888888888889, + "grad_norm": 7.5, + "learning_rate": 1.05475372279496e-06, + "loss": 0.8414, + "mean_token_accuracy": 0.7505972480773926, + "num_tokens": 57999509.0, + "step": 21350 + }, + { + "entropy": 2.19934800863266, + "epoch": 0.9511111111111111, + "grad_norm": 10.0, + "learning_rate": 1.0089347079037801e-06, + "loss": 0.8283, + "mean_token_accuracy": 0.7531795799732208, + "num_tokens": 58066324.0, + "step": 21400 + }, + { + "entropy": 2.189887263774872, + "epoch": 0.9533333333333334, + "grad_norm": 8.6875, + "learning_rate": 9.631156930126003e-07, + "loss": 0.8438, + "mean_token_accuracy": 0.7486509644985199, + "num_tokens": 58133522.0, + "step": 21450 + }, + { + "entropy": 2.2022621488571166, + "epoch": 0.9555555555555556, + "grad_norm": 12.0, + "learning_rate": 9.172966781214204e-07, + "loss": 0.8816, + "mean_token_accuracy": 0.7386648190021515, + "num_tokens": 58201032.0, + "step": 21500 + }, + { + "entropy": 2.2088757705688478, + "epoch": 0.9577777777777777, + "grad_norm": 7.65625, + "learning_rate": 8.714776632302406e-07, + "loss": 0.8685, + "mean_token_accuracy": 0.745578328371048, + "num_tokens": 58266051.0, + "step": 21550 + }, + { + "entropy": 2.2617397117614746, + "epoch": 0.96, + "grad_norm": 7.5625, + "learning_rate": 8.256586483390607e-07, + "loss": 0.857, + "mean_token_accuracy": 0.7448407518863678, + "num_tokens": 58332021.0, + "step": 21600 + }, + { + "entropy": 2.2229527854919433, + "epoch": 0.9622222222222222, + "grad_norm": 6.625, + "learning_rate": 7.79839633447881e-07, + "loss": 0.8636, + "mean_token_accuracy": 0.7455604159832001, + "num_tokens": 58396998.0, + "step": 21650 + }, + { + "entropy": 2.179794452190399, + "epoch": 0.9644444444444444, + "grad_norm": 7.71875, + "learning_rate": 7.340206185567011e-07, + "loss": 0.8293, + "mean_token_accuracy": 0.754916387796402, + "num_tokens": 58464526.0, + "step": 21700 + }, + { + "entropy": 2.2393770956993104, + "epoch": 0.9666666666666667, + "grad_norm": 12.4375, + "learning_rate": 6.882016036655212e-07, + "loss": 0.8434, + "mean_token_accuracy": 0.7503564131259918, + "num_tokens": 58530361.0, + "step": 21750 + }, + { + "entropy": 2.263950316905975, + "epoch": 0.9688888888888889, + "grad_norm": 6.40625, + "learning_rate": 6.423825887743414e-07, + "loss": 0.8466, + "mean_token_accuracy": 0.748320734500885, + "num_tokens": 58595415.0, + "step": 21800 + }, + { + "entropy": 2.3069614815711974, + "epoch": 0.9711111111111111, + "grad_norm": 8.4375, + "learning_rate": 5.965635738831616e-07, + "loss": 0.8578, + "mean_token_accuracy": 0.7452424871921539, + "num_tokens": 58658566.0, + "step": 21850 + }, + { + "entropy": 2.2174816036224367, + "epoch": 0.9733333333333334, + "grad_norm": 8.875, + "learning_rate": 5.507445589919817e-07, + "loss": 0.8814, + "mean_token_accuracy": 0.743362922668457, + "num_tokens": 58728364.0, + "step": 21900 + }, + { + "entropy": 2.2033449959754945, + "epoch": 0.9755555555555555, + "grad_norm": 7.15625, + "learning_rate": 5.049255441008018e-07, + "loss": 0.8709, + "mean_token_accuracy": 0.7446471619606018, + "num_tokens": 58796528.0, + "step": 21950 + }, + { + "entropy": 2.262139241695404, + "epoch": 0.9777777777777777, + "grad_norm": 12.75, + "learning_rate": 4.59106529209622e-07, + "loss": 0.8688, + "mean_token_accuracy": 0.7436162507534028, + "num_tokens": 58862439.0, + "step": 22000 + }, + { + "entropy": 2.2621299147605898, + "epoch": 0.98, + "grad_norm": 6.25, + "learning_rate": 4.132875143184422e-07, + "loss": 0.866, + "mean_token_accuracy": 0.7452659630775451, + "num_tokens": 58930357.0, + "step": 22050 + }, + { + "entropy": 2.2046650099754332, + "epoch": 0.9822222222222222, + "grad_norm": 8.0, + "learning_rate": 3.674684994272623e-07, + "loss": 0.8945, + "mean_token_accuracy": 0.7410675585269928, + "num_tokens": 58997717.0, + "step": 22100 + }, + { + "entropy": 2.210694715976715, + "epoch": 0.9844444444444445, + "grad_norm": 7.71875, + "learning_rate": 3.216494845360825e-07, + "loss": 0.8986, + "mean_token_accuracy": 0.7420401775836944, + "num_tokens": 59064018.0, + "step": 22150 + }, + { + "entropy": 2.196241397857666, + "epoch": 0.9866666666666667, + "grad_norm": 7.34375, + "learning_rate": 2.758304696449026e-07, + "loss": 0.9046, + "mean_token_accuracy": 0.7343410396575928, + "num_tokens": 59131839.0, + "step": 22200 + }, + { + "entropy": 2.1624761509895323, + "epoch": 0.9888888888888889, + "grad_norm": 8.6875, + "learning_rate": 2.3001145475372283e-07, + "loss": 0.8625, + "mean_token_accuracy": 0.749553587436676, + "num_tokens": 59199340.0, + "step": 22250 + }, + { + "entropy": 2.214530596733093, + "epoch": 0.9911111111111112, + "grad_norm": 7.21875, + "learning_rate": 1.8419243986254296e-07, + "loss": 0.9018, + "mean_token_accuracy": 0.7367948520183564, + "num_tokens": 59265961.0, + "step": 22300 + }, + { + "entropy": 2.189036545753479, + "epoch": 0.9933333333333333, + "grad_norm": 7.0625, + "learning_rate": 1.3837342497136314e-07, + "loss": 0.9118, + "mean_token_accuracy": 0.7363495242595672, + "num_tokens": 59335350.0, + "step": 22350 + }, + { + "entropy": 2.2046823930740356, + "epoch": 0.9955555555555555, + "grad_norm": 7.65625, + "learning_rate": 9.255441008018328e-08, + "loss": 0.8634, + "mean_token_accuracy": 0.7454369437694549, + "num_tokens": 59401509.0, + "step": 22400 + }, + { + "entropy": 2.1603779411315918, + "epoch": 0.9977777777777778, + "grad_norm": 8.25, + "learning_rate": 4.673539518900344e-08, + "loss": 0.8957, + "mean_token_accuracy": 0.741243121623993, + "num_tokens": 59470524.0, + "step": 22450 + }, + { + "entropy": 2.1624831557273865, + "epoch": 1.0, + "grad_norm": 9.0625, + "learning_rate": 9.163802978235968e-10, + "loss": 0.8587, + "mean_token_accuracy": 0.7448894453048706, + "num_tokens": 59536445.0, + "step": 22500 + } + ], + "logging_steps": 50, + "max_steps": 22500, + "num_input_tokens_seen": 0, + "num_train_epochs": 9223372036854775807, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.515437740795392e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}