diff --git "a/CFunModel/trainer_state.json" "b/CFunModel/trainer_state.json" new file mode 100644--- /dev/null +++ "b/CFunModel/trainer_state.json" @@ -0,0 +1,22033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 220, + "global_step": 2200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "batch_num_effect_tokens": 7156, + "batch_num_samples": 149, + "batch_num_tokens": 50569, + "epoch": 0.00091, + "grad_norm": 30.947839736938477, + "learning_rate": 9.090909090909091e-08, + "loss": 2.9805, + "step": 1 + }, + { + "batch_num_effect_tokens": 6304, + "batch_num_samples": 149, + "batch_num_tokens": 52221, + "epoch": 0.00182, + "grad_norm": 38.34111785888672, + "learning_rate": 1.8181818181818183e-07, + "loss": 3.1035, + "step": 2 + }, + { + "batch_num_effect_tokens": 8816, + "batch_num_samples": 150, + "batch_num_tokens": 52216, + "epoch": 0.00273, + "grad_norm": 28.859642028808594, + "learning_rate": 2.7272727272727274e-07, + "loss": 3.1328, + "step": 3 + }, + { + "batch_num_effect_tokens": 8810, + "batch_num_samples": 149, + "batch_num_tokens": 52155, + "epoch": 0.00364, + "grad_norm": 28.776859283447266, + "learning_rate": 3.6363636363636366e-07, + "loss": 2.8384, + "step": 4 + }, + { + "batch_num_effect_tokens": 7038, + "batch_num_samples": 149, + "batch_num_tokens": 52159, + "epoch": 0.00455, + "grad_norm": 34.464454650878906, + "learning_rate": 4.5454545454545457e-07, + "loss": 2.9175, + "step": 5 + }, + { + "batch_num_effect_tokens": 11920, + "batch_num_samples": 149, + "batch_num_tokens": 52199, + "epoch": 0.00545, + "grad_norm": 19.11911392211914, + "learning_rate": 5.454545454545455e-07, + "loss": 3.0703, + "step": 6 + }, + { + "batch_num_effect_tokens": 6632, + "batch_num_samples": 149, + "batch_num_tokens": 52118, + "epoch": 0.00636, + "grad_norm": 37.14106750488281, + "learning_rate": 6.363636363636364e-07, + "loss": 3.0396, + "step": 7 + }, + { + "batch_num_effect_tokens": 7207, + "batch_num_samples": 150, + "batch_num_tokens": 52142, + "epoch": 0.00727, + "grad_norm": 32.404598236083984, + "learning_rate": 7.272727272727273e-07, + "loss": 3.0059, + "step": 8 + }, + { + "batch_num_effect_tokens": 8803, + "batch_num_samples": 149, + "batch_num_tokens": 52133, + "epoch": 0.00818, + "grad_norm": 26.34598731994629, + "learning_rate": 8.181818181818182e-07, + "loss": 2.9204, + "step": 9 + }, + { + "batch_num_effect_tokens": 6747, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.00909, + "grad_norm": 33.00784683227539, + "learning_rate": 9.090909090909091e-07, + "loss": 2.7783, + "step": 10 + }, + { + "batch_num_effect_tokens": 7202, + "batch_num_samples": 149, + "batch_num_tokens": 52139, + "epoch": 0.01, + "grad_norm": 27.24934196472168, + "learning_rate": 1.0000000000000002e-06, + "loss": 2.7075, + "step": 11 + }, + { + "batch_num_effect_tokens": 7435, + "batch_num_samples": 149, + "batch_num_tokens": 52161, + "epoch": 0.01091, + "grad_norm": 21.876995086669922, + "learning_rate": 1.090909090909091e-06, + "loss": 2.7607, + "step": 12 + }, + { + "batch_num_effect_tokens": 8055, + "batch_num_samples": 149, + "batch_num_tokens": 52134, + "epoch": 0.01182, + "grad_norm": 16.76250648498535, + "learning_rate": 1.181818181818182e-06, + "loss": 2.6719, + "step": 13 + }, + { + "batch_num_effect_tokens": 5491, + "batch_num_samples": 149, + "batch_num_tokens": 52179, + "epoch": 0.01273, + "grad_norm": 20.941076278686523, + "learning_rate": 1.2727272727272728e-06, + "loss": 2.6194, + "step": 14 + }, + { + "batch_num_effect_tokens": 8461, + "batch_num_samples": 149, + "batch_num_tokens": 52163, + "epoch": 0.01364, + "grad_norm": 10.907334327697754, + "learning_rate": 1.3636363636363636e-06, + "loss": 2.5329, + "step": 15 + }, + { + "batch_num_effect_tokens": 7714, + "batch_num_samples": 149, + "batch_num_tokens": 52124, + "epoch": 0.01455, + "grad_norm": 7.7442803382873535, + "learning_rate": 1.4545454545454546e-06, + "loss": 2.4167, + "step": 16 + }, + { + "batch_num_effect_tokens": 6003, + "batch_num_samples": 149, + "batch_num_tokens": 52177, + "epoch": 0.01545, + "grad_norm": 7.480747222900391, + "learning_rate": 1.5454545454545454e-06, + "loss": 2.2767, + "step": 17 + }, + { + "batch_num_effect_tokens": 8222, + "batch_num_samples": 149, + "batch_num_tokens": 52189, + "epoch": 0.01636, + "grad_norm": 6.072074890136719, + "learning_rate": 1.6363636363636365e-06, + "loss": 2.2823, + "step": 18 + }, + { + "batch_num_effect_tokens": 4948, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 0.01727, + "grad_norm": 5.665102481842041, + "learning_rate": 1.7272727272727275e-06, + "loss": 1.5879, + "step": 19 + }, + { + "batch_num_effect_tokens": 4090, + "batch_num_samples": 149, + "batch_num_tokens": 52146, + "epoch": 0.01818, + "grad_norm": 5.8563666343688965, + "learning_rate": 1.8181818181818183e-06, + "loss": 1.5819, + "step": 20 + }, + { + "batch_num_effect_tokens": 6086, + "batch_num_samples": 149, + "batch_num_tokens": 50559, + "epoch": 0.01909, + "grad_norm": 4.837255001068115, + "learning_rate": 1.9090909090909095e-06, + "loss": 1.9778, + "step": 21 + }, + { + "batch_num_effect_tokens": 5614, + "batch_num_samples": 149, + "batch_num_tokens": 52183, + "epoch": 0.02, + "grad_norm": 8.160093307495117, + "learning_rate": 2.0000000000000003e-06, + "loss": 2.0801, + "step": 22 + }, + { + "batch_num_effect_tokens": 6381, + "batch_num_samples": 149, + "batch_num_tokens": 52164, + "epoch": 0.02091, + "grad_norm": 4.796694755554199, + "learning_rate": 2.090909090909091e-06, + "loss": 2.1119, + "step": 23 + }, + { + "batch_num_effect_tokens": 5944, + "batch_num_samples": 149, + "batch_num_tokens": 52181, + "epoch": 0.02182, + "grad_norm": 4.278438091278076, + "learning_rate": 2.181818181818182e-06, + "loss": 1.9312, + "step": 24 + }, + { + "batch_num_effect_tokens": 8000, + "batch_num_samples": 149, + "batch_num_tokens": 52214, + "epoch": 0.02273, + "grad_norm": 4.323818683624268, + "learning_rate": 2.2727272727272728e-06, + "loss": 2.2292, + "step": 25 + }, + { + "batch_num_effect_tokens": 8597, + "batch_num_samples": 149, + "batch_num_tokens": 52153, + "epoch": 0.02364, + "grad_norm": 4.498684406280518, + "learning_rate": 2.363636363636364e-06, + "loss": 2.3325, + "step": 26 + }, + { + "batch_num_effect_tokens": 8007, + "batch_num_samples": 149, + "batch_num_tokens": 52177, + "epoch": 0.02455, + "grad_norm": 4.384886741638184, + "learning_rate": 2.454545454545455e-06, + "loss": 2.2605, + "step": 27 + }, + { + "batch_num_effect_tokens": 7764, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.02545, + "grad_norm": 4.394514560699463, + "learning_rate": 2.5454545454545456e-06, + "loss": 2.219, + "step": 28 + }, + { + "batch_num_effect_tokens": 6261, + "batch_num_samples": 149, + "batch_num_tokens": 52171, + "epoch": 0.02636, + "grad_norm": 4.165273189544678, + "learning_rate": 2.6363636363636364e-06, + "loss": 2.028, + "step": 29 + }, + { + "batch_num_effect_tokens": 7873, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 0.02727, + "grad_norm": 4.326238632202148, + "learning_rate": 2.7272727272727272e-06, + "loss": 2.3767, + "step": 30 + }, + { + "batch_num_effect_tokens": 6274, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.02818, + "grad_norm": 4.087923526763916, + "learning_rate": 2.818181818181818e-06, + "loss": 2.1954, + "step": 31 + }, + { + "batch_num_effect_tokens": 5754, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 0.02909, + "grad_norm": 4.483077526092529, + "learning_rate": 2.9090909090909093e-06, + "loss": 2.1962, + "step": 32 + }, + { + "batch_num_effect_tokens": 3874, + "batch_num_samples": 149, + "batch_num_tokens": 52166, + "epoch": 0.03, + "grad_norm": 4.695860862731934, + "learning_rate": 3e-06, + "loss": 1.4154, + "step": 33 + }, + { + "batch_num_effect_tokens": 7026, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 0.03091, + "grad_norm": 4.0346293449401855, + "learning_rate": 3.090909090909091e-06, + "loss": 2.1736, + "step": 34 + }, + { + "batch_num_effect_tokens": 9795, + "batch_num_samples": 149, + "batch_num_tokens": 52172, + "epoch": 0.03182, + "grad_norm": 3.8173470497131348, + "learning_rate": 3.181818181818182e-06, + "loss": 2.5917, + "step": 35 + }, + { + "batch_num_effect_tokens": 7027, + "batch_num_samples": 149, + "batch_num_tokens": 52179, + "epoch": 0.03273, + "grad_norm": 3.9705400466918945, + "learning_rate": 3.272727272727273e-06, + "loss": 2.1748, + "step": 36 + }, + { + "batch_num_effect_tokens": 6395, + "batch_num_samples": 149, + "batch_num_tokens": 52129, + "epoch": 0.03364, + "grad_norm": 4.2313923835754395, + "learning_rate": 3.3636363636363637e-06, + "loss": 2.0255, + "step": 37 + }, + { + "batch_num_effect_tokens": 5370, + "batch_num_samples": 149, + "batch_num_tokens": 52163, + "epoch": 0.03455, + "grad_norm": 4.113375186920166, + "learning_rate": 3.454545454545455e-06, + "loss": 1.8015, + "step": 38 + }, + { + "batch_num_effect_tokens": 5401, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 0.03545, + "grad_norm": 5.210320949554443, + "learning_rate": 3.5454545454545458e-06, + "loss": 1.9014, + "step": 39 + }, + { + "batch_num_effect_tokens": 4069, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 0.03636, + "grad_norm": 4.81237268447876, + "learning_rate": 3.6363636363636366e-06, + "loss": 1.6282, + "step": 40 + }, + { + "batch_num_effect_tokens": 7244, + "batch_num_samples": 149, + "batch_num_tokens": 52183, + "epoch": 0.03727, + "grad_norm": 3.647012233734131, + "learning_rate": 3.727272727272728e-06, + "loss": 1.9749, + "step": 41 + }, + { + "batch_num_effect_tokens": 6864, + "batch_num_samples": 150, + "batch_num_tokens": 52218, + "epoch": 0.03818, + "grad_norm": 3.990999460220337, + "learning_rate": 3.818181818181819e-06, + "loss": 2.0056, + "step": 42 + }, + { + "batch_num_effect_tokens": 6332, + "batch_num_samples": 149, + "batch_num_tokens": 52180, + "epoch": 0.03909, + "grad_norm": 4.233860969543457, + "learning_rate": 3.90909090909091e-06, + "loss": 2.155, + "step": 43 + }, + { + "batch_num_effect_tokens": 7136, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 0.04, + "grad_norm": 3.865319013595581, + "learning_rate": 4.000000000000001e-06, + "loss": 1.9899, + "step": 44 + }, + { + "batch_num_effect_tokens": 5198, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 0.04091, + "grad_norm": 3.954425096511841, + "learning_rate": 4.0909090909090915e-06, + "loss": 1.6524, + "step": 45 + }, + { + "batch_num_effect_tokens": 5478, + "batch_num_samples": 149, + "batch_num_tokens": 52117, + "epoch": 0.04182, + "grad_norm": 4.367476940155029, + "learning_rate": 4.181818181818182e-06, + "loss": 2.0508, + "step": 46 + }, + { + "batch_num_effect_tokens": 4745, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.04273, + "grad_norm": 4.386691570281982, + "learning_rate": 4.272727272727273e-06, + "loss": 1.6864, + "step": 47 + }, + { + "batch_num_effect_tokens": 4896, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.04364, + "grad_norm": 4.337430477142334, + "learning_rate": 4.363636363636364e-06, + "loss": 1.5469, + "step": 48 + }, + { + "batch_num_effect_tokens": 5867, + "batch_num_samples": 149, + "batch_num_tokens": 52141, + "epoch": 0.04455, + "grad_norm": 4.468596458435059, + "learning_rate": 4.454545454545455e-06, + "loss": 2.1321, + "step": 49 + }, + { + "batch_num_effect_tokens": 3693, + "batch_num_samples": 149, + "batch_num_tokens": 52147, + "epoch": 0.04545, + "grad_norm": 4.92578125, + "learning_rate": 4.5454545454545455e-06, + "loss": 1.4774, + "step": 50 + }, + { + "batch_num_effect_tokens": 8159, + "batch_num_samples": 149, + "batch_num_tokens": 52181, + "epoch": 0.04636, + "grad_norm": 3.620227098464966, + "learning_rate": 4.636363636363636e-06, + "loss": 2.0459, + "step": 51 + }, + { + "batch_num_effect_tokens": 7699, + "batch_num_samples": 149, + "batch_num_tokens": 52158, + "epoch": 0.04727, + "grad_norm": 3.7664144039154053, + "learning_rate": 4.727272727272728e-06, + "loss": 2.2405, + "step": 52 + }, + { + "batch_num_effect_tokens": 6907, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 0.04818, + "grad_norm": 3.695967435836792, + "learning_rate": 4.818181818181819e-06, + "loss": 2.0831, + "step": 53 + }, + { + "batch_num_effect_tokens": 8038, + "batch_num_samples": 149, + "batch_num_tokens": 52170, + "epoch": 0.04909, + "grad_norm": 3.6256861686706543, + "learning_rate": 4.90909090909091e-06, + "loss": 1.9589, + "step": 54 + }, + { + "batch_num_effect_tokens": 5507, + "batch_num_samples": 149, + "batch_num_tokens": 52149, + "epoch": 0.05, + "grad_norm": 4.499513626098633, + "learning_rate": 5e-06, + "loss": 2.1505, + "step": 55 + }, + { + "batch_num_effect_tokens": 7857, + "batch_num_samples": 149, + "batch_num_tokens": 52160, + "epoch": 0.05091, + "grad_norm": 3.6978249549865723, + "learning_rate": 5.090909090909091e-06, + "loss": 2.1525, + "step": 56 + }, + { + "batch_num_effect_tokens": 9794, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 0.05182, + "grad_norm": 3.654414653778076, + "learning_rate": 5.181818181818182e-06, + "loss": 2.3803, + "step": 57 + }, + { + "batch_num_effect_tokens": 7112, + "batch_num_samples": 149, + "batch_num_tokens": 52213, + "epoch": 0.05273, + "grad_norm": 3.9470489025115967, + "learning_rate": 5.272727272727273e-06, + "loss": 2.1569, + "step": 58 + }, + { + "batch_num_effect_tokens": 5113, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 0.05364, + "grad_norm": 4.192826747894287, + "learning_rate": 5.3636363636363645e-06, + "loss": 1.9103, + "step": 59 + }, + { + "batch_num_effect_tokens": 7647, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 0.05455, + "grad_norm": 4.059316635131836, + "learning_rate": 5.4545454545454545e-06, + "loss": 2.1822, + "step": 60 + }, + { + "batch_num_effect_tokens": 4677, + "batch_num_samples": 149, + "batch_num_tokens": 50524, + "epoch": 0.05545, + "grad_norm": 4.552770137786865, + "learning_rate": 5.545454545454546e-06, + "loss": 1.7139, + "step": 61 + }, + { + "batch_num_effect_tokens": 9222, + "batch_num_samples": 149, + "batch_num_tokens": 52176, + "epoch": 0.05636, + "grad_norm": 3.726174831390381, + "learning_rate": 5.636363636363636e-06, + "loss": 2.2145, + "step": 62 + }, + { + "batch_num_effect_tokens": 4083, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 0.05727, + "grad_norm": 4.518796443939209, + "learning_rate": 5.727272727272728e-06, + "loss": 1.4964, + "step": 63 + }, + { + "batch_num_effect_tokens": 6641, + "batch_num_samples": 150, + "batch_num_tokens": 52152, + "epoch": 0.05818, + "grad_norm": 4.039824485778809, + "learning_rate": 5.8181818181818185e-06, + "loss": 1.9297, + "step": 64 + }, + { + "batch_num_effect_tokens": 6507, + "batch_num_samples": 149, + "batch_num_tokens": 52147, + "epoch": 0.05909, + "grad_norm": 4.078365802764893, + "learning_rate": 5.90909090909091e-06, + "loss": 2.0474, + "step": 65 + }, + { + "batch_num_effect_tokens": 6856, + "batch_num_samples": 149, + "batch_num_tokens": 52111, + "epoch": 0.06, + "grad_norm": 4.096185207366943, + "learning_rate": 6e-06, + "loss": 2.1375, + "step": 66 + }, + { + "batch_num_effect_tokens": 6408, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 0.06091, + "grad_norm": 4.003268718719482, + "learning_rate": 6.090909090909092e-06, + "loss": 2.0536, + "step": 67 + }, + { + "batch_num_effect_tokens": 6312, + "batch_num_samples": 149, + "batch_num_tokens": 52152, + "epoch": 0.06182, + "grad_norm": 4.170324325561523, + "learning_rate": 6.181818181818182e-06, + "loss": 2.3304, + "step": 68 + }, + { + "batch_num_effect_tokens": 8592, + "batch_num_samples": 149, + "batch_num_tokens": 52157, + "epoch": 0.06273, + "grad_norm": 3.4838006496429443, + "learning_rate": 6.2727272727272734e-06, + "loss": 2.1861, + "step": 69 + }, + { + "batch_num_effect_tokens": 6987, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 0.06364, + "grad_norm": 3.732853651046753, + "learning_rate": 6.363636363636364e-06, + "loss": 2.2194, + "step": 70 + }, + { + "batch_num_effect_tokens": 5225, + "batch_num_samples": 149, + "batch_num_tokens": 52180, + "epoch": 0.06455, + "grad_norm": 4.206742286682129, + "learning_rate": 6.454545454545456e-06, + "loss": 1.926, + "step": 71 + }, + { + "batch_num_effect_tokens": 9847, + "batch_num_samples": 149, + "batch_num_tokens": 52177, + "epoch": 0.06545, + "grad_norm": 3.4091250896453857, + "learning_rate": 6.545454545454546e-06, + "loss": 2.2813, + "step": 72 + }, + { + "batch_num_effect_tokens": 5197, + "batch_num_samples": 149, + "batch_num_tokens": 52215, + "epoch": 0.06636, + "grad_norm": 4.02803897857666, + "learning_rate": 6.6363636363636375e-06, + "loss": 1.8807, + "step": 73 + }, + { + "batch_num_effect_tokens": 9185, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.06727, + "grad_norm": 3.7550344467163086, + "learning_rate": 6.7272727272727275e-06, + "loss": 2.2754, + "step": 74 + }, + { + "batch_num_effect_tokens": 7074, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 0.06818, + "grad_norm": 3.7421798706054688, + "learning_rate": 6.818181818181818e-06, + "loss": 1.9489, + "step": 75 + }, + { + "batch_num_effect_tokens": 5189, + "batch_num_samples": 149, + "batch_num_tokens": 52222, + "epoch": 0.06909, + "grad_norm": 4.224911689758301, + "learning_rate": 6.90909090909091e-06, + "loss": 1.9238, + "step": 76 + }, + { + "batch_num_effect_tokens": 5192, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 0.07, + "grad_norm": 3.922016143798828, + "learning_rate": 7e-06, + "loss": 1.7985, + "step": 77 + }, + { + "batch_num_effect_tokens": 5656, + "batch_num_samples": 149, + "batch_num_tokens": 52187, + "epoch": 0.07091, + "grad_norm": 4.252484321594238, + "learning_rate": 7.0909090909090916e-06, + "loss": 1.9175, + "step": 78 + }, + { + "batch_num_effect_tokens": 4409, + "batch_num_samples": 149, + "batch_num_tokens": 52222, + "epoch": 0.07182, + "grad_norm": 4.439416885375977, + "learning_rate": 7.181818181818182e-06, + "loss": 1.6123, + "step": 79 + }, + { + "batch_num_effect_tokens": 5713, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 0.07273, + "grad_norm": 4.0134124755859375, + "learning_rate": 7.272727272727273e-06, + "loss": 1.9377, + "step": 80 + }, + { + "batch_num_effect_tokens": 6445, + "batch_num_samples": 149, + "batch_num_tokens": 52135, + "epoch": 0.07364, + "grad_norm": 4.2473907470703125, + "learning_rate": 7.363636363636364e-06, + "loss": 1.9624, + "step": 81 + }, + { + "batch_num_effect_tokens": 7883, + "batch_num_samples": 149, + "batch_num_tokens": 52165, + "epoch": 0.07455, + "grad_norm": 3.6550698280334473, + "learning_rate": 7.454545454545456e-06, + "loss": 2.0343, + "step": 82 + }, + { + "batch_num_effect_tokens": 5319, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 0.07545, + "grad_norm": 4.416125774383545, + "learning_rate": 7.545454545454546e-06, + "loss": 1.9891, + "step": 83 + }, + { + "batch_num_effect_tokens": 6165, + "batch_num_samples": 149, + "batch_num_tokens": 52206, + "epoch": 0.07636, + "grad_norm": 4.053107738494873, + "learning_rate": 7.636363636363638e-06, + "loss": 1.9417, + "step": 84 + }, + { + "batch_num_effect_tokens": 6178, + "batch_num_samples": 149, + "batch_num_tokens": 52157, + "epoch": 0.07727, + "grad_norm": 4.197380065917969, + "learning_rate": 7.727272727272727e-06, + "loss": 2.1951, + "step": 85 + }, + { + "batch_num_effect_tokens": 6818, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 0.07818, + "grad_norm": 3.6595616340637207, + "learning_rate": 7.81818181818182e-06, + "loss": 2.0797, + "step": 86 + }, + { + "batch_num_effect_tokens": 6648, + "batch_num_samples": 150, + "batch_num_tokens": 52218, + "epoch": 0.07909, + "grad_norm": 4.197087287902832, + "learning_rate": 7.909090909090909e-06, + "loss": 2.2547, + "step": 87 + }, + { + "batch_num_effect_tokens": 7709, + "batch_num_samples": 149, + "batch_num_tokens": 52136, + "epoch": 0.08, + "grad_norm": 4.223806381225586, + "learning_rate": 8.000000000000001e-06, + "loss": 2.1499, + "step": 88 + }, + { + "batch_num_effect_tokens": 6877, + "batch_num_samples": 149, + "batch_num_tokens": 52143, + "epoch": 0.08091, + "grad_norm": 3.888137102127075, + "learning_rate": 8.090909090909092e-06, + "loss": 2.1985, + "step": 89 + }, + { + "batch_num_effect_tokens": 8419, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 0.08182, + "grad_norm": 3.3746907711029053, + "learning_rate": 8.181818181818183e-06, + "loss": 2.1038, + "step": 90 + }, + { + "batch_num_effect_tokens": 5839, + "batch_num_samples": 149, + "batch_num_tokens": 52152, + "epoch": 0.08273, + "grad_norm": 4.06436824798584, + "learning_rate": 8.272727272727274e-06, + "loss": 1.9237, + "step": 91 + }, + { + "batch_num_effect_tokens": 3673, + "batch_num_samples": 149, + "batch_num_tokens": 52200, + "epoch": 0.08364, + "grad_norm": 4.105583667755127, + "learning_rate": 8.363636363636365e-06, + "loss": 1.5624, + "step": 92 + }, + { + "batch_num_effect_tokens": 4391, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 0.08455, + "grad_norm": 4.130006313323975, + "learning_rate": 8.454545454545455e-06, + "loss": 1.6963, + "step": 93 + }, + { + "batch_num_effect_tokens": 5418, + "batch_num_samples": 149, + "batch_num_tokens": 52130, + "epoch": 0.08545, + "grad_norm": 5.248917102813721, + "learning_rate": 8.545454545454546e-06, + "loss": 1.7968, + "step": 94 + }, + { + "batch_num_effect_tokens": 6581, + "batch_num_samples": 149, + "batch_num_tokens": 52132, + "epoch": 0.08636, + "grad_norm": 4.27993106842041, + "learning_rate": 8.636363636363637e-06, + "loss": 2.0516, + "step": 95 + }, + { + "batch_num_effect_tokens": 6002, + "batch_num_samples": 149, + "batch_num_tokens": 52185, + "epoch": 0.08727, + "grad_norm": 4.21064567565918, + "learning_rate": 8.727272727272728e-06, + "loss": 2.1405, + "step": 96 + }, + { + "batch_num_effect_tokens": 4853, + "batch_num_samples": 149, + "batch_num_tokens": 52202, + "epoch": 0.08818, + "grad_norm": 4.228909015655518, + "learning_rate": 8.818181818181819e-06, + "loss": 1.5968, + "step": 97 + }, + { + "batch_num_effect_tokens": 4766, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.08909, + "grad_norm": 4.670202255249023, + "learning_rate": 8.90909090909091e-06, + "loss": 1.8443, + "step": 98 + }, + { + "batch_num_effect_tokens": 7877, + "batch_num_samples": 149, + "batch_num_tokens": 52157, + "epoch": 0.09, + "grad_norm": 3.8756136894226074, + "learning_rate": 9e-06, + "loss": 2.3089, + "step": 99 + }, + { + "batch_num_effect_tokens": 5806, + "batch_num_samples": 149, + "batch_num_tokens": 52123, + "epoch": 0.09091, + "grad_norm": 3.7578859329223633, + "learning_rate": 9.090909090909091e-06, + "loss": 1.9039, + "step": 100 + }, + { + "batch_num_effect_tokens": 5456, + "batch_num_samples": 149, + "batch_num_tokens": 52219, + "epoch": 0.09182, + "grad_norm": 4.004074573516846, + "learning_rate": 9.181818181818184e-06, + "loss": 1.8958, + "step": 101 + }, + { + "batch_num_effect_tokens": 7545, + "batch_num_samples": 149, + "batch_num_tokens": 52134, + "epoch": 0.09273, + "grad_norm": 4.003213405609131, + "learning_rate": 9.272727272727273e-06, + "loss": 2.1584, + "step": 102 + }, + { + "batch_num_effect_tokens": 6497, + "batch_num_samples": 149, + "batch_num_tokens": 52165, + "epoch": 0.09364, + "grad_norm": 3.9445877075195312, + "learning_rate": 9.363636363636365e-06, + "loss": 2.1446, + "step": 103 + }, + { + "batch_num_effect_tokens": 11118, + "batch_num_samples": 149, + "batch_num_tokens": 52172, + "epoch": 0.09455, + "grad_norm": 3.3731114864349365, + "learning_rate": 9.454545454545456e-06, + "loss": 2.582, + "step": 104 + }, + { + "batch_num_effect_tokens": 8679, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 0.09545, + "grad_norm": 3.720583915710449, + "learning_rate": 9.545454545454547e-06, + "loss": 2.2406, + "step": 105 + }, + { + "batch_num_effect_tokens": 7387, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.09636, + "grad_norm": 4.026169776916504, + "learning_rate": 9.636363636363638e-06, + "loss": 2.1927, + "step": 106 + }, + { + "batch_num_effect_tokens": 6047, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.09727, + "grad_norm": 3.9075863361358643, + "learning_rate": 9.727272727272728e-06, + "loss": 1.8892, + "step": 107 + }, + { + "batch_num_effect_tokens": 8357, + "batch_num_samples": 149, + "batch_num_tokens": 52084, + "epoch": 0.09818, + "grad_norm": 3.6489765644073486, + "learning_rate": 9.81818181818182e-06, + "loss": 2.1783, + "step": 108 + }, + { + "batch_num_effect_tokens": 8611, + "batch_num_samples": 149, + "batch_num_tokens": 52174, + "epoch": 0.09909, + "grad_norm": 4.339054107666016, + "learning_rate": 9.90909090909091e-06, + "loss": 2.2626, + "step": 109 + }, + { + "batch_num_effect_tokens": 7198, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 0.1, + "grad_norm": 3.8931431770324707, + "learning_rate": 1e-05, + "loss": 2.1807, + "step": 110 + }, + { + "batch_num_effect_tokens": 4704, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 0.10091, + "grad_norm": 3.9868645668029785, + "learning_rate": 1.0090909090909092e-05, + "loss": 1.5651, + "step": 111 + }, + { + "batch_num_effect_tokens": 7161, + "batch_num_samples": 149, + "batch_num_tokens": 52106, + "epoch": 0.10182, + "grad_norm": 3.8002917766571045, + "learning_rate": 1.0181818181818182e-05, + "loss": 2.0946, + "step": 112 + }, + { + "batch_num_effect_tokens": 7873, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 0.10273, + "grad_norm": 4.119603157043457, + "learning_rate": 1.0272727272727275e-05, + "loss": 2.3052, + "step": 113 + }, + { + "batch_num_effect_tokens": 5651, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.10364, + "grad_norm": 4.133640766143799, + "learning_rate": 1.0363636363636364e-05, + "loss": 1.9881, + "step": 114 + }, + { + "batch_num_effect_tokens": 5795, + "batch_num_samples": 149, + "batch_num_tokens": 52173, + "epoch": 0.10455, + "grad_norm": 4.669517517089844, + "learning_rate": 1.0454545454545455e-05, + "loss": 2.0219, + "step": 115 + }, + { + "batch_num_effect_tokens": 8210, + "batch_num_samples": 149, + "batch_num_tokens": 52170, + "epoch": 0.10545, + "grad_norm": 3.619904041290283, + "learning_rate": 1.0545454545454546e-05, + "loss": 2.2578, + "step": 116 + }, + { + "batch_num_effect_tokens": 6330, + "batch_num_samples": 149, + "batch_num_tokens": 50569, + "epoch": 0.10636, + "grad_norm": 4.191747665405273, + "learning_rate": 1.0636363636363638e-05, + "loss": 2.0283, + "step": 117 + }, + { + "batch_num_effect_tokens": 5419, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 0.10727, + "grad_norm": 3.955883264541626, + "learning_rate": 1.0727272727272729e-05, + "loss": 1.7797, + "step": 118 + }, + { + "batch_num_effect_tokens": 6625, + "batch_num_samples": 149, + "batch_num_tokens": 52199, + "epoch": 0.10818, + "grad_norm": 3.954253673553467, + "learning_rate": 1.0818181818181818e-05, + "loss": 2.0083, + "step": 119 + }, + { + "batch_num_effect_tokens": 7511, + "batch_num_samples": 149, + "batch_num_tokens": 52139, + "epoch": 0.10909, + "grad_norm": 4.029529571533203, + "learning_rate": 1.0909090909090909e-05, + "loss": 2.2898, + "step": 120 + }, + { + "batch_num_effect_tokens": 7908, + "batch_num_samples": 150, + "batch_num_tokens": 52213, + "epoch": 0.11, + "grad_norm": 4.126637935638428, + "learning_rate": 1.1000000000000001e-05, + "loss": 2.3242, + "step": 121 + }, + { + "batch_num_effect_tokens": 6498, + "batch_num_samples": 149, + "batch_num_tokens": 52143, + "epoch": 0.11091, + "grad_norm": 3.942624092102051, + "learning_rate": 1.1090909090909092e-05, + "loss": 2.0677, + "step": 122 + }, + { + "batch_num_effect_tokens": 7055, + "batch_num_samples": 149, + "batch_num_tokens": 52213, + "epoch": 0.11182, + "grad_norm": 4.413516521453857, + "learning_rate": 1.1181818181818183e-05, + "loss": 2.178, + "step": 123 + }, + { + "batch_num_effect_tokens": 11530, + "batch_num_samples": 149, + "batch_num_tokens": 52181, + "epoch": 0.11273, + "grad_norm": 3.2890639305114746, + "learning_rate": 1.1272727272727272e-05, + "loss": 2.4971, + "step": 124 + }, + { + "batch_num_effect_tokens": 5723, + "batch_num_samples": 149, + "batch_num_tokens": 52115, + "epoch": 0.11364, + "grad_norm": 4.125791072845459, + "learning_rate": 1.1363636363636366e-05, + "loss": 2.0655, + "step": 125 + }, + { + "batch_num_effect_tokens": 5965, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 0.11455, + "grad_norm": 3.919264316558838, + "learning_rate": 1.1454545454545455e-05, + "loss": 2.1151, + "step": 126 + }, + { + "batch_num_effect_tokens": 9544, + "batch_num_samples": 149, + "batch_num_tokens": 52174, + "epoch": 0.11545, + "grad_norm": 3.3462822437286377, + "learning_rate": 1.1545454545454546e-05, + "loss": 2.2, + "step": 127 + }, + { + "batch_num_effect_tokens": 8131, + "batch_num_samples": 149, + "batch_num_tokens": 52221, + "epoch": 0.11636, + "grad_norm": 3.6498684883117676, + "learning_rate": 1.1636363636363637e-05, + "loss": 2.1605, + "step": 128 + }, + { + "batch_num_effect_tokens": 6962, + "batch_num_samples": 149, + "batch_num_tokens": 52183, + "epoch": 0.11727, + "grad_norm": 4.1046929359436035, + "learning_rate": 1.1727272727272728e-05, + "loss": 2.3661, + "step": 129 + }, + { + "batch_num_effect_tokens": 5453, + "batch_num_samples": 149, + "batch_num_tokens": 52131, + "epoch": 0.11818, + "grad_norm": 3.9962284564971924, + "learning_rate": 1.181818181818182e-05, + "loss": 1.6459, + "step": 130 + }, + { + "batch_num_effect_tokens": 6656, + "batch_num_samples": 150, + "batch_num_tokens": 52143, + "epoch": 0.11909, + "grad_norm": 4.055619239807129, + "learning_rate": 1.190909090909091e-05, + "loss": 2.0234, + "step": 131 + }, + { + "batch_num_effect_tokens": 8693, + "batch_num_samples": 150, + "batch_num_tokens": 52179, + "epoch": 0.12, + "grad_norm": 4.601987838745117, + "learning_rate": 1.2e-05, + "loss": 2.1816, + "step": 132 + }, + { + "batch_num_effect_tokens": 8201, + "batch_num_samples": 149, + "batch_num_tokens": 52134, + "epoch": 0.12091, + "grad_norm": 3.506317615509033, + "learning_rate": 1.2090909090909091e-05, + "loss": 2.0799, + "step": 133 + }, + { + "batch_num_effect_tokens": 5992, + "batch_num_samples": 149, + "batch_num_tokens": 52188, + "epoch": 0.12182, + "grad_norm": 5.071348190307617, + "learning_rate": 1.2181818181818184e-05, + "loss": 2.0837, + "step": 134 + }, + { + "batch_num_effect_tokens": 6012, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.12273, + "grad_norm": 3.723125457763672, + "learning_rate": 1.2272727272727274e-05, + "loss": 1.8454, + "step": 135 + }, + { + "batch_num_effect_tokens": 8301, + "batch_num_samples": 149, + "batch_num_tokens": 52218, + "epoch": 0.12364, + "grad_norm": 3.9254860877990723, + "learning_rate": 1.2363636363636364e-05, + "loss": 2.3271, + "step": 136 + }, + { + "batch_num_effect_tokens": 4103, + "batch_num_samples": 149, + "batch_num_tokens": 52166, + "epoch": 0.12455, + "grad_norm": 4.1657233238220215, + "learning_rate": 1.2454545454545454e-05, + "loss": 1.4053, + "step": 137 + }, + { + "batch_num_effect_tokens": 6719, + "batch_num_samples": 149, + "batch_num_tokens": 52128, + "epoch": 0.12545, + "grad_norm": 4.125800132751465, + "learning_rate": 1.2545454545454547e-05, + "loss": 2.1318, + "step": 138 + }, + { + "batch_num_effect_tokens": 5288, + "batch_num_samples": 149, + "batch_num_tokens": 52196, + "epoch": 0.12636, + "grad_norm": 4.3959174156188965, + "learning_rate": 1.2636363636363638e-05, + "loss": 1.8073, + "step": 139 + }, + { + "batch_num_effect_tokens": 4890, + "batch_num_samples": 149, + "batch_num_tokens": 52220, + "epoch": 0.12727, + "grad_norm": 3.882976770401001, + "learning_rate": 1.2727272727272728e-05, + "loss": 1.4711, + "step": 140 + }, + { + "batch_num_effect_tokens": 8337, + "batch_num_samples": 149, + "batch_num_tokens": 52168, + "epoch": 0.12818, + "grad_norm": 4.436567306518555, + "learning_rate": 1.281818181818182e-05, + "loss": 2.4239, + "step": 141 + }, + { + "batch_num_effect_tokens": 8868, + "batch_num_samples": 149, + "batch_num_tokens": 52155, + "epoch": 0.12909, + "grad_norm": 3.8208858966827393, + "learning_rate": 1.2909090909090912e-05, + "loss": 2.2625, + "step": 142 + }, + { + "batch_num_effect_tokens": 6646, + "batch_num_samples": 149, + "batch_num_tokens": 52160, + "epoch": 0.13, + "grad_norm": 3.840920925140381, + "learning_rate": 1.3000000000000001e-05, + "loss": 2.0837, + "step": 143 + }, + { + "batch_num_effect_tokens": 6819, + "batch_num_samples": 149, + "batch_num_tokens": 52187, + "epoch": 0.13091, + "grad_norm": 4.054839134216309, + "learning_rate": 1.3090909090909092e-05, + "loss": 1.9195, + "step": 144 + }, + { + "batch_num_effect_tokens": 4915, + "batch_num_samples": 149, + "batch_num_tokens": 52163, + "epoch": 0.13182, + "grad_norm": 3.954787492752075, + "learning_rate": 1.3181818181818183e-05, + "loss": 1.5082, + "step": 145 + }, + { + "batch_num_effect_tokens": 8163, + "batch_num_samples": 150, + "batch_num_tokens": 52154, + "epoch": 0.13273, + "grad_norm": 3.4955637454986572, + "learning_rate": 1.3272727272727275e-05, + "loss": 2.3483, + "step": 146 + }, + { + "batch_num_effect_tokens": 7112, + "batch_num_samples": 149, + "batch_num_tokens": 52145, + "epoch": 0.13364, + "grad_norm": 3.9054946899414062, + "learning_rate": 1.3363636363636366e-05, + "loss": 2.0471, + "step": 147 + }, + { + "batch_num_effect_tokens": 6689, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 0.13455, + "grad_norm": 3.8406553268432617, + "learning_rate": 1.3454545454545455e-05, + "loss": 1.997, + "step": 148 + }, + { + "batch_num_effect_tokens": 6119, + "batch_num_samples": 149, + "batch_num_tokens": 52142, + "epoch": 0.13545, + "grad_norm": 4.178918838500977, + "learning_rate": 1.3545454545454546e-05, + "loss": 1.9578, + "step": 149 + }, + { + "batch_num_effect_tokens": 6297, + "batch_num_samples": 149, + "batch_num_tokens": 52187, + "epoch": 0.13636, + "grad_norm": 4.147446632385254, + "learning_rate": 1.3636363636363637e-05, + "loss": 2.0032, + "step": 150 + }, + { + "batch_num_effect_tokens": 6621, + "batch_num_samples": 149, + "batch_num_tokens": 52141, + "epoch": 0.13727, + "grad_norm": 4.597929954528809, + "learning_rate": 1.3727272727272729e-05, + "loss": 2.2709, + "step": 151 + }, + { + "batch_num_effect_tokens": 7233, + "batch_num_samples": 149, + "batch_num_tokens": 52204, + "epoch": 0.13818, + "grad_norm": 3.8162758350372314, + "learning_rate": 1.381818181818182e-05, + "loss": 2.1774, + "step": 152 + }, + { + "batch_num_effect_tokens": 7396, + "batch_num_samples": 149, + "batch_num_tokens": 52102, + "epoch": 0.13909, + "grad_norm": 3.5979058742523193, + "learning_rate": 1.390909090909091e-05, + "loss": 2.1845, + "step": 153 + }, + { + "batch_num_effect_tokens": 7449, + "batch_num_samples": 149, + "batch_num_tokens": 52209, + "epoch": 0.14, + "grad_norm": 3.8547518253326416, + "learning_rate": 1.4e-05, + "loss": 2.201, + "step": 154 + }, + { + "batch_num_effect_tokens": 6947, + "batch_num_samples": 149, + "batch_num_tokens": 52133, + "epoch": 0.14091, + "grad_norm": 3.8475093841552734, + "learning_rate": 1.4090909090909092e-05, + "loss": 2.0156, + "step": 155 + }, + { + "batch_num_effect_tokens": 9360, + "batch_num_samples": 149, + "batch_num_tokens": 52160, + "epoch": 0.14182, + "grad_norm": 3.3440723419189453, + "learning_rate": 1.4181818181818183e-05, + "loss": 2.4424, + "step": 156 + }, + { + "batch_num_effect_tokens": 7812, + "batch_num_samples": 149, + "batch_num_tokens": 52163, + "epoch": 0.14273, + "grad_norm": 4.060225963592529, + "learning_rate": 1.4272727272727274e-05, + "loss": 2.1743, + "step": 157 + }, + { + "batch_num_effect_tokens": 5670, + "batch_num_samples": 149, + "batch_num_tokens": 52175, + "epoch": 0.14364, + "grad_norm": 4.626434326171875, + "learning_rate": 1.4363636363636365e-05, + "loss": 1.8219, + "step": 158 + }, + { + "batch_num_effect_tokens": 4969, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 0.14455, + "grad_norm": 3.8437905311584473, + "learning_rate": 1.4454545454545457e-05, + "loss": 1.7275, + "step": 159 + }, + { + "batch_num_effect_tokens": 6736, + "batch_num_samples": 149, + "batch_num_tokens": 52117, + "epoch": 0.14545, + "grad_norm": 4.022931098937988, + "learning_rate": 1.4545454545454546e-05, + "loss": 2.219, + "step": 160 + }, + { + "batch_num_effect_tokens": 6256, + "batch_num_samples": 149, + "batch_num_tokens": 52176, + "epoch": 0.14636, + "grad_norm": 3.9149224758148193, + "learning_rate": 1.4636363636363637e-05, + "loss": 2.0597, + "step": 161 + }, + { + "batch_num_effect_tokens": 5138, + "batch_num_samples": 149, + "batch_num_tokens": 52204, + "epoch": 0.14727, + "grad_norm": 4.706681728363037, + "learning_rate": 1.4727272727272728e-05, + "loss": 1.9799, + "step": 162 + }, + { + "batch_num_effect_tokens": 5858, + "batch_num_samples": 149, + "batch_num_tokens": 52132, + "epoch": 0.14818, + "grad_norm": 4.0887370109558105, + "learning_rate": 1.481818181818182e-05, + "loss": 1.9767, + "step": 163 + }, + { + "batch_num_effect_tokens": 7680, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 0.14909, + "grad_norm": 3.7380943298339844, + "learning_rate": 1.4909090909090911e-05, + "loss": 2.2115, + "step": 164 + }, + { + "batch_num_effect_tokens": 6705, + "batch_num_samples": 149, + "batch_num_tokens": 52177, + "epoch": 0.15, + "grad_norm": 4.241217136383057, + "learning_rate": 1.5000000000000002e-05, + "loss": 2.0222, + "step": 165 + }, + { + "batch_num_effect_tokens": 5555, + "batch_num_samples": 149, + "batch_num_tokens": 52139, + "epoch": 0.15091, + "grad_norm": 3.943678379058838, + "learning_rate": 1.5090909090909091e-05, + "loss": 1.9941, + "step": 166 + }, + { + "batch_num_effect_tokens": 8023, + "batch_num_samples": 149, + "batch_num_tokens": 52183, + "epoch": 0.15182, + "grad_norm": 3.7555346488952637, + "learning_rate": 1.5181818181818182e-05, + "loss": 2.2806, + "step": 167 + }, + { + "batch_num_effect_tokens": 5555, + "batch_num_samples": 149, + "batch_num_tokens": 52132, + "epoch": 0.15273, + "grad_norm": 3.931405782699585, + "learning_rate": 1.5272727272727276e-05, + "loss": 1.8538, + "step": 168 + }, + { + "batch_num_effect_tokens": 6675, + "batch_num_samples": 149, + "batch_num_tokens": 52177, + "epoch": 0.15364, + "grad_norm": 3.9357919692993164, + "learning_rate": 1.5363636363636365e-05, + "loss": 2.0735, + "step": 169 + }, + { + "batch_num_effect_tokens": 6744, + "batch_num_samples": 149, + "batch_num_tokens": 52161, + "epoch": 0.15455, + "grad_norm": 3.840468645095825, + "learning_rate": 1.5454545454545454e-05, + "loss": 1.9243, + "step": 170 + }, + { + "batch_num_effect_tokens": 4856, + "batch_num_samples": 149, + "batch_num_tokens": 50540, + "epoch": 0.15545, + "grad_norm": 3.940772294998169, + "learning_rate": 1.5545454545454547e-05, + "loss": 1.7354, + "step": 171 + }, + { + "batch_num_effect_tokens": 5268, + "batch_num_samples": 149, + "batch_num_tokens": 52130, + "epoch": 0.15636, + "grad_norm": 4.208749294281006, + "learning_rate": 1.563636363636364e-05, + "loss": 1.7741, + "step": 172 + }, + { + "batch_num_effect_tokens": 7239, + "batch_num_samples": 149, + "batch_num_tokens": 52176, + "epoch": 0.15727, + "grad_norm": 3.6911253929138184, + "learning_rate": 1.572727272727273e-05, + "loss": 2.1765, + "step": 173 + }, + { + "batch_num_effect_tokens": 7250, + "batch_num_samples": 149, + "batch_num_tokens": 52118, + "epoch": 0.15818, + "grad_norm": 3.9492316246032715, + "learning_rate": 1.5818181818181818e-05, + "loss": 2.145, + "step": 174 + }, + { + "batch_num_effect_tokens": 9635, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 0.15909, + "grad_norm": 3.3741977214813232, + "learning_rate": 1.590909090909091e-05, + "loss": 2.2064, + "step": 175 + }, + { + "batch_num_effect_tokens": 7578, + "batch_num_samples": 150, + "batch_num_tokens": 52141, + "epoch": 0.16, + "grad_norm": 3.4716920852661133, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.9224, + "step": 176 + }, + { + "batch_num_effect_tokens": 8088, + "batch_num_samples": 149, + "batch_num_tokens": 52209, + "epoch": 0.16091, + "grad_norm": 3.4077889919281006, + "learning_rate": 1.6090909090909092e-05, + "loss": 1.9219, + "step": 177 + }, + { + "batch_num_effect_tokens": 7364, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 0.16182, + "grad_norm": 3.86611270904541, + "learning_rate": 1.6181818181818184e-05, + "loss": 2.1574, + "step": 178 + }, + { + "batch_num_effect_tokens": 8004, + "batch_num_samples": 149, + "batch_num_tokens": 52163, + "epoch": 0.16273, + "grad_norm": 3.459207773208618, + "learning_rate": 1.6272727272727273e-05, + "loss": 2.0184, + "step": 179 + }, + { + "batch_num_effect_tokens": 4996, + "batch_num_samples": 149, + "batch_num_tokens": 52143, + "epoch": 0.16364, + "grad_norm": 4.295303821563721, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.8571, + "step": 180 + }, + { + "batch_num_effect_tokens": 7540, + "batch_num_samples": 149, + "batch_num_tokens": 52170, + "epoch": 0.16455, + "grad_norm": 3.390723943710327, + "learning_rate": 1.6454545454545455e-05, + "loss": 1.8768, + "step": 181 + }, + { + "batch_num_effect_tokens": 6088, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 0.16545, + "grad_norm": 4.339141368865967, + "learning_rate": 1.6545454545454548e-05, + "loss": 2.2682, + "step": 182 + }, + { + "batch_num_effect_tokens": 5206, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.16636, + "grad_norm": 3.778287649154663, + "learning_rate": 1.6636363636363637e-05, + "loss": 1.5497, + "step": 183 + }, + { + "batch_num_effect_tokens": 7747, + "batch_num_samples": 149, + "batch_num_tokens": 52132, + "epoch": 0.16727, + "grad_norm": 3.729515790939331, + "learning_rate": 1.672727272727273e-05, + "loss": 2.0643, + "step": 184 + }, + { + "batch_num_effect_tokens": 6412, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.16818, + "grad_norm": 3.697777271270752, + "learning_rate": 1.681818181818182e-05, + "loss": 1.869, + "step": 185 + }, + { + "batch_num_effect_tokens": 5186, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 0.16909, + "grad_norm": 4.121335506439209, + "learning_rate": 1.690909090909091e-05, + "loss": 1.7163, + "step": 186 + }, + { + "batch_num_effect_tokens": 6355, + "batch_num_samples": 149, + "batch_num_tokens": 52143, + "epoch": 0.17, + "grad_norm": 4.144448280334473, + "learning_rate": 1.7e-05, + "loss": 2.0994, + "step": 187 + }, + { + "batch_num_effect_tokens": 9178, + "batch_num_samples": 149, + "batch_num_tokens": 52180, + "epoch": 0.17091, + "grad_norm": 3.9074015617370605, + "learning_rate": 1.7090909090909092e-05, + "loss": 2.3383, + "step": 188 + }, + { + "batch_num_effect_tokens": 4468, + "batch_num_samples": 149, + "batch_num_tokens": 52136, + "epoch": 0.17182, + "grad_norm": 4.231569290161133, + "learning_rate": 1.7181818181818185e-05, + "loss": 1.7775, + "step": 189 + }, + { + "batch_num_effect_tokens": 6013, + "batch_num_samples": 149, + "batch_num_tokens": 52124, + "epoch": 0.17273, + "grad_norm": 4.446117877960205, + "learning_rate": 1.7272727272727274e-05, + "loss": 1.9935, + "step": 190 + }, + { + "batch_num_effect_tokens": 6840, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.17364, + "grad_norm": 3.933229684829712, + "learning_rate": 1.7363636363636363e-05, + "loss": 2.1809, + "step": 191 + }, + { + "batch_num_effect_tokens": 6780, + "batch_num_samples": 149, + "batch_num_tokens": 52170, + "epoch": 0.17455, + "grad_norm": 4.084712028503418, + "learning_rate": 1.7454545454545456e-05, + "loss": 2.1108, + "step": 192 + }, + { + "batch_num_effect_tokens": 6101, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 0.17545, + "grad_norm": 3.8516998291015625, + "learning_rate": 1.7545454545454548e-05, + "loss": 2.0655, + "step": 193 + }, + { + "batch_num_effect_tokens": 5591, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 0.17636, + "grad_norm": 3.7539749145507812, + "learning_rate": 1.7636363636363637e-05, + "loss": 1.7109, + "step": 194 + }, + { + "batch_num_effect_tokens": 6305, + "batch_num_samples": 149, + "batch_num_tokens": 50593, + "epoch": 0.17727, + "grad_norm": 4.456710338592529, + "learning_rate": 1.772727272727273e-05, + "loss": 2.2209, + "step": 195 + }, + { + "batch_num_effect_tokens": 6776, + "batch_num_samples": 149, + "batch_num_tokens": 52183, + "epoch": 0.17818, + "grad_norm": 3.7552881240844727, + "learning_rate": 1.781818181818182e-05, + "loss": 2.1274, + "step": 196 + }, + { + "batch_num_effect_tokens": 7808, + "batch_num_samples": 149, + "batch_num_tokens": 52160, + "epoch": 0.17909, + "grad_norm": 3.59682559967041, + "learning_rate": 1.790909090909091e-05, + "loss": 2.186, + "step": 197 + }, + { + "batch_num_effect_tokens": 10696, + "batch_num_samples": 150, + "batch_num_tokens": 52174, + "epoch": 0.18, + "grad_norm": 3.125765323638916, + "learning_rate": 1.8e-05, + "loss": 2.4307, + "step": 198 + }, + { + "batch_num_effect_tokens": 9770, + "batch_num_samples": 149, + "batch_num_tokens": 52161, + "epoch": 0.18091, + "grad_norm": 3.97883939743042, + "learning_rate": 1.8090909090909093e-05, + "loss": 2.5137, + "step": 199 + }, + { + "batch_num_effect_tokens": 6270, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 0.18182, + "grad_norm": 4.143767356872559, + "learning_rate": 1.8181818181818182e-05, + "loss": 2.023, + "step": 200 + }, + { + "batch_num_effect_tokens": 7577, + "batch_num_samples": 150, + "batch_num_tokens": 52153, + "epoch": 0.18273, + "grad_norm": 3.6797730922698975, + "learning_rate": 1.8272727272727275e-05, + "loss": 2.0665, + "step": 201 + }, + { + "batch_num_effect_tokens": 5559, + "batch_num_samples": 149, + "batch_num_tokens": 52166, + "epoch": 0.18364, + "grad_norm": 4.038333415985107, + "learning_rate": 1.8363636363636367e-05, + "loss": 2.075, + "step": 202 + }, + { + "batch_num_effect_tokens": 8606, + "batch_num_samples": 149, + "batch_num_tokens": 52211, + "epoch": 0.18455, + "grad_norm": 4.0963239669799805, + "learning_rate": 1.8454545454545456e-05, + "loss": 2.4193, + "step": 203 + }, + { + "batch_num_effect_tokens": 6120, + "batch_num_samples": 149, + "batch_num_tokens": 50540, + "epoch": 0.18545, + "grad_norm": 3.896514415740967, + "learning_rate": 1.8545454545454545e-05, + "loss": 2.1565, + "step": 204 + }, + { + "batch_num_effect_tokens": 7134, + "batch_num_samples": 149, + "batch_num_tokens": 52137, + "epoch": 0.18636, + "grad_norm": 3.8699703216552734, + "learning_rate": 1.8636363636363638e-05, + "loss": 2.3128, + "step": 205 + }, + { + "batch_num_effect_tokens": 8291, + "batch_num_samples": 149, + "batch_num_tokens": 52158, + "epoch": 0.18727, + "grad_norm": 3.4299514293670654, + "learning_rate": 1.872727272727273e-05, + "loss": 2.2254, + "step": 206 + }, + { + "batch_num_effect_tokens": 6028, + "batch_num_samples": 149, + "batch_num_tokens": 52185, + "epoch": 0.18818, + "grad_norm": 4.2485198974609375, + "learning_rate": 1.881818181818182e-05, + "loss": 1.8865, + "step": 207 + }, + { + "batch_num_effect_tokens": 6741, + "batch_num_samples": 149, + "batch_num_tokens": 52135, + "epoch": 0.18909, + "grad_norm": 3.9595305919647217, + "learning_rate": 1.8909090909090912e-05, + "loss": 2.0176, + "step": 208 + }, + { + "batch_num_effect_tokens": 4830, + "batch_num_samples": 149, + "batch_num_tokens": 52160, + "epoch": 0.19, + "grad_norm": 4.34491491317749, + "learning_rate": 1.9e-05, + "loss": 1.7146, + "step": 209 + }, + { + "batch_num_effect_tokens": 4728, + "batch_num_samples": 149, + "batch_num_tokens": 52179, + "epoch": 0.19091, + "grad_norm": 5.689029216766357, + "learning_rate": 1.9090909090909094e-05, + "loss": 1.7834, + "step": 210 + }, + { + "batch_num_effect_tokens": 5023, + "batch_num_samples": 149, + "batch_num_tokens": 52118, + "epoch": 0.19182, + "grad_norm": 4.221342086791992, + "learning_rate": 1.9181818181818183e-05, + "loss": 1.9373, + "step": 211 + }, + { + "batch_num_effect_tokens": 7665, + "batch_num_samples": 149, + "batch_num_tokens": 52158, + "epoch": 0.19273, + "grad_norm": 3.7729616165161133, + "learning_rate": 1.9272727272727275e-05, + "loss": 2.299, + "step": 212 + }, + { + "batch_num_effect_tokens": 7062, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 0.19364, + "grad_norm": 3.5272579193115234, + "learning_rate": 1.9363636363636364e-05, + "loss": 2.2194, + "step": 213 + }, + { + "batch_num_effect_tokens": 6712, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 0.19455, + "grad_norm": 3.839637279510498, + "learning_rate": 1.9454545454545457e-05, + "loss": 2.2719, + "step": 214 + }, + { + "batch_num_effect_tokens": 4438, + "batch_num_samples": 149, + "batch_num_tokens": 52180, + "epoch": 0.19545, + "grad_norm": 4.071404933929443, + "learning_rate": 1.9545454545454546e-05, + "loss": 1.7334, + "step": 215 + }, + { + "batch_num_effect_tokens": 5133, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 0.19636, + "grad_norm": 3.8490676879882812, + "learning_rate": 1.963636363636364e-05, + "loss": 1.8869, + "step": 216 + }, + { + "batch_num_effect_tokens": 6167, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 0.19727, + "grad_norm": 4.4581732749938965, + "learning_rate": 1.9727272727272728e-05, + "loss": 1.9831, + "step": 217 + }, + { + "batch_num_effect_tokens": 7929, + "batch_num_samples": 150, + "batch_num_tokens": 52220, + "epoch": 0.19818, + "grad_norm": 3.9144175052642822, + "learning_rate": 1.981818181818182e-05, + "loss": 2.1646, + "step": 218 + }, + { + "batch_num_effect_tokens": 8765, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 0.19909, + "grad_norm": 3.417576789855957, + "learning_rate": 1.9909090909090913e-05, + "loss": 2.2986, + "step": 219 + }, + { + "batch_num_effect_tokens": 5925, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 0.2, + "grad_norm": 3.8771045207977295, + "learning_rate": 2e-05, + "loss": 2.001, + "step": 220 + }, + { + "batch_num_effect_tokens": 7624, + "batch_num_samples": 150, + "batch_num_tokens": 52178, + "epoch": 0.20091, + "grad_norm": 3.4873709678649902, + "learning_rate": 1.9999987412505956e-05, + "loss": 2.0297, + "step": 221 + }, + { + "batch_num_effect_tokens": 5772, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.20182, + "grad_norm": 4.14601993560791, + "learning_rate": 1.9999949650055512e-05, + "loss": 2.0554, + "step": 222 + }, + { + "batch_num_effect_tokens": 5791, + "batch_num_samples": 149, + "batch_num_tokens": 52214, + "epoch": 0.20273, + "grad_norm": 3.8745217323303223, + "learning_rate": 1.9999886712743734e-05, + "loss": 1.9404, + "step": 223 + }, + { + "batch_num_effect_tokens": 6915, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 0.20364, + "grad_norm": 4.043534278869629, + "learning_rate": 1.9999798600729067e-05, + "loss": 2.2196, + "step": 224 + }, + { + "batch_num_effect_tokens": 5540, + "batch_num_samples": 149, + "batch_num_tokens": 52206, + "epoch": 0.20455, + "grad_norm": 3.968337059020996, + "learning_rate": 1.9999685314233333e-05, + "loss": 1.8076, + "step": 225 + }, + { + "batch_num_effect_tokens": 6473, + "batch_num_samples": 149, + "batch_num_tokens": 52209, + "epoch": 0.20545, + "grad_norm": 4.364017486572266, + "learning_rate": 1.9999546853541728e-05, + "loss": 2.166, + "step": 226 + }, + { + "batch_num_effect_tokens": 8991, + "batch_num_samples": 149, + "batch_num_tokens": 52202, + "epoch": 0.20636, + "grad_norm": 3.8495795726776123, + "learning_rate": 1.9999383219002836e-05, + "loss": 2.3994, + "step": 227 + }, + { + "batch_num_effect_tokens": 7268, + "batch_num_samples": 149, + "batch_num_tokens": 52178, + "epoch": 0.20727, + "grad_norm": 3.666080951690674, + "learning_rate": 1.9999194411028596e-05, + "loss": 2.1414, + "step": 228 + }, + { + "batch_num_effect_tokens": 7291, + "batch_num_samples": 150, + "batch_num_tokens": 52193, + "epoch": 0.20818, + "grad_norm": 3.8996286392211914, + "learning_rate": 1.9998980430094333e-05, + "loss": 2.2034, + "step": 229 + }, + { + "batch_num_effect_tokens": 5587, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 0.20909, + "grad_norm": 3.8308238983154297, + "learning_rate": 1.9998741276738753e-05, + "loss": 1.9367, + "step": 230 + }, + { + "batch_num_effect_tokens": 8107, + "batch_num_samples": 149, + "batch_num_tokens": 52123, + "epoch": 0.21, + "grad_norm": 3.7341339588165283, + "learning_rate": 1.9998476951563914e-05, + "loss": 2.2329, + "step": 231 + }, + { + "batch_num_effect_tokens": 5474, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.21091, + "grad_norm": 3.69163179397583, + "learning_rate": 1.999818745523526e-05, + "loss": 1.5736, + "step": 232 + }, + { + "batch_num_effect_tokens": 5578, + "batch_num_samples": 149, + "batch_num_tokens": 52187, + "epoch": 0.21182, + "grad_norm": 4.45286226272583, + "learning_rate": 1.9997872788481595e-05, + "loss": 1.9841, + "step": 233 + }, + { + "batch_num_effect_tokens": 5678, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.21273, + "grad_norm": 3.889274835586548, + "learning_rate": 1.9997532952095093e-05, + "loss": 1.7809, + "step": 234 + }, + { + "batch_num_effect_tokens": 5605, + "batch_num_samples": 149, + "batch_num_tokens": 52141, + "epoch": 0.21364, + "grad_norm": 3.6350696086883545, + "learning_rate": 1.9997167946931293e-05, + "loss": 1.9915, + "step": 235 + }, + { + "batch_num_effect_tokens": 7913, + "batch_num_samples": 150, + "batch_num_tokens": 52187, + "epoch": 0.21455, + "grad_norm": 4.61751127243042, + "learning_rate": 1.9996777773909093e-05, + "loss": 2.2168, + "step": 236 + }, + { + "batch_num_effect_tokens": 6433, + "batch_num_samples": 149, + "batch_num_tokens": 52169, + "epoch": 0.21545, + "grad_norm": 3.994758129119873, + "learning_rate": 1.9996362434010754e-05, + "loss": 1.9697, + "step": 237 + }, + { + "batch_num_effect_tokens": 5781, + "batch_num_samples": 149, + "batch_num_tokens": 52146, + "epoch": 0.21636, + "grad_norm": 3.7514944076538086, + "learning_rate": 1.9995921928281893e-05, + "loss": 2.1329, + "step": 238 + }, + { + "batch_num_effect_tokens": 5847, + "batch_num_samples": 149, + "batch_num_tokens": 52112, + "epoch": 0.21727, + "grad_norm": 3.858617067337036, + "learning_rate": 1.9995456257831484e-05, + "loss": 2.0299, + "step": 239 + }, + { + "batch_num_effect_tokens": 7325, + "batch_num_samples": 149, + "batch_num_tokens": 50555, + "epoch": 0.21818, + "grad_norm": 3.4423437118530273, + "learning_rate": 1.9994965423831853e-05, + "loss": 2.0997, + "step": 240 + }, + { + "batch_num_effect_tokens": 5637, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.21909, + "grad_norm": 4.060293197631836, + "learning_rate": 1.999444942751867e-05, + "loss": 2.0875, + "step": 241 + }, + { + "batch_num_effect_tokens": 7749, + "batch_num_samples": 149, + "batch_num_tokens": 52166, + "epoch": 0.22, + "grad_norm": 3.5604634284973145, + "learning_rate": 1.999390827019096e-05, + "loss": 2.2539, + "step": 242 + }, + { + "batch_num_effect_tokens": 6767, + "batch_num_samples": 149, + "batch_num_tokens": 52215, + "epoch": 0.22091, + "grad_norm": 3.51718807220459, + "learning_rate": 1.999334195321108e-05, + "loss": 2.155, + "step": 243 + }, + { + "batch_num_effect_tokens": 7804, + "batch_num_samples": 149, + "batch_num_tokens": 52187, + "epoch": 0.22182, + "grad_norm": 3.5282416343688965, + "learning_rate": 1.999275047800474e-05, + "loss": 2.2269, + "step": 244 + }, + { + "batch_num_effect_tokens": 6813, + "batch_num_samples": 149, + "batch_num_tokens": 52209, + "epoch": 0.22273, + "grad_norm": 4.269754886627197, + "learning_rate": 1.999213384606097e-05, + "loss": 2.083, + "step": 245 + }, + { + "batch_num_effect_tokens": 7350, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 0.22364, + "grad_norm": 4.033580303192139, + "learning_rate": 1.9991492058932143e-05, + "loss": 2.21, + "step": 246 + }, + { + "batch_num_effect_tokens": 5880, + "batch_num_samples": 150, + "batch_num_tokens": 52143, + "epoch": 0.22455, + "grad_norm": 4.044896125793457, + "learning_rate": 1.9990825118233958e-05, + "loss": 2.257, + "step": 247 + }, + { + "batch_num_effect_tokens": 7649, + "batch_num_samples": 150, + "batch_num_tokens": 52217, + "epoch": 0.22545, + "grad_norm": 3.287102222442627, + "learning_rate": 1.999013302564544e-05, + "loss": 1.96, + "step": 248 + }, + { + "batch_num_effect_tokens": 6009, + "batch_num_samples": 149, + "batch_num_tokens": 52166, + "epoch": 0.22636, + "grad_norm": 3.5989625453948975, + "learning_rate": 1.998941578290893e-05, + "loss": 1.705, + "step": 249 + }, + { + "batch_num_effect_tokens": 6845, + "batch_num_samples": 149, + "batch_num_tokens": 52179, + "epoch": 0.22727, + "grad_norm": 4.358585357666016, + "learning_rate": 1.9988673391830082e-05, + "loss": 2.0284, + "step": 250 + }, + { + "batch_num_effect_tokens": 4556, + "batch_num_samples": 149, + "batch_num_tokens": 52145, + "epoch": 0.22818, + "grad_norm": 3.9450693130493164, + "learning_rate": 1.9987905854277867e-05, + "loss": 1.6541, + "step": 251 + }, + { + "batch_num_effect_tokens": 4540, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 0.22909, + "grad_norm": 3.910506010055542, + "learning_rate": 1.9987113172184562e-05, + "loss": 1.5919, + "step": 252 + }, + { + "batch_num_effect_tokens": 6801, + "batch_num_samples": 149, + "batch_num_tokens": 50546, + "epoch": 0.23, + "grad_norm": 3.481827974319458, + "learning_rate": 1.9986295347545738e-05, + "loss": 1.9684, + "step": 253 + }, + { + "batch_num_effect_tokens": 5553, + "batch_num_samples": 149, + "batch_num_tokens": 52132, + "epoch": 0.23091, + "grad_norm": 4.6898674964904785, + "learning_rate": 1.9985452382420277e-05, + "loss": 1.9056, + "step": 254 + }, + { + "batch_num_effect_tokens": 7498, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 0.23182, + "grad_norm": 3.7411327362060547, + "learning_rate": 1.9984584278930333e-05, + "loss": 1.7946, + "step": 255 + }, + { + "batch_num_effect_tokens": 5376, + "batch_num_samples": 149, + "batch_num_tokens": 52135, + "epoch": 0.23273, + "grad_norm": 4.128430366516113, + "learning_rate": 1.9983691039261358e-05, + "loss": 1.9899, + "step": 256 + }, + { + "batch_num_effect_tokens": 4172, + "batch_num_samples": 149, + "batch_num_tokens": 52121, + "epoch": 0.23364, + "grad_norm": 3.863016128540039, + "learning_rate": 1.9982772665662083e-05, + "loss": 1.4849, + "step": 257 + }, + { + "batch_num_effect_tokens": 5703, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 0.23455, + "grad_norm": 4.168217658996582, + "learning_rate": 1.9981829160444515e-05, + "loss": 2.035, + "step": 258 + }, + { + "batch_num_effect_tokens": 6098, + "batch_num_samples": 149, + "batch_num_tokens": 52178, + "epoch": 0.23545, + "grad_norm": 4.579566478729248, + "learning_rate": 1.9980860525983924e-05, + "loss": 2.1822, + "step": 259 + }, + { + "batch_num_effect_tokens": 4666, + "batch_num_samples": 149, + "batch_num_tokens": 52220, + "epoch": 0.23636, + "grad_norm": 3.742579698562622, + "learning_rate": 1.9979866764718846e-05, + "loss": 1.4695, + "step": 260 + }, + { + "batch_num_effect_tokens": 6150, + "batch_num_samples": 149, + "batch_num_tokens": 52179, + "epoch": 0.23727, + "grad_norm": 4.352850437164307, + "learning_rate": 1.9978847879151076e-05, + "loss": 2.0964, + "step": 261 + }, + { + "batch_num_effect_tokens": 8410, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 0.23818, + "grad_norm": 3.5962162017822266, + "learning_rate": 1.997780387184565e-05, + "loss": 2.375, + "step": 262 + }, + { + "batch_num_effect_tokens": 6367, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 0.23909, + "grad_norm": 3.5124571323394775, + "learning_rate": 1.997673474543087e-05, + "loss": 1.8841, + "step": 263 + }, + { + "batch_num_effect_tokens": 6349, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 0.24, + "grad_norm": 3.808424234390259, + "learning_rate": 1.9975640502598243e-05, + "loss": 2.02, + "step": 264 + }, + { + "batch_num_effect_tokens": 7481, + "batch_num_samples": 149, + "batch_num_tokens": 52132, + "epoch": 0.24091, + "grad_norm": 3.9207777976989746, + "learning_rate": 1.9974521146102535e-05, + "loss": 2.2218, + "step": 265 + }, + { + "batch_num_effect_tokens": 8286, + "batch_num_samples": 150, + "batch_num_tokens": 52178, + "epoch": 0.24182, + "grad_norm": 3.6245782375335693, + "learning_rate": 1.9973376678761726e-05, + "loss": 2.146, + "step": 266 + }, + { + "batch_num_effect_tokens": 8987, + "batch_num_samples": 149, + "batch_num_tokens": 52170, + "epoch": 0.24273, + "grad_norm": 3.7090020179748535, + "learning_rate": 1.9972207103457e-05, + "loss": 2.3555, + "step": 267 + }, + { + "batch_num_effect_tokens": 7652, + "batch_num_samples": 150, + "batch_num_tokens": 52216, + "epoch": 0.24364, + "grad_norm": 3.7007925510406494, + "learning_rate": 1.9971012423132776e-05, + "loss": 2.201, + "step": 268 + }, + { + "batch_num_effect_tokens": 5345, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 0.24455, + "grad_norm": 3.480968952178955, + "learning_rate": 1.996979264079665e-05, + "loss": 1.8025, + "step": 269 + }, + { + "batch_num_effect_tokens": 6528, + "batch_num_samples": 149, + "batch_num_tokens": 52139, + "epoch": 0.24545, + "grad_norm": 4.5325398445129395, + "learning_rate": 1.9968547759519426e-05, + "loss": 2.0981, + "step": 270 + }, + { + "batch_num_effect_tokens": 5371, + "batch_num_samples": 149, + "batch_num_tokens": 52212, + "epoch": 0.24636, + "grad_norm": 4.283621788024902, + "learning_rate": 1.996727778243509e-05, + "loss": 2.0344, + "step": 271 + }, + { + "batch_num_effect_tokens": 3862, + "batch_num_samples": 149, + "batch_num_tokens": 52130, + "epoch": 0.24727, + "grad_norm": 4.2085280418396, + "learning_rate": 1.996598271274081e-05, + "loss": 1.2084, + "step": 272 + }, + { + "batch_num_effect_tokens": 5831, + "batch_num_samples": 149, + "batch_num_tokens": 52196, + "epoch": 0.24818, + "grad_norm": 3.9223201274871826, + "learning_rate": 1.9964662553696915e-05, + "loss": 1.9943, + "step": 273 + }, + { + "batch_num_effect_tokens": 7808, + "batch_num_samples": 149, + "batch_num_tokens": 52215, + "epoch": 0.24909, + "grad_norm": 3.444143295288086, + "learning_rate": 1.9963317308626916e-05, + "loss": 2.0967, + "step": 274 + }, + { + "batch_num_effect_tokens": 5857, + "batch_num_samples": 149, + "batch_num_tokens": 52198, + "epoch": 0.25, + "grad_norm": 3.47464919090271, + "learning_rate": 1.9961946980917457e-05, + "loss": 1.95, + "step": 275 + }, + { + "batch_num_effect_tokens": 6623, + "batch_num_samples": 149, + "batch_num_tokens": 52193, + "epoch": 0.25091, + "grad_norm": 3.51035475730896, + "learning_rate": 1.996055157401834e-05, + "loss": 2.0641, + "step": 276 + }, + { + "batch_num_effect_tokens": 5545, + "batch_num_samples": 149, + "batch_num_tokens": 52118, + "epoch": 0.25182, + "grad_norm": 3.6571717262268066, + "learning_rate": 1.9959131091442497e-05, + "loss": 1.8225, + "step": 277 + }, + { + "batch_num_effect_tokens": 5112, + "batch_num_samples": 149, + "batch_num_tokens": 52183, + "epoch": 0.25273, + "grad_norm": 3.652961015701294, + "learning_rate": 1.9957685536765998e-05, + "loss": 1.7117, + "step": 278 + }, + { + "batch_num_effect_tokens": 4869, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 0.25364, + "grad_norm": 4.006814479827881, + "learning_rate": 1.9956214913628015e-05, + "loss": 1.7309, + "step": 279 + }, + { + "batch_num_effect_tokens": 7287, + "batch_num_samples": 149, + "batch_num_tokens": 52161, + "epoch": 0.25455, + "grad_norm": 3.6142709255218506, + "learning_rate": 1.9954719225730847e-05, + "loss": 1.9263, + "step": 280 + }, + { + "batch_num_effect_tokens": 6941, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 0.25545, + "grad_norm": 3.489576816558838, + "learning_rate": 1.9953198476839886e-05, + "loss": 2.0953, + "step": 281 + }, + { + "batch_num_effect_tokens": 6004, + "batch_num_samples": 149, + "batch_num_tokens": 50545, + "epoch": 0.25636, + "grad_norm": 4.156979560852051, + "learning_rate": 1.9951652670783615e-05, + "loss": 2.1339, + "step": 282 + }, + { + "batch_num_effect_tokens": 7416, + "batch_num_samples": 149, + "batch_num_tokens": 52221, + "epoch": 0.25727, + "grad_norm": 3.9066414833068848, + "learning_rate": 1.9950081811453598e-05, + "loss": 2.3405, + "step": 283 + }, + { + "batch_num_effect_tokens": 4681, + "batch_num_samples": 149, + "batch_num_tokens": 52220, + "epoch": 0.25818, + "grad_norm": 4.091082572937012, + "learning_rate": 1.9948485902804472e-05, + "loss": 1.6512, + "step": 284 + }, + { + "batch_num_effect_tokens": 8297, + "batch_num_samples": 149, + "batch_num_tokens": 52132, + "epoch": 0.25909, + "grad_norm": 3.9799063205718994, + "learning_rate": 1.9946864948853936e-05, + "loss": 2.4767, + "step": 285 + }, + { + "batch_num_effect_tokens": 7533, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 0.26, + "grad_norm": 3.5255239009857178, + "learning_rate": 1.9945218953682736e-05, + "loss": 2.1628, + "step": 286 + }, + { + "batch_num_effect_tokens": 8860, + "batch_num_samples": 149, + "batch_num_tokens": 52218, + "epoch": 0.26091, + "grad_norm": 3.5023748874664307, + "learning_rate": 1.9943547921434666e-05, + "loss": 2.2402, + "step": 287 + }, + { + "batch_num_effect_tokens": 7771, + "batch_num_samples": 149, + "batch_num_tokens": 52209, + "epoch": 0.26182, + "grad_norm": 3.9955313205718994, + "learning_rate": 1.994185185631655e-05, + "loss": 2.1782, + "step": 288 + }, + { + "batch_num_effect_tokens": 5847, + "batch_num_samples": 149, + "batch_num_tokens": 52146, + "epoch": 0.26273, + "grad_norm": 3.5915133953094482, + "learning_rate": 1.9940130762598224e-05, + "loss": 1.8022, + "step": 289 + }, + { + "batch_num_effect_tokens": 6404, + "batch_num_samples": 149, + "batch_num_tokens": 52209, + "epoch": 0.26364, + "grad_norm": 3.4613535404205322, + "learning_rate": 1.9938384644612542e-05, + "loss": 2.0824, + "step": 290 + }, + { + "batch_num_effect_tokens": 7125, + "batch_num_samples": 149, + "batch_num_tokens": 52148, + "epoch": 0.26455, + "grad_norm": 3.2900590896606445, + "learning_rate": 1.9936613506755357e-05, + "loss": 2.134, + "step": 291 + }, + { + "batch_num_effect_tokens": 6211, + "batch_num_samples": 149, + "batch_num_tokens": 52198, + "epoch": 0.26545, + "grad_norm": 4.248907089233398, + "learning_rate": 1.99348173534855e-05, + "loss": 2.0173, + "step": 292 + }, + { + "batch_num_effect_tokens": 5748, + "batch_num_samples": 149, + "batch_num_tokens": 52149, + "epoch": 0.26636, + "grad_norm": 3.5205302238464355, + "learning_rate": 1.9932996189324796e-05, + "loss": 1.8658, + "step": 293 + }, + { + "batch_num_effect_tokens": 4489, + "batch_num_samples": 149, + "batch_num_tokens": 52163, + "epoch": 0.26727, + "grad_norm": 3.9939587116241455, + "learning_rate": 1.9931150018858013e-05, + "loss": 1.9508, + "step": 294 + }, + { + "batch_num_effect_tokens": 8045, + "batch_num_samples": 150, + "batch_num_tokens": 52142, + "epoch": 0.26818, + "grad_norm": 3.3021066188812256, + "learning_rate": 1.9929278846732883e-05, + "loss": 2.3226, + "step": 295 + }, + { + "batch_num_effect_tokens": 8879, + "batch_num_samples": 150, + "batch_num_tokens": 52155, + "epoch": 0.26909, + "grad_norm": 3.1655590534210205, + "learning_rate": 1.992738267766009e-05, + "loss": 2.3486, + "step": 296 + }, + { + "batch_num_effect_tokens": 7963, + "batch_num_samples": 150, + "batch_num_tokens": 52201, + "epoch": 0.27, + "grad_norm": 3.3706653118133545, + "learning_rate": 1.9925461516413224e-05, + "loss": 2.2756, + "step": 297 + }, + { + "batch_num_effect_tokens": 6110, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 0.27091, + "grad_norm": 3.7976138591766357, + "learning_rate": 1.9923515367828812e-05, + "loss": 2.2094, + "step": 298 + }, + { + "batch_num_effect_tokens": 8280, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 0.27182, + "grad_norm": 3.4359230995178223, + "learning_rate": 1.9921544236806284e-05, + "loss": 2.3017, + "step": 299 + }, + { + "batch_num_effect_tokens": 5775, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 0.27273, + "grad_norm": 4.127292633056641, + "learning_rate": 1.9919548128307954e-05, + "loss": 1.8205, + "step": 300 + }, + { + "batch_num_effect_tokens": 5309, + "batch_num_samples": 149, + "batch_num_tokens": 52134, + "epoch": 0.27364, + "grad_norm": 3.933262348175049, + "learning_rate": 1.991752704735903e-05, + "loss": 1.8377, + "step": 301 + }, + { + "batch_num_effect_tokens": 9964, + "batch_num_samples": 149, + "batch_num_tokens": 52142, + "epoch": 0.27455, + "grad_norm": 4.070802688598633, + "learning_rate": 1.9915480999047573e-05, + "loss": 2.2202, + "step": 302 + }, + { + "batch_num_effect_tokens": 6187, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 0.27545, + "grad_norm": 3.5866167545318604, + "learning_rate": 1.991340998852451e-05, + "loss": 1.9792, + "step": 303 + }, + { + "batch_num_effect_tokens": 5621, + "batch_num_samples": 149, + "batch_num_tokens": 52207, + "epoch": 0.27636, + "grad_norm": 3.8665053844451904, + "learning_rate": 1.9911314021003614e-05, + "loss": 1.8149, + "step": 304 + }, + { + "batch_num_effect_tokens": 5774, + "batch_num_samples": 149, + "batch_num_tokens": 52132, + "epoch": 0.27727, + "grad_norm": 3.8318331241607666, + "learning_rate": 1.990919310176147e-05, + "loss": 1.9496, + "step": 305 + }, + { + "batch_num_effect_tokens": 4571, + "batch_num_samples": 149, + "batch_num_tokens": 52187, + "epoch": 0.27818, + "grad_norm": 4.002640724182129, + "learning_rate": 1.99070472361375e-05, + "loss": 1.8213, + "step": 306 + }, + { + "batch_num_effect_tokens": 5408, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 0.27909, + "grad_norm": 3.516658306121826, + "learning_rate": 1.9904876429533912e-05, + "loss": 1.8694, + "step": 307 + }, + { + "batch_num_effect_tokens": 5948, + "batch_num_samples": 149, + "batch_num_tokens": 52146, + "epoch": 0.28, + "grad_norm": 3.7011466026306152, + "learning_rate": 1.9902680687415704e-05, + "loss": 2.0552, + "step": 308 + }, + { + "batch_num_effect_tokens": 7477, + "batch_num_samples": 150, + "batch_num_tokens": 52212, + "epoch": 0.28091, + "grad_norm": 3.722318649291992, + "learning_rate": 1.9900460015310667e-05, + "loss": 2.2286, + "step": 309 + }, + { + "batch_num_effect_tokens": 6647, + "batch_num_samples": 149, + "batch_num_tokens": 52175, + "epoch": 0.28182, + "grad_norm": 3.540323257446289, + "learning_rate": 1.989821441880933e-05, + "loss": 1.975, + "step": 310 + }, + { + "batch_num_effect_tokens": 7885, + "batch_num_samples": 149, + "batch_num_tokens": 52116, + "epoch": 0.28273, + "grad_norm": 3.5634026527404785, + "learning_rate": 1.989594390356498e-05, + "loss": 2.2363, + "step": 311 + }, + { + "batch_num_effect_tokens": 7326, + "batch_num_samples": 150, + "batch_num_tokens": 52190, + "epoch": 0.28364, + "grad_norm": 3.539261817932129, + "learning_rate": 1.9893648475293646e-05, + "loss": 1.939, + "step": 312 + }, + { + "batch_num_effect_tokens": 3660, + "batch_num_samples": 149, + "batch_num_tokens": 52153, + "epoch": 0.28455, + "grad_norm": 3.6285557746887207, + "learning_rate": 1.9891328139774057e-05, + "loss": 1.2964, + "step": 313 + }, + { + "batch_num_effect_tokens": 9164, + "batch_num_samples": 149, + "batch_num_tokens": 52168, + "epoch": 0.28545, + "grad_norm": 5.367908000946045, + "learning_rate": 1.9888982902847658e-05, + "loss": 2.4264, + "step": 314 + }, + { + "batch_num_effect_tokens": 7064, + "batch_num_samples": 149, + "batch_num_tokens": 52107, + "epoch": 0.28636, + "grad_norm": 3.4953806400299072, + "learning_rate": 1.988661277041858e-05, + "loss": 1.9632, + "step": 315 + }, + { + "batch_num_effect_tokens": 10838, + "batch_num_samples": 149, + "batch_num_tokens": 52183, + "epoch": 0.28727, + "grad_norm": 3.466254472732544, + "learning_rate": 1.9884217748453625e-05, + "loss": 2.5673, + "step": 316 + }, + { + "batch_num_effect_tokens": 8718, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 0.28818, + "grad_norm": 3.52352237701416, + "learning_rate": 1.9881797842982265e-05, + "loss": 2.222, + "step": 317 + }, + { + "batch_num_effect_tokens": 5664, + "batch_num_samples": 149, + "batch_num_tokens": 52173, + "epoch": 0.28909, + "grad_norm": 4.089743137359619, + "learning_rate": 1.98793530600966e-05, + "loss": 1.9174, + "step": 318 + }, + { + "batch_num_effect_tokens": 5837, + "batch_num_samples": 149, + "batch_num_tokens": 52173, + "epoch": 0.29, + "grad_norm": 4.193911552429199, + "learning_rate": 1.9876883405951378e-05, + "loss": 2.004, + "step": 319 + }, + { + "batch_num_effect_tokens": 5214, + "batch_num_samples": 149, + "batch_num_tokens": 52160, + "epoch": 0.29091, + "grad_norm": 4.251591205596924, + "learning_rate": 1.9874388886763944e-05, + "loss": 1.9869, + "step": 320 + }, + { + "batch_num_effect_tokens": 5105, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 0.29182, + "grad_norm": 3.726947546005249, + "learning_rate": 1.987186950881425e-05, + "loss": 1.4984, + "step": 321 + }, + { + "batch_num_effect_tokens": 7011, + "batch_num_samples": 149, + "batch_num_tokens": 52219, + "epoch": 0.29273, + "grad_norm": 3.730116128921509, + "learning_rate": 1.9869325278444824e-05, + "loss": 2.1577, + "step": 322 + }, + { + "batch_num_effect_tokens": 4520, + "batch_num_samples": 149, + "batch_num_tokens": 52150, + "epoch": 0.29364, + "grad_norm": 3.86989426612854, + "learning_rate": 1.9866756202060764e-05, + "loss": 1.7366, + "step": 323 + }, + { + "batch_num_effect_tokens": 5279, + "batch_num_samples": 149, + "batch_num_tokens": 52164, + "epoch": 0.29455, + "grad_norm": 4.149124622344971, + "learning_rate": 1.986416228612972e-05, + "loss": 1.8388, + "step": 324 + }, + { + "batch_num_effect_tokens": 8876, + "batch_num_samples": 149, + "batch_num_tokens": 52183, + "epoch": 0.29545, + "grad_norm": 5.287720203399658, + "learning_rate": 1.986154353718187e-05, + "loss": 2.1111, + "step": 325 + }, + { + "batch_num_effect_tokens": 7286, + "batch_num_samples": 150, + "batch_num_tokens": 52219, + "epoch": 0.29636, + "grad_norm": 3.4820773601531982, + "learning_rate": 1.9858899961809904e-05, + "loss": 1.877, + "step": 326 + }, + { + "batch_num_effect_tokens": 4332, + "batch_num_samples": 149, + "batch_num_tokens": 52176, + "epoch": 0.29727, + "grad_norm": 3.8096587657928467, + "learning_rate": 1.9856231566669036e-05, + "loss": 1.6738, + "step": 327 + }, + { + "batch_num_effect_tokens": 8761, + "batch_num_samples": 149, + "batch_num_tokens": 52164, + "epoch": 0.29818, + "grad_norm": 4.4504852294921875, + "learning_rate": 1.9853538358476933e-05, + "loss": 2.0797, + "step": 328 + }, + { + "batch_num_effect_tokens": 5662, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 0.29909, + "grad_norm": 4.1594696044921875, + "learning_rate": 1.985082034401375e-05, + "loss": 1.9774, + "step": 329 + }, + { + "batch_num_effect_tokens": 6919, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 0.3, + "grad_norm": 3.5411789417266846, + "learning_rate": 1.9848077530122083e-05, + "loss": 2.0552, + "step": 330 + }, + { + "batch_num_effect_tokens": 7886, + "batch_num_samples": 149, + "batch_num_tokens": 52168, + "epoch": 0.30091, + "grad_norm": 5.033746719360352, + "learning_rate": 1.9845309923706965e-05, + "loss": 1.6951, + "step": 331 + }, + { + "batch_num_effect_tokens": 8567, + "batch_num_samples": 150, + "batch_num_tokens": 52221, + "epoch": 0.30182, + "grad_norm": 4.57145881652832, + "learning_rate": 1.9842517531735837e-05, + "loss": 2.2676, + "step": 332 + }, + { + "batch_num_effect_tokens": 9606, + "batch_num_samples": 149, + "batch_num_tokens": 52180, + "epoch": 0.30273, + "grad_norm": 3.5114569664001465, + "learning_rate": 1.9839700361238548e-05, + "loss": 2.5096, + "step": 333 + }, + { + "batch_num_effect_tokens": 9075, + "batch_num_samples": 149, + "batch_num_tokens": 52213, + "epoch": 0.30364, + "grad_norm": 3.247704029083252, + "learning_rate": 1.9836858419307325e-05, + "loss": 2.3547, + "step": 334 + }, + { + "batch_num_effect_tokens": 6285, + "batch_num_samples": 149, + "batch_num_tokens": 52142, + "epoch": 0.30455, + "grad_norm": 3.7255682945251465, + "learning_rate": 1.9833991713096742e-05, + "loss": 2.1543, + "step": 335 + }, + { + "batch_num_effect_tokens": 8129, + "batch_num_samples": 149, + "batch_num_tokens": 52222, + "epoch": 0.30545, + "grad_norm": 3.3006324768066406, + "learning_rate": 1.9831100249823732e-05, + "loss": 2.0478, + "step": 336 + }, + { + "batch_num_effect_tokens": 5358, + "batch_num_samples": 149, + "batch_num_tokens": 52222, + "epoch": 0.30636, + "grad_norm": 3.799280881881714, + "learning_rate": 1.9828184036767556e-05, + "loss": 1.6325, + "step": 337 + }, + { + "batch_num_effect_tokens": 4368, + "batch_num_samples": 149, + "batch_num_tokens": 52141, + "epoch": 0.30727, + "grad_norm": 4.022277355194092, + "learning_rate": 1.9825243081269778e-05, + "loss": 1.5553, + "step": 338 + }, + { + "batch_num_effect_tokens": 7283, + "batch_num_samples": 149, + "batch_num_tokens": 52137, + "epoch": 0.30818, + "grad_norm": 4.02409553527832, + "learning_rate": 1.982227739073424e-05, + "loss": 2.3756, + "step": 339 + }, + { + "batch_num_effect_tokens": 6278, + "batch_num_samples": 150, + "batch_num_tokens": 52179, + "epoch": 0.30909, + "grad_norm": 3.805237293243408, + "learning_rate": 1.9819286972627066e-05, + "loss": 1.9413, + "step": 340 + }, + { + "batch_num_effect_tokens": 4420, + "batch_num_samples": 149, + "batch_num_tokens": 52195, + "epoch": 0.31, + "grad_norm": 3.8129725456237793, + "learning_rate": 1.9816271834476642e-05, + "loss": 1.432, + "step": 341 + }, + { + "batch_num_effect_tokens": 4993, + "batch_num_samples": 149, + "batch_num_tokens": 52221, + "epoch": 0.31091, + "grad_norm": 3.3998563289642334, + "learning_rate": 1.9813231983873563e-05, + "loss": 1.5523, + "step": 342 + }, + { + "batch_num_effect_tokens": 6330, + "batch_num_samples": 150, + "batch_num_tokens": 52192, + "epoch": 0.31182, + "grad_norm": 4.177271842956543, + "learning_rate": 1.9810167428470653e-05, + "loss": 2.2859, + "step": 343 + }, + { + "batch_num_effect_tokens": 7214, + "batch_num_samples": 149, + "batch_num_tokens": 52182, + "epoch": 0.31273, + "grad_norm": 3.3524763584136963, + "learning_rate": 1.9807078175982925e-05, + "loss": 2.167, + "step": 344 + }, + { + "batch_num_effect_tokens": 7129, + "batch_num_samples": 149, + "batch_num_tokens": 52137, + "epoch": 0.31364, + "grad_norm": 3.9093234539031982, + "learning_rate": 1.980396423418757e-05, + "loss": 2.3973, + "step": 345 + }, + { + "batch_num_effect_tokens": 7229, + "batch_num_samples": 149, + "batch_num_tokens": 52190, + "epoch": 0.31455, + "grad_norm": 3.5451037883758545, + "learning_rate": 1.9800825610923937e-05, + "loss": 2.0647, + "step": 346 + }, + { + "batch_num_effect_tokens": 6494, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 0.31545, + "grad_norm": 3.35791277885437, + "learning_rate": 1.9797662314093496e-05, + "loss": 1.9389, + "step": 347 + }, + { + "batch_num_effect_tokens": 7132, + "batch_num_samples": 149, + "batch_num_tokens": 52143, + "epoch": 0.31636, + "grad_norm": 3.3549084663391113, + "learning_rate": 1.9794474351659854e-05, + "loss": 2.1595, + "step": 348 + }, + { + "batch_num_effect_tokens": 6921, + "batch_num_samples": 149, + "batch_num_tokens": 52212, + "epoch": 0.31727, + "grad_norm": 3.5743157863616943, + "learning_rate": 1.9791261731648694e-05, + "loss": 2.1689, + "step": 349 + }, + { + "batch_num_effect_tokens": 5448, + "batch_num_samples": 149, + "batch_num_tokens": 52138, + "epoch": 0.31818, + "grad_norm": 3.6076207160949707, + "learning_rate": 1.978802446214779e-05, + "loss": 1.9005, + "step": 350 + }, + { + "batch_num_effect_tokens": 4677, + "batch_num_samples": 149, + "batch_num_tokens": 50568, + "epoch": 0.31909, + "grad_norm": 3.772836208343506, + "learning_rate": 1.978476255130696e-05, + "loss": 1.8154, + "step": 351 + }, + { + "batch_num_effect_tokens": 6046, + "batch_num_samples": 149, + "batch_num_tokens": 52138, + "epoch": 0.32, + "grad_norm": 3.8440117835998535, + "learning_rate": 1.9781476007338058e-05, + "loss": 2.0698, + "step": 352 + }, + { + "batch_num_effect_tokens": 3892, + "batch_num_samples": 149, + "batch_num_tokens": 52162, + "epoch": 0.32091, + "grad_norm": 4.189990043640137, + "learning_rate": 1.977816483851496e-05, + "loss": 1.6277, + "step": 353 + }, + { + "batch_num_effect_tokens": 7533, + "batch_num_samples": 149, + "batch_num_tokens": 52177, + "epoch": 0.32182, + "grad_norm": 3.25177001953125, + "learning_rate": 1.977482905317353e-05, + "loss": 2.2104, + "step": 354 + }, + { + "batch_num_effect_tokens": 6350, + "batch_num_samples": 149, + "batch_num_tokens": 52139, + "epoch": 0.32273, + "grad_norm": 3.4130544662475586, + "learning_rate": 1.9771468659711595e-05, + "loss": 1.9284, + "step": 355 + }, + { + "batch_num_effect_tokens": 5356, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 0.32364, + "grad_norm": 3.7726941108703613, + "learning_rate": 1.9768083666588954e-05, + "loss": 1.9012, + "step": 356 + }, + { + "batch_num_effect_tokens": 5176, + "batch_num_samples": 149, + "batch_num_tokens": 52126, + "epoch": 0.32455, + "grad_norm": 3.8374075889587402, + "learning_rate": 1.9764674082327313e-05, + "loss": 1.7814, + "step": 357 + }, + { + "batch_num_effect_tokens": 4955, + "batch_num_samples": 149, + "batch_num_tokens": 52154, + "epoch": 0.32545, + "grad_norm": 4.390801429748535, + "learning_rate": 1.9761239915510302e-05, + "loss": 1.652, + "step": 358 + }, + { + "batch_num_effect_tokens": 4207, + "batch_num_samples": 149, + "batch_num_tokens": 52146, + "epoch": 0.32636, + "grad_norm": 3.5614068508148193, + "learning_rate": 1.975778117478343e-05, + "loss": 1.4238, + "step": 359 + }, + { + "batch_num_effect_tokens": 6837, + "batch_num_samples": 149, + "batch_num_tokens": 52200, + "epoch": 0.32727, + "grad_norm": 3.47572660446167, + "learning_rate": 1.9754297868854075e-05, + "loss": 2.0447, + "step": 360 + }, + { + "batch_num_effect_tokens": 9300, + "batch_num_samples": 149, + "batch_num_tokens": 52117, + "epoch": 0.32818, + "grad_norm": 4.148090362548828, + "learning_rate": 1.9750790006491447e-05, + "loss": 2.416, + "step": 361 + }, + { + "batch_num_effect_tokens": 6922, + "batch_num_samples": 149, + "batch_num_tokens": 52221, + "epoch": 0.32909, + "grad_norm": 3.374837636947632, + "learning_rate": 1.9747257596526594e-05, + "loss": 1.7274, + "step": 362 + }, + { + "batch_num_effect_tokens": 6355, + "batch_num_samples": 149, + "batch_num_tokens": 52142, + "epoch": 0.33, + "grad_norm": 3.4720685482025146, + "learning_rate": 1.9743700647852356e-05, + "loss": 2.0928, + "step": 363 + }, + { + "batch_num_effect_tokens": 5826, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.33091, + "grad_norm": 3.680713176727295, + "learning_rate": 1.9740119169423337e-05, + "loss": 1.75, + "step": 364 + }, + { + "batch_num_effect_tokens": 7576, + "batch_num_samples": 149, + "batch_num_tokens": 52209, + "epoch": 0.33182, + "grad_norm": 3.1444408893585205, + "learning_rate": 1.973651317025591e-05, + "loss": 2.1317, + "step": 365 + }, + { + "batch_num_effect_tokens": 8806, + "batch_num_samples": 149, + "batch_num_tokens": 52175, + "epoch": 0.33273, + "grad_norm": 3.1442887783050537, + "learning_rate": 1.973288265942818e-05, + "loss": 2.3923, + "step": 366 + }, + { + "batch_num_effect_tokens": 7476, + "batch_num_samples": 149, + "batch_num_tokens": 52139, + "epoch": 0.33364, + "grad_norm": 3.6415657997131348, + "learning_rate": 1.9729227646079942e-05, + "loss": 2.3736, + "step": 367 + }, + { + "batch_num_effect_tokens": 3792, + "batch_num_samples": 149, + "batch_num_tokens": 52143, + "epoch": 0.33455, + "grad_norm": 3.8284006118774414, + "learning_rate": 1.9725548139412693e-05, + "loss": 1.3342, + "step": 368 + }, + { + "batch_num_effect_tokens": 6077, + "batch_num_samples": 149, + "batch_num_tokens": 52152, + "epoch": 0.33545, + "grad_norm": 3.79321551322937, + "learning_rate": 1.972184414868959e-05, + "loss": 2.0298, + "step": 369 + }, + { + "batch_num_effect_tokens": 6096, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 0.33636, + "grad_norm": 4.172370433807373, + "learning_rate": 1.9718115683235418e-05, + "loss": 1.912, + "step": 370 + }, + { + "batch_num_effect_tokens": 5990, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 0.33727, + "grad_norm": 3.5405514240264893, + "learning_rate": 1.971436275243659e-05, + "loss": 1.9807, + "step": 371 + }, + { + "batch_num_effect_tokens": 6166, + "batch_num_samples": 149, + "batch_num_tokens": 52198, + "epoch": 0.33818, + "grad_norm": 3.740248680114746, + "learning_rate": 1.9710585365741105e-05, + "loss": 2.1228, + "step": 372 + }, + { + "batch_num_effect_tokens": 8286, + "batch_num_samples": 149, + "batch_num_tokens": 52137, + "epoch": 0.33909, + "grad_norm": 3.0704259872436523, + "learning_rate": 1.9706783532658528e-05, + "loss": 2.2266, + "step": 373 + }, + { + "batch_num_effect_tokens": 8179, + "batch_num_samples": 149, + "batch_num_tokens": 52137, + "epoch": 0.34, + "grad_norm": 3.2725648880004883, + "learning_rate": 1.9702957262759964e-05, + "loss": 2.2335, + "step": 374 + }, + { + "batch_num_effect_tokens": 6548, + "batch_num_samples": 149, + "batch_num_tokens": 52222, + "epoch": 0.34091, + "grad_norm": 3.4414355754852295, + "learning_rate": 1.969910656567805e-05, + "loss": 2.115, + "step": 375 + }, + { + "batch_num_effect_tokens": 10572, + "batch_num_samples": 149, + "batch_num_tokens": 52170, + "epoch": 0.34182, + "grad_norm": 3.2218399047851562, + "learning_rate": 1.9695231451106914e-05, + "loss": 2.415, + "step": 376 + }, + { + "batch_num_effect_tokens": 7220, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.34273, + "grad_norm": 3.478093147277832, + "learning_rate": 1.9691331928802144e-05, + "loss": 2.2834, + "step": 377 + }, + { + "batch_num_effect_tokens": 6895, + "batch_num_samples": 149, + "batch_num_tokens": 52133, + "epoch": 0.34364, + "grad_norm": 3.4649593830108643, + "learning_rate": 1.9687408008580785e-05, + "loss": 1.9373, + "step": 378 + }, + { + "batch_num_effect_tokens": 4249, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.34455, + "grad_norm": 3.788116455078125, + "learning_rate": 1.9683459700321305e-05, + "loss": 1.7062, + "step": 379 + }, + { + "batch_num_effect_tokens": 7538, + "batch_num_samples": 149, + "batch_num_tokens": 52137, + "epoch": 0.34545, + "grad_norm": 3.7470011711120605, + "learning_rate": 1.9679487013963566e-05, + "loss": 2.2533, + "step": 380 + }, + { + "batch_num_effect_tokens": 7912, + "batch_num_samples": 149, + "batch_num_tokens": 52135, + "epoch": 0.34636, + "grad_norm": 3.405688524246216, + "learning_rate": 1.9675489959508794e-05, + "loss": 2.088, + "step": 381 + }, + { + "batch_num_effect_tokens": 4731, + "batch_num_samples": 149, + "batch_num_tokens": 52146, + "epoch": 0.34727, + "grad_norm": 3.5091192722320557, + "learning_rate": 1.9671468547019575e-05, + "loss": 1.4163, + "step": 382 + }, + { + "batch_num_effect_tokens": 6690, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 0.34818, + "grad_norm": 3.5613765716552734, + "learning_rate": 1.9667422786619804e-05, + "loss": 1.9267, + "step": 383 + }, + { + "batch_num_effect_tokens": 5752, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 0.34909, + "grad_norm": 3.2376224994659424, + "learning_rate": 1.9663352688494686e-05, + "loss": 1.5661, + "step": 384 + }, + { + "batch_num_effect_tokens": 5535, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.35, + "grad_norm": 3.3313965797424316, + "learning_rate": 1.9659258262890683e-05, + "loss": 1.4459, + "step": 385 + }, + { + "batch_num_effect_tokens": 6964, + "batch_num_samples": 149, + "batch_num_tokens": 52189, + "epoch": 0.35091, + "grad_norm": 4.082736968994141, + "learning_rate": 1.965513952011551e-05, + "loss": 2.1208, + "step": 386 + }, + { + "batch_num_effect_tokens": 4936, + "batch_num_samples": 149, + "batch_num_tokens": 52141, + "epoch": 0.35182, + "grad_norm": 3.491699695587158, + "learning_rate": 1.9650996470538093e-05, + "loss": 1.5283, + "step": 387 + }, + { + "batch_num_effect_tokens": 6667, + "batch_num_samples": 149, + "batch_num_tokens": 52168, + "epoch": 0.35273, + "grad_norm": 6.086740016937256, + "learning_rate": 1.964682912458856e-05, + "loss": 1.8542, + "step": 388 + }, + { + "batch_num_effect_tokens": 7105, + "batch_num_samples": 149, + "batch_num_tokens": 52159, + "epoch": 0.35364, + "grad_norm": 3.77236008644104, + "learning_rate": 1.9642637492758193e-05, + "loss": 2.1419, + "step": 389 + }, + { + "batch_num_effect_tokens": 6647, + "batch_num_samples": 149, + "batch_num_tokens": 52127, + "epoch": 0.35455, + "grad_norm": 3.3697710037231445, + "learning_rate": 1.9638421585599422e-05, + "loss": 2.0109, + "step": 390 + }, + { + "batch_num_effect_tokens": 8727, + "batch_num_samples": 149, + "batch_num_tokens": 52155, + "epoch": 0.35545, + "grad_norm": 3.1939914226531982, + "learning_rate": 1.963418141372579e-05, + "loss": 2.136, + "step": 391 + }, + { + "batch_num_effect_tokens": 4262, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 0.35636, + "grad_norm": 3.370107889175415, + "learning_rate": 1.9629916987811924e-05, + "loss": 1.3332, + "step": 392 + }, + { + "batch_num_effect_tokens": 5487, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.35727, + "grad_norm": 4.085422039031982, + "learning_rate": 1.9625628318593514e-05, + "loss": 2.1505, + "step": 393 + }, + { + "batch_num_effect_tokens": 6407, + "batch_num_samples": 149, + "batch_num_tokens": 52220, + "epoch": 0.35818, + "grad_norm": 3.593017578125, + "learning_rate": 1.9621315416867274e-05, + "loss": 2.0573, + "step": 394 + }, + { + "batch_num_effect_tokens": 7014, + "batch_num_samples": 149, + "batch_num_tokens": 52186, + "epoch": 0.35909, + "grad_norm": 3.438126564025879, + "learning_rate": 1.961697829349093e-05, + "loss": 2.1716, + "step": 395 + }, + { + "batch_num_effect_tokens": 5753, + "batch_num_samples": 149, + "batch_num_tokens": 50567, + "epoch": 0.36, + "grad_norm": 3.483640432357788, + "learning_rate": 1.961261695938319e-05, + "loss": 1.7637, + "step": 396 + }, + { + "batch_num_effect_tokens": 3818, + "batch_num_samples": 149, + "batch_num_tokens": 52125, + "epoch": 0.36091, + "grad_norm": 3.637179136276245, + "learning_rate": 1.9608231425523702e-05, + "loss": 1.425, + "step": 397 + }, + { + "batch_num_effect_tokens": 6166, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 0.36182, + "grad_norm": 4.281872272491455, + "learning_rate": 1.9603821702953047e-05, + "loss": 1.8861, + "step": 398 + }, + { + "batch_num_effect_tokens": 6623, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 0.36273, + "grad_norm": 3.598214864730835, + "learning_rate": 1.9599387802772693e-05, + "loss": 2.0335, + "step": 399 + }, + { + "batch_num_effect_tokens": 8465, + "batch_num_samples": 149, + "batch_num_tokens": 52168, + "epoch": 0.36364, + "grad_norm": 6.010089874267578, + "learning_rate": 1.9594929736144978e-05, + "loss": 1.5298, + "step": 400 + }, + { + "batch_num_effect_tokens": 7077, + "batch_num_samples": 150, + "batch_num_tokens": 52194, + "epoch": 0.36455, + "grad_norm": 3.569758415222168, + "learning_rate": 1.959044751429308e-05, + "loss": 1.9929, + "step": 401 + }, + { + "batch_num_effect_tokens": 11051, + "batch_num_samples": 149, + "batch_num_tokens": 52181, + "epoch": 0.36545, + "grad_norm": 3.3714101314544678, + "learning_rate": 1.9585941148500987e-05, + "loss": 2.5957, + "step": 402 + }, + { + "batch_num_effect_tokens": 6003, + "batch_num_samples": 149, + "batch_num_tokens": 50555, + "epoch": 0.36636, + "grad_norm": 3.890573501586914, + "learning_rate": 1.958141065011347e-05, + "loss": 2.0343, + "step": 403 + }, + { + "batch_num_effect_tokens": 6037, + "batch_num_samples": 149, + "batch_num_tokens": 52220, + "epoch": 0.36727, + "grad_norm": 5.133095741271973, + "learning_rate": 1.9576856030536055e-05, + "loss": 1.7706, + "step": 404 + }, + { + "batch_num_effect_tokens": 5590, + "batch_num_samples": 149, + "batch_num_tokens": 52171, + "epoch": 0.36818, + "grad_norm": 3.661139726638794, + "learning_rate": 1.957227730123499e-05, + "loss": 1.9668, + "step": 405 + }, + { + "batch_num_effect_tokens": 5160, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 0.36909, + "grad_norm": 3.9254584312438965, + "learning_rate": 1.956767447373722e-05, + "loss": 1.7646, + "step": 406 + }, + { + "batch_num_effect_tokens": 7085, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 0.37, + "grad_norm": 9.332328796386719, + "learning_rate": 1.9563047559630356e-05, + "loss": 1.9354, + "step": 407 + }, + { + "batch_num_effect_tokens": 7484, + "batch_num_samples": 149, + "batch_num_tokens": 52207, + "epoch": 0.37091, + "grad_norm": 3.1720235347747803, + "learning_rate": 1.955839657056265e-05, + "loss": 2.0569, + "step": 408 + }, + { + "batch_num_effect_tokens": 4746, + "batch_num_samples": 149, + "batch_num_tokens": 52195, + "epoch": 0.37182, + "grad_norm": 3.7923851013183594, + "learning_rate": 1.955372151824297e-05, + "loss": 1.7236, + "step": 409 + }, + { + "batch_num_effect_tokens": 4527, + "batch_num_samples": 149, + "batch_num_tokens": 52179, + "epoch": 0.37273, + "grad_norm": 4.349642276763916, + "learning_rate": 1.9549022414440738e-05, + "loss": 1.5834, + "step": 410 + }, + { + "batch_num_effect_tokens": 4718, + "batch_num_samples": 149, + "batch_num_tokens": 50544, + "epoch": 0.37364, + "grad_norm": 3.7531752586364746, + "learning_rate": 1.9544299270985958e-05, + "loss": 1.6154, + "step": 411 + }, + { + "batch_num_effect_tokens": 8186, + "batch_num_samples": 149, + "batch_num_tokens": 50591, + "epoch": 0.37455, + "grad_norm": 15.484469413757324, + "learning_rate": 1.9539552099769128e-05, + "loss": 2.3389, + "step": 412 + }, + { + "batch_num_effect_tokens": 5516, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.37545, + "grad_norm": 3.6566741466522217, + "learning_rate": 1.953478091274125e-05, + "loss": 1.6849, + "step": 413 + }, + { + "batch_num_effect_tokens": 8184, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 0.37636, + "grad_norm": 3.60722279548645, + "learning_rate": 1.952998572191378e-05, + "loss": 2.1157, + "step": 414 + }, + { + "batch_num_effect_tokens": 6107, + "batch_num_samples": 149, + "batch_num_tokens": 52124, + "epoch": 0.37727, + "grad_norm": 3.568403720855713, + "learning_rate": 1.9525166539358608e-05, + "loss": 1.8607, + "step": 415 + }, + { + "batch_num_effect_tokens": 5373, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 0.37818, + "grad_norm": 3.696733236312866, + "learning_rate": 1.9520323377208017e-05, + "loss": 1.6664, + "step": 416 + }, + { + "batch_num_effect_tokens": 5750, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 0.37909, + "grad_norm": 4.781795978546143, + "learning_rate": 1.951545624765466e-05, + "loss": 1.9601, + "step": 417 + }, + { + "batch_num_effect_tokens": 6467, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 0.38, + "grad_norm": 3.721050262451172, + "learning_rate": 1.9510565162951538e-05, + "loss": 1.7897, + "step": 418 + }, + { + "batch_num_effect_tokens": 5158, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 0.38091, + "grad_norm": 3.6470894813537598, + "learning_rate": 1.950565013541194e-05, + "loss": 1.8109, + "step": 419 + }, + { + "batch_num_effect_tokens": 7931, + "batch_num_samples": 149, + "batch_num_tokens": 52174, + "epoch": 0.38182, + "grad_norm": 3.234846591949463, + "learning_rate": 1.9500711177409456e-05, + "loss": 2.2473, + "step": 420 + }, + { + "batch_num_effect_tokens": 5828, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.38273, + "grad_norm": 3.299785852432251, + "learning_rate": 1.9495748301377895e-05, + "loss": 1.7766, + "step": 421 + }, + { + "batch_num_effect_tokens": 9265, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 0.38364, + "grad_norm": 3.1475846767425537, + "learning_rate": 1.9490761519811295e-05, + "loss": 2.3207, + "step": 422 + }, + { + "batch_num_effect_tokens": 6778, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.38455, + "grad_norm": 3.426121234893799, + "learning_rate": 1.9485750845263874e-05, + "loss": 2.1797, + "step": 423 + }, + { + "batch_num_effect_tokens": 7228, + "batch_num_samples": 149, + "batch_num_tokens": 52187, + "epoch": 0.38545, + "grad_norm": 3.2111194133758545, + "learning_rate": 1.9480716290349998e-05, + "loss": 1.9292, + "step": 424 + }, + { + "batch_num_effect_tokens": 4729, + "batch_num_samples": 149, + "batch_num_tokens": 52209, + "epoch": 0.38636, + "grad_norm": 4.0305047035217285, + "learning_rate": 1.9475657867744153e-05, + "loss": 1.6335, + "step": 425 + }, + { + "batch_num_effect_tokens": 6369, + "batch_num_samples": 149, + "batch_num_tokens": 52150, + "epoch": 0.38727, + "grad_norm": 3.4517645835876465, + "learning_rate": 1.947057559018091e-05, + "loss": 1.9695, + "step": 426 + }, + { + "batch_num_effect_tokens": 3094, + "batch_num_samples": 149, + "batch_num_tokens": 52180, + "epoch": 0.38818, + "grad_norm": 3.5539233684539795, + "learning_rate": 1.94654694704549e-05, + "loss": 0.765, + "step": 427 + }, + { + "batch_num_effect_tokens": 5971, + "batch_num_samples": 149, + "batch_num_tokens": 52141, + "epoch": 0.38909, + "grad_norm": 3.3859667778015137, + "learning_rate": 1.946033952142077e-05, + "loss": 1.6888, + "step": 428 + }, + { + "batch_num_effect_tokens": 7126, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 0.39, + "grad_norm": 3.397372007369995, + "learning_rate": 1.945518575599317e-05, + "loss": 2.0718, + "step": 429 + }, + { + "batch_num_effect_tokens": 8198, + "batch_num_samples": 150, + "batch_num_tokens": 52143, + "epoch": 0.39091, + "grad_norm": 3.590057373046875, + "learning_rate": 1.9450008187146685e-05, + "loss": 2.2892, + "step": 430 + }, + { + "batch_num_effect_tokens": 5553, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 0.39182, + "grad_norm": 3.60089373588562, + "learning_rate": 1.9444806827915848e-05, + "loss": 1.8969, + "step": 431 + }, + { + "batch_num_effect_tokens": 6068, + "batch_num_samples": 150, + "batch_num_tokens": 52166, + "epoch": 0.39273, + "grad_norm": 3.4060354232788086, + "learning_rate": 1.943958169139507e-05, + "loss": 1.858, + "step": 432 + }, + { + "batch_num_effect_tokens": 8030, + "batch_num_samples": 149, + "batch_num_tokens": 52152, + "epoch": 0.39364, + "grad_norm": 3.4720919132232666, + "learning_rate": 1.9434332790738625e-05, + "loss": 2.2429, + "step": 433 + }, + { + "batch_num_effect_tokens": 4799, + "batch_num_samples": 149, + "batch_num_tokens": 52118, + "epoch": 0.39455, + "grad_norm": 3.771951913833618, + "learning_rate": 1.942906013916062e-05, + "loss": 1.7619, + "step": 434 + }, + { + "batch_num_effect_tokens": 5977, + "batch_num_samples": 149, + "batch_num_tokens": 52116, + "epoch": 0.39545, + "grad_norm": 3.652972936630249, + "learning_rate": 1.9423763749934942e-05, + "loss": 1.7407, + "step": 435 + }, + { + "batch_num_effect_tokens": 5247, + "batch_num_samples": 149, + "batch_num_tokens": 52116, + "epoch": 0.39636, + "grad_norm": 3.8835346698760986, + "learning_rate": 1.941844363639525e-05, + "loss": 1.9667, + "step": 436 + }, + { + "batch_num_effect_tokens": 7674, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 0.39727, + "grad_norm": 3.0361108779907227, + "learning_rate": 1.941309981193492e-05, + "loss": 1.94, + "step": 437 + }, + { + "batch_num_effect_tokens": 5285, + "batch_num_samples": 149, + "batch_num_tokens": 52189, + "epoch": 0.39818, + "grad_norm": 3.8913521766662598, + "learning_rate": 1.9407732290007023e-05, + "loss": 2.0395, + "step": 438 + }, + { + "batch_num_effect_tokens": 5503, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.39909, + "grad_norm": 3.5886588096618652, + "learning_rate": 1.9402341084124298e-05, + "loss": 1.7529, + "step": 439 + }, + { + "batch_num_effect_tokens": 7691, + "batch_num_samples": 149, + "batch_num_tokens": 52122, + "epoch": 0.4, + "grad_norm": 3.757056474685669, + "learning_rate": 1.9396926207859085e-05, + "loss": 2.373, + "step": 440 + }, + { + "batch_num_effect_tokens": 4646, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 0.40091, + "grad_norm": 3.726503610610962, + "learning_rate": 1.939148767484334e-05, + "loss": 1.4204, + "step": 441 + }, + { + "batch_num_effect_tokens": 5042, + "batch_num_samples": 149, + "batch_num_tokens": 52222, + "epoch": 0.40182, + "grad_norm": 3.9943902492523193, + "learning_rate": 1.938602549876856e-05, + "loss": 2.0746, + "step": 442 + }, + { + "batch_num_effect_tokens": 5524, + "batch_num_samples": 149, + "batch_num_tokens": 52139, + "epoch": 0.40273, + "grad_norm": 4.357745170593262, + "learning_rate": 1.9380539693385763e-05, + "loss": 1.9158, + "step": 443 + }, + { + "batch_num_effect_tokens": 9174, + "batch_num_samples": 149, + "batch_num_tokens": 50580, + "epoch": 0.40364, + "grad_norm": 3.923656940460205, + "learning_rate": 1.9375030272505463e-05, + "loss": 2.5712, + "step": 444 + }, + { + "batch_num_effect_tokens": 6709, + "batch_num_samples": 149, + "batch_num_tokens": 52189, + "epoch": 0.40455, + "grad_norm": 3.5520644187927246, + "learning_rate": 1.936949724999762e-05, + "loss": 2.0598, + "step": 445 + }, + { + "batch_num_effect_tokens": 6041, + "batch_num_samples": 150, + "batch_num_tokens": 52176, + "epoch": 0.40545, + "grad_norm": 3.5794007778167725, + "learning_rate": 1.9363940639791607e-05, + "loss": 1.9235, + "step": 446 + }, + { + "batch_num_effect_tokens": 6017, + "batch_num_samples": 149, + "batch_num_tokens": 52121, + "epoch": 0.40636, + "grad_norm": 3.34661602973938, + "learning_rate": 1.935836045587619e-05, + "loss": 1.9621, + "step": 447 + }, + { + "batch_num_effect_tokens": 6171, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.40727, + "grad_norm": 3.2858693599700928, + "learning_rate": 1.9352756712299467e-05, + "loss": 1.6668, + "step": 448 + }, + { + "batch_num_effect_tokens": 4035, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 0.40818, + "grad_norm": 4.985914707183838, + "learning_rate": 1.934712942316886e-05, + "loss": 1.3112, + "step": 449 + }, + { + "batch_num_effect_tokens": 6588, + "batch_num_samples": 149, + "batch_num_tokens": 52180, + "epoch": 0.40909, + "grad_norm": 3.5650634765625, + "learning_rate": 1.9341478602651068e-05, + "loss": 1.6175, + "step": 450 + }, + { + "batch_num_effect_tokens": 7081, + "batch_num_samples": 149, + "batch_num_tokens": 52167, + "epoch": 0.41, + "grad_norm": 6.88336181640625, + "learning_rate": 1.9335804264972018e-05, + "loss": 1.4374, + "step": 451 + }, + { + "batch_num_effect_tokens": 6896, + "batch_num_samples": 149, + "batch_num_tokens": 52204, + "epoch": 0.41091, + "grad_norm": 3.438095808029175, + "learning_rate": 1.9330106424416852e-05, + "loss": 2.1431, + "step": 452 + }, + { + "batch_num_effect_tokens": 7606, + "batch_num_samples": 149, + "batch_num_tokens": 52189, + "epoch": 0.41182, + "grad_norm": 3.3056182861328125, + "learning_rate": 1.9324385095329875e-05, + "loss": 2.2578, + "step": 453 + }, + { + "batch_num_effect_tokens": 5966, + "batch_num_samples": 149, + "batch_num_tokens": 52222, + "epoch": 0.41273, + "grad_norm": 3.2087910175323486, + "learning_rate": 1.9318640292114526e-05, + "loss": 1.6353, + "step": 454 + }, + { + "batch_num_effect_tokens": 4157, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 0.41364, + "grad_norm": 3.875227689743042, + "learning_rate": 1.931287202923334e-05, + "loss": 1.3412, + "step": 455 + }, + { + "batch_num_effect_tokens": 5966, + "batch_num_samples": 149, + "batch_num_tokens": 52129, + "epoch": 0.41455, + "grad_norm": 3.8848047256469727, + "learning_rate": 1.9307080321207913e-05, + "loss": 2.1777, + "step": 456 + }, + { + "batch_num_effect_tokens": 7158, + "batch_num_samples": 149, + "batch_num_tokens": 52180, + "epoch": 0.41545, + "grad_norm": 3.2563788890838623, + "learning_rate": 1.9301265182618862e-05, + "loss": 2.1152, + "step": 457 + }, + { + "batch_num_effect_tokens": 8421, + "batch_num_samples": 149, + "batch_num_tokens": 52168, + "epoch": 0.41636, + "grad_norm": 3.4559903144836426, + "learning_rate": 1.9295426628105792e-05, + "loss": 1.502, + "step": 458 + }, + { + "batch_num_effect_tokens": 6188, + "batch_num_samples": 149, + "batch_num_tokens": 52186, + "epoch": 0.41727, + "grad_norm": 3.458390235900879, + "learning_rate": 1.928956467236726e-05, + "loss": 1.7265, + "step": 459 + }, + { + "batch_num_effect_tokens": 6964, + "batch_num_samples": 149, + "batch_num_tokens": 52215, + "epoch": 0.41818, + "grad_norm": 4.283545017242432, + "learning_rate": 1.9283679330160726e-05, + "loss": 2.3138, + "step": 460 + }, + { + "batch_num_effect_tokens": 5525, + "batch_num_samples": 150, + "batch_num_tokens": 52214, + "epoch": 0.41909, + "grad_norm": 3.834029197692871, + "learning_rate": 1.927777061630254e-05, + "loss": 1.7211, + "step": 461 + }, + { + "batch_num_effect_tokens": 6867, + "batch_num_samples": 149, + "batch_num_tokens": 52174, + "epoch": 0.42, + "grad_norm": 3.8594608306884766, + "learning_rate": 1.9271838545667876e-05, + "loss": 2.2086, + "step": 462 + }, + { + "batch_num_effect_tokens": 5732, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.42091, + "grad_norm": 3.368337869644165, + "learning_rate": 1.9265883133190715e-05, + "loss": 1.6516, + "step": 463 + }, + { + "batch_num_effect_tokens": 8993, + "batch_num_samples": 149, + "batch_num_tokens": 52159, + "epoch": 0.42182, + "grad_norm": 3.0719375610351562, + "learning_rate": 1.9259904393863804e-05, + "loss": 2.1589, + "step": 464 + }, + { + "batch_num_effect_tokens": 4547, + "batch_num_samples": 149, + "batch_num_tokens": 52156, + "epoch": 0.42273, + "grad_norm": 3.582049608230591, + "learning_rate": 1.9253902342738612e-05, + "loss": 1.6844, + "step": 465 + }, + { + "batch_num_effect_tokens": 5729, + "batch_num_samples": 149, + "batch_num_tokens": 52135, + "epoch": 0.42364, + "grad_norm": 3.6378562450408936, + "learning_rate": 1.9247876994925293e-05, + "loss": 1.8612, + "step": 466 + }, + { + "batch_num_effect_tokens": 7245, + "batch_num_samples": 149, + "batch_num_tokens": 52135, + "epoch": 0.42455, + "grad_norm": 3.700286626815796, + "learning_rate": 1.9241828365592653e-05, + "loss": 2.0882, + "step": 467 + }, + { + "batch_num_effect_tokens": 5252, + "batch_num_samples": 149, + "batch_num_tokens": 52163, + "epoch": 0.42545, + "grad_norm": 3.388467311859131, + "learning_rate": 1.9235756469968112e-05, + "loss": 1.5082, + "step": 468 + }, + { + "batch_num_effect_tokens": 5824, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 0.42636, + "grad_norm": 3.6123640537261963, + "learning_rate": 1.922966132333766e-05, + "loss": 1.8822, + "step": 469 + }, + { + "batch_num_effect_tokens": 5069, + "batch_num_samples": 149, + "batch_num_tokens": 52156, + "epoch": 0.42727, + "grad_norm": 3.3630306720733643, + "learning_rate": 1.9223542941045817e-05, + "loss": 1.2953, + "step": 470 + }, + { + "batch_num_effect_tokens": 7137, + "batch_num_samples": 149, + "batch_num_tokens": 52185, + "epoch": 0.42818, + "grad_norm": 3.2389023303985596, + "learning_rate": 1.9217401338495605e-05, + "loss": 2.0332, + "step": 471 + }, + { + "batch_num_effect_tokens": 9415, + "batch_num_samples": 149, + "batch_num_tokens": 52170, + "epoch": 0.42909, + "grad_norm": 3.571338176727295, + "learning_rate": 1.92112365311485e-05, + "loss": 2.4274, + "step": 472 + }, + { + "batch_num_effect_tokens": 6512, + "batch_num_samples": 149, + "batch_num_tokens": 52126, + "epoch": 0.43, + "grad_norm": 3.4565165042877197, + "learning_rate": 1.9205048534524405e-05, + "loss": 2.134, + "step": 473 + }, + { + "batch_num_effect_tokens": 6771, + "batch_num_samples": 149, + "batch_num_tokens": 52081, + "epoch": 0.43091, + "grad_norm": 3.388167381286621, + "learning_rate": 1.9198837364201587e-05, + "loss": 2.0009, + "step": 474 + }, + { + "batch_num_effect_tokens": 6701, + "batch_num_samples": 149, + "batch_num_tokens": 52180, + "epoch": 0.43182, + "grad_norm": 3.1186001300811768, + "learning_rate": 1.9192603035816657e-05, + "loss": 2.0379, + "step": 475 + }, + { + "batch_num_effect_tokens": 6036, + "batch_num_samples": 149, + "batch_num_tokens": 52147, + "epoch": 0.43273, + "grad_norm": 3.3869993686676025, + "learning_rate": 1.918634556506454e-05, + "loss": 1.9541, + "step": 476 + }, + { + "batch_num_effect_tokens": 4699, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 0.43364, + "grad_norm": 3.9716951847076416, + "learning_rate": 1.91800649676984e-05, + "loss": 1.6997, + "step": 477 + }, + { + "batch_num_effect_tokens": 5120, + "batch_num_samples": 150, + "batch_num_tokens": 52167, + "epoch": 0.43455, + "grad_norm": 3.8032917976379395, + "learning_rate": 1.9173761259529634e-05, + "loss": 1.7517, + "step": 478 + }, + { + "batch_num_effect_tokens": 5568, + "batch_num_samples": 149, + "batch_num_tokens": 52211, + "epoch": 0.43545, + "grad_norm": 3.4945125579833984, + "learning_rate": 1.916743445642783e-05, + "loss": 1.5496, + "step": 479 + }, + { + "batch_num_effect_tokens": 8009, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 0.43636, + "grad_norm": 3.0792834758758545, + "learning_rate": 1.9161084574320696e-05, + "loss": 2.0386, + "step": 480 + }, + { + "batch_num_effect_tokens": 7660, + "batch_num_samples": 150, + "batch_num_tokens": 52153, + "epoch": 0.43727, + "grad_norm": 3.443307399749756, + "learning_rate": 1.9154711629194062e-05, + "loss": 2.0699, + "step": 481 + }, + { + "batch_num_effect_tokens": 5759, + "batch_num_samples": 149, + "batch_num_tokens": 52222, + "epoch": 0.43818, + "grad_norm": 3.4880309104919434, + "learning_rate": 1.9148315637091805e-05, + "loss": 1.7058, + "step": 482 + }, + { + "batch_num_effect_tokens": 6151, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 0.43909, + "grad_norm": 3.6025350093841553, + "learning_rate": 1.9141896614115824e-05, + "loss": 2.1311, + "step": 483 + }, + { + "batch_num_effect_tokens": 8317, + "batch_num_samples": 149, + "batch_num_tokens": 52166, + "epoch": 0.44, + "grad_norm": 3.1102218627929688, + "learning_rate": 1.913545457642601e-05, + "loss": 2.2208, + "step": 484 + }, + { + "batch_num_effect_tokens": 5579, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 0.44091, + "grad_norm": 3.600101947784424, + "learning_rate": 1.9128989540240178e-05, + "loss": 2.0342, + "step": 485 + }, + { + "batch_num_effect_tokens": 6947, + "batch_num_samples": 149, + "batch_num_tokens": 52157, + "epoch": 0.44182, + "grad_norm": 3.5309343338012695, + "learning_rate": 1.9122501521834052e-05, + "loss": 2.2342, + "step": 486 + }, + { + "batch_num_effect_tokens": 6241, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.44273, + "grad_norm": 3.5790889263153076, + "learning_rate": 1.9115990537541217e-05, + "loss": 2.1857, + "step": 487 + }, + { + "batch_num_effect_tokens": 7397, + "batch_num_samples": 150, + "batch_num_tokens": 52146, + "epoch": 0.44364, + "grad_norm": 3.4434750080108643, + "learning_rate": 1.910945660375305e-05, + "loss": 2.326, + "step": 488 + }, + { + "batch_num_effect_tokens": 9990, + "batch_num_samples": 149, + "batch_num_tokens": 52167, + "epoch": 0.44455, + "grad_norm": 2.4198713302612305, + "learning_rate": 1.9102899736918742e-05, + "loss": 1.5947, + "step": 489 + }, + { + "batch_num_effect_tokens": 4837, + "batch_num_samples": 149, + "batch_num_tokens": 52123, + "epoch": 0.44545, + "grad_norm": 3.3059439659118652, + "learning_rate": 1.9096319953545186e-05, + "loss": 1.5192, + "step": 490 + }, + { + "batch_num_effect_tokens": 7704, + "batch_num_samples": 149, + "batch_num_tokens": 52142, + "epoch": 0.44636, + "grad_norm": 3.98941969871521, + "learning_rate": 1.9089717270196982e-05, + "loss": 1.4529, + "step": 491 + }, + { + "batch_num_effect_tokens": 4778, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 0.44727, + "grad_norm": 3.2443060874938965, + "learning_rate": 1.9083091703496373e-05, + "loss": 1.6286, + "step": 492 + }, + { + "batch_num_effect_tokens": 9290, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 0.44818, + "grad_norm": 2.968362808227539, + "learning_rate": 1.9076443270123222e-05, + "loss": 2.2458, + "step": 493 + }, + { + "batch_num_effect_tokens": 7917, + "batch_num_samples": 149, + "batch_num_tokens": 52157, + "epoch": 0.44909, + "grad_norm": 2.978633403778076, + "learning_rate": 1.9069771986814948e-05, + "loss": 2.103, + "step": 494 + }, + { + "batch_num_effect_tokens": 4098, + "batch_num_samples": 149, + "batch_num_tokens": 52173, + "epoch": 0.45, + "grad_norm": 4.185185432434082, + "learning_rate": 1.9063077870366504e-05, + "loss": 1.7879, + "step": 495 + }, + { + "batch_num_effect_tokens": 7130, + "batch_num_samples": 150, + "batch_num_tokens": 52144, + "epoch": 0.45091, + "grad_norm": 3.301975727081299, + "learning_rate": 1.905636093763031e-05, + "loss": 2.2401, + "step": 496 + }, + { + "batch_num_effect_tokens": 7763, + "batch_num_samples": 149, + "batch_num_tokens": 52214, + "epoch": 0.45182, + "grad_norm": 9.562934875488281, + "learning_rate": 1.9049621205516243e-05, + "loss": 2.104, + "step": 497 + }, + { + "batch_num_effect_tokens": 3216, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.45273, + "grad_norm": 3.6380558013916016, + "learning_rate": 1.9042858690991574e-05, + "loss": 1.3197, + "step": 498 + }, + { + "batch_num_effect_tokens": 5344, + "batch_num_samples": 149, + "batch_num_tokens": 52145, + "epoch": 0.45364, + "grad_norm": 3.306352376937866, + "learning_rate": 1.9036073411080917e-05, + "loss": 1.7201, + "step": 499 + }, + { + "batch_num_effect_tokens": 5946, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 0.45455, + "grad_norm": 3.5917532444000244, + "learning_rate": 1.9029265382866216e-05, + "loss": 2.0292, + "step": 500 + }, + { + "batch_num_effect_tokens": 3840, + "batch_num_samples": 149, + "batch_num_tokens": 52177, + "epoch": 0.45545, + "grad_norm": 3.4000165462493896, + "learning_rate": 1.902243462348666e-05, + "loss": 1.3569, + "step": 501 + }, + { + "batch_num_effect_tokens": 6128, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 0.45636, + "grad_norm": 3.308820962905884, + "learning_rate": 1.9015581150138693e-05, + "loss": 1.7542, + "step": 502 + }, + { + "batch_num_effect_tokens": 6220, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 0.45727, + "grad_norm": 3.2647831439971924, + "learning_rate": 1.9008704980075915e-05, + "loss": 2.0074, + "step": 503 + }, + { + "batch_num_effect_tokens": 7995, + "batch_num_samples": 150, + "batch_num_tokens": 52187, + "epoch": 0.45818, + "grad_norm": 3.451694965362549, + "learning_rate": 1.900180613060908e-05, + "loss": 2.301, + "step": 504 + }, + { + "batch_num_effect_tokens": 6552, + "batch_num_samples": 149, + "batch_num_tokens": 52159, + "epoch": 0.45909, + "grad_norm": 3.290741443634033, + "learning_rate": 1.8994884619106034e-05, + "loss": 2.1138, + "step": 505 + }, + { + "batch_num_effect_tokens": 4136, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.46, + "grad_norm": 3.646514654159546, + "learning_rate": 1.8987940462991673e-05, + "loss": 1.6685, + "step": 506 + }, + { + "batch_num_effect_tokens": 7924, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 0.46091, + "grad_norm": 3.5173795223236084, + "learning_rate": 1.8980973679747897e-05, + "loss": 2.1933, + "step": 507 + }, + { + "batch_num_effect_tokens": 4122, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 0.46182, + "grad_norm": 3.5924158096313477, + "learning_rate": 1.8973984286913584e-05, + "loss": 1.2037, + "step": 508 + }, + { + "batch_num_effect_tokens": 6331, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.46273, + "grad_norm": 3.658207654953003, + "learning_rate": 1.8966972302084516e-05, + "loss": 2.0125, + "step": 509 + }, + { + "batch_num_effect_tokens": 7272, + "batch_num_samples": 149, + "batch_num_tokens": 52160, + "epoch": 0.46364, + "grad_norm": 3.5813214778900146, + "learning_rate": 1.895993774291336e-05, + "loss": 2.1851, + "step": 510 + }, + { + "batch_num_effect_tokens": 6230, + "batch_num_samples": 149, + "batch_num_tokens": 52185, + "epoch": 0.46455, + "grad_norm": 3.268073558807373, + "learning_rate": 1.8952880627109606e-05, + "loss": 1.9113, + "step": 511 + }, + { + "batch_num_effect_tokens": 4682, + "batch_num_samples": 149, + "batch_num_tokens": 52164, + "epoch": 0.46545, + "grad_norm": 3.3010008335113525, + "learning_rate": 1.894580097243954e-05, + "loss": 1.2218, + "step": 512 + }, + { + "batch_num_effect_tokens": 7353, + "batch_num_samples": 149, + "batch_num_tokens": 52213, + "epoch": 0.46636, + "grad_norm": 3.572483539581299, + "learning_rate": 1.8938698796726177e-05, + "loss": 2.0842, + "step": 513 + }, + { + "batch_num_effect_tokens": 9044, + "batch_num_samples": 149, + "batch_num_tokens": 52144, + "epoch": 0.46727, + "grad_norm": 3.1595988273620605, + "learning_rate": 1.893157411784924e-05, + "loss": 2.2458, + "step": 514 + }, + { + "batch_num_effect_tokens": 10117, + "batch_num_samples": 149, + "batch_num_tokens": 52181, + "epoch": 0.46818, + "grad_norm": 3.0882046222686768, + "learning_rate": 1.89244269537451e-05, + "loss": 2.1724, + "step": 515 + }, + { + "batch_num_effect_tokens": 4353, + "batch_num_samples": 149, + "batch_num_tokens": 52221, + "epoch": 0.46909, + "grad_norm": 3.614635467529297, + "learning_rate": 1.8917257322406735e-05, + "loss": 1.6285, + "step": 516 + }, + { + "batch_num_effect_tokens": 5500, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.47, + "grad_norm": 3.201978921890259, + "learning_rate": 1.891006524188368e-05, + "loss": 1.6012, + "step": 517 + }, + { + "batch_num_effect_tokens": 5168, + "batch_num_samples": 149, + "batch_num_tokens": 52207, + "epoch": 0.47091, + "grad_norm": 3.692725419998169, + "learning_rate": 1.8902850730281993e-05, + "loss": 1.9266, + "step": 518 + }, + { + "batch_num_effect_tokens": 4942, + "batch_num_samples": 150, + "batch_num_tokens": 52193, + "epoch": 0.47182, + "grad_norm": 4.545666694641113, + "learning_rate": 1.8895613805764196e-05, + "loss": 2.0146, + "step": 519 + }, + { + "batch_num_effect_tokens": 7190, + "batch_num_samples": 149, + "batch_num_tokens": 52146, + "epoch": 0.47273, + "grad_norm": 3.550752878189087, + "learning_rate": 1.8888354486549238e-05, + "loss": 2.1113, + "step": 520 + }, + { + "batch_num_effect_tokens": 8762, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 0.47364, + "grad_norm": 3.4238336086273193, + "learning_rate": 1.8881072790912445e-05, + "loss": 2.2257, + "step": 521 + }, + { + "batch_num_effect_tokens": 6978, + "batch_num_samples": 150, + "batch_num_tokens": 52189, + "epoch": 0.47455, + "grad_norm": 3.5296685695648193, + "learning_rate": 1.887376873718548e-05, + "loss": 2.2325, + "step": 522 + }, + { + "batch_num_effect_tokens": 4304, + "batch_num_samples": 149, + "batch_num_tokens": 52196, + "epoch": 0.47545, + "grad_norm": 3.596109628677368, + "learning_rate": 1.8866442343756288e-05, + "loss": 1.5695, + "step": 523 + }, + { + "batch_num_effect_tokens": 6116, + "batch_num_samples": 149, + "batch_num_tokens": 52178, + "epoch": 0.47636, + "grad_norm": 3.8315112590789795, + "learning_rate": 1.8859093629069057e-05, + "loss": 1.7386, + "step": 524 + }, + { + "batch_num_effect_tokens": 5970, + "batch_num_samples": 149, + "batch_num_tokens": 52112, + "epoch": 0.47727, + "grad_norm": 3.498262405395508, + "learning_rate": 1.8851722611624166e-05, + "loss": 1.9725, + "step": 525 + }, + { + "batch_num_effect_tokens": 5960, + "batch_num_samples": 150, + "batch_num_tokens": 52154, + "epoch": 0.47818, + "grad_norm": 3.7059292793273926, + "learning_rate": 1.8844329309978146e-05, + "loss": 2.0527, + "step": 526 + }, + { + "batch_num_effect_tokens": 5305, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.47909, + "grad_norm": 3.4222211837768555, + "learning_rate": 1.883691374274362e-05, + "loss": 1.8587, + "step": 527 + }, + { + "batch_num_effect_tokens": 7227, + "batch_num_samples": 150, + "batch_num_tokens": 52142, + "epoch": 0.48, + "grad_norm": 3.3856403827667236, + "learning_rate": 1.8829475928589272e-05, + "loss": 2.215, + "step": 528 + }, + { + "batch_num_effect_tokens": 4177, + "batch_num_samples": 149, + "batch_num_tokens": 52220, + "epoch": 0.48091, + "grad_norm": 3.9069483280181885, + "learning_rate": 1.882201588623979e-05, + "loss": 1.5857, + "step": 529 + }, + { + "batch_num_effect_tokens": 5385, + "batch_num_samples": 149, + "batch_num_tokens": 52142, + "epoch": 0.48182, + "grad_norm": 3.723468542098999, + "learning_rate": 1.881453363447582e-05, + "loss": 2.0356, + "step": 530 + }, + { + "batch_num_effect_tokens": 5821, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 0.48273, + "grad_norm": 3.635272741317749, + "learning_rate": 1.8807029192133927e-05, + "loss": 1.7682, + "step": 531 + }, + { + "batch_num_effect_tokens": 6328, + "batch_num_samples": 149, + "batch_num_tokens": 52159, + "epoch": 0.48364, + "grad_norm": 3.9412262439727783, + "learning_rate": 1.8799502578106533e-05, + "loss": 2.1657, + "step": 532 + }, + { + "batch_num_effect_tokens": 6253, + "batch_num_samples": 149, + "batch_num_tokens": 52161, + "epoch": 0.48455, + "grad_norm": 3.4781086444854736, + "learning_rate": 1.879195381134188e-05, + "loss": 1.8876, + "step": 533 + }, + { + "batch_num_effect_tokens": 7108, + "batch_num_samples": 149, + "batch_num_tokens": 52141, + "epoch": 0.48545, + "grad_norm": 3.2070069313049316, + "learning_rate": 1.8784382910843978e-05, + "loss": 2.125, + "step": 534 + }, + { + "batch_num_effect_tokens": 6323, + "batch_num_samples": 149, + "batch_num_tokens": 52200, + "epoch": 0.48636, + "grad_norm": 4.461274147033691, + "learning_rate": 1.8776789895672557e-05, + "loss": 1.7811, + "step": 535 + }, + { + "batch_num_effect_tokens": 7237, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 0.48727, + "grad_norm": 3.477790355682373, + "learning_rate": 1.8769174784943032e-05, + "loss": 1.9597, + "step": 536 + }, + { + "batch_num_effect_tokens": 4006, + "batch_num_samples": 149, + "batch_num_tokens": 52162, + "epoch": 0.48818, + "grad_norm": 3.06264066696167, + "learning_rate": 1.8761537597826426e-05, + "loss": 1.2381, + "step": 537 + }, + { + "batch_num_effect_tokens": 7709, + "batch_num_samples": 149, + "batch_num_tokens": 52222, + "epoch": 0.48909, + "grad_norm": 3.4257607460021973, + "learning_rate": 1.8753878353549357e-05, + "loss": 2.1019, + "step": 538 + }, + { + "batch_num_effect_tokens": 7508, + "batch_num_samples": 149, + "batch_num_tokens": 52099, + "epoch": 0.49, + "grad_norm": 2.888709545135498, + "learning_rate": 1.874619707139396e-05, + "loss": 1.5886, + "step": 539 + }, + { + "batch_num_effect_tokens": 7412, + "batch_num_samples": 150, + "batch_num_tokens": 52215, + "epoch": 0.49091, + "grad_norm": 3.1860759258270264, + "learning_rate": 1.873849377069785e-05, + "loss": 2.0846, + "step": 540 + }, + { + "batch_num_effect_tokens": 4228, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 0.49182, + "grad_norm": 3.918816328048706, + "learning_rate": 1.8730768470854085e-05, + "loss": 1.763, + "step": 541 + }, + { + "batch_num_effect_tokens": 4622, + "batch_num_samples": 149, + "batch_num_tokens": 52130, + "epoch": 0.49273, + "grad_norm": 3.312856912612915, + "learning_rate": 1.872302119131109e-05, + "loss": 1.5452, + "step": 542 + }, + { + "batch_num_effect_tokens": 4885, + "batch_num_samples": 149, + "batch_num_tokens": 52145, + "epoch": 0.49364, + "grad_norm": 3.5208563804626465, + "learning_rate": 1.8715251951572635e-05, + "loss": 1.7166, + "step": 543 + }, + { + "batch_num_effect_tokens": 5722, + "batch_num_samples": 149, + "batch_num_tokens": 52174, + "epoch": 0.49455, + "grad_norm": 4.089620590209961, + "learning_rate": 1.8707460771197773e-05, + "loss": 2.1785, + "step": 544 + }, + { + "batch_num_effect_tokens": 6006, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 0.49545, + "grad_norm": 3.4604692459106445, + "learning_rate": 1.869964766980079e-05, + "loss": 1.9571, + "step": 545 + }, + { + "batch_num_effect_tokens": 7263, + "batch_num_samples": 149, + "batch_num_tokens": 52116, + "epoch": 0.49636, + "grad_norm": 3.3346478939056396, + "learning_rate": 1.8691812667051164e-05, + "loss": 2.19, + "step": 546 + }, + { + "batch_num_effect_tokens": 7015, + "batch_num_samples": 149, + "batch_num_tokens": 52172, + "epoch": 0.49727, + "grad_norm": 3.139596700668335, + "learning_rate": 1.8683955782673496e-05, + "loss": 2.203, + "step": 547 + }, + { + "batch_num_effect_tokens": 5609, + "batch_num_samples": 149, + "batch_num_tokens": 52171, + "epoch": 0.49818, + "grad_norm": 3.289130687713623, + "learning_rate": 1.867607703644749e-05, + "loss": 1.8295, + "step": 548 + }, + { + "batch_num_effect_tokens": 5855, + "batch_num_samples": 149, + "batch_num_tokens": 52153, + "epoch": 0.49909, + "grad_norm": 3.2381234169006348, + "learning_rate": 1.8668176448207883e-05, + "loss": 1.8196, + "step": 549 + }, + { + "batch_num_effect_tokens": 6374, + "batch_num_samples": 150, + "batch_num_tokens": 52167, + "epoch": 0.5, + "grad_norm": 3.6862313747406006, + "learning_rate": 1.866025403784439e-05, + "loss": 2.1483, + "step": 550 + }, + { + "batch_num_effect_tokens": 7273, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.50091, + "grad_norm": 3.053278923034668, + "learning_rate": 1.865230982530167e-05, + "loss": 2.1838, + "step": 551 + }, + { + "batch_num_effect_tokens": 4038, + "batch_num_samples": 149, + "batch_num_tokens": 52108, + "epoch": 0.50182, + "grad_norm": 3.9057250022888184, + "learning_rate": 1.864434383057927e-05, + "loss": 1.7801, + "step": 552 + }, + { + "batch_num_effect_tokens": 7194, + "batch_num_samples": 150, + "batch_num_tokens": 52168, + "epoch": 0.50273, + "grad_norm": 2.9885618686676025, + "learning_rate": 1.863635607373157e-05, + "loss": 1.9528, + "step": 553 + }, + { + "batch_num_effect_tokens": 3872, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 0.50364, + "grad_norm": 3.4793949127197266, + "learning_rate": 1.8628346574867748e-05, + "loss": 1.2668, + "step": 554 + }, + { + "batch_num_effect_tokens": 5104, + "batch_num_samples": 149, + "batch_num_tokens": 52180, + "epoch": 0.50455, + "grad_norm": 3.6043288707733154, + "learning_rate": 1.8620315354151695e-05, + "loss": 1.8763, + "step": 555 + }, + { + "batch_num_effect_tokens": 9724, + "batch_num_samples": 149, + "batch_num_tokens": 52142, + "epoch": 0.50545, + "grad_norm": 7.592609882354736, + "learning_rate": 1.861226243180201e-05, + "loss": 1.3798, + "step": 556 + }, + { + "batch_num_effect_tokens": 6299, + "batch_num_samples": 149, + "batch_num_tokens": 52138, + "epoch": 0.50636, + "grad_norm": 3.6786558628082275, + "learning_rate": 1.8604187828091906e-05, + "loss": 2.0197, + "step": 557 + }, + { + "batch_num_effect_tokens": 7648, + "batch_num_samples": 149, + "batch_num_tokens": 52222, + "epoch": 0.50727, + "grad_norm": 3.2028064727783203, + "learning_rate": 1.859609156334919e-05, + "loss": 1.9882, + "step": 558 + }, + { + "batch_num_effect_tokens": 6900, + "batch_num_samples": 149, + "batch_num_tokens": 52156, + "epoch": 0.50818, + "grad_norm": 3.237802267074585, + "learning_rate": 1.858797365795621e-05, + "loss": 1.9579, + "step": 559 + }, + { + "batch_num_effect_tokens": 6792, + "batch_num_samples": 149, + "batch_num_tokens": 50565, + "epoch": 0.50909, + "grad_norm": 3.682898759841919, + "learning_rate": 1.8579834132349773e-05, + "loss": 2.3555, + "step": 560 + }, + { + "batch_num_effect_tokens": 6533, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 0.51, + "grad_norm": 3.4176299571990967, + "learning_rate": 1.8571673007021124e-05, + "loss": 1.9165, + "step": 561 + }, + { + "batch_num_effect_tokens": 7501, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 0.51091, + "grad_norm": 3.5271034240722656, + "learning_rate": 1.856349030251589e-05, + "loss": 2.236, + "step": 562 + }, + { + "batch_num_effect_tokens": 5026, + "batch_num_samples": 149, + "batch_num_tokens": 52139, + "epoch": 0.51182, + "grad_norm": 3.6298139095306396, + "learning_rate": 1.8555286039434022e-05, + "loss": 1.8674, + "step": 563 + }, + { + "batch_num_effect_tokens": 6409, + "batch_num_samples": 149, + "batch_num_tokens": 52175, + "epoch": 0.51273, + "grad_norm": 3.4671781063079834, + "learning_rate": 1.8547060238429737e-05, + "loss": 1.9354, + "step": 564 + }, + { + "batch_num_effect_tokens": 7138, + "batch_num_samples": 149, + "batch_num_tokens": 52174, + "epoch": 0.51364, + "grad_norm": 3.528872013092041, + "learning_rate": 1.8538812920211484e-05, + "loss": 2.2646, + "step": 565 + }, + { + "batch_num_effect_tokens": 7711, + "batch_num_samples": 149, + "batch_num_tokens": 52190, + "epoch": 0.51455, + "grad_norm": 3.2639312744140625, + "learning_rate": 1.8530544105541872e-05, + "loss": 2.2585, + "step": 566 + }, + { + "batch_num_effect_tokens": 7945, + "batch_num_samples": 149, + "batch_num_tokens": 52143, + "epoch": 0.51545, + "grad_norm": 3.3265206813812256, + "learning_rate": 1.8522253815237636e-05, + "loss": 2.1542, + "step": 567 + }, + { + "batch_num_effect_tokens": 7369, + "batch_num_samples": 149, + "batch_num_tokens": 52214, + "epoch": 0.51636, + "grad_norm": 2.9828522205352783, + "learning_rate": 1.8513942070169572e-05, + "loss": 2.2732, + "step": 568 + }, + { + "batch_num_effect_tokens": 7247, + "batch_num_samples": 149, + "batch_num_tokens": 52167, + "epoch": 0.51727, + "grad_norm": 3.3099958896636963, + "learning_rate": 1.8505608891262487e-05, + "loss": 2.2839, + "step": 569 + }, + { + "batch_num_effect_tokens": 4585, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 0.51818, + "grad_norm": 3.316478729248047, + "learning_rate": 1.8497254299495147e-05, + "loss": 1.5033, + "step": 570 + }, + { + "batch_num_effect_tokens": 6077, + "batch_num_samples": 149, + "batch_num_tokens": 52214, + "epoch": 0.51909, + "grad_norm": 3.613713502883911, + "learning_rate": 1.8488878315900228e-05, + "loss": 2.1368, + "step": 571 + }, + { + "batch_num_effect_tokens": 9597, + "batch_num_samples": 149, + "batch_num_tokens": 52183, + "epoch": 0.52, + "grad_norm": 3.1866140365600586, + "learning_rate": 1.848048096156426e-05, + "loss": 2.093, + "step": 572 + }, + { + "batch_num_effect_tokens": 4443, + "batch_num_samples": 149, + "batch_num_tokens": 52222, + "epoch": 0.52091, + "grad_norm": 3.794059991836548, + "learning_rate": 1.8472062257627573e-05, + "loss": 1.549, + "step": 573 + }, + { + "batch_num_effect_tokens": 3305, + "batch_num_samples": 149, + "batch_num_tokens": 52220, + "epoch": 0.52182, + "grad_norm": 3.145803451538086, + "learning_rate": 1.8463622225284242e-05, + "loss": 0.9547, + "step": 574 + }, + { + "batch_num_effect_tokens": 3851, + "batch_num_samples": 149, + "batch_num_tokens": 52204, + "epoch": 0.52273, + "grad_norm": 3.587178945541382, + "learning_rate": 1.8455160885782045e-05, + "loss": 1.4285, + "step": 575 + }, + { + "batch_num_effect_tokens": 8686, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 0.52364, + "grad_norm": 3.6582915782928467, + "learning_rate": 1.8446678260422388e-05, + "loss": 1.9506, + "step": 576 + }, + { + "batch_num_effect_tokens": 4839, + "batch_num_samples": 149, + "batch_num_tokens": 52141, + "epoch": 0.52455, + "grad_norm": 3.1688389778137207, + "learning_rate": 1.8438174370560263e-05, + "loss": 1.3682, + "step": 577 + }, + { + "batch_num_effect_tokens": 6489, + "batch_num_samples": 150, + "batch_num_tokens": 52153, + "epoch": 0.52545, + "grad_norm": 3.809812307357788, + "learning_rate": 1.8429649237604215e-05, + "loss": 1.9886, + "step": 578 + }, + { + "batch_num_effect_tokens": 5519, + "batch_num_samples": 149, + "batch_num_tokens": 52143, + "epoch": 0.52636, + "grad_norm": 3.732606887817383, + "learning_rate": 1.8421102883016253e-05, + "loss": 1.6313, + "step": 579 + }, + { + "batch_num_effect_tokens": 10408, + "batch_num_samples": 149, + "batch_num_tokens": 52172, + "epoch": 0.52727, + "grad_norm": 3.174161911010742, + "learning_rate": 1.8412535328311813e-05, + "loss": 2.4339, + "step": 580 + }, + { + "batch_num_effect_tokens": 4047, + "batch_num_samples": 149, + "batch_num_tokens": 50510, + "epoch": 0.52818, + "grad_norm": 3.608646869659424, + "learning_rate": 1.8403946595059705e-05, + "loss": 1.3289, + "step": 581 + }, + { + "batch_num_effect_tokens": 6078, + "batch_num_samples": 149, + "batch_num_tokens": 52149, + "epoch": 0.52909, + "grad_norm": 3.461907148361206, + "learning_rate": 1.839533670488205e-05, + "loss": 1.9802, + "step": 582 + }, + { + "batch_num_effect_tokens": 4242, + "batch_num_samples": 149, + "batch_num_tokens": 52158, + "epoch": 0.53, + "grad_norm": 3.994670867919922, + "learning_rate": 1.8386705679454243e-05, + "loss": 1.6785, + "step": 583 + }, + { + "batch_num_effect_tokens": 5623, + "batch_num_samples": 149, + "batch_num_tokens": 52132, + "epoch": 0.53091, + "grad_norm": 3.693882465362549, + "learning_rate": 1.8378053540504874e-05, + "loss": 1.8519, + "step": 584 + }, + { + "batch_num_effect_tokens": 4473, + "batch_num_samples": 149, + "batch_num_tokens": 52162, + "epoch": 0.53182, + "grad_norm": 3.797433614730835, + "learning_rate": 1.83693803098157e-05, + "loss": 1.2907, + "step": 585 + }, + { + "batch_num_effect_tokens": 7440, + "batch_num_samples": 149, + "batch_num_tokens": 52148, + "epoch": 0.53273, + "grad_norm": 3.0073556900024414, + "learning_rate": 1.836068600922156e-05, + "loss": 1.8691, + "step": 586 + }, + { + "batch_num_effect_tokens": 9577, + "batch_num_samples": 149, + "batch_num_tokens": 52182, + "epoch": 0.53364, + "grad_norm": 3.577150583267212, + "learning_rate": 1.835197066061035e-05, + "loss": 2.1412, + "step": 587 + }, + { + "batch_num_effect_tokens": 7733, + "batch_num_samples": 149, + "batch_num_tokens": 52162, + "epoch": 0.53455, + "grad_norm": 3.0998103618621826, + "learning_rate": 1.8343234285922955e-05, + "loss": 1.8911, + "step": 588 + }, + { + "batch_num_effect_tokens": 6919, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.53545, + "grad_norm": 3.259941816329956, + "learning_rate": 1.8334476907153177e-05, + "loss": 2.0594, + "step": 589 + }, + { + "batch_num_effect_tokens": 6546, + "batch_num_samples": 149, + "batch_num_tokens": 52214, + "epoch": 0.53636, + "grad_norm": 3.149838924407959, + "learning_rate": 1.8325698546347714e-05, + "loss": 1.9796, + "step": 590 + }, + { + "batch_num_effect_tokens": 10309, + "batch_num_samples": 149, + "batch_num_tokens": 52169, + "epoch": 0.53727, + "grad_norm": 2.3748481273651123, + "learning_rate": 1.8316899225606078e-05, + "loss": 1.6333, + "step": 591 + }, + { + "batch_num_effect_tokens": 5608, + "batch_num_samples": 149, + "batch_num_tokens": 52139, + "epoch": 0.53818, + "grad_norm": 3.143259048461914, + "learning_rate": 1.8308078967080547e-05, + "loss": 1.6868, + "step": 592 + }, + { + "batch_num_effect_tokens": 5158, + "batch_num_samples": 149, + "batch_num_tokens": 52214, + "epoch": 0.53909, + "grad_norm": 3.805025339126587, + "learning_rate": 1.829923779297611e-05, + "loss": 1.8101, + "step": 593 + }, + { + "batch_num_effect_tokens": 6560, + "batch_num_samples": 149, + "batch_num_tokens": 52185, + "epoch": 0.54, + "grad_norm": 3.417865753173828, + "learning_rate": 1.8290375725550417e-05, + "loss": 2.1457, + "step": 594 + }, + { + "batch_num_effect_tokens": 6663, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 0.54091, + "grad_norm": 3.3823914527893066, + "learning_rate": 1.8281492787113707e-05, + "loss": 1.9639, + "step": 595 + }, + { + "batch_num_effect_tokens": 5353, + "batch_num_samples": 149, + "batch_num_tokens": 52130, + "epoch": 0.54182, + "grad_norm": 3.1551902294158936, + "learning_rate": 1.8272589000028774e-05, + "loss": 1.8591, + "step": 596 + }, + { + "batch_num_effect_tokens": 8126, + "batch_num_samples": 149, + "batch_num_tokens": 52213, + "epoch": 0.54273, + "grad_norm": 2.9723100662231445, + "learning_rate": 1.826366438671088e-05, + "loss": 2.0776, + "step": 597 + }, + { + "batch_num_effect_tokens": 5902, + "batch_num_samples": 149, + "batch_num_tokens": 52102, + "epoch": 0.54364, + "grad_norm": 3.1751818656921387, + "learning_rate": 1.825471896962774e-05, + "loss": 1.9042, + "step": 598 + }, + { + "batch_num_effect_tokens": 7567, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 0.54455, + "grad_norm": 2.933289051055908, + "learning_rate": 1.8245752771299426e-05, + "loss": 2.1582, + "step": 599 + }, + { + "batch_num_effect_tokens": 6325, + "batch_num_samples": 149, + "batch_num_tokens": 52196, + "epoch": 0.54545, + "grad_norm": 3.5300397872924805, + "learning_rate": 1.8236765814298328e-05, + "loss": 2.0646, + "step": 600 + }, + { + "batch_num_effect_tokens": 4882, + "batch_num_samples": 149, + "batch_num_tokens": 52089, + "epoch": 0.54636, + "grad_norm": 3.5921027660369873, + "learning_rate": 1.8227758121249108e-05, + "loss": 1.7715, + "step": 601 + }, + { + "batch_num_effect_tokens": 6232, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 0.54727, + "grad_norm": 3.275217294692993, + "learning_rate": 1.8218729714828612e-05, + "loss": 2.0114, + "step": 602 + }, + { + "batch_num_effect_tokens": 6150, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 0.54818, + "grad_norm": 3.6507728099823, + "learning_rate": 1.820968061776585e-05, + "loss": 2.1635, + "step": 603 + }, + { + "batch_num_effect_tokens": 5137, + "batch_num_samples": 149, + "batch_num_tokens": 52188, + "epoch": 0.54909, + "grad_norm": 3.0628061294555664, + "learning_rate": 1.8200610852841913e-05, + "loss": 1.5982, + "step": 604 + }, + { + "batch_num_effect_tokens": 5457, + "batch_num_samples": 149, + "batch_num_tokens": 52214, + "epoch": 0.55, + "grad_norm": 3.211014986038208, + "learning_rate": 1.819152044288992e-05, + "loss": 1.7999, + "step": 605 + }, + { + "batch_num_effect_tokens": 6750, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.55091, + "grad_norm": 3.275047540664673, + "learning_rate": 1.818240941079497e-05, + "loss": 2.1458, + "step": 606 + }, + { + "batch_num_effect_tokens": 5085, + "batch_num_samples": 149, + "batch_num_tokens": 52144, + "epoch": 0.55182, + "grad_norm": 3.6827852725982666, + "learning_rate": 1.817327777949407e-05, + "loss": 1.8882, + "step": 607 + }, + { + "batch_num_effect_tokens": 6080, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.55273, + "grad_norm": 3.345921277999878, + "learning_rate": 1.81641255719761e-05, + "loss": 1.9049, + "step": 608 + }, + { + "batch_num_effect_tokens": 5989, + "batch_num_samples": 149, + "batch_num_tokens": 52082, + "epoch": 0.55364, + "grad_norm": 3.1699001789093018, + "learning_rate": 1.8154952811281723e-05, + "loss": 1.7104, + "step": 609 + }, + { + "batch_num_effect_tokens": 6347, + "batch_num_samples": 150, + "batch_num_tokens": 52215, + "epoch": 0.55455, + "grad_norm": 3.049734592437744, + "learning_rate": 1.814575952050336e-05, + "loss": 1.758, + "step": 610 + }, + { + "batch_num_effect_tokens": 6280, + "batch_num_samples": 149, + "batch_num_tokens": 50569, + "epoch": 0.55545, + "grad_norm": 3.2737529277801514, + "learning_rate": 1.81365457227851e-05, + "loss": 1.4851, + "step": 611 + }, + { + "batch_num_effect_tokens": 5818, + "batch_num_samples": 149, + "batch_num_tokens": 52173, + "epoch": 0.55636, + "grad_norm": 3.4388372898101807, + "learning_rate": 1.812731144132268e-05, + "loss": 1.8755, + "step": 612 + }, + { + "batch_num_effect_tokens": 5249, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 0.55727, + "grad_norm": 3.5681707859039307, + "learning_rate": 1.8118056699363386e-05, + "loss": 1.8418, + "step": 613 + }, + { + "batch_num_effect_tokens": 6035, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 0.55818, + "grad_norm": 4.498365879058838, + "learning_rate": 1.810878152020602e-05, + "loss": 1.8722, + "step": 614 + }, + { + "batch_num_effect_tokens": 4748, + "batch_num_samples": 149, + "batch_num_tokens": 52222, + "epoch": 0.55909, + "grad_norm": 3.7093660831451416, + "learning_rate": 1.809948592720084e-05, + "loss": 1.5109, + "step": 615 + }, + { + "batch_num_effect_tokens": 5118, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 0.56, + "grad_norm": 3.8467259407043457, + "learning_rate": 1.8090169943749477e-05, + "loss": 1.65, + "step": 616 + }, + { + "batch_num_effect_tokens": 7968, + "batch_num_samples": 150, + "batch_num_tokens": 52213, + "epoch": 0.56091, + "grad_norm": 3.0363943576812744, + "learning_rate": 1.8080833593304917e-05, + "loss": 2.1111, + "step": 617 + }, + { + "batch_num_effect_tokens": 5406, + "batch_num_samples": 149, + "batch_num_tokens": 52176, + "epoch": 0.56182, + "grad_norm": 3.4476122856140137, + "learning_rate": 1.8071476899371414e-05, + "loss": 1.5297, + "step": 618 + }, + { + "batch_num_effect_tokens": 7348, + "batch_num_samples": 150, + "batch_num_tokens": 52167, + "epoch": 0.56273, + "grad_norm": 3.2488205432891846, + "learning_rate": 1.806209988550443e-05, + "loss": 2.1373, + "step": 619 + }, + { + "batch_num_effect_tokens": 5017, + "batch_num_samples": 149, + "batch_num_tokens": 52142, + "epoch": 0.56364, + "grad_norm": 3.3816072940826416, + "learning_rate": 1.8052702575310588e-05, + "loss": 1.8264, + "step": 620 + }, + { + "batch_num_effect_tokens": 6569, + "batch_num_samples": 149, + "batch_num_tokens": 52133, + "epoch": 0.56455, + "grad_norm": 3.2509372234344482, + "learning_rate": 1.8043284992447603e-05, + "loss": 2.0343, + "step": 621 + }, + { + "batch_num_effect_tokens": 5804, + "batch_num_samples": 149, + "batch_num_tokens": 52198, + "epoch": 0.56545, + "grad_norm": 3.390310287475586, + "learning_rate": 1.803384716062423e-05, + "loss": 2.2642, + "step": 622 + }, + { + "batch_num_effect_tokens": 5884, + "batch_num_samples": 149, + "batch_num_tokens": 52179, + "epoch": 0.56636, + "grad_norm": 3.26361346244812, + "learning_rate": 1.8024389103600196e-05, + "loss": 1.8734, + "step": 623 + }, + { + "batch_num_effect_tokens": 5589, + "batch_num_samples": 149, + "batch_num_tokens": 52180, + "epoch": 0.56727, + "grad_norm": 3.1460154056549072, + "learning_rate": 1.8014910845186154e-05, + "loss": 1.8174, + "step": 624 + }, + { + "batch_num_effect_tokens": 8632, + "batch_num_samples": 150, + "batch_num_tokens": 52216, + "epoch": 0.56818, + "grad_norm": 2.8307669162750244, + "learning_rate": 1.8005412409243604e-05, + "loss": 2.0944, + "step": 625 + }, + { + "batch_num_effect_tokens": 5966, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 0.56909, + "grad_norm": 3.4551782608032227, + "learning_rate": 1.799589381968485e-05, + "loss": 2.0861, + "step": 626 + }, + { + "batch_num_effect_tokens": 7451, + "batch_num_samples": 149, + "batch_num_tokens": 52132, + "epoch": 0.57, + "grad_norm": 2.963189125061035, + "learning_rate": 1.798635510047293e-05, + "loss": 1.8454, + "step": 627 + }, + { + "batch_num_effect_tokens": 6263, + "batch_num_samples": 149, + "batch_num_tokens": 52168, + "epoch": 0.57091, + "grad_norm": 3.049030065536499, + "learning_rate": 1.7976796275621556e-05, + "loss": 1.8021, + "step": 628 + }, + { + "batch_num_effect_tokens": 5990, + "batch_num_samples": 149, + "batch_num_tokens": 52118, + "epoch": 0.57182, + "grad_norm": 3.6765613555908203, + "learning_rate": 1.7967217369195058e-05, + "loss": 1.9847, + "step": 629 + }, + { + "batch_num_effect_tokens": 9942, + "batch_num_samples": 149, + "batch_num_tokens": 52132, + "epoch": 0.57273, + "grad_norm": 2.79931902885437, + "learning_rate": 1.7957618405308323e-05, + "loss": 2.1257, + "step": 630 + }, + { + "batch_num_effect_tokens": 7262, + "batch_num_samples": 149, + "batch_num_tokens": 52132, + "epoch": 0.57364, + "grad_norm": 3.0973598957061768, + "learning_rate": 1.794799940812673e-05, + "loss": 1.9928, + "step": 631 + }, + { + "batch_num_effect_tokens": 5831, + "batch_num_samples": 149, + "batch_num_tokens": 52220, + "epoch": 0.57455, + "grad_norm": 3.324289321899414, + "learning_rate": 1.7938360401866096e-05, + "loss": 1.8391, + "step": 632 + }, + { + "batch_num_effect_tokens": 8034, + "batch_num_samples": 149, + "batch_num_tokens": 52196, + "epoch": 0.57545, + "grad_norm": 3.042884111404419, + "learning_rate": 1.79287014107926e-05, + "loss": 2.0706, + "step": 633 + }, + { + "batch_num_effect_tokens": 5643, + "batch_num_samples": 149, + "batch_num_tokens": 52082, + "epoch": 0.57636, + "grad_norm": 3.443073272705078, + "learning_rate": 1.7919022459222754e-05, + "loss": 1.966, + "step": 634 + }, + { + "batch_num_effect_tokens": 4436, + "batch_num_samples": 149, + "batch_num_tokens": 52222, + "epoch": 0.57727, + "grad_norm": 3.825007438659668, + "learning_rate": 1.7909323571523295e-05, + "loss": 2.0158, + "step": 635 + }, + { + "batch_num_effect_tokens": 4307, + "batch_num_samples": 149, + "batch_num_tokens": 52138, + "epoch": 0.57818, + "grad_norm": 3.416240692138672, + "learning_rate": 1.7899604772111163e-05, + "loss": 1.417, + "step": 636 + }, + { + "batch_num_effect_tokens": 6263, + "batch_num_samples": 149, + "batch_num_tokens": 52173, + "epoch": 0.57909, + "grad_norm": 3.2189557552337646, + "learning_rate": 1.788986608545343e-05, + "loss": 1.8994, + "step": 637 + }, + { + "batch_num_effect_tokens": 6508, + "batch_num_samples": 149, + "batch_num_tokens": 52217, + "epoch": 0.58, + "grad_norm": 3.308938503265381, + "learning_rate": 1.788010753606722e-05, + "loss": 2.2697, + "step": 638 + }, + { + "batch_num_effect_tokens": 8259, + "batch_num_samples": 149, + "batch_num_tokens": 52179, + "epoch": 0.58091, + "grad_norm": 3.0805373191833496, + "learning_rate": 1.7870329148519675e-05, + "loss": 2.0879, + "step": 639 + }, + { + "batch_num_effect_tokens": 8419, + "batch_num_samples": 149, + "batch_num_tokens": 52214, + "epoch": 0.58182, + "grad_norm": 3.1611268520355225, + "learning_rate": 1.7860530947427878e-05, + "loss": 1.9171, + "step": 640 + }, + { + "batch_num_effect_tokens": 5999, + "batch_num_samples": 149, + "batch_num_tokens": 52216, + "epoch": 0.58273, + "grad_norm": 3.2548370361328125, + "learning_rate": 1.7850712957458777e-05, + "loss": 1.7961, + "step": 641 + }, + { + "batch_num_effect_tokens": 5583, + "batch_num_samples": 149, + "batch_num_tokens": 52117, + "epoch": 0.58364, + "grad_norm": 3.7277276515960693, + "learning_rate": 1.784087520332916e-05, + "loss": 1.6307, + "step": 642 + }, + { + "batch_num_effect_tokens": 3962, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 0.58455, + "grad_norm": 3.814300060272217, + "learning_rate": 1.7831017709805555e-05, + "loss": 1.6648, + "step": 643 + }, + { + "batch_num_effect_tokens": 9244, + "batch_num_samples": 149, + "batch_num_tokens": 52162, + "epoch": 0.58545, + "grad_norm": 2.9811148643493652, + "learning_rate": 1.7821140501704195e-05, + "loss": 1.951, + "step": 644 + }, + { + "batch_num_effect_tokens": 6123, + "batch_num_samples": 149, + "batch_num_tokens": 52124, + "epoch": 0.58636, + "grad_norm": 3.252951145172119, + "learning_rate": 1.7811243603890934e-05, + "loss": 1.8387, + "step": 645 + }, + { + "batch_num_effect_tokens": 6517, + "batch_num_samples": 149, + "batch_num_tokens": 50559, + "epoch": 0.58727, + "grad_norm": 3.345839023590088, + "learning_rate": 1.780132704128121e-05, + "loss": 1.9952, + "step": 646 + }, + { + "batch_num_effect_tokens": 7450, + "batch_num_samples": 150, + "batch_num_tokens": 52175, + "epoch": 0.58818, + "grad_norm": 3.410034656524658, + "learning_rate": 1.7791390838839946e-05, + "loss": 2.075, + "step": 647 + }, + { + "batch_num_effect_tokens": 7236, + "batch_num_samples": 149, + "batch_num_tokens": 52169, + "epoch": 0.58909, + "grad_norm": 3.4517953395843506, + "learning_rate": 1.7781435021581527e-05, + "loss": 2.0206, + "step": 648 + }, + { + "batch_num_effect_tokens": 8934, + "batch_num_samples": 149, + "batch_num_tokens": 52196, + "epoch": 0.59, + "grad_norm": 3.131535768508911, + "learning_rate": 1.777145961456971e-05, + "loss": 2.4076, + "step": 649 + }, + { + "batch_num_effect_tokens": 11262, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 0.59091, + "grad_norm": 3.0515923500061035, + "learning_rate": 1.776146464291757e-05, + "loss": 2.2349, + "step": 650 + }, + { + "batch_num_effect_tokens": 8278, + "batch_num_samples": 149, + "batch_num_tokens": 52173, + "epoch": 0.59182, + "grad_norm": 3.1197054386138916, + "learning_rate": 1.7751450131787435e-05, + "loss": 2.2192, + "step": 651 + }, + { + "batch_num_effect_tokens": 6649, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 0.59273, + "grad_norm": 3.2355058193206787, + "learning_rate": 1.7741416106390828e-05, + "loss": 2.0891, + "step": 652 + }, + { + "batch_num_effect_tokens": 6802, + "batch_num_samples": 149, + "batch_num_tokens": 52149, + "epoch": 0.59364, + "grad_norm": 3.1941349506378174, + "learning_rate": 1.773136259198839e-05, + "loss": 2.1189, + "step": 653 + }, + { + "batch_num_effect_tokens": 6640, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 0.59455, + "grad_norm": 3.1458933353424072, + "learning_rate": 1.7721289613889835e-05, + "loss": 2.1445, + "step": 654 + }, + { + "batch_num_effect_tokens": 7602, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 0.59545, + "grad_norm": 2.9972329139709473, + "learning_rate": 1.771119719745388e-05, + "loss": 2.2091, + "step": 655 + }, + { + "batch_num_effect_tokens": 5104, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.59636, + "grad_norm": 3.2239010334014893, + "learning_rate": 1.7701085368088157e-05, + "loss": 1.6984, + "step": 656 + }, + { + "batch_num_effect_tokens": 7291, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 0.59727, + "grad_norm": 3.082944631576538, + "learning_rate": 1.7690954151249196e-05, + "loss": 2.0532, + "step": 657 + }, + { + "batch_num_effect_tokens": 6642, + "batch_num_samples": 149, + "batch_num_tokens": 50519, + "epoch": 0.59818, + "grad_norm": 3.380568504333496, + "learning_rate": 1.768080357244232e-05, + "loss": 1.9457, + "step": 658 + }, + { + "batch_num_effect_tokens": 6324, + "batch_num_samples": 149, + "batch_num_tokens": 52149, + "epoch": 0.59909, + "grad_norm": 3.1841609477996826, + "learning_rate": 1.7670633657221602e-05, + "loss": 2.0081, + "step": 659 + }, + { + "batch_num_effect_tokens": 8588, + "batch_num_samples": 149, + "batch_num_tokens": 52182, + "epoch": 0.6, + "grad_norm": 3.0000061988830566, + "learning_rate": 1.766044443118978e-05, + "loss": 2.34, + "step": 660 + }, + { + "batch_num_effect_tokens": 5045, + "batch_num_samples": 149, + "batch_num_tokens": 52180, + "epoch": 0.60091, + "grad_norm": 3.462127685546875, + "learning_rate": 1.7650235919998234e-05, + "loss": 1.8486, + "step": 661 + }, + { + "batch_num_effect_tokens": 7103, + "batch_num_samples": 149, + "batch_num_tokens": 52114, + "epoch": 0.60182, + "grad_norm": 3.326214075088501, + "learning_rate": 1.7640008149346866e-05, + "loss": 2.0843, + "step": 662 + }, + { + "batch_num_effect_tokens": 5104, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 0.60273, + "grad_norm": 3.820380449295044, + "learning_rate": 1.7629761144984087e-05, + "loss": 1.4826, + "step": 663 + }, + { + "batch_num_effect_tokens": 8910, + "batch_num_samples": 149, + "batch_num_tokens": 52180, + "epoch": 0.60364, + "grad_norm": 2.995626211166382, + "learning_rate": 1.761949493270671e-05, + "loss": 2.3316, + "step": 664 + }, + { + "batch_num_effect_tokens": 8323, + "batch_num_samples": 149, + "batch_num_tokens": 52139, + "epoch": 0.60455, + "grad_norm": 3.0097763538360596, + "learning_rate": 1.7609209538359917e-05, + "loss": 2.2883, + "step": 665 + }, + { + "batch_num_effect_tokens": 7378, + "batch_num_samples": 149, + "batch_num_tokens": 52137, + "epoch": 0.60545, + "grad_norm": 3.102829933166504, + "learning_rate": 1.759890498783717e-05, + "loss": 1.9407, + "step": 666 + }, + { + "batch_num_effect_tokens": 4611, + "batch_num_samples": 149, + "batch_num_tokens": 52221, + "epoch": 0.60636, + "grad_norm": 3.913276433944702, + "learning_rate": 1.758858130708017e-05, + "loss": 1.9572, + "step": 667 + }, + { + "batch_num_effect_tokens": 7009, + "batch_num_samples": 149, + "batch_num_tokens": 52142, + "epoch": 0.60727, + "grad_norm": 3.0823023319244385, + "learning_rate": 1.757823852207877e-05, + "loss": 1.5677, + "step": 668 + }, + { + "batch_num_effect_tokens": 7465, + "batch_num_samples": 149, + "batch_num_tokens": 52143, + "epoch": 0.60818, + "grad_norm": 3.0849194526672363, + "learning_rate": 1.7567876658870917e-05, + "loss": 2.2721, + "step": 669 + }, + { + "batch_num_effect_tokens": 8093, + "batch_num_samples": 150, + "batch_num_tokens": 52217, + "epoch": 0.60909, + "grad_norm": 3.021416664123535, + "learning_rate": 1.7557495743542586e-05, + "loss": 2.2716, + "step": 670 + }, + { + "batch_num_effect_tokens": 9927, + "batch_num_samples": 149, + "batch_num_tokens": 52196, + "epoch": 0.61, + "grad_norm": 2.8188624382019043, + "learning_rate": 1.7547095802227723e-05, + "loss": 2.3289, + "step": 671 + }, + { + "batch_num_effect_tokens": 6864, + "batch_num_samples": 150, + "batch_num_tokens": 52165, + "epoch": 0.61091, + "grad_norm": 3.308123826980591, + "learning_rate": 1.7536676861108167e-05, + "loss": 2.0859, + "step": 672 + }, + { + "batch_num_effect_tokens": 3919, + "batch_num_samples": 149, + "batch_num_tokens": 52167, + "epoch": 0.61182, + "grad_norm": 3.851428270339966, + "learning_rate": 1.752623894641359e-05, + "loss": 1.6075, + "step": 673 + }, + { + "batch_num_effect_tokens": 5389, + "batch_num_samples": 149, + "batch_num_tokens": 52147, + "epoch": 0.61273, + "grad_norm": 3.3875057697296143, + "learning_rate": 1.7515782084421426e-05, + "loss": 1.3186, + "step": 674 + }, + { + "batch_num_effect_tokens": 4684, + "batch_num_samples": 149, + "batch_num_tokens": 52124, + "epoch": 0.61364, + "grad_norm": 3.598844289779663, + "learning_rate": 1.7505306301456823e-05, + "loss": 1.5316, + "step": 675 + }, + { + "batch_num_effect_tokens": 7285, + "batch_num_samples": 149, + "batch_num_tokens": 52187, + "epoch": 0.61455, + "grad_norm": 3.2655041217803955, + "learning_rate": 1.7494811623892543e-05, + "loss": 1.7023, + "step": 676 + }, + { + "batch_num_effect_tokens": 5171, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 0.61545, + "grad_norm": 3.2970337867736816, + "learning_rate": 1.7484298078148926e-05, + "loss": 1.5864, + "step": 677 + }, + { + "batch_num_effect_tokens": 8958, + "batch_num_samples": 149, + "batch_num_tokens": 52132, + "epoch": 0.61636, + "grad_norm": 3.159494638442993, + "learning_rate": 1.7473765690693812e-05, + "loss": 2.0336, + "step": 678 + }, + { + "batch_num_effect_tokens": 7201, + "batch_num_samples": 150, + "batch_num_tokens": 52144, + "epoch": 0.61727, + "grad_norm": 3.675417184829712, + "learning_rate": 1.7463214488042472e-05, + "loss": 2.2643, + "step": 679 + }, + { + "batch_num_effect_tokens": 5768, + "batch_num_samples": 149, + "batch_num_tokens": 52207, + "epoch": 0.61818, + "grad_norm": 3.556011199951172, + "learning_rate": 1.745264449675755e-05, + "loss": 1.975, + "step": 680 + }, + { + "batch_num_effect_tokens": 6721, + "batch_num_samples": 150, + "batch_num_tokens": 52177, + "epoch": 0.61909, + "grad_norm": 3.3262288570404053, + "learning_rate": 1.744205574344898e-05, + "loss": 2.1309, + "step": 681 + }, + { + "batch_num_effect_tokens": 5329, + "batch_num_samples": 149, + "batch_num_tokens": 52115, + "epoch": 0.62, + "grad_norm": 3.306814193725586, + "learning_rate": 1.7431448254773943e-05, + "loss": 1.461, + "step": 682 + }, + { + "batch_num_effect_tokens": 6465, + "batch_num_samples": 149, + "batch_num_tokens": 52212, + "epoch": 0.62091, + "grad_norm": 2.9740865230560303, + "learning_rate": 1.7420822057436777e-05, + "loss": 1.4722, + "step": 683 + }, + { + "batch_num_effect_tokens": 4547, + "batch_num_samples": 149, + "batch_num_tokens": 52135, + "epoch": 0.62182, + "grad_norm": 3.394707679748535, + "learning_rate": 1.7410177178188917e-05, + "loss": 1.2162, + "step": 684 + }, + { + "batch_num_effect_tokens": 5792, + "batch_num_samples": 149, + "batch_num_tokens": 52221, + "epoch": 0.62273, + "grad_norm": 3.283432722091675, + "learning_rate": 1.739951364382884e-05, + "loss": 1.5428, + "step": 685 + }, + { + "batch_num_effect_tokens": 6506, + "batch_num_samples": 150, + "batch_num_tokens": 52187, + "epoch": 0.62364, + "grad_norm": 3.075127601623535, + "learning_rate": 1.738883148120198e-05, + "loss": 2.0264, + "step": 686 + }, + { + "batch_num_effect_tokens": 7452, + "batch_num_samples": 149, + "batch_num_tokens": 52222, + "epoch": 0.62455, + "grad_norm": 3.2405524253845215, + "learning_rate": 1.737813071720066e-05, + "loss": 1.8857, + "step": 687 + }, + { + "batch_num_effect_tokens": 6921, + "batch_num_samples": 149, + "batch_num_tokens": 52177, + "epoch": 0.62545, + "grad_norm": 3.0870354175567627, + "learning_rate": 1.736741137876405e-05, + "loss": 1.9788, + "step": 688 + }, + { + "batch_num_effect_tokens": 6641, + "batch_num_samples": 149, + "batch_num_tokens": 52152, + "epoch": 0.62636, + "grad_norm": 3.2581539154052734, + "learning_rate": 1.7356673492878073e-05, + "loss": 1.8177, + "step": 689 + }, + { + "batch_num_effect_tokens": 7005, + "batch_num_samples": 149, + "batch_num_tokens": 52209, + "epoch": 0.62727, + "grad_norm": 3.6070356369018555, + "learning_rate": 1.734591708657533e-05, + "loss": 2.2334, + "step": 690 + }, + { + "batch_num_effect_tokens": 5194, + "batch_num_samples": 149, + "batch_num_tokens": 52150, + "epoch": 0.62818, + "grad_norm": 3.5002195835113525, + "learning_rate": 1.7335142186935083e-05, + "loss": 1.6752, + "step": 691 + }, + { + "batch_num_effect_tokens": 4651, + "batch_num_samples": 149, + "batch_num_tokens": 52198, + "epoch": 0.62909, + "grad_norm": 3.570697784423828, + "learning_rate": 1.732434882108311e-05, + "loss": 1.7339, + "step": 692 + }, + { + "batch_num_effect_tokens": 6218, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 0.63, + "grad_norm": 3.5183212757110596, + "learning_rate": 1.7313537016191706e-05, + "loss": 1.883, + "step": 693 + }, + { + "batch_num_effect_tokens": 8700, + "batch_num_samples": 149, + "batch_num_tokens": 52146, + "epoch": 0.63091, + "grad_norm": 3.099947690963745, + "learning_rate": 1.7302706799479575e-05, + "loss": 2.0278, + "step": 694 + }, + { + "batch_num_effect_tokens": 5236, + "batch_num_samples": 149, + "batch_num_tokens": 52163, + "epoch": 0.63182, + "grad_norm": 3.500469207763672, + "learning_rate": 1.7291858198211772e-05, + "loss": 1.52, + "step": 695 + }, + { + "batch_num_effect_tokens": 6232, + "batch_num_samples": 149, + "batch_num_tokens": 52131, + "epoch": 0.63273, + "grad_norm": 3.528430938720703, + "learning_rate": 1.7280991239699643e-05, + "loss": 2.1318, + "step": 696 + }, + { + "batch_num_effect_tokens": 5888, + "batch_num_samples": 149, + "batch_num_tokens": 52152, + "epoch": 0.63364, + "grad_norm": 3.3069233894348145, + "learning_rate": 1.727010595130074e-05, + "loss": 1.8845, + "step": 697 + }, + { + "batch_num_effect_tokens": 8851, + "batch_num_samples": 149, + "batch_num_tokens": 50559, + "epoch": 0.63455, + "grad_norm": 2.9502975940704346, + "learning_rate": 1.7259202360418765e-05, + "loss": 2.3618, + "step": 698 + }, + { + "batch_num_effect_tokens": 6625, + "batch_num_samples": 149, + "batch_num_tokens": 52159, + "epoch": 0.63545, + "grad_norm": 3.252659320831299, + "learning_rate": 1.724828049450349e-05, + "loss": 2.0632, + "step": 699 + }, + { + "batch_num_effect_tokens": 3409, + "batch_num_samples": 149, + "batch_num_tokens": 52219, + "epoch": 0.63636, + "grad_norm": 3.150601863861084, + "learning_rate": 1.72373403810507e-05, + "loss": 1.1395, + "step": 700 + }, + { + "batch_num_effect_tokens": 4073, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.63727, + "grad_norm": 3.0718331336975098, + "learning_rate": 1.722638204760213e-05, + "loss": 1.2115, + "step": 701 + }, + { + "batch_num_effect_tokens": 7952, + "batch_num_samples": 149, + "batch_num_tokens": 52138, + "epoch": 0.63818, + "grad_norm": 3.1261394023895264, + "learning_rate": 1.7215405521745358e-05, + "loss": 1.9513, + "step": 702 + }, + { + "batch_num_effect_tokens": 10026, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 0.63909, + "grad_norm": 2.8539605140686035, + "learning_rate": 1.7204410831113778e-05, + "loss": 2.3353, + "step": 703 + }, + { + "batch_num_effect_tokens": 8168, + "batch_num_samples": 149, + "batch_num_tokens": 52215, + "epoch": 0.64, + "grad_norm": 3.016489267349243, + "learning_rate": 1.7193398003386514e-05, + "loss": 2.254, + "step": 704 + }, + { + "batch_num_effect_tokens": 6234, + "batch_num_samples": 149, + "batch_num_tokens": 52185, + "epoch": 0.64091, + "grad_norm": 3.294809103012085, + "learning_rate": 1.7182367066288344e-05, + "loss": 2.1232, + "step": 705 + }, + { + "batch_num_effect_tokens": 9518, + "batch_num_samples": 149, + "batch_num_tokens": 52161, + "epoch": 0.64182, + "grad_norm": 2.9158883094787598, + "learning_rate": 1.7171318047589637e-05, + "loss": 2.1875, + "step": 706 + }, + { + "batch_num_effect_tokens": 5380, + "batch_num_samples": 149, + "batch_num_tokens": 52118, + "epoch": 0.64273, + "grad_norm": 3.651305913925171, + "learning_rate": 1.7160250975106286e-05, + "loss": 1.8801, + "step": 707 + }, + { + "batch_num_effect_tokens": 6389, + "batch_num_samples": 149, + "batch_num_tokens": 50559, + "epoch": 0.64364, + "grad_norm": 3.2613284587860107, + "learning_rate": 1.7149165876699635e-05, + "loss": 1.9556, + "step": 708 + }, + { + "batch_num_effect_tokens": 5791, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 0.64455, + "grad_norm": 3.0628511905670166, + "learning_rate": 1.7138062780276404e-05, + "loss": 1.6504, + "step": 709 + }, + { + "batch_num_effect_tokens": 5752, + "batch_num_samples": 149, + "batch_num_tokens": 52162, + "epoch": 0.64545, + "grad_norm": 3.2969539165496826, + "learning_rate": 1.7126941713788633e-05, + "loss": 2.0699, + "step": 710 + }, + { + "batch_num_effect_tokens": 5496, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 0.64636, + "grad_norm": 3.300597667694092, + "learning_rate": 1.7115802705233576e-05, + "loss": 1.7136, + "step": 711 + }, + { + "batch_num_effect_tokens": 6962, + "batch_num_samples": 149, + "batch_num_tokens": 52150, + "epoch": 0.64727, + "grad_norm": 3.275751829147339, + "learning_rate": 1.710464578265369e-05, + "loss": 2.0422, + "step": 712 + }, + { + "batch_num_effect_tokens": 5475, + "batch_num_samples": 149, + "batch_num_tokens": 52220, + "epoch": 0.64818, + "grad_norm": 3.213657855987549, + "learning_rate": 1.7093470974136505e-05, + "loss": 1.8713, + "step": 713 + }, + { + "batch_num_effect_tokens": 7187, + "batch_num_samples": 149, + "batch_num_tokens": 52118, + "epoch": 0.64909, + "grad_norm": 3.5997188091278076, + "learning_rate": 1.7082278307814593e-05, + "loss": 1.8454, + "step": 714 + }, + { + "batch_num_effect_tokens": 6372, + "batch_num_samples": 149, + "batch_num_tokens": 52135, + "epoch": 0.65, + "grad_norm": 3.3948583602905273, + "learning_rate": 1.7071067811865477e-05, + "loss": 2.0798, + "step": 715 + }, + { + "batch_num_effect_tokens": 5744, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 0.65091, + "grad_norm": 3.3523178100585938, + "learning_rate": 1.7059839514511565e-05, + "loss": 1.8491, + "step": 716 + }, + { + "batch_num_effect_tokens": 6495, + "batch_num_samples": 149, + "batch_num_tokens": 52147, + "epoch": 0.65182, + "grad_norm": 3.0404746532440186, + "learning_rate": 1.7048593444020084e-05, + "loss": 1.8901, + "step": 717 + }, + { + "batch_num_effect_tokens": 4383, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 0.65273, + "grad_norm": 3.1420235633850098, + "learning_rate": 1.7037329628703005e-05, + "loss": 1.4262, + "step": 718 + }, + { + "batch_num_effect_tokens": 8547, + "batch_num_samples": 150, + "batch_num_tokens": 52215, + "epoch": 0.65364, + "grad_norm": 2.8673055171966553, + "learning_rate": 1.702604809691697e-05, + "loss": 2.0797, + "step": 719 + }, + { + "batch_num_effect_tokens": 7211, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 0.65455, + "grad_norm": 3.111631155014038, + "learning_rate": 1.7014748877063212e-05, + "loss": 1.7795, + "step": 720 + }, + { + "batch_num_effect_tokens": 5651, + "batch_num_samples": 149, + "batch_num_tokens": 52168, + "epoch": 0.65545, + "grad_norm": 3.360675573348999, + "learning_rate": 1.7003431997587516e-05, + "loss": 1.6566, + "step": 721 + }, + { + "batch_num_effect_tokens": 5821, + "batch_num_samples": 149, + "batch_num_tokens": 52125, + "epoch": 0.65636, + "grad_norm": 3.098942995071411, + "learning_rate": 1.6992097486980107e-05, + "loss": 1.5144, + "step": 722 + }, + { + "batch_num_effect_tokens": 7546, + "batch_num_samples": 149, + "batch_num_tokens": 52209, + "epoch": 0.65727, + "grad_norm": 3.1112258434295654, + "learning_rate": 1.6980745373775604e-05, + "loss": 2.0011, + "step": 723 + }, + { + "batch_num_effect_tokens": 5119, + "batch_num_samples": 149, + "batch_num_tokens": 52145, + "epoch": 0.65818, + "grad_norm": 3.3119330406188965, + "learning_rate": 1.696937568655294e-05, + "loss": 1.4487, + "step": 724 + }, + { + "batch_num_effect_tokens": 6011, + "batch_num_samples": 149, + "batch_num_tokens": 52141, + "epoch": 0.65909, + "grad_norm": 3.974515199661255, + "learning_rate": 1.6957988453935276e-05, + "loss": 2.169, + "step": 725 + }, + { + "batch_num_effect_tokens": 5712, + "batch_num_samples": 149, + "batch_num_tokens": 52216, + "epoch": 0.66, + "grad_norm": 3.2805824279785156, + "learning_rate": 1.6946583704589973e-05, + "loss": 1.5914, + "step": 726 + }, + { + "batch_num_effect_tokens": 5982, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 0.66091, + "grad_norm": 3.1364593505859375, + "learning_rate": 1.6935161467228466e-05, + "loss": 1.4481, + "step": 727 + }, + { + "batch_num_effect_tokens": 4891, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 0.66182, + "grad_norm": 3.0658605098724365, + "learning_rate": 1.692372177060623e-05, + "loss": 1.5435, + "step": 728 + }, + { + "batch_num_effect_tokens": 6283, + "batch_num_samples": 149, + "batch_num_tokens": 52121, + "epoch": 0.66273, + "grad_norm": 4.253247261047363, + "learning_rate": 1.691226464352268e-05, + "loss": 1.808, + "step": 729 + }, + { + "batch_num_effect_tokens": 7333, + "batch_num_samples": 149, + "batch_num_tokens": 52160, + "epoch": 0.66364, + "grad_norm": 3.105001211166382, + "learning_rate": 1.6900790114821122e-05, + "loss": 2.1523, + "step": 730 + }, + { + "batch_num_effect_tokens": 6461, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 0.66455, + "grad_norm": 3.2539286613464355, + "learning_rate": 1.688929821338867e-05, + "loss": 1.9806, + "step": 731 + }, + { + "batch_num_effect_tokens": 7060, + "batch_num_samples": 149, + "batch_num_tokens": 52174, + "epoch": 0.66545, + "grad_norm": 3.383930206298828, + "learning_rate": 1.6877788968156172e-05, + "loss": 1.7445, + "step": 732 + }, + { + "batch_num_effect_tokens": 6503, + "batch_num_samples": 149, + "batch_num_tokens": 52110, + "epoch": 0.66636, + "grad_norm": 3.3413214683532715, + "learning_rate": 1.6866262408098134e-05, + "loss": 1.8225, + "step": 733 + }, + { + "batch_num_effect_tokens": 8637, + "batch_num_samples": 149, + "batch_num_tokens": 52181, + "epoch": 0.66727, + "grad_norm": 2.968226671218872, + "learning_rate": 1.685471856223267e-05, + "loss": 2.0853, + "step": 734 + }, + { + "batch_num_effect_tokens": 11260, + "batch_num_samples": 149, + "batch_num_tokens": 52188, + "epoch": 0.66818, + "grad_norm": 2.7590177059173584, + "learning_rate": 1.6843157459621386e-05, + "loss": 2.2001, + "step": 735 + }, + { + "batch_num_effect_tokens": 7229, + "batch_num_samples": 149, + "batch_num_tokens": 52158, + "epoch": 0.66909, + "grad_norm": 3.267742872238159, + "learning_rate": 1.6831579129369347e-05, + "loss": 1.9719, + "step": 736 + }, + { + "batch_num_effect_tokens": 7764, + "batch_num_samples": 149, + "batch_num_tokens": 52177, + "epoch": 0.67, + "grad_norm": 3.2558839321136475, + "learning_rate": 1.6819983600624986e-05, + "loss": 1.9983, + "step": 737 + }, + { + "batch_num_effect_tokens": 7576, + "batch_num_samples": 149, + "batch_num_tokens": 52177, + "epoch": 0.67091, + "grad_norm": 3.4332239627838135, + "learning_rate": 1.6808370902580034e-05, + "loss": 2.0056, + "step": 738 + }, + { + "batch_num_effect_tokens": 4863, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 0.67182, + "grad_norm": 3.064371347427368, + "learning_rate": 1.6796741064469446e-05, + "loss": 1.2753, + "step": 739 + }, + { + "batch_num_effect_tokens": 7655, + "batch_num_samples": 149, + "batch_num_tokens": 52159, + "epoch": 0.67273, + "grad_norm": 3.035062074661255, + "learning_rate": 1.6785094115571323e-05, + "loss": 1.9743, + "step": 740 + }, + { + "batch_num_effect_tokens": 6245, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.67364, + "grad_norm": 3.305992841720581, + "learning_rate": 1.677343008520685e-05, + "loss": 1.8482, + "step": 741 + }, + { + "batch_num_effect_tokens": 5362, + "batch_num_samples": 149, + "batch_num_tokens": 52142, + "epoch": 0.67455, + "grad_norm": 8.169853210449219, + "learning_rate": 1.6761749002740195e-05, + "loss": 2.1479, + "step": 742 + }, + { + "batch_num_effect_tokens": 7209, + "batch_num_samples": 149, + "batch_num_tokens": 52190, + "epoch": 0.67545, + "grad_norm": 3.5025248527526855, + "learning_rate": 1.6750050897578484e-05, + "loss": 2.3072, + "step": 743 + }, + { + "batch_num_effect_tokens": 4681, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.67636, + "grad_norm": 3.109935760498047, + "learning_rate": 1.673833579917168e-05, + "loss": 1.4393, + "step": 744 + }, + { + "batch_num_effect_tokens": 5669, + "batch_num_samples": 149, + "batch_num_tokens": 52122, + "epoch": 0.67727, + "grad_norm": 3.120631694793701, + "learning_rate": 1.6726603737012527e-05, + "loss": 1.6593, + "step": 745 + }, + { + "batch_num_effect_tokens": 6840, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 0.67818, + "grad_norm": 3.159054756164551, + "learning_rate": 1.6714854740636477e-05, + "loss": 2.0482, + "step": 746 + }, + { + "batch_num_effect_tokens": 9810, + "batch_num_samples": 149, + "batch_num_tokens": 52170, + "epoch": 0.67909, + "grad_norm": 3.1278321743011475, + "learning_rate": 1.6703088839621616e-05, + "loss": 2.3994, + "step": 747 + }, + { + "batch_num_effect_tokens": 7824, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.68, + "grad_norm": 3.290961980819702, + "learning_rate": 1.6691306063588583e-05, + "loss": 2.0672, + "step": 748 + }, + { + "batch_num_effect_tokens": 5326, + "batch_num_samples": 149, + "batch_num_tokens": 52183, + "epoch": 0.68091, + "grad_norm": 3.196922540664673, + "learning_rate": 1.6679506442200508e-05, + "loss": 2.0027, + "step": 749 + }, + { + "batch_num_effect_tokens": 4276, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 0.68182, + "grad_norm": 3.541163444519043, + "learning_rate": 1.666769000516292e-05, + "loss": 1.5725, + "step": 750 + }, + { + "batch_num_effect_tokens": 8160, + "batch_num_samples": 149, + "batch_num_tokens": 52168, + "epoch": 0.68273, + "grad_norm": 2.104133129119873, + "learning_rate": 1.6655856782223682e-05, + "loss": 1.1344, + "step": 751 + }, + { + "batch_num_effect_tokens": 8092, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 0.68364, + "grad_norm": 3.19911789894104, + "learning_rate": 1.6644006803172926e-05, + "loss": 2.2778, + "step": 752 + }, + { + "batch_num_effect_tokens": 8542, + "batch_num_samples": 149, + "batch_num_tokens": 52192, + "epoch": 0.68455, + "grad_norm": 3.007779121398926, + "learning_rate": 1.6632140097842953e-05, + "loss": 2.2804, + "step": 753 + }, + { + "batch_num_effect_tokens": 7931, + "batch_num_samples": 149, + "batch_num_tokens": 52112, + "epoch": 0.68545, + "grad_norm": 3.221501111984253, + "learning_rate": 1.6620256696108187e-05, + "loss": 2.1006, + "step": 754 + }, + { + "batch_num_effect_tokens": 6048, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.68636, + "grad_norm": 3.1645705699920654, + "learning_rate": 1.660835662788507e-05, + "loss": 2.0649, + "step": 755 + }, + { + "batch_num_effect_tokens": 6160, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 0.68727, + "grad_norm": 3.147974729537964, + "learning_rate": 1.6596439923132016e-05, + "loss": 1.666, + "step": 756 + }, + { + "batch_num_effect_tokens": 5991, + "batch_num_samples": 149, + "batch_num_tokens": 52147, + "epoch": 0.68818, + "grad_norm": 3.1017847061157227, + "learning_rate": 1.6584506611849313e-05, + "loss": 1.7526, + "step": 757 + }, + { + "batch_num_effect_tokens": 7163, + "batch_num_samples": 149, + "batch_num_tokens": 52189, + "epoch": 0.68909, + "grad_norm": 3.19863224029541, + "learning_rate": 1.6572556724079055e-05, + "loss": 1.8921, + "step": 758 + }, + { + "batch_num_effect_tokens": 7250, + "batch_num_samples": 149, + "batch_num_tokens": 52198, + "epoch": 0.69, + "grad_norm": 3.080247640609741, + "learning_rate": 1.6560590289905074e-05, + "loss": 1.9408, + "step": 759 + }, + { + "batch_num_effect_tokens": 9067, + "batch_num_samples": 149, + "batch_num_tokens": 52215, + "epoch": 0.69091, + "grad_norm": 3.105419874191284, + "learning_rate": 1.6548607339452853e-05, + "loss": 2.2802, + "step": 760 + }, + { + "batch_num_effect_tokens": 6394, + "batch_num_samples": 150, + "batch_num_tokens": 52155, + "epoch": 0.69182, + "grad_norm": 3.3251821994781494, + "learning_rate": 1.6536607902889453e-05, + "loss": 2.0261, + "step": 761 + }, + { + "batch_num_effect_tokens": 5741, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 0.69273, + "grad_norm": 3.2268760204315186, + "learning_rate": 1.6524592010423444e-05, + "loss": 1.6041, + "step": 762 + }, + { + "batch_num_effect_tokens": 6934, + "batch_num_samples": 149, + "batch_num_tokens": 52218, + "epoch": 0.69364, + "grad_norm": 3.1550705432891846, + "learning_rate": 1.651255969230482e-05, + "loss": 1.6185, + "step": 763 + }, + { + "batch_num_effect_tokens": 9521, + "batch_num_samples": 149, + "batch_num_tokens": 52167, + "epoch": 0.69455, + "grad_norm": 2.332301616668701, + "learning_rate": 1.6500510978824928e-05, + "loss": 1.3868, + "step": 764 + }, + { + "batch_num_effect_tokens": 5507, + "batch_num_samples": 149, + "batch_num_tokens": 52167, + "epoch": 0.69545, + "grad_norm": 3.3931100368499756, + "learning_rate": 1.6488445900316388e-05, + "loss": 1.7854, + "step": 765 + }, + { + "batch_num_effect_tokens": 5535, + "batch_num_samples": 149, + "batch_num_tokens": 52136, + "epoch": 0.69636, + "grad_norm": 3.425886392593384, + "learning_rate": 1.6476364487153024e-05, + "loss": 1.8349, + "step": 766 + }, + { + "batch_num_effect_tokens": 7539, + "batch_num_samples": 149, + "batch_num_tokens": 52216, + "epoch": 0.69727, + "grad_norm": 3.0860488414764404, + "learning_rate": 1.6464266769749774e-05, + "loss": 2.1777, + "step": 767 + }, + { + "batch_num_effect_tokens": 4574, + "batch_num_samples": 149, + "batch_num_tokens": 52173, + "epoch": 0.69818, + "grad_norm": 3.5921261310577393, + "learning_rate": 1.6452152778562633e-05, + "loss": 1.6029, + "step": 768 + }, + { + "batch_num_effect_tokens": 7124, + "batch_num_samples": 149, + "batch_num_tokens": 52142, + "epoch": 0.69909, + "grad_norm": 2.9657366275787354, + "learning_rate": 1.6440022544088553e-05, + "loss": 1.6801, + "step": 769 + }, + { + "batch_num_effect_tokens": 7162, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 0.7, + "grad_norm": 3.191507577896118, + "learning_rate": 1.6427876096865394e-05, + "loss": 2.0602, + "step": 770 + }, + { + "batch_num_effect_tokens": 7288, + "batch_num_samples": 149, + "batch_num_tokens": 52177, + "epoch": 0.70091, + "grad_norm": 3.031268358230591, + "learning_rate": 1.6415713467471817e-05, + "loss": 2.1469, + "step": 771 + }, + { + "batch_num_effect_tokens": 4286, + "batch_num_samples": 149, + "batch_num_tokens": 52207, + "epoch": 0.70182, + "grad_norm": 3.0480797290802, + "learning_rate": 1.6403534686527223e-05, + "loss": 1.1808, + "step": 772 + }, + { + "batch_num_effect_tokens": 10561, + "batch_num_samples": 149, + "batch_num_tokens": 52198, + "epoch": 0.70273, + "grad_norm": 2.8739101886749268, + "learning_rate": 1.6391339784691685e-05, + "loss": 2.2059, + "step": 773 + }, + { + "batch_num_effect_tokens": 6132, + "batch_num_samples": 149, + "batch_num_tokens": 52181, + "epoch": 0.70364, + "grad_norm": 3.1238291263580322, + "learning_rate": 1.6379128792665853e-05, + "loss": 1.7822, + "step": 774 + }, + { + "batch_num_effect_tokens": 5755, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 0.70455, + "grad_norm": 3.3250069618225098, + "learning_rate": 1.6366901741190885e-05, + "loss": 1.8115, + "step": 775 + }, + { + "batch_num_effect_tokens": 5908, + "batch_num_samples": 149, + "batch_num_tokens": 52183, + "epoch": 0.70545, + "grad_norm": 4.2498393058776855, + "learning_rate": 1.6354658661048364e-05, + "loss": 1.6392, + "step": 776 + }, + { + "batch_num_effect_tokens": 6284, + "batch_num_samples": 149, + "batch_num_tokens": 52205, + "epoch": 0.70636, + "grad_norm": 3.3605003356933594, + "learning_rate": 1.6342399583060234e-05, + "loss": 2.0054, + "step": 777 + }, + { + "batch_num_effect_tokens": 7090, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 0.70727, + "grad_norm": 3.663482666015625, + "learning_rate": 1.6330124538088705e-05, + "loss": 1.6892, + "step": 778 + }, + { + "batch_num_effect_tokens": 5537, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 0.70818, + "grad_norm": 3.340763568878174, + "learning_rate": 1.6317833557036193e-05, + "loss": 1.5238, + "step": 779 + }, + { + "batch_num_effect_tokens": 7954, + "batch_num_samples": 149, + "batch_num_tokens": 52168, + "epoch": 0.70909, + "grad_norm": 2.9226419925689697, + "learning_rate": 1.6305526670845225e-05, + "loss": 1.8049, + "step": 780 + }, + { + "batch_num_effect_tokens": 5450, + "batch_num_samples": 149, + "batch_num_tokens": 52187, + "epoch": 0.71, + "grad_norm": 3.6275925636291504, + "learning_rate": 1.6293203910498375e-05, + "loss": 1.8456, + "step": 781 + }, + { + "batch_num_effect_tokens": 7314, + "batch_num_samples": 149, + "batch_num_tokens": 52189, + "epoch": 0.71091, + "grad_norm": 3.454686403274536, + "learning_rate": 1.6280865307018177e-05, + "loss": 2.1397, + "step": 782 + }, + { + "batch_num_effect_tokens": 7268, + "batch_num_samples": 149, + "batch_num_tokens": 52214, + "epoch": 0.71182, + "grad_norm": 3.575932741165161, + "learning_rate": 1.6268510891467048e-05, + "loss": 2.07, + "step": 783 + }, + { + "batch_num_effect_tokens": 7145, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 0.71273, + "grad_norm": 3.2153611183166504, + "learning_rate": 1.6256140694947217e-05, + "loss": 1.788, + "step": 784 + }, + { + "batch_num_effect_tokens": 9017, + "batch_num_samples": 149, + "batch_num_tokens": 52163, + "epoch": 0.71364, + "grad_norm": 2.9733734130859375, + "learning_rate": 1.6243754748600637e-05, + "loss": 2.1755, + "step": 785 + }, + { + "batch_num_effect_tokens": 6388, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.71455, + "grad_norm": 3.064711809158325, + "learning_rate": 1.623135308360891e-05, + "loss": 1.6803, + "step": 786 + }, + { + "batch_num_effect_tokens": 6036, + "batch_num_samples": 149, + "batch_num_tokens": 52180, + "epoch": 0.71545, + "grad_norm": 3.391117811203003, + "learning_rate": 1.6218935731193223e-05, + "loss": 2.102, + "step": 787 + }, + { + "batch_num_effect_tokens": 9903, + "batch_num_samples": 149, + "batch_num_tokens": 52142, + "epoch": 0.71636, + "grad_norm": 2.822171449661255, + "learning_rate": 1.620650272261424e-05, + "loss": 2.1866, + "step": 788 + }, + { + "batch_num_effect_tokens": 6496, + "batch_num_samples": 149, + "batch_num_tokens": 52187, + "epoch": 0.71727, + "grad_norm": 3.41025972366333, + "learning_rate": 1.6194054089172043e-05, + "loss": 2.017, + "step": 789 + }, + { + "batch_num_effect_tokens": 6931, + "batch_num_samples": 149, + "batch_num_tokens": 52190, + "epoch": 0.71818, + "grad_norm": 3.487778425216675, + "learning_rate": 1.6181589862206053e-05, + "loss": 1.9893, + "step": 790 + }, + { + "batch_num_effect_tokens": 4613, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 0.71909, + "grad_norm": 2.997297763824463, + "learning_rate": 1.616911007309495e-05, + "loss": 1.387, + "step": 791 + }, + { + "batch_num_effect_tokens": 6661, + "batch_num_samples": 149, + "batch_num_tokens": 52157, + "epoch": 0.72, + "grad_norm": 3.055971384048462, + "learning_rate": 1.6156614753256583e-05, + "loss": 1.896, + "step": 792 + }, + { + "batch_num_effect_tokens": 5808, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 0.72091, + "grad_norm": 3.1818058490753174, + "learning_rate": 1.614410393414791e-05, + "loss": 1.488, + "step": 793 + }, + { + "batch_num_effect_tokens": 7035, + "batch_num_samples": 149, + "batch_num_tokens": 52183, + "epoch": 0.72182, + "grad_norm": 3.7383460998535156, + "learning_rate": 1.6131577647264903e-05, + "loss": 2.0533, + "step": 794 + }, + { + "batch_num_effect_tokens": 5100, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 0.72273, + "grad_norm": 4.117101192474365, + "learning_rate": 1.6119035924142468e-05, + "loss": 1.4298, + "step": 795 + }, + { + "batch_num_effect_tokens": 6566, + "batch_num_samples": 149, + "batch_num_tokens": 52139, + "epoch": 0.72364, + "grad_norm": 3.8849780559539795, + "learning_rate": 1.6106478796354382e-05, + "loss": 1.943, + "step": 796 + }, + { + "batch_num_effect_tokens": 6001, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 0.72455, + "grad_norm": 3.643822193145752, + "learning_rate": 1.6093906295513202e-05, + "loss": 1.5991, + "step": 797 + }, + { + "batch_num_effect_tokens": 5333, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 0.72545, + "grad_norm": 3.197996139526367, + "learning_rate": 1.608131845327018e-05, + "loss": 1.6382, + "step": 798 + }, + { + "batch_num_effect_tokens": 2904, + "batch_num_samples": 149, + "batch_num_tokens": 52149, + "epoch": 0.72636, + "grad_norm": 3.439863443374634, + "learning_rate": 1.6068715301315195e-05, + "loss": 0.8895, + "step": 799 + }, + { + "batch_num_effect_tokens": 10600, + "batch_num_samples": 149, + "batch_num_tokens": 52188, + "epoch": 0.72727, + "grad_norm": 2.662959337234497, + "learning_rate": 1.6056096871376667e-05, + "loss": 2.3137, + "step": 800 + }, + { + "batch_num_effect_tokens": 5538, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 0.72818, + "grad_norm": 3.3200247287750244, + "learning_rate": 1.604346319522148e-05, + "loss": 1.7092, + "step": 801 + }, + { + "batch_num_effect_tokens": 6844, + "batch_num_samples": 149, + "batch_num_tokens": 52130, + "epoch": 0.72909, + "grad_norm": 3.4481406211853027, + "learning_rate": 1.6030814304654895e-05, + "loss": 1.9916, + "step": 802 + }, + { + "batch_num_effect_tokens": 4004, + "batch_num_samples": 149, + "batch_num_tokens": 52204, + "epoch": 0.73, + "grad_norm": 4.091022491455078, + "learning_rate": 1.6018150231520486e-05, + "loss": 1.253, + "step": 803 + }, + { + "batch_num_effect_tokens": 5369, + "batch_num_samples": 149, + "batch_num_tokens": 52126, + "epoch": 0.73091, + "grad_norm": 3.431154489517212, + "learning_rate": 1.6005471007700033e-05, + "loss": 1.5734, + "step": 804 + }, + { + "batch_num_effect_tokens": 5870, + "batch_num_samples": 149, + "batch_num_tokens": 52146, + "epoch": 0.73182, + "grad_norm": 3.191513776779175, + "learning_rate": 1.599277666511347e-05, + "loss": 1.6448, + "step": 805 + }, + { + "batch_num_effect_tokens": 6880, + "batch_num_samples": 149, + "batch_num_tokens": 52138, + "epoch": 0.73273, + "grad_norm": 19.013717651367188, + "learning_rate": 1.5980067235718793e-05, + "loss": 2.1849, + "step": 806 + }, + { + "batch_num_effect_tokens": 7321, + "batch_num_samples": 149, + "batch_num_tokens": 52155, + "epoch": 0.73364, + "grad_norm": 3.0356898307800293, + "learning_rate": 1.596734275151197e-05, + "loss": 1.6153, + "step": 807 + }, + { + "batch_num_effect_tokens": 4577, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 0.73455, + "grad_norm": 3.775296449661255, + "learning_rate": 1.595460324452688e-05, + "loss": 1.5571, + "step": 808 + }, + { + "batch_num_effect_tokens": 5624, + "batch_num_samples": 149, + "batch_num_tokens": 52221, + "epoch": 0.73545, + "grad_norm": 4.002346038818359, + "learning_rate": 1.5941848746835216e-05, + "loss": 1.9669, + "step": 809 + }, + { + "batch_num_effect_tokens": 5883, + "batch_num_samples": 149, + "batch_num_tokens": 52189, + "epoch": 0.73636, + "grad_norm": 3.727356195449829, + "learning_rate": 1.5929079290546408e-05, + "loss": 1.7648, + "step": 810 + }, + { + "batch_num_effect_tokens": 7143, + "batch_num_samples": 149, + "batch_num_tokens": 52213, + "epoch": 0.73727, + "grad_norm": 3.607943534851074, + "learning_rate": 1.5916294907807547e-05, + "loss": 2.041, + "step": 811 + }, + { + "batch_num_effect_tokens": 5295, + "batch_num_samples": 149, + "batch_num_tokens": 52221, + "epoch": 0.73818, + "grad_norm": 3.056272268295288, + "learning_rate": 1.5903495630803302e-05, + "loss": 1.3723, + "step": 812 + }, + { + "batch_num_effect_tokens": 5226, + "batch_num_samples": 149, + "batch_num_tokens": 52179, + "epoch": 0.73909, + "grad_norm": 3.8680360317230225, + "learning_rate": 1.5890681491755838e-05, + "loss": 1.972, + "step": 813 + }, + { + "batch_num_effect_tokens": 4905, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 0.74, + "grad_norm": 3.7667770385742188, + "learning_rate": 1.5877852522924733e-05, + "loss": 1.6848, + "step": 814 + }, + { + "batch_num_effect_tokens": 6354, + "batch_num_samples": 149, + "batch_num_tokens": 52171, + "epoch": 0.74091, + "grad_norm": 3.2568600177764893, + "learning_rate": 1.5865008756606905e-05, + "loss": 2.0856, + "step": 815 + }, + { + "batch_num_effect_tokens": 2948, + "batch_num_samples": 149, + "batch_num_tokens": 52130, + "epoch": 0.74182, + "grad_norm": 3.1932148933410645, + "learning_rate": 1.585215022513652e-05, + "loss": 0.774, + "step": 816 + }, + { + "batch_num_effect_tokens": 5706, + "batch_num_samples": 149, + "batch_num_tokens": 52111, + "epoch": 0.74273, + "grad_norm": 3.7973036766052246, + "learning_rate": 1.5839276960884906e-05, + "loss": 1.8912, + "step": 817 + }, + { + "batch_num_effect_tokens": 7678, + "batch_num_samples": 149, + "batch_num_tokens": 52145, + "epoch": 0.74364, + "grad_norm": 3.9283089637756348, + "learning_rate": 1.5826388996260503e-05, + "loss": 2.115, + "step": 818 + }, + { + "batch_num_effect_tokens": 8872, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 0.74455, + "grad_norm": 3.237718343734741, + "learning_rate": 1.581348636370874e-05, + "loss": 2.1578, + "step": 819 + }, + { + "batch_num_effect_tokens": 6072, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 0.74545, + "grad_norm": 3.0094594955444336, + "learning_rate": 1.5800569095711983e-05, + "loss": 1.6896, + "step": 820 + }, + { + "batch_num_effect_tokens": 8053, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 0.74636, + "grad_norm": 2.8689441680908203, + "learning_rate": 1.5787637224789434e-05, + "loss": 2.2964, + "step": 821 + }, + { + "batch_num_effect_tokens": 4579, + "batch_num_samples": 149, + "batch_num_tokens": 52220, + "epoch": 0.74727, + "grad_norm": 3.3919966220855713, + "learning_rate": 1.5774690783497066e-05, + "loss": 1.5906, + "step": 822 + }, + { + "batch_num_effect_tokens": 7528, + "batch_num_samples": 149, + "batch_num_tokens": 52141, + "epoch": 0.74818, + "grad_norm": 2.8416075706481934, + "learning_rate": 1.576172980442753e-05, + "loss": 1.7786, + "step": 823 + }, + { + "batch_num_effect_tokens": 6082, + "batch_num_samples": 149, + "batch_num_tokens": 52185, + "epoch": 0.74909, + "grad_norm": 3.191624641418457, + "learning_rate": 1.5748754320210074e-05, + "loss": 1.7562, + "step": 824 + }, + { + "batch_num_effect_tokens": 8173, + "batch_num_samples": 149, + "batch_num_tokens": 52222, + "epoch": 0.75, + "grad_norm": 3.029482364654541, + "learning_rate": 1.573576436351046e-05, + "loss": 2.0347, + "step": 825 + }, + { + "batch_num_effect_tokens": 4003, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 0.75091, + "grad_norm": 3.279954195022583, + "learning_rate": 1.5722759967030898e-05, + "loss": 1.1664, + "step": 826 + }, + { + "batch_num_effect_tokens": 5465, + "batch_num_samples": 149, + "batch_num_tokens": 52124, + "epoch": 0.75182, + "grad_norm": 6.8594183921813965, + "learning_rate": 1.5709741163509934e-05, + "loss": 1.7524, + "step": 827 + }, + { + "batch_num_effect_tokens": 7520, + "batch_num_samples": 149, + "batch_num_tokens": 52153, + "epoch": 0.75273, + "grad_norm": 2.951838731765747, + "learning_rate": 1.569670798572239e-05, + "loss": 1.999, + "step": 828 + }, + { + "batch_num_effect_tokens": 7578, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 0.75364, + "grad_norm": 3.045968770980835, + "learning_rate": 1.5683660466479276e-05, + "loss": 2.1318, + "step": 829 + }, + { + "batch_num_effect_tokens": 5717, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 0.75455, + "grad_norm": 3.512777090072632, + "learning_rate": 1.5670598638627707e-05, + "loss": 2.0365, + "step": 830 + }, + { + "batch_num_effect_tokens": 3331, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.75545, + "grad_norm": 3.2990052700042725, + "learning_rate": 1.565752253505082e-05, + "loss": 1.0083, + "step": 831 + }, + { + "batch_num_effect_tokens": 5666, + "batch_num_samples": 149, + "batch_num_tokens": 52200, + "epoch": 0.75636, + "grad_norm": 3.1069211959838867, + "learning_rate": 1.5644432188667695e-05, + "loss": 1.5099, + "step": 832 + }, + { + "batch_num_effect_tokens": 4232, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 0.75727, + "grad_norm": 3.772783041000366, + "learning_rate": 1.563132763243325e-05, + "loss": 1.4528, + "step": 833 + }, + { + "batch_num_effect_tokens": 5526, + "batch_num_samples": 149, + "batch_num_tokens": 52146, + "epoch": 0.75818, + "grad_norm": 3.1174569129943848, + "learning_rate": 1.56182088993382e-05, + "loss": 1.7541, + "step": 834 + }, + { + "batch_num_effect_tokens": 6759, + "batch_num_samples": 149, + "batch_num_tokens": 52171, + "epoch": 0.75909, + "grad_norm": 3.053814172744751, + "learning_rate": 1.560507602240894e-05, + "loss": 1.8855, + "step": 835 + }, + { + "batch_num_effect_tokens": 4630, + "batch_num_samples": 149, + "batch_num_tokens": 52127, + "epoch": 0.76, + "grad_norm": 3.168306350708008, + "learning_rate": 1.5591929034707468e-05, + "loss": 1.5212, + "step": 836 + }, + { + "batch_num_effect_tokens": 5538, + "batch_num_samples": 150, + "batch_num_tokens": 52168, + "epoch": 0.76091, + "grad_norm": 3.2831947803497314, + "learning_rate": 1.5578767969331315e-05, + "loss": 1.54, + "step": 837 + }, + { + "batch_num_effect_tokens": 6277, + "batch_num_samples": 149, + "batch_num_tokens": 52206, + "epoch": 0.76182, + "grad_norm": 3.1704232692718506, + "learning_rate": 1.5565592859413442e-05, + "loss": 1.3928, + "step": 838 + }, + { + "batch_num_effect_tokens": 5844, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 0.76273, + "grad_norm": 3.5144307613372803, + "learning_rate": 1.555240373812217e-05, + "loss": 2.1053, + "step": 839 + }, + { + "batch_num_effect_tokens": 6697, + "batch_num_samples": 149, + "batch_num_tokens": 52161, + "epoch": 0.76364, + "grad_norm": 3.301517963409424, + "learning_rate": 1.5539200638661106e-05, + "loss": 2.1613, + "step": 840 + }, + { + "batch_num_effect_tokens": 8281, + "batch_num_samples": 149, + "batch_num_tokens": 52139, + "epoch": 0.76455, + "grad_norm": 3.2360894680023193, + "learning_rate": 1.5525983594269026e-05, + "loss": 2.3262, + "step": 841 + }, + { + "batch_num_effect_tokens": 7297, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.76545, + "grad_norm": 3.1873526573181152, + "learning_rate": 1.5512752638219834e-05, + "loss": 2.1013, + "step": 842 + }, + { + "batch_num_effect_tokens": 3934, + "batch_num_samples": 149, + "batch_num_tokens": 52168, + "epoch": 0.76636, + "grad_norm": 3.1415605545043945, + "learning_rate": 1.549950780382244e-05, + "loss": 1.0846, + "step": 843 + }, + { + "batch_num_effect_tokens": 5417, + "batch_num_samples": 149, + "batch_num_tokens": 52137, + "epoch": 0.76727, + "grad_norm": 3.3242571353912354, + "learning_rate": 1.5486249124420702e-05, + "loss": 1.8593, + "step": 844 + }, + { + "batch_num_effect_tokens": 7916, + "batch_num_samples": 149, + "batch_num_tokens": 52171, + "epoch": 0.76818, + "grad_norm": 2.9603023529052734, + "learning_rate": 1.5472976633393325e-05, + "loss": 1.9193, + "step": 845 + }, + { + "batch_num_effect_tokens": 6049, + "batch_num_samples": 149, + "batch_num_tokens": 52214, + "epoch": 0.76909, + "grad_norm": 3.532238721847534, + "learning_rate": 1.5459690364153792e-05, + "loss": 2.0857, + "step": 846 + }, + { + "batch_num_effect_tokens": 7548, + "batch_num_samples": 149, + "batch_num_tokens": 52209, + "epoch": 0.77, + "grad_norm": 3.206171989440918, + "learning_rate": 1.5446390350150272e-05, + "loss": 2.0077, + "step": 847 + }, + { + "batch_num_effect_tokens": 7110, + "batch_num_samples": 149, + "batch_num_tokens": 52166, + "epoch": 0.77091, + "grad_norm": 2.141608715057373, + "learning_rate": 1.5433076624865533e-05, + "loss": 0.8776, + "step": 848 + }, + { + "batch_num_effect_tokens": 6005, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 0.77182, + "grad_norm": 3.2913215160369873, + "learning_rate": 1.541974922181686e-05, + "loss": 1.8563, + "step": 849 + }, + { + "batch_num_effect_tokens": 5249, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 0.77273, + "grad_norm": 4.281654357910156, + "learning_rate": 1.5406408174555978e-05, + "loss": 1.6274, + "step": 850 + }, + { + "batch_num_effect_tokens": 5526, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 0.77364, + "grad_norm": 3.324355363845825, + "learning_rate": 1.5393053516668954e-05, + "loss": 1.8066, + "step": 851 + }, + { + "batch_num_effect_tokens": 9426, + "batch_num_samples": 149, + "batch_num_tokens": 52199, + "epoch": 0.77455, + "grad_norm": 3.0341014862060547, + "learning_rate": 1.5379685281776125e-05, + "loss": 2.1172, + "step": 852 + }, + { + "batch_num_effect_tokens": 6460, + "batch_num_samples": 149, + "batch_num_tokens": 52082, + "epoch": 0.77545, + "grad_norm": 3.261878490447998, + "learning_rate": 1.536630350353201e-05, + "loss": 1.4555, + "step": 853 + }, + { + "batch_num_effect_tokens": 6866, + "batch_num_samples": 149, + "batch_num_tokens": 52185, + "epoch": 0.77636, + "grad_norm": 3.649355888366699, + "learning_rate": 1.5352908215625215e-05, + "loss": 2.0898, + "step": 854 + }, + { + "batch_num_effect_tokens": 5492, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 0.77727, + "grad_norm": 3.3275415897369385, + "learning_rate": 1.5339499451778363e-05, + "loss": 1.2897, + "step": 855 + }, + { + "batch_num_effect_tokens": 7780, + "batch_num_samples": 149, + "batch_num_tokens": 52116, + "epoch": 0.77818, + "grad_norm": 3.252866744995117, + "learning_rate": 1.5326077245747998e-05, + "loss": 2.2499, + "step": 856 + }, + { + "batch_num_effect_tokens": 4070, + "batch_num_samples": 149, + "batch_num_tokens": 52220, + "epoch": 0.77909, + "grad_norm": 3.7300028800964355, + "learning_rate": 1.5312641631324513e-05, + "loss": 1.4009, + "step": 857 + }, + { + "batch_num_effect_tokens": 5965, + "batch_num_samples": 149, + "batch_num_tokens": 52178, + "epoch": 0.78, + "grad_norm": 3.201202630996704, + "learning_rate": 1.529919264233205e-05, + "loss": 1.9491, + "step": 858 + }, + { + "batch_num_effect_tokens": 7213, + "batch_num_samples": 149, + "batch_num_tokens": 52189, + "epoch": 0.78091, + "grad_norm": 3.3597118854522705, + "learning_rate": 1.528573031262842e-05, + "loss": 1.6224, + "step": 859 + }, + { + "batch_num_effect_tokens": 5073, + "batch_num_samples": 149, + "batch_num_tokens": 52134, + "epoch": 0.78182, + "grad_norm": 3.368971824645996, + "learning_rate": 1.5272254676105026e-05, + "loss": 1.4534, + "step": 860 + }, + { + "batch_num_effect_tokens": 6545, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.78273, + "grad_norm": 3.2033519744873047, + "learning_rate": 1.5258765766686762e-05, + "loss": 2.1782, + "step": 861 + }, + { + "batch_num_effect_tokens": 6338, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 0.78364, + "grad_norm": 2.733912944793701, + "learning_rate": 1.5245263618331944e-05, + "loss": 1.6437, + "step": 862 + }, + { + "batch_num_effect_tokens": 4746, + "batch_num_samples": 149, + "batch_num_tokens": 50509, + "epoch": 0.78455, + "grad_norm": 3.5325381755828857, + "learning_rate": 1.5231748265032216e-05, + "loss": 1.7903, + "step": 863 + }, + { + "batch_num_effect_tokens": 5785, + "batch_num_samples": 149, + "batch_num_tokens": 52165, + "epoch": 0.78545, + "grad_norm": 3.381405830383301, + "learning_rate": 1.521821974081246e-05, + "loss": 2.1012, + "step": 864 + }, + { + "batch_num_effect_tokens": 8847, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.78636, + "grad_norm": 2.947990655899048, + "learning_rate": 1.5204678079730724e-05, + "loss": 2.1991, + "step": 865 + }, + { + "batch_num_effect_tokens": 6221, + "batch_num_samples": 149, + "batch_num_tokens": 52133, + "epoch": 0.78727, + "grad_norm": 3.320669174194336, + "learning_rate": 1.5191123315878123e-05, + "loss": 1.6411, + "step": 866 + }, + { + "batch_num_effect_tokens": 4073, + "batch_num_samples": 149, + "batch_num_tokens": 52209, + "epoch": 0.78818, + "grad_norm": 3.4623115062713623, + "learning_rate": 1.5177555483378752e-05, + "loss": 1.276, + "step": 867 + }, + { + "batch_num_effect_tokens": 7142, + "batch_num_samples": 150, + "batch_num_tokens": 52214, + "epoch": 0.78909, + "grad_norm": 3.2191693782806396, + "learning_rate": 1.5163974616389621e-05, + "loss": 1.9639, + "step": 868 + }, + { + "batch_num_effect_tokens": 9174, + "batch_num_samples": 149, + "batch_num_tokens": 52159, + "epoch": 0.79, + "grad_norm": 2.8348443508148193, + "learning_rate": 1.5150380749100545e-05, + "loss": 2.0828, + "step": 869 + }, + { + "batch_num_effect_tokens": 6934, + "batch_num_samples": 149, + "batch_num_tokens": 52158, + "epoch": 0.79091, + "grad_norm": 3.363175868988037, + "learning_rate": 1.5136773915734067e-05, + "loss": 2.0916, + "step": 870 + }, + { + "batch_num_effect_tokens": 5090, + "batch_num_samples": 149, + "batch_num_tokens": 52141, + "epoch": 0.79182, + "grad_norm": 4.208559989929199, + "learning_rate": 1.5123154150545372e-05, + "loss": 1.7012, + "step": 871 + }, + { + "batch_num_effect_tokens": 4662, + "batch_num_samples": 149, + "batch_num_tokens": 52187, + "epoch": 0.79273, + "grad_norm": 3.3394784927368164, + "learning_rate": 1.5109521487822208e-05, + "loss": 1.7266, + "step": 872 + }, + { + "batch_num_effect_tokens": 9093, + "batch_num_samples": 149, + "batch_num_tokens": 52139, + "epoch": 0.79364, + "grad_norm": 2.944627285003662, + "learning_rate": 1.5095875961884781e-05, + "loss": 2.1492, + "step": 873 + }, + { + "batch_num_effect_tokens": 4703, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 0.79455, + "grad_norm": 3.579747200012207, + "learning_rate": 1.5082217607085692e-05, + "loss": 1.4926, + "step": 874 + }, + { + "batch_num_effect_tokens": 7135, + "batch_num_samples": 149, + "batch_num_tokens": 52124, + "epoch": 0.79545, + "grad_norm": 3.227884531021118, + "learning_rate": 1.5068546457809831e-05, + "loss": 1.7614, + "step": 875 + }, + { + "batch_num_effect_tokens": 6036, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 0.79636, + "grad_norm": 3.272951126098633, + "learning_rate": 1.5054862548474298e-05, + "loss": 1.8083, + "step": 876 + }, + { + "batch_num_effect_tokens": 3696, + "batch_num_samples": 149, + "batch_num_tokens": 52090, + "epoch": 0.79727, + "grad_norm": 3.2300150394439697, + "learning_rate": 1.504116591352832e-05, + "loss": 1.3745, + "step": 877 + }, + { + "batch_num_effect_tokens": 7175, + "batch_num_samples": 149, + "batch_num_tokens": 52129, + "epoch": 0.79818, + "grad_norm": 3.1032445430755615, + "learning_rate": 1.5027456587453159e-05, + "loss": 1.9573, + "step": 878 + }, + { + "batch_num_effect_tokens": 7828, + "batch_num_samples": 150, + "batch_num_tokens": 52177, + "epoch": 0.79909, + "grad_norm": 3.465160369873047, + "learning_rate": 1.5013734604762032e-05, + "loss": 1.7241, + "step": 879 + }, + { + "batch_num_effect_tokens": 7234, + "batch_num_samples": 150, + "batch_num_tokens": 52188, + "epoch": 0.8, + "grad_norm": 3.1303765773773193, + "learning_rate": 1.5000000000000002e-05, + "loss": 1.7644, + "step": 880 + }, + { + "batch_num_effect_tokens": 5925, + "batch_num_samples": 149, + "batch_num_tokens": 52213, + "epoch": 0.80091, + "grad_norm": 3.201023578643799, + "learning_rate": 1.4986252807743928e-05, + "loss": 1.829, + "step": 881 + }, + { + "batch_num_effect_tokens": 7170, + "batch_num_samples": 149, + "batch_num_tokens": 52180, + "epoch": 0.80182, + "grad_norm": 2.98020601272583, + "learning_rate": 1.4972493062602355e-05, + "loss": 1.9255, + "step": 882 + }, + { + "batch_num_effect_tokens": 4321, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 0.80273, + "grad_norm": 3.3040101528167725, + "learning_rate": 1.4958720799215414e-05, + "loss": 1.4459, + "step": 883 + }, + { + "batch_num_effect_tokens": 5098, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 0.80364, + "grad_norm": 3.1221706867218018, + "learning_rate": 1.494493605225477e-05, + "loss": 1.8362, + "step": 884 + }, + { + "batch_num_effect_tokens": 5578, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 0.80455, + "grad_norm": 3.223900556564331, + "learning_rate": 1.4931138856423504e-05, + "loss": 1.7424, + "step": 885 + }, + { + "batch_num_effect_tokens": 8348, + "batch_num_samples": 149, + "batch_num_tokens": 52141, + "epoch": 0.80545, + "grad_norm": 2.6039373874664307, + "learning_rate": 1.491732924645604e-05, + "loss": 1.5469, + "step": 886 + }, + { + "batch_num_effect_tokens": 5352, + "batch_num_samples": 149, + "batch_num_tokens": 52198, + "epoch": 0.80636, + "grad_norm": 3.405869245529175, + "learning_rate": 1.4903507257118054e-05, + "loss": 1.7076, + "step": 887 + }, + { + "batch_num_effect_tokens": 5396, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 0.80727, + "grad_norm": 3.1065635681152344, + "learning_rate": 1.488967292320639e-05, + "loss": 1.482, + "step": 888 + }, + { + "batch_num_effect_tokens": 7004, + "batch_num_samples": 149, + "batch_num_tokens": 52115, + "epoch": 0.80818, + "grad_norm": 4.136509895324707, + "learning_rate": 1.4875826279548964e-05, + "loss": 1.824, + "step": 889 + }, + { + "batch_num_effect_tokens": 7966, + "batch_num_samples": 149, + "batch_num_tokens": 52132, + "epoch": 0.80909, + "grad_norm": 3.3540987968444824, + "learning_rate": 1.4861967361004687e-05, + "loss": 2.1597, + "step": 890 + }, + { + "batch_num_effect_tokens": 5298, + "batch_num_samples": 149, + "batch_num_tokens": 52190, + "epoch": 0.81, + "grad_norm": 3.02241849899292, + "learning_rate": 1.4848096202463373e-05, + "loss": 1.6436, + "step": 891 + }, + { + "batch_num_effect_tokens": 6420, + "batch_num_samples": 149, + "batch_num_tokens": 52097, + "epoch": 0.81091, + "grad_norm": 3.1122889518737793, + "learning_rate": 1.4834212838845639e-05, + "loss": 1.4681, + "step": 892 + }, + { + "batch_num_effect_tokens": 6113, + "batch_num_samples": 149, + "batch_num_tokens": 52182, + "epoch": 0.81182, + "grad_norm": 3.4665653705596924, + "learning_rate": 1.4820317305102842e-05, + "loss": 2.1555, + "step": 893 + }, + { + "batch_num_effect_tokens": 7208, + "batch_num_samples": 149, + "batch_num_tokens": 52141, + "epoch": 0.81273, + "grad_norm": 2.9199295043945312, + "learning_rate": 1.4806409636216974e-05, + "loss": 1.7773, + "step": 894 + }, + { + "batch_num_effect_tokens": 5194, + "batch_num_samples": 149, + "batch_num_tokens": 50593, + "epoch": 0.81364, + "grad_norm": 3.5358550548553467, + "learning_rate": 1.479248986720057e-05, + "loss": 1.7775, + "step": 895 + }, + { + "batch_num_effect_tokens": 4817, + "batch_num_samples": 149, + "batch_num_tokens": 52117, + "epoch": 0.81455, + "grad_norm": 3.5894317626953125, + "learning_rate": 1.4778558033096633e-05, + "loss": 1.9001, + "step": 896 + }, + { + "batch_num_effect_tokens": 6619, + "batch_num_samples": 149, + "batch_num_tokens": 52117, + "epoch": 0.81545, + "grad_norm": 3.2133636474609375, + "learning_rate": 1.4764614168978539e-05, + "loss": 1.821, + "step": 897 + }, + { + "batch_num_effect_tokens": 7815, + "batch_num_samples": 149, + "batch_num_tokens": 50567, + "epoch": 0.81636, + "grad_norm": 3.0909602642059326, + "learning_rate": 1.4750658309949953e-05, + "loss": 2.0219, + "step": 898 + }, + { + "batch_num_effect_tokens": 9105, + "batch_num_samples": 149, + "batch_num_tokens": 52187, + "epoch": 0.81727, + "grad_norm": 3.031285285949707, + "learning_rate": 1.4736690491144724e-05, + "loss": 2.0928, + "step": 899 + }, + { + "batch_num_effect_tokens": 4729, + "batch_num_samples": 149, + "batch_num_tokens": 52171, + "epoch": 0.81818, + "grad_norm": 3.4227333068847656, + "learning_rate": 1.472271074772683e-05, + "loss": 1.3522, + "step": 900 + }, + { + "batch_num_effect_tokens": 5764, + "batch_num_samples": 149, + "batch_num_tokens": 52198, + "epoch": 0.81909, + "grad_norm": 3.352691888809204, + "learning_rate": 1.470871911489025e-05, + "loss": 2.0215, + "step": 901 + }, + { + "batch_num_effect_tokens": 6642, + "batch_num_samples": 149, + "batch_num_tokens": 52160, + "epoch": 0.82, + "grad_norm": 2.8963751792907715, + "learning_rate": 1.469471562785891e-05, + "loss": 1.5203, + "step": 902 + }, + { + "batch_num_effect_tokens": 7384, + "batch_num_samples": 149, + "batch_num_tokens": 52128, + "epoch": 0.82091, + "grad_norm": 3.655452013015747, + "learning_rate": 1.4680700321886567e-05, + "loss": 2.0525, + "step": 903 + }, + { + "batch_num_effect_tokens": 8367, + "batch_num_samples": 150, + "batch_num_tokens": 52187, + "epoch": 0.82182, + "grad_norm": 3.0699498653411865, + "learning_rate": 1.4666673232256738e-05, + "loss": 1.9485, + "step": 904 + }, + { + "batch_num_effect_tokens": 5188, + "batch_num_samples": 149, + "batch_num_tokens": 52196, + "epoch": 0.82273, + "grad_norm": 4.488002777099609, + "learning_rate": 1.4652634394282608e-05, + "loss": 1.0071, + "step": 905 + }, + { + "batch_num_effect_tokens": 8174, + "batch_num_samples": 149, + "batch_num_tokens": 52132, + "epoch": 0.82364, + "grad_norm": 3.199490785598755, + "learning_rate": 1.4638583843306928e-05, + "loss": 2.0605, + "step": 906 + }, + { + "batch_num_effect_tokens": 8521, + "batch_num_samples": 149, + "batch_num_tokens": 52215, + "epoch": 0.82455, + "grad_norm": 2.913740873336792, + "learning_rate": 1.462452161470195e-05, + "loss": 1.9692, + "step": 907 + }, + { + "batch_num_effect_tokens": 5555, + "batch_num_samples": 149, + "batch_num_tokens": 52207, + "epoch": 0.82545, + "grad_norm": 3.375941038131714, + "learning_rate": 1.4610447743869313e-05, + "loss": 1.7169, + "step": 908 + }, + { + "batch_num_effect_tokens": 5281, + "batch_num_samples": 149, + "batch_num_tokens": 52190, + "epoch": 0.82636, + "grad_norm": 3.434762716293335, + "learning_rate": 1.4596362266239974e-05, + "loss": 1.7983, + "step": 909 + }, + { + "batch_num_effect_tokens": 10814, + "batch_num_samples": 149, + "batch_num_tokens": 52163, + "epoch": 0.82727, + "grad_norm": 2.8641369342803955, + "learning_rate": 1.4582265217274105e-05, + "loss": 2.3574, + "step": 910 + }, + { + "batch_num_effect_tokens": 5793, + "batch_num_samples": 149, + "batch_num_tokens": 52082, + "epoch": 0.82818, + "grad_norm": 3.170866012573242, + "learning_rate": 1.4568156632461008e-05, + "loss": 1.7385, + "step": 911 + }, + { + "batch_num_effect_tokens": 8426, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 0.82909, + "grad_norm": 2.9237499237060547, + "learning_rate": 1.4554036547319033e-05, + "loss": 1.7105, + "step": 912 + }, + { + "batch_num_effect_tokens": 5176, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.83, + "grad_norm": 3.3719794750213623, + "learning_rate": 1.4539904997395468e-05, + "loss": 1.7381, + "step": 913 + }, + { + "batch_num_effect_tokens": 6254, + "batch_num_samples": 149, + "batch_num_tokens": 52188, + "epoch": 0.83091, + "grad_norm": 3.7223596572875977, + "learning_rate": 1.4525762018266484e-05, + "loss": 2.2058, + "step": 914 + }, + { + "batch_num_effect_tokens": 6421, + "batch_num_samples": 149, + "batch_num_tokens": 52180, + "epoch": 0.83182, + "grad_norm": 2.9281041622161865, + "learning_rate": 1.4511607645537009e-05, + "loss": 1.6528, + "step": 915 + }, + { + "batch_num_effect_tokens": 5651, + "batch_num_samples": 149, + "batch_num_tokens": 52082, + "epoch": 0.83273, + "grad_norm": 3.437027931213379, + "learning_rate": 1.449744191484066e-05, + "loss": 1.869, + "step": 916 + }, + { + "batch_num_effect_tokens": 6709, + "batch_num_samples": 149, + "batch_num_tokens": 52207, + "epoch": 0.83364, + "grad_norm": 2.9521644115448, + "learning_rate": 1.4483264861839646e-05, + "loss": 1.5646, + "step": 917 + }, + { + "batch_num_effect_tokens": 4699, + "batch_num_samples": 149, + "batch_num_tokens": 52142, + "epoch": 0.83455, + "grad_norm": 3.5215671062469482, + "learning_rate": 1.4469076522224683e-05, + "loss": 1.294, + "step": 918 + }, + { + "batch_num_effect_tokens": 6396, + "batch_num_samples": 149, + "batch_num_tokens": 52142, + "epoch": 0.83545, + "grad_norm": 4.2995686531066895, + "learning_rate": 1.4454876931714896e-05, + "loss": 1.6281, + "step": 919 + }, + { + "batch_num_effect_tokens": 5342, + "batch_num_samples": 149, + "batch_num_tokens": 52128, + "epoch": 0.83636, + "grad_norm": 3.4456379413604736, + "learning_rate": 1.4440666126057743e-05, + "loss": 1.9214, + "step": 920 + }, + { + "batch_num_effect_tokens": 8045, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 0.83727, + "grad_norm": 3.3013765811920166, + "learning_rate": 1.4426444141028905e-05, + "loss": 1.9053, + "step": 921 + }, + { + "batch_num_effect_tokens": 6480, + "batch_num_samples": 149, + "batch_num_tokens": 52170, + "epoch": 0.83818, + "grad_norm": 3.5072872638702393, + "learning_rate": 1.4412211012432213e-05, + "loss": 1.818, + "step": 922 + }, + { + "batch_num_effect_tokens": 3625, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 0.83909, + "grad_norm": 3.228583812713623, + "learning_rate": 1.4397966776099558e-05, + "loss": 1.2451, + "step": 923 + }, + { + "batch_num_effect_tokens": 6205, + "batch_num_samples": 149, + "batch_num_tokens": 52123, + "epoch": 0.84, + "grad_norm": 3.156794548034668, + "learning_rate": 1.4383711467890776e-05, + "loss": 1.7222, + "step": 924 + }, + { + "batch_num_effect_tokens": 6118, + "batch_num_samples": 149, + "batch_num_tokens": 52220, + "epoch": 0.84091, + "grad_norm": 3.497466564178467, + "learning_rate": 1.4369445123693595e-05, + "loss": 2.0007, + "step": 925 + }, + { + "batch_num_effect_tokens": 5880, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.84182, + "grad_norm": 3.0640792846679688, + "learning_rate": 1.4355167779423525e-05, + "loss": 1.6348, + "step": 926 + }, + { + "batch_num_effect_tokens": 7842, + "batch_num_samples": 149, + "batch_num_tokens": 52177, + "epoch": 0.84273, + "grad_norm": 3.0178539752960205, + "learning_rate": 1.4340879471023752e-05, + "loss": 2.1852, + "step": 927 + }, + { + "batch_num_effect_tokens": 5592, + "batch_num_samples": 149, + "batch_num_tokens": 52180, + "epoch": 0.84364, + "grad_norm": 3.0909364223480225, + "learning_rate": 1.4326580234465084e-05, + "loss": 1.6169, + "step": 928 + }, + { + "batch_num_effect_tokens": 6014, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.84455, + "grad_norm": 3.195918560028076, + "learning_rate": 1.4312270105745829e-05, + "loss": 1.9621, + "step": 929 + }, + { + "batch_num_effect_tokens": 9469, + "batch_num_samples": 149, + "batch_num_tokens": 52188, + "epoch": 0.84545, + "grad_norm": 3.2743117809295654, + "learning_rate": 1.4297949120891718e-05, + "loss": 2.3096, + "step": 930 + }, + { + "batch_num_effect_tokens": 5212, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 0.84636, + "grad_norm": 2.9876720905303955, + "learning_rate": 1.4283617315955815e-05, + "loss": 1.5374, + "step": 931 + }, + { + "batch_num_effect_tokens": 4148, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 0.84727, + "grad_norm": 3.05871844291687, + "learning_rate": 1.4269274727018419e-05, + "loss": 1.1089, + "step": 932 + }, + { + "batch_num_effect_tokens": 5868, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 0.84818, + "grad_norm": 4.548148155212402, + "learning_rate": 1.4254921390186986e-05, + "loss": 1.7198, + "step": 933 + }, + { + "batch_num_effect_tokens": 4948, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 0.84909, + "grad_norm": 3.468212127685547, + "learning_rate": 1.424055734159602e-05, + "loss": 1.3738, + "step": 934 + }, + { + "batch_num_effect_tokens": 3911, + "batch_num_samples": 149, + "batch_num_tokens": 50567, + "epoch": 0.85, + "grad_norm": 3.750844717025757, + "learning_rate": 1.4226182617406996e-05, + "loss": 1.9055, + "step": 935 + }, + { + "batch_num_effect_tokens": 8673, + "batch_num_samples": 149, + "batch_num_tokens": 52213, + "epoch": 0.85091, + "grad_norm": 2.934328556060791, + "learning_rate": 1.4211797253808268e-05, + "loss": 2.0859, + "step": 936 + }, + { + "batch_num_effect_tokens": 6360, + "batch_num_samples": 150, + "batch_num_tokens": 52215, + "epoch": 0.85182, + "grad_norm": 3.3923490047454834, + "learning_rate": 1.419740128701497e-05, + "loss": 1.7395, + "step": 937 + }, + { + "batch_num_effect_tokens": 6507, + "batch_num_samples": 149, + "batch_num_tokens": 52168, + "epoch": 0.85273, + "grad_norm": 3.219904661178589, + "learning_rate": 1.4182994753268929e-05, + "loss": 1.8854, + "step": 938 + }, + { + "batch_num_effect_tokens": 4322, + "batch_num_samples": 149, + "batch_num_tokens": 52167, + "epoch": 0.85364, + "grad_norm": 3.351370096206665, + "learning_rate": 1.4168577688838581e-05, + "loss": 1.5269, + "step": 939 + }, + { + "batch_num_effect_tokens": 6869, + "batch_num_samples": 149, + "batch_num_tokens": 52125, + "epoch": 0.85455, + "grad_norm": 3.1984567642211914, + "learning_rate": 1.4154150130018867e-05, + "loss": 2.1609, + "step": 940 + }, + { + "batch_num_effect_tokens": 6322, + "batch_num_samples": 149, + "batch_num_tokens": 52138, + "epoch": 0.85545, + "grad_norm": 3.1590569019317627, + "learning_rate": 1.4139712113131146e-05, + "loss": 1.786, + "step": 941 + }, + { + "batch_num_effect_tokens": 6568, + "batch_num_samples": 149, + "batch_num_tokens": 52163, + "epoch": 0.85636, + "grad_norm": 3.2718281745910645, + "learning_rate": 1.4125263674523113e-05, + "loss": 1.7956, + "step": 942 + }, + { + "batch_num_effect_tokens": 6453, + "batch_num_samples": 149, + "batch_num_tokens": 52139, + "epoch": 0.85727, + "grad_norm": 3.1080870628356934, + "learning_rate": 1.4110804850568691e-05, + "loss": 2.0176, + "step": 943 + }, + { + "batch_num_effect_tokens": 7243, + "batch_num_samples": 149, + "batch_num_tokens": 52141, + "epoch": 0.85818, + "grad_norm": 2.9790542125701904, + "learning_rate": 1.4096335677667954e-05, + "loss": 1.6818, + "step": 944 + }, + { + "batch_num_effect_tokens": 6578, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 0.85909, + "grad_norm": 2.952096462249756, + "learning_rate": 1.4081856192247032e-05, + "loss": 1.8174, + "step": 945 + }, + { + "batch_num_effect_tokens": 6645, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 0.86, + "grad_norm": 3.055183172225952, + "learning_rate": 1.4067366430758004e-05, + "loss": 1.8087, + "step": 946 + }, + { + "batch_num_effect_tokens": 6191, + "batch_num_samples": 149, + "batch_num_tokens": 52134, + "epoch": 0.86091, + "grad_norm": 3.5459861755371094, + "learning_rate": 1.4052866429678832e-05, + "loss": 1.5099, + "step": 947 + }, + { + "batch_num_effect_tokens": 8808, + "batch_num_samples": 149, + "batch_num_tokens": 52222, + "epoch": 0.86182, + "grad_norm": 3.076371192932129, + "learning_rate": 1.403835622551325e-05, + "loss": 2.0291, + "step": 948 + }, + { + "batch_num_effect_tokens": 8282, + "batch_num_samples": 149, + "batch_num_tokens": 52139, + "epoch": 0.86273, + "grad_norm": 3.062523126602173, + "learning_rate": 1.4023835854790682e-05, + "loss": 2.123, + "step": 949 + }, + { + "batch_num_effect_tokens": 6325, + "batch_num_samples": 149, + "batch_num_tokens": 52209, + "epoch": 0.86364, + "grad_norm": 3.318748950958252, + "learning_rate": 1.4009305354066138e-05, + "loss": 1.9367, + "step": 950 + }, + { + "batch_num_effect_tokens": 5377, + "batch_num_samples": 149, + "batch_num_tokens": 52165, + "epoch": 0.86455, + "grad_norm": 3.2072744369506836, + "learning_rate": 1.3994764759920144e-05, + "loss": 1.7947, + "step": 951 + }, + { + "batch_num_effect_tokens": 5308, + "batch_num_samples": 149, + "batch_num_tokens": 52124, + "epoch": 0.86545, + "grad_norm": 4.14223051071167, + "learning_rate": 1.3980214108958626e-05, + "loss": 1.2963, + "step": 952 + }, + { + "batch_num_effect_tokens": 6727, + "batch_num_samples": 150, + "batch_num_tokens": 52215, + "epoch": 0.86636, + "grad_norm": 3.146315574645996, + "learning_rate": 1.3965653437812825e-05, + "loss": 1.6541, + "step": 953 + }, + { + "batch_num_effect_tokens": 7145, + "batch_num_samples": 149, + "batch_num_tokens": 52131, + "epoch": 0.86727, + "grad_norm": 2.9531638622283936, + "learning_rate": 1.3951082783139221e-05, + "loss": 1.9211, + "step": 954 + }, + { + "batch_num_effect_tokens": 7363, + "batch_num_samples": 149, + "batch_num_tokens": 52146, + "epoch": 0.86818, + "grad_norm": 3.0392003059387207, + "learning_rate": 1.3936502181619415e-05, + "loss": 1.7422, + "step": 955 + }, + { + "batch_num_effect_tokens": 5878, + "batch_num_samples": 149, + "batch_num_tokens": 52180, + "epoch": 0.86909, + "grad_norm": 3.071474552154541, + "learning_rate": 1.3921911669960055e-05, + "loss": 1.8491, + "step": 956 + }, + { + "batch_num_effect_tokens": 9591, + "batch_num_samples": 149, + "batch_num_tokens": 52209, + "epoch": 0.87, + "grad_norm": 2.797166347503662, + "learning_rate": 1.3907311284892737e-05, + "loss": 2.0928, + "step": 957 + }, + { + "batch_num_effect_tokens": 9939, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 0.87091, + "grad_norm": 2.7077417373657227, + "learning_rate": 1.3892701063173917e-05, + "loss": 2.0326, + "step": 958 + }, + { + "batch_num_effect_tokens": 4945, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.87182, + "grad_norm": 3.3089983463287354, + "learning_rate": 1.3878081041584803e-05, + "loss": 1.51, + "step": 959 + }, + { + "batch_num_effect_tokens": 7892, + "batch_num_samples": 149, + "batch_num_tokens": 52167, + "epoch": 0.87273, + "grad_norm": 2.2191925048828125, + "learning_rate": 1.3863451256931286e-05, + "loss": 1.0674, + "step": 960 + }, + { + "batch_num_effect_tokens": 7345, + "batch_num_samples": 149, + "batch_num_tokens": 52118, + "epoch": 0.87364, + "grad_norm": 3.0597195625305176, + "learning_rate": 1.3848811746043835e-05, + "loss": 1.7672, + "step": 961 + }, + { + "batch_num_effect_tokens": 4985, + "batch_num_samples": 149, + "batch_num_tokens": 52198, + "epoch": 0.87455, + "grad_norm": 3.9039957523345947, + "learning_rate": 1.3834162545777394e-05, + "loss": 1.938, + "step": 962 + }, + { + "batch_num_effect_tokens": 4052, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 0.87545, + "grad_norm": 3.0499753952026367, + "learning_rate": 1.3819503693011314e-05, + "loss": 1.1316, + "step": 963 + }, + { + "batch_num_effect_tokens": 5414, + "batch_num_samples": 149, + "batch_num_tokens": 52174, + "epoch": 0.87636, + "grad_norm": 3.102602481842041, + "learning_rate": 1.380483522464923e-05, + "loss": 1.4438, + "step": 964 + }, + { + "batch_num_effect_tokens": 5948, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 0.87727, + "grad_norm": 3.1843950748443604, + "learning_rate": 1.3790157177619005e-05, + "loss": 1.9162, + "step": 965 + }, + { + "batch_num_effect_tokens": 4852, + "batch_num_samples": 149, + "batch_num_tokens": 52144, + "epoch": 0.87818, + "grad_norm": 3.322779655456543, + "learning_rate": 1.3775469588872601e-05, + "loss": 1.5374, + "step": 966 + }, + { + "batch_num_effect_tokens": 5462, + "batch_num_samples": 149, + "batch_num_tokens": 50560, + "epoch": 0.87909, + "grad_norm": 3.453280210494995, + "learning_rate": 1.3760772495385998e-05, + "loss": 1.9769, + "step": 967 + }, + { + "batch_num_effect_tokens": 5586, + "batch_num_samples": 149, + "batch_num_tokens": 52171, + "epoch": 0.88, + "grad_norm": 3.1580238342285156, + "learning_rate": 1.3746065934159123e-05, + "loss": 1.7682, + "step": 968 + }, + { + "batch_num_effect_tokens": 5068, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 0.88091, + "grad_norm": 3.2250802516937256, + "learning_rate": 1.3731349942215718e-05, + "loss": 1.6973, + "step": 969 + }, + { + "batch_num_effect_tokens": 4459, + "batch_num_samples": 149, + "batch_num_tokens": 52164, + "epoch": 0.88182, + "grad_norm": 3.6121764183044434, + "learning_rate": 1.3716624556603275e-05, + "loss": 1.7362, + "step": 970 + }, + { + "batch_num_effect_tokens": 6493, + "batch_num_samples": 149, + "batch_num_tokens": 52142, + "epoch": 0.88273, + "grad_norm": 3.3951101303100586, + "learning_rate": 1.3701889814392944e-05, + "loss": 2.0725, + "step": 971 + }, + { + "batch_num_effect_tokens": 9339, + "batch_num_samples": 149, + "batch_num_tokens": 52162, + "epoch": 0.88364, + "grad_norm": 2.8378098011016846, + "learning_rate": 1.3687145752679409e-05, + "loss": 1.9609, + "step": 972 + }, + { + "batch_num_effect_tokens": 6144, + "batch_num_samples": 149, + "batch_num_tokens": 52221, + "epoch": 0.88455, + "grad_norm": 3.2717912197113037, + "learning_rate": 1.3672392408580834e-05, + "loss": 1.9867, + "step": 973 + }, + { + "batch_num_effect_tokens": 7002, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 0.88545, + "grad_norm": 3.2133514881134033, + "learning_rate": 1.3657629819238747e-05, + "loss": 1.7634, + "step": 974 + }, + { + "batch_num_effect_tokens": 7274, + "batch_num_samples": 150, + "batch_num_tokens": 52214, + "epoch": 0.88636, + "grad_norm": 3.2251951694488525, + "learning_rate": 1.3642858021817944e-05, + "loss": 2.203, + "step": 975 + }, + { + "batch_num_effect_tokens": 5400, + "batch_num_samples": 149, + "batch_num_tokens": 52185, + "epoch": 0.88727, + "grad_norm": 3.3960633277893066, + "learning_rate": 1.362807705350641e-05, + "loss": 1.2251, + "step": 976 + }, + { + "batch_num_effect_tokens": 7883, + "batch_num_samples": 149, + "batch_num_tokens": 52179, + "epoch": 0.88818, + "grad_norm": 2.9552037715911865, + "learning_rate": 1.3613286951515216e-05, + "loss": 1.9736, + "step": 977 + }, + { + "batch_num_effect_tokens": 7173, + "batch_num_samples": 149, + "batch_num_tokens": 52146, + "epoch": 0.88909, + "grad_norm": 3.043170690536499, + "learning_rate": 1.3598487753078426e-05, + "loss": 1.7401, + "step": 978 + }, + { + "batch_num_effect_tokens": 7428, + "batch_num_samples": 150, + "batch_num_tokens": 52216, + "epoch": 0.89, + "grad_norm": 3.0731730461120605, + "learning_rate": 1.3583679495453e-05, + "loss": 1.9956, + "step": 979 + }, + { + "batch_num_effect_tokens": 10259, + "batch_num_samples": 149, + "batch_num_tokens": 52182, + "epoch": 0.89091, + "grad_norm": 2.731771469116211, + "learning_rate": 1.356886221591872e-05, + "loss": 2.0283, + "step": 980 + }, + { + "batch_num_effect_tokens": 5746, + "batch_num_samples": 149, + "batch_num_tokens": 52162, + "epoch": 0.89182, + "grad_norm": 3.238330602645874, + "learning_rate": 1.355403595177806e-05, + "loss": 1.5323, + "step": 981 + }, + { + "batch_num_effect_tokens": 7136, + "batch_num_samples": 149, + "batch_num_tokens": 52134, + "epoch": 0.89273, + "grad_norm": 3.06024432182312, + "learning_rate": 1.353920074035612e-05, + "loss": 1.8443, + "step": 982 + }, + { + "batch_num_effect_tokens": 4411, + "batch_num_samples": 149, + "batch_num_tokens": 52187, + "epoch": 0.89364, + "grad_norm": 3.3686585426330566, + "learning_rate": 1.3524356619000534e-05, + "loss": 1.4908, + "step": 983 + }, + { + "batch_num_effect_tokens": 8474, + "batch_num_samples": 149, + "batch_num_tokens": 52118, + "epoch": 0.89455, + "grad_norm": 3.075028657913208, + "learning_rate": 1.350950362508136e-05, + "loss": 2.2764, + "step": 984 + }, + { + "batch_num_effect_tokens": 6610, + "batch_num_samples": 149, + "batch_num_tokens": 52215, + "epoch": 0.89545, + "grad_norm": 3.16196608543396, + "learning_rate": 1.3494641795990986e-05, + "loss": 2.0977, + "step": 985 + }, + { + "batch_num_effect_tokens": 6030, + "batch_num_samples": 149, + "batch_num_tokens": 52187, + "epoch": 0.89636, + "grad_norm": 2.9432215690612793, + "learning_rate": 1.3479771169144052e-05, + "loss": 1.778, + "step": 986 + }, + { + "batch_num_effect_tokens": 6252, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.89727, + "grad_norm": 3.090939521789551, + "learning_rate": 1.346489178197735e-05, + "loss": 1.9385, + "step": 987 + }, + { + "batch_num_effect_tokens": 5933, + "batch_num_samples": 149, + "batch_num_tokens": 52118, + "epoch": 0.89818, + "grad_norm": 3.3969521522521973, + "learning_rate": 1.3450003671949707e-05, + "loss": 1.5074, + "step": 988 + }, + { + "batch_num_effect_tokens": 6049, + "batch_num_samples": 149, + "batch_num_tokens": 52163, + "epoch": 0.89909, + "grad_norm": 2.901487112045288, + "learning_rate": 1.3435106876541933e-05, + "loss": 1.6664, + "step": 989 + }, + { + "batch_num_effect_tokens": 7799, + "batch_num_samples": 149, + "batch_num_tokens": 52196, + "epoch": 0.9, + "grad_norm": 2.844240188598633, + "learning_rate": 1.342020143325669e-05, + "loss": 1.8445, + "step": 990 + }, + { + "batch_num_effect_tokens": 5378, + "batch_num_samples": 149, + "batch_num_tokens": 52209, + "epoch": 0.90091, + "grad_norm": 3.242816686630249, + "learning_rate": 1.340528737961841e-05, + "loss": 1.3773, + "step": 991 + }, + { + "batch_num_effect_tokens": 7026, + "batch_num_samples": 149, + "batch_num_tokens": 52147, + "epoch": 0.90182, + "grad_norm": 2.9425547122955322, + "learning_rate": 1.3390364753173206e-05, + "loss": 1.8544, + "step": 992 + }, + { + "batch_num_effect_tokens": 6883, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 0.90273, + "grad_norm": 3.4907162189483643, + "learning_rate": 1.337543359148878e-05, + "loss": 1.9146, + "step": 993 + }, + { + "batch_num_effect_tokens": 6305, + "batch_num_samples": 149, + "batch_num_tokens": 52170, + "epoch": 0.90364, + "grad_norm": 3.1133174896240234, + "learning_rate": 1.3360493932154301e-05, + "loss": 1.875, + "step": 994 + }, + { + "batch_num_effect_tokens": 6244, + "batch_num_samples": 149, + "batch_num_tokens": 52152, + "epoch": 0.90455, + "grad_norm": 3.476804733276367, + "learning_rate": 1.3345545812780354e-05, + "loss": 1.7176, + "step": 995 + }, + { + "batch_num_effect_tokens": 4621, + "batch_num_samples": 149, + "batch_num_tokens": 52182, + "epoch": 0.90545, + "grad_norm": 3.518723964691162, + "learning_rate": 1.3330589270998809e-05, + "loss": 1.8252, + "step": 996 + }, + { + "batch_num_effect_tokens": 7208, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 0.90636, + "grad_norm": 3.3271543979644775, + "learning_rate": 1.331562434446274e-05, + "loss": 2.199, + "step": 997 + }, + { + "batch_num_effect_tokens": 5059, + "batch_num_samples": 149, + "batch_num_tokens": 52139, + "epoch": 0.90727, + "grad_norm": 3.5937557220458984, + "learning_rate": 1.3300651070846333e-05, + "loss": 1.6976, + "step": 998 + }, + { + "batch_num_effect_tokens": 6163, + "batch_num_samples": 149, + "batch_num_tokens": 52179, + "epoch": 0.90818, + "grad_norm": 3.187220335006714, + "learning_rate": 1.3285669487844786e-05, + "loss": 1.6587, + "step": 999 + }, + { + "batch_num_effect_tokens": 6955, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 0.90909, + "grad_norm": 3.0915720462799072, + "learning_rate": 1.3270679633174219e-05, + "loss": 2.0351, + "step": 1000 + }, + { + "batch_num_effect_tokens": 5494, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 0.91, + "grad_norm": 2.7694406509399414, + "learning_rate": 1.3255681544571568e-05, + "loss": 1.0195, + "step": 1001 + }, + { + "batch_num_effect_tokens": 7607, + "batch_num_samples": 149, + "batch_num_tokens": 52161, + "epoch": 0.91091, + "grad_norm": 2.9730849266052246, + "learning_rate": 1.3240675259794507e-05, + "loss": 1.9462, + "step": 1002 + }, + { + "batch_num_effect_tokens": 5629, + "batch_num_samples": 149, + "batch_num_tokens": 52204, + "epoch": 0.91182, + "grad_norm": 3.193814277648926, + "learning_rate": 1.3225660816621342e-05, + "loss": 1.6498, + "step": 1003 + }, + { + "batch_num_effect_tokens": 10167, + "batch_num_samples": 149, + "batch_num_tokens": 52122, + "epoch": 0.91273, + "grad_norm": 2.7844138145446777, + "learning_rate": 1.321063825285091e-05, + "loss": 2.0605, + "step": 1004 + }, + { + "batch_num_effect_tokens": 5533, + "batch_num_samples": 149, + "batch_num_tokens": 52214, + "epoch": 0.91364, + "grad_norm": 3.48059344291687, + "learning_rate": 1.3195607606302501e-05, + "loss": 1.7594, + "step": 1005 + }, + { + "batch_num_effect_tokens": 10088, + "batch_num_samples": 149, + "batch_num_tokens": 52169, + "epoch": 0.91455, + "grad_norm": 2.8770751953125, + "learning_rate": 1.3180568914815752e-05, + "loss": 2.077, + "step": 1006 + }, + { + "batch_num_effect_tokens": 7385, + "batch_num_samples": 149, + "batch_num_tokens": 52211, + "epoch": 0.91545, + "grad_norm": 3.197993278503418, + "learning_rate": 1.3165522216250544e-05, + "loss": 1.8163, + "step": 1007 + }, + { + "batch_num_effect_tokens": 7704, + "batch_num_samples": 149, + "batch_num_tokens": 52221, + "epoch": 0.91636, + "grad_norm": 2.8752472400665283, + "learning_rate": 1.3150467548486929e-05, + "loss": 1.9441, + "step": 1008 + }, + { + "batch_num_effect_tokens": 6017, + "batch_num_samples": 149, + "batch_num_tokens": 52157, + "epoch": 0.91727, + "grad_norm": 3.4210171699523926, + "learning_rate": 1.3135404949425015e-05, + "loss": 1.9371, + "step": 1009 + }, + { + "batch_num_effect_tokens": 6395, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 0.91818, + "grad_norm": 3.0630974769592285, + "learning_rate": 1.3120334456984871e-05, + "loss": 1.7543, + "step": 1010 + }, + { + "batch_num_effect_tokens": 4265, + "batch_num_samples": 149, + "batch_num_tokens": 52183, + "epoch": 0.91909, + "grad_norm": 3.465033531188965, + "learning_rate": 1.310525610910645e-05, + "loss": 1.1579, + "step": 1011 + }, + { + "batch_num_effect_tokens": 4477, + "batch_num_samples": 149, + "batch_num_tokens": 52185, + "epoch": 0.92, + "grad_norm": 3.1414945125579834, + "learning_rate": 1.3090169943749475e-05, + "loss": 1.2547, + "step": 1012 + }, + { + "batch_num_effect_tokens": 8085, + "batch_num_samples": 149, + "batch_num_tokens": 52115, + "epoch": 0.92091, + "grad_norm": 3.4485549926757812, + "learning_rate": 1.3075075998893345e-05, + "loss": 2.2038, + "step": 1013 + }, + { + "batch_num_effect_tokens": 4847, + "batch_num_samples": 149, + "batch_num_tokens": 52209, + "epoch": 0.92182, + "grad_norm": 3.664796829223633, + "learning_rate": 1.3059974312537054e-05, + "loss": 1.5868, + "step": 1014 + }, + { + "batch_num_effect_tokens": 6282, + "batch_num_samples": 149, + "batch_num_tokens": 52168, + "epoch": 0.92273, + "grad_norm": 2.9644715785980225, + "learning_rate": 1.3044864922699072e-05, + "loss": 1.5455, + "step": 1015 + }, + { + "batch_num_effect_tokens": 5664, + "batch_num_samples": 149, + "batch_num_tokens": 52157, + "epoch": 0.92364, + "grad_norm": 3.3737034797668457, + "learning_rate": 1.3029747867417275e-05, + "loss": 1.6858, + "step": 1016 + }, + { + "batch_num_effect_tokens": 7258, + "batch_num_samples": 150, + "batch_num_tokens": 52178, + "epoch": 0.92455, + "grad_norm": 2.9502992630004883, + "learning_rate": 1.301462318474883e-05, + "loss": 1.8345, + "step": 1017 + }, + { + "batch_num_effect_tokens": 7294, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 0.92545, + "grad_norm": 3.1529641151428223, + "learning_rate": 1.2999490912770108e-05, + "loss": 2.0247, + "step": 1018 + }, + { + "batch_num_effect_tokens": 6216, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 0.92636, + "grad_norm": 3.3714842796325684, + "learning_rate": 1.2984351089576585e-05, + "loss": 1.9965, + "step": 1019 + }, + { + "batch_num_effect_tokens": 7921, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 0.92727, + "grad_norm": 2.8136098384857178, + "learning_rate": 1.296920375328275e-05, + "loss": 1.9386, + "step": 1020 + }, + { + "batch_num_effect_tokens": 6546, + "batch_num_samples": 149, + "batch_num_tokens": 52107, + "epoch": 0.92818, + "grad_norm": 3.0203120708465576, + "learning_rate": 1.2954048942022002e-05, + "loss": 1.6337, + "step": 1021 + }, + { + "batch_num_effect_tokens": 6707, + "batch_num_samples": 149, + "batch_num_tokens": 52185, + "epoch": 0.92909, + "grad_norm": 3.1548757553100586, + "learning_rate": 1.2938886693946563e-05, + "loss": 1.8008, + "step": 1022 + }, + { + "batch_num_effect_tokens": 6462, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 0.93, + "grad_norm": 3.236276149749756, + "learning_rate": 1.2923717047227368e-05, + "loss": 2.0444, + "step": 1023 + }, + { + "batch_num_effect_tokens": 4891, + "batch_num_samples": 149, + "batch_num_tokens": 52138, + "epoch": 0.93091, + "grad_norm": 3.0936660766601562, + "learning_rate": 1.2908540040053992e-05, + "loss": 1.4072, + "step": 1024 + }, + { + "batch_num_effect_tokens": 9397, + "batch_num_samples": 149, + "batch_num_tokens": 52132, + "epoch": 0.93182, + "grad_norm": 2.655895948410034, + "learning_rate": 1.289335571063453e-05, + "loss": 1.9849, + "step": 1025 + }, + { + "batch_num_effect_tokens": 7461, + "batch_num_samples": 149, + "batch_num_tokens": 52214, + "epoch": 0.93273, + "grad_norm": 3.800144672393799, + "learning_rate": 1.287816409719551e-05, + "loss": 1.9769, + "step": 1026 + }, + { + "batch_num_effect_tokens": 6287, + "batch_num_samples": 149, + "batch_num_tokens": 52139, + "epoch": 0.93364, + "grad_norm": 3.192833662033081, + "learning_rate": 1.2862965237981804e-05, + "loss": 1.4238, + "step": 1027 + }, + { + "batch_num_effect_tokens": 6381, + "batch_num_samples": 149, + "batch_num_tokens": 52214, + "epoch": 0.93455, + "grad_norm": 3.40364670753479, + "learning_rate": 1.2847759171256523e-05, + "loss": 1.4348, + "step": 1028 + }, + { + "batch_num_effect_tokens": 6186, + "batch_num_samples": 149, + "batch_num_tokens": 52152, + "epoch": 0.93545, + "grad_norm": 3.084475517272949, + "learning_rate": 1.283254593530092e-05, + "loss": 1.5965, + "step": 1029 + }, + { + "batch_num_effect_tokens": 6089, + "batch_num_samples": 149, + "batch_num_tokens": 52135, + "epoch": 0.93636, + "grad_norm": 3.103367805480957, + "learning_rate": 1.2817325568414299e-05, + "loss": 1.4762, + "step": 1030 + }, + { + "batch_num_effect_tokens": 7823, + "batch_num_samples": 149, + "batch_num_tokens": 52163, + "epoch": 0.93727, + "grad_norm": 3.1137585639953613, + "learning_rate": 1.2802098108913914e-05, + "loss": 2.0879, + "step": 1031 + }, + { + "batch_num_effect_tokens": 7435, + "batch_num_samples": 149, + "batch_num_tokens": 52170, + "epoch": 0.93818, + "grad_norm": 3.122626543045044, + "learning_rate": 1.278686359513488e-05, + "loss": 1.7587, + "step": 1032 + }, + { + "batch_num_effect_tokens": 4802, + "batch_num_samples": 149, + "batch_num_tokens": 52196, + "epoch": 0.93909, + "grad_norm": 4.107133865356445, + "learning_rate": 1.2771622065430061e-05, + "loss": 1.6359, + "step": 1033 + }, + { + "batch_num_effect_tokens": 7525, + "batch_num_samples": 149, + "batch_num_tokens": 52221, + "epoch": 0.94, + "grad_norm": 3.2098639011383057, + "learning_rate": 1.2756373558169992e-05, + "loss": 2.0099, + "step": 1034 + }, + { + "batch_num_effect_tokens": 5538, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 0.94091, + "grad_norm": 3.2654569149017334, + "learning_rate": 1.2741118111742778e-05, + "loss": 1.3742, + "step": 1035 + }, + { + "batch_num_effect_tokens": 6099, + "batch_num_samples": 149, + "batch_num_tokens": 52207, + "epoch": 0.94182, + "grad_norm": 3.20632004737854, + "learning_rate": 1.2725855764553981e-05, + "loss": 1.6452, + "step": 1036 + }, + { + "batch_num_effect_tokens": 5712, + "batch_num_samples": 149, + "batch_num_tokens": 50567, + "epoch": 0.94273, + "grad_norm": 3.3683362007141113, + "learning_rate": 1.2710586555026541e-05, + "loss": 1.6897, + "step": 1037 + }, + { + "batch_num_effect_tokens": 5909, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 0.94364, + "grad_norm": 3.2638537883758545, + "learning_rate": 1.269531052160068e-05, + "loss": 1.8701, + "step": 1038 + }, + { + "batch_num_effect_tokens": 5331, + "batch_num_samples": 149, + "batch_num_tokens": 52126, + "epoch": 0.94455, + "grad_norm": 3.3751261234283447, + "learning_rate": 1.2680027702733791e-05, + "loss": 1.4539, + "step": 1039 + }, + { + "batch_num_effect_tokens": 6539, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 0.94545, + "grad_norm": 3.4274160861968994, + "learning_rate": 1.266473813690035e-05, + "loss": 2.0514, + "step": 1040 + }, + { + "batch_num_effect_tokens": 5271, + "batch_num_samples": 149, + "batch_num_tokens": 50543, + "epoch": 0.94636, + "grad_norm": 3.4731414318084717, + "learning_rate": 1.2649441862591826e-05, + "loss": 1.9529, + "step": 1041 + }, + { + "batch_num_effect_tokens": 5074, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 0.94727, + "grad_norm": 3.665320634841919, + "learning_rate": 1.2634138918316567e-05, + "loss": 1.7549, + "step": 1042 + }, + { + "batch_num_effect_tokens": 4107, + "batch_num_samples": 149, + "batch_num_tokens": 52176, + "epoch": 0.94818, + "grad_norm": 3.3488118648529053, + "learning_rate": 1.2618829342599719e-05, + "loss": 1.4641, + "step": 1043 + }, + { + "batch_num_effect_tokens": 8762, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 0.94909, + "grad_norm": 2.7790908813476562, + "learning_rate": 1.2603513173983121e-05, + "loss": 1.4908, + "step": 1044 + }, + { + "batch_num_effect_tokens": 9360, + "batch_num_samples": 150, + "batch_num_tokens": 52200, + "epoch": 0.95, + "grad_norm": 2.7828152179718018, + "learning_rate": 1.2588190451025209e-05, + "loss": 2.2785, + "step": 1045 + }, + { + "batch_num_effect_tokens": 5616, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 0.95091, + "grad_norm": 3.2362778186798096, + "learning_rate": 1.2572861212300917e-05, + "loss": 1.5082, + "step": 1046 + }, + { + "batch_num_effect_tokens": 6733, + "batch_num_samples": 149, + "batch_num_tokens": 52167, + "epoch": 0.95182, + "grad_norm": 2.959310531616211, + "learning_rate": 1.255752549640159e-05, + "loss": 1.6621, + "step": 1047 + }, + { + "batch_num_effect_tokens": 4214, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 0.95273, + "grad_norm": 3.2532103061676025, + "learning_rate": 1.2542183341934873e-05, + "loss": 1.0932, + "step": 1048 + }, + { + "batch_num_effect_tokens": 5282, + "batch_num_samples": 149, + "batch_num_tokens": 52215, + "epoch": 0.95364, + "grad_norm": 3.0876121520996094, + "learning_rate": 1.2526834787524615e-05, + "loss": 0.952, + "step": 1049 + }, + { + "batch_num_effect_tokens": 7447, + "batch_num_samples": 149, + "batch_num_tokens": 52133, + "epoch": 0.95455, + "grad_norm": 3.1305782794952393, + "learning_rate": 1.2511479871810792e-05, + "loss": 1.6978, + "step": 1050 + }, + { + "batch_num_effect_tokens": 10256, + "batch_num_samples": 149, + "batch_num_tokens": 52181, + "epoch": 0.95545, + "grad_norm": 2.8973023891448975, + "learning_rate": 1.2496118633449386e-05, + "loss": 1.9767, + "step": 1051 + }, + { + "batch_num_effect_tokens": 7292, + "batch_num_samples": 149, + "batch_num_tokens": 52138, + "epoch": 0.95636, + "grad_norm": 2.9504804611206055, + "learning_rate": 1.248075111111229e-05, + "loss": 1.5928, + "step": 1052 + }, + { + "batch_num_effect_tokens": 7355, + "batch_num_samples": 149, + "batch_num_tokens": 52146, + "epoch": 0.95727, + "grad_norm": 3.3816585540771484, + "learning_rate": 1.2465377343487227e-05, + "loss": 1.6466, + "step": 1053 + }, + { + "batch_num_effect_tokens": 7898, + "batch_num_samples": 150, + "batch_num_tokens": 52219, + "epoch": 0.95818, + "grad_norm": 3.2212235927581787, + "learning_rate": 1.244999736927764e-05, + "loss": 1.5401, + "step": 1054 + }, + { + "batch_num_effect_tokens": 7335, + "batch_num_samples": 150, + "batch_num_tokens": 52177, + "epoch": 0.95909, + "grad_norm": 3.730105400085449, + "learning_rate": 1.2434611227202591e-05, + "loss": 1.6948, + "step": 1055 + }, + { + "batch_num_effect_tokens": 6588, + "batch_num_samples": 149, + "batch_num_tokens": 52198, + "epoch": 0.96, + "grad_norm": 3.086858034133911, + "learning_rate": 1.2419218955996677e-05, + "loss": 1.5954, + "step": 1056 + }, + { + "batch_num_effect_tokens": 5296, + "batch_num_samples": 149, + "batch_num_tokens": 52144, + "epoch": 0.96091, + "grad_norm": 3.705068588256836, + "learning_rate": 1.2403820594409926e-05, + "loss": 1.875, + "step": 1057 + }, + { + "batch_num_effect_tokens": 10256, + "batch_num_samples": 149, + "batch_num_tokens": 52203, + "epoch": 0.96182, + "grad_norm": 2.9357504844665527, + "learning_rate": 1.238841618120769e-05, + "loss": 2.0334, + "step": 1058 + }, + { + "batch_num_effect_tokens": 6726, + "batch_num_samples": 149, + "batch_num_tokens": 52199, + "epoch": 0.96273, + "grad_norm": 3.0499720573425293, + "learning_rate": 1.2373005755170563e-05, + "loss": 1.666, + "step": 1059 + }, + { + "batch_num_effect_tokens": 7094, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 0.96364, + "grad_norm": 3.0460546016693115, + "learning_rate": 1.2357589355094275e-05, + "loss": 1.9798, + "step": 1060 + }, + { + "batch_num_effect_tokens": 7578, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 0.96455, + "grad_norm": 2.8582043647766113, + "learning_rate": 1.234216701978959e-05, + "loss": 1.9126, + "step": 1061 + }, + { + "batch_num_effect_tokens": 4946, + "batch_num_samples": 149, + "batch_num_tokens": 52219, + "epoch": 0.96545, + "grad_norm": 2.9517149925231934, + "learning_rate": 1.2326738788082225e-05, + "loss": 1.2517, + "step": 1062 + }, + { + "batch_num_effect_tokens": 11793, + "batch_num_samples": 149, + "batch_num_tokens": 52161, + "epoch": 0.96636, + "grad_norm": 2.5095744132995605, + "learning_rate": 1.2311304698812732e-05, + "loss": 2.041, + "step": 1063 + }, + { + "batch_num_effect_tokens": 5526, + "batch_num_samples": 149, + "batch_num_tokens": 52161, + "epoch": 0.96727, + "grad_norm": 3.3237781524658203, + "learning_rate": 1.2295864790836411e-05, + "loss": 1.6071, + "step": 1064 + }, + { + "batch_num_effect_tokens": 5858, + "batch_num_samples": 149, + "batch_num_tokens": 52149, + "epoch": 0.96818, + "grad_norm": 3.2829267978668213, + "learning_rate": 1.2280419103023219e-05, + "loss": 1.843, + "step": 1065 + }, + { + "batch_num_effect_tokens": 5569, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 0.96909, + "grad_norm": 3.221628189086914, + "learning_rate": 1.2264967674257647e-05, + "loss": 1.5967, + "step": 1066 + }, + { + "batch_num_effect_tokens": 6244, + "batch_num_samples": 149, + "batch_num_tokens": 52188, + "epoch": 0.97, + "grad_norm": 3.0191125869750977, + "learning_rate": 1.2249510543438652e-05, + "loss": 1.6961, + "step": 1067 + }, + { + "batch_num_effect_tokens": 6027, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 0.97091, + "grad_norm": 3.282532215118408, + "learning_rate": 1.2234047749479543e-05, + "loss": 1.8343, + "step": 1068 + }, + { + "batch_num_effect_tokens": 7141, + "batch_num_samples": 150, + "batch_num_tokens": 52216, + "epoch": 0.97182, + "grad_norm": 2.938680648803711, + "learning_rate": 1.2218579331307889e-05, + "loss": 1.863, + "step": 1069 + }, + { + "batch_num_effect_tokens": 6113, + "batch_num_samples": 149, + "batch_num_tokens": 52094, + "epoch": 0.97273, + "grad_norm": 3.5186402797698975, + "learning_rate": 1.2203105327865407e-05, + "loss": 1.8756, + "step": 1070 + }, + { + "batch_num_effect_tokens": 6241, + "batch_num_samples": 149, + "batch_num_tokens": 52186, + "epoch": 0.97364, + "grad_norm": 3.062868118286133, + "learning_rate": 1.218762577810789e-05, + "loss": 1.526, + "step": 1071 + }, + { + "batch_num_effect_tokens": 5190, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 0.97455, + "grad_norm": 2.7961323261260986, + "learning_rate": 1.217214072100508e-05, + "loss": 1.3068, + "step": 1072 + }, + { + "batch_num_effect_tokens": 7362, + "batch_num_samples": 149, + "batch_num_tokens": 52215, + "epoch": 0.97545, + "grad_norm": 2.909553289413452, + "learning_rate": 1.2156650195540592e-05, + "loss": 1.9114, + "step": 1073 + }, + { + "batch_num_effect_tokens": 7249, + "batch_num_samples": 149, + "batch_num_tokens": 52177, + "epoch": 0.97636, + "grad_norm": 2.9898719787597656, + "learning_rate": 1.2141154240711806e-05, + "loss": 2.0127, + "step": 1074 + }, + { + "batch_num_effect_tokens": 7332, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.97727, + "grad_norm": 2.9858486652374268, + "learning_rate": 1.2125652895529766e-05, + "loss": 2.0471, + "step": 1075 + }, + { + "batch_num_effect_tokens": 7555, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.97818, + "grad_norm": 3.1468863487243652, + "learning_rate": 1.2110146199019099e-05, + "loss": 2.057, + "step": 1076 + }, + { + "batch_num_effect_tokens": 6093, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 0.97909, + "grad_norm": 3.0881295204162598, + "learning_rate": 1.2094634190217886e-05, + "loss": 1.6342, + "step": 1077 + }, + { + "batch_num_effect_tokens": 10563, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 0.98, + "grad_norm": 2.679718017578125, + "learning_rate": 1.2079116908177592e-05, + "loss": 2.127, + "step": 1078 + }, + { + "batch_num_effect_tokens": 5258, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 0.98091, + "grad_norm": 2.9763917922973633, + "learning_rate": 1.2063594391962963e-05, + "loss": 1.5558, + "step": 1079 + }, + { + "batch_num_effect_tokens": 4843, + "batch_num_samples": 149, + "batch_num_tokens": 52118, + "epoch": 0.98182, + "grad_norm": 3.296865940093994, + "learning_rate": 1.2048066680651908e-05, + "loss": 1.571, + "step": 1080 + }, + { + "batch_num_effect_tokens": 5420, + "batch_num_samples": 149, + "batch_num_tokens": 52157, + "epoch": 0.98273, + "grad_norm": 3.038447856903076, + "learning_rate": 1.2032533813335423e-05, + "loss": 1.5036, + "step": 1081 + }, + { + "batch_num_effect_tokens": 5103, + "batch_num_samples": 149, + "batch_num_tokens": 52188, + "epoch": 0.98364, + "grad_norm": 3.19463849067688, + "learning_rate": 1.2016995829117489e-05, + "loss": 1.9069, + "step": 1082 + }, + { + "batch_num_effect_tokens": 8013, + "batch_num_samples": 149, + "batch_num_tokens": 52169, + "epoch": 0.98455, + "grad_norm": 3.001164197921753, + "learning_rate": 1.2001452767114952e-05, + "loss": 2.1403, + "step": 1083 + }, + { + "batch_num_effect_tokens": 6417, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 0.98545, + "grad_norm": 2.9088382720947266, + "learning_rate": 1.1985904666457455e-05, + "loss": 1.4724, + "step": 1084 + }, + { + "batch_num_effect_tokens": 5631, + "batch_num_samples": 150, + "batch_num_tokens": 52142, + "epoch": 0.98636, + "grad_norm": 3.3566412925720215, + "learning_rate": 1.1970351566287332e-05, + "loss": 1.7628, + "step": 1085 + }, + { + "batch_num_effect_tokens": 6616, + "batch_num_samples": 149, + "batch_num_tokens": 52221, + "epoch": 0.98727, + "grad_norm": 3.4458236694335938, + "learning_rate": 1.1954793505759484e-05, + "loss": 1.8574, + "step": 1086 + }, + { + "batch_num_effect_tokens": 5497, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 0.98818, + "grad_norm": 3.271362781524658, + "learning_rate": 1.1939230524041315e-05, + "loss": 1.7046, + "step": 1087 + }, + { + "batch_num_effect_tokens": 7787, + "batch_num_samples": 149, + "batch_num_tokens": 52163, + "epoch": 0.98909, + "grad_norm": 3.0926859378814697, + "learning_rate": 1.1923662660312611e-05, + "loss": 1.8148, + "step": 1088 + }, + { + "batch_num_effect_tokens": 7477, + "batch_num_samples": 149, + "batch_num_tokens": 52139, + "epoch": 0.99, + "grad_norm": 2.8267569541931152, + "learning_rate": 1.190808995376545e-05, + "loss": 1.9415, + "step": 1089 + }, + { + "batch_num_effect_tokens": 6118, + "batch_num_samples": 149, + "batch_num_tokens": 52146, + "epoch": 0.99091, + "grad_norm": 3.279690980911255, + "learning_rate": 1.1892512443604103e-05, + "loss": 1.4473, + "step": 1090 + }, + { + "batch_num_effect_tokens": 5825, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 0.99182, + "grad_norm": 2.950528383255005, + "learning_rate": 1.1876930169044935e-05, + "loss": 1.6913, + "step": 1091 + }, + { + "batch_num_effect_tokens": 5726, + "batch_num_samples": 149, + "batch_num_tokens": 52141, + "epoch": 0.99273, + "grad_norm": 3.1925201416015625, + "learning_rate": 1.1861343169316301e-05, + "loss": 1.6325, + "step": 1092 + }, + { + "batch_num_effect_tokens": 5895, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 0.99364, + "grad_norm": 2.982030153274536, + "learning_rate": 1.1845751483658454e-05, + "loss": 1.5498, + "step": 1093 + }, + { + "batch_num_effect_tokens": 3728, + "batch_num_samples": 149, + "batch_num_tokens": 52204, + "epoch": 0.99455, + "grad_norm": 3.6174166202545166, + "learning_rate": 1.1830155151323447e-05, + "loss": 1.2632, + "step": 1094 + }, + { + "batch_num_effect_tokens": 4942, + "batch_num_samples": 149, + "batch_num_tokens": 52127, + "epoch": 0.99545, + "grad_norm": 3.5724058151245117, + "learning_rate": 1.1814554211575026e-05, + "loss": 1.7337, + "step": 1095 + }, + { + "batch_num_effect_tokens": 8458, + "batch_num_samples": 150, + "batch_num_tokens": 52194, + "epoch": 0.99636, + "grad_norm": 2.827831268310547, + "learning_rate": 1.179894870368854e-05, + "loss": 2.0079, + "step": 1096 + }, + { + "batch_num_effect_tokens": 3842, + "batch_num_samples": 149, + "batch_num_tokens": 52183, + "epoch": 0.99727, + "grad_norm": 3.108935832977295, + "learning_rate": 1.1783338666950832e-05, + "loss": 1.1461, + "step": 1097 + }, + { + "batch_num_effect_tokens": 5537, + "batch_num_samples": 149, + "batch_num_tokens": 52188, + "epoch": 0.99818, + "grad_norm": 3.582533359527588, + "learning_rate": 1.1767724140660158e-05, + "loss": 1.7423, + "step": 1098 + }, + { + "batch_num_effect_tokens": 8780, + "batch_num_samples": 149, + "batch_num_tokens": 52211, + "epoch": 0.99909, + "grad_norm": 3.0004284381866455, + "learning_rate": 1.1752105164126062e-05, + "loss": 1.9102, + "step": 1099 + }, + { + "batch_num_effect_tokens": 8035, + "batch_num_samples": 149, + "batch_num_tokens": 52187, + "epoch": 1.0, + "grad_norm": 3.2643775939941406, + "learning_rate": 1.1736481776669307e-05, + "loss": 1.7432, + "step": 1100 + }, + { + "batch_num_effect_tokens": 6503, + "batch_num_samples": 149, + "batch_num_tokens": 52110, + "epoch": 1.00091, + "grad_norm": 3.069766044616699, + "learning_rate": 1.1720854017621744e-05, + "loss": 0.9619, + "step": 1101 + }, + { + "batch_num_effect_tokens": 4830, + "batch_num_samples": 149, + "batch_num_tokens": 52160, + "epoch": 1.00182, + "grad_norm": 3.834742307662964, + "learning_rate": 1.170522192632624e-05, + "loss": 1.0151, + "step": 1102 + }, + { + "batch_num_effect_tokens": 7808, + "batch_num_samples": 149, + "batch_num_tokens": 52215, + "epoch": 1.00273, + "grad_norm": 3.0445826053619385, + "learning_rate": 1.1689585542136568e-05, + "loss": 1.398, + "step": 1103 + }, + { + "batch_num_effect_tokens": 8222, + "batch_num_samples": 149, + "batch_num_tokens": 52189, + "epoch": 1.00364, + "grad_norm": 3.089691162109375, + "learning_rate": 1.1673944904417309e-05, + "loss": 1.8757, + "step": 1104 + }, + { + "batch_num_effect_tokens": 5186, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 1.00455, + "grad_norm": 3.261866569519043, + "learning_rate": 1.1658300052543742e-05, + "loss": 1.0662, + "step": 1105 + }, + { + "batch_num_effect_tokens": 5068, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 1.00545, + "grad_norm": 3.276542901992798, + "learning_rate": 1.1642651025901772e-05, + "loss": 0.9319, + "step": 1106 + }, + { + "batch_num_effect_tokens": 4540, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 1.00636, + "grad_norm": 3.2273130416870117, + "learning_rate": 1.1626997863887801e-05, + "loss": 0.7239, + "step": 1107 + }, + { + "batch_num_effect_tokens": 4436, + "batch_num_samples": 149, + "batch_num_tokens": 52222, + "epoch": 1.00727, + "grad_norm": 3.6009256839752197, + "learning_rate": 1.1611340605908643e-05, + "loss": 0.9413, + "step": 1108 + }, + { + "batch_num_effect_tokens": 7451, + "batch_num_samples": 149, + "batch_num_tokens": 52132, + "epoch": 1.00818, + "grad_norm": 3.0169732570648193, + "learning_rate": 1.159567929138143e-05, + "loss": 1.0808, + "step": 1109 + }, + { + "batch_num_effect_tokens": 8093, + "batch_num_samples": 150, + "batch_num_tokens": 52217, + "epoch": 1.00909, + "grad_norm": 2.988909959793091, + "learning_rate": 1.15800139597335e-05, + "loss": 1.3387, + "step": 1110 + }, + { + "batch_num_effect_tokens": 7321, + "batch_num_samples": 149, + "batch_num_tokens": 52155, + "epoch": 1.01, + "grad_norm": 2.798431396484375, + "learning_rate": 1.156434465040231e-05, + "loss": 0.8343, + "step": 1111 + }, + { + "batch_num_effect_tokens": 7842, + "batch_num_samples": 149, + "batch_num_tokens": 52177, + "epoch": 1.01091, + "grad_norm": 3.4338691234588623, + "learning_rate": 1.1548671402835325e-05, + "loss": 1.2682, + "step": 1112 + }, + { + "batch_num_effect_tokens": 5664, + "batch_num_samples": 149, + "batch_num_tokens": 52157, + "epoch": 1.01182, + "grad_norm": 3.4470508098602295, + "learning_rate": 1.1532994256489926e-05, + "loss": 0.8029, + "step": 1113 + }, + { + "batch_num_effect_tokens": 5989, + "batch_num_samples": 149, + "batch_num_tokens": 52082, + "epoch": 1.01273, + "grad_norm": 3.639939069747925, + "learning_rate": 1.1517313250833318e-05, + "loss": 0.8502, + "step": 1114 + }, + { + "batch_num_effect_tokens": 5629, + "batch_num_samples": 149, + "batch_num_tokens": 52204, + "epoch": 1.01364, + "grad_norm": 3.206958532333374, + "learning_rate": 1.1501628425342404e-05, + "loss": 0.8156, + "step": 1115 + }, + { + "batch_num_effect_tokens": 6388, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.01455, + "grad_norm": 3.441685676574707, + "learning_rate": 1.1485939819503717e-05, + "loss": 0.8009, + "step": 1116 + }, + { + "batch_num_effect_tokens": 5590, + "batch_num_samples": 149, + "batch_num_tokens": 52171, + "epoch": 1.01545, + "grad_norm": 4.197703838348389, + "learning_rate": 1.147024747281331e-05, + "loss": 0.887, + "step": 1117 + }, + { + "batch_num_effect_tokens": 7665, + "batch_num_samples": 149, + "batch_num_tokens": 52158, + "epoch": 1.01636, + "grad_norm": 3.792309284210205, + "learning_rate": 1.1454551424776636e-05, + "loss": 1.5214, + "step": 1118 + }, + { + "batch_num_effect_tokens": 5329, + "batch_num_samples": 149, + "batch_num_tokens": 52115, + "epoch": 1.01727, + "grad_norm": 3.148326873779297, + "learning_rate": 1.1438851714908483e-05, + "loss": 0.5887, + "step": 1119 + }, + { + "batch_num_effect_tokens": 4729, + "batch_num_samples": 149, + "batch_num_tokens": 52171, + "epoch": 1.01818, + "grad_norm": 3.497055768966675, + "learning_rate": 1.1423148382732854e-05, + "loss": 0.7, + "step": 1120 + }, + { + "batch_num_effect_tokens": 7931, + "batch_num_samples": 149, + "batch_num_tokens": 52112, + "epoch": 1.01909, + "grad_norm": 3.3380677700042725, + "learning_rate": 1.1407441467782865e-05, + "loss": 1.3483, + "step": 1121 + }, + { + "batch_num_effect_tokens": 8053, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 1.02, + "grad_norm": 3.140716075897217, + "learning_rate": 1.1391731009600655e-05, + "loss": 1.4393, + "step": 1122 + }, + { + "batch_num_effect_tokens": 3696, + "batch_num_samples": 149, + "batch_num_tokens": 52090, + "epoch": 1.02091, + "grad_norm": 2.911566972732544, + "learning_rate": 1.1376017047737292e-05, + "loss": 0.5999, + "step": 1123 + }, + { + "batch_num_effect_tokens": 8184, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 1.02182, + "grad_norm": 2.7683959007263184, + "learning_rate": 1.1360299621752644e-05, + "loss": 1.0861, + "step": 1124 + }, + { + "batch_num_effect_tokens": 8803, + "batch_num_samples": 149, + "batch_num_tokens": 52133, + "epoch": 1.02273, + "grad_norm": 3.8152406215667725, + "learning_rate": 1.1344578771215319e-05, + "loss": 2.0837, + "step": 1125 + }, + { + "batch_num_effect_tokens": 4662, + "batch_num_samples": 149, + "batch_num_tokens": 52187, + "epoch": 1.02364, + "grad_norm": 3.0464205741882324, + "learning_rate": 1.1328854535702542e-05, + "loss": 0.986, + "step": 1126 + }, + { + "batch_num_effect_tokens": 8958, + "batch_num_samples": 149, + "batch_num_tokens": 52132, + "epoch": 1.02455, + "grad_norm": 2.634942054748535, + "learning_rate": 1.1313126954800053e-05, + "loss": 1.2716, + "step": 1127 + }, + { + "batch_num_effect_tokens": 6304, + "batch_num_samples": 149, + "batch_num_tokens": 52221, + "epoch": 1.02545, + "grad_norm": 3.5925192832946777, + "learning_rate": 1.1297396068102019e-05, + "loss": 1.7713, + "step": 1128 + }, + { + "batch_num_effect_tokens": 6005, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 1.02636, + "grad_norm": 2.9206326007843018, + "learning_rate": 1.1281661915210931e-05, + "loss": 0.9858, + "step": 1129 + }, + { + "batch_num_effect_tokens": 5744, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 1.02727, + "grad_norm": 3.2044382095336914, + "learning_rate": 1.1265924535737494e-05, + "loss": 1.0135, + "step": 1130 + }, + { + "batch_num_effect_tokens": 4729, + "batch_num_samples": 149, + "batch_num_tokens": 52209, + "epoch": 1.02818, + "grad_norm": 3.457561492919922, + "learning_rate": 1.1250183969300547e-05, + "loss": 0.6748, + "step": 1131 + }, + { + "batch_num_effect_tokens": 6517, + "batch_num_samples": 149, + "batch_num_tokens": 50559, + "epoch": 1.02909, + "grad_norm": 3.122835874557495, + "learning_rate": 1.1234440255526948e-05, + "loss": 1.1057, + "step": 1132 + }, + { + "batch_num_effect_tokens": 8301, + "batch_num_samples": 149, + "batch_num_tokens": 52218, + "epoch": 1.03, + "grad_norm": 3.4122023582458496, + "learning_rate": 1.1218693434051475e-05, + "loss": 1.5586, + "step": 1133 + }, + { + "batch_num_effect_tokens": 5158, + "batch_num_samples": 149, + "batch_num_tokens": 52214, + "epoch": 1.03091, + "grad_norm": 3.544649600982666, + "learning_rate": 1.1202943544516736e-05, + "loss": 0.9804, + "step": 1134 + }, + { + "batch_num_effect_tokens": 5538, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 1.03182, + "grad_norm": 2.7127685546875, + "learning_rate": 1.1187190626573052e-05, + "loss": 0.6383, + "step": 1135 + }, + { + "batch_num_effect_tokens": 6253, + "batch_num_samples": 149, + "batch_num_tokens": 52161, + "epoch": 1.03273, + "grad_norm": 3.0972988605499268, + "learning_rate": 1.1171434719878385e-05, + "loss": 1.0666, + "step": 1136 + }, + { + "batch_num_effect_tokens": 5190, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 1.03364, + "grad_norm": 2.642263650894165, + "learning_rate": 1.11556758640982e-05, + "loss": 0.5828, + "step": 1137 + }, + { + "batch_num_effect_tokens": 7533, + "batch_num_samples": 149, + "batch_num_tokens": 52177, + "epoch": 1.03455, + "grad_norm": 3.2424395084381104, + "learning_rate": 1.1139914098905406e-05, + "loss": 1.219, + "step": 1138 + }, + { + "batch_num_effect_tokens": 7229, + "batch_num_samples": 149, + "batch_num_tokens": 52190, + "epoch": 1.03545, + "grad_norm": 3.8125650882720947, + "learning_rate": 1.112414946398023e-05, + "loss": 1.1979, + "step": 1139 + }, + { + "batch_num_effect_tokens": 6539, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 1.03636, + "grad_norm": 3.3987605571746826, + "learning_rate": 1.1108381999010111e-05, + "loss": 1.1991, + "step": 1140 + }, + { + "batch_num_effect_tokens": 5621, + "batch_num_samples": 149, + "batch_num_tokens": 52207, + "epoch": 1.03727, + "grad_norm": 3.7911298274993896, + "learning_rate": 1.1092611743689632e-05, + "loss": 0.9732, + "step": 1141 + }, + { + "batch_num_effect_tokens": 5192, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 1.03818, + "grad_norm": 4.324377536773682, + "learning_rate": 1.1076838737720392e-05, + "loss": 1.5059, + "step": 1142 + }, + { + "batch_num_effect_tokens": 6355, + "batch_num_samples": 149, + "batch_num_tokens": 52142, + "epoch": 1.03909, + "grad_norm": 3.5138485431671143, + "learning_rate": 1.1061063020810909e-05, + "loss": 1.1693, + "step": 1143 + }, + { + "batch_num_effect_tokens": 5902, + "batch_num_samples": 149, + "batch_num_tokens": 52102, + "epoch": 1.04, + "grad_norm": 2.956238031387329, + "learning_rate": 1.1045284632676535e-05, + "loss": 1.0026, + "step": 1144 + }, + { + "batch_num_effect_tokens": 7857, + "batch_num_samples": 149, + "batch_num_tokens": 52160, + "epoch": 1.04091, + "grad_norm": 3.6880264282226562, + "learning_rate": 1.1029503613039347e-05, + "loss": 1.8397, + "step": 1145 + }, + { + "batch_num_effect_tokens": 5948, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 1.04182, + "grad_norm": 2.982151508331299, + "learning_rate": 1.1013720001628034e-05, + "loss": 1.1197, + "step": 1146 + }, + { + "batch_num_effect_tokens": 3216, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.04273, + "grad_norm": 3.204832077026367, + "learning_rate": 1.0997933838177828e-05, + "loss": 0.6265, + "step": 1147 + }, + { + "batch_num_effect_tokens": 8085, + "batch_num_samples": 149, + "batch_num_tokens": 52115, + "epoch": 1.04364, + "grad_norm": 3.233100414276123, + "learning_rate": 1.0982145162430373e-05, + "loss": 1.432, + "step": 1148 + }, + { + "batch_num_effect_tokens": 8282, + "batch_num_samples": 149, + "batch_num_tokens": 52139, + "epoch": 1.04455, + "grad_norm": 3.0157642364501953, + "learning_rate": 1.096635401413364e-05, + "loss": 1.2412, + "step": 1149 + }, + { + "batch_num_effect_tokens": 5305, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.04545, + "grad_norm": 3.3882460594177246, + "learning_rate": 1.0950560433041825e-05, + "loss": 0.9461, + "step": 1150 + }, + { + "batch_num_effect_tokens": 4103, + "batch_num_samples": 149, + "batch_num_tokens": 52166, + "epoch": 1.04636, + "grad_norm": 3.6615240573883057, + "learning_rate": 1.0934764458915258e-05, + "loss": 0.8385, + "step": 1151 + }, + { + "batch_num_effect_tokens": 6726, + "batch_num_samples": 149, + "batch_num_tokens": 52199, + "epoch": 1.04727, + "grad_norm": 2.953944206237793, + "learning_rate": 1.0918966131520276e-05, + "loss": 0.9336, + "step": 1152 + }, + { + "batch_num_effect_tokens": 6332, + "batch_num_samples": 149, + "batch_num_tokens": 52180, + "epoch": 1.04818, + "grad_norm": 4.152585506439209, + "learning_rate": 1.0903165490629152e-05, + "loss": 1.6518, + "step": 1153 + }, + { + "batch_num_effect_tokens": 6578, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 1.04909, + "grad_norm": 3.0139248371124268, + "learning_rate": 1.0887362576019981e-05, + "loss": 0.9943, + "step": 1154 + }, + { + "batch_num_effect_tokens": 6072, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 1.05, + "grad_norm": 3.005803108215332, + "learning_rate": 1.0871557427476585e-05, + "loss": 0.9334, + "step": 1155 + }, + { + "batch_num_effect_tokens": 5623, + "batch_num_samples": 149, + "batch_num_tokens": 52132, + "epoch": 1.05091, + "grad_norm": 3.361359119415283, + "learning_rate": 1.08557500847884e-05, + "loss": 0.9286, + "step": 1156 + }, + { + "batch_num_effect_tokens": 6921, + "batch_num_samples": 149, + "batch_num_tokens": 52212, + "epoch": 1.05182, + "grad_norm": 3.7183468341827393, + "learning_rate": 1.0839940587750394e-05, + "loss": 1.3418, + "step": 1157 + }, + { + "batch_num_effect_tokens": 7384, + "batch_num_samples": 149, + "batch_num_tokens": 52128, + "epoch": 1.05273, + "grad_norm": 8.443079948425293, + "learning_rate": 1.0824128976162964e-05, + "loss": 1.4034, + "step": 1158 + }, + { + "batch_num_effect_tokens": 7243, + "batch_num_samples": 149, + "batch_num_tokens": 52141, + "epoch": 1.05364, + "grad_norm": 2.8543872833251953, + "learning_rate": 1.0808315289831814e-05, + "loss": 0.925, + "step": 1159 + }, + { + "batch_num_effect_tokens": 8632, + "batch_num_samples": 150, + "batch_num_tokens": 52216, + "epoch": 1.05455, + "grad_norm": 2.774327516555786, + "learning_rate": 1.0792499568567885e-05, + "loss": 1.2081, + "step": 1160 + }, + { + "batch_num_effect_tokens": 7412, + "batch_num_samples": 150, + "batch_num_tokens": 52215, + "epoch": 1.05545, + "grad_norm": 3.0567374229431152, + "learning_rate": 1.0776681852187239e-05, + "loss": 1.1328, + "step": 1161 + }, + { + "batch_num_effect_tokens": 4646, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 1.05636, + "grad_norm": 2.918250560760498, + "learning_rate": 1.076086218051095e-05, + "loss": 0.5601, + "step": 1162 + }, + { + "batch_num_effect_tokens": 5703, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 1.05727, + "grad_norm": 3.635793924331665, + "learning_rate": 1.0745040593365032e-05, + "loss": 1.2372, + "step": 1163 + }, + { + "batch_num_effect_tokens": 6002, + "batch_num_samples": 149, + "batch_num_tokens": 52185, + "epoch": 1.05818, + "grad_norm": 3.530992269515991, + "learning_rate": 1.0729217130580309e-05, + "loss": 1.1045, + "step": 1164 + }, + { + "batch_num_effect_tokens": 6150, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 1.05909, + "grad_norm": 3.4126088619232178, + "learning_rate": 1.0713391831992324e-05, + "loss": 1.3539, + "step": 1165 + }, + { + "batch_num_effect_tokens": 7142, + "batch_num_samples": 150, + "batch_num_tokens": 52214, + "epoch": 1.06, + "grad_norm": 2.9573259353637695, + "learning_rate": 1.0697564737441254e-05, + "loss": 1.144, + "step": 1166 + }, + { + "batch_num_effect_tokens": 6675, + "batch_num_samples": 149, + "batch_num_tokens": 52177, + "epoch": 1.06091, + "grad_norm": 3.1278979778289795, + "learning_rate": 1.068173588677179e-05, + "loss": 1.0402, + "step": 1167 + }, + { + "batch_num_effect_tokens": 6747, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.06182, + "grad_norm": 4.912282943725586, + "learning_rate": 1.066590531983304e-05, + "loss": 1.7302, + "step": 1168 + }, + { + "batch_num_effect_tokens": 6328, + "batch_num_samples": 149, + "batch_num_tokens": 52159, + "epoch": 1.06273, + "grad_norm": 3.3606529235839844, + "learning_rate": 1.0650073076478442e-05, + "loss": 1.3774, + "step": 1169 + }, + { + "batch_num_effect_tokens": 4477, + "batch_num_samples": 149, + "batch_num_tokens": 52185, + "epoch": 1.06364, + "grad_norm": 2.675183057785034, + "learning_rate": 1.0634239196565646e-05, + "loss": 0.5866, + "step": 1170 + }, + { + "batch_num_effect_tokens": 8367, + "batch_num_samples": 150, + "batch_num_tokens": 52187, + "epoch": 1.06455, + "grad_norm": 2.8494794368743896, + "learning_rate": 1.0618403719956431e-05, + "loss": 1.1782, + "step": 1171 + }, + { + "batch_num_effect_tokens": 6840, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.06545, + "grad_norm": 3.3997509479522705, + "learning_rate": 1.0602566686516586e-05, + "loss": 1.3806, + "step": 1172 + }, + { + "batch_num_effect_tokens": 6867, + "batch_num_samples": 149, + "batch_num_tokens": 52174, + "epoch": 1.06636, + "grad_norm": 2.9302570819854736, + "learning_rate": 1.0586728136115824e-05, + "loss": 1.0178, + "step": 1173 + }, + { + "batch_num_effect_tokens": 5868, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 1.06727, + "grad_norm": 2.8624753952026367, + "learning_rate": 1.0570888108627682e-05, + "loss": 0.7767, + "step": 1174 + }, + { + "batch_num_effect_tokens": 5831, + "batch_num_samples": 149, + "batch_num_tokens": 52196, + "epoch": 1.06818, + "grad_norm": 3.2073261737823486, + "learning_rate": 1.0555046643929402e-05, + "loss": 1.0023, + "step": 1175 + }, + { + "batch_num_effect_tokens": 6077, + "batch_num_samples": 149, + "batch_num_tokens": 52214, + "epoch": 1.06909, + "grad_norm": 3.3000328540802, + "learning_rate": 1.053920378190186e-05, + "loss": 1.2256, + "step": 1176 + }, + { + "batch_num_effect_tokens": 7508, + "batch_num_samples": 149, + "batch_num_tokens": 52099, + "epoch": 1.07, + "grad_norm": 2.593930959701538, + "learning_rate": 1.0523359562429441e-05, + "loss": 0.7808, + "step": 1177 + }, + { + "batch_num_effect_tokens": 7353, + "batch_num_samples": 149, + "batch_num_tokens": 52213, + "epoch": 1.07091, + "grad_norm": 3.004465341567993, + "learning_rate": 1.0507514025399944e-05, + "loss": 1.1786, + "step": 1178 + }, + { + "batch_num_effect_tokens": 7435, + "batch_num_samples": 149, + "batch_num_tokens": 52170, + "epoch": 1.07182, + "grad_norm": 2.8977913856506348, + "learning_rate": 1.0491667210704492e-05, + "loss": 1.0343, + "step": 1179 + }, + { + "batch_num_effect_tokens": 5870, + "batch_num_samples": 149, + "batch_num_tokens": 52146, + "epoch": 1.07273, + "grad_norm": 3.019353151321411, + "learning_rate": 1.0475819158237426e-05, + "loss": 0.8489, + "step": 1180 + }, + { + "batch_num_effect_tokens": 5977, + "batch_num_samples": 149, + "batch_num_tokens": 52116, + "epoch": 1.07364, + "grad_norm": 3.124189615249634, + "learning_rate": 1.0459969907896193e-05, + "loss": 0.7553, + "step": 1181 + }, + { + "batch_num_effect_tokens": 7198, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 1.07455, + "grad_norm": 4.0742387771606445, + "learning_rate": 1.0444119499581263e-05, + "loss": 1.6221, + "step": 1182 + }, + { + "batch_num_effect_tokens": 7704, + "batch_num_samples": 149, + "batch_num_tokens": 52142, + "epoch": 1.07545, + "grad_norm": 2.7200570106506348, + "learning_rate": 1.0428267973196027e-05, + "loss": 0.794, + "step": 1183 + }, + { + "batch_num_effect_tokens": 8035, + "batch_num_samples": 149, + "batch_num_tokens": 52187, + "epoch": 1.07636, + "grad_norm": 3.0718557834625244, + "learning_rate": 1.0412415368646674e-05, + "loss": 1.0329, + "step": 1184 + }, + { + "batch_num_effect_tokens": 5666, + "batch_num_samples": 149, + "batch_num_tokens": 52200, + "epoch": 1.07727, + "grad_norm": 3.2390568256378174, + "learning_rate": 1.0396561725842124e-05, + "loss": 0.6177, + "step": 1185 + }, + { + "batch_num_effect_tokens": 6701, + "batch_num_samples": 149, + "batch_num_tokens": 52180, + "epoch": 1.07818, + "grad_norm": 3.020876884460449, + "learning_rate": 1.0380707084693902e-05, + "loss": 1.1314, + "step": 1186 + }, + { + "batch_num_effect_tokens": 7035, + "batch_num_samples": 149, + "batch_num_tokens": 52183, + "epoch": 1.07909, + "grad_norm": 4.152306079864502, + "learning_rate": 1.0364851485116047e-05, + "loss": 1.1334, + "step": 1187 + }, + { + "batch_num_effect_tokens": 8179, + "batch_num_samples": 149, + "batch_num_tokens": 52137, + "epoch": 1.08, + "grad_norm": 3.2629177570343018, + "learning_rate": 1.0348994967025012e-05, + "loss": 1.4448, + "step": 1188 + }, + { + "batch_num_effect_tokens": 9174, + "batch_num_samples": 149, + "batch_num_tokens": 52159, + "epoch": 1.08091, + "grad_norm": 3.2266151905059814, + "learning_rate": 1.0333137570339563e-05, + "loss": 1.2867, + "step": 1189 + }, + { + "batch_num_effect_tokens": 6245, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.08182, + "grad_norm": 3.2580463886260986, + "learning_rate": 1.031727933498068e-05, + "loss": 0.9441, + "step": 1190 + }, + { + "batch_num_effect_tokens": 7607, + "batch_num_samples": 149, + "batch_num_tokens": 52161, + "epoch": 1.08273, + "grad_norm": 2.951713800430298, + "learning_rate": 1.0301420300871445e-05, + "loss": 1.109, + "step": 1191 + }, + { + "batch_num_effect_tokens": 7477, + "batch_num_samples": 149, + "batch_num_tokens": 52139, + "epoch": 1.08364, + "grad_norm": 2.900982618331909, + "learning_rate": 1.0285560507936962e-05, + "loss": 1.1732, + "step": 1192 + }, + { + "batch_num_effect_tokens": 5992, + "batch_num_samples": 149, + "batch_num_tokens": 52188, + "epoch": 1.08455, + "grad_norm": 4.045762538909912, + "learning_rate": 1.0269699996104246e-05, + "loss": 1.3233, + "step": 1193 + }, + { + "batch_num_effect_tokens": 5358, + "batch_num_samples": 149, + "batch_num_tokens": 52222, + "epoch": 1.08545, + "grad_norm": 3.2340285778045654, + "learning_rate": 1.0253838805302106e-05, + "loss": 0.5123, + "step": 1194 + }, + { + "batch_num_effect_tokens": 8761, + "batch_num_samples": 149, + "batch_num_tokens": 52164, + "epoch": 1.08636, + "grad_norm": 3.4777562618255615, + "learning_rate": 1.0237976975461074e-05, + "loss": 1.467, + "step": 1195 + }, + { + "batch_num_effect_tokens": 8088, + "batch_num_samples": 149, + "batch_num_tokens": 52209, + "epoch": 1.08727, + "grad_norm": 3.107084274291992, + "learning_rate": 1.0222114546513296e-05, + "loss": 1.2546, + "step": 1196 + }, + { + "batch_num_effect_tokens": 7250, + "batch_num_samples": 149, + "batch_num_tokens": 52118, + "epoch": 1.08818, + "grad_norm": 3.5884499549865723, + "learning_rate": 1.0206251558392408e-05, + "loss": 1.443, + "step": 1197 + }, + { + "batch_num_effect_tokens": 9426, + "batch_num_samples": 149, + "batch_num_tokens": 52199, + "epoch": 1.08909, + "grad_norm": 2.9463024139404297, + "learning_rate": 1.0190388051033466e-05, + "loss": 1.3789, + "step": 1198 + }, + { + "batch_num_effect_tokens": 6048, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.09, + "grad_norm": 3.3203890323638916, + "learning_rate": 1.0174524064372837e-05, + "loss": 1.1597, + "step": 1199 + }, + { + "batch_num_effect_tokens": 4853, + "batch_num_samples": 149, + "batch_num_tokens": 52202, + "epoch": 1.09091, + "grad_norm": 3.7079813480377197, + "learning_rate": 1.015865963834808e-05, + "loss": 1.1309, + "step": 1200 + }, + { + "batch_num_effect_tokens": 5824, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 1.09182, + "grad_norm": 3.3707385063171387, + "learning_rate": 1.0142794812897874e-05, + "loss": 1.0394, + "step": 1201 + }, + { + "batch_num_effect_tokens": 10838, + "batch_num_samples": 149, + "batch_num_tokens": 52183, + "epoch": 1.09273, + "grad_norm": 3.0608296394348145, + "learning_rate": 1.0126929627961896e-05, + "loss": 1.5519, + "step": 1202 + }, + { + "batch_num_effect_tokens": 5268, + "batch_num_samples": 149, + "batch_num_tokens": 52130, + "epoch": 1.09364, + "grad_norm": 3.28275728225708, + "learning_rate": 1.0111064123480734e-05, + "loss": 0.8913, + "step": 1203 + }, + { + "batch_num_effect_tokens": 8278, + "batch_num_samples": 149, + "batch_num_tokens": 52173, + "epoch": 1.09455, + "grad_norm": 3.032092332839966, + "learning_rate": 1.0095198339395769e-05, + "loss": 1.3646, + "step": 1204 + }, + { + "batch_num_effect_tokens": 5723, + "batch_num_samples": 149, + "batch_num_tokens": 52115, + "epoch": 1.09545, + "grad_norm": 3.0130598545074463, + "learning_rate": 1.0079332315649097e-05, + "loss": 1.1537, + "step": 1205 + }, + { + "batch_num_effect_tokens": 5837, + "batch_num_samples": 149, + "batch_num_tokens": 52173, + "epoch": 1.09636, + "grad_norm": 3.227560520172119, + "learning_rate": 1.006346609218342e-05, + "loss": 1.2023, + "step": 1206 + }, + { + "batch_num_effect_tokens": 7533, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 1.09727, + "grad_norm": 3.0318572521209717, + "learning_rate": 1.0047599708941926e-05, + "loss": 1.3757, + "step": 1207 + }, + { + "batch_num_effect_tokens": 6667, + "batch_num_samples": 149, + "batch_num_tokens": 52168, + "epoch": 1.09818, + "grad_norm": 3.773564338684082, + "learning_rate": 1.0031733205868223e-05, + "loss": 0.7025, + "step": 1208 + }, + { + "batch_num_effect_tokens": 9942, + "batch_num_samples": 149, + "batch_num_tokens": 52132, + "epoch": 1.09909, + "grad_norm": 2.6167032718658447, + "learning_rate": 1.0015866622906216e-05, + "loss": 1.2315, + "step": 1209 + }, + { + "batch_num_effect_tokens": 5369, + "batch_num_samples": 149, + "batch_num_tokens": 52126, + "epoch": 1.1, + "grad_norm": 3.179255485534668, + "learning_rate": 1e-05, + "loss": 0.689, + "step": 1210 + }, + { + "batch_num_effect_tokens": 7136, + "batch_num_samples": 149, + "batch_num_tokens": 52134, + "epoch": 1.10091, + "grad_norm": 2.66573429107666, + "learning_rate": 9.98413337709379e-06, + "loss": 1.0452, + "step": 1211 + }, + { + "batch_num_effect_tokens": 6263, + "batch_num_samples": 149, + "batch_num_tokens": 52173, + "epoch": 1.10182, + "grad_norm": 3.0495479106903076, + "learning_rate": 9.968266794131778e-06, + "loss": 1.0476, + "step": 1212 + }, + { + "batch_num_effect_tokens": 6082, + "batch_num_samples": 149, + "batch_num_tokens": 52185, + "epoch": 1.10273, + "grad_norm": 3.077975273132324, + "learning_rate": 9.952400291058078e-06, + "loss": 1.0274, + "step": 1213 + }, + { + "batch_num_effect_tokens": 4148, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 1.10364, + "grad_norm": 2.852982521057129, + "learning_rate": 9.936533907816583e-06, + "loss": 0.5047, + "step": 1214 + }, + { + "batch_num_effect_tokens": 7908, + "batch_num_samples": 150, + "batch_num_tokens": 52213, + "epoch": 1.10455, + "grad_norm": 3.789891481399536, + "learning_rate": 9.920667684350906e-06, + "loss": 1.5248, + "step": 1215 + }, + { + "batch_num_effect_tokens": 6645, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 1.10545, + "grad_norm": 3.0531599521636963, + "learning_rate": 9.904801660604234e-06, + "loss": 0.9681, + "step": 1216 + }, + { + "batch_num_effect_tokens": 7027, + "batch_num_samples": 149, + "batch_num_tokens": 52179, + "epoch": 1.10636, + "grad_norm": 4.8088788986206055, + "learning_rate": 9.888935876519272e-06, + "loss": 1.9039, + "step": 1217 + }, + { + "batch_num_effect_tokens": 6049, + "batch_num_samples": 149, + "batch_num_tokens": 52163, + "epoch": 1.10727, + "grad_norm": 3.2031102180480957, + "learning_rate": 9.873070372038106e-06, + "loss": 0.8509, + "step": 1218 + }, + { + "batch_num_effect_tokens": 6047, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.10818, + "grad_norm": 3.8017449378967285, + "learning_rate": 9.85720518710213e-06, + "loss": 1.3184, + "step": 1219 + }, + { + "batch_num_effect_tokens": 7350, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 1.10909, + "grad_norm": 3.482895612716675, + "learning_rate": 9.841340361651921e-06, + "loss": 1.4786, + "step": 1220 + }, + { + "batch_num_effect_tokens": 3934, + "batch_num_samples": 149, + "batch_num_tokens": 52168, + "epoch": 1.11, + "grad_norm": 3.3409523963928223, + "learning_rate": 9.825475935627165e-06, + "loss": 0.5001, + "step": 1221 + }, + { + "batch_num_effect_tokens": 7873, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 1.11091, + "grad_norm": 3.9254238605499268, + "learning_rate": 9.809611948966534e-06, + "loss": 2.0723, + "step": 1222 + }, + { + "batch_num_effect_tokens": 6719, + "batch_num_samples": 149, + "batch_num_tokens": 52128, + "epoch": 1.11182, + "grad_norm": 3.5105459690093994, + "learning_rate": 9.793748441607595e-06, + "loss": 1.5862, + "step": 1223 + }, + { + "batch_num_effect_tokens": 5100, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 1.11273, + "grad_norm": 2.8462579250335693, + "learning_rate": 9.777885453486706e-06, + "loss": 0.6737, + "step": 1224 + }, + { + "batch_num_effect_tokens": 8851, + "batch_num_samples": 149, + "batch_num_tokens": 50559, + "epoch": 1.11364, + "grad_norm": 3.0171377658843994, + "learning_rate": 9.762023024538928e-06, + "loss": 1.5563, + "step": 1225 + }, + { + "batch_num_effect_tokens": 8092, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 1.11455, + "grad_norm": 2.975921154022217, + "learning_rate": 9.746161194697895e-06, + "loss": 1.3743, + "step": 1226 + }, + { + "batch_num_effect_tokens": 9265, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 1.11545, + "grad_norm": 2.8814196586608887, + "learning_rate": 9.73030000389576e-06, + "loss": 1.4079, + "step": 1227 + }, + { + "batch_num_effect_tokens": 5494, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 1.11636, + "grad_norm": 2.375429391860962, + "learning_rate": 9.71443949206304e-06, + "loss": 0.4457, + "step": 1228 + }, + { + "batch_num_effect_tokens": 7578, + "batch_num_samples": 150, + "batch_num_tokens": 52141, + "epoch": 1.11727, + "grad_norm": 2.9076132774353027, + "learning_rate": 9.698579699128557e-06, + "loss": 1.1299, + "step": 1229 + }, + { + "batch_num_effect_tokens": 5535, + "batch_num_samples": 149, + "batch_num_tokens": 52136, + "epoch": 1.11818, + "grad_norm": 3.1913745403289795, + "learning_rate": 9.682720665019325e-06, + "loss": 0.9203, + "step": 1230 + }, + { + "batch_num_effect_tokens": 8991, + "batch_num_samples": 149, + "batch_num_tokens": 52202, + "epoch": 1.11909, + "grad_norm": 3.0180065631866455, + "learning_rate": 9.66686242966044e-06, + "loss": 1.6461, + "step": 1231 + }, + { + "batch_num_effect_tokens": 10561, + "batch_num_samples": 149, + "batch_num_tokens": 52198, + "epoch": 1.12, + "grad_norm": 2.6574461460113525, + "learning_rate": 9.651005032974994e-06, + "loss": 1.3578, + "step": 1232 + }, + { + "batch_num_effect_tokens": 6736, + "batch_num_samples": 149, + "batch_num_tokens": 52117, + "epoch": 1.12091, + "grad_norm": 3.393181324005127, + "learning_rate": 9.635148514883956e-06, + "loss": 1.4798, + "step": 1233 + }, + { + "batch_num_effect_tokens": 7173, + "batch_num_samples": 149, + "batch_num_tokens": 52146, + "epoch": 1.12182, + "grad_norm": 2.7249555587768555, + "learning_rate": 9.619292915306103e-06, + "loss": 0.9275, + "step": 1234 + }, + { + "batch_num_effect_tokens": 7145, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 1.12273, + "grad_norm": 2.728337526321411, + "learning_rate": 9.603438274157878e-06, + "loss": 0.8287, + "step": 1235 + }, + { + "batch_num_effect_tokens": 5026, + "batch_num_samples": 149, + "batch_num_tokens": 52139, + "epoch": 1.12364, + "grad_norm": 3.712592363357544, + "learning_rate": 9.58758463135333e-06, + "loss": 0.8681, + "step": 1236 + }, + { + "batch_num_effect_tokens": 6123, + "batch_num_samples": 149, + "batch_num_tokens": 52124, + "epoch": 1.12455, + "grad_norm": 3.236494541168213, + "learning_rate": 9.571732026803978e-06, + "loss": 0.9514, + "step": 1237 + }, + { + "batch_num_effect_tokens": 5858, + "batch_num_samples": 149, + "batch_num_tokens": 52149, + "epoch": 1.12545, + "grad_norm": 3.327521562576294, + "learning_rate": 9.555880500418739e-06, + "loss": 1.0204, + "step": 1238 + }, + { + "batch_num_effect_tokens": 5589, + "batch_num_samples": 149, + "batch_num_tokens": 52180, + "epoch": 1.12636, + "grad_norm": 3.551295518875122, + "learning_rate": 9.540030092103809e-06, + "loss": 0.9061, + "step": 1239 + }, + { + "batch_num_effect_tokens": 3728, + "batch_num_samples": 149, + "batch_num_tokens": 52204, + "epoch": 1.12727, + "grad_norm": 3.2720189094543457, + "learning_rate": 9.524180841762577e-06, + "loss": 0.4908, + "step": 1240 + }, + { + "batch_num_effect_tokens": 6744, + "batch_num_samples": 149, + "batch_num_tokens": 52161, + "epoch": 1.12818, + "grad_norm": 3.4976143836975098, + "learning_rate": 9.50833278929551e-06, + "loss": 1.0591, + "step": 1241 + }, + { + "batch_num_effect_tokens": 7771, + "batch_num_samples": 149, + "batch_num_tokens": 52209, + "epoch": 1.12909, + "grad_norm": 4.257693290710449, + "learning_rate": 9.49248597460006e-06, + "loss": 1.575, + "step": 1242 + }, + { + "batch_num_effect_tokens": 4107, + "batch_num_samples": 149, + "batch_num_tokens": 52176, + "epoch": 1.13, + "grad_norm": 3.3101322650909424, + "learning_rate": 9.476640437570562e-06, + "loss": 0.6062, + "step": 1243 + }, + { + "batch_num_effect_tokens": 7747, + "batch_num_samples": 149, + "batch_num_tokens": 52132, + "epoch": 1.13091, + "grad_norm": 3.23077654838562, + "learning_rate": 9.460796218098143e-06, + "loss": 1.1486, + "step": 1244 + }, + { + "batch_num_effect_tokens": 9415, + "batch_num_samples": 149, + "batch_num_tokens": 52170, + "epoch": 1.13182, + "grad_norm": 3.416395902633667, + "learning_rate": 9.444953356070601e-06, + "loss": 1.6655, + "step": 1245 + }, + { + "batch_num_effect_tokens": 5839, + "batch_num_samples": 149, + "batch_num_tokens": 52152, + "epoch": 1.13273, + "grad_norm": 5.638548374176025, + "learning_rate": 9.42911189137232e-06, + "loss": 1.4627, + "step": 1246 + }, + { + "batch_num_effect_tokens": 5462, + "batch_num_samples": 149, + "batch_num_tokens": 50560, + "epoch": 1.13364, + "grad_norm": 3.1694421768188477, + "learning_rate": 9.413271863884177e-06, + "loss": 1.0307, + "step": 1247 + }, + { + "batch_num_effect_tokens": 5970, + "batch_num_samples": 149, + "batch_num_tokens": 52112, + "epoch": 1.13455, + "grad_norm": 3.264805555343628, + "learning_rate": 9.397433313483417e-06, + "loss": 1.1321, + "step": 1248 + }, + { + "batch_num_effect_tokens": 4547, + "batch_num_samples": 149, + "batch_num_tokens": 52135, + "epoch": 1.13545, + "grad_norm": 3.039553642272949, + "learning_rate": 9.381596280043574e-06, + "loss": 0.5796, + "step": 1249 + }, + { + "batch_num_effect_tokens": 4459, + "batch_num_samples": 149, + "batch_num_tokens": 52164, + "epoch": 1.13636, + "grad_norm": 3.0265426635742188, + "learning_rate": 9.365760803434356e-06, + "loss": 0.6882, + "step": 1250 + }, + { + "batch_num_effect_tokens": 6473, + "batch_num_samples": 149, + "batch_num_tokens": 52209, + "epoch": 1.13727, + "grad_norm": 3.441258430480957, + "learning_rate": 9.349926923521563e-06, + "loss": 1.3783, + "step": 1251 + }, + { + "batch_num_effect_tokens": 4856, + "batch_num_samples": 149, + "batch_num_tokens": 50540, + "epoch": 1.13818, + "grad_norm": 3.1589627265930176, + "learning_rate": 9.334094680166962e-06, + "loss": 0.6254, + "step": 1252 + }, + { + "batch_num_effect_tokens": 5519, + "batch_num_samples": 149, + "batch_num_tokens": 52143, + "epoch": 1.13909, + "grad_norm": 2.8274736404418945, + "learning_rate": 9.318264113228215e-06, + "loss": 0.5885, + "step": 1253 + }, + { + "batch_num_effect_tokens": 9360, + "batch_num_samples": 149, + "batch_num_tokens": 52160, + "epoch": 1.14, + "grad_norm": 2.875990867614746, + "learning_rate": 9.302435262558748e-06, + "loss": 1.666, + "step": 1254 + }, + { + "batch_num_effect_tokens": 9990, + "batch_num_samples": 149, + "batch_num_tokens": 52167, + "epoch": 1.14091, + "grad_norm": 2.228811264038086, + "learning_rate": 9.286608168007678e-06, + "loss": 1.0264, + "step": 1255 + }, + { + "batch_num_effect_tokens": 5826, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.14182, + "grad_norm": 3.2085955142974854, + "learning_rate": 9.270782869419694e-06, + "loss": 0.768, + "step": 1256 + }, + { + "batch_num_effect_tokens": 6325, + "batch_num_samples": 149, + "batch_num_tokens": 52196, + "epoch": 1.14273, + "grad_norm": 3.118927001953125, + "learning_rate": 9.25495940663497e-06, + "loss": 1.1966, + "step": 1257 + }, + { + "batch_num_effect_tokens": 7294, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 1.14364, + "grad_norm": 2.9526238441467285, + "learning_rate": 9.239137819489047e-06, + "loss": 1.0929, + "step": 1258 + }, + { + "batch_num_effect_tokens": 4746, + "batch_num_samples": 149, + "batch_num_tokens": 50509, + "epoch": 1.14455, + "grad_norm": 3.432528257369995, + "learning_rate": 9.223318147812765e-06, + "loss": 0.8558, + "step": 1259 + }, + { + "batch_num_effect_tokens": 6433, + "batch_num_samples": 149, + "batch_num_tokens": 52169, + "epoch": 1.14545, + "grad_norm": 3.367201328277588, + "learning_rate": 9.207500431432115e-06, + "loss": 1.213, + "step": 1260 + }, + { + "batch_num_effect_tokens": 8317, + "batch_num_samples": 149, + "batch_num_tokens": 52166, + "epoch": 1.14636, + "grad_norm": 3.025263786315918, + "learning_rate": 9.191684710168188e-06, + "loss": 1.375, + "step": 1261 + }, + { + "batch_num_effect_tokens": 8129, + "batch_num_samples": 149, + "batch_num_tokens": 52222, + "epoch": 1.14727, + "grad_norm": 3.154592514038086, + "learning_rate": 9.175871023837042e-06, + "loss": 1.2919, + "step": 1262 + }, + { + "batch_num_effect_tokens": 7062, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 1.14818, + "grad_norm": 3.2619194984436035, + "learning_rate": 9.160059412249607e-06, + "loss": 1.1456, + "step": 1263 + }, + { + "batch_num_effect_tokens": 6844, + "batch_num_samples": 149, + "batch_num_tokens": 52130, + "epoch": 1.14909, + "grad_norm": 3.1822962760925293, + "learning_rate": 9.144249915211605e-06, + "loss": 1.1395, + "step": 1264 + }, + { + "batch_num_effect_tokens": 6011, + "batch_num_samples": 149, + "batch_num_tokens": 52141, + "epoch": 1.15, + "grad_norm": 3.5489284992218018, + "learning_rate": 9.128442572523418e-06, + "loss": 1.1341, + "step": 1265 + }, + { + "batch_num_effect_tokens": 5420, + "batch_num_samples": 149, + "batch_num_tokens": 52157, + "epoch": 1.15091, + "grad_norm": 2.9631083011627197, + "learning_rate": 9.11263742398002e-06, + "loss": 0.7856, + "step": 1266 + }, + { + "batch_num_effect_tokens": 7156, + "batch_num_samples": 149, + "batch_num_tokens": 50569, + "epoch": 1.15182, + "grad_norm": 4.286734104156494, + "learning_rate": 9.09683450937085e-06, + "loss": 2.0762, + "step": 1267 + }, + { + "batch_num_effect_tokens": 9164, + "batch_num_samples": 149, + "batch_num_tokens": 52168, + "epoch": 1.15273, + "grad_norm": 2.735157012939453, + "learning_rate": 9.081033868479727e-06, + "loss": 1.132, + "step": 1268 + }, + { + "batch_num_effect_tokens": 7674, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 1.15364, + "grad_norm": 2.8595211505889893, + "learning_rate": 9.065235541084745e-06, + "loss": 1.0823, + "step": 1269 + }, + { + "batch_num_effect_tokens": 5831, + "batch_num_samples": 149, + "batch_num_tokens": 52220, + "epoch": 1.15455, + "grad_norm": 2.9400060176849365, + "learning_rate": 9.049439566958176e-06, + "loss": 0.7926, + "step": 1270 + }, + { + "batch_num_effect_tokens": 7135, + "batch_num_samples": 149, + "batch_num_tokens": 52124, + "epoch": 1.15545, + "grad_norm": 2.6830992698669434, + "learning_rate": 9.033645985866361e-06, + "loss": 0.9602, + "step": 1271 + }, + { + "batch_num_effect_tokens": 8934, + "batch_num_samples": 149, + "batch_num_tokens": 52196, + "epoch": 1.15636, + "grad_norm": 3.0802054405212402, + "learning_rate": 9.017854837569629e-06, + "loss": 1.4231, + "step": 1272 + }, + { + "batch_num_effect_tokens": 7525, + "batch_num_samples": 149, + "batch_num_tokens": 52221, + "epoch": 1.15727, + "grad_norm": 2.9055581092834473, + "learning_rate": 9.002066161822174e-06, + "loss": 1.1685, + "step": 1273 + }, + { + "batch_num_effect_tokens": 9017, + "batch_num_samples": 149, + "batch_num_tokens": 52163, + "epoch": 1.15818, + "grad_norm": 2.9452826976776123, + "learning_rate": 8.986279998371968e-06, + "loss": 1.3317, + "step": 1274 + }, + { + "batch_num_effect_tokens": 5795, + "batch_num_samples": 149, + "batch_num_tokens": 52173, + "epoch": 1.15909, + "grad_norm": 4.065710067749023, + "learning_rate": 8.970496386960657e-06, + "loss": 1.2286, + "step": 1275 + }, + { + "batch_num_effect_tokens": 6780, + "batch_num_samples": 149, + "batch_num_tokens": 52170, + "epoch": 1.16, + "grad_norm": 3.2890782356262207, + "learning_rate": 8.954715367323468e-06, + "loss": 1.2892, + "step": 1276 + }, + { + "batch_num_effect_tokens": 6270, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 1.16091, + "grad_norm": 3.506971597671509, + "learning_rate": 8.938936979189091e-06, + "loss": 1.1679, + "step": 1277 + }, + { + "batch_num_effect_tokens": 6216, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 1.16182, + "grad_norm": 3.278193950653076, + "learning_rate": 8.923161262279611e-06, + "loss": 1.1085, + "step": 1278 + }, + { + "batch_num_effect_tokens": 7952, + "batch_num_samples": 149, + "batch_num_tokens": 52138, + "epoch": 1.16273, + "grad_norm": 3.214967966079712, + "learning_rate": 8.907388256310373e-06, + "loss": 1.2083, + "step": 1279 + }, + { + "batch_num_effect_tokens": 8910, + "batch_num_samples": 149, + "batch_num_tokens": 52180, + "epoch": 1.16364, + "grad_norm": 2.634761095046997, + "learning_rate": 8.89161800098989e-06, + "loss": 1.1466, + "step": 1280 + }, + { + "batch_num_effect_tokens": 4847, + "batch_num_samples": 149, + "batch_num_tokens": 52209, + "epoch": 1.16455, + "grad_norm": 3.5305864810943604, + "learning_rate": 8.875850536019775e-06, + "loss": 0.782, + "step": 1281 + }, + { + "batch_num_effect_tokens": 7283, + "batch_num_samples": 149, + "batch_num_tokens": 52137, + "epoch": 1.16545, + "grad_norm": 3.4275307655334473, + "learning_rate": 8.860085901094595e-06, + "loss": 1.2666, + "step": 1282 + }, + { + "batch_num_effect_tokens": 7931, + "batch_num_samples": 149, + "batch_num_tokens": 52174, + "epoch": 1.16636, + "grad_norm": 3.1967883110046387, + "learning_rate": 8.844324135901803e-06, + "loss": 1.2349, + "step": 1283 + }, + { + "batch_num_effect_tokens": 5999, + "batch_num_samples": 149, + "batch_num_tokens": 52216, + "epoch": 1.16727, + "grad_norm": 3.1412158012390137, + "learning_rate": 8.828565280121619e-06, + "loss": 0.8491, + "step": 1284 + }, + { + "batch_num_effect_tokens": 4611, + "batch_num_samples": 149, + "batch_num_tokens": 52221, + "epoch": 1.16818, + "grad_norm": 3.7525036334991455, + "learning_rate": 8.812809373426951e-06, + "loss": 0.9368, + "step": 1285 + }, + { + "batch_num_effect_tokens": 4677, + "batch_num_samples": 149, + "batch_num_tokens": 50524, + "epoch": 1.16909, + "grad_norm": 4.773053169250488, + "learning_rate": 8.797056455483267e-06, + "loss": 1.2988, + "step": 1286 + }, + { + "batch_num_effect_tokens": 6205, + "batch_num_samples": 149, + "batch_num_tokens": 52123, + "epoch": 1.17, + "grad_norm": 3.1944470405578613, + "learning_rate": 8.781306565948528e-06, + "loss": 0.8771, + "step": 1287 + }, + { + "batch_num_effect_tokens": 5908, + "batch_num_samples": 149, + "batch_num_tokens": 52183, + "epoch": 1.17091, + "grad_norm": 4.351945877075195, + "learning_rate": 8.765559744473054e-06, + "loss": 0.9883, + "step": 1288 + }, + { + "batch_num_effect_tokens": 7314, + "batch_num_samples": 149, + "batch_num_tokens": 52189, + "epoch": 1.17182, + "grad_norm": 3.1656649112701416, + "learning_rate": 8.749816030699456e-06, + "loss": 1.2371, + "step": 1289 + }, + { + "batch_num_effect_tokens": 5754, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 1.17273, + "grad_norm": 4.705539226531982, + "learning_rate": 8.734075464262507e-06, + "loss": 1.6817, + "step": 1290 + }, + { + "batch_num_effect_tokens": 3862, + "batch_num_samples": 149, + "batch_num_tokens": 52130, + "epoch": 1.17364, + "grad_norm": 3.1189210414886475, + "learning_rate": 8.718338084789074e-06, + "loss": 0.3547, + "step": 1291 + }, + { + "batch_num_effect_tokens": 6167, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 1.17455, + "grad_norm": 3.551257848739624, + "learning_rate": 8.702603931897983e-06, + "loss": 1.1843, + "step": 1292 + }, + { + "batch_num_effect_tokens": 7435, + "batch_num_samples": 149, + "batch_num_tokens": 52161, + "epoch": 1.17545, + "grad_norm": 4.381896495819092, + "learning_rate": 8.68687304519995e-06, + "loss": 2.0781, + "step": 1293 + }, + { + "batch_num_effect_tokens": 8727, + "batch_num_samples": 149, + "batch_num_tokens": 52155, + "epoch": 1.17636, + "grad_norm": 2.9816768169403076, + "learning_rate": 8.67114546429746e-06, + "loss": 1.2507, + "step": 1294 + }, + { + "batch_num_effect_tokens": 6274, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.17727, + "grad_norm": 3.839073657989502, + "learning_rate": 8.655421228784683e-06, + "loss": 1.6419, + "step": 1295 + }, + { + "batch_num_effect_tokens": 4891, + "batch_num_samples": 149, + "batch_num_tokens": 52138, + "epoch": 1.17818, + "grad_norm": 2.7432196140289307, + "learning_rate": 8.639700378247362e-06, + "loss": 0.6166, + "step": 1296 + }, + { + "batch_num_effect_tokens": 6322, + "batch_num_samples": 149, + "batch_num_tokens": 52138, + "epoch": 1.17909, + "grad_norm": 2.8955793380737305, + "learning_rate": 8.623982952262713e-06, + "loss": 0.9081, + "step": 1297 + }, + { + "batch_num_effect_tokens": 6776, + "batch_num_samples": 149, + "batch_num_tokens": 52183, + "epoch": 1.18, + "grad_norm": 2.928248167037964, + "learning_rate": 8.60826899039935e-06, + "loss": 1.2373, + "step": 1298 + }, + { + "batch_num_effect_tokens": 4579, + "batch_num_samples": 149, + "batch_num_tokens": 52220, + "epoch": 1.18091, + "grad_norm": 3.005248546600342, + "learning_rate": 8.592558532217138e-06, + "loss": 0.8357, + "step": 1299 + }, + { + "batch_num_effect_tokens": 7649, + "batch_num_samples": 150, + "batch_num_tokens": 52217, + "epoch": 1.18182, + "grad_norm": 2.8046600818634033, + "learning_rate": 8.576851617267151e-06, + "loss": 1.1489, + "step": 1300 + }, + { + "batch_num_effect_tokens": 4630, + "batch_num_samples": 149, + "batch_num_tokens": 52127, + "epoch": 1.18273, + "grad_norm": 3.1205475330352783, + "learning_rate": 8.56114828509152e-06, + "loss": 0.6783, + "step": 1301 + }, + { + "batch_num_effect_tokens": 5282, + "batch_num_samples": 149, + "batch_num_tokens": 52215, + "epoch": 1.18364, + "grad_norm": 2.6284232139587402, + "learning_rate": 8.545448575223369e-06, + "loss": 0.3939, + "step": 1302 + }, + { + "batch_num_effect_tokens": 5344, + "batch_num_samples": 149, + "batch_num_tokens": 52145, + "epoch": 1.18455, + "grad_norm": 2.9303133487701416, + "learning_rate": 8.529752527186694e-06, + "loss": 0.7759, + "step": 1303 + }, + { + "batch_num_effect_tokens": 5568, + "batch_num_samples": 149, + "batch_num_tokens": 52211, + "epoch": 1.18545, + "grad_norm": 2.889888286590576, + "learning_rate": 8.514060180496285e-06, + "loss": 0.6437, + "step": 1304 + }, + { + "batch_num_effect_tokens": 5664, + "batch_num_samples": 149, + "batch_num_tokens": 52173, + "epoch": 1.18636, + "grad_norm": 3.5130727291107178, + "learning_rate": 8.498371574657596e-06, + "loss": 1.0154, + "step": 1305 + }, + { + "batch_num_effect_tokens": 11118, + "batch_num_samples": 149, + "batch_num_tokens": 52172, + "epoch": 1.18727, + "grad_norm": 2.980041980743408, + "learning_rate": 8.482686749166685e-06, + "loss": 1.9141, + "step": 1306 + }, + { + "batch_num_effect_tokens": 5524, + "batch_num_samples": 149, + "batch_num_tokens": 52139, + "epoch": 1.18818, + "grad_norm": 3.3898441791534424, + "learning_rate": 8.467005743510072e-06, + "loss": 0.8405, + "step": 1307 + }, + { + "batch_num_effect_tokens": 4948, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 1.18909, + "grad_norm": 3.7010104656219482, + "learning_rate": 8.451328597164679e-06, + "loss": 1.1394, + "step": 1308 + }, + { + "batch_num_effect_tokens": 7511, + "batch_num_samples": 149, + "batch_num_tokens": 52139, + "epoch": 1.19, + "grad_norm": 3.6661295890808105, + "learning_rate": 8.43565534959769e-06, + "loss": 1.4902, + "step": 1309 + }, + { + "batch_num_effect_tokens": 5624, + "batch_num_samples": 149, + "batch_num_tokens": 52221, + "epoch": 1.19091, + "grad_norm": 3.152494192123413, + "learning_rate": 8.419986040266502e-06, + "loss": 1.1014, + "step": 1310 + }, + { + "batch_num_effect_tokens": 5909, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 1.19182, + "grad_norm": 3.238676071166992, + "learning_rate": 8.404320708618572e-06, + "loss": 0.9795, + "step": 1311 + }, + { + "batch_num_effect_tokens": 10814, + "batch_num_samples": 149, + "batch_num_tokens": 52163, + "epoch": 1.19273, + "grad_norm": 2.8736958503723145, + "learning_rate": 8.388659394091362e-06, + "loss": 1.4199, + "step": 1312 + }, + { + "batch_num_effect_tokens": 7138, + "batch_num_samples": 149, + "batch_num_tokens": 52174, + "epoch": 1.19364, + "grad_norm": 3.616826057434082, + "learning_rate": 8.373002136112204e-06, + "loss": 1.4153, + "step": 1313 + }, + { + "batch_num_effect_tokens": 5662, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 1.19455, + "grad_norm": 3.198676586151123, + "learning_rate": 8.357348974098232e-06, + "loss": 0.9395, + "step": 1314 + }, + { + "batch_num_effect_tokens": 5553, + "batch_num_samples": 149, + "batch_num_tokens": 52132, + "epoch": 1.19545, + "grad_norm": 3.6962339878082275, + "learning_rate": 8.341699947456261e-06, + "loss": 0.896, + "step": 1315 + }, + { + "batch_num_effect_tokens": 6349, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 1.19636, + "grad_norm": 3.261848211288452, + "learning_rate": 8.326055095582694e-06, + "loss": 1.0207, + "step": 1316 + }, + { + "batch_num_effect_tokens": 7606, + "batch_num_samples": 149, + "batch_num_tokens": 52189, + "epoch": 1.19727, + "grad_norm": 3.192269802093506, + "learning_rate": 8.310414457863437e-06, + "loss": 1.1751, + "step": 1317 + }, + { + "batch_num_effect_tokens": 8860, + "batch_num_samples": 149, + "batch_num_tokens": 52218, + "epoch": 1.19818, + "grad_norm": 3.0157923698425293, + "learning_rate": 8.294778073673762e-06, + "loss": 1.5703, + "step": 1318 + }, + { + "batch_num_effect_tokens": 5214, + "batch_num_samples": 149, + "batch_num_tokens": 52160, + "epoch": 1.19909, + "grad_norm": 3.5552825927734375, + "learning_rate": 8.279145982378261e-06, + "loss": 1.1386, + "step": 1319 + }, + { + "batch_num_effect_tokens": 10259, + "batch_num_samples": 149, + "batch_num_tokens": 52182, + "epoch": 1.2, + "grad_norm": 2.6071653366088867, + "learning_rate": 8.263518223330698e-06, + "loss": 1.3003, + "step": 1320 + }, + { + "batch_num_effect_tokens": 6907, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 1.20091, + "grad_norm": 3.8927736282348633, + "learning_rate": 8.24789483587394e-06, + "loss": 1.8552, + "step": 1321 + }, + { + "batch_num_effect_tokens": 5271, + "batch_num_samples": 149, + "batch_num_tokens": 50543, + "epoch": 1.20182, + "grad_norm": 3.377779245376587, + "learning_rate": 8.232275859339842e-06, + "loss": 1.0852, + "step": 1322 + }, + { + "batch_num_effect_tokens": 4004, + "batch_num_samples": 149, + "batch_num_tokens": 52204, + "epoch": 1.20273, + "grad_norm": 3.201169013977051, + "learning_rate": 8.216661333049171e-06, + "loss": 0.5308, + "step": 1323 + }, + { + "batch_num_effect_tokens": 6163, + "batch_num_samples": 149, + "batch_num_tokens": 52179, + "epoch": 1.20364, + "grad_norm": 3.0209457874298096, + "learning_rate": 8.201051296311462e-06, + "loss": 0.9274, + "step": 1324 + }, + { + "batch_num_effect_tokens": 5098, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 1.20455, + "grad_norm": 3.0369114875793457, + "learning_rate": 8.185445788424975e-06, + "loss": 1.0532, + "step": 1325 + }, + { + "batch_num_effect_tokens": 9244, + "batch_num_samples": 149, + "batch_num_tokens": 52162, + "epoch": 1.20545, + "grad_norm": 2.9742166996002197, + "learning_rate": 8.169844848676553e-06, + "loss": 1.063, + "step": 1326 + }, + { + "batch_num_effect_tokens": 4571, + "batch_num_samples": 149, + "batch_num_tokens": 52187, + "epoch": 1.20636, + "grad_norm": 3.4904568195343018, + "learning_rate": 8.154248516341547e-06, + "loss": 0.8149, + "step": 1327 + }, + { + "batch_num_effect_tokens": 8765, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 1.20727, + "grad_norm": 3.0287582874298096, + "learning_rate": 8.1386568306837e-06, + "loss": 1.2435, + "step": 1328 + }, + { + "batch_num_effect_tokens": 4073, + "batch_num_samples": 149, + "batch_num_tokens": 52209, + "epoch": 1.20818, + "grad_norm": 3.4525487422943115, + "learning_rate": 8.123069830955066e-06, + "loss": 0.5571, + "step": 1329 + }, + { + "batch_num_effect_tokens": 5591, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 1.20909, + "grad_norm": 3.343996047973633, + "learning_rate": 8.107487556395902e-06, + "loss": 0.962, + "step": 1330 + }, + { + "batch_num_effect_tokens": 4778, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 1.21, + "grad_norm": 2.7948801517486572, + "learning_rate": 8.091910046234552e-06, + "loss": 0.6016, + "step": 1331 + }, + { + "batch_num_effect_tokens": 7247, + "batch_num_samples": 149, + "batch_num_tokens": 52167, + "epoch": 1.21091, + "grad_norm": 3.2723145484924316, + "learning_rate": 8.076337339687395e-06, + "loss": 1.203, + "step": 1332 + }, + { + "batch_num_effect_tokens": 7885, + "batch_num_samples": 149, + "batch_num_tokens": 52116, + "epoch": 1.21182, + "grad_norm": 3.3220107555389404, + "learning_rate": 8.06076947595869e-06, + "loss": 1.2771, + "step": 1333 + }, + { + "batch_num_effect_tokens": 6931, + "batch_num_samples": 149, + "batch_num_tokens": 52190, + "epoch": 1.21273, + "grad_norm": 3.1205132007598877, + "learning_rate": 8.04520649424052e-06, + "loss": 0.9818, + "step": 1334 + }, + { + "batch_num_effect_tokens": 4684, + "batch_num_samples": 149, + "batch_num_tokens": 52124, + "epoch": 1.21364, + "grad_norm": 3.3503336906433105, + "learning_rate": 8.029648433712671e-06, + "loss": 0.5986, + "step": 1335 + }, + { + "batch_num_effect_tokens": 4863, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 1.21455, + "grad_norm": 2.5516700744628906, + "learning_rate": 8.014095333542548e-06, + "loss": 0.5332, + "step": 1336 + }, + { + "batch_num_effect_tokens": 9903, + "batch_num_samples": 149, + "batch_num_tokens": 52142, + "epoch": 1.21545, + "grad_norm": 2.74527907371521, + "learning_rate": 7.998547232885053e-06, + "loss": 1.4125, + "step": 1337 + }, + { + "batch_num_effect_tokens": 7362, + "batch_num_samples": 149, + "batch_num_tokens": 52215, + "epoch": 1.21636, + "grad_norm": 2.8555214405059814, + "learning_rate": 7.983004170882518e-06, + "loss": 1.1776, + "step": 1338 + }, + { + "batch_num_effect_tokens": 8013, + "batch_num_samples": 149, + "batch_num_tokens": 52169, + "epoch": 1.21727, + "grad_norm": 2.9209163188934326, + "learning_rate": 7.967466186664579e-06, + "loss": 1.3352, + "step": 1339 + }, + { + "batch_num_effect_tokens": 6412, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.21818, + "grad_norm": 2.683448076248169, + "learning_rate": 7.951933319348095e-06, + "loss": 0.8411, + "step": 1340 + }, + { + "batch_num_effect_tokens": 4307, + "batch_num_samples": 149, + "batch_num_tokens": 52138, + "epoch": 1.21909, + "grad_norm": 2.8081939220428467, + "learning_rate": 7.936405608037037e-06, + "loss": 0.5098, + "step": 1341 + }, + { + "batch_num_effect_tokens": 4409, + "batch_num_samples": 149, + "batch_num_tokens": 52222, + "epoch": 1.22, + "grad_norm": 3.882086992263794, + "learning_rate": 7.92088309182241e-06, + "loss": 0.8913, + "step": 1342 + }, + { + "batch_num_effect_tokens": 9927, + "batch_num_samples": 149, + "batch_num_tokens": 52196, + "epoch": 1.22091, + "grad_norm": 2.887500762939453, + "learning_rate": 7.905365809782115e-06, + "loss": 1.3527, + "step": 1343 + }, + { + "batch_num_effect_tokens": 4882, + "batch_num_samples": 149, + "batch_num_tokens": 52089, + "epoch": 1.22182, + "grad_norm": 3.064622163772583, + "learning_rate": 7.889853800980905e-06, + "loss": 0.6535, + "step": 1344 + }, + { + "batch_num_effect_tokens": 6178, + "batch_num_samples": 149, + "batch_num_tokens": 52157, + "epoch": 1.22273, + "grad_norm": 3.8330609798431396, + "learning_rate": 7.874347104470234e-06, + "loss": 1.4913, + "step": 1345 + }, + { + "batch_num_effect_tokens": 6663, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 1.22364, + "grad_norm": 3.099398612976074, + "learning_rate": 7.858845759288198e-06, + "loss": 1.1149, + "step": 1346 + }, + { + "batch_num_effect_tokens": 6186, + "batch_num_samples": 149, + "batch_num_tokens": 52152, + "epoch": 1.22455, + "grad_norm": 3.0903403759002686, + "learning_rate": 7.843349804459412e-06, + "loss": 0.8618, + "step": 1347 + }, + { + "batch_num_effect_tokens": 6350, + "batch_num_samples": 149, + "batch_num_tokens": 52139, + "epoch": 1.22545, + "grad_norm": 3.0803728103637695, + "learning_rate": 7.827859278994924e-06, + "loss": 0.9878, + "step": 1348 + }, + { + "batch_num_effect_tokens": 6041, + "batch_num_samples": 150, + "batch_num_tokens": 52176, + "epoch": 1.22636, + "grad_norm": 3.533050775527954, + "learning_rate": 7.812374221892116e-06, + "loss": 0.9804, + "step": 1349 + }, + { + "batch_num_effect_tokens": 6119, + "batch_num_samples": 149, + "batch_num_tokens": 52142, + "epoch": 1.22727, + "grad_norm": 3.218393325805664, + "learning_rate": 7.796894672134594e-06, + "loss": 1.0535, + "step": 1350 + }, + { + "batch_num_effect_tokens": 5933, + "batch_num_samples": 149, + "batch_num_tokens": 52118, + "epoch": 1.22818, + "grad_norm": 2.996877908706665, + "learning_rate": 7.781420668692116e-06, + "loss": 0.8144, + "step": 1351 + }, + { + "batch_num_effect_tokens": 9178, + "batch_num_samples": 149, + "batch_num_tokens": 52180, + "epoch": 1.22909, + "grad_norm": 3.118384838104248, + "learning_rate": 7.765952250520459e-06, + "loss": 1.2562, + "step": 1352 + }, + { + "batch_num_effect_tokens": 7624, + "batch_num_samples": 150, + "batch_num_tokens": 52178, + "epoch": 1.23, + "grad_norm": 3.031911611557007, + "learning_rate": 7.750489456561351e-06, + "loss": 0.9531, + "step": 1353 + }, + { + "batch_num_effect_tokens": 5752, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 1.23091, + "grad_norm": 2.5690088272094727, + "learning_rate": 7.735032325742355e-06, + "loss": 0.5922, + "step": 1354 + }, + { + "batch_num_effect_tokens": 5533, + "batch_num_samples": 149, + "batch_num_tokens": 52214, + "epoch": 1.23182, + "grad_norm": 4.04586124420166, + "learning_rate": 7.719580896976788e-06, + "loss": 0.9968, + "step": 1355 + }, + { + "batch_num_effect_tokens": 5669, + "batch_num_samples": 149, + "batch_num_tokens": 52122, + "epoch": 1.23273, + "grad_norm": 2.989699602127075, + "learning_rate": 7.704135209163589e-06, + "loss": 0.7587, + "step": 1356 + }, + { + "batch_num_effect_tokens": 6241, + "batch_num_samples": 149, + "batch_num_tokens": 52186, + "epoch": 1.23364, + "grad_norm": 2.8156251907348633, + "learning_rate": 7.68869530118727e-06, + "loss": 0.839, + "step": 1357 + }, + { + "batch_num_effect_tokens": 5059, + "batch_num_samples": 149, + "batch_num_tokens": 52139, + "epoch": 1.23455, + "grad_norm": 3.2609455585479736, + "learning_rate": 7.673261211917777e-06, + "loss": 0.9088, + "step": 1358 + }, + { + "batch_num_effect_tokens": 5309, + "batch_num_samples": 149, + "batch_num_tokens": 52134, + "epoch": 1.23545, + "grad_norm": 3.0399086475372314, + "learning_rate": 7.657832980210412e-06, + "loss": 0.8117, + "step": 1359 + }, + { + "batch_num_effect_tokens": 4817, + "batch_num_samples": 149, + "batch_num_tokens": 52117, + "epoch": 1.23636, + "grad_norm": 3.6285390853881836, + "learning_rate": 7.642410644905726e-06, + "loss": 0.9068, + "step": 1360 + }, + { + "batch_num_effect_tokens": 4073, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.23727, + "grad_norm": 2.894031524658203, + "learning_rate": 7.626994244829441e-06, + "loss": 0.3597, + "step": 1361 + }, + { + "batch_num_effect_tokens": 6409, + "batch_num_samples": 149, + "batch_num_tokens": 52175, + "epoch": 1.23818, + "grad_norm": 3.356076240539551, + "learning_rate": 7.611583818792311e-06, + "loss": 1.0443, + "step": 1362 + }, + { + "batch_num_effect_tokens": 6840, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 1.23909, + "grad_norm": 3.375852584838867, + "learning_rate": 7.596179405590076e-06, + "loss": 1.093, + "step": 1363 + }, + { + "batch_num_effect_tokens": 8879, + "batch_num_samples": 150, + "batch_num_tokens": 52155, + "epoch": 1.24, + "grad_norm": 3.4897732734680176, + "learning_rate": 7.580781044003324e-06, + "loss": 1.4193, + "step": 1364 + }, + { + "batch_num_effect_tokens": 5389, + "batch_num_samples": 149, + "batch_num_tokens": 52147, + "epoch": 1.24091, + "grad_norm": 3.4354724884033203, + "learning_rate": 7.565388772797412e-06, + "loss": 0.5507, + "step": 1365 + }, + { + "batch_num_effect_tokens": 7913, + "batch_num_samples": 150, + "batch_num_tokens": 52187, + "epoch": 1.24182, + "grad_norm": 3.8646090030670166, + "learning_rate": 7.550002630722366e-06, + "loss": 1.4203, + "step": 1366 + }, + { + "batch_num_effect_tokens": 3872, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 1.24273, + "grad_norm": 3.718017101287842, + "learning_rate": 7.534622656512777e-06, + "loss": 0.5086, + "step": 1367 + }, + { + "batch_num_effect_tokens": 8348, + "batch_num_samples": 149, + "batch_num_tokens": 52141, + "epoch": 1.24364, + "grad_norm": 2.4211277961730957, + "learning_rate": 7.519248888887715e-06, + "loss": 0.8911, + "step": 1368 + }, + { + "batch_num_effect_tokens": 6355, + "batch_num_samples": 149, + "batch_num_tokens": 52143, + "epoch": 1.24455, + "grad_norm": 3.647061824798584, + "learning_rate": 7.503881366550617e-06, + "loss": 1.2617, + "step": 1369 + }, + { + "batch_num_effect_tokens": 5586, + "batch_num_samples": 149, + "batch_num_tokens": 52171, + "epoch": 1.24545, + "grad_norm": 3.2869815826416016, + "learning_rate": 7.488520128189209e-06, + "loss": 0.9019, + "step": 1370 + }, + { + "batch_num_effect_tokens": 5895, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 1.24636, + "grad_norm": 2.7943344116210938, + "learning_rate": 7.4731652124753865e-06, + "loss": 0.8344, + "step": 1371 + }, + { + "batch_num_effect_tokens": 6089, + "batch_num_samples": 149, + "batch_num_tokens": 52135, + "epoch": 1.24727, + "grad_norm": 2.809882640838623, + "learning_rate": 7.4578166580651335e-06, + "loss": 0.8133, + "step": 1372 + }, + { + "batch_num_effect_tokens": 4249, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.24818, + "grad_norm": 3.436528205871582, + "learning_rate": 7.442474503598412e-06, + "loss": 0.8629, + "step": 1373 + }, + { + "batch_num_effect_tokens": 5331, + "batch_num_samples": 149, + "batch_num_tokens": 52126, + "epoch": 1.24909, + "grad_norm": 3.108729362487793, + "learning_rate": 7.4271387876990866e-06, + "loss": 0.6077, + "step": 1374 + }, + { + "batch_num_effect_tokens": 6733, + "batch_num_samples": 149, + "batch_num_tokens": 52167, + "epoch": 1.25, + "grad_norm": 2.72878098487854, + "learning_rate": 7.411809548974792e-06, + "loss": 0.8801, + "step": 1375 + }, + { + "batch_num_effect_tokens": 5971, + "batch_num_samples": 149, + "batch_num_tokens": 52141, + "epoch": 1.25091, + "grad_norm": 2.6645631790161133, + "learning_rate": 7.39648682601688e-06, + "loss": 0.5949, + "step": 1376 + }, + { + "batch_num_effect_tokens": 6856, + "batch_num_samples": 149, + "batch_num_tokens": 52111, + "epoch": 1.25182, + "grad_norm": 3.8237671852111816, + "learning_rate": 7.381170657400281e-06, + "loss": 1.6248, + "step": 1377 + }, + { + "batch_num_effect_tokens": 8547, + "batch_num_samples": 150, + "batch_num_tokens": 52215, + "epoch": 1.25273, + "grad_norm": 2.754514694213867, + "learning_rate": 7.365861081683434e-06, + "loss": 1.1067, + "step": 1378 + }, + { + "batch_num_effect_tokens": 6625, + "batch_num_samples": 149, + "batch_num_tokens": 52199, + "epoch": 1.25364, + "grad_norm": 3.414747714996338, + "learning_rate": 7.350558137408174e-06, + "loss": 1.3872, + "step": 1379 + }, + { + "batch_num_effect_tokens": 8201, + "batch_num_samples": 149, + "batch_num_tokens": 52134, + "epoch": 1.25455, + "grad_norm": 2.9175808429718018, + "learning_rate": 7.335261863099652e-06, + "loss": 1.0167, + "step": 1380 + }, + { + "batch_num_effect_tokens": 5804, + "batch_num_samples": 149, + "batch_num_tokens": 52198, + "epoch": 1.25545, + "grad_norm": 3.4364821910858154, + "learning_rate": 7.319972297266215e-06, + "loss": 1.2104, + "step": 1381 + }, + { + "batch_num_effect_tokens": 5729, + "batch_num_samples": 149, + "batch_num_tokens": 52135, + "epoch": 1.25636, + "grad_norm": 3.21177077293396, + "learning_rate": 7.3046894783993225e-06, + "loss": 0.8951, + "step": 1382 + }, + { + "batch_num_effect_tokens": 7137, + "batch_num_samples": 149, + "batch_num_tokens": 52185, + "epoch": 1.25727, + "grad_norm": 3.1714975833892822, + "learning_rate": 7.289413444973461e-06, + "loss": 1.2, + "step": 1383 + }, + { + "batch_num_effect_tokens": 6818, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 1.25818, + "grad_norm": 3.7215492725372314, + "learning_rate": 7.274144235446024e-06, + "loss": 1.5959, + "step": 1384 + }, + { + "batch_num_effect_tokens": 5616, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 1.25909, + "grad_norm": 2.9003236293792725, + "learning_rate": 7.2588818882572266e-06, + "loss": 0.6636, + "step": 1385 + }, + { + "batch_num_effect_tokens": 6325, + "batch_num_samples": 149, + "batch_num_tokens": 52209, + "epoch": 1.26, + "grad_norm": 3.1018054485321045, + "learning_rate": 7.243626441830009e-06, + "loss": 0.9194, + "step": 1386 + }, + { + "batch_num_effect_tokens": 5608, + "batch_num_samples": 149, + "batch_num_tokens": 52139, + "epoch": 1.26091, + "grad_norm": 2.823843479156494, + "learning_rate": 7.2283779345699455e-06, + "loss": 0.6002, + "step": 1387 + }, + { + "batch_num_effect_tokens": 6813, + "batch_num_samples": 149, + "batch_num_tokens": 52209, + "epoch": 1.26182, + "grad_norm": 3.2531120777130127, + "learning_rate": 7.213136404865124e-06, + "loss": 0.8652, + "step": 1388 + }, + { + "batch_num_effect_tokens": 7764, + "batch_num_samples": 149, + "batch_num_tokens": 52177, + "epoch": 1.26273, + "grad_norm": 3.0588696002960205, + "learning_rate": 7.19790189108609e-06, + "loss": 0.9424, + "step": 1389 + }, + { + "batch_num_effect_tokens": 3094, + "batch_num_samples": 149, + "batch_num_tokens": 52180, + "epoch": 1.26364, + "grad_norm": 1.7678723335266113, + "learning_rate": 7.182674431585703e-06, + "loss": 0.1115, + "step": 1390 + }, + { + "batch_num_effect_tokens": 6241, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.26455, + "grad_norm": 4.503663539886475, + "learning_rate": 7.167454064699083e-06, + "loss": 1.3099, + "step": 1391 + }, + { + "batch_num_effect_tokens": 6661, + "batch_num_samples": 149, + "batch_num_tokens": 52157, + "epoch": 1.26545, + "grad_norm": 2.982997417449951, + "learning_rate": 7.1522408287434774e-06, + "loss": 0.9009, + "step": 1392 + }, + { + "batch_num_effect_tokens": 6690, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 1.26636, + "grad_norm": 2.942383050918579, + "learning_rate": 7.137034762018198e-06, + "loss": 0.9115, + "step": 1393 + }, + { + "batch_num_effect_tokens": 5537, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 1.26727, + "grad_norm": 2.937274932861328, + "learning_rate": 7.12183590280449e-06, + "loss": 0.7368, + "step": 1394 + }, + { + "batch_num_effect_tokens": 9544, + "batch_num_samples": 149, + "batch_num_tokens": 52174, + "epoch": 1.26818, + "grad_norm": 3.7725727558135986, + "learning_rate": 7.106644289365474e-06, + "loss": 1.7717, + "step": 1395 + }, + { + "batch_num_effect_tokens": 7207, + "batch_num_samples": 150, + "batch_num_tokens": 52142, + "epoch": 1.26909, + "grad_norm": 4.409503936767578, + "learning_rate": 7.0914599599460095e-06, + "loss": 2.2532, + "step": 1396 + }, + { + "batch_num_effect_tokens": 6919, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.27, + "grad_norm": 3.1001265048980713, + "learning_rate": 7.076282952772634e-06, + "loss": 1.1373, + "step": 1397 + }, + { + "batch_num_effect_tokens": 5948, + "batch_num_samples": 149, + "batch_num_tokens": 52146, + "epoch": 1.27091, + "grad_norm": 3.3615310192108154, + "learning_rate": 7.061113306053443e-06, + "loss": 1.1018, + "step": 1398 + }, + { + "batch_num_effect_tokens": 4353, + "batch_num_samples": 149, + "batch_num_tokens": 52221, + "epoch": 1.27182, + "grad_norm": 3.1632819175720215, + "learning_rate": 7.045951057978001e-06, + "loss": 0.735, + "step": 1399 + }, + { + "batch_num_effect_tokens": 6004, + "batch_num_samples": 149, + "batch_num_tokens": 50545, + "epoch": 1.27273, + "grad_norm": 3.4299721717834473, + "learning_rate": 7.0307962467172555e-06, + "loss": 1.1528, + "step": 1400 + }, + { + "batch_num_effect_tokens": 7520, + "batch_num_samples": 149, + "batch_num_tokens": 52153, + "epoch": 1.27364, + "grad_norm": 2.9699318408966064, + "learning_rate": 7.015648910423416e-06, + "loss": 1.0898, + "step": 1401 + }, + { + "batch_num_effect_tokens": 6107, + "batch_num_samples": 149, + "batch_num_tokens": 52124, + "epoch": 1.27455, + "grad_norm": 3.086298704147339, + "learning_rate": 7.0005090872298955e-06, + "loss": 1.0031, + "step": 1402 + }, + { + "batch_num_effect_tokens": 6330, + "batch_num_samples": 149, + "batch_num_tokens": 50569, + "epoch": 1.27545, + "grad_norm": 3.487344741821289, + "learning_rate": 6.985376815251173e-06, + "loss": 1.3832, + "step": 1403 + }, + { + "batch_num_effect_tokens": 6372, + "batch_num_samples": 149, + "batch_num_tokens": 52135, + "epoch": 1.27636, + "grad_norm": 3.3115811347961426, + "learning_rate": 6.970252132582729e-06, + "loss": 1.1073, + "step": 1404 + }, + { + "batch_num_effect_tokens": 8173, + "batch_num_samples": 149, + "batch_num_tokens": 52222, + "epoch": 1.27727, + "grad_norm": 2.6352920532226562, + "learning_rate": 6.955135077300932e-06, + "loss": 1.1538, + "step": 1405 + }, + { + "batch_num_effect_tokens": 6707, + "batch_num_samples": 149, + "batch_num_tokens": 52185, + "epoch": 1.27818, + "grad_norm": 2.982313632965088, + "learning_rate": 6.940025687462952e-06, + "loss": 1.0654, + "step": 1406 + }, + { + "batch_num_effect_tokens": 6610, + "batch_num_samples": 149, + "batch_num_tokens": 52215, + "epoch": 1.27909, + "grad_norm": 3.0123355388641357, + "learning_rate": 6.924924001106655e-06, + "loss": 1.1569, + "step": 1407 + }, + { + "batch_num_effect_tokens": 7134, + "batch_num_samples": 149, + "batch_num_tokens": 52137, + "epoch": 1.28, + "grad_norm": 3.0404586791992188, + "learning_rate": 6.909830056250527e-06, + "loss": 1.1935, + "step": 1408 + }, + { + "batch_num_effect_tokens": 5418, + "batch_num_samples": 149, + "batch_num_tokens": 52130, + "epoch": 1.28091, + "grad_norm": 4.27562141418457, + "learning_rate": 6.8947438908935495e-06, + "loss": 1.0136, + "step": 1409 + }, + { + "batch_num_effect_tokens": 5188, + "batch_num_samples": 149, + "batch_num_tokens": 52196, + "epoch": 1.28182, + "grad_norm": 4.322559356689453, + "learning_rate": 6.87966554301513e-06, + "loss": 0.489, + "step": 1410 + }, + { + "batch_num_effect_tokens": 6036, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 1.28273, + "grad_norm": 3.010681390762329, + "learning_rate": 6.86459505057499e-06, + "loss": 0.954, + "step": 1411 + }, + { + "batch_num_effect_tokens": 8780, + "batch_num_samples": 149, + "batch_num_tokens": 52211, + "epoch": 1.28364, + "grad_norm": 2.756701946258545, + "learning_rate": 6.8495324515130744e-06, + "loss": 1.0386, + "step": 1412 + }, + { + "batch_num_effect_tokens": 6354, + "batch_num_samples": 149, + "batch_num_tokens": 52171, + "epoch": 1.28455, + "grad_norm": 3.2075488567352295, + "learning_rate": 6.8344777837494555e-06, + "loss": 1.1925, + "step": 1413 + }, + { + "batch_num_effect_tokens": 9044, + "batch_num_samples": 149, + "batch_num_tokens": 52144, + "epoch": 1.28545, + "grad_norm": 3.3874874114990234, + "learning_rate": 6.819431085184251e-06, + "loss": 1.2673, + "step": 1414 + }, + { + "batch_num_effect_tokens": 6507, + "batch_num_samples": 149, + "batch_num_tokens": 52168, + "epoch": 1.28636, + "grad_norm": 3.164731979370117, + "learning_rate": 6.804392393697502e-06, + "loss": 1.0044, + "step": 1415 + }, + { + "batch_num_effect_tokens": 6407, + "batch_num_samples": 149, + "batch_num_tokens": 52220, + "epoch": 1.28727, + "grad_norm": 3.7156081199645996, + "learning_rate": 6.789361747149092e-06, + "loss": 1.0976, + "step": 1416 + }, + { + "batch_num_effect_tokens": 6394, + "batch_num_samples": 150, + "batch_num_tokens": 52155, + "epoch": 1.28818, + "grad_norm": 3.471351385116577, + "learning_rate": 6.774339183378663e-06, + "loss": 1.1154, + "step": 1417 + }, + { + "batch_num_effect_tokens": 5017, + "batch_num_samples": 149, + "batch_num_tokens": 52142, + "epoch": 1.28909, + "grad_norm": 3.259028196334839, + "learning_rate": 6.7593247402054955e-06, + "loss": 0.8695, + "step": 1418 + }, + { + "batch_num_effect_tokens": 5651, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.29, + "grad_norm": 3.9814767837524414, + "learning_rate": 6.744318455428436e-06, + "loss": 1.2782, + "step": 1419 + }, + { + "batch_num_effect_tokens": 11260, + "batch_num_samples": 149, + "batch_num_tokens": 52188, + "epoch": 1.29091, + "grad_norm": 2.5980544090270996, + "learning_rate": 6.729320366825785e-06, + "loss": 1.1878, + "step": 1420 + }, + { + "batch_num_effect_tokens": 9360, + "batch_num_samples": 150, + "batch_num_tokens": 52200, + "epoch": 1.29182, + "grad_norm": 2.950162649154663, + "learning_rate": 6.714330512155216e-06, + "loss": 1.4895, + "step": 1421 + }, + { + "batch_num_effect_tokens": 5925, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 1.29273, + "grad_norm": 3.376674175262451, + "learning_rate": 6.699348929153668e-06, + "loss": 1.0346, + "step": 1422 + }, + { + "batch_num_effect_tokens": 4621, + "batch_num_samples": 149, + "batch_num_tokens": 52182, + "epoch": 1.29364, + "grad_norm": 3.315309524536133, + "learning_rate": 6.684375655537263e-06, + "loss": 0.9401, + "step": 1423 + }, + { + "batch_num_effect_tokens": 5105, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 1.29455, + "grad_norm": 2.5598363876342773, + "learning_rate": 6.669410729001193e-06, + "loss": 0.5127, + "step": 1424 + }, + { + "batch_num_effect_tokens": 6244, + "batch_num_samples": 149, + "batch_num_tokens": 52152, + "epoch": 1.29545, + "grad_norm": 3.0585110187530518, + "learning_rate": 6.654454187219649e-06, + "loss": 1.0119, + "step": 1425 + }, + { + "batch_num_effect_tokens": 5791, + "batch_num_samples": 149, + "batch_num_tokens": 52214, + "epoch": 1.29636, + "grad_norm": 3.3678364753723145, + "learning_rate": 6.639506067845698e-06, + "loss": 0.9276, + "step": 1426 + }, + { + "batch_num_effect_tokens": 3842, + "batch_num_samples": 149, + "batch_num_tokens": 52183, + "epoch": 1.29727, + "grad_norm": 2.618781805038452, + "learning_rate": 6.6245664085112235e-06, + "loss": 0.4342, + "step": 1427 + }, + { + "batch_num_effect_tokens": 9795, + "batch_num_samples": 149, + "batch_num_tokens": 52172, + "epoch": 1.29818, + "grad_norm": 3.688122034072876, + "learning_rate": 6.6096352468267935e-06, + "loss": 1.9993, + "step": 1428 + }, + { + "batch_num_effect_tokens": 6919, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 1.29909, + "grad_norm": 2.862316608428955, + "learning_rate": 6.594712620381594e-06, + "loss": 0.8784, + "step": 1429 + }, + { + "batch_num_effect_tokens": 7136, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 1.3, + "grad_norm": 3.542351484298706, + "learning_rate": 6.579798566743314e-06, + "loss": 1.274, + "step": 1430 + }, + { + "batch_num_effect_tokens": 8045, + "batch_num_samples": 150, + "batch_num_tokens": 52142, + "epoch": 1.30091, + "grad_norm": 3.241546869277954, + "learning_rate": 6.56489312345807e-06, + "loss": 1.5165, + "step": 1431 + }, + { + "batch_num_effect_tokens": 7921, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 1.30182, + "grad_norm": 2.9332659244537354, + "learning_rate": 6.549996328050296e-06, + "loss": 1.1232, + "step": 1432 + }, + { + "batch_num_effect_tokens": 7440, + "batch_num_samples": 149, + "batch_num_tokens": 52148, + "epoch": 1.30273, + "grad_norm": 2.8302924633026123, + "learning_rate": 6.535108218022654e-06, + "loss": 0.9424, + "step": 1433 + }, + { + "batch_num_effect_tokens": 6560, + "batch_num_samples": 149, + "batch_num_tokens": 52185, + "epoch": 1.30364, + "grad_norm": 3.1806371212005615, + "learning_rate": 6.52022883085595e-06, + "loss": 1.1798, + "step": 1434 + }, + { + "batch_num_effect_tokens": 7709, + "batch_num_samples": 149, + "batch_num_tokens": 52222, + "epoch": 1.30455, + "grad_norm": 3.062058448791504, + "learning_rate": 6.505358204009018e-06, + "loss": 1.1253, + "step": 1435 + }, + { + "batch_num_effect_tokens": 7883, + "batch_num_samples": 149, + "batch_num_tokens": 52179, + "epoch": 1.30545, + "grad_norm": 2.9780805110931396, + "learning_rate": 6.490496374918647e-06, + "loss": 1.1104, + "step": 1436 + }, + { + "batch_num_effect_tokens": 7285, + "batch_num_samples": 149, + "batch_num_tokens": 52187, + "epoch": 1.30636, + "grad_norm": 2.6328577995300293, + "learning_rate": 6.475643380999469e-06, + "loss": 0.6742, + "step": 1437 + }, + { + "batch_num_effect_tokens": 6278, + "batch_num_samples": 150, + "batch_num_tokens": 52179, + "epoch": 1.30727, + "grad_norm": 3.3319318294525146, + "learning_rate": 6.460799259643884e-06, + "loss": 1.0524, + "step": 1438 + }, + { + "batch_num_effect_tokens": 4438, + "batch_num_samples": 149, + "batch_num_tokens": 52180, + "epoch": 1.30818, + "grad_norm": 3.1229591369628906, + "learning_rate": 6.4459640482219445e-06, + "loss": 0.5901, + "step": 1439 + }, + { + "batch_num_effect_tokens": 7578, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 1.30909, + "grad_norm": 3.132690906524658, + "learning_rate": 6.431137784081283e-06, + "loss": 1.2466, + "step": 1440 + }, + { + "batch_num_effect_tokens": 7262, + "batch_num_samples": 149, + "batch_num_tokens": 52132, + "epoch": 1.31, + "grad_norm": 2.7710444927215576, + "learning_rate": 6.4163205045469975e-06, + "loss": 0.8145, + "step": 1441 + }, + { + "batch_num_effect_tokens": 5168, + "batch_num_samples": 149, + "batch_num_tokens": 52207, + "epoch": 1.31091, + "grad_norm": 3.572665214538574, + "learning_rate": 6.401512246921576e-06, + "loss": 1.0269, + "step": 1442 + }, + { + "batch_num_effect_tokens": 6323, + "batch_num_samples": 149, + "batch_num_tokens": 52200, + "epoch": 1.31182, + "grad_norm": 2.9520695209503174, + "learning_rate": 6.386713048484785e-06, + "loss": 0.6127, + "step": 1443 + }, + { + "batch_num_effect_tokens": 5373, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 1.31273, + "grad_norm": 3.336017608642578, + "learning_rate": 6.3719229464935915e-06, + "loss": 0.7666, + "step": 1444 + }, + { + "batch_num_effect_tokens": 5281, + "batch_num_samples": 149, + "batch_num_tokens": 52190, + "epoch": 1.31364, + "grad_norm": 3.226888418197632, + "learning_rate": 6.357141978182056e-06, + "loss": 1.0242, + "step": 1445 + }, + { + "batch_num_effect_tokens": 5474, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.31455, + "grad_norm": 3.49110746383667, + "learning_rate": 6.342370180761256e-06, + "loss": 0.931, + "step": 1446 + }, + { + "batch_num_effect_tokens": 7325, + "batch_num_samples": 149, + "batch_num_tokens": 50555, + "epoch": 1.31545, + "grad_norm": 3.3710925579071045, + "learning_rate": 6.327607591419167e-06, + "loss": 1.3554, + "step": 1447 + }, + { + "batch_num_effect_tokens": 5500, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.31636, + "grad_norm": 2.879106283187866, + "learning_rate": 6.312854247320594e-06, + "loss": 0.6699, + "step": 1448 + }, + { + "batch_num_effect_tokens": 6277, + "batch_num_samples": 149, + "batch_num_tokens": 52206, + "epoch": 1.31727, + "grad_norm": 2.4635508060455322, + "learning_rate": 6.2981101856070625e-06, + "loss": 0.5001, + "step": 1449 + }, + { + "batch_num_effect_tokens": 5308, + "batch_num_samples": 149, + "batch_num_tokens": 52124, + "epoch": 1.31818, + "grad_norm": 5.102665424346924, + "learning_rate": 6.283375443396726e-06, + "loss": 0.7164, + "step": 1450 + }, + { + "batch_num_effect_tokens": 7108, + "batch_num_samples": 149, + "batch_num_tokens": 52141, + "epoch": 1.31909, + "grad_norm": 3.0555477142333984, + "learning_rate": 6.2686500577842875e-06, + "loss": 1.0813, + "step": 1451 + }, + { + "batch_num_effect_tokens": 4383, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 1.32, + "grad_norm": 3.1150074005126953, + "learning_rate": 6.25393406584088e-06, + "loss": 0.5792, + "step": 1452 + }, + { + "batch_num_effect_tokens": 5781, + "batch_num_samples": 149, + "batch_num_tokens": 52146, + "epoch": 1.32091, + "grad_norm": 3.3713457584381104, + "learning_rate": 6.239227504614004e-06, + "loss": 1.0913, + "step": 1453 + }, + { + "batch_num_effect_tokens": 6017, + "batch_num_samples": 149, + "batch_num_tokens": 52157, + "epoch": 1.32182, + "grad_norm": 3.0999722480773926, + "learning_rate": 6.224530411127403e-06, + "loss": 1.0289, + "step": 1454 + }, + { + "batch_num_effect_tokens": 5342, + "batch_num_samples": 149, + "batch_num_tokens": 52128, + "epoch": 1.32273, + "grad_norm": 3.1640305519104004, + "learning_rate": 6.209842822380998e-06, + "loss": 0.8833, + "step": 1455 + }, + { + "batch_num_effect_tokens": 6408, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 1.32364, + "grad_norm": 3.7910890579223633, + "learning_rate": 6.19516477535077e-06, + "loss": 1.4728, + "step": 1456 + }, + { + "batch_num_effect_tokens": 3911, + "batch_num_samples": 149, + "batch_num_tokens": 50567, + "epoch": 1.32455, + "grad_norm": 3.7931058406829834, + "learning_rate": 6.180496306988693e-06, + "loss": 0.8558, + "step": 1457 + }, + { + "batch_num_effect_tokens": 5491, + "batch_num_samples": 149, + "batch_num_tokens": 52179, + "epoch": 1.32545, + "grad_norm": 5.476532936096191, + "learning_rate": 6.165837454222607e-06, + "loss": 2.1147, + "step": 1458 + }, + { + "batch_num_effect_tokens": 8286, + "batch_num_samples": 149, + "batch_num_tokens": 52137, + "epoch": 1.32636, + "grad_norm": 3.141921281814575, + "learning_rate": 6.151188253956168e-06, + "loss": 1.4141, + "step": 1459 + }, + { + "batch_num_effect_tokens": 6548, + "batch_num_samples": 149, + "batch_num_tokens": 52222, + "epoch": 1.32727, + "grad_norm": 3.4099161624908447, + "learning_rate": 6.136548743068713e-06, + "loss": 1.1871, + "step": 1460 + }, + { + "batch_num_effect_tokens": 5579, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 1.32818, + "grad_norm": 3.310654640197754, + "learning_rate": 6.1219189584152e-06, + "loss": 0.9689, + "step": 1461 + }, + { + "batch_num_effect_tokens": 6619, + "batch_num_samples": 149, + "batch_num_tokens": 52117, + "epoch": 1.32909, + "grad_norm": 3.180025100708008, + "learning_rate": 6.107298936826086e-06, + "loss": 1.0157, + "step": 1462 + }, + { + "batch_num_effect_tokens": 6347, + "batch_num_samples": 150, + "batch_num_tokens": 52215, + "epoch": 1.33, + "grad_norm": 2.9227607250213623, + "learning_rate": 6.092688715107265e-06, + "loss": 0.7552, + "step": 1463 + }, + { + "batch_num_effect_tokens": 5925, + "batch_num_samples": 149, + "batch_num_tokens": 52213, + "epoch": 1.33091, + "grad_norm": 2.8292019367218018, + "learning_rate": 6.078088330039945e-06, + "loss": 0.9346, + "step": 1464 + }, + { + "batch_num_effect_tokens": 4699, + "batch_num_samples": 149, + "batch_num_tokens": 52142, + "epoch": 1.33182, + "grad_norm": 2.105827569961548, + "learning_rate": 6.063497818380587e-06, + "loss": 0.2637, + "step": 1465 + }, + { + "batch_num_effect_tokens": 6003, + "batch_num_samples": 149, + "batch_num_tokens": 52177, + "epoch": 1.33273, + "grad_norm": 4.578786373138428, + "learning_rate": 6.0489172168607816e-06, + "loss": 2.0915, + "step": 1466 + }, + { + "batch_num_effect_tokens": 6588, + "batch_num_samples": 149, + "batch_num_tokens": 52198, + "epoch": 1.33364, + "grad_norm": 2.7085275650024414, + "learning_rate": 6.0343465621871774e-06, + "loss": 0.8409, + "step": 1467 + }, + { + "batch_num_effect_tokens": 6068, + "batch_num_samples": 150, + "batch_num_tokens": 52166, + "epoch": 1.33455, + "grad_norm": 3.308121919631958, + "learning_rate": 6.019785891041381e-06, + "loss": 0.9872, + "step": 1468 + }, + { + "batch_num_effect_tokens": 8186, + "batch_num_samples": 149, + "batch_num_tokens": 50591, + "epoch": 1.33545, + "grad_norm": 3.2569260597229004, + "learning_rate": 6.00523524007986e-06, + "loss": 1.6875, + "step": 1469 + }, + { + "batch_num_effect_tokens": 6507, + "batch_num_samples": 149, + "batch_num_tokens": 52147, + "epoch": 1.33636, + "grad_norm": 3.664898633956909, + "learning_rate": 5.990694645933866e-06, + "loss": 1.6415, + "step": 1470 + }, + { + "batch_num_effect_tokens": 3660, + "batch_num_samples": 149, + "batch_num_tokens": 52153, + "epoch": 1.33727, + "grad_norm": 2.9594593048095703, + "learning_rate": 5.9761641452093225e-06, + "loss": 0.5522, + "step": 1471 + }, + { + "batch_num_effect_tokens": 6642, + "batch_num_samples": 149, + "batch_num_tokens": 50519, + "epoch": 1.33818, + "grad_norm": 3.2117419242858887, + "learning_rate": 5.961643774486754e-06, + "loss": 1.0508, + "step": 1472 + }, + { + "batch_num_effect_tokens": 6623, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 1.33909, + "grad_norm": 3.1382222175598145, + "learning_rate": 5.947133570321171e-06, + "loss": 1.0885, + "step": 1473 + }, + { + "batch_num_effect_tokens": 3305, + "batch_num_samples": 149, + "batch_num_tokens": 52220, + "epoch": 1.34, + "grad_norm": 1.8875463008880615, + "learning_rate": 5.932633569242e-06, + "loss": 0.1166, + "step": 1474 + }, + { + "batch_num_effect_tokens": 4473, + "batch_num_samples": 149, + "batch_num_tokens": 52162, + "epoch": 1.34091, + "grad_norm": 2.190162420272827, + "learning_rate": 5.918143807752972e-06, + "loss": 0.2517, + "step": 1475 + }, + { + "batch_num_effect_tokens": 7009, + "batch_num_samples": 149, + "batch_num_tokens": 52142, + "epoch": 1.34182, + "grad_norm": 2.6532437801361084, + "learning_rate": 5.903664322332048e-06, + "loss": 0.7956, + "step": 1476 + }, + { + "batch_num_effect_tokens": 5419, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 1.34273, + "grad_norm": 3.5957748889923096, + "learning_rate": 5.8891951494313096e-06, + "loss": 1.27, + "step": 1477 + }, + { + "batch_num_effect_tokens": 7214, + "batch_num_samples": 149, + "batch_num_tokens": 52182, + "epoch": 1.34364, + "grad_norm": 2.9003989696502686, + "learning_rate": 5.87473632547689e-06, + "loss": 1.1812, + "step": 1478 + }, + { + "batch_num_effect_tokens": 3851, + "batch_num_samples": 149, + "batch_num_tokens": 52204, + "epoch": 1.34455, + "grad_norm": 2.9135546684265137, + "learning_rate": 5.860287886868855e-06, + "loss": 0.4439, + "step": 1479 + }, + { + "batch_num_effect_tokens": 7369, + "batch_num_samples": 149, + "batch_num_tokens": 52214, + "epoch": 1.34545, + "grad_norm": 3.2737839221954346, + "learning_rate": 5.845849869981137e-06, + "loss": 1.3717, + "step": 1480 + }, + { + "batch_num_effect_tokens": 4703, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 1.34636, + "grad_norm": 3.12182879447937, + "learning_rate": 5.831422311161421e-06, + "loss": 0.639, + "step": 1481 + }, + { + "batch_num_effect_tokens": 10256, + "batch_num_samples": 149, + "batch_num_tokens": 52181, + "epoch": 1.34727, + "grad_norm": 2.677966594696045, + "learning_rate": 5.8170052467310734e-06, + "loss": 1.0875, + "step": 1482 + }, + { + "batch_num_effect_tokens": 7250, + "batch_num_samples": 149, + "batch_num_tokens": 52198, + "epoch": 1.34818, + "grad_norm": 3.431149482727051, + "learning_rate": 5.802598712985032e-06, + "loss": 0.986, + "step": 1483 + }, + { + "batch_num_effect_tokens": 7450, + "batch_num_samples": 150, + "batch_num_tokens": 52175, + "epoch": 1.34909, + "grad_norm": 3.0749874114990234, + "learning_rate": 5.788202746191735e-06, + "loss": 1.2524, + "step": 1484 + }, + { + "batch_num_effect_tokens": 5855, + "batch_num_samples": 149, + "batch_num_tokens": 52153, + "epoch": 1.35, + "grad_norm": 2.78263258934021, + "learning_rate": 5.773817382593008e-06, + "loss": 0.5778, + "step": 1485 + }, + { + "batch_num_effect_tokens": 8007, + "batch_num_samples": 149, + "batch_num_tokens": 52177, + "epoch": 1.35091, + "grad_norm": 3.691978931427002, + "learning_rate": 5.759442658403985e-06, + "loss": 1.6977, + "step": 1486 + }, + { + "batch_num_effect_tokens": 6896, + "batch_num_samples": 149, + "batch_num_tokens": 52204, + "epoch": 1.35182, + "grad_norm": 3.4524037837982178, + "learning_rate": 5.7450786098130196e-06, + "loss": 1.0297, + "step": 1487 + }, + { + "batch_num_effect_tokens": 6287, + "batch_num_samples": 149, + "batch_num_tokens": 52139, + "epoch": 1.35273, + "grad_norm": 2.8186326026916504, + "learning_rate": 5.7307252729815835e-06, + "loss": 0.765, + "step": 1488 + }, + { + "batch_num_effect_tokens": 8597, + "batch_num_samples": 149, + "batch_num_tokens": 52153, + "epoch": 1.35364, + "grad_norm": 3.6953465938568115, + "learning_rate": 5.716382684044191e-06, + "loss": 1.7102, + "step": 1489 + }, + { + "batch_num_effect_tokens": 7220, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.35455, + "grad_norm": 2.9871528148651123, + "learning_rate": 5.702050879108284e-06, + "loss": 0.9676, + "step": 1490 + }, + { + "batch_num_effect_tokens": 3919, + "batch_num_samples": 149, + "batch_num_tokens": 52167, + "epoch": 1.35545, + "grad_norm": 4.3469109535217285, + "learning_rate": 5.687729894254175e-06, + "loss": 0.7915, + "step": 1491 + }, + { + "batch_num_effect_tokens": 6381, + "batch_num_samples": 149, + "batch_num_tokens": 52214, + "epoch": 1.35636, + "grad_norm": 2.662632703781128, + "learning_rate": 5.673419765534915e-06, + "loss": 0.6357, + "step": 1492 + }, + { + "batch_num_effect_tokens": 5042, + "batch_num_samples": 149, + "batch_num_tokens": 52222, + "epoch": 1.35727, + "grad_norm": 3.612797737121582, + "learning_rate": 5.659120528976252e-06, + "loss": 1.028, + "step": 1493 + }, + { + "batch_num_effect_tokens": 5236, + "batch_num_samples": 149, + "batch_num_tokens": 52163, + "epoch": 1.35818, + "grad_norm": 3.269341230392456, + "learning_rate": 5.64483222057648e-06, + "loss": 0.6053, + "step": 1494 + }, + { + "batch_num_effect_tokens": 6792, + "batch_num_samples": 149, + "batch_num_tokens": 50565, + "epoch": 1.35909, + "grad_norm": 3.3669612407684326, + "learning_rate": 5.630554876306407e-06, + "loss": 1.4502, + "step": 1495 + }, + { + "batch_num_effect_tokens": 5406, + "batch_num_samples": 149, + "batch_num_tokens": 52176, + "epoch": 1.36, + "grad_norm": 2.8210713863372803, + "learning_rate": 5.616288532109225e-06, + "loss": 0.6667, + "step": 1496 + }, + { + "batch_num_effect_tokens": 4728, + "batch_num_samples": 149, + "batch_num_tokens": 52179, + "epoch": 1.36091, + "grad_norm": 4.39499568939209, + "learning_rate": 5.6020332239004475e-06, + "loss": 0.9532, + "step": 1497 + }, + { + "batch_num_effect_tokens": 6533, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 1.36182, + "grad_norm": 3.180879592895508, + "learning_rate": 5.587788987567785e-06, + "loss": 0.996, + "step": 1498 + }, + { + "batch_num_effect_tokens": 5884, + "batch_num_samples": 149, + "batch_num_tokens": 52179, + "epoch": 1.36273, + "grad_norm": 3.238677740097046, + "learning_rate": 5.5735558589711005e-06, + "loss": 0.9302, + "step": 1499 + }, + { + "batch_num_effect_tokens": 4839, + "batch_num_samples": 149, + "batch_num_tokens": 52141, + "epoch": 1.36364, + "grad_norm": 2.734421730041504, + "learning_rate": 5.559333873942259e-06, + "loss": 0.5516, + "step": 1500 + }, + { + "batch_num_effect_tokens": 6528, + "batch_num_samples": 149, + "batch_num_tokens": 52139, + "epoch": 1.36455, + "grad_norm": 3.6113054752349854, + "learning_rate": 5.545123068285105e-06, + "loss": 1.3667, + "step": 1501 + }, + { + "batch_num_effect_tokens": 6232, + "batch_num_samples": 149, + "batch_num_tokens": 52131, + "epoch": 1.36545, + "grad_norm": 3.091254472732544, + "learning_rate": 5.5309234777753225e-06, + "loss": 0.9556, + "step": 1502 + }, + { + "batch_num_effect_tokens": 6035, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 1.36636, + "grad_norm": 3.764340877532959, + "learning_rate": 5.516735138160356e-06, + "loss": 1.0304, + "step": 1503 + }, + { + "batch_num_effect_tokens": 7763, + "batch_num_samples": 149, + "batch_num_tokens": 52214, + "epoch": 1.36727, + "grad_norm": 3.538569927215576, + "learning_rate": 5.502558085159344e-06, + "loss": 1.4674, + "step": 1504 + }, + { + "batch_num_effect_tokens": 6006, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 1.36818, + "grad_norm": 3.272339105606079, + "learning_rate": 5.488392354462996e-06, + "loss": 1.0269, + "step": 1505 + }, + { + "batch_num_effect_tokens": 7161, + "batch_num_samples": 149, + "batch_num_tokens": 52106, + "epoch": 1.36909, + "grad_norm": 3.459012985229492, + "learning_rate": 5.474237981733521e-06, + "loss": 1.4757, + "step": 1506 + }, + { + "batch_num_effect_tokens": 6900, + "batch_num_samples": 149, + "batch_num_tokens": 52156, + "epoch": 1.37, + "grad_norm": 2.8426942825317383, + "learning_rate": 5.460095002604533e-06, + "loss": 0.981, + "step": 1507 + }, + { + "batch_num_effect_tokens": 3331, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.37091, + "grad_norm": 3.165037155151367, + "learning_rate": 5.445963452680974e-06, + "loss": 0.3777, + "step": 1508 + }, + { + "batch_num_effect_tokens": 6489, + "batch_num_samples": 150, + "batch_num_tokens": 52153, + "epoch": 1.37182, + "grad_norm": 2.9159345626831055, + "learning_rate": 5.431843367538992e-06, + "loss": 0.821, + "step": 1509 + }, + { + "batch_num_effect_tokens": 6978, + "batch_num_samples": 150, + "batch_num_tokens": 52189, + "epoch": 1.37273, + "grad_norm": 2.7146685123443604, + "learning_rate": 5.417734782725896e-06, + "loss": 0.8035, + "step": 1510 + }, + { + "batch_num_effect_tokens": 7808, + "batch_num_samples": 149, + "batch_num_tokens": 52160, + "epoch": 1.37364, + "grad_norm": 2.930331230163574, + "learning_rate": 5.403637733760025e-06, + "loss": 1.374, + "step": 1511 + }, + { + "batch_num_effect_tokens": 10696, + "batch_num_samples": 150, + "batch_num_tokens": 52174, + "epoch": 1.37455, + "grad_norm": 2.759847640991211, + "learning_rate": 5.38955225613069e-06, + "loss": 1.541, + "step": 1512 + }, + { + "batch_num_effect_tokens": 5352, + "batch_num_samples": 149, + "batch_num_tokens": 52198, + "epoch": 1.37545, + "grad_norm": 3.5427732467651367, + "learning_rate": 5.375478385298052e-06, + "loss": 0.7671, + "step": 1513 + }, + { + "batch_num_effect_tokens": 5525, + "batch_num_samples": 150, + "batch_num_tokens": 52214, + "epoch": 1.37636, + "grad_norm": 2.800795078277588, + "learning_rate": 5.361416156693075e-06, + "loss": 0.7603, + "step": 1514 + }, + { + "batch_num_effect_tokens": 4214, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 1.37727, + "grad_norm": 2.761247158050537, + "learning_rate": 5.347365605717394e-06, + "loss": 0.4067, + "step": 1515 + }, + { + "batch_num_effect_tokens": 6234, + "batch_num_samples": 149, + "batch_num_tokens": 52185, + "epoch": 1.37818, + "grad_norm": 3.1770267486572266, + "learning_rate": 5.333326767743263e-06, + "loss": 0.7996, + "step": 1516 + }, + { + "batch_num_effect_tokens": 7461, + "batch_num_samples": 149, + "batch_num_tokens": 52214, + "epoch": 1.37909, + "grad_norm": 3.584190607070923, + "learning_rate": 5.319299678113432e-06, + "loss": 1.1818, + "step": 1517 + }, + { + "batch_num_effect_tokens": 5990, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 1.38, + "grad_norm": 3.2287638187408447, + "learning_rate": 5.305284372141095e-06, + "loss": 1.0169, + "step": 1518 + }, + { + "batch_num_effect_tokens": 5295, + "batch_num_samples": 149, + "batch_num_tokens": 52221, + "epoch": 1.38091, + "grad_norm": 2.710073471069336, + "learning_rate": 5.291280885109756e-06, + "loss": 0.6068, + "step": 1519 + }, + { + "batch_num_effect_tokens": 5249, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 1.38182, + "grad_norm": 3.27878999710083, + "learning_rate": 5.277289252273175e-06, + "loss": 0.8559, + "step": 1520 + }, + { + "batch_num_effect_tokens": 6568, + "batch_num_samples": 149, + "batch_num_tokens": 52163, + "epoch": 1.38273, + "grad_norm": 2.8304049968719482, + "learning_rate": 5.26330950885528e-06, + "loss": 0.9576, + "step": 1521 + }, + { + "batch_num_effect_tokens": 6099, + "batch_num_samples": 149, + "batch_num_tokens": 52207, + "epoch": 1.38364, + "grad_norm": 3.1691155433654785, + "learning_rate": 5.249341690050051e-06, + "loss": 0.8513, + "step": 1522 + }, + { + "batch_num_effect_tokens": 7202, + "batch_num_samples": 149, + "batch_num_tokens": 52139, + "epoch": 1.38455, + "grad_norm": 4.808316230773926, + "learning_rate": 5.235385831021464e-06, + "loss": 1.9307, + "step": 1523 + }, + { + "batch_num_effect_tokens": 6017, + "batch_num_samples": 149, + "batch_num_tokens": 52121, + "epoch": 1.38545, + "grad_norm": 3.3912603855133057, + "learning_rate": 5.221441966903371e-06, + "loss": 1.0597, + "step": 1524 + }, + { + "batch_num_effect_tokens": 4748, + "batch_num_samples": 149, + "batch_num_tokens": 52222, + "epoch": 1.38636, + "grad_norm": 2.911712884902954, + "learning_rate": 5.207510132799436e-06, + "loss": 0.5862, + "step": 1525 + }, + { + "batch_num_effect_tokens": 5880, + "batch_num_samples": 150, + "batch_num_tokens": 52143, + "epoch": 1.38727, + "grad_norm": 3.6869122982025146, + "learning_rate": 5.193590363783027e-06, + "loss": 1.2402, + "step": 1526 + }, + { + "batch_num_effect_tokens": 7678, + "batch_num_samples": 149, + "batch_num_tokens": 52145, + "epoch": 1.38818, + "grad_norm": 3.216240644454956, + "learning_rate": 5.179682694897159e-06, + "loss": 1.4263, + "step": 1527 + }, + { + "batch_num_effect_tokens": 6767, + "batch_num_samples": 149, + "batch_num_tokens": 52215, + "epoch": 1.38909, + "grad_norm": 3.3282392024993896, + "learning_rate": 5.165787161154361e-06, + "loss": 1.1479, + "step": 1528 + }, + { + "batch_num_effect_tokens": 4905, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 1.39, + "grad_norm": 3.347588300704956, + "learning_rate": 5.151903797536631e-06, + "loss": 0.8476, + "step": 1529 + }, + { + "batch_num_effect_tokens": 7924, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 1.39091, + "grad_norm": 2.953521728515625, + "learning_rate": 5.138032638995315e-06, + "loss": 1.2351, + "step": 1530 + }, + { + "batch_num_effect_tokens": 6819, + "batch_num_samples": 149, + "batch_num_tokens": 52187, + "epoch": 1.39182, + "grad_norm": 3.2468717098236084, + "learning_rate": 5.12417372045104e-06, + "loss": 1.2991, + "step": 1531 + }, + { + "batch_num_effect_tokens": 4391, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 1.39273, + "grad_norm": 3.6957287788391113, + "learning_rate": 5.110327076793613e-06, + "loss": 1.2211, + "step": 1532 + }, + { + "batch_num_effect_tokens": 8461, + "batch_num_samples": 149, + "batch_num_tokens": 52163, + "epoch": 1.39364, + "grad_norm": 3.3061976432800293, + "learning_rate": 5.096492742881949e-06, + "loss": 1.7162, + "step": 1533 + }, + { + "batch_num_effect_tokens": 6552, + "batch_num_samples": 149, + "batch_num_tokens": 52159, + "epoch": 1.39455, + "grad_norm": 3.1427457332611084, + "learning_rate": 5.082670753543961e-06, + "loss": 1.0838, + "step": 1534 + }, + { + "batch_num_effect_tokens": 5090, + "batch_num_samples": 149, + "batch_num_tokens": 52141, + "epoch": 1.39545, + "grad_norm": 3.309375524520874, + "learning_rate": 5.0688611435764975e-06, + "loss": 0.7022, + "step": 1535 + }, + { + "batch_num_effect_tokens": 5258, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 1.39636, + "grad_norm": 3.023341655731201, + "learning_rate": 5.055063947745234e-06, + "loss": 0.8063, + "step": 1536 + }, + { + "batch_num_effect_tokens": 6498, + "batch_num_samples": 149, + "batch_num_tokens": 52143, + "epoch": 1.39727, + "grad_norm": 3.497258424758911, + "learning_rate": 5.04127920078459e-06, + "loss": 1.424, + "step": 1537 + }, + { + "batch_num_effect_tokens": 6588, + "batch_num_samples": 149, + "batch_num_tokens": 52180, + "epoch": 1.39818, + "grad_norm": 2.1093833446502686, + "learning_rate": 5.027506937397653e-06, + "loss": 0.5216, + "step": 1538 + }, + { + "batch_num_effect_tokens": 5417, + "batch_num_samples": 149, + "batch_num_tokens": 52137, + "epoch": 1.39909, + "grad_norm": 3.166273593902588, + "learning_rate": 5.013747192256073e-06, + "loss": 0.9344, + "step": 1539 + }, + { + "batch_num_effect_tokens": 5631, + "batch_num_samples": 150, + "batch_num_tokens": 52142, + "epoch": 1.4, + "grad_norm": 3.173602342605591, + "learning_rate": 5.000000000000003e-06, + "loss": 0.9181, + "step": 1540 + }, + { + "batch_num_effect_tokens": 7966, + "batch_num_samples": 149, + "batch_num_tokens": 52132, + "epoch": 1.40091, + "grad_norm": 2.9153196811676025, + "learning_rate": 4.986265395237972e-06, + "loss": 1.2476, + "step": 1541 + }, + { + "batch_num_effect_tokens": 6299, + "batch_num_samples": 149, + "batch_num_tokens": 52138, + "epoch": 1.40182, + "grad_norm": 3.3794291019439697, + "learning_rate": 4.972543412546842e-06, + "loss": 1.0079, + "step": 1542 + }, + { + "batch_num_effect_tokens": 5583, + "batch_num_samples": 149, + "batch_num_tokens": 52117, + "epoch": 1.40273, + "grad_norm": 3.053807497024536, + "learning_rate": 4.958834086471683e-06, + "loss": 0.8067, + "step": 1543 + }, + { + "batch_num_effect_tokens": 6947, + "batch_num_samples": 149, + "batch_num_tokens": 52133, + "epoch": 1.40364, + "grad_norm": 3.5736801624298096, + "learning_rate": 4.945137451525707e-06, + "loss": 1.4208, + "step": 1544 + }, + { + "batch_num_effect_tokens": 7828, + "batch_num_samples": 150, + "batch_num_tokens": 52177, + "epoch": 1.40455, + "grad_norm": 2.5810935497283936, + "learning_rate": 4.931453542190172e-06, + "loss": 0.9514, + "step": 1545 + }, + { + "batch_num_effect_tokens": 5808, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 1.40545, + "grad_norm": 2.9263129234313965, + "learning_rate": 4.917782392914311e-06, + "loss": 0.67, + "step": 1546 + }, + { + "batch_num_effect_tokens": 5023, + "batch_num_samples": 149, + "batch_num_tokens": 52118, + "epoch": 1.40636, + "grad_norm": 3.0986263751983643, + "learning_rate": 4.904124038115219e-06, + "loss": 0.593, + "step": 1547 + }, + { + "batch_num_effect_tokens": 7130, + "batch_num_samples": 150, + "batch_num_tokens": 52144, + "epoch": 1.40727, + "grad_norm": 3.111344575881958, + "learning_rate": 4.890478512177796e-06, + "loss": 1.1165, + "step": 1548 + }, + { + "batch_num_effect_tokens": 4993, + "batch_num_samples": 149, + "batch_num_tokens": 52221, + "epoch": 1.40818, + "grad_norm": 3.050793409347534, + "learning_rate": 4.876845849454631e-06, + "loss": 0.5179, + "step": 1549 + }, + { + "batch_num_effect_tokens": 7141, + "batch_num_samples": 150, + "batch_num_tokens": 52216, + "epoch": 1.40909, + "grad_norm": 3.0416104793548584, + "learning_rate": 4.863226084265939e-06, + "loss": 1.0796, + "step": 1550 + }, + { + "batch_num_effect_tokens": 3840, + "batch_num_samples": 149, + "batch_num_tokens": 52177, + "epoch": 1.41, + "grad_norm": 3.3208167552948, + "learning_rate": 4.849619250899458e-06, + "loss": 0.6583, + "step": 1551 + }, + { + "batch_num_effect_tokens": 5326, + "batch_num_samples": 149, + "batch_num_tokens": 52183, + "epoch": 1.41091, + "grad_norm": 3.120213508605957, + "learning_rate": 4.836025383610382e-06, + "loss": 1.0188, + "step": 1552 + }, + { + "batch_num_effect_tokens": 6750, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.41182, + "grad_norm": 3.2883312702178955, + "learning_rate": 4.822444516621252e-06, + "loss": 1.1825, + "step": 1553 + }, + { + "batch_num_effect_tokens": 5966, + "batch_num_samples": 149, + "batch_num_tokens": 52222, + "epoch": 1.41273, + "grad_norm": 3.5330169200897217, + "learning_rate": 4.808876684121882e-06, + "loss": 0.7584, + "step": 1554 + }, + { + "batch_num_effect_tokens": 5559, + "batch_num_samples": 149, + "batch_num_tokens": 52166, + "epoch": 1.41364, + "grad_norm": 3.6764943599700928, + "learning_rate": 4.795321920269279e-06, + "loss": 1.0578, + "step": 1555 + }, + { + "batch_num_effect_tokens": 8163, + "batch_num_samples": 150, + "batch_num_tokens": 52154, + "epoch": 1.41455, + "grad_norm": 3.582327127456665, + "learning_rate": 4.781780259187543e-06, + "loss": 1.6447, + "step": 1556 + }, + { + "batch_num_effect_tokens": 7129, + "batch_num_samples": 149, + "batch_num_tokens": 52137, + "epoch": 1.41545, + "grad_norm": 3.57389497756958, + "learning_rate": 4.7682517349677895e-06, + "loss": 1.5303, + "step": 1557 + }, + { + "batch_num_effect_tokens": 4651, + "batch_num_samples": 149, + "batch_num_tokens": 52198, + "epoch": 1.41636, + "grad_norm": 3.384352922439575, + "learning_rate": 4.754736381668057e-06, + "loss": 0.7794, + "step": 1558 + }, + { + "batch_num_effect_tokens": 5713, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 1.41727, + "grad_norm": 4.208513259887695, + "learning_rate": 4.741234233313241e-06, + "loss": 1.4527, + "step": 1559 + }, + { + "batch_num_effect_tokens": 6934, + "batch_num_samples": 149, + "batch_num_tokens": 52218, + "epoch": 1.41818, + "grad_norm": 2.540071725845337, + "learning_rate": 4.727745323894976e-06, + "loss": 0.7746, + "step": 1560 + }, + { + "batch_num_effect_tokens": 5160, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 1.41909, + "grad_norm": 3.0919241905212402, + "learning_rate": 4.714269687371581e-06, + "loss": 0.8692, + "step": 1561 + }, + { + "batch_num_effect_tokens": 6866, + "batch_num_samples": 149, + "batch_num_tokens": 52185, + "epoch": 1.42, + "grad_norm": 3.1210877895355225, + "learning_rate": 4.700807357667953e-06, + "loss": 1.0562, + "step": 1562 + }, + { + "batch_num_effect_tokens": 7396, + "batch_num_samples": 149, + "batch_num_tokens": 52102, + "epoch": 1.42091, + "grad_norm": 3.5360233783721924, + "learning_rate": 4.68735836867549e-06, + "loss": 1.5056, + "step": 1563 + }, + { + "batch_num_effect_tokens": 8045, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 1.42182, + "grad_norm": 3.2280681133270264, + "learning_rate": 4.673922754252001e-06, + "loss": 1.0675, + "step": 1564 + }, + { + "batch_num_effect_tokens": 6934, + "batch_num_samples": 149, + "batch_num_tokens": 52158, + "epoch": 1.42273, + "grad_norm": 3.0071732997894287, + "learning_rate": 4.66050054822164e-06, + "loss": 1.0185, + "step": 1565 + }, + { + "batch_num_effect_tokens": 4468, + "batch_num_samples": 149, + "batch_num_tokens": 52136, + "epoch": 1.42364, + "grad_norm": 2.845442056655884, + "learning_rate": 4.647091784374786e-06, + "loss": 0.6274, + "step": 1566 + }, + { + "batch_num_effect_tokens": 5764, + "batch_num_samples": 149, + "batch_num_tokens": 52198, + "epoch": 1.42455, + "grad_norm": 4.495404243469238, + "learning_rate": 4.633696496467991e-06, + "loss": 1.1627, + "step": 1567 + }, + { + "batch_num_effect_tokens": 7567, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 1.42545, + "grad_norm": 2.7251222133636475, + "learning_rate": 4.620314718223876e-06, + "loss": 1.0659, + "step": 1568 + }, + { + "batch_num_effect_tokens": 7245, + "batch_num_samples": 149, + "batch_num_tokens": 52135, + "epoch": 1.42636, + "grad_norm": 3.165076494216919, + "learning_rate": 4.606946483331049e-06, + "loss": 1.1725, + "step": 1569 + }, + { + "batch_num_effect_tokens": 5553, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 1.42727, + "grad_norm": 3.2784016132354736, + "learning_rate": 4.593591825444028e-06, + "loss": 0.9274, + "step": 1570 + }, + { + "batch_num_effect_tokens": 4321, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 1.42818, + "grad_norm": 3.000758647918701, + "learning_rate": 4.580250778183143e-06, + "loss": 0.5813, + "step": 1571 + }, + { + "batch_num_effect_tokens": 5569, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 1.42909, + "grad_norm": 3.104440689086914, + "learning_rate": 4.5669233751344725e-06, + "loss": 0.8375, + "step": 1572 + }, + { + "batch_num_effect_tokens": 5609, + "batch_num_samples": 149, + "batch_num_tokens": 52171, + "epoch": 1.43, + "grad_norm": 3.22114634513855, + "learning_rate": 4.5536096498497295e-06, + "loss": 0.8254, + "step": 1573 + }, + { + "batch_num_effect_tokens": 9521, + "batch_num_samples": 149, + "batch_num_tokens": 52167, + "epoch": 1.43091, + "grad_norm": 2.0068347454071045, + "learning_rate": 4.5403096358462095e-06, + "loss": 0.557, + "step": 1574 + }, + { + "batch_num_effect_tokens": 7954, + "batch_num_samples": 149, + "batch_num_tokens": 52168, + "epoch": 1.43182, + "grad_norm": 2.7655041217803955, + "learning_rate": 4.527023366606678e-06, + "loss": 1.0156, + "step": 1575 + }, + { + "batch_num_effect_tokens": 7397, + "batch_num_samples": 150, + "batch_num_tokens": 52146, + "epoch": 1.43273, + "grad_norm": 3.338231325149536, + "learning_rate": 4.513750875579303e-06, + "loss": 1.3523, + "step": 1576 + }, + { + "batch_num_effect_tokens": 7074, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 1.43364, + "grad_norm": 3.9786431789398193, + "learning_rate": 4.500492196177561e-06, + "loss": 1.6917, + "step": 1577 + }, + { + "batch_num_effect_tokens": 4802, + "batch_num_samples": 149, + "batch_num_tokens": 52196, + "epoch": 1.43455, + "grad_norm": 3.6411070823669434, + "learning_rate": 4.487247361780169e-06, + "loss": 0.9016, + "step": 1578 + }, + { + "batch_num_effect_tokens": 9939, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 1.43545, + "grad_norm": 2.6721506118774414, + "learning_rate": 4.474016405730973e-06, + "loss": 1.2097, + "step": 1579 + }, + { + "batch_num_effect_tokens": 7528, + "batch_num_samples": 149, + "batch_num_tokens": 52141, + "epoch": 1.43636, + "grad_norm": 2.798081874847412, + "learning_rate": 4.460799361338898e-06, + "loss": 0.9998, + "step": 1580 + }, + { + "batch_num_effect_tokens": 4006, + "batch_num_samples": 149, + "batch_num_tokens": 52162, + "epoch": 1.43727, + "grad_norm": 2.9585371017456055, + "learning_rate": 4.447596261877832e-06, + "loss": 0.4894, + "step": 1581 + }, + { + "batch_num_effect_tokens": 4622, + "batch_num_samples": 149, + "batch_num_tokens": 52130, + "epoch": 1.43818, + "grad_norm": 3.357067108154297, + "learning_rate": 4.4344071405865656e-06, + "loss": 0.709, + "step": 1582 + }, + { + "batch_num_effect_tokens": 5982, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 1.43909, + "grad_norm": 3.168221950531006, + "learning_rate": 4.421232030668688e-06, + "loss": 0.7801, + "step": 1583 + }, + { + "batch_num_effect_tokens": 5118, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 1.44, + "grad_norm": 3.276728630065918, + "learning_rate": 4.408070965292534e-06, + "loss": 0.679, + "step": 1584 + }, + { + "batch_num_effect_tokens": 4891, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 1.44091, + "grad_norm": 3.0199480056762695, + "learning_rate": 4.394923977591059e-06, + "loss": 0.6886, + "step": 1585 + }, + { + "batch_num_effect_tokens": 5212, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 1.44182, + "grad_norm": 3.7399933338165283, + "learning_rate": 4.381791100661798e-06, + "loss": 0.8719, + "step": 1586 + }, + { + "batch_num_effect_tokens": 7335, + "batch_num_samples": 150, + "batch_num_tokens": 52177, + "epoch": 1.44273, + "grad_norm": 2.840649366378784, + "learning_rate": 4.368672367566751e-06, + "loss": 0.9722, + "step": 1587 + }, + { + "batch_num_effect_tokens": 7201, + "batch_num_samples": 150, + "batch_num_tokens": 52144, + "epoch": 1.44364, + "grad_norm": 2.9049928188323975, + "learning_rate": 4.355567811332311e-06, + "loss": 1.0122, + "step": 1588 + }, + { + "batch_num_effect_tokens": 5408, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 1.44455, + "grad_norm": 3.15315842628479, + "learning_rate": 4.342477464949182e-06, + "loss": 0.782, + "step": 1589 + }, + { + "batch_num_effect_tokens": 6864, + "batch_num_samples": 150, + "batch_num_tokens": 52218, + "epoch": 1.44545, + "grad_norm": 4.1640167236328125, + "learning_rate": 4.3294013613722944e-06, + "loss": 1.3601, + "step": 1590 + }, + { + "batch_num_effect_tokens": 4136, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.44636, + "grad_norm": 3.5017006397247314, + "learning_rate": 4.316339533520727e-06, + "loss": 0.7745, + "step": 1591 + }, + { + "batch_num_effect_tokens": 5319, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 1.44727, + "grad_norm": 3.5453526973724365, + "learning_rate": 4.3032920142776125e-06, + "loss": 0.9368, + "step": 1592 + }, + { + "batch_num_effect_tokens": 7886, + "batch_num_samples": 149, + "batch_num_tokens": 52168, + "epoch": 1.44818, + "grad_norm": 2.3222293853759766, + "learning_rate": 4.29025883649007e-06, + "loss": 0.5264, + "step": 1593 + }, + { + "batch_num_effect_tokens": 5225, + "batch_num_samples": 149, + "batch_num_tokens": 52180, + "epoch": 1.44909, + "grad_norm": 4.376035690307617, + "learning_rate": 4.2772400329691055e-06, + "loss": 1.4816, + "step": 1594 + }, + { + "batch_num_effect_tokens": 6460, + "batch_num_samples": 149, + "batch_num_tokens": 52082, + "epoch": 1.45, + "grad_norm": 2.6237964630126953, + "learning_rate": 4.264235636489542e-06, + "loss": 0.6, + "step": 1595 + }, + { + "batch_num_effect_tokens": 6284, + "batch_num_samples": 149, + "batch_num_tokens": 52205, + "epoch": 1.45091, + "grad_norm": 3.014374256134033, + "learning_rate": 4.251245679789928e-06, + "loss": 0.9595, + "step": 1596 + }, + { + "batch_num_effect_tokens": 5966, + "batch_num_samples": 149, + "batch_num_tokens": 52129, + "epoch": 1.45182, + "grad_norm": 3.3844950199127197, + "learning_rate": 4.2382701955724724e-06, + "loss": 0.9574, + "step": 1597 + }, + { + "batch_num_effect_tokens": 6036, + "batch_num_samples": 149, + "batch_num_tokens": 52180, + "epoch": 1.45273, + "grad_norm": 3.3551244735717773, + "learning_rate": 4.225309216502933e-06, + "loss": 1.1822, + "step": 1598 + }, + { + "batch_num_effect_tokens": 6962, + "batch_num_samples": 149, + "batch_num_tokens": 52150, + "epoch": 1.45364, + "grad_norm": 3.1049938201904297, + "learning_rate": 4.212362775210566e-06, + "loss": 1.0809, + "step": 1599 + }, + { + "batch_num_effect_tokens": 5748, + "batch_num_samples": 149, + "batch_num_tokens": 52149, + "epoch": 1.45455, + "grad_norm": 3.115337610244751, + "learning_rate": 4.19943090428802e-06, + "loss": 0.7755, + "step": 1600 + }, + { + "batch_num_effect_tokens": 7236, + "batch_num_samples": 149, + "batch_num_tokens": 52169, + "epoch": 1.45545, + "grad_norm": 3.6128201484680176, + "learning_rate": 4.186513636291263e-06, + "loss": 0.9413, + "step": 1601 + }, + { + "batch_num_effect_tokens": 6160, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 1.45636, + "grad_norm": 3.142624855041504, + "learning_rate": 4.173611003739498e-06, + "loss": 0.7049, + "step": 1602 + }, + { + "batch_num_effect_tokens": 4262, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 1.45727, + "grad_norm": 2.6013548374176025, + "learning_rate": 4.160723039115096e-06, + "loss": 0.4307, + "step": 1603 + }, + { + "batch_num_effect_tokens": 6962, + "batch_num_samples": 149, + "batch_num_tokens": 52183, + "epoch": 1.45818, + "grad_norm": 3.305237054824829, + "learning_rate": 4.147849774863488e-06, + "loss": 1.2904, + "step": 1604 + }, + { + "batch_num_effect_tokens": 6581, + "batch_num_samples": 149, + "batch_num_tokens": 52132, + "epoch": 1.45909, + "grad_norm": 3.872040033340454, + "learning_rate": 4.134991243393097e-06, + "loss": 1.4661, + "step": 1605 + }, + { + "batch_num_effect_tokens": 9770, + "batch_num_samples": 149, + "batch_num_tokens": 52161, + "epoch": 1.46, + "grad_norm": 3.0166680812835693, + "learning_rate": 4.12214747707527e-06, + "loss": 1.6123, + "step": 1606 + }, + { + "batch_num_effect_tokens": 5821, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 1.46091, + "grad_norm": 2.6516101360321045, + "learning_rate": 4.109318508244168e-06, + "loss": 0.6149, + "step": 1607 + }, + { + "batch_num_effect_tokens": 6028, + "batch_num_samples": 149, + "batch_num_tokens": 52185, + "epoch": 1.46182, + "grad_norm": 3.194568395614624, + "learning_rate": 4.0965043691967045e-06, + "loss": 1.0061, + "step": 1608 + }, + { + "batch_num_effect_tokens": 7292, + "batch_num_samples": 149, + "batch_num_tokens": 52138, + "epoch": 1.46273, + "grad_norm": 2.660567045211792, + "learning_rate": 4.083705092192457e-06, + "loss": 0.8018, + "step": 1609 + }, + { + "batch_num_effect_tokens": 9606, + "batch_num_samples": 149, + "batch_num_tokens": 52180, + "epoch": 1.46364, + "grad_norm": 3.1425766944885254, + "learning_rate": 4.070920709453597e-06, + "loss": 1.5588, + "step": 1610 + }, + { + "batch_num_effect_tokens": 7163, + "batch_num_samples": 149, + "batch_num_tokens": 52189, + "epoch": 1.46455, + "grad_norm": 3.0139341354370117, + "learning_rate": 4.058151253164786e-06, + "loss": 1.0616, + "step": 1611 + }, + { + "batch_num_effect_tokens": 4896, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.46545, + "grad_norm": 3.9788472652435303, + "learning_rate": 4.045396755473121e-06, + "loss": 1.3641, + "step": 1612 + }, + { + "batch_num_effect_tokens": 5555, + "batch_num_samples": 149, + "batch_num_tokens": 52132, + "epoch": 1.46636, + "grad_norm": 3.3677539825439453, + "learning_rate": 4.032657248488031e-06, + "loss": 1.1608, + "step": 1613 + }, + { + "batch_num_effect_tokens": 11051, + "batch_num_samples": 149, + "batch_num_tokens": 52181, + "epoch": 1.46727, + "grad_norm": 2.7463760375976562, + "learning_rate": 4.019932764281212e-06, + "loss": 1.6416, + "step": 1614 + }, + { + "batch_num_effect_tokens": 4681, + "batch_num_samples": 149, + "batch_num_tokens": 52220, + "epoch": 1.46818, + "grad_norm": 3.089949369430542, + "learning_rate": 4.007223334886531e-06, + "loss": 0.6782, + "step": 1615 + }, + { + "batch_num_effect_tokens": 6030, + "batch_num_samples": 149, + "batch_num_tokens": 52187, + "epoch": 1.46909, + "grad_norm": 2.998955249786377, + "learning_rate": 3.9945289922999705e-06, + "loss": 0.9849, + "step": 1616 + }, + { + "batch_num_effect_tokens": 5378, + "batch_num_samples": 149, + "batch_num_tokens": 52209, + "epoch": 1.47, + "grad_norm": 2.4241385459899902, + "learning_rate": 3.981849768479516e-06, + "loss": 0.3628, + "step": 1617 + }, + { + "batch_num_effect_tokens": 6297, + "batch_num_samples": 149, + "batch_num_tokens": 52187, + "epoch": 1.47091, + "grad_norm": 3.1372172832489014, + "learning_rate": 3.9691856953451044e-06, + "loss": 1.115, + "step": 1618 + }, + { + "batch_num_effect_tokens": 6312, + "batch_num_samples": 149, + "batch_num_tokens": 52152, + "epoch": 1.47182, + "grad_norm": 3.268031120300293, + "learning_rate": 3.956536804778523e-06, + "loss": 1.3478, + "step": 1619 + }, + { + "batch_num_effect_tokens": 5785, + "batch_num_samples": 149, + "batch_num_tokens": 52165, + "epoch": 1.47273, + "grad_norm": 3.2121145725250244, + "learning_rate": 3.943903128623336e-06, + "loss": 1.0974, + "step": 1620 + }, + { + "batch_num_effect_tokens": 5960, + "batch_num_samples": 150, + "batch_num_tokens": 52154, + "epoch": 1.47364, + "grad_norm": 3.1616177558898926, + "learning_rate": 3.931284698684809e-06, + "loss": 0.8974, + "step": 1621 + }, + { + "batch_num_effect_tokens": 4574, + "batch_num_samples": 149, + "batch_num_tokens": 52173, + "epoch": 1.47455, + "grad_norm": 3.1711459159851074, + "learning_rate": 3.918681546729822e-06, + "loss": 0.718, + "step": 1622 + }, + { + "batch_num_effect_tokens": 8323, + "batch_num_samples": 149, + "batch_num_tokens": 52139, + "epoch": 1.47545, + "grad_norm": 2.6506152153015137, + "learning_rate": 3.906093704486802e-06, + "loss": 1.1311, + "step": 1623 + }, + { + "batch_num_effect_tokens": 7577, + "batch_num_samples": 150, + "batch_num_tokens": 52153, + "epoch": 1.47636, + "grad_norm": 3.0118792057037354, + "learning_rate": 3.893521203645618e-06, + "loss": 0.8202, + "step": 1624 + }, + { + "batch_num_effect_tokens": 5825, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 1.47727, + "grad_norm": 2.8670742511749268, + "learning_rate": 3.880964075857535e-06, + "loss": 0.9281, + "step": 1625 + }, + { + "batch_num_effect_tokens": 5137, + "batch_num_samples": 149, + "batch_num_tokens": 52188, + "epoch": 1.47818, + "grad_norm": 3.110680341720581, + "learning_rate": 3.8684223527351025e-06, + "loss": 0.6674, + "step": 1626 + }, + { + "batch_num_effect_tokens": 6837, + "batch_num_samples": 149, + "batch_num_tokens": 52200, + "epoch": 1.47909, + "grad_norm": 3.045212745666504, + "learning_rate": 3.855896065852094e-06, + "loss": 0.9355, + "step": 1627 + }, + { + "batch_num_effect_tokens": 5605, + "batch_num_samples": 149, + "batch_num_tokens": 52141, + "epoch": 1.48, + "grad_norm": 3.3067891597747803, + "learning_rate": 3.8433852467434175e-06, + "loss": 1.0483, + "step": 1628 + }, + { + "batch_num_effect_tokens": 5249, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 1.48091, + "grad_norm": 3.023493528366089, + "learning_rate": 3.830889926905054e-06, + "loss": 0.866, + "step": 1629 + }, + { + "batch_num_effect_tokens": 4265, + "batch_num_samples": 149, + "batch_num_tokens": 52183, + "epoch": 1.48182, + "grad_norm": 2.6965324878692627, + "learning_rate": 3.818410137793947e-06, + "loss": 0.5241, + "step": 1630 + }, + { + "batch_num_effect_tokens": 6096, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 1.48273, + "grad_norm": 3.0876448154449463, + "learning_rate": 3.8059459108279596e-06, + "loss": 0.7893, + "step": 1631 + }, + { + "batch_num_effect_tokens": 5400, + "batch_num_samples": 149, + "batch_num_tokens": 52185, + "epoch": 1.48364, + "grad_norm": 3.066556692123413, + "learning_rate": 3.7934972773857637e-06, + "loss": 0.5504, + "step": 1632 + }, + { + "batch_num_effect_tokens": 5944, + "batch_num_samples": 149, + "batch_num_tokens": 52181, + "epoch": 1.48455, + "grad_norm": 3.781280517578125, + "learning_rate": 3.78106426880678e-06, + "loss": 1.4937, + "step": 1633 + }, + { + "batch_num_effect_tokens": 5206, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.48545, + "grad_norm": 3.534966230392456, + "learning_rate": 3.768646916391089e-06, + "loss": 0.8883, + "step": 1634 + }, + { + "batch_num_effect_tokens": 6922, + "batch_num_samples": 149, + "batch_num_tokens": 52221, + "epoch": 1.48636, + "grad_norm": 2.704559087753296, + "learning_rate": 3.7562452513993676e-06, + "loss": 0.6444, + "step": 1635 + }, + { + "batch_num_effect_tokens": 9185, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.48727, + "grad_norm": 3.4860382080078125, + "learning_rate": 3.743859305052785e-06, + "loss": 1.5957, + "step": 1636 + }, + { + "batch_num_effect_tokens": 7085, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 1.48818, + "grad_norm": 3.459604024887085, + "learning_rate": 3.731489108532954e-06, + "loss": 1.4537, + "step": 1637 + }, + { + "batch_num_effect_tokens": 8872, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 1.48909, + "grad_norm": 2.9613797664642334, + "learning_rate": 3.719134692981826e-06, + "loss": 1.211, + "step": 1638 + }, + { + "batch_num_effect_tokens": 4038, + "batch_num_samples": 149, + "batch_num_tokens": 52108, + "epoch": 1.49, + "grad_norm": 3.5438663959503174, + "learning_rate": 3.7067960895016277e-06, + "loss": 0.7451, + "step": 1639 + }, + { + "batch_num_effect_tokens": 8030, + "batch_num_samples": 149, + "batch_num_tokens": 52152, + "epoch": 1.49091, + "grad_norm": 2.684650421142578, + "learning_rate": 3.6944733291547784e-06, + "loss": 0.907, + "step": 1640 + }, + { + "batch_num_effect_tokens": 7190, + "batch_num_samples": 149, + "batch_num_tokens": 52146, + "epoch": 1.49182, + "grad_norm": 2.9121947288513184, + "learning_rate": 3.6821664429638093e-06, + "loss": 0.9502, + "step": 1641 + }, + { + "batch_num_effect_tokens": 7233, + "batch_num_samples": 149, + "batch_num_tokens": 52204, + "epoch": 1.49273, + "grad_norm": 3.3889174461364746, + "learning_rate": 3.6698754619112974e-06, + "loss": 1.2518, + "step": 1642 + }, + { + "batch_num_effect_tokens": 8993, + "batch_num_samples": 149, + "batch_num_tokens": 52159, + "epoch": 1.49364, + "grad_norm": 2.7589786052703857, + "learning_rate": 3.6576004169397684e-06, + "loss": 1.1827, + "step": 1643 + }, + { + "batch_num_effect_tokens": 7291, + "batch_num_samples": 150, + "batch_num_tokens": 52193, + "epoch": 1.49455, + "grad_norm": 3.014965772628784, + "learning_rate": 3.645341338951639e-06, + "loss": 1.1923, + "step": 1644 + }, + { + "batch_num_effect_tokens": 6077, + "batch_num_samples": 149, + "batch_num_tokens": 52152, + "epoch": 1.49545, + "grad_norm": 2.734469413757324, + "learning_rate": 3.633098258809119e-06, + "loss": 0.747, + "step": 1645 + }, + { + "batch_num_effect_tokens": 7428, + "batch_num_samples": 150, + "batch_num_tokens": 52216, + "epoch": 1.49636, + "grad_norm": 2.967613458633423, + "learning_rate": 3.62087120733415e-06, + "loss": 1.0398, + "step": 1646 + }, + { + "batch_num_effect_tokens": 6955, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 1.49727, + "grad_norm": 3.0190069675445557, + "learning_rate": 3.608660215308315e-06, + "loss": 1.2252, + "step": 1647 + }, + { + "batch_num_effect_tokens": 4942, + "batch_num_samples": 149, + "batch_num_tokens": 52127, + "epoch": 1.49818, + "grad_norm": 3.297241449356079, + "learning_rate": 3.596465313472778e-06, + "loss": 0.8813, + "step": 1648 + }, + { + "batch_num_effect_tokens": 7227, + "batch_num_samples": 150, + "batch_num_tokens": 52142, + "epoch": 1.49909, + "grad_norm": 2.968118906021118, + "learning_rate": 3.584286532528184e-06, + "loss": 1.215, + "step": 1649 + }, + { + "batch_num_effect_tokens": 7004, + "batch_num_samples": 149, + "batch_num_tokens": 52115, + "epoch": 1.5, + "grad_norm": 2.9560141563415527, + "learning_rate": 3.5721239031346067e-06, + "loss": 1.0096, + "step": 1650 + }, + { + "batch_num_effect_tokens": 6649, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 1.50091, + "grad_norm": 2.98976731300354, + "learning_rate": 3.5599774559114475e-06, + "loss": 0.8184, + "step": 1651 + }, + { + "batch_num_effect_tokens": 7209, + "batch_num_samples": 149, + "batch_num_tokens": 52190, + "epoch": 1.50182, + "grad_norm": 3.2497808933258057, + "learning_rate": 3.5478472214373716e-06, + "loss": 1.3472, + "step": 1652 + }, + { + "batch_num_effect_tokens": 6625, + "batch_num_samples": 149, + "batch_num_tokens": 52159, + "epoch": 1.50273, + "grad_norm": 2.750164031982422, + "learning_rate": 3.535733230250228e-06, + "loss": 0.8109, + "step": 1653 + }, + { + "batch_num_effect_tokens": 4242, + "batch_num_samples": 149, + "batch_num_tokens": 52158, + "epoch": 1.50364, + "grad_norm": 3.154306650161743, + "learning_rate": 3.5236355128469814e-06, + "loss": 0.4982, + "step": 1654 + }, + { + "batch_num_effect_tokens": 6324, + "batch_num_samples": 149, + "batch_num_tokens": 52149, + "epoch": 1.50455, + "grad_norm": 3.2036898136138916, + "learning_rate": 3.5115540996836174e-06, + "loss": 1.0942, + "step": 1655 + }, + { + "batch_num_effect_tokens": 6331, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.50545, + "grad_norm": 3.789130210876465, + "learning_rate": 3.4994890211750754e-06, + "loss": 1.0987, + "step": 1656 + }, + { + "batch_num_effect_tokens": 7787, + "batch_num_samples": 149, + "batch_num_tokens": 52163, + "epoch": 1.50636, + "grad_norm": 2.964521884918213, + "learning_rate": 3.4874403076951833e-06, + "loss": 1.1047, + "step": 1657 + }, + { + "batch_num_effect_tokens": 8055, + "batch_num_samples": 149, + "batch_num_tokens": 52134, + "epoch": 1.50727, + "grad_norm": 4.526617050170898, + "learning_rate": 3.4754079895765604e-06, + "loss": 2.2207, + "step": 1658 + }, + { + "batch_num_effect_tokens": 8868, + "batch_num_samples": 149, + "batch_num_tokens": 52155, + "epoch": 1.50818, + "grad_norm": 3.2682905197143555, + "learning_rate": 3.4633920971105515e-06, + "loss": 1.3323, + "step": 1659 + }, + { + "batch_num_effect_tokens": 8718, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 1.50909, + "grad_norm": 3.7098581790924072, + "learning_rate": 3.4513926605471504e-06, + "loss": 1.4918, + "step": 1660 + }, + { + "batch_num_effect_tokens": 5298, + "batch_num_samples": 149, + "batch_num_tokens": 52190, + "epoch": 1.51, + "grad_norm": 2.9554543495178223, + "learning_rate": 3.4394097100949286e-06, + "loss": 0.6862, + "step": 1661 + }, + { + "batch_num_effect_tokens": 4945, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.51091, + "grad_norm": 3.075629949569702, + "learning_rate": 3.4274432759209454e-06, + "loss": 0.7103, + "step": 1662 + }, + { + "batch_num_effect_tokens": 5759, + "batch_num_samples": 149, + "batch_num_tokens": 52222, + "epoch": 1.51182, + "grad_norm": 3.055271625518799, + "learning_rate": 3.415493388150689e-06, + "loss": 0.6997, + "step": 1663 + }, + { + "batch_num_effect_tokens": 7229, + "batch_num_samples": 149, + "batch_num_tokens": 52158, + "epoch": 1.51273, + "grad_norm": 3.065497875213623, + "learning_rate": 3.4035600768679855e-06, + "loss": 1.0441, + "step": 1664 + }, + { + "batch_num_effect_tokens": 7170, + "batch_num_samples": 149, + "batch_num_tokens": 52180, + "epoch": 1.51364, + "grad_norm": 2.845918655395508, + "learning_rate": 3.3916433721149323e-06, + "loss": 1.1487, + "step": 1665 + }, + { + "batch_num_effect_tokens": 5252, + "batch_num_samples": 149, + "batch_num_tokens": 52163, + "epoch": 1.51455, + "grad_norm": 3.0218636989593506, + "learning_rate": 3.379743303891815e-06, + "loss": 0.6869, + "step": 1666 + }, + { + "batch_num_effect_tokens": 6367, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 1.51545, + "grad_norm": 3.1631014347076416, + "learning_rate": 3.367859902157048e-06, + "loss": 0.9597, + "step": 1667 + }, + { + "batch_num_effect_tokens": 5526, + "batch_num_samples": 149, + "batch_num_tokens": 52146, + "epoch": 1.51636, + "grad_norm": 2.9969050884246826, + "learning_rate": 3.355993196827075e-06, + "loss": 0.8599, + "step": 1668 + }, + { + "batch_num_effect_tokens": 6648, + "batch_num_samples": 150, + "batch_num_tokens": 52218, + "epoch": 1.51727, + "grad_norm": 3.920488119125366, + "learning_rate": 3.344143217776319e-06, + "loss": 1.7936, + "step": 1669 + }, + { + "batch_num_effect_tokens": 7578, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 1.51818, + "grad_norm": 2.879525899887085, + "learning_rate": 3.3323099948370853e-06, + "loss": 1.1958, + "step": 1670 + }, + { + "batch_num_effect_tokens": 4766, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.51909, + "grad_norm": 3.3454763889312744, + "learning_rate": 3.3204935577994967e-06, + "loss": 1.0473, + "step": 1671 + }, + { + "batch_num_effect_tokens": 5247, + "batch_num_samples": 149, + "batch_num_tokens": 52116, + "epoch": 1.52, + "grad_norm": 3.61670184135437, + "learning_rate": 3.308693936411421e-06, + "loss": 1.1154, + "step": 1672 + }, + { + "batch_num_effect_tokens": 10026, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 1.52091, + "grad_norm": 2.697474241256714, + "learning_rate": 3.296911160378388e-06, + "loss": 1.3497, + "step": 1673 + }, + { + "batch_num_effect_tokens": 6802, + "batch_num_samples": 149, + "batch_num_tokens": 52149, + "epoch": 1.52182, + "grad_norm": 2.902420997619629, + "learning_rate": 3.2851452593635267e-06, + "loss": 0.9251, + "step": 1674 + }, + { + "batch_num_effect_tokens": 5791, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 1.52273, + "grad_norm": 2.8669610023498535, + "learning_rate": 3.273396262987475e-06, + "loss": 0.5728, + "step": 1675 + }, + { + "batch_num_effect_tokens": 7332, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.52364, + "grad_norm": 2.9916625022888184, + "learning_rate": 3.2616642008283218e-06, + "loss": 1.1941, + "step": 1676 + }, + { + "batch_num_effect_tokens": 5752, + "batch_num_samples": 149, + "batch_num_tokens": 52162, + "epoch": 1.52455, + "grad_norm": 2.614650011062622, + "learning_rate": 3.249949102421518e-06, + "loss": 0.6, + "step": 1677 + }, + { + "batch_num_effect_tokens": 7273, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.52545, + "grad_norm": 2.79362416267395, + "learning_rate": 3.2382509972598087e-06, + "loss": 0.9564, + "step": 1678 + }, + { + "batch_num_effect_tokens": 7268, + "batch_num_samples": 149, + "batch_num_tokens": 52178, + "epoch": 1.52636, + "grad_norm": 3.24861741065979, + "learning_rate": 3.2265699147931562e-06, + "loss": 1.3387, + "step": 1679 + }, + { + "batch_num_effect_tokens": 5726, + "batch_num_samples": 149, + "batch_num_tokens": 52141, + "epoch": 1.52727, + "grad_norm": 2.863703966140747, + "learning_rate": 3.2149058844286796e-06, + "loss": 0.7691, + "step": 1680 + }, + { + "batch_num_effect_tokens": 5880, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.52818, + "grad_norm": 2.8440418243408203, + "learning_rate": 3.2032589355305544e-06, + "loss": 0.7248, + "step": 1681 + }, + { + "batch_num_effect_tokens": 6220, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 1.52909, + "grad_norm": 3.2520759105682373, + "learning_rate": 3.1916290974199658e-06, + "loss": 0.9424, + "step": 1682 + }, + { + "batch_num_effect_tokens": 10256, + "batch_num_samples": 149, + "batch_num_tokens": 52203, + "epoch": 1.53, + "grad_norm": 2.665754556655884, + "learning_rate": 3.1800163993750166e-06, + "loss": 0.9723, + "step": 1683 + }, + { + "batch_num_effect_tokens": 9810, + "batch_num_samples": 149, + "batch_num_tokens": 52170, + "epoch": 1.53091, + "grad_norm": 3.15767240524292, + "learning_rate": 3.1684208706306572e-06, + "loss": 1.5059, + "step": 1684 + }, + { + "batch_num_effect_tokens": 4852, + "batch_num_samples": 149, + "batch_num_tokens": 52144, + "epoch": 1.53182, + "grad_norm": 3.166114568710327, + "learning_rate": 3.1568425403786175e-06, + "loss": 0.6738, + "step": 1685 + }, + { + "batch_num_effect_tokens": 8126, + "batch_num_samples": 149, + "batch_num_tokens": 52213, + "epoch": 1.53273, + "grad_norm": 3.4396119117736816, + "learning_rate": 3.1452814377673344e-06, + "loss": 1.1103, + "step": 1686 + }, + { + "batch_num_effect_tokens": 4718, + "batch_num_samples": 149, + "batch_num_tokens": 50544, + "epoch": 1.53364, + "grad_norm": 3.933088779449463, + "learning_rate": 3.133737591901864e-06, + "loss": 0.5764, + "step": 1687 + }, + { + "batch_num_effect_tokens": 8159, + "batch_num_samples": 149, + "batch_num_tokens": 52181, + "epoch": 1.53455, + "grad_norm": 3.9641008377075195, + "learning_rate": 3.12221103184383e-06, + "loss": 1.6929, + "step": 1688 + }, + { + "batch_num_effect_tokens": 7258, + "batch_num_samples": 150, + "batch_num_tokens": 52178, + "epoch": 1.53545, + "grad_norm": 2.8331658840179443, + "learning_rate": 3.110701786611333e-06, + "loss": 0.778, + "step": 1689 + }, + { + "batch_num_effect_tokens": 8700, + "batch_num_samples": 149, + "batch_num_tokens": 52146, + "epoch": 1.53636, + "grad_norm": 2.990499973297119, + "learning_rate": 3.099209885178882e-06, + "loss": 1.1492, + "step": 1690 + }, + { + "batch_num_effect_tokens": 5717, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 1.53727, + "grad_norm": 3.5310122966766357, + "learning_rate": 3.087735356477326e-06, + "loss": 1.0758, + "step": 1691 + }, + { + "batch_num_effect_tokens": 6778, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.53818, + "grad_norm": 2.8749349117279053, + "learning_rate": 3.076278229393773e-06, + "loss": 0.938, + "step": 1692 + }, + { + "batch_num_effect_tokens": 8421, + "batch_num_samples": 149, + "batch_num_tokens": 52168, + "epoch": 1.53909, + "grad_norm": 2.7926547527313232, + "learning_rate": 3.0648385327715347e-06, + "loss": 0.867, + "step": 1693 + }, + { + "batch_num_effect_tokens": 7804, + "batch_num_samples": 149, + "batch_num_tokens": 52187, + "epoch": 1.54, + "grad_norm": 3.0578343868255615, + "learning_rate": 3.0534162954100264e-06, + "loss": 1.0182, + "step": 1694 + }, + { + "batch_num_effect_tokens": 4232, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 1.54091, + "grad_norm": 2.940028429031372, + "learning_rate": 3.042011546064724e-06, + "loss": 0.6103, + "step": 1695 + }, + { + "batch_num_effect_tokens": 6254, + "batch_num_samples": 149, + "batch_num_tokens": 52188, + "epoch": 1.54182, + "grad_norm": 3.4290342330932617, + "learning_rate": 3.0306243134470668e-06, + "loss": 1.2933, + "step": 1696 + }, + { + "batch_num_effect_tokens": 9518, + "batch_num_samples": 149, + "batch_num_tokens": 52161, + "epoch": 1.54273, + "grad_norm": 2.8330044746398926, + "learning_rate": 3.0192546262243993e-06, + "loss": 1.2832, + "step": 1697 + }, + { + "batch_num_effect_tokens": 4527, + "batch_num_samples": 149, + "batch_num_tokens": 52179, + "epoch": 1.54364, + "grad_norm": 3.1589434146881104, + "learning_rate": 3.0079025130198936e-06, + "loss": 0.5135, + "step": 1698 + }, + { + "batch_num_effect_tokens": 9635, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 1.54455, + "grad_norm": 2.468977689743042, + "learning_rate": 2.9965680024124856e-06, + "loss": 0.9308, + "step": 1699 + }, + { + "batch_num_effect_tokens": 5414, + "batch_num_samples": 149, + "batch_num_tokens": 52174, + "epoch": 1.54545, + "grad_norm": 2.762683153152466, + "learning_rate": 2.9852511229367862e-06, + "loss": 0.6347, + "step": 1700 + }, + { + "batch_num_effect_tokens": 5226, + "batch_num_samples": 149, + "batch_num_tokens": 52179, + "epoch": 1.54636, + "grad_norm": 3.7109501361846924, + "learning_rate": 2.9739519030830333e-06, + "loss": 0.9315, + "step": 1701 + }, + { + "batch_num_effect_tokens": 7824, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.54727, + "grad_norm": 2.943030595779419, + "learning_rate": 2.9626703712969962e-06, + "loss": 1.119, + "step": 1702 + }, + { + "batch_num_effect_tokens": 7364, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 1.54818, + "grad_norm": 3.1611106395721436, + "learning_rate": 2.9514065559799176e-06, + "loss": 1.0515, + "step": 1703 + }, + { + "batch_num_effect_tokens": 6546, + "batch_num_samples": 149, + "batch_num_tokens": 52107, + "epoch": 1.54909, + "grad_norm": 2.5471692085266113, + "learning_rate": 2.940160485488436e-06, + "loss": 0.7075, + "step": 1704 + }, + { + "batch_num_effect_tokens": 7447, + "batch_num_samples": 149, + "batch_num_tokens": 52133, + "epoch": 1.55, + "grad_norm": 3.0841455459594727, + "learning_rate": 2.9289321881345257e-06, + "loss": 0.8351, + "step": 1705 + }, + { + "batch_num_effect_tokens": 5888, + "batch_num_samples": 149, + "batch_num_tokens": 52152, + "epoch": 1.55091, + "grad_norm": 3.2254083156585693, + "learning_rate": 2.91772169218541e-06, + "loss": 0.9222, + "step": 1706 + }, + { + "batch_num_effect_tokens": 7465, + "batch_num_samples": 149, + "batch_num_tokens": 52143, + "epoch": 1.55182, + "grad_norm": 3.2685139179229736, + "learning_rate": 2.906529025863496e-06, + "loss": 1.2465, + "step": 1707 + }, + { + "batch_num_effect_tokens": 5138, + "batch_num_samples": 149, + "batch_num_tokens": 52204, + "epoch": 1.55273, + "grad_norm": 3.4334542751312256, + "learning_rate": 2.8953542173463133e-06, + "loss": 0.881, + "step": 1708 + }, + { + "batch_num_effect_tokens": 4731, + "batch_num_samples": 149, + "batch_num_tokens": 52146, + "epoch": 1.55364, + "grad_norm": 3.9102420806884766, + "learning_rate": 2.8841972947664255e-06, + "loss": 0.603, + "step": 1709 + }, + { + "batch_num_effect_tokens": 5475, + "batch_num_samples": 149, + "batch_num_tokens": 52220, + "epoch": 1.55455, + "grad_norm": 3.3529446125030518, + "learning_rate": 2.8730582862113743e-06, + "loss": 0.8389, + "step": 1710 + }, + { + "batch_num_effect_tokens": 7815, + "batch_num_samples": 149, + "batch_num_tokens": 50567, + "epoch": 1.55545, + "grad_norm": 2.8974621295928955, + "learning_rate": 2.861937219723595e-06, + "loss": 1.0167, + "step": 1711 + }, + { + "batch_num_effect_tokens": 5746, + "batch_num_samples": 149, + "batch_num_tokens": 52162, + "epoch": 1.55636, + "grad_norm": 2.9369184970855713, + "learning_rate": 2.8508341233003656e-06, + "loss": 0.7625, + "step": 1712 + }, + { + "batch_num_effect_tokens": 5279, + "batch_num_samples": 149, + "batch_num_tokens": 52164, + "epoch": 1.55727, + "grad_norm": 3.400021553039551, + "learning_rate": 2.839749024893713e-06, + "loss": 1.1009, + "step": 1713 + }, + { + "batch_num_effect_tokens": 6465, + "batch_num_samples": 149, + "batch_num_tokens": 52212, + "epoch": 1.55818, + "grad_norm": 2.686952829360962, + "learning_rate": 2.8286819524103657e-06, + "loss": 0.6808, + "step": 1714 + }, + { + "batch_num_effect_tokens": 4915, + "batch_num_samples": 149, + "batch_num_tokens": 52163, + "epoch": 1.55909, + "grad_norm": 2.5672800540924072, + "learning_rate": 2.8176329337116604e-06, + "loss": 0.5838, + "step": 1715 + }, + { + "batch_num_effect_tokens": 5828, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.56, + "grad_norm": 3.3081350326538086, + "learning_rate": 2.8066019966134907e-06, + "loss": 0.9368, + "step": 1716 + }, + { + "batch_num_effect_tokens": 8521, + "batch_num_samples": 149, + "batch_num_tokens": 52215, + "epoch": 1.56091, + "grad_norm": 2.824108362197876, + "learning_rate": 2.7955891688862235e-06, + "loss": 0.957, + "step": 1717 + }, + { + "batch_num_effect_tokens": 6462, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 1.56182, + "grad_norm": 3.050898551940918, + "learning_rate": 2.7845944782546453e-06, + "loss": 1.1143, + "step": 1718 + }, + { + "batch_num_effect_tokens": 9397, + "batch_num_samples": 149, + "batch_num_tokens": 52132, + "epoch": 1.56273, + "grad_norm": 2.890437602996826, + "learning_rate": 2.773617952397871e-06, + "loss": 1.2549, + "step": 1719 + }, + { + "batch_num_effect_tokens": 7103, + "batch_num_samples": 149, + "batch_num_tokens": 52114, + "epoch": 1.56364, + "grad_norm": 3.1431565284729004, + "learning_rate": 2.7626596189492983e-06, + "loss": 1.072, + "step": 1720 + }, + { + "batch_num_effect_tokens": 8693, + "batch_num_samples": 150, + "batch_num_tokens": 52179, + "epoch": 1.56455, + "grad_norm": 2.832227945327759, + "learning_rate": 2.751719505496514e-06, + "loss": 1.0417, + "step": 1721 + }, + { + "batch_num_effect_tokens": 5353, + "batch_num_samples": 149, + "batch_num_tokens": 52130, + "epoch": 1.56545, + "grad_norm": 3.170254707336426, + "learning_rate": 2.7407976395812417e-06, + "loss": 0.7899, + "step": 1722 + }, + { + "batch_num_effect_tokens": 5371, + "batch_num_samples": 149, + "batch_num_tokens": 52212, + "epoch": 1.56636, + "grad_norm": 3.459228754043579, + "learning_rate": 2.7298940486992654e-06, + "loss": 1.0546, + "step": 1723 + }, + { + "batch_num_effect_tokens": 7647, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 1.56727, + "grad_norm": 3.55985164642334, + "learning_rate": 2.719008760300359e-06, + "loss": 1.4869, + "step": 1724 + }, + { + "batch_num_effect_tokens": 4177, + "batch_num_samples": 149, + "batch_num_tokens": 52220, + "epoch": 1.56818, + "grad_norm": 3.6153676509857178, + "learning_rate": 2.70814180178823e-06, + "loss": 0.6789, + "step": 1725 + }, + { + "batch_num_effect_tokens": 8808, + "batch_num_samples": 149, + "batch_num_tokens": 52222, + "epoch": 1.56909, + "grad_norm": 3.0937259197235107, + "learning_rate": 2.6972932005204267e-06, + "loss": 1.286, + "step": 1726 + }, + { + "batch_num_effect_tokens": 3874, + "batch_num_samples": 149, + "batch_num_tokens": 52166, + "epoch": 1.57, + "grad_norm": 3.9511656761169434, + "learning_rate": 2.6864629838082957e-06, + "loss": 0.9757, + "step": 1727 + }, + { + "batch_num_effect_tokens": 5497, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 1.57091, + "grad_norm": 3.421701669692993, + "learning_rate": 2.6756511789168926e-06, + "loss": 0.9162, + "step": 1728 + }, + { + "batch_num_effect_tokens": 5104, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.57182, + "grad_norm": 3.42718505859375, + "learning_rate": 2.6648578130649215e-06, + "loss": 0.7696, + "step": 1729 + }, + { + "batch_num_effect_tokens": 7263, + "batch_num_samples": 149, + "batch_num_tokens": 52116, + "epoch": 1.57273, + "grad_norm": 2.9832472801208496, + "learning_rate": 2.6540829134246683e-06, + "loss": 0.9055, + "step": 1730 + }, + { + "batch_num_effect_tokens": 9577, + "batch_num_samples": 149, + "batch_num_tokens": 52182, + "epoch": 1.57364, + "grad_norm": 2.9695019721984863, + "learning_rate": 2.643326507121933e-06, + "loss": 1.2408, + "step": 1731 + }, + { + "batch_num_effect_tokens": 7249, + "batch_num_samples": 149, + "batch_num_tokens": 52177, + "epoch": 1.57455, + "grad_norm": 3.132408857345581, + "learning_rate": 2.6325886212359496e-06, + "loss": 1.2289, + "step": 1732 + }, + { + "batch_num_effect_tokens": 7387, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.57545, + "grad_norm": 3.4658567905426025, + "learning_rate": 2.621869282799342e-06, + "loss": 1.4882, + "step": 1733 + }, + { + "batch_num_effect_tokens": 5103, + "batch_num_samples": 149, + "batch_num_tokens": 52188, + "epoch": 1.57636, + "grad_norm": 3.2325146198272705, + "learning_rate": 2.611168518798026e-06, + "loss": 1.085, + "step": 1734 + }, + { + "batch_num_effect_tokens": 5651, + "batch_num_samples": 149, + "batch_num_tokens": 52168, + "epoch": 1.57727, + "grad_norm": 2.9820148944854736, + "learning_rate": 2.6004863561711633e-06, + "loss": 0.6723, + "step": 1735 + }, + { + "batch_num_effect_tokens": 5592, + "batch_num_samples": 149, + "batch_num_tokens": 52180, + "epoch": 1.57818, + "grad_norm": 2.9091336727142334, + "learning_rate": 2.5898228218110834e-06, + "loss": 0.7578, + "step": 1736 + }, + { + "batch_num_effect_tokens": 6166, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 1.57909, + "grad_norm": 3.0767745971679688, + "learning_rate": 2.5791779425632257e-06, + "loss": 0.936, + "step": 1737 + }, + { + "batch_num_effect_tokens": 4677, + "batch_num_samples": 149, + "batch_num_tokens": 50568, + "epoch": 1.58, + "grad_norm": 2.596431255340576, + "learning_rate": 2.5685517452260566e-06, + "loss": 0.4832, + "step": 1738 + }, + { + "batch_num_effect_tokens": 4098, + "batch_num_samples": 149, + "batch_num_tokens": 52173, + "epoch": 1.58091, + "grad_norm": 3.6022675037384033, + "learning_rate": 2.5579442565510205e-06, + "loss": 0.823, + "step": 1739 + }, + { + "batch_num_effect_tokens": 7194, + "batch_num_samples": 150, + "batch_num_tokens": 52168, + "epoch": 1.58182, + "grad_norm": 2.7989399433135986, + "learning_rate": 2.5473555032424534e-06, + "loss": 0.9279, + "step": 1740 + }, + { + "batch_num_effect_tokens": 5119, + "batch_num_samples": 149, + "batch_num_tokens": 52145, + "epoch": 1.58273, + "grad_norm": 2.3581643104553223, + "learning_rate": 2.5367855119575314e-06, + "loss": 0.3767, + "step": 1741 + }, + { + "batch_num_effect_tokens": 5965, + "batch_num_samples": 149, + "batch_num_tokens": 52178, + "epoch": 1.58364, + "grad_norm": 3.442153215408325, + "learning_rate": 2.526234309306194e-06, + "loss": 0.9723, + "step": 1742 + }, + { + "batch_num_effect_tokens": 5158, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 1.58455, + "grad_norm": 3.5384604930877686, + "learning_rate": 2.515701921851077e-06, + "loss": 0.8521, + "step": 1743 + }, + { + "batch_num_effect_tokens": 7348, + "batch_num_samples": 150, + "batch_num_tokens": 52167, + "epoch": 1.58545, + "grad_norm": 3.1147358417510986, + "learning_rate": 2.5051883761074613e-06, + "loss": 1.1336, + "step": 1744 + }, + { + "batch_num_effect_tokens": 7892, + "batch_num_samples": 149, + "batch_num_tokens": 52167, + "epoch": 1.58636, + "grad_norm": 2.0374927520751953, + "learning_rate": 2.494693698543179e-06, + "loss": 0.4602, + "step": 1745 + }, + { + "batch_num_effect_tokens": 8038, + "batch_num_samples": 149, + "batch_num_tokens": 52170, + "epoch": 1.58727, + "grad_norm": 3.6828629970550537, + "learning_rate": 2.484217915578574e-06, + "loss": 1.5568, + "step": 1746 + }, + { + "batch_num_effect_tokens": 6741, + "batch_num_samples": 149, + "batch_num_tokens": 52135, + "epoch": 1.58818, + "grad_norm": 3.2681405544281006, + "learning_rate": 2.4737610535864145e-06, + "loss": 0.9953, + "step": 1747 + }, + { + "batch_num_effect_tokens": 5818, + "batch_num_samples": 149, + "batch_num_tokens": 52173, + "epoch": 1.58909, + "grad_norm": 2.930443048477173, + "learning_rate": 2.4633231388918377e-06, + "loss": 0.788, + "step": 1748 + }, + { + "batch_num_effect_tokens": 7963, + "batch_num_samples": 150, + "batch_num_tokens": 52201, + "epoch": 1.59, + "grad_norm": 3.2641735076904297, + "learning_rate": 2.45290419777228e-06, + "loss": 1.3389, + "step": 1749 + }, + { + "batch_num_effect_tokens": 4948, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 1.59091, + "grad_norm": 3.009199857711792, + "learning_rate": 2.4425042564574186e-06, + "loss": 0.4797, + "step": 1750 + }, + { + "batch_num_effect_tokens": 6445, + "batch_num_samples": 149, + "batch_num_tokens": 52135, + "epoch": 1.59182, + "grad_norm": 3.7565250396728516, + "learning_rate": 2.432123341129087e-06, + "loss": 1.199, + "step": 1751 + }, + { + "batch_num_effect_tokens": 5496, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 1.59273, + "grad_norm": 3.0002763271331787, + "learning_rate": 2.421761477921232e-06, + "loss": 0.711, + "step": 1752 + }, + { + "batch_num_effect_tokens": 5755, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 1.59364, + "grad_norm": 3.30511474609375, + "learning_rate": 2.411418692919831e-06, + "loss": 1.0191, + "step": 1753 + }, + { + "batch_num_effect_tokens": 8357, + "batch_num_samples": 149, + "batch_num_tokens": 52084, + "epoch": 1.59455, + "grad_norm": 3.0193865299224854, + "learning_rate": 2.401095012162832e-06, + "loss": 1.3428, + "step": 1754 + }, + { + "batch_num_effect_tokens": 7660, + "batch_num_samples": 150, + "batch_num_tokens": 52153, + "epoch": 1.59545, + "grad_norm": 2.9472556114196777, + "learning_rate": 2.3907904616400855e-06, + "loss": 1.1688, + "step": 1755 + }, + { + "batch_num_effect_tokens": 7234, + "batch_num_samples": 150, + "batch_num_tokens": 52188, + "epoch": 1.59636, + "grad_norm": 2.4208133220672607, + "learning_rate": 2.380505067293293e-06, + "loss": 0.6147, + "step": 1756 + }, + { + "batch_num_effect_tokens": 8168, + "batch_num_samples": 149, + "batch_num_tokens": 52215, + "epoch": 1.59727, + "grad_norm": 2.9983484745025635, + "learning_rate": 2.3702388550159172e-06, + "loss": 1.083, + "step": 1757 + }, + { + "batch_num_effect_tokens": 8542, + "batch_num_samples": 149, + "batch_num_tokens": 52192, + "epoch": 1.59818, + "grad_norm": 2.831907272338867, + "learning_rate": 2.3599918506531337e-06, + "loss": 1.1711, + "step": 1758 + }, + { + "batch_num_effect_tokens": 7883, + "batch_num_samples": 149, + "batch_num_tokens": 52165, + "epoch": 1.59909, + "grad_norm": 3.413278818130493, + "learning_rate": 2.3497640800017687e-06, + "loss": 1.4813, + "step": 1759 + }, + { + "batch_num_effect_tokens": 4681, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.6, + "grad_norm": 2.6594529151916504, + "learning_rate": 2.339555568810221e-06, + "loss": 0.581, + "step": 1760 + }, + { + "batch_num_effect_tokens": 10408, + "batch_num_samples": 149, + "batch_num_tokens": 52172, + "epoch": 1.60091, + "grad_norm": 2.7606842517852783, + "learning_rate": 2.329366342778404e-06, + "loss": 1.3419, + "step": 1761 + }, + { + "batch_num_effect_tokens": 7211, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 1.60182, + "grad_norm": 2.9090163707733154, + "learning_rate": 2.3191964275576806e-06, + "loss": 0.8893, + "step": 1762 + }, + { + "batch_num_effect_tokens": 6113, + "batch_num_samples": 149, + "batch_num_tokens": 52182, + "epoch": 1.60273, + "grad_norm": 3.3374078273773193, + "learning_rate": 2.309045848750806e-06, + "loss": 1.1442, + "step": 1763 + }, + { + "batch_num_effect_tokens": 8686, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 1.60364, + "grad_norm": 2.685124158859253, + "learning_rate": 2.2989146319118428e-06, + "loss": 1.0721, + "step": 1764 + }, + { + "batch_num_effect_tokens": 8679, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 1.60455, + "grad_norm": 3.2319931983947754, + "learning_rate": 2.288802802546124e-06, + "loss": 1.5218, + "step": 1765 + }, + { + "batch_num_effect_tokens": 7823, + "batch_num_samples": 149, + "batch_num_tokens": 52163, + "epoch": 1.60545, + "grad_norm": 2.9048807621002197, + "learning_rate": 2.2787103861101656e-06, + "loss": 1.1865, + "step": 1766 + }, + { + "batch_num_effect_tokens": 6285, + "batch_num_samples": 149, + "batch_num_tokens": 52142, + "epoch": 1.60636, + "grad_norm": 3.122467279434204, + "learning_rate": 2.2686374080116136e-06, + "loss": 1.092, + "step": 1767 + }, + { + "batch_num_effect_tokens": 7060, + "batch_num_samples": 149, + "batch_num_tokens": 52174, + "epoch": 1.60727, + "grad_norm": 2.725559711456299, + "learning_rate": 2.2585838936091753e-06, + "loss": 0.6991, + "step": 1768 + }, + { + "batch_num_effect_tokens": 8588, + "batch_num_samples": 149, + "batch_num_tokens": 52182, + "epoch": 1.60818, + "grad_norm": 3.078824758529663, + "learning_rate": 2.2485498682125674e-06, + "loss": 1.3859, + "step": 1769 + }, + { + "batch_num_effect_tokens": 7064, + "batch_num_samples": 149, + "batch_num_tokens": 52107, + "epoch": 1.60909, + "grad_norm": 2.6935298442840576, + "learning_rate": 2.2385353570824308e-06, + "loss": 0.8102, + "step": 1770 + }, + { + "batch_num_effect_tokens": 5637, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.61, + "grad_norm": 3.3661398887634277, + "learning_rate": 2.2285403854302912e-06, + "loss": 1.0509, + "step": 1771 + }, + { + "batch_num_effect_tokens": 6232, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 1.61091, + "grad_norm": 3.273484706878662, + "learning_rate": 2.218564978418475e-06, + "loss": 0.7791, + "step": 1772 + }, + { + "batch_num_effect_tokens": 3892, + "batch_num_samples": 149, + "batch_num_tokens": 52162, + "epoch": 1.61182, + "grad_norm": 4.022351264953613, + "learning_rate": 2.208609161160057e-06, + "loss": 0.8899, + "step": 1773 + }, + { + "batch_num_effect_tokens": 5333, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 1.61273, + "grad_norm": 2.932511806488037, + "learning_rate": 2.198672958718796e-06, + "loss": 0.7308, + "step": 1774 + }, + { + "batch_num_effect_tokens": 6280, + "batch_num_samples": 149, + "batch_num_tokens": 50569, + "epoch": 1.61364, + "grad_norm": 2.7373950481414795, + "learning_rate": 2.1887563961090664e-06, + "loss": 0.634, + "step": 1775 + }, + { + "batch_num_effect_tokens": 5883, + "batch_num_samples": 149, + "batch_num_tokens": 52189, + "epoch": 1.61455, + "grad_norm": 3.1122817993164062, + "learning_rate": 2.1788594982958087e-06, + "loss": 0.8156, + "step": 1776 + }, + { + "batch_num_effect_tokens": 8606, + "batch_num_samples": 149, + "batch_num_tokens": 52211, + "epoch": 1.61545, + "grad_norm": 2.974762201309204, + "learning_rate": 2.1689822901944456e-06, + "loss": 1.2899, + "step": 1777 + }, + { + "batch_num_effect_tokens": 5448, + "batch_num_samples": 149, + "batch_num_tokens": 52138, + "epoch": 1.61636, + "grad_norm": 3.2081286907196045, + "learning_rate": 2.159124796670843e-06, + "loss": 0.7461, + "step": 1778 + }, + { + "batch_num_effect_tokens": 7055, + "batch_num_samples": 149, + "batch_num_tokens": 52213, + "epoch": 1.61727, + "grad_norm": 3.310664653778076, + "learning_rate": 2.149287042541225e-06, + "loss": 1.3933, + "step": 1779 + }, + { + "batch_num_effect_tokens": 7378, + "batch_num_samples": 149, + "batch_num_tokens": 52137, + "epoch": 1.61818, + "grad_norm": 2.7810285091400146, + "learning_rate": 2.1394690525721275e-06, + "loss": 0.9525, + "step": 1780 + }, + { + "batch_num_effect_tokens": 6656, + "batch_num_samples": 150, + "batch_num_tokens": 52143, + "epoch": 1.61909, + "grad_norm": 3.0689432621002197, + "learning_rate": 2.1296708514803244e-06, + "loss": 1.1013, + "step": 1781 + }, + { + "batch_num_effect_tokens": 6009, + "batch_num_samples": 149, + "batch_num_tokens": 52166, + "epoch": 1.62, + "grad_norm": 2.846339702606201, + "learning_rate": 2.119892463932781e-06, + "loss": 0.6385, + "step": 1782 + }, + { + "batch_num_effect_tokens": 7005, + "batch_num_samples": 149, + "batch_num_tokens": 52209, + "epoch": 1.62091, + "grad_norm": 3.1606693267822266, + "learning_rate": 2.1101339145465725e-06, + "loss": 1.0057, + "step": 1783 + }, + { + "batch_num_effect_tokens": 6086, + "batch_num_samples": 149, + "batch_num_tokens": 50559, + "epoch": 1.62182, + "grad_norm": 3.815751552581787, + "learning_rate": 2.1003952278888382e-06, + "loss": 1.5689, + "step": 1784 + }, + { + "batch_num_effect_tokens": 5806, + "batch_num_samples": 149, + "batch_num_tokens": 52123, + "epoch": 1.62273, + "grad_norm": 3.289590835571289, + "learning_rate": 2.090676428476709e-06, + "loss": 1.0711, + "step": 1785 + }, + { + "batch_num_effect_tokens": 3409, + "batch_num_samples": 149, + "batch_num_tokens": 52219, + "epoch": 1.62364, + "grad_norm": 2.739345073699951, + "learning_rate": 2.0809775407772505e-06, + "loss": 0.215, + "step": 1786 + }, + { + "batch_num_effect_tokens": 8592, + "batch_num_samples": 149, + "batch_num_tokens": 52157, + "epoch": 1.62455, + "grad_norm": 2.8174004554748535, + "learning_rate": 2.071298589207399e-06, + "loss": 1.2823, + "step": 1787 + }, + { + "batch_num_effect_tokens": 7297, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.62545, + "grad_norm": 2.857300281524658, + "learning_rate": 2.0616395981339076e-06, + "loss": 1.0295, + "step": 1788 + }, + { + "batch_num_effect_tokens": 6110, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 1.62636, + "grad_norm": 3.4632177352905273, + "learning_rate": 2.05200059187327e-06, + "loss": 1.2141, + "step": 1789 + }, + { + "batch_num_effect_tokens": 6218, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 1.62727, + "grad_norm": 3.2370378971099854, + "learning_rate": 2.0423815946916783e-06, + "loss": 0.9353, + "step": 1790 + }, + { + "batch_num_effect_tokens": 4699, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 1.62818, + "grad_norm": 4.399960041046143, + "learning_rate": 2.032782630804945e-06, + "loss": 0.7076, + "step": 1791 + }, + { + "batch_num_effect_tokens": 5507, + "batch_num_samples": 149, + "batch_num_tokens": 52149, + "epoch": 1.62909, + "grad_norm": 3.497896671295166, + "learning_rate": 2.0232037243784475e-06, + "loss": 1.2132, + "step": 1792 + }, + { + "batch_num_effect_tokens": 6461, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 1.63, + "grad_norm": 2.8389227390289307, + "learning_rate": 2.013644899527074e-06, + "loss": 1.0196, + "step": 1793 + }, + { + "batch_num_effect_tokens": 6493, + "batch_num_samples": 149, + "batch_num_tokens": 52142, + "epoch": 1.63091, + "grad_norm": 3.1590051651000977, + "learning_rate": 2.004106180315151e-06, + "loss": 1.1268, + "step": 1794 + }, + { + "batch_num_effect_tokens": 7699, + "batch_num_samples": 149, + "batch_num_tokens": 52158, + "epoch": 1.63182, + "grad_norm": 3.41550874710083, + "learning_rate": 1.994587590756397e-06, + "loss": 1.2996, + "step": 1795 + }, + { + "batch_num_effect_tokens": 7288, + "batch_num_samples": 149, + "batch_num_tokens": 52177, + "epoch": 1.63273, + "grad_norm": 3.010758399963379, + "learning_rate": 1.9850891548138463e-06, + "loss": 1.2565, + "step": 1796 + }, + { + "batch_num_effect_tokens": 6014, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.63364, + "grad_norm": 3.2456741333007812, + "learning_rate": 1.9756108963998054e-06, + "loss": 1.1311, + "step": 1797 + }, + { + "batch_num_effect_tokens": 4547, + "batch_num_samples": 149, + "batch_num_tokens": 52156, + "epoch": 1.63455, + "grad_norm": 2.8084323406219482, + "learning_rate": 1.9661528393757744e-06, + "loss": 0.4857, + "step": 1798 + }, + { + "batch_num_effect_tokens": 6305, + "batch_num_samples": 149, + "batch_num_tokens": 52170, + "epoch": 1.63545, + "grad_norm": 3.300767183303833, + "learning_rate": 1.956715007552401e-06, + "loss": 0.9622, + "step": 1799 + }, + { + "batch_num_effect_tokens": 7916, + "batch_num_samples": 149, + "batch_num_tokens": 52171, + "epoch": 1.63636, + "grad_norm": 2.9808456897735596, + "learning_rate": 1.947297424689414e-06, + "loss": 0.9933, + "step": 1800 + }, + { + "batch_num_effect_tokens": 6046, + "batch_num_samples": 149, + "batch_num_tokens": 52138, + "epoch": 1.63727, + "grad_norm": 3.626404285430908, + "learning_rate": 1.9379001144955713e-06, + "loss": 1.194, + "step": 1801 + }, + { + "batch_num_effect_tokens": 6036, + "batch_num_samples": 149, + "batch_num_tokens": 52147, + "epoch": 1.63818, + "grad_norm": 3.19749116897583, + "learning_rate": 1.9285231006285855e-06, + "loss": 0.9358, + "step": 1802 + }, + { + "batch_num_effect_tokens": 4047, + "batch_num_samples": 149, + "batch_num_tokens": 50510, + "epoch": 1.63909, + "grad_norm": 2.7461113929748535, + "learning_rate": 1.9191664066950834e-06, + "loss": 0.4657, + "step": 1803 + }, + { + "batch_num_effect_tokens": 6395, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 1.64, + "grad_norm": 2.6643176078796387, + "learning_rate": 1.9098300562505266e-06, + "loss": 0.8469, + "step": 1804 + }, + { + "batch_num_effect_tokens": 4585, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 1.64091, + "grad_norm": 2.729966163635254, + "learning_rate": 1.9005140727991678e-06, + "loss": 0.5909, + "step": 1805 + }, + { + "batch_num_effect_tokens": 6252, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.64182, + "grad_norm": 3.1414084434509277, + "learning_rate": 1.8912184797939803e-06, + "loss": 1.0312, + "step": 1806 + }, + { + "batch_num_effect_tokens": 5722, + "batch_num_samples": 149, + "batch_num_tokens": 52174, + "epoch": 1.64273, + "grad_norm": 3.29822039604187, + "learning_rate": 1.881943300636615e-06, + "loss": 1.0402, + "step": 1807 + }, + { + "batch_num_effect_tokens": 8174, + "batch_num_samples": 149, + "batch_num_tokens": 52132, + "epoch": 1.64364, + "grad_norm": 3.040043830871582, + "learning_rate": 1.8726885586773213e-06, + "loss": 1.1538, + "step": 1808 + }, + { + "batch_num_effect_tokens": 6188, + "batch_num_samples": 149, + "batch_num_tokens": 52186, + "epoch": 1.64455, + "grad_norm": 3.0945098400115967, + "learning_rate": 1.8634542772148978e-06, + "loss": 0.7199, + "step": 1809 + }, + { + "batch_num_effect_tokens": 5535, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.64545, + "grad_norm": 2.3633594512939453, + "learning_rate": 1.854240479496643e-06, + "loss": 0.3655, + "step": 1810 + }, + { + "batch_num_effect_tokens": 7126, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 1.64636, + "grad_norm": 2.8708362579345703, + "learning_rate": 1.8450471887182797e-06, + "loss": 0.8611, + "step": 1811 + }, + { + "batch_num_effect_tokens": 5189, + "batch_num_samples": 149, + "batch_num_tokens": 52222, + "epoch": 1.64727, + "grad_norm": 4.187164783477783, + "learning_rate": 1.8358744280239048e-06, + "loss": 1.5781, + "step": 1812 + }, + { + "batch_num_effect_tokens": 5376, + "batch_num_samples": 149, + "batch_num_tokens": 52135, + "epoch": 1.64818, + "grad_norm": 3.8114173412323, + "learning_rate": 1.826722220505931e-06, + "loss": 0.947, + "step": 1813 + }, + { + "batch_num_effect_tokens": 7132, + "batch_num_samples": 149, + "batch_num_tokens": 52143, + "epoch": 1.64909, + "grad_norm": 3.5395960807800293, + "learning_rate": 1.817590589205035e-06, + "loss": 1.1665, + "step": 1814 + }, + { + "batch_num_effect_tokens": 6801, + "batch_num_samples": 149, + "batch_num_tokens": 50546, + "epoch": 1.65, + "grad_norm": 3.047496795654297, + "learning_rate": 1.808479557110081e-06, + "loss": 0.897, + "step": 1815 + }, + { + "batch_num_effect_tokens": 6263, + "batch_num_samples": 149, + "batch_num_tokens": 52168, + "epoch": 1.65091, + "grad_norm": 2.9120888710021973, + "learning_rate": 1.7993891471580894e-06, + "loss": 0.9763, + "step": 1816 + }, + { + "batch_num_effect_tokens": 8410, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 1.65182, + "grad_norm": 2.9664645195007324, + "learning_rate": 1.7903193822341513e-06, + "loss": 1.3164, + "step": 1817 + }, + { + "batch_num_effect_tokens": 4368, + "batch_num_samples": 149, + "batch_num_tokens": 52141, + "epoch": 1.65273, + "grad_norm": 3.506654977798462, + "learning_rate": 1.7812702851713904e-06, + "loss": 0.6319, + "step": 1818 + }, + { + "batch_num_effect_tokens": 6964, + "batch_num_samples": 149, + "batch_num_tokens": 52189, + "epoch": 1.65364, + "grad_norm": 3.4171743392944336, + "learning_rate": 1.7722418787508956e-06, + "loss": 1.1607, + "step": 1819 + }, + { + "batch_num_effect_tokens": 5712, + "batch_num_samples": 149, + "batch_num_tokens": 50567, + "epoch": 1.65455, + "grad_norm": 3.163783550262451, + "learning_rate": 1.7632341857016733e-06, + "loss": 0.8253, + "step": 1820 + }, + { + "batch_num_effect_tokens": 9105, + "batch_num_samples": 149, + "batch_num_tokens": 52187, + "epoch": 1.65545, + "grad_norm": 2.640961170196533, + "learning_rate": 1.754247228700575e-06, + "loss": 1.1542, + "step": 1821 + }, + { + "batch_num_effect_tokens": 5878, + "batch_num_samples": 149, + "batch_num_tokens": 52180, + "epoch": 1.65636, + "grad_norm": 3.138787269592285, + "learning_rate": 1.74528103037226e-06, + "loss": 1.106, + "step": 1822 + }, + { + "batch_num_effect_tokens": 8034, + "batch_num_samples": 149, + "batch_num_tokens": 52196, + "epoch": 1.65727, + "grad_norm": 3.100926399230957, + "learning_rate": 1.7363356132891196e-06, + "loss": 1.0607, + "step": 1823 + }, + { + "batch_num_effect_tokens": 7090, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 1.65818, + "grad_norm": 2.5713181495666504, + "learning_rate": 1.7274109999712295e-06, + "loss": 0.8876, + "step": 1824 + }, + { + "batch_num_effect_tokens": 8023, + "batch_num_samples": 149, + "batch_num_tokens": 52183, + "epoch": 1.65909, + "grad_norm": 3.4491758346557617, + "learning_rate": 1.7185072128862934e-06, + "loss": 1.3966, + "step": 1825 + }, + { + "batch_num_effect_tokens": 5171, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 1.66, + "grad_norm": 2.647582530975342, + "learning_rate": 1.709624274449584e-06, + "loss": 0.6023, + "step": 1826 + }, + { + "batch_num_effect_tokens": 4286, + "batch_num_samples": 149, + "batch_num_tokens": 52207, + "epoch": 1.66091, + "grad_norm": 2.4348816871643066, + "learning_rate": 1.7007622070238905e-06, + "loss": 0.3872, + "step": 1827 + }, + { + "batch_num_effect_tokens": 7501, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 1.66182, + "grad_norm": 3.173098087310791, + "learning_rate": 1.6919210329194535e-06, + "loss": 1.2194, + "step": 1828 + }, + { + "batch_num_effect_tokens": 7094, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 1.66273, + "grad_norm": 3.285263776779175, + "learning_rate": 1.6831007743939231e-06, + "loss": 1.0569, + "step": 1829 + }, + { + "batch_num_effect_tokens": 8286, + "batch_num_samples": 150, + "batch_num_tokens": 52178, + "epoch": 1.66364, + "grad_norm": 2.7359249591827393, + "learning_rate": 1.6743014536522872e-06, + "loss": 0.9939, + "step": 1830 + }, + { + "batch_num_effect_tokens": 6013, + "batch_num_samples": 149, + "batch_num_tokens": 52124, + "epoch": 1.66455, + "grad_norm": 3.9989168643951416, + "learning_rate": 1.6655230928468257e-06, + "loss": 0.8864, + "step": 1831 + }, + { + "batch_num_effect_tokens": 7145, + "batch_num_samples": 149, + "batch_num_tokens": 52131, + "epoch": 1.66545, + "grad_norm": 3.026437759399414, + "learning_rate": 1.6567657140770477e-06, + "loss": 1.0664, + "step": 1832 + }, + { + "batch_num_effect_tokens": 6330, + "batch_num_samples": 150, + "batch_num_tokens": 52192, + "epoch": 1.66636, + "grad_norm": 3.3815693855285645, + "learning_rate": 1.6480293393896508e-06, + "loss": 1.0099, + "step": 1833 + }, + { + "batch_num_effect_tokens": 4332, + "batch_num_samples": 149, + "batch_num_tokens": 52176, + "epoch": 1.66727, + "grad_norm": 1.886484980583191, + "learning_rate": 1.6393139907784405e-06, + "loss": 0.1685, + "step": 1834 + }, + { + "batch_num_effect_tokens": 6494, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 1.66818, + "grad_norm": 2.715851306915283, + "learning_rate": 1.630619690184303e-06, + "loss": 0.7091, + "step": 1835 + }, + { + "batch_num_effect_tokens": 7968, + "batch_num_samples": 150, + "batch_num_tokens": 52213, + "epoch": 1.66909, + "grad_norm": 2.8430440425872803, + "learning_rate": 1.6219464594951273e-06, + "loss": 1.0437, + "step": 1836 + }, + { + "batch_num_effect_tokens": 10572, + "batch_num_samples": 149, + "batch_num_tokens": 52170, + "epoch": 1.67, + "grad_norm": 2.509157657623291, + "learning_rate": 1.6132943205457607e-06, + "loss": 1.0703, + "step": 1837 + }, + { + "batch_num_effect_tokens": 8004, + "batch_num_samples": 149, + "batch_num_tokens": 52163, + "epoch": 1.67091, + "grad_norm": 3.3842663764953613, + "learning_rate": 1.6046632951179508e-06, + "loss": 1.3952, + "step": 1838 + }, + { + "batch_num_effect_tokens": 6566, + "batch_num_samples": 149, + "batch_num_tokens": 52139, + "epoch": 1.67182, + "grad_norm": 3.1450865268707275, + "learning_rate": 1.5960534049402987e-06, + "loss": 0.9144, + "step": 1839 + }, + { + "batch_num_effect_tokens": 6941, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 1.67273, + "grad_norm": 3.1405293941497803, + "learning_rate": 1.587464671688187e-06, + "loss": 1.0331, + "step": 1840 + }, + { + "batch_num_effect_tokens": 4996, + "batch_num_samples": 149, + "batch_num_tokens": 52143, + "epoch": 1.67364, + "grad_norm": 3.9549872875213623, + "learning_rate": 1.5788971169837474e-06, + "loss": 1.1802, + "step": 1841 + }, + { + "batch_num_effect_tokens": 6037, + "batch_num_samples": 149, + "batch_num_tokens": 52220, + "epoch": 1.67455, + "grad_norm": 2.9194626808166504, + "learning_rate": 1.5703507623957848e-06, + "loss": 0.7473, + "step": 1842 + }, + { + "batch_num_effect_tokens": 9964, + "batch_num_samples": 149, + "batch_num_tokens": 52142, + "epoch": 1.67545, + "grad_norm": 2.755096912384033, + "learning_rate": 1.5618256294397383e-06, + "loss": 1.011, + "step": 1843 + }, + { + "batch_num_effect_tokens": 7898, + "batch_num_samples": 150, + "batch_num_tokens": 52219, + "epoch": 1.67636, + "grad_norm": 2.4391679763793945, + "learning_rate": 1.553321739577619e-06, + "loss": 0.9038, + "step": 1844 + }, + { + "batch_num_effect_tokens": 3673, + "batch_num_samples": 149, + "batch_num_tokens": 52200, + "epoch": 1.67727, + "grad_norm": 3.347842216491699, + "learning_rate": 1.5448391142179575e-06, + "loss": 0.77, + "step": 1845 + }, + { + "batch_num_effect_tokens": 5867, + "batch_num_samples": 149, + "batch_num_tokens": 52141, + "epoch": 1.67818, + "grad_norm": 4.084648132324219, + "learning_rate": 1.536377774715757e-06, + "loss": 1.792, + "step": 1846 + }, + { + "batch_num_effect_tokens": 6421, + "batch_num_samples": 149, + "batch_num_tokens": 52180, + "epoch": 1.67909, + "grad_norm": 2.802750587463379, + "learning_rate": 1.5279377423724261e-06, + "loss": 0.7052, + "step": 1847 + }, + { + "batch_num_effect_tokens": 7452, + "batch_num_samples": 149, + "batch_num_tokens": 52222, + "epoch": 1.68, + "grad_norm": 2.640302896499634, + "learning_rate": 1.5195190384357405e-06, + "loss": 0.8296, + "step": 1848 + }, + { + "batch_num_effect_tokens": 9075, + "batch_num_samples": 149, + "batch_num_tokens": 52213, + "epoch": 1.68091, + "grad_norm": 3.084359884262085, + "learning_rate": 1.5111216840997745e-06, + "loss": 1.3787, + "step": 1849 + }, + { + "batch_num_effect_tokens": 6113, + "batch_num_samples": 149, + "batch_num_tokens": 52094, + "epoch": 1.68182, + "grad_norm": 3.2951931953430176, + "learning_rate": 1.5027457005048573e-06, + "loss": 1.1228, + "step": 1850 + }, + { + "batch_num_effect_tokens": 4228, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 1.68273, + "grad_norm": 3.9379076957702637, + "learning_rate": 1.4943911087375173e-06, + "loss": 0.8264, + "step": 1851 + }, + { + "batch_num_effect_tokens": 4946, + "batch_num_samples": 149, + "batch_num_tokens": 52219, + "epoch": 1.68364, + "grad_norm": 2.608274459838867, + "learning_rate": 1.4860579298304311e-06, + "loss": 0.517, + "step": 1852 + }, + { + "batch_num_effect_tokens": 6964, + "batch_num_samples": 149, + "batch_num_tokens": 52215, + "epoch": 1.68455, + "grad_norm": 3.3249189853668213, + "learning_rate": 1.4777461847623653e-06, + "loss": 1.1667, + "step": 1853 + }, + { + "batch_num_effect_tokens": 7873, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 1.68545, + "grad_norm": 3.2587575912475586, + "learning_rate": 1.4694558944581294e-06, + "loss": 1.5438, + "step": 1854 + }, + { + "batch_num_effect_tokens": 7917, + "batch_num_samples": 149, + "batch_num_tokens": 52157, + "epoch": 1.68636, + "grad_norm": 3.054732322692871, + "learning_rate": 1.4611870797885196e-06, + "loss": 1.1736, + "step": 1855 + }, + { + "batch_num_effect_tokens": 9724, + "batch_num_samples": 149, + "batch_num_tokens": 52142, + "epoch": 1.68727, + "grad_norm": 2.0472991466522217, + "learning_rate": 1.4529397615702656e-06, + "loss": 0.7589, + "step": 1856 + }, + { + "batch_num_effect_tokens": 6360, + "batch_num_samples": 150, + "batch_num_tokens": 52215, + "epoch": 1.68818, + "grad_norm": 3.0648200511932373, + "learning_rate": 1.44471396056598e-06, + "loss": 0.9101, + "step": 1857 + }, + { + "batch_num_effect_tokens": 5741, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 1.68909, + "grad_norm": 2.9164628982543945, + "learning_rate": 1.436509697484111e-06, + "loss": 0.695, + "step": 1858 + }, + { + "batch_num_effect_tokens": 6632, + "batch_num_samples": 149, + "batch_num_tokens": 52118, + "epoch": 1.69, + "grad_norm": 4.474465847015381, + "learning_rate": 1.4283269929788779e-06, + "loss": 1.7564, + "step": 1859 + }, + { + "batch_num_effect_tokens": 5356, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 1.69091, + "grad_norm": 3.284327745437622, + "learning_rate": 1.4201658676502294e-06, + "loss": 0.8759, + "step": 1860 + }, + { + "batch_num_effect_tokens": 8474, + "batch_num_samples": 149, + "batch_num_tokens": 52118, + "epoch": 1.69182, + "grad_norm": 2.9198808670043945, + "learning_rate": 1.4120263420437919e-06, + "loss": 1.4958, + "step": 1861 + }, + { + "batch_num_effect_tokens": 8280, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 1.69273, + "grad_norm": 2.9854891300201416, + "learning_rate": 1.4039084366508094e-06, + "loss": 1.1542, + "step": 1862 + }, + { + "batch_num_effect_tokens": 4069, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 1.69364, + "grad_norm": 4.182518005371094, + "learning_rate": 1.3958121719080986e-06, + "loss": 1.0759, + "step": 1863 + }, + { + "batch_num_effect_tokens": 4885, + "batch_num_samples": 149, + "batch_num_tokens": 52145, + "epoch": 1.69455, + "grad_norm": 3.351048469543457, + "learning_rate": 1.3877375681979944e-06, + "loss": 0.766, + "step": 1864 + }, + { + "batch_num_effect_tokens": 6128, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 1.69545, + "grad_norm": 3.1366031169891357, + "learning_rate": 1.379684645848307e-06, + "loss": 0.8039, + "step": 1865 + }, + { + "batch_num_effect_tokens": 6641, + "batch_num_samples": 149, + "batch_num_tokens": 52152, + "epoch": 1.69636, + "grad_norm": 2.8246731758117676, + "learning_rate": 1.3716534251322543e-06, + "loss": 0.765, + "step": 1866 + }, + { + "batch_num_effect_tokens": 5457, + "batch_num_samples": 149, + "batch_num_tokens": 52214, + "epoch": 1.69727, + "grad_norm": 2.756589889526367, + "learning_rate": 1.3636439262684299e-06, + "loss": 0.7063, + "step": 1867 + }, + { + "batch_num_effect_tokens": 6369, + "batch_num_samples": 149, + "batch_num_tokens": 52150, + "epoch": 1.69818, + "grad_norm": 3.16241192817688, + "learning_rate": 1.3556561694207337e-06, + "loss": 0.9265, + "step": 1868 + }, + { + "batch_num_effect_tokens": 7286, + "batch_num_samples": 150, + "batch_num_tokens": 52219, + "epoch": 1.69909, + "grad_norm": 2.5733494758605957, + "learning_rate": 1.347690174698335e-06, + "loss": 0.5214, + "step": 1869 + }, + { + "batch_num_effect_tokens": 7002, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 1.7, + "grad_norm": 2.967886447906494, + "learning_rate": 1.339745962155613e-06, + "loss": 1.0079, + "step": 1870 + }, + { + "batch_num_effect_tokens": 5487, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.70091, + "grad_norm": 3.7134454250335693, + "learning_rate": 1.3318235517921197e-06, + "loss": 1.1692, + "step": 1871 + }, + { + "batch_num_effect_tokens": 5793, + "batch_num_samples": 149, + "batch_num_tokens": 52082, + "epoch": 1.70182, + "grad_norm": 2.976924419403076, + "learning_rate": 1.3239229635525074e-06, + "loss": 0.7546, + "step": 1872 + }, + { + "batch_num_effect_tokens": 7355, + "batch_num_samples": 149, + "batch_num_tokens": 52146, + "epoch": 1.70273, + "grad_norm": 2.6366283893585205, + "learning_rate": 1.3160442173265032e-06, + "loss": 0.8432, + "step": 1873 + }, + { + "batch_num_effect_tokens": 5844, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 1.70364, + "grad_norm": 3.6266331672668457, + "learning_rate": 1.3081873329488393e-06, + "loss": 1.1098, + "step": 1874 + }, + { + "batch_num_effect_tokens": 4746, + "batch_num_samples": 149, + "batch_num_tokens": 52195, + "epoch": 1.70455, + "grad_norm": 3.1041576862335205, + "learning_rate": 1.3003523301992105e-06, + "loss": 0.564, + "step": 1875 + }, + { + "batch_num_effect_tokens": 9597, + "batch_num_samples": 149, + "batch_num_tokens": 52183, + "epoch": 1.70545, + "grad_norm": 2.8251724243164062, + "learning_rate": 1.2925392288022299e-06, + "loss": 1.197, + "step": 1876 + }, + { + "batch_num_effect_tokens": 4799, + "batch_num_samples": 149, + "batch_num_tokens": 52118, + "epoch": 1.70636, + "grad_norm": 3.5474135875701904, + "learning_rate": 1.2847480484273666e-06, + "loss": 0.8343, + "step": 1877 + }, + { + "batch_num_effect_tokens": 5678, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.70727, + "grad_norm": 3.4982004165649414, + "learning_rate": 1.2769788086889135e-06, + "loss": 0.9056, + "step": 1878 + }, + { + "batch_num_effect_tokens": 10563, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 1.70818, + "grad_norm": 2.6265292167663574, + "learning_rate": 1.269231529145918e-06, + "loss": 1.3058, + "step": 1879 + }, + { + "batch_num_effect_tokens": 10088, + "batch_num_samples": 149, + "batch_num_tokens": 52169, + "epoch": 1.70909, + "grad_norm": 2.6288015842437744, + "learning_rate": 1.2615062293021508e-06, + "loss": 1.2069, + "step": 1880 + }, + { + "batch_num_effect_tokens": 6101, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 1.71, + "grad_norm": 3.580078363418579, + "learning_rate": 1.2538029286060428e-06, + "loss": 1.1826, + "step": 1881 + }, + { + "batch_num_effect_tokens": 6623, + "batch_num_samples": 149, + "batch_num_tokens": 52193, + "epoch": 1.71091, + "grad_norm": 3.16204571723938, + "learning_rate": 1.2461216464506454e-06, + "loss": 1.1779, + "step": 1882 + }, + { + "batch_num_effect_tokens": 4556, + "batch_num_samples": 149, + "batch_num_tokens": 52145, + "epoch": 1.71182, + "grad_norm": 3.065345287322998, + "learning_rate": 1.2384624021735736e-06, + "loss": 0.5967, + "step": 1883 + }, + { + "batch_num_effect_tokens": 6305, + "batch_num_samples": 149, + "batch_num_tokens": 50593, + "epoch": 1.71273, + "grad_norm": 3.218221426010132, + "learning_rate": 1.230825215056971e-06, + "loss": 0.8445, + "step": 1884 + }, + { + "batch_num_effect_tokens": 4052, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 1.71364, + "grad_norm": 2.8410584926605225, + "learning_rate": 1.2232101043274437e-06, + "loss": 0.3147, + "step": 1885 + }, + { + "batch_num_effect_tokens": 10167, + "batch_num_samples": 149, + "batch_num_tokens": 52122, + "epoch": 1.71455, + "grad_norm": 2.5719006061553955, + "learning_rate": 1.215617089156026e-06, + "loss": 1.2197, + "step": 1886 + }, + { + "batch_num_effect_tokens": 7237, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 1.71545, + "grad_norm": 2.9863481521606445, + "learning_rate": 1.208046188658124e-06, + "loss": 1.0261, + "step": 1887 + }, + { + "batch_num_effect_tokens": 4322, + "batch_num_samples": 149, + "batch_num_tokens": 52167, + "epoch": 1.71636, + "grad_norm": 3.265608549118042, + "learning_rate": 1.2004974218934695e-06, + "loss": 0.7267, + "step": 1888 + }, + { + "batch_num_effect_tokens": 5478, + "batch_num_samples": 149, + "batch_num_tokens": 52117, + "epoch": 1.71727, + "grad_norm": 4.387427806854248, + "learning_rate": 1.192970807866073e-06, + "loss": 1.5839, + "step": 1889 + }, + { + "batch_num_effect_tokens": 5198, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 1.71818, + "grad_norm": 3.4425690174102783, + "learning_rate": 1.1854663655241804e-06, + "loss": 0.7649, + "step": 1890 + }, + { + "batch_num_effect_tokens": 6151, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 1.71909, + "grad_norm": 3.0315802097320557, + "learning_rate": 1.177984113760211e-06, + "loss": 0.8902, + "step": 1891 + }, + { + "batch_num_effect_tokens": 8876, + "batch_num_samples": 149, + "batch_num_tokens": 52183, + "epoch": 1.72, + "grad_norm": 2.988717555999756, + "learning_rate": 1.1705240714107301e-06, + "loss": 1.3511, + "step": 1892 + }, + { + "batch_num_effect_tokens": 5401, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 1.72091, + "grad_norm": 4.3993730545043945, + "learning_rate": 1.163086257256385e-06, + "loss": 1.4819, + "step": 1893 + }, + { + "batch_num_effect_tokens": 6641, + "batch_num_samples": 150, + "batch_num_tokens": 52152, + "epoch": 1.72182, + "grad_norm": 3.5445523262023926, + "learning_rate": 1.1556706900218572e-06, + "loss": 1.4186, + "step": 1894 + }, + { + "batch_num_effect_tokens": 7538, + "batch_num_samples": 149, + "batch_num_tokens": 52137, + "epoch": 1.72273, + "grad_norm": 3.3498003482818604, + "learning_rate": 1.1482773883758357e-06, + "loss": 1.31, + "step": 1895 + }, + { + "batch_num_effect_tokens": 5370, + "batch_num_samples": 149, + "batch_num_tokens": 52163, + "epoch": 1.72364, + "grad_norm": 4.466821670532227, + "learning_rate": 1.1409063709309442e-06, + "loss": 1.7006, + "step": 1896 + }, + { + "batch_num_effect_tokens": 6420, + "batch_num_samples": 149, + "batch_num_tokens": 52097, + "epoch": 1.72455, + "grad_norm": 2.439729928970337, + "learning_rate": 1.1335576562437134e-06, + "loss": 0.7083, + "step": 1897 + }, + { + "batch_num_effect_tokens": 7274, + "batch_num_samples": 150, + "batch_num_tokens": 52214, + "epoch": 1.72545, + "grad_norm": 3.1005828380584717, + "learning_rate": 1.126231262814521e-06, + "loss": 1.153, + "step": 1898 + }, + { + "batch_num_effect_tokens": 5792, + "batch_num_samples": 149, + "batch_num_tokens": 52221, + "epoch": 1.72636, + "grad_norm": 2.4603047370910645, + "learning_rate": 1.1189272090875592e-06, + "loss": 0.4733, + "step": 1899 + }, + { + "batch_num_effect_tokens": 6616, + "batch_num_samples": 149, + "batch_num_tokens": 52221, + "epoch": 1.72727, + "grad_norm": 3.304178476333618, + "learning_rate": 1.1116455134507665e-06, + "loss": 1.0372, + "step": 1900 + }, + { + "batch_num_effect_tokens": 4083, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 1.72818, + "grad_norm": 3.7938997745513916, + "learning_rate": 1.1043861942358081e-06, + "loss": 1.1319, + "step": 1901 + }, + { + "batch_num_effect_tokens": 6721, + "batch_num_samples": 150, + "batch_num_tokens": 52177, + "epoch": 1.72909, + "grad_norm": 3.1155800819396973, + "learning_rate": 1.0971492697180097e-06, + "loss": 1.0504, + "step": 1902 + }, + { + "batch_num_effect_tokens": 7175, + "batch_num_samples": 149, + "batch_num_tokens": 52129, + "epoch": 1.73, + "grad_norm": 2.911799907684326, + "learning_rate": 1.0899347581163222e-06, + "loss": 1.0896, + "step": 1903 + }, + { + "batch_num_effect_tokens": 7680, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 1.73091, + "grad_norm": 3.1543545722961426, + "learning_rate": 1.0827426775932658e-06, + "loss": 1.3, + "step": 1904 + }, + { + "batch_num_effect_tokens": 5453, + "batch_num_samples": 149, + "batch_num_tokens": 52131, + "epoch": 1.73182, + "grad_norm": 3.1990206241607666, + "learning_rate": 1.0755730462549008e-06, + "loss": 0.9333, + "step": 1905 + }, + { + "batch_num_effect_tokens": 7691, + "batch_num_samples": 149, + "batch_num_tokens": 52122, + "epoch": 1.73273, + "grad_norm": 2.3991081714630127, + "learning_rate": 1.068425882150762e-06, + "loss": 0.7612, + "step": 1906 + }, + { + "batch_num_effect_tokens": 6211, + "batch_num_samples": 149, + "batch_num_tokens": 52198, + "epoch": 1.73364, + "grad_norm": 3.288367509841919, + "learning_rate": 1.0613012032738268e-06, + "loss": 1.0547, + "step": 1907 + }, + { + "batch_num_effect_tokens": 7345, + "batch_num_samples": 149, + "batch_num_tokens": 52118, + "epoch": 1.73455, + "grad_norm": 2.6501219272613525, + "learning_rate": 1.054199027560463e-06, + "loss": 0.8752, + "step": 1908 + }, + { + "batch_num_effect_tokens": 10309, + "batch_num_samples": 149, + "batch_num_tokens": 52169, + "epoch": 1.73545, + "grad_norm": 2.221205711364746, + "learning_rate": 1.047119372890395e-06, + "loss": 0.9218, + "step": 1909 + }, + { + "batch_num_effect_tokens": 6150, + "batch_num_samples": 149, + "batch_num_tokens": 52179, + "epoch": 1.73636, + "grad_norm": 3.7723615169525146, + "learning_rate": 1.0400622570866426e-06, + "loss": 1.232, + "step": 1910 + }, + { + "batch_num_effect_tokens": 5991, + "batch_num_samples": 149, + "batch_num_tokens": 52147, + "epoch": 1.73727, + "grad_norm": 2.9985649585723877, + "learning_rate": 1.033027697915483e-06, + "loss": 0.8793, + "step": 1911 + }, + { + "batch_num_effect_tokens": 6396, + "batch_num_samples": 149, + "batch_num_tokens": 52142, + "epoch": 1.73818, + "grad_norm": 3.3636891841888428, + "learning_rate": 1.0260157130864178e-06, + "loss": 0.8005, + "step": 1912 + }, + { + "batch_num_effect_tokens": 5555, + "batch_num_samples": 149, + "batch_num_tokens": 52207, + "epoch": 1.73909, + "grad_norm": 2.8261046409606934, + "learning_rate": 1.0190263202521033e-06, + "loss": 0.7, + "step": 1913 + }, + { + "batch_num_effect_tokens": 8611, + "batch_num_samples": 149, + "batch_num_tokens": 52174, + "epoch": 1.74, + "grad_norm": 3.332305669784546, + "learning_rate": 1.012059537008332e-06, + "loss": 1.4691, + "step": 1914 + }, + { + "batch_num_effect_tokens": 7709, + "batch_num_samples": 149, + "batch_num_tokens": 52136, + "epoch": 1.74091, + "grad_norm": 3.5245392322540283, + "learning_rate": 1.0051153808939683e-06, + "loss": 1.6294, + "step": 1915 + }, + { + "batch_num_effect_tokens": 8637, + "batch_num_samples": 149, + "batch_num_tokens": 52181, + "epoch": 1.74182, + "grad_norm": 2.706677198410034, + "learning_rate": 9.981938693909221e-07, + "loss": 1.1122, + "step": 1916 + }, + { + "batch_num_effect_tokens": 4411, + "batch_num_samples": 149, + "batch_num_tokens": 52187, + "epoch": 1.74273, + "grad_norm": 3.1833739280700684, + "learning_rate": 9.912950199240867e-07, + "loss": 0.8263, + "step": 1917 + }, + { + "batch_num_effect_tokens": 4942, + "batch_num_samples": 150, + "batch_num_tokens": 52193, + "epoch": 1.74364, + "grad_norm": 3.8086676597595215, + "learning_rate": 9.844188498613117e-07, + "loss": 0.9263, + "step": 1918 + }, + { + "batch_num_effect_tokens": 6171, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.74455, + "grad_norm": 2.9984874725341797, + "learning_rate": 9.775653765133398e-07, + "loss": 0.6323, + "step": 1919 + }, + { + "batch_num_effect_tokens": 3792, + "batch_num_samples": 149, + "batch_num_tokens": 52143, + "epoch": 1.74545, + "grad_norm": 3.45582914352417, + "learning_rate": 9.707346171337895e-07, + "loss": 0.5912, + "step": 1920 + }, + { + "batch_num_effect_tokens": 7333, + "batch_num_samples": 149, + "batch_num_tokens": 52160, + "epoch": 1.74636, + "grad_norm": 2.9762768745422363, + "learning_rate": 9.63926588919083e-07, + "loss": 1.1486, + "step": 1921 + }, + { + "batch_num_effect_tokens": 5750, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 1.74727, + "grad_norm": 2.3640410900115967, + "learning_rate": 9.571413090084281e-07, + "loss": 0.5453, + "step": 1922 + }, + { + "batch_num_effect_tokens": 7812, + "batch_num_samples": 149, + "batch_num_tokens": 52163, + "epoch": 1.74818, + "grad_norm": 3.1287717819213867, + "learning_rate": 9.503787944837562e-07, + "loss": 1.2124, + "step": 1923 + }, + { + "batch_num_effect_tokens": 5194, + "batch_num_samples": 149, + "batch_num_tokens": 50593, + "epoch": 1.74909, + "grad_norm": 3.3572943210601807, + "learning_rate": 9.436390623696911e-07, + "loss": 0.8045, + "step": 1924 + }, + { + "batch_num_effect_tokens": 4003, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 1.75, + "grad_norm": 2.8662009239196777, + "learning_rate": 9.369221296335007e-07, + "loss": 0.3116, + "step": 1925 + }, + { + "batch_num_effect_tokens": 8198, + "batch_num_samples": 150, + "batch_num_tokens": 52143, + "epoch": 1.75091, + "grad_norm": 3.0082473754882812, + "learning_rate": 9.302280131850538e-07, + "loss": 1.3142, + "step": 1926 + }, + { + "batch_num_effect_tokens": 5362, + "batch_num_samples": 149, + "batch_num_tokens": 52142, + "epoch": 1.75182, + "grad_norm": 3.834455728530884, + "learning_rate": 9.235567298767812e-07, + "loss": 1.5696, + "step": 1927 + }, + { + "batch_num_effect_tokens": 5656, + "batch_num_samples": 149, + "batch_num_tokens": 52187, + "epoch": 1.75273, + "grad_norm": 3.420131206512451, + "learning_rate": 9.16908296503628e-07, + "loss": 1.1827, + "step": 1928 + }, + { + "batch_num_effect_tokens": 2948, + "batch_num_samples": 149, + "batch_num_tokens": 52130, + "epoch": 1.75364, + "grad_norm": 1.886879801750183, + "learning_rate": 9.102827298030226e-07, + "loss": 0.0661, + "step": 1929 + }, + { + "batch_num_effect_tokens": 6512, + "batch_num_samples": 149, + "batch_num_tokens": 52126, + "epoch": 1.75455, + "grad_norm": 3.2286927700042725, + "learning_rate": 9.036800464548157e-07, + "loss": 0.9319, + "step": 1930 + }, + { + "batch_num_effect_tokens": 6389, + "batch_num_samples": 149, + "batch_num_tokens": 50559, + "epoch": 1.75545, + "grad_norm": 2.8748347759246826, + "learning_rate": 8.97100263081262e-07, + "loss": 0.9958, + "step": 1931 + }, + { + "batch_num_effect_tokens": 11793, + "batch_num_samples": 149, + "batch_num_tokens": 52161, + "epoch": 1.75636, + "grad_norm": 2.4396183490753174, + "learning_rate": 8.905433962469489e-07, + "loss": 1.2373, + "step": 1932 + }, + { + "batch_num_effect_tokens": 6132, + "batch_num_samples": 149, + "batch_num_tokens": 52181, + "epoch": 1.75727, + "grad_norm": 2.9254977703094482, + "learning_rate": 8.840094624587892e-07, + "loss": 0.9092, + "step": 1933 + }, + { + "batch_num_effect_tokens": 4157, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 1.75818, + "grad_norm": 3.1090028285980225, + "learning_rate": 8.774984781659468e-07, + "loss": 0.3909, + "step": 1934 + }, + { + "batch_num_effect_tokens": 7081, + "batch_num_samples": 149, + "batch_num_tokens": 52167, + "epoch": 1.75909, + "grad_norm": 2.3378493785858154, + "learning_rate": 8.710104597598224e-07, + "loss": 0.7473, + "step": 1935 + }, + { + "batch_num_effect_tokens": 5516, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.76, + "grad_norm": 3.141493558883667, + "learning_rate": 8.645454235739903e-07, + "loss": 0.8269, + "step": 1936 + }, + { + "batch_num_effect_tokens": 4122, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 1.76091, + "grad_norm": 3.506274938583374, + "learning_rate": 8.581033858841769e-07, + "loss": 0.5283, + "step": 1937 + }, + { + "batch_num_effect_tokens": 6230, + "batch_num_samples": 149, + "batch_num_tokens": 52185, + "epoch": 1.76182, + "grad_norm": 3.2314116954803467, + "learning_rate": 8.516843629081983e-07, + "loss": 1.0369, + "step": 1938 + }, + { + "batch_num_effect_tokens": 5545, + "batch_num_samples": 149, + "batch_num_tokens": 52118, + "epoch": 1.76273, + "grad_norm": 3.286485433578491, + "learning_rate": 8.4528837080594e-07, + "loss": 0.9618, + "step": 1939 + }, + { + "batch_num_effect_tokens": 5104, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 1.76364, + "grad_norm": 2.489656686782837, + "learning_rate": 8.389154256793042e-07, + "loss": 0.6243, + "step": 1940 + }, + { + "batch_num_effect_tokens": 6727, + "batch_num_samples": 150, + "batch_num_tokens": 52215, + "epoch": 1.76455, + "grad_norm": 2.5669116973876953, + "learning_rate": 8.325655435721735e-07, + "loss": 0.8501, + "step": 1941 + }, + { + "batch_num_effect_tokens": 5069, + "batch_num_samples": 149, + "batch_num_tokens": 52156, + "epoch": 1.76545, + "grad_norm": 2.9081106185913086, + "learning_rate": 8.262387404703654e-07, + "loss": 0.5421, + "step": 1942 + }, + { + "batch_num_effect_tokens": 7477, + "batch_num_samples": 150, + "batch_num_tokens": 52212, + "epoch": 1.76636, + "grad_norm": 5.922401428222656, + "learning_rate": 8.199350323016042e-07, + "loss": 1.4223, + "step": 1943 + }, + { + "batch_num_effect_tokens": 6118, + "batch_num_samples": 149, + "batch_num_tokens": 52220, + "epoch": 1.76727, + "grad_norm": 3.127690076828003, + "learning_rate": 8.136544349354669e-07, + "loss": 0.9317, + "step": 1944 + }, + { + "batch_num_effect_tokens": 4070, + "batch_num_samples": 149, + "batch_num_tokens": 52220, + "epoch": 1.76818, + "grad_norm": 3.2388033866882324, + "learning_rate": 8.073969641833446e-07, + "loss": 0.5806, + "step": 1945 + }, + { + "batch_num_effect_tokens": 6078, + "batch_num_samples": 149, + "batch_num_tokens": 52149, + "epoch": 1.76909, + "grad_norm": 3.322255849838257, + "learning_rate": 8.011626357984182e-07, + "loss": 0.9383, + "step": 1946 + }, + { + "batch_num_effect_tokens": 8281, + "batch_num_samples": 149, + "batch_num_tokens": 52139, + "epoch": 1.77, + "grad_norm": 3.274935245513916, + "learning_rate": 7.949514654755963e-07, + "loss": 1.4529, + "step": 1947 + }, + { + "batch_num_effect_tokens": 5857, + "batch_num_samples": 149, + "batch_num_tokens": 52198, + "epoch": 1.77091, + "grad_norm": 3.174466848373413, + "learning_rate": 7.887634688515e-07, + "loss": 0.936, + "step": 1948 + }, + { + "batch_num_effect_tokens": 7239, + "batch_num_samples": 149, + "batch_num_tokens": 52176, + "epoch": 1.77182, + "grad_norm": 3.1621530055999756, + "learning_rate": 7.825986615043967e-07, + "loss": 1.1346, + "step": 1949 + }, + { + "batch_num_effect_tokens": 8131, + "batch_num_samples": 149, + "batch_num_tokens": 52221, + "epoch": 1.77273, + "grad_norm": 3.23815655708313, + "learning_rate": 7.764570589541876e-07, + "loss": 1.3067, + "step": 1950 + }, + { + "batch_num_effect_tokens": 7187, + "batch_num_samples": 149, + "batch_num_tokens": 52118, + "epoch": 1.77364, + "grad_norm": 2.7597808837890625, + "learning_rate": 7.703386766623444e-07, + "loss": 0.7689, + "step": 1951 + }, + { + "batch_num_effect_tokens": 6545, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.77455, + "grad_norm": 3.302802324295044, + "learning_rate": 7.642435300318906e-07, + "loss": 1.2263, + "step": 1952 + }, + { + "batch_num_effect_tokens": 8419, + "batch_num_samples": 149, + "batch_num_tokens": 52214, + "epoch": 1.77545, + "grad_norm": 2.663577079772949, + "learning_rate": 7.581716344073476e-07, + "loss": 0.9514, + "step": 1953 + }, + { + "batch_num_effect_tokens": 10600, + "batch_num_samples": 149, + "batch_num_tokens": 52188, + "epoch": 1.77636, + "grad_norm": 2.5999982357025146, + "learning_rate": 7.521230050747086e-07, + "loss": 1.3394, + "step": 1954 + }, + { + "batch_num_effect_tokens": 7929, + "batch_num_samples": 150, + "batch_num_tokens": 52220, + "epoch": 1.77727, + "grad_norm": 3.399446487426758, + "learning_rate": 7.460976572613888e-07, + "loss": 1.2472, + "step": 1955 + }, + { + "batch_num_effect_tokens": 5821, + "batch_num_samples": 149, + "batch_num_tokens": 52125, + "epoch": 1.77818, + "grad_norm": 2.714089870452881, + "learning_rate": 7.400956061361975e-07, + "loss": 0.6145, + "step": 1956 + }, + { + "batch_num_effect_tokens": 5670, + "batch_num_samples": 149, + "batch_num_tokens": 52175, + "epoch": 1.77909, + "grad_norm": 3.530482530593872, + "learning_rate": 7.341168668092857e-07, + "loss": 0.8642, + "step": 1957 + }, + { + "batch_num_effect_tokens": 5753, + "batch_num_samples": 149, + "batch_num_tokens": 50567, + "epoch": 1.78, + "grad_norm": 2.5470070838928223, + "learning_rate": 7.281614543321269e-07, + "loss": 0.5887, + "step": 1958 + }, + { + "batch_num_effect_tokens": 6883, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 1.78091, + "grad_norm": 2.87015438079834, + "learning_rate": 7.222293836974614e-07, + "loss": 0.8979, + "step": 1959 + }, + { + "batch_num_effect_tokens": 6880, + "batch_num_samples": 149, + "batch_num_tokens": 52138, + "epoch": 1.78182, + "grad_norm": 3.753164291381836, + "learning_rate": 7.163206698392744e-07, + "loss": 1.5885, + "step": 1960 + }, + { + "batch_num_effect_tokens": 6697, + "batch_num_samples": 149, + "batch_num_tokens": 52161, + "epoch": 1.78273, + "grad_norm": 3.28859281539917, + "learning_rate": 7.104353276327414e-07, + "loss": 1.1016, + "step": 1961 + }, + { + "batch_num_effect_tokens": 6404, + "batch_num_samples": 149, + "batch_num_tokens": 52209, + "epoch": 1.78364, + "grad_norm": 3.4291625022888184, + "learning_rate": 7.045733718942094e-07, + "loss": 1.1827, + "step": 1962 + }, + { + "batch_num_effect_tokens": 8762, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 1.78455, + "grad_norm": 2.9575815200805664, + "learning_rate": 6.987348173811415e-07, + "loss": 1.2368, + "step": 1963 + }, + { + "batch_num_effect_tokens": 9222, + "batch_num_samples": 149, + "batch_num_tokens": 52176, + "epoch": 1.78545, + "grad_norm": 2.867325782775879, + "learning_rate": 6.9291967879209e-07, + "loss": 1.1963, + "step": 1964 + }, + { + "batch_num_effect_tokens": 6282, + "batch_num_samples": 149, + "batch_num_tokens": 52168, + "epoch": 1.78636, + "grad_norm": 2.8609049320220947, + "learning_rate": 6.871279707666634e-07, + "loss": 0.8053, + "step": 1965 + }, + { + "batch_num_effect_tokens": 6003, + "batch_num_samples": 149, + "batch_num_tokens": 50555, + "epoch": 1.78727, + "grad_norm": 3.0638647079467773, + "learning_rate": 6.813597078854772e-07, + "loss": 0.8633, + "step": 1966 + }, + { + "batch_num_effect_tokens": 4843, + "batch_num_samples": 149, + "batch_num_tokens": 52118, + "epoch": 1.78818, + "grad_norm": 3.135483741760254, + "learning_rate": 6.756149046701277e-07, + "loss": 0.7593, + "step": 1967 + }, + { + "batch_num_effect_tokens": 6118, + "batch_num_samples": 149, + "batch_num_tokens": 52146, + "epoch": 1.78909, + "grad_norm": 2.4894979000091553, + "learning_rate": 6.698935755831493e-07, + "loss": 0.5755, + "step": 1968 + }, + { + "batch_num_effect_tokens": 7576, + "batch_num_samples": 149, + "batch_num_tokens": 52177, + "epoch": 1.79, + "grad_norm": 2.775993585586548, + "learning_rate": 6.641957350279838e-07, + "loss": 0.8734, + "step": 1969 + }, + { + "batch_num_effect_tokens": 7291, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 1.79091, + "grad_norm": 3.0943503379821777, + "learning_rate": 6.585213973489335e-07, + "loss": 1.1922, + "step": 1970 + }, + { + "batch_num_effect_tokens": 6771, + "batch_num_samples": 149, + "batch_num_tokens": 52081, + "epoch": 1.79182, + "grad_norm": 2.703305959701538, + "learning_rate": 6.528705768311395e-07, + "loss": 0.651, + "step": 1971 + }, + { + "batch_num_effect_tokens": 6261, + "batch_num_samples": 149, + "batch_num_tokens": 52171, + "epoch": 1.79273, + "grad_norm": 3.735302209854126, + "learning_rate": 6.472432877005341e-07, + "loss": 1.5032, + "step": 1972 + }, + { + "batch_num_effect_tokens": 5538, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 1.79364, + "grad_norm": 2.7473983764648438, + "learning_rate": 6.416395441238143e-07, + "loss": 0.7273, + "step": 1973 + }, + { + "batch_num_effect_tokens": 7363, + "batch_num_samples": 149, + "batch_num_tokens": 52146, + "epoch": 1.79455, + "grad_norm": 2.6810669898986816, + "learning_rate": 6.360593602083942e-07, + "loss": 0.942, + "step": 1974 + }, + { + "batch_num_effect_tokens": 8806, + "batch_num_samples": 149, + "batch_num_tokens": 52175, + "epoch": 1.79545, + "grad_norm": 3.0994985103607178, + "learning_rate": 6.305027500023841e-07, + "loss": 1.3185, + "step": 1975 + }, + { + "batch_num_effect_tokens": 6187, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 1.79636, + "grad_norm": 3.1788151264190674, + "learning_rate": 6.249697274945377e-07, + "loss": 0.8476, + "step": 1976 + }, + { + "batch_num_effect_tokens": 5772, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.79727, + "grad_norm": 3.620708703994751, + "learning_rate": 6.19460306614238e-07, + "loss": 1.2019, + "step": 1977 + }, + { + "batch_num_effect_tokens": 6709, + "batch_num_samples": 149, + "batch_num_tokens": 52207, + "epoch": 1.79818, + "grad_norm": 2.2427990436553955, + "learning_rate": 6.139745012314424e-07, + "loss": 0.5892, + "step": 1978 + }, + { + "batch_num_effect_tokens": 5345, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 1.79909, + "grad_norm": 2.9493868350982666, + "learning_rate": 6.085123251566616e-07, + "loss": 0.8059, + "step": 1979 + }, + { + "batch_num_effect_tokens": 7749, + "batch_num_samples": 149, + "batch_num_tokens": 52166, + "epoch": 1.8, + "grad_norm": 3.1915297508239746, + "learning_rate": 6.030737921409169e-07, + "loss": 1.394, + "step": 1980 + }, + { + "batch_num_effect_tokens": 5492, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 1.80091, + "grad_norm": 2.7314164638519287, + "learning_rate": 5.976589158757074e-07, + "loss": 0.4688, + "step": 1981 + }, + { + "batch_num_effect_tokens": 5450, + "batch_num_samples": 149, + "batch_num_tokens": 52187, + "epoch": 1.80182, + "grad_norm": 3.20293927192688, + "learning_rate": 5.922677099929785e-07, + "loss": 0.8613, + "step": 1982 + }, + { + "batch_num_effect_tokens": 7602, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 1.80273, + "grad_norm": 3.1139471530914307, + "learning_rate": 5.869001880650826e-07, + "loss": 1.2699, + "step": 1983 + }, + { + "batch_num_effect_tokens": 4704, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 1.80364, + "grad_norm": 3.194953680038452, + "learning_rate": 5.815563636047539e-07, + "loss": 0.6933, + "step": 1984 + }, + { + "batch_num_effect_tokens": 7539, + "batch_num_samples": 149, + "batch_num_tokens": 52216, + "epoch": 1.80455, + "grad_norm": 3.176323413848877, + "learning_rate": 5.762362500650598e-07, + "loss": 1.1949, + "step": 1985 + }, + { + "batch_num_effect_tokens": 7143, + "batch_num_samples": 149, + "batch_num_tokens": 52213, + "epoch": 1.80545, + "grad_norm": 3.151172637939453, + "learning_rate": 5.709398608393835e-07, + "loss": 1.1113, + "step": 1986 + }, + { + "batch_num_effect_tokens": 8160, + "batch_num_samples": 149, + "batch_num_tokens": 52168, + "epoch": 1.80636, + "grad_norm": 1.761962652206421, + "learning_rate": 5.656672092613757e-07, + "loss": 0.3349, + "step": 1987 + }, + { + "batch_num_effect_tokens": 7576, + "batch_num_samples": 149, + "batch_num_tokens": 52209, + "epoch": 1.80727, + "grad_norm": 2.854128837585449, + "learning_rate": 5.604183086049342e-07, + "loss": 0.9761, + "step": 1988 + }, + { + "batch_num_effect_tokens": 8291, + "batch_num_samples": 149, + "batch_num_tokens": 52158, + "epoch": 1.80818, + "grad_norm": 3.2257771492004395, + "learning_rate": 5.551931720841541e-07, + "loss": 1.2465, + "step": 1989 + }, + { + "batch_num_effect_tokens": 5120, + "batch_num_samples": 150, + "batch_num_tokens": 52167, + "epoch": 1.80909, + "grad_norm": 3.463531732559204, + "learning_rate": 5.499918128533155e-07, + "loss": 0.8566, + "step": 1990 + }, + { + "batch_num_effect_tokens": 5706, + "batch_num_samples": 149, + "batch_num_tokens": 52111, + "epoch": 1.81, + "grad_norm": 3.1953766345977783, + "learning_rate": 5.448142440068316e-07, + "loss": 0.9884, + "step": 1991 + }, + { + "batch_num_effect_tokens": 5990, + "batch_num_samples": 149, + "batch_num_tokens": 52118, + "epoch": 1.81091, + "grad_norm": 3.6692111492156982, + "learning_rate": 5.396604785792281e-07, + "loss": 0.9714, + "step": 1992 + }, + { + "batch_num_effect_tokens": 7945, + "batch_num_samples": 149, + "batch_num_tokens": 52143, + "epoch": 1.81182, + "grad_norm": 3.071803092956543, + "learning_rate": 5.345305295450997e-07, + "loss": 1.2217, + "step": 1993 + }, + { + "batch_num_effect_tokens": 5578, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 1.81273, + "grad_norm": 3.0185506343841553, + "learning_rate": 5.294244098190926e-07, + "loss": 0.8658, + "step": 1994 + }, + { + "batch_num_effect_tokens": 3962, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 1.81364, + "grad_norm": 3.220330238342285, + "learning_rate": 5.243421322558506e-07, + "loss": 0.619, + "step": 1995 + }, + { + "batch_num_effect_tokens": 11920, + "batch_num_samples": 149, + "batch_num_tokens": 52199, + "epoch": 1.81455, + "grad_norm": 3.5391247272491455, + "learning_rate": 5.192837096500058e-07, + "loss": 2.2227, + "step": 1996 + }, + { + "batch_num_effect_tokens": 4304, + "batch_num_samples": 149, + "batch_num_tokens": 52196, + "epoch": 1.81545, + "grad_norm": 3.778665065765381, + "learning_rate": 5.142491547361294e-07, + "loss": 0.7287, + "step": 1997 + }, + { + "batch_num_effect_tokens": 8847, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.81636, + "grad_norm": 2.801353931427002, + "learning_rate": 5.092384801887074e-07, + "loss": 1.2618, + "step": 1998 + }, + { + "batch_num_effect_tokens": 5465, + "batch_num_samples": 149, + "batch_num_tokens": 52124, + "epoch": 1.81727, + "grad_norm": 4.023580551147461, + "learning_rate": 5.04251698622108e-07, + "loss": 0.5592, + "step": 1999 + }, + { + "batch_num_effect_tokens": 7014, + "batch_num_samples": 149, + "batch_num_tokens": 52186, + "epoch": 1.81818, + "grad_norm": 2.3245139122009277, + "learning_rate": 4.992888225905467e-07, + "loss": 0.5798, + "step": 2000 + }, + { + "batch_num_effect_tokens": 7799, + "batch_num_samples": 149, + "batch_num_tokens": 52196, + "epoch": 1.81909, + "grad_norm": 2.6953628063201904, + "learning_rate": 4.943498645880595e-07, + "loss": 1.0382, + "step": 2001 + }, + { + "batch_num_effect_tokens": 6712, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 1.82, + "grad_norm": 3.313474416732788, + "learning_rate": 4.894348370484648e-07, + "loss": 1.1191, + "step": 2002 + }, + { + "batch_num_effect_tokens": 4090, + "batch_num_samples": 149, + "batch_num_tokens": 52146, + "epoch": 1.82091, + "grad_norm": 3.861618995666504, + "learning_rate": 4.845437523453411e-07, + "loss": 1.0173, + "step": 2003 + }, + { + "batch_num_effect_tokens": 6621, + "batch_num_samples": 149, + "batch_num_tokens": 52141, + "epoch": 1.82182, + "grad_norm": 3.3845365047454834, + "learning_rate": 4.796766227919858e-07, + "loss": 1.0425, + "step": 2004 + }, + { + "batch_num_effect_tokens": 7026, + "batch_num_samples": 149, + "batch_num_tokens": 52147, + "epoch": 1.82273, + "grad_norm": 3.0628647804260254, + "learning_rate": 4.7483346064139513e-07, + "loss": 1.047, + "step": 2005 + }, + { + "batch_num_effect_tokens": 7244, + "batch_num_samples": 149, + "batch_num_tokens": 52183, + "epoch": 1.82364, + "grad_norm": 4.020042419433594, + "learning_rate": 4.7001427808622045e-07, + "loss": 1.7565, + "step": 2006 + }, + { + "batch_num_effect_tokens": 6116, + "batch_num_samples": 149, + "batch_num_tokens": 52178, + "epoch": 1.82455, + "grad_norm": 5.533095836639404, + "learning_rate": 4.6521908725875253e-07, + "loss": 0.5821, + "step": 2007 + }, + { + "batch_num_effect_tokens": 6080, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.82545, + "grad_norm": 3.1684529781341553, + "learning_rate": 4.6044790023087373e-07, + "loss": 0.8718, + "step": 2008 + }, + { + "batch_num_effect_tokens": 8107, + "batch_num_samples": 149, + "batch_num_tokens": 52123, + "epoch": 1.82636, + "grad_norm": 2.843222141265869, + "learning_rate": 4.5570072901404474e-07, + "loss": 1.1437, + "step": 2009 + }, + { + "batch_num_effect_tokens": 7416, + "batch_num_samples": 149, + "batch_num_tokens": 52221, + "epoch": 1.82727, + "grad_norm": 3.33341383934021, + "learning_rate": 4.509775855592613e-07, + "loss": 1.1667, + "step": 2010 + }, + { + "batch_num_effect_tokens": 6191, + "batch_num_samples": 149, + "batch_num_tokens": 52134, + "epoch": 1.82818, + "grad_norm": 2.8735108375549316, + "learning_rate": 4.4627848175703315e-07, + "loss": 0.7308, + "step": 2011 + }, + { + "batch_num_effect_tokens": 6283, + "batch_num_samples": 149, + "batch_num_tokens": 52121, + "epoch": 1.82909, + "grad_norm": 3.083097457885742, + "learning_rate": 4.4160342943734723e-07, + "loss": 0.8518, + "step": 2012 + }, + { + "batch_num_effect_tokens": 7764, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.83, + "grad_norm": 4.017691135406494, + "learning_rate": 4.3695244036964567e-07, + "loss": 2.1002, + "step": 2013 + }, + { + "batch_num_effect_tokens": 6987, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 1.83091, + "grad_norm": 3.9710652828216553, + "learning_rate": 4.323255262627846e-07, + "loss": 1.9213, + "step": 2014 + }, + { + "batch_num_effect_tokens": 8673, + "batch_num_samples": 149, + "batch_num_tokens": 52213, + "epoch": 1.83182, + "grad_norm": 2.6227548122406006, + "learning_rate": 4.277226987650129e-07, + "loss": 1.0801, + "step": 2015 + }, + { + "batch_num_effect_tokens": 7711, + "batch_num_samples": 149, + "batch_num_tokens": 52190, + "epoch": 1.83273, + "grad_norm": 3.010826826095581, + "learning_rate": 4.2314396946394833e-07, + "loss": 1.2035, + "step": 2016 + }, + { + "batch_num_effect_tokens": 5578, + "batch_num_samples": 149, + "batch_num_tokens": 52187, + "epoch": 1.83364, + "grad_norm": 3.5875613689422607, + "learning_rate": 4.1858934988653233e-07, + "loss": 1.0575, + "step": 2017 + }, + { + "batch_num_effect_tokens": 5538, + "batch_num_samples": 150, + "batch_num_tokens": 52168, + "epoch": 1.83455, + "grad_norm": 2.5732593536376953, + "learning_rate": 4.1405885149901623e-07, + "loss": 0.4825, + "step": 2018 + }, + { + "batch_num_effect_tokens": 6915, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 1.83545, + "grad_norm": 3.549899101257324, + "learning_rate": 4.095524857069244e-07, + "loss": 1.4148, + "step": 2019 + }, + { + "batch_num_effect_tokens": 6640, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 1.83636, + "grad_norm": 3.262498140335083, + "learning_rate": 4.0507026385502747e-07, + "loss": 1.0891, + "step": 2020 + }, + { + "batch_num_effect_tokens": 8297, + "batch_num_samples": 149, + "batch_num_tokens": 52132, + "epoch": 1.83727, + "grad_norm": 3.292078733444214, + "learning_rate": 4.0061219722731136e-07, + "loss": 1.6603, + "step": 2021 + }, + { + "batch_num_effect_tokens": 7026, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 1.83818, + "grad_norm": 3.5750980377197266, + "learning_rate": 3.9617829704695634e-07, + "loss": 1.5395, + "step": 2022 + }, + { + "batch_num_effect_tokens": 7326, + "batch_num_samples": 150, + "batch_num_tokens": 52190, + "epoch": 1.83909, + "grad_norm": 3.339860200881958, + "learning_rate": 3.917685744762989e-07, + "loss": 1.1482, + "step": 2023 + }, + { + "batch_num_effect_tokens": 6495, + "batch_num_samples": 149, + "batch_num_tokens": 52147, + "epoch": 1.84, + "grad_norm": 3.0043394565582275, + "learning_rate": 3.8738304061681107e-07, + "loss": 0.9975, + "step": 2024 + }, + { + "batch_num_effect_tokens": 7548, + "batch_num_samples": 149, + "batch_num_tokens": 52209, + "epoch": 1.84091, + "grad_norm": 2.484179973602295, + "learning_rate": 3.8302170650907023e-07, + "loss": 0.8843, + "step": 2025 + }, + { + "batch_num_effect_tokens": 4985, + "batch_num_samples": 149, + "batch_num_tokens": 52198, + "epoch": 1.84182, + "grad_norm": 3.724242687225342, + "learning_rate": 3.7868458313272906e-07, + "loss": 1.0107, + "step": 2026 + }, + { + "batch_num_effect_tokens": 4613, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 1.84273, + "grad_norm": 2.1949408054351807, + "learning_rate": 3.7437168140648904e-07, + "loss": 0.2, + "step": 2027 + }, + { + "batch_num_effect_tokens": 6506, + "batch_num_samples": 150, + "batch_num_tokens": 52187, + "epoch": 1.84364, + "grad_norm": 3.2145578861236572, + "learning_rate": 3.7008301218807716e-07, + "loss": 0.9436, + "step": 2028 + }, + { + "batch_num_effect_tokens": 7112, + "batch_num_samples": 149, + "batch_num_tokens": 52145, + "epoch": 1.84455, + "grad_norm": 3.019531488418579, + "learning_rate": 3.658185862742103e-07, + "loss": 1.2709, + "step": 2029 + }, + { + "batch_num_effect_tokens": 9174, + "batch_num_samples": 149, + "batch_num_tokens": 50580, + "epoch": 1.84545, + "grad_norm": 3.0364532470703125, + "learning_rate": 3.615784144005796e-07, + "loss": 1.6047, + "step": 2030 + }, + { + "batch_num_effect_tokens": 7268, + "batch_num_samples": 149, + "batch_num_tokens": 52214, + "epoch": 1.84636, + "grad_norm": 3.2398998737335205, + "learning_rate": 3.5736250724180965e-07, + "loss": 1.1237, + "step": 2031 + }, + { + "batch_num_effect_tokens": 2904, + "batch_num_samples": 149, + "batch_num_tokens": 52149, + "epoch": 1.84727, + "grad_norm": 2.7733280658721924, + "learning_rate": 3.531708754114438e-07, + "loss": 0.1986, + "step": 2032 + }, + { + "batch_num_effect_tokens": 6374, + "batch_num_samples": 150, + "batch_num_tokens": 52167, + "epoch": 1.84818, + "grad_norm": 3.345583200454712, + "learning_rate": 3.490035294619087e-07, + "loss": 1.0036, + "step": 2033 + }, + { + "batch_num_effect_tokens": 5774, + "batch_num_samples": 149, + "batch_num_tokens": 52132, + "epoch": 1.84909, + "grad_norm": 3.2348711490631104, + "learning_rate": 3.448604798844912e-07, + "loss": 0.9395, + "step": 2034 + }, + { + "batch_num_effect_tokens": 9591, + "batch_num_samples": 149, + "batch_num_tokens": 52209, + "epoch": 1.85, + "grad_norm": 2.7883570194244385, + "learning_rate": 3.4074173710931804e-07, + "loss": 1.3181, + "step": 2035 + }, + { + "batch_num_effect_tokens": 8987, + "batch_num_samples": 149, + "batch_num_tokens": 52170, + "epoch": 1.85091, + "grad_norm": 3.0228030681610107, + "learning_rate": 3.3664731150531484e-07, + "loss": 1.3226, + "step": 2036 + }, + { + "batch_num_effect_tokens": 5526, + "batch_num_samples": 149, + "batch_num_tokens": 52161, + "epoch": 1.85182, + "grad_norm": 3.1885998249053955, + "learning_rate": 3.3257721338019633e-07, + "loss": 0.7501, + "step": 2037 + }, + { + "batch_num_effect_tokens": 6166, + "batch_num_samples": 149, + "batch_num_tokens": 52198, + "epoch": 1.85273, + "grad_norm": 3.2657663822174072, + "learning_rate": 3.2853145298042954e-07, + "loss": 1.1105, + "step": 2038 + }, + { + "batch_num_effect_tokens": 5526, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 1.85364, + "grad_norm": 2.836792230606079, + "learning_rate": 3.2451004049120936e-07, + "loss": 0.7755, + "step": 2039 + }, + { + "batch_num_effect_tokens": 6221, + "batch_num_samples": 149, + "batch_num_tokens": 52133, + "epoch": 1.85455, + "grad_norm": 3.320397138595581, + "learning_rate": 3.2051298603643754e-07, + "loss": 0.7057, + "step": 2040 + }, + { + "batch_num_effect_tokens": 5113, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 1.85545, + "grad_norm": 3.700753927230835, + "learning_rate": 3.165402996786948e-07, + "loss": 1.3216, + "step": 2041 + }, + { + "batch_num_effect_tokens": 8337, + "batch_num_samples": 149, + "batch_num_tokens": 52168, + "epoch": 1.85636, + "grad_norm": 3.2562508583068848, + "learning_rate": 3.125919914192144e-07, + "loss": 1.5416, + "step": 2042 + }, + { + "batch_num_effect_tokens": 7714, + "batch_num_samples": 149, + "batch_num_tokens": 52124, + "epoch": 1.85727, + "grad_norm": 3.239128351211548, + "learning_rate": 3.086680711978574e-07, + "loss": 1.2409, + "step": 2043 + }, + { + "batch_num_effect_tokens": 4489, + "batch_num_samples": 149, + "batch_num_tokens": 52163, + "epoch": 1.85818, + "grad_norm": 4.076598644256592, + "learning_rate": 3.0476854889308737e-07, + "loss": 0.9881, + "step": 2044 + }, + { + "batch_num_effect_tokens": 7077, + "batch_num_samples": 150, + "batch_num_tokens": 52194, + "epoch": 1.85909, + "grad_norm": 2.9148364067077637, + "learning_rate": 3.008934343219483e-07, + "loss": 0.8965, + "step": 2045 + }, + { + "batch_num_effect_tokens": 6093, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.86, + "grad_norm": 2.8413338661193848, + "learning_rate": 2.970427372400353e-07, + "loss": 0.889, + "step": 2046 + }, + { + "batch_num_effect_tokens": 6947, + "batch_num_samples": 149, + "batch_num_tokens": 52157, + "epoch": 1.86091, + "grad_norm": 3.3784499168395996, + "learning_rate": 2.93216467341475e-07, + "loss": 1.262, + "step": 2047 + }, + { + "batch_num_effect_tokens": 10117, + "batch_num_samples": 149, + "batch_num_tokens": 52181, + "epoch": 1.86182, + "grad_norm": 2.520775318145752, + "learning_rate": 2.894146342588977e-07, + "loss": 1.0522, + "step": 2048 + }, + { + "batch_num_effect_tokens": 4035, + "batch_num_samples": 149, + "batch_num_tokens": 52201, + "epoch": 1.86273, + "grad_norm": 4.631101608276367, + "learning_rate": 2.856372475634106e-07, + "loss": 0.5698, + "step": 2049 + }, + { + "batch_num_effect_tokens": 5732, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.86364, + "grad_norm": 2.630826234817505, + "learning_rate": 2.818843167645835e-07, + "loss": 0.549, + "step": 2050 + }, + { + "batch_num_effect_tokens": 6049, + "batch_num_samples": 149, + "batch_num_tokens": 52214, + "epoch": 1.86455, + "grad_norm": 3.310131072998047, + "learning_rate": 2.781558513104143e-07, + "loss": 1.0531, + "step": 2051 + }, + { + "batch_num_effect_tokens": 5768, + "batch_num_samples": 149, + "batch_num_tokens": 52207, + "epoch": 1.86545, + "grad_norm": 3.0105044841766357, + "learning_rate": 2.744518605873092e-07, + "loss": 0.8083, + "step": 2052 + }, + { + "batch_num_effect_tokens": 6497, + "batch_num_samples": 149, + "batch_num_tokens": 52165, + "epoch": 1.86636, + "grad_norm": 3.7525155544281006, + "learning_rate": 2.707723539200613e-07, + "loss": 1.6784, + "step": 2053 + }, + { + "batch_num_effect_tokens": 6647, + "batch_num_samples": 149, + "batch_num_tokens": 52175, + "epoch": 1.86727, + "grad_norm": 3.0726425647735596, + "learning_rate": 2.6711734057182417e-07, + "loss": 1.0663, + "step": 2054 + }, + { + "batch_num_effect_tokens": 6001, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 1.86818, + "grad_norm": 2.7889184951782227, + "learning_rate": 2.6348682974408956e-07, + "loss": 0.7351, + "step": 2055 + }, + { + "batch_num_effect_tokens": 6088, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 1.86909, + "grad_norm": 3.8500564098358154, + "learning_rate": 2.5988083057666534e-07, + "loss": 1.4846, + "step": 2056 + }, + { + "batch_num_effect_tokens": 5847, + "batch_num_samples": 149, + "batch_num_tokens": 52112, + "epoch": 1.87, + "grad_norm": 3.5931131839752197, + "learning_rate": 2.5629935214764866e-07, + "loss": 1.168, + "step": 2057 + }, + { + "batch_num_effect_tokens": 6144, + "batch_num_samples": 149, + "batch_num_tokens": 52221, + "epoch": 1.87091, + "grad_norm": 3.006503105163574, + "learning_rate": 2.527424034734072e-07, + "loss": 0.9912, + "step": 2058 + }, + { + "batch_num_effect_tokens": 9794, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 1.87182, + "grad_norm": 2.374946355819702, + "learning_rate": 2.492099935085546e-07, + "loss": 1.0785, + "step": 2059 + }, + { + "batch_num_effect_tokens": 9290, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 1.87273, + "grad_norm": 2.640920400619507, + "learning_rate": 2.4570213114592957e-07, + "loss": 1.1373, + "step": 2060 + }, + { + "batch_num_effect_tokens": 6709, + "batch_num_samples": 149, + "batch_num_tokens": 52189, + "epoch": 1.87364, + "grad_norm": 3.0625946521759033, + "learning_rate": 2.422188252165714e-07, + "loss": 0.8988, + "step": 2061 + }, + { + "batch_num_effect_tokens": 4172, + "batch_num_samples": 149, + "batch_num_tokens": 52121, + "epoch": 1.87455, + "grad_norm": 3.482499837875366, + "learning_rate": 2.387600844896998e-07, + "loss": 0.6744, + "step": 2062 + }, + { + "batch_num_effect_tokens": 7208, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 1.87545, + "grad_norm": 3.2273452281951904, + "learning_rate": 2.3532591767268854e-07, + "loss": 1.2344, + "step": 2063 + }, + { + "batch_num_effect_tokens": 3818, + "batch_num_samples": 149, + "batch_num_tokens": 52125, + "epoch": 1.87636, + "grad_norm": 2.9675824642181396, + "learning_rate": 2.3191633341104859e-07, + "loss": 0.4268, + "step": 2064 + }, + { + "batch_num_effect_tokens": 8259, + "batch_num_samples": 149, + "batch_num_tokens": 52179, + "epoch": 1.87727, + "grad_norm": 2.812716245651245, + "learning_rate": 2.2853134028840594e-07, + "loss": 1.0256, + "step": 2065 + }, + { + "batch_num_effect_tokens": 8465, + "batch_num_samples": 149, + "batch_num_tokens": 52168, + "epoch": 1.87818, + "grad_norm": 2.420287609100342, + "learning_rate": 2.25170946826474e-07, + "loss": 0.9995, + "step": 2066 + }, + { + "batch_num_effect_tokens": 7208, + "batch_num_samples": 149, + "batch_num_tokens": 52141, + "epoch": 1.87909, + "grad_norm": 3.029155969619751, + "learning_rate": 2.2183516148504225e-07, + "loss": 0.9697, + "step": 2067 + }, + { + "batch_num_effect_tokens": 5775, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 1.88, + "grad_norm": 3.367401123046875, + "learning_rate": 2.1852399266194312e-07, + "loss": 1.0406, + "step": 2068 + }, + { + "batch_num_effect_tokens": 7011, + "batch_num_samples": 149, + "batch_num_tokens": 52219, + "epoch": 1.88091, + "grad_norm": 3.2168996334075928, + "learning_rate": 2.152374486930442e-07, + "loss": 1.1897, + "step": 2069 + }, + { + "batch_num_effect_tokens": 6453, + "batch_num_samples": 149, + "batch_num_tokens": 52139, + "epoch": 1.88182, + "grad_norm": 3.0506627559661865, + "learning_rate": 2.119755378522137e-07, + "loss": 1.0789, + "step": 2070 + }, + { + "batch_num_effect_tokens": 6546, + "batch_num_samples": 149, + "batch_num_tokens": 52214, + "epoch": 1.88273, + "grad_norm": 2.782705783843994, + "learning_rate": 2.0873826835130728e-07, + "loss": 0.6964, + "step": 2071 + }, + { + "batch_num_effect_tokens": 8762, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 1.88364, + "grad_norm": 2.063823938369751, + "learning_rate": 2.0552564834014797e-07, + "loss": 0.5552, + "step": 2072 + }, + { + "batch_num_effect_tokens": 4420, + "batch_num_samples": 149, + "batch_num_tokens": 52195, + "epoch": 1.88455, + "grad_norm": 3.2488834857940674, + "learning_rate": 2.0233768590650405e-07, + "loss": 0.5407, + "step": 2073 + }, + { + "batch_num_effect_tokens": 6417, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 1.88545, + "grad_norm": 2.4004364013671875, + "learning_rate": 1.9917438907606556e-07, + "loss": 0.6243, + "step": 2074 + }, + { + "batch_num_effect_tokens": 6864, + "batch_num_samples": 150, + "batch_num_tokens": 52165, + "epoch": 1.88636, + "grad_norm": 3.1030189990997314, + "learning_rate": 1.960357658124301e-07, + "loss": 0.938, + "step": 2075 + }, + { + "batch_num_effect_tokens": 5456, + "batch_num_samples": 149, + "batch_num_tokens": 52219, + "epoch": 1.88727, + "grad_norm": 3.3235042095184326, + "learning_rate": 1.9292182401707603e-07, + "loss": 0.8727, + "step": 2076 + }, + { + "batch_num_effect_tokens": 7481, + "batch_num_samples": 149, + "batch_num_tokens": 52132, + "epoch": 1.88818, + "grad_norm": 3.2268526554107666, + "learning_rate": 1.898325715293503e-07, + "loss": 1.1493, + "step": 2077 + }, + { + "batch_num_effect_tokens": 5946, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 1.88909, + "grad_norm": 3.3354926109313965, + "learning_rate": 1.8676801612643957e-07, + "loss": 1.0522, + "step": 2078 + }, + { + "batch_num_effect_tokens": 5537, + "batch_num_samples": 149, + "batch_num_tokens": 52188, + "epoch": 1.89, + "grad_norm": 3.2507591247558594, + "learning_rate": 1.8372816552336025e-07, + "loss": 1.0263, + "step": 2079 + }, + { + "batch_num_effect_tokens": 8816, + "batch_num_samples": 150, + "batch_num_tokens": 52216, + "epoch": 1.89091, + "grad_norm": 3.3479599952697754, + "learning_rate": 1.8071302737293294e-07, + "loss": 1.6094, + "step": 2080 + }, + { + "batch_num_effect_tokens": 3693, + "batch_num_samples": 149, + "batch_num_tokens": 52147, + "epoch": 1.89182, + "grad_norm": 3.6269097328186035, + "learning_rate": 1.7772260926576357e-07, + "loss": 0.8417, + "step": 2081 + }, + { + "batch_num_effect_tokens": 6569, + "batch_num_samples": 149, + "batch_num_tokens": 52133, + "epoch": 1.89273, + "grad_norm": 3.275144338607788, + "learning_rate": 1.747569187302267e-07, + "loss": 1.0675, + "step": 2082 + }, + { + "batch_num_effect_tokens": 5288, + "batch_num_samples": 149, + "batch_num_tokens": 52196, + "epoch": 1.89364, + "grad_norm": 3.7092061042785645, + "learning_rate": 1.7181596323244453e-07, + "loss": 1.0236, + "step": 2083 + }, + { + "batch_num_effect_tokens": 4837, + "batch_num_samples": 149, + "batch_num_tokens": 52123, + "epoch": 1.89455, + "grad_norm": 3.253692865371704, + "learning_rate": 1.6889975017626902e-07, + "loss": 0.5135, + "step": 2084 + }, + { + "batch_num_effect_tokens": 4890, + "batch_num_samples": 149, + "batch_num_tokens": 52220, + "epoch": 1.89545, + "grad_norm": 3.156859874725342, + "learning_rate": 1.6600828690326087e-07, + "loss": 0.7532, + "step": 2085 + }, + { + "batch_num_effect_tokens": 5073, + "batch_num_samples": 149, + "batch_num_tokens": 52134, + "epoch": 1.89636, + "grad_norm": 2.6373441219329834, + "learning_rate": 1.631415806926795e-07, + "loss": 0.5579, + "step": 2086 + }, + { + "batch_num_effect_tokens": 7704, + "batch_num_samples": 149, + "batch_num_tokens": 52221, + "epoch": 1.89727, + "grad_norm": 2.8775365352630615, + "learning_rate": 1.6029963876145084e-07, + "loss": 1.0058, + "step": 2087 + }, + { + "batch_num_effect_tokens": 6338, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 1.89818, + "grad_norm": 2.6224660873413086, + "learning_rate": 1.574824682641629e-07, + "loss": 0.7892, + "step": 2088 + }, + { + "batch_num_effect_tokens": 7038, + "batch_num_samples": 149, + "batch_num_tokens": 52159, + "epoch": 1.89909, + "grad_norm": 4.682471752166748, + "learning_rate": 1.5469007629303812e-07, + "loss": 2.0439, + "step": 2089 + }, + { + "batch_num_effect_tokens": 5197, + "batch_num_samples": 149, + "batch_num_tokens": 52215, + "epoch": 1.9, + "grad_norm": 3.5335943698883057, + "learning_rate": 1.519224698779198e-07, + "loss": 1.2065, + "step": 2090 + }, + { + "batch_num_effect_tokens": 8009, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 1.90091, + "grad_norm": 2.6326913833618164, + "learning_rate": 1.4917965598625351e-07, + "loss": 0.8632, + "step": 2091 + }, + { + "batch_num_effect_tokens": 6759, + "batch_num_samples": 149, + "batch_num_tokens": 52171, + "epoch": 1.90182, + "grad_norm": 2.8118245601654053, + "learning_rate": 1.464616415230702e-07, + "loss": 0.9084, + "step": 2092 + }, + { + "batch_num_effect_tokens": 4443, + "batch_num_samples": 149, + "batch_num_tokens": 52222, + "epoch": 1.90273, + "grad_norm": 2.5267601013183594, + "learning_rate": 1.4376843333096746e-07, + "loss": 0.3668, + "step": 2093 + }, + { + "batch_num_effect_tokens": 8810, + "batch_num_samples": 149, + "batch_num_tokens": 52155, + "epoch": 1.90364, + "grad_norm": 3.9348878860473633, + "learning_rate": 1.411000381900951e-07, + "loss": 1.8771, + "step": 2094 + }, + { + "batch_num_effect_tokens": 7110, + "batch_num_samples": 149, + "batch_num_tokens": 52166, + "epoch": 1.90455, + "grad_norm": 1.6358133554458618, + "learning_rate": 1.3845646281813508e-07, + "loss": 0.2877, + "step": 2095 + }, + { + "batch_num_effect_tokens": 6381, + "batch_num_samples": 149, + "batch_num_tokens": 52164, + "epoch": 1.90545, + "grad_norm": 3.8244762420654297, + "learning_rate": 1.3583771387028267e-07, + "loss": 1.5889, + "step": 2096 + }, + { + "batch_num_effect_tokens": 6921, + "batch_num_samples": 149, + "batch_num_tokens": 52177, + "epoch": 1.90636, + "grad_norm": 3.0918750762939453, + "learning_rate": 1.3324379793923648e-07, + "loss": 1.0494, + "step": 2097 + }, + { + "batch_num_effect_tokens": 5966, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 1.90727, + "grad_norm": 3.315189838409424, + "learning_rate": 1.3067472155517736e-07, + "loss": 1.1321, + "step": 2098 + }, + { + "batch_num_effect_tokens": 5133, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 1.90818, + "grad_norm": 3.508096218109131, + "learning_rate": 1.2813049118575282e-07, + "loss": 0.8875, + "step": 2099 + }, + { + "batch_num_effect_tokens": 4969, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 1.90909, + "grad_norm": 2.9247848987579346, + "learning_rate": 1.2561111323605714e-07, + "loss": 0.7568, + "step": 2100 + }, + { + "batch_num_effect_tokens": 4955, + "batch_num_samples": 149, + "batch_num_tokens": 52154, + "epoch": 1.91, + "grad_norm": 3.291416883468628, + "learning_rate": 1.231165940486234e-07, + "loss": 0.783, + "step": 2101 + }, + { + "batch_num_effect_tokens": 6480, + "batch_num_samples": 149, + "batch_num_tokens": 52170, + "epoch": 1.91091, + "grad_norm": 3.497671127319336, + "learning_rate": 1.2064693990339936e-07, + "loss": 0.8337, + "step": 2102 + }, + { + "batch_num_effect_tokens": 8419, + "batch_num_samples": 149, + "batch_num_tokens": 52208, + "epoch": 1.91182, + "grad_norm": 3.0520131587982178, + "learning_rate": 1.1820215701773829e-07, + "loss": 1.4666, + "step": 2103 + }, + { + "batch_num_effect_tokens": 4869, + "batch_num_samples": 149, + "batch_num_tokens": 52210, + "epoch": 1.91273, + "grad_norm": 3.4749419689178467, + "learning_rate": 1.1578225154637579e-07, + "loss": 0.8401, + "step": 2104 + }, + { + "batch_num_effect_tokens": 7648, + "batch_num_samples": 149, + "batch_num_tokens": 52222, + "epoch": 1.91364, + "grad_norm": 2.883669853210449, + "learning_rate": 1.1338722958142311e-07, + "loss": 0.9917, + "step": 2105 + }, + { + "batch_num_effect_tokens": 6705, + "batch_num_samples": 149, + "batch_num_tokens": 52177, + "epoch": 1.91455, + "grad_norm": 3.6415936946868896, + "learning_rate": 1.1101709715234388e-07, + "loss": 1.3216, + "step": 2106 + }, + { + "batch_num_effect_tokens": 7498, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 1.91545, + "grad_norm": 3.037210702896118, + "learning_rate": 1.08671860225944e-07, + "loss": 0.9825, + "step": 2107 + }, + { + "batch_num_effect_tokens": 6012, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.91636, + "grad_norm": 3.028671979904175, + "learning_rate": 1.0635152470635513e-07, + "loss": 0.933, + "step": 2108 + }, + { + "batch_num_effect_tokens": 5176, + "batch_num_samples": 149, + "batch_num_tokens": 52126, + "epoch": 1.91727, + "grad_norm": 3.4049301147460938, + "learning_rate": 1.0405609643501902e-07, + "loss": 0.8042, + "step": 2109 + }, + { + "batch_num_effect_tokens": 5858, + "batch_num_samples": 149, + "batch_num_tokens": 52132, + "epoch": 1.91818, + "grad_norm": 3.471726417541504, + "learning_rate": 1.0178558119067316e-07, + "loss": 1.2755, + "step": 2110 + }, + { + "batch_num_effect_tokens": 5847, + "batch_num_samples": 149, + "batch_num_tokens": 52146, + "epoch": 1.91909, + "grad_norm": 3.8953139781951904, + "learning_rate": 9.953998468933635e-08, + "loss": 0.9107, + "step": 2111 + }, + { + "batch_num_effect_tokens": 7213, + "batch_num_samples": 149, + "batch_num_tokens": 52189, + "epoch": 1.92, + "grad_norm": 2.7599267959594727, + "learning_rate": 9.731931258429638e-08, + "loss": 0.6609, + "step": 2112 + }, + { + "batch_num_effect_tokens": 7449, + "batch_num_samples": 149, + "batch_num_tokens": 52209, + "epoch": 1.92091, + "grad_norm": 3.0122787952423096, + "learning_rate": 9.512357046609244e-08, + "loss": 1.0107, + "step": 2113 + }, + { + "batch_num_effect_tokens": 4666, + "batch_num_samples": 149, + "batch_num_tokens": 52220, + "epoch": 1.92182, + "grad_norm": 2.793931245803833, + "learning_rate": 9.295276386250273e-08, + "loss": 0.5671, + "step": 2114 + }, + { + "batch_num_effect_tokens": 5614, + "batch_num_samples": 149, + "batch_num_tokens": 52183, + "epoch": 1.92273, + "grad_norm": 3.8008737564086914, + "learning_rate": 9.080689823853017e-08, + "loss": 1.4121, + "step": 2115 + }, + { + "batch_num_effect_tokens": 6395, + "batch_num_samples": 149, + "batch_num_tokens": 52129, + "epoch": 1.92364, + "grad_norm": 3.3346686363220215, + "learning_rate": 8.868597899638897e-08, + "loss": 1.2089, + "step": 2116 + }, + { + "batch_num_effect_tokens": 8000, + "batch_num_samples": 149, + "batch_num_tokens": 52214, + "epoch": 1.92455, + "grad_norm": 3.25838303565979, + "learning_rate": 8.659001147548918e-08, + "loss": 1.2777, + "step": 2117 + }, + { + "batch_num_effect_tokens": 6689, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 1.92545, + "grad_norm": 3.4332966804504395, + "learning_rate": 8.451900095242882e-08, + "loss": 1.2815, + "step": 2118 + }, + { + "batch_num_effect_tokens": 6496, + "batch_num_samples": 149, + "batch_num_tokens": 52187, + "epoch": 1.92636, + "grad_norm": 3.0824673175811768, + "learning_rate": 8.247295264097288e-08, + "loss": 0.985, + "step": 2119 + }, + { + "batch_num_effect_tokens": 7652, + "batch_num_samples": 150, + "batch_num_tokens": 52216, + "epoch": 1.92727, + "grad_norm": 3.0345468521118164, + "learning_rate": 8.04518716920466e-08, + "loss": 0.8738, + "step": 2120 + }, + { + "batch_num_effect_tokens": 6869, + "batch_num_samples": 149, + "batch_num_tokens": 52125, + "epoch": 1.92818, + "grad_norm": 3.161374092102051, + "learning_rate": 7.845576319371884e-08, + "loss": 1.1486, + "step": 2121 + }, + { + "batch_num_effect_tokens": 6895, + "batch_num_samples": 149, + "batch_num_tokens": 52133, + "epoch": 1.92909, + "grad_norm": 3.010014533996582, + "learning_rate": 7.648463217118985e-08, + "loss": 1.1652, + "step": 2122 + }, + { + "batch_num_effect_tokens": 7995, + "batch_num_samples": 150, + "batch_num_tokens": 52187, + "epoch": 1.93, + "grad_norm": 3.0425710678100586, + "learning_rate": 7.453848358678018e-08, + "loss": 1.0709, + "step": 2123 + }, + { + "batch_num_effect_tokens": 6120, + "batch_num_samples": 149, + "batch_num_tokens": 50540, + "epoch": 1.93091, + "grad_norm": 3.1712393760681152, + "learning_rate": 7.261732233991514e-08, + "loss": 0.9768, + "step": 2124 + }, + { + "batch_num_effect_tokens": 7125, + "batch_num_samples": 149, + "batch_num_tokens": 52148, + "epoch": 1.93182, + "grad_norm": 3.0530688762664795, + "learning_rate": 7.072115326711704e-08, + "loss": 1.0305, + "step": 2125 + }, + { + "batch_num_effect_tokens": 6646, + "batch_num_samples": 149, + "batch_num_tokens": 52160, + "epoch": 1.93273, + "grad_norm": 3.473911762237549, + "learning_rate": 6.88499811419896e-08, + "loss": 1.3033, + "step": 2126 + }, + { + "batch_num_effect_tokens": 6467, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 1.93364, + "grad_norm": 3.20442795753479, + "learning_rate": 6.700381067520578e-08, + "loss": 0.8315, + "step": 2127 + }, + { + "batch_num_effect_tokens": 9847, + "batch_num_samples": 149, + "batch_num_tokens": 52177, + "epoch": 1.93455, + "grad_norm": 3.296698570251465, + "learning_rate": 6.51826465144978e-08, + "loss": 1.7688, + "step": 2128 + }, + { + "batch_num_effect_tokens": 7015, + "batch_num_samples": 149, + "batch_num_tokens": 52172, + "epoch": 1.93545, + "grad_norm": 3.054431915283203, + "learning_rate": 6.338649324464375e-08, + "loss": 1.1406, + "step": 2129 + }, + { + "batch_num_effect_tokens": 4682, + "batch_num_samples": 149, + "batch_num_tokens": 52164, + "epoch": 1.93636, + "grad_norm": 2.780895233154297, + "learning_rate": 6.161535538745877e-08, + "loss": 0.4555, + "step": 2130 + }, + { + "batch_num_effect_tokens": 5045, + "batch_num_samples": 149, + "batch_num_tokens": 52180, + "epoch": 1.93727, + "grad_norm": 3.6313552856445312, + "learning_rate": 5.986923740177841e-08, + "loss": 0.8906, + "step": 2131 + }, + { + "batch_num_effect_tokens": 8426, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 1.93818, + "grad_norm": 2.6858532428741455, + "learning_rate": 5.814814368345412e-08, + "loss": 0.7991, + "step": 2132 + }, + { + "batch_num_effect_tokens": 9339, + "batch_num_samples": 149, + "batch_num_tokens": 52162, + "epoch": 1.93909, + "grad_norm": 2.4184558391571045, + "learning_rate": 5.6452078565335524e-08, + "loss": 1.0356, + "step": 2133 + }, + { + "batch_num_effect_tokens": 5380, + "batch_num_samples": 149, + "batch_num_tokens": 52118, + "epoch": 1.94, + "grad_norm": 3.1847732067108154, + "learning_rate": 5.4781046317267103e-08, + "loss": 0.9598, + "step": 2134 + }, + { + "batch_num_effect_tokens": 9300, + "batch_num_samples": 149, + "batch_num_tokens": 52117, + "epoch": 1.94091, + "grad_norm": 2.9929068088531494, + "learning_rate": 5.3135051146068203e-08, + "loss": 1.4023, + "step": 2135 + }, + { + "batch_num_effect_tokens": 7162, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 1.94182, + "grad_norm": 3.0101776123046875, + "learning_rate": 5.15140971955308e-08, + "loss": 0.9932, + "step": 2136 + }, + { + "batch_num_effect_tokens": 5296, + "batch_num_samples": 149, + "batch_num_tokens": 52144, + "epoch": 1.94273, + "grad_norm": 3.4937398433685303, + "learning_rate": 4.991818854640396e-08, + "loss": 0.9352, + "step": 2137 + }, + { + "batch_num_effect_tokens": 7385, + "batch_num_samples": 149, + "batch_num_tokens": 52211, + "epoch": 1.94364, + "grad_norm": 2.734804153442383, + "learning_rate": 4.8347329216387184e-08, + "loss": 0.9585, + "step": 2138 + }, + { + "batch_num_effect_tokens": 7546, + "batch_num_samples": 149, + "batch_num_tokens": 52209, + "epoch": 1.94455, + "grad_norm": 2.896134614944458, + "learning_rate": 4.6801523160114884e-08, + "loss": 0.8693, + "step": 2139 + }, + { + "batch_num_effect_tokens": 7476, + "batch_num_samples": 149, + "batch_num_tokens": 52139, + "epoch": 1.94545, + "grad_norm": 2.7789533138275146, + "learning_rate": 4.528077426915412e-08, + "loss": 0.9293, + "step": 2140 + }, + { + "batch_num_effect_tokens": 4207, + "batch_num_samples": 149, + "batch_num_tokens": 52146, + "epoch": 1.94636, + "grad_norm": 2.832606554031372, + "learning_rate": 4.378508637198686e-08, + "loss": 0.4552, + "step": 2141 + }, + { + "batch_num_effect_tokens": 5396, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 1.94727, + "grad_norm": 2.760693073272705, + "learning_rate": 4.231446323400557e-08, + "loss": 0.6138, + "step": 2142 + }, + { + "batch_num_effect_tokens": 3625, + "batch_num_samples": 149, + "batch_num_tokens": 52140, + "epoch": 1.94818, + "grad_norm": 2.6959006786346436, + "learning_rate": 4.086890855750425e-08, + "loss": 0.3044, + "step": 2143 + }, + { + "batch_num_effect_tokens": 7112, + "batch_num_samples": 149, + "batch_num_tokens": 52213, + "epoch": 1.94909, + "grad_norm": 3.302431344985962, + "learning_rate": 3.9448425981661876e-08, + "loss": 1.2291, + "step": 2144 + }, + { + "batch_num_effect_tokens": 7545, + "batch_num_samples": 149, + "batch_num_tokens": 52134, + "epoch": 1.95, + "grad_norm": 3.5944392681121826, + "learning_rate": 3.805301908254455e-08, + "loss": 1.5729, + "step": 2145 + }, + { + "batch_num_effect_tokens": 5555, + "batch_num_samples": 149, + "batch_num_tokens": 52139, + "epoch": 1.95091, + "grad_norm": 3.7243990898132324, + "learning_rate": 3.668269137308666e-08, + "loss": 1.2057, + "step": 2146 + }, + { + "batch_num_effect_tokens": 5712, + "batch_num_samples": 149, + "batch_num_tokens": 52216, + "epoch": 1.95182, + "grad_norm": 2.9040653705596924, + "learning_rate": 3.533744630308533e-08, + "loss": 0.4343, + "step": 2147 + }, + { + "batch_num_effect_tokens": 4745, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.95273, + "grad_norm": 3.090961217880249, + "learning_rate": 3.401728725919373e-08, + "loss": 0.678, + "step": 2148 + }, + { + "batch_num_effect_tokens": 5507, + "batch_num_samples": 149, + "batch_num_tokens": 52167, + "epoch": 1.95364, + "grad_norm": 2.775160074234009, + "learning_rate": 3.2722217564912226e-08, + "loss": 0.685, + "step": 2149 + }, + { + "batch_num_effect_tokens": 7124, + "batch_num_samples": 149, + "batch_num_tokens": 52142, + "epoch": 1.95455, + "grad_norm": 2.7031285762786865, + "learning_rate": 3.1452240480577265e-08, + "loss": 0.8426, + "step": 2150 + }, + { + "batch_num_effect_tokens": 7540, + "batch_num_samples": 149, + "batch_num_tokens": 52170, + "epoch": 1.95545, + "grad_norm": 2.8114101886749268, + "learning_rate": 3.020735920335138e-08, + "loss": 0.8927, + "step": 2151 + }, + { + "batch_num_effect_tokens": 5540, + "batch_num_samples": 149, + "batch_num_tokens": 52206, + "epoch": 1.95636, + "grad_norm": 2.670271873474121, + "learning_rate": 2.898757686722542e-08, + "loss": 0.5648, + "step": 2152 + }, + { + "batch_num_effect_tokens": 5503, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.95727, + "grad_norm": 3.1284098625183105, + "learning_rate": 2.779289654299855e-08, + "loss": 0.7566, + "step": 2153 + }, + { + "batch_num_effect_tokens": 5074, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 1.95818, + "grad_norm": 3.766758441925049, + "learning_rate": 2.6623321238277157e-08, + "loss": 0.9247, + "step": 2154 + }, + { + "batch_num_effect_tokens": 9093, + "batch_num_samples": 149, + "batch_num_tokens": 52139, + "epoch": 1.95909, + "grad_norm": 2.73887300491333, + "learning_rate": 2.547885389746485e-08, + "loss": 1.1392, + "step": 2155 + }, + { + "batch_num_effect_tokens": 5651, + "batch_num_samples": 149, + "batch_num_tokens": 52082, + "epoch": 1.96, + "grad_norm": 3.31974196434021, + "learning_rate": 2.4359497401758026e-08, + "loss": 0.7835, + "step": 2156 + }, + { + "batch_num_effect_tokens": 6508, + "batch_num_samples": 149, + "batch_num_tokens": 52217, + "epoch": 1.96091, + "grad_norm": 3.1143388748168945, + "learning_rate": 2.3265254569133645e-08, + "loss": 0.9426, + "step": 2157 + }, + { + "batch_num_effect_tokens": 7877, + "batch_num_samples": 149, + "batch_num_tokens": 52157, + "epoch": 1.96182, + "grad_norm": 3.482353448867798, + "learning_rate": 2.219612815434924e-08, + "loss": 1.6844, + "step": 2158 + }, + { + "batch_num_effect_tokens": 6098, + "batch_num_samples": 149, + "batch_num_tokens": 52178, + "epoch": 1.96273, + "grad_norm": 3.5550529956817627, + "learning_rate": 2.115212084892737e-08, + "loss": 0.9044, + "step": 2159 + }, + { + "batch_num_effect_tokens": 5643, + "batch_num_samples": 149, + "batch_num_tokens": 52082, + "epoch": 1.96364, + "grad_norm": 3.3726587295532227, + "learning_rate": 2.013323528115674e-08, + "loss": 0.9737, + "step": 2160 + }, + { + "batch_num_effect_tokens": 5377, + "batch_num_samples": 149, + "batch_num_tokens": 52165, + "epoch": 1.96455, + "grad_norm": 3.1672065258026123, + "learning_rate": 1.913947401607774e-08, + "loss": 0.8833, + "step": 2161 + }, + { + "batch_num_effect_tokens": 7228, + "batch_num_samples": 149, + "batch_num_tokens": 52187, + "epoch": 1.96545, + "grad_norm": 2.9322712421417236, + "learning_rate": 1.817083955548693e-08, + "loss": 0.9785, + "step": 2162 + }, + { + "batch_num_effect_tokens": 8458, + "batch_num_samples": 150, + "batch_num_tokens": 52194, + "epoch": 1.96636, + "grad_norm": 2.756446599960327, + "learning_rate": 1.722733433791701e-08, + "loss": 1.103, + "step": 2163 + }, + { + "batch_num_effect_tokens": 11262, + "batch_num_samples": 149, + "batch_num_tokens": 52184, + "epoch": 1.96727, + "grad_norm": 2.5466971397399902, + "learning_rate": 1.630896073864352e-08, + "loss": 1.3516, + "step": 2164 + }, + { + "batch_num_effect_tokens": 6845, + "batch_num_samples": 149, + "batch_num_tokens": 52179, + "epoch": 1.96818, + "grad_norm": 3.5355217456817627, + "learning_rate": 1.5415721069669265e-08, + "loss": 1.2466, + "step": 2165 + }, + { + "batch_num_effect_tokens": 5385, + "batch_num_samples": 149, + "batch_num_tokens": 52142, + "epoch": 1.96909, + "grad_norm": 3.531609296798706, + "learning_rate": 1.4547617579725449e-08, + "loss": 0.8818, + "step": 2166 + }, + { + "batch_num_effect_tokens": 4276, + "batch_num_samples": 149, + "batch_num_tokens": 52223, + "epoch": 1.97, + "grad_norm": 2.2698025703430176, + "learning_rate": 1.370465245426167e-08, + "loss": 0.3254, + "step": 2167 + }, + { + "batch_num_effect_tokens": 6165, + "batch_num_samples": 149, + "batch_num_tokens": 52206, + "epoch": 1.97091, + "grad_norm": 3.9124033451080322, + "learning_rate": 1.2886827815440373e-08, + "loss": 1.5506, + "step": 2168 + }, + { + "batch_num_effect_tokens": 4577, + "batch_num_samples": 149, + "batch_num_tokens": 52197, + "epoch": 1.97182, + "grad_norm": 3.2252120971679688, + "learning_rate": 1.2094145722134631e-08, + "loss": 0.7644, + "step": 2169 + }, + { + "batch_num_effect_tokens": 7780, + "batch_num_samples": 149, + "batch_num_tokens": 52116, + "epoch": 1.97273, + "grad_norm": 3.0887022018432617, + "learning_rate": 1.1326608169920373e-08, + "loss": 1.1574, + "step": 2170 + }, + { + "batch_num_effect_tokens": 7272, + "batch_num_samples": 149, + "batch_num_tokens": 52160, + "epoch": 1.97364, + "grad_norm": 3.182729959487915, + "learning_rate": 1.0584217091073046e-08, + "loss": 1.0788, + "step": 2171 + }, + { + "batch_num_effect_tokens": 11530, + "batch_num_samples": 149, + "batch_num_tokens": 52181, + "epoch": 1.97455, + "grad_norm": 2.8558449745178223, + "learning_rate": 9.866974354560966e-09, + "loss": 1.6694, + "step": 2172 + }, + { + "batch_num_effect_tokens": 7912, + "batch_num_samples": 149, + "batch_num_tokens": 52135, + "epoch": 1.97545, + "grad_norm": 3.0910027027130127, + "learning_rate": 9.174881766043086e-09, + "loss": 1.1643, + "step": 2173 + }, + { + "batch_num_effect_tokens": 8210, + "batch_num_samples": 149, + "batch_num_tokens": 52170, + "epoch": 1.97636, + "grad_norm": 3.4341235160827637, + "learning_rate": 8.507941067859016e-09, + "loss": 1.5604, + "step": 2174 + }, + { + "batch_num_effect_tokens": 5085, + "batch_num_samples": 149, + "batch_num_tokens": 52144, + "epoch": 1.97727, + "grad_norm": 4.295083999633789, + "learning_rate": 7.866153939033449e-09, + "loss": 0.8828, + "step": 2175 + }, + { + "batch_num_effect_tokens": 4520, + "batch_num_samples": 149, + "batch_num_tokens": 52150, + "epoch": 1.97818, + "grad_norm": 3.1078341007232666, + "learning_rate": 7.2495219952639636e-09, + "loss": 0.6108, + "step": 2176 + }, + { + "batch_num_effect_tokens": 5965, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 1.97909, + "grad_norm": 3.652076005935669, + "learning_rate": 6.658046788921013e-09, + "loss": 1.323, + "step": 2177 + }, + { + "batch_num_effect_tokens": 9469, + "batch_num_samples": 149, + "batch_num_tokens": 52188, + "epoch": 1.98, + "grad_norm": 3.063791513442993, + "learning_rate": 6.091729809042379e-09, + "loss": 1.5039, + "step": 2178 + }, + { + "batch_num_effect_tokens": 6244, + "batch_num_samples": 149, + "batch_num_tokens": 52188, + "epoch": 1.98091, + "grad_norm": 3.100691556930542, + "learning_rate": 5.550572481330951e-09, + "loss": 0.8641, + "step": 2179 + }, + { + "batch_num_effect_tokens": 7484, + "batch_num_samples": 149, + "batch_num_tokens": 52207, + "epoch": 1.98182, + "grad_norm": 2.8077163696289062, + "learning_rate": 5.034576168149175e-09, + "loss": 0.8706, + "step": 2180 + }, + { + "batch_num_effect_tokens": 6642, + "batch_num_samples": 149, + "batch_num_tokens": 52160, + "epoch": 1.98273, + "grad_norm": 2.6694931983947754, + "learning_rate": 4.543742168516829e-09, + "loss": 0.746, + "step": 2181 + }, + { + "batch_num_effect_tokens": 5587, + "batch_num_samples": 149, + "batch_num_tokens": 52151, + "epoch": 1.98364, + "grad_norm": 3.4628288745880127, + "learning_rate": 4.0780717181077015e-09, + "loss": 0.9929, + "step": 2182 + }, + { + "batch_num_effect_tokens": 6256, + "batch_num_samples": 149, + "batch_num_tokens": 52176, + "epoch": 1.98455, + "grad_norm": 3.096724510192871, + "learning_rate": 3.6375659892473604e-09, + "loss": 0.9109, + "step": 2183 + }, + { + "batch_num_effect_tokens": 7733, + "batch_num_samples": 149, + "batch_num_tokens": 52162, + "epoch": 1.98545, + "grad_norm": 2.4371373653411865, + "learning_rate": 3.22222609090872e-09, + "loss": 0.7449, + "step": 2184 + }, + { + "batch_num_effect_tokens": 7158, + "batch_num_samples": 149, + "batch_num_tokens": 52180, + "epoch": 1.98636, + "grad_norm": 2.9890170097351074, + "learning_rate": 2.832053068709817e-09, + "loss": 0.9248, + "step": 2185 + }, + { + "batch_num_effect_tokens": 6877, + "batch_num_samples": 149, + "batch_num_tokens": 52143, + "epoch": 1.98727, + "grad_norm": 3.2912912368774414, + "learning_rate": 2.4670479049082596e-09, + "loss": 1.4214, + "step": 2186 + }, + { + "batch_num_effect_tokens": 5112, + "batch_num_samples": 149, + "batch_num_tokens": 52183, + "epoch": 1.98818, + "grad_norm": 3.2346763610839844, + "learning_rate": 2.1272115184067797e-09, + "loss": 0.7472, + "step": 2187 + }, + { + "batch_num_effect_tokens": 7287, + "batch_num_samples": 149, + "batch_num_tokens": 52161, + "epoch": 1.98909, + "grad_norm": 2.7889344692230225, + "learning_rate": 1.8125447647421302e-09, + "loss": 0.8528, + "step": 2188 + }, + { + "batch_num_effect_tokens": 9067, + "batch_num_samples": 149, + "batch_num_tokens": 52215, + "epoch": 1.99, + "grad_norm": 2.8845736980438232, + "learning_rate": 1.5230484360873043e-09, + "loss": 1.4642, + "step": 2189 + }, + { + "batch_num_effect_tokens": 6027, + "batch_num_samples": 149, + "batch_num_tokens": 52191, + "epoch": 1.99091, + "grad_norm": 3.2461230754852295, + "learning_rate": 1.2587232612493172e-09, + "loss": 0.8588, + "step": 2190 + }, + { + "batch_num_effect_tokens": 8567, + "batch_num_samples": 150, + "batch_num_tokens": 52221, + "epoch": 1.99182, + "grad_norm": 3.1907031536102295, + "learning_rate": 1.019569905666984e-09, + "loss": 1.421, + "step": 2191 + }, + { + "batch_num_effect_tokens": 4936, + "batch_num_samples": 149, + "batch_num_tokens": 52141, + "epoch": 1.99273, + "grad_norm": 2.44538950920105, + "learning_rate": 8.05588971406479e-10, + "loss": 0.2971, + "step": 2192 + }, + { + "batch_num_effect_tokens": 5104, + "batch_num_samples": 149, + "batch_num_tokens": 52180, + "epoch": 1.99364, + "grad_norm": 3.5328586101531982, + "learning_rate": 6.167809971668881e-10, + "loss": 0.8963, + "step": 2193 + }, + { + "batch_num_effect_tokens": 5176, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.99455, + "grad_norm": 3.223731517791748, + "learning_rate": 4.531464582713252e-10, + "loss": 0.7121, + "step": 2194 + }, + { + "batch_num_effect_tokens": 5194, + "batch_num_samples": 149, + "batch_num_tokens": 52150, + "epoch": 1.99545, + "grad_norm": 2.8537750244140625, + "learning_rate": 3.1468576666915383e-10, + "loss": 0.4552, + "step": 2195 + }, + { + "batch_num_effect_tokens": 7555, + "batch_num_samples": 149, + "batch_num_tokens": 52224, + "epoch": 1.99636, + "grad_norm": 2.976595401763916, + "learning_rate": 2.0139927093487666e-10, + "loss": 1.3754, + "step": 2196 + }, + { + "batch_num_effect_tokens": 5285, + "batch_num_samples": 149, + "batch_num_tokens": 52189, + "epoch": 1.99727, + "grad_norm": 3.4908828735351562, + "learning_rate": 1.1328725626813531e-10, + "loss": 1.0096, + "step": 2197 + }, + { + "batch_num_effect_tokens": 6647, + "batch_num_samples": 149, + "batch_num_tokens": 52127, + "epoch": 1.99818, + "grad_norm": 3.540771007537842, + "learning_rate": 5.034994448926967e-11, + "loss": 1.0589, + "step": 2198 + }, + { + "batch_num_effect_tokens": 7105, + "batch_num_samples": 149, + "batch_num_tokens": 52159, + "epoch": 1.99909, + "grad_norm": 2.4064557552337646, + "learning_rate": 1.2587494044868919e-11, + "loss": 0.5547, + "step": 2199 + }, + { + "batch_num_effect_tokens": 7655, + "batch_num_samples": 149, + "batch_num_tokens": 52159, + "epoch": 2.0, + "grad_norm": 2.689605474472046, + "learning_rate": 0.0, + "loss": 0.991, + "step": 2200 + } + ], + "logging_steps": 1.0, + "max_steps": 2200, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}