diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" --- "a/last-checkpoint/trainer_state.json" +++ "b/last-checkpoint/trainer_state.json" @@ -2,6384 +2,720 @@ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.9345794392523364, + "epoch": 0.10384215991692627, "eval_steps": 100, - "global_step": 900, + "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010384215991692627, - "grad_norm": 1.6015625, + "grad_norm": 1.7265625, "learning_rate": 0.0, - "loss": 10.8837, + "loss": 10.8665, "step": 1 }, { "epoch": 0.0020768431983385254, - "grad_norm": 1.4765625, + "grad_norm": 1.78125, "learning_rate": 2.0408163265306123e-05, - "loss": 10.8726, + "loss": 10.8771, "step": 2 }, { "epoch": 0.003115264797507788, - "grad_norm": 1.515625, + "grad_norm": 1.71875, "learning_rate": 4.0816326530612245e-05, - "loss": 10.8856, + "loss": 10.8809, "step": 3 }, { "epoch": 0.004153686396677051, - "grad_norm": 1.625, + "grad_norm": 1.6953125, "learning_rate": 6.122448979591836e-05, - "loss": 10.8768, + "loss": 10.881, "step": 4 }, { "epoch": 0.005192107995846314, - "grad_norm": 2.65625, + "grad_norm": 1.7109375, "learning_rate": 8.163265306122449e-05, - "loss": 10.8674, + "loss": 10.8689, "step": 5 }, { "epoch": 0.006230529595015576, - "grad_norm": 1.46875, + "grad_norm": 1.6015625, "learning_rate": 0.00010204081632653062, - "loss": 10.8608, + "loss": 10.8573, "step": 6 }, { "epoch": 0.007268951194184839, - "grad_norm": 2.046875, + "grad_norm": 1.5390625, "learning_rate": 0.00012244897959183673, - "loss": 10.8457, + "loss": 10.8544, "step": 7 }, { "epoch": 0.008307372793354102, - "grad_norm": 1.6171875, + "grad_norm": 1.5234375, "learning_rate": 0.00014285714285714284, - "loss": 10.8527, + "loss": 10.828, "step": 8 }, { "epoch": 0.009345794392523364, - "grad_norm": 1.7578125, + "grad_norm": 1.7421875, "learning_rate": 0.00016326530612244898, - "loss": 10.8386, + "loss": 10.8025, "step": 9 }, { "epoch": 0.010384215991692628, - "grad_norm": 1.875, + "grad_norm": 1.6484375, "learning_rate": 0.00018367346938775512, - "loss": 10.8283, + "loss": 10.7755, "step": 10 }, { "epoch": 0.01142263759086189, - "grad_norm": 1.8515625, + "grad_norm": 1.890625, "learning_rate": 0.00020408163265306123, - "loss": 10.7993, + "loss": 10.762, "step": 11 }, { "epoch": 0.012461059190031152, - "grad_norm": 1.7890625, + "grad_norm": 1.859375, "learning_rate": 0.00022448979591836734, - "loss": 10.7665, + "loss": 10.7136, "step": 12 }, { "epoch": 0.013499480789200415, - "grad_norm": 9.5, + "grad_norm": 1.921875, "learning_rate": 0.00024489795918367346, - "loss": 10.815, + "loss": 10.652, "step": 13 }, { "epoch": 0.014537902388369679, - "grad_norm": 1.984375, + "grad_norm": 2.125, "learning_rate": 0.0002653061224489796, - "loss": 10.7245, + "loss": 10.5645, "step": 14 }, { "epoch": 0.01557632398753894, - "grad_norm": 2.25, + "grad_norm": 2.296875, "learning_rate": 0.0002857142857142857, - "loss": 10.6816, + "loss": 10.4862, "step": 15 }, { "epoch": 0.016614745586708203, - "grad_norm": 2.3125, + "grad_norm": 2.390625, "learning_rate": 0.0003061224489795919, - "loss": 10.6417, + "loss": 10.4193, "step": 16 }, { "epoch": 0.017653167185877467, - "grad_norm": 2.6875, + "grad_norm": 2.5, "learning_rate": 0.00032653061224489796, - "loss": 10.5831, + "loss": 10.2264, "step": 17 }, { "epoch": 0.018691588785046728, - "grad_norm": 3.109375, + "grad_norm": 2.546875, "learning_rate": 0.0003469387755102041, - "loss": 10.549, + "loss": 10.1162, "step": 18 }, { "epoch": 0.01973001038421599, - "grad_norm": 3.21875, + "grad_norm": 2.734375, "learning_rate": 0.00036734693877551024, - "loss": 10.4233, + "loss": 9.9658, "step": 19 }, { "epoch": 0.020768431983385256, - "grad_norm": 3.578125, + "grad_norm": 2.40625, "learning_rate": 0.0003877551020408163, - "loss": 10.3826, + "loss": 9.7941, "step": 20 }, { "epoch": 0.021806853582554516, - "grad_norm": 3.59375, + "grad_norm": 2.34375, "learning_rate": 0.00040816326530612246, - "loss": 10.3051, + "loss": 9.5882, "step": 21 }, { "epoch": 0.02284527518172378, - "grad_norm": 4.65625, + "grad_norm": 2.28125, "learning_rate": 0.00042857142857142855, - "loss": 10.1497, + "loss": 9.4191, "step": 22 }, { "epoch": 0.023883696780893044, - "grad_norm": 4.40625, + "grad_norm": 2.078125, "learning_rate": 0.0004489795918367347, - "loss": 10.043, + "loss": 9.2386, "step": 23 }, { "epoch": 0.024922118380062305, - "grad_norm": 4.78125, + "grad_norm": 1.6328125, "learning_rate": 0.00046938775510204083, - "loss": 9.9011, + "loss": 9.1101, "step": 24 }, { "epoch": 0.02596053997923157, - "grad_norm": 4.5, + "grad_norm": 1.2890625, "learning_rate": 0.0004897959183673469, - "loss": 9.803, + "loss": 8.9569, "step": 25 }, { "epoch": 0.02699896157840083, - "grad_norm": 4.5625, + "grad_norm": 1.3671875, "learning_rate": 0.0005102040816326531, - "loss": 9.6678, + "loss": 8.8046, "step": 26 }, { "epoch": 0.028037383177570093, - "grad_norm": 4.8125, + "grad_norm": 0.98046875, "learning_rate": 0.0005306122448979592, - "loss": 9.5148, + "loss": 8.6394, "step": 27 }, { "epoch": 0.029075804776739357, - "grad_norm": 4.71875, + "grad_norm": 0.7578125, "learning_rate": 0.0005510204081632653, - "loss": 9.3753, + "loss": 8.5832, "step": 28 }, { "epoch": 0.030114226375908618, - "grad_norm": 4.875, + "grad_norm": 0.79296875, "learning_rate": 0.0005714285714285714, - "loss": 9.2481, + "loss": 8.4092, "step": 29 }, { "epoch": 0.03115264797507788, - "grad_norm": 4.53125, + "grad_norm": 0.66015625, "learning_rate": 0.0005918367346938776, - "loss": 9.1792, + "loss": 8.364, "step": 30 }, { "epoch": 0.032191069574247146, - "grad_norm": 4.40625, + "grad_norm": 0.875, "learning_rate": 0.0006122448979591838, - "loss": 9.1323, + "loss": 8.2433, "step": 31 }, { "epoch": 0.033229491173416406, - "grad_norm": 4.15625, + "grad_norm": 8.875, "learning_rate": 0.0006326530612244898, - "loss": 8.8994, + "loss": 8.5265, "step": 32 }, { "epoch": 0.03426791277258567, - "grad_norm": 9.4375, + "grad_norm": 0.73046875, "learning_rate": 0.0006530612244897959, - "loss": 8.7817, + "loss": 8.2092, "step": 33 }, { "epoch": 0.035306334371754934, - "grad_norm": 3.546875, + "grad_norm": 0.70703125, "learning_rate": 0.000673469387755102, - "loss": 8.6807, + "loss": 8.2644, "step": 34 }, { "epoch": 0.036344755970924195, - "grad_norm": 3.65625, + "grad_norm": 0.671875, "learning_rate": 0.0006938775510204082, - "loss": 8.5546, + "loss": 8.3096, "step": 35 }, { "epoch": 0.037383177570093455, - "grad_norm": 3.0, + "grad_norm": 0.71484375, "learning_rate": 0.0007142857142857143, - "loss": 8.5261, + "loss": 8.2509, "step": 36 }, { "epoch": 0.03842159916926272, - "grad_norm": 2.453125, + "grad_norm": 0.7109375, "learning_rate": 0.0007346938775510205, - "loss": 8.4449, + "loss": 8.3452, "step": 37 }, { "epoch": 0.03946002076843198, - "grad_norm": 1.8984375, + "grad_norm": 0.5625, "learning_rate": 0.0007551020408163265, - "loss": 8.3237, + "loss": 8.1782, "step": 38 }, { "epoch": 0.040498442367601244, - "grad_norm": 1.7578125, + "grad_norm": 0.5390625, "learning_rate": 0.0007755102040816326, - "loss": 8.3935, + "loss": 8.2422, "step": 39 }, { "epoch": 0.04153686396677051, - "grad_norm": 2.28125, + "grad_norm": 0.6640625, "learning_rate": 0.0007959183673469387, - "loss": 8.2964, + "loss": 8.2454, "step": 40 }, { "epoch": 0.04257528556593977, - "grad_norm": 2.21875, + "grad_norm": 0.6328125, "learning_rate": 0.0008163265306122449, - "loss": 8.4733, + "loss": 8.1751, "step": 41 }, { "epoch": 0.04361370716510903, - "grad_norm": 2.265625, + "grad_norm": 0.4609375, "learning_rate": 0.0008367346938775511, - "loss": 8.3357, + "loss": 8.1872, "step": 42 }, { "epoch": 0.0446521287642783, - "grad_norm": 2.203125, + "grad_norm": 6.03125, "learning_rate": 0.0008571428571428571, - "loss": 8.2853, + "loss": 8.0279, "step": 43 }, { "epoch": 0.04569055036344756, - "grad_norm": 2.15625, + "grad_norm": 0.57421875, "learning_rate": 0.0008775510204081633, - "loss": 8.4411, + "loss": 8.1868, "step": 44 }, { "epoch": 0.04672897196261682, - "grad_norm": 2.0625, + "grad_norm": 1.640625, "learning_rate": 0.0008979591836734694, - "loss": 8.3876, + "loss": 8.1969, "step": 45 }, { "epoch": 0.04776739356178609, - "grad_norm": 2.078125, + "grad_norm": 1.25, "learning_rate": 0.0009183673469387756, - "loss": 8.5309, + "loss": 8.1478, "step": 46 }, { "epoch": 0.04880581516095535, - "grad_norm": 1.8828125, + "grad_norm": 0.8984375, "learning_rate": 0.0009387755102040817, - "loss": 8.3942, + "loss": 8.1154, "step": 47 }, { "epoch": 0.04984423676012461, - "grad_norm": 1.703125, + "grad_norm": 1.109375, "learning_rate": 0.0009591836734693877, - "loss": 8.3333, + "loss": 8.1156, "step": 48 }, { "epoch": 0.05088265835929388, - "grad_norm": 1.6171875, + "grad_norm": 1.328125, "learning_rate": 0.0009795918367346938, - "loss": 8.3561, + "loss": 8.0631, "step": 49 }, { "epoch": 0.05192107995846314, - "grad_norm": 1.5234375, + "grad_norm": 0.82421875, "learning_rate": 0.001, - "loss": 8.3608, + "loss": 8.0943, "step": 50 }, { "epoch": 0.0529595015576324, - "grad_norm": 1.3125, + "grad_norm": 0.9140625, "learning_rate": 0.0009999970464322657, - "loss": 8.3252, + "loss": 7.9754, "step": 51 }, { "epoch": 0.05399792315680166, - "grad_norm": 1.1171875, + "grad_norm": 0.6953125, "learning_rate": 0.0009999881857639566, - "loss": 8.2845, + "loss": 8.0153, "step": 52 }, { "epoch": 0.055036344755970926, - "grad_norm": 1.1484375, + "grad_norm": 0.53125, "learning_rate": 0.0009999734180997555, - "loss": 8.3072, + "loss": 7.9965, "step": 53 }, { "epoch": 0.056074766355140186, - "grad_norm": 1.203125, + "grad_norm": 0.66796875, "learning_rate": 0.0009999527436141311, - "loss": 8.2587, + "loss": 7.911, "step": 54 }, { "epoch": 0.05711318795430945, - "grad_norm": 1.3125, + "grad_norm": 0.82421875, "learning_rate": 0.0009999261625513378, - "loss": 8.3019, + "loss": 7.9039, "step": 55 }, { "epoch": 0.058151609553478714, - "grad_norm": 1.2265625, + "grad_norm": 0.7421875, "learning_rate": 0.0009998936752254111, - "loss": 8.3933, + "loss": 7.9147, "step": 56 }, { "epoch": 0.059190031152647975, - "grad_norm": 1.125, + "grad_norm": 0.80859375, "learning_rate": 0.0009998552820201655, - "loss": 8.2921, + "loss": 7.9948, "step": 57 }, { "epoch": 0.060228452751817235, - "grad_norm": 1.265625, + "grad_norm": 1.0546875, "learning_rate": 0.0009998109833891883, - "loss": 8.1804, + "loss": 7.9213, "step": 58 }, { "epoch": 0.0612668743509865, - "grad_norm": 1.171875, + "grad_norm": 0.84375, "learning_rate": 0.0009997607798558359, - "loss": 8.3127, + "loss": 7.8629, "step": 59 }, { "epoch": 0.06230529595015576, - "grad_norm": 1.2421875, + "grad_norm": 1.0546875, "learning_rate": 0.0009997046720132262, - "loss": 8.1936, + "loss": 7.8418, "step": 60 }, { "epoch": 0.06334371754932502, - "grad_norm": 1.0, + "grad_norm": 0.7265625, "learning_rate": 0.0009996426605242323, - "loss": 8.2456, + "loss": 7.9071, "step": 61 }, { "epoch": 0.06438213914849429, - "grad_norm": 1.1484375, + "grad_norm": 1.0078125, "learning_rate": 0.0009995747461214752, - "loss": 8.2582, + "loss": 7.8148, "step": 62 }, { "epoch": 0.06542056074766354, - "grad_norm": 1.09375, + "grad_norm": 0.82421875, "learning_rate": 0.0009995009296073138, - "loss": 8.2961, + "loss": 7.8949, "step": 63 }, { "epoch": 0.06645898234683281, - "grad_norm": 1.2421875, + "grad_norm": 0.8515625, "learning_rate": 0.0009994212118538365, - "loss": 8.1676, + "loss": 7.7507, "step": 64 }, { "epoch": 0.06749740394600208, - "grad_norm": 7.9375, + "grad_norm": 0.74609375, "learning_rate": 0.00099933559380285, - "loss": 8.1897, + "loss": 7.8288, "step": 65 }, { "epoch": 0.06853582554517133, - "grad_norm": 1.1953125, + "grad_norm": 0.75390625, "learning_rate": 0.0009992440764658697, - "loss": 8.3123, + "loss": 7.6728, "step": 66 }, { "epoch": 0.0695742471443406, - "grad_norm": 0.9453125, + "grad_norm": 0.8984375, "learning_rate": 0.000999146660924106, - "loss": 8.2322, + "loss": 7.65, "step": 67 }, { "epoch": 0.07061266874350987, - "grad_norm": 1.1796875, + "grad_norm": 1.2890625, "learning_rate": 0.0009990433483284527, - "loss": 8.2263, + "loss": 7.5908, "step": 68 }, { "epoch": 0.07165109034267912, - "grad_norm": 1.0703125, + "grad_norm": 0.7109375, "learning_rate": 0.0009989341398994724, - "loss": 8.2091, + "loss": 7.5341, "step": 69 }, { "epoch": 0.07268951194184839, - "grad_norm": 1.046875, + "grad_norm": 0.9375, "learning_rate": 0.0009988190369273832, - "loss": 8.1462, + "loss": 7.4824, "step": 70 }, { "epoch": 0.07372793354101766, - "grad_norm": 1.0390625, + "grad_norm": 0.91015625, "learning_rate": 0.000998698040772043, - "loss": 8.183, + "loss": 7.489, "step": 71 }, { "epoch": 0.07476635514018691, - "grad_norm": 0.9921875, + "grad_norm": 1.0546875, "learning_rate": 0.000998571152862933, - "loss": 8.1936, + "loss": 7.3756, "step": 72 }, { "epoch": 0.07580477673935618, - "grad_norm": 1.0, + "grad_norm": 0.5625, "learning_rate": 0.0009984383746991417, - "loss": 8.2025, + "loss": 7.3947, "step": 73 }, { "epoch": 0.07684319833852545, - "grad_norm": 0.984375, + "grad_norm": 0.6171875, "learning_rate": 0.0009982997078493456, - "loss": 8.1422, + "loss": 7.3367, "step": 74 }, { "epoch": 0.0778816199376947, - "grad_norm": 1.0703125, + "grad_norm": 0.90625, "learning_rate": 0.0009981551539517929, - "loss": 8.1703, + "loss": 7.359, "step": 75 }, { "epoch": 0.07892004153686397, - "grad_norm": 1.078125, + "grad_norm": 0.74609375, "learning_rate": 0.0009980047147142824, - "loss": 8.1197, + "loss": 7.1188, "step": 76 }, { "epoch": 0.07995846313603323, - "grad_norm": 1.1015625, + "grad_norm": 0.66796875, "learning_rate": 0.0009978483919141441, - "loss": 8.2123, + "loss": 7.3683, "step": 77 }, { "epoch": 0.08099688473520249, - "grad_norm": 1.0234375, + "grad_norm": 0.92578125, "learning_rate": 0.0009976861873982176, - "loss": 8.1096, + "loss": 7.2359, "step": 78 }, { "epoch": 0.08203530633437175, - "grad_norm": 1.5546875, + "grad_norm": 1.46875, "learning_rate": 0.0009975181030828317, - "loss": 8.266, + "loss": 7.1375, "step": 79 }, { "epoch": 0.08307372793354102, - "grad_norm": 1.2421875, + "grad_norm": 0.52734375, "learning_rate": 0.0009973441409537793, - "loss": 8.1913, + "loss": 7.0958, "step": 80 }, { "epoch": 0.08411214953271028, - "grad_norm": 1.03125, + "grad_norm": 0.62890625, "learning_rate": 0.0009971643030662965, - "loss": 8.1207, + "loss": 7.1365, "step": 81 }, { "epoch": 0.08515057113187954, - "grad_norm": 1.03125, + "grad_norm": 0.8046875, "learning_rate": 0.0009969785915450366, - "loss": 8.1482, + "loss": 7.122, "step": 82 }, { "epoch": 0.08618899273104881, - "grad_norm": 1.203125, + "grad_norm": 1.0546875, "learning_rate": 0.0009967870085840463, - "loss": 8.1108, + "loss": 7.0849, "step": 83 }, { "epoch": 0.08722741433021806, - "grad_norm": 0.99609375, + "grad_norm": 0.458984375, "learning_rate": 0.000996589556446738, - "loss": 8.082, + "loss": 7.0815, "step": 84 }, { "epoch": 0.08826583592938733, - "grad_norm": 1.109375, + "grad_norm": 0.72265625, "learning_rate": 0.000996386237465865, - "loss": 8.0509, + "loss": 7.109, "step": 85 }, { "epoch": 0.0893042575285566, - "grad_norm": 1.0390625, + "grad_norm": 0.7734375, "learning_rate": 0.000996177054043493, - "loss": 8.0733, + "loss": 6.9565, "step": 86 }, { "epoch": 0.09034267912772585, - "grad_norm": 9.5, + "grad_norm": 0.609375, "learning_rate": 0.0009959620086509714, - "loss": 7.9885, + "loss": 7.0025, "step": 87 }, { "epoch": 0.09138110072689512, - "grad_norm": 1.0859375, + "grad_norm": 0.546875, "learning_rate": 0.0009957411038289047, - "loss": 7.9692, + "loss": 7.002, "step": 88 }, { "epoch": 0.09241952232606439, - "grad_norm": 1.03125, + "grad_norm": 0.5078125, "learning_rate": 0.0009955143421871227, - "loss": 8.05, + "loss": 6.6959, "step": 89 }, { "epoch": 0.09345794392523364, - "grad_norm": 1.0078125, + "grad_norm": 0.53515625, "learning_rate": 0.0009952817264046484, - "loss": 8.1264, + "loss": 6.7453, "step": 90 }, { "epoch": 0.09449636552440291, - "grad_norm": 1.6796875, + "grad_norm": 0.5546875, "learning_rate": 0.000995043259229668, - "loss": 8.0639, + "loss": 6.734, "step": 91 }, { "epoch": 0.09553478712357218, - "grad_norm": 1.1328125, + "grad_norm": 0.52734375, "learning_rate": 0.0009947989434794973, - "loss": 8.0093, + "loss": 6.8088, "step": 92 }, { "epoch": 0.09657320872274143, - "grad_norm": 1.1484375, + "grad_norm": 0.80859375, "learning_rate": 0.0009945487820405485, - "loss": 8.1015, + "loss": 6.8563, "step": 93 }, { "epoch": 0.0976116303219107, - "grad_norm": 0.91796875, + "grad_norm": 0.7734375, "learning_rate": 0.0009942927778682968, - "loss": 8.0984, + "loss": 6.5181, "step": 94 }, { "epoch": 0.09865005192107996, - "grad_norm": 1.203125, + "grad_norm": 0.60546875, "learning_rate": 0.0009940309339872449, - "loss": 7.9496, + "loss": 6.8268, "step": 95 }, { "epoch": 0.09968847352024922, - "grad_norm": 1.25, + "grad_norm": 0.625, "learning_rate": 0.000993763253490887, - "loss": 8.0393, + "loss": 6.7739, "step": 96 }, { "epoch": 0.10072689511941849, - "grad_norm": 1.109375, + "grad_norm": 0.54296875, "learning_rate": 0.0009934897395416737, - "loss": 8.0389, + "loss": 6.6782, "step": 97 }, { "epoch": 0.10176531671858775, - "grad_norm": 1.078125, + "grad_norm": 0.5625, "learning_rate": 0.0009932103953709723, - "loss": 7.9644, + "loss": 6.727, "step": 98 }, { "epoch": 0.102803738317757, - "grad_norm": 1.2578125, + "grad_norm": 0.5625, "learning_rate": 0.0009929252242790309, - "loss": 7.9443, + "loss": 6.7222, "step": 99 }, { "epoch": 0.10384215991692627, - "grad_norm": 1.390625, + "grad_norm": 0.5234375, "learning_rate": 0.0009926342296349379, - "loss": 7.9222, + "loss": 6.5565, "step": 100 }, { "epoch": 0.10384215991692627, - "eval_loss": 7.957639694213867, - "eval_runtime": 2.6128, - "eval_samples_per_second": 6.124, - "eval_steps_per_second": 0.765, + "eval_loss": 6.583271503448486, + "eval_runtime": 2.5831, + "eval_samples_per_second": 6.194, + "eval_steps_per_second": 0.774, "step": 100 - }, - { - "epoch": 0.10488058151609553, - "grad_norm": 1.2890625, - "learning_rate": 0.0009923374148765826, - "loss": 7.926, - "step": 101 - }, - { - "epoch": 0.1059190031152648, - "grad_norm": 1.0703125, - "learning_rate": 0.0009920347835106153, - "loss": 8.0019, - "step": 102 - }, - { - "epoch": 0.10695742471443406, - "grad_norm": 1.4609375, - "learning_rate": 0.0009917263391124047, - "loss": 7.8873, - "step": 103 - }, - { - "epoch": 0.10799584631360332, - "grad_norm": 1.2734375, - "learning_rate": 0.0009914120853259966, - "loss": 7.9428, - "step": 104 - }, - { - "epoch": 0.10903426791277258, - "grad_norm": 5.625, - "learning_rate": 0.0009910920258640707, - "loss": 8.0591, - "step": 105 - }, - { - "epoch": 0.11007268951194185, - "grad_norm": 1.1875, - "learning_rate": 0.0009907661645078957, - "loss": 7.9165, - "step": 106 - }, - { - "epoch": 0.1111111111111111, - "grad_norm": 1.6796875, - "learning_rate": 0.0009904345051072862, - "loss": 7.997, - "step": 107 - }, - { - "epoch": 0.11214953271028037, - "grad_norm": 1.09375, - "learning_rate": 0.0009900970515805563, - "loss": 7.9629, - "step": 108 - }, - { - "epoch": 0.11318795430944964, - "grad_norm": 1.015625, - "learning_rate": 0.0009897538079144733, - "loss": 7.9483, - "step": 109 - }, - { - "epoch": 0.1142263759086189, - "grad_norm": 1.171875, - "learning_rate": 0.0009894047781642107, - "loss": 7.8955, - "step": 110 - }, - { - "epoch": 0.11526479750778816, - "grad_norm": 1.09375, - "learning_rate": 0.0009890499664533007, - "loss": 7.908, - "step": 111 - }, - { - "epoch": 0.11630321910695743, - "grad_norm": 1.9375, - "learning_rate": 0.0009886893769735852, - "loss": 7.8703, - "step": 112 - }, - { - "epoch": 0.11734164070612668, - "grad_norm": 0.99609375, - "learning_rate": 0.0009883230139851656, - "loss": 7.8814, - "step": 113 - }, - { - "epoch": 0.11838006230529595, - "grad_norm": 1.4609375, - "learning_rate": 0.0009879508818163537, - "loss": 7.9023, - "step": 114 - }, - { - "epoch": 0.11941848390446522, - "grad_norm": 1.265625, - "learning_rate": 0.0009875729848636196, - "loss": 7.7985, - "step": 115 - }, - { - "epoch": 0.12045690550363447, - "grad_norm": 1.2890625, - "learning_rate": 0.0009871893275915408, - "loss": 7.9216, - "step": 116 - }, - { - "epoch": 0.12149532710280374, - "grad_norm": 1.2890625, - "learning_rate": 0.0009867999145327475, - "loss": 7.8029, - "step": 117 - }, - { - "epoch": 0.122533748701973, - "grad_norm": 1.125, - "learning_rate": 0.0009864047502878717, - "loss": 7.8441, - "step": 118 - }, - { - "epoch": 0.12357217030114226, - "grad_norm": 1.1171875, - "learning_rate": 0.0009860038395254906, - "loss": 7.8356, - "step": 119 - }, - { - "epoch": 0.12461059190031153, - "grad_norm": 1.1171875, - "learning_rate": 0.0009855971869820726, - "loss": 7.8124, - "step": 120 - }, - { - "epoch": 0.1256490134994808, - "grad_norm": 1.1328125, - "learning_rate": 0.000985184797461921, - "loss": 7.758, - "step": 121 - }, - { - "epoch": 0.12668743509865005, - "grad_norm": 1.2578125, - "learning_rate": 0.0009847666758371174, - "loss": 7.8092, - "step": 122 - }, - { - "epoch": 0.1277258566978193, - "grad_norm": 1.21875, - "learning_rate": 0.000984342827047464, - "loss": 7.7203, - "step": 123 - }, - { - "epoch": 0.12876427829698858, - "grad_norm": 1.2109375, - "learning_rate": 0.0009839132561004248, - "loss": 7.7885, - "step": 124 - }, - { - "epoch": 0.12980269989615784, - "grad_norm": 1.4375, - "learning_rate": 0.0009834779680710681, - "loss": 7.722, - "step": 125 - }, - { - "epoch": 0.1308411214953271, - "grad_norm": 1.4375, - "learning_rate": 0.0009830369681020042, - "loss": 7.733, - "step": 126 - }, - { - "epoch": 0.13187954309449637, - "grad_norm": 2.40625, - "learning_rate": 0.000982590261403326, - "loss": 7.8116, - "step": 127 - }, - { - "epoch": 0.13291796469366562, - "grad_norm": 1.828125, - "learning_rate": 0.0009821378532525477, - "loss": 7.8408, - "step": 128 - }, - { - "epoch": 0.13395638629283488, - "grad_norm": 1.3515625, - "learning_rate": 0.000981679748994542, - "loss": 7.8247, - "step": 129 - }, - { - "epoch": 0.13499480789200416, - "grad_norm": 1.2421875, - "learning_rate": 0.0009812159540414764, - "loss": 7.6884, - "step": 130 - }, - { - "epoch": 0.1360332294911734, - "grad_norm": 1.6640625, - "learning_rate": 0.0009807464738727503, - "loss": 7.7018, - "step": 131 - }, - { - "epoch": 0.13707165109034267, - "grad_norm": 1.2890625, - "learning_rate": 0.0009802713140349294, - "loss": 7.6106, - "step": 132 - }, - { - "epoch": 0.13811007268951195, - "grad_norm": 1.921875, - "learning_rate": 0.000979790480141681, - "loss": 7.6482, - "step": 133 - }, - { - "epoch": 0.1391484942886812, - "grad_norm": 1.4296875, - "learning_rate": 0.0009793039778737069, - "loss": 7.6953, - "step": 134 - }, - { - "epoch": 0.14018691588785046, - "grad_norm": 1.2890625, - "learning_rate": 0.0009788118129786768, - "loss": 7.5737, - "step": 135 - }, - { - "epoch": 0.14122533748701974, - "grad_norm": 1.25, - "learning_rate": 0.0009783139912711597, - "loss": 7.7097, - "step": 136 - }, - { - "epoch": 0.142263759086189, - "grad_norm": 1.4765625, - "learning_rate": 0.0009778105186325566, - "loss": 7.6463, - "step": 137 - }, - { - "epoch": 0.14330218068535824, - "grad_norm": 1.4921875, - "learning_rate": 0.0009773014010110298, - "loss": 7.5251, - "step": 138 - }, - { - "epoch": 0.14434060228452752, - "grad_norm": 1.34375, - "learning_rate": 0.0009767866444214321, - "loss": 7.5961, - "step": 139 - }, - { - "epoch": 0.14537902388369678, - "grad_norm": 5.875, - "learning_rate": 0.0009762662549452379, - "loss": 7.3664, - "step": 140 - }, - { - "epoch": 0.14641744548286603, - "grad_norm": 1.4375, - "learning_rate": 0.0009757402387304694, - "loss": 7.5663, - "step": 141 - }, - { - "epoch": 0.1474558670820353, - "grad_norm": 1.5859375, - "learning_rate": 0.0009752086019916245, - "loss": 7.6656, - "step": 142 - }, - { - "epoch": 0.14849428868120457, - "grad_norm": 1.40625, - "learning_rate": 0.000974671351009604, - "loss": 7.5884, - "step": 143 - }, - { - "epoch": 0.14953271028037382, - "grad_norm": 1.3359375, - "learning_rate": 0.000974128492131636, - "loss": 7.5191, - "step": 144 - }, - { - "epoch": 0.1505711318795431, - "grad_norm": 1.46875, - "learning_rate": 0.0009735800317712028, - "loss": 7.4686, - "step": 145 - }, - { - "epoch": 0.15160955347871236, - "grad_norm": 1.7421875, - "learning_rate": 0.0009730259764079637, - "loss": 7.4524, - "step": 146 - }, - { - "epoch": 0.1526479750778816, - "grad_norm": 1.828125, - "learning_rate": 0.0009724663325876786, - "loss": 7.46, - "step": 147 - }, - { - "epoch": 0.1536863966770509, - "grad_norm": 1.7578125, - "learning_rate": 0.0009719011069221315, - "loss": 7.4372, - "step": 148 - }, - { - "epoch": 0.15472481827622014, - "grad_norm": 1.7890625, - "learning_rate": 0.0009713303060890514, - "loss": 7.3809, - "step": 149 - }, - { - "epoch": 0.1557632398753894, - "grad_norm": 2.375, - "learning_rate": 0.000970753936832034, - "loss": 7.4196, - "step": 150 - }, - { - "epoch": 0.15680166147455868, - "grad_norm": 1.3828125, - "learning_rate": 0.000970172005960462, - "loss": 7.4732, - "step": 151 - }, - { - "epoch": 0.15784008307372793, - "grad_norm": 1.5390625, - "learning_rate": 0.0009695845203494243, - "loss": 7.4662, - "step": 152 - }, - { - "epoch": 0.1588785046728972, - "grad_norm": 1.484375, - "learning_rate": 0.0009689914869396349, - "loss": 7.4651, - "step": 153 - }, - { - "epoch": 0.15991692627206647, - "grad_norm": 1.5625, - "learning_rate": 0.0009683929127373514, - "loss": 7.4005, - "step": 154 - }, - { - "epoch": 0.16095534787123572, - "grad_norm": 1.34375, - "learning_rate": 0.0009677888048142915, - "loss": 7.3895, - "step": 155 - }, - { - "epoch": 0.16199376947040497, - "grad_norm": 1.578125, - "learning_rate": 0.00096717917030755, - "loss": 7.3578, - "step": 156 - }, - { - "epoch": 0.16303219106957426, - "grad_norm": 1.578125, - "learning_rate": 0.0009665640164195142, - "loss": 7.358, - "step": 157 - }, - { - "epoch": 0.1640706126687435, - "grad_norm": 6.46875, - "learning_rate": 0.0009659433504177785, - "loss": 7.0409, - "step": 158 - }, - { - "epoch": 0.16510903426791276, - "grad_norm": 1.7265625, - "learning_rate": 0.0009653171796350593, - "loss": 7.4103, - "step": 159 - }, - { - "epoch": 0.16614745586708204, - "grad_norm": 1.4921875, - "learning_rate": 0.000964685511469108, - "loss": 7.3658, - "step": 160 - }, - { - "epoch": 0.1671858774662513, - "grad_norm": 1.328125, - "learning_rate": 0.0009640483533826234, - "loss": 7.2973, - "step": 161 - }, - { - "epoch": 0.16822429906542055, - "grad_norm": 1.4765625, - "learning_rate": 0.0009634057129031639, - "loss": 7.3486, - "step": 162 - }, - { - "epoch": 0.16926272066458983, - "grad_norm": 1.46875, - "learning_rate": 0.0009627575976230579, - "loss": 7.2357, - "step": 163 - }, - { - "epoch": 0.1703011422637591, - "grad_norm": 1.640625, - "learning_rate": 0.0009621040151993152, - "loss": 7.2312, - "step": 164 - }, - { - "epoch": 0.17133956386292834, - "grad_norm": 1.5390625, - "learning_rate": 0.0009614449733535357, - "loss": 7.2768, - "step": 165 - }, - { - "epoch": 0.17237798546209762, - "grad_norm": 1.375, - "learning_rate": 0.000960780479871818, - "loss": 7.282, - "step": 166 - }, - { - "epoch": 0.17341640706126688, - "grad_norm": 1.359375, - "learning_rate": 0.0009601105426046684, - "loss": 7.2627, - "step": 167 - }, - { - "epoch": 0.17445482866043613, - "grad_norm": 1.5078125, - "learning_rate": 0.000959435169466907, - "loss": 7.2436, - "step": 168 - }, - { - "epoch": 0.1754932502596054, - "grad_norm": 1.296875, - "learning_rate": 0.0009587543684375755, - "loss": 7.1775, - "step": 169 - }, - { - "epoch": 0.17653167185877466, - "grad_norm": 1.3515625, - "learning_rate": 0.0009580681475598413, - "loss": 7.2909, - "step": 170 - }, - { - "epoch": 0.17757009345794392, - "grad_norm": 1.6015625, - "learning_rate": 0.000957376514940904, - "loss": 7.2195, - "step": 171 - }, - { - "epoch": 0.1786085150571132, - "grad_norm": 1.5859375, - "learning_rate": 0.0009566794787518986, - "loss": 7.3209, - "step": 172 - }, - { - "epoch": 0.17964693665628245, - "grad_norm": 1.4765625, - "learning_rate": 0.0009559770472277996, - "loss": 7.1312, - "step": 173 - }, - { - "epoch": 0.1806853582554517, - "grad_norm": 1.265625, - "learning_rate": 0.0009552692286673232, - "loss": 7.1432, - "step": 174 - }, - { - "epoch": 0.181723779854621, - "grad_norm": 1.4375, - "learning_rate": 0.0009545560314328297, - "loss": 7.1383, - "step": 175 - }, - { - "epoch": 0.18276220145379024, - "grad_norm": 1.7265625, - "learning_rate": 0.0009538374639502247, - "loss": 7.0784, - "step": 176 - }, - { - "epoch": 0.1838006230529595, - "grad_norm": 1.7890625, - "learning_rate": 0.0009531135347088589, - "loss": 7.2013, - "step": 177 - }, - { - "epoch": 0.18483904465212878, - "grad_norm": 1.75, - "learning_rate": 0.0009523842522614285, - "loss": 7.1522, - "step": 178 - }, - { - "epoch": 0.18587746625129803, - "grad_norm": 1.71875, - "learning_rate": 0.0009516496252238738, - "loss": 7.262, - "step": 179 - }, - { - "epoch": 0.18691588785046728, - "grad_norm": 1.6953125, - "learning_rate": 0.000950909662275278, - "loss": 7.1672, - "step": 180 - }, - { - "epoch": 0.18795430944963656, - "grad_norm": 1.8203125, - "learning_rate": 0.0009501643721577637, - "loss": 7.132, - "step": 181 - }, - { - "epoch": 0.18899273104880582, - "grad_norm": 2.171875, - "learning_rate": 0.00094941376367639, - "loss": 7.0703, - "step": 182 - }, - { - "epoch": 0.19003115264797507, - "grad_norm": 1.5234375, - "learning_rate": 0.0009486578456990493, - "loss": 7.1446, - "step": 183 - }, - { - "epoch": 0.19106957424714435, - "grad_norm": 1.578125, - "learning_rate": 0.0009478966271563613, - "loss": 7.1094, - "step": 184 - }, - { - "epoch": 0.1921079958463136, - "grad_norm": 1.8828125, - "learning_rate": 0.0009471301170415679, - "loss": 7.1068, - "step": 185 - }, - { - "epoch": 0.19314641744548286, - "grad_norm": 1.6328125, - "learning_rate": 0.0009463583244104273, - "loss": 7.1385, - "step": 186 - }, - { - "epoch": 0.19418483904465214, - "grad_norm": 1.3671875, - "learning_rate": 0.000945581258381107, - "loss": 7.0947, - "step": 187 - }, - { - "epoch": 0.1952232606438214, - "grad_norm": 1.6328125, - "learning_rate": 0.0009447989281340753, - "loss": 7.1821, - "step": 188 - }, - { - "epoch": 0.19626168224299065, - "grad_norm": 1.75, - "learning_rate": 0.0009440113429119939, - "loss": 7.0452, - "step": 189 - }, - { - "epoch": 0.19730010384215993, - "grad_norm": 1.6015625, - "learning_rate": 0.0009432185120196079, - "loss": 7.0751, - "step": 190 - }, - { - "epoch": 0.19833852544132918, - "grad_norm": 1.75, - "learning_rate": 0.0009424204448236364, - "loss": 7.2093, - "step": 191 - }, - { - "epoch": 0.19937694704049844, - "grad_norm": 2.171875, - "learning_rate": 0.0009416171507526614, - "loss": 6.9811, - "step": 192 - }, - { - "epoch": 0.20041536863966772, - "grad_norm": 1.7421875, - "learning_rate": 0.0009408086392970167, - "loss": 7.0301, - "step": 193 - }, - { - "epoch": 0.20145379023883697, - "grad_norm": 1.6640625, - "learning_rate": 0.0009399949200086756, - "loss": 6.9737, - "step": 194 - }, - { - "epoch": 0.20249221183800623, - "grad_norm": 2.0, - "learning_rate": 0.0009391760025011384, - "loss": 6.9886, - "step": 195 - }, - { - "epoch": 0.2035306334371755, - "grad_norm": 1.359375, - "learning_rate": 0.0009383518964493182, - "loss": 6.9123, - "step": 196 - }, - { - "epoch": 0.20456905503634476, - "grad_norm": 1.5546875, - "learning_rate": 0.0009375226115894275, - "loss": 7.0419, - "step": 197 - }, - { - "epoch": 0.205607476635514, - "grad_norm": 1.3984375, - "learning_rate": 0.0009366881577188619, - "loss": 7.0344, - "step": 198 - }, - { - "epoch": 0.2066458982346833, - "grad_norm": 1.46875, - "learning_rate": 0.0009358485446960861, - "loss": 7.0015, - "step": 199 - }, - { - "epoch": 0.20768431983385255, - "grad_norm": 4.59375, - "learning_rate": 0.0009350037824405151, - "loss": 7.1113, - "step": 200 - }, - { - "epoch": 0.20768431983385255, - "eval_loss": 6.959992408752441, - "eval_runtime": 1.5744, - "eval_samples_per_second": 10.163, - "eval_steps_per_second": 1.27, - "step": 200 - }, - { - "epoch": 0.2087227414330218, - "grad_norm": 1.890625, - "learning_rate": 0.0009341538809323997, - "loss": 6.9605, - "step": 201 - }, - { - "epoch": 0.20976116303219106, - "grad_norm": 1.53125, - "learning_rate": 0.0009332988502127062, - "loss": 7.1006, - "step": 202 - }, - { - "epoch": 0.21079958463136034, - "grad_norm": 1.46875, - "learning_rate": 0.0009324387003829993, - "loss": 7.0535, - "step": 203 - }, - { - "epoch": 0.2118380062305296, - "grad_norm": 2.046875, - "learning_rate": 0.0009315734416053222, - "loss": 7.1432, - "step": 204 - }, - { - "epoch": 0.21287642782969884, - "grad_norm": 4.90625, - "learning_rate": 0.0009307030841020765, - "loss": 6.8602, - "step": 205 - }, - { - "epoch": 0.21391484942886813, - "grad_norm": 2.265625, - "learning_rate": 0.0009298276381559014, - "loss": 6.8611, - "step": 206 - }, - { - "epoch": 0.21495327102803738, - "grad_norm": 1.484375, - "learning_rate": 0.0009289471141095528, - "loss": 6.9153, - "step": 207 - }, - { - "epoch": 0.21599169262720663, - "grad_norm": 2.015625, - "learning_rate": 0.0009280615223657801, - "loss": 6.88, - "step": 208 - }, - { - "epoch": 0.21703011422637591, - "grad_norm": 1.875, - "learning_rate": 0.0009271708733872042, - "loss": 6.9271, - "step": 209 - }, - { - "epoch": 0.21806853582554517, - "grad_norm": 1.375, - "learning_rate": 0.0009262751776961935, - "loss": 6.9736, - "step": 210 - }, - { - "epoch": 0.21910695742471442, - "grad_norm": 1.75, - "learning_rate": 0.0009253744458747394, - "loss": 6.9327, - "step": 211 - }, - { - "epoch": 0.2201453790238837, - "grad_norm": 1.390625, - "learning_rate": 0.0009244686885643319, - "loss": 6.9902, - "step": 212 - }, - { - "epoch": 0.22118380062305296, - "grad_norm": 1.5390625, - "learning_rate": 0.000923557916465833, - "loss": 6.8993, - "step": 213 - }, - { - "epoch": 0.2222222222222222, - "grad_norm": 1.640625, - "learning_rate": 0.0009226421403393512, - "loss": 6.7531, - "step": 214 - }, - { - "epoch": 0.2232606438213915, - "grad_norm": 1.3515625, - "learning_rate": 0.0009217213710041138, - "loss": 6.9481, - "step": 215 - }, - { - "epoch": 0.22429906542056074, - "grad_norm": 1.5859375, - "learning_rate": 0.0009207956193383392, - "loss": 6.9571, - "step": 216 - }, - { - "epoch": 0.22533748701973, - "grad_norm": 1.4609375, - "learning_rate": 0.0009198648962791083, - "loss": 6.7456, - "step": 217 - }, - { - "epoch": 0.22637590861889928, - "grad_norm": 1.2890625, - "learning_rate": 0.0009189292128222355, - "loss": 6.8709, - "step": 218 - }, - { - "epoch": 0.22741433021806853, - "grad_norm": 1.7890625, - "learning_rate": 0.0009179885800221388, - "loss": 6.9518, - "step": 219 - }, - { - "epoch": 0.2284527518172378, - "grad_norm": 1.8046875, - "learning_rate": 0.0009170430089917088, - "loss": 6.8989, - "step": 220 - }, - { - "epoch": 0.22949117341640707, - "grad_norm": 1.4453125, - "learning_rate": 0.0009160925109021781, - "loss": 6.9521, - "step": 221 - }, - { - "epoch": 0.23052959501557632, - "grad_norm": 1.859375, - "learning_rate": 0.0009151370969829882, - "loss": 6.7814, - "step": 222 - }, - { - "epoch": 0.23156801661474558, - "grad_norm": 1.625, - "learning_rate": 0.0009141767785216584, - "loss": 6.808, - "step": 223 - }, - { - "epoch": 0.23260643821391486, - "grad_norm": 1.6484375, - "learning_rate": 0.0009132115668636511, - "loss": 6.9264, - "step": 224 - }, - { - "epoch": 0.2336448598130841, - "grad_norm": 1.34375, - "learning_rate": 0.0009122414734122383, - "loss": 6.8812, - "step": 225 - }, - { - "epoch": 0.23468328141225336, - "grad_norm": 1.59375, - "learning_rate": 0.0009112665096283667, - "loss": 6.7264, - "step": 226 - }, - { - "epoch": 0.23572170301142265, - "grad_norm": 2.1875, - "learning_rate": 0.0009102866870305231, - "loss": 6.8659, - "step": 227 - }, - { - "epoch": 0.2367601246105919, - "grad_norm": 1.8984375, - "learning_rate": 0.0009093020171945965, - "loss": 6.8242, - "step": 228 - }, - { - "epoch": 0.23779854620976115, - "grad_norm": 1.609375, - "learning_rate": 0.0009083125117537436, - "loss": 6.7649, - "step": 229 - }, - { - "epoch": 0.23883696780893043, - "grad_norm": 1.4765625, - "learning_rate": 0.0009073181823982494, - "loss": 6.9016, - "step": 230 - }, - { - "epoch": 0.2398753894080997, - "grad_norm": 1.625, - "learning_rate": 0.0009063190408753908, - "loss": 6.794, - "step": 231 - }, - { - "epoch": 0.24091381100726894, - "grad_norm": 1.5859375, - "learning_rate": 0.000905315098989296, - "loss": 6.7241, - "step": 232 - }, - { - "epoch": 0.24195223260643822, - "grad_norm": 1.734375, - "learning_rate": 0.0009043063686008066, - "loss": 6.9263, - "step": 233 - }, - { - "epoch": 0.24299065420560748, - "grad_norm": 1.765625, - "learning_rate": 0.0009032928616273368, - "loss": 6.8098, - "step": 234 - }, - { - "epoch": 0.24402907580477673, - "grad_norm": 5.96875, - "learning_rate": 0.0009022745900427324, - "loss": 6.7429, - "step": 235 - }, - { - "epoch": 0.245067497403946, - "grad_norm": 1.7578125, - "learning_rate": 0.00090125156587713, - "loss": 6.7389, - "step": 236 - }, - { - "epoch": 0.24610591900311526, - "grad_norm": 1.6484375, - "learning_rate": 0.0009002238012168143, - "loss": 6.6653, - "step": 237 - }, - { - "epoch": 0.24714434060228452, - "grad_norm": 1.703125, - "learning_rate": 0.0008991913082040751, - "loss": 6.8711, - "step": 238 - }, - { - "epoch": 0.2481827622014538, - "grad_norm": 2.015625, - "learning_rate": 0.0008981540990370649, - "loss": 6.6424, - "step": 239 - }, - { - "epoch": 0.24922118380062305, - "grad_norm": 1.609375, - "learning_rate": 0.0008971121859696538, - "loss": 6.7559, - "step": 240 - }, - { - "epoch": 0.2502596053997923, - "grad_norm": 1.703125, - "learning_rate": 0.000896065581311285, - "loss": 6.7114, - "step": 241 - }, - { - "epoch": 0.2512980269989616, - "grad_norm": 1.8203125, - "learning_rate": 0.0008950142974268293, - "loss": 6.7799, - "step": 242 - }, - { - "epoch": 0.2523364485981308, - "grad_norm": 1.3984375, - "learning_rate": 0.0008939583467364398, - "loss": 6.7178, - "step": 243 - }, - { - "epoch": 0.2533748701973001, - "grad_norm": 1.5859375, - "learning_rate": 0.0008928977417154037, - "loss": 6.731, - "step": 244 - }, - { - "epoch": 0.2544132917964694, - "grad_norm": 1.46875, - "learning_rate": 0.0008918324948939962, - "loss": 6.7293, - "step": 245 - }, - { - "epoch": 0.2554517133956386, - "grad_norm": 1.7265625, - "learning_rate": 0.0008907626188573319, - "loss": 6.8015, - "step": 246 - }, - { - "epoch": 0.2564901349948079, - "grad_norm": 2.28125, - "learning_rate": 0.000889688126245216, - "loss": 6.7796, - "step": 247 - }, - { - "epoch": 0.25752855659397716, - "grad_norm": 3.234375, - "learning_rate": 0.0008886090297519955, - "loss": 6.6777, - "step": 248 - }, - { - "epoch": 0.2585669781931464, - "grad_norm": 2.234375, - "learning_rate": 0.0008875253421264087, - "loss": 6.5925, - "step": 249 - }, - { - "epoch": 0.25960539979231567, - "grad_norm": 1.8828125, - "learning_rate": 0.0008864370761714347, - "loss": 6.7802, - "step": 250 - }, - { - "epoch": 0.26064382139148495, - "grad_norm": 2.109375, - "learning_rate": 0.0008853442447441426, - "loss": 6.6427, - "step": 251 - }, - { - "epoch": 0.2616822429906542, - "grad_norm": 1.7265625, - "learning_rate": 0.0008842468607555389, - "loss": 6.7946, - "step": 252 - }, - { - "epoch": 0.26272066458982346, - "grad_norm": 1.90625, - "learning_rate": 0.000883144937170415, - "loss": 6.6118, - "step": 253 - }, - { - "epoch": 0.26375908618899274, - "grad_norm": 3.78125, - "learning_rate": 0.0008820384870071951, - "loss": 6.8264, - "step": 254 - }, - { - "epoch": 0.26479750778816197, - "grad_norm": 1.4375, - "learning_rate": 0.0008809275233377813, - "loss": 6.6543, - "step": 255 - }, - { - "epoch": 0.26583592938733125, - "grad_norm": 1.578125, - "learning_rate": 0.0008798120592873989, - "loss": 6.6704, - "step": 256 - }, - { - "epoch": 0.26687435098650053, - "grad_norm": 1.53125, - "learning_rate": 0.0008786921080344428, - "loss": 6.7579, - "step": 257 - }, - { - "epoch": 0.26791277258566976, - "grad_norm": 1.8125, - "learning_rate": 0.0008775676828103205, - "loss": 6.6997, - "step": 258 - }, - { - "epoch": 0.26895119418483904, - "grad_norm": 1.59375, - "learning_rate": 0.0008764387968992961, - "loss": 6.8118, - "step": 259 - }, - { - "epoch": 0.2699896157840083, - "grad_norm": 3.578125, - "learning_rate": 0.0008753054636383336, - "loss": 6.4752, - "step": 260 - }, - { - "epoch": 0.27102803738317754, - "grad_norm": 2.046875, - "learning_rate": 0.0008741676964169394, - "loss": 6.8343, - "step": 261 - }, - { - "epoch": 0.2720664589823468, - "grad_norm": 1.9921875, - "learning_rate": 0.0008730255086770036, - "loss": 6.8082, - "step": 262 - }, - { - "epoch": 0.2731048805815161, - "grad_norm": 2.296875, - "learning_rate": 0.0008718789139126416, - "loss": 6.5756, - "step": 263 - }, - { - "epoch": 0.27414330218068533, - "grad_norm": 2.25, - "learning_rate": 0.0008707279256700347, - "loss": 6.6768, - "step": 264 - }, - { - "epoch": 0.2751817237798546, - "grad_norm": 1.671875, - "learning_rate": 0.0008695725575472697, - "loss": 6.6729, - "step": 265 - }, - { - "epoch": 0.2762201453790239, - "grad_norm": 2.46875, - "learning_rate": 0.0008684128231941788, - "loss": 6.7176, - "step": 266 - }, - { - "epoch": 0.2772585669781931, - "grad_norm": 2.296875, - "learning_rate": 0.0008672487363121777, - "loss": 6.6639, - "step": 267 - }, - { - "epoch": 0.2782969885773624, - "grad_norm": 1.9453125, - "learning_rate": 0.0008660803106541043, - "loss": 6.7007, - "step": 268 - }, - { - "epoch": 0.2793354101765317, - "grad_norm": 2.0625, - "learning_rate": 0.0008649075600240559, - "loss": 6.6715, - "step": 269 - }, - { - "epoch": 0.2803738317757009, - "grad_norm": 1.890625, - "learning_rate": 0.0008637304982772262, - "loss": 6.6705, - "step": 270 - }, - { - "epoch": 0.2814122533748702, - "grad_norm": 1.8671875, - "learning_rate": 0.0008625491393197415, - "loss": 6.7322, - "step": 271 - }, - { - "epoch": 0.2824506749740395, - "grad_norm": 1.6875, - "learning_rate": 0.0008613634971084967, - "loss": 6.7985, - "step": 272 - }, - { - "epoch": 0.2834890965732087, - "grad_norm": 2.09375, - "learning_rate": 0.0008601735856509902, - "loss": 6.4347, - "step": 273 - }, - { - "epoch": 0.284527518172378, - "grad_norm": 1.5234375, - "learning_rate": 0.0008589794190051581, - "loss": 6.5613, - "step": 274 - }, - { - "epoch": 0.28556593977154726, - "grad_norm": 1.6171875, - "learning_rate": 0.0008577810112792089, - "loss": 6.5693, - "step": 275 - }, - { - "epoch": 0.2866043613707165, - "grad_norm": 1.796875, - "learning_rate": 0.000856578376631456, - "loss": 6.5121, - "step": 276 - }, - { - "epoch": 0.28764278296988577, - "grad_norm": 1.6328125, - "learning_rate": 0.0008553715292701512, - "loss": 6.5373, - "step": 277 - }, - { - "epoch": 0.28868120456905505, - "grad_norm": 1.515625, - "learning_rate": 0.0008541604834533159, - "loss": 6.6374, - "step": 278 - }, - { - "epoch": 0.2897196261682243, - "grad_norm": 1.5859375, - "learning_rate": 0.0008529452534885738, - "loss": 6.6681, - "step": 279 - }, - { - "epoch": 0.29075804776739356, - "grad_norm": 1.6640625, - "learning_rate": 0.0008517258537329808, - "loss": 6.5905, - "step": 280 - }, - { - "epoch": 0.29179646936656284, - "grad_norm": 1.4609375, - "learning_rate": 0.0008505022985928559, - "loss": 6.5741, - "step": 281 - }, - { - "epoch": 0.29283489096573206, - "grad_norm": 1.59375, - "learning_rate": 0.0008492746025236113, - "loss": 6.5985, - "step": 282 - }, - { - "epoch": 0.29387331256490135, - "grad_norm": 1.5, - "learning_rate": 0.0008480427800295809, - "loss": 6.6211, - "step": 283 - }, - { - "epoch": 0.2949117341640706, - "grad_norm": 1.7890625, - "learning_rate": 0.0008468068456638491, - "loss": 6.5773, - "step": 284 - }, - { - "epoch": 0.29595015576323985, - "grad_norm": 1.75, - "learning_rate": 0.0008455668140280798, - "loss": 6.6446, - "step": 285 - }, - { - "epoch": 0.29698857736240913, - "grad_norm": 1.4921875, - "learning_rate": 0.0008443226997723424, - "loss": 6.672, - "step": 286 - }, - { - "epoch": 0.2980269989615784, - "grad_norm": 1.9765625, - "learning_rate": 0.0008430745175949399, - "loss": 6.3888, - "step": 287 - }, - { - "epoch": 0.29906542056074764, - "grad_norm": 1.7265625, - "learning_rate": 0.0008418222822422348, - "loss": 6.583, - "step": 288 - }, - { - "epoch": 0.3001038421599169, - "grad_norm": 3.109375, - "learning_rate": 0.0008405660085084748, - "loss": 6.5971, - "step": 289 - }, - { - "epoch": 0.3011422637590862, - "grad_norm": 1.703125, - "learning_rate": 0.0008393057112356181, - "loss": 6.683, - "step": 290 - }, - { - "epoch": 0.30218068535825543, - "grad_norm": 1.734375, - "learning_rate": 0.000838041405313158, - "loss": 6.6866, - "step": 291 - }, - { - "epoch": 0.3032191069574247, - "grad_norm": 1.5234375, - "learning_rate": 0.0008367731056779475, - "loss": 6.5641, - "step": 292 - }, - { - "epoch": 0.304257528556594, - "grad_norm": 1.3515625, - "learning_rate": 0.0008355008273140221, - "loss": 6.6613, - "step": 293 - }, - { - "epoch": 0.3052959501557632, - "grad_norm": 1.6640625, - "learning_rate": 0.0008342245852524229, - "loss": 6.4916, - "step": 294 - }, - { - "epoch": 0.3063343717549325, - "grad_norm": 1.5390625, - "learning_rate": 0.0008329443945710196, - "loss": 6.5246, - "step": 295 - }, - { - "epoch": 0.3073727933541018, - "grad_norm": 2.375, - "learning_rate": 0.0008316602703943314, - "loss": 6.2067, - "step": 296 - }, - { - "epoch": 0.308411214953271, - "grad_norm": 1.8984375, - "learning_rate": 0.0008303722278933497, - "loss": 6.7264, - "step": 297 - }, - { - "epoch": 0.3094496365524403, - "grad_norm": 1.453125, - "learning_rate": 0.0008290802822853575, - "loss": 6.5385, - "step": 298 - }, - { - "epoch": 0.31048805815160957, - "grad_norm": 1.765625, - "learning_rate": 0.00082778444883375, - "loss": 6.4497, - "step": 299 - }, - { - "epoch": 0.3115264797507788, - "grad_norm": 1.5390625, - "learning_rate": 0.0008264847428478549, - "loss": 6.6755, - "step": 300 - }, - { - "epoch": 0.3115264797507788, - "eval_loss": 6.586581230163574, - "eval_runtime": 1.5864, - "eval_samples_per_second": 10.086, - "eval_steps_per_second": 1.261, - "step": 300 - }, - { - "epoch": 0.3125649013499481, - "grad_norm": 1.7890625, - "learning_rate": 0.0008251811796827506, - "loss": 6.7036, - "step": 301 - }, - { - "epoch": 0.31360332294911736, - "grad_norm": 1.734375, - "learning_rate": 0.0008238737747390859, - "loss": 6.641, - "step": 302 - }, - { - "epoch": 0.3146417445482866, - "grad_norm": 1.6875, - "learning_rate": 0.0008225625434628969, - "loss": 6.4883, - "step": 303 - }, - { - "epoch": 0.31568016614745587, - "grad_norm": 1.6796875, - "learning_rate": 0.0008212475013454248, - "loss": 6.5421, - "step": 304 - }, - { - "epoch": 0.31671858774662515, - "grad_norm": 1.6171875, - "learning_rate": 0.0008199286639229339, - "loss": 6.5171, - "step": 305 - }, - { - "epoch": 0.3177570093457944, - "grad_norm": 1.5859375, - "learning_rate": 0.0008186060467765268, - "loss": 6.4292, - "step": 306 - }, - { - "epoch": 0.31879543094496365, - "grad_norm": 1.6953125, - "learning_rate": 0.0008172796655319606, - "loss": 6.604, - "step": 307 - }, - { - "epoch": 0.31983385254413293, - "grad_norm": 1.6953125, - "learning_rate": 0.0008159495358594628, - "loss": 6.5217, - "step": 308 - }, - { - "epoch": 0.32087227414330216, - "grad_norm": 1.453125, - "learning_rate": 0.0008146156734735457, - "loss": 6.5842, - "step": 309 - }, - { - "epoch": 0.32191069574247144, - "grad_norm": 1.640625, - "learning_rate": 0.0008132780941328211, - "loss": 6.4846, - "step": 310 - }, - { - "epoch": 0.3229491173416407, - "grad_norm": 1.8046875, - "learning_rate": 0.0008119368136398133, - "loss": 6.488, - "step": 311 - }, - { - "epoch": 0.32398753894080995, - "grad_norm": 1.890625, - "learning_rate": 0.0008105918478407739, - "loss": 6.6186, - "step": 312 - }, - { - "epoch": 0.32502596053997923, - "grad_norm": 2.109375, - "learning_rate": 0.0008092432126254931, - "loss": 6.5442, - "step": 313 - }, - { - "epoch": 0.3260643821391485, - "grad_norm": 1.609375, - "learning_rate": 0.0008078909239271126, - "loss": 6.5886, - "step": 314 - }, - { - "epoch": 0.32710280373831774, - "grad_norm": 1.796875, - "learning_rate": 0.0008065349977219379, - "loss": 6.5669, - "step": 315 - }, - { - "epoch": 0.328141225337487, - "grad_norm": 2.109375, - "learning_rate": 0.0008051754500292479, - "loss": 6.6405, - "step": 316 - }, - { - "epoch": 0.3291796469366563, - "grad_norm": 1.8671875, - "learning_rate": 0.0008038122969111079, - "loss": 6.4938, - "step": 317 - }, - { - "epoch": 0.3302180685358255, - "grad_norm": 1.5390625, - "learning_rate": 0.0008024455544721779, - "loss": 6.5688, - "step": 318 - }, - { - "epoch": 0.3312564901349948, - "grad_norm": 1.9296875, - "learning_rate": 0.0008010752388595235, - "loss": 6.5988, - "step": 319 - }, - { - "epoch": 0.3322949117341641, - "grad_norm": 1.640625, - "learning_rate": 0.0007997013662624246, - "loss": 6.4479, - "step": 320 - }, - { - "epoch": 0.3333333333333333, - "grad_norm": 1.8828125, - "learning_rate": 0.0007983239529121843, - "loss": 6.3366, - "step": 321 - }, - { - "epoch": 0.3343717549325026, - "grad_norm": 1.6171875, - "learning_rate": 0.0007969430150819372, - "loss": 6.4521, - "step": 322 - }, - { - "epoch": 0.3354101765316719, - "grad_norm": 1.6796875, - "learning_rate": 0.0007955585690864566, - "loss": 6.2812, - "step": 323 - }, - { - "epoch": 0.3364485981308411, - "grad_norm": 1.921875, - "learning_rate": 0.0007941706312819631, - "loss": 6.4711, - "step": 324 - }, - { - "epoch": 0.3374870197300104, - "grad_norm": 1.6484375, - "learning_rate": 0.0007927792180659296, - "loss": 6.4105, - "step": 325 - }, - { - "epoch": 0.33852544132917967, - "grad_norm": 1.7890625, - "learning_rate": 0.0007913843458768891, - "loss": 6.4175, - "step": 326 - }, - { - "epoch": 0.3395638629283489, - "grad_norm": 1.890625, - "learning_rate": 0.0007899860311942394, - "loss": 6.493, - "step": 327 - }, - { - "epoch": 0.3406022845275182, - "grad_norm": 1.6171875, - "learning_rate": 0.000788584290538049, - "loss": 6.553, - "step": 328 - }, - { - "epoch": 0.34164070612668745, - "grad_norm": 1.65625, - "learning_rate": 0.0007871791404688617, - "loss": 6.512, - "step": 329 - }, - { - "epoch": 0.3426791277258567, - "grad_norm": 1.6328125, - "learning_rate": 0.0007857705975875015, - "loss": 6.4949, - "step": 330 - }, - { - "epoch": 0.34371754932502596, - "grad_norm": 1.7265625, - "learning_rate": 0.0007843586785348752, - "loss": 6.4785, - "step": 331 - }, - { - "epoch": 0.34475597092419524, - "grad_norm": 1.5234375, - "learning_rate": 0.0007829433999917773, - "loss": 6.452, - "step": 332 - }, - { - "epoch": 0.34579439252336447, - "grad_norm": 1.765625, - "learning_rate": 0.0007815247786786919, - "loss": 6.6337, - "step": 333 - }, - { - "epoch": 0.34683281412253375, - "grad_norm": 1.40625, - "learning_rate": 0.0007801028313555953, - "loss": 6.4057, - "step": 334 - }, - { - "epoch": 0.34787123572170303, - "grad_norm": 1.5625, - "learning_rate": 0.000778677574821759, - "loss": 6.3247, - "step": 335 - }, - { - "epoch": 0.34890965732087226, - "grad_norm": 1.671875, - "learning_rate": 0.0007772490259155494, - "loss": 6.4594, - "step": 336 - }, - { - "epoch": 0.34994807892004154, - "grad_norm": 1.5078125, - "learning_rate": 0.0007758172015142302, - "loss": 6.4916, - "step": 337 - }, - { - "epoch": 0.3509865005192108, - "grad_norm": 1.75, - "learning_rate": 0.0007743821185337633, - "loss": 6.6281, - "step": 338 - }, - { - "epoch": 0.35202492211838005, - "grad_norm": 1.8125, - "learning_rate": 0.0007729437939286074, - "loss": 6.4948, - "step": 339 - }, - { - "epoch": 0.3530633437175493, - "grad_norm": 1.6640625, - "learning_rate": 0.0007715022446915194, - "loss": 6.6097, - "step": 340 - }, - { - "epoch": 0.3541017653167186, - "grad_norm": 1.609375, - "learning_rate": 0.0007700574878533524, - "loss": 6.5094, - "step": 341 - }, - { - "epoch": 0.35514018691588783, - "grad_norm": 1.5703125, - "learning_rate": 0.0007686095404828553, - "loss": 6.5109, - "step": 342 - }, - { - "epoch": 0.3561786085150571, - "grad_norm": 2.0625, - "learning_rate": 0.0007671584196864702, - "loss": 6.519, - "step": 343 - }, - { - "epoch": 0.3572170301142264, - "grad_norm": 1.5234375, - "learning_rate": 0.0007657041426081319, - "loss": 6.5761, - "step": 344 - }, - { - "epoch": 0.3582554517133956, - "grad_norm": 1.8671875, - "learning_rate": 0.0007642467264290636, - "loss": 6.6179, - "step": 345 - }, - { - "epoch": 0.3592938733125649, - "grad_norm": 1.625, - "learning_rate": 0.0007627861883675748, - "loss": 6.5245, - "step": 346 - }, - { - "epoch": 0.3603322949117342, - "grad_norm": 1.8046875, - "learning_rate": 0.0007613225456788579, - "loss": 6.3649, - "step": 347 - }, - { - "epoch": 0.3613707165109034, - "grad_norm": 1.9296875, - "learning_rate": 0.0007598558156547841, - "loss": 6.3155, - "step": 348 - }, - { - "epoch": 0.3624091381100727, - "grad_norm": 1.5078125, - "learning_rate": 0.0007583860156236997, - "loss": 6.4403, - "step": 349 - }, - { - "epoch": 0.363447559709242, - "grad_norm": 1.4296875, - "learning_rate": 0.0007569131629502201, - "loss": 6.5533, - "step": 350 - }, - { - "epoch": 0.3644859813084112, - "grad_norm": 1.6953125, - "learning_rate": 0.0007554372750350259, - "loss": 6.4351, - "step": 351 - }, - { - "epoch": 0.3655244029075805, - "grad_norm": 1.28125, - "learning_rate": 0.000753958369314657, - "loss": 6.511, - "step": 352 - }, - { - "epoch": 0.36656282450674976, - "grad_norm": 2.421875, - "learning_rate": 0.000752476463261306, - "loss": 6.4314, - "step": 353 - }, - { - "epoch": 0.367601246105919, - "grad_norm": 1.421875, - "learning_rate": 0.0007509915743826128, - "loss": 6.4206, - "step": 354 - }, - { - "epoch": 0.36863966770508827, - "grad_norm": 1.5546875, - "learning_rate": 0.0007495037202214565, - "loss": 6.5012, - "step": 355 - }, - { - "epoch": 0.36967808930425755, - "grad_norm": 1.3203125, - "learning_rate": 0.0007480129183557498, - "loss": 6.4068, - "step": 356 - }, - { - "epoch": 0.3707165109034268, - "grad_norm": 1.4140625, - "learning_rate": 0.0007465191863982295, - "loss": 6.5955, - "step": 357 - }, - { - "epoch": 0.37175493250259606, - "grad_norm": 1.484375, - "learning_rate": 0.0007450225419962498, - "loss": 6.4707, - "step": 358 - }, - { - "epoch": 0.37279335410176534, - "grad_norm": 1.7109375, - "learning_rate": 0.0007435230028315731, - "loss": 6.4684, - "step": 359 - }, - { - "epoch": 0.37383177570093457, - "grad_norm": 1.2265625, - "learning_rate": 0.0007420205866201614, - "loss": 6.4594, - "step": 360 - }, - { - "epoch": 0.37487019730010385, - "grad_norm": 1.75, - "learning_rate": 0.0007405153111119668, - "loss": 6.4225, - "step": 361 - }, - { - "epoch": 0.37590861889927313, - "grad_norm": 1.65625, - "learning_rate": 0.0007390071940907222, - "loss": 6.394, - "step": 362 - }, - { - "epoch": 0.37694704049844235, - "grad_norm": 1.796875, - "learning_rate": 0.0007374962533737304, - "loss": 6.3531, - "step": 363 - }, - { - "epoch": 0.37798546209761164, - "grad_norm": 1.6484375, - "learning_rate": 0.0007359825068116548, - "loss": 6.2633, - "step": 364 - }, - { - "epoch": 0.3790238836967809, - "grad_norm": 1.25, - "learning_rate": 0.0007344659722883072, - "loss": 6.5055, - "step": 365 - }, - { - "epoch": 0.38006230529595014, - "grad_norm": 1.3046875, - "learning_rate": 0.000732946667720437, - "loss": 6.4877, - "step": 366 - }, - { - "epoch": 0.3811007268951194, - "grad_norm": 2.8125, - "learning_rate": 0.0007314246110575206, - "loss": 6.5176, - "step": 367 - }, - { - "epoch": 0.3821391484942887, - "grad_norm": 1.75, - "learning_rate": 0.0007298998202815473, - "loss": 6.5267, - "step": 368 - }, - { - "epoch": 0.38317757009345793, - "grad_norm": 1.5, - "learning_rate": 0.0007283723134068089, - "loss": 6.4728, - "step": 369 - }, - { - "epoch": 0.3842159916926272, - "grad_norm": 1.953125, - "learning_rate": 0.0007268421084796851, - "loss": 6.3112, - "step": 370 - }, - { - "epoch": 0.3852544132917965, - "grad_norm": 1.6953125, - "learning_rate": 0.0007253092235784317, - "loss": 6.3693, - "step": 371 - }, - { - "epoch": 0.3862928348909657, - "grad_norm": 1.796875, - "learning_rate": 0.0007237736768129663, - "loss": 6.4253, - "step": 372 - }, - { - "epoch": 0.387331256490135, - "grad_norm": 1.578125, - "learning_rate": 0.0007222354863246542, - "loss": 6.566, - "step": 373 - }, - { - "epoch": 0.3883696780893043, - "grad_norm": 1.5625, - "learning_rate": 0.0007206946702860947, - "loss": 6.49, - "step": 374 - }, - { - "epoch": 0.3894080996884735, - "grad_norm": 1.4453125, - "learning_rate": 0.000719151246900906, - "loss": 6.5474, - "step": 375 - }, - { - "epoch": 0.3904465212876428, - "grad_norm": 1.84375, - "learning_rate": 0.00071760523440351, - "loss": 6.111, - "step": 376 - }, - { - "epoch": 0.39148494288681207, - "grad_norm": 3.21875, - "learning_rate": 0.0007160566510589174, - "loss": 6.1774, - "step": 377 - }, - { - "epoch": 0.3925233644859813, - "grad_norm": 1.5390625, - "learning_rate": 0.0007145055151625113, - "loss": 6.4977, - "step": 378 - }, - { - "epoch": 0.3935617860851506, - "grad_norm": 1.4453125, - "learning_rate": 0.0007129518450398314, - "loss": 6.3905, - "step": 379 - }, - { - "epoch": 0.39460020768431986, - "grad_norm": 1.9609375, - "learning_rate": 0.0007113956590463575, - "loss": 6.2437, - "step": 380 - }, - { - "epoch": 0.3956386292834891, - "grad_norm": 1.4140625, - "learning_rate": 0.0007098369755672926, - "loss": 6.3716, - "step": 381 - }, - { - "epoch": 0.39667705088265837, - "grad_norm": 1.546875, - "learning_rate": 0.0007082758130173456, - "loss": 6.4034, - "step": 382 - }, - { - "epoch": 0.39771547248182765, - "grad_norm": 1.796875, - "learning_rate": 0.0007067121898405138, - "loss": 6.4803, - "step": 383 - }, - { - "epoch": 0.3987538940809969, - "grad_norm": 1.5859375, - "learning_rate": 0.0007051461245098653, - "loss": 6.2609, - "step": 384 - }, - { - "epoch": 0.39979231568016615, - "grad_norm": 1.8515625, - "learning_rate": 0.0007035776355273199, - "loss": 6.3421, - "step": 385 - }, - { - "epoch": 0.40083073727933544, - "grad_norm": 1.3515625, - "learning_rate": 0.0007020067414234315, - "loss": 6.3368, - "step": 386 - }, - { - "epoch": 0.40186915887850466, - "grad_norm": 1.3515625, - "learning_rate": 0.0007004334607571687, - "loss": 6.2582, - "step": 387 - }, - { - "epoch": 0.40290758047767394, - "grad_norm": 1.96875, - "learning_rate": 0.0006988578121156955, - "loss": 6.4665, - "step": 388 - }, - { - "epoch": 0.4039460020768432, - "grad_norm": 1.515625, - "learning_rate": 0.000697279814114152, - "loss": 6.486, - "step": 389 - }, - { - "epoch": 0.40498442367601245, - "grad_norm": 1.734375, - "learning_rate": 0.0006956994853954342, - "loss": 6.4766, - "step": 390 - }, - { - "epoch": 0.40602284527518173, - "grad_norm": 1.5703125, - "learning_rate": 0.0006941168446299733, - "loss": 6.235, - "step": 391 - }, - { - "epoch": 0.407061266874351, - "grad_norm": 2.875, - "learning_rate": 0.0006925319105155164, - "loss": 6.0013, - "step": 392 - }, - { - "epoch": 0.40809968847352024, - "grad_norm": 1.796875, - "learning_rate": 0.0006909447017769046, - "loss": 6.2921, - "step": 393 - }, - { - "epoch": 0.4091381100726895, - "grad_norm": 1.5078125, - "learning_rate": 0.000689355237165852, - "loss": 6.3846, - "step": 394 - }, - { - "epoch": 0.4101765316718588, - "grad_norm": 1.7890625, - "learning_rate": 0.0006877635354607238, - "loss": 6.365, - "step": 395 - }, - { - "epoch": 0.411214953271028, - "grad_norm": 1.328125, - "learning_rate": 0.0006861696154663158, - "loss": 6.4089, - "step": 396 - }, - { - "epoch": 0.4122533748701973, - "grad_norm": 1.1796875, - "learning_rate": 0.0006845734960136301, - "loss": 6.4685, - "step": 397 - }, - { - "epoch": 0.4132917964693666, - "grad_norm": 1.3046875, - "learning_rate": 0.0006829751959596544, - "loss": 6.4737, - "step": 398 - }, - { - "epoch": 0.4143302180685358, - "grad_norm": 2.09375, - "learning_rate": 0.000681374734187139, - "loss": 5.9592, - "step": 399 - }, - { - "epoch": 0.4153686396677051, - "grad_norm": 1.7109375, - "learning_rate": 0.0006797721296043726, - "loss": 6.5409, - "step": 400 - }, - { - "epoch": 0.4153686396677051, - "eval_loss": 6.42236328125, - "eval_runtime": 1.6609, - "eval_samples_per_second": 9.633, - "eval_steps_per_second": 1.204, - "step": 400 - }, - { - "epoch": 0.4164070612668744, - "grad_norm": 1.875, - "learning_rate": 0.0006781674011449602, - "loss": 6.3909, - "step": 401 - }, - { - "epoch": 0.4174454828660436, - "grad_norm": 1.6328125, - "learning_rate": 0.0006765605677675981, - "loss": 6.3576, - "step": 402 - }, - { - "epoch": 0.4184839044652129, - "grad_norm": 1.40625, - "learning_rate": 0.0006749516484558518, - "loss": 6.3327, - "step": 403 - }, - { - "epoch": 0.4195223260643821, - "grad_norm": 1.5546875, - "learning_rate": 0.0006733406622179294, - "loss": 6.4531, - "step": 404 - }, - { - "epoch": 0.4205607476635514, - "grad_norm": 1.3515625, - "learning_rate": 0.000671727628086459, - "loss": 6.4259, - "step": 405 - }, - { - "epoch": 0.4215991692627207, - "grad_norm": 1.421875, - "learning_rate": 0.0006701125651182631, - "loss": 6.3521, - "step": 406 - }, - { - "epoch": 0.4226375908618899, - "grad_norm": 1.6484375, - "learning_rate": 0.000668495492394133, - "loss": 6.4586, - "step": 407 - }, - { - "epoch": 0.4236760124610592, - "grad_norm": 1.6484375, - "learning_rate": 0.0006668764290186039, - "loss": 6.3092, - "step": 408 - }, - { - "epoch": 0.42471443406022846, - "grad_norm": 1.328125, - "learning_rate": 0.0006652553941197294, - "loss": 6.4544, - "step": 409 - }, - { - "epoch": 0.4257528556593977, - "grad_norm": 2.078125, - "learning_rate": 0.0006636324068488547, - "loss": 6.4361, - "step": 410 - }, - { - "epoch": 0.42679127725856697, - "grad_norm": 1.7109375, - "learning_rate": 0.0006620074863803913, - "loss": 6.3362, - "step": 411 - }, - { - "epoch": 0.42782969885773625, - "grad_norm": 1.8203125, - "learning_rate": 0.0006603806519115899, - "loss": 6.4031, - "step": 412 - }, - { - "epoch": 0.4288681204569055, - "grad_norm": 1.6484375, - "learning_rate": 0.0006587519226623137, - "loss": 6.3723, - "step": 413 - }, - { - "epoch": 0.42990654205607476, - "grad_norm": 2.09375, - "learning_rate": 0.0006571213178748112, - "loss": 6.5301, - "step": 414 - }, - { - "epoch": 0.43094496365524404, - "grad_norm": 1.609375, - "learning_rate": 0.0006554888568134894, - "loss": 6.3084, - "step": 415 - }, - { - "epoch": 0.43198338525441327, - "grad_norm": 1.9921875, - "learning_rate": 0.0006538545587646854, - "loss": 6.4097, - "step": 416 - }, - { - "epoch": 0.43302180685358255, - "grad_norm": 2.09375, - "learning_rate": 0.0006522184430364391, - "loss": 6.2246, - "step": 417 - }, - { - "epoch": 0.43406022845275183, - "grad_norm": 1.875, - "learning_rate": 0.0006505805289582649, - "loss": 6.4737, - "step": 418 - }, - { - "epoch": 0.43509865005192105, - "grad_norm": 1.53125, - "learning_rate": 0.0006489408358809239, - "loss": 6.4345, - "step": 419 - }, - { - "epoch": 0.43613707165109034, - "grad_norm": 1.421875, - "learning_rate": 0.000647299383176194, - "loss": 6.2794, - "step": 420 - }, - { - "epoch": 0.4371754932502596, - "grad_norm": 1.578125, - "learning_rate": 0.0006456561902366424, - "loss": 6.3705, - "step": 421 - }, - { - "epoch": 0.43821391484942884, - "grad_norm": 1.3828125, - "learning_rate": 0.0006440112764753956, - "loss": 6.3456, - "step": 422 - }, - { - "epoch": 0.4392523364485981, - "grad_norm": 1.7109375, - "learning_rate": 0.0006423646613259104, - "loss": 6.4904, - "step": 423 - }, - { - "epoch": 0.4402907580477674, - "grad_norm": 1.5390625, - "learning_rate": 0.0006407163642417442, - "loss": 6.2549, - "step": 424 - }, - { - "epoch": 0.44132917964693663, - "grad_norm": 1.9375, - "learning_rate": 0.0006390664046963256, - "loss": 6.4673, - "step": 425 - }, - { - "epoch": 0.4423676012461059, - "grad_norm": 1.578125, - "learning_rate": 0.0006374148021827237, - "loss": 6.4597, - "step": 426 - }, - { - "epoch": 0.4434060228452752, - "grad_norm": 1.4296875, - "learning_rate": 0.0006357615762134178, - "loss": 6.3055, - "step": 427 - }, - { - "epoch": 0.4444444444444444, - "grad_norm": 1.4921875, - "learning_rate": 0.0006341067463200677, - "loss": 6.3134, - "step": 428 - }, - { - "epoch": 0.4454828660436137, - "grad_norm": 1.8359375, - "learning_rate": 0.0006324503320532819, - "loss": 5.9713, - "step": 429 - }, - { - "epoch": 0.446521287642783, - "grad_norm": 1.4765625, - "learning_rate": 0.0006307923529823876, - "loss": 6.3577, - "step": 430 - }, - { - "epoch": 0.4475597092419522, - "grad_norm": 3.546875, - "learning_rate": 0.0006291328286951985, - "loss": 6.227, - "step": 431 - }, - { - "epoch": 0.4485981308411215, - "grad_norm": 1.796875, - "learning_rate": 0.0006274717787977842, - "loss": 6.1991, - "step": 432 - }, - { - "epoch": 0.44963655244029077, - "grad_norm": 1.5625, - "learning_rate": 0.0006258092229142382, - "loss": 6.4064, - "step": 433 - }, - { - "epoch": 0.45067497403946, - "grad_norm": 1.734375, - "learning_rate": 0.0006241451806864465, - "loss": 6.4659, - "step": 434 - }, - { - "epoch": 0.4517133956386293, - "grad_norm": 1.3359375, - "learning_rate": 0.0006224796717738544, - "loss": 6.3568, - "step": 435 - }, - { - "epoch": 0.45275181723779856, - "grad_norm": 1.40625, - "learning_rate": 0.0006208127158532357, - "loss": 6.2009, - "step": 436 - }, - { - "epoch": 0.4537902388369678, - "grad_norm": 1.9765625, - "learning_rate": 0.0006191443326184591, - "loss": 6.3971, - "step": 437 - }, - { - "epoch": 0.45482866043613707, - "grad_norm": 1.625, - "learning_rate": 0.0006174745417802563, - "loss": 6.5802, - "step": 438 - }, - { - "epoch": 0.45586708203530635, - "grad_norm": 3.25, - "learning_rate": 0.0006158033630659886, - "loss": 5.9341, - "step": 439 - }, - { - "epoch": 0.4569055036344756, - "grad_norm": 1.6171875, - "learning_rate": 0.0006141308162194142, - "loss": 6.2119, - "step": 440 - }, - { - "epoch": 0.45794392523364486, - "grad_norm": 1.5859375, - "learning_rate": 0.0006124569210004547, - "loss": 6.3936, - "step": 441 - }, - { - "epoch": 0.45898234683281414, - "grad_norm": 1.9296875, - "learning_rate": 0.0006107816971849616, - "loss": 6.1906, - "step": 442 - }, - { - "epoch": 0.46002076843198336, - "grad_norm": 1.484375, - "learning_rate": 0.000609105164564483, - "loss": 6.1966, - "step": 443 - }, - { - "epoch": 0.46105919003115264, - "grad_norm": 1.3984375, - "learning_rate": 0.0006074273429460295, - "loss": 6.358, - "step": 444 - }, - { - "epoch": 0.4620976116303219, - "grad_norm": 1.8828125, - "learning_rate": 0.0006057482521518403, - "loss": 6.4207, - "step": 445 - }, - { - "epoch": 0.46313603322949115, - "grad_norm": 1.546875, - "learning_rate": 0.0006040679120191491, - "loss": 6.3122, - "step": 446 - }, - { - "epoch": 0.46417445482866043, - "grad_norm": 1.7734375, - "learning_rate": 0.0006023863423999496, - "loss": 6.2674, - "step": 447 - }, - { - "epoch": 0.4652128764278297, - "grad_norm": 1.6171875, - "learning_rate": 0.0006007035631607605, - "loss": 6.285, - "step": 448 - }, - { - "epoch": 0.46625129802699894, - "grad_norm": 1.546875, - "learning_rate": 0.0005990195941823916, - "loss": 6.2897, - "step": 449 - }, - { - "epoch": 0.4672897196261682, - "grad_norm": 1.703125, - "learning_rate": 0.0005973344553597091, - "loss": 6.3552, - "step": 450 - }, - { - "epoch": 0.4683281412253375, - "grad_norm": 1.671875, - "learning_rate": 0.0005956481666013993, - "loss": 6.3373, - "step": 451 - }, - { - "epoch": 0.46936656282450673, - "grad_norm": 1.6171875, - "learning_rate": 0.0005939607478297346, - "loss": 6.2862, - "step": 452 - }, - { - "epoch": 0.470404984423676, - "grad_norm": 1.8984375, - "learning_rate": 0.0005922722189803378, - "loss": 6.253, - "step": 453 - }, - { - "epoch": 0.4714434060228453, - "grad_norm": 1.4140625, - "learning_rate": 0.0005905826000019458, - "loss": 6.3616, - "step": 454 - }, - { - "epoch": 0.4724818276220145, - "grad_norm": 4.125, - "learning_rate": 0.0005888919108561749, - "loss": 6.4911, - "step": 455 - }, - { - "epoch": 0.4735202492211838, - "grad_norm": 1.6953125, - "learning_rate": 0.0005872001715172853, - "loss": 6.3549, - "step": 456 - }, - { - "epoch": 0.4745586708203531, - "grad_norm": 1.796875, - "learning_rate": 0.0005855074019719433, - "loss": 6.3192, - "step": 457 - }, - { - "epoch": 0.4755970924195223, - "grad_norm": 1.5078125, - "learning_rate": 0.0005838136222189874, - "loss": 6.2864, - "step": 458 - }, - { - "epoch": 0.4766355140186916, - "grad_norm": 1.4921875, - "learning_rate": 0.0005821188522691903, - "loss": 6.4379, - "step": 459 - }, - { - "epoch": 0.47767393561786087, - "grad_norm": 1.25, - "learning_rate": 0.0005804231121450236, - "loss": 6.4266, - "step": 460 - }, - { - "epoch": 0.4787123572170301, - "grad_norm": 1.3125, - "learning_rate": 0.0005787264218804201, - "loss": 6.3983, - "step": 461 - }, - { - "epoch": 0.4797507788161994, - "grad_norm": 1.6015625, - "learning_rate": 0.0005770288015205385, - "loss": 6.3447, - "step": 462 - }, - { - "epoch": 0.48078920041536866, - "grad_norm": 1.984375, - "learning_rate": 0.0005753302711215259, - "loss": 6.1843, - "step": 463 - }, - { - "epoch": 0.4818276220145379, - "grad_norm": 1.984375, - "learning_rate": 0.0005736308507502804, - "loss": 6.4501, - "step": 464 - }, - { - "epoch": 0.48286604361370716, - "grad_norm": 1.484375, - "learning_rate": 0.0005719305604842148, - "loss": 6.3637, - "step": 465 - }, - { - "epoch": 0.48390446521287644, - "grad_norm": 1.6796875, - "learning_rate": 0.000570229420411019, - "loss": 6.2897, - "step": 466 - }, - { - "epoch": 0.48494288681204567, - "grad_norm": 1.640625, - "learning_rate": 0.0005685274506284226, - "loss": 6.3343, - "step": 467 - }, - { - "epoch": 0.48598130841121495, - "grad_norm": 1.3671875, - "learning_rate": 0.0005668246712439578, - "loss": 6.4351, - "step": 468 - }, - { - "epoch": 0.48701973001038423, - "grad_norm": 1.9765625, - "learning_rate": 0.0005651211023747216, - "loss": 6.0338, - "step": 469 - }, - { - "epoch": 0.48805815160955346, - "grad_norm": 1.421875, - "learning_rate": 0.0005634167641471383, - "loss": 6.1922, - "step": 470 - }, - { - "epoch": 0.48909657320872274, - "grad_norm": 1.3203125, - "learning_rate": 0.0005617116766967212, - "loss": 6.6165, - "step": 471 - }, - { - "epoch": 0.490134994807892, - "grad_norm": 2.515625, - "learning_rate": 0.0005600058601678358, - "loss": 6.311, - "step": 472 - }, - { - "epoch": 0.49117341640706125, - "grad_norm": 1.46875, - "learning_rate": 0.0005582993347134603, - "loss": 6.4132, - "step": 473 - }, - { - "epoch": 0.49221183800623053, - "grad_norm": 1.6484375, - "learning_rate": 0.0005565921204949492, - "loss": 6.2585, - "step": 474 - }, - { - "epoch": 0.4932502596053998, - "grad_norm": 2.03125, - "learning_rate": 0.0005548842376817935, - "loss": 6.3609, - "step": 475 - }, - { - "epoch": 0.49428868120456904, - "grad_norm": 1.6484375, - "learning_rate": 0.0005531757064513835, - "loss": 6.3305, - "step": 476 - }, - { - "epoch": 0.4953271028037383, - "grad_norm": 1.421875, - "learning_rate": 0.0005514665469887705, - "loss": 6.2568, - "step": 477 - }, - { - "epoch": 0.4963655244029076, - "grad_norm": 1.75, - "learning_rate": 0.000549756779486427, - "loss": 6.2414, - "step": 478 - }, - { - "epoch": 0.4974039460020768, - "grad_norm": 1.46875, - "learning_rate": 0.0005480464241440096, - "loss": 6.3746, - "step": 479 - }, - { - "epoch": 0.4984423676012461, - "grad_norm": 1.484375, - "learning_rate": 0.0005463355011681199, - "loss": 6.3126, - "step": 480 - }, - { - "epoch": 0.4994807892004154, - "grad_norm": 1.3359375, - "learning_rate": 0.0005446240307720653, - "loss": 6.3338, - "step": 481 - }, - { - "epoch": 0.5005192107995846, - "grad_norm": 1.3359375, - "learning_rate": 0.0005429120331756208, - "loss": 6.3524, - "step": 482 - }, - { - "epoch": 0.5015576323987538, - "grad_norm": 1.71875, - "learning_rate": 0.00054119952860479, - "loss": 6.2864, - "step": 483 - }, - { - "epoch": 0.5025960539979232, - "grad_norm": 1.734375, - "learning_rate": 0.0005394865372915656, - "loss": 6.3503, - "step": 484 - }, - { - "epoch": 0.5036344755970924, - "grad_norm": 1.78125, - "learning_rate": 0.0005377730794736914, - "loss": 6.1829, - "step": 485 - }, - { - "epoch": 0.5046728971962616, - "grad_norm": 1.6484375, - "learning_rate": 0.000536059175394422, - "loss": 6.2451, - "step": 486 - }, - { - "epoch": 0.505711318795431, - "grad_norm": 2.265625, - "learning_rate": 0.0005343448453022847, - "loss": 5.8713, - "step": 487 - }, - { - "epoch": 0.5067497403946002, - "grad_norm": 1.3203125, - "learning_rate": 0.0005326301094508399, - "loss": 6.2755, - "step": 488 - }, - { - "epoch": 0.5077881619937694, - "grad_norm": 7.125, - "learning_rate": 0.0005309149880984411, - "loss": 6.4164, - "step": 489 - }, - { - "epoch": 0.5088265835929388, - "grad_norm": 2.0, - "learning_rate": 0.0005291995015079969, - "loss": 6.4206, - "step": 490 - }, - { - "epoch": 0.509865005192108, - "grad_norm": 1.734375, - "learning_rate": 0.0005274836699467307, - "loss": 6.2462, - "step": 491 - }, - { - "epoch": 0.5109034267912772, - "grad_norm": 2.234375, - "learning_rate": 0.0005257675136859414, - "loss": 6.0642, - "step": 492 - }, - { - "epoch": 0.5119418483904465, - "grad_norm": 2.53125, - "learning_rate": 0.000524051053000764, - "loss": 6.1371, - "step": 493 - }, - { - "epoch": 0.5129802699896158, - "grad_norm": 1.875, - "learning_rate": 0.0005223343081699302, - "loss": 6.4626, - "step": 494 - }, - { - "epoch": 0.514018691588785, - "grad_norm": 2.703125, - "learning_rate": 0.0005206172994755284, - "loss": 6.3176, - "step": 495 - }, - { - "epoch": 0.5150571131879543, - "grad_norm": 1.46875, - "learning_rate": 0.0005189000472027645, - "loss": 6.2687, - "step": 496 - }, - { - "epoch": 0.5160955347871236, - "grad_norm": 1.296875, - "learning_rate": 0.0005171825716397222, - "loss": 6.3934, - "step": 497 - }, - { - "epoch": 0.5171339563862928, - "grad_norm": 1.609375, - "learning_rate": 0.0005154648930771234, - "loss": 6.3839, - "step": 498 - }, - { - "epoch": 0.5181723779854621, - "grad_norm": 1.734375, - "learning_rate": 0.0005137470318080876, - "loss": 6.2297, - "step": 499 - }, - { - "epoch": 0.5192107995846313, - "grad_norm": 2.0625, - "learning_rate": 0.0005120290081278934, - "loss": 6.214, - "step": 500 - }, - { - "epoch": 0.5192107995846313, - "eval_loss": 6.333878040313721, - "eval_runtime": 1.7224, - "eval_samples_per_second": 9.289, - "eval_steps_per_second": 1.161, - "step": 500 - }, - { - "epoch": 0.5202492211838006, - "grad_norm": 1.6640625, - "learning_rate": 0.0005103108423337382, - "loss": 6.1978, - "step": 501 - }, - { - "epoch": 0.5212876427829699, - "grad_norm": 1.515625, - "learning_rate": 0.0005085925547244979, - "loss": 6.3541, - "step": 502 - }, - { - "epoch": 0.5223260643821391, - "grad_norm": 1.7265625, - "learning_rate": 0.000506874165600488, - "loss": 6.3855, - "step": 503 - }, - { - "epoch": 0.5233644859813084, - "grad_norm": 1.4921875, - "learning_rate": 0.0005051556952632235, - "loss": 6.1905, - "step": 504 - }, - { - "epoch": 0.5244029075804777, - "grad_norm": 1.7578125, - "learning_rate": 0.0005034371640151781, - "loss": 6.2397, - "step": 505 - }, - { - "epoch": 0.5254413291796469, - "grad_norm": 1.671875, - "learning_rate": 0.0005017185921595457, - "loss": 6.3751, - "step": 506 - }, - { - "epoch": 0.5264797507788161, - "grad_norm": 2.109375, - "learning_rate": 0.0005, - "loss": 6.5016, - "step": 507 - }, - { - "epoch": 0.5275181723779855, - "grad_norm": 1.5625, - "learning_rate": 0.0004982814078404544, - "loss": 6.1849, - "step": 508 - }, - { - "epoch": 0.5285565939771547, - "grad_norm": 3.90625, - "learning_rate": 0.000496562835984822, - "loss": 5.6967, - "step": 509 - }, - { - "epoch": 0.5295950155763239, - "grad_norm": 1.5078125, - "learning_rate": 0.0004948443047367768, - "loss": 6.3165, - "step": 510 - }, - { - "epoch": 0.5306334371754933, - "grad_norm": 1.3515625, - "learning_rate": 0.0004931258343995119, - "loss": 6.3457, - "step": 511 - }, - { - "epoch": 0.5316718587746625, - "grad_norm": 1.5546875, - "learning_rate": 0.0004914074452755021, - "loss": 6.2912, - "step": 512 - }, - { - "epoch": 0.5327102803738317, - "grad_norm": 2.875, - "learning_rate": 0.000489689157666262, - "loss": 6.436, - "step": 513 - }, - { - "epoch": 0.5337487019730011, - "grad_norm": 1.9140625, - "learning_rate": 0.0004879709918721067, - "loss": 6.388, - "step": 514 - }, - { - "epoch": 0.5347871235721703, - "grad_norm": 1.5390625, - "learning_rate": 0.0004862529681919125, - "loss": 6.3625, - "step": 515 - }, - { - "epoch": 0.5358255451713395, - "grad_norm": 1.421875, - "learning_rate": 0.0004845351069228767, - "loss": 6.3582, - "step": 516 - }, - { - "epoch": 0.5368639667705088, - "grad_norm": 1.3671875, - "learning_rate": 0.0004828174283602778, - "loss": 6.2882, - "step": 517 - }, - { - "epoch": 0.5379023883696781, - "grad_norm": 1.3515625, - "learning_rate": 0.0004810999527972355, - "loss": 6.3509, - "step": 518 - }, - { - "epoch": 0.5389408099688473, - "grad_norm": 1.8203125, - "learning_rate": 0.0004793827005244717, - "loss": 6.1571, - "step": 519 - }, - { - "epoch": 0.5399792315680166, - "grad_norm": 1.8984375, - "learning_rate": 0.0004776656918300699, - "loss": 6.4146, - "step": 520 - }, - { - "epoch": 0.5410176531671859, - "grad_norm": 1.8828125, - "learning_rate": 0.00047594894699923605, - "loss": 6.2866, - "step": 521 - }, - { - "epoch": 0.5420560747663551, - "grad_norm": 1.390625, - "learning_rate": 0.0004742324863140586, - "loss": 6.3875, - "step": 522 - }, - { - "epoch": 0.5430944963655244, - "grad_norm": 1.8203125, - "learning_rate": 0.0004725163300532693, - "loss": 6.3196, - "step": 523 - }, - { - "epoch": 0.5441329179646937, - "grad_norm": 1.796875, - "learning_rate": 0.000470800498492003, - "loss": 6.1896, - "step": 524 - }, - { - "epoch": 0.5451713395638629, - "grad_norm": 1.890625, - "learning_rate": 0.0004690850119015589, - "loss": 6.1498, - "step": 525 - }, - { - "epoch": 0.5462097611630322, - "grad_norm": 2.359375, - "learning_rate": 0.00046736989054916016, - "loss": 5.9399, - "step": 526 - }, - { - "epoch": 0.5472481827622014, - "grad_norm": 1.96875, - "learning_rate": 0.00046565515469771524, - "loss": 6.011, - "step": 527 - }, - { - "epoch": 0.5482866043613707, - "grad_norm": 1.4453125, - "learning_rate": 0.0004639408246055781, - "loss": 6.2529, - "step": 528 - }, - { - "epoch": 0.54932502596054, - "grad_norm": 1.375, - "learning_rate": 0.00046222692052630876, - "loss": 6.2216, - "step": 529 - }, - { - "epoch": 0.5503634475597092, - "grad_norm": 1.59375, - "learning_rate": 0.00046051346270843446, - "loss": 6.2293, - "step": 530 - }, - { - "epoch": 0.5514018691588785, - "grad_norm": 1.796875, - "learning_rate": 0.00045880047139521, - "loss": 6.2045, - "step": 531 - }, - { - "epoch": 0.5524402907580478, - "grad_norm": 1.5078125, - "learning_rate": 0.0004570879668243792, - "loss": 6.163, - "step": 532 - }, - { - "epoch": 0.553478712357217, - "grad_norm": 1.5234375, - "learning_rate": 0.0004553759692279347, - "loss": 6.2428, - "step": 533 - }, - { - "epoch": 0.5545171339563862, - "grad_norm": 1.484375, - "learning_rate": 0.00045366449883188015, - "loss": 6.3414, - "step": 534 - }, - { - "epoch": 0.5555555555555556, - "grad_norm": 1.6953125, - "learning_rate": 0.0004519535758559904, - "loss": 6.3549, - "step": 535 - }, - { - "epoch": 0.5565939771547248, - "grad_norm": 1.609375, - "learning_rate": 0.0004502432205135731, - "loss": 6.205, - "step": 536 - }, - { - "epoch": 0.557632398753894, - "grad_norm": 2.78125, - "learning_rate": 0.0004485334530112297, - "loss": 6.3195, - "step": 537 - }, - { - "epoch": 0.5586708203530634, - "grad_norm": 1.359375, - "learning_rate": 0.00044682429354861637, - "loss": 6.1937, - "step": 538 - }, - { - "epoch": 0.5597092419522326, - "grad_norm": 1.8671875, - "learning_rate": 0.00044511576231820655, - "loss": 6.0656, - "step": 539 - }, - { - "epoch": 0.5607476635514018, - "grad_norm": 1.9296875, - "learning_rate": 0.0004434078795050509, - "loss": 6.3324, - "step": 540 - }, - { - "epoch": 0.5617860851505712, - "grad_norm": 1.7890625, - "learning_rate": 0.00044170066528653973, - "loss": 6.2444, - "step": 541 - }, - { - "epoch": 0.5628245067497404, - "grad_norm": 1.5, - "learning_rate": 0.0004399941398321644, - "loss": 6.2952, - "step": 542 - }, - { - "epoch": 0.5638629283489096, - "grad_norm": 1.5390625, - "learning_rate": 0.00043828832330327895, - "loss": 6.3273, - "step": 543 - }, - { - "epoch": 0.564901349948079, - "grad_norm": 1.9296875, - "learning_rate": 0.00043658323585286175, - "loss": 6.1395, - "step": 544 - }, - { - "epoch": 0.5659397715472482, - "grad_norm": 1.90625, - "learning_rate": 0.00043487889762527834, - "loss": 6.4269, - "step": 545 - }, - { - "epoch": 0.5669781931464174, - "grad_norm": 1.40625, - "learning_rate": 0.0004331753287560422, - "loss": 6.3326, - "step": 546 - }, - { - "epoch": 0.5680166147455867, - "grad_norm": 1.6328125, - "learning_rate": 0.00043147254937157744, - "loss": 6.2105, - "step": 547 - }, - { - "epoch": 0.569055036344756, - "grad_norm": 4.3125, - "learning_rate": 0.00042977057958898103, - "loss": 6.5283, - "step": 548 - }, - { - "epoch": 0.5700934579439252, - "grad_norm": 1.8046875, - "learning_rate": 0.0004280694395157853, - "loss": 6.3104, - "step": 549 - }, - { - "epoch": 0.5711318795430945, - "grad_norm": 1.5546875, - "learning_rate": 0.0004263691492497197, - "loss": 6.2183, - "step": 550 - }, - { - "epoch": 0.5721703011422637, - "grad_norm": 1.609375, - "learning_rate": 0.00042466972887847404, - "loss": 6.276, - "step": 551 - }, - { - "epoch": 0.573208722741433, - "grad_norm": 1.40625, - "learning_rate": 0.0004229711984794614, - "loss": 6.1446, - "step": 552 - }, - { - "epoch": 0.5742471443406023, - "grad_norm": 1.5, - "learning_rate": 0.00042127357811958, - "loss": 6.3867, - "step": 553 - }, - { - "epoch": 0.5752855659397715, - "grad_norm": 1.453125, - "learning_rate": 0.0004195768878549766, - "loss": 6.4525, - "step": 554 - }, - { - "epoch": 0.5763239875389408, - "grad_norm": 1.203125, - "learning_rate": 0.00041788114773080984, - "loss": 6.3332, - "step": 555 - }, - { - "epoch": 0.5773624091381101, - "grad_norm": 1.21875, - "learning_rate": 0.0004161863777810128, - "loss": 6.2988, - "step": 556 - }, - { - "epoch": 0.5784008307372793, - "grad_norm": 1.2890625, - "learning_rate": 0.00041449259802805685, - "loss": 6.3041, - "step": 557 - }, - { - "epoch": 0.5794392523364486, - "grad_norm": 1.4140625, - "learning_rate": 0.0004127998284827148, - "loss": 6.1669, - "step": 558 - }, - { - "epoch": 0.5804776739356179, - "grad_norm": 1.8125, - "learning_rate": 0.00041110808914382506, - "loss": 6.033, - "step": 559 - }, - { - "epoch": 0.5815160955347871, - "grad_norm": 1.4375, - "learning_rate": 0.0004094173999980544, - "loss": 6.3417, - "step": 560 - }, - { - "epoch": 0.5825545171339563, - "grad_norm": 2.015625, - "learning_rate": 0.00040772778101966234, - "loss": 6.3482, - "step": 561 - }, - { - "epoch": 0.5835929387331257, - "grad_norm": 1.4453125, - "learning_rate": 0.00040603925217026543, - "loss": 6.1923, - "step": 562 - }, - { - "epoch": 0.5846313603322949, - "grad_norm": 2.125, - "learning_rate": 0.00040435183339860084, - "loss": 6.0967, - "step": 563 - }, - { - "epoch": 0.5856697819314641, - "grad_norm": 1.71875, - "learning_rate": 0.00040266554464029117, - "loss": 6.1855, - "step": 564 - }, - { - "epoch": 0.5867082035306335, - "grad_norm": 1.234375, - "learning_rate": 0.00040098040581760836, - "loss": 6.2101, - "step": 565 - }, - { - "epoch": 0.5877466251298027, - "grad_norm": 1.1875, - "learning_rate": 0.0003992964368392397, - "loss": 6.3314, - "step": 566 - }, - { - "epoch": 0.5887850467289719, - "grad_norm": 1.6015625, - "learning_rate": 0.00039761365760005053, - "loss": 6.1649, - "step": 567 - }, - { - "epoch": 0.5898234683281413, - "grad_norm": 1.40625, - "learning_rate": 0.0003959320879808509, - "loss": 6.3325, - "step": 568 - }, - { - "epoch": 0.5908618899273105, - "grad_norm": 1.7578125, - "learning_rate": 0.00039425174784815973, - "loss": 6.1956, - "step": 569 - }, - { - "epoch": 0.5919003115264797, - "grad_norm": 1.578125, - "learning_rate": 0.0003925726570539706, - "loss": 6.3196, - "step": 570 - }, - { - "epoch": 0.592938733125649, - "grad_norm": 1.6796875, - "learning_rate": 0.000390894835435517, - "loss": 6.2539, - "step": 571 - }, - { - "epoch": 0.5939771547248183, - "grad_norm": 1.4140625, - "learning_rate": 0.00038921830281503844, - "loss": 6.2395, - "step": 572 - }, - { - "epoch": 0.5950155763239875, - "grad_norm": 1.9296875, - "learning_rate": 0.00038754307899954536, - "loss": 6.3062, - "step": 573 - }, - { - "epoch": 0.5960539979231568, - "grad_norm": 1.9921875, - "learning_rate": 0.0003858691837805859, - "loss": 6.1366, - "step": 574 - }, - { - "epoch": 0.5970924195223261, - "grad_norm": 1.8984375, - "learning_rate": 0.0003841966369340115, - "loss": 6.1059, - "step": 575 - }, - { - "epoch": 0.5981308411214953, - "grad_norm": 1.8125, - "learning_rate": 0.0003825254582197438, - "loss": 6.0075, - "step": 576 - }, - { - "epoch": 0.5991692627206646, - "grad_norm": 1.8046875, - "learning_rate": 0.000380855667381541, - "loss": 6.1437, - "step": 577 - }, - { - "epoch": 0.6002076843198338, - "grad_norm": 1.59375, - "learning_rate": 0.0003791872841467643, - "loss": 6.3531, - "step": 578 - }, - { - "epoch": 0.6012461059190031, - "grad_norm": 2.5, - "learning_rate": 0.00037752032822614554, - "loss": 6.2369, - "step": 579 - }, - { - "epoch": 0.6022845275181724, - "grad_norm": 1.4921875, - "learning_rate": 0.0003758548193135536, - "loss": 6.2403, - "step": 580 - }, - { - "epoch": 0.6033229491173416, - "grad_norm": 1.40625, - "learning_rate": 0.0003741907770857618, - "loss": 6.2333, - "step": 581 - }, - { - "epoch": 0.6043613707165109, - "grad_norm": 1.4375, - "learning_rate": 0.0003725282212022159, - "loss": 6.2413, - "step": 582 - }, - { - "epoch": 0.6053997923156802, - "grad_norm": 1.6171875, - "learning_rate": 0.0003708671713048017, - "loss": 6.4416, - "step": 583 - }, - { - "epoch": 0.6064382139148494, - "grad_norm": 1.484375, - "learning_rate": 0.0003692076470176126, - "loss": 6.0833, - "step": 584 - }, - { - "epoch": 0.6074766355140186, - "grad_norm": 1.78125, - "learning_rate": 0.000367549667946718, - "loss": 6.1351, - "step": 585 - }, - { - "epoch": 0.608515057113188, - "grad_norm": 1.3828125, - "learning_rate": 0.0003658932536799324, - "loss": 6.3859, - "step": 586 - }, - { - "epoch": 0.6095534787123572, - "grad_norm": 1.2734375, - "learning_rate": 0.0003642384237865823, - "loss": 6.3225, - "step": 587 - }, - { - "epoch": 0.6105919003115264, - "grad_norm": 1.7421875, - "learning_rate": 0.0003625851978172765, - "loss": 6.2748, - "step": 588 - }, - { - "epoch": 0.6116303219106958, - "grad_norm": 1.5625, - "learning_rate": 0.00036093359530367454, - "loss": 6.2779, - "step": 589 - }, - { - "epoch": 0.612668743509865, - "grad_norm": 1.3125, - "learning_rate": 0.00035928363575825595, - "loss": 6.2115, - "step": 590 - }, - { - "epoch": 0.6137071651090342, - "grad_norm": 1.59375, - "learning_rate": 0.0003576353386740899, - "loss": 6.2217, - "step": 591 - }, - { - "epoch": 0.6147455867082036, - "grad_norm": 1.5078125, - "learning_rate": 0.00035598872352460455, - "loss": 6.3422, - "step": 592 - }, - { - "epoch": 0.6157840083073728, - "grad_norm": 1.8984375, - "learning_rate": 0.0003543438097633577, - "loss": 6.0398, - "step": 593 - }, - { - "epoch": 0.616822429906542, - "grad_norm": 1.90625, - "learning_rate": 0.00035270061682380607, - "loss": 6.2974, - "step": 594 - }, - { - "epoch": 0.6178608515057114, - "grad_norm": 1.453125, - "learning_rate": 0.0003510591641190762, - "loss": 6.2733, - "step": 595 - }, - { - "epoch": 0.6188992731048806, - "grad_norm": 1.7578125, - "learning_rate": 0.0003494194710417351, - "loss": 6.2583, - "step": 596 - }, - { - "epoch": 0.6199376947040498, - "grad_norm": 1.65625, - "learning_rate": 0.0003477815569635611, - "loss": 5.9226, - "step": 597 - }, - { - "epoch": 0.6209761163032191, - "grad_norm": 1.4765625, - "learning_rate": 0.00034614544123531474, - "loss": 6.2218, - "step": 598 - }, - { - "epoch": 0.6220145379023884, - "grad_norm": 2.015625, - "learning_rate": 0.00034451114318651064, - "loss": 6.2521, - "step": 599 - }, - { - "epoch": 0.6230529595015576, - "grad_norm": 1.6015625, - "learning_rate": 0.0003428786821251888, - "loss": 6.2548, - "step": 600 - }, - { - "epoch": 0.6230529595015576, - "eval_loss": 6.287681579589844, - "eval_runtime": 1.6799, - "eval_samples_per_second": 9.525, - "eval_steps_per_second": 1.191, - "step": 600 - }, - { - "epoch": 0.6240913811007269, - "grad_norm": 1.609375, - "learning_rate": 0.0003412480773376864, - "loss": 6.2849, - "step": 601 - }, - { - "epoch": 0.6251298026998962, - "grad_norm": 1.5234375, - "learning_rate": 0.00033961934808841024, - "loss": 6.1491, - "step": 602 - }, - { - "epoch": 0.6261682242990654, - "grad_norm": 1.3203125, - "learning_rate": 0.0003379925136196088, - "loss": 6.281, - "step": 603 - }, - { - "epoch": 0.6272066458982347, - "grad_norm": 1.5625, - "learning_rate": 0.0003363675931511455, - "loss": 6.2496, - "step": 604 - }, - { - "epoch": 0.6282450674974039, - "grad_norm": 1.3046875, - "learning_rate": 0.0003347446058802708, - "loss": 6.2776, - "step": 605 - }, - { - "epoch": 0.6292834890965732, - "grad_norm": 1.46875, - "learning_rate": 0.00033312357098139617, - "loss": 6.3928, - "step": 606 - }, - { - "epoch": 0.6303219106957425, - "grad_norm": 1.34375, - "learning_rate": 0.0003315045076058671, - "loss": 6.2803, - "step": 607 - }, - { - "epoch": 0.6313603322949117, - "grad_norm": 1.4296875, - "learning_rate": 0.00032988743488173697, - "loss": 6.2263, - "step": 608 - }, - { - "epoch": 0.632398753894081, - "grad_norm": 1.46875, - "learning_rate": 0.000328272371913541, - "loss": 6.1383, - "step": 609 - }, - { - "epoch": 0.6334371754932503, - "grad_norm": 1.265625, - "learning_rate": 0.0003266593377820708, - "loss": 6.2603, - "step": 610 - }, - { - "epoch": 0.6344755970924195, - "grad_norm": 2.25, - "learning_rate": 0.0003250483515441485, - "loss": 6.3359, - "step": 611 - }, - { - "epoch": 0.6355140186915887, - "grad_norm": 1.6015625, - "learning_rate": 0.0003234394322324019, - "loss": 6.1653, - "step": 612 - }, - { - "epoch": 0.6365524402907581, - "grad_norm": 2.125, - "learning_rate": 0.00032183259885504, - "loss": 6.2869, - "step": 613 - }, - { - "epoch": 0.6375908618899273, - "grad_norm": 1.3515625, - "learning_rate": 0.00032022787039562745, - "loss": 6.3017, - "step": 614 - }, - { - "epoch": 0.6386292834890965, - "grad_norm": 4.3125, - "learning_rate": 0.0003186252658128611, - "loss": 6.1045, - "step": 615 - }, - { - "epoch": 0.6396677050882659, - "grad_norm": 1.6171875, - "learning_rate": 0.00031702480404034565, - "loss": 6.3121, - "step": 616 - }, - { - "epoch": 0.6407061266874351, - "grad_norm": 1.8515625, - "learning_rate": 0.00031542650398637016, - "loss": 6.1043, - "step": 617 - }, - { - "epoch": 0.6417445482866043, - "grad_norm": 1.4375, - "learning_rate": 0.0003138303845336844, - "loss": 6.2402, - "step": 618 - }, - { - "epoch": 0.6427829698857737, - "grad_norm": 1.46875, - "learning_rate": 0.0003122364645392762, - "loss": 6.2972, - "step": 619 - }, - { - "epoch": 0.6438213914849429, - "grad_norm": 2.234375, - "learning_rate": 0.0003106447628341482, - "loss": 6.2454, - "step": 620 - }, - { - "epoch": 0.6448598130841121, - "grad_norm": 1.4921875, - "learning_rate": 0.0003090552982230954, - "loss": 6.1745, - "step": 621 - }, - { - "epoch": 0.6458982346832814, - "grad_norm": 1.3984375, - "learning_rate": 0.00030746808948448366, - "loss": 6.224, - "step": 622 - }, - { - "epoch": 0.6469366562824507, - "grad_norm": 1.4921875, - "learning_rate": 0.0003058831553700268, - "loss": 6.3142, - "step": 623 - }, - { - "epoch": 0.6479750778816199, - "grad_norm": 1.9140625, - "learning_rate": 0.00030430051460456596, - "loss": 6.2258, - "step": 624 - }, - { - "epoch": 0.6490134994807892, - "grad_norm": 1.6171875, - "learning_rate": 0.0003027201858858479, - "loss": 6.1406, - "step": 625 - }, - { - "epoch": 0.6500519210799585, - "grad_norm": 1.59375, - "learning_rate": 0.00030114218788430437, - "loss": 5.7233, - "step": 626 - }, - { - "epoch": 0.6510903426791277, - "grad_norm": 1.5234375, - "learning_rate": 0.0002995665392428313, - "loss": 6.0472, - "step": 627 - }, - { - "epoch": 0.652128764278297, - "grad_norm": 2.265625, - "learning_rate": 0.00029799325857656855, - "loss": 6.161, - "step": 628 - }, - { - "epoch": 0.6531671858774662, - "grad_norm": 1.53125, - "learning_rate": 0.00029642236447268024, - "loss": 6.112, - "step": 629 - }, - { - "epoch": 0.6542056074766355, - "grad_norm": 1.4140625, - "learning_rate": 0.00029485387549013485, - "loss": 6.2132, - "step": 630 - }, - { - "epoch": 0.6552440290758048, - "grad_norm": 1.1875, - "learning_rate": 0.00029328781015948625, - "loss": 6.2657, - "step": 631 - }, - { - "epoch": 0.656282450674974, - "grad_norm": 1.5078125, - "learning_rate": 0.00029172418698265444, - "loss": 6.1711, - "step": 632 - }, - { - "epoch": 0.6573208722741433, - "grad_norm": 1.5859375, - "learning_rate": 0.0002901630244327075, - "loss": 6.1935, - "step": 633 - }, - { - "epoch": 0.6583592938733126, - "grad_norm": 1.6484375, - "learning_rate": 0.00028860434095364263, - "loss": 6.4055, - "step": 634 - }, - { - "epoch": 0.6593977154724818, - "grad_norm": 1.96875, - "learning_rate": 0.00028704815496016875, - "loss": 5.9916, - "step": 635 - }, - { - "epoch": 0.660436137071651, - "grad_norm": 1.3359375, - "learning_rate": 0.00028549448483748886, - "loss": 6.1303, - "step": 636 - }, - { - "epoch": 0.6614745586708204, - "grad_norm": 1.65625, - "learning_rate": 0.0002839433489410828, - "loss": 6.1363, - "step": 637 - }, - { - "epoch": 0.6625129802699896, - "grad_norm": 1.734375, - "learning_rate": 0.0002823947655964901, - "loss": 6.1624, - "step": 638 - }, - { - "epoch": 0.6635514018691588, - "grad_norm": 1.453125, - "learning_rate": 0.000280848753099094, - "loss": 5.9139, - "step": 639 - }, - { - "epoch": 0.6645898234683282, - "grad_norm": 2.625, - "learning_rate": 0.0002793053297139054, - "loss": 6.3661, - "step": 640 - }, - { - "epoch": 0.6656282450674974, - "grad_norm": 1.7578125, - "learning_rate": 0.0002777645136753459, - "loss": 6.3797, - "step": 641 - }, - { - "epoch": 0.6666666666666666, - "grad_norm": 1.8671875, - "learning_rate": 0.0002762263231870339, - "loss": 6.0671, - "step": 642 - }, - { - "epoch": 0.667705088265836, - "grad_norm": 1.6328125, - "learning_rate": 0.00027469077642156844, - "loss": 6.1843, - "step": 643 - }, - { - "epoch": 0.6687435098650052, - "grad_norm": 1.625, - "learning_rate": 0.000273157891520315, - "loss": 6.4473, - "step": 644 - }, - { - "epoch": 0.6697819314641744, - "grad_norm": 1.6953125, - "learning_rate": 0.00027162768659319114, - "loss": 6.3164, - "step": 645 - }, - { - "epoch": 0.6708203530633438, - "grad_norm": 1.6328125, - "learning_rate": 0.00027010017971845264, - "loss": 6.4004, - "step": 646 - }, - { - "epoch": 0.671858774662513, - "grad_norm": 1.6171875, - "learning_rate": 0.00026857538894247947, - "loss": 6.242, - "step": 647 - }, - { - "epoch": 0.6728971962616822, - "grad_norm": 1.28125, - "learning_rate": 0.00026705333227956303, - "loss": 6.2597, - "step": 648 - }, - { - "epoch": 0.6739356178608515, - "grad_norm": 1.2734375, - "learning_rate": 0.000265534027711693, - "loss": 6.3972, - "step": 649 - }, - { - "epoch": 0.6749740394600208, - "grad_norm": 1.90625, - "learning_rate": 0.00026401749318834527, - "loss": 6.4521, - "step": 650 - }, - { - "epoch": 0.67601246105919, - "grad_norm": 1.40625, - "learning_rate": 0.0002625037466262696, - "loss": 5.9594, - "step": 651 - }, - { - "epoch": 0.6770508826583593, - "grad_norm": 1.6015625, - "learning_rate": 0.0002609928059092779, - "loss": 6.2181, - "step": 652 - }, - { - "epoch": 0.6780893042575286, - "grad_norm": 1.3828125, - "learning_rate": 0.00025948468888803324, - "loss": 6.2781, - "step": 653 - }, - { - "epoch": 0.6791277258566978, - "grad_norm": 1.65625, - "learning_rate": 0.00025797941337983875, - "loss": 6.1757, - "step": 654 - }, - { - "epoch": 0.6801661474558671, - "grad_norm": 1.4609375, - "learning_rate": 0.0002564769971684271, - "loss": 6.266, - "step": 655 - }, - { - "epoch": 0.6812045690550363, - "grad_norm": 1.390625, - "learning_rate": 0.00025497745800375036, - "loss": 6.3151, - "step": 656 - }, - { - "epoch": 0.6822429906542056, - "grad_norm": 1.3671875, - "learning_rate": 0.0002534808136017707, - "loss": 6.2002, - "step": 657 - }, - { - "epoch": 0.6832814122533749, - "grad_norm": 1.3671875, - "learning_rate": 0.00025198708164425045, - "loss": 6.1243, - "step": 658 - }, - { - "epoch": 0.6843198338525441, - "grad_norm": 1.3671875, - "learning_rate": 0.0002504962797785435, - "loss": 6.2666, - "step": 659 - }, - { - "epoch": 0.6853582554517134, - "grad_norm": 4.4375, - "learning_rate": 0.00024900842561738736, - "loss": 5.9076, - "step": 660 - }, - { - "epoch": 0.6863966770508827, - "grad_norm": 1.484375, - "learning_rate": 0.00024752353673869405, - "loss": 6.1968, - "step": 661 - }, - { - "epoch": 0.6874350986500519, - "grad_norm": 1.203125, - "learning_rate": 0.00024604163068534315, - "loss": 6.2919, - "step": 662 - }, - { - "epoch": 0.6884735202492211, - "grad_norm": 1.46875, - "learning_rate": 0.00024456272496497415, - "loss": 6.1599, - "step": 663 - }, - { - "epoch": 0.6895119418483905, - "grad_norm": 1.8984375, - "learning_rate": 0.00024308683704978002, - "loss": 6.2052, - "step": 664 - }, - { - "epoch": 0.6905503634475597, - "grad_norm": 1.375, - "learning_rate": 0.00024161398437630045, - "loss": 6.3025, - "step": 665 - }, - { - "epoch": 0.6915887850467289, - "grad_norm": 1.296875, - "learning_rate": 0.0002401441843452159, - "loss": 6.1938, - "step": 666 - }, - { - "epoch": 0.6926272066458983, - "grad_norm": 1.46875, - "learning_rate": 0.0002386774543211423, - "loss": 6.3049, - "step": 667 - }, - { - "epoch": 0.6936656282450675, - "grad_norm": 1.8515625, - "learning_rate": 0.0002372138116324254, - "loss": 6.3624, - "step": 668 - }, - { - "epoch": 0.6947040498442367, - "grad_norm": 1.84375, - "learning_rate": 0.00023575327357093658, - "loss": 6.3294, - "step": 669 - }, - { - "epoch": 0.6957424714434061, - "grad_norm": 1.296875, - "learning_rate": 0.0002342958573918682, - "loss": 6.097, - "step": 670 - }, - { - "epoch": 0.6967808930425753, - "grad_norm": 1.8203125, - "learning_rate": 0.0002328415803135298, - "loss": 6.1121, - "step": 671 - }, - { - "epoch": 0.6978193146417445, - "grad_norm": 1.5859375, - "learning_rate": 0.0002313904595171447, - "loss": 6.1288, - "step": 672 - }, - { - "epoch": 0.6988577362409139, - "grad_norm": 1.5, - "learning_rate": 0.0002299425121466475, - "loss": 6.0291, - "step": 673 - }, - { - "epoch": 0.6998961578400831, - "grad_norm": 1.5, - "learning_rate": 0.00022849775530848056, - "loss": 6.3553, - "step": 674 - }, - { - "epoch": 0.7009345794392523, - "grad_norm": 1.4765625, - "learning_rate": 0.00022705620607139254, - "loss": 6.2154, - "step": 675 - }, - { - "epoch": 0.7019730010384216, - "grad_norm": 1.3984375, - "learning_rate": 0.00022561788146623679, - "loss": 6.1831, - "step": 676 - }, - { - "epoch": 0.7030114226375909, - "grad_norm": 1.34375, - "learning_rate": 0.0002241827984857698, - "loss": 6.1788, - "step": 677 - }, - { - "epoch": 0.7040498442367601, - "grad_norm": 1.6875, - "learning_rate": 0.00022275097408445076, - "loss": 6.3325, - "step": 678 - }, - { - "epoch": 0.7050882658359294, - "grad_norm": 1.3203125, - "learning_rate": 0.00022132242517824115, - "loss": 6.2826, - "step": 679 - }, - { - "epoch": 0.7061266874350987, - "grad_norm": 1.3359375, - "learning_rate": 0.0002198971686444047, - "loss": 6.1395, - "step": 680 - }, - { - "epoch": 0.7071651090342679, - "grad_norm": 1.4296875, - "learning_rate": 0.00021847522132130827, - "loss": 6.3967, - "step": 681 - }, - { - "epoch": 0.7082035306334372, - "grad_norm": 2.234375, - "learning_rate": 0.00021705660000822285, - "loss": 6.4163, - "step": 682 - }, - { - "epoch": 0.7092419522326064, - "grad_norm": 1.5546875, - "learning_rate": 0.00021564132146512495, - "loss": 6.1454, - "step": 683 - }, - { - "epoch": 0.7102803738317757, - "grad_norm": 1.9296875, - "learning_rate": 0.00021422940241249872, - "loss": 6.2889, - "step": 684 - }, - { - "epoch": 0.711318795430945, - "grad_norm": 1.4765625, - "learning_rate": 0.0002128208595311384, - "loss": 6.227, - "step": 685 - }, - { - "epoch": 0.7123572170301142, - "grad_norm": 1.5078125, - "learning_rate": 0.00021141570946195105, - "loss": 6.2655, - "step": 686 - }, - { - "epoch": 0.7133956386292835, - "grad_norm": 1.4140625, - "learning_rate": 0.00021001396880576063, - "loss": 6.1813, - "step": 687 - }, - { - "epoch": 0.7144340602284528, - "grad_norm": 1.1875, - "learning_rate": 0.0002086156541231109, - "loss": 6.3148, - "step": 688 - }, - { - "epoch": 0.715472481827622, - "grad_norm": 1.6015625, - "learning_rate": 0.00020722078193407035, - "loss": 6.2127, - "step": 689 - }, - { - "epoch": 0.7165109034267912, - "grad_norm": 1.53125, - "learning_rate": 0.00020582936871803693, - "loss": 6.2863, - "step": 690 - }, - { - "epoch": 0.7175493250259606, - "grad_norm": 1.203125, - "learning_rate": 0.0002044414309135434, - "loss": 6.236, - "step": 691 - }, - { - "epoch": 0.7185877466251298, - "grad_norm": 1.5546875, - "learning_rate": 0.00020305698491806295, - "loss": 6.3918, - "step": 692 - }, - { - "epoch": 0.719626168224299, - "grad_norm": 1.59375, - "learning_rate": 0.0002016760470878158, - "loss": 5.9227, - "step": 693 - }, - { - "epoch": 0.7206645898234684, - "grad_norm": 1.4921875, - "learning_rate": 0.0002002986337375755, - "loss": 6.0943, - "step": 694 - }, - { - "epoch": 0.7217030114226376, - "grad_norm": 1.4921875, - "learning_rate": 0.00019892476114047664, - "loss": 6.2113, - "step": 695 - }, - { - "epoch": 0.7227414330218068, - "grad_norm": 1.4921875, - "learning_rate": 0.00019755444552782225, - "loss": 6.3502, - "step": 696 - }, - { - "epoch": 0.7237798546209762, - "grad_norm": 1.515625, - "learning_rate": 0.00019618770308889227, - "loss": 5.9658, - "step": 697 - }, - { - "epoch": 0.7248182762201454, - "grad_norm": 1.5, - "learning_rate": 0.00019482454997075228, - "loss": 6.1518, - "step": 698 - }, - { - "epoch": 0.7258566978193146, - "grad_norm": 1.4609375, - "learning_rate": 0.00019346500227806218, - "loss": 6.1651, - "step": 699 - }, - { - "epoch": 0.726895119418484, - "grad_norm": 1.359375, - "learning_rate": 0.00019210907607288723, - "loss": 6.2656, - "step": 700 - }, - { - "epoch": 0.726895119418484, - "eval_loss": 6.260857105255127, - "eval_runtime": 1.646, - "eval_samples_per_second": 9.72, - "eval_steps_per_second": 1.215, - "step": 700 - }, - { - "epoch": 0.7279335410176532, - "grad_norm": 1.7109375, - "learning_rate": 0.00019075678737450686, - "loss": 6.1547, - "step": 701 - }, - { - "epoch": 0.7289719626168224, - "grad_norm": 1.7734375, - "learning_rate": 0.00018940815215922607, - "loss": 6.0599, - "step": 702 - }, - { - "epoch": 0.7300103842159917, - "grad_norm": 1.6171875, - "learning_rate": 0.00018806318636018665, - "loss": 6.2195, - "step": 703 - }, - { - "epoch": 0.731048805815161, - "grad_norm": 1.46875, - "learning_rate": 0.00018672190586717908, - "loss": 6.4289, - "step": 704 - }, - { - "epoch": 0.7320872274143302, - "grad_norm": 1.203125, - "learning_rate": 0.00018538432652645437, - "loss": 6.2451, - "step": 705 - }, - { - "epoch": 0.7331256490134995, - "grad_norm": 1.1640625, - "learning_rate": 0.00018405046414053728, - "loss": 6.2281, - "step": 706 - }, - { - "epoch": 0.7341640706126688, - "grad_norm": 1.484375, - "learning_rate": 0.00018272033446803949, - "loss": 6.2168, - "step": 707 - }, - { - "epoch": 0.735202492211838, - "grad_norm": 1.7890625, - "learning_rate": 0.00018139395322347334, - "loss": 6.1276, - "step": 708 - }, - { - "epoch": 0.7362409138110073, - "grad_norm": 1.4453125, - "learning_rate": 0.00018007133607706615, - "loss": 6.0438, - "step": 709 - }, - { - "epoch": 0.7372793354101765, - "grad_norm": 1.875, - "learning_rate": 0.00017875249865457527, - "loss": 6.0624, - "step": 710 - }, - { - "epoch": 0.7383177570093458, - "grad_norm": 1.671875, - "learning_rate": 0.00017743745653710336, - "loss": 6.4648, - "step": 711 - }, - { - "epoch": 0.7393561786085151, - "grad_norm": 1.4375, - "learning_rate": 0.00017612622526091403, - "loss": 6.2431, - "step": 712 - }, - { - "epoch": 0.7403946002076843, - "grad_norm": 1.375, - "learning_rate": 0.00017481882031724927, - "loss": 6.3375, - "step": 713 - }, - { - "epoch": 0.7414330218068536, - "grad_norm": 1.703125, - "learning_rate": 0.0001735152571521451, - "loss": 6.3156, - "step": 714 - }, - { - "epoch": 0.7424714434060229, - "grad_norm": 1.1484375, - "learning_rate": 0.00017221555116625, - "loss": 6.2417, - "step": 715 - }, - { - "epoch": 0.7435098650051921, - "grad_norm": 1.09375, - "learning_rate": 0.0001709197177146425, - "loss": 6.3143, - "step": 716 - }, - { - "epoch": 0.7445482866043613, - "grad_norm": 1.34375, - "learning_rate": 0.0001696277721066502, - "loss": 6.1396, - "step": 717 - }, - { - "epoch": 0.7455867082035307, - "grad_norm": 1.2265625, - "learning_rate": 0.00016833972960566868, - "loss": 6.3164, - "step": 718 - }, - { - "epoch": 0.7466251298026999, - "grad_norm": 1.6484375, - "learning_rate": 0.00016705560542898051, - "loss": 6.2559, - "step": 719 - }, - { - "epoch": 0.7476635514018691, - "grad_norm": 1.5234375, - "learning_rate": 0.00016577541474757713, - "loss": 6.4104, - "step": 720 - }, - { - "epoch": 0.7487019730010385, - "grad_norm": 1.421875, - "learning_rate": 0.00016449917268597798, - "loss": 6.0631, - "step": 721 - }, - { - "epoch": 0.7497403946002077, - "grad_norm": 1.4140625, - "learning_rate": 0.00016322689432205252, - "loss": 6.3299, - "step": 722 - }, - { - "epoch": 0.7507788161993769, - "grad_norm": 1.5546875, - "learning_rate": 0.00016195859468684198, - "loss": 6.2053, - "step": 723 - }, - { - "epoch": 0.7518172377985463, - "grad_norm": 1.5546875, - "learning_rate": 0.00016069428876438202, - "loss": 6.3051, - "step": 724 - }, - { - "epoch": 0.7528556593977155, - "grad_norm": 1.359375, - "learning_rate": 0.00015943399149152533, - "loss": 6.1548, - "step": 725 - }, - { - "epoch": 0.7538940809968847, - "grad_norm": 1.375, - "learning_rate": 0.00015817771775776507, - "loss": 6.2009, - "step": 726 - }, - { - "epoch": 0.754932502596054, - "grad_norm": 1.3046875, - "learning_rate": 0.00015692548240506, - "loss": 6.2028, - "step": 727 - }, - { - "epoch": 0.7559709241952233, - "grad_norm": 1.25, - "learning_rate": 0.00015567730022765752, - "loss": 5.9373, - "step": 728 - }, - { - "epoch": 0.7570093457943925, - "grad_norm": 1.203125, - "learning_rate": 0.0001544331859719202, - "loss": 6.1761, - "step": 729 - }, - { - "epoch": 0.7580477673935618, - "grad_norm": 1.484375, - "learning_rate": 0.000153193154336151, - "loss": 6.2549, - "step": 730 - }, - { - "epoch": 0.7590861889927311, - "grad_norm": 1.609375, - "learning_rate": 0.00015195721997041933, - "loss": 6.1982, - "step": 731 - }, - { - "epoch": 0.7601246105919003, - "grad_norm": 1.59375, - "learning_rate": 0.00015072539747638887, - "loss": 6.2346, - "step": 732 - }, - { - "epoch": 0.7611630321910696, - "grad_norm": 1.765625, - "learning_rate": 0.00014949770140714407, - "loss": 5.5064, - "step": 733 - }, - { - "epoch": 0.7622014537902388, - "grad_norm": 1.296875, - "learning_rate": 0.0001482741462670193, - "loss": 6.3146, - "step": 734 - }, - { - "epoch": 0.7632398753894081, - "grad_norm": 1.578125, - "learning_rate": 0.0001470547465114263, - "loss": 6.2277, - "step": 735 - }, - { - "epoch": 0.7642782969885774, - "grad_norm": 1.328125, - "learning_rate": 0.00014583951654668415, - "loss": 6.3032, - "step": 736 - }, - { - "epoch": 0.7653167185877466, - "grad_norm": 1.6484375, - "learning_rate": 0.00014462847072984898, - "loss": 5.9154, - "step": 737 - }, - { - "epoch": 0.7663551401869159, - "grad_norm": 1.6484375, - "learning_rate": 0.0001434216233685441, - "loss": 6.0951, - "step": 738 - }, - { - "epoch": 0.7673935617860852, - "grad_norm": 1.578125, - "learning_rate": 0.00014221898872079108, - "loss": 6.0921, - "step": 739 - }, - { - "epoch": 0.7684319833852544, - "grad_norm": 1.5703125, - "learning_rate": 0.0001410205809948419, - "loss": 6.2295, - "step": 740 - }, - { - "epoch": 0.7694704049844237, - "grad_norm": 1.3515625, - "learning_rate": 0.00013982641434900984, - "loss": 6.229, - "step": 741 - }, - { - "epoch": 0.770508826583593, - "grad_norm": 1.5, - "learning_rate": 0.00013863650289150338, - "loss": 6.3173, - "step": 742 - }, - { - "epoch": 0.7715472481827622, - "grad_norm": 1.515625, - "learning_rate": 0.00013745086068025857, - "loss": 6.3666, - "step": 743 - }, - { - "epoch": 0.7725856697819314, - "grad_norm": 1.15625, - "learning_rate": 0.00013626950172277398, - "loss": 6.1824, - "step": 744 - }, - { - "epoch": 0.7736240913811008, - "grad_norm": 1.78125, - "learning_rate": 0.00013509243997594423, - "loss": 6.2045, - "step": 745 - }, - { - "epoch": 0.77466251298027, - "grad_norm": 1.359375, - "learning_rate": 0.00013391968934589572, - "loss": 6.295, - "step": 746 - }, - { - "epoch": 0.7757009345794392, - "grad_norm": 1.265625, - "learning_rate": 0.00013275126368782235, - "loss": 6.3082, - "step": 747 - }, - { - "epoch": 0.7767393561786086, - "grad_norm": 1.5703125, - "learning_rate": 0.00013158717680582127, - "loss": 6.2444, - "step": 748 - }, - { - "epoch": 0.7777777777777778, - "grad_norm": 1.4296875, - "learning_rate": 0.00013042744245273037, - "loss": 6.1545, - "step": 749 - }, - { - "epoch": 0.778816199376947, - "grad_norm": 1.75, - "learning_rate": 0.0001292720743299654, - "loss": 6.0637, - "step": 750 - }, - { - "epoch": 0.7798546209761164, - "grad_norm": 1.4453125, - "learning_rate": 0.00012812108608735846, - "loss": 6.0392, - "step": 751 - }, - { - "epoch": 0.7808930425752856, - "grad_norm": 1.6796875, - "learning_rate": 0.0001269744913229965, - "loss": 6.285, - "step": 752 - }, - { - "epoch": 0.7819314641744548, - "grad_norm": 1.21875, - "learning_rate": 0.00012583230358306053, - "loss": 6.3178, - "step": 753 - }, - { - "epoch": 0.7829698857736241, - "grad_norm": 1.578125, - "learning_rate": 0.00012469453636166643, - "loss": 6.1123, - "step": 754 - }, - { - "epoch": 0.7840083073727934, - "grad_norm": 1.453125, - "learning_rate": 0.00012356120310070407, - "loss": 6.379, - "step": 755 - }, - { - "epoch": 0.7850467289719626, - "grad_norm": 1.625, - "learning_rate": 0.00012243231718967967, - "loss": 6.127, - "step": 756 - }, - { - "epoch": 0.7860851505711319, - "grad_norm": 1.1796875, - "learning_rate": 0.0001213078919655573, - "loss": 6.222, - "step": 757 - }, - { - "epoch": 0.7871235721703012, - "grad_norm": 1.1875, - "learning_rate": 0.00012018794071260119, - "loss": 6.0595, - "step": 758 - }, - { - "epoch": 0.7881619937694704, - "grad_norm": 1.2421875, - "learning_rate": 0.00011907247666221893, - "loss": 6.1771, - "step": 759 - }, - { - "epoch": 0.7892004153686397, - "grad_norm": 1.5, - "learning_rate": 0.00011796151299280483, - "loss": 6.1493, - "step": 760 - }, - { - "epoch": 0.7902388369678089, - "grad_norm": 1.3046875, - "learning_rate": 0.00011685506282958496, - "loss": 6.3724, - "step": 761 - }, - { - "epoch": 0.7912772585669782, - "grad_norm": 1.8203125, - "learning_rate": 0.00011575313924446123, - "loss": 6.1028, - "step": 762 - }, - { - "epoch": 0.7923156801661475, - "grad_norm": 1.5234375, - "learning_rate": 0.00011465575525585741, - "loss": 6.1988, - "step": 763 - }, - { - "epoch": 0.7933541017653167, - "grad_norm": 3.03125, - "learning_rate": 0.00011356292382856532, - "loss": 6.1213, - "step": 764 - }, - { - "epoch": 0.794392523364486, - "grad_norm": 1.8515625, - "learning_rate": 0.0001124746578735914, - "loss": 6.3859, - "step": 765 - }, - { - "epoch": 0.7954309449636553, - "grad_norm": 1.8984375, - "learning_rate": 0.0001113909702480046, - "loss": 6.0653, - "step": 766 - }, - { - "epoch": 0.7964693665628245, - "grad_norm": 1.359375, - "learning_rate": 0.00011031187375478407, - "loss": 6.2933, - "step": 767 - }, - { - "epoch": 0.7975077881619937, - "grad_norm": 1.3203125, - "learning_rate": 0.00010923738114266823, - "loss": 6.0991, - "step": 768 - }, - { - "epoch": 0.7985462097611631, - "grad_norm": 1.4921875, - "learning_rate": 0.00010816750510600387, - "loss": 6.2484, - "step": 769 - }, - { - "epoch": 0.7995846313603323, - "grad_norm": 1.703125, - "learning_rate": 0.00010710225828459641, - "loss": 5.7827, - "step": 770 - }, - { - "epoch": 0.8006230529595015, - "grad_norm": 1.46875, - "learning_rate": 0.0001060416532635603, - "loss": 6.3373, - "step": 771 - }, - { - "epoch": 0.8016614745586709, - "grad_norm": 1.2265625, - "learning_rate": 0.00010498570257317076, - "loss": 6.3325, - "step": 772 - }, - { - "epoch": 0.8026998961578401, - "grad_norm": 1.578125, - "learning_rate": 0.00010393441868871506, - "loss": 6.1373, - "step": 773 - }, - { - "epoch": 0.8037383177570093, - "grad_norm": 1.609375, - "learning_rate": 0.00010288781403034619, - "loss": 6.06, - "step": 774 - }, - { - "epoch": 0.8047767393561787, - "grad_norm": 1.71875, - "learning_rate": 0.00010184590096293506, - "loss": 6.0622, - "step": 775 - }, - { - "epoch": 0.8058151609553479, - "grad_norm": 1.0546875, - "learning_rate": 0.0001008086917959249, - "loss": 6.2272, - "step": 776 - }, - { - "epoch": 0.8068535825545171, - "grad_norm": 1.3828125, - "learning_rate": 9.977619878318578e-05, - "loss": 6.2692, - "step": 777 - }, - { - "epoch": 0.8078920041536864, - "grad_norm": 1.2265625, - "learning_rate": 9.874843412286993e-05, - "loss": 6.3817, - "step": 778 - }, - { - "epoch": 0.8089304257528557, - "grad_norm": 1.9921875, - "learning_rate": 9.772540995726753e-05, - "loss": 6.4033, - "step": 779 - }, - { - "epoch": 0.8099688473520249, - "grad_norm": 1.625, - "learning_rate": 9.67071383726632e-05, - "loss": 6.0418, - "step": 780 - }, - { - "epoch": 0.8110072689511942, - "grad_norm": 1.3125, - "learning_rate": 9.569363139919341e-05, - "loss": 6.2407, - "step": 781 - }, - { - "epoch": 0.8120456905503635, - "grad_norm": 1.6796875, - "learning_rate": 9.468490101070409e-05, - "loss": 6.1643, - "step": 782 - }, - { - "epoch": 0.8130841121495327, - "grad_norm": 1.1796875, - "learning_rate": 9.368095912460934e-05, - "loss": 6.1331, - "step": 783 - }, - { - "epoch": 0.814122533748702, - "grad_norm": 1.2421875, - "learning_rate": 9.26818176017506e-05, - "loss": 6.363, - "step": 784 - }, - { - "epoch": 0.8151609553478713, - "grad_norm": 1.4609375, - "learning_rate": 9.168748824625655e-05, - "loss": 6.2178, - "step": 785 - }, - { - "epoch": 0.8161993769470405, - "grad_norm": 1.6328125, - "learning_rate": 9.069798280540348e-05, - "loss": 6.146, - "step": 786 - }, - { - "epoch": 0.8172377985462098, - "grad_norm": 1.4453125, - "learning_rate": 8.9713312969477e-05, - "loss": 6.1887, - "step": 787 - }, - { - "epoch": 0.818276220145379, - "grad_norm": 1.2421875, - "learning_rate": 8.87334903716332e-05, - "loss": 6.2521, - "step": 788 - }, - { - "epoch": 0.8193146417445483, - "grad_norm": 1.265625, - "learning_rate": 8.775852658776173e-05, - "loss": 6.3487, - "step": 789 - }, - { - "epoch": 0.8203530633437176, - "grad_norm": 1.5234375, - "learning_rate": 8.678843313634893e-05, - "loss": 6.2509, - "step": 790 - }, - { - "epoch": 0.8213914849428868, - "grad_norm": 1.71875, - "learning_rate": 8.58232214783416e-05, - "loss": 6.0586, - "step": 791 - }, - { - "epoch": 0.822429906542056, - "grad_norm": 1.4921875, - "learning_rate": 8.486290301701182e-05, - "loss": 6.293, - "step": 792 - }, - { - "epoch": 0.8234683281412254, - "grad_norm": 1.5, - "learning_rate": 8.390748909782204e-05, - "loss": 6.2504, - "step": 793 - }, - { - "epoch": 0.8245067497403946, - "grad_norm": 1.28125, - "learning_rate": 8.295699100829124e-05, - "loss": 6.2907, - "step": 794 - }, - { - "epoch": 0.8255451713395638, - "grad_norm": 1.125, - "learning_rate": 8.201141997786127e-05, - "loss": 6.2033, - "step": 795 - }, - { - "epoch": 0.8265835929387332, - "grad_norm": 1.2109375, - "learning_rate": 8.107078717776456e-05, - "loss": 6.3058, - "step": 796 - }, - { - "epoch": 0.8276220145379024, - "grad_norm": 1.4296875, - "learning_rate": 8.013510372089184e-05, - "loss": 6.1276, - "step": 797 - }, - { - "epoch": 0.8286604361370716, - "grad_norm": 1.4140625, - "learning_rate": 7.920438066166097e-05, - "loss": 6.4023, - "step": 798 - }, - { - "epoch": 0.829698857736241, - "grad_norm": 1.953125, - "learning_rate": 7.827862899588634e-05, - "loss": 6.1487, - "step": 799 - }, - { - "epoch": 0.8307372793354102, - "grad_norm": 1.3671875, - "learning_rate": 7.735785966064884e-05, - "loss": 5.9001, - "step": 800 - }, - { - "epoch": 0.8307372793354102, - "eval_loss": 6.255230903625488, - "eval_runtime": 1.6449, - "eval_samples_per_second": 9.727, - "eval_steps_per_second": 1.216, - "step": 800 - }, - { - "epoch": 0.8317757009345794, - "grad_norm": 1.3515625, - "learning_rate": 7.644208353416704e-05, - "loss": 6.1918, - "step": 801 - }, - { - "epoch": 0.8328141225337488, - "grad_norm": 2.328125, - "learning_rate": 7.553131143566822e-05, - "loss": 6.2873, - "step": 802 - }, - { - "epoch": 0.833852544132918, - "grad_norm": 1.390625, - "learning_rate": 7.462555412526062e-05, - "loss": 6.1316, - "step": 803 - }, - { - "epoch": 0.8348909657320872, - "grad_norm": 2.0, - "learning_rate": 7.372482230380656e-05, - "loss": 6.2054, - "step": 804 - }, - { - "epoch": 0.8359293873312564, - "grad_norm": 1.7578125, - "learning_rate": 7.282912661279584e-05, - "loss": 6.1334, - "step": 805 - }, - { - "epoch": 0.8369678089304258, - "grad_norm": 1.328125, - "learning_rate": 7.19384776342199e-05, - "loss": 6.1515, - "step": 806 - }, - { - "epoch": 0.838006230529595, - "grad_norm": 1.6328125, - "learning_rate": 7.105288589044723e-05, - "loss": 6.044, - "step": 807 - }, - { - "epoch": 0.8390446521287642, - "grad_norm": 1.265625, - "learning_rate": 7.017236184409858e-05, - "loss": 6.4329, - "step": 808 - }, - { - "epoch": 0.8400830737279336, - "grad_norm": 1.203125, - "learning_rate": 6.929691589792358e-05, - "loss": 6.2727, - "step": 809 - }, - { - "epoch": 0.8411214953271028, - "grad_norm": 1.6015625, - "learning_rate": 6.842655839467788e-05, - "loss": 6.1676, - "step": 810 - }, - { - "epoch": 0.842159916926272, - "grad_norm": 1.546875, - "learning_rate": 6.756129961700075e-05, - "loss": 5.9781, - "step": 811 - }, - { - "epoch": 0.8431983385254413, - "grad_norm": 1.8515625, - "learning_rate": 6.670114978729391e-05, - "loss": 5.9659, - "step": 812 - }, - { - "epoch": 0.8442367601246106, - "grad_norm": 1.6640625, - "learning_rate": 6.584611906760035e-05, - "loss": 5.9971, - "step": 813 - }, - { - "epoch": 0.8452751817237798, - "grad_norm": 1.7734375, - "learning_rate": 6.499621755948487e-05, - "loss": 6.3746, - "step": 814 - }, - { - "epoch": 0.8463136033229491, - "grad_norm": 1.296875, - "learning_rate": 6.415145530391403e-05, - "loss": 5.9012, - "step": 815 - }, - { - "epoch": 0.8473520249221184, - "grad_norm": 1.15625, - "learning_rate": 6.331184228113802e-05, - "loss": 6.3141, - "step": 816 - }, - { - "epoch": 0.8483904465212876, - "grad_norm": 1.5078125, - "learning_rate": 6.247738841057255e-05, - "loss": 6.2155, - "step": 817 - }, - { - "epoch": 0.8494288681204569, - "grad_norm": 1.65625, - "learning_rate": 6.164810355068179e-05, - "loss": 6.1955, - "step": 818 - }, - { - "epoch": 0.8504672897196262, - "grad_norm": 1.515625, - "learning_rate": 6.082399749886169e-05, - "loss": 6.3145, - "step": 819 - }, - { - "epoch": 0.8515057113187954, - "grad_norm": 1.3984375, - "learning_rate": 6.000507999132443e-05, - "loss": 6.173, - "step": 820 - }, - { - "epoch": 0.8525441329179647, - "grad_norm": 1.484375, - "learning_rate": 5.919136070298342e-05, - "loss": 6.0277, - "step": 821 - }, - { - "epoch": 0.8535825545171339, - "grad_norm": 1.4609375, - "learning_rate": 5.838284924733866e-05, - "loss": 6.163, - "step": 822 - }, - { - "epoch": 0.8546209761163032, - "grad_norm": 1.390625, - "learning_rate": 5.7579555176363654e-05, - "loss": 6.324, - "step": 823 - }, - { - "epoch": 0.8556593977154725, - "grad_norm": 1.5390625, - "learning_rate": 5.678148798039212e-05, - "loss": 6.3801, - "step": 824 - }, - { - "epoch": 0.8566978193146417, - "grad_norm": 1.5625, - "learning_rate": 5.598865708800616e-05, - "loss": 5.8392, - "step": 825 - }, - { - "epoch": 0.857736240913811, - "grad_norm": 1.609375, - "learning_rate": 5.520107186592477e-05, - "loss": 6.3217, - "step": 826 - }, - { - "epoch": 0.8587746625129803, - "grad_norm": 1.4609375, - "learning_rate": 5.441874161889304e-05, - "loss": 6.2988, - "step": 827 - }, - { - "epoch": 0.8598130841121495, - "grad_norm": 1.328125, - "learning_rate": 5.364167558957267e-05, - "loss": 6.3745, - "step": 828 - }, - { - "epoch": 0.8608515057113187, - "grad_norm": 1.296875, - "learning_rate": 5.286988295843215e-05, - "loss": 6.2074, - "step": 829 - }, - { - "epoch": 0.8618899273104881, - "grad_norm": 1.4296875, - "learning_rate": 5.2103372843638754e-05, - "loss": 6.2226, - "step": 830 - }, - { - "epoch": 0.8629283489096573, - "grad_norm": 1.59375, - "learning_rate": 5.134215430095068e-05, - "loss": 5.8549, - "step": 831 - }, - { - "epoch": 0.8639667705088265, - "grad_norm": 1.71875, - "learning_rate": 5.0586236323610034e-05, - "loss": 6.3173, - "step": 832 - }, - { - "epoch": 0.8650051921079959, - "grad_norm": 1.234375, - "learning_rate": 4.983562784223644e-05, - "loss": 6.2721, - "step": 833 - }, - { - "epoch": 0.8660436137071651, - "grad_norm": 1.40625, - "learning_rate": 4.909033772472205e-05, - "loss": 6.1832, - "step": 834 - }, - { - "epoch": 0.8670820353063343, - "grad_norm": 1.6640625, - "learning_rate": 4.835037477612619e-05, - "loss": 6.2962, - "step": 835 - }, - { - "epoch": 0.8681204569055037, - "grad_norm": 1.5625, - "learning_rate": 4.761574773857163e-05, - "loss": 6.3508, - "step": 836 - }, - { - "epoch": 0.8691588785046729, - "grad_norm": 1.5859375, - "learning_rate": 4.688646529114121e-05, - "loss": 6.0014, - "step": 837 - }, - { - "epoch": 0.8701973001038421, - "grad_norm": 1.5, - "learning_rate": 4.6162536049775385e-05, - "loss": 5.8404, - "step": 838 - }, - { - "epoch": 0.8712357217030114, - "grad_norm": 1.359375, - "learning_rate": 4.5443968567170314e-05, - "loss": 5.7522, - "step": 839 - }, - { - "epoch": 0.8722741433021807, - "grad_norm": 1.2890625, - "learning_rate": 4.4730771332676835e-05, - "loss": 6.0892, - "step": 840 - }, - { - "epoch": 0.8733125649013499, - "grad_norm": 1.328125, - "learning_rate": 4.402295277220048e-05, - "loss": 6.2618, - "step": 841 - }, - { - "epoch": 0.8743509865005192, - "grad_norm": 1.5390625, - "learning_rate": 4.3320521248101484e-05, - "loss": 5.9419, - "step": 842 - }, - { - "epoch": 0.8753894080996885, - "grad_norm": 1.75, - "learning_rate": 4.262348505909608e-05, - "loss": 5.9647, - "step": 843 - }, - { - "epoch": 0.8764278296988577, - "grad_norm": 1.703125, - "learning_rate": 4.1931852440158794e-05, - "loss": 6.2304, - "step": 844 - }, - { - "epoch": 0.877466251298027, - "grad_norm": 1.9765625, - "learning_rate": 4.124563156242467e-05, - "loss": 6.3681, - "step": 845 - }, - { - "epoch": 0.8785046728971962, - "grad_norm": 1.3828125, - "learning_rate": 4.056483053309301e-05, - "loss": 6.222, - "step": 846 - }, - { - "epoch": 0.8795430944963655, - "grad_norm": 1.625, - "learning_rate": 3.988945739533173e-05, - "loss": 6.1925, - "step": 847 - }, - { - "epoch": 0.8805815160955348, - "grad_norm": 1.4453125, - "learning_rate": 3.9219520128182086e-05, - "loss": 6.0481, - "step": 848 - }, - { - "epoch": 0.881619937694704, - "grad_norm": 1.3203125, - "learning_rate": 3.855502664646443e-05, - "loss": 6.1119, - "step": 849 - }, - { - "epoch": 0.8826583592938733, - "grad_norm": 1.40625, - "learning_rate": 3.789598480068479e-05, - "loss": 6.2874, - "step": 850 - }, - { - "epoch": 0.8836967808930426, - "grad_norm": 1.453125, - "learning_rate": 3.7242402376942096e-05, - "loss": 6.1814, - "step": 851 - }, - { - "epoch": 0.8847352024922118, - "grad_norm": 1.7734375, - "learning_rate": 3.659428709683621e-05, - "loss": 6.3491, - "step": 852 - }, - { - "epoch": 0.885773624091381, - "grad_norm": 1.171875, - "learning_rate": 3.59516466173766e-05, - "loss": 6.186, - "step": 853 - }, - { - "epoch": 0.8868120456905504, - "grad_norm": 1.390625, - "learning_rate": 3.531448853089192e-05, - "loss": 6.1761, - "step": 854 - }, - { - "epoch": 0.8878504672897196, - "grad_norm": 1.6640625, - "learning_rate": 3.4682820364940636e-05, - "loss": 6.235, - "step": 855 - }, - { - "epoch": 0.8888888888888888, - "grad_norm": 1.3046875, - "learning_rate": 3.40566495822216e-05, - "loss": 6.3703, - "step": 856 - }, - { - "epoch": 0.8899273104880582, - "grad_norm": 1.53125, - "learning_rate": 3.343598358048594e-05, - "loss": 6.2765, - "step": 857 - }, - { - "epoch": 0.8909657320872274, - "grad_norm": 1.703125, - "learning_rate": 3.2820829692449985e-05, - "loss": 6.4258, - "step": 858 - }, - { - "epoch": 0.8920041536863966, - "grad_norm": 1.6484375, - "learning_rate": 3.221119518570848e-05, - "loss": 5.9581, - "step": 859 - }, - { - "epoch": 0.893042575285566, - "grad_norm": 1.609375, - "learning_rate": 3.160708726264855e-05, - "loss": 6.1848, - "step": 860 - }, - { - "epoch": 0.8940809968847352, - "grad_norm": 1.640625, - "learning_rate": 3.100851306036512e-05, - "loss": 6.2575, - "step": 861 - }, - { - "epoch": 0.8951194184839044, - "grad_norm": 1.5859375, - "learning_rate": 3.0415479650575784e-05, - "loss": 5.9909, - "step": 862 - }, - { - "epoch": 0.8961578400830738, - "grad_norm": 1.3515625, - "learning_rate": 2.982799403953801e-05, - "loss": 5.9893, - "step": 863 - }, - { - "epoch": 0.897196261682243, - "grad_norm": 1.4921875, - "learning_rate": 2.9246063167965962e-05, - "loss": 6.1746, - "step": 864 - }, - { - "epoch": 0.8982346832814122, - "grad_norm": 1.4453125, - "learning_rate": 2.8669693910948646e-05, - "loss": 6.1166, - "step": 865 - }, - { - "epoch": 0.8992731048805815, - "grad_norm": 1.5, - "learning_rate": 2.809889307786856e-05, - "loss": 6.2788, - "step": 866 - }, - { - "epoch": 0.9003115264797508, - "grad_norm": 1.71875, - "learning_rate": 2.7533667412321385e-05, - "loss": 5.8872, - "step": 867 - }, - { - "epoch": 0.90134994807892, - "grad_norm": 1.75, - "learning_rate": 2.6974023592036378e-05, - "loss": 6.292, - "step": 868 - }, - { - "epoch": 0.9023883696780893, - "grad_norm": 1.5390625, - "learning_rate": 2.6419968228797274e-05, - "loss": 6.03, - "step": 869 - }, - { - "epoch": 0.9034267912772586, - "grad_norm": 1.34375, - "learning_rate": 2.5871507868364063e-05, - "loss": 6.2342, - "step": 870 - }, - { - "epoch": 0.9044652128764278, - "grad_norm": 1.453125, - "learning_rate": 2.532864899039622e-05, - "loss": 6.2455, - "step": 871 - }, - { - "epoch": 0.9055036344755971, - "grad_norm": 1.8125, - "learning_rate": 2.4791398008375542e-05, - "loss": 5.9972, - "step": 872 - }, - { - "epoch": 0.9065420560747663, - "grad_norm": 1.2421875, - "learning_rate": 2.4259761269530666e-05, - "loss": 6.218, - "step": 873 - }, - { - "epoch": 0.9075804776739356, - "grad_norm": 1.6796875, - "learning_rate": 2.3733745054762058e-05, - "loss": 5.6284, - "step": 874 - }, - { - "epoch": 0.9086188992731049, - "grad_norm": 1.3671875, - "learning_rate": 2.321335557856791e-05, - "loss": 6.2292, - "step": 875 - }, - { - "epoch": 0.9096573208722741, - "grad_norm": 1.3671875, - "learning_rate": 2.2698598988970422e-05, - "loss": 6.3235, - "step": 876 - }, - { - "epoch": 0.9106957424714434, - "grad_norm": 1.484375, - "learning_rate": 2.2189481367443366e-05, - "loss": 6.2901, - "step": 877 - }, - { - "epoch": 0.9117341640706127, - "grad_norm": 1.3828125, - "learning_rate": 2.16860087288403e-05, - "loss": 6.1946, - "step": 878 - }, - { - "epoch": 0.9127725856697819, - "grad_norm": 1.390625, - "learning_rate": 2.1188187021323413e-05, - "loss": 5.9255, - "step": 879 - }, - { - "epoch": 0.9138110072689511, - "grad_norm": 1.6171875, - "learning_rate": 2.0696022126293124e-05, - "loss": 5.9947, - "step": 880 - }, - { - "epoch": 0.9148494288681205, - "grad_norm": 1.3828125, - "learning_rate": 2.0209519858319037e-05, - "loss": 6.0533, - "step": 881 - }, - { - "epoch": 0.9158878504672897, - "grad_norm": 1.296875, - "learning_rate": 1.9728685965070602e-05, - "loss": 6.1479, - "step": 882 - }, - { - "epoch": 0.9169262720664589, - "grad_norm": 1.1875, - "learning_rate": 1.925352612724979e-05, - "loss": 6.1287, - "step": 883 - }, - { - "epoch": 0.9179646936656283, - "grad_norm": 1.4609375, - "learning_rate": 1.878404595852362e-05, - "loss": 6.1971, - "step": 884 - }, - { - "epoch": 0.9190031152647975, - "grad_norm": 2.34375, - "learning_rate": 1.8320251005457976e-05, - "loss": 6.3691, - "step": 885 - }, - { - "epoch": 0.9200415368639667, - "grad_norm": 1.4296875, - "learning_rate": 1.7862146747452178e-05, - "loss": 6.3665, - "step": 886 - }, - { - "epoch": 0.9210799584631361, - "grad_norm": 1.5625, - "learning_rate": 1.740973859667394e-05, - "loss": 6.278, - "step": 887 - }, - { - "epoch": 0.9221183800623053, - "grad_norm": 1.8125, - "learning_rate": 1.6963031897995862e-05, - "loss": 6.1343, - "step": 888 - }, - { - "epoch": 0.9231568016614745, - "grad_norm": 1.3046875, - "learning_rate": 1.652203192893187e-05, - "loss": 6.3423, - "step": 889 - }, - { - "epoch": 0.9241952232606438, - "grad_norm": 1.78125, - "learning_rate": 1.6086743899575042e-05, - "loss": 6.2271, - "step": 890 - }, - { - "epoch": 0.9252336448598131, - "grad_norm": 1.578125, - "learning_rate": 1.56571729525361e-05, - "loss": 6.1616, - "step": 891 - }, - { - "epoch": 0.9262720664589823, - "grad_norm": 1.1875, - "learning_rate": 1.5233324162882589e-05, - "loss": 6.3031, - "step": 892 - }, - { - "epoch": 0.9273104880581516, - "grad_norm": 1.3828125, - "learning_rate": 1.4815202538078998e-05, - "loss": 6.2343, - "step": 893 - }, - { - "epoch": 0.9283489096573209, - "grad_norm": 1.9609375, - "learning_rate": 1.4402813017927396e-05, - "loss": 6.1985, - "step": 894 - }, - { - "epoch": 0.9293873312564901, - "grad_norm": 1.3046875, - "learning_rate": 1.3996160474509411e-05, - "loss": 6.2852, - "step": 895 - }, - { - "epoch": 0.9304257528556594, - "grad_norm": 1.8828125, - "learning_rate": 1.3595249712128333e-05, - "loss": 6.39, - "step": 896 - }, - { - "epoch": 0.9314641744548287, - "grad_norm": 1.5078125, - "learning_rate": 1.3200085467252488e-05, - "loss": 6.1291, - "step": 897 - }, - { - "epoch": 0.9325025960539979, - "grad_norm": 1.734375, - "learning_rate": 1.28106724084594e-05, - "loss": 6.2431, - "step": 898 - }, - { - "epoch": 0.9335410176531672, - "grad_norm": 1.1953125, - "learning_rate": 1.2427015136380393e-05, - "loss": 6.2799, - "step": 899 - }, - { - "epoch": 0.9345794392523364, - "grad_norm": 1.3359375, - "learning_rate": 1.2049118183646401e-05, - "loss": 6.259, - "step": 900 - }, - { - "epoch": 0.9345794392523364, - "eval_loss": 6.254264831542969, - "eval_runtime": 1.7188, - "eval_samples_per_second": 9.309, - "eval_steps_per_second": 1.164, - "step": 900 } ], "logging_steps": 1, @@ -6399,7 +735,7 @@ "attributes": {} } }, - "total_flos": 1.146256374104064e+17, + "total_flos": 1.27361819344896e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null