diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,56033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.5, + "eval_steps": 500, + "global_step": 7000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005, + "grad_norm": 104.43379556992704, + "learning_rate": 2.5e-09, + "loss": 3.9866, + "mean_token_accuracy": 0.5756432414054871, + "step": 1 + }, + { + "epoch": 0.001, + "grad_norm": 136.2921206129143, + "learning_rate": 5e-09, + "loss": 4.4461, + "mean_token_accuracy": 0.5250204801559448, + "step": 2 + }, + { + "epoch": 0.0015, + "grad_norm": 109.79368760449812, + "learning_rate": 7.500000000000001e-09, + "loss": 4.3054, + "mean_token_accuracy": 0.5249227285385132, + "step": 3 + }, + { + "epoch": 0.002, + "grad_norm": 104.33504508566458, + "learning_rate": 1e-08, + "loss": 4.4292, + "mean_token_accuracy": 0.5293892621994019, + "step": 4 + }, + { + "epoch": 0.0025, + "grad_norm": 145.01117105279133, + "learning_rate": 1.2500000000000001e-08, + "loss": 5.1781, + "mean_token_accuracy": 0.48397791385650635, + "step": 5 + }, + { + "epoch": 0.003, + "grad_norm": 113.33849139271989, + "learning_rate": 1.5000000000000002e-08, + "loss": 4.3804, + "mean_token_accuracy": 0.5229966640472412, + "step": 6 + }, + { + "epoch": 0.0035, + "grad_norm": 96.83454747097487, + "learning_rate": 1.75e-08, + "loss": 3.8389, + "mean_token_accuracy": 0.5735393166542053, + "step": 7 + }, + { + "epoch": 0.004, + "grad_norm": 97.70149433417195, + "learning_rate": 2e-08, + "loss": 4.1325, + "mean_token_accuracy": 0.5280811190605164, + "step": 8 + }, + { + "epoch": 0.0045, + "grad_norm": 185.9155865618245, + "learning_rate": 2.25e-08, + "loss": 4.3011, + "mean_token_accuracy": 0.5282331705093384, + "step": 9 + }, + { + "epoch": 0.005, + "grad_norm": 98.75318471412527, + "learning_rate": 2.5000000000000002e-08, + "loss": 3.9713, + "mean_token_accuracy": 0.5515362620353699, + "step": 10 + }, + { + "epoch": 0.0055, + "grad_norm": 127.5480254004297, + "learning_rate": 2.75e-08, + "loss": 4.7802, + "mean_token_accuracy": 0.5131838321685791, + "step": 11 + }, + { + "epoch": 0.006, + "grad_norm": 113.64332963141999, + "learning_rate": 3.0000000000000004e-08, + "loss": 4.5021, + "mean_token_accuracy": 0.5038599371910095, + "step": 12 + }, + { + "epoch": 0.0065, + "grad_norm": 114.46541964650037, + "learning_rate": 3.25e-08, + "loss": 4.0431, + "mean_token_accuracy": 0.5543318390846252, + "step": 13 + }, + { + "epoch": 0.007, + "grad_norm": 95.55958475652747, + "learning_rate": 3.5e-08, + "loss": 3.9162, + "mean_token_accuracy": 0.5451327562332153, + "step": 14 + }, + { + "epoch": 0.0075, + "grad_norm": 120.33503760193925, + "learning_rate": 3.7500000000000005e-08, + "loss": 4.1018, + "mean_token_accuracy": 0.5334757566452026, + "step": 15 + }, + { + "epoch": 0.008, + "grad_norm": 92.53356191058556, + "learning_rate": 4e-08, + "loss": 3.7717, + "mean_token_accuracy": 0.5530080795288086, + "step": 16 + }, + { + "epoch": 0.0085, + "grad_norm": 100.36515194387209, + "learning_rate": 4.2500000000000003e-08, + "loss": 3.6579, + "mean_token_accuracy": 0.606023907661438, + "step": 17 + }, + { + "epoch": 0.009, + "grad_norm": 98.04973651878664, + "learning_rate": 4.5e-08, + "loss": 4.0418, + "mean_token_accuracy": 0.5361368656158447, + "step": 18 + }, + { + "epoch": 0.0095, + "grad_norm": 106.7087524203917, + "learning_rate": 4.75e-08, + "loss": 4.4614, + "mean_token_accuracy": 0.5594761371612549, + "step": 19 + }, + { + "epoch": 0.01, + "grad_norm": 130.79589087708518, + "learning_rate": 5.0000000000000004e-08, + "loss": 4.6448, + "mean_token_accuracy": 0.5175386071205139, + "step": 20 + }, + { + "epoch": 0.0105, + "grad_norm": 102.46613115972703, + "learning_rate": 5.250000000000001e-08, + "loss": 4.0862, + "mean_token_accuracy": 0.541530430316925, + "step": 21 + }, + { + "epoch": 0.011, + "grad_norm": 128.2459174333983, + "learning_rate": 5.5e-08, + "loss": 4.5051, + "mean_token_accuracy": 0.5114660263061523, + "step": 22 + }, + { + "epoch": 0.0115, + "grad_norm": 111.13528635236868, + "learning_rate": 5.7500000000000005e-08, + "loss": 4.3419, + "mean_token_accuracy": 0.5170454382896423, + "step": 23 + }, + { + "epoch": 0.012, + "grad_norm": 90.90991078047513, + "learning_rate": 6.000000000000001e-08, + "loss": 3.8515, + "mean_token_accuracy": 0.5634060502052307, + "step": 24 + }, + { + "epoch": 0.0125, + "grad_norm": 89.11530774037661, + "learning_rate": 6.250000000000001e-08, + "loss": 3.7381, + "mean_token_accuracy": 0.5497871041297913, + "step": 25 + }, + { + "epoch": 0.013, + "grad_norm": 116.378185032941, + "learning_rate": 6.5e-08, + "loss": 4.5839, + "mean_token_accuracy": 0.5325897336006165, + "step": 26 + }, + { + "epoch": 0.0135, + "grad_norm": 97.50452833494519, + "learning_rate": 6.75e-08, + "loss": 3.8281, + "mean_token_accuracy": 0.5867918729782104, + "step": 27 + }, + { + "epoch": 0.014, + "grad_norm": 104.88064177874197, + "learning_rate": 7e-08, + "loss": 4.2888, + "mean_token_accuracy": 0.5312155485153198, + "step": 28 + }, + { + "epoch": 0.0145, + "grad_norm": 151.58695263284272, + "learning_rate": 7.250000000000001e-08, + "loss": 4.5217, + "mean_token_accuracy": 0.529009222984314, + "step": 29 + }, + { + "epoch": 0.015, + "grad_norm": 108.66664637663065, + "learning_rate": 7.500000000000001e-08, + "loss": 4.1538, + "mean_token_accuracy": 0.5331829190254211, + "step": 30 + }, + { + "epoch": 0.0155, + "grad_norm": 113.66255186924342, + "learning_rate": 7.750000000000001e-08, + "loss": 4.1864, + "mean_token_accuracy": 0.5408940315246582, + "step": 31 + }, + { + "epoch": 0.016, + "grad_norm": 110.12888134110074, + "learning_rate": 8e-08, + "loss": 4.5084, + "mean_token_accuracy": 0.5330917239189148, + "step": 32 + }, + { + "epoch": 0.0165, + "grad_norm": 108.30182391390225, + "learning_rate": 8.25e-08, + "loss": 4.4225, + "mean_token_accuracy": 0.49871042370796204, + "step": 33 + }, + { + "epoch": 0.017, + "grad_norm": 86.10443415325719, + "learning_rate": 8.500000000000001e-08, + "loss": 3.8653, + "mean_token_accuracy": 0.546950101852417, + "step": 34 + }, + { + "epoch": 0.0175, + "grad_norm": 159.39279056032683, + "learning_rate": 8.750000000000001e-08, + "loss": 3.8011, + "mean_token_accuracy": 0.5468406081199646, + "step": 35 + }, + { + "epoch": 0.018, + "grad_norm": 124.39727618055925, + "learning_rate": 9e-08, + "loss": 4.6077, + "mean_token_accuracy": 0.5083958506584167, + "step": 36 + }, + { + "epoch": 0.0185, + "grad_norm": 112.87506682663928, + "learning_rate": 9.25e-08, + "loss": 3.8902, + "mean_token_accuracy": 0.5520051121711731, + "step": 37 + }, + { + "epoch": 0.019, + "grad_norm": 104.63935041451096, + "learning_rate": 9.5e-08, + "loss": 4.0397, + "mean_token_accuracy": 0.5446016192436218, + "step": 38 + }, + { + "epoch": 0.0195, + "grad_norm": 271.3206766681036, + "learning_rate": 9.75e-08, + "loss": 3.9916, + "mean_token_accuracy": 0.5392693877220154, + "step": 39 + }, + { + "epoch": 0.02, + "grad_norm": 100.53230978548245, + "learning_rate": 1.0000000000000001e-07, + "loss": 4.1405, + "mean_token_accuracy": 0.5346001982688904, + "step": 40 + }, + { + "epoch": 0.0205, + "grad_norm": 99.6504914689114, + "learning_rate": 1.0250000000000001e-07, + "loss": 3.6825, + "mean_token_accuracy": 0.5634505152702332, + "step": 41 + }, + { + "epoch": 0.021, + "grad_norm": 157.82043953402598, + "learning_rate": 1.0500000000000001e-07, + "loss": 4.3953, + "mean_token_accuracy": 0.5134896636009216, + "step": 42 + }, + { + "epoch": 0.0215, + "grad_norm": 107.91011452966937, + "learning_rate": 1.075e-07, + "loss": 4.2247, + "mean_token_accuracy": 0.5312015414237976, + "step": 43 + }, + { + "epoch": 0.022, + "grad_norm": 101.19701047249734, + "learning_rate": 1.1e-07, + "loss": 3.7581, + "mean_token_accuracy": 0.5569323301315308, + "step": 44 + }, + { + "epoch": 0.0225, + "grad_norm": 98.60902935549137, + "learning_rate": 1.1250000000000001e-07, + "loss": 3.2791, + "mean_token_accuracy": 0.6154070496559143, + "step": 45 + }, + { + "epoch": 0.023, + "grad_norm": 114.55411365251855, + "learning_rate": 1.1500000000000001e-07, + "loss": 4.3515, + "mean_token_accuracy": 0.5188862681388855, + "step": 46 + }, + { + "epoch": 0.0235, + "grad_norm": 79.49362082354764, + "learning_rate": 1.1750000000000001e-07, + "loss": 3.4079, + "mean_token_accuracy": 0.583070695400238, + "step": 47 + }, + { + "epoch": 0.024, + "grad_norm": 97.90869504123893, + "learning_rate": 1.2000000000000002e-07, + "loss": 4.0423, + "mean_token_accuracy": 0.5397695899009705, + "step": 48 + }, + { + "epoch": 0.0245, + "grad_norm": 82.16795457951133, + "learning_rate": 1.2250000000000002e-07, + "loss": 3.62, + "mean_token_accuracy": 0.5634582042694092, + "step": 49 + }, + { + "epoch": 0.025, + "grad_norm": 77.87938007343516, + "learning_rate": 1.2500000000000002e-07, + "loss": 3.4596, + "mean_token_accuracy": 0.5864777565002441, + "step": 50 + }, + { + "epoch": 0.0255, + "grad_norm": 141.91712006653177, + "learning_rate": 1.275e-07, + "loss": 3.8848, + "mean_token_accuracy": 0.5438910722732544, + "step": 51 + }, + { + "epoch": 0.026, + "grad_norm": 95.59210809942282, + "learning_rate": 1.3e-07, + "loss": 3.9797, + "mean_token_accuracy": 0.5360203981399536, + "step": 52 + }, + { + "epoch": 0.0265, + "grad_norm": 87.92980416630832, + "learning_rate": 1.325e-07, + "loss": 3.7004, + "mean_token_accuracy": 0.5490781664848328, + "step": 53 + }, + { + "epoch": 0.027, + "grad_norm": 96.43744962517417, + "learning_rate": 1.35e-07, + "loss": 3.7457, + "mean_token_accuracy": 0.5738234519958496, + "step": 54 + }, + { + "epoch": 0.0275, + "grad_norm": 112.7908014041784, + "learning_rate": 1.375e-07, + "loss": 4.7982, + "mean_token_accuracy": 0.534604549407959, + "step": 55 + }, + { + "epoch": 0.028, + "grad_norm": 105.08794272716375, + "learning_rate": 1.4e-07, + "loss": 4.107, + "mean_token_accuracy": 0.5355709791183472, + "step": 56 + }, + { + "epoch": 0.0285, + "grad_norm": 90.88660462703972, + "learning_rate": 1.425e-07, + "loss": 3.8786, + "mean_token_accuracy": 0.5373102426528931, + "step": 57 + }, + { + "epoch": 0.029, + "grad_norm": 94.01272270848632, + "learning_rate": 1.4500000000000001e-07, + "loss": 3.7868, + "mean_token_accuracy": 0.5496562719345093, + "step": 58 + }, + { + "epoch": 0.0295, + "grad_norm": 100.40765331002936, + "learning_rate": 1.4750000000000002e-07, + "loss": 4.0683, + "mean_token_accuracy": 0.5399691462516785, + "step": 59 + }, + { + "epoch": 0.03, + "grad_norm": 76.28463089090802, + "learning_rate": 1.5000000000000002e-07, + "loss": 3.5014, + "mean_token_accuracy": 0.5804827213287354, + "step": 60 + }, + { + "epoch": 0.0305, + "grad_norm": 84.58766102718256, + "learning_rate": 1.5250000000000002e-07, + "loss": 3.7593, + "mean_token_accuracy": 0.575046718120575, + "step": 61 + }, + { + "epoch": 0.031, + "grad_norm": 73.39127255839719, + "learning_rate": 1.5500000000000002e-07, + "loss": 3.2767, + "mean_token_accuracy": 0.5805550813674927, + "step": 62 + }, + { + "epoch": 0.0315, + "grad_norm": 88.29492811313071, + "learning_rate": 1.575e-07, + "loss": 3.5298, + "mean_token_accuracy": 0.5585082769393921, + "step": 63 + }, + { + "epoch": 0.032, + "grad_norm": 74.84947303533023, + "learning_rate": 1.6e-07, + "loss": 3.4125, + "mean_token_accuracy": 0.5853991508483887, + "step": 64 + }, + { + "epoch": 0.0325, + "grad_norm": 59.83286823570855, + "learning_rate": 1.625e-07, + "loss": 2.9141, + "mean_token_accuracy": 0.5974161028862, + "step": 65 + }, + { + "epoch": 0.033, + "grad_norm": 62.740498801883206, + "learning_rate": 1.65e-07, + "loss": 2.9796, + "mean_token_accuracy": 0.5988442301750183, + "step": 66 + }, + { + "epoch": 0.0335, + "grad_norm": 81.10830628308806, + "learning_rate": 1.675e-07, + "loss": 3.6709, + "mean_token_accuracy": 0.5548274517059326, + "step": 67 + }, + { + "epoch": 0.034, + "grad_norm": 110.509530026277, + "learning_rate": 1.7000000000000001e-07, + "loss": 3.8188, + "mean_token_accuracy": 0.5492273569107056, + "step": 68 + }, + { + "epoch": 0.0345, + "grad_norm": 81.6213428208745, + "learning_rate": 1.7250000000000002e-07, + "loss": 3.686, + "mean_token_accuracy": 0.5580618381500244, + "step": 69 + }, + { + "epoch": 0.035, + "grad_norm": 67.21264300745776, + "learning_rate": 1.7500000000000002e-07, + "loss": 3.0652, + "mean_token_accuracy": 0.611005961894989, + "step": 70 + }, + { + "epoch": 0.0355, + "grad_norm": 67.32766788333228, + "learning_rate": 1.775e-07, + "loss": 3.0083, + "mean_token_accuracy": 0.596723198890686, + "step": 71 + }, + { + "epoch": 0.036, + "grad_norm": 87.85051423912351, + "learning_rate": 1.8e-07, + "loss": 3.2317, + "mean_token_accuracy": 0.57576984167099, + "step": 72 + }, + { + "epoch": 0.0365, + "grad_norm": 64.58940091362433, + "learning_rate": 1.825e-07, + "loss": 3.0175, + "mean_token_accuracy": 0.6153965592384338, + "step": 73 + }, + { + "epoch": 0.037, + "grad_norm": 72.49640592353627, + "learning_rate": 1.85e-07, + "loss": 3.2937, + "mean_token_accuracy": 0.588723361492157, + "step": 74 + }, + { + "epoch": 0.0375, + "grad_norm": 71.02342545532157, + "learning_rate": 1.875e-07, + "loss": 3.295, + "mean_token_accuracy": 0.5702571868896484, + "step": 75 + }, + { + "epoch": 0.038, + "grad_norm": 71.10674302536903, + "learning_rate": 1.9e-07, + "loss": 3.0816, + "mean_token_accuracy": 0.6124688386917114, + "step": 76 + }, + { + "epoch": 0.0385, + "grad_norm": 73.67837955885236, + "learning_rate": 1.925e-07, + "loss": 3.6402, + "mean_token_accuracy": 0.5645330548286438, + "step": 77 + }, + { + "epoch": 0.039, + "grad_norm": 67.8069587192761, + "learning_rate": 1.95e-07, + "loss": 3.2184, + "mean_token_accuracy": 0.607692301273346, + "step": 78 + }, + { + "epoch": 0.0395, + "grad_norm": 58.22055190752241, + "learning_rate": 1.9750000000000001e-07, + "loss": 2.9321, + "mean_token_accuracy": 0.5934129357337952, + "step": 79 + }, + { + "epoch": 0.04, + "grad_norm": 148.6698426571755, + "learning_rate": 2.0000000000000002e-07, + "loss": 2.7701, + "mean_token_accuracy": 0.6352377533912659, + "step": 80 + }, + { + "epoch": 0.0405, + "grad_norm": 88.21813672359454, + "learning_rate": 2.0250000000000002e-07, + "loss": 2.6321, + "mean_token_accuracy": 0.6280364990234375, + "step": 81 + }, + { + "epoch": 0.041, + "grad_norm": 52.66089336444737, + "learning_rate": 2.0500000000000002e-07, + "loss": 2.4028, + "mean_token_accuracy": 0.6516039371490479, + "step": 82 + }, + { + "epoch": 0.0415, + "grad_norm": 42.213338711817165, + "learning_rate": 2.0750000000000003e-07, + "loss": 2.4168, + "mean_token_accuracy": 0.6394001245498657, + "step": 83 + }, + { + "epoch": 0.042, + "grad_norm": 67.01508914845404, + "learning_rate": 2.1000000000000003e-07, + "loss": 2.6125, + "mean_token_accuracy": 0.6424080729484558, + "step": 84 + }, + { + "epoch": 0.0425, + "grad_norm": 63.46366352591913, + "learning_rate": 2.1250000000000003e-07, + "loss": 2.476, + "mean_token_accuracy": 0.6502318382263184, + "step": 85 + }, + { + "epoch": 0.043, + "grad_norm": 74.87452461775297, + "learning_rate": 2.15e-07, + "loss": 2.6291, + "mean_token_accuracy": 0.6346423625946045, + "step": 86 + }, + { + "epoch": 0.0435, + "grad_norm": 143.154656160571, + "learning_rate": 2.175e-07, + "loss": 2.8455, + "mean_token_accuracy": 0.629447877407074, + "step": 87 + }, + { + "epoch": 0.044, + "grad_norm": 43.390049986705066, + "learning_rate": 2.2e-07, + "loss": 2.2302, + "mean_token_accuracy": 0.6652181148529053, + "step": 88 + }, + { + "epoch": 0.0445, + "grad_norm": 52.34249831608958, + "learning_rate": 2.2250000000000001e-07, + "loss": 2.3508, + "mean_token_accuracy": 0.6453005075454712, + "step": 89 + }, + { + "epoch": 0.045, + "grad_norm": 38.68580587325263, + "learning_rate": 2.2500000000000002e-07, + "loss": 2.2084, + "mean_token_accuracy": 0.6631662845611572, + "step": 90 + }, + { + "epoch": 0.0455, + "grad_norm": 31.68333473049462, + "learning_rate": 2.2750000000000002e-07, + "loss": 1.9769, + "mean_token_accuracy": 0.6889253258705139, + "step": 91 + }, + { + "epoch": 0.046, + "grad_norm": 38.43636570280464, + "learning_rate": 2.3000000000000002e-07, + "loss": 2.1271, + "mean_token_accuracy": 0.6674694418907166, + "step": 92 + }, + { + "epoch": 0.0465, + "grad_norm": 54.90261543919748, + "learning_rate": 2.3250000000000002e-07, + "loss": 2.3998, + "mean_token_accuracy": 0.6535195112228394, + "step": 93 + }, + { + "epoch": 0.047, + "grad_norm": 305.91675354302134, + "learning_rate": 2.3500000000000003e-07, + "loss": 2.2088, + "mean_token_accuracy": 0.672615647315979, + "step": 94 + }, + { + "epoch": 0.0475, + "grad_norm": 42.437200616857304, + "learning_rate": 2.3750000000000003e-07, + "loss": 2.6227, + "mean_token_accuracy": 0.6292094588279724, + "step": 95 + }, + { + "epoch": 0.048, + "grad_norm": 51.43056110746083, + "learning_rate": 2.4000000000000003e-07, + "loss": 2.4636, + "mean_token_accuracy": 0.6406076550483704, + "step": 96 + }, + { + "epoch": 0.0485, + "grad_norm": 50.3661378392945, + "learning_rate": 2.425e-07, + "loss": 2.2679, + "mean_token_accuracy": 0.6293948292732239, + "step": 97 + }, + { + "epoch": 0.049, + "grad_norm": 33.49097021410256, + "learning_rate": 2.4500000000000004e-07, + "loss": 2.1236, + "mean_token_accuracy": 0.6550528407096863, + "step": 98 + }, + { + "epoch": 0.0495, + "grad_norm": 54.55499624917994, + "learning_rate": 2.475e-07, + "loss": 2.2728, + "mean_token_accuracy": 0.6322933435440063, + "step": 99 + }, + { + "epoch": 0.05, + "grad_norm": 37.90925486641291, + "learning_rate": 2.5000000000000004e-07, + "loss": 2.2719, + "mean_token_accuracy": 0.6356201171875, + "step": 100 + }, + { + "epoch": 0.0505, + "grad_norm": 42.97315568907401, + "learning_rate": 2.525e-07, + "loss": 1.9878, + "mean_token_accuracy": 0.6541979908943176, + "step": 101 + }, + { + "epoch": 0.051, + "grad_norm": 38.55558441156077, + "learning_rate": 2.55e-07, + "loss": 2.2766, + "mean_token_accuracy": 0.6552030444145203, + "step": 102 + }, + { + "epoch": 0.0515, + "grad_norm": 40.95704856096843, + "learning_rate": 2.575e-07, + "loss": 1.8545, + "mean_token_accuracy": 0.7035004496574402, + "step": 103 + }, + { + "epoch": 0.052, + "grad_norm": 27.30210855839091, + "learning_rate": 2.6e-07, + "loss": 1.9009, + "mean_token_accuracy": 0.666824221611023, + "step": 104 + }, + { + "epoch": 0.0525, + "grad_norm": 40.14489996306052, + "learning_rate": 2.6250000000000003e-07, + "loss": 2.1033, + "mean_token_accuracy": 0.6789297461509705, + "step": 105 + }, + { + "epoch": 0.053, + "grad_norm": 29.373547078729004, + "learning_rate": 2.65e-07, + "loss": 1.7665, + "mean_token_accuracy": 0.6956717371940613, + "step": 106 + }, + { + "epoch": 0.0535, + "grad_norm": 48.37431488718083, + "learning_rate": 2.6750000000000003e-07, + "loss": 1.8062, + "mean_token_accuracy": 0.667114794254303, + "step": 107 + }, + { + "epoch": 0.054, + "grad_norm": 33.208318357094434, + "learning_rate": 2.7e-07, + "loss": 1.7774, + "mean_token_accuracy": 0.7020869851112366, + "step": 108 + }, + { + "epoch": 0.0545, + "grad_norm": 52.55125666080506, + "learning_rate": 2.7250000000000004e-07, + "loss": 1.7478, + "mean_token_accuracy": 0.69691002368927, + "step": 109 + }, + { + "epoch": 0.055, + "grad_norm": 26.206302038394547, + "learning_rate": 2.75e-07, + "loss": 1.7581, + "mean_token_accuracy": 0.690674364566803, + "step": 110 + }, + { + "epoch": 0.0555, + "grad_norm": 59.64931192107834, + "learning_rate": 2.7750000000000004e-07, + "loss": 1.7717, + "mean_token_accuracy": 0.7072126865386963, + "step": 111 + }, + { + "epoch": 0.056, + "grad_norm": 26.08828514829347, + "learning_rate": 2.8e-07, + "loss": 1.6934, + "mean_token_accuracy": 0.6920771598815918, + "step": 112 + }, + { + "epoch": 0.0565, + "grad_norm": 40.628884375849125, + "learning_rate": 2.8250000000000005e-07, + "loss": 1.9263, + "mean_token_accuracy": 0.6974894404411316, + "step": 113 + }, + { + "epoch": 0.057, + "grad_norm": 30.858417696295323, + "learning_rate": 2.85e-07, + "loss": 1.7164, + "mean_token_accuracy": 0.7053359150886536, + "step": 114 + }, + { + "epoch": 0.0575, + "grad_norm": 20.737921539056675, + "learning_rate": 2.8750000000000005e-07, + "loss": 1.602, + "mean_token_accuracy": 0.7161341905593872, + "step": 115 + }, + { + "epoch": 0.058, + "grad_norm": 19.742916409274034, + "learning_rate": 2.9000000000000003e-07, + "loss": 1.4939, + "mean_token_accuracy": 0.6953277587890625, + "step": 116 + }, + { + "epoch": 0.0585, + "grad_norm": 24.49686985811096, + "learning_rate": 2.9250000000000006e-07, + "loss": 1.6372, + "mean_token_accuracy": 0.7129045128822327, + "step": 117 + }, + { + "epoch": 0.059, + "grad_norm": 24.300087690019865, + "learning_rate": 2.9500000000000003e-07, + "loss": 1.5723, + "mean_token_accuracy": 0.7128270864486694, + "step": 118 + }, + { + "epoch": 0.0595, + "grad_norm": 19.28026051255224, + "learning_rate": 2.975e-07, + "loss": 1.5277, + "mean_token_accuracy": 0.6923511624336243, + "step": 119 + }, + { + "epoch": 0.06, + "grad_norm": 21.603764243296347, + "learning_rate": 3.0000000000000004e-07, + "loss": 1.465, + "mean_token_accuracy": 0.7124010324478149, + "step": 120 + }, + { + "epoch": 0.0605, + "grad_norm": 28.673960130908, + "learning_rate": 3.025e-07, + "loss": 1.3159, + "mean_token_accuracy": 0.7334812879562378, + "step": 121 + }, + { + "epoch": 0.061, + "grad_norm": 24.41405332584742, + "learning_rate": 3.0500000000000004e-07, + "loss": 1.4729, + "mean_token_accuracy": 0.7138429880142212, + "step": 122 + }, + { + "epoch": 0.0615, + "grad_norm": 18.633568317837693, + "learning_rate": 3.075e-07, + "loss": 1.4131, + "mean_token_accuracy": 0.7090487480163574, + "step": 123 + }, + { + "epoch": 0.062, + "grad_norm": 18.83213822116201, + "learning_rate": 3.1000000000000005e-07, + "loss": 1.2531, + "mean_token_accuracy": 0.7438396215438843, + "step": 124 + }, + { + "epoch": 0.0625, + "grad_norm": 25.49321063454615, + "learning_rate": 3.125e-07, + "loss": 1.3401, + "mean_token_accuracy": 0.7183177471160889, + "step": 125 + }, + { + "epoch": 0.063, + "grad_norm": 30.740773116086316, + "learning_rate": 3.15e-07, + "loss": 1.3604, + "mean_token_accuracy": 0.7298600077629089, + "step": 126 + }, + { + "epoch": 0.0635, + "grad_norm": 13.782572477631847, + "learning_rate": 3.1750000000000003e-07, + "loss": 1.2628, + "mean_token_accuracy": 0.7238405346870422, + "step": 127 + }, + { + "epoch": 0.064, + "grad_norm": 64.53684197020571, + "learning_rate": 3.2e-07, + "loss": 1.2973, + "mean_token_accuracy": 0.7300660014152527, + "step": 128 + }, + { + "epoch": 0.0645, + "grad_norm": 25.397275845182783, + "learning_rate": 3.2250000000000004e-07, + "loss": 1.1557, + "mean_token_accuracy": 0.7471628189086914, + "step": 129 + }, + { + "epoch": 0.065, + "grad_norm": 12.662517150749188, + "learning_rate": 3.25e-07, + "loss": 1.2238, + "mean_token_accuracy": 0.7339003682136536, + "step": 130 + }, + { + "epoch": 0.0655, + "grad_norm": 11.65805411497064, + "learning_rate": 3.2750000000000004e-07, + "loss": 1.1028, + "mean_token_accuracy": 0.7471389770507812, + "step": 131 + }, + { + "epoch": 0.066, + "grad_norm": 14.689600076546723, + "learning_rate": 3.3e-07, + "loss": 1.094, + "mean_token_accuracy": 0.750293493270874, + "step": 132 + }, + { + "epoch": 0.0665, + "grad_norm": 13.454815121915011, + "learning_rate": 3.3250000000000005e-07, + "loss": 1.1182, + "mean_token_accuracy": 0.7399627566337585, + "step": 133 + }, + { + "epoch": 0.067, + "grad_norm": 24.191165232991732, + "learning_rate": 3.35e-07, + "loss": 1.1154, + "mean_token_accuracy": 0.7495072484016418, + "step": 134 + }, + { + "epoch": 0.0675, + "grad_norm": 13.38284877803766, + "learning_rate": 3.3750000000000005e-07, + "loss": 1.0493, + "mean_token_accuracy": 0.7537634372711182, + "step": 135 + }, + { + "epoch": 0.068, + "grad_norm": 13.641671718532667, + "learning_rate": 3.4000000000000003e-07, + "loss": 1.0509, + "mean_token_accuracy": 0.763604462146759, + "step": 136 + }, + { + "epoch": 0.0685, + "grad_norm": 9.993276446269832, + "learning_rate": 3.4250000000000006e-07, + "loss": 1.0776, + "mean_token_accuracy": 0.7468227744102478, + "step": 137 + }, + { + "epoch": 0.069, + "grad_norm": 15.973435371721932, + "learning_rate": 3.4500000000000003e-07, + "loss": 1.0864, + "mean_token_accuracy": 0.7474688291549683, + "step": 138 + }, + { + "epoch": 0.0695, + "grad_norm": 12.849710670146347, + "learning_rate": 3.4750000000000006e-07, + "loss": 0.9707, + "mean_token_accuracy": 0.7666284441947937, + "step": 139 + }, + { + "epoch": 0.07, + "grad_norm": 25.813736877978467, + "learning_rate": 3.5000000000000004e-07, + "loss": 0.8941, + "mean_token_accuracy": 0.7742288112640381, + "step": 140 + }, + { + "epoch": 0.0705, + "grad_norm": 11.281195527712853, + "learning_rate": 3.525e-07, + "loss": 0.965, + "mean_token_accuracy": 0.7730990648269653, + "step": 141 + }, + { + "epoch": 0.071, + "grad_norm": 13.805743150248224, + "learning_rate": 3.55e-07, + "loss": 1.0951, + "mean_token_accuracy": 0.7376875281333923, + "step": 142 + }, + { + "epoch": 0.0715, + "grad_norm": 11.998972659630667, + "learning_rate": 3.575e-07, + "loss": 0.9625, + "mean_token_accuracy": 0.756373941898346, + "step": 143 + }, + { + "epoch": 0.072, + "grad_norm": 9.430914765519834, + "learning_rate": 3.6e-07, + "loss": 0.8288, + "mean_token_accuracy": 0.783513069152832, + "step": 144 + }, + { + "epoch": 0.0725, + "grad_norm": 9.55340606870045, + "learning_rate": 3.625e-07, + "loss": 0.8042, + "mean_token_accuracy": 0.7879395484924316, + "step": 145 + }, + { + "epoch": 0.073, + "grad_norm": 15.575166704610806, + "learning_rate": 3.65e-07, + "loss": 0.9027, + "mean_token_accuracy": 0.7803046703338623, + "step": 146 + }, + { + "epoch": 0.0735, + "grad_norm": 24.507630101369642, + "learning_rate": 3.6750000000000003e-07, + "loss": 0.8602, + "mean_token_accuracy": 0.7739679217338562, + "step": 147 + }, + { + "epoch": 0.074, + "grad_norm": 13.210332042697447, + "learning_rate": 3.7e-07, + "loss": 0.8221, + "mean_token_accuracy": 0.7851002812385559, + "step": 148 + }, + { + "epoch": 0.0745, + "grad_norm": 10.74701794091736, + "learning_rate": 3.7250000000000003e-07, + "loss": 0.7812, + "mean_token_accuracy": 0.7973614931106567, + "step": 149 + }, + { + "epoch": 0.075, + "grad_norm": 9.053703222711345, + "learning_rate": 3.75e-07, + "loss": 0.8547, + "mean_token_accuracy": 0.7782257795333862, + "step": 150 + }, + { + "epoch": 0.0755, + "grad_norm": 15.88680594711285, + "learning_rate": 3.7750000000000004e-07, + "loss": 0.824, + "mean_token_accuracy": 0.7841760516166687, + "step": 151 + }, + { + "epoch": 0.076, + "grad_norm": 10.841089775771435, + "learning_rate": 3.8e-07, + "loss": 0.7938, + "mean_token_accuracy": 0.7886626124382019, + "step": 152 + }, + { + "epoch": 0.0765, + "grad_norm": 25.5129606540258, + "learning_rate": 3.8250000000000004e-07, + "loss": 0.6703, + "mean_token_accuracy": 0.8185693025588989, + "step": 153 + }, + { + "epoch": 0.077, + "grad_norm": 10.930973276267865, + "learning_rate": 3.85e-07, + "loss": 0.7958, + "mean_token_accuracy": 0.8104684352874756, + "step": 154 + }, + { + "epoch": 0.0775, + "grad_norm": 7.3145481359560955, + "learning_rate": 3.8750000000000005e-07, + "loss": 0.8083, + "mean_token_accuracy": 0.7884812951087952, + "step": 155 + }, + { + "epoch": 0.078, + "grad_norm": 7.623471802879489, + "learning_rate": 3.9e-07, + "loss": 0.7518, + "mean_token_accuracy": 0.8028879761695862, + "step": 156 + }, + { + "epoch": 0.0785, + "grad_norm": 6.861699982855856, + "learning_rate": 3.9250000000000005e-07, + "loss": 0.6974, + "mean_token_accuracy": 0.8153296113014221, + "step": 157 + }, + { + "epoch": 0.079, + "grad_norm": 22.037853228916227, + "learning_rate": 3.9500000000000003e-07, + "loss": 0.7144, + "mean_token_accuracy": 0.8166239857673645, + "step": 158 + }, + { + "epoch": 0.0795, + "grad_norm": 7.318575330506741, + "learning_rate": 3.9750000000000006e-07, + "loss": 0.8487, + "mean_token_accuracy": 0.7733063697814941, + "step": 159 + }, + { + "epoch": 0.08, + "grad_norm": 24.68119606309031, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.6717, + "mean_token_accuracy": 0.8283261656761169, + "step": 160 + }, + { + "epoch": 0.0805, + "grad_norm": 9.032093639447282, + "learning_rate": 4.0250000000000006e-07, + "loss": 0.7397, + "mean_token_accuracy": 0.8164443373680115, + "step": 161 + }, + { + "epoch": 0.081, + "grad_norm": 6.620725800047389, + "learning_rate": 4.0500000000000004e-07, + "loss": 0.6621, + "mean_token_accuracy": 0.8188380002975464, + "step": 162 + }, + { + "epoch": 0.0815, + "grad_norm": 26.814382593607633, + "learning_rate": 4.0750000000000007e-07, + "loss": 0.9634, + "mean_token_accuracy": 0.7846559882164001, + "step": 163 + }, + { + "epoch": 0.082, + "grad_norm": 4.917705690777649, + "learning_rate": 4.1000000000000004e-07, + "loss": 0.7026, + "mean_token_accuracy": 0.812732994556427, + "step": 164 + }, + { + "epoch": 0.0825, + "grad_norm": 3.6488658757809023, + "learning_rate": 4.125000000000001e-07, + "loss": 0.6842, + "mean_token_accuracy": 0.8093895316123962, + "step": 165 + }, + { + "epoch": 0.083, + "grad_norm": 7.849604292408337, + "learning_rate": 4.1500000000000005e-07, + "loss": 0.6683, + "mean_token_accuracy": 0.8214733600616455, + "step": 166 + }, + { + "epoch": 0.0835, + "grad_norm": 7.497498146045543, + "learning_rate": 4.175000000000001e-07, + "loss": 0.6988, + "mean_token_accuracy": 0.8225500583648682, + "step": 167 + }, + { + "epoch": 0.084, + "grad_norm": 4.284925855460229, + "learning_rate": 4.2000000000000006e-07, + "loss": 0.6648, + "mean_token_accuracy": 0.8096798062324524, + "step": 168 + }, + { + "epoch": 0.0845, + "grad_norm": 5.087161370872455, + "learning_rate": 4.225000000000001e-07, + "loss": 0.5372, + "mean_token_accuracy": 0.8480150103569031, + "step": 169 + }, + { + "epoch": 0.085, + "grad_norm": 4.348568745758091, + "learning_rate": 4.2500000000000006e-07, + "loss": 0.5415, + "mean_token_accuracy": 0.8439901471138, + "step": 170 + }, + { + "epoch": 0.0855, + "grad_norm": 6.81191615401275, + "learning_rate": 4.275000000000001e-07, + "loss": 0.6841, + "mean_token_accuracy": 0.807784914970398, + "step": 171 + }, + { + "epoch": 0.086, + "grad_norm": 3.983793562986111, + "learning_rate": 4.3e-07, + "loss": 0.6399, + "mean_token_accuracy": 0.8268763422966003, + "step": 172 + }, + { + "epoch": 0.0865, + "grad_norm": 4.2289487899997695, + "learning_rate": 4.325e-07, + "loss": 0.6236, + "mean_token_accuracy": 0.826316773891449, + "step": 173 + }, + { + "epoch": 0.087, + "grad_norm": 7.377382624203844, + "learning_rate": 4.35e-07, + "loss": 0.4933, + "mean_token_accuracy": 0.857876718044281, + "step": 174 + }, + { + "epoch": 0.0875, + "grad_norm": 3.470280957572885, + "learning_rate": 4.375e-07, + "loss": 0.5276, + "mean_token_accuracy": 0.8412405848503113, + "step": 175 + }, + { + "epoch": 0.088, + "grad_norm": 6.251661560581956, + "learning_rate": 4.4e-07, + "loss": 0.7566, + "mean_token_accuracy": 0.8175948262214661, + "step": 176 + }, + { + "epoch": 0.0885, + "grad_norm": 5.27106326245366, + "learning_rate": 4.425e-07, + "loss": 0.5132, + "mean_token_accuracy": 0.8507302403450012, + "step": 177 + }, + { + "epoch": 0.089, + "grad_norm": 5.904331616909356, + "learning_rate": 4.4500000000000003e-07, + "loss": 0.6359, + "mean_token_accuracy": 0.8248772621154785, + "step": 178 + }, + { + "epoch": 0.0895, + "grad_norm": 8.376112445211021, + "learning_rate": 4.475e-07, + "loss": 0.5495, + "mean_token_accuracy": 0.8387997150421143, + "step": 179 + }, + { + "epoch": 0.09, + "grad_norm": 9.25471556529717, + "learning_rate": 4.5000000000000003e-07, + "loss": 0.5405, + "mean_token_accuracy": 0.8459620475769043, + "step": 180 + }, + { + "epoch": 0.0905, + "grad_norm": 4.923315619953546, + "learning_rate": 4.525e-07, + "loss": 0.661, + "mean_token_accuracy": 0.8112226128578186, + "step": 181 + }, + { + "epoch": 0.091, + "grad_norm": 4.729157817747984, + "learning_rate": 4.5500000000000004e-07, + "loss": 0.5952, + "mean_token_accuracy": 0.8264179825782776, + "step": 182 + }, + { + "epoch": 0.0915, + "grad_norm": 4.709767668684372, + "learning_rate": 4.575e-07, + "loss": 0.4853, + "mean_token_accuracy": 0.8641581535339355, + "step": 183 + }, + { + "epoch": 0.092, + "grad_norm": 4.057411034336886, + "learning_rate": 4.6000000000000004e-07, + "loss": 0.6245, + "mean_token_accuracy": 0.8166694045066833, + "step": 184 + }, + { + "epoch": 0.0925, + "grad_norm": 3.380084941972598, + "learning_rate": 4.625e-07, + "loss": 0.5414, + "mean_token_accuracy": 0.8441869616508484, + "step": 185 + }, + { + "epoch": 0.093, + "grad_norm": 3.592219361944554, + "learning_rate": 4.6500000000000005e-07, + "loss": 0.4604, + "mean_token_accuracy": 0.8661704063415527, + "step": 186 + }, + { + "epoch": 0.0935, + "grad_norm": 12.310308733690627, + "learning_rate": 4.675e-07, + "loss": 0.4247, + "mean_token_accuracy": 0.8684611320495605, + "step": 187 + }, + { + "epoch": 0.094, + "grad_norm": 3.1845776954308556, + "learning_rate": 4.7000000000000005e-07, + "loss": 0.5136, + "mean_token_accuracy": 0.8507369756698608, + "step": 188 + }, + { + "epoch": 0.0945, + "grad_norm": 4.823312782989296, + "learning_rate": 4.7250000000000003e-07, + "loss": 0.5656, + "mean_token_accuracy": 0.8396063446998596, + "step": 189 + }, + { + "epoch": 0.095, + "grad_norm": 4.074839741825718, + "learning_rate": 4.7500000000000006e-07, + "loss": 0.5891, + "mean_token_accuracy": 0.8361931443214417, + "step": 190 + }, + { + "epoch": 0.0955, + "grad_norm": 5.447814725888482, + "learning_rate": 4.775000000000001e-07, + "loss": 0.5262, + "mean_token_accuracy": 0.8489331007003784, + "step": 191 + }, + { + "epoch": 0.096, + "grad_norm": 4.166828571629537, + "learning_rate": 4.800000000000001e-07, + "loss": 0.541, + "mean_token_accuracy": 0.845575213432312, + "step": 192 + }, + { + "epoch": 0.0965, + "grad_norm": 4.177309284949197, + "learning_rate": 4.825e-07, + "loss": 0.5803, + "mean_token_accuracy": 0.8321821093559265, + "step": 193 + }, + { + "epoch": 0.097, + "grad_norm": 4.331275009114961, + "learning_rate": 4.85e-07, + "loss": 0.5875, + "mean_token_accuracy": 0.8351383805274963, + "step": 194 + }, + { + "epoch": 0.0975, + "grad_norm": 3.618614651525313, + "learning_rate": 4.875000000000001e-07, + "loss": 0.4943, + "mean_token_accuracy": 0.8513711094856262, + "step": 195 + }, + { + "epoch": 0.098, + "grad_norm": 3.803219696171734, + "learning_rate": 4.900000000000001e-07, + "loss": 0.5699, + "mean_token_accuracy": 0.8392122983932495, + "step": 196 + }, + { + "epoch": 0.0985, + "grad_norm": 3.5966234261257877, + "learning_rate": 4.925e-07, + "loss": 0.5104, + "mean_token_accuracy": 0.8470369577407837, + "step": 197 + }, + { + "epoch": 0.099, + "grad_norm": 4.574641484158082, + "learning_rate": 4.95e-07, + "loss": 0.4782, + "mean_token_accuracy": 0.8580899238586426, + "step": 198 + }, + { + "epoch": 0.0995, + "grad_norm": 6.943185750343928, + "learning_rate": 4.975000000000001e-07, + "loss": 0.6514, + "mean_token_accuracy": 0.8255584836006165, + "step": 199 + }, + { + "epoch": 0.1, + "grad_norm": 4.396263486689462, + "learning_rate": 5.000000000000001e-07, + "loss": 0.4415, + "mean_token_accuracy": 0.868842363357544, + "step": 200 + }, + { + "epoch": 0.1005, + "grad_norm": 5.927780811585465, + "learning_rate": 5.025000000000001e-07, + "loss": 0.4238, + "mean_token_accuracy": 0.8719876408576965, + "step": 201 + }, + { + "epoch": 0.101, + "grad_norm": 5.431924797443824, + "learning_rate": 5.05e-07, + "loss": 0.6764, + "mean_token_accuracy": 0.8057921528816223, + "step": 202 + }, + { + "epoch": 0.1015, + "grad_norm": 4.62398387024987, + "learning_rate": 5.075000000000001e-07, + "loss": 0.5459, + "mean_token_accuracy": 0.8472557663917542, + "step": 203 + }, + { + "epoch": 0.102, + "grad_norm": 5.822585941560787, + "learning_rate": 5.1e-07, + "loss": 0.6071, + "mean_token_accuracy": 0.8233404755592346, + "step": 204 + }, + { + "epoch": 0.1025, + "grad_norm": 14.30090057055125, + "learning_rate": 5.125e-07, + "loss": 0.5857, + "mean_token_accuracy": 0.8421052694320679, + "step": 205 + }, + { + "epoch": 0.103, + "grad_norm": 2.8739757745102326, + "learning_rate": 5.15e-07, + "loss": 0.5183, + "mean_token_accuracy": 0.8446069359779358, + "step": 206 + }, + { + "epoch": 0.1035, + "grad_norm": 3.5006300030259183, + "learning_rate": 5.175e-07, + "loss": 0.5239, + "mean_token_accuracy": 0.852412760257721, + "step": 207 + }, + { + "epoch": 0.104, + "grad_norm": 3.3980121980271876, + "learning_rate": 5.2e-07, + "loss": 0.505, + "mean_token_accuracy": 0.8661903738975525, + "step": 208 + }, + { + "epoch": 0.1045, + "grad_norm": 4.01096118655403, + "learning_rate": 5.225e-07, + "loss": 0.6194, + "mean_token_accuracy": 0.8302261829376221, + "step": 209 + }, + { + "epoch": 0.105, + "grad_norm": 4.060940019985344, + "learning_rate": 5.250000000000001e-07, + "loss": 0.4964, + "mean_token_accuracy": 0.8557397127151489, + "step": 210 + }, + { + "epoch": 0.1055, + "grad_norm": 3.6309783179877826, + "learning_rate": 5.275e-07, + "loss": 0.6105, + "mean_token_accuracy": 0.8289773464202881, + "step": 211 + }, + { + "epoch": 0.106, + "grad_norm": 4.107457259001183, + "learning_rate": 5.3e-07, + "loss": 0.5301, + "mean_token_accuracy": 0.852389395236969, + "step": 212 + }, + { + "epoch": 0.1065, + "grad_norm": 2.7504231776787793, + "learning_rate": 5.325e-07, + "loss": 0.3487, + "mean_token_accuracy": 0.8911246657371521, + "step": 213 + }, + { + "epoch": 0.107, + "grad_norm": 6.38914820995502, + "learning_rate": 5.350000000000001e-07, + "loss": 0.569, + "mean_token_accuracy": 0.8477979302406311, + "step": 214 + }, + { + "epoch": 0.1075, + "grad_norm": 5.491681976690247, + "learning_rate": 5.375e-07, + "loss": 0.4654, + "mean_token_accuracy": 0.8613930344581604, + "step": 215 + }, + { + "epoch": 0.108, + "grad_norm": 3.04252534459608, + "learning_rate": 5.4e-07, + "loss": 0.547, + "mean_token_accuracy": 0.8429275155067444, + "step": 216 + }, + { + "epoch": 0.1085, + "grad_norm": 5.080579632183264, + "learning_rate": 5.425e-07, + "loss": 0.7113, + "mean_token_accuracy": 0.8004699349403381, + "step": 217 + }, + { + "epoch": 0.109, + "grad_norm": 3.4313485006948454, + "learning_rate": 5.450000000000001e-07, + "loss": 0.316, + "mean_token_accuracy": 0.8931225538253784, + "step": 218 + }, + { + "epoch": 0.1095, + "grad_norm": 4.455081747884277, + "learning_rate": 5.475e-07, + "loss": 0.8281, + "mean_token_accuracy": 0.796342134475708, + "step": 219 + }, + { + "epoch": 0.11, + "grad_norm": 3.6903863927351304, + "learning_rate": 5.5e-07, + "loss": 0.5189, + "mean_token_accuracy": 0.8523291945457458, + "step": 220 + }, + { + "epoch": 0.1105, + "grad_norm": 4.486567311467059, + "learning_rate": 5.525e-07, + "loss": 0.4533, + "mean_token_accuracy": 0.859686017036438, + "step": 221 + }, + { + "epoch": 0.111, + "grad_norm": 3.6113746718810162, + "learning_rate": 5.550000000000001e-07, + "loss": 0.5486, + "mean_token_accuracy": 0.8486055731773376, + "step": 222 + }, + { + "epoch": 0.1115, + "grad_norm": 10.251622550483924, + "learning_rate": 5.575000000000001e-07, + "loss": 0.4357, + "mean_token_accuracy": 0.8708924055099487, + "step": 223 + }, + { + "epoch": 0.112, + "grad_norm": 3.506010813288032, + "learning_rate": 5.6e-07, + "loss": 0.6587, + "mean_token_accuracy": 0.8165871500968933, + "step": 224 + }, + { + "epoch": 0.1125, + "grad_norm": 3.0339639912913037, + "learning_rate": 5.625e-07, + "loss": 0.4619, + "mean_token_accuracy": 0.8397407531738281, + "step": 225 + }, + { + "epoch": 0.113, + "grad_norm": 4.706985051809806, + "learning_rate": 5.650000000000001e-07, + "loss": 0.444, + "mean_token_accuracy": 0.8715541362762451, + "step": 226 + }, + { + "epoch": 0.1135, + "grad_norm": 4.478792384388779, + "learning_rate": 5.675000000000001e-07, + "loss": 0.4226, + "mean_token_accuracy": 0.8612552285194397, + "step": 227 + }, + { + "epoch": 0.114, + "grad_norm": 4.121952602836529, + "learning_rate": 5.7e-07, + "loss": 0.3785, + "mean_token_accuracy": 0.8844577074050903, + "step": 228 + }, + { + "epoch": 0.1145, + "grad_norm": 2.6288708417091784, + "learning_rate": 5.725e-07, + "loss": 0.3511, + "mean_token_accuracy": 0.8921183943748474, + "step": 229 + }, + { + "epoch": 0.115, + "grad_norm": 3.2439892339191183, + "learning_rate": 5.750000000000001e-07, + "loss": 0.5339, + "mean_token_accuracy": 0.8403614163398743, + "step": 230 + }, + { + "epoch": 0.1155, + "grad_norm": 16.44340862419775, + "learning_rate": 5.775000000000001e-07, + "loss": 0.338, + "mean_token_accuracy": 0.894345760345459, + "step": 231 + }, + { + "epoch": 0.116, + "grad_norm": 3.4007128603898704, + "learning_rate": 5.800000000000001e-07, + "loss": 0.4123, + "mean_token_accuracy": 0.8859551548957825, + "step": 232 + }, + { + "epoch": 0.1165, + "grad_norm": 3.8106351331107837, + "learning_rate": 5.825e-07, + "loss": 0.4297, + "mean_token_accuracy": 0.8699356317520142, + "step": 233 + }, + { + "epoch": 0.117, + "grad_norm": 3.570983039604431, + "learning_rate": 5.850000000000001e-07, + "loss": 0.6126, + "mean_token_accuracy": 0.8359014987945557, + "step": 234 + }, + { + "epoch": 0.1175, + "grad_norm": 2.439160485770999, + "learning_rate": 5.875e-07, + "loss": 0.2759, + "mean_token_accuracy": 0.906438410282135, + "step": 235 + }, + { + "epoch": 0.118, + "grad_norm": 3.7545916010233054, + "learning_rate": 5.900000000000001e-07, + "loss": 0.4111, + "mean_token_accuracy": 0.8726483583450317, + "step": 236 + }, + { + "epoch": 0.1185, + "grad_norm": 4.379054470727371, + "learning_rate": 5.925e-07, + "loss": 0.4706, + "mean_token_accuracy": 0.8616225123405457, + "step": 237 + }, + { + "epoch": 0.119, + "grad_norm": 3.4038926081431953, + "learning_rate": 5.95e-07, + "loss": 0.5283, + "mean_token_accuracy": 0.8428614139556885, + "step": 238 + }, + { + "epoch": 0.1195, + "grad_norm": 3.5501779700872267, + "learning_rate": 5.975e-07, + "loss": 0.3988, + "mean_token_accuracy": 0.8801015615463257, + "step": 239 + }, + { + "epoch": 0.12, + "grad_norm": 3.4865281168985454, + "learning_rate": 6.000000000000001e-07, + "loss": 0.5883, + "mean_token_accuracy": 0.8334001898765564, + "step": 240 + }, + { + "epoch": 0.1205, + "grad_norm": 12.15671959305347, + "learning_rate": 6.025000000000001e-07, + "loss": 0.4447, + "mean_token_accuracy": 0.8672566413879395, + "step": 241 + }, + { + "epoch": 0.121, + "grad_norm": 7.630098963443698, + "learning_rate": 6.05e-07, + "loss": 0.4998, + "mean_token_accuracy": 0.8519627451896667, + "step": 242 + }, + { + "epoch": 0.1215, + "grad_norm": 2.4430197566348957, + "learning_rate": 6.075e-07, + "loss": 0.4574, + "mean_token_accuracy": 0.8626946806907654, + "step": 243 + }, + { + "epoch": 0.122, + "grad_norm": 5.224423142284969, + "learning_rate": 6.100000000000001e-07, + "loss": 0.431, + "mean_token_accuracy": 0.8743402361869812, + "step": 244 + }, + { + "epoch": 0.1225, + "grad_norm": 2.934221266052305, + "learning_rate": 6.125000000000001e-07, + "loss": 0.5053, + "mean_token_accuracy": 0.8594306111335754, + "step": 245 + }, + { + "epoch": 0.123, + "grad_norm": 3.834961098943068, + "learning_rate": 6.15e-07, + "loss": 0.4692, + "mean_token_accuracy": 0.8509916663169861, + "step": 246 + }, + { + "epoch": 0.1235, + "grad_norm": 3.8239470754382503, + "learning_rate": 6.175e-07, + "loss": 0.5965, + "mean_token_accuracy": 0.8199611306190491, + "step": 247 + }, + { + "epoch": 0.124, + "grad_norm": 5.04208009109431, + "learning_rate": 6.200000000000001e-07, + "loss": 0.4364, + "mean_token_accuracy": 0.8730769157409668, + "step": 248 + }, + { + "epoch": 0.1245, + "grad_norm": 3.759301928548334, + "learning_rate": 6.225000000000001e-07, + "loss": 0.4761, + "mean_token_accuracy": 0.8643551468849182, + "step": 249 + }, + { + "epoch": 0.125, + "grad_norm": 2.5732757332735376, + "learning_rate": 6.25e-07, + "loss": 0.415, + "mean_token_accuracy": 0.866170346736908, + "step": 250 + }, + { + "epoch": 0.1255, + "grad_norm": 3.373584690393273, + "learning_rate": 6.275e-07, + "loss": 0.4917, + "mean_token_accuracy": 0.8439793586730957, + "step": 251 + }, + { + "epoch": 0.126, + "grad_norm": 3.4979156592927807, + "learning_rate": 6.3e-07, + "loss": 0.4545, + "mean_token_accuracy": 0.86359703540802, + "step": 252 + }, + { + "epoch": 0.1265, + "grad_norm": 3.0385556967649023, + "learning_rate": 6.325000000000001e-07, + "loss": 0.6817, + "mean_token_accuracy": 0.8060628175735474, + "step": 253 + }, + { + "epoch": 0.127, + "grad_norm": 19.202001281580205, + "learning_rate": 6.350000000000001e-07, + "loss": 0.461, + "mean_token_accuracy": 0.8665376901626587, + "step": 254 + }, + { + "epoch": 0.1275, + "grad_norm": 6.163162208495731, + "learning_rate": 6.375e-07, + "loss": 0.2837, + "mean_token_accuracy": 0.902049720287323, + "step": 255 + }, + { + "epoch": 0.128, + "grad_norm": 3.128114949408478, + "learning_rate": 6.4e-07, + "loss": 0.497, + "mean_token_accuracy": 0.8603525161743164, + "step": 256 + }, + { + "epoch": 0.1285, + "grad_norm": 3.3792984752463093, + "learning_rate": 6.425000000000001e-07, + "loss": 0.5352, + "mean_token_accuracy": 0.8399932980537415, + "step": 257 + }, + { + "epoch": 0.129, + "grad_norm": 2.2883619734621163, + "learning_rate": 6.450000000000001e-07, + "loss": 0.2582, + "mean_token_accuracy": 0.9143004417419434, + "step": 258 + }, + { + "epoch": 0.1295, + "grad_norm": 9.023649993772796, + "learning_rate": 6.475e-07, + "loss": 0.5681, + "mean_token_accuracy": 0.8394970297813416, + "step": 259 + }, + { + "epoch": 0.13, + "grad_norm": 3.8593600229035627, + "learning_rate": 6.5e-07, + "loss": 0.4871, + "mean_token_accuracy": 0.8581374883651733, + "step": 260 + }, + { + "epoch": 0.1305, + "grad_norm": 4.336840877481701, + "learning_rate": 6.525000000000001e-07, + "loss": 0.4475, + "mean_token_accuracy": 0.8677893877029419, + "step": 261 + }, + { + "epoch": 0.131, + "grad_norm": 2.63020578962031, + "learning_rate": 6.550000000000001e-07, + "loss": 0.5286, + "mean_token_accuracy": 0.839936375617981, + "step": 262 + }, + { + "epoch": 0.1315, + "grad_norm": 3.53004707926649, + "learning_rate": 6.575000000000001e-07, + "loss": 0.3897, + "mean_token_accuracy": 0.8748196363449097, + "step": 263 + }, + { + "epoch": 0.132, + "grad_norm": 2.494417285126672, + "learning_rate": 6.6e-07, + "loss": 0.4057, + "mean_token_accuracy": 0.8728576302528381, + "step": 264 + }, + { + "epoch": 0.1325, + "grad_norm": 3.4468663913201345, + "learning_rate": 6.625000000000001e-07, + "loss": 0.342, + "mean_token_accuracy": 0.8875536322593689, + "step": 265 + }, + { + "epoch": 0.133, + "grad_norm": 2.4377604101930688, + "learning_rate": 6.650000000000001e-07, + "loss": 0.4111, + "mean_token_accuracy": 0.8655756711959839, + "step": 266 + }, + { + "epoch": 0.1335, + "grad_norm": 3.772764979735233, + "learning_rate": 6.675000000000001e-07, + "loss": 0.3664, + "mean_token_accuracy": 0.8904603719711304, + "step": 267 + }, + { + "epoch": 0.134, + "grad_norm": 4.3228508261391845, + "learning_rate": 6.7e-07, + "loss": 0.5197, + "mean_token_accuracy": 0.8432098627090454, + "step": 268 + }, + { + "epoch": 0.1345, + "grad_norm": 2.5321603230609213, + "learning_rate": 6.725000000000001e-07, + "loss": 0.5093, + "mean_token_accuracy": 0.8461917042732239, + "step": 269 + }, + { + "epoch": 0.135, + "grad_norm": 2.5462391883495967, + "learning_rate": 6.750000000000001e-07, + "loss": 0.3992, + "mean_token_accuracy": 0.8764221668243408, + "step": 270 + }, + { + "epoch": 0.1355, + "grad_norm": 5.527577986708619, + "learning_rate": 6.775000000000001e-07, + "loss": 0.5343, + "mean_token_accuracy": 0.8374419212341309, + "step": 271 + }, + { + "epoch": 0.136, + "grad_norm": 5.20817704586823, + "learning_rate": 6.800000000000001e-07, + "loss": 0.5309, + "mean_token_accuracy": 0.8441677093505859, + "step": 272 + }, + { + "epoch": 0.1365, + "grad_norm": 3.5201593909288538, + "learning_rate": 6.825000000000001e-07, + "loss": 0.493, + "mean_token_accuracy": 0.845506489276886, + "step": 273 + }, + { + "epoch": 0.137, + "grad_norm": 5.39586279224165, + "learning_rate": 6.850000000000001e-07, + "loss": 0.5395, + "mean_token_accuracy": 0.8464308977127075, + "step": 274 + }, + { + "epoch": 0.1375, + "grad_norm": 2.3071589354151367, + "learning_rate": 6.875000000000001e-07, + "loss": 0.3515, + "mean_token_accuracy": 0.89152592420578, + "step": 275 + }, + { + "epoch": 0.138, + "grad_norm": 11.517469825904445, + "learning_rate": 6.900000000000001e-07, + "loss": 0.4549, + "mean_token_accuracy": 0.8792135119438171, + "step": 276 + }, + { + "epoch": 0.1385, + "grad_norm": 4.852715936866524, + "learning_rate": 6.925000000000001e-07, + "loss": 0.578, + "mean_token_accuracy": 0.8251568078994751, + "step": 277 + }, + { + "epoch": 0.139, + "grad_norm": 5.415498060713951, + "learning_rate": 6.950000000000001e-07, + "loss": 0.4899, + "mean_token_accuracy": 0.855933666229248, + "step": 278 + }, + { + "epoch": 0.1395, + "grad_norm": 2.6928384330434416, + "learning_rate": 6.975000000000001e-07, + "loss": 0.4903, + "mean_token_accuracy": 0.8505903482437134, + "step": 279 + }, + { + "epoch": 0.14, + "grad_norm": 2.287885064994372, + "learning_rate": 7.000000000000001e-07, + "loss": 0.4149, + "mean_token_accuracy": 0.8662109375, + "step": 280 + }, + { + "epoch": 0.1405, + "grad_norm": 2.6282961878900273, + "learning_rate": 7.025000000000002e-07, + "loss": 0.4795, + "mean_token_accuracy": 0.8556131720542908, + "step": 281 + }, + { + "epoch": 0.141, + "grad_norm": 2.7481631621777085, + "learning_rate": 7.05e-07, + "loss": 0.4462, + "mean_token_accuracy": 0.8605436682701111, + "step": 282 + }, + { + "epoch": 0.1415, + "grad_norm": 3.1606145854255283, + "learning_rate": 7.075e-07, + "loss": 0.4837, + "mean_token_accuracy": 0.855025053024292, + "step": 283 + }, + { + "epoch": 0.142, + "grad_norm": 3.641613929598986, + "learning_rate": 7.1e-07, + "loss": 0.6441, + "mean_token_accuracy": 0.7965736389160156, + "step": 284 + }, + { + "epoch": 0.1425, + "grad_norm": 3.445941180745205, + "learning_rate": 7.125e-07, + "loss": 0.4667, + "mean_token_accuracy": 0.8607901334762573, + "step": 285 + }, + { + "epoch": 0.143, + "grad_norm": 2.9820014373037544, + "learning_rate": 7.15e-07, + "loss": 0.4171, + "mean_token_accuracy": 0.8668538331985474, + "step": 286 + }, + { + "epoch": 0.1435, + "grad_norm": 3.241350722266683, + "learning_rate": 7.175e-07, + "loss": 0.5899, + "mean_token_accuracy": 0.8341099619865417, + "step": 287 + }, + { + "epoch": 0.144, + "grad_norm": 3.4065401006293956, + "learning_rate": 7.2e-07, + "loss": 0.384, + "mean_token_accuracy": 0.8907978534698486, + "step": 288 + }, + { + "epoch": 0.1445, + "grad_norm": 2.4158454368293842, + "learning_rate": 7.225e-07, + "loss": 0.3585, + "mean_token_accuracy": 0.8899676203727722, + "step": 289 + }, + { + "epoch": 0.145, + "grad_norm": 3.0667596115804963, + "learning_rate": 7.25e-07, + "loss": 0.4214, + "mean_token_accuracy": 0.8797519207000732, + "step": 290 + }, + { + "epoch": 0.1455, + "grad_norm": 3.6504394569347047, + "learning_rate": 7.275e-07, + "loss": 0.4145, + "mean_token_accuracy": 0.8821941018104553, + "step": 291 + }, + { + "epoch": 0.146, + "grad_norm": 2.8083595474648915, + "learning_rate": 7.3e-07, + "loss": 0.4171, + "mean_token_accuracy": 0.8617386221885681, + "step": 292 + }, + { + "epoch": 0.1465, + "grad_norm": 2.3600042248802313, + "learning_rate": 7.325e-07, + "loss": 0.4309, + "mean_token_accuracy": 0.8581507205963135, + "step": 293 + }, + { + "epoch": 0.147, + "grad_norm": 3.4090564553997424, + "learning_rate": 7.350000000000001e-07, + "loss": 0.4659, + "mean_token_accuracy": 0.8671252727508545, + "step": 294 + }, + { + "epoch": 0.1475, + "grad_norm": 3.8399329246518605, + "learning_rate": 7.375e-07, + "loss": 0.4395, + "mean_token_accuracy": 0.8681267499923706, + "step": 295 + }, + { + "epoch": 0.148, + "grad_norm": 4.848343437899493, + "learning_rate": 7.4e-07, + "loss": 0.3869, + "mean_token_accuracy": 0.8818274736404419, + "step": 296 + }, + { + "epoch": 0.1485, + "grad_norm": 2.313304986461409, + "learning_rate": 7.425e-07, + "loss": 0.3736, + "mean_token_accuracy": 0.882244348526001, + "step": 297 + }, + { + "epoch": 0.149, + "grad_norm": 3.8410921457140614, + "learning_rate": 7.450000000000001e-07, + "loss": 0.3992, + "mean_token_accuracy": 0.8787592053413391, + "step": 298 + }, + { + "epoch": 0.1495, + "grad_norm": 3.0238451094516963, + "learning_rate": 7.475e-07, + "loss": 0.4273, + "mean_token_accuracy": 0.8681870698928833, + "step": 299 + }, + { + "epoch": 0.15, + "grad_norm": 2.531412887220846, + "learning_rate": 7.5e-07, + "loss": 0.4067, + "mean_token_accuracy": 0.8687325716018677, + "step": 300 + }, + { + "epoch": 0.1505, + "grad_norm": 2.393487969449163, + "learning_rate": 7.525e-07, + "loss": 0.4449, + "mean_token_accuracy": 0.8600386381149292, + "step": 301 + }, + { + "epoch": 0.151, + "grad_norm": 3.402312883561383, + "learning_rate": 7.550000000000001e-07, + "loss": 0.4878, + "mean_token_accuracy": 0.8546742796897888, + "step": 302 + }, + { + "epoch": 0.1515, + "grad_norm": 2.938080037831026, + "learning_rate": 7.575000000000001e-07, + "loss": 0.6391, + "mean_token_accuracy": 0.8267379403114319, + "step": 303 + }, + { + "epoch": 0.152, + "grad_norm": 3.6962530491286953, + "learning_rate": 7.6e-07, + "loss": 0.3532, + "mean_token_accuracy": 0.8923670649528503, + "step": 304 + }, + { + "epoch": 0.1525, + "grad_norm": 4.764390802505974, + "learning_rate": 7.625e-07, + "loss": 0.3782, + "mean_token_accuracy": 0.8852680325508118, + "step": 305 + }, + { + "epoch": 0.153, + "grad_norm": 2.8275006944454937, + "learning_rate": 7.650000000000001e-07, + "loss": 0.3999, + "mean_token_accuracy": 0.8798516988754272, + "step": 306 + }, + { + "epoch": 0.1535, + "grad_norm": 2.9753461352344632, + "learning_rate": 7.675000000000001e-07, + "loss": 0.4887, + "mean_token_accuracy": 0.8596702814102173, + "step": 307 + }, + { + "epoch": 0.154, + "grad_norm": 4.756581681428941, + "learning_rate": 7.7e-07, + "loss": 0.4278, + "mean_token_accuracy": 0.8733031749725342, + "step": 308 + }, + { + "epoch": 0.1545, + "grad_norm": 3.1252850951061255, + "learning_rate": 7.725e-07, + "loss": 0.4906, + "mean_token_accuracy": 0.8614618182182312, + "step": 309 + }, + { + "epoch": 0.155, + "grad_norm": 3.290688818062431, + "learning_rate": 7.750000000000001e-07, + "loss": 0.5152, + "mean_token_accuracy": 0.8503433465957642, + "step": 310 + }, + { + "epoch": 0.1555, + "grad_norm": 2.13997057965177, + "learning_rate": 7.775000000000001e-07, + "loss": 0.3332, + "mean_token_accuracy": 0.8933380246162415, + "step": 311 + }, + { + "epoch": 0.156, + "grad_norm": 2.9869738593288995, + "learning_rate": 7.8e-07, + "loss": 0.5395, + "mean_token_accuracy": 0.8403740525245667, + "step": 312 + }, + { + "epoch": 0.1565, + "grad_norm": 7.89207379617612, + "learning_rate": 7.825e-07, + "loss": 0.4473, + "mean_token_accuracy": 0.8695202469825745, + "step": 313 + }, + { + "epoch": 0.157, + "grad_norm": 3.862685692878103, + "learning_rate": 7.850000000000001e-07, + "loss": 0.4005, + "mean_token_accuracy": 0.8835355639457703, + "step": 314 + }, + { + "epoch": 0.1575, + "grad_norm": 3.3984070751079787, + "learning_rate": 7.875000000000001e-07, + "loss": 0.5623, + "mean_token_accuracy": 0.8412303924560547, + "step": 315 + }, + { + "epoch": 0.158, + "grad_norm": 7.150466541442853, + "learning_rate": 7.900000000000001e-07, + "loss": 0.4086, + "mean_token_accuracy": 0.8843233585357666, + "step": 316 + }, + { + "epoch": 0.1585, + "grad_norm": 3.3150746218561946, + "learning_rate": 7.925e-07, + "loss": 0.3628, + "mean_token_accuracy": 0.8857194185256958, + "step": 317 + }, + { + "epoch": 0.159, + "grad_norm": 2.4489059531539863, + "learning_rate": 7.950000000000001e-07, + "loss": 0.5386, + "mean_token_accuracy": 0.8381162881851196, + "step": 318 + }, + { + "epoch": 0.1595, + "grad_norm": 3.3115559227210443, + "learning_rate": 7.975000000000001e-07, + "loss": 0.411, + "mean_token_accuracy": 0.8812017440795898, + "step": 319 + }, + { + "epoch": 0.16, + "grad_norm": 5.356576280300871, + "learning_rate": 8.000000000000001e-07, + "loss": 0.3749, + "mean_token_accuracy": 0.8823529481887817, + "step": 320 + }, + { + "epoch": 0.1605, + "grad_norm": 2.65888202561482, + "learning_rate": 8.025e-07, + "loss": 0.4154, + "mean_token_accuracy": 0.8724673986434937, + "step": 321 + }, + { + "epoch": 0.161, + "grad_norm": 2.505376474598052, + "learning_rate": 8.050000000000001e-07, + "loss": 0.4828, + "mean_token_accuracy": 0.862577497959137, + "step": 322 + }, + { + "epoch": 0.1615, + "grad_norm": 2.5493878811456425, + "learning_rate": 8.075000000000001e-07, + "loss": 0.2936, + "mean_token_accuracy": 0.9036591649055481, + "step": 323 + }, + { + "epoch": 0.162, + "grad_norm": 4.860629925402789, + "learning_rate": 8.100000000000001e-07, + "loss": 0.4104, + "mean_token_accuracy": 0.8743394017219543, + "step": 324 + }, + { + "epoch": 0.1625, + "grad_norm": 6.8833992089683464, + "learning_rate": 8.125000000000001e-07, + "loss": 0.3691, + "mean_token_accuracy": 0.8859046697616577, + "step": 325 + }, + { + "epoch": 0.163, + "grad_norm": 3.8849265219929547, + "learning_rate": 8.150000000000001e-07, + "loss": 0.3938, + "mean_token_accuracy": 0.8720014095306396, + "step": 326 + }, + { + "epoch": 0.1635, + "grad_norm": 4.577171476996754, + "learning_rate": 8.175000000000001e-07, + "loss": 0.4983, + "mean_token_accuracy": 0.8371473550796509, + "step": 327 + }, + { + "epoch": 0.164, + "grad_norm": 7.577788246578263, + "learning_rate": 8.200000000000001e-07, + "loss": 0.4333, + "mean_token_accuracy": 0.8680663704872131, + "step": 328 + }, + { + "epoch": 0.1645, + "grad_norm": 2.4369973785358177, + "learning_rate": 8.225000000000001e-07, + "loss": 0.552, + "mean_token_accuracy": 0.8325157761573792, + "step": 329 + }, + { + "epoch": 0.165, + "grad_norm": 3.3376136947611754, + "learning_rate": 8.250000000000001e-07, + "loss": 0.4349, + "mean_token_accuracy": 0.8645793795585632, + "step": 330 + }, + { + "epoch": 0.1655, + "grad_norm": 2.805771660151881, + "learning_rate": 8.275000000000001e-07, + "loss": 0.4188, + "mean_token_accuracy": 0.8619465827941895, + "step": 331 + }, + { + "epoch": 0.166, + "grad_norm": 9.653143367675174, + "learning_rate": 8.300000000000001e-07, + "loss": 0.4596, + "mean_token_accuracy": 0.8530179858207703, + "step": 332 + }, + { + "epoch": 0.1665, + "grad_norm": 3.6724284686856365, + "learning_rate": 8.325000000000001e-07, + "loss": 0.5489, + "mean_token_accuracy": 0.8412429094314575, + "step": 333 + }, + { + "epoch": 0.167, + "grad_norm": 4.095930304889529, + "learning_rate": 8.350000000000002e-07, + "loss": 0.4337, + "mean_token_accuracy": 0.8665246963500977, + "step": 334 + }, + { + "epoch": 0.1675, + "grad_norm": 2.9287513687789977, + "learning_rate": 8.375000000000001e-07, + "loss": 0.3888, + "mean_token_accuracy": 0.8895981311798096, + "step": 335 + }, + { + "epoch": 0.168, + "grad_norm": 4.365173378417767, + "learning_rate": 8.400000000000001e-07, + "loss": 0.492, + "mean_token_accuracy": 0.8503165245056152, + "step": 336 + }, + { + "epoch": 0.1685, + "grad_norm": 3.6529016701335437, + "learning_rate": 8.425000000000001e-07, + "loss": 0.4945, + "mean_token_accuracy": 0.8680707216262817, + "step": 337 + }, + { + "epoch": 0.169, + "grad_norm": 5.49065360343676, + "learning_rate": 8.450000000000002e-07, + "loss": 0.3835, + "mean_token_accuracy": 0.8884093761444092, + "step": 338 + }, + { + "epoch": 0.1695, + "grad_norm": 4.170729696853376, + "learning_rate": 8.475000000000001e-07, + "loss": 0.4296, + "mean_token_accuracy": 0.8654906153678894, + "step": 339 + }, + { + "epoch": 0.17, + "grad_norm": 2.6538414969821957, + "learning_rate": 8.500000000000001e-07, + "loss": 0.3238, + "mean_token_accuracy": 0.8962541818618774, + "step": 340 + }, + { + "epoch": 0.1705, + "grad_norm": 5.750985595969953, + "learning_rate": 8.525000000000001e-07, + "loss": 0.4264, + "mean_token_accuracy": 0.8742406964302063, + "step": 341 + }, + { + "epoch": 0.171, + "grad_norm": 5.653456632186626, + "learning_rate": 8.550000000000002e-07, + "loss": 0.6058, + "mean_token_accuracy": 0.8336173892021179, + "step": 342 + }, + { + "epoch": 0.1715, + "grad_norm": 3.8520228190161165, + "learning_rate": 8.575000000000002e-07, + "loss": 0.5225, + "mean_token_accuracy": 0.8432432413101196, + "step": 343 + }, + { + "epoch": 0.172, + "grad_norm": 11.908536330720864, + "learning_rate": 8.6e-07, + "loss": 0.4258, + "mean_token_accuracy": 0.8711666464805603, + "step": 344 + }, + { + "epoch": 0.1725, + "grad_norm": 3.137170097680878, + "learning_rate": 8.625e-07, + "loss": 0.5382, + "mean_token_accuracy": 0.8369052410125732, + "step": 345 + }, + { + "epoch": 0.173, + "grad_norm": 2.455534693710938, + "learning_rate": 8.65e-07, + "loss": 0.483, + "mean_token_accuracy": 0.8589116334915161, + "step": 346 + }, + { + "epoch": 0.1735, + "grad_norm": 5.951735631928892, + "learning_rate": 8.675000000000001e-07, + "loss": 0.4409, + "mean_token_accuracy": 0.8680762052536011, + "step": 347 + }, + { + "epoch": 0.174, + "grad_norm": 2.6749714699502842, + "learning_rate": 8.7e-07, + "loss": 0.6315, + "mean_token_accuracy": 0.8036792874336243, + "step": 348 + }, + { + "epoch": 0.1745, + "grad_norm": 3.496214202526413, + "learning_rate": 8.725e-07, + "loss": 0.3783, + "mean_token_accuracy": 0.8753733038902283, + "step": 349 + }, + { + "epoch": 0.175, + "grad_norm": 3.9637137149655004, + "learning_rate": 8.75e-07, + "loss": 0.4993, + "mean_token_accuracy": 0.8548266291618347, + "step": 350 + }, + { + "epoch": 0.1755, + "grad_norm": 2.4847994821380524, + "learning_rate": 8.775000000000001e-07, + "loss": 0.3744, + "mean_token_accuracy": 0.8869577646255493, + "step": 351 + }, + { + "epoch": 0.176, + "grad_norm": 3.4253871541026055, + "learning_rate": 8.8e-07, + "loss": 0.4748, + "mean_token_accuracy": 0.858090877532959, + "step": 352 + }, + { + "epoch": 0.1765, + "grad_norm": 4.3712695817831815, + "learning_rate": 8.825e-07, + "loss": 0.49, + "mean_token_accuracy": 0.8553351759910583, + "step": 353 + }, + { + "epoch": 0.177, + "grad_norm": 33.51864058672052, + "learning_rate": 8.85e-07, + "loss": 0.4682, + "mean_token_accuracy": 0.8669376373291016, + "step": 354 + }, + { + "epoch": 0.1775, + "grad_norm": 3.511974201698784, + "learning_rate": 8.875000000000001e-07, + "loss": 0.4392, + "mean_token_accuracy": 0.8716694116592407, + "step": 355 + }, + { + "epoch": 0.178, + "grad_norm": 6.2458111943731165, + "learning_rate": 8.900000000000001e-07, + "loss": 0.332, + "mean_token_accuracy": 0.8953214287757874, + "step": 356 + }, + { + "epoch": 0.1785, + "grad_norm": 2.457968974193809, + "learning_rate": 8.925e-07, + "loss": 0.4107, + "mean_token_accuracy": 0.8726665377616882, + "step": 357 + }, + { + "epoch": 0.179, + "grad_norm": 3.051538408375331, + "learning_rate": 8.95e-07, + "loss": 0.4316, + "mean_token_accuracy": 0.8637909889221191, + "step": 358 + }, + { + "epoch": 0.1795, + "grad_norm": 2.0658461065672977, + "learning_rate": 8.975000000000001e-07, + "loss": 0.3446, + "mean_token_accuracy": 0.8809255957603455, + "step": 359 + }, + { + "epoch": 0.18, + "grad_norm": 3.403612766958126, + "learning_rate": 9.000000000000001e-07, + "loss": 0.5898, + "mean_token_accuracy": 0.8346444964408875, + "step": 360 + }, + { + "epoch": 0.1805, + "grad_norm": 4.834976651497001, + "learning_rate": 9.025e-07, + "loss": 0.4244, + "mean_token_accuracy": 0.8694244623184204, + "step": 361 + }, + { + "epoch": 0.181, + "grad_norm": 2.481446972522839, + "learning_rate": 9.05e-07, + "loss": 0.4015, + "mean_token_accuracy": 0.8787557482719421, + "step": 362 + }, + { + "epoch": 0.1815, + "grad_norm": 4.566618105490305, + "learning_rate": 9.075000000000001e-07, + "loss": 0.4045, + "mean_token_accuracy": 0.8767096400260925, + "step": 363 + }, + { + "epoch": 0.182, + "grad_norm": 3.089101831167309, + "learning_rate": 9.100000000000001e-07, + "loss": 0.4325, + "mean_token_accuracy": 0.8672971129417419, + "step": 364 + }, + { + "epoch": 0.1825, + "grad_norm": 3.7222595616837375, + "learning_rate": 9.125e-07, + "loss": 0.4644, + "mean_token_accuracy": 0.8577759265899658, + "step": 365 + }, + { + "epoch": 0.183, + "grad_norm": 3.4094248186008054, + "learning_rate": 9.15e-07, + "loss": 0.3561, + "mean_token_accuracy": 0.8883348107337952, + "step": 366 + }, + { + "epoch": 0.1835, + "grad_norm": 2.3579351232405186, + "learning_rate": 9.175000000000001e-07, + "loss": 0.3451, + "mean_token_accuracy": 0.896847665309906, + "step": 367 + }, + { + "epoch": 0.184, + "grad_norm": 12.445442768492232, + "learning_rate": 9.200000000000001e-07, + "loss": 0.567, + "mean_token_accuracy": 0.8456966876983643, + "step": 368 + }, + { + "epoch": 0.1845, + "grad_norm": 2.8195649629741175, + "learning_rate": 9.225000000000001e-07, + "loss": 0.2928, + "mean_token_accuracy": 0.9026694297790527, + "step": 369 + }, + { + "epoch": 0.185, + "grad_norm": 4.831746930683186, + "learning_rate": 9.25e-07, + "loss": 0.4104, + "mean_token_accuracy": 0.8815943598747253, + "step": 370 + }, + { + "epoch": 0.1855, + "grad_norm": 2.4465492651116896, + "learning_rate": 9.275000000000001e-07, + "loss": 0.4014, + "mean_token_accuracy": 0.8746472597122192, + "step": 371 + }, + { + "epoch": 0.186, + "grad_norm": 3.11189229889603, + "learning_rate": 9.300000000000001e-07, + "loss": 0.5011, + "mean_token_accuracy": 0.865180492401123, + "step": 372 + }, + { + "epoch": 0.1865, + "grad_norm": 4.629404623545758, + "learning_rate": 9.325000000000001e-07, + "loss": 0.5515, + "mean_token_accuracy": 0.8351632952690125, + "step": 373 + }, + { + "epoch": 0.187, + "grad_norm": 2.7125127058032046, + "learning_rate": 9.35e-07, + "loss": 0.3746, + "mean_token_accuracy": 0.8825291991233826, + "step": 374 + }, + { + "epoch": 0.1875, + "grad_norm": 2.0898578863398773, + "learning_rate": 9.375000000000001e-07, + "loss": 0.2603, + "mean_token_accuracy": 0.9079907536506653, + "step": 375 + }, + { + "epoch": 0.188, + "grad_norm": 3.4620529622883285, + "learning_rate": 9.400000000000001e-07, + "loss": 0.6432, + "mean_token_accuracy": 0.8406538963317871, + "step": 376 + }, + { + "epoch": 0.1885, + "grad_norm": 2.3377173063718626, + "learning_rate": 9.425000000000001e-07, + "loss": 0.4063, + "mean_token_accuracy": 0.876895010471344, + "step": 377 + }, + { + "epoch": 0.189, + "grad_norm": 4.960699096417907, + "learning_rate": 9.450000000000001e-07, + "loss": 0.4024, + "mean_token_accuracy": 0.8788956999778748, + "step": 378 + }, + { + "epoch": 0.1895, + "grad_norm": 3.10458682660367, + "learning_rate": 9.475e-07, + "loss": 0.5064, + "mean_token_accuracy": 0.8448704481124878, + "step": 379 + }, + { + "epoch": 0.19, + "grad_norm": 2.9093306663626226, + "learning_rate": 9.500000000000001e-07, + "loss": 0.3479, + "mean_token_accuracy": 0.8904086351394653, + "step": 380 + }, + { + "epoch": 0.1905, + "grad_norm": 3.225013234996323, + "learning_rate": 9.525000000000001e-07, + "loss": 0.3828, + "mean_token_accuracy": 0.885650634765625, + "step": 381 + }, + { + "epoch": 0.191, + "grad_norm": 2.572310991164746, + "learning_rate": 9.550000000000002e-07, + "loss": 0.5021, + "mean_token_accuracy": 0.8423334956169128, + "step": 382 + }, + { + "epoch": 0.1915, + "grad_norm": 4.0351158350439915, + "learning_rate": 9.575000000000001e-07, + "loss": 0.5559, + "mean_token_accuracy": 0.8400127291679382, + "step": 383 + }, + { + "epoch": 0.192, + "grad_norm": 2.6482473124447434, + "learning_rate": 9.600000000000001e-07, + "loss": 0.3328, + "mean_token_accuracy": 0.8878240585327148, + "step": 384 + }, + { + "epoch": 0.1925, + "grad_norm": 2.8502518319101084, + "learning_rate": 9.625e-07, + "loss": 0.4099, + "mean_token_accuracy": 0.8692687153816223, + "step": 385 + }, + { + "epoch": 0.193, + "grad_norm": 4.196737584324953, + "learning_rate": 9.65e-07, + "loss": 0.493, + "mean_token_accuracy": 0.8594704866409302, + "step": 386 + }, + { + "epoch": 0.1935, + "grad_norm": 3.486378721442577, + "learning_rate": 9.675e-07, + "loss": 0.539, + "mean_token_accuracy": 0.8517847657203674, + "step": 387 + }, + { + "epoch": 0.194, + "grad_norm": 2.5868523456044277, + "learning_rate": 9.7e-07, + "loss": 0.3847, + "mean_token_accuracy": 0.8815299868583679, + "step": 388 + }, + { + "epoch": 0.1945, + "grad_norm": 2.5564925780823566, + "learning_rate": 9.725e-07, + "loss": 0.4526, + "mean_token_accuracy": 0.8623368740081787, + "step": 389 + }, + { + "epoch": 0.195, + "grad_norm": 4.992799197293632, + "learning_rate": 9.750000000000002e-07, + "loss": 0.2427, + "mean_token_accuracy": 0.9248460531234741, + "step": 390 + }, + { + "epoch": 0.1955, + "grad_norm": 6.068184233515326, + "learning_rate": 9.775000000000002e-07, + "loss": 0.3179, + "mean_token_accuracy": 0.8967213034629822, + "step": 391 + }, + { + "epoch": 0.196, + "grad_norm": 6.210752636948838, + "learning_rate": 9.800000000000001e-07, + "loss": 0.3842, + "mean_token_accuracy": 0.8929049372673035, + "step": 392 + }, + { + "epoch": 0.1965, + "grad_norm": 2.572056561819132, + "learning_rate": 9.825000000000001e-07, + "loss": 0.4181, + "mean_token_accuracy": 0.8701026439666748, + "step": 393 + }, + { + "epoch": 0.197, + "grad_norm": 3.7641972640362007, + "learning_rate": 9.85e-07, + "loss": 0.239, + "mean_token_accuracy": 0.9212028980255127, + "step": 394 + }, + { + "epoch": 0.1975, + "grad_norm": 4.346344396922438, + "learning_rate": 9.875e-07, + "loss": 0.4922, + "mean_token_accuracy": 0.8587275743484497, + "step": 395 + }, + { + "epoch": 0.198, + "grad_norm": 2.6414458501762135, + "learning_rate": 9.9e-07, + "loss": 0.3302, + "mean_token_accuracy": 0.8968820571899414, + "step": 396 + }, + { + "epoch": 0.1985, + "grad_norm": 3.6757501569133173, + "learning_rate": 9.925e-07, + "loss": 0.4617, + "mean_token_accuracy": 0.8704690933227539, + "step": 397 + }, + { + "epoch": 0.199, + "grad_norm": 3.830681788947014, + "learning_rate": 9.950000000000002e-07, + "loss": 0.4117, + "mean_token_accuracy": 0.8782467246055603, + "step": 398 + }, + { + "epoch": 0.1995, + "grad_norm": 2.508253713689785, + "learning_rate": 9.975000000000002e-07, + "loss": 0.4358, + "mean_token_accuracy": 0.8542020916938782, + "step": 399 + }, + { + "epoch": 0.2, + "grad_norm": 3.25164156720632, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8671806454658508, + "step": 400 + }, + { + "epoch": 0.2005, + "grad_norm": 2.196251784886038, + "learning_rate": 1.0025000000000001e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8672729730606079, + "step": 401 + }, + { + "epoch": 0.201, + "grad_norm": 3.5559156694383875, + "learning_rate": 1.0050000000000001e-06, + "loss": 0.5323, + "mean_token_accuracy": 0.8437981605529785, + "step": 402 + }, + { + "epoch": 0.2015, + "grad_norm": 3.093083976865327, + "learning_rate": 1.0075e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8510600924491882, + "step": 403 + }, + { + "epoch": 0.202, + "grad_norm": 4.177126083759835, + "learning_rate": 1.01e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8501346111297607, + "step": 404 + }, + { + "epoch": 0.2025, + "grad_norm": 3.2042400271672533, + "learning_rate": 1.0125e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8542195558547974, + "step": 405 + }, + { + "epoch": 0.203, + "grad_norm": 2.9924268057636176, + "learning_rate": 1.0150000000000002e-06, + "loss": 0.6106, + "mean_token_accuracy": 0.796768844127655, + "step": 406 + }, + { + "epoch": 0.2035, + "grad_norm": 6.948860794710829, + "learning_rate": 1.0175e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8684485554695129, + "step": 407 + }, + { + "epoch": 0.204, + "grad_norm": 15.695644232355495, + "learning_rate": 1.02e-06, + "loss": 0.503, + "mean_token_accuracy": 0.8506129384040833, + "step": 408 + }, + { + "epoch": 0.2045, + "grad_norm": 2.5154451836351654, + "learning_rate": 1.0225e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8960499167442322, + "step": 409 + }, + { + "epoch": 0.205, + "grad_norm": 7.047415837272822, + "learning_rate": 1.025e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8968172669410706, + "step": 410 + }, + { + "epoch": 0.2055, + "grad_norm": 3.731941513509143, + "learning_rate": 1.0275000000000001e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8656537532806396, + "step": 411 + }, + { + "epoch": 0.206, + "grad_norm": 7.175338025590295, + "learning_rate": 1.03e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8677467703819275, + "step": 412 + }, + { + "epoch": 0.2065, + "grad_norm": 4.280750151503222, + "learning_rate": 1.0325e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8681617975234985, + "step": 413 + }, + { + "epoch": 0.207, + "grad_norm": 3.5013283195465608, + "learning_rate": 1.035e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8831560015678406, + "step": 414 + }, + { + "epoch": 0.2075, + "grad_norm": 3.468629473325741, + "learning_rate": 1.0375e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8633715510368347, + "step": 415 + }, + { + "epoch": 0.208, + "grad_norm": 3.5038732281261065, + "learning_rate": 1.04e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.9081451892852783, + "step": 416 + }, + { + "epoch": 0.2085, + "grad_norm": 4.7559960125448955, + "learning_rate": 1.0425e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8527131676673889, + "step": 417 + }, + { + "epoch": 0.209, + "grad_norm": 7.409809064857581, + "learning_rate": 1.045e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8556668162345886, + "step": 418 + }, + { + "epoch": 0.2095, + "grad_norm": 4.168137047763808, + "learning_rate": 1.0475000000000001e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8702370524406433, + "step": 419 + }, + { + "epoch": 0.21, + "grad_norm": 3.048442659863358, + "learning_rate": 1.0500000000000001e-06, + "loss": 0.5258, + "mean_token_accuracy": 0.8391042351722717, + "step": 420 + }, + { + "epoch": 0.2105, + "grad_norm": 4.661388929354102, + "learning_rate": 1.0525e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8556998372077942, + "step": 421 + }, + { + "epoch": 0.211, + "grad_norm": 4.726959850952846, + "learning_rate": 1.055e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8707756400108337, + "step": 422 + }, + { + "epoch": 0.2115, + "grad_norm": 2.5127633656448936, + "learning_rate": 1.0575e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.876367449760437, + "step": 423 + }, + { + "epoch": 0.212, + "grad_norm": 2.9846734531267676, + "learning_rate": 1.06e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8566885590553284, + "step": 424 + }, + { + "epoch": 0.2125, + "grad_norm": 4.892694909166894, + "learning_rate": 1.0625e-06, + "loss": 0.6393, + "mean_token_accuracy": 0.8141864538192749, + "step": 425 + }, + { + "epoch": 0.213, + "grad_norm": 2.8068256536903995, + "learning_rate": 1.065e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8611932396888733, + "step": 426 + }, + { + "epoch": 0.2135, + "grad_norm": 3.4197171723813615, + "learning_rate": 1.0675000000000002e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.854664146900177, + "step": 427 + }, + { + "epoch": 0.214, + "grad_norm": 2.754478240445495, + "learning_rate": 1.0700000000000001e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8605422377586365, + "step": 428 + }, + { + "epoch": 0.2145, + "grad_norm": 5.741292826554025, + "learning_rate": 1.0725000000000001e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8600782752037048, + "step": 429 + }, + { + "epoch": 0.215, + "grad_norm": 10.281889522707454, + "learning_rate": 1.075e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8973162174224854, + "step": 430 + }, + { + "epoch": 0.2155, + "grad_norm": 5.141694950244242, + "learning_rate": 1.0775e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8571707010269165, + "step": 431 + }, + { + "epoch": 0.216, + "grad_norm": 2.905589235865156, + "learning_rate": 1.08e-06, + "loss": 0.5261, + "mean_token_accuracy": 0.8411600589752197, + "step": 432 + }, + { + "epoch": 0.2165, + "grad_norm": 2.2735890320576018, + "learning_rate": 1.0825e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8776041865348816, + "step": 433 + }, + { + "epoch": 0.217, + "grad_norm": 3.6091271837417653, + "learning_rate": 1.085e-06, + "loss": 0.5169, + "mean_token_accuracy": 0.8365407586097717, + "step": 434 + }, + { + "epoch": 0.2175, + "grad_norm": 3.544264349213069, + "learning_rate": 1.0875000000000002e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8539143204689026, + "step": 435 + }, + { + "epoch": 0.218, + "grad_norm": 2.947520352480788, + "learning_rate": 1.0900000000000002e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8524150252342224, + "step": 436 + }, + { + "epoch": 0.2185, + "grad_norm": 2.5300236069975726, + "learning_rate": 1.0925000000000001e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.85075843334198, + "step": 437 + }, + { + "epoch": 0.219, + "grad_norm": 11.355173649747435, + "learning_rate": 1.095e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8471516966819763, + "step": 438 + }, + { + "epoch": 0.2195, + "grad_norm": 3.3245013070437897, + "learning_rate": 1.0975e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.886805534362793, + "step": 439 + }, + { + "epoch": 0.22, + "grad_norm": 3.8888508460560502, + "learning_rate": 1.1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8681749701499939, + "step": 440 + }, + { + "epoch": 0.2205, + "grad_norm": 10.375495971443982, + "learning_rate": 1.1025e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8592636585235596, + "step": 441 + }, + { + "epoch": 0.221, + "grad_norm": 3.222916920248342, + "learning_rate": 1.105e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8602975606918335, + "step": 442 + }, + { + "epoch": 0.2215, + "grad_norm": 3.000042016266004, + "learning_rate": 1.1075000000000002e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8850299119949341, + "step": 443 + }, + { + "epoch": 0.222, + "grad_norm": 2.283914346393677, + "learning_rate": 1.1100000000000002e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.866108775138855, + "step": 444 + }, + { + "epoch": 0.2225, + "grad_norm": 13.373672254387571, + "learning_rate": 1.1125000000000001e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8948675394058228, + "step": 445 + }, + { + "epoch": 0.223, + "grad_norm": 2.836007020706504, + "learning_rate": 1.1150000000000001e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8481621146202087, + "step": 446 + }, + { + "epoch": 0.2235, + "grad_norm": 9.498883623208899, + "learning_rate": 1.1175e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8846283555030823, + "step": 447 + }, + { + "epoch": 0.224, + "grad_norm": 2.394160484738892, + "learning_rate": 1.12e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.887944221496582, + "step": 448 + }, + { + "epoch": 0.2245, + "grad_norm": 2.640629387772573, + "learning_rate": 1.1225e-06, + "loss": 0.406, + "mean_token_accuracy": 0.869941771030426, + "step": 449 + }, + { + "epoch": 0.225, + "grad_norm": 2.5333482691522624, + "learning_rate": 1.125e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8444416522979736, + "step": 450 + }, + { + "epoch": 0.2255, + "grad_norm": 3.964580042363198, + "learning_rate": 1.1275000000000002e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8536050915718079, + "step": 451 + }, + { + "epoch": 0.226, + "grad_norm": 3.116405662217796, + "learning_rate": 1.1300000000000002e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8707022666931152, + "step": 452 + }, + { + "epoch": 0.2265, + "grad_norm": 5.679826260817871, + "learning_rate": 1.1325000000000002e-06, + "loss": 0.501, + "mean_token_accuracy": 0.8515853881835938, + "step": 453 + }, + { + "epoch": 0.227, + "grad_norm": 2.235599429605642, + "learning_rate": 1.1350000000000001e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8845506310462952, + "step": 454 + }, + { + "epoch": 0.2275, + "grad_norm": 3.507120385886683, + "learning_rate": 1.1375000000000001e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8791090250015259, + "step": 455 + }, + { + "epoch": 0.228, + "grad_norm": 2.460380244026984, + "learning_rate": 1.14e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8708928227424622, + "step": 456 + }, + { + "epoch": 0.2285, + "grad_norm": 2.2258492601496824, + "learning_rate": 1.1425e-06, + "loss": 0.543, + "mean_token_accuracy": 0.8399950265884399, + "step": 457 + }, + { + "epoch": 0.229, + "grad_norm": 3.838612021111019, + "learning_rate": 1.145e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8596465587615967, + "step": 458 + }, + { + "epoch": 0.2295, + "grad_norm": 24.574968939403263, + "learning_rate": 1.1475000000000002e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8885486721992493, + "step": 459 + }, + { + "epoch": 0.23, + "grad_norm": 7.418773274411834, + "learning_rate": 1.1500000000000002e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.857994019985199, + "step": 460 + }, + { + "epoch": 0.2305, + "grad_norm": 5.322667231936087, + "learning_rate": 1.1525000000000002e-06, + "loss": 0.5291, + "mean_token_accuracy": 0.8475148677825928, + "step": 461 + }, + { + "epoch": 0.231, + "grad_norm": 4.4154643305888985, + "learning_rate": 1.1550000000000002e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8639817833900452, + "step": 462 + }, + { + "epoch": 0.2315, + "grad_norm": 3.967865500973328, + "learning_rate": 1.1575000000000001e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8823529481887817, + "step": 463 + }, + { + "epoch": 0.232, + "grad_norm": 7.054233886387562, + "learning_rate": 1.1600000000000001e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8874266147613525, + "step": 464 + }, + { + "epoch": 0.2325, + "grad_norm": 3.650464004517707, + "learning_rate": 1.1625e-06, + "loss": 0.5885, + "mean_token_accuracy": 0.8516460061073303, + "step": 465 + }, + { + "epoch": 0.233, + "grad_norm": 2.8787440413594023, + "learning_rate": 1.165e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.871828019618988, + "step": 466 + }, + { + "epoch": 0.2335, + "grad_norm": 2.6194962650159224, + "learning_rate": 1.1675000000000003e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.9045072793960571, + "step": 467 + }, + { + "epoch": 0.234, + "grad_norm": 2.137598619703235, + "learning_rate": 1.1700000000000002e-06, + "loss": 0.2971, + "mean_token_accuracy": 0.8980912566184998, + "step": 468 + }, + { + "epoch": 0.2345, + "grad_norm": 2.3986354128916023, + "learning_rate": 1.1725e-06, + "loss": 0.5989, + "mean_token_accuracy": 0.8330010771751404, + "step": 469 + }, + { + "epoch": 0.235, + "grad_norm": 4.907400053408162, + "learning_rate": 1.175e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8825021386146545, + "step": 470 + }, + { + "epoch": 0.2355, + "grad_norm": 4.4022005210340875, + "learning_rate": 1.1775e-06, + "loss": 0.6347, + "mean_token_accuracy": 0.8090737462043762, + "step": 471 + }, + { + "epoch": 0.236, + "grad_norm": 4.467355632362501, + "learning_rate": 1.1800000000000001e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8616219758987427, + "step": 472 + }, + { + "epoch": 0.2365, + "grad_norm": 8.07640976453766, + "learning_rate": 1.1825000000000001e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8723024725914001, + "step": 473 + }, + { + "epoch": 0.237, + "grad_norm": 2.3876432166467016, + "learning_rate": 1.185e-06, + "loss": 0.2945, + "mean_token_accuracy": 0.9026299715042114, + "step": 474 + }, + { + "epoch": 0.2375, + "grad_norm": 3.0987703856735704, + "learning_rate": 1.1875e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8702807426452637, + "step": 475 + }, + { + "epoch": 0.238, + "grad_norm": 2.8078049393555573, + "learning_rate": 1.19e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8666359782218933, + "step": 476 + }, + { + "epoch": 0.2385, + "grad_norm": 4.123143108075537, + "learning_rate": 1.1925e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8597205877304077, + "step": 477 + }, + { + "epoch": 0.239, + "grad_norm": 3.4959983862116393, + "learning_rate": 1.195e-06, + "loss": 0.5774, + "mean_token_accuracy": 0.8586819767951965, + "step": 478 + }, + { + "epoch": 0.2395, + "grad_norm": 3.4175980909953054, + "learning_rate": 1.1975e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8957042694091797, + "step": 479 + }, + { + "epoch": 0.24, + "grad_norm": 5.4537170592271975, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8704928755760193, + "step": 480 + }, + { + "epoch": 0.2405, + "grad_norm": 3.199624444932144, + "learning_rate": 1.2025000000000001e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8893598318099976, + "step": 481 + }, + { + "epoch": 0.241, + "grad_norm": 2.525905792363657, + "learning_rate": 1.2050000000000001e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8778376579284668, + "step": 482 + }, + { + "epoch": 0.2415, + "grad_norm": 4.506057046967999, + "learning_rate": 1.2075e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8607750535011292, + "step": 483 + }, + { + "epoch": 0.242, + "grad_norm": 3.239642073032744, + "learning_rate": 1.21e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.8432705998420715, + "step": 484 + }, + { + "epoch": 0.2425, + "grad_norm": 2.199820995359597, + "learning_rate": 1.2125e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8847934603691101, + "step": 485 + }, + { + "epoch": 0.243, + "grad_norm": 2.992451136053559, + "learning_rate": 1.215e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8609834909439087, + "step": 486 + }, + { + "epoch": 0.2435, + "grad_norm": 3.239189823885537, + "learning_rate": 1.2175e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8876319527626038, + "step": 487 + }, + { + "epoch": 0.244, + "grad_norm": 3.83145671419935, + "learning_rate": 1.2200000000000002e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8397995233535767, + "step": 488 + }, + { + "epoch": 0.2445, + "grad_norm": 2.9250411912298397, + "learning_rate": 1.2225000000000002e-06, + "loss": 0.5195, + "mean_token_accuracy": 0.8407617211341858, + "step": 489 + }, + { + "epoch": 0.245, + "grad_norm": 2.6780651957781183, + "learning_rate": 1.2250000000000001e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.9011319279670715, + "step": 490 + }, + { + "epoch": 0.2455, + "grad_norm": 2.7817682922979214, + "learning_rate": 1.2275000000000001e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8978585600852966, + "step": 491 + }, + { + "epoch": 0.246, + "grad_norm": 3.358653370826445, + "learning_rate": 1.23e-06, + "loss": 0.5226, + "mean_token_accuracy": 0.8498212099075317, + "step": 492 + }, + { + "epoch": 0.2465, + "grad_norm": 5.806685877693358, + "learning_rate": 1.2325e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8834578394889832, + "step": 493 + }, + { + "epoch": 0.247, + "grad_norm": 7.729598329153408, + "learning_rate": 1.235e-06, + "loss": 0.338, + "mean_token_accuracy": 0.888498842716217, + "step": 494 + }, + { + "epoch": 0.2475, + "grad_norm": 28.466389745314032, + "learning_rate": 1.2375e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8825021386146545, + "step": 495 + }, + { + "epoch": 0.248, + "grad_norm": 4.165164112209453, + "learning_rate": 1.2400000000000002e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8848625421524048, + "step": 496 + }, + { + "epoch": 0.2485, + "grad_norm": 6.695628609569135, + "learning_rate": 1.2425000000000002e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8771726489067078, + "step": 497 + }, + { + "epoch": 0.249, + "grad_norm": 13.743052903959132, + "learning_rate": 1.2450000000000002e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8709113001823425, + "step": 498 + }, + { + "epoch": 0.2495, + "grad_norm": 6.60861086959685, + "learning_rate": 1.2475000000000001e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.894058108329773, + "step": 499 + }, + { + "epoch": 0.25, + "grad_norm": 2.659503541586789, + "learning_rate": 1.25e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8753471374511719, + "step": 500 + }, + { + "epoch": 0.2505, + "grad_norm": 2.5971426739775167, + "learning_rate": 1.2525e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8836153149604797, + "step": 501 + }, + { + "epoch": 0.251, + "grad_norm": 2.437065130958607, + "learning_rate": 1.255e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8859084844589233, + "step": 502 + }, + { + "epoch": 0.2515, + "grad_norm": 8.27511140286702, + "learning_rate": 1.2575e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8835261464118958, + "step": 503 + }, + { + "epoch": 0.252, + "grad_norm": 2.669705811342528, + "learning_rate": 1.26e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.851424515247345, + "step": 504 + }, + { + "epoch": 0.2525, + "grad_norm": 2.115281275785455, + "learning_rate": 1.2625000000000002e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8906494975090027, + "step": 505 + }, + { + "epoch": 0.253, + "grad_norm": 7.272884263490958, + "learning_rate": 1.2650000000000002e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8463166356086731, + "step": 506 + }, + { + "epoch": 0.2535, + "grad_norm": 2.3849262819202477, + "learning_rate": 1.2675000000000001e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.8522545695304871, + "step": 507 + }, + { + "epoch": 0.254, + "grad_norm": 3.163498916374257, + "learning_rate": 1.2700000000000001e-06, + "loss": 0.5424, + "mean_token_accuracy": 0.8463992476463318, + "step": 508 + }, + { + "epoch": 0.2545, + "grad_norm": 3.5584516966177113, + "learning_rate": 1.2725e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8546833395957947, + "step": 509 + }, + { + "epoch": 0.255, + "grad_norm": 2.5026699860286743, + "learning_rate": 1.275e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8407468199729919, + "step": 510 + }, + { + "epoch": 0.2555, + "grad_norm": 2.8232150769699995, + "learning_rate": 1.2775e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8692397475242615, + "step": 511 + }, + { + "epoch": 0.256, + "grad_norm": 4.73574410658576, + "learning_rate": 1.28e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8579463362693787, + "step": 512 + }, + { + "epoch": 0.2565, + "grad_norm": 2.48332235965913, + "learning_rate": 1.2825000000000002e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8709530234336853, + "step": 513 + }, + { + "epoch": 0.257, + "grad_norm": 3.2411780702816815, + "learning_rate": 1.2850000000000002e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8610827326774597, + "step": 514 + }, + { + "epoch": 0.2575, + "grad_norm": 2.4763804598927166, + "learning_rate": 1.2875000000000002e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8787497878074646, + "step": 515 + }, + { + "epoch": 0.258, + "grad_norm": 3.688509495372377, + "learning_rate": 1.2900000000000001e-06, + "loss": 0.5435, + "mean_token_accuracy": 0.8263362050056458, + "step": 516 + }, + { + "epoch": 0.2585, + "grad_norm": 3.4196853073854956, + "learning_rate": 1.2925000000000001e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8608392477035522, + "step": 517 + }, + { + "epoch": 0.259, + "grad_norm": 3.139525814506908, + "learning_rate": 1.295e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.86317378282547, + "step": 518 + }, + { + "epoch": 0.2595, + "grad_norm": 54.72415427687482, + "learning_rate": 1.2975e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8504574298858643, + "step": 519 + }, + { + "epoch": 0.26, + "grad_norm": 3.09362821586923, + "learning_rate": 1.3e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8756811022758484, + "step": 520 + }, + { + "epoch": 0.2605, + "grad_norm": 123.47157384950772, + "learning_rate": 1.3025000000000002e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8796486854553223, + "step": 521 + }, + { + "epoch": 0.261, + "grad_norm": 7.75950228829246, + "learning_rate": 1.3050000000000002e-06, + "loss": 0.2492, + "mean_token_accuracy": 0.9167020916938782, + "step": 522 + }, + { + "epoch": 0.2615, + "grad_norm": 2.526488725332755, + "learning_rate": 1.3075000000000002e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8729439973831177, + "step": 523 + }, + { + "epoch": 0.262, + "grad_norm": 2.5432208727643353, + "learning_rate": 1.3100000000000002e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8711615204811096, + "step": 524 + }, + { + "epoch": 0.2625, + "grad_norm": 2.1028246452297195, + "learning_rate": 1.3125000000000001e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8956043720245361, + "step": 525 + }, + { + "epoch": 0.263, + "grad_norm": 6.751831113501939, + "learning_rate": 1.3150000000000001e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8776325583457947, + "step": 526 + }, + { + "epoch": 0.2635, + "grad_norm": 3.0621804846683296, + "learning_rate": 1.3175e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8725746273994446, + "step": 527 + }, + { + "epoch": 0.264, + "grad_norm": 10.131307538083595, + "learning_rate": 1.32e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8657371401786804, + "step": 528 + }, + { + "epoch": 0.2645, + "grad_norm": 3.6783921382212865, + "learning_rate": 1.3225000000000003e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8803636431694031, + "step": 529 + }, + { + "epoch": 0.265, + "grad_norm": 7.493799767556795, + "learning_rate": 1.3250000000000002e-06, + "loss": 0.5703, + "mean_token_accuracy": 0.8289835453033447, + "step": 530 + }, + { + "epoch": 0.2655, + "grad_norm": 3.4816633192247317, + "learning_rate": 1.3275000000000002e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.89914470911026, + "step": 531 + }, + { + "epoch": 0.266, + "grad_norm": 9.644221970273836, + "learning_rate": 1.3300000000000002e-06, + "loss": 0.46, + "mean_token_accuracy": 0.859552264213562, + "step": 532 + }, + { + "epoch": 0.2665, + "grad_norm": 2.4493811675343737, + "learning_rate": 1.3325000000000002e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8455507755279541, + "step": 533 + }, + { + "epoch": 0.267, + "grad_norm": 2.498589944238823, + "learning_rate": 1.3350000000000001e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8692626357078552, + "step": 534 + }, + { + "epoch": 0.2675, + "grad_norm": 2.1870607488495617, + "learning_rate": 1.3375000000000001e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8750210404396057, + "step": 535 + }, + { + "epoch": 0.268, + "grad_norm": 3.0967857873211893, + "learning_rate": 1.34e-06, + "loss": 0.3028, + "mean_token_accuracy": 0.8961254358291626, + "step": 536 + }, + { + "epoch": 0.2685, + "grad_norm": 7.826992706792521, + "learning_rate": 1.3425000000000003e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.864343523979187, + "step": 537 + }, + { + "epoch": 0.269, + "grad_norm": 2.109736129652651, + "learning_rate": 1.3450000000000003e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.897372841835022, + "step": 538 + }, + { + "epoch": 0.2695, + "grad_norm": 2.8657440690584517, + "learning_rate": 1.3475000000000002e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8987488150596619, + "step": 539 + }, + { + "epoch": 0.27, + "grad_norm": 11.833066996992683, + "learning_rate": 1.3500000000000002e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8617502450942993, + "step": 540 + }, + { + "epoch": 0.2705, + "grad_norm": 7.095445607325715, + "learning_rate": 1.3525000000000002e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8806769251823425, + "step": 541 + }, + { + "epoch": 0.271, + "grad_norm": 7.317716692230138, + "learning_rate": 1.3550000000000002e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8856325745582581, + "step": 542 + }, + { + "epoch": 0.2715, + "grad_norm": 4.268690223641264, + "learning_rate": 1.3575000000000001e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8633361458778381, + "step": 543 + }, + { + "epoch": 0.272, + "grad_norm": 3.565726576336461, + "learning_rate": 1.3600000000000001e-06, + "loss": 0.5991, + "mean_token_accuracy": 0.8212933540344238, + "step": 544 + }, + { + "epoch": 0.2725, + "grad_norm": 2.470286145732415, + "learning_rate": 1.3625000000000003e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8707593083381653, + "step": 545 + }, + { + "epoch": 0.273, + "grad_norm": 2.8323871759602928, + "learning_rate": 1.3650000000000003e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8641585111618042, + "step": 546 + }, + { + "epoch": 0.2735, + "grad_norm": 2.3432787219512634, + "learning_rate": 1.3675000000000002e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8618186712265015, + "step": 547 + }, + { + "epoch": 0.274, + "grad_norm": 17.19645731429941, + "learning_rate": 1.3700000000000002e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8938432335853577, + "step": 548 + }, + { + "epoch": 0.2745, + "grad_norm": 2.890306702553426, + "learning_rate": 1.3725000000000002e-06, + "loss": 0.5792, + "mean_token_accuracy": 0.8318789005279541, + "step": 549 + }, + { + "epoch": 0.275, + "grad_norm": 2.4824047221779786, + "learning_rate": 1.3750000000000002e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8828209638595581, + "step": 550 + }, + { + "epoch": 0.2755, + "grad_norm": 6.944838008599966, + "learning_rate": 1.3775000000000002e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8798291087150574, + "step": 551 + }, + { + "epoch": 0.276, + "grad_norm": 2.773439909123874, + "learning_rate": 1.3800000000000001e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8933050632476807, + "step": 552 + }, + { + "epoch": 0.2765, + "grad_norm": 2.1461902892505194, + "learning_rate": 1.3825000000000003e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8393926024436951, + "step": 553 + }, + { + "epoch": 0.277, + "grad_norm": 4.156189173072534, + "learning_rate": 1.3850000000000003e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8675535917282104, + "step": 554 + }, + { + "epoch": 0.2775, + "grad_norm": 3.655417255709178, + "learning_rate": 1.3875000000000003e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8938160538673401, + "step": 555 + }, + { + "epoch": 0.278, + "grad_norm": 4.052224634656976, + "learning_rate": 1.3900000000000002e-06, + "loss": 0.5167, + "mean_token_accuracy": 0.8468955159187317, + "step": 556 + }, + { + "epoch": 0.2785, + "grad_norm": 4.383072574180313, + "learning_rate": 1.3925000000000002e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8588504791259766, + "step": 557 + }, + { + "epoch": 0.279, + "grad_norm": 2.662514461432446, + "learning_rate": 1.3950000000000002e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8805156350135803, + "step": 558 + }, + { + "epoch": 0.2795, + "grad_norm": 5.667304720148319, + "learning_rate": 1.3975000000000002e-06, + "loss": 0.5147, + "mean_token_accuracy": 0.8406931161880493, + "step": 559 + }, + { + "epoch": 0.28, + "grad_norm": 4.299677486901654, + "learning_rate": 1.4000000000000001e-06, + "loss": 0.5306, + "mean_token_accuracy": 0.8373227119445801, + "step": 560 + }, + { + "epoch": 0.2805, + "grad_norm": 3.3396323244888504, + "learning_rate": 1.4025000000000003e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8667563796043396, + "step": 561 + }, + { + "epoch": 0.281, + "grad_norm": 3.8442499451727414, + "learning_rate": 1.4050000000000003e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8849493265151978, + "step": 562 + }, + { + "epoch": 0.2815, + "grad_norm": 3.2311306056650237, + "learning_rate": 1.4075e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8815701007843018, + "step": 563 + }, + { + "epoch": 0.282, + "grad_norm": 3.1816985375715734, + "learning_rate": 1.41e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8783107995986938, + "step": 564 + }, + { + "epoch": 0.2825, + "grad_norm": 13.739610641085552, + "learning_rate": 1.4125e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8761708736419678, + "step": 565 + }, + { + "epoch": 0.283, + "grad_norm": 2.816151688792399, + "learning_rate": 1.415e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8722705841064453, + "step": 566 + }, + { + "epoch": 0.2835, + "grad_norm": 3.4995307196153007, + "learning_rate": 1.4175e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8603761196136475, + "step": 567 + }, + { + "epoch": 0.284, + "grad_norm": 3.3760091790531837, + "learning_rate": 1.42e-06, + "loss": 0.5195, + "mean_token_accuracy": 0.8364385962486267, + "step": 568 + }, + { + "epoch": 0.2845, + "grad_norm": 5.44747712273763, + "learning_rate": 1.4225e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8630061745643616, + "step": 569 + }, + { + "epoch": 0.285, + "grad_norm": 3.2942537111969004, + "learning_rate": 1.425e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8562263250350952, + "step": 570 + }, + { + "epoch": 0.2855, + "grad_norm": 2.548201223351187, + "learning_rate": 1.4275e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8839501738548279, + "step": 571 + }, + { + "epoch": 0.286, + "grad_norm": 3.371800278485495, + "learning_rate": 1.43e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8690977096557617, + "step": 572 + }, + { + "epoch": 0.2865, + "grad_norm": 5.690556307941863, + "learning_rate": 1.4325e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8717067837715149, + "step": 573 + }, + { + "epoch": 0.287, + "grad_norm": 2.254373214090795, + "learning_rate": 1.435e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.900260865688324, + "step": 574 + }, + { + "epoch": 0.2875, + "grad_norm": 3.1072655202936446, + "learning_rate": 1.4375e-06, + "loss": 0.5684, + "mean_token_accuracy": 0.8244982957839966, + "step": 575 + }, + { + "epoch": 0.288, + "grad_norm": 4.878247660508092, + "learning_rate": 1.44e-06, + "loss": 0.2773, + "mean_token_accuracy": 0.9051240086555481, + "step": 576 + }, + { + "epoch": 0.2885, + "grad_norm": 3.4066310948936156, + "learning_rate": 1.4425e-06, + "loss": 0.7446, + "mean_token_accuracy": 0.7864951491355896, + "step": 577 + }, + { + "epoch": 0.289, + "grad_norm": 2.2880132326447535, + "learning_rate": 1.445e-06, + "loss": 0.3056, + "mean_token_accuracy": 0.9008129835128784, + "step": 578 + }, + { + "epoch": 0.2895, + "grad_norm": 3.160784785006124, + "learning_rate": 1.4475000000000001e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8590149283409119, + "step": 579 + }, + { + "epoch": 0.29, + "grad_norm": 3.060593415394006, + "learning_rate": 1.45e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8455005884170532, + "step": 580 + }, + { + "epoch": 0.2905, + "grad_norm": 7.755889020356052, + "learning_rate": 1.4525e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8534343838691711, + "step": 581 + }, + { + "epoch": 0.291, + "grad_norm": 3.1939163132382977, + "learning_rate": 1.455e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8869707584381104, + "step": 582 + }, + { + "epoch": 0.2915, + "grad_norm": 2.3686387846474086, + "learning_rate": 1.4575e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.886884331703186, + "step": 583 + }, + { + "epoch": 0.292, + "grad_norm": 4.2939553140987226, + "learning_rate": 1.46e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8839177489280701, + "step": 584 + }, + { + "epoch": 0.2925, + "grad_norm": 4.775980428979328, + "learning_rate": 1.4625e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8789212107658386, + "step": 585 + }, + { + "epoch": 0.293, + "grad_norm": 3.0386044363739386, + "learning_rate": 1.465e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.882847011089325, + "step": 586 + }, + { + "epoch": 0.2935, + "grad_norm": 8.15778133846193, + "learning_rate": 1.4675000000000001e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8795437216758728, + "step": 587 + }, + { + "epoch": 0.294, + "grad_norm": 4.580875806490521, + "learning_rate": 1.4700000000000001e-06, + "loss": 0.432, + "mean_token_accuracy": 0.869330883026123, + "step": 588 + }, + { + "epoch": 0.2945, + "grad_norm": 2.677001528732634, + "learning_rate": 1.4725e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8709349632263184, + "step": 589 + }, + { + "epoch": 0.295, + "grad_norm": 2.9021608111091974, + "learning_rate": 1.475e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8813056349754333, + "step": 590 + }, + { + "epoch": 0.2955, + "grad_norm": 4.111190549827192, + "learning_rate": 1.4775e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8688419461250305, + "step": 591 + }, + { + "epoch": 0.296, + "grad_norm": 3.062907425098525, + "learning_rate": 1.48e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8427377343177795, + "step": 592 + }, + { + "epoch": 0.2965, + "grad_norm": 2.020003880495077, + "learning_rate": 1.4825e-06, + "loss": 0.2661, + "mean_token_accuracy": 0.9084346294403076, + "step": 593 + }, + { + "epoch": 0.297, + "grad_norm": 3.837897463793231, + "learning_rate": 1.485e-06, + "loss": 0.5698, + "mean_token_accuracy": 0.8296089172363281, + "step": 594 + }, + { + "epoch": 0.2975, + "grad_norm": 3.4670363810199856, + "learning_rate": 1.4875000000000002e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8823314309120178, + "step": 595 + }, + { + "epoch": 0.298, + "grad_norm": 2.4677989413126347, + "learning_rate": 1.4900000000000001e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8919084668159485, + "step": 596 + }, + { + "epoch": 0.2985, + "grad_norm": 2.458336574715034, + "learning_rate": 1.4925000000000001e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8596740961074829, + "step": 597 + }, + { + "epoch": 0.299, + "grad_norm": 3.0856407276380824, + "learning_rate": 1.495e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8704984784126282, + "step": 598 + }, + { + "epoch": 0.2995, + "grad_norm": 6.774692424820504, + "learning_rate": 1.4975e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8853961825370789, + "step": 599 + }, + { + "epoch": 0.3, + "grad_norm": 3.0082434250808707, + "learning_rate": 1.5e-06, + "loss": 0.5405, + "mean_token_accuracy": 0.8467816710472107, + "step": 600 + }, + { + "epoch": 0.3005, + "grad_norm": 2.966834630059898, + "learning_rate": 1.5025e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8669788241386414, + "step": 601 + }, + { + "epoch": 0.301, + "grad_norm": 2.537039979003183, + "learning_rate": 1.505e-06, + "loss": 0.3143, + "mean_token_accuracy": 0.9003881216049194, + "step": 602 + }, + { + "epoch": 0.3015, + "grad_norm": 2.2303103388655288, + "learning_rate": 1.5075000000000002e-06, + "loss": 0.2841, + "mean_token_accuracy": 0.8986340761184692, + "step": 603 + }, + { + "epoch": 0.302, + "grad_norm": 3.9199178795528056, + "learning_rate": 1.5100000000000002e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8513315320014954, + "step": 604 + }, + { + "epoch": 0.3025, + "grad_norm": 3.04424090535173, + "learning_rate": 1.5125000000000001e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8612734079360962, + "step": 605 + }, + { + "epoch": 0.303, + "grad_norm": 2.163237794773555, + "learning_rate": 1.5150000000000001e-06, + "loss": 0.2475, + "mean_token_accuracy": 0.9158041477203369, + "step": 606 + }, + { + "epoch": 0.3035, + "grad_norm": 3.3494558993416814, + "learning_rate": 1.5175e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8574866056442261, + "step": 607 + }, + { + "epoch": 0.304, + "grad_norm": 2.658079269869628, + "learning_rate": 1.52e-06, + "loss": 0.7154, + "mean_token_accuracy": 0.8096446990966797, + "step": 608 + }, + { + "epoch": 0.3045, + "grad_norm": 4.512177226013666, + "learning_rate": 1.5225e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8741679191589355, + "step": 609 + }, + { + "epoch": 0.305, + "grad_norm": 4.295271066054665, + "learning_rate": 1.525e-06, + "loss": 0.3179, + "mean_token_accuracy": 0.8941265940666199, + "step": 610 + }, + { + "epoch": 0.3055, + "grad_norm": 6.567799428578436, + "learning_rate": 1.5275000000000002e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8405945897102356, + "step": 611 + }, + { + "epoch": 0.306, + "grad_norm": 3.0596113572946266, + "learning_rate": 1.5300000000000002e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8648211359977722, + "step": 612 + }, + { + "epoch": 0.3065, + "grad_norm": 2.7432291986840154, + "learning_rate": 1.5325000000000002e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8615692853927612, + "step": 613 + }, + { + "epoch": 0.307, + "grad_norm": 2.3191317293057403, + "learning_rate": 1.5350000000000001e-06, + "loss": 0.2966, + "mean_token_accuracy": 0.9050742983818054, + "step": 614 + }, + { + "epoch": 0.3075, + "grad_norm": 3.0105372214164547, + "learning_rate": 1.5375e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.854214608669281, + "step": 615 + }, + { + "epoch": 0.308, + "grad_norm": 2.5200718210227593, + "learning_rate": 1.54e-06, + "loss": 0.5355, + "mean_token_accuracy": 0.8370182514190674, + "step": 616 + }, + { + "epoch": 0.3085, + "grad_norm": 3.363009556417013, + "learning_rate": 1.5425e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.865298867225647, + "step": 617 + }, + { + "epoch": 0.309, + "grad_norm": 3.274564917558898, + "learning_rate": 1.545e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8759286999702454, + "step": 618 + }, + { + "epoch": 0.3095, + "grad_norm": 5.598254745731521, + "learning_rate": 1.5475000000000002e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.8667004704475403, + "step": 619 + }, + { + "epoch": 0.31, + "grad_norm": 2.7587798068502645, + "learning_rate": 1.5500000000000002e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8698752522468567, + "step": 620 + }, + { + "epoch": 0.3105, + "grad_norm": 2.900840086942216, + "learning_rate": 1.5525000000000002e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8754870295524597, + "step": 621 + }, + { + "epoch": 0.311, + "grad_norm": 2.631616219472046, + "learning_rate": 1.5550000000000001e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8884271383285522, + "step": 622 + }, + { + "epoch": 0.3115, + "grad_norm": 2.258525502822492, + "learning_rate": 1.5575000000000001e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8894230723381042, + "step": 623 + }, + { + "epoch": 0.312, + "grad_norm": 2.7981164841469397, + "learning_rate": 1.56e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8808106780052185, + "step": 624 + }, + { + "epoch": 0.3125, + "grad_norm": 3.095536538760297, + "learning_rate": 1.5625e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8776814341545105, + "step": 625 + }, + { + "epoch": 0.313, + "grad_norm": 65.0843836395049, + "learning_rate": 1.565e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8696049451828003, + "step": 626 + }, + { + "epoch": 0.3135, + "grad_norm": 3.711265291438733, + "learning_rate": 1.5675e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8714205622673035, + "step": 627 + }, + { + "epoch": 0.314, + "grad_norm": 3.057223206991299, + "learning_rate": 1.5700000000000002e-06, + "loss": 0.3036, + "mean_token_accuracy": 0.900486409664154, + "step": 628 + }, + { + "epoch": 0.3145, + "grad_norm": 2.982190183979293, + "learning_rate": 1.5725000000000002e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.864883303642273, + "step": 629 + }, + { + "epoch": 0.315, + "grad_norm": 3.0852251992033133, + "learning_rate": 1.5750000000000002e-06, + "loss": 0.6357, + "mean_token_accuracy": 0.8167709708213806, + "step": 630 + }, + { + "epoch": 0.3155, + "grad_norm": 3.5786098522207017, + "learning_rate": 1.5775000000000001e-06, + "loss": 0.528, + "mean_token_accuracy": 0.8273463249206543, + "step": 631 + }, + { + "epoch": 0.316, + "grad_norm": 2.9090227575531014, + "learning_rate": 1.5800000000000001e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8545246124267578, + "step": 632 + }, + { + "epoch": 0.3165, + "grad_norm": 9.23129022920944, + "learning_rate": 1.5825e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.9024096131324768, + "step": 633 + }, + { + "epoch": 0.317, + "grad_norm": 4.994332987705462, + "learning_rate": 1.585e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8539237380027771, + "step": 634 + }, + { + "epoch": 0.3175, + "grad_norm": 4.187316717036041, + "learning_rate": 1.5875e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.896200954914093, + "step": 635 + }, + { + "epoch": 0.318, + "grad_norm": 4.853162360556293, + "learning_rate": 1.5900000000000002e-06, + "loss": 0.5534, + "mean_token_accuracy": 0.8658950328826904, + "step": 636 + }, + { + "epoch": 0.3185, + "grad_norm": 6.659185225926735, + "learning_rate": 1.5925000000000002e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8850692510604858, + "step": 637 + }, + { + "epoch": 0.319, + "grad_norm": 5.901147301388738, + "learning_rate": 1.5950000000000002e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8911145329475403, + "step": 638 + }, + { + "epoch": 0.3195, + "grad_norm": 11.277134856075884, + "learning_rate": 1.5975000000000002e-06, + "loss": 0.5906, + "mean_token_accuracy": 0.8161345720291138, + "step": 639 + }, + { + "epoch": 0.32, + "grad_norm": 3.1761023266078254, + "learning_rate": 1.6000000000000001e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8649029731750488, + "step": 640 + }, + { + "epoch": 0.3205, + "grad_norm": 2.9945752178999645, + "learning_rate": 1.6025000000000001e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8570795655250549, + "step": 641 + }, + { + "epoch": 0.321, + "grad_norm": 2.5603230252923592, + "learning_rate": 1.605e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8832015991210938, + "step": 642 + }, + { + "epoch": 0.3215, + "grad_norm": 4.277113245653716, + "learning_rate": 1.6075e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.882776141166687, + "step": 643 + }, + { + "epoch": 0.322, + "grad_norm": 2.2843010228346325, + "learning_rate": 1.6100000000000003e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.900404691696167, + "step": 644 + }, + { + "epoch": 0.3225, + "grad_norm": 3.9454945273694837, + "learning_rate": 1.6125000000000002e-06, + "loss": 0.8115, + "mean_token_accuracy": 0.8075718879699707, + "step": 645 + }, + { + "epoch": 0.323, + "grad_norm": 4.398156287450177, + "learning_rate": 1.6150000000000002e-06, + "loss": 0.374, + "mean_token_accuracy": 0.884107768535614, + "step": 646 + }, + { + "epoch": 0.3235, + "grad_norm": 2.0450370041111308, + "learning_rate": 1.6175000000000002e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8922106027603149, + "step": 647 + }, + { + "epoch": 0.324, + "grad_norm": 21.420043244153185, + "learning_rate": 1.6200000000000002e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8816657066345215, + "step": 648 + }, + { + "epoch": 0.3245, + "grad_norm": 2.7666394852553804, + "learning_rate": 1.6225000000000001e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8824460506439209, + "step": 649 + }, + { + "epoch": 0.325, + "grad_norm": 3.2103821304258937, + "learning_rate": 1.6250000000000001e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8771325349807739, + "step": 650 + }, + { + "epoch": 0.3255, + "grad_norm": 5.9062159625964235, + "learning_rate": 1.6275e-06, + "loss": 0.55, + "mean_token_accuracy": 0.8440860509872437, + "step": 651 + }, + { + "epoch": 0.326, + "grad_norm": 2.8477089168075946, + "learning_rate": 1.6300000000000003e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8656716346740723, + "step": 652 + }, + { + "epoch": 0.3265, + "grad_norm": 2.0028362871623577, + "learning_rate": 1.6325000000000003e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8763161897659302, + "step": 653 + }, + { + "epoch": 0.327, + "grad_norm": 2.082165205464613, + "learning_rate": 1.6350000000000002e-06, + "loss": 0.2554, + "mean_token_accuracy": 0.9112634062767029, + "step": 654 + }, + { + "epoch": 0.3275, + "grad_norm": 2.2822407320876863, + "learning_rate": 1.6375000000000002e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8599014282226562, + "step": 655 + }, + { + "epoch": 0.328, + "grad_norm": 2.3677757502572194, + "learning_rate": 1.6400000000000002e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8601841330528259, + "step": 656 + }, + { + "epoch": 0.3285, + "grad_norm": 7.286992552962739, + "learning_rate": 1.6425000000000002e-06, + "loss": 0.5205, + "mean_token_accuracy": 0.8642703294754028, + "step": 657 + }, + { + "epoch": 0.329, + "grad_norm": 2.510804584347149, + "learning_rate": 1.6450000000000001e-06, + "loss": 0.481, + "mean_token_accuracy": 0.855781078338623, + "step": 658 + }, + { + "epoch": 0.3295, + "grad_norm": 4.22640239815148, + "learning_rate": 1.6475000000000001e-06, + "loss": 0.5183, + "mean_token_accuracy": 0.8666927218437195, + "step": 659 + }, + { + "epoch": 0.33, + "grad_norm": 3.5499966160741714, + "learning_rate": 1.6500000000000003e-06, + "loss": 0.531, + "mean_token_accuracy": 0.8442058563232422, + "step": 660 + }, + { + "epoch": 0.3305, + "grad_norm": 2.9694116414081777, + "learning_rate": 1.6525000000000003e-06, + "loss": 0.5679, + "mean_token_accuracy": 0.831453800201416, + "step": 661 + }, + { + "epoch": 0.331, + "grad_norm": 2.2956338743120113, + "learning_rate": 1.6550000000000002e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8694357872009277, + "step": 662 + }, + { + "epoch": 0.3315, + "grad_norm": 2.769325113156021, + "learning_rate": 1.6575000000000002e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8554235100746155, + "step": 663 + }, + { + "epoch": 0.332, + "grad_norm": 3.7622842056767816, + "learning_rate": 1.6600000000000002e-06, + "loss": 0.6138, + "mean_token_accuracy": 0.8180920481681824, + "step": 664 + }, + { + "epoch": 0.3325, + "grad_norm": 4.482983796416749, + "learning_rate": 1.6625000000000002e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8566345572471619, + "step": 665 + }, + { + "epoch": 0.333, + "grad_norm": 3.6135767534064978, + "learning_rate": 1.6650000000000002e-06, + "loss": 0.5382, + "mean_token_accuracy": 0.8392516374588013, + "step": 666 + }, + { + "epoch": 0.3335, + "grad_norm": 3.146000819006347, + "learning_rate": 1.6675000000000001e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8453492522239685, + "step": 667 + }, + { + "epoch": 0.334, + "grad_norm": 3.4357771729758486, + "learning_rate": 1.6700000000000003e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8771428465843201, + "step": 668 + }, + { + "epoch": 0.3345, + "grad_norm": 9.096910313899476, + "learning_rate": 1.6725000000000003e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8627005815505981, + "step": 669 + }, + { + "epoch": 0.335, + "grad_norm": 3.2314368957108655, + "learning_rate": 1.6750000000000003e-06, + "loss": 0.6293, + "mean_token_accuracy": 0.8227806091308594, + "step": 670 + }, + { + "epoch": 0.3355, + "grad_norm": 8.360023345831424, + "learning_rate": 1.6775000000000002e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.891548216342926, + "step": 671 + }, + { + "epoch": 0.336, + "grad_norm": 5.828326257064448, + "learning_rate": 1.6800000000000002e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8814935088157654, + "step": 672 + }, + { + "epoch": 0.3365, + "grad_norm": 3.622571609454911, + "learning_rate": 1.6825000000000002e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8667103052139282, + "step": 673 + }, + { + "epoch": 0.337, + "grad_norm": 3.4391871666483453, + "learning_rate": 1.6850000000000002e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8611776232719421, + "step": 674 + }, + { + "epoch": 0.3375, + "grad_norm": 4.91707238900694, + "learning_rate": 1.6875000000000001e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.8379416465759277, + "step": 675 + }, + { + "epoch": 0.338, + "grad_norm": 3.0879981089360466, + "learning_rate": 1.6900000000000003e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8727693557739258, + "step": 676 + }, + { + "epoch": 0.3385, + "grad_norm": 3.3100095758552595, + "learning_rate": 1.6925000000000003e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8900781273841858, + "step": 677 + }, + { + "epoch": 0.339, + "grad_norm": 1.9448565330767087, + "learning_rate": 1.6950000000000003e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8944610953330994, + "step": 678 + }, + { + "epoch": 0.3395, + "grad_norm": 2.8258417976590677, + "learning_rate": 1.6975000000000003e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8754289746284485, + "step": 679 + }, + { + "epoch": 0.34, + "grad_norm": 5.045041418850587, + "learning_rate": 1.7000000000000002e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8840749263763428, + "step": 680 + }, + { + "epoch": 0.3405, + "grad_norm": 3.286509167714639, + "learning_rate": 1.7025000000000002e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8729653358459473, + "step": 681 + }, + { + "epoch": 0.341, + "grad_norm": 2.311961980590325, + "learning_rate": 1.7050000000000002e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.883109986782074, + "step": 682 + }, + { + "epoch": 0.3415, + "grad_norm": 3.063690728985768, + "learning_rate": 1.7075000000000002e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8786135911941528, + "step": 683 + }, + { + "epoch": 0.342, + "grad_norm": 2.2524900396946643, + "learning_rate": 1.7100000000000004e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8785275816917419, + "step": 684 + }, + { + "epoch": 0.3425, + "grad_norm": 8.664561807934994, + "learning_rate": 1.7125000000000003e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8701399564743042, + "step": 685 + }, + { + "epoch": 0.343, + "grad_norm": 9.63911940500497, + "learning_rate": 1.7150000000000003e-06, + "loss": 0.5396, + "mean_token_accuracy": 0.8289409279823303, + "step": 686 + }, + { + "epoch": 0.3435, + "grad_norm": 2.5287268776625997, + "learning_rate": 1.7175000000000003e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8721519112586975, + "step": 687 + }, + { + "epoch": 0.344, + "grad_norm": 2.750833745561045, + "learning_rate": 1.72e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8832220435142517, + "step": 688 + }, + { + "epoch": 0.3445, + "grad_norm": 2.9781992610243164, + "learning_rate": 1.7225e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8762475252151489, + "step": 689 + }, + { + "epoch": 0.345, + "grad_norm": 2.5583110473677872, + "learning_rate": 1.725e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8863255977630615, + "step": 690 + }, + { + "epoch": 0.3455, + "grad_norm": 2.7969931288616126, + "learning_rate": 1.7275e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8812128305435181, + "step": 691 + }, + { + "epoch": 0.346, + "grad_norm": 2.3855193842290636, + "learning_rate": 1.73e-06, + "loss": 0.2954, + "mean_token_accuracy": 0.9065557718276978, + "step": 692 + }, + { + "epoch": 0.3465, + "grad_norm": 3.398514502222385, + "learning_rate": 1.7325e-06, + "loss": 0.5383, + "mean_token_accuracy": 0.8424153327941895, + "step": 693 + }, + { + "epoch": 0.347, + "grad_norm": 7.614352215594403, + "learning_rate": 1.7350000000000001e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8859196305274963, + "step": 694 + }, + { + "epoch": 0.3475, + "grad_norm": 2.9723151480146215, + "learning_rate": 1.7375e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8630356192588806, + "step": 695 + }, + { + "epoch": 0.348, + "grad_norm": 2.9612114192976726, + "learning_rate": 1.74e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8476967811584473, + "step": 696 + }, + { + "epoch": 0.3485, + "grad_norm": 4.052817055651729, + "learning_rate": 1.7425e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8795443177223206, + "step": 697 + }, + { + "epoch": 0.349, + "grad_norm": 3.0462767365573957, + "learning_rate": 1.745e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8524622917175293, + "step": 698 + }, + { + "epoch": 0.3495, + "grad_norm": 2.585146595401321, + "learning_rate": 1.7475e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8930075764656067, + "step": 699 + }, + { + "epoch": 0.35, + "grad_norm": 3.09925094131537, + "learning_rate": 1.75e-06, + "loss": 0.2534, + "mean_token_accuracy": 0.9131141901016235, + "step": 700 + }, + { + "epoch": 0.3505, + "grad_norm": 3.0955273913322814, + "learning_rate": 1.7525e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8826471567153931, + "step": 701 + }, + { + "epoch": 0.351, + "grad_norm": 2.9087049808279555, + "learning_rate": 1.7550000000000001e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8814516067504883, + "step": 702 + }, + { + "epoch": 0.3515, + "grad_norm": 2.7149510257276237, + "learning_rate": 1.7575000000000001e-06, + "loss": 0.2979, + "mean_token_accuracy": 0.8997406959533691, + "step": 703 + }, + { + "epoch": 0.352, + "grad_norm": 2.10920735188998, + "learning_rate": 1.76e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8555858135223389, + "step": 704 + }, + { + "epoch": 0.3525, + "grad_norm": 7.622918160000862, + "learning_rate": 1.7625e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8801428079605103, + "step": 705 + }, + { + "epoch": 0.353, + "grad_norm": 3.2861047801348255, + "learning_rate": 1.765e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8643625974655151, + "step": 706 + }, + { + "epoch": 0.3535, + "grad_norm": 11.773627950628606, + "learning_rate": 1.7675e-06, + "loss": 0.2924, + "mean_token_accuracy": 0.9033270478248596, + "step": 707 + }, + { + "epoch": 0.354, + "grad_norm": 6.707276070473103, + "learning_rate": 1.77e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.873410701751709, + "step": 708 + }, + { + "epoch": 0.3545, + "grad_norm": 2.448862958683539, + "learning_rate": 1.7725e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8614374995231628, + "step": 709 + }, + { + "epoch": 0.355, + "grad_norm": 5.586471543020632, + "learning_rate": 1.7750000000000002e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8846370577812195, + "step": 710 + }, + { + "epoch": 0.3555, + "grad_norm": 3.0727138549678013, + "learning_rate": 1.7775000000000001e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8575806617736816, + "step": 711 + }, + { + "epoch": 0.356, + "grad_norm": 3.250655393127768, + "learning_rate": 1.7800000000000001e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8748832941055298, + "step": 712 + }, + { + "epoch": 0.3565, + "grad_norm": 3.0355241702206563, + "learning_rate": 1.7825e-06, + "loss": 0.311, + "mean_token_accuracy": 0.8985875248908997, + "step": 713 + }, + { + "epoch": 0.357, + "grad_norm": 3.314285333049772, + "learning_rate": 1.785e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8560391068458557, + "step": 714 + }, + { + "epoch": 0.3575, + "grad_norm": 3.3626139998010895, + "learning_rate": 1.7875e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.88346928358078, + "step": 715 + }, + { + "epoch": 0.358, + "grad_norm": 2.6782265861179986, + "learning_rate": 1.79e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8780338764190674, + "step": 716 + }, + { + "epoch": 0.3585, + "grad_norm": 4.364242965999136, + "learning_rate": 1.7925e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8789787292480469, + "step": 717 + }, + { + "epoch": 0.359, + "grad_norm": 3.9211923290610895, + "learning_rate": 1.7950000000000002e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8654388189315796, + "step": 718 + }, + { + "epoch": 0.3595, + "grad_norm": 3.3515489580540896, + "learning_rate": 1.7975000000000002e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8988910913467407, + "step": 719 + }, + { + "epoch": 0.36, + "grad_norm": 2.444536814364808, + "learning_rate": 1.8000000000000001e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8541828989982605, + "step": 720 + }, + { + "epoch": 0.3605, + "grad_norm": 2.3659283686575683, + "learning_rate": 1.8025000000000001e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8883705139160156, + "step": 721 + }, + { + "epoch": 0.361, + "grad_norm": 3.3107080610446635, + "learning_rate": 1.805e-06, + "loss": 0.58, + "mean_token_accuracy": 0.8282907605171204, + "step": 722 + }, + { + "epoch": 0.3615, + "grad_norm": 6.416445348170384, + "learning_rate": 1.8075e-06, + "loss": 0.3176, + "mean_token_accuracy": 0.8978707194328308, + "step": 723 + }, + { + "epoch": 0.362, + "grad_norm": 2.7005266208586263, + "learning_rate": 1.81e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8868764638900757, + "step": 724 + }, + { + "epoch": 0.3625, + "grad_norm": 2.7431589397609137, + "learning_rate": 1.8125e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8818556666374207, + "step": 725 + }, + { + "epoch": 0.363, + "grad_norm": 2.6959218756400745, + "learning_rate": 1.8150000000000002e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8970510959625244, + "step": 726 + }, + { + "epoch": 0.3635, + "grad_norm": 2.8901840397164618, + "learning_rate": 1.8175000000000002e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.8915585279464722, + "step": 727 + }, + { + "epoch": 0.364, + "grad_norm": 2.4049501147480314, + "learning_rate": 1.8200000000000002e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8438547849655151, + "step": 728 + }, + { + "epoch": 0.3645, + "grad_norm": 3.6996479754027134, + "learning_rate": 1.8225000000000001e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.8587704300880432, + "step": 729 + }, + { + "epoch": 0.365, + "grad_norm": 3.1203496535272484, + "learning_rate": 1.825e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8731768131256104, + "step": 730 + }, + { + "epoch": 0.3655, + "grad_norm": 11.57680993751205, + "learning_rate": 1.8275e-06, + "loss": 0.2963, + "mean_token_accuracy": 0.9015178680419922, + "step": 731 + }, + { + "epoch": 0.366, + "grad_norm": 6.13981292129569, + "learning_rate": 1.83e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8804141283035278, + "step": 732 + }, + { + "epoch": 0.3665, + "grad_norm": 3.0166068495092726, + "learning_rate": 1.8325e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8749292492866516, + "step": 733 + }, + { + "epoch": 0.367, + "grad_norm": 2.8011825285243406, + "learning_rate": 1.8350000000000002e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8832905888557434, + "step": 734 + }, + { + "epoch": 0.3675, + "grad_norm": 2.6861501635807117, + "learning_rate": 1.8375000000000002e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8740915060043335, + "step": 735 + }, + { + "epoch": 0.368, + "grad_norm": 3.6397077084993392, + "learning_rate": 1.8400000000000002e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8664320707321167, + "step": 736 + }, + { + "epoch": 0.3685, + "grad_norm": 2.184022062808553, + "learning_rate": 1.8425000000000001e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8749291896820068, + "step": 737 + }, + { + "epoch": 0.369, + "grad_norm": 3.8909962865054366, + "learning_rate": 1.8450000000000001e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8877279758453369, + "step": 738 + }, + { + "epoch": 0.3695, + "grad_norm": 3.7824923704850586, + "learning_rate": 1.8475e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8662811517715454, + "step": 739 + }, + { + "epoch": 0.37, + "grad_norm": 2.4783608999533597, + "learning_rate": 1.85e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8425685167312622, + "step": 740 + }, + { + "epoch": 0.3705, + "grad_norm": 2.195767135691227, + "learning_rate": 1.8525e-06, + "loss": 0.2912, + "mean_token_accuracy": 0.9113015532493591, + "step": 741 + }, + { + "epoch": 0.371, + "grad_norm": 3.4433383664600408, + "learning_rate": 1.8550000000000002e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8486425876617432, + "step": 742 + }, + { + "epoch": 0.3715, + "grad_norm": 3.0287280333980484, + "learning_rate": 1.8575000000000002e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8987511396408081, + "step": 743 + }, + { + "epoch": 0.372, + "grad_norm": 4.081749100145511, + "learning_rate": 1.8600000000000002e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8504780530929565, + "step": 744 + }, + { + "epoch": 0.3725, + "grad_norm": 3.7217867167192216, + "learning_rate": 1.8625000000000002e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8289563655853271, + "step": 745 + }, + { + "epoch": 0.373, + "grad_norm": 9.7241541827513, + "learning_rate": 1.8650000000000001e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8762927055358887, + "step": 746 + }, + { + "epoch": 0.3735, + "grad_norm": 4.050028722123843, + "learning_rate": 1.8675000000000001e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8641182780265808, + "step": 747 + }, + { + "epoch": 0.374, + "grad_norm": 3.0386469556644, + "learning_rate": 1.87e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.8961505889892578, + "step": 748 + }, + { + "epoch": 0.3745, + "grad_norm": 2.053451928402059, + "learning_rate": 1.8725e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.887458324432373, + "step": 749 + }, + { + "epoch": 0.375, + "grad_norm": 2.3820991419719575, + "learning_rate": 1.8750000000000003e-06, + "loss": 0.5367, + "mean_token_accuracy": 0.8358404040336609, + "step": 750 + }, + { + "epoch": 0.3755, + "grad_norm": 2.8859528978941733, + "learning_rate": 1.8775000000000002e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8652310967445374, + "step": 751 + }, + { + "epoch": 0.376, + "grad_norm": 2.7655269646697405, + "learning_rate": 1.8800000000000002e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8548784255981445, + "step": 752 + }, + { + "epoch": 0.3765, + "grad_norm": 2.0961639057300125, + "learning_rate": 1.8825000000000002e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.9011173844337463, + "step": 753 + }, + { + "epoch": 0.377, + "grad_norm": 5.830087222892792, + "learning_rate": 1.8850000000000002e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.8457426428794861, + "step": 754 + }, + { + "epoch": 0.3775, + "grad_norm": 4.266696851039516, + "learning_rate": 1.8875000000000001e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8849902749061584, + "step": 755 + }, + { + "epoch": 0.378, + "grad_norm": 9.924034356319824, + "learning_rate": 1.8900000000000001e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8747422695159912, + "step": 756 + }, + { + "epoch": 0.3785, + "grad_norm": 4.063779949634894, + "learning_rate": 1.8925e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8726746439933777, + "step": 757 + }, + { + "epoch": 0.379, + "grad_norm": 5.518516043736674, + "learning_rate": 1.895e-06, + "loss": 0.2717, + "mean_token_accuracy": 0.9050430655479431, + "step": 758 + }, + { + "epoch": 0.3795, + "grad_norm": 2.736955065422879, + "learning_rate": 1.8975000000000003e-06, + "loss": 0.3048, + "mean_token_accuracy": 0.8992993235588074, + "step": 759 + }, + { + "epoch": 0.38, + "grad_norm": 2.4635972360377547, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8729166388511658, + "step": 760 + }, + { + "epoch": 0.3805, + "grad_norm": 6.6180044766864095, + "learning_rate": 1.9025000000000002e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8649616241455078, + "step": 761 + }, + { + "epoch": 0.381, + "grad_norm": 2.828519801207426, + "learning_rate": 1.9050000000000002e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8702743649482727, + "step": 762 + }, + { + "epoch": 0.3815, + "grad_norm": 2.046460562907205, + "learning_rate": 1.9075000000000004e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8747141361236572, + "step": 763 + }, + { + "epoch": 0.382, + "grad_norm": 2.527721265369041, + "learning_rate": 1.9100000000000003e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8940276503562927, + "step": 764 + }, + { + "epoch": 0.3825, + "grad_norm": 4.029670679433298, + "learning_rate": 1.9125000000000003e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8640756011009216, + "step": 765 + }, + { + "epoch": 0.383, + "grad_norm": 8.319841386741965, + "learning_rate": 1.9150000000000003e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.8567620515823364, + "step": 766 + }, + { + "epoch": 0.3835, + "grad_norm": 2.700873441993634, + "learning_rate": 1.9175000000000003e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.8574208617210388, + "step": 767 + }, + { + "epoch": 0.384, + "grad_norm": 2.1577629755396157, + "learning_rate": 1.9200000000000003e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8527722358703613, + "step": 768 + }, + { + "epoch": 0.3845, + "grad_norm": 3.7352171772098526, + "learning_rate": 1.9225000000000002e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.877599835395813, + "step": 769 + }, + { + "epoch": 0.385, + "grad_norm": 3.7694206168754785, + "learning_rate": 1.925e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8818573951721191, + "step": 770 + }, + { + "epoch": 0.3855, + "grad_norm": 2.6478657435688167, + "learning_rate": 1.9275e-06, + "loss": 0.313, + "mean_token_accuracy": 0.9015277028083801, + "step": 771 + }, + { + "epoch": 0.386, + "grad_norm": 3.2866825432067426, + "learning_rate": 1.93e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8697038888931274, + "step": 772 + }, + { + "epoch": 0.3865, + "grad_norm": 4.460861876618227, + "learning_rate": 1.9325e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8813523650169373, + "step": 773 + }, + { + "epoch": 0.387, + "grad_norm": 3.437978780822134, + "learning_rate": 1.935e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8584210276603699, + "step": 774 + }, + { + "epoch": 0.3875, + "grad_norm": 2.9970131524874293, + "learning_rate": 1.9375e-06, + "loss": 0.2667, + "mean_token_accuracy": 0.9116814732551575, + "step": 775 + }, + { + "epoch": 0.388, + "grad_norm": 5.531379996659423, + "learning_rate": 1.94e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8589029312133789, + "step": 776 + }, + { + "epoch": 0.3885, + "grad_norm": 9.975096280671208, + "learning_rate": 1.9425e-06, + "loss": 0.5413, + "mean_token_accuracy": 0.8521881103515625, + "step": 777 + }, + { + "epoch": 0.389, + "grad_norm": 5.557830201842082, + "learning_rate": 1.945e-06, + "loss": 0.5339, + "mean_token_accuracy": 0.8311886191368103, + "step": 778 + }, + { + "epoch": 0.3895, + "grad_norm": 2.7737810980549837, + "learning_rate": 1.9475000000000004e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8781977295875549, + "step": 779 + }, + { + "epoch": 0.39, + "grad_norm": 2.6174186677619784, + "learning_rate": 1.9500000000000004e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8994082808494568, + "step": 780 + }, + { + "epoch": 0.3905, + "grad_norm": 2.1387914559353574, + "learning_rate": 1.9525000000000004e-06, + "loss": 0.2966, + "mean_token_accuracy": 0.9024346470832825, + "step": 781 + }, + { + "epoch": 0.391, + "grad_norm": 3.92648778228015, + "learning_rate": 1.9550000000000003e-06, + "loss": 0.635, + "mean_token_accuracy": 0.8085071444511414, + "step": 782 + }, + { + "epoch": 0.3915, + "grad_norm": 2.3987472488744266, + "learning_rate": 1.9575000000000003e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8551068902015686, + "step": 783 + }, + { + "epoch": 0.392, + "grad_norm": 3.1326973202473374, + "learning_rate": 1.9600000000000003e-06, + "loss": 0.537, + "mean_token_accuracy": 0.8602297306060791, + "step": 784 + }, + { + "epoch": 0.3925, + "grad_norm": 6.440456322978317, + "learning_rate": 1.9625000000000003e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8928437829017639, + "step": 785 + }, + { + "epoch": 0.393, + "grad_norm": 2.9010077759228774, + "learning_rate": 1.9650000000000002e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8681454062461853, + "step": 786 + }, + { + "epoch": 0.3935, + "grad_norm": 2.394985947684964, + "learning_rate": 1.9675000000000002e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8816878795623779, + "step": 787 + }, + { + "epoch": 0.394, + "grad_norm": 2.362460921112624, + "learning_rate": 1.97e-06, + "loss": 0.2918, + "mean_token_accuracy": 0.900814950466156, + "step": 788 + }, + { + "epoch": 0.3945, + "grad_norm": 2.200275103205839, + "learning_rate": 1.9725e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8934651017189026, + "step": 789 + }, + { + "epoch": 0.395, + "grad_norm": 3.572266344000248, + "learning_rate": 1.975e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8633911609649658, + "step": 790 + }, + { + "epoch": 0.3955, + "grad_norm": 2.577801075111681, + "learning_rate": 1.9775e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8646694421768188, + "step": 791 + }, + { + "epoch": 0.396, + "grad_norm": 3.4042287804611893, + "learning_rate": 1.98e-06, + "loss": 0.5387, + "mean_token_accuracy": 0.8433639407157898, + "step": 792 + }, + { + "epoch": 0.3965, + "grad_norm": 2.715975149180644, + "learning_rate": 1.9825e-06, + "loss": 0.586, + "mean_token_accuracy": 0.8184326887130737, + "step": 793 + }, + { + "epoch": 0.397, + "grad_norm": 2.594646725101913, + "learning_rate": 1.985e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8627078533172607, + "step": 794 + }, + { + "epoch": 0.3975, + "grad_norm": 2.4258436491179385, + "learning_rate": 1.9875000000000005e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8749533295631409, + "step": 795 + }, + { + "epoch": 0.398, + "grad_norm": 2.9384231492776025, + "learning_rate": 1.9900000000000004e-06, + "loss": 0.5878, + "mean_token_accuracy": 0.8306654095649719, + "step": 796 + }, + { + "epoch": 0.3985, + "grad_norm": 1.743820138109884, + "learning_rate": 1.9925000000000004e-06, + "loss": 0.2956, + "mean_token_accuracy": 0.902953565120697, + "step": 797 + }, + { + "epoch": 0.399, + "grad_norm": 2.478119587772704, + "learning_rate": 1.9950000000000004e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8720490336418152, + "step": 798 + }, + { + "epoch": 0.3995, + "grad_norm": 3.121071231257071, + "learning_rate": 1.9975000000000004e-06, + "loss": 0.5414, + "mean_token_accuracy": 0.852888822555542, + "step": 799 + }, + { + "epoch": 0.4, + "grad_norm": 2.5621939080626928, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.5208, + "mean_token_accuracy": 0.8477929830551147, + "step": 800 + }, + { + "epoch": 0.4005, + "grad_norm": 3.4201125011406908, + "learning_rate": 2.0025000000000003e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8791071176528931, + "step": 801 + }, + { + "epoch": 0.401, + "grad_norm": 2.525741374937095, + "learning_rate": 2.0050000000000003e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8761569857597351, + "step": 802 + }, + { + "epoch": 0.4015, + "grad_norm": 2.8148620359877063, + "learning_rate": 2.0075000000000003e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8894028663635254, + "step": 803 + }, + { + "epoch": 0.402, + "grad_norm": 4.192770052139345, + "learning_rate": 2.0100000000000002e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8842519521713257, + "step": 804 + }, + { + "epoch": 0.4025, + "grad_norm": 2.457177754718361, + "learning_rate": 2.0125000000000002e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.8538563251495361, + "step": 805 + }, + { + "epoch": 0.403, + "grad_norm": 2.7327659278306764, + "learning_rate": 2.015e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8586010932922363, + "step": 806 + }, + { + "epoch": 0.4035, + "grad_norm": 6.048538129337998, + "learning_rate": 2.0175e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8762176036834717, + "step": 807 + }, + { + "epoch": 0.404, + "grad_norm": 44.20723066778965, + "learning_rate": 2.02e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8407825231552124, + "step": 808 + }, + { + "epoch": 0.4045, + "grad_norm": 3.2715406558304236, + "learning_rate": 2.0225e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8695944547653198, + "step": 809 + }, + { + "epoch": 0.405, + "grad_norm": 3.7460352363242535, + "learning_rate": 2.025e-06, + "loss": 0.5441, + "mean_token_accuracy": 0.8311302661895752, + "step": 810 + }, + { + "epoch": 0.4055, + "grad_norm": 2.8443508541642073, + "learning_rate": 2.0275000000000005e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8491449356079102, + "step": 811 + }, + { + "epoch": 0.406, + "grad_norm": 2.3467214845429982, + "learning_rate": 2.0300000000000005e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.867108941078186, + "step": 812 + }, + { + "epoch": 0.4065, + "grad_norm": 4.984444096317264, + "learning_rate": 2.0325e-06, + "loss": 0.5589, + "mean_token_accuracy": 0.837151825428009, + "step": 813 + }, + { + "epoch": 0.407, + "grad_norm": 2.449394728794601, + "learning_rate": 2.035e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.878616452217102, + "step": 814 + }, + { + "epoch": 0.4075, + "grad_norm": 3.9713583462668485, + "learning_rate": 2.0375e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8731935024261475, + "step": 815 + }, + { + "epoch": 0.408, + "grad_norm": 3.7734666037900713, + "learning_rate": 2.04e-06, + "loss": 0.5, + "mean_token_accuracy": 0.8536542654037476, + "step": 816 + }, + { + "epoch": 0.4085, + "grad_norm": 3.1668453692474796, + "learning_rate": 2.0425e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8886559009552002, + "step": 817 + }, + { + "epoch": 0.409, + "grad_norm": 4.526606618091803, + "learning_rate": 2.045e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.861483633518219, + "step": 818 + }, + { + "epoch": 0.4095, + "grad_norm": 2.3716094619075654, + "learning_rate": 2.0475e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8775793313980103, + "step": 819 + }, + { + "epoch": 0.41, + "grad_norm": 3.1708139102113515, + "learning_rate": 2.05e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8860813975334167, + "step": 820 + }, + { + "epoch": 0.4105, + "grad_norm": 2.523182334467328, + "learning_rate": 2.0525000000000003e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8649803996086121, + "step": 821 + }, + { + "epoch": 0.411, + "grad_norm": 2.2682829836328353, + "learning_rate": 2.0550000000000002e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8655844330787659, + "step": 822 + }, + { + "epoch": 0.4115, + "grad_norm": 2.0010241716958155, + "learning_rate": 2.0575e-06, + "loss": 0.2738, + "mean_token_accuracy": 0.9066067934036255, + "step": 823 + }, + { + "epoch": 0.412, + "grad_norm": 2.4371728249703306, + "learning_rate": 2.06e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8947368264198303, + "step": 824 + }, + { + "epoch": 0.4125, + "grad_norm": 7.801940640536967, + "learning_rate": 2.0625e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8691203594207764, + "step": 825 + }, + { + "epoch": 0.413, + "grad_norm": 2.627990356666174, + "learning_rate": 2.065e-06, + "loss": 0.5314, + "mean_token_accuracy": 0.8449873924255371, + "step": 826 + }, + { + "epoch": 0.4135, + "grad_norm": 2.3960883163667384, + "learning_rate": 2.0675e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.8384693264961243, + "step": 827 + }, + { + "epoch": 0.414, + "grad_norm": 2.2730292376225756, + "learning_rate": 2.07e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8897058963775635, + "step": 828 + }, + { + "epoch": 0.4145, + "grad_norm": 2.8443364552843904, + "learning_rate": 2.0725e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.876762330532074, + "step": 829 + }, + { + "epoch": 0.415, + "grad_norm": 2.779715420072783, + "learning_rate": 2.075e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8554030656814575, + "step": 830 + }, + { + "epoch": 0.4155, + "grad_norm": 3.9857611236237283, + "learning_rate": 2.0775e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8529607653617859, + "step": 831 + }, + { + "epoch": 0.416, + "grad_norm": 2.3023370036814343, + "learning_rate": 2.08e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8741443157196045, + "step": 832 + }, + { + "epoch": 0.4165, + "grad_norm": 2.576260051992714, + "learning_rate": 2.0825e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.882629930973053, + "step": 833 + }, + { + "epoch": 0.417, + "grad_norm": 4.109618268242055, + "learning_rate": 2.085e-06, + "loss": 0.5274, + "mean_token_accuracy": 0.8445800542831421, + "step": 834 + }, + { + "epoch": 0.4175, + "grad_norm": 3.415179814664979, + "learning_rate": 2.0875e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8619718551635742, + "step": 835 + }, + { + "epoch": 0.418, + "grad_norm": 3.649203758514362, + "learning_rate": 2.09e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8787272572517395, + "step": 836 + }, + { + "epoch": 0.4185, + "grad_norm": 2.4660099534768545, + "learning_rate": 2.0925000000000003e-06, + "loss": 0.2747, + "mean_token_accuracy": 0.905021607875824, + "step": 837 + }, + { + "epoch": 0.419, + "grad_norm": 2.5093707967271492, + "learning_rate": 2.0950000000000003e-06, + "loss": 0.6784, + "mean_token_accuracy": 0.7932776212692261, + "step": 838 + }, + { + "epoch": 0.4195, + "grad_norm": 2.9784495783173126, + "learning_rate": 2.0975000000000002e-06, + "loss": 0.5549, + "mean_token_accuracy": 0.8338017463684082, + "step": 839 + }, + { + "epoch": 0.42, + "grad_norm": 11.384822503279254, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8712365031242371, + "step": 840 + }, + { + "epoch": 0.4205, + "grad_norm": 2.6110623738475955, + "learning_rate": 2.1025e-06, + "loss": 0.395, + "mean_token_accuracy": 0.874142587184906, + "step": 841 + }, + { + "epoch": 0.421, + "grad_norm": 26.9995543510326, + "learning_rate": 2.105e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8517467975616455, + "step": 842 + }, + { + "epoch": 0.4215, + "grad_norm": 2.4378286740175277, + "learning_rate": 2.1075e-06, + "loss": 0.5172, + "mean_token_accuracy": 0.8332382440567017, + "step": 843 + }, + { + "epoch": 0.422, + "grad_norm": 3.7170469644693687, + "learning_rate": 2.11e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.861102283000946, + "step": 844 + }, + { + "epoch": 0.4225, + "grad_norm": 2.5268405141854986, + "learning_rate": 2.1125e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8556343913078308, + "step": 845 + }, + { + "epoch": 0.423, + "grad_norm": 3.269997456252647, + "learning_rate": 2.115e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8762641549110413, + "step": 846 + }, + { + "epoch": 0.4235, + "grad_norm": 4.3541027449593805, + "learning_rate": 2.1175e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8868200778961182, + "step": 847 + }, + { + "epoch": 0.424, + "grad_norm": 2.743568050925779, + "learning_rate": 2.12e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8819979429244995, + "step": 848 + }, + { + "epoch": 0.4245, + "grad_norm": 85.91713411066046, + "learning_rate": 2.1225e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8662146925926208, + "step": 849 + }, + { + "epoch": 0.425, + "grad_norm": 3.8542374894809663, + "learning_rate": 2.125e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8786771893501282, + "step": 850 + }, + { + "epoch": 0.4255, + "grad_norm": 5.582346405336546, + "learning_rate": 2.1275e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8638556599617004, + "step": 851 + }, + { + "epoch": 0.426, + "grad_norm": 4.434684195197404, + "learning_rate": 2.13e-06, + "loss": 0.5971, + "mean_token_accuracy": 0.8350931406021118, + "step": 852 + }, + { + "epoch": 0.4265, + "grad_norm": 7.637792634027671, + "learning_rate": 2.1325000000000003e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8973350524902344, + "step": 853 + }, + { + "epoch": 0.427, + "grad_norm": 2.5468899010654265, + "learning_rate": 2.1350000000000003e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8895924687385559, + "step": 854 + }, + { + "epoch": 0.4275, + "grad_norm": 2.2276378828700114, + "learning_rate": 2.1375000000000003e-06, + "loss": 0.2857, + "mean_token_accuracy": 0.9107702374458313, + "step": 855 + }, + { + "epoch": 0.428, + "grad_norm": 3.4523345469171525, + "learning_rate": 2.1400000000000003e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8808359503746033, + "step": 856 + }, + { + "epoch": 0.4285, + "grad_norm": 3.956811360098673, + "learning_rate": 2.1425000000000002e-06, + "loss": 0.2872, + "mean_token_accuracy": 0.9086740016937256, + "step": 857 + }, + { + "epoch": 0.429, + "grad_norm": 3.3519207293569044, + "learning_rate": 2.1450000000000002e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8715375065803528, + "step": 858 + }, + { + "epoch": 0.4295, + "grad_norm": 3.563596958984784, + "learning_rate": 2.1475e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8498912453651428, + "step": 859 + }, + { + "epoch": 0.43, + "grad_norm": 3.646176641847391, + "learning_rate": 2.15e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8590518832206726, + "step": 860 + }, + { + "epoch": 0.4305, + "grad_norm": 3.274098789877053, + "learning_rate": 2.1525e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8643699288368225, + "step": 861 + }, + { + "epoch": 0.431, + "grad_norm": 5.92583052843577, + "learning_rate": 2.155e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.880878746509552, + "step": 862 + }, + { + "epoch": 0.4315, + "grad_norm": 2.2503284968082906, + "learning_rate": 2.1575e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8928506970405579, + "step": 863 + }, + { + "epoch": 0.432, + "grad_norm": 3.2954865324035767, + "learning_rate": 2.16e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8547914028167725, + "step": 864 + }, + { + "epoch": 0.4325, + "grad_norm": 6.64091724335779, + "learning_rate": 2.1625e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8653053641319275, + "step": 865 + }, + { + "epoch": 0.433, + "grad_norm": 3.7909519097081703, + "learning_rate": 2.165e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8748453259468079, + "step": 866 + }, + { + "epoch": 0.4335, + "grad_norm": 2.5499390668782875, + "learning_rate": 2.1675e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8460000157356262, + "step": 867 + }, + { + "epoch": 0.434, + "grad_norm": 3.386396040965625, + "learning_rate": 2.17e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8749226927757263, + "step": 868 + }, + { + "epoch": 0.4345, + "grad_norm": 2.4834327365123006, + "learning_rate": 2.1725000000000004e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8717465996742249, + "step": 869 + }, + { + "epoch": 0.435, + "grad_norm": 5.04095901075201, + "learning_rate": 2.1750000000000004e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8933892846107483, + "step": 870 + }, + { + "epoch": 0.4355, + "grad_norm": 2.203870283067697, + "learning_rate": 2.1775000000000003e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.8345926403999329, + "step": 871 + }, + { + "epoch": 0.436, + "grad_norm": 3.7977068574553545, + "learning_rate": 2.1800000000000003e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8646872043609619, + "step": 872 + }, + { + "epoch": 0.4365, + "grad_norm": 3.3684490768418325, + "learning_rate": 2.1825000000000003e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8520172834396362, + "step": 873 + }, + { + "epoch": 0.437, + "grad_norm": 2.2847668320692973, + "learning_rate": 2.1850000000000003e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8583921194076538, + "step": 874 + }, + { + "epoch": 0.4375, + "grad_norm": 7.587759743283465, + "learning_rate": 2.1875000000000002e-06, + "loss": 0.6264, + "mean_token_accuracy": 0.8235689997673035, + "step": 875 + }, + { + "epoch": 0.438, + "grad_norm": 3.8974007453917006, + "learning_rate": 2.19e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8643820881843567, + "step": 876 + }, + { + "epoch": 0.4385, + "grad_norm": 4.55351855652277, + "learning_rate": 2.1925e-06, + "loss": 0.6635, + "mean_token_accuracy": 0.8109395503997803, + "step": 877 + }, + { + "epoch": 0.439, + "grad_norm": 2.3271868662854414, + "learning_rate": 2.195e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8544647693634033, + "step": 878 + }, + { + "epoch": 0.4395, + "grad_norm": 3.771328585935941, + "learning_rate": 2.1975e-06, + "loss": 0.5647, + "mean_token_accuracy": 0.8486545085906982, + "step": 879 + }, + { + "epoch": 0.44, + "grad_norm": 2.7277349985005195, + "learning_rate": 2.2e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8593510985374451, + "step": 880 + }, + { + "epoch": 0.4405, + "grad_norm": 2.719277636948065, + "learning_rate": 2.2025e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.86495441198349, + "step": 881 + }, + { + "epoch": 0.441, + "grad_norm": 3.5658885990455853, + "learning_rate": 2.205e-06, + "loss": 0.5285, + "mean_token_accuracy": 0.8436918258666992, + "step": 882 + }, + { + "epoch": 0.4415, + "grad_norm": 2.626725025586801, + "learning_rate": 2.2075e-06, + "loss": 0.5347, + "mean_token_accuracy": 0.8461010456085205, + "step": 883 + }, + { + "epoch": 0.442, + "grad_norm": 4.1660977984900045, + "learning_rate": 2.21e-06, + "loss": 0.5361, + "mean_token_accuracy": 0.8464052081108093, + "step": 884 + }, + { + "epoch": 0.4425, + "grad_norm": 2.8306293396543953, + "learning_rate": 2.2125e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8583683371543884, + "step": 885 + }, + { + "epoch": 0.443, + "grad_norm": 5.86617701611566, + "learning_rate": 2.2150000000000004e-06, + "loss": 0.5318, + "mean_token_accuracy": 0.849391758441925, + "step": 886 + }, + { + "epoch": 0.4435, + "grad_norm": 3.4903233738830832, + "learning_rate": 2.2175000000000004e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8731057047843933, + "step": 887 + }, + { + "epoch": 0.444, + "grad_norm": 2.051898844942643, + "learning_rate": 2.2200000000000003e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8901062607765198, + "step": 888 + }, + { + "epoch": 0.4445, + "grad_norm": 3.0003836326682394, + "learning_rate": 2.2225000000000003e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8626912236213684, + "step": 889 + }, + { + "epoch": 0.445, + "grad_norm": 2.5314701444254495, + "learning_rate": 2.2250000000000003e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8498536348342896, + "step": 890 + }, + { + "epoch": 0.4455, + "grad_norm": 2.6802407820379384, + "learning_rate": 2.2275000000000003e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8953194618225098, + "step": 891 + }, + { + "epoch": 0.446, + "grad_norm": 56.17083943895622, + "learning_rate": 2.2300000000000002e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8898440599441528, + "step": 892 + }, + { + "epoch": 0.4465, + "grad_norm": 3.3242649424980124, + "learning_rate": 2.2325000000000002e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8860092759132385, + "step": 893 + }, + { + "epoch": 0.447, + "grad_norm": 3.7015509058353033, + "learning_rate": 2.235e-06, + "loss": 0.5695, + "mean_token_accuracy": 0.8323844075202942, + "step": 894 + }, + { + "epoch": 0.4475, + "grad_norm": 4.814399335613338, + "learning_rate": 2.2375e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8977493047714233, + "step": 895 + }, + { + "epoch": 0.448, + "grad_norm": 3.520642519210208, + "learning_rate": 2.24e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8749594688415527, + "step": 896 + }, + { + "epoch": 0.4485, + "grad_norm": 2.798435392966591, + "learning_rate": 2.2425e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8674381971359253, + "step": 897 + }, + { + "epoch": 0.449, + "grad_norm": 2.8233004857596677, + "learning_rate": 2.245e-06, + "loss": 0.5525, + "mean_token_accuracy": 0.8442431092262268, + "step": 898 + }, + { + "epoch": 0.4495, + "grad_norm": 3.463871675335283, + "learning_rate": 2.2475e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8352478742599487, + "step": 899 + }, + { + "epoch": 0.45, + "grad_norm": 3.3954550571861173, + "learning_rate": 2.25e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.9003984332084656, + "step": 900 + }, + { + "epoch": 0.4505, + "grad_norm": 1.9493894929527078, + "learning_rate": 2.2525e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8989473581314087, + "step": 901 + }, + { + "epoch": 0.451, + "grad_norm": 4.1031298971901045, + "learning_rate": 2.2550000000000004e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8583717942237854, + "step": 902 + }, + { + "epoch": 0.4515, + "grad_norm": 3.75906139072641, + "learning_rate": 2.2575000000000004e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.856923520565033, + "step": 903 + }, + { + "epoch": 0.452, + "grad_norm": 2.4496848997353737, + "learning_rate": 2.2600000000000004e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8886111974716187, + "step": 904 + }, + { + "epoch": 0.4525, + "grad_norm": 5.175908394712735, + "learning_rate": 2.2625000000000004e-06, + "loss": 0.577, + "mean_token_accuracy": 0.8268546462059021, + "step": 905 + }, + { + "epoch": 0.453, + "grad_norm": 2.417757576837369, + "learning_rate": 2.2650000000000003e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.8433964848518372, + "step": 906 + }, + { + "epoch": 0.4535, + "grad_norm": 2.660315768296687, + "learning_rate": 2.2675000000000003e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8926344513893127, + "step": 907 + }, + { + "epoch": 0.454, + "grad_norm": 2.485029127175605, + "learning_rate": 2.2700000000000003e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8620856404304504, + "step": 908 + }, + { + "epoch": 0.4545, + "grad_norm": 3.0988220630700267, + "learning_rate": 2.2725000000000003e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8659571409225464, + "step": 909 + }, + { + "epoch": 0.455, + "grad_norm": 3.332091677933116, + "learning_rate": 2.2750000000000002e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8905482292175293, + "step": 910 + }, + { + "epoch": 0.4555, + "grad_norm": 20.595372270947017, + "learning_rate": 2.2775000000000002e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8902236223220825, + "step": 911 + }, + { + "epoch": 0.456, + "grad_norm": 2.679502340095463, + "learning_rate": 2.28e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8802757859230042, + "step": 912 + }, + { + "epoch": 0.4565, + "grad_norm": 2.551865606058781, + "learning_rate": 2.2825e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8968083262443542, + "step": 913 + }, + { + "epoch": 0.457, + "grad_norm": 2.1306000074988525, + "learning_rate": 2.285e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8685452342033386, + "step": 914 + }, + { + "epoch": 0.4575, + "grad_norm": 2.9786205271112816, + "learning_rate": 2.2875e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8854835629463196, + "step": 915 + }, + { + "epoch": 0.458, + "grad_norm": 4.070452699724559, + "learning_rate": 2.29e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8849190473556519, + "step": 916 + }, + { + "epoch": 0.4585, + "grad_norm": 3.084148803942349, + "learning_rate": 2.2925e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8575414419174194, + "step": 917 + }, + { + "epoch": 0.459, + "grad_norm": 2.1206944504572904, + "learning_rate": 2.2950000000000005e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8733302354812622, + "step": 918 + }, + { + "epoch": 0.4595, + "grad_norm": 5.661709589563985, + "learning_rate": 2.2975000000000004e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8721708655357361, + "step": 919 + }, + { + "epoch": 0.46, + "grad_norm": 2.6079153655633815, + "learning_rate": 2.3000000000000004e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8710716962814331, + "step": 920 + }, + { + "epoch": 0.4605, + "grad_norm": 2.647681579540982, + "learning_rate": 2.3025000000000004e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8519295454025269, + "step": 921 + }, + { + "epoch": 0.461, + "grad_norm": 2.680753907102449, + "learning_rate": 2.3050000000000004e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8707170486450195, + "step": 922 + }, + { + "epoch": 0.4615, + "grad_norm": 2.3927743704601574, + "learning_rate": 2.3075000000000004e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8659264445304871, + "step": 923 + }, + { + "epoch": 0.462, + "grad_norm": 2.960158795741286, + "learning_rate": 2.3100000000000003e-06, + "loss": 0.285, + "mean_token_accuracy": 0.9037178158760071, + "step": 924 + }, + { + "epoch": 0.4625, + "grad_norm": 3.0919551323579464, + "learning_rate": 2.3125000000000003e-06, + "loss": 0.5386, + "mean_token_accuracy": 0.8587077856063843, + "step": 925 + }, + { + "epoch": 0.463, + "grad_norm": 15.53217063779084, + "learning_rate": 2.3150000000000003e-06, + "loss": 0.5812, + "mean_token_accuracy": 0.8221664428710938, + "step": 926 + }, + { + "epoch": 0.4635, + "grad_norm": 2.351735560758683, + "learning_rate": 2.3175000000000003e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.902695894241333, + "step": 927 + }, + { + "epoch": 0.464, + "grad_norm": 66.86794405357209, + "learning_rate": 2.3200000000000002e-06, + "loss": 0.574, + "mean_token_accuracy": 0.8363152742385864, + "step": 928 + }, + { + "epoch": 0.4645, + "grad_norm": 2.4203043735825225, + "learning_rate": 2.3225e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8535515666007996, + "step": 929 + }, + { + "epoch": 0.465, + "grad_norm": 4.036879945787897, + "learning_rate": 2.325e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8851545453071594, + "step": 930 + }, + { + "epoch": 0.4655, + "grad_norm": 13.381104246810862, + "learning_rate": 2.3275e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8652840256690979, + "step": 931 + }, + { + "epoch": 0.466, + "grad_norm": 2.1648995286574206, + "learning_rate": 2.33e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8746182322502136, + "step": 932 + }, + { + "epoch": 0.4665, + "grad_norm": 2.145299931074882, + "learning_rate": 2.3325e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8849185705184937, + "step": 933 + }, + { + "epoch": 0.467, + "grad_norm": 8.621489458746538, + "learning_rate": 2.3350000000000005e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8826027512550354, + "step": 934 + }, + { + "epoch": 0.4675, + "grad_norm": 3.497400546828144, + "learning_rate": 2.3375000000000005e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8829787373542786, + "step": 935 + }, + { + "epoch": 0.468, + "grad_norm": 2.2311951476880725, + "learning_rate": 2.3400000000000005e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8580330014228821, + "step": 936 + }, + { + "epoch": 0.4685, + "grad_norm": 3.0840033844986, + "learning_rate": 2.3425000000000004e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8799677491188049, + "step": 937 + }, + { + "epoch": 0.469, + "grad_norm": 3.870398443347673, + "learning_rate": 2.345e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8614775538444519, + "step": 938 + }, + { + "epoch": 0.4695, + "grad_norm": 2.360399658784844, + "learning_rate": 2.3475e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8711888790130615, + "step": 939 + }, + { + "epoch": 0.47, + "grad_norm": 3.976139842555636, + "learning_rate": 2.35e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8503860831260681, + "step": 940 + }, + { + "epoch": 0.4705, + "grad_norm": 2.5718292246421455, + "learning_rate": 2.3525e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8524162173271179, + "step": 941 + }, + { + "epoch": 0.471, + "grad_norm": 2.7284945936097706, + "learning_rate": 2.355e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8824135065078735, + "step": 942 + }, + { + "epoch": 0.4715, + "grad_norm": 2.1134484140378373, + "learning_rate": 2.3575e-06, + "loss": 0.3081, + "mean_token_accuracy": 0.894884467124939, + "step": 943 + }, + { + "epoch": 0.472, + "grad_norm": 2.56184630973786, + "learning_rate": 2.3600000000000003e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8841453194618225, + "step": 944 + }, + { + "epoch": 0.4725, + "grad_norm": 3.3750613908533955, + "learning_rate": 2.3625000000000003e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8697868585586548, + "step": 945 + }, + { + "epoch": 0.473, + "grad_norm": 3.1628909454715184, + "learning_rate": 2.3650000000000002e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8566505908966064, + "step": 946 + }, + { + "epoch": 0.4735, + "grad_norm": 2.352887626774322, + "learning_rate": 2.3675e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8983148336410522, + "step": 947 + }, + { + "epoch": 0.474, + "grad_norm": 5.522981484532449, + "learning_rate": 2.37e-06, + "loss": 0.5684, + "mean_token_accuracy": 0.823798656463623, + "step": 948 + }, + { + "epoch": 0.4745, + "grad_norm": 2.864986832529383, + "learning_rate": 2.3725e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.849277138710022, + "step": 949 + }, + { + "epoch": 0.475, + "grad_norm": 2.1367717772755386, + "learning_rate": 2.375e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8815915584564209, + "step": 950 + }, + { + "epoch": 0.4755, + "grad_norm": 2.9818685731638395, + "learning_rate": 2.3775e-06, + "loss": 0.447, + "mean_token_accuracy": 0.869965672492981, + "step": 951 + }, + { + "epoch": 0.476, + "grad_norm": 3.3174490709315165, + "learning_rate": 2.38e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8732098340988159, + "step": 952 + }, + { + "epoch": 0.4765, + "grad_norm": 21.212498712096405, + "learning_rate": 2.3825e-06, + "loss": 0.2701, + "mean_token_accuracy": 0.9204727411270142, + "step": 953 + }, + { + "epoch": 0.477, + "grad_norm": 5.787100710561361, + "learning_rate": 2.385e-06, + "loss": 0.2746, + "mean_token_accuracy": 0.903322696685791, + "step": 954 + }, + { + "epoch": 0.4775, + "grad_norm": 2.6979074763103856, + "learning_rate": 2.3875e-06, + "loss": 0.6577, + "mean_token_accuracy": 0.8099491596221924, + "step": 955 + }, + { + "epoch": 0.478, + "grad_norm": 2.613544433972111, + "learning_rate": 2.39e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.878463089466095, + "step": 956 + }, + { + "epoch": 0.4785, + "grad_norm": 2.963944644799704, + "learning_rate": 2.3925e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8691366910934448, + "step": 957 + }, + { + "epoch": 0.479, + "grad_norm": 1.8090272964006646, + "learning_rate": 2.395e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8813463449478149, + "step": 958 + }, + { + "epoch": 0.4795, + "grad_norm": 9.210788120614335, + "learning_rate": 2.3975e-06, + "loss": 0.5487, + "mean_token_accuracy": 0.8412024974822998, + "step": 959 + }, + { + "epoch": 0.48, + "grad_norm": 2.2987121067644893, + "learning_rate": 2.4000000000000003e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8757103681564331, + "step": 960 + }, + { + "epoch": 0.4805, + "grad_norm": 2.511761810102173, + "learning_rate": 2.4025000000000003e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8763478398323059, + "step": 961 + }, + { + "epoch": 0.481, + "grad_norm": 4.9901366101216444, + "learning_rate": 2.4050000000000003e-06, + "loss": 0.5287, + "mean_token_accuracy": 0.8628488183021545, + "step": 962 + }, + { + "epoch": 0.4815, + "grad_norm": 2.187453700736623, + "learning_rate": 2.4075000000000002e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8501180410385132, + "step": 963 + }, + { + "epoch": 0.482, + "grad_norm": 2.4617931858556137, + "learning_rate": 2.4100000000000002e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8724767565727234, + "step": 964 + }, + { + "epoch": 0.4825, + "grad_norm": 4.087288025116215, + "learning_rate": 2.4125e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8904600143432617, + "step": 965 + }, + { + "epoch": 0.483, + "grad_norm": 3.7807605165768527, + "learning_rate": 2.415e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8776735067367554, + "step": 966 + }, + { + "epoch": 0.4835, + "grad_norm": 2.921886120975433, + "learning_rate": 2.4175e-06, + "loss": 0.3073, + "mean_token_accuracy": 0.9009882807731628, + "step": 967 + }, + { + "epoch": 0.484, + "grad_norm": 3.8989743247084885, + "learning_rate": 2.42e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.877310574054718, + "step": 968 + }, + { + "epoch": 0.4845, + "grad_norm": 2.6727466443791763, + "learning_rate": 2.4225e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8524196743965149, + "step": 969 + }, + { + "epoch": 0.485, + "grad_norm": 4.026325794166499, + "learning_rate": 2.425e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8955166339874268, + "step": 970 + }, + { + "epoch": 0.4855, + "grad_norm": 2.798872886518778, + "learning_rate": 2.4275e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.8392939567565918, + "step": 971 + }, + { + "epoch": 0.486, + "grad_norm": 10.924392118793213, + "learning_rate": 2.43e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8559064865112305, + "step": 972 + }, + { + "epoch": 0.4865, + "grad_norm": 6.256817069810683, + "learning_rate": 2.4325e-06, + "loss": 0.5466, + "mean_token_accuracy": 0.8275598883628845, + "step": 973 + }, + { + "epoch": 0.487, + "grad_norm": 5.407540483862201, + "learning_rate": 2.435e-06, + "loss": 0.2952, + "mean_token_accuracy": 0.8934911489486694, + "step": 974 + }, + { + "epoch": 0.4875, + "grad_norm": 3.4682723230273065, + "learning_rate": 2.4375e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8905075192451477, + "step": 975 + }, + { + "epoch": 0.488, + "grad_norm": 2.2198792285234465, + "learning_rate": 2.4400000000000004e-06, + "loss": 0.342, + "mean_token_accuracy": 0.890861988067627, + "step": 976 + }, + { + "epoch": 0.4885, + "grad_norm": 3.4870447490041476, + "learning_rate": 2.4425000000000003e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8576675653457642, + "step": 977 + }, + { + "epoch": 0.489, + "grad_norm": 9.227112624181778, + "learning_rate": 2.4450000000000003e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8633167743682861, + "step": 978 + }, + { + "epoch": 0.4895, + "grad_norm": 1.9285723979157694, + "learning_rate": 2.4475000000000003e-06, + "loss": 0.2985, + "mean_token_accuracy": 0.898037850856781, + "step": 979 + }, + { + "epoch": 0.49, + "grad_norm": 2.4248860150971807, + "learning_rate": 2.4500000000000003e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8855207562446594, + "step": 980 + }, + { + "epoch": 0.4905, + "grad_norm": 2.5643659683475186, + "learning_rate": 2.4525000000000002e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8641017079353333, + "step": 981 + }, + { + "epoch": 0.491, + "grad_norm": 2.2953747968961746, + "learning_rate": 2.4550000000000002e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8878116607666016, + "step": 982 + }, + { + "epoch": 0.4915, + "grad_norm": 3.0299745252131305, + "learning_rate": 2.4575e-06, + "loss": 0.5611, + "mean_token_accuracy": 0.8384099006652832, + "step": 983 + }, + { + "epoch": 0.492, + "grad_norm": 3.634645787014504, + "learning_rate": 2.46e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8952417373657227, + "step": 984 + }, + { + "epoch": 0.4925, + "grad_norm": 30.09161017145367, + "learning_rate": 2.4625e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.886145830154419, + "step": 985 + }, + { + "epoch": 0.493, + "grad_norm": 4.006132528443792, + "learning_rate": 2.465e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8676987886428833, + "step": 986 + }, + { + "epoch": 0.4935, + "grad_norm": 4.3673240846487635, + "learning_rate": 2.4675e-06, + "loss": 0.6603, + "mean_token_accuracy": 0.8148592114448547, + "step": 987 + }, + { + "epoch": 0.494, + "grad_norm": 3.487303765935392, + "learning_rate": 2.47e-06, + "loss": 0.5393, + "mean_token_accuracy": 0.8466135263442993, + "step": 988 + }, + { + "epoch": 0.4945, + "grad_norm": 4.622526059235117, + "learning_rate": 2.4725e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8581092953681946, + "step": 989 + }, + { + "epoch": 0.495, + "grad_norm": 1.9356258490040417, + "learning_rate": 2.475e-06, + "loss": 0.3054, + "mean_token_accuracy": 0.8992805480957031, + "step": 990 + }, + { + "epoch": 0.4955, + "grad_norm": 3.7379133425546867, + "learning_rate": 2.4775e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.874374270439148, + "step": 991 + }, + { + "epoch": 0.496, + "grad_norm": 3.040281596563202, + "learning_rate": 2.4800000000000004e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8974772095680237, + "step": 992 + }, + { + "epoch": 0.4965, + "grad_norm": 2.460670160728881, + "learning_rate": 2.4825000000000004e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8687071800231934, + "step": 993 + }, + { + "epoch": 0.497, + "grad_norm": 2.9633128144700125, + "learning_rate": 2.4850000000000003e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8625109791755676, + "step": 994 + }, + { + "epoch": 0.4975, + "grad_norm": 3.080126416082851, + "learning_rate": 2.4875000000000003e-06, + "loss": 0.5965, + "mean_token_accuracy": 0.8080313205718994, + "step": 995 + }, + { + "epoch": 0.498, + "grad_norm": 3.036433736645473, + "learning_rate": 2.4900000000000003e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.882533609867096, + "step": 996 + }, + { + "epoch": 0.4985, + "grad_norm": 3.0186052406946766, + "learning_rate": 2.4925000000000003e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.861401379108429, + "step": 997 + }, + { + "epoch": 0.499, + "grad_norm": 2.3641297581174956, + "learning_rate": 2.4950000000000003e-06, + "loss": 0.422, + "mean_token_accuracy": 0.868742823600769, + "step": 998 + }, + { + "epoch": 0.4995, + "grad_norm": 2.304549415252266, + "learning_rate": 2.4975000000000002e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8934397101402283, + "step": 999 + }, + { + "epoch": 0.5, + "grad_norm": 2.1731262904379864, + "learning_rate": 2.5e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8827179074287415, + "step": 1000 + }, + { + "epoch": 0.5005, + "grad_norm": 5.588843171800428, + "learning_rate": 2.5024999999999998e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8947787880897522, + "step": 1001 + }, + { + "epoch": 0.501, + "grad_norm": 3.5090853789057657, + "learning_rate": 2.505e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8543434143066406, + "step": 1002 + }, + { + "epoch": 0.5015, + "grad_norm": 6.847049615046502, + "learning_rate": 2.5075e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8795694708824158, + "step": 1003 + }, + { + "epoch": 0.502, + "grad_norm": 3.738093269969132, + "learning_rate": 2.51e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.880420982837677, + "step": 1004 + }, + { + "epoch": 0.5025, + "grad_norm": 6.768373181064739, + "learning_rate": 2.5125e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8732008337974548, + "step": 1005 + }, + { + "epoch": 0.503, + "grad_norm": 5.53328838558432, + "learning_rate": 2.515e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8778150677680969, + "step": 1006 + }, + { + "epoch": 0.5035, + "grad_norm": 2.395458636440132, + "learning_rate": 2.5175e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8634029030799866, + "step": 1007 + }, + { + "epoch": 0.504, + "grad_norm": 4.618157664183284, + "learning_rate": 2.52e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8945346474647522, + "step": 1008 + }, + { + "epoch": 0.5045, + "grad_norm": 2.4961114779272706, + "learning_rate": 2.5225e-06, + "loss": 0.5445, + "mean_token_accuracy": 0.8443629145622253, + "step": 1009 + }, + { + "epoch": 0.505, + "grad_norm": 4.628016624600684, + "learning_rate": 2.5250000000000004e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.85498046875, + "step": 1010 + }, + { + "epoch": 0.5055, + "grad_norm": 2.5456807590314314, + "learning_rate": 2.5275e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8523997664451599, + "step": 1011 + }, + { + "epoch": 0.506, + "grad_norm": 3.7535351647014688, + "learning_rate": 2.5300000000000003e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8953824043273926, + "step": 1012 + }, + { + "epoch": 0.5065, + "grad_norm": 3.4679215797316925, + "learning_rate": 2.5325e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8577191233634949, + "step": 1013 + }, + { + "epoch": 0.507, + "grad_norm": 4.081353218346677, + "learning_rate": 2.5350000000000003e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8558591604232788, + "step": 1014 + }, + { + "epoch": 0.5075, + "grad_norm": 2.9754296978005454, + "learning_rate": 2.5375e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8561010360717773, + "step": 1015 + }, + { + "epoch": 0.508, + "grad_norm": 2.331697967252764, + "learning_rate": 2.5400000000000002e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8774986267089844, + "step": 1016 + }, + { + "epoch": 0.5085, + "grad_norm": 2.054139405740755, + "learning_rate": 2.5425e-06, + "loss": 0.5385, + "mean_token_accuracy": 0.8225033283233643, + "step": 1017 + }, + { + "epoch": 0.509, + "grad_norm": 3.1964802030467387, + "learning_rate": 2.545e-06, + "loss": 0.5402, + "mean_token_accuracy": 0.8478972911834717, + "step": 1018 + }, + { + "epoch": 0.5095, + "grad_norm": 4.918661333463484, + "learning_rate": 2.5475e-06, + "loss": 0.299, + "mean_token_accuracy": 0.9056137204170227, + "step": 1019 + }, + { + "epoch": 0.51, + "grad_norm": 3.1946115806009523, + "learning_rate": 2.55e-06, + "loss": 0.5758, + "mean_token_accuracy": 0.8222944140434265, + "step": 1020 + }, + { + "epoch": 0.5105, + "grad_norm": 3.687216888548191, + "learning_rate": 2.5525e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8755580186843872, + "step": 1021 + }, + { + "epoch": 0.511, + "grad_norm": 2.832389936365028, + "learning_rate": 2.555e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8990775942802429, + "step": 1022 + }, + { + "epoch": 0.5115, + "grad_norm": 2.781112657033419, + "learning_rate": 2.5575e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8660847544670105, + "step": 1023 + }, + { + "epoch": 0.512, + "grad_norm": 2.3226293205059, + "learning_rate": 2.56e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.9055622816085815, + "step": 1024 + }, + { + "epoch": 0.5125, + "grad_norm": 3.427871720224503, + "learning_rate": 2.5625e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8801090121269226, + "step": 1025 + }, + { + "epoch": 0.513, + "grad_norm": 2.5080474088793627, + "learning_rate": 2.5650000000000004e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8538917303085327, + "step": 1026 + }, + { + "epoch": 0.5135, + "grad_norm": 3.0992956684102486, + "learning_rate": 2.5675e-06, + "loss": 0.3136, + "mean_token_accuracy": 0.8983325362205505, + "step": 1027 + }, + { + "epoch": 0.514, + "grad_norm": 3.9796405679289992, + "learning_rate": 2.5700000000000004e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8800832629203796, + "step": 1028 + }, + { + "epoch": 0.5145, + "grad_norm": 2.857756655406505, + "learning_rate": 2.5725e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8927863240242004, + "step": 1029 + }, + { + "epoch": 0.515, + "grad_norm": 4.128235538100643, + "learning_rate": 2.5750000000000003e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.884509801864624, + "step": 1030 + }, + { + "epoch": 0.5155, + "grad_norm": 4.026699469013501, + "learning_rate": 2.5775e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8758683204650879, + "step": 1031 + }, + { + "epoch": 0.516, + "grad_norm": 2.5986872100283906, + "learning_rate": 2.5800000000000003e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8656056523323059, + "step": 1032 + }, + { + "epoch": 0.5165, + "grad_norm": 2.065091530994369, + "learning_rate": 2.5825e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8493255376815796, + "step": 1033 + }, + { + "epoch": 0.517, + "grad_norm": 3.4182308084991697, + "learning_rate": 2.5850000000000002e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8616822361946106, + "step": 1034 + }, + { + "epoch": 0.5175, + "grad_norm": 2.5661670807702275, + "learning_rate": 2.5875000000000002e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8834201693534851, + "step": 1035 + }, + { + "epoch": 0.518, + "grad_norm": 13.122893661938276, + "learning_rate": 2.59e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8667421936988831, + "step": 1036 + }, + { + "epoch": 0.5185, + "grad_norm": 2.7619166218228424, + "learning_rate": 2.5925e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8548490405082703, + "step": 1037 + }, + { + "epoch": 0.519, + "grad_norm": 4.181754869412563, + "learning_rate": 2.595e-06, + "loss": 0.5787, + "mean_token_accuracy": 0.8431427478790283, + "step": 1038 + }, + { + "epoch": 0.5195, + "grad_norm": 4.868462797811075, + "learning_rate": 2.5975e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8478260636329651, + "step": 1039 + }, + { + "epoch": 0.52, + "grad_norm": 2.5887471007030114, + "learning_rate": 2.6e-06, + "loss": 0.5617, + "mean_token_accuracy": 0.8399215340614319, + "step": 1040 + }, + { + "epoch": 0.5205, + "grad_norm": 2.264382748182166, + "learning_rate": 2.6025e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8371784687042236, + "step": 1041 + }, + { + "epoch": 0.521, + "grad_norm": 3.239792791938581, + "learning_rate": 2.6050000000000005e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8547440767288208, + "step": 1042 + }, + { + "epoch": 0.5215, + "grad_norm": 4.389258708896822, + "learning_rate": 2.6075e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8740711212158203, + "step": 1043 + }, + { + "epoch": 0.522, + "grad_norm": 25.991759122824906, + "learning_rate": 2.6100000000000004e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8867509365081787, + "step": 1044 + }, + { + "epoch": 0.5225, + "grad_norm": 2.6983648838588516, + "learning_rate": 2.6125e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.882591962814331, + "step": 1045 + }, + { + "epoch": 0.523, + "grad_norm": 4.359621109274665, + "learning_rate": 2.6150000000000004e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8678221106529236, + "step": 1046 + }, + { + "epoch": 0.5235, + "grad_norm": 3.574678967750403, + "learning_rate": 2.6175e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8879715204238892, + "step": 1047 + }, + { + "epoch": 0.524, + "grad_norm": 2.3184885986760233, + "learning_rate": 2.6200000000000003e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8553971648216248, + "step": 1048 + }, + { + "epoch": 0.5245, + "grad_norm": 2.904931011615207, + "learning_rate": 2.6225e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8803752660751343, + "step": 1049 + }, + { + "epoch": 0.525, + "grad_norm": 2.595689721998031, + "learning_rate": 2.6250000000000003e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8638368248939514, + "step": 1050 + }, + { + "epoch": 0.5255, + "grad_norm": 3.0163844382018183, + "learning_rate": 2.6275000000000003e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8745192289352417, + "step": 1051 + }, + { + "epoch": 0.526, + "grad_norm": 2.0825313390066817, + "learning_rate": 2.6300000000000002e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8670716285705566, + "step": 1052 + }, + { + "epoch": 0.5265, + "grad_norm": 2.6028814833567444, + "learning_rate": 2.6325e-06, + "loss": 0.5738, + "mean_token_accuracy": 0.8222612738609314, + "step": 1053 + }, + { + "epoch": 0.527, + "grad_norm": 3.4412467908570386, + "learning_rate": 2.635e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8792238235473633, + "step": 1054 + }, + { + "epoch": 0.5275, + "grad_norm": 7.309892740222198, + "learning_rate": 2.6375e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8894699215888977, + "step": 1055 + }, + { + "epoch": 0.528, + "grad_norm": 6.964393878869755, + "learning_rate": 2.64e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8823404908180237, + "step": 1056 + }, + { + "epoch": 0.5285, + "grad_norm": 3.0008977484593067, + "learning_rate": 2.6425e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8719351291656494, + "step": 1057 + }, + { + "epoch": 0.529, + "grad_norm": 2.5849574657282184, + "learning_rate": 2.6450000000000005e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.884271502494812, + "step": 1058 + }, + { + "epoch": 0.5295, + "grad_norm": 4.183443441018812, + "learning_rate": 2.6475e-06, + "loss": 0.5186, + "mean_token_accuracy": 0.8434256315231323, + "step": 1059 + }, + { + "epoch": 0.53, + "grad_norm": 5.801113247548145, + "learning_rate": 2.6500000000000005e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8820403814315796, + "step": 1060 + }, + { + "epoch": 0.5305, + "grad_norm": 5.412992790443362, + "learning_rate": 2.6525e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8760861158370972, + "step": 1061 + }, + { + "epoch": 0.531, + "grad_norm": 2.3957835822942424, + "learning_rate": 2.6550000000000004e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8658015727996826, + "step": 1062 + }, + { + "epoch": 0.5315, + "grad_norm": 2.1369969244820686, + "learning_rate": 2.6575e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8927609920501709, + "step": 1063 + }, + { + "epoch": 0.532, + "grad_norm": 2.243863692724379, + "learning_rate": 2.6600000000000004e-06, + "loss": 0.5646, + "mean_token_accuracy": 0.8152029514312744, + "step": 1064 + }, + { + "epoch": 0.5325, + "grad_norm": 2.4081526497119894, + "learning_rate": 2.6625e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.9026864767074585, + "step": 1065 + }, + { + "epoch": 0.533, + "grad_norm": 9.595332415400415, + "learning_rate": 2.6650000000000003e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8989335298538208, + "step": 1066 + }, + { + "epoch": 0.5335, + "grad_norm": 2.3181779571035737, + "learning_rate": 2.6675000000000003e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8655774593353271, + "step": 1067 + }, + { + "epoch": 0.534, + "grad_norm": 3.4828801005311227, + "learning_rate": 2.6700000000000003e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8542013168334961, + "step": 1068 + }, + { + "epoch": 0.5345, + "grad_norm": 3.0028189308252333, + "learning_rate": 2.6725000000000002e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8684831857681274, + "step": 1069 + }, + { + "epoch": 0.535, + "grad_norm": 2.032677853801547, + "learning_rate": 2.6750000000000002e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8747877478599548, + "step": 1070 + }, + { + "epoch": 0.5355, + "grad_norm": 3.1470300521567127, + "learning_rate": 2.6775e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8633437156677246, + "step": 1071 + }, + { + "epoch": 0.536, + "grad_norm": 2.034375571612017, + "learning_rate": 2.68e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8715742230415344, + "step": 1072 + }, + { + "epoch": 0.5365, + "grad_norm": 3.5794536630937563, + "learning_rate": 2.6825e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.889502763748169, + "step": 1073 + }, + { + "epoch": 0.537, + "grad_norm": 3.1988588617998914, + "learning_rate": 2.6850000000000006e-06, + "loss": 0.5653, + "mean_token_accuracy": 0.8348568081855774, + "step": 1074 + }, + { + "epoch": 0.5375, + "grad_norm": 2.5482905638777864, + "learning_rate": 2.6875e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8575494289398193, + "step": 1075 + }, + { + "epoch": 0.538, + "grad_norm": 3.2042581249509983, + "learning_rate": 2.6900000000000005e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8703829646110535, + "step": 1076 + }, + { + "epoch": 0.5385, + "grad_norm": 3.175458106190966, + "learning_rate": 2.6925e-06, + "loss": 0.5496, + "mean_token_accuracy": 0.8298944234848022, + "step": 1077 + }, + { + "epoch": 0.539, + "grad_norm": 3.856643650875459, + "learning_rate": 2.6950000000000005e-06, + "loss": 0.5247, + "mean_token_accuracy": 0.8431791663169861, + "step": 1078 + }, + { + "epoch": 0.5395, + "grad_norm": 2.2482720265611933, + "learning_rate": 2.6975e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8171148896217346, + "step": 1079 + }, + { + "epoch": 0.54, + "grad_norm": 2.4910315169856267, + "learning_rate": 2.7000000000000004e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8683152198791504, + "step": 1080 + }, + { + "epoch": 0.5405, + "grad_norm": 3.259850069109779, + "learning_rate": 2.7025e-06, + "loss": 0.5457, + "mean_token_accuracy": 0.8528279066085815, + "step": 1081 + }, + { + "epoch": 0.541, + "grad_norm": 11.969653622469089, + "learning_rate": 2.7050000000000004e-06, + "loss": 0.5888, + "mean_token_accuracy": 0.8097003698348999, + "step": 1082 + }, + { + "epoch": 0.5415, + "grad_norm": 2.292832590756232, + "learning_rate": 2.7075000000000003e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.897955596446991, + "step": 1083 + }, + { + "epoch": 0.542, + "grad_norm": 2.427300115237803, + "learning_rate": 2.7100000000000003e-06, + "loss": 0.297, + "mean_token_accuracy": 0.8993862867355347, + "step": 1084 + }, + { + "epoch": 0.5425, + "grad_norm": 17.498710601244234, + "learning_rate": 2.7125000000000003e-06, + "loss": 0.593, + "mean_token_accuracy": 0.8348739743232727, + "step": 1085 + }, + { + "epoch": 0.543, + "grad_norm": 2.11005473067083, + "learning_rate": 2.7150000000000003e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8585031628608704, + "step": 1086 + }, + { + "epoch": 0.5435, + "grad_norm": 2.195697521931695, + "learning_rate": 2.7175000000000002e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8454964756965637, + "step": 1087 + }, + { + "epoch": 0.544, + "grad_norm": 3.4437991540130595, + "learning_rate": 2.7200000000000002e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8843950033187866, + "step": 1088 + }, + { + "epoch": 0.5445, + "grad_norm": 2.3013078001910805, + "learning_rate": 2.7225e-06, + "loss": 0.5528, + "mean_token_accuracy": 0.8361790776252747, + "step": 1089 + }, + { + "epoch": 0.545, + "grad_norm": 4.181936658245915, + "learning_rate": 2.7250000000000006e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8790273666381836, + "step": 1090 + }, + { + "epoch": 0.5455, + "grad_norm": 3.948860209037969, + "learning_rate": 2.7275e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8690616488456726, + "step": 1091 + }, + { + "epoch": 0.546, + "grad_norm": 2.267037508838201, + "learning_rate": 2.7300000000000005e-06, + "loss": 0.2547, + "mean_token_accuracy": 0.9150500297546387, + "step": 1092 + }, + { + "epoch": 0.5465, + "grad_norm": 3.0866418836622858, + "learning_rate": 2.7325e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8517650961875916, + "step": 1093 + }, + { + "epoch": 0.547, + "grad_norm": 5.620406315786091, + "learning_rate": 2.7350000000000005e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8762604594230652, + "step": 1094 + }, + { + "epoch": 0.5475, + "grad_norm": 2.3892902735207127, + "learning_rate": 2.7375e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8932533264160156, + "step": 1095 + }, + { + "epoch": 0.548, + "grad_norm": 2.5069534484391287, + "learning_rate": 2.7400000000000004e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8687773942947388, + "step": 1096 + }, + { + "epoch": 0.5485, + "grad_norm": 4.648708340258333, + "learning_rate": 2.7425e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8941110968589783, + "step": 1097 + }, + { + "epoch": 0.549, + "grad_norm": 3.3306264940029506, + "learning_rate": 2.7450000000000004e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8274884223937988, + "step": 1098 + }, + { + "epoch": 0.5495, + "grad_norm": 4.037383027851751, + "learning_rate": 2.7475000000000004e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8741329908370972, + "step": 1099 + }, + { + "epoch": 0.55, + "grad_norm": 15.998090779110106, + "learning_rate": 2.7500000000000004e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8747712969779968, + "step": 1100 + }, + { + "epoch": 0.5505, + "grad_norm": 4.219671659533045, + "learning_rate": 2.7525000000000003e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8763149380683899, + "step": 1101 + }, + { + "epoch": 0.551, + "grad_norm": 2.311290436054196, + "learning_rate": 2.7550000000000003e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8719723224639893, + "step": 1102 + }, + { + "epoch": 0.5515, + "grad_norm": 2.292924424460769, + "learning_rate": 2.7575000000000003e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8965880274772644, + "step": 1103 + }, + { + "epoch": 0.552, + "grad_norm": 5.88001049306057, + "learning_rate": 2.7600000000000003e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8871881365776062, + "step": 1104 + }, + { + "epoch": 0.5525, + "grad_norm": 9.440121336440193, + "learning_rate": 2.7625000000000002e-06, + "loss": 0.5479, + "mean_token_accuracy": 0.8477006554603577, + "step": 1105 + }, + { + "epoch": 0.553, + "grad_norm": 2.6082898040359566, + "learning_rate": 2.7650000000000006e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8732134699821472, + "step": 1106 + }, + { + "epoch": 0.5535, + "grad_norm": 2.800010661971948, + "learning_rate": 2.7675e-06, + "loss": 0.5305, + "mean_token_accuracy": 0.8448584079742432, + "step": 1107 + }, + { + "epoch": 0.554, + "grad_norm": 3.210343005388996, + "learning_rate": 2.7700000000000006e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8594082593917847, + "step": 1108 + }, + { + "epoch": 0.5545, + "grad_norm": 1.8219941185867052, + "learning_rate": 2.7725e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8722895383834839, + "step": 1109 + }, + { + "epoch": 0.555, + "grad_norm": 2.527825988427846, + "learning_rate": 2.7750000000000005e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.8439351320266724, + "step": 1110 + }, + { + "epoch": 0.5555, + "grad_norm": 2.584896121954782, + "learning_rate": 2.7775e-06, + "loss": 0.2787, + "mean_token_accuracy": 0.9030700325965881, + "step": 1111 + }, + { + "epoch": 0.556, + "grad_norm": 5.754165219777672, + "learning_rate": 2.7800000000000005e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8918783664703369, + "step": 1112 + }, + { + "epoch": 0.5565, + "grad_norm": 2.9921463317083354, + "learning_rate": 2.7825e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8825881481170654, + "step": 1113 + }, + { + "epoch": 0.557, + "grad_norm": 5.645896731589971, + "learning_rate": 2.7850000000000004e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8915110230445862, + "step": 1114 + }, + { + "epoch": 0.5575, + "grad_norm": 3.2256760612334863, + "learning_rate": 2.7875000000000004e-06, + "loss": 0.5962, + "mean_token_accuracy": 0.8125606775283813, + "step": 1115 + }, + { + "epoch": 0.558, + "grad_norm": 2.7395683486147817, + "learning_rate": 2.7900000000000004e-06, + "loss": 0.5139, + "mean_token_accuracy": 0.8508895635604858, + "step": 1116 + }, + { + "epoch": 0.5585, + "grad_norm": 2.954594887323933, + "learning_rate": 2.7925000000000004e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8712319135665894, + "step": 1117 + }, + { + "epoch": 0.559, + "grad_norm": 1.9407913260612468, + "learning_rate": 2.7950000000000003e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.9045244455337524, + "step": 1118 + }, + { + "epoch": 0.5595, + "grad_norm": 2.163442948162634, + "learning_rate": 2.7975000000000003e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8888302445411682, + "step": 1119 + }, + { + "epoch": 0.56, + "grad_norm": 10.782545786684505, + "learning_rate": 2.8000000000000003e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8708565831184387, + "step": 1120 + }, + { + "epoch": 0.5605, + "grad_norm": 2.2264679550577315, + "learning_rate": 2.8025000000000003e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8976437449455261, + "step": 1121 + }, + { + "epoch": 0.561, + "grad_norm": 3.2851338486280426, + "learning_rate": 2.8050000000000007e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8747283220291138, + "step": 1122 + }, + { + "epoch": 0.5615, + "grad_norm": 4.871296106041865, + "learning_rate": 2.8075000000000002e-06, + "loss": 0.2658, + "mean_token_accuracy": 0.9086787700653076, + "step": 1123 + }, + { + "epoch": 0.562, + "grad_norm": 2.3049649593597437, + "learning_rate": 2.8100000000000006e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8699763417243958, + "step": 1124 + }, + { + "epoch": 0.5625, + "grad_norm": 1.9620277887329967, + "learning_rate": 2.8125e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8763861656188965, + "step": 1125 + }, + { + "epoch": 0.563, + "grad_norm": 2.4133448482535367, + "learning_rate": 2.815e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8791666626930237, + "step": 1126 + }, + { + "epoch": 0.5635, + "grad_norm": 2.158109977573732, + "learning_rate": 2.8175e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8748261332511902, + "step": 1127 + }, + { + "epoch": 0.564, + "grad_norm": 6.083949798530675, + "learning_rate": 2.82e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8518593311309814, + "step": 1128 + }, + { + "epoch": 0.5645, + "grad_norm": 2.441936130274822, + "learning_rate": 2.8225e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8488832712173462, + "step": 1129 + }, + { + "epoch": 0.565, + "grad_norm": 4.272880175860465, + "learning_rate": 2.825e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8572462201118469, + "step": 1130 + }, + { + "epoch": 0.5655, + "grad_norm": 2.505601729065775, + "learning_rate": 2.8275e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8915627002716064, + "step": 1131 + }, + { + "epoch": 0.566, + "grad_norm": 2.823557523364311, + "learning_rate": 2.83e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8875517845153809, + "step": 1132 + }, + { + "epoch": 0.5665, + "grad_norm": 3.746004286525884, + "learning_rate": 2.8325000000000004e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8817852735519409, + "step": 1133 + }, + { + "epoch": 0.567, + "grad_norm": 5.676754816405843, + "learning_rate": 2.835e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8791732788085938, + "step": 1134 + }, + { + "epoch": 0.5675, + "grad_norm": 2.253407630800986, + "learning_rate": 2.8375000000000004e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8573619723320007, + "step": 1135 + }, + { + "epoch": 0.568, + "grad_norm": 2.1766962070695257, + "learning_rate": 2.84e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8593035936355591, + "step": 1136 + }, + { + "epoch": 0.5685, + "grad_norm": 2.8245219067703586, + "learning_rate": 2.8425000000000003e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8692784309387207, + "step": 1137 + }, + { + "epoch": 0.569, + "grad_norm": 7.434648327967597, + "learning_rate": 2.845e-06, + "loss": 0.5428, + "mean_token_accuracy": 0.8408710360527039, + "step": 1138 + }, + { + "epoch": 0.5695, + "grad_norm": 2.1275134805140383, + "learning_rate": 2.8475000000000003e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8472696542739868, + "step": 1139 + }, + { + "epoch": 0.57, + "grad_norm": 5.158851761498036, + "learning_rate": 2.85e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8542372584342957, + "step": 1140 + }, + { + "epoch": 0.5705, + "grad_norm": 2.664890213185606, + "learning_rate": 2.8525000000000002e-06, + "loss": 0.5428, + "mean_token_accuracy": 0.8333030939102173, + "step": 1141 + }, + { + "epoch": 0.571, + "grad_norm": 2.468672308609047, + "learning_rate": 2.855e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8374419212341309, + "step": 1142 + }, + { + "epoch": 0.5715, + "grad_norm": 4.307521661280823, + "learning_rate": 2.8575e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8690434098243713, + "step": 1143 + }, + { + "epoch": 0.572, + "grad_norm": 2.1599145305188365, + "learning_rate": 2.86e-06, + "loss": 0.6227, + "mean_token_accuracy": 0.8411648869514465, + "step": 1144 + }, + { + "epoch": 0.5725, + "grad_norm": 3.820564747390588, + "learning_rate": 2.8625e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8730091452598572, + "step": 1145 + }, + { + "epoch": 0.573, + "grad_norm": 4.103238394339286, + "learning_rate": 2.865e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.854178249835968, + "step": 1146 + }, + { + "epoch": 0.5735, + "grad_norm": 2.5153126080278203, + "learning_rate": 2.8675e-06, + "loss": 0.5746, + "mean_token_accuracy": 0.8078501224517822, + "step": 1147 + }, + { + "epoch": 0.574, + "grad_norm": 2.3909451434625244, + "learning_rate": 2.87e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8757184147834778, + "step": 1148 + }, + { + "epoch": 0.5745, + "grad_norm": 3.0955690575478765, + "learning_rate": 2.8725000000000004e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.904158353805542, + "step": 1149 + }, + { + "epoch": 0.575, + "grad_norm": 20.871764340238247, + "learning_rate": 2.875e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8563336730003357, + "step": 1150 + }, + { + "epoch": 0.5755, + "grad_norm": 2.0264630898733924, + "learning_rate": 2.8775000000000004e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8993233442306519, + "step": 1151 + }, + { + "epoch": 0.576, + "grad_norm": 2.5384310628876827, + "learning_rate": 2.88e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8445110321044922, + "step": 1152 + }, + { + "epoch": 0.5765, + "grad_norm": 2.440458838530037, + "learning_rate": 2.8825000000000004e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8600550889968872, + "step": 1153 + }, + { + "epoch": 0.577, + "grad_norm": 1.8204052369673287, + "learning_rate": 2.885e-06, + "loss": 0.2651, + "mean_token_accuracy": 0.8846153616905212, + "step": 1154 + }, + { + "epoch": 0.5775, + "grad_norm": 1.699729852589082, + "learning_rate": 2.8875000000000003e-06, + "loss": 0.2246, + "mean_token_accuracy": 0.9147040247917175, + "step": 1155 + }, + { + "epoch": 0.578, + "grad_norm": 2.5273950660794267, + "learning_rate": 2.89e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.858127772808075, + "step": 1156 + }, + { + "epoch": 0.5785, + "grad_norm": 2.1906072784334754, + "learning_rate": 2.8925000000000003e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8785030841827393, + "step": 1157 + }, + { + "epoch": 0.579, + "grad_norm": 9.594054333288677, + "learning_rate": 2.8950000000000002e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8609986305236816, + "step": 1158 + }, + { + "epoch": 0.5795, + "grad_norm": 2.8456563705543547, + "learning_rate": 2.8975e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8458815813064575, + "step": 1159 + }, + { + "epoch": 0.58, + "grad_norm": 2.4994416264469033, + "learning_rate": 2.9e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8766071200370789, + "step": 1160 + }, + { + "epoch": 0.5805, + "grad_norm": 9.634201466852085, + "learning_rate": 2.9025e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8620221018791199, + "step": 1161 + }, + { + "epoch": 0.581, + "grad_norm": 2.22079711509052, + "learning_rate": 2.905e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8961226344108582, + "step": 1162 + }, + { + "epoch": 0.5815, + "grad_norm": 3.3334083162147827, + "learning_rate": 2.9075e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8695801496505737, + "step": 1163 + }, + { + "epoch": 0.582, + "grad_norm": 3.004502797889907, + "learning_rate": 2.91e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8721461296081543, + "step": 1164 + }, + { + "epoch": 0.5825, + "grad_norm": 2.8299564950635245, + "learning_rate": 2.9125000000000005e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8716672658920288, + "step": 1165 + }, + { + "epoch": 0.583, + "grad_norm": 3.4088920168239265, + "learning_rate": 2.915e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8693373203277588, + "step": 1166 + }, + { + "epoch": 0.5835, + "grad_norm": 2.1058801785277534, + "learning_rate": 2.9175000000000004e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8853503465652466, + "step": 1167 + }, + { + "epoch": 0.584, + "grad_norm": 2.4237831817100974, + "learning_rate": 2.92e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8678908944129944, + "step": 1168 + }, + { + "epoch": 0.5845, + "grad_norm": 3.230571960295096, + "learning_rate": 2.9225000000000004e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8527346253395081, + "step": 1169 + }, + { + "epoch": 0.585, + "grad_norm": 2.6045150902025607, + "learning_rate": 2.925e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8863560557365417, + "step": 1170 + }, + { + "epoch": 0.5855, + "grad_norm": 5.123114736545639, + "learning_rate": 2.9275000000000003e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8615652322769165, + "step": 1171 + }, + { + "epoch": 0.586, + "grad_norm": 3.043239109590872, + "learning_rate": 2.93e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.87876957654953, + "step": 1172 + }, + { + "epoch": 0.5865, + "grad_norm": 2.6225077361224955, + "learning_rate": 2.9325000000000003e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8787193894386292, + "step": 1173 + }, + { + "epoch": 0.587, + "grad_norm": 4.569320428626024, + "learning_rate": 2.9350000000000003e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8859001398086548, + "step": 1174 + }, + { + "epoch": 0.5875, + "grad_norm": 2.9885665609361265, + "learning_rate": 2.9375000000000003e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8615787625312805, + "step": 1175 + }, + { + "epoch": 0.588, + "grad_norm": 17.731831111185876, + "learning_rate": 2.9400000000000002e-06, + "loss": 0.5407, + "mean_token_accuracy": 0.8323599100112915, + "step": 1176 + }, + { + "epoch": 0.5885, + "grad_norm": 1.7384681066115095, + "learning_rate": 2.9425e-06, + "loss": 0.1921, + "mean_token_accuracy": 0.9350970983505249, + "step": 1177 + }, + { + "epoch": 0.589, + "grad_norm": 2.877547312655557, + "learning_rate": 2.945e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8589614629745483, + "step": 1178 + }, + { + "epoch": 0.5895, + "grad_norm": 3.995525775532434, + "learning_rate": 2.9475e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8943396210670471, + "step": 1179 + }, + { + "epoch": 0.59, + "grad_norm": 3.0157642715156014, + "learning_rate": 2.95e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8665732741355896, + "step": 1180 + }, + { + "epoch": 0.5905, + "grad_norm": 2.784321507076952, + "learning_rate": 2.9525000000000005e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8698233962059021, + "step": 1181 + }, + { + "epoch": 0.591, + "grad_norm": 2.618671118720354, + "learning_rate": 2.955e-06, + "loss": 0.6184, + "mean_token_accuracy": 0.8151914477348328, + "step": 1182 + }, + { + "epoch": 0.5915, + "grad_norm": 1.9830533903223753, + "learning_rate": 2.9575000000000005e-06, + "loss": 0.2861, + "mean_token_accuracy": 0.9002525210380554, + "step": 1183 + }, + { + "epoch": 0.592, + "grad_norm": 5.049640508689157, + "learning_rate": 2.96e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8568106293678284, + "step": 1184 + }, + { + "epoch": 0.5925, + "grad_norm": 3.0233298491112057, + "learning_rate": 2.9625000000000004e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.895446240901947, + "step": 1185 + }, + { + "epoch": 0.593, + "grad_norm": 2.987288577635998, + "learning_rate": 2.965e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8558784127235413, + "step": 1186 + }, + { + "epoch": 0.5935, + "grad_norm": 4.4441298200508195, + "learning_rate": 2.9675000000000004e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8711693286895752, + "step": 1187 + }, + { + "epoch": 0.594, + "grad_norm": 7.001145598755695, + "learning_rate": 2.97e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8704004883766174, + "step": 1188 + }, + { + "epoch": 0.5945, + "grad_norm": 6.811153198250846, + "learning_rate": 2.9725000000000003e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8853313326835632, + "step": 1189 + }, + { + "epoch": 0.595, + "grad_norm": 3.6478600432158963, + "learning_rate": 2.9750000000000003e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8901523351669312, + "step": 1190 + }, + { + "epoch": 0.5955, + "grad_norm": 2.5111939990387455, + "learning_rate": 2.9775000000000003e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.879572331905365, + "step": 1191 + }, + { + "epoch": 0.596, + "grad_norm": 2.6220022770564686, + "learning_rate": 2.9800000000000003e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8776223659515381, + "step": 1192 + }, + { + "epoch": 0.5965, + "grad_norm": 2.8220475254982613, + "learning_rate": 2.9825000000000002e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.893203854560852, + "step": 1193 + }, + { + "epoch": 0.597, + "grad_norm": 7.3927401742707035, + "learning_rate": 2.9850000000000002e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8898662328720093, + "step": 1194 + }, + { + "epoch": 0.5975, + "grad_norm": 2.712290320103414, + "learning_rate": 2.9875e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8782489895820618, + "step": 1195 + }, + { + "epoch": 0.598, + "grad_norm": 2.9781563007566687, + "learning_rate": 2.99e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8842653036117554, + "step": 1196 + }, + { + "epoch": 0.5985, + "grad_norm": 2.881585998670034, + "learning_rate": 2.9925000000000006e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8667240142822266, + "step": 1197 + }, + { + "epoch": 0.599, + "grad_norm": 3.5264372310263634, + "learning_rate": 2.995e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.895797610282898, + "step": 1198 + }, + { + "epoch": 0.5995, + "grad_norm": 1.9262994877258452, + "learning_rate": 2.9975000000000005e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.902207612991333, + "step": 1199 + }, + { + "epoch": 0.6, + "grad_norm": 2.338323284064842, + "learning_rate": 3e-06, + "loss": 0.5299, + "mean_token_accuracy": 0.8367825746536255, + "step": 1200 + }, + { + "epoch": 0.6005, + "grad_norm": 3.8504714634348334, + "learning_rate": 3.0025000000000005e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8601583242416382, + "step": 1201 + }, + { + "epoch": 0.601, + "grad_norm": 3.4968727824145223, + "learning_rate": 3.005e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.889509916305542, + "step": 1202 + }, + { + "epoch": 0.6015, + "grad_norm": 2.7988240471877877, + "learning_rate": 3.0075000000000004e-06, + "loss": 0.5231, + "mean_token_accuracy": 0.8280739784240723, + "step": 1203 + }, + { + "epoch": 0.602, + "grad_norm": 2.445760063308487, + "learning_rate": 3.01e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8771994709968567, + "step": 1204 + }, + { + "epoch": 0.6025, + "grad_norm": 2.951217985158523, + "learning_rate": 3.0125000000000004e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8954692482948303, + "step": 1205 + }, + { + "epoch": 0.603, + "grad_norm": 1.9407340146635463, + "learning_rate": 3.0150000000000004e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8582624197006226, + "step": 1206 + }, + { + "epoch": 0.6035, + "grad_norm": 4.007450663645477, + "learning_rate": 3.0175000000000003e-06, + "loss": 0.6453, + "mean_token_accuracy": 0.813069224357605, + "step": 1207 + }, + { + "epoch": 0.604, + "grad_norm": 4.1274335030767855, + "learning_rate": 3.0200000000000003e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8526351451873779, + "step": 1208 + }, + { + "epoch": 0.6045, + "grad_norm": 2.2238878427261786, + "learning_rate": 3.0225000000000003e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8761569261550903, + "step": 1209 + }, + { + "epoch": 0.605, + "grad_norm": 2.347378835046214, + "learning_rate": 3.0250000000000003e-06, + "loss": 0.482, + "mean_token_accuracy": 0.844825267791748, + "step": 1210 + }, + { + "epoch": 0.6055, + "grad_norm": 3.069184239333117, + "learning_rate": 3.0275000000000002e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8691285252571106, + "step": 1211 + }, + { + "epoch": 0.606, + "grad_norm": 2.680326981059948, + "learning_rate": 3.0300000000000002e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8598484992980957, + "step": 1212 + }, + { + "epoch": 0.6065, + "grad_norm": 37.51335446002244, + "learning_rate": 3.0325000000000006e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8952275514602661, + "step": 1213 + }, + { + "epoch": 0.607, + "grad_norm": 4.403915085916194, + "learning_rate": 3.035e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8804925680160522, + "step": 1214 + }, + { + "epoch": 0.6075, + "grad_norm": 2.6073328929795285, + "learning_rate": 3.0375000000000006e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8658292293548584, + "step": 1215 + }, + { + "epoch": 0.608, + "grad_norm": 2.8352223131441345, + "learning_rate": 3.04e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8470866680145264, + "step": 1216 + }, + { + "epoch": 0.6085, + "grad_norm": 4.878312470724781, + "learning_rate": 3.0425000000000005e-06, + "loss": 0.3008, + "mean_token_accuracy": 0.9010896682739258, + "step": 1217 + }, + { + "epoch": 0.609, + "grad_norm": 2.614956217826969, + "learning_rate": 3.045e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8784881830215454, + "step": 1218 + }, + { + "epoch": 0.6095, + "grad_norm": 3.279045653565803, + "learning_rate": 3.0475000000000005e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8878840208053589, + "step": 1219 + }, + { + "epoch": 0.61, + "grad_norm": 4.2394264188412185, + "learning_rate": 3.05e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8629574179649353, + "step": 1220 + }, + { + "epoch": 0.6105, + "grad_norm": 3.310075025607259, + "learning_rate": 3.0525000000000004e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8812165260314941, + "step": 1221 + }, + { + "epoch": 0.611, + "grad_norm": 2.8725357668558065, + "learning_rate": 3.0550000000000004e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.8506926894187927, + "step": 1222 + }, + { + "epoch": 0.6115, + "grad_norm": 9.408041861593508, + "learning_rate": 3.0575000000000004e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8775510191917419, + "step": 1223 + }, + { + "epoch": 0.612, + "grad_norm": 9.629296668683722, + "learning_rate": 3.0600000000000003e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8649196624755859, + "step": 1224 + }, + { + "epoch": 0.6125, + "grad_norm": 2.1591643778241805, + "learning_rate": 3.0625000000000003e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.868683934211731, + "step": 1225 + }, + { + "epoch": 0.613, + "grad_norm": 29.984306727264098, + "learning_rate": 3.0650000000000003e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8614662289619446, + "step": 1226 + }, + { + "epoch": 0.6135, + "grad_norm": 2.304697865517175, + "learning_rate": 3.0675000000000003e-06, + "loss": 0.5556, + "mean_token_accuracy": 0.8363455533981323, + "step": 1227 + }, + { + "epoch": 0.614, + "grad_norm": 2.0580140830883553, + "learning_rate": 3.0700000000000003e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8519822359085083, + "step": 1228 + }, + { + "epoch": 0.6145, + "grad_norm": 4.3098256332201785, + "learning_rate": 3.0725000000000007e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8822780251502991, + "step": 1229 + }, + { + "epoch": 0.615, + "grad_norm": 2.77070389630778, + "learning_rate": 3.075e-06, + "loss": 0.2818, + "mean_token_accuracy": 0.9094540476799011, + "step": 1230 + }, + { + "epoch": 0.6155, + "grad_norm": 3.9521358210740805, + "learning_rate": 3.0775000000000006e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8755466341972351, + "step": 1231 + }, + { + "epoch": 0.616, + "grad_norm": 2.3906447558371067, + "learning_rate": 3.08e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8553813099861145, + "step": 1232 + }, + { + "epoch": 0.6165, + "grad_norm": 3.673274486631104, + "learning_rate": 3.0825000000000006e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8831102252006531, + "step": 1233 + }, + { + "epoch": 0.617, + "grad_norm": 2.519897223239977, + "learning_rate": 3.085e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8571640253067017, + "step": 1234 + }, + { + "epoch": 0.6175, + "grad_norm": 2.3748314851179964, + "learning_rate": 3.0875000000000005e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8761397004127502, + "step": 1235 + }, + { + "epoch": 0.618, + "grad_norm": 3.9086465919900313, + "learning_rate": 3.09e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8495283722877502, + "step": 1236 + }, + { + "epoch": 0.6185, + "grad_norm": 2.1811079999923564, + "learning_rate": 3.0925000000000005e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8923126459121704, + "step": 1237 + }, + { + "epoch": 0.619, + "grad_norm": 2.414917239426464, + "learning_rate": 3.0950000000000004e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8778797388076782, + "step": 1238 + }, + { + "epoch": 0.6195, + "grad_norm": 2.441041358593779, + "learning_rate": 3.0975000000000004e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8620585799217224, + "step": 1239 + }, + { + "epoch": 0.62, + "grad_norm": 2.920325177145414, + "learning_rate": 3.1000000000000004e-06, + "loss": 0.5994, + "mean_token_accuracy": 0.8394620418548584, + "step": 1240 + }, + { + "epoch": 0.6205, + "grad_norm": 2.5365332698020406, + "learning_rate": 3.1025000000000004e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8674635887145996, + "step": 1241 + }, + { + "epoch": 0.621, + "grad_norm": 1.9897281353405125, + "learning_rate": 3.1050000000000003e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8644264340400696, + "step": 1242 + }, + { + "epoch": 0.6215, + "grad_norm": 2.3868176618187786, + "learning_rate": 3.1075000000000003e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.858961284160614, + "step": 1243 + }, + { + "epoch": 0.622, + "grad_norm": 9.40720435429725, + "learning_rate": 3.1100000000000003e-06, + "loss": 0.424, + "mean_token_accuracy": 0.86631178855896, + "step": 1244 + }, + { + "epoch": 0.6225, + "grad_norm": 35.42726374203148, + "learning_rate": 3.1125000000000007e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8721610903739929, + "step": 1245 + }, + { + "epoch": 0.623, + "grad_norm": 2.7313672745553066, + "learning_rate": 3.1150000000000002e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8735440969467163, + "step": 1246 + }, + { + "epoch": 0.6235, + "grad_norm": 3.874280481339407, + "learning_rate": 3.1175000000000006e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8675345778465271, + "step": 1247 + }, + { + "epoch": 0.624, + "grad_norm": 2.974049152745891, + "learning_rate": 3.12e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8701581954956055, + "step": 1248 + }, + { + "epoch": 0.6245, + "grad_norm": 3.8778597831145585, + "learning_rate": 3.1225000000000006e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8611951470375061, + "step": 1249 + }, + { + "epoch": 0.625, + "grad_norm": 3.4156253325814414, + "learning_rate": 3.125e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8695553541183472, + "step": 1250 + }, + { + "epoch": 0.6255, + "grad_norm": 2.5761974369573384, + "learning_rate": 3.1275e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8594454526901245, + "step": 1251 + }, + { + "epoch": 0.626, + "grad_norm": 2.3464318745557686, + "learning_rate": 3.13e-06, + "loss": 0.326, + "mean_token_accuracy": 0.8942475318908691, + "step": 1252 + }, + { + "epoch": 0.6265, + "grad_norm": 3.1247727185346292, + "learning_rate": 3.1325e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.8332015872001648, + "step": 1253 + }, + { + "epoch": 0.627, + "grad_norm": 2.6657198219989318, + "learning_rate": 3.135e-06, + "loss": 0.5552, + "mean_token_accuracy": 0.8340153694152832, + "step": 1254 + }, + { + "epoch": 0.6275, + "grad_norm": 3.471824947078273, + "learning_rate": 3.1375e-06, + "loss": 0.2913, + "mean_token_accuracy": 0.9026687741279602, + "step": 1255 + }, + { + "epoch": 0.628, + "grad_norm": 2.990847722701747, + "learning_rate": 3.1400000000000004e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.8992263674736023, + "step": 1256 + }, + { + "epoch": 0.6285, + "grad_norm": 2.526376635835852, + "learning_rate": 3.1425e-06, + "loss": 0.293, + "mean_token_accuracy": 0.8986421227455139, + "step": 1257 + }, + { + "epoch": 0.629, + "grad_norm": 2.538970040812623, + "learning_rate": 3.1450000000000004e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8662157654762268, + "step": 1258 + }, + { + "epoch": 0.6295, + "grad_norm": 2.542609241361033, + "learning_rate": 3.1475e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8755389451980591, + "step": 1259 + }, + { + "epoch": 0.63, + "grad_norm": 19.99595901214683, + "learning_rate": 3.1500000000000003e-06, + "loss": 0.2535, + "mean_token_accuracy": 0.911248505115509, + "step": 1260 + }, + { + "epoch": 0.6305, + "grad_norm": 4.016455491891791, + "learning_rate": 3.1525e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8814961314201355, + "step": 1261 + }, + { + "epoch": 0.631, + "grad_norm": 2.8153698425773386, + "learning_rate": 3.1550000000000003e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8686612844467163, + "step": 1262 + }, + { + "epoch": 0.6315, + "grad_norm": 1.970352479624823, + "learning_rate": 3.1575e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8749829530715942, + "step": 1263 + }, + { + "epoch": 0.632, + "grad_norm": 3.5134715459392885, + "learning_rate": 3.1600000000000002e-06, + "loss": 0.5658, + "mean_token_accuracy": 0.8401743769645691, + "step": 1264 + }, + { + "epoch": 0.6325, + "grad_norm": 3.088109590903423, + "learning_rate": 3.1625000000000002e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8736575841903687, + "step": 1265 + }, + { + "epoch": 0.633, + "grad_norm": 2.444833662125795, + "learning_rate": 3.165e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8643831610679626, + "step": 1266 + }, + { + "epoch": 0.6335, + "grad_norm": 2.02974129137391, + "learning_rate": 3.1675e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8691176176071167, + "step": 1267 + }, + { + "epoch": 0.634, + "grad_norm": 2.561818636007311, + "learning_rate": 3.17e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.874748170375824, + "step": 1268 + }, + { + "epoch": 0.6345, + "grad_norm": 4.28452798952699, + "learning_rate": 3.1725e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8938223719596863, + "step": 1269 + }, + { + "epoch": 0.635, + "grad_norm": 2.8970239307042265, + "learning_rate": 3.175e-06, + "loss": 0.5193, + "mean_token_accuracy": 0.8412655591964722, + "step": 1270 + }, + { + "epoch": 0.6355, + "grad_norm": 2.3995812325052372, + "learning_rate": 3.1775e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.883967936038971, + "step": 1271 + }, + { + "epoch": 0.636, + "grad_norm": 2.782335453930043, + "learning_rate": 3.1800000000000005e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8883213400840759, + "step": 1272 + }, + { + "epoch": 0.6365, + "grad_norm": 2.213432123408321, + "learning_rate": 3.1825e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8798863887786865, + "step": 1273 + }, + { + "epoch": 0.637, + "grad_norm": 2.923662896128449, + "learning_rate": 3.1850000000000004e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8546885251998901, + "step": 1274 + }, + { + "epoch": 0.6375, + "grad_norm": 1.864631467489487, + "learning_rate": 3.1875e-06, + "loss": 0.5547, + "mean_token_accuracy": 0.8243456482887268, + "step": 1275 + }, + { + "epoch": 0.638, + "grad_norm": 2.3610617547368524, + "learning_rate": 3.1900000000000004e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8618890643119812, + "step": 1276 + }, + { + "epoch": 0.6385, + "grad_norm": 2.8606628467243738, + "learning_rate": 3.1925e-06, + "loss": 0.5809, + "mean_token_accuracy": 0.8261347413063049, + "step": 1277 + }, + { + "epoch": 0.639, + "grad_norm": 3.9170797573381795, + "learning_rate": 3.1950000000000003e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8633333444595337, + "step": 1278 + }, + { + "epoch": 0.6395, + "grad_norm": 3.021735722178588, + "learning_rate": 3.1975e-06, + "loss": 0.7113, + "mean_token_accuracy": 0.7583237886428833, + "step": 1279 + }, + { + "epoch": 0.64, + "grad_norm": 2.425985024001355, + "learning_rate": 3.2000000000000003e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8453447818756104, + "step": 1280 + }, + { + "epoch": 0.6405, + "grad_norm": 2.5463792336393256, + "learning_rate": 3.2025000000000003e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8706896305084229, + "step": 1281 + }, + { + "epoch": 0.641, + "grad_norm": 3.4207722666193363, + "learning_rate": 3.2050000000000002e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8843249082565308, + "step": 1282 + }, + { + "epoch": 0.6415, + "grad_norm": 2.482459519865177, + "learning_rate": 3.2075e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.8426250219345093, + "step": 1283 + }, + { + "epoch": 0.642, + "grad_norm": 2.1868732498453376, + "learning_rate": 3.21e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8769205212593079, + "step": 1284 + }, + { + "epoch": 0.6425, + "grad_norm": 3.1191845747696467, + "learning_rate": 3.2125e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8479381203651428, + "step": 1285 + }, + { + "epoch": 0.643, + "grad_norm": 13.163563080816557, + "learning_rate": 3.215e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8886020183563232, + "step": 1286 + }, + { + "epoch": 0.6435, + "grad_norm": 3.619902684247806, + "learning_rate": 3.2175e-06, + "loss": 0.6493, + "mean_token_accuracy": 0.8216719031333923, + "step": 1287 + }, + { + "epoch": 0.644, + "grad_norm": 2.3075527149203046, + "learning_rate": 3.2200000000000005e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8928403258323669, + "step": 1288 + }, + { + "epoch": 0.6445, + "grad_norm": 2.127158775762964, + "learning_rate": 3.2225e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8536403179168701, + "step": 1289 + }, + { + "epoch": 0.645, + "grad_norm": 2.0161194900503334, + "learning_rate": 3.2250000000000005e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8829709887504578, + "step": 1290 + }, + { + "epoch": 0.6455, + "grad_norm": 2.9979139256485237, + "learning_rate": 3.2275e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8726881146430969, + "step": 1291 + }, + { + "epoch": 0.646, + "grad_norm": 5.04823891251302, + "learning_rate": 3.2300000000000004e-06, + "loss": 0.5694, + "mean_token_accuracy": 0.8724721670150757, + "step": 1292 + }, + { + "epoch": 0.6465, + "grad_norm": 2.6846744910950355, + "learning_rate": 3.2325e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8842719793319702, + "step": 1293 + }, + { + "epoch": 0.647, + "grad_norm": 2.6694401931490073, + "learning_rate": 3.2350000000000004e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8955253958702087, + "step": 1294 + }, + { + "epoch": 0.6475, + "grad_norm": 2.338433475400117, + "learning_rate": 3.2375e-06, + "loss": 0.5796, + "mean_token_accuracy": 0.8246621489524841, + "step": 1295 + }, + { + "epoch": 0.648, + "grad_norm": 3.0374925205104066, + "learning_rate": 3.2400000000000003e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8835664391517639, + "step": 1296 + }, + { + "epoch": 0.6485, + "grad_norm": 2.2237792366868097, + "learning_rate": 3.2425000000000003e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8596681356430054, + "step": 1297 + }, + { + "epoch": 0.649, + "grad_norm": 3.2086684785114246, + "learning_rate": 3.2450000000000003e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8893609046936035, + "step": 1298 + }, + { + "epoch": 0.6495, + "grad_norm": 2.7518315573408, + "learning_rate": 3.2475000000000002e-06, + "loss": 0.5894, + "mean_token_accuracy": 0.8253529071807861, + "step": 1299 + }, + { + "epoch": 0.65, + "grad_norm": 2.6041286354230233, + "learning_rate": 3.2500000000000002e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8981919884681702, + "step": 1300 + }, + { + "epoch": 0.6505, + "grad_norm": 12.051660571712421, + "learning_rate": 3.2525e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8602254986763, + "step": 1301 + }, + { + "epoch": 0.651, + "grad_norm": 2.0451956066640853, + "learning_rate": 3.255e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8881545662879944, + "step": 1302 + }, + { + "epoch": 0.6515, + "grad_norm": 2.3448502676677756, + "learning_rate": 3.2575e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8805021643638611, + "step": 1303 + }, + { + "epoch": 0.652, + "grad_norm": 16.696364622330297, + "learning_rate": 3.2600000000000006e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8662981390953064, + "step": 1304 + }, + { + "epoch": 0.6525, + "grad_norm": 3.2247622404797878, + "learning_rate": 3.2625e-06, + "loss": 0.5236, + "mean_token_accuracy": 0.8412914872169495, + "step": 1305 + }, + { + "epoch": 0.653, + "grad_norm": 2.9831547563183, + "learning_rate": 3.2650000000000005e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8669679760932922, + "step": 1306 + }, + { + "epoch": 0.6535, + "grad_norm": 2.5353064724839194, + "learning_rate": 3.2675e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8650338649749756, + "step": 1307 + }, + { + "epoch": 0.654, + "grad_norm": 2.7557176567334043, + "learning_rate": 3.2700000000000005e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8774571418762207, + "step": 1308 + }, + { + "epoch": 0.6545, + "grad_norm": 2.3444120348614517, + "learning_rate": 3.2725e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8727553486824036, + "step": 1309 + }, + { + "epoch": 0.655, + "grad_norm": 1.9337095602836982, + "learning_rate": 3.2750000000000004e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8739763498306274, + "step": 1310 + }, + { + "epoch": 0.6555, + "grad_norm": 2.20983957833477, + "learning_rate": 3.2775e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8728062510490417, + "step": 1311 + }, + { + "epoch": 0.656, + "grad_norm": 2.5402187814172925, + "learning_rate": 3.2800000000000004e-06, + "loss": 0.5503, + "mean_token_accuracy": 0.8434442281723022, + "step": 1312 + }, + { + "epoch": 0.6565, + "grad_norm": 1.8537084203357606, + "learning_rate": 3.2825000000000003e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8586332201957703, + "step": 1313 + }, + { + "epoch": 0.657, + "grad_norm": 1.7287636859507787, + "learning_rate": 3.2850000000000003e-06, + "loss": 0.3026, + "mean_token_accuracy": 0.8968305587768555, + "step": 1314 + }, + { + "epoch": 0.6575, + "grad_norm": 2.9539720683816473, + "learning_rate": 3.2875000000000003e-06, + "loss": 0.2368, + "mean_token_accuracy": 0.9178998470306396, + "step": 1315 + }, + { + "epoch": 0.658, + "grad_norm": 3.869151865360778, + "learning_rate": 3.2900000000000003e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8811219930648804, + "step": 1316 + }, + { + "epoch": 0.6585, + "grad_norm": 2.580641187110029, + "learning_rate": 3.2925000000000002e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8699619770050049, + "step": 1317 + }, + { + "epoch": 0.659, + "grad_norm": 2.4650039333498377, + "learning_rate": 3.2950000000000002e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8855960369110107, + "step": 1318 + }, + { + "epoch": 0.6595, + "grad_norm": 4.4251155586818065, + "learning_rate": 3.2975e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8619338870048523, + "step": 1319 + }, + { + "epoch": 0.66, + "grad_norm": 2.1592437208287913, + "learning_rate": 3.3000000000000006e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8839210271835327, + "step": 1320 + }, + { + "epoch": 0.6605, + "grad_norm": 2.3989953213182296, + "learning_rate": 3.3025e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8803146481513977, + "step": 1321 + }, + { + "epoch": 0.661, + "grad_norm": 2.25951940277084, + "learning_rate": 3.3050000000000005e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.873781681060791, + "step": 1322 + }, + { + "epoch": 0.6615, + "grad_norm": 2.873714764090584, + "learning_rate": 3.3075e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.8265960216522217, + "step": 1323 + }, + { + "epoch": 0.662, + "grad_norm": 2.88533285676886, + "learning_rate": 3.3100000000000005e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8750654458999634, + "step": 1324 + }, + { + "epoch": 0.6625, + "grad_norm": 2.2746129012507335, + "learning_rate": 3.3125e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8870691061019897, + "step": 1325 + }, + { + "epoch": 0.663, + "grad_norm": 2.074332581710155, + "learning_rate": 3.3150000000000004e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.9041568636894226, + "step": 1326 + }, + { + "epoch": 0.6635, + "grad_norm": 2.387033892849427, + "learning_rate": 3.3175e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8694957494735718, + "step": 1327 + }, + { + "epoch": 0.664, + "grad_norm": 2.7167058945583182, + "learning_rate": 3.3200000000000004e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8366402387619019, + "step": 1328 + }, + { + "epoch": 0.6645, + "grad_norm": 5.931594633213141, + "learning_rate": 3.3225000000000004e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8773548007011414, + "step": 1329 + }, + { + "epoch": 0.665, + "grad_norm": 2.3796975185404468, + "learning_rate": 3.3250000000000004e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8474393486976624, + "step": 1330 + }, + { + "epoch": 0.6655, + "grad_norm": 2.6061775348063003, + "learning_rate": 3.3275000000000003e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8844230771064758, + "step": 1331 + }, + { + "epoch": 0.666, + "grad_norm": 2.6000930736623635, + "learning_rate": 3.3300000000000003e-06, + "loss": 0.5179, + "mean_token_accuracy": 0.8436502814292908, + "step": 1332 + }, + { + "epoch": 0.6665, + "grad_norm": 13.23772763679899, + "learning_rate": 3.3325000000000003e-06, + "loss": 0.3028, + "mean_token_accuracy": 0.8971278667449951, + "step": 1333 + }, + { + "epoch": 0.667, + "grad_norm": 8.137775582945594, + "learning_rate": 3.3350000000000003e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8861857652664185, + "step": 1334 + }, + { + "epoch": 0.6675, + "grad_norm": 2.8148836342547336, + "learning_rate": 3.3375000000000002e-06, + "loss": 0.5503, + "mean_token_accuracy": 0.8398482799530029, + "step": 1335 + }, + { + "epoch": 0.668, + "grad_norm": 3.7762644915846364, + "learning_rate": 3.3400000000000006e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.874524712562561, + "step": 1336 + }, + { + "epoch": 0.6685, + "grad_norm": 3.067301738846055, + "learning_rate": 3.3425e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8557851314544678, + "step": 1337 + }, + { + "epoch": 0.669, + "grad_norm": 4.071235847661652, + "learning_rate": 3.3450000000000006e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8856300711631775, + "step": 1338 + }, + { + "epoch": 0.6695, + "grad_norm": 4.073464497377369, + "learning_rate": 3.3475e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8704704642295837, + "step": 1339 + }, + { + "epoch": 0.67, + "grad_norm": 2.3128558409955784, + "learning_rate": 3.3500000000000005e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8823727965354919, + "step": 1340 + }, + { + "epoch": 0.6705, + "grad_norm": 10.674951578610518, + "learning_rate": 3.3525e-06, + "loss": 0.5327, + "mean_token_accuracy": 0.8569301962852478, + "step": 1341 + }, + { + "epoch": 0.671, + "grad_norm": 2.098900928610429, + "learning_rate": 3.3550000000000005e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8661379814147949, + "step": 1342 + }, + { + "epoch": 0.6715, + "grad_norm": 15.536974593146729, + "learning_rate": 3.3575e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8642092943191528, + "step": 1343 + }, + { + "epoch": 0.672, + "grad_norm": 25.28911641703962, + "learning_rate": 3.3600000000000004e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8983708024024963, + "step": 1344 + }, + { + "epoch": 0.6725, + "grad_norm": 2.3959321885675378, + "learning_rate": 3.3625000000000004e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8737307786941528, + "step": 1345 + }, + { + "epoch": 0.673, + "grad_norm": 3.749214054143405, + "learning_rate": 3.3650000000000004e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8911575078964233, + "step": 1346 + }, + { + "epoch": 0.6735, + "grad_norm": 5.148366739755893, + "learning_rate": 3.3675000000000004e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8736110925674438, + "step": 1347 + }, + { + "epoch": 0.674, + "grad_norm": 2.7725740274775146, + "learning_rate": 3.3700000000000003e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8776699304580688, + "step": 1348 + }, + { + "epoch": 0.6745, + "grad_norm": 2.1507924816087014, + "learning_rate": 3.3725000000000003e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.847953200340271, + "step": 1349 + }, + { + "epoch": 0.675, + "grad_norm": 2.0517152993465397, + "learning_rate": 3.3750000000000003e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8689532279968262, + "step": 1350 + }, + { + "epoch": 0.6755, + "grad_norm": 16.038528098367575, + "learning_rate": 3.3775000000000003e-06, + "loss": 0.5531, + "mean_token_accuracy": 0.8543180823326111, + "step": 1351 + }, + { + "epoch": 0.676, + "grad_norm": 3.0978842681201857, + "learning_rate": 3.3800000000000007e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8815813660621643, + "step": 1352 + }, + { + "epoch": 0.6765, + "grad_norm": 3.6237465499493227, + "learning_rate": 3.3825000000000002e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8750420212745667, + "step": 1353 + }, + { + "epoch": 0.677, + "grad_norm": 5.815681074021694, + "learning_rate": 3.3850000000000006e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8713049292564392, + "step": 1354 + }, + { + "epoch": 0.6775, + "grad_norm": 6.849823190061937, + "learning_rate": 3.3875e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8572745323181152, + "step": 1355 + }, + { + "epoch": 0.678, + "grad_norm": 9.729506194241571, + "learning_rate": 3.3900000000000006e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8586421608924866, + "step": 1356 + }, + { + "epoch": 0.6785, + "grad_norm": 3.345852810675071, + "learning_rate": 3.3925e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8829890489578247, + "step": 1357 + }, + { + "epoch": 0.679, + "grad_norm": 2.113168050272731, + "learning_rate": 3.3950000000000005e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8836705088615417, + "step": 1358 + }, + { + "epoch": 0.6795, + "grad_norm": 4.6902921191756715, + "learning_rate": 3.3975e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8730881810188293, + "step": 1359 + }, + { + "epoch": 0.68, + "grad_norm": 2.5993753137852096, + "learning_rate": 3.4000000000000005e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8603997826576233, + "step": 1360 + }, + { + "epoch": 0.6805, + "grad_norm": 3.3597519561252476, + "learning_rate": 3.4025000000000005e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8565253615379333, + "step": 1361 + }, + { + "epoch": 0.681, + "grad_norm": 3.7672949093769534, + "learning_rate": 3.4050000000000004e-06, + "loss": 0.2884, + "mean_token_accuracy": 0.9146059155464172, + "step": 1362 + }, + { + "epoch": 0.6815, + "grad_norm": 1.8455580926776258, + "learning_rate": 3.4075000000000004e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8719486594200134, + "step": 1363 + }, + { + "epoch": 0.682, + "grad_norm": 3.03153354779856, + "learning_rate": 3.4100000000000004e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8660170435905457, + "step": 1364 + }, + { + "epoch": 0.6825, + "grad_norm": 1.8118645067668968, + "learning_rate": 3.4125000000000004e-06, + "loss": 0.2814, + "mean_token_accuracy": 0.9033951759338379, + "step": 1365 + }, + { + "epoch": 0.683, + "grad_norm": 2.1159311063956814, + "learning_rate": 3.4150000000000003e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8517315983772278, + "step": 1366 + }, + { + "epoch": 0.6835, + "grad_norm": 2.810179958771713, + "learning_rate": 3.4175000000000003e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8721871972084045, + "step": 1367 + }, + { + "epoch": 0.684, + "grad_norm": 2.3884900889698675, + "learning_rate": 3.4200000000000007e-06, + "loss": 0.5825, + "mean_token_accuracy": 0.8192610144615173, + "step": 1368 + }, + { + "epoch": 0.6845, + "grad_norm": 3.298393221797875, + "learning_rate": 3.4225000000000003e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.857372522354126, + "step": 1369 + }, + { + "epoch": 0.685, + "grad_norm": 3.6365580589254223, + "learning_rate": 3.4250000000000007e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8792458772659302, + "step": 1370 + }, + { + "epoch": 0.6855, + "grad_norm": 2.3470747870532094, + "learning_rate": 3.4275000000000002e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8770625591278076, + "step": 1371 + }, + { + "epoch": 0.686, + "grad_norm": 3.2936886173559192, + "learning_rate": 3.4300000000000006e-06, + "loss": 0.6251, + "mean_token_accuracy": 0.8276875019073486, + "step": 1372 + }, + { + "epoch": 0.6865, + "grad_norm": 2.0645567654362043, + "learning_rate": 3.4325e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8623224496841431, + "step": 1373 + }, + { + "epoch": 0.687, + "grad_norm": 2.5773038330268387, + "learning_rate": 3.4350000000000006e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.856521725654602, + "step": 1374 + }, + { + "epoch": 0.6875, + "grad_norm": 2.0126940832300115, + "learning_rate": 3.4375e-06, + "loss": 0.3055, + "mean_token_accuracy": 0.8930778503417969, + "step": 1375 + }, + { + "epoch": 0.688, + "grad_norm": 4.95836072575672, + "learning_rate": 3.44e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.856453537940979, + "step": 1376 + }, + { + "epoch": 0.6885, + "grad_norm": 2.137829602476388, + "learning_rate": 3.4425e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.87218177318573, + "step": 1377 + }, + { + "epoch": 0.689, + "grad_norm": 2.1643601448719165, + "learning_rate": 3.445e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8867890238761902, + "step": 1378 + }, + { + "epoch": 0.6895, + "grad_norm": 2.048329413987939, + "learning_rate": 3.4475000000000005e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.861574113368988, + "step": 1379 + }, + { + "epoch": 0.69, + "grad_norm": 3.737618784693143, + "learning_rate": 3.45e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8571428656578064, + "step": 1380 + }, + { + "epoch": 0.6905, + "grad_norm": 3.7317966942850664, + "learning_rate": 3.4525000000000004e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.873498260974884, + "step": 1381 + }, + { + "epoch": 0.691, + "grad_norm": 3.408257624887224, + "learning_rate": 3.455e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8586755990982056, + "step": 1382 + }, + { + "epoch": 0.6915, + "grad_norm": 2.378622169216875, + "learning_rate": 3.4575000000000004e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8685029745101929, + "step": 1383 + }, + { + "epoch": 0.692, + "grad_norm": 5.792761753044653, + "learning_rate": 3.46e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8689003586769104, + "step": 1384 + }, + { + "epoch": 0.6925, + "grad_norm": 3.9719200413971545, + "learning_rate": 3.4625000000000003e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8725398182868958, + "step": 1385 + }, + { + "epoch": 0.693, + "grad_norm": 3.3088877263865872, + "learning_rate": 3.465e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.895736575126648, + "step": 1386 + }, + { + "epoch": 0.6935, + "grad_norm": 3.2366037100248306, + "learning_rate": 3.4675000000000003e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8580235242843628, + "step": 1387 + }, + { + "epoch": 0.694, + "grad_norm": 2.899147621807376, + "learning_rate": 3.4700000000000002e-06, + "loss": 0.5957, + "mean_token_accuracy": 0.8186259269714355, + "step": 1388 + }, + { + "epoch": 0.6945, + "grad_norm": 2.3991483714063317, + "learning_rate": 3.4725e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8896382451057434, + "step": 1389 + }, + { + "epoch": 0.695, + "grad_norm": 2.4550574675803243, + "learning_rate": 3.475e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8581345081329346, + "step": 1390 + }, + { + "epoch": 0.6955, + "grad_norm": 3.1232491206113018, + "learning_rate": 3.4775e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8518059253692627, + "step": 1391 + }, + { + "epoch": 0.696, + "grad_norm": 2.0814395043388467, + "learning_rate": 3.48e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8593630790710449, + "step": 1392 + }, + { + "epoch": 0.6965, + "grad_norm": 3.3817743857228146, + "learning_rate": 3.4825e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8600184917449951, + "step": 1393 + }, + { + "epoch": 0.697, + "grad_norm": 2.115106923294149, + "learning_rate": 3.485e-06, + "loss": 0.2857, + "mean_token_accuracy": 0.9067780375480652, + "step": 1394 + }, + { + "epoch": 0.6975, + "grad_norm": 2.4543880964296587, + "learning_rate": 3.4875000000000005e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8710418343544006, + "step": 1395 + }, + { + "epoch": 0.698, + "grad_norm": 2.472785938223861, + "learning_rate": 3.49e-06, + "loss": 0.2657, + "mean_token_accuracy": 0.9074576497077942, + "step": 1396 + }, + { + "epoch": 0.6985, + "grad_norm": 2.9679784532770426, + "learning_rate": 3.4925000000000004e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.8561728596687317, + "step": 1397 + }, + { + "epoch": 0.699, + "grad_norm": 2.3276650976832793, + "learning_rate": 3.495e-06, + "loss": 0.2954, + "mean_token_accuracy": 0.9035466313362122, + "step": 1398 + }, + { + "epoch": 0.6995, + "grad_norm": 2.3635289561972637, + "learning_rate": 3.4975000000000004e-06, + "loss": 0.5518, + "mean_token_accuracy": 0.8314606547355652, + "step": 1399 + }, + { + "epoch": 0.7, + "grad_norm": 2.220682625567123, + "learning_rate": 3.5e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8775296807289124, + "step": 1400 + }, + { + "epoch": 0.7005, + "grad_norm": 3.6425616917272414, + "learning_rate": 3.5025000000000003e-06, + "loss": 0.5217, + "mean_token_accuracy": 0.8501908183097839, + "step": 1401 + }, + { + "epoch": 0.701, + "grad_norm": 2.977050270304088, + "learning_rate": 3.505e-06, + "loss": 0.5152, + "mean_token_accuracy": 0.8461236357688904, + "step": 1402 + }, + { + "epoch": 0.7015, + "grad_norm": 4.366480755177015, + "learning_rate": 3.5075000000000003e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8640350699424744, + "step": 1403 + }, + { + "epoch": 0.702, + "grad_norm": 2.27450398329371, + "learning_rate": 3.5100000000000003e-06, + "loss": 0.3093, + "mean_token_accuracy": 0.8966509103775024, + "step": 1404 + }, + { + "epoch": 0.7025, + "grad_norm": 2.8775216338883434, + "learning_rate": 3.5125000000000003e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.901096761226654, + "step": 1405 + }, + { + "epoch": 0.703, + "grad_norm": 2.748489127800668, + "learning_rate": 3.5150000000000002e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8531616926193237, + "step": 1406 + }, + { + "epoch": 0.7035, + "grad_norm": 3.059064381525762, + "learning_rate": 3.5175e-06, + "loss": 0.6096, + "mean_token_accuracy": 0.8203098177909851, + "step": 1407 + }, + { + "epoch": 0.704, + "grad_norm": 4.607136765815474, + "learning_rate": 3.52e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8612365126609802, + "step": 1408 + }, + { + "epoch": 0.7045, + "grad_norm": 9.514098476792547, + "learning_rate": 3.5225e-06, + "loss": 0.2737, + "mean_token_accuracy": 0.9069806337356567, + "step": 1409 + }, + { + "epoch": 0.705, + "grad_norm": 4.312318006835822, + "learning_rate": 3.525e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8630231618881226, + "step": 1410 + }, + { + "epoch": 0.7055, + "grad_norm": 5.257684602094953, + "learning_rate": 3.5275000000000005e-06, + "loss": 0.2592, + "mean_token_accuracy": 0.9105513691902161, + "step": 1411 + }, + { + "epoch": 0.706, + "grad_norm": 3.22891369467915, + "learning_rate": 3.53e-06, + "loss": 0.2984, + "mean_token_accuracy": 0.9004276990890503, + "step": 1412 + }, + { + "epoch": 0.7065, + "grad_norm": 2.6779310605985613, + "learning_rate": 3.5325000000000005e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8694934844970703, + "step": 1413 + }, + { + "epoch": 0.707, + "grad_norm": 2.1659643088016916, + "learning_rate": 3.535e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8712759017944336, + "step": 1414 + }, + { + "epoch": 0.7075, + "grad_norm": 2.089013084363839, + "learning_rate": 3.5375000000000004e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8924911618232727, + "step": 1415 + }, + { + "epoch": 0.708, + "grad_norm": 2.552803014298446, + "learning_rate": 3.54e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8758760094642639, + "step": 1416 + }, + { + "epoch": 0.7085, + "grad_norm": 1.9530628035169746, + "learning_rate": 3.5425000000000004e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.871886670589447, + "step": 1417 + }, + { + "epoch": 0.709, + "grad_norm": 2.393887206433259, + "learning_rate": 3.545e-06, + "loss": 0.6839, + "mean_token_accuracy": 0.7996843457221985, + "step": 1418 + }, + { + "epoch": 0.7095, + "grad_norm": 1.8496958393845317, + "learning_rate": 3.5475000000000003e-06, + "loss": 0.2097, + "mean_token_accuracy": 0.92457515001297, + "step": 1419 + }, + { + "epoch": 0.71, + "grad_norm": 1.9439335266291906, + "learning_rate": 3.5500000000000003e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8861937522888184, + "step": 1420 + }, + { + "epoch": 0.7105, + "grad_norm": 2.5114945344623627, + "learning_rate": 3.5525000000000003e-06, + "loss": 0.5312, + "mean_token_accuracy": 0.8369489312171936, + "step": 1421 + }, + { + "epoch": 0.711, + "grad_norm": 2.385586439036041, + "learning_rate": 3.5550000000000003e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8512559533119202, + "step": 1422 + }, + { + "epoch": 0.7115, + "grad_norm": 3.4526108170559904, + "learning_rate": 3.5575000000000002e-06, + "loss": 0.5251, + "mean_token_accuracy": 0.839883029460907, + "step": 1423 + }, + { + "epoch": 0.712, + "grad_norm": 3.9935037033795, + "learning_rate": 3.5600000000000002e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8657376766204834, + "step": 1424 + }, + { + "epoch": 0.7125, + "grad_norm": 2.9871540533976546, + "learning_rate": 3.5625e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8797934055328369, + "step": 1425 + }, + { + "epoch": 0.713, + "grad_norm": 2.4036118739703958, + "learning_rate": 3.565e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8643150329589844, + "step": 1426 + }, + { + "epoch": 0.7135, + "grad_norm": 2.0657499164914026, + "learning_rate": 3.5675000000000006e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8884952664375305, + "step": 1427 + }, + { + "epoch": 0.714, + "grad_norm": 4.329203262634133, + "learning_rate": 3.57e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8813016414642334, + "step": 1428 + }, + { + "epoch": 0.7145, + "grad_norm": 2.0780664940324614, + "learning_rate": 3.5725000000000005e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.8488761186599731, + "step": 1429 + }, + { + "epoch": 0.715, + "grad_norm": 2.229277039758071, + "learning_rate": 3.575e-06, + "loss": 0.2872, + "mean_token_accuracy": 0.907979428768158, + "step": 1430 + }, + { + "epoch": 0.7155, + "grad_norm": 4.346935234830472, + "learning_rate": 3.5775000000000005e-06, + "loss": 0.6155, + "mean_token_accuracy": 0.8107013702392578, + "step": 1431 + }, + { + "epoch": 0.716, + "grad_norm": 3.042220153630424, + "learning_rate": 3.58e-06, + "loss": 0.6489, + "mean_token_accuracy": 0.8100080490112305, + "step": 1432 + }, + { + "epoch": 0.7165, + "grad_norm": 4.260568787128246, + "learning_rate": 3.5825000000000004e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8691844344139099, + "step": 1433 + }, + { + "epoch": 0.717, + "grad_norm": 2.2170788591644044, + "learning_rate": 3.585e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8927587866783142, + "step": 1434 + }, + { + "epoch": 0.7175, + "grad_norm": 3.17375048820345, + "learning_rate": 3.5875000000000004e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8804766535758972, + "step": 1435 + }, + { + "epoch": 0.718, + "grad_norm": 2.022038721895601, + "learning_rate": 3.5900000000000004e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.888580858707428, + "step": 1436 + }, + { + "epoch": 0.7185, + "grad_norm": 2.4201402600016997, + "learning_rate": 3.5925000000000003e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8572757840156555, + "step": 1437 + }, + { + "epoch": 0.719, + "grad_norm": 12.295507194490717, + "learning_rate": 3.5950000000000003e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8796331882476807, + "step": 1438 + }, + { + "epoch": 0.7195, + "grad_norm": 3.139452700664637, + "learning_rate": 3.5975000000000003e-06, + "loss": 0.6629, + "mean_token_accuracy": 0.8181962966918945, + "step": 1439 + }, + { + "epoch": 0.72, + "grad_norm": 2.181892162802441, + "learning_rate": 3.6000000000000003e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8958295583724976, + "step": 1440 + }, + { + "epoch": 0.7205, + "grad_norm": 1.9597308166413892, + "learning_rate": 3.6025000000000002e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8931243419647217, + "step": 1441 + }, + { + "epoch": 0.721, + "grad_norm": 2.174820840284989, + "learning_rate": 3.6050000000000002e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8860185146331787, + "step": 1442 + }, + { + "epoch": 0.7215, + "grad_norm": 2.4120576249417467, + "learning_rate": 3.6075000000000006e-06, + "loss": 0.6288, + "mean_token_accuracy": 0.8215521574020386, + "step": 1443 + }, + { + "epoch": 0.722, + "grad_norm": 2.6695661273956306, + "learning_rate": 3.61e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8699759244918823, + "step": 1444 + }, + { + "epoch": 0.7225, + "grad_norm": 2.2089928471637355, + "learning_rate": 3.6125000000000006e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8788276314735413, + "step": 1445 + }, + { + "epoch": 0.723, + "grad_norm": 2.0898993761092814, + "learning_rate": 3.615e-06, + "loss": 0.5112, + "mean_token_accuracy": 0.8426966071128845, + "step": 1446 + }, + { + "epoch": 0.7235, + "grad_norm": 2.8506115851552023, + "learning_rate": 3.6175000000000005e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8889131546020508, + "step": 1447 + }, + { + "epoch": 0.724, + "grad_norm": 2.4555690429495325, + "learning_rate": 3.62e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8814387917518616, + "step": 1448 + }, + { + "epoch": 0.7245, + "grad_norm": 2.1158743996343388, + "learning_rate": 3.6225000000000005e-06, + "loss": 0.2795, + "mean_token_accuracy": 0.9006913900375366, + "step": 1449 + }, + { + "epoch": 0.725, + "grad_norm": 4.62902163101194, + "learning_rate": 3.625e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8620892763137817, + "step": 1450 + }, + { + "epoch": 0.7255, + "grad_norm": 2.3742824002976417, + "learning_rate": 3.6275000000000004e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8647568225860596, + "step": 1451 + }, + { + "epoch": 0.726, + "grad_norm": 4.669896660372106, + "learning_rate": 3.6300000000000004e-06, + "loss": 0.2827, + "mean_token_accuracy": 0.899402379989624, + "step": 1452 + }, + { + "epoch": 0.7265, + "grad_norm": 3.172431749900365, + "learning_rate": 3.6325000000000004e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8860647082328796, + "step": 1453 + }, + { + "epoch": 0.727, + "grad_norm": 4.808323613329563, + "learning_rate": 3.6350000000000003e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8880327939987183, + "step": 1454 + }, + { + "epoch": 0.7275, + "grad_norm": 1.9617862144638705, + "learning_rate": 3.6375000000000003e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8922946453094482, + "step": 1455 + }, + { + "epoch": 0.728, + "grad_norm": 6.14483717100214, + "learning_rate": 3.6400000000000003e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8596434593200684, + "step": 1456 + }, + { + "epoch": 0.7285, + "grad_norm": 6.610280833236898, + "learning_rate": 3.6425000000000003e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.9027643799781799, + "step": 1457 + }, + { + "epoch": 0.729, + "grad_norm": 2.0109253699057503, + "learning_rate": 3.6450000000000003e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8689582943916321, + "step": 1458 + }, + { + "epoch": 0.7295, + "grad_norm": 2.3876592210040286, + "learning_rate": 3.6475000000000007e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8716075420379639, + "step": 1459 + }, + { + "epoch": 0.73, + "grad_norm": 2.5311035228243326, + "learning_rate": 3.65e-06, + "loss": 0.5252, + "mean_token_accuracy": 0.8385888338088989, + "step": 1460 + }, + { + "epoch": 0.7305, + "grad_norm": 2.0316621559182506, + "learning_rate": 3.6525000000000006e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8599210381507874, + "step": 1461 + }, + { + "epoch": 0.731, + "grad_norm": 1.8708561582995835, + "learning_rate": 3.655e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8641815781593323, + "step": 1462 + }, + { + "epoch": 0.7315, + "grad_norm": 2.183969812878311, + "learning_rate": 3.6575000000000006e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8874632120132446, + "step": 1463 + }, + { + "epoch": 0.732, + "grad_norm": 2.11579057027429, + "learning_rate": 3.66e-06, + "loss": 0.2823, + "mean_token_accuracy": 0.9060373306274414, + "step": 1464 + }, + { + "epoch": 0.7325, + "grad_norm": 2.0777993089858486, + "learning_rate": 3.6625000000000005e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8888700604438782, + "step": 1465 + }, + { + "epoch": 0.733, + "grad_norm": 3.0605768259088393, + "learning_rate": 3.665e-06, + "loss": 0.5383, + "mean_token_accuracy": 0.8395341634750366, + "step": 1466 + }, + { + "epoch": 0.7335, + "grad_norm": 5.31977002291471, + "learning_rate": 3.6675000000000005e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8609788417816162, + "step": 1467 + }, + { + "epoch": 0.734, + "grad_norm": 6.973318717474127, + "learning_rate": 3.6700000000000004e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8609133362770081, + "step": 1468 + }, + { + "epoch": 0.7345, + "grad_norm": 2.1980476807006513, + "learning_rate": 3.6725000000000004e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8825852870941162, + "step": 1469 + }, + { + "epoch": 0.735, + "grad_norm": 2.6507360519040204, + "learning_rate": 3.6750000000000004e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8797431588172913, + "step": 1470 + }, + { + "epoch": 0.7355, + "grad_norm": 2.5831995975548256, + "learning_rate": 3.6775000000000004e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8602347373962402, + "step": 1471 + }, + { + "epoch": 0.736, + "grad_norm": 7.22351301980173, + "learning_rate": 3.6800000000000003e-06, + "loss": 0.5478, + "mean_token_accuracy": 0.838704526424408, + "step": 1472 + }, + { + "epoch": 0.7365, + "grad_norm": 2.3318915157538997, + "learning_rate": 3.6825000000000003e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8810717463493347, + "step": 1473 + }, + { + "epoch": 0.737, + "grad_norm": 30.78592453769807, + "learning_rate": 3.6850000000000003e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.8741024136543274, + "step": 1474 + }, + { + "epoch": 0.7375, + "grad_norm": 2.7599665698349045, + "learning_rate": 3.6875000000000007e-06, + "loss": 0.521, + "mean_token_accuracy": 0.8542641401290894, + "step": 1475 + }, + { + "epoch": 0.738, + "grad_norm": 2.1222528790477533, + "learning_rate": 3.6900000000000002e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8604471683502197, + "step": 1476 + }, + { + "epoch": 0.7385, + "grad_norm": 2.4423762968829994, + "learning_rate": 3.6925000000000006e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8872944116592407, + "step": 1477 + }, + { + "epoch": 0.739, + "grad_norm": 2.5102014404518838, + "learning_rate": 3.695e-06, + "loss": 0.566, + "mean_token_accuracy": 0.8391625881195068, + "step": 1478 + }, + { + "epoch": 0.7395, + "grad_norm": 2.1654789392441907, + "learning_rate": 3.6975000000000006e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.887033224105835, + "step": 1479 + }, + { + "epoch": 0.74, + "grad_norm": 2.8502097909422566, + "learning_rate": 3.7e-06, + "loss": 0.2649, + "mean_token_accuracy": 0.9111111164093018, + "step": 1480 + }, + { + "epoch": 0.7405, + "grad_norm": 2.110342811545499, + "learning_rate": 3.7025000000000005e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8543312549591064, + "step": 1481 + }, + { + "epoch": 0.741, + "grad_norm": 3.1781421549316233, + "learning_rate": 3.705e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8771862387657166, + "step": 1482 + }, + { + "epoch": 0.7415, + "grad_norm": 4.243012904769045, + "learning_rate": 3.7075000000000005e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8679983615875244, + "step": 1483 + }, + { + "epoch": 0.742, + "grad_norm": 2.450243681005131, + "learning_rate": 3.7100000000000005e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8794994950294495, + "step": 1484 + }, + { + "epoch": 0.7425, + "grad_norm": 2.8121056735455325, + "learning_rate": 3.7125000000000005e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8468451499938965, + "step": 1485 + }, + { + "epoch": 0.743, + "grad_norm": 2.0990942205365166, + "learning_rate": 3.7150000000000004e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8637362718582153, + "step": 1486 + }, + { + "epoch": 0.7435, + "grad_norm": 2.18918560019555, + "learning_rate": 3.7175000000000004e-06, + "loss": 0.5196, + "mean_token_accuracy": 0.8457127809524536, + "step": 1487 + }, + { + "epoch": 0.744, + "grad_norm": 3.3917427016726944, + "learning_rate": 3.7200000000000004e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8515739440917969, + "step": 1488 + }, + { + "epoch": 0.7445, + "grad_norm": 3.1864700074768306, + "learning_rate": 3.7225000000000004e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8561887741088867, + "step": 1489 + }, + { + "epoch": 0.745, + "grad_norm": 2.823367109959597, + "learning_rate": 3.7250000000000003e-06, + "loss": 0.6037, + "mean_token_accuracy": 0.8254858255386353, + "step": 1490 + }, + { + "epoch": 0.7455, + "grad_norm": 2.89158864101404, + "learning_rate": 3.7275000000000007e-06, + "loss": 0.6376, + "mean_token_accuracy": 0.8115237951278687, + "step": 1491 + }, + { + "epoch": 0.746, + "grad_norm": 2.14796541898312, + "learning_rate": 3.7300000000000003e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8804841041564941, + "step": 1492 + }, + { + "epoch": 0.7465, + "grad_norm": 3.5303333210199725, + "learning_rate": 3.7325000000000007e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.864581286907196, + "step": 1493 + }, + { + "epoch": 0.747, + "grad_norm": 5.951140700163329, + "learning_rate": 3.7350000000000002e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8947572112083435, + "step": 1494 + }, + { + "epoch": 0.7475, + "grad_norm": 3.43636791958162, + "learning_rate": 3.7375000000000006e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8798243999481201, + "step": 1495 + }, + { + "epoch": 0.748, + "grad_norm": 8.047089567131325, + "learning_rate": 3.74e-06, + "loss": 0.286, + "mean_token_accuracy": 0.9025959968566895, + "step": 1496 + }, + { + "epoch": 0.7485, + "grad_norm": 2.263571486217934, + "learning_rate": 3.7425000000000006e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8317795991897583, + "step": 1497 + }, + { + "epoch": 0.749, + "grad_norm": 2.3594389498503308, + "learning_rate": 3.745e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8806546330451965, + "step": 1498 + }, + { + "epoch": 0.7495, + "grad_norm": 2.5282285999620693, + "learning_rate": 3.7475000000000005e-06, + "loss": 0.754, + "mean_token_accuracy": 0.8100041747093201, + "step": 1499 + }, + { + "epoch": 0.75, + "grad_norm": 6.058991554158672, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.85367751121521, + "step": 1500 + }, + { + "epoch": 0.7505, + "grad_norm": 2.103861767705487, + "learning_rate": 3.7525e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8837371468544006, + "step": 1501 + }, + { + "epoch": 0.751, + "grad_norm": 2.3786788553251115, + "learning_rate": 3.7550000000000005e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8846225142478943, + "step": 1502 + }, + { + "epoch": 0.7515, + "grad_norm": 2.3173535956161255, + "learning_rate": 3.7575e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8792832493782043, + "step": 1503 + }, + { + "epoch": 0.752, + "grad_norm": 3.7190646398964673, + "learning_rate": 3.7600000000000004e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8710847496986389, + "step": 1504 + }, + { + "epoch": 0.7525, + "grad_norm": 3.450783704270027, + "learning_rate": 3.7625e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.858697772026062, + "step": 1505 + }, + { + "epoch": 0.753, + "grad_norm": 2.0130603552969877, + "learning_rate": 3.7650000000000004e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8775087594985962, + "step": 1506 + }, + { + "epoch": 0.7535, + "grad_norm": 2.580401726231149, + "learning_rate": 3.7675e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.8441897034645081, + "step": 1507 + }, + { + "epoch": 0.754, + "grad_norm": 2.7162220195797837, + "learning_rate": 3.7700000000000003e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8922155499458313, + "step": 1508 + }, + { + "epoch": 0.7545, + "grad_norm": 5.220494481094, + "learning_rate": 3.7725e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8645833134651184, + "step": 1509 + }, + { + "epoch": 0.755, + "grad_norm": 2.5195436205061537, + "learning_rate": 3.7750000000000003e-06, + "loss": 0.5117, + "mean_token_accuracy": 0.8416008353233337, + "step": 1510 + }, + { + "epoch": 0.7555, + "grad_norm": 3.014453671238699, + "learning_rate": 3.7775000000000003e-06, + "loss": 0.497, + "mean_token_accuracy": 0.8593775629997253, + "step": 1511 + }, + { + "epoch": 0.756, + "grad_norm": 5.224288116723025, + "learning_rate": 3.7800000000000002e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8835780024528503, + "step": 1512 + }, + { + "epoch": 0.7565, + "grad_norm": 2.442175566729029, + "learning_rate": 3.7825e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8704000115394592, + "step": 1513 + }, + { + "epoch": 0.757, + "grad_norm": 2.3526551550782813, + "learning_rate": 3.785e-06, + "loss": 0.5274, + "mean_token_accuracy": 0.837217390537262, + "step": 1514 + }, + { + "epoch": 0.7575, + "grad_norm": 2.3088418013890277, + "learning_rate": 3.7875e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8832512497901917, + "step": 1515 + }, + { + "epoch": 0.758, + "grad_norm": 3.309635882304442, + "learning_rate": 3.79e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8962185382843018, + "step": 1516 + }, + { + "epoch": 0.7585, + "grad_norm": 4.013118910134506, + "learning_rate": 3.7925e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.9026477932929993, + "step": 1517 + }, + { + "epoch": 0.759, + "grad_norm": 6.19329638926052, + "learning_rate": 3.7950000000000005e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.873423159122467, + "step": 1518 + }, + { + "epoch": 0.7595, + "grad_norm": 3.7372559826361527, + "learning_rate": 3.7975e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8816612958908081, + "step": 1519 + }, + { + "epoch": 0.76, + "grad_norm": 2.2030674276049593, + "learning_rate": 3.8000000000000005e-06, + "loss": 0.2724, + "mean_token_accuracy": 0.9142740964889526, + "step": 1520 + }, + { + "epoch": 0.7605, + "grad_norm": 2.2348765191437114, + "learning_rate": 3.8025e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8864107728004456, + "step": 1521 + }, + { + "epoch": 0.761, + "grad_norm": 2.1784355254750514, + "learning_rate": 3.8050000000000004e-06, + "loss": 0.2834, + "mean_token_accuracy": 0.8961488008499146, + "step": 1522 + }, + { + "epoch": 0.7615, + "grad_norm": 2.6866487750639547, + "learning_rate": 3.8075e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.852977991104126, + "step": 1523 + }, + { + "epoch": 0.762, + "grad_norm": 3.2515339378952586, + "learning_rate": 3.8100000000000004e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8706015944480896, + "step": 1524 + }, + { + "epoch": 0.7625, + "grad_norm": 3.0075483989111746, + "learning_rate": 3.8125e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.849743127822876, + "step": 1525 + }, + { + "epoch": 0.763, + "grad_norm": 3.8209265627803566, + "learning_rate": 3.815000000000001e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8734495639801025, + "step": 1526 + }, + { + "epoch": 0.7635, + "grad_norm": 2.5804134617290897, + "learning_rate": 3.8175e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8731752038002014, + "step": 1527 + }, + { + "epoch": 0.764, + "grad_norm": 2.0993095388729133, + "learning_rate": 3.820000000000001e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8632955551147461, + "step": 1528 + }, + { + "epoch": 0.7645, + "grad_norm": 2.996963837136526, + "learning_rate": 3.8225e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8634557127952576, + "step": 1529 + }, + { + "epoch": 0.765, + "grad_norm": 2.697892352959782, + "learning_rate": 3.825000000000001e-06, + "loss": 0.5336, + "mean_token_accuracy": 0.8333069086074829, + "step": 1530 + }, + { + "epoch": 0.7655, + "grad_norm": 2.591590406142509, + "learning_rate": 3.8275e-06, + "loss": 0.5585, + "mean_token_accuracy": 0.8361491560935974, + "step": 1531 + }, + { + "epoch": 0.766, + "grad_norm": 8.107057898018011, + "learning_rate": 3.830000000000001e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.8437356948852539, + "step": 1532 + }, + { + "epoch": 0.7665, + "grad_norm": 2.850254050993852, + "learning_rate": 3.8325e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.87128084897995, + "step": 1533 + }, + { + "epoch": 0.767, + "grad_norm": 3.6224486345790536, + "learning_rate": 3.8350000000000006e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.895503580570221, + "step": 1534 + }, + { + "epoch": 0.7675, + "grad_norm": 2.235104873153351, + "learning_rate": 3.8375e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8553125262260437, + "step": 1535 + }, + { + "epoch": 0.768, + "grad_norm": 8.132317389355102, + "learning_rate": 3.8400000000000005e-06, + "loss": 0.2668, + "mean_token_accuracy": 0.9115004539489746, + "step": 1536 + }, + { + "epoch": 0.7685, + "grad_norm": 5.536770844227113, + "learning_rate": 3.8425e-06, + "loss": 0.5267, + "mean_token_accuracy": 0.8353776335716248, + "step": 1537 + }, + { + "epoch": 0.769, + "grad_norm": 2.3960086196044714, + "learning_rate": 3.8450000000000005e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8596834540367126, + "step": 1538 + }, + { + "epoch": 0.7695, + "grad_norm": 4.596233675626708, + "learning_rate": 3.8475e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8659754395484924, + "step": 1539 + }, + { + "epoch": 0.77, + "grad_norm": 2.401539538727794, + "learning_rate": 3.85e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8677165508270264, + "step": 1540 + }, + { + "epoch": 0.7705, + "grad_norm": 3.8197099572691378, + "learning_rate": 3.8525e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8715246319770813, + "step": 1541 + }, + { + "epoch": 0.771, + "grad_norm": 5.7455800343860295, + "learning_rate": 3.855e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.9020859003067017, + "step": 1542 + }, + { + "epoch": 0.7715, + "grad_norm": 2.1284163642096905, + "learning_rate": 3.8575e-06, + "loss": 0.3147, + "mean_token_accuracy": 0.8985125422477722, + "step": 1543 + }, + { + "epoch": 0.772, + "grad_norm": 4.46280394648305, + "learning_rate": 3.86e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8574970960617065, + "step": 1544 + }, + { + "epoch": 0.7725, + "grad_norm": 2.339496772784042, + "learning_rate": 3.8625e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8775352239608765, + "step": 1545 + }, + { + "epoch": 0.773, + "grad_norm": 2.55657553282144, + "learning_rate": 3.865e-06, + "loss": 0.2435, + "mean_token_accuracy": 0.9163300395011902, + "step": 1546 + }, + { + "epoch": 0.7735, + "grad_norm": 1.8048504791502191, + "learning_rate": 3.8675e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8778125643730164, + "step": 1547 + }, + { + "epoch": 0.774, + "grad_norm": 23.416076195531325, + "learning_rate": 3.87e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8533180356025696, + "step": 1548 + }, + { + "epoch": 0.7745, + "grad_norm": 2.459374339510323, + "learning_rate": 3.8725e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8795648813247681, + "step": 1549 + }, + { + "epoch": 0.775, + "grad_norm": 4.685317652106017, + "learning_rate": 3.875e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8876349925994873, + "step": 1550 + }, + { + "epoch": 0.7755, + "grad_norm": 2.2111163600594743, + "learning_rate": 3.8775000000000006e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.8486335873603821, + "step": 1551 + }, + { + "epoch": 0.776, + "grad_norm": 7.638230079312359, + "learning_rate": 3.88e-06, + "loss": 0.5285, + "mean_token_accuracy": 0.8377785086631775, + "step": 1552 + }, + { + "epoch": 0.7765, + "grad_norm": 2.812730253145382, + "learning_rate": 3.8825000000000005e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8443281054496765, + "step": 1553 + }, + { + "epoch": 0.777, + "grad_norm": 12.385967621413602, + "learning_rate": 3.885e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8739721179008484, + "step": 1554 + }, + { + "epoch": 0.7775, + "grad_norm": 4.18853529134691, + "learning_rate": 3.8875000000000005e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8823529481887817, + "step": 1555 + }, + { + "epoch": 0.778, + "grad_norm": 7.611286786535677, + "learning_rate": 3.89e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8585271239280701, + "step": 1556 + }, + { + "epoch": 0.7785, + "grad_norm": 2.328773268207072, + "learning_rate": 3.8925000000000004e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8739640712738037, + "step": 1557 + }, + { + "epoch": 0.779, + "grad_norm": 3.2103776518006035, + "learning_rate": 3.895000000000001e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8828796744346619, + "step": 1558 + }, + { + "epoch": 0.7795, + "grad_norm": 3.178521656498292, + "learning_rate": 3.8975e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8802233338356018, + "step": 1559 + }, + { + "epoch": 0.78, + "grad_norm": 2.45813874454054, + "learning_rate": 3.900000000000001e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8782597184181213, + "step": 1560 + }, + { + "epoch": 0.7805, + "grad_norm": 3.5195828812560563, + "learning_rate": 3.9025e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.9013645052909851, + "step": 1561 + }, + { + "epoch": 0.781, + "grad_norm": 3.3340863261162186, + "learning_rate": 3.905000000000001e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8731527328491211, + "step": 1562 + }, + { + "epoch": 0.7815, + "grad_norm": 2.418379700994461, + "learning_rate": 3.9075e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8704034090042114, + "step": 1563 + }, + { + "epoch": 0.782, + "grad_norm": 2.5933035501387796, + "learning_rate": 3.910000000000001e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.8382198810577393, + "step": 1564 + }, + { + "epoch": 0.7825, + "grad_norm": 2.507166470489615, + "learning_rate": 3.9125e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8712198734283447, + "step": 1565 + }, + { + "epoch": 0.783, + "grad_norm": 2.3504283872591727, + "learning_rate": 3.915000000000001e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8528369069099426, + "step": 1566 + }, + { + "epoch": 0.7835, + "grad_norm": 2.1546138769580185, + "learning_rate": 3.9175e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8778590559959412, + "step": 1567 + }, + { + "epoch": 0.784, + "grad_norm": 5.964113841840237, + "learning_rate": 3.920000000000001e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8877447843551636, + "step": 1568 + }, + { + "epoch": 0.7845, + "grad_norm": 3.894012932197344, + "learning_rate": 3.9225e-06, + "loss": 0.6222, + "mean_token_accuracy": 0.8217443823814392, + "step": 1569 + }, + { + "epoch": 0.785, + "grad_norm": 2.016410841392547, + "learning_rate": 3.9250000000000005e-06, + "loss": 0.3033, + "mean_token_accuracy": 0.9058205485343933, + "step": 1570 + }, + { + "epoch": 0.7855, + "grad_norm": 4.8960416293774145, + "learning_rate": 3.9275e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8814076781272888, + "step": 1571 + }, + { + "epoch": 0.786, + "grad_norm": 2.5418920179023146, + "learning_rate": 3.9300000000000005e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8683092594146729, + "step": 1572 + }, + { + "epoch": 0.7865, + "grad_norm": 2.773510279601437, + "learning_rate": 3.9325e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8804177641868591, + "step": 1573 + }, + { + "epoch": 0.787, + "grad_norm": 2.9789883237737054, + "learning_rate": 3.9350000000000004e-06, + "loss": 0.6285, + "mean_token_accuracy": 0.8184956312179565, + "step": 1574 + }, + { + "epoch": 0.7875, + "grad_norm": 1.8317493182992777, + "learning_rate": 3.9375e-06, + "loss": 0.2866, + "mean_token_accuracy": 0.8887179493904114, + "step": 1575 + }, + { + "epoch": 0.788, + "grad_norm": 4.119476347485589, + "learning_rate": 3.94e-06, + "loss": 0.5126, + "mean_token_accuracy": 0.8408685326576233, + "step": 1576 + }, + { + "epoch": 0.7885, + "grad_norm": 2.333261758146459, + "learning_rate": 3.9425e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.881028950214386, + "step": 1577 + }, + { + "epoch": 0.789, + "grad_norm": 7.196422962299612, + "learning_rate": 3.945e-06, + "loss": 0.424, + "mean_token_accuracy": 0.87531578540802, + "step": 1578 + }, + { + "epoch": 0.7895, + "grad_norm": 2.0778619460735963, + "learning_rate": 3.9475e-06, + "loss": 0.5232, + "mean_token_accuracy": 0.8396719098091125, + "step": 1579 + }, + { + "epoch": 0.79, + "grad_norm": 2.6294763867291113, + "learning_rate": 3.95e-06, + "loss": 0.533, + "mean_token_accuracy": 0.8491111397743225, + "step": 1580 + }, + { + "epoch": 0.7905, + "grad_norm": 2.027995103950123, + "learning_rate": 3.9525e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8783042430877686, + "step": 1581 + }, + { + "epoch": 0.791, + "grad_norm": 4.706296980115798, + "learning_rate": 3.955e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8581327795982361, + "step": 1582 + }, + { + "epoch": 0.7915, + "grad_norm": 2.528020993985161, + "learning_rate": 3.957500000000001e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8793503642082214, + "step": 1583 + }, + { + "epoch": 0.792, + "grad_norm": 2.2475272706307856, + "learning_rate": 3.96e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8946061134338379, + "step": 1584 + }, + { + "epoch": 0.7925, + "grad_norm": 1.7095587810235584, + "learning_rate": 3.962500000000001e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8819742202758789, + "step": 1585 + }, + { + "epoch": 0.793, + "grad_norm": 2.62509277618106, + "learning_rate": 3.965e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8564384579658508, + "step": 1586 + }, + { + "epoch": 0.7935, + "grad_norm": 3.966064370813628, + "learning_rate": 3.9675000000000006e-06, + "loss": 0.5713, + "mean_token_accuracy": 0.8428518176078796, + "step": 1587 + }, + { + "epoch": 0.794, + "grad_norm": 3.8010570737501572, + "learning_rate": 3.97e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8830247521400452, + "step": 1588 + }, + { + "epoch": 0.7945, + "grad_norm": 2.2430573790571695, + "learning_rate": 3.9725000000000005e-06, + "loss": 0.3003, + "mean_token_accuracy": 0.8998247385025024, + "step": 1589 + }, + { + "epoch": 0.795, + "grad_norm": 1.8331360866221458, + "learning_rate": 3.975000000000001e-06, + "loss": 0.3001, + "mean_token_accuracy": 0.8983799815177917, + "step": 1590 + }, + { + "epoch": 0.7955, + "grad_norm": 4.104092152573327, + "learning_rate": 3.9775000000000005e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8699592351913452, + "step": 1591 + }, + { + "epoch": 0.796, + "grad_norm": 3.216348329131107, + "learning_rate": 3.980000000000001e-06, + "loss": 0.3019, + "mean_token_accuracy": 0.9006891250610352, + "step": 1592 + }, + { + "epoch": 0.7965, + "grad_norm": 2.714793966047475, + "learning_rate": 3.9825e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8856443762779236, + "step": 1593 + }, + { + "epoch": 0.797, + "grad_norm": 5.026632726052353, + "learning_rate": 3.985000000000001e-06, + "loss": 0.517, + "mean_token_accuracy": 0.8419926762580872, + "step": 1594 + }, + { + "epoch": 0.7975, + "grad_norm": 7.1859753016972485, + "learning_rate": 3.9875e-06, + "loss": 0.5519, + "mean_token_accuracy": 0.8481370210647583, + "step": 1595 + }, + { + "epoch": 0.798, + "grad_norm": 2.3363957297176663, + "learning_rate": 3.990000000000001e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8634076118469238, + "step": 1596 + }, + { + "epoch": 0.7985, + "grad_norm": 2.841894661167153, + "learning_rate": 3.9925e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8640504479408264, + "step": 1597 + }, + { + "epoch": 0.799, + "grad_norm": 2.3926248887496824, + "learning_rate": 3.995000000000001e-06, + "loss": 0.2812, + "mean_token_accuracy": 0.9029639363288879, + "step": 1598 + }, + { + "epoch": 0.7995, + "grad_norm": 2.1508405732121387, + "learning_rate": 3.9975e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.884483277797699, + "step": 1599 + }, + { + "epoch": 0.8, + "grad_norm": 4.946905853637683, + "learning_rate": 4.000000000000001e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8685661554336548, + "step": 1600 + }, + { + "epoch": 0.8005, + "grad_norm": 2.299175344082755, + "learning_rate": 4.0025e-06, + "loss": 0.2434, + "mean_token_accuracy": 0.9185154438018799, + "step": 1601 + }, + { + "epoch": 0.801, + "grad_norm": 2.9292558797645296, + "learning_rate": 4.005000000000001e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8657236695289612, + "step": 1602 + }, + { + "epoch": 0.8015, + "grad_norm": 2.848462880933968, + "learning_rate": 4.0075e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.891566276550293, + "step": 1603 + }, + { + "epoch": 0.802, + "grad_norm": 2.0968153155924334, + "learning_rate": 4.0100000000000006e-06, + "loss": 0.2472, + "mean_token_accuracy": 0.9127907156944275, + "step": 1604 + }, + { + "epoch": 0.8025, + "grad_norm": 38.265641807969565, + "learning_rate": 4.0125e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.8432098627090454, + "step": 1605 + }, + { + "epoch": 0.803, + "grad_norm": 7.758471987694449, + "learning_rate": 4.0150000000000005e-06, + "loss": 0.5514, + "mean_token_accuracy": 0.8300057649612427, + "step": 1606 + }, + { + "epoch": 0.8035, + "grad_norm": 2.8450619258441945, + "learning_rate": 4.0175e-06, + "loss": 0.6649, + "mean_token_accuracy": 0.7955158352851868, + "step": 1607 + }, + { + "epoch": 0.804, + "grad_norm": 25.75407054049612, + "learning_rate": 4.0200000000000005e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8727027177810669, + "step": 1608 + }, + { + "epoch": 0.8045, + "grad_norm": 3.39010345278202, + "learning_rate": 4.0225e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8625320792198181, + "step": 1609 + }, + { + "epoch": 0.805, + "grad_norm": 4.398866904087669, + "learning_rate": 4.0250000000000004e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.8488286733627319, + "step": 1610 + }, + { + "epoch": 0.8055, + "grad_norm": 3.8573708090704604, + "learning_rate": 4.0275e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.8441539406776428, + "step": 1611 + }, + { + "epoch": 0.806, + "grad_norm": 1.9768381045616708, + "learning_rate": 4.03e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.888350248336792, + "step": 1612 + }, + { + "epoch": 0.8065, + "grad_norm": 4.04456862572682, + "learning_rate": 4.0325e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8982245922088623, + "step": 1613 + }, + { + "epoch": 0.807, + "grad_norm": 2.7841644368140384, + "learning_rate": 4.035e-06, + "loss": 0.2741, + "mean_token_accuracy": 0.9050841331481934, + "step": 1614 + }, + { + "epoch": 0.8075, + "grad_norm": 4.381062652728181, + "learning_rate": 4.037500000000001e-06, + "loss": 0.5997, + "mean_token_accuracy": 0.8279001712799072, + "step": 1615 + }, + { + "epoch": 0.808, + "grad_norm": 2.5688791312600405, + "learning_rate": 4.04e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8599432706832886, + "step": 1616 + }, + { + "epoch": 0.8085, + "grad_norm": 1.835328294255902, + "learning_rate": 4.042500000000001e-06, + "loss": 0.2842, + "mean_token_accuracy": 0.9090909361839294, + "step": 1617 + }, + { + "epoch": 0.809, + "grad_norm": 2.4261396865478204, + "learning_rate": 4.045e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8580752611160278, + "step": 1618 + }, + { + "epoch": 0.8095, + "grad_norm": 2.0120670194696864, + "learning_rate": 4.047500000000001e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8955650925636292, + "step": 1619 + }, + { + "epoch": 0.81, + "grad_norm": 4.878504719244862, + "learning_rate": 4.05e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8701691031455994, + "step": 1620 + }, + { + "epoch": 0.8105, + "grad_norm": 2.4495761508145497, + "learning_rate": 4.052500000000001e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8657074570655823, + "step": 1621 + }, + { + "epoch": 0.811, + "grad_norm": 2.5576274049552534, + "learning_rate": 4.055000000000001e-06, + "loss": 0.5442, + "mean_token_accuracy": 0.8378069400787354, + "step": 1622 + }, + { + "epoch": 0.8115, + "grad_norm": 3.252713858865844, + "learning_rate": 4.0575000000000005e-06, + "loss": 0.601, + "mean_token_accuracy": 0.8253855109214783, + "step": 1623 + }, + { + "epoch": 0.812, + "grad_norm": 3.312943145027146, + "learning_rate": 4.060000000000001e-06, + "loss": 0.2281, + "mean_token_accuracy": 0.9206106662750244, + "step": 1624 + }, + { + "epoch": 0.8125, + "grad_norm": 4.181888260190059, + "learning_rate": 4.0625000000000005e-06, + "loss": 0.5486, + "mean_token_accuracy": 0.8325034976005554, + "step": 1625 + }, + { + "epoch": 0.813, + "grad_norm": 2.119001441460793, + "learning_rate": 4.065e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8694362044334412, + "step": 1626 + }, + { + "epoch": 0.8135, + "grad_norm": 2.748227045221289, + "learning_rate": 4.0675000000000004e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8450119495391846, + "step": 1627 + }, + { + "epoch": 0.814, + "grad_norm": 3.2520533901684585, + "learning_rate": 4.07e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.8581360578536987, + "step": 1628 + }, + { + "epoch": 0.8145, + "grad_norm": 2.156695839741556, + "learning_rate": 4.0725e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.8973338007926941, + "step": 1629 + }, + { + "epoch": 0.815, + "grad_norm": 3.0690209223091203, + "learning_rate": 4.075e-06, + "loss": 0.2872, + "mean_token_accuracy": 0.9028137922286987, + "step": 1630 + }, + { + "epoch": 0.8155, + "grad_norm": 2.530343859076828, + "learning_rate": 4.0775e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8623453378677368, + "step": 1631 + }, + { + "epoch": 0.816, + "grad_norm": 1.714937341404861, + "learning_rate": 4.08e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8744620680809021, + "step": 1632 + }, + { + "epoch": 0.8165, + "grad_norm": 3.0232332173305996, + "learning_rate": 4.0825e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.882413387298584, + "step": 1633 + }, + { + "epoch": 0.817, + "grad_norm": 2.233687482734785, + "learning_rate": 4.085e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8700944781303406, + "step": 1634 + }, + { + "epoch": 0.8175, + "grad_norm": 3.6169338670639224, + "learning_rate": 4.0875e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8895328640937805, + "step": 1635 + }, + { + "epoch": 0.818, + "grad_norm": 5.747395549232406, + "learning_rate": 4.09e-06, + "loss": 0.525, + "mean_token_accuracy": 0.8374319076538086, + "step": 1636 + }, + { + "epoch": 0.8185, + "grad_norm": 2.725436265651893, + "learning_rate": 4.0925e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8560444712638855, + "step": 1637 + }, + { + "epoch": 0.819, + "grad_norm": 2.1096054025568325, + "learning_rate": 4.095e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8810487389564514, + "step": 1638 + }, + { + "epoch": 0.8195, + "grad_norm": 5.9216595098752745, + "learning_rate": 4.0975e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8588658571243286, + "step": 1639 + }, + { + "epoch": 0.82, + "grad_norm": 2.2541515991285586, + "learning_rate": 4.1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8818268179893494, + "step": 1640 + }, + { + "epoch": 0.8205, + "grad_norm": 2.977810165889736, + "learning_rate": 4.1025e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8911082744598389, + "step": 1641 + }, + { + "epoch": 0.821, + "grad_norm": 2.729481742502532, + "learning_rate": 4.1050000000000005e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8886597752571106, + "step": 1642 + }, + { + "epoch": 0.8215, + "grad_norm": 2.311660115098209, + "learning_rate": 4.1075e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8912334442138672, + "step": 1643 + }, + { + "epoch": 0.822, + "grad_norm": 2.1819745168735976, + "learning_rate": 4.1100000000000005e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8341878056526184, + "step": 1644 + }, + { + "epoch": 0.8225, + "grad_norm": 2.848705824376445, + "learning_rate": 4.1125e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8939017653465271, + "step": 1645 + }, + { + "epoch": 0.823, + "grad_norm": 11.517221633873739, + "learning_rate": 4.115e-06, + "loss": 0.3016, + "mean_token_accuracy": 0.902643084526062, + "step": 1646 + }, + { + "epoch": 0.8235, + "grad_norm": 2.7162489135311803, + "learning_rate": 4.1175e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8616622090339661, + "step": 1647 + }, + { + "epoch": 0.824, + "grad_norm": 2.823042373643299, + "learning_rate": 4.12e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8867398500442505, + "step": 1648 + }, + { + "epoch": 0.8245, + "grad_norm": 2.769280210739026, + "learning_rate": 4.122500000000001e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.8371748328208923, + "step": 1649 + }, + { + "epoch": 0.825, + "grad_norm": 2.3117528216815613, + "learning_rate": 4.125e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8592072129249573, + "step": 1650 + }, + { + "epoch": 0.8255, + "grad_norm": 3.9821431731306616, + "learning_rate": 4.127500000000001e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.8253778219223022, + "step": 1651 + }, + { + "epoch": 0.826, + "grad_norm": 5.475472823497642, + "learning_rate": 4.13e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8632766604423523, + "step": 1652 + }, + { + "epoch": 0.8265, + "grad_norm": 3.3178785219708637, + "learning_rate": 4.132500000000001e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8923856019973755, + "step": 1653 + }, + { + "epoch": 0.827, + "grad_norm": 2.6747455500410564, + "learning_rate": 4.135e-06, + "loss": 0.2625, + "mean_token_accuracy": 0.9145106673240662, + "step": 1654 + }, + { + "epoch": 0.8275, + "grad_norm": 3.527055264490762, + "learning_rate": 4.137500000000001e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8805233836174011, + "step": 1655 + }, + { + "epoch": 0.828, + "grad_norm": 2.351466946506723, + "learning_rate": 4.14e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8513017296791077, + "step": 1656 + }, + { + "epoch": 0.8285, + "grad_norm": 2.9205169317489563, + "learning_rate": 4.1425000000000006e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.874169111251831, + "step": 1657 + }, + { + "epoch": 0.829, + "grad_norm": 2.3125458577653104, + "learning_rate": 4.145e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8850616812705994, + "step": 1658 + }, + { + "epoch": 0.8295, + "grad_norm": 4.369156744197606, + "learning_rate": 4.1475000000000005e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8928441405296326, + "step": 1659 + }, + { + "epoch": 0.83, + "grad_norm": 40.107760742031395, + "learning_rate": 4.15e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8568228483200073, + "step": 1660 + }, + { + "epoch": 0.8305, + "grad_norm": 5.367481716522769, + "learning_rate": 4.1525000000000005e-06, + "loss": 0.2472, + "mean_token_accuracy": 0.9179179072380066, + "step": 1661 + }, + { + "epoch": 0.831, + "grad_norm": 2.3563612304771957, + "learning_rate": 4.155e-06, + "loss": 0.5265, + "mean_token_accuracy": 0.839801013469696, + "step": 1662 + }, + { + "epoch": 0.8315, + "grad_norm": 3.1551823735189592, + "learning_rate": 4.1575000000000004e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8584460616111755, + "step": 1663 + }, + { + "epoch": 0.832, + "grad_norm": 3.5864447540398077, + "learning_rate": 4.16e-06, + "loss": 0.5972, + "mean_token_accuracy": 0.8380666971206665, + "step": 1664 + }, + { + "epoch": 0.8325, + "grad_norm": 2.856548775928645, + "learning_rate": 4.1625e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8732314109802246, + "step": 1665 + }, + { + "epoch": 0.833, + "grad_norm": 2.273774175191375, + "learning_rate": 4.165e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8649012446403503, + "step": 1666 + }, + { + "epoch": 0.8335, + "grad_norm": 4.220455703474561, + "learning_rate": 4.1675e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8646131753921509, + "step": 1667 + }, + { + "epoch": 0.834, + "grad_norm": 2.7594137940502446, + "learning_rate": 4.17e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8475366830825806, + "step": 1668 + }, + { + "epoch": 0.8345, + "grad_norm": 2.6649254142393968, + "learning_rate": 4.1725e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8782414793968201, + "step": 1669 + }, + { + "epoch": 0.835, + "grad_norm": 2.9034856487721403, + "learning_rate": 4.175e-06, + "loss": 0.5673, + "mean_token_accuracy": 0.8340094685554504, + "step": 1670 + }, + { + "epoch": 0.8355, + "grad_norm": 3.121886536754492, + "learning_rate": 4.1775e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8685828447341919, + "step": 1671 + }, + { + "epoch": 0.836, + "grad_norm": 3.4962534984598377, + "learning_rate": 4.18e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.9002358317375183, + "step": 1672 + }, + { + "epoch": 0.8365, + "grad_norm": 4.649884979385627, + "learning_rate": 4.1825e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8524838089942932, + "step": 1673 + }, + { + "epoch": 0.837, + "grad_norm": 2.72440603239608, + "learning_rate": 4.185000000000001e-06, + "loss": 0.2548, + "mean_token_accuracy": 0.9074954390525818, + "step": 1674 + }, + { + "epoch": 0.8375, + "grad_norm": 4.71698346545663, + "learning_rate": 4.1875e-06, + "loss": 0.5194, + "mean_token_accuracy": 0.8531641364097595, + "step": 1675 + }, + { + "epoch": 0.838, + "grad_norm": 3.815527993854938, + "learning_rate": 4.1900000000000005e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8596996665000916, + "step": 1676 + }, + { + "epoch": 0.8385, + "grad_norm": 2.7521992925822123, + "learning_rate": 4.1925e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.867916464805603, + "step": 1677 + }, + { + "epoch": 0.839, + "grad_norm": 2.5831989433823277, + "learning_rate": 4.1950000000000005e-06, + "loss": 0.3016, + "mean_token_accuracy": 0.9030808806419373, + "step": 1678 + }, + { + "epoch": 0.8395, + "grad_norm": 2.0786620145003543, + "learning_rate": 4.1975e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8760190010070801, + "step": 1679 + }, + { + "epoch": 0.84, + "grad_norm": 2.7126375178429964, + "learning_rate": 4.2000000000000004e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8891552090644836, + "step": 1680 + }, + { + "epoch": 0.8405, + "grad_norm": 2.4334028425430962, + "learning_rate": 4.202500000000001e-06, + "loss": 0.2915, + "mean_token_accuracy": 0.9060580730438232, + "step": 1681 + }, + { + "epoch": 0.841, + "grad_norm": 2.139893256902727, + "learning_rate": 4.205e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8920032978057861, + "step": 1682 + }, + { + "epoch": 0.8415, + "grad_norm": 3.3862194972231707, + "learning_rate": 4.207500000000001e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8602169156074524, + "step": 1683 + }, + { + "epoch": 0.842, + "grad_norm": 2.494057895558358, + "learning_rate": 4.21e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.900737464427948, + "step": 1684 + }, + { + "epoch": 0.8425, + "grad_norm": 2.561143001076503, + "learning_rate": 4.212500000000001e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8698049783706665, + "step": 1685 + }, + { + "epoch": 0.843, + "grad_norm": 4.47778514991453, + "learning_rate": 4.215e-06, + "loss": 0.5664, + "mean_token_accuracy": 0.8228384256362915, + "step": 1686 + }, + { + "epoch": 0.8435, + "grad_norm": 2.1260854252062047, + "learning_rate": 4.217500000000001e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8935091495513916, + "step": 1687 + }, + { + "epoch": 0.844, + "grad_norm": 3.025899877011327, + "learning_rate": 4.22e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8606423735618591, + "step": 1688 + }, + { + "epoch": 0.8445, + "grad_norm": 4.642059455938399, + "learning_rate": 4.222500000000001e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8563458919525146, + "step": 1689 + }, + { + "epoch": 0.845, + "grad_norm": 2.1959171364790393, + "learning_rate": 4.225e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8733850121498108, + "step": 1690 + }, + { + "epoch": 0.8455, + "grad_norm": 3.1160276024414757, + "learning_rate": 4.227500000000001e-06, + "loss": 0.5417, + "mean_token_accuracy": 0.8355599641799927, + "step": 1691 + }, + { + "epoch": 0.846, + "grad_norm": 2.84130916084512, + "learning_rate": 4.23e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8573925495147705, + "step": 1692 + }, + { + "epoch": 0.8465, + "grad_norm": 4.1874615721980675, + "learning_rate": 4.2325000000000006e-06, + "loss": 0.5318, + "mean_token_accuracy": 0.8484272360801697, + "step": 1693 + }, + { + "epoch": 0.847, + "grad_norm": 4.178889846708239, + "learning_rate": 4.235e-06, + "loss": 0.2524, + "mean_token_accuracy": 0.9111968874931335, + "step": 1694 + }, + { + "epoch": 0.8475, + "grad_norm": 2.7721248296093166, + "learning_rate": 4.2375000000000005e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8895694613456726, + "step": 1695 + }, + { + "epoch": 0.848, + "grad_norm": 4.291784456780452, + "learning_rate": 4.24e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8598545789718628, + "step": 1696 + }, + { + "epoch": 0.8485, + "grad_norm": 3.0980514618695527, + "learning_rate": 4.2425000000000005e-06, + "loss": 0.535, + "mean_token_accuracy": 0.8439658880233765, + "step": 1697 + }, + { + "epoch": 0.849, + "grad_norm": 3.5898630141270154, + "learning_rate": 4.245e-06, + "loss": 0.6374, + "mean_token_accuracy": 0.8211382031440735, + "step": 1698 + }, + { + "epoch": 0.8495, + "grad_norm": 4.203886893916219, + "learning_rate": 4.2475e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8462373614311218, + "step": 1699 + }, + { + "epoch": 0.85, + "grad_norm": 3.5914663298435134, + "learning_rate": 4.25e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8698132634162903, + "step": 1700 + }, + { + "epoch": 0.8505, + "grad_norm": 6.812495477340547, + "learning_rate": 4.2525e-06, + "loss": 0.5509, + "mean_token_accuracy": 0.8370293378829956, + "step": 1701 + }, + { + "epoch": 0.851, + "grad_norm": 3.827571377843308, + "learning_rate": 4.255e-06, + "loss": 0.2852, + "mean_token_accuracy": 0.9117125272750854, + "step": 1702 + }, + { + "epoch": 0.8515, + "grad_norm": 4.133959600755195, + "learning_rate": 4.2575e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8581390976905823, + "step": 1703 + }, + { + "epoch": 0.852, + "grad_norm": 2.292213012181119, + "learning_rate": 4.26e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8804869651794434, + "step": 1704 + }, + { + "epoch": 0.8525, + "grad_norm": 3.091915726376857, + "learning_rate": 4.2625e-06, + "loss": 0.5395, + "mean_token_accuracy": 0.8368868231773376, + "step": 1705 + }, + { + "epoch": 0.853, + "grad_norm": 10.707473496169145, + "learning_rate": 4.265000000000001e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.8581478595733643, + "step": 1706 + }, + { + "epoch": 0.8535, + "grad_norm": 2.991001163777279, + "learning_rate": 4.2675e-06, + "loss": 0.6061, + "mean_token_accuracy": 0.8246167302131653, + "step": 1707 + }, + { + "epoch": 0.854, + "grad_norm": 3.7344781998362087, + "learning_rate": 4.270000000000001e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.864170253276825, + "step": 1708 + }, + { + "epoch": 0.8545, + "grad_norm": 2.3421294252204135, + "learning_rate": 4.2725e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8790999054908752, + "step": 1709 + }, + { + "epoch": 0.855, + "grad_norm": 1.8820028035268375, + "learning_rate": 4.2750000000000006e-06, + "loss": 0.2444, + "mean_token_accuracy": 0.9209379553794861, + "step": 1710 + }, + { + "epoch": 0.8555, + "grad_norm": 2.904165622828542, + "learning_rate": 4.2775e-06, + "loss": 0.2784, + "mean_token_accuracy": 0.9132194519042969, + "step": 1711 + }, + { + "epoch": 0.856, + "grad_norm": 3.366589751152068, + "learning_rate": 4.2800000000000005e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.865984320640564, + "step": 1712 + }, + { + "epoch": 0.8565, + "grad_norm": 2.0225235131246326, + "learning_rate": 4.282500000000001e-06, + "loss": 0.5434, + "mean_token_accuracy": 0.8455386757850647, + "step": 1713 + }, + { + "epoch": 0.857, + "grad_norm": 4.311590470875771, + "learning_rate": 4.2850000000000005e-06, + "loss": 0.3088, + "mean_token_accuracy": 0.8978609442710876, + "step": 1714 + }, + { + "epoch": 0.8575, + "grad_norm": 2.1231284390744762, + "learning_rate": 4.287500000000001e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8998401761054993, + "step": 1715 + }, + { + "epoch": 0.858, + "grad_norm": 3.2899012961218275, + "learning_rate": 4.2900000000000004e-06, + "loss": 0.5999, + "mean_token_accuracy": 0.8340467810630798, + "step": 1716 + }, + { + "epoch": 0.8585, + "grad_norm": 2.6549651509960817, + "learning_rate": 4.292500000000001e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8853997588157654, + "step": 1717 + }, + { + "epoch": 0.859, + "grad_norm": 2.961694032920385, + "learning_rate": 4.295e-06, + "loss": 0.282, + "mean_token_accuracy": 0.9116426110267639, + "step": 1718 + }, + { + "epoch": 0.8595, + "grad_norm": 4.24507764964314, + "learning_rate": 4.297500000000001e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8579439520835876, + "step": 1719 + }, + { + "epoch": 0.86, + "grad_norm": 3.351653500260571, + "learning_rate": 4.3e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8698022961616516, + "step": 1720 + }, + { + "epoch": 0.8605, + "grad_norm": 2.2703308115889165, + "learning_rate": 4.302500000000001e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8775861859321594, + "step": 1721 + }, + { + "epoch": 0.861, + "grad_norm": 3.5679415786825723, + "learning_rate": 4.305e-06, + "loss": 0.592, + "mean_token_accuracy": 0.8268716335296631, + "step": 1722 + }, + { + "epoch": 0.8615, + "grad_norm": 2.373735140367738, + "learning_rate": 4.307500000000001e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8838887214660645, + "step": 1723 + }, + { + "epoch": 0.862, + "grad_norm": 2.6110069677239314, + "learning_rate": 4.31e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.8557548522949219, + "step": 1724 + }, + { + "epoch": 0.8625, + "grad_norm": 3.760254156343768, + "learning_rate": 4.312500000000001e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8375401496887207, + "step": 1725 + }, + { + "epoch": 0.863, + "grad_norm": 5.668347884264919, + "learning_rate": 4.315e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8833540081977844, + "step": 1726 + }, + { + "epoch": 0.8635, + "grad_norm": 2.550376368985143, + "learning_rate": 4.317500000000001e-06, + "loss": 0.3236, + "mean_token_accuracy": 0.8995991349220276, + "step": 1727 + }, + { + "epoch": 0.864, + "grad_norm": 3.0107695495137183, + "learning_rate": 4.32e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8916592001914978, + "step": 1728 + }, + { + "epoch": 0.8645, + "grad_norm": 6.002243897635309, + "learning_rate": 4.3225000000000005e-06, + "loss": 0.3077, + "mean_token_accuracy": 0.9046531915664673, + "step": 1729 + }, + { + "epoch": 0.865, + "grad_norm": 2.203925525930894, + "learning_rate": 4.325e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.883309006690979, + "step": 1730 + }, + { + "epoch": 0.8655, + "grad_norm": 3.257284446633349, + "learning_rate": 4.3275000000000005e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8788705468177795, + "step": 1731 + }, + { + "epoch": 0.866, + "grad_norm": 12.300762213569925, + "learning_rate": 4.33e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8663058876991272, + "step": 1732 + }, + { + "epoch": 0.8665, + "grad_norm": 2.89946954868566, + "learning_rate": 4.3325000000000004e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8588441014289856, + "step": 1733 + }, + { + "epoch": 0.867, + "grad_norm": 3.068963601463194, + "learning_rate": 4.335e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8923954367637634, + "step": 1734 + }, + { + "epoch": 0.8675, + "grad_norm": 2.272652803407874, + "learning_rate": 4.3375e-06, + "loss": 0.5452, + "mean_token_accuracy": 0.8269405364990234, + "step": 1735 + }, + { + "epoch": 0.868, + "grad_norm": 3.930545650204552, + "learning_rate": 4.34e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8500237464904785, + "step": 1736 + }, + { + "epoch": 0.8685, + "grad_norm": 3.4300954688681817, + "learning_rate": 4.3425e-06, + "loss": 0.6484, + "mean_token_accuracy": 0.8139029741287231, + "step": 1737 + }, + { + "epoch": 0.869, + "grad_norm": 2.6075332687045525, + "learning_rate": 4.345000000000001e-06, + "loss": 0.5272, + "mean_token_accuracy": 0.8366940021514893, + "step": 1738 + }, + { + "epoch": 0.8695, + "grad_norm": 5.955017265991068, + "learning_rate": 4.3475e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8800069689750671, + "step": 1739 + }, + { + "epoch": 0.87, + "grad_norm": 2.38063354232181, + "learning_rate": 4.350000000000001e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8942021131515503, + "step": 1740 + }, + { + "epoch": 0.8705, + "grad_norm": 7.838406279048806, + "learning_rate": 4.3525e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8853654265403748, + "step": 1741 + }, + { + "epoch": 0.871, + "grad_norm": 3.116917939552001, + "learning_rate": 4.355000000000001e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8701590299606323, + "step": 1742 + }, + { + "epoch": 0.8715, + "grad_norm": 4.512709421310596, + "learning_rate": 4.3575e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8428423404693604, + "step": 1743 + }, + { + "epoch": 0.872, + "grad_norm": 3.4584492157037396, + "learning_rate": 4.360000000000001e-06, + "loss": 0.5939, + "mean_token_accuracy": 0.8282531499862671, + "step": 1744 + }, + { + "epoch": 0.8725, + "grad_norm": 2.5777135413638086, + "learning_rate": 4.362500000000001e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8833226561546326, + "step": 1745 + }, + { + "epoch": 0.873, + "grad_norm": 3.1174768997513316, + "learning_rate": 4.3650000000000006e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8595568537712097, + "step": 1746 + }, + { + "epoch": 0.8735, + "grad_norm": 3.0539418572634647, + "learning_rate": 4.367500000000001e-06, + "loss": 0.2573, + "mean_token_accuracy": 0.9150704741477966, + "step": 1747 + }, + { + "epoch": 0.874, + "grad_norm": 4.194960180528836, + "learning_rate": 4.3700000000000005e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8684807419776917, + "step": 1748 + }, + { + "epoch": 0.8745, + "grad_norm": 4.350094875868593, + "learning_rate": 4.372500000000001e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8692671060562134, + "step": 1749 + }, + { + "epoch": 0.875, + "grad_norm": 3.356203990552749, + "learning_rate": 4.3750000000000005e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8403361439704895, + "step": 1750 + }, + { + "epoch": 0.8755, + "grad_norm": 3.8883396868199656, + "learning_rate": 4.3775e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8430783748626709, + "step": 1751 + }, + { + "epoch": 0.876, + "grad_norm": 2.4588609910045074, + "learning_rate": 4.38e-06, + "loss": 0.6265, + "mean_token_accuracy": 0.8200547695159912, + "step": 1752 + }, + { + "epoch": 0.8765, + "grad_norm": 4.883373531801429, + "learning_rate": 4.3825e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8482722043991089, + "step": 1753 + }, + { + "epoch": 0.877, + "grad_norm": 2.315612744967227, + "learning_rate": 4.385e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8908949494361877, + "step": 1754 + }, + { + "epoch": 0.8775, + "grad_norm": 1.93414906080788, + "learning_rate": 4.3875e-06, + "loss": 0.2562, + "mean_token_accuracy": 0.9109131693840027, + "step": 1755 + }, + { + "epoch": 0.878, + "grad_norm": 3.018554414027276, + "learning_rate": 4.39e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8817073106765747, + "step": 1756 + }, + { + "epoch": 0.8785, + "grad_norm": 6.914691940667236, + "learning_rate": 4.3925e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8963607549667358, + "step": 1757 + }, + { + "epoch": 0.879, + "grad_norm": 3.380758768694136, + "learning_rate": 4.395e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8696491122245789, + "step": 1758 + }, + { + "epoch": 0.8795, + "grad_norm": 1.8301805949627568, + "learning_rate": 4.3975e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8848705887794495, + "step": 1759 + }, + { + "epoch": 0.88, + "grad_norm": 2.6116773487997964, + "learning_rate": 4.4e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8993220329284668, + "step": 1760 + }, + { + "epoch": 0.8805, + "grad_norm": 3.760720703921714, + "learning_rate": 4.4025e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8667697906494141, + "step": 1761 + }, + { + "epoch": 0.881, + "grad_norm": 2.2122034981611374, + "learning_rate": 4.405e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.887977659702301, + "step": 1762 + }, + { + "epoch": 0.8815, + "grad_norm": 2.6882595382528764, + "learning_rate": 4.4075e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8777543306350708, + "step": 1763 + }, + { + "epoch": 0.882, + "grad_norm": 2.969034545496295, + "learning_rate": 4.41e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8579162955284119, + "step": 1764 + }, + { + "epoch": 0.8825, + "grad_norm": 6.581123112526529, + "learning_rate": 4.4125000000000005e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8504757881164551, + "step": 1765 + }, + { + "epoch": 0.883, + "grad_norm": 2.261495603755162, + "learning_rate": 4.415e-06, + "loss": 0.411, + "mean_token_accuracy": 0.85146564245224, + "step": 1766 + }, + { + "epoch": 0.8835, + "grad_norm": 5.335213889006182, + "learning_rate": 4.4175000000000005e-06, + "loss": 0.5205, + "mean_token_accuracy": 0.8456857204437256, + "step": 1767 + }, + { + "epoch": 0.884, + "grad_norm": 3.64690536636751, + "learning_rate": 4.42e-06, + "loss": 0.2906, + "mean_token_accuracy": 0.9066213965415955, + "step": 1768 + }, + { + "epoch": 0.8845, + "grad_norm": 2.61285886115817, + "learning_rate": 4.4225000000000004e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.876625657081604, + "step": 1769 + }, + { + "epoch": 0.885, + "grad_norm": 2.588101906215189, + "learning_rate": 4.425e-06, + "loss": 0.2622, + "mean_token_accuracy": 0.9079039692878723, + "step": 1770 + }, + { + "epoch": 0.8855, + "grad_norm": 6.15585204704438, + "learning_rate": 4.4275e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8685429692268372, + "step": 1771 + }, + { + "epoch": 0.886, + "grad_norm": 6.067451282332706, + "learning_rate": 4.430000000000001e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8811949491500854, + "step": 1772 + }, + { + "epoch": 0.8865, + "grad_norm": 3.2079961866320765, + "learning_rate": 4.4325e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8629881143569946, + "step": 1773 + }, + { + "epoch": 0.887, + "grad_norm": 22.818652379135013, + "learning_rate": 4.435000000000001e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8696191310882568, + "step": 1774 + }, + { + "epoch": 0.8875, + "grad_norm": 4.105561022086271, + "learning_rate": 4.4375e-06, + "loss": 0.5383, + "mean_token_accuracy": 0.8459036350250244, + "step": 1775 + }, + { + "epoch": 0.888, + "grad_norm": 2.6620498595990387, + "learning_rate": 4.440000000000001e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.8304568529129028, + "step": 1776 + }, + { + "epoch": 0.8885, + "grad_norm": 5.553268968324448, + "learning_rate": 4.4425e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8742233514785767, + "step": 1777 + }, + { + "epoch": 0.889, + "grad_norm": 2.331954600511652, + "learning_rate": 4.445000000000001e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8367061018943787, + "step": 1778 + }, + { + "epoch": 0.8895, + "grad_norm": 3.214221238646685, + "learning_rate": 4.4475e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.9027132987976074, + "step": 1779 + }, + { + "epoch": 0.89, + "grad_norm": 2.2329217139857356, + "learning_rate": 4.450000000000001e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8563432693481445, + "step": 1780 + }, + { + "epoch": 0.8905, + "grad_norm": 2.057046021676232, + "learning_rate": 4.4525e-06, + "loss": 0.2335, + "mean_token_accuracy": 0.9124200940132141, + "step": 1781 + }, + { + "epoch": 0.891, + "grad_norm": 2.6147578746615565, + "learning_rate": 4.4550000000000005e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.883513331413269, + "step": 1782 + }, + { + "epoch": 0.8915, + "grad_norm": 18.851871971293516, + "learning_rate": 4.4575e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8742783665657043, + "step": 1783 + }, + { + "epoch": 0.892, + "grad_norm": 6.905931608682358, + "learning_rate": 4.4600000000000005e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8479345440864563, + "step": 1784 + }, + { + "epoch": 0.8925, + "grad_norm": 2.6551306765763494, + "learning_rate": 4.4625e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8534119725227356, + "step": 1785 + }, + { + "epoch": 0.893, + "grad_norm": 3.3237586699950024, + "learning_rate": 4.4650000000000004e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8946378231048584, + "step": 1786 + }, + { + "epoch": 0.8935, + "grad_norm": 2.3052593467421936, + "learning_rate": 4.4675e-06, + "loss": 0.5267, + "mean_token_accuracy": 0.8375962972640991, + "step": 1787 + }, + { + "epoch": 0.894, + "grad_norm": 2.605926065537978, + "learning_rate": 4.47e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8841544389724731, + "step": 1788 + }, + { + "epoch": 0.8945, + "grad_norm": 2.6746808029356264, + "learning_rate": 4.4725e-06, + "loss": 0.613, + "mean_token_accuracy": 0.8331769704818726, + "step": 1789 + }, + { + "epoch": 0.895, + "grad_norm": 2.2419486424320474, + "learning_rate": 4.475e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8595505356788635, + "step": 1790 + }, + { + "epoch": 0.8955, + "grad_norm": 3.807117758677018, + "learning_rate": 4.4775e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.9000837802886963, + "step": 1791 + }, + { + "epoch": 0.896, + "grad_norm": 1.9437716185750629, + "learning_rate": 4.48e-06, + "loss": 0.2947, + "mean_token_accuracy": 0.8948442339897156, + "step": 1792 + }, + { + "epoch": 0.8965, + "grad_norm": 41.68023854606428, + "learning_rate": 4.4825e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8896486163139343, + "step": 1793 + }, + { + "epoch": 0.897, + "grad_norm": 2.053877062641258, + "learning_rate": 4.485e-06, + "loss": 0.2863, + "mean_token_accuracy": 0.9103130102157593, + "step": 1794 + }, + { + "epoch": 0.8975, + "grad_norm": 1.9590677546669522, + "learning_rate": 4.4875e-06, + "loss": 0.2704, + "mean_token_accuracy": 0.9068047404289246, + "step": 1795 + }, + { + "epoch": 0.898, + "grad_norm": 2.2876114824857994, + "learning_rate": 4.49e-06, + "loss": 0.6614, + "mean_token_accuracy": 0.8280453085899353, + "step": 1796 + }, + { + "epoch": 0.8985, + "grad_norm": 5.200836975304713, + "learning_rate": 4.492500000000001e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8373504877090454, + "step": 1797 + }, + { + "epoch": 0.899, + "grad_norm": 2.4862199744992997, + "learning_rate": 4.495e-06, + "loss": 0.6314, + "mean_token_accuracy": 0.8254777193069458, + "step": 1798 + }, + { + "epoch": 0.8995, + "grad_norm": 4.065569160180409, + "learning_rate": 4.4975000000000006e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8970687389373779, + "step": 1799 + }, + { + "epoch": 0.9, + "grad_norm": 3.721098603621264, + "learning_rate": 4.5e-06, + "loss": 0.579, + "mean_token_accuracy": 0.8435669541358948, + "step": 1800 + }, + { + "epoch": 0.9005, + "grad_norm": 2.2172126960834304, + "learning_rate": 4.5025000000000005e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8662809133529663, + "step": 1801 + }, + { + "epoch": 0.901, + "grad_norm": 2.4718512157262507, + "learning_rate": 4.505e-06, + "loss": 0.506, + "mean_token_accuracy": 0.845196545124054, + "step": 1802 + }, + { + "epoch": 0.9015, + "grad_norm": 2.1981591671042016, + "learning_rate": 4.5075000000000005e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8980268836021423, + "step": 1803 + }, + { + "epoch": 0.902, + "grad_norm": 3.47161001144009, + "learning_rate": 4.510000000000001e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8738613724708557, + "step": 1804 + }, + { + "epoch": 0.9025, + "grad_norm": 2.578657203477948, + "learning_rate": 4.5125e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8583358526229858, + "step": 1805 + }, + { + "epoch": 0.903, + "grad_norm": 3.0870449496584964, + "learning_rate": 4.515000000000001e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8660939931869507, + "step": 1806 + }, + { + "epoch": 0.9035, + "grad_norm": 4.531641818754263, + "learning_rate": 4.5175e-06, + "loss": 0.5941, + "mean_token_accuracy": 0.8314736485481262, + "step": 1807 + }, + { + "epoch": 0.904, + "grad_norm": 4.667590640796402, + "learning_rate": 4.520000000000001e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8920324444770813, + "step": 1808 + }, + { + "epoch": 0.9045, + "grad_norm": 2.6302932169067543, + "learning_rate": 4.5225e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8478426337242126, + "step": 1809 + }, + { + "epoch": 0.905, + "grad_norm": 3.0256798686314914, + "learning_rate": 4.525000000000001e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.866779088973999, + "step": 1810 + }, + { + "epoch": 0.9055, + "grad_norm": 2.356756984334903, + "learning_rate": 4.5275e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8563033938407898, + "step": 1811 + }, + { + "epoch": 0.906, + "grad_norm": 2.9994342299111763, + "learning_rate": 4.530000000000001e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8672258853912354, + "step": 1812 + }, + { + "epoch": 0.9065, + "grad_norm": 4.722369765482061, + "learning_rate": 4.5325e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8720142245292664, + "step": 1813 + }, + { + "epoch": 0.907, + "grad_norm": 11.63418167842022, + "learning_rate": 4.535000000000001e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8633772134780884, + "step": 1814 + }, + { + "epoch": 0.9075, + "grad_norm": 2.858549201264206, + "learning_rate": 4.5375e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8853313326835632, + "step": 1815 + }, + { + "epoch": 0.908, + "grad_norm": 2.5115403454834495, + "learning_rate": 4.540000000000001e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8550337553024292, + "step": 1816 + }, + { + "epoch": 0.9085, + "grad_norm": 2.87449090853024, + "learning_rate": 4.5425e-06, + "loss": 0.497, + "mean_token_accuracy": 0.8427854776382446, + "step": 1817 + }, + { + "epoch": 0.909, + "grad_norm": 2.80840694014522, + "learning_rate": 4.5450000000000005e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8897313475608826, + "step": 1818 + }, + { + "epoch": 0.9095, + "grad_norm": 4.7764079429114945, + "learning_rate": 4.5475e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.86459881067276, + "step": 1819 + }, + { + "epoch": 0.91, + "grad_norm": 2.898588969071309, + "learning_rate": 4.5500000000000005e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8757644295692444, + "step": 1820 + }, + { + "epoch": 0.9105, + "grad_norm": 2.1539667591870235, + "learning_rate": 4.5525e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8832579255104065, + "step": 1821 + }, + { + "epoch": 0.911, + "grad_norm": 2.1749291552590657, + "learning_rate": 4.5550000000000004e-06, + "loss": 0.2948, + "mean_token_accuracy": 0.9088798761367798, + "step": 1822 + }, + { + "epoch": 0.9115, + "grad_norm": 5.429725735755323, + "learning_rate": 4.5575e-06, + "loss": 0.5488, + "mean_token_accuracy": 0.8391950130462646, + "step": 1823 + }, + { + "epoch": 0.912, + "grad_norm": 1.811078976323479, + "learning_rate": 4.56e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8744985461235046, + "step": 1824 + }, + { + "epoch": 0.9125, + "grad_norm": 3.8399922241112523, + "learning_rate": 4.5625e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8636437058448792, + "step": 1825 + }, + { + "epoch": 0.913, + "grad_norm": 3.326501475585793, + "learning_rate": 4.565e-06, + "loss": 0.5116, + "mean_token_accuracy": 0.8508771657943726, + "step": 1826 + }, + { + "epoch": 0.9135, + "grad_norm": 3.356895799418738, + "learning_rate": 4.5675e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8895134329795837, + "step": 1827 + }, + { + "epoch": 0.914, + "grad_norm": 3.44098630661699, + "learning_rate": 4.57e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8799406290054321, + "step": 1828 + }, + { + "epoch": 0.9145, + "grad_norm": 3.087717669171969, + "learning_rate": 4.572500000000001e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8494858145713806, + "step": 1829 + }, + { + "epoch": 0.915, + "grad_norm": 5.757692825762681, + "learning_rate": 4.575e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8582013845443726, + "step": 1830 + }, + { + "epoch": 0.9155, + "grad_norm": 1.9662212381267017, + "learning_rate": 4.577500000000001e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.871152937412262, + "step": 1831 + }, + { + "epoch": 0.916, + "grad_norm": 5.7587524865264355, + "learning_rate": 4.58e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8445078730583191, + "step": 1832 + }, + { + "epoch": 0.9165, + "grad_norm": 3.72875737368874, + "learning_rate": 4.582500000000001e-06, + "loss": 0.598, + "mean_token_accuracy": 0.8329278230667114, + "step": 1833 + }, + { + "epoch": 0.917, + "grad_norm": 3.7881651150802984, + "learning_rate": 4.585e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8550808429718018, + "step": 1834 + }, + { + "epoch": 0.9175, + "grad_norm": 2.6901169559002853, + "learning_rate": 4.5875000000000005e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8729549646377563, + "step": 1835 + }, + { + "epoch": 0.918, + "grad_norm": 2.341560928834151, + "learning_rate": 4.590000000000001e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8681682348251343, + "step": 1836 + }, + { + "epoch": 0.9185, + "grad_norm": 3.402961006000969, + "learning_rate": 4.5925000000000005e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8651390671730042, + "step": 1837 + }, + { + "epoch": 0.919, + "grad_norm": 2.704773426800597, + "learning_rate": 4.595000000000001e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8582640290260315, + "step": 1838 + }, + { + "epoch": 0.9195, + "grad_norm": 5.054534703246811, + "learning_rate": 4.5975000000000005e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8830527067184448, + "step": 1839 + }, + { + "epoch": 0.92, + "grad_norm": 3.586029578967357, + "learning_rate": 4.600000000000001e-06, + "loss": 0.5262, + "mean_token_accuracy": 0.8515598773956299, + "step": 1840 + }, + { + "epoch": 0.9205, + "grad_norm": 2.207964961204147, + "learning_rate": 4.6025e-06, + "loss": 0.2776, + "mean_token_accuracy": 0.9006248712539673, + "step": 1841 + }, + { + "epoch": 0.921, + "grad_norm": 2.5692880195842975, + "learning_rate": 4.605000000000001e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8947010636329651, + "step": 1842 + }, + { + "epoch": 0.9215, + "grad_norm": 2.7072228723354357, + "learning_rate": 4.6075e-06, + "loss": 0.5186, + "mean_token_accuracy": 0.8362694382667542, + "step": 1843 + }, + { + "epoch": 0.922, + "grad_norm": 2.539741263004604, + "learning_rate": 4.610000000000001e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8866819739341736, + "step": 1844 + }, + { + "epoch": 0.9225, + "grad_norm": 2.7963663475136356, + "learning_rate": 4.6125e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8665671348571777, + "step": 1845 + }, + { + "epoch": 0.923, + "grad_norm": 3.325096900746996, + "learning_rate": 4.615000000000001e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.858146607875824, + "step": 1846 + }, + { + "epoch": 0.9235, + "grad_norm": 6.5173710687073, + "learning_rate": 4.6175e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8787564635276794, + "step": 1847 + }, + { + "epoch": 0.924, + "grad_norm": 3.0163524156476313, + "learning_rate": 4.620000000000001e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8520065546035767, + "step": 1848 + }, + { + "epoch": 0.9245, + "grad_norm": 2.9689016005218525, + "learning_rate": 4.6225e-06, + "loss": 0.5519, + "mean_token_accuracy": 0.8352823257446289, + "step": 1849 + }, + { + "epoch": 0.925, + "grad_norm": 1.952035066068506, + "learning_rate": 4.625000000000001e-06, + "loss": 0.2958, + "mean_token_accuracy": 0.9048277735710144, + "step": 1850 + }, + { + "epoch": 0.9255, + "grad_norm": 3.4893560710981615, + "learning_rate": 4.6275e-06, + "loss": 0.6011, + "mean_token_accuracy": 0.8182862997055054, + "step": 1851 + }, + { + "epoch": 0.926, + "grad_norm": 1.8226642803117727, + "learning_rate": 4.6300000000000006e-06, + "loss": 0.2785, + "mean_token_accuracy": 0.9060913920402527, + "step": 1852 + }, + { + "epoch": 0.9265, + "grad_norm": 2.7351096144826834, + "learning_rate": 4.6325e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8858614563941956, + "step": 1853 + }, + { + "epoch": 0.927, + "grad_norm": 2.8014570754959847, + "learning_rate": 4.6350000000000005e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8924484848976135, + "step": 1854 + }, + { + "epoch": 0.9275, + "grad_norm": 3.939519658712007, + "learning_rate": 4.6375e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8581196665763855, + "step": 1855 + }, + { + "epoch": 0.928, + "grad_norm": 8.525525217742, + "learning_rate": 4.6400000000000005e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8723030090332031, + "step": 1856 + }, + { + "epoch": 0.9285, + "grad_norm": 1.9566504155188058, + "learning_rate": 4.6425e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8909183740615845, + "step": 1857 + }, + { + "epoch": 0.929, + "grad_norm": 3.284633792128679, + "learning_rate": 4.645e-06, + "loss": 0.2707, + "mean_token_accuracy": 0.9149850010871887, + "step": 1858 + }, + { + "epoch": 0.9295, + "grad_norm": 2.2894041135237706, + "learning_rate": 4.6475e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8837998509407043, + "step": 1859 + }, + { + "epoch": 0.93, + "grad_norm": 1.9434512620334823, + "learning_rate": 4.65e-06, + "loss": 0.2715, + "mean_token_accuracy": 0.9163424372673035, + "step": 1860 + }, + { + "epoch": 0.9305, + "grad_norm": 2.616624595475835, + "learning_rate": 4.652500000000001e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8693915009498596, + "step": 1861 + }, + { + "epoch": 0.931, + "grad_norm": 3.223589034583944, + "learning_rate": 4.655e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.895921528339386, + "step": 1862 + }, + { + "epoch": 0.9315, + "grad_norm": 3.461651035653434, + "learning_rate": 4.657500000000001e-06, + "loss": 0.5263, + "mean_token_accuracy": 0.8407407402992249, + "step": 1863 + }, + { + "epoch": 0.932, + "grad_norm": 3.7107275030096156, + "learning_rate": 4.66e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8863727450370789, + "step": 1864 + }, + { + "epoch": 0.9325, + "grad_norm": 2.6730862134481863, + "learning_rate": 4.662500000000001e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8579994440078735, + "step": 1865 + }, + { + "epoch": 0.933, + "grad_norm": 2.9392693329979678, + "learning_rate": 4.665e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.866963803768158, + "step": 1866 + }, + { + "epoch": 0.9335, + "grad_norm": 9.512388436804898, + "learning_rate": 4.667500000000001e-06, + "loss": 0.7614, + "mean_token_accuracy": 0.7577532529830933, + "step": 1867 + }, + { + "epoch": 0.934, + "grad_norm": 2.4341037334203723, + "learning_rate": 4.670000000000001e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.854920506477356, + "step": 1868 + }, + { + "epoch": 0.9345, + "grad_norm": 1.7955168977188094, + "learning_rate": 4.672500000000001e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8670295476913452, + "step": 1869 + }, + { + "epoch": 0.935, + "grad_norm": 3.667461675252926, + "learning_rate": 4.675000000000001e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8670963048934937, + "step": 1870 + }, + { + "epoch": 0.9355, + "grad_norm": 2.213429436573514, + "learning_rate": 4.6775000000000005e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.893449604511261, + "step": 1871 + }, + { + "epoch": 0.936, + "grad_norm": 4.613084435257416, + "learning_rate": 4.680000000000001e-06, + "loss": 0.2667, + "mean_token_accuracy": 0.9007759094238281, + "step": 1872 + }, + { + "epoch": 0.9365, + "grad_norm": 2.777232672353042, + "learning_rate": 4.6825000000000005e-06, + "loss": 0.5356, + "mean_token_accuracy": 0.8372131586074829, + "step": 1873 + }, + { + "epoch": 0.937, + "grad_norm": 2.578008459376909, + "learning_rate": 4.685000000000001e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8754408359527588, + "step": 1874 + }, + { + "epoch": 0.9375, + "grad_norm": 2.562259435860468, + "learning_rate": 4.6875000000000004e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.866820752620697, + "step": 1875 + }, + { + "epoch": 0.938, + "grad_norm": 3.1880271013490256, + "learning_rate": 4.69e-06, + "loss": 0.844, + "mean_token_accuracy": 0.7684869170188904, + "step": 1876 + }, + { + "epoch": 0.9385, + "grad_norm": 4.115362816696068, + "learning_rate": 4.6925e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8808144330978394, + "step": 1877 + }, + { + "epoch": 0.939, + "grad_norm": 4.1091232101918465, + "learning_rate": 4.695e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.876558780670166, + "step": 1878 + }, + { + "epoch": 0.9395, + "grad_norm": 4.40897490194378, + "learning_rate": 4.6975e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8590017557144165, + "step": 1879 + }, + { + "epoch": 0.94, + "grad_norm": 2.4038591329685404, + "learning_rate": 4.7e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.8524617552757263, + "step": 1880 + }, + { + "epoch": 0.9405, + "grad_norm": 3.351832697756871, + "learning_rate": 4.7025e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8773338198661804, + "step": 1881 + }, + { + "epoch": 0.941, + "grad_norm": 6.284633853221279, + "learning_rate": 4.705e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8764762282371521, + "step": 1882 + }, + { + "epoch": 0.9415, + "grad_norm": 2.223667802018353, + "learning_rate": 4.7075e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8829376101493835, + "step": 1883 + }, + { + "epoch": 0.942, + "grad_norm": 2.6801785601785446, + "learning_rate": 4.71e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8587419390678406, + "step": 1884 + }, + { + "epoch": 0.9425, + "grad_norm": 2.058402756197186, + "learning_rate": 4.7125e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8809523582458496, + "step": 1885 + }, + { + "epoch": 0.943, + "grad_norm": 8.46219851860106, + "learning_rate": 4.715e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8801184892654419, + "step": 1886 + }, + { + "epoch": 0.9435, + "grad_norm": 5.819598867464457, + "learning_rate": 4.7175e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8673309683799744, + "step": 1887 + }, + { + "epoch": 0.944, + "grad_norm": 3.9432682331506497, + "learning_rate": 4.7200000000000005e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8629624247550964, + "step": 1888 + }, + { + "epoch": 0.9445, + "grad_norm": 3.184147568889667, + "learning_rate": 4.7225e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8924278616905212, + "step": 1889 + }, + { + "epoch": 0.945, + "grad_norm": 3.723247855861853, + "learning_rate": 4.7250000000000005e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8822695016860962, + "step": 1890 + }, + { + "epoch": 0.9455, + "grad_norm": 2.616488909071611, + "learning_rate": 4.7275e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.8488680720329285, + "step": 1891 + }, + { + "epoch": 0.946, + "grad_norm": 9.246861624972023, + "learning_rate": 4.7300000000000005e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8531824350357056, + "step": 1892 + }, + { + "epoch": 0.9465, + "grad_norm": 2.89853530590805, + "learning_rate": 4.7325e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8939118981361389, + "step": 1893 + }, + { + "epoch": 0.947, + "grad_norm": 5.911451930358914, + "learning_rate": 4.735e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.9010313749313354, + "step": 1894 + }, + { + "epoch": 0.9475, + "grad_norm": 2.6430753391527926, + "learning_rate": 4.737500000000001e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8715706467628479, + "step": 1895 + }, + { + "epoch": 0.948, + "grad_norm": 4.614785200289433, + "learning_rate": 4.74e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8782727122306824, + "step": 1896 + }, + { + "epoch": 0.9485, + "grad_norm": 3.4127783623004833, + "learning_rate": 4.742500000000001e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8573480248451233, + "step": 1897 + }, + { + "epoch": 0.949, + "grad_norm": 4.77019314926549, + "learning_rate": 4.745e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8759504556655884, + "step": 1898 + }, + { + "epoch": 0.9495, + "grad_norm": 2.9248860856242733, + "learning_rate": 4.747500000000001e-06, + "loss": 0.6276, + "mean_token_accuracy": 0.8186217546463013, + "step": 1899 + }, + { + "epoch": 0.95, + "grad_norm": 2.5395701004719125, + "learning_rate": 4.75e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8595773577690125, + "step": 1900 + }, + { + "epoch": 0.9505, + "grad_norm": 5.590912219525201, + "learning_rate": 4.752500000000001e-06, + "loss": 0.5297, + "mean_token_accuracy": 0.8372025489807129, + "step": 1901 + }, + { + "epoch": 0.951, + "grad_norm": 2.3571384450662607, + "learning_rate": 4.755e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8425782322883606, + "step": 1902 + }, + { + "epoch": 0.9515, + "grad_norm": 2.3672033541179394, + "learning_rate": 4.757500000000001e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8964889645576477, + "step": 1903 + }, + { + "epoch": 0.952, + "grad_norm": 6.891579460456281, + "learning_rate": 4.76e-06, + "loss": 0.3123, + "mean_token_accuracy": 0.8951250314712524, + "step": 1904 + }, + { + "epoch": 0.9525, + "grad_norm": 2.3865908978838277, + "learning_rate": 4.7625000000000006e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8859366774559021, + "step": 1905 + }, + { + "epoch": 0.953, + "grad_norm": 4.0507404340094135, + "learning_rate": 4.765e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8680166602134705, + "step": 1906 + }, + { + "epoch": 0.9535, + "grad_norm": 2.9645769936081723, + "learning_rate": 4.7675000000000005e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8597204089164734, + "step": 1907 + }, + { + "epoch": 0.954, + "grad_norm": 5.496816917109838, + "learning_rate": 4.77e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8907427191734314, + "step": 1908 + }, + { + "epoch": 0.9545, + "grad_norm": 2.258925864601257, + "learning_rate": 4.7725000000000005e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8918128609657288, + "step": 1909 + }, + { + "epoch": 0.955, + "grad_norm": 4.4306516452223486, + "learning_rate": 4.775e-06, + "loss": 0.2935, + "mean_token_accuracy": 0.8999489545822144, + "step": 1910 + }, + { + "epoch": 0.9555, + "grad_norm": 2.267320337379333, + "learning_rate": 4.7775e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8712619543075562, + "step": 1911 + }, + { + "epoch": 0.956, + "grad_norm": 11.236638443258729, + "learning_rate": 4.78e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8442808389663696, + "step": 1912 + }, + { + "epoch": 0.9565, + "grad_norm": 3.4341780165582176, + "learning_rate": 4.7825e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8900135159492493, + "step": 1913 + }, + { + "epoch": 0.957, + "grad_norm": 4.416214036922353, + "learning_rate": 4.785e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8602187633514404, + "step": 1914 + }, + { + "epoch": 0.9575, + "grad_norm": 4.758428376307822, + "learning_rate": 4.7875e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8772549033164978, + "step": 1915 + }, + { + "epoch": 0.958, + "grad_norm": 1.9054208639368655, + "learning_rate": 4.79e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8716188073158264, + "step": 1916 + }, + { + "epoch": 0.9585, + "grad_norm": 22.66259418326052, + "learning_rate": 4.7925e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8765586018562317, + "step": 1917 + }, + { + "epoch": 0.959, + "grad_norm": 1.9916069340423397, + "learning_rate": 4.795e-06, + "loss": 0.311, + "mean_token_accuracy": 0.897599995136261, + "step": 1918 + }, + { + "epoch": 0.9595, + "grad_norm": 2.1930335463423303, + "learning_rate": 4.7975e-06, + "loss": 0.2992, + "mean_token_accuracy": 0.9037806987762451, + "step": 1919 + }, + { + "epoch": 0.96, + "grad_norm": 13.754465091754167, + "learning_rate": 4.800000000000001e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8942925333976746, + "step": 1920 + }, + { + "epoch": 0.9605, + "grad_norm": 3.5783490334409715, + "learning_rate": 4.8025e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8508572578430176, + "step": 1921 + }, + { + "epoch": 0.961, + "grad_norm": 3.0093697620160293, + "learning_rate": 4.805000000000001e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8645319938659668, + "step": 1922 + }, + { + "epoch": 0.9615, + "grad_norm": 3.7882822278519717, + "learning_rate": 4.8075e-06, + "loss": 0.2322, + "mean_token_accuracy": 0.9214046597480774, + "step": 1923 + }, + { + "epoch": 0.962, + "grad_norm": 2.221328783060671, + "learning_rate": 4.8100000000000005e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8782335519790649, + "step": 1924 + }, + { + "epoch": 0.9625, + "grad_norm": 14.014099832354507, + "learning_rate": 4.8125e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8527815341949463, + "step": 1925 + }, + { + "epoch": 0.963, + "grad_norm": 3.5526105070600624, + "learning_rate": 4.8150000000000005e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8672372698783875, + "step": 1926 + }, + { + "epoch": 0.9635, + "grad_norm": 3.0238106260770863, + "learning_rate": 4.817500000000001e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.8717121481895447, + "step": 1927 + }, + { + "epoch": 0.964, + "grad_norm": 3.144808547814222, + "learning_rate": 4.8200000000000004e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8772026300430298, + "step": 1928 + }, + { + "epoch": 0.9645, + "grad_norm": 4.479304943264591, + "learning_rate": 4.822500000000001e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8778986930847168, + "step": 1929 + }, + { + "epoch": 0.965, + "grad_norm": 3.9011373200700397, + "learning_rate": 4.825e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.866531252861023, + "step": 1930 + }, + { + "epoch": 0.9655, + "grad_norm": 6.517215030336415, + "learning_rate": 4.827500000000001e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8716517686843872, + "step": 1931 + }, + { + "epoch": 0.966, + "grad_norm": 3.005790050883589, + "learning_rate": 4.83e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8636707067489624, + "step": 1932 + }, + { + "epoch": 0.9665, + "grad_norm": 2.1940230309447633, + "learning_rate": 4.832500000000001e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8518087863922119, + "step": 1933 + }, + { + "epoch": 0.967, + "grad_norm": 4.098831031256946, + "learning_rate": 4.835e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8765556216239929, + "step": 1934 + }, + { + "epoch": 0.9675, + "grad_norm": 2.4967783855439425, + "learning_rate": 4.837500000000001e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8555629849433899, + "step": 1935 + }, + { + "epoch": 0.968, + "grad_norm": 1.9575824068719225, + "learning_rate": 4.84e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8735617399215698, + "step": 1936 + }, + { + "epoch": 0.9685, + "grad_norm": 2.1096097624554306, + "learning_rate": 4.842500000000001e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8872798681259155, + "step": 1937 + }, + { + "epoch": 0.969, + "grad_norm": 3.6154804621551953, + "learning_rate": 4.845e-06, + "loss": 0.5627, + "mean_token_accuracy": 0.8409726619720459, + "step": 1938 + }, + { + "epoch": 0.9695, + "grad_norm": 2.37848570575611, + "learning_rate": 4.847500000000001e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8641101717948914, + "step": 1939 + }, + { + "epoch": 0.97, + "grad_norm": 9.84285902613978, + "learning_rate": 4.85e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8831900358200073, + "step": 1940 + }, + { + "epoch": 0.9705, + "grad_norm": 3.9328736870961074, + "learning_rate": 4.8525000000000006e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8483606576919556, + "step": 1941 + }, + { + "epoch": 0.971, + "grad_norm": 6.2385926392949616, + "learning_rate": 4.855e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8724145293235779, + "step": 1942 + }, + { + "epoch": 0.9715, + "grad_norm": 2.2193950632222, + "learning_rate": 4.8575000000000005e-06, + "loss": 0.2627, + "mean_token_accuracy": 0.9159275889396667, + "step": 1943 + }, + { + "epoch": 0.972, + "grad_norm": 2.954126746240228, + "learning_rate": 4.86e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.8957131505012512, + "step": 1944 + }, + { + "epoch": 0.9725, + "grad_norm": 2.3663072458749452, + "learning_rate": 4.8625000000000005e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8841940760612488, + "step": 1945 + }, + { + "epoch": 0.973, + "grad_norm": 4.070457758803494, + "learning_rate": 4.865e-06, + "loss": 0.6125, + "mean_token_accuracy": 0.8215339183807373, + "step": 1946 + }, + { + "epoch": 0.9735, + "grad_norm": 3.0492066354646457, + "learning_rate": 4.8675e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8830306529998779, + "step": 1947 + }, + { + "epoch": 0.974, + "grad_norm": 5.317725175105106, + "learning_rate": 4.87e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8936491012573242, + "step": 1948 + }, + { + "epoch": 0.9745, + "grad_norm": 2.326468936459559, + "learning_rate": 4.8725e-06, + "loss": 0.2894, + "mean_token_accuracy": 0.9101606011390686, + "step": 1949 + }, + { + "epoch": 0.975, + "grad_norm": 3.3105746015176916, + "learning_rate": 4.875e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8507214188575745, + "step": 1950 + }, + { + "epoch": 0.9755, + "grad_norm": 3.4426344504176694, + "learning_rate": 4.8775e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8793243169784546, + "step": 1951 + }, + { + "epoch": 0.976, + "grad_norm": 3.059595270639217, + "learning_rate": 4.880000000000001e-06, + "loss": 0.6982, + "mean_token_accuracy": 0.8048326969146729, + "step": 1952 + }, + { + "epoch": 0.9765, + "grad_norm": 3.2516385870562656, + "learning_rate": 4.8825e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8780269026756287, + "step": 1953 + }, + { + "epoch": 0.977, + "grad_norm": 2.42926895691309, + "learning_rate": 4.885000000000001e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8931680917739868, + "step": 1954 + }, + { + "epoch": 0.9775, + "grad_norm": 3.5178949763845293, + "learning_rate": 4.8875e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8400262594223022, + "step": 1955 + }, + { + "epoch": 0.978, + "grad_norm": 2.2786293609238584, + "learning_rate": 4.890000000000001e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8792458772659302, + "step": 1956 + }, + { + "epoch": 0.9785, + "grad_norm": 2.3544504731324363, + "learning_rate": 4.8925e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8913289308547974, + "step": 1957 + }, + { + "epoch": 0.979, + "grad_norm": 3.093766718272378, + "learning_rate": 4.8950000000000006e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8725780844688416, + "step": 1958 + }, + { + "epoch": 0.9795, + "grad_norm": 3.772044667087488, + "learning_rate": 4.897500000000001e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.859183669090271, + "step": 1959 + }, + { + "epoch": 0.98, + "grad_norm": 4.151672986473129, + "learning_rate": 4.9000000000000005e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.8582925200462341, + "step": 1960 + }, + { + "epoch": 0.9805, + "grad_norm": 2.7148712212266393, + "learning_rate": 4.902500000000001e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.8995015025138855, + "step": 1961 + }, + { + "epoch": 0.981, + "grad_norm": 2.2910326990557315, + "learning_rate": 4.9050000000000005e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8749773502349854, + "step": 1962 + }, + { + "epoch": 0.9815, + "grad_norm": 3.1890056907506015, + "learning_rate": 4.907500000000001e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8798543810844421, + "step": 1963 + }, + { + "epoch": 0.982, + "grad_norm": 1.9549938520773753, + "learning_rate": 4.9100000000000004e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8868829011917114, + "step": 1964 + }, + { + "epoch": 0.9825, + "grad_norm": 2.327507918560174, + "learning_rate": 4.912500000000001e-06, + "loss": 0.5216, + "mean_token_accuracy": 0.8574068546295166, + "step": 1965 + }, + { + "epoch": 0.983, + "grad_norm": 2.842908698420174, + "learning_rate": 4.915e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8825827836990356, + "step": 1966 + }, + { + "epoch": 0.9835, + "grad_norm": 2.5114406209030524, + "learning_rate": 4.917500000000001e-06, + "loss": 0.2878, + "mean_token_accuracy": 0.9103337526321411, + "step": 1967 + }, + { + "epoch": 0.984, + "grad_norm": 2.8419393916370645, + "learning_rate": 4.92e-06, + "loss": 0.6569, + "mean_token_accuracy": 0.8270687460899353, + "step": 1968 + }, + { + "epoch": 0.9845, + "grad_norm": 2.927743594293173, + "learning_rate": 4.922500000000001e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8583970069885254, + "step": 1969 + }, + { + "epoch": 0.985, + "grad_norm": 2.7469689599516416, + "learning_rate": 4.925e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8810279369354248, + "step": 1970 + }, + { + "epoch": 0.9855, + "grad_norm": 4.734343499930797, + "learning_rate": 4.927500000000001e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8809946775436401, + "step": 1971 + }, + { + "epoch": 0.986, + "grad_norm": 3.1631155085161105, + "learning_rate": 4.93e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8697497844696045, + "step": 1972 + }, + { + "epoch": 0.9865, + "grad_norm": 2.3080030694069533, + "learning_rate": 4.932500000000001e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8419007658958435, + "step": 1973 + }, + { + "epoch": 0.987, + "grad_norm": 9.952696703838592, + "learning_rate": 4.935e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8667883276939392, + "step": 1974 + }, + { + "epoch": 0.9875, + "grad_norm": 2.307015654841235, + "learning_rate": 4.937500000000001e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8682466149330139, + "step": 1975 + }, + { + "epoch": 0.988, + "grad_norm": 2.015786348172131, + "learning_rate": 4.94e-06, + "loss": 0.2645, + "mean_token_accuracy": 0.911814272403717, + "step": 1976 + }, + { + "epoch": 0.9885, + "grad_norm": 2.469875101294022, + "learning_rate": 4.9425000000000005e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8672624826431274, + "step": 1977 + }, + { + "epoch": 0.989, + "grad_norm": 3.6450449718879097, + "learning_rate": 4.945e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8776470422744751, + "step": 1978 + }, + { + "epoch": 0.9895, + "grad_norm": 3.018469779627854, + "learning_rate": 4.9475000000000005e-06, + "loss": 0.6253, + "mean_token_accuracy": 0.8170954585075378, + "step": 1979 + }, + { + "epoch": 0.99, + "grad_norm": 18.16602282573675, + "learning_rate": 4.95e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8608788251876831, + "step": 1980 + }, + { + "epoch": 0.9905, + "grad_norm": 2.484150151338406, + "learning_rate": 4.9525000000000004e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8650346398353577, + "step": 1981 + }, + { + "epoch": 0.991, + "grad_norm": 2.3631409487194706, + "learning_rate": 4.955e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8802506923675537, + "step": 1982 + }, + { + "epoch": 0.9915, + "grad_norm": 2.6775790432326567, + "learning_rate": 4.9575e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8735920190811157, + "step": 1983 + }, + { + "epoch": 0.992, + "grad_norm": 2.444436092625368, + "learning_rate": 4.960000000000001e-06, + "loss": 0.5744, + "mean_token_accuracy": 0.8313527703285217, + "step": 1984 + }, + { + "epoch": 0.9925, + "grad_norm": 2.617691061565062, + "learning_rate": 4.9625e-06, + "loss": 0.5362, + "mean_token_accuracy": 0.8343279957771301, + "step": 1985 + }, + { + "epoch": 0.993, + "grad_norm": 4.350059007825223, + "learning_rate": 4.965000000000001e-06, + "loss": 0.2807, + "mean_token_accuracy": 0.9080016016960144, + "step": 1986 + }, + { + "epoch": 0.9935, + "grad_norm": 2.5739214526296315, + "learning_rate": 4.9675e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8507552146911621, + "step": 1987 + }, + { + "epoch": 0.994, + "grad_norm": 2.4701707255372725, + "learning_rate": 4.970000000000001e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8746495246887207, + "step": 1988 + }, + { + "epoch": 0.9945, + "grad_norm": 2.0828351107228706, + "learning_rate": 4.9725e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.87625652551651, + "step": 1989 + }, + { + "epoch": 0.995, + "grad_norm": 2.710213460945703, + "learning_rate": 4.975000000000001e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.891161322593689, + "step": 1990 + }, + { + "epoch": 0.9955, + "grad_norm": 2.0894660240725544, + "learning_rate": 4.977500000000001e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.881982684135437, + "step": 1991 + }, + { + "epoch": 0.996, + "grad_norm": 7.061909479479843, + "learning_rate": 4.980000000000001e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8510638475418091, + "step": 1992 + }, + { + "epoch": 0.9965, + "grad_norm": 3.437773844656248, + "learning_rate": 4.982500000000001e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.87926185131073, + "step": 1993 + }, + { + "epoch": 0.997, + "grad_norm": 2.8513825789575336, + "learning_rate": 4.9850000000000006e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8535290956497192, + "step": 1994 + }, + { + "epoch": 0.9975, + "grad_norm": 5.4494597120809924, + "learning_rate": 4.987500000000001e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8918149471282959, + "step": 1995 + }, + { + "epoch": 0.998, + "grad_norm": 2.4162351726587805, + "learning_rate": 4.9900000000000005e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8409758806228638, + "step": 1996 + }, + { + "epoch": 0.9985, + "grad_norm": 1.957044975462362, + "learning_rate": 4.992500000000001e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8837174773216248, + "step": 1997 + }, + { + "epoch": 0.999, + "grad_norm": 4.609078134130719, + "learning_rate": 4.9950000000000005e-06, + "loss": 0.532, + "mean_token_accuracy": 0.8423028588294983, + "step": 1998 + }, + { + "epoch": 0.9995, + "grad_norm": 4.8969643722136045, + "learning_rate": 4.997500000000001e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8781460523605347, + "step": 1999 + }, + { + "epoch": 1.0, + "grad_norm": 4.732704495099117, + "learning_rate": 5e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8761390447616577, + "step": 2000 + }, + { + "epoch": 1.0005, + "grad_norm": 2.0893218796384536, + "learning_rate": 4.9999999619228235e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8659112453460693, + "step": 2001 + }, + { + "epoch": 1.001, + "grad_norm": 3.162158378129106, + "learning_rate": 4.999999847691292e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8871589303016663, + "step": 2002 + }, + { + "epoch": 1.0015, + "grad_norm": 3.741601489270102, + "learning_rate": 4.999999657305411e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8771735429763794, + "step": 2003 + }, + { + "epoch": 1.002, + "grad_norm": 2.5372918139020664, + "learning_rate": 4.999999390765186e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.8375329375267029, + "step": 2004 + }, + { + "epoch": 1.0025, + "grad_norm": 2.803963329279, + "learning_rate": 4.999999048070624e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8965731859207153, + "step": 2005 + }, + { + "epoch": 1.003, + "grad_norm": 2.268287252395865, + "learning_rate": 4.999998629221737e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8674153089523315, + "step": 2006 + }, + { + "epoch": 1.0035, + "grad_norm": 2.2625839532132113, + "learning_rate": 4.999998134218537e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8906145691871643, + "step": 2007 + }, + { + "epoch": 1.004, + "grad_norm": 2.7455746127268235, + "learning_rate": 4.999997563061038e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8927484154701233, + "step": 2008 + }, + { + "epoch": 1.0045, + "grad_norm": 2.64465885939225, + "learning_rate": 4.9999969157492586e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8777628540992737, + "step": 2009 + }, + { + "epoch": 1.005, + "grad_norm": 3.9602143084151065, + "learning_rate": 4.99999619228322e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8694560527801514, + "step": 2010 + }, + { + "epoch": 1.0055, + "grad_norm": 11.036305910675235, + "learning_rate": 4.999995392662941e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8644624948501587, + "step": 2011 + }, + { + "epoch": 1.006, + "grad_norm": 2.7937906926716862, + "learning_rate": 4.999994516888449e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8940821290016174, + "step": 2012 + }, + { + "epoch": 1.0065, + "grad_norm": 15.914552032311164, + "learning_rate": 4.999993564959768e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8960273861885071, + "step": 2013 + }, + { + "epoch": 1.007, + "grad_norm": 2.1881460983161682, + "learning_rate": 4.9999925368769286e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8922507762908936, + "step": 2014 + }, + { + "epoch": 1.0075, + "grad_norm": 2.508085865296378, + "learning_rate": 4.999991432639962e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8931190967559814, + "step": 2015 + }, + { + "epoch": 1.008, + "grad_norm": 4.104359990811608, + "learning_rate": 4.999990252248902e-06, + "loss": 0.5637, + "mean_token_accuracy": 0.8435498476028442, + "step": 2016 + }, + { + "epoch": 1.0085, + "grad_norm": 2.410370196671925, + "learning_rate": 4.999988995703784e-06, + "loss": 0.516, + "mean_token_accuracy": 0.8475073575973511, + "step": 2017 + }, + { + "epoch": 1.009, + "grad_norm": 3.248924298821353, + "learning_rate": 4.999987663004646e-06, + "loss": 0.2786, + "mean_token_accuracy": 0.9050785899162292, + "step": 2018 + }, + { + "epoch": 1.0095, + "grad_norm": 2.993069848803185, + "learning_rate": 4.999986254151529e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8979458808898926, + "step": 2019 + }, + { + "epoch": 1.01, + "grad_norm": 2.34683082859088, + "learning_rate": 4.999984769144476e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8659626245498657, + "step": 2020 + }, + { + "epoch": 1.0105, + "grad_norm": 2.7570460098316554, + "learning_rate": 4.999983207983533e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.9001200199127197, + "step": 2021 + }, + { + "epoch": 1.011, + "grad_norm": 3.4913838407393802, + "learning_rate": 4.999981570668746e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8874003887176514, + "step": 2022 + }, + { + "epoch": 1.0115, + "grad_norm": 2.77095873221066, + "learning_rate": 4.999979857200165e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8840375542640686, + "step": 2023 + }, + { + "epoch": 1.012, + "grad_norm": 4.019224819109813, + "learning_rate": 4.999978067577844e-06, + "loss": 0.425, + "mean_token_accuracy": 0.878923773765564, + "step": 2024 + }, + { + "epoch": 1.0125, + "grad_norm": 3.8225532558684976, + "learning_rate": 4.999976201801837e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8607112169265747, + "step": 2025 + }, + { + "epoch": 1.013, + "grad_norm": 2.9418805016502656, + "learning_rate": 4.999974259872199e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8914636373519897, + "step": 2026 + }, + { + "epoch": 1.0135, + "grad_norm": 1.8998752082818833, + "learning_rate": 4.999972241788991e-06, + "loss": 0.2594, + "mean_token_accuracy": 0.9139785170555115, + "step": 2027 + }, + { + "epoch": 1.014, + "grad_norm": 11.17166413315654, + "learning_rate": 4.999970147552273e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8793020844459534, + "step": 2028 + }, + { + "epoch": 1.0145, + "grad_norm": 2.5606367907539784, + "learning_rate": 4.999967977162109e-06, + "loss": 0.2914, + "mean_token_accuracy": 0.9089841246604919, + "step": 2029 + }, + { + "epoch": 1.015, + "grad_norm": 4.663802846295132, + "learning_rate": 4.999965730618567e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8783810138702393, + "step": 2030 + }, + { + "epoch": 1.0155, + "grad_norm": 1.9991935928149904, + "learning_rate": 4.9999634079217145e-06, + "loss": 0.2559, + "mean_token_accuracy": 0.9133924245834351, + "step": 2031 + }, + { + "epoch": 1.016, + "grad_norm": 5.615859678540278, + "learning_rate": 4.999961009071621e-06, + "loss": 0.195, + "mean_token_accuracy": 0.930656909942627, + "step": 2032 + }, + { + "epoch": 1.0165, + "grad_norm": 5.690949510520285, + "learning_rate": 4.999958534068361e-06, + "loss": 0.2535, + "mean_token_accuracy": 0.9165821075439453, + "step": 2033 + }, + { + "epoch": 1.017, + "grad_norm": 2.570799906357114, + "learning_rate": 4.999955982912009e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8918296694755554, + "step": 2034 + }, + { + "epoch": 1.0175, + "grad_norm": 4.759867156186752, + "learning_rate": 4.999953355602643e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8939828276634216, + "step": 2035 + }, + { + "epoch": 1.018, + "grad_norm": 3.1501975977498358, + "learning_rate": 4.999950652140343e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8602872490882874, + "step": 2036 + }, + { + "epoch": 1.0185, + "grad_norm": 2.4599453622100733, + "learning_rate": 4.999947872525192e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8855721354484558, + "step": 2037 + }, + { + "epoch": 1.019, + "grad_norm": 11.897046547072858, + "learning_rate": 4.999945016757274e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8885655403137207, + "step": 2038 + }, + { + "epoch": 1.0195, + "grad_norm": 2.259693422476896, + "learning_rate": 4.999942084836676e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8683133721351624, + "step": 2039 + }, + { + "epoch": 1.02, + "grad_norm": 2.6758783237802612, + "learning_rate": 4.999939076763487e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8761528134346008, + "step": 2040 + }, + { + "epoch": 1.0205, + "grad_norm": 2.5957780484930515, + "learning_rate": 4.9999359925378e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8755288124084473, + "step": 2041 + }, + { + "epoch": 1.021, + "grad_norm": 2.8902249716726933, + "learning_rate": 4.999932832159707e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8742838501930237, + "step": 2042 + }, + { + "epoch": 1.0215, + "grad_norm": 5.123415915463467, + "learning_rate": 4.999929595629307e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8546520471572876, + "step": 2043 + }, + { + "epoch": 1.022, + "grad_norm": 2.7742033450319, + "learning_rate": 4.999926282946695e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.890239417552948, + "step": 2044 + }, + { + "epoch": 1.0225, + "grad_norm": 2.5598808985698134, + "learning_rate": 4.999922894111975e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8823312520980835, + "step": 2045 + }, + { + "epoch": 1.023, + "grad_norm": 30.55421404972077, + "learning_rate": 4.9999194291252485e-06, + "loss": 0.2781, + "mean_token_accuracy": 0.9046831130981445, + "step": 2046 + }, + { + "epoch": 1.0235, + "grad_norm": 2.571650532087633, + "learning_rate": 4.999915887986621e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8785010576248169, + "step": 2047 + }, + { + "epoch": 1.024, + "grad_norm": 2.863700167690974, + "learning_rate": 4.999912270696202e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8486825823783875, + "step": 2048 + }, + { + "epoch": 1.0245, + "grad_norm": 10.285280921212788, + "learning_rate": 4.9999085772541e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8668941855430603, + "step": 2049 + }, + { + "epoch": 1.025, + "grad_norm": 2.4026111523433085, + "learning_rate": 4.9999048076604286e-06, + "loss": 0.2261, + "mean_token_accuracy": 0.9217900633811951, + "step": 2050 + }, + { + "epoch": 1.0255, + "grad_norm": 35.15880035924436, + "learning_rate": 4.999900961915302e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8738055229187012, + "step": 2051 + }, + { + "epoch": 1.026, + "grad_norm": 1.7729542282375472, + "learning_rate": 4.999897040018838e-06, + "loss": 0.2655, + "mean_token_accuracy": 0.9122059345245361, + "step": 2052 + }, + { + "epoch": 1.0265, + "grad_norm": 4.76264288618489, + "learning_rate": 4.9998930419711544e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8437286019325256, + "step": 2053 + }, + { + "epoch": 1.027, + "grad_norm": 4.479057977552017, + "learning_rate": 4.999888967772375e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8690319061279297, + "step": 2054 + }, + { + "epoch": 1.0275, + "grad_norm": 2.632041236392044, + "learning_rate": 4.9998848174226225e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8754149675369263, + "step": 2055 + }, + { + "epoch": 1.028, + "grad_norm": 2.4022239255856963, + "learning_rate": 4.999880590922025e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8957421183586121, + "step": 2056 + }, + { + "epoch": 1.0285, + "grad_norm": 4.913940575921319, + "learning_rate": 4.999876288270709e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.871910810470581, + "step": 2057 + }, + { + "epoch": 1.029, + "grad_norm": 3.964948483702491, + "learning_rate": 4.999871909468807e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8659179210662842, + "step": 2058 + }, + { + "epoch": 1.0295, + "grad_norm": 11.535395235082419, + "learning_rate": 4.999867454516453e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8842443823814392, + "step": 2059 + }, + { + "epoch": 1.03, + "grad_norm": 2.2502859114461606, + "learning_rate": 4.999862923413781e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.877828061580658, + "step": 2060 + }, + { + "epoch": 1.0305, + "grad_norm": 2.022361789651449, + "learning_rate": 4.99985831616093e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8695046305656433, + "step": 2061 + }, + { + "epoch": 1.031, + "grad_norm": 2.6471320483089804, + "learning_rate": 4.99985363275804e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8715356588363647, + "step": 2062 + }, + { + "epoch": 1.0315, + "grad_norm": 1.9220460430265363, + "learning_rate": 4.999848873205254e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8952183723449707, + "step": 2063 + }, + { + "epoch": 1.032, + "grad_norm": 32.30957500878129, + "learning_rate": 4.999844037502717e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.889310896396637, + "step": 2064 + }, + { + "epoch": 1.0325, + "grad_norm": 2.5290977936865904, + "learning_rate": 4.999839125650576e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8822850584983826, + "step": 2065 + }, + { + "epoch": 1.033, + "grad_norm": 3.4743804105857894, + "learning_rate": 4.99983413764898e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8749750852584839, + "step": 2066 + }, + { + "epoch": 1.0335, + "grad_norm": 2.8799833208031957, + "learning_rate": 4.999829073498082e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8713037371635437, + "step": 2067 + }, + { + "epoch": 1.034, + "grad_norm": 2.5767598983764364, + "learning_rate": 4.999823933198037e-06, + "loss": 0.2913, + "mean_token_accuracy": 0.8891043663024902, + "step": 2068 + }, + { + "epoch": 1.0345, + "grad_norm": 4.269855923328759, + "learning_rate": 4.9998187167489996e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8838709592819214, + "step": 2069 + }, + { + "epoch": 1.035, + "grad_norm": 3.0687813044134225, + "learning_rate": 4.9998134241511305e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8989919424057007, + "step": 2070 + }, + { + "epoch": 1.0355, + "grad_norm": 5.158791020176613, + "learning_rate": 4.999808055404589e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8879361748695374, + "step": 2071 + }, + { + "epoch": 1.036, + "grad_norm": 3.189898107109392, + "learning_rate": 4.999802610509541e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8869722485542297, + "step": 2072 + }, + { + "epoch": 1.0365, + "grad_norm": 3.3759209887352353, + "learning_rate": 4.99979708946615e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.9069343209266663, + "step": 2073 + }, + { + "epoch": 1.037, + "grad_norm": 2.307243973162988, + "learning_rate": 4.999791492274586e-06, + "loss": 0.2948, + "mean_token_accuracy": 0.9037787318229675, + "step": 2074 + }, + { + "epoch": 1.0375, + "grad_norm": 3.093203644237893, + "learning_rate": 4.999785818935018e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.864449679851532, + "step": 2075 + }, + { + "epoch": 1.038, + "grad_norm": 2.564830957974422, + "learning_rate": 4.999780069447619e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.8401941061019897, + "step": 2076 + }, + { + "epoch": 1.0385, + "grad_norm": 1.9478173395006182, + "learning_rate": 4.999774243812566e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8958367705345154, + "step": 2077 + }, + { + "epoch": 1.039, + "grad_norm": 3.3598927827335237, + "learning_rate": 4.9997683420300355e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8892075419425964, + "step": 2078 + }, + { + "epoch": 1.0395, + "grad_norm": 3.0842267780213852, + "learning_rate": 4.999762364100206e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8598014712333679, + "step": 2079 + }, + { + "epoch": 1.04, + "grad_norm": 2.4993835225847563, + "learning_rate": 4.999756310023261e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8984040021896362, + "step": 2080 + }, + { + "epoch": 1.0405, + "grad_norm": 2.434853891684262, + "learning_rate": 4.9997501797993846e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8591896295547485, + "step": 2081 + }, + { + "epoch": 1.041, + "grad_norm": 2.7573979426915805, + "learning_rate": 4.999743973428763e-06, + "loss": 0.2712, + "mean_token_accuracy": 0.8996260762214661, + "step": 2082 + }, + { + "epoch": 1.0415, + "grad_norm": 3.1712565692124164, + "learning_rate": 4.999737690911586e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8951892852783203, + "step": 2083 + }, + { + "epoch": 1.042, + "grad_norm": 2.74341040948717, + "learning_rate": 4.999731332248044e-06, + "loss": 0.2633, + "mean_token_accuracy": 0.913586437702179, + "step": 2084 + }, + { + "epoch": 1.0425, + "grad_norm": 3.344146688724244, + "learning_rate": 4.999724897438332e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8911980390548706, + "step": 2085 + }, + { + "epoch": 1.043, + "grad_norm": 2.4104503660855356, + "learning_rate": 4.999718386482645e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8853635787963867, + "step": 2086 + }, + { + "epoch": 1.0435, + "grad_norm": 2.324674016434321, + "learning_rate": 4.999711799381182e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8873834013938904, + "step": 2087 + }, + { + "epoch": 1.044, + "grad_norm": 8.065380852699272, + "learning_rate": 4.999705136134143e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8874826431274414, + "step": 2088 + }, + { + "epoch": 1.0445, + "grad_norm": 1.8941745158592835, + "learning_rate": 4.999698396741731e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.884336531162262, + "step": 2089 + }, + { + "epoch": 1.045, + "grad_norm": 2.5133460791039566, + "learning_rate": 4.9996915812041515e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8746739625930786, + "step": 2090 + }, + { + "epoch": 1.0455, + "grad_norm": 2.200710653781544, + "learning_rate": 4.9996846895216135e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8822943568229675, + "step": 2091 + }, + { + "epoch": 1.046, + "grad_norm": 3.841831339349648, + "learning_rate": 4.9996777216943245e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8551566004753113, + "step": 2092 + }, + { + "epoch": 1.0465, + "grad_norm": 2.831506558063351, + "learning_rate": 4.999670677722498e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8882978558540344, + "step": 2093 + }, + { + "epoch": 1.047, + "grad_norm": 5.009595892732167, + "learning_rate": 4.999663557606349e-06, + "loss": 0.3118, + "mean_token_accuracy": 0.8875889182090759, + "step": 2094 + }, + { + "epoch": 1.0475, + "grad_norm": 3.2665904659301184, + "learning_rate": 4.999656361346094e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8795425295829773, + "step": 2095 + }, + { + "epoch": 1.048, + "grad_norm": 2.2363177580046973, + "learning_rate": 4.999649088941951e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8644727468490601, + "step": 2096 + }, + { + "epoch": 1.0485, + "grad_norm": 1.9271451636729404, + "learning_rate": 4.999641740394144e-06, + "loss": 0.2066, + "mean_token_accuracy": 0.9239269495010376, + "step": 2097 + }, + { + "epoch": 1.049, + "grad_norm": 5.1413558552167125, + "learning_rate": 4.999634315702895e-06, + "loss": 0.5206, + "mean_token_accuracy": 0.8561788201332092, + "step": 2098 + }, + { + "epoch": 1.0495, + "grad_norm": 3.6032133075207824, + "learning_rate": 4.99962681486843e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8800580501556396, + "step": 2099 + }, + { + "epoch": 1.05, + "grad_norm": 2.6907886067186024, + "learning_rate": 4.9996192378909785e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.889586329460144, + "step": 2100 + }, + { + "epoch": 1.0505, + "grad_norm": 2.78220052067072, + "learning_rate": 4.999611584770771e-06, + "loss": 0.2268, + "mean_token_accuracy": 0.9242398738861084, + "step": 2101 + }, + { + "epoch": 1.051, + "grad_norm": 2.563256785792912, + "learning_rate": 4.999603855508041e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8790803551673889, + "step": 2102 + }, + { + "epoch": 1.0515, + "grad_norm": 2.418923747668519, + "learning_rate": 4.999596050103022e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8976648449897766, + "step": 2103 + }, + { + "epoch": 1.052, + "grad_norm": 7.245054538185173, + "learning_rate": 4.999588168555954e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8688662052154541, + "step": 2104 + }, + { + "epoch": 1.0525, + "grad_norm": 2.304275980819438, + "learning_rate": 4.9995802108670775e-06, + "loss": 0.2828, + "mean_token_accuracy": 0.8957560658454895, + "step": 2105 + }, + { + "epoch": 1.053, + "grad_norm": 2.511727801033543, + "learning_rate": 4.999572177036632e-06, + "loss": 0.5247, + "mean_token_accuracy": 0.8427539467811584, + "step": 2106 + }, + { + "epoch": 1.0535, + "grad_norm": 1.9631175035942228, + "learning_rate": 4.999564067064866e-06, + "loss": 0.2704, + "mean_token_accuracy": 0.9051799774169922, + "step": 2107 + }, + { + "epoch": 1.054, + "grad_norm": 2.3438550354662135, + "learning_rate": 4.999555880952023e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8537226915359497, + "step": 2108 + }, + { + "epoch": 1.0545, + "grad_norm": 2.537621497147623, + "learning_rate": 4.999547618698354e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8750803470611572, + "step": 2109 + }, + { + "epoch": 1.055, + "grad_norm": 2.98567061195742, + "learning_rate": 4.999539280304111e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8527963161468506, + "step": 2110 + }, + { + "epoch": 1.0555, + "grad_norm": 1.8705752735301926, + "learning_rate": 4.999530865769547e-06, + "loss": 0.2885, + "mean_token_accuracy": 0.8924367427825928, + "step": 2111 + }, + { + "epoch": 1.056, + "grad_norm": 2.436978353705962, + "learning_rate": 4.99952237509492e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8945392966270447, + "step": 2112 + }, + { + "epoch": 1.0565, + "grad_norm": 5.055153497223533, + "learning_rate": 4.999513808280486e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8537689447402954, + "step": 2113 + }, + { + "epoch": 1.057, + "grad_norm": 2.3816144386377216, + "learning_rate": 4.999505165326509e-06, + "loss": 0.2697, + "mean_token_accuracy": 0.9122071862220764, + "step": 2114 + }, + { + "epoch": 1.0575, + "grad_norm": 4.088762391324643, + "learning_rate": 4.999496446233249e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8585498929023743, + "step": 2115 + }, + { + "epoch": 1.058, + "grad_norm": 6.797557047905753, + "learning_rate": 4.999487651000975e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8797327280044556, + "step": 2116 + }, + { + "epoch": 1.0585, + "grad_norm": 2.2554971945394544, + "learning_rate": 4.999478779629953e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8953292965888977, + "step": 2117 + }, + { + "epoch": 1.059, + "grad_norm": 3.2952649177063824, + "learning_rate": 4.999469832120454e-06, + "loss": 0.2976, + "mean_token_accuracy": 0.908757209777832, + "step": 2118 + }, + { + "epoch": 1.0594999999999999, + "grad_norm": 1.8511163304798066, + "learning_rate": 4.999460808472749e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8929723501205444, + "step": 2119 + }, + { + "epoch": 1.06, + "grad_norm": 2.5802224196837025, + "learning_rate": 4.999451708687114e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.9025565981864929, + "step": 2120 + }, + { + "epoch": 1.0605, + "grad_norm": 3.2515933441139167, + "learning_rate": 4.999442532763826e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8745512962341309, + "step": 2121 + }, + { + "epoch": 1.061, + "grad_norm": 2.709718324671315, + "learning_rate": 4.999433280703166e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8948412537574768, + "step": 2122 + }, + { + "epoch": 1.0615, + "grad_norm": 6.996198284700598, + "learning_rate": 4.999423952505413e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8788766264915466, + "step": 2123 + }, + { + "epoch": 1.062, + "grad_norm": 2.69096973689344, + "learning_rate": 4.999414548170853e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8696955442428589, + "step": 2124 + }, + { + "epoch": 1.0625, + "grad_norm": 3.02882612508869, + "learning_rate": 4.999405067699773e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8766319751739502, + "step": 2125 + }, + { + "epoch": 1.063, + "grad_norm": 2.2650769999801588, + "learning_rate": 4.999395511092461e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8856627345085144, + "step": 2126 + }, + { + "epoch": 1.0635, + "grad_norm": 2.956665251699805, + "learning_rate": 4.999385878349207e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8904486894607544, + "step": 2127 + }, + { + "epoch": 1.064, + "grad_norm": 2.140444527127992, + "learning_rate": 4.999376169470306e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8966642618179321, + "step": 2128 + }, + { + "epoch": 1.0645, + "grad_norm": 2.3836156552077448, + "learning_rate": 4.999366384456053e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8725519180297852, + "step": 2129 + }, + { + "epoch": 1.065, + "grad_norm": 5.0495330788562045, + "learning_rate": 4.999356523306746e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8921586275100708, + "step": 2130 + }, + { + "epoch": 1.0655000000000001, + "grad_norm": 2.013448419262611, + "learning_rate": 4.999346586022686e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8838951587677002, + "step": 2131 + }, + { + "epoch": 1.066, + "grad_norm": 4.678289691097921, + "learning_rate": 4.999336572604176e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8845556378364563, + "step": 2132 + }, + { + "epoch": 1.0665, + "grad_norm": 2.8504792901803753, + "learning_rate": 4.999326483051519e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8883528709411621, + "step": 2133 + }, + { + "epoch": 1.067, + "grad_norm": 2.0389275300290586, + "learning_rate": 4.999316317365025e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8810414671897888, + "step": 2134 + }, + { + "epoch": 1.0675, + "grad_norm": 2.9869914659859447, + "learning_rate": 4.999306075545002e-06, + "loss": 0.505, + "mean_token_accuracy": 0.8364839553833008, + "step": 2135 + }, + { + "epoch": 1.068, + "grad_norm": 2.747872802629393, + "learning_rate": 4.999295757591762e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8879441618919373, + "step": 2136 + }, + { + "epoch": 1.0685, + "grad_norm": 3.198362067464647, + "learning_rate": 4.99928536350562e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8797626495361328, + "step": 2137 + }, + { + "epoch": 1.069, + "grad_norm": 2.5494217795207534, + "learning_rate": 4.999274893286893e-06, + "loss": 0.2731, + "mean_token_accuracy": 0.8971295952796936, + "step": 2138 + }, + { + "epoch": 1.0695000000000001, + "grad_norm": 2.4969571838749567, + "learning_rate": 4.999264346935898e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8925355076789856, + "step": 2139 + }, + { + "epoch": 1.07, + "grad_norm": 3.9017130153121085, + "learning_rate": 4.9992537244529585e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8606964945793152, + "step": 2140 + }, + { + "epoch": 1.0705, + "grad_norm": 7.735325452079604, + "learning_rate": 4.999243025838396e-06, + "loss": 0.375, + "mean_token_accuracy": 0.876855731010437, + "step": 2141 + }, + { + "epoch": 1.071, + "grad_norm": 2.544966707739719, + "learning_rate": 4.999232251092538e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8861283659934998, + "step": 2142 + }, + { + "epoch": 1.0715, + "grad_norm": 2.2170608883588243, + "learning_rate": 4.999221400215714e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8588331341743469, + "step": 2143 + }, + { + "epoch": 1.072, + "grad_norm": 4.357455067458866, + "learning_rate": 4.99921047320825e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8681837916374207, + "step": 2144 + }, + { + "epoch": 1.0725, + "grad_norm": 3.874418031083091, + "learning_rate": 4.999199470070484e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8834951519966125, + "step": 2145 + }, + { + "epoch": 1.073, + "grad_norm": 2.5951844714535177, + "learning_rate": 4.999188390802747e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8977721333503723, + "step": 2146 + }, + { + "epoch": 1.0735, + "grad_norm": 3.183584413163105, + "learning_rate": 4.999177235405379e-06, + "loss": 0.3073, + "mean_token_accuracy": 0.9032410979270935, + "step": 2147 + }, + { + "epoch": 1.074, + "grad_norm": 2.7746090019007843, + "learning_rate": 4.999166003878718e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8918918967247009, + "step": 2148 + }, + { + "epoch": 1.0745, + "grad_norm": 2.377299068461255, + "learning_rate": 4.999154696223109e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8930333256721497, + "step": 2149 + }, + { + "epoch": 1.075, + "grad_norm": 2.3845902445890013, + "learning_rate": 4.999143312438893e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8828750252723694, + "step": 2150 + }, + { + "epoch": 1.0755, + "grad_norm": 2.6396571829289504, + "learning_rate": 4.99913185252642e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8758549690246582, + "step": 2151 + }, + { + "epoch": 1.076, + "grad_norm": 2.9762852914011697, + "learning_rate": 4.9991203164860365e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.9086456894874573, + "step": 2152 + }, + { + "epoch": 1.0765, + "grad_norm": 2.1434510198284658, + "learning_rate": 4.999108704318095e-06, + "loss": 0.322, + "mean_token_accuracy": 0.9010553956031799, + "step": 2153 + }, + { + "epoch": 1.077, + "grad_norm": 4.685614984266839, + "learning_rate": 4.99909701602295e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.886158287525177, + "step": 2154 + }, + { + "epoch": 1.0775, + "grad_norm": 3.09241362338344, + "learning_rate": 4.9990852516009556e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8659083843231201, + "step": 2155 + }, + { + "epoch": 1.078, + "grad_norm": 2.5656934503426436, + "learning_rate": 4.9990734110524715e-06, + "loss": 0.52, + "mean_token_accuracy": 0.8582836985588074, + "step": 2156 + }, + { + "epoch": 1.0785, + "grad_norm": 2.032492662859778, + "learning_rate": 4.999061494377859e-06, + "loss": 0.3093, + "mean_token_accuracy": 0.9061724543571472, + "step": 2157 + }, + { + "epoch": 1.079, + "grad_norm": 2.2887566402766, + "learning_rate": 4.99904950157748e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.9006168842315674, + "step": 2158 + }, + { + "epoch": 1.0795, + "grad_norm": 2.4880045415840475, + "learning_rate": 4.9990374326517e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8783832788467407, + "step": 2159 + }, + { + "epoch": 1.08, + "grad_norm": 5.962314740305097, + "learning_rate": 4.999025287600886e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8466230034828186, + "step": 2160 + }, + { + "epoch": 1.0805, + "grad_norm": 3.433143331948326, + "learning_rate": 4.99901306642541e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.8938401341438293, + "step": 2161 + }, + { + "epoch": 1.081, + "grad_norm": 2.701757357478633, + "learning_rate": 4.999000769125642e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8741065859794617, + "step": 2162 + }, + { + "epoch": 1.0815, + "grad_norm": 45.041863272636405, + "learning_rate": 4.998988395701958e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8667575120925903, + "step": 2163 + }, + { + "epoch": 1.082, + "grad_norm": 6.881984481825635, + "learning_rate": 4.998975946154734e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8918548822402954, + "step": 2164 + }, + { + "epoch": 1.0825, + "grad_norm": 2.5392787316291665, + "learning_rate": 4.998963420484349e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8925256729125977, + "step": 2165 + }, + { + "epoch": 1.083, + "grad_norm": 2.677468197869087, + "learning_rate": 4.998950818691187e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8606340885162354, + "step": 2166 + }, + { + "epoch": 1.0835, + "grad_norm": 2.634131089924316, + "learning_rate": 4.998938140775629e-06, + "loss": 0.318, + "mean_token_accuracy": 0.9048349857330322, + "step": 2167 + }, + { + "epoch": 1.084, + "grad_norm": 17.36681170762602, + "learning_rate": 4.998925386738063e-06, + "loss": 0.2381, + "mean_token_accuracy": 0.9193093776702881, + "step": 2168 + }, + { + "epoch": 1.0845, + "grad_norm": 2.5335676663835685, + "learning_rate": 4.998912556578877e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8863903880119324, + "step": 2169 + }, + { + "epoch": 1.085, + "grad_norm": 3.7162844326757645, + "learning_rate": 4.9988996502984604e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8712739944458008, + "step": 2170 + }, + { + "epoch": 1.0855, + "grad_norm": 27.317148342303714, + "learning_rate": 4.998886667897208e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8795902729034424, + "step": 2171 + }, + { + "epoch": 1.086, + "grad_norm": 2.5396204940488536, + "learning_rate": 4.998873609375516e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8873443007469177, + "step": 2172 + }, + { + "epoch": 1.0865, + "grad_norm": 2.2278173623932456, + "learning_rate": 4.99886047473378e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8653969168663025, + "step": 2173 + }, + { + "epoch": 1.087, + "grad_norm": 2.8258675323606863, + "learning_rate": 4.998847263972402e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8801381587982178, + "step": 2174 + }, + { + "epoch": 1.0875, + "grad_norm": 1.602046282174369, + "learning_rate": 4.998833977091783e-06, + "loss": 0.2215, + "mean_token_accuracy": 0.9212929010391235, + "step": 2175 + }, + { + "epoch": 1.088, + "grad_norm": 3.633456243116383, + "learning_rate": 4.998820614092328e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8583628535270691, + "step": 2176 + }, + { + "epoch": 1.0885, + "grad_norm": 12.31703227976285, + "learning_rate": 4.998807174974445e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8746690154075623, + "step": 2177 + }, + { + "epoch": 1.089, + "grad_norm": 2.5238707470434405, + "learning_rate": 4.998793659738542e-06, + "loss": 0.429, + "mean_token_accuracy": 0.865725576877594, + "step": 2178 + }, + { + "epoch": 1.0895, + "grad_norm": 2.886796126137583, + "learning_rate": 4.998780068385033e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8912274241447449, + "step": 2179 + }, + { + "epoch": 1.09, + "grad_norm": 12.807289300295762, + "learning_rate": 4.998766400914329e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8758289217948914, + "step": 2180 + }, + { + "epoch": 1.0905, + "grad_norm": 2.8362117181388893, + "learning_rate": 4.998752657326849e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8894060850143433, + "step": 2181 + }, + { + "epoch": 1.091, + "grad_norm": 2.97098437549282, + "learning_rate": 4.998738837623009e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8582282662391663, + "step": 2182 + }, + { + "epoch": 1.0915, + "grad_norm": 2.23210220404455, + "learning_rate": 4.998724941803233e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8941010236740112, + "step": 2183 + }, + { + "epoch": 1.092, + "grad_norm": 22.294541936467787, + "learning_rate": 4.998710969867942e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8730249404907227, + "step": 2184 + }, + { + "epoch": 1.0925, + "grad_norm": 3.220369862943614, + "learning_rate": 4.998696921817562e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8743159174919128, + "step": 2185 + }, + { + "epoch": 1.093, + "grad_norm": 4.013645276287082, + "learning_rate": 4.998682797652522e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8677605986595154, + "step": 2186 + }, + { + "epoch": 1.0935, + "grad_norm": 6.467273373425976, + "learning_rate": 4.9986685973732514e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8702303767204285, + "step": 2187 + }, + { + "epoch": 1.094, + "grad_norm": 2.2481593946417178, + "learning_rate": 4.998654320980183e-06, + "loss": 0.1979, + "mean_token_accuracy": 0.9243221282958984, + "step": 2188 + }, + { + "epoch": 1.0945, + "grad_norm": 4.792638605109907, + "learning_rate": 4.998639968473751e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8963399529457092, + "step": 2189 + }, + { + "epoch": 1.095, + "grad_norm": 2.969838286128672, + "learning_rate": 4.998625539854394e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8773638010025024, + "step": 2190 + }, + { + "epoch": 1.0955, + "grad_norm": 2.9135992664389434, + "learning_rate": 4.998611035122549e-06, + "loss": 0.2953, + "mean_token_accuracy": 0.906073808670044, + "step": 2191 + }, + { + "epoch": 1.096, + "grad_norm": 2.4898919092890184, + "learning_rate": 4.998596454278661e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.8875380158424377, + "step": 2192 + }, + { + "epoch": 1.0965, + "grad_norm": 1.9496265410148403, + "learning_rate": 4.9985817973231725e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8793210983276367, + "step": 2193 + }, + { + "epoch": 1.097, + "grad_norm": 3.885954687408901, + "learning_rate": 4.99856706425653e-06, + "loss": 0.2582, + "mean_token_accuracy": 0.9126213788986206, + "step": 2194 + }, + { + "epoch": 1.0975, + "grad_norm": 4.68542780719993, + "learning_rate": 4.998552255079182e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8954342007637024, + "step": 2195 + }, + { + "epoch": 1.098, + "grad_norm": 2.2240130914889815, + "learning_rate": 4.998537369791581e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8700366616249084, + "step": 2196 + }, + { + "epoch": 1.0985, + "grad_norm": 6.175125988268577, + "learning_rate": 4.998522408394179e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8817182779312134, + "step": 2197 + }, + { + "epoch": 1.099, + "grad_norm": 2.5154550441494754, + "learning_rate": 4.998507370887433e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8629769682884216, + "step": 2198 + }, + { + "epoch": 1.0995, + "grad_norm": 3.4819830201204134, + "learning_rate": 4.998492257271799e-06, + "loss": 0.6179, + "mean_token_accuracy": 0.8107842206954956, + "step": 2199 + }, + { + "epoch": 1.1, + "grad_norm": 3.191661160391769, + "learning_rate": 4.99847706754774e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8637706637382507, + "step": 2200 + }, + { + "epoch": 1.1005, + "grad_norm": 2.372654785290116, + "learning_rate": 4.998461801715717e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8931407928466797, + "step": 2201 + }, + { + "epoch": 1.101, + "grad_norm": 3.2230822461123205, + "learning_rate": 4.998446459776195e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8872962594032288, + "step": 2202 + }, + { + "epoch": 1.1015, + "grad_norm": 2.0264999997464543, + "learning_rate": 4.998431041729642e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8904651999473572, + "step": 2203 + }, + { + "epoch": 1.102, + "grad_norm": 9.50874716290215, + "learning_rate": 4.998415547576527e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8636983036994934, + "step": 2204 + }, + { + "epoch": 1.1025, + "grad_norm": 2.107940395854671, + "learning_rate": 4.998399977317323e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8750388026237488, + "step": 2205 + }, + { + "epoch": 1.103, + "grad_norm": 2.4189435211069434, + "learning_rate": 4.998384330952504e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8860632181167603, + "step": 2206 + }, + { + "epoch": 1.1035, + "grad_norm": 5.039107398505866, + "learning_rate": 4.998368608482546e-06, + "loss": 0.5219, + "mean_token_accuracy": 0.8570140600204468, + "step": 2207 + }, + { + "epoch": 1.104, + "grad_norm": 2.018166864971122, + "learning_rate": 4.998352809907928e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8826229572296143, + "step": 2208 + }, + { + "epoch": 1.1045, + "grad_norm": 1.804110031979642, + "learning_rate": 4.9983369352291325e-06, + "loss": 0.2759, + "mean_token_accuracy": 0.9046729207038879, + "step": 2209 + }, + { + "epoch": 1.105, + "grad_norm": 4.5745838946051185, + "learning_rate": 4.9983209844466404e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8635717034339905, + "step": 2210 + }, + { + "epoch": 1.1055, + "grad_norm": 2.764303122648874, + "learning_rate": 4.998304957560941e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8832967281341553, + "step": 2211 + }, + { + "epoch": 1.106, + "grad_norm": 2.140382890300598, + "learning_rate": 4.99828885457252e-06, + "loss": 0.2938, + "mean_token_accuracy": 0.9032163619995117, + "step": 2212 + }, + { + "epoch": 1.1065, + "grad_norm": 2.2188864112750855, + "learning_rate": 4.998272675481869e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.9009795188903809, + "step": 2213 + }, + { + "epoch": 1.107, + "grad_norm": 3.42035944410332, + "learning_rate": 4.99825642028948e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8909321427345276, + "step": 2214 + }, + { + "epoch": 1.1075, + "grad_norm": 4.8361167279214525, + "learning_rate": 4.9982400889958494e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8949146270751953, + "step": 2215 + }, + { + "epoch": 1.108, + "grad_norm": 3.208841873464172, + "learning_rate": 4.9982236816014735e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8836382031440735, + "step": 2216 + }, + { + "epoch": 1.1085, + "grad_norm": 4.916323996593828, + "learning_rate": 4.998207198106852e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8729669451713562, + "step": 2217 + }, + { + "epoch": 1.109, + "grad_norm": 2.2839051074574503, + "learning_rate": 4.998190638512489e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8854386210441589, + "step": 2218 + }, + { + "epoch": 1.1095, + "grad_norm": 2.2101402758957867, + "learning_rate": 4.998174002818887e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8909361362457275, + "step": 2219 + }, + { + "epoch": 1.11, + "grad_norm": 5.393471972709744, + "learning_rate": 4.998157291026553e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8804303407669067, + "step": 2220 + }, + { + "epoch": 1.1105, + "grad_norm": 2.5356274544781354, + "learning_rate": 4.998140503135997e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8643645644187927, + "step": 2221 + }, + { + "epoch": 1.111, + "grad_norm": 26.625297191720833, + "learning_rate": 4.99812363914773e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8831868767738342, + "step": 2222 + }, + { + "epoch": 1.1115, + "grad_norm": 1.8227539270067967, + "learning_rate": 4.998106699062264e-06, + "loss": 0.2372, + "mean_token_accuracy": 0.9197418689727783, + "step": 2223 + }, + { + "epoch": 1.112, + "grad_norm": 3.175183564336392, + "learning_rate": 4.998089682880117e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8854186534881592, + "step": 2224 + }, + { + "epoch": 1.1125, + "grad_norm": 6.245741131085423, + "learning_rate": 4.998072590601808e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8791251182556152, + "step": 2225 + }, + { + "epoch": 1.113, + "grad_norm": 2.3247363937664445, + "learning_rate": 4.998055422227855e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8817675113677979, + "step": 2226 + }, + { + "epoch": 1.1135, + "grad_norm": 2.833327367870677, + "learning_rate": 4.998038177758784e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8896434903144836, + "step": 2227 + }, + { + "epoch": 1.114, + "grad_norm": 2.3456177410667878, + "learning_rate": 4.9980208571951174e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8736340999603271, + "step": 2228 + }, + { + "epoch": 1.1145, + "grad_norm": 9.746992797048293, + "learning_rate": 4.998003460537385e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8726800084114075, + "step": 2229 + }, + { + "epoch": 1.115, + "grad_norm": 3.6115823723148712, + "learning_rate": 4.9979859877861155e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8902332186698914, + "step": 2230 + }, + { + "epoch": 1.1155, + "grad_norm": 1.6548630137541238, + "learning_rate": 4.997968438941842e-06, + "loss": 0.2972, + "mean_token_accuracy": 0.895673394203186, + "step": 2231 + }, + { + "epoch": 1.116, + "grad_norm": 2.157722965411572, + "learning_rate": 4.997950814005098e-06, + "loss": 0.2922, + "mean_token_accuracy": 0.9029300212860107, + "step": 2232 + }, + { + "epoch": 1.1165, + "grad_norm": 3.7575629903447187, + "learning_rate": 4.9979331129764205e-06, + "loss": 0.2737, + "mean_token_accuracy": 0.905830442905426, + "step": 2233 + }, + { + "epoch": 1.117, + "grad_norm": 1.9967459327541723, + "learning_rate": 4.997915335856351e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8937304019927979, + "step": 2234 + }, + { + "epoch": 1.1175, + "grad_norm": 1.8049637104007368, + "learning_rate": 4.997897482645428e-06, + "loss": 0.318, + "mean_token_accuracy": 0.894837498664856, + "step": 2235 + }, + { + "epoch": 1.1179999999999999, + "grad_norm": 4.270941011624149, + "learning_rate": 4.997879553344197e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8578431606292725, + "step": 2236 + }, + { + "epoch": 1.1185, + "grad_norm": 3.135589412371398, + "learning_rate": 4.997861547953203e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8884052038192749, + "step": 2237 + }, + { + "epoch": 1.119, + "grad_norm": 2.336602684474983, + "learning_rate": 4.9978434664729965e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8948182463645935, + "step": 2238 + }, + { + "epoch": 1.1195, + "grad_norm": 2.3582996174786577, + "learning_rate": 4.997825308904126e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.9032191634178162, + "step": 2239 + }, + { + "epoch": 1.12, + "grad_norm": 7.029058323409005, + "learning_rate": 4.997807075247147e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8984315395355225, + "step": 2240 + }, + { + "epoch": 1.1205, + "grad_norm": 1.681703075380487, + "learning_rate": 4.997788765502612e-06, + "loss": 0.2278, + "mean_token_accuracy": 0.9207081198692322, + "step": 2241 + }, + { + "epoch": 1.121, + "grad_norm": 2.1820430813840654, + "learning_rate": 4.9977703796710805e-06, + "loss": 0.279, + "mean_token_accuracy": 0.9042844176292419, + "step": 2242 + }, + { + "epoch": 1.1215, + "grad_norm": 4.657527802576172, + "learning_rate": 4.997751917753113e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8749185800552368, + "step": 2243 + }, + { + "epoch": 1.1219999999999999, + "grad_norm": 8.318614920647406, + "learning_rate": 4.9977333797492715e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8892483115196228, + "step": 2244 + }, + { + "epoch": 1.1225, + "grad_norm": 2.247330279418711, + "learning_rate": 4.9977147656601196e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8534917831420898, + "step": 2245 + }, + { + "epoch": 1.123, + "grad_norm": 2.2409658122229525, + "learning_rate": 4.997696075486226e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8818017244338989, + "step": 2246 + }, + { + "epoch": 1.1235, + "grad_norm": 10.59016560180525, + "learning_rate": 4.997677309228158e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.868571400642395, + "step": 2247 + }, + { + "epoch": 1.124, + "grad_norm": 2.1121963350250375, + "learning_rate": 4.997658466886489e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8740968704223633, + "step": 2248 + }, + { + "epoch": 1.1245, + "grad_norm": 2.8315989747614454, + "learning_rate": 4.997639548461792e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.8932883739471436, + "step": 2249 + }, + { + "epoch": 1.125, + "grad_norm": 2.3792140137443294, + "learning_rate": 4.997620553954645e-06, + "loss": 0.2771, + "mean_token_accuracy": 0.9072948098182678, + "step": 2250 + }, + { + "epoch": 1.1255, + "grad_norm": 3.8158782011815155, + "learning_rate": 4.997601483365624e-06, + "loss": 0.5739, + "mean_token_accuracy": 0.8358554840087891, + "step": 2251 + }, + { + "epoch": 1.126, + "grad_norm": 2.7259674597309633, + "learning_rate": 4.997582336695312e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8556948900222778, + "step": 2252 + }, + { + "epoch": 1.1265, + "grad_norm": 1.5083179810189038, + "learning_rate": 4.9975631139442915e-06, + "loss": 0.2949, + "mean_token_accuracy": 0.894503116607666, + "step": 2253 + }, + { + "epoch": 1.127, + "grad_norm": 2.164122442956959, + "learning_rate": 4.997543815113148e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8743735551834106, + "step": 2254 + }, + { + "epoch": 1.1275, + "grad_norm": 2.2253206366490343, + "learning_rate": 4.997524440202469e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.9007130861282349, + "step": 2255 + }, + { + "epoch": 1.1280000000000001, + "grad_norm": 2.59643458272654, + "learning_rate": 4.997504989212846e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8754492998123169, + "step": 2256 + }, + { + "epoch": 1.1285, + "grad_norm": 3.605780003985737, + "learning_rate": 4.99748546214487e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8803843855857849, + "step": 2257 + }, + { + "epoch": 1.129, + "grad_norm": 2.5032303303811956, + "learning_rate": 4.997465858999136e-06, + "loss": 0.2528, + "mean_token_accuracy": 0.9101167321205139, + "step": 2258 + }, + { + "epoch": 1.1295, + "grad_norm": 2.2571310449531508, + "learning_rate": 4.997446179776242e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.88210529088974, + "step": 2259 + }, + { + "epoch": 1.13, + "grad_norm": 1.8654869251886075, + "learning_rate": 4.997426424476787e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8829772472381592, + "step": 2260 + }, + { + "epoch": 1.1305, + "grad_norm": 2.8786625515755357, + "learning_rate": 4.997406593101373e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8841602206230164, + "step": 2261 + }, + { + "epoch": 1.131, + "grad_norm": 5.354424324248324, + "learning_rate": 4.997386685650604e-06, + "loss": 0.2646, + "mean_token_accuracy": 0.9027538895606995, + "step": 2262 + }, + { + "epoch": 1.1315, + "grad_norm": 3.196282589939173, + "learning_rate": 4.997366702125086e-06, + "loss": 0.414, + "mean_token_accuracy": 0.874131441116333, + "step": 2263 + }, + { + "epoch": 1.1320000000000001, + "grad_norm": 2.54416144260457, + "learning_rate": 4.997346642525429e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8924129605293274, + "step": 2264 + }, + { + "epoch": 1.1325, + "grad_norm": 2.7232172930744407, + "learning_rate": 4.997326506852242e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8993146419525146, + "step": 2265 + }, + { + "epoch": 1.133, + "grad_norm": 2.2629406842920696, + "learning_rate": 4.99730629510614e-06, + "loss": 0.2716, + "mean_token_accuracy": 0.9120901823043823, + "step": 2266 + }, + { + "epoch": 1.1335, + "grad_norm": 1.686270604134985, + "learning_rate": 4.997286007287738e-06, + "loss": 0.2601, + "mean_token_accuracy": 0.9124937057495117, + "step": 2267 + }, + { + "epoch": 1.134, + "grad_norm": 4.21871323532974, + "learning_rate": 4.9972656433976544e-06, + "loss": 0.1886, + "mean_token_accuracy": 0.9290335178375244, + "step": 2268 + }, + { + "epoch": 1.1345, + "grad_norm": 2.067949994221342, + "learning_rate": 4.997245203436509e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8589661717414856, + "step": 2269 + }, + { + "epoch": 1.135, + "grad_norm": 3.182421396958, + "learning_rate": 4.9972246874049254e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8775391578674316, + "step": 2270 + }, + { + "epoch": 1.1355, + "grad_norm": 2.099060847395066, + "learning_rate": 4.997204095303527e-06, + "loss": 0.2284, + "mean_token_accuracy": 0.9210476279258728, + "step": 2271 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 2.5322888610428884, + "learning_rate": 4.997183427132943e-06, + "loss": 0.2991, + "mean_token_accuracy": 0.9026780128479004, + "step": 2272 + }, + { + "epoch": 1.1365, + "grad_norm": 1.8616680409557385, + "learning_rate": 4.997162682893801e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8875075578689575, + "step": 2273 + }, + { + "epoch": 1.137, + "grad_norm": 4.842040458856371, + "learning_rate": 4.997141862586734e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8877604603767395, + "step": 2274 + }, + { + "epoch": 1.1375, + "grad_norm": 2.4029492015887213, + "learning_rate": 4.9971209662123774e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8822755813598633, + "step": 2275 + }, + { + "epoch": 1.138, + "grad_norm": 2.655777023052889, + "learning_rate": 4.997099993771365e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8820422291755676, + "step": 2276 + }, + { + "epoch": 1.1385, + "grad_norm": 2.767966712829804, + "learning_rate": 4.997078945264338e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.863628625869751, + "step": 2277 + }, + { + "epoch": 1.139, + "grad_norm": 2.3138485697382656, + "learning_rate": 4.997057820691936e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8486384749412537, + "step": 2278 + }, + { + "epoch": 1.1395, + "grad_norm": 3.689334824908712, + "learning_rate": 4.997036620054803e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8502289056777954, + "step": 2279 + }, + { + "epoch": 1.1400000000000001, + "grad_norm": 2.8937254544785422, + "learning_rate": 4.9970153433535855e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8854308724403381, + "step": 2280 + }, + { + "epoch": 1.1405, + "grad_norm": 8.472694290913664, + "learning_rate": 4.996993990588931e-06, + "loss": 0.5839, + "mean_token_accuracy": 0.8096936345100403, + "step": 2281 + }, + { + "epoch": 1.141, + "grad_norm": 2.2703219382241038, + "learning_rate": 4.99697256176149e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8563656210899353, + "step": 2282 + }, + { + "epoch": 1.1415, + "grad_norm": 2.4576466230628915, + "learning_rate": 4.996951056871915e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.854651153087616, + "step": 2283 + }, + { + "epoch": 1.142, + "grad_norm": 2.480441159821499, + "learning_rate": 4.996929475920862e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8793332576751709, + "step": 2284 + }, + { + "epoch": 1.1425, + "grad_norm": 3.341736155118729, + "learning_rate": 4.996907818908987e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8731626272201538, + "step": 2285 + }, + { + "epoch": 1.143, + "grad_norm": 3.2017045408207965, + "learning_rate": 4.9968860858369505e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8882625699043274, + "step": 2286 + }, + { + "epoch": 1.1435, + "grad_norm": 1.726391626641346, + "learning_rate": 4.996864276705416e-06, + "loss": 0.2184, + "mean_token_accuracy": 0.9226937294006348, + "step": 2287 + }, + { + "epoch": 1.144, + "grad_norm": 1.9620707376411732, + "learning_rate": 4.996842391515045e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8671965599060059, + "step": 2288 + }, + { + "epoch": 1.1445, + "grad_norm": 2.4840304546784506, + "learning_rate": 4.9968204302665045e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8678650259971619, + "step": 2289 + }, + { + "epoch": 1.145, + "grad_norm": 1.8085843582216459, + "learning_rate": 4.996798392960466e-06, + "loss": 0.2504, + "mean_token_accuracy": 0.9148076772689819, + "step": 2290 + }, + { + "epoch": 1.1455, + "grad_norm": 2.2542756322079027, + "learning_rate": 4.996776279597599e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8905472755432129, + "step": 2291 + }, + { + "epoch": 1.146, + "grad_norm": 2.203602863287916, + "learning_rate": 4.996754090178577e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8942429423332214, + "step": 2292 + }, + { + "epoch": 1.1465, + "grad_norm": 2.22930503670881, + "learning_rate": 4.996731824704076e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8553886413574219, + "step": 2293 + }, + { + "epoch": 1.147, + "grad_norm": 1.812031583233776, + "learning_rate": 4.996709483174776e-06, + "loss": 0.2728, + "mean_token_accuracy": 0.9081369042396545, + "step": 2294 + }, + { + "epoch": 1.1475, + "grad_norm": 2.4489231250138075, + "learning_rate": 4.996687065591355e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8930314183235168, + "step": 2295 + }, + { + "epoch": 1.148, + "grad_norm": 6.250522059586083, + "learning_rate": 4.996664571954497e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.8975359201431274, + "step": 2296 + }, + { + "epoch": 1.1485, + "grad_norm": 2.2572594317388, + "learning_rate": 4.996642002264888e-06, + "loss": 0.2834, + "mean_token_accuracy": 0.9088261723518372, + "step": 2297 + }, + { + "epoch": 1.149, + "grad_norm": 5.492466417564072, + "learning_rate": 4.996619356523214e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8609698414802551, + "step": 2298 + }, + { + "epoch": 1.1495, + "grad_norm": 2.5078191747431022, + "learning_rate": 4.996596634730165e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8743693232536316, + "step": 2299 + }, + { + "epoch": 1.15, + "grad_norm": 2.904837632053113, + "learning_rate": 4.9965738368864345e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8613126277923584, + "step": 2300 + }, + { + "epoch": 1.1505, + "grad_norm": 4.150087582241266, + "learning_rate": 4.996550962992717e-06, + "loss": 0.3195, + "mean_token_accuracy": 0.8937246799468994, + "step": 2301 + }, + { + "epoch": 1.151, + "grad_norm": 2.6087897073302653, + "learning_rate": 4.9965280130497075e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.885826051235199, + "step": 2302 + }, + { + "epoch": 1.1515, + "grad_norm": 5.34994472352847, + "learning_rate": 4.9965049870581055e-06, + "loss": 0.3034, + "mean_token_accuracy": 0.9074598550796509, + "step": 2303 + }, + { + "epoch": 1.152, + "grad_norm": 3.1578674224505168, + "learning_rate": 4.996481885018613e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8633880019187927, + "step": 2304 + }, + { + "epoch": 1.1525, + "grad_norm": 3.6039761218725612, + "learning_rate": 4.996458706931935e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8597002029418945, + "step": 2305 + }, + { + "epoch": 1.153, + "grad_norm": 2.12170388802144, + "learning_rate": 4.9964354527987745e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.900661289691925, + "step": 2306 + }, + { + "epoch": 1.1535, + "grad_norm": 1.8892937318734455, + "learning_rate": 4.9964121226198425e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8928571343421936, + "step": 2307 + }, + { + "epoch": 1.154, + "grad_norm": 5.112313702554253, + "learning_rate": 4.9963887163958484e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8881969451904297, + "step": 2308 + }, + { + "epoch": 1.1545, + "grad_norm": 2.8258584825057897, + "learning_rate": 4.996365234127506e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8800308108329773, + "step": 2309 + }, + { + "epoch": 1.155, + "grad_norm": 2.6214735491736985, + "learning_rate": 4.99634167581553e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8575021028518677, + "step": 2310 + }, + { + "epoch": 1.1555, + "grad_norm": 2.252217680119378, + "learning_rate": 4.996318041460637e-06, + "loss": 0.2505, + "mean_token_accuracy": 0.9054411053657532, + "step": 2311 + }, + { + "epoch": 1.156, + "grad_norm": 2.1615237985304088, + "learning_rate": 4.99629433106355e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8848291039466858, + "step": 2312 + }, + { + "epoch": 1.1565, + "grad_norm": 2.47241450405352, + "learning_rate": 4.996270544624988e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.887805700302124, + "step": 2313 + }, + { + "epoch": 1.157, + "grad_norm": 2.4781046302839025, + "learning_rate": 4.996246682145678e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8858622908592224, + "step": 2314 + }, + { + "epoch": 1.1575, + "grad_norm": 3.8488058209677085, + "learning_rate": 4.996222743626346e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8868480324745178, + "step": 2315 + }, + { + "epoch": 1.158, + "grad_norm": 2.745390825684843, + "learning_rate": 4.996198729067719e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8857300877571106, + "step": 2316 + }, + { + "epoch": 1.1585, + "grad_norm": 2.6698689510621154, + "learning_rate": 4.996174638470532e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8883653283119202, + "step": 2317 + }, + { + "epoch": 1.159, + "grad_norm": 2.427832561476039, + "learning_rate": 4.996150471835518e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8817868232727051, + "step": 2318 + }, + { + "epoch": 1.1595, + "grad_norm": 2.592482703755828, + "learning_rate": 4.996126229163412e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8610125184059143, + "step": 2319 + }, + { + "epoch": 1.16, + "grad_norm": 1.7231354957953293, + "learning_rate": 4.996101910454953e-06, + "loss": 0.3013, + "mean_token_accuracy": 0.8964994549751282, + "step": 2320 + }, + { + "epoch": 1.1605, + "grad_norm": 2.5763758993473336, + "learning_rate": 4.996077515710882e-06, + "loss": 0.3019, + "mean_token_accuracy": 0.8982179760932922, + "step": 2321 + }, + { + "epoch": 1.161, + "grad_norm": 4.3667616728309, + "learning_rate": 4.996053044931942e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8847576975822449, + "step": 2322 + }, + { + "epoch": 1.1615, + "grad_norm": 2.3522326852462636, + "learning_rate": 4.996028498118878e-06, + "loss": 0.3046, + "mean_token_accuracy": 0.9033926725387573, + "step": 2323 + }, + { + "epoch": 1.162, + "grad_norm": 2.470179800504652, + "learning_rate": 4.996003875272438e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.892608642578125, + "step": 2324 + }, + { + "epoch": 1.1625, + "grad_norm": 1.8392716581713886, + "learning_rate": 4.995979176393372e-06, + "loss": 0.2642, + "mean_token_accuracy": 0.913847804069519, + "step": 2325 + }, + { + "epoch": 1.163, + "grad_norm": 2.9787822164486086, + "learning_rate": 4.995954401482434e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8860327005386353, + "step": 2326 + }, + { + "epoch": 1.1635, + "grad_norm": 1.8104444703323226, + "learning_rate": 4.995929550540376e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8928021788597107, + "step": 2327 + }, + { + "epoch": 1.164, + "grad_norm": 5.245861018638613, + "learning_rate": 4.995904623567956e-06, + "loss": 0.2311, + "mean_token_accuracy": 0.9151367545127869, + "step": 2328 + }, + { + "epoch": 1.1645, + "grad_norm": 2.192429749292512, + "learning_rate": 4.995879620565934e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8939822912216187, + "step": 2329 + }, + { + "epoch": 1.165, + "grad_norm": 2.1527988729595045, + "learning_rate": 4.995854541535072e-06, + "loss": 0.352, + "mean_token_accuracy": 0.883378803730011, + "step": 2330 + }, + { + "epoch": 1.1655, + "grad_norm": 11.396455949623789, + "learning_rate": 4.995829386476132e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.9092902541160583, + "step": 2331 + }, + { + "epoch": 1.166, + "grad_norm": 3.026650938777154, + "learning_rate": 4.995804155389881e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8754917979240417, + "step": 2332 + }, + { + "epoch": 1.1665, + "grad_norm": 3.339026126575213, + "learning_rate": 4.995778848277088e-06, + "loss": 0.2713, + "mean_token_accuracy": 0.9086965322494507, + "step": 2333 + }, + { + "epoch": 1.167, + "grad_norm": 2.455796459732302, + "learning_rate": 4.995753465138525e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8754856586456299, + "step": 2334 + }, + { + "epoch": 1.1675, + "grad_norm": 1.9154716571617998, + "learning_rate": 4.995728005974964e-06, + "loss": 0.316, + "mean_token_accuracy": 0.899583101272583, + "step": 2335 + }, + { + "epoch": 1.168, + "grad_norm": 2.5292451265957, + "learning_rate": 4.99570247078718e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.9018107056617737, + "step": 2336 + }, + { + "epoch": 1.1685, + "grad_norm": 2.065920377432601, + "learning_rate": 4.995676859575952e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8717382550239563, + "step": 2337 + }, + { + "epoch": 1.169, + "grad_norm": 12.22610440297445, + "learning_rate": 4.99565117234206e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8650065660476685, + "step": 2338 + }, + { + "epoch": 1.1695, + "grad_norm": 2.3300122857574106, + "learning_rate": 4.995625409086286e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8840747475624084, + "step": 2339 + }, + { + "epoch": 1.17, + "grad_norm": 2.3707950381995953, + "learning_rate": 4.995599569809414e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8796723484992981, + "step": 2340 + }, + { + "epoch": 1.1705, + "grad_norm": 2.5651607841379422, + "learning_rate": 4.995573654512232e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8618239164352417, + "step": 2341 + }, + { + "epoch": 1.171, + "grad_norm": 3.4781800329012884, + "learning_rate": 4.9955476631955304e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8498068451881409, + "step": 2342 + }, + { + "epoch": 1.1715, + "grad_norm": 2.9047388692097114, + "learning_rate": 4.995521595860099e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8887731432914734, + "step": 2343 + }, + { + "epoch": 1.172, + "grad_norm": 3.52838846974527, + "learning_rate": 4.995495452506733e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8817689418792725, + "step": 2344 + }, + { + "epoch": 1.1724999999999999, + "grad_norm": 4.78493385201998, + "learning_rate": 4.9954692331362295e-06, + "loss": 0.313, + "mean_token_accuracy": 0.9061693549156189, + "step": 2345 + }, + { + "epoch": 1.173, + "grad_norm": 2.3106167221632194, + "learning_rate": 4.995442937749385e-06, + "loss": 0.2854, + "mean_token_accuracy": 0.907392680644989, + "step": 2346 + }, + { + "epoch": 1.1735, + "grad_norm": 2.0573703513516177, + "learning_rate": 4.995416566347003e-06, + "loss": 0.2869, + "mean_token_accuracy": 0.9062931537628174, + "step": 2347 + }, + { + "epoch": 1.174, + "grad_norm": 2.0894985864374602, + "learning_rate": 4.995390118929885e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8673059344291687, + "step": 2348 + }, + { + "epoch": 1.1745, + "grad_norm": 9.994953120100917, + "learning_rate": 4.995363595498837e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.877082884311676, + "step": 2349 + }, + { + "epoch": 1.175, + "grad_norm": 4.207284769412036, + "learning_rate": 4.995336996054668e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8724221587181091, + "step": 2350 + }, + { + "epoch": 1.1755, + "grad_norm": 1.8082518145132884, + "learning_rate": 4.995310320598187e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8974918127059937, + "step": 2351 + }, + { + "epoch": 1.176, + "grad_norm": 3.2747972924519826, + "learning_rate": 4.995283569130207e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.891250491142273, + "step": 2352 + }, + { + "epoch": 1.1764999999999999, + "grad_norm": 3.039688464985263, + "learning_rate": 4.995256741651543e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8990347385406494, + "step": 2353 + }, + { + "epoch": 1.177, + "grad_norm": 2.7238965762325256, + "learning_rate": 4.995229838163012e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8852487802505493, + "step": 2354 + }, + { + "epoch": 1.1775, + "grad_norm": 2.1691183198562247, + "learning_rate": 4.995202858665434e-06, + "loss": 0.2472, + "mean_token_accuracy": 0.9153943061828613, + "step": 2355 + }, + { + "epoch": 1.178, + "grad_norm": 2.2391312953420925, + "learning_rate": 4.995175803159631e-06, + "loss": 0.2983, + "mean_token_accuracy": 0.8883216381072998, + "step": 2356 + }, + { + "epoch": 1.1785, + "grad_norm": 2.5375137298746435, + "learning_rate": 4.995148671646426e-06, + "loss": 0.564, + "mean_token_accuracy": 0.8458893299102783, + "step": 2357 + }, + { + "epoch": 1.179, + "grad_norm": 2.705514913346577, + "learning_rate": 4.995121464126646e-06, + "loss": 0.2484, + "mean_token_accuracy": 0.9131187200546265, + "step": 2358 + }, + { + "epoch": 1.1795, + "grad_norm": 2.7500605423811795, + "learning_rate": 4.99509418060112e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8735939860343933, + "step": 2359 + }, + { + "epoch": 1.18, + "grad_norm": 2.257808242269036, + "learning_rate": 4.9950668210706795e-06, + "loss": 0.2952, + "mean_token_accuracy": 0.8996545672416687, + "step": 2360 + }, + { + "epoch": 1.1804999999999999, + "grad_norm": 2.0212580531980273, + "learning_rate": 4.995039385536157e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8943598866462708, + "step": 2361 + }, + { + "epoch": 1.181, + "grad_norm": 2.83701253397353, + "learning_rate": 4.995011873998389e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.863318681716919, + "step": 2362 + }, + { + "epoch": 1.1815, + "grad_norm": 2.027580976320052, + "learning_rate": 4.994984286458213e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8782828450202942, + "step": 2363 + }, + { + "epoch": 1.182, + "grad_norm": 2.3470897531724386, + "learning_rate": 4.99495662291647e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8851031064987183, + "step": 2364 + }, + { + "epoch": 1.1825, + "grad_norm": 12.140550824801815, + "learning_rate": 4.9949288833740016e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.879422128200531, + "step": 2365 + }, + { + "epoch": 1.183, + "grad_norm": 5.845311261456864, + "learning_rate": 4.994901067831654e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8942089080810547, + "step": 2366 + }, + { + "epoch": 1.1835, + "grad_norm": 1.590367215088435, + "learning_rate": 4.994873176290274e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8925759196281433, + "step": 2367 + }, + { + "epoch": 1.184, + "grad_norm": 2.7992851131808147, + "learning_rate": 4.9948452087507114e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.855681300163269, + "step": 2368 + }, + { + "epoch": 1.1844999999999999, + "grad_norm": 2.6556827094641573, + "learning_rate": 4.994817165213818e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8733721971511841, + "step": 2369 + }, + { + "epoch": 1.185, + "grad_norm": 2.351326145997134, + "learning_rate": 4.994789045680448e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8797785639762878, + "step": 2370 + }, + { + "epoch": 1.1855, + "grad_norm": 5.646018819908241, + "learning_rate": 4.994760850151458e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.890332818031311, + "step": 2371 + }, + { + "epoch": 1.186, + "grad_norm": 2.045880410607237, + "learning_rate": 4.9947325786277065e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8661472201347351, + "step": 2372 + }, + { + "epoch": 1.1865, + "grad_norm": 3.1682801246225307, + "learning_rate": 4.9947042311100546e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8550452589988708, + "step": 2373 + }, + { + "epoch": 1.187, + "grad_norm": 1.6377066729291683, + "learning_rate": 4.994675807599367e-06, + "loss": 0.2622, + "mean_token_accuracy": 0.9024995565414429, + "step": 2374 + }, + { + "epoch": 1.1875, + "grad_norm": 2.59765267084693, + "learning_rate": 4.994647308096509e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8813852667808533, + "step": 2375 + }, + { + "epoch": 1.188, + "grad_norm": 2.5368363237286133, + "learning_rate": 4.994618732602349e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8783314228057861, + "step": 2376 + }, + { + "epoch": 1.1885, + "grad_norm": 2.3524799366557185, + "learning_rate": 4.994590081117756e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8560367226600647, + "step": 2377 + }, + { + "epoch": 1.189, + "grad_norm": 8.999717481127725, + "learning_rate": 4.994561353643605e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8854711055755615, + "step": 2378 + }, + { + "epoch": 1.1895, + "grad_norm": 3.8008525860031805, + "learning_rate": 4.994532550180769e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8456218242645264, + "step": 2379 + }, + { + "epoch": 1.19, + "grad_norm": 2.1083728919236844, + "learning_rate": 4.994503670730126e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.881328821182251, + "step": 2380 + }, + { + "epoch": 1.1905000000000001, + "grad_norm": 2.323520183141667, + "learning_rate": 4.994474715292555e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8741463422775269, + "step": 2381 + }, + { + "epoch": 1.191, + "grad_norm": 1.8562781178296148, + "learning_rate": 4.994445683868941e-06, + "loss": 0.3136, + "mean_token_accuracy": 0.8912550806999207, + "step": 2382 + }, + { + "epoch": 1.1915, + "grad_norm": 1.9410109448317616, + "learning_rate": 4.994416576460166e-06, + "loss": 0.2577, + "mean_token_accuracy": 0.9143596887588501, + "step": 2383 + }, + { + "epoch": 1.192, + "grad_norm": 4.668964083238941, + "learning_rate": 4.9943873930671175e-06, + "loss": 0.362, + "mean_token_accuracy": 0.890842854976654, + "step": 2384 + }, + { + "epoch": 1.1925, + "grad_norm": 2.489001357851651, + "learning_rate": 4.994358133690683e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8819571733474731, + "step": 2385 + }, + { + "epoch": 1.193, + "grad_norm": 2.149856770161974, + "learning_rate": 4.994328798331754e-06, + "loss": 0.2707, + "mean_token_accuracy": 0.9089885354042053, + "step": 2386 + }, + { + "epoch": 1.1935, + "grad_norm": 2.457979817666016, + "learning_rate": 4.9942993869912275e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8937766551971436, + "step": 2387 + }, + { + "epoch": 1.194, + "grad_norm": 4.837771576755538, + "learning_rate": 4.9942698996699945e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.875, + "step": 2388 + }, + { + "epoch": 1.1945000000000001, + "grad_norm": 10.156920847585033, + "learning_rate": 4.9942403363689576e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8790953159332275, + "step": 2389 + }, + { + "epoch": 1.195, + "grad_norm": 3.6696693572945573, + "learning_rate": 4.9942106970890136e-06, + "loss": 0.2506, + "mean_token_accuracy": 0.9116050004959106, + "step": 2390 + }, + { + "epoch": 1.1955, + "grad_norm": 5.404511239502892, + "learning_rate": 4.994180981831068e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8509305119514465, + "step": 2391 + }, + { + "epoch": 1.196, + "grad_norm": 3.166659441311036, + "learning_rate": 4.994151190596025e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8637913465499878, + "step": 2392 + }, + { + "epoch": 1.1965, + "grad_norm": 1.975141941932114, + "learning_rate": 4.9941213233847915e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.9065906405448914, + "step": 2393 + }, + { + "epoch": 1.197, + "grad_norm": 2.0992236890446283, + "learning_rate": 4.994091380198278e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8907548189163208, + "step": 2394 + }, + { + "epoch": 1.1975, + "grad_norm": 4.17551503433939, + "learning_rate": 4.9940613610373974e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8621209263801575, + "step": 2395 + }, + { + "epoch": 1.198, + "grad_norm": 2.3531243521219203, + "learning_rate": 4.9940312659030635e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.8500474691390991, + "step": 2396 + }, + { + "epoch": 1.1985000000000001, + "grad_norm": 2.9659940423971514, + "learning_rate": 4.994001094796192e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8749781250953674, + "step": 2397 + }, + { + "epoch": 1.199, + "grad_norm": 2.463992375549641, + "learning_rate": 4.993970847717704e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8793055415153503, + "step": 2398 + }, + { + "epoch": 1.1995, + "grad_norm": 2.2398121993730773, + "learning_rate": 4.993940524668519e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.866694450378418, + "step": 2399 + }, + { + "epoch": 1.2, + "grad_norm": 2.9513958381812757, + "learning_rate": 4.993910125649561e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.8529054522514343, + "step": 2400 + }, + { + "epoch": 1.2005, + "grad_norm": 2.0756989349823955, + "learning_rate": 4.9938796506617574e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.8992383480072021, + "step": 2401 + }, + { + "epoch": 1.201, + "grad_norm": 1.6268591707550792, + "learning_rate": 4.993849099706035e-06, + "loss": 0.3041, + "mean_token_accuracy": 0.8950077891349792, + "step": 2402 + }, + { + "epoch": 1.2015, + "grad_norm": 2.545672422041884, + "learning_rate": 4.993818472783325e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8984171748161316, + "step": 2403 + }, + { + "epoch": 1.202, + "grad_norm": 1.9975707695596905, + "learning_rate": 4.9937877698945595e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8840905427932739, + "step": 2404 + }, + { + "epoch": 1.2025000000000001, + "grad_norm": 2.198948097919507, + "learning_rate": 4.993756991040676e-06, + "loss": 0.2834, + "mean_token_accuracy": 0.9026600122451782, + "step": 2405 + }, + { + "epoch": 1.203, + "grad_norm": 2.787125937376694, + "learning_rate": 4.99372613622261e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.892214298248291, + "step": 2406 + }, + { + "epoch": 1.2035, + "grad_norm": 1.9992861460388018, + "learning_rate": 4.993695205441302e-06, + "loss": 0.2464, + "mean_token_accuracy": 0.9165651798248291, + "step": 2407 + }, + { + "epoch": 1.204, + "grad_norm": 1.6423082662403392, + "learning_rate": 4.993664198697694e-06, + "loss": 0.2715, + "mean_token_accuracy": 0.9095701575279236, + "step": 2408 + }, + { + "epoch": 1.2045, + "grad_norm": 6.642835838737819, + "learning_rate": 4.993633115992732e-06, + "loss": 0.5139, + "mean_token_accuracy": 0.8438277244567871, + "step": 2409 + }, + { + "epoch": 1.205, + "grad_norm": 1.9837635987384659, + "learning_rate": 4.993601957327361e-06, + "loss": 0.2357, + "mean_token_accuracy": 0.9230769276618958, + "step": 2410 + }, + { + "epoch": 1.2055, + "grad_norm": 3.1431882452643034, + "learning_rate": 4.99357072270253e-06, + "loss": 0.2639, + "mean_token_accuracy": 0.9114000201225281, + "step": 2411 + }, + { + "epoch": 1.206, + "grad_norm": 2.480612075488898, + "learning_rate": 4.993539412119192e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8936089277267456, + "step": 2412 + }, + { + "epoch": 1.2065, + "grad_norm": 2.848785046695813, + "learning_rate": 4.993508025578299e-06, + "loss": 0.2899, + "mean_token_accuracy": 0.9017258882522583, + "step": 2413 + }, + { + "epoch": 1.207, + "grad_norm": 1.8752926199232036, + "learning_rate": 4.99347656308081e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8823193907737732, + "step": 2414 + }, + { + "epoch": 1.2075, + "grad_norm": 2.2507505407131676, + "learning_rate": 4.99344502462768e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.887977123260498, + "step": 2415 + }, + { + "epoch": 1.208, + "grad_norm": 2.571357113375206, + "learning_rate": 4.993413410219872e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8455692529678345, + "step": 2416 + }, + { + "epoch": 1.2085, + "grad_norm": 10.06240223782298, + "learning_rate": 4.993381719858348e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8901933431625366, + "step": 2417 + }, + { + "epoch": 1.209, + "grad_norm": 2.721001530151379, + "learning_rate": 4.993349953544073e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8848883509635925, + "step": 2418 + }, + { + "epoch": 1.2095, + "grad_norm": 2.6064059482060538, + "learning_rate": 4.993318111278016e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8803327679634094, + "step": 2419 + }, + { + "epoch": 1.21, + "grad_norm": 2.3306411603783665, + "learning_rate": 4.993286193061145e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8582813739776611, + "step": 2420 + }, + { + "epoch": 1.2105, + "grad_norm": 2.2710646916862016, + "learning_rate": 4.993254198894435e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8806188702583313, + "step": 2421 + }, + { + "epoch": 1.211, + "grad_norm": 2.3808826463536383, + "learning_rate": 4.993222128778858e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8658496141433716, + "step": 2422 + }, + { + "epoch": 1.2115, + "grad_norm": 2.636626911818767, + "learning_rate": 4.993189982715393e-06, + "loss": 0.2958, + "mean_token_accuracy": 0.9082343578338623, + "step": 2423 + }, + { + "epoch": 1.212, + "grad_norm": 2.6190222712305893, + "learning_rate": 4.993157760705018e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8698498010635376, + "step": 2424 + }, + { + "epoch": 1.2125, + "grad_norm": 2.8724070819589573, + "learning_rate": 4.993125462748714e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8875888586044312, + "step": 2425 + }, + { + "epoch": 1.213, + "grad_norm": 3.3452977560487627, + "learning_rate": 4.9930930888474675e-06, + "loss": 0.2379, + "mean_token_accuracy": 0.9209237098693848, + "step": 2426 + }, + { + "epoch": 1.2135, + "grad_norm": 2.7743191927731314, + "learning_rate": 4.9930606390022605e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8587158918380737, + "step": 2427 + }, + { + "epoch": 1.214, + "grad_norm": 2.5933745386406724, + "learning_rate": 4.993028113214085e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8697232604026794, + "step": 2428 + }, + { + "epoch": 1.2145, + "grad_norm": 1.8473291407676071, + "learning_rate": 4.99299551148393e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8884449601173401, + "step": 2429 + }, + { + "epoch": 1.215, + "grad_norm": 4.230146562670275, + "learning_rate": 4.9929628338127904e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8715822696685791, + "step": 2430 + }, + { + "epoch": 1.2155, + "grad_norm": 1.4823627898639622, + "learning_rate": 4.992930080201659e-06, + "loss": 0.301, + "mean_token_accuracy": 0.891491711139679, + "step": 2431 + }, + { + "epoch": 1.216, + "grad_norm": 4.314225097646127, + "learning_rate": 4.992897250651535e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8823066353797913, + "step": 2432 + }, + { + "epoch": 1.2165, + "grad_norm": 1.9774565083519873, + "learning_rate": 4.992864345163419e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8894515037536621, + "step": 2433 + }, + { + "epoch": 1.217, + "grad_norm": 2.2259184041667326, + "learning_rate": 4.992831363738312e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8874613642692566, + "step": 2434 + }, + { + "epoch": 1.2175, + "grad_norm": 2.9170177212609882, + "learning_rate": 4.9927983063772205e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8511159420013428, + "step": 2435 + }, + { + "epoch": 1.218, + "grad_norm": 2.275024909848434, + "learning_rate": 4.99276517308115e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8715746998786926, + "step": 2436 + }, + { + "epoch": 1.2185, + "grad_norm": 4.185626077225519, + "learning_rate": 4.992731963851109e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8719483613967896, + "step": 2437 + }, + { + "epoch": 1.219, + "grad_norm": 2.1629206753215917, + "learning_rate": 4.992698678688112e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8597402572631836, + "step": 2438 + }, + { + "epoch": 1.2195, + "grad_norm": 5.839876098998603, + "learning_rate": 4.99266531759317e-06, + "loss": 0.5531, + "mean_token_accuracy": 0.8445148468017578, + "step": 2439 + }, + { + "epoch": 1.22, + "grad_norm": 2.555411594560899, + "learning_rate": 4.992631880567301e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8786696195602417, + "step": 2440 + }, + { + "epoch": 1.2205, + "grad_norm": 1.932290909135276, + "learning_rate": 4.992598367611523e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8831676840782166, + "step": 2441 + }, + { + "epoch": 1.221, + "grad_norm": 3.7672858761149564, + "learning_rate": 4.992564778726857e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8610662221908569, + "step": 2442 + }, + { + "epoch": 1.2215, + "grad_norm": 2.26486056569772, + "learning_rate": 4.992531113914325e-06, + "loss": 0.2821, + "mean_token_accuracy": 0.9039239287376404, + "step": 2443 + }, + { + "epoch": 1.222, + "grad_norm": 3.0531503315035278, + "learning_rate": 4.992497373174955e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8765807747840881, + "step": 2444 + }, + { + "epoch": 1.2225, + "grad_norm": 2.436696258134347, + "learning_rate": 4.992463556509772e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8621332049369812, + "step": 2445 + }, + { + "epoch": 1.223, + "grad_norm": 2.757740663226853, + "learning_rate": 4.992429663919809e-06, + "loss": 0.2808, + "mean_token_accuracy": 0.8903675675392151, + "step": 2446 + }, + { + "epoch": 1.2235, + "grad_norm": 1.9715914415477223, + "learning_rate": 4.9923956954060955e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8808040618896484, + "step": 2447 + }, + { + "epoch": 1.224, + "grad_norm": 3.1114957145791644, + "learning_rate": 4.992361650969668e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8481627702713013, + "step": 2448 + }, + { + "epoch": 1.2245, + "grad_norm": 2.9975387084752723, + "learning_rate": 4.992327530611563e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.8369615077972412, + "step": 2449 + }, + { + "epoch": 1.225, + "grad_norm": 2.6035813838085238, + "learning_rate": 4.992293334332821e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8756887912750244, + "step": 2450 + }, + { + "epoch": 1.2255, + "grad_norm": 2.983756813154375, + "learning_rate": 4.992259062134481e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8776707649230957, + "step": 2451 + }, + { + "epoch": 1.226, + "grad_norm": 2.0233523327276832, + "learning_rate": 4.99222471401759e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8694710731506348, + "step": 2452 + }, + { + "epoch": 1.2265, + "grad_norm": 2.207804220128388, + "learning_rate": 4.992190289983193e-06, + "loss": 0.2832, + "mean_token_accuracy": 0.9096080660820007, + "step": 2453 + }, + { + "epoch": 1.227, + "grad_norm": 3.3819331571478832, + "learning_rate": 4.992155790032338e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.881780743598938, + "step": 2454 + }, + { + "epoch": 1.2275, + "grad_norm": 2.377906428057397, + "learning_rate": 4.992121214166077e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8737270832061768, + "step": 2455 + }, + { + "epoch": 1.228, + "grad_norm": 2.1334088289118793, + "learning_rate": 4.992086562385462e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8706119656562805, + "step": 2456 + }, + { + "epoch": 1.2285, + "grad_norm": 2.364811217940512, + "learning_rate": 4.99205183469155e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.880180835723877, + "step": 2457 + }, + { + "epoch": 1.229, + "grad_norm": 2.322576611113373, + "learning_rate": 4.992017031085398e-06, + "loss": 0.2615, + "mean_token_accuracy": 0.9136085510253906, + "step": 2458 + }, + { + "epoch": 1.2295, + "grad_norm": 3.135874799800739, + "learning_rate": 4.991982151568067e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8902735114097595, + "step": 2459 + }, + { + "epoch": 1.23, + "grad_norm": 2.0250594607749486, + "learning_rate": 4.991947196140619e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8585970997810364, + "step": 2460 + }, + { + "epoch": 1.2305, + "grad_norm": 8.591724348364357, + "learning_rate": 4.991912164804117e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8531687259674072, + "step": 2461 + }, + { + "epoch": 1.231, + "grad_norm": 4.665645723760064, + "learning_rate": 4.991877057559631e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8617182374000549, + "step": 2462 + }, + { + "epoch": 1.2315, + "grad_norm": 2.449389948932664, + "learning_rate": 4.9918418744082295e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8539259433746338, + "step": 2463 + }, + { + "epoch": 1.232, + "grad_norm": 2.2960813471076995, + "learning_rate": 4.9918066153509835e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8762863278388977, + "step": 2464 + }, + { + "epoch": 1.2325, + "grad_norm": 3.037416042629677, + "learning_rate": 4.991771280388967e-06, + "loss": 0.2979, + "mean_token_accuracy": 0.9032366275787354, + "step": 2465 + }, + { + "epoch": 1.233, + "grad_norm": 2.592250935849321, + "learning_rate": 4.9917358695232576e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8901571035385132, + "step": 2466 + }, + { + "epoch": 1.2335, + "grad_norm": 1.9453362821151563, + "learning_rate": 4.991700382754934e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8797683715820312, + "step": 2467 + }, + { + "epoch": 1.234, + "grad_norm": 2.7306257612284286, + "learning_rate": 4.991664820085075e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8755943179130554, + "step": 2468 + }, + { + "epoch": 1.2345, + "grad_norm": 3.324307791507287, + "learning_rate": 4.991629181514766e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8559092283248901, + "step": 2469 + }, + { + "epoch": 1.2349999999999999, + "grad_norm": 1.9797842737431453, + "learning_rate": 4.991593467045092e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8819645643234253, + "step": 2470 + }, + { + "epoch": 1.2355, + "grad_norm": 2.1234657388295264, + "learning_rate": 4.991557676677141e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8646328449249268, + "step": 2471 + }, + { + "epoch": 1.236, + "grad_norm": 6.262026643125095, + "learning_rate": 4.9915218104120024e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8725705742835999, + "step": 2472 + }, + { + "epoch": 1.2365, + "grad_norm": 2.3692266908508097, + "learning_rate": 4.9914858682507696e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8724471926689148, + "step": 2473 + }, + { + "epoch": 1.237, + "grad_norm": 2.800417428422308, + "learning_rate": 4.9914498501945384e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8826579451560974, + "step": 2474 + }, + { + "epoch": 1.2375, + "grad_norm": 2.701263863937478, + "learning_rate": 4.991413756244404e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8789324760437012, + "step": 2475 + }, + { + "epoch": 1.238, + "grad_norm": 14.049952434800339, + "learning_rate": 4.9913775864014665e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8718064427375793, + "step": 2476 + }, + { + "epoch": 1.2385, + "grad_norm": 4.128983111229445, + "learning_rate": 4.991341340666829e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8789078593254089, + "step": 2477 + }, + { + "epoch": 1.2389999999999999, + "grad_norm": 3.8803728738753347, + "learning_rate": 4.991305019041594e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.882322371006012, + "step": 2478 + }, + { + "epoch": 1.2395, + "grad_norm": 1.9999825889567457, + "learning_rate": 4.9912686215268684e-06, + "loss": 0.323, + "mean_token_accuracy": 0.8932937979698181, + "step": 2479 + }, + { + "epoch": 1.24, + "grad_norm": 2.1238464682622022, + "learning_rate": 4.9912321481237616e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.884365439414978, + "step": 2480 + }, + { + "epoch": 1.2405, + "grad_norm": 9.931462515064245, + "learning_rate": 4.991195598833383e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8812211155891418, + "step": 2481 + }, + { + "epoch": 1.241, + "grad_norm": 2.0748603743114375, + "learning_rate": 4.991158973656848e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8758500218391418, + "step": 2482 + }, + { + "epoch": 1.2415, + "grad_norm": 3.333828591921908, + "learning_rate": 4.991122272595271e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8829708695411682, + "step": 2483 + }, + { + "epoch": 1.242, + "grad_norm": 2.5416613284457936, + "learning_rate": 4.99108549564977e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8806358575820923, + "step": 2484 + }, + { + "epoch": 1.2425, + "grad_norm": 2.5958450777830424, + "learning_rate": 4.991048642821466e-06, + "loss": 0.2556, + "mean_token_accuracy": 0.9164825677871704, + "step": 2485 + }, + { + "epoch": 1.2429999999999999, + "grad_norm": 5.744558562489135, + "learning_rate": 4.9910117141114815e-06, + "loss": 0.2534, + "mean_token_accuracy": 0.914145290851593, + "step": 2486 + }, + { + "epoch": 1.2435, + "grad_norm": 2.658396485093482, + "learning_rate": 4.99097470952094e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.867253303527832, + "step": 2487 + }, + { + "epoch": 1.244, + "grad_norm": 2.44995184099197, + "learning_rate": 4.990937629050972e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.9021322131156921, + "step": 2488 + }, + { + "epoch": 1.2445, + "grad_norm": 7.084268788285291, + "learning_rate": 4.990900472702702e-06, + "loss": 0.31, + "mean_token_accuracy": 0.8973689079284668, + "step": 2489 + }, + { + "epoch": 1.245, + "grad_norm": 2.3247839419876892, + "learning_rate": 4.990863240477266e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8567497134208679, + "step": 2490 + }, + { + "epoch": 1.2455, + "grad_norm": 2.3239979647951765, + "learning_rate": 4.990825932375797e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8858447670936584, + "step": 2491 + }, + { + "epoch": 1.246, + "grad_norm": 2.521507125396697, + "learning_rate": 4.990788548399432e-06, + "loss": 0.3026, + "mean_token_accuracy": 0.9077203273773193, + "step": 2492 + }, + { + "epoch": 1.2465, + "grad_norm": 2.6720115861773808, + "learning_rate": 4.990751088549308e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8958661556243896, + "step": 2493 + }, + { + "epoch": 1.2469999999999999, + "grad_norm": 4.088832338454395, + "learning_rate": 4.990713552826567e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8946495056152344, + "step": 2494 + }, + { + "epoch": 1.2475, + "grad_norm": 4.203559032742981, + "learning_rate": 4.990675941232353e-06, + "loss": 0.2833, + "mean_token_accuracy": 0.9077908396720886, + "step": 2495 + }, + { + "epoch": 1.248, + "grad_norm": 2.5031293122648823, + "learning_rate": 4.990638253767812e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8697829842567444, + "step": 2496 + }, + { + "epoch": 1.2485, + "grad_norm": 2.6729771001909244, + "learning_rate": 4.990600490434091e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8574461340904236, + "step": 2497 + }, + { + "epoch": 1.249, + "grad_norm": 1.9603331463711433, + "learning_rate": 4.9905626512323406e-06, + "loss": 0.3047, + "mean_token_accuracy": 0.896915853023529, + "step": 2498 + }, + { + "epoch": 1.2495, + "grad_norm": 5.501193995218667, + "learning_rate": 4.9905247361637135e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8934000134468079, + "step": 2499 + }, + { + "epoch": 1.25, + "grad_norm": 2.571135808085696, + "learning_rate": 4.990486745229364e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8573668003082275, + "step": 2500 + }, + { + "epoch": 1.2505, + "grad_norm": 2.8264935850819217, + "learning_rate": 4.990448678430451e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.892187237739563, + "step": 2501 + }, + { + "epoch": 1.251, + "grad_norm": 2.791362428600353, + "learning_rate": 4.990410535768133e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8622293472290039, + "step": 2502 + }, + { + "epoch": 1.2515, + "grad_norm": 2.4631507073040293, + "learning_rate": 4.990372317243571e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.9022082090377808, + "step": 2503 + }, + { + "epoch": 1.252, + "grad_norm": 2.1312258260016383, + "learning_rate": 4.990334022857932e-06, + "loss": 0.2897, + "mean_token_accuracy": 0.9069387912750244, + "step": 2504 + }, + { + "epoch": 1.2525, + "grad_norm": 2.471730957339364, + "learning_rate": 4.990295652612379e-06, + "loss": 0.2435, + "mean_token_accuracy": 0.9215933084487915, + "step": 2505 + }, + { + "epoch": 1.2530000000000001, + "grad_norm": 3.703518194176909, + "learning_rate": 4.990257206508084e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8720195889472961, + "step": 2506 + }, + { + "epoch": 1.2535, + "grad_norm": 1.9005659437136655, + "learning_rate": 4.990218684546216e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8857994079589844, + "step": 2507 + }, + { + "epoch": 1.254, + "grad_norm": 2.208816313409312, + "learning_rate": 4.9901800867279495e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8789159059524536, + "step": 2508 + }, + { + "epoch": 1.2545, + "grad_norm": 4.434239338835746, + "learning_rate": 4.990141413054459e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8879728317260742, + "step": 2509 + }, + { + "epoch": 1.255, + "grad_norm": 2.5050305442568246, + "learning_rate": 4.990102663526925e-06, + "loss": 0.245, + "mean_token_accuracy": 0.9208359718322754, + "step": 2510 + }, + { + "epoch": 1.2555, + "grad_norm": 4.482167051507545, + "learning_rate": 4.990063838146525e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8695865273475647, + "step": 2511 + }, + { + "epoch": 1.256, + "grad_norm": 2.954504196838863, + "learning_rate": 4.9900249369144435e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8603087067604065, + "step": 2512 + }, + { + "epoch": 1.2565, + "grad_norm": 2.8850010476730037, + "learning_rate": 4.989985959831865e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8520895838737488, + "step": 2513 + }, + { + "epoch": 1.2570000000000001, + "grad_norm": 2.55526667147998, + "learning_rate": 4.989946906899977e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8549156188964844, + "step": 2514 + }, + { + "epoch": 1.2575, + "grad_norm": 3.0380239169833203, + "learning_rate": 4.989907778119969e-06, + "loss": 0.5325, + "mean_token_accuracy": 0.8489765524864197, + "step": 2515 + }, + { + "epoch": 1.258, + "grad_norm": 54.69587693731007, + "learning_rate": 4.989868573493032e-06, + "loss": 0.2642, + "mean_token_accuracy": 0.9001321792602539, + "step": 2516 + }, + { + "epoch": 1.2585, + "grad_norm": 2.347612852287577, + "learning_rate": 4.989829293020363e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.883108913898468, + "step": 2517 + }, + { + "epoch": 1.259, + "grad_norm": 2.2910188303470993, + "learning_rate": 4.989789936703155e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8950777053833008, + "step": 2518 + }, + { + "epoch": 1.2595, + "grad_norm": 2.947851565791113, + "learning_rate": 4.989750504542609e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8730062246322632, + "step": 2519 + }, + { + "epoch": 1.26, + "grad_norm": 4.0019949886410044, + "learning_rate": 4.989710996539926e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8950467109680176, + "step": 2520 + }, + { + "epoch": 1.2605, + "grad_norm": 2.9215288091564537, + "learning_rate": 4.98967141269631e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8783089518547058, + "step": 2521 + }, + { + "epoch": 1.2610000000000001, + "grad_norm": 2.3200637367564014, + "learning_rate": 4.989631753012965e-06, + "loss": 0.2742, + "mean_token_accuracy": 0.9078691601753235, + "step": 2522 + }, + { + "epoch": 1.2615, + "grad_norm": 3.225153548131476, + "learning_rate": 4.9895920174911e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8667005300521851, + "step": 2523 + }, + { + "epoch": 1.262, + "grad_norm": 6.842644112475704, + "learning_rate": 4.9895522061319255e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8907449841499329, + "step": 2524 + }, + { + "epoch": 1.2625, + "grad_norm": 2.4327787906391487, + "learning_rate": 4.989512318936654e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8907153606414795, + "step": 2525 + }, + { + "epoch": 1.263, + "grad_norm": 3.0055314155560775, + "learning_rate": 4.9894723559065015e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.9015166759490967, + "step": 2526 + }, + { + "epoch": 1.2635, + "grad_norm": 2.1985942403056673, + "learning_rate": 4.989432317042685e-06, + "loss": 0.248, + "mean_token_accuracy": 0.9197651743888855, + "step": 2527 + }, + { + "epoch": 1.264, + "grad_norm": 3.040970189563025, + "learning_rate": 4.989392202346423e-06, + "loss": 0.2389, + "mean_token_accuracy": 0.9128333926200867, + "step": 2528 + }, + { + "epoch": 1.2645, + "grad_norm": 3.9689880016324643, + "learning_rate": 4.989352011818939e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8571428656578064, + "step": 2529 + }, + { + "epoch": 1.2650000000000001, + "grad_norm": 2.5641859587796114, + "learning_rate": 4.989311745461456e-06, + "loss": 0.2668, + "mean_token_accuracy": 0.9055221676826477, + "step": 2530 + }, + { + "epoch": 1.2655, + "grad_norm": 2.98165430489228, + "learning_rate": 4.989271403275201e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8643895983695984, + "step": 2531 + }, + { + "epoch": 1.266, + "grad_norm": 2.9947893577699474, + "learning_rate": 4.989230985261403e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8855765461921692, + "step": 2532 + }, + { + "epoch": 1.2665, + "grad_norm": 2.7090209551477984, + "learning_rate": 4.989190491421293e-06, + "loss": 0.2929, + "mean_token_accuracy": 0.9027407765388489, + "step": 2533 + }, + { + "epoch": 1.267, + "grad_norm": 2.1483321589160025, + "learning_rate": 4.989149921756105e-06, + "loss": 0.1822, + "mean_token_accuracy": 0.9338427186012268, + "step": 2534 + }, + { + "epoch": 1.2675, + "grad_norm": 3.9625733374518974, + "learning_rate": 4.989109276267074e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8802754878997803, + "step": 2535 + }, + { + "epoch": 1.268, + "grad_norm": 2.6819476565028664, + "learning_rate": 4.98906855495544e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8810727000236511, + "step": 2536 + }, + { + "epoch": 1.2685, + "grad_norm": 4.2630621613053545, + "learning_rate": 4.989027757822441e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.878620982170105, + "step": 2537 + }, + { + "epoch": 1.2690000000000001, + "grad_norm": 6.702774593367379, + "learning_rate": 4.988986884869321e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8894001245498657, + "step": 2538 + }, + { + "epoch": 1.2695, + "grad_norm": 2.3538216786886803, + "learning_rate": 4.988945936097325e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.878918468952179, + "step": 2539 + }, + { + "epoch": 1.27, + "grad_norm": 4.22697884269318, + "learning_rate": 4.9889049115077e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8663366436958313, + "step": 2540 + }, + { + "epoch": 1.2705, + "grad_norm": 2.90116186067249, + "learning_rate": 4.988863811101697e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.8492922782897949, + "step": 2541 + }, + { + "epoch": 1.271, + "grad_norm": 5.390407238958239, + "learning_rate": 4.9888226348805654e-06, + "loss": 0.5878, + "mean_token_accuracy": 0.8284818530082703, + "step": 2542 + }, + { + "epoch": 1.2715, + "grad_norm": 3.721198039137362, + "learning_rate": 4.9887813828455625e-06, + "loss": 0.336, + "mean_token_accuracy": 0.8848951458930969, + "step": 2543 + }, + { + "epoch": 1.272, + "grad_norm": 3.620582762907961, + "learning_rate": 4.988740054997943e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8613579273223877, + "step": 2544 + }, + { + "epoch": 1.2725, + "grad_norm": 2.5223585114834495, + "learning_rate": 4.988698651338965e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8453850746154785, + "step": 2545 + }, + { + "epoch": 1.2730000000000001, + "grad_norm": 2.786115018371665, + "learning_rate": 4.988657171869893e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8725746273994446, + "step": 2546 + }, + { + "epoch": 1.2735, + "grad_norm": 3.370738796735774, + "learning_rate": 4.988615616591988e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8823529481887817, + "step": 2547 + }, + { + "epoch": 1.274, + "grad_norm": 3.6385625406174587, + "learning_rate": 4.988573985506516e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8528138399124146, + "step": 2548 + }, + { + "epoch": 1.2745, + "grad_norm": 2.1140246743602327, + "learning_rate": 4.988532278614746e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8793249130249023, + "step": 2549 + }, + { + "epoch": 1.275, + "grad_norm": 6.971926521856776, + "learning_rate": 4.988490495917948e-06, + "loss": 0.3072, + "mean_token_accuracy": 0.9084256291389465, + "step": 2550 + }, + { + "epoch": 1.2755, + "grad_norm": 6.688006382933348, + "learning_rate": 4.988448637417394e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.8940444588661194, + "step": 2551 + }, + { + "epoch": 1.276, + "grad_norm": 1.861559341340273, + "learning_rate": 4.98840670311436e-06, + "loss": 0.2622, + "mean_token_accuracy": 0.9096885919570923, + "step": 2552 + }, + { + "epoch": 1.2765, + "grad_norm": 3.7759123386835354, + "learning_rate": 4.988364693010124e-06, + "loss": 0.2702, + "mean_token_accuracy": 0.9085354804992676, + "step": 2553 + }, + { + "epoch": 1.2770000000000001, + "grad_norm": 3.2160589515060805, + "learning_rate": 4.988322607105964e-06, + "loss": 0.2665, + "mean_token_accuracy": 0.9115802645683289, + "step": 2554 + }, + { + "epoch": 1.2775, + "grad_norm": 2.084493395275754, + "learning_rate": 4.988280445403164e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8900911211967468, + "step": 2555 + }, + { + "epoch": 1.278, + "grad_norm": 11.392868711639624, + "learning_rate": 4.988238207903007e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8838493824005127, + "step": 2556 + }, + { + "epoch": 1.2785, + "grad_norm": 2.2110367703352996, + "learning_rate": 4.98819589460678e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8633151054382324, + "step": 2557 + }, + { + "epoch": 1.279, + "grad_norm": 2.338617875015391, + "learning_rate": 4.988153505515771e-06, + "loss": 0.2898, + "mean_token_accuracy": 0.9006291627883911, + "step": 2558 + }, + { + "epoch": 1.2795, + "grad_norm": 1.9940216696743331, + "learning_rate": 4.9881110406312724e-06, + "loss": 0.2686, + "mean_token_accuracy": 0.9067890644073486, + "step": 2559 + }, + { + "epoch": 1.28, + "grad_norm": 2.5462406699291207, + "learning_rate": 4.988068499954578e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8796093463897705, + "step": 2560 + }, + { + "epoch": 1.2805, + "grad_norm": 2.2154601743188103, + "learning_rate": 4.988025883486983e-06, + "loss": 0.2705, + "mean_token_accuracy": 0.9149848222732544, + "step": 2561 + }, + { + "epoch": 1.2810000000000001, + "grad_norm": 4.1716402568841975, + "learning_rate": 4.987983191229786e-06, + "loss": 0.2362, + "mean_token_accuracy": 0.921796977519989, + "step": 2562 + }, + { + "epoch": 1.2814999999999999, + "grad_norm": 2.013723451248281, + "learning_rate": 4.987940423184286e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8811096549034119, + "step": 2563 + }, + { + "epoch": 1.282, + "grad_norm": 2.3887191518804882, + "learning_rate": 4.987897579351788e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8411998748779297, + "step": 2564 + }, + { + "epoch": 1.2825, + "grad_norm": 2.271841493965126, + "learning_rate": 4.987854659733597e-06, + "loss": 0.2925, + "mean_token_accuracy": 0.898851752281189, + "step": 2565 + }, + { + "epoch": 1.283, + "grad_norm": 10.875590362290366, + "learning_rate": 4.987811664331018e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8618354201316833, + "step": 2566 + }, + { + "epoch": 1.2835, + "grad_norm": 2.4068269209390505, + "learning_rate": 4.9877685931453625e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8875399231910706, + "step": 2567 + }, + { + "epoch": 1.284, + "grad_norm": 2.502627403200757, + "learning_rate": 4.987725446177941e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.8448820114135742, + "step": 2568 + }, + { + "epoch": 1.2845, + "grad_norm": 2.633182026651007, + "learning_rate": 4.987682223430071e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8761572241783142, + "step": 2569 + }, + { + "epoch": 1.285, + "grad_norm": 2.9195995750163473, + "learning_rate": 4.987638924903066e-06, + "loss": 0.2679, + "mean_token_accuracy": 0.9101975560188293, + "step": 2570 + }, + { + "epoch": 1.2854999999999999, + "grad_norm": 1.8799092109261097, + "learning_rate": 4.987595550598246e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8895145058631897, + "step": 2571 + }, + { + "epoch": 1.286, + "grad_norm": 2.0557878309233395, + "learning_rate": 4.987552100516934e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8664470911026001, + "step": 2572 + }, + { + "epoch": 1.2865, + "grad_norm": 3.4158862392805127, + "learning_rate": 4.98750857466045e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8810082077980042, + "step": 2573 + }, + { + "epoch": 1.287, + "grad_norm": 2.638403098047342, + "learning_rate": 4.987464973030123e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8784050345420837, + "step": 2574 + }, + { + "epoch": 1.2875, + "grad_norm": 2.0039449677480508, + "learning_rate": 4.987421295627279e-06, + "loss": 0.2532, + "mean_token_accuracy": 0.909507691860199, + "step": 2575 + }, + { + "epoch": 1.288, + "grad_norm": 2.0918655632697796, + "learning_rate": 4.9873775424532515e-06, + "loss": 0.3085, + "mean_token_accuracy": 0.8893671035766602, + "step": 2576 + }, + { + "epoch": 1.2885, + "grad_norm": 2.5393403840341446, + "learning_rate": 4.9873337135093695e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8711675405502319, + "step": 2577 + }, + { + "epoch": 1.289, + "grad_norm": 8.067374466496217, + "learning_rate": 4.98728980879697e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8745756149291992, + "step": 2578 + }, + { + "epoch": 1.2894999999999999, + "grad_norm": 4.536506776056758, + "learning_rate": 4.987245828317391e-06, + "loss": 0.2727, + "mean_token_accuracy": 0.9147695302963257, + "step": 2579 + }, + { + "epoch": 1.29, + "grad_norm": 2.0692007854698877, + "learning_rate": 4.987201772071971e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8927053213119507, + "step": 2580 + }, + { + "epoch": 1.2905, + "grad_norm": 2.169037925898852, + "learning_rate": 4.987157640062053e-06, + "loss": 0.2799, + "mean_token_accuracy": 0.9084926247596741, + "step": 2581 + }, + { + "epoch": 1.291, + "grad_norm": 2.238553375433489, + "learning_rate": 4.98711343228898e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8725069165229797, + "step": 2582 + }, + { + "epoch": 1.2915, + "grad_norm": 5.389052859895309, + "learning_rate": 4.9870691487541e-06, + "loss": 0.2677, + "mean_token_accuracy": 0.9107936024665833, + "step": 2583 + }, + { + "epoch": 1.292, + "grad_norm": 2.378104184083976, + "learning_rate": 4.987024789458762e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8793847560882568, + "step": 2584 + }, + { + "epoch": 1.2925, + "grad_norm": 4.031714995958326, + "learning_rate": 4.986980354404316e-06, + "loss": 0.2934, + "mean_token_accuracy": 0.9117646813392639, + "step": 2585 + }, + { + "epoch": 1.293, + "grad_norm": 3.503753172617567, + "learning_rate": 4.986935843592117e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8781608939170837, + "step": 2586 + }, + { + "epoch": 1.2934999999999999, + "grad_norm": 2.2976828748046563, + "learning_rate": 4.986891257023521e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8901062607765198, + "step": 2587 + }, + { + "epoch": 1.294, + "grad_norm": 3.1562042211530863, + "learning_rate": 4.9868465946998835e-06, + "loss": 0.287, + "mean_token_accuracy": 0.9040576815605164, + "step": 2588 + }, + { + "epoch": 1.2945, + "grad_norm": 6.275868587432262, + "learning_rate": 4.986801856622568e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.8640776872634888, + "step": 2589 + }, + { + "epoch": 1.295, + "grad_norm": 5.430532611664113, + "learning_rate": 4.9867570427929356e-06, + "loss": 0.3076, + "mean_token_accuracy": 0.898851752281189, + "step": 2590 + }, + { + "epoch": 1.2955, + "grad_norm": 2.8340723909459773, + "learning_rate": 4.986712153212353e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8591595888137817, + "step": 2591 + }, + { + "epoch": 1.296, + "grad_norm": 2.3022064543465266, + "learning_rate": 4.986667187882186e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8919662833213806, + "step": 2592 + }, + { + "epoch": 1.2965, + "grad_norm": 19.8801263684607, + "learning_rate": 4.986622146803804e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8731138706207275, + "step": 2593 + }, + { + "epoch": 1.297, + "grad_norm": 2.2591631294438104, + "learning_rate": 4.986577029978581e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8528546094894409, + "step": 2594 + }, + { + "epoch": 1.2974999999999999, + "grad_norm": 2.5341628673398775, + "learning_rate": 4.986531837407891e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.887747585773468, + "step": 2595 + }, + { + "epoch": 1.298, + "grad_norm": 1.9964355537432528, + "learning_rate": 4.986486569093109e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8816306591033936, + "step": 2596 + }, + { + "epoch": 1.2985, + "grad_norm": 2.8730182510757163, + "learning_rate": 4.9864412250356145e-06, + "loss": 0.316, + "mean_token_accuracy": 0.8988804817199707, + "step": 2597 + }, + { + "epoch": 1.299, + "grad_norm": 4.281798658615667, + "learning_rate": 4.986395805236789e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8810210824012756, + "step": 2598 + }, + { + "epoch": 1.2995, + "grad_norm": 8.721230643991774, + "learning_rate": 4.986350309698017e-06, + "loss": 0.281, + "mean_token_accuracy": 0.8968728184700012, + "step": 2599 + }, + { + "epoch": 1.3, + "grad_norm": 7.180234629289914, + "learning_rate": 4.986304738420684e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8477774858474731, + "step": 2600 + }, + { + "epoch": 1.3005, + "grad_norm": 1.9237896210007497, + "learning_rate": 4.986259091406177e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8902942538261414, + "step": 2601 + }, + { + "epoch": 1.301, + "grad_norm": 2.4732323432968797, + "learning_rate": 4.986213368655887e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8943045735359192, + "step": 2602 + }, + { + "epoch": 1.3014999999999999, + "grad_norm": 3.0149980134403154, + "learning_rate": 4.986167570171208e-06, + "loss": 0.683, + "mean_token_accuracy": 0.7993019223213196, + "step": 2603 + }, + { + "epoch": 1.302, + "grad_norm": 3.314970873842207, + "learning_rate": 4.986121695953534e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8790218830108643, + "step": 2604 + }, + { + "epoch": 1.3025, + "grad_norm": 2.0677264159152298, + "learning_rate": 4.986075746004262e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8818487524986267, + "step": 2605 + }, + { + "epoch": 1.303, + "grad_norm": 3.3571569215087727, + "learning_rate": 4.986029720324792e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8713648319244385, + "step": 2606 + }, + { + "epoch": 1.3035, + "grad_norm": 3.0642641086262303, + "learning_rate": 4.985983618916527e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.8556666970252991, + "step": 2607 + }, + { + "epoch": 1.304, + "grad_norm": 5.322086897919999, + "learning_rate": 4.98593744178087e-06, + "loss": 0.2759, + "mean_token_accuracy": 0.9109182953834534, + "step": 2608 + }, + { + "epoch": 1.3045, + "grad_norm": 2.39857261647764, + "learning_rate": 4.985891188919229e-06, + "loss": 0.549, + "mean_token_accuracy": 0.853981614112854, + "step": 2609 + }, + { + "epoch": 1.305, + "grad_norm": 2.7246538138751166, + "learning_rate": 4.985844860333012e-06, + "loss": 0.3022, + "mean_token_accuracy": 0.8951244950294495, + "step": 2610 + }, + { + "epoch": 1.3054999999999999, + "grad_norm": 2.399048069300529, + "learning_rate": 4.985798456023631e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8747091293334961, + "step": 2611 + }, + { + "epoch": 1.306, + "grad_norm": 2.260734932779869, + "learning_rate": 4.985751975992498e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8943921327590942, + "step": 2612 + }, + { + "epoch": 1.3065, + "grad_norm": 2.123306835231304, + "learning_rate": 4.98570542024103e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8945483565330505, + "step": 2613 + }, + { + "epoch": 1.307, + "grad_norm": 2.4238282049211373, + "learning_rate": 4.985658788770645e-06, + "loss": 0.3019, + "mean_token_accuracy": 0.8992380499839783, + "step": 2614 + }, + { + "epoch": 1.3075, + "grad_norm": 2.478355718975425, + "learning_rate": 4.985612081582763e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8665525913238525, + "step": 2615 + }, + { + "epoch": 1.308, + "grad_norm": 2.2262171565175803, + "learning_rate": 4.985565298678809e-06, + "loss": 0.246, + "mean_token_accuracy": 0.9149600267410278, + "step": 2616 + }, + { + "epoch": 1.3085, + "grad_norm": 4.14128351633413, + "learning_rate": 4.985518440060205e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8754929304122925, + "step": 2617 + }, + { + "epoch": 1.309, + "grad_norm": 2.648946109378292, + "learning_rate": 4.985471505728381e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8860049843788147, + "step": 2618 + }, + { + "epoch": 1.3094999999999999, + "grad_norm": 2.2783011610267287, + "learning_rate": 4.9854244956847645e-06, + "loss": 0.2979, + "mean_token_accuracy": 0.898613691329956, + "step": 2619 + }, + { + "epoch": 1.31, + "grad_norm": 3.459380241594694, + "learning_rate": 4.985377409930789e-06, + "loss": 0.5396, + "mean_token_accuracy": 0.8485142588615417, + "step": 2620 + }, + { + "epoch": 1.3105, + "grad_norm": 2.5797866684705526, + "learning_rate": 4.985330248467887e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8900614380836487, + "step": 2621 + }, + { + "epoch": 1.311, + "grad_norm": 2.722610276798883, + "learning_rate": 4.985283011297498e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8562934994697571, + "step": 2622 + }, + { + "epoch": 1.3115, + "grad_norm": 2.561454635783533, + "learning_rate": 4.985235698421059e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.869407057762146, + "step": 2623 + }, + { + "epoch": 1.312, + "grad_norm": 1.944144873778836, + "learning_rate": 4.985188309840012e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8916146755218506, + "step": 2624 + }, + { + "epoch": 1.3125, + "grad_norm": 2.2680894697059024, + "learning_rate": 4.985140845555799e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8579216599464417, + "step": 2625 + }, + { + "epoch": 1.313, + "grad_norm": 6.3973951270602045, + "learning_rate": 4.985093305569868e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.880882978439331, + "step": 2626 + }, + { + "epoch": 1.3135, + "grad_norm": 6.30924751028503, + "learning_rate": 4.985045689883666e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8764764070510864, + "step": 2627 + }, + { + "epoch": 1.314, + "grad_norm": 2.3625935401702045, + "learning_rate": 4.984997998498643e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8741775155067444, + "step": 2628 + }, + { + "epoch": 1.3145, + "grad_norm": 4.078386582053017, + "learning_rate": 4.9849502314162524e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.872536301612854, + "step": 2629 + }, + { + "epoch": 1.315, + "grad_norm": 1.8316208133311893, + "learning_rate": 4.98490238863795e-06, + "loss": 0.2909, + "mean_token_accuracy": 0.9016368985176086, + "step": 2630 + }, + { + "epoch": 1.3155000000000001, + "grad_norm": 5.042208416312547, + "learning_rate": 4.9848544701651915e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8616239428520203, + "step": 2631 + }, + { + "epoch": 1.316, + "grad_norm": 4.388476104191994, + "learning_rate": 4.984806475999437e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8811426758766174, + "step": 2632 + }, + { + "epoch": 1.3165, + "grad_norm": 2.9650047606991223, + "learning_rate": 4.984758406142151e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8907335996627808, + "step": 2633 + }, + { + "epoch": 1.317, + "grad_norm": 2.022222851696174, + "learning_rate": 4.984710260594794e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8766182065010071, + "step": 2634 + }, + { + "epoch": 1.3175, + "grad_norm": 2.22295138086279, + "learning_rate": 4.984662039358835e-06, + "loss": 0.3033, + "mean_token_accuracy": 0.9017767906188965, + "step": 2635 + }, + { + "epoch": 1.318, + "grad_norm": 2.364970085381014, + "learning_rate": 4.984613742435742e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8727442026138306, + "step": 2636 + }, + { + "epoch": 1.3185, + "grad_norm": 3.817810882063994, + "learning_rate": 4.984565369826986e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8593490719795227, + "step": 2637 + }, + { + "epoch": 1.319, + "grad_norm": 9.922747339615192, + "learning_rate": 4.984516921534042e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8644727468490601, + "step": 2638 + }, + { + "epoch": 1.3195000000000001, + "grad_norm": 2.399748376451321, + "learning_rate": 4.984468397558384e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.880606472492218, + "step": 2639 + }, + { + "epoch": 1.32, + "grad_norm": 2.3032130772137718, + "learning_rate": 4.984419797901491e-06, + "loss": 0.2557, + "mean_token_accuracy": 0.9166973829269409, + "step": 2640 + }, + { + "epoch": 1.3205, + "grad_norm": 2.5776911507486413, + "learning_rate": 4.984371122564844e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8836402893066406, + "step": 2641 + }, + { + "epoch": 1.321, + "grad_norm": 28.94632123247562, + "learning_rate": 4.984322371549924e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.9045280814170837, + "step": 2642 + }, + { + "epoch": 1.3215, + "grad_norm": 39.14511999144349, + "learning_rate": 4.984273544858218e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8836228847503662, + "step": 2643 + }, + { + "epoch": 1.322, + "grad_norm": 8.732280273315277, + "learning_rate": 4.984224642491212e-06, + "loss": 0.5514, + "mean_token_accuracy": 0.8481127619743347, + "step": 2644 + }, + { + "epoch": 1.3225, + "grad_norm": 2.366539889141116, + "learning_rate": 4.9841756644503965e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8583404421806335, + "step": 2645 + }, + { + "epoch": 1.323, + "grad_norm": 5.163258719858546, + "learning_rate": 4.9841266107372634e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8822619318962097, + "step": 2646 + }, + { + "epoch": 1.3235000000000001, + "grad_norm": 2.1683220926949436, + "learning_rate": 4.984077481353305e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8968105316162109, + "step": 2647 + }, + { + "epoch": 1.324, + "grad_norm": 3.4385802037532014, + "learning_rate": 4.984028276300021e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8505163192749023, + "step": 2648 + }, + { + "epoch": 1.3245, + "grad_norm": 2.1885496745738413, + "learning_rate": 4.983978995578908e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8856616020202637, + "step": 2649 + }, + { + "epoch": 1.325, + "grad_norm": 2.894042812921278, + "learning_rate": 4.9839296391914696e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8703201413154602, + "step": 2650 + }, + { + "epoch": 1.3255, + "grad_norm": 2.1181649349302076, + "learning_rate": 4.983880207139205e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8814938068389893, + "step": 2651 + }, + { + "epoch": 1.326, + "grad_norm": 1.9555897392415396, + "learning_rate": 4.983830699423625e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8646247386932373, + "step": 2652 + }, + { + "epoch": 1.3265, + "grad_norm": 3.268332410018144, + "learning_rate": 4.983781116046234e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8704704642295837, + "step": 2653 + }, + { + "epoch": 1.327, + "grad_norm": 3.404977324206011, + "learning_rate": 4.9837314570085435e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.8940940499305725, + "step": 2654 + }, + { + "epoch": 1.3275000000000001, + "grad_norm": 2.3704558062857943, + "learning_rate": 4.983681722312068e-06, + "loss": 0.2986, + "mean_token_accuracy": 0.8892825245857239, + "step": 2655 + }, + { + "epoch": 1.328, + "grad_norm": 2.098752841972737, + "learning_rate": 4.983631911958319e-06, + "loss": 0.398, + "mean_token_accuracy": 0.868639349937439, + "step": 2656 + }, + { + "epoch": 1.3285, + "grad_norm": 3.1977644925134143, + "learning_rate": 4.983582025948816e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.858430027961731, + "step": 2657 + }, + { + "epoch": 1.329, + "grad_norm": 2.12096227732156, + "learning_rate": 4.98353206428508e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8818918466567993, + "step": 2658 + }, + { + "epoch": 1.3295, + "grad_norm": 4.05351439524177, + "learning_rate": 4.98348202696863e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8901515007019043, + "step": 2659 + }, + { + "epoch": 1.33, + "grad_norm": 2.3113674601619483, + "learning_rate": 4.983431914000991e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8819058537483215, + "step": 2660 + }, + { + "epoch": 1.3305, + "grad_norm": 2.2032726325616228, + "learning_rate": 4.983381725383692e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8955920934677124, + "step": 2661 + }, + { + "epoch": 1.331, + "grad_norm": 7.5916906674934514, + "learning_rate": 4.9833314611182575e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8665147423744202, + "step": 2662 + }, + { + "epoch": 1.3315000000000001, + "grad_norm": 3.4038212588687484, + "learning_rate": 4.983281121206222e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8927372097969055, + "step": 2663 + }, + { + "epoch": 1.332, + "grad_norm": 2.6077208826610168, + "learning_rate": 4.983230705649118e-06, + "loss": 0.2801, + "mean_token_accuracy": 0.9062790274620056, + "step": 2664 + }, + { + "epoch": 1.3325, + "grad_norm": 3.037843512736961, + "learning_rate": 4.983180214448481e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8699685335159302, + "step": 2665 + }, + { + "epoch": 1.333, + "grad_norm": 11.45779845364638, + "learning_rate": 4.983129647605849e-06, + "loss": 0.2772, + "mean_token_accuracy": 0.9067413806915283, + "step": 2666 + }, + { + "epoch": 1.3335, + "grad_norm": 2.593223728579861, + "learning_rate": 4.983079005122763e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8700098395347595, + "step": 2667 + }, + { + "epoch": 1.334, + "grad_norm": 3.017571024554558, + "learning_rate": 4.983028287000764e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8846515417098999, + "step": 2668 + }, + { + "epoch": 1.3345, + "grad_norm": 2.726291894077532, + "learning_rate": 4.9829774932414e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8819429874420166, + "step": 2669 + }, + { + "epoch": 1.335, + "grad_norm": 2.2993267810390186, + "learning_rate": 4.982926623846216e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8624548316001892, + "step": 2670 + }, + { + "epoch": 1.3355000000000001, + "grad_norm": 1.8561554375766507, + "learning_rate": 4.982875678816761e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8603895902633667, + "step": 2671 + }, + { + "epoch": 1.336, + "grad_norm": 2.515723473535935, + "learning_rate": 4.982824658154589e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8817322254180908, + "step": 2672 + }, + { + "epoch": 1.3365, + "grad_norm": 3.248480308857295, + "learning_rate": 4.982773561861253e-06, + "loss": 0.2446, + "mean_token_accuracy": 0.9117476344108582, + "step": 2673 + }, + { + "epoch": 1.337, + "grad_norm": 3.5352006016471305, + "learning_rate": 4.982722389938309e-06, + "loss": 0.283, + "mean_token_accuracy": 0.8972245454788208, + "step": 2674 + }, + { + "epoch": 1.3375, + "grad_norm": 7.9909152973590105, + "learning_rate": 4.982671142387316e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8747158646583557, + "step": 2675 + }, + { + "epoch": 1.338, + "grad_norm": 2.320523767714981, + "learning_rate": 4.982619819209837e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8910183906555176, + "step": 2676 + }, + { + "epoch": 1.3385, + "grad_norm": 3.450303469697223, + "learning_rate": 4.982568420407432e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8951466679573059, + "step": 2677 + }, + { + "epoch": 1.339, + "grad_norm": 2.5453166239792226, + "learning_rate": 4.982516945981669e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8841961622238159, + "step": 2678 + }, + { + "epoch": 1.3395000000000001, + "grad_norm": 2.4801382326448373, + "learning_rate": 4.9824653959341165e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8984863758087158, + "step": 2679 + }, + { + "epoch": 1.34, + "grad_norm": 4.850334015390436, + "learning_rate": 4.9824137702663424e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8950715661048889, + "step": 2680 + }, + { + "epoch": 1.3405, + "grad_norm": 9.465752923578682, + "learning_rate": 4.982362068979921e-06, + "loss": 0.3008, + "mean_token_accuracy": 0.9065684080123901, + "step": 2681 + }, + { + "epoch": 1.341, + "grad_norm": 15.502776016624637, + "learning_rate": 4.982310292076429e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8910196423530579, + "step": 2682 + }, + { + "epoch": 1.3415, + "grad_norm": 2.399739060004114, + "learning_rate": 4.982258439557439e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8761820793151855, + "step": 2683 + }, + { + "epoch": 1.342, + "grad_norm": 34.90540221339275, + "learning_rate": 4.9822065114245345e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8855239152908325, + "step": 2684 + }, + { + "epoch": 1.3425, + "grad_norm": 4.777506951173396, + "learning_rate": 4.982154507679296e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8598459362983704, + "step": 2685 + }, + { + "epoch": 1.343, + "grad_norm": 2.136090589919277, + "learning_rate": 4.982102428323307e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8815856575965881, + "step": 2686 + }, + { + "epoch": 1.3435000000000001, + "grad_norm": 2.5182058627924717, + "learning_rate": 4.982050273358155e-06, + "loss": 0.3018, + "mean_token_accuracy": 0.9089062213897705, + "step": 2687 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 2.313098645027501, + "learning_rate": 4.981998042785427e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8845327496528625, + "step": 2688 + }, + { + "epoch": 1.3445, + "grad_norm": 2.122703194206534, + "learning_rate": 4.981945736606716e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8779404163360596, + "step": 2689 + }, + { + "epoch": 1.345, + "grad_norm": 4.808227115303734, + "learning_rate": 4.981893354823614e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8633484244346619, + "step": 2690 + }, + { + "epoch": 1.3455, + "grad_norm": 4.3283225062856685, + "learning_rate": 4.981840897437718e-06, + "loss": 0.5316, + "mean_token_accuracy": 0.8579663038253784, + "step": 2691 + }, + { + "epoch": 1.346, + "grad_norm": 5.596187090886464, + "learning_rate": 4.981788364450625e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.9047188758850098, + "step": 2692 + }, + { + "epoch": 1.3465, + "grad_norm": 2.696089951359599, + "learning_rate": 4.981735755863935e-06, + "loss": 0.5685, + "mean_token_accuracy": 0.8244251012802124, + "step": 2693 + }, + { + "epoch": 1.347, + "grad_norm": 3.258499342712734, + "learning_rate": 4.981683071679251e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8713539838790894, + "step": 2694 + }, + { + "epoch": 1.3475, + "grad_norm": 2.97173067399273, + "learning_rate": 4.981630311898178e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.891092836856842, + "step": 2695 + }, + { + "epoch": 1.3479999999999999, + "grad_norm": 1.4002750138847797, + "learning_rate": 4.981577476522323e-06, + "loss": 0.1756, + "mean_token_accuracy": 0.936677098274231, + "step": 2696 + }, + { + "epoch": 1.3485, + "grad_norm": 2.4449275779479445, + "learning_rate": 4.981524565553295e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8522130250930786, + "step": 2697 + }, + { + "epoch": 1.349, + "grad_norm": 2.418087775746348, + "learning_rate": 4.981471578992706e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.9027335047721863, + "step": 2698 + }, + { + "epoch": 1.3495, + "grad_norm": 3.820101408466968, + "learning_rate": 4.981418516842171e-06, + "loss": 0.2894, + "mean_token_accuracy": 0.9048511385917664, + "step": 2699 + }, + { + "epoch": 1.35, + "grad_norm": 2.3008710167498334, + "learning_rate": 4.981365379103306e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8707494139671326, + "step": 2700 + }, + { + "epoch": 1.3505, + "grad_norm": 16.127300617427576, + "learning_rate": 4.981312165777728e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8826849460601807, + "step": 2701 + }, + { + "epoch": 1.351, + "grad_norm": 7.079222482967488, + "learning_rate": 4.98125887686706e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.860468864440918, + "step": 2702 + }, + { + "epoch": 1.3515, + "grad_norm": 4.098995915499342, + "learning_rate": 4.981205512372924e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8950048685073853, + "step": 2703 + }, + { + "epoch": 1.3519999999999999, + "grad_norm": 2.1210677536255487, + "learning_rate": 4.9811520722969465e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.884636640548706, + "step": 2704 + }, + { + "epoch": 1.3525, + "grad_norm": 2.9005559161302665, + "learning_rate": 4.981098556640754e-06, + "loss": 0.2882, + "mean_token_accuracy": 0.9071244597434998, + "step": 2705 + }, + { + "epoch": 1.353, + "grad_norm": 3.157886847943047, + "learning_rate": 4.981044965405979e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.871313214302063, + "step": 2706 + }, + { + "epoch": 1.3535, + "grad_norm": 2.552214671142988, + "learning_rate": 4.980991298594252e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8968499898910522, + "step": 2707 + }, + { + "epoch": 1.354, + "grad_norm": 4.552663122619018, + "learning_rate": 4.980937556207208e-06, + "loss": 0.2511, + "mean_token_accuracy": 0.9224137663841248, + "step": 2708 + }, + { + "epoch": 1.3545, + "grad_norm": 2.3272449074111075, + "learning_rate": 4.980883738246485e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8688942193984985, + "step": 2709 + }, + { + "epoch": 1.355, + "grad_norm": 2.6678465505858404, + "learning_rate": 4.980829844713722e-06, + "loss": 0.373, + "mean_token_accuracy": 0.880348801612854, + "step": 2710 + }, + { + "epoch": 1.3555, + "grad_norm": 2.0361384987388216, + "learning_rate": 4.9807758756105605e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8981547355651855, + "step": 2711 + }, + { + "epoch": 1.3559999999999999, + "grad_norm": 2.33881429569946, + "learning_rate": 4.980721830938645e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8658433556556702, + "step": 2712 + }, + { + "epoch": 1.3565, + "grad_norm": 2.3423851776115265, + "learning_rate": 4.980667710699621e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8894755840301514, + "step": 2713 + }, + { + "epoch": 1.357, + "grad_norm": 2.6115606965504274, + "learning_rate": 4.980613514895136e-06, + "loss": 0.313, + "mean_token_accuracy": 0.8936693072319031, + "step": 2714 + }, + { + "epoch": 1.3575, + "grad_norm": 9.248223399976574, + "learning_rate": 4.980559243526844e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8813797831535339, + "step": 2715 + }, + { + "epoch": 1.358, + "grad_norm": 6.30449976207802, + "learning_rate": 4.980504896596397e-06, + "loss": 0.5324, + "mean_token_accuracy": 0.8398173451423645, + "step": 2716 + }, + { + "epoch": 1.3585, + "grad_norm": 2.9909150552381596, + "learning_rate": 4.9804504741054485e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8838527202606201, + "step": 2717 + }, + { + "epoch": 1.359, + "grad_norm": 4.855543143626225, + "learning_rate": 4.980395976055659e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8599025011062622, + "step": 2718 + }, + { + "epoch": 1.3595, + "grad_norm": 1.8808573916190439, + "learning_rate": 4.9803414024486865e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.9034457802772522, + "step": 2719 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 5.587414995102895, + "learning_rate": 4.980286753286196e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8814197778701782, + "step": 2720 + }, + { + "epoch": 1.3605, + "grad_norm": 2.660116910750255, + "learning_rate": 4.980232028569849e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.8972787857055664, + "step": 2721 + }, + { + "epoch": 1.361, + "grad_norm": 2.0492592658429327, + "learning_rate": 4.9801772283013135e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8903282880783081, + "step": 2722 + }, + { + "epoch": 1.3615, + "grad_norm": 2.7209421458437797, + "learning_rate": 4.9801223524822605e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8527919054031372, + "step": 2723 + }, + { + "epoch": 1.362, + "grad_norm": 1.8731927186698767, + "learning_rate": 4.980067401114361e-06, + "loss": 0.2158, + "mean_token_accuracy": 0.9241364002227783, + "step": 2724 + }, + { + "epoch": 1.3625, + "grad_norm": 3.3578350250125344, + "learning_rate": 4.980012374199288e-06, + "loss": 0.3136, + "mean_token_accuracy": 0.899433434009552, + "step": 2725 + }, + { + "epoch": 1.363, + "grad_norm": 10.0926643216027, + "learning_rate": 4.979957271738718e-06, + "loss": 0.2892, + "mean_token_accuracy": 0.9021006226539612, + "step": 2726 + }, + { + "epoch": 1.3635, + "grad_norm": 2.6472781139844277, + "learning_rate": 4.97990209373433e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.9069212675094604, + "step": 2727 + }, + { + "epoch": 1.3639999999999999, + "grad_norm": 2.003981692031176, + "learning_rate": 4.979846840187804e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8847039341926575, + "step": 2728 + }, + { + "epoch": 1.3645, + "grad_norm": 2.890926771002096, + "learning_rate": 4.9797915111008236e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8981274962425232, + "step": 2729 + }, + { + "epoch": 1.365, + "grad_norm": 2.9718574170771634, + "learning_rate": 4.979736106475075e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.871433436870575, + "step": 2730 + }, + { + "epoch": 1.3655, + "grad_norm": 2.871879868207927, + "learning_rate": 4.979680626312244e-06, + "loss": 0.2906, + "mean_token_accuracy": 0.8999066352844238, + "step": 2731 + }, + { + "epoch": 1.366, + "grad_norm": 2.403954435585962, + "learning_rate": 4.979625070614023e-06, + "loss": 0.2977, + "mean_token_accuracy": 0.894538938999176, + "step": 2732 + }, + { + "epoch": 1.3665, + "grad_norm": 3.6199802746957954, + "learning_rate": 4.979569439382101e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.897803008556366, + "step": 2733 + }, + { + "epoch": 1.367, + "grad_norm": 1.6937080222813625, + "learning_rate": 4.979513732618177e-06, + "loss": 0.2529, + "mean_token_accuracy": 0.9078509211540222, + "step": 2734 + }, + { + "epoch": 1.3675, + "grad_norm": 2.370700247636131, + "learning_rate": 4.979457950323945e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8627796769142151, + "step": 2735 + }, + { + "epoch": 1.3679999999999999, + "grad_norm": 3.2008631997557444, + "learning_rate": 4.979402092501104e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8932265639305115, + "step": 2736 + }, + { + "epoch": 1.3685, + "grad_norm": 2.2487872304877228, + "learning_rate": 4.979346159151357e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8611413240432739, + "step": 2737 + }, + { + "epoch": 1.369, + "grad_norm": 2.6583326788575143, + "learning_rate": 4.979290150276408e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8884078860282898, + "step": 2738 + }, + { + "epoch": 1.3695, + "grad_norm": 7.820596147834839, + "learning_rate": 4.979234065877961e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8807471394538879, + "step": 2739 + }, + { + "epoch": 1.37, + "grad_norm": 2.2751364063468857, + "learning_rate": 4.979177905957726e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8859072327613831, + "step": 2740 + }, + { + "epoch": 1.3705, + "grad_norm": 3.16931771878068, + "learning_rate": 4.979121670517414e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8688711524009705, + "step": 2741 + }, + { + "epoch": 1.371, + "grad_norm": 5.160214291287451, + "learning_rate": 4.979065359558738e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8750384449958801, + "step": 2742 + }, + { + "epoch": 1.3715, + "grad_norm": 6.673683519233731, + "learning_rate": 4.979008973083412e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8743879795074463, + "step": 2743 + }, + { + "epoch": 1.3719999999999999, + "grad_norm": 2.8907111651877164, + "learning_rate": 4.9789525110931545e-06, + "loss": 0.3078, + "mean_token_accuracy": 0.8980721235275269, + "step": 2744 + }, + { + "epoch": 1.3725, + "grad_norm": 2.8863567693135614, + "learning_rate": 4.978895973589686e-06, + "loss": 0.5187, + "mean_token_accuracy": 0.8542253375053406, + "step": 2745 + }, + { + "epoch": 1.373, + "grad_norm": 12.016827014681416, + "learning_rate": 4.978839360574727e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.905401349067688, + "step": 2746 + }, + { + "epoch": 1.3735, + "grad_norm": 2.9503997135773647, + "learning_rate": 4.978782672050004e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8916666507720947, + "step": 2747 + }, + { + "epoch": 1.374, + "grad_norm": 3.3998735327902736, + "learning_rate": 4.978725908017244e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8648881316184998, + "step": 2748 + }, + { + "epoch": 1.3745, + "grad_norm": 16.715387979235004, + "learning_rate": 4.978669068478173e-06, + "loss": 0.3101, + "mean_token_accuracy": 0.8970853090286255, + "step": 2749 + }, + { + "epoch": 1.375, + "grad_norm": 2.80947296484371, + "learning_rate": 4.978612153434527e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8666666746139526, + "step": 2750 + }, + { + "epoch": 1.3755, + "grad_norm": 4.090215049066494, + "learning_rate": 4.978555162888036e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8693253993988037, + "step": 2751 + }, + { + "epoch": 1.376, + "grad_norm": 2.789783641955177, + "learning_rate": 4.978498096840437e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8912016749382019, + "step": 2752 + }, + { + "epoch": 1.3765, + "grad_norm": 4.287473125800498, + "learning_rate": 4.978440955293469e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.873064398765564, + "step": 2753 + }, + { + "epoch": 1.377, + "grad_norm": 26.8350344860944, + "learning_rate": 4.978383738248872e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8629796504974365, + "step": 2754 + }, + { + "epoch": 1.3775, + "grad_norm": 4.452977055735668, + "learning_rate": 4.97832644570839e-06, + "loss": 0.5171, + "mean_token_accuracy": 0.8512628674507141, + "step": 2755 + }, + { + "epoch": 1.3780000000000001, + "grad_norm": 2.35231257746174, + "learning_rate": 4.978269077673767e-06, + "loss": 0.592, + "mean_token_accuracy": 0.8238796591758728, + "step": 2756 + }, + { + "epoch": 1.3785, + "grad_norm": 4.071251030781996, + "learning_rate": 4.9782116341467515e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8805409669876099, + "step": 2757 + }, + { + "epoch": 1.379, + "grad_norm": 2.918081285881815, + "learning_rate": 4.978154115129091e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8987138271331787, + "step": 2758 + }, + { + "epoch": 1.3795, + "grad_norm": 2.164233218044367, + "learning_rate": 4.978096520622542e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.881965696811676, + "step": 2759 + }, + { + "epoch": 1.38, + "grad_norm": 2.842694928997513, + "learning_rate": 4.978038850628855e-06, + "loss": 0.7659, + "mean_token_accuracy": 0.7752179503440857, + "step": 2760 + }, + { + "epoch": 1.3805, + "grad_norm": 2.5877592632141617, + "learning_rate": 4.9779811051497884e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8843005895614624, + "step": 2761 + }, + { + "epoch": 1.381, + "grad_norm": 6.888414218030372, + "learning_rate": 4.977923284187101e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8969020843505859, + "step": 2762 + }, + { + "epoch": 1.3815, + "grad_norm": 3.2373933073862804, + "learning_rate": 4.977865387742553e-06, + "loss": 0.2838, + "mean_token_accuracy": 0.9079834818840027, + "step": 2763 + }, + { + "epoch": 1.3820000000000001, + "grad_norm": 2.319585915512049, + "learning_rate": 4.97780741581791e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.867976725101471, + "step": 2764 + }, + { + "epoch": 1.3825, + "grad_norm": 2.7365001370367112, + "learning_rate": 4.977749368414938e-06, + "loss": 0.2437, + "mean_token_accuracy": 0.9164915680885315, + "step": 2765 + }, + { + "epoch": 1.383, + "grad_norm": 2.8583901815077994, + "learning_rate": 4.977691245535403e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8755898475646973, + "step": 2766 + }, + { + "epoch": 1.3835, + "grad_norm": 1.6602807464882743, + "learning_rate": 4.977633047181077e-06, + "loss": 0.2909, + "mean_token_accuracy": 0.9009767770767212, + "step": 2767 + }, + { + "epoch": 1.384, + "grad_norm": 2.4476323832539437, + "learning_rate": 4.977574773353732e-06, + "loss": 0.2665, + "mean_token_accuracy": 0.90388023853302, + "step": 2768 + }, + { + "epoch": 1.3845, + "grad_norm": 3.8552708363418122, + "learning_rate": 4.977516424055144e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.8489719033241272, + "step": 2769 + }, + { + "epoch": 1.385, + "grad_norm": 2.498444575937116, + "learning_rate": 4.977457999287091e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8786779642105103, + "step": 2770 + }, + { + "epoch": 1.3855, + "grad_norm": 2.6544107096484746, + "learning_rate": 4.977399499051351e-06, + "loss": 0.305, + "mean_token_accuracy": 0.9044662714004517, + "step": 2771 + }, + { + "epoch": 1.3860000000000001, + "grad_norm": 4.186656197071646, + "learning_rate": 4.977340923349707e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.8904109597206116, + "step": 2772 + }, + { + "epoch": 1.3865, + "grad_norm": 4.698493591704348, + "learning_rate": 4.977282272183944e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8684007525444031, + "step": 2773 + }, + { + "epoch": 1.387, + "grad_norm": 2.499227887634687, + "learning_rate": 4.977223545555847e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8806399703025818, + "step": 2774 + }, + { + "epoch": 1.3875, + "grad_norm": 2.2056052232868097, + "learning_rate": 4.977164743467206e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.873136579990387, + "step": 2775 + }, + { + "epoch": 1.388, + "grad_norm": 2.7533442368313703, + "learning_rate": 4.9771058659198115e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8531660437583923, + "step": 2776 + }, + { + "epoch": 1.3885, + "grad_norm": 1.9096569560792986, + "learning_rate": 4.977046912915459e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8875479698181152, + "step": 2777 + }, + { + "epoch": 1.389, + "grad_norm": 2.215172270553074, + "learning_rate": 4.9769878844559405e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8583892583847046, + "step": 2778 + }, + { + "epoch": 1.3895, + "grad_norm": 2.148886203289574, + "learning_rate": 4.976928780543058e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.9006690979003906, + "step": 2779 + }, + { + "epoch": 1.3900000000000001, + "grad_norm": 3.1273174928693015, + "learning_rate": 4.9768696011786095e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8971717953681946, + "step": 2780 + }, + { + "epoch": 1.3905, + "grad_norm": 4.304850385771484, + "learning_rate": 4.976810346364398e-06, + "loss": 0.2107, + "mean_token_accuracy": 0.9270855784416199, + "step": 2781 + }, + { + "epoch": 1.391, + "grad_norm": 2.5577435464251517, + "learning_rate": 4.976751016102231e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8859447240829468, + "step": 2782 + }, + { + "epoch": 1.3915, + "grad_norm": 4.9318052860232235, + "learning_rate": 4.976691610393912e-06, + "loss": 0.2903, + "mean_token_accuracy": 0.9073885679244995, + "step": 2783 + }, + { + "epoch": 1.392, + "grad_norm": 5.605727630573726, + "learning_rate": 4.976632129241253e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8636690378189087, + "step": 2784 + }, + { + "epoch": 1.3925, + "grad_norm": 2.9957616063806607, + "learning_rate": 4.976572572646064e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8999599814414978, + "step": 2785 + }, + { + "epoch": 1.393, + "grad_norm": 5.148897708123203, + "learning_rate": 4.976512940610162e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8575224280357361, + "step": 2786 + }, + { + "epoch": 1.3935, + "grad_norm": 2.44705910170929, + "learning_rate": 4.976453233135362e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.88685142993927, + "step": 2787 + }, + { + "epoch": 1.3940000000000001, + "grad_norm": 2.9770677622496735, + "learning_rate": 4.976393450223482e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8761593103408813, + "step": 2788 + }, + { + "epoch": 1.3945, + "grad_norm": 2.4418192342147784, + "learning_rate": 4.976333591876345e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8929917216300964, + "step": 2789 + }, + { + "epoch": 1.395, + "grad_norm": 2.4501827698134737, + "learning_rate": 4.976273658095772e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8701492547988892, + "step": 2790 + }, + { + "epoch": 1.3955, + "grad_norm": 2.044567050335, + "learning_rate": 4.976213648883591e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8824347257614136, + "step": 2791 + }, + { + "epoch": 1.396, + "grad_norm": 2.8942459980164377, + "learning_rate": 4.9761535642416284e-06, + "loss": 0.2294, + "mean_token_accuracy": 0.925848126411438, + "step": 2792 + }, + { + "epoch": 1.3965, + "grad_norm": 3.719654385257623, + "learning_rate": 4.9760934041717155e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8883712887763977, + "step": 2793 + }, + { + "epoch": 1.397, + "grad_norm": 2.895949654022317, + "learning_rate": 4.976033168675684e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8838205933570862, + "step": 2794 + }, + { + "epoch": 1.3975, + "grad_norm": 2.621350363528576, + "learning_rate": 4.975972857755369e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8788609504699707, + "step": 2795 + }, + { + "epoch": 1.3980000000000001, + "grad_norm": 3.3655493311219034, + "learning_rate": 4.975912471412608e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.863326907157898, + "step": 2796 + }, + { + "epoch": 1.3985, + "grad_norm": 3.7558116559320713, + "learning_rate": 4.9758520096492405e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8720888495445251, + "step": 2797 + }, + { + "epoch": 1.399, + "grad_norm": 2.0089089732338574, + "learning_rate": 4.975791472467108e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8634012937545776, + "step": 2798 + }, + { + "epoch": 1.3995, + "grad_norm": 2.1045498002988006, + "learning_rate": 4.9757308598680545e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8891364932060242, + "step": 2799 + }, + { + "epoch": 1.4, + "grad_norm": 4.882438848794339, + "learning_rate": 4.975670171853926e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8787444233894348, + "step": 2800 + }, + { + "epoch": 1.4005, + "grad_norm": 3.6749324369768868, + "learning_rate": 4.975609408426573e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8793670535087585, + "step": 2801 + }, + { + "epoch": 1.401, + "grad_norm": 4.741691907085196, + "learning_rate": 4.975548569587844e-06, + "loss": 0.2475, + "mean_token_accuracy": 0.916558027267456, + "step": 2802 + }, + { + "epoch": 1.4015, + "grad_norm": 4.197077932553666, + "learning_rate": 4.975487655339594e-06, + "loss": 0.291, + "mean_token_accuracy": 0.8907103538513184, + "step": 2803 + }, + { + "epoch": 1.4020000000000001, + "grad_norm": 19.192387064004077, + "learning_rate": 4.975426665683678e-06, + "loss": 0.5164, + "mean_token_accuracy": 0.8242841362953186, + "step": 2804 + }, + { + "epoch": 1.4025, + "grad_norm": 1.9417958953902963, + "learning_rate": 4.975365600621953e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8827394843101501, + "step": 2805 + }, + { + "epoch": 1.403, + "grad_norm": 2.171481951218218, + "learning_rate": 4.97530446015628e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8591810464859009, + "step": 2806 + }, + { + "epoch": 1.4035, + "grad_norm": 2.470114373082044, + "learning_rate": 4.975243244288523e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8749303221702576, + "step": 2807 + }, + { + "epoch": 1.404, + "grad_norm": 2.6850029153188864, + "learning_rate": 4.975181953020544e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8635746836662292, + "step": 2808 + }, + { + "epoch": 1.4045, + "grad_norm": 2.607198244117409, + "learning_rate": 4.975120586354212e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8755712509155273, + "step": 2809 + }, + { + "epoch": 1.405, + "grad_norm": 2.5902995493132313, + "learning_rate": 4.975059144291395e-06, + "loss": 0.328, + "mean_token_accuracy": 0.898592472076416, + "step": 2810 + }, + { + "epoch": 1.4055, + "grad_norm": 2.6403118564000927, + "learning_rate": 4.974997626833964e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.8981861472129822, + "step": 2811 + }, + { + "epoch": 1.4060000000000001, + "grad_norm": 6.122662246203958, + "learning_rate": 4.974936033983795e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8814965486526489, + "step": 2812 + }, + { + "epoch": 1.4064999999999999, + "grad_norm": 3.1302917603263793, + "learning_rate": 4.974874365742764e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8855130672454834, + "step": 2813 + }, + { + "epoch": 1.407, + "grad_norm": 2.9785547212144072, + "learning_rate": 4.974812622112748e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.8515754342079163, + "step": 2814 + }, + { + "epoch": 1.4075, + "grad_norm": 2.683341603573198, + "learning_rate": 4.974750803095629e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8514408469200134, + "step": 2815 + }, + { + "epoch": 1.408, + "grad_norm": 1.4900255604080654, + "learning_rate": 4.97468890869329e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8788748383522034, + "step": 2816 + }, + { + "epoch": 1.4085, + "grad_norm": 1.8356560567106142, + "learning_rate": 4.974626938907616e-06, + "loss": 0.2864, + "mean_token_accuracy": 0.8999457955360413, + "step": 2817 + }, + { + "epoch": 1.409, + "grad_norm": 1.6427301114086335, + "learning_rate": 4.974564893740494e-06, + "loss": 0.2306, + "mean_token_accuracy": 0.9142687916755676, + "step": 2818 + }, + { + "epoch": 1.4095, + "grad_norm": 2.6504768449911236, + "learning_rate": 4.974502773193816e-06, + "loss": 0.323, + "mean_token_accuracy": 0.894183874130249, + "step": 2819 + }, + { + "epoch": 1.41, + "grad_norm": 2.0874135569247283, + "learning_rate": 4.974440577269473e-06, + "loss": 0.2889, + "mean_token_accuracy": 0.9129626750946045, + "step": 2820 + }, + { + "epoch": 1.4104999999999999, + "grad_norm": 2.6008590357657684, + "learning_rate": 4.9743783059693595e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8731924295425415, + "step": 2821 + }, + { + "epoch": 1.411, + "grad_norm": 1.8875621012855532, + "learning_rate": 4.974315959295373e-06, + "loss": 0.315, + "mean_token_accuracy": 0.8893933892250061, + "step": 2822 + }, + { + "epoch": 1.4115, + "grad_norm": 2.841242746615867, + "learning_rate": 4.974253537249412e-06, + "loss": 0.5297, + "mean_token_accuracy": 0.8388499021530151, + "step": 2823 + }, + { + "epoch": 1.412, + "grad_norm": 2.9619319416120833, + "learning_rate": 4.974191039833378e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8756608366966248, + "step": 2824 + }, + { + "epoch": 1.4125, + "grad_norm": 2.103483801899893, + "learning_rate": 4.974128467049177e-06, + "loss": 0.2806, + "mean_token_accuracy": 0.9086765646934509, + "step": 2825 + }, + { + "epoch": 1.413, + "grad_norm": 6.979758237340035, + "learning_rate": 4.97406581889871e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8728565573692322, + "step": 2826 + }, + { + "epoch": 1.4135, + "grad_norm": 2.193570923378297, + "learning_rate": 4.9740030953838915e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8927099704742432, + "step": 2827 + }, + { + "epoch": 1.414, + "grad_norm": 5.663147408181292, + "learning_rate": 4.973940296506628e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.880989670753479, + "step": 2828 + }, + { + "epoch": 1.4144999999999999, + "grad_norm": 2.902316552815866, + "learning_rate": 4.973877422268833e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.9005513787269592, + "step": 2829 + }, + { + "epoch": 1.415, + "grad_norm": 1.912078705445706, + "learning_rate": 4.973814472672424e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8940113186836243, + "step": 2830 + }, + { + "epoch": 1.4155, + "grad_norm": 2.8918043474963064, + "learning_rate": 4.973751447719316e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8834951519966125, + "step": 2831 + }, + { + "epoch": 1.416, + "grad_norm": 2.072638301343098, + "learning_rate": 4.973688347411431e-06, + "loss": 0.2716, + "mean_token_accuracy": 0.9020183682441711, + "step": 2832 + }, + { + "epoch": 1.4165, + "grad_norm": 2.180845747537763, + "learning_rate": 4.973625171750689e-06, + "loss": 0.2412, + "mean_token_accuracy": 0.9194480776786804, + "step": 2833 + }, + { + "epoch": 1.417, + "grad_norm": 2.0120604861807867, + "learning_rate": 4.973561920739016e-06, + "loss": 0.2766, + "mean_token_accuracy": 0.907737135887146, + "step": 2834 + }, + { + "epoch": 1.4175, + "grad_norm": 8.307035555992057, + "learning_rate": 4.973498594378338e-06, + "loss": 0.299, + "mean_token_accuracy": 0.8993784189224243, + "step": 2835 + }, + { + "epoch": 1.418, + "grad_norm": 2.5762169168467204, + "learning_rate": 4.9734351926705836e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8518677949905396, + "step": 2836 + }, + { + "epoch": 1.4184999999999999, + "grad_norm": 2.6672494219848204, + "learning_rate": 4.973371715617685e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8505260348320007, + "step": 2837 + }, + { + "epoch": 1.419, + "grad_norm": 2.286996340830546, + "learning_rate": 4.9733081632215766e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8715259432792664, + "step": 2838 + }, + { + "epoch": 1.4195, + "grad_norm": 4.149057921671696, + "learning_rate": 4.9732445354841915e-06, + "loss": 0.2907, + "mean_token_accuracy": 0.9026851058006287, + "step": 2839 + }, + { + "epoch": 1.42, + "grad_norm": 2.1904026169174964, + "learning_rate": 4.973180832407471e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8915820717811584, + "step": 2840 + }, + { + "epoch": 1.4205, + "grad_norm": 2.516178468439778, + "learning_rate": 4.973117053993354e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8800750374794006, + "step": 2841 + }, + { + "epoch": 1.421, + "grad_norm": 3.510137010162871, + "learning_rate": 4.973053200243784e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8959464430809021, + "step": 2842 + }, + { + "epoch": 1.4215, + "grad_norm": 2.7983413042286323, + "learning_rate": 4.972989271160705e-06, + "loss": 0.3044, + "mean_token_accuracy": 0.9041833281517029, + "step": 2843 + }, + { + "epoch": 1.422, + "grad_norm": 4.276547261954323, + "learning_rate": 4.972925266746066e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8821747303009033, + "step": 2844 + }, + { + "epoch": 1.4224999999999999, + "grad_norm": 4.839294010409953, + "learning_rate": 4.972861187001815e-06, + "loss": 0.2929, + "mean_token_accuracy": 0.9071527719497681, + "step": 2845 + }, + { + "epoch": 1.423, + "grad_norm": 2.6686274269932158, + "learning_rate": 4.972797031929905e-06, + "loss": 0.3013, + "mean_token_accuracy": 0.9017841219902039, + "step": 2846 + }, + { + "epoch": 1.4235, + "grad_norm": 1.9177680604742717, + "learning_rate": 4.9727328015322905e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8728777170181274, + "step": 2847 + }, + { + "epoch": 1.424, + "grad_norm": 2.44375327271069, + "learning_rate": 4.972668495810927e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8774246573448181, + "step": 2848 + }, + { + "epoch": 1.4245, + "grad_norm": 2.647740446012862, + "learning_rate": 4.972604114767774e-06, + "loss": 0.2292, + "mean_token_accuracy": 0.924673318862915, + "step": 2849 + }, + { + "epoch": 1.425, + "grad_norm": 2.4855889939302864, + "learning_rate": 4.972539658404793e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8656771183013916, + "step": 2850 + }, + { + "epoch": 1.4255, + "grad_norm": 2.8620548096706426, + "learning_rate": 4.972475126723946e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.9027258157730103, + "step": 2851 + }, + { + "epoch": 1.426, + "grad_norm": 3.5326764004893723, + "learning_rate": 4.972410519727201e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8787413835525513, + "step": 2852 + }, + { + "epoch": 1.4264999999999999, + "grad_norm": 2.4069969852683553, + "learning_rate": 4.972345837416524e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8481836915016174, + "step": 2853 + }, + { + "epoch": 1.427, + "grad_norm": 2.5647365695883675, + "learning_rate": 4.972281079793887e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.868789792060852, + "step": 2854 + }, + { + "epoch": 1.4275, + "grad_norm": 3.924922813032909, + "learning_rate": 4.9722162468612625e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8935900330543518, + "step": 2855 + }, + { + "epoch": 1.428, + "grad_norm": 2.246152149767648, + "learning_rate": 4.9721513386206235e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8713386058807373, + "step": 2856 + }, + { + "epoch": 1.4285, + "grad_norm": 3.0392708875330747, + "learning_rate": 4.9720863550739485e-06, + "loss": 0.2944, + "mean_token_accuracy": 0.9116643071174622, + "step": 2857 + }, + { + "epoch": 1.429, + "grad_norm": 2.2068211722615385, + "learning_rate": 4.972021296223217e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8747531771659851, + "step": 2858 + }, + { + "epoch": 1.4295, + "grad_norm": 2.0181990626176036, + "learning_rate": 4.971956162070411e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8759677410125732, + "step": 2859 + }, + { + "epoch": 1.43, + "grad_norm": 39.67048093046301, + "learning_rate": 4.971890952617515e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.883536159992218, + "step": 2860 + }, + { + "epoch": 1.4304999999999999, + "grad_norm": 4.334993404133135, + "learning_rate": 4.971825667866514e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8666216731071472, + "step": 2861 + }, + { + "epoch": 1.431, + "grad_norm": 5.541510450476147, + "learning_rate": 4.971760307819398e-06, + "loss": 0.5939, + "mean_token_accuracy": 0.8180537819862366, + "step": 2862 + }, + { + "epoch": 1.4315, + "grad_norm": 7.8162925532094345, + "learning_rate": 4.971694872478158e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8817912936210632, + "step": 2863 + }, + { + "epoch": 1.432, + "grad_norm": 5.402787180635673, + "learning_rate": 4.971629361844785e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8729792237281799, + "step": 2864 + }, + { + "epoch": 1.4325, + "grad_norm": 2.5314607452756284, + "learning_rate": 4.9715637759212775e-06, + "loss": 0.5489, + "mean_token_accuracy": 0.8383092284202576, + "step": 2865 + }, + { + "epoch": 1.433, + "grad_norm": 1.6172333451567107, + "learning_rate": 4.971498114709632e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8832576274871826, + "step": 2866 + }, + { + "epoch": 1.4335, + "grad_norm": 2.0524007351332147, + "learning_rate": 4.971432378211849e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8920271992683411, + "step": 2867 + }, + { + "epoch": 1.434, + "grad_norm": 2.110287620709249, + "learning_rate": 4.971366566429931e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.895639181137085, + "step": 2868 + }, + { + "epoch": 1.4344999999999999, + "grad_norm": 2.3633648844665323, + "learning_rate": 4.9713006793658816e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8517278432846069, + "step": 2869 + }, + { + "epoch": 1.435, + "grad_norm": 4.158352676577829, + "learning_rate": 4.971234717021709e-06, + "loss": 0.2947, + "mean_token_accuracy": 0.9015557169914246, + "step": 2870 + }, + { + "epoch": 1.4355, + "grad_norm": 1.9806843692884486, + "learning_rate": 4.971168679399423e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8678541779518127, + "step": 2871 + }, + { + "epoch": 1.436, + "grad_norm": 6.165958080229843, + "learning_rate": 4.9711025665010335e-06, + "loss": 0.2905, + "mean_token_accuracy": 0.893082857131958, + "step": 2872 + }, + { + "epoch": 1.4365, + "grad_norm": 3.734787824029433, + "learning_rate": 4.971036378328556e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8800548315048218, + "step": 2873 + }, + { + "epoch": 1.437, + "grad_norm": 3.6572019334157573, + "learning_rate": 4.970970114884006e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8748683929443359, + "step": 2874 + }, + { + "epoch": 1.4375, + "grad_norm": 2.262352659765153, + "learning_rate": 4.970903776169403e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8866305947303772, + "step": 2875 + }, + { + "epoch": 1.438, + "grad_norm": 4.730023146960895, + "learning_rate": 4.970837362186765e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8971332311630249, + "step": 2876 + }, + { + "epoch": 1.4385, + "grad_norm": 3.8734382789504296, + "learning_rate": 4.970770872938118e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.877307116985321, + "step": 2877 + }, + { + "epoch": 1.439, + "grad_norm": 9.009248290748788, + "learning_rate": 4.970704308425487e-06, + "loss": 0.312, + "mean_token_accuracy": 0.9059266448020935, + "step": 2878 + }, + { + "epoch": 1.4395, + "grad_norm": 3.851592189799695, + "learning_rate": 4.970637668650898e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8730879426002502, + "step": 2879 + }, + { + "epoch": 1.44, + "grad_norm": 2.317226394570055, + "learning_rate": 4.970570953616383e-06, + "loss": 0.2598, + "mean_token_accuracy": 0.9066253304481506, + "step": 2880 + }, + { + "epoch": 1.4405000000000001, + "grad_norm": 3.1509490907936484, + "learning_rate": 4.970504163323972e-06, + "loss": 0.494, + "mean_token_accuracy": 0.8522847890853882, + "step": 2881 + }, + { + "epoch": 1.441, + "grad_norm": 2.882369611778011, + "learning_rate": 4.970437297775702e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8986157774925232, + "step": 2882 + }, + { + "epoch": 1.4415, + "grad_norm": 2.3607630072205663, + "learning_rate": 4.970370356973608e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8725987672805786, + "step": 2883 + }, + { + "epoch": 1.442, + "grad_norm": 3.709444851704403, + "learning_rate": 4.97030334091973e-06, + "loss": 0.2063, + "mean_token_accuracy": 0.9189849495887756, + "step": 2884 + }, + { + "epoch": 1.4425, + "grad_norm": 1.9763761350915534, + "learning_rate": 4.970236249616109e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8743529319763184, + "step": 2885 + }, + { + "epoch": 1.443, + "grad_norm": 2.2365550824614293, + "learning_rate": 4.970169083064789e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8888660669326782, + "step": 2886 + }, + { + "epoch": 1.4435, + "grad_norm": 2.062122994098253, + "learning_rate": 4.970101841267816e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8769098520278931, + "step": 2887 + }, + { + "epoch": 1.444, + "grad_norm": 15.258243623233115, + "learning_rate": 4.970034524227239e-06, + "loss": 0.296, + "mean_token_accuracy": 0.8951964974403381, + "step": 2888 + }, + { + "epoch": 1.4445000000000001, + "grad_norm": 2.7267610129108553, + "learning_rate": 4.969967131945107e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8544842004776001, + "step": 2889 + }, + { + "epoch": 1.445, + "grad_norm": 3.050187468731189, + "learning_rate": 4.969899664423473e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8957800269126892, + "step": 2890 + }, + { + "epoch": 1.4455, + "grad_norm": 5.630715458789413, + "learning_rate": 4.969832121664394e-06, + "loss": 0.3011, + "mean_token_accuracy": 0.908877432346344, + "step": 2891 + }, + { + "epoch": 1.446, + "grad_norm": 2.274646757106153, + "learning_rate": 4.969764503669926e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8980439901351929, + "step": 2892 + }, + { + "epoch": 1.4465, + "grad_norm": 2.153273166290739, + "learning_rate": 4.969696810442129e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.88749760389328, + "step": 2893 + }, + { + "epoch": 1.447, + "grad_norm": 3.66566769359458, + "learning_rate": 4.969629041983065e-06, + "loss": 0.2192, + "mean_token_accuracy": 0.9214620590209961, + "step": 2894 + }, + { + "epoch": 1.4475, + "grad_norm": 2.1978033960164876, + "learning_rate": 4.9695611982947995e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.874019980430603, + "step": 2895 + }, + { + "epoch": 1.448, + "grad_norm": 2.0317945522669913, + "learning_rate": 4.969493279379397e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8916994333267212, + "step": 2896 + }, + { + "epoch": 1.4485000000000001, + "grad_norm": 2.3202539636180655, + "learning_rate": 4.969425285238929e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8810506463050842, + "step": 2897 + }, + { + "epoch": 1.449, + "grad_norm": 4.517220672521521, + "learning_rate": 4.969357215875464e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8640884160995483, + "step": 2898 + }, + { + "epoch": 1.4495, + "grad_norm": 3.2990250421433323, + "learning_rate": 4.969289071291078e-06, + "loss": 0.2556, + "mean_token_accuracy": 0.9155411720275879, + "step": 2899 + }, + { + "epoch": 1.45, + "grad_norm": 2.0499923902026516, + "learning_rate": 4.9692208514878445e-06, + "loss": 0.2284, + "mean_token_accuracy": 0.9186083078384399, + "step": 2900 + }, + { + "epoch": 1.4505, + "grad_norm": 2.5518790314214073, + "learning_rate": 4.9691525564678435e-06, + "loss": 0.2941, + "mean_token_accuracy": 0.909045398235321, + "step": 2901 + }, + { + "epoch": 1.451, + "grad_norm": 2.3824386349496995, + "learning_rate": 4.969084186233154e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8863726258277893, + "step": 2902 + }, + { + "epoch": 1.4515, + "grad_norm": 20.39238163757976, + "learning_rate": 4.9690157407858595e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8774558901786804, + "step": 2903 + }, + { + "epoch": 1.452, + "grad_norm": 2.6241731552680725, + "learning_rate": 4.968947220128046e-06, + "loss": 0.3059, + "mean_token_accuracy": 0.9000488519668579, + "step": 2904 + }, + { + "epoch": 1.4525000000000001, + "grad_norm": 2.925517487587371, + "learning_rate": 4.968878624261798e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.8965399265289307, + "step": 2905 + }, + { + "epoch": 1.453, + "grad_norm": 21.724409777115426, + "learning_rate": 4.968809953189207e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8990963697433472, + "step": 2906 + }, + { + "epoch": 1.4535, + "grad_norm": 2.2379220307476246, + "learning_rate": 4.968741206912364e-06, + "loss": 0.2813, + "mean_token_accuracy": 0.9073523283004761, + "step": 2907 + }, + { + "epoch": 1.454, + "grad_norm": 7.202233894210642, + "learning_rate": 4.968672385433364e-06, + "loss": 0.2925, + "mean_token_accuracy": 0.9038975834846497, + "step": 2908 + }, + { + "epoch": 1.4545, + "grad_norm": 2.4271237583237544, + "learning_rate": 4.9686034887543024e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8662031888961792, + "step": 2909 + }, + { + "epoch": 1.455, + "grad_norm": 2.4304438146646774, + "learning_rate": 4.968534516877279e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8821945190429688, + "step": 2910 + }, + { + "epoch": 1.4555, + "grad_norm": 2.6123374142714306, + "learning_rate": 4.968465469804394e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8529613018035889, + "step": 2911 + }, + { + "epoch": 1.456, + "grad_norm": 2.468928224645289, + "learning_rate": 4.968396347537751e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.886914074420929, + "step": 2912 + }, + { + "epoch": 1.4565000000000001, + "grad_norm": 2.976047544488344, + "learning_rate": 4.968327150079456e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8681612610816956, + "step": 2913 + }, + { + "epoch": 1.457, + "grad_norm": 1.8468647324575909, + "learning_rate": 4.968257877431616e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8901032209396362, + "step": 2914 + }, + { + "epoch": 1.4575, + "grad_norm": 2.8072924675652167, + "learning_rate": 4.968188529596342e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.889160692691803, + "step": 2915 + }, + { + "epoch": 1.458, + "grad_norm": 7.579966895710864, + "learning_rate": 4.968119106575746e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8856841921806335, + "step": 2916 + }, + { + "epoch": 1.4585, + "grad_norm": 2.6359743663200192, + "learning_rate": 4.968049608371942e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8955295085906982, + "step": 2917 + }, + { + "epoch": 1.459, + "grad_norm": 2.2446464883873567, + "learning_rate": 4.967980034987048e-06, + "loss": 0.2723, + "mean_token_accuracy": 0.9035578966140747, + "step": 2918 + }, + { + "epoch": 1.4595, + "grad_norm": 2.4788356848348463, + "learning_rate": 4.967910386423183e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8774822950363159, + "step": 2919 + }, + { + "epoch": 1.46, + "grad_norm": 3.510977813552299, + "learning_rate": 4.96784066268247e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8571825623512268, + "step": 2920 + }, + { + "epoch": 1.4605000000000001, + "grad_norm": 8.171920043849067, + "learning_rate": 4.9677708637670315e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8750196695327759, + "step": 2921 + }, + { + "epoch": 1.461, + "grad_norm": 177.05563138960707, + "learning_rate": 4.967700989678993e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8705676794052124, + "step": 2922 + }, + { + "epoch": 1.4615, + "grad_norm": 3.9606355225801737, + "learning_rate": 4.9676310404204846e-06, + "loss": 0.2937, + "mean_token_accuracy": 0.8966556787490845, + "step": 2923 + }, + { + "epoch": 1.462, + "grad_norm": 2.569328718877818, + "learning_rate": 4.967561015993635e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8619776964187622, + "step": 2924 + }, + { + "epoch": 1.4625, + "grad_norm": 2.8665928600853103, + "learning_rate": 4.9674909164005805e-06, + "loss": 0.3023, + "mean_token_accuracy": 0.8983410000801086, + "step": 2925 + }, + { + "epoch": 1.463, + "grad_norm": 2.7281639989696527, + "learning_rate": 4.9674207416434535e-06, + "loss": 0.5194, + "mean_token_accuracy": 0.8413867354393005, + "step": 2926 + }, + { + "epoch": 1.4635, + "grad_norm": 2.9574134652160673, + "learning_rate": 4.967350491724393e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8821844458580017, + "step": 2927 + }, + { + "epoch": 1.464, + "grad_norm": 2.019597942085895, + "learning_rate": 4.967280166645538e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8868412375450134, + "step": 2928 + }, + { + "epoch": 1.4645000000000001, + "grad_norm": 1.9442736373161198, + "learning_rate": 4.967209766409032e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8694099187850952, + "step": 2929 + }, + { + "epoch": 1.465, + "grad_norm": 2.0595576338456127, + "learning_rate": 4.967139291017018e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8862393498420715, + "step": 2930 + }, + { + "epoch": 1.4655, + "grad_norm": 2.3364062261129703, + "learning_rate": 4.967068740471645e-06, + "loss": 0.3174, + "mean_token_accuracy": 0.8989251255989075, + "step": 2931 + }, + { + "epoch": 1.466, + "grad_norm": 1.8945415516842474, + "learning_rate": 4.96699811477506e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.888676643371582, + "step": 2932 + }, + { + "epoch": 1.4665, + "grad_norm": 11.378194049683067, + "learning_rate": 4.9669274139294154e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8756824731826782, + "step": 2933 + }, + { + "epoch": 1.467, + "grad_norm": 2.2664638097347094, + "learning_rate": 4.966856637936864e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8654007911682129, + "step": 2934 + }, + { + "epoch": 1.4675, + "grad_norm": 2.0881183142239315, + "learning_rate": 4.966785786799564e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8820604085922241, + "step": 2935 + }, + { + "epoch": 1.468, + "grad_norm": 3.144028826407174, + "learning_rate": 4.96671486051967e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8855703473091125, + "step": 2936 + }, + { + "epoch": 1.4685000000000001, + "grad_norm": 3.555944606656122, + "learning_rate": 4.966643859099346e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.8874907493591309, + "step": 2937 + }, + { + "epoch": 1.4689999999999999, + "grad_norm": 2.8973193291391763, + "learning_rate": 4.966572782540753e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.884509801864624, + "step": 2938 + }, + { + "epoch": 1.4695, + "grad_norm": 1.9519938881799148, + "learning_rate": 4.966501630846057e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8707329034805298, + "step": 2939 + }, + { + "epoch": 1.47, + "grad_norm": 2.3697199719456, + "learning_rate": 4.966430404017424e-06, + "loss": 0.1744, + "mean_token_accuracy": 0.9365194439888, + "step": 2940 + }, + { + "epoch": 1.4705, + "grad_norm": 2.6097061060353024, + "learning_rate": 4.966359102057025e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8743160963058472, + "step": 2941 + }, + { + "epoch": 1.471, + "grad_norm": 2.659399467524392, + "learning_rate": 4.966287724967031e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8767550587654114, + "step": 2942 + }, + { + "epoch": 1.4715, + "grad_norm": 4.579368763143208, + "learning_rate": 4.966216272749618e-06, + "loss": 0.2793, + "mean_token_accuracy": 0.9073998332023621, + "step": 2943 + }, + { + "epoch": 1.472, + "grad_norm": 2.71537378953539, + "learning_rate": 4.966144745406961e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8849160075187683, + "step": 2944 + }, + { + "epoch": 1.4725, + "grad_norm": 2.6645296307913435, + "learning_rate": 4.966073142941239e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8921002149581909, + "step": 2945 + }, + { + "epoch": 1.4729999999999999, + "grad_norm": 2.9021241099401522, + "learning_rate": 4.966001465354634e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8869591355323792, + "step": 2946 + }, + { + "epoch": 1.4735, + "grad_norm": 2.0516170260058915, + "learning_rate": 4.965929712649327e-06, + "loss": 0.254, + "mean_token_accuracy": 0.9120346903800964, + "step": 2947 + }, + { + "epoch": 1.474, + "grad_norm": 4.390554879979782, + "learning_rate": 4.965857884827508e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.9092960953712463, + "step": 2948 + }, + { + "epoch": 1.4745, + "grad_norm": 3.9302568629655195, + "learning_rate": 4.965785981891361e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.9055802822113037, + "step": 2949 + }, + { + "epoch": 1.475, + "grad_norm": 57.00245546263812, + "learning_rate": 4.965714003843079e-06, + "loss": 0.2804, + "mean_token_accuracy": 0.905343770980835, + "step": 2950 + }, + { + "epoch": 1.4755, + "grad_norm": 2.501431157909815, + "learning_rate": 4.965641950684853e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8824581503868103, + "step": 2951 + }, + { + "epoch": 1.476, + "grad_norm": 4.059234041842537, + "learning_rate": 4.965569822418878e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.884878396987915, + "step": 2952 + }, + { + "epoch": 1.4765, + "grad_norm": 2.5449915880679703, + "learning_rate": 4.965497619047352e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8912426829338074, + "step": 2953 + }, + { + "epoch": 1.4769999999999999, + "grad_norm": 2.7073500340132304, + "learning_rate": 4.965425340572473e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8525688648223877, + "step": 2954 + }, + { + "epoch": 1.4775, + "grad_norm": 2.353172243031324, + "learning_rate": 4.965352986996443e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.866291880607605, + "step": 2955 + }, + { + "epoch": 1.478, + "grad_norm": 2.467109332062177, + "learning_rate": 4.965280558321468e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8680127263069153, + "step": 2956 + }, + { + "epoch": 1.4785, + "grad_norm": 5.658707919434567, + "learning_rate": 4.9652080545497525e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8791208863258362, + "step": 2957 + }, + { + "epoch": 1.479, + "grad_norm": 1.8375344179123836, + "learning_rate": 4.965135475683506e-06, + "loss": 0.273, + "mean_token_accuracy": 0.9140221476554871, + "step": 2958 + }, + { + "epoch": 1.4795, + "grad_norm": 2.1060440401357807, + "learning_rate": 4.965062821724937e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8789798617362976, + "step": 2959 + }, + { + "epoch": 1.48, + "grad_norm": 2.676346237009173, + "learning_rate": 4.964990092676263e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8732442855834961, + "step": 2960 + }, + { + "epoch": 1.4805, + "grad_norm": 8.295691632023443, + "learning_rate": 4.964917288539696e-06, + "loss": 0.2992, + "mean_token_accuracy": 0.9008764624595642, + "step": 2961 + }, + { + "epoch": 1.4809999999999999, + "grad_norm": 2.482391104772021, + "learning_rate": 4.964844409317454e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8624155521392822, + "step": 2962 + }, + { + "epoch": 1.4815, + "grad_norm": 2.3590066463015447, + "learning_rate": 4.964771455011759e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8650424480438232, + "step": 2963 + }, + { + "epoch": 1.482, + "grad_norm": 2.8835819560071245, + "learning_rate": 4.9646984256248306e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8915315270423889, + "step": 2964 + }, + { + "epoch": 1.4825, + "grad_norm": 3.502425003797088, + "learning_rate": 4.964625321158897e-06, + "loss": 0.3006, + "mean_token_accuracy": 0.8999263048171997, + "step": 2965 + }, + { + "epoch": 1.483, + "grad_norm": 1.9639023561583746, + "learning_rate": 4.9645521416161815e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8802940845489502, + "step": 2966 + }, + { + "epoch": 1.4835, + "grad_norm": 2.8270276437066917, + "learning_rate": 4.964478886998915e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8660851716995239, + "step": 2967 + }, + { + "epoch": 1.484, + "grad_norm": 6.824281247138264, + "learning_rate": 4.964405557309329e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8752490878105164, + "step": 2968 + }, + { + "epoch": 1.4845, + "grad_norm": 5.282656006592904, + "learning_rate": 4.964332152549656e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8821119666099548, + "step": 2969 + }, + { + "epoch": 1.4849999999999999, + "grad_norm": 3.3129827396474987, + "learning_rate": 4.964258672722135e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8712724447250366, + "step": 2970 + }, + { + "epoch": 1.4855, + "grad_norm": 2.590871761182563, + "learning_rate": 4.964185117829e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8974742889404297, + "step": 2971 + }, + { + "epoch": 1.486, + "grad_norm": 4.010077155128602, + "learning_rate": 4.964111487872496e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8726708292961121, + "step": 2972 + }, + { + "epoch": 1.4865, + "grad_norm": 5.420614317081186, + "learning_rate": 4.964037782854862e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8605024218559265, + "step": 2973 + }, + { + "epoch": 1.487, + "grad_norm": 10.473431225633018, + "learning_rate": 4.963964002778346e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.894875705242157, + "step": 2974 + }, + { + "epoch": 1.4875, + "grad_norm": 2.4661318011950177, + "learning_rate": 4.963890147645195e-06, + "loss": 0.6947, + "mean_token_accuracy": 0.7946251630783081, + "step": 2975 + }, + { + "epoch": 1.488, + "grad_norm": 2.3706414422084925, + "learning_rate": 4.9638162174576575e-06, + "loss": 0.2876, + "mean_token_accuracy": 0.9093680381774902, + "step": 2976 + }, + { + "epoch": 1.4885, + "grad_norm": 2.5236274861503314, + "learning_rate": 4.963742212217986e-06, + "loss": 0.3044, + "mean_token_accuracy": 0.8963197469711304, + "step": 2977 + }, + { + "epoch": 1.4889999999999999, + "grad_norm": 3.577660785811546, + "learning_rate": 4.963668131928436e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8670368194580078, + "step": 2978 + }, + { + "epoch": 1.4895, + "grad_norm": 2.1818196348370362, + "learning_rate": 4.963593976591262e-06, + "loss": 0.2741, + "mean_token_accuracy": 0.9076326489448547, + "step": 2979 + }, + { + "epoch": 1.49, + "grad_norm": 2.374045777450861, + "learning_rate": 4.963519746208726e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8727654814720154, + "step": 2980 + }, + { + "epoch": 1.4905, + "grad_norm": 3.207382735107286, + "learning_rate": 4.963445440783086e-06, + "loss": 0.2318, + "mean_token_accuracy": 0.9253061413764954, + "step": 2981 + }, + { + "epoch": 1.491, + "grad_norm": 1.9500486314204886, + "learning_rate": 4.963371060316608e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8651353120803833, + "step": 2982 + }, + { + "epoch": 1.4915, + "grad_norm": 4.328227659221448, + "learning_rate": 4.963296604811555e-06, + "loss": 0.2901, + "mean_token_accuracy": 0.8857460021972656, + "step": 2983 + }, + { + "epoch": 1.492, + "grad_norm": 3.639483019046161, + "learning_rate": 4.963222074270197e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8814235925674438, + "step": 2984 + }, + { + "epoch": 1.4925, + "grad_norm": 2.2206536799227554, + "learning_rate": 4.963147468694804e-06, + "loss": 0.3003, + "mean_token_accuracy": 0.9024296998977661, + "step": 2985 + }, + { + "epoch": 1.4929999999999999, + "grad_norm": 2.7642401694367953, + "learning_rate": 4.963072788087648e-06, + "loss": 0.5678, + "mean_token_accuracy": 0.8346249461174011, + "step": 2986 + }, + { + "epoch": 1.4935, + "grad_norm": 1.985763086152194, + "learning_rate": 4.9629980324510055e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8940491080284119, + "step": 2987 + }, + { + "epoch": 1.494, + "grad_norm": 5.011548455230968, + "learning_rate": 4.962923201787153e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8616881966590881, + "step": 2988 + }, + { + "epoch": 1.4945, + "grad_norm": 3.0569237142882466, + "learning_rate": 4.9628482960983685e-06, + "loss": 0.2946, + "mean_token_accuracy": 0.9061174988746643, + "step": 2989 + }, + { + "epoch": 1.495, + "grad_norm": 8.27897150783973, + "learning_rate": 4.962773315386935e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8655690550804138, + "step": 2990 + }, + { + "epoch": 1.4955, + "grad_norm": 1.9541457381881773, + "learning_rate": 4.9626982596551364e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8803139328956604, + "step": 2991 + }, + { + "epoch": 1.496, + "grad_norm": 3.4307183673463113, + "learning_rate": 4.9626231289052594e-06, + "loss": 0.2834, + "mean_token_accuracy": 0.9065553545951843, + "step": 2992 + }, + { + "epoch": 1.4965, + "grad_norm": 4.4232017784120226, + "learning_rate": 4.9625479231395925e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8780487775802612, + "step": 2993 + }, + { + "epoch": 1.4969999999999999, + "grad_norm": 3.4719158936835144, + "learning_rate": 4.962472642360426e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8796424865722656, + "step": 2994 + }, + { + "epoch": 1.4975, + "grad_norm": 233.47278778676747, + "learning_rate": 4.962397286570053e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8791934847831726, + "step": 2995 + }, + { + "epoch": 1.498, + "grad_norm": 4.595293467238486, + "learning_rate": 4.96232185577077e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8661125302314758, + "step": 2996 + }, + { + "epoch": 1.4985, + "grad_norm": 2.621787958989372, + "learning_rate": 4.962246349964875e-06, + "loss": 0.417, + "mean_token_accuracy": 0.864461362361908, + "step": 2997 + }, + { + "epoch": 1.499, + "grad_norm": 62.73949411559195, + "learning_rate": 4.962170769154665e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8745043873786926, + "step": 2998 + }, + { + "epoch": 1.4995, + "grad_norm": 2.6348244482002543, + "learning_rate": 4.962095113342446e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8505509495735168, + "step": 2999 + }, + { + "epoch": 1.5, + "grad_norm": 2.351960455419964, + "learning_rate": 4.962019382530521e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8890588879585266, + "step": 3000 + }, + { + "epoch": 1.5005, + "grad_norm": 2.0415906463884688, + "learning_rate": 4.9619435767211964e-06, + "loss": 0.3036, + "mean_token_accuracy": 0.9054336547851562, + "step": 3001 + }, + { + "epoch": 1.501, + "grad_norm": 4.069364897966555, + "learning_rate": 4.961867695916782e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.896334707736969, + "step": 3002 + }, + { + "epoch": 1.5015, + "grad_norm": 3.890350174599575, + "learning_rate": 4.961791740119591e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8522114157676697, + "step": 3003 + }, + { + "epoch": 1.502, + "grad_norm": 3.0396786575528174, + "learning_rate": 4.961715709331933e-06, + "loss": 0.2703, + "mean_token_accuracy": 0.9093277454376221, + "step": 3004 + }, + { + "epoch": 1.5025, + "grad_norm": 2.4712233401106656, + "learning_rate": 4.961639603556128e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8382062911987305, + "step": 3005 + }, + { + "epoch": 1.5030000000000001, + "grad_norm": 2.3656990993593694, + "learning_rate": 4.961563422794491e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8760910630226135, + "step": 3006 + }, + { + "epoch": 1.5034999999999998, + "grad_norm": 3.2616226977771525, + "learning_rate": 4.961487167049346e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8852818608283997, + "step": 3007 + }, + { + "epoch": 1.504, + "grad_norm": 3.007630154661029, + "learning_rate": 4.961410836323014e-06, + "loss": 0.564, + "mean_token_accuracy": 0.8357221484184265, + "step": 3008 + }, + { + "epoch": 1.5045, + "grad_norm": 10.17591486384331, + "learning_rate": 4.96133443061782e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.8987171053886414, + "step": 3009 + }, + { + "epoch": 1.505, + "grad_norm": 2.2520308223770154, + "learning_rate": 4.961257949936092e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8741872310638428, + "step": 3010 + }, + { + "epoch": 1.5055, + "grad_norm": 2.155433115307505, + "learning_rate": 4.96118139428016e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8783641457557678, + "step": 3011 + }, + { + "epoch": 1.506, + "grad_norm": 2.4975681199978443, + "learning_rate": 4.9611047636523545e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.8364887833595276, + "step": 3012 + }, + { + "epoch": 1.5065, + "grad_norm": 2.823269896188485, + "learning_rate": 4.961028058055012e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8895381689071655, + "step": 3013 + }, + { + "epoch": 1.5070000000000001, + "grad_norm": 2.8107184081439907, + "learning_rate": 4.9609512774904674e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8809685111045837, + "step": 3014 + }, + { + "epoch": 1.5074999999999998, + "grad_norm": 4.336582906267823, + "learning_rate": 4.96087442196106e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8590425252914429, + "step": 3015 + }, + { + "epoch": 1.508, + "grad_norm": 3.9126808124375643, + "learning_rate": 4.960797491469131e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.876663863658905, + "step": 3016 + }, + { + "epoch": 1.5085, + "grad_norm": 5.02140599578377, + "learning_rate": 4.960720486017025e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8544106483459473, + "step": 3017 + }, + { + "epoch": 1.509, + "grad_norm": 2.523281452754345, + "learning_rate": 4.9606434056070865e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8717038035392761, + "step": 3018 + }, + { + "epoch": 1.5095, + "grad_norm": 3.526241602780604, + "learning_rate": 4.960566250241663e-06, + "loss": 0.3012, + "mean_token_accuracy": 0.9031071662902832, + "step": 3019 + }, + { + "epoch": 1.51, + "grad_norm": 2.6019144468768935, + "learning_rate": 4.960489019923105e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8689563274383545, + "step": 3020 + }, + { + "epoch": 1.5105, + "grad_norm": 2.337342340998808, + "learning_rate": 4.960411714653767e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.881085216999054, + "step": 3021 + }, + { + "epoch": 1.5110000000000001, + "grad_norm": 2.3132437835493227, + "learning_rate": 4.960334334436001e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8724082708358765, + "step": 3022 + }, + { + "epoch": 1.5114999999999998, + "grad_norm": 2.906996359946841, + "learning_rate": 4.960256879272166e-06, + "loss": 0.5834, + "mean_token_accuracy": 0.8093276023864746, + "step": 3023 + }, + { + "epoch": 1.512, + "grad_norm": 10.752221784859993, + "learning_rate": 4.960179349164621e-06, + "loss": 0.295, + "mean_token_accuracy": 0.9072774052619934, + "step": 3024 + }, + { + "epoch": 1.5125, + "grad_norm": 5.327983694505099, + "learning_rate": 4.960101744115727e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8826778531074524, + "step": 3025 + }, + { + "epoch": 1.513, + "grad_norm": 2.2939624589509235, + "learning_rate": 4.9600240641278495e-06, + "loss": 0.267, + "mean_token_accuracy": 0.910886824131012, + "step": 3026 + }, + { + "epoch": 1.5135, + "grad_norm": 1.7017719808760028, + "learning_rate": 4.959946309203354e-06, + "loss": 0.2709, + "mean_token_accuracy": 0.902761697769165, + "step": 3027 + }, + { + "epoch": 1.514, + "grad_norm": 2.042404449796258, + "learning_rate": 4.959868479344608e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.882841169834137, + "step": 3028 + }, + { + "epoch": 1.5145, + "grad_norm": 1.829392203879183, + "learning_rate": 4.959790574553984e-06, + "loss": 0.2512, + "mean_token_accuracy": 0.9226202964782715, + "step": 3029 + }, + { + "epoch": 1.5150000000000001, + "grad_norm": 2.165419911754053, + "learning_rate": 4.959712594833855e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8816087245941162, + "step": 3030 + }, + { + "epoch": 1.5154999999999998, + "grad_norm": 2.114135765053498, + "learning_rate": 4.959634540186594e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8657147288322449, + "step": 3031 + }, + { + "epoch": 1.516, + "grad_norm": 3.202702643417, + "learning_rate": 4.9595564106145825e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8629664778709412, + "step": 3032 + }, + { + "epoch": 1.5165, + "grad_norm": 6.5181164605483675, + "learning_rate": 4.959478206120197e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8862107396125793, + "step": 3033 + }, + { + "epoch": 1.517, + "grad_norm": 4.395153619180546, + "learning_rate": 4.959399926705821e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8899371027946472, + "step": 3034 + }, + { + "epoch": 1.5175, + "grad_norm": 2.561101816124232, + "learning_rate": 4.9593215723738405e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8553918600082397, + "step": 3035 + }, + { + "epoch": 1.518, + "grad_norm": 3.9181460833646136, + "learning_rate": 4.959243143126639e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8802471160888672, + "step": 3036 + }, + { + "epoch": 1.5185, + "grad_norm": 4.5782832085087355, + "learning_rate": 4.95916463896661e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8816992044448853, + "step": 3037 + }, + { + "epoch": 1.5190000000000001, + "grad_norm": 2.9760022041835237, + "learning_rate": 4.959086059896141e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8820429444313049, + "step": 3038 + }, + { + "epoch": 1.5194999999999999, + "grad_norm": 2.422161686921274, + "learning_rate": 4.959007405917627e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8881469368934631, + "step": 3039 + }, + { + "epoch": 1.52, + "grad_norm": 2.4439379100462686, + "learning_rate": 4.958928677033465e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8957496881484985, + "step": 3040 + }, + { + "epoch": 1.5205, + "grad_norm": 1.9190698685298226, + "learning_rate": 4.958849873246052e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8463619351387024, + "step": 3041 + }, + { + "epoch": 1.521, + "grad_norm": 1.9319676050758376, + "learning_rate": 4.958770994557789e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8935810923576355, + "step": 3042 + }, + { + "epoch": 1.5215, + "grad_norm": 2.253754596664127, + "learning_rate": 4.958692040971078e-06, + "loss": 0.2904, + "mean_token_accuracy": 0.9095053672790527, + "step": 3043 + }, + { + "epoch": 1.522, + "grad_norm": 2.088570066933825, + "learning_rate": 4.958613012488325e-06, + "loss": 0.3097, + "mean_token_accuracy": 0.8963963985443115, + "step": 3044 + }, + { + "epoch": 1.5225, + "grad_norm": 2.224383647395394, + "learning_rate": 4.958533909111936e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.9008046984672546, + "step": 3045 + }, + { + "epoch": 1.5230000000000001, + "grad_norm": 1.8766360239725386, + "learning_rate": 4.958454730844323e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8942142724990845, + "step": 3046 + }, + { + "epoch": 1.5234999999999999, + "grad_norm": 11.353302161789099, + "learning_rate": 4.9583754776878955e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.893785297870636, + "step": 3047 + }, + { + "epoch": 1.524, + "grad_norm": 137.92950080912937, + "learning_rate": 4.95829614964507e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.87462317943573, + "step": 3048 + }, + { + "epoch": 1.5245, + "grad_norm": 2.054635806270467, + "learning_rate": 4.95821674671826e-06, + "loss": 0.2427, + "mean_token_accuracy": 0.9185199737548828, + "step": 3049 + }, + { + "epoch": 1.525, + "grad_norm": 2.71558895925032, + "learning_rate": 4.958137268909887e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8831507563591003, + "step": 3050 + }, + { + "epoch": 1.5255, + "grad_norm": 4.901410684532377, + "learning_rate": 4.958057716222371e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8850133419036865, + "step": 3051 + }, + { + "epoch": 1.526, + "grad_norm": 2.1661836664494607, + "learning_rate": 4.957978088658134e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8745898604393005, + "step": 3052 + }, + { + "epoch": 1.5265, + "grad_norm": 2.4236560178958406, + "learning_rate": 4.957898386219604e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.858068585395813, + "step": 3053 + }, + { + "epoch": 1.5270000000000001, + "grad_norm": 9.886442705095876, + "learning_rate": 4.957818608909208e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8650606870651245, + "step": 3054 + }, + { + "epoch": 1.5274999999999999, + "grad_norm": 2.258099647423031, + "learning_rate": 4.957738756729375e-06, + "loss": 0.2689, + "mean_token_accuracy": 0.9013594388961792, + "step": 3055 + }, + { + "epoch": 1.528, + "grad_norm": 2.82458016110256, + "learning_rate": 4.957658829682539e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.894236147403717, + "step": 3056 + }, + { + "epoch": 1.5285, + "grad_norm": 3.0552278820010708, + "learning_rate": 4.957578827771134e-06, + "loss": 0.5388, + "mean_token_accuracy": 0.8413873910903931, + "step": 3057 + }, + { + "epoch": 1.529, + "grad_norm": 1.8674750378722063, + "learning_rate": 4.957498750997597e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8784350156784058, + "step": 3058 + }, + { + "epoch": 1.5295, + "grad_norm": 2.58430756941656, + "learning_rate": 4.957418599364367e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8380014896392822, + "step": 3059 + }, + { + "epoch": 1.53, + "grad_norm": 2.6386259517070463, + "learning_rate": 4.957338372873886e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8531377911567688, + "step": 3060 + }, + { + "epoch": 1.5305, + "grad_norm": 2.051735925099292, + "learning_rate": 4.957258071528598e-06, + "loss": 0.294, + "mean_token_accuracy": 0.9062877893447876, + "step": 3061 + }, + { + "epoch": 1.5310000000000001, + "grad_norm": 2.0913374725283513, + "learning_rate": 4.957177695330948e-06, + "loss": 0.1792, + "mean_token_accuracy": 0.9341355562210083, + "step": 3062 + }, + { + "epoch": 1.5314999999999999, + "grad_norm": 1.7420873967444956, + "learning_rate": 4.957097244283387e-06, + "loss": 0.2973, + "mean_token_accuracy": 0.8979377746582031, + "step": 3063 + }, + { + "epoch": 1.532, + "grad_norm": 2.0096568445230116, + "learning_rate": 4.957016718388362e-06, + "loss": 0.2319, + "mean_token_accuracy": 0.9220384955406189, + "step": 3064 + }, + { + "epoch": 1.5325, + "grad_norm": 2.625586838688585, + "learning_rate": 4.956936117648329e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8972190618515015, + "step": 3065 + }, + { + "epoch": 1.533, + "grad_norm": 2.1848187303449595, + "learning_rate": 4.9568554420657415e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8757608532905579, + "step": 3066 + }, + { + "epoch": 1.5335, + "grad_norm": 3.3750445708804966, + "learning_rate": 4.9567746916430584e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8645554780960083, + "step": 3067 + }, + { + "epoch": 1.534, + "grad_norm": 2.3202370219734227, + "learning_rate": 4.956693866382738e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8672029376029968, + "step": 3068 + }, + { + "epoch": 1.5345, + "grad_norm": 2.603624967310942, + "learning_rate": 4.956612966287243e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8708499073982239, + "step": 3069 + }, + { + "epoch": 1.5350000000000001, + "grad_norm": 11.56563158071605, + "learning_rate": 4.956531991359038e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8990742564201355, + "step": 3070 + }, + { + "epoch": 1.5354999999999999, + "grad_norm": 2.272213605848946, + "learning_rate": 4.95645094160059e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8766564726829529, + "step": 3071 + }, + { + "epoch": 1.536, + "grad_norm": 3.03380240493573, + "learning_rate": 4.956369817014367e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8864473700523376, + "step": 3072 + }, + { + "epoch": 1.5365, + "grad_norm": 2.1697751721977565, + "learning_rate": 4.956288617602841e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8978189826011658, + "step": 3073 + }, + { + "epoch": 1.537, + "grad_norm": 1.7682537323439662, + "learning_rate": 4.956207343368486e-06, + "loss": 0.3263, + "mean_token_accuracy": 0.890947163105011, + "step": 3074 + }, + { + "epoch": 1.5375, + "grad_norm": 1.754124179342105, + "learning_rate": 4.956125994313775e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8868699669837952, + "step": 3075 + }, + { + "epoch": 1.538, + "grad_norm": 2.055432764562556, + "learning_rate": 4.956044570441188e-06, + "loss": 0.2926, + "mean_token_accuracy": 0.9015978574752808, + "step": 3076 + }, + { + "epoch": 1.5385, + "grad_norm": 2.1133048786105477, + "learning_rate": 4.955963071753206e-06, + "loss": 0.2923, + "mean_token_accuracy": 0.9053277373313904, + "step": 3077 + }, + { + "epoch": 1.5390000000000001, + "grad_norm": 2.7348192137787573, + "learning_rate": 4.955881498252311e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.9015111327171326, + "step": 3078 + }, + { + "epoch": 1.5394999999999999, + "grad_norm": 3.1710905004090884, + "learning_rate": 4.955799849940987e-06, + "loss": 0.2773, + "mean_token_accuracy": 0.9079907536506653, + "step": 3079 + }, + { + "epoch": 1.54, + "grad_norm": 2.5498817437606562, + "learning_rate": 4.9557181268217225e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8900760412216187, + "step": 3080 + }, + { + "epoch": 1.5405, + "grad_norm": 11.07620882682872, + "learning_rate": 4.9556363288970055e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8881556391716003, + "step": 3081 + }, + { + "epoch": 1.541, + "grad_norm": 2.6399072347827808, + "learning_rate": 4.955554456169328e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8776190280914307, + "step": 3082 + }, + { + "epoch": 1.5415, + "grad_norm": 2.3632588482726997, + "learning_rate": 4.955472508641186e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.8542852997779846, + "step": 3083 + }, + { + "epoch": 1.542, + "grad_norm": 18.643015703013464, + "learning_rate": 4.955390486315073e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8920615911483765, + "step": 3084 + }, + { + "epoch": 1.5425, + "grad_norm": 2.859417316061193, + "learning_rate": 4.955308389193489e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8834136724472046, + "step": 3085 + }, + { + "epoch": 1.5430000000000001, + "grad_norm": 1.9976785271271005, + "learning_rate": 4.955226217278935e-06, + "loss": 0.1892, + "mean_token_accuracy": 0.9311844706535339, + "step": 3086 + }, + { + "epoch": 1.5434999999999999, + "grad_norm": 2.708618240904334, + "learning_rate": 4.955143970573913e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8847607970237732, + "step": 3087 + }, + { + "epoch": 1.544, + "grad_norm": 2.754095331040922, + "learning_rate": 4.95506164908093e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8629141449928284, + "step": 3088 + }, + { + "epoch": 1.5445, + "grad_norm": 2.7181029893474458, + "learning_rate": 4.954979252802492e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8616861701011658, + "step": 3089 + }, + { + "epoch": 1.545, + "grad_norm": 3.6267131445478467, + "learning_rate": 4.95489678174111e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.8609609603881836, + "step": 3090 + }, + { + "epoch": 1.5455, + "grad_norm": 2.761042635345619, + "learning_rate": 4.954814235899295e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8864645957946777, + "step": 3091 + }, + { + "epoch": 1.546, + "grad_norm": 2.954077869574854, + "learning_rate": 4.954731615279563e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.879207193851471, + "step": 3092 + }, + { + "epoch": 1.5465, + "grad_norm": 4.850999170833432, + "learning_rate": 4.95464891988443e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8587930798530579, + "step": 3093 + }, + { + "epoch": 1.5470000000000002, + "grad_norm": 2.415371933644714, + "learning_rate": 4.954566149716415e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.87595534324646, + "step": 3094 + }, + { + "epoch": 1.5474999999999999, + "grad_norm": 2.2841642698330777, + "learning_rate": 4.95448330477804e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8717143535614014, + "step": 3095 + }, + { + "epoch": 1.548, + "grad_norm": 1.7781072296304465, + "learning_rate": 4.954400385071827e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8933131694793701, + "step": 3096 + }, + { + "epoch": 1.5485, + "grad_norm": 3.065457501098387, + "learning_rate": 4.954317390600304e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8810720443725586, + "step": 3097 + }, + { + "epoch": 1.549, + "grad_norm": 4.374984584426953, + "learning_rate": 4.954234321365998e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8677113652229309, + "step": 3098 + }, + { + "epoch": 1.5495, + "grad_norm": 2.45704096564397, + "learning_rate": 4.954151177371439e-06, + "loss": 0.2794, + "mean_token_accuracy": 0.9087850451469421, + "step": 3099 + }, + { + "epoch": 1.55, + "grad_norm": 5.143443407012977, + "learning_rate": 4.9540679586191605e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8584973216056824, + "step": 3100 + }, + { + "epoch": 1.5505, + "grad_norm": 2.1145993320297425, + "learning_rate": 4.9539846651116975e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8528124690055847, + "step": 3101 + }, + { + "epoch": 1.5510000000000002, + "grad_norm": 2.3696989155972763, + "learning_rate": 4.953901296851586e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.9001876711845398, + "step": 3102 + }, + { + "epoch": 1.5514999999999999, + "grad_norm": 118.15096971835641, + "learning_rate": 4.953817853841367e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8865358829498291, + "step": 3103 + }, + { + "epoch": 1.552, + "grad_norm": 2.7558266744989166, + "learning_rate": 4.953734336083582e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8828828930854797, + "step": 3104 + }, + { + "epoch": 1.5525, + "grad_norm": 4.235122675567636, + "learning_rate": 4.953650743580776e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8727718591690063, + "step": 3105 + }, + { + "epoch": 1.553, + "grad_norm": 11.197194562673571, + "learning_rate": 4.9535670763354935e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8725401759147644, + "step": 3106 + }, + { + "epoch": 1.5535, + "grad_norm": 2.4659289854051463, + "learning_rate": 4.953483334350284e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8923324942588806, + "step": 3107 + }, + { + "epoch": 1.554, + "grad_norm": 3.0468004395009016, + "learning_rate": 4.953399517627698e-06, + "loss": 0.1581, + "mean_token_accuracy": 0.9409152269363403, + "step": 3108 + }, + { + "epoch": 1.5545, + "grad_norm": 2.204399705033033, + "learning_rate": 4.953315626170289e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8824963569641113, + "step": 3109 + }, + { + "epoch": 1.5550000000000002, + "grad_norm": 1.461498290556765, + "learning_rate": 4.953231659980613e-06, + "loss": 0.262, + "mean_token_accuracy": 0.9247882962226868, + "step": 3110 + }, + { + "epoch": 1.5554999999999999, + "grad_norm": 9.053876668191208, + "learning_rate": 4.953147619061228e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8788692355155945, + "step": 3111 + }, + { + "epoch": 1.556, + "grad_norm": 15.159205910761905, + "learning_rate": 4.953063503414692e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8604916334152222, + "step": 3112 + }, + { + "epoch": 1.5565, + "grad_norm": 2.0023346017560737, + "learning_rate": 4.95297931304357e-06, + "loss": 0.2563, + "mean_token_accuracy": 0.9172908067703247, + "step": 3113 + }, + { + "epoch": 1.557, + "grad_norm": 2.534018291529775, + "learning_rate": 4.952895047950424e-06, + "loss": 0.2948, + "mean_token_accuracy": 0.9035413861274719, + "step": 3114 + }, + { + "epoch": 1.5575, + "grad_norm": 3.0608209699265387, + "learning_rate": 4.952810708137824e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8488888740539551, + "step": 3115 + }, + { + "epoch": 1.558, + "grad_norm": 2.477075236311297, + "learning_rate": 4.952726293608335e-06, + "loss": 0.2549, + "mean_token_accuracy": 0.9126566052436829, + "step": 3116 + }, + { + "epoch": 1.5585, + "grad_norm": 3.033284804791032, + "learning_rate": 4.952641804364533e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.9018344283103943, + "step": 3117 + }, + { + "epoch": 1.5590000000000002, + "grad_norm": 4.114477389368347, + "learning_rate": 4.952557240408988e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.883922278881073, + "step": 3118 + }, + { + "epoch": 1.5594999999999999, + "grad_norm": 2.138583517213155, + "learning_rate": 4.952472601744277e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8981022238731384, + "step": 3119 + }, + { + "epoch": 1.56, + "grad_norm": 2.4650515748795767, + "learning_rate": 4.9523878883729794e-06, + "loss": 0.3012, + "mean_token_accuracy": 0.8985946178436279, + "step": 3120 + }, + { + "epoch": 1.5605, + "grad_norm": 5.647661078806936, + "learning_rate": 4.952303100297674e-06, + "loss": 0.508, + "mean_token_accuracy": 0.8416468501091003, + "step": 3121 + }, + { + "epoch": 1.561, + "grad_norm": 1.9138325336839952, + "learning_rate": 4.952218237520946e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8770532011985779, + "step": 3122 + }, + { + "epoch": 1.5615, + "grad_norm": 3.9796865885351784, + "learning_rate": 4.952133300045378e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8578876256942749, + "step": 3123 + }, + { + "epoch": 1.562, + "grad_norm": 1.9794376230962194, + "learning_rate": 4.952048287873558e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8687664270401001, + "step": 3124 + }, + { + "epoch": 1.5625, + "grad_norm": 2.066914839240362, + "learning_rate": 4.9519632010080765e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8730065226554871, + "step": 3125 + }, + { + "epoch": 1.563, + "grad_norm": 2.7536056549345442, + "learning_rate": 4.951878039451525e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8616119027137756, + "step": 3126 + }, + { + "epoch": 1.5635, + "grad_norm": 2.6280977747622716, + "learning_rate": 4.9517928032064965e-06, + "loss": 0.296, + "mean_token_accuracy": 0.8947635293006897, + "step": 3127 + }, + { + "epoch": 1.564, + "grad_norm": 2.616178808095389, + "learning_rate": 4.951707492275589e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8814670443534851, + "step": 3128 + }, + { + "epoch": 1.5645, + "grad_norm": 2.5596859552044964, + "learning_rate": 4.951622106661401e-06, + "loss": 0.5204, + "mean_token_accuracy": 0.8396103978157043, + "step": 3129 + }, + { + "epoch": 1.565, + "grad_norm": 6.750895751350051, + "learning_rate": 4.9515366463665324e-06, + "loss": 0.2833, + "mean_token_accuracy": 0.9018639326095581, + "step": 3130 + }, + { + "epoch": 1.5655000000000001, + "grad_norm": 2.385428599943417, + "learning_rate": 4.951451111393588e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8650495409965515, + "step": 3131 + }, + { + "epoch": 1.5659999999999998, + "grad_norm": 3.2889410956119276, + "learning_rate": 4.951365501745172e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8898168206214905, + "step": 3132 + }, + { + "epoch": 1.5665, + "grad_norm": 3.723890322952524, + "learning_rate": 4.951279817423894e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8753957748413086, + "step": 3133 + }, + { + "epoch": 1.567, + "grad_norm": 2.093990564984661, + "learning_rate": 4.951194058432362e-06, + "loss": 0.2688, + "mean_token_accuracy": 0.9099888205528259, + "step": 3134 + }, + { + "epoch": 1.5675, + "grad_norm": 3.0918462880615003, + "learning_rate": 4.951108224773189e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8668746948242188, + "step": 3135 + }, + { + "epoch": 1.568, + "grad_norm": 21.9575192867268, + "learning_rate": 4.95102231644899e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8939862251281738, + "step": 3136 + }, + { + "epoch": 1.5685, + "grad_norm": 3.33612306361981, + "learning_rate": 4.950936333462382e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.9001211524009705, + "step": 3137 + }, + { + "epoch": 1.569, + "grad_norm": 1.948731288082576, + "learning_rate": 4.950850275815983e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8700308203697205, + "step": 3138 + }, + { + "epoch": 1.5695000000000001, + "grad_norm": 4.1353207327779655, + "learning_rate": 4.950764143512416e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8752471804618835, + "step": 3139 + }, + { + "epoch": 1.5699999999999998, + "grad_norm": 3.316415634047519, + "learning_rate": 4.9506779365543054e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8836453557014465, + "step": 3140 + }, + { + "epoch": 1.5705, + "grad_norm": 6.586969742988237, + "learning_rate": 4.950591654944274e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8619356751441956, + "step": 3141 + }, + { + "epoch": 1.571, + "grad_norm": 3.7564741095267253, + "learning_rate": 4.950505298684954e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8554510474205017, + "step": 3142 + }, + { + "epoch": 1.5715, + "grad_norm": 2.5008014838098256, + "learning_rate": 4.950418867778973e-06, + "loss": 0.5704, + "mean_token_accuracy": 0.8289920687675476, + "step": 3143 + }, + { + "epoch": 1.572, + "grad_norm": 3.6720875426674215, + "learning_rate": 4.950332362228966e-06, + "loss": 0.2935, + "mean_token_accuracy": 0.9008424282073975, + "step": 3144 + }, + { + "epoch": 1.5725, + "grad_norm": 3.9719263008487764, + "learning_rate": 4.950245782037566e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.860822856426239, + "step": 3145 + }, + { + "epoch": 1.573, + "grad_norm": 2.881457973817483, + "learning_rate": 4.950159127207411e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8608071208000183, + "step": 3146 + }, + { + "epoch": 1.5735000000000001, + "grad_norm": 15.749001021699385, + "learning_rate": 4.950072397741141e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8848411440849304, + "step": 3147 + }, + { + "epoch": 1.5739999999999998, + "grad_norm": 3.1884216353657133, + "learning_rate": 4.949985593641399e-06, + "loss": 0.2613, + "mean_token_accuracy": 0.9103279709815979, + "step": 3148 + }, + { + "epoch": 1.5745, + "grad_norm": 1.9945432796821427, + "learning_rate": 4.949898714910828e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8659135103225708, + "step": 3149 + }, + { + "epoch": 1.575, + "grad_norm": 4.081811420607912, + "learning_rate": 4.949811761552074e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8883010745048523, + "step": 3150 + }, + { + "epoch": 1.5755, + "grad_norm": 4.6702404371025965, + "learning_rate": 4.949724733567787e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8543971180915833, + "step": 3151 + }, + { + "epoch": 1.576, + "grad_norm": 5.31438379975185, + "learning_rate": 4.949637630960618e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8931225538253784, + "step": 3152 + }, + { + "epoch": 1.5765, + "grad_norm": 2.6582427370815007, + "learning_rate": 4.9495504537332186e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8843919634819031, + "step": 3153 + }, + { + "epoch": 1.577, + "grad_norm": 2.029609524944604, + "learning_rate": 4.949463201888246e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8846042156219482, + "step": 3154 + }, + { + "epoch": 1.5775000000000001, + "grad_norm": 3.256186261907556, + "learning_rate": 4.9493758754283575e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8751739263534546, + "step": 3155 + }, + { + "epoch": 1.5779999999999998, + "grad_norm": 3.008982964469226, + "learning_rate": 4.9492884743562135e-06, + "loss": 0.2642, + "mean_token_accuracy": 0.9034907817840576, + "step": 3156 + }, + { + "epoch": 1.5785, + "grad_norm": 2.686594073796178, + "learning_rate": 4.949200998674476e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8885798454284668, + "step": 3157 + }, + { + "epoch": 1.579, + "grad_norm": 2.8146184241637697, + "learning_rate": 4.949113448385809e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8577171564102173, + "step": 3158 + }, + { + "epoch": 1.5795, + "grad_norm": 2.1808793940068836, + "learning_rate": 4.949025823492881e-06, + "loss": 0.2792, + "mean_token_accuracy": 0.9109910130500793, + "step": 3159 + }, + { + "epoch": 1.58, + "grad_norm": 2.4166374015732788, + "learning_rate": 4.94893812399836e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8514174222946167, + "step": 3160 + }, + { + "epoch": 1.5805, + "grad_norm": 2.441050028909239, + "learning_rate": 4.948850349904919e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8972712755203247, + "step": 3161 + }, + { + "epoch": 1.581, + "grad_norm": 1.9241144330250208, + "learning_rate": 4.9487625012152296e-06, + "loss": 0.285, + "mean_token_accuracy": 0.8994674682617188, + "step": 3162 + }, + { + "epoch": 1.5815000000000001, + "grad_norm": 2.4196442506503786, + "learning_rate": 4.94867457793197e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8499693870544434, + "step": 3163 + }, + { + "epoch": 1.5819999999999999, + "grad_norm": 3.0122396283033104, + "learning_rate": 4.948586580057816e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8809842467308044, + "step": 3164 + }, + { + "epoch": 1.5825, + "grad_norm": 7.046241104471559, + "learning_rate": 4.9484985075954505e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8789229989051819, + "step": 3165 + }, + { + "epoch": 1.583, + "grad_norm": 15.936745987781732, + "learning_rate": 4.948410360547555e-06, + "loss": 0.2861, + "mean_token_accuracy": 0.9118279814720154, + "step": 3166 + }, + { + "epoch": 1.5835, + "grad_norm": 4.9691896267891424, + "learning_rate": 4.948322138916816e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8971448540687561, + "step": 3167 + }, + { + "epoch": 1.584, + "grad_norm": 2.468837723465189, + "learning_rate": 4.948233842705919e-06, + "loss": 0.3147, + "mean_token_accuracy": 0.898068904876709, + "step": 3168 + }, + { + "epoch": 1.5845, + "grad_norm": 2.6976881412660676, + "learning_rate": 4.948145471917555e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8674013614654541, + "step": 3169 + }, + { + "epoch": 1.585, + "grad_norm": 2.486795140956223, + "learning_rate": 4.948057026554415e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8594611883163452, + "step": 3170 + }, + { + "epoch": 1.5855000000000001, + "grad_norm": 4.804983389526368, + "learning_rate": 4.947968506619194e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8920876383781433, + "step": 3171 + }, + { + "epoch": 1.5859999999999999, + "grad_norm": 1.7794786644926406, + "learning_rate": 4.947879912114588e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8772563338279724, + "step": 3172 + }, + { + "epoch": 1.5865, + "grad_norm": 2.1400958253737006, + "learning_rate": 4.947791243043296e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8783673644065857, + "step": 3173 + }, + { + "epoch": 1.587, + "grad_norm": 3.272042922785077, + "learning_rate": 4.947702499408019e-06, + "loss": 0.6577, + "mean_token_accuracy": 0.8152998685836792, + "step": 3174 + }, + { + "epoch": 1.5875, + "grad_norm": 3.4249532545854224, + "learning_rate": 4.94761368121146e-06, + "loss": 0.1804, + "mean_token_accuracy": 0.9356240034103394, + "step": 3175 + }, + { + "epoch": 1.588, + "grad_norm": 2.0334663675130162, + "learning_rate": 4.947524788456325e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8606293201446533, + "step": 3176 + }, + { + "epoch": 1.5885, + "grad_norm": 4.601195144901803, + "learning_rate": 4.947435821145321e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.90101158618927, + "step": 3177 + }, + { + "epoch": 1.589, + "grad_norm": 2.09256950300935, + "learning_rate": 4.9473467792811595e-06, + "loss": 0.2736, + "mean_token_accuracy": 0.8912865519523621, + "step": 3178 + }, + { + "epoch": 1.5895000000000001, + "grad_norm": 2.0489165896419923, + "learning_rate": 4.947257662866552e-06, + "loss": 0.2571, + "mean_token_accuracy": 0.9132776260375977, + "step": 3179 + }, + { + "epoch": 1.5899999999999999, + "grad_norm": 2.6356328671174185, + "learning_rate": 4.947168471904213e-06, + "loss": 0.5143, + "mean_token_accuracy": 0.8461211323738098, + "step": 3180 + }, + { + "epoch": 1.5905, + "grad_norm": 2.3539604281994686, + "learning_rate": 4.94707920639686e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8926980495452881, + "step": 3181 + }, + { + "epoch": 1.591, + "grad_norm": 2.5701396073299327, + "learning_rate": 4.946989866347211e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8893252611160278, + "step": 3182 + }, + { + "epoch": 1.5915, + "grad_norm": 4.403306608679642, + "learning_rate": 4.946900451757989e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.890631914138794, + "step": 3183 + }, + { + "epoch": 1.592, + "grad_norm": 2.0845490718193207, + "learning_rate": 4.946810962631916e-06, + "loss": 0.2826, + "mean_token_accuracy": 0.8990703821182251, + "step": 3184 + }, + { + "epoch": 1.5925, + "grad_norm": 1.9602273996801456, + "learning_rate": 4.94672139897172e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8654851913452148, + "step": 3185 + }, + { + "epoch": 1.593, + "grad_norm": 2.7361004058633407, + "learning_rate": 4.946631760780128e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.835881769657135, + "step": 3186 + }, + { + "epoch": 1.5935000000000001, + "grad_norm": 1.9217332042364605, + "learning_rate": 4.94654204805987e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8825503587722778, + "step": 3187 + }, + { + "epoch": 1.5939999999999999, + "grad_norm": 1.9891420429770321, + "learning_rate": 4.94645226081368e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.8945409655570984, + "step": 3188 + }, + { + "epoch": 1.5945, + "grad_norm": 3.0533523903875768, + "learning_rate": 4.946362399044293e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8856781721115112, + "step": 3189 + }, + { + "epoch": 1.595, + "grad_norm": 2.651793077301361, + "learning_rate": 4.946272462754447e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8703817129135132, + "step": 3190 + }, + { + "epoch": 1.5955, + "grad_norm": 4.79331098208969, + "learning_rate": 4.94618245194688e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8946930766105652, + "step": 3191 + }, + { + "epoch": 1.596, + "grad_norm": 4.80860746595163, + "learning_rate": 4.946092366624333e-06, + "loss": 0.2718, + "mean_token_accuracy": 0.9105098843574524, + "step": 3192 + }, + { + "epoch": 1.5965, + "grad_norm": 1.6657101283689348, + "learning_rate": 4.946002206789553e-06, + "loss": 0.2471, + "mean_token_accuracy": 0.9083796739578247, + "step": 3193 + }, + { + "epoch": 1.597, + "grad_norm": 3.8675817483911628, + "learning_rate": 4.9459119724452845e-06, + "loss": 0.2637, + "mean_token_accuracy": 0.906856119632721, + "step": 3194 + }, + { + "epoch": 1.5975000000000001, + "grad_norm": 2.4016485264676137, + "learning_rate": 4.945821663594277e-06, + "loss": 0.3, + "mean_token_accuracy": 0.9033463597297668, + "step": 3195 + }, + { + "epoch": 1.5979999999999999, + "grad_norm": 1.9546120765923882, + "learning_rate": 4.945731280239281e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.9004548788070679, + "step": 3196 + }, + { + "epoch": 1.5985, + "grad_norm": 1.7601168439316355, + "learning_rate": 4.94564082238305e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8851659297943115, + "step": 3197 + }, + { + "epoch": 1.599, + "grad_norm": 1.7498697367865395, + "learning_rate": 4.9455502900283405e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8969718217849731, + "step": 3198 + }, + { + "epoch": 1.5995, + "grad_norm": 4.135731411461094, + "learning_rate": 4.945459683177908e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8782567381858826, + "step": 3199 + }, + { + "epoch": 1.6, + "grad_norm": 2.23235907766178, + "learning_rate": 4.9453690018345144e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8833248019218445, + "step": 3200 + }, + { + "epoch": 1.6005, + "grad_norm": 1.8553265815987565, + "learning_rate": 4.9452782460009215e-06, + "loss": 0.296, + "mean_token_accuracy": 0.9014109373092651, + "step": 3201 + }, + { + "epoch": 1.601, + "grad_norm": 4.531971483899524, + "learning_rate": 4.945187415679893e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8753302693367004, + "step": 3202 + }, + { + "epoch": 1.6015000000000001, + "grad_norm": 2.374063426180815, + "learning_rate": 4.945096510874198e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8675992488861084, + "step": 3203 + }, + { + "epoch": 1.6019999999999999, + "grad_norm": 4.250506048096953, + "learning_rate": 4.945005531586603e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.866084635257721, + "step": 3204 + }, + { + "epoch": 1.6025, + "grad_norm": 6.167951287598094, + "learning_rate": 4.944914477819881e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8773557543754578, + "step": 3205 + }, + { + "epoch": 1.603, + "grad_norm": 2.6551987608337058, + "learning_rate": 4.944823349576805e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8838527202606201, + "step": 3206 + }, + { + "epoch": 1.6035, + "grad_norm": 2.7336039299440777, + "learning_rate": 4.944732146860151e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.9093401432037354, + "step": 3207 + }, + { + "epoch": 1.604, + "grad_norm": 4.226725443246068, + "learning_rate": 4.9446408696726974e-06, + "loss": 0.2802, + "mean_token_accuracy": 0.903219997882843, + "step": 3208 + }, + { + "epoch": 1.6045, + "grad_norm": 2.220989039296081, + "learning_rate": 4.944549518017225e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8568129539489746, + "step": 3209 + }, + { + "epoch": 1.605, + "grad_norm": 1.9547519963558257, + "learning_rate": 4.944458091896515e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8859972953796387, + "step": 3210 + }, + { + "epoch": 1.6055000000000001, + "grad_norm": 2.1658192895397277, + "learning_rate": 4.944366591313356e-06, + "loss": 0.2762, + "mean_token_accuracy": 0.9126480221748352, + "step": 3211 + }, + { + "epoch": 1.6059999999999999, + "grad_norm": 2.3589383437384006, + "learning_rate": 4.94427501627053e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8807634115219116, + "step": 3212 + }, + { + "epoch": 1.6065, + "grad_norm": 3.1632629163704666, + "learning_rate": 4.9441833667708305e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8784926533699036, + "step": 3213 + }, + { + "epoch": 1.607, + "grad_norm": 1.5813517577149994, + "learning_rate": 4.944091642817049e-06, + "loss": 0.2859, + "mean_token_accuracy": 0.9034680724143982, + "step": 3214 + }, + { + "epoch": 1.6075, + "grad_norm": 2.6241522469547096, + "learning_rate": 4.943999844411978e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.863203763961792, + "step": 3215 + }, + { + "epoch": 1.608, + "grad_norm": 3.321202779980494, + "learning_rate": 4.943907971558414e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8753310441970825, + "step": 3216 + }, + { + "epoch": 1.6085, + "grad_norm": 2.1680344278962087, + "learning_rate": 4.943816024259156e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8970729112625122, + "step": 3217 + }, + { + "epoch": 1.609, + "grad_norm": 1.725606620488804, + "learning_rate": 4.9437240025170054e-06, + "loss": 0.2905, + "mean_token_accuracy": 0.8978763222694397, + "step": 3218 + }, + { + "epoch": 1.6095000000000002, + "grad_norm": 1.6819438338695634, + "learning_rate": 4.943631906334765e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8829571604728699, + "step": 3219 + }, + { + "epoch": 1.6099999999999999, + "grad_norm": 2.688366083237466, + "learning_rate": 4.9435397357152406e-06, + "loss": 0.5278, + "mean_token_accuracy": 0.8347750902175903, + "step": 3220 + }, + { + "epoch": 1.6105, + "grad_norm": 2.3855654917898765, + "learning_rate": 4.943447490661238e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8677884340286255, + "step": 3221 + }, + { + "epoch": 1.611, + "grad_norm": 2.038730958321732, + "learning_rate": 4.94335517117557e-06, + "loss": 0.2776, + "mean_token_accuracy": 0.9026140570640564, + "step": 3222 + }, + { + "epoch": 1.6115, + "grad_norm": 2.266956261330196, + "learning_rate": 4.943262777261048e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8960220813751221, + "step": 3223 + }, + { + "epoch": 1.612, + "grad_norm": 2.193729135225712, + "learning_rate": 4.943170308920484e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8848334550857544, + "step": 3224 + }, + { + "epoch": 1.6125, + "grad_norm": 2.0283802458836346, + "learning_rate": 4.943077766156698e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8673129081726074, + "step": 3225 + }, + { + "epoch": 1.613, + "grad_norm": 3.6489879090561783, + "learning_rate": 4.942985148972506e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8860988020896912, + "step": 3226 + }, + { + "epoch": 1.6135000000000002, + "grad_norm": 2.378190246822287, + "learning_rate": 4.9428924573707325e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8841193914413452, + "step": 3227 + }, + { + "epoch": 1.6139999999999999, + "grad_norm": 1.9697656373559793, + "learning_rate": 4.9427996913542e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8921048641204834, + "step": 3228 + }, + { + "epoch": 1.6145, + "grad_norm": 1.8025927502978623, + "learning_rate": 4.942706850925733e-06, + "loss": 0.2444, + "mean_token_accuracy": 0.9061720967292786, + "step": 3229 + }, + { + "epoch": 1.615, + "grad_norm": 2.4310299252538012, + "learning_rate": 4.94261393608816e-06, + "loss": 0.5166, + "mean_token_accuracy": 0.8445647358894348, + "step": 3230 + }, + { + "epoch": 1.6155, + "grad_norm": 2.086229893500022, + "learning_rate": 4.9425209468443115e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8884210586547852, + "step": 3231 + }, + { + "epoch": 1.616, + "grad_norm": 3.0472786136148877, + "learning_rate": 4.942427883197021e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8613767027854919, + "step": 3232 + }, + { + "epoch": 1.6165, + "grad_norm": 2.4627375382341583, + "learning_rate": 4.942334745149122e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8887336850166321, + "step": 3233 + }, + { + "epoch": 1.617, + "grad_norm": 2.234608285942879, + "learning_rate": 4.942241532703453e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8684580326080322, + "step": 3234 + }, + { + "epoch": 1.6175000000000002, + "grad_norm": 2.430035386750772, + "learning_rate": 4.942148245862852e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8715953230857849, + "step": 3235 + }, + { + "epoch": 1.6179999999999999, + "grad_norm": 3.1450879868571198, + "learning_rate": 4.942054884630163e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8687182664871216, + "step": 3236 + }, + { + "epoch": 1.6185, + "grad_norm": 4.32436583387189, + "learning_rate": 4.941961449008227e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8971583247184753, + "step": 3237 + }, + { + "epoch": 1.619, + "grad_norm": 2.4207337376412266, + "learning_rate": 4.941867938999892e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8728376626968384, + "step": 3238 + }, + { + "epoch": 1.6195, + "grad_norm": 6.668573685596162, + "learning_rate": 4.941774354608007e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8994647860527039, + "step": 3239 + }, + { + "epoch": 1.62, + "grad_norm": 2.632578645613373, + "learning_rate": 4.9416806958354206e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.88675856590271, + "step": 3240 + }, + { + "epoch": 1.6205, + "grad_norm": 2.210685490390943, + "learning_rate": 4.941586962684986e-06, + "loss": 0.3059, + "mean_token_accuracy": 0.8991639614105225, + "step": 3241 + }, + { + "epoch": 1.621, + "grad_norm": 2.826084372173845, + "learning_rate": 4.941493155159562e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.885966420173645, + "step": 3242 + }, + { + "epoch": 1.6215000000000002, + "grad_norm": 3.1957522790391244, + "learning_rate": 4.941399273262003e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8693790435791016, + "step": 3243 + }, + { + "epoch": 1.6219999999999999, + "grad_norm": 9.583066820543648, + "learning_rate": 4.941305316995169e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8932722210884094, + "step": 3244 + }, + { + "epoch": 1.6225, + "grad_norm": 4.791130821409367, + "learning_rate": 4.941211286361922e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8542638421058655, + "step": 3245 + }, + { + "epoch": 1.623, + "grad_norm": 3.030246183450057, + "learning_rate": 4.9411171813651275e-06, + "loss": 0.2932, + "mean_token_accuracy": 0.905749499797821, + "step": 3246 + }, + { + "epoch": 1.6235, + "grad_norm": 3.2963544900365256, + "learning_rate": 4.941023002007651e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.859971284866333, + "step": 3247 + }, + { + "epoch": 1.624, + "grad_norm": 2.4187601776987324, + "learning_rate": 4.940928748292363e-06, + "loss": 0.355, + "mean_token_accuracy": 0.890205979347229, + "step": 3248 + }, + { + "epoch": 1.6245, + "grad_norm": 4.298782973843247, + "learning_rate": 4.940834420222133e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8725338578224182, + "step": 3249 + }, + { + "epoch": 1.625, + "grad_norm": 2.5426254215738338, + "learning_rate": 4.9407400177998335e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8735005259513855, + "step": 3250 + }, + { + "epoch": 1.6255, + "grad_norm": 2.0056842371279897, + "learning_rate": 4.940645541028343e-06, + "loss": 0.2855, + "mean_token_accuracy": 0.9028946757316589, + "step": 3251 + }, + { + "epoch": 1.626, + "grad_norm": 3.6863963037643415, + "learning_rate": 4.940550989910537e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.9041683673858643, + "step": 3252 + }, + { + "epoch": 1.6265, + "grad_norm": 1.8914028278375374, + "learning_rate": 4.940456364449298e-06, + "loss": 0.2646, + "mean_token_accuracy": 0.9150943160057068, + "step": 3253 + }, + { + "epoch": 1.627, + "grad_norm": 2.0185578110223243, + "learning_rate": 4.940361664647506e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8810300230979919, + "step": 3254 + }, + { + "epoch": 1.6275, + "grad_norm": 2.9870777114792246, + "learning_rate": 4.940266890508048e-06, + "loss": 0.3002, + "mean_token_accuracy": 0.8978655934333801, + "step": 3255 + }, + { + "epoch": 1.6280000000000001, + "grad_norm": 1.5877826413882343, + "learning_rate": 4.940172042033808e-06, + "loss": 0.2739, + "mean_token_accuracy": 0.9111447930335999, + "step": 3256 + }, + { + "epoch": 1.6284999999999998, + "grad_norm": 2.284375828691295, + "learning_rate": 4.940077119227678e-06, + "loss": 0.3019, + "mean_token_accuracy": 0.9121791124343872, + "step": 3257 + }, + { + "epoch": 1.629, + "grad_norm": 6.8754595577976, + "learning_rate": 4.939982122092549e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8890566229820251, + "step": 3258 + }, + { + "epoch": 1.6295, + "grad_norm": 2.633480395398593, + "learning_rate": 4.939887050631313e-06, + "loss": 0.5217, + "mean_token_accuracy": 0.8409425616264343, + "step": 3259 + }, + { + "epoch": 1.63, + "grad_norm": 2.493822980487088, + "learning_rate": 4.939791904846869e-06, + "loss": 0.2815, + "mean_token_accuracy": 0.9102341532707214, + "step": 3260 + }, + { + "epoch": 1.6305, + "grad_norm": 3.838360265956715, + "learning_rate": 4.939696684742113e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8489977717399597, + "step": 3261 + }, + { + "epoch": 1.631, + "grad_norm": 2.938714291440168, + "learning_rate": 4.939601390319947e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8765816688537598, + "step": 3262 + }, + { + "epoch": 1.6315, + "grad_norm": 2.4619488597219745, + "learning_rate": 4.9395060215832716e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8950733542442322, + "step": 3263 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 2.7543568605068534, + "learning_rate": 4.939410578534994e-06, + "loss": 0.2545, + "mean_token_accuracy": 0.9083879590034485, + "step": 3264 + }, + { + "epoch": 1.6324999999999998, + "grad_norm": 6.4132715026476, + "learning_rate": 4.9393150611780215e-06, + "loss": 0.5733, + "mean_token_accuracy": 0.8198524117469788, + "step": 3265 + }, + { + "epoch": 1.633, + "grad_norm": 2.990204520729527, + "learning_rate": 4.939219469515263e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8767619132995605, + "step": 3266 + }, + { + "epoch": 1.6335, + "grad_norm": 2.6068051712135474, + "learning_rate": 4.93912380354963e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8764045238494873, + "step": 3267 + }, + { + "epoch": 1.634, + "grad_norm": 13.232172378840486, + "learning_rate": 4.939028063284038e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8942848443984985, + "step": 3268 + }, + { + "epoch": 1.6345, + "grad_norm": 2.223756236893483, + "learning_rate": 4.938932248721402e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8796701431274414, + "step": 3269 + }, + { + "epoch": 1.635, + "grad_norm": 2.7393021533569195, + "learning_rate": 4.938836359864641e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8655598163604736, + "step": 3270 + }, + { + "epoch": 1.6355, + "grad_norm": 3.827259884042763, + "learning_rate": 4.938740396716678e-06, + "loss": 0.2884, + "mean_token_accuracy": 0.9068843126296997, + "step": 3271 + }, + { + "epoch": 1.6360000000000001, + "grad_norm": 1.7771333617019274, + "learning_rate": 4.938644359280433e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8460394144058228, + "step": 3272 + }, + { + "epoch": 1.6364999999999998, + "grad_norm": 2.559597440414025, + "learning_rate": 4.938548247558833e-06, + "loss": 0.2625, + "mean_token_accuracy": 0.9115461111068726, + "step": 3273 + }, + { + "epoch": 1.637, + "grad_norm": 3.688571813979323, + "learning_rate": 4.9384520615548065e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8629615306854248, + "step": 3274 + }, + { + "epoch": 1.6375, + "grad_norm": 2.2698213520904456, + "learning_rate": 4.938355801271282e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.886306643486023, + "step": 3275 + }, + { + "epoch": 1.638, + "grad_norm": 1.9730322425759277, + "learning_rate": 4.9382594667111925e-06, + "loss": 0.2745, + "mean_token_accuracy": 0.8983380198478699, + "step": 3276 + }, + { + "epoch": 1.6385, + "grad_norm": 4.5012115030131366, + "learning_rate": 4.938163057877473e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8935507535934448, + "step": 3277 + }, + { + "epoch": 1.639, + "grad_norm": 2.864493243112793, + "learning_rate": 4.9380665747730585e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8649674654006958, + "step": 3278 + }, + { + "epoch": 1.6395, + "grad_norm": 2.482097463860967, + "learning_rate": 4.9379700174008905e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8879702091217041, + "step": 3279 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 2.3620698868346817, + "learning_rate": 4.937873385763909e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8929583430290222, + "step": 3280 + }, + { + "epoch": 1.6404999999999998, + "grad_norm": 1.9162568841484242, + "learning_rate": 4.937776679865056e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8926931023597717, + "step": 3281 + }, + { + "epoch": 1.641, + "grad_norm": 2.8047927329896023, + "learning_rate": 4.93767989970728e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8624981045722961, + "step": 3282 + }, + { + "epoch": 1.6415, + "grad_norm": 3.4452215973851517, + "learning_rate": 4.937583045293529e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8829402923583984, + "step": 3283 + }, + { + "epoch": 1.642, + "grad_norm": 2.323895266974683, + "learning_rate": 4.937486116626752e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8862337470054626, + "step": 3284 + }, + { + "epoch": 1.6425, + "grad_norm": 3.127416259012172, + "learning_rate": 4.937389113709902e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8581133484840393, + "step": 3285 + }, + { + "epoch": 1.643, + "grad_norm": 2.7820313316975547, + "learning_rate": 4.9372920365459335e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8964883685112, + "step": 3286 + }, + { + "epoch": 1.6435, + "grad_norm": 1.8816482373795684, + "learning_rate": 4.937194885137804e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8831278681755066, + "step": 3287 + }, + { + "epoch": 1.6440000000000001, + "grad_norm": 2.535826048589673, + "learning_rate": 4.937097659488473e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8836648464202881, + "step": 3288 + }, + { + "epoch": 1.6444999999999999, + "grad_norm": 2.6198256710907586, + "learning_rate": 4.937000359600902e-06, + "loss": 0.5085, + "mean_token_accuracy": 0.8519545197486877, + "step": 3289 + }, + { + "epoch": 1.645, + "grad_norm": 3.0248063644883065, + "learning_rate": 4.936902985478055e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8850768804550171, + "step": 3290 + }, + { + "epoch": 1.6455, + "grad_norm": 2.369767112989742, + "learning_rate": 4.9368055371228985e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8753224611282349, + "step": 3291 + }, + { + "epoch": 1.646, + "grad_norm": 2.5493519791853125, + "learning_rate": 4.9367080145384006e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8844740986824036, + "step": 3292 + }, + { + "epoch": 1.6465, + "grad_norm": 3.3786900730000466, + "learning_rate": 4.936610417727532e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8698517084121704, + "step": 3293 + }, + { + "epoch": 1.647, + "grad_norm": 2.1949542591601117, + "learning_rate": 4.9365127466932655e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.875654935836792, + "step": 3294 + }, + { + "epoch": 1.6475, + "grad_norm": 2.126889829626133, + "learning_rate": 4.936415001438577e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8617066144943237, + "step": 3295 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 2.326181390003299, + "learning_rate": 4.9363171819664434e-06, + "loss": 0.2693, + "mean_token_accuracy": 0.9133713245391846, + "step": 3296 + }, + { + "epoch": 1.6484999999999999, + "grad_norm": 3.272484617656461, + "learning_rate": 4.936219288279844e-06, + "loss": 0.2972, + "mean_token_accuracy": 0.8995641469955444, + "step": 3297 + }, + { + "epoch": 1.649, + "grad_norm": 3.677711762951032, + "learning_rate": 4.936121320381762e-06, + "loss": 0.6257, + "mean_token_accuracy": 0.8265369534492493, + "step": 3298 + }, + { + "epoch": 1.6495, + "grad_norm": 1.8407446870058257, + "learning_rate": 4.936023278275181e-06, + "loss": 0.2977, + "mean_token_accuracy": 0.9008361101150513, + "step": 3299 + }, + { + "epoch": 1.65, + "grad_norm": 2.5000113367839734, + "learning_rate": 4.935925161963089e-06, + "loss": 0.2499, + "mean_token_accuracy": 0.9089929461479187, + "step": 3300 + }, + { + "epoch": 1.6505, + "grad_norm": 2.4350048858746556, + "learning_rate": 4.935826971448472e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8844863176345825, + "step": 3301 + }, + { + "epoch": 1.651, + "grad_norm": 1.6520262499510248, + "learning_rate": 4.935728706734322e-06, + "loss": 0.2814, + "mean_token_accuracy": 0.9181331396102905, + "step": 3302 + }, + { + "epoch": 1.6515, + "grad_norm": 18.609074990797982, + "learning_rate": 4.935630367823634e-06, + "loss": 0.408, + "mean_token_accuracy": 0.871013879776001, + "step": 3303 + }, + { + "epoch": 1.6520000000000001, + "grad_norm": 2.0621862263540636, + "learning_rate": 4.935531954719401e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8882761001586914, + "step": 3304 + }, + { + "epoch": 1.6524999999999999, + "grad_norm": 4.163833118539124, + "learning_rate": 4.935433467424624e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.85355144739151, + "step": 3305 + }, + { + "epoch": 1.653, + "grad_norm": 2.7686622756773756, + "learning_rate": 4.9353349059423e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8759111166000366, + "step": 3306 + }, + { + "epoch": 1.6535, + "grad_norm": 4.022051362405644, + "learning_rate": 4.935236270275433e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.878681480884552, + "step": 3307 + }, + { + "epoch": 1.654, + "grad_norm": 4.250376000825927, + "learning_rate": 4.935137560427028e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8695356249809265, + "step": 3308 + }, + { + "epoch": 1.6545, + "grad_norm": 1.9196633346585688, + "learning_rate": 4.9350387764000895e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8956867456436157, + "step": 3309 + }, + { + "epoch": 1.655, + "grad_norm": 4.785517177065613, + "learning_rate": 4.93493991819763e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8904038667678833, + "step": 3310 + }, + { + "epoch": 1.6555, + "grad_norm": 1.8443169771892043, + "learning_rate": 4.9348409858226575e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8887107372283936, + "step": 3311 + }, + { + "epoch": 1.6560000000000001, + "grad_norm": 2.259330502621743, + "learning_rate": 4.934741979278188e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.873369038105011, + "step": 3312 + }, + { + "epoch": 1.6564999999999999, + "grad_norm": 2.048226601953001, + "learning_rate": 4.934642898567237e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8935209512710571, + "step": 3313 + }, + { + "epoch": 1.657, + "grad_norm": 3.9191878425603504, + "learning_rate": 4.934543743692822e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8779867887496948, + "step": 3314 + }, + { + "epoch": 1.6575, + "grad_norm": 1.6858174302844562, + "learning_rate": 4.934444514657964e-06, + "loss": 0.295, + "mean_token_accuracy": 0.9004198312759399, + "step": 3315 + }, + { + "epoch": 1.658, + "grad_norm": 1.8290728211328797, + "learning_rate": 4.934345211465686e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8840948939323425, + "step": 3316 + }, + { + "epoch": 1.6585, + "grad_norm": 2.519497214841426, + "learning_rate": 4.9342458341190125e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8711340427398682, + "step": 3317 + }, + { + "epoch": 1.659, + "grad_norm": 2.7804551510492175, + "learning_rate": 4.93414638262097e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8670318722724915, + "step": 3318 + }, + { + "epoch": 1.6595, + "grad_norm": 4.624150489157138, + "learning_rate": 4.93404685697459e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.9012705683708191, + "step": 3319 + }, + { + "epoch": 1.6600000000000001, + "grad_norm": 2.672297296868301, + "learning_rate": 4.933947257182901e-06, + "loss": 0.2842, + "mean_token_accuracy": 0.9092495441436768, + "step": 3320 + }, + { + "epoch": 1.6604999999999999, + "grad_norm": 133.60326436942583, + "learning_rate": 4.93384758324894e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8935721516609192, + "step": 3321 + }, + { + "epoch": 1.661, + "grad_norm": 4.587966304058628, + "learning_rate": 4.933747835175741e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8890603184700012, + "step": 3322 + }, + { + "epoch": 1.6615, + "grad_norm": 2.1311063265804737, + "learning_rate": 4.933648012966344e-06, + "loss": 0.2166, + "mean_token_accuracy": 0.9225634336471558, + "step": 3323 + }, + { + "epoch": 1.662, + "grad_norm": 2.080498161507953, + "learning_rate": 4.9335481166237905e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8999605178833008, + "step": 3324 + }, + { + "epoch": 1.6625, + "grad_norm": 3.473505497685135, + "learning_rate": 4.933448146151122e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8980855941772461, + "step": 3325 + }, + { + "epoch": 1.663, + "grad_norm": 1.629742512313501, + "learning_rate": 4.933348101551383e-06, + "loss": 0.2523, + "mean_token_accuracy": 0.9057984352111816, + "step": 3326 + }, + { + "epoch": 1.6635, + "grad_norm": 3.9255622869587463, + "learning_rate": 4.9332479828276234e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8686535954475403, + "step": 3327 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 2.939365827479006, + "learning_rate": 4.933147789982891e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8747929930686951, + "step": 3328 + }, + { + "epoch": 1.6644999999999999, + "grad_norm": 4.527294392705926, + "learning_rate": 4.933047523020239e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.9017361998558044, + "step": 3329 + }, + { + "epoch": 1.665, + "grad_norm": 2.1947119899511875, + "learning_rate": 4.932947181942721e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8634886145591736, + "step": 3330 + }, + { + "epoch": 1.6655, + "grad_norm": 2.350836050814285, + "learning_rate": 4.932846766753394e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8696718811988831, + "step": 3331 + }, + { + "epoch": 1.666, + "grad_norm": 2.114436647841977, + "learning_rate": 4.932746277455317e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8715226054191589, + "step": 3332 + }, + { + "epoch": 1.6665, + "grad_norm": 9.511451426796487, + "learning_rate": 4.932645714051551e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8930069804191589, + "step": 3333 + }, + { + "epoch": 1.667, + "grad_norm": 2.51810673343652, + "learning_rate": 4.9325450765451574e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.907852053642273, + "step": 3334 + }, + { + "epoch": 1.6675, + "grad_norm": 2.778095187927227, + "learning_rate": 4.932444364939205e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8701934814453125, + "step": 3335 + }, + { + "epoch": 1.6680000000000001, + "grad_norm": 3.07175043517155, + "learning_rate": 4.93234357923676e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8857852220535278, + "step": 3336 + }, + { + "epoch": 1.6684999999999999, + "grad_norm": 3.0634353871225883, + "learning_rate": 4.932242719440893e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.8946336507797241, + "step": 3337 + }, + { + "epoch": 1.669, + "grad_norm": 1.9112966324419696, + "learning_rate": 4.932141785554676e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8896023035049438, + "step": 3338 + }, + { + "epoch": 1.6695, + "grad_norm": 5.596711418944226, + "learning_rate": 4.932040777581183e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8953119516372681, + "step": 3339 + }, + { + "epoch": 1.67, + "grad_norm": 2.057833621691355, + "learning_rate": 4.9319396955234925e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.898274302482605, + "step": 3340 + }, + { + "epoch": 1.6705, + "grad_norm": 2.6910219416800873, + "learning_rate": 4.931838539384681e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8898809552192688, + "step": 3341 + }, + { + "epoch": 1.671, + "grad_norm": 3.5780425683679145, + "learning_rate": 4.931737309167833e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8911401629447937, + "step": 3342 + }, + { + "epoch": 1.6715, + "grad_norm": 2.0224612292355317, + "learning_rate": 4.93163600487603e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8897243142127991, + "step": 3343 + }, + { + "epoch": 1.6720000000000002, + "grad_norm": 3.5086875906441786, + "learning_rate": 4.931534626512359e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8795263171195984, + "step": 3344 + }, + { + "epoch": 1.6724999999999999, + "grad_norm": 2.1523162294861296, + "learning_rate": 4.9314331740799084e-06, + "loss": 0.2844, + "mean_token_accuracy": 0.9096694588661194, + "step": 3345 + }, + { + "epoch": 1.673, + "grad_norm": 2.0834770841508576, + "learning_rate": 4.931331647581767e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8951416015625, + "step": 3346 + }, + { + "epoch": 1.6735, + "grad_norm": 17.9555849910724, + "learning_rate": 4.931230047021028e-06, + "loss": 0.3263, + "mean_token_accuracy": 0.9034426808357239, + "step": 3347 + }, + { + "epoch": 1.674, + "grad_norm": 2.257710985343882, + "learning_rate": 4.931128372400788e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8892307877540588, + "step": 3348 + }, + { + "epoch": 1.6745, + "grad_norm": 2.369311951066303, + "learning_rate": 4.9310266237241424e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8592908978462219, + "step": 3349 + }, + { + "epoch": 1.675, + "grad_norm": 2.50636157489511, + "learning_rate": 4.930924800994192e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8904405832290649, + "step": 3350 + }, + { + "epoch": 1.6755, + "grad_norm": 11.669056810879404, + "learning_rate": 4.930822904214037e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8930131196975708, + "step": 3351 + }, + { + "epoch": 1.6760000000000002, + "grad_norm": 8.494187129727749, + "learning_rate": 4.930720933386782e-06, + "loss": 0.2598, + "mean_token_accuracy": 0.9231427907943726, + "step": 3352 + }, + { + "epoch": 1.6764999999999999, + "grad_norm": 2.081974868489171, + "learning_rate": 4.930618888515534e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8877174258232117, + "step": 3353 + }, + { + "epoch": 1.677, + "grad_norm": 3.016572313281562, + "learning_rate": 4.9305167696034e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.888414740562439, + "step": 3354 + }, + { + "epoch": 1.6775, + "grad_norm": 2.5451947416622547, + "learning_rate": 4.930414576653492e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8705348372459412, + "step": 3355 + }, + { + "epoch": 1.678, + "grad_norm": 4.674616165060507, + "learning_rate": 4.930312309668922e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8874026536941528, + "step": 3356 + }, + { + "epoch": 1.6785, + "grad_norm": 2.3584349033308585, + "learning_rate": 4.930209968652806e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8980441093444824, + "step": 3357 + }, + { + "epoch": 1.679, + "grad_norm": 4.6086487443319655, + "learning_rate": 4.930107553608261e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8916055560112, + "step": 3358 + }, + { + "epoch": 1.6795, + "grad_norm": 2.549894124852871, + "learning_rate": 4.930005064538407e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8786497712135315, + "step": 3359 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 2.6664636854743127, + "learning_rate": 4.9299025014463665e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8927724957466125, + "step": 3360 + }, + { + "epoch": 1.6804999999999999, + "grad_norm": 4.377318649046773, + "learning_rate": 4.929799864335262e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.8407718539237976, + "step": 3361 + }, + { + "epoch": 1.681, + "grad_norm": 2.1867460846338242, + "learning_rate": 4.929697153208222e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8761619925498962, + "step": 3362 + }, + { + "epoch": 1.6815, + "grad_norm": 6.034369983835362, + "learning_rate": 4.929594368068374e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8687509894371033, + "step": 3363 + }, + { + "epoch": 1.682, + "grad_norm": 2.993032502888446, + "learning_rate": 4.92949150891885e-06, + "loss": 0.2822, + "mean_token_accuracy": 0.9023728966712952, + "step": 3364 + }, + { + "epoch": 1.6825, + "grad_norm": 2.357230071685505, + "learning_rate": 4.9293885757627815e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8881892561912537, + "step": 3365 + }, + { + "epoch": 1.683, + "grad_norm": 2.5759601255592646, + "learning_rate": 4.929285568603306e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8771474361419678, + "step": 3366 + }, + { + "epoch": 1.6835, + "grad_norm": 2.468126953860798, + "learning_rate": 4.9291824874435605e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8628270626068115, + "step": 3367 + }, + { + "epoch": 1.6840000000000002, + "grad_norm": 2.522427589926624, + "learning_rate": 4.929079332286685e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8879668116569519, + "step": 3368 + }, + { + "epoch": 1.6844999999999999, + "grad_norm": 4.328374187616457, + "learning_rate": 4.928976103135822e-06, + "loss": 0.2371, + "mean_token_accuracy": 0.91147381067276, + "step": 3369 + }, + { + "epoch": 1.685, + "grad_norm": 2.120274091004316, + "learning_rate": 4.928872799994116e-06, + "loss": 0.2683, + "mean_token_accuracy": 0.9061263203620911, + "step": 3370 + }, + { + "epoch": 1.6855, + "grad_norm": 8.240559413438337, + "learning_rate": 4.9287694228647135e-06, + "loss": 0.3093, + "mean_token_accuracy": 0.8966307044029236, + "step": 3371 + }, + { + "epoch": 1.686, + "grad_norm": 11.818368130796163, + "learning_rate": 4.928665971750764e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8855326175689697, + "step": 3372 + }, + { + "epoch": 1.6865, + "grad_norm": 4.733515676529729, + "learning_rate": 4.928562446655417e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8761020302772522, + "step": 3373 + }, + { + "epoch": 1.687, + "grad_norm": 2.2705673735623475, + "learning_rate": 4.928458847581829e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.8915572166442871, + "step": 3374 + }, + { + "epoch": 1.6875, + "grad_norm": 2.6038743413070904, + "learning_rate": 4.928355174533153e-06, + "loss": 0.2527, + "mean_token_accuracy": 0.9157021045684814, + "step": 3375 + }, + { + "epoch": 1.688, + "grad_norm": 2.548562773293648, + "learning_rate": 4.928251427512551e-06, + "loss": 0.6037, + "mean_token_accuracy": 0.8268219232559204, + "step": 3376 + }, + { + "epoch": 1.6885, + "grad_norm": 2.4322047721704165, + "learning_rate": 4.928147606523179e-06, + "loss": 0.2184, + "mean_token_accuracy": 0.9231958985328674, + "step": 3377 + }, + { + "epoch": 1.689, + "grad_norm": 49.24015526224017, + "learning_rate": 4.9280437115682015e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8800415992736816, + "step": 3378 + }, + { + "epoch": 1.6895, + "grad_norm": 1.651590649069109, + "learning_rate": 4.9279397426507824e-06, + "loss": 0.2498, + "mean_token_accuracy": 0.9123711585998535, + "step": 3379 + }, + { + "epoch": 1.69, + "grad_norm": 2.587679652669377, + "learning_rate": 4.92783569977409e-06, + "loss": 0.2391, + "mean_token_accuracy": 0.9165678024291992, + "step": 3380 + }, + { + "epoch": 1.6905000000000001, + "grad_norm": 3.3436430114009137, + "learning_rate": 4.927731582941294e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8808469772338867, + "step": 3381 + }, + { + "epoch": 1.6909999999999998, + "grad_norm": 3.5443383645853026, + "learning_rate": 4.927627392155565e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8605700135231018, + "step": 3382 + }, + { + "epoch": 1.6915, + "grad_norm": 3.57673276310761, + "learning_rate": 4.927523127420075e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.884372353553772, + "step": 3383 + }, + { + "epoch": 1.692, + "grad_norm": 3.2269331779854067, + "learning_rate": 4.927418788738004e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.879024088382721, + "step": 3384 + }, + { + "epoch": 1.6925, + "grad_norm": 1.7664337187435357, + "learning_rate": 4.927314376112528e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8917720317840576, + "step": 3385 + }, + { + "epoch": 1.693, + "grad_norm": 2.8884644599281586, + "learning_rate": 4.927209889546828e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8567917346954346, + "step": 3386 + }, + { + "epoch": 1.6935, + "grad_norm": 1.723716154341598, + "learning_rate": 4.927105329044086e-06, + "loss": 0.2452, + "mean_token_accuracy": 0.9112260341644287, + "step": 3387 + }, + { + "epoch": 1.694, + "grad_norm": 3.4199067860727737, + "learning_rate": 4.927000694607489e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8749760389328003, + "step": 3388 + }, + { + "epoch": 1.6945000000000001, + "grad_norm": 40.27799044954926, + "learning_rate": 4.926895986240223e-06, + "loss": 0.333, + "mean_token_accuracy": 0.889160692691803, + "step": 3389 + }, + { + "epoch": 1.6949999999999998, + "grad_norm": 2.959152662163495, + "learning_rate": 4.926791203945477e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8892207741737366, + "step": 3390 + }, + { + "epoch": 1.6955, + "grad_norm": 2.5697278870934652, + "learning_rate": 4.926686347726445e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8998066186904907, + "step": 3391 + }, + { + "epoch": 1.696, + "grad_norm": 2.4258046383818694, + "learning_rate": 4.926581417586319e-06, + "loss": 0.2346, + "mean_token_accuracy": 0.9223009347915649, + "step": 3392 + }, + { + "epoch": 1.6965, + "grad_norm": 2.4751630149332673, + "learning_rate": 4.926476413528296e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8841325640678406, + "step": 3393 + }, + { + "epoch": 1.697, + "grad_norm": 4.953526494975086, + "learning_rate": 4.9263713355555755e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.85373854637146, + "step": 3394 + }, + { + "epoch": 1.6975, + "grad_norm": 2.8302174908653086, + "learning_rate": 4.926266183671356e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8568633198738098, + "step": 3395 + }, + { + "epoch": 1.698, + "grad_norm": 2.6957472135670164, + "learning_rate": 4.926160957878844e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.879572331905365, + "step": 3396 + }, + { + "epoch": 1.6985000000000001, + "grad_norm": 2.3194319659101086, + "learning_rate": 4.926055658181242e-06, + "loss": 0.2852, + "mean_token_accuracy": 0.9093090891838074, + "step": 3397 + }, + { + "epoch": 1.6989999999999998, + "grad_norm": 8.277767135107078, + "learning_rate": 4.92595028458176e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.88498455286026, + "step": 3398 + }, + { + "epoch": 1.6995, + "grad_norm": 5.943406589498572, + "learning_rate": 4.925844837083606e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8365050554275513, + "step": 3399 + }, + { + "epoch": 1.7, + "grad_norm": 1.8792102270704176, + "learning_rate": 4.925739315689991e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.89570552110672, + "step": 3400 + }, + { + "epoch": 1.7005, + "grad_norm": 2.027503547203648, + "learning_rate": 4.925633720404132e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8949461579322815, + "step": 3401 + }, + { + "epoch": 1.701, + "grad_norm": 3.2082769927076407, + "learning_rate": 4.925528051229246e-06, + "loss": 0.5367, + "mean_token_accuracy": 0.8465861678123474, + "step": 3402 + }, + { + "epoch": 1.7015, + "grad_norm": 2.129542773226747, + "learning_rate": 4.92542230816855e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8882582187652588, + "step": 3403 + }, + { + "epoch": 1.702, + "grad_norm": 1.8198084826834755, + "learning_rate": 4.925316491225265e-06, + "loss": 0.2272, + "mean_token_accuracy": 0.9201710820198059, + "step": 3404 + }, + { + "epoch": 1.7025000000000001, + "grad_norm": 1.9023755420769233, + "learning_rate": 4.925210600402615e-06, + "loss": 0.292, + "mean_token_accuracy": 0.8967846035957336, + "step": 3405 + }, + { + "epoch": 1.7029999999999998, + "grad_norm": 3.117638595299413, + "learning_rate": 4.925104635703826e-06, + "loss": 0.2591, + "mean_token_accuracy": 0.9179804921150208, + "step": 3406 + }, + { + "epoch": 1.7035, + "grad_norm": 2.2403585687742247, + "learning_rate": 4.9249985971321254e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8606179356575012, + "step": 3407 + }, + { + "epoch": 1.704, + "grad_norm": 2.7247315705674717, + "learning_rate": 4.924892484690744e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8539638519287109, + "step": 3408 + }, + { + "epoch": 1.7045, + "grad_norm": 16.30342962663821, + "learning_rate": 4.924786298382913e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8777154684066772, + "step": 3409 + }, + { + "epoch": 1.705, + "grad_norm": 3.3785337029874087, + "learning_rate": 4.924680038211868e-06, + "loss": 0.2903, + "mean_token_accuracy": 0.8975666165351868, + "step": 3410 + }, + { + "epoch": 1.7055, + "grad_norm": 2.31153303514862, + "learning_rate": 4.924573704180845e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8888511061668396, + "step": 3411 + }, + { + "epoch": 1.706, + "grad_norm": 2.2593371085305414, + "learning_rate": 4.924467296293083e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.885037899017334, + "step": 3412 + }, + { + "epoch": 1.7065000000000001, + "grad_norm": 6.048888767242139, + "learning_rate": 4.924360814551825e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.890439510345459, + "step": 3413 + }, + { + "epoch": 1.7069999999999999, + "grad_norm": 2.272453190045707, + "learning_rate": 4.924254258960313e-06, + "loss": 0.2846, + "mean_token_accuracy": 0.9079370498657227, + "step": 3414 + }, + { + "epoch": 1.7075, + "grad_norm": 2.9690100315264045, + "learning_rate": 4.924147629521794e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8451799750328064, + "step": 3415 + }, + { + "epoch": 1.708, + "grad_norm": 4.069596044542371, + "learning_rate": 4.924040926239515e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8434553742408752, + "step": 3416 + }, + { + "epoch": 1.7085, + "grad_norm": 2.225150813955101, + "learning_rate": 4.9239341491167284e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8779679536819458, + "step": 3417 + }, + { + "epoch": 1.709, + "grad_norm": 2.4628958127780956, + "learning_rate": 4.923827298156684e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8815661668777466, + "step": 3418 + }, + { + "epoch": 1.7095, + "grad_norm": 7.910889206660223, + "learning_rate": 4.923720373362638e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8571163415908813, + "step": 3419 + }, + { + "epoch": 1.71, + "grad_norm": 2.7401212358503995, + "learning_rate": 4.923613374737848e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8555900454521179, + "step": 3420 + }, + { + "epoch": 1.7105000000000001, + "grad_norm": 2.901927365956319, + "learning_rate": 4.923506302285573e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.851307213306427, + "step": 3421 + }, + { + "epoch": 1.7109999999999999, + "grad_norm": 37.645548093877316, + "learning_rate": 4.9233991560090735e-06, + "loss": 0.3074, + "mean_token_accuracy": 0.898938000202179, + "step": 3422 + }, + { + "epoch": 1.7115, + "grad_norm": 2.944911030751243, + "learning_rate": 4.923291935911615e-06, + "loss": 0.2937, + "mean_token_accuracy": 0.9046729207038879, + "step": 3423 + }, + { + "epoch": 1.712, + "grad_norm": 2.9117034142581497, + "learning_rate": 4.923184641996463e-06, + "loss": 0.2955, + "mean_token_accuracy": 0.9019572138786316, + "step": 3424 + }, + { + "epoch": 1.7125, + "grad_norm": 2.69763482705816, + "learning_rate": 4.923077274266886e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8828940391540527, + "step": 3425 + }, + { + "epoch": 1.713, + "grad_norm": 2.591821922018202, + "learning_rate": 4.9229698327261545e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8830034136772156, + "step": 3426 + }, + { + "epoch": 1.7135, + "grad_norm": 4.337469138386906, + "learning_rate": 4.9228623173775415e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.876954972743988, + "step": 3427 + }, + { + "epoch": 1.714, + "grad_norm": 2.1411049911600326, + "learning_rate": 4.922754728224321e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8524088859558105, + "step": 3428 + }, + { + "epoch": 1.7145000000000001, + "grad_norm": 2.7785381646195675, + "learning_rate": 4.922647065269772e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8505541682243347, + "step": 3429 + }, + { + "epoch": 1.7149999999999999, + "grad_norm": 1.7957329979571361, + "learning_rate": 4.922539328517174e-06, + "loss": 0.241, + "mean_token_accuracy": 0.9215855002403259, + "step": 3430 + }, + { + "epoch": 1.7155, + "grad_norm": 1.5834152131134112, + "learning_rate": 4.922431517969808e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8783237338066101, + "step": 3431 + }, + { + "epoch": 1.716, + "grad_norm": 3.0541154572389053, + "learning_rate": 4.922323633630957e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.8267436623573303, + "step": 3432 + }, + { + "epoch": 1.7165, + "grad_norm": 3.167465772472344, + "learning_rate": 4.92221567550391e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8787878751754761, + "step": 3433 + }, + { + "epoch": 1.717, + "grad_norm": 8.158018600964617, + "learning_rate": 4.922107643591955e-06, + "loss": 0.297, + "mean_token_accuracy": 0.9043741822242737, + "step": 3434 + }, + { + "epoch": 1.7175, + "grad_norm": 5.082088992433296, + "learning_rate": 4.92199953789838e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8517646789550781, + "step": 3435 + }, + { + "epoch": 1.718, + "grad_norm": 3.3123571993641727, + "learning_rate": 4.9218913584264816e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.887012243270874, + "step": 3436 + }, + { + "epoch": 1.7185000000000001, + "grad_norm": 5.875128395710083, + "learning_rate": 4.921783105179552e-06, + "loss": 0.2999, + "mean_token_accuracy": 0.9009382724761963, + "step": 3437 + }, + { + "epoch": 1.7189999999999999, + "grad_norm": 3.4114423282747453, + "learning_rate": 4.9216747781608935e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8849297761917114, + "step": 3438 + }, + { + "epoch": 1.7195, + "grad_norm": 2.7341614496325466, + "learning_rate": 4.921566377373801e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8797181844711304, + "step": 3439 + }, + { + "epoch": 1.72, + "grad_norm": 2.947636113320475, + "learning_rate": 4.921457902821578e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8831627368927002, + "step": 3440 + }, + { + "epoch": 1.7205, + "grad_norm": 1.5877915894997447, + "learning_rate": 4.92134935450753e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8894763588905334, + "step": 3441 + }, + { + "epoch": 1.721, + "grad_norm": 2.5842750465586604, + "learning_rate": 4.921240732434963e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8899204134941101, + "step": 3442 + }, + { + "epoch": 1.7215, + "grad_norm": 3.6776547065875134, + "learning_rate": 4.921132036607185e-06, + "loss": 0.2922, + "mean_token_accuracy": 0.9027853608131409, + "step": 3443 + }, + { + "epoch": 1.722, + "grad_norm": 3.703739876388769, + "learning_rate": 4.92102326702751e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8686973452568054, + "step": 3444 + }, + { + "epoch": 1.7225000000000001, + "grad_norm": 3.885758148518744, + "learning_rate": 4.920914423699247e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8820930123329163, + "step": 3445 + }, + { + "epoch": 1.7229999999999999, + "grad_norm": 1.645042515018015, + "learning_rate": 4.920805506625714e-06, + "loss": 0.2947, + "mean_token_accuracy": 0.8994975090026855, + "step": 3446 + }, + { + "epoch": 1.7235, + "grad_norm": 2.874813836886111, + "learning_rate": 4.92069651581023e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8787326812744141, + "step": 3447 + }, + { + "epoch": 1.724, + "grad_norm": 1.955403870005845, + "learning_rate": 4.920587451256112e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8769980072975159, + "step": 3448 + }, + { + "epoch": 1.7245, + "grad_norm": 3.0832304639617156, + "learning_rate": 4.920478312966683e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8637940287590027, + "step": 3449 + }, + { + "epoch": 1.725, + "grad_norm": 4.477387132235642, + "learning_rate": 4.92036910094527e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8918086290359497, + "step": 3450 + }, + { + "epoch": 1.7255, + "grad_norm": 2.7080370870099535, + "learning_rate": 4.920259815195198e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8738309144973755, + "step": 3451 + }, + { + "epoch": 1.726, + "grad_norm": 2.7241383091021745, + "learning_rate": 4.920150455719795e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8813503384590149, + "step": 3452 + }, + { + "epoch": 1.7265000000000001, + "grad_norm": 2.2762394275343767, + "learning_rate": 4.920041022522394e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.873181164264679, + "step": 3453 + }, + { + "epoch": 1.7269999999999999, + "grad_norm": 6.2018006971639466, + "learning_rate": 4.919931515606328e-06, + "loss": 0.2564, + "mean_token_accuracy": 0.9140591621398926, + "step": 3454 + }, + { + "epoch": 1.7275, + "grad_norm": 13.123945570209127, + "learning_rate": 4.919821934974933e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8741615414619446, + "step": 3455 + }, + { + "epoch": 1.728, + "grad_norm": 2.7586679098282048, + "learning_rate": 4.919712280631547e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8852800130844116, + "step": 3456 + }, + { + "epoch": 1.7285, + "grad_norm": 2.1364798462035566, + "learning_rate": 4.91960255257951e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8801140189170837, + "step": 3457 + }, + { + "epoch": 1.729, + "grad_norm": 2.2201292882758583, + "learning_rate": 4.919492750822164e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8844765424728394, + "step": 3458 + }, + { + "epoch": 1.7295, + "grad_norm": 8.904621997396104, + "learning_rate": 4.919382875362855e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8569842576980591, + "step": 3459 + }, + { + "epoch": 1.73, + "grad_norm": 1.5873828450576866, + "learning_rate": 4.9192729262049285e-06, + "loss": 0.2293, + "mean_token_accuracy": 0.9186199307441711, + "step": 3460 + }, + { + "epoch": 1.7305000000000001, + "grad_norm": 6.496257428977602, + "learning_rate": 4.9191629033517356e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.878874659538269, + "step": 3461 + }, + { + "epoch": 1.7309999999999999, + "grad_norm": 4.685801457207244, + "learning_rate": 4.919052806806625e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.878668487071991, + "step": 3462 + }, + { + "epoch": 1.7315, + "grad_norm": 2.2706326316476977, + "learning_rate": 4.918942636572953e-06, + "loss": 0.2526, + "mean_token_accuracy": 0.9095546007156372, + "step": 3463 + }, + { + "epoch": 1.732, + "grad_norm": 2.4354373594678362, + "learning_rate": 4.918832392654075e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8982552289962769, + "step": 3464 + }, + { + "epoch": 1.7325, + "grad_norm": 2.921167427120138, + "learning_rate": 4.918722075053349e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8802887797355652, + "step": 3465 + }, + { + "epoch": 1.733, + "grad_norm": 3.1714483501884367, + "learning_rate": 4.9186116837741355e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8672086596488953, + "step": 3466 + }, + { + "epoch": 1.7335, + "grad_norm": 2.573013830343314, + "learning_rate": 4.918501218819797e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8530341982841492, + "step": 3467 + }, + { + "epoch": 1.734, + "grad_norm": 3.525182450120572, + "learning_rate": 4.918390680193698e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8864573836326599, + "step": 3468 + }, + { + "epoch": 1.7345000000000002, + "grad_norm": 2.4856813335427153, + "learning_rate": 4.918280067899207e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8935151100158691, + "step": 3469 + }, + { + "epoch": 1.7349999999999999, + "grad_norm": 2.0881506844113784, + "learning_rate": 4.918169381939693e-06, + "loss": 0.2524, + "mean_token_accuracy": 0.9091882109642029, + "step": 3470 + }, + { + "epoch": 1.7355, + "grad_norm": 2.87076271305646, + "learning_rate": 4.918058622318526e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8926380276679993, + "step": 3471 + }, + { + "epoch": 1.736, + "grad_norm": 2.942233585877529, + "learning_rate": 4.9179477890390825e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8566316962242126, + "step": 3472 + }, + { + "epoch": 1.7365, + "grad_norm": 2.3787431928803144, + "learning_rate": 4.917836882104738e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8716108202934265, + "step": 3473 + }, + { + "epoch": 1.737, + "grad_norm": 2.2768515991159526, + "learning_rate": 4.917725901518869e-06, + "loss": 0.2748, + "mean_token_accuracy": 0.9034100770950317, + "step": 3474 + }, + { + "epoch": 1.7375, + "grad_norm": 3.0385250929849836, + "learning_rate": 4.917614847284858e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8795421719551086, + "step": 3475 + }, + { + "epoch": 1.738, + "grad_norm": 3.3979243636652, + "learning_rate": 4.917503719406088e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8783023953437805, + "step": 3476 + }, + { + "epoch": 1.7385000000000002, + "grad_norm": 2.4298613163206015, + "learning_rate": 4.9173925178859435e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8767001628875732, + "step": 3477 + }, + { + "epoch": 1.7389999999999999, + "grad_norm": 3.5579392803495904, + "learning_rate": 4.917281242727811e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8762865662574768, + "step": 3478 + }, + { + "epoch": 1.7395, + "grad_norm": 2.8479437575702646, + "learning_rate": 4.917169893935083e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8801503777503967, + "step": 3479 + }, + { + "epoch": 1.74, + "grad_norm": 2.441420593580101, + "learning_rate": 4.917058471511149e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8956043720245361, + "step": 3480 + }, + { + "epoch": 1.7405, + "grad_norm": 2.1958341782791346, + "learning_rate": 4.916946975459404e-06, + "loss": 0.3007, + "mean_token_accuracy": 0.9051468372344971, + "step": 3481 + }, + { + "epoch": 1.741, + "grad_norm": 2.963978097956232, + "learning_rate": 4.9168354057832426e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8971457481384277, + "step": 3482 + }, + { + "epoch": 1.7415, + "grad_norm": 2.914407846622282, + "learning_rate": 4.916723762486066e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8943833708763123, + "step": 3483 + }, + { + "epoch": 1.742, + "grad_norm": 2.3104930234954213, + "learning_rate": 4.916612045571274e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.862272322177887, + "step": 3484 + }, + { + "epoch": 1.7425000000000002, + "grad_norm": 2.746216403939737, + "learning_rate": 4.916500255042269e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8771275877952576, + "step": 3485 + }, + { + "epoch": 1.7429999999999999, + "grad_norm": 4.077984755207739, + "learning_rate": 4.9163883909024565e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.8915702700614929, + "step": 3486 + }, + { + "epoch": 1.7435, + "grad_norm": 4.998863519687197, + "learning_rate": 4.916276453155246e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8844323754310608, + "step": 3487 + }, + { + "epoch": 1.744, + "grad_norm": 2.738719555032967, + "learning_rate": 4.916164441804044e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.9020283222198486, + "step": 3488 + }, + { + "epoch": 1.7445, + "grad_norm": 6.919328005817792, + "learning_rate": 4.916052356852266e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.8570641875267029, + "step": 3489 + }, + { + "epoch": 1.745, + "grad_norm": 4.605998502138494, + "learning_rate": 4.915940198303324e-06, + "loss": 0.5775, + "mean_token_accuracy": 0.8340517282485962, + "step": 3490 + }, + { + "epoch": 1.7455, + "grad_norm": 2.513282645480146, + "learning_rate": 4.915827966160635e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8886775970458984, + "step": 3491 + }, + { + "epoch": 1.746, + "grad_norm": 14.123699569576257, + "learning_rate": 4.915715660427618e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8884711861610413, + "step": 3492 + }, + { + "epoch": 1.7465000000000002, + "grad_norm": 1.7830994785729444, + "learning_rate": 4.915603281107695e-06, + "loss": 0.3054, + "mean_token_accuracy": 0.8941091299057007, + "step": 3493 + }, + { + "epoch": 1.7469999999999999, + "grad_norm": 2.142544752324562, + "learning_rate": 4.915490828204287e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8326725959777832, + "step": 3494 + }, + { + "epoch": 1.7475, + "grad_norm": 2.8084768324659946, + "learning_rate": 4.915378301720822e-06, + "loss": 0.239, + "mean_token_accuracy": 0.9183019399642944, + "step": 3495 + }, + { + "epoch": 1.748, + "grad_norm": 5.089602552341844, + "learning_rate": 4.915265701660726e-06, + "loss": 0.2788, + "mean_token_accuracy": 0.9126743674278259, + "step": 3496 + }, + { + "epoch": 1.7485, + "grad_norm": 2.196786276116802, + "learning_rate": 4.91515302802743e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.8903635144233704, + "step": 3497 + }, + { + "epoch": 1.749, + "grad_norm": 3.008581315074756, + "learning_rate": 4.915040280824365e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8751576542854309, + "step": 3498 + }, + { + "epoch": 1.7495, + "grad_norm": 2.319006268801256, + "learning_rate": 4.914927460054967e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8778907060623169, + "step": 3499 + }, + { + "epoch": 1.75, + "grad_norm": 3.6374851570001834, + "learning_rate": 4.914814565722671e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8887377381324768, + "step": 3500 + }, + { + "epoch": 1.7505, + "grad_norm": 2.317490416273342, + "learning_rate": 4.914701597830918e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.87903892993927, + "step": 3501 + }, + { + "epoch": 1.751, + "grad_norm": 12.573471760083594, + "learning_rate": 4.914588556383148e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8679393529891968, + "step": 3502 + }, + { + "epoch": 1.7515, + "grad_norm": 2.25884603958536, + "learning_rate": 4.914475441382804e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8848762512207031, + "step": 3503 + }, + { + "epoch": 1.752, + "grad_norm": 5.037059937126727, + "learning_rate": 4.914362252833332e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.8518581986427307, + "step": 3504 + }, + { + "epoch": 1.7525, + "grad_norm": 4.160587345182576, + "learning_rate": 4.914248990738182e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8639909029006958, + "step": 3505 + }, + { + "epoch": 1.7530000000000001, + "grad_norm": 2.138625717838355, + "learning_rate": 4.914135655100801e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8973214030265808, + "step": 3506 + }, + { + "epoch": 1.7534999999999998, + "grad_norm": 2.908181712474071, + "learning_rate": 4.914022245924643e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8840746879577637, + "step": 3507 + }, + { + "epoch": 1.754, + "grad_norm": 2.5085508195565134, + "learning_rate": 4.913908763213162e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8777092695236206, + "step": 3508 + }, + { + "epoch": 1.7545, + "grad_norm": 3.0653024220839753, + "learning_rate": 4.9137952069698155e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8610504865646362, + "step": 3509 + }, + { + "epoch": 1.755, + "grad_norm": 2.5018708439942015, + "learning_rate": 4.913681577198063e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8699818849563599, + "step": 3510 + }, + { + "epoch": 1.7555, + "grad_norm": 5.510635447993565, + "learning_rate": 4.913567873901365e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.8980498313903809, + "step": 3511 + }, + { + "epoch": 1.756, + "grad_norm": 4.687851390241938, + "learning_rate": 4.913454097083185e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8884214162826538, + "step": 3512 + }, + { + "epoch": 1.7565, + "grad_norm": 2.4328853434474187, + "learning_rate": 4.91334024674699e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8846616148948669, + "step": 3513 + }, + { + "epoch": 1.7570000000000001, + "grad_norm": 2.147440296303554, + "learning_rate": 4.913226322896247e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8620144128799438, + "step": 3514 + }, + { + "epoch": 1.7574999999999998, + "grad_norm": 26.337707621452733, + "learning_rate": 4.913112325534426e-06, + "loss": 0.235, + "mean_token_accuracy": 0.9231609106063843, + "step": 3515 + }, + { + "epoch": 1.758, + "grad_norm": 3.3518039796124386, + "learning_rate": 4.9129982546650005e-06, + "loss": 0.3009, + "mean_token_accuracy": 0.89629065990448, + "step": 3516 + }, + { + "epoch": 1.7585, + "grad_norm": 1.9934174943305152, + "learning_rate": 4.912884110291445e-06, + "loss": 0.329, + "mean_token_accuracy": 0.895317792892456, + "step": 3517 + }, + { + "epoch": 1.759, + "grad_norm": 2.2229714061431673, + "learning_rate": 4.912769892417236e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8893450498580933, + "step": 3518 + }, + { + "epoch": 1.7595, + "grad_norm": 5.506086861883051, + "learning_rate": 4.912655601045854e-06, + "loss": 0.6165, + "mean_token_accuracy": 0.814342737197876, + "step": 3519 + }, + { + "epoch": 1.76, + "grad_norm": 4.984869322736597, + "learning_rate": 4.912541236180779e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8623431921005249, + "step": 3520 + }, + { + "epoch": 1.7605, + "grad_norm": 2.293606473938552, + "learning_rate": 4.912426797825496e-06, + "loss": 0.595, + "mean_token_accuracy": 0.8308408856391907, + "step": 3521 + }, + { + "epoch": 1.7610000000000001, + "grad_norm": 2.7157814692066053, + "learning_rate": 4.912312285983491e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8941872715950012, + "step": 3522 + }, + { + "epoch": 1.7614999999999998, + "grad_norm": 1.4718024215006575, + "learning_rate": 4.912197700658251e-06, + "loss": 0.2103, + "mean_token_accuracy": 0.928699791431427, + "step": 3523 + }, + { + "epoch": 1.762, + "grad_norm": 2.53442079629297, + "learning_rate": 4.912083041853267e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8939393758773804, + "step": 3524 + }, + { + "epoch": 1.7625, + "grad_norm": 4.150850849235547, + "learning_rate": 4.9119683095720325e-06, + "loss": 0.2975, + "mean_token_accuracy": 0.899493396282196, + "step": 3525 + }, + { + "epoch": 1.763, + "grad_norm": 3.3995927245433197, + "learning_rate": 4.911853503818042e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8855947852134705, + "step": 3526 + }, + { + "epoch": 1.7635, + "grad_norm": 2.336232339316802, + "learning_rate": 4.911738624594793e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8907809853553772, + "step": 3527 + }, + { + "epoch": 1.764, + "grad_norm": 2.23180210552452, + "learning_rate": 4.911623671905784e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8892145156860352, + "step": 3528 + }, + { + "epoch": 1.7645, + "grad_norm": 3.3407885864817177, + "learning_rate": 4.911508645754517e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8841893076896667, + "step": 3529 + }, + { + "epoch": 1.7650000000000001, + "grad_norm": 2.1501948340612778, + "learning_rate": 4.9113935461444955e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8895959854125977, + "step": 3530 + }, + { + "epoch": 1.7654999999999998, + "grad_norm": 17.050670382803734, + "learning_rate": 4.9112783730792265e-06, + "loss": 0.2879, + "mean_token_accuracy": 0.901294469833374, + "step": 3531 + }, + { + "epoch": 1.766, + "grad_norm": 5.883341898249555, + "learning_rate": 4.911163126562218e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8692213296890259, + "step": 3532 + }, + { + "epoch": 1.7665, + "grad_norm": 2.2111527171484284, + "learning_rate": 4.911047806596981e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8687315583229065, + "step": 3533 + }, + { + "epoch": 1.767, + "grad_norm": 2.246199853459659, + "learning_rate": 4.910932413187029e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8639510273933411, + "step": 3534 + }, + { + "epoch": 1.7675, + "grad_norm": 12.036741058826744, + "learning_rate": 4.910816946335875e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.9031696915626526, + "step": 3535 + }, + { + "epoch": 1.768, + "grad_norm": 4.181071111835289, + "learning_rate": 4.910701406047037e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.87217116355896, + "step": 3536 + }, + { + "epoch": 1.7685, + "grad_norm": 4.008377728663762, + "learning_rate": 4.910585792324035e-06, + "loss": 0.3076, + "mean_token_accuracy": 0.8995951414108276, + "step": 3537 + }, + { + "epoch": 1.7690000000000001, + "grad_norm": 2.723667426938206, + "learning_rate": 4.910470105170392e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8452528715133667, + "step": 3538 + }, + { + "epoch": 1.7694999999999999, + "grad_norm": 3.956439246218823, + "learning_rate": 4.91035434458963e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8808178901672363, + "step": 3539 + }, + { + "epoch": 1.77, + "grad_norm": 2.3509121147020067, + "learning_rate": 4.910238510585275e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8540785908699036, + "step": 3540 + }, + { + "epoch": 1.7705, + "grad_norm": 2.3524636832374117, + "learning_rate": 4.910122603160858e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8864802122116089, + "step": 3541 + }, + { + "epoch": 1.771, + "grad_norm": 2.518906427097686, + "learning_rate": 4.910006622319908e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8779012560844421, + "step": 3542 + }, + { + "epoch": 1.7715, + "grad_norm": 3.204355375867945, + "learning_rate": 4.909890568065958e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.841576099395752, + "step": 3543 + }, + { + "epoch": 1.772, + "grad_norm": 2.4037795976049146, + "learning_rate": 4.9097744404025435e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.9074305891990662, + "step": 3544 + }, + { + "epoch": 1.7725, + "grad_norm": 3.0539425545156647, + "learning_rate": 4.909658239333203e-06, + "loss": 0.3183, + "mean_token_accuracy": 0.8999144434928894, + "step": 3545 + }, + { + "epoch": 1.7730000000000001, + "grad_norm": 2.6717779701083835, + "learning_rate": 4.9095419648614735e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8929239511489868, + "step": 3546 + }, + { + "epoch": 1.7734999999999999, + "grad_norm": 2.201885758207391, + "learning_rate": 4.9094256169908995e-06, + "loss": 0.3022, + "mean_token_accuracy": 0.9043523073196411, + "step": 3547 + }, + { + "epoch": 1.774, + "grad_norm": 2.2072552771822576, + "learning_rate": 4.909309195725025e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8679022192955017, + "step": 3548 + }, + { + "epoch": 1.7745, + "grad_norm": 7.712837650774191, + "learning_rate": 4.909192701067394e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8956828117370605, + "step": 3549 + }, + { + "epoch": 1.775, + "grad_norm": 2.8180367716051253, + "learning_rate": 4.909076133021558e-06, + "loss": 0.2702, + "mean_token_accuracy": 0.9117221236228943, + "step": 3550 + }, + { + "epoch": 1.7755, + "grad_norm": 2.6505998848206422, + "learning_rate": 4.908959491591066e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8936833739280701, + "step": 3551 + }, + { + "epoch": 1.776, + "grad_norm": 97.79041229320646, + "learning_rate": 4.908842776779472e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8563884496688843, + "step": 3552 + }, + { + "epoch": 1.7765, + "grad_norm": 2.6942210127831756, + "learning_rate": 4.90872598859033e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8660093545913696, + "step": 3553 + }, + { + "epoch": 1.7770000000000001, + "grad_norm": 2.072584129074605, + "learning_rate": 4.9086091270272e-06, + "loss": 0.2731, + "mean_token_accuracy": 0.9086690545082092, + "step": 3554 + }, + { + "epoch": 1.7774999999999999, + "grad_norm": 2.9485881099942954, + "learning_rate": 4.9084921920936405e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8948960304260254, + "step": 3555 + }, + { + "epoch": 1.778, + "grad_norm": 2.5712431841048278, + "learning_rate": 4.908375183793212e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8745062947273254, + "step": 3556 + }, + { + "epoch": 1.7785, + "grad_norm": 2.723125322088821, + "learning_rate": 4.908258102129482e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8851259350776672, + "step": 3557 + }, + { + "epoch": 1.779, + "grad_norm": 1.8590463524663219, + "learning_rate": 4.908140947106014e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8926951289176941, + "step": 3558 + }, + { + "epoch": 1.7795, + "grad_norm": 2.3554983801630014, + "learning_rate": 4.908023718726378e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8729248642921448, + "step": 3559 + }, + { + "epoch": 1.78, + "grad_norm": 1.5747275408340056, + "learning_rate": 4.907906416994146e-06, + "loss": 0.2347, + "mean_token_accuracy": 0.918181836605072, + "step": 3560 + }, + { + "epoch": 1.7805, + "grad_norm": 2.1652832720322954, + "learning_rate": 4.907789041912889e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8854994177818298, + "step": 3561 + }, + { + "epoch": 1.7810000000000001, + "grad_norm": 111.96346765573699, + "learning_rate": 4.9076715934861844e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8738793730735779, + "step": 3562 + }, + { + "epoch": 1.7814999999999999, + "grad_norm": 2.272655745111058, + "learning_rate": 4.90755407171761e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8726319074630737, + "step": 3563 + }, + { + "epoch": 1.782, + "grad_norm": 2.1803939070821516, + "learning_rate": 4.907436476610743e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.9020775556564331, + "step": 3564 + }, + { + "epoch": 1.7825, + "grad_norm": 1.8819629361555354, + "learning_rate": 4.907318808169168e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8907870054244995, + "step": 3565 + }, + { + "epoch": 1.783, + "grad_norm": 2.461803290135532, + "learning_rate": 4.9072010663964695e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8915913701057434, + "step": 3566 + }, + { + "epoch": 1.7835, + "grad_norm": 2.404867861126476, + "learning_rate": 4.907083251296233e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8644661903381348, + "step": 3567 + }, + { + "epoch": 1.784, + "grad_norm": 4.290148866419256, + "learning_rate": 4.906965362872048e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8737502098083496, + "step": 3568 + }, + { + "epoch": 1.7845, + "grad_norm": 2.6000124218600886, + "learning_rate": 4.906847401127504e-06, + "loss": 0.5187, + "mean_token_accuracy": 0.8331117033958435, + "step": 3569 + }, + { + "epoch": 1.7850000000000001, + "grad_norm": 3.5954237644762364, + "learning_rate": 4.906729366066197e-06, + "loss": 0.2876, + "mean_token_accuracy": 0.9066780805587769, + "step": 3570 + }, + { + "epoch": 1.7854999999999999, + "grad_norm": 1.7050507595141176, + "learning_rate": 4.906611257691721e-06, + "loss": 0.2478, + "mean_token_accuracy": 0.9174824953079224, + "step": 3571 + }, + { + "epoch": 1.786, + "grad_norm": 1.8176712816255938, + "learning_rate": 4.906493076007674e-06, + "loss": 0.2421, + "mean_token_accuracy": 0.9213991761207581, + "step": 3572 + }, + { + "epoch": 1.7865, + "grad_norm": 4.481937827387511, + "learning_rate": 4.906374821017657e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8795714974403381, + "step": 3573 + }, + { + "epoch": 1.787, + "grad_norm": 3.754145734433105, + "learning_rate": 4.9062564927252695e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.885606050491333, + "step": 3574 + }, + { + "epoch": 1.7875, + "grad_norm": 7.793067944962258, + "learning_rate": 4.906138091134118e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8634374737739563, + "step": 3575 + }, + { + "epoch": 1.788, + "grad_norm": 3.3414863149245075, + "learning_rate": 4.90601961624781e-06, + "loss": 0.5131, + "mean_token_accuracy": 0.8384958505630493, + "step": 3576 + }, + { + "epoch": 1.7885, + "grad_norm": 3.182074603978566, + "learning_rate": 4.905901068069953e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8764011859893799, + "step": 3577 + }, + { + "epoch": 1.7890000000000001, + "grad_norm": 2.792701043428192, + "learning_rate": 4.905782446604159e-06, + "loss": 0.2229, + "mean_token_accuracy": 0.9233233332633972, + "step": 3578 + }, + { + "epoch": 1.7894999999999999, + "grad_norm": 1.845921805190484, + "learning_rate": 4.90566375185404e-06, + "loss": 0.2902, + "mean_token_accuracy": 0.8977920413017273, + "step": 3579 + }, + { + "epoch": 1.79, + "grad_norm": 2.6229661628979244, + "learning_rate": 4.905544983823214e-06, + "loss": 0.515, + "mean_token_accuracy": 0.83562833070755, + "step": 3580 + }, + { + "epoch": 1.7905, + "grad_norm": 2.951934377208503, + "learning_rate": 4.9054261425152966e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8858920931816101, + "step": 3581 + }, + { + "epoch": 1.791, + "grad_norm": 5.045511435984276, + "learning_rate": 4.905307227933909e-06, + "loss": 0.2808, + "mean_token_accuracy": 0.8971032500267029, + "step": 3582 + }, + { + "epoch": 1.7915, + "grad_norm": 16.292998515061182, + "learning_rate": 4.9051882400826736e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8819642663002014, + "step": 3583 + }, + { + "epoch": 1.792, + "grad_norm": 1.6972560936182877, + "learning_rate": 4.905069178965215e-06, + "loss": 0.2768, + "mean_token_accuracy": 0.9057260155677795, + "step": 3584 + }, + { + "epoch": 1.7925, + "grad_norm": 3.230982622335555, + "learning_rate": 4.904950044585159e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8611582517623901, + "step": 3585 + }, + { + "epoch": 1.7930000000000001, + "grad_norm": 2.063551053295106, + "learning_rate": 4.904830836946137e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.878055989742279, + "step": 3586 + }, + { + "epoch": 1.7934999999999999, + "grad_norm": 2.6205991683806005, + "learning_rate": 4.904711556051778e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8662400245666504, + "step": 3587 + }, + { + "epoch": 1.794, + "grad_norm": 2.496339969281909, + "learning_rate": 4.904592201905716e-06, + "loss": 0.5339, + "mean_token_accuracy": 0.8391849994659424, + "step": 3588 + }, + { + "epoch": 1.7945, + "grad_norm": 2.5255084439317224, + "learning_rate": 4.9044727745115875e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8657499551773071, + "step": 3589 + }, + { + "epoch": 1.795, + "grad_norm": 5.577290523049574, + "learning_rate": 4.904353273873029e-06, + "loss": 0.2927, + "mean_token_accuracy": 0.9123328328132629, + "step": 3590 + }, + { + "epoch": 1.7955, + "grad_norm": 2.9656185647842093, + "learning_rate": 4.904233699993681e-06, + "loss": 0.5302, + "mean_token_accuracy": 0.8402270674705505, + "step": 3591 + }, + { + "epoch": 1.796, + "grad_norm": 6.882405219320378, + "learning_rate": 4.904114052877189e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8502247333526611, + "step": 3592 + }, + { + "epoch": 1.7965, + "grad_norm": 2.361701105491652, + "learning_rate": 4.9039943325271935e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8531965017318726, + "step": 3593 + }, + { + "epoch": 1.7970000000000002, + "grad_norm": 2.8204749307404287, + "learning_rate": 4.903874538947343e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.86265629529953, + "step": 3594 + }, + { + "epoch": 1.7974999999999999, + "grad_norm": 3.052124994511247, + "learning_rate": 4.903754672141288e-06, + "loss": 0.446, + "mean_token_accuracy": 0.871099054813385, + "step": 3595 + }, + { + "epoch": 1.798, + "grad_norm": 5.3089703971183875, + "learning_rate": 4.9036347321126776e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.890113890171051, + "step": 3596 + }, + { + "epoch": 1.7985, + "grad_norm": 3.01395108178013, + "learning_rate": 4.903514718865166e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8599445223808289, + "step": 3597 + }, + { + "epoch": 1.799, + "grad_norm": 3.881255213374671, + "learning_rate": 4.9033946324024105e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8820651173591614, + "step": 3598 + }, + { + "epoch": 1.7995, + "grad_norm": 2.5838558208992657, + "learning_rate": 4.903274472728067e-06, + "loss": 0.2395, + "mean_token_accuracy": 0.9128139615058899, + "step": 3599 + }, + { + "epoch": 1.8, + "grad_norm": 1.9993348093631342, + "learning_rate": 4.903154239845798e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8494008183479309, + "step": 3600 + }, + { + "epoch": 1.8005, + "grad_norm": 5.266411010158874, + "learning_rate": 4.903033933759264e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8550357818603516, + "step": 3601 + }, + { + "epoch": 1.8010000000000002, + "grad_norm": 2.3137686728553626, + "learning_rate": 4.90291355447213e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8785519599914551, + "step": 3602 + }, + { + "epoch": 1.8014999999999999, + "grad_norm": 4.515597561473421, + "learning_rate": 4.902793101988064e-06, + "loss": 0.3005, + "mean_token_accuracy": 0.8978102207183838, + "step": 3603 + }, + { + "epoch": 1.802, + "grad_norm": 3.4305360186462797, + "learning_rate": 4.902672576310735e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8621436357498169, + "step": 3604 + }, + { + "epoch": 1.8025, + "grad_norm": 2.894504379691539, + "learning_rate": 4.902551977443813e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8890253901481628, + "step": 3605 + }, + { + "epoch": 1.803, + "grad_norm": 3.311636315835142, + "learning_rate": 4.9024313053909745e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8779069781303406, + "step": 3606 + }, + { + "epoch": 1.8035, + "grad_norm": 2.7449173864333094, + "learning_rate": 4.902310560155893e-06, + "loss": 0.6262, + "mean_token_accuracy": 0.8295146226882935, + "step": 3607 + }, + { + "epoch": 1.804, + "grad_norm": 2.626806135998862, + "learning_rate": 4.902189741742247e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.882520318031311, + "step": 3608 + }, + { + "epoch": 1.8045, + "grad_norm": 2.616053191041337, + "learning_rate": 4.902068850153717e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8658942580223083, + "step": 3609 + }, + { + "epoch": 1.8050000000000002, + "grad_norm": 3.3075145914277417, + "learning_rate": 4.901947885393986e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8889598846435547, + "step": 3610 + }, + { + "epoch": 1.8054999999999999, + "grad_norm": 6.650682539805383, + "learning_rate": 4.901826847466738e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8750640153884888, + "step": 3611 + }, + { + "epoch": 1.806, + "grad_norm": 4.4620487457086915, + "learning_rate": 4.9017057363756604e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.87446528673172, + "step": 3612 + }, + { + "epoch": 1.8065, + "grad_norm": 2.31682246225349, + "learning_rate": 4.901584552124443e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8621811270713806, + "step": 3613 + }, + { + "epoch": 1.807, + "grad_norm": 1.6289504320060795, + "learning_rate": 4.901463294716777e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8890167474746704, + "step": 3614 + }, + { + "epoch": 1.8075, + "grad_norm": 2.235063638339916, + "learning_rate": 4.901341964156356e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8925275206565857, + "step": 3615 + }, + { + "epoch": 1.808, + "grad_norm": 3.284231465213133, + "learning_rate": 4.901220560446875e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8744341731071472, + "step": 3616 + }, + { + "epoch": 1.8085, + "grad_norm": 6.428790153901921, + "learning_rate": 4.901099083592033e-06, + "loss": 0.2404, + "mean_token_accuracy": 0.9236836433410645, + "step": 3617 + }, + { + "epoch": 1.8090000000000002, + "grad_norm": 3.711472185269346, + "learning_rate": 4.900977533595531e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.864130437374115, + "step": 3618 + }, + { + "epoch": 1.8094999999999999, + "grad_norm": 2.1590435750359767, + "learning_rate": 4.900855910461071e-06, + "loss": 0.2958, + "mean_token_accuracy": 0.9024289846420288, + "step": 3619 + }, + { + "epoch": 1.81, + "grad_norm": 2.676324121784231, + "learning_rate": 4.900734214192358e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8690671324729919, + "step": 3620 + }, + { + "epoch": 1.8105, + "grad_norm": 2.750868225741896, + "learning_rate": 4.900612444793099e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8791607618331909, + "step": 3621 + }, + { + "epoch": 1.811, + "grad_norm": 2.0253148472302867, + "learning_rate": 4.900490602267003e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.8971284031867981, + "step": 3622 + }, + { + "epoch": 1.8115, + "grad_norm": 2.5529181651907513, + "learning_rate": 4.9003686866177825e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8842251896858215, + "step": 3623 + }, + { + "epoch": 1.812, + "grad_norm": 2.4353860092086532, + "learning_rate": 4.90024669784915e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8807981610298157, + "step": 3624 + }, + { + "epoch": 1.8125, + "grad_norm": 2.157522605254025, + "learning_rate": 4.900124635964823e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8929879069328308, + "step": 3625 + }, + { + "epoch": 1.813, + "grad_norm": 1.9063387286450333, + "learning_rate": 4.900002500968517e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8777980208396912, + "step": 3626 + }, + { + "epoch": 1.8135, + "grad_norm": 6.120620949383015, + "learning_rate": 4.899880292863955e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8865832090377808, + "step": 3627 + }, + { + "epoch": 1.814, + "grad_norm": 2.150412441598849, + "learning_rate": 4.899758011654859e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8885454535484314, + "step": 3628 + }, + { + "epoch": 1.8145, + "grad_norm": 2.0584251240441076, + "learning_rate": 4.899635657344955e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8898712396621704, + "step": 3629 + }, + { + "epoch": 1.815, + "grad_norm": 2.136089467988746, + "learning_rate": 4.899513229937968e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8892531991004944, + "step": 3630 + }, + { + "epoch": 1.8155000000000001, + "grad_norm": 2.895162524274102, + "learning_rate": 4.899390729437628e-06, + "loss": 0.2017, + "mean_token_accuracy": 0.9319304823875427, + "step": 3631 + }, + { + "epoch": 1.8159999999999998, + "grad_norm": 15.349263846566954, + "learning_rate": 4.899268155847667e-06, + "loss": 0.2032, + "mean_token_accuracy": 0.9292035102844238, + "step": 3632 + }, + { + "epoch": 1.8165, + "grad_norm": 2.794715581489953, + "learning_rate": 4.899145509171819e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8550085425376892, + "step": 3633 + }, + { + "epoch": 1.817, + "grad_norm": 12.239479147683626, + "learning_rate": 4.89902278941382e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8820536732673645, + "step": 3634 + }, + { + "epoch": 1.8175, + "grad_norm": 2.9206953578414545, + "learning_rate": 4.898899996577407e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8740513324737549, + "step": 3635 + }, + { + "epoch": 1.818, + "grad_norm": 3.932950215096369, + "learning_rate": 4.898777130666322e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8955168724060059, + "step": 3636 + }, + { + "epoch": 1.8185, + "grad_norm": 2.08872763040904, + "learning_rate": 4.8986541916843075e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8803656101226807, + "step": 3637 + }, + { + "epoch": 1.819, + "grad_norm": 2.6608651775166567, + "learning_rate": 4.898531179635107e-06, + "loss": 0.5577, + "mean_token_accuracy": 0.8364625573158264, + "step": 3638 + }, + { + "epoch": 1.8195000000000001, + "grad_norm": 2.1564336798355717, + "learning_rate": 4.89840809452247e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8891732096672058, + "step": 3639 + }, + { + "epoch": 1.8199999999999998, + "grad_norm": 2.4158214204225015, + "learning_rate": 4.898284936350144e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8795577883720398, + "step": 3640 + }, + { + "epoch": 1.8205, + "grad_norm": 4.017626024314173, + "learning_rate": 4.8981617051218815e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.892317533493042, + "step": 3641 + }, + { + "epoch": 1.821, + "grad_norm": 2.904884324821749, + "learning_rate": 4.8980384008414365e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8829052448272705, + "step": 3642 + }, + { + "epoch": 1.8215, + "grad_norm": 1.7935504823305037, + "learning_rate": 4.8979150235125635e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8963687419891357, + "step": 3643 + }, + { + "epoch": 1.822, + "grad_norm": 1.75234074767346, + "learning_rate": 4.897791573139023e-06, + "loss": 0.2616, + "mean_token_accuracy": 0.9105986952781677, + "step": 3644 + }, + { + "epoch": 1.8225, + "grad_norm": 3.0196060474573576, + "learning_rate": 4.897668049724574e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8808092474937439, + "step": 3645 + }, + { + "epoch": 1.823, + "grad_norm": 2.8923861159247046, + "learning_rate": 4.8975444532729796e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8582653999328613, + "step": 3646 + }, + { + "epoch": 1.8235000000000001, + "grad_norm": 5.564695645410033, + "learning_rate": 4.897420783788005e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.9007249474525452, + "step": 3647 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 3.6767162134558307, + "learning_rate": 4.8972970412734174e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8857576847076416, + "step": 3648 + }, + { + "epoch": 1.8245, + "grad_norm": 4.43782608734605, + "learning_rate": 4.897173225732986e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.8946759104728699, + "step": 3649 + }, + { + "epoch": 1.825, + "grad_norm": 2.783125000853455, + "learning_rate": 4.897049337170483e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8670135736465454, + "step": 3650 + }, + { + "epoch": 1.8255, + "grad_norm": 2.6137564536827016, + "learning_rate": 4.896925375589681e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8658928275108337, + "step": 3651 + }, + { + "epoch": 1.826, + "grad_norm": 2.6020071999029772, + "learning_rate": 4.896801340994357e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8686990737915039, + "step": 3652 + }, + { + "epoch": 1.8265, + "grad_norm": 45.89523596823952, + "learning_rate": 4.896677233388289e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8912662863731384, + "step": 3653 + }, + { + "epoch": 1.827, + "grad_norm": 2.205076266202999, + "learning_rate": 4.896553052775259e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8727242946624756, + "step": 3654 + }, + { + "epoch": 1.8275000000000001, + "grad_norm": 1.579379729824284, + "learning_rate": 4.896428799159048e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8905007839202881, + "step": 3655 + }, + { + "epoch": 1.8279999999999998, + "grad_norm": 2.4026547843180395, + "learning_rate": 4.89630447254344e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8628926277160645, + "step": 3656 + }, + { + "epoch": 1.8285, + "grad_norm": 1.6405790742827402, + "learning_rate": 4.8961800729322245e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8925119042396545, + "step": 3657 + }, + { + "epoch": 1.829, + "grad_norm": 2.3849080102847733, + "learning_rate": 4.89605560032919e-06, + "loss": 0.2729, + "mean_token_accuracy": 0.9139452576637268, + "step": 3658 + }, + { + "epoch": 1.8295, + "grad_norm": 3.0255061712188156, + "learning_rate": 4.895931054738129e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8888888955116272, + "step": 3659 + }, + { + "epoch": 1.83, + "grad_norm": 4.404482346483019, + "learning_rate": 4.8958064361628334e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.8929362297058105, + "step": 3660 + }, + { + "epoch": 1.8305, + "grad_norm": 2.7083931602716715, + "learning_rate": 4.895681744607102e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8709502220153809, + "step": 3661 + }, + { + "epoch": 1.831, + "grad_norm": 4.7051291770448564, + "learning_rate": 4.895556980074729e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8876017332077026, + "step": 3662 + }, + { + "epoch": 1.8315000000000001, + "grad_norm": 2.9824505809639743, + "learning_rate": 4.89543214256952e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8649977445602417, + "step": 3663 + }, + { + "epoch": 1.8319999999999999, + "grad_norm": 2.583294935500856, + "learning_rate": 4.895307232095275e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.8983824849128723, + "step": 3664 + }, + { + "epoch": 1.8325, + "grad_norm": 2.095752636465058, + "learning_rate": 4.8951822486557985e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8517094254493713, + "step": 3665 + }, + { + "epoch": 1.833, + "grad_norm": 2.0653398724095564, + "learning_rate": 4.895057192254898e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8897876739501953, + "step": 3666 + }, + { + "epoch": 1.8335, + "grad_norm": 2.879422652699201, + "learning_rate": 4.8949320628963844e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.895756721496582, + "step": 3667 + }, + { + "epoch": 1.834, + "grad_norm": 3.265890138853578, + "learning_rate": 4.894806860584069e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.8474011421203613, + "step": 3668 + }, + { + "epoch": 1.8345, + "grad_norm": 2.2799393458467807, + "learning_rate": 4.8946815853217644e-06, + "loss": 0.2211, + "mean_token_accuracy": 0.9231561422348022, + "step": 3669 + }, + { + "epoch": 1.835, + "grad_norm": 2.3077738546302546, + "learning_rate": 4.894556237113287e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8689014315605164, + "step": 3670 + }, + { + "epoch": 1.8355000000000001, + "grad_norm": 13.238282370270065, + "learning_rate": 4.894430815962456e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8962599635124207, + "step": 3671 + }, + { + "epoch": 1.8359999999999999, + "grad_norm": 2.4704036713039996, + "learning_rate": 4.894305321873092e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8813559412956238, + "step": 3672 + }, + { + "epoch": 1.8365, + "grad_norm": 2.2773047355097176, + "learning_rate": 4.894179754849016e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8802684545516968, + "step": 3673 + }, + { + "epoch": 1.837, + "grad_norm": 1.646438651367129, + "learning_rate": 4.894054114894056e-06, + "loss": 0.2821, + "mean_token_accuracy": 0.897641658782959, + "step": 3674 + }, + { + "epoch": 1.8375, + "grad_norm": 2.3861081452686936, + "learning_rate": 4.8939284020120365e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.870916485786438, + "step": 3675 + }, + { + "epoch": 1.838, + "grad_norm": 1.6210285601772099, + "learning_rate": 4.893802616206788e-06, + "loss": 0.2402, + "mean_token_accuracy": 0.9089133739471436, + "step": 3676 + }, + { + "epoch": 1.8385, + "grad_norm": 5.734133173222422, + "learning_rate": 4.893676757482142e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8856332898139954, + "step": 3677 + }, + { + "epoch": 1.839, + "grad_norm": 1.9873985854716696, + "learning_rate": 4.893550825841932e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8812891840934753, + "step": 3678 + }, + { + "epoch": 1.8395000000000001, + "grad_norm": 2.866910408560939, + "learning_rate": 4.893424821289995e-06, + "loss": 0.6105, + "mean_token_accuracy": 0.8416864275932312, + "step": 3679 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 3.3058668298199114, + "learning_rate": 4.893298743830168e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.86663419008255, + "step": 3680 + }, + { + "epoch": 1.8405, + "grad_norm": 2.6401928931113554, + "learning_rate": 4.893172593466293e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.892926812171936, + "step": 3681 + }, + { + "epoch": 1.841, + "grad_norm": 2.564987326352383, + "learning_rate": 4.893046370202212e-06, + "loss": 0.5335, + "mean_token_accuracy": 0.8262300491333008, + "step": 3682 + }, + { + "epoch": 1.8415, + "grad_norm": 2.9853912544247163, + "learning_rate": 4.892920074041771e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8810679316520691, + "step": 3683 + }, + { + "epoch": 1.842, + "grad_norm": 7.445036760638237, + "learning_rate": 4.892793704988816e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8861419558525085, + "step": 3684 + }, + { + "epoch": 1.8425, + "grad_norm": 2.5068393451507487, + "learning_rate": 4.892667263047196e-06, + "loss": 0.3132, + "mean_token_accuracy": 0.8968105316162109, + "step": 3685 + }, + { + "epoch": 1.843, + "grad_norm": 2.3891957239071306, + "learning_rate": 4.892540748220764e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8799630403518677, + "step": 3686 + }, + { + "epoch": 1.8435000000000001, + "grad_norm": 3.3056502795710596, + "learning_rate": 4.892414160513373e-06, + "loss": 0.2999, + "mean_token_accuracy": 0.8942824006080627, + "step": 3687 + }, + { + "epoch": 1.8439999999999999, + "grad_norm": 2.492747959719474, + "learning_rate": 4.892287499928879e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8886756300926208, + "step": 3688 + }, + { + "epoch": 1.8445, + "grad_norm": 1.7548254595232144, + "learning_rate": 4.8921607664711415e-06, + "loss": 0.2394, + "mean_token_accuracy": 0.9174385666847229, + "step": 3689 + }, + { + "epoch": 1.845, + "grad_norm": 2.176098033960271, + "learning_rate": 4.89203396014402e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8906542062759399, + "step": 3690 + }, + { + "epoch": 1.8455, + "grad_norm": 1.703467274442565, + "learning_rate": 4.8919070809513755e-06, + "loss": 0.2791, + "mean_token_accuracy": 0.9059359431266785, + "step": 3691 + }, + { + "epoch": 1.846, + "grad_norm": 2.361475512663453, + "learning_rate": 4.891780128897077e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.865692675113678, + "step": 3692 + }, + { + "epoch": 1.8465, + "grad_norm": 2.698959600263957, + "learning_rate": 4.891653103984988e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8859598636627197, + "step": 3693 + }, + { + "epoch": 1.847, + "grad_norm": 2.6726449022718324, + "learning_rate": 4.891526006218981e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.9099516868591309, + "step": 3694 + }, + { + "epoch": 1.8475000000000001, + "grad_norm": 7.4196726934481765, + "learning_rate": 4.891398835602925e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.882363498210907, + "step": 3695 + }, + { + "epoch": 1.8479999999999999, + "grad_norm": 1.9196988054502535, + "learning_rate": 4.891271592140695e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8819543123245239, + "step": 3696 + }, + { + "epoch": 1.8485, + "grad_norm": 2.3853127963583596, + "learning_rate": 4.8911442758361675e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8797019720077515, + "step": 3697 + }, + { + "epoch": 1.849, + "grad_norm": 2.066441421909772, + "learning_rate": 4.891016886693219e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8882092833518982, + "step": 3698 + }, + { + "epoch": 1.8495, + "grad_norm": 1.782769256868171, + "learning_rate": 4.8908894247157325e-06, + "loss": 0.2913, + "mean_token_accuracy": 0.8987758159637451, + "step": 3699 + }, + { + "epoch": 1.85, + "grad_norm": 6.752244654400723, + "learning_rate": 4.890761889907589e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8949806690216064, + "step": 3700 + }, + { + "epoch": 1.8505, + "grad_norm": 2.372771268106091, + "learning_rate": 4.890634282272674e-06, + "loss": 0.259, + "mean_token_accuracy": 0.9103516340255737, + "step": 3701 + }, + { + "epoch": 1.851, + "grad_norm": 2.692695585490036, + "learning_rate": 4.890506601814874e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8840347528457642, + "step": 3702 + }, + { + "epoch": 1.8515000000000001, + "grad_norm": 2.8639629617549893, + "learning_rate": 4.8903788485380795e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8968043923377991, + "step": 3703 + }, + { + "epoch": 1.8519999999999999, + "grad_norm": 2.7808615848122216, + "learning_rate": 4.890251022446181e-06, + "loss": 0.2319, + "mean_token_accuracy": 0.9242990612983704, + "step": 3704 + }, + { + "epoch": 1.8525, + "grad_norm": 4.171900358739642, + "learning_rate": 4.890123123543074e-06, + "loss": 0.333, + "mean_token_accuracy": 0.9025893807411194, + "step": 3705 + }, + { + "epoch": 1.853, + "grad_norm": 4.904168586604766, + "learning_rate": 4.889995151832652e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.9010174870491028, + "step": 3706 + }, + { + "epoch": 1.8535, + "grad_norm": 4.424063229501631, + "learning_rate": 4.8898671073188145e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8413336277008057, + "step": 3707 + }, + { + "epoch": 1.854, + "grad_norm": 1.7180149981741373, + "learning_rate": 4.889738990005462e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.877920925617218, + "step": 3708 + }, + { + "epoch": 1.8545, + "grad_norm": 2.808679807319678, + "learning_rate": 4.889610799896498e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8698351383209229, + "step": 3709 + }, + { + "epoch": 1.855, + "grad_norm": 2.8100151533836515, + "learning_rate": 4.889482536995826e-06, + "loss": 0.3055, + "mean_token_accuracy": 0.9060846567153931, + "step": 3710 + }, + { + "epoch": 1.8555000000000001, + "grad_norm": 2.4191392062980803, + "learning_rate": 4.889354201307354e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8589984774589539, + "step": 3711 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 1.7906841706260783, + "learning_rate": 4.889225792834991e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8459206819534302, + "step": 3712 + }, + { + "epoch": 1.8565, + "grad_norm": 1.2956166698379097, + "learning_rate": 4.889097311582648e-06, + "loss": 0.2545, + "mean_token_accuracy": 0.9115486145019531, + "step": 3713 + }, + { + "epoch": 1.857, + "grad_norm": 2.1911972564716815, + "learning_rate": 4.888968757554239e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8811091780662537, + "step": 3714 + }, + { + "epoch": 1.8575, + "grad_norm": 2.3413155430543955, + "learning_rate": 4.888840130753681e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8661430478096008, + "step": 3715 + }, + { + "epoch": 1.858, + "grad_norm": 3.514856475396425, + "learning_rate": 4.888711431184891e-06, + "loss": 0.2415, + "mean_token_accuracy": 0.9156243801116943, + "step": 3716 + }, + { + "epoch": 1.8585, + "grad_norm": 3.165608024762197, + "learning_rate": 4.88858265885179e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8699172735214233, + "step": 3717 + }, + { + "epoch": 1.859, + "grad_norm": 1.5052031598787556, + "learning_rate": 4.888453813758302e-06, + "loss": 0.2549, + "mean_token_accuracy": 0.908481776714325, + "step": 3718 + }, + { + "epoch": 1.8595000000000002, + "grad_norm": 3.875711329004097, + "learning_rate": 4.888324895908349e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8763312101364136, + "step": 3719 + }, + { + "epoch": 1.8599999999999999, + "grad_norm": 2.2722109081943254, + "learning_rate": 4.888195905305859e-06, + "loss": 0.3011, + "mean_token_accuracy": 0.9047116637229919, + "step": 3720 + }, + { + "epoch": 1.8605, + "grad_norm": 5.975972546283572, + "learning_rate": 4.888066841954763e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8971717357635498, + "step": 3721 + }, + { + "epoch": 1.861, + "grad_norm": 1.5574342582517764, + "learning_rate": 4.887937705858991e-06, + "loss": 0.311, + "mean_token_accuracy": 0.8952457904815674, + "step": 3722 + }, + { + "epoch": 1.8615, + "grad_norm": 2.223382439350317, + "learning_rate": 4.887808497022476e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8797228932380676, + "step": 3723 + }, + { + "epoch": 1.862, + "grad_norm": 2.465808506904029, + "learning_rate": 4.887679215449156e-06, + "loss": 0.2948, + "mean_token_accuracy": 0.9035007953643799, + "step": 3724 + }, + { + "epoch": 1.8625, + "grad_norm": 2.6443130446810312, + "learning_rate": 4.887549861142967e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8977346420288086, + "step": 3725 + }, + { + "epoch": 1.863, + "grad_norm": 5.888430963683827, + "learning_rate": 4.88742043410785e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8730273842811584, + "step": 3726 + }, + { + "epoch": 1.8635000000000002, + "grad_norm": 3.2461874620866302, + "learning_rate": 4.8872909343477495e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8806735277175903, + "step": 3727 + }, + { + "epoch": 1.8639999999999999, + "grad_norm": 3.8408403393152675, + "learning_rate": 4.887161361866608e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8878458738327026, + "step": 3728 + }, + { + "epoch": 1.8645, + "grad_norm": 2.612484142507569, + "learning_rate": 4.887031716668373e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.9009832739830017, + "step": 3729 + }, + { + "epoch": 1.865, + "grad_norm": 2.635295168614986, + "learning_rate": 4.886901998756995e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8918631672859192, + "step": 3730 + }, + { + "epoch": 1.8655, + "grad_norm": 2.0453292617355507, + "learning_rate": 4.886772208136422e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8576335310935974, + "step": 3731 + }, + { + "epoch": 1.866, + "grad_norm": 1.779279119793933, + "learning_rate": 4.886642344810612e-06, + "loss": 0.3059, + "mean_token_accuracy": 0.8981673121452332, + "step": 3732 + }, + { + "epoch": 1.8665, + "grad_norm": 1.9552174212508429, + "learning_rate": 4.886512408783518e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.9007754921913147, + "step": 3733 + }, + { + "epoch": 1.867, + "grad_norm": 2.283009577051473, + "learning_rate": 4.8863824000591e-06, + "loss": 0.5271, + "mean_token_accuracy": 0.8522279262542725, + "step": 3734 + }, + { + "epoch": 1.8675000000000002, + "grad_norm": 3.5177018410421255, + "learning_rate": 4.886252318641316e-06, + "loss": 0.2361, + "mean_token_accuracy": 0.9201083183288574, + "step": 3735 + }, + { + "epoch": 1.8679999999999999, + "grad_norm": 2.896919172706284, + "learning_rate": 4.8861221645341305e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8699594736099243, + "step": 3736 + }, + { + "epoch": 1.8685, + "grad_norm": 3.2289343748400143, + "learning_rate": 4.885991937741506e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8859556317329407, + "step": 3737 + }, + { + "epoch": 1.869, + "grad_norm": 2.061983532289894, + "learning_rate": 4.885861638267413e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.9048543572425842, + "step": 3738 + }, + { + "epoch": 1.8695, + "grad_norm": 2.963826444510005, + "learning_rate": 4.8857312661158176e-06, + "loss": 0.41, + "mean_token_accuracy": 0.863120973110199, + "step": 3739 + }, + { + "epoch": 1.87, + "grad_norm": 2.836132037445005, + "learning_rate": 4.885600821290692e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8825855255126953, + "step": 3740 + }, + { + "epoch": 1.8705, + "grad_norm": 3.3478566222793233, + "learning_rate": 4.885470303796011e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8920806050300598, + "step": 3741 + }, + { + "epoch": 1.871, + "grad_norm": 8.458747327318635, + "learning_rate": 4.885339713635748e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8679601550102234, + "step": 3742 + }, + { + "epoch": 1.8715000000000002, + "grad_norm": 1.7056152850825503, + "learning_rate": 4.8852090508138825e-06, + "loss": 0.2065, + "mean_token_accuracy": 0.9287410974502563, + "step": 3743 + }, + { + "epoch": 1.8719999999999999, + "grad_norm": 2.601323415225733, + "learning_rate": 4.885078315334395e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8699161410331726, + "step": 3744 + }, + { + "epoch": 1.8725, + "grad_norm": 2.944767096040863, + "learning_rate": 4.884947507201268e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8400185108184814, + "step": 3745 + }, + { + "epoch": 1.873, + "grad_norm": 3.244411399591097, + "learning_rate": 4.8848166264184844e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.890666663646698, + "step": 3746 + }, + { + "epoch": 1.8735, + "grad_norm": 1.6665192200466343, + "learning_rate": 4.884685672990033e-06, + "loss": 0.2611, + "mean_token_accuracy": 0.9094051718711853, + "step": 3747 + }, + { + "epoch": 1.874, + "grad_norm": 2.1936353425543964, + "learning_rate": 4.884554646919901e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8886656761169434, + "step": 3748 + }, + { + "epoch": 1.8745, + "grad_norm": 2.0414560691926993, + "learning_rate": 4.8844235482120814e-06, + "loss": 0.2681, + "mean_token_accuracy": 0.9077757596969604, + "step": 3749 + }, + { + "epoch": 1.875, + "grad_norm": 2.4195003355048605, + "learning_rate": 4.884292376870567e-06, + "loss": 0.2835, + "mean_token_accuracy": 0.9145504832267761, + "step": 3750 + }, + { + "epoch": 1.8755, + "grad_norm": 1.9658631144785437, + "learning_rate": 4.884161132899354e-06, + "loss": 0.2614, + "mean_token_accuracy": 0.9061349630355835, + "step": 3751 + }, + { + "epoch": 1.876, + "grad_norm": 2.0467588275542363, + "learning_rate": 4.884029816302441e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8636146187782288, + "step": 3752 + }, + { + "epoch": 1.8765, + "grad_norm": 2.5319600525709713, + "learning_rate": 4.883898427083825e-06, + "loss": 0.2777, + "mean_token_accuracy": 0.9098377227783203, + "step": 3753 + }, + { + "epoch": 1.877, + "grad_norm": 2.717899750869039, + "learning_rate": 4.8837669652475116e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8761181235313416, + "step": 3754 + }, + { + "epoch": 1.8775, + "grad_norm": 2.0886479563153246, + "learning_rate": 4.883635430797503e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8841232061386108, + "step": 3755 + }, + { + "epoch": 1.8780000000000001, + "grad_norm": 3.2121694421169438, + "learning_rate": 4.883503823737809e-06, + "loss": 0.3103, + "mean_token_accuracy": 0.8975706100463867, + "step": 3756 + }, + { + "epoch": 1.8784999999999998, + "grad_norm": 2.472485765882339, + "learning_rate": 4.883372144072434e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8819032907485962, + "step": 3757 + }, + { + "epoch": 1.879, + "grad_norm": 3.5368379869632127, + "learning_rate": 4.883240391805394e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8680976033210754, + "step": 3758 + }, + { + "epoch": 1.8795, + "grad_norm": 5.948120721701247, + "learning_rate": 4.8831085669407e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8899667263031006, + "step": 3759 + }, + { + "epoch": 1.88, + "grad_norm": 2.0625092073448665, + "learning_rate": 4.882976669482368e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.886313259601593, + "step": 3760 + }, + { + "epoch": 1.8805, + "grad_norm": 5.860873842850521, + "learning_rate": 4.882844699434415e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8732197880744934, + "step": 3761 + }, + { + "epoch": 1.881, + "grad_norm": 1.9619057245463216, + "learning_rate": 4.882712656800863e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8889680504798889, + "step": 3762 + }, + { + "epoch": 1.8815, + "grad_norm": 3.626285697226697, + "learning_rate": 4.882580541585732e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8860214948654175, + "step": 3763 + }, + { + "epoch": 1.8820000000000001, + "grad_norm": 7.821503670428803, + "learning_rate": 4.882448353793048e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8773069381713867, + "step": 3764 + }, + { + "epoch": 1.8824999999999998, + "grad_norm": 7.9907854754735235, + "learning_rate": 4.8823160934268365e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8664578199386597, + "step": 3765 + }, + { + "epoch": 1.883, + "grad_norm": 2.5802544110990238, + "learning_rate": 4.8821837604911275e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.8446617722511292, + "step": 3766 + }, + { + "epoch": 1.8835, + "grad_norm": 2.806675059391199, + "learning_rate": 4.882051354989951e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8421052694320679, + "step": 3767 + }, + { + "epoch": 1.884, + "grad_norm": 16.939869598054017, + "learning_rate": 4.881918876927342e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8865395784378052, + "step": 3768 + }, + { + "epoch": 1.8845, + "grad_norm": 4.089655392624302, + "learning_rate": 4.881786326307334e-06, + "loss": 0.266, + "mean_token_accuracy": 0.9144129157066345, + "step": 3769 + }, + { + "epoch": 1.885, + "grad_norm": 1.8040354049059053, + "learning_rate": 4.881653703133966e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8886067867279053, + "step": 3770 + }, + { + "epoch": 1.8855, + "grad_norm": 2.4759424201325038, + "learning_rate": 4.881521007411278e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8612191677093506, + "step": 3771 + }, + { + "epoch": 1.8860000000000001, + "grad_norm": 2.662927801569694, + "learning_rate": 4.881388239143311e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8572513461112976, + "step": 3772 + }, + { + "epoch": 1.8864999999999998, + "grad_norm": 2.2929209269652033, + "learning_rate": 4.881255398334111e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8838809728622437, + "step": 3773 + }, + { + "epoch": 1.887, + "grad_norm": 4.437037653276401, + "learning_rate": 4.881122484987723e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8848758339881897, + "step": 3774 + }, + { + "epoch": 1.8875, + "grad_norm": 5.312472697191217, + "learning_rate": 4.880989499108196e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8844967484474182, + "step": 3775 + }, + { + "epoch": 1.888, + "grad_norm": 3.4861427567279257, + "learning_rate": 4.880856440699582e-06, + "loss": 0.5205, + "mean_token_accuracy": 0.8456299901008606, + "step": 3776 + }, + { + "epoch": 1.8885, + "grad_norm": 2.7514531704889245, + "learning_rate": 4.880723309765933e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8806060552597046, + "step": 3777 + }, + { + "epoch": 1.889, + "grad_norm": 2.9836776060297465, + "learning_rate": 4.8805901063113064e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8907103538513184, + "step": 3778 + }, + { + "epoch": 1.8895, + "grad_norm": 2.0292852482848374, + "learning_rate": 4.880456830339757e-06, + "loss": 0.239, + "mean_token_accuracy": 0.9188640713691711, + "step": 3779 + }, + { + "epoch": 1.8900000000000001, + "grad_norm": 2.6827492068019696, + "learning_rate": 4.880323481855347e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8768492937088013, + "step": 3780 + }, + { + "epoch": 1.8904999999999998, + "grad_norm": 4.187035992744743, + "learning_rate": 4.8801900608621375e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8502031564712524, + "step": 3781 + }, + { + "epoch": 1.891, + "grad_norm": 2.1620570003047357, + "learning_rate": 4.880056567364192e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8523895740509033, + "step": 3782 + }, + { + "epoch": 1.8915, + "grad_norm": 3.1825599090468786, + "learning_rate": 4.879923001365578e-06, + "loss": 0.5298, + "mean_token_accuracy": 0.8401885032653809, + "step": 3783 + }, + { + "epoch": 1.892, + "grad_norm": 6.028706490088384, + "learning_rate": 4.879789362870363e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8873944282531738, + "step": 3784 + }, + { + "epoch": 1.8925, + "grad_norm": 4.288766820264351, + "learning_rate": 4.8796556518826196e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8737755417823792, + "step": 3785 + }, + { + "epoch": 1.893, + "grad_norm": 2.454843030788932, + "learning_rate": 4.87952186840642e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8502216339111328, + "step": 3786 + }, + { + "epoch": 1.8935, + "grad_norm": 3.0015905904661886, + "learning_rate": 4.8793880124458396e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.871341347694397, + "step": 3787 + }, + { + "epoch": 1.8940000000000001, + "grad_norm": 2.5281092201094157, + "learning_rate": 4.879254084004954e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8767672777175903, + "step": 3788 + }, + { + "epoch": 1.8944999999999999, + "grad_norm": 2.6404780654099675, + "learning_rate": 4.879120083087846e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8777524828910828, + "step": 3789 + }, + { + "epoch": 1.895, + "grad_norm": 4.0005445551645265, + "learning_rate": 4.878986009698596e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.9086207151412964, + "step": 3790 + }, + { + "epoch": 1.8955, + "grad_norm": 3.0658229205310645, + "learning_rate": 4.878851863841287e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8628366589546204, + "step": 3791 + }, + { + "epoch": 1.896, + "grad_norm": 2.3904535538277667, + "learning_rate": 4.878717645520008e-06, + "loss": 0.2842, + "mean_token_accuracy": 0.8976473212242126, + "step": 3792 + }, + { + "epoch": 1.8965, + "grad_norm": 2.7747338382363456, + "learning_rate": 4.878583354738846e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8985011577606201, + "step": 3793 + }, + { + "epoch": 1.897, + "grad_norm": 2.7046150317588484, + "learning_rate": 4.878448991501891e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.882322371006012, + "step": 3794 + }, + { + "epoch": 1.8975, + "grad_norm": 2.3373048509447556, + "learning_rate": 4.878314555813237e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8695513010025024, + "step": 3795 + }, + { + "epoch": 1.8980000000000001, + "grad_norm": 4.185920850100296, + "learning_rate": 4.878180047676979e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.854073703289032, + "step": 3796 + }, + { + "epoch": 1.8984999999999999, + "grad_norm": 3.3349504417201277, + "learning_rate": 4.8780454670972136e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.897241473197937, + "step": 3797 + }, + { + "epoch": 1.899, + "grad_norm": 2.2532640094748886, + "learning_rate": 4.877910814078041e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8684915900230408, + "step": 3798 + }, + { + "epoch": 1.8995, + "grad_norm": 1.819438267146254, + "learning_rate": 4.877776088623563e-06, + "loss": 0.288, + "mean_token_accuracy": 0.8941499590873718, + "step": 3799 + }, + { + "epoch": 1.9, + "grad_norm": 2.822702814749927, + "learning_rate": 4.8776412907378845e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8743627667427063, + "step": 3800 + }, + { + "epoch": 1.9005, + "grad_norm": 2.344683843069261, + "learning_rate": 4.87750642042511e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8589620590209961, + "step": 3801 + }, + { + "epoch": 1.901, + "grad_norm": 2.4573902431118677, + "learning_rate": 4.877371477689348e-06, + "loss": 0.2648, + "mean_token_accuracy": 0.9125809669494629, + "step": 3802 + }, + { + "epoch": 1.9015, + "grad_norm": 2.0681626509889743, + "learning_rate": 4.87723646253471e-06, + "loss": 0.2855, + "mean_token_accuracy": 0.9097703099250793, + "step": 3803 + }, + { + "epoch": 1.9020000000000001, + "grad_norm": 2.4332284909441086, + "learning_rate": 4.877101374965309e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8754640817642212, + "step": 3804 + }, + { + "epoch": 1.9024999999999999, + "grad_norm": 5.882980074762406, + "learning_rate": 4.876966214985259e-06, + "loss": 0.5179, + "mean_token_accuracy": 0.8527994155883789, + "step": 3805 + }, + { + "epoch": 1.903, + "grad_norm": 2.5113050305061804, + "learning_rate": 4.876830982598677e-06, + "loss": 0.2901, + "mean_token_accuracy": 0.9035502672195435, + "step": 3806 + }, + { + "epoch": 1.9035, + "grad_norm": 3.769953305465729, + "learning_rate": 4.8766956778096844e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8628423810005188, + "step": 3807 + }, + { + "epoch": 1.904, + "grad_norm": 3.591524541704361, + "learning_rate": 4.8765603006224e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8691902160644531, + "step": 3808 + }, + { + "epoch": 1.9045, + "grad_norm": 1.7786201054271333, + "learning_rate": 4.876424851040951e-06, + "loss": 0.2509, + "mean_token_accuracy": 0.9167803525924683, + "step": 3809 + }, + { + "epoch": 1.905, + "grad_norm": 7.9401422195059315, + "learning_rate": 4.87628932906946e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8579354882240295, + "step": 3810 + }, + { + "epoch": 1.9055, + "grad_norm": 2.8989420015681873, + "learning_rate": 4.876153734712057e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8720717430114746, + "step": 3811 + }, + { + "epoch": 1.9060000000000001, + "grad_norm": 5.806496878299209, + "learning_rate": 4.8760180679728715e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8852623105049133, + "step": 3812 + }, + { + "epoch": 1.9064999999999999, + "grad_norm": 2.2070715308745252, + "learning_rate": 4.875882328856038e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8919327855110168, + "step": 3813 + }, + { + "epoch": 1.907, + "grad_norm": 3.904611191063642, + "learning_rate": 4.87574651736569e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8807417750358582, + "step": 3814 + }, + { + "epoch": 1.9075, + "grad_norm": 1.6810463285609067, + "learning_rate": 4.875610633505965e-06, + "loss": 0.3145, + "mean_token_accuracy": 0.8948303461074829, + "step": 3815 + }, + { + "epoch": 1.908, + "grad_norm": 2.0716640125323664, + "learning_rate": 4.875474677281003e-06, + "loss": 0.2675, + "mean_token_accuracy": 0.9122042059898376, + "step": 3816 + }, + { + "epoch": 1.9085, + "grad_norm": 2.5612157688637907, + "learning_rate": 4.875338648694942e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8812182545661926, + "step": 3817 + }, + { + "epoch": 1.909, + "grad_norm": 4.746295934114422, + "learning_rate": 4.875202547751929e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8742000460624695, + "step": 3818 + }, + { + "epoch": 1.9095, + "grad_norm": 1.9514717217674857, + "learning_rate": 4.87506637445611e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8776604533195496, + "step": 3819 + }, + { + "epoch": 1.9100000000000001, + "grad_norm": 2.4699570280566028, + "learning_rate": 4.874930128811631e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8752354979515076, + "step": 3820 + }, + { + "epoch": 1.9104999999999999, + "grad_norm": 3.140949836997223, + "learning_rate": 4.874793810822645e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8589842319488525, + "step": 3821 + }, + { + "epoch": 1.911, + "grad_norm": 2.084772671089142, + "learning_rate": 4.874657420493302e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8780438303947449, + "step": 3822 + }, + { + "epoch": 1.9115, + "grad_norm": 2.341483694618191, + "learning_rate": 4.874520957827757e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8928278684616089, + "step": 3823 + }, + { + "epoch": 1.912, + "grad_norm": 1.9195802784313056, + "learning_rate": 4.8743844228301676e-06, + "loss": 0.29, + "mean_token_accuracy": 0.8997044563293457, + "step": 3824 + }, + { + "epoch": 1.9125, + "grad_norm": 1.8612317406589314, + "learning_rate": 4.874247815504693e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8797672986984253, + "step": 3825 + }, + { + "epoch": 1.913, + "grad_norm": 2.1658677653504617, + "learning_rate": 4.874111135855494e-06, + "loss": 0.5212, + "mean_token_accuracy": 0.8411730527877808, + "step": 3826 + }, + { + "epoch": 1.9135, + "grad_norm": 2.8790249090444946, + "learning_rate": 4.873974383886734e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8624955415725708, + "step": 3827 + }, + { + "epoch": 1.9140000000000001, + "grad_norm": 4.146980901782296, + "learning_rate": 4.87383755960258e-06, + "loss": 0.2811, + "mean_token_accuracy": 0.9108233451843262, + "step": 3828 + }, + { + "epoch": 1.9144999999999999, + "grad_norm": 5.305037193180108, + "learning_rate": 4.873700663007198e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8679059147834778, + "step": 3829 + }, + { + "epoch": 1.915, + "grad_norm": 5.008325955219432, + "learning_rate": 4.87356369410476e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8816303610801697, + "step": 3830 + }, + { + "epoch": 1.9155, + "grad_norm": 2.734947202702325, + "learning_rate": 4.873426652899437e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.897617518901825, + "step": 3831 + }, + { + "epoch": 1.916, + "grad_norm": 4.841788857216125, + "learning_rate": 4.873289539395404e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8581124544143677, + "step": 3832 + }, + { + "epoch": 1.9165, + "grad_norm": 2.3685918730518676, + "learning_rate": 4.873152353596837e-06, + "loss": 0.3077, + "mean_token_accuracy": 0.8995223641395569, + "step": 3833 + }, + { + "epoch": 1.917, + "grad_norm": 3.0623447045720655, + "learning_rate": 4.873015095507916e-06, + "loss": 0.5525, + "mean_token_accuracy": 0.855017900466919, + "step": 3834 + }, + { + "epoch": 1.9175, + "grad_norm": 2.406333048244106, + "learning_rate": 4.872877765132822e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8719289302825928, + "step": 3835 + }, + { + "epoch": 1.9180000000000001, + "grad_norm": 2.1935265896924268, + "learning_rate": 4.8727403624757365e-06, + "loss": 0.5281, + "mean_token_accuracy": 0.8288078904151917, + "step": 3836 + }, + { + "epoch": 1.9184999999999999, + "grad_norm": 2.470995768700638, + "learning_rate": 4.872602887540848e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8869582414627075, + "step": 3837 + }, + { + "epoch": 1.919, + "grad_norm": 2.411513148327924, + "learning_rate": 4.872465340332342e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.889604389667511, + "step": 3838 + }, + { + "epoch": 1.9195, + "grad_norm": 3.051023699442361, + "learning_rate": 4.8723277208544094e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8898566365242004, + "step": 3839 + }, + { + "epoch": 1.92, + "grad_norm": 2.677138240028309, + "learning_rate": 4.8721900291112415e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8772299885749817, + "step": 3840 + }, + { + "epoch": 1.9205, + "grad_norm": 2.732194504778065, + "learning_rate": 4.872052265107034e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8750199675559998, + "step": 3841 + }, + { + "epoch": 1.921, + "grad_norm": 2.892133935516485, + "learning_rate": 4.871914428845982e-06, + "loss": 0.369, + "mean_token_accuracy": 0.883833646774292, + "step": 3842 + }, + { + "epoch": 1.9215, + "grad_norm": 2.7399177855104115, + "learning_rate": 4.871776520332285e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8574712872505188, + "step": 3843 + }, + { + "epoch": 1.9220000000000002, + "grad_norm": 2.60866315889679, + "learning_rate": 4.871638539570144e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8842473030090332, + "step": 3844 + }, + { + "epoch": 1.9224999999999999, + "grad_norm": 2.2587045262871883, + "learning_rate": 4.8715004865637616e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8972895741462708, + "step": 3845 + }, + { + "epoch": 1.923, + "grad_norm": 2.000871375269693, + "learning_rate": 4.871362361317344e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8884214162826538, + "step": 3846 + }, + { + "epoch": 1.9235, + "grad_norm": 2.230930266161239, + "learning_rate": 4.871224163835098e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8609327673912048, + "step": 3847 + }, + { + "epoch": 1.924, + "grad_norm": 2.3666810797928504, + "learning_rate": 4.871085894121234e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8838927745819092, + "step": 3848 + }, + { + "epoch": 1.9245, + "grad_norm": 8.453056492366695, + "learning_rate": 4.870947552179962e-06, + "loss": 0.298, + "mean_token_accuracy": 0.9002225399017334, + "step": 3849 + }, + { + "epoch": 1.925, + "grad_norm": 9.629031170830617, + "learning_rate": 4.870809138015499e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8623619079589844, + "step": 3850 + }, + { + "epoch": 1.9255, + "grad_norm": 2.048189853456225, + "learning_rate": 4.870670651632059e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8679948449134827, + "step": 3851 + }, + { + "epoch": 1.9260000000000002, + "grad_norm": 2.1991170527270336, + "learning_rate": 4.8705320930338615e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8819760680198669, + "step": 3852 + }, + { + "epoch": 1.9264999999999999, + "grad_norm": 2.3573750985958277, + "learning_rate": 4.870393462225128e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8733267784118652, + "step": 3853 + }, + { + "epoch": 1.927, + "grad_norm": 3.8788326200074725, + "learning_rate": 4.87025475921008e-06, + "loss": 0.252, + "mean_token_accuracy": 0.9165465235710144, + "step": 3854 + }, + { + "epoch": 1.9275, + "grad_norm": 5.005333346387351, + "learning_rate": 4.870115983992944e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8941973447799683, + "step": 3855 + }, + { + "epoch": 1.928, + "grad_norm": 2.37510485363026, + "learning_rate": 4.869977136577946e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8548860549926758, + "step": 3856 + }, + { + "epoch": 1.9285, + "grad_norm": 1.774068802116459, + "learning_rate": 4.869838216969317e-06, + "loss": 0.3106, + "mean_token_accuracy": 0.9050188064575195, + "step": 3857 + }, + { + "epoch": 1.929, + "grad_norm": 5.102564911629795, + "learning_rate": 4.869699225171286e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8575525879859924, + "step": 3858 + }, + { + "epoch": 1.9295, + "grad_norm": 1.834485381184971, + "learning_rate": 4.86956016118809e-06, + "loss": 0.337, + "mean_token_accuracy": 0.890502393245697, + "step": 3859 + }, + { + "epoch": 1.9300000000000002, + "grad_norm": 3.0241408983286453, + "learning_rate": 4.869421025023965e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8920750617980957, + "step": 3860 + }, + { + "epoch": 1.9304999999999999, + "grad_norm": 1.850616521890016, + "learning_rate": 4.869281816683147e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8875205516815186, + "step": 3861 + }, + { + "epoch": 1.931, + "grad_norm": 1.6079359808000306, + "learning_rate": 4.869142536169878e-06, + "loss": 0.2612, + "mean_token_accuracy": 0.90625, + "step": 3862 + }, + { + "epoch": 1.9315, + "grad_norm": 2.608843141057958, + "learning_rate": 4.8690031834884006e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8962681889533997, + "step": 3863 + }, + { + "epoch": 1.932, + "grad_norm": 1.7870829323094852, + "learning_rate": 4.86886375864296e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8904826045036316, + "step": 3864 + }, + { + "epoch": 1.9325, + "grad_norm": 2.930940912531262, + "learning_rate": 4.8687242616378026e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8716132640838623, + "step": 3865 + }, + { + "epoch": 1.933, + "grad_norm": 2.318172162905494, + "learning_rate": 4.868584692477178e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8844426870346069, + "step": 3866 + }, + { + "epoch": 1.9335, + "grad_norm": 4.172057088506926, + "learning_rate": 4.868445051165338e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8688606023788452, + "step": 3867 + }, + { + "epoch": 1.9340000000000002, + "grad_norm": 1.8491425737804121, + "learning_rate": 4.868305337706536e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8915778994560242, + "step": 3868 + }, + { + "epoch": 1.9344999999999999, + "grad_norm": 1.7461461580219089, + "learning_rate": 4.868165552105028e-06, + "loss": 0.2647, + "mean_token_accuracy": 0.9011045694351196, + "step": 3869 + }, + { + "epoch": 1.935, + "grad_norm": 2.605578413078052, + "learning_rate": 4.868025694365073e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.881748616695404, + "step": 3870 + }, + { + "epoch": 1.9355, + "grad_norm": 2.1347193205632764, + "learning_rate": 4.867885764490929e-06, + "loss": 0.2874, + "mean_token_accuracy": 0.909604549407959, + "step": 3871 + }, + { + "epoch": 1.936, + "grad_norm": 2.0903430401830287, + "learning_rate": 4.867745762486862e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8812911510467529, + "step": 3872 + }, + { + "epoch": 1.9365, + "grad_norm": 1.9491129971909391, + "learning_rate": 4.867605688357133e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8864359259605408, + "step": 3873 + }, + { + "epoch": 1.937, + "grad_norm": 2.3402869936674713, + "learning_rate": 4.8674655421060105e-06, + "loss": 0.2527, + "mean_token_accuracy": 0.9130987524986267, + "step": 3874 + }, + { + "epoch": 1.9375, + "grad_norm": 1.6645126943233666, + "learning_rate": 4.867325323737765e-06, + "loss": 0.2906, + "mean_token_accuracy": 0.8986722826957703, + "step": 3875 + }, + { + "epoch": 1.938, + "grad_norm": 2.418945751429176, + "learning_rate": 4.867185033256665e-06, + "loss": 0.2762, + "mean_token_accuracy": 0.9083636403083801, + "step": 3876 + }, + { + "epoch": 1.9385, + "grad_norm": 3.893832579219326, + "learning_rate": 4.8670446706669866e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8869972229003906, + "step": 3877 + }, + { + "epoch": 1.939, + "grad_norm": 2.448438710077553, + "learning_rate": 4.866904235973005e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8712615966796875, + "step": 3878 + }, + { + "epoch": 1.9395, + "grad_norm": 1.8015905243674872, + "learning_rate": 4.866763729178996e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8946100473403931, + "step": 3879 + }, + { + "epoch": 1.94, + "grad_norm": 2.3267286549920954, + "learning_rate": 4.866623150289241e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8649512529373169, + "step": 3880 + }, + { + "epoch": 1.9405000000000001, + "grad_norm": 1.9125499117559785, + "learning_rate": 4.866482499308024e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8536585569381714, + "step": 3881 + }, + { + "epoch": 1.9409999999999998, + "grad_norm": 2.40602715656356, + "learning_rate": 4.866341776239627e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.87673020362854, + "step": 3882 + }, + { + "epoch": 1.9415, + "grad_norm": 5.09756833636819, + "learning_rate": 4.866200981088337e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8864443898200989, + "step": 3883 + }, + { + "epoch": 1.942, + "grad_norm": 1.8245378019973677, + "learning_rate": 4.866060113858444e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8814112544059753, + "step": 3884 + }, + { + "epoch": 1.9425, + "grad_norm": 3.0084610776645477, + "learning_rate": 4.865919174554238e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8715237975120544, + "step": 3885 + }, + { + "epoch": 1.943, + "grad_norm": 13.120338803304456, + "learning_rate": 4.865778163180014e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8676678538322449, + "step": 3886 + }, + { + "epoch": 1.9435, + "grad_norm": 2.8450729236182943, + "learning_rate": 4.8656370797400645e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8950194120407104, + "step": 3887 + }, + { + "epoch": 1.944, + "grad_norm": 2.249364774230926, + "learning_rate": 4.86549592423869e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8881607055664062, + "step": 3888 + }, + { + "epoch": 1.9445000000000001, + "grad_norm": 2.4322844277053925, + "learning_rate": 4.865354696680189e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8572132587432861, + "step": 3889 + }, + { + "epoch": 1.9449999999999998, + "grad_norm": 2.0771170359673485, + "learning_rate": 4.865213397068864e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8713536262512207, + "step": 3890 + }, + { + "epoch": 1.9455, + "grad_norm": 1.8938538693071416, + "learning_rate": 4.8650720254090185e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.868514895439148, + "step": 3891 + }, + { + "epoch": 1.946, + "grad_norm": 1.899879423469105, + "learning_rate": 4.86493058170496e-06, + "loss": 0.2726, + "mean_token_accuracy": 0.904411792755127, + "step": 3892 + }, + { + "epoch": 1.9465, + "grad_norm": 4.734669950140693, + "learning_rate": 4.864789065960995e-06, + "loss": 0.2794, + "mean_token_accuracy": 0.9037794470787048, + "step": 3893 + }, + { + "epoch": 1.947, + "grad_norm": 2.470691889363903, + "learning_rate": 4.864647478181437e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8636786341667175, + "step": 3894 + }, + { + "epoch": 1.9475, + "grad_norm": 2.3899001064878047, + "learning_rate": 4.8645058183705976e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8557775616645813, + "step": 3895 + }, + { + "epoch": 1.948, + "grad_norm": 2.9299641756103116, + "learning_rate": 4.864364086532792e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.907835066318512, + "step": 3896 + }, + { + "epoch": 1.9485000000000001, + "grad_norm": 1.6982677261133248, + "learning_rate": 4.8642222826723384e-06, + "loss": 0.2687, + "mean_token_accuracy": 0.9144877195358276, + "step": 3897 + }, + { + "epoch": 1.9489999999999998, + "grad_norm": 2.892156872889256, + "learning_rate": 4.8640804067935555e-06, + "loss": 0.2269, + "mean_token_accuracy": 0.9291338324546814, + "step": 3898 + }, + { + "epoch": 1.9495, + "grad_norm": 2.1769531294877633, + "learning_rate": 4.863938458900766e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8634680509567261, + "step": 3899 + }, + { + "epoch": 1.95, + "grad_norm": 2.069869903004523, + "learning_rate": 4.863796438998293e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8688338994979858, + "step": 3900 + }, + { + "epoch": 1.9505, + "grad_norm": 2.382314080285708, + "learning_rate": 4.863654347090462e-06, + "loss": 0.2825, + "mean_token_accuracy": 0.9105011820793152, + "step": 3901 + }, + { + "epoch": 1.951, + "grad_norm": 5.534228224525732, + "learning_rate": 4.863512183181604e-06, + "loss": 0.5435, + "mean_token_accuracy": 0.8476346135139465, + "step": 3902 + }, + { + "epoch": 1.9515, + "grad_norm": 2.6745474512652296, + "learning_rate": 4.863369947276047e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8979068994522095, + "step": 3903 + }, + { + "epoch": 1.952, + "grad_norm": 1.69551600038975, + "learning_rate": 4.863227639378124e-06, + "loss": 0.3072, + "mean_token_accuracy": 0.9020244479179382, + "step": 3904 + }, + { + "epoch": 1.9525000000000001, + "grad_norm": 1.913317386300014, + "learning_rate": 4.863085259492171e-06, + "loss": 0.2725, + "mean_token_accuracy": 0.9091646671295166, + "step": 3905 + }, + { + "epoch": 1.9529999999999998, + "grad_norm": 2.8933185618340556, + "learning_rate": 4.862942807622525e-06, + "loss": 0.3059, + "mean_token_accuracy": 0.8922228813171387, + "step": 3906 + }, + { + "epoch": 1.9535, + "grad_norm": 3.829541133147138, + "learning_rate": 4.862800283773525e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8765240907669067, + "step": 3907 + }, + { + "epoch": 1.954, + "grad_norm": 1.902339357134361, + "learning_rate": 4.8626576879495125e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8662698268890381, + "step": 3908 + }, + { + "epoch": 1.9545, + "grad_norm": 1.870018352112058, + "learning_rate": 4.862515020154831e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.893139660358429, + "step": 3909 + }, + { + "epoch": 1.955, + "grad_norm": 2.7200604009095697, + "learning_rate": 4.862372280393828e-06, + "loss": 0.2948, + "mean_token_accuracy": 0.8944399952888489, + "step": 3910 + }, + { + "epoch": 1.9555, + "grad_norm": 3.0855995955240587, + "learning_rate": 4.86222946867085e-06, + "loss": 0.3019, + "mean_token_accuracy": 0.89554762840271, + "step": 3911 + }, + { + "epoch": 1.956, + "grad_norm": 1.8645988568801732, + "learning_rate": 4.862086584990246e-06, + "loss": 0.2816, + "mean_token_accuracy": 0.9064869284629822, + "step": 3912 + }, + { + "epoch": 1.9565000000000001, + "grad_norm": 2.5011279678361538, + "learning_rate": 4.861943629356372e-06, + "loss": 0.6288, + "mean_token_accuracy": 0.8210140466690063, + "step": 3913 + }, + { + "epoch": 1.9569999999999999, + "grad_norm": 2.5215896691205324, + "learning_rate": 4.861800601773579e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8816507458686829, + "step": 3914 + }, + { + "epoch": 1.9575, + "grad_norm": 1.8821424896642933, + "learning_rate": 4.861657502246226e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.849380373954773, + "step": 3915 + }, + { + "epoch": 1.958, + "grad_norm": 2.881560267637315, + "learning_rate": 4.861514330778672e-06, + "loss": 0.2223, + "mean_token_accuracy": 0.9240899682044983, + "step": 3916 + }, + { + "epoch": 1.9585, + "grad_norm": 2.3240221126358342, + "learning_rate": 4.861371087375279e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8909341096878052, + "step": 3917 + }, + { + "epoch": 1.959, + "grad_norm": 1.8349501779524637, + "learning_rate": 4.861227772040409e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8696874380111694, + "step": 3918 + }, + { + "epoch": 1.9595, + "grad_norm": 3.777851702224749, + "learning_rate": 4.8610843847784275e-06, + "loss": 0.2653, + "mean_token_accuracy": 0.9130525588989258, + "step": 3919 + }, + { + "epoch": 1.96, + "grad_norm": 2.2506288062179567, + "learning_rate": 4.860940925593703e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8828408718109131, + "step": 3920 + }, + { + "epoch": 1.9605000000000001, + "grad_norm": 2.4177775606097627, + "learning_rate": 4.8607973944906055e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8807579278945923, + "step": 3921 + }, + { + "epoch": 1.9609999999999999, + "grad_norm": 2.4705900111158825, + "learning_rate": 4.860653791473507e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8787193298339844, + "step": 3922 + }, + { + "epoch": 1.9615, + "grad_norm": 3.091126447644991, + "learning_rate": 4.860510116546782e-06, + "loss": 0.5271, + "mean_token_accuracy": 0.8554385900497437, + "step": 3923 + }, + { + "epoch": 1.962, + "grad_norm": 2.399653313280493, + "learning_rate": 4.860366369714807e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8582691550254822, + "step": 3924 + }, + { + "epoch": 1.9625, + "grad_norm": 5.167178139074083, + "learning_rate": 4.860222550981961e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8654714822769165, + "step": 3925 + }, + { + "epoch": 1.963, + "grad_norm": 2.390752728950921, + "learning_rate": 4.860078660352625e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8759992718696594, + "step": 3926 + }, + { + "epoch": 1.9635, + "grad_norm": 3.7152635633817543, + "learning_rate": 4.859934697831181e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8747478127479553, + "step": 3927 + }, + { + "epoch": 1.964, + "grad_norm": 2.6605861110015234, + "learning_rate": 4.8597906634220165e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8695430755615234, + "step": 3928 + }, + { + "epoch": 1.9645000000000001, + "grad_norm": 2.1565234744105033, + "learning_rate": 4.859646557129517e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.876498818397522, + "step": 3929 + }, + { + "epoch": 1.9649999999999999, + "grad_norm": 3.3969458364739626, + "learning_rate": 4.8595023789580745e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8772862553596497, + "step": 3930 + }, + { + "epoch": 1.9655, + "grad_norm": 4.489985513208022, + "learning_rate": 4.8593581289120785e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8707826137542725, + "step": 3931 + }, + { + "epoch": 1.966, + "grad_norm": 3.5401820718390495, + "learning_rate": 4.859213806995924e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8737800717353821, + "step": 3932 + }, + { + "epoch": 1.9665, + "grad_norm": 2.676653286684027, + "learning_rate": 4.859069413214007e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8891406655311584, + "step": 3933 + }, + { + "epoch": 1.967, + "grad_norm": 2.5468451671902326, + "learning_rate": 4.8589249475707276e-06, + "loss": 0.2756, + "mean_token_accuracy": 0.894353985786438, + "step": 3934 + }, + { + "epoch": 1.9675, + "grad_norm": 2.258932983035775, + "learning_rate": 4.858780410070484e-06, + "loss": 0.322, + "mean_token_accuracy": 0.8966730833053589, + "step": 3935 + }, + { + "epoch": 1.968, + "grad_norm": 1.9317910119507855, + "learning_rate": 4.8586358007176815e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.870905876159668, + "step": 3936 + }, + { + "epoch": 1.9685000000000001, + "grad_norm": 1.9453396357370663, + "learning_rate": 4.858491119516724e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8862400054931641, + "step": 3937 + }, + { + "epoch": 1.9689999999999999, + "grad_norm": 2.216111997946265, + "learning_rate": 4.858346366472018e-06, + "loss": 0.2697, + "mean_token_accuracy": 0.9117221236228943, + "step": 3938 + }, + { + "epoch": 1.9695, + "grad_norm": 2.1755063034330475, + "learning_rate": 4.858201541587974e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.8987861275672913, + "step": 3939 + }, + { + "epoch": 1.97, + "grad_norm": 1.671009384486894, + "learning_rate": 4.858056644869002e-06, + "loss": 0.2985, + "mean_token_accuracy": 0.9028427600860596, + "step": 3940 + }, + { + "epoch": 1.9705, + "grad_norm": 1.838725946050487, + "learning_rate": 4.857911676319519e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8727400898933411, + "step": 3941 + }, + { + "epoch": 1.971, + "grad_norm": 3.896584463739908, + "learning_rate": 4.857766635943938e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8678791522979736, + "step": 3942 + }, + { + "epoch": 1.9715, + "grad_norm": 2.420901609359069, + "learning_rate": 4.857621523746679e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8694528341293335, + "step": 3943 + }, + { + "epoch": 1.972, + "grad_norm": 2.5317684318527345, + "learning_rate": 4.857476339732162e-06, + "loss": 0.3053, + "mean_token_accuracy": 0.9006720185279846, + "step": 3944 + }, + { + "epoch": 1.9725000000000001, + "grad_norm": 2.8803816020990194, + "learning_rate": 4.8573310839048085e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8889430165290833, + "step": 3945 + }, + { + "epoch": 1.9729999999999999, + "grad_norm": 2.294455026047457, + "learning_rate": 4.857185756269044e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.882641077041626, + "step": 3946 + }, + { + "epoch": 1.9735, + "grad_norm": 1.9611005410422055, + "learning_rate": 4.857040356829295e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8902338147163391, + "step": 3947 + }, + { + "epoch": 1.974, + "grad_norm": 5.109982312559411, + "learning_rate": 4.8568948855899915e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8873189687728882, + "step": 3948 + }, + { + "epoch": 1.9745, + "grad_norm": 3.4778473127883336, + "learning_rate": 4.856749342555564e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8851563930511475, + "step": 3949 + }, + { + "epoch": 1.975, + "grad_norm": 2.6552852264313964, + "learning_rate": 4.856603727730446e-06, + "loss": 0.2816, + "mean_token_accuracy": 0.9042180776596069, + "step": 3950 + }, + { + "epoch": 1.9755, + "grad_norm": 2.347004722793954, + "learning_rate": 4.856458041119074e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8625785708427429, + "step": 3951 + }, + { + "epoch": 1.976, + "grad_norm": 2.909111520059335, + "learning_rate": 4.856312282725886e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.9043603539466858, + "step": 3952 + }, + { + "epoch": 1.9765000000000001, + "grad_norm": 2.3465140023104474, + "learning_rate": 4.856166452555321e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8887425661087036, + "step": 3953 + }, + { + "epoch": 1.9769999999999999, + "grad_norm": 5.328473224790206, + "learning_rate": 4.85602055061182e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8669447302818298, + "step": 3954 + }, + { + "epoch": 1.9775, + "grad_norm": 3.679410672391385, + "learning_rate": 4.855874576899831e-06, + "loss": 0.2728, + "mean_token_accuracy": 0.9129727482795715, + "step": 3955 + }, + { + "epoch": 1.978, + "grad_norm": 3.0391123957828845, + "learning_rate": 4.855728531423798e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8898826837539673, + "step": 3956 + }, + { + "epoch": 1.9785, + "grad_norm": 4.065561133667753, + "learning_rate": 4.855582414188171e-06, + "loss": 0.5508, + "mean_token_accuracy": 0.8417279124259949, + "step": 3957 + }, + { + "epoch": 1.979, + "grad_norm": 2.256932524101353, + "learning_rate": 4.8554362251974e-06, + "loss": 0.31, + "mean_token_accuracy": 0.898062527179718, + "step": 3958 + }, + { + "epoch": 1.9795, + "grad_norm": 3.5562605607893887, + "learning_rate": 4.855289964455938e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8948529362678528, + "step": 3959 + }, + { + "epoch": 1.98, + "grad_norm": 2.14104084344583, + "learning_rate": 4.855143631968242e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.9042474031448364, + "step": 3960 + }, + { + "epoch": 1.9805000000000001, + "grad_norm": 3.216374514753817, + "learning_rate": 4.854997227738769e-06, + "loss": 0.2984, + "mean_token_accuracy": 0.8980656862258911, + "step": 3961 + }, + { + "epoch": 1.9809999999999999, + "grad_norm": 3.5500653410831817, + "learning_rate": 4.854850751771977e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.869102418422699, + "step": 3962 + }, + { + "epoch": 1.9815, + "grad_norm": 2.4602438791935435, + "learning_rate": 4.85470420407233e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8878874182701111, + "step": 3963 + }, + { + "epoch": 1.982, + "grad_norm": 2.6754971303413857, + "learning_rate": 4.854557584644291e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8491379022598267, + "step": 3964 + }, + { + "epoch": 1.9825, + "grad_norm": 3.8881776293842023, + "learning_rate": 4.854410893492326e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8927128911018372, + "step": 3965 + }, + { + "epoch": 1.983, + "grad_norm": 3.9728070642814415, + "learning_rate": 4.854264130620905e-06, + "loss": 0.5886, + "mean_token_accuracy": 0.8333333134651184, + "step": 3966 + }, + { + "epoch": 1.9835, + "grad_norm": 2.0755356694746405, + "learning_rate": 4.854117296034497e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.9032133221626282, + "step": 3967 + }, + { + "epoch": 1.984, + "grad_norm": 2.117318693555641, + "learning_rate": 4.853970389737576e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8569166660308838, + "step": 3968 + }, + { + "epoch": 1.9845000000000002, + "grad_norm": 2.3494480941210347, + "learning_rate": 4.853823411734616e-06, + "loss": 0.336, + "mean_token_accuracy": 0.886623740196228, + "step": 3969 + }, + { + "epoch": 1.9849999999999999, + "grad_norm": 163.0715734378269, + "learning_rate": 4.853676362030095e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.867290198802948, + "step": 3970 + }, + { + "epoch": 1.9855, + "grad_norm": 9.60473679919797, + "learning_rate": 4.853529240628493e-06, + "loss": 0.5138, + "mean_token_accuracy": 0.8541915416717529, + "step": 3971 + }, + { + "epoch": 1.986, + "grad_norm": 2.1724409982973003, + "learning_rate": 4.8533820475342895e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8732352256774902, + "step": 3972 + }, + { + "epoch": 1.9865, + "grad_norm": 6.101556463159957, + "learning_rate": 4.85323478275197e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8762529492378235, + "step": 3973 + }, + { + "epoch": 1.987, + "grad_norm": 1.834284644042415, + "learning_rate": 4.853087446286019e-06, + "loss": 0.2891, + "mean_token_accuracy": 0.8966240882873535, + "step": 3974 + }, + { + "epoch": 1.9875, + "grad_norm": 1.8840067707348938, + "learning_rate": 4.852940038140927e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8837245106697083, + "step": 3975 + }, + { + "epoch": 1.988, + "grad_norm": 2.5786594406069834, + "learning_rate": 4.852792558321182e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8623043894767761, + "step": 3976 + }, + { + "epoch": 1.9885000000000002, + "grad_norm": 2.1838365516429907, + "learning_rate": 4.852645006831278e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8863027691841125, + "step": 3977 + }, + { + "epoch": 1.9889999999999999, + "grad_norm": 2.8095678452382957, + "learning_rate": 4.852497383675709e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8736543655395508, + "step": 3978 + }, + { + "epoch": 1.9895, + "grad_norm": 2.568708633040239, + "learning_rate": 4.85234968885897e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.883319079875946, + "step": 3979 + }, + { + "epoch": 1.99, + "grad_norm": 4.688717839341213, + "learning_rate": 4.852201922385564e-06, + "loss": 0.2288, + "mean_token_accuracy": 0.9232314229011536, + "step": 3980 + }, + { + "epoch": 1.9905, + "grad_norm": 9.811982887864474, + "learning_rate": 4.8520540842599895e-06, + "loss": 0.2764, + "mean_token_accuracy": 0.9138273000717163, + "step": 3981 + }, + { + "epoch": 1.991, + "grad_norm": 2.4918564342071603, + "learning_rate": 4.851906174486751e-06, + "loss": 0.2908, + "mean_token_accuracy": 0.9067419767379761, + "step": 3982 + }, + { + "epoch": 1.9915, + "grad_norm": 4.485367909989317, + "learning_rate": 4.851758193070353e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8634259104728699, + "step": 3983 + }, + { + "epoch": 1.992, + "grad_norm": 2.19679925220146, + "learning_rate": 4.8516101400153036e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.8951759934425354, + "step": 3984 + }, + { + "epoch": 1.9925000000000002, + "grad_norm": 1.6861507097974824, + "learning_rate": 4.851462015326114e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.8953742980957031, + "step": 3985 + }, + { + "epoch": 1.9929999999999999, + "grad_norm": 1.8499235278517956, + "learning_rate": 4.851313819007295e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8881977200508118, + "step": 3986 + }, + { + "epoch": 1.9935, + "grad_norm": 2.107756547156501, + "learning_rate": 4.851165551063362e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.8920201659202576, + "step": 3987 + }, + { + "epoch": 1.994, + "grad_norm": 2.1494773622024455, + "learning_rate": 4.851017211498829e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8833845853805542, + "step": 3988 + }, + { + "epoch": 1.9945, + "grad_norm": 2.4606489020088476, + "learning_rate": 4.850868800318218e-06, + "loss": 0.2628, + "mean_token_accuracy": 0.905264675617218, + "step": 3989 + }, + { + "epoch": 1.995, + "grad_norm": 6.638392209399019, + "learning_rate": 4.850720317526047e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8823879361152649, + "step": 3990 + }, + { + "epoch": 1.9955, + "grad_norm": 3.841191026031927, + "learning_rate": 4.850571763126842e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.885981023311615, + "step": 3991 + }, + { + "epoch": 1.996, + "grad_norm": 6.412394431967908, + "learning_rate": 4.850423137125126e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.8975160121917725, + "step": 3992 + }, + { + "epoch": 1.9965000000000002, + "grad_norm": 1.857057023895757, + "learning_rate": 4.850274439525427e-06, + "loss": 0.2315, + "mean_token_accuracy": 0.918249249458313, + "step": 3993 + }, + { + "epoch": 1.9969999999999999, + "grad_norm": 2.7130266563270062, + "learning_rate": 4.850125670332275e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.875066339969635, + "step": 3994 + }, + { + "epoch": 1.9975, + "grad_norm": 2.1711529801819105, + "learning_rate": 4.8499768295502e-06, + "loss": 0.2904, + "mean_token_accuracy": 0.9044691920280457, + "step": 3995 + }, + { + "epoch": 1.998, + "grad_norm": 3.34267302926741, + "learning_rate": 4.849827917183739e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.9040149450302124, + "step": 3996 + }, + { + "epoch": 1.9985, + "grad_norm": 1.8807551720170104, + "learning_rate": 4.849678933237426e-06, + "loss": 0.2844, + "mean_token_accuracy": 0.9096095561981201, + "step": 3997 + }, + { + "epoch": 1.999, + "grad_norm": 2.769276477495315, + "learning_rate": 4.849529877715799e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8775700926780701, + "step": 3998 + }, + { + "epoch": 1.9995, + "grad_norm": 2.44563552923311, + "learning_rate": 4.8493807506234005e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8859731554985046, + "step": 3999 + }, + { + "epoch": 2.0, + "grad_norm": 2.0875893533116745, + "learning_rate": 4.849231551964771e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8986757397651672, + "step": 4000 + }, + { + "epoch": 2.0005, + "grad_norm": 2.9774758138857447, + "learning_rate": 4.8490822817444575e-06, + "loss": 0.2911, + "mean_token_accuracy": 0.9088937044143677, + "step": 4001 + }, + { + "epoch": 2.001, + "grad_norm": 3.335448840047643, + "learning_rate": 4.848932939967005e-06, + "loss": 0.2557, + "mean_token_accuracy": 0.9111148118972778, + "step": 4002 + }, + { + "epoch": 2.0015, + "grad_norm": 1.9287189933732565, + "learning_rate": 4.8487835266369635e-06, + "loss": 0.2763, + "mean_token_accuracy": 0.9095922708511353, + "step": 4003 + }, + { + "epoch": 2.002, + "grad_norm": 4.8214604049277785, + "learning_rate": 4.848634041758884e-06, + "loss": 0.3002, + "mean_token_accuracy": 0.8959978818893433, + "step": 4004 + }, + { + "epoch": 2.0025, + "grad_norm": 2.6427846746622357, + "learning_rate": 4.8484844853373205e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.891461968421936, + "step": 4005 + }, + { + "epoch": 2.003, + "grad_norm": 1.63426307208924, + "learning_rate": 4.848334857376829e-06, + "loss": 0.2404, + "mean_token_accuracy": 0.9169785380363464, + "step": 4006 + }, + { + "epoch": 2.0035, + "grad_norm": 2.1432006985278775, + "learning_rate": 4.848185157881969e-06, + "loss": 0.2133, + "mean_token_accuracy": 0.9286764860153198, + "step": 4007 + }, + { + "epoch": 2.004, + "grad_norm": 2.853783295816091, + "learning_rate": 4.848035386857296e-06, + "loss": 0.298, + "mean_token_accuracy": 0.9016653299331665, + "step": 4008 + }, + { + "epoch": 2.0045, + "grad_norm": 5.689724310264397, + "learning_rate": 4.847885544307376e-06, + "loss": 0.2285, + "mean_token_accuracy": 0.9205095767974854, + "step": 4009 + }, + { + "epoch": 2.005, + "grad_norm": 2.4075332960026925, + "learning_rate": 4.847735630236773e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.9003403782844543, + "step": 4010 + }, + { + "epoch": 2.0055, + "grad_norm": 2.7152789403111885, + "learning_rate": 4.847585644650054e-06, + "loss": 0.1871, + "mean_token_accuracy": 0.9361116290092468, + "step": 4011 + }, + { + "epoch": 2.006, + "grad_norm": 2.418141305276696, + "learning_rate": 4.847435587551785e-06, + "loss": 0.2554, + "mean_token_accuracy": 0.9155073165893555, + "step": 4012 + }, + { + "epoch": 2.0065, + "grad_norm": 1.7186222745685793, + "learning_rate": 4.84728545894654e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.8879409432411194, + "step": 4013 + }, + { + "epoch": 2.007, + "grad_norm": 3.81824584651876, + "learning_rate": 4.847135258838891e-06, + "loss": 0.251, + "mean_token_accuracy": 0.9159601926803589, + "step": 4014 + }, + { + "epoch": 2.0075, + "grad_norm": 23.989856899401374, + "learning_rate": 4.846984987233414e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.903602123260498, + "step": 4015 + }, + { + "epoch": 2.008, + "grad_norm": 3.0083500789160467, + "learning_rate": 4.846834644134686e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8980597257614136, + "step": 4016 + }, + { + "epoch": 2.0085, + "grad_norm": 5.9314411832277, + "learning_rate": 4.846684229547286e-06, + "loss": 0.2773, + "mean_token_accuracy": 0.9119485020637512, + "step": 4017 + }, + { + "epoch": 2.009, + "grad_norm": 3.870405407322259, + "learning_rate": 4.846533743475797e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8902438879013062, + "step": 4018 + }, + { + "epoch": 2.0095, + "grad_norm": 14.328012995572912, + "learning_rate": 4.8463831859248035e-06, + "loss": 0.2847, + "mean_token_accuracy": 0.9022426605224609, + "step": 4019 + }, + { + "epoch": 2.01, + "grad_norm": 2.0066639863416658, + "learning_rate": 4.84623255689889e-06, + "loss": 0.2282, + "mean_token_accuracy": 0.9143544435501099, + "step": 4020 + }, + { + "epoch": 2.0105, + "grad_norm": 4.630247769630776, + "learning_rate": 4.846081856402647e-06, + "loss": 0.2612, + "mean_token_accuracy": 0.9103185534477234, + "step": 4021 + }, + { + "epoch": 2.011, + "grad_norm": 1.5914011312409637, + "learning_rate": 4.845931084440662e-06, + "loss": 0.1758, + "mean_token_accuracy": 0.9358733892440796, + "step": 4022 + }, + { + "epoch": 2.0115, + "grad_norm": 4.353158276471111, + "learning_rate": 4.845780241017533e-06, + "loss": 0.303, + "mean_token_accuracy": 0.8906648755073547, + "step": 4023 + }, + { + "epoch": 2.012, + "grad_norm": 12.133446842134596, + "learning_rate": 4.845629326137849e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8947464823722839, + "step": 4024 + }, + { + "epoch": 2.0125, + "grad_norm": 4.531891282205214, + "learning_rate": 4.845478339806211e-06, + "loss": 0.3013, + "mean_token_accuracy": 0.9026151299476624, + "step": 4025 + }, + { + "epoch": 2.013, + "grad_norm": 4.025640052094371, + "learning_rate": 4.8453272820272165e-06, + "loss": 0.2915, + "mean_token_accuracy": 0.9112197160720825, + "step": 4026 + }, + { + "epoch": 2.0135, + "grad_norm": 3.041122034042229, + "learning_rate": 4.845176152805469e-06, + "loss": 0.2296, + "mean_token_accuracy": 0.9174729585647583, + "step": 4027 + }, + { + "epoch": 2.014, + "grad_norm": 2.4096199778023295, + "learning_rate": 4.8450249521455695e-06, + "loss": 0.2871, + "mean_token_accuracy": 0.9023844003677368, + "step": 4028 + }, + { + "epoch": 2.0145, + "grad_norm": 2.70559143256807, + "learning_rate": 4.844873680052126e-06, + "loss": 0.2081, + "mean_token_accuracy": 0.9264892339706421, + "step": 4029 + }, + { + "epoch": 2.015, + "grad_norm": 1.5709591717292752, + "learning_rate": 4.844722336529745e-06, + "loss": 0.2461, + "mean_token_accuracy": 0.9135609269142151, + "step": 4030 + }, + { + "epoch": 2.0155, + "grad_norm": 2.6219582257697778, + "learning_rate": 4.844570921583037e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8854730129241943, + "step": 4031 + }, + { + "epoch": 2.016, + "grad_norm": 6.637808616857665, + "learning_rate": 4.844419435216615e-06, + "loss": 0.2641, + "mean_token_accuracy": 0.9113416075706482, + "step": 4032 + }, + { + "epoch": 2.0165, + "grad_norm": 2.0843353268105407, + "learning_rate": 4.8442678774350935e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8891444206237793, + "step": 4033 + }, + { + "epoch": 2.017, + "grad_norm": 4.215659473295367, + "learning_rate": 4.8441162482430896e-06, + "loss": 0.2883, + "mean_token_accuracy": 0.905452311038971, + "step": 4034 + }, + { + "epoch": 2.0175, + "grad_norm": 2.1872563752905005, + "learning_rate": 4.843964547645221e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.8963178396224976, + "step": 4035 + }, + { + "epoch": 2.018, + "grad_norm": 10.622931656700999, + "learning_rate": 4.8438127756461095e-06, + "loss": 0.2028, + "mean_token_accuracy": 0.9306787252426147, + "step": 4036 + }, + { + "epoch": 2.0185, + "grad_norm": 2.3399940753870547, + "learning_rate": 4.843660932250378e-06, + "loss": 0.3087, + "mean_token_accuracy": 0.9054421782493591, + "step": 4037 + }, + { + "epoch": 2.019, + "grad_norm": 3.314579684943133, + "learning_rate": 4.843509017462652e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8898458480834961, + "step": 4038 + }, + { + "epoch": 2.0195, + "grad_norm": 1.7912084929833965, + "learning_rate": 4.843357031287559e-06, + "loss": 0.2449, + "mean_token_accuracy": 0.908656120300293, + "step": 4039 + }, + { + "epoch": 2.02, + "grad_norm": 2.6776496408350448, + "learning_rate": 4.84320497372973e-06, + "loss": 0.1797, + "mean_token_accuracy": 0.9341563582420349, + "step": 4040 + }, + { + "epoch": 2.0205, + "grad_norm": 2.6046292686044765, + "learning_rate": 4.843052844793794e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8907634615898132, + "step": 4041 + }, + { + "epoch": 2.021, + "grad_norm": 2.220577873368837, + "learning_rate": 4.8429006444843885e-06, + "loss": 0.2112, + "mean_token_accuracy": 0.9312990307807922, + "step": 4042 + }, + { + "epoch": 2.0215, + "grad_norm": 3.702756316021066, + "learning_rate": 4.8427483728061475e-06, + "loss": 0.2783, + "mean_token_accuracy": 0.9086304306983948, + "step": 4043 + }, + { + "epoch": 2.022, + "grad_norm": 2.4651846114760163, + "learning_rate": 4.84259602976371e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.9003056883811951, + "step": 4044 + }, + { + "epoch": 2.0225, + "grad_norm": 2.448772948022258, + "learning_rate": 4.842443615361718e-06, + "loss": 0.2962, + "mean_token_accuracy": 0.9045570492744446, + "step": 4045 + }, + { + "epoch": 2.023, + "grad_norm": 5.205007133542828, + "learning_rate": 4.8422911296048126e-06, + "loss": 0.2492, + "mean_token_accuracy": 0.9194657802581787, + "step": 4046 + }, + { + "epoch": 2.0235, + "grad_norm": 1.4854089978420373, + "learning_rate": 4.842138572497639e-06, + "loss": 0.1884, + "mean_token_accuracy": 0.9297786951065063, + "step": 4047 + }, + { + "epoch": 2.024, + "grad_norm": 2.9018383576036926, + "learning_rate": 4.841985944044845e-06, + "loss": 0.2807, + "mean_token_accuracy": 0.9063729643821716, + "step": 4048 + }, + { + "epoch": 2.0245, + "grad_norm": 2.556372076943499, + "learning_rate": 4.84183324425108e-06, + "loss": 0.2514, + "mean_token_accuracy": 0.9079291820526123, + "step": 4049 + }, + { + "epoch": 2.025, + "grad_norm": 1.8178307923216026, + "learning_rate": 4.841680473120994e-06, + "loss": 0.2334, + "mean_token_accuracy": 0.9253870248794556, + "step": 4050 + }, + { + "epoch": 2.0255, + "grad_norm": 1.6977847743753391, + "learning_rate": 4.841527630659243e-06, + "loss": 0.2445, + "mean_token_accuracy": 0.9146304726600647, + "step": 4051 + }, + { + "epoch": 2.026, + "grad_norm": 2.0526722711520895, + "learning_rate": 4.8413747168704815e-06, + "loss": 0.2656, + "mean_token_accuracy": 0.9209610223770142, + "step": 4052 + }, + { + "epoch": 2.0265, + "grad_norm": 3.8401863665248253, + "learning_rate": 4.841221731759367e-06, + "loss": 0.272, + "mean_token_accuracy": 0.9053046703338623, + "step": 4053 + }, + { + "epoch": 2.027, + "grad_norm": 1.972073756575742, + "learning_rate": 4.8410686753305615e-06, + "loss": 0.2626, + "mean_token_accuracy": 0.9097030162811279, + "step": 4054 + }, + { + "epoch": 2.0275, + "grad_norm": 2.0063938322812422, + "learning_rate": 4.840915547588725e-06, + "loss": 0.2926, + "mean_token_accuracy": 0.89839106798172, + "step": 4055 + }, + { + "epoch": 2.028, + "grad_norm": 1.611203287177542, + "learning_rate": 4.840762348538524e-06, + "loss": 0.2591, + "mean_token_accuracy": 0.9076040387153625, + "step": 4056 + }, + { + "epoch": 2.0285, + "grad_norm": 2.3682671419549273, + "learning_rate": 4.8406090781846235e-06, + "loss": 0.2621, + "mean_token_accuracy": 0.9121965765953064, + "step": 4057 + }, + { + "epoch": 2.029, + "grad_norm": 1.8074493203639332, + "learning_rate": 4.840455736531695e-06, + "loss": 0.1799, + "mean_token_accuracy": 0.9340659379959106, + "step": 4058 + }, + { + "epoch": 2.0295, + "grad_norm": 5.846000542067344, + "learning_rate": 4.840302323584407e-06, + "loss": 0.1877, + "mean_token_accuracy": 0.9386329054832458, + "step": 4059 + }, + { + "epoch": 2.03, + "grad_norm": 3.3675135997073258, + "learning_rate": 4.840148839347434e-06, + "loss": 0.2606, + "mean_token_accuracy": 0.9112061858177185, + "step": 4060 + }, + { + "epoch": 2.0305, + "grad_norm": 1.745845767935149, + "learning_rate": 4.839995283825451e-06, + "loss": 0.2343, + "mean_token_accuracy": 0.9156783223152161, + "step": 4061 + }, + { + "epoch": 2.031, + "grad_norm": 1.7107502591691501, + "learning_rate": 4.839841657023135e-06, + "loss": 0.1342, + "mean_token_accuracy": 0.9441362023353577, + "step": 4062 + }, + { + "epoch": 2.0315, + "grad_norm": 2.2926197809218434, + "learning_rate": 4.839687958945166e-06, + "loss": 0.2747, + "mean_token_accuracy": 0.9121317267417908, + "step": 4063 + }, + { + "epoch": 2.032, + "grad_norm": 3.6323809151386386, + "learning_rate": 4.839534189596228e-06, + "loss": 0.2408, + "mean_token_accuracy": 0.9187057018280029, + "step": 4064 + }, + { + "epoch": 2.0325, + "grad_norm": 2.698428225439647, + "learning_rate": 4.839380348981002e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8803681135177612, + "step": 4065 + }, + { + "epoch": 2.033, + "grad_norm": 2.0514117235581844, + "learning_rate": 4.839226437104176e-06, + "loss": 0.2284, + "mean_token_accuracy": 0.9180133938789368, + "step": 4066 + }, + { + "epoch": 2.0335, + "grad_norm": 2.5756792651325293, + "learning_rate": 4.839072453970438e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8919177651405334, + "step": 4067 + }, + { + "epoch": 2.034, + "grad_norm": 2.210845302841798, + "learning_rate": 4.838918399584479e-06, + "loss": 0.25, + "mean_token_accuracy": 0.9125316739082336, + "step": 4068 + }, + { + "epoch": 2.0345, + "grad_norm": 1.7860076735428219, + "learning_rate": 4.838764273950991e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.8936010599136353, + "step": 4069 + }, + { + "epoch": 2.035, + "grad_norm": 3.216204147164568, + "learning_rate": 4.838610077074669e-06, + "loss": 0.235, + "mean_token_accuracy": 0.9158597588539124, + "step": 4070 + }, + { + "epoch": 2.0355, + "grad_norm": 3.751129940387838, + "learning_rate": 4.838455808960211e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8821292519569397, + "step": 4071 + }, + { + "epoch": 2.036, + "grad_norm": 2.016896110576377, + "learning_rate": 4.838301469612315e-06, + "loss": 0.2329, + "mean_token_accuracy": 0.9178758859634399, + "step": 4072 + }, + { + "epoch": 2.0365, + "grad_norm": 1.6777003757516493, + "learning_rate": 4.838147059035684e-06, + "loss": 0.2465, + "mean_token_accuracy": 0.9122672080993652, + "step": 4073 + }, + { + "epoch": 2.037, + "grad_norm": 8.298733529307642, + "learning_rate": 4.83799257723502e-06, + "loss": 0.2683, + "mean_token_accuracy": 0.9052525162696838, + "step": 4074 + }, + { + "epoch": 2.0375, + "grad_norm": 1.640195966161915, + "learning_rate": 4.83783802421503e-06, + "loss": 0.2089, + "mean_token_accuracy": 0.9217115640640259, + "step": 4075 + }, + { + "epoch": 2.038, + "grad_norm": 4.883234180027024, + "learning_rate": 4.837683399980421e-06, + "loss": 0.2013, + "mean_token_accuracy": 0.9258585572242737, + "step": 4076 + }, + { + "epoch": 2.0385, + "grad_norm": 4.231164997988128, + "learning_rate": 4.837528704535904e-06, + "loss": 0.3224, + "mean_token_accuracy": 0.8909299373626709, + "step": 4077 + }, + { + "epoch": 2.039, + "grad_norm": 2.872267822803486, + "learning_rate": 4.837373937886191e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8877005577087402, + "step": 4078 + }, + { + "epoch": 2.0395, + "grad_norm": 2.1198894670692385, + "learning_rate": 4.837219100035996e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8806698322296143, + "step": 4079 + }, + { + "epoch": 2.04, + "grad_norm": 4.93027332899552, + "learning_rate": 4.837064190990036e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8908507227897644, + "step": 4080 + }, + { + "epoch": 2.0405, + "grad_norm": 36.99112645943481, + "learning_rate": 4.8369092107530305e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8811643123626709, + "step": 4081 + }, + { + "epoch": 2.041, + "grad_norm": 1.9843538827886744, + "learning_rate": 4.836754159329699e-06, + "loss": 0.2468, + "mean_token_accuracy": 0.9094820618629456, + "step": 4082 + }, + { + "epoch": 2.0415, + "grad_norm": 1.5699923787973173, + "learning_rate": 4.836599036724766e-06, + "loss": 0.1784, + "mean_token_accuracy": 0.9361544251441956, + "step": 4083 + }, + { + "epoch": 2.042, + "grad_norm": 1.8380077145879168, + "learning_rate": 4.8364438429429564e-06, + "loss": 0.2605, + "mean_token_accuracy": 0.9113016128540039, + "step": 4084 + }, + { + "epoch": 2.0425, + "grad_norm": 11.212270055164405, + "learning_rate": 4.836288577988997e-06, + "loss": 0.295, + "mean_token_accuracy": 0.8936728239059448, + "step": 4085 + }, + { + "epoch": 2.043, + "grad_norm": 1.8195596916183638, + "learning_rate": 4.8361332418676175e-06, + "loss": 0.2732, + "mean_token_accuracy": 0.905236542224884, + "step": 4086 + }, + { + "epoch": 2.0435, + "grad_norm": 3.450275957767096, + "learning_rate": 4.835977834583551e-06, + "loss": 0.325, + "mean_token_accuracy": 0.9000205397605896, + "step": 4087 + }, + { + "epoch": 2.044, + "grad_norm": 1.8801440484074485, + "learning_rate": 4.8358223561415304e-06, + "loss": 0.2218, + "mean_token_accuracy": 0.9246024489402771, + "step": 4088 + }, + { + "epoch": 2.0445, + "grad_norm": 1.5724914561905523, + "learning_rate": 4.8356668065462916e-06, + "loss": 0.1617, + "mean_token_accuracy": 0.9418765902519226, + "step": 4089 + }, + { + "epoch": 2.045, + "grad_norm": 2.2951616959276717, + "learning_rate": 4.835511185802574e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8950864672660828, + "step": 4090 + }, + { + "epoch": 2.0455, + "grad_norm": 2.45789561505561, + "learning_rate": 4.835355493915117e-06, + "loss": 0.2549, + "mean_token_accuracy": 0.9108858704566956, + "step": 4091 + }, + { + "epoch": 2.046, + "grad_norm": 2.1041430766532327, + "learning_rate": 4.835199730888664e-06, + "loss": 0.2661, + "mean_token_accuracy": 0.9177061915397644, + "step": 4092 + }, + { + "epoch": 2.0465, + "grad_norm": 2.148102877400761, + "learning_rate": 4.83504389672796e-06, + "loss": 0.2166, + "mean_token_accuracy": 0.9279708862304688, + "step": 4093 + }, + { + "epoch": 2.047, + "grad_norm": 2.3041861622510056, + "learning_rate": 4.83488799143775e-06, + "loss": 0.2985, + "mean_token_accuracy": 0.9025866985321045, + "step": 4094 + }, + { + "epoch": 2.0475, + "grad_norm": 3.0544237388855837, + "learning_rate": 4.834732015022786e-06, + "loss": 0.2881, + "mean_token_accuracy": 0.9063690900802612, + "step": 4095 + }, + { + "epoch": 2.048, + "grad_norm": 2.0086300086883604, + "learning_rate": 4.834575967487817e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8871229290962219, + "step": 4096 + }, + { + "epoch": 2.0485, + "grad_norm": 2.2554155136079217, + "learning_rate": 4.8344198488375985e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.8938837647438049, + "step": 4097 + }, + { + "epoch": 2.049, + "grad_norm": 2.31539315173439, + "learning_rate": 4.834263659076884e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.8957204222679138, + "step": 4098 + }, + { + "epoch": 2.0495, + "grad_norm": 3.149616465948041, + "learning_rate": 4.8341073982104334e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8823059797286987, + "step": 4099 + }, + { + "epoch": 2.05, + "grad_norm": 1.9397977111789273, + "learning_rate": 4.833951066243004e-06, + "loss": 0.2552, + "mean_token_accuracy": 0.9108357429504395, + "step": 4100 + }, + { + "epoch": 2.0505, + "grad_norm": 1.6478154188886673, + "learning_rate": 4.833794663179362e-06, + "loss": 0.2199, + "mean_token_accuracy": 0.9268754124641418, + "step": 4101 + }, + { + "epoch": 2.051, + "grad_norm": 1.9224212379338228, + "learning_rate": 4.833638189024268e-06, + "loss": 0.2993, + "mean_token_accuracy": 0.8972258567810059, + "step": 4102 + }, + { + "epoch": 2.0515, + "grad_norm": 4.4654727941869075, + "learning_rate": 4.833481643782489e-06, + "loss": 0.313, + "mean_token_accuracy": 0.8932468891143799, + "step": 4103 + }, + { + "epoch": 2.052, + "grad_norm": 8.679511100589746, + "learning_rate": 4.833325027458796e-06, + "loss": 0.244, + "mean_token_accuracy": 0.9242196083068848, + "step": 4104 + }, + { + "epoch": 2.0525, + "grad_norm": 2.7522914117653494, + "learning_rate": 4.833168340057957e-06, + "loss": 0.2945, + "mean_token_accuracy": 0.9047042727470398, + "step": 4105 + }, + { + "epoch": 2.053, + "grad_norm": 3.6352452320883857, + "learning_rate": 4.833011581584746e-06, + "loss": 0.2515, + "mean_token_accuracy": 0.9082736968994141, + "step": 4106 + }, + { + "epoch": 2.0535, + "grad_norm": 4.2013589090815415, + "learning_rate": 4.83285475204394e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.887959361076355, + "step": 4107 + }, + { + "epoch": 2.054, + "grad_norm": 1.9007460350718721, + "learning_rate": 4.832697851440313e-06, + "loss": 0.2611, + "mean_token_accuracy": 0.9107167720794678, + "step": 4108 + }, + { + "epoch": 2.0545, + "grad_norm": 11.332235171480162, + "learning_rate": 4.832540879778647e-06, + "loss": 0.3016, + "mean_token_accuracy": 0.9036043882369995, + "step": 4109 + }, + { + "epoch": 2.055, + "grad_norm": 1.5956080862088469, + "learning_rate": 4.832383837063723e-06, + "loss": 0.2054, + "mean_token_accuracy": 0.9259711503982544, + "step": 4110 + }, + { + "epoch": 2.0555, + "grad_norm": 2.2259806099849695, + "learning_rate": 4.832226723300324e-06, + "loss": 0.2227, + "mean_token_accuracy": 0.922370433807373, + "step": 4111 + }, + { + "epoch": 2.056, + "grad_norm": 2.0888510460510155, + "learning_rate": 4.832069538493237e-06, + "loss": 0.2853, + "mean_token_accuracy": 0.9059433341026306, + "step": 4112 + }, + { + "epoch": 2.0565, + "grad_norm": 5.532046712142985, + "learning_rate": 4.831912282647249e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8938383460044861, + "step": 4113 + }, + { + "epoch": 2.057, + "grad_norm": 6.3968265374020845, + "learning_rate": 4.831754955767151e-06, + "loss": 0.302, + "mean_token_accuracy": 0.9081571102142334, + "step": 4114 + }, + { + "epoch": 2.0575, + "grad_norm": 1.6391698574537983, + "learning_rate": 4.831597557857736e-06, + "loss": 0.2442, + "mean_token_accuracy": 0.9133172035217285, + "step": 4115 + }, + { + "epoch": 2.058, + "grad_norm": 2.7039407006108416, + "learning_rate": 4.831440088923798e-06, + "loss": 0.2519, + "mean_token_accuracy": 0.9060351252555847, + "step": 4116 + }, + { + "epoch": 2.0585, + "grad_norm": 5.410777509837409, + "learning_rate": 4.831282548970132e-06, + "loss": 0.2413, + "mean_token_accuracy": 0.9163617491722107, + "step": 4117 + }, + { + "epoch": 2.059, + "grad_norm": 1.9804357013626912, + "learning_rate": 4.83112493800154e-06, + "loss": 0.2171, + "mean_token_accuracy": 0.9196233153343201, + "step": 4118 + }, + { + "epoch": 2.0595, + "grad_norm": 4.682371845255293, + "learning_rate": 4.830967256022822e-06, + "loss": 0.3017, + "mean_token_accuracy": 0.8966633081436157, + "step": 4119 + }, + { + "epoch": 2.06, + "grad_norm": 3.7511091510059065, + "learning_rate": 4.830809503038781e-06, + "loss": 0.2503, + "mean_token_accuracy": 0.9193887114524841, + "step": 4120 + }, + { + "epoch": 2.0605, + "grad_norm": 3.723614335910291, + "learning_rate": 4.830651679054223e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8707940578460693, + "step": 4121 + }, + { + "epoch": 2.061, + "grad_norm": 6.582764950452612, + "learning_rate": 4.830493784073954e-06, + "loss": 0.2563, + "mean_token_accuracy": 0.9083366394042969, + "step": 4122 + }, + { + "epoch": 2.0615, + "grad_norm": 2.6824122940094197, + "learning_rate": 4.830335818102785e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8798671960830688, + "step": 4123 + }, + { + "epoch": 2.062, + "grad_norm": 2.9517694241724564, + "learning_rate": 4.830177781145528e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8891167044639587, + "step": 4124 + }, + { + "epoch": 2.0625, + "grad_norm": 25.452179997749855, + "learning_rate": 4.830019673206997e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.9004701972007751, + "step": 4125 + }, + { + "epoch": 2.063, + "grad_norm": 2.868402062713622, + "learning_rate": 4.829861494292007e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8711830377578735, + "step": 4126 + }, + { + "epoch": 2.0635, + "grad_norm": 2.5931194808403673, + "learning_rate": 4.829703244405379e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.8982706069946289, + "step": 4127 + }, + { + "epoch": 2.064, + "grad_norm": 1.989877282299947, + "learning_rate": 4.8295449235519314e-06, + "loss": 0.2409, + "mean_token_accuracy": 0.9199197888374329, + "step": 4128 + }, + { + "epoch": 2.0645, + "grad_norm": 1.6056936015270324, + "learning_rate": 4.829386531736488e-06, + "loss": 0.1943, + "mean_token_accuracy": 0.9284657835960388, + "step": 4129 + }, + { + "epoch": 2.065, + "grad_norm": 3.1243922901304892, + "learning_rate": 4.829228068963873e-06, + "loss": 0.269, + "mean_token_accuracy": 0.9055050611495972, + "step": 4130 + }, + { + "epoch": 2.0655, + "grad_norm": 2.152677197200195, + "learning_rate": 4.8290695352389135e-06, + "loss": 0.2221, + "mean_token_accuracy": 0.9228277206420898, + "step": 4131 + }, + { + "epoch": 2.066, + "grad_norm": 5.566552014787944, + "learning_rate": 4.82891093056644e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8932926654815674, + "step": 4132 + }, + { + "epoch": 2.0665, + "grad_norm": 3.822263364095697, + "learning_rate": 4.828752254951281e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.8966127634048462, + "step": 4133 + }, + { + "epoch": 2.067, + "grad_norm": 2.1818894146525243, + "learning_rate": 4.828593508398273e-06, + "loss": 0.2455, + "mean_token_accuracy": 0.9124693274497986, + "step": 4134 + }, + { + "epoch": 2.0675, + "grad_norm": 3.0281100289557235, + "learning_rate": 4.828434690912251e-06, + "loss": 0.2927, + "mean_token_accuracy": 0.9031675457954407, + "step": 4135 + }, + { + "epoch": 2.068, + "grad_norm": 5.213640476089396, + "learning_rate": 4.828275802498051e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8959221839904785, + "step": 4136 + }, + { + "epoch": 2.0685000000000002, + "grad_norm": 2.021754588722662, + "learning_rate": 4.828116843160515e-06, + "loss": 0.2858, + "mean_token_accuracy": 0.8993191123008728, + "step": 4137 + }, + { + "epoch": 2.069, + "grad_norm": 3.6405041775002203, + "learning_rate": 4.8279578129044855e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8998909592628479, + "step": 4138 + }, + { + "epoch": 2.0695, + "grad_norm": 10.787232065073283, + "learning_rate": 4.827798711734804e-06, + "loss": 0.2569, + "mean_token_accuracy": 0.9127492308616638, + "step": 4139 + }, + { + "epoch": 2.07, + "grad_norm": 3.202883597314408, + "learning_rate": 4.8276395396563215e-06, + "loss": 0.3, + "mean_token_accuracy": 0.9023562669754028, + "step": 4140 + }, + { + "epoch": 2.0705, + "grad_norm": 3.6132207428808627, + "learning_rate": 4.827480296673882e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8865209221839905, + "step": 4141 + }, + { + "epoch": 2.071, + "grad_norm": 2.804827996702134, + "learning_rate": 4.82732098279234e-06, + "loss": 0.2679, + "mean_token_accuracy": 0.9149958491325378, + "step": 4142 + }, + { + "epoch": 2.0715, + "grad_norm": 2.617226338909315, + "learning_rate": 4.827161598016546e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8918027877807617, + "step": 4143 + }, + { + "epoch": 2.072, + "grad_norm": 13.229694510527686, + "learning_rate": 4.827002142351356e-06, + "loss": 0.2554, + "mean_token_accuracy": 0.917059063911438, + "step": 4144 + }, + { + "epoch": 2.0725, + "grad_norm": 2.1753175990650875, + "learning_rate": 4.826842615801628e-06, + "loss": 0.2856, + "mean_token_accuracy": 0.9005251526832581, + "step": 4145 + }, + { + "epoch": 2.073, + "grad_norm": 2.2315631023605165, + "learning_rate": 4.82668301837222e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8832388520240784, + "step": 4146 + }, + { + "epoch": 2.0735, + "grad_norm": 1.86594929355049, + "learning_rate": 4.826523350067994e-06, + "loss": 0.2208, + "mean_token_accuracy": 0.9245644807815552, + "step": 4147 + }, + { + "epoch": 2.074, + "grad_norm": 5.10393220721139, + "learning_rate": 4.826363610893815e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.9080459475517273, + "step": 4148 + }, + { + "epoch": 2.0745, + "grad_norm": 2.575396651104821, + "learning_rate": 4.8262038008545485e-06, + "loss": 0.2865, + "mean_token_accuracy": 0.9069945216178894, + "step": 4149 + }, + { + "epoch": 2.075, + "grad_norm": 4.552846366785825, + "learning_rate": 4.826043919955062e-06, + "loss": 0.2882, + "mean_token_accuracy": 0.9090272784233093, + "step": 4150 + }, + { + "epoch": 2.0755, + "grad_norm": 1.982293358574408, + "learning_rate": 4.825883968200226e-06, + "loss": 0.192, + "mean_token_accuracy": 0.9224986433982849, + "step": 4151 + }, + { + "epoch": 2.076, + "grad_norm": 2.283163353432102, + "learning_rate": 4.825723945594912e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8695172071456909, + "step": 4152 + }, + { + "epoch": 2.0765, + "grad_norm": 2.1117812239643987, + "learning_rate": 4.825563852143996e-06, + "loss": 0.238, + "mean_token_accuracy": 0.9159226417541504, + "step": 4153 + }, + { + "epoch": 2.077, + "grad_norm": 2.0826418456041984, + "learning_rate": 4.825403687852354e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8892059922218323, + "step": 4154 + }, + { + "epoch": 2.0775, + "grad_norm": 2.9570577034071763, + "learning_rate": 4.825243452724865e-06, + "loss": 0.2248, + "mean_token_accuracy": 0.9164615273475647, + "step": 4155 + }, + { + "epoch": 2.078, + "grad_norm": 3.0918226513169955, + "learning_rate": 4.825083146766411e-06, + "loss": 0.2805, + "mean_token_accuracy": 0.901764988899231, + "step": 4156 + }, + { + "epoch": 2.0785, + "grad_norm": 3.2866996475953365, + "learning_rate": 4.824922769981874e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8879674673080444, + "step": 4157 + }, + { + "epoch": 2.079, + "grad_norm": 3.652692937039267, + "learning_rate": 4.824762322376139e-06, + "loss": 0.2655, + "mean_token_accuracy": 0.9073103070259094, + "step": 4158 + }, + { + "epoch": 2.0795, + "grad_norm": 5.311110302296244, + "learning_rate": 4.824601803954094e-06, + "loss": 0.2039, + "mean_token_accuracy": 0.9261482954025269, + "step": 4159 + }, + { + "epoch": 2.08, + "grad_norm": 3.0203010203901077, + "learning_rate": 4.824441214720629e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8863371014595032, + "step": 4160 + }, + { + "epoch": 2.0805, + "grad_norm": 3.0206852002091615, + "learning_rate": 4.824280554680636e-06, + "loss": 0.3161, + "mean_token_accuracy": 0.8988746404647827, + "step": 4161 + }, + { + "epoch": 2.081, + "grad_norm": 2.5419033726860616, + "learning_rate": 4.824119823839009e-06, + "loss": 0.2588, + "mean_token_accuracy": 0.9088808298110962, + "step": 4162 + }, + { + "epoch": 2.0815, + "grad_norm": 2.0690849682070698, + "learning_rate": 4.823959022200642e-06, + "loss": 0.2405, + "mean_token_accuracy": 0.9173057079315186, + "step": 4163 + }, + { + "epoch": 2.082, + "grad_norm": 2.5795245614777995, + "learning_rate": 4.823798149770437e-06, + "loss": 0.2773, + "mean_token_accuracy": 0.9036760926246643, + "step": 4164 + }, + { + "epoch": 2.0825, + "grad_norm": 2.432454631739719, + "learning_rate": 4.823637206553292e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8715314865112305, + "step": 4165 + }, + { + "epoch": 2.083, + "grad_norm": 13.555641100226811, + "learning_rate": 4.82347619255411e-06, + "loss": 0.1861, + "mean_token_accuracy": 0.92795330286026, + "step": 4166 + }, + { + "epoch": 2.0835, + "grad_norm": 3.09346899946003, + "learning_rate": 4.8233151077777955e-06, + "loss": 0.3081, + "mean_token_accuracy": 0.8979836106300354, + "step": 4167 + }, + { + "epoch": 2.084, + "grad_norm": 2.578332459558244, + "learning_rate": 4.823153952229257e-06, + "loss": 0.2671, + "mean_token_accuracy": 0.9105653166770935, + "step": 4168 + }, + { + "epoch": 2.0845, + "grad_norm": 2.253620930040032, + "learning_rate": 4.822992725913401e-06, + "loss": 0.3037, + "mean_token_accuracy": 0.8994343280792236, + "step": 4169 + }, + { + "epoch": 2.085, + "grad_norm": 2.7855487433417383, + "learning_rate": 4.8228314288351405e-06, + "loss": 0.2081, + "mean_token_accuracy": 0.9194495677947998, + "step": 4170 + }, + { + "epoch": 2.0855, + "grad_norm": 20.915620746022093, + "learning_rate": 4.8226700609993894e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8884997367858887, + "step": 4171 + }, + { + "epoch": 2.086, + "grad_norm": 1.6065101628575327, + "learning_rate": 4.822508622411062e-06, + "loss": 0.2231, + "mean_token_accuracy": 0.920880913734436, + "step": 4172 + }, + { + "epoch": 2.0865, + "grad_norm": 3.6822937465592482, + "learning_rate": 4.822347113075076e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.9090909361839294, + "step": 4173 + }, + { + "epoch": 2.087, + "grad_norm": 2.6825693457116713, + "learning_rate": 4.822185532996352e-06, + "loss": 0.2935, + "mean_token_accuracy": 0.8966902494430542, + "step": 4174 + }, + { + "epoch": 2.0875, + "grad_norm": 1.7838670463566986, + "learning_rate": 4.822023882179811e-06, + "loss": 0.2507, + "mean_token_accuracy": 0.9126643538475037, + "step": 4175 + }, + { + "epoch": 2.088, + "grad_norm": 45.489172273760225, + "learning_rate": 4.821862160630378e-06, + "loss": 0.2803, + "mean_token_accuracy": 0.9113706350326538, + "step": 4176 + }, + { + "epoch": 2.0885, + "grad_norm": 3.404037253172037, + "learning_rate": 4.821700368352979e-06, + "loss": 0.2684, + "mean_token_accuracy": 0.9052894711494446, + "step": 4177 + }, + { + "epoch": 2.089, + "grad_norm": 4.629878425726196, + "learning_rate": 4.821538505352544e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8985121250152588, + "step": 4178 + }, + { + "epoch": 2.0895, + "grad_norm": 7.045801775936881, + "learning_rate": 4.821376571634001e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8948017954826355, + "step": 4179 + }, + { + "epoch": 2.09, + "grad_norm": 2.5967913101477422, + "learning_rate": 4.821214567202284e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8890328407287598, + "step": 4180 + }, + { + "epoch": 2.0905, + "grad_norm": 2.8048638809096516, + "learning_rate": 4.821052492062328e-06, + "loss": 0.2237, + "mean_token_accuracy": 0.9195441007614136, + "step": 4181 + }, + { + "epoch": 2.091, + "grad_norm": 2.729688035659221, + "learning_rate": 4.820890346219071e-06, + "loss": 0.2619, + "mean_token_accuracy": 0.9091795086860657, + "step": 4182 + }, + { + "epoch": 2.0915, + "grad_norm": 6.102128148658727, + "learning_rate": 4.82072812967745e-06, + "loss": 0.3072, + "mean_token_accuracy": 0.896881639957428, + "step": 4183 + }, + { + "epoch": 2.092, + "grad_norm": 2.049339492215483, + "learning_rate": 4.820565842442408e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8968183398246765, + "step": 4184 + }, + { + "epoch": 2.0925, + "grad_norm": 4.442831417417991, + "learning_rate": 4.820403484518889e-06, + "loss": 0.2632, + "mean_token_accuracy": 0.9169178605079651, + "step": 4185 + }, + { + "epoch": 2.093, + "grad_norm": 2.317395609383369, + "learning_rate": 4.820241055911837e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8877022862434387, + "step": 4186 + }, + { + "epoch": 2.0935, + "grad_norm": 2.6012116329574133, + "learning_rate": 4.820078556626202e-06, + "loss": 0.2658, + "mean_token_accuracy": 0.9131455421447754, + "step": 4187 + }, + { + "epoch": 2.094, + "grad_norm": 2.1639748826436933, + "learning_rate": 4.819915986666932e-06, + "loss": 0.2697, + "mean_token_accuracy": 0.9072955846786499, + "step": 4188 + }, + { + "epoch": 2.0945, + "grad_norm": 1.9026068336957467, + "learning_rate": 4.81975334603898e-06, + "loss": 0.2342, + "mean_token_accuracy": 0.9203426837921143, + "step": 4189 + }, + { + "epoch": 2.095, + "grad_norm": 2.15930102728411, + "learning_rate": 4.8195906347473e-06, + "loss": 0.2405, + "mean_token_accuracy": 0.9164000153541565, + "step": 4190 + }, + { + "epoch": 2.0955, + "grad_norm": 2.226865051748082, + "learning_rate": 4.819427852796849e-06, + "loss": 0.2882, + "mean_token_accuracy": 0.9016969203948975, + "step": 4191 + }, + { + "epoch": 2.096, + "grad_norm": 2.3560302829182618, + "learning_rate": 4.8192650001925855e-06, + "loss": 0.2185, + "mean_token_accuracy": 0.921136200428009, + "step": 4192 + }, + { + "epoch": 2.0965, + "grad_norm": 1.8559868187366262, + "learning_rate": 4.81910207693947e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.9071428775787354, + "step": 4193 + }, + { + "epoch": 2.097, + "grad_norm": 1.9833249399849746, + "learning_rate": 4.818939083042466e-06, + "loss": 0.2252, + "mean_token_accuracy": 0.9211665391921997, + "step": 4194 + }, + { + "epoch": 2.0975, + "grad_norm": 3.570507994629439, + "learning_rate": 4.818776018506538e-06, + "loss": 0.2385, + "mean_token_accuracy": 0.9191176295280457, + "step": 4195 + }, + { + "epoch": 2.098, + "grad_norm": 2.3512853306247847, + "learning_rate": 4.818612883336654e-06, + "loss": 0.3047, + "mean_token_accuracy": 0.903160810470581, + "step": 4196 + }, + { + "epoch": 2.0985, + "grad_norm": 2.13502631641187, + "learning_rate": 4.818449677537782e-06, + "loss": 0.2428, + "mean_token_accuracy": 0.918140709400177, + "step": 4197 + }, + { + "epoch": 2.099, + "grad_norm": 1.9125396795574414, + "learning_rate": 4.818286401114894e-06, + "loss": 0.2983, + "mean_token_accuracy": 0.9018335342407227, + "step": 4198 + }, + { + "epoch": 2.0995, + "grad_norm": 9.501145950649617, + "learning_rate": 4.818123054072965e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8888654112815857, + "step": 4199 + }, + { + "epoch": 2.1, + "grad_norm": 2.553374716108795, + "learning_rate": 4.817959636416969e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8822756409645081, + "step": 4200 + }, + { + "epoch": 2.1005, + "grad_norm": 10.577455881776464, + "learning_rate": 4.8177961481518856e-06, + "loss": 0.2139, + "mean_token_accuracy": 0.9261083602905273, + "step": 4201 + }, + { + "epoch": 2.101, + "grad_norm": 2.7112398577522536, + "learning_rate": 4.817632589282693e-06, + "loss": 0.2491, + "mean_token_accuracy": 0.9119553565979004, + "step": 4202 + }, + { + "epoch": 2.1015, + "grad_norm": 3.886268475006972, + "learning_rate": 4.817468959814375e-06, + "loss": 0.2803, + "mean_token_accuracy": 0.9030114412307739, + "step": 4203 + }, + { + "epoch": 2.102, + "grad_norm": 4.301706262762252, + "learning_rate": 4.817305259751916e-06, + "loss": 0.2653, + "mean_token_accuracy": 0.9088176488876343, + "step": 4204 + }, + { + "epoch": 2.1025, + "grad_norm": 2.2889843605163813, + "learning_rate": 4.817141489100302e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8849366307258606, + "step": 4205 + }, + { + "epoch": 2.103, + "grad_norm": 3.787913556323452, + "learning_rate": 4.816977647864522e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8964073657989502, + "step": 4206 + }, + { + "epoch": 2.1035, + "grad_norm": 2.0439643074486358, + "learning_rate": 4.816813736049568e-06, + "loss": 0.2695, + "mean_token_accuracy": 0.9050107598304749, + "step": 4207 + }, + { + "epoch": 2.104, + "grad_norm": 2.8158772456318983, + "learning_rate": 4.816649753660431e-06, + "loss": 0.2784, + "mean_token_accuracy": 0.9114503860473633, + "step": 4208 + }, + { + "epoch": 2.1045, + "grad_norm": 3.536738916305452, + "learning_rate": 4.816485700702107e-06, + "loss": 0.2131, + "mean_token_accuracy": 0.9212684631347656, + "step": 4209 + }, + { + "epoch": 2.105, + "grad_norm": 2.2498116442362184, + "learning_rate": 4.816321577179594e-06, + "loss": 0.1967, + "mean_token_accuracy": 0.9274733662605286, + "step": 4210 + }, + { + "epoch": 2.1055, + "grad_norm": 2.0950128275204105, + "learning_rate": 4.816157383097891e-06, + "loss": 0.1991, + "mean_token_accuracy": 0.9290123581886292, + "step": 4211 + }, + { + "epoch": 2.106, + "grad_norm": 4.363157944517664, + "learning_rate": 4.815993118461999e-06, + "loss": 0.1758, + "mean_token_accuracy": 0.938160240650177, + "step": 4212 + }, + { + "epoch": 2.1065, + "grad_norm": 2.0705338382953395, + "learning_rate": 4.815828783276923e-06, + "loss": 0.2199, + "mean_token_accuracy": 0.9242607355117798, + "step": 4213 + }, + { + "epoch": 2.107, + "grad_norm": 2.4404941214808775, + "learning_rate": 4.815664377547667e-06, + "loss": 0.2985, + "mean_token_accuracy": 0.8985281586647034, + "step": 4214 + }, + { + "epoch": 2.1075, + "grad_norm": 2.989380616134534, + "learning_rate": 4.815499901279242e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8817051649093628, + "step": 4215 + }, + { + "epoch": 2.108, + "grad_norm": 2.034786042499269, + "learning_rate": 4.8153353544766555e-06, + "loss": 0.2511, + "mean_token_accuracy": 0.9160068035125732, + "step": 4216 + }, + { + "epoch": 2.1085, + "grad_norm": 18.2158643032631, + "learning_rate": 4.8151707371449215e-06, + "loss": 0.2978, + "mean_token_accuracy": 0.9003746509552002, + "step": 4217 + }, + { + "epoch": 2.109, + "grad_norm": 2.255067544757149, + "learning_rate": 4.815006049289054e-06, + "loss": 0.2974, + "mean_token_accuracy": 0.9130882620811462, + "step": 4218 + }, + { + "epoch": 2.1095, + "grad_norm": 2.982102132101715, + "learning_rate": 4.814841290914069e-06, + "loss": 0.3273, + "mean_token_accuracy": 0.894034743309021, + "step": 4219 + }, + { + "epoch": 2.11, + "grad_norm": 4.264801952780042, + "learning_rate": 4.814676462024988e-06, + "loss": 0.2627, + "mean_token_accuracy": 0.9128094911575317, + "step": 4220 + }, + { + "epoch": 2.1105, + "grad_norm": 3.138076696879929, + "learning_rate": 4.814511562626828e-06, + "loss": 0.2497, + "mean_token_accuracy": 0.9075798988342285, + "step": 4221 + }, + { + "epoch": 2.111, + "grad_norm": 2.8556485270392598, + "learning_rate": 4.814346592724615e-06, + "loss": 0.2383, + "mean_token_accuracy": 0.9225171804428101, + "step": 4222 + }, + { + "epoch": 2.1115, + "grad_norm": 3.243795133982864, + "learning_rate": 4.814181552323374e-06, + "loss": 0.2022, + "mean_token_accuracy": 0.9295336604118347, + "step": 4223 + }, + { + "epoch": 2.112, + "grad_norm": 3.8978950974627207, + "learning_rate": 4.814016441428131e-06, + "loss": 0.3077, + "mean_token_accuracy": 0.9043856263160706, + "step": 4224 + }, + { + "epoch": 2.1125, + "grad_norm": 4.958226321816002, + "learning_rate": 4.8138512600439165e-06, + "loss": 0.2291, + "mean_token_accuracy": 0.9205029606819153, + "step": 4225 + }, + { + "epoch": 2.113, + "grad_norm": 12.630295078718857, + "learning_rate": 4.813686008175762e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.9020006656646729, + "step": 4226 + }, + { + "epoch": 2.1135, + "grad_norm": 1.9323028670862468, + "learning_rate": 4.8135206858287024e-06, + "loss": 0.2929, + "mean_token_accuracy": 0.8973880410194397, + "step": 4227 + }, + { + "epoch": 2.114, + "grad_norm": 2.516784933577442, + "learning_rate": 4.813355293007771e-06, + "loss": 0.2199, + "mean_token_accuracy": 0.9221152067184448, + "step": 4228 + }, + { + "epoch": 2.1145, + "grad_norm": 3.180524506140413, + "learning_rate": 4.813189829718009e-06, + "loss": 0.2233, + "mean_token_accuracy": 0.9236725568771362, + "step": 4229 + }, + { + "epoch": 2.115, + "grad_norm": 2.2825737576499936, + "learning_rate": 4.8130242959644555e-06, + "loss": 0.2588, + "mean_token_accuracy": 0.9138363599777222, + "step": 4230 + }, + { + "epoch": 2.1155, + "grad_norm": 2.2513532604838344, + "learning_rate": 4.812858691752153e-06, + "loss": 0.2515, + "mean_token_accuracy": 0.9125482439994812, + "step": 4231 + }, + { + "epoch": 2.116, + "grad_norm": 2.4307606327892395, + "learning_rate": 4.812693017086145e-06, + "loss": 0.281, + "mean_token_accuracy": 0.9012608528137207, + "step": 4232 + }, + { + "epoch": 2.1165, + "grad_norm": 2.621165511769813, + "learning_rate": 4.81252727197148e-06, + "loss": 0.1857, + "mean_token_accuracy": 0.928078830242157, + "step": 4233 + }, + { + "epoch": 2.117, + "grad_norm": 7.181001086037616, + "learning_rate": 4.812361456413206e-06, + "loss": 0.2688, + "mean_token_accuracy": 0.9118563532829285, + "step": 4234 + }, + { + "epoch": 2.1175, + "grad_norm": 2.128974624125074, + "learning_rate": 4.812195570416374e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8720853924751282, + "step": 4235 + }, + { + "epoch": 2.118, + "grad_norm": 1.596430402708181, + "learning_rate": 4.812029613986038e-06, + "loss": 0.2431, + "mean_token_accuracy": 0.914476752281189, + "step": 4236 + }, + { + "epoch": 2.1185, + "grad_norm": 26.62747613212302, + "learning_rate": 4.811863587127252e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8773859143257141, + "step": 4237 + }, + { + "epoch": 2.1189999999999998, + "grad_norm": 1.8552079568890267, + "learning_rate": 4.811697489845074e-06, + "loss": 0.2995, + "mean_token_accuracy": 0.8955407738685608, + "step": 4238 + }, + { + "epoch": 2.1195, + "grad_norm": 6.013978983401279, + "learning_rate": 4.8115313221445635e-06, + "loss": 0.2184, + "mean_token_accuracy": 0.9233629703521729, + "step": 4239 + }, + { + "epoch": 2.12, + "grad_norm": 1.908269443281761, + "learning_rate": 4.811365084030784e-06, + "loss": 0.2248, + "mean_token_accuracy": 0.9180169701576233, + "step": 4240 + }, + { + "epoch": 2.1205, + "grad_norm": 1.9420806736585257, + "learning_rate": 4.811198775508797e-06, + "loss": 0.319, + "mean_token_accuracy": 0.8901660442352295, + "step": 4241 + }, + { + "epoch": 2.121, + "grad_norm": 3.964138659936366, + "learning_rate": 4.811032396583668e-06, + "loss": 0.2249, + "mean_token_accuracy": 0.9193927645683289, + "step": 4242 + }, + { + "epoch": 2.1215, + "grad_norm": 1.9987698779621372, + "learning_rate": 4.810865947260468e-06, + "loss": 0.2242, + "mean_token_accuracy": 0.9172375202178955, + "step": 4243 + }, + { + "epoch": 2.122, + "grad_norm": 4.207961352558068, + "learning_rate": 4.810699427544265e-06, + "loss": 0.2816, + "mean_token_accuracy": 0.9012193083763123, + "step": 4244 + }, + { + "epoch": 2.1225, + "grad_norm": 19.208726748244786, + "learning_rate": 4.810532837440134e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8848099708557129, + "step": 4245 + }, + { + "epoch": 2.123, + "grad_norm": 2.7068847450518367, + "learning_rate": 4.8103661769531465e-06, + "loss": 0.2153, + "mean_token_accuracy": 0.9219845533370972, + "step": 4246 + }, + { + "epoch": 2.1235, + "grad_norm": 2.410104016338505, + "learning_rate": 4.810199446088382e-06, + "loss": 0.2176, + "mean_token_accuracy": 0.9260284304618835, + "step": 4247 + }, + { + "epoch": 2.124, + "grad_norm": 2.801631356038792, + "learning_rate": 4.810032644850917e-06, + "loss": 0.3169, + "mean_token_accuracy": 0.898902952671051, + "step": 4248 + }, + { + "epoch": 2.1245, + "grad_norm": 2.8730865188846546, + "learning_rate": 4.809865773245835e-06, + "loss": 0.2499, + "mean_token_accuracy": 0.9150927662849426, + "step": 4249 + }, + { + "epoch": 2.125, + "grad_norm": 11.496226086763945, + "learning_rate": 4.809698831278217e-06, + "loss": 0.1663, + "mean_token_accuracy": 0.9361664056777954, + "step": 4250 + }, + { + "epoch": 2.1255, + "grad_norm": 1.953005124101435, + "learning_rate": 4.80953181895315e-06, + "loss": 0.2629, + "mean_token_accuracy": 0.9163439273834229, + "step": 4251 + }, + { + "epoch": 2.126, + "grad_norm": 3.260398371746421, + "learning_rate": 4.80936473627572e-06, + "loss": 0.2818, + "mean_token_accuracy": 0.906773567199707, + "step": 4252 + }, + { + "epoch": 2.1265, + "grad_norm": 2.175885149375283, + "learning_rate": 4.809197583251018e-06, + "loss": 0.2367, + "mean_token_accuracy": 0.917005717754364, + "step": 4253 + }, + { + "epoch": 2.127, + "grad_norm": 1.8772037366060084, + "learning_rate": 4.809030359884136e-06, + "loss": 0.2632, + "mean_token_accuracy": 0.9222349524497986, + "step": 4254 + }, + { + "epoch": 2.1275, + "grad_norm": 4.375743284785632, + "learning_rate": 4.808863066180167e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8909774422645569, + "step": 4255 + }, + { + "epoch": 2.128, + "grad_norm": 2.6600386926519035, + "learning_rate": 4.808695702144206e-06, + "loss": 0.2796, + "mean_token_accuracy": 0.9067046046257019, + "step": 4256 + }, + { + "epoch": 2.1285, + "grad_norm": 2.478266123491165, + "learning_rate": 4.808528267781353e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8869311809539795, + "step": 4257 + }, + { + "epoch": 2.129, + "grad_norm": 4.325283904985414, + "learning_rate": 4.808360763096708e-06, + "loss": 0.2281, + "mean_token_accuracy": 0.9195587038993835, + "step": 4258 + }, + { + "epoch": 2.1295, + "grad_norm": 4.137070758634976, + "learning_rate": 4.808193188095373e-06, + "loss": 0.2437, + "mean_token_accuracy": 0.9106582999229431, + "step": 4259 + }, + { + "epoch": 2.13, + "grad_norm": 5.395632690066071, + "learning_rate": 4.808025542782453e-06, + "loss": 0.2509, + "mean_token_accuracy": 0.9113765954971313, + "step": 4260 + }, + { + "epoch": 2.1305, + "grad_norm": 1.6357149444286516, + "learning_rate": 4.807857827163054e-06, + "loss": 0.2047, + "mean_token_accuracy": 0.9328547716140747, + "step": 4261 + }, + { + "epoch": 2.1310000000000002, + "grad_norm": 2.8270462491391197, + "learning_rate": 4.8076900412422865e-06, + "loss": 0.2631, + "mean_token_accuracy": 0.9126831293106079, + "step": 4262 + }, + { + "epoch": 2.1315, + "grad_norm": 3.8172480754856646, + "learning_rate": 4.80752218502526e-06, + "loss": 0.2826, + "mean_token_accuracy": 0.9052255749702454, + "step": 4263 + }, + { + "epoch": 2.132, + "grad_norm": 2.3116538347361, + "learning_rate": 4.807354258517088e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.902337908744812, + "step": 4264 + }, + { + "epoch": 2.1325, + "grad_norm": 3.0338455696022906, + "learning_rate": 4.807186261722886e-06, + "loss": 0.2337, + "mean_token_accuracy": 0.9201886057853699, + "step": 4265 + }, + { + "epoch": 2.133, + "grad_norm": 7.747580348339454, + "learning_rate": 4.807018194647772e-06, + "loss": 0.2964, + "mean_token_accuracy": 0.9008968472480774, + "step": 4266 + }, + { + "epoch": 2.1335, + "grad_norm": 5.3058673505512335, + "learning_rate": 4.806850057296866e-06, + "loss": 0.2692, + "mean_token_accuracy": 0.9111448526382446, + "step": 4267 + }, + { + "epoch": 2.134, + "grad_norm": 1.9541517095652736, + "learning_rate": 4.8066818496752875e-06, + "loss": 0.2975, + "mean_token_accuracy": 0.9108607769012451, + "step": 4268 + }, + { + "epoch": 2.1345, + "grad_norm": 2.837564189331402, + "learning_rate": 4.806513571788163e-06, + "loss": 0.2361, + "mean_token_accuracy": 0.9174150228500366, + "step": 4269 + }, + { + "epoch": 2.135, + "grad_norm": 2.1908554648627754, + "learning_rate": 4.806345223640616e-06, + "loss": 0.2837, + "mean_token_accuracy": 0.904125988483429, + "step": 4270 + }, + { + "epoch": 2.1355, + "grad_norm": 3.7659454324584254, + "learning_rate": 4.806176805237777e-06, + "loss": 0.204, + "mean_token_accuracy": 0.9261277914047241, + "step": 4271 + }, + { + "epoch": 2.136, + "grad_norm": 7.092409642854912, + "learning_rate": 4.806008316584776e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8854022026062012, + "step": 4272 + }, + { + "epoch": 2.1365, + "grad_norm": 8.719107543168429, + "learning_rate": 4.805839757686743e-06, + "loss": 0.2504, + "mean_token_accuracy": 0.9133827686309814, + "step": 4273 + }, + { + "epoch": 2.137, + "grad_norm": 1.679027331430766, + "learning_rate": 4.805671128548816e-06, + "loss": 0.2007, + "mean_token_accuracy": 0.9228141903877258, + "step": 4274 + }, + { + "epoch": 2.1375, + "grad_norm": 3.2261546730405737, + "learning_rate": 4.80550242917613e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8889070749282837, + "step": 4275 + }, + { + "epoch": 2.138, + "grad_norm": 5.6414694088895505, + "learning_rate": 4.805333659573824e-06, + "loss": 0.2065, + "mean_token_accuracy": 0.9294935464859009, + "step": 4276 + }, + { + "epoch": 2.1385, + "grad_norm": 7.259255646499127, + "learning_rate": 4.805164819747039e-06, + "loss": 0.1849, + "mean_token_accuracy": 0.9346596598625183, + "step": 4277 + }, + { + "epoch": 2.1390000000000002, + "grad_norm": 27.63629225690925, + "learning_rate": 4.804995909700918e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.8939089179039001, + "step": 4278 + }, + { + "epoch": 2.1395, + "grad_norm": 5.016469611147047, + "learning_rate": 4.804826929440606e-06, + "loss": 0.2902, + "mean_token_accuracy": 0.8993626236915588, + "step": 4279 + }, + { + "epoch": 2.14, + "grad_norm": 1.9143719446882685, + "learning_rate": 4.804657878971252e-06, + "loss": 0.2659, + "mean_token_accuracy": 0.9095599055290222, + "step": 4280 + }, + { + "epoch": 2.1405, + "grad_norm": 134.2561953690354, + "learning_rate": 4.8044887582980036e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8837894797325134, + "step": 4281 + }, + { + "epoch": 2.141, + "grad_norm": 3.2482477227054933, + "learning_rate": 4.804319567426014e-06, + "loss": 0.2761, + "mean_token_accuracy": 0.9053861498832703, + "step": 4282 + }, + { + "epoch": 2.1415, + "grad_norm": 2.2685785529018734, + "learning_rate": 4.804150306360437e-06, + "loss": 0.2973, + "mean_token_accuracy": 0.9024225473403931, + "step": 4283 + }, + { + "epoch": 2.142, + "grad_norm": 2.049420245710153, + "learning_rate": 4.803980975106427e-06, + "loss": 0.2989, + "mean_token_accuracy": 0.8985567092895508, + "step": 4284 + }, + { + "epoch": 2.1425, + "grad_norm": 3.820312068564413, + "learning_rate": 4.803811573669143e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.9035383462905884, + "step": 4285 + }, + { + "epoch": 2.143, + "grad_norm": 2.8863427624018154, + "learning_rate": 4.8036421020537465e-06, + "loss": 0.2786, + "mean_token_accuracy": 0.9068813323974609, + "step": 4286 + }, + { + "epoch": 2.1435, + "grad_norm": 2.9113953415947824, + "learning_rate": 4.8034725602653985e-06, + "loss": 0.2941, + "mean_token_accuracy": 0.9071108102798462, + "step": 4287 + }, + { + "epoch": 2.144, + "grad_norm": 2.551332097178932, + "learning_rate": 4.803302948309264e-06, + "loss": 0.2046, + "mean_token_accuracy": 0.9270882606506348, + "step": 4288 + }, + { + "epoch": 2.1445, + "grad_norm": 3.8679378333626286, + "learning_rate": 4.8031332661905096e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8745776414871216, + "step": 4289 + }, + { + "epoch": 2.145, + "grad_norm": 60.17673478247989, + "learning_rate": 4.802963513914304e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.877976655960083, + "step": 4290 + }, + { + "epoch": 2.1455, + "grad_norm": 2.7946452923058747, + "learning_rate": 4.8027936914858175e-06, + "loss": 0.2558, + "mean_token_accuracy": 0.9192251563072205, + "step": 4291 + }, + { + "epoch": 2.146, + "grad_norm": 3.4145239745931897, + "learning_rate": 4.802623798910224e-06, + "loss": 0.231, + "mean_token_accuracy": 0.9263374209403992, + "step": 4292 + }, + { + "epoch": 2.1465, + "grad_norm": 3.6333495061379524, + "learning_rate": 4.8024538361927e-06, + "loss": 0.2953, + "mean_token_accuracy": 0.8981735706329346, + "step": 4293 + }, + { + "epoch": 2.147, + "grad_norm": 2.5985164715801994, + "learning_rate": 4.80228380333842e-06, + "loss": 0.2916, + "mean_token_accuracy": 0.9000629782676697, + "step": 4294 + }, + { + "epoch": 2.1475, + "grad_norm": 2.2245267375007534, + "learning_rate": 4.802113700352567e-06, + "loss": 0.2547, + "mean_token_accuracy": 0.9301385879516602, + "step": 4295 + }, + { + "epoch": 2.148, + "grad_norm": 3.935737669752874, + "learning_rate": 4.801943527240318e-06, + "loss": 0.2772, + "mean_token_accuracy": 0.9062734842300415, + "step": 4296 + }, + { + "epoch": 2.1485, + "grad_norm": 2.888368858543116, + "learning_rate": 4.8017732840068605e-06, + "loss": 0.2431, + "mean_token_accuracy": 0.9205374121665955, + "step": 4297 + }, + { + "epoch": 2.149, + "grad_norm": 2.3654473313437294, + "learning_rate": 4.80160297065738e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8924137949943542, + "step": 4298 + }, + { + "epoch": 2.1495, + "grad_norm": 2.4012514256465973, + "learning_rate": 4.801432587197063e-06, + "loss": 0.2897, + "mean_token_accuracy": 0.8954752683639526, + "step": 4299 + }, + { + "epoch": 2.15, + "grad_norm": 3.6102830923611116, + "learning_rate": 4.801262133631101e-06, + "loss": 0.2298, + "mean_token_accuracy": 0.9152176976203918, + "step": 4300 + }, + { + "epoch": 2.1505, + "grad_norm": 8.81825316480955, + "learning_rate": 4.801091609964686e-06, + "loss": 0.2944, + "mean_token_accuracy": 0.9116559028625488, + "step": 4301 + }, + { + "epoch": 2.151, + "grad_norm": 9.47443706770727, + "learning_rate": 4.800921016203012e-06, + "loss": 0.2907, + "mean_token_accuracy": 0.9047451615333557, + "step": 4302 + }, + { + "epoch": 2.1515, + "grad_norm": 3.3076875319261125, + "learning_rate": 4.800750352351276e-06, + "loss": 0.2861, + "mean_token_accuracy": 0.9043824672698975, + "step": 4303 + }, + { + "epoch": 2.152, + "grad_norm": 1.9951772778775299, + "learning_rate": 4.800579618414677e-06, + "loss": 0.2279, + "mean_token_accuracy": 0.9218379855155945, + "step": 4304 + }, + { + "epoch": 2.1525, + "grad_norm": 1.951018826090714, + "learning_rate": 4.800408814398414e-06, + "loss": 0.2674, + "mean_token_accuracy": 0.9183645248413086, + "step": 4305 + }, + { + "epoch": 2.153, + "grad_norm": 2.7544065186720923, + "learning_rate": 4.8002379403076925e-06, + "loss": 0.2801, + "mean_token_accuracy": 0.9112379550933838, + "step": 4306 + }, + { + "epoch": 2.1535, + "grad_norm": 22.171747534889942, + "learning_rate": 4.800066996147717e-06, + "loss": 0.3027, + "mean_token_accuracy": 0.9028973579406738, + "step": 4307 + }, + { + "epoch": 2.154, + "grad_norm": 2.3280446952779883, + "learning_rate": 4.799895981923694e-06, + "loss": 0.222, + "mean_token_accuracy": 0.9236873984336853, + "step": 4308 + }, + { + "epoch": 2.1545, + "grad_norm": 3.5267717398172365, + "learning_rate": 4.799724897640832e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8826555013656616, + "step": 4309 + }, + { + "epoch": 2.155, + "grad_norm": 5.30364554759521, + "learning_rate": 4.799553743304345e-06, + "loss": 0.2557, + "mean_token_accuracy": 0.9068028330802917, + "step": 4310 + }, + { + "epoch": 2.1555, + "grad_norm": 4.002853098032205, + "learning_rate": 4.799382518919445e-06, + "loss": 0.2819, + "mean_token_accuracy": 0.9106836915016174, + "step": 4311 + }, + { + "epoch": 2.156, + "grad_norm": 7.406108016438325, + "learning_rate": 4.799211224491348e-06, + "loss": 0.2737, + "mean_token_accuracy": 0.9057527780532837, + "step": 4312 + }, + { + "epoch": 2.1565, + "grad_norm": 2.9752904781414884, + "learning_rate": 4.7990398600252715e-06, + "loss": 0.2958, + "mean_token_accuracy": 0.8939720988273621, + "step": 4313 + }, + { + "epoch": 2.157, + "grad_norm": 1.521387281918604, + "learning_rate": 4.798868425526437e-06, + "loss": 0.261, + "mean_token_accuracy": 0.9094051718711853, + "step": 4314 + }, + { + "epoch": 2.1575, + "grad_norm": 3.34864466955865, + "learning_rate": 4.798696921000066e-06, + "loss": 0.2808, + "mean_token_accuracy": 0.9011178016662598, + "step": 4315 + }, + { + "epoch": 2.158, + "grad_norm": 6.207261421972662, + "learning_rate": 4.798525346451382e-06, + "loss": 0.2882, + "mean_token_accuracy": 0.9099239110946655, + "step": 4316 + }, + { + "epoch": 2.1585, + "grad_norm": 3.7386929321437257, + "learning_rate": 4.798353701885613e-06, + "loss": 0.2663, + "mean_token_accuracy": 0.9099283814430237, + "step": 4317 + }, + { + "epoch": 2.159, + "grad_norm": 2.095662678282537, + "learning_rate": 4.798181987307986e-06, + "loss": 0.2908, + "mean_token_accuracy": 0.9006249904632568, + "step": 4318 + }, + { + "epoch": 2.1595, + "grad_norm": 3.019858707656706, + "learning_rate": 4.798010202723734e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8803278803825378, + "step": 4319 + }, + { + "epoch": 2.16, + "grad_norm": 2.5620073093518343, + "learning_rate": 4.7978383481380865e-06, + "loss": 0.2727, + "mean_token_accuracy": 0.911918044090271, + "step": 4320 + }, + { + "epoch": 2.1605, + "grad_norm": 3.0581090151357895, + "learning_rate": 4.797666423556281e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8893696069717407, + "step": 4321 + }, + { + "epoch": 2.161, + "grad_norm": 5.639487751262769, + "learning_rate": 4.797494428983553e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8891391158103943, + "step": 4322 + }, + { + "epoch": 2.1615, + "grad_norm": 2.458642923229597, + "learning_rate": 4.7973223644251445e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8481239676475525, + "step": 4323 + }, + { + "epoch": 2.162, + "grad_norm": 5.163369548742524, + "learning_rate": 4.797150229886294e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8972365856170654, + "step": 4324 + }, + { + "epoch": 2.1625, + "grad_norm": 2.7171602574100837, + "learning_rate": 4.796978025372247e-06, + "loss": 0.2435, + "mean_token_accuracy": 0.9153622388839722, + "step": 4325 + }, + { + "epoch": 2.163, + "grad_norm": 7.848151479200114, + "learning_rate": 4.7968057508882465e-06, + "loss": 0.2875, + "mean_token_accuracy": 0.9080808162689209, + "step": 4326 + }, + { + "epoch": 2.1635, + "grad_norm": 2.739323815996508, + "learning_rate": 4.796633406439543e-06, + "loss": 0.2992, + "mean_token_accuracy": 0.902305006980896, + "step": 4327 + }, + { + "epoch": 2.164, + "grad_norm": 1.693992347083045, + "learning_rate": 4.796460992031386e-06, + "loss": 0.2699, + "mean_token_accuracy": 0.9046154022216797, + "step": 4328 + }, + { + "epoch": 2.1645, + "grad_norm": 2.074144074567364, + "learning_rate": 4.796288507669026e-06, + "loss": 0.2315, + "mean_token_accuracy": 0.9214537143707275, + "step": 4329 + }, + { + "epoch": 2.165, + "grad_norm": 2.42065230055269, + "learning_rate": 4.796115953357718e-06, + "loss": 0.2997, + "mean_token_accuracy": 0.9002024531364441, + "step": 4330 + }, + { + "epoch": 2.1655, + "grad_norm": 2.6371054488579464, + "learning_rate": 4.795943329102719e-06, + "loss": 0.2324, + "mean_token_accuracy": 0.9104393720626831, + "step": 4331 + }, + { + "epoch": 2.166, + "grad_norm": 5.096052026427676, + "learning_rate": 4.795770634909287e-06, + "loss": 0.2836, + "mean_token_accuracy": 0.9077699780464172, + "step": 4332 + }, + { + "epoch": 2.1665, + "grad_norm": 2.4850025344696727, + "learning_rate": 4.7955978707826826e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.891983687877655, + "step": 4333 + }, + { + "epoch": 2.167, + "grad_norm": 2.608166975911666, + "learning_rate": 4.795425036728168e-06, + "loss": 0.2683, + "mean_token_accuracy": 0.9179498553276062, + "step": 4334 + }, + { + "epoch": 2.1675, + "grad_norm": 2.273536390748715, + "learning_rate": 4.795252132751008e-06, + "loss": 0.2774, + "mean_token_accuracy": 0.9083728194236755, + "step": 4335 + }, + { + "epoch": 2.168, + "grad_norm": 2.573895932150966, + "learning_rate": 4.795079158856471e-06, + "loss": 0.2505, + "mean_token_accuracy": 0.9121351838111877, + "step": 4336 + }, + { + "epoch": 2.1685, + "grad_norm": 2.654262505510986, + "learning_rate": 4.794906115049824e-06, + "loss": 0.2397, + "mean_token_accuracy": 0.9171345829963684, + "step": 4337 + }, + { + "epoch": 2.169, + "grad_norm": 3.9893789249054614, + "learning_rate": 4.79473300133634e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.895880401134491, + "step": 4338 + }, + { + "epoch": 2.1695, + "grad_norm": 14.162413311289923, + "learning_rate": 4.794559817721291e-06, + "loss": 0.2589, + "mean_token_accuracy": 0.911833643913269, + "step": 4339 + }, + { + "epoch": 2.17, + "grad_norm": 1.8370365672016407, + "learning_rate": 4.794386564209953e-06, + "loss": 0.2097, + "mean_token_accuracy": 0.9262411594390869, + "step": 4340 + }, + { + "epoch": 2.1705, + "grad_norm": 3.7008401818544843, + "learning_rate": 4.794213240807604e-06, + "loss": 0.2394, + "mean_token_accuracy": 0.9190317392349243, + "step": 4341 + }, + { + "epoch": 2.171, + "grad_norm": 2.2695771620853735, + "learning_rate": 4.794039847519524e-06, + "loss": 0.229, + "mean_token_accuracy": 0.912362277507782, + "step": 4342 + }, + { + "epoch": 2.1715, + "grad_norm": 2.3396956231766453, + "learning_rate": 4.793866384350993e-06, + "loss": 0.306, + "mean_token_accuracy": 0.9009364247322083, + "step": 4343 + }, + { + "epoch": 2.172, + "grad_norm": 5.317177185649742, + "learning_rate": 4.793692851307297e-06, + "loss": 0.2149, + "mean_token_accuracy": 0.9237608313560486, + "step": 4344 + }, + { + "epoch": 2.1725, + "grad_norm": 3.701734944607044, + "learning_rate": 4.793519248393721e-06, + "loss": 0.2615, + "mean_token_accuracy": 0.9113619327545166, + "step": 4345 + }, + { + "epoch": 2.173, + "grad_norm": 3.11013878795632, + "learning_rate": 4.793345575615554e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8846448659896851, + "step": 4346 + }, + { + "epoch": 2.1734999999999998, + "grad_norm": 3.0927632785981043, + "learning_rate": 4.7931718329780855e-06, + "loss": 0.2698, + "mean_token_accuracy": 0.909547746181488, + "step": 4347 + }, + { + "epoch": 2.174, + "grad_norm": 18.861343623471104, + "learning_rate": 4.792998020486609e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.9045438170433044, + "step": 4348 + }, + { + "epoch": 2.1745, + "grad_norm": 3.7380506202563613, + "learning_rate": 4.792824138146418e-06, + "loss": 0.2497, + "mean_token_accuracy": 0.9147875905036926, + "step": 4349 + }, + { + "epoch": 2.175, + "grad_norm": 3.5937314054545584, + "learning_rate": 4.79265018596281e-06, + "loss": 0.2341, + "mean_token_accuracy": 0.9220154881477356, + "step": 4350 + }, + { + "epoch": 2.1755, + "grad_norm": 4.37751163681206, + "learning_rate": 4.792476163941084e-06, + "loss": 0.2046, + "mean_token_accuracy": 0.9264705777168274, + "step": 4351 + }, + { + "epoch": 2.176, + "grad_norm": 1.7511600400951468, + "learning_rate": 4.792302072086542e-06, + "loss": 0.2889, + "mean_token_accuracy": 0.9045371413230896, + "step": 4352 + }, + { + "epoch": 2.1765, + "grad_norm": 3.1771784534572043, + "learning_rate": 4.792127910404484e-06, + "loss": 0.2624, + "mean_token_accuracy": 0.9113003611564636, + "step": 4353 + }, + { + "epoch": 2.177, + "grad_norm": 4.481273847152761, + "learning_rate": 4.791953678900218e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.8843219876289368, + "step": 4354 + }, + { + "epoch": 2.1775, + "grad_norm": 4.153446194106739, + "learning_rate": 4.791779377579051e-06, + "loss": 0.258, + "mean_token_accuracy": 0.9177070260047913, + "step": 4355 + }, + { + "epoch": 2.178, + "grad_norm": 6.966074514248129, + "learning_rate": 4.791605006446291e-06, + "loss": 0.2976, + "mean_token_accuracy": 0.9043161869049072, + "step": 4356 + }, + { + "epoch": 2.1785, + "grad_norm": 3.578719987254733, + "learning_rate": 4.791430565507251e-06, + "loss": 0.2329, + "mean_token_accuracy": 0.9220709204673767, + "step": 4357 + }, + { + "epoch": 2.179, + "grad_norm": 2.4711760131130105, + "learning_rate": 4.791256054767245e-06, + "loss": 0.2082, + "mean_token_accuracy": 0.9287330508232117, + "step": 4358 + }, + { + "epoch": 2.1795, + "grad_norm": 2.3920360064296005, + "learning_rate": 4.791081474231589e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.905281662940979, + "step": 4359 + }, + { + "epoch": 2.18, + "grad_norm": 3.631657179731549, + "learning_rate": 4.790906823905599e-06, + "loss": 0.247, + "mean_token_accuracy": 0.9130848050117493, + "step": 4360 + }, + { + "epoch": 2.1805, + "grad_norm": 3.1497443766958466, + "learning_rate": 4.790732103794597e-06, + "loss": 0.2204, + "mean_token_accuracy": 0.9276682734489441, + "step": 4361 + }, + { + "epoch": 2.181, + "grad_norm": 7.06612480699216, + "learning_rate": 4.790557313903906e-06, + "loss": 0.2689, + "mean_token_accuracy": 0.9062403440475464, + "step": 4362 + }, + { + "epoch": 2.1814999999999998, + "grad_norm": 2.4299667338632043, + "learning_rate": 4.790382454238849e-06, + "loss": 0.2822, + "mean_token_accuracy": 0.9060423374176025, + "step": 4363 + }, + { + "epoch": 2.182, + "grad_norm": 2.0850462993646666, + "learning_rate": 4.790207524804752e-06, + "loss": 0.2842, + "mean_token_accuracy": 0.9023754596710205, + "step": 4364 + }, + { + "epoch": 2.1825, + "grad_norm": 9.9630710578668, + "learning_rate": 4.790032525606945e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.894037127494812, + "step": 4365 + }, + { + "epoch": 2.183, + "grad_norm": 2.666062043067682, + "learning_rate": 4.789857456650758e-06, + "loss": 0.3015, + "mean_token_accuracy": 0.8938009738922119, + "step": 4366 + }, + { + "epoch": 2.1835, + "grad_norm": 4.431730617250396, + "learning_rate": 4.789682317941524e-06, + "loss": 0.234, + "mean_token_accuracy": 0.9170858263969421, + "step": 4367 + }, + { + "epoch": 2.184, + "grad_norm": 5.375085939629881, + "learning_rate": 4.789507109484579e-06, + "loss": 0.3263, + "mean_token_accuracy": 0.9007977247238159, + "step": 4368 + }, + { + "epoch": 2.1845, + "grad_norm": 2.0547210941129554, + "learning_rate": 4.789331831285259e-06, + "loss": 0.1914, + "mean_token_accuracy": 0.9321415424346924, + "step": 4369 + }, + { + "epoch": 2.185, + "grad_norm": 19.639053713241903, + "learning_rate": 4.7891564833489035e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8965517282485962, + "step": 4370 + }, + { + "epoch": 2.1855, + "grad_norm": 3.034063748472858, + "learning_rate": 4.788981065680853e-06, + "loss": 0.2439, + "mean_token_accuracy": 0.9162431955337524, + "step": 4371 + }, + { + "epoch": 2.186, + "grad_norm": 3.648147637708556, + "learning_rate": 4.788805578286454e-06, + "loss": 0.2843, + "mean_token_accuracy": 0.9011572003364563, + "step": 4372 + }, + { + "epoch": 2.1865, + "grad_norm": 94.9301726780682, + "learning_rate": 4.788630021171049e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8975468873977661, + "step": 4373 + }, + { + "epoch": 2.187, + "grad_norm": 2.579068063311352, + "learning_rate": 4.7884543943399875e-06, + "loss": 0.2682, + "mean_token_accuracy": 0.9074585437774658, + "step": 4374 + }, + { + "epoch": 2.1875, + "grad_norm": 10.305670341462621, + "learning_rate": 4.788278697798619e-06, + "loss": 0.2818, + "mean_token_accuracy": 0.9083486199378967, + "step": 4375 + }, + { + "epoch": 2.188, + "grad_norm": 2.7450258041533044, + "learning_rate": 4.788102931552294e-06, + "loss": 0.2626, + "mean_token_accuracy": 0.9100817441940308, + "step": 4376 + }, + { + "epoch": 2.1885, + "grad_norm": 2.4763005753316194, + "learning_rate": 4.78792709560637e-06, + "loss": 0.282, + "mean_token_accuracy": 0.9116421937942505, + "step": 4377 + }, + { + "epoch": 2.189, + "grad_norm": 1.7161414457493538, + "learning_rate": 4.7877511899662e-06, + "loss": 0.2769, + "mean_token_accuracy": 0.9042266607284546, + "step": 4378 + }, + { + "epoch": 2.1895, + "grad_norm": 1.9506117409590993, + "learning_rate": 4.787575214637144e-06, + "loss": 0.2728, + "mean_token_accuracy": 0.9081767201423645, + "step": 4379 + }, + { + "epoch": 2.19, + "grad_norm": 3.7667036108949365, + "learning_rate": 4.787399169624562e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8968798518180847, + "step": 4380 + }, + { + "epoch": 2.1905, + "grad_norm": 5.361592836338351, + "learning_rate": 4.787223054933818e-06, + "loss": 0.2381, + "mean_token_accuracy": 0.9176321029663086, + "step": 4381 + }, + { + "epoch": 2.191, + "grad_norm": 4.6181385938966475, + "learning_rate": 4.787046870570274e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8863834738731384, + "step": 4382 + }, + { + "epoch": 2.1915, + "grad_norm": 4.62535910246022, + "learning_rate": 4.7868706165393e-06, + "loss": 0.2951, + "mean_token_accuracy": 0.9020053148269653, + "step": 4383 + }, + { + "epoch": 2.192, + "grad_norm": 2.060527508461016, + "learning_rate": 4.7866942928462625e-06, + "loss": 0.2906, + "mean_token_accuracy": 0.9006015062332153, + "step": 4384 + }, + { + "epoch": 2.1925, + "grad_norm": 2.4511889615082665, + "learning_rate": 4.786517899496535e-06, + "loss": 0.2328, + "mean_token_accuracy": 0.9253246784210205, + "step": 4385 + }, + { + "epoch": 2.193, + "grad_norm": 1.5582419695982177, + "learning_rate": 4.786341436495487e-06, + "loss": 0.1799, + "mean_token_accuracy": 0.9370328783988953, + "step": 4386 + }, + { + "epoch": 2.1935000000000002, + "grad_norm": 1.8802177330905794, + "learning_rate": 4.786164903848498e-06, + "loss": 0.2392, + "mean_token_accuracy": 0.9177162051200867, + "step": 4387 + }, + { + "epoch": 2.194, + "grad_norm": 3.50672681826855, + "learning_rate": 4.785988301560944e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8630281686782837, + "step": 4388 + }, + { + "epoch": 2.1945, + "grad_norm": 2.0584409301850486, + "learning_rate": 4.785811629638204e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.8888888955116272, + "step": 4389 + }, + { + "epoch": 2.195, + "grad_norm": 4.422337786214197, + "learning_rate": 4.7856348880856595e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8916000723838806, + "step": 4390 + }, + { + "epoch": 2.1955, + "grad_norm": 3.3591642214083306, + "learning_rate": 4.785458076908695e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8902754783630371, + "step": 4391 + }, + { + "epoch": 2.196, + "grad_norm": 4.114394899988366, + "learning_rate": 4.7852811961126974e-06, + "loss": 0.2933, + "mean_token_accuracy": 0.9004648327827454, + "step": 4392 + }, + { + "epoch": 2.1965, + "grad_norm": 4.284931603453772, + "learning_rate": 4.785104245703054e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8956473469734192, + "step": 4393 + }, + { + "epoch": 2.197, + "grad_norm": 6.650959249193391, + "learning_rate": 4.784927225685153e-06, + "loss": 0.3251, + "mean_token_accuracy": 0.8943416476249695, + "step": 4394 + }, + { + "epoch": 2.1975, + "grad_norm": 3.716172797522991, + "learning_rate": 4.78475013606439e-06, + "loss": 0.2506, + "mean_token_accuracy": 0.9171618819236755, + "step": 4395 + }, + { + "epoch": 2.198, + "grad_norm": 2.1768316966071, + "learning_rate": 4.784572976846158e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8858821392059326, + "step": 4396 + }, + { + "epoch": 2.1985, + "grad_norm": 2.3643940570523005, + "learning_rate": 4.784395748035853e-06, + "loss": 0.2599, + "mean_token_accuracy": 0.9074746370315552, + "step": 4397 + }, + { + "epoch": 2.199, + "grad_norm": 4.578105319658103, + "learning_rate": 4.784218449638875e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.883659839630127, + "step": 4398 + }, + { + "epoch": 2.1995, + "grad_norm": 1.877021413722367, + "learning_rate": 4.7840410816606236e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.889535665512085, + "step": 4399 + }, + { + "epoch": 2.2, + "grad_norm": 3.207116393115922, + "learning_rate": 4.783863644106502e-06, + "loss": 0.2406, + "mean_token_accuracy": 0.9292517304420471, + "step": 4400 + }, + { + "epoch": 2.2005, + "grad_norm": 2.1526139608067485, + "learning_rate": 4.783686136981916e-06, + "loss": 0.2109, + "mean_token_accuracy": 0.9296079874038696, + "step": 4401 + }, + { + "epoch": 2.201, + "grad_norm": 2.1089953438355415, + "learning_rate": 4.783508560292273e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8806272745132446, + "step": 4402 + }, + { + "epoch": 2.2015000000000002, + "grad_norm": 3.01694904194881, + "learning_rate": 4.783330914042981e-06, + "loss": 0.2818, + "mean_token_accuracy": 0.9056360721588135, + "step": 4403 + }, + { + "epoch": 2.202, + "grad_norm": 2.7996096091933125, + "learning_rate": 4.783153198239452e-06, + "loss": 0.2718, + "mean_token_accuracy": 0.9104915857315063, + "step": 4404 + }, + { + "epoch": 2.2025, + "grad_norm": 2.922461177135255, + "learning_rate": 4.7829754128871e-06, + "loss": 0.3135, + "mean_token_accuracy": 0.892291247844696, + "step": 4405 + }, + { + "epoch": 2.203, + "grad_norm": 8.547287800172288, + "learning_rate": 4.782797557991339e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.901780903339386, + "step": 4406 + }, + { + "epoch": 2.2035, + "grad_norm": 2.8218577883807194, + "learning_rate": 4.782619633557589e-06, + "loss": 0.2876, + "mean_token_accuracy": 0.9034273624420166, + "step": 4407 + }, + { + "epoch": 2.204, + "grad_norm": 4.417242936831356, + "learning_rate": 4.782441639591269e-06, + "loss": 0.2619, + "mean_token_accuracy": 0.9094412326812744, + "step": 4408 + }, + { + "epoch": 2.2045, + "grad_norm": 4.042105600558488, + "learning_rate": 4.7822635760978e-06, + "loss": 0.2803, + "mean_token_accuracy": 0.9087581038475037, + "step": 4409 + }, + { + "epoch": 2.205, + "grad_norm": 2.175531231730409, + "learning_rate": 4.782085443082607e-06, + "loss": 0.2673, + "mean_token_accuracy": 0.9020816087722778, + "step": 4410 + }, + { + "epoch": 2.2055, + "grad_norm": 2.4326065763350138, + "learning_rate": 4.781907240551117e-06, + "loss": 0.3304, + "mean_token_accuracy": 0.8812870979309082, + "step": 4411 + }, + { + "epoch": 2.206, + "grad_norm": 2.058499122037714, + "learning_rate": 4.781728968508757e-06, + "loss": 0.1783, + "mean_token_accuracy": 0.939513623714447, + "step": 4412 + }, + { + "epoch": 2.2065, + "grad_norm": 3.402965198404079, + "learning_rate": 4.781550626960959e-06, + "loss": 0.2404, + "mean_token_accuracy": 0.920883297920227, + "step": 4413 + }, + { + "epoch": 2.207, + "grad_norm": 2.810268395705563, + "learning_rate": 4.781372215913153e-06, + "loss": 0.2577, + "mean_token_accuracy": 0.9120569825172424, + "step": 4414 + }, + { + "epoch": 2.2075, + "grad_norm": 5.280989539002866, + "learning_rate": 4.7811937353707776e-06, + "loss": 0.2317, + "mean_token_accuracy": 0.9197572469711304, + "step": 4415 + }, + { + "epoch": 2.208, + "grad_norm": 3.376677717790428, + "learning_rate": 4.781015185339266e-06, + "loss": 0.2419, + "mean_token_accuracy": 0.925674557685852, + "step": 4416 + }, + { + "epoch": 2.2085, + "grad_norm": 7.546098582795244, + "learning_rate": 4.7808365658240585e-06, + "loss": 0.2544, + "mean_token_accuracy": 0.9096440076828003, + "step": 4417 + }, + { + "epoch": 2.209, + "grad_norm": 2.8902675428640254, + "learning_rate": 4.780657876830597e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8822969794273376, + "step": 4418 + }, + { + "epoch": 2.2095, + "grad_norm": 2.778772922064002, + "learning_rate": 4.7804791183643225e-06, + "loss": 0.206, + "mean_token_accuracy": 0.9335890412330627, + "step": 4419 + }, + { + "epoch": 2.21, + "grad_norm": 2.7460818760333066, + "learning_rate": 4.780300290430683e-06, + "loss": 0.2038, + "mean_token_accuracy": 0.9306260347366333, + "step": 4420 + }, + { + "epoch": 2.2105, + "grad_norm": 2.3305711644301685, + "learning_rate": 4.780121393035124e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8732455372810364, + "step": 4421 + }, + { + "epoch": 2.211, + "grad_norm": 3.0291864462098324, + "learning_rate": 4.779942426183096e-06, + "loss": 0.2661, + "mean_token_accuracy": 0.9093425869941711, + "step": 4422 + }, + { + "epoch": 2.2115, + "grad_norm": 2.049248085727488, + "learning_rate": 4.77976338988005e-06, + "loss": 0.2623, + "mean_token_accuracy": 0.9136439561843872, + "step": 4423 + }, + { + "epoch": 2.212, + "grad_norm": 2.180904026853396, + "learning_rate": 4.77958428413144e-06, + "loss": 0.2199, + "mean_token_accuracy": 0.9266841411590576, + "step": 4424 + }, + { + "epoch": 2.2125, + "grad_norm": 1.988586484271296, + "learning_rate": 4.779405108942722e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8803876638412476, + "step": 4425 + }, + { + "epoch": 2.213, + "grad_norm": 3.7224438018394843, + "learning_rate": 4.779225864319353e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8768151998519897, + "step": 4426 + }, + { + "epoch": 2.2135, + "grad_norm": 5.3992958679855345, + "learning_rate": 4.779046550266795e-06, + "loss": 0.2432, + "mean_token_accuracy": 0.9212707281112671, + "step": 4427 + }, + { + "epoch": 2.214, + "grad_norm": 3.6683671421139943, + "learning_rate": 4.778867166790509e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.894137978553772, + "step": 4428 + }, + { + "epoch": 2.2145, + "grad_norm": 9.118798581229028, + "learning_rate": 4.7786877138959596e-06, + "loss": 0.2358, + "mean_token_accuracy": 0.9210420846939087, + "step": 4429 + }, + { + "epoch": 2.215, + "grad_norm": 4.6940070753724035, + "learning_rate": 4.778508191588613e-06, + "loss": 0.299, + "mean_token_accuracy": 0.8968860507011414, + "step": 4430 + }, + { + "epoch": 2.2155, + "grad_norm": 2.6712832048887876, + "learning_rate": 4.778328599873939e-06, + "loss": 0.2366, + "mean_token_accuracy": 0.9161571860313416, + "step": 4431 + }, + { + "epoch": 2.216, + "grad_norm": 2.277414086913351, + "learning_rate": 4.778148938757406e-06, + "loss": 0.2781, + "mean_token_accuracy": 0.9095295667648315, + "step": 4432 + }, + { + "epoch": 2.2165, + "grad_norm": 2.7482353270706947, + "learning_rate": 4.777969208244488e-06, + "loss": 0.2979, + "mean_token_accuracy": 0.9026315808296204, + "step": 4433 + }, + { + "epoch": 2.217, + "grad_norm": 42.223071578733254, + "learning_rate": 4.7777894083406605e-06, + "loss": 0.2571, + "mean_token_accuracy": 0.9078278541564941, + "step": 4434 + }, + { + "epoch": 2.2175, + "grad_norm": 2.563644668346649, + "learning_rate": 4.7776095390514e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8755093812942505, + "step": 4435 + }, + { + "epoch": 2.218, + "grad_norm": 3.1835742568503567, + "learning_rate": 4.7774296003821855e-06, + "loss": 0.2642, + "mean_token_accuracy": 0.9119683504104614, + "step": 4436 + }, + { + "epoch": 2.2185, + "grad_norm": 3.2461705947815127, + "learning_rate": 4.777249592338497e-06, + "loss": 0.2681, + "mean_token_accuracy": 0.9078947305679321, + "step": 4437 + }, + { + "epoch": 2.219, + "grad_norm": 3.31684134870768, + "learning_rate": 4.77706951492582e-06, + "loss": 0.2372, + "mean_token_accuracy": 0.9204831123352051, + "step": 4438 + }, + { + "epoch": 2.2195, + "grad_norm": 2.109664046409008, + "learning_rate": 4.77688936814964e-06, + "loss": 0.1954, + "mean_token_accuracy": 0.93047696352005, + "step": 4439 + }, + { + "epoch": 2.22, + "grad_norm": 2.4489401293493067, + "learning_rate": 4.776709152015443e-06, + "loss": 0.2777, + "mean_token_accuracy": 0.9061239361763, + "step": 4440 + }, + { + "epoch": 2.2205, + "grad_norm": 2.8519188565576328, + "learning_rate": 4.77652886652872e-06, + "loss": 0.2614, + "mean_token_accuracy": 0.913121223449707, + "step": 4441 + }, + { + "epoch": 2.221, + "grad_norm": 2.7440042812078422, + "learning_rate": 4.7763485116949615e-06, + "loss": 0.2099, + "mean_token_accuracy": 0.9271092414855957, + "step": 4442 + }, + { + "epoch": 2.2215, + "grad_norm": 1.848154830339036, + "learning_rate": 4.776168087519662e-06, + "loss": 0.2493, + "mean_token_accuracy": 0.9098971486091614, + "step": 4443 + }, + { + "epoch": 2.222, + "grad_norm": 2.238366409598926, + "learning_rate": 4.775987594008319e-06, + "loss": 0.259, + "mean_token_accuracy": 0.9090243577957153, + "step": 4444 + }, + { + "epoch": 2.2225, + "grad_norm": 2.409123243345017, + "learning_rate": 4.775807031166428e-06, + "loss": 0.2992, + "mean_token_accuracy": 0.9074898362159729, + "step": 4445 + }, + { + "epoch": 2.223, + "grad_norm": 2.8320105994422113, + "learning_rate": 4.775626398999491e-06, + "loss": 0.3057, + "mean_token_accuracy": 0.895129919052124, + "step": 4446 + }, + { + "epoch": 2.2235, + "grad_norm": 3.8665834938424175, + "learning_rate": 4.775445697513011e-06, + "loss": 0.2285, + "mean_token_accuracy": 0.9191918969154358, + "step": 4447 + }, + { + "epoch": 2.224, + "grad_norm": 2.326095606314756, + "learning_rate": 4.775264926712489e-06, + "loss": 0.2825, + "mean_token_accuracy": 0.9094033241271973, + "step": 4448 + }, + { + "epoch": 2.2245, + "grad_norm": 4.481339533419299, + "learning_rate": 4.775084086603437e-06, + "loss": 0.2189, + "mean_token_accuracy": 0.9279080629348755, + "step": 4449 + }, + { + "epoch": 2.225, + "grad_norm": 7.1310510739327215, + "learning_rate": 4.774903177191358e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8959153890609741, + "step": 4450 + }, + { + "epoch": 2.2255, + "grad_norm": 4.092169167967511, + "learning_rate": 4.774722198481767e-06, + "loss": 0.3106, + "mean_token_accuracy": 0.9055435061454773, + "step": 4451 + }, + { + "epoch": 2.226, + "grad_norm": 5.433470318972799, + "learning_rate": 4.7745411504801755e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.9018559455871582, + "step": 4452 + }, + { + "epoch": 2.2265, + "grad_norm": 4.810941184615529, + "learning_rate": 4.774360033192098e-06, + "loss": 0.2479, + "mean_token_accuracy": 0.9091512560844421, + "step": 4453 + }, + { + "epoch": 2.227, + "grad_norm": 3.121439071267931, + "learning_rate": 4.774178846623053e-06, + "loss": 0.2478, + "mean_token_accuracy": 0.9178051948547363, + "step": 4454 + }, + { + "epoch": 2.2275, + "grad_norm": 2.5976606934622404, + "learning_rate": 4.773997590778558e-06, + "loss": 0.2342, + "mean_token_accuracy": 0.9247511029243469, + "step": 4455 + }, + { + "epoch": 2.228, + "grad_norm": 3.361228994257168, + "learning_rate": 4.7738162656641365e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8940866589546204, + "step": 4456 + }, + { + "epoch": 2.2285, + "grad_norm": 2.6126288619277784, + "learning_rate": 4.77363487128531e-06, + "loss": 0.2638, + "mean_token_accuracy": 0.9007267951965332, + "step": 4457 + }, + { + "epoch": 2.229, + "grad_norm": 3.900395844176329, + "learning_rate": 4.773453407647604e-06, + "loss": 0.2493, + "mean_token_accuracy": 0.9165550470352173, + "step": 4458 + }, + { + "epoch": 2.2295, + "grad_norm": 2.246912593572314, + "learning_rate": 4.773271874756549e-06, + "loss": 0.2993, + "mean_token_accuracy": 0.898744523525238, + "step": 4459 + }, + { + "epoch": 2.23, + "grad_norm": 2.554434967100162, + "learning_rate": 4.773090272617672e-06, + "loss": 0.2389, + "mean_token_accuracy": 0.9194329977035522, + "step": 4460 + }, + { + "epoch": 2.2305, + "grad_norm": 2.2258332280943063, + "learning_rate": 4.772908601236506e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8925564885139465, + "step": 4461 + }, + { + "epoch": 2.231, + "grad_norm": 5.936759118329423, + "learning_rate": 4.772726860618584e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8890970945358276, + "step": 4462 + }, + { + "epoch": 2.2315, + "grad_norm": 3.534879097162307, + "learning_rate": 4.772545050769444e-06, + "loss": 0.2589, + "mean_token_accuracy": 0.9113747477531433, + "step": 4463 + }, + { + "epoch": 2.232, + "grad_norm": 3.6174621905994004, + "learning_rate": 4.772363171694623e-06, + "loss": 0.2723, + "mean_token_accuracy": 0.9070777893066406, + "step": 4464 + }, + { + "epoch": 2.2325, + "grad_norm": 3.6528426623043884, + "learning_rate": 4.77218122339966e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8926891088485718, + "step": 4465 + }, + { + "epoch": 2.233, + "grad_norm": 3.4472853381464583, + "learning_rate": 4.771999205890101e-06, + "loss": 0.2645, + "mean_token_accuracy": 0.9157058000564575, + "step": 4466 + }, + { + "epoch": 2.2335, + "grad_norm": 2.242788742655324, + "learning_rate": 4.7718171191714875e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8929900527000427, + "step": 4467 + }, + { + "epoch": 2.234, + "grad_norm": 3.4725273457543575, + "learning_rate": 4.771634963249367e-06, + "loss": 0.271, + "mean_token_accuracy": 0.9053537249565125, + "step": 4468 + }, + { + "epoch": 2.2345, + "grad_norm": 3.2189456600168103, + "learning_rate": 4.77145273812929e-06, + "loss": 0.3, + "mean_token_accuracy": 0.8978012204170227, + "step": 4469 + }, + { + "epoch": 2.235, + "grad_norm": 2.950176414376203, + "learning_rate": 4.771270443816805e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8889986872673035, + "step": 4470 + }, + { + "epoch": 2.2355, + "grad_norm": 1.7996733949963992, + "learning_rate": 4.771088080317466e-06, + "loss": 0.229, + "mean_token_accuracy": 0.9145163893699646, + "step": 4471 + }, + { + "epoch": 2.2359999999999998, + "grad_norm": 2.3925285259534146, + "learning_rate": 4.770905647636828e-06, + "loss": 0.2981, + "mean_token_accuracy": 0.907636284828186, + "step": 4472 + }, + { + "epoch": 2.2365, + "grad_norm": 1.702122050270422, + "learning_rate": 4.770723145780448e-06, + "loss": 0.1923, + "mean_token_accuracy": 0.9294423460960388, + "step": 4473 + }, + { + "epoch": 2.237, + "grad_norm": 2.198542943111179, + "learning_rate": 4.770540574753887e-06, + "loss": 0.2258, + "mean_token_accuracy": 0.9195608496665955, + "step": 4474 + }, + { + "epoch": 2.2375, + "grad_norm": 2.3950428456714015, + "learning_rate": 4.770357934562704e-06, + "loss": 0.2066, + "mean_token_accuracy": 0.9278033971786499, + "step": 4475 + }, + { + "epoch": 2.238, + "grad_norm": 1.6899081223079397, + "learning_rate": 4.770175225212464e-06, + "loss": 0.1671, + "mean_token_accuracy": 0.9372414946556091, + "step": 4476 + }, + { + "epoch": 2.2385, + "grad_norm": 2.779655528299055, + "learning_rate": 4.769992446708731e-06, + "loss": 0.3087, + "mean_token_accuracy": 0.8964073657989502, + "step": 4477 + }, + { + "epoch": 2.239, + "grad_norm": 2.979770859509664, + "learning_rate": 4.769809599057075e-06, + "loss": 0.3158, + "mean_token_accuracy": 0.8945661187171936, + "step": 4478 + }, + { + "epoch": 2.2395, + "grad_norm": 3.2829139155715055, + "learning_rate": 4.769626682263065e-06, + "loss": 0.2344, + "mean_token_accuracy": 0.9069990515708923, + "step": 4479 + }, + { + "epoch": 2.24, + "grad_norm": 3.3556612935557557, + "learning_rate": 4.769443696332272e-06, + "loss": 0.267, + "mean_token_accuracy": 0.9122001528739929, + "step": 4480 + }, + { + "epoch": 2.2405, + "grad_norm": 3.5935573101085905, + "learning_rate": 4.769260641270271e-06, + "loss": 0.2418, + "mean_token_accuracy": 0.917180597782135, + "step": 4481 + }, + { + "epoch": 2.241, + "grad_norm": 2.390167620636821, + "learning_rate": 4.7690775170826385e-06, + "loss": 0.2587, + "mean_token_accuracy": 0.9130003452301025, + "step": 4482 + }, + { + "epoch": 2.2415, + "grad_norm": 2.287504959286789, + "learning_rate": 4.768894323774952e-06, + "loss": 0.2464, + "mean_token_accuracy": 0.9089134931564331, + "step": 4483 + }, + { + "epoch": 2.242, + "grad_norm": 1.5518201919415344, + "learning_rate": 4.768711061352793e-06, + "loss": 0.1959, + "mean_token_accuracy": 0.9224717617034912, + "step": 4484 + }, + { + "epoch": 2.2425, + "grad_norm": 14.808520854071038, + "learning_rate": 4.7685277298217425e-06, + "loss": 0.1907, + "mean_token_accuracy": 0.9285440444946289, + "step": 4485 + }, + { + "epoch": 2.243, + "grad_norm": 1.9183015914249566, + "learning_rate": 4.768344329187386e-06, + "loss": 0.2232, + "mean_token_accuracy": 0.9243929386138916, + "step": 4486 + }, + { + "epoch": 2.2435, + "grad_norm": 6.586715213715891, + "learning_rate": 4.76816085945531e-06, + "loss": 0.2453, + "mean_token_accuracy": 0.9170518517494202, + "step": 4487 + }, + { + "epoch": 2.2439999999999998, + "grad_norm": 3.9883590587702216, + "learning_rate": 4.767977320631103e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8729537725448608, + "step": 4488 + }, + { + "epoch": 2.2445, + "grad_norm": 2.8495657826170127, + "learning_rate": 4.767793712720356e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.9096420407295227, + "step": 4489 + }, + { + "epoch": 2.245, + "grad_norm": 1.740078320295811, + "learning_rate": 4.767610035728663e-06, + "loss": 0.2631, + "mean_token_accuracy": 0.9115734696388245, + "step": 4490 + }, + { + "epoch": 2.2455, + "grad_norm": 1.830541039558276, + "learning_rate": 4.767426289661618e-06, + "loss": 0.2429, + "mean_token_accuracy": 0.9155605435371399, + "step": 4491 + }, + { + "epoch": 2.246, + "grad_norm": 2.428765344403168, + "learning_rate": 4.767242474524818e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8957353234291077, + "step": 4492 + }, + { + "epoch": 2.2465, + "grad_norm": 2.17986460167108, + "learning_rate": 4.767058590323864e-06, + "loss": 0.2939, + "mean_token_accuracy": 0.8967936038970947, + "step": 4493 + }, + { + "epoch": 2.247, + "grad_norm": 1.9332379128864392, + "learning_rate": 4.766874637064356e-06, + "loss": 0.278, + "mean_token_accuracy": 0.9095141887664795, + "step": 4494 + }, + { + "epoch": 2.2475, + "grad_norm": 3.489527843934642, + "learning_rate": 4.766690614751897e-06, + "loss": 0.3022, + "mean_token_accuracy": 0.9109026789665222, + "step": 4495 + }, + { + "epoch": 2.248, + "grad_norm": 2.628024034987908, + "learning_rate": 4.766506523392095e-06, + "loss": 0.2664, + "mean_token_accuracy": 0.9121268391609192, + "step": 4496 + }, + { + "epoch": 2.2485, + "grad_norm": 4.334994155683597, + "learning_rate": 4.766322362990555e-06, + "loss": 0.2854, + "mean_token_accuracy": 0.9051942825317383, + "step": 4497 + }, + { + "epoch": 2.249, + "grad_norm": 2.1290008876645077, + "learning_rate": 4.766138133552889e-06, + "loss": 0.2678, + "mean_token_accuracy": 0.9118536114692688, + "step": 4498 + }, + { + "epoch": 2.2495, + "grad_norm": 4.383951640925387, + "learning_rate": 4.765953835084708e-06, + "loss": 0.2711, + "mean_token_accuracy": 0.9117646813392639, + "step": 4499 + }, + { + "epoch": 2.25, + "grad_norm": 2.3726783934652276, + "learning_rate": 4.765769467591626e-06, + "loss": 0.2277, + "mean_token_accuracy": 0.9199462532997131, + "step": 4500 + }, + { + "epoch": 2.2505, + "grad_norm": 2.1114433688098324, + "learning_rate": 4.765585031079259e-06, + "loss": 0.2902, + "mean_token_accuracy": 0.9050594568252563, + "step": 4501 + }, + { + "epoch": 2.251, + "grad_norm": 2.220211155907896, + "learning_rate": 4.7654005255532246e-06, + "loss": 0.2641, + "mean_token_accuracy": 0.9126584529876709, + "step": 4502 + }, + { + "epoch": 2.2515, + "grad_norm": 2.62990741192329, + "learning_rate": 4.765215951019145e-06, + "loss": 0.2649, + "mean_token_accuracy": 0.904583752155304, + "step": 4503 + }, + { + "epoch": 2.252, + "grad_norm": 2.927331395975913, + "learning_rate": 4.765031307482643e-06, + "loss": 0.2013, + "mean_token_accuracy": 0.9248961806297302, + "step": 4504 + }, + { + "epoch": 2.2525, + "grad_norm": 1.9182136196801762, + "learning_rate": 4.76484659494934e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.894506573677063, + "step": 4505 + }, + { + "epoch": 2.253, + "grad_norm": 6.654675832046426, + "learning_rate": 4.7646618134248655e-06, + "loss": 0.2852, + "mean_token_accuracy": 0.9049872159957886, + "step": 4506 + }, + { + "epoch": 2.2535, + "grad_norm": 2.0924352062534624, + "learning_rate": 4.764476962914847e-06, + "loss": 0.273, + "mean_token_accuracy": 0.9087384939193726, + "step": 4507 + }, + { + "epoch": 2.254, + "grad_norm": 3.119869170156388, + "learning_rate": 4.764292043424916e-06, + "loss": 0.2212, + "mean_token_accuracy": 0.9221177697181702, + "step": 4508 + }, + { + "epoch": 2.2545, + "grad_norm": 8.72617587430329, + "learning_rate": 4.764107054960705e-06, + "loss": 0.2397, + "mean_token_accuracy": 0.9130048751831055, + "step": 4509 + }, + { + "epoch": 2.255, + "grad_norm": 2.8137027875988916, + "learning_rate": 4.763921997527849e-06, + "loss": 0.2147, + "mean_token_accuracy": 0.9254515767097473, + "step": 4510 + }, + { + "epoch": 2.2555, + "grad_norm": 3.053985401953219, + "learning_rate": 4.763736871131987e-06, + "loss": 0.3248, + "mean_token_accuracy": 0.8880522847175598, + "step": 4511 + }, + { + "epoch": 2.2560000000000002, + "grad_norm": 1.5971405054034171, + "learning_rate": 4.763551675778755e-06, + "loss": 0.2124, + "mean_token_accuracy": 0.9219986796379089, + "step": 4512 + }, + { + "epoch": 2.2565, + "grad_norm": 2.5922950759341448, + "learning_rate": 4.763366411473797e-06, + "loss": 0.2913, + "mean_token_accuracy": 0.8999264240264893, + "step": 4513 + }, + { + "epoch": 2.257, + "grad_norm": 2.106911568560149, + "learning_rate": 4.763181078222754e-06, + "loss": 0.2424, + "mean_token_accuracy": 0.9169246554374695, + "step": 4514 + }, + { + "epoch": 2.2575, + "grad_norm": 6.1373787691582535, + "learning_rate": 4.762995676031275e-06, + "loss": 0.2437, + "mean_token_accuracy": 0.9097617268562317, + "step": 4515 + }, + { + "epoch": 2.258, + "grad_norm": 3.7195594610268863, + "learning_rate": 4.7628102049050044e-06, + "loss": 0.2742, + "mean_token_accuracy": 0.9060561656951904, + "step": 4516 + }, + { + "epoch": 2.2585, + "grad_norm": 2.8613733363988767, + "learning_rate": 4.762624664849594e-06, + "loss": 0.2851, + "mean_token_accuracy": 0.9007430076599121, + "step": 4517 + }, + { + "epoch": 2.259, + "grad_norm": 1.6624520819986897, + "learning_rate": 4.762439055870694e-06, + "loss": 0.2313, + "mean_token_accuracy": 0.9172173142433167, + "step": 4518 + }, + { + "epoch": 2.2595, + "grad_norm": 2.404493942265702, + "learning_rate": 4.762253377973961e-06, + "loss": 0.2844, + "mean_token_accuracy": 0.8975685238838196, + "step": 4519 + }, + { + "epoch": 2.26, + "grad_norm": 1.841396782648452, + "learning_rate": 4.762067631165049e-06, + "loss": 0.2092, + "mean_token_accuracy": 0.9232326745986938, + "step": 4520 + }, + { + "epoch": 2.2605, + "grad_norm": 2.695199878291432, + "learning_rate": 4.761881815449617e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8866454362869263, + "step": 4521 + }, + { + "epoch": 2.261, + "grad_norm": 2.2745936484653257, + "learning_rate": 4.7616959308333245e-06, + "loss": 0.2739, + "mean_token_accuracy": 0.9075907468795776, + "step": 4522 + }, + { + "epoch": 2.2615, + "grad_norm": 2.5030501581090667, + "learning_rate": 4.7615099773218346e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8684788346290588, + "step": 4523 + }, + { + "epoch": 2.262, + "grad_norm": 1.9429340357970548, + "learning_rate": 4.76132395492081e-06, + "loss": 0.2128, + "mean_token_accuracy": 0.9238792061805725, + "step": 4524 + }, + { + "epoch": 2.2625, + "grad_norm": 2.823041862139522, + "learning_rate": 4.761137863635921e-06, + "loss": 0.289, + "mean_token_accuracy": 0.9099661111831665, + "step": 4525 + }, + { + "epoch": 2.263, + "grad_norm": 2.57068607853179, + "learning_rate": 4.760951703472833e-06, + "loss": 0.2152, + "mean_token_accuracy": 0.9252769351005554, + "step": 4526 + }, + { + "epoch": 2.2635, + "grad_norm": 10.496778666963715, + "learning_rate": 4.7607654744372165e-06, + "loss": 0.2091, + "mean_token_accuracy": 0.9270498156547546, + "step": 4527 + }, + { + "epoch": 2.2640000000000002, + "grad_norm": 3.9036337213370893, + "learning_rate": 4.760579176534747e-06, + "loss": 0.3254, + "mean_token_accuracy": 0.8929755091667175, + "step": 4528 + }, + { + "epoch": 2.2645, + "grad_norm": 1.5848290592627594, + "learning_rate": 4.760392809771098e-06, + "loss": 0.2195, + "mean_token_accuracy": 0.9200994968414307, + "step": 4529 + }, + { + "epoch": 2.265, + "grad_norm": 1.811770763116669, + "learning_rate": 4.760206374151947e-06, + "loss": 0.2408, + "mean_token_accuracy": 0.9160478115081787, + "step": 4530 + }, + { + "epoch": 2.2655, + "grad_norm": 7.233420816625489, + "learning_rate": 4.760019869682971e-06, + "loss": 0.2391, + "mean_token_accuracy": 0.9213828444480896, + "step": 4531 + }, + { + "epoch": 2.266, + "grad_norm": 2.1808298687109007, + "learning_rate": 4.759833296369855e-06, + "loss": 0.2901, + "mean_token_accuracy": 0.9025687575340271, + "step": 4532 + }, + { + "epoch": 2.2665, + "grad_norm": 9.04723529054749, + "learning_rate": 4.75964665421828e-06, + "loss": 0.2232, + "mean_token_accuracy": 0.922420084476471, + "step": 4533 + }, + { + "epoch": 2.267, + "grad_norm": 5.4008462283357215, + "learning_rate": 4.75945994323393e-06, + "loss": 0.2258, + "mean_token_accuracy": 0.924880862236023, + "step": 4534 + }, + { + "epoch": 2.2675, + "grad_norm": 1.706751955142766, + "learning_rate": 4.759273163422496e-06, + "loss": 0.2048, + "mean_token_accuracy": 0.9222871661186218, + "step": 4535 + }, + { + "epoch": 2.268, + "grad_norm": 2.1737380819039003, + "learning_rate": 4.759086314789667e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8925367593765259, + "step": 4536 + }, + { + "epoch": 2.2685, + "grad_norm": 3.1724694057224156, + "learning_rate": 4.758899397341132e-06, + "loss": 0.279, + "mean_token_accuracy": 0.9134538173675537, + "step": 4537 + }, + { + "epoch": 2.269, + "grad_norm": 4.88311272815055, + "learning_rate": 4.7587124110825875e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8982856869697571, + "step": 4538 + }, + { + "epoch": 2.2695, + "grad_norm": 2.362132196672693, + "learning_rate": 4.758525356019728e-06, + "loss": 0.2785, + "mean_token_accuracy": 0.913673460483551, + "step": 4539 + }, + { + "epoch": 2.27, + "grad_norm": 2.955213586113402, + "learning_rate": 4.7583382321582525e-06, + "loss": 0.2715, + "mean_token_accuracy": 0.9081533551216125, + "step": 4540 + }, + { + "epoch": 2.2705, + "grad_norm": 2.9600619568732687, + "learning_rate": 4.75815103950386e-06, + "loss": 0.2887, + "mean_token_accuracy": 0.901914656162262, + "step": 4541 + }, + { + "epoch": 2.271, + "grad_norm": 2.517398519632027, + "learning_rate": 4.757963778062254e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8818246126174927, + "step": 4542 + }, + { + "epoch": 2.2715, + "grad_norm": 2.1657636148806287, + "learning_rate": 4.757776447839138e-06, + "loss": 0.2602, + "mean_token_accuracy": 0.9104170799255371, + "step": 4543 + }, + { + "epoch": 2.2720000000000002, + "grad_norm": 2.3204570111419986, + "learning_rate": 4.757589048840219e-06, + "loss": 0.2885, + "mean_token_accuracy": 0.899983286857605, + "step": 4544 + }, + { + "epoch": 2.2725, + "grad_norm": 3.61770313226323, + "learning_rate": 4.757401581071203e-06, + "loss": 0.2888, + "mean_token_accuracy": 0.9000338315963745, + "step": 4545 + }, + { + "epoch": 2.273, + "grad_norm": 3.5256207342941117, + "learning_rate": 4.7572140445378054e-06, + "loss": 0.2893, + "mean_token_accuracy": 0.903936505317688, + "step": 4546 + }, + { + "epoch": 2.2735, + "grad_norm": 3.5921334491902495, + "learning_rate": 4.757026439245735e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8730186223983765, + "step": 4547 + }, + { + "epoch": 2.274, + "grad_norm": 2.0478425548625196, + "learning_rate": 4.756838765200708e-06, + "loss": 0.2083, + "mean_token_accuracy": 0.9282556772232056, + "step": 4548 + }, + { + "epoch": 2.2745, + "grad_norm": 2.394444501595011, + "learning_rate": 4.75665102240844e-06, + "loss": 0.2847, + "mean_token_accuracy": 0.9086844325065613, + "step": 4549 + }, + { + "epoch": 2.275, + "grad_norm": 2.7358679610617926, + "learning_rate": 4.7564632108746524e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8874538540840149, + "step": 4550 + }, + { + "epoch": 2.2755, + "grad_norm": 2.8575841168432383, + "learning_rate": 4.756275330605063e-06, + "loss": 0.2733, + "mean_token_accuracy": 0.9213821887969971, + "step": 4551 + }, + { + "epoch": 2.276, + "grad_norm": 1.8665927979178627, + "learning_rate": 4.756087381605399e-06, + "loss": 0.2441, + "mean_token_accuracy": 0.9184756875038147, + "step": 4552 + }, + { + "epoch": 2.2765, + "grad_norm": 2.129349758813876, + "learning_rate": 4.755899363881382e-06, + "loss": 0.2416, + "mean_token_accuracy": 0.9179062247276306, + "step": 4553 + }, + { + "epoch": 2.277, + "grad_norm": 2.152985730656494, + "learning_rate": 4.755711277438741e-06, + "loss": 0.302, + "mean_token_accuracy": 0.9086419939994812, + "step": 4554 + }, + { + "epoch": 2.2775, + "grad_norm": 2.6370022581594554, + "learning_rate": 4.755523122283206e-06, + "loss": 0.2738, + "mean_token_accuracy": 0.9118677973747253, + "step": 4555 + }, + { + "epoch": 2.278, + "grad_norm": 2.257807105269972, + "learning_rate": 4.755334898420507e-06, + "loss": 0.2649, + "mean_token_accuracy": 0.9128467440605164, + "step": 4556 + }, + { + "epoch": 2.2785, + "grad_norm": 2.760875739174401, + "learning_rate": 4.755146605856379e-06, + "loss": 0.2911, + "mean_token_accuracy": 0.8970540165901184, + "step": 4557 + }, + { + "epoch": 2.279, + "grad_norm": 2.1395557316243887, + "learning_rate": 4.754958244596557e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8856777548789978, + "step": 4558 + }, + { + "epoch": 2.2795, + "grad_norm": 2.662648804482001, + "learning_rate": 4.754769814646779e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8373016119003296, + "step": 4559 + }, + { + "epoch": 2.2800000000000002, + "grad_norm": 2.2852426140039057, + "learning_rate": 4.754581316012785e-06, + "loss": 0.2647, + "mean_token_accuracy": 0.9074357748031616, + "step": 4560 + }, + { + "epoch": 2.2805, + "grad_norm": 2.1487915334397405, + "learning_rate": 4.754392748700316e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8878255486488342, + "step": 4561 + }, + { + "epoch": 2.281, + "grad_norm": 1.8465553908967614, + "learning_rate": 4.754204112715118e-06, + "loss": 0.2144, + "mean_token_accuracy": 0.9180948734283447, + "step": 4562 + }, + { + "epoch": 2.2815, + "grad_norm": 2.2316146689964165, + "learning_rate": 4.754015408062935e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8859792947769165, + "step": 4563 + }, + { + "epoch": 2.282, + "grad_norm": 2.9605871182232413, + "learning_rate": 4.753826634749517e-06, + "loss": 0.2548, + "mean_token_accuracy": 0.9127187132835388, + "step": 4564 + }, + { + "epoch": 2.2824999999999998, + "grad_norm": 37.554158818366915, + "learning_rate": 4.753637792780614e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8988001942634583, + "step": 4565 + }, + { + "epoch": 2.283, + "grad_norm": 3.0628656744269493, + "learning_rate": 4.753448882161978e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.9006211161613464, + "step": 4566 + }, + { + "epoch": 2.2835, + "grad_norm": 9.85618592607748, + "learning_rate": 4.753259902899364e-06, + "loss": 0.2292, + "mean_token_accuracy": 0.9109026789665222, + "step": 4567 + }, + { + "epoch": 2.284, + "grad_norm": 5.40697858903187, + "learning_rate": 4.753070854998529e-06, + "loss": 0.2481, + "mean_token_accuracy": 0.9169208407402039, + "step": 4568 + }, + { + "epoch": 2.2845, + "grad_norm": 2.4855771143865155, + "learning_rate": 4.752881738465231e-06, + "loss": 0.2931, + "mean_token_accuracy": 0.9071676135063171, + "step": 4569 + }, + { + "epoch": 2.285, + "grad_norm": 2.2160704219207936, + "learning_rate": 4.752692553305229e-06, + "loss": 0.2485, + "mean_token_accuracy": 0.9193413853645325, + "step": 4570 + }, + { + "epoch": 2.2855, + "grad_norm": 3.034257500072699, + "learning_rate": 4.752503299524289e-06, + "loss": 0.2996, + "mean_token_accuracy": 0.8939444422721863, + "step": 4571 + }, + { + "epoch": 2.286, + "grad_norm": 2.3607942551273267, + "learning_rate": 4.752313977128176e-06, + "loss": 0.2233, + "mean_token_accuracy": 0.9197195172309875, + "step": 4572 + }, + { + "epoch": 2.2865, + "grad_norm": 3.256687872411774, + "learning_rate": 4.7521245861226544e-06, + "loss": 0.319, + "mean_token_accuracy": 0.89089035987854, + "step": 4573 + }, + { + "epoch": 2.287, + "grad_norm": 3.1844419322034954, + "learning_rate": 4.751935126513496e-06, + "loss": 0.3093, + "mean_token_accuracy": 0.8951411843299866, + "step": 4574 + }, + { + "epoch": 2.2875, + "grad_norm": 2.2402509505791905, + "learning_rate": 4.7517455983064694e-06, + "loss": 0.2469, + "mean_token_accuracy": 0.9143392443656921, + "step": 4575 + }, + { + "epoch": 2.288, + "grad_norm": 1.9635726736219283, + "learning_rate": 4.751556001507351e-06, + "loss": 0.224, + "mean_token_accuracy": 0.9160283207893372, + "step": 4576 + }, + { + "epoch": 2.2885, + "grad_norm": 7.353419367475332, + "learning_rate": 4.751366336121915e-06, + "loss": 0.2808, + "mean_token_accuracy": 0.9072502851486206, + "step": 4577 + }, + { + "epoch": 2.289, + "grad_norm": 5.464537971701372, + "learning_rate": 4.751176602155938e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8946791291236877, + "step": 4578 + }, + { + "epoch": 2.2895, + "grad_norm": 2.7918109550376697, + "learning_rate": 4.7509867996152e-06, + "loss": 0.2466, + "mean_token_accuracy": 0.9177649617195129, + "step": 4579 + }, + { + "epoch": 2.29, + "grad_norm": 8.091930910903654, + "learning_rate": 4.750796928505484e-06, + "loss": 0.3008, + "mean_token_accuracy": 0.903479814529419, + "step": 4580 + }, + { + "epoch": 2.2904999999999998, + "grad_norm": 2.3754698564149868, + "learning_rate": 4.750606988832573e-06, + "loss": 0.2779, + "mean_token_accuracy": 0.9073160886764526, + "step": 4581 + }, + { + "epoch": 2.291, + "grad_norm": 1.7880514208090503, + "learning_rate": 4.750416980602252e-06, + "loss": 0.2479, + "mean_token_accuracy": 0.9139266610145569, + "step": 4582 + }, + { + "epoch": 2.2915, + "grad_norm": 1.9656796562378331, + "learning_rate": 4.7502269038203105e-06, + "loss": 0.2072, + "mean_token_accuracy": 0.9266971945762634, + "step": 4583 + }, + { + "epoch": 2.292, + "grad_norm": 7.654482719551731, + "learning_rate": 4.750036758492537e-06, + "loss": 0.2292, + "mean_token_accuracy": 0.920209527015686, + "step": 4584 + }, + { + "epoch": 2.2925, + "grad_norm": 4.544993385602405, + "learning_rate": 4.749846544624725e-06, + "loss": 0.2963, + "mean_token_accuracy": 0.9035841822624207, + "step": 4585 + }, + { + "epoch": 2.293, + "grad_norm": 2.3639413318960956, + "learning_rate": 4.749656262222668e-06, + "loss": 0.2523, + "mean_token_accuracy": 0.9098086357116699, + "step": 4586 + }, + { + "epoch": 2.2935, + "grad_norm": 3.0821511340364993, + "learning_rate": 4.7494659112921625e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.882893443107605, + "step": 4587 + }, + { + "epoch": 2.294, + "grad_norm": 5.854935736290555, + "learning_rate": 4.749275491839008e-06, + "loss": 0.2799, + "mean_token_accuracy": 0.904203474521637, + "step": 4588 + }, + { + "epoch": 2.2945, + "grad_norm": 2.6752391907729947, + "learning_rate": 4.749085003869003e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8913043737411499, + "step": 4589 + }, + { + "epoch": 2.295, + "grad_norm": 2.97990870599194, + "learning_rate": 4.7488944473879515e-06, + "loss": 0.2782, + "mean_token_accuracy": 0.9058670401573181, + "step": 4590 + }, + { + "epoch": 2.2955, + "grad_norm": 7.769672779367426, + "learning_rate": 4.7487038224016576e-06, + "loss": 0.2288, + "mean_token_accuracy": 0.919914186000824, + "step": 4591 + }, + { + "epoch": 2.296, + "grad_norm": 4.123186328778449, + "learning_rate": 4.748513128915928e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8921270370483398, + "step": 4592 + }, + { + "epoch": 2.2965, + "grad_norm": 2.98930274385469, + "learning_rate": 4.748322366936572e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.895099937915802, + "step": 4593 + }, + { + "epoch": 2.297, + "grad_norm": 4.82442751732783, + "learning_rate": 4.748131536469401e-06, + "loss": 0.2341, + "mean_token_accuracy": 0.9201160669326782, + "step": 4594 + }, + { + "epoch": 2.2975, + "grad_norm": 10.41497889599715, + "learning_rate": 4.747940637520226e-06, + "loss": 0.2578, + "mean_token_accuracy": 0.9126983880996704, + "step": 4595 + }, + { + "epoch": 2.298, + "grad_norm": 4.186368206955557, + "learning_rate": 4.7477496700948646e-06, + "loss": 0.2668, + "mean_token_accuracy": 0.9098629355430603, + "step": 4596 + }, + { + "epoch": 2.2984999999999998, + "grad_norm": 2.394295690605878, + "learning_rate": 4.747558634199133e-06, + "loss": 0.2897, + "mean_token_accuracy": 0.8995653390884399, + "step": 4597 + }, + { + "epoch": 2.299, + "grad_norm": 3.3282919302102805, + "learning_rate": 4.74736752983885e-06, + "loss": 0.2239, + "mean_token_accuracy": 0.925524115562439, + "step": 4598 + }, + { + "epoch": 2.2995, + "grad_norm": 2.090792174046566, + "learning_rate": 4.747176357019837e-06, + "loss": 0.2438, + "mean_token_accuracy": 0.9173228144645691, + "step": 4599 + }, + { + "epoch": 2.3, + "grad_norm": 2.351484625956361, + "learning_rate": 4.746985115747918e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.9016504883766174, + "step": 4600 + }, + { + "epoch": 2.3005, + "grad_norm": 5.165529887778087, + "learning_rate": 4.746793806028918e-06, + "loss": 0.21, + "mean_token_accuracy": 0.9216912984848022, + "step": 4601 + }, + { + "epoch": 2.301, + "grad_norm": 4.660761267255897, + "learning_rate": 4.746602427868666e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.9004712700843811, + "step": 4602 + }, + { + "epoch": 2.3015, + "grad_norm": 2.906246674420167, + "learning_rate": 4.746410981272989e-06, + "loss": 0.2925, + "mean_token_accuracy": 0.9090098142623901, + "step": 4603 + }, + { + "epoch": 2.302, + "grad_norm": 2.953499997813974, + "learning_rate": 4.746219466247722e-06, + "loss": 0.2287, + "mean_token_accuracy": 0.9211631417274475, + "step": 4604 + }, + { + "epoch": 2.3025, + "grad_norm": 5.630483962373691, + "learning_rate": 4.746027882798697e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8975950479507446, + "step": 4605 + }, + { + "epoch": 2.303, + "grad_norm": 2.2458835684193126, + "learning_rate": 4.7458362309317505e-06, + "loss": 0.2525, + "mean_token_accuracy": 0.9146615266799927, + "step": 4606 + }, + { + "epoch": 2.3035, + "grad_norm": 3.4517711028003024, + "learning_rate": 4.745644510652721e-06, + "loss": 0.2569, + "mean_token_accuracy": 0.9094849228858948, + "step": 4607 + }, + { + "epoch": 2.304, + "grad_norm": 3.115240377413523, + "learning_rate": 4.745452721967446e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8822898864746094, + "step": 4608 + }, + { + "epoch": 2.3045, + "grad_norm": 3.1236245099464406, + "learning_rate": 4.745260864881772e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.899914026260376, + "step": 4609 + }, + { + "epoch": 2.305, + "grad_norm": 3.5737291670149456, + "learning_rate": 4.745068939401539e-06, + "loss": 0.2052, + "mean_token_accuracy": 0.9282011985778809, + "step": 4610 + }, + { + "epoch": 2.3055, + "grad_norm": 2.3655767125150344, + "learning_rate": 4.744876945532597e-06, + "loss": 0.2943, + "mean_token_accuracy": 0.9011725187301636, + "step": 4611 + }, + { + "epoch": 2.306, + "grad_norm": 2.000243562373279, + "learning_rate": 4.744684883280792e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8841400742530823, + "step": 4612 + }, + { + "epoch": 2.3064999999999998, + "grad_norm": 2.5591196895739077, + "learning_rate": 4.744492752651976e-06, + "loss": 0.2483, + "mean_token_accuracy": 0.9153863191604614, + "step": 4613 + }, + { + "epoch": 2.307, + "grad_norm": 3.7450590047011105, + "learning_rate": 4.7443005536520005e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.9004614353179932, + "step": 4614 + }, + { + "epoch": 2.3075, + "grad_norm": 2.767356535964485, + "learning_rate": 4.744108286286721e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.9053602814674377, + "step": 4615 + }, + { + "epoch": 2.308, + "grad_norm": 4.94946767789537, + "learning_rate": 4.7439159505619946e-06, + "loss": 0.2609, + "mean_token_accuracy": 0.9093406796455383, + "step": 4616 + }, + { + "epoch": 2.3085, + "grad_norm": 3.0129344718364486, + "learning_rate": 4.743723546483679e-06, + "loss": 0.2303, + "mean_token_accuracy": 0.9180730581283569, + "step": 4617 + }, + { + "epoch": 2.309, + "grad_norm": 2.136506192102015, + "learning_rate": 4.743531074057636e-06, + "loss": 0.2517, + "mean_token_accuracy": 0.9082075357437134, + "step": 4618 + }, + { + "epoch": 2.3095, + "grad_norm": 2.480052451178647, + "learning_rate": 4.743338533289728e-06, + "loss": 0.2828, + "mean_token_accuracy": 0.9095380306243896, + "step": 4619 + }, + { + "epoch": 2.31, + "grad_norm": 5.267963720507131, + "learning_rate": 4.743145924185821e-06, + "loss": 0.305, + "mean_token_accuracy": 0.8995037078857422, + "step": 4620 + }, + { + "epoch": 2.3105, + "grad_norm": 2.866051873148115, + "learning_rate": 4.7429532467517826e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.879737138748169, + "step": 4621 + }, + { + "epoch": 2.311, + "grad_norm": 2.870612165806726, + "learning_rate": 4.742760500993481e-06, + "loss": 0.3235, + "mean_token_accuracy": 0.8972477316856384, + "step": 4622 + }, + { + "epoch": 2.3115, + "grad_norm": 2.292359320642031, + "learning_rate": 4.742567686916787e-06, + "loss": 0.2642, + "mean_token_accuracy": 0.903743326663971, + "step": 4623 + }, + { + "epoch": 2.312, + "grad_norm": 2.592223882055396, + "learning_rate": 4.7423748045275755e-06, + "loss": 0.2607, + "mean_token_accuracy": 0.9084096550941467, + "step": 4624 + }, + { + "epoch": 2.3125, + "grad_norm": 4.39087706641133, + "learning_rate": 4.742181853831721e-06, + "loss": 0.3097, + "mean_token_accuracy": 0.8984771370887756, + "step": 4625 + }, + { + "epoch": 2.313, + "grad_norm": 2.3220070170792697, + "learning_rate": 4.741988834835102e-06, + "loss": 0.2695, + "mean_token_accuracy": 0.9056807160377502, + "step": 4626 + }, + { + "epoch": 2.3135, + "grad_norm": 2.1715406612707793, + "learning_rate": 4.741795747543598e-06, + "loss": 0.2455, + "mean_token_accuracy": 0.9068061113357544, + "step": 4627 + }, + { + "epoch": 2.314, + "grad_norm": 1.964078698617632, + "learning_rate": 4.74160259196309e-06, + "loss": 0.2333, + "mean_token_accuracy": 0.9202728867530823, + "step": 4628 + }, + { + "epoch": 2.3145, + "grad_norm": 4.370138848374431, + "learning_rate": 4.741409368099463e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8946192264556885, + "step": 4629 + }, + { + "epoch": 2.315, + "grad_norm": 4.773734554067398, + "learning_rate": 4.741216075958602e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8903588652610779, + "step": 4630 + }, + { + "epoch": 2.3155, + "grad_norm": 2.6185514601921347, + "learning_rate": 4.741022715546395e-06, + "loss": 0.2733, + "mean_token_accuracy": 0.9209983348846436, + "step": 4631 + }, + { + "epoch": 2.316, + "grad_norm": 4.275575709008292, + "learning_rate": 4.740829286868732e-06, + "loss": 0.2819, + "mean_token_accuracy": 0.8997117280960083, + "step": 4632 + }, + { + "epoch": 2.3165, + "grad_norm": 2.4789962111252875, + "learning_rate": 4.740635789931507e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8896873593330383, + "step": 4633 + }, + { + "epoch": 2.317, + "grad_norm": 2.150864373721283, + "learning_rate": 4.740442224740612e-06, + "loss": 0.2616, + "mean_token_accuracy": 0.9137720465660095, + "step": 4634 + }, + { + "epoch": 2.3175, + "grad_norm": 2.8907066439335054, + "learning_rate": 4.740248591301945e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.9023203253746033, + "step": 4635 + }, + { + "epoch": 2.318, + "grad_norm": 3.202593775491289, + "learning_rate": 4.740054889621403e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8876771926879883, + "step": 4636 + }, + { + "epoch": 2.3185000000000002, + "grad_norm": 4.350877783747187, + "learning_rate": 4.7398611197048875e-06, + "loss": 0.3114, + "mean_token_accuracy": 0.9022836089134216, + "step": 4637 + }, + { + "epoch": 2.319, + "grad_norm": 3.1354214622516636, + "learning_rate": 4.7396672815583e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8975885510444641, + "step": 4638 + }, + { + "epoch": 2.3195, + "grad_norm": 2.4093833900408557, + "learning_rate": 4.739473375187546e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.9010319709777832, + "step": 4639 + }, + { + "epoch": 2.32, + "grad_norm": 2.247464621881179, + "learning_rate": 4.7392794005985324e-06, + "loss": 0.2349, + "mean_token_accuracy": 0.9162895679473877, + "step": 4640 + }, + { + "epoch": 2.3205, + "grad_norm": 2.0174808835461837, + "learning_rate": 4.7390853577971675e-06, + "loss": 0.257, + "mean_token_accuracy": 0.916882336139679, + "step": 4641 + }, + { + "epoch": 2.321, + "grad_norm": 1.9221894270556399, + "learning_rate": 4.738891246789362e-06, + "loss": 0.2321, + "mean_token_accuracy": 0.9205940365791321, + "step": 4642 + }, + { + "epoch": 2.3215, + "grad_norm": 4.915211104565301, + "learning_rate": 4.73869706758103e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8931481838226318, + "step": 4643 + }, + { + "epoch": 2.322, + "grad_norm": 3.3156105408193266, + "learning_rate": 4.738502820178085e-06, + "loss": 0.2698, + "mean_token_accuracy": 0.9035605788230896, + "step": 4644 + }, + { + "epoch": 2.3225, + "grad_norm": 2.8137309623485005, + "learning_rate": 4.738308504586445e-06, + "loss": 0.2457, + "mean_token_accuracy": 0.920730471611023, + "step": 4645 + }, + { + "epoch": 2.323, + "grad_norm": 3.481467052992314, + "learning_rate": 4.738114120812029e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8793311715126038, + "step": 4646 + }, + { + "epoch": 2.3235, + "grad_norm": 3.5298441390065203, + "learning_rate": 4.737919668860759e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8790581226348877, + "step": 4647 + }, + { + "epoch": 2.324, + "grad_norm": 1.8528909393844477, + "learning_rate": 4.7377251487385565e-06, + "loss": 0.2416, + "mean_token_accuracy": 0.9157949686050415, + "step": 4648 + }, + { + "epoch": 2.3245, + "grad_norm": 2.524147349061012, + "learning_rate": 4.737530560451349e-06, + "loss": 0.2795, + "mean_token_accuracy": 0.9048847556114197, + "step": 4649 + }, + { + "epoch": 2.325, + "grad_norm": 3.37686145134575, + "learning_rate": 4.737335904005063e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8964600563049316, + "step": 4650 + }, + { + "epoch": 2.3255, + "grad_norm": 2.926985888512371, + "learning_rate": 4.7371411794056275e-06, + "loss": 0.2711, + "mean_token_accuracy": 0.9048648476600647, + "step": 4651 + }, + { + "epoch": 2.326, + "grad_norm": 1.770794631318574, + "learning_rate": 4.736946386658976e-06, + "loss": 0.2588, + "mean_token_accuracy": 0.9088693857192993, + "step": 4652 + }, + { + "epoch": 2.3265000000000002, + "grad_norm": 2.867867318391519, + "learning_rate": 4.736751525771039e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.8973971605300903, + "step": 4653 + }, + { + "epoch": 2.327, + "grad_norm": 3.0068578846471112, + "learning_rate": 4.736556596747757e-06, + "loss": 0.303, + "mean_token_accuracy": 0.9010251760482788, + "step": 4654 + }, + { + "epoch": 2.3275, + "grad_norm": 2.102199671044542, + "learning_rate": 4.736361599595063e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.9030901193618774, + "step": 4655 + }, + { + "epoch": 2.328, + "grad_norm": 1.8390354880887376, + "learning_rate": 4.7361665343189e-06, + "loss": 0.2874, + "mean_token_accuracy": 0.9108788967132568, + "step": 4656 + }, + { + "epoch": 2.3285, + "grad_norm": 5.180250530171817, + "learning_rate": 4.735971400925209e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.8936854600906372, + "step": 4657 + }, + { + "epoch": 2.329, + "grad_norm": 2.305763134292441, + "learning_rate": 4.735776199419935e-06, + "loss": 0.1805, + "mean_token_accuracy": 0.9328635931015015, + "step": 4658 + }, + { + "epoch": 2.3295, + "grad_norm": 2.1646197963144136, + "learning_rate": 4.735580929809022e-06, + "loss": 0.2939, + "mean_token_accuracy": 0.9016561508178711, + "step": 4659 + }, + { + "epoch": 2.33, + "grad_norm": 2.402632383960276, + "learning_rate": 4.735385592098421e-06, + "loss": 0.2751, + "mean_token_accuracy": 0.9018545746803284, + "step": 4660 + }, + { + "epoch": 2.3305, + "grad_norm": 1.76652062219238, + "learning_rate": 4.735190186294081e-06, + "loss": 0.2092, + "mean_token_accuracy": 0.9225634336471558, + "step": 4661 + }, + { + "epoch": 2.331, + "grad_norm": 1.657757106124429, + "learning_rate": 4.734994712401953e-06, + "loss": 0.2796, + "mean_token_accuracy": 0.9095718860626221, + "step": 4662 + }, + { + "epoch": 2.3315, + "grad_norm": 1.8763316105381433, + "learning_rate": 4.734799170427994e-06, + "loss": 0.2921, + "mean_token_accuracy": 0.9005380272865295, + "step": 4663 + }, + { + "epoch": 2.332, + "grad_norm": 2.416475450003132, + "learning_rate": 4.73460356037816e-06, + "loss": 0.2632, + "mean_token_accuracy": 0.908527135848999, + "step": 4664 + }, + { + "epoch": 2.3325, + "grad_norm": 3.6883689216430033, + "learning_rate": 4.734407882258408e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.9041049480438232, + "step": 4665 + }, + { + "epoch": 2.333, + "grad_norm": 2.961590089756114, + "learning_rate": 4.734212136074701e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8754612803459167, + "step": 4666 + }, + { + "epoch": 2.3335, + "grad_norm": 2.0575554684069504, + "learning_rate": 4.7340163218329994e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.899368166923523, + "step": 4667 + }, + { + "epoch": 2.334, + "grad_norm": 2.529976271337104, + "learning_rate": 4.73382043953927e-06, + "loss": 0.3198, + "mean_token_accuracy": 0.8994580507278442, + "step": 4668 + }, + { + "epoch": 2.3345000000000002, + "grad_norm": 3.3380381404864083, + "learning_rate": 4.733624489199479e-06, + "loss": 0.3171, + "mean_token_accuracy": 0.8985432982444763, + "step": 4669 + }, + { + "epoch": 2.335, + "grad_norm": 12.373937911697158, + "learning_rate": 4.733428470819595e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8960123658180237, + "step": 4670 + }, + { + "epoch": 2.3355, + "grad_norm": 2.2948651260372155, + "learning_rate": 4.733232384405589e-06, + "loss": 0.2686, + "mean_token_accuracy": 0.9062074422836304, + "step": 4671 + }, + { + "epoch": 2.336, + "grad_norm": 2.0316676755742473, + "learning_rate": 4.733036229963435e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8842647075653076, + "step": 4672 + }, + { + "epoch": 2.3365, + "grad_norm": 2.7524708278758125, + "learning_rate": 4.732840007499106e-06, + "loss": 0.2939, + "mean_token_accuracy": 0.9045354723930359, + "step": 4673 + }, + { + "epoch": 2.337, + "grad_norm": 1.9338714213744115, + "learning_rate": 4.732643717018583e-06, + "loss": 0.2738, + "mean_token_accuracy": 0.9075924158096313, + "step": 4674 + }, + { + "epoch": 2.3375, + "grad_norm": 2.039545689350446, + "learning_rate": 4.732447358527843e-06, + "loss": 0.2083, + "mean_token_accuracy": 0.9261114597320557, + "step": 4675 + }, + { + "epoch": 2.338, + "grad_norm": 2.5430478533725753, + "learning_rate": 4.732250932032867e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8699392080307007, + "step": 4676 + }, + { + "epoch": 2.3385, + "grad_norm": 2.0995706441943067, + "learning_rate": 4.73205443753964e-06, + "loss": 0.2143, + "mean_token_accuracy": 0.9248778223991394, + "step": 4677 + }, + { + "epoch": 2.339, + "grad_norm": 2.8097821941495424, + "learning_rate": 4.731857875054147e-06, + "loss": 0.2576, + "mean_token_accuracy": 0.9192795157432556, + "step": 4678 + }, + { + "epoch": 2.3395, + "grad_norm": 2.3691939066457337, + "learning_rate": 4.731661244582375e-06, + "loss": 0.3047, + "mean_token_accuracy": 0.9054632782936096, + "step": 4679 + }, + { + "epoch": 2.34, + "grad_norm": 14.225207719241503, + "learning_rate": 4.731464546130315e-06, + "loss": 0.2375, + "mean_token_accuracy": 0.9240121841430664, + "step": 4680 + }, + { + "epoch": 2.3405, + "grad_norm": 3.2433303106287403, + "learning_rate": 4.731267779703956e-06, + "loss": 0.2605, + "mean_token_accuracy": 0.9088002443313599, + "step": 4681 + }, + { + "epoch": 2.341, + "grad_norm": 3.4444261239183454, + "learning_rate": 4.731070945309295e-06, + "loss": 0.2487, + "mean_token_accuracy": 0.912848174571991, + "step": 4682 + }, + { + "epoch": 2.3415, + "grad_norm": 1.6327023188107732, + "learning_rate": 4.730874042952327e-06, + "loss": 0.2203, + "mean_token_accuracy": 0.9221605658531189, + "step": 4683 + }, + { + "epoch": 2.342, + "grad_norm": 3.6288751518593365, + "learning_rate": 4.730677072639049e-06, + "loss": 0.2742, + "mean_token_accuracy": 0.9118934273719788, + "step": 4684 + }, + { + "epoch": 2.3425000000000002, + "grad_norm": 1.5056190483238017, + "learning_rate": 4.730480034375462e-06, + "loss": 0.2361, + "mean_token_accuracy": 0.9195583462715149, + "step": 4685 + }, + { + "epoch": 2.343, + "grad_norm": 2.1843493334855175, + "learning_rate": 4.730282928167568e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8894488215446472, + "step": 4686 + }, + { + "epoch": 2.3435, + "grad_norm": 3.979509147543485, + "learning_rate": 4.730085754021371e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8912234902381897, + "step": 4687 + }, + { + "epoch": 2.344, + "grad_norm": 2.94709795709841, + "learning_rate": 4.729888511942877e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.893588662147522, + "step": 4688 + }, + { + "epoch": 2.3445, + "grad_norm": 3.347628763850283, + "learning_rate": 4.729691201938096e-06, + "loss": 0.2716, + "mean_token_accuracy": 0.9067838191986084, + "step": 4689 + }, + { + "epoch": 2.3449999999999998, + "grad_norm": 2.4947348529567956, + "learning_rate": 4.729493824013036e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.9050378799438477, + "step": 4690 + }, + { + "epoch": 2.3455, + "grad_norm": 2.4182544743197583, + "learning_rate": 4.72929637817371e-06, + "loss": 0.2776, + "mean_token_accuracy": 0.9027959108352661, + "step": 4691 + }, + { + "epoch": 2.346, + "grad_norm": 4.197965558520363, + "learning_rate": 4.729098864426134e-06, + "loss": 0.259, + "mean_token_accuracy": 0.9145658016204834, + "step": 4692 + }, + { + "epoch": 2.3465, + "grad_norm": 3.2434062050884718, + "learning_rate": 4.728901282776323e-06, + "loss": 0.2369, + "mean_token_accuracy": 0.9140024185180664, + "step": 4693 + }, + { + "epoch": 2.347, + "grad_norm": 2.9087265256383663, + "learning_rate": 4.728703633230297e-06, + "loss": 0.2548, + "mean_token_accuracy": 0.9131638407707214, + "step": 4694 + }, + { + "epoch": 2.3475, + "grad_norm": 2.023256567630682, + "learning_rate": 4.7285059157940765e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.9034743309020996, + "step": 4695 + }, + { + "epoch": 2.348, + "grad_norm": 2.4877731097713047, + "learning_rate": 4.7283081304736834e-06, + "loss": 0.2659, + "mean_token_accuracy": 0.9124253392219543, + "step": 4696 + }, + { + "epoch": 2.3485, + "grad_norm": 3.219267470292521, + "learning_rate": 4.728110277275143e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8946964144706726, + "step": 4697 + }, + { + "epoch": 2.349, + "grad_norm": 6.748803293459518, + "learning_rate": 4.7279123562044835e-06, + "loss": 0.2649, + "mean_token_accuracy": 0.9099617004394531, + "step": 4698 + }, + { + "epoch": 2.3495, + "grad_norm": 1.7876014181104574, + "learning_rate": 4.727714367267732e-06, + "loss": 0.2495, + "mean_token_accuracy": 0.9160872101783752, + "step": 4699 + }, + { + "epoch": 2.35, + "grad_norm": 24.687811391267836, + "learning_rate": 4.72751631047092e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8896595239639282, + "step": 4700 + }, + { + "epoch": 2.3505, + "grad_norm": 2.332036584280311, + "learning_rate": 4.727318185820081e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.889609158039093, + "step": 4701 + }, + { + "epoch": 2.351, + "grad_norm": 2.0281229580145808, + "learning_rate": 4.727119993321252e-06, + "loss": 0.2194, + "mean_token_accuracy": 0.9214031100273132, + "step": 4702 + }, + { + "epoch": 2.3515, + "grad_norm": 6.477948878044584, + "learning_rate": 4.726921732980467e-06, + "loss": 0.3052, + "mean_token_accuracy": 0.8985669016838074, + "step": 4703 + }, + { + "epoch": 2.352, + "grad_norm": 1.7818760082688072, + "learning_rate": 4.726723404803767e-06, + "loss": 0.2298, + "mean_token_accuracy": 0.9209372997283936, + "step": 4704 + }, + { + "epoch": 2.3525, + "grad_norm": 2.191499778101814, + "learning_rate": 4.726525008797194e-06, + "loss": 0.2605, + "mean_token_accuracy": 0.9076138138771057, + "step": 4705 + }, + { + "epoch": 2.3529999999999998, + "grad_norm": 1.8325698116829192, + "learning_rate": 4.72632654496679e-06, + "loss": 0.3074, + "mean_token_accuracy": 0.8937007784843445, + "step": 4706 + }, + { + "epoch": 2.3535, + "grad_norm": 1.5155952568712592, + "learning_rate": 4.726128013318602e-06, + "loss": 0.2625, + "mean_token_accuracy": 0.904837429523468, + "step": 4707 + }, + { + "epoch": 2.354, + "grad_norm": 5.3701728189767985, + "learning_rate": 4.725929413858677e-06, + "loss": 0.2562, + "mean_token_accuracy": 0.9164395928382874, + "step": 4708 + }, + { + "epoch": 2.3545, + "grad_norm": 2.222550273961984, + "learning_rate": 4.725730746593064e-06, + "loss": 0.2323, + "mean_token_accuracy": 0.9133060574531555, + "step": 4709 + }, + { + "epoch": 2.355, + "grad_norm": 2.832883926181154, + "learning_rate": 4.725532011527817e-06, + "loss": 0.2904, + "mean_token_accuracy": 0.904346227645874, + "step": 4710 + }, + { + "epoch": 2.3555, + "grad_norm": 2.476181681395715, + "learning_rate": 4.725333208668987e-06, + "loss": 0.2617, + "mean_token_accuracy": 0.9047528505325317, + "step": 4711 + }, + { + "epoch": 2.356, + "grad_norm": 2.16727904003633, + "learning_rate": 4.725134338022631e-06, + "loss": 0.2458, + "mean_token_accuracy": 0.907634437084198, + "step": 4712 + }, + { + "epoch": 2.3565, + "grad_norm": 2.4463030022027747, + "learning_rate": 4.724935399594807e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8863845467567444, + "step": 4713 + }, + { + "epoch": 2.357, + "grad_norm": 7.768729132174687, + "learning_rate": 4.724736393391577e-06, + "loss": 0.2579, + "mean_token_accuracy": 0.9183955788612366, + "step": 4714 + }, + { + "epoch": 2.3575, + "grad_norm": 1.8136729250738888, + "learning_rate": 4.724537319419e-06, + "loss": 0.2335, + "mean_token_accuracy": 0.9201309084892273, + "step": 4715 + }, + { + "epoch": 2.358, + "grad_norm": 2.9367780653838933, + "learning_rate": 4.724338177683141e-06, + "loss": 0.2912, + "mean_token_accuracy": 0.9059289693832397, + "step": 4716 + }, + { + "epoch": 2.3585, + "grad_norm": 13.922131720979088, + "learning_rate": 4.724138968190067e-06, + "loss": 0.2635, + "mean_token_accuracy": 0.9144151210784912, + "step": 4717 + }, + { + "epoch": 2.359, + "grad_norm": 2.466668087744495, + "learning_rate": 4.723939690945846e-06, + "loss": 0.2495, + "mean_token_accuracy": 0.9202108979225159, + "step": 4718 + }, + { + "epoch": 2.3595, + "grad_norm": 1.6630133453615208, + "learning_rate": 4.723740345956547e-06, + "loss": 0.2731, + "mean_token_accuracy": 0.9023563265800476, + "step": 4719 + }, + { + "epoch": 2.36, + "grad_norm": 2.2328278812258326, + "learning_rate": 4.723540933228245e-06, + "loss": 0.2868, + "mean_token_accuracy": 0.9030413031578064, + "step": 4720 + }, + { + "epoch": 2.3605, + "grad_norm": 2.2756332073785432, + "learning_rate": 4.723341452767012e-06, + "loss": 0.2911, + "mean_token_accuracy": 0.9013791084289551, + "step": 4721 + }, + { + "epoch": 2.3609999999999998, + "grad_norm": 2.8612133478129156, + "learning_rate": 4.723141904578925e-06, + "loss": 0.3106, + "mean_token_accuracy": 0.905701756477356, + "step": 4722 + }, + { + "epoch": 2.3615, + "grad_norm": 27.24551315607496, + "learning_rate": 4.722942288670063e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8877941966056824, + "step": 4723 + }, + { + "epoch": 2.362, + "grad_norm": 2.0099662625133137, + "learning_rate": 4.722742605046509e-06, + "loss": 0.2272, + "mean_token_accuracy": 0.9212814569473267, + "step": 4724 + }, + { + "epoch": 2.3625, + "grad_norm": 2.5851020599982886, + "learning_rate": 4.7225428537143414e-06, + "loss": 0.281, + "mean_token_accuracy": 0.9101863503456116, + "step": 4725 + }, + { + "epoch": 2.363, + "grad_norm": 3.059163379391534, + "learning_rate": 4.722343034679647e-06, + "loss": 0.3179, + "mean_token_accuracy": 0.8900630474090576, + "step": 4726 + }, + { + "epoch": 2.3635, + "grad_norm": 3.3482483017769886, + "learning_rate": 4.722143147948513e-06, + "loss": 0.2147, + "mean_token_accuracy": 0.9224603772163391, + "step": 4727 + }, + { + "epoch": 2.364, + "grad_norm": 1.759051014280739, + "learning_rate": 4.721943193527029e-06, + "loss": 0.2268, + "mean_token_accuracy": 0.9235947728157043, + "step": 4728 + }, + { + "epoch": 2.3645, + "grad_norm": 3.4391683107387423, + "learning_rate": 4.721743171421285e-06, + "loss": 0.3083, + "mean_token_accuracy": 0.9035073518753052, + "step": 4729 + }, + { + "epoch": 2.365, + "grad_norm": 2.5160510491834236, + "learning_rate": 4.721543081637372e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8891509175300598, + "step": 4730 + }, + { + "epoch": 2.3655, + "grad_norm": 3.0375871736457936, + "learning_rate": 4.721342924181388e-06, + "loss": 0.2726, + "mean_token_accuracy": 0.9113709926605225, + "step": 4731 + }, + { + "epoch": 2.366, + "grad_norm": 14.614050629248302, + "learning_rate": 4.72114269905943e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8966138362884521, + "step": 4732 + }, + { + "epoch": 2.3665, + "grad_norm": 4.0439451479251085, + "learning_rate": 4.7209424062775954e-06, + "loss": 0.2858, + "mean_token_accuracy": 0.9112992286682129, + "step": 4733 + }, + { + "epoch": 2.367, + "grad_norm": 13.116065560161905, + "learning_rate": 4.7207420458419875e-06, + "loss": 0.2664, + "mean_token_accuracy": 0.9078765511512756, + "step": 4734 + }, + { + "epoch": 2.3675, + "grad_norm": 1.670507673164123, + "learning_rate": 4.720541617758707e-06, + "loss": 0.2686, + "mean_token_accuracy": 0.9070265889167786, + "step": 4735 + }, + { + "epoch": 2.368, + "grad_norm": 2.125535457820086, + "learning_rate": 4.720341122033862e-06, + "loss": 0.2763, + "mean_token_accuracy": 0.9035656452178955, + "step": 4736 + }, + { + "epoch": 2.3685, + "grad_norm": 2.524430575721085, + "learning_rate": 4.720140558673558e-06, + "loss": 0.2978, + "mean_token_accuracy": 0.9019321799278259, + "step": 4737 + }, + { + "epoch": 2.3689999999999998, + "grad_norm": 2.49849076512253, + "learning_rate": 4.719939927683906e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8950079083442688, + "step": 4738 + }, + { + "epoch": 2.3695, + "grad_norm": 1.4708211354373397, + "learning_rate": 4.719739229071017e-06, + "loss": 0.2091, + "mean_token_accuracy": 0.927299976348877, + "step": 4739 + }, + { + "epoch": 2.37, + "grad_norm": 3.6592285647262237, + "learning_rate": 4.719538462841003e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8834483027458191, + "step": 4740 + }, + { + "epoch": 2.3705, + "grad_norm": 2.102855693750361, + "learning_rate": 4.719337628999983e-06, + "loss": 0.2656, + "mean_token_accuracy": 0.9078202843666077, + "step": 4741 + }, + { + "epoch": 2.371, + "grad_norm": 3.2875645862854084, + "learning_rate": 4.719136727554072e-06, + "loss": 0.2792, + "mean_token_accuracy": 0.9051780104637146, + "step": 4742 + }, + { + "epoch": 2.3715, + "grad_norm": 2.7647862823785663, + "learning_rate": 4.718935758509391e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8861175775527954, + "step": 4743 + }, + { + "epoch": 2.372, + "grad_norm": 2.406501517704065, + "learning_rate": 4.718734721872062e-06, + "loss": 0.2931, + "mean_token_accuracy": 0.9113923907279968, + "step": 4744 + }, + { + "epoch": 2.3725, + "grad_norm": 1.8505535254237662, + "learning_rate": 4.718533617648209e-06, + "loss": 0.2091, + "mean_token_accuracy": 0.9270874857902527, + "step": 4745 + }, + { + "epoch": 2.373, + "grad_norm": 2.822220247903005, + "learning_rate": 4.718332445843956e-06, + "loss": 0.258, + "mean_token_accuracy": 0.912188708782196, + "step": 4746 + }, + { + "epoch": 2.3735, + "grad_norm": 2.263155077864366, + "learning_rate": 4.718131206465434e-06, + "loss": 0.2764, + "mean_token_accuracy": 0.9099482297897339, + "step": 4747 + }, + { + "epoch": 2.374, + "grad_norm": 7.08616414733895, + "learning_rate": 4.717929899518771e-06, + "loss": 0.236, + "mean_token_accuracy": 0.9125379920005798, + "step": 4748 + }, + { + "epoch": 2.3745, + "grad_norm": 1.9932570569784092, + "learning_rate": 4.7177285250101e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.886704683303833, + "step": 4749 + }, + { + "epoch": 2.375, + "grad_norm": 4.045664402638809, + "learning_rate": 4.717527082945555e-06, + "loss": 0.2805, + "mean_token_accuracy": 0.9015671610832214, + "step": 4750 + }, + { + "epoch": 2.3755, + "grad_norm": 4.224496886059489, + "learning_rate": 4.717325573331272e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8910863995552063, + "step": 4751 + }, + { + "epoch": 2.376, + "grad_norm": 1.9080795739168146, + "learning_rate": 4.71712399617339e-06, + "loss": 0.2038, + "mean_token_accuracy": 0.9298245906829834, + "step": 4752 + }, + { + "epoch": 2.3765, + "grad_norm": 2.3394545563761198, + "learning_rate": 4.716922351478049e-06, + "loss": 0.268, + "mean_token_accuracy": 0.905846893787384, + "step": 4753 + }, + { + "epoch": 2.377, + "grad_norm": 3.516129412255737, + "learning_rate": 4.716720639251392e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.8986835479736328, + "step": 4754 + }, + { + "epoch": 2.3775, + "grad_norm": 2.3542756402405103, + "learning_rate": 4.716518859499563e-06, + "loss": 0.2256, + "mean_token_accuracy": 0.9209228754043579, + "step": 4755 + }, + { + "epoch": 2.378, + "grad_norm": 4.874439334355, + "learning_rate": 4.716317012228707e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8749784231185913, + "step": 4756 + }, + { + "epoch": 2.3785, + "grad_norm": 1.5574576523755526, + "learning_rate": 4.716115097444976e-06, + "loss": 0.2391, + "mean_token_accuracy": 0.9170175194740295, + "step": 4757 + }, + { + "epoch": 2.379, + "grad_norm": 1.2389468949673526, + "learning_rate": 4.715913115154518e-06, + "loss": 0.1642, + "mean_token_accuracy": 0.9318437576293945, + "step": 4758 + }, + { + "epoch": 2.3795, + "grad_norm": 7.717246716511586, + "learning_rate": 4.715711065363487e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.900440514087677, + "step": 4759 + }, + { + "epoch": 2.38, + "grad_norm": 7.127884220458557, + "learning_rate": 4.715508948078037e-06, + "loss": 0.2294, + "mean_token_accuracy": 0.9216971397399902, + "step": 4760 + }, + { + "epoch": 2.3805, + "grad_norm": 2.6260270814562596, + "learning_rate": 4.715306763304326e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.8995540738105774, + "step": 4761 + }, + { + "epoch": 2.3810000000000002, + "grad_norm": 1.7362079435571118, + "learning_rate": 4.715104511048512e-06, + "loss": 0.2298, + "mean_token_accuracy": 0.9238227009773254, + "step": 4762 + }, + { + "epoch": 2.3815, + "grad_norm": 2.7058310420486027, + "learning_rate": 4.714902191316755e-06, + "loss": 0.3021, + "mean_token_accuracy": 0.9007019400596619, + "step": 4763 + }, + { + "epoch": 2.382, + "grad_norm": 2.402606019788953, + "learning_rate": 4.714699804115221e-06, + "loss": 0.2588, + "mean_token_accuracy": 0.9093282222747803, + "step": 4764 + }, + { + "epoch": 2.3825, + "grad_norm": 4.53353538594446, + "learning_rate": 4.714497349450071e-06, + "loss": 0.308, + "mean_token_accuracy": 0.899189293384552, + "step": 4765 + }, + { + "epoch": 2.383, + "grad_norm": 7.317732601196225, + "learning_rate": 4.7142948273274755e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8989385366439819, + "step": 4766 + }, + { + "epoch": 2.3835, + "grad_norm": 1.665921137644512, + "learning_rate": 4.714092237753603e-06, + "loss": 0.2188, + "mean_token_accuracy": 0.9265989661216736, + "step": 4767 + }, + { + "epoch": 2.384, + "grad_norm": 1.8129363103973715, + "learning_rate": 4.713889580734623e-06, + "loss": 0.2068, + "mean_token_accuracy": 0.9234828352928162, + "step": 4768 + }, + { + "epoch": 2.3845, + "grad_norm": 1.6239677488143005, + "learning_rate": 4.7136868562767105e-06, + "loss": 0.238, + "mean_token_accuracy": 0.9131600856781006, + "step": 4769 + }, + { + "epoch": 2.385, + "grad_norm": 3.8229923073805425, + "learning_rate": 4.71348406438604e-06, + "loss": 0.2624, + "mean_token_accuracy": 0.9132189750671387, + "step": 4770 + }, + { + "epoch": 2.3855, + "grad_norm": 2.754228466050309, + "learning_rate": 4.713281205068789e-06, + "loss": 0.2573, + "mean_token_accuracy": 0.9154195189476013, + "step": 4771 + }, + { + "epoch": 2.386, + "grad_norm": 1.8990024705276713, + "learning_rate": 4.713078278331138e-06, + "loss": 0.2204, + "mean_token_accuracy": 0.915339469909668, + "step": 4772 + }, + { + "epoch": 2.3865, + "grad_norm": 2.2959277699084764, + "learning_rate": 4.712875284179268e-06, + "loss": 0.2904, + "mean_token_accuracy": 0.9068873524665833, + "step": 4773 + }, + { + "epoch": 2.387, + "grad_norm": 8.755748973894436, + "learning_rate": 4.7126722226193615e-06, + "loss": 0.2512, + "mean_token_accuracy": 0.9088913202285767, + "step": 4774 + }, + { + "epoch": 2.3875, + "grad_norm": 2.9178077711357484, + "learning_rate": 4.712469093657605e-06, + "loss": 0.2359, + "mean_token_accuracy": 0.9192299842834473, + "step": 4775 + }, + { + "epoch": 2.388, + "grad_norm": 2.1579356702095875, + "learning_rate": 4.712265897300186e-06, + "loss": 0.2693, + "mean_token_accuracy": 0.9059942960739136, + "step": 4776 + }, + { + "epoch": 2.3885, + "grad_norm": 3.6619001053931295, + "learning_rate": 4.712062633553294e-06, + "loss": 0.2445, + "mean_token_accuracy": 0.915011465549469, + "step": 4777 + }, + { + "epoch": 2.3890000000000002, + "grad_norm": 2.0074216347073417, + "learning_rate": 4.7118593024231214e-06, + "loss": 0.2893, + "mean_token_accuracy": 0.8998976349830627, + "step": 4778 + }, + { + "epoch": 2.3895, + "grad_norm": 1.9819439783063697, + "learning_rate": 4.711655903915862e-06, + "loss": 0.1969, + "mean_token_accuracy": 0.9286461472511292, + "step": 4779 + }, + { + "epoch": 2.39, + "grad_norm": 3.6478488719093582, + "learning_rate": 4.71145243803771e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.9024693369865417, + "step": 4780 + }, + { + "epoch": 2.3905, + "grad_norm": 2.3994909159158664, + "learning_rate": 4.711248904794865e-06, + "loss": 0.2306, + "mean_token_accuracy": 0.9166666865348816, + "step": 4781 + }, + { + "epoch": 2.391, + "grad_norm": 2.3836735392332007, + "learning_rate": 4.711045304193528e-06, + "loss": 0.2337, + "mean_token_accuracy": 0.9122806787490845, + "step": 4782 + }, + { + "epoch": 2.3915, + "grad_norm": 2.355337497003899, + "learning_rate": 4.710841636239898e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8809455037117004, + "step": 4783 + }, + { + "epoch": 2.392, + "grad_norm": 2.5482100630702753, + "learning_rate": 4.710637900940181e-06, + "loss": 0.2789, + "mean_token_accuracy": 0.9106664061546326, + "step": 4784 + }, + { + "epoch": 2.3925, + "grad_norm": 3.2153827243389386, + "learning_rate": 4.710434098300584e-06, + "loss": 0.2719, + "mean_token_accuracy": 0.9113021492958069, + "step": 4785 + }, + { + "epoch": 2.393, + "grad_norm": 3.7077698474271066, + "learning_rate": 4.710230228327312e-06, + "loss": 0.2693, + "mean_token_accuracy": 0.9128458499908447, + "step": 4786 + }, + { + "epoch": 2.3935, + "grad_norm": 3.0637363400888686, + "learning_rate": 4.710026291026579e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.9038864970207214, + "step": 4787 + }, + { + "epoch": 2.394, + "grad_norm": 13.54317237511867, + "learning_rate": 4.7098222864045945e-06, + "loss": 0.2689, + "mean_token_accuracy": 0.9040805697441101, + "step": 4788 + }, + { + "epoch": 2.3945, + "grad_norm": 2.624325443572377, + "learning_rate": 4.709618214467574e-06, + "loss": 0.2748, + "mean_token_accuracy": 0.9091533422470093, + "step": 4789 + }, + { + "epoch": 2.395, + "grad_norm": 4.53554823988409, + "learning_rate": 4.709414075221734e-06, + "loss": 0.2259, + "mean_token_accuracy": 0.926074743270874, + "step": 4790 + }, + { + "epoch": 2.3955, + "grad_norm": 1.7888046489209233, + "learning_rate": 4.7092098686732925e-06, + "loss": 0.3011, + "mean_token_accuracy": 0.8959141373634338, + "step": 4791 + }, + { + "epoch": 2.396, + "grad_norm": 2.308128402983134, + "learning_rate": 4.709005594828471e-06, + "loss": 0.2619, + "mean_token_accuracy": 0.9112656116485596, + "step": 4792 + }, + { + "epoch": 2.3965, + "grad_norm": 2.0697230463996124, + "learning_rate": 4.70880125369349e-06, + "loss": 0.2394, + "mean_token_accuracy": 0.9157353043556213, + "step": 4793 + }, + { + "epoch": 2.3970000000000002, + "grad_norm": 3.3287054305208517, + "learning_rate": 4.7085968452745755e-06, + "loss": 0.3301, + "mean_token_accuracy": 0.8961997628211975, + "step": 4794 + }, + { + "epoch": 2.3975, + "grad_norm": 2.489962343842501, + "learning_rate": 4.7083923695779546e-06, + "loss": 0.2818, + "mean_token_accuracy": 0.9097611904144287, + "step": 4795 + }, + { + "epoch": 2.398, + "grad_norm": 2.8792300867938203, + "learning_rate": 4.708187826609855e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8908461928367615, + "step": 4796 + }, + { + "epoch": 2.3985, + "grad_norm": 6.143206277191439, + "learning_rate": 4.707983216376507e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8932504653930664, + "step": 4797 + }, + { + "epoch": 2.399, + "grad_norm": 2.199392396882917, + "learning_rate": 4.707778538884145e-06, + "loss": 0.289, + "mean_token_accuracy": 0.9053254723548889, + "step": 4798 + }, + { + "epoch": 2.3995, + "grad_norm": 2.717843317993558, + "learning_rate": 4.707573794139003e-06, + "loss": 0.2523, + "mean_token_accuracy": 0.9101890921592712, + "step": 4799 + }, + { + "epoch": 2.4, + "grad_norm": 2.5287286034482905, + "learning_rate": 4.707368982147318e-06, + "loss": 0.2829, + "mean_token_accuracy": 0.9081934690475464, + "step": 4800 + }, + { + "epoch": 2.4005, + "grad_norm": 2.1142816441793113, + "learning_rate": 4.707164102915328e-06, + "loss": 0.258, + "mean_token_accuracy": 0.916307806968689, + "step": 4801 + }, + { + "epoch": 2.401, + "grad_norm": 2.917215754021417, + "learning_rate": 4.706959156449275e-06, + "loss": 0.2762, + "mean_token_accuracy": 0.904365062713623, + "step": 4802 + }, + { + "epoch": 2.4015, + "grad_norm": 1.7177248291067415, + "learning_rate": 4.706754142755402e-06, + "loss": 0.2463, + "mean_token_accuracy": 0.9203515648841858, + "step": 4803 + }, + { + "epoch": 2.402, + "grad_norm": 1.839779772484962, + "learning_rate": 4.706549061839955e-06, + "loss": 0.1856, + "mean_token_accuracy": 0.9289380311965942, + "step": 4804 + }, + { + "epoch": 2.4025, + "grad_norm": 1.947965457073754, + "learning_rate": 4.706343913709178e-06, + "loss": 0.3028, + "mean_token_accuracy": 0.8989601135253906, + "step": 4805 + }, + { + "epoch": 2.403, + "grad_norm": 2.110793811116456, + "learning_rate": 4.7061386983693234e-06, + "loss": 0.2222, + "mean_token_accuracy": 0.925346851348877, + "step": 4806 + }, + { + "epoch": 2.4035, + "grad_norm": 3.566287741197299, + "learning_rate": 4.7059334158266405e-06, + "loss": 0.2609, + "mean_token_accuracy": 0.9069727063179016, + "step": 4807 + }, + { + "epoch": 2.404, + "grad_norm": 3.384940730495302, + "learning_rate": 4.705728066087384e-06, + "loss": 0.2516, + "mean_token_accuracy": 0.9136397242546082, + "step": 4808 + }, + { + "epoch": 2.4045, + "grad_norm": 1.9482415659916343, + "learning_rate": 4.705522649157808e-06, + "loss": 0.2843, + "mean_token_accuracy": 0.9052435159683228, + "step": 4809 + }, + { + "epoch": 2.4050000000000002, + "grad_norm": 5.073052416637311, + "learning_rate": 4.70531716504417e-06, + "loss": 0.2675, + "mean_token_accuracy": 0.899341344833374, + "step": 4810 + }, + { + "epoch": 2.4055, + "grad_norm": 3.4635321629674647, + "learning_rate": 4.70511161375273e-06, + "loss": 0.2456, + "mean_token_accuracy": 0.9102704524993896, + "step": 4811 + }, + { + "epoch": 2.406, + "grad_norm": 3.3360626445981816, + "learning_rate": 4.704905995289749e-06, + "loss": 0.2676, + "mean_token_accuracy": 0.9011660218238831, + "step": 4812 + }, + { + "epoch": 2.4065, + "grad_norm": 2.2059732826059326, + "learning_rate": 4.704700309661491e-06, + "loss": 0.2668, + "mean_token_accuracy": 0.9102951288223267, + "step": 4813 + }, + { + "epoch": 2.407, + "grad_norm": 2.6435167663581463, + "learning_rate": 4.704494556874221e-06, + "loss": 0.305, + "mean_token_accuracy": 0.8993755578994751, + "step": 4814 + }, + { + "epoch": 2.4074999999999998, + "grad_norm": 2.3648592908065784, + "learning_rate": 4.704288736934207e-06, + "loss": 0.2428, + "mean_token_accuracy": 0.9181337952613831, + "step": 4815 + }, + { + "epoch": 2.408, + "grad_norm": 1.9987624775020665, + "learning_rate": 4.704082849847718e-06, + "loss": 0.2645, + "mean_token_accuracy": 0.9088437557220459, + "step": 4816 + }, + { + "epoch": 2.4085, + "grad_norm": 2.753478944953736, + "learning_rate": 4.703876895621026e-06, + "loss": 0.2248, + "mean_token_accuracy": 0.9241162538528442, + "step": 4817 + }, + { + "epoch": 2.409, + "grad_norm": 1.7529712820366017, + "learning_rate": 4.7036708742604054e-06, + "loss": 0.2679, + "mean_token_accuracy": 0.9152276515960693, + "step": 4818 + }, + { + "epoch": 2.4095, + "grad_norm": 6.918822354442996, + "learning_rate": 4.70346478577213e-06, + "loss": 0.2695, + "mean_token_accuracy": 0.9125879406929016, + "step": 4819 + }, + { + "epoch": 2.41, + "grad_norm": 1.9352228645913794, + "learning_rate": 4.703258630162481e-06, + "loss": 0.2412, + "mean_token_accuracy": 0.9176470637321472, + "step": 4820 + }, + { + "epoch": 2.4105, + "grad_norm": 2.462911031401737, + "learning_rate": 4.703052407437735e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.9082793593406677, + "step": 4821 + }, + { + "epoch": 2.411, + "grad_norm": 8.453133990490617, + "learning_rate": 4.702846117604176e-06, + "loss": 0.2342, + "mean_token_accuracy": 0.9194977879524231, + "step": 4822 + }, + { + "epoch": 2.4115, + "grad_norm": 2.9965297469723056, + "learning_rate": 4.702639760668086e-06, + "loss": 0.2961, + "mean_token_accuracy": 0.9127652645111084, + "step": 4823 + }, + { + "epoch": 2.412, + "grad_norm": 2.8473143474802733, + "learning_rate": 4.702433336635753e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8915041089057922, + "step": 4824 + }, + { + "epoch": 2.4125, + "grad_norm": 15.5696521368279, + "learning_rate": 4.702226845513465e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8636568188667297, + "step": 4825 + }, + { + "epoch": 2.413, + "grad_norm": 36.633740116441984, + "learning_rate": 4.702020287307509e-06, + "loss": 0.2743, + "mean_token_accuracy": 0.9037793874740601, + "step": 4826 + }, + { + "epoch": 2.4135, + "grad_norm": 2.280914371966527, + "learning_rate": 4.7018136620241805e-06, + "loss": 0.2293, + "mean_token_accuracy": 0.9163014888763428, + "step": 4827 + }, + { + "epoch": 2.414, + "grad_norm": 2.934307925728452, + "learning_rate": 4.701606969669773e-06, + "loss": 0.2582, + "mean_token_accuracy": 0.9098832011222839, + "step": 4828 + }, + { + "epoch": 2.4145, + "grad_norm": 3.1977976903650323, + "learning_rate": 4.701400210250582e-06, + "loss": 0.2601, + "mean_token_accuracy": 0.9062084555625916, + "step": 4829 + }, + { + "epoch": 2.415, + "grad_norm": 2.65489311799924, + "learning_rate": 4.701193383772905e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8781841993331909, + "step": 4830 + }, + { + "epoch": 2.4154999999999998, + "grad_norm": 1.828631810774035, + "learning_rate": 4.7009864902430445e-06, + "loss": 0.2214, + "mean_token_accuracy": 0.9183439016342163, + "step": 4831 + }, + { + "epoch": 2.416, + "grad_norm": 3.716001496233907, + "learning_rate": 4.700779529667301e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8895506858825684, + "step": 4832 + }, + { + "epoch": 2.4165, + "grad_norm": 1.5037547123771664, + "learning_rate": 4.700572502051979e-06, + "loss": 0.218, + "mean_token_accuracy": 0.9196842312812805, + "step": 4833 + }, + { + "epoch": 2.417, + "grad_norm": 3.7361135784283825, + "learning_rate": 4.700365407403387e-06, + "loss": 0.1856, + "mean_token_accuracy": 0.9298725128173828, + "step": 4834 + }, + { + "epoch": 2.4175, + "grad_norm": 1.9010413136170747, + "learning_rate": 4.70015824572783e-06, + "loss": 0.2429, + "mean_token_accuracy": 0.9164119958877563, + "step": 4835 + }, + { + "epoch": 2.418, + "grad_norm": 7.144839957515657, + "learning_rate": 4.699951017031622e-06, + "loss": 0.3032, + "mean_token_accuracy": 0.9042943120002747, + "step": 4836 + }, + { + "epoch": 2.4185, + "grad_norm": 2.5750657800380408, + "learning_rate": 4.699743721321073e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8892291784286499, + "step": 4837 + }, + { + "epoch": 2.419, + "grad_norm": 8.433674549694018, + "learning_rate": 4.6995363586024975e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8867059350013733, + "step": 4838 + }, + { + "epoch": 2.4195, + "grad_norm": 4.320819022783197, + "learning_rate": 4.699328928882215e-06, + "loss": 0.2767, + "mean_token_accuracy": 0.9100025296211243, + "step": 4839 + }, + { + "epoch": 2.42, + "grad_norm": 26.407282374498184, + "learning_rate": 4.699121432166542e-06, + "loss": 0.2041, + "mean_token_accuracy": 0.9339215159416199, + "step": 4840 + }, + { + "epoch": 2.4205, + "grad_norm": 2.2590590500723207, + "learning_rate": 4.698913868461799e-06, + "loss": 0.2611, + "mean_token_accuracy": 0.9053933024406433, + "step": 4841 + }, + { + "epoch": 2.421, + "grad_norm": 1.6918848867381255, + "learning_rate": 4.698706237774309e-06, + "loss": 0.2734, + "mean_token_accuracy": 0.9118779301643372, + "step": 4842 + }, + { + "epoch": 2.4215, + "grad_norm": 2.386114011399612, + "learning_rate": 4.698498540110397e-06, + "loss": 0.2536, + "mean_token_accuracy": 0.9194427728652954, + "step": 4843 + }, + { + "epoch": 2.422, + "grad_norm": 2.175001037208784, + "learning_rate": 4.6982907754763905e-06, + "loss": 0.2964, + "mean_token_accuracy": 0.893926203250885, + "step": 4844 + }, + { + "epoch": 2.4225, + "grad_norm": 2.711790975285594, + "learning_rate": 4.6980829438786176e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8866490125656128, + "step": 4845 + }, + { + "epoch": 2.423, + "grad_norm": 2.492347151507052, + "learning_rate": 4.69787504532341e-06, + "loss": 0.2571, + "mean_token_accuracy": 0.9090909361839294, + "step": 4846 + }, + { + "epoch": 2.4234999999999998, + "grad_norm": 1.6572015852637434, + "learning_rate": 4.6976670798171e-06, + "loss": 0.214, + "mean_token_accuracy": 0.9212771654129028, + "step": 4847 + }, + { + "epoch": 2.424, + "grad_norm": 2.0298105163508504, + "learning_rate": 4.697459047366022e-06, + "loss": 0.2199, + "mean_token_accuracy": 0.9210526347160339, + "step": 4848 + }, + { + "epoch": 2.4245, + "grad_norm": 2.840583865018109, + "learning_rate": 4.697250947976513e-06, + "loss": 0.339, + "mean_token_accuracy": 0.8956989049911499, + "step": 4849 + }, + { + "epoch": 2.425, + "grad_norm": 3.224463067001575, + "learning_rate": 4.697042781654913e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8952069282531738, + "step": 4850 + }, + { + "epoch": 2.4255, + "grad_norm": 1.8125857487148331, + "learning_rate": 4.696834548407564e-06, + "loss": 0.1752, + "mean_token_accuracy": 0.932341456413269, + "step": 4851 + }, + { + "epoch": 2.426, + "grad_norm": 1.6157938134259513, + "learning_rate": 4.696626248240808e-06, + "loss": 0.2491, + "mean_token_accuracy": 0.9067015051841736, + "step": 4852 + }, + { + "epoch": 2.4265, + "grad_norm": 2.5363887410712236, + "learning_rate": 4.696417881160989e-06, + "loss": 0.2762, + "mean_token_accuracy": 0.9159609079360962, + "step": 4853 + }, + { + "epoch": 2.427, + "grad_norm": 50.8042046615741, + "learning_rate": 4.696209447174456e-06, + "loss": 0.3005, + "mean_token_accuracy": 0.8996146321296692, + "step": 4854 + }, + { + "epoch": 2.4275, + "grad_norm": 6.67074180276864, + "learning_rate": 4.696000946287558e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8836925029754639, + "step": 4855 + }, + { + "epoch": 2.428, + "grad_norm": 2.052895416896962, + "learning_rate": 4.695792378506645e-06, + "loss": 0.3184, + "mean_token_accuracy": 0.8893922567367554, + "step": 4856 + }, + { + "epoch": 2.4285, + "grad_norm": 3.3721033303885584, + "learning_rate": 4.695583743838072e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.9049817323684692, + "step": 4857 + }, + { + "epoch": 2.429, + "grad_norm": 2.3804952453518475, + "learning_rate": 4.6953750422881935e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8478230834007263, + "step": 4858 + }, + { + "epoch": 2.4295, + "grad_norm": 2.1836865602731073, + "learning_rate": 4.695166273863367e-06, + "loss": 0.2418, + "mean_token_accuracy": 0.9128026366233826, + "step": 4859 + }, + { + "epoch": 2.43, + "grad_norm": 1.86395643398524, + "learning_rate": 4.6949574385699514e-06, + "loss": 0.219, + "mean_token_accuracy": 0.922135055065155, + "step": 4860 + }, + { + "epoch": 2.4305, + "grad_norm": 4.1530743294232115, + "learning_rate": 4.69474853641431e-06, + "loss": 0.3136, + "mean_token_accuracy": 0.9016001224517822, + "step": 4861 + }, + { + "epoch": 2.431, + "grad_norm": 2.325616046097019, + "learning_rate": 4.694539567402805e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.887267529964447, + "step": 4862 + }, + { + "epoch": 2.4314999999999998, + "grad_norm": 2.121686949105355, + "learning_rate": 4.694330531541801e-06, + "loss": 0.2843, + "mean_token_accuracy": 0.9026768803596497, + "step": 4863 + }, + { + "epoch": 2.432, + "grad_norm": 2.839214585896897, + "learning_rate": 4.694121428837668e-06, + "loss": 0.2897, + "mean_token_accuracy": 0.8979160189628601, + "step": 4864 + }, + { + "epoch": 2.4325, + "grad_norm": 1.917000994022835, + "learning_rate": 4.693912259296773e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.9003397226333618, + "step": 4865 + }, + { + "epoch": 2.433, + "grad_norm": 2.2429837608353265, + "learning_rate": 4.69370302292549e-06, + "loss": 0.2625, + "mean_token_accuracy": 0.9084887504577637, + "step": 4866 + }, + { + "epoch": 2.4335, + "grad_norm": 2.5188486071074236, + "learning_rate": 4.693493719730192e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8833279609680176, + "step": 4867 + }, + { + "epoch": 2.434, + "grad_norm": 2.1317710665448533, + "learning_rate": 4.693284349717254e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8793538808822632, + "step": 4868 + }, + { + "epoch": 2.4345, + "grad_norm": 2.002566927349343, + "learning_rate": 4.6930749128930544e-06, + "loss": 0.2366, + "mean_token_accuracy": 0.9146751165390015, + "step": 4869 + }, + { + "epoch": 2.435, + "grad_norm": 3.045278262721878, + "learning_rate": 4.6928654092639725e-06, + "loss": 0.2953, + "mean_token_accuracy": 0.9063334465026855, + "step": 4870 + }, + { + "epoch": 2.4355, + "grad_norm": 3.016461113398292, + "learning_rate": 4.692655838836391e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8891016840934753, + "step": 4871 + }, + { + "epoch": 2.436, + "grad_norm": 2.669902065208249, + "learning_rate": 4.692446201616692e-06, + "loss": 0.2423, + "mean_token_accuracy": 0.919734001159668, + "step": 4872 + }, + { + "epoch": 2.4365, + "grad_norm": 5.837161619208518, + "learning_rate": 4.692236497611264e-06, + "loss": 0.3189, + "mean_token_accuracy": 0.8978102207183838, + "step": 4873 + }, + { + "epoch": 2.437, + "grad_norm": 3.3434470591553467, + "learning_rate": 4.692026726826493e-06, + "loss": 0.2497, + "mean_token_accuracy": 0.9132091403007507, + "step": 4874 + }, + { + "epoch": 2.4375, + "grad_norm": 2.731258222106659, + "learning_rate": 4.69181688926877e-06, + "loss": 0.263, + "mean_token_accuracy": 0.9149479866027832, + "step": 4875 + }, + { + "epoch": 2.438, + "grad_norm": 4.778955076816588, + "learning_rate": 4.691606984944486e-06, + "loss": 0.3027, + "mean_token_accuracy": 0.8924999833106995, + "step": 4876 + }, + { + "epoch": 2.4385, + "grad_norm": 3.572541607074457, + "learning_rate": 4.691397013860036e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.9006456136703491, + "step": 4877 + }, + { + "epoch": 2.439, + "grad_norm": 3.2889914737688337, + "learning_rate": 4.691186976021816e-06, + "loss": 0.3125, + "mean_token_accuracy": 0.8922910690307617, + "step": 4878 + }, + { + "epoch": 2.4395, + "grad_norm": 2.9028530485730926, + "learning_rate": 4.690976871436224e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8950109481811523, + "step": 4879 + }, + { + "epoch": 2.44, + "grad_norm": 2.8572987822917244, + "learning_rate": 4.690766700109659e-06, + "loss": 0.2658, + "mean_token_accuracy": 0.902434766292572, + "step": 4880 + }, + { + "epoch": 2.4405, + "grad_norm": 1.801279857918603, + "learning_rate": 4.690556462048526e-06, + "loss": 0.2501, + "mean_token_accuracy": 0.9126697778701782, + "step": 4881 + }, + { + "epoch": 2.441, + "grad_norm": 2.7217775114947163, + "learning_rate": 4.690346157259225e-06, + "loss": 0.2996, + "mean_token_accuracy": 0.9022332429885864, + "step": 4882 + }, + { + "epoch": 2.4415, + "grad_norm": 2.4083068769617526, + "learning_rate": 4.690135785748166e-06, + "loss": 0.2824, + "mean_token_accuracy": 0.9049304723739624, + "step": 4883 + }, + { + "epoch": 2.442, + "grad_norm": 3.309875066484587, + "learning_rate": 4.6899253475217565e-06, + "loss": 0.2529, + "mean_token_accuracy": 0.9090155959129333, + "step": 4884 + }, + { + "epoch": 2.4425, + "grad_norm": 1.5855229115348792, + "learning_rate": 4.689714842586406e-06, + "loss": 0.1989, + "mean_token_accuracy": 0.928379237651825, + "step": 4885 + }, + { + "epoch": 2.443, + "grad_norm": 2.5382708542808268, + "learning_rate": 4.689504270948527e-06, + "loss": 0.3092, + "mean_token_accuracy": 0.9033082723617554, + "step": 4886 + }, + { + "epoch": 2.4435000000000002, + "grad_norm": 2.4736649404046482, + "learning_rate": 4.689293632614534e-06, + "loss": 0.1899, + "mean_token_accuracy": 0.9376947283744812, + "step": 4887 + }, + { + "epoch": 2.444, + "grad_norm": 3.0765573177099763, + "learning_rate": 4.689082927590844e-06, + "loss": 0.2409, + "mean_token_accuracy": 0.9159049391746521, + "step": 4888 + }, + { + "epoch": 2.4445, + "grad_norm": 2.142812151721461, + "learning_rate": 4.688872155883874e-06, + "loss": 0.2762, + "mean_token_accuracy": 0.9074292182922363, + "step": 4889 + }, + { + "epoch": 2.445, + "grad_norm": 1.6582477243098783, + "learning_rate": 4.688661317500045e-06, + "loss": 0.2371, + "mean_token_accuracy": 0.9165393710136414, + "step": 4890 + }, + { + "epoch": 2.4455, + "grad_norm": 1.9466082038929833, + "learning_rate": 4.688450412445781e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8859649300575256, + "step": 4891 + }, + { + "epoch": 2.446, + "grad_norm": 1.4232115007014974, + "learning_rate": 4.688239440727504e-06, + "loss": 0.2218, + "mean_token_accuracy": 0.9254462718963623, + "step": 4892 + }, + { + "epoch": 2.4465, + "grad_norm": 2.8885975166933733, + "learning_rate": 4.688028402351643e-06, + "loss": 0.26, + "mean_token_accuracy": 0.9041686654090881, + "step": 4893 + }, + { + "epoch": 2.447, + "grad_norm": 3.0314395475778477, + "learning_rate": 4.687817297324625e-06, + "loss": 0.2335, + "mean_token_accuracy": 0.9217177629470825, + "step": 4894 + }, + { + "epoch": 2.4475, + "grad_norm": 3.1386528116547465, + "learning_rate": 4.687606125652882e-06, + "loss": 0.2255, + "mean_token_accuracy": 0.9135563969612122, + "step": 4895 + }, + { + "epoch": 2.448, + "grad_norm": 2.2625788077454194, + "learning_rate": 4.687394887342845e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.894605815410614, + "step": 4896 + }, + { + "epoch": 2.4485, + "grad_norm": 4.814916441964316, + "learning_rate": 4.6871835824009495e-06, + "loss": 0.3041, + "mean_token_accuracy": 0.8984482288360596, + "step": 4897 + }, + { + "epoch": 2.449, + "grad_norm": 2.695606030507802, + "learning_rate": 4.686972210833632e-06, + "loss": 0.2801, + "mean_token_accuracy": 0.9049533605575562, + "step": 4898 + }, + { + "epoch": 2.4495, + "grad_norm": 2.124476641236135, + "learning_rate": 4.6867607726473316e-06, + "loss": 0.3033, + "mean_token_accuracy": 0.8925865888595581, + "step": 4899 + }, + { + "epoch": 2.45, + "grad_norm": 1.8414694363060908, + "learning_rate": 4.68654926784849e-06, + "loss": 0.302, + "mean_token_accuracy": 0.8979966044425964, + "step": 4900 + }, + { + "epoch": 2.4505, + "grad_norm": 3.5497368207813804, + "learning_rate": 4.686337696443548e-06, + "loss": 0.2885, + "mean_token_accuracy": 0.9101715087890625, + "step": 4901 + }, + { + "epoch": 2.451, + "grad_norm": 12.546568613879813, + "learning_rate": 4.686126058438952e-06, + "loss": 0.2123, + "mean_token_accuracy": 0.9224351048469543, + "step": 4902 + }, + { + "epoch": 2.4515000000000002, + "grad_norm": 7.644922912189558, + "learning_rate": 4.685914353841148e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8902438879013062, + "step": 4903 + }, + { + "epoch": 2.452, + "grad_norm": 3.9941577023121604, + "learning_rate": 4.6857025826565845e-06, + "loss": 0.2012, + "mean_token_accuracy": 0.9276254177093506, + "step": 4904 + }, + { + "epoch": 2.4525, + "grad_norm": 4.245337686026754, + "learning_rate": 4.685490744891713e-06, + "loss": 0.2527, + "mean_token_accuracy": 0.9168328642845154, + "step": 4905 + }, + { + "epoch": 2.453, + "grad_norm": 1.649619134116451, + "learning_rate": 4.685278840552987e-06, + "loss": 0.2109, + "mean_token_accuracy": 0.9223140478134155, + "step": 4906 + }, + { + "epoch": 2.4535, + "grad_norm": 6.761267215071475, + "learning_rate": 4.6850668696468615e-06, + "loss": 0.272, + "mean_token_accuracy": 0.911488950252533, + "step": 4907 + }, + { + "epoch": 2.454, + "grad_norm": 3.3160877195682152, + "learning_rate": 4.684854832179792e-06, + "loss": 0.2655, + "mean_token_accuracy": 0.9140545725822449, + "step": 4908 + }, + { + "epoch": 2.4545, + "grad_norm": 5.69757728724191, + "learning_rate": 4.684642728158239e-06, + "loss": 0.3007, + "mean_token_accuracy": 0.9078056216239929, + "step": 4909 + }, + { + "epoch": 2.455, + "grad_norm": 2.228845233553048, + "learning_rate": 4.6844305575886635e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8931590914726257, + "step": 4910 + }, + { + "epoch": 2.4555, + "grad_norm": 2.8409274486818705, + "learning_rate": 4.684218320477528e-06, + "loss": 0.262, + "mean_token_accuracy": 0.9069288372993469, + "step": 4911 + }, + { + "epoch": 2.456, + "grad_norm": 5.894896320935186, + "learning_rate": 4.684006016831297e-06, + "loss": 0.3201, + "mean_token_accuracy": 0.8945069909095764, + "step": 4912 + }, + { + "epoch": 2.4565, + "grad_norm": 2.4080771571903457, + "learning_rate": 4.6837936466564395e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8980925679206848, + "step": 4913 + }, + { + "epoch": 2.457, + "grad_norm": 2.417708485997719, + "learning_rate": 4.683581209959423e-06, + "loss": 0.2873, + "mean_token_accuracy": 0.9064403176307678, + "step": 4914 + }, + { + "epoch": 2.4575, + "grad_norm": 2.974191009680019, + "learning_rate": 4.6833687067467185e-06, + "loss": 0.2969, + "mean_token_accuracy": 0.9104133248329163, + "step": 4915 + }, + { + "epoch": 2.458, + "grad_norm": 2.4814725646578744, + "learning_rate": 4.683156137024801e-06, + "loss": 0.2682, + "mean_token_accuracy": 0.9129483699798584, + "step": 4916 + }, + { + "epoch": 2.4585, + "grad_norm": 2.6219501292165495, + "learning_rate": 4.682943500800144e-06, + "loss": 0.2946, + "mean_token_accuracy": 0.9049307107925415, + "step": 4917 + }, + { + "epoch": 2.459, + "grad_norm": 3.306885289809539, + "learning_rate": 4.682730798079226e-06, + "loss": 0.2412, + "mean_token_accuracy": 0.9210178256034851, + "step": 4918 + }, + { + "epoch": 2.4595000000000002, + "grad_norm": 3.0506661505231105, + "learning_rate": 4.682518028868526e-06, + "loss": 0.2981, + "mean_token_accuracy": 0.9007092118263245, + "step": 4919 + }, + { + "epoch": 2.46, + "grad_norm": 1.7930695777183936, + "learning_rate": 4.682305193174524e-06, + "loss": 0.2173, + "mean_token_accuracy": 0.9353741407394409, + "step": 4920 + }, + { + "epoch": 2.4605, + "grad_norm": 3.436242070133307, + "learning_rate": 4.6820922910037055e-06, + "loss": 0.2554, + "mean_token_accuracy": 0.9196556806564331, + "step": 4921 + }, + { + "epoch": 2.461, + "grad_norm": 3.573250408382483, + "learning_rate": 4.681879322362555e-06, + "loss": 0.2365, + "mean_token_accuracy": 0.9137172698974609, + "step": 4922 + }, + { + "epoch": 2.4615, + "grad_norm": 3.077875491872947, + "learning_rate": 4.681666287257559e-06, + "loss": 0.2587, + "mean_token_accuracy": 0.9150197505950928, + "step": 4923 + }, + { + "epoch": 2.462, + "grad_norm": 7.172219055614579, + "learning_rate": 4.681453185695208e-06, + "loss": 0.3132, + "mean_token_accuracy": 0.904398500919342, + "step": 4924 + }, + { + "epoch": 2.4625, + "grad_norm": 1.8611088266542997, + "learning_rate": 4.681240017681994e-06, + "loss": 0.2481, + "mean_token_accuracy": 0.9141182899475098, + "step": 4925 + }, + { + "epoch": 2.463, + "grad_norm": 2.8200528243886076, + "learning_rate": 4.681026783224408e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8894811868667603, + "step": 4926 + }, + { + "epoch": 2.4635, + "grad_norm": 3.088037327918954, + "learning_rate": 4.6808134823289475e-06, + "loss": 0.2871, + "mean_token_accuracy": 0.9050809741020203, + "step": 4927 + }, + { + "epoch": 2.464, + "grad_norm": 1.7746180562229072, + "learning_rate": 4.680600115002109e-06, + "loss": 0.2928, + "mean_token_accuracy": 0.9006211161613464, + "step": 4928 + }, + { + "epoch": 2.4645, + "grad_norm": 2.1667668496210934, + "learning_rate": 4.680386681250394e-06, + "loss": 0.2725, + "mean_token_accuracy": 0.9060837626457214, + "step": 4929 + }, + { + "epoch": 2.465, + "grad_norm": 2.68204990002029, + "learning_rate": 4.680173181080302e-06, + "loss": 0.2517, + "mean_token_accuracy": 0.9129007458686829, + "step": 4930 + }, + { + "epoch": 2.4655, + "grad_norm": 2.3792966155702455, + "learning_rate": 4.679959614498337e-06, + "loss": 0.304, + "mean_token_accuracy": 0.9030197262763977, + "step": 4931 + }, + { + "epoch": 2.466, + "grad_norm": 2.50013095137517, + "learning_rate": 4.679745981511005e-06, + "loss": 0.271, + "mean_token_accuracy": 0.9079822897911072, + "step": 4932 + }, + { + "epoch": 2.4665, + "grad_norm": 3.83737014372705, + "learning_rate": 4.6795322821248135e-06, + "loss": 0.2617, + "mean_token_accuracy": 0.9181851744651794, + "step": 4933 + }, + { + "epoch": 2.467, + "grad_norm": 2.138471032518908, + "learning_rate": 4.679318516346273e-06, + "loss": 0.262, + "mean_token_accuracy": 0.9060530662536621, + "step": 4934 + }, + { + "epoch": 2.4675000000000002, + "grad_norm": 1.7231551412671167, + "learning_rate": 4.679104684181893e-06, + "loss": 0.2881, + "mean_token_accuracy": 0.9099954962730408, + "step": 4935 + }, + { + "epoch": 2.468, + "grad_norm": 1.7765010082842478, + "learning_rate": 4.6788907856381895e-06, + "loss": 0.2878, + "mean_token_accuracy": 0.8994413614273071, + "step": 4936 + }, + { + "epoch": 2.4685, + "grad_norm": 2.565786950978038, + "learning_rate": 4.678676820721677e-06, + "loss": 0.2823, + "mean_token_accuracy": 0.9039044976234436, + "step": 4937 + }, + { + "epoch": 2.469, + "grad_norm": 2.3041297849120084, + "learning_rate": 4.678462789438874e-06, + "loss": 0.2189, + "mean_token_accuracy": 0.9191260933876038, + "step": 4938 + }, + { + "epoch": 2.4695, + "grad_norm": 1.6267603271881366, + "learning_rate": 4.678248691796298e-06, + "loss": 0.2396, + "mean_token_accuracy": 0.9192779064178467, + "step": 4939 + }, + { + "epoch": 2.4699999999999998, + "grad_norm": 3.1962067321510927, + "learning_rate": 4.6780345278004744e-06, + "loss": 0.2933, + "mean_token_accuracy": 0.8974547386169434, + "step": 4940 + }, + { + "epoch": 2.4705, + "grad_norm": 3.0948074883404506, + "learning_rate": 4.677820297457924e-06, + "loss": 0.3057, + "mean_token_accuracy": 0.9006509184837341, + "step": 4941 + }, + { + "epoch": 2.471, + "grad_norm": 1.860372591825414, + "learning_rate": 4.6776060007751746e-06, + "loss": 0.2834, + "mean_token_accuracy": 0.9073300957679749, + "step": 4942 + }, + { + "epoch": 2.4715, + "grad_norm": 1.923122510203511, + "learning_rate": 4.677391637758752e-06, + "loss": 0.2789, + "mean_token_accuracy": 0.9102208018302917, + "step": 4943 + }, + { + "epoch": 2.472, + "grad_norm": 1.5878825353344308, + "learning_rate": 4.677177208415189e-06, + "loss": 0.2202, + "mean_token_accuracy": 0.9220407605171204, + "step": 4944 + }, + { + "epoch": 2.4725, + "grad_norm": 1.48493892621363, + "learning_rate": 4.676962712751015e-06, + "loss": 0.2125, + "mean_token_accuracy": 0.9200385212898254, + "step": 4945 + }, + { + "epoch": 2.473, + "grad_norm": 3.506595963440579, + "learning_rate": 4.676748150772764e-06, + "loss": 0.2945, + "mean_token_accuracy": 0.9022394418716431, + "step": 4946 + }, + { + "epoch": 2.4735, + "grad_norm": 3.565596376711891, + "learning_rate": 4.676533522486974e-06, + "loss": 0.2697, + "mean_token_accuracy": 0.902452826499939, + "step": 4947 + }, + { + "epoch": 2.474, + "grad_norm": 2.801309207928338, + "learning_rate": 4.676318827900181e-06, + "loss": 0.2453, + "mean_token_accuracy": 0.9233137965202332, + "step": 4948 + }, + { + "epoch": 2.4745, + "grad_norm": 2.081347920604051, + "learning_rate": 4.676104067018926e-06, + "loss": 0.2693, + "mean_token_accuracy": 0.9054573178291321, + "step": 4949 + }, + { + "epoch": 2.475, + "grad_norm": 2.225490409374777, + "learning_rate": 4.675889239849749e-06, + "loss": 0.2976, + "mean_token_accuracy": 0.8943560123443604, + "step": 4950 + }, + { + "epoch": 2.4755, + "grad_norm": 2.4900797862452952, + "learning_rate": 4.675674346399197e-06, + "loss": 0.2248, + "mean_token_accuracy": 0.9211476445198059, + "step": 4951 + }, + { + "epoch": 2.476, + "grad_norm": 2.097587339351456, + "learning_rate": 4.675459386673815e-06, + "loss": 0.2761, + "mean_token_accuracy": 0.903927743434906, + "step": 4952 + }, + { + "epoch": 2.4765, + "grad_norm": 2.340304458142853, + "learning_rate": 4.675244360680149e-06, + "loss": 0.3071, + "mean_token_accuracy": 0.9015172719955444, + "step": 4953 + }, + { + "epoch": 2.477, + "grad_norm": 2.934183386245873, + "learning_rate": 4.675029268424752e-06, + "loss": 0.3103, + "mean_token_accuracy": 0.8997274041175842, + "step": 4954 + }, + { + "epoch": 2.4775, + "grad_norm": 1.5481160160412746, + "learning_rate": 4.674814109914174e-06, + "loss": 0.2286, + "mean_token_accuracy": 0.9226293563842773, + "step": 4955 + }, + { + "epoch": 2.4779999999999998, + "grad_norm": 2.4218203443105306, + "learning_rate": 4.674598885154971e-06, + "loss": 0.2473, + "mean_token_accuracy": 0.9148787260055542, + "step": 4956 + }, + { + "epoch": 2.4785, + "grad_norm": 2.285780228138623, + "learning_rate": 4.674383594153698e-06, + "loss": 0.2725, + "mean_token_accuracy": 0.9034749269485474, + "step": 4957 + }, + { + "epoch": 2.479, + "grad_norm": 2.514363267474392, + "learning_rate": 4.674168236916912e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8952273726463318, + "step": 4958 + }, + { + "epoch": 2.4795, + "grad_norm": 6.133483449993162, + "learning_rate": 4.673952813451175e-06, + "loss": 0.2756, + "mean_token_accuracy": 0.9029108285903931, + "step": 4959 + }, + { + "epoch": 2.48, + "grad_norm": 1.5740447452749118, + "learning_rate": 4.673737323763048e-06, + "loss": 0.3038, + "mean_token_accuracy": 0.904080867767334, + "step": 4960 + }, + { + "epoch": 2.4805, + "grad_norm": 3.767552624258051, + "learning_rate": 4.673521767859096e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8885005116462708, + "step": 4961 + }, + { + "epoch": 2.481, + "grad_norm": 5.41184115503389, + "learning_rate": 4.673306145745885e-06, + "loss": 0.2495, + "mean_token_accuracy": 0.918049156665802, + "step": 4962 + }, + { + "epoch": 2.4815, + "grad_norm": 3.0392724457684954, + "learning_rate": 4.6730904574299825e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8894238471984863, + "step": 4963 + }, + { + "epoch": 2.482, + "grad_norm": 5.7495074826854236, + "learning_rate": 4.67287470291796e-06, + "loss": 0.2638, + "mean_token_accuracy": 0.9090747833251953, + "step": 4964 + }, + { + "epoch": 2.4825, + "grad_norm": 1.9713370379681843, + "learning_rate": 4.67265888221639e-06, + "loss": 0.2894, + "mean_token_accuracy": 0.9092393517494202, + "step": 4965 + }, + { + "epoch": 2.483, + "grad_norm": 2.1533615069330314, + "learning_rate": 4.672442995331844e-06, + "loss": 0.3177, + "mean_token_accuracy": 0.8985289335250854, + "step": 4966 + }, + { + "epoch": 2.4835, + "grad_norm": 1.765211020798257, + "learning_rate": 4.672227042270902e-06, + "loss": 0.2494, + "mean_token_accuracy": 0.9152106642723083, + "step": 4967 + }, + { + "epoch": 2.484, + "grad_norm": 2.5221544340770214, + "learning_rate": 4.6720110230401385e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8899046778678894, + "step": 4968 + }, + { + "epoch": 2.4845, + "grad_norm": 4.008546212036319, + "learning_rate": 4.671794937646137e-06, + "loss": 0.3162, + "mean_token_accuracy": 0.9031276106834412, + "step": 4969 + }, + { + "epoch": 2.485, + "grad_norm": 2.176708479706549, + "learning_rate": 4.671578786095479e-06, + "loss": 0.297, + "mean_token_accuracy": 0.8991008996963501, + "step": 4970 + }, + { + "epoch": 2.4855, + "grad_norm": 2.598693990559962, + "learning_rate": 4.6713625683947474e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8887171745300293, + "step": 4971 + }, + { + "epoch": 2.4859999999999998, + "grad_norm": 4.809905706709427, + "learning_rate": 4.6711462845505306e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.856144905090332, + "step": 4972 + }, + { + "epoch": 2.4865, + "grad_norm": 4.264740709259812, + "learning_rate": 4.670929934569416e-06, + "loss": 0.2962, + "mean_token_accuracy": 0.896128237247467, + "step": 4973 + }, + { + "epoch": 2.487, + "grad_norm": 2.087434492981373, + "learning_rate": 4.670713518457993e-06, + "loss": 0.3024, + "mean_token_accuracy": 0.8991920948028564, + "step": 4974 + }, + { + "epoch": 2.4875, + "grad_norm": 2.139272007096371, + "learning_rate": 4.670497036222856e-06, + "loss": 0.2411, + "mean_token_accuracy": 0.9210014939308167, + "step": 4975 + }, + { + "epoch": 2.488, + "grad_norm": 28.38370078984846, + "learning_rate": 4.670280487870599e-06, + "loss": 0.29, + "mean_token_accuracy": 0.9041422009468079, + "step": 4976 + }, + { + "epoch": 2.4885, + "grad_norm": 3.738722891743327, + "learning_rate": 4.670063873407816e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8847247362136841, + "step": 4977 + }, + { + "epoch": 2.489, + "grad_norm": 3.2125805367971725, + "learning_rate": 4.6698471928411095e-06, + "loss": 0.2728, + "mean_token_accuracy": 0.9077380895614624, + "step": 4978 + }, + { + "epoch": 2.4895, + "grad_norm": 1.9311283665481245, + "learning_rate": 4.669630446177077e-06, + "loss": 0.1907, + "mean_token_accuracy": 0.9348792433738708, + "step": 4979 + }, + { + "epoch": 2.49, + "grad_norm": 4.005683391743564, + "learning_rate": 4.669413633422322e-06, + "loss": 0.2028, + "mean_token_accuracy": 0.9288044571876526, + "step": 4980 + }, + { + "epoch": 2.4905, + "grad_norm": 2.6493646724025197, + "learning_rate": 4.669196754583448e-06, + "loss": 0.2852, + "mean_token_accuracy": 0.8997954726219177, + "step": 4981 + }, + { + "epoch": 2.491, + "grad_norm": 3.8847074407790334, + "learning_rate": 4.668979809667063e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.9098761081695557, + "step": 4982 + }, + { + "epoch": 2.4915, + "grad_norm": 2.0135570947129335, + "learning_rate": 4.6687627986797745e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8810136914253235, + "step": 4983 + }, + { + "epoch": 2.492, + "grad_norm": 1.8916033607783675, + "learning_rate": 4.668545721628194e-06, + "loss": 0.3076, + "mean_token_accuracy": 0.8914491534233093, + "step": 4984 + }, + { + "epoch": 2.4925, + "grad_norm": 6.142918870145186, + "learning_rate": 4.668328578518933e-06, + "loss": 0.2206, + "mean_token_accuracy": 0.9246708154678345, + "step": 4985 + }, + { + "epoch": 2.493, + "grad_norm": 4.377359848214053, + "learning_rate": 4.668111369358607e-06, + "loss": 0.2859, + "mean_token_accuracy": 0.9057832956314087, + "step": 4986 + }, + { + "epoch": 2.4935, + "grad_norm": 3.15271691923221, + "learning_rate": 4.667894094153831e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8675098419189453, + "step": 4987 + }, + { + "epoch": 2.4939999999999998, + "grad_norm": 2.8313205138064066, + "learning_rate": 4.667676752911225e-06, + "loss": 0.265, + "mean_token_accuracy": 0.9120196104049683, + "step": 4988 + }, + { + "epoch": 2.4945, + "grad_norm": 2.5714776503296193, + "learning_rate": 4.667459345637409e-06, + "loss": 0.2473, + "mean_token_accuracy": 0.9148188829421997, + "step": 4989 + }, + { + "epoch": 2.495, + "grad_norm": 2.0340899215218444, + "learning_rate": 4.667241872339007e-06, + "loss": 0.258, + "mean_token_accuracy": 0.9105349779129028, + "step": 4990 + }, + { + "epoch": 2.4955, + "grad_norm": 1.7114961383832132, + "learning_rate": 4.6670243330226425e-06, + "loss": 0.2703, + "mean_token_accuracy": 0.9022177457809448, + "step": 4991 + }, + { + "epoch": 2.496, + "grad_norm": 2.057191483556044, + "learning_rate": 4.666806727694942e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8943857550621033, + "step": 4992 + }, + { + "epoch": 2.4965, + "grad_norm": 1.7118828004033861, + "learning_rate": 4.666589056362532e-06, + "loss": 0.2941, + "mean_token_accuracy": 0.8930124044418335, + "step": 4993 + }, + { + "epoch": 2.497, + "grad_norm": 2.005516912192958, + "learning_rate": 4.666371319032047e-06, + "loss": 0.309, + "mean_token_accuracy": 0.8981900215148926, + "step": 4994 + }, + { + "epoch": 2.4975, + "grad_norm": 3.1244788267565227, + "learning_rate": 4.666153515710118e-06, + "loss": 0.5276, + "mean_token_accuracy": 0.8466903567314148, + "step": 4995 + }, + { + "epoch": 2.498, + "grad_norm": 2.28875998643293, + "learning_rate": 4.66593564640338e-06, + "loss": 0.2745, + "mean_token_accuracy": 0.9088974595069885, + "step": 4996 + }, + { + "epoch": 2.4985, + "grad_norm": 2.395226473081483, + "learning_rate": 4.665717711118469e-06, + "loss": 0.2672, + "mean_token_accuracy": 0.9153317809104919, + "step": 4997 + }, + { + "epoch": 2.499, + "grad_norm": 1.9814436519579282, + "learning_rate": 4.665499709862024e-06, + "loss": 0.187, + "mean_token_accuracy": 0.9369316101074219, + "step": 4998 + }, + { + "epoch": 2.4995, + "grad_norm": 2.2306885268218526, + "learning_rate": 4.665281642640686e-06, + "loss": 0.2317, + "mean_token_accuracy": 0.9259778261184692, + "step": 4999 + }, + { + "epoch": 2.5, + "grad_norm": 1.8263860385325532, + "learning_rate": 4.665063509461098e-06, + "loss": 0.224, + "mean_token_accuracy": 0.9250888228416443, + "step": 5000 + }, + { + "epoch": 2.5004999999999997, + "grad_norm": 2.6556030333458476, + "learning_rate": 4.6648453103299024e-06, + "loss": 0.2849, + "mean_token_accuracy": 0.9111297726631165, + "step": 5001 + }, + { + "epoch": 2.501, + "grad_norm": 2.4656773061434882, + "learning_rate": 4.664627045253749e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8883146643638611, + "step": 5002 + }, + { + "epoch": 2.5015, + "grad_norm": 3.9355167557684436, + "learning_rate": 4.664408714239285e-06, + "loss": 0.287, + "mean_token_accuracy": 0.9049553275108337, + "step": 5003 + }, + { + "epoch": 2.502, + "grad_norm": 1.9324585521567605, + "learning_rate": 4.664190317293161e-06, + "loss": 0.2726, + "mean_token_accuracy": 0.912958025932312, + "step": 5004 + }, + { + "epoch": 2.5025, + "grad_norm": 5.291398370999028, + "learning_rate": 4.66397185442203e-06, + "loss": 0.14, + "mean_token_accuracy": 0.9491710662841797, + "step": 5005 + }, + { + "epoch": 2.503, + "grad_norm": 3.1499545344390336, + "learning_rate": 4.663753325632548e-06, + "loss": 0.2704, + "mean_token_accuracy": 0.9059611558914185, + "step": 5006 + }, + { + "epoch": 2.5035, + "grad_norm": 2.056972625878822, + "learning_rate": 4.663534730931369e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.874080240726471, + "step": 5007 + }, + { + "epoch": 2.504, + "grad_norm": 2.2061461214132994, + "learning_rate": 4.6633160703251556e-06, + "loss": 0.2664, + "mean_token_accuracy": 0.9074960350990295, + "step": 5008 + }, + { + "epoch": 2.5045, + "grad_norm": 16.66853290486859, + "learning_rate": 4.663097343820565e-06, + "loss": 0.2663, + "mean_token_accuracy": 0.9066380858421326, + "step": 5009 + }, + { + "epoch": 2.505, + "grad_norm": 2.7460059594441004, + "learning_rate": 4.6628785514242615e-06, + "loss": 0.2591, + "mean_token_accuracy": 0.9083333611488342, + "step": 5010 + }, + { + "epoch": 2.5055, + "grad_norm": 2.3774901085033675, + "learning_rate": 4.66265969314291e-06, + "loss": 0.2277, + "mean_token_accuracy": 0.918367326259613, + "step": 5011 + }, + { + "epoch": 2.5060000000000002, + "grad_norm": 2.457122683470469, + "learning_rate": 4.6624407689831775e-06, + "loss": 0.2737, + "mean_token_accuracy": 0.9063876867294312, + "step": 5012 + }, + { + "epoch": 2.5065, + "grad_norm": 1.782599365935734, + "learning_rate": 4.662221778951731e-06, + "loss": 0.2493, + "mean_token_accuracy": 0.9074587821960449, + "step": 5013 + }, + { + "epoch": 2.507, + "grad_norm": 1.925408650323388, + "learning_rate": 4.662002723055245e-06, + "loss": 0.2835, + "mean_token_accuracy": 0.9083867073059082, + "step": 5014 + }, + { + "epoch": 2.5075, + "grad_norm": 2.182119096397143, + "learning_rate": 4.6617836013003885e-06, + "loss": 0.2476, + "mean_token_accuracy": 0.9198957681655884, + "step": 5015 + }, + { + "epoch": 2.508, + "grad_norm": 3.179064904573606, + "learning_rate": 4.661564413693838e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8930352926254272, + "step": 5016 + }, + { + "epoch": 2.5084999999999997, + "grad_norm": 3.50860759540509, + "learning_rate": 4.66134516024227e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8833447098731995, + "step": 5017 + }, + { + "epoch": 2.509, + "grad_norm": 2.417334057121449, + "learning_rate": 4.661125840952364e-06, + "loss": 0.2616, + "mean_token_accuracy": 0.916373610496521, + "step": 5018 + }, + { + "epoch": 2.5095, + "grad_norm": 2.374650653619366, + "learning_rate": 4.6609064558308e-06, + "loss": 0.2691, + "mean_token_accuracy": 0.9049151539802551, + "step": 5019 + }, + { + "epoch": 2.51, + "grad_norm": 1.9735053766271946, + "learning_rate": 4.6606870048842626e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.9068908095359802, + "step": 5020 + }, + { + "epoch": 2.5105, + "grad_norm": 2.2378310565667006, + "learning_rate": 4.660467488119434e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.889305055141449, + "step": 5021 + }, + { + "epoch": 2.511, + "grad_norm": 2.068584822712146, + "learning_rate": 4.660247905543003e-06, + "loss": 0.2317, + "mean_token_accuracy": 0.9253367781639099, + "step": 5022 + }, + { + "epoch": 2.5115, + "grad_norm": 1.7935171702982633, + "learning_rate": 4.660028257161658e-06, + "loss": 0.2612, + "mean_token_accuracy": 0.9149367213249207, + "step": 5023 + }, + { + "epoch": 2.512, + "grad_norm": 3.950367720723208, + "learning_rate": 4.659808542982089e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8852586150169373, + "step": 5024 + }, + { + "epoch": 2.5125, + "grad_norm": 1.9525137266952692, + "learning_rate": 4.65958876301099e-06, + "loss": 0.2914, + "mean_token_accuracy": 0.9019736051559448, + "step": 5025 + }, + { + "epoch": 2.513, + "grad_norm": 3.3951740916979514, + "learning_rate": 4.659368917255055e-06, + "loss": 0.275, + "mean_token_accuracy": 0.9017598032951355, + "step": 5026 + }, + { + "epoch": 2.5135, + "grad_norm": 2.239390608568218, + "learning_rate": 4.659149005720982e-06, + "loss": 0.3033, + "mean_token_accuracy": 0.9040640592575073, + "step": 5027 + }, + { + "epoch": 2.5140000000000002, + "grad_norm": 2.361556644877964, + "learning_rate": 4.658929028415469e-06, + "loss": 0.2712, + "mean_token_accuracy": 0.9027576446533203, + "step": 5028 + }, + { + "epoch": 2.5145, + "grad_norm": 4.407262623844562, + "learning_rate": 4.6587089853452174e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8860676288604736, + "step": 5029 + }, + { + "epoch": 2.515, + "grad_norm": 4.134347898972641, + "learning_rate": 4.658488876516929e-06, + "loss": 0.2587, + "mean_token_accuracy": 0.9080129861831665, + "step": 5030 + }, + { + "epoch": 2.5155, + "grad_norm": 2.104872560148671, + "learning_rate": 4.65826870193731e-06, + "loss": 0.2785, + "mean_token_accuracy": 0.908613920211792, + "step": 5031 + }, + { + "epoch": 2.516, + "grad_norm": 39.38056506281216, + "learning_rate": 4.658048461613068e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8672608137130737, + "step": 5032 + }, + { + "epoch": 2.5164999999999997, + "grad_norm": 6.491735163746569, + "learning_rate": 4.65782815555091e-06, + "loss": 0.2913, + "mean_token_accuracy": 0.9061105251312256, + "step": 5033 + }, + { + "epoch": 2.517, + "grad_norm": 2.1668479887711234, + "learning_rate": 4.657607783757547e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8916427493095398, + "step": 5034 + }, + { + "epoch": 2.5175, + "grad_norm": 2.172612362902219, + "learning_rate": 4.6573873462396935e-06, + "loss": 0.2344, + "mean_token_accuracy": 0.9210843443870544, + "step": 5035 + }, + { + "epoch": 2.518, + "grad_norm": 2.246153753753887, + "learning_rate": 4.6571668430040625e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8949850797653198, + "step": 5036 + }, + { + "epoch": 2.5185, + "grad_norm": 2.610193810359865, + "learning_rate": 4.656946274057373e-06, + "loss": 0.2329, + "mean_token_accuracy": 0.91478031873703, + "step": 5037 + }, + { + "epoch": 2.519, + "grad_norm": 2.8640014088218644, + "learning_rate": 4.656725639406342e-06, + "loss": 0.2922, + "mean_token_accuracy": 0.9117437601089478, + "step": 5038 + }, + { + "epoch": 2.5195, + "grad_norm": 2.523771403480075, + "learning_rate": 4.656504939057691e-06, + "loss": 0.2698, + "mean_token_accuracy": 0.9068541526794434, + "step": 5039 + }, + { + "epoch": 2.52, + "grad_norm": 1.9704114031569886, + "learning_rate": 4.656284173018144e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8842496871948242, + "step": 5040 + }, + { + "epoch": 2.5205, + "grad_norm": 2.600162046658263, + "learning_rate": 4.6560633412944245e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.8976436853408813, + "step": 5041 + }, + { + "epoch": 2.521, + "grad_norm": 1.9839697732248662, + "learning_rate": 4.65584244389326e-06, + "loss": 0.2856, + "mean_token_accuracy": 0.9013351798057556, + "step": 5042 + }, + { + "epoch": 2.5215, + "grad_norm": 2.991817150478061, + "learning_rate": 4.65562148082138e-06, + "loss": 0.2428, + "mean_token_accuracy": 0.9171752333641052, + "step": 5043 + }, + { + "epoch": 2.5220000000000002, + "grad_norm": 2.178513901884669, + "learning_rate": 4.655400452085515e-06, + "loss": 0.265, + "mean_token_accuracy": 0.9057618975639343, + "step": 5044 + }, + { + "epoch": 2.5225, + "grad_norm": 4.836092944044894, + "learning_rate": 4.655179357692396e-06, + "loss": 0.2523, + "mean_token_accuracy": 0.9097269773483276, + "step": 5045 + }, + { + "epoch": 2.523, + "grad_norm": 2.551269184076867, + "learning_rate": 4.654958197648761e-06, + "loss": 0.3148, + "mean_token_accuracy": 0.9059081077575684, + "step": 5046 + }, + { + "epoch": 2.5235, + "grad_norm": 7.666887707886289, + "learning_rate": 4.654736971961345e-06, + "loss": 0.2231, + "mean_token_accuracy": 0.9221156239509583, + "step": 5047 + }, + { + "epoch": 2.524, + "grad_norm": 2.453888186793675, + "learning_rate": 4.654515680636888e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8934506177902222, + "step": 5048 + }, + { + "epoch": 2.5244999999999997, + "grad_norm": 1.510167820752988, + "learning_rate": 4.65429432368213e-06, + "loss": 0.1769, + "mean_token_accuracy": 0.9309571981430054, + "step": 5049 + }, + { + "epoch": 2.525, + "grad_norm": 2.2165204846255695, + "learning_rate": 4.654072901103815e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8930565714836121, + "step": 5050 + }, + { + "epoch": 2.5255, + "grad_norm": 2.792821611017465, + "learning_rate": 4.653851412908687e-06, + "loss": 0.2791, + "mean_token_accuracy": 0.9071877002716064, + "step": 5051 + }, + { + "epoch": 2.526, + "grad_norm": 1.6658424704428874, + "learning_rate": 4.653629859103492e-06, + "loss": 0.1865, + "mean_token_accuracy": 0.9351456165313721, + "step": 5052 + }, + { + "epoch": 2.5265, + "grad_norm": 2.5697812402369644, + "learning_rate": 4.653408239694982e-06, + "loss": 0.2155, + "mean_token_accuracy": 0.9328681826591492, + "step": 5053 + }, + { + "epoch": 2.527, + "grad_norm": 2.195690622027853, + "learning_rate": 4.653186554689905e-06, + "loss": 0.275, + "mean_token_accuracy": 0.9110326766967773, + "step": 5054 + }, + { + "epoch": 2.5275, + "grad_norm": 1.831533811105499, + "learning_rate": 4.652964804095015e-06, + "loss": 0.2787, + "mean_token_accuracy": 0.9050841927528381, + "step": 5055 + }, + { + "epoch": 2.528, + "grad_norm": 2.009381002548852, + "learning_rate": 4.652742987917066e-06, + "loss": 0.2904, + "mean_token_accuracy": 0.8961827158927917, + "step": 5056 + }, + { + "epoch": 2.5285, + "grad_norm": 1.6016022971592274, + "learning_rate": 4.652521106162817e-06, + "loss": 0.2543, + "mean_token_accuracy": 0.9077281355857849, + "step": 5057 + }, + { + "epoch": 2.529, + "grad_norm": 4.990698000435031, + "learning_rate": 4.652299158839025e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8583897948265076, + "step": 5058 + }, + { + "epoch": 2.5295, + "grad_norm": 4.0717773484412465, + "learning_rate": 4.652077145952452e-06, + "loss": 0.2278, + "mean_token_accuracy": 0.9158385992050171, + "step": 5059 + }, + { + "epoch": 2.5300000000000002, + "grad_norm": 2.1684238741021424, + "learning_rate": 4.65185506750986e-06, + "loss": 0.2714, + "mean_token_accuracy": 0.9028602242469788, + "step": 5060 + }, + { + "epoch": 2.5305, + "grad_norm": 1.509418838806606, + "learning_rate": 4.651632923518014e-06, + "loss": 0.2428, + "mean_token_accuracy": 0.9099748134613037, + "step": 5061 + }, + { + "epoch": 2.531, + "grad_norm": 7.674223318524449, + "learning_rate": 4.651410713983682e-06, + "loss": 0.2782, + "mean_token_accuracy": 0.9034934043884277, + "step": 5062 + }, + { + "epoch": 2.5315, + "grad_norm": 2.0501441531497986, + "learning_rate": 4.651188438913631e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8929005265235901, + "step": 5063 + }, + { + "epoch": 2.532, + "grad_norm": 6.615599541309787, + "learning_rate": 4.6509660983146334e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8968824744224548, + "step": 5064 + }, + { + "epoch": 2.5324999999999998, + "grad_norm": 2.0931660712775613, + "learning_rate": 4.650743692193462e-06, + "loss": 0.2671, + "mean_token_accuracy": 0.9110549092292786, + "step": 5065 + }, + { + "epoch": 2.533, + "grad_norm": 4.074998953173289, + "learning_rate": 4.650521220556892e-06, + "loss": 0.2414, + "mean_token_accuracy": 0.9148619771003723, + "step": 5066 + }, + { + "epoch": 2.5335, + "grad_norm": 3.1280167318391596, + "learning_rate": 4.650298683411698e-06, + "loss": 0.2793, + "mean_token_accuracy": 0.9072639346122742, + "step": 5067 + }, + { + "epoch": 2.534, + "grad_norm": 1.4933327034935617, + "learning_rate": 4.650076080764663e-06, + "loss": 0.2565, + "mean_token_accuracy": 0.9098243117332458, + "step": 5068 + }, + { + "epoch": 2.5345, + "grad_norm": 2.393133398397592, + "learning_rate": 4.6498534126225634e-06, + "loss": 0.2258, + "mean_token_accuracy": 0.9303653240203857, + "step": 5069 + }, + { + "epoch": 2.535, + "grad_norm": 1.5080574821134536, + "learning_rate": 4.649630678992184e-06, + "loss": 0.1676, + "mean_token_accuracy": 0.9360033869743347, + "step": 5070 + }, + { + "epoch": 2.5355, + "grad_norm": 2.9221565740262045, + "learning_rate": 4.64940787988031e-06, + "loss": 0.319, + "mean_token_accuracy": 0.887958288192749, + "step": 5071 + }, + { + "epoch": 2.536, + "grad_norm": 3.1880421041321982, + "learning_rate": 4.649185015293728e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.8982667922973633, + "step": 5072 + }, + { + "epoch": 2.5365, + "grad_norm": 3.171995916965701, + "learning_rate": 4.648962085239227e-06, + "loss": 0.3077, + "mean_token_accuracy": 0.8908573389053345, + "step": 5073 + }, + { + "epoch": 2.537, + "grad_norm": 2.32111456628043, + "learning_rate": 4.648739089723597e-06, + "loss": 0.2847, + "mean_token_accuracy": 0.9068087935447693, + "step": 5074 + }, + { + "epoch": 2.5375, + "grad_norm": 2.131041538392155, + "learning_rate": 4.648516028753632e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8926529288291931, + "step": 5075 + }, + { + "epoch": 2.5380000000000003, + "grad_norm": 4.290966481837349, + "learning_rate": 4.648292902336126e-06, + "loss": 0.2706, + "mean_token_accuracy": 0.9056418538093567, + "step": 5076 + }, + { + "epoch": 2.5385, + "grad_norm": 2.1355768826049344, + "learning_rate": 4.648069710477876e-06, + "loss": 0.2496, + "mean_token_accuracy": 0.918367326259613, + "step": 5077 + }, + { + "epoch": 2.539, + "grad_norm": 1.6639720050127669, + "learning_rate": 4.647846453185681e-06, + "loss": 0.2824, + "mean_token_accuracy": 0.9035968780517578, + "step": 5078 + }, + { + "epoch": 2.5395, + "grad_norm": 2.3279885295865195, + "learning_rate": 4.6476231304663425e-06, + "loss": 0.2835, + "mean_token_accuracy": 0.9117434024810791, + "step": 5079 + }, + { + "epoch": 2.54, + "grad_norm": 2.1737058222888943, + "learning_rate": 4.6473997423266615e-06, + "loss": 0.2724, + "mean_token_accuracy": 0.9059856534004211, + "step": 5080 + }, + { + "epoch": 2.5404999999999998, + "grad_norm": 2.6829022521339487, + "learning_rate": 4.647176288773444e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8998399972915649, + "step": 5081 + }, + { + "epoch": 2.541, + "grad_norm": 2.2061899375688783, + "learning_rate": 4.646952769813496e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.892353355884552, + "step": 5082 + }, + { + "epoch": 2.5415, + "grad_norm": 25.04816252140449, + "learning_rate": 4.646729185453628e-06, + "loss": 0.2857, + "mean_token_accuracy": 0.9155182838439941, + "step": 5083 + }, + { + "epoch": 2.542, + "grad_norm": 1.6044016665614462, + "learning_rate": 4.646505535700649e-06, + "loss": 0.2347, + "mean_token_accuracy": 0.919636070728302, + "step": 5084 + }, + { + "epoch": 2.5425, + "grad_norm": 24.97406776224343, + "learning_rate": 4.646281820561372e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8986692428588867, + "step": 5085 + }, + { + "epoch": 2.543, + "grad_norm": 3.016841793557329, + "learning_rate": 4.646058040042613e-06, + "loss": 0.307, + "mean_token_accuracy": 0.8968304991722107, + "step": 5086 + }, + { + "epoch": 2.5435, + "grad_norm": 2.0266945871903195, + "learning_rate": 4.6458341941511876e-06, + "loss": 0.2643, + "mean_token_accuracy": 0.9065865874290466, + "step": 5087 + }, + { + "epoch": 2.544, + "grad_norm": 2.1925349035257513, + "learning_rate": 4.645610282893914e-06, + "loss": 0.2706, + "mean_token_accuracy": 0.9075362086296082, + "step": 5088 + }, + { + "epoch": 2.5445, + "grad_norm": 1.8215820379646928, + "learning_rate": 4.645386306277615e-06, + "loss": 0.2634, + "mean_token_accuracy": 0.9131333827972412, + "step": 5089 + }, + { + "epoch": 2.545, + "grad_norm": 2.6351220456757485, + "learning_rate": 4.645162264309112e-06, + "loss": 0.296, + "mean_token_accuracy": 0.9014654755592346, + "step": 5090 + }, + { + "epoch": 2.5455, + "grad_norm": 3.9876306168197124, + "learning_rate": 4.644938156995229e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8964972496032715, + "step": 5091 + }, + { + "epoch": 2.5460000000000003, + "grad_norm": 4.276878714242053, + "learning_rate": 4.644713984342794e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8861633539199829, + "step": 5092 + }, + { + "epoch": 2.5465, + "grad_norm": 3.6634807118488473, + "learning_rate": 4.644489746358635e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8951982259750366, + "step": 5093 + }, + { + "epoch": 2.547, + "grad_norm": 8.171963919606457, + "learning_rate": 4.644265443049583e-06, + "loss": 0.295, + "mean_token_accuracy": 0.8891451358795166, + "step": 5094 + }, + { + "epoch": 2.5475, + "grad_norm": 1.7255680885563949, + "learning_rate": 4.644041074422469e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8963183760643005, + "step": 5095 + }, + { + "epoch": 2.548, + "grad_norm": 2.391478649562944, + "learning_rate": 4.6438166404841316e-06, + "loss": 0.2561, + "mean_token_accuracy": 0.9143415689468384, + "step": 5096 + }, + { + "epoch": 2.5484999999999998, + "grad_norm": 4.765818282475695, + "learning_rate": 4.643592141241403e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8990269899368286, + "step": 5097 + }, + { + "epoch": 2.549, + "grad_norm": 3.655600633005676, + "learning_rate": 4.643367576701125e-06, + "loss": 0.2881, + "mean_token_accuracy": 0.9026178121566772, + "step": 5098 + }, + { + "epoch": 2.5495, + "grad_norm": 1.9266410923242765, + "learning_rate": 4.643142946870137e-06, + "loss": 0.2313, + "mean_token_accuracy": 0.9176336526870728, + "step": 5099 + }, + { + "epoch": 2.55, + "grad_norm": 1.740011195349186, + "learning_rate": 4.642918251755281e-06, + "loss": 0.2627, + "mean_token_accuracy": 0.9072869420051575, + "step": 5100 + }, + { + "epoch": 2.5505, + "grad_norm": 3.384226207759655, + "learning_rate": 4.642693491363402e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.893386721611023, + "step": 5101 + }, + { + "epoch": 2.551, + "grad_norm": 1.9016322827750647, + "learning_rate": 4.6424686657013485e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.8947929739952087, + "step": 5102 + }, + { + "epoch": 2.5515, + "grad_norm": 2.4773124291705915, + "learning_rate": 4.642243774775966e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8753246665000916, + "step": 5103 + }, + { + "epoch": 2.552, + "grad_norm": 3.8146545270896617, + "learning_rate": 4.642018818594107e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8865525722503662, + "step": 5104 + }, + { + "epoch": 2.5525, + "grad_norm": 2.295009070917926, + "learning_rate": 4.641793797162625e-06, + "loss": 0.2736, + "mean_token_accuracy": 0.9077915549278259, + "step": 5105 + }, + { + "epoch": 2.553, + "grad_norm": 2.3905609999079194, + "learning_rate": 4.641568710488371e-06, + "loss": 0.2931, + "mean_token_accuracy": 0.9068837761878967, + "step": 5106 + }, + { + "epoch": 2.5535, + "grad_norm": 8.384954058858906, + "learning_rate": 4.641343558578205e-06, + "loss": 0.2948, + "mean_token_accuracy": 0.8960180878639221, + "step": 5107 + }, + { + "epoch": 2.5540000000000003, + "grad_norm": 3.5231020151892967, + "learning_rate": 4.641118341438984e-06, + "loss": 0.2616, + "mean_token_accuracy": 0.9153059720993042, + "step": 5108 + }, + { + "epoch": 2.5545, + "grad_norm": 1.8870879734023907, + "learning_rate": 4.640893059077568e-06, + "loss": 0.3064, + "mean_token_accuracy": 0.903661847114563, + "step": 5109 + }, + { + "epoch": 2.555, + "grad_norm": 3.4048740096875694, + "learning_rate": 4.640667711500821e-06, + "loss": 0.2897, + "mean_token_accuracy": 0.8925769329071045, + "step": 5110 + }, + { + "epoch": 2.5555, + "grad_norm": 2.4853218754810644, + "learning_rate": 4.640442298715606e-06, + "loss": 0.275, + "mean_token_accuracy": 0.9070714712142944, + "step": 5111 + }, + { + "epoch": 2.556, + "grad_norm": 1.9205933083378384, + "learning_rate": 4.640216820728791e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.8939456343650818, + "step": 5112 + }, + { + "epoch": 2.5564999999999998, + "grad_norm": 2.010516033941566, + "learning_rate": 4.639991277547243e-06, + "loss": 0.2301, + "mean_token_accuracy": 0.912331223487854, + "step": 5113 + }, + { + "epoch": 2.557, + "grad_norm": 2.7618928994606233, + "learning_rate": 4.639765669177833e-06, + "loss": 0.3062, + "mean_token_accuracy": 0.904964029788971, + "step": 5114 + }, + { + "epoch": 2.5575, + "grad_norm": 3.411314121402341, + "learning_rate": 4.6395399956274334e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8896114230155945, + "step": 5115 + }, + { + "epoch": 2.558, + "grad_norm": 2.462197962656427, + "learning_rate": 4.639314256902919e-06, + "loss": 0.2562, + "mean_token_accuracy": 0.9165095090866089, + "step": 5116 + }, + { + "epoch": 2.5585, + "grad_norm": 2.983719703185102, + "learning_rate": 4.639088453011166e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8689442873001099, + "step": 5117 + }, + { + "epoch": 2.559, + "grad_norm": 9.793767749360082, + "learning_rate": 4.6388625839590514e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.9006129503250122, + "step": 5118 + }, + { + "epoch": 2.5595, + "grad_norm": 2.5894317389162733, + "learning_rate": 4.638636649753459e-06, + "loss": 0.3014, + "mean_token_accuracy": 0.8988333344459534, + "step": 5119 + }, + { + "epoch": 2.56, + "grad_norm": 1.9682857526277469, + "learning_rate": 4.638410650401267e-06, + "loss": 0.2576, + "mean_token_accuracy": 0.9133121371269226, + "step": 5120 + }, + { + "epoch": 2.5605, + "grad_norm": 6.717799485914512, + "learning_rate": 4.638184585909362e-06, + "loss": 0.2309, + "mean_token_accuracy": 0.9159315228462219, + "step": 5121 + }, + { + "epoch": 2.561, + "grad_norm": 4.641408340442514, + "learning_rate": 4.6379584562846306e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8943870067596436, + "step": 5122 + }, + { + "epoch": 2.5615, + "grad_norm": 2.83643931672427, + "learning_rate": 4.637732261533961e-06, + "loss": 0.2035, + "mean_token_accuracy": 0.9270710349082947, + "step": 5123 + }, + { + "epoch": 2.5620000000000003, + "grad_norm": 2.782280270571561, + "learning_rate": 4.637506001664242e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8786759972572327, + "step": 5124 + }, + { + "epoch": 2.5625, + "grad_norm": 1.7758795754680778, + "learning_rate": 4.637279676682367e-06, + "loss": 0.2152, + "mean_token_accuracy": 0.9210150241851807, + "step": 5125 + }, + { + "epoch": 2.5629999999999997, + "grad_norm": 2.9715809013989607, + "learning_rate": 4.63705328659523e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8833643794059753, + "step": 5126 + }, + { + "epoch": 2.5635, + "grad_norm": 3.001389438760625, + "learning_rate": 4.6368268314097275e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8775035738945007, + "step": 5127 + }, + { + "epoch": 2.564, + "grad_norm": 2.67758343655623, + "learning_rate": 4.636600311132758e-06, + "loss": 0.2964, + "mean_token_accuracy": 0.9091971516609192, + "step": 5128 + }, + { + "epoch": 2.5645, + "grad_norm": 2.6997473135990377, + "learning_rate": 4.636373725771221e-06, + "loss": 0.2497, + "mean_token_accuracy": 0.9144877195358276, + "step": 5129 + }, + { + "epoch": 2.565, + "grad_norm": 13.047775116501038, + "learning_rate": 4.636147075332019e-06, + "loss": 0.2831, + "mean_token_accuracy": 0.9019189476966858, + "step": 5130 + }, + { + "epoch": 2.5655, + "grad_norm": 1.7847988027489619, + "learning_rate": 4.635920359822056e-06, + "loss": 0.2616, + "mean_token_accuracy": 0.9036238789558411, + "step": 5131 + }, + { + "epoch": 2.566, + "grad_norm": 2.6689243152300994, + "learning_rate": 4.635693579248238e-06, + "loss": 0.2496, + "mean_token_accuracy": 0.9174041152000427, + "step": 5132 + }, + { + "epoch": 2.5665, + "grad_norm": 1.7349815636405983, + "learning_rate": 4.635466733617474e-06, + "loss": 0.2374, + "mean_token_accuracy": 0.9159135818481445, + "step": 5133 + }, + { + "epoch": 2.567, + "grad_norm": 2.704349144065447, + "learning_rate": 4.6352398229366735e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8964567184448242, + "step": 5134 + }, + { + "epoch": 2.5675, + "grad_norm": 2.089909519123466, + "learning_rate": 4.635012847212749e-06, + "loss": 0.292, + "mean_token_accuracy": 0.9015184640884399, + "step": 5135 + }, + { + "epoch": 2.568, + "grad_norm": 3.817194112060425, + "learning_rate": 4.634785806452613e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8868135213851929, + "step": 5136 + }, + { + "epoch": 2.5685000000000002, + "grad_norm": 2.462171668978704, + "learning_rate": 4.634558700663183e-06, + "loss": 0.2736, + "mean_token_accuracy": 0.9144981503486633, + "step": 5137 + }, + { + "epoch": 2.569, + "grad_norm": 3.284186270102285, + "learning_rate": 4.634331529851377e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8818826079368591, + "step": 5138 + }, + { + "epoch": 2.5695, + "grad_norm": 2.5525285560539444, + "learning_rate": 4.634104294024116e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8956356644630432, + "step": 5139 + }, + { + "epoch": 2.57, + "grad_norm": 3.5005421734986797, + "learning_rate": 4.633876993188319e-06, + "loss": 0.2387, + "mean_token_accuracy": 0.9140350818634033, + "step": 5140 + }, + { + "epoch": 2.5705, + "grad_norm": 10.260981815271451, + "learning_rate": 4.633649627350913e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8929709196090698, + "step": 5141 + }, + { + "epoch": 2.5709999999999997, + "grad_norm": 1.6689344629032234, + "learning_rate": 4.633422196518822e-06, + "loss": 0.2095, + "mean_token_accuracy": 0.9306177496910095, + "step": 5142 + }, + { + "epoch": 2.5715, + "grad_norm": 1.9802122406253566, + "learning_rate": 4.633194700698975e-06, + "loss": 0.3196, + "mean_token_accuracy": 0.8985090851783752, + "step": 5143 + }, + { + "epoch": 2.572, + "grad_norm": 1.6902560485130427, + "learning_rate": 4.632967139898301e-06, + "loss": 0.248, + "mean_token_accuracy": 0.9168204069137573, + "step": 5144 + }, + { + "epoch": 2.5725, + "grad_norm": 1.9496591312990434, + "learning_rate": 4.632739514123733e-06, + "loss": 0.2817, + "mean_token_accuracy": 0.9066312909126282, + "step": 5145 + }, + { + "epoch": 2.573, + "grad_norm": 3.138263654441983, + "learning_rate": 4.6325118233822045e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8793622255325317, + "step": 5146 + }, + { + "epoch": 2.5735, + "grad_norm": 2.1722060893947215, + "learning_rate": 4.6322840676806515e-06, + "loss": 0.2927, + "mean_token_accuracy": 0.9030620455741882, + "step": 5147 + }, + { + "epoch": 2.574, + "grad_norm": 5.263674475416548, + "learning_rate": 4.632056247026011e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8868913650512695, + "step": 5148 + }, + { + "epoch": 2.5745, + "grad_norm": 2.0824437375556846, + "learning_rate": 4.631828361425223e-06, + "loss": 0.2558, + "mean_token_accuracy": 0.9112720489501953, + "step": 5149 + }, + { + "epoch": 2.575, + "grad_norm": 2.2838912340851714, + "learning_rate": 4.631600410885231e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.876049816608429, + "step": 5150 + }, + { + "epoch": 2.5755, + "grad_norm": 3.8392114980292256, + "learning_rate": 4.631372395412976e-06, + "loss": 0.2696, + "mean_token_accuracy": 0.9112364649772644, + "step": 5151 + }, + { + "epoch": 2.576, + "grad_norm": 5.019205237145182, + "learning_rate": 4.631144315015407e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8786950707435608, + "step": 5152 + }, + { + "epoch": 2.5765000000000002, + "grad_norm": 2.7062591575637054, + "learning_rate": 4.6309161696994685e-06, + "loss": 0.2454, + "mean_token_accuracy": 0.9155920147895813, + "step": 5153 + }, + { + "epoch": 2.577, + "grad_norm": 3.072056549575008, + "learning_rate": 4.630687959472112e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8777646422386169, + "step": 5154 + }, + { + "epoch": 2.5775, + "grad_norm": 4.617914635453807, + "learning_rate": 4.6304596843402885e-06, + "loss": 0.3085, + "mean_token_accuracy": 0.8986400961875916, + "step": 5155 + }, + { + "epoch": 2.578, + "grad_norm": 1.923475551664664, + "learning_rate": 4.630231344310953e-06, + "loss": 0.2083, + "mean_token_accuracy": 0.9248834848403931, + "step": 5156 + }, + { + "epoch": 2.5785, + "grad_norm": 1.778392638686009, + "learning_rate": 4.6300029393910585e-06, + "loss": 0.265, + "mean_token_accuracy": 0.910586953163147, + "step": 5157 + }, + { + "epoch": 2.5789999999999997, + "grad_norm": 2.1162626390216324, + "learning_rate": 4.629774469587565e-06, + "loss": 0.2764, + "mean_token_accuracy": 0.9059298038482666, + "step": 5158 + }, + { + "epoch": 2.5795, + "grad_norm": 3.043621932243991, + "learning_rate": 4.629545934907432e-06, + "loss": 0.2932, + "mean_token_accuracy": 0.9063937664031982, + "step": 5159 + }, + { + "epoch": 2.58, + "grad_norm": 1.6055127859699971, + "learning_rate": 4.62931733535762e-06, + "loss": 0.2657, + "mean_token_accuracy": 0.906321108341217, + "step": 5160 + }, + { + "epoch": 2.5805, + "grad_norm": 1.9110414895175978, + "learning_rate": 4.629088670945092e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.9001355767250061, + "step": 5161 + }, + { + "epoch": 2.581, + "grad_norm": 2.410667601641572, + "learning_rate": 4.628859941676815e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8687035441398621, + "step": 5162 + }, + { + "epoch": 2.5815, + "grad_norm": 1.695875902799074, + "learning_rate": 4.628631147559756e-06, + "loss": 0.1744, + "mean_token_accuracy": 0.9346947073936462, + "step": 5163 + }, + { + "epoch": 2.582, + "grad_norm": 2.992030867510501, + "learning_rate": 4.628402288600884e-06, + "loss": 0.2864, + "mean_token_accuracy": 0.9139683246612549, + "step": 5164 + }, + { + "epoch": 2.5825, + "grad_norm": 2.0473339048002255, + "learning_rate": 4.628173364807171e-06, + "loss": 0.2825, + "mean_token_accuracy": 0.9000667333602905, + "step": 5165 + }, + { + "epoch": 2.583, + "grad_norm": 1.9437642109749853, + "learning_rate": 4.627944376185591e-06, + "loss": 0.2612, + "mean_token_accuracy": 0.9135435223579407, + "step": 5166 + }, + { + "epoch": 2.5835, + "grad_norm": 2.491466754817301, + "learning_rate": 4.627715322743118e-06, + "loss": 0.274, + "mean_token_accuracy": 0.9055627584457397, + "step": 5167 + }, + { + "epoch": 2.584, + "grad_norm": 1.796945894949128, + "learning_rate": 4.62748620448673e-06, + "loss": 0.2545, + "mean_token_accuracy": 0.9148468375205994, + "step": 5168 + }, + { + "epoch": 2.5845000000000002, + "grad_norm": 25.693239071484992, + "learning_rate": 4.627257021423407e-06, + "loss": 0.3044, + "mean_token_accuracy": 0.9018433094024658, + "step": 5169 + }, + { + "epoch": 2.585, + "grad_norm": 2.578609683873792, + "learning_rate": 4.627027773560129e-06, + "loss": 0.2835, + "mean_token_accuracy": 0.9049826264381409, + "step": 5170 + }, + { + "epoch": 2.5855, + "grad_norm": 1.681263328588265, + "learning_rate": 4.626798460903879e-06, + "loss": 0.2654, + "mean_token_accuracy": 0.9109526872634888, + "step": 5171 + }, + { + "epoch": 2.586, + "grad_norm": 2.216601222085904, + "learning_rate": 4.626569083461645e-06, + "loss": 0.2456, + "mean_token_accuracy": 0.9233794808387756, + "step": 5172 + }, + { + "epoch": 2.5865, + "grad_norm": 3.3962842937085704, + "learning_rate": 4.626339641240412e-06, + "loss": 0.3213, + "mean_token_accuracy": 0.8903194069862366, + "step": 5173 + }, + { + "epoch": 2.5869999999999997, + "grad_norm": 1.9738382750215484, + "learning_rate": 4.626110134247168e-06, + "loss": 0.2882, + "mean_token_accuracy": 0.8979730010032654, + "step": 5174 + }, + { + "epoch": 2.5875, + "grad_norm": 4.549020264008401, + "learning_rate": 4.625880562488908e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8839444518089294, + "step": 5175 + }, + { + "epoch": 2.588, + "grad_norm": 1.7743434260653752, + "learning_rate": 4.625650925972622e-06, + "loss": 0.2622, + "mean_token_accuracy": 0.9219526648521423, + "step": 5176 + }, + { + "epoch": 2.5885, + "grad_norm": 3.6390791435601737, + "learning_rate": 4.625421224705306e-06, + "loss": 0.2941, + "mean_token_accuracy": 0.9059135317802429, + "step": 5177 + }, + { + "epoch": 2.589, + "grad_norm": 1.7464830444709059, + "learning_rate": 4.6251914586939575e-06, + "loss": 0.2738, + "mean_token_accuracy": 0.8936068415641785, + "step": 5178 + }, + { + "epoch": 2.5895, + "grad_norm": 1.9488966900225608, + "learning_rate": 4.624961627945575e-06, + "loss": 0.2202, + "mean_token_accuracy": 0.9206176400184631, + "step": 5179 + }, + { + "epoch": 2.59, + "grad_norm": 3.2857497900164025, + "learning_rate": 4.62473173246716e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8923745155334473, + "step": 5180 + }, + { + "epoch": 2.5905, + "grad_norm": 2.376873093167991, + "learning_rate": 4.624501772265716e-06, + "loss": 0.2799, + "mean_token_accuracy": 0.9083586931228638, + "step": 5181 + }, + { + "epoch": 2.591, + "grad_norm": 1.9044442252876097, + "learning_rate": 4.624271747348247e-06, + "loss": 0.2854, + "mean_token_accuracy": 0.9004203081130981, + "step": 5182 + }, + { + "epoch": 2.5915, + "grad_norm": 3.2460977599702954, + "learning_rate": 4.624041657721759e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.8971846103668213, + "step": 5183 + }, + { + "epoch": 2.592, + "grad_norm": 1.830130770110916, + "learning_rate": 4.623811503393264e-06, + "loss": 0.3061, + "mean_token_accuracy": 0.8959624767303467, + "step": 5184 + }, + { + "epoch": 2.5925000000000002, + "grad_norm": 1.8005823543345072, + "learning_rate": 4.62358128436977e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8990918397903442, + "step": 5185 + }, + { + "epoch": 2.593, + "grad_norm": 2.1980438505394777, + "learning_rate": 4.623351000658292e-06, + "loss": 0.2886, + "mean_token_accuracy": 0.9045147895812988, + "step": 5186 + }, + { + "epoch": 2.5935, + "grad_norm": 1.701799311542059, + "learning_rate": 4.623120652265844e-06, + "loss": 0.2487, + "mean_token_accuracy": 0.918630838394165, + "step": 5187 + }, + { + "epoch": 2.594, + "grad_norm": 2.1141134795516967, + "learning_rate": 4.622890239199442e-06, + "loss": 0.2476, + "mean_token_accuracy": 0.9144267439842224, + "step": 5188 + }, + { + "epoch": 2.5945, + "grad_norm": 2.0279506427538503, + "learning_rate": 4.622659761466104e-06, + "loss": 0.2374, + "mean_token_accuracy": 0.9227136969566345, + "step": 5189 + }, + { + "epoch": 2.5949999999999998, + "grad_norm": 2.5990158694355165, + "learning_rate": 4.622429219072854e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8817075490951538, + "step": 5190 + }, + { + "epoch": 2.5955, + "grad_norm": 2.3413616896424596, + "learning_rate": 4.622198612026713e-06, + "loss": 0.277, + "mean_token_accuracy": 0.9039658904075623, + "step": 5191 + }, + { + "epoch": 2.596, + "grad_norm": 2.0659146998943823, + "learning_rate": 4.621967940334705e-06, + "loss": 0.2447, + "mean_token_accuracy": 0.9115406274795532, + "step": 5192 + }, + { + "epoch": 2.5965, + "grad_norm": 2.731582013591699, + "learning_rate": 4.621737204003857e-06, + "loss": 0.2642, + "mean_token_accuracy": 0.9153275489807129, + "step": 5193 + }, + { + "epoch": 2.597, + "grad_norm": 3.448866535849196, + "learning_rate": 4.621506403041199e-06, + "loss": 0.2706, + "mean_token_accuracy": 0.9131549596786499, + "step": 5194 + }, + { + "epoch": 2.5975, + "grad_norm": 1.1950965217531981, + "learning_rate": 4.6212755374537596e-06, + "loss": 0.1517, + "mean_token_accuracy": 0.9406043291091919, + "step": 5195 + }, + { + "epoch": 2.598, + "grad_norm": 2.320758890358742, + "learning_rate": 4.621044607248573e-06, + "loss": 0.28, + "mean_token_accuracy": 0.9091821312904358, + "step": 5196 + }, + { + "epoch": 2.5985, + "grad_norm": 19.124774220400482, + "learning_rate": 4.620813612432672e-06, + "loss": 0.2522, + "mean_token_accuracy": 0.9167552590370178, + "step": 5197 + }, + { + "epoch": 2.599, + "grad_norm": 2.3794141760632033, + "learning_rate": 4.620582553013094e-06, + "loss": 0.2897, + "mean_token_accuracy": 0.9006698727607727, + "step": 5198 + }, + { + "epoch": 2.5995, + "grad_norm": 4.067559233298199, + "learning_rate": 4.620351428996878e-06, + "loss": 0.2656, + "mean_token_accuracy": 0.9100980758666992, + "step": 5199 + }, + { + "epoch": 2.6, + "grad_norm": 2.936125282292127, + "learning_rate": 4.620120240391065e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8925269842147827, + "step": 5200 + }, + { + "epoch": 2.6005000000000003, + "grad_norm": 2.8698439471450934, + "learning_rate": 4.619888987202696e-06, + "loss": 0.3076, + "mean_token_accuracy": 0.8980099558830261, + "step": 5201 + }, + { + "epoch": 2.601, + "grad_norm": 4.5002167815310345, + "learning_rate": 4.619657669438816e-06, + "loss": 0.2007, + "mean_token_accuracy": 0.929938554763794, + "step": 5202 + }, + { + "epoch": 2.6015, + "grad_norm": 2.0189519578600565, + "learning_rate": 4.619426287106471e-06, + "loss": 0.2199, + "mean_token_accuracy": 0.9145784378051758, + "step": 5203 + }, + { + "epoch": 2.602, + "grad_norm": 1.913441948148483, + "learning_rate": 4.619194840212708e-06, + "loss": 0.2576, + "mean_token_accuracy": 0.9198206067085266, + "step": 5204 + }, + { + "epoch": 2.6025, + "grad_norm": 2.097502676422701, + "learning_rate": 4.61896332876458e-06, + "loss": 0.2772, + "mean_token_accuracy": 0.9081049561500549, + "step": 5205 + }, + { + "epoch": 2.6029999999999998, + "grad_norm": 3.1664838956230494, + "learning_rate": 4.6187317527691384e-06, + "loss": 0.2874, + "mean_token_accuracy": 0.902481734752655, + "step": 5206 + }, + { + "epoch": 2.6035, + "grad_norm": 2.260787070841979, + "learning_rate": 4.618500112233436e-06, + "loss": 0.2717, + "mean_token_accuracy": 0.9175041317939758, + "step": 5207 + }, + { + "epoch": 2.604, + "grad_norm": 3.151304924019758, + "learning_rate": 4.618268407164531e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8916030526161194, + "step": 5208 + }, + { + "epoch": 2.6045, + "grad_norm": 2.2720977696985782, + "learning_rate": 4.618036637569479e-06, + "loss": 0.2931, + "mean_token_accuracy": 0.9015352129936218, + "step": 5209 + }, + { + "epoch": 2.605, + "grad_norm": 2.9598428149449414, + "learning_rate": 4.6178048034553435e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.9022656083106995, + "step": 5210 + }, + { + "epoch": 2.6055, + "grad_norm": 3.7101408825536404, + "learning_rate": 4.617572904829183e-06, + "loss": 0.2967, + "mean_token_accuracy": 0.9012590050697327, + "step": 5211 + }, + { + "epoch": 2.606, + "grad_norm": 5.100337626115751, + "learning_rate": 4.617340941698064e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8955900073051453, + "step": 5212 + }, + { + "epoch": 2.6065, + "grad_norm": 2.8609503441180433, + "learning_rate": 4.617108914069052e-06, + "loss": 0.2606, + "mean_token_accuracy": 0.9115980267524719, + "step": 5213 + }, + { + "epoch": 2.607, + "grad_norm": 1.6715695258511278, + "learning_rate": 4.616876821949214e-06, + "loss": 0.2369, + "mean_token_accuracy": 0.9087068438529968, + "step": 5214 + }, + { + "epoch": 2.6075, + "grad_norm": 2.0647231768829992, + "learning_rate": 4.616644665345621e-06, + "loss": 0.2877, + "mean_token_accuracy": 0.9088729023933411, + "step": 5215 + }, + { + "epoch": 2.608, + "grad_norm": 2.0764381423095783, + "learning_rate": 4.616412444265344e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8882808685302734, + "step": 5216 + }, + { + "epoch": 2.6085000000000003, + "grad_norm": 1.9334047233219347, + "learning_rate": 4.616180158715458e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8889758586883545, + "step": 5217 + }, + { + "epoch": 2.609, + "grad_norm": 3.1832911944752436, + "learning_rate": 4.615947808703038e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.9043009281158447, + "step": 5218 + }, + { + "epoch": 2.6095, + "grad_norm": 1.8877419249014384, + "learning_rate": 4.615715394235163e-06, + "loss": 0.269, + "mean_token_accuracy": 0.9125955700874329, + "step": 5219 + }, + { + "epoch": 2.61, + "grad_norm": 1.8233571726630404, + "learning_rate": 4.6154829153189105e-06, + "loss": 0.2637, + "mean_token_accuracy": 0.9064315557479858, + "step": 5220 + }, + { + "epoch": 2.6105, + "grad_norm": 2.237281986837883, + "learning_rate": 4.615250371961364e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8952308893203735, + "step": 5221 + }, + { + "epoch": 2.6109999999999998, + "grad_norm": 1.884760948200088, + "learning_rate": 4.615017764169606e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8938567638397217, + "step": 5222 + }, + { + "epoch": 2.6115, + "grad_norm": 1.7787594369513282, + "learning_rate": 4.614785091950723e-06, + "loss": 0.181, + "mean_token_accuracy": 0.9374692440032959, + "step": 5223 + }, + { + "epoch": 2.612, + "grad_norm": 2.6478265238675536, + "learning_rate": 4.614552355311802e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8946441411972046, + "step": 5224 + }, + { + "epoch": 2.6125, + "grad_norm": 2.5920307238094664, + "learning_rate": 4.614319554259934e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8822367191314697, + "step": 5225 + }, + { + "epoch": 2.613, + "grad_norm": 2.037161813133712, + "learning_rate": 4.614086688802208e-06, + "loss": 0.3093, + "mean_token_accuracy": 0.8990617990493774, + "step": 5226 + }, + { + "epoch": 2.6135, + "grad_norm": 5.588177392116803, + "learning_rate": 4.61385375894572e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8766409158706665, + "step": 5227 + }, + { + "epoch": 2.614, + "grad_norm": 3.706681633342776, + "learning_rate": 4.6136207646975635e-06, + "loss": 0.2783, + "mean_token_accuracy": 0.9088972806930542, + "step": 5228 + }, + { + "epoch": 2.6145, + "grad_norm": 2.081766197067046, + "learning_rate": 4.613387706064838e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8886069655418396, + "step": 5229 + }, + { + "epoch": 2.615, + "grad_norm": 1.8984721247242793, + "learning_rate": 4.613154583054641e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8865399956703186, + "step": 5230 + }, + { + "epoch": 2.6155, + "grad_norm": 3.7685008681274037, + "learning_rate": 4.612921395674074e-06, + "loss": 0.2253, + "mean_token_accuracy": 0.9205567836761475, + "step": 5231 + }, + { + "epoch": 2.616, + "grad_norm": 1.9729484213317627, + "learning_rate": 4.612688143930242e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8747109770774841, + "step": 5232 + }, + { + "epoch": 2.6165000000000003, + "grad_norm": 2.265501978008317, + "learning_rate": 4.612454827830248e-06, + "loss": 0.2654, + "mean_token_accuracy": 0.9054742455482483, + "step": 5233 + }, + { + "epoch": 2.617, + "grad_norm": 1.7148778522776575, + "learning_rate": 4.6122214473812005e-06, + "loss": 0.2417, + "mean_token_accuracy": 0.9114887714385986, + "step": 5234 + }, + { + "epoch": 2.6175, + "grad_norm": 3.163205573243875, + "learning_rate": 4.611988002590209e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8776540160179138, + "step": 5235 + }, + { + "epoch": 2.618, + "grad_norm": 3.7267892998059717, + "learning_rate": 4.611754493464383e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8983136415481567, + "step": 5236 + }, + { + "epoch": 2.6185, + "grad_norm": 4.192823842067667, + "learning_rate": 4.611520920010837e-06, + "loss": 0.2825, + "mean_token_accuracy": 0.9143252968788147, + "step": 5237 + }, + { + "epoch": 2.6189999999999998, + "grad_norm": 2.6326782972998237, + "learning_rate": 4.611287282236686e-06, + "loss": 0.3059, + "mean_token_accuracy": 0.9037392139434814, + "step": 5238 + }, + { + "epoch": 2.6195, + "grad_norm": 1.9395051823209981, + "learning_rate": 4.611053580149047e-06, + "loss": 0.296, + "mean_token_accuracy": 0.9081563353538513, + "step": 5239 + }, + { + "epoch": 2.62, + "grad_norm": 2.6015909094184244, + "learning_rate": 4.610819813755038e-06, + "loss": 0.243, + "mean_token_accuracy": 0.9233628511428833, + "step": 5240 + }, + { + "epoch": 2.6205, + "grad_norm": 3.0586340580835576, + "learning_rate": 4.610585983061781e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.9001101851463318, + "step": 5241 + }, + { + "epoch": 2.621, + "grad_norm": 2.005359124109312, + "learning_rate": 4.610352088076399e-06, + "loss": 0.2806, + "mean_token_accuracy": 0.9124690294265747, + "step": 5242 + }, + { + "epoch": 2.6215, + "grad_norm": 1.7462772151928758, + "learning_rate": 4.610118128806016e-06, + "loss": 0.2323, + "mean_token_accuracy": 0.9130575060844421, + "step": 5243 + }, + { + "epoch": 2.622, + "grad_norm": 2.7916162226868937, + "learning_rate": 4.609884105257759e-06, + "loss": 0.2881, + "mean_token_accuracy": 0.9021276831626892, + "step": 5244 + }, + { + "epoch": 2.6225, + "grad_norm": 2.6776427086304144, + "learning_rate": 4.609650017438757e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8866416811943054, + "step": 5245 + }, + { + "epoch": 2.623, + "grad_norm": 3.2144058491469214, + "learning_rate": 4.609415865356141e-06, + "loss": 0.2208, + "mean_token_accuracy": 0.9238404631614685, + "step": 5246 + }, + { + "epoch": 2.6235, + "grad_norm": 2.529641468235667, + "learning_rate": 4.609181649017043e-06, + "loss": 0.2339, + "mean_token_accuracy": 0.9178143739700317, + "step": 5247 + }, + { + "epoch": 2.624, + "grad_norm": 2.2704822725096303, + "learning_rate": 4.608947368428598e-06, + "loss": 0.2813, + "mean_token_accuracy": 0.902157187461853, + "step": 5248 + }, + { + "epoch": 2.6245000000000003, + "grad_norm": 2.6201863222932147, + "learning_rate": 4.608713023597942e-06, + "loss": 0.2833, + "mean_token_accuracy": 0.9078164100646973, + "step": 5249 + }, + { + "epoch": 2.625, + "grad_norm": 3.106647230937419, + "learning_rate": 4.608478614532215e-06, + "loss": 0.2494, + "mean_token_accuracy": 0.9154161214828491, + "step": 5250 + }, + { + "epoch": 2.6254999999999997, + "grad_norm": 3.057065193364447, + "learning_rate": 4.608244141238556e-06, + "loss": 0.2163, + "mean_token_accuracy": 0.9276444911956787, + "step": 5251 + }, + { + "epoch": 2.626, + "grad_norm": 2.5745702281799154, + "learning_rate": 4.608009603724108e-06, + "loss": 0.2404, + "mean_token_accuracy": 0.9164794683456421, + "step": 5252 + }, + { + "epoch": 2.6265, + "grad_norm": 2.5084430483932887, + "learning_rate": 4.607775001996016e-06, + "loss": 0.2743, + "mean_token_accuracy": 0.9069398045539856, + "step": 5253 + }, + { + "epoch": 2.627, + "grad_norm": 9.03611956382332, + "learning_rate": 4.607540336061427e-06, + "loss": 0.268, + "mean_token_accuracy": 0.9059395790100098, + "step": 5254 + }, + { + "epoch": 2.6275, + "grad_norm": 2.210491933597021, + "learning_rate": 4.6073056059274865e-06, + "loss": 0.3086, + "mean_token_accuracy": 0.9028459191322327, + "step": 5255 + }, + { + "epoch": 2.628, + "grad_norm": 2.2976046724721555, + "learning_rate": 4.607070811601347e-06, + "loss": 0.2863, + "mean_token_accuracy": 0.9101750254631042, + "step": 5256 + }, + { + "epoch": 2.6285, + "grad_norm": 1.660604521866907, + "learning_rate": 4.606835953090161e-06, + "loss": 0.2135, + "mean_token_accuracy": 0.9263017773628235, + "step": 5257 + }, + { + "epoch": 2.629, + "grad_norm": 3.3932294231121185, + "learning_rate": 4.606601030401081e-06, + "loss": 0.273, + "mean_token_accuracy": 0.90625, + "step": 5258 + }, + { + "epoch": 2.6295, + "grad_norm": 2.404214045614998, + "learning_rate": 4.6063660435412644e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8814310431480408, + "step": 5259 + }, + { + "epoch": 2.63, + "grad_norm": 8.807122471908093, + "learning_rate": 4.60613099251787e-06, + "loss": 0.2353, + "mean_token_accuracy": 0.9138376116752625, + "step": 5260 + }, + { + "epoch": 2.6305, + "grad_norm": 3.3435464079961954, + "learning_rate": 4.6058958773380555e-06, + "loss": 0.2285, + "mean_token_accuracy": 0.9231643676757812, + "step": 5261 + }, + { + "epoch": 2.6310000000000002, + "grad_norm": 2.867837807397887, + "learning_rate": 4.605660698008985e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.8912618160247803, + "step": 5262 + }, + { + "epoch": 2.6315, + "grad_norm": 2.638268879481899, + "learning_rate": 4.605425454537821e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8994845151901245, + "step": 5263 + }, + { + "epoch": 2.632, + "grad_norm": 1.7614640417212735, + "learning_rate": 4.605190146931731e-06, + "loss": 0.2948, + "mean_token_accuracy": 0.8994753956794739, + "step": 5264 + }, + { + "epoch": 2.6325, + "grad_norm": 1.5769942229865588, + "learning_rate": 4.604954775197882e-06, + "loss": 0.2177, + "mean_token_accuracy": 0.9181102514266968, + "step": 5265 + }, + { + "epoch": 2.633, + "grad_norm": 1.9018523788815382, + "learning_rate": 4.604719339343444e-06, + "loss": 0.2591, + "mean_token_accuracy": 0.8996925950050354, + "step": 5266 + }, + { + "epoch": 2.6334999999999997, + "grad_norm": 2.151466707104725, + "learning_rate": 4.604483839375589e-06, + "loss": 0.2375, + "mean_token_accuracy": 0.9161707162857056, + "step": 5267 + }, + { + "epoch": 2.634, + "grad_norm": 2.131723587928781, + "learning_rate": 4.604248275301489e-06, + "loss": 0.2702, + "mean_token_accuracy": 0.9057221412658691, + "step": 5268 + }, + { + "epoch": 2.6345, + "grad_norm": 2.04186250088839, + "learning_rate": 4.604012647128323e-06, + "loss": 0.2862, + "mean_token_accuracy": 0.9081259965896606, + "step": 5269 + }, + { + "epoch": 2.635, + "grad_norm": 3.006307406788538, + "learning_rate": 4.603776954863266e-06, + "loss": 0.2943, + "mean_token_accuracy": 0.902875542640686, + "step": 5270 + }, + { + "epoch": 2.6355, + "grad_norm": 2.0726038156289035, + "learning_rate": 4.603541198513498e-06, + "loss": 0.2761, + "mean_token_accuracy": 0.9096198678016663, + "step": 5271 + }, + { + "epoch": 2.636, + "grad_norm": 2.9822182081860675, + "learning_rate": 4.603305378086201e-06, + "loss": 0.2536, + "mean_token_accuracy": 0.9173352122306824, + "step": 5272 + }, + { + "epoch": 2.6365, + "grad_norm": 2.517668310170306, + "learning_rate": 4.6030694935885585e-06, + "loss": 0.3288, + "mean_token_accuracy": 0.8905358910560608, + "step": 5273 + }, + { + "epoch": 2.637, + "grad_norm": 3.0740404465946276, + "learning_rate": 4.602833545027757e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.8896540403366089, + "step": 5274 + }, + { + "epoch": 2.6375, + "grad_norm": 2.220691993123148, + "learning_rate": 4.602597532410982e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8929216861724854, + "step": 5275 + }, + { + "epoch": 2.638, + "grad_norm": 2.433531364992674, + "learning_rate": 4.6023614557454235e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8796005845069885, + "step": 5276 + }, + { + "epoch": 2.6385, + "grad_norm": 2.983063645758302, + "learning_rate": 4.602125315038273e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8960989117622375, + "step": 5277 + }, + { + "epoch": 2.6390000000000002, + "grad_norm": 10.68026686298381, + "learning_rate": 4.601889110296724e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8826308250427246, + "step": 5278 + }, + { + "epoch": 2.6395, + "grad_norm": 3.151308873048934, + "learning_rate": 4.601652841527971e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8840785026550293, + "step": 5279 + }, + { + "epoch": 2.64, + "grad_norm": 14.521786227235244, + "learning_rate": 4.601416508739211e-06, + "loss": 0.2385, + "mean_token_accuracy": 0.9196944236755371, + "step": 5280 + }, + { + "epoch": 2.6405, + "grad_norm": 2.156001679586203, + "learning_rate": 4.601180111937644e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.889309823513031, + "step": 5281 + }, + { + "epoch": 2.641, + "grad_norm": 2.0190378958961763, + "learning_rate": 4.600943651130471e-06, + "loss": 0.2628, + "mean_token_accuracy": 0.910167396068573, + "step": 5282 + }, + { + "epoch": 2.6414999999999997, + "grad_norm": 2.315806052608015, + "learning_rate": 4.600707126324895e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8818199038505554, + "step": 5283 + }, + { + "epoch": 2.642, + "grad_norm": 2.0688322225443927, + "learning_rate": 4.600470537528121e-06, + "loss": 0.2905, + "mean_token_accuracy": 0.9098349213600159, + "step": 5284 + }, + { + "epoch": 2.6425, + "grad_norm": 3.0155787327827137, + "learning_rate": 4.600233884747355e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.891220211982727, + "step": 5285 + }, + { + "epoch": 2.643, + "grad_norm": 1.5896099810570645, + "learning_rate": 4.599997167989807e-06, + "loss": 0.2304, + "mean_token_accuracy": 0.9170420169830322, + "step": 5286 + }, + { + "epoch": 2.6435, + "grad_norm": 2.0140634444687042, + "learning_rate": 4.599760387262687e-06, + "loss": 0.2526, + "mean_token_accuracy": 0.9104874134063721, + "step": 5287 + }, + { + "epoch": 2.644, + "grad_norm": 2.4429319662256788, + "learning_rate": 4.599523542573207e-06, + "loss": 0.2334, + "mean_token_accuracy": 0.9212031364440918, + "step": 5288 + }, + { + "epoch": 2.6445, + "grad_norm": 2.1294523747744107, + "learning_rate": 4.599286633928585e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8872470259666443, + "step": 5289 + }, + { + "epoch": 2.645, + "grad_norm": 2.1263014688013544, + "learning_rate": 4.599049661336033e-06, + "loss": 0.2836, + "mean_token_accuracy": 0.9106002449989319, + "step": 5290 + }, + { + "epoch": 2.6455, + "grad_norm": 4.516568605233136, + "learning_rate": 4.598812624802774e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8880481123924255, + "step": 5291 + }, + { + "epoch": 2.646, + "grad_norm": 3.792286427414523, + "learning_rate": 4.598575524336026e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.8936482071876526, + "step": 5292 + }, + { + "epoch": 2.6465, + "grad_norm": 2.9871478490815524, + "learning_rate": 4.598338359943011e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8767912983894348, + "step": 5293 + }, + { + "epoch": 2.6470000000000002, + "grad_norm": 2.1727537715927014, + "learning_rate": 4.598101131630954e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.889223575592041, + "step": 5294 + }, + { + "epoch": 2.6475, + "grad_norm": 5.016537781133299, + "learning_rate": 4.5978638394070835e-06, + "loss": 0.2901, + "mean_token_accuracy": 0.9036470651626587, + "step": 5295 + }, + { + "epoch": 2.648, + "grad_norm": 1.7920846708792038, + "learning_rate": 4.597626483278626e-06, + "loss": 0.311, + "mean_token_accuracy": 0.898409903049469, + "step": 5296 + }, + { + "epoch": 2.6485, + "grad_norm": 2.1855888997621125, + "learning_rate": 4.597389063252811e-06, + "loss": 0.2966, + "mean_token_accuracy": 0.9056659936904907, + "step": 5297 + }, + { + "epoch": 2.649, + "grad_norm": 2.819700118800082, + "learning_rate": 4.597151579336872e-06, + "loss": 0.2973, + "mean_token_accuracy": 0.8954278826713562, + "step": 5298 + }, + { + "epoch": 2.6494999999999997, + "grad_norm": 6.300006258699827, + "learning_rate": 4.5969140315380435e-06, + "loss": 0.2605, + "mean_token_accuracy": 0.9178690314292908, + "step": 5299 + }, + { + "epoch": 2.65, + "grad_norm": 2.0153104131513464, + "learning_rate": 4.596676419863561e-06, + "loss": 0.3042, + "mean_token_accuracy": 0.9041380286216736, + "step": 5300 + }, + { + "epoch": 2.6505, + "grad_norm": 2.137847856544422, + "learning_rate": 4.596438744320662e-06, + "loss": 0.2367, + "mean_token_accuracy": 0.9116447567939758, + "step": 5301 + }, + { + "epoch": 2.651, + "grad_norm": 7.752805081357047, + "learning_rate": 4.596201004916587e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.879558265209198, + "step": 5302 + }, + { + "epoch": 2.6515, + "grad_norm": 2.324843988636547, + "learning_rate": 4.595963201658578e-06, + "loss": 0.2842, + "mean_token_accuracy": 0.9082465171813965, + "step": 5303 + }, + { + "epoch": 2.652, + "grad_norm": 1.9039883415802805, + "learning_rate": 4.595725334553879e-06, + "loss": 0.2315, + "mean_token_accuracy": 0.9240366816520691, + "step": 5304 + }, + { + "epoch": 2.6525, + "grad_norm": 1.5734965444661713, + "learning_rate": 4.595487403609736e-06, + "loss": 0.2885, + "mean_token_accuracy": 0.8967705368995667, + "step": 5305 + }, + { + "epoch": 2.653, + "grad_norm": 1.6155523890140104, + "learning_rate": 4.595249408833397e-06, + "loss": 0.2007, + "mean_token_accuracy": 0.9236369132995605, + "step": 5306 + }, + { + "epoch": 2.6535, + "grad_norm": 5.861023139940526, + "learning_rate": 4.595011350232111e-06, + "loss": 0.2654, + "mean_token_accuracy": 0.9055195450782776, + "step": 5307 + }, + { + "epoch": 2.654, + "grad_norm": 1.7294301699036647, + "learning_rate": 4.594773227813129e-06, + "loss": 0.2101, + "mean_token_accuracy": 0.9220138192176819, + "step": 5308 + }, + { + "epoch": 2.6545, + "grad_norm": 1.897337946417066, + "learning_rate": 4.594535041583706e-06, + "loss": 0.2859, + "mean_token_accuracy": 0.9046351313591003, + "step": 5309 + }, + { + "epoch": 2.6550000000000002, + "grad_norm": 1.839073643167278, + "learning_rate": 4.5942967915510975e-06, + "loss": 0.2429, + "mean_token_accuracy": 0.9171316623687744, + "step": 5310 + }, + { + "epoch": 2.6555, + "grad_norm": 1.9810832247089427, + "learning_rate": 4.59405847772256e-06, + "loss": 0.2607, + "mean_token_accuracy": 0.9073055982589722, + "step": 5311 + }, + { + "epoch": 2.656, + "grad_norm": 1.9721745045119576, + "learning_rate": 4.593820100105355e-06, + "loss": 0.2639, + "mean_token_accuracy": 0.9109588861465454, + "step": 5312 + }, + { + "epoch": 2.6565, + "grad_norm": 2.4592560243971215, + "learning_rate": 4.593581658706742e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.8998379111289978, + "step": 5313 + }, + { + "epoch": 2.657, + "grad_norm": 2.2512845011930067, + "learning_rate": 4.593343153533984e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.884046196937561, + "step": 5314 + }, + { + "epoch": 2.6574999999999998, + "grad_norm": 2.033158742009147, + "learning_rate": 4.593104584594348e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8886488080024719, + "step": 5315 + }, + { + "epoch": 2.658, + "grad_norm": 2.201889081394739, + "learning_rate": 4.5928659518951e-06, + "loss": 0.2545, + "mean_token_accuracy": 0.9171759486198425, + "step": 5316 + }, + { + "epoch": 2.6585, + "grad_norm": 2.1407683762825447, + "learning_rate": 4.592627255443509e-06, + "loss": 0.2932, + "mean_token_accuracy": 0.9045059084892273, + "step": 5317 + }, + { + "epoch": 2.659, + "grad_norm": 5.123233526400508, + "learning_rate": 4.592388495246848e-06, + "loss": 0.2581, + "mean_token_accuracy": 0.9058597683906555, + "step": 5318 + }, + { + "epoch": 2.6595, + "grad_norm": 2.2682378637085776, + "learning_rate": 4.592149671312388e-06, + "loss": 0.2992, + "mean_token_accuracy": 0.8992888927459717, + "step": 5319 + }, + { + "epoch": 2.66, + "grad_norm": 2.391931083666118, + "learning_rate": 4.591910783647405e-06, + "loss": 0.257, + "mean_token_accuracy": 0.9101102352142334, + "step": 5320 + }, + { + "epoch": 2.6605, + "grad_norm": 2.6843676960207015, + "learning_rate": 4.591671832259175e-06, + "loss": 0.3117, + "mean_token_accuracy": 0.8926212191581726, + "step": 5321 + }, + { + "epoch": 2.661, + "grad_norm": 3.2392553699181716, + "learning_rate": 4.591432817154978e-06, + "loss": 0.3057, + "mean_token_accuracy": 0.8905003666877747, + "step": 5322 + }, + { + "epoch": 2.6615, + "grad_norm": 3.03095657659671, + "learning_rate": 4.591193738342094e-06, + "loss": 0.2698, + "mean_token_accuracy": 0.9121009111404419, + "step": 5323 + }, + { + "epoch": 2.662, + "grad_norm": 2.087614928287699, + "learning_rate": 4.5909545958278065e-06, + "loss": 0.2522, + "mean_token_accuracy": 0.9134652614593506, + "step": 5324 + }, + { + "epoch": 2.6625, + "grad_norm": 2.1779774563290926, + "learning_rate": 4.590715389619399e-06, + "loss": 0.2561, + "mean_token_accuracy": 0.9176644086837769, + "step": 5325 + }, + { + "epoch": 2.6630000000000003, + "grad_norm": 2.390517405480049, + "learning_rate": 4.59047611972416e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8803082704544067, + "step": 5326 + }, + { + "epoch": 2.6635, + "grad_norm": 3.264708937928227, + "learning_rate": 4.590236786149376e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8866688013076782, + "step": 5327 + }, + { + "epoch": 2.664, + "grad_norm": 2.178914555023538, + "learning_rate": 4.589997388902339e-06, + "loss": 0.2624, + "mean_token_accuracy": 0.9041666388511658, + "step": 5328 + }, + { + "epoch": 2.6645, + "grad_norm": 4.331503498000801, + "learning_rate": 4.589757927990341e-06, + "loss": 0.2811, + "mean_token_accuracy": 0.9079139232635498, + "step": 5329 + }, + { + "epoch": 2.665, + "grad_norm": 1.812621336832958, + "learning_rate": 4.589518403420676e-06, + "loss": 0.2421, + "mean_token_accuracy": 0.917730987071991, + "step": 5330 + }, + { + "epoch": 2.6654999999999998, + "grad_norm": 1.7644217585118669, + "learning_rate": 4.58927881520064e-06, + "loss": 0.2316, + "mean_token_accuracy": 0.9120071530342102, + "step": 5331 + }, + { + "epoch": 2.666, + "grad_norm": 7.889495164848315, + "learning_rate": 4.5890391633375345e-06, + "loss": 0.3043, + "mean_token_accuracy": 0.8932806253433228, + "step": 5332 + }, + { + "epoch": 2.6665, + "grad_norm": 1.9928240740650178, + "learning_rate": 4.588799447838655e-06, + "loss": 0.2663, + "mean_token_accuracy": 0.9074840545654297, + "step": 5333 + }, + { + "epoch": 2.667, + "grad_norm": 1.7695995881181288, + "learning_rate": 4.588559668711306e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8896187543869019, + "step": 5334 + }, + { + "epoch": 2.6675, + "grad_norm": 2.932481608381957, + "learning_rate": 4.588319825962793e-06, + "loss": 0.3282, + "mean_token_accuracy": 0.8933823704719543, + "step": 5335 + }, + { + "epoch": 2.668, + "grad_norm": 2.6582600452948015, + "learning_rate": 4.588079919600419e-06, + "loss": 0.2764, + "mean_token_accuracy": 0.9049241542816162, + "step": 5336 + }, + { + "epoch": 2.6685, + "grad_norm": 13.036957887294673, + "learning_rate": 4.587839949631494e-06, + "loss": 0.2985, + "mean_token_accuracy": 0.9011090397834778, + "step": 5337 + }, + { + "epoch": 2.669, + "grad_norm": 4.9580564332711115, + "learning_rate": 4.587599916063327e-06, + "loss": 0.287, + "mean_token_accuracy": 0.9057458639144897, + "step": 5338 + }, + { + "epoch": 2.6695, + "grad_norm": 2.5448725112687254, + "learning_rate": 4.587359818903229e-06, + "loss": 0.2481, + "mean_token_accuracy": 0.9122066497802734, + "step": 5339 + }, + { + "epoch": 2.67, + "grad_norm": 9.390821544241653, + "learning_rate": 4.587119658158517e-06, + "loss": 0.2604, + "mean_token_accuracy": 0.909375011920929, + "step": 5340 + }, + { + "epoch": 2.6705, + "grad_norm": 1.8062766663635033, + "learning_rate": 4.586879433836504e-06, + "loss": 0.2762, + "mean_token_accuracy": 0.9088954329490662, + "step": 5341 + }, + { + "epoch": 2.6710000000000003, + "grad_norm": 2.5621029789235528, + "learning_rate": 4.586639145944508e-06, + "loss": 0.3163, + "mean_token_accuracy": 0.9011023044586182, + "step": 5342 + }, + { + "epoch": 2.6715, + "grad_norm": 1.9011340106226544, + "learning_rate": 4.586398794489849e-06, + "loss": 0.2784, + "mean_token_accuracy": 0.9072816371917725, + "step": 5343 + }, + { + "epoch": 2.672, + "grad_norm": 2.288768427986438, + "learning_rate": 4.586158379479848e-06, + "loss": 0.2621, + "mean_token_accuracy": 0.9090015888214111, + "step": 5344 + }, + { + "epoch": 2.6725, + "grad_norm": 2.1955002546857916, + "learning_rate": 4.58591790092183e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8994736075401306, + "step": 5345 + }, + { + "epoch": 2.673, + "grad_norm": 2.146680430189645, + "learning_rate": 4.585677358823119e-06, + "loss": 0.2772, + "mean_token_accuracy": 0.9029207229614258, + "step": 5346 + }, + { + "epoch": 2.6734999999999998, + "grad_norm": 2.408723109549558, + "learning_rate": 4.5854367531910415e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8951658010482788, + "step": 5347 + }, + { + "epoch": 2.674, + "grad_norm": 3.2835052825929107, + "learning_rate": 4.585196084032929e-06, + "loss": 0.2393, + "mean_token_accuracy": 0.9180084466934204, + "step": 5348 + }, + { + "epoch": 2.6745, + "grad_norm": 2.7581113432156674, + "learning_rate": 4.584955351356111e-06, + "loss": 0.2997, + "mean_token_accuracy": 0.9123654961585999, + "step": 5349 + }, + { + "epoch": 2.675, + "grad_norm": 1.9955081421495047, + "learning_rate": 4.584714555167921e-06, + "loss": 0.2498, + "mean_token_accuracy": 0.9138381481170654, + "step": 5350 + }, + { + "epoch": 2.6755, + "grad_norm": 2.7194174392880277, + "learning_rate": 4.5844736954756944e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8981670141220093, + "step": 5351 + }, + { + "epoch": 2.676, + "grad_norm": 2.2610135422292865, + "learning_rate": 4.584232772286769e-06, + "loss": 0.2494, + "mean_token_accuracy": 0.9178158640861511, + "step": 5352 + }, + { + "epoch": 2.6765, + "grad_norm": 3.018140457479205, + "learning_rate": 4.583991785608481e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8925538659095764, + "step": 5353 + }, + { + "epoch": 2.677, + "grad_norm": 2.318967326694454, + "learning_rate": 4.583750735448175e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.9054900407791138, + "step": 5354 + }, + { + "epoch": 2.6775, + "grad_norm": 4.35434697233448, + "learning_rate": 4.583509621813192e-06, + "loss": 0.3063, + "mean_token_accuracy": 0.903552234172821, + "step": 5355 + }, + { + "epoch": 2.678, + "grad_norm": 2.646072022529361, + "learning_rate": 4.583268444710875e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.888067901134491, + "step": 5356 + }, + { + "epoch": 2.6785, + "grad_norm": 2.386974805978371, + "learning_rate": 4.583027204148573e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8761323094367981, + "step": 5357 + }, + { + "epoch": 2.6790000000000003, + "grad_norm": 1.6844795211772532, + "learning_rate": 4.5827859001336335e-06, + "loss": 0.2795, + "mean_token_accuracy": 0.8999999761581421, + "step": 5358 + }, + { + "epoch": 2.6795, + "grad_norm": 1.6448340419286687, + "learning_rate": 4.582544532673409e-06, + "loss": 0.2244, + "mean_token_accuracy": 0.9158650040626526, + "step": 5359 + }, + { + "epoch": 2.68, + "grad_norm": 2.691765360540922, + "learning_rate": 4.582303101775249e-06, + "loss": 0.2444, + "mean_token_accuracy": 0.9077936410903931, + "step": 5360 + }, + { + "epoch": 2.6805, + "grad_norm": 4.003452550298547, + "learning_rate": 4.58206160744651e-06, + "loss": 0.308, + "mean_token_accuracy": 0.9014842510223389, + "step": 5361 + }, + { + "epoch": 2.681, + "grad_norm": 1.9535518873945676, + "learning_rate": 4.581820049694548e-06, + "loss": 0.2512, + "mean_token_accuracy": 0.910471498966217, + "step": 5362 + }, + { + "epoch": 2.6814999999999998, + "grad_norm": 4.257038884926185, + "learning_rate": 4.58157842852672e-06, + "loss": 0.2908, + "mean_token_accuracy": 0.895235002040863, + "step": 5363 + }, + { + "epoch": 2.682, + "grad_norm": 1.8726468424041207, + "learning_rate": 4.5813367439503875e-06, + "loss": 0.2674, + "mean_token_accuracy": 0.9139542579650879, + "step": 5364 + }, + { + "epoch": 2.6825, + "grad_norm": 3.1589429563411424, + "learning_rate": 4.581094995972912e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8956815600395203, + "step": 5365 + }, + { + "epoch": 2.683, + "grad_norm": 2.054519892499855, + "learning_rate": 4.580853184601659e-06, + "loss": 0.2336, + "mean_token_accuracy": 0.9245008230209351, + "step": 5366 + }, + { + "epoch": 2.6835, + "grad_norm": 2.353996941573604, + "learning_rate": 4.580611309843993e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8945752382278442, + "step": 5367 + }, + { + "epoch": 2.684, + "grad_norm": 3.352277759592485, + "learning_rate": 4.580369371707282e-06, + "loss": 0.2491, + "mean_token_accuracy": 0.9166538715362549, + "step": 5368 + }, + { + "epoch": 2.6845, + "grad_norm": 2.277872766742038, + "learning_rate": 4.580127370198896e-06, + "loss": 0.2958, + "mean_token_accuracy": 0.8943377733230591, + "step": 5369 + }, + { + "epoch": 2.685, + "grad_norm": 7.212576253166539, + "learning_rate": 4.579885305326206e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8911634683609009, + "step": 5370 + }, + { + "epoch": 2.6855, + "grad_norm": 1.5699266226313353, + "learning_rate": 4.579643177096588e-06, + "loss": 0.212, + "mean_token_accuracy": 0.9198751449584961, + "step": 5371 + }, + { + "epoch": 2.686, + "grad_norm": 2.9147135662289343, + "learning_rate": 4.579400985517416e-06, + "loss": 0.2442, + "mean_token_accuracy": 0.9119077920913696, + "step": 5372 + }, + { + "epoch": 2.6865, + "grad_norm": 2.6564072162843426, + "learning_rate": 4.579158730596068e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8974316716194153, + "step": 5373 + }, + { + "epoch": 2.6870000000000003, + "grad_norm": 1.5467552129496682, + "learning_rate": 4.578916412339923e-06, + "loss": 0.2224, + "mean_token_accuracy": 0.9249334335327148, + "step": 5374 + }, + { + "epoch": 2.6875, + "grad_norm": 1.8405451586607653, + "learning_rate": 4.578674030756364e-06, + "loss": 0.2997, + "mean_token_accuracy": 0.9035512804985046, + "step": 5375 + }, + { + "epoch": 2.6879999999999997, + "grad_norm": 1.8926944101706344, + "learning_rate": 4.578431585852771e-06, + "loss": 0.2829, + "mean_token_accuracy": 0.9019633531570435, + "step": 5376 + }, + { + "epoch": 2.6885, + "grad_norm": 4.7213588521410745, + "learning_rate": 4.578189077636533e-06, + "loss": 0.2485, + "mean_token_accuracy": 0.9197204113006592, + "step": 5377 + }, + { + "epoch": 2.689, + "grad_norm": 9.064926336291327, + "learning_rate": 4.577946506115036e-06, + "loss": 0.2513, + "mean_token_accuracy": 0.9155682325363159, + "step": 5378 + }, + { + "epoch": 2.6895, + "grad_norm": 11.969017588926922, + "learning_rate": 4.577703871295668e-06, + "loss": 0.2424, + "mean_token_accuracy": 0.9114203453063965, + "step": 5379 + }, + { + "epoch": 2.69, + "grad_norm": 1.8297833715461376, + "learning_rate": 4.577461173185821e-06, + "loss": 0.2829, + "mean_token_accuracy": 0.9007471799850464, + "step": 5380 + }, + { + "epoch": 2.6905, + "grad_norm": 1.9744317355973138, + "learning_rate": 4.577218411792889e-06, + "loss": 0.318, + "mean_token_accuracy": 0.8913951516151428, + "step": 5381 + }, + { + "epoch": 2.691, + "grad_norm": 2.171331808919859, + "learning_rate": 4.576975587124264e-06, + "loss": 0.3046, + "mean_token_accuracy": 0.891459047794342, + "step": 5382 + }, + { + "epoch": 2.6915, + "grad_norm": 1.6591270774216318, + "learning_rate": 4.576732699187346e-06, + "loss": 0.2757, + "mean_token_accuracy": 0.9082251191139221, + "step": 5383 + }, + { + "epoch": 2.692, + "grad_norm": 3.453191989021135, + "learning_rate": 4.576489747989532e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8837209343910217, + "step": 5384 + }, + { + "epoch": 2.6925, + "grad_norm": 2.227531442683252, + "learning_rate": 4.576246733538223e-06, + "loss": 0.2603, + "mean_token_accuracy": 0.9118466973304749, + "step": 5385 + }, + { + "epoch": 2.693, + "grad_norm": 4.344629379757712, + "learning_rate": 4.576003655840823e-06, + "loss": 0.2872, + "mean_token_accuracy": 0.8934993147850037, + "step": 5386 + }, + { + "epoch": 2.6935000000000002, + "grad_norm": 1.672877313495084, + "learning_rate": 4.5757605149047345e-06, + "loss": 0.1743, + "mean_token_accuracy": 0.9346605539321899, + "step": 5387 + }, + { + "epoch": 2.694, + "grad_norm": 2.590806445296756, + "learning_rate": 4.575517310737365e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8700886964797974, + "step": 5388 + }, + { + "epoch": 2.6945, + "grad_norm": 3.116569165452179, + "learning_rate": 4.575274043346123e-06, + "loss": 0.2327, + "mean_token_accuracy": 0.9240080714225769, + "step": 5389 + }, + { + "epoch": 2.695, + "grad_norm": 4.6737248586729905, + "learning_rate": 4.5750307127384194e-06, + "loss": 0.2902, + "mean_token_accuracy": 0.9039930105209351, + "step": 5390 + }, + { + "epoch": 2.6955, + "grad_norm": 2.2979519471787673, + "learning_rate": 4.574787318921665e-06, + "loss": 0.2741, + "mean_token_accuracy": 0.905889630317688, + "step": 5391 + }, + { + "epoch": 2.6959999999999997, + "grad_norm": 3.889632313795735, + "learning_rate": 4.574543861903275e-06, + "loss": 0.2309, + "mean_token_accuracy": 0.9209818243980408, + "step": 5392 + }, + { + "epoch": 2.6965, + "grad_norm": 1.8823465201040297, + "learning_rate": 4.574300341690665e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8877745866775513, + "step": 5393 + }, + { + "epoch": 2.697, + "grad_norm": 3.31651829734614, + "learning_rate": 4.574056758291254e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8863796591758728, + "step": 5394 + }, + { + "epoch": 2.6975, + "grad_norm": 2.3028206521910417, + "learning_rate": 4.5738131117124605e-06, + "loss": 0.2883, + "mean_token_accuracy": 0.8974875807762146, + "step": 5395 + }, + { + "epoch": 2.698, + "grad_norm": 6.588406108800268, + "learning_rate": 4.5735694019617085e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.890080451965332, + "step": 5396 + }, + { + "epoch": 2.6985, + "grad_norm": 2.492461892898673, + "learning_rate": 4.573325629046419e-06, + "loss": 0.3109, + "mean_token_accuracy": 0.8962418437004089, + "step": 5397 + }, + { + "epoch": 2.699, + "grad_norm": 3.5720079485039005, + "learning_rate": 4.5730817929740205e-06, + "loss": 0.309, + "mean_token_accuracy": 0.8812004327774048, + "step": 5398 + }, + { + "epoch": 2.6995, + "grad_norm": 2.784079456442744, + "learning_rate": 4.572837893751939e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8930000066757202, + "step": 5399 + }, + { + "epoch": 2.7, + "grad_norm": 5.621966803381043, + "learning_rate": 4.572593931387604e-06, + "loss": 0.2027, + "mean_token_accuracy": 0.9254509806632996, + "step": 5400 + }, + { + "epoch": 2.7005, + "grad_norm": 2.147021913726829, + "learning_rate": 4.572349905888449e-06, + "loss": 0.285, + "mean_token_accuracy": 0.8940854668617249, + "step": 5401 + }, + { + "epoch": 2.701, + "grad_norm": 2.465186241742228, + "learning_rate": 4.572105817261905e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8820487856864929, + "step": 5402 + }, + { + "epoch": 2.7015000000000002, + "grad_norm": 1.9007280836264848, + "learning_rate": 4.571861665515409e-06, + "loss": 0.2971, + "mean_token_accuracy": 0.9073025584220886, + "step": 5403 + }, + { + "epoch": 2.702, + "grad_norm": 2.1178317317945314, + "learning_rate": 4.571617450656397e-06, + "loss": 0.2433, + "mean_token_accuracy": 0.9157618880271912, + "step": 5404 + }, + { + "epoch": 2.7025, + "grad_norm": 3.023875393214825, + "learning_rate": 4.571373172692309e-06, + "loss": 0.2884, + "mean_token_accuracy": 0.9042816758155823, + "step": 5405 + }, + { + "epoch": 2.703, + "grad_norm": 4.344329626332478, + "learning_rate": 4.571128831630587e-06, + "loss": 0.2532, + "mean_token_accuracy": 0.9128856658935547, + "step": 5406 + }, + { + "epoch": 2.7035, + "grad_norm": 2.9173623063614764, + "learning_rate": 4.570884427478672e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8714331984519958, + "step": 5407 + }, + { + "epoch": 2.7039999999999997, + "grad_norm": 4.46042188495017, + "learning_rate": 4.570639960244011e-06, + "loss": 0.252, + "mean_token_accuracy": 0.9131647348403931, + "step": 5408 + }, + { + "epoch": 2.7045, + "grad_norm": 3.30472651433987, + "learning_rate": 4.570395429934049e-06, + "loss": 0.232, + "mean_token_accuracy": 0.9209345579147339, + "step": 5409 + }, + { + "epoch": 2.705, + "grad_norm": 4.411649196342738, + "learning_rate": 4.570150836556236e-06, + "loss": 0.2612, + "mean_token_accuracy": 0.921939492225647, + "step": 5410 + }, + { + "epoch": 2.7055, + "grad_norm": 2.8096351812355067, + "learning_rate": 4.569906180118023e-06, + "loss": 0.2237, + "mean_token_accuracy": 0.9225398302078247, + "step": 5411 + }, + { + "epoch": 2.706, + "grad_norm": 2.0956791592828363, + "learning_rate": 4.569661460626862e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8778458833694458, + "step": 5412 + }, + { + "epoch": 2.7065, + "grad_norm": 2.069335843778886, + "learning_rate": 4.569416678090208e-06, + "loss": 0.2861, + "mean_token_accuracy": 0.9121425151824951, + "step": 5413 + }, + { + "epoch": 2.707, + "grad_norm": 2.1286205588569826, + "learning_rate": 4.569171832515517e-06, + "loss": 0.307, + "mean_token_accuracy": 0.9017261862754822, + "step": 5414 + }, + { + "epoch": 2.7075, + "grad_norm": 1.5367198813391656, + "learning_rate": 4.568926923910248e-06, + "loss": 0.1871, + "mean_token_accuracy": 0.931251049041748, + "step": 5415 + }, + { + "epoch": 2.708, + "grad_norm": 3.365624740553577, + "learning_rate": 4.56868195228186e-06, + "loss": 0.2952, + "mean_token_accuracy": 0.9027314186096191, + "step": 5416 + }, + { + "epoch": 2.7085, + "grad_norm": 3.516225940454788, + "learning_rate": 4.568436917637817e-06, + "loss": 0.2583, + "mean_token_accuracy": 0.9047182202339172, + "step": 5417 + }, + { + "epoch": 2.709, + "grad_norm": 2.1817264272426438, + "learning_rate": 4.568191819985583e-06, + "loss": 0.2771, + "mean_token_accuracy": 0.9050991535186768, + "step": 5418 + }, + { + "epoch": 2.7095000000000002, + "grad_norm": 1.73929779821677, + "learning_rate": 4.567946659332623e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8958302736282349, + "step": 5419 + }, + { + "epoch": 2.71, + "grad_norm": 2.5847538934633394, + "learning_rate": 4.567701435686405e-06, + "loss": 0.2607, + "mean_token_accuracy": 0.9168986082077026, + "step": 5420 + }, + { + "epoch": 2.7105, + "grad_norm": 2.4355568990967313, + "learning_rate": 4.5674561490544e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8837323188781738, + "step": 5421 + }, + { + "epoch": 2.711, + "grad_norm": 3.6251306459809327, + "learning_rate": 4.56721079944408e-06, + "loss": 0.309, + "mean_token_accuracy": 0.8884868025779724, + "step": 5422 + }, + { + "epoch": 2.7115, + "grad_norm": 1.8121090623789045, + "learning_rate": 4.5669653868629174e-06, + "loss": 0.241, + "mean_token_accuracy": 0.9179456830024719, + "step": 5423 + }, + { + "epoch": 2.7119999999999997, + "grad_norm": 1.8995049865724811, + "learning_rate": 4.566719911318389e-06, + "loss": 0.2078, + "mean_token_accuracy": 0.9279999732971191, + "step": 5424 + }, + { + "epoch": 2.7125, + "grad_norm": 7.438312414584288, + "learning_rate": 4.566474372817971e-06, + "loss": 0.2628, + "mean_token_accuracy": 0.9065172076225281, + "step": 5425 + }, + { + "epoch": 2.713, + "grad_norm": 2.9994276216563343, + "learning_rate": 4.566228771369146e-06, + "loss": 0.2336, + "mean_token_accuracy": 0.9207154512405396, + "step": 5426 + }, + { + "epoch": 2.7135, + "grad_norm": 3.960300710391834, + "learning_rate": 4.565983106979392e-06, + "loss": 0.2945, + "mean_token_accuracy": 0.9093219041824341, + "step": 5427 + }, + { + "epoch": 2.714, + "grad_norm": 2.0173425253431274, + "learning_rate": 4.565737379656195e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8863126635551453, + "step": 5428 + }, + { + "epoch": 2.7145, + "grad_norm": 2.39673536432344, + "learning_rate": 4.565491589407039e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8859894275665283, + "step": 5429 + }, + { + "epoch": 2.715, + "grad_norm": 1.5776439424623625, + "learning_rate": 4.5652457362394094e-06, + "loss": 0.274, + "mean_token_accuracy": 0.9132956266403198, + "step": 5430 + }, + { + "epoch": 2.7155, + "grad_norm": 7.031061081998694, + "learning_rate": 4.564999820160799e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8820437788963318, + "step": 5431 + }, + { + "epoch": 2.716, + "grad_norm": 2.131402116869313, + "learning_rate": 4.5647538411786965e-06, + "loss": 0.302, + "mean_token_accuracy": 0.8995794057846069, + "step": 5432 + }, + { + "epoch": 2.7165, + "grad_norm": 2.646303319577993, + "learning_rate": 4.564507799300596e-06, + "loss": 0.2941, + "mean_token_accuracy": 0.903175413608551, + "step": 5433 + }, + { + "epoch": 2.717, + "grad_norm": 2.646914421172525, + "learning_rate": 4.564261694533991e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8911643028259277, + "step": 5434 + }, + { + "epoch": 2.7175000000000002, + "grad_norm": 2.6097226050474465, + "learning_rate": 4.56401552688638e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8562461733818054, + "step": 5435 + }, + { + "epoch": 2.718, + "grad_norm": 1.9146065007307418, + "learning_rate": 4.56376929636526e-06, + "loss": 0.2378, + "mean_token_accuracy": 0.9146103262901306, + "step": 5436 + }, + { + "epoch": 2.7185, + "grad_norm": 2.1556662563503175, + "learning_rate": 4.563523002978132e-06, + "loss": 0.278, + "mean_token_accuracy": 0.9032431244850159, + "step": 5437 + }, + { + "epoch": 2.719, + "grad_norm": 3.0383480622448364, + "learning_rate": 4.5632766467325e-06, + "loss": 0.2561, + "mean_token_accuracy": 0.9139162302017212, + "step": 5438 + }, + { + "epoch": 2.7195, + "grad_norm": 1.9707020167472125, + "learning_rate": 4.563030227635867e-06, + "loss": 0.2694, + "mean_token_accuracy": 0.9108108282089233, + "step": 5439 + }, + { + "epoch": 2.7199999999999998, + "grad_norm": 2.438924693653932, + "learning_rate": 4.562783745695738e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8984149694442749, + "step": 5440 + }, + { + "epoch": 2.7205, + "grad_norm": 2.182188019545359, + "learning_rate": 4.562537200919625e-06, + "loss": 0.2447, + "mean_token_accuracy": 0.9145077466964722, + "step": 5441 + }, + { + "epoch": 2.721, + "grad_norm": 2.592137661885315, + "learning_rate": 4.562290593315035e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8942466974258423, + "step": 5442 + }, + { + "epoch": 2.7215, + "grad_norm": 2.0135142145806806, + "learning_rate": 4.5620439228894816e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.8915528655052185, + "step": 5443 + }, + { + "epoch": 2.722, + "grad_norm": 2.036671197908253, + "learning_rate": 4.561797189650478e-06, + "loss": 0.2713, + "mean_token_accuracy": 0.9031654596328735, + "step": 5444 + }, + { + "epoch": 2.7225, + "grad_norm": 5.95418881569203, + "learning_rate": 4.561550393605541e-06, + "loss": 0.3011, + "mean_token_accuracy": 0.9087271094322205, + "step": 5445 + }, + { + "epoch": 2.723, + "grad_norm": 1.377994225114307, + "learning_rate": 4.561303534762188e-06, + "loss": 0.194, + "mean_token_accuracy": 0.9352476596832275, + "step": 5446 + }, + { + "epoch": 2.7235, + "grad_norm": 2.0095532452497564, + "learning_rate": 4.561056613127939e-06, + "loss": 0.2643, + "mean_token_accuracy": 0.903356671333313, + "step": 5447 + }, + { + "epoch": 2.724, + "grad_norm": 1.832830533856815, + "learning_rate": 4.560809628710315e-06, + "loss": 0.2961, + "mean_token_accuracy": 0.904644787311554, + "step": 5448 + }, + { + "epoch": 2.7245, + "grad_norm": 2.512932540446473, + "learning_rate": 4.56056258151684e-06, + "loss": 0.2747, + "mean_token_accuracy": 0.9006849527359009, + "step": 5449 + }, + { + "epoch": 2.725, + "grad_norm": 5.6904376755888055, + "learning_rate": 4.560315471555039e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8922138810157776, + "step": 5450 + }, + { + "epoch": 2.7255000000000003, + "grad_norm": 1.8676466410021508, + "learning_rate": 4.560068298832441e-06, + "loss": 0.2867, + "mean_token_accuracy": 0.9033769369125366, + "step": 5451 + }, + { + "epoch": 2.726, + "grad_norm": 1.9152233846875566, + "learning_rate": 4.5598210633565736e-06, + "loss": 0.2819, + "mean_token_accuracy": 0.9009868502616882, + "step": 5452 + }, + { + "epoch": 2.7265, + "grad_norm": 2.3764228485818064, + "learning_rate": 4.559573765134969e-06, + "loss": 0.2652, + "mean_token_accuracy": 0.9120529890060425, + "step": 5453 + }, + { + "epoch": 2.727, + "grad_norm": 1.5825614208171757, + "learning_rate": 4.55932640417516e-06, + "loss": 0.2644, + "mean_token_accuracy": 0.91236412525177, + "step": 5454 + }, + { + "epoch": 2.7275, + "grad_norm": 25.401431248064924, + "learning_rate": 4.55907898048468e-06, + "loss": 0.2331, + "mean_token_accuracy": 0.9122216105461121, + "step": 5455 + }, + { + "epoch": 2.7279999999999998, + "grad_norm": 2.0334772591951182, + "learning_rate": 4.558831494071069e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8644521236419678, + "step": 5456 + }, + { + "epoch": 2.7285, + "grad_norm": 1.923023660474813, + "learning_rate": 4.558583944941864e-06, + "loss": 0.2788, + "mean_token_accuracy": 0.8971397280693054, + "step": 5457 + }, + { + "epoch": 2.729, + "grad_norm": 2.8079244723788714, + "learning_rate": 4.558336333104606e-06, + "loss": 0.299, + "mean_token_accuracy": 0.8985338807106018, + "step": 5458 + }, + { + "epoch": 2.7295, + "grad_norm": 2.5499046115278325, + "learning_rate": 4.5580886585668384e-06, + "loss": 0.2818, + "mean_token_accuracy": 0.9076817035675049, + "step": 5459 + }, + { + "epoch": 2.73, + "grad_norm": 3.657311098015648, + "learning_rate": 4.5578409213361055e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.9039741158485413, + "step": 5460 + }, + { + "epoch": 2.7305, + "grad_norm": 4.990914137345038, + "learning_rate": 4.557593121419953e-06, + "loss": 0.2655, + "mean_token_accuracy": 0.9142397046089172, + "step": 5461 + }, + { + "epoch": 2.731, + "grad_norm": 1.7342003370142463, + "learning_rate": 4.55734525882593e-06, + "loss": 0.2948, + "mean_token_accuracy": 0.9001451134681702, + "step": 5462 + }, + { + "epoch": 2.7315, + "grad_norm": 1.872787826447872, + "learning_rate": 4.5570973335615866e-06, + "loss": 0.237, + "mean_token_accuracy": 0.9215899109840393, + "step": 5463 + }, + { + "epoch": 2.732, + "grad_norm": 8.896505330055769, + "learning_rate": 4.556849345634475e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8915358185768127, + "step": 5464 + }, + { + "epoch": 2.7325, + "grad_norm": 3.183199067721433, + "learning_rate": 4.55660129505215e-06, + "loss": 0.2396, + "mean_token_accuracy": 0.9161719083786011, + "step": 5465 + }, + { + "epoch": 2.733, + "grad_norm": 1.7023904909428762, + "learning_rate": 4.556353181822167e-06, + "loss": 0.2963, + "mean_token_accuracy": 0.9051417112350464, + "step": 5466 + }, + { + "epoch": 2.7335000000000003, + "grad_norm": 6.805596562792795, + "learning_rate": 4.556105005952084e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8907504081726074, + "step": 5467 + }, + { + "epoch": 2.734, + "grad_norm": 2.3648348337996707, + "learning_rate": 4.555856767449461e-06, + "loss": 0.2482, + "mean_token_accuracy": 0.9122913479804993, + "step": 5468 + }, + { + "epoch": 2.7345, + "grad_norm": 1.7914209538206884, + "learning_rate": 4.55560846632186e-06, + "loss": 0.2387, + "mean_token_accuracy": 0.9199234843254089, + "step": 5469 + }, + { + "epoch": 2.735, + "grad_norm": 2.1297877315479115, + "learning_rate": 4.555360102576844e-06, + "loss": 0.249, + "mean_token_accuracy": 0.9149820804595947, + "step": 5470 + }, + { + "epoch": 2.7355, + "grad_norm": 1.522165422286847, + "learning_rate": 4.55511167622198e-06, + "loss": 0.2258, + "mean_token_accuracy": 0.9150674939155579, + "step": 5471 + }, + { + "epoch": 2.7359999999999998, + "grad_norm": 1.9123073135206252, + "learning_rate": 4.554863187264833e-06, + "loss": 0.2213, + "mean_token_accuracy": 0.9228426218032837, + "step": 5472 + }, + { + "epoch": 2.7365, + "grad_norm": 1.9925317600308883, + "learning_rate": 4.554614635712975e-06, + "loss": 0.2659, + "mean_token_accuracy": 0.9161496758460999, + "step": 5473 + }, + { + "epoch": 2.737, + "grad_norm": 3.219538424085665, + "learning_rate": 4.554366021573976e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.887139618396759, + "step": 5474 + }, + { + "epoch": 2.7375, + "grad_norm": 2.618179580481999, + "learning_rate": 4.55411734485541e-06, + "loss": 0.335, + "mean_token_accuracy": 0.9002941846847534, + "step": 5475 + }, + { + "epoch": 2.738, + "grad_norm": 3.476048485331518, + "learning_rate": 4.553868605564851e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8995835185050964, + "step": 5476 + }, + { + "epoch": 2.7385, + "grad_norm": 4.277547628821123, + "learning_rate": 4.553619803709877e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.879516065120697, + "step": 5477 + }, + { + "epoch": 2.739, + "grad_norm": 2.872876005609923, + "learning_rate": 4.553370939298066e-06, + "loss": 0.3144, + "mean_token_accuracy": 0.894577145576477, + "step": 5478 + }, + { + "epoch": 2.7395, + "grad_norm": 1.8429896029256831, + "learning_rate": 4.553122012337e-06, + "loss": 0.3134, + "mean_token_accuracy": 0.8925659656524658, + "step": 5479 + }, + { + "epoch": 2.74, + "grad_norm": 2.156480554624093, + "learning_rate": 4.55287302283426e-06, + "loss": 0.3132, + "mean_token_accuracy": 0.8962714076042175, + "step": 5480 + }, + { + "epoch": 2.7405, + "grad_norm": 2.3461613461805166, + "learning_rate": 4.552623970797433e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8667677044868469, + "step": 5481 + }, + { + "epoch": 2.741, + "grad_norm": 4.349523279766362, + "learning_rate": 4.552374856234104e-06, + "loss": 0.2323, + "mean_token_accuracy": 0.9184119701385498, + "step": 5482 + }, + { + "epoch": 2.7415000000000003, + "grad_norm": 1.8198762325624713, + "learning_rate": 4.552125679151862e-06, + "loss": 0.2283, + "mean_token_accuracy": 0.9275709390640259, + "step": 5483 + }, + { + "epoch": 2.742, + "grad_norm": 2.2263074438072423, + "learning_rate": 4.551876439558297e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8923191428184509, + "step": 5484 + }, + { + "epoch": 2.7425, + "grad_norm": 1.8547268267805048, + "learning_rate": 4.551627137461002e-06, + "loss": 0.2498, + "mean_token_accuracy": 0.9183249473571777, + "step": 5485 + }, + { + "epoch": 2.743, + "grad_norm": 54.37676864197118, + "learning_rate": 4.5513777728675705e-06, + "loss": 0.3017, + "mean_token_accuracy": 0.8950219750404358, + "step": 5486 + }, + { + "epoch": 2.7435, + "grad_norm": 1.9772658273008006, + "learning_rate": 4.551128345785599e-06, + "loss": 0.2611, + "mean_token_accuracy": 0.9092687964439392, + "step": 5487 + }, + { + "epoch": 2.7439999999999998, + "grad_norm": 2.6078878113303525, + "learning_rate": 4.550878856222684e-06, + "loss": 0.1881, + "mean_token_accuracy": 0.9294605851173401, + "step": 5488 + }, + { + "epoch": 2.7445, + "grad_norm": 1.8799838026282292, + "learning_rate": 4.550629304186428e-06, + "loss": 0.262, + "mean_token_accuracy": 0.911139726638794, + "step": 5489 + }, + { + "epoch": 2.745, + "grad_norm": 2.2400592452502375, + "learning_rate": 4.550379689684431e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8924372792243958, + "step": 5490 + }, + { + "epoch": 2.7455, + "grad_norm": 3.1176883797695445, + "learning_rate": 4.550130012724296e-06, + "loss": 0.2934, + "mean_token_accuracy": 0.9051502346992493, + "step": 5491 + }, + { + "epoch": 2.746, + "grad_norm": 3.4139729199114854, + "learning_rate": 4.549880273313631e-06, + "loss": 0.2548, + "mean_token_accuracy": 0.9069229364395142, + "step": 5492 + }, + { + "epoch": 2.7465, + "grad_norm": 3.2995196490254655, + "learning_rate": 4.549630471460042e-06, + "loss": 0.2906, + "mean_token_accuracy": 0.9123733043670654, + "step": 5493 + }, + { + "epoch": 2.747, + "grad_norm": 2.1159642711665927, + "learning_rate": 4.5493806071711384e-06, + "loss": 0.277, + "mean_token_accuracy": 0.9049488306045532, + "step": 5494 + }, + { + "epoch": 2.7475, + "grad_norm": 1.4858984444983512, + "learning_rate": 4.549130680454532e-06, + "loss": 0.2798, + "mean_token_accuracy": 0.9058674573898315, + "step": 5495 + }, + { + "epoch": 2.748, + "grad_norm": 2.3521495509353407, + "learning_rate": 4.548880691317835e-06, + "loss": 0.2594, + "mean_token_accuracy": 0.9166109561920166, + "step": 5496 + }, + { + "epoch": 2.7485, + "grad_norm": 2.0390815332209424, + "learning_rate": 4.548630639768664e-06, + "loss": 0.2538, + "mean_token_accuracy": 0.9076799750328064, + "step": 5497 + }, + { + "epoch": 2.749, + "grad_norm": 2.915225287677061, + "learning_rate": 4.548380525814634e-06, + "loss": 0.2663, + "mean_token_accuracy": 0.9095901846885681, + "step": 5498 + }, + { + "epoch": 2.7495000000000003, + "grad_norm": 1.656206698511978, + "learning_rate": 4.548130349463366e-06, + "loss": 0.2523, + "mean_token_accuracy": 0.9139402508735657, + "step": 5499 + }, + { + "epoch": 2.75, + "grad_norm": 2.400588808139555, + "learning_rate": 4.54788011072248e-06, + "loss": 0.2543, + "mean_token_accuracy": 0.9107009768486023, + "step": 5500 + }, + { + "epoch": 2.7504999999999997, + "grad_norm": 2.2188623898005844, + "learning_rate": 4.547629809599599e-06, + "loss": 0.2634, + "mean_token_accuracy": 0.9144425392150879, + "step": 5501 + }, + { + "epoch": 2.751, + "grad_norm": 3.9110246992925264, + "learning_rate": 4.547379446102345e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8792113065719604, + "step": 5502 + }, + { + "epoch": 2.7515, + "grad_norm": 1.818826872782675, + "learning_rate": 4.547129020238349e-06, + "loss": 0.243, + "mean_token_accuracy": 0.9133404493331909, + "step": 5503 + }, + { + "epoch": 2.752, + "grad_norm": 2.569668407043903, + "learning_rate": 4.5468785320152365e-06, + "loss": 0.31, + "mean_token_accuracy": 0.8952659368515015, + "step": 5504 + }, + { + "epoch": 2.7525, + "grad_norm": 2.9269292293362414, + "learning_rate": 4.546627981440639e-06, + "loss": 0.3099, + "mean_token_accuracy": 0.9001372456550598, + "step": 5505 + }, + { + "epoch": 2.753, + "grad_norm": 7.58241386136439, + "learning_rate": 4.546377368522188e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.9037933945655823, + "step": 5506 + }, + { + "epoch": 2.7535, + "grad_norm": 2.9104867326657673, + "learning_rate": 4.5461266932675164e-06, + "loss": 0.2738, + "mean_token_accuracy": 0.9104041457176208, + "step": 5507 + }, + { + "epoch": 2.754, + "grad_norm": 3.07485052058887, + "learning_rate": 4.545875955684262e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8909721970558167, + "step": 5508 + }, + { + "epoch": 2.7545, + "grad_norm": 2.3906021707167096, + "learning_rate": 4.545625155780063e-06, + "loss": 0.2786, + "mean_token_accuracy": 0.9100262522697449, + "step": 5509 + }, + { + "epoch": 2.755, + "grad_norm": 2.69090550985229, + "learning_rate": 4.545374293562559e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8846222758293152, + "step": 5510 + }, + { + "epoch": 2.7555, + "grad_norm": 5.589600198047445, + "learning_rate": 4.545123369039391e-06, + "loss": 0.3138, + "mean_token_accuracy": 0.8963730335235596, + "step": 5511 + }, + { + "epoch": 2.7560000000000002, + "grad_norm": 4.369917463974314, + "learning_rate": 4.544872382218202e-06, + "loss": 0.263, + "mean_token_accuracy": 0.9060265421867371, + "step": 5512 + }, + { + "epoch": 2.7565, + "grad_norm": 1.6635814570545062, + "learning_rate": 4.544621333106638e-06, + "loss": 0.2978, + "mean_token_accuracy": 0.9065367579460144, + "step": 5513 + }, + { + "epoch": 2.757, + "grad_norm": 1.9264990649450118, + "learning_rate": 4.5443702217123474e-06, + "loss": 0.2727, + "mean_token_accuracy": 0.9001042246818542, + "step": 5514 + }, + { + "epoch": 2.7575, + "grad_norm": 2.473650037036931, + "learning_rate": 4.544119048042978e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8779770135879517, + "step": 5515 + }, + { + "epoch": 2.758, + "grad_norm": 12.153121517511886, + "learning_rate": 4.543867812106183e-06, + "loss": 0.2543, + "mean_token_accuracy": 0.9147078394889832, + "step": 5516 + }, + { + "epoch": 2.7584999999999997, + "grad_norm": 2.3904891816406213, + "learning_rate": 4.5436165139096135e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8802052736282349, + "step": 5517 + }, + { + "epoch": 2.759, + "grad_norm": 5.399861240345088, + "learning_rate": 4.543365153460925e-06, + "loss": 0.1859, + "mean_token_accuracy": 0.9349431395530701, + "step": 5518 + }, + { + "epoch": 2.7595, + "grad_norm": 2.6973606482288055, + "learning_rate": 4.5431137307677754e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8933983445167542, + "step": 5519 + }, + { + "epoch": 2.76, + "grad_norm": 2.6431629428311334, + "learning_rate": 4.542862245837821e-06, + "loss": 0.2616, + "mean_token_accuracy": 0.9100679755210876, + "step": 5520 + }, + { + "epoch": 2.7605, + "grad_norm": 14.291504225600828, + "learning_rate": 4.542610698678726e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8856469988822937, + "step": 5521 + }, + { + "epoch": 2.761, + "grad_norm": 1.5842725143191438, + "learning_rate": 4.5423590892981506e-06, + "loss": 0.2158, + "mean_token_accuracy": 0.9263157844543457, + "step": 5522 + }, + { + "epoch": 2.7615, + "grad_norm": 4.475520030005896, + "learning_rate": 4.542107417703759e-06, + "loss": 0.3, + "mean_token_accuracy": 0.8977175354957581, + "step": 5523 + }, + { + "epoch": 2.762, + "grad_norm": 1.9873467804478973, + "learning_rate": 4.541855683903219e-06, + "loss": 0.3127, + "mean_token_accuracy": 0.8919642567634583, + "step": 5524 + }, + { + "epoch": 2.7625, + "grad_norm": 2.146408244001285, + "learning_rate": 4.541603887904198e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8927250504493713, + "step": 5525 + }, + { + "epoch": 2.763, + "grad_norm": 2.0561521057187058, + "learning_rate": 4.541352029714366e-06, + "loss": 0.2962, + "mean_token_accuracy": 0.9013145565986633, + "step": 5526 + }, + { + "epoch": 2.7635, + "grad_norm": 2.629316411953996, + "learning_rate": 4.541100109341396e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8813438415527344, + "step": 5527 + }, + { + "epoch": 2.7640000000000002, + "grad_norm": 1.854038454960875, + "learning_rate": 4.54084812679296e-06, + "loss": 0.2488, + "mean_token_accuracy": 0.916301965713501, + "step": 5528 + }, + { + "epoch": 2.7645, + "grad_norm": 2.9212288189428524, + "learning_rate": 4.540596082076736e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8902190923690796, + "step": 5529 + }, + { + "epoch": 2.765, + "grad_norm": 2.3083579269025676, + "learning_rate": 4.540343975200401e-06, + "loss": 0.2591, + "mean_token_accuracy": 0.9077436327934265, + "step": 5530 + }, + { + "epoch": 2.7655, + "grad_norm": 2.1397484881463242, + "learning_rate": 4.540091806171634e-06, + "loss": 0.2339, + "mean_token_accuracy": 0.9137285351753235, + "step": 5531 + }, + { + "epoch": 2.766, + "grad_norm": 5.019206922450008, + "learning_rate": 4.539839574998117e-06, + "loss": 0.2223, + "mean_token_accuracy": 0.9277777671813965, + "step": 5532 + }, + { + "epoch": 2.7664999999999997, + "grad_norm": 3.2871281694054377, + "learning_rate": 4.5395872816875346e-06, + "loss": 0.3221, + "mean_token_accuracy": 0.891566276550293, + "step": 5533 + }, + { + "epoch": 2.767, + "grad_norm": 2.1634865311687403, + "learning_rate": 4.539334926247569e-06, + "loss": 0.2761, + "mean_token_accuracy": 0.9132630825042725, + "step": 5534 + }, + { + "epoch": 2.7675, + "grad_norm": 3.1812570732241716, + "learning_rate": 4.5390825086859094e-06, + "loss": 0.2707, + "mean_token_accuracy": 0.9126411080360413, + "step": 5535 + }, + { + "epoch": 2.768, + "grad_norm": 1.6556717834973746, + "learning_rate": 4.538830029010246e-06, + "loss": 0.2183, + "mean_token_accuracy": 0.9206249713897705, + "step": 5536 + }, + { + "epoch": 2.7685, + "grad_norm": 2.7831234135230254, + "learning_rate": 4.538577487228267e-06, + "loss": 0.318, + "mean_token_accuracy": 0.897122323513031, + "step": 5537 + }, + { + "epoch": 2.769, + "grad_norm": 2.3010678182471267, + "learning_rate": 4.538324883347668e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8854604363441467, + "step": 5538 + }, + { + "epoch": 2.7695, + "grad_norm": 1.8493130781063989, + "learning_rate": 4.538072217376141e-06, + "loss": 0.2461, + "mean_token_accuracy": 0.9164150357246399, + "step": 5539 + }, + { + "epoch": 2.77, + "grad_norm": 2.111664564079507, + "learning_rate": 4.537819489321385e-06, + "loss": 0.2756, + "mean_token_accuracy": 0.9119299650192261, + "step": 5540 + }, + { + "epoch": 2.7705, + "grad_norm": 2.5837763172283954, + "learning_rate": 4.537566699191099e-06, + "loss": 0.2545, + "mean_token_accuracy": 0.9109059572219849, + "step": 5541 + }, + { + "epoch": 2.771, + "grad_norm": 1.8313054527156332, + "learning_rate": 4.53731384699298e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8913586735725403, + "step": 5542 + }, + { + "epoch": 2.7715, + "grad_norm": 16.59649067972426, + "learning_rate": 4.537060932734734e-06, + "loss": 0.3243, + "mean_token_accuracy": 0.8910952806472778, + "step": 5543 + }, + { + "epoch": 2.7720000000000002, + "grad_norm": 1.7786275434790395, + "learning_rate": 4.536807956424063e-06, + "loss": 0.2532, + "mean_token_accuracy": 0.9079582095146179, + "step": 5544 + }, + { + "epoch": 2.7725, + "grad_norm": 2.3085937265486645, + "learning_rate": 4.536554918068673e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.8977604508399963, + "step": 5545 + }, + { + "epoch": 2.773, + "grad_norm": 2.46783815207786, + "learning_rate": 4.536301817676274e-06, + "loss": 0.3212, + "mean_token_accuracy": 0.9041153192520142, + "step": 5546 + }, + { + "epoch": 2.7735, + "grad_norm": 8.330178867475736, + "learning_rate": 4.5360486552545735e-06, + "loss": 0.262, + "mean_token_accuracy": 0.9164007902145386, + "step": 5547 + }, + { + "epoch": 2.774, + "grad_norm": 3.176133298338141, + "learning_rate": 4.535795430811285e-06, + "loss": 0.2518, + "mean_token_accuracy": 0.9091358780860901, + "step": 5548 + }, + { + "epoch": 2.7744999999999997, + "grad_norm": 3.5768214551414754, + "learning_rate": 4.535542144354121e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8981419801712036, + "step": 5549 + }, + { + "epoch": 2.775, + "grad_norm": 3.658169307790104, + "learning_rate": 4.535288795890799e-06, + "loss": 0.3131, + "mean_token_accuracy": 0.8940711617469788, + "step": 5550 + }, + { + "epoch": 2.7755, + "grad_norm": 2.022619031187644, + "learning_rate": 4.535035385429034e-06, + "loss": 0.2315, + "mean_token_accuracy": 0.9309035539627075, + "step": 5551 + }, + { + "epoch": 2.776, + "grad_norm": 2.628877055630572, + "learning_rate": 4.534781912976546e-06, + "loss": 0.3031, + "mean_token_accuracy": 0.90716552734375, + "step": 5552 + }, + { + "epoch": 2.7765, + "grad_norm": 2.220193998999231, + "learning_rate": 4.5345283785410565e-06, + "loss": 0.2984, + "mean_token_accuracy": 0.8932434916496277, + "step": 5553 + }, + { + "epoch": 2.777, + "grad_norm": 2.089639587127672, + "learning_rate": 4.534274782130289e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8951037526130676, + "step": 5554 + }, + { + "epoch": 2.7775, + "grad_norm": 1.707686159980268, + "learning_rate": 4.5340211237519685e-06, + "loss": 0.3164, + "mean_token_accuracy": 0.8968325853347778, + "step": 5555 + }, + { + "epoch": 2.778, + "grad_norm": 2.1448517466489387, + "learning_rate": 4.53376740341382e-06, + "loss": 0.2455, + "mean_token_accuracy": 0.9149329662322998, + "step": 5556 + }, + { + "epoch": 2.7785, + "grad_norm": 5.049051430984942, + "learning_rate": 4.533513621123575e-06, + "loss": 0.306, + "mean_token_accuracy": 0.8968756198883057, + "step": 5557 + }, + { + "epoch": 2.779, + "grad_norm": 2.0090343162819164, + "learning_rate": 4.533259776888963e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8875547647476196, + "step": 5558 + }, + { + "epoch": 2.7795, + "grad_norm": 2.6996016700230623, + "learning_rate": 4.533005870717716e-06, + "loss": 0.2684, + "mean_token_accuracy": 0.9097018241882324, + "step": 5559 + }, + { + "epoch": 2.7800000000000002, + "grad_norm": 1.98877097779092, + "learning_rate": 4.5327519026175694e-06, + "loss": 0.2665, + "mean_token_accuracy": 0.9105284214019775, + "step": 5560 + }, + { + "epoch": 2.7805, + "grad_norm": 2.812398269825169, + "learning_rate": 4.532497872596259e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8821045160293579, + "step": 5561 + }, + { + "epoch": 2.781, + "grad_norm": 1.94657732551644, + "learning_rate": 4.532243780661523e-06, + "loss": 0.2486, + "mean_token_accuracy": 0.9156747460365295, + "step": 5562 + }, + { + "epoch": 2.7815, + "grad_norm": 2.000742534999242, + "learning_rate": 4.5319896268211004e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8915544152259827, + "step": 5563 + }, + { + "epoch": 2.782, + "grad_norm": 2.1122119990793875, + "learning_rate": 4.531735411082735e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8820083737373352, + "step": 5564 + }, + { + "epoch": 2.7824999999999998, + "grad_norm": 1.910631347688033, + "learning_rate": 4.5314811334541695e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.888443112373352, + "step": 5565 + }, + { + "epoch": 2.783, + "grad_norm": 2.141930479138237, + "learning_rate": 4.531226793943151e-06, + "loss": 0.1933, + "mean_token_accuracy": 0.9314923882484436, + "step": 5566 + }, + { + "epoch": 2.7835, + "grad_norm": 1.8508420133042567, + "learning_rate": 4.530972392557426e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8779507875442505, + "step": 5567 + }, + { + "epoch": 2.784, + "grad_norm": 1.8866949126257884, + "learning_rate": 4.530717929304743e-06, + "loss": 0.2762, + "mean_token_accuracy": 0.9061086177825928, + "step": 5568 + }, + { + "epoch": 2.7845, + "grad_norm": 3.141032331374316, + "learning_rate": 4.530463404192856e-06, + "loss": 0.2895, + "mean_token_accuracy": 0.9014004468917847, + "step": 5569 + }, + { + "epoch": 2.785, + "grad_norm": 2.5344157505714247, + "learning_rate": 4.530208817229516e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8916084170341492, + "step": 5570 + }, + { + "epoch": 2.7855, + "grad_norm": 1.8784000405863956, + "learning_rate": 4.529954168422479e-06, + "loss": 0.2912, + "mean_token_accuracy": 0.900431215763092, + "step": 5571 + }, + { + "epoch": 2.786, + "grad_norm": 2.9022849118187493, + "learning_rate": 4.5296994577795025e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.896672785282135, + "step": 5572 + }, + { + "epoch": 2.7865, + "grad_norm": 1.7066613555458652, + "learning_rate": 4.529444685308345e-06, + "loss": 0.2292, + "mean_token_accuracy": 0.9130734205245972, + "step": 5573 + }, + { + "epoch": 2.787, + "grad_norm": 2.5024189983632024, + "learning_rate": 4.5291898510167665e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8889502882957458, + "step": 5574 + }, + { + "epoch": 2.7875, + "grad_norm": 2.3981757094833775, + "learning_rate": 4.528934954912531e-06, + "loss": 0.3034, + "mean_token_accuracy": 0.9046614170074463, + "step": 5575 + }, + { + "epoch": 2.7880000000000003, + "grad_norm": 1.819745575332574, + "learning_rate": 4.528679997003403e-06, + "loss": 0.303, + "mean_token_accuracy": 0.8933456540107727, + "step": 5576 + }, + { + "epoch": 2.7885, + "grad_norm": 5.726770797865063, + "learning_rate": 4.528424977297148e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8884174227714539, + "step": 5577 + }, + { + "epoch": 2.789, + "grad_norm": 2.197347966390515, + "learning_rate": 4.5281698958015344e-06, + "loss": 0.2929, + "mean_token_accuracy": 0.9051914811134338, + "step": 5578 + }, + { + "epoch": 2.7895, + "grad_norm": 1.9250062679582194, + "learning_rate": 4.527914752524334e-06, + "loss": 0.2926, + "mean_token_accuracy": 0.9036433100700378, + "step": 5579 + }, + { + "epoch": 2.79, + "grad_norm": 2.299976042193658, + "learning_rate": 4.527659547473317e-06, + "loss": 0.3098, + "mean_token_accuracy": 0.8983084559440613, + "step": 5580 + }, + { + "epoch": 2.7904999999999998, + "grad_norm": 2.3610798267441, + "learning_rate": 4.527404280656259e-06, + "loss": 0.2997, + "mean_token_accuracy": 0.9005848169326782, + "step": 5581 + }, + { + "epoch": 2.791, + "grad_norm": 4.488184443159804, + "learning_rate": 4.527148952080934e-06, + "loss": 0.2794, + "mean_token_accuracy": 0.898605465888977, + "step": 5582 + }, + { + "epoch": 2.7915, + "grad_norm": 1.581707902198211, + "learning_rate": 4.526893561755121e-06, + "loss": 0.2312, + "mean_token_accuracy": 0.9248298406600952, + "step": 5583 + }, + { + "epoch": 2.792, + "grad_norm": 2.1691792021001928, + "learning_rate": 4.5266381096866e-06, + "loss": 0.2895, + "mean_token_accuracy": 0.9092698693275452, + "step": 5584 + }, + { + "epoch": 2.7925, + "grad_norm": 2.310415529486908, + "learning_rate": 4.526382595883152e-06, + "loss": 0.2717, + "mean_token_accuracy": 0.9062448143959045, + "step": 5585 + }, + { + "epoch": 2.793, + "grad_norm": 4.050229600972278, + "learning_rate": 4.5261270203525605e-06, + "loss": 0.197, + "mean_token_accuracy": 0.932708203792572, + "step": 5586 + }, + { + "epoch": 2.7935, + "grad_norm": 3.690899083499061, + "learning_rate": 4.52587138310261e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8798239827156067, + "step": 5587 + }, + { + "epoch": 2.794, + "grad_norm": 2.2314638393215693, + "learning_rate": 4.525615684141089e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8973873257637024, + "step": 5588 + }, + { + "epoch": 2.7945, + "grad_norm": 2.1267681947943604, + "learning_rate": 4.525359923475785e-06, + "loss": 0.3095, + "mean_token_accuracy": 0.9010432362556458, + "step": 5589 + }, + { + "epoch": 2.795, + "grad_norm": 2.075540171121318, + "learning_rate": 4.5251041011144905e-06, + "loss": 0.2899, + "mean_token_accuracy": 0.9106197953224182, + "step": 5590 + }, + { + "epoch": 2.7955, + "grad_norm": 2.3349885850511902, + "learning_rate": 4.524848217064997e-06, + "loss": 0.2702, + "mean_token_accuracy": 0.9141981601715088, + "step": 5591 + }, + { + "epoch": 2.7960000000000003, + "grad_norm": 1.7409658305911457, + "learning_rate": 4.5245922713351e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.8979422450065613, + "step": 5592 + }, + { + "epoch": 2.7965, + "grad_norm": 1.5991713830539134, + "learning_rate": 4.524336263932596e-06, + "loss": 0.2386, + "mean_token_accuracy": 0.913755476474762, + "step": 5593 + }, + { + "epoch": 2.797, + "grad_norm": 5.552520669847028, + "learning_rate": 4.524080194865283e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8561612963676453, + "step": 5594 + }, + { + "epoch": 2.7975, + "grad_norm": 4.213343415567021, + "learning_rate": 4.523824064140961e-06, + "loss": 0.278, + "mean_token_accuracy": 0.9085747599601746, + "step": 5595 + }, + { + "epoch": 2.798, + "grad_norm": 1.8080054292662056, + "learning_rate": 4.523567871767433e-06, + "loss": 0.3056, + "mean_token_accuracy": 0.9020368456840515, + "step": 5596 + }, + { + "epoch": 2.7984999999999998, + "grad_norm": 2.5130203533907984, + "learning_rate": 4.523311617752504e-06, + "loss": 0.2029, + "mean_token_accuracy": 0.9293877482414246, + "step": 5597 + }, + { + "epoch": 2.799, + "grad_norm": 2.5505679732280333, + "learning_rate": 4.523055302103977e-06, + "loss": 0.3151, + "mean_token_accuracy": 0.9014285802841187, + "step": 5598 + }, + { + "epoch": 2.7995, + "grad_norm": 4.229885427302238, + "learning_rate": 4.522798924829662e-06, + "loss": 0.255, + "mean_token_accuracy": 0.9060158133506775, + "step": 5599 + }, + { + "epoch": 2.8, + "grad_norm": 1.6641484916901468, + "learning_rate": 4.522542485937369e-06, + "loss": 0.2485, + "mean_token_accuracy": 0.9151095151901245, + "step": 5600 + }, + { + "epoch": 2.8005, + "grad_norm": 1.9488686627605782, + "learning_rate": 4.522285985434908e-06, + "loss": 0.2913, + "mean_token_accuracy": 0.8979282975196838, + "step": 5601 + }, + { + "epoch": 2.801, + "grad_norm": 5.631859307084084, + "learning_rate": 4.522029423330094e-06, + "loss": 0.2635, + "mean_token_accuracy": 0.9165154099464417, + "step": 5602 + }, + { + "epoch": 2.8015, + "grad_norm": 1.7030661937989273, + "learning_rate": 4.521772799630741e-06, + "loss": 0.2676, + "mean_token_accuracy": 0.9098536372184753, + "step": 5603 + }, + { + "epoch": 2.802, + "grad_norm": 1.992869588823932, + "learning_rate": 4.521516114344667e-06, + "loss": 0.3133, + "mean_token_accuracy": 0.8964337110519409, + "step": 5604 + }, + { + "epoch": 2.8025, + "grad_norm": 10.33759811304663, + "learning_rate": 4.521259367479691e-06, + "loss": 0.2999, + "mean_token_accuracy": 0.8998058438301086, + "step": 5605 + }, + { + "epoch": 2.803, + "grad_norm": 13.851777639058302, + "learning_rate": 4.521002559043633e-06, + "loss": 0.2884, + "mean_token_accuracy": 0.9013766050338745, + "step": 5606 + }, + { + "epoch": 2.8035, + "grad_norm": 1.2655014291906663, + "learning_rate": 4.520745689044317e-06, + "loss": 0.1774, + "mean_token_accuracy": 0.9286795854568481, + "step": 5607 + }, + { + "epoch": 2.8040000000000003, + "grad_norm": 1.4850366275427955, + "learning_rate": 4.520488757489568e-06, + "loss": 0.1773, + "mean_token_accuracy": 0.9354706406593323, + "step": 5608 + }, + { + "epoch": 2.8045, + "grad_norm": 2.0665453980459123, + "learning_rate": 4.520231764387212e-06, + "loss": 0.2111, + "mean_token_accuracy": 0.9326328635215759, + "step": 5609 + }, + { + "epoch": 2.805, + "grad_norm": 9.588909827214172, + "learning_rate": 4.519974709745076e-06, + "loss": 0.311, + "mean_token_accuracy": 0.8962699770927429, + "step": 5610 + }, + { + "epoch": 2.8055, + "grad_norm": 1.9298865151771591, + "learning_rate": 4.519717593570993e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.8895358443260193, + "step": 5611 + }, + { + "epoch": 2.806, + "grad_norm": 2.577779680781753, + "learning_rate": 4.5194604158727935e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8911740779876709, + "step": 5612 + }, + { + "epoch": 2.8064999999999998, + "grad_norm": 2.057974570605946, + "learning_rate": 4.5192031766583135e-06, + "loss": 0.2723, + "mean_token_accuracy": 0.906260073184967, + "step": 5613 + }, + { + "epoch": 2.807, + "grad_norm": 3.6893699918849086, + "learning_rate": 4.518945875935386e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8839547038078308, + "step": 5614 + }, + { + "epoch": 2.8075, + "grad_norm": 2.3798731847462813, + "learning_rate": 4.51868851371185e-06, + "loss": 0.3059, + "mean_token_accuracy": 0.9002476334571838, + "step": 5615 + }, + { + "epoch": 2.808, + "grad_norm": 2.15643172197989, + "learning_rate": 4.518431089995546e-06, + "loss": 0.2017, + "mean_token_accuracy": 0.9348798394203186, + "step": 5616 + }, + { + "epoch": 2.8085, + "grad_norm": 2.259206163917874, + "learning_rate": 4.518173604794315e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.9039104580879211, + "step": 5617 + }, + { + "epoch": 2.809, + "grad_norm": 1.7798110425262266, + "learning_rate": 4.517916058116001e-06, + "loss": 0.1997, + "mean_token_accuracy": 0.9271255135536194, + "step": 5618 + }, + { + "epoch": 2.8095, + "grad_norm": 2.683821150929941, + "learning_rate": 4.517658449968449e-06, + "loss": 0.2344, + "mean_token_accuracy": 0.9191957712173462, + "step": 5619 + }, + { + "epoch": 2.81, + "grad_norm": 2.3586815555526908, + "learning_rate": 4.517400780359505e-06, + "loss": 0.2883, + "mean_token_accuracy": 0.907579779624939, + "step": 5620 + }, + { + "epoch": 2.8105, + "grad_norm": 3.281578415955726, + "learning_rate": 4.517143049297021e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.895476758480072, + "step": 5621 + }, + { + "epoch": 2.811, + "grad_norm": 2.200403822540812, + "learning_rate": 4.516885256788844e-06, + "loss": 0.2594, + "mean_token_accuracy": 0.9144413471221924, + "step": 5622 + }, + { + "epoch": 2.8115, + "grad_norm": 2.038747781693034, + "learning_rate": 4.516627402842829e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.8964664936065674, + "step": 5623 + }, + { + "epoch": 2.8120000000000003, + "grad_norm": 2.2190026858118412, + "learning_rate": 4.516369487466832e-06, + "loss": 0.2825, + "mean_token_accuracy": 0.9127712845802307, + "step": 5624 + }, + { + "epoch": 2.8125, + "grad_norm": 2.3956002410468953, + "learning_rate": 4.516111510668707e-06, + "loss": 0.3237, + "mean_token_accuracy": 0.8933722972869873, + "step": 5625 + }, + { + "epoch": 2.8129999999999997, + "grad_norm": 2.5578015000865117, + "learning_rate": 4.515853472456314e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8855421543121338, + "step": 5626 + }, + { + "epoch": 2.8135, + "grad_norm": 3.6114277409792326, + "learning_rate": 4.5155953728375125e-06, + "loss": 0.2877, + "mean_token_accuracy": 0.9029585719108582, + "step": 5627 + }, + { + "epoch": 2.814, + "grad_norm": 1.9722906201754444, + "learning_rate": 4.515337211820165e-06, + "loss": 0.2532, + "mean_token_accuracy": 0.9118175506591797, + "step": 5628 + }, + { + "epoch": 2.8145, + "grad_norm": 2.466115956461878, + "learning_rate": 4.515078989412135e-06, + "loss": 0.2807, + "mean_token_accuracy": 0.901802122592926, + "step": 5629 + }, + { + "epoch": 2.815, + "grad_norm": 4.519325179127456, + "learning_rate": 4.51482070562129e-06, + "loss": 0.3069, + "mean_token_accuracy": 0.8995522856712341, + "step": 5630 + }, + { + "epoch": 2.8155, + "grad_norm": 1.61382590592971, + "learning_rate": 4.514562360455496e-06, + "loss": 0.2823, + "mean_token_accuracy": 0.9052750468254089, + "step": 5631 + }, + { + "epoch": 2.816, + "grad_norm": 2.737924175799206, + "learning_rate": 4.514303953922623e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8868404030799866, + "step": 5632 + }, + { + "epoch": 2.8165, + "grad_norm": 3.171339896096666, + "learning_rate": 4.5140454860305435e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8971068263053894, + "step": 5633 + }, + { + "epoch": 2.817, + "grad_norm": 3.006071253680055, + "learning_rate": 4.51378695678713e-06, + "loss": 0.2593, + "mean_token_accuracy": 0.9137289524078369, + "step": 5634 + }, + { + "epoch": 2.8175, + "grad_norm": 1.8897607537930243, + "learning_rate": 4.513528366200258e-06, + "loss": 0.262, + "mean_token_accuracy": 0.9093211889266968, + "step": 5635 + }, + { + "epoch": 2.818, + "grad_norm": 2.1786582660003866, + "learning_rate": 4.5132697142778045e-06, + "loss": 0.2784, + "mean_token_accuracy": 0.907843828201294, + "step": 5636 + }, + { + "epoch": 2.8185000000000002, + "grad_norm": 2.760475930484618, + "learning_rate": 4.51301100102765e-06, + "loss": 0.2902, + "mean_token_accuracy": 0.9063636660575867, + "step": 5637 + }, + { + "epoch": 2.819, + "grad_norm": 2.1276388011226874, + "learning_rate": 4.512752226457673e-06, + "loss": 0.2505, + "mean_token_accuracy": 0.9133573770523071, + "step": 5638 + }, + { + "epoch": 2.8195, + "grad_norm": 1.711291014850173, + "learning_rate": 4.512493390575757e-06, + "loss": 0.2731, + "mean_token_accuracy": 0.9052115082740784, + "step": 5639 + }, + { + "epoch": 2.82, + "grad_norm": 2.2204009623916794, + "learning_rate": 4.512234493389785e-06, + "loss": 0.2861, + "mean_token_accuracy": 0.907633900642395, + "step": 5640 + }, + { + "epoch": 2.8205, + "grad_norm": 3.055672240821593, + "learning_rate": 4.511975534907648e-06, + "loss": 0.2092, + "mean_token_accuracy": 0.9219738245010376, + "step": 5641 + }, + { + "epoch": 2.8209999999999997, + "grad_norm": 2.169716525024683, + "learning_rate": 4.51171651513723e-06, + "loss": 0.3153, + "mean_token_accuracy": 0.894800066947937, + "step": 5642 + }, + { + "epoch": 2.8215, + "grad_norm": 1.9235300936686728, + "learning_rate": 4.511457434086423e-06, + "loss": 0.3139, + "mean_token_accuracy": 0.8921043872833252, + "step": 5643 + }, + { + "epoch": 2.822, + "grad_norm": 2.5111554093060917, + "learning_rate": 4.511198291763119e-06, + "loss": 0.2431, + "mean_token_accuracy": 0.9280789494514465, + "step": 5644 + }, + { + "epoch": 2.8225, + "grad_norm": 2.241078994978536, + "learning_rate": 4.510939088175211e-06, + "loss": 0.2266, + "mean_token_accuracy": 0.9245237708091736, + "step": 5645 + }, + { + "epoch": 2.823, + "grad_norm": 2.2975309948287927, + "learning_rate": 4.510679823330597e-06, + "loss": 0.2678, + "mean_token_accuracy": 0.9106650948524475, + "step": 5646 + }, + { + "epoch": 2.8235, + "grad_norm": 4.345306350191806, + "learning_rate": 4.510420497237172e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8923097848892212, + "step": 5647 + }, + { + "epoch": 2.824, + "grad_norm": 3.2416930208849943, + "learning_rate": 4.510161109902837e-06, + "loss": 0.2416, + "mean_token_accuracy": 0.9105252027511597, + "step": 5648 + }, + { + "epoch": 2.8245, + "grad_norm": 4.52280171834477, + "learning_rate": 4.509901661335493e-06, + "loss": 0.2419, + "mean_token_accuracy": 0.9167889356613159, + "step": 5649 + }, + { + "epoch": 2.825, + "grad_norm": 2.1438149974157072, + "learning_rate": 4.509642151543043e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8889084458351135, + "step": 5650 + }, + { + "epoch": 2.8255, + "grad_norm": 5.617379121007214, + "learning_rate": 4.509382580533394e-06, + "loss": 0.2111, + "mean_token_accuracy": 0.9264647364616394, + "step": 5651 + }, + { + "epoch": 2.826, + "grad_norm": 2.978515997623906, + "learning_rate": 4.50912294831445e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8729264140129089, + "step": 5652 + }, + { + "epoch": 2.8265000000000002, + "grad_norm": 2.3684313004874467, + "learning_rate": 4.508863254894121e-06, + "loss": 0.2958, + "mean_token_accuracy": 0.9075272679328918, + "step": 5653 + }, + { + "epoch": 2.827, + "grad_norm": 2.573233876104837, + "learning_rate": 4.5086035002803195e-06, + "loss": 0.2562, + "mean_token_accuracy": 0.9229224920272827, + "step": 5654 + }, + { + "epoch": 2.8275, + "grad_norm": 3.370568335871466, + "learning_rate": 4.508343684480956e-06, + "loss": 0.2886, + "mean_token_accuracy": 0.912138044834137, + "step": 5655 + }, + { + "epoch": 2.828, + "grad_norm": 2.827787609861796, + "learning_rate": 4.508083807503945e-06, + "loss": 0.2805, + "mean_token_accuracy": 0.8983522653579712, + "step": 5656 + }, + { + "epoch": 2.8285, + "grad_norm": 1.3672815338870525, + "learning_rate": 4.507823869357204e-06, + "loss": 0.1916, + "mean_token_accuracy": 0.9271330833435059, + "step": 5657 + }, + { + "epoch": 2.8289999999999997, + "grad_norm": 2.176076573933377, + "learning_rate": 4.5075638700486505e-06, + "loss": 0.246, + "mean_token_accuracy": 0.9191875457763672, + "step": 5658 + }, + { + "epoch": 2.8295, + "grad_norm": 2.6615031936993536, + "learning_rate": 4.507303809586203e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8690573573112488, + "step": 5659 + }, + { + "epoch": 2.83, + "grad_norm": 1.4961264149506823, + "learning_rate": 4.507043687977787e-06, + "loss": 0.251, + "mean_token_accuracy": 0.9092351794242859, + "step": 5660 + }, + { + "epoch": 2.8305, + "grad_norm": 2.2369940314891186, + "learning_rate": 4.506783505231323e-06, + "loss": 0.2517, + "mean_token_accuracy": 0.9192754626274109, + "step": 5661 + }, + { + "epoch": 2.831, + "grad_norm": 1.5660398325655722, + "learning_rate": 4.506523261354739e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8776408433914185, + "step": 5662 + }, + { + "epoch": 2.8315, + "grad_norm": 3.055398857585935, + "learning_rate": 4.50626295635596e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8947368264198303, + "step": 5663 + }, + { + "epoch": 2.832, + "grad_norm": 3.023247536819923, + "learning_rate": 4.506002590242917e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.8845769762992859, + "step": 5664 + }, + { + "epoch": 2.8325, + "grad_norm": 2.542258576421957, + "learning_rate": 4.505742163023541e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8791165351867676, + "step": 5665 + }, + { + "epoch": 2.833, + "grad_norm": 2.130998551549632, + "learning_rate": 4.5054816747057645e-06, + "loss": 0.2147, + "mean_token_accuracy": 0.9358530044555664, + "step": 5666 + }, + { + "epoch": 2.8335, + "grad_norm": 2.0814297683237672, + "learning_rate": 4.505221125297523e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8823434114456177, + "step": 5667 + }, + { + "epoch": 2.834, + "grad_norm": 1.8682082578189265, + "learning_rate": 4.504960514806753e-06, + "loss": 0.2136, + "mean_token_accuracy": 0.9283110499382019, + "step": 5668 + }, + { + "epoch": 2.8345000000000002, + "grad_norm": 14.608262667700403, + "learning_rate": 4.504699843241394e-06, + "loss": 0.2388, + "mean_token_accuracy": 0.9153509736061096, + "step": 5669 + }, + { + "epoch": 2.835, + "grad_norm": 36.87103637966103, + "learning_rate": 4.504439110609385e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8813016414642334, + "step": 5670 + }, + { + "epoch": 2.8355, + "grad_norm": 2.178837796232762, + "learning_rate": 4.50417831691867e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8975565433502197, + "step": 5671 + }, + { + "epoch": 2.836, + "grad_norm": 3.0496690155597195, + "learning_rate": 4.503917462177192e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8890945315361023, + "step": 5672 + }, + { + "epoch": 2.8365, + "grad_norm": 10.281721701653963, + "learning_rate": 4.503656546392897e-06, + "loss": 0.3091, + "mean_token_accuracy": 0.9012598991394043, + "step": 5673 + }, + { + "epoch": 2.8369999999999997, + "grad_norm": 2.1553243345922755, + "learning_rate": 4.503395569573734e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8809811472892761, + "step": 5674 + }, + { + "epoch": 2.8375, + "grad_norm": 2.157751486671353, + "learning_rate": 4.503134531727652e-06, + "loss": 0.2776, + "mean_token_accuracy": 0.909469723701477, + "step": 5675 + }, + { + "epoch": 2.838, + "grad_norm": 2.540466008906768, + "learning_rate": 4.502873432862603e-06, + "loss": 0.2207, + "mean_token_accuracy": 0.9243592619895935, + "step": 5676 + }, + { + "epoch": 2.8385, + "grad_norm": 1.881283572019036, + "learning_rate": 4.5026122729865405e-06, + "loss": 0.2463, + "mean_token_accuracy": 0.9154345393180847, + "step": 5677 + }, + { + "epoch": 2.839, + "grad_norm": 1.8918655819822572, + "learning_rate": 4.50235105210742e-06, + "loss": 0.2753, + "mean_token_accuracy": 0.8926564455032349, + "step": 5678 + }, + { + "epoch": 2.8395, + "grad_norm": 2.967662359963161, + "learning_rate": 4.502089770233198e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.8859023451805115, + "step": 5679 + }, + { + "epoch": 2.84, + "grad_norm": 2.253226081719211, + "learning_rate": 4.501828427371834e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8879017233848572, + "step": 5680 + }, + { + "epoch": 2.8405, + "grad_norm": 2.7626857672906473, + "learning_rate": 4.50156702353129e-06, + "loss": 0.3102, + "mean_token_accuracy": 0.8936658501625061, + "step": 5681 + }, + { + "epoch": 2.841, + "grad_norm": 1.8313644233140116, + "learning_rate": 4.501305558719527e-06, + "loss": 0.2727, + "mean_token_accuracy": 0.9011239409446716, + "step": 5682 + }, + { + "epoch": 2.8415, + "grad_norm": 2.4445118083676793, + "learning_rate": 4.501044032944511e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8835116028785706, + "step": 5683 + }, + { + "epoch": 2.842, + "grad_norm": 1.3713471016032124, + "learning_rate": 4.500782446214208e-06, + "loss": 0.2592, + "mean_token_accuracy": 0.9004398584365845, + "step": 5684 + }, + { + "epoch": 2.8425000000000002, + "grad_norm": 1.9981629089685853, + "learning_rate": 4.5005207985365875e-06, + "loss": 0.2747, + "mean_token_accuracy": 0.9122386574745178, + "step": 5685 + }, + { + "epoch": 2.843, + "grad_norm": 3.845373779924175, + "learning_rate": 4.500259089919618e-06, + "loss": 0.2612, + "mean_token_accuracy": 0.9105879068374634, + "step": 5686 + }, + { + "epoch": 2.8435, + "grad_norm": 2.973611666280204, + "learning_rate": 4.499997320371271e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8900130391120911, + "step": 5687 + }, + { + "epoch": 2.844, + "grad_norm": 1.3868102849543844, + "learning_rate": 4.499735489899524e-06, + "loss": 0.1862, + "mean_token_accuracy": 0.9277787804603577, + "step": 5688 + }, + { + "epoch": 2.8445, + "grad_norm": 6.748829429030178, + "learning_rate": 4.499473598512349e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8881560564041138, + "step": 5689 + }, + { + "epoch": 2.8449999999999998, + "grad_norm": 1.3544279534074841, + "learning_rate": 4.4992116462177274e-06, + "loss": 0.2007, + "mean_token_accuracy": 0.927601158618927, + "step": 5690 + }, + { + "epoch": 2.8455, + "grad_norm": 1.7272880178703949, + "learning_rate": 4.498949633023635e-06, + "loss": 0.3111, + "mean_token_accuracy": 0.8963778018951416, + "step": 5691 + }, + { + "epoch": 2.846, + "grad_norm": 2.010035833357709, + "learning_rate": 4.498687558938055e-06, + "loss": 0.2275, + "mean_token_accuracy": 0.9219101071357727, + "step": 5692 + }, + { + "epoch": 2.8465, + "grad_norm": 1.7209476621731288, + "learning_rate": 4.4984254239689705e-06, + "loss": 0.2265, + "mean_token_accuracy": 0.9163212776184082, + "step": 5693 + }, + { + "epoch": 2.847, + "grad_norm": 2.3113847324415877, + "learning_rate": 4.498163228124366e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8997781276702881, + "step": 5694 + }, + { + "epoch": 2.8475, + "grad_norm": 2.01107947316566, + "learning_rate": 4.49790097141223e-06, + "loss": 0.2499, + "mean_token_accuracy": 0.9159758687019348, + "step": 5695 + }, + { + "epoch": 2.848, + "grad_norm": 1.8850002916517044, + "learning_rate": 4.49763865384055e-06, + "loss": 0.2878, + "mean_token_accuracy": 0.8964011073112488, + "step": 5696 + }, + { + "epoch": 2.8485, + "grad_norm": 1.8978120104262488, + "learning_rate": 4.497376275417317e-06, + "loss": 0.2365, + "mean_token_accuracy": 0.908835232257843, + "step": 5697 + }, + { + "epoch": 2.849, + "grad_norm": 2.2323901766264274, + "learning_rate": 4.497113836150523e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8925818204879761, + "step": 5698 + }, + { + "epoch": 2.8495, + "grad_norm": 2.2823075295114696, + "learning_rate": 4.496851336048163e-06, + "loss": 0.2882, + "mean_token_accuracy": 0.9007455706596375, + "step": 5699 + }, + { + "epoch": 2.85, + "grad_norm": 2.189768097712397, + "learning_rate": 4.496588775118232e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8849626183509827, + "step": 5700 + }, + { + "epoch": 2.8505000000000003, + "grad_norm": 2.814486178311195, + "learning_rate": 4.496326153368731e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8979631662368774, + "step": 5701 + }, + { + "epoch": 2.851, + "grad_norm": 5.0001094660298575, + "learning_rate": 4.496063470807657e-06, + "loss": 0.2812, + "mean_token_accuracy": 0.9045643210411072, + "step": 5702 + }, + { + "epoch": 2.8515, + "grad_norm": 2.0236158644711804, + "learning_rate": 4.495800727443012e-06, + "loss": 0.2317, + "mean_token_accuracy": 0.9150469303131104, + "step": 5703 + }, + { + "epoch": 2.852, + "grad_norm": 3.8068618625659623, + "learning_rate": 4.4955379232828014e-06, + "loss": 0.2105, + "mean_token_accuracy": 0.9315037131309509, + "step": 5704 + }, + { + "epoch": 2.8525, + "grad_norm": 4.455014423932426, + "learning_rate": 4.495275058335029e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8903541564941406, + "step": 5705 + }, + { + "epoch": 2.8529999999999998, + "grad_norm": 2.135284092656172, + "learning_rate": 4.495012132607703e-06, + "loss": 0.2899, + "mean_token_accuracy": 0.9004223346710205, + "step": 5706 + }, + { + "epoch": 2.8535, + "grad_norm": 2.2651250534454617, + "learning_rate": 4.494749146108832e-06, + "loss": 0.2779, + "mean_token_accuracy": 0.9014787077903748, + "step": 5707 + }, + { + "epoch": 2.854, + "grad_norm": 1.9741118726024984, + "learning_rate": 4.494486098846428e-06, + "loss": 0.2595, + "mean_token_accuracy": 0.9118269681930542, + "step": 5708 + }, + { + "epoch": 2.8545, + "grad_norm": 1.8550597756862688, + "learning_rate": 4.494222990828503e-06, + "loss": 0.2729, + "mean_token_accuracy": 0.9031816124916077, + "step": 5709 + }, + { + "epoch": 2.855, + "grad_norm": 2.976175987615147, + "learning_rate": 4.4939598220630724e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8940815329551697, + "step": 5710 + }, + { + "epoch": 2.8555, + "grad_norm": 1.5985799007282884, + "learning_rate": 4.493696592558151e-06, + "loss": 0.2456, + "mean_token_accuracy": 0.9104251265525818, + "step": 5711 + }, + { + "epoch": 2.856, + "grad_norm": 2.832437107484235, + "learning_rate": 4.493433302321759e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8898587822914124, + "step": 5712 + }, + { + "epoch": 2.8565, + "grad_norm": 2.098304626400404, + "learning_rate": 4.493169951361917e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8919762969017029, + "step": 5713 + }, + { + "epoch": 2.857, + "grad_norm": 9.087178678292947, + "learning_rate": 4.492906539686646e-06, + "loss": 0.2273, + "mean_token_accuracy": 0.9141457080841064, + "step": 5714 + }, + { + "epoch": 2.8575, + "grad_norm": 1.9905101817594983, + "learning_rate": 4.49264306730397e-06, + "loss": 0.2791, + "mean_token_accuracy": 0.9119900465011597, + "step": 5715 + }, + { + "epoch": 2.858, + "grad_norm": 5.026128256947192, + "learning_rate": 4.492379534221916e-06, + "loss": 0.1814, + "mean_token_accuracy": 0.9367321729660034, + "step": 5716 + }, + { + "epoch": 2.8585000000000003, + "grad_norm": 2.202886452905111, + "learning_rate": 4.49211594044851e-06, + "loss": 0.2276, + "mean_token_accuracy": 0.9228689670562744, + "step": 5717 + }, + { + "epoch": 2.859, + "grad_norm": 3.1624772481954797, + "learning_rate": 4.491852285991784e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8857467770576477, + "step": 5718 + }, + { + "epoch": 2.8595, + "grad_norm": 2.2120143047055834, + "learning_rate": 4.491588570859766e-06, + "loss": 0.2875, + "mean_token_accuracy": 0.9040845632553101, + "step": 5719 + }, + { + "epoch": 2.86, + "grad_norm": 2.8171313435877594, + "learning_rate": 4.491324795060491e-06, + "loss": 0.3075, + "mean_token_accuracy": 0.896238386631012, + "step": 5720 + }, + { + "epoch": 2.8605, + "grad_norm": 6.366330464238481, + "learning_rate": 4.491060958601995e-06, + "loss": 0.2884, + "mean_token_accuracy": 0.8996809720993042, + "step": 5721 + }, + { + "epoch": 2.8609999999999998, + "grad_norm": 2.704828431494217, + "learning_rate": 4.490797061492314e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8750221729278564, + "step": 5722 + }, + { + "epoch": 2.8615, + "grad_norm": 3.974909204194204, + "learning_rate": 4.490533103739486e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8947092890739441, + "step": 5723 + }, + { + "epoch": 2.862, + "grad_norm": 2.168076075465188, + "learning_rate": 4.490269085351552e-06, + "loss": 0.2871, + "mean_token_accuracy": 0.9054909348487854, + "step": 5724 + }, + { + "epoch": 2.8625, + "grad_norm": 1.7535643656192847, + "learning_rate": 4.490005006336555e-06, + "loss": 0.2508, + "mean_token_accuracy": 0.9093124270439148, + "step": 5725 + }, + { + "epoch": 2.863, + "grad_norm": 2.6003333027548243, + "learning_rate": 4.48974086670254e-06, + "loss": 0.346, + "mean_token_accuracy": 0.885515034198761, + "step": 5726 + }, + { + "epoch": 2.8635, + "grad_norm": 2.536119681645746, + "learning_rate": 4.489476666457552e-06, + "loss": 0.311, + "mean_token_accuracy": 0.8925890922546387, + "step": 5727 + }, + { + "epoch": 2.864, + "grad_norm": 4.116733956151901, + "learning_rate": 4.4892124056096386e-06, + "loss": 0.2475, + "mean_token_accuracy": 0.9143725037574768, + "step": 5728 + }, + { + "epoch": 2.8645, + "grad_norm": 3.0597189106667297, + "learning_rate": 4.488948084166851e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8785253167152405, + "step": 5729 + }, + { + "epoch": 2.865, + "grad_norm": 2.5218860247646693, + "learning_rate": 4.48868370213724e-06, + "loss": 0.3068, + "mean_token_accuracy": 0.8993933796882629, + "step": 5730 + }, + { + "epoch": 2.8655, + "grad_norm": 1.831871213334096, + "learning_rate": 4.488419259528859e-06, + "loss": 0.2308, + "mean_token_accuracy": 0.9192455410957336, + "step": 5731 + }, + { + "epoch": 2.866, + "grad_norm": 2.0841450544230993, + "learning_rate": 4.488154756349765e-06, + "loss": 0.3078, + "mean_token_accuracy": 0.8976624608039856, + "step": 5732 + }, + { + "epoch": 2.8665000000000003, + "grad_norm": 2.0222850188042263, + "learning_rate": 4.487890192608013e-06, + "loss": 0.3045, + "mean_token_accuracy": 0.8853784203529358, + "step": 5733 + }, + { + "epoch": 2.867, + "grad_norm": 3.3543375849042163, + "learning_rate": 4.487625568311663e-06, + "loss": 0.297, + "mean_token_accuracy": 0.9048804044723511, + "step": 5734 + }, + { + "epoch": 2.8675, + "grad_norm": 2.990712875723516, + "learning_rate": 4.487360883468775e-06, + "loss": 0.2733, + "mean_token_accuracy": 0.9000340104103088, + "step": 5735 + }, + { + "epoch": 2.868, + "grad_norm": 1.794780762024139, + "learning_rate": 4.487096138087415e-06, + "loss": 0.2257, + "mean_token_accuracy": 0.9243652820587158, + "step": 5736 + }, + { + "epoch": 2.8685, + "grad_norm": 3.274596099230943, + "learning_rate": 4.486831332175643e-06, + "loss": 0.1997, + "mean_token_accuracy": 0.928175687789917, + "step": 5737 + }, + { + "epoch": 2.8689999999999998, + "grad_norm": 2.4008090923904284, + "learning_rate": 4.486566465741528e-06, + "loss": 0.2641, + "mean_token_accuracy": 0.9155565500259399, + "step": 5738 + }, + { + "epoch": 2.8695, + "grad_norm": 2.5902830608912892, + "learning_rate": 4.48630153879314e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.883090615272522, + "step": 5739 + }, + { + "epoch": 2.87, + "grad_norm": 14.634786139885291, + "learning_rate": 4.4860365513385456e-06, + "loss": 0.2626, + "mean_token_accuracy": 0.9157874584197998, + "step": 5740 + }, + { + "epoch": 2.8705, + "grad_norm": 2.4153929310307434, + "learning_rate": 4.485771503385818e-06, + "loss": 0.309, + "mean_token_accuracy": 0.8890052437782288, + "step": 5741 + }, + { + "epoch": 2.871, + "grad_norm": 2.4447408457690476, + "learning_rate": 4.485506394943033e-06, + "loss": 0.285, + "mean_token_accuracy": 0.9078730344772339, + "step": 5742 + }, + { + "epoch": 2.8715, + "grad_norm": 5.305474373420247, + "learning_rate": 4.485241226018264e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8674609065055847, + "step": 5743 + }, + { + "epoch": 2.872, + "grad_norm": 2.1592683362845113, + "learning_rate": 4.4849759966195885e-06, + "loss": 0.2611, + "mean_token_accuracy": 0.9106693267822266, + "step": 5744 + }, + { + "epoch": 2.8725, + "grad_norm": 1.6998676292072767, + "learning_rate": 4.484710706755087e-06, + "loss": 0.2907, + "mean_token_accuracy": 0.9057598114013672, + "step": 5745 + }, + { + "epoch": 2.873, + "grad_norm": 1.6367153032340382, + "learning_rate": 4.48444535643284e-06, + "loss": 0.2539, + "mean_token_accuracy": 0.9100884199142456, + "step": 5746 + }, + { + "epoch": 2.8735, + "grad_norm": 12.676284409622694, + "learning_rate": 4.484179945660931e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8992902636528015, + "step": 5747 + }, + { + "epoch": 2.874, + "grad_norm": 2.3489019019218604, + "learning_rate": 4.483914474447445e-06, + "loss": 0.2119, + "mean_token_accuracy": 0.9207371473312378, + "step": 5748 + }, + { + "epoch": 2.8745000000000003, + "grad_norm": 1.9411157890983137, + "learning_rate": 4.483648942800468e-06, + "loss": 0.2253, + "mean_token_accuracy": 0.9240156412124634, + "step": 5749 + }, + { + "epoch": 2.875, + "grad_norm": 2.588178444906488, + "learning_rate": 4.4833833507280884e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8901873230934143, + "step": 5750 + }, + { + "epoch": 2.8754999999999997, + "grad_norm": 12.160641508645455, + "learning_rate": 4.483117698238397e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8837878704071045, + "step": 5751 + }, + { + "epoch": 2.876, + "grad_norm": 2.7544834662310658, + "learning_rate": 4.482851985339487e-06, + "loss": 0.2463, + "mean_token_accuracy": 0.901056170463562, + "step": 5752 + }, + { + "epoch": 2.8765, + "grad_norm": 1.9588459421642122, + "learning_rate": 4.482586212039451e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.885055422782898, + "step": 5753 + }, + { + "epoch": 2.877, + "grad_norm": 2.418068425227805, + "learning_rate": 4.482320378346385e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.9000177979469299, + "step": 5754 + }, + { + "epoch": 2.8775, + "grad_norm": 2.3457239468060345, + "learning_rate": 4.482054484268389e-06, + "loss": 0.2132, + "mean_token_accuracy": 0.9249264597892761, + "step": 5755 + }, + { + "epoch": 2.878, + "grad_norm": 1.3978515708892087, + "learning_rate": 4.4817885298135584e-06, + "loss": 0.1998, + "mean_token_accuracy": 0.9322223663330078, + "step": 5756 + }, + { + "epoch": 2.8785, + "grad_norm": 1.8495542673388181, + "learning_rate": 4.48152251499e-06, + "loss": 0.325, + "mean_token_accuracy": 0.9013702869415283, + "step": 5757 + }, + { + "epoch": 2.879, + "grad_norm": 2.6102902307022595, + "learning_rate": 4.481256439805812e-06, + "loss": 0.2678, + "mean_token_accuracy": 0.9088334441184998, + "step": 5758 + }, + { + "epoch": 2.8795, + "grad_norm": 1.9150706720226414, + "learning_rate": 4.480990304269102e-06, + "loss": 0.2191, + "mean_token_accuracy": 0.9226672053337097, + "step": 5759 + }, + { + "epoch": 2.88, + "grad_norm": 2.284976468896805, + "learning_rate": 4.4807241083879774e-06, + "loss": 0.3119, + "mean_token_accuracy": 0.8932605385780334, + "step": 5760 + }, + { + "epoch": 2.8805, + "grad_norm": 1.7080438227338954, + "learning_rate": 4.4804578521705456e-06, + "loss": 0.238, + "mean_token_accuracy": 0.9170795679092407, + "step": 5761 + }, + { + "epoch": 2.8810000000000002, + "grad_norm": 6.17116929133023, + "learning_rate": 4.480191535624918e-06, + "loss": 0.2315, + "mean_token_accuracy": 0.9192155599594116, + "step": 5762 + }, + { + "epoch": 2.8815, + "grad_norm": 2.082723233228504, + "learning_rate": 4.479925158759207e-06, + "loss": 0.298, + "mean_token_accuracy": 0.8942550420761108, + "step": 5763 + }, + { + "epoch": 2.882, + "grad_norm": 2.129238426955137, + "learning_rate": 4.479658721581527e-06, + "loss": 0.3167, + "mean_token_accuracy": 0.8962478041648865, + "step": 5764 + }, + { + "epoch": 2.8825, + "grad_norm": 3.895897147856123, + "learning_rate": 4.4793922240999935e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.9012429714202881, + "step": 5765 + }, + { + "epoch": 2.883, + "grad_norm": 2.248970142065432, + "learning_rate": 4.479125666322725e-06, + "loss": 0.2448, + "mean_token_accuracy": 0.9166070222854614, + "step": 5766 + }, + { + "epoch": 2.8834999999999997, + "grad_norm": 2.353796137135187, + "learning_rate": 4.478859048257842e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8899288177490234, + "step": 5767 + }, + { + "epoch": 2.884, + "grad_norm": 2.0918107014681877, + "learning_rate": 4.478592369913464e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8692726492881775, + "step": 5768 + }, + { + "epoch": 2.8845, + "grad_norm": 2.3511059342255343, + "learning_rate": 4.478325631297717e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8860097527503967, + "step": 5769 + }, + { + "epoch": 2.885, + "grad_norm": 1.6049651695538958, + "learning_rate": 4.478058832418726e-06, + "loss": 0.2568, + "mean_token_accuracy": 0.904565691947937, + "step": 5770 + }, + { + "epoch": 2.8855, + "grad_norm": 2.6947109763143025, + "learning_rate": 4.477791973284617e-06, + "loss": 0.2365, + "mean_token_accuracy": 0.9226486086845398, + "step": 5771 + }, + { + "epoch": 2.886, + "grad_norm": 2.030341169511081, + "learning_rate": 4.477525053903517e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8783701658248901, + "step": 5772 + }, + { + "epoch": 2.8865, + "grad_norm": 2.0807960023352, + "learning_rate": 4.477258074283562e-06, + "loss": 0.2527, + "mean_token_accuracy": 0.9072536826133728, + "step": 5773 + }, + { + "epoch": 2.887, + "grad_norm": 1.9479105635372578, + "learning_rate": 4.47699103443288e-06, + "loss": 0.293, + "mean_token_accuracy": 0.9032602906227112, + "step": 5774 + }, + { + "epoch": 2.8875, + "grad_norm": 2.5806043796499636, + "learning_rate": 4.476723934359609e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8790090084075928, + "step": 5775 + }, + { + "epoch": 2.888, + "grad_norm": 2.0425463458206643, + "learning_rate": 4.476456774071883e-06, + "loss": 0.2187, + "mean_token_accuracy": 0.9139934778213501, + "step": 5776 + }, + { + "epoch": 2.8885, + "grad_norm": 1.7914710785003973, + "learning_rate": 4.47618955357784e-06, + "loss": 0.2369, + "mean_token_accuracy": 0.9171072244644165, + "step": 5777 + }, + { + "epoch": 2.8890000000000002, + "grad_norm": 1.9664895044240152, + "learning_rate": 4.475922272885622e-06, + "loss": 0.2658, + "mean_token_accuracy": 0.909443736076355, + "step": 5778 + }, + { + "epoch": 2.8895, + "grad_norm": 2.832553670997477, + "learning_rate": 4.475654932003369e-06, + "loss": 0.2546, + "mean_token_accuracy": 0.9131627082824707, + "step": 5779 + }, + { + "epoch": 2.89, + "grad_norm": 1.8700463451594784, + "learning_rate": 4.475387530939226e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8850862383842468, + "step": 5780 + }, + { + "epoch": 2.8905, + "grad_norm": 2.4676185742406944, + "learning_rate": 4.475120069701338e-06, + "loss": 0.2623, + "mean_token_accuracy": 0.9186875820159912, + "step": 5781 + }, + { + "epoch": 2.891, + "grad_norm": 3.28870390782023, + "learning_rate": 4.474852548297852e-06, + "loss": 0.1808, + "mean_token_accuracy": 0.9390581846237183, + "step": 5782 + }, + { + "epoch": 2.8914999999999997, + "grad_norm": 6.491100222130989, + "learning_rate": 4.474584966736917e-06, + "loss": 0.2669, + "mean_token_accuracy": 0.9126871824264526, + "step": 5783 + }, + { + "epoch": 2.892, + "grad_norm": 1.898500440226465, + "learning_rate": 4.474317325026685e-06, + "loss": 0.2761, + "mean_token_accuracy": 0.9109401702880859, + "step": 5784 + }, + { + "epoch": 2.8925, + "grad_norm": 2.0813502237264414, + "learning_rate": 4.474049623175307e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8854914903640747, + "step": 5785 + }, + { + "epoch": 2.893, + "grad_norm": 2.4118847755517403, + "learning_rate": 4.47378186119094e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8927428126335144, + "step": 5786 + }, + { + "epoch": 2.8935, + "grad_norm": 1.964211434556006, + "learning_rate": 4.473514039081739e-06, + "loss": 0.2619, + "mean_token_accuracy": 0.909792423248291, + "step": 5787 + }, + { + "epoch": 2.894, + "grad_norm": 8.839820010194101, + "learning_rate": 4.473246156855862e-06, + "loss": 0.2963, + "mean_token_accuracy": 0.904347836971283, + "step": 5788 + }, + { + "epoch": 2.8945, + "grad_norm": 2.101029735470649, + "learning_rate": 4.472978214521472e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8893687725067139, + "step": 5789 + }, + { + "epoch": 2.895, + "grad_norm": 2.141840630666529, + "learning_rate": 4.4727102120867274e-06, + "loss": 0.2268, + "mean_token_accuracy": 0.9199931025505066, + "step": 5790 + }, + { + "epoch": 2.8955, + "grad_norm": 3.8194845088291807, + "learning_rate": 4.472442149559793e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8956907987594604, + "step": 5791 + }, + { + "epoch": 2.896, + "grad_norm": 1.9152658022974, + "learning_rate": 4.472174026948836e-06, + "loss": 0.2611, + "mean_token_accuracy": 0.9107315540313721, + "step": 5792 + }, + { + "epoch": 2.8965, + "grad_norm": 3.203008225342393, + "learning_rate": 4.471905844262022e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8947973251342773, + "step": 5793 + }, + { + "epoch": 2.8970000000000002, + "grad_norm": 2.1537700381746467, + "learning_rate": 4.471637601507521e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8992101550102234, + "step": 5794 + }, + { + "epoch": 2.8975, + "grad_norm": 1.8179401020365662, + "learning_rate": 4.471369298693505e-06, + "loss": 0.2568, + "mean_token_accuracy": 0.9143576622009277, + "step": 5795 + }, + { + "epoch": 2.898, + "grad_norm": 2.2746561072999123, + "learning_rate": 4.471100935828146e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8934696912765503, + "step": 5796 + }, + { + "epoch": 2.8985, + "grad_norm": 4.746277439052294, + "learning_rate": 4.470832512919619e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8933358788490295, + "step": 5797 + }, + { + "epoch": 2.899, + "grad_norm": 2.1958182993420556, + "learning_rate": 4.4705640299761e-06, + "loss": 0.2396, + "mean_token_accuracy": 0.9231509566307068, + "step": 5798 + }, + { + "epoch": 2.8994999999999997, + "grad_norm": 1.8954866051199761, + "learning_rate": 4.470295487005769e-06, + "loss": 0.2591, + "mean_token_accuracy": 0.9121745228767395, + "step": 5799 + }, + { + "epoch": 2.9, + "grad_norm": 2.257050326316319, + "learning_rate": 4.470026884016805e-06, + "loss": 0.2783, + "mean_token_accuracy": 0.9049529433250427, + "step": 5800 + }, + { + "epoch": 2.9005, + "grad_norm": 1.8388968669468688, + "learning_rate": 4.46975822101739e-06, + "loss": 0.2785, + "mean_token_accuracy": 0.9084663987159729, + "step": 5801 + }, + { + "epoch": 2.901, + "grad_norm": 2.626609841284923, + "learning_rate": 4.46948949801571e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8901404738426208, + "step": 5802 + }, + { + "epoch": 2.9015, + "grad_norm": 11.423367669829078, + "learning_rate": 4.469220715019949e-06, + "loss": 0.294, + "mean_token_accuracy": 0.8982753157615662, + "step": 5803 + }, + { + "epoch": 2.902, + "grad_norm": 2.44959555185371, + "learning_rate": 4.468951872038293e-06, + "loss": 0.2653, + "mean_token_accuracy": 0.9221871495246887, + "step": 5804 + }, + { + "epoch": 2.9025, + "grad_norm": 1.8587028011011804, + "learning_rate": 4.468682969078935e-06, + "loss": 0.1816, + "mean_token_accuracy": 0.9337101578712463, + "step": 5805 + }, + { + "epoch": 2.903, + "grad_norm": 3.6230096738683737, + "learning_rate": 4.468414006150063e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.8911840915679932, + "step": 5806 + }, + { + "epoch": 2.9035, + "grad_norm": 2.1015541445481083, + "learning_rate": 4.468144983259873e-06, + "loss": 0.2395, + "mean_token_accuracy": 0.9208144545555115, + "step": 5807 + }, + { + "epoch": 2.904, + "grad_norm": 8.476274140407789, + "learning_rate": 4.467875900416558e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.893993616104126, + "step": 5808 + }, + { + "epoch": 2.9045, + "grad_norm": 2.3443930764213383, + "learning_rate": 4.4676067576283155e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.9047356843948364, + "step": 5809 + }, + { + "epoch": 2.9050000000000002, + "grad_norm": 2.1205628530578875, + "learning_rate": 4.467337554903344e-06, + "loss": 0.2804, + "mean_token_accuracy": 0.9002020359039307, + "step": 5810 + }, + { + "epoch": 2.9055, + "grad_norm": 1.994985109346326, + "learning_rate": 4.467068292249843e-06, + "loss": 0.2752, + "mean_token_accuracy": 0.9023178815841675, + "step": 5811 + }, + { + "epoch": 2.906, + "grad_norm": 2.584919241573699, + "learning_rate": 4.4667989696760154e-06, + "loss": 0.2955, + "mean_token_accuracy": 0.896059513092041, + "step": 5812 + }, + { + "epoch": 2.9065, + "grad_norm": 1.6768718001099057, + "learning_rate": 4.466529587190065e-06, + "loss": 0.2303, + "mean_token_accuracy": 0.9198290705680847, + "step": 5813 + }, + { + "epoch": 2.907, + "grad_norm": 1.820589906623784, + "learning_rate": 4.466260144800198e-06, + "loss": 0.2338, + "mean_token_accuracy": 0.9101640582084656, + "step": 5814 + }, + { + "epoch": 2.9074999999999998, + "grad_norm": 5.152197455174232, + "learning_rate": 4.465990642514622e-06, + "loss": 0.2634, + "mean_token_accuracy": 0.9118065237998962, + "step": 5815 + }, + { + "epoch": 2.908, + "grad_norm": 2.4445866324641177, + "learning_rate": 4.465721080341547e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8863636255264282, + "step": 5816 + }, + { + "epoch": 2.9085, + "grad_norm": 2.2476281732474113, + "learning_rate": 4.4654514582891836e-06, + "loss": 0.2879, + "mean_token_accuracy": 0.9035574197769165, + "step": 5817 + }, + { + "epoch": 2.909, + "grad_norm": 3.164176935189049, + "learning_rate": 4.4651817763657454e-06, + "loss": 0.2477, + "mean_token_accuracy": 0.9160653352737427, + "step": 5818 + }, + { + "epoch": 2.9095, + "grad_norm": 2.1451499361686714, + "learning_rate": 4.464912034579447e-06, + "loss": 0.2645, + "mean_token_accuracy": 0.9132413864135742, + "step": 5819 + }, + { + "epoch": 2.91, + "grad_norm": 2.608774259277538, + "learning_rate": 4.464642232938505e-06, + "loss": 0.2152, + "mean_token_accuracy": 0.9269526600837708, + "step": 5820 + }, + { + "epoch": 2.9105, + "grad_norm": 1.9267626464253524, + "learning_rate": 4.464372371451139e-06, + "loss": 0.2725, + "mean_token_accuracy": 0.9071723222732544, + "step": 5821 + }, + { + "epoch": 2.911, + "grad_norm": 4.386267335605244, + "learning_rate": 4.464102450125568e-06, + "loss": 0.2237, + "mean_token_accuracy": 0.9188500642776489, + "step": 5822 + }, + { + "epoch": 2.9115, + "grad_norm": 2.5284386757183808, + "learning_rate": 4.463832468970015e-06, + "loss": 0.2946, + "mean_token_accuracy": 0.9028346538543701, + "step": 5823 + }, + { + "epoch": 2.912, + "grad_norm": 4.063749042798851, + "learning_rate": 4.463562427992705e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8945572972297668, + "step": 5824 + }, + { + "epoch": 2.9125, + "grad_norm": 10.91980493705624, + "learning_rate": 4.463292327201862e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8917441368103027, + "step": 5825 + }, + { + "epoch": 2.9130000000000003, + "grad_norm": 2.3592763684066003, + "learning_rate": 4.463022166605716e-06, + "loss": 0.3054, + "mean_token_accuracy": 0.9024966359138489, + "step": 5826 + }, + { + "epoch": 2.9135, + "grad_norm": 2.709651410251201, + "learning_rate": 4.462751946212496e-06, + "loss": 0.313, + "mean_token_accuracy": 0.894060492515564, + "step": 5827 + }, + { + "epoch": 2.914, + "grad_norm": 1.686689418854108, + "learning_rate": 4.462481666030432e-06, + "loss": 0.2385, + "mean_token_accuracy": 0.9163058996200562, + "step": 5828 + }, + { + "epoch": 2.9145, + "grad_norm": 2.4191253406163686, + "learning_rate": 4.462211326067757e-06, + "loss": 0.2375, + "mean_token_accuracy": 0.9169623851776123, + "step": 5829 + }, + { + "epoch": 2.915, + "grad_norm": 9.1611608561508, + "learning_rate": 4.461940926332708e-06, + "loss": 0.2365, + "mean_token_accuracy": 0.9082202911376953, + "step": 5830 + }, + { + "epoch": 2.9154999999999998, + "grad_norm": 1.8425869523510894, + "learning_rate": 4.4616704668335204e-06, + "loss": 0.2547, + "mean_token_accuracy": 0.9182602763175964, + "step": 5831 + }, + { + "epoch": 2.916, + "grad_norm": 2.3560164862206308, + "learning_rate": 4.461399947578434e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8889073729515076, + "step": 5832 + }, + { + "epoch": 2.9165, + "grad_norm": 3.044429766671353, + "learning_rate": 4.461129368575688e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.897034764289856, + "step": 5833 + }, + { + "epoch": 2.917, + "grad_norm": 2.1059484688069974, + "learning_rate": 4.460858729833526e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8748608827590942, + "step": 5834 + }, + { + "epoch": 2.9175, + "grad_norm": 4.038478119072234, + "learning_rate": 4.460588031360191e-06, + "loss": 0.2391, + "mean_token_accuracy": 0.914216160774231, + "step": 5835 + }, + { + "epoch": 2.918, + "grad_norm": 2.126235034694657, + "learning_rate": 4.460317273163929e-06, + "loss": 0.311, + "mean_token_accuracy": 0.8942081332206726, + "step": 5836 + }, + { + "epoch": 2.9185, + "grad_norm": 4.043546213672478, + "learning_rate": 4.4600464552529885e-06, + "loss": 0.2933, + "mean_token_accuracy": 0.9052064418792725, + "step": 5837 + }, + { + "epoch": 2.919, + "grad_norm": 5.37942031925009, + "learning_rate": 4.459775577635619e-06, + "loss": 0.309, + "mean_token_accuracy": 0.9062718749046326, + "step": 5838 + }, + { + "epoch": 2.9195, + "grad_norm": 2.340506410607264, + "learning_rate": 4.459504640320072e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.9027535915374756, + "step": 5839 + }, + { + "epoch": 2.92, + "grad_norm": 3.440430561911043, + "learning_rate": 4.4592336433146e-06, + "loss": 0.2594, + "mean_token_accuracy": 0.9086325168609619, + "step": 5840 + }, + { + "epoch": 2.9205, + "grad_norm": 1.5919107424084022, + "learning_rate": 4.458962586627458e-06, + "loss": 0.2493, + "mean_token_accuracy": 0.9139841794967651, + "step": 5841 + }, + { + "epoch": 2.9210000000000003, + "grad_norm": 9.700975705108121, + "learning_rate": 4.458691470266904e-06, + "loss": 0.2834, + "mean_token_accuracy": 0.9017125964164734, + "step": 5842 + }, + { + "epoch": 2.9215, + "grad_norm": 2.222045955972206, + "learning_rate": 4.458420294241196e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8679413199424744, + "step": 5843 + }, + { + "epoch": 2.922, + "grad_norm": 2.4672041566933127, + "learning_rate": 4.458149058558594e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8905640244483948, + "step": 5844 + }, + { + "epoch": 2.9225, + "grad_norm": 2.4017390168559642, + "learning_rate": 4.457877763227361e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.896231472492218, + "step": 5845 + }, + { + "epoch": 2.923, + "grad_norm": 5.718381882629338, + "learning_rate": 4.457606408255761e-06, + "loss": 0.3246, + "mean_token_accuracy": 0.8968380689620972, + "step": 5846 + }, + { + "epoch": 2.9234999999999998, + "grad_norm": 2.3634363893666777, + "learning_rate": 4.457334993652059e-06, + "loss": 0.2874, + "mean_token_accuracy": 0.9097099900245667, + "step": 5847 + }, + { + "epoch": 2.924, + "grad_norm": 1.6722073314443513, + "learning_rate": 4.457063519424525e-06, + "loss": 0.3165, + "mean_token_accuracy": 0.8860368132591248, + "step": 5848 + }, + { + "epoch": 2.9245, + "grad_norm": 3.207296577917079, + "learning_rate": 4.456791985581427e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.893545925617218, + "step": 5849 + }, + { + "epoch": 2.925, + "grad_norm": 2.606088468443848, + "learning_rate": 4.456520392131035e-06, + "loss": 0.235, + "mean_token_accuracy": 0.9189633131027222, + "step": 5850 + }, + { + "epoch": 2.9255, + "grad_norm": 4.2421916500324075, + "learning_rate": 4.456248739081625e-06, + "loss": 0.1956, + "mean_token_accuracy": 0.9346303343772888, + "step": 5851 + }, + { + "epoch": 2.926, + "grad_norm": 1.7538061260359255, + "learning_rate": 4.455977026441471e-06, + "loss": 0.1766, + "mean_token_accuracy": 0.9341415166854858, + "step": 5852 + }, + { + "epoch": 2.9265, + "grad_norm": 2.365098622071247, + "learning_rate": 4.455705254218849e-06, + "loss": 0.2809, + "mean_token_accuracy": 0.9022660255432129, + "step": 5853 + }, + { + "epoch": 2.927, + "grad_norm": 2.9404778145230472, + "learning_rate": 4.4554334224220385e-06, + "loss": 0.2456, + "mean_token_accuracy": 0.9179670810699463, + "step": 5854 + }, + { + "epoch": 2.9275, + "grad_norm": 2.3739505623597936, + "learning_rate": 4.45516153105932e-06, + "loss": 0.2917, + "mean_token_accuracy": 0.9035913348197937, + "step": 5855 + }, + { + "epoch": 2.928, + "grad_norm": 3.946148111498697, + "learning_rate": 4.4548895801389755e-06, + "loss": 0.2885, + "mean_token_accuracy": 0.9078303575515747, + "step": 5856 + }, + { + "epoch": 2.9285, + "grad_norm": 2.487853312299356, + "learning_rate": 4.454617569669289e-06, + "loss": 0.2628, + "mean_token_accuracy": 0.9111310839653015, + "step": 5857 + }, + { + "epoch": 2.9290000000000003, + "grad_norm": 2.3798218353237193, + "learning_rate": 4.454345499658547e-06, + "loss": 0.2638, + "mean_token_accuracy": 0.9088074564933777, + "step": 5858 + }, + { + "epoch": 2.9295, + "grad_norm": 3.7963140708639913, + "learning_rate": 4.454073370115036e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.8961382508277893, + "step": 5859 + }, + { + "epoch": 2.93, + "grad_norm": 2.6601229954512555, + "learning_rate": 4.453801181047047e-06, + "loss": 0.1907, + "mean_token_accuracy": 0.9316757321357727, + "step": 5860 + }, + { + "epoch": 2.9305, + "grad_norm": 2.5415567472582636, + "learning_rate": 4.453528932462871e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8908807635307312, + "step": 5861 + }, + { + "epoch": 2.931, + "grad_norm": 3.934085523886864, + "learning_rate": 4.4532566243708e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8797725439071655, + "step": 5862 + }, + { + "epoch": 2.9314999999999998, + "grad_norm": 1.9282349614617609, + "learning_rate": 4.452984256779131e-06, + "loss": 0.1835, + "mean_token_accuracy": 0.9321882128715515, + "step": 5863 + }, + { + "epoch": 2.932, + "grad_norm": 2.7134349962887794, + "learning_rate": 4.452711829696158e-06, + "loss": 0.2754, + "mean_token_accuracy": 0.904869556427002, + "step": 5864 + }, + { + "epoch": 2.9325, + "grad_norm": 2.1082291976953003, + "learning_rate": 4.452439343130183e-06, + "loss": 0.2283, + "mean_token_accuracy": 0.922380805015564, + "step": 5865 + }, + { + "epoch": 2.933, + "grad_norm": 2.024296975111711, + "learning_rate": 4.4521667970895035e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.889609158039093, + "step": 5866 + }, + { + "epoch": 2.9335, + "grad_norm": 3.306959614501552, + "learning_rate": 4.4518941915824236e-06, + "loss": 0.3028, + "mean_token_accuracy": 0.8980010151863098, + "step": 5867 + }, + { + "epoch": 2.934, + "grad_norm": 3.3016213383822857, + "learning_rate": 4.451621526617246e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.89129638671875, + "step": 5868 + }, + { + "epoch": 2.9345, + "grad_norm": 1.8626705666696028, + "learning_rate": 4.451348802202276e-06, + "loss": 0.2543, + "mean_token_accuracy": 0.9196217656135559, + "step": 5869 + }, + { + "epoch": 2.935, + "grad_norm": 2.5657181514730554, + "learning_rate": 4.4510760183458246e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.8984512686729431, + "step": 5870 + }, + { + "epoch": 2.9355, + "grad_norm": 3.911874257089062, + "learning_rate": 4.450803175056199e-06, + "loss": 0.3269, + "mean_token_accuracy": 0.90266352891922, + "step": 5871 + }, + { + "epoch": 2.936, + "grad_norm": 1.7775167670342327, + "learning_rate": 4.45053027234171e-06, + "loss": 0.1729, + "mean_token_accuracy": 0.9375250935554504, + "step": 5872 + }, + { + "epoch": 2.9365, + "grad_norm": 3.0081414932954016, + "learning_rate": 4.4502573102106706e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8804287910461426, + "step": 5873 + }, + { + "epoch": 2.9370000000000003, + "grad_norm": 2.2515135011506, + "learning_rate": 4.449984288671397e-06, + "loss": 0.2813, + "mean_token_accuracy": 0.9030710458755493, + "step": 5874 + }, + { + "epoch": 2.9375, + "grad_norm": 1.6835637876540066, + "learning_rate": 4.4497112077322045e-06, + "loss": 0.2674, + "mean_token_accuracy": 0.9026748538017273, + "step": 5875 + }, + { + "epoch": 2.9379999999999997, + "grad_norm": 2.1751388582126236, + "learning_rate": 4.449438067401413e-06, + "loss": 0.2187, + "mean_token_accuracy": 0.9215958714485168, + "step": 5876 + }, + { + "epoch": 2.9385, + "grad_norm": 3.10725490040015, + "learning_rate": 4.449164867687343e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8963279128074646, + "step": 5877 + }, + { + "epoch": 2.939, + "grad_norm": 2.1276006215924634, + "learning_rate": 4.448891608598314e-06, + "loss": 0.2832, + "mean_token_accuracy": 0.9067171216011047, + "step": 5878 + }, + { + "epoch": 2.9395, + "grad_norm": 1.88852894751463, + "learning_rate": 4.448618290142654e-06, + "loss": 0.2468, + "mean_token_accuracy": 0.9153268933296204, + "step": 5879 + }, + { + "epoch": 2.94, + "grad_norm": 1.471524354759799, + "learning_rate": 4.448344912328686e-06, + "loss": 0.1719, + "mean_token_accuracy": 0.9382633566856384, + "step": 5880 + }, + { + "epoch": 2.9405, + "grad_norm": 3.3797156906281764, + "learning_rate": 4.4480714751647375e-06, + "loss": 0.201, + "mean_token_accuracy": 0.9292704463005066, + "step": 5881 + }, + { + "epoch": 2.941, + "grad_norm": 1.781169244474599, + "learning_rate": 4.447797978659138e-06, + "loss": 0.258, + "mean_token_accuracy": 0.9157167673110962, + "step": 5882 + }, + { + "epoch": 2.9415, + "grad_norm": 4.972627828842231, + "learning_rate": 4.447524422820221e-06, + "loss": 0.2481, + "mean_token_accuracy": 0.9179278016090393, + "step": 5883 + }, + { + "epoch": 2.942, + "grad_norm": 2.2484681916157685, + "learning_rate": 4.447250807656316e-06, + "loss": 0.2974, + "mean_token_accuracy": 0.9056189060211182, + "step": 5884 + }, + { + "epoch": 2.9425, + "grad_norm": 4.215936811985458, + "learning_rate": 4.446977133175761e-06, + "loss": 0.2875, + "mean_token_accuracy": 0.8965412974357605, + "step": 5885 + }, + { + "epoch": 2.943, + "grad_norm": 1.9226988732503878, + "learning_rate": 4.44670339938689e-06, + "loss": 0.2109, + "mean_token_accuracy": 0.9221402406692505, + "step": 5886 + }, + { + "epoch": 2.9435000000000002, + "grad_norm": 3.044281903358352, + "learning_rate": 4.4464296062980425e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8712494969367981, + "step": 5887 + }, + { + "epoch": 2.944, + "grad_norm": 2.4977291787034073, + "learning_rate": 4.446155753917559e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8808614015579224, + "step": 5888 + }, + { + "epoch": 2.9445, + "grad_norm": 3.1711218759938773, + "learning_rate": 4.4458818422537805e-06, + "loss": 0.3179, + "mean_token_accuracy": 0.9010017514228821, + "step": 5889 + }, + { + "epoch": 2.945, + "grad_norm": 3.1744907591007667, + "learning_rate": 4.445607871315053e-06, + "loss": 0.2943, + "mean_token_accuracy": 0.9050140380859375, + "step": 5890 + }, + { + "epoch": 2.9455, + "grad_norm": 2.0353275198078546, + "learning_rate": 4.4453338411097194e-06, + "loss": 0.224, + "mean_token_accuracy": 0.9189332127571106, + "step": 5891 + }, + { + "epoch": 2.9459999999999997, + "grad_norm": 2.153865477862913, + "learning_rate": 4.445059751646129e-06, + "loss": 0.2733, + "mean_token_accuracy": 0.9028163552284241, + "step": 5892 + }, + { + "epoch": 2.9465, + "grad_norm": 1.941254981199905, + "learning_rate": 4.444785602932631e-06, + "loss": 0.2797, + "mean_token_accuracy": 0.9087271094322205, + "step": 5893 + }, + { + "epoch": 2.947, + "grad_norm": 12.937236085253051, + "learning_rate": 4.444511394977575e-06, + "loss": 0.2537, + "mean_token_accuracy": 0.9084886908531189, + "step": 5894 + }, + { + "epoch": 2.9475, + "grad_norm": 2.4032646726343927, + "learning_rate": 4.444237127789315e-06, + "loss": 0.2968, + "mean_token_accuracy": 0.9032130241394043, + "step": 5895 + }, + { + "epoch": 2.948, + "grad_norm": 2.8209028037345547, + "learning_rate": 4.443962801376206e-06, + "loss": 0.2468, + "mean_token_accuracy": 0.912365198135376, + "step": 5896 + }, + { + "epoch": 2.9485, + "grad_norm": 2.234910021887694, + "learning_rate": 4.443688415746602e-06, + "loss": 0.2117, + "mean_token_accuracy": 0.9260241985321045, + "step": 5897 + }, + { + "epoch": 2.949, + "grad_norm": 6.444569961367577, + "learning_rate": 4.443413970908866e-06, + "loss": 0.268, + "mean_token_accuracy": 0.913943350315094, + "step": 5898 + }, + { + "epoch": 2.9495, + "grad_norm": 2.8637655795374575, + "learning_rate": 4.443139466871353e-06, + "loss": 0.3002, + "mean_token_accuracy": 0.8958398103713989, + "step": 5899 + }, + { + "epoch": 2.95, + "grad_norm": 2.561502128756415, + "learning_rate": 4.442864903642428e-06, + "loss": 0.2541, + "mean_token_accuracy": 0.9150747060775757, + "step": 5900 + }, + { + "epoch": 2.9505, + "grad_norm": 2.0688969802299937, + "learning_rate": 4.442590281230453e-06, + "loss": 0.2761, + "mean_token_accuracy": 0.9100276231765747, + "step": 5901 + }, + { + "epoch": 2.951, + "grad_norm": 2.415283849179583, + "learning_rate": 4.442315599643795e-06, + "loss": 0.2866, + "mean_token_accuracy": 0.907221257686615, + "step": 5902 + }, + { + "epoch": 2.9515000000000002, + "grad_norm": 5.88586055312767, + "learning_rate": 4.44204085889082e-06, + "loss": 0.2564, + "mean_token_accuracy": 0.9137318134307861, + "step": 5903 + }, + { + "epoch": 2.952, + "grad_norm": 3.3896780420601056, + "learning_rate": 4.441766058979898e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.890260636806488, + "step": 5904 + }, + { + "epoch": 2.9525, + "grad_norm": 2.1663972707720935, + "learning_rate": 4.4414911999194e-06, + "loss": 0.2612, + "mean_token_accuracy": 0.913848340511322, + "step": 5905 + }, + { + "epoch": 2.953, + "grad_norm": 5.4645483969645, + "learning_rate": 4.441216281717697e-06, + "loss": 0.1614, + "mean_token_accuracy": 0.9425345659255981, + "step": 5906 + }, + { + "epoch": 2.9535, + "grad_norm": 2.1809736757296743, + "learning_rate": 4.440941304383165e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8822130560874939, + "step": 5907 + }, + { + "epoch": 2.9539999999999997, + "grad_norm": 2.489653788985997, + "learning_rate": 4.44066626792418e-06, + "loss": 0.2026, + "mean_token_accuracy": 0.9267163276672363, + "step": 5908 + }, + { + "epoch": 2.9545, + "grad_norm": 2.4029708430363406, + "learning_rate": 4.44039117234912e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.8996730446815491, + "step": 5909 + }, + { + "epoch": 2.955, + "grad_norm": 1.876105844665175, + "learning_rate": 4.440116017666365e-06, + "loss": 0.2951, + "mean_token_accuracy": 0.9033182263374329, + "step": 5910 + }, + { + "epoch": 2.9555, + "grad_norm": 3.1360675328228673, + "learning_rate": 4.4398408038842975e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8879138231277466, + "step": 5911 + }, + { + "epoch": 2.956, + "grad_norm": 3.3358086369873248, + "learning_rate": 4.439565531011299e-06, + "loss": 0.2457, + "mean_token_accuracy": 0.9149479269981384, + "step": 5912 + }, + { + "epoch": 2.9565, + "grad_norm": 4.5853608963429675, + "learning_rate": 4.439290199055756e-06, + "loss": 0.2729, + "mean_token_accuracy": 0.9040952920913696, + "step": 5913 + }, + { + "epoch": 2.957, + "grad_norm": 2.50609153745241, + "learning_rate": 4.439014808026055e-06, + "loss": 0.257, + "mean_token_accuracy": 0.9132267236709595, + "step": 5914 + }, + { + "epoch": 2.9575, + "grad_norm": 2.273496230108424, + "learning_rate": 4.438739357930587e-06, + "loss": 0.3029, + "mean_token_accuracy": 0.902825653553009, + "step": 5915 + }, + { + "epoch": 2.958, + "grad_norm": 2.462332673230294, + "learning_rate": 4.43846384877774e-06, + "loss": 0.2927, + "mean_token_accuracy": 0.9030520915985107, + "step": 5916 + }, + { + "epoch": 2.9585, + "grad_norm": 5.781939958864685, + "learning_rate": 4.438188280575907e-06, + "loss": 0.2222, + "mean_token_accuracy": 0.9194464087486267, + "step": 5917 + }, + { + "epoch": 2.959, + "grad_norm": 3.0998960020990762, + "learning_rate": 4.437912653333484e-06, + "loss": 0.2476, + "mean_token_accuracy": 0.9099128842353821, + "step": 5918 + }, + { + "epoch": 2.9595000000000002, + "grad_norm": 2.7436434517795725, + "learning_rate": 4.437636967058865e-06, + "loss": 0.2766, + "mean_token_accuracy": 0.9071577787399292, + "step": 5919 + }, + { + "epoch": 2.96, + "grad_norm": 3.0413850969843557, + "learning_rate": 4.437361221760449e-06, + "loss": 0.2287, + "mean_token_accuracy": 0.9214311242103577, + "step": 5920 + }, + { + "epoch": 2.9605, + "grad_norm": 2.6667242043908685, + "learning_rate": 4.4370854174466364e-06, + "loss": 0.2238, + "mean_token_accuracy": 0.9168344736099243, + "step": 5921 + }, + { + "epoch": 2.961, + "grad_norm": 2.94422123660486, + "learning_rate": 4.436809554125827e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8839375972747803, + "step": 5922 + }, + { + "epoch": 2.9615, + "grad_norm": 1.8868815177385279, + "learning_rate": 4.436533631806425e-06, + "loss": 0.2064, + "mean_token_accuracy": 0.9300270676612854, + "step": 5923 + }, + { + "epoch": 2.9619999999999997, + "grad_norm": 2.320962161110627, + "learning_rate": 4.4362576504968345e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.874451756477356, + "step": 5924 + }, + { + "epoch": 2.9625, + "grad_norm": 2.0395106075354628, + "learning_rate": 4.435981610205464e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8899244070053101, + "step": 5925 + }, + { + "epoch": 2.963, + "grad_norm": 2.332288298725242, + "learning_rate": 4.435705510940722e-06, + "loss": 0.2668, + "mean_token_accuracy": 0.9023239016532898, + "step": 5926 + }, + { + "epoch": 2.9635, + "grad_norm": 2.487074335457737, + "learning_rate": 4.435429352711017e-06, + "loss": 0.2782, + "mean_token_accuracy": 0.9069527387619019, + "step": 5927 + }, + { + "epoch": 2.964, + "grad_norm": 9.834659852616538, + "learning_rate": 4.4351531355247634e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8831601738929749, + "step": 5928 + }, + { + "epoch": 2.9645, + "grad_norm": 1.8036700677219795, + "learning_rate": 4.434876859390374e-06, + "loss": 0.1931, + "mean_token_accuracy": 0.9320388436317444, + "step": 5929 + }, + { + "epoch": 2.965, + "grad_norm": 2.975467803839144, + "learning_rate": 4.434600524316266e-06, + "loss": 0.3082, + "mean_token_accuracy": 0.8941437602043152, + "step": 5930 + }, + { + "epoch": 2.9655, + "grad_norm": 2.7043927361537006, + "learning_rate": 4.434324130310855e-06, + "loss": 0.327, + "mean_token_accuracy": 0.8786177039146423, + "step": 5931 + }, + { + "epoch": 2.966, + "grad_norm": 5.231343276780294, + "learning_rate": 4.434047677382563e-06, + "loss": 0.2778, + "mean_token_accuracy": 0.9096964001655579, + "step": 5932 + }, + { + "epoch": 2.9665, + "grad_norm": 1.659108741638428, + "learning_rate": 4.433771165539808e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.905834972858429, + "step": 5933 + }, + { + "epoch": 2.967, + "grad_norm": 2.8350411355853904, + "learning_rate": 4.433494594791017e-06, + "loss": 0.2795, + "mean_token_accuracy": 0.9043198227882385, + "step": 5934 + }, + { + "epoch": 2.9675000000000002, + "grad_norm": 6.009998305086567, + "learning_rate": 4.4332179651446106e-06, + "loss": 0.2355, + "mean_token_accuracy": 0.9148646593093872, + "step": 5935 + }, + { + "epoch": 2.968, + "grad_norm": 3.9224007698938888, + "learning_rate": 4.432941276609018e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8758118748664856, + "step": 5936 + }, + { + "epoch": 2.9685, + "grad_norm": 2.1695060759596427, + "learning_rate": 4.432664529192668e-06, + "loss": 0.2786, + "mean_token_accuracy": 0.9071716070175171, + "step": 5937 + }, + { + "epoch": 2.969, + "grad_norm": 2.441673043879014, + "learning_rate": 4.432387722903989e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8858281373977661, + "step": 5938 + }, + { + "epoch": 2.9695, + "grad_norm": 3.3888265762988103, + "learning_rate": 4.432110857751415e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8887491226196289, + "step": 5939 + }, + { + "epoch": 2.9699999999999998, + "grad_norm": 2.548482197402514, + "learning_rate": 4.431833933743378e-06, + "loss": 0.2819, + "mean_token_accuracy": 0.9112401604652405, + "step": 5940 + }, + { + "epoch": 2.9705, + "grad_norm": 2.5609360971106505, + "learning_rate": 4.431556950888315e-06, + "loss": 0.2786, + "mean_token_accuracy": 0.9164686799049377, + "step": 5941 + }, + { + "epoch": 2.971, + "grad_norm": 2.573107272953846, + "learning_rate": 4.431279909194661e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8758370876312256, + "step": 5942 + }, + { + "epoch": 2.9715, + "grad_norm": 2.258022530875801, + "learning_rate": 4.431002808670858e-06, + "loss": 0.2916, + "mean_token_accuracy": 0.9058862924575806, + "step": 5943 + }, + { + "epoch": 2.972, + "grad_norm": 2.57098138251492, + "learning_rate": 4.430725649325346e-06, + "loss": 0.2087, + "mean_token_accuracy": 0.9219944477081299, + "step": 5944 + }, + { + "epoch": 2.9725, + "grad_norm": 2.1012866356207334, + "learning_rate": 4.430448431166567e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8934539556503296, + "step": 5945 + }, + { + "epoch": 2.973, + "grad_norm": 2.319612214249571, + "learning_rate": 4.430171154202967e-06, + "loss": 0.2531, + "mean_token_accuracy": 0.9087514877319336, + "step": 5946 + }, + { + "epoch": 2.9735, + "grad_norm": 2.175748686151116, + "learning_rate": 4.429893818442991e-06, + "loss": 0.2527, + "mean_token_accuracy": 0.9154300689697266, + "step": 5947 + }, + { + "epoch": 2.974, + "grad_norm": 1.866280223441356, + "learning_rate": 4.4296164238950875e-06, + "loss": 0.2619, + "mean_token_accuracy": 0.9117426872253418, + "step": 5948 + }, + { + "epoch": 2.9745, + "grad_norm": 2.7542548610538957, + "learning_rate": 4.429338970567707e-06, + "loss": 0.3051, + "mean_token_accuracy": 0.8919437527656555, + "step": 5949 + }, + { + "epoch": 2.975, + "grad_norm": 2.605662583794869, + "learning_rate": 4.4290614584693005e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8864538669586182, + "step": 5950 + }, + { + "epoch": 2.9755000000000003, + "grad_norm": 1.4467096700558915, + "learning_rate": 4.428783887608321e-06, + "loss": 0.2793, + "mean_token_accuracy": 0.9003721475601196, + "step": 5951 + }, + { + "epoch": 2.976, + "grad_norm": 2.8063431890022077, + "learning_rate": 4.428506257993226e-06, + "loss": 0.2673, + "mean_token_accuracy": 0.9112833738327026, + "step": 5952 + }, + { + "epoch": 2.9765, + "grad_norm": 1.5439584714864498, + "learning_rate": 4.42822856963247e-06, + "loss": 0.1973, + "mean_token_accuracy": 0.929007351398468, + "step": 5953 + }, + { + "epoch": 2.977, + "grad_norm": 2.8788767040889858, + "learning_rate": 4.427950822534513e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8896409869194031, + "step": 5954 + }, + { + "epoch": 2.9775, + "grad_norm": 2.544903121254626, + "learning_rate": 4.427673016707817e-06, + "loss": 0.2635, + "mean_token_accuracy": 0.9051330089569092, + "step": 5955 + }, + { + "epoch": 2.9779999999999998, + "grad_norm": 3.084149116850727, + "learning_rate": 4.427395152160841e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8937222957611084, + "step": 5956 + }, + { + "epoch": 2.9785, + "grad_norm": 5.642809033026478, + "learning_rate": 4.4271172289020526e-06, + "loss": 0.2382, + "mean_token_accuracy": 0.9237239360809326, + "step": 5957 + }, + { + "epoch": 2.979, + "grad_norm": 2.3802777782059352, + "learning_rate": 4.426839246939916e-06, + "loss": 0.296, + "mean_token_accuracy": 0.8955508470535278, + "step": 5958 + }, + { + "epoch": 2.9795, + "grad_norm": 3.2454703837214747, + "learning_rate": 4.4265612062829e-06, + "loss": 0.2697, + "mean_token_accuracy": 0.9183135628700256, + "step": 5959 + }, + { + "epoch": 2.98, + "grad_norm": 1.7462485012952556, + "learning_rate": 4.426283106939474e-06, + "loss": 0.2858, + "mean_token_accuracy": 0.9008375406265259, + "step": 5960 + }, + { + "epoch": 2.9805, + "grad_norm": 2.7718293123702042, + "learning_rate": 4.4260049489181086e-06, + "loss": 0.2672, + "mean_token_accuracy": 0.9196059703826904, + "step": 5961 + }, + { + "epoch": 2.981, + "grad_norm": 3.577872458545318, + "learning_rate": 4.425726732227277e-06, + "loss": 0.2608, + "mean_token_accuracy": 0.9116960763931274, + "step": 5962 + }, + { + "epoch": 2.9815, + "grad_norm": 1.9769345383118273, + "learning_rate": 4.425448456875456e-06, + "loss": 0.2685, + "mean_token_accuracy": 0.9038901329040527, + "step": 5963 + }, + { + "epoch": 2.982, + "grad_norm": 1.8657954219188264, + "learning_rate": 4.42517012287112e-06, + "loss": 0.2217, + "mean_token_accuracy": 0.9219143390655518, + "step": 5964 + }, + { + "epoch": 2.9825, + "grad_norm": 2.825277414916608, + "learning_rate": 4.424891730222749e-06, + "loss": 0.2996, + "mean_token_accuracy": 0.8972417712211609, + "step": 5965 + }, + { + "epoch": 2.983, + "grad_norm": 5.323737979544365, + "learning_rate": 4.424613278938823e-06, + "loss": 0.2529, + "mean_token_accuracy": 0.9134396314620972, + "step": 5966 + }, + { + "epoch": 2.9835000000000003, + "grad_norm": 2.867006193326318, + "learning_rate": 4.4243347690278246e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.888559877872467, + "step": 5967 + }, + { + "epoch": 2.984, + "grad_norm": 2.2088086053466527, + "learning_rate": 4.424056200498237e-06, + "loss": 0.2394, + "mean_token_accuracy": 0.9196135401725769, + "step": 5968 + }, + { + "epoch": 2.9845, + "grad_norm": 2.126033445816366, + "learning_rate": 4.423777573358545e-06, + "loss": 0.3058, + "mean_token_accuracy": 0.8960980176925659, + "step": 5969 + }, + { + "epoch": 2.985, + "grad_norm": 2.318093021789046, + "learning_rate": 4.423498887617238e-06, + "loss": 0.3037, + "mean_token_accuracy": 0.8943119049072266, + "step": 5970 + }, + { + "epoch": 2.9855, + "grad_norm": 2.4209735693842647, + "learning_rate": 4.423220143282804e-06, + "loss": 0.2662, + "mean_token_accuracy": 0.9182701706886292, + "step": 5971 + }, + { + "epoch": 2.9859999999999998, + "grad_norm": 2.0860786890400846, + "learning_rate": 4.422941340363735e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8909293413162231, + "step": 5972 + }, + { + "epoch": 2.9865, + "grad_norm": 2.8646787869818646, + "learning_rate": 4.422662478868523e-06, + "loss": 0.3128, + "mean_token_accuracy": 0.8952241539955139, + "step": 5973 + }, + { + "epoch": 2.987, + "grad_norm": 2.3751332104399228, + "learning_rate": 4.422383558805662e-06, + "loss": 0.2598, + "mean_token_accuracy": 0.9180220365524292, + "step": 5974 + }, + { + "epoch": 2.9875, + "grad_norm": 2.21217166589663, + "learning_rate": 4.422104580183649e-06, + "loss": 0.2374, + "mean_token_accuracy": 0.9176098704338074, + "step": 5975 + }, + { + "epoch": 2.988, + "grad_norm": 2.3374054101520017, + "learning_rate": 4.421825543010983e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8847672939300537, + "step": 5976 + }, + { + "epoch": 2.9885, + "grad_norm": 1.9840131035699904, + "learning_rate": 4.421546447296163e-06, + "loss": 0.2398, + "mean_token_accuracy": 0.9172767400741577, + "step": 5977 + }, + { + "epoch": 2.989, + "grad_norm": 2.1986533789377525, + "learning_rate": 4.4212672930476915e-06, + "loss": 0.3105, + "mean_token_accuracy": 0.8962070345878601, + "step": 5978 + }, + { + "epoch": 2.9895, + "grad_norm": 1.4781395049842243, + "learning_rate": 4.420988080274072e-06, + "loss": 0.2262, + "mean_token_accuracy": 0.9213049411773682, + "step": 5979 + }, + { + "epoch": 2.99, + "grad_norm": 1.879118486462552, + "learning_rate": 4.420708808983809e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8903883695602417, + "step": 5980 + }, + { + "epoch": 2.9905, + "grad_norm": 3.0905115758423087, + "learning_rate": 4.4204294791854095e-06, + "loss": 0.1652, + "mean_token_accuracy": 0.9366551041603088, + "step": 5981 + }, + { + "epoch": 2.991, + "grad_norm": 5.145449527770681, + "learning_rate": 4.4201500908873835e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8926271796226501, + "step": 5982 + }, + { + "epoch": 2.9915000000000003, + "grad_norm": 1.8168313145987056, + "learning_rate": 4.419870644098241e-06, + "loss": 0.2993, + "mean_token_accuracy": 0.8958265781402588, + "step": 5983 + }, + { + "epoch": 2.992, + "grad_norm": 1.4473914735818938, + "learning_rate": 4.419591138826495e-06, + "loss": 0.1921, + "mean_token_accuracy": 0.9315222501754761, + "step": 5984 + }, + { + "epoch": 2.9925, + "grad_norm": 1.325350464923995, + "learning_rate": 4.419311575080657e-06, + "loss": 0.2215, + "mean_token_accuracy": 0.916919469833374, + "step": 5985 + }, + { + "epoch": 2.993, + "grad_norm": 1.8530110497164758, + "learning_rate": 4.4190319528692475e-06, + "loss": 0.2745, + "mean_token_accuracy": 0.9071106910705566, + "step": 5986 + }, + { + "epoch": 2.9935, + "grad_norm": 1.5827413191871864, + "learning_rate": 4.41875227220078e-06, + "loss": 0.2015, + "mean_token_accuracy": 0.9279293417930603, + "step": 5987 + }, + { + "epoch": 2.9939999999999998, + "grad_norm": 2.527085056478679, + "learning_rate": 4.418472533083778e-06, + "loss": 0.269, + "mean_token_accuracy": 0.918312668800354, + "step": 5988 + }, + { + "epoch": 2.9945, + "grad_norm": 2.523570821306038, + "learning_rate": 4.4181927355267595e-06, + "loss": 0.2414, + "mean_token_accuracy": 0.9253580570220947, + "step": 5989 + }, + { + "epoch": 2.995, + "grad_norm": 4.886444394249852, + "learning_rate": 4.41791287953825e-06, + "loss": 0.2232, + "mean_token_accuracy": 0.9226661324501038, + "step": 5990 + }, + { + "epoch": 2.9955, + "grad_norm": 3.40563429008394, + "learning_rate": 4.417632965126773e-06, + "loss": 0.2456, + "mean_token_accuracy": 0.9212133288383484, + "step": 5991 + }, + { + "epoch": 2.996, + "grad_norm": 1.9556043212387388, + "learning_rate": 4.417352992300854e-06, + "loss": 0.2505, + "mean_token_accuracy": 0.916903555393219, + "step": 5992 + }, + { + "epoch": 2.9965, + "grad_norm": 1.8979375006179209, + "learning_rate": 4.417072961069025e-06, + "loss": 0.2434, + "mean_token_accuracy": 0.9182411432266235, + "step": 5993 + }, + { + "epoch": 2.997, + "grad_norm": 3.7493846004658846, + "learning_rate": 4.416792871439813e-06, + "loss": 0.2312, + "mean_token_accuracy": 0.9197648167610168, + "step": 5994 + }, + { + "epoch": 2.9975, + "grad_norm": 3.052947269190691, + "learning_rate": 4.416512723421752e-06, + "loss": 0.2934, + "mean_token_accuracy": 0.9085533022880554, + "step": 5995 + }, + { + "epoch": 2.998, + "grad_norm": 2.3199680839087162, + "learning_rate": 4.416232517023375e-06, + "loss": 0.3109, + "mean_token_accuracy": 0.8928285241127014, + "step": 5996 + }, + { + "epoch": 2.9985, + "grad_norm": 1.7992039314371164, + "learning_rate": 4.415952252253217e-06, + "loss": 0.2363, + "mean_token_accuracy": 0.9102841019630432, + "step": 5997 + }, + { + "epoch": 2.999, + "grad_norm": 1.796905219339628, + "learning_rate": 4.415671929119817e-06, + "loss": 0.2483, + "mean_token_accuracy": 0.9098796248435974, + "step": 5998 + }, + { + "epoch": 2.9995000000000003, + "grad_norm": 3.1066880404712305, + "learning_rate": 4.415391547631713e-06, + "loss": 0.2419, + "mean_token_accuracy": 0.9150090217590332, + "step": 5999 + }, + { + "epoch": 3.0, + "grad_norm": 2.4857985498845037, + "learning_rate": 4.415111107797445e-06, + "loss": 0.28, + "mean_token_accuracy": 0.90398770570755, + "step": 6000 + }, + { + "epoch": 3.0005, + "grad_norm": 1.917608379722335, + "learning_rate": 4.414830609625558e-06, + "loss": 0.2187, + "mean_token_accuracy": 0.9245211482048035, + "step": 6001 + }, + { + "epoch": 3.001, + "grad_norm": 1.5716680131932645, + "learning_rate": 4.414550053124594e-06, + "loss": 0.2053, + "mean_token_accuracy": 0.9263460636138916, + "step": 6002 + }, + { + "epoch": 3.0015, + "grad_norm": 2.453651417334919, + "learning_rate": 4.414269438303101e-06, + "loss": 0.2502, + "mean_token_accuracy": 0.9137614965438843, + "step": 6003 + }, + { + "epoch": 3.002, + "grad_norm": 1.962888787221006, + "learning_rate": 4.413988765169627e-06, + "loss": 0.1707, + "mean_token_accuracy": 0.9443114995956421, + "step": 6004 + }, + { + "epoch": 3.0025, + "grad_norm": 6.941118450520442, + "learning_rate": 4.413708033732721e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8983114957809448, + "step": 6005 + }, + { + "epoch": 3.003, + "grad_norm": 3.976058380340291, + "learning_rate": 4.413427244000934e-06, + "loss": 0.1499, + "mean_token_accuracy": 0.9453479647636414, + "step": 6006 + }, + { + "epoch": 3.0035, + "grad_norm": 1.6993605379495311, + "learning_rate": 4.413146395982821e-06, + "loss": 0.2109, + "mean_token_accuracy": 0.9283819794654846, + "step": 6007 + }, + { + "epoch": 3.004, + "grad_norm": 2.244021668629218, + "learning_rate": 4.412865489686936e-06, + "loss": 0.1768, + "mean_token_accuracy": 0.9404639005661011, + "step": 6008 + }, + { + "epoch": 3.0045, + "grad_norm": 1.9165396004071629, + "learning_rate": 4.412584525121836e-06, + "loss": 0.2498, + "mean_token_accuracy": 0.9122155904769897, + "step": 6009 + }, + { + "epoch": 3.005, + "grad_norm": 1.863643592358646, + "learning_rate": 4.412303502296081e-06, + "loss": 0.2025, + "mean_token_accuracy": 0.9271348118782043, + "step": 6010 + }, + { + "epoch": 3.0055, + "grad_norm": 2.794685277140224, + "learning_rate": 4.412022421218228e-06, + "loss": 0.2004, + "mean_token_accuracy": 0.935957133769989, + "step": 6011 + }, + { + "epoch": 3.006, + "grad_norm": 2.1736165967634395, + "learning_rate": 4.411741281896843e-06, + "loss": 0.2047, + "mean_token_accuracy": 0.9247449040412903, + "step": 6012 + }, + { + "epoch": 3.0065, + "grad_norm": 2.525282363564883, + "learning_rate": 4.411460084340488e-06, + "loss": 0.2136, + "mean_token_accuracy": 0.9241635799407959, + "step": 6013 + }, + { + "epoch": 3.007, + "grad_norm": 5.136513951923588, + "learning_rate": 4.41117882855773e-06, + "loss": 0.2197, + "mean_token_accuracy": 0.921583354473114, + "step": 6014 + }, + { + "epoch": 3.0075, + "grad_norm": 2.4253353746655666, + "learning_rate": 4.410897514557134e-06, + "loss": 0.1846, + "mean_token_accuracy": 0.9371337294578552, + "step": 6015 + }, + { + "epoch": 3.008, + "grad_norm": 3.120930308000918, + "learning_rate": 4.4106161423472726e-06, + "loss": 0.2209, + "mean_token_accuracy": 0.9249489903450012, + "step": 6016 + }, + { + "epoch": 3.0085, + "grad_norm": 2.7019597444898227, + "learning_rate": 4.410334711936715e-06, + "loss": 0.2279, + "mean_token_accuracy": 0.9186307787895203, + "step": 6017 + }, + { + "epoch": 3.009, + "grad_norm": 1.5955229835950882, + "learning_rate": 4.410053223334036e-06, + "loss": 0.1637, + "mean_token_accuracy": 0.9427207708358765, + "step": 6018 + }, + { + "epoch": 3.0095, + "grad_norm": 2.1965277540779633, + "learning_rate": 4.4097716765478066e-06, + "loss": 0.1813, + "mean_token_accuracy": 0.9345335364341736, + "step": 6019 + }, + { + "epoch": 3.01, + "grad_norm": 3.491261141483078, + "learning_rate": 4.409490071586606e-06, + "loss": 0.2052, + "mean_token_accuracy": 0.9284054636955261, + "step": 6020 + }, + { + "epoch": 3.0105, + "grad_norm": 34.259326668775834, + "learning_rate": 4.4092084084590115e-06, + "loss": 0.1864, + "mean_token_accuracy": 0.9426478743553162, + "step": 6021 + }, + { + "epoch": 3.011, + "grad_norm": 3.757601435092056, + "learning_rate": 4.408926687173604e-06, + "loss": 0.1963, + "mean_token_accuracy": 0.9269675612449646, + "step": 6022 + }, + { + "epoch": 3.0115, + "grad_norm": 1.414710396485744, + "learning_rate": 4.408644907738963e-06, + "loss": 0.1493, + "mean_token_accuracy": 0.9451336860656738, + "step": 6023 + }, + { + "epoch": 3.012, + "grad_norm": 2.2052752513080374, + "learning_rate": 4.408363070163675e-06, + "loss": 0.1699, + "mean_token_accuracy": 0.9377973675727844, + "step": 6024 + }, + { + "epoch": 3.0125, + "grad_norm": 2.2653688836404973, + "learning_rate": 4.408081174456322e-06, + "loss": 0.2427, + "mean_token_accuracy": 0.9227694869041443, + "step": 6025 + }, + { + "epoch": 3.013, + "grad_norm": 2.930018990713332, + "learning_rate": 4.407799220625494e-06, + "loss": 0.2555, + "mean_token_accuracy": 0.9130755066871643, + "step": 6026 + }, + { + "epoch": 3.0135, + "grad_norm": 1.9811215723617719, + "learning_rate": 4.407517208679779e-06, + "loss": 0.227, + "mean_token_accuracy": 0.9307770133018494, + "step": 6027 + }, + { + "epoch": 3.014, + "grad_norm": 2.3833219181199254, + "learning_rate": 4.407235138627766e-06, + "loss": 0.2119, + "mean_token_accuracy": 0.9289506077766418, + "step": 6028 + }, + { + "epoch": 3.0145, + "grad_norm": 1.8436101131282487, + "learning_rate": 4.406953010478049e-06, + "loss": 0.2118, + "mean_token_accuracy": 0.9250786304473877, + "step": 6029 + }, + { + "epoch": 3.015, + "grad_norm": 3.0244402864836375, + "learning_rate": 4.406670824239221e-06, + "loss": 0.195, + "mean_token_accuracy": 0.9265727996826172, + "step": 6030 + }, + { + "epoch": 3.0155, + "grad_norm": 1.7127111666888368, + "learning_rate": 4.4063885799198795e-06, + "loss": 0.2112, + "mean_token_accuracy": 0.9171247482299805, + "step": 6031 + }, + { + "epoch": 3.016, + "grad_norm": 1.8398847746400924, + "learning_rate": 4.40610627752862e-06, + "loss": 0.1551, + "mean_token_accuracy": 0.9423300623893738, + "step": 6032 + }, + { + "epoch": 3.0165, + "grad_norm": 1.597360726251469, + "learning_rate": 4.405823917074044e-06, + "loss": 0.1663, + "mean_token_accuracy": 0.9378318190574646, + "step": 6033 + }, + { + "epoch": 3.017, + "grad_norm": 2.8851237137196453, + "learning_rate": 4.405541498564751e-06, + "loss": 0.169, + "mean_token_accuracy": 0.9408463835716248, + "step": 6034 + }, + { + "epoch": 3.0175, + "grad_norm": 3.0799972718645257, + "learning_rate": 4.405259022009345e-06, + "loss": 0.2065, + "mean_token_accuracy": 0.9231297373771667, + "step": 6035 + }, + { + "epoch": 3.018, + "grad_norm": 1.7112492197054059, + "learning_rate": 4.40497648741643e-06, + "loss": 0.2015, + "mean_token_accuracy": 0.9268914461135864, + "step": 6036 + }, + { + "epoch": 3.0185, + "grad_norm": 2.1927144041809674, + "learning_rate": 4.404693894794613e-06, + "loss": 0.1934, + "mean_token_accuracy": 0.9348558783531189, + "step": 6037 + }, + { + "epoch": 3.019, + "grad_norm": 2.447313107988425, + "learning_rate": 4.404411244152503e-06, + "loss": 0.1703, + "mean_token_accuracy": 0.9371176362037659, + "step": 6038 + }, + { + "epoch": 3.0195, + "grad_norm": 2.0897934092488537, + "learning_rate": 4.404128535498708e-06, + "loss": 0.2234, + "mean_token_accuracy": 0.922335684299469, + "step": 6039 + }, + { + "epoch": 3.02, + "grad_norm": 3.8614579804422804, + "learning_rate": 4.403845768841842e-06, + "loss": 0.1991, + "mean_token_accuracy": 0.9362198114395142, + "step": 6040 + }, + { + "epoch": 3.0205, + "grad_norm": 1.684312033622148, + "learning_rate": 4.403562944190518e-06, + "loss": 0.1878, + "mean_token_accuracy": 0.9304347634315491, + "step": 6041 + }, + { + "epoch": 3.021, + "grad_norm": 1.9836064765886614, + "learning_rate": 4.40328006155335e-06, + "loss": 0.2041, + "mean_token_accuracy": 0.9309338331222534, + "step": 6042 + }, + { + "epoch": 3.0215, + "grad_norm": 2.543272343664086, + "learning_rate": 4.402997120938955e-06, + "loss": 0.161, + "mean_token_accuracy": 0.9423597455024719, + "step": 6043 + }, + { + "epoch": 3.022, + "grad_norm": 3.703670441668173, + "learning_rate": 4.402714122355955e-06, + "loss": 0.2244, + "mean_token_accuracy": 0.9232932329177856, + "step": 6044 + }, + { + "epoch": 3.0225, + "grad_norm": 2.015841566521369, + "learning_rate": 4.402431065812968e-06, + "loss": 0.1935, + "mean_token_accuracy": 0.9234881401062012, + "step": 6045 + }, + { + "epoch": 3.023, + "grad_norm": 2.290195169647077, + "learning_rate": 4.402147951318616e-06, + "loss": 0.2122, + "mean_token_accuracy": 0.9246740937232971, + "step": 6046 + }, + { + "epoch": 3.0235, + "grad_norm": 1.9348132878865572, + "learning_rate": 4.401864778881524e-06, + "loss": 0.2227, + "mean_token_accuracy": 0.9233145713806152, + "step": 6047 + }, + { + "epoch": 3.024, + "grad_norm": 2.5314392156352348, + "learning_rate": 4.401581548510319e-06, + "loss": 0.1982, + "mean_token_accuracy": 0.9285004734992981, + "step": 6048 + }, + { + "epoch": 3.0245, + "grad_norm": 1.9778279082880674, + "learning_rate": 4.4012982602136265e-06, + "loss": 0.2439, + "mean_token_accuracy": 0.9159570932388306, + "step": 6049 + }, + { + "epoch": 3.025, + "grad_norm": 1.8817636148562722, + "learning_rate": 4.401014914000078e-06, + "loss": 0.1788, + "mean_token_accuracy": 0.9360895156860352, + "step": 6050 + }, + { + "epoch": 3.0255, + "grad_norm": 1.7279408067599242, + "learning_rate": 4.400731509878304e-06, + "loss": 0.21, + "mean_token_accuracy": 0.9207335710525513, + "step": 6051 + }, + { + "epoch": 3.026, + "grad_norm": 1.4369738872393616, + "learning_rate": 4.400448047856935e-06, + "loss": 0.1607, + "mean_token_accuracy": 0.9424511790275574, + "step": 6052 + }, + { + "epoch": 3.0265, + "grad_norm": 2.8009618282578157, + "learning_rate": 4.4001645279446116e-06, + "loss": 0.192, + "mean_token_accuracy": 0.9294827580451965, + "step": 6053 + }, + { + "epoch": 3.027, + "grad_norm": 12.181619305336646, + "learning_rate": 4.399880950149964e-06, + "loss": 0.2165, + "mean_token_accuracy": 0.9222020506858826, + "step": 6054 + }, + { + "epoch": 3.0275, + "grad_norm": 2.4888627543655253, + "learning_rate": 4.399597314481635e-06, + "loss": 0.2319, + "mean_token_accuracy": 0.9176381826400757, + "step": 6055 + }, + { + "epoch": 3.028, + "grad_norm": 3.656707364626644, + "learning_rate": 4.399313620948262e-06, + "loss": 0.1856, + "mean_token_accuracy": 0.9349079132080078, + "step": 6056 + }, + { + "epoch": 3.0285, + "grad_norm": 2.3672532116940097, + "learning_rate": 4.399029869558487e-06, + "loss": 0.2359, + "mean_token_accuracy": 0.9182866811752319, + "step": 6057 + }, + { + "epoch": 3.029, + "grad_norm": 2.3679234445381114, + "learning_rate": 4.398746060320957e-06, + "loss": 0.243, + "mean_token_accuracy": 0.9158461689949036, + "step": 6058 + }, + { + "epoch": 3.0295, + "grad_norm": 2.646463490677134, + "learning_rate": 4.398462193244311e-06, + "loss": 0.2133, + "mean_token_accuracy": 0.9217467308044434, + "step": 6059 + }, + { + "epoch": 3.03, + "grad_norm": 2.6014938173793927, + "learning_rate": 4.398178268337202e-06, + "loss": 0.2032, + "mean_token_accuracy": 0.9315822720527649, + "step": 6060 + }, + { + "epoch": 3.0305, + "grad_norm": 2.104406504174591, + "learning_rate": 4.3978942856082766e-06, + "loss": 0.1721, + "mean_token_accuracy": 0.9357782006263733, + "step": 6061 + }, + { + "epoch": 3.031, + "grad_norm": 2.135415707955606, + "learning_rate": 4.3976102450661844e-06, + "loss": 0.2393, + "mean_token_accuracy": 0.9231550097465515, + "step": 6062 + }, + { + "epoch": 3.0315, + "grad_norm": 1.7580527557890262, + "learning_rate": 4.397326146719579e-06, + "loss": 0.2115, + "mean_token_accuracy": 0.9245818257331848, + "step": 6063 + }, + { + "epoch": 3.032, + "grad_norm": 3.0224240172785777, + "learning_rate": 4.3970419905771145e-06, + "loss": 0.1858, + "mean_token_accuracy": 0.9342055320739746, + "step": 6064 + }, + { + "epoch": 3.0325, + "grad_norm": 7.3228752842428415, + "learning_rate": 4.396757776647446e-06, + "loss": 0.1818, + "mean_token_accuracy": 0.9322569370269775, + "step": 6065 + }, + { + "epoch": 3.033, + "grad_norm": 2.9016804387532162, + "learning_rate": 4.396473504939231e-06, + "loss": 0.2405, + "mean_token_accuracy": 0.9156262874603271, + "step": 6066 + }, + { + "epoch": 3.0335, + "grad_norm": 1.4941951473297361, + "learning_rate": 4.39618917546113e-06, + "loss": 0.1824, + "mean_token_accuracy": 0.9341161847114563, + "step": 6067 + }, + { + "epoch": 3.034, + "grad_norm": 2.0163906936697793, + "learning_rate": 4.3959047882218055e-06, + "loss": 0.1785, + "mean_token_accuracy": 0.9351666569709778, + "step": 6068 + }, + { + "epoch": 3.0345, + "grad_norm": 2.3113969445166718, + "learning_rate": 4.3956203432299175e-06, + "loss": 0.2234, + "mean_token_accuracy": 0.9200761318206787, + "step": 6069 + }, + { + "epoch": 3.035, + "grad_norm": 3.517736546986186, + "learning_rate": 4.395335840494131e-06, + "loss": 0.1528, + "mean_token_accuracy": 0.9416218400001526, + "step": 6070 + }, + { + "epoch": 3.0355, + "grad_norm": 4.0921071428401055, + "learning_rate": 4.395051280023114e-06, + "loss": 0.1953, + "mean_token_accuracy": 0.932464063167572, + "step": 6071 + }, + { + "epoch": 3.036, + "grad_norm": 2.343078892497141, + "learning_rate": 4.3947666618255335e-06, + "loss": 0.2284, + "mean_token_accuracy": 0.9204068183898926, + "step": 6072 + }, + { + "epoch": 3.0365, + "grad_norm": 1.7374477703176259, + "learning_rate": 4.394481985910061e-06, + "loss": 0.186, + "mean_token_accuracy": 0.9401130080223083, + "step": 6073 + }, + { + "epoch": 3.037, + "grad_norm": 5.157155192848638, + "learning_rate": 4.394197252285366e-06, + "loss": 0.1922, + "mean_token_accuracy": 0.9288461804389954, + "step": 6074 + }, + { + "epoch": 3.0375, + "grad_norm": 2.05015765614643, + "learning_rate": 4.393912460960125e-06, + "loss": 0.2249, + "mean_token_accuracy": 0.9282633066177368, + "step": 6075 + }, + { + "epoch": 3.038, + "grad_norm": 1.817830863126065, + "learning_rate": 4.39362761194301e-06, + "loss": 0.1785, + "mean_token_accuracy": 0.9405395984649658, + "step": 6076 + }, + { + "epoch": 3.0385, + "grad_norm": 3.3189303941804353, + "learning_rate": 4.393342705242699e-06, + "loss": 0.2467, + "mean_token_accuracy": 0.9122552871704102, + "step": 6077 + }, + { + "epoch": 3.039, + "grad_norm": 10.530046823122769, + "learning_rate": 4.3930577408678724e-06, + "loss": 0.1769, + "mean_token_accuracy": 0.9382691979408264, + "step": 6078 + }, + { + "epoch": 3.0395, + "grad_norm": 1.8650125486056108, + "learning_rate": 4.392772718827209e-06, + "loss": 0.2421, + "mean_token_accuracy": 0.9121082425117493, + "step": 6079 + }, + { + "epoch": 3.04, + "grad_norm": 2.727871736874132, + "learning_rate": 4.3924876391293915e-06, + "loss": 0.2303, + "mean_token_accuracy": 0.9211491346359253, + "step": 6080 + }, + { + "epoch": 3.0405, + "grad_norm": 1.2867293652942828, + "learning_rate": 4.392202501783104e-06, + "loss": 0.1443, + "mean_token_accuracy": 0.9461150765419006, + "step": 6081 + }, + { + "epoch": 3.041, + "grad_norm": 1.783514597507084, + "learning_rate": 4.391917306797032e-06, + "loss": 0.2005, + "mean_token_accuracy": 0.933588981628418, + "step": 6082 + }, + { + "epoch": 3.0415, + "grad_norm": 2.132404782936284, + "learning_rate": 4.391632054179864e-06, + "loss": 0.1643, + "mean_token_accuracy": 0.9398430585861206, + "step": 6083 + }, + { + "epoch": 3.042, + "grad_norm": 1.7020874489214413, + "learning_rate": 4.3913467439402875e-06, + "loss": 0.2233, + "mean_token_accuracy": 0.9159906506538391, + "step": 6084 + }, + { + "epoch": 3.0425, + "grad_norm": 4.118689338792462, + "learning_rate": 4.391061376086996e-06, + "loss": 0.222, + "mean_token_accuracy": 0.9307112097740173, + "step": 6085 + }, + { + "epoch": 3.043, + "grad_norm": 1.3995011010763423, + "learning_rate": 4.39077595062868e-06, + "loss": 0.1249, + "mean_token_accuracy": 0.9517391920089722, + "step": 6086 + }, + { + "epoch": 3.0435, + "grad_norm": 2.0795966510142847, + "learning_rate": 4.390490467574036e-06, + "loss": 0.2076, + "mean_token_accuracy": 0.9308782815933228, + "step": 6087 + }, + { + "epoch": 3.044, + "grad_norm": 1.5248036469305648, + "learning_rate": 4.3902049269317585e-06, + "loss": 0.155, + "mean_token_accuracy": 0.942958652973175, + "step": 6088 + }, + { + "epoch": 3.0445, + "grad_norm": 3.4616635898746595, + "learning_rate": 4.389919328710545e-06, + "loss": 0.2002, + "mean_token_accuracy": 0.9321931600570679, + "step": 6089 + }, + { + "epoch": 3.045, + "grad_norm": 3.118238055698748, + "learning_rate": 4.389633672919099e-06, + "loss": 0.2155, + "mean_token_accuracy": 0.9258840680122375, + "step": 6090 + }, + { + "epoch": 3.0455, + "grad_norm": 3.2132101668214297, + "learning_rate": 4.389347959566119e-06, + "loss": 0.2049, + "mean_token_accuracy": 0.938345193862915, + "step": 6091 + }, + { + "epoch": 3.046, + "grad_norm": 1.4859078715199325, + "learning_rate": 4.389062188660309e-06, + "loss": 0.1496, + "mean_token_accuracy": 0.9454760551452637, + "step": 6092 + }, + { + "epoch": 3.0465, + "grad_norm": 2.5283566188650872, + "learning_rate": 4.388776360210374e-06, + "loss": 0.1925, + "mean_token_accuracy": 0.9318974614143372, + "step": 6093 + }, + { + "epoch": 3.047, + "grad_norm": 2.1464597078644068, + "learning_rate": 4.3884904742250215e-06, + "loss": 0.1797, + "mean_token_accuracy": 0.9344416856765747, + "step": 6094 + }, + { + "epoch": 3.0475, + "grad_norm": 2.012541533351681, + "learning_rate": 4.388204530712959e-06, + "loss": 0.178, + "mean_token_accuracy": 0.9352785348892212, + "step": 6095 + }, + { + "epoch": 3.048, + "grad_norm": 34.20276982446903, + "learning_rate": 4.387918529682898e-06, + "loss": 0.1549, + "mean_token_accuracy": 0.941662609577179, + "step": 6096 + }, + { + "epoch": 3.0485, + "grad_norm": 2.3255631581690133, + "learning_rate": 4.38763247114355e-06, + "loss": 0.2529, + "mean_token_accuracy": 0.9102770090103149, + "step": 6097 + }, + { + "epoch": 3.049, + "grad_norm": 3.267499008398246, + "learning_rate": 4.387346355103629e-06, + "loss": 0.178, + "mean_token_accuracy": 0.9359927177429199, + "step": 6098 + }, + { + "epoch": 3.0495, + "grad_norm": 3.9895371239971804, + "learning_rate": 4.387060181571849e-06, + "loss": 0.2244, + "mean_token_accuracy": 0.9233555197715759, + "step": 6099 + }, + { + "epoch": 3.05, + "grad_norm": 2.4790638867366077, + "learning_rate": 4.386773950556931e-06, + "loss": 0.1988, + "mean_token_accuracy": 0.9380837082862854, + "step": 6100 + }, + { + "epoch": 3.0505, + "grad_norm": 2.8533390226035364, + "learning_rate": 4.38648766206759e-06, + "loss": 0.1399, + "mean_token_accuracy": 0.9508455395698547, + "step": 6101 + }, + { + "epoch": 3.051, + "grad_norm": 2.787325608674077, + "learning_rate": 4.386201316112549e-06, + "loss": 0.1734, + "mean_token_accuracy": 0.9331983923912048, + "step": 6102 + }, + { + "epoch": 3.0515, + "grad_norm": 2.0415959966443427, + "learning_rate": 4.3859149127005315e-06, + "loss": 0.1931, + "mean_token_accuracy": 0.9313265681266785, + "step": 6103 + }, + { + "epoch": 3.052, + "grad_norm": 2.108503023707309, + "learning_rate": 4.38562845184026e-06, + "loss": 0.2394, + "mean_token_accuracy": 0.9170962572097778, + "step": 6104 + }, + { + "epoch": 3.0525, + "grad_norm": 1.790280889537242, + "learning_rate": 4.385341933540461e-06, + "loss": 0.2144, + "mean_token_accuracy": 0.9200471043586731, + "step": 6105 + }, + { + "epoch": 3.053, + "grad_norm": 1.8002008847956177, + "learning_rate": 4.385055357809863e-06, + "loss": 0.1777, + "mean_token_accuracy": 0.9347331523895264, + "step": 6106 + }, + { + "epoch": 3.0535, + "grad_norm": 1.760739630490873, + "learning_rate": 4.3847687246571955e-06, + "loss": 0.1813, + "mean_token_accuracy": 0.9421226978302002, + "step": 6107 + }, + { + "epoch": 3.054, + "grad_norm": 2.2109180495074674, + "learning_rate": 4.384482034091189e-06, + "loss": 0.2137, + "mean_token_accuracy": 0.9270804524421692, + "step": 6108 + }, + { + "epoch": 3.0545, + "grad_norm": 2.2161001297875393, + "learning_rate": 4.384195286120577e-06, + "loss": 0.2024, + "mean_token_accuracy": 0.9327617287635803, + "step": 6109 + }, + { + "epoch": 3.055, + "grad_norm": 2.414467742234822, + "learning_rate": 4.3839084807540956e-06, + "loss": 0.2261, + "mean_token_accuracy": 0.9216201901435852, + "step": 6110 + }, + { + "epoch": 3.0555, + "grad_norm": 1.7506781741218915, + "learning_rate": 4.383621618000479e-06, + "loss": 0.2142, + "mean_token_accuracy": 0.923730194568634, + "step": 6111 + }, + { + "epoch": 3.056, + "grad_norm": 5.7596392167229356, + "learning_rate": 4.383334697868468e-06, + "loss": 0.2102, + "mean_token_accuracy": 0.9226765632629395, + "step": 6112 + }, + { + "epoch": 3.0565, + "grad_norm": 2.2645073663216198, + "learning_rate": 4.3830477203668005e-06, + "loss": 0.1793, + "mean_token_accuracy": 0.9351161122322083, + "step": 6113 + }, + { + "epoch": 3.057, + "grad_norm": 6.660900192287749, + "learning_rate": 4.3827606855042194e-06, + "loss": 0.1624, + "mean_token_accuracy": 0.9389994144439697, + "step": 6114 + }, + { + "epoch": 3.0575, + "grad_norm": 2.325274871993864, + "learning_rate": 4.3824735932894695e-06, + "loss": 0.1755, + "mean_token_accuracy": 0.9394830465316772, + "step": 6115 + }, + { + "epoch": 3.058, + "grad_norm": 2.223807729453683, + "learning_rate": 4.382186443731294e-06, + "loss": 0.1612, + "mean_token_accuracy": 0.9406509399414062, + "step": 6116 + }, + { + "epoch": 3.0585, + "grad_norm": 38.176671461688876, + "learning_rate": 4.38189923683844e-06, + "loss": 0.1879, + "mean_token_accuracy": 0.927578330039978, + "step": 6117 + }, + { + "epoch": 3.059, + "grad_norm": 5.596590709417937, + "learning_rate": 4.381611972619658e-06, + "loss": 0.1734, + "mean_token_accuracy": 0.9362244606018066, + "step": 6118 + }, + { + "epoch": 3.0595, + "grad_norm": 1.7711448439833801, + "learning_rate": 4.3813246510836975e-06, + "loss": 0.1491, + "mean_token_accuracy": 0.9423520565032959, + "step": 6119 + }, + { + "epoch": 3.06, + "grad_norm": 2.9190577104257383, + "learning_rate": 4.381037272239311e-06, + "loss": 0.3116, + "mean_token_accuracy": 0.9148340225219727, + "step": 6120 + }, + { + "epoch": 3.0605, + "grad_norm": 2.3070050318339193, + "learning_rate": 4.380749836095253e-06, + "loss": 0.1919, + "mean_token_accuracy": 0.9353869557380676, + "step": 6121 + }, + { + "epoch": 3.061, + "grad_norm": 1.6812780740421671, + "learning_rate": 4.380462342660279e-06, + "loss": 0.2315, + "mean_token_accuracy": 0.9184191823005676, + "step": 6122 + }, + { + "epoch": 3.0615, + "grad_norm": 5.26719326451194, + "learning_rate": 4.3801747919431455e-06, + "loss": 0.2114, + "mean_token_accuracy": 0.9290030002593994, + "step": 6123 + }, + { + "epoch": 3.062, + "grad_norm": 2.582633847548631, + "learning_rate": 4.379887183952614e-06, + "loss": 0.2208, + "mean_token_accuracy": 0.922766387462616, + "step": 6124 + }, + { + "epoch": 3.0625, + "grad_norm": 2.000311127907569, + "learning_rate": 4.379599518697444e-06, + "loss": 0.1928, + "mean_token_accuracy": 0.9316632151603699, + "step": 6125 + }, + { + "epoch": 3.063, + "grad_norm": 5.699795938672695, + "learning_rate": 4.379311796186399e-06, + "loss": 0.2143, + "mean_token_accuracy": 0.9277358651161194, + "step": 6126 + }, + { + "epoch": 3.0635, + "grad_norm": 1.560048720239965, + "learning_rate": 4.379024016428242e-06, + "loss": 0.1414, + "mean_token_accuracy": 0.9457769989967346, + "step": 6127 + }, + { + "epoch": 3.064, + "grad_norm": 3.562753918528687, + "learning_rate": 4.3787361794317405e-06, + "loss": 0.2013, + "mean_token_accuracy": 0.9395421743392944, + "step": 6128 + }, + { + "epoch": 3.0645, + "grad_norm": 1.8676052037159134, + "learning_rate": 4.378448285205663e-06, + "loss": 0.1906, + "mean_token_accuracy": 0.9296178817749023, + "step": 6129 + }, + { + "epoch": 3.065, + "grad_norm": 2.559703268272904, + "learning_rate": 4.378160333758779e-06, + "loss": 0.2066, + "mean_token_accuracy": 0.9288737177848816, + "step": 6130 + }, + { + "epoch": 3.0655, + "grad_norm": 12.350096322230181, + "learning_rate": 4.377872325099859e-06, + "loss": 0.2366, + "mean_token_accuracy": 0.9188562631607056, + "step": 6131 + }, + { + "epoch": 3.066, + "grad_norm": 3.152645273951788, + "learning_rate": 4.377584259237676e-06, + "loss": 0.1942, + "mean_token_accuracy": 0.9337891340255737, + "step": 6132 + }, + { + "epoch": 3.0665, + "grad_norm": 3.684499663482347, + "learning_rate": 4.3772961361810075e-06, + "loss": 0.1796, + "mean_token_accuracy": 0.9371006488800049, + "step": 6133 + }, + { + "epoch": 3.067, + "grad_norm": 2.753864322181415, + "learning_rate": 4.377007955938628e-06, + "loss": 0.2275, + "mean_token_accuracy": 0.9173920750617981, + "step": 6134 + }, + { + "epoch": 3.0675, + "grad_norm": 1.6565997183802672, + "learning_rate": 4.3767197185193164e-06, + "loss": 0.1631, + "mean_token_accuracy": 0.9400718212127686, + "step": 6135 + }, + { + "epoch": 3.068, + "grad_norm": 2.9637285399664015, + "learning_rate": 4.3764314239318534e-06, + "loss": 0.2313, + "mean_token_accuracy": 0.9166905879974365, + "step": 6136 + }, + { + "epoch": 3.0685000000000002, + "grad_norm": 6.53139725305222, + "learning_rate": 4.376143072185021e-06, + "loss": 0.207, + "mean_token_accuracy": 0.926188051700592, + "step": 6137 + }, + { + "epoch": 3.069, + "grad_norm": 2.1508453148159545, + "learning_rate": 4.375854663287602e-06, + "loss": 0.1849, + "mean_token_accuracy": 0.9343278408050537, + "step": 6138 + }, + { + "epoch": 3.0695, + "grad_norm": 4.802427606639156, + "learning_rate": 4.3755661972483824e-06, + "loss": 0.2165, + "mean_token_accuracy": 0.9236928820610046, + "step": 6139 + }, + { + "epoch": 3.07, + "grad_norm": 2.3062740090445226, + "learning_rate": 4.3752776740761495e-06, + "loss": 0.2396, + "mean_token_accuracy": 0.9180271625518799, + "step": 6140 + }, + { + "epoch": 3.0705, + "grad_norm": 2.285439378406509, + "learning_rate": 4.374989093779692e-06, + "loss": 0.185, + "mean_token_accuracy": 0.9303590655326843, + "step": 6141 + }, + { + "epoch": 3.071, + "grad_norm": 2.1756871450533786, + "learning_rate": 4.374700456367801e-06, + "loss": 0.205, + "mean_token_accuracy": 0.9242451190948486, + "step": 6142 + }, + { + "epoch": 3.0715, + "grad_norm": 1.5500558211304578, + "learning_rate": 4.374411761849268e-06, + "loss": 0.1326, + "mean_token_accuracy": 0.9481816291809082, + "step": 6143 + }, + { + "epoch": 3.072, + "grad_norm": 2.1010593122944963, + "learning_rate": 4.374123010232888e-06, + "loss": 0.1665, + "mean_token_accuracy": 0.9404582381248474, + "step": 6144 + }, + { + "epoch": 3.0725, + "grad_norm": 2.404595963991917, + "learning_rate": 4.373834201527457e-06, + "loss": 0.199, + "mean_token_accuracy": 0.9291030764579773, + "step": 6145 + }, + { + "epoch": 3.073, + "grad_norm": 3.44114112321913, + "learning_rate": 4.373545335741771e-06, + "loss": 0.2377, + "mean_token_accuracy": 0.920875072479248, + "step": 6146 + }, + { + "epoch": 3.0735, + "grad_norm": 2.705359829708706, + "learning_rate": 4.373256412884632e-06, + "loss": 0.2814, + "mean_token_accuracy": 0.8974065184593201, + "step": 6147 + }, + { + "epoch": 3.074, + "grad_norm": 2.4641274809368716, + "learning_rate": 4.372967432964838e-06, + "loss": 0.2367, + "mean_token_accuracy": 0.9178775548934937, + "step": 6148 + }, + { + "epoch": 3.0745, + "grad_norm": 2.746131721811722, + "learning_rate": 4.372678395991196e-06, + "loss": 0.1789, + "mean_token_accuracy": 0.9331049919128418, + "step": 6149 + }, + { + "epoch": 3.075, + "grad_norm": 2.0853017812466192, + "learning_rate": 4.372389301972506e-06, + "loss": 0.1777, + "mean_token_accuracy": 0.9343596696853638, + "step": 6150 + }, + { + "epoch": 3.0755, + "grad_norm": 3.6552800466510416, + "learning_rate": 4.372100150917576e-06, + "loss": 0.1984, + "mean_token_accuracy": 0.9304883480072021, + "step": 6151 + }, + { + "epoch": 3.076, + "grad_norm": 2.3411103160084545, + "learning_rate": 4.3718109428352155e-06, + "loss": 0.1826, + "mean_token_accuracy": 0.9328609108924866, + "step": 6152 + }, + { + "epoch": 3.0765, + "grad_norm": 3.1788795209843124, + "learning_rate": 4.371521677734233e-06, + "loss": 0.2129, + "mean_token_accuracy": 0.922897219657898, + "step": 6153 + }, + { + "epoch": 3.077, + "grad_norm": 3.4288851347351375, + "learning_rate": 4.37123235562344e-06, + "loss": 0.2121, + "mean_token_accuracy": 0.9241645336151123, + "step": 6154 + }, + { + "epoch": 3.0775, + "grad_norm": 4.7109154644809115, + "learning_rate": 4.370942976511651e-06, + "loss": 0.1639, + "mean_token_accuracy": 0.9425414204597473, + "step": 6155 + }, + { + "epoch": 3.078, + "grad_norm": 2.5866480507808776, + "learning_rate": 4.370653540407679e-06, + "loss": 0.2184, + "mean_token_accuracy": 0.9258184432983398, + "step": 6156 + }, + { + "epoch": 3.0785, + "grad_norm": 1.7966354711186003, + "learning_rate": 4.3703640473203405e-06, + "loss": 0.1984, + "mean_token_accuracy": 0.9323921203613281, + "step": 6157 + }, + { + "epoch": 3.079, + "grad_norm": 11.482928068517394, + "learning_rate": 4.370074497258456e-06, + "loss": 0.2033, + "mean_token_accuracy": 0.9326820969581604, + "step": 6158 + }, + { + "epoch": 3.0795, + "grad_norm": 3.047978988557688, + "learning_rate": 4.369784890230846e-06, + "loss": 0.2225, + "mean_token_accuracy": 0.9182983040809631, + "step": 6159 + }, + { + "epoch": 3.08, + "grad_norm": 1.656740437370862, + "learning_rate": 4.36949522624633e-06, + "loss": 0.2037, + "mean_token_accuracy": 0.9291363954544067, + "step": 6160 + }, + { + "epoch": 3.0805, + "grad_norm": 2.334536938877311, + "learning_rate": 4.369205505313733e-06, + "loss": 0.2217, + "mean_token_accuracy": 0.92825847864151, + "step": 6161 + }, + { + "epoch": 3.081, + "grad_norm": 1.7495640857608747, + "learning_rate": 4.368915727441881e-06, + "loss": 0.2083, + "mean_token_accuracy": 0.9209610223770142, + "step": 6162 + }, + { + "epoch": 3.0815, + "grad_norm": 1.8639653138570174, + "learning_rate": 4.3686258926396e-06, + "loss": 0.2041, + "mean_token_accuracy": 0.9271087646484375, + "step": 6163 + }, + { + "epoch": 3.082, + "grad_norm": 2.9928141441961933, + "learning_rate": 4.368336000915719e-06, + "loss": 0.189, + "mean_token_accuracy": 0.9299982190132141, + "step": 6164 + }, + { + "epoch": 3.0825, + "grad_norm": 1.909409219308415, + "learning_rate": 4.36804605227907e-06, + "loss": 0.2383, + "mean_token_accuracy": 0.9184010028839111, + "step": 6165 + }, + { + "epoch": 3.083, + "grad_norm": 3.8133172425070874, + "learning_rate": 4.367756046738484e-06, + "loss": 0.1834, + "mean_token_accuracy": 0.93410325050354, + "step": 6166 + }, + { + "epoch": 3.0835, + "grad_norm": 2.201457621824337, + "learning_rate": 4.367465984302794e-06, + "loss": 0.2262, + "mean_token_accuracy": 0.9204897880554199, + "step": 6167 + }, + { + "epoch": 3.084, + "grad_norm": 2.526857684513911, + "learning_rate": 4.36717586498084e-06, + "loss": 0.2364, + "mean_token_accuracy": 0.9172342419624329, + "step": 6168 + }, + { + "epoch": 3.0845, + "grad_norm": 3.4744440588716436, + "learning_rate": 4.366885688781453e-06, + "loss": 0.1665, + "mean_token_accuracy": 0.937701404094696, + "step": 6169 + }, + { + "epoch": 3.085, + "grad_norm": 1.7959830450342853, + "learning_rate": 4.366595455713479e-06, + "loss": 0.1708, + "mean_token_accuracy": 0.9389715790748596, + "step": 6170 + }, + { + "epoch": 3.0855, + "grad_norm": 5.739150967156397, + "learning_rate": 4.366305165785754e-06, + "loss": 0.2152, + "mean_token_accuracy": 0.932865560054779, + "step": 6171 + }, + { + "epoch": 3.086, + "grad_norm": 2.282992774487801, + "learning_rate": 4.366014819007124e-06, + "loss": 0.1708, + "mean_token_accuracy": 0.9420751929283142, + "step": 6172 + }, + { + "epoch": 3.0865, + "grad_norm": 4.026276532199351, + "learning_rate": 4.365724415386432e-06, + "loss": 0.2417, + "mean_token_accuracy": 0.9190788269042969, + "step": 6173 + }, + { + "epoch": 3.087, + "grad_norm": 2.271967905724668, + "learning_rate": 4.365433954932524e-06, + "loss": 0.2083, + "mean_token_accuracy": 0.9265561103820801, + "step": 6174 + }, + { + "epoch": 3.0875, + "grad_norm": 2.1474294944279575, + "learning_rate": 4.365143437654249e-06, + "loss": 0.1628, + "mean_token_accuracy": 0.9429495930671692, + "step": 6175 + }, + { + "epoch": 3.088, + "grad_norm": 1.9474053238619682, + "learning_rate": 4.364852863560456e-06, + "loss": 0.188, + "mean_token_accuracy": 0.929135262966156, + "step": 6176 + }, + { + "epoch": 3.0885, + "grad_norm": 3.2346948733095218, + "learning_rate": 4.364562232659995e-06, + "loss": 0.1943, + "mean_token_accuracy": 0.9343157410621643, + "step": 6177 + }, + { + "epoch": 3.089, + "grad_norm": 2.134692734079357, + "learning_rate": 4.364271544961722e-06, + "loss": 0.2267, + "mean_token_accuracy": 0.9190632700920105, + "step": 6178 + }, + { + "epoch": 3.0895, + "grad_norm": 4.411683985370883, + "learning_rate": 4.36398080047449e-06, + "loss": 0.2387, + "mean_token_accuracy": 0.9253246784210205, + "step": 6179 + }, + { + "epoch": 3.09, + "grad_norm": 2.0078418629619006, + "learning_rate": 4.3636899992071555e-06, + "loss": 0.2106, + "mean_token_accuracy": 0.9333445429801941, + "step": 6180 + }, + { + "epoch": 3.0905, + "grad_norm": 1.6594657280457326, + "learning_rate": 4.363399141168578e-06, + "loss": 0.1809, + "mean_token_accuracy": 0.9404232501983643, + "step": 6181 + }, + { + "epoch": 3.091, + "grad_norm": 3.4839236324637457, + "learning_rate": 4.363108226367616e-06, + "loss": 0.2158, + "mean_token_accuracy": 0.9256364703178406, + "step": 6182 + }, + { + "epoch": 3.0915, + "grad_norm": 1.6321094151263122, + "learning_rate": 4.362817254813133e-06, + "loss": 0.1868, + "mean_token_accuracy": 0.9379734992980957, + "step": 6183 + }, + { + "epoch": 3.092, + "grad_norm": 1.8082838498637608, + "learning_rate": 4.362526226513991e-06, + "loss": 0.2364, + "mean_token_accuracy": 0.923258364200592, + "step": 6184 + }, + { + "epoch": 3.0925, + "grad_norm": 1.788265032768386, + "learning_rate": 4.362235141479055e-06, + "loss": 0.1781, + "mean_token_accuracy": 0.9303421378135681, + "step": 6185 + }, + { + "epoch": 3.093, + "grad_norm": 1.7992208352627994, + "learning_rate": 4.361943999717194e-06, + "loss": 0.1952, + "mean_token_accuracy": 0.9313758015632629, + "step": 6186 + }, + { + "epoch": 3.0935, + "grad_norm": 14.423445868944123, + "learning_rate": 4.3616528012372746e-06, + "loss": 0.211, + "mean_token_accuracy": 0.9310164451599121, + "step": 6187 + }, + { + "epoch": 3.094, + "grad_norm": 1.8914966326408509, + "learning_rate": 4.3613615460481686e-06, + "loss": 0.18, + "mean_token_accuracy": 0.9348168969154358, + "step": 6188 + }, + { + "epoch": 3.0945, + "grad_norm": 2.318419989080959, + "learning_rate": 4.361070234158747e-06, + "loss": 0.2163, + "mean_token_accuracy": 0.9200351238250732, + "step": 6189 + }, + { + "epoch": 3.095, + "grad_norm": 3.0067781969092278, + "learning_rate": 4.360778865577885e-06, + "loss": 0.2059, + "mean_token_accuracy": 0.924164354801178, + "step": 6190 + }, + { + "epoch": 3.0955, + "grad_norm": 2.2431707723556333, + "learning_rate": 4.360487440314458e-06, + "loss": 0.2043, + "mean_token_accuracy": 0.9301643967628479, + "step": 6191 + }, + { + "epoch": 3.096, + "grad_norm": 1.7738244442427664, + "learning_rate": 4.3601959583773415e-06, + "loss": 0.1904, + "mean_token_accuracy": 0.9308624267578125, + "step": 6192 + }, + { + "epoch": 3.0965, + "grad_norm": 2.419803524161145, + "learning_rate": 4.359904419775417e-06, + "loss": 0.145, + "mean_token_accuracy": 0.9470587968826294, + "step": 6193 + }, + { + "epoch": 3.097, + "grad_norm": 3.1643862474466578, + "learning_rate": 4.359612824517563e-06, + "loss": 0.183, + "mean_token_accuracy": 0.9382447004318237, + "step": 6194 + }, + { + "epoch": 3.0975, + "grad_norm": 3.6893630901812027, + "learning_rate": 4.359321172612664e-06, + "loss": 0.1711, + "mean_token_accuracy": 0.9412110447883606, + "step": 6195 + }, + { + "epoch": 3.098, + "grad_norm": 4.182544080139617, + "learning_rate": 4.359029464069603e-06, + "loss": 0.2706, + "mean_token_accuracy": 0.9103714823722839, + "step": 6196 + }, + { + "epoch": 3.0985, + "grad_norm": 2.352812579717328, + "learning_rate": 4.358737698897266e-06, + "loss": 0.1779, + "mean_token_accuracy": 0.9341358542442322, + "step": 6197 + }, + { + "epoch": 3.099, + "grad_norm": 5.696207868056131, + "learning_rate": 4.358445877104541e-06, + "loss": 0.2052, + "mean_token_accuracy": 0.9321392774581909, + "step": 6198 + }, + { + "epoch": 3.0995, + "grad_norm": 6.048010108140964, + "learning_rate": 4.358153998700317e-06, + "loss": 0.1914, + "mean_token_accuracy": 0.9325296878814697, + "step": 6199 + }, + { + "epoch": 3.1, + "grad_norm": 3.5886658259261743, + "learning_rate": 4.357862063693486e-06, + "loss": 0.1706, + "mean_token_accuracy": 0.941440761089325, + "step": 6200 + }, + { + "epoch": 3.1005, + "grad_norm": 25.67252886396099, + "learning_rate": 4.35757007209294e-06, + "loss": 0.1774, + "mean_token_accuracy": 0.9359933733940125, + "step": 6201 + }, + { + "epoch": 3.101, + "grad_norm": 1.7827515600265191, + "learning_rate": 4.357278023907574e-06, + "loss": 0.1935, + "mean_token_accuracy": 0.9315328001976013, + "step": 6202 + }, + { + "epoch": 3.1015, + "grad_norm": 4.005399222458109, + "learning_rate": 4.3569859191462845e-06, + "loss": 0.2573, + "mean_token_accuracy": 0.9111962914466858, + "step": 6203 + }, + { + "epoch": 3.102, + "grad_norm": 2.2518181791469676, + "learning_rate": 4.356693757817969e-06, + "loss": 0.2103, + "mean_token_accuracy": 0.9294710159301758, + "step": 6204 + }, + { + "epoch": 3.1025, + "grad_norm": 2.4487960501027892, + "learning_rate": 4.356401539931528e-06, + "loss": 0.1747, + "mean_token_accuracy": 0.9315295219421387, + "step": 6205 + }, + { + "epoch": 3.103, + "grad_norm": 2.198635749797931, + "learning_rate": 4.356109265495861e-06, + "loss": 0.2382, + "mean_token_accuracy": 0.9233084917068481, + "step": 6206 + }, + { + "epoch": 3.1035, + "grad_norm": 5.855381083753939, + "learning_rate": 4.355816934519875e-06, + "loss": 0.2063, + "mean_token_accuracy": 0.9221804738044739, + "step": 6207 + }, + { + "epoch": 3.104, + "grad_norm": 1.7360431079651333, + "learning_rate": 4.355524547012471e-06, + "loss": 0.169, + "mean_token_accuracy": 0.9363964200019836, + "step": 6208 + }, + { + "epoch": 3.1045, + "grad_norm": 2.012257002469489, + "learning_rate": 4.3552321029825565e-06, + "loss": 0.2022, + "mean_token_accuracy": 0.9269366264343262, + "step": 6209 + }, + { + "epoch": 3.105, + "grad_norm": 93.28355773727017, + "learning_rate": 4.354939602439041e-06, + "loss": 0.178, + "mean_token_accuracy": 0.933265209197998, + "step": 6210 + }, + { + "epoch": 3.1055, + "grad_norm": 1.9189045682477683, + "learning_rate": 4.354647045390835e-06, + "loss": 0.2128, + "mean_token_accuracy": 0.9202861785888672, + "step": 6211 + }, + { + "epoch": 3.106, + "grad_norm": 3.906576706709946, + "learning_rate": 4.354354431846848e-06, + "loss": 0.1836, + "mean_token_accuracy": 0.9335658550262451, + "step": 6212 + }, + { + "epoch": 3.1065, + "grad_norm": 3.6565770146823033, + "learning_rate": 4.354061761815996e-06, + "loss": 0.2372, + "mean_token_accuracy": 0.9256498217582703, + "step": 6213 + }, + { + "epoch": 3.107, + "grad_norm": 2.4253138689741975, + "learning_rate": 4.353769035307193e-06, + "loss": 0.1657, + "mean_token_accuracy": 0.9365853667259216, + "step": 6214 + }, + { + "epoch": 3.1075, + "grad_norm": 2.2130893978172383, + "learning_rate": 4.353476252329356e-06, + "loss": 0.1566, + "mean_token_accuracy": 0.9468051791191101, + "step": 6215 + }, + { + "epoch": 3.108, + "grad_norm": 1.795936774414442, + "learning_rate": 4.353183412891403e-06, + "loss": 0.161, + "mean_token_accuracy": 0.9376770257949829, + "step": 6216 + }, + { + "epoch": 3.1085, + "grad_norm": 2.6593082204779708, + "learning_rate": 4.352890517002256e-06, + "loss": 0.2192, + "mean_token_accuracy": 0.9227976202964783, + "step": 6217 + }, + { + "epoch": 3.109, + "grad_norm": 2.0713919515818193, + "learning_rate": 4.352597564670836e-06, + "loss": 0.1664, + "mean_token_accuracy": 0.9403650164604187, + "step": 6218 + }, + { + "epoch": 3.1095, + "grad_norm": 2.7337903461423227, + "learning_rate": 4.352304555906067e-06, + "loss": 0.2416, + "mean_token_accuracy": 0.914038896560669, + "step": 6219 + }, + { + "epoch": 3.11, + "grad_norm": 7.331594714013287, + "learning_rate": 4.352011490716875e-06, + "loss": 0.1843, + "mean_token_accuracy": 0.9374560713768005, + "step": 6220 + }, + { + "epoch": 3.1105, + "grad_norm": 2.107786490535372, + "learning_rate": 4.351718369112188e-06, + "loss": 0.2286, + "mean_token_accuracy": 0.9160353541374207, + "step": 6221 + }, + { + "epoch": 3.111, + "grad_norm": 2.321658220311075, + "learning_rate": 4.3514251911009316e-06, + "loss": 0.2244, + "mean_token_accuracy": 0.9272617697715759, + "step": 6222 + }, + { + "epoch": 3.1115, + "grad_norm": 4.463981651582452, + "learning_rate": 4.35113195669204e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8830289840698242, + "step": 6223 + }, + { + "epoch": 3.112, + "grad_norm": 2.422086175822377, + "learning_rate": 4.3508386658944455e-06, + "loss": 0.2411, + "mean_token_accuracy": 0.9155555367469788, + "step": 6224 + }, + { + "epoch": 3.1125, + "grad_norm": 2.972066183254755, + "learning_rate": 4.350545318717081e-06, + "loss": 0.192, + "mean_token_accuracy": 0.9306861758232117, + "step": 6225 + }, + { + "epoch": 3.113, + "grad_norm": 5.9934425183526026, + "learning_rate": 4.350251915168881e-06, + "loss": 0.2051, + "mean_token_accuracy": 0.9316815733909607, + "step": 6226 + }, + { + "epoch": 3.1135, + "grad_norm": 1.869317873268384, + "learning_rate": 4.349958455258787e-06, + "loss": 0.1984, + "mean_token_accuracy": 0.9295796751976013, + "step": 6227 + }, + { + "epoch": 3.114, + "grad_norm": 2.3208895123801567, + "learning_rate": 4.349664938995734e-06, + "loss": 0.2538, + "mean_token_accuracy": 0.9193776249885559, + "step": 6228 + }, + { + "epoch": 3.1145, + "grad_norm": 1.7220509397185482, + "learning_rate": 4.349371366388666e-06, + "loss": 0.31, + "mean_token_accuracy": 0.903954803943634, + "step": 6229 + }, + { + "epoch": 3.115, + "grad_norm": 1.8278643769520295, + "learning_rate": 4.349077737446525e-06, + "loss": 0.2108, + "mean_token_accuracy": 0.920520007610321, + "step": 6230 + }, + { + "epoch": 3.1155, + "grad_norm": 2.5545314708168965, + "learning_rate": 4.348784052178255e-06, + "loss": 0.2194, + "mean_token_accuracy": 0.9163840413093567, + "step": 6231 + }, + { + "epoch": 3.116, + "grad_norm": 3.58670227531691, + "learning_rate": 4.348490310592801e-06, + "loss": 0.2167, + "mean_token_accuracy": 0.9269888401031494, + "step": 6232 + }, + { + "epoch": 3.1165, + "grad_norm": 1.755275089195802, + "learning_rate": 4.348196512699114e-06, + "loss": 0.2389, + "mean_token_accuracy": 0.9192531108856201, + "step": 6233 + }, + { + "epoch": 3.117, + "grad_norm": 2.2608528246013218, + "learning_rate": 4.347902658506142e-06, + "loss": 0.2338, + "mean_token_accuracy": 0.9212656617164612, + "step": 6234 + }, + { + "epoch": 3.1175, + "grad_norm": 2.800327374996684, + "learning_rate": 4.347608748022835e-06, + "loss": 0.2164, + "mean_token_accuracy": 0.9283210039138794, + "step": 6235 + }, + { + "epoch": 3.118, + "grad_norm": 2.098991362638605, + "learning_rate": 4.347314781258148e-06, + "loss": 0.1912, + "mean_token_accuracy": 0.9300780892372131, + "step": 6236 + }, + { + "epoch": 3.1185, + "grad_norm": 2.10479515236642, + "learning_rate": 4.3470207582210334e-06, + "loss": 0.212, + "mean_token_accuracy": 0.9175111055374146, + "step": 6237 + }, + { + "epoch": 3.1189999999999998, + "grad_norm": 2.1298796997758425, + "learning_rate": 4.34672667892045e-06, + "loss": 0.1946, + "mean_token_accuracy": 0.9270720481872559, + "step": 6238 + }, + { + "epoch": 3.1195, + "grad_norm": 3.9824353469789746, + "learning_rate": 4.346432543365356e-06, + "loss": 0.2007, + "mean_token_accuracy": 0.9329656958580017, + "step": 6239 + }, + { + "epoch": 3.12, + "grad_norm": 2.0342767097045362, + "learning_rate": 4.346138351564711e-06, + "loss": 0.1653, + "mean_token_accuracy": 0.9389339089393616, + "step": 6240 + }, + { + "epoch": 3.1205, + "grad_norm": 2.587805395267865, + "learning_rate": 4.345844103527474e-06, + "loss": 0.1876, + "mean_token_accuracy": 0.9320563673973083, + "step": 6241 + }, + { + "epoch": 3.121, + "grad_norm": 2.22261685287397, + "learning_rate": 4.345549799262611e-06, + "loss": 0.1982, + "mean_token_accuracy": 0.9265273213386536, + "step": 6242 + }, + { + "epoch": 3.1215, + "grad_norm": 3.2557495018867795, + "learning_rate": 4.3452554387790866e-06, + "loss": 0.2345, + "mean_token_accuracy": 0.9226540923118591, + "step": 6243 + }, + { + "epoch": 3.122, + "grad_norm": 1.9461495063784164, + "learning_rate": 4.344961022085867e-06, + "loss": 0.197, + "mean_token_accuracy": 0.9346227645874023, + "step": 6244 + }, + { + "epoch": 3.1225, + "grad_norm": 7.4634493503678305, + "learning_rate": 4.344666549191921e-06, + "loss": 0.2597, + "mean_token_accuracy": 0.9154177308082581, + "step": 6245 + }, + { + "epoch": 3.123, + "grad_norm": 2.755035814166, + "learning_rate": 4.344372020106219e-06, + "loss": 0.2263, + "mean_token_accuracy": 0.9255746006965637, + "step": 6246 + }, + { + "epoch": 3.1235, + "grad_norm": 1.393793592120683, + "learning_rate": 4.344077434837732e-06, + "loss": 0.2091, + "mean_token_accuracy": 0.9240487813949585, + "step": 6247 + }, + { + "epoch": 3.124, + "grad_norm": 2.442070139925756, + "learning_rate": 4.343782793395435e-06, + "loss": 0.2366, + "mean_token_accuracy": 0.9183421730995178, + "step": 6248 + }, + { + "epoch": 3.1245, + "grad_norm": 1.6489772552537019, + "learning_rate": 4.343488095788302e-06, + "loss": 0.2067, + "mean_token_accuracy": 0.9269295930862427, + "step": 6249 + }, + { + "epoch": 3.125, + "grad_norm": 7.8062221648937715, + "learning_rate": 4.34319334202531e-06, + "loss": 0.2007, + "mean_token_accuracy": 0.9279661178588867, + "step": 6250 + }, + { + "epoch": 3.1255, + "grad_norm": 3.3063723497882846, + "learning_rate": 4.342898532115439e-06, + "loss": 0.18, + "mean_token_accuracy": 0.9387628436088562, + "step": 6251 + }, + { + "epoch": 3.126, + "grad_norm": 2.6335375322706076, + "learning_rate": 4.342603666067669e-06, + "loss": 0.1617, + "mean_token_accuracy": 0.934944212436676, + "step": 6252 + }, + { + "epoch": 3.1265, + "grad_norm": 2.9925691991291425, + "learning_rate": 4.34230874389098e-06, + "loss": 0.273, + "mean_token_accuracy": 0.9049533605575562, + "step": 6253 + }, + { + "epoch": 3.127, + "grad_norm": 5.232522179203388, + "learning_rate": 4.342013765594359e-06, + "loss": 0.1948, + "mean_token_accuracy": 0.9281476736068726, + "step": 6254 + }, + { + "epoch": 3.1275, + "grad_norm": 15.251041454342882, + "learning_rate": 4.341718731186788e-06, + "loss": 0.1522, + "mean_token_accuracy": 0.9442172050476074, + "step": 6255 + }, + { + "epoch": 3.128, + "grad_norm": 2.944596994533526, + "learning_rate": 4.341423640677259e-06, + "loss": 0.167, + "mean_token_accuracy": 0.9398974180221558, + "step": 6256 + }, + { + "epoch": 3.1285, + "grad_norm": 3.1332769804926803, + "learning_rate": 4.341128494074757e-06, + "loss": 0.149, + "mean_token_accuracy": 0.939181923866272, + "step": 6257 + }, + { + "epoch": 3.129, + "grad_norm": 2.151481141236977, + "learning_rate": 4.340833291388274e-06, + "loss": 0.2578, + "mean_token_accuracy": 0.9086187481880188, + "step": 6258 + }, + { + "epoch": 3.1295, + "grad_norm": 1.5414845152606933, + "learning_rate": 4.340538032626802e-06, + "loss": 0.1759, + "mean_token_accuracy": 0.9316287040710449, + "step": 6259 + }, + { + "epoch": 3.13, + "grad_norm": 4.019333195427534, + "learning_rate": 4.340242717799337e-06, + "loss": 0.1847, + "mean_token_accuracy": 0.9285135865211487, + "step": 6260 + }, + { + "epoch": 3.1305, + "grad_norm": 2.2987113077271197, + "learning_rate": 4.339947346914871e-06, + "loss": 0.2393, + "mean_token_accuracy": 0.9180054664611816, + "step": 6261 + }, + { + "epoch": 3.1310000000000002, + "grad_norm": 3.4178111550187427, + "learning_rate": 4.339651919982406e-06, + "loss": 0.1925, + "mean_token_accuracy": 0.9346600770950317, + "step": 6262 + }, + { + "epoch": 3.1315, + "grad_norm": 2.1732977540222436, + "learning_rate": 4.3393564370109375e-06, + "loss": 0.1686, + "mean_token_accuracy": 0.9419458508491516, + "step": 6263 + }, + { + "epoch": 3.132, + "grad_norm": 1.9932946698397855, + "learning_rate": 4.339060898009469e-06, + "loss": 0.1893, + "mean_token_accuracy": 0.9342878460884094, + "step": 6264 + }, + { + "epoch": 3.1325, + "grad_norm": 4.256124104304759, + "learning_rate": 4.338765302987001e-06, + "loss": 0.1905, + "mean_token_accuracy": 0.9342697858810425, + "step": 6265 + }, + { + "epoch": 3.133, + "grad_norm": 3.6779404009295784, + "learning_rate": 4.33846965195254e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.897704005241394, + "step": 6266 + }, + { + "epoch": 3.1335, + "grad_norm": 4.467551830040533, + "learning_rate": 4.338173944915091e-06, + "loss": 0.2565, + "mean_token_accuracy": 0.91839998960495, + "step": 6267 + }, + { + "epoch": 3.134, + "grad_norm": 6.663779362397937, + "learning_rate": 4.337878181883661e-06, + "loss": 0.1986, + "mean_token_accuracy": 0.9262564778327942, + "step": 6268 + }, + { + "epoch": 3.1345, + "grad_norm": 2.4860648253716935, + "learning_rate": 4.33758236286726e-06, + "loss": 0.1944, + "mean_token_accuracy": 0.9236522316932678, + "step": 6269 + }, + { + "epoch": 3.135, + "grad_norm": 3.8182635305203427, + "learning_rate": 4.3372864878749e-06, + "loss": 0.2568, + "mean_token_accuracy": 0.9253065586090088, + "step": 6270 + }, + { + "epoch": 3.1355, + "grad_norm": 2.2481795814637904, + "learning_rate": 4.336990556915594e-06, + "loss": 0.1974, + "mean_token_accuracy": 0.9271723031997681, + "step": 6271 + }, + { + "epoch": 3.136, + "grad_norm": 2.4289294693392494, + "learning_rate": 4.336694569998354e-06, + "loss": 0.1733, + "mean_token_accuracy": 0.934666097164154, + "step": 6272 + }, + { + "epoch": 3.1365, + "grad_norm": 1.916113772862934, + "learning_rate": 4.336398527132198e-06, + "loss": 0.1571, + "mean_token_accuracy": 0.9434511661529541, + "step": 6273 + }, + { + "epoch": 3.137, + "grad_norm": 2.5328691010145423, + "learning_rate": 4.336102428326146e-06, + "loss": 0.2035, + "mean_token_accuracy": 0.9326229095458984, + "step": 6274 + }, + { + "epoch": 3.1375, + "grad_norm": 2.197847285116817, + "learning_rate": 4.335806273589214e-06, + "loss": 0.2128, + "mean_token_accuracy": 0.927190363407135, + "step": 6275 + }, + { + "epoch": 3.138, + "grad_norm": 2.0277809024813274, + "learning_rate": 4.3355100629304256e-06, + "loss": 0.2234, + "mean_token_accuracy": 0.9208539128303528, + "step": 6276 + }, + { + "epoch": 3.1385, + "grad_norm": 2.05657625532122, + "learning_rate": 4.335213796358804e-06, + "loss": 0.3156, + "mean_token_accuracy": 0.8884032368659973, + "step": 6277 + }, + { + "epoch": 3.1390000000000002, + "grad_norm": 2.6060312529495846, + "learning_rate": 4.334917473883373e-06, + "loss": 0.1919, + "mean_token_accuracy": 0.9305782318115234, + "step": 6278 + }, + { + "epoch": 3.1395, + "grad_norm": 2.7419642972319043, + "learning_rate": 4.33462109551316e-06, + "loss": 0.1975, + "mean_token_accuracy": 0.9291291236877441, + "step": 6279 + }, + { + "epoch": 3.14, + "grad_norm": 8.606703086291411, + "learning_rate": 4.334324661257191e-06, + "loss": 0.2317, + "mean_token_accuracy": 0.9188405871391296, + "step": 6280 + }, + { + "epoch": 3.1405, + "grad_norm": 5.061314727745275, + "learning_rate": 4.334028171124499e-06, + "loss": 0.2158, + "mean_token_accuracy": 0.9286784529685974, + "step": 6281 + }, + { + "epoch": 3.141, + "grad_norm": 8.062684116416204, + "learning_rate": 4.333731625124114e-06, + "loss": 0.2904, + "mean_token_accuracy": 0.9042829871177673, + "step": 6282 + }, + { + "epoch": 3.1415, + "grad_norm": 3.081401558293555, + "learning_rate": 4.333435023265069e-06, + "loss": 0.1958, + "mean_token_accuracy": 0.9299774169921875, + "step": 6283 + }, + { + "epoch": 3.142, + "grad_norm": 1.678548912062805, + "learning_rate": 4.333138365556401e-06, + "loss": 0.1697, + "mean_token_accuracy": 0.9451996684074402, + "step": 6284 + }, + { + "epoch": 3.1425, + "grad_norm": 1.7595080526634546, + "learning_rate": 4.332841652007144e-06, + "loss": 0.202, + "mean_token_accuracy": 0.9273720979690552, + "step": 6285 + }, + { + "epoch": 3.143, + "grad_norm": 2.213323590904783, + "learning_rate": 4.332544882626337e-06, + "loss": 0.2091, + "mean_token_accuracy": 0.9199868440628052, + "step": 6286 + }, + { + "epoch": 3.1435, + "grad_norm": 4.917366968922451, + "learning_rate": 4.332248057423022e-06, + "loss": 0.2278, + "mean_token_accuracy": 0.9242383241653442, + "step": 6287 + }, + { + "epoch": 3.144, + "grad_norm": 2.007569491865393, + "learning_rate": 4.33195117640624e-06, + "loss": 0.1922, + "mean_token_accuracy": 0.9260608553886414, + "step": 6288 + }, + { + "epoch": 3.1445, + "grad_norm": 1.682966652838917, + "learning_rate": 4.331654239585032e-06, + "loss": 0.1931, + "mean_token_accuracy": 0.9297875761985779, + "step": 6289 + }, + { + "epoch": 3.145, + "grad_norm": 1.7066697437820124, + "learning_rate": 4.331357246968447e-06, + "loss": 0.1985, + "mean_token_accuracy": 0.9252077341079712, + "step": 6290 + }, + { + "epoch": 3.1455, + "grad_norm": 1.7223648897175667, + "learning_rate": 4.33106019856553e-06, + "loss": 0.1461, + "mean_token_accuracy": 0.9483408331871033, + "step": 6291 + }, + { + "epoch": 3.146, + "grad_norm": 1.9624806040373337, + "learning_rate": 4.33076309438533e-06, + "loss": 0.2498, + "mean_token_accuracy": 0.9103313684463501, + "step": 6292 + }, + { + "epoch": 3.1465, + "grad_norm": 3.8204018616537145, + "learning_rate": 4.330465934436897e-06, + "loss": 0.2114, + "mean_token_accuracy": 0.9299595952033997, + "step": 6293 + }, + { + "epoch": 3.147, + "grad_norm": 2.956519802550122, + "learning_rate": 4.3301687187292825e-06, + "loss": 0.2391, + "mean_token_accuracy": 0.9200756549835205, + "step": 6294 + }, + { + "epoch": 3.1475, + "grad_norm": 4.049247234859052, + "learning_rate": 4.329871447271541e-06, + "loss": 0.2614, + "mean_token_accuracy": 0.9154238104820251, + "step": 6295 + }, + { + "epoch": 3.148, + "grad_norm": 1.7786397647145669, + "learning_rate": 4.329574120072728e-06, + "loss": 0.1746, + "mean_token_accuracy": 0.9346551895141602, + "step": 6296 + }, + { + "epoch": 3.1485, + "grad_norm": 2.3720937215254496, + "learning_rate": 4.329276737141901e-06, + "loss": 0.1888, + "mean_token_accuracy": 0.9371298551559448, + "step": 6297 + }, + { + "epoch": 3.149, + "grad_norm": 1.7343740714131515, + "learning_rate": 4.328979298488118e-06, + "loss": 0.1764, + "mean_token_accuracy": 0.9374629855155945, + "step": 6298 + }, + { + "epoch": 3.1495, + "grad_norm": 1.6045653267346547, + "learning_rate": 4.328681804120438e-06, + "loss": 0.1639, + "mean_token_accuracy": 0.946835458278656, + "step": 6299 + }, + { + "epoch": 3.15, + "grad_norm": 2.005102333774395, + "learning_rate": 4.328384254047927e-06, + "loss": 0.1839, + "mean_token_accuracy": 0.9350302815437317, + "step": 6300 + }, + { + "epoch": 3.1505, + "grad_norm": 1.6485921717261605, + "learning_rate": 4.328086648279645e-06, + "loss": 0.1624, + "mean_token_accuracy": 0.9347866177558899, + "step": 6301 + }, + { + "epoch": 3.151, + "grad_norm": 1.7033638227902717, + "learning_rate": 4.327788986824661e-06, + "loss": 0.2156, + "mean_token_accuracy": 0.9169355034828186, + "step": 6302 + }, + { + "epoch": 3.1515, + "grad_norm": 1.7222576963749552, + "learning_rate": 4.3274912696920395e-06, + "loss": 0.1661, + "mean_token_accuracy": 0.9409114718437195, + "step": 6303 + }, + { + "epoch": 3.152, + "grad_norm": 1.6746429956360127, + "learning_rate": 4.327193496890852e-06, + "loss": 0.174, + "mean_token_accuracy": 0.9323943853378296, + "step": 6304 + }, + { + "epoch": 3.1525, + "grad_norm": 2.1656112872077378, + "learning_rate": 4.326895668430166e-06, + "loss": 0.2767, + "mean_token_accuracy": 0.9112300872802734, + "step": 6305 + }, + { + "epoch": 3.153, + "grad_norm": 3.099120163063038, + "learning_rate": 4.326597784319057e-06, + "loss": 0.2152, + "mean_token_accuracy": 0.9214980006217957, + "step": 6306 + }, + { + "epoch": 3.1535, + "grad_norm": 2.649385424035011, + "learning_rate": 4.326299844566596e-06, + "loss": 0.1997, + "mean_token_accuracy": 0.9354906678199768, + "step": 6307 + }, + { + "epoch": 3.154, + "grad_norm": 1.8949456422393578, + "learning_rate": 4.326001849181862e-06, + "loss": 0.1921, + "mean_token_accuracy": 0.9341692924499512, + "step": 6308 + }, + { + "epoch": 3.1545, + "grad_norm": 1.8086500572029354, + "learning_rate": 4.32570379817393e-06, + "loss": 0.235, + "mean_token_accuracy": 0.9151937365531921, + "step": 6309 + }, + { + "epoch": 3.155, + "grad_norm": 2.550280819503525, + "learning_rate": 4.3254056915518815e-06, + "loss": 0.1978, + "mean_token_accuracy": 0.9303291440010071, + "step": 6310 + }, + { + "epoch": 3.1555, + "grad_norm": 1.8037395326945511, + "learning_rate": 4.325107529324795e-06, + "loss": 0.1619, + "mean_token_accuracy": 0.9464157819747925, + "step": 6311 + }, + { + "epoch": 3.156, + "grad_norm": 3.210485678101212, + "learning_rate": 4.3248093115017544e-06, + "loss": 0.1827, + "mean_token_accuracy": 0.9388557076454163, + "step": 6312 + }, + { + "epoch": 3.1565, + "grad_norm": 1.6627140396598905, + "learning_rate": 4.324511038091843e-06, + "loss": 0.1907, + "mean_token_accuracy": 0.9438908696174622, + "step": 6313 + }, + { + "epoch": 3.157, + "grad_norm": 2.445347098837071, + "learning_rate": 4.324212709104147e-06, + "loss": 0.1991, + "mean_token_accuracy": 0.9314919710159302, + "step": 6314 + }, + { + "epoch": 3.1575, + "grad_norm": 14.112902416521157, + "learning_rate": 4.323914324547755e-06, + "loss": 0.2016, + "mean_token_accuracy": 0.93047034740448, + "step": 6315 + }, + { + "epoch": 3.158, + "grad_norm": 1.9142318221839292, + "learning_rate": 4.323615884431756e-06, + "loss": 0.2223, + "mean_token_accuracy": 0.9229611158370972, + "step": 6316 + }, + { + "epoch": 3.1585, + "grad_norm": 2.1012248086335603, + "learning_rate": 4.323317388765241e-06, + "loss": 0.1548, + "mean_token_accuracy": 0.9451578259468079, + "step": 6317 + }, + { + "epoch": 3.159, + "grad_norm": 4.668054315407228, + "learning_rate": 4.3230188375573e-06, + "loss": 0.2125, + "mean_token_accuracy": 0.9274581670761108, + "step": 6318 + }, + { + "epoch": 3.1595, + "grad_norm": 1.659230773332989, + "learning_rate": 4.322720230817031e-06, + "loss": 0.1992, + "mean_token_accuracy": 0.9248999953269958, + "step": 6319 + }, + { + "epoch": 3.16, + "grad_norm": 3.1464278294983803, + "learning_rate": 4.322421568553529e-06, + "loss": 0.1886, + "mean_token_accuracy": 0.938255250453949, + "step": 6320 + }, + { + "epoch": 3.1605, + "grad_norm": 2.729031714231576, + "learning_rate": 4.322122850775892e-06, + "loss": 0.2073, + "mean_token_accuracy": 0.9215898513793945, + "step": 6321 + }, + { + "epoch": 3.161, + "grad_norm": 5.700790598698756, + "learning_rate": 4.321824077493218e-06, + "loss": 0.1893, + "mean_token_accuracy": 0.9332161545753479, + "step": 6322 + }, + { + "epoch": 3.1615, + "grad_norm": 1.6825921078037338, + "learning_rate": 4.32152524871461e-06, + "loss": 0.2046, + "mean_token_accuracy": 0.9258402585983276, + "step": 6323 + }, + { + "epoch": 3.162, + "grad_norm": 2.0572038672443407, + "learning_rate": 4.3212263644491694e-06, + "loss": 0.2438, + "mean_token_accuracy": 0.921160101890564, + "step": 6324 + }, + { + "epoch": 3.1625, + "grad_norm": 2.233287824520707, + "learning_rate": 4.320927424706001e-06, + "loss": 0.2419, + "mean_token_accuracy": 0.9163200259208679, + "step": 6325 + }, + { + "epoch": 3.163, + "grad_norm": 1.9226712997613677, + "learning_rate": 4.320628429494212e-06, + "loss": 0.2466, + "mean_token_accuracy": 0.9210191369056702, + "step": 6326 + }, + { + "epoch": 3.1635, + "grad_norm": 2.3000992498341923, + "learning_rate": 4.32032937882291e-06, + "loss": 0.2501, + "mean_token_accuracy": 0.9148776531219482, + "step": 6327 + }, + { + "epoch": 3.164, + "grad_norm": 3.235322981732861, + "learning_rate": 4.320030272701203e-06, + "loss": 0.168, + "mean_token_accuracy": 0.9395095109939575, + "step": 6328 + }, + { + "epoch": 3.1645, + "grad_norm": 2.019062828962353, + "learning_rate": 4.319731111138205e-06, + "loss": 0.1794, + "mean_token_accuracy": 0.94199538230896, + "step": 6329 + }, + { + "epoch": 3.165, + "grad_norm": 4.353816957201768, + "learning_rate": 4.319431894143027e-06, + "loss": 0.244, + "mean_token_accuracy": 0.9202089309692383, + "step": 6330 + }, + { + "epoch": 3.1655, + "grad_norm": 1.8017237907835948, + "learning_rate": 4.319132621724784e-06, + "loss": 0.1688, + "mean_token_accuracy": 0.9368826150894165, + "step": 6331 + }, + { + "epoch": 3.166, + "grad_norm": 2.6171443394006824, + "learning_rate": 4.318833293892593e-06, + "loss": 0.1515, + "mean_token_accuracy": 0.9477951526641846, + "step": 6332 + }, + { + "epoch": 3.1665, + "grad_norm": 4.1633185547184235, + "learning_rate": 4.318533910655571e-06, + "loss": 0.2241, + "mean_token_accuracy": 0.9201827645301819, + "step": 6333 + }, + { + "epoch": 3.167, + "grad_norm": 3.4115232748249813, + "learning_rate": 4.318234472022839e-06, + "loss": 0.1843, + "mean_token_accuracy": 0.9365776777267456, + "step": 6334 + }, + { + "epoch": 3.1675, + "grad_norm": 4.704703827195597, + "learning_rate": 4.317934978003517e-06, + "loss": 0.1955, + "mean_token_accuracy": 0.9295186400413513, + "step": 6335 + }, + { + "epoch": 3.168, + "grad_norm": 2.989394136044689, + "learning_rate": 4.31763542860673e-06, + "loss": 0.22, + "mean_token_accuracy": 0.9244786500930786, + "step": 6336 + }, + { + "epoch": 3.1685, + "grad_norm": 4.184311381536853, + "learning_rate": 4.317335823841601e-06, + "loss": 0.2277, + "mean_token_accuracy": 0.917490541934967, + "step": 6337 + }, + { + "epoch": 3.169, + "grad_norm": 4.034615836483941, + "learning_rate": 4.317036163717258e-06, + "loss": 0.1941, + "mean_token_accuracy": 0.934979259967804, + "step": 6338 + }, + { + "epoch": 3.1695, + "grad_norm": 3.3281090069160766, + "learning_rate": 4.316736448242827e-06, + "loss": 0.1629, + "mean_token_accuracy": 0.9370515942573547, + "step": 6339 + }, + { + "epoch": 3.17, + "grad_norm": 1.828553459220956, + "learning_rate": 4.316436677427441e-06, + "loss": 0.1732, + "mean_token_accuracy": 0.9366286396980286, + "step": 6340 + }, + { + "epoch": 3.1705, + "grad_norm": 2.3615300676500626, + "learning_rate": 4.316136851280228e-06, + "loss": 0.1815, + "mean_token_accuracy": 0.932826817035675, + "step": 6341 + }, + { + "epoch": 3.171, + "grad_norm": 3.1651327028546605, + "learning_rate": 4.315836969810323e-06, + "loss": 0.2007, + "mean_token_accuracy": 0.9418195486068726, + "step": 6342 + }, + { + "epoch": 3.1715, + "grad_norm": 1.8066789571971258, + "learning_rate": 4.315537033026862e-06, + "loss": 0.2153, + "mean_token_accuracy": 0.9215555787086487, + "step": 6343 + }, + { + "epoch": 3.172, + "grad_norm": 2.4237427791512887, + "learning_rate": 4.3152370409389795e-06, + "loss": 0.1881, + "mean_token_accuracy": 0.9335585832595825, + "step": 6344 + }, + { + "epoch": 3.1725, + "grad_norm": 4.186201031252586, + "learning_rate": 4.314936993555816e-06, + "loss": 0.2634, + "mean_token_accuracy": 0.9274353981018066, + "step": 6345 + }, + { + "epoch": 3.173, + "grad_norm": 1.8435255456894046, + "learning_rate": 4.31463689088651e-06, + "loss": 0.1958, + "mean_token_accuracy": 0.9268855452537537, + "step": 6346 + }, + { + "epoch": 3.1734999999999998, + "grad_norm": 4.549705857040951, + "learning_rate": 4.3143367329402025e-06, + "loss": 0.2288, + "mean_token_accuracy": 0.9159046411514282, + "step": 6347 + }, + { + "epoch": 3.174, + "grad_norm": 1.858212617031266, + "learning_rate": 4.314036519726038e-06, + "loss": 0.1981, + "mean_token_accuracy": 0.9290845394134521, + "step": 6348 + }, + { + "epoch": 3.1745, + "grad_norm": 1.620942742511744, + "learning_rate": 4.313736251253161e-06, + "loss": 0.1615, + "mean_token_accuracy": 0.9363781809806824, + "step": 6349 + }, + { + "epoch": 3.175, + "grad_norm": 3.1217898929382115, + "learning_rate": 4.313435927530719e-06, + "loss": 0.197, + "mean_token_accuracy": 0.934149444103241, + "step": 6350 + }, + { + "epoch": 3.1755, + "grad_norm": 4.99910586467008, + "learning_rate": 4.31313554856786e-06, + "loss": 0.1735, + "mean_token_accuracy": 0.9371168613433838, + "step": 6351 + }, + { + "epoch": 3.176, + "grad_norm": 2.0919674922455447, + "learning_rate": 4.3128351143737335e-06, + "loss": 0.2252, + "mean_token_accuracy": 0.9309486746788025, + "step": 6352 + }, + { + "epoch": 3.1765, + "grad_norm": 8.267869629522567, + "learning_rate": 4.312534624957492e-06, + "loss": 0.201, + "mean_token_accuracy": 0.9322487711906433, + "step": 6353 + }, + { + "epoch": 3.177, + "grad_norm": 1.7259195950068962, + "learning_rate": 4.312234080328288e-06, + "loss": 0.1846, + "mean_token_accuracy": 0.93309485912323, + "step": 6354 + }, + { + "epoch": 3.1775, + "grad_norm": 2.171036646678788, + "learning_rate": 4.311933480495278e-06, + "loss": 0.1929, + "mean_token_accuracy": 0.9363478422164917, + "step": 6355 + }, + { + "epoch": 3.178, + "grad_norm": 2.893296372910677, + "learning_rate": 4.311632825467617e-06, + "loss": 0.1983, + "mean_token_accuracy": 0.9266486167907715, + "step": 6356 + }, + { + "epoch": 3.1785, + "grad_norm": 2.051952509682118, + "learning_rate": 4.311332115254465e-06, + "loss": 0.2456, + "mean_token_accuracy": 0.9117770791053772, + "step": 6357 + }, + { + "epoch": 3.179, + "grad_norm": 2.4241466763994137, + "learning_rate": 4.3110313498649816e-06, + "loss": 0.1652, + "mean_token_accuracy": 0.938488245010376, + "step": 6358 + }, + { + "epoch": 3.1795, + "grad_norm": 2.523428519018498, + "learning_rate": 4.310730529308328e-06, + "loss": 0.2375, + "mean_token_accuracy": 0.9220756888389587, + "step": 6359 + }, + { + "epoch": 3.18, + "grad_norm": 1.6038307889148036, + "learning_rate": 4.3104296535936695e-06, + "loss": 0.1632, + "mean_token_accuracy": 0.9452683329582214, + "step": 6360 + }, + { + "epoch": 3.1805, + "grad_norm": 3.3255446258765917, + "learning_rate": 4.310128722730169e-06, + "loss": 0.2512, + "mean_token_accuracy": 0.9128702878952026, + "step": 6361 + }, + { + "epoch": 3.181, + "grad_norm": 1.5981113812133654, + "learning_rate": 4.309827736726995e-06, + "loss": 0.1616, + "mean_token_accuracy": 0.9364376664161682, + "step": 6362 + }, + { + "epoch": 3.1814999999999998, + "grad_norm": 2.6494698094124463, + "learning_rate": 4.309526695593316e-06, + "loss": 0.1898, + "mean_token_accuracy": 0.9336465001106262, + "step": 6363 + }, + { + "epoch": 3.182, + "grad_norm": 2.4064559201055844, + "learning_rate": 4.309225599338301e-06, + "loss": 0.1701, + "mean_token_accuracy": 0.9374042749404907, + "step": 6364 + }, + { + "epoch": 3.1825, + "grad_norm": 1.9043984333316195, + "learning_rate": 4.308924447971123e-06, + "loss": 0.2035, + "mean_token_accuracy": 0.9272916913032532, + "step": 6365 + }, + { + "epoch": 3.183, + "grad_norm": 1.9342943817095073, + "learning_rate": 4.308623241500957e-06, + "loss": 0.154, + "mean_token_accuracy": 0.9439734816551208, + "step": 6366 + }, + { + "epoch": 3.1835, + "grad_norm": 1.8424762699732773, + "learning_rate": 4.308321979936974e-06, + "loss": 0.1558, + "mean_token_accuracy": 0.93918776512146, + "step": 6367 + }, + { + "epoch": 3.184, + "grad_norm": 1.953235134783542, + "learning_rate": 4.308020663288356e-06, + "loss": 0.1767, + "mean_token_accuracy": 0.938321053981781, + "step": 6368 + }, + { + "epoch": 3.1845, + "grad_norm": 2.2999841887513814, + "learning_rate": 4.307719291564277e-06, + "loss": 0.1968, + "mean_token_accuracy": 0.9303671717643738, + "step": 6369 + }, + { + "epoch": 3.185, + "grad_norm": 2.959734159632197, + "learning_rate": 4.3074178647739205e-06, + "loss": 0.1785, + "mean_token_accuracy": 0.9346471428871155, + "step": 6370 + }, + { + "epoch": 3.1855, + "grad_norm": 2.3296585211022114, + "learning_rate": 4.307116382926468e-06, + "loss": 0.1716, + "mean_token_accuracy": 0.9370653033256531, + "step": 6371 + }, + { + "epoch": 3.186, + "grad_norm": 2.239818277687456, + "learning_rate": 4.306814846031102e-06, + "loss": 0.1596, + "mean_token_accuracy": 0.9473280906677246, + "step": 6372 + }, + { + "epoch": 3.1865, + "grad_norm": 3.0041637755032236, + "learning_rate": 4.306513254097009e-06, + "loss": 0.2311, + "mean_token_accuracy": 0.920844316482544, + "step": 6373 + }, + { + "epoch": 3.187, + "grad_norm": 1.553612754088027, + "learning_rate": 4.3062116071333745e-06, + "loss": 0.1837, + "mean_token_accuracy": 0.9290518760681152, + "step": 6374 + }, + { + "epoch": 3.1875, + "grad_norm": 2.9550242859182316, + "learning_rate": 4.305909905149389e-06, + "loss": 0.2, + "mean_token_accuracy": 0.9295727610588074, + "step": 6375 + }, + { + "epoch": 3.188, + "grad_norm": 2.8342585875940314, + "learning_rate": 4.305608148154242e-06, + "loss": 0.2048, + "mean_token_accuracy": 0.9329186081886292, + "step": 6376 + }, + { + "epoch": 3.1885, + "grad_norm": 2.5725650811975145, + "learning_rate": 4.305306336157126e-06, + "loss": 0.2386, + "mean_token_accuracy": 0.9129836559295654, + "step": 6377 + }, + { + "epoch": 3.189, + "grad_norm": 9.91258822924674, + "learning_rate": 4.305004469167233e-06, + "loss": 0.1655, + "mean_token_accuracy": 0.9390162825584412, + "step": 6378 + }, + { + "epoch": 3.1895, + "grad_norm": 2.450332686636952, + "learning_rate": 4.304702547193762e-06, + "loss": 0.2383, + "mean_token_accuracy": 0.916339635848999, + "step": 6379 + }, + { + "epoch": 3.19, + "grad_norm": 3.241961194615983, + "learning_rate": 4.3044005702459055e-06, + "loss": 0.1868, + "mean_token_accuracy": 0.9349862337112427, + "step": 6380 + }, + { + "epoch": 3.1905, + "grad_norm": 2.0683972362730576, + "learning_rate": 4.304098538332866e-06, + "loss": 0.1901, + "mean_token_accuracy": 0.9354585409164429, + "step": 6381 + }, + { + "epoch": 3.191, + "grad_norm": 1.6308023008855639, + "learning_rate": 4.303796451463842e-06, + "loss": 0.1619, + "mean_token_accuracy": 0.9357541799545288, + "step": 6382 + }, + { + "epoch": 3.1915, + "grad_norm": 1.6033956824480362, + "learning_rate": 4.303494309648036e-06, + "loss": 0.1677, + "mean_token_accuracy": 0.933976411819458, + "step": 6383 + }, + { + "epoch": 3.192, + "grad_norm": 2.5604010654160994, + "learning_rate": 4.303192112894652e-06, + "loss": 0.1974, + "mean_token_accuracy": 0.9297147989273071, + "step": 6384 + }, + { + "epoch": 3.1925, + "grad_norm": 5.535953784700805, + "learning_rate": 4.302889861212894e-06, + "loss": 0.1827, + "mean_token_accuracy": 0.9375907182693481, + "step": 6385 + }, + { + "epoch": 3.193, + "grad_norm": 2.363265913106454, + "learning_rate": 4.3025875546119725e-06, + "loss": 0.2299, + "mean_token_accuracy": 0.9205964803695679, + "step": 6386 + }, + { + "epoch": 3.1935000000000002, + "grad_norm": 2.545339965009577, + "learning_rate": 4.302285193101093e-06, + "loss": 0.2376, + "mean_token_accuracy": 0.9205332398414612, + "step": 6387 + }, + { + "epoch": 3.194, + "grad_norm": 2.105901000223715, + "learning_rate": 4.301982776689467e-06, + "loss": 0.1955, + "mean_token_accuracy": 0.9266381859779358, + "step": 6388 + }, + { + "epoch": 3.1945, + "grad_norm": 2.387838612557451, + "learning_rate": 4.301680305386306e-06, + "loss": 0.2058, + "mean_token_accuracy": 0.9246575236320496, + "step": 6389 + }, + { + "epoch": 3.195, + "grad_norm": 18.262100042044562, + "learning_rate": 4.301377779200826e-06, + "loss": 0.1558, + "mean_token_accuracy": 0.941644549369812, + "step": 6390 + }, + { + "epoch": 3.1955, + "grad_norm": 2.4086422978405833, + "learning_rate": 4.301075198142241e-06, + "loss": 0.2717, + "mean_token_accuracy": 0.9092816114425659, + "step": 6391 + }, + { + "epoch": 3.196, + "grad_norm": 1.599793096435502, + "learning_rate": 4.3007725622197675e-06, + "loss": 0.1557, + "mean_token_accuracy": 0.9375497102737427, + "step": 6392 + }, + { + "epoch": 3.1965, + "grad_norm": 5.515735800474377, + "learning_rate": 4.300469871442625e-06, + "loss": 0.1669, + "mean_token_accuracy": 0.9426429271697998, + "step": 6393 + }, + { + "epoch": 3.197, + "grad_norm": 4.250223371196286, + "learning_rate": 4.300167125820035e-06, + "loss": 0.1995, + "mean_token_accuracy": 0.9262223839759827, + "step": 6394 + }, + { + "epoch": 3.1975, + "grad_norm": 6.597882522844507, + "learning_rate": 4.299864325361217e-06, + "loss": 0.1795, + "mean_token_accuracy": 0.9321011900901794, + "step": 6395 + }, + { + "epoch": 3.198, + "grad_norm": 1.8995373515382483, + "learning_rate": 4.2995614700753975e-06, + "loss": 0.205, + "mean_token_accuracy": 0.9238387942314148, + "step": 6396 + }, + { + "epoch": 3.1985, + "grad_norm": 3.0460028485320767, + "learning_rate": 4.299258559971801e-06, + "loss": 0.2412, + "mean_token_accuracy": 0.9174855947494507, + "step": 6397 + }, + { + "epoch": 3.199, + "grad_norm": 2.0100406319816218, + "learning_rate": 4.298955595059654e-06, + "loss": 0.1976, + "mean_token_accuracy": 0.9246362447738647, + "step": 6398 + }, + { + "epoch": 3.1995, + "grad_norm": 1.7624176994545908, + "learning_rate": 4.298652575348187e-06, + "loss": 0.1838, + "mean_token_accuracy": 0.9323610067367554, + "step": 6399 + }, + { + "epoch": 3.2, + "grad_norm": 2.546037765786107, + "learning_rate": 4.2983495008466285e-06, + "loss": 0.2329, + "mean_token_accuracy": 0.924367368221283, + "step": 6400 + }, + { + "epoch": 3.2005, + "grad_norm": 3.882100284455504, + "learning_rate": 4.298046371564212e-06, + "loss": 0.2647, + "mean_token_accuracy": 0.9123838543891907, + "step": 6401 + }, + { + "epoch": 3.201, + "grad_norm": 1.9623205624841706, + "learning_rate": 4.29774318751017e-06, + "loss": 0.1804, + "mean_token_accuracy": 0.9345249533653259, + "step": 6402 + }, + { + "epoch": 3.2015000000000002, + "grad_norm": 3.6434180217444676, + "learning_rate": 4.2974399486937405e-06, + "loss": 0.2534, + "mean_token_accuracy": 0.9145840406417847, + "step": 6403 + }, + { + "epoch": 3.202, + "grad_norm": 3.3029207250774117, + "learning_rate": 4.2971366551241585e-06, + "loss": 0.2626, + "mean_token_accuracy": 0.9107595086097717, + "step": 6404 + }, + { + "epoch": 3.2025, + "grad_norm": 2.307315115986738, + "learning_rate": 4.2968333068106635e-06, + "loss": 0.2536, + "mean_token_accuracy": 0.9219520092010498, + "step": 6405 + }, + { + "epoch": 3.203, + "grad_norm": 3.6254963407463534, + "learning_rate": 4.2965299037624965e-06, + "loss": 0.2361, + "mean_token_accuracy": 0.9229750037193298, + "step": 6406 + }, + { + "epoch": 3.2035, + "grad_norm": 2.345880759754301, + "learning_rate": 4.296226445988899e-06, + "loss": 0.2197, + "mean_token_accuracy": 0.9194928407669067, + "step": 6407 + }, + { + "epoch": 3.204, + "grad_norm": 1.9085904889605296, + "learning_rate": 4.295922933499116e-06, + "loss": 0.1885, + "mean_token_accuracy": 0.932268500328064, + "step": 6408 + }, + { + "epoch": 3.2045, + "grad_norm": 2.4484489950140027, + "learning_rate": 4.295619366302391e-06, + "loss": 0.1761, + "mean_token_accuracy": 0.9335781335830688, + "step": 6409 + }, + { + "epoch": 3.205, + "grad_norm": 2.1640705523556276, + "learning_rate": 4.295315744407972e-06, + "loss": 0.2111, + "mean_token_accuracy": 0.9245370626449585, + "step": 6410 + }, + { + "epoch": 3.2055, + "grad_norm": 2.5387773851165334, + "learning_rate": 4.295012067825109e-06, + "loss": 0.2675, + "mean_token_accuracy": 0.9067296385765076, + "step": 6411 + }, + { + "epoch": 3.206, + "grad_norm": 1.9209776621106731, + "learning_rate": 4.294708336563052e-06, + "loss": 0.1731, + "mean_token_accuracy": 0.9352982044219971, + "step": 6412 + }, + { + "epoch": 3.2065, + "grad_norm": 2.9267199098509464, + "learning_rate": 4.294404550631052e-06, + "loss": 0.1993, + "mean_token_accuracy": 0.9295234680175781, + "step": 6413 + }, + { + "epoch": 3.207, + "grad_norm": 2.158524622579609, + "learning_rate": 4.294100710038363e-06, + "loss": 0.1973, + "mean_token_accuracy": 0.9238696098327637, + "step": 6414 + }, + { + "epoch": 3.2075, + "grad_norm": 1.8671589378273716, + "learning_rate": 4.293796814794243e-06, + "loss": 0.2207, + "mean_token_accuracy": 0.920804500579834, + "step": 6415 + }, + { + "epoch": 3.208, + "grad_norm": 1.9831068538878829, + "learning_rate": 4.293492864907947e-06, + "loss": 0.2064, + "mean_token_accuracy": 0.9282572269439697, + "step": 6416 + }, + { + "epoch": 3.2085, + "grad_norm": 2.2483322087054836, + "learning_rate": 4.2931888603887336e-06, + "loss": 0.1617, + "mean_token_accuracy": 0.94096440076828, + "step": 6417 + }, + { + "epoch": 3.209, + "grad_norm": 3.5016544573415826, + "learning_rate": 4.292884801245864e-06, + "loss": 0.181, + "mean_token_accuracy": 0.9337048530578613, + "step": 6418 + }, + { + "epoch": 3.2095, + "grad_norm": 2.211928181201381, + "learning_rate": 4.292580687488601e-06, + "loss": 0.2038, + "mean_token_accuracy": 0.9263970851898193, + "step": 6419 + }, + { + "epoch": 3.21, + "grad_norm": 3.192587722440528, + "learning_rate": 4.2922765191262075e-06, + "loss": 0.1967, + "mean_token_accuracy": 0.9352054595947266, + "step": 6420 + }, + { + "epoch": 3.2105, + "grad_norm": 3.33812055407923, + "learning_rate": 4.291972296167949e-06, + "loss": 0.2681, + "mean_token_accuracy": 0.9120718240737915, + "step": 6421 + }, + { + "epoch": 3.211, + "grad_norm": 5.474919332277673, + "learning_rate": 4.291668018623093e-06, + "loss": 0.2392, + "mean_token_accuracy": 0.9092111587524414, + "step": 6422 + }, + { + "epoch": 3.2115, + "grad_norm": 2.4962158626968045, + "learning_rate": 4.291363686500908e-06, + "loss": 0.2125, + "mean_token_accuracy": 0.9228209257125854, + "step": 6423 + }, + { + "epoch": 3.212, + "grad_norm": 2.416632804559087, + "learning_rate": 4.291059299810665e-06, + "loss": 0.1653, + "mean_token_accuracy": 0.9404371380805969, + "step": 6424 + }, + { + "epoch": 3.2125, + "grad_norm": 2.8218082020062947, + "learning_rate": 4.290754858561636e-06, + "loss": 0.1569, + "mean_token_accuracy": 0.9391257166862488, + "step": 6425 + }, + { + "epoch": 3.213, + "grad_norm": 2.737553972850566, + "learning_rate": 4.2904503627630945e-06, + "loss": 0.2185, + "mean_token_accuracy": 0.9220216870307922, + "step": 6426 + }, + { + "epoch": 3.2135, + "grad_norm": 2.217045649628293, + "learning_rate": 4.2901458124243165e-06, + "loss": 0.2409, + "mean_token_accuracy": 0.9158477783203125, + "step": 6427 + }, + { + "epoch": 3.214, + "grad_norm": 2.766697824376003, + "learning_rate": 4.289841207554578e-06, + "loss": 0.2539, + "mean_token_accuracy": 0.9152119755744934, + "step": 6428 + }, + { + "epoch": 3.2145, + "grad_norm": 6.290282841795726, + "learning_rate": 4.289536548163159e-06, + "loss": 0.1671, + "mean_token_accuracy": 0.9363296031951904, + "step": 6429 + }, + { + "epoch": 3.215, + "grad_norm": 1.9077483235761428, + "learning_rate": 4.28923183425934e-06, + "loss": 0.2214, + "mean_token_accuracy": 0.9256002306938171, + "step": 6430 + }, + { + "epoch": 3.2155, + "grad_norm": 3.909251149460624, + "learning_rate": 4.288927065852402e-06, + "loss": 0.2269, + "mean_token_accuracy": 0.9229583740234375, + "step": 6431 + }, + { + "epoch": 3.216, + "grad_norm": 2.5850847962638785, + "learning_rate": 4.28862224295163e-06, + "loss": 0.2114, + "mean_token_accuracy": 0.9177814722061157, + "step": 6432 + }, + { + "epoch": 3.2165, + "grad_norm": 1.9296685700373628, + "learning_rate": 4.288317365566309e-06, + "loss": 0.1743, + "mean_token_accuracy": 0.9355274438858032, + "step": 6433 + }, + { + "epoch": 3.217, + "grad_norm": 3.5674397410118583, + "learning_rate": 4.288012433705726e-06, + "loss": 0.213, + "mean_token_accuracy": 0.9311224222183228, + "step": 6434 + }, + { + "epoch": 3.2175, + "grad_norm": 1.97870451616266, + "learning_rate": 4.287707447379169e-06, + "loss": 0.1865, + "mean_token_accuracy": 0.9352162480354309, + "step": 6435 + }, + { + "epoch": 3.218, + "grad_norm": 1.6750928007614394, + "learning_rate": 4.2874024065959295e-06, + "loss": 0.1728, + "mean_token_accuracy": 0.9389671087265015, + "step": 6436 + }, + { + "epoch": 3.2185, + "grad_norm": 2.675796364592958, + "learning_rate": 4.287097311365299e-06, + "loss": 0.1861, + "mean_token_accuracy": 0.9311976432800293, + "step": 6437 + }, + { + "epoch": 3.219, + "grad_norm": 1.6357857523200325, + "learning_rate": 4.286792161696571e-06, + "loss": 0.1691, + "mean_token_accuracy": 0.9486734867095947, + "step": 6438 + }, + { + "epoch": 3.2195, + "grad_norm": 2.50315030827326, + "learning_rate": 4.286486957599042e-06, + "loss": 0.1816, + "mean_token_accuracy": 0.9321534037590027, + "step": 6439 + }, + { + "epoch": 3.22, + "grad_norm": 1.8867643605038367, + "learning_rate": 4.286181699082008e-06, + "loss": 0.2083, + "mean_token_accuracy": 0.9317243695259094, + "step": 6440 + }, + { + "epoch": 3.2205, + "grad_norm": 3.78345969754228, + "learning_rate": 4.2858763861547694e-06, + "loss": 0.2203, + "mean_token_accuracy": 0.9248026609420776, + "step": 6441 + }, + { + "epoch": 3.221, + "grad_norm": 2.2048900227539354, + "learning_rate": 4.285571018826624e-06, + "loss": 0.1972, + "mean_token_accuracy": 0.9326379895210266, + "step": 6442 + }, + { + "epoch": 3.2215, + "grad_norm": 2.051211217583691, + "learning_rate": 4.285265597106876e-06, + "loss": 0.1903, + "mean_token_accuracy": 0.9364191293716431, + "step": 6443 + }, + { + "epoch": 3.222, + "grad_norm": 3.069426664992698, + "learning_rate": 4.284960121004827e-06, + "loss": 0.2189, + "mean_token_accuracy": 0.9179028868675232, + "step": 6444 + }, + { + "epoch": 3.2225, + "grad_norm": 2.2668398765343523, + "learning_rate": 4.284654590529784e-06, + "loss": 0.1994, + "mean_token_accuracy": 0.9239026308059692, + "step": 6445 + }, + { + "epoch": 3.223, + "grad_norm": 2.150045879386074, + "learning_rate": 4.284349005691054e-06, + "loss": 0.2245, + "mean_token_accuracy": 0.9169397354125977, + "step": 6446 + }, + { + "epoch": 3.2235, + "grad_norm": 3.7991434990271347, + "learning_rate": 4.284043366497944e-06, + "loss": 0.2585, + "mean_token_accuracy": 0.9158512949943542, + "step": 6447 + }, + { + "epoch": 3.224, + "grad_norm": 6.528612911551282, + "learning_rate": 4.283737672959766e-06, + "loss": 0.1392, + "mean_token_accuracy": 0.9516733884811401, + "step": 6448 + }, + { + "epoch": 3.2245, + "grad_norm": 5.805346281174081, + "learning_rate": 4.2834319250858316e-06, + "loss": 0.2505, + "mean_token_accuracy": 0.9143498539924622, + "step": 6449 + }, + { + "epoch": 3.225, + "grad_norm": 1.8749063082879105, + "learning_rate": 4.283126122885455e-06, + "loss": 0.163, + "mean_token_accuracy": 0.9379974603652954, + "step": 6450 + }, + { + "epoch": 3.2255, + "grad_norm": 7.24848049149058, + "learning_rate": 4.282820266367949e-06, + "loss": 0.1897, + "mean_token_accuracy": 0.9341344833374023, + "step": 6451 + }, + { + "epoch": 3.226, + "grad_norm": 3.922474774102316, + "learning_rate": 4.282514355542633e-06, + "loss": 0.2345, + "mean_token_accuracy": 0.9198805093765259, + "step": 6452 + }, + { + "epoch": 3.2265, + "grad_norm": 2.770240788942595, + "learning_rate": 4.282208390418825e-06, + "loss": 0.2329, + "mean_token_accuracy": 0.9249324202537537, + "step": 6453 + }, + { + "epoch": 3.227, + "grad_norm": 3.09727237850979, + "learning_rate": 4.281902371005844e-06, + "loss": 0.2054, + "mean_token_accuracy": 0.9282256364822388, + "step": 6454 + }, + { + "epoch": 3.2275, + "grad_norm": 1.775107214826804, + "learning_rate": 4.281596297313014e-06, + "loss": 0.1968, + "mean_token_accuracy": 0.9301592111587524, + "step": 6455 + }, + { + "epoch": 3.228, + "grad_norm": 2.494558899978514, + "learning_rate": 4.281290169349656e-06, + "loss": 0.1779, + "mean_token_accuracy": 0.9342235326766968, + "step": 6456 + }, + { + "epoch": 3.2285, + "grad_norm": 3.7326221958018895, + "learning_rate": 4.280983987125099e-06, + "loss": 0.2273, + "mean_token_accuracy": 0.9145330786705017, + "step": 6457 + }, + { + "epoch": 3.229, + "grad_norm": 2.335655849853285, + "learning_rate": 4.280677750648665e-06, + "loss": 0.1862, + "mean_token_accuracy": 0.931359589099884, + "step": 6458 + }, + { + "epoch": 3.2295, + "grad_norm": 2.0701287688085968, + "learning_rate": 4.280371459929686e-06, + "loss": 0.219, + "mean_token_accuracy": 0.9294238090515137, + "step": 6459 + }, + { + "epoch": 3.23, + "grad_norm": 3.9137777329344603, + "learning_rate": 4.280065114977492e-06, + "loss": 0.1725, + "mean_token_accuracy": 0.9375689625740051, + "step": 6460 + }, + { + "epoch": 3.2305, + "grad_norm": 2.311170618956712, + "learning_rate": 4.279758715801413e-06, + "loss": 0.1889, + "mean_token_accuracy": 0.9369765520095825, + "step": 6461 + }, + { + "epoch": 3.231, + "grad_norm": 1.7704832572572324, + "learning_rate": 4.279452262410782e-06, + "loss": 0.2243, + "mean_token_accuracy": 0.9189473390579224, + "step": 6462 + }, + { + "epoch": 3.2315, + "grad_norm": 3.989115282339367, + "learning_rate": 4.279145754814938e-06, + "loss": 0.2142, + "mean_token_accuracy": 0.9242026209831238, + "step": 6463 + }, + { + "epoch": 3.232, + "grad_norm": 2.0095332145126994, + "learning_rate": 4.278839193023214e-06, + "loss": 0.2144, + "mean_token_accuracy": 0.9254195094108582, + "step": 6464 + }, + { + "epoch": 3.2325, + "grad_norm": 2.110144846267875, + "learning_rate": 4.278532577044949e-06, + "loss": 0.2493, + "mean_token_accuracy": 0.9251852631568909, + "step": 6465 + }, + { + "epoch": 3.233, + "grad_norm": 2.1914906944276327, + "learning_rate": 4.278225906889485e-06, + "loss": 0.2267, + "mean_token_accuracy": 0.9189459681510925, + "step": 6466 + }, + { + "epoch": 3.2335, + "grad_norm": 6.41412608746305, + "learning_rate": 4.2779191825661616e-06, + "loss": 0.225, + "mean_token_accuracy": 0.9209814071655273, + "step": 6467 + }, + { + "epoch": 3.234, + "grad_norm": 4.600408841377667, + "learning_rate": 4.277612404084322e-06, + "loss": 0.1892, + "mean_token_accuracy": 0.9309407472610474, + "step": 6468 + }, + { + "epoch": 3.2345, + "grad_norm": 50.95905566415759, + "learning_rate": 4.277305571453314e-06, + "loss": 0.2753, + "mean_token_accuracy": 0.9133036732673645, + "step": 6469 + }, + { + "epoch": 3.235, + "grad_norm": 2.0151450954004813, + "learning_rate": 4.276998684682482e-06, + "loss": 0.1967, + "mean_token_accuracy": 0.9271804690361023, + "step": 6470 + }, + { + "epoch": 3.2355, + "grad_norm": 1.9304000274211544, + "learning_rate": 4.276691743781174e-06, + "loss": 0.2067, + "mean_token_accuracy": 0.9302689433097839, + "step": 6471 + }, + { + "epoch": 3.2359999999999998, + "grad_norm": 2.247529390129813, + "learning_rate": 4.27638474875874e-06, + "loss": 0.2131, + "mean_token_accuracy": 0.9252017140388489, + "step": 6472 + }, + { + "epoch": 3.2365, + "grad_norm": 1.570957711949479, + "learning_rate": 4.276077699624534e-06, + "loss": 0.1582, + "mean_token_accuracy": 0.940247118473053, + "step": 6473 + }, + { + "epoch": 3.237, + "grad_norm": 3.339756945404924, + "learning_rate": 4.275770596387907e-06, + "loss": 0.2039, + "mean_token_accuracy": 0.9270913004875183, + "step": 6474 + }, + { + "epoch": 3.2375, + "grad_norm": 2.1281603567619105, + "learning_rate": 4.275463439058214e-06, + "loss": 0.1851, + "mean_token_accuracy": 0.9279189705848694, + "step": 6475 + }, + { + "epoch": 3.238, + "grad_norm": 2.897238586916277, + "learning_rate": 4.275156227644812e-06, + "loss": 0.2001, + "mean_token_accuracy": 0.9329627752304077, + "step": 6476 + }, + { + "epoch": 3.2385, + "grad_norm": 1.971116124114435, + "learning_rate": 4.274848962157059e-06, + "loss": 0.1738, + "mean_token_accuracy": 0.9345999956130981, + "step": 6477 + }, + { + "epoch": 3.239, + "grad_norm": 2.4044699764556112, + "learning_rate": 4.274541642604316e-06, + "loss": 0.1954, + "mean_token_accuracy": 0.9356780052185059, + "step": 6478 + }, + { + "epoch": 3.2395, + "grad_norm": 1.8238669122186972, + "learning_rate": 4.274234268995943e-06, + "loss": 0.1957, + "mean_token_accuracy": 0.9245187640190125, + "step": 6479 + }, + { + "epoch": 3.24, + "grad_norm": 3.0035680069505966, + "learning_rate": 4.273926841341303e-06, + "loss": 0.196, + "mean_token_accuracy": 0.9370158314704895, + "step": 6480 + }, + { + "epoch": 3.2405, + "grad_norm": 1.7787362542721417, + "learning_rate": 4.273619359649762e-06, + "loss": 0.1867, + "mean_token_accuracy": 0.9275103211402893, + "step": 6481 + }, + { + "epoch": 3.241, + "grad_norm": 1.6563920864614547, + "learning_rate": 4.273311823930685e-06, + "loss": 0.1568, + "mean_token_accuracy": 0.9399382472038269, + "step": 6482 + }, + { + "epoch": 3.2415, + "grad_norm": 3.057019296088583, + "learning_rate": 4.273004234193442e-06, + "loss": 0.1683, + "mean_token_accuracy": 0.9409449100494385, + "step": 6483 + }, + { + "epoch": 3.242, + "grad_norm": 2.909232656750166, + "learning_rate": 4.2726965904474006e-06, + "loss": 0.19, + "mean_token_accuracy": 0.9315840601921082, + "step": 6484 + }, + { + "epoch": 3.2425, + "grad_norm": 3.10264223142749, + "learning_rate": 4.272388892701934e-06, + "loss": 0.1918, + "mean_token_accuracy": 0.935115396976471, + "step": 6485 + }, + { + "epoch": 3.243, + "grad_norm": 1.7991321899340478, + "learning_rate": 4.2720811409664145e-06, + "loss": 0.1788, + "mean_token_accuracy": 0.9323850274085999, + "step": 6486 + }, + { + "epoch": 3.2435, + "grad_norm": 1.754256979343627, + "learning_rate": 4.271773335250216e-06, + "loss": 0.1845, + "mean_token_accuracy": 0.9295093417167664, + "step": 6487 + }, + { + "epoch": 3.2439999999999998, + "grad_norm": 4.951765924694309, + "learning_rate": 4.271465475562716e-06, + "loss": 0.2043, + "mean_token_accuracy": 0.932949423789978, + "step": 6488 + }, + { + "epoch": 3.2445, + "grad_norm": 2.5351938975187127, + "learning_rate": 4.271157561913292e-06, + "loss": 0.2104, + "mean_token_accuracy": 0.9237575531005859, + "step": 6489 + }, + { + "epoch": 3.245, + "grad_norm": 2.5186649954369584, + "learning_rate": 4.270849594311323e-06, + "loss": 0.2004, + "mean_token_accuracy": 0.9250710606575012, + "step": 6490 + }, + { + "epoch": 3.2455, + "grad_norm": 2.5836013747376363, + "learning_rate": 4.27054157276619e-06, + "loss": 0.1547, + "mean_token_accuracy": 0.9414157867431641, + "step": 6491 + }, + { + "epoch": 3.246, + "grad_norm": 1.8052354343195511, + "learning_rate": 4.270233497287278e-06, + "loss": 0.2326, + "mean_token_accuracy": 0.9162943363189697, + "step": 6492 + }, + { + "epoch": 3.2465, + "grad_norm": 2.7724523376167567, + "learning_rate": 4.269925367883969e-06, + "loss": 0.1667, + "mean_token_accuracy": 0.9357621073722839, + "step": 6493 + }, + { + "epoch": 3.247, + "grad_norm": 1.875810339634343, + "learning_rate": 4.2696171845656505e-06, + "loss": 0.2132, + "mean_token_accuracy": 0.9298511743545532, + "step": 6494 + }, + { + "epoch": 3.2475, + "grad_norm": 2.661004789857095, + "learning_rate": 4.269308947341711e-06, + "loss": 0.1703, + "mean_token_accuracy": 0.9366208910942078, + "step": 6495 + }, + { + "epoch": 3.248, + "grad_norm": 3.0986020717459994, + "learning_rate": 4.269000656221539e-06, + "loss": 0.2116, + "mean_token_accuracy": 0.9190012216567993, + "step": 6496 + }, + { + "epoch": 3.2485, + "grad_norm": 2.1742382877151263, + "learning_rate": 4.268692311214525e-06, + "loss": 0.2477, + "mean_token_accuracy": 0.9169549942016602, + "step": 6497 + }, + { + "epoch": 3.249, + "grad_norm": 1.5158095127430014, + "learning_rate": 4.268383912330062e-06, + "loss": 0.1556, + "mean_token_accuracy": 0.9400498270988464, + "step": 6498 + }, + { + "epoch": 3.2495, + "grad_norm": 1.9683163310402214, + "learning_rate": 4.268075459577544e-06, + "loss": 0.2375, + "mean_token_accuracy": 0.9172943830490112, + "step": 6499 + }, + { + "epoch": 3.25, + "grad_norm": 1.6193221763168364, + "learning_rate": 4.267766952966369e-06, + "loss": 0.1914, + "mean_token_accuracy": 0.9353905320167542, + "step": 6500 + }, + { + "epoch": 3.2505, + "grad_norm": 2.65797687185667, + "learning_rate": 4.267458392505933e-06, + "loss": 0.2029, + "mean_token_accuracy": 0.9252418875694275, + "step": 6501 + }, + { + "epoch": 3.251, + "grad_norm": 1.881687560521928, + "learning_rate": 4.267149778205636e-06, + "loss": 0.1627, + "mean_token_accuracy": 0.9445812702178955, + "step": 6502 + }, + { + "epoch": 3.2515, + "grad_norm": 4.267335357743733, + "learning_rate": 4.266841110074878e-06, + "loss": 0.1731, + "mean_token_accuracy": 0.9351010918617249, + "step": 6503 + }, + { + "epoch": 3.252, + "grad_norm": 2.1739394449403897, + "learning_rate": 4.266532388123063e-06, + "loss": 0.2333, + "mean_token_accuracy": 0.9206165075302124, + "step": 6504 + }, + { + "epoch": 3.2525, + "grad_norm": 2.1123634302856846, + "learning_rate": 4.266223612359593e-06, + "loss": 0.1883, + "mean_token_accuracy": 0.9326110482215881, + "step": 6505 + }, + { + "epoch": 3.253, + "grad_norm": 2.2562732978399023, + "learning_rate": 4.2659147827938754e-06, + "loss": 0.1818, + "mean_token_accuracy": 0.9304715991020203, + "step": 6506 + }, + { + "epoch": 3.2535, + "grad_norm": 3.7571104205560464, + "learning_rate": 4.265605899435318e-06, + "loss": 0.1913, + "mean_token_accuracy": 0.9283581972122192, + "step": 6507 + }, + { + "epoch": 3.254, + "grad_norm": 1.925663878781545, + "learning_rate": 4.2652969622933295e-06, + "loss": 0.1385, + "mean_token_accuracy": 0.9447125196456909, + "step": 6508 + }, + { + "epoch": 3.2545, + "grad_norm": 2.531916102683361, + "learning_rate": 4.2649879713773205e-06, + "loss": 0.2163, + "mean_token_accuracy": 0.9285234808921814, + "step": 6509 + }, + { + "epoch": 3.255, + "grad_norm": 1.8133884795905282, + "learning_rate": 4.264678926696703e-06, + "loss": 0.2135, + "mean_token_accuracy": 0.9229283928871155, + "step": 6510 + }, + { + "epoch": 3.2555, + "grad_norm": 2.860974567120361, + "learning_rate": 4.264369828260892e-06, + "loss": 0.1889, + "mean_token_accuracy": 0.938011884689331, + "step": 6511 + }, + { + "epoch": 3.2560000000000002, + "grad_norm": 1.9750973964095815, + "learning_rate": 4.264060676079302e-06, + "loss": 0.2005, + "mean_token_accuracy": 0.9252740144729614, + "step": 6512 + }, + { + "epoch": 3.2565, + "grad_norm": 3.2183609654921495, + "learning_rate": 4.263751470161351e-06, + "loss": 0.1434, + "mean_token_accuracy": 0.9465464353561401, + "step": 6513 + }, + { + "epoch": 3.257, + "grad_norm": 2.220357510995326, + "learning_rate": 4.263442210516458e-06, + "loss": 0.2706, + "mean_token_accuracy": 0.9104894995689392, + "step": 6514 + }, + { + "epoch": 3.2575, + "grad_norm": 1.937239269847908, + "learning_rate": 4.263132897154044e-06, + "loss": 0.2209, + "mean_token_accuracy": 0.9241822361946106, + "step": 6515 + }, + { + "epoch": 3.258, + "grad_norm": 2.4755857345675367, + "learning_rate": 4.2628235300835315e-06, + "loss": 0.1809, + "mean_token_accuracy": 0.9386810660362244, + "step": 6516 + }, + { + "epoch": 3.2585, + "grad_norm": 2.328939349589042, + "learning_rate": 4.262514109314342e-06, + "loss": 0.1997, + "mean_token_accuracy": 0.9261511564254761, + "step": 6517 + }, + { + "epoch": 3.259, + "grad_norm": 1.5377598603128186, + "learning_rate": 4.262204634855904e-06, + "loss": 0.1985, + "mean_token_accuracy": 0.9188984632492065, + "step": 6518 + }, + { + "epoch": 3.2595, + "grad_norm": 2.77185117327534, + "learning_rate": 4.261895106717643e-06, + "loss": 0.2524, + "mean_token_accuracy": 0.9142318367958069, + "step": 6519 + }, + { + "epoch": 3.26, + "grad_norm": 2.2733408116049922, + "learning_rate": 4.261585524908987e-06, + "loss": 0.1996, + "mean_token_accuracy": 0.9285208582878113, + "step": 6520 + }, + { + "epoch": 3.2605, + "grad_norm": 3.2576970129019425, + "learning_rate": 4.261275889439368e-06, + "loss": 0.2552, + "mean_token_accuracy": 0.9017393589019775, + "step": 6521 + }, + { + "epoch": 3.261, + "grad_norm": 3.042437676296222, + "learning_rate": 4.260966200318217e-06, + "loss": 0.1516, + "mean_token_accuracy": 0.9444444179534912, + "step": 6522 + }, + { + "epoch": 3.2615, + "grad_norm": 2.311229651698938, + "learning_rate": 4.260656457554969e-06, + "loss": 0.1809, + "mean_token_accuracy": 0.9316819906234741, + "step": 6523 + }, + { + "epoch": 3.262, + "grad_norm": 1.9670293000944257, + "learning_rate": 4.260346661159058e-06, + "loss": 0.1709, + "mean_token_accuracy": 0.9325240850448608, + "step": 6524 + }, + { + "epoch": 3.2625, + "grad_norm": 3.1692910870117266, + "learning_rate": 4.260036811139922e-06, + "loss": 0.1887, + "mean_token_accuracy": 0.9321701526641846, + "step": 6525 + }, + { + "epoch": 3.263, + "grad_norm": 2.5947680431848363, + "learning_rate": 4.259726907506998e-06, + "loss": 0.197, + "mean_token_accuracy": 0.9318141937255859, + "step": 6526 + }, + { + "epoch": 3.2635, + "grad_norm": 2.836325156100356, + "learning_rate": 4.259416950269727e-06, + "loss": 0.2126, + "mean_token_accuracy": 0.922068178653717, + "step": 6527 + }, + { + "epoch": 3.2640000000000002, + "grad_norm": 2.873661779053768, + "learning_rate": 4.259106939437551e-06, + "loss": 0.2288, + "mean_token_accuracy": 0.9205530285835266, + "step": 6528 + }, + { + "epoch": 3.2645, + "grad_norm": 2.0382046858180014, + "learning_rate": 4.258796875019914e-06, + "loss": 0.2065, + "mean_token_accuracy": 0.9278688430786133, + "step": 6529 + }, + { + "epoch": 3.265, + "grad_norm": 2.633754263685503, + "learning_rate": 4.25848675702626e-06, + "loss": 0.226, + "mean_token_accuracy": 0.9167888164520264, + "step": 6530 + }, + { + "epoch": 3.2655, + "grad_norm": 1.8065047912412935, + "learning_rate": 4.258176585466037e-06, + "loss": 0.1608, + "mean_token_accuracy": 0.940816342830658, + "step": 6531 + }, + { + "epoch": 3.266, + "grad_norm": 1.528909492840022, + "learning_rate": 4.2578663603486916e-06, + "loss": 0.1734, + "mean_token_accuracy": 0.9354131817817688, + "step": 6532 + }, + { + "epoch": 3.2665, + "grad_norm": 6.919058202053316, + "learning_rate": 4.257556081683676e-06, + "loss": 0.2367, + "mean_token_accuracy": 0.9286551475524902, + "step": 6533 + }, + { + "epoch": 3.267, + "grad_norm": 2.542365462844721, + "learning_rate": 4.25724574948044e-06, + "loss": 0.2106, + "mean_token_accuracy": 0.9179156422615051, + "step": 6534 + }, + { + "epoch": 3.2675, + "grad_norm": 3.2356825628718267, + "learning_rate": 4.256935363748437e-06, + "loss": 0.2803, + "mean_token_accuracy": 0.9075576663017273, + "step": 6535 + }, + { + "epoch": 3.268, + "grad_norm": 1.934736829384105, + "learning_rate": 4.256624924497124e-06, + "loss": 0.194, + "mean_token_accuracy": 0.9327688217163086, + "step": 6536 + }, + { + "epoch": 3.2685, + "grad_norm": 2.423635755399723, + "learning_rate": 4.2563144317359545e-06, + "loss": 0.1713, + "mean_token_accuracy": 0.933682382106781, + "step": 6537 + }, + { + "epoch": 3.269, + "grad_norm": 2.449616886576526, + "learning_rate": 4.256003885474388e-06, + "loss": 0.237, + "mean_token_accuracy": 0.9126631021499634, + "step": 6538 + }, + { + "epoch": 3.2695, + "grad_norm": 2.616791006205123, + "learning_rate": 4.255693285721885e-06, + "loss": 0.2056, + "mean_token_accuracy": 0.9304648637771606, + "step": 6539 + }, + { + "epoch": 3.27, + "grad_norm": 3.7328669631417095, + "learning_rate": 4.255382632487907e-06, + "loss": 0.2424, + "mean_token_accuracy": 0.9202742576599121, + "step": 6540 + }, + { + "epoch": 3.2705, + "grad_norm": 4.190516440741714, + "learning_rate": 4.2550719257819154e-06, + "loss": 0.1799, + "mean_token_accuracy": 0.9349300265312195, + "step": 6541 + }, + { + "epoch": 3.271, + "grad_norm": 2.1013648167928505, + "learning_rate": 4.2547611656133755e-06, + "loss": 0.1935, + "mean_token_accuracy": 0.92479008436203, + "step": 6542 + }, + { + "epoch": 3.2715, + "grad_norm": 1.6793350115955907, + "learning_rate": 4.254450351991754e-06, + "loss": 0.1573, + "mean_token_accuracy": 0.9415470957756042, + "step": 6543 + }, + { + "epoch": 3.2720000000000002, + "grad_norm": 2.233960163891437, + "learning_rate": 4.254139484926519e-06, + "loss": 0.2245, + "mean_token_accuracy": 0.9128291606903076, + "step": 6544 + }, + { + "epoch": 3.2725, + "grad_norm": 1.532419297757237, + "learning_rate": 4.25382856442714e-06, + "loss": 0.2081, + "mean_token_accuracy": 0.9285302758216858, + "step": 6545 + }, + { + "epoch": 3.273, + "grad_norm": 1.6866413886546088, + "learning_rate": 4.253517590503087e-06, + "loss": 0.1701, + "mean_token_accuracy": 0.9364187121391296, + "step": 6546 + }, + { + "epoch": 3.2735, + "grad_norm": 1.9237237429129448, + "learning_rate": 4.253206563163834e-06, + "loss": 0.2376, + "mean_token_accuracy": 0.9161991477012634, + "step": 6547 + }, + { + "epoch": 3.274, + "grad_norm": 1.647155175132203, + "learning_rate": 4.252895482418856e-06, + "loss": 0.2003, + "mean_token_accuracy": 0.9288855195045471, + "step": 6548 + }, + { + "epoch": 3.2745, + "grad_norm": 3.5918817652987887, + "learning_rate": 4.252584348277628e-06, + "loss": 0.2294, + "mean_token_accuracy": 0.9226842522621155, + "step": 6549 + }, + { + "epoch": 3.275, + "grad_norm": 2.5711803370301003, + "learning_rate": 4.2522731607496275e-06, + "loss": 0.2012, + "mean_token_accuracy": 0.9346428513526917, + "step": 6550 + }, + { + "epoch": 3.2755, + "grad_norm": 3.7144299674137167, + "learning_rate": 4.251961919844334e-06, + "loss": 0.1669, + "mean_token_accuracy": 0.9412520527839661, + "step": 6551 + }, + { + "epoch": 3.276, + "grad_norm": 2.2959479525099975, + "learning_rate": 4.25165062557123e-06, + "loss": 0.2187, + "mean_token_accuracy": 0.9244298338890076, + "step": 6552 + }, + { + "epoch": 3.2765, + "grad_norm": 2.3743817789904926, + "learning_rate": 4.251339277939795e-06, + "loss": 0.2355, + "mean_token_accuracy": 0.9258579611778259, + "step": 6553 + }, + { + "epoch": 3.277, + "grad_norm": 4.772210156959594, + "learning_rate": 4.251027876959517e-06, + "loss": 0.1929, + "mean_token_accuracy": 0.9339417815208435, + "step": 6554 + }, + { + "epoch": 3.2775, + "grad_norm": 1.7973129333895042, + "learning_rate": 4.250716422639878e-06, + "loss": 0.1736, + "mean_token_accuracy": 0.9318456053733826, + "step": 6555 + }, + { + "epoch": 3.278, + "grad_norm": 1.7000194761803082, + "learning_rate": 4.250404914990367e-06, + "loss": 0.227, + "mean_token_accuracy": 0.9166399836540222, + "step": 6556 + }, + { + "epoch": 3.2785, + "grad_norm": 1.7563203554721818, + "learning_rate": 4.250093354020475e-06, + "loss": 0.1651, + "mean_token_accuracy": 0.9389224052429199, + "step": 6557 + }, + { + "epoch": 3.279, + "grad_norm": 6.123713147438586, + "learning_rate": 4.249781739739689e-06, + "loss": 0.2167, + "mean_token_accuracy": 0.9316596984863281, + "step": 6558 + }, + { + "epoch": 3.2795, + "grad_norm": 1.7528465485879834, + "learning_rate": 4.2494700721575045e-06, + "loss": 0.187, + "mean_token_accuracy": 0.9318014979362488, + "step": 6559 + }, + { + "epoch": 3.2800000000000002, + "grad_norm": 2.988125766468603, + "learning_rate": 4.249158351283414e-06, + "loss": 0.2584, + "mean_token_accuracy": 0.9154557585716248, + "step": 6560 + }, + { + "epoch": 3.2805, + "grad_norm": 2.924973202435863, + "learning_rate": 4.248846577126912e-06, + "loss": 0.2474, + "mean_token_accuracy": 0.916599690914154, + "step": 6561 + }, + { + "epoch": 3.281, + "grad_norm": 8.143782597191013, + "learning_rate": 4.248534749697499e-06, + "loss": 0.1879, + "mean_token_accuracy": 0.9308016896247864, + "step": 6562 + }, + { + "epoch": 3.2815, + "grad_norm": 4.2300163083763165, + "learning_rate": 4.2482228690046715e-06, + "loss": 0.2319, + "mean_token_accuracy": 0.9172549843788147, + "step": 6563 + }, + { + "epoch": 3.282, + "grad_norm": 5.046519269307213, + "learning_rate": 4.2479109350579286e-06, + "loss": 0.2259, + "mean_token_accuracy": 0.9232392907142639, + "step": 6564 + }, + { + "epoch": 3.2824999999999998, + "grad_norm": 2.592263517897882, + "learning_rate": 4.247598947866775e-06, + "loss": 0.1587, + "mean_token_accuracy": 0.9394844770431519, + "step": 6565 + }, + { + "epoch": 3.283, + "grad_norm": 1.9469680091431603, + "learning_rate": 4.247286907440713e-06, + "loss": 0.2408, + "mean_token_accuracy": 0.9140218496322632, + "step": 6566 + }, + { + "epoch": 3.2835, + "grad_norm": 2.051250025881697, + "learning_rate": 4.2469748137892485e-06, + "loss": 0.1841, + "mean_token_accuracy": 0.932045578956604, + "step": 6567 + }, + { + "epoch": 3.284, + "grad_norm": 2.2506438418347425, + "learning_rate": 4.246662666921888e-06, + "loss": 0.2304, + "mean_token_accuracy": 0.920607328414917, + "step": 6568 + }, + { + "epoch": 3.2845, + "grad_norm": 1.960249505245259, + "learning_rate": 4.24635046684814e-06, + "loss": 0.2062, + "mean_token_accuracy": 0.931246817111969, + "step": 6569 + }, + { + "epoch": 3.285, + "grad_norm": 2.745108535271563, + "learning_rate": 4.246038213577516e-06, + "loss": 0.2207, + "mean_token_accuracy": 0.9255014061927795, + "step": 6570 + }, + { + "epoch": 3.2855, + "grad_norm": 2.170284306252359, + "learning_rate": 4.245725907119525e-06, + "loss": 0.1761, + "mean_token_accuracy": 0.9394837021827698, + "step": 6571 + }, + { + "epoch": 3.286, + "grad_norm": 1.9586714717093656, + "learning_rate": 4.245413547483682e-06, + "loss": 0.2311, + "mean_token_accuracy": 0.9180033206939697, + "step": 6572 + }, + { + "epoch": 3.2865, + "grad_norm": 1.87544207577421, + "learning_rate": 4.245101134679502e-06, + "loss": 0.1622, + "mean_token_accuracy": 0.939455509185791, + "step": 6573 + }, + { + "epoch": 3.287, + "grad_norm": 2.4838714853599315, + "learning_rate": 4.244788668716503e-06, + "loss": 0.2193, + "mean_token_accuracy": 0.9256613254547119, + "step": 6574 + }, + { + "epoch": 3.2875, + "grad_norm": 14.351303350538057, + "learning_rate": 4.244476149604201e-06, + "loss": 0.1851, + "mean_token_accuracy": 0.932361900806427, + "step": 6575 + }, + { + "epoch": 3.288, + "grad_norm": 3.939116230811783, + "learning_rate": 4.244163577352116e-06, + "loss": 0.2144, + "mean_token_accuracy": 0.9319319128990173, + "step": 6576 + }, + { + "epoch": 3.2885, + "grad_norm": 2.610900764868254, + "learning_rate": 4.243850951969772e-06, + "loss": 0.2064, + "mean_token_accuracy": 0.9327070116996765, + "step": 6577 + }, + { + "epoch": 3.289, + "grad_norm": 2.569758903137735, + "learning_rate": 4.243538273466689e-06, + "loss": 0.2573, + "mean_token_accuracy": 0.9109019637107849, + "step": 6578 + }, + { + "epoch": 3.2895, + "grad_norm": 2.4910841500989194, + "learning_rate": 4.2432255418523935e-06, + "loss": 0.1879, + "mean_token_accuracy": 0.9371750354766846, + "step": 6579 + }, + { + "epoch": 3.29, + "grad_norm": 1.9152485743489103, + "learning_rate": 4.242912757136412e-06, + "loss": 0.1731, + "mean_token_accuracy": 0.9363481402397156, + "step": 6580 + }, + { + "epoch": 3.2904999999999998, + "grad_norm": 3.894009903305043, + "learning_rate": 4.242599919328271e-06, + "loss": 0.2284, + "mean_token_accuracy": 0.9223631024360657, + "step": 6581 + }, + { + "epoch": 3.291, + "grad_norm": 1.5353934747004723, + "learning_rate": 4.242287028437502e-06, + "loss": 0.2153, + "mean_token_accuracy": 0.9241840243339539, + "step": 6582 + }, + { + "epoch": 3.2915, + "grad_norm": 3.0501119071609013, + "learning_rate": 4.241974084473634e-06, + "loss": 0.1881, + "mean_token_accuracy": 0.9304537177085876, + "step": 6583 + }, + { + "epoch": 3.292, + "grad_norm": 10.218233871635201, + "learning_rate": 4.241661087446202e-06, + "loss": 0.2019, + "mean_token_accuracy": 0.9257657527923584, + "step": 6584 + }, + { + "epoch": 3.2925, + "grad_norm": 1.784015517982909, + "learning_rate": 4.24134803736474e-06, + "loss": 0.2152, + "mean_token_accuracy": 0.9246106743812561, + "step": 6585 + }, + { + "epoch": 3.293, + "grad_norm": 2.317310149768309, + "learning_rate": 4.241034934238782e-06, + "loss": 0.2211, + "mean_token_accuracy": 0.921460747718811, + "step": 6586 + }, + { + "epoch": 3.2935, + "grad_norm": 5.405463236126653, + "learning_rate": 4.2407217780778685e-06, + "loss": 0.1948, + "mean_token_accuracy": 0.9319304823875427, + "step": 6587 + }, + { + "epoch": 3.294, + "grad_norm": 3.045295078921171, + "learning_rate": 4.240408568891537e-06, + "loss": 0.1992, + "mean_token_accuracy": 0.9286212921142578, + "step": 6588 + }, + { + "epoch": 3.2945, + "grad_norm": 1.5477572565787405, + "learning_rate": 4.240095306689329e-06, + "loss": 0.1629, + "mean_token_accuracy": 0.9386317729949951, + "step": 6589 + }, + { + "epoch": 3.295, + "grad_norm": 2.4365714241738567, + "learning_rate": 4.239781991480786e-06, + "loss": 0.2225, + "mean_token_accuracy": 0.9200263023376465, + "step": 6590 + }, + { + "epoch": 3.2955, + "grad_norm": 6.202944650541917, + "learning_rate": 4.239468623275454e-06, + "loss": 0.1784, + "mean_token_accuracy": 0.9326214790344238, + "step": 6591 + }, + { + "epoch": 3.296, + "grad_norm": 2.68782278337683, + "learning_rate": 4.239155202082878e-06, + "loss": 0.1715, + "mean_token_accuracy": 0.9392514228820801, + "step": 6592 + }, + { + "epoch": 3.2965, + "grad_norm": 1.983764300677779, + "learning_rate": 4.238841727912604e-06, + "loss": 0.2071, + "mean_token_accuracy": 0.9237882494926453, + "step": 6593 + }, + { + "epoch": 3.297, + "grad_norm": 1.589733940761141, + "learning_rate": 4.238528200774182e-06, + "loss": 0.1363, + "mean_token_accuracy": 0.9547069072723389, + "step": 6594 + }, + { + "epoch": 3.2975, + "grad_norm": 2.68704145829445, + "learning_rate": 4.238214620677164e-06, + "loss": 0.187, + "mean_token_accuracy": 0.9353448152542114, + "step": 6595 + }, + { + "epoch": 3.298, + "grad_norm": 2.1961616135420337, + "learning_rate": 4.2379009876311e-06, + "loss": 0.1651, + "mean_token_accuracy": 0.9422915577888489, + "step": 6596 + }, + { + "epoch": 3.2984999999999998, + "grad_norm": 1.7602808500214586, + "learning_rate": 4.237587301645545e-06, + "loss": 0.1812, + "mean_token_accuracy": 0.9385863542556763, + "step": 6597 + }, + { + "epoch": 3.299, + "grad_norm": 2.3161042016401328, + "learning_rate": 4.237273562730054e-06, + "loss": 0.2544, + "mean_token_accuracy": 0.9155893325805664, + "step": 6598 + }, + { + "epoch": 3.2995, + "grad_norm": 2.9122324056771625, + "learning_rate": 4.236959770894184e-06, + "loss": 0.1988, + "mean_token_accuracy": 0.9326111674308777, + "step": 6599 + }, + { + "epoch": 3.3, + "grad_norm": 2.0055591942959703, + "learning_rate": 4.236645926147493e-06, + "loss": 0.1909, + "mean_token_accuracy": 0.9334314465522766, + "step": 6600 + }, + { + "epoch": 3.3005, + "grad_norm": 1.9589974388800264, + "learning_rate": 4.236332028499544e-06, + "loss": 0.1939, + "mean_token_accuracy": 0.9294502139091492, + "step": 6601 + }, + { + "epoch": 3.301, + "grad_norm": 2.68253825109494, + "learning_rate": 4.236018077959895e-06, + "loss": 0.2207, + "mean_token_accuracy": 0.9233430624008179, + "step": 6602 + }, + { + "epoch": 3.3015, + "grad_norm": 2.371857217971697, + "learning_rate": 4.235704074538112e-06, + "loss": 0.1818, + "mean_token_accuracy": 0.9356317520141602, + "step": 6603 + }, + { + "epoch": 3.302, + "grad_norm": 1.352150105070001, + "learning_rate": 4.23539001824376e-06, + "loss": 0.1491, + "mean_token_accuracy": 0.9409422874450684, + "step": 6604 + }, + { + "epoch": 3.3025, + "grad_norm": 3.8398029906683777, + "learning_rate": 4.235075909086405e-06, + "loss": 0.2568, + "mean_token_accuracy": 0.9128636717796326, + "step": 6605 + }, + { + "epoch": 3.303, + "grad_norm": 2.9671847362578676, + "learning_rate": 4.2347617470756146e-06, + "loss": 0.2587, + "mean_token_accuracy": 0.92088383436203, + "step": 6606 + }, + { + "epoch": 3.3035, + "grad_norm": 1.864812482216317, + "learning_rate": 4.23444753222096e-06, + "loss": 0.1997, + "mean_token_accuracy": 0.9371780157089233, + "step": 6607 + }, + { + "epoch": 3.304, + "grad_norm": 1.576182955710227, + "learning_rate": 4.234133264532012e-06, + "loss": 0.172, + "mean_token_accuracy": 0.9440938830375671, + "step": 6608 + }, + { + "epoch": 3.3045, + "grad_norm": 3.4251586181514506, + "learning_rate": 4.233818944018345e-06, + "loss": 0.2096, + "mean_token_accuracy": 0.9198445081710815, + "step": 6609 + }, + { + "epoch": 3.305, + "grad_norm": 2.39205801787422, + "learning_rate": 4.233504570689533e-06, + "loss": 0.1601, + "mean_token_accuracy": 0.9423396587371826, + "step": 6610 + }, + { + "epoch": 3.3055, + "grad_norm": 5.777507987165036, + "learning_rate": 4.2331901445551515e-06, + "loss": 0.2073, + "mean_token_accuracy": 0.9287480711936951, + "step": 6611 + }, + { + "epoch": 3.306, + "grad_norm": 1.7173700126879876, + "learning_rate": 4.232875665624779e-06, + "loss": 0.2042, + "mean_token_accuracy": 0.923882007598877, + "step": 6612 + }, + { + "epoch": 3.3064999999999998, + "grad_norm": 2.0212262254921516, + "learning_rate": 4.232561133907996e-06, + "loss": 0.1521, + "mean_token_accuracy": 0.9442028999328613, + "step": 6613 + }, + { + "epoch": 3.307, + "grad_norm": 2.130661417393135, + "learning_rate": 4.232246549414381e-06, + "loss": 0.2115, + "mean_token_accuracy": 0.9272304177284241, + "step": 6614 + }, + { + "epoch": 3.3075, + "grad_norm": 3.9395209024010582, + "learning_rate": 4.231931912153521e-06, + "loss": 0.1963, + "mean_token_accuracy": 0.9316807985305786, + "step": 6615 + }, + { + "epoch": 3.308, + "grad_norm": 3.2411457133142845, + "learning_rate": 4.231617222134997e-06, + "loss": 0.2311, + "mean_token_accuracy": 0.923391580581665, + "step": 6616 + }, + { + "epoch": 3.3085, + "grad_norm": 1.5724337419923138, + "learning_rate": 4.2313024793683965e-06, + "loss": 0.184, + "mean_token_accuracy": 0.9338973164558411, + "step": 6617 + }, + { + "epoch": 3.309, + "grad_norm": 1.249501360799104, + "learning_rate": 4.230987683863307e-06, + "loss": 0.1399, + "mean_token_accuracy": 0.9468185901641846, + "step": 6618 + }, + { + "epoch": 3.3095, + "grad_norm": 3.883375308999491, + "learning_rate": 4.230672835629317e-06, + "loss": 0.22, + "mean_token_accuracy": 0.9200406670570374, + "step": 6619 + }, + { + "epoch": 3.31, + "grad_norm": 3.173748719156808, + "learning_rate": 4.230357934676017e-06, + "loss": 0.2305, + "mean_token_accuracy": 0.9166666865348816, + "step": 6620 + }, + { + "epoch": 3.3105, + "grad_norm": 3.6255483683177356, + "learning_rate": 4.230042981013002e-06, + "loss": 0.2362, + "mean_token_accuracy": 0.9277703762054443, + "step": 6621 + }, + { + "epoch": 3.311, + "grad_norm": 3.462928390284534, + "learning_rate": 4.229727974649863e-06, + "loss": 0.1739, + "mean_token_accuracy": 0.940646767616272, + "step": 6622 + }, + { + "epoch": 3.3115, + "grad_norm": 2.002884376166602, + "learning_rate": 4.229412915596196e-06, + "loss": 0.1579, + "mean_token_accuracy": 0.9479328989982605, + "step": 6623 + }, + { + "epoch": 3.312, + "grad_norm": 1.8615530240088747, + "learning_rate": 4.229097803861601e-06, + "loss": 0.1899, + "mean_token_accuracy": 0.9389029741287231, + "step": 6624 + }, + { + "epoch": 3.3125, + "grad_norm": 2.5675913114284143, + "learning_rate": 4.228782639455674e-06, + "loss": 0.1554, + "mean_token_accuracy": 0.9385604858398438, + "step": 6625 + }, + { + "epoch": 3.313, + "grad_norm": 1.9999967051639291, + "learning_rate": 4.2284674223880165e-06, + "loss": 0.2273, + "mean_token_accuracy": 0.9263519644737244, + "step": 6626 + }, + { + "epoch": 3.3135, + "grad_norm": 1.8906752470318726, + "learning_rate": 4.228152152668231e-06, + "loss": 0.1939, + "mean_token_accuracy": 0.9315135478973389, + "step": 6627 + }, + { + "epoch": 3.314, + "grad_norm": 1.9217696832039344, + "learning_rate": 4.22783683030592e-06, + "loss": 0.1993, + "mean_token_accuracy": 0.9250395894050598, + "step": 6628 + }, + { + "epoch": 3.3145, + "grad_norm": 2.0632110706636593, + "learning_rate": 4.227521455310689e-06, + "loss": 0.1693, + "mean_token_accuracy": 0.9382957816123962, + "step": 6629 + }, + { + "epoch": 3.315, + "grad_norm": 1.8414525390216814, + "learning_rate": 4.227206027692146e-06, + "loss": 0.2122, + "mean_token_accuracy": 0.931510865688324, + "step": 6630 + }, + { + "epoch": 3.3155, + "grad_norm": 1.9130760371060247, + "learning_rate": 4.226890547459899e-06, + "loss": 0.1956, + "mean_token_accuracy": 0.9370078444480896, + "step": 6631 + }, + { + "epoch": 3.316, + "grad_norm": 2.758612204499232, + "learning_rate": 4.226575014623557e-06, + "loss": 0.17, + "mean_token_accuracy": 0.9348673820495605, + "step": 6632 + }, + { + "epoch": 3.3165, + "grad_norm": 1.9708985086619815, + "learning_rate": 4.226259429192734e-06, + "loss": 0.1746, + "mean_token_accuracy": 0.934391975402832, + "step": 6633 + }, + { + "epoch": 3.317, + "grad_norm": 2.7855307248330585, + "learning_rate": 4.225943791177041e-06, + "loss": 0.1838, + "mean_token_accuracy": 0.9355893731117249, + "step": 6634 + }, + { + "epoch": 3.3175, + "grad_norm": 2.8065790250785003, + "learning_rate": 4.225628100586093e-06, + "loss": 0.2099, + "mean_token_accuracy": 0.9282589554786682, + "step": 6635 + }, + { + "epoch": 3.318, + "grad_norm": 2.320951570929723, + "learning_rate": 4.225312357429508e-06, + "loss": 0.2315, + "mean_token_accuracy": 0.9203231930732727, + "step": 6636 + }, + { + "epoch": 3.3185000000000002, + "grad_norm": 1.5785805084103812, + "learning_rate": 4.224996561716903e-06, + "loss": 0.1601, + "mean_token_accuracy": 0.9366259574890137, + "step": 6637 + }, + { + "epoch": 3.319, + "grad_norm": 1.8847054015454465, + "learning_rate": 4.224680713457899e-06, + "loss": 0.1976, + "mean_token_accuracy": 0.9320546388626099, + "step": 6638 + }, + { + "epoch": 3.3195, + "grad_norm": 2.047494362506027, + "learning_rate": 4.224364812662114e-06, + "loss": 0.2021, + "mean_token_accuracy": 0.9255433082580566, + "step": 6639 + }, + { + "epoch": 3.32, + "grad_norm": 1.8625809691537214, + "learning_rate": 4.224048859339175e-06, + "loss": 0.1534, + "mean_token_accuracy": 0.944314181804657, + "step": 6640 + }, + { + "epoch": 3.3205, + "grad_norm": 1.8846121103240765, + "learning_rate": 4.223732853498704e-06, + "loss": 0.1376, + "mean_token_accuracy": 0.9487341642379761, + "step": 6641 + }, + { + "epoch": 3.321, + "grad_norm": 1.7964721336557774, + "learning_rate": 4.223416795150328e-06, + "loss": 0.1942, + "mean_token_accuracy": 0.9255861639976501, + "step": 6642 + }, + { + "epoch": 3.3215, + "grad_norm": 1.8674781446252626, + "learning_rate": 4.223100684303674e-06, + "loss": 0.1487, + "mean_token_accuracy": 0.9446435570716858, + "step": 6643 + }, + { + "epoch": 3.322, + "grad_norm": 1.6157884823491693, + "learning_rate": 4.2227845209683715e-06, + "loss": 0.2065, + "mean_token_accuracy": 0.9235728979110718, + "step": 6644 + }, + { + "epoch": 3.3225, + "grad_norm": 3.253638042009083, + "learning_rate": 4.222468305154052e-06, + "loss": 0.2395, + "mean_token_accuracy": 0.9283707737922668, + "step": 6645 + }, + { + "epoch": 3.323, + "grad_norm": 2.371017410664903, + "learning_rate": 4.222152036870348e-06, + "loss": 0.2647, + "mean_token_accuracy": 0.9170573949813843, + "step": 6646 + }, + { + "epoch": 3.3235, + "grad_norm": 2.199367372085925, + "learning_rate": 4.221835716126892e-06, + "loss": 0.1687, + "mean_token_accuracy": 0.9337557554244995, + "step": 6647 + }, + { + "epoch": 3.324, + "grad_norm": 3.0544501970333693, + "learning_rate": 4.221519342933321e-06, + "loss": 0.1728, + "mean_token_accuracy": 0.9387155175209045, + "step": 6648 + }, + { + "epoch": 3.3245, + "grad_norm": 1.8926697160828094, + "learning_rate": 4.221202917299273e-06, + "loss": 0.236, + "mean_token_accuracy": 0.9195001721382141, + "step": 6649 + }, + { + "epoch": 3.325, + "grad_norm": 1.6950440642475826, + "learning_rate": 4.220886439234385e-06, + "loss": 0.1782, + "mean_token_accuracy": 0.9343705177307129, + "step": 6650 + }, + { + "epoch": 3.3255, + "grad_norm": 2.175550811228924, + "learning_rate": 4.220569908748299e-06, + "loss": 0.1625, + "mean_token_accuracy": 0.9447106719017029, + "step": 6651 + }, + { + "epoch": 3.326, + "grad_norm": 1.9388358606904526, + "learning_rate": 4.2202533258506575e-06, + "loss": 0.1779, + "mean_token_accuracy": 0.9406720399856567, + "step": 6652 + }, + { + "epoch": 3.3265000000000002, + "grad_norm": 2.5051441203160367, + "learning_rate": 4.219936690551102e-06, + "loss": 0.1641, + "mean_token_accuracy": 0.9504144787788391, + "step": 6653 + }, + { + "epoch": 3.327, + "grad_norm": 2.1203559205578033, + "learning_rate": 4.219620002859278e-06, + "loss": 0.1976, + "mean_token_accuracy": 0.9241982698440552, + "step": 6654 + }, + { + "epoch": 3.3275, + "grad_norm": 2.705912039223652, + "learning_rate": 4.219303262784834e-06, + "loss": 0.1907, + "mean_token_accuracy": 0.9314612150192261, + "step": 6655 + }, + { + "epoch": 3.328, + "grad_norm": 2.0601203021998447, + "learning_rate": 4.218986470337419e-06, + "loss": 0.2129, + "mean_token_accuracy": 0.9220526218414307, + "step": 6656 + }, + { + "epoch": 3.3285, + "grad_norm": 2.7208752316609903, + "learning_rate": 4.218669625526681e-06, + "loss": 0.1979, + "mean_token_accuracy": 0.9260920286178589, + "step": 6657 + }, + { + "epoch": 3.329, + "grad_norm": 3.5862279140915723, + "learning_rate": 4.218352728362272e-06, + "loss": 0.2563, + "mean_token_accuracy": 0.9170992970466614, + "step": 6658 + }, + { + "epoch": 3.3295, + "grad_norm": 2.2705259435728955, + "learning_rate": 4.2180357788538466e-06, + "loss": 0.2442, + "mean_token_accuracy": 0.9151219725608826, + "step": 6659 + }, + { + "epoch": 3.33, + "grad_norm": 2.524709909980083, + "learning_rate": 4.217718777011058e-06, + "loss": 0.1978, + "mean_token_accuracy": 0.9333052635192871, + "step": 6660 + }, + { + "epoch": 3.3305, + "grad_norm": 1.6839745043540106, + "learning_rate": 4.217401722843564e-06, + "loss": 0.1813, + "mean_token_accuracy": 0.9322371482849121, + "step": 6661 + }, + { + "epoch": 3.331, + "grad_norm": 2.732737402592672, + "learning_rate": 4.2170846163610215e-06, + "loss": 0.1957, + "mean_token_accuracy": 0.9263474941253662, + "step": 6662 + }, + { + "epoch": 3.3315, + "grad_norm": 2.7952180310679404, + "learning_rate": 4.216767457573091e-06, + "loss": 0.1671, + "mean_token_accuracy": 0.9390959143638611, + "step": 6663 + }, + { + "epoch": 3.332, + "grad_norm": 2.686857707958953, + "learning_rate": 4.216450246489432e-06, + "loss": 0.2249, + "mean_token_accuracy": 0.9131306409835815, + "step": 6664 + }, + { + "epoch": 3.3325, + "grad_norm": 3.2463252529675257, + "learning_rate": 4.2161329831197095e-06, + "loss": 0.2508, + "mean_token_accuracy": 0.9199285507202148, + "step": 6665 + }, + { + "epoch": 3.333, + "grad_norm": 1.8898407085265005, + "learning_rate": 4.215815667473588e-06, + "loss": 0.1629, + "mean_token_accuracy": 0.939921498298645, + "step": 6666 + }, + { + "epoch": 3.3335, + "grad_norm": 2.592622675037864, + "learning_rate": 4.215498299560731e-06, + "loss": 0.2263, + "mean_token_accuracy": 0.9179091453552246, + "step": 6667 + }, + { + "epoch": 3.334, + "grad_norm": 1.8986346905356013, + "learning_rate": 4.215180879390808e-06, + "loss": 0.1818, + "mean_token_accuracy": 0.9373705387115479, + "step": 6668 + }, + { + "epoch": 3.3345000000000002, + "grad_norm": 2.0233139982001442, + "learning_rate": 4.214863406973487e-06, + "loss": 0.2113, + "mean_token_accuracy": 0.9253124594688416, + "step": 6669 + }, + { + "epoch": 3.335, + "grad_norm": 1.908423961456158, + "learning_rate": 4.2145458823184414e-06, + "loss": 0.2489, + "mean_token_accuracy": 0.9162610769271851, + "step": 6670 + }, + { + "epoch": 3.3355, + "grad_norm": 2.1584898898071803, + "learning_rate": 4.21422830543534e-06, + "loss": 0.2005, + "mean_token_accuracy": 0.9306910037994385, + "step": 6671 + }, + { + "epoch": 3.336, + "grad_norm": 2.180014977133156, + "learning_rate": 4.2139106763338595e-06, + "loss": 0.2314, + "mean_token_accuracy": 0.9164018034934998, + "step": 6672 + }, + { + "epoch": 3.3365, + "grad_norm": 2.244231173560601, + "learning_rate": 4.213592995023673e-06, + "loss": 0.1523, + "mean_token_accuracy": 0.9419007301330566, + "step": 6673 + }, + { + "epoch": 3.337, + "grad_norm": 2.378888460537946, + "learning_rate": 4.21327526151446e-06, + "loss": 0.2452, + "mean_token_accuracy": 0.9109734296798706, + "step": 6674 + }, + { + "epoch": 3.3375, + "grad_norm": 2.6170492741154088, + "learning_rate": 4.212957475815898e-06, + "loss": 0.1694, + "mean_token_accuracy": 0.9356579780578613, + "step": 6675 + }, + { + "epoch": 3.338, + "grad_norm": 2.412401778032329, + "learning_rate": 4.212639637937668e-06, + "loss": 0.1908, + "mean_token_accuracy": 0.932830810546875, + "step": 6676 + }, + { + "epoch": 3.3385, + "grad_norm": 2.8361523104052075, + "learning_rate": 4.21232174788945e-06, + "loss": 0.2478, + "mean_token_accuracy": 0.9140300154685974, + "step": 6677 + }, + { + "epoch": 3.339, + "grad_norm": 4.271815432944379, + "learning_rate": 4.2120038056809304e-06, + "loss": 0.196, + "mean_token_accuracy": 0.9338409304618835, + "step": 6678 + }, + { + "epoch": 3.3395, + "grad_norm": 3.2414568839716136, + "learning_rate": 4.211685811321791e-06, + "loss": 0.1744, + "mean_token_accuracy": 0.9365897178649902, + "step": 6679 + }, + { + "epoch": 3.34, + "grad_norm": 2.5186977042312972, + "learning_rate": 4.211367764821722e-06, + "loss": 0.2115, + "mean_token_accuracy": 0.9262567758560181, + "step": 6680 + }, + { + "epoch": 3.3405, + "grad_norm": 2.132366614380203, + "learning_rate": 4.211049666190409e-06, + "loss": 0.223, + "mean_token_accuracy": 0.9240894913673401, + "step": 6681 + }, + { + "epoch": 3.341, + "grad_norm": 1.6917535119350078, + "learning_rate": 4.210731515437543e-06, + "loss": 0.2065, + "mean_token_accuracy": 0.9273765683174133, + "step": 6682 + }, + { + "epoch": 3.3415, + "grad_norm": 1.500546790657322, + "learning_rate": 4.210413312572815e-06, + "loss": 0.154, + "mean_token_accuracy": 0.9435468912124634, + "step": 6683 + }, + { + "epoch": 3.342, + "grad_norm": 2.8026498442269796, + "learning_rate": 4.210095057605917e-06, + "loss": 0.2108, + "mean_token_accuracy": 0.9291260242462158, + "step": 6684 + }, + { + "epoch": 3.3425000000000002, + "grad_norm": 2.1416190107647672, + "learning_rate": 4.209776750546547e-06, + "loss": 0.213, + "mean_token_accuracy": 0.92357337474823, + "step": 6685 + }, + { + "epoch": 3.343, + "grad_norm": 2.1007815885970285, + "learning_rate": 4.209458391404398e-06, + "loss": 0.2366, + "mean_token_accuracy": 0.9149665832519531, + "step": 6686 + }, + { + "epoch": 3.3435, + "grad_norm": 2.256541591584145, + "learning_rate": 4.209139980189168e-06, + "loss": 0.2008, + "mean_token_accuracy": 0.9295468330383301, + "step": 6687 + }, + { + "epoch": 3.344, + "grad_norm": 2.6097607063044466, + "learning_rate": 4.208821516910557e-06, + "loss": 0.2057, + "mean_token_accuracy": 0.9359626173973083, + "step": 6688 + }, + { + "epoch": 3.3445, + "grad_norm": 1.9126103290495908, + "learning_rate": 4.208503001578267e-06, + "loss": 0.1804, + "mean_token_accuracy": 0.93272864818573, + "step": 6689 + }, + { + "epoch": 3.3449999999999998, + "grad_norm": 2.170119520062399, + "learning_rate": 4.208184434201999e-06, + "loss": 0.1816, + "mean_token_accuracy": 0.934012770652771, + "step": 6690 + }, + { + "epoch": 3.3455, + "grad_norm": 2.617979525624369, + "learning_rate": 4.207865814791456e-06, + "loss": 0.1485, + "mean_token_accuracy": 0.9476696848869324, + "step": 6691 + }, + { + "epoch": 3.346, + "grad_norm": 2.1314162873855227, + "learning_rate": 4.207547143356347e-06, + "loss": 0.1905, + "mean_token_accuracy": 0.9297276139259338, + "step": 6692 + }, + { + "epoch": 3.3465, + "grad_norm": 3.041643810871032, + "learning_rate": 4.207228419906379e-06, + "loss": 0.1621, + "mean_token_accuracy": 0.9449298977851868, + "step": 6693 + }, + { + "epoch": 3.347, + "grad_norm": 4.000789386148516, + "learning_rate": 4.206909644451257e-06, + "loss": 0.1851, + "mean_token_accuracy": 0.9359458684921265, + "step": 6694 + }, + { + "epoch": 3.3475, + "grad_norm": 1.6804631125580156, + "learning_rate": 4.206590817000695e-06, + "loss": 0.1765, + "mean_token_accuracy": 0.9306883215904236, + "step": 6695 + }, + { + "epoch": 3.348, + "grad_norm": 2.3208221815347883, + "learning_rate": 4.206271937564404e-06, + "loss": 0.2213, + "mean_token_accuracy": 0.9293996691703796, + "step": 6696 + }, + { + "epoch": 3.3485, + "grad_norm": 2.131958561183325, + "learning_rate": 4.205953006152098e-06, + "loss": 0.2707, + "mean_token_accuracy": 0.9125402569770813, + "step": 6697 + }, + { + "epoch": 3.349, + "grad_norm": 2.6903503958343595, + "learning_rate": 4.205634022773492e-06, + "loss": 0.1881, + "mean_token_accuracy": 0.9360995888710022, + "step": 6698 + }, + { + "epoch": 3.3495, + "grad_norm": 2.7929571930962247, + "learning_rate": 4.205314987438301e-06, + "loss": 0.2209, + "mean_token_accuracy": 0.9188986420631409, + "step": 6699 + }, + { + "epoch": 3.35, + "grad_norm": 2.360425586449158, + "learning_rate": 4.204995900156247e-06, + "loss": 0.2808, + "mean_token_accuracy": 0.9071672558784485, + "step": 6700 + }, + { + "epoch": 3.3505, + "grad_norm": 4.773527469663062, + "learning_rate": 4.2046767609370464e-06, + "loss": 0.1884, + "mean_token_accuracy": 0.935123860836029, + "step": 6701 + }, + { + "epoch": 3.351, + "grad_norm": 1.8792554134131634, + "learning_rate": 4.204357569790423e-06, + "loss": 0.1802, + "mean_token_accuracy": 0.9320326447486877, + "step": 6702 + }, + { + "epoch": 3.3515, + "grad_norm": 3.150236155259859, + "learning_rate": 4.204038326726099e-06, + "loss": 0.227, + "mean_token_accuracy": 0.9232755303382874, + "step": 6703 + }, + { + "epoch": 3.352, + "grad_norm": 3.35107952992143, + "learning_rate": 4.2037190317538e-06, + "loss": 0.2071, + "mean_token_accuracy": 0.9289239645004272, + "step": 6704 + }, + { + "epoch": 3.3525, + "grad_norm": 3.868162010403682, + "learning_rate": 4.20339968488325e-06, + "loss": 0.1878, + "mean_token_accuracy": 0.9335505962371826, + "step": 6705 + }, + { + "epoch": 3.3529999999999998, + "grad_norm": 2.569168765183603, + "learning_rate": 4.2030802861241804e-06, + "loss": 0.2719, + "mean_token_accuracy": 0.906276285648346, + "step": 6706 + }, + { + "epoch": 3.3535, + "grad_norm": 3.032181810483317, + "learning_rate": 4.202760835486317e-06, + "loss": 0.2207, + "mean_token_accuracy": 0.9207838177680969, + "step": 6707 + }, + { + "epoch": 3.354, + "grad_norm": 2.1756198985461186, + "learning_rate": 4.202441332979394e-06, + "loss": 0.1729, + "mean_token_accuracy": 0.9367548823356628, + "step": 6708 + }, + { + "epoch": 3.3545, + "grad_norm": 1.466431881028652, + "learning_rate": 4.202121778613142e-06, + "loss": 0.135, + "mean_token_accuracy": 0.9497063159942627, + "step": 6709 + }, + { + "epoch": 3.355, + "grad_norm": 1.8861032801610569, + "learning_rate": 4.201802172397295e-06, + "loss": 0.2192, + "mean_token_accuracy": 0.9267187714576721, + "step": 6710 + }, + { + "epoch": 3.3555, + "grad_norm": 1.8352877857896555, + "learning_rate": 4.201482514341589e-06, + "loss": 0.2267, + "mean_token_accuracy": 0.928364098072052, + "step": 6711 + }, + { + "epoch": 3.356, + "grad_norm": 3.6729035968936765, + "learning_rate": 4.201162804455764e-06, + "loss": 0.2022, + "mean_token_accuracy": 0.927912175655365, + "step": 6712 + }, + { + "epoch": 3.3565, + "grad_norm": 1.6889949644677285, + "learning_rate": 4.200843042749555e-06, + "loss": 0.2042, + "mean_token_accuracy": 0.9276779294013977, + "step": 6713 + }, + { + "epoch": 3.357, + "grad_norm": 1.9472192578233722, + "learning_rate": 4.200523229232705e-06, + "loss": 0.2511, + "mean_token_accuracy": 0.9170073866844177, + "step": 6714 + }, + { + "epoch": 3.3575, + "grad_norm": 2.0752304562679607, + "learning_rate": 4.2002033639149545e-06, + "loss": 0.224, + "mean_token_accuracy": 0.918767511844635, + "step": 6715 + }, + { + "epoch": 3.358, + "grad_norm": 2.696128450823254, + "learning_rate": 4.199883446806048e-06, + "loss": 0.1669, + "mean_token_accuracy": 0.94140625, + "step": 6716 + }, + { + "epoch": 3.3585, + "grad_norm": 2.5207183338376042, + "learning_rate": 4.1995634779157315e-06, + "loss": 0.2201, + "mean_token_accuracy": 0.9182872176170349, + "step": 6717 + }, + { + "epoch": 3.359, + "grad_norm": 2.7031404769293057, + "learning_rate": 4.199243457253751e-06, + "loss": 0.2014, + "mean_token_accuracy": 0.9287511706352234, + "step": 6718 + }, + { + "epoch": 3.3595, + "grad_norm": 1.8668279467073228, + "learning_rate": 4.198923384829854e-06, + "loss": 0.1647, + "mean_token_accuracy": 0.9433106780052185, + "step": 6719 + }, + { + "epoch": 3.36, + "grad_norm": 2.4861832957095493, + "learning_rate": 4.198603260653792e-06, + "loss": 0.2238, + "mean_token_accuracy": 0.9196372032165527, + "step": 6720 + }, + { + "epoch": 3.3605, + "grad_norm": 2.0704469768791314, + "learning_rate": 4.198283084735315e-06, + "loss": 0.1889, + "mean_token_accuracy": 0.9320908188819885, + "step": 6721 + }, + { + "epoch": 3.3609999999999998, + "grad_norm": 1.728532932362699, + "learning_rate": 4.197962857084178e-06, + "loss": 0.2479, + "mean_token_accuracy": 0.9210156798362732, + "step": 6722 + }, + { + "epoch": 3.3615, + "grad_norm": 2.0123044203421165, + "learning_rate": 4.197642577710135e-06, + "loss": 0.2387, + "mean_token_accuracy": 0.9132301211357117, + "step": 6723 + }, + { + "epoch": 3.362, + "grad_norm": 2.035082523585989, + "learning_rate": 4.197322246622941e-06, + "loss": 0.1441, + "mean_token_accuracy": 0.9490106701850891, + "step": 6724 + }, + { + "epoch": 3.3625, + "grad_norm": 2.0902042978355393, + "learning_rate": 4.197001863832355e-06, + "loss": 0.1823, + "mean_token_accuracy": 0.9314378499984741, + "step": 6725 + }, + { + "epoch": 3.363, + "grad_norm": 2.3635881543346753, + "learning_rate": 4.196681429348136e-06, + "loss": 0.1646, + "mean_token_accuracy": 0.9356087446212769, + "step": 6726 + }, + { + "epoch": 3.3635, + "grad_norm": 2.0430350299478373, + "learning_rate": 4.196360943180046e-06, + "loss": 0.2726, + "mean_token_accuracy": 0.9043269157409668, + "step": 6727 + }, + { + "epoch": 3.364, + "grad_norm": 1.9001511197287158, + "learning_rate": 4.196040405337846e-06, + "loss": 0.2188, + "mean_token_accuracy": 0.9303819537162781, + "step": 6728 + }, + { + "epoch": 3.3645, + "grad_norm": 2.8034199007191334, + "learning_rate": 4.195719815831301e-06, + "loss": 0.206, + "mean_token_accuracy": 0.9205729365348816, + "step": 6729 + }, + { + "epoch": 3.365, + "grad_norm": 2.783933973620266, + "learning_rate": 4.195399174670177e-06, + "loss": 0.1583, + "mean_token_accuracy": 0.9466928243637085, + "step": 6730 + }, + { + "epoch": 3.3655, + "grad_norm": 1.9546868642071782, + "learning_rate": 4.195078481864241e-06, + "loss": 0.1862, + "mean_token_accuracy": 0.9344753623008728, + "step": 6731 + }, + { + "epoch": 3.366, + "grad_norm": 5.814973202191592, + "learning_rate": 4.194757737423261e-06, + "loss": 0.1868, + "mean_token_accuracy": 0.9409556984901428, + "step": 6732 + }, + { + "epoch": 3.3665, + "grad_norm": 2.7619782879962447, + "learning_rate": 4.194436941357009e-06, + "loss": 0.2318, + "mean_token_accuracy": 0.9253281950950623, + "step": 6733 + }, + { + "epoch": 3.367, + "grad_norm": 1.551866980504493, + "learning_rate": 4.194116093675256e-06, + "loss": 0.1602, + "mean_token_accuracy": 0.9411983489990234, + "step": 6734 + }, + { + "epoch": 3.3675, + "grad_norm": 7.171832578807811, + "learning_rate": 4.193795194387776e-06, + "loss": 0.2233, + "mean_token_accuracy": 0.9199284911155701, + "step": 6735 + }, + { + "epoch": 3.368, + "grad_norm": 1.7645931124125522, + "learning_rate": 4.193474243504343e-06, + "loss": 0.1989, + "mean_token_accuracy": 0.9318894147872925, + "step": 6736 + }, + { + "epoch": 3.3685, + "grad_norm": 2.3712733922452913, + "learning_rate": 4.193153241034736e-06, + "loss": 0.2289, + "mean_token_accuracy": 0.9226776361465454, + "step": 6737 + }, + { + "epoch": 3.3689999999999998, + "grad_norm": 2.1770705617415183, + "learning_rate": 4.192832186988731e-06, + "loss": 0.211, + "mean_token_accuracy": 0.9261812567710876, + "step": 6738 + }, + { + "epoch": 3.3695, + "grad_norm": 3.324275985116284, + "learning_rate": 4.19251108137611e-06, + "loss": 0.236, + "mean_token_accuracy": 0.9210367798805237, + "step": 6739 + }, + { + "epoch": 3.37, + "grad_norm": 2.1604623296654264, + "learning_rate": 4.192189924206652e-06, + "loss": 0.2124, + "mean_token_accuracy": 0.9228324294090271, + "step": 6740 + }, + { + "epoch": 3.3705, + "grad_norm": 1.7825856327600997, + "learning_rate": 4.191868715490141e-06, + "loss": 0.1726, + "mean_token_accuracy": 0.9351634383201599, + "step": 6741 + }, + { + "epoch": 3.371, + "grad_norm": 2.1555210283393835, + "learning_rate": 4.191547455236364e-06, + "loss": 0.2012, + "mean_token_accuracy": 0.925363302230835, + "step": 6742 + }, + { + "epoch": 3.3715, + "grad_norm": 9.992676502991653, + "learning_rate": 4.1912261434551035e-06, + "loss": 0.1837, + "mean_token_accuracy": 0.9366351962089539, + "step": 6743 + }, + { + "epoch": 3.372, + "grad_norm": 2.219839755190152, + "learning_rate": 4.190904780156149e-06, + "loss": 0.1863, + "mean_token_accuracy": 0.9293802976608276, + "step": 6744 + }, + { + "epoch": 3.3725, + "grad_norm": 2.403856072877644, + "learning_rate": 4.190583365349289e-06, + "loss": 0.1849, + "mean_token_accuracy": 0.9358533620834351, + "step": 6745 + }, + { + "epoch": 3.373, + "grad_norm": 2.1480722142318744, + "learning_rate": 4.190261899044315e-06, + "loss": 0.1718, + "mean_token_accuracy": 0.9354966878890991, + "step": 6746 + }, + { + "epoch": 3.3735, + "grad_norm": 2.4543558632280797, + "learning_rate": 4.18994038125102e-06, + "loss": 0.1791, + "mean_token_accuracy": 0.9397053718566895, + "step": 6747 + }, + { + "epoch": 3.374, + "grad_norm": 1.8366135513700879, + "learning_rate": 4.189618811979197e-06, + "loss": 0.1911, + "mean_token_accuracy": 0.9287676215171814, + "step": 6748 + }, + { + "epoch": 3.3745, + "grad_norm": 12.629146885828945, + "learning_rate": 4.189297191238642e-06, + "loss": 0.1836, + "mean_token_accuracy": 0.9380174279212952, + "step": 6749 + }, + { + "epoch": 3.375, + "grad_norm": 2.5197539057745777, + "learning_rate": 4.188975519039151e-06, + "loss": 0.2117, + "mean_token_accuracy": 0.9294762015342712, + "step": 6750 + }, + { + "epoch": 3.3755, + "grad_norm": 2.307446575804522, + "learning_rate": 4.188653795390524e-06, + "loss": 0.1947, + "mean_token_accuracy": 0.9302917718887329, + "step": 6751 + }, + { + "epoch": 3.376, + "grad_norm": 3.2908743623597676, + "learning_rate": 4.188332020302561e-06, + "loss": 0.2365, + "mean_token_accuracy": 0.9147355556488037, + "step": 6752 + }, + { + "epoch": 3.3765, + "grad_norm": 3.4640741246583007, + "learning_rate": 4.1880101937850644e-06, + "loss": 0.2128, + "mean_token_accuracy": 0.9228022694587708, + "step": 6753 + }, + { + "epoch": 3.377, + "grad_norm": 2.6260773335710414, + "learning_rate": 4.187688315847837e-06, + "loss": 0.2509, + "mean_token_accuracy": 0.9180490970611572, + "step": 6754 + }, + { + "epoch": 3.3775, + "grad_norm": 3.147415778767868, + "learning_rate": 4.1873663865006835e-06, + "loss": 0.15, + "mean_token_accuracy": 0.9463986754417419, + "step": 6755 + }, + { + "epoch": 3.378, + "grad_norm": 2.6642989190846644, + "learning_rate": 4.1870444057534095e-06, + "loss": 0.1537, + "mean_token_accuracy": 0.9440783858299255, + "step": 6756 + }, + { + "epoch": 3.3785, + "grad_norm": 2.2060750916778638, + "learning_rate": 4.186722373615825e-06, + "loss": 0.2071, + "mean_token_accuracy": 0.9235106110572815, + "step": 6757 + }, + { + "epoch": 3.379, + "grad_norm": 4.030834586839121, + "learning_rate": 4.186400290097739e-06, + "loss": 0.2484, + "mean_token_accuracy": 0.922208309173584, + "step": 6758 + }, + { + "epoch": 3.3795, + "grad_norm": 3.893757332253416, + "learning_rate": 4.186078155208962e-06, + "loss": 0.2645, + "mean_token_accuracy": 0.9249451160430908, + "step": 6759 + }, + { + "epoch": 3.38, + "grad_norm": 2.147301088273034, + "learning_rate": 4.185755968959308e-06, + "loss": 0.1938, + "mean_token_accuracy": 0.932813823223114, + "step": 6760 + }, + { + "epoch": 3.3805, + "grad_norm": 1.9048086401472897, + "learning_rate": 4.185433731358592e-06, + "loss": 0.1986, + "mean_token_accuracy": 0.9321454167366028, + "step": 6761 + }, + { + "epoch": 3.3810000000000002, + "grad_norm": 4.0911915552680895, + "learning_rate": 4.185111442416627e-06, + "loss": 0.1994, + "mean_token_accuracy": 0.9298210144042969, + "step": 6762 + }, + { + "epoch": 3.3815, + "grad_norm": 1.8386852237910256, + "learning_rate": 4.184789102143233e-06, + "loss": 0.1886, + "mean_token_accuracy": 0.9328816533088684, + "step": 6763 + }, + { + "epoch": 3.382, + "grad_norm": 3.374969481523938, + "learning_rate": 4.184466710548227e-06, + "loss": 0.2121, + "mean_token_accuracy": 0.9307582974433899, + "step": 6764 + }, + { + "epoch": 3.3825, + "grad_norm": 2.271409705532296, + "learning_rate": 4.184144267641433e-06, + "loss": 0.1662, + "mean_token_accuracy": 0.9337117671966553, + "step": 6765 + }, + { + "epoch": 3.383, + "grad_norm": 3.6528006881537776, + "learning_rate": 4.183821773432669e-06, + "loss": 0.2181, + "mean_token_accuracy": 0.9245896339416504, + "step": 6766 + }, + { + "epoch": 3.3835, + "grad_norm": 2.427973800973918, + "learning_rate": 4.183499227931761e-06, + "loss": 0.2195, + "mean_token_accuracy": 0.9205624461174011, + "step": 6767 + }, + { + "epoch": 3.384, + "grad_norm": 2.9430508286464883, + "learning_rate": 4.1831766311485345e-06, + "loss": 0.1961, + "mean_token_accuracy": 0.9313235878944397, + "step": 6768 + }, + { + "epoch": 3.3845, + "grad_norm": 3.0548561466607356, + "learning_rate": 4.182853983092816e-06, + "loss": 0.2216, + "mean_token_accuracy": 0.9281426668167114, + "step": 6769 + }, + { + "epoch": 3.385, + "grad_norm": 2.836572204246741, + "learning_rate": 4.182531283774434e-06, + "loss": 0.2101, + "mean_token_accuracy": 0.9246141314506531, + "step": 6770 + }, + { + "epoch": 3.3855, + "grad_norm": 2.1512983065132834, + "learning_rate": 4.182208533203218e-06, + "loss": 0.1698, + "mean_token_accuracy": 0.9360923171043396, + "step": 6771 + }, + { + "epoch": 3.386, + "grad_norm": 2.96263363857334, + "learning_rate": 4.181885731389e-06, + "loss": 0.2283, + "mean_token_accuracy": 0.9279475808143616, + "step": 6772 + }, + { + "epoch": 3.3865, + "grad_norm": 1.9963650187061213, + "learning_rate": 4.181562878341612e-06, + "loss": 0.2339, + "mean_token_accuracy": 0.9138582944869995, + "step": 6773 + }, + { + "epoch": 3.387, + "grad_norm": 2.8409142652470414, + "learning_rate": 4.18123997407089e-06, + "loss": 0.1976, + "mean_token_accuracy": 0.9283917546272278, + "step": 6774 + }, + { + "epoch": 3.3875, + "grad_norm": 1.6584502791464866, + "learning_rate": 4.18091701858667e-06, + "loss": 0.1782, + "mean_token_accuracy": 0.9276467561721802, + "step": 6775 + }, + { + "epoch": 3.388, + "grad_norm": 1.700551275594588, + "learning_rate": 4.180594011898791e-06, + "loss": 0.2201, + "mean_token_accuracy": 0.9316306710243225, + "step": 6776 + }, + { + "epoch": 3.3885, + "grad_norm": 15.888183097499233, + "learning_rate": 4.18027095401709e-06, + "loss": 0.1983, + "mean_token_accuracy": 0.9232621192932129, + "step": 6777 + }, + { + "epoch": 3.3890000000000002, + "grad_norm": 2.1217165543334304, + "learning_rate": 4.179947844951408e-06, + "loss": 0.2272, + "mean_token_accuracy": 0.9189232587814331, + "step": 6778 + }, + { + "epoch": 3.3895, + "grad_norm": 1.978183664705364, + "learning_rate": 4.179624684711588e-06, + "loss": 0.2086, + "mean_token_accuracy": 0.9277572631835938, + "step": 6779 + }, + { + "epoch": 3.39, + "grad_norm": 1.6238768006951037, + "learning_rate": 4.179301473307476e-06, + "loss": 0.2166, + "mean_token_accuracy": 0.9236459732055664, + "step": 6780 + }, + { + "epoch": 3.3905, + "grad_norm": 2.4828914571145737, + "learning_rate": 4.178978210748915e-06, + "loss": 0.222, + "mean_token_accuracy": 0.9253956079483032, + "step": 6781 + }, + { + "epoch": 3.391, + "grad_norm": 3.7852078208294753, + "learning_rate": 4.178654897045754e-06, + "loss": 0.1596, + "mean_token_accuracy": 0.9409759640693665, + "step": 6782 + }, + { + "epoch": 3.3915, + "grad_norm": 2.0344573866286746, + "learning_rate": 4.17833153220784e-06, + "loss": 0.2163, + "mean_token_accuracy": 0.9276657104492188, + "step": 6783 + }, + { + "epoch": 3.392, + "grad_norm": 2.211879626869519, + "learning_rate": 4.178008116245024e-06, + "loss": 0.1889, + "mean_token_accuracy": 0.9423860907554626, + "step": 6784 + }, + { + "epoch": 3.3925, + "grad_norm": 3.2471064844734396, + "learning_rate": 4.177684649167158e-06, + "loss": 0.2151, + "mean_token_accuracy": 0.9277796149253845, + "step": 6785 + }, + { + "epoch": 3.393, + "grad_norm": 3.820321066602617, + "learning_rate": 4.177361130984095e-06, + "loss": 0.1896, + "mean_token_accuracy": 0.9297006130218506, + "step": 6786 + }, + { + "epoch": 3.3935, + "grad_norm": 2.03127641282385, + "learning_rate": 4.1770375617056904e-06, + "loss": 0.1901, + "mean_token_accuracy": 0.9336493015289307, + "step": 6787 + }, + { + "epoch": 3.394, + "grad_norm": 3.952359775483153, + "learning_rate": 4.1767139413418005e-06, + "loss": 0.2461, + "mean_token_accuracy": 0.918996274471283, + "step": 6788 + }, + { + "epoch": 3.3945, + "grad_norm": 2.5948881646844275, + "learning_rate": 4.176390269902283e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.8939895629882812, + "step": 6789 + }, + { + "epoch": 3.395, + "grad_norm": 2.154056172334202, + "learning_rate": 4.176066547396998e-06, + "loss": 0.2087, + "mean_token_accuracy": 0.924663245677948, + "step": 6790 + }, + { + "epoch": 3.3955, + "grad_norm": 6.973680072337209, + "learning_rate": 4.175742773835807e-06, + "loss": 0.1546, + "mean_token_accuracy": 0.9435619711875916, + "step": 6791 + }, + { + "epoch": 3.396, + "grad_norm": 6.205659114138281, + "learning_rate": 4.175418949228571e-06, + "loss": 0.1612, + "mean_token_accuracy": 0.9463539123535156, + "step": 6792 + }, + { + "epoch": 3.3965, + "grad_norm": 2.302201337952427, + "learning_rate": 4.175095073585156e-06, + "loss": 0.2582, + "mean_token_accuracy": 0.9141720533370972, + "step": 6793 + }, + { + "epoch": 3.3970000000000002, + "grad_norm": 2.260837893862902, + "learning_rate": 4.174771146915427e-06, + "loss": 0.2249, + "mean_token_accuracy": 0.9208341836929321, + "step": 6794 + }, + { + "epoch": 3.3975, + "grad_norm": 5.303921023891111, + "learning_rate": 4.174447169229252e-06, + "loss": 0.1753, + "mean_token_accuracy": 0.9335120916366577, + "step": 6795 + }, + { + "epoch": 3.398, + "grad_norm": 5.30188756812445, + "learning_rate": 4.174123140536499e-06, + "loss": 0.1878, + "mean_token_accuracy": 0.9316189289093018, + "step": 6796 + }, + { + "epoch": 3.3985, + "grad_norm": 11.03640402125717, + "learning_rate": 4.173799060847039e-06, + "loss": 0.2306, + "mean_token_accuracy": 0.9113752245903015, + "step": 6797 + }, + { + "epoch": 3.399, + "grad_norm": 6.013533357470501, + "learning_rate": 4.173474930170744e-06, + "loss": 0.1605, + "mean_token_accuracy": 0.9439234137535095, + "step": 6798 + }, + { + "epoch": 3.3995, + "grad_norm": 4.806041096393325, + "learning_rate": 4.173150748517489e-06, + "loss": 0.166, + "mean_token_accuracy": 0.9369868636131287, + "step": 6799 + }, + { + "epoch": 3.4, + "grad_norm": 4.518879602128738, + "learning_rate": 4.172826515897146e-06, + "loss": 0.1938, + "mean_token_accuracy": 0.9334568977355957, + "step": 6800 + }, + { + "epoch": 3.4005, + "grad_norm": 3.238270817689568, + "learning_rate": 4.172502232319594e-06, + "loss": 0.2115, + "mean_token_accuracy": 0.935529887676239, + "step": 6801 + }, + { + "epoch": 3.401, + "grad_norm": 2.3525189617368674, + "learning_rate": 4.17217789779471e-06, + "loss": 0.2346, + "mean_token_accuracy": 0.9231953620910645, + "step": 6802 + }, + { + "epoch": 3.4015, + "grad_norm": 2.1101609904800687, + "learning_rate": 4.1718535123323755e-06, + "loss": 0.2038, + "mean_token_accuracy": 0.9280550479888916, + "step": 6803 + }, + { + "epoch": 3.402, + "grad_norm": 2.617092678321853, + "learning_rate": 4.171529075942471e-06, + "loss": 0.1727, + "mean_token_accuracy": 0.9380519986152649, + "step": 6804 + }, + { + "epoch": 3.4025, + "grad_norm": 2.7609708170347136, + "learning_rate": 4.171204588634878e-06, + "loss": 0.174, + "mean_token_accuracy": 0.9382052421569824, + "step": 6805 + }, + { + "epoch": 3.403, + "grad_norm": 3.8073987680371935, + "learning_rate": 4.170880050419483e-06, + "loss": 0.2224, + "mean_token_accuracy": 0.9236385822296143, + "step": 6806 + }, + { + "epoch": 3.4035, + "grad_norm": 9.887447873601433, + "learning_rate": 4.170555461306171e-06, + "loss": 0.2383, + "mean_token_accuracy": 0.9156302809715271, + "step": 6807 + }, + { + "epoch": 3.404, + "grad_norm": 7.198407245688048, + "learning_rate": 4.17023082130483e-06, + "loss": 0.1531, + "mean_token_accuracy": 0.9422725439071655, + "step": 6808 + }, + { + "epoch": 3.4045, + "grad_norm": 2.1489045579371293, + "learning_rate": 4.169906130425348e-06, + "loss": 0.1827, + "mean_token_accuracy": 0.9339439868927002, + "step": 6809 + }, + { + "epoch": 3.4050000000000002, + "grad_norm": 2.6324794547753485, + "learning_rate": 4.169581388677617e-06, + "loss": 0.2221, + "mean_token_accuracy": 0.9254658222198486, + "step": 6810 + }, + { + "epoch": 3.4055, + "grad_norm": 2.9374664118412053, + "learning_rate": 4.169256596071528e-06, + "loss": 0.223, + "mean_token_accuracy": 0.919316828250885, + "step": 6811 + }, + { + "epoch": 3.406, + "grad_norm": 2.579173849028562, + "learning_rate": 4.168931752616977e-06, + "loss": 0.1993, + "mean_token_accuracy": 0.9273488521575928, + "step": 6812 + }, + { + "epoch": 3.4065, + "grad_norm": 3.5508903293119083, + "learning_rate": 4.168606858323856e-06, + "loss": 0.2874, + "mean_token_accuracy": 0.91465824842453, + "step": 6813 + }, + { + "epoch": 3.407, + "grad_norm": 2.100439351256083, + "learning_rate": 4.168281913202064e-06, + "loss": 0.1767, + "mean_token_accuracy": 0.9330024719238281, + "step": 6814 + }, + { + "epoch": 3.4074999999999998, + "grad_norm": 3.5007451826551876, + "learning_rate": 4.1679569172614994e-06, + "loss": 0.1786, + "mean_token_accuracy": 0.9279874563217163, + "step": 6815 + }, + { + "epoch": 3.408, + "grad_norm": 3.1949717278341736, + "learning_rate": 4.167631870512061e-06, + "loss": 0.266, + "mean_token_accuracy": 0.9087610840797424, + "step": 6816 + }, + { + "epoch": 3.4085, + "grad_norm": 3.538272417834497, + "learning_rate": 4.167306772963652e-06, + "loss": 0.1914, + "mean_token_accuracy": 0.9303126931190491, + "step": 6817 + }, + { + "epoch": 3.409, + "grad_norm": 1.8645159124606343, + "learning_rate": 4.166981624626174e-06, + "loss": 0.2032, + "mean_token_accuracy": 0.9268678426742554, + "step": 6818 + }, + { + "epoch": 3.4095, + "grad_norm": 2.3284695563537863, + "learning_rate": 4.166656425509532e-06, + "loss": 0.1554, + "mean_token_accuracy": 0.9422857165336609, + "step": 6819 + }, + { + "epoch": 3.41, + "grad_norm": 4.23340536808411, + "learning_rate": 4.166331175623631e-06, + "loss": 0.2409, + "mean_token_accuracy": 0.9098614454269409, + "step": 6820 + }, + { + "epoch": 3.4105, + "grad_norm": 1.7636651462967934, + "learning_rate": 4.166005874978382e-06, + "loss": 0.235, + "mean_token_accuracy": 0.9200761318206787, + "step": 6821 + }, + { + "epoch": 3.411, + "grad_norm": 2.0652114648575486, + "learning_rate": 4.16568052358369e-06, + "loss": 0.2376, + "mean_token_accuracy": 0.9201902747154236, + "step": 6822 + }, + { + "epoch": 3.4115, + "grad_norm": 3.362955768303216, + "learning_rate": 4.165355121449469e-06, + "loss": 0.1893, + "mean_token_accuracy": 0.9373840689659119, + "step": 6823 + }, + { + "epoch": 3.412, + "grad_norm": 1.8328872758542765, + "learning_rate": 4.16502966858563e-06, + "loss": 0.1937, + "mean_token_accuracy": 0.9303781390190125, + "step": 6824 + }, + { + "epoch": 3.4125, + "grad_norm": 2.537223780138807, + "learning_rate": 4.164704165002086e-06, + "loss": 0.2119, + "mean_token_accuracy": 0.9279669523239136, + "step": 6825 + }, + { + "epoch": 3.413, + "grad_norm": 6.944778162258019, + "learning_rate": 4.1643786107087536e-06, + "loss": 0.1846, + "mean_token_accuracy": 0.9400196671485901, + "step": 6826 + }, + { + "epoch": 3.4135, + "grad_norm": 2.069609150774946, + "learning_rate": 4.164053005715551e-06, + "loss": 0.2422, + "mean_token_accuracy": 0.9208248257637024, + "step": 6827 + }, + { + "epoch": 3.414, + "grad_norm": 2.4614611475589676, + "learning_rate": 4.163727350032394e-06, + "loss": 0.185, + "mean_token_accuracy": 0.931585431098938, + "step": 6828 + }, + { + "epoch": 3.4145, + "grad_norm": 2.5935685038176337, + "learning_rate": 4.163401643669203e-06, + "loss": 0.2132, + "mean_token_accuracy": 0.9308362007141113, + "step": 6829 + }, + { + "epoch": 3.415, + "grad_norm": 3.2854260298663553, + "learning_rate": 4.163075886635902e-06, + "loss": 0.1672, + "mean_token_accuracy": 0.9516331553459167, + "step": 6830 + }, + { + "epoch": 3.4154999999999998, + "grad_norm": 2.7429457260654058, + "learning_rate": 4.162750078942413e-06, + "loss": 0.2318, + "mean_token_accuracy": 0.9166526794433594, + "step": 6831 + }, + { + "epoch": 3.416, + "grad_norm": 1.9267250045482824, + "learning_rate": 4.162424220598659e-06, + "loss": 0.1974, + "mean_token_accuracy": 0.9327861070632935, + "step": 6832 + }, + { + "epoch": 3.4165, + "grad_norm": 2.724764234177667, + "learning_rate": 4.162098311614567e-06, + "loss": 0.1336, + "mean_token_accuracy": 0.9522597193717957, + "step": 6833 + }, + { + "epoch": 3.417, + "grad_norm": 8.559465678093545, + "learning_rate": 4.161772352000067e-06, + "loss": 0.2282, + "mean_token_accuracy": 0.9153419137001038, + "step": 6834 + }, + { + "epoch": 3.4175, + "grad_norm": 3.1222479776105785, + "learning_rate": 4.161446341765085e-06, + "loss": 0.2078, + "mean_token_accuracy": 0.9253731369972229, + "step": 6835 + }, + { + "epoch": 3.418, + "grad_norm": 2.058737645308577, + "learning_rate": 4.161120280919555e-06, + "loss": 0.1749, + "mean_token_accuracy": 0.939393937587738, + "step": 6836 + }, + { + "epoch": 3.4185, + "grad_norm": 1.7789113558030827, + "learning_rate": 4.160794169473406e-06, + "loss": 0.1771, + "mean_token_accuracy": 0.9354128241539001, + "step": 6837 + }, + { + "epoch": 3.419, + "grad_norm": 2.2668931528957073, + "learning_rate": 4.160468007436574e-06, + "loss": 0.2293, + "mean_token_accuracy": 0.9222420454025269, + "step": 6838 + }, + { + "epoch": 3.4195, + "grad_norm": 2.1239845711169436, + "learning_rate": 4.1601417948189944e-06, + "loss": 0.1383, + "mean_token_accuracy": 0.956732988357544, + "step": 6839 + }, + { + "epoch": 3.42, + "grad_norm": 5.915324478961467, + "learning_rate": 4.159815531630604e-06, + "loss": 0.1772, + "mean_token_accuracy": 0.9375728368759155, + "step": 6840 + }, + { + "epoch": 3.4205, + "grad_norm": 4.788142845607019, + "learning_rate": 4.159489217881342e-06, + "loss": 0.1659, + "mean_token_accuracy": 0.9342560768127441, + "step": 6841 + }, + { + "epoch": 3.421, + "grad_norm": 3.775435773713931, + "learning_rate": 4.1591628535811465e-06, + "loss": 0.1729, + "mean_token_accuracy": 0.9414650201797485, + "step": 6842 + }, + { + "epoch": 3.4215, + "grad_norm": 2.02731668839354, + "learning_rate": 4.158836438739961e-06, + "loss": 0.2381, + "mean_token_accuracy": 0.9214116930961609, + "step": 6843 + }, + { + "epoch": 3.422, + "grad_norm": 1.5575830997627327, + "learning_rate": 4.158509973367728e-06, + "loss": 0.1368, + "mean_token_accuracy": 0.9583836197853088, + "step": 6844 + }, + { + "epoch": 3.4225, + "grad_norm": 2.5355839877812536, + "learning_rate": 4.158183457474392e-06, + "loss": 0.2462, + "mean_token_accuracy": 0.9218394160270691, + "step": 6845 + }, + { + "epoch": 3.423, + "grad_norm": 2.6658975058426386, + "learning_rate": 4.157856891069901e-06, + "loss": 0.1899, + "mean_token_accuracy": 0.9308434724807739, + "step": 6846 + }, + { + "epoch": 3.4234999999999998, + "grad_norm": 2.789333527264442, + "learning_rate": 4.157530274164199e-06, + "loss": 0.2062, + "mean_token_accuracy": 0.9309831857681274, + "step": 6847 + }, + { + "epoch": 3.424, + "grad_norm": 1.8538763628437576, + "learning_rate": 4.1572036067672386e-06, + "loss": 0.1778, + "mean_token_accuracy": 0.9327305555343628, + "step": 6848 + }, + { + "epoch": 3.4245, + "grad_norm": 2.38879742395652, + "learning_rate": 4.1568768888889695e-06, + "loss": 0.1993, + "mean_token_accuracy": 0.9295806884765625, + "step": 6849 + }, + { + "epoch": 3.425, + "grad_norm": 4.373273212257241, + "learning_rate": 4.1565501205393445e-06, + "loss": 0.1795, + "mean_token_accuracy": 0.938789427280426, + "step": 6850 + }, + { + "epoch": 3.4255, + "grad_norm": 2.3195363827670112, + "learning_rate": 4.156223301728317e-06, + "loss": 0.2163, + "mean_token_accuracy": 0.9333924055099487, + "step": 6851 + }, + { + "epoch": 3.426, + "grad_norm": 2.98000544021976, + "learning_rate": 4.155896432465843e-06, + "loss": 0.2216, + "mean_token_accuracy": 0.9218658208847046, + "step": 6852 + }, + { + "epoch": 3.4265, + "grad_norm": 1.7066110037280728, + "learning_rate": 4.155569512761879e-06, + "loss": 0.2099, + "mean_token_accuracy": 0.9308225512504578, + "step": 6853 + }, + { + "epoch": 3.427, + "grad_norm": 2.0160849995107166, + "learning_rate": 4.155242542626383e-06, + "loss": 0.2262, + "mean_token_accuracy": 0.9174912571907043, + "step": 6854 + }, + { + "epoch": 3.4275, + "grad_norm": 1.3652489840146236, + "learning_rate": 4.154915522069318e-06, + "loss": 0.1428, + "mean_token_accuracy": 0.9447636604309082, + "step": 6855 + }, + { + "epoch": 3.428, + "grad_norm": 3.204942330249165, + "learning_rate": 4.154588451100642e-06, + "loss": 0.1503, + "mean_token_accuracy": 0.944420337677002, + "step": 6856 + }, + { + "epoch": 3.4285, + "grad_norm": 19.205923977037887, + "learning_rate": 4.15426132973032e-06, + "loss": 0.1728, + "mean_token_accuracy": 0.9412989020347595, + "step": 6857 + }, + { + "epoch": 3.429, + "grad_norm": 2.5624546198777383, + "learning_rate": 4.153934157968316e-06, + "loss": 0.2041, + "mean_token_accuracy": 0.9315835237503052, + "step": 6858 + }, + { + "epoch": 3.4295, + "grad_norm": 2.629321732721725, + "learning_rate": 4.1536069358245965e-06, + "loss": 0.209, + "mean_token_accuracy": 0.9269097447395325, + "step": 6859 + }, + { + "epoch": 3.43, + "grad_norm": 2.551950856654009, + "learning_rate": 4.15327966330913e-06, + "loss": 0.2211, + "mean_token_accuracy": 0.9257063269615173, + "step": 6860 + }, + { + "epoch": 3.4305, + "grad_norm": 1.6487397143496247, + "learning_rate": 4.152952340431885e-06, + "loss": 0.1348, + "mean_token_accuracy": 0.9472969770431519, + "step": 6861 + }, + { + "epoch": 3.431, + "grad_norm": 1.8986339864843964, + "learning_rate": 4.152624967202832e-06, + "loss": 0.2215, + "mean_token_accuracy": 0.9226469397544861, + "step": 6862 + }, + { + "epoch": 3.4314999999999998, + "grad_norm": 1.8317693326595166, + "learning_rate": 4.152297543631944e-06, + "loss": 0.1792, + "mean_token_accuracy": 0.9382308721542358, + "step": 6863 + }, + { + "epoch": 3.432, + "grad_norm": 2.74743379822103, + "learning_rate": 4.1519700697291945e-06, + "loss": 0.1862, + "mean_token_accuracy": 0.9340181946754456, + "step": 6864 + }, + { + "epoch": 3.4325, + "grad_norm": 2.397748576662989, + "learning_rate": 4.15164254550456e-06, + "loss": 0.1963, + "mean_token_accuracy": 0.9371588230133057, + "step": 6865 + }, + { + "epoch": 3.433, + "grad_norm": 1.779096247220146, + "learning_rate": 4.151314970968016e-06, + "loss": 0.1846, + "mean_token_accuracy": 0.9295498728752136, + "step": 6866 + }, + { + "epoch": 3.4335, + "grad_norm": 2.1389308080982, + "learning_rate": 4.150987346129541e-06, + "loss": 0.1924, + "mean_token_accuracy": 0.9358034729957581, + "step": 6867 + }, + { + "epoch": 3.434, + "grad_norm": 2.190628668209264, + "learning_rate": 4.1506596709991155e-06, + "loss": 0.202, + "mean_token_accuracy": 0.9283208250999451, + "step": 6868 + }, + { + "epoch": 3.4345, + "grad_norm": 2.004518899994659, + "learning_rate": 4.150331945586722e-06, + "loss": 0.162, + "mean_token_accuracy": 0.9418625831604004, + "step": 6869 + }, + { + "epoch": 3.435, + "grad_norm": 1.9797501492949916, + "learning_rate": 4.150004169902343e-06, + "loss": 0.1975, + "mean_token_accuracy": 0.9294437170028687, + "step": 6870 + }, + { + "epoch": 3.4355, + "grad_norm": 1.9481873825364773, + "learning_rate": 4.149676343955961e-06, + "loss": 0.1939, + "mean_token_accuracy": 0.9299591779708862, + "step": 6871 + }, + { + "epoch": 3.436, + "grad_norm": 1.6998014681196127, + "learning_rate": 4.149348467757566e-06, + "loss": 0.1452, + "mean_token_accuracy": 0.9463616609573364, + "step": 6872 + }, + { + "epoch": 3.4365, + "grad_norm": 7.320390978359511, + "learning_rate": 4.149020541317142e-06, + "loss": 0.2167, + "mean_token_accuracy": 0.9215686321258545, + "step": 6873 + }, + { + "epoch": 3.437, + "grad_norm": 2.4488149439307305, + "learning_rate": 4.1486925646446805e-06, + "loss": 0.1879, + "mean_token_accuracy": 0.9411213397979736, + "step": 6874 + }, + { + "epoch": 3.4375, + "grad_norm": 4.120422757114303, + "learning_rate": 4.1483645377501726e-06, + "loss": 0.1838, + "mean_token_accuracy": 0.9320195317268372, + "step": 6875 + }, + { + "epoch": 3.438, + "grad_norm": 2.065575184885389, + "learning_rate": 4.148036460643608e-06, + "loss": 0.1681, + "mean_token_accuracy": 0.9464831948280334, + "step": 6876 + }, + { + "epoch": 3.4385, + "grad_norm": 5.543820272571146, + "learning_rate": 4.1477083333349835e-06, + "loss": 0.2764, + "mean_token_accuracy": 0.9052062034606934, + "step": 6877 + }, + { + "epoch": 3.439, + "grad_norm": 5.131769070253029, + "learning_rate": 4.147380155834293e-06, + "loss": 0.2347, + "mean_token_accuracy": 0.9262370467185974, + "step": 6878 + }, + { + "epoch": 3.4395, + "grad_norm": 1.7588153656245031, + "learning_rate": 4.147051928151532e-06, + "loss": 0.1978, + "mean_token_accuracy": 0.9304168820381165, + "step": 6879 + }, + { + "epoch": 3.44, + "grad_norm": 5.295812839157823, + "learning_rate": 4.146723650296701e-06, + "loss": 0.1793, + "mean_token_accuracy": 0.9276009798049927, + "step": 6880 + }, + { + "epoch": 3.4405, + "grad_norm": 3.4042677686071694, + "learning_rate": 4.1463953222798e-06, + "loss": 0.2187, + "mean_token_accuracy": 0.9254188537597656, + "step": 6881 + }, + { + "epoch": 3.441, + "grad_norm": 3.5054129912056204, + "learning_rate": 4.1460669441108295e-06, + "loss": 0.1551, + "mean_token_accuracy": 0.939983606338501, + "step": 6882 + }, + { + "epoch": 3.4415, + "grad_norm": 2.671809554100667, + "learning_rate": 4.1457385157997906e-06, + "loss": 0.2158, + "mean_token_accuracy": 0.9264993071556091, + "step": 6883 + }, + { + "epoch": 3.442, + "grad_norm": 3.1416796897810113, + "learning_rate": 4.1454100373566915e-06, + "loss": 0.1916, + "mean_token_accuracy": 0.9358127117156982, + "step": 6884 + }, + { + "epoch": 3.4425, + "grad_norm": 3.145949805304922, + "learning_rate": 4.145081508791536e-06, + "loss": 0.2193, + "mean_token_accuracy": 0.9195920825004578, + "step": 6885 + }, + { + "epoch": 3.443, + "grad_norm": 1.6644038503053773, + "learning_rate": 4.144752930114333e-06, + "loss": 0.1644, + "mean_token_accuracy": 0.9401097297668457, + "step": 6886 + }, + { + "epoch": 3.4435000000000002, + "grad_norm": 2.3119216435065435, + "learning_rate": 4.14442430133509e-06, + "loss": 0.22, + "mean_token_accuracy": 0.9265905618667603, + "step": 6887 + }, + { + "epoch": 3.444, + "grad_norm": 10.255996925077762, + "learning_rate": 4.1440956224638186e-06, + "loss": 0.228, + "mean_token_accuracy": 0.919468879699707, + "step": 6888 + }, + { + "epoch": 3.4445, + "grad_norm": 2.5619700416628794, + "learning_rate": 4.143766893510531e-06, + "loss": 0.1938, + "mean_token_accuracy": 0.928375244140625, + "step": 6889 + }, + { + "epoch": 3.445, + "grad_norm": 1.8130609728104052, + "learning_rate": 4.14343811448524e-06, + "loss": 0.2042, + "mean_token_accuracy": 0.9188376665115356, + "step": 6890 + }, + { + "epoch": 3.4455, + "grad_norm": 3.111475463821177, + "learning_rate": 4.143109285397961e-06, + "loss": 0.1837, + "mean_token_accuracy": 0.93956458568573, + "step": 6891 + }, + { + "epoch": 3.446, + "grad_norm": 3.21777560098944, + "learning_rate": 4.142780406258712e-06, + "loss": 0.1836, + "mean_token_accuracy": 0.9363143444061279, + "step": 6892 + }, + { + "epoch": 3.4465, + "grad_norm": 1.930991403045168, + "learning_rate": 4.142451477077509e-06, + "loss": 0.1688, + "mean_token_accuracy": 0.9395332932472229, + "step": 6893 + }, + { + "epoch": 3.447, + "grad_norm": 2.0790894959511688, + "learning_rate": 4.1421224978643746e-06, + "loss": 0.1925, + "mean_token_accuracy": 0.9392837285995483, + "step": 6894 + }, + { + "epoch": 3.4475, + "grad_norm": 2.0823213535661735, + "learning_rate": 4.141793468629327e-06, + "loss": 0.1969, + "mean_token_accuracy": 0.9280943274497986, + "step": 6895 + }, + { + "epoch": 3.448, + "grad_norm": 5.273087166745848, + "learning_rate": 4.141464389382392e-06, + "loss": 0.1849, + "mean_token_accuracy": 0.9329873919487, + "step": 6896 + }, + { + "epoch": 3.4485, + "grad_norm": 1.793579836926118, + "learning_rate": 4.141135260133591e-06, + "loss": 0.2129, + "mean_token_accuracy": 0.9265492558479309, + "step": 6897 + }, + { + "epoch": 3.449, + "grad_norm": 2.969720699808458, + "learning_rate": 4.140806080892952e-06, + "loss": 0.2198, + "mean_token_accuracy": 0.9231969714164734, + "step": 6898 + }, + { + "epoch": 3.4495, + "grad_norm": 2.9252466609400076, + "learning_rate": 4.1404768516705015e-06, + "loss": 0.1614, + "mean_token_accuracy": 0.9387837052345276, + "step": 6899 + }, + { + "epoch": 3.45, + "grad_norm": 2.089517858649422, + "learning_rate": 4.140147572476269e-06, + "loss": 0.1822, + "mean_token_accuracy": 0.9442484378814697, + "step": 6900 + }, + { + "epoch": 3.4505, + "grad_norm": 2.602812950997946, + "learning_rate": 4.1398182433202834e-06, + "loss": 0.1951, + "mean_token_accuracy": 0.9371196627616882, + "step": 6901 + }, + { + "epoch": 3.451, + "grad_norm": 2.241364992321226, + "learning_rate": 4.139488864212578e-06, + "loss": 0.2083, + "mean_token_accuracy": 0.9273024201393127, + "step": 6902 + }, + { + "epoch": 3.4515000000000002, + "grad_norm": 3.4988156293361166, + "learning_rate": 4.139159435163187e-06, + "loss": 0.2327, + "mean_token_accuracy": 0.9191402196884155, + "step": 6903 + }, + { + "epoch": 3.452, + "grad_norm": 2.23743856614898, + "learning_rate": 4.138829956182144e-06, + "loss": 0.2093, + "mean_token_accuracy": 0.9249294400215149, + "step": 6904 + }, + { + "epoch": 3.4525, + "grad_norm": 2.1160824138471694, + "learning_rate": 4.138500427279485e-06, + "loss": 0.1922, + "mean_token_accuracy": 0.9301483035087585, + "step": 6905 + }, + { + "epoch": 3.453, + "grad_norm": 3.6173937753451635, + "learning_rate": 4.1381708484652495e-06, + "loss": 0.151, + "mean_token_accuracy": 0.946480929851532, + "step": 6906 + }, + { + "epoch": 3.4535, + "grad_norm": 2.973239824024629, + "learning_rate": 4.137841219749476e-06, + "loss": 0.1944, + "mean_token_accuracy": 0.9378787875175476, + "step": 6907 + }, + { + "epoch": 3.454, + "grad_norm": 2.575440501041851, + "learning_rate": 4.137511541142207e-06, + "loss": 0.2725, + "mean_token_accuracy": 0.9084393382072449, + "step": 6908 + }, + { + "epoch": 3.4545, + "grad_norm": 2.199155657047275, + "learning_rate": 4.137181812653484e-06, + "loss": 0.1831, + "mean_token_accuracy": 0.935858964920044, + "step": 6909 + }, + { + "epoch": 3.455, + "grad_norm": 2.362295746902716, + "learning_rate": 4.136852034293349e-06, + "loss": 0.2304, + "mean_token_accuracy": 0.9210779666900635, + "step": 6910 + }, + { + "epoch": 3.4555, + "grad_norm": 2.507762437513183, + "learning_rate": 4.1365222060718525e-06, + "loss": 0.2216, + "mean_token_accuracy": 0.9208885431289673, + "step": 6911 + }, + { + "epoch": 3.456, + "grad_norm": 5.690818475818261, + "learning_rate": 4.136192327999037e-06, + "loss": 0.2519, + "mean_token_accuracy": 0.9105901122093201, + "step": 6912 + }, + { + "epoch": 3.4565, + "grad_norm": 3.374505472029517, + "learning_rate": 4.1358624000849545e-06, + "loss": 0.2382, + "mean_token_accuracy": 0.9181392192840576, + "step": 6913 + }, + { + "epoch": 3.457, + "grad_norm": 2.7128656429799354, + "learning_rate": 4.135532422339653e-06, + "loss": 0.1947, + "mean_token_accuracy": 0.9267297387123108, + "step": 6914 + }, + { + "epoch": 3.4575, + "grad_norm": 2.214786517975554, + "learning_rate": 4.135202394773186e-06, + "loss": 0.2498, + "mean_token_accuracy": 0.9194697737693787, + "step": 6915 + }, + { + "epoch": 3.458, + "grad_norm": 2.0142825871172123, + "learning_rate": 4.134872317395604e-06, + "loss": 0.2332, + "mean_token_accuracy": 0.9210368394851685, + "step": 6916 + }, + { + "epoch": 3.4585, + "grad_norm": 2.7711507999273546, + "learning_rate": 4.134542190216965e-06, + "loss": 0.2195, + "mean_token_accuracy": 0.9210579991340637, + "step": 6917 + }, + { + "epoch": 3.459, + "grad_norm": 4.519990680528103, + "learning_rate": 4.134212013247323e-06, + "loss": 0.2791, + "mean_token_accuracy": 0.9123450517654419, + "step": 6918 + }, + { + "epoch": 3.4595000000000002, + "grad_norm": 3.097311674859316, + "learning_rate": 4.133881786496736e-06, + "loss": 0.2481, + "mean_token_accuracy": 0.9136790037155151, + "step": 6919 + }, + { + "epoch": 3.46, + "grad_norm": 2.4229289471137365, + "learning_rate": 4.133551509975264e-06, + "loss": 0.1598, + "mean_token_accuracy": 0.9406471252441406, + "step": 6920 + }, + { + "epoch": 3.4605, + "grad_norm": 2.095534904598186, + "learning_rate": 4.133221183692968e-06, + "loss": 0.1805, + "mean_token_accuracy": 0.9315466284751892, + "step": 6921 + }, + { + "epoch": 3.461, + "grad_norm": 2.568558328840713, + "learning_rate": 4.13289080765991e-06, + "loss": 0.2459, + "mean_token_accuracy": 0.9215565323829651, + "step": 6922 + }, + { + "epoch": 3.4615, + "grad_norm": 2.872276938421618, + "learning_rate": 4.132560381886152e-06, + "loss": 0.2196, + "mean_token_accuracy": 0.9343085289001465, + "step": 6923 + }, + { + "epoch": 3.462, + "grad_norm": 1.9609803681955533, + "learning_rate": 4.132229906381763e-06, + "loss": 0.173, + "mean_token_accuracy": 0.9408766627311707, + "step": 6924 + }, + { + "epoch": 3.4625, + "grad_norm": 1.9153897289652542, + "learning_rate": 4.1318993811568065e-06, + "loss": 0.1465, + "mean_token_accuracy": 0.9471285343170166, + "step": 6925 + }, + { + "epoch": 3.463, + "grad_norm": 4.613143973015848, + "learning_rate": 4.131568806221353e-06, + "loss": 0.1947, + "mean_token_accuracy": 0.9329969882965088, + "step": 6926 + }, + { + "epoch": 3.4635, + "grad_norm": 5.849159792884834, + "learning_rate": 4.1312381815854716e-06, + "loss": 0.2508, + "mean_token_accuracy": 0.9139184951782227, + "step": 6927 + }, + { + "epoch": 3.464, + "grad_norm": 3.1358291842764996, + "learning_rate": 4.130907507259233e-06, + "loss": 0.2188, + "mean_token_accuracy": 0.9264759421348572, + "step": 6928 + }, + { + "epoch": 3.4645, + "grad_norm": 5.532375765418438, + "learning_rate": 4.130576783252712e-06, + "loss": 0.2149, + "mean_token_accuracy": 0.921950101852417, + "step": 6929 + }, + { + "epoch": 3.465, + "grad_norm": 3.394917216126427, + "learning_rate": 4.130246009575981e-06, + "loss": 0.2289, + "mean_token_accuracy": 0.9216650724411011, + "step": 6930 + }, + { + "epoch": 3.4655, + "grad_norm": 4.626852856155064, + "learning_rate": 4.129915186239117e-06, + "loss": 0.2114, + "mean_token_accuracy": 0.9238004088401794, + "step": 6931 + }, + { + "epoch": 3.466, + "grad_norm": 3.0630220779298396, + "learning_rate": 4.129584313252198e-06, + "loss": 0.1716, + "mean_token_accuracy": 0.9345065355300903, + "step": 6932 + }, + { + "epoch": 3.4665, + "grad_norm": 2.5608719286303594, + "learning_rate": 4.129253390625301e-06, + "loss": 0.1824, + "mean_token_accuracy": 0.9344534277915955, + "step": 6933 + }, + { + "epoch": 3.467, + "grad_norm": 1.767115594570818, + "learning_rate": 4.128922418368509e-06, + "loss": 0.171, + "mean_token_accuracy": 0.9406779408454895, + "step": 6934 + }, + { + "epoch": 3.4675000000000002, + "grad_norm": 2.54680006609053, + "learning_rate": 4.128591396491901e-06, + "loss": 0.1824, + "mean_token_accuracy": 0.9308804869651794, + "step": 6935 + }, + { + "epoch": 3.468, + "grad_norm": 9.35724721458118, + "learning_rate": 4.128260325005563e-06, + "loss": 0.1869, + "mean_token_accuracy": 0.9313432574272156, + "step": 6936 + }, + { + "epoch": 3.4685, + "grad_norm": 23.775221629693785, + "learning_rate": 4.12792920391958e-06, + "loss": 0.1572, + "mean_token_accuracy": 0.9448192119598389, + "step": 6937 + }, + { + "epoch": 3.469, + "grad_norm": 1.9565992327701087, + "learning_rate": 4.127598033244037e-06, + "loss": 0.1957, + "mean_token_accuracy": 0.9246813654899597, + "step": 6938 + }, + { + "epoch": 3.4695, + "grad_norm": 10.537636822750406, + "learning_rate": 4.127266812989023e-06, + "loss": 0.1919, + "mean_token_accuracy": 0.9295458793640137, + "step": 6939 + }, + { + "epoch": 3.4699999999999998, + "grad_norm": 2.681125737972528, + "learning_rate": 4.126935543164628e-06, + "loss": 0.2268, + "mean_token_accuracy": 0.9372736215591431, + "step": 6940 + }, + { + "epoch": 3.4705, + "grad_norm": 3.0790682701146146, + "learning_rate": 4.126604223780941e-06, + "loss": 0.2471, + "mean_token_accuracy": 0.9085357785224915, + "step": 6941 + }, + { + "epoch": 3.471, + "grad_norm": 1.5084000594294535, + "learning_rate": 4.126272854848058e-06, + "loss": 0.1208, + "mean_token_accuracy": 0.9505336880683899, + "step": 6942 + }, + { + "epoch": 3.4715, + "grad_norm": 1.774001819901844, + "learning_rate": 4.125941436376069e-06, + "loss": 0.1886, + "mean_token_accuracy": 0.9309490323066711, + "step": 6943 + }, + { + "epoch": 3.472, + "grad_norm": 2.5498692203376256, + "learning_rate": 4.125609968375073e-06, + "loss": 0.2133, + "mean_token_accuracy": 0.9228309392929077, + "step": 6944 + }, + { + "epoch": 3.4725, + "grad_norm": 4.368836640295485, + "learning_rate": 4.125278450855165e-06, + "loss": 0.2282, + "mean_token_accuracy": 0.9248120188713074, + "step": 6945 + }, + { + "epoch": 3.473, + "grad_norm": 2.3952712573055583, + "learning_rate": 4.124946883826444e-06, + "loss": 0.1907, + "mean_token_accuracy": 0.9380253553390503, + "step": 6946 + }, + { + "epoch": 3.4735, + "grad_norm": 2.154576938031222, + "learning_rate": 4.124615267299011e-06, + "loss": 0.1554, + "mean_token_accuracy": 0.9489110708236694, + "step": 6947 + }, + { + "epoch": 3.474, + "grad_norm": 2.3069556840910432, + "learning_rate": 4.124283601282967e-06, + "loss": 0.2145, + "mean_token_accuracy": 0.9248079061508179, + "step": 6948 + }, + { + "epoch": 3.4745, + "grad_norm": 3.633265430169193, + "learning_rate": 4.1239518857884145e-06, + "loss": 0.219, + "mean_token_accuracy": 0.9239921569824219, + "step": 6949 + }, + { + "epoch": 3.475, + "grad_norm": 1.6246518094607834, + "learning_rate": 4.123620120825459e-06, + "loss": 0.1559, + "mean_token_accuracy": 0.9453375935554504, + "step": 6950 + }, + { + "epoch": 3.4755, + "grad_norm": 2.836436270077171, + "learning_rate": 4.123288306404207e-06, + "loss": 0.2384, + "mean_token_accuracy": 0.9180692434310913, + "step": 6951 + }, + { + "epoch": 3.476, + "grad_norm": 5.013187817775431, + "learning_rate": 4.122956442534765e-06, + "loss": 0.1754, + "mean_token_accuracy": 0.93477863073349, + "step": 6952 + }, + { + "epoch": 3.4765, + "grad_norm": 3.1838567532134205, + "learning_rate": 4.122624529227244e-06, + "loss": 0.2291, + "mean_token_accuracy": 0.9233193397521973, + "step": 6953 + }, + { + "epoch": 3.477, + "grad_norm": 2.377369302901377, + "learning_rate": 4.1222925664917524e-06, + "loss": 0.2377, + "mean_token_accuracy": 0.9164543151855469, + "step": 6954 + }, + { + "epoch": 3.4775, + "grad_norm": 1.2353362326621509, + "learning_rate": 4.1219605543384036e-06, + "loss": 0.1311, + "mean_token_accuracy": 0.9548431634902954, + "step": 6955 + }, + { + "epoch": 3.4779999999999998, + "grad_norm": 4.3283382112224, + "learning_rate": 4.121628492777311e-06, + "loss": 0.2255, + "mean_token_accuracy": 0.925427258014679, + "step": 6956 + }, + { + "epoch": 3.4785, + "grad_norm": 2.5118492612278502, + "learning_rate": 4.121296381818589e-06, + "loss": 0.193, + "mean_token_accuracy": 0.9271761775016785, + "step": 6957 + }, + { + "epoch": 3.479, + "grad_norm": 1.6044574972004388, + "learning_rate": 4.120964221472355e-06, + "loss": 0.1995, + "mean_token_accuracy": 0.9294450879096985, + "step": 6958 + }, + { + "epoch": 3.4795, + "grad_norm": 1.9261702242871757, + "learning_rate": 4.120632011748729e-06, + "loss": 0.1445, + "mean_token_accuracy": 0.9490806460380554, + "step": 6959 + }, + { + "epoch": 3.48, + "grad_norm": 2.6459138853922495, + "learning_rate": 4.120299752657828e-06, + "loss": 0.2378, + "mean_token_accuracy": 0.9203554391860962, + "step": 6960 + }, + { + "epoch": 3.4805, + "grad_norm": 3.163603738699763, + "learning_rate": 4.119967444209774e-06, + "loss": 0.1996, + "mean_token_accuracy": 0.92899090051651, + "step": 6961 + }, + { + "epoch": 3.481, + "grad_norm": 1.8113915513193866, + "learning_rate": 4.1196350864146895e-06, + "loss": 0.1781, + "mean_token_accuracy": 0.9375790357589722, + "step": 6962 + }, + { + "epoch": 3.4815, + "grad_norm": 2.778626647055605, + "learning_rate": 4.1193026792826995e-06, + "loss": 0.2268, + "mean_token_accuracy": 0.924146831035614, + "step": 6963 + }, + { + "epoch": 3.482, + "grad_norm": 3.4959373945895313, + "learning_rate": 4.118970222823929e-06, + "loss": 0.1414, + "mean_token_accuracy": 0.9516776204109192, + "step": 6964 + }, + { + "epoch": 3.4825, + "grad_norm": 2.3623037193505185, + "learning_rate": 4.1186377170485055e-06, + "loss": 0.199, + "mean_token_accuracy": 0.9298534989356995, + "step": 6965 + }, + { + "epoch": 3.483, + "grad_norm": 2.1454271686869655, + "learning_rate": 4.118305161966557e-06, + "loss": 0.1549, + "mean_token_accuracy": 0.9422808289527893, + "step": 6966 + }, + { + "epoch": 3.4835, + "grad_norm": 3.6069320375111715, + "learning_rate": 4.117972557588216e-06, + "loss": 0.1996, + "mean_token_accuracy": 0.9324929118156433, + "step": 6967 + }, + { + "epoch": 3.484, + "grad_norm": 2.994250898793766, + "learning_rate": 4.117639903923611e-06, + "loss": 0.1954, + "mean_token_accuracy": 0.9237160086631775, + "step": 6968 + }, + { + "epoch": 3.4845, + "grad_norm": 3.277587195961184, + "learning_rate": 4.117307200982878e-06, + "loss": 0.2918, + "mean_token_accuracy": 0.9094030857086182, + "step": 6969 + }, + { + "epoch": 3.485, + "grad_norm": 3.6827934769483455, + "learning_rate": 4.11697444877615e-06, + "loss": 0.2388, + "mean_token_accuracy": 0.9200932383537292, + "step": 6970 + }, + { + "epoch": 3.4855, + "grad_norm": 1.7793605363488387, + "learning_rate": 4.116641647313563e-06, + "loss": 0.1704, + "mean_token_accuracy": 0.9354009032249451, + "step": 6971 + }, + { + "epoch": 3.4859999999999998, + "grad_norm": 2.6844467387474116, + "learning_rate": 4.116308796605256e-06, + "loss": 0.1812, + "mean_token_accuracy": 0.9412486553192139, + "step": 6972 + }, + { + "epoch": 3.4865, + "grad_norm": 3.2034927582090904, + "learning_rate": 4.1159758966613674e-06, + "loss": 0.2349, + "mean_token_accuracy": 0.9242902398109436, + "step": 6973 + }, + { + "epoch": 3.487, + "grad_norm": 1.8388938633572014, + "learning_rate": 4.115642947492038e-06, + "loss": 0.2639, + "mean_token_accuracy": 0.9160988926887512, + "step": 6974 + }, + { + "epoch": 3.4875, + "grad_norm": 3.0625429634284966, + "learning_rate": 4.11530994910741e-06, + "loss": 0.2073, + "mean_token_accuracy": 0.9240267276763916, + "step": 6975 + }, + { + "epoch": 3.488, + "grad_norm": 1.9160234543691215, + "learning_rate": 4.114976901517628e-06, + "loss": 0.2372, + "mean_token_accuracy": 0.9179182052612305, + "step": 6976 + }, + { + "epoch": 3.4885, + "grad_norm": 3.4951946358314987, + "learning_rate": 4.114643804732836e-06, + "loss": 0.1894, + "mean_token_accuracy": 0.9362225532531738, + "step": 6977 + }, + { + "epoch": 3.489, + "grad_norm": 4.177400195580469, + "learning_rate": 4.114310658763181e-06, + "loss": 0.2387, + "mean_token_accuracy": 0.9176797866821289, + "step": 6978 + }, + { + "epoch": 3.4895, + "grad_norm": 1.5384049865994738, + "learning_rate": 4.113977463618811e-06, + "loss": 0.1683, + "mean_token_accuracy": 0.9396927952766418, + "step": 6979 + }, + { + "epoch": 3.49, + "grad_norm": 2.4135667172413364, + "learning_rate": 4.113644219309877e-06, + "loss": 0.2309, + "mean_token_accuracy": 0.9178662300109863, + "step": 6980 + }, + { + "epoch": 3.4905, + "grad_norm": 2.2390922117372885, + "learning_rate": 4.113310925846529e-06, + "loss": 0.2178, + "mean_token_accuracy": 0.9213025569915771, + "step": 6981 + }, + { + "epoch": 3.491, + "grad_norm": 4.82611163179791, + "learning_rate": 4.11297758323892e-06, + "loss": 0.2766, + "mean_token_accuracy": 0.9088171124458313, + "step": 6982 + }, + { + "epoch": 3.4915, + "grad_norm": 3.20338541569749, + "learning_rate": 4.112644191497203e-06, + "loss": 0.1645, + "mean_token_accuracy": 0.9388720393180847, + "step": 6983 + }, + { + "epoch": 3.492, + "grad_norm": 2.898833097913804, + "learning_rate": 4.1123107506315366e-06, + "loss": 0.204, + "mean_token_accuracy": 0.928732693195343, + "step": 6984 + }, + { + "epoch": 3.4925, + "grad_norm": 6.813224138557757, + "learning_rate": 4.1119772606520755e-06, + "loss": 0.2467, + "mean_token_accuracy": 0.9154908657073975, + "step": 6985 + }, + { + "epoch": 3.493, + "grad_norm": 3.15879258323708, + "learning_rate": 4.1116437215689785e-06, + "loss": 0.1569, + "mean_token_accuracy": 0.9449421167373657, + "step": 6986 + }, + { + "epoch": 3.4935, + "grad_norm": 2.031712201165046, + "learning_rate": 4.111310133392407e-06, + "loss": 0.1851, + "mean_token_accuracy": 0.9320124387741089, + "step": 6987 + }, + { + "epoch": 3.4939999999999998, + "grad_norm": 2.8849248237776832, + "learning_rate": 4.110976496132523e-06, + "loss": 0.2357, + "mean_token_accuracy": 0.9205992221832275, + "step": 6988 + }, + { + "epoch": 3.4945, + "grad_norm": 1.817915588777728, + "learning_rate": 4.110642809799488e-06, + "loss": 0.1677, + "mean_token_accuracy": 0.939492404460907, + "step": 6989 + }, + { + "epoch": 3.495, + "grad_norm": 1.5545222587788678, + "learning_rate": 4.110309074403467e-06, + "loss": 0.1643, + "mean_token_accuracy": 0.9375097751617432, + "step": 6990 + }, + { + "epoch": 3.4955, + "grad_norm": 2.2299862879511325, + "learning_rate": 4.1099752899546265e-06, + "loss": 0.1221, + "mean_token_accuracy": 0.9583135843276978, + "step": 6991 + }, + { + "epoch": 3.496, + "grad_norm": 2.1020661378630603, + "learning_rate": 4.109641456463135e-06, + "loss": 0.2039, + "mean_token_accuracy": 0.9317371249198914, + "step": 6992 + }, + { + "epoch": 3.4965, + "grad_norm": 6.751647975257786, + "learning_rate": 4.1093075739391605e-06, + "loss": 0.2351, + "mean_token_accuracy": 0.9241276979446411, + "step": 6993 + }, + { + "epoch": 3.497, + "grad_norm": 2.4202352575373256, + "learning_rate": 4.108973642392874e-06, + "loss": 0.1557, + "mean_token_accuracy": 0.9446576237678528, + "step": 6994 + }, + { + "epoch": 3.4975, + "grad_norm": 2.1515662993437332, + "learning_rate": 4.1086396618344474e-06, + "loss": 0.1855, + "mean_token_accuracy": 0.935275673866272, + "step": 6995 + }, + { + "epoch": 3.498, + "grad_norm": 2.149923475516028, + "learning_rate": 4.108305632274055e-06, + "loss": 0.2132, + "mean_token_accuracy": 0.9221417307853699, + "step": 6996 + }, + { + "epoch": 3.4985, + "grad_norm": 4.23047896137184, + "learning_rate": 4.107971553721872e-06, + "loss": 0.1651, + "mean_token_accuracy": 0.9339733123779297, + "step": 6997 + }, + { + "epoch": 3.499, + "grad_norm": 6.418228308460095, + "learning_rate": 4.107637426188074e-06, + "loss": 0.17, + "mean_token_accuracy": 0.9437118768692017, + "step": 6998 + }, + { + "epoch": 3.4995, + "grad_norm": 2.434884767392798, + "learning_rate": 4.1073032496828406e-06, + "loss": 0.2051, + "mean_token_accuracy": 0.931244969367981, + "step": 6999 + }, + { + "epoch": 3.5, + "grad_norm": 1.9053023849619075, + "learning_rate": 4.106969024216348e-06, + "loss": 0.1985, + "mean_token_accuracy": 0.9310483336448669, + "step": 7000 + } + ], + "logging_steps": 1, + "max_steps": 20000, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 589731497590784.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}