diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7821 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 1000, + "global_step": 100000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "grad_norm": 3.8100409507751465, + "learning_rate": 5.9999999999999995e-05, + "loss": 1.816, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 1.0648651123046875, + "learning_rate": 0.00011999999999999999, + "loss": 1.5968, + "step": 200 + }, + { + "epoch": 0.01, + "grad_norm": 1.9017549753189087, + "learning_rate": 0.00017999999999999998, + "loss": 1.56, + "step": 300 + }, + { + "epoch": 0.02, + "grad_norm": 1.5334885120391846, + "learning_rate": 0.00023999999999999998, + "loss": 1.587, + "step": 400 + }, + { + "epoch": 0.03, + "grad_norm": 1.3036648035049438, + "learning_rate": 0.0003, + "loss": 1.6182, + "step": 500 + }, + { + "epoch": 0.03, + "grad_norm": 1.709660530090332, + "learning_rate": 0.00029969849246231153, + "loss": 1.6102, + "step": 600 + }, + { + "epoch": 0.04, + "grad_norm": 1.5684775114059448, + "learning_rate": 0.0002993969849246231, + "loss": 1.6094, + "step": 700 + }, + { + "epoch": 0.04, + "grad_norm": 3.3330438137054443, + "learning_rate": 0.00029909547738693465, + "loss": 1.6118, + "step": 800 + }, + { + "epoch": 0.04, + "grad_norm": 2.1563549041748047, + "learning_rate": 0.0002987939698492462, + "loss": 1.6596, + "step": 900 + }, + { + "epoch": 0.05, + "grad_norm": 2.4043567180633545, + "learning_rate": 0.00029849547738693464, + "loss": 1.6071, + "step": 1000 + }, + { + "epoch": 0.05, + "eval_loss": 1.585342288017273, + "eval_runtime": 37.6462, + "eval_samples_per_second": 26.563, + "eval_steps_per_second": 3.32, + "step": 1000 + }, + { + "epoch": 0.06, + "grad_norm": 2.3647234439849854, + "learning_rate": 0.0002981939698492462, + "loss": 1.611, + "step": 1100 + }, + { + "epoch": 0.06, + "grad_norm": 2.3917016983032227, + "learning_rate": 0.00029789246231155776, + "loss": 1.6003, + "step": 1200 + }, + { + "epoch": 0.07, + "grad_norm": 1.7931370735168457, + "learning_rate": 0.0002975909547738693, + "loss": 1.5789, + "step": 1300 + }, + { + "epoch": 0.07, + "grad_norm": 2.542971611022949, + "learning_rate": 0.0002972894472361809, + "loss": 1.5435, + "step": 1400 + }, + { + "epoch": 0.07, + "grad_norm": 1.8555421829223633, + "learning_rate": 0.00029698793969849243, + "loss": 1.5513, + "step": 1500 + }, + { + "epoch": 0.08, + "grad_norm": 1.9988830089569092, + "learning_rate": 0.000296686432160804, + "loss": 1.5763, + "step": 1600 + }, + { + "epoch": 0.09, + "grad_norm": 1.5328696966171265, + "learning_rate": 0.00029638492462311555, + "loss": 1.5529, + "step": 1700 + }, + { + "epoch": 0.09, + "grad_norm": 2.442533254623413, + "learning_rate": 0.0002960834170854271, + "loss": 1.5581, + "step": 1800 + }, + { + "epoch": 0.1, + "grad_norm": 1.4188216924667358, + "learning_rate": 0.00029578190954773867, + "loss": 1.5598, + "step": 1900 + }, + { + "epoch": 0.1, + "grad_norm": 2.700873851776123, + "learning_rate": 0.00029548040201005023, + "loss": 1.6091, + "step": 2000 + }, + { + "epoch": 0.1, + "eval_loss": 1.5680323839187622, + "eval_runtime": 37.9632, + "eval_samples_per_second": 26.341, + "eval_steps_per_second": 3.293, + "step": 2000 + }, + { + "epoch": 0.1, + "grad_norm": 4.415462493896484, + "learning_rate": 0.0002951788944723618, + "loss": 1.5435, + "step": 2100 + }, + { + "epoch": 0.11, + "grad_norm": 1.5002624988555908, + "learning_rate": 0.00029487738693467335, + "loss": 1.5485, + "step": 2200 + }, + { + "epoch": 0.12, + "grad_norm": 1.8552610874176025, + "learning_rate": 0.0002945758793969849, + "loss": 1.5687, + "step": 2300 + }, + { + "epoch": 0.12, + "grad_norm": 2.6914422512054443, + "learning_rate": 0.00029427437185929647, + "loss": 1.5549, + "step": 2400 + }, + { + "epoch": 0.12, + "grad_norm": 1.5994210243225098, + "learning_rate": 0.00029397286432160803, + "loss": 1.5541, + "step": 2500 + }, + { + "epoch": 0.13, + "grad_norm": 1.9448769092559814, + "learning_rate": 0.0002936713567839196, + "loss": 1.5348, + "step": 2600 + }, + { + "epoch": 0.14, + "grad_norm": 2.3909597396850586, + "learning_rate": 0.00029336984924623115, + "loss": 1.5629, + "step": 2700 + }, + { + "epoch": 0.14, + "grad_norm": 1.4517822265625, + "learning_rate": 0.0002930683417085427, + "loss": 1.4946, + "step": 2800 + }, + { + "epoch": 0.14, + "grad_norm": 1.7407867908477783, + "learning_rate": 0.0002927668341708542, + "loss": 1.568, + "step": 2900 + }, + { + "epoch": 0.15, + "grad_norm": 1.3732205629348755, + "learning_rate": 0.0002924653266331658, + "loss": 1.4928, + "step": 3000 + }, + { + "epoch": 0.15, + "eval_loss": 1.5172981023788452, + "eval_runtime": 37.8358, + "eval_samples_per_second": 26.43, + "eval_steps_per_second": 3.304, + "step": 3000 + }, + { + "epoch": 0.15, + "grad_norm": 1.9255911111831665, + "learning_rate": 0.0002921638190954774, + "loss": 1.5208, + "step": 3100 + }, + { + "epoch": 0.16, + "grad_norm": 1.7328695058822632, + "learning_rate": 0.00029186231155778895, + "loss": 1.5442, + "step": 3200 + }, + { + "epoch": 0.17, + "grad_norm": 2.286285400390625, + "learning_rate": 0.00029156080402010045, + "loss": 1.5071, + "step": 3300 + }, + { + "epoch": 0.17, + "grad_norm": 2.426595687866211, + "learning_rate": 0.000291259296482412, + "loss": 1.5424, + "step": 3400 + }, + { + "epoch": 0.17, + "grad_norm": 1.8213595151901245, + "learning_rate": 0.0002909577889447236, + "loss": 1.487, + "step": 3500 + }, + { + "epoch": 0.18, + "grad_norm": 2.4181461334228516, + "learning_rate": 0.000290659296482412, + "loss": 1.5083, + "step": 3600 + }, + { + "epoch": 0.18, + "grad_norm": 1.4696974754333496, + "learning_rate": 0.0002903577889447236, + "loss": 1.5204, + "step": 3700 + }, + { + "epoch": 0.19, + "grad_norm": 1.285097360610962, + "learning_rate": 0.00029005628140703517, + "loss": 1.515, + "step": 3800 + }, + { + "epoch": 0.2, + "grad_norm": 2.7307722568511963, + "learning_rate": 0.00028975477386934673, + "loss": 1.5283, + "step": 3900 + }, + { + "epoch": 0.2, + "grad_norm": 2.5405428409576416, + "learning_rate": 0.00028945326633165823, + "loss": 1.4657, + "step": 4000 + }, + { + "epoch": 0.2, + "eval_loss": 1.4836663007736206, + "eval_runtime": 37.7733, + "eval_samples_per_second": 26.474, + "eval_steps_per_second": 3.309, + "step": 4000 + }, + { + "epoch": 0.2, + "grad_norm": 2.2221779823303223, + "learning_rate": 0.00028915175879396985, + "loss": 1.4936, + "step": 4100 + }, + { + "epoch": 0.21, + "grad_norm": 2.700119733810425, + "learning_rate": 0.0002888502512562814, + "loss": 1.446, + "step": 4200 + }, + { + "epoch": 0.21, + "grad_norm": 2.11588716506958, + "learning_rate": 0.0002885487437185929, + "loss": 1.4789, + "step": 4300 + }, + { + "epoch": 0.22, + "grad_norm": 2.144611358642578, + "learning_rate": 0.00028824723618090447, + "loss": 1.4913, + "step": 4400 + }, + { + "epoch": 0.23, + "grad_norm": 1.7891815900802612, + "learning_rate": 0.0002879457286432161, + "loss": 1.4693, + "step": 4500 + }, + { + "epoch": 0.23, + "grad_norm": 2.2549595832824707, + "learning_rate": 0.0002876442211055276, + "loss": 1.4957, + "step": 4600 + }, + { + "epoch": 0.23, + "grad_norm": 2.4034409523010254, + "learning_rate": 0.00028734271356783915, + "loss": 1.4909, + "step": 4700 + }, + { + "epoch": 0.24, + "grad_norm": 1.4686906337738037, + "learning_rate": 0.0002870412060301507, + "loss": 1.4989, + "step": 4800 + }, + { + "epoch": 0.24, + "grad_norm": 2.1314849853515625, + "learning_rate": 0.0002867396984924623, + "loss": 1.4899, + "step": 4900 + }, + { + "epoch": 0.25, + "grad_norm": 1.703493595123291, + "learning_rate": 0.00028643819095477383, + "loss": 1.4897, + "step": 5000 + }, + { + "epoch": 0.25, + "eval_loss": 1.5144654512405396, + "eval_runtime": 38.0015, + "eval_samples_per_second": 26.315, + "eval_steps_per_second": 3.289, + "step": 5000 + }, + { + "epoch": 0.26, + "grad_norm": 2.8537943363189697, + "learning_rate": 0.0002861366834170854, + "loss": 1.4702, + "step": 5100 + }, + { + "epoch": 0.26, + "grad_norm": 1.885312557220459, + "learning_rate": 0.00028583517587939695, + "loss": 1.4918, + "step": 5200 + }, + { + "epoch": 0.27, + "grad_norm": 2.6149489879608154, + "learning_rate": 0.0002855336683417085, + "loss": 1.4867, + "step": 5300 + }, + { + "epoch": 0.27, + "grad_norm": 1.8222806453704834, + "learning_rate": 0.00028523216080402007, + "loss": 1.4894, + "step": 5400 + }, + { + "epoch": 0.28, + "grad_norm": 2.105160713195801, + "learning_rate": 0.0002849306532663316, + "loss": 1.4865, + "step": 5500 + }, + { + "epoch": 0.28, + "grad_norm": 1.9180357456207275, + "learning_rate": 0.0002846291457286432, + "loss": 1.4365, + "step": 5600 + }, + { + "epoch": 0.28, + "grad_norm": 1.4675670862197876, + "learning_rate": 0.00028432763819095474, + "loss": 1.4323, + "step": 5700 + }, + { + "epoch": 0.29, + "grad_norm": 3.664919376373291, + "learning_rate": 0.0002840261306532663, + "loss": 1.4605, + "step": 5800 + }, + { + "epoch": 0.29, + "grad_norm": 1.5559368133544922, + "learning_rate": 0.00028372462311557786, + "loss": 1.4799, + "step": 5900 + }, + { + "epoch": 0.3, + "grad_norm": 2.0738680362701416, + "learning_rate": 0.0002834261306532663, + "loss": 1.4923, + "step": 6000 + }, + { + "epoch": 0.3, + "eval_loss": 1.4727822542190552, + "eval_runtime": 38.2425, + "eval_samples_per_second": 26.149, + "eval_steps_per_second": 3.269, + "step": 6000 + }, + { + "epoch": 0.3, + "grad_norm": 1.9228754043579102, + "learning_rate": 0.00028312462311557785, + "loss": 1.4127, + "step": 6100 + }, + { + "epoch": 0.31, + "grad_norm": 2.0438356399536133, + "learning_rate": 0.0002828231155778894, + "loss": 1.4835, + "step": 6200 + }, + { + "epoch": 0.32, + "grad_norm": 2.734626293182373, + "learning_rate": 0.00028252160804020097, + "loss": 1.4489, + "step": 6300 + }, + { + "epoch": 0.32, + "grad_norm": 2.1490132808685303, + "learning_rate": 0.0002822201005025125, + "loss": 1.4684, + "step": 6400 + }, + { + "epoch": 0.33, + "grad_norm": 2.1819868087768555, + "learning_rate": 0.0002819185929648241, + "loss": 1.4416, + "step": 6500 + }, + { + "epoch": 0.33, + "grad_norm": 1.5763262510299683, + "learning_rate": 0.00028161708542713565, + "loss": 1.4532, + "step": 6600 + }, + { + "epoch": 0.34, + "grad_norm": 1.9584680795669556, + "learning_rate": 0.0002813155778894472, + "loss": 1.4558, + "step": 6700 + }, + { + "epoch": 0.34, + "grad_norm": 2.6148059368133545, + "learning_rate": 0.00028101407035175876, + "loss": 1.4588, + "step": 6800 + }, + { + "epoch": 0.34, + "grad_norm": 1.5689460039138794, + "learning_rate": 0.0002807125628140703, + "loss": 1.4352, + "step": 6900 + }, + { + "epoch": 0.35, + "grad_norm": 2.145756483078003, + "learning_rate": 0.0002804110552763819, + "loss": 1.4207, + "step": 7000 + }, + { + "epoch": 0.35, + "eval_loss": 1.4386738538742065, + "eval_runtime": 38.107, + "eval_samples_per_second": 26.242, + "eval_steps_per_second": 3.28, + "step": 7000 + }, + { + "epoch": 0.35, + "grad_norm": 4.316162586212158, + "learning_rate": 0.00028010954773869344, + "loss": 1.4085, + "step": 7100 + }, + { + "epoch": 0.36, + "grad_norm": 2.0866541862487793, + "learning_rate": 0.000279808040201005, + "loss": 1.4634, + "step": 7200 + }, + { + "epoch": 0.36, + "grad_norm": 3.0577406883239746, + "learning_rate": 0.00027950653266331656, + "loss": 1.4515, + "step": 7300 + }, + { + "epoch": 0.37, + "grad_norm": 1.723168969154358, + "learning_rate": 0.0002792050251256281, + "loss": 1.4372, + "step": 7400 + }, + { + "epoch": 0.38, + "grad_norm": 2.8033313751220703, + "learning_rate": 0.0002789035175879397, + "loss": 1.4844, + "step": 7500 + }, + { + "epoch": 0.38, + "grad_norm": 2.051619529724121, + "learning_rate": 0.00027860201005025124, + "loss": 1.4352, + "step": 7600 + }, + { + "epoch": 0.39, + "grad_norm": 1.4199312925338745, + "learning_rate": 0.0002783005025125628, + "loss": 1.4641, + "step": 7700 + }, + { + "epoch": 0.39, + "grad_norm": 2.3949058055877686, + "learning_rate": 0.00027799899497487436, + "loss": 1.4592, + "step": 7800 + }, + { + "epoch": 0.4, + "grad_norm": 2.8449528217315674, + "learning_rate": 0.0002776974874371859, + "loss": 1.4196, + "step": 7900 + }, + { + "epoch": 0.4, + "grad_norm": 3.709972858428955, + "learning_rate": 0.0002773959798994975, + "loss": 1.4375, + "step": 8000 + }, + { + "epoch": 0.4, + "eval_loss": 1.4270827770233154, + "eval_runtime": 38.3346, + "eval_samples_per_second": 26.086, + "eval_steps_per_second": 3.261, + "step": 8000 + }, + { + "epoch": 0.41, + "grad_norm": 1.7984100580215454, + "learning_rate": 0.00027709447236180904, + "loss": 1.3943, + "step": 8100 + }, + { + "epoch": 0.41, + "grad_norm": 2.1693639755249023, + "learning_rate": 0.00027679597989949746, + "loss": 1.4636, + "step": 8200 + }, + { + "epoch": 0.41, + "grad_norm": 1.8211654424667358, + "learning_rate": 0.000276494472361809, + "loss": 1.4539, + "step": 8300 + }, + { + "epoch": 0.42, + "grad_norm": 2.11051869392395, + "learning_rate": 0.0002761929648241206, + "loss": 1.4214, + "step": 8400 + }, + { + "epoch": 0.42, + "grad_norm": 1.5553231239318848, + "learning_rate": 0.00027589145728643214, + "loss": 1.4475, + "step": 8500 + }, + { + "epoch": 0.43, + "grad_norm": 2.0080809593200684, + "learning_rate": 0.0002755899497487437, + "loss": 1.4024, + "step": 8600 + }, + { + "epoch": 0.43, + "grad_norm": 2.6698598861694336, + "learning_rate": 0.00027528844221105526, + "loss": 1.4159, + "step": 8700 + }, + { + "epoch": 0.44, + "grad_norm": 2.2336277961730957, + "learning_rate": 0.0002749869346733668, + "loss": 1.437, + "step": 8800 + }, + { + "epoch": 0.45, + "grad_norm": 1.7006186246871948, + "learning_rate": 0.0002746854271356784, + "loss": 1.4465, + "step": 8900 + }, + { + "epoch": 0.45, + "grad_norm": 1.934051513671875, + "learning_rate": 0.0002743839195979899, + "loss": 1.4319, + "step": 9000 + }, + { + "epoch": 0.45, + "eval_loss": 1.4331704378128052, + "eval_runtime": 37.9595, + "eval_samples_per_second": 26.344, + "eval_steps_per_second": 3.293, + "step": 9000 + }, + { + "epoch": 0.46, + "grad_norm": 2.549532890319824, + "learning_rate": 0.0002740824120603015, + "loss": 1.4018, + "step": 9100 + }, + { + "epoch": 0.46, + "grad_norm": 1.9921625852584839, + "learning_rate": 0.00027378090452261306, + "loss": 1.4354, + "step": 9200 + }, + { + "epoch": 0.47, + "grad_norm": 1.5784940719604492, + "learning_rate": 0.0002734793969849246, + "loss": 1.4515, + "step": 9300 + }, + { + "epoch": 0.47, + "grad_norm": 1.9822384119033813, + "learning_rate": 0.0002731778894472361, + "loss": 1.4784, + "step": 9400 + }, + { + "epoch": 0.47, + "grad_norm": 3.0514814853668213, + "learning_rate": 0.00027287638190954774, + "loss": 1.4235, + "step": 9500 + }, + { + "epoch": 0.48, + "grad_norm": 1.5947296619415283, + "learning_rate": 0.0002725748743718593, + "loss": 1.4325, + "step": 9600 + }, + { + "epoch": 0.48, + "grad_norm": 2.838723659515381, + "learning_rate": 0.0002722733668341708, + "loss": 1.4318, + "step": 9700 + }, + { + "epoch": 0.49, + "grad_norm": 2.7525815963745117, + "learning_rate": 0.00027197185929648236, + "loss": 1.4323, + "step": 9800 + }, + { + "epoch": 0.49, + "grad_norm": 2.186182975769043, + "learning_rate": 0.000271670351758794, + "loss": 1.4122, + "step": 9900 + }, + { + "epoch": 0.5, + "grad_norm": 1.5111092329025269, + "learning_rate": 0.00027136884422110553, + "loss": 1.4278, + "step": 10000 + }, + { + "epoch": 0.5, + "eval_loss": 1.4226535558700562, + "eval_runtime": 37.925, + "eval_samples_per_second": 26.368, + "eval_steps_per_second": 3.296, + "step": 10000 + }, + { + "epoch": 0.51, + "grad_norm": 1.4402307271957397, + "learning_rate": 0.00027106733668341704, + "loss": 1.4775, + "step": 10100 + }, + { + "epoch": 0.51, + "grad_norm": 4.803475379943848, + "learning_rate": 0.0002707658291457286, + "loss": 1.4434, + "step": 10200 + }, + { + "epoch": 0.52, + "grad_norm": 2.159541606903076, + "learning_rate": 0.0002704643216080402, + "loss": 1.4505, + "step": 10300 + }, + { + "epoch": 0.52, + "grad_norm": 1.613765835762024, + "learning_rate": 0.0002701658291457286, + "loss": 1.4336, + "step": 10400 + }, + { + "epoch": 0.53, + "grad_norm": 3.0653555393218994, + "learning_rate": 0.0002698643216080402, + "loss": 1.4238, + "step": 10500 + }, + { + "epoch": 0.53, + "grad_norm": 2.0688183307647705, + "learning_rate": 0.00026956281407035176, + "loss": 1.4048, + "step": 10600 + }, + { + "epoch": 0.54, + "grad_norm": 2.271068572998047, + "learning_rate": 0.0002692613065326633, + "loss": 1.4412, + "step": 10700 + }, + { + "epoch": 0.54, + "grad_norm": 1.7365072965621948, + "learning_rate": 0.0002689597989949748, + "loss": 1.3864, + "step": 10800 + }, + { + "epoch": 0.55, + "grad_norm": 1.7095474004745483, + "learning_rate": 0.00026865829145728643, + "loss": 1.4509, + "step": 10900 + }, + { + "epoch": 0.55, + "grad_norm": 2.595015287399292, + "learning_rate": 0.000268356783919598, + "loss": 1.4068, + "step": 11000 + }, + { + "epoch": 0.55, + "eval_loss": 1.4620698690414429, + "eval_runtime": 37.8254, + "eval_samples_per_second": 26.437, + "eval_steps_per_second": 3.305, + "step": 11000 + }, + { + "epoch": 0.56, + "grad_norm": 1.6796025037765503, + "learning_rate": 0.0002680552763819095, + "loss": 1.4059, + "step": 11100 + }, + { + "epoch": 0.56, + "grad_norm": 2.259477376937866, + "learning_rate": 0.00026775376884422106, + "loss": 1.4112, + "step": 11200 + }, + { + "epoch": 0.56, + "grad_norm": 4.8005051612854, + "learning_rate": 0.00026745226130653267, + "loss": 1.367, + "step": 11300 + }, + { + "epoch": 0.57, + "grad_norm": 2.824021577835083, + "learning_rate": 0.00026715075376884423, + "loss": 1.4156, + "step": 11400 + }, + { + "epoch": 0.57, + "grad_norm": 2.4818904399871826, + "learning_rate": 0.00026684924623115574, + "loss": 1.3846, + "step": 11500 + }, + { + "epoch": 0.58, + "grad_norm": 2.6064958572387695, + "learning_rate": 0.0002665477386934673, + "loss": 1.4062, + "step": 11600 + }, + { + "epoch": 0.58, + "grad_norm": 1.8354562520980835, + "learning_rate": 0.00026624623115577886, + "loss": 1.3761, + "step": 11700 + }, + { + "epoch": 0.59, + "grad_norm": 3.094172477722168, + "learning_rate": 0.0002659447236180904, + "loss": 1.3576, + "step": 11800 + }, + { + "epoch": 0.59, + "grad_norm": 2.000718832015991, + "learning_rate": 0.000265643216080402, + "loss": 1.401, + "step": 11900 + }, + { + "epoch": 0.6, + "grad_norm": 2.301866054534912, + "learning_rate": 0.00026534170854271353, + "loss": 1.4267, + "step": 12000 + }, + { + "epoch": 0.6, + "eval_loss": 1.4072773456573486, + "eval_runtime": 37.8474, + "eval_samples_per_second": 26.422, + "eval_steps_per_second": 3.303, + "step": 12000 + }, + { + "epoch": 0.6, + "grad_norm": 1.8116004467010498, + "learning_rate": 0.0002650402010050251, + "loss": 1.4141, + "step": 12100 + }, + { + "epoch": 0.61, + "grad_norm": 1.7951298952102661, + "learning_rate": 0.00026473869346733665, + "loss": 1.4006, + "step": 12200 + }, + { + "epoch": 0.61, + "grad_norm": 1.9248169660568237, + "learning_rate": 0.0002644371859296482, + "loss": 1.4143, + "step": 12300 + }, + { + "epoch": 0.62, + "grad_norm": 3.0492172241210938, + "learning_rate": 0.00026413567839195977, + "loss": 1.3808, + "step": 12400 + }, + { + "epoch": 0.62, + "grad_norm": 1.3698550462722778, + "learning_rate": 0.00026383417085427133, + "loss": 1.339, + "step": 12500 + }, + { + "epoch": 0.63, + "grad_norm": 2.8333966732025146, + "learning_rate": 0.0002635326633165829, + "loss": 1.3977, + "step": 12600 + }, + { + "epoch": 0.64, + "grad_norm": 2.5511767864227295, + "learning_rate": 0.0002632341708542713, + "loss": 1.4027, + "step": 12700 + }, + { + "epoch": 0.64, + "grad_norm": 1.912987470626831, + "learning_rate": 0.0002629326633165829, + "loss": 1.4062, + "step": 12800 + }, + { + "epoch": 0.65, + "grad_norm": 1.8692814111709595, + "learning_rate": 0.00026263115577889444, + "loss": 1.3901, + "step": 12900 + }, + { + "epoch": 0.65, + "grad_norm": 2.620612859725952, + "learning_rate": 0.000262329648241206, + "loss": 1.3992, + "step": 13000 + }, + { + "epoch": 0.65, + "eval_loss": 1.3693994283676147, + "eval_runtime": 38.004, + "eval_samples_per_second": 26.313, + "eval_steps_per_second": 3.289, + "step": 13000 + }, + { + "epoch": 0.66, + "grad_norm": 3.1771810054779053, + "learning_rate": 0.00026202814070351756, + "loss": 1.3733, + "step": 13100 + }, + { + "epoch": 0.66, + "grad_norm": 2.4650421142578125, + "learning_rate": 0.0002617266331658291, + "loss": 1.399, + "step": 13200 + }, + { + "epoch": 0.67, + "grad_norm": 2.9789535999298096, + "learning_rate": 0.0002614251256281407, + "loss": 1.4291, + "step": 13300 + }, + { + "epoch": 0.67, + "grad_norm": 1.4404784440994263, + "learning_rate": 0.00026112361809045223, + "loss": 1.3833, + "step": 13400 + }, + { + "epoch": 0.68, + "grad_norm": 2.0667450428009033, + "learning_rate": 0.0002608221105527638, + "loss": 1.3884, + "step": 13500 + }, + { + "epoch": 0.68, + "grad_norm": 2.014460563659668, + "learning_rate": 0.00026052060301507535, + "loss": 1.3819, + "step": 13600 + }, + { + "epoch": 0.69, + "grad_norm": 2.360121965408325, + "learning_rate": 0.0002602190954773869, + "loss": 1.3695, + "step": 13700 + }, + { + "epoch": 0.69, + "grad_norm": 1.6982303857803345, + "learning_rate": 0.00025991758793969847, + "loss": 1.3864, + "step": 13800 + }, + { + "epoch": 0.69, + "grad_norm": 2.2350399494171143, + "learning_rate": 0.00025961608040201003, + "loss": 1.4096, + "step": 13900 + }, + { + "epoch": 0.7, + "grad_norm": 1.4647042751312256, + "learning_rate": 0.0002593145728643216, + "loss": 1.3915, + "step": 14000 + }, + { + "epoch": 0.7, + "eval_loss": 1.3878337144851685, + "eval_runtime": 37.7254, + "eval_samples_per_second": 26.507, + "eval_steps_per_second": 3.313, + "step": 14000 + }, + { + "epoch": 0.7, + "grad_norm": 2.002542734146118, + "learning_rate": 0.00025901306532663315, + "loss": 1.4214, + "step": 14100 + }, + { + "epoch": 0.71, + "grad_norm": 1.9857007265090942, + "learning_rate": 0.0002587115577889447, + "loss": 1.3636, + "step": 14200 + }, + { + "epoch": 0.71, + "grad_norm": 2.4016737937927246, + "learning_rate": 0.00025841005025125627, + "loss": 1.4259, + "step": 14300 + }, + { + "epoch": 0.72, + "grad_norm": 3.929931879043579, + "learning_rate": 0.0002581085427135678, + "loss": 1.3937, + "step": 14400 + }, + { + "epoch": 0.72, + "grad_norm": 1.6266632080078125, + "learning_rate": 0.0002578070351758794, + "loss": 1.3678, + "step": 14500 + }, + { + "epoch": 0.73, + "grad_norm": 2.905378580093384, + "learning_rate": 0.00025750552763819095, + "loss": 1.3526, + "step": 14600 + }, + { + "epoch": 0.73, + "grad_norm": 2.535842180252075, + "learning_rate": 0.0002572040201005025, + "loss": 1.4062, + "step": 14700 + }, + { + "epoch": 0.74, + "grad_norm": 1.5988209247589111, + "learning_rate": 0.000256902512562814, + "loss": 1.3915, + "step": 14800 + }, + { + "epoch": 0.74, + "grad_norm": 1.5643303394317627, + "learning_rate": 0.0002566010050251256, + "loss": 1.3783, + "step": 14900 + }, + { + "epoch": 0.75, + "grad_norm": 1.4297415018081665, + "learning_rate": 0.0002562994974874372, + "loss": 1.3782, + "step": 15000 + }, + { + "epoch": 0.75, + "eval_loss": 1.405114769935608, + "eval_runtime": 37.9898, + "eval_samples_per_second": 26.323, + "eval_steps_per_second": 3.29, + "step": 15000 + }, + { + "epoch": 0.76, + "grad_norm": 1.6650172472000122, + "learning_rate": 0.0002559979899497487, + "loss": 1.3387, + "step": 15100 + }, + { + "epoch": 0.76, + "grad_norm": 2.118579864501953, + "learning_rate": 0.00025569648241206025, + "loss": 1.393, + "step": 15200 + }, + { + "epoch": 0.77, + "grad_norm": 1.74748694896698, + "learning_rate": 0.00025539497487437186, + "loss": 1.3353, + "step": 15300 + }, + { + "epoch": 0.77, + "grad_norm": 1.794631004333496, + "learning_rate": 0.0002550934673366834, + "loss": 1.3942, + "step": 15400 + }, + { + "epoch": 0.78, + "grad_norm": 2.7065675258636475, + "learning_rate": 0.00025479195979899493, + "loss": 1.3962, + "step": 15500 + }, + { + "epoch": 0.78, + "grad_norm": 3.389014720916748, + "learning_rate": 0.0002544904522613065, + "loss": 1.3758, + "step": 15600 + }, + { + "epoch": 0.79, + "grad_norm": 1.534252405166626, + "learning_rate": 0.0002541889447236181, + "loss": 1.3526, + "step": 15700 + }, + { + "epoch": 0.79, + "grad_norm": 1.7374197244644165, + "learning_rate": 0.0002538874371859296, + "loss": 1.3577, + "step": 15800 + }, + { + "epoch": 0.8, + "grad_norm": 3.1230342388153076, + "learning_rate": 0.00025358592964824117, + "loss": 1.3548, + "step": 15900 + }, + { + "epoch": 0.8, + "grad_norm": 3.261570692062378, + "learning_rate": 0.0002532844221105527, + "loss": 1.3932, + "step": 16000 + }, + { + "epoch": 0.8, + "eval_loss": 1.3275749683380127, + "eval_runtime": 37.9493, + "eval_samples_per_second": 26.351, + "eval_steps_per_second": 3.294, + "step": 16000 + }, + { + "epoch": 0.81, + "grad_norm": 3.0108933448791504, + "learning_rate": 0.00025298291457286434, + "loss": 1.3445, + "step": 16100 + }, + { + "epoch": 0.81, + "grad_norm": 3.536722421646118, + "learning_rate": 0.00025268140703517584, + "loss": 1.364, + "step": 16200 + }, + { + "epoch": 0.81, + "grad_norm": 1.637465238571167, + "learning_rate": 0.0002523829145728643, + "loss": 1.376, + "step": 16300 + }, + { + "epoch": 0.82, + "grad_norm": 2.8907904624938965, + "learning_rate": 0.0002520814070351759, + "loss": 1.3623, + "step": 16400 + }, + { + "epoch": 0.82, + "grad_norm": 2.4385364055633545, + "learning_rate": 0.0002517798994974874, + "loss": 1.318, + "step": 16500 + }, + { + "epoch": 0.83, + "grad_norm": 1.9113733768463135, + "learning_rate": 0.00025147839195979895, + "loss": 1.3906, + "step": 16600 + }, + { + "epoch": 0.83, + "grad_norm": 5.8118414878845215, + "learning_rate": 0.00025117688442211056, + "loss": 1.3336, + "step": 16700 + }, + { + "epoch": 0.84, + "grad_norm": 0.9629586935043335, + "learning_rate": 0.0002508753768844221, + "loss": 1.3959, + "step": 16800 + }, + { + "epoch": 0.84, + "grad_norm": 2.0420243740081787, + "learning_rate": 0.0002505738693467336, + "loss": 1.3523, + "step": 16900 + }, + { + "epoch": 0.85, + "grad_norm": 2.0758414268493652, + "learning_rate": 0.0002502723618090452, + "loss": 1.3747, + "step": 17000 + }, + { + "epoch": 0.85, + "eval_loss": 1.3606867790222168, + "eval_runtime": 37.9681, + "eval_samples_per_second": 26.338, + "eval_steps_per_second": 3.292, + "step": 17000 + }, + { + "epoch": 0.85, + "grad_norm": 2.486980438232422, + "learning_rate": 0.00024997085427135675, + "loss": 1.3402, + "step": 17100 + }, + { + "epoch": 0.86, + "grad_norm": 2.211982250213623, + "learning_rate": 0.0002496693467336683, + "loss": 1.3419, + "step": 17200 + }, + { + "epoch": 0.86, + "grad_norm": 2.3362228870391846, + "learning_rate": 0.00024936783919597986, + "loss": 1.3748, + "step": 17300 + }, + { + "epoch": 0.87, + "grad_norm": 1.515100121498108, + "learning_rate": 0.0002490663316582914, + "loss": 1.3747, + "step": 17400 + }, + { + "epoch": 0.88, + "grad_norm": 2.1747968196868896, + "learning_rate": 0.000248764824120603, + "loss": 1.3458, + "step": 17500 + }, + { + "epoch": 0.88, + "grad_norm": 2.6045758724212646, + "learning_rate": 0.00024846331658291454, + "loss": 1.3623, + "step": 17600 + }, + { + "epoch": 0.89, + "grad_norm": 1.5456433296203613, + "learning_rate": 0.0002481618090452261, + "loss": 1.3107, + "step": 17700 + }, + { + "epoch": 0.89, + "grad_norm": 1.5310312509536743, + "learning_rate": 0.00024786030150753766, + "loss": 1.3541, + "step": 17800 + }, + { + "epoch": 0.9, + "grad_norm": 3.2094223499298096, + "learning_rate": 0.0002475587939698492, + "loss": 1.3445, + "step": 17900 + }, + { + "epoch": 0.9, + "grad_norm": 2.7595880031585693, + "learning_rate": 0.0002472572864321608, + "loss": 1.3537, + "step": 18000 + }, + { + "epoch": 0.9, + "eval_loss": 1.3503804206848145, + "eval_runtime": 37.8049, + "eval_samples_per_second": 26.452, + "eval_steps_per_second": 3.306, + "step": 18000 + }, + { + "epoch": 0.91, + "grad_norm": 5.4382781982421875, + "learning_rate": 0.00024695577889447234, + "loss": 1.3584, + "step": 18100 + }, + { + "epoch": 0.91, + "grad_norm": 2.7903175354003906, + "learning_rate": 0.0002466542713567839, + "loss": 1.3272, + "step": 18200 + }, + { + "epoch": 0.92, + "grad_norm": 1.6171114444732666, + "learning_rate": 0.00024635276381909546, + "loss": 1.3601, + "step": 18300 + }, + { + "epoch": 0.92, + "grad_norm": 2.9426279067993164, + "learning_rate": 0.000246051256281407, + "loss": 1.3782, + "step": 18400 + }, + { + "epoch": 0.93, + "grad_norm": 2.36596941947937, + "learning_rate": 0.0002457497487437186, + "loss": 1.3307, + "step": 18500 + }, + { + "epoch": 0.93, + "grad_norm": 1.3205448389053345, + "learning_rate": 0.00024544824120603014, + "loss": 1.3929, + "step": 18600 + }, + { + "epoch": 0.94, + "grad_norm": 1.9464951753616333, + "learning_rate": 0.0002451467336683417, + "loss": 1.3415, + "step": 18700 + }, + { + "epoch": 0.94, + "grad_norm": 1.7700294256210327, + "learning_rate": 0.00024484522613065326, + "loss": 1.3473, + "step": 18800 + }, + { + "epoch": 0.94, + "grad_norm": 2.687060832977295, + "learning_rate": 0.0002445437185929648, + "loss": 1.3606, + "step": 18900 + }, + { + "epoch": 0.95, + "grad_norm": 2.02754282951355, + "learning_rate": 0.0002442422110552764, + "loss": 1.3799, + "step": 19000 + }, + { + "epoch": 0.95, + "eval_loss": 1.365315556526184, + "eval_runtime": 37.6707, + "eval_samples_per_second": 26.546, + "eval_steps_per_second": 3.318, + "step": 19000 + }, + { + "epoch": 0.95, + "grad_norm": 2.187087059020996, + "learning_rate": 0.0002439407035175879, + "loss": 1.3585, + "step": 19100 + }, + { + "epoch": 0.96, + "grad_norm": 3.8181040287017822, + "learning_rate": 0.00024363919597989947, + "loss": 1.3723, + "step": 19200 + }, + { + "epoch": 0.96, + "grad_norm": 1.6949020624160767, + "learning_rate": 0.00024333768844221105, + "loss": 1.3074, + "step": 19300 + }, + { + "epoch": 0.97, + "grad_norm": 2.716754913330078, + "learning_rate": 0.00024303618090452259, + "loss": 1.3589, + "step": 19400 + }, + { + "epoch": 0.97, + "grad_norm": 1.5216838121414185, + "learning_rate": 0.00024273467336683415, + "loss": 1.3398, + "step": 19500 + }, + { + "epoch": 0.98, + "grad_norm": 1.7370058298110962, + "learning_rate": 0.0002424331658291457, + "loss": 1.3546, + "step": 19600 + }, + { + "epoch": 0.98, + "grad_norm": 2.0907745361328125, + "learning_rate": 0.00024213165829145726, + "loss": 1.3161, + "step": 19700 + }, + { + "epoch": 0.99, + "grad_norm": 2.9564626216888428, + "learning_rate": 0.00024183015075376882, + "loss": 1.3623, + "step": 19800 + }, + { + "epoch": 0.99, + "grad_norm": 2.6082723140716553, + "learning_rate": 0.00024152864321608038, + "loss": 1.3158, + "step": 19900 + }, + { + "epoch": 1.0, + "grad_norm": 1.0046592950820923, + "learning_rate": 0.00024122713567839192, + "loss": 1.3366, + "step": 20000 + }, + { + "epoch": 1.0, + "eval_loss": 1.3484834432601929, + "eval_runtime": 37.9475, + "eval_samples_per_second": 26.352, + "eval_steps_per_second": 3.294, + "step": 20000 + }, + { + "epoch": 1.0, + "grad_norm": 2.5935070514678955, + "learning_rate": 0.0002409256281407035, + "loss": 1.3512, + "step": 20100 + }, + { + "epoch": 1.01, + "grad_norm": 3.790050506591797, + "learning_rate": 0.00024062412060301506, + "loss": 1.3272, + "step": 20200 + }, + { + "epoch": 1.01, + "grad_norm": 1.3440461158752441, + "learning_rate": 0.00024032562814070351, + "loss": 1.333, + "step": 20300 + }, + { + "epoch": 1.02, + "grad_norm": 6.51857852935791, + "learning_rate": 0.00024002412060301505, + "loss": 1.3334, + "step": 20400 + }, + { + "epoch": 1.02, + "grad_norm": 1.882919192314148, + "learning_rate": 0.0002397226130653266, + "loss": 1.3241, + "step": 20500 + }, + { + "epoch": 1.03, + "grad_norm": 1.361558198928833, + "learning_rate": 0.00023942110552763817, + "loss": 1.3207, + "step": 20600 + }, + { + "epoch": 1.03, + "grad_norm": 2.0967071056365967, + "learning_rate": 0.00023911959798994975, + "loss": 1.2993, + "step": 20700 + }, + { + "epoch": 1.04, + "grad_norm": 2.2517688274383545, + "learning_rate": 0.00023881809045226128, + "loss": 1.3353, + "step": 20800 + }, + { + "epoch": 1.04, + "grad_norm": 7.7647480964660645, + "learning_rate": 0.00023851658291457284, + "loss": 1.3326, + "step": 20900 + }, + { + "epoch": 1.05, + "grad_norm": 2.0270638465881348, + "learning_rate": 0.0002382180904522613, + "loss": 1.3046, + "step": 21000 + }, + { + "epoch": 1.05, + "eval_loss": 1.3456777334213257, + "eval_runtime": 38.0868, + "eval_samples_per_second": 26.256, + "eval_steps_per_second": 3.282, + "step": 21000 + }, + { + "epoch": 1.05, + "grad_norm": 1.9642785787582397, + "learning_rate": 0.00023791658291457283, + "loss": 1.3131, + "step": 21100 + }, + { + "epoch": 1.06, + "grad_norm": 2.517357587814331, + "learning_rate": 0.0002376150753768844, + "loss": 1.3627, + "step": 21200 + }, + { + "epoch": 1.06, + "grad_norm": 1.4660860300064087, + "learning_rate": 0.00023731356783919598, + "loss": 1.2805, + "step": 21300 + }, + { + "epoch": 1.07, + "grad_norm": 3.102552652359009, + "learning_rate": 0.00023701206030150753, + "loss": 1.339, + "step": 21400 + }, + { + "epoch": 1.07, + "grad_norm": 2.017504930496216, + "learning_rate": 0.00023671055276381907, + "loss": 1.3307, + "step": 21500 + }, + { + "epoch": 1.08, + "grad_norm": 1.4260824918746948, + "learning_rate": 0.00023640904522613063, + "loss": 1.3216, + "step": 21600 + }, + { + "epoch": 1.08, + "grad_norm": 4.0052361488342285, + "learning_rate": 0.0002361075376884422, + "loss": 1.3544, + "step": 21700 + }, + { + "epoch": 1.09, + "grad_norm": 3.664625883102417, + "learning_rate": 0.00023580603015075375, + "loss": 1.3508, + "step": 21800 + }, + { + "epoch": 1.09, + "grad_norm": 2.1044421195983887, + "learning_rate": 0.0002355045226130653, + "loss": 1.3205, + "step": 21900 + }, + { + "epoch": 1.1, + "grad_norm": 1.6608549356460571, + "learning_rate": 0.00023520301507537686, + "loss": 1.3373, + "step": 22000 + }, + { + "epoch": 1.1, + "eval_loss": 1.319154977798462, + "eval_runtime": 37.7789, + "eval_samples_per_second": 26.47, + "eval_steps_per_second": 3.309, + "step": 22000 + }, + { + "epoch": 1.1, + "grad_norm": 2.131612777709961, + "learning_rate": 0.00023490150753768845, + "loss": 1.3244, + "step": 22100 + }, + { + "epoch": 1.11, + "grad_norm": 2.0854969024658203, + "learning_rate": 0.00023459999999999998, + "loss": 1.3357, + "step": 22200 + }, + { + "epoch": 1.11, + "grad_norm": 2.3622310161590576, + "learning_rate": 0.0002343075376884422, + "loss": 1.4118, + "step": 22300 + }, + { + "epoch": 1.12, + "grad_norm": 2.5198066234588623, + "learning_rate": 0.00023400603015075376, + "loss": 1.319, + "step": 22400 + }, + { + "epoch": 1.12, + "grad_norm": 2.4654555320739746, + "learning_rate": 0.00023370452261306532, + "loss": 1.3055, + "step": 22500 + }, + { + "epoch": 1.13, + "grad_norm": 2.53120756149292, + "learning_rate": 0.00023340301507537685, + "loss": 1.3763, + "step": 22600 + }, + { + "epoch": 1.14, + "grad_norm": 2.199324131011963, + "learning_rate": 0.00023310150753768843, + "loss": 1.3148, + "step": 22700 + }, + { + "epoch": 1.14, + "grad_norm": 2.951871633529663, + "learning_rate": 0.0002328, + "loss": 1.3234, + "step": 22800 + }, + { + "epoch": 1.15, + "grad_norm": 2.5513529777526855, + "learning_rate": 0.00023249849246231153, + "loss": 1.302, + "step": 22900 + }, + { + "epoch": 1.15, + "grad_norm": 5.096097469329834, + "learning_rate": 0.00023219698492462309, + "loss": 1.3102, + "step": 23000 + }, + { + "epoch": 1.15, + "eval_loss": 1.3704819679260254, + "eval_runtime": 37.8283, + "eval_samples_per_second": 26.435, + "eval_steps_per_second": 3.304, + "step": 23000 + }, + { + "epoch": 1.16, + "grad_norm": 1.3565678596496582, + "learning_rate": 0.00023189547738693467, + "loss": 1.3182, + "step": 23100 + }, + { + "epoch": 1.16, + "grad_norm": 3.1972274780273438, + "learning_rate": 0.00023159396984924623, + "loss": 1.316, + "step": 23200 + }, + { + "epoch": 1.17, + "grad_norm": 2.4728245735168457, + "learning_rate": 0.00023129246231155776, + "loss": 1.2934, + "step": 23300 + }, + { + "epoch": 1.17, + "grad_norm": 1.917893648147583, + "learning_rate": 0.00023099095477386932, + "loss": 1.3241, + "step": 23400 + }, + { + "epoch": 1.18, + "grad_norm": 2.30876088142395, + "learning_rate": 0.00023068944723618086, + "loss": 1.3031, + "step": 23500 + }, + { + "epoch": 1.18, + "grad_norm": 2.5653178691864014, + "learning_rate": 0.00023038793969849244, + "loss": 1.2819, + "step": 23600 + }, + { + "epoch": 1.19, + "grad_norm": 3.500821352005005, + "learning_rate": 0.000230086432160804, + "loss": 1.2829, + "step": 23700 + }, + { + "epoch": 1.19, + "grad_norm": 1.6564580202102661, + "learning_rate": 0.00022978492462311556, + "loss": 1.3209, + "step": 23800 + }, + { + "epoch": 1.2, + "grad_norm": 2.6477315425872803, + "learning_rate": 0.0002294834170854271, + "loss": 1.2991, + "step": 23900 + }, + { + "epoch": 1.2, + "grad_norm": 2.9583780765533447, + "learning_rate": 0.00022918190954773868, + "loss": 1.3011, + "step": 24000 + }, + { + "epoch": 1.2, + "eval_loss": 1.3160556554794312, + "eval_runtime": 37.7643, + "eval_samples_per_second": 26.48, + "eval_steps_per_second": 3.31, + "step": 24000 + }, + { + "epoch": 1.21, + "grad_norm": 2.3997368812561035, + "learning_rate": 0.00022888040201005024, + "loss": 1.2866, + "step": 24100 + }, + { + "epoch": 1.21, + "grad_norm": 2.5909266471862793, + "learning_rate": 0.00022857889447236177, + "loss": 1.3133, + "step": 24200 + }, + { + "epoch": 1.22, + "grad_norm": 1.9457557201385498, + "learning_rate": 0.00022827738693467333, + "loss": 1.2716, + "step": 24300 + }, + { + "epoch": 1.22, + "grad_norm": 2.85856032371521, + "learning_rate": 0.00022797587939698492, + "loss": 1.2932, + "step": 24400 + }, + { + "epoch": 1.23, + "grad_norm": 3.180671215057373, + "learning_rate": 0.00022767437185929648, + "loss": 1.317, + "step": 24500 + }, + { + "epoch": 1.23, + "grad_norm": 1.630612850189209, + "learning_rate": 0.000227372864321608, + "loss": 1.3176, + "step": 24600 + }, + { + "epoch": 1.23, + "grad_norm": 2.159804582595825, + "learning_rate": 0.00022707135678391957, + "loss": 1.3288, + "step": 24700 + }, + { + "epoch": 1.24, + "grad_norm": 1.314036250114441, + "learning_rate": 0.00022676984924623116, + "loss": 1.3157, + "step": 24800 + }, + { + "epoch": 1.25, + "grad_norm": 2.718198776245117, + "learning_rate": 0.0002264683417085427, + "loss": 1.2915, + "step": 24900 + }, + { + "epoch": 1.25, + "grad_norm": 2.3423640727996826, + "learning_rate": 0.00022616683417085425, + "loss": 1.2976, + "step": 25000 + }, + { + "epoch": 1.25, + "eval_loss": 1.3594353199005127, + "eval_runtime": 37.7829, + "eval_samples_per_second": 26.467, + "eval_steps_per_second": 3.308, + "step": 25000 + }, + { + "epoch": 1.25, + "grad_norm": 2.3341753482818604, + "learning_rate": 0.0002258653266331658, + "loss": 1.322, + "step": 25100 + }, + { + "epoch": 1.26, + "grad_norm": 2.0798075199127197, + "learning_rate": 0.0002255638190954774, + "loss": 1.3182, + "step": 25200 + }, + { + "epoch": 1.27, + "grad_norm": 1.5256847143173218, + "learning_rate": 0.00022526231155778893, + "loss": 1.3102, + "step": 25300 + }, + { + "epoch": 1.27, + "grad_norm": 2.4831185340881348, + "learning_rate": 0.00022496080402010049, + "loss": 1.3183, + "step": 25400 + }, + { + "epoch": 1.27, + "grad_norm": 9.853681564331055, + "learning_rate": 0.00022465929648241204, + "loss": 1.2963, + "step": 25500 + }, + { + "epoch": 1.28, + "grad_norm": 2.833552837371826, + "learning_rate": 0.00022435778894472358, + "loss": 1.3226, + "step": 25600 + }, + { + "epoch": 1.28, + "grad_norm": 2.7486400604248047, + "learning_rate": 0.00022405628140703516, + "loss": 1.2742, + "step": 25700 + }, + { + "epoch": 1.29, + "grad_norm": 1.3708908557891846, + "learning_rate": 0.00022375477386934672, + "loss": 1.2878, + "step": 25800 + }, + { + "epoch": 1.29, + "grad_norm": 3.6677916049957275, + "learning_rate": 0.00022345326633165826, + "loss": 1.3113, + "step": 25900 + }, + { + "epoch": 1.3, + "grad_norm": 2.7909395694732666, + "learning_rate": 0.00022315175879396981, + "loss": 1.3221, + "step": 26000 + }, + { + "epoch": 1.3, + "eval_loss": 1.313453197479248, + "eval_runtime": 37.7782, + "eval_samples_per_second": 26.47, + "eval_steps_per_second": 3.309, + "step": 26000 + }, + { + "epoch": 1.3, + "grad_norm": 2.592221736907959, + "learning_rate": 0.0002228502512562814, + "loss": 1.2918, + "step": 26100 + }, + { + "epoch": 1.31, + "grad_norm": 2.911118984222412, + "learning_rate": 0.00022254874371859296, + "loss": 1.3392, + "step": 26200 + }, + { + "epoch": 1.31, + "grad_norm": 2.15328049659729, + "learning_rate": 0.0002222472361809045, + "loss": 1.261, + "step": 26300 + }, + { + "epoch": 1.32, + "grad_norm": 3.0731029510498047, + "learning_rate": 0.00022194572864321605, + "loss": 1.289, + "step": 26400 + }, + { + "epoch": 1.32, + "grad_norm": 3.032560348510742, + "learning_rate": 0.00022164422110552764, + "loss": 1.3186, + "step": 26500 + }, + { + "epoch": 1.33, + "grad_norm": 5.388736724853516, + "learning_rate": 0.00022134271356783917, + "loss": 1.3214, + "step": 26600 + }, + { + "epoch": 1.33, + "grad_norm": 2.6400022506713867, + "learning_rate": 0.00022104120603015073, + "loss": 1.2936, + "step": 26700 + }, + { + "epoch": 1.34, + "grad_norm": 3.9355711936950684, + "learning_rate": 0.0002207396984924623, + "loss": 1.3039, + "step": 26800 + }, + { + "epoch": 1.34, + "grad_norm": 1.6818647384643555, + "learning_rate": 0.00022043819095477388, + "loss": 1.2992, + "step": 26900 + }, + { + "epoch": 1.35, + "grad_norm": 2.2356157302856445, + "learning_rate": 0.0002201366834170854, + "loss": 1.3011, + "step": 27000 + }, + { + "epoch": 1.35, + "eval_loss": 1.3157364130020142, + "eval_runtime": 37.9238, + "eval_samples_per_second": 26.369, + "eval_steps_per_second": 3.296, + "step": 27000 + }, + { + "epoch": 1.35, + "grad_norm": 2.158803701400757, + "learning_rate": 0.00021983517587939697, + "loss": 1.308, + "step": 27100 + }, + { + "epoch": 1.36, + "grad_norm": 1.4748259782791138, + "learning_rate": 0.0002195336683417085, + "loss": 1.2873, + "step": 27200 + }, + { + "epoch": 1.36, + "grad_norm": 2.382047653198242, + "learning_rate": 0.0002192321608040201, + "loss": 1.2795, + "step": 27300 + }, + { + "epoch": 1.37, + "grad_norm": 1.8785953521728516, + "learning_rate": 0.00021893065326633165, + "loss": 1.3101, + "step": 27400 + }, + { + "epoch": 1.38, + "grad_norm": 2.4842770099639893, + "learning_rate": 0.0002186291457286432, + "loss": 1.3124, + "step": 27500 + }, + { + "epoch": 1.38, + "grad_norm": 1.7258535623550415, + "learning_rate": 0.00021832763819095474, + "loss": 1.3315, + "step": 27600 + }, + { + "epoch": 1.39, + "grad_norm": 2.157860517501831, + "learning_rate": 0.00021802613065326633, + "loss": 1.2848, + "step": 27700 + }, + { + "epoch": 1.39, + "grad_norm": 3.1965837478637695, + "learning_rate": 0.00021772462311557788, + "loss": 1.3105, + "step": 27800 + }, + { + "epoch": 1.4, + "grad_norm": 3.141603708267212, + "learning_rate": 0.00021742311557788942, + "loss": 1.3197, + "step": 27900 + }, + { + "epoch": 1.4, + "grad_norm": 2.0368692874908447, + "learning_rate": 0.00021712160804020098, + "loss": 1.3113, + "step": 28000 + }, + { + "epoch": 1.4, + "eval_loss": 1.3079107999801636, + "eval_runtime": 37.8037, + "eval_samples_per_second": 26.452, + "eval_steps_per_second": 3.307, + "step": 28000 + }, + { + "epoch": 1.41, + "grad_norm": 3.013373851776123, + "learning_rate": 0.00021682010050251254, + "loss": 1.2892, + "step": 28100 + }, + { + "epoch": 1.41, + "grad_norm": 2.766491651535034, + "learning_rate": 0.00021651859296482412, + "loss": 1.3414, + "step": 28200 + }, + { + "epoch": 1.42, + "grad_norm": 1.6288301944732666, + "learning_rate": 0.00021621708542713566, + "loss": 1.3156, + "step": 28300 + }, + { + "epoch": 1.42, + "grad_norm": 2.3904545307159424, + "learning_rate": 0.00021591557788944721, + "loss": 1.2905, + "step": 28400 + }, + { + "epoch": 1.43, + "grad_norm": 2.263744831085205, + "learning_rate": 0.00021561407035175877, + "loss": 1.2961, + "step": 28500 + }, + { + "epoch": 1.43, + "grad_norm": 1.985129714012146, + "learning_rate": 0.00021531256281407033, + "loss": 1.2703, + "step": 28600 + }, + { + "epoch": 1.44, + "grad_norm": 2.4574270248413086, + "learning_rate": 0.0002150110552763819, + "loss": 1.2793, + "step": 28700 + }, + { + "epoch": 1.44, + "grad_norm": 2.312525510787964, + "learning_rate": 0.00021470954773869345, + "loss": 1.2669, + "step": 28800 + }, + { + "epoch": 1.45, + "grad_norm": 1.5253132581710815, + "learning_rate": 0.00021440804020100498, + "loss": 1.3187, + "step": 28900 + }, + { + "epoch": 1.45, + "grad_norm": 1.7550122737884521, + "learning_rate": 0.00021410653266331657, + "loss": 1.3154, + "step": 29000 + }, + { + "epoch": 1.45, + "eval_loss": 1.2937275171279907, + "eval_runtime": 37.9639, + "eval_samples_per_second": 26.341, + "eval_steps_per_second": 3.293, + "step": 29000 + }, + { + "epoch": 1.46, + "grad_norm": 2.492000102996826, + "learning_rate": 0.00021380502512562813, + "loss": 1.2868, + "step": 29100 + }, + { + "epoch": 1.46, + "grad_norm": 4.013311862945557, + "learning_rate": 0.00021350351758793966, + "loss": 1.2578, + "step": 29200 + }, + { + "epoch": 1.47, + "grad_norm": 3.991748809814453, + "learning_rate": 0.00021320201005025122, + "loss": 1.3347, + "step": 29300 + }, + { + "epoch": 1.47, + "grad_norm": 4.655180931091309, + "learning_rate": 0.0002129005025125628, + "loss": 1.2935, + "step": 29400 + }, + { + "epoch": 1.48, + "grad_norm": 1.9497921466827393, + "learning_rate": 0.00021259899497487437, + "loss": 1.248, + "step": 29500 + }, + { + "epoch": 1.48, + "grad_norm": 3.372061252593994, + "learning_rate": 0.0002122974874371859, + "loss": 1.2877, + "step": 29600 + }, + { + "epoch": 1.48, + "grad_norm": 2.1920547485351562, + "learning_rate": 0.00021199597989949746, + "loss": 1.2407, + "step": 29700 + }, + { + "epoch": 1.49, + "grad_norm": 3.5231897830963135, + "learning_rate": 0.0002116974874371859, + "loss": 1.2296, + "step": 29800 + }, + { + "epoch": 1.5, + "grad_norm": 4.537712097167969, + "learning_rate": 0.00021139597989949745, + "loss": 1.2704, + "step": 29900 + }, + { + "epoch": 1.5, + "grad_norm": 3.12864351272583, + "learning_rate": 0.00021109447236180903, + "loss": 1.3093, + "step": 30000 + }, + { + "epoch": 1.5, + "eval_loss": 1.2697720527648926, + "eval_runtime": 37.8104, + "eval_samples_per_second": 26.448, + "eval_steps_per_second": 3.306, + "step": 30000 + }, + { + "epoch": 1.5, + "grad_norm": 1.9532142877578735, + "learning_rate": 0.0002107929648241206, + "loss": 1.2892, + "step": 30100 + }, + { + "epoch": 1.51, + "grad_norm": 1.9121806621551514, + "learning_rate": 0.00021049145728643215, + "loss": 1.282, + "step": 30200 + }, + { + "epoch": 1.52, + "grad_norm": 1.2597557306289673, + "learning_rate": 0.00021018994974874368, + "loss": 1.2793, + "step": 30300 + }, + { + "epoch": 1.52, + "grad_norm": 1.7637083530426025, + "learning_rate": 0.00020988844221105527, + "loss": 1.3253, + "step": 30400 + }, + { + "epoch": 1.52, + "grad_norm": 3.788984775543213, + "learning_rate": 0.00020958693467336683, + "loss": 1.249, + "step": 30500 + }, + { + "epoch": 1.53, + "grad_norm": 3.1422038078308105, + "learning_rate": 0.00020928542713567836, + "loss": 1.2429, + "step": 30600 + }, + { + "epoch": 1.54, + "grad_norm": 1.995868444442749, + "learning_rate": 0.00020898391959798992, + "loss": 1.2827, + "step": 30700 + }, + { + "epoch": 1.54, + "grad_norm": 2.3635036945343018, + "learning_rate": 0.00020868241206030148, + "loss": 1.2653, + "step": 30800 + }, + { + "epoch": 1.54, + "grad_norm": 2.0892832279205322, + "learning_rate": 0.00020838090452261307, + "loss": 1.2814, + "step": 30900 + }, + { + "epoch": 1.55, + "grad_norm": 2.8766140937805176, + "learning_rate": 0.0002080793969849246, + "loss": 1.2809, + "step": 31000 + }, + { + "epoch": 1.55, + "eval_loss": 1.2703502178192139, + "eval_runtime": 37.818, + "eval_samples_per_second": 26.442, + "eval_steps_per_second": 3.305, + "step": 31000 + }, + { + "epoch": 1.56, + "grad_norm": 2.5487587451934814, + "learning_rate": 0.00020777788944723616, + "loss": 1.2811, + "step": 31100 + }, + { + "epoch": 1.56, + "grad_norm": 2.325295925140381, + "learning_rate": 0.00020747638190954772, + "loss": 1.2769, + "step": 31200 + }, + { + "epoch": 1.56, + "grad_norm": 1.741773009300232, + "learning_rate": 0.00020717487437185928, + "loss": 1.2741, + "step": 31300 + }, + { + "epoch": 1.57, + "grad_norm": 5.916422367095947, + "learning_rate": 0.00020687336683417084, + "loss": 1.2567, + "step": 31400 + }, + { + "epoch": 1.57, + "grad_norm": 2.166018009185791, + "learning_rate": 0.0002065718592964824, + "loss": 1.2491, + "step": 31500 + }, + { + "epoch": 1.58, + "grad_norm": 1.7622108459472656, + "learning_rate": 0.00020627035175879393, + "loss": 1.2815, + "step": 31600 + }, + { + "epoch": 1.58, + "grad_norm": 2.2861111164093018, + "learning_rate": 0.00020596884422110552, + "loss": 1.2485, + "step": 31700 + }, + { + "epoch": 1.59, + "grad_norm": 2.8738324642181396, + "learning_rate": 0.00020566733668341708, + "loss": 1.2747, + "step": 31800 + }, + { + "epoch": 1.59, + "grad_norm": 1.920782208442688, + "learning_rate": 0.00020536582914572863, + "loss": 1.3094, + "step": 31900 + }, + { + "epoch": 1.6, + "grad_norm": 2.591792345046997, + "learning_rate": 0.00020506432160804017, + "loss": 1.3178, + "step": 32000 + }, + { + "epoch": 1.6, + "eval_loss": 1.2383744716644287, + "eval_runtime": 37.8786, + "eval_samples_per_second": 26.4, + "eval_steps_per_second": 3.3, + "step": 32000 + }, + { + "epoch": 1.6, + "grad_norm": 3.4940438270568848, + "learning_rate": 0.00020476281407035175, + "loss": 1.2755, + "step": 32100 + }, + { + "epoch": 1.61, + "grad_norm": 2.377112627029419, + "learning_rate": 0.0002044613065326633, + "loss": 1.2667, + "step": 32200 + }, + { + "epoch": 1.61, + "grad_norm": 2.5229716300964355, + "learning_rate": 0.00020415979899497485, + "loss": 1.2695, + "step": 32300 + }, + { + "epoch": 1.62, + "grad_norm": 2.469883441925049, + "learning_rate": 0.0002038582914572864, + "loss": 1.3089, + "step": 32400 + }, + { + "epoch": 1.62, + "grad_norm": 1.9299498796463013, + "learning_rate": 0.000203556783919598, + "loss": 1.2835, + "step": 32500 + }, + { + "epoch": 1.63, + "grad_norm": 2.486790895462036, + "learning_rate": 0.00020325527638190955, + "loss": 1.2531, + "step": 32600 + }, + { + "epoch": 1.64, + "grad_norm": 3.485691785812378, + "learning_rate": 0.00020295376884422108, + "loss": 1.2568, + "step": 32700 + }, + { + "epoch": 1.64, + "grad_norm": 1.674727201461792, + "learning_rate": 0.00020265226130653264, + "loss": 1.2739, + "step": 32800 + }, + { + "epoch": 1.65, + "grad_norm": 4.50739049911499, + "learning_rate": 0.00020235075376884417, + "loss": 1.211, + "step": 32900 + }, + { + "epoch": 1.65, + "grad_norm": 11.218056678771973, + "learning_rate": 0.00020204924623115576, + "loss": 1.2891, + "step": 33000 + }, + { + "epoch": 1.65, + "eval_loss": 1.2705625295639038, + "eval_runtime": 37.8291, + "eval_samples_per_second": 26.435, + "eval_steps_per_second": 3.304, + "step": 33000 + }, + { + "epoch": 1.66, + "grad_norm": 1.9991952180862427, + "learning_rate": 0.00020174773869346732, + "loss": 1.2636, + "step": 33100 + }, + { + "epoch": 1.66, + "grad_norm": 3.0366969108581543, + "learning_rate": 0.00020144623115577888, + "loss": 1.2903, + "step": 33200 + }, + { + "epoch": 1.67, + "grad_norm": 1.7985395193099976, + "learning_rate": 0.0002011447236180904, + "loss": 1.2437, + "step": 33300 + }, + { + "epoch": 1.67, + "grad_norm": 3.8208954334259033, + "learning_rate": 0.000200843216080402, + "loss": 1.244, + "step": 33400 + }, + { + "epoch": 1.68, + "grad_norm": 3.2836215496063232, + "learning_rate": 0.00020054170854271356, + "loss": 1.2837, + "step": 33500 + }, + { + "epoch": 1.68, + "grad_norm": 3.15663480758667, + "learning_rate": 0.0002002402010050251, + "loss": 1.2253, + "step": 33600 + }, + { + "epoch": 1.69, + "grad_norm": 1.6871391534805298, + "learning_rate": 0.00019993869346733665, + "loss": 1.2564, + "step": 33700 + }, + { + "epoch": 1.69, + "grad_norm": 2.3701913356781006, + "learning_rate": 0.00019963718592964824, + "loss": 1.2925, + "step": 33800 + }, + { + "epoch": 1.69, + "grad_norm": 2.9534804821014404, + "learning_rate": 0.0001993356783919598, + "loss": 1.2613, + "step": 33900 + }, + { + "epoch": 1.7, + "grad_norm": 2.273113489151001, + "learning_rate": 0.00019903417085427133, + "loss": 1.29, + "step": 34000 + }, + { + "epoch": 1.7, + "eval_loss": 1.2713490724563599, + "eval_runtime": 37.9786, + "eval_samples_per_second": 26.331, + "eval_steps_per_second": 3.291, + "step": 34000 + }, + { + "epoch": 1.71, + "grad_norm": 2.1708054542541504, + "learning_rate": 0.0001987326633165829, + "loss": 1.2775, + "step": 34100 + }, + { + "epoch": 1.71, + "grad_norm": 2.242708683013916, + "learning_rate": 0.00019843115577889447, + "loss": 1.2561, + "step": 34200 + }, + { + "epoch": 1.71, + "grad_norm": 2.0170931816101074, + "learning_rate": 0.000198129648241206, + "loss": 1.2168, + "step": 34300 + }, + { + "epoch": 1.72, + "grad_norm": 2.094848871231079, + "learning_rate": 0.00019782814070351757, + "loss": 1.2588, + "step": 34400 + }, + { + "epoch": 1.73, + "grad_norm": 2.1762752532958984, + "learning_rate": 0.00019752663316582913, + "loss": 1.1837, + "step": 34500 + }, + { + "epoch": 1.73, + "grad_norm": 3.1318016052246094, + "learning_rate": 0.0001972251256281407, + "loss": 1.2196, + "step": 34600 + }, + { + "epoch": 1.73, + "grad_norm": 3.2971861362457275, + "learning_rate": 0.00019692361809045225, + "loss": 1.2778, + "step": 34700 + }, + { + "epoch": 1.74, + "grad_norm": 3.452091693878174, + "learning_rate": 0.0001966221105527638, + "loss": 1.2385, + "step": 34800 + }, + { + "epoch": 1.75, + "grad_norm": 1.7514299154281616, + "learning_rate": 0.00019632060301507536, + "loss": 1.2769, + "step": 34900 + }, + { + "epoch": 1.75, + "grad_norm": 2.3494088649749756, + "learning_rate": 0.00019601909547738692, + "loss": 1.2689, + "step": 35000 + }, + { + "epoch": 1.75, + "eval_loss": 1.2675199508666992, + "eval_runtime": 37.8879, + "eval_samples_per_second": 26.394, + "eval_steps_per_second": 3.299, + "step": 35000 + }, + { + "epoch": 1.75, + "grad_norm": 1.5741009712219238, + "learning_rate": 0.00019571758793969848, + "loss": 1.2352, + "step": 35100 + }, + { + "epoch": 1.76, + "grad_norm": 2.652435302734375, + "learning_rate": 0.00019541608040201004, + "loss": 1.2824, + "step": 35200 + }, + { + "epoch": 1.77, + "grad_norm": 2.9557676315307617, + "learning_rate": 0.00019511457286432157, + "loss": 1.2453, + "step": 35300 + }, + { + "epoch": 1.77, + "grad_norm": 2.8758041858673096, + "learning_rate": 0.00019481306532663313, + "loss": 1.2507, + "step": 35400 + }, + { + "epoch": 1.77, + "grad_norm": 2.5828402042388916, + "learning_rate": 0.0001945145728643216, + "loss": 1.2201, + "step": 35500 + }, + { + "epoch": 1.78, + "grad_norm": 2.887206554412842, + "learning_rate": 0.00019421306532663312, + "loss": 1.2754, + "step": 35600 + }, + { + "epoch": 1.79, + "grad_norm": 2.5521140098571777, + "learning_rate": 0.0001939115577889447, + "loss": 1.234, + "step": 35700 + }, + { + "epoch": 1.79, + "grad_norm": 1.9570846557617188, + "learning_rate": 0.00019361005025125627, + "loss": 1.2708, + "step": 35800 + }, + { + "epoch": 1.79, + "grad_norm": 2.89273738861084, + "learning_rate": 0.00019330854271356782, + "loss": 1.2343, + "step": 35900 + }, + { + "epoch": 1.8, + "grad_norm": 3.624706506729126, + "learning_rate": 0.00019300703517587936, + "loss": 1.2576, + "step": 36000 + }, + { + "epoch": 1.8, + "eval_loss": 1.2644726037979126, + "eval_runtime": 37.8527, + "eval_samples_per_second": 26.418, + "eval_steps_per_second": 3.302, + "step": 36000 + }, + { + "epoch": 1.81, + "grad_norm": 2.5976133346557617, + "learning_rate": 0.00019270552763819094, + "loss": 1.2812, + "step": 36100 + }, + { + "epoch": 1.81, + "grad_norm": 2.899306297302246, + "learning_rate": 0.0001924040201005025, + "loss": 1.2541, + "step": 36200 + }, + { + "epoch": 1.81, + "grad_norm": 3.964782476425171, + "learning_rate": 0.00019210251256281404, + "loss": 1.2639, + "step": 36300 + }, + { + "epoch": 1.82, + "grad_norm": 2.4634933471679688, + "learning_rate": 0.0001918010050251256, + "loss": 1.2089, + "step": 36400 + }, + { + "epoch": 1.82, + "grad_norm": 2.6023619174957275, + "learning_rate": 0.00019149949748743718, + "loss": 1.2612, + "step": 36500 + }, + { + "epoch": 1.83, + "grad_norm": 3.0462849140167236, + "learning_rate": 0.00019119798994974874, + "loss": 1.2204, + "step": 36600 + }, + { + "epoch": 1.83, + "grad_norm": 2.1344144344329834, + "learning_rate": 0.00019089648241206027, + "loss": 1.2142, + "step": 36700 + }, + { + "epoch": 1.84, + "grad_norm": 1.5994189977645874, + "learning_rate": 0.00019059497487437183, + "loss": 1.2586, + "step": 36800 + }, + { + "epoch": 1.84, + "grad_norm": 1.357469916343689, + "learning_rate": 0.00019029346733668342, + "loss": 1.2705, + "step": 36900 + }, + { + "epoch": 1.85, + "grad_norm": 2.4201526641845703, + "learning_rate": 0.00018999195979899495, + "loss": 1.2409, + "step": 37000 + }, + { + "epoch": 1.85, + "eval_loss": 1.2103183269500732, + "eval_runtime": 37.8707, + "eval_samples_per_second": 26.406, + "eval_steps_per_second": 3.301, + "step": 37000 + }, + { + "epoch": 1.85, + "grad_norm": 3.1790504455566406, + "learning_rate": 0.0001896904522613065, + "loss": 1.2639, + "step": 37100 + }, + { + "epoch": 1.86, + "grad_norm": 2.565474033355713, + "learning_rate": 0.00018938894472361807, + "loss": 1.2853, + "step": 37200 + }, + { + "epoch": 1.86, + "grad_norm": 2.6977927684783936, + "learning_rate": 0.00018908743718592966, + "loss": 1.2178, + "step": 37300 + }, + { + "epoch": 1.87, + "grad_norm": 2.588975191116333, + "learning_rate": 0.0001887859296482412, + "loss": 1.2492, + "step": 37400 + }, + { + "epoch": 1.88, + "grad_norm": 2.23592209815979, + "learning_rate": 0.00018848442211055275, + "loss": 1.2273, + "step": 37500 + }, + { + "epoch": 1.88, + "grad_norm": 2.0961692333221436, + "learning_rate": 0.0001881859296482412, + "loss": 1.2375, + "step": 37600 + }, + { + "epoch": 1.89, + "grad_norm": 2.4870264530181885, + "learning_rate": 0.00018788442211055273, + "loss": 1.2564, + "step": 37700 + }, + { + "epoch": 1.89, + "grad_norm": 1.9144058227539062, + "learning_rate": 0.0001875829145728643, + "loss": 1.2403, + "step": 37800 + }, + { + "epoch": 1.9, + "grad_norm": 2.209117889404297, + "learning_rate": 0.00018728140703517588, + "loss": 1.2168, + "step": 37900 + }, + { + "epoch": 1.9, + "grad_norm": 2.7400968074798584, + "learning_rate": 0.00018697989949748744, + "loss": 1.1786, + "step": 38000 + }, + { + "epoch": 1.9, + "eval_loss": 1.2550157308578491, + "eval_runtime": 37.907, + "eval_samples_per_second": 26.38, + "eval_steps_per_second": 3.298, + "step": 38000 + }, + { + "epoch": 1.91, + "grad_norm": 2.392390251159668, + "learning_rate": 0.00018667839195979897, + "loss": 1.2294, + "step": 38100 + }, + { + "epoch": 1.91, + "grad_norm": 3.434168577194214, + "learning_rate": 0.00018637688442211053, + "loss": 1.2491, + "step": 38200 + }, + { + "epoch": 1.92, + "grad_norm": 2.082618236541748, + "learning_rate": 0.0001860753768844221, + "loss": 1.2602, + "step": 38300 + }, + { + "epoch": 1.92, + "grad_norm": 1.6049084663391113, + "learning_rate": 0.00018577386934673365, + "loss": 1.2067, + "step": 38400 + }, + { + "epoch": 1.93, + "grad_norm": 2.1953368186950684, + "learning_rate": 0.0001854723618090452, + "loss": 1.2292, + "step": 38500 + }, + { + "epoch": 1.93, + "grad_norm": 2.6085190773010254, + "learning_rate": 0.00018517085427135677, + "loss": 1.2269, + "step": 38600 + }, + { + "epoch": 1.94, + "grad_norm": 2.9110639095306396, + "learning_rate": 0.0001848693467336683, + "loss": 1.1898, + "step": 38700 + }, + { + "epoch": 1.94, + "grad_norm": 1.514410138130188, + "learning_rate": 0.0001845678391959799, + "loss": 1.199, + "step": 38800 + }, + { + "epoch": 1.94, + "grad_norm": 4.6756134033203125, + "learning_rate": 0.00018426633165829145, + "loss": 1.183, + "step": 38900 + }, + { + "epoch": 1.95, + "grad_norm": 2.704317808151245, + "learning_rate": 0.000183964824120603, + "loss": 1.1999, + "step": 39000 + }, + { + "epoch": 1.95, + "eval_loss": 1.2309662103652954, + "eval_runtime": 37.8598, + "eval_samples_per_second": 26.413, + "eval_steps_per_second": 3.302, + "step": 39000 + }, + { + "epoch": 1.96, + "grad_norm": 2.5975565910339355, + "learning_rate": 0.00018366331658291454, + "loss": 1.2576, + "step": 39100 + }, + { + "epoch": 1.96, + "grad_norm": 3.3112730979919434, + "learning_rate": 0.00018336180904522613, + "loss": 1.2128, + "step": 39200 + }, + { + "epoch": 1.96, + "grad_norm": 2.5991640090942383, + "learning_rate": 0.00018306030150753769, + "loss": 1.2294, + "step": 39300 + }, + { + "epoch": 1.97, + "grad_norm": 4.411704063415527, + "learning_rate": 0.00018275879396984922, + "loss": 1.1977, + "step": 39400 + }, + { + "epoch": 1.98, + "grad_norm": 1.509308099746704, + "learning_rate": 0.00018245728643216078, + "loss": 1.2712, + "step": 39500 + }, + { + "epoch": 1.98, + "grad_norm": 2.136350631713867, + "learning_rate": 0.00018215577889447236, + "loss": 1.2359, + "step": 39600 + }, + { + "epoch": 1.98, + "grad_norm": 2.1651546955108643, + "learning_rate": 0.0001818542713567839, + "loss": 1.2448, + "step": 39700 + }, + { + "epoch": 1.99, + "grad_norm": 2.9962761402130127, + "learning_rate": 0.00018155577889447235, + "loss": 1.218, + "step": 39800 + }, + { + "epoch": 2.0, + "grad_norm": 2.8525376319885254, + "learning_rate": 0.0001812542713567839, + "loss": 1.2564, + "step": 39900 + }, + { + "epoch": 2.0, + "grad_norm": 2.120208740234375, + "learning_rate": 0.00018095276381909547, + "loss": 1.2287, + "step": 40000 + }, + { + "epoch": 2.0, + "eval_loss": 1.2058476209640503, + "eval_runtime": 38.0203, + "eval_samples_per_second": 26.302, + "eval_steps_per_second": 3.288, + "step": 40000 + }, + { + "epoch": 2.0, + "grad_norm": 3.9785573482513428, + "learning_rate": 0.000180651256281407, + "loss": 1.2161, + "step": 40100 + }, + { + "epoch": 2.01, + "grad_norm": 2.7897050380706787, + "learning_rate": 0.0001803497487437186, + "loss": 1.2525, + "step": 40200 + }, + { + "epoch": 2.02, + "grad_norm": 2.042492389678955, + "learning_rate": 0.00018004824120603015, + "loss": 1.2087, + "step": 40300 + }, + { + "epoch": 2.02, + "grad_norm": 1.8287073373794556, + "learning_rate": 0.00017974673366834168, + "loss": 1.2404, + "step": 40400 + }, + { + "epoch": 2.02, + "grad_norm": 1.6399390697479248, + "learning_rate": 0.00017944522613065324, + "loss": 1.174, + "step": 40500 + }, + { + "epoch": 2.03, + "grad_norm": 3.9909472465515137, + "learning_rate": 0.00017914371859296482, + "loss": 1.1869, + "step": 40600 + }, + { + "epoch": 2.04, + "grad_norm": 2.9356400966644287, + "learning_rate": 0.00017884221105527638, + "loss": 1.2271, + "step": 40700 + }, + { + "epoch": 2.04, + "grad_norm": 2.205498218536377, + "learning_rate": 0.00017854070351758792, + "loss": 1.2505, + "step": 40800 + }, + { + "epoch": 2.04, + "grad_norm": 2.2801437377929688, + "learning_rate": 0.00017823919597989948, + "loss": 1.2232, + "step": 40900 + }, + { + "epoch": 2.05, + "grad_norm": 4.001745223999023, + "learning_rate": 0.00017793768844221104, + "loss": 1.257, + "step": 41000 + }, + { + "epoch": 2.05, + "eval_loss": 1.1965339183807373, + "eval_runtime": 37.9045, + "eval_samples_per_second": 26.382, + "eval_steps_per_second": 3.298, + "step": 41000 + }, + { + "epoch": 2.06, + "grad_norm": 3.484135150909424, + "learning_rate": 0.0001776361809045226, + "loss": 1.2232, + "step": 41100 + }, + { + "epoch": 2.06, + "grad_norm": 2.7462897300720215, + "learning_rate": 0.00017733467336683415, + "loss": 1.22, + "step": 41200 + }, + { + "epoch": 2.06, + "grad_norm": 2.9418435096740723, + "learning_rate": 0.00017703316582914571, + "loss": 1.2141, + "step": 41300 + }, + { + "epoch": 2.07, + "grad_norm": 2.188680410385132, + "learning_rate": 0.00017673165829145725, + "loss": 1.1909, + "step": 41400 + }, + { + "epoch": 2.08, + "grad_norm": 3.728938579559326, + "learning_rate": 0.00017643015075376883, + "loss": 1.2146, + "step": 41500 + }, + { + "epoch": 2.08, + "grad_norm": 2.8790736198425293, + "learning_rate": 0.0001761286432160804, + "loss": 1.2305, + "step": 41600 + }, + { + "epoch": 2.08, + "grad_norm": 3.6593847274780273, + "learning_rate": 0.00017582713567839195, + "loss": 1.1753, + "step": 41700 + }, + { + "epoch": 2.09, + "grad_norm": 2.408237934112549, + "learning_rate": 0.00017552562814070348, + "loss": 1.2229, + "step": 41800 + }, + { + "epoch": 2.1, + "grad_norm": 2.574580669403076, + "learning_rate": 0.00017522412060301507, + "loss": 1.2173, + "step": 41900 + }, + { + "epoch": 2.1, + "grad_norm": 2.2249817848205566, + "learning_rate": 0.00017492261306532663, + "loss": 1.2112, + "step": 42000 + }, + { + "epoch": 2.1, + "eval_loss": 1.2255558967590332, + "eval_runtime": 37.9009, + "eval_samples_per_second": 26.385, + "eval_steps_per_second": 3.298, + "step": 42000 + }, + { + "epoch": 2.1, + "grad_norm": 2.2712411880493164, + "learning_rate": 0.00017462110552763816, + "loss": 1.1862, + "step": 42100 + }, + { + "epoch": 2.11, + "grad_norm": 1.646330714225769, + "learning_rate": 0.00017431959798994972, + "loss": 1.1812, + "step": 42200 + }, + { + "epoch": 2.12, + "grad_norm": 2.9691689014434814, + "learning_rate": 0.0001740180904522613, + "loss": 1.2055, + "step": 42300 + }, + { + "epoch": 2.12, + "grad_norm": 5.179681777954102, + "learning_rate": 0.00017371658291457287, + "loss": 1.1625, + "step": 42400 + }, + { + "epoch": 2.12, + "grad_norm": 2.634462833404541, + "learning_rate": 0.0001734150753768844, + "loss": 1.2257, + "step": 42500 + }, + { + "epoch": 2.13, + "grad_norm": 8.693337440490723, + "learning_rate": 0.00017311356783919596, + "loss": 1.2447, + "step": 42600 + }, + { + "epoch": 2.13, + "grad_norm": 3.228513240814209, + "learning_rate": 0.00017281206030150755, + "loss": 1.1993, + "step": 42700 + }, + { + "epoch": 2.14, + "grad_norm": 7.938237190246582, + "learning_rate": 0.00017251055276381908, + "loss": 1.2084, + "step": 42800 + }, + { + "epoch": 2.15, + "grad_norm": 3.0843794345855713, + "learning_rate": 0.00017220904522613064, + "loss": 1.2017, + "step": 42900 + }, + { + "epoch": 2.15, + "grad_norm": 2.86205792427063, + "learning_rate": 0.0001719075376884422, + "loss": 1.1706, + "step": 43000 + }, + { + "epoch": 2.15, + "eval_loss": 1.2179350852966309, + "eval_runtime": 37.9173, + "eval_samples_per_second": 26.373, + "eval_steps_per_second": 3.297, + "step": 43000 + }, + { + "epoch": 2.15, + "grad_norm": 2.137380361557007, + "learning_rate": 0.00017160904522613062, + "loss": 1.2066, + "step": 43100 + }, + { + "epoch": 2.16, + "grad_norm": 2.250091075897217, + "learning_rate": 0.00017130753768844218, + "loss": 1.211, + "step": 43200 + }, + { + "epoch": 2.17, + "grad_norm": 2.008875608444214, + "learning_rate": 0.00017100603015075377, + "loss": 1.2116, + "step": 43300 + }, + { + "epoch": 2.17, + "grad_norm": 2.6691529750823975, + "learning_rate": 0.00017070452261306533, + "loss": 1.1844, + "step": 43400 + }, + { + "epoch": 2.17, + "grad_norm": 1.8802026510238647, + "learning_rate": 0.00017040301507537686, + "loss": 1.1849, + "step": 43500 + }, + { + "epoch": 2.18, + "grad_norm": 2.4100139141082764, + "learning_rate": 0.00017010150753768842, + "loss": 1.1887, + "step": 43600 + }, + { + "epoch": 2.19, + "grad_norm": 3.3384740352630615, + "learning_rate": 0.00016979999999999998, + "loss": 1.2338, + "step": 43700 + }, + { + "epoch": 2.19, + "grad_norm": 2.349433183670044, + "learning_rate": 0.00016949849246231154, + "loss": 1.1633, + "step": 43800 + }, + { + "epoch": 2.19, + "grad_norm": 3.019296884536743, + "learning_rate": 0.0001691969849246231, + "loss": 1.2456, + "step": 43900 + }, + { + "epoch": 2.2, + "grad_norm": 2.497424364089966, + "learning_rate": 0.00016889547738693466, + "loss": 1.1671, + "step": 44000 + }, + { + "epoch": 2.2, + "eval_loss": 1.2000114917755127, + "eval_runtime": 40.4714, + "eval_samples_per_second": 24.709, + "eval_steps_per_second": 3.089, + "step": 44000 + }, + { + "epoch": 2.21, + "grad_norm": 1.6698800325393677, + "learning_rate": 0.0001685939698492462, + "loss": 1.2105, + "step": 44100 + }, + { + "epoch": 2.21, + "grad_norm": 2.3846988677978516, + "learning_rate": 0.00016829246231155778, + "loss": 1.2229, + "step": 44200 + }, + { + "epoch": 2.21, + "grad_norm": 5.891537189483643, + "learning_rate": 0.00016799095477386934, + "loss": 1.1848, + "step": 44300 + }, + { + "epoch": 2.22, + "grad_norm": 1.4433008432388306, + "learning_rate": 0.0001676894472361809, + "loss": 1.1905, + "step": 44400 + }, + { + "epoch": 2.23, + "grad_norm": 2.5641889572143555, + "learning_rate": 0.00016738793969849243, + "loss": 1.2219, + "step": 44500 + }, + { + "epoch": 2.23, + "grad_norm": 3.052948474884033, + "learning_rate": 0.00016708643216080402, + "loss": 1.1887, + "step": 44600 + }, + { + "epoch": 2.23, + "grad_norm": 2.8185369968414307, + "learning_rate": 0.00016678492462311557, + "loss": 1.2107, + "step": 44700 + }, + { + "epoch": 2.24, + "grad_norm": 2.9409399032592773, + "learning_rate": 0.0001664834170854271, + "loss": 1.2222, + "step": 44800 + }, + { + "epoch": 2.25, + "grad_norm": 2.728256940841675, + "learning_rate": 0.00016618190954773867, + "loss": 1.1767, + "step": 44900 + }, + { + "epoch": 2.25, + "grad_norm": 2.4744584560394287, + "learning_rate": 0.00016588040201005025, + "loss": 1.1663, + "step": 45000 + }, + { + "epoch": 2.25, + "eval_loss": 1.2085031270980835, + "eval_runtime": 41.0345, + "eval_samples_per_second": 24.37, + "eval_steps_per_second": 3.046, + "step": 45000 + }, + { + "epoch": 2.25, + "grad_norm": 3.215564250946045, + "learning_rate": 0.00016558190954773868, + "loss": 1.173, + "step": 45100 + }, + { + "epoch": 2.26, + "grad_norm": 1.7013347148895264, + "learning_rate": 0.00016528040201005024, + "loss": 1.1637, + "step": 45200 + }, + { + "epoch": 2.27, + "grad_norm": 3.1096675395965576, + "learning_rate": 0.0001649788944723618, + "loss": 1.1702, + "step": 45300 + }, + { + "epoch": 2.27, + "grad_norm": 2.5975756645202637, + "learning_rate": 0.00016467738693467336, + "loss": 1.1763, + "step": 45400 + }, + { + "epoch": 2.27, + "grad_norm": 2.7020699977874756, + "learning_rate": 0.0001643758793969849, + "loss": 1.1761, + "step": 45500 + }, + { + "epoch": 2.28, + "grad_norm": 1.7007007598876953, + "learning_rate": 0.00016407437185929648, + "loss": 1.2064, + "step": 45600 + }, + { + "epoch": 2.29, + "grad_norm": 3.6038424968719482, + "learning_rate": 0.00016377286432160804, + "loss": 1.1716, + "step": 45700 + }, + { + "epoch": 2.29, + "grad_norm": 2.3656082153320312, + "learning_rate": 0.0001634713567839196, + "loss": 1.1954, + "step": 45800 + }, + { + "epoch": 2.29, + "grad_norm": 2.390509605407715, + "learning_rate": 0.00016316984924623113, + "loss": 1.1664, + "step": 45900 + }, + { + "epoch": 2.3, + "grad_norm": 1.8767670392990112, + "learning_rate": 0.00016286834170854271, + "loss": 1.1784, + "step": 46000 + }, + { + "epoch": 2.3, + "eval_loss": 1.1809154748916626, + "eval_runtime": 43.7304, + "eval_samples_per_second": 22.867, + "eval_steps_per_second": 2.858, + "step": 46000 + }, + { + "epoch": 2.31, + "grad_norm": 3.4367122650146484, + "learning_rate": 0.00016256683417085427, + "loss": 1.2055, + "step": 46100 + }, + { + "epoch": 2.31, + "grad_norm": 1.672525405883789, + "learning_rate": 0.0001622653266331658, + "loss": 1.1954, + "step": 46200 + }, + { + "epoch": 2.31, + "grad_norm": 3.2755866050720215, + "learning_rate": 0.00016196381909547737, + "loss": 1.1801, + "step": 46300 + }, + { + "epoch": 2.32, + "grad_norm": 2.347280979156494, + "learning_rate": 0.00016166231155778892, + "loss": 1.1651, + "step": 46400 + }, + { + "epoch": 2.33, + "grad_norm": 1.9565701484680176, + "learning_rate": 0.0001613608040201005, + "loss": 1.2142, + "step": 46500 + }, + { + "epoch": 2.33, + "grad_norm": 2.317847728729248, + "learning_rate": 0.00016105929648241204, + "loss": 1.188, + "step": 46600 + }, + { + "epoch": 2.33, + "grad_norm": 1.812322974205017, + "learning_rate": 0.0001607577889447236, + "loss": 1.1425, + "step": 46700 + }, + { + "epoch": 2.34, + "grad_norm": 2.5393502712249756, + "learning_rate": 0.00016045628140703514, + "loss": 1.1854, + "step": 46800 + }, + { + "epoch": 2.34, + "grad_norm": 6.562712669372559, + "learning_rate": 0.00016015477386934672, + "loss": 1.1517, + "step": 46900 + }, + { + "epoch": 2.35, + "grad_norm": 2.2086706161499023, + "learning_rate": 0.00015985326633165828, + "loss": 1.1634, + "step": 47000 + }, + { + "epoch": 2.35, + "eval_loss": 1.1972031593322754, + "eval_runtime": 43.2883, + "eval_samples_per_second": 23.101, + "eval_steps_per_second": 2.888, + "step": 47000 + }, + { + "epoch": 2.35, + "grad_norm": 2.061951160430908, + "learning_rate": 0.00015955175879396984, + "loss": 1.2409, + "step": 47100 + }, + { + "epoch": 2.36, + "grad_norm": 2.0312881469726562, + "learning_rate": 0.00015925025125628137, + "loss": 1.1731, + "step": 47200 + }, + { + "epoch": 2.37, + "grad_norm": 4.90245246887207, + "learning_rate": 0.00015894874371859296, + "loss": 1.1849, + "step": 47300 + }, + { + "epoch": 2.37, + "grad_norm": 2.4970901012420654, + "learning_rate": 0.00015864723618090452, + "loss": 1.1684, + "step": 47400 + }, + { + "epoch": 2.38, + "grad_norm": 2.4406049251556396, + "learning_rate": 0.00015834572864321605, + "loss": 1.1855, + "step": 47500 + }, + { + "epoch": 2.38, + "grad_norm": 2.8650543689727783, + "learning_rate": 0.0001580442211055276, + "loss": 1.1586, + "step": 47600 + }, + { + "epoch": 2.38, + "grad_norm": 2.4787731170654297, + "learning_rate": 0.0001577427135678392, + "loss": 1.1913, + "step": 47700 + }, + { + "epoch": 2.39, + "grad_norm": 2.5188841819763184, + "learning_rate": 0.00015744120603015076, + "loss": 1.1938, + "step": 47800 + }, + { + "epoch": 2.4, + "grad_norm": 3.8095650672912598, + "learning_rate": 0.0001571396984924623, + "loss": 1.1858, + "step": 47900 + }, + { + "epoch": 2.4, + "grad_norm": 2.147993564605713, + "learning_rate": 0.00015683819095477385, + "loss": 1.1703, + "step": 48000 + }, + { + "epoch": 2.4, + "eval_loss": 1.1952226161956787, + "eval_runtime": 42.4811, + "eval_samples_per_second": 23.54, + "eval_steps_per_second": 2.942, + "step": 48000 + }, + { + "epoch": 2.41, + "grad_norm": 3.050976514816284, + "learning_rate": 0.00015653668341708544, + "loss": 1.1868, + "step": 48100 + }, + { + "epoch": 2.41, + "grad_norm": 2.6880428791046143, + "learning_rate": 0.00015623517587939697, + "loss": 1.1486, + "step": 48200 + }, + { + "epoch": 2.42, + "grad_norm": 2.169895648956299, + "learning_rate": 0.00015593366834170853, + "loss": 1.1646, + "step": 48300 + }, + { + "epoch": 2.42, + "grad_norm": 9.948437690734863, + "learning_rate": 0.0001556321608040201, + "loss": 1.1625, + "step": 48400 + }, + { + "epoch": 2.42, + "grad_norm": 2.1219215393066406, + "learning_rate": 0.00015533065326633162, + "loss": 1.1854, + "step": 48500 + }, + { + "epoch": 2.43, + "grad_norm": 3.2466542720794678, + "learning_rate": 0.0001550291457286432, + "loss": 1.1556, + "step": 48600 + }, + { + "epoch": 2.44, + "grad_norm": 1.8362162113189697, + "learning_rate": 0.00015472763819095477, + "loss": 1.177, + "step": 48700 + }, + { + "epoch": 2.44, + "grad_norm": 3.579221725463867, + "learning_rate": 0.00015442613065326632, + "loss": 1.1671, + "step": 48800 + }, + { + "epoch": 2.44, + "grad_norm": 2.256967782974243, + "learning_rate": 0.00015412462311557786, + "loss": 1.1807, + "step": 48900 + }, + { + "epoch": 2.45, + "grad_norm": 2.107179641723633, + "learning_rate": 0.00015382311557788944, + "loss": 1.186, + "step": 49000 + }, + { + "epoch": 2.45, + "eval_loss": 1.1811304092407227, + "eval_runtime": 43.1582, + "eval_samples_per_second": 23.171, + "eval_steps_per_second": 2.896, + "step": 49000 + }, + { + "epoch": 2.46, + "grad_norm": 2.615290880203247, + "learning_rate": 0.000153521608040201, + "loss": 1.1828, + "step": 49100 + }, + { + "epoch": 2.46, + "grad_norm": 1.600845217704773, + "learning_rate": 0.00015322010050251254, + "loss": 1.1438, + "step": 49200 + }, + { + "epoch": 2.46, + "grad_norm": 2.272726058959961, + "learning_rate": 0.0001529185929648241, + "loss": 1.1802, + "step": 49300 + }, + { + "epoch": 2.47, + "grad_norm": 1.9845112562179565, + "learning_rate": 0.00015261708542713568, + "loss": 1.1828, + "step": 49400 + }, + { + "epoch": 2.48, + "grad_norm": 1.4725877046585083, + "learning_rate": 0.00015231859296482408, + "loss": 1.1938, + "step": 49500 + }, + { + "epoch": 2.48, + "grad_norm": 2.4453134536743164, + "learning_rate": 0.00015201708542713567, + "loss": 1.1928, + "step": 49600 + }, + { + "epoch": 2.48, + "grad_norm": 2.9869000911712646, + "learning_rate": 0.00015171557788944723, + "loss": 1.1982, + "step": 49700 + }, + { + "epoch": 2.49, + "grad_norm": 2.633794069290161, + "learning_rate": 0.00015141407035175879, + "loss": 1.1287, + "step": 49800 + }, + { + "epoch": 2.5, + "grad_norm": 1.8146005868911743, + "learning_rate": 0.00015111256281407032, + "loss": 1.1747, + "step": 49900 + }, + { + "epoch": 2.5, + "grad_norm": 6.4758405685424805, + "learning_rate": 0.0001508110552763819, + "loss": 1.1548, + "step": 50000 + }, + { + "epoch": 2.5, + "eval_loss": 1.1896699666976929, + "eval_runtime": 43.2315, + "eval_samples_per_second": 23.131, + "eval_steps_per_second": 2.891, + "step": 50000 + }, + { + "epoch": 2.5, + "grad_norm": 1.5688796043395996, + "learning_rate": 0.00015050954773869346, + "loss": 1.168, + "step": 50100 + }, + { + "epoch": 2.51, + "grad_norm": 1.4024161100387573, + "learning_rate": 0.000150208040201005, + "loss": 1.1796, + "step": 50200 + }, + { + "epoch": 2.52, + "grad_norm": 2.066570997238159, + "learning_rate": 0.00014990653266331658, + "loss": 1.1419, + "step": 50300 + }, + { + "epoch": 2.52, + "grad_norm": 3.7978389263153076, + "learning_rate": 0.00014960502512562812, + "loss": 1.1497, + "step": 50400 + }, + { + "epoch": 2.52, + "grad_norm": 2.2129733562469482, + "learning_rate": 0.0001493035175879397, + "loss": 1.1371, + "step": 50500 + }, + { + "epoch": 2.53, + "grad_norm": 3.0140724182128906, + "learning_rate": 0.00014900201005025123, + "loss": 1.1778, + "step": 50600 + }, + { + "epoch": 2.54, + "grad_norm": 2.457521915435791, + "learning_rate": 0.00014870050251256282, + "loss": 1.1266, + "step": 50700 + }, + { + "epoch": 2.54, + "grad_norm": 2.1066813468933105, + "learning_rate": 0.00014839899497487435, + "loss": 1.1635, + "step": 50800 + }, + { + "epoch": 2.54, + "grad_norm": 2.801196336746216, + "learning_rate": 0.0001480974874371859, + "loss": 1.1842, + "step": 50900 + }, + { + "epoch": 2.55, + "grad_norm": 4.693379878997803, + "learning_rate": 0.00014779597989949747, + "loss": 1.1449, + "step": 51000 + }, + { + "epoch": 2.55, + "eval_loss": 1.1495003700256348, + "eval_runtime": 37.9097, + "eval_samples_per_second": 26.378, + "eval_steps_per_second": 3.297, + "step": 51000 + }, + { + "epoch": 2.56, + "grad_norm": 1.917925477027893, + "learning_rate": 0.00014749447236180903, + "loss": 1.1303, + "step": 51100 + }, + { + "epoch": 2.56, + "grad_norm": 2.6460864543914795, + "learning_rate": 0.0001471929648241206, + "loss": 1.1638, + "step": 51200 + }, + { + "epoch": 2.56, + "grad_norm": 2.5040736198425293, + "learning_rate": 0.00014689145728643215, + "loss": 1.1382, + "step": 51300 + }, + { + "epoch": 2.57, + "grad_norm": 2.7533071041107178, + "learning_rate": 0.0001465899497487437, + "loss": 1.1803, + "step": 51400 + }, + { + "epoch": 2.58, + "grad_norm": 2.220345973968506, + "learning_rate": 0.00014629145728643214, + "loss": 1.1506, + "step": 51500 + }, + { + "epoch": 2.58, + "grad_norm": 1.3668216466903687, + "learning_rate": 0.0001459899497487437, + "loss": 1.1538, + "step": 51600 + }, + { + "epoch": 2.58, + "grad_norm": 2.26232647895813, + "learning_rate": 0.00014568844221105525, + "loss": 1.2085, + "step": 51700 + }, + { + "epoch": 2.59, + "grad_norm": 5.508904933929443, + "learning_rate": 0.00014538693467336681, + "loss": 1.1528, + "step": 51800 + }, + { + "epoch": 2.59, + "grad_norm": 2.9169905185699463, + "learning_rate": 0.00014508542713567837, + "loss": 1.1632, + "step": 51900 + }, + { + "epoch": 2.6, + "grad_norm": 2.5156240463256836, + "learning_rate": 0.00014478391959798993, + "loss": 1.1677, + "step": 52000 + }, + { + "epoch": 2.6, + "eval_loss": 1.174816370010376, + "eval_runtime": 42.0784, + "eval_samples_per_second": 23.765, + "eval_steps_per_second": 2.971, + "step": 52000 + }, + { + "epoch": 2.6, + "grad_norm": 1.622004747390747, + "learning_rate": 0.0001444824120603015, + "loss": 1.1174, + "step": 52100 + }, + { + "epoch": 2.61, + "grad_norm": 2.5255143642425537, + "learning_rate": 0.00014418090452261305, + "loss": 1.1415, + "step": 52200 + }, + { + "epoch": 2.62, + "grad_norm": 1.7780824899673462, + "learning_rate": 0.0001438793969849246, + "loss": 1.1871, + "step": 52300 + }, + { + "epoch": 2.62, + "grad_norm": 2.320028305053711, + "learning_rate": 0.00014357788944723617, + "loss": 1.1841, + "step": 52400 + }, + { + "epoch": 2.62, + "grad_norm": 2.6219685077667236, + "learning_rate": 0.00014327638190954773, + "loss": 1.1349, + "step": 52500 + }, + { + "epoch": 2.63, + "grad_norm": 3.0288233757019043, + "learning_rate": 0.0001429748743718593, + "loss": 1.1753, + "step": 52600 + }, + { + "epoch": 2.63, + "grad_norm": 2.3062517642974854, + "learning_rate": 0.00014267336683417085, + "loss": 1.1836, + "step": 52700 + }, + { + "epoch": 2.64, + "grad_norm": 1.8819166421890259, + "learning_rate": 0.0001423718592964824, + "loss": 1.1491, + "step": 52800 + }, + { + "epoch": 2.65, + "grad_norm": 1.7771334648132324, + "learning_rate": 0.00014207035175879397, + "loss": 1.1311, + "step": 52900 + }, + { + "epoch": 2.65, + "grad_norm": 1.9495539665222168, + "learning_rate": 0.00014176884422110553, + "loss": 1.1757, + "step": 53000 + }, + { + "epoch": 2.65, + "eval_loss": 1.161841869354248, + "eval_runtime": 41.6597, + "eval_samples_per_second": 24.004, + "eval_steps_per_second": 3.0, + "step": 53000 + }, + { + "epoch": 2.66, + "grad_norm": 2.317021131515503, + "learning_rate": 0.00014146733668341706, + "loss": 1.145, + "step": 53100 + }, + { + "epoch": 2.66, + "grad_norm": 1.4079538583755493, + "learning_rate": 0.00014116582914572865, + "loss": 1.0893, + "step": 53200 + }, + { + "epoch": 2.67, + "grad_norm": 6.593141555786133, + "learning_rate": 0.00014086432160804018, + "loss": 1.1357, + "step": 53300 + }, + { + "epoch": 2.67, + "grad_norm": 2.657529830932617, + "learning_rate": 0.00014056281407035177, + "loss": 1.1651, + "step": 53400 + }, + { + "epoch": 2.67, + "grad_norm": 3.312056541442871, + "learning_rate": 0.0001402613065326633, + "loss": 1.165, + "step": 53500 + }, + { + "epoch": 2.68, + "grad_norm": 2.1961281299591064, + "learning_rate": 0.00013995979899497486, + "loss": 1.1584, + "step": 53600 + }, + { + "epoch": 2.69, + "grad_norm": 1.933409571647644, + "learning_rate": 0.00013965829145728642, + "loss": 1.1382, + "step": 53700 + }, + { + "epoch": 2.69, + "grad_norm": 2.6763832569122314, + "learning_rate": 0.00013935678391959798, + "loss": 1.1238, + "step": 53800 + }, + { + "epoch": 2.69, + "grad_norm": 3.3957033157348633, + "learning_rate": 0.00013905527638190954, + "loss": 1.154, + "step": 53900 + }, + { + "epoch": 2.7, + "grad_norm": 3.526700019836426, + "learning_rate": 0.0001387537688442211, + "loss": 1.1325, + "step": 54000 + }, + { + "epoch": 2.7, + "eval_loss": 1.141178011894226, + "eval_runtime": 37.9667, + "eval_samples_per_second": 26.339, + "eval_steps_per_second": 3.292, + "step": 54000 + }, + { + "epoch": 2.71, + "grad_norm": 3.3937137126922607, + "learning_rate": 0.00013845226130653265, + "loss": 1.141, + "step": 54100 + }, + { + "epoch": 2.71, + "grad_norm": 1.9187488555908203, + "learning_rate": 0.00013815075376884421, + "loss": 1.1253, + "step": 54200 + }, + { + "epoch": 2.71, + "grad_norm": 2.2351136207580566, + "learning_rate": 0.00013784924623115577, + "loss": 1.2008, + "step": 54300 + }, + { + "epoch": 2.72, + "grad_norm": 3.97955584526062, + "learning_rate": 0.0001375477386934673, + "loss": 1.1609, + "step": 54400 + }, + { + "epoch": 2.73, + "grad_norm": 3.5734050273895264, + "learning_rate": 0.0001372462311557789, + "loss": 1.1584, + "step": 54500 + }, + { + "epoch": 2.73, + "grad_norm": 2.3804807662963867, + "learning_rate": 0.00013694472361809042, + "loss": 1.1343, + "step": 54600 + }, + { + "epoch": 2.73, + "grad_norm": 2.0606038570404053, + "learning_rate": 0.000136643216080402, + "loss": 1.1555, + "step": 54700 + }, + { + "epoch": 2.74, + "grad_norm": 4.046571731567383, + "learning_rate": 0.00013634170854271354, + "loss": 1.1543, + "step": 54800 + }, + { + "epoch": 2.75, + "grad_norm": 2.470393180847168, + "learning_rate": 0.00013604020100502513, + "loss": 1.1651, + "step": 54900 + }, + { + "epoch": 2.75, + "grad_norm": 1.4677540063858032, + "learning_rate": 0.00013573869346733666, + "loss": 1.1366, + "step": 55000 + }, + { + "epoch": 2.75, + "eval_loss": 1.1223907470703125, + "eval_runtime": 43.6458, + "eval_samples_per_second": 22.912, + "eval_steps_per_second": 2.864, + "step": 55000 + }, + { + "epoch": 2.75, + "grad_norm": 2.5567593574523926, + "learning_rate": 0.00013543718592964822, + "loss": 1.1348, + "step": 55100 + }, + { + "epoch": 2.76, + "grad_norm": 4.812506675720215, + "learning_rate": 0.00013513567839195978, + "loss": 1.1675, + "step": 55200 + }, + { + "epoch": 2.77, + "grad_norm": 2.5467748641967773, + "learning_rate": 0.00013483417085427134, + "loss": 1.1238, + "step": 55300 + }, + { + "epoch": 2.77, + "grad_norm": 4.469081878662109, + "learning_rate": 0.0001345326633165829, + "loss": 1.102, + "step": 55400 + }, + { + "epoch": 2.77, + "grad_norm": 3.878526449203491, + "learning_rate": 0.00013423115577889446, + "loss": 1.131, + "step": 55500 + }, + { + "epoch": 2.78, + "grad_norm": 2.0142953395843506, + "learning_rate": 0.00013392964824120602, + "loss": 1.1349, + "step": 55600 + }, + { + "epoch": 2.79, + "grad_norm": 2.600478410720825, + "learning_rate": 0.00013362814070351758, + "loss": 1.1363, + "step": 55700 + }, + { + "epoch": 2.79, + "grad_norm": 2.58322811126709, + "learning_rate": 0.00013332663316582914, + "loss": 1.1426, + "step": 55800 + }, + { + "epoch": 2.79, + "grad_norm": 2.2471609115600586, + "learning_rate": 0.0001330251256281407, + "loss": 1.1446, + "step": 55900 + }, + { + "epoch": 2.8, + "grad_norm": 1.8442782163619995, + "learning_rate": 0.00013272361809045226, + "loss": 1.1315, + "step": 56000 + }, + { + "epoch": 2.8, + "eval_loss": 1.1661006212234497, + "eval_runtime": 47.0159, + "eval_samples_per_second": 21.269, + "eval_steps_per_second": 2.659, + "step": 56000 + }, + { + "epoch": 2.81, + "grad_norm": 2.2928128242492676, + "learning_rate": 0.0001324221105527638, + "loss": 1.122, + "step": 56100 + }, + { + "epoch": 2.81, + "grad_norm": 2.192915201187134, + "learning_rate": 0.00013212361809045224, + "loss": 1.1251, + "step": 56200 + }, + { + "epoch": 2.81, + "grad_norm": 2.334547519683838, + "learning_rate": 0.00013182211055276383, + "loss": 1.1408, + "step": 56300 + }, + { + "epoch": 2.82, + "grad_norm": 1.832930088043213, + "learning_rate": 0.00013152060301507536, + "loss": 1.1146, + "step": 56400 + }, + { + "epoch": 2.83, + "grad_norm": 4.524071216583252, + "learning_rate": 0.00013121909547738692, + "loss": 1.1661, + "step": 56500 + }, + { + "epoch": 2.83, + "grad_norm": 1.4990063905715942, + "learning_rate": 0.00013091758793969848, + "loss": 1.1247, + "step": 56600 + }, + { + "epoch": 2.83, + "grad_norm": 3.572678804397583, + "learning_rate": 0.00013061608040201004, + "loss": 1.1251, + "step": 56700 + }, + { + "epoch": 2.84, + "grad_norm": 2.0090138912200928, + "learning_rate": 0.0001303145728643216, + "loss": 1.1267, + "step": 56800 + }, + { + "epoch": 2.84, + "grad_norm": 2.0328962802886963, + "learning_rate": 0.00013001306532663316, + "loss": 1.1343, + "step": 56900 + }, + { + "epoch": 2.85, + "grad_norm": 1.5744613409042358, + "learning_rate": 0.00012971155778894472, + "loss": 1.1208, + "step": 57000 + }, + { + "epoch": 2.85, + "eval_loss": 1.1388169527053833, + "eval_runtime": 65.7696, + "eval_samples_per_second": 15.205, + "eval_steps_per_second": 1.901, + "step": 57000 + }, + { + "epoch": 2.85, + "grad_norm": 1.2835485935211182, + "learning_rate": 0.00012941005025125628, + "loss": 1.1561, + "step": 57100 + }, + { + "epoch": 2.86, + "grad_norm": 3.413334846496582, + "learning_rate": 0.00012910854271356784, + "loss": 1.126, + "step": 57200 + }, + { + "epoch": 2.87, + "grad_norm": 2.6612489223480225, + "learning_rate": 0.00012880703517587937, + "loss": 1.1705, + "step": 57300 + }, + { + "epoch": 2.87, + "grad_norm": 2.0389411449432373, + "learning_rate": 0.00012850552763819096, + "loss": 1.1322, + "step": 57400 + }, + { + "epoch": 2.88, + "grad_norm": 2.203789710998535, + "learning_rate": 0.0001282040201005025, + "loss": 1.1437, + "step": 57500 + }, + { + "epoch": 2.88, + "grad_norm": 5.272101879119873, + "learning_rate": 0.00012790251256281407, + "loss": 1.1333, + "step": 57600 + }, + { + "epoch": 2.88, + "grad_norm": 3.0776541233062744, + "learning_rate": 0.0001276010050251256, + "loss": 1.1235, + "step": 57700 + }, + { + "epoch": 2.89, + "grad_norm": 3.8333828449249268, + "learning_rate": 0.0001272994974874372, + "loss": 1.1141, + "step": 57800 + }, + { + "epoch": 2.9, + "grad_norm": 3.3916189670562744, + "learning_rate": 0.00012699798994974873, + "loss": 1.1084, + "step": 57900 + }, + { + "epoch": 2.9, + "grad_norm": 1.6035398244857788, + "learning_rate": 0.00012669648241206029, + "loss": 1.1057, + "step": 58000 + }, + { + "epoch": 2.9, + "eval_loss": 1.1182321310043335, + "eval_runtime": 60.567, + "eval_samples_per_second": 16.511, + "eval_steps_per_second": 2.064, + "step": 58000 + }, + { + "epoch": 2.91, + "grad_norm": 2.41086745262146, + "learning_rate": 0.00012639497487437184, + "loss": 1.1424, + "step": 58100 + }, + { + "epoch": 2.91, + "grad_norm": 1.8278477191925049, + "learning_rate": 0.0001260934673366834, + "loss": 1.1126, + "step": 58200 + }, + { + "epoch": 2.92, + "grad_norm": 2.7294256687164307, + "learning_rate": 0.00012579195979899496, + "loss": 1.1207, + "step": 58300 + }, + { + "epoch": 2.92, + "grad_norm": 2.813084602355957, + "learning_rate": 0.00012549045226130652, + "loss": 1.1498, + "step": 58400 + }, + { + "epoch": 2.92, + "grad_norm": 2.6869473457336426, + "learning_rate": 0.00012519195979899495, + "loss": 1.1198, + "step": 58500 + }, + { + "epoch": 2.93, + "grad_norm": 1.8101871013641357, + "learning_rate": 0.00012489045226130654, + "loss": 1.1725, + "step": 58600 + }, + { + "epoch": 2.94, + "grad_norm": 4.7469305992126465, + "learning_rate": 0.00012458894472361807, + "loss": 1.1382, + "step": 58700 + }, + { + "epoch": 2.94, + "grad_norm": 1.8046541213989258, + "learning_rate": 0.00012428743718592965, + "loss": 1.082, + "step": 58800 + }, + { + "epoch": 2.94, + "grad_norm": 2.176015615463257, + "learning_rate": 0.0001239859296482412, + "loss": 1.1304, + "step": 58900 + }, + { + "epoch": 2.95, + "grad_norm": 1.8910236358642578, + "learning_rate": 0.00012368442211055277, + "loss": 1.1638, + "step": 59000 + }, + { + "epoch": 2.95, + "eval_loss": 1.1192156076431274, + "eval_runtime": 41.8257, + "eval_samples_per_second": 23.909, + "eval_steps_per_second": 2.989, + "step": 59000 + }, + { + "epoch": 2.96, + "grad_norm": 2.288358211517334, + "learning_rate": 0.0001233829145728643, + "loss": 1.1203, + "step": 59100 + }, + { + "epoch": 2.96, + "grad_norm": 1.9389914274215698, + "learning_rate": 0.00012308140703517586, + "loss": 1.0892, + "step": 59200 + }, + { + "epoch": 2.96, + "grad_norm": 2.1551334857940674, + "learning_rate": 0.00012277989949748742, + "loss": 1.1046, + "step": 59300 + }, + { + "epoch": 2.97, + "grad_norm": 1.5200018882751465, + "learning_rate": 0.00012247839195979898, + "loss": 1.1373, + "step": 59400 + }, + { + "epoch": 2.98, + "grad_norm": 2.45053768157959, + "learning_rate": 0.00012217688442211054, + "loss": 1.1403, + "step": 59500 + }, + { + "epoch": 2.98, + "grad_norm": 2.767160177230835, + "learning_rate": 0.00012187537688442209, + "loss": 1.0693, + "step": 59600 + }, + { + "epoch": 2.98, + "grad_norm": 2.3581674098968506, + "learning_rate": 0.00012157386934673366, + "loss": 1.125, + "step": 59700 + }, + { + "epoch": 2.99, + "grad_norm": 1.4579651355743408, + "learning_rate": 0.00012127236180904521, + "loss": 1.127, + "step": 59800 + }, + { + "epoch": 3.0, + "grad_norm": 4.08085298538208, + "learning_rate": 0.00012097085427135678, + "loss": 1.1539, + "step": 59900 + }, + { + "epoch": 3.0, + "grad_norm": 1.5620448589324951, + "learning_rate": 0.00012066934673366833, + "loss": 1.1372, + "step": 60000 + }, + { + "epoch": 3.0, + "eval_loss": 1.130272626876831, + "eval_runtime": 37.9665, + "eval_samples_per_second": 26.339, + "eval_steps_per_second": 3.292, + "step": 60000 + }, + { + "epoch": 3.0, + "grad_norm": 3.270860433578491, + "learning_rate": 0.00012036783919597989, + "loss": 1.0761, + "step": 60100 + }, + { + "epoch": 3.01, + "grad_norm": 2.5301287174224854, + "learning_rate": 0.00012006633165829145, + "loss": 1.0881, + "step": 60200 + }, + { + "epoch": 3.02, + "grad_norm": 2.5292015075683594, + "learning_rate": 0.000119764824120603, + "loss": 1.046, + "step": 60300 + }, + { + "epoch": 3.02, + "grad_norm": 2.8234751224517822, + "learning_rate": 0.00011946331658291456, + "loss": 1.0802, + "step": 60400 + }, + { + "epoch": 3.02, + "grad_norm": 2.536975860595703, + "learning_rate": 0.00011916180904522612, + "loss": 1.0993, + "step": 60500 + }, + { + "epoch": 3.03, + "grad_norm": 3.510464906692505, + "learning_rate": 0.00011886030150753767, + "loss": 1.1108, + "step": 60600 + }, + { + "epoch": 3.04, + "grad_norm": 1.9273101091384888, + "learning_rate": 0.00011855879396984924, + "loss": 1.1081, + "step": 60700 + }, + { + "epoch": 3.04, + "grad_norm": 2.1979687213897705, + "learning_rate": 0.00011825728643216079, + "loss": 1.1059, + "step": 60800 + }, + { + "epoch": 3.04, + "grad_norm": 2.097529172897339, + "learning_rate": 0.00011795577889447236, + "loss": 1.1098, + "step": 60900 + }, + { + "epoch": 3.05, + "grad_norm": 2.970689296722412, + "learning_rate": 0.00011765427135678391, + "loss": 1.0915, + "step": 61000 + }, + { + "epoch": 3.05, + "eval_loss": 1.0778993368148804, + "eval_runtime": 37.9552, + "eval_samples_per_second": 26.347, + "eval_steps_per_second": 3.293, + "step": 61000 + }, + { + "epoch": 3.06, + "grad_norm": 2.3489325046539307, + "learning_rate": 0.00011735577889447236, + "loss": 1.1174, + "step": 61100 + }, + { + "epoch": 3.06, + "grad_norm": 2.9280216693878174, + "learning_rate": 0.00011705427135678391, + "loss": 1.0977, + "step": 61200 + }, + { + "epoch": 3.06, + "grad_norm": 2.231684446334839, + "learning_rate": 0.00011675276381909548, + "loss": 1.1004, + "step": 61300 + }, + { + "epoch": 3.07, + "grad_norm": 1.8373113870620728, + "learning_rate": 0.00011645125628140703, + "loss": 1.109, + "step": 61400 + }, + { + "epoch": 3.08, + "grad_norm": 3.446971893310547, + "learning_rate": 0.00011614974874371859, + "loss": 1.092, + "step": 61500 + }, + { + "epoch": 3.08, + "grad_norm": 2.2681097984313965, + "learning_rate": 0.00011584824120603014, + "loss": 1.0901, + "step": 61600 + }, + { + "epoch": 3.08, + "grad_norm": 2.173755407333374, + "learning_rate": 0.0001155467336683417, + "loss": 1.0638, + "step": 61700 + }, + { + "epoch": 3.09, + "grad_norm": 3.3374030590057373, + "learning_rate": 0.00011524522613065325, + "loss": 1.1036, + "step": 61800 + }, + { + "epoch": 3.1, + "grad_norm": 2.082169771194458, + "learning_rate": 0.00011494371859296481, + "loss": 1.0737, + "step": 61900 + }, + { + "epoch": 3.1, + "grad_norm": 2.741830587387085, + "learning_rate": 0.00011464221105527637, + "loss": 1.0705, + "step": 62000 + }, + { + "epoch": 3.1, + "eval_loss": 1.079288125038147, + "eval_runtime": 37.9824, + "eval_samples_per_second": 26.328, + "eval_steps_per_second": 3.291, + "step": 62000 + }, + { + "epoch": 3.1, + "grad_norm": 2.128262996673584, + "learning_rate": 0.00011434070351758793, + "loss": 1.0964, + "step": 62100 + }, + { + "epoch": 3.11, + "grad_norm": 2.100025177001953, + "learning_rate": 0.00011403919597989949, + "loss": 1.0951, + "step": 62200 + }, + { + "epoch": 3.12, + "grad_norm": 7.355963706970215, + "learning_rate": 0.00011373768844221103, + "loss": 1.128, + "step": 62300 + }, + { + "epoch": 3.12, + "grad_norm": 2.6374123096466064, + "learning_rate": 0.0001134361809045226, + "loss": 1.0928, + "step": 62400 + }, + { + "epoch": 3.12, + "grad_norm": 2.6389834880828857, + "learning_rate": 0.00011313467336683415, + "loss": 1.1067, + "step": 62500 + }, + { + "epoch": 3.13, + "grad_norm": 3.367866277694702, + "learning_rate": 0.00011283316582914573, + "loss": 1.0719, + "step": 62600 + }, + { + "epoch": 3.13, + "grad_norm": 2.0250422954559326, + "learning_rate": 0.00011253165829145727, + "loss": 1.0967, + "step": 62700 + }, + { + "epoch": 3.14, + "grad_norm": 3.8763527870178223, + "learning_rate": 0.00011223015075376884, + "loss": 1.0819, + "step": 62800 + }, + { + "epoch": 3.15, + "grad_norm": 2.7926995754241943, + "learning_rate": 0.00011192864321608039, + "loss": 1.1123, + "step": 62900 + }, + { + "epoch": 3.15, + "grad_norm": 2.5031745433807373, + "learning_rate": 0.00011162713567839195, + "loss": 1.0725, + "step": 63000 + }, + { + "epoch": 3.15, + "eval_loss": 1.117138147354126, + "eval_runtime": 37.9757, + "eval_samples_per_second": 26.333, + "eval_steps_per_second": 3.292, + "step": 63000 + }, + { + "epoch": 3.15, + "grad_norm": 2.086465835571289, + "learning_rate": 0.00011132562814070351, + "loss": 1.0588, + "step": 63100 + }, + { + "epoch": 3.16, + "grad_norm": 3.295759439468384, + "learning_rate": 0.00011102412060301507, + "loss": 1.1175, + "step": 63200 + }, + { + "epoch": 3.17, + "grad_norm": 2.666032075881958, + "learning_rate": 0.00011072261306532661, + "loss": 1.0963, + "step": 63300 + }, + { + "epoch": 3.17, + "grad_norm": 1.8267697095870972, + "learning_rate": 0.00011042110552763819, + "loss": 1.0691, + "step": 63400 + }, + { + "epoch": 3.17, + "grad_norm": 2.682745933532715, + "learning_rate": 0.00011011959798994973, + "loss": 1.0671, + "step": 63500 + }, + { + "epoch": 3.18, + "grad_norm": 2.914111375808716, + "learning_rate": 0.00010982110552763819, + "loss": 1.0809, + "step": 63600 + }, + { + "epoch": 3.19, + "grad_norm": 2.7258005142211914, + "learning_rate": 0.00010951959798994973, + "loss": 1.0527, + "step": 63700 + }, + { + "epoch": 3.19, + "grad_norm": 2.646939992904663, + "learning_rate": 0.0001092180904522613, + "loss": 1.0523, + "step": 63800 + }, + { + "epoch": 3.19, + "grad_norm": 2.107849359512329, + "learning_rate": 0.00010891658291457285, + "loss": 1.0629, + "step": 63900 + }, + { + "epoch": 3.2, + "grad_norm": 1.9583218097686768, + "learning_rate": 0.00010861507537688442, + "loss": 1.065, + "step": 64000 + }, + { + "epoch": 3.2, + "eval_loss": 1.1121866703033447, + "eval_runtime": 37.9368, + "eval_samples_per_second": 26.36, + "eval_steps_per_second": 3.295, + "step": 64000 + }, + { + "epoch": 3.21, + "grad_norm": 2.384493589401245, + "learning_rate": 0.00010831356783919597, + "loss": 1.0664, + "step": 64100 + }, + { + "epoch": 3.21, + "grad_norm": 2.060441732406616, + "learning_rate": 0.00010801206030150753, + "loss": 1.0762, + "step": 64200 + }, + { + "epoch": 3.21, + "grad_norm": 6.751837253570557, + "learning_rate": 0.00010771055276381909, + "loss": 1.0553, + "step": 64300 + }, + { + "epoch": 3.22, + "grad_norm": 2.9765820503234863, + "learning_rate": 0.00010740904522613064, + "loss": 1.0636, + "step": 64400 + }, + { + "epoch": 3.23, + "grad_norm": 2.2694509029388428, + "learning_rate": 0.00010710753768844221, + "loss": 1.1031, + "step": 64500 + }, + { + "epoch": 3.23, + "grad_norm": 3.272937536239624, + "learning_rate": 0.00010680603015075375, + "loss": 1.1053, + "step": 64600 + }, + { + "epoch": 3.23, + "grad_norm": 3.242722988128662, + "learning_rate": 0.00010650452261306531, + "loss": 1.1013, + "step": 64700 + }, + { + "epoch": 3.24, + "grad_norm": 2.7234878540039062, + "learning_rate": 0.00010620301507537687, + "loss": 1.0428, + "step": 64800 + }, + { + "epoch": 3.25, + "grad_norm": 2.30928373336792, + "learning_rate": 0.00010590150753768843, + "loss": 1.067, + "step": 64900 + }, + { + "epoch": 3.25, + "grad_norm": 4.809457302093506, + "learning_rate": 0.00010559999999999998, + "loss": 1.053, + "step": 65000 + }, + { + "epoch": 3.25, + "eval_loss": 1.1082242727279663, + "eval_runtime": 37.9286, + "eval_samples_per_second": 26.365, + "eval_steps_per_second": 3.296, + "step": 65000 + }, + { + "epoch": 3.25, + "grad_norm": 2.282684087753296, + "learning_rate": 0.00010529849246231155, + "loss": 1.0547, + "step": 65100 + }, + { + "epoch": 3.26, + "grad_norm": 3.756114959716797, + "learning_rate": 0.0001049969849246231, + "loss": 1.0435, + "step": 65200 + }, + { + "epoch": 3.27, + "grad_norm": 3.709932565689087, + "learning_rate": 0.00010469547738693467, + "loss": 1.0678, + "step": 65300 + }, + { + "epoch": 3.27, + "grad_norm": 1.6080820560455322, + "learning_rate": 0.00010439396984924622, + "loss": 1.101, + "step": 65400 + }, + { + "epoch": 3.27, + "grad_norm": 2.2617008686065674, + "learning_rate": 0.00010409246231155779, + "loss": 1.0729, + "step": 65500 + }, + { + "epoch": 3.28, + "grad_norm": 3.1394824981689453, + "learning_rate": 0.00010379095477386933, + "loss": 1.0861, + "step": 65600 + }, + { + "epoch": 3.29, + "grad_norm": 2.8208096027374268, + "learning_rate": 0.0001034894472361809, + "loss": 1.0535, + "step": 65700 + }, + { + "epoch": 3.29, + "grad_norm": 2.7133829593658447, + "learning_rate": 0.00010318793969849245, + "loss": 1.0498, + "step": 65800 + }, + { + "epoch": 3.29, + "grad_norm": 2.2674591541290283, + "learning_rate": 0.00010288643216080401, + "loss": 1.0861, + "step": 65900 + }, + { + "epoch": 3.3, + "grad_norm": 2.238206386566162, + "learning_rate": 0.00010258492462311557, + "loss": 1.0557, + "step": 66000 + }, + { + "epoch": 3.3, + "eval_loss": 1.0877478122711182, + "eval_runtime": 37.9734, + "eval_samples_per_second": 26.334, + "eval_steps_per_second": 3.292, + "step": 66000 + }, + { + "epoch": 3.31, + "grad_norm": 1.8776639699935913, + "learning_rate": 0.00010228643216080401, + "loss": 1.0898, + "step": 66100 + }, + { + "epoch": 3.31, + "grad_norm": 2.540071725845337, + "learning_rate": 0.00010198492462311557, + "loss": 1.0437, + "step": 66200 + }, + { + "epoch": 3.31, + "grad_norm": 3.616443157196045, + "learning_rate": 0.00010168341708542713, + "loss": 1.0698, + "step": 66300 + }, + { + "epoch": 3.32, + "grad_norm": 2.866360902786255, + "learning_rate": 0.00010138190954773868, + "loss": 1.0666, + "step": 66400 + }, + { + "epoch": 3.33, + "grad_norm": 3.1752941608428955, + "learning_rate": 0.00010108040201005025, + "loss": 1.0723, + "step": 66500 + }, + { + "epoch": 3.33, + "grad_norm": 4.475529193878174, + "learning_rate": 0.0001007788944723618, + "loss": 1.105, + "step": 66600 + }, + { + "epoch": 3.33, + "grad_norm": 2.9230782985687256, + "learning_rate": 0.00010047738693467337, + "loss": 1.0674, + "step": 66700 + }, + { + "epoch": 3.34, + "grad_norm": 4.472579479217529, + "learning_rate": 0.00010017587939698491, + "loss": 1.0798, + "step": 66800 + }, + { + "epoch": 3.34, + "grad_norm": 2.9080252647399902, + "learning_rate": 9.987437185929649e-05, + "loss": 1.0789, + "step": 66900 + }, + { + "epoch": 3.35, + "grad_norm": 2.728170394897461, + "learning_rate": 9.957286432160803e-05, + "loss": 1.0771, + "step": 67000 + }, + { + "epoch": 3.35, + "eval_loss": 1.0558359622955322, + "eval_runtime": 37.9887, + "eval_samples_per_second": 26.324, + "eval_steps_per_second": 3.29, + "step": 67000 + }, + { + "epoch": 3.35, + "grad_norm": 2.227384328842163, + "learning_rate": 9.927135678391958e-05, + "loss": 1.0336, + "step": 67100 + }, + { + "epoch": 3.36, + "grad_norm": 2.5888235569000244, + "learning_rate": 9.896984924623115e-05, + "loss": 1.0525, + "step": 67200 + }, + { + "epoch": 3.37, + "grad_norm": 1.9375131130218506, + "learning_rate": 9.86683417085427e-05, + "loss": 1.1218, + "step": 67300 + }, + { + "epoch": 3.37, + "grad_norm": 1.8543367385864258, + "learning_rate": 9.836683417085426e-05, + "loss": 1.0761, + "step": 67400 + }, + { + "epoch": 3.38, + "grad_norm": 3.050717353820801, + "learning_rate": 9.806532663316582e-05, + "loss": 1.07, + "step": 67500 + }, + { + "epoch": 3.38, + "grad_norm": 3.321708917617798, + "learning_rate": 9.776381909547738e-05, + "loss": 1.0606, + "step": 67600 + }, + { + "epoch": 3.38, + "grad_norm": 2.958376407623291, + "learning_rate": 9.746231155778894e-05, + "loss": 1.0608, + "step": 67700 + }, + { + "epoch": 3.39, + "grad_norm": 2.215822219848633, + "learning_rate": 9.71608040201005e-05, + "loss": 1.0605, + "step": 67800 + }, + { + "epoch": 3.4, + "grad_norm": 2.430649518966675, + "learning_rate": 9.685929648241204e-05, + "loss": 1.0783, + "step": 67900 + }, + { + "epoch": 3.4, + "grad_norm": 2.4160895347595215, + "learning_rate": 9.655778894472361e-05, + "loss": 1.0783, + "step": 68000 + }, + { + "epoch": 3.4, + "eval_loss": 1.1083147525787354, + "eval_runtime": 37.9578, + "eval_samples_per_second": 26.345, + "eval_steps_per_second": 3.293, + "step": 68000 + }, + { + "epoch": 3.41, + "grad_norm": 3.5485310554504395, + "learning_rate": 9.625628140703516e-05, + "loss": 1.0299, + "step": 68100 + }, + { + "epoch": 3.41, + "grad_norm": 2.0450522899627686, + "learning_rate": 9.595477386934673e-05, + "loss": 1.0662, + "step": 68200 + }, + { + "epoch": 3.42, + "grad_norm": 2.339768171310425, + "learning_rate": 9.565326633165828e-05, + "loss": 1.0781, + "step": 68300 + }, + { + "epoch": 3.42, + "grad_norm": 2.055027484893799, + "learning_rate": 9.535477386934673e-05, + "loss": 1.0586, + "step": 68400 + }, + { + "epoch": 3.42, + "grad_norm": 3.186723232269287, + "learning_rate": 9.505326633165828e-05, + "loss": 1.071, + "step": 68500 + }, + { + "epoch": 3.43, + "grad_norm": 2.934070587158203, + "learning_rate": 9.475175879396985e-05, + "loss": 1.0474, + "step": 68600 + }, + { + "epoch": 3.44, + "grad_norm": 4.080368995666504, + "learning_rate": 9.44502512562814e-05, + "loss": 1.0376, + "step": 68700 + }, + { + "epoch": 3.44, + "grad_norm": 9.1796236038208, + "learning_rate": 9.415175879396985e-05, + "loss": 1.0362, + "step": 68800 + }, + { + "epoch": 3.44, + "grad_norm": 2.9005532264709473, + "learning_rate": 9.38502512562814e-05, + "loss": 1.0581, + "step": 68900 + }, + { + "epoch": 3.45, + "grad_norm": 2.2525532245635986, + "learning_rate": 9.354874371859296e-05, + "loss": 1.0664, + "step": 69000 + }, + { + "epoch": 3.45, + "eval_loss": 1.0977917909622192, + "eval_runtime": 37.9301, + "eval_samples_per_second": 26.364, + "eval_steps_per_second": 3.296, + "step": 69000 + }, + { + "epoch": 3.46, + "grad_norm": 4.754021644592285, + "learning_rate": 9.324723618090452e-05, + "loss": 1.0512, + "step": 69100 + }, + { + "epoch": 3.46, + "grad_norm": 2.1440653800964355, + "learning_rate": 9.294572864321607e-05, + "loss": 1.0653, + "step": 69200 + }, + { + "epoch": 3.46, + "grad_norm": 2.278679609298706, + "learning_rate": 9.264422110552762e-05, + "loss": 1.0466, + "step": 69300 + }, + { + "epoch": 3.47, + "grad_norm": 2.176259994506836, + "learning_rate": 9.23427135678392e-05, + "loss": 1.0664, + "step": 69400 + }, + { + "epoch": 3.48, + "grad_norm": 2.2514779567718506, + "learning_rate": 9.204120603015074e-05, + "loss": 1.0597, + "step": 69500 + }, + { + "epoch": 3.48, + "grad_norm": 3.136343002319336, + "learning_rate": 9.173969849246231e-05, + "loss": 1.0742, + "step": 69600 + }, + { + "epoch": 3.48, + "grad_norm": 1.6031814813613892, + "learning_rate": 9.143819095477386e-05, + "loss": 1.0435, + "step": 69700 + }, + { + "epoch": 3.49, + "grad_norm": 5.727216720581055, + "learning_rate": 9.113668341708543e-05, + "loss": 1.0837, + "step": 69800 + }, + { + "epoch": 3.5, + "grad_norm": 2.909613609313965, + "learning_rate": 9.083517587939698e-05, + "loss": 1.0292, + "step": 69900 + }, + { + "epoch": 3.5, + "grad_norm": 2.8508193492889404, + "learning_rate": 9.053366834170854e-05, + "loss": 1.0643, + "step": 70000 + }, + { + "epoch": 3.5, + "eval_loss": 1.0314569473266602, + "eval_runtime": 45.3565, + "eval_samples_per_second": 22.048, + "eval_steps_per_second": 2.756, + "step": 70000 + }, + { + "epoch": 3.5, + "grad_norm": 1.3868812322616577, + "learning_rate": 9.02321608040201e-05, + "loss": 1.0719, + "step": 70100 + }, + { + "epoch": 3.51, + "grad_norm": 2.059966564178467, + "learning_rate": 8.993065326633164e-05, + "loss": 1.0496, + "step": 70200 + }, + { + "epoch": 3.52, + "grad_norm": 2.371212959289551, + "learning_rate": 8.962914572864322e-05, + "loss": 1.0416, + "step": 70300 + }, + { + "epoch": 3.52, + "grad_norm": 5.051455497741699, + "learning_rate": 8.932763819095476e-05, + "loss": 1.0817, + "step": 70400 + }, + { + "epoch": 3.52, + "grad_norm": 2.4436607360839844, + "learning_rate": 8.902613065326632e-05, + "loss": 1.0434, + "step": 70500 + }, + { + "epoch": 3.53, + "grad_norm": 2.097843885421753, + "learning_rate": 8.872462311557788e-05, + "loss": 1.06, + "step": 70600 + }, + { + "epoch": 3.54, + "grad_norm": 3.9826953411102295, + "learning_rate": 8.842311557788944e-05, + "loss": 1.0921, + "step": 70700 + }, + { + "epoch": 3.54, + "grad_norm": 3.572988748550415, + "learning_rate": 8.812160804020099e-05, + "loss": 1.0503, + "step": 70800 + }, + { + "epoch": 3.54, + "grad_norm": 3.2607603073120117, + "learning_rate": 8.782010050251256e-05, + "loss": 1.0308, + "step": 70900 + }, + { + "epoch": 3.55, + "grad_norm": 2.152568817138672, + "learning_rate": 8.75185929648241e-05, + "loss": 1.0508, + "step": 71000 + }, + { + "epoch": 3.55, + "eval_loss": 1.035280704498291, + "eval_runtime": 44.3432, + "eval_samples_per_second": 22.551, + "eval_steps_per_second": 2.819, + "step": 71000 + }, + { + "epoch": 3.56, + "grad_norm": 1.5636742115020752, + "learning_rate": 8.721708542713568e-05, + "loss": 1.0177, + "step": 71100 + }, + { + "epoch": 3.56, + "grad_norm": 1.9526029825210571, + "learning_rate": 8.691557788944722e-05, + "loss": 1.0516, + "step": 71200 + }, + { + "epoch": 3.56, + "grad_norm": 2.2071800231933594, + "learning_rate": 8.66140703517588e-05, + "loss": 1.034, + "step": 71300 + }, + { + "epoch": 3.57, + "grad_norm": 2.6768360137939453, + "learning_rate": 8.631256281407034e-05, + "loss": 1.0642, + "step": 71400 + }, + { + "epoch": 3.58, + "grad_norm": 1.6602065563201904, + "learning_rate": 8.60110552763819e-05, + "loss": 1.0389, + "step": 71500 + }, + { + "epoch": 3.58, + "grad_norm": 2.439145565032959, + "learning_rate": 8.570954773869346e-05, + "loss": 1.0536, + "step": 71600 + }, + { + "epoch": 3.58, + "grad_norm": 6.254899978637695, + "learning_rate": 8.54110552763819e-05, + "loss": 1.0141, + "step": 71700 + }, + { + "epoch": 3.59, + "grad_norm": 1.8221715688705444, + "learning_rate": 8.510954773869346e-05, + "loss": 1.044, + "step": 71800 + }, + { + "epoch": 3.59, + "grad_norm": 4.5664849281311035, + "learning_rate": 8.480804020100502e-05, + "loss": 1.0665, + "step": 71900 + }, + { + "epoch": 3.6, + "grad_norm": 2.4576423168182373, + "learning_rate": 8.450653266331658e-05, + "loss": 1.0615, + "step": 72000 + }, + { + "epoch": 3.6, + "eval_loss": 1.031246542930603, + "eval_runtime": 42.8264, + "eval_samples_per_second": 23.35, + "eval_steps_per_second": 2.919, + "step": 72000 + }, + { + "epoch": 3.6, + "grad_norm": 2.763627290725708, + "learning_rate": 8.420502512562814e-05, + "loss": 1.0333, + "step": 72100 + }, + { + "epoch": 3.61, + "grad_norm": 1.6231377124786377, + "learning_rate": 8.390351758793968e-05, + "loss": 1.0572, + "step": 72200 + }, + { + "epoch": 3.62, + "grad_norm": 1.9768860340118408, + "learning_rate": 8.360201005025126e-05, + "loss": 1.0423, + "step": 72300 + }, + { + "epoch": 3.62, + "grad_norm": 2.292513132095337, + "learning_rate": 8.33005025125628e-05, + "loss": 1.0655, + "step": 72400 + }, + { + "epoch": 3.62, + "grad_norm": 2.1181390285491943, + "learning_rate": 8.299899497487438e-05, + "loss": 1.0216, + "step": 72500 + }, + { + "epoch": 3.63, + "grad_norm": 2.3944106101989746, + "learning_rate": 8.269748743718592e-05, + "loss": 1.0585, + "step": 72600 + }, + { + "epoch": 3.63, + "grad_norm": 1.5745407342910767, + "learning_rate": 8.23959798994975e-05, + "loss": 1.0629, + "step": 72700 + }, + { + "epoch": 3.64, + "grad_norm": 2.130709648132324, + "learning_rate": 8.209447236180904e-05, + "loss": 1.0027, + "step": 72800 + }, + { + "epoch": 3.65, + "grad_norm": 3.202035427093506, + "learning_rate": 8.179296482412059e-05, + "loss": 1.0385, + "step": 72900 + }, + { + "epoch": 3.65, + "grad_norm": 2.009536027908325, + "learning_rate": 8.149145728643216e-05, + "loss": 1.0471, + "step": 73000 + }, + { + "epoch": 3.65, + "eval_loss": 1.047244668006897, + "eval_runtime": 38.0986, + "eval_samples_per_second": 26.248, + "eval_steps_per_second": 3.281, + "step": 73000 + }, + { + "epoch": 3.66, + "grad_norm": 5.239896774291992, + "learning_rate": 8.11899497487437e-05, + "loss": 1.0527, + "step": 73100 + }, + { + "epoch": 3.66, + "grad_norm": 3.438692808151245, + "learning_rate": 8.088844221105527e-05, + "loss": 1.0198, + "step": 73200 + }, + { + "epoch": 3.67, + "grad_norm": 2.0132901668548584, + "learning_rate": 8.058693467336682e-05, + "loss": 0.989, + "step": 73300 + }, + { + "epoch": 3.67, + "grad_norm": 2.9494431018829346, + "learning_rate": 8.028542713567838e-05, + "loss": 1.0329, + "step": 73400 + }, + { + "epoch": 3.67, + "grad_norm": 2.8393380641937256, + "learning_rate": 7.998391959798994e-05, + "loss": 1.043, + "step": 73500 + }, + { + "epoch": 3.68, + "grad_norm": 3.039391279220581, + "learning_rate": 7.96824120603015e-05, + "loss": 1.0035, + "step": 73600 + }, + { + "epoch": 3.69, + "grad_norm": 3.696676731109619, + "learning_rate": 7.938090452261305e-05, + "loss": 1.0472, + "step": 73700 + }, + { + "epoch": 3.69, + "grad_norm": 2.8557331562042236, + "learning_rate": 7.907939698492462e-05, + "loss": 1.0665, + "step": 73800 + }, + { + "epoch": 3.69, + "grad_norm": 3.7987170219421387, + "learning_rate": 7.877788944723617e-05, + "loss": 1.0233, + "step": 73900 + }, + { + "epoch": 3.7, + "grad_norm": 1.9759894609451294, + "learning_rate": 7.847638190954774e-05, + "loss": 1.0303, + "step": 74000 + }, + { + "epoch": 3.7, + "eval_loss": 1.0124469995498657, + "eval_runtime": 38.8346, + "eval_samples_per_second": 25.75, + "eval_steps_per_second": 3.219, + "step": 74000 + }, + { + "epoch": 3.71, + "grad_norm": 1.9311368465423584, + "learning_rate": 7.817487437185929e-05, + "loss": 1.0479, + "step": 74100 + }, + { + "epoch": 3.71, + "grad_norm": 4.948327541351318, + "learning_rate": 7.787336683417086e-05, + "loss": 1.0197, + "step": 74200 + }, + { + "epoch": 3.71, + "grad_norm": 2.6867167949676514, + "learning_rate": 7.75718592964824e-05, + "loss": 1.0209, + "step": 74300 + }, + { + "epoch": 3.72, + "grad_norm": 1.8292616605758667, + "learning_rate": 7.727035175879396e-05, + "loss": 1.0257, + "step": 74400 + }, + { + "epoch": 3.73, + "grad_norm": 3.2925384044647217, + "learning_rate": 7.696884422110552e-05, + "loss": 1.0635, + "step": 74500 + }, + { + "epoch": 3.73, + "grad_norm": 2.2040624618530273, + "learning_rate": 7.666733668341708e-05, + "loss": 1.0285, + "step": 74600 + }, + { + "epoch": 3.73, + "grad_norm": 2.1025142669677734, + "learning_rate": 7.636582914572863e-05, + "loss": 1.05, + "step": 74700 + }, + { + "epoch": 3.74, + "grad_norm": 2.409148693084717, + "learning_rate": 7.60643216080402e-05, + "loss": 1.0638, + "step": 74800 + }, + { + "epoch": 3.75, + "grad_norm": 3.284660577774048, + "learning_rate": 7.576281407035175e-05, + "loss": 1.0203, + "step": 74900 + }, + { + "epoch": 3.75, + "grad_norm": 2.3454208374023438, + "learning_rate": 7.546130653266332e-05, + "loss": 1.0425, + "step": 75000 + }, + { + "epoch": 3.75, + "eval_loss": 1.0414044857025146, + "eval_runtime": 38.2892, + "eval_samples_per_second": 26.117, + "eval_steps_per_second": 3.265, + "step": 75000 + }, + { + "epoch": 3.75, + "grad_norm": 2.6853275299072266, + "learning_rate": 7.515979899497487e-05, + "loss": 0.9762, + "step": 75100 + }, + { + "epoch": 3.76, + "grad_norm": 1.439287543296814, + "learning_rate": 7.485829145728643e-05, + "loss": 0.9955, + "step": 75200 + }, + { + "epoch": 3.77, + "grad_norm": 2.0795187950134277, + "learning_rate": 7.455678391959799e-05, + "loss": 1.0148, + "step": 75300 + }, + { + "epoch": 3.77, + "grad_norm": 2.318300247192383, + "learning_rate": 7.425527638190955e-05, + "loss": 1.0368, + "step": 75400 + }, + { + "epoch": 3.77, + "grad_norm": 2.979464054107666, + "learning_rate": 7.39537688442211e-05, + "loss": 1.0233, + "step": 75500 + }, + { + "epoch": 3.78, + "grad_norm": 2.384615421295166, + "learning_rate": 7.365226130653266e-05, + "loss": 1.0183, + "step": 75600 + }, + { + "epoch": 3.79, + "grad_norm": 2.2947332859039307, + "learning_rate": 7.335075376884421e-05, + "loss": 1.046, + "step": 75700 + }, + { + "epoch": 3.79, + "grad_norm": 2.707266330718994, + "learning_rate": 7.304924623115577e-05, + "loss": 1.0145, + "step": 75800 + }, + { + "epoch": 3.79, + "grad_norm": 1.8125189542770386, + "learning_rate": 7.275075376884422e-05, + "loss": 1.0508, + "step": 75900 + }, + { + "epoch": 3.8, + "grad_norm": 1.833924412727356, + "learning_rate": 7.244924623115577e-05, + "loss": 1.051, + "step": 76000 + }, + { + "epoch": 3.8, + "eval_loss": 1.0207512378692627, + "eval_runtime": 38.1696, + "eval_samples_per_second": 26.199, + "eval_steps_per_second": 3.275, + "step": 76000 + }, + { + "epoch": 3.81, + "grad_norm": 2.3891940116882324, + "learning_rate": 7.214773869346733e-05, + "loss": 1.0006, + "step": 76100 + }, + { + "epoch": 3.81, + "grad_norm": 2.6063296794891357, + "learning_rate": 7.184623115577889e-05, + "loss": 1.0011, + "step": 76200 + }, + { + "epoch": 3.81, + "grad_norm": 1.7001017332077026, + "learning_rate": 7.154472361809045e-05, + "loss": 1.0172, + "step": 76300 + }, + { + "epoch": 3.82, + "grad_norm": 2.0134339332580566, + "learning_rate": 7.124321608040201e-05, + "loss": 1.0367, + "step": 76400 + }, + { + "epoch": 3.83, + "grad_norm": 2.199366807937622, + "learning_rate": 7.094170854271357e-05, + "loss": 1.044, + "step": 76500 + }, + { + "epoch": 3.83, + "grad_norm": 2.8991353511810303, + "learning_rate": 7.064020100502511e-05, + "loss": 1.0121, + "step": 76600 + }, + { + "epoch": 3.83, + "grad_norm": 5.798487663269043, + "learning_rate": 7.033869346733667e-05, + "loss": 0.9734, + "step": 76700 + }, + { + "epoch": 3.84, + "grad_norm": 2.8960068225860596, + "learning_rate": 7.003718592964823e-05, + "loss": 1.004, + "step": 76800 + }, + { + "epoch": 3.84, + "grad_norm": 2.980179786682129, + "learning_rate": 6.973567839195979e-05, + "loss": 1.0118, + "step": 76900 + }, + { + "epoch": 3.85, + "grad_norm": 6.4917988777160645, + "learning_rate": 6.943417085427135e-05, + "loss": 0.9682, + "step": 77000 + }, + { + "epoch": 3.85, + "eval_loss": 1.0282562971115112, + "eval_runtime": 38.0717, + "eval_samples_per_second": 26.266, + "eval_steps_per_second": 3.283, + "step": 77000 + }, + { + "epoch": 3.85, + "grad_norm": 2.9224038124084473, + "learning_rate": 6.913266331658291e-05, + "loss": 1.0385, + "step": 77100 + }, + { + "epoch": 3.86, + "grad_norm": 4.447437763214111, + "learning_rate": 6.883115577889447e-05, + "loss": 1.0388, + "step": 77200 + }, + { + "epoch": 3.87, + "grad_norm": 2.2013559341430664, + "learning_rate": 6.852964824120603e-05, + "loss": 1.034, + "step": 77300 + }, + { + "epoch": 3.87, + "grad_norm": 1.3720605373382568, + "learning_rate": 6.822814070351757e-05, + "loss": 1.0512, + "step": 77400 + }, + { + "epoch": 3.88, + "grad_norm": 2.4448797702789307, + "learning_rate": 6.792663316582913e-05, + "loss": 1.0012, + "step": 77500 + }, + { + "epoch": 3.88, + "grad_norm": 4.061469554901123, + "learning_rate": 6.762512562814069e-05, + "loss": 1.0144, + "step": 77600 + }, + { + "epoch": 3.88, + "grad_norm": 1.62380850315094, + "learning_rate": 6.732361809045225e-05, + "loss": 1.0369, + "step": 77700 + }, + { + "epoch": 3.89, + "grad_norm": 1.3728336095809937, + "learning_rate": 6.702211055276381e-05, + "loss": 1.0133, + "step": 77800 + }, + { + "epoch": 3.9, + "grad_norm": 7.0939435958862305, + "learning_rate": 6.672060301507537e-05, + "loss": 0.9797, + "step": 77900 + }, + { + "epoch": 3.9, + "grad_norm": 2.0842604637145996, + "learning_rate": 6.642211055276381e-05, + "loss": 1.0035, + "step": 78000 + }, + { + "epoch": 3.9, + "eval_loss": 1.0243637561798096, + "eval_runtime": 38.1566, + "eval_samples_per_second": 26.208, + "eval_steps_per_second": 3.276, + "step": 78000 + }, + { + "epoch": 3.91, + "grad_norm": 3.6360020637512207, + "learning_rate": 6.612060301507537e-05, + "loss": 0.9969, + "step": 78100 + }, + { + "epoch": 3.91, + "grad_norm": 2.5551681518554688, + "learning_rate": 6.581909547738693e-05, + "loss": 1.0203, + "step": 78200 + }, + { + "epoch": 3.92, + "grad_norm": 6.86871862411499, + "learning_rate": 6.551758793969849e-05, + "loss": 1.0472, + "step": 78300 + }, + { + "epoch": 3.92, + "grad_norm": 2.3950083255767822, + "learning_rate": 6.521608040201005e-05, + "loss": 1.0167, + "step": 78400 + }, + { + "epoch": 3.92, + "grad_norm": 1.422188401222229, + "learning_rate": 6.491457286432161e-05, + "loss": 0.9968, + "step": 78500 + }, + { + "epoch": 3.93, + "grad_norm": 2.186511993408203, + "learning_rate": 6.461306532663317e-05, + "loss": 1.0113, + "step": 78600 + }, + { + "epoch": 3.94, + "grad_norm": 1.764722228050232, + "learning_rate": 6.431155778894471e-05, + "loss": 0.983, + "step": 78700 + }, + { + "epoch": 3.94, + "grad_norm": 4.928635597229004, + "learning_rate": 6.401005025125627e-05, + "loss": 1.0164, + "step": 78800 + }, + { + "epoch": 3.94, + "grad_norm": 2.1061389446258545, + "learning_rate": 6.370854271356783e-05, + "loss": 1.0171, + "step": 78900 + }, + { + "epoch": 3.95, + "grad_norm": 4.193387985229492, + "learning_rate": 6.340703517587939e-05, + "loss": 1.0072, + "step": 79000 + }, + { + "epoch": 3.95, + "eval_loss": 1.00971519947052, + "eval_runtime": 38.1263, + "eval_samples_per_second": 26.229, + "eval_steps_per_second": 3.279, + "step": 79000 + }, + { + "epoch": 3.96, + "grad_norm": 2.4844706058502197, + "learning_rate": 6.310552763819095e-05, + "loss": 1.0064, + "step": 79100 + }, + { + "epoch": 3.96, + "grad_norm": 5.7934746742248535, + "learning_rate": 6.280402010050251e-05, + "loss": 0.9509, + "step": 79200 + }, + { + "epoch": 3.96, + "grad_norm": 3.7046196460723877, + "learning_rate": 6.250251256281406e-05, + "loss": 1.0139, + "step": 79300 + }, + { + "epoch": 3.97, + "grad_norm": 1.9528000354766846, + "learning_rate": 6.220100502512562e-05, + "loss": 1.0214, + "step": 79400 + }, + { + "epoch": 3.98, + "grad_norm": 3.4000682830810547, + "learning_rate": 6.189949748743718e-05, + "loss": 1.006, + "step": 79500 + }, + { + "epoch": 3.98, + "grad_norm": 3.152561664581299, + "learning_rate": 6.159798994974874e-05, + "loss": 1.0288, + "step": 79600 + }, + { + "epoch": 3.98, + "grad_norm": 3.774915933609009, + "learning_rate": 6.12964824120603e-05, + "loss": 1.022, + "step": 79700 + }, + { + "epoch": 3.99, + "grad_norm": 2.291813373565674, + "learning_rate": 6.0994974874371854e-05, + "loss": 0.9845, + "step": 79800 + }, + { + "epoch": 4.0, + "grad_norm": 3.019514560699463, + "learning_rate": 6.0693467336683413e-05, + "loss": 1.0246, + "step": 79900 + }, + { + "epoch": 4.0, + "grad_norm": 2.4409408569335938, + "learning_rate": 6.0391959798994966e-05, + "loss": 0.9951, + "step": 80000 + }, + { + "epoch": 4.0, + "eval_loss": 0.9992234110832214, + "eval_runtime": 39.3867, + "eval_samples_per_second": 25.389, + "eval_steps_per_second": 3.174, + "step": 80000 + }, + { + "epoch": 4.0, + "grad_norm": 1.4257367849349976, + "learning_rate": 6.0090452261306526e-05, + "loss": 0.9763, + "step": 80100 + }, + { + "epoch": 4.01, + "grad_norm": 4.97927713394165, + "learning_rate": 5.9788944723618085e-05, + "loss": 0.9417, + "step": 80200 + }, + { + "epoch": 4.01, + "grad_norm": 2.8552098274230957, + "learning_rate": 5.9487437185929644e-05, + "loss": 0.9591, + "step": 80300 + }, + { + "epoch": 4.02, + "grad_norm": Infinity, + "learning_rate": 5.9188944723618084e-05, + "loss": 0.9783, + "step": 80400 + }, + { + "epoch": 4.03, + "grad_norm": 3.83720064163208, + "learning_rate": 5.8887437185929643e-05, + "loss": 0.9607, + "step": 80500 + }, + { + "epoch": 4.03, + "grad_norm": 2.607973337173462, + "learning_rate": 5.85859296482412e-05, + "loss": 0.9556, + "step": 80600 + }, + { + "epoch": 4.04, + "grad_norm": 3.51914381980896, + "learning_rate": 5.8284422110552756e-05, + "loss": 0.9371, + "step": 80700 + }, + { + "epoch": 4.04, + "grad_norm": 2.0518856048583984, + "learning_rate": 5.7982914572864315e-05, + "loss": 1.0154, + "step": 80800 + }, + { + "epoch": 4.04, + "grad_norm": 3.5824625492095947, + "learning_rate": 5.7681407035175874e-05, + "loss": 0.9894, + "step": 80900 + }, + { + "epoch": 4.05, + "grad_norm": 7.991865634918213, + "learning_rate": 5.7379899497487434e-05, + "loss": 0.9719, + "step": 81000 + }, + { + "epoch": 4.05, + "eval_loss": 1.0105689764022827, + "eval_runtime": 38.1347, + "eval_samples_per_second": 26.223, + "eval_steps_per_second": 3.278, + "step": 81000 + }, + { + "epoch": 4.05, + "grad_norm": 1.6757104396820068, + "learning_rate": 5.707839195979899e-05, + "loss": 0.9526, + "step": 81100 + }, + { + "epoch": 4.06, + "grad_norm": 3.1675045490264893, + "learning_rate": 5.677688442211055e-05, + "loss": 0.9798, + "step": 81200 + }, + { + "epoch": 4.07, + "grad_norm": 2.8390209674835205, + "learning_rate": 5.6475376884422105e-05, + "loss": 0.9455, + "step": 81300 + }, + { + "epoch": 4.07, + "grad_norm": 2.2900238037109375, + "learning_rate": 5.6173869346733665e-05, + "loss": 1.0016, + "step": 81400 + }, + { + "epoch": 4.08, + "grad_norm": 2.4220378398895264, + "learning_rate": 5.5872361809045224e-05, + "loss": 0.9681, + "step": 81500 + }, + { + "epoch": 4.08, + "grad_norm": 2.7175300121307373, + "learning_rate": 5.5570854271356784e-05, + "loss": 0.9822, + "step": 81600 + }, + { + "epoch": 4.08, + "grad_norm": 3.7499475479125977, + "learning_rate": 5.526934673366834e-05, + "loss": 0.9501, + "step": 81700 + }, + { + "epoch": 4.09, + "grad_norm": 2.1566553115844727, + "learning_rate": 5.4967839195979896e-05, + "loss": 0.9601, + "step": 81800 + }, + { + "epoch": 4.09, + "grad_norm": 2.080754280090332, + "learning_rate": 5.466633165829145e-05, + "loss": 0.954, + "step": 81900 + }, + { + "epoch": 4.1, + "grad_norm": 3.1466102600097656, + "learning_rate": 5.436482412060301e-05, + "loss": 0.9896, + "step": 82000 + }, + { + "epoch": 4.1, + "eval_loss": 1.0087724924087524, + "eval_runtime": 37.9931, + "eval_samples_per_second": 26.321, + "eval_steps_per_second": 3.29, + "step": 82000 + }, + { + "epoch": 4.11, + "grad_norm": 4.262351989746094, + "learning_rate": 5.406331658291457e-05, + "loss": 0.9454, + "step": 82100 + }, + { + "epoch": 4.11, + "grad_norm": 1.9488756656646729, + "learning_rate": 5.376180904522612e-05, + "loss": 0.9494, + "step": 82200 + }, + { + "epoch": 4.12, + "grad_norm": 1.6786818504333496, + "learning_rate": 5.346030150753768e-05, + "loss": 0.9241, + "step": 82300 + }, + { + "epoch": 4.12, + "grad_norm": 2.143955945968628, + "learning_rate": 5.315879396984924e-05, + "loss": 0.9958, + "step": 82400 + }, + { + "epoch": 4.12, + "grad_norm": 3.6211471557617188, + "learning_rate": 5.286030150753768e-05, + "loss": 0.9641, + "step": 82500 + }, + { + "epoch": 4.13, + "grad_norm": 4.066643238067627, + "learning_rate": 5.255879396984924e-05, + "loss": 0.9698, + "step": 82600 + }, + { + "epoch": 4.13, + "grad_norm": 2.151590585708618, + "learning_rate": 5.22572864321608e-05, + "loss": 0.9388, + "step": 82700 + }, + { + "epoch": 4.14, + "grad_norm": 4.644803524017334, + "learning_rate": 5.195577889447236e-05, + "loss": 0.9141, + "step": 82800 + }, + { + "epoch": 4.14, + "grad_norm": 2.652754068374634, + "learning_rate": 5.1654271356783916e-05, + "loss": 0.9592, + "step": 82900 + }, + { + "epoch": 4.15, + "grad_norm": 4.528812885284424, + "learning_rate": 5.135276381909547e-05, + "loss": 0.9778, + "step": 83000 + }, + { + "epoch": 4.15, + "eval_loss": 0.9974797368049622, + "eval_runtime": 38.0893, + "eval_samples_per_second": 26.254, + "eval_steps_per_second": 3.282, + "step": 83000 + }, + { + "epoch": 4.16, + "grad_norm": 2.625786542892456, + "learning_rate": 5.105125628140703e-05, + "loss": 0.9594, + "step": 83100 + }, + { + "epoch": 4.16, + "grad_norm": 3.7137229442596436, + "learning_rate": 5.074974874371859e-05, + "loss": 0.9462, + "step": 83200 + }, + { + "epoch": 4.17, + "grad_norm": 6.682472229003906, + "learning_rate": 5.044824120603015e-05, + "loss": 0.9301, + "step": 83300 + }, + { + "epoch": 4.17, + "grad_norm": 2.7188687324523926, + "learning_rate": 5.014673366834171e-05, + "loss": 0.9801, + "step": 83400 + }, + { + "epoch": 4.17, + "grad_norm": 2.7037341594696045, + "learning_rate": 4.984522613065326e-05, + "loss": 0.9475, + "step": 83500 + }, + { + "epoch": 4.18, + "grad_norm": 2.815229654312134, + "learning_rate": 4.954371859296482e-05, + "loss": 0.9012, + "step": 83600 + }, + { + "epoch": 4.18, + "grad_norm": 2.7187130451202393, + "learning_rate": 4.924221105527638e-05, + "loss": 0.9199, + "step": 83700 + }, + { + "epoch": 4.19, + "grad_norm": 1.6610496044158936, + "learning_rate": 4.894070351758794e-05, + "loss": 0.9321, + "step": 83800 + }, + { + "epoch": 4.2, + "grad_norm": 2.1496291160583496, + "learning_rate": 4.86391959798995e-05, + "loss": 0.9003, + "step": 83900 + }, + { + "epoch": 4.2, + "grad_norm": 2.9933974742889404, + "learning_rate": 4.833768844221105e-05, + "loss": 0.9467, + "step": 84000 + }, + { + "epoch": 4.2, + "eval_loss": 0.9802306890487671, + "eval_runtime": 38.0487, + "eval_samples_per_second": 26.282, + "eval_steps_per_second": 3.285, + "step": 84000 + }, + { + "epoch": 4.21, + "grad_norm": 4.368553161621094, + "learning_rate": 4.803618090452261e-05, + "loss": 0.921, + "step": 84100 + }, + { + "epoch": 4.21, + "grad_norm": 4.087899684906006, + "learning_rate": 4.773467336683417e-05, + "loss": 0.9413, + "step": 84200 + }, + { + "epoch": 4.21, + "grad_norm": 1.8541690111160278, + "learning_rate": 4.743316582914573e-05, + "loss": 0.9657, + "step": 84300 + }, + { + "epoch": 4.22, + "grad_norm": 2.6514675617218018, + "learning_rate": 4.713165829145729e-05, + "loss": 0.9645, + "step": 84400 + }, + { + "epoch": 4.22, + "grad_norm": 3.2329466342926025, + "learning_rate": 4.683015075376885e-05, + "loss": 0.9465, + "step": 84500 + }, + { + "epoch": 4.23, + "grad_norm": 2.358675241470337, + "learning_rate": 4.652864321608039e-05, + "loss": 0.9644, + "step": 84600 + }, + { + "epoch": 4.24, + "grad_norm": 3.6738836765289307, + "learning_rate": 4.6230150753768846e-05, + "loss": 0.9357, + "step": 84700 + }, + { + "epoch": 4.24, + "grad_norm": 2.8447327613830566, + "learning_rate": 4.59286432160804e-05, + "loss": 0.9308, + "step": 84800 + }, + { + "epoch": 4.25, + "grad_norm": 1.6326079368591309, + "learning_rate": 4.562713567839195e-05, + "loss": 0.9068, + "step": 84900 + }, + { + "epoch": 4.25, + "grad_norm": 2.3545360565185547, + "learning_rate": 4.532562814070351e-05, + "loss": 0.9436, + "step": 85000 + }, + { + "epoch": 4.25, + "eval_loss": 0.9844674468040466, + "eval_runtime": 38.274, + "eval_samples_per_second": 26.127, + "eval_steps_per_second": 3.266, + "step": 85000 + }, + { + "epoch": 4.25, + "grad_norm": 3.2402210235595703, + "learning_rate": 4.502412060301507e-05, + "loss": 0.9313, + "step": 85100 + }, + { + "epoch": 4.26, + "grad_norm": 3.3900952339172363, + "learning_rate": 4.472261306532662e-05, + "loss": 0.9385, + "step": 85200 + }, + { + "epoch": 4.26, + "grad_norm": 3.8531854152679443, + "learning_rate": 4.442110552763818e-05, + "loss": 0.9292, + "step": 85300 + }, + { + "epoch": 4.27, + "grad_norm": 2.3123373985290527, + "learning_rate": 4.411959798994974e-05, + "loss": 0.9544, + "step": 85400 + }, + { + "epoch": 4.28, + "grad_norm": 2.5710906982421875, + "learning_rate": 4.38180904522613e-05, + "loss": 0.9591, + "step": 85500 + }, + { + "epoch": 4.28, + "grad_norm": 3.4481329917907715, + "learning_rate": 4.351658291457286e-05, + "loss": 0.9281, + "step": 85600 + }, + { + "epoch": 4.29, + "grad_norm": 1.7887803316116333, + "learning_rate": 4.321507537688442e-05, + "loss": 0.9371, + "step": 85700 + }, + { + "epoch": 4.29, + "grad_norm": 6.177557945251465, + "learning_rate": 4.291356783919597e-05, + "loss": 0.9154, + "step": 85800 + }, + { + "epoch": 4.29, + "grad_norm": 3.0554301738739014, + "learning_rate": 4.261206030150753e-05, + "loss": 0.9483, + "step": 85900 + }, + { + "epoch": 4.3, + "grad_norm": 2.0133023262023926, + "learning_rate": 4.231055276381909e-05, + "loss": 0.9557, + "step": 86000 + }, + { + "epoch": 4.3, + "eval_loss": 0.9593837261199951, + "eval_runtime": 38.1446, + "eval_samples_per_second": 26.216, + "eval_steps_per_second": 3.277, + "step": 86000 + }, + { + "epoch": 4.3, + "grad_norm": 2.1396610736846924, + "learning_rate": 4.200904522613065e-05, + "loss": 0.9643, + "step": 86100 + }, + { + "epoch": 4.31, + "grad_norm": 2.709627628326416, + "learning_rate": 4.170753768844221e-05, + "loss": 0.9365, + "step": 86200 + }, + { + "epoch": 4.32, + "grad_norm": 4.406678199768066, + "learning_rate": 4.1406030150753764e-05, + "loss": 0.9553, + "step": 86300 + }, + { + "epoch": 4.32, + "grad_norm": 4.822593688964844, + "learning_rate": 4.110452261306532e-05, + "loss": 0.9213, + "step": 86400 + }, + { + "epoch": 4.33, + "grad_norm": 4.148794651031494, + "learning_rate": 4.080301507537688e-05, + "loss": 0.9808, + "step": 86500 + }, + { + "epoch": 4.33, + "grad_norm": 3.7028510570526123, + "learning_rate": 4.050150753768844e-05, + "loss": 0.9331, + "step": 86600 + }, + { + "epoch": 4.33, + "grad_norm": 2.314500093460083, + "learning_rate": 4.02e-05, + "loss": 0.9551, + "step": 86700 + }, + { + "epoch": 4.34, + "grad_norm": 3.741234302520752, + "learning_rate": 3.9898492462311554e-05, + "loss": 0.9053, + "step": 86800 + }, + { + "epoch": 4.34, + "grad_norm": 3.7346441745758057, + "learning_rate": 3.9596984924623113e-05, + "loss": 0.9517, + "step": 86900 + }, + { + "epoch": 4.35, + "grad_norm": 1.324827790260315, + "learning_rate": 3.929849246231156e-05, + "loss": 0.9764, + "step": 87000 + }, + { + "epoch": 4.35, + "eval_loss": 1.0139998197555542, + "eval_runtime": 38.1639, + "eval_samples_per_second": 26.203, + "eval_steps_per_second": 3.275, + "step": 87000 + }, + { + "epoch": 4.36, + "grad_norm": 5.19126033782959, + "learning_rate": 3.899698492462311e-05, + "loss": 0.9366, + "step": 87100 + }, + { + "epoch": 4.36, + "grad_norm": 2.899726629257202, + "learning_rate": 3.869547738693467e-05, + "loss": 0.9555, + "step": 87200 + }, + { + "epoch": 4.37, + "grad_norm": 1.9099615812301636, + "learning_rate": 3.839396984924623e-05, + "loss": 0.9033, + "step": 87300 + }, + { + "epoch": 4.37, + "grad_norm": 1.5814082622528076, + "learning_rate": 3.809246231155779e-05, + "loss": 0.9978, + "step": 87400 + }, + { + "epoch": 4.38, + "grad_norm": 3.4520106315612793, + "learning_rate": 3.779095477386935e-05, + "loss": 0.9343, + "step": 87500 + }, + { + "epoch": 4.38, + "grad_norm": 3.0876681804656982, + "learning_rate": 3.74894472361809e-05, + "loss": 0.9094, + "step": 87600 + }, + { + "epoch": 4.38, + "grad_norm": 3.5139119625091553, + "learning_rate": 3.718793969849246e-05, + "loss": 0.8677, + "step": 87700 + }, + { + "epoch": 4.39, + "grad_norm": 2.003330945968628, + "learning_rate": 3.6886432160804015e-05, + "loss": 0.9351, + "step": 87800 + }, + { + "epoch": 4.39, + "grad_norm": 2.259235382080078, + "learning_rate": 3.6584924623115574e-05, + "loss": 0.9388, + "step": 87900 + }, + { + "epoch": 4.4, + "grad_norm": 2.2141153812408447, + "learning_rate": 3.6283417085427134e-05, + "loss": 0.9169, + "step": 88000 + }, + { + "epoch": 4.4, + "eval_loss": 0.9528889060020447, + "eval_runtime": 38.0305, + "eval_samples_per_second": 26.295, + "eval_steps_per_second": 3.287, + "step": 88000 + }, + { + "epoch": 4.41, + "grad_norm": 4.264975547790527, + "learning_rate": 3.5981909547738693e-05, + "loss": 0.9309, + "step": 88100 + }, + { + "epoch": 4.41, + "grad_norm": 4.431647777557373, + "learning_rate": 3.5680402010050246e-05, + "loss": 0.9035, + "step": 88200 + }, + { + "epoch": 4.42, + "grad_norm": 2.326883316040039, + "learning_rate": 3.5378894472361806e-05, + "loss": 0.904, + "step": 88300 + }, + { + "epoch": 4.42, + "grad_norm": 2.6951944828033447, + "learning_rate": 3.5077386934673365e-05, + "loss": 0.9195, + "step": 88400 + }, + { + "epoch": 4.42, + "grad_norm": 1.8017208576202393, + "learning_rate": 3.477587939698492e-05, + "loss": 0.9398, + "step": 88500 + }, + { + "epoch": 4.43, + "grad_norm": 3.8392789363861084, + "learning_rate": 3.447437185929648e-05, + "loss": 0.9591, + "step": 88600 + }, + { + "epoch": 4.43, + "grad_norm": 2.541273593902588, + "learning_rate": 3.4172864321608037e-05, + "loss": 0.9054, + "step": 88700 + }, + { + "epoch": 4.44, + "grad_norm": 2.7736191749572754, + "learning_rate": 3.3874371859296476e-05, + "loss": 0.9473, + "step": 88800 + }, + { + "epoch": 4.45, + "grad_norm": 2.660540819168091, + "learning_rate": 3.3572864321608036e-05, + "loss": 0.9582, + "step": 88900 + }, + { + "epoch": 4.45, + "grad_norm": 3.161513328552246, + "learning_rate": 3.3271356783919595e-05, + "loss": 0.8943, + "step": 89000 + }, + { + "epoch": 4.45, + "eval_loss": 0.9552559852600098, + "eval_runtime": 38.1158, + "eval_samples_per_second": 26.236, + "eval_steps_per_second": 3.279, + "step": 89000 + }, + { + "epoch": 4.46, + "grad_norm": 4.881318092346191, + "learning_rate": 3.2969849246231154e-05, + "loss": 0.9053, + "step": 89100 + }, + { + "epoch": 4.46, + "grad_norm": 1.7572602033615112, + "learning_rate": 3.2668341708542714e-05, + "loss": 0.9364, + "step": 89200 + }, + { + "epoch": 4.46, + "grad_norm": 3.067507743835449, + "learning_rate": 3.2366834170854267e-05, + "loss": 0.9355, + "step": 89300 + }, + { + "epoch": 4.47, + "grad_norm": 3.1982858180999756, + "learning_rate": 3.2065326633165826e-05, + "loss": 0.9333, + "step": 89400 + }, + { + "epoch": 4.47, + "grad_norm": 3.596789598464966, + "learning_rate": 3.1763819095477385e-05, + "loss": 0.8978, + "step": 89500 + }, + { + "epoch": 4.48, + "grad_norm": 5.035818576812744, + "learning_rate": 3.1462311557788945e-05, + "loss": 0.9337, + "step": 89600 + }, + { + "epoch": 4.49, + "grad_norm": 3.149653673171997, + "learning_rate": 3.11608040201005e-05, + "loss": 0.9515, + "step": 89700 + }, + { + "epoch": 4.49, + "grad_norm": 3.4601404666900635, + "learning_rate": 3.085929648241206e-05, + "loss": 0.9021, + "step": 89800 + }, + { + "epoch": 4.5, + "grad_norm": 2.6317124366760254, + "learning_rate": 3.0557788944723616e-05, + "loss": 0.9559, + "step": 89900 + }, + { + "epoch": 4.5, + "grad_norm": 2.667861223220825, + "learning_rate": 3.0256281407035173e-05, + "loss": 0.9341, + "step": 90000 + }, + { + "epoch": 4.5, + "eval_loss": 0.9440233111381531, + "eval_runtime": 38.0809, + "eval_samples_per_second": 26.26, + "eval_steps_per_second": 3.282, + "step": 90000 + }, + { + "epoch": 4.5, + "grad_norm": 3.903172016143799, + "learning_rate": 2.9954773869346732e-05, + "loss": 0.8857, + "step": 90100 + }, + { + "epoch": 4.51, + "grad_norm": 3.9286229610443115, + "learning_rate": 2.9653266331658288e-05, + "loss": 0.9119, + "step": 90200 + }, + { + "epoch": 4.51, + "grad_norm": 2.812256336212158, + "learning_rate": 2.9351758793969847e-05, + "loss": 0.9026, + "step": 90300 + }, + { + "epoch": 4.52, + "grad_norm": 2.2835099697113037, + "learning_rate": 2.9050251256281404e-05, + "loss": 0.885, + "step": 90400 + }, + { + "epoch": 4.53, + "grad_norm": 3.383111000061035, + "learning_rate": 2.8748743718592963e-05, + "loss": 0.8838, + "step": 90500 + }, + { + "epoch": 4.53, + "grad_norm": 2.7682292461395264, + "learning_rate": 2.8447236180904522e-05, + "loss": 0.9139, + "step": 90600 + }, + { + "epoch": 4.54, + "grad_norm": 6.3915019035339355, + "learning_rate": 2.814572864321608e-05, + "loss": 0.9188, + "step": 90700 + }, + { + "epoch": 4.54, + "grad_norm": 5.53504753112793, + "learning_rate": 2.7844221105527635e-05, + "loss": 0.9118, + "step": 90800 + }, + { + "epoch": 4.54, + "grad_norm": 2.5919177532196045, + "learning_rate": 2.754271356783919e-05, + "loss": 0.8844, + "step": 90900 + }, + { + "epoch": 4.55, + "grad_norm": 1.9481797218322754, + "learning_rate": 2.724120603015075e-05, + "loss": 0.9192, + "step": 91000 + }, + { + "epoch": 4.55, + "eval_loss": 0.9217103123664856, + "eval_runtime": 38.1169, + "eval_samples_per_second": 26.235, + "eval_steps_per_second": 3.279, + "step": 91000 + }, + { + "epoch": 4.55, + "grad_norm": 2.1429965496063232, + "learning_rate": 2.693969849246231e-05, + "loss": 0.8889, + "step": 91100 + }, + { + "epoch": 4.56, + "grad_norm": 3.4818546772003174, + "learning_rate": 2.6638190954773866e-05, + "loss": 0.8932, + "step": 91200 + }, + { + "epoch": 4.56, + "grad_norm": 2.3813984394073486, + "learning_rate": 2.6336683417085425e-05, + "loss": 0.9154, + "step": 91300 + }, + { + "epoch": 4.57, + "grad_norm": 2.4688570499420166, + "learning_rate": 2.6035175879396984e-05, + "loss": 0.9344, + "step": 91400 + }, + { + "epoch": 4.58, + "grad_norm": 4.330790996551514, + "learning_rate": 2.573366834170854e-05, + "loss": 0.9137, + "step": 91500 + }, + { + "epoch": 4.58, + "grad_norm": 2.8123939037323, + "learning_rate": 2.54321608040201e-05, + "loss": 0.9041, + "step": 91600 + }, + { + "epoch": 4.58, + "grad_norm": 2.1815638542175293, + "learning_rate": 2.5130653266331656e-05, + "loss": 0.8606, + "step": 91700 + }, + { + "epoch": 4.59, + "grad_norm": 3.3489341735839844, + "learning_rate": 2.4829145728643216e-05, + "loss": 0.934, + "step": 91800 + }, + { + "epoch": 4.59, + "grad_norm": 2.9650094509124756, + "learning_rate": 2.4527638190954775e-05, + "loss": 0.8893, + "step": 91900 + }, + { + "epoch": 4.6, + "grad_norm": 3.541456460952759, + "learning_rate": 2.4226130653266328e-05, + "loss": 0.9239, + "step": 92000 + }, + { + "epoch": 4.6, + "eval_loss": 0.9656698107719421, + "eval_runtime": 38.5991, + "eval_samples_per_second": 25.907, + "eval_steps_per_second": 3.238, + "step": 92000 + }, + { + "epoch": 4.61, + "grad_norm": 3.1648945808410645, + "learning_rate": 2.3924623115577887e-05, + "loss": 0.8777, + "step": 92100 + }, + { + "epoch": 4.61, + "grad_norm": 8.632335662841797, + "learning_rate": 2.3623115577889443e-05, + "loss": 0.9047, + "step": 92200 + }, + { + "epoch": 4.62, + "grad_norm": 2.9412002563476562, + "learning_rate": 2.3321608040201003e-05, + "loss": 0.8964, + "step": 92300 + }, + { + "epoch": 4.62, + "grad_norm": 2.7501888275146484, + "learning_rate": 2.3020100502512562e-05, + "loss": 0.9303, + "step": 92400 + }, + { + "epoch": 4.62, + "grad_norm": 3.36631178855896, + "learning_rate": 2.2718592964824118e-05, + "loss": 0.8987, + "step": 92500 + }, + { + "epoch": 4.63, + "grad_norm": 2.6061251163482666, + "learning_rate": 2.2417085427135678e-05, + "loss": 0.8981, + "step": 92600 + }, + { + "epoch": 4.63, + "grad_norm": 3.9636521339416504, + "learning_rate": 2.2115577889447234e-05, + "loss": 0.893, + "step": 92700 + }, + { + "epoch": 4.64, + "grad_norm": 3.2085049152374268, + "learning_rate": 2.1814070351758793e-05, + "loss": 0.9298, + "step": 92800 + }, + { + "epoch": 4.64, + "grad_norm": 2.590059995651245, + "learning_rate": 2.1512562814070353e-05, + "loss": 0.9118, + "step": 92900 + }, + { + "epoch": 4.65, + "grad_norm": 4.868690013885498, + "learning_rate": 2.121105527638191e-05, + "loss": 0.8873, + "step": 93000 + }, + { + "epoch": 4.65, + "eval_loss": 0.918121337890625, + "eval_runtime": 38.3542, + "eval_samples_per_second": 26.073, + "eval_steps_per_second": 3.259, + "step": 93000 + }, + { + "epoch": 4.66, + "grad_norm": 4.0143303871154785, + "learning_rate": 2.0909547738693465e-05, + "loss": 0.871, + "step": 93100 + }, + { + "epoch": 4.66, + "grad_norm": 4.423349857330322, + "learning_rate": 2.060804020100502e-05, + "loss": 0.9232, + "step": 93200 + }, + { + "epoch": 4.67, + "grad_norm": 3.6609606742858887, + "learning_rate": 2.030653266331658e-05, + "loss": 0.8782, + "step": 93300 + }, + { + "epoch": 4.67, + "grad_norm": 3.252089738845825, + "learning_rate": 2.0008040201005026e-05, + "loss": 0.9232, + "step": 93400 + }, + { + "epoch": 4.67, + "grad_norm": 2.8783979415893555, + "learning_rate": 1.970653266331658e-05, + "loss": 0.8539, + "step": 93500 + }, + { + "epoch": 4.68, + "grad_norm": 5.381927967071533, + "learning_rate": 1.940502512562814e-05, + "loss": 0.9263, + "step": 93600 + }, + { + "epoch": 4.69, + "grad_norm": 3.1031525135040283, + "learning_rate": 1.9103517587939695e-05, + "loss": 0.9095, + "step": 93700 + }, + { + "epoch": 4.69, + "grad_norm": 2.668039321899414, + "learning_rate": 1.8802010050251254e-05, + "loss": 0.892, + "step": 93800 + }, + { + "epoch": 4.7, + "grad_norm": 2.6661875247955322, + "learning_rate": 1.8500502512562814e-05, + "loss": 0.8944, + "step": 93900 + }, + { + "epoch": 4.7, + "grad_norm": 3.5291526317596436, + "learning_rate": 1.819899497487437e-05, + "loss": 0.9074, + "step": 94000 + }, + { + "epoch": 4.7, + "eval_loss": 0.9208371639251709, + "eval_runtime": 38.4003, + "eval_samples_per_second": 26.041, + "eval_steps_per_second": 3.255, + "step": 94000 + }, + { + "epoch": 4.71, + "grad_norm": 4.160482883453369, + "learning_rate": 1.789748743718593e-05, + "loss": 0.9045, + "step": 94100 + }, + { + "epoch": 4.71, + "grad_norm": 3.8051962852478027, + "learning_rate": 1.7595979899497485e-05, + "loss": 0.899, + "step": 94200 + }, + { + "epoch": 4.71, + "grad_norm": 3.431490898132324, + "learning_rate": 1.7294472361809045e-05, + "loss": 0.8577, + "step": 94300 + }, + { + "epoch": 4.72, + "grad_norm": 2.356250524520874, + "learning_rate": 1.69929648241206e-05, + "loss": 0.9204, + "step": 94400 + }, + { + "epoch": 4.72, + "grad_norm": 5.237595081329346, + "learning_rate": 1.669145728643216e-05, + "loss": 0.8973, + "step": 94500 + }, + { + "epoch": 4.73, + "grad_norm": 5.023568153381348, + "learning_rate": 1.6389949748743716e-05, + "loss": 0.9064, + "step": 94600 + }, + { + "epoch": 4.74, + "grad_norm": 6.610247611999512, + "learning_rate": 1.6088442211055276e-05, + "loss": 0.858, + "step": 94700 + }, + { + "epoch": 4.74, + "grad_norm": 2.1937615871429443, + "learning_rate": 1.5786934673366835e-05, + "loss": 0.872, + "step": 94800 + }, + { + "epoch": 4.75, + "grad_norm": 4.40328311920166, + "learning_rate": 1.548542713567839e-05, + "loss": 0.88, + "step": 94900 + }, + { + "epoch": 4.75, + "grad_norm": 3.0487658977508545, + "learning_rate": 1.5183919597989947e-05, + "loss": 0.8779, + "step": 95000 + }, + { + "epoch": 4.75, + "eval_loss": 0.9459323883056641, + "eval_runtime": 38.1338, + "eval_samples_per_second": 26.223, + "eval_steps_per_second": 3.278, + "step": 95000 + }, + { + "epoch": 4.75, + "grad_norm": 3.8922808170318604, + "learning_rate": 1.4882412060301507e-05, + "loss": 0.9075, + "step": 95100 + }, + { + "epoch": 4.76, + "grad_norm": 3.232625722885132, + "learning_rate": 1.4580904522613064e-05, + "loss": 0.869, + "step": 95200 + }, + { + "epoch": 4.76, + "grad_norm": 8.73833179473877, + "learning_rate": 1.4279396984924622e-05, + "loss": 0.8741, + "step": 95300 + }, + { + "epoch": 4.77, + "grad_norm": 4.5711846351623535, + "learning_rate": 1.397788944723618e-05, + "loss": 0.8976, + "step": 95400 + }, + { + "epoch": 4.78, + "grad_norm": 4.647241115570068, + "learning_rate": 1.3676381909547736e-05, + "loss": 0.8392, + "step": 95500 + }, + { + "epoch": 4.78, + "grad_norm": 4.90078067779541, + "learning_rate": 1.337788944723618e-05, + "loss": 0.8739, + "step": 95600 + }, + { + "epoch": 4.79, + "grad_norm": 3.1595067977905273, + "learning_rate": 1.3076381909547738e-05, + "loss": 0.8398, + "step": 95700 + }, + { + "epoch": 4.79, + "grad_norm": 2.488835096359253, + "learning_rate": 1.2774874371859296e-05, + "loss": 0.868, + "step": 95800 + }, + { + "epoch": 4.79, + "grad_norm": 4.495543003082275, + "learning_rate": 1.2473366834170852e-05, + "loss": 0.8872, + "step": 95900 + }, + { + "epoch": 4.8, + "grad_norm": 3.673161268234253, + "learning_rate": 1.217185929648241e-05, + "loss": 0.8824, + "step": 96000 + }, + { + "epoch": 4.8, + "eval_loss": 0.910308301448822, + "eval_runtime": 38.0891, + "eval_samples_per_second": 26.254, + "eval_steps_per_second": 3.282, + "step": 96000 + }, + { + "epoch": 4.8, + "grad_norm": 5.159984111785889, + "learning_rate": 1.187035175879397e-05, + "loss": 0.8672, + "step": 96100 + }, + { + "epoch": 4.81, + "grad_norm": 2.706937551498413, + "learning_rate": 1.1568844221105527e-05, + "loss": 0.8914, + "step": 96200 + }, + { + "epoch": 4.81, + "grad_norm": 3.727692127227783, + "learning_rate": 1.1267336683417085e-05, + "loss": 0.8485, + "step": 96300 + }, + { + "epoch": 4.82, + "grad_norm": 2.665670156478882, + "learning_rate": 1.0965829145728641e-05, + "loss": 0.8695, + "step": 96400 + }, + { + "epoch": 4.83, + "grad_norm": 5.077518463134766, + "learning_rate": 1.0664321608040199e-05, + "loss": 0.8767, + "step": 96500 + }, + { + "epoch": 4.83, + "grad_norm": 3.4337048530578613, + "learning_rate": 1.0362814070351758e-05, + "loss": 0.8673, + "step": 96600 + }, + { + "epoch": 4.83, + "grad_norm": 3.231494665145874, + "learning_rate": 1.0061306532663316e-05, + "loss": 0.8767, + "step": 96700 + }, + { + "epoch": 4.84, + "grad_norm": 4.2955002784729, + "learning_rate": 9.759798994974874e-06, + "loss": 0.8645, + "step": 96800 + }, + { + "epoch": 4.84, + "grad_norm": 6.2070698738098145, + "learning_rate": 9.458291457286431e-06, + "loss": 0.8683, + "step": 96900 + }, + { + "epoch": 4.85, + "grad_norm": 3.6267805099487305, + "learning_rate": 9.159798994974874e-06, + "loss": 0.907, + "step": 97000 + }, + { + "epoch": 4.85, + "eval_loss": 0.9255304932594299, + "eval_runtime": 38.1396, + "eval_samples_per_second": 26.219, + "eval_steps_per_second": 3.277, + "step": 97000 + }, + { + "epoch": 4.86, + "grad_norm": 4.985959529876709, + "learning_rate": 8.858291457286432e-06, + "loss": 0.8615, + "step": 97100 + }, + { + "epoch": 4.86, + "grad_norm": 4.538032531738281, + "learning_rate": 8.556783919597988e-06, + "loss": 0.8519, + "step": 97200 + }, + { + "epoch": 4.87, + "grad_norm": 6.562105178833008, + "learning_rate": 8.255276381909548e-06, + "loss": 0.8888, + "step": 97300 + }, + { + "epoch": 4.87, + "grad_norm": 2.922360897064209, + "learning_rate": 7.953768844221105e-06, + "loss": 0.8784, + "step": 97400 + }, + { + "epoch": 4.88, + "grad_norm": 3.8349783420562744, + "learning_rate": 7.652261306532663e-06, + "loss": 0.8962, + "step": 97500 + }, + { + "epoch": 4.88, + "grad_norm": 2.096787929534912, + "learning_rate": 7.350753768844221e-06, + "loss": 0.9088, + "step": 97600 + }, + { + "epoch": 4.88, + "grad_norm": 2.512312650680542, + "learning_rate": 7.0492462311557786e-06, + "loss": 0.8816, + "step": 97700 + }, + { + "epoch": 4.89, + "grad_norm": 4.749015808105469, + "learning_rate": 6.7477386934673355e-06, + "loss": 0.8791, + "step": 97800 + }, + { + "epoch": 4.89, + "grad_norm": 3.5753800868988037, + "learning_rate": 6.446231155778894e-06, + "loss": 0.8414, + "step": 97900 + }, + { + "epoch": 4.9, + "grad_norm": 2.849839210510254, + "learning_rate": 6.144723618090452e-06, + "loss": 0.873, + "step": 98000 + }, + { + "epoch": 4.9, + "eval_loss": 0.8922821283340454, + "eval_runtime": 38.1228, + "eval_samples_per_second": 26.231, + "eval_steps_per_second": 3.279, + "step": 98000 + }, + { + "epoch": 4.91, + "grad_norm": 4.473388195037842, + "learning_rate": 5.8432160804020096e-06, + "loss": 0.8428, + "step": 98100 + }, + { + "epoch": 4.91, + "grad_norm": 2.7943496704101562, + "learning_rate": 5.541708542713567e-06, + "loss": 0.8519, + "step": 98200 + }, + { + "epoch": 4.92, + "grad_norm": 2.476835012435913, + "learning_rate": 5.240201005025126e-06, + "loss": 0.8841, + "step": 98300 + }, + { + "epoch": 4.92, + "grad_norm": 4.992676258087158, + "learning_rate": 4.938693467336683e-06, + "loss": 0.8409, + "step": 98400 + }, + { + "epoch": 4.92, + "grad_norm": 2.4756906032562256, + "learning_rate": 4.637185929648241e-06, + "loss": 0.8527, + "step": 98500 + }, + { + "epoch": 4.93, + "grad_norm": 2.157059669494629, + "learning_rate": 4.335678391959798e-06, + "loss": 0.8605, + "step": 98600 + }, + { + "epoch": 4.94, + "grad_norm": 2.8840818405151367, + "learning_rate": 4.034170854271356e-06, + "loss": 0.87, + "step": 98700 + }, + { + "epoch": 4.94, + "grad_norm": 4.124537944793701, + "learning_rate": 3.7326633165829143e-06, + "loss": 0.8318, + "step": 98800 + }, + { + "epoch": 4.95, + "grad_norm": 4.684917449951172, + "learning_rate": 3.431155778894472e-06, + "loss": 0.8479, + "step": 98900 + }, + { + "epoch": 4.95, + "grad_norm": 2.413602590560913, + "learning_rate": 3.12964824120603e-06, + "loss": 0.8452, + "step": 99000 + }, + { + "epoch": 4.95, + "eval_loss": 0.8957632780075073, + "eval_runtime": 38.1658, + "eval_samples_per_second": 26.201, + "eval_steps_per_second": 3.275, + "step": 99000 + }, + { + "epoch": 4.96, + "grad_norm": 3.240213394165039, + "learning_rate": 2.828140703517588e-06, + "loss": 0.8303, + "step": 99100 + }, + { + "epoch": 4.96, + "grad_norm": 4.0827555656433105, + "learning_rate": 2.5266331658291453e-06, + "loss": 0.8872, + "step": 99200 + }, + { + "epoch": 4.96, + "grad_norm": 2.948489189147949, + "learning_rate": 2.2251256281407035e-06, + "loss": 0.8707, + "step": 99300 + }, + { + "epoch": 4.97, + "grad_norm": 6.414693832397461, + "learning_rate": 1.9236180904522612e-06, + "loss": 0.837, + "step": 99400 + }, + { + "epoch": 4.97, + "grad_norm": 5.013907432556152, + "learning_rate": 1.622110552763819e-06, + "loss": 0.8443, + "step": 99500 + }, + { + "epoch": 4.98, + "grad_norm": 2.487205743789673, + "learning_rate": 1.3206030150753765e-06, + "loss": 0.8425, + "step": 99600 + }, + { + "epoch": 4.99, + "grad_norm": 5.77063512802124, + "learning_rate": 1.0190954773869345e-06, + "loss": 0.8509, + "step": 99700 + }, + { + "epoch": 4.99, + "grad_norm": 3.125368356704712, + "learning_rate": 7.175879396984924e-07, + "loss": 0.8874, + "step": 99800 + }, + { + "epoch": 5.0, + "grad_norm": 8.932684898376465, + "learning_rate": 4.160804020100502e-07, + "loss": 0.858, + "step": 99900 + }, + { + "epoch": 5.0, + "grad_norm": 5.0273756980896, + "learning_rate": 1.1457286432160803e-07, + "loss": 0.8394, + "step": 100000 + }, + { + "epoch": 5.0, + "eval_loss": 0.9212185144424438, + "eval_runtime": 38.102, + "eval_samples_per_second": 26.245, + "eval_steps_per_second": 3.281, + "step": 100000 + } + ], + "logging_steps": 100, + "max_steps": 100000, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 1000, + "total_flos": 1.2076594495488e+18, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}