| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 5.0, | |
| "eval_steps": 1000, | |
| "global_step": 100000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 3.8100409507751465, | |
| "learning_rate": 5.9999999999999995e-05, | |
| "loss": 1.816, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.0648651123046875, | |
| "learning_rate": 0.00011999999999999999, | |
| "loss": 1.5968, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 1.9017549753189087, | |
| "learning_rate": 0.00017999999999999998, | |
| "loss": 1.56, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 1.5334885120391846, | |
| "learning_rate": 0.00023999999999999998, | |
| "loss": 1.587, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.3036648035049438, | |
| "learning_rate": 0.0003, | |
| "loss": 1.6182, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 1.709660530090332, | |
| "learning_rate": 0.00029969849246231153, | |
| "loss": 1.6102, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 1.5684775114059448, | |
| "learning_rate": 0.0002993969849246231, | |
| "loss": 1.6094, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 3.3330438137054443, | |
| "learning_rate": 0.00029909547738693465, | |
| "loss": 1.6118, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 2.1563549041748047, | |
| "learning_rate": 0.0002987939698492462, | |
| "loss": 1.6596, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 2.4043567180633545, | |
| "learning_rate": 0.00029849547738693464, | |
| "loss": 1.6071, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "eval_loss": 1.585342288017273, | |
| "eval_runtime": 37.6462, | |
| "eval_samples_per_second": 26.563, | |
| "eval_steps_per_second": 3.32, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 2.3647234439849854, | |
| "learning_rate": 0.0002981939698492462, | |
| "loss": 1.611, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 2.3917016983032227, | |
| "learning_rate": 0.00029789246231155776, | |
| "loss": 1.6003, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 1.7931370735168457, | |
| "learning_rate": 0.0002975909547738693, | |
| "loss": 1.5789, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 2.542971611022949, | |
| "learning_rate": 0.0002972894472361809, | |
| "loss": 1.5435, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 1.8555421829223633, | |
| "learning_rate": 0.00029698793969849243, | |
| "loss": 1.5513, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 1.9988830089569092, | |
| "learning_rate": 0.000296686432160804, | |
| "loss": 1.5763, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 1.5328696966171265, | |
| "learning_rate": 0.00029638492462311555, | |
| "loss": 1.5529, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 2.442533254623413, | |
| "learning_rate": 0.0002960834170854271, | |
| "loss": 1.5581, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 1.4188216924667358, | |
| "learning_rate": 0.00029578190954773867, | |
| "loss": 1.5598, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 2.700873851776123, | |
| "learning_rate": 0.00029548040201005023, | |
| "loss": 1.6091, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "eval_loss": 1.5680323839187622, | |
| "eval_runtime": 37.9632, | |
| "eval_samples_per_second": 26.341, | |
| "eval_steps_per_second": 3.293, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 4.415462493896484, | |
| "learning_rate": 0.0002951788944723618, | |
| "loss": 1.5435, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 1.5002624988555908, | |
| "learning_rate": 0.00029487738693467335, | |
| "loss": 1.5485, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 1.8552610874176025, | |
| "learning_rate": 0.0002945758793969849, | |
| "loss": 1.5687, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 2.6914422512054443, | |
| "learning_rate": 0.00029427437185929647, | |
| "loss": 1.5549, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 1.5994210243225098, | |
| "learning_rate": 0.00029397286432160803, | |
| "loss": 1.5541, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 1.9448769092559814, | |
| "learning_rate": 0.0002936713567839196, | |
| "loss": 1.5348, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 2.3909597396850586, | |
| "learning_rate": 0.00029336984924623115, | |
| "loss": 1.5629, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 1.4517822265625, | |
| "learning_rate": 0.0002930683417085427, | |
| "loss": 1.4946, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 1.7407867908477783, | |
| "learning_rate": 0.0002927668341708542, | |
| "loss": 1.568, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 1.3732205629348755, | |
| "learning_rate": 0.0002924653266331658, | |
| "loss": 1.4928, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "eval_loss": 1.5172981023788452, | |
| "eval_runtime": 37.8358, | |
| "eval_samples_per_second": 26.43, | |
| "eval_steps_per_second": 3.304, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 1.9255911111831665, | |
| "learning_rate": 0.0002921638190954774, | |
| "loss": 1.5208, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 1.7328695058822632, | |
| "learning_rate": 0.00029186231155778895, | |
| "loss": 1.5442, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 2.286285400390625, | |
| "learning_rate": 0.00029156080402010045, | |
| "loss": 1.5071, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 2.426595687866211, | |
| "learning_rate": 0.000291259296482412, | |
| "loss": 1.5424, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 1.8213595151901245, | |
| "learning_rate": 0.0002909577889447236, | |
| "loss": 1.487, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 2.4181461334228516, | |
| "learning_rate": 0.000290659296482412, | |
| "loss": 1.5083, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 1.4696974754333496, | |
| "learning_rate": 0.0002903577889447236, | |
| "loss": 1.5204, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 1.285097360610962, | |
| "learning_rate": 0.00029005628140703517, | |
| "loss": 1.515, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 2.7307722568511963, | |
| "learning_rate": 0.00028975477386934673, | |
| "loss": 1.5283, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 2.5405428409576416, | |
| "learning_rate": 0.00028945326633165823, | |
| "loss": 1.4657, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "eval_loss": 1.4836663007736206, | |
| "eval_runtime": 37.7733, | |
| "eval_samples_per_second": 26.474, | |
| "eval_steps_per_second": 3.309, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 2.2221779823303223, | |
| "learning_rate": 0.00028915175879396985, | |
| "loss": 1.4936, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 2.700119733810425, | |
| "learning_rate": 0.0002888502512562814, | |
| "loss": 1.446, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 2.11588716506958, | |
| "learning_rate": 0.0002885487437185929, | |
| "loss": 1.4789, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 2.144611358642578, | |
| "learning_rate": 0.00028824723618090447, | |
| "loss": 1.4913, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 1.7891815900802612, | |
| "learning_rate": 0.0002879457286432161, | |
| "loss": 1.4693, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 2.2549595832824707, | |
| "learning_rate": 0.0002876442211055276, | |
| "loss": 1.4957, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 2.4034409523010254, | |
| "learning_rate": 0.00028734271356783915, | |
| "loss": 1.4909, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 1.4686906337738037, | |
| "learning_rate": 0.0002870412060301507, | |
| "loss": 1.4989, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 2.1314849853515625, | |
| "learning_rate": 0.0002867396984924623, | |
| "loss": 1.4899, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 1.703493595123291, | |
| "learning_rate": 0.00028643819095477383, | |
| "loss": 1.4897, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "eval_loss": 1.5144654512405396, | |
| "eval_runtime": 38.0015, | |
| "eval_samples_per_second": 26.315, | |
| "eval_steps_per_second": 3.289, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 2.8537943363189697, | |
| "learning_rate": 0.0002861366834170854, | |
| "loss": 1.4702, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 1.885312557220459, | |
| "learning_rate": 0.00028583517587939695, | |
| "loss": 1.4918, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 2.6149489879608154, | |
| "learning_rate": 0.0002855336683417085, | |
| "loss": 1.4867, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 1.8222806453704834, | |
| "learning_rate": 0.00028523216080402007, | |
| "loss": 1.4894, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 2.105160713195801, | |
| "learning_rate": 0.0002849306532663316, | |
| "loss": 1.4865, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 1.9180357456207275, | |
| "learning_rate": 0.0002846291457286432, | |
| "loss": 1.4365, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 1.4675670862197876, | |
| "learning_rate": 0.00028432763819095474, | |
| "loss": 1.4323, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 3.664919376373291, | |
| "learning_rate": 0.0002840261306532663, | |
| "loss": 1.4605, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 1.5559368133544922, | |
| "learning_rate": 0.00028372462311557786, | |
| "loss": 1.4799, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 2.0738680362701416, | |
| "learning_rate": 0.0002834261306532663, | |
| "loss": 1.4923, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "eval_loss": 1.4727822542190552, | |
| "eval_runtime": 38.2425, | |
| "eval_samples_per_second": 26.149, | |
| "eval_steps_per_second": 3.269, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 1.9228754043579102, | |
| "learning_rate": 0.00028312462311557785, | |
| "loss": 1.4127, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 2.0438356399536133, | |
| "learning_rate": 0.0002828231155778894, | |
| "loss": 1.4835, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 2.734626293182373, | |
| "learning_rate": 0.00028252160804020097, | |
| "loss": 1.4489, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 2.1490132808685303, | |
| "learning_rate": 0.0002822201005025125, | |
| "loss": 1.4684, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 2.1819868087768555, | |
| "learning_rate": 0.0002819185929648241, | |
| "loss": 1.4416, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 1.5763262510299683, | |
| "learning_rate": 0.00028161708542713565, | |
| "loss": 1.4532, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 1.9584680795669556, | |
| "learning_rate": 0.0002813155778894472, | |
| "loss": 1.4558, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 2.6148059368133545, | |
| "learning_rate": 0.00028101407035175876, | |
| "loss": 1.4588, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 1.5689460039138794, | |
| "learning_rate": 0.0002807125628140703, | |
| "loss": 1.4352, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 2.145756483078003, | |
| "learning_rate": 0.0002804110552763819, | |
| "loss": 1.4207, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "eval_loss": 1.4386738538742065, | |
| "eval_runtime": 38.107, | |
| "eval_samples_per_second": 26.242, | |
| "eval_steps_per_second": 3.28, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 4.316162586212158, | |
| "learning_rate": 0.00028010954773869344, | |
| "loss": 1.4085, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 2.0866541862487793, | |
| "learning_rate": 0.000279808040201005, | |
| "loss": 1.4634, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 3.0577406883239746, | |
| "learning_rate": 0.00027950653266331656, | |
| "loss": 1.4515, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 1.723168969154358, | |
| "learning_rate": 0.0002792050251256281, | |
| "loss": 1.4372, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 2.8033313751220703, | |
| "learning_rate": 0.0002789035175879397, | |
| "loss": 1.4844, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 2.051619529724121, | |
| "learning_rate": 0.00027860201005025124, | |
| "loss": 1.4352, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 1.4199312925338745, | |
| "learning_rate": 0.0002783005025125628, | |
| "loss": 1.4641, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 2.3949058055877686, | |
| "learning_rate": 0.00027799899497487436, | |
| "loss": 1.4592, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 2.8449528217315674, | |
| "learning_rate": 0.0002776974874371859, | |
| "loss": 1.4196, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 3.709972858428955, | |
| "learning_rate": 0.0002773959798994975, | |
| "loss": 1.4375, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "eval_loss": 1.4270827770233154, | |
| "eval_runtime": 38.3346, | |
| "eval_samples_per_second": 26.086, | |
| "eval_steps_per_second": 3.261, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 1.7984100580215454, | |
| "learning_rate": 0.00027709447236180904, | |
| "loss": 1.3943, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 2.1693639755249023, | |
| "learning_rate": 0.00027679597989949746, | |
| "loss": 1.4636, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 1.8211654424667358, | |
| "learning_rate": 0.000276494472361809, | |
| "loss": 1.4539, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 2.11051869392395, | |
| "learning_rate": 0.0002761929648241206, | |
| "loss": 1.4214, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 1.5553231239318848, | |
| "learning_rate": 0.00027589145728643214, | |
| "loss": 1.4475, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 2.0080809593200684, | |
| "learning_rate": 0.0002755899497487437, | |
| "loss": 1.4024, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 2.6698598861694336, | |
| "learning_rate": 0.00027528844221105526, | |
| "loss": 1.4159, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 2.2336277961730957, | |
| "learning_rate": 0.0002749869346733668, | |
| "loss": 1.437, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 1.7006186246871948, | |
| "learning_rate": 0.0002746854271356784, | |
| "loss": 1.4465, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 1.934051513671875, | |
| "learning_rate": 0.0002743839195979899, | |
| "loss": 1.4319, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "eval_loss": 1.4331704378128052, | |
| "eval_runtime": 37.9595, | |
| "eval_samples_per_second": 26.344, | |
| "eval_steps_per_second": 3.293, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 2.549532890319824, | |
| "learning_rate": 0.0002740824120603015, | |
| "loss": 1.4018, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 1.9921625852584839, | |
| "learning_rate": 0.00027378090452261306, | |
| "loss": 1.4354, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 1.5784940719604492, | |
| "learning_rate": 0.0002734793969849246, | |
| "loss": 1.4515, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 1.9822384119033813, | |
| "learning_rate": 0.0002731778894472361, | |
| "loss": 1.4784, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 3.0514814853668213, | |
| "learning_rate": 0.00027287638190954774, | |
| "loss": 1.4235, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 1.5947296619415283, | |
| "learning_rate": 0.0002725748743718593, | |
| "loss": 1.4325, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 2.838723659515381, | |
| "learning_rate": 0.0002722733668341708, | |
| "loss": 1.4318, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 2.7525815963745117, | |
| "learning_rate": 0.00027197185929648236, | |
| "loss": 1.4323, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 2.186182975769043, | |
| "learning_rate": 0.000271670351758794, | |
| "loss": 1.4122, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 1.5111092329025269, | |
| "learning_rate": 0.00027136884422110553, | |
| "loss": 1.4278, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "eval_loss": 1.4226535558700562, | |
| "eval_runtime": 37.925, | |
| "eval_samples_per_second": 26.368, | |
| "eval_steps_per_second": 3.296, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 1.4402307271957397, | |
| "learning_rate": 0.00027106733668341704, | |
| "loss": 1.4775, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 4.803475379943848, | |
| "learning_rate": 0.0002707658291457286, | |
| "loss": 1.4434, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 2.159541606903076, | |
| "learning_rate": 0.0002704643216080402, | |
| "loss": 1.4505, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 1.613765835762024, | |
| "learning_rate": 0.0002701658291457286, | |
| "loss": 1.4336, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 3.0653555393218994, | |
| "learning_rate": 0.0002698643216080402, | |
| "loss": 1.4238, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 2.0688183307647705, | |
| "learning_rate": 0.00026956281407035176, | |
| "loss": 1.4048, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 2.271068572998047, | |
| "learning_rate": 0.0002692613065326633, | |
| "loss": 1.4412, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 1.7365072965621948, | |
| "learning_rate": 0.0002689597989949748, | |
| "loss": 1.3864, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 1.7095474004745483, | |
| "learning_rate": 0.00026865829145728643, | |
| "loss": 1.4509, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 2.595015287399292, | |
| "learning_rate": 0.000268356783919598, | |
| "loss": 1.4068, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "eval_loss": 1.4620698690414429, | |
| "eval_runtime": 37.8254, | |
| "eval_samples_per_second": 26.437, | |
| "eval_steps_per_second": 3.305, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 1.6796025037765503, | |
| "learning_rate": 0.0002680552763819095, | |
| "loss": 1.4059, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 2.259477376937866, | |
| "learning_rate": 0.00026775376884422106, | |
| "loss": 1.4112, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 4.8005051612854, | |
| "learning_rate": 0.00026745226130653267, | |
| "loss": 1.367, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 2.824021577835083, | |
| "learning_rate": 0.00026715075376884423, | |
| "loss": 1.4156, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 2.4818904399871826, | |
| "learning_rate": 0.00026684924623115574, | |
| "loss": 1.3846, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 2.6064958572387695, | |
| "learning_rate": 0.0002665477386934673, | |
| "loss": 1.4062, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 1.8354562520980835, | |
| "learning_rate": 0.00026624623115577886, | |
| "loss": 1.3761, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 3.094172477722168, | |
| "learning_rate": 0.0002659447236180904, | |
| "loss": 1.3576, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 2.000718832015991, | |
| "learning_rate": 0.000265643216080402, | |
| "loss": 1.401, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 2.301866054534912, | |
| "learning_rate": 0.00026534170854271353, | |
| "loss": 1.4267, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "eval_loss": 1.4072773456573486, | |
| "eval_runtime": 37.8474, | |
| "eval_samples_per_second": 26.422, | |
| "eval_steps_per_second": 3.303, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 1.8116004467010498, | |
| "learning_rate": 0.0002650402010050251, | |
| "loss": 1.4141, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 1.7951298952102661, | |
| "learning_rate": 0.00026473869346733665, | |
| "loss": 1.4006, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 1.9248169660568237, | |
| "learning_rate": 0.0002644371859296482, | |
| "loss": 1.4143, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 3.0492172241210938, | |
| "learning_rate": 0.00026413567839195977, | |
| "loss": 1.3808, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 1.3698550462722778, | |
| "learning_rate": 0.00026383417085427133, | |
| "loss": 1.339, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 2.8333966732025146, | |
| "learning_rate": 0.0002635326633165829, | |
| "loss": 1.3977, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 2.5511767864227295, | |
| "learning_rate": 0.0002632341708542713, | |
| "loss": 1.4027, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 1.912987470626831, | |
| "learning_rate": 0.0002629326633165829, | |
| "loss": 1.4062, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 1.8692814111709595, | |
| "learning_rate": 0.00026263115577889444, | |
| "loss": 1.3901, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 2.620612859725952, | |
| "learning_rate": 0.000262329648241206, | |
| "loss": 1.3992, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "eval_loss": 1.3693994283676147, | |
| "eval_runtime": 38.004, | |
| "eval_samples_per_second": 26.313, | |
| "eval_steps_per_second": 3.289, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 3.1771810054779053, | |
| "learning_rate": 0.00026202814070351756, | |
| "loss": 1.3733, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 2.4650421142578125, | |
| "learning_rate": 0.0002617266331658291, | |
| "loss": 1.399, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 2.9789535999298096, | |
| "learning_rate": 0.0002614251256281407, | |
| "loss": 1.4291, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 1.4404784440994263, | |
| "learning_rate": 0.00026112361809045223, | |
| "loss": 1.3833, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 2.0667450428009033, | |
| "learning_rate": 0.0002608221105527638, | |
| "loss": 1.3884, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 2.014460563659668, | |
| "learning_rate": 0.00026052060301507535, | |
| "loss": 1.3819, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 2.360121965408325, | |
| "learning_rate": 0.0002602190954773869, | |
| "loss": 1.3695, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 1.6982303857803345, | |
| "learning_rate": 0.00025991758793969847, | |
| "loss": 1.3864, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 2.2350399494171143, | |
| "learning_rate": 0.00025961608040201003, | |
| "loss": 1.4096, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 1.4647042751312256, | |
| "learning_rate": 0.0002593145728643216, | |
| "loss": 1.3915, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "eval_loss": 1.3878337144851685, | |
| "eval_runtime": 37.7254, | |
| "eval_samples_per_second": 26.507, | |
| "eval_steps_per_second": 3.313, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 2.002542734146118, | |
| "learning_rate": 0.00025901306532663315, | |
| "loss": 1.4214, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 1.9857007265090942, | |
| "learning_rate": 0.0002587115577889447, | |
| "loss": 1.3636, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 2.4016737937927246, | |
| "learning_rate": 0.00025841005025125627, | |
| "loss": 1.4259, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 3.929931879043579, | |
| "learning_rate": 0.0002581085427135678, | |
| "loss": 1.3937, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 1.6266632080078125, | |
| "learning_rate": 0.0002578070351758794, | |
| "loss": 1.3678, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 2.905378580093384, | |
| "learning_rate": 0.00025750552763819095, | |
| "loss": 1.3526, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 2.535842180252075, | |
| "learning_rate": 0.0002572040201005025, | |
| "loss": 1.4062, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 1.5988209247589111, | |
| "learning_rate": 0.000256902512562814, | |
| "loss": 1.3915, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 1.5643303394317627, | |
| "learning_rate": 0.0002566010050251256, | |
| "loss": 1.3783, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 1.4297415018081665, | |
| "learning_rate": 0.0002562994974874372, | |
| "loss": 1.3782, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "eval_loss": 1.405114769935608, | |
| "eval_runtime": 37.9898, | |
| "eval_samples_per_second": 26.323, | |
| "eval_steps_per_second": 3.29, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 1.6650172472000122, | |
| "learning_rate": 0.0002559979899497487, | |
| "loss": 1.3387, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 2.118579864501953, | |
| "learning_rate": 0.00025569648241206025, | |
| "loss": 1.393, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 1.74748694896698, | |
| "learning_rate": 0.00025539497487437186, | |
| "loss": 1.3353, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 1.794631004333496, | |
| "learning_rate": 0.0002550934673366834, | |
| "loss": 1.3942, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 2.7065675258636475, | |
| "learning_rate": 0.00025479195979899493, | |
| "loss": 1.3962, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 3.389014720916748, | |
| "learning_rate": 0.0002544904522613065, | |
| "loss": 1.3758, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 1.534252405166626, | |
| "learning_rate": 0.0002541889447236181, | |
| "loss": 1.3526, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 1.7374197244644165, | |
| "learning_rate": 0.0002538874371859296, | |
| "loss": 1.3577, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 3.1230342388153076, | |
| "learning_rate": 0.00025358592964824117, | |
| "loss": 1.3548, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 3.261570692062378, | |
| "learning_rate": 0.0002532844221105527, | |
| "loss": 1.3932, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_loss": 1.3275749683380127, | |
| "eval_runtime": 37.9493, | |
| "eval_samples_per_second": 26.351, | |
| "eval_steps_per_second": 3.294, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 3.0108933448791504, | |
| "learning_rate": 0.00025298291457286434, | |
| "loss": 1.3445, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 3.536722421646118, | |
| "learning_rate": 0.00025268140703517584, | |
| "loss": 1.364, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 1.637465238571167, | |
| "learning_rate": 0.0002523829145728643, | |
| "loss": 1.376, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 2.8907904624938965, | |
| "learning_rate": 0.0002520814070351759, | |
| "loss": 1.3623, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 2.4385364055633545, | |
| "learning_rate": 0.0002517798994974874, | |
| "loss": 1.318, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 1.9113733768463135, | |
| "learning_rate": 0.00025147839195979895, | |
| "loss": 1.3906, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 5.8118414878845215, | |
| "learning_rate": 0.00025117688442211056, | |
| "loss": 1.3336, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 0.9629586935043335, | |
| "learning_rate": 0.0002508753768844221, | |
| "loss": 1.3959, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 2.0420243740081787, | |
| "learning_rate": 0.0002505738693467336, | |
| "loss": 1.3523, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 2.0758414268493652, | |
| "learning_rate": 0.0002502723618090452, | |
| "loss": 1.3747, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "eval_loss": 1.3606867790222168, | |
| "eval_runtime": 37.9681, | |
| "eval_samples_per_second": 26.338, | |
| "eval_steps_per_second": 3.292, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 2.486980438232422, | |
| "learning_rate": 0.00024997085427135675, | |
| "loss": 1.3402, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 2.211982250213623, | |
| "learning_rate": 0.0002496693467336683, | |
| "loss": 1.3419, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 2.3362228870391846, | |
| "learning_rate": 0.00024936783919597986, | |
| "loss": 1.3748, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 1.515100121498108, | |
| "learning_rate": 0.0002490663316582914, | |
| "loss": 1.3747, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 2.1747968196868896, | |
| "learning_rate": 0.000248764824120603, | |
| "loss": 1.3458, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 2.6045758724212646, | |
| "learning_rate": 0.00024846331658291454, | |
| "loss": 1.3623, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 1.5456433296203613, | |
| "learning_rate": 0.0002481618090452261, | |
| "loss": 1.3107, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 1.5310312509536743, | |
| "learning_rate": 0.00024786030150753766, | |
| "loss": 1.3541, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 3.2094223499298096, | |
| "learning_rate": 0.0002475587939698492, | |
| "loss": 1.3445, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 2.7595880031585693, | |
| "learning_rate": 0.0002472572864321608, | |
| "loss": 1.3537, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "eval_loss": 1.3503804206848145, | |
| "eval_runtime": 37.8049, | |
| "eval_samples_per_second": 26.452, | |
| "eval_steps_per_second": 3.306, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 5.4382781982421875, | |
| "learning_rate": 0.00024695577889447234, | |
| "loss": 1.3584, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 2.7903175354003906, | |
| "learning_rate": 0.0002466542713567839, | |
| "loss": 1.3272, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 1.6171114444732666, | |
| "learning_rate": 0.00024635276381909546, | |
| "loss": 1.3601, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 2.9426279067993164, | |
| "learning_rate": 0.000246051256281407, | |
| "loss": 1.3782, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 2.36596941947937, | |
| "learning_rate": 0.0002457497487437186, | |
| "loss": 1.3307, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 1.3205448389053345, | |
| "learning_rate": 0.00024544824120603014, | |
| "loss": 1.3929, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 1.9464951753616333, | |
| "learning_rate": 0.0002451467336683417, | |
| "loss": 1.3415, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 1.7700294256210327, | |
| "learning_rate": 0.00024484522613065326, | |
| "loss": 1.3473, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 2.687060832977295, | |
| "learning_rate": 0.0002445437185929648, | |
| "loss": 1.3606, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 2.02754282951355, | |
| "learning_rate": 0.0002442422110552764, | |
| "loss": 1.3799, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "eval_loss": 1.365315556526184, | |
| "eval_runtime": 37.6707, | |
| "eval_samples_per_second": 26.546, | |
| "eval_steps_per_second": 3.318, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 2.187087059020996, | |
| "learning_rate": 0.0002439407035175879, | |
| "loss": 1.3585, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 3.8181040287017822, | |
| "learning_rate": 0.00024363919597989947, | |
| "loss": 1.3723, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 1.6949020624160767, | |
| "learning_rate": 0.00024333768844221105, | |
| "loss": 1.3074, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 2.716754913330078, | |
| "learning_rate": 0.00024303618090452259, | |
| "loss": 1.3589, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 1.5216838121414185, | |
| "learning_rate": 0.00024273467336683415, | |
| "loss": 1.3398, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 1.7370058298110962, | |
| "learning_rate": 0.0002424331658291457, | |
| "loss": 1.3546, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 2.0907745361328125, | |
| "learning_rate": 0.00024213165829145726, | |
| "loss": 1.3161, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 2.9564626216888428, | |
| "learning_rate": 0.00024183015075376882, | |
| "loss": 1.3623, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 2.6082723140716553, | |
| "learning_rate": 0.00024152864321608038, | |
| "loss": 1.3158, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 1.0046592950820923, | |
| "learning_rate": 0.00024122713567839192, | |
| "loss": 1.3366, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 1.3484834432601929, | |
| "eval_runtime": 37.9475, | |
| "eval_samples_per_second": 26.352, | |
| "eval_steps_per_second": 3.294, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 2.5935070514678955, | |
| "learning_rate": 0.0002409256281407035, | |
| "loss": 1.3512, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 3.790050506591797, | |
| "learning_rate": 0.00024062412060301506, | |
| "loss": 1.3272, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 1.01, | |
| "grad_norm": 1.3440461158752441, | |
| "learning_rate": 0.00024032562814070351, | |
| "loss": 1.333, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 6.51857852935791, | |
| "learning_rate": 0.00024002412060301505, | |
| "loss": 1.3334, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 1.02, | |
| "grad_norm": 1.882919192314148, | |
| "learning_rate": 0.0002397226130653266, | |
| "loss": 1.3241, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "grad_norm": 1.361558198928833, | |
| "learning_rate": 0.00023942110552763817, | |
| "loss": 1.3207, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 1.03, | |
| "grad_norm": 2.0967071056365967, | |
| "learning_rate": 0.00023911959798994975, | |
| "loss": 1.2993, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 2.2517688274383545, | |
| "learning_rate": 0.00023881809045226128, | |
| "loss": 1.3353, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "grad_norm": 7.7647480964660645, | |
| "learning_rate": 0.00023851658291457284, | |
| "loss": 1.3326, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 2.0270638465881348, | |
| "learning_rate": 0.0002382180904522613, | |
| "loss": 1.3046, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "eval_loss": 1.3456777334213257, | |
| "eval_runtime": 38.0868, | |
| "eval_samples_per_second": 26.256, | |
| "eval_steps_per_second": 3.282, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 1.05, | |
| "grad_norm": 1.9642785787582397, | |
| "learning_rate": 0.00023791658291457283, | |
| "loss": 1.3131, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "grad_norm": 2.517357587814331, | |
| "learning_rate": 0.0002376150753768844, | |
| "loss": 1.3627, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 1.06, | |
| "grad_norm": 1.4660860300064087, | |
| "learning_rate": 0.00023731356783919598, | |
| "loss": 1.2805, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "grad_norm": 3.102552652359009, | |
| "learning_rate": 0.00023701206030150753, | |
| "loss": 1.339, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 1.07, | |
| "grad_norm": 2.017504930496216, | |
| "learning_rate": 0.00023671055276381907, | |
| "loss": 1.3307, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 1.4260824918746948, | |
| "learning_rate": 0.00023640904522613063, | |
| "loss": 1.3216, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "grad_norm": 4.0052361488342285, | |
| "learning_rate": 0.0002361075376884422, | |
| "loss": 1.3544, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "grad_norm": 3.664625883102417, | |
| "learning_rate": 0.00023580603015075375, | |
| "loss": 1.3508, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 1.09, | |
| "grad_norm": 2.1044421195983887, | |
| "learning_rate": 0.0002355045226130653, | |
| "loss": 1.3205, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 1.6608549356460571, | |
| "learning_rate": 0.00023520301507537686, | |
| "loss": 1.3373, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "eval_loss": 1.319154977798462, | |
| "eval_runtime": 37.7789, | |
| "eval_samples_per_second": 26.47, | |
| "eval_steps_per_second": 3.309, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 1.1, | |
| "grad_norm": 2.131612777709961, | |
| "learning_rate": 0.00023490150753768845, | |
| "loss": 1.3244, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "grad_norm": 2.0854969024658203, | |
| "learning_rate": 0.00023459999999999998, | |
| "loss": 1.3357, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "grad_norm": 2.3622310161590576, | |
| "learning_rate": 0.0002343075376884422, | |
| "loss": 1.4118, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 2.5198066234588623, | |
| "learning_rate": 0.00023400603015075376, | |
| "loss": 1.319, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 1.12, | |
| "grad_norm": 2.4654555320739746, | |
| "learning_rate": 0.00023370452261306532, | |
| "loss": 1.3055, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 1.13, | |
| "grad_norm": 2.53120756149292, | |
| "learning_rate": 0.00023340301507537685, | |
| "loss": 1.3763, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 1.14, | |
| "grad_norm": 2.199324131011963, | |
| "learning_rate": 0.00023310150753768843, | |
| "loss": 1.3148, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 1.14, | |
| "grad_norm": 2.951871633529663, | |
| "learning_rate": 0.0002328, | |
| "loss": 1.3234, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 2.5513529777526855, | |
| "learning_rate": 0.00023249849246231153, | |
| "loss": 1.302, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "grad_norm": 5.096097469329834, | |
| "learning_rate": 0.00023219698492462309, | |
| "loss": 1.3102, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "eval_loss": 1.3704819679260254, | |
| "eval_runtime": 37.8283, | |
| "eval_samples_per_second": 26.435, | |
| "eval_steps_per_second": 3.304, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 1.3565678596496582, | |
| "learning_rate": 0.00023189547738693467, | |
| "loss": 1.3182, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 1.16, | |
| "grad_norm": 3.1972274780273438, | |
| "learning_rate": 0.00023159396984924623, | |
| "loss": 1.316, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 2.4728245735168457, | |
| "learning_rate": 0.00023129246231155776, | |
| "loss": 1.2934, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 1.17, | |
| "grad_norm": 1.917893648147583, | |
| "learning_rate": 0.00023099095477386932, | |
| "loss": 1.3241, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "grad_norm": 2.30876088142395, | |
| "learning_rate": 0.00023068944723618086, | |
| "loss": 1.3031, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 1.18, | |
| "grad_norm": 2.5653178691864014, | |
| "learning_rate": 0.00023038793969849244, | |
| "loss": 1.2819, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "grad_norm": 3.500821352005005, | |
| "learning_rate": 0.000230086432160804, | |
| "loss": 1.2829, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "grad_norm": 1.6564580202102661, | |
| "learning_rate": 0.00022978492462311556, | |
| "loss": 1.3209, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 2.6477315425872803, | |
| "learning_rate": 0.0002294834170854271, | |
| "loss": 1.2991, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 2.9583780765533447, | |
| "learning_rate": 0.00022918190954773868, | |
| "loss": 1.3011, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "eval_loss": 1.3160556554794312, | |
| "eval_runtime": 37.7643, | |
| "eval_samples_per_second": 26.48, | |
| "eval_steps_per_second": 3.31, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 1.21, | |
| "grad_norm": 2.3997368812561035, | |
| "learning_rate": 0.00022888040201005024, | |
| "loss": 1.2866, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 1.21, | |
| "grad_norm": 2.5909266471862793, | |
| "learning_rate": 0.00022857889447236177, | |
| "loss": 1.3133, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 1.9457557201385498, | |
| "learning_rate": 0.00022827738693467333, | |
| "loss": 1.2716, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 1.22, | |
| "grad_norm": 2.85856032371521, | |
| "learning_rate": 0.00022797587939698492, | |
| "loss": 1.2932, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 3.180671215057373, | |
| "learning_rate": 0.00022767437185929648, | |
| "loss": 1.317, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 1.630612850189209, | |
| "learning_rate": 0.000227372864321608, | |
| "loss": 1.3176, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "grad_norm": 2.159804582595825, | |
| "learning_rate": 0.00022707135678391957, | |
| "loss": 1.3288, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 1.24, | |
| "grad_norm": 1.314036250114441, | |
| "learning_rate": 0.00022676984924623116, | |
| "loss": 1.3157, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 2.718198776245117, | |
| "learning_rate": 0.0002264683417085427, | |
| "loss": 1.2915, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 2.3423640727996826, | |
| "learning_rate": 0.00022616683417085425, | |
| "loss": 1.2976, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "eval_loss": 1.3594353199005127, | |
| "eval_runtime": 37.7829, | |
| "eval_samples_per_second": 26.467, | |
| "eval_steps_per_second": 3.308, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 1.25, | |
| "grad_norm": 2.3341753482818604, | |
| "learning_rate": 0.0002258653266331658, | |
| "loss": 1.322, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 1.26, | |
| "grad_norm": 2.0798075199127197, | |
| "learning_rate": 0.0002255638190954774, | |
| "loss": 1.3182, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 1.5256847143173218, | |
| "learning_rate": 0.00022526231155778893, | |
| "loss": 1.3102, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 2.4831185340881348, | |
| "learning_rate": 0.00022496080402010049, | |
| "loss": 1.3183, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "grad_norm": 9.853681564331055, | |
| "learning_rate": 0.00022465929648241204, | |
| "loss": 1.2963, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 2.833552837371826, | |
| "learning_rate": 0.00022435778894472358, | |
| "loss": 1.3226, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 1.28, | |
| "grad_norm": 2.7486400604248047, | |
| "learning_rate": 0.00022405628140703516, | |
| "loss": 1.2742, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "grad_norm": 1.3708908557891846, | |
| "learning_rate": 0.00022375477386934672, | |
| "loss": 1.2878, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 1.29, | |
| "grad_norm": 3.6677916049957275, | |
| "learning_rate": 0.00022345326633165826, | |
| "loss": 1.3113, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 2.7909395694732666, | |
| "learning_rate": 0.00022315175879396981, | |
| "loss": 1.3221, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "eval_loss": 1.313453197479248, | |
| "eval_runtime": 37.7782, | |
| "eval_samples_per_second": 26.47, | |
| "eval_steps_per_second": 3.309, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 1.3, | |
| "grad_norm": 2.592221736907959, | |
| "learning_rate": 0.0002228502512562814, | |
| "loss": 1.2918, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 2.911118984222412, | |
| "learning_rate": 0.00022254874371859296, | |
| "loss": 1.3392, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "grad_norm": 2.15328049659729, | |
| "learning_rate": 0.0002222472361809045, | |
| "loss": 1.261, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 3.0731029510498047, | |
| "learning_rate": 0.00022194572864321605, | |
| "loss": 1.289, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 1.32, | |
| "grad_norm": 3.032560348510742, | |
| "learning_rate": 0.00022164422110552764, | |
| "loss": 1.3186, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "grad_norm": 5.388736724853516, | |
| "learning_rate": 0.00022134271356783917, | |
| "loss": 1.3214, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 1.33, | |
| "grad_norm": 2.6400022506713867, | |
| "learning_rate": 0.00022104120603015073, | |
| "loss": 1.2936, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": 3.9355711936950684, | |
| "learning_rate": 0.0002207396984924623, | |
| "loss": 1.3039, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 1.34, | |
| "grad_norm": 1.6818647384643555, | |
| "learning_rate": 0.00022043819095477388, | |
| "loss": 1.2992, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 2.2356157302856445, | |
| "learning_rate": 0.0002201366834170854, | |
| "loss": 1.3011, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "eval_loss": 1.3157364130020142, | |
| "eval_runtime": 37.9238, | |
| "eval_samples_per_second": 26.369, | |
| "eval_steps_per_second": 3.296, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "grad_norm": 2.158803701400757, | |
| "learning_rate": 0.00021983517587939697, | |
| "loss": 1.308, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 1.36, | |
| "grad_norm": 1.4748259782791138, | |
| "learning_rate": 0.0002195336683417085, | |
| "loss": 1.2873, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 1.36, | |
| "grad_norm": 2.382047653198242, | |
| "learning_rate": 0.0002192321608040201, | |
| "loss": 1.2795, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 1.37, | |
| "grad_norm": 1.8785953521728516, | |
| "learning_rate": 0.00021893065326633165, | |
| "loss": 1.3101, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 2.4842770099639893, | |
| "learning_rate": 0.0002186291457286432, | |
| "loss": 1.3124, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 1.38, | |
| "grad_norm": 1.7258535623550415, | |
| "learning_rate": 0.00021832763819095474, | |
| "loss": 1.3315, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "grad_norm": 2.157860517501831, | |
| "learning_rate": 0.00021802613065326633, | |
| "loss": 1.2848, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "grad_norm": 3.1965837478637695, | |
| "learning_rate": 0.00021772462311557788, | |
| "loss": 1.3105, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 3.141603708267212, | |
| "learning_rate": 0.00021742311557788942, | |
| "loss": 1.3197, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "grad_norm": 2.0368692874908447, | |
| "learning_rate": 0.00021712160804020098, | |
| "loss": 1.3113, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 1.4, | |
| "eval_loss": 1.3079107999801636, | |
| "eval_runtime": 37.8037, | |
| "eval_samples_per_second": 26.452, | |
| "eval_steps_per_second": 3.307, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 1.41, | |
| "grad_norm": 3.013373851776123, | |
| "learning_rate": 0.00021682010050251254, | |
| "loss": 1.2892, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 1.41, | |
| "grad_norm": 2.766491651535034, | |
| "learning_rate": 0.00021651859296482412, | |
| "loss": 1.3414, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 1.6288301944732666, | |
| "learning_rate": 0.00021621708542713566, | |
| "loss": 1.3156, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 1.42, | |
| "grad_norm": 2.3904545307159424, | |
| "learning_rate": 0.00021591557788944721, | |
| "loss": 1.2905, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "grad_norm": 2.263744831085205, | |
| "learning_rate": 0.00021561407035175877, | |
| "loss": 1.2961, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "grad_norm": 1.985129714012146, | |
| "learning_rate": 0.00021531256281407033, | |
| "loss": 1.2703, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 2.4574270248413086, | |
| "learning_rate": 0.0002150110552763819, | |
| "loss": 1.2793, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 1.44, | |
| "grad_norm": 2.312525510787964, | |
| "learning_rate": 0.00021470954773869345, | |
| "loss": 1.2669, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 1.5253132581710815, | |
| "learning_rate": 0.00021440804020100498, | |
| "loss": 1.3187, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "grad_norm": 1.7550122737884521, | |
| "learning_rate": 0.00021410653266331657, | |
| "loss": 1.3154, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 1.45, | |
| "eval_loss": 1.2937275171279907, | |
| "eval_runtime": 37.9639, | |
| "eval_samples_per_second": 26.341, | |
| "eval_steps_per_second": 3.293, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 2.492000102996826, | |
| "learning_rate": 0.00021380502512562813, | |
| "loss": 1.2868, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 1.46, | |
| "grad_norm": 4.013311862945557, | |
| "learning_rate": 0.00021350351758793966, | |
| "loss": 1.2578, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 3.991748809814453, | |
| "learning_rate": 0.00021320201005025122, | |
| "loss": 1.3347, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "grad_norm": 4.655180931091309, | |
| "learning_rate": 0.0002129005025125628, | |
| "loss": 1.2935, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 1.9497921466827393, | |
| "learning_rate": 0.00021259899497487437, | |
| "loss": 1.248, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 3.372061252593994, | |
| "learning_rate": 0.0002122974874371859, | |
| "loss": 1.2877, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 1.48, | |
| "grad_norm": 2.1920547485351562, | |
| "learning_rate": 0.00021199597989949746, | |
| "loss": 1.2407, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 1.49, | |
| "grad_norm": 3.5231897830963135, | |
| "learning_rate": 0.0002116974874371859, | |
| "loss": 1.2296, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 4.537712097167969, | |
| "learning_rate": 0.00021139597989949745, | |
| "loss": 1.2704, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 3.12864351272583, | |
| "learning_rate": 0.00021109447236180903, | |
| "loss": 1.3093, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "eval_loss": 1.2697720527648926, | |
| "eval_runtime": 37.8104, | |
| "eval_samples_per_second": 26.448, | |
| "eval_steps_per_second": 3.306, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 1.5, | |
| "grad_norm": 1.9532142877578735, | |
| "learning_rate": 0.0002107929648241206, | |
| "loss": 1.2892, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "grad_norm": 1.9121806621551514, | |
| "learning_rate": 0.00021049145728643215, | |
| "loss": 1.282, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 1.2597557306289673, | |
| "learning_rate": 0.00021018994974874368, | |
| "loss": 1.2793, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 1.7637083530426025, | |
| "learning_rate": 0.00020988844221105527, | |
| "loss": 1.3253, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 1.52, | |
| "grad_norm": 3.788984775543213, | |
| "learning_rate": 0.00020958693467336683, | |
| "loss": 1.249, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 1.53, | |
| "grad_norm": 3.1422038078308105, | |
| "learning_rate": 0.00020928542713567836, | |
| "loss": 1.2429, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 1.995868444442749, | |
| "learning_rate": 0.00020898391959798992, | |
| "loss": 1.2827, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 2.3635036945343018, | |
| "learning_rate": 0.00020868241206030148, | |
| "loss": 1.2653, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 1.54, | |
| "grad_norm": 2.0892832279205322, | |
| "learning_rate": 0.00020838090452261307, | |
| "loss": 1.2814, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "grad_norm": 2.8766140937805176, | |
| "learning_rate": 0.0002080793969849246, | |
| "loss": 1.2809, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "eval_loss": 1.2703502178192139, | |
| "eval_runtime": 37.818, | |
| "eval_samples_per_second": 26.442, | |
| "eval_steps_per_second": 3.305, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 2.5487587451934814, | |
| "learning_rate": 0.00020777788944723616, | |
| "loss": 1.2811, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 2.325295925140381, | |
| "learning_rate": 0.00020747638190954772, | |
| "loss": 1.2769, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 1.56, | |
| "grad_norm": 1.741773009300232, | |
| "learning_rate": 0.00020717487437185928, | |
| "loss": 1.2741, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 5.916422367095947, | |
| "learning_rate": 0.00020687336683417084, | |
| "loss": 1.2567, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 1.57, | |
| "grad_norm": 2.166018009185791, | |
| "learning_rate": 0.0002065718592964824, | |
| "loss": 1.2491, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 1.7622108459472656, | |
| "learning_rate": 0.00020627035175879393, | |
| "loss": 1.2815, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 1.58, | |
| "grad_norm": 2.2861111164093018, | |
| "learning_rate": 0.00020596884422110552, | |
| "loss": 1.2485, | |
| "step": 31700 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "grad_norm": 2.8738324642181396, | |
| "learning_rate": 0.00020566733668341708, | |
| "loss": 1.2747, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "grad_norm": 1.920782208442688, | |
| "learning_rate": 0.00020536582914572863, | |
| "loss": 1.3094, | |
| "step": 31900 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 2.591792345046997, | |
| "learning_rate": 0.00020506432160804017, | |
| "loss": 1.3178, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "eval_loss": 1.2383744716644287, | |
| "eval_runtime": 37.8786, | |
| "eval_samples_per_second": 26.4, | |
| "eval_steps_per_second": 3.3, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 3.4940438270568848, | |
| "learning_rate": 0.00020476281407035175, | |
| "loss": 1.2755, | |
| "step": 32100 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "grad_norm": 2.377112627029419, | |
| "learning_rate": 0.0002044613065326633, | |
| "loss": 1.2667, | |
| "step": 32200 | |
| }, | |
| { | |
| "epoch": 1.61, | |
| "grad_norm": 2.5229716300964355, | |
| "learning_rate": 0.00020415979899497485, | |
| "loss": 1.2695, | |
| "step": 32300 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": 2.469883441925049, | |
| "learning_rate": 0.0002038582914572864, | |
| "loss": 1.3089, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 1.62, | |
| "grad_norm": 1.9299498796463013, | |
| "learning_rate": 0.000203556783919598, | |
| "loss": 1.2835, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "grad_norm": 2.486790895462036, | |
| "learning_rate": 0.00020325527638190955, | |
| "loss": 1.2531, | |
| "step": 32600 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 3.485691785812378, | |
| "learning_rate": 0.00020295376884422108, | |
| "loss": 1.2568, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 1.64, | |
| "grad_norm": 1.674727201461792, | |
| "learning_rate": 0.00020265226130653264, | |
| "loss": 1.2739, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 4.50739049911499, | |
| "learning_rate": 0.00020235075376884417, | |
| "loss": 1.211, | |
| "step": 32900 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "grad_norm": 11.218056678771973, | |
| "learning_rate": 0.00020204924623115576, | |
| "loss": 1.2891, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 1.65, | |
| "eval_loss": 1.2705625295639038, | |
| "eval_runtime": 37.8291, | |
| "eval_samples_per_second": 26.435, | |
| "eval_steps_per_second": 3.304, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "grad_norm": 1.9991952180862427, | |
| "learning_rate": 0.00020174773869346732, | |
| "loss": 1.2636, | |
| "step": 33100 | |
| }, | |
| { | |
| "epoch": 1.66, | |
| "grad_norm": 3.0366969108581543, | |
| "learning_rate": 0.00020144623115577888, | |
| "loss": 1.2903, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 1.7985395193099976, | |
| "learning_rate": 0.0002011447236180904, | |
| "loss": 1.2437, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "grad_norm": 3.8208954334259033, | |
| "learning_rate": 0.000200843216080402, | |
| "loss": 1.244, | |
| "step": 33400 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "grad_norm": 3.2836215496063232, | |
| "learning_rate": 0.00020054170854271356, | |
| "loss": 1.2837, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 1.68, | |
| "grad_norm": 3.15663480758667, | |
| "learning_rate": 0.0002002402010050251, | |
| "loss": 1.2253, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 1.6871391534805298, | |
| "learning_rate": 0.00019993869346733665, | |
| "loss": 1.2564, | |
| "step": 33700 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 2.3701913356781006, | |
| "learning_rate": 0.00019963718592964824, | |
| "loss": 1.2925, | |
| "step": 33800 | |
| }, | |
| { | |
| "epoch": 1.69, | |
| "grad_norm": 2.9534804821014404, | |
| "learning_rate": 0.0001993356783919598, | |
| "loss": 1.2613, | |
| "step": 33900 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "grad_norm": 2.273113489151001, | |
| "learning_rate": 0.00019903417085427133, | |
| "loss": 1.29, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 1.7, | |
| "eval_loss": 1.2713490724563599, | |
| "eval_runtime": 37.9786, | |
| "eval_samples_per_second": 26.331, | |
| "eval_steps_per_second": 3.291, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 2.1708054542541504, | |
| "learning_rate": 0.0001987326633165829, | |
| "loss": 1.2775, | |
| "step": 34100 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 2.242708683013916, | |
| "learning_rate": 0.00019843115577889447, | |
| "loss": 1.2561, | |
| "step": 34200 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "grad_norm": 2.0170931816101074, | |
| "learning_rate": 0.000198129648241206, | |
| "loss": 1.2168, | |
| "step": 34300 | |
| }, | |
| { | |
| "epoch": 1.72, | |
| "grad_norm": 2.094848871231079, | |
| "learning_rate": 0.00019782814070351757, | |
| "loss": 1.2588, | |
| "step": 34400 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 2.1762752532958984, | |
| "learning_rate": 0.00019752663316582913, | |
| "loss": 1.1837, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 3.1318016052246094, | |
| "learning_rate": 0.0001972251256281407, | |
| "loss": 1.2196, | |
| "step": 34600 | |
| }, | |
| { | |
| "epoch": 1.73, | |
| "grad_norm": 3.2971861362457275, | |
| "learning_rate": 0.00019692361809045225, | |
| "loss": 1.2778, | |
| "step": 34700 | |
| }, | |
| { | |
| "epoch": 1.74, | |
| "grad_norm": 3.452091693878174, | |
| "learning_rate": 0.0001966221105527638, | |
| "loss": 1.2385, | |
| "step": 34800 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 1.7514299154281616, | |
| "learning_rate": 0.00019632060301507536, | |
| "loss": 1.2769, | |
| "step": 34900 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 2.3494088649749756, | |
| "learning_rate": 0.00019601909547738692, | |
| "loss": 1.2689, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "eval_loss": 1.2675199508666992, | |
| "eval_runtime": 37.8879, | |
| "eval_samples_per_second": 26.394, | |
| "eval_steps_per_second": 3.299, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "grad_norm": 1.5741009712219238, | |
| "learning_rate": 0.00019571758793969848, | |
| "loss": 1.2352, | |
| "step": 35100 | |
| }, | |
| { | |
| "epoch": 1.76, | |
| "grad_norm": 2.652435302734375, | |
| "learning_rate": 0.00019541608040201004, | |
| "loss": 1.2824, | |
| "step": 35200 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "grad_norm": 2.9557676315307617, | |
| "learning_rate": 0.00019511457286432157, | |
| "loss": 1.2453, | |
| "step": 35300 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "grad_norm": 2.8758041858673096, | |
| "learning_rate": 0.00019481306532663313, | |
| "loss": 1.2507, | |
| "step": 35400 | |
| }, | |
| { | |
| "epoch": 1.77, | |
| "grad_norm": 2.5828402042388916, | |
| "learning_rate": 0.0001945145728643216, | |
| "loss": 1.2201, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 1.78, | |
| "grad_norm": 2.887206554412842, | |
| "learning_rate": 0.00019421306532663312, | |
| "loss": 1.2754, | |
| "step": 35600 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 2.5521140098571777, | |
| "learning_rate": 0.0001939115577889447, | |
| "loss": 1.234, | |
| "step": 35700 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 1.9570846557617188, | |
| "learning_rate": 0.00019361005025125627, | |
| "loss": 1.2708, | |
| "step": 35800 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "grad_norm": 2.89273738861084, | |
| "learning_rate": 0.00019330854271356782, | |
| "loss": 1.2343, | |
| "step": 35900 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "grad_norm": 3.624706506729126, | |
| "learning_rate": 0.00019300703517587936, | |
| "loss": 1.2576, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 1.8, | |
| "eval_loss": 1.2644726037979126, | |
| "eval_runtime": 37.8527, | |
| "eval_samples_per_second": 26.418, | |
| "eval_steps_per_second": 3.302, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 2.5976133346557617, | |
| "learning_rate": 0.00019270552763819094, | |
| "loss": 1.2812, | |
| "step": 36100 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 2.899306297302246, | |
| "learning_rate": 0.0001924040201005025, | |
| "loss": 1.2541, | |
| "step": 36200 | |
| }, | |
| { | |
| "epoch": 1.81, | |
| "grad_norm": 3.964782476425171, | |
| "learning_rate": 0.00019210251256281404, | |
| "loss": 1.2639, | |
| "step": 36300 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 2.4634933471679688, | |
| "learning_rate": 0.0001918010050251256, | |
| "loss": 1.2089, | |
| "step": 36400 | |
| }, | |
| { | |
| "epoch": 1.82, | |
| "grad_norm": 2.6023619174957275, | |
| "learning_rate": 0.00019149949748743718, | |
| "loss": 1.2612, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 1.83, | |
| "grad_norm": 3.0462849140167236, | |
| "learning_rate": 0.00019119798994974874, | |
| "loss": 1.2204, | |
| "step": 36600 | |
| }, | |
| { | |
| "epoch": 1.83, | |
| "grad_norm": 2.1344144344329834, | |
| "learning_rate": 0.00019089648241206027, | |
| "loss": 1.2142, | |
| "step": 36700 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 1.5994189977645874, | |
| "learning_rate": 0.00019059497487437183, | |
| "loss": 1.2586, | |
| "step": 36800 | |
| }, | |
| { | |
| "epoch": 1.84, | |
| "grad_norm": 1.357469916343689, | |
| "learning_rate": 0.00019029346733668342, | |
| "loss": 1.2705, | |
| "step": 36900 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "grad_norm": 2.4201526641845703, | |
| "learning_rate": 0.00018999195979899495, | |
| "loss": 1.2409, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "eval_loss": 1.2103183269500732, | |
| "eval_runtime": 37.8707, | |
| "eval_samples_per_second": 26.406, | |
| "eval_steps_per_second": 3.301, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 1.85, | |
| "grad_norm": 3.1790504455566406, | |
| "learning_rate": 0.0001896904522613065, | |
| "loss": 1.2639, | |
| "step": 37100 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "grad_norm": 2.565474033355713, | |
| "learning_rate": 0.00018938894472361807, | |
| "loss": 1.2853, | |
| "step": 37200 | |
| }, | |
| { | |
| "epoch": 1.86, | |
| "grad_norm": 2.6977927684783936, | |
| "learning_rate": 0.00018908743718592966, | |
| "loss": 1.2178, | |
| "step": 37300 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "grad_norm": 2.588975191116333, | |
| "learning_rate": 0.0001887859296482412, | |
| "loss": 1.2492, | |
| "step": 37400 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 2.23592209815979, | |
| "learning_rate": 0.00018848442211055275, | |
| "loss": 1.2273, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 1.88, | |
| "grad_norm": 2.0961692333221436, | |
| "learning_rate": 0.0001881859296482412, | |
| "loss": 1.2375, | |
| "step": 37600 | |
| }, | |
| { | |
| "epoch": 1.89, | |
| "grad_norm": 2.4870264530181885, | |
| "learning_rate": 0.00018788442211055273, | |
| "loss": 1.2564, | |
| "step": 37700 | |
| }, | |
| { | |
| "epoch": 1.89, | |
| "grad_norm": 1.9144058227539062, | |
| "learning_rate": 0.0001875829145728643, | |
| "loss": 1.2403, | |
| "step": 37800 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 2.209117889404297, | |
| "learning_rate": 0.00018728140703517588, | |
| "loss": 1.2168, | |
| "step": 37900 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "grad_norm": 2.7400968074798584, | |
| "learning_rate": 0.00018697989949748744, | |
| "loss": 1.1786, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 1.9, | |
| "eval_loss": 1.2550157308578491, | |
| "eval_runtime": 37.907, | |
| "eval_samples_per_second": 26.38, | |
| "eval_steps_per_second": 3.298, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "grad_norm": 2.392390251159668, | |
| "learning_rate": 0.00018667839195979897, | |
| "loss": 1.2294, | |
| "step": 38100 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "grad_norm": 3.434168577194214, | |
| "learning_rate": 0.00018637688442211053, | |
| "loss": 1.2491, | |
| "step": 38200 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 2.082618236541748, | |
| "learning_rate": 0.0001860753768844221, | |
| "loss": 1.2602, | |
| "step": 38300 | |
| }, | |
| { | |
| "epoch": 1.92, | |
| "grad_norm": 1.6049084663391113, | |
| "learning_rate": 0.00018577386934673365, | |
| "loss": 1.2067, | |
| "step": 38400 | |
| }, | |
| { | |
| "epoch": 1.93, | |
| "grad_norm": 2.1953368186950684, | |
| "learning_rate": 0.0001854723618090452, | |
| "loss": 1.2292, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 1.93, | |
| "grad_norm": 2.6085190773010254, | |
| "learning_rate": 0.00018517085427135677, | |
| "loss": 1.2269, | |
| "step": 38600 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 2.9110639095306396, | |
| "learning_rate": 0.0001848693467336683, | |
| "loss": 1.1898, | |
| "step": 38700 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 1.514410138130188, | |
| "learning_rate": 0.0001845678391959799, | |
| "loss": 1.199, | |
| "step": 38800 | |
| }, | |
| { | |
| "epoch": 1.94, | |
| "grad_norm": 4.6756134033203125, | |
| "learning_rate": 0.00018426633165829145, | |
| "loss": 1.183, | |
| "step": 38900 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "grad_norm": 2.704317808151245, | |
| "learning_rate": 0.000183964824120603, | |
| "loss": 1.1999, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "eval_loss": 1.2309662103652954, | |
| "eval_runtime": 37.8598, | |
| "eval_samples_per_second": 26.413, | |
| "eval_steps_per_second": 3.302, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 2.5975565910339355, | |
| "learning_rate": 0.00018366331658291454, | |
| "loss": 1.2576, | |
| "step": 39100 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 3.3112730979919434, | |
| "learning_rate": 0.00018336180904522613, | |
| "loss": 1.2128, | |
| "step": 39200 | |
| }, | |
| { | |
| "epoch": 1.96, | |
| "grad_norm": 2.5991640090942383, | |
| "learning_rate": 0.00018306030150753769, | |
| "loss": 1.2294, | |
| "step": 39300 | |
| }, | |
| { | |
| "epoch": 1.97, | |
| "grad_norm": 4.411704063415527, | |
| "learning_rate": 0.00018275879396984922, | |
| "loss": 1.1977, | |
| "step": 39400 | |
| }, | |
| { | |
| "epoch": 1.98, | |
| "grad_norm": 1.509308099746704, | |
| "learning_rate": 0.00018245728643216078, | |
| "loss": 1.2712, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 1.98, | |
| "grad_norm": 2.136350631713867, | |
| "learning_rate": 0.00018215577889447236, | |
| "loss": 1.2359, | |
| "step": 39600 | |
| }, | |
| { | |
| "epoch": 1.98, | |
| "grad_norm": 2.1651546955108643, | |
| "learning_rate": 0.0001818542713567839, | |
| "loss": 1.2448, | |
| "step": 39700 | |
| }, | |
| { | |
| "epoch": 1.99, | |
| "grad_norm": 2.9962761402130127, | |
| "learning_rate": 0.00018155577889447235, | |
| "loss": 1.218, | |
| "step": 39800 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 2.8525376319885254, | |
| "learning_rate": 0.0001812542713567839, | |
| "loss": 1.2564, | |
| "step": 39900 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 2.120208740234375, | |
| "learning_rate": 0.00018095276381909547, | |
| "loss": 1.2287, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 1.2058476209640503, | |
| "eval_runtime": 38.0203, | |
| "eval_samples_per_second": 26.302, | |
| "eval_steps_per_second": 3.288, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 3.9785573482513428, | |
| "learning_rate": 0.000180651256281407, | |
| "loss": 1.2161, | |
| "step": 40100 | |
| }, | |
| { | |
| "epoch": 2.01, | |
| "grad_norm": 2.7897050380706787, | |
| "learning_rate": 0.0001803497487437186, | |
| "loss": 1.2525, | |
| "step": 40200 | |
| }, | |
| { | |
| "epoch": 2.02, | |
| "grad_norm": 2.042492389678955, | |
| "learning_rate": 0.00018004824120603015, | |
| "loss": 1.2087, | |
| "step": 40300 | |
| }, | |
| { | |
| "epoch": 2.02, | |
| "grad_norm": 1.8287073373794556, | |
| "learning_rate": 0.00017974673366834168, | |
| "loss": 1.2404, | |
| "step": 40400 | |
| }, | |
| { | |
| "epoch": 2.02, | |
| "grad_norm": 1.6399390697479248, | |
| "learning_rate": 0.00017944522613065324, | |
| "loss": 1.174, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 2.03, | |
| "grad_norm": 3.9909472465515137, | |
| "learning_rate": 0.00017914371859296482, | |
| "loss": 1.1869, | |
| "step": 40600 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": 2.9356400966644287, | |
| "learning_rate": 0.00017884221105527638, | |
| "loss": 1.2271, | |
| "step": 40700 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": 2.205498218536377, | |
| "learning_rate": 0.00017854070351758792, | |
| "loss": 1.2505, | |
| "step": 40800 | |
| }, | |
| { | |
| "epoch": 2.04, | |
| "grad_norm": 2.2801437377929688, | |
| "learning_rate": 0.00017823919597989948, | |
| "loss": 1.2232, | |
| "step": 40900 | |
| }, | |
| { | |
| "epoch": 2.05, | |
| "grad_norm": 4.001745223999023, | |
| "learning_rate": 0.00017793768844221104, | |
| "loss": 1.257, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 2.05, | |
| "eval_loss": 1.1965339183807373, | |
| "eval_runtime": 37.9045, | |
| "eval_samples_per_second": 26.382, | |
| "eval_steps_per_second": 3.298, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": 3.484135150909424, | |
| "learning_rate": 0.0001776361809045226, | |
| "loss": 1.2232, | |
| "step": 41100 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": 2.7462897300720215, | |
| "learning_rate": 0.00017733467336683415, | |
| "loss": 1.22, | |
| "step": 41200 | |
| }, | |
| { | |
| "epoch": 2.06, | |
| "grad_norm": 2.9418435096740723, | |
| "learning_rate": 0.00017703316582914571, | |
| "loss": 1.2141, | |
| "step": 41300 | |
| }, | |
| { | |
| "epoch": 2.07, | |
| "grad_norm": 2.188680410385132, | |
| "learning_rate": 0.00017673165829145725, | |
| "loss": 1.1909, | |
| "step": 41400 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 3.728938579559326, | |
| "learning_rate": 0.00017643015075376883, | |
| "loss": 1.2146, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 2.8790736198425293, | |
| "learning_rate": 0.0001761286432160804, | |
| "loss": 1.2305, | |
| "step": 41600 | |
| }, | |
| { | |
| "epoch": 2.08, | |
| "grad_norm": 3.6593847274780273, | |
| "learning_rate": 0.00017582713567839195, | |
| "loss": 1.1753, | |
| "step": 41700 | |
| }, | |
| { | |
| "epoch": 2.09, | |
| "grad_norm": 2.408237934112549, | |
| "learning_rate": 0.00017552562814070348, | |
| "loss": 1.2229, | |
| "step": 41800 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": 2.574580669403076, | |
| "learning_rate": 0.00017522412060301507, | |
| "loss": 1.2173, | |
| "step": 41900 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": 2.2249817848205566, | |
| "learning_rate": 0.00017492261306532663, | |
| "loss": 1.2112, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "eval_loss": 1.2255558967590332, | |
| "eval_runtime": 37.9009, | |
| "eval_samples_per_second": 26.385, | |
| "eval_steps_per_second": 3.298, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 2.1, | |
| "grad_norm": 2.2712411880493164, | |
| "learning_rate": 0.00017462110552763816, | |
| "loss": 1.1862, | |
| "step": 42100 | |
| }, | |
| { | |
| "epoch": 2.11, | |
| "grad_norm": 1.646330714225769, | |
| "learning_rate": 0.00017431959798994972, | |
| "loss": 1.1812, | |
| "step": 42200 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 2.9691689014434814, | |
| "learning_rate": 0.0001740180904522613, | |
| "loss": 1.2055, | |
| "step": 42300 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 5.179681777954102, | |
| "learning_rate": 0.00017371658291457287, | |
| "loss": 1.1625, | |
| "step": 42400 | |
| }, | |
| { | |
| "epoch": 2.12, | |
| "grad_norm": 2.634462833404541, | |
| "learning_rate": 0.0001734150753768844, | |
| "loss": 1.2257, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 2.13, | |
| "grad_norm": 8.693337440490723, | |
| "learning_rate": 0.00017311356783919596, | |
| "loss": 1.2447, | |
| "step": 42600 | |
| }, | |
| { | |
| "epoch": 2.13, | |
| "grad_norm": 3.228513240814209, | |
| "learning_rate": 0.00017281206030150755, | |
| "loss": 1.1993, | |
| "step": 42700 | |
| }, | |
| { | |
| "epoch": 2.14, | |
| "grad_norm": 7.938237190246582, | |
| "learning_rate": 0.00017251055276381908, | |
| "loss": 1.2084, | |
| "step": 42800 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 3.0843794345855713, | |
| "learning_rate": 0.00017220904522613064, | |
| "loss": 1.2017, | |
| "step": 42900 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 2.86205792427063, | |
| "learning_rate": 0.0001719075376884422, | |
| "loss": 1.1706, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "eval_loss": 1.2179350852966309, | |
| "eval_runtime": 37.9173, | |
| "eval_samples_per_second": 26.373, | |
| "eval_steps_per_second": 3.297, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 2.15, | |
| "grad_norm": 2.137380361557007, | |
| "learning_rate": 0.00017160904522613062, | |
| "loss": 1.2066, | |
| "step": 43100 | |
| }, | |
| { | |
| "epoch": 2.16, | |
| "grad_norm": 2.250091075897217, | |
| "learning_rate": 0.00017130753768844218, | |
| "loss": 1.211, | |
| "step": 43200 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "grad_norm": 2.008875608444214, | |
| "learning_rate": 0.00017100603015075377, | |
| "loss": 1.2116, | |
| "step": 43300 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "grad_norm": 2.6691529750823975, | |
| "learning_rate": 0.00017070452261306533, | |
| "loss": 1.1844, | |
| "step": 43400 | |
| }, | |
| { | |
| "epoch": 2.17, | |
| "grad_norm": 1.8802026510238647, | |
| "learning_rate": 0.00017040301507537686, | |
| "loss": 1.1849, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 2.18, | |
| "grad_norm": 2.4100139141082764, | |
| "learning_rate": 0.00017010150753768842, | |
| "loss": 1.1887, | |
| "step": 43600 | |
| }, | |
| { | |
| "epoch": 2.19, | |
| "grad_norm": 3.3384740352630615, | |
| "learning_rate": 0.00016979999999999998, | |
| "loss": 1.2338, | |
| "step": 43700 | |
| }, | |
| { | |
| "epoch": 2.19, | |
| "grad_norm": 2.349433183670044, | |
| "learning_rate": 0.00016949849246231154, | |
| "loss": 1.1633, | |
| "step": 43800 | |
| }, | |
| { | |
| "epoch": 2.19, | |
| "grad_norm": 3.019296884536743, | |
| "learning_rate": 0.0001691969849246231, | |
| "loss": 1.2456, | |
| "step": 43900 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "grad_norm": 2.497424364089966, | |
| "learning_rate": 0.00016889547738693466, | |
| "loss": 1.1671, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 2.2, | |
| "eval_loss": 1.2000114917755127, | |
| "eval_runtime": 40.4714, | |
| "eval_samples_per_second": 24.709, | |
| "eval_steps_per_second": 3.089, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 2.21, | |
| "grad_norm": 1.6698800325393677, | |
| "learning_rate": 0.0001685939698492462, | |
| "loss": 1.2105, | |
| "step": 44100 | |
| }, | |
| { | |
| "epoch": 2.21, | |
| "grad_norm": 2.3846988677978516, | |
| "learning_rate": 0.00016829246231155778, | |
| "loss": 1.2229, | |
| "step": 44200 | |
| }, | |
| { | |
| "epoch": 2.21, | |
| "grad_norm": 5.891537189483643, | |
| "learning_rate": 0.00016799095477386934, | |
| "loss": 1.1848, | |
| "step": 44300 | |
| }, | |
| { | |
| "epoch": 2.22, | |
| "grad_norm": 1.4433008432388306, | |
| "learning_rate": 0.0001676894472361809, | |
| "loss": 1.1905, | |
| "step": 44400 | |
| }, | |
| { | |
| "epoch": 2.23, | |
| "grad_norm": 2.5641889572143555, | |
| "learning_rate": 0.00016738793969849243, | |
| "loss": 1.2219, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 2.23, | |
| "grad_norm": 3.052948474884033, | |
| "learning_rate": 0.00016708643216080402, | |
| "loss": 1.1887, | |
| "step": 44600 | |
| }, | |
| { | |
| "epoch": 2.23, | |
| "grad_norm": 2.8185369968414307, | |
| "learning_rate": 0.00016678492462311557, | |
| "loss": 1.2107, | |
| "step": 44700 | |
| }, | |
| { | |
| "epoch": 2.24, | |
| "grad_norm": 2.9409399032592773, | |
| "learning_rate": 0.0001664834170854271, | |
| "loss": 1.2222, | |
| "step": 44800 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 2.728256940841675, | |
| "learning_rate": 0.00016618190954773867, | |
| "loss": 1.1767, | |
| "step": 44900 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 2.4744584560394287, | |
| "learning_rate": 0.00016588040201005025, | |
| "loss": 1.1663, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "eval_loss": 1.2085031270980835, | |
| "eval_runtime": 41.0345, | |
| "eval_samples_per_second": 24.37, | |
| "eval_steps_per_second": 3.046, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 2.25, | |
| "grad_norm": 3.215564250946045, | |
| "learning_rate": 0.00016558190954773868, | |
| "loss": 1.173, | |
| "step": 45100 | |
| }, | |
| { | |
| "epoch": 2.26, | |
| "grad_norm": 1.7013347148895264, | |
| "learning_rate": 0.00016528040201005024, | |
| "loss": 1.1637, | |
| "step": 45200 | |
| }, | |
| { | |
| "epoch": 2.27, | |
| "grad_norm": 3.1096675395965576, | |
| "learning_rate": 0.0001649788944723618, | |
| "loss": 1.1702, | |
| "step": 45300 | |
| }, | |
| { | |
| "epoch": 2.27, | |
| "grad_norm": 2.5975756645202637, | |
| "learning_rate": 0.00016467738693467336, | |
| "loss": 1.1763, | |
| "step": 45400 | |
| }, | |
| { | |
| "epoch": 2.27, | |
| "grad_norm": 2.7020699977874756, | |
| "learning_rate": 0.0001643758793969849, | |
| "loss": 1.1761, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 2.28, | |
| "grad_norm": 1.7007007598876953, | |
| "learning_rate": 0.00016407437185929648, | |
| "loss": 1.2064, | |
| "step": 45600 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 3.6038424968719482, | |
| "learning_rate": 0.00016377286432160804, | |
| "loss": 1.1716, | |
| "step": 45700 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 2.3656082153320312, | |
| "learning_rate": 0.0001634713567839196, | |
| "loss": 1.1954, | |
| "step": 45800 | |
| }, | |
| { | |
| "epoch": 2.29, | |
| "grad_norm": 2.390509605407715, | |
| "learning_rate": 0.00016316984924623113, | |
| "loss": 1.1664, | |
| "step": 45900 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "grad_norm": 1.8767670392990112, | |
| "learning_rate": 0.00016286834170854271, | |
| "loss": 1.1784, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 2.3, | |
| "eval_loss": 1.1809154748916626, | |
| "eval_runtime": 43.7304, | |
| "eval_samples_per_second": 22.867, | |
| "eval_steps_per_second": 2.858, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 2.31, | |
| "grad_norm": 3.4367122650146484, | |
| "learning_rate": 0.00016256683417085427, | |
| "loss": 1.2055, | |
| "step": 46100 | |
| }, | |
| { | |
| "epoch": 2.31, | |
| "grad_norm": 1.672525405883789, | |
| "learning_rate": 0.0001622653266331658, | |
| "loss": 1.1954, | |
| "step": 46200 | |
| }, | |
| { | |
| "epoch": 2.31, | |
| "grad_norm": 3.2755866050720215, | |
| "learning_rate": 0.00016196381909547737, | |
| "loss": 1.1801, | |
| "step": 46300 | |
| }, | |
| { | |
| "epoch": 2.32, | |
| "grad_norm": 2.347280979156494, | |
| "learning_rate": 0.00016166231155778892, | |
| "loss": 1.1651, | |
| "step": 46400 | |
| }, | |
| { | |
| "epoch": 2.33, | |
| "grad_norm": 1.9565701484680176, | |
| "learning_rate": 0.0001613608040201005, | |
| "loss": 1.2142, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 2.33, | |
| "grad_norm": 2.317847728729248, | |
| "learning_rate": 0.00016105929648241204, | |
| "loss": 1.188, | |
| "step": 46600 | |
| }, | |
| { | |
| "epoch": 2.33, | |
| "grad_norm": 1.812322974205017, | |
| "learning_rate": 0.0001607577889447236, | |
| "loss": 1.1425, | |
| "step": 46700 | |
| }, | |
| { | |
| "epoch": 2.34, | |
| "grad_norm": 2.5393502712249756, | |
| "learning_rate": 0.00016045628140703514, | |
| "loss": 1.1854, | |
| "step": 46800 | |
| }, | |
| { | |
| "epoch": 2.34, | |
| "grad_norm": 6.562712669372559, | |
| "learning_rate": 0.00016015477386934672, | |
| "loss": 1.1517, | |
| "step": 46900 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "grad_norm": 2.2086706161499023, | |
| "learning_rate": 0.00015985326633165828, | |
| "loss": 1.1634, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "eval_loss": 1.1972031593322754, | |
| "eval_runtime": 43.2883, | |
| "eval_samples_per_second": 23.101, | |
| "eval_steps_per_second": 2.888, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 2.35, | |
| "grad_norm": 2.061951160430908, | |
| "learning_rate": 0.00015955175879396984, | |
| "loss": 1.2409, | |
| "step": 47100 | |
| }, | |
| { | |
| "epoch": 2.36, | |
| "grad_norm": 2.0312881469726562, | |
| "learning_rate": 0.00015925025125628137, | |
| "loss": 1.1731, | |
| "step": 47200 | |
| }, | |
| { | |
| "epoch": 2.37, | |
| "grad_norm": 4.90245246887207, | |
| "learning_rate": 0.00015894874371859296, | |
| "loss": 1.1849, | |
| "step": 47300 | |
| }, | |
| { | |
| "epoch": 2.37, | |
| "grad_norm": 2.4970901012420654, | |
| "learning_rate": 0.00015864723618090452, | |
| "loss": 1.1684, | |
| "step": 47400 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "grad_norm": 2.4406049251556396, | |
| "learning_rate": 0.00015834572864321605, | |
| "loss": 1.1855, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "grad_norm": 2.8650543689727783, | |
| "learning_rate": 0.0001580442211055276, | |
| "loss": 1.1586, | |
| "step": 47600 | |
| }, | |
| { | |
| "epoch": 2.38, | |
| "grad_norm": 2.4787731170654297, | |
| "learning_rate": 0.0001577427135678392, | |
| "loss": 1.1913, | |
| "step": 47700 | |
| }, | |
| { | |
| "epoch": 2.39, | |
| "grad_norm": 2.5188841819763184, | |
| "learning_rate": 0.00015744120603015076, | |
| "loss": 1.1938, | |
| "step": 47800 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 3.8095650672912598, | |
| "learning_rate": 0.0001571396984924623, | |
| "loss": 1.1858, | |
| "step": 47900 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 2.147993564605713, | |
| "learning_rate": 0.00015683819095477385, | |
| "loss": 1.1703, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "eval_loss": 1.1952226161956787, | |
| "eval_runtime": 42.4811, | |
| "eval_samples_per_second": 23.54, | |
| "eval_steps_per_second": 2.942, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 2.41, | |
| "grad_norm": 3.050976514816284, | |
| "learning_rate": 0.00015653668341708544, | |
| "loss": 1.1868, | |
| "step": 48100 | |
| }, | |
| { | |
| "epoch": 2.41, | |
| "grad_norm": 2.6880428791046143, | |
| "learning_rate": 0.00015623517587939697, | |
| "loss": 1.1486, | |
| "step": 48200 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 2.169895648956299, | |
| "learning_rate": 0.00015593366834170853, | |
| "loss": 1.1646, | |
| "step": 48300 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 9.948437690734863, | |
| "learning_rate": 0.0001556321608040201, | |
| "loss": 1.1625, | |
| "step": 48400 | |
| }, | |
| { | |
| "epoch": 2.42, | |
| "grad_norm": 2.1219215393066406, | |
| "learning_rate": 0.00015533065326633162, | |
| "loss": 1.1854, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 2.43, | |
| "grad_norm": 3.2466542720794678, | |
| "learning_rate": 0.0001550291457286432, | |
| "loss": 1.1556, | |
| "step": 48600 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 1.8362162113189697, | |
| "learning_rate": 0.00015472763819095477, | |
| "loss": 1.177, | |
| "step": 48700 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 3.579221725463867, | |
| "learning_rate": 0.00015442613065326632, | |
| "loss": 1.1671, | |
| "step": 48800 | |
| }, | |
| { | |
| "epoch": 2.44, | |
| "grad_norm": 2.256967782974243, | |
| "learning_rate": 0.00015412462311557786, | |
| "loss": 1.1807, | |
| "step": 48900 | |
| }, | |
| { | |
| "epoch": 2.45, | |
| "grad_norm": 2.107179641723633, | |
| "learning_rate": 0.00015382311557788944, | |
| "loss": 1.186, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 2.45, | |
| "eval_loss": 1.1811304092407227, | |
| "eval_runtime": 43.1582, | |
| "eval_samples_per_second": 23.171, | |
| "eval_steps_per_second": 2.896, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 2.46, | |
| "grad_norm": 2.615290880203247, | |
| "learning_rate": 0.000153521608040201, | |
| "loss": 1.1828, | |
| "step": 49100 | |
| }, | |
| { | |
| "epoch": 2.46, | |
| "grad_norm": 1.600845217704773, | |
| "learning_rate": 0.00015322010050251254, | |
| "loss": 1.1438, | |
| "step": 49200 | |
| }, | |
| { | |
| "epoch": 2.46, | |
| "grad_norm": 2.272726058959961, | |
| "learning_rate": 0.0001529185929648241, | |
| "loss": 1.1802, | |
| "step": 49300 | |
| }, | |
| { | |
| "epoch": 2.47, | |
| "grad_norm": 1.9845112562179565, | |
| "learning_rate": 0.00015261708542713568, | |
| "loss": 1.1828, | |
| "step": 49400 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 1.4725877046585083, | |
| "learning_rate": 0.00015231859296482408, | |
| "loss": 1.1938, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 2.4453134536743164, | |
| "learning_rate": 0.00015201708542713567, | |
| "loss": 1.1928, | |
| "step": 49600 | |
| }, | |
| { | |
| "epoch": 2.48, | |
| "grad_norm": 2.9869000911712646, | |
| "learning_rate": 0.00015171557788944723, | |
| "loss": 1.1982, | |
| "step": 49700 | |
| }, | |
| { | |
| "epoch": 2.49, | |
| "grad_norm": 2.633794069290161, | |
| "learning_rate": 0.00015141407035175879, | |
| "loss": 1.1287, | |
| "step": 49800 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 1.8146005868911743, | |
| "learning_rate": 0.00015111256281407032, | |
| "loss": 1.1747, | |
| "step": 49900 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 6.4758405685424805, | |
| "learning_rate": 0.0001508110552763819, | |
| "loss": 1.1548, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "eval_loss": 1.1896699666976929, | |
| "eval_runtime": 43.2315, | |
| "eval_samples_per_second": 23.131, | |
| "eval_steps_per_second": 2.891, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 2.5, | |
| "grad_norm": 1.5688796043395996, | |
| "learning_rate": 0.00015050954773869346, | |
| "loss": 1.168, | |
| "step": 50100 | |
| }, | |
| { | |
| "epoch": 2.51, | |
| "grad_norm": 1.4024161100387573, | |
| "learning_rate": 0.000150208040201005, | |
| "loss": 1.1796, | |
| "step": 50200 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": 2.066570997238159, | |
| "learning_rate": 0.00014990653266331658, | |
| "loss": 1.1419, | |
| "step": 50300 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": 3.7978389263153076, | |
| "learning_rate": 0.00014960502512562812, | |
| "loss": 1.1497, | |
| "step": 50400 | |
| }, | |
| { | |
| "epoch": 2.52, | |
| "grad_norm": 2.2129733562469482, | |
| "learning_rate": 0.0001493035175879397, | |
| "loss": 1.1371, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 2.53, | |
| "grad_norm": 3.0140724182128906, | |
| "learning_rate": 0.00014900201005025123, | |
| "loss": 1.1778, | |
| "step": 50600 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 2.457521915435791, | |
| "learning_rate": 0.00014870050251256282, | |
| "loss": 1.1266, | |
| "step": 50700 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 2.1066813468933105, | |
| "learning_rate": 0.00014839899497487435, | |
| "loss": 1.1635, | |
| "step": 50800 | |
| }, | |
| { | |
| "epoch": 2.54, | |
| "grad_norm": 2.801196336746216, | |
| "learning_rate": 0.0001480974874371859, | |
| "loss": 1.1842, | |
| "step": 50900 | |
| }, | |
| { | |
| "epoch": 2.55, | |
| "grad_norm": 4.693379878997803, | |
| "learning_rate": 0.00014779597989949747, | |
| "loss": 1.1449, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 2.55, | |
| "eval_loss": 1.1495003700256348, | |
| "eval_runtime": 37.9097, | |
| "eval_samples_per_second": 26.378, | |
| "eval_steps_per_second": 3.297, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 1.917925477027893, | |
| "learning_rate": 0.00014749447236180903, | |
| "loss": 1.1303, | |
| "step": 51100 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 2.6460864543914795, | |
| "learning_rate": 0.0001471929648241206, | |
| "loss": 1.1638, | |
| "step": 51200 | |
| }, | |
| { | |
| "epoch": 2.56, | |
| "grad_norm": 2.5040736198425293, | |
| "learning_rate": 0.00014689145728643215, | |
| "loss": 1.1382, | |
| "step": 51300 | |
| }, | |
| { | |
| "epoch": 2.57, | |
| "grad_norm": 2.7533071041107178, | |
| "learning_rate": 0.0001465899497487437, | |
| "loss": 1.1803, | |
| "step": 51400 | |
| }, | |
| { | |
| "epoch": 2.58, | |
| "grad_norm": 2.220345973968506, | |
| "learning_rate": 0.00014629145728643214, | |
| "loss": 1.1506, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 2.58, | |
| "grad_norm": 1.3668216466903687, | |
| "learning_rate": 0.0001459899497487437, | |
| "loss": 1.1538, | |
| "step": 51600 | |
| }, | |
| { | |
| "epoch": 2.58, | |
| "grad_norm": 2.26232647895813, | |
| "learning_rate": 0.00014568844221105525, | |
| "loss": 1.2085, | |
| "step": 51700 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 5.508904933929443, | |
| "learning_rate": 0.00014538693467336681, | |
| "loss": 1.1528, | |
| "step": 51800 | |
| }, | |
| { | |
| "epoch": 2.59, | |
| "grad_norm": 2.9169905185699463, | |
| "learning_rate": 0.00014508542713567837, | |
| "loss": 1.1632, | |
| "step": 51900 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 2.5156240463256836, | |
| "learning_rate": 0.00014478391959798993, | |
| "loss": 1.1677, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "eval_loss": 1.174816370010376, | |
| "eval_runtime": 42.0784, | |
| "eval_samples_per_second": 23.765, | |
| "eval_steps_per_second": 2.971, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 2.6, | |
| "grad_norm": 1.622004747390747, | |
| "learning_rate": 0.0001444824120603015, | |
| "loss": 1.1174, | |
| "step": 52100 | |
| }, | |
| { | |
| "epoch": 2.61, | |
| "grad_norm": 2.5255143642425537, | |
| "learning_rate": 0.00014418090452261305, | |
| "loss": 1.1415, | |
| "step": 52200 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 1.7780824899673462, | |
| "learning_rate": 0.0001438793969849246, | |
| "loss": 1.1871, | |
| "step": 52300 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 2.320028305053711, | |
| "learning_rate": 0.00014357788944723617, | |
| "loss": 1.1841, | |
| "step": 52400 | |
| }, | |
| { | |
| "epoch": 2.62, | |
| "grad_norm": 2.6219685077667236, | |
| "learning_rate": 0.00014327638190954773, | |
| "loss": 1.1349, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 2.63, | |
| "grad_norm": 3.0288233757019043, | |
| "learning_rate": 0.0001429748743718593, | |
| "loss": 1.1753, | |
| "step": 52600 | |
| }, | |
| { | |
| "epoch": 2.63, | |
| "grad_norm": 2.3062517642974854, | |
| "learning_rate": 0.00014267336683417085, | |
| "loss": 1.1836, | |
| "step": 52700 | |
| }, | |
| { | |
| "epoch": 2.64, | |
| "grad_norm": 1.8819166421890259, | |
| "learning_rate": 0.0001423718592964824, | |
| "loss": 1.1491, | |
| "step": 52800 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "grad_norm": 1.7771334648132324, | |
| "learning_rate": 0.00014207035175879397, | |
| "loss": 1.1311, | |
| "step": 52900 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "grad_norm": 1.9495539665222168, | |
| "learning_rate": 0.00014176884422110553, | |
| "loss": 1.1757, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 2.65, | |
| "eval_loss": 1.161841869354248, | |
| "eval_runtime": 41.6597, | |
| "eval_samples_per_second": 24.004, | |
| "eval_steps_per_second": 3.0, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 2.66, | |
| "grad_norm": 2.317021131515503, | |
| "learning_rate": 0.00014146733668341706, | |
| "loss": 1.145, | |
| "step": 53100 | |
| }, | |
| { | |
| "epoch": 2.66, | |
| "grad_norm": 1.4079538583755493, | |
| "learning_rate": 0.00014116582914572865, | |
| "loss": 1.0893, | |
| "step": 53200 | |
| }, | |
| { | |
| "epoch": 2.67, | |
| "grad_norm": 6.593141555786133, | |
| "learning_rate": 0.00014086432160804018, | |
| "loss": 1.1357, | |
| "step": 53300 | |
| }, | |
| { | |
| "epoch": 2.67, | |
| "grad_norm": 2.657529830932617, | |
| "learning_rate": 0.00014056281407035177, | |
| "loss": 1.1651, | |
| "step": 53400 | |
| }, | |
| { | |
| "epoch": 2.67, | |
| "grad_norm": 3.312056541442871, | |
| "learning_rate": 0.0001402613065326633, | |
| "loss": 1.165, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 2.68, | |
| "grad_norm": 2.1961281299591064, | |
| "learning_rate": 0.00013995979899497486, | |
| "loss": 1.1584, | |
| "step": 53600 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 1.933409571647644, | |
| "learning_rate": 0.00013965829145728642, | |
| "loss": 1.1382, | |
| "step": 53700 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 2.6763832569122314, | |
| "learning_rate": 0.00013935678391959798, | |
| "loss": 1.1238, | |
| "step": 53800 | |
| }, | |
| { | |
| "epoch": 2.69, | |
| "grad_norm": 3.3957033157348633, | |
| "learning_rate": 0.00013905527638190954, | |
| "loss": 1.154, | |
| "step": 53900 | |
| }, | |
| { | |
| "epoch": 2.7, | |
| "grad_norm": 3.526700019836426, | |
| "learning_rate": 0.0001387537688442211, | |
| "loss": 1.1325, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 2.7, | |
| "eval_loss": 1.141178011894226, | |
| "eval_runtime": 37.9667, | |
| "eval_samples_per_second": 26.339, | |
| "eval_steps_per_second": 3.292, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "grad_norm": 3.3937137126922607, | |
| "learning_rate": 0.00013845226130653265, | |
| "loss": 1.141, | |
| "step": 54100 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "grad_norm": 1.9187488555908203, | |
| "learning_rate": 0.00013815075376884421, | |
| "loss": 1.1253, | |
| "step": 54200 | |
| }, | |
| { | |
| "epoch": 2.71, | |
| "grad_norm": 2.2351136207580566, | |
| "learning_rate": 0.00013784924623115577, | |
| "loss": 1.2008, | |
| "step": 54300 | |
| }, | |
| { | |
| "epoch": 2.72, | |
| "grad_norm": 3.97955584526062, | |
| "learning_rate": 0.0001375477386934673, | |
| "loss": 1.1609, | |
| "step": 54400 | |
| }, | |
| { | |
| "epoch": 2.73, | |
| "grad_norm": 3.5734050273895264, | |
| "learning_rate": 0.0001372462311557789, | |
| "loss": 1.1584, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 2.73, | |
| "grad_norm": 2.3804807662963867, | |
| "learning_rate": 0.00013694472361809042, | |
| "loss": 1.1343, | |
| "step": 54600 | |
| }, | |
| { | |
| "epoch": 2.73, | |
| "grad_norm": 2.0606038570404053, | |
| "learning_rate": 0.000136643216080402, | |
| "loss": 1.1555, | |
| "step": 54700 | |
| }, | |
| { | |
| "epoch": 2.74, | |
| "grad_norm": 4.046571731567383, | |
| "learning_rate": 0.00013634170854271354, | |
| "loss": 1.1543, | |
| "step": 54800 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 2.470393180847168, | |
| "learning_rate": 0.00013604020100502513, | |
| "loss": 1.1651, | |
| "step": 54900 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 1.4677540063858032, | |
| "learning_rate": 0.00013573869346733666, | |
| "loss": 1.1366, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "eval_loss": 1.1223907470703125, | |
| "eval_runtime": 43.6458, | |
| "eval_samples_per_second": 22.912, | |
| "eval_steps_per_second": 2.864, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 2.75, | |
| "grad_norm": 2.5567593574523926, | |
| "learning_rate": 0.00013543718592964822, | |
| "loss": 1.1348, | |
| "step": 55100 | |
| }, | |
| { | |
| "epoch": 2.76, | |
| "grad_norm": 4.812506675720215, | |
| "learning_rate": 0.00013513567839195978, | |
| "loss": 1.1675, | |
| "step": 55200 | |
| }, | |
| { | |
| "epoch": 2.77, | |
| "grad_norm": 2.5467748641967773, | |
| "learning_rate": 0.00013483417085427134, | |
| "loss": 1.1238, | |
| "step": 55300 | |
| }, | |
| { | |
| "epoch": 2.77, | |
| "grad_norm": 4.469081878662109, | |
| "learning_rate": 0.0001345326633165829, | |
| "loss": 1.102, | |
| "step": 55400 | |
| }, | |
| { | |
| "epoch": 2.77, | |
| "grad_norm": 3.878526449203491, | |
| "learning_rate": 0.00013423115577889446, | |
| "loss": 1.131, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 2.78, | |
| "grad_norm": 2.0142953395843506, | |
| "learning_rate": 0.00013392964824120602, | |
| "loss": 1.1349, | |
| "step": 55600 | |
| }, | |
| { | |
| "epoch": 2.79, | |
| "grad_norm": 2.600478410720825, | |
| "learning_rate": 0.00013362814070351758, | |
| "loss": 1.1363, | |
| "step": 55700 | |
| }, | |
| { | |
| "epoch": 2.79, | |
| "grad_norm": 2.58322811126709, | |
| "learning_rate": 0.00013332663316582914, | |
| "loss": 1.1426, | |
| "step": 55800 | |
| }, | |
| { | |
| "epoch": 2.79, | |
| "grad_norm": 2.2471609115600586, | |
| "learning_rate": 0.0001330251256281407, | |
| "loss": 1.1446, | |
| "step": 55900 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 1.8442782163619995, | |
| "learning_rate": 0.00013272361809045226, | |
| "loss": 1.1315, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "eval_loss": 1.1661006212234497, | |
| "eval_runtime": 47.0159, | |
| "eval_samples_per_second": 21.269, | |
| "eval_steps_per_second": 2.659, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "grad_norm": 2.2928128242492676, | |
| "learning_rate": 0.0001324221105527638, | |
| "loss": 1.122, | |
| "step": 56100 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "grad_norm": 2.192915201187134, | |
| "learning_rate": 0.00013212361809045224, | |
| "loss": 1.1251, | |
| "step": 56200 | |
| }, | |
| { | |
| "epoch": 2.81, | |
| "grad_norm": 2.334547519683838, | |
| "learning_rate": 0.00013182211055276383, | |
| "loss": 1.1408, | |
| "step": 56300 | |
| }, | |
| { | |
| "epoch": 2.82, | |
| "grad_norm": 1.832930088043213, | |
| "learning_rate": 0.00013152060301507536, | |
| "loss": 1.1146, | |
| "step": 56400 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 4.524071216583252, | |
| "learning_rate": 0.00013121909547738692, | |
| "loss": 1.1661, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 1.4990063905715942, | |
| "learning_rate": 0.00013091758793969848, | |
| "loss": 1.1247, | |
| "step": 56600 | |
| }, | |
| { | |
| "epoch": 2.83, | |
| "grad_norm": 3.572678804397583, | |
| "learning_rate": 0.00013061608040201004, | |
| "loss": 1.1251, | |
| "step": 56700 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "grad_norm": 2.0090138912200928, | |
| "learning_rate": 0.0001303145728643216, | |
| "loss": 1.1267, | |
| "step": 56800 | |
| }, | |
| { | |
| "epoch": 2.84, | |
| "grad_norm": 2.0328962802886963, | |
| "learning_rate": 0.00013001306532663316, | |
| "loss": 1.1343, | |
| "step": 56900 | |
| }, | |
| { | |
| "epoch": 2.85, | |
| "grad_norm": 1.5744613409042358, | |
| "learning_rate": 0.00012971155778894472, | |
| "loss": 1.1208, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 2.85, | |
| "eval_loss": 1.1388169527053833, | |
| "eval_runtime": 65.7696, | |
| "eval_samples_per_second": 15.205, | |
| "eval_steps_per_second": 1.901, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 2.85, | |
| "grad_norm": 1.2835485935211182, | |
| "learning_rate": 0.00012941005025125628, | |
| "loss": 1.1561, | |
| "step": 57100 | |
| }, | |
| { | |
| "epoch": 2.86, | |
| "grad_norm": 3.413334846496582, | |
| "learning_rate": 0.00012910854271356784, | |
| "loss": 1.126, | |
| "step": 57200 | |
| }, | |
| { | |
| "epoch": 2.87, | |
| "grad_norm": 2.6612489223480225, | |
| "learning_rate": 0.00012880703517587937, | |
| "loss": 1.1705, | |
| "step": 57300 | |
| }, | |
| { | |
| "epoch": 2.87, | |
| "grad_norm": 2.0389411449432373, | |
| "learning_rate": 0.00012850552763819096, | |
| "loss": 1.1322, | |
| "step": 57400 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 2.203789710998535, | |
| "learning_rate": 0.0001282040201005025, | |
| "loss": 1.1437, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 5.272101879119873, | |
| "learning_rate": 0.00012790251256281407, | |
| "loss": 1.1333, | |
| "step": 57600 | |
| }, | |
| { | |
| "epoch": 2.88, | |
| "grad_norm": 3.0776541233062744, | |
| "learning_rate": 0.0001276010050251256, | |
| "loss": 1.1235, | |
| "step": 57700 | |
| }, | |
| { | |
| "epoch": 2.89, | |
| "grad_norm": 3.8333828449249268, | |
| "learning_rate": 0.0001272994974874372, | |
| "loss": 1.1141, | |
| "step": 57800 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "grad_norm": 3.3916189670562744, | |
| "learning_rate": 0.00012699798994974873, | |
| "loss": 1.1084, | |
| "step": 57900 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "grad_norm": 1.6035398244857788, | |
| "learning_rate": 0.00012669648241206029, | |
| "loss": 1.1057, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 2.9, | |
| "eval_loss": 1.1182321310043335, | |
| "eval_runtime": 60.567, | |
| "eval_samples_per_second": 16.511, | |
| "eval_steps_per_second": 2.064, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 2.91, | |
| "grad_norm": 2.41086745262146, | |
| "learning_rate": 0.00012639497487437184, | |
| "loss": 1.1424, | |
| "step": 58100 | |
| }, | |
| { | |
| "epoch": 2.91, | |
| "grad_norm": 1.8278477191925049, | |
| "learning_rate": 0.0001260934673366834, | |
| "loss": 1.1126, | |
| "step": 58200 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "grad_norm": 2.7294256687164307, | |
| "learning_rate": 0.00012579195979899496, | |
| "loss": 1.1207, | |
| "step": 58300 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "grad_norm": 2.813084602355957, | |
| "learning_rate": 0.00012549045226130652, | |
| "loss": 1.1498, | |
| "step": 58400 | |
| }, | |
| { | |
| "epoch": 2.92, | |
| "grad_norm": 2.6869473457336426, | |
| "learning_rate": 0.00012519195979899495, | |
| "loss": 1.1198, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 2.93, | |
| "grad_norm": 1.8101871013641357, | |
| "learning_rate": 0.00012489045226130654, | |
| "loss": 1.1725, | |
| "step": 58600 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": 4.7469305992126465, | |
| "learning_rate": 0.00012458894472361807, | |
| "loss": 1.1382, | |
| "step": 58700 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": 1.8046541213989258, | |
| "learning_rate": 0.00012428743718592965, | |
| "loss": 1.082, | |
| "step": 58800 | |
| }, | |
| { | |
| "epoch": 2.94, | |
| "grad_norm": 2.176015615463257, | |
| "learning_rate": 0.0001239859296482412, | |
| "loss": 1.1304, | |
| "step": 58900 | |
| }, | |
| { | |
| "epoch": 2.95, | |
| "grad_norm": 1.8910236358642578, | |
| "learning_rate": 0.00012368442211055277, | |
| "loss": 1.1638, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 2.95, | |
| "eval_loss": 1.1192156076431274, | |
| "eval_runtime": 41.8257, | |
| "eval_samples_per_second": 23.909, | |
| "eval_steps_per_second": 2.989, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 2.288358211517334, | |
| "learning_rate": 0.0001233829145728643, | |
| "loss": 1.1203, | |
| "step": 59100 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 1.9389914274215698, | |
| "learning_rate": 0.00012308140703517586, | |
| "loss": 1.0892, | |
| "step": 59200 | |
| }, | |
| { | |
| "epoch": 2.96, | |
| "grad_norm": 2.1551334857940674, | |
| "learning_rate": 0.00012277989949748742, | |
| "loss": 1.1046, | |
| "step": 59300 | |
| }, | |
| { | |
| "epoch": 2.97, | |
| "grad_norm": 1.5200018882751465, | |
| "learning_rate": 0.00012247839195979898, | |
| "loss": 1.1373, | |
| "step": 59400 | |
| }, | |
| { | |
| "epoch": 2.98, | |
| "grad_norm": 2.45053768157959, | |
| "learning_rate": 0.00012217688442211054, | |
| "loss": 1.1403, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 2.98, | |
| "grad_norm": 2.767160177230835, | |
| "learning_rate": 0.00012187537688442209, | |
| "loss": 1.0693, | |
| "step": 59600 | |
| }, | |
| { | |
| "epoch": 2.98, | |
| "grad_norm": 2.3581674098968506, | |
| "learning_rate": 0.00012157386934673366, | |
| "loss": 1.125, | |
| "step": 59700 | |
| }, | |
| { | |
| "epoch": 2.99, | |
| "grad_norm": 1.4579651355743408, | |
| "learning_rate": 0.00012127236180904521, | |
| "loss": 1.127, | |
| "step": 59800 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 4.08085298538208, | |
| "learning_rate": 0.00012097085427135678, | |
| "loss": 1.1539, | |
| "step": 59900 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 1.5620448589324951, | |
| "learning_rate": 0.00012066934673366833, | |
| "loss": 1.1372, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 1.130272626876831, | |
| "eval_runtime": 37.9665, | |
| "eval_samples_per_second": 26.339, | |
| "eval_steps_per_second": 3.292, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "grad_norm": 3.270860433578491, | |
| "learning_rate": 0.00012036783919597989, | |
| "loss": 1.0761, | |
| "step": 60100 | |
| }, | |
| { | |
| "epoch": 3.01, | |
| "grad_norm": 2.5301287174224854, | |
| "learning_rate": 0.00012006633165829145, | |
| "loss": 1.0881, | |
| "step": 60200 | |
| }, | |
| { | |
| "epoch": 3.02, | |
| "grad_norm": 2.5292015075683594, | |
| "learning_rate": 0.000119764824120603, | |
| "loss": 1.046, | |
| "step": 60300 | |
| }, | |
| { | |
| "epoch": 3.02, | |
| "grad_norm": 2.8234751224517822, | |
| "learning_rate": 0.00011946331658291456, | |
| "loss": 1.0802, | |
| "step": 60400 | |
| }, | |
| { | |
| "epoch": 3.02, | |
| "grad_norm": 2.536975860595703, | |
| "learning_rate": 0.00011916180904522612, | |
| "loss": 1.0993, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 3.03, | |
| "grad_norm": 3.510464906692505, | |
| "learning_rate": 0.00011886030150753767, | |
| "loss": 1.1108, | |
| "step": 60600 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "grad_norm": 1.9273101091384888, | |
| "learning_rate": 0.00011855879396984924, | |
| "loss": 1.1081, | |
| "step": 60700 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "grad_norm": 2.1979687213897705, | |
| "learning_rate": 0.00011825728643216079, | |
| "loss": 1.1059, | |
| "step": 60800 | |
| }, | |
| { | |
| "epoch": 3.04, | |
| "grad_norm": 2.097529172897339, | |
| "learning_rate": 0.00011795577889447236, | |
| "loss": 1.1098, | |
| "step": 60900 | |
| }, | |
| { | |
| "epoch": 3.05, | |
| "grad_norm": 2.970689296722412, | |
| "learning_rate": 0.00011765427135678391, | |
| "loss": 1.0915, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 3.05, | |
| "eval_loss": 1.0778993368148804, | |
| "eval_runtime": 37.9552, | |
| "eval_samples_per_second": 26.347, | |
| "eval_steps_per_second": 3.293, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 3.06, | |
| "grad_norm": 2.3489325046539307, | |
| "learning_rate": 0.00011735577889447236, | |
| "loss": 1.1174, | |
| "step": 61100 | |
| }, | |
| { | |
| "epoch": 3.06, | |
| "grad_norm": 2.9280216693878174, | |
| "learning_rate": 0.00011705427135678391, | |
| "loss": 1.0977, | |
| "step": 61200 | |
| }, | |
| { | |
| "epoch": 3.06, | |
| "grad_norm": 2.231684446334839, | |
| "learning_rate": 0.00011675276381909548, | |
| "loss": 1.1004, | |
| "step": 61300 | |
| }, | |
| { | |
| "epoch": 3.07, | |
| "grad_norm": 1.8373113870620728, | |
| "learning_rate": 0.00011645125628140703, | |
| "loss": 1.109, | |
| "step": 61400 | |
| }, | |
| { | |
| "epoch": 3.08, | |
| "grad_norm": 3.446971893310547, | |
| "learning_rate": 0.00011614974874371859, | |
| "loss": 1.092, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 3.08, | |
| "grad_norm": 2.2681097984313965, | |
| "learning_rate": 0.00011584824120603014, | |
| "loss": 1.0901, | |
| "step": 61600 | |
| }, | |
| { | |
| "epoch": 3.08, | |
| "grad_norm": 2.173755407333374, | |
| "learning_rate": 0.0001155467336683417, | |
| "loss": 1.0638, | |
| "step": 61700 | |
| }, | |
| { | |
| "epoch": 3.09, | |
| "grad_norm": 3.3374030590057373, | |
| "learning_rate": 0.00011524522613065325, | |
| "loss": 1.1036, | |
| "step": 61800 | |
| }, | |
| { | |
| "epoch": 3.1, | |
| "grad_norm": 2.082169771194458, | |
| "learning_rate": 0.00011494371859296481, | |
| "loss": 1.0737, | |
| "step": 61900 | |
| }, | |
| { | |
| "epoch": 3.1, | |
| "grad_norm": 2.741830587387085, | |
| "learning_rate": 0.00011464221105527637, | |
| "loss": 1.0705, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 3.1, | |
| "eval_loss": 1.079288125038147, | |
| "eval_runtime": 37.9824, | |
| "eval_samples_per_second": 26.328, | |
| "eval_steps_per_second": 3.291, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 3.1, | |
| "grad_norm": 2.128262996673584, | |
| "learning_rate": 0.00011434070351758793, | |
| "loss": 1.0964, | |
| "step": 62100 | |
| }, | |
| { | |
| "epoch": 3.11, | |
| "grad_norm": 2.100025177001953, | |
| "learning_rate": 0.00011403919597989949, | |
| "loss": 1.0951, | |
| "step": 62200 | |
| }, | |
| { | |
| "epoch": 3.12, | |
| "grad_norm": 7.355963706970215, | |
| "learning_rate": 0.00011373768844221103, | |
| "loss": 1.128, | |
| "step": 62300 | |
| }, | |
| { | |
| "epoch": 3.12, | |
| "grad_norm": 2.6374123096466064, | |
| "learning_rate": 0.0001134361809045226, | |
| "loss": 1.0928, | |
| "step": 62400 | |
| }, | |
| { | |
| "epoch": 3.12, | |
| "grad_norm": 2.6389834880828857, | |
| "learning_rate": 0.00011313467336683415, | |
| "loss": 1.1067, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 3.13, | |
| "grad_norm": 3.367866277694702, | |
| "learning_rate": 0.00011283316582914573, | |
| "loss": 1.0719, | |
| "step": 62600 | |
| }, | |
| { | |
| "epoch": 3.13, | |
| "grad_norm": 2.0250422954559326, | |
| "learning_rate": 0.00011253165829145727, | |
| "loss": 1.0967, | |
| "step": 62700 | |
| }, | |
| { | |
| "epoch": 3.14, | |
| "grad_norm": 3.8763527870178223, | |
| "learning_rate": 0.00011223015075376884, | |
| "loss": 1.0819, | |
| "step": 62800 | |
| }, | |
| { | |
| "epoch": 3.15, | |
| "grad_norm": 2.7926995754241943, | |
| "learning_rate": 0.00011192864321608039, | |
| "loss": 1.1123, | |
| "step": 62900 | |
| }, | |
| { | |
| "epoch": 3.15, | |
| "grad_norm": 2.5031745433807373, | |
| "learning_rate": 0.00011162713567839195, | |
| "loss": 1.0725, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 3.15, | |
| "eval_loss": 1.117138147354126, | |
| "eval_runtime": 37.9757, | |
| "eval_samples_per_second": 26.333, | |
| "eval_steps_per_second": 3.292, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 3.15, | |
| "grad_norm": 2.086465835571289, | |
| "learning_rate": 0.00011132562814070351, | |
| "loss": 1.0588, | |
| "step": 63100 | |
| }, | |
| { | |
| "epoch": 3.16, | |
| "grad_norm": 3.295759439468384, | |
| "learning_rate": 0.00011102412060301507, | |
| "loss": 1.1175, | |
| "step": 63200 | |
| }, | |
| { | |
| "epoch": 3.17, | |
| "grad_norm": 2.666032075881958, | |
| "learning_rate": 0.00011072261306532661, | |
| "loss": 1.0963, | |
| "step": 63300 | |
| }, | |
| { | |
| "epoch": 3.17, | |
| "grad_norm": 1.8267697095870972, | |
| "learning_rate": 0.00011042110552763819, | |
| "loss": 1.0691, | |
| "step": 63400 | |
| }, | |
| { | |
| "epoch": 3.17, | |
| "grad_norm": 2.682745933532715, | |
| "learning_rate": 0.00011011959798994973, | |
| "loss": 1.0671, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 3.18, | |
| "grad_norm": 2.914111375808716, | |
| "learning_rate": 0.00010982110552763819, | |
| "loss": 1.0809, | |
| "step": 63600 | |
| }, | |
| { | |
| "epoch": 3.19, | |
| "grad_norm": 2.7258005142211914, | |
| "learning_rate": 0.00010951959798994973, | |
| "loss": 1.0527, | |
| "step": 63700 | |
| }, | |
| { | |
| "epoch": 3.19, | |
| "grad_norm": 2.646939992904663, | |
| "learning_rate": 0.0001092180904522613, | |
| "loss": 1.0523, | |
| "step": 63800 | |
| }, | |
| { | |
| "epoch": 3.19, | |
| "grad_norm": 2.107849359512329, | |
| "learning_rate": 0.00010891658291457285, | |
| "loss": 1.0629, | |
| "step": 63900 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "grad_norm": 1.9583218097686768, | |
| "learning_rate": 0.00010861507537688442, | |
| "loss": 1.065, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "eval_loss": 1.1121866703033447, | |
| "eval_runtime": 37.9368, | |
| "eval_samples_per_second": 26.36, | |
| "eval_steps_per_second": 3.295, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 3.21, | |
| "grad_norm": 2.384493589401245, | |
| "learning_rate": 0.00010831356783919597, | |
| "loss": 1.0664, | |
| "step": 64100 | |
| }, | |
| { | |
| "epoch": 3.21, | |
| "grad_norm": 2.060441732406616, | |
| "learning_rate": 0.00010801206030150753, | |
| "loss": 1.0762, | |
| "step": 64200 | |
| }, | |
| { | |
| "epoch": 3.21, | |
| "grad_norm": 6.751837253570557, | |
| "learning_rate": 0.00010771055276381909, | |
| "loss": 1.0553, | |
| "step": 64300 | |
| }, | |
| { | |
| "epoch": 3.22, | |
| "grad_norm": 2.9765820503234863, | |
| "learning_rate": 0.00010740904522613064, | |
| "loss": 1.0636, | |
| "step": 64400 | |
| }, | |
| { | |
| "epoch": 3.23, | |
| "grad_norm": 2.2694509029388428, | |
| "learning_rate": 0.00010710753768844221, | |
| "loss": 1.1031, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 3.23, | |
| "grad_norm": 3.272937536239624, | |
| "learning_rate": 0.00010680603015075375, | |
| "loss": 1.1053, | |
| "step": 64600 | |
| }, | |
| { | |
| "epoch": 3.23, | |
| "grad_norm": 3.242722988128662, | |
| "learning_rate": 0.00010650452261306531, | |
| "loss": 1.1013, | |
| "step": 64700 | |
| }, | |
| { | |
| "epoch": 3.24, | |
| "grad_norm": 2.7234878540039062, | |
| "learning_rate": 0.00010620301507537687, | |
| "loss": 1.0428, | |
| "step": 64800 | |
| }, | |
| { | |
| "epoch": 3.25, | |
| "grad_norm": 2.30928373336792, | |
| "learning_rate": 0.00010590150753768843, | |
| "loss": 1.067, | |
| "step": 64900 | |
| }, | |
| { | |
| "epoch": 3.25, | |
| "grad_norm": 4.809457302093506, | |
| "learning_rate": 0.00010559999999999998, | |
| "loss": 1.053, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 3.25, | |
| "eval_loss": 1.1082242727279663, | |
| "eval_runtime": 37.9286, | |
| "eval_samples_per_second": 26.365, | |
| "eval_steps_per_second": 3.296, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 3.25, | |
| "grad_norm": 2.282684087753296, | |
| "learning_rate": 0.00010529849246231155, | |
| "loss": 1.0547, | |
| "step": 65100 | |
| }, | |
| { | |
| "epoch": 3.26, | |
| "grad_norm": 3.756114959716797, | |
| "learning_rate": 0.0001049969849246231, | |
| "loss": 1.0435, | |
| "step": 65200 | |
| }, | |
| { | |
| "epoch": 3.27, | |
| "grad_norm": 3.709932565689087, | |
| "learning_rate": 0.00010469547738693467, | |
| "loss": 1.0678, | |
| "step": 65300 | |
| }, | |
| { | |
| "epoch": 3.27, | |
| "grad_norm": 1.6080820560455322, | |
| "learning_rate": 0.00010439396984924622, | |
| "loss": 1.101, | |
| "step": 65400 | |
| }, | |
| { | |
| "epoch": 3.27, | |
| "grad_norm": 2.2617008686065674, | |
| "learning_rate": 0.00010409246231155779, | |
| "loss": 1.0729, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 3.28, | |
| "grad_norm": 3.1394824981689453, | |
| "learning_rate": 0.00010379095477386933, | |
| "loss": 1.0861, | |
| "step": 65600 | |
| }, | |
| { | |
| "epoch": 3.29, | |
| "grad_norm": 2.8208096027374268, | |
| "learning_rate": 0.0001034894472361809, | |
| "loss": 1.0535, | |
| "step": 65700 | |
| }, | |
| { | |
| "epoch": 3.29, | |
| "grad_norm": 2.7133829593658447, | |
| "learning_rate": 0.00010318793969849245, | |
| "loss": 1.0498, | |
| "step": 65800 | |
| }, | |
| { | |
| "epoch": 3.29, | |
| "grad_norm": 2.2674591541290283, | |
| "learning_rate": 0.00010288643216080401, | |
| "loss": 1.0861, | |
| "step": 65900 | |
| }, | |
| { | |
| "epoch": 3.3, | |
| "grad_norm": 2.238206386566162, | |
| "learning_rate": 0.00010258492462311557, | |
| "loss": 1.0557, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 3.3, | |
| "eval_loss": 1.0877478122711182, | |
| "eval_runtime": 37.9734, | |
| "eval_samples_per_second": 26.334, | |
| "eval_steps_per_second": 3.292, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 3.31, | |
| "grad_norm": 1.8776639699935913, | |
| "learning_rate": 0.00010228643216080401, | |
| "loss": 1.0898, | |
| "step": 66100 | |
| }, | |
| { | |
| "epoch": 3.31, | |
| "grad_norm": 2.540071725845337, | |
| "learning_rate": 0.00010198492462311557, | |
| "loss": 1.0437, | |
| "step": 66200 | |
| }, | |
| { | |
| "epoch": 3.31, | |
| "grad_norm": 3.616443157196045, | |
| "learning_rate": 0.00010168341708542713, | |
| "loss": 1.0698, | |
| "step": 66300 | |
| }, | |
| { | |
| "epoch": 3.32, | |
| "grad_norm": 2.866360902786255, | |
| "learning_rate": 0.00010138190954773868, | |
| "loss": 1.0666, | |
| "step": 66400 | |
| }, | |
| { | |
| "epoch": 3.33, | |
| "grad_norm": 3.1752941608428955, | |
| "learning_rate": 0.00010108040201005025, | |
| "loss": 1.0723, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 3.33, | |
| "grad_norm": 4.475529193878174, | |
| "learning_rate": 0.0001007788944723618, | |
| "loss": 1.105, | |
| "step": 66600 | |
| }, | |
| { | |
| "epoch": 3.33, | |
| "grad_norm": 2.9230782985687256, | |
| "learning_rate": 0.00010047738693467337, | |
| "loss": 1.0674, | |
| "step": 66700 | |
| }, | |
| { | |
| "epoch": 3.34, | |
| "grad_norm": 4.472579479217529, | |
| "learning_rate": 0.00010017587939698491, | |
| "loss": 1.0798, | |
| "step": 66800 | |
| }, | |
| { | |
| "epoch": 3.34, | |
| "grad_norm": 2.9080252647399902, | |
| "learning_rate": 9.987437185929649e-05, | |
| "loss": 1.0789, | |
| "step": 66900 | |
| }, | |
| { | |
| "epoch": 3.35, | |
| "grad_norm": 2.728170394897461, | |
| "learning_rate": 9.957286432160803e-05, | |
| "loss": 1.0771, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 3.35, | |
| "eval_loss": 1.0558359622955322, | |
| "eval_runtime": 37.9887, | |
| "eval_samples_per_second": 26.324, | |
| "eval_steps_per_second": 3.29, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 3.35, | |
| "grad_norm": 2.227384328842163, | |
| "learning_rate": 9.927135678391958e-05, | |
| "loss": 1.0336, | |
| "step": 67100 | |
| }, | |
| { | |
| "epoch": 3.36, | |
| "grad_norm": 2.5888235569000244, | |
| "learning_rate": 9.896984924623115e-05, | |
| "loss": 1.0525, | |
| "step": 67200 | |
| }, | |
| { | |
| "epoch": 3.37, | |
| "grad_norm": 1.9375131130218506, | |
| "learning_rate": 9.86683417085427e-05, | |
| "loss": 1.1218, | |
| "step": 67300 | |
| }, | |
| { | |
| "epoch": 3.37, | |
| "grad_norm": 1.8543367385864258, | |
| "learning_rate": 9.836683417085426e-05, | |
| "loss": 1.0761, | |
| "step": 67400 | |
| }, | |
| { | |
| "epoch": 3.38, | |
| "grad_norm": 3.050717353820801, | |
| "learning_rate": 9.806532663316582e-05, | |
| "loss": 1.07, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 3.38, | |
| "grad_norm": 3.321708917617798, | |
| "learning_rate": 9.776381909547738e-05, | |
| "loss": 1.0606, | |
| "step": 67600 | |
| }, | |
| { | |
| "epoch": 3.38, | |
| "grad_norm": 2.958376407623291, | |
| "learning_rate": 9.746231155778894e-05, | |
| "loss": 1.0608, | |
| "step": 67700 | |
| }, | |
| { | |
| "epoch": 3.39, | |
| "grad_norm": 2.215822219848633, | |
| "learning_rate": 9.71608040201005e-05, | |
| "loss": 1.0605, | |
| "step": 67800 | |
| }, | |
| { | |
| "epoch": 3.4, | |
| "grad_norm": 2.430649518966675, | |
| "learning_rate": 9.685929648241204e-05, | |
| "loss": 1.0783, | |
| "step": 67900 | |
| }, | |
| { | |
| "epoch": 3.4, | |
| "grad_norm": 2.4160895347595215, | |
| "learning_rate": 9.655778894472361e-05, | |
| "loss": 1.0783, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 3.4, | |
| "eval_loss": 1.1083147525787354, | |
| "eval_runtime": 37.9578, | |
| "eval_samples_per_second": 26.345, | |
| "eval_steps_per_second": 3.293, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 3.41, | |
| "grad_norm": 3.5485310554504395, | |
| "learning_rate": 9.625628140703516e-05, | |
| "loss": 1.0299, | |
| "step": 68100 | |
| }, | |
| { | |
| "epoch": 3.41, | |
| "grad_norm": 2.0450522899627686, | |
| "learning_rate": 9.595477386934673e-05, | |
| "loss": 1.0662, | |
| "step": 68200 | |
| }, | |
| { | |
| "epoch": 3.42, | |
| "grad_norm": 2.339768171310425, | |
| "learning_rate": 9.565326633165828e-05, | |
| "loss": 1.0781, | |
| "step": 68300 | |
| }, | |
| { | |
| "epoch": 3.42, | |
| "grad_norm": 2.055027484893799, | |
| "learning_rate": 9.535477386934673e-05, | |
| "loss": 1.0586, | |
| "step": 68400 | |
| }, | |
| { | |
| "epoch": 3.42, | |
| "grad_norm": 3.186723232269287, | |
| "learning_rate": 9.505326633165828e-05, | |
| "loss": 1.071, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 3.43, | |
| "grad_norm": 2.934070587158203, | |
| "learning_rate": 9.475175879396985e-05, | |
| "loss": 1.0474, | |
| "step": 68600 | |
| }, | |
| { | |
| "epoch": 3.44, | |
| "grad_norm": 4.080368995666504, | |
| "learning_rate": 9.44502512562814e-05, | |
| "loss": 1.0376, | |
| "step": 68700 | |
| }, | |
| { | |
| "epoch": 3.44, | |
| "grad_norm": 9.1796236038208, | |
| "learning_rate": 9.415175879396985e-05, | |
| "loss": 1.0362, | |
| "step": 68800 | |
| }, | |
| { | |
| "epoch": 3.44, | |
| "grad_norm": 2.9005532264709473, | |
| "learning_rate": 9.38502512562814e-05, | |
| "loss": 1.0581, | |
| "step": 68900 | |
| }, | |
| { | |
| "epoch": 3.45, | |
| "grad_norm": 2.2525532245635986, | |
| "learning_rate": 9.354874371859296e-05, | |
| "loss": 1.0664, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 3.45, | |
| "eval_loss": 1.0977917909622192, | |
| "eval_runtime": 37.9301, | |
| "eval_samples_per_second": 26.364, | |
| "eval_steps_per_second": 3.296, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 3.46, | |
| "grad_norm": 4.754021644592285, | |
| "learning_rate": 9.324723618090452e-05, | |
| "loss": 1.0512, | |
| "step": 69100 | |
| }, | |
| { | |
| "epoch": 3.46, | |
| "grad_norm": 2.1440653800964355, | |
| "learning_rate": 9.294572864321607e-05, | |
| "loss": 1.0653, | |
| "step": 69200 | |
| }, | |
| { | |
| "epoch": 3.46, | |
| "grad_norm": 2.278679609298706, | |
| "learning_rate": 9.264422110552762e-05, | |
| "loss": 1.0466, | |
| "step": 69300 | |
| }, | |
| { | |
| "epoch": 3.47, | |
| "grad_norm": 2.176259994506836, | |
| "learning_rate": 9.23427135678392e-05, | |
| "loss": 1.0664, | |
| "step": 69400 | |
| }, | |
| { | |
| "epoch": 3.48, | |
| "grad_norm": 2.2514779567718506, | |
| "learning_rate": 9.204120603015074e-05, | |
| "loss": 1.0597, | |
| "step": 69500 | |
| }, | |
| { | |
| "epoch": 3.48, | |
| "grad_norm": 3.136343002319336, | |
| "learning_rate": 9.173969849246231e-05, | |
| "loss": 1.0742, | |
| "step": 69600 | |
| }, | |
| { | |
| "epoch": 3.48, | |
| "grad_norm": 1.6031814813613892, | |
| "learning_rate": 9.143819095477386e-05, | |
| "loss": 1.0435, | |
| "step": 69700 | |
| }, | |
| { | |
| "epoch": 3.49, | |
| "grad_norm": 5.727216720581055, | |
| "learning_rate": 9.113668341708543e-05, | |
| "loss": 1.0837, | |
| "step": 69800 | |
| }, | |
| { | |
| "epoch": 3.5, | |
| "grad_norm": 2.909613609313965, | |
| "learning_rate": 9.083517587939698e-05, | |
| "loss": 1.0292, | |
| "step": 69900 | |
| }, | |
| { | |
| "epoch": 3.5, | |
| "grad_norm": 2.8508193492889404, | |
| "learning_rate": 9.053366834170854e-05, | |
| "loss": 1.0643, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 3.5, | |
| "eval_loss": 1.0314569473266602, | |
| "eval_runtime": 45.3565, | |
| "eval_samples_per_second": 22.048, | |
| "eval_steps_per_second": 2.756, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 3.5, | |
| "grad_norm": 1.3868812322616577, | |
| "learning_rate": 9.02321608040201e-05, | |
| "loss": 1.0719, | |
| "step": 70100 | |
| }, | |
| { | |
| "epoch": 3.51, | |
| "grad_norm": 2.059966564178467, | |
| "learning_rate": 8.993065326633164e-05, | |
| "loss": 1.0496, | |
| "step": 70200 | |
| }, | |
| { | |
| "epoch": 3.52, | |
| "grad_norm": 2.371212959289551, | |
| "learning_rate": 8.962914572864322e-05, | |
| "loss": 1.0416, | |
| "step": 70300 | |
| }, | |
| { | |
| "epoch": 3.52, | |
| "grad_norm": 5.051455497741699, | |
| "learning_rate": 8.932763819095476e-05, | |
| "loss": 1.0817, | |
| "step": 70400 | |
| }, | |
| { | |
| "epoch": 3.52, | |
| "grad_norm": 2.4436607360839844, | |
| "learning_rate": 8.902613065326632e-05, | |
| "loss": 1.0434, | |
| "step": 70500 | |
| }, | |
| { | |
| "epoch": 3.53, | |
| "grad_norm": 2.097843885421753, | |
| "learning_rate": 8.872462311557788e-05, | |
| "loss": 1.06, | |
| "step": 70600 | |
| }, | |
| { | |
| "epoch": 3.54, | |
| "grad_norm": 3.9826953411102295, | |
| "learning_rate": 8.842311557788944e-05, | |
| "loss": 1.0921, | |
| "step": 70700 | |
| }, | |
| { | |
| "epoch": 3.54, | |
| "grad_norm": 3.572988748550415, | |
| "learning_rate": 8.812160804020099e-05, | |
| "loss": 1.0503, | |
| "step": 70800 | |
| }, | |
| { | |
| "epoch": 3.54, | |
| "grad_norm": 3.2607603073120117, | |
| "learning_rate": 8.782010050251256e-05, | |
| "loss": 1.0308, | |
| "step": 70900 | |
| }, | |
| { | |
| "epoch": 3.55, | |
| "grad_norm": 2.152568817138672, | |
| "learning_rate": 8.75185929648241e-05, | |
| "loss": 1.0508, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 3.55, | |
| "eval_loss": 1.035280704498291, | |
| "eval_runtime": 44.3432, | |
| "eval_samples_per_second": 22.551, | |
| "eval_steps_per_second": 2.819, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 3.56, | |
| "grad_norm": 1.5636742115020752, | |
| "learning_rate": 8.721708542713568e-05, | |
| "loss": 1.0177, | |
| "step": 71100 | |
| }, | |
| { | |
| "epoch": 3.56, | |
| "grad_norm": 1.9526029825210571, | |
| "learning_rate": 8.691557788944722e-05, | |
| "loss": 1.0516, | |
| "step": 71200 | |
| }, | |
| { | |
| "epoch": 3.56, | |
| "grad_norm": 2.2071800231933594, | |
| "learning_rate": 8.66140703517588e-05, | |
| "loss": 1.034, | |
| "step": 71300 | |
| }, | |
| { | |
| "epoch": 3.57, | |
| "grad_norm": 2.6768360137939453, | |
| "learning_rate": 8.631256281407034e-05, | |
| "loss": 1.0642, | |
| "step": 71400 | |
| }, | |
| { | |
| "epoch": 3.58, | |
| "grad_norm": 1.6602065563201904, | |
| "learning_rate": 8.60110552763819e-05, | |
| "loss": 1.0389, | |
| "step": 71500 | |
| }, | |
| { | |
| "epoch": 3.58, | |
| "grad_norm": 2.439145565032959, | |
| "learning_rate": 8.570954773869346e-05, | |
| "loss": 1.0536, | |
| "step": 71600 | |
| }, | |
| { | |
| "epoch": 3.58, | |
| "grad_norm": 6.254899978637695, | |
| "learning_rate": 8.54110552763819e-05, | |
| "loss": 1.0141, | |
| "step": 71700 | |
| }, | |
| { | |
| "epoch": 3.59, | |
| "grad_norm": 1.8221715688705444, | |
| "learning_rate": 8.510954773869346e-05, | |
| "loss": 1.044, | |
| "step": 71800 | |
| }, | |
| { | |
| "epoch": 3.59, | |
| "grad_norm": 4.5664849281311035, | |
| "learning_rate": 8.480804020100502e-05, | |
| "loss": 1.0665, | |
| "step": 71900 | |
| }, | |
| { | |
| "epoch": 3.6, | |
| "grad_norm": 2.4576423168182373, | |
| "learning_rate": 8.450653266331658e-05, | |
| "loss": 1.0615, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 3.6, | |
| "eval_loss": 1.031246542930603, | |
| "eval_runtime": 42.8264, | |
| "eval_samples_per_second": 23.35, | |
| "eval_steps_per_second": 2.919, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 3.6, | |
| "grad_norm": 2.763627290725708, | |
| "learning_rate": 8.420502512562814e-05, | |
| "loss": 1.0333, | |
| "step": 72100 | |
| }, | |
| { | |
| "epoch": 3.61, | |
| "grad_norm": 1.6231377124786377, | |
| "learning_rate": 8.390351758793968e-05, | |
| "loss": 1.0572, | |
| "step": 72200 | |
| }, | |
| { | |
| "epoch": 3.62, | |
| "grad_norm": 1.9768860340118408, | |
| "learning_rate": 8.360201005025126e-05, | |
| "loss": 1.0423, | |
| "step": 72300 | |
| }, | |
| { | |
| "epoch": 3.62, | |
| "grad_norm": 2.292513132095337, | |
| "learning_rate": 8.33005025125628e-05, | |
| "loss": 1.0655, | |
| "step": 72400 | |
| }, | |
| { | |
| "epoch": 3.62, | |
| "grad_norm": 2.1181390285491943, | |
| "learning_rate": 8.299899497487438e-05, | |
| "loss": 1.0216, | |
| "step": 72500 | |
| }, | |
| { | |
| "epoch": 3.63, | |
| "grad_norm": 2.3944106101989746, | |
| "learning_rate": 8.269748743718592e-05, | |
| "loss": 1.0585, | |
| "step": 72600 | |
| }, | |
| { | |
| "epoch": 3.63, | |
| "grad_norm": 1.5745407342910767, | |
| "learning_rate": 8.23959798994975e-05, | |
| "loss": 1.0629, | |
| "step": 72700 | |
| }, | |
| { | |
| "epoch": 3.64, | |
| "grad_norm": 2.130709648132324, | |
| "learning_rate": 8.209447236180904e-05, | |
| "loss": 1.0027, | |
| "step": 72800 | |
| }, | |
| { | |
| "epoch": 3.65, | |
| "grad_norm": 3.202035427093506, | |
| "learning_rate": 8.179296482412059e-05, | |
| "loss": 1.0385, | |
| "step": 72900 | |
| }, | |
| { | |
| "epoch": 3.65, | |
| "grad_norm": 2.009536027908325, | |
| "learning_rate": 8.149145728643216e-05, | |
| "loss": 1.0471, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 3.65, | |
| "eval_loss": 1.047244668006897, | |
| "eval_runtime": 38.0986, | |
| "eval_samples_per_second": 26.248, | |
| "eval_steps_per_second": 3.281, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 3.66, | |
| "grad_norm": 5.239896774291992, | |
| "learning_rate": 8.11899497487437e-05, | |
| "loss": 1.0527, | |
| "step": 73100 | |
| }, | |
| { | |
| "epoch": 3.66, | |
| "grad_norm": 3.438692808151245, | |
| "learning_rate": 8.088844221105527e-05, | |
| "loss": 1.0198, | |
| "step": 73200 | |
| }, | |
| { | |
| "epoch": 3.67, | |
| "grad_norm": 2.0132901668548584, | |
| "learning_rate": 8.058693467336682e-05, | |
| "loss": 0.989, | |
| "step": 73300 | |
| }, | |
| { | |
| "epoch": 3.67, | |
| "grad_norm": 2.9494431018829346, | |
| "learning_rate": 8.028542713567838e-05, | |
| "loss": 1.0329, | |
| "step": 73400 | |
| }, | |
| { | |
| "epoch": 3.67, | |
| "grad_norm": 2.8393380641937256, | |
| "learning_rate": 7.998391959798994e-05, | |
| "loss": 1.043, | |
| "step": 73500 | |
| }, | |
| { | |
| "epoch": 3.68, | |
| "grad_norm": 3.039391279220581, | |
| "learning_rate": 7.96824120603015e-05, | |
| "loss": 1.0035, | |
| "step": 73600 | |
| }, | |
| { | |
| "epoch": 3.69, | |
| "grad_norm": 3.696676731109619, | |
| "learning_rate": 7.938090452261305e-05, | |
| "loss": 1.0472, | |
| "step": 73700 | |
| }, | |
| { | |
| "epoch": 3.69, | |
| "grad_norm": 2.8557331562042236, | |
| "learning_rate": 7.907939698492462e-05, | |
| "loss": 1.0665, | |
| "step": 73800 | |
| }, | |
| { | |
| "epoch": 3.69, | |
| "grad_norm": 3.7987170219421387, | |
| "learning_rate": 7.877788944723617e-05, | |
| "loss": 1.0233, | |
| "step": 73900 | |
| }, | |
| { | |
| "epoch": 3.7, | |
| "grad_norm": 1.9759894609451294, | |
| "learning_rate": 7.847638190954774e-05, | |
| "loss": 1.0303, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 3.7, | |
| "eval_loss": 1.0124469995498657, | |
| "eval_runtime": 38.8346, | |
| "eval_samples_per_second": 25.75, | |
| "eval_steps_per_second": 3.219, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 3.71, | |
| "grad_norm": 1.9311368465423584, | |
| "learning_rate": 7.817487437185929e-05, | |
| "loss": 1.0479, | |
| "step": 74100 | |
| }, | |
| { | |
| "epoch": 3.71, | |
| "grad_norm": 4.948327541351318, | |
| "learning_rate": 7.787336683417086e-05, | |
| "loss": 1.0197, | |
| "step": 74200 | |
| }, | |
| { | |
| "epoch": 3.71, | |
| "grad_norm": 2.6867167949676514, | |
| "learning_rate": 7.75718592964824e-05, | |
| "loss": 1.0209, | |
| "step": 74300 | |
| }, | |
| { | |
| "epoch": 3.72, | |
| "grad_norm": 1.8292616605758667, | |
| "learning_rate": 7.727035175879396e-05, | |
| "loss": 1.0257, | |
| "step": 74400 | |
| }, | |
| { | |
| "epoch": 3.73, | |
| "grad_norm": 3.2925384044647217, | |
| "learning_rate": 7.696884422110552e-05, | |
| "loss": 1.0635, | |
| "step": 74500 | |
| }, | |
| { | |
| "epoch": 3.73, | |
| "grad_norm": 2.2040624618530273, | |
| "learning_rate": 7.666733668341708e-05, | |
| "loss": 1.0285, | |
| "step": 74600 | |
| }, | |
| { | |
| "epoch": 3.73, | |
| "grad_norm": 2.1025142669677734, | |
| "learning_rate": 7.636582914572863e-05, | |
| "loss": 1.05, | |
| "step": 74700 | |
| }, | |
| { | |
| "epoch": 3.74, | |
| "grad_norm": 2.409148693084717, | |
| "learning_rate": 7.60643216080402e-05, | |
| "loss": 1.0638, | |
| "step": 74800 | |
| }, | |
| { | |
| "epoch": 3.75, | |
| "grad_norm": 3.284660577774048, | |
| "learning_rate": 7.576281407035175e-05, | |
| "loss": 1.0203, | |
| "step": 74900 | |
| }, | |
| { | |
| "epoch": 3.75, | |
| "grad_norm": 2.3454208374023438, | |
| "learning_rate": 7.546130653266332e-05, | |
| "loss": 1.0425, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 3.75, | |
| "eval_loss": 1.0414044857025146, | |
| "eval_runtime": 38.2892, | |
| "eval_samples_per_second": 26.117, | |
| "eval_steps_per_second": 3.265, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 3.75, | |
| "grad_norm": 2.6853275299072266, | |
| "learning_rate": 7.515979899497487e-05, | |
| "loss": 0.9762, | |
| "step": 75100 | |
| }, | |
| { | |
| "epoch": 3.76, | |
| "grad_norm": 1.439287543296814, | |
| "learning_rate": 7.485829145728643e-05, | |
| "loss": 0.9955, | |
| "step": 75200 | |
| }, | |
| { | |
| "epoch": 3.77, | |
| "grad_norm": 2.0795187950134277, | |
| "learning_rate": 7.455678391959799e-05, | |
| "loss": 1.0148, | |
| "step": 75300 | |
| }, | |
| { | |
| "epoch": 3.77, | |
| "grad_norm": 2.318300247192383, | |
| "learning_rate": 7.425527638190955e-05, | |
| "loss": 1.0368, | |
| "step": 75400 | |
| }, | |
| { | |
| "epoch": 3.77, | |
| "grad_norm": 2.979464054107666, | |
| "learning_rate": 7.39537688442211e-05, | |
| "loss": 1.0233, | |
| "step": 75500 | |
| }, | |
| { | |
| "epoch": 3.78, | |
| "grad_norm": 2.384615421295166, | |
| "learning_rate": 7.365226130653266e-05, | |
| "loss": 1.0183, | |
| "step": 75600 | |
| }, | |
| { | |
| "epoch": 3.79, | |
| "grad_norm": 2.2947332859039307, | |
| "learning_rate": 7.335075376884421e-05, | |
| "loss": 1.046, | |
| "step": 75700 | |
| }, | |
| { | |
| "epoch": 3.79, | |
| "grad_norm": 2.707266330718994, | |
| "learning_rate": 7.304924623115577e-05, | |
| "loss": 1.0145, | |
| "step": 75800 | |
| }, | |
| { | |
| "epoch": 3.79, | |
| "grad_norm": 1.8125189542770386, | |
| "learning_rate": 7.275075376884422e-05, | |
| "loss": 1.0508, | |
| "step": 75900 | |
| }, | |
| { | |
| "epoch": 3.8, | |
| "grad_norm": 1.833924412727356, | |
| "learning_rate": 7.244924623115577e-05, | |
| "loss": 1.051, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 3.8, | |
| "eval_loss": 1.0207512378692627, | |
| "eval_runtime": 38.1696, | |
| "eval_samples_per_second": 26.199, | |
| "eval_steps_per_second": 3.275, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 3.81, | |
| "grad_norm": 2.3891940116882324, | |
| "learning_rate": 7.214773869346733e-05, | |
| "loss": 1.0006, | |
| "step": 76100 | |
| }, | |
| { | |
| "epoch": 3.81, | |
| "grad_norm": 2.6063296794891357, | |
| "learning_rate": 7.184623115577889e-05, | |
| "loss": 1.0011, | |
| "step": 76200 | |
| }, | |
| { | |
| "epoch": 3.81, | |
| "grad_norm": 1.7001017332077026, | |
| "learning_rate": 7.154472361809045e-05, | |
| "loss": 1.0172, | |
| "step": 76300 | |
| }, | |
| { | |
| "epoch": 3.82, | |
| "grad_norm": 2.0134339332580566, | |
| "learning_rate": 7.124321608040201e-05, | |
| "loss": 1.0367, | |
| "step": 76400 | |
| }, | |
| { | |
| "epoch": 3.83, | |
| "grad_norm": 2.199366807937622, | |
| "learning_rate": 7.094170854271357e-05, | |
| "loss": 1.044, | |
| "step": 76500 | |
| }, | |
| { | |
| "epoch": 3.83, | |
| "grad_norm": 2.8991353511810303, | |
| "learning_rate": 7.064020100502511e-05, | |
| "loss": 1.0121, | |
| "step": 76600 | |
| }, | |
| { | |
| "epoch": 3.83, | |
| "grad_norm": 5.798487663269043, | |
| "learning_rate": 7.033869346733667e-05, | |
| "loss": 0.9734, | |
| "step": 76700 | |
| }, | |
| { | |
| "epoch": 3.84, | |
| "grad_norm": 2.8960068225860596, | |
| "learning_rate": 7.003718592964823e-05, | |
| "loss": 1.004, | |
| "step": 76800 | |
| }, | |
| { | |
| "epoch": 3.84, | |
| "grad_norm": 2.980179786682129, | |
| "learning_rate": 6.973567839195979e-05, | |
| "loss": 1.0118, | |
| "step": 76900 | |
| }, | |
| { | |
| "epoch": 3.85, | |
| "grad_norm": 6.4917988777160645, | |
| "learning_rate": 6.943417085427135e-05, | |
| "loss": 0.9682, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 3.85, | |
| "eval_loss": 1.0282562971115112, | |
| "eval_runtime": 38.0717, | |
| "eval_samples_per_second": 26.266, | |
| "eval_steps_per_second": 3.283, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 3.85, | |
| "grad_norm": 2.9224038124084473, | |
| "learning_rate": 6.913266331658291e-05, | |
| "loss": 1.0385, | |
| "step": 77100 | |
| }, | |
| { | |
| "epoch": 3.86, | |
| "grad_norm": 4.447437763214111, | |
| "learning_rate": 6.883115577889447e-05, | |
| "loss": 1.0388, | |
| "step": 77200 | |
| }, | |
| { | |
| "epoch": 3.87, | |
| "grad_norm": 2.2013559341430664, | |
| "learning_rate": 6.852964824120603e-05, | |
| "loss": 1.034, | |
| "step": 77300 | |
| }, | |
| { | |
| "epoch": 3.87, | |
| "grad_norm": 1.3720605373382568, | |
| "learning_rate": 6.822814070351757e-05, | |
| "loss": 1.0512, | |
| "step": 77400 | |
| }, | |
| { | |
| "epoch": 3.88, | |
| "grad_norm": 2.4448797702789307, | |
| "learning_rate": 6.792663316582913e-05, | |
| "loss": 1.0012, | |
| "step": 77500 | |
| }, | |
| { | |
| "epoch": 3.88, | |
| "grad_norm": 4.061469554901123, | |
| "learning_rate": 6.762512562814069e-05, | |
| "loss": 1.0144, | |
| "step": 77600 | |
| }, | |
| { | |
| "epoch": 3.88, | |
| "grad_norm": 1.62380850315094, | |
| "learning_rate": 6.732361809045225e-05, | |
| "loss": 1.0369, | |
| "step": 77700 | |
| }, | |
| { | |
| "epoch": 3.89, | |
| "grad_norm": 1.3728336095809937, | |
| "learning_rate": 6.702211055276381e-05, | |
| "loss": 1.0133, | |
| "step": 77800 | |
| }, | |
| { | |
| "epoch": 3.9, | |
| "grad_norm": 7.0939435958862305, | |
| "learning_rate": 6.672060301507537e-05, | |
| "loss": 0.9797, | |
| "step": 77900 | |
| }, | |
| { | |
| "epoch": 3.9, | |
| "grad_norm": 2.0842604637145996, | |
| "learning_rate": 6.642211055276381e-05, | |
| "loss": 1.0035, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 3.9, | |
| "eval_loss": 1.0243637561798096, | |
| "eval_runtime": 38.1566, | |
| "eval_samples_per_second": 26.208, | |
| "eval_steps_per_second": 3.276, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 3.91, | |
| "grad_norm": 3.6360020637512207, | |
| "learning_rate": 6.612060301507537e-05, | |
| "loss": 0.9969, | |
| "step": 78100 | |
| }, | |
| { | |
| "epoch": 3.91, | |
| "grad_norm": 2.5551681518554688, | |
| "learning_rate": 6.581909547738693e-05, | |
| "loss": 1.0203, | |
| "step": 78200 | |
| }, | |
| { | |
| "epoch": 3.92, | |
| "grad_norm": 6.86871862411499, | |
| "learning_rate": 6.551758793969849e-05, | |
| "loss": 1.0472, | |
| "step": 78300 | |
| }, | |
| { | |
| "epoch": 3.92, | |
| "grad_norm": 2.3950083255767822, | |
| "learning_rate": 6.521608040201005e-05, | |
| "loss": 1.0167, | |
| "step": 78400 | |
| }, | |
| { | |
| "epoch": 3.92, | |
| "grad_norm": 1.422188401222229, | |
| "learning_rate": 6.491457286432161e-05, | |
| "loss": 0.9968, | |
| "step": 78500 | |
| }, | |
| { | |
| "epoch": 3.93, | |
| "grad_norm": 2.186511993408203, | |
| "learning_rate": 6.461306532663317e-05, | |
| "loss": 1.0113, | |
| "step": 78600 | |
| }, | |
| { | |
| "epoch": 3.94, | |
| "grad_norm": 1.764722228050232, | |
| "learning_rate": 6.431155778894471e-05, | |
| "loss": 0.983, | |
| "step": 78700 | |
| }, | |
| { | |
| "epoch": 3.94, | |
| "grad_norm": 4.928635597229004, | |
| "learning_rate": 6.401005025125627e-05, | |
| "loss": 1.0164, | |
| "step": 78800 | |
| }, | |
| { | |
| "epoch": 3.94, | |
| "grad_norm": 2.1061389446258545, | |
| "learning_rate": 6.370854271356783e-05, | |
| "loss": 1.0171, | |
| "step": 78900 | |
| }, | |
| { | |
| "epoch": 3.95, | |
| "grad_norm": 4.193387985229492, | |
| "learning_rate": 6.340703517587939e-05, | |
| "loss": 1.0072, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 3.95, | |
| "eval_loss": 1.00971519947052, | |
| "eval_runtime": 38.1263, | |
| "eval_samples_per_second": 26.229, | |
| "eval_steps_per_second": 3.279, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 3.96, | |
| "grad_norm": 2.4844706058502197, | |
| "learning_rate": 6.310552763819095e-05, | |
| "loss": 1.0064, | |
| "step": 79100 | |
| }, | |
| { | |
| "epoch": 3.96, | |
| "grad_norm": 5.7934746742248535, | |
| "learning_rate": 6.280402010050251e-05, | |
| "loss": 0.9509, | |
| "step": 79200 | |
| }, | |
| { | |
| "epoch": 3.96, | |
| "grad_norm": 3.7046196460723877, | |
| "learning_rate": 6.250251256281406e-05, | |
| "loss": 1.0139, | |
| "step": 79300 | |
| }, | |
| { | |
| "epoch": 3.97, | |
| "grad_norm": 1.9528000354766846, | |
| "learning_rate": 6.220100502512562e-05, | |
| "loss": 1.0214, | |
| "step": 79400 | |
| }, | |
| { | |
| "epoch": 3.98, | |
| "grad_norm": 3.4000682830810547, | |
| "learning_rate": 6.189949748743718e-05, | |
| "loss": 1.006, | |
| "step": 79500 | |
| }, | |
| { | |
| "epoch": 3.98, | |
| "grad_norm": 3.152561664581299, | |
| "learning_rate": 6.159798994974874e-05, | |
| "loss": 1.0288, | |
| "step": 79600 | |
| }, | |
| { | |
| "epoch": 3.98, | |
| "grad_norm": 3.774915933609009, | |
| "learning_rate": 6.12964824120603e-05, | |
| "loss": 1.022, | |
| "step": 79700 | |
| }, | |
| { | |
| "epoch": 3.99, | |
| "grad_norm": 2.291813373565674, | |
| "learning_rate": 6.0994974874371854e-05, | |
| "loss": 0.9845, | |
| "step": 79800 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 3.019514560699463, | |
| "learning_rate": 6.0693467336683413e-05, | |
| "loss": 1.0246, | |
| "step": 79900 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 2.4409408569335938, | |
| "learning_rate": 6.0391959798994966e-05, | |
| "loss": 0.9951, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 0.9992234110832214, | |
| "eval_runtime": 39.3867, | |
| "eval_samples_per_second": 25.389, | |
| "eval_steps_per_second": 3.174, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 1.4257367849349976, | |
| "learning_rate": 6.0090452261306526e-05, | |
| "loss": 0.9763, | |
| "step": 80100 | |
| }, | |
| { | |
| "epoch": 4.01, | |
| "grad_norm": 4.97927713394165, | |
| "learning_rate": 5.9788944723618085e-05, | |
| "loss": 0.9417, | |
| "step": 80200 | |
| }, | |
| { | |
| "epoch": 4.01, | |
| "grad_norm": 2.8552098274230957, | |
| "learning_rate": 5.9487437185929644e-05, | |
| "loss": 0.9591, | |
| "step": 80300 | |
| }, | |
| { | |
| "epoch": 4.02, | |
| "grad_norm": Infinity, | |
| "learning_rate": 5.9188944723618084e-05, | |
| "loss": 0.9783, | |
| "step": 80400 | |
| }, | |
| { | |
| "epoch": 4.03, | |
| "grad_norm": 3.83720064163208, | |
| "learning_rate": 5.8887437185929643e-05, | |
| "loss": 0.9607, | |
| "step": 80500 | |
| }, | |
| { | |
| "epoch": 4.03, | |
| "grad_norm": 2.607973337173462, | |
| "learning_rate": 5.85859296482412e-05, | |
| "loss": 0.9556, | |
| "step": 80600 | |
| }, | |
| { | |
| "epoch": 4.04, | |
| "grad_norm": 3.51914381980896, | |
| "learning_rate": 5.8284422110552756e-05, | |
| "loss": 0.9371, | |
| "step": 80700 | |
| }, | |
| { | |
| "epoch": 4.04, | |
| "grad_norm": 2.0518856048583984, | |
| "learning_rate": 5.7982914572864315e-05, | |
| "loss": 1.0154, | |
| "step": 80800 | |
| }, | |
| { | |
| "epoch": 4.04, | |
| "grad_norm": 3.5824625492095947, | |
| "learning_rate": 5.7681407035175874e-05, | |
| "loss": 0.9894, | |
| "step": 80900 | |
| }, | |
| { | |
| "epoch": 4.05, | |
| "grad_norm": 7.991865634918213, | |
| "learning_rate": 5.7379899497487434e-05, | |
| "loss": 0.9719, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 4.05, | |
| "eval_loss": 1.0105689764022827, | |
| "eval_runtime": 38.1347, | |
| "eval_samples_per_second": 26.223, | |
| "eval_steps_per_second": 3.278, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 4.05, | |
| "grad_norm": 1.6757104396820068, | |
| "learning_rate": 5.707839195979899e-05, | |
| "loss": 0.9526, | |
| "step": 81100 | |
| }, | |
| { | |
| "epoch": 4.06, | |
| "grad_norm": 3.1675045490264893, | |
| "learning_rate": 5.677688442211055e-05, | |
| "loss": 0.9798, | |
| "step": 81200 | |
| }, | |
| { | |
| "epoch": 4.07, | |
| "grad_norm": 2.8390209674835205, | |
| "learning_rate": 5.6475376884422105e-05, | |
| "loss": 0.9455, | |
| "step": 81300 | |
| }, | |
| { | |
| "epoch": 4.07, | |
| "grad_norm": 2.2900238037109375, | |
| "learning_rate": 5.6173869346733665e-05, | |
| "loss": 1.0016, | |
| "step": 81400 | |
| }, | |
| { | |
| "epoch": 4.08, | |
| "grad_norm": 2.4220378398895264, | |
| "learning_rate": 5.5872361809045224e-05, | |
| "loss": 0.9681, | |
| "step": 81500 | |
| }, | |
| { | |
| "epoch": 4.08, | |
| "grad_norm": 2.7175300121307373, | |
| "learning_rate": 5.5570854271356784e-05, | |
| "loss": 0.9822, | |
| "step": 81600 | |
| }, | |
| { | |
| "epoch": 4.08, | |
| "grad_norm": 3.7499475479125977, | |
| "learning_rate": 5.526934673366834e-05, | |
| "loss": 0.9501, | |
| "step": 81700 | |
| }, | |
| { | |
| "epoch": 4.09, | |
| "grad_norm": 2.1566553115844727, | |
| "learning_rate": 5.4967839195979896e-05, | |
| "loss": 0.9601, | |
| "step": 81800 | |
| }, | |
| { | |
| "epoch": 4.09, | |
| "grad_norm": 2.080754280090332, | |
| "learning_rate": 5.466633165829145e-05, | |
| "loss": 0.954, | |
| "step": 81900 | |
| }, | |
| { | |
| "epoch": 4.1, | |
| "grad_norm": 3.1466102600097656, | |
| "learning_rate": 5.436482412060301e-05, | |
| "loss": 0.9896, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 4.1, | |
| "eval_loss": 1.0087724924087524, | |
| "eval_runtime": 37.9931, | |
| "eval_samples_per_second": 26.321, | |
| "eval_steps_per_second": 3.29, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 4.11, | |
| "grad_norm": 4.262351989746094, | |
| "learning_rate": 5.406331658291457e-05, | |
| "loss": 0.9454, | |
| "step": 82100 | |
| }, | |
| { | |
| "epoch": 4.11, | |
| "grad_norm": 1.9488756656646729, | |
| "learning_rate": 5.376180904522612e-05, | |
| "loss": 0.9494, | |
| "step": 82200 | |
| }, | |
| { | |
| "epoch": 4.12, | |
| "grad_norm": 1.6786818504333496, | |
| "learning_rate": 5.346030150753768e-05, | |
| "loss": 0.9241, | |
| "step": 82300 | |
| }, | |
| { | |
| "epoch": 4.12, | |
| "grad_norm": 2.143955945968628, | |
| "learning_rate": 5.315879396984924e-05, | |
| "loss": 0.9958, | |
| "step": 82400 | |
| }, | |
| { | |
| "epoch": 4.12, | |
| "grad_norm": 3.6211471557617188, | |
| "learning_rate": 5.286030150753768e-05, | |
| "loss": 0.9641, | |
| "step": 82500 | |
| }, | |
| { | |
| "epoch": 4.13, | |
| "grad_norm": 4.066643238067627, | |
| "learning_rate": 5.255879396984924e-05, | |
| "loss": 0.9698, | |
| "step": 82600 | |
| }, | |
| { | |
| "epoch": 4.13, | |
| "grad_norm": 2.151590585708618, | |
| "learning_rate": 5.22572864321608e-05, | |
| "loss": 0.9388, | |
| "step": 82700 | |
| }, | |
| { | |
| "epoch": 4.14, | |
| "grad_norm": 4.644803524017334, | |
| "learning_rate": 5.195577889447236e-05, | |
| "loss": 0.9141, | |
| "step": 82800 | |
| }, | |
| { | |
| "epoch": 4.14, | |
| "grad_norm": 2.652754068374634, | |
| "learning_rate": 5.1654271356783916e-05, | |
| "loss": 0.9592, | |
| "step": 82900 | |
| }, | |
| { | |
| "epoch": 4.15, | |
| "grad_norm": 4.528812885284424, | |
| "learning_rate": 5.135276381909547e-05, | |
| "loss": 0.9778, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 4.15, | |
| "eval_loss": 0.9974797368049622, | |
| "eval_runtime": 38.0893, | |
| "eval_samples_per_second": 26.254, | |
| "eval_steps_per_second": 3.282, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 4.16, | |
| "grad_norm": 2.625786542892456, | |
| "learning_rate": 5.105125628140703e-05, | |
| "loss": 0.9594, | |
| "step": 83100 | |
| }, | |
| { | |
| "epoch": 4.16, | |
| "grad_norm": 3.7137229442596436, | |
| "learning_rate": 5.074974874371859e-05, | |
| "loss": 0.9462, | |
| "step": 83200 | |
| }, | |
| { | |
| "epoch": 4.17, | |
| "grad_norm": 6.682472229003906, | |
| "learning_rate": 5.044824120603015e-05, | |
| "loss": 0.9301, | |
| "step": 83300 | |
| }, | |
| { | |
| "epoch": 4.17, | |
| "grad_norm": 2.7188687324523926, | |
| "learning_rate": 5.014673366834171e-05, | |
| "loss": 0.9801, | |
| "step": 83400 | |
| }, | |
| { | |
| "epoch": 4.17, | |
| "grad_norm": 2.7037341594696045, | |
| "learning_rate": 4.984522613065326e-05, | |
| "loss": 0.9475, | |
| "step": 83500 | |
| }, | |
| { | |
| "epoch": 4.18, | |
| "grad_norm": 2.815229654312134, | |
| "learning_rate": 4.954371859296482e-05, | |
| "loss": 0.9012, | |
| "step": 83600 | |
| }, | |
| { | |
| "epoch": 4.18, | |
| "grad_norm": 2.7187130451202393, | |
| "learning_rate": 4.924221105527638e-05, | |
| "loss": 0.9199, | |
| "step": 83700 | |
| }, | |
| { | |
| "epoch": 4.19, | |
| "grad_norm": 1.6610496044158936, | |
| "learning_rate": 4.894070351758794e-05, | |
| "loss": 0.9321, | |
| "step": 83800 | |
| }, | |
| { | |
| "epoch": 4.2, | |
| "grad_norm": 2.1496291160583496, | |
| "learning_rate": 4.86391959798995e-05, | |
| "loss": 0.9003, | |
| "step": 83900 | |
| }, | |
| { | |
| "epoch": 4.2, | |
| "grad_norm": 2.9933974742889404, | |
| "learning_rate": 4.833768844221105e-05, | |
| "loss": 0.9467, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 4.2, | |
| "eval_loss": 0.9802306890487671, | |
| "eval_runtime": 38.0487, | |
| "eval_samples_per_second": 26.282, | |
| "eval_steps_per_second": 3.285, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 4.21, | |
| "grad_norm": 4.368553161621094, | |
| "learning_rate": 4.803618090452261e-05, | |
| "loss": 0.921, | |
| "step": 84100 | |
| }, | |
| { | |
| "epoch": 4.21, | |
| "grad_norm": 4.087899684906006, | |
| "learning_rate": 4.773467336683417e-05, | |
| "loss": 0.9413, | |
| "step": 84200 | |
| }, | |
| { | |
| "epoch": 4.21, | |
| "grad_norm": 1.8541690111160278, | |
| "learning_rate": 4.743316582914573e-05, | |
| "loss": 0.9657, | |
| "step": 84300 | |
| }, | |
| { | |
| "epoch": 4.22, | |
| "grad_norm": 2.6514675617218018, | |
| "learning_rate": 4.713165829145729e-05, | |
| "loss": 0.9645, | |
| "step": 84400 | |
| }, | |
| { | |
| "epoch": 4.22, | |
| "grad_norm": 3.2329466342926025, | |
| "learning_rate": 4.683015075376885e-05, | |
| "loss": 0.9465, | |
| "step": 84500 | |
| }, | |
| { | |
| "epoch": 4.23, | |
| "grad_norm": 2.358675241470337, | |
| "learning_rate": 4.652864321608039e-05, | |
| "loss": 0.9644, | |
| "step": 84600 | |
| }, | |
| { | |
| "epoch": 4.24, | |
| "grad_norm": 3.6738836765289307, | |
| "learning_rate": 4.6230150753768846e-05, | |
| "loss": 0.9357, | |
| "step": 84700 | |
| }, | |
| { | |
| "epoch": 4.24, | |
| "grad_norm": 2.8447327613830566, | |
| "learning_rate": 4.59286432160804e-05, | |
| "loss": 0.9308, | |
| "step": 84800 | |
| }, | |
| { | |
| "epoch": 4.25, | |
| "grad_norm": 1.6326079368591309, | |
| "learning_rate": 4.562713567839195e-05, | |
| "loss": 0.9068, | |
| "step": 84900 | |
| }, | |
| { | |
| "epoch": 4.25, | |
| "grad_norm": 2.3545360565185547, | |
| "learning_rate": 4.532562814070351e-05, | |
| "loss": 0.9436, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 4.25, | |
| "eval_loss": 0.9844674468040466, | |
| "eval_runtime": 38.274, | |
| "eval_samples_per_second": 26.127, | |
| "eval_steps_per_second": 3.266, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 4.25, | |
| "grad_norm": 3.2402210235595703, | |
| "learning_rate": 4.502412060301507e-05, | |
| "loss": 0.9313, | |
| "step": 85100 | |
| }, | |
| { | |
| "epoch": 4.26, | |
| "grad_norm": 3.3900952339172363, | |
| "learning_rate": 4.472261306532662e-05, | |
| "loss": 0.9385, | |
| "step": 85200 | |
| }, | |
| { | |
| "epoch": 4.26, | |
| "grad_norm": 3.8531854152679443, | |
| "learning_rate": 4.442110552763818e-05, | |
| "loss": 0.9292, | |
| "step": 85300 | |
| }, | |
| { | |
| "epoch": 4.27, | |
| "grad_norm": 2.3123373985290527, | |
| "learning_rate": 4.411959798994974e-05, | |
| "loss": 0.9544, | |
| "step": 85400 | |
| }, | |
| { | |
| "epoch": 4.28, | |
| "grad_norm": 2.5710906982421875, | |
| "learning_rate": 4.38180904522613e-05, | |
| "loss": 0.9591, | |
| "step": 85500 | |
| }, | |
| { | |
| "epoch": 4.28, | |
| "grad_norm": 3.4481329917907715, | |
| "learning_rate": 4.351658291457286e-05, | |
| "loss": 0.9281, | |
| "step": 85600 | |
| }, | |
| { | |
| "epoch": 4.29, | |
| "grad_norm": 1.7887803316116333, | |
| "learning_rate": 4.321507537688442e-05, | |
| "loss": 0.9371, | |
| "step": 85700 | |
| }, | |
| { | |
| "epoch": 4.29, | |
| "grad_norm": 6.177557945251465, | |
| "learning_rate": 4.291356783919597e-05, | |
| "loss": 0.9154, | |
| "step": 85800 | |
| }, | |
| { | |
| "epoch": 4.29, | |
| "grad_norm": 3.0554301738739014, | |
| "learning_rate": 4.261206030150753e-05, | |
| "loss": 0.9483, | |
| "step": 85900 | |
| }, | |
| { | |
| "epoch": 4.3, | |
| "grad_norm": 2.0133023262023926, | |
| "learning_rate": 4.231055276381909e-05, | |
| "loss": 0.9557, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 4.3, | |
| "eval_loss": 0.9593837261199951, | |
| "eval_runtime": 38.1446, | |
| "eval_samples_per_second": 26.216, | |
| "eval_steps_per_second": 3.277, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 4.3, | |
| "grad_norm": 2.1396610736846924, | |
| "learning_rate": 4.200904522613065e-05, | |
| "loss": 0.9643, | |
| "step": 86100 | |
| }, | |
| { | |
| "epoch": 4.31, | |
| "grad_norm": 2.709627628326416, | |
| "learning_rate": 4.170753768844221e-05, | |
| "loss": 0.9365, | |
| "step": 86200 | |
| }, | |
| { | |
| "epoch": 4.32, | |
| "grad_norm": 4.406678199768066, | |
| "learning_rate": 4.1406030150753764e-05, | |
| "loss": 0.9553, | |
| "step": 86300 | |
| }, | |
| { | |
| "epoch": 4.32, | |
| "grad_norm": 4.822593688964844, | |
| "learning_rate": 4.110452261306532e-05, | |
| "loss": 0.9213, | |
| "step": 86400 | |
| }, | |
| { | |
| "epoch": 4.33, | |
| "grad_norm": 4.148794651031494, | |
| "learning_rate": 4.080301507537688e-05, | |
| "loss": 0.9808, | |
| "step": 86500 | |
| }, | |
| { | |
| "epoch": 4.33, | |
| "grad_norm": 3.7028510570526123, | |
| "learning_rate": 4.050150753768844e-05, | |
| "loss": 0.9331, | |
| "step": 86600 | |
| }, | |
| { | |
| "epoch": 4.33, | |
| "grad_norm": 2.314500093460083, | |
| "learning_rate": 4.02e-05, | |
| "loss": 0.9551, | |
| "step": 86700 | |
| }, | |
| { | |
| "epoch": 4.34, | |
| "grad_norm": 3.741234302520752, | |
| "learning_rate": 3.9898492462311554e-05, | |
| "loss": 0.9053, | |
| "step": 86800 | |
| }, | |
| { | |
| "epoch": 4.34, | |
| "grad_norm": 3.7346441745758057, | |
| "learning_rate": 3.9596984924623113e-05, | |
| "loss": 0.9517, | |
| "step": 86900 | |
| }, | |
| { | |
| "epoch": 4.35, | |
| "grad_norm": 1.324827790260315, | |
| "learning_rate": 3.929849246231156e-05, | |
| "loss": 0.9764, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 4.35, | |
| "eval_loss": 1.0139998197555542, | |
| "eval_runtime": 38.1639, | |
| "eval_samples_per_second": 26.203, | |
| "eval_steps_per_second": 3.275, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 4.36, | |
| "grad_norm": 5.19126033782959, | |
| "learning_rate": 3.899698492462311e-05, | |
| "loss": 0.9366, | |
| "step": 87100 | |
| }, | |
| { | |
| "epoch": 4.36, | |
| "grad_norm": 2.899726629257202, | |
| "learning_rate": 3.869547738693467e-05, | |
| "loss": 0.9555, | |
| "step": 87200 | |
| }, | |
| { | |
| "epoch": 4.37, | |
| "grad_norm": 1.9099615812301636, | |
| "learning_rate": 3.839396984924623e-05, | |
| "loss": 0.9033, | |
| "step": 87300 | |
| }, | |
| { | |
| "epoch": 4.37, | |
| "grad_norm": 1.5814082622528076, | |
| "learning_rate": 3.809246231155779e-05, | |
| "loss": 0.9978, | |
| "step": 87400 | |
| }, | |
| { | |
| "epoch": 4.38, | |
| "grad_norm": 3.4520106315612793, | |
| "learning_rate": 3.779095477386935e-05, | |
| "loss": 0.9343, | |
| "step": 87500 | |
| }, | |
| { | |
| "epoch": 4.38, | |
| "grad_norm": 3.0876681804656982, | |
| "learning_rate": 3.74894472361809e-05, | |
| "loss": 0.9094, | |
| "step": 87600 | |
| }, | |
| { | |
| "epoch": 4.38, | |
| "grad_norm": 3.5139119625091553, | |
| "learning_rate": 3.718793969849246e-05, | |
| "loss": 0.8677, | |
| "step": 87700 | |
| }, | |
| { | |
| "epoch": 4.39, | |
| "grad_norm": 2.003330945968628, | |
| "learning_rate": 3.6886432160804015e-05, | |
| "loss": 0.9351, | |
| "step": 87800 | |
| }, | |
| { | |
| "epoch": 4.39, | |
| "grad_norm": 2.259235382080078, | |
| "learning_rate": 3.6584924623115574e-05, | |
| "loss": 0.9388, | |
| "step": 87900 | |
| }, | |
| { | |
| "epoch": 4.4, | |
| "grad_norm": 2.2141153812408447, | |
| "learning_rate": 3.6283417085427134e-05, | |
| "loss": 0.9169, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 4.4, | |
| "eval_loss": 0.9528889060020447, | |
| "eval_runtime": 38.0305, | |
| "eval_samples_per_second": 26.295, | |
| "eval_steps_per_second": 3.287, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 4.41, | |
| "grad_norm": 4.264975547790527, | |
| "learning_rate": 3.5981909547738693e-05, | |
| "loss": 0.9309, | |
| "step": 88100 | |
| }, | |
| { | |
| "epoch": 4.41, | |
| "grad_norm": 4.431647777557373, | |
| "learning_rate": 3.5680402010050246e-05, | |
| "loss": 0.9035, | |
| "step": 88200 | |
| }, | |
| { | |
| "epoch": 4.42, | |
| "grad_norm": 2.326883316040039, | |
| "learning_rate": 3.5378894472361806e-05, | |
| "loss": 0.904, | |
| "step": 88300 | |
| }, | |
| { | |
| "epoch": 4.42, | |
| "grad_norm": 2.6951944828033447, | |
| "learning_rate": 3.5077386934673365e-05, | |
| "loss": 0.9195, | |
| "step": 88400 | |
| }, | |
| { | |
| "epoch": 4.42, | |
| "grad_norm": 1.8017208576202393, | |
| "learning_rate": 3.477587939698492e-05, | |
| "loss": 0.9398, | |
| "step": 88500 | |
| }, | |
| { | |
| "epoch": 4.43, | |
| "grad_norm": 3.8392789363861084, | |
| "learning_rate": 3.447437185929648e-05, | |
| "loss": 0.9591, | |
| "step": 88600 | |
| }, | |
| { | |
| "epoch": 4.43, | |
| "grad_norm": 2.541273593902588, | |
| "learning_rate": 3.4172864321608037e-05, | |
| "loss": 0.9054, | |
| "step": 88700 | |
| }, | |
| { | |
| "epoch": 4.44, | |
| "grad_norm": 2.7736191749572754, | |
| "learning_rate": 3.3874371859296476e-05, | |
| "loss": 0.9473, | |
| "step": 88800 | |
| }, | |
| { | |
| "epoch": 4.45, | |
| "grad_norm": 2.660540819168091, | |
| "learning_rate": 3.3572864321608036e-05, | |
| "loss": 0.9582, | |
| "step": 88900 | |
| }, | |
| { | |
| "epoch": 4.45, | |
| "grad_norm": 3.161513328552246, | |
| "learning_rate": 3.3271356783919595e-05, | |
| "loss": 0.8943, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 4.45, | |
| "eval_loss": 0.9552559852600098, | |
| "eval_runtime": 38.1158, | |
| "eval_samples_per_second": 26.236, | |
| "eval_steps_per_second": 3.279, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 4.46, | |
| "grad_norm": 4.881318092346191, | |
| "learning_rate": 3.2969849246231154e-05, | |
| "loss": 0.9053, | |
| "step": 89100 | |
| }, | |
| { | |
| "epoch": 4.46, | |
| "grad_norm": 1.7572602033615112, | |
| "learning_rate": 3.2668341708542714e-05, | |
| "loss": 0.9364, | |
| "step": 89200 | |
| }, | |
| { | |
| "epoch": 4.46, | |
| "grad_norm": 3.067507743835449, | |
| "learning_rate": 3.2366834170854267e-05, | |
| "loss": 0.9355, | |
| "step": 89300 | |
| }, | |
| { | |
| "epoch": 4.47, | |
| "grad_norm": 3.1982858180999756, | |
| "learning_rate": 3.2065326633165826e-05, | |
| "loss": 0.9333, | |
| "step": 89400 | |
| }, | |
| { | |
| "epoch": 4.47, | |
| "grad_norm": 3.596789598464966, | |
| "learning_rate": 3.1763819095477385e-05, | |
| "loss": 0.8978, | |
| "step": 89500 | |
| }, | |
| { | |
| "epoch": 4.48, | |
| "grad_norm": 5.035818576812744, | |
| "learning_rate": 3.1462311557788945e-05, | |
| "loss": 0.9337, | |
| "step": 89600 | |
| }, | |
| { | |
| "epoch": 4.49, | |
| "grad_norm": 3.149653673171997, | |
| "learning_rate": 3.11608040201005e-05, | |
| "loss": 0.9515, | |
| "step": 89700 | |
| }, | |
| { | |
| "epoch": 4.49, | |
| "grad_norm": 3.4601404666900635, | |
| "learning_rate": 3.085929648241206e-05, | |
| "loss": 0.9021, | |
| "step": 89800 | |
| }, | |
| { | |
| "epoch": 4.5, | |
| "grad_norm": 2.6317124366760254, | |
| "learning_rate": 3.0557788944723616e-05, | |
| "loss": 0.9559, | |
| "step": 89900 | |
| }, | |
| { | |
| "epoch": 4.5, | |
| "grad_norm": 2.667861223220825, | |
| "learning_rate": 3.0256281407035173e-05, | |
| "loss": 0.9341, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 4.5, | |
| "eval_loss": 0.9440233111381531, | |
| "eval_runtime": 38.0809, | |
| "eval_samples_per_second": 26.26, | |
| "eval_steps_per_second": 3.282, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 4.5, | |
| "grad_norm": 3.903172016143799, | |
| "learning_rate": 2.9954773869346732e-05, | |
| "loss": 0.8857, | |
| "step": 90100 | |
| }, | |
| { | |
| "epoch": 4.51, | |
| "grad_norm": 3.9286229610443115, | |
| "learning_rate": 2.9653266331658288e-05, | |
| "loss": 0.9119, | |
| "step": 90200 | |
| }, | |
| { | |
| "epoch": 4.51, | |
| "grad_norm": 2.812256336212158, | |
| "learning_rate": 2.9351758793969847e-05, | |
| "loss": 0.9026, | |
| "step": 90300 | |
| }, | |
| { | |
| "epoch": 4.52, | |
| "grad_norm": 2.2835099697113037, | |
| "learning_rate": 2.9050251256281404e-05, | |
| "loss": 0.885, | |
| "step": 90400 | |
| }, | |
| { | |
| "epoch": 4.53, | |
| "grad_norm": 3.383111000061035, | |
| "learning_rate": 2.8748743718592963e-05, | |
| "loss": 0.8838, | |
| "step": 90500 | |
| }, | |
| { | |
| "epoch": 4.53, | |
| "grad_norm": 2.7682292461395264, | |
| "learning_rate": 2.8447236180904522e-05, | |
| "loss": 0.9139, | |
| "step": 90600 | |
| }, | |
| { | |
| "epoch": 4.54, | |
| "grad_norm": 6.3915019035339355, | |
| "learning_rate": 2.814572864321608e-05, | |
| "loss": 0.9188, | |
| "step": 90700 | |
| }, | |
| { | |
| "epoch": 4.54, | |
| "grad_norm": 5.53504753112793, | |
| "learning_rate": 2.7844221105527635e-05, | |
| "loss": 0.9118, | |
| "step": 90800 | |
| }, | |
| { | |
| "epoch": 4.54, | |
| "grad_norm": 2.5919177532196045, | |
| "learning_rate": 2.754271356783919e-05, | |
| "loss": 0.8844, | |
| "step": 90900 | |
| }, | |
| { | |
| "epoch": 4.55, | |
| "grad_norm": 1.9481797218322754, | |
| "learning_rate": 2.724120603015075e-05, | |
| "loss": 0.9192, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 4.55, | |
| "eval_loss": 0.9217103123664856, | |
| "eval_runtime": 38.1169, | |
| "eval_samples_per_second": 26.235, | |
| "eval_steps_per_second": 3.279, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 4.55, | |
| "grad_norm": 2.1429965496063232, | |
| "learning_rate": 2.693969849246231e-05, | |
| "loss": 0.8889, | |
| "step": 91100 | |
| }, | |
| { | |
| "epoch": 4.56, | |
| "grad_norm": 3.4818546772003174, | |
| "learning_rate": 2.6638190954773866e-05, | |
| "loss": 0.8932, | |
| "step": 91200 | |
| }, | |
| { | |
| "epoch": 4.56, | |
| "grad_norm": 2.3813984394073486, | |
| "learning_rate": 2.6336683417085425e-05, | |
| "loss": 0.9154, | |
| "step": 91300 | |
| }, | |
| { | |
| "epoch": 4.57, | |
| "grad_norm": 2.4688570499420166, | |
| "learning_rate": 2.6035175879396984e-05, | |
| "loss": 0.9344, | |
| "step": 91400 | |
| }, | |
| { | |
| "epoch": 4.58, | |
| "grad_norm": 4.330790996551514, | |
| "learning_rate": 2.573366834170854e-05, | |
| "loss": 0.9137, | |
| "step": 91500 | |
| }, | |
| { | |
| "epoch": 4.58, | |
| "grad_norm": 2.8123939037323, | |
| "learning_rate": 2.54321608040201e-05, | |
| "loss": 0.9041, | |
| "step": 91600 | |
| }, | |
| { | |
| "epoch": 4.58, | |
| "grad_norm": 2.1815638542175293, | |
| "learning_rate": 2.5130653266331656e-05, | |
| "loss": 0.8606, | |
| "step": 91700 | |
| }, | |
| { | |
| "epoch": 4.59, | |
| "grad_norm": 3.3489341735839844, | |
| "learning_rate": 2.4829145728643216e-05, | |
| "loss": 0.934, | |
| "step": 91800 | |
| }, | |
| { | |
| "epoch": 4.59, | |
| "grad_norm": 2.9650094509124756, | |
| "learning_rate": 2.4527638190954775e-05, | |
| "loss": 0.8893, | |
| "step": 91900 | |
| }, | |
| { | |
| "epoch": 4.6, | |
| "grad_norm": 3.541456460952759, | |
| "learning_rate": 2.4226130653266328e-05, | |
| "loss": 0.9239, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 4.6, | |
| "eval_loss": 0.9656698107719421, | |
| "eval_runtime": 38.5991, | |
| "eval_samples_per_second": 25.907, | |
| "eval_steps_per_second": 3.238, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 4.61, | |
| "grad_norm": 3.1648945808410645, | |
| "learning_rate": 2.3924623115577887e-05, | |
| "loss": 0.8777, | |
| "step": 92100 | |
| }, | |
| { | |
| "epoch": 4.61, | |
| "grad_norm": 8.632335662841797, | |
| "learning_rate": 2.3623115577889443e-05, | |
| "loss": 0.9047, | |
| "step": 92200 | |
| }, | |
| { | |
| "epoch": 4.62, | |
| "grad_norm": 2.9412002563476562, | |
| "learning_rate": 2.3321608040201003e-05, | |
| "loss": 0.8964, | |
| "step": 92300 | |
| }, | |
| { | |
| "epoch": 4.62, | |
| "grad_norm": 2.7501888275146484, | |
| "learning_rate": 2.3020100502512562e-05, | |
| "loss": 0.9303, | |
| "step": 92400 | |
| }, | |
| { | |
| "epoch": 4.62, | |
| "grad_norm": 3.36631178855896, | |
| "learning_rate": 2.2718592964824118e-05, | |
| "loss": 0.8987, | |
| "step": 92500 | |
| }, | |
| { | |
| "epoch": 4.63, | |
| "grad_norm": 2.6061251163482666, | |
| "learning_rate": 2.2417085427135678e-05, | |
| "loss": 0.8981, | |
| "step": 92600 | |
| }, | |
| { | |
| "epoch": 4.63, | |
| "grad_norm": 3.9636521339416504, | |
| "learning_rate": 2.2115577889447234e-05, | |
| "loss": 0.893, | |
| "step": 92700 | |
| }, | |
| { | |
| "epoch": 4.64, | |
| "grad_norm": 3.2085049152374268, | |
| "learning_rate": 2.1814070351758793e-05, | |
| "loss": 0.9298, | |
| "step": 92800 | |
| }, | |
| { | |
| "epoch": 4.64, | |
| "grad_norm": 2.590059995651245, | |
| "learning_rate": 2.1512562814070353e-05, | |
| "loss": 0.9118, | |
| "step": 92900 | |
| }, | |
| { | |
| "epoch": 4.65, | |
| "grad_norm": 4.868690013885498, | |
| "learning_rate": 2.121105527638191e-05, | |
| "loss": 0.8873, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 4.65, | |
| "eval_loss": 0.918121337890625, | |
| "eval_runtime": 38.3542, | |
| "eval_samples_per_second": 26.073, | |
| "eval_steps_per_second": 3.259, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 4.66, | |
| "grad_norm": 4.0143303871154785, | |
| "learning_rate": 2.0909547738693465e-05, | |
| "loss": 0.871, | |
| "step": 93100 | |
| }, | |
| { | |
| "epoch": 4.66, | |
| "grad_norm": 4.423349857330322, | |
| "learning_rate": 2.060804020100502e-05, | |
| "loss": 0.9232, | |
| "step": 93200 | |
| }, | |
| { | |
| "epoch": 4.67, | |
| "grad_norm": 3.6609606742858887, | |
| "learning_rate": 2.030653266331658e-05, | |
| "loss": 0.8782, | |
| "step": 93300 | |
| }, | |
| { | |
| "epoch": 4.67, | |
| "grad_norm": 3.252089738845825, | |
| "learning_rate": 2.0008040201005026e-05, | |
| "loss": 0.9232, | |
| "step": 93400 | |
| }, | |
| { | |
| "epoch": 4.67, | |
| "grad_norm": 2.8783979415893555, | |
| "learning_rate": 1.970653266331658e-05, | |
| "loss": 0.8539, | |
| "step": 93500 | |
| }, | |
| { | |
| "epoch": 4.68, | |
| "grad_norm": 5.381927967071533, | |
| "learning_rate": 1.940502512562814e-05, | |
| "loss": 0.9263, | |
| "step": 93600 | |
| }, | |
| { | |
| "epoch": 4.69, | |
| "grad_norm": 3.1031525135040283, | |
| "learning_rate": 1.9103517587939695e-05, | |
| "loss": 0.9095, | |
| "step": 93700 | |
| }, | |
| { | |
| "epoch": 4.69, | |
| "grad_norm": 2.668039321899414, | |
| "learning_rate": 1.8802010050251254e-05, | |
| "loss": 0.892, | |
| "step": 93800 | |
| }, | |
| { | |
| "epoch": 4.7, | |
| "grad_norm": 2.6661875247955322, | |
| "learning_rate": 1.8500502512562814e-05, | |
| "loss": 0.8944, | |
| "step": 93900 | |
| }, | |
| { | |
| "epoch": 4.7, | |
| "grad_norm": 3.5291526317596436, | |
| "learning_rate": 1.819899497487437e-05, | |
| "loss": 0.9074, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 4.7, | |
| "eval_loss": 0.9208371639251709, | |
| "eval_runtime": 38.4003, | |
| "eval_samples_per_second": 26.041, | |
| "eval_steps_per_second": 3.255, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 4.71, | |
| "grad_norm": 4.160482883453369, | |
| "learning_rate": 1.789748743718593e-05, | |
| "loss": 0.9045, | |
| "step": 94100 | |
| }, | |
| { | |
| "epoch": 4.71, | |
| "grad_norm": 3.8051962852478027, | |
| "learning_rate": 1.7595979899497485e-05, | |
| "loss": 0.899, | |
| "step": 94200 | |
| }, | |
| { | |
| "epoch": 4.71, | |
| "grad_norm": 3.431490898132324, | |
| "learning_rate": 1.7294472361809045e-05, | |
| "loss": 0.8577, | |
| "step": 94300 | |
| }, | |
| { | |
| "epoch": 4.72, | |
| "grad_norm": 2.356250524520874, | |
| "learning_rate": 1.69929648241206e-05, | |
| "loss": 0.9204, | |
| "step": 94400 | |
| }, | |
| { | |
| "epoch": 4.72, | |
| "grad_norm": 5.237595081329346, | |
| "learning_rate": 1.669145728643216e-05, | |
| "loss": 0.8973, | |
| "step": 94500 | |
| }, | |
| { | |
| "epoch": 4.73, | |
| "grad_norm": 5.023568153381348, | |
| "learning_rate": 1.6389949748743716e-05, | |
| "loss": 0.9064, | |
| "step": 94600 | |
| }, | |
| { | |
| "epoch": 4.74, | |
| "grad_norm": 6.610247611999512, | |
| "learning_rate": 1.6088442211055276e-05, | |
| "loss": 0.858, | |
| "step": 94700 | |
| }, | |
| { | |
| "epoch": 4.74, | |
| "grad_norm": 2.1937615871429443, | |
| "learning_rate": 1.5786934673366835e-05, | |
| "loss": 0.872, | |
| "step": 94800 | |
| }, | |
| { | |
| "epoch": 4.75, | |
| "grad_norm": 4.40328311920166, | |
| "learning_rate": 1.548542713567839e-05, | |
| "loss": 0.88, | |
| "step": 94900 | |
| }, | |
| { | |
| "epoch": 4.75, | |
| "grad_norm": 3.0487658977508545, | |
| "learning_rate": 1.5183919597989947e-05, | |
| "loss": 0.8779, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 4.75, | |
| "eval_loss": 0.9459323883056641, | |
| "eval_runtime": 38.1338, | |
| "eval_samples_per_second": 26.223, | |
| "eval_steps_per_second": 3.278, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 4.75, | |
| "grad_norm": 3.8922808170318604, | |
| "learning_rate": 1.4882412060301507e-05, | |
| "loss": 0.9075, | |
| "step": 95100 | |
| }, | |
| { | |
| "epoch": 4.76, | |
| "grad_norm": 3.232625722885132, | |
| "learning_rate": 1.4580904522613064e-05, | |
| "loss": 0.869, | |
| "step": 95200 | |
| }, | |
| { | |
| "epoch": 4.76, | |
| "grad_norm": 8.73833179473877, | |
| "learning_rate": 1.4279396984924622e-05, | |
| "loss": 0.8741, | |
| "step": 95300 | |
| }, | |
| { | |
| "epoch": 4.77, | |
| "grad_norm": 4.5711846351623535, | |
| "learning_rate": 1.397788944723618e-05, | |
| "loss": 0.8976, | |
| "step": 95400 | |
| }, | |
| { | |
| "epoch": 4.78, | |
| "grad_norm": 4.647241115570068, | |
| "learning_rate": 1.3676381909547736e-05, | |
| "loss": 0.8392, | |
| "step": 95500 | |
| }, | |
| { | |
| "epoch": 4.78, | |
| "grad_norm": 4.90078067779541, | |
| "learning_rate": 1.337788944723618e-05, | |
| "loss": 0.8739, | |
| "step": 95600 | |
| }, | |
| { | |
| "epoch": 4.79, | |
| "grad_norm": 3.1595067977905273, | |
| "learning_rate": 1.3076381909547738e-05, | |
| "loss": 0.8398, | |
| "step": 95700 | |
| }, | |
| { | |
| "epoch": 4.79, | |
| "grad_norm": 2.488835096359253, | |
| "learning_rate": 1.2774874371859296e-05, | |
| "loss": 0.868, | |
| "step": 95800 | |
| }, | |
| { | |
| "epoch": 4.79, | |
| "grad_norm": 4.495543003082275, | |
| "learning_rate": 1.2473366834170852e-05, | |
| "loss": 0.8872, | |
| "step": 95900 | |
| }, | |
| { | |
| "epoch": 4.8, | |
| "grad_norm": 3.673161268234253, | |
| "learning_rate": 1.217185929648241e-05, | |
| "loss": 0.8824, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 4.8, | |
| "eval_loss": 0.910308301448822, | |
| "eval_runtime": 38.0891, | |
| "eval_samples_per_second": 26.254, | |
| "eval_steps_per_second": 3.282, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 4.8, | |
| "grad_norm": 5.159984111785889, | |
| "learning_rate": 1.187035175879397e-05, | |
| "loss": 0.8672, | |
| "step": 96100 | |
| }, | |
| { | |
| "epoch": 4.81, | |
| "grad_norm": 2.706937551498413, | |
| "learning_rate": 1.1568844221105527e-05, | |
| "loss": 0.8914, | |
| "step": 96200 | |
| }, | |
| { | |
| "epoch": 4.81, | |
| "grad_norm": 3.727692127227783, | |
| "learning_rate": 1.1267336683417085e-05, | |
| "loss": 0.8485, | |
| "step": 96300 | |
| }, | |
| { | |
| "epoch": 4.82, | |
| "grad_norm": 2.665670156478882, | |
| "learning_rate": 1.0965829145728641e-05, | |
| "loss": 0.8695, | |
| "step": 96400 | |
| }, | |
| { | |
| "epoch": 4.83, | |
| "grad_norm": 5.077518463134766, | |
| "learning_rate": 1.0664321608040199e-05, | |
| "loss": 0.8767, | |
| "step": 96500 | |
| }, | |
| { | |
| "epoch": 4.83, | |
| "grad_norm": 3.4337048530578613, | |
| "learning_rate": 1.0362814070351758e-05, | |
| "loss": 0.8673, | |
| "step": 96600 | |
| }, | |
| { | |
| "epoch": 4.83, | |
| "grad_norm": 3.231494665145874, | |
| "learning_rate": 1.0061306532663316e-05, | |
| "loss": 0.8767, | |
| "step": 96700 | |
| }, | |
| { | |
| "epoch": 4.84, | |
| "grad_norm": 4.2955002784729, | |
| "learning_rate": 9.759798994974874e-06, | |
| "loss": 0.8645, | |
| "step": 96800 | |
| }, | |
| { | |
| "epoch": 4.84, | |
| "grad_norm": 6.2070698738098145, | |
| "learning_rate": 9.458291457286431e-06, | |
| "loss": 0.8683, | |
| "step": 96900 | |
| }, | |
| { | |
| "epoch": 4.85, | |
| "grad_norm": 3.6267805099487305, | |
| "learning_rate": 9.159798994974874e-06, | |
| "loss": 0.907, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 4.85, | |
| "eval_loss": 0.9255304932594299, | |
| "eval_runtime": 38.1396, | |
| "eval_samples_per_second": 26.219, | |
| "eval_steps_per_second": 3.277, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 4.86, | |
| "grad_norm": 4.985959529876709, | |
| "learning_rate": 8.858291457286432e-06, | |
| "loss": 0.8615, | |
| "step": 97100 | |
| }, | |
| { | |
| "epoch": 4.86, | |
| "grad_norm": 4.538032531738281, | |
| "learning_rate": 8.556783919597988e-06, | |
| "loss": 0.8519, | |
| "step": 97200 | |
| }, | |
| { | |
| "epoch": 4.87, | |
| "grad_norm": 6.562105178833008, | |
| "learning_rate": 8.255276381909548e-06, | |
| "loss": 0.8888, | |
| "step": 97300 | |
| }, | |
| { | |
| "epoch": 4.87, | |
| "grad_norm": 2.922360897064209, | |
| "learning_rate": 7.953768844221105e-06, | |
| "loss": 0.8784, | |
| "step": 97400 | |
| }, | |
| { | |
| "epoch": 4.88, | |
| "grad_norm": 3.8349783420562744, | |
| "learning_rate": 7.652261306532663e-06, | |
| "loss": 0.8962, | |
| "step": 97500 | |
| }, | |
| { | |
| "epoch": 4.88, | |
| "grad_norm": 2.096787929534912, | |
| "learning_rate": 7.350753768844221e-06, | |
| "loss": 0.9088, | |
| "step": 97600 | |
| }, | |
| { | |
| "epoch": 4.88, | |
| "grad_norm": 2.512312650680542, | |
| "learning_rate": 7.0492462311557786e-06, | |
| "loss": 0.8816, | |
| "step": 97700 | |
| }, | |
| { | |
| "epoch": 4.89, | |
| "grad_norm": 4.749015808105469, | |
| "learning_rate": 6.7477386934673355e-06, | |
| "loss": 0.8791, | |
| "step": 97800 | |
| }, | |
| { | |
| "epoch": 4.89, | |
| "grad_norm": 3.5753800868988037, | |
| "learning_rate": 6.446231155778894e-06, | |
| "loss": 0.8414, | |
| "step": 97900 | |
| }, | |
| { | |
| "epoch": 4.9, | |
| "grad_norm": 2.849839210510254, | |
| "learning_rate": 6.144723618090452e-06, | |
| "loss": 0.873, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 4.9, | |
| "eval_loss": 0.8922821283340454, | |
| "eval_runtime": 38.1228, | |
| "eval_samples_per_second": 26.231, | |
| "eval_steps_per_second": 3.279, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 4.91, | |
| "grad_norm": 4.473388195037842, | |
| "learning_rate": 5.8432160804020096e-06, | |
| "loss": 0.8428, | |
| "step": 98100 | |
| }, | |
| { | |
| "epoch": 4.91, | |
| "grad_norm": 2.7943496704101562, | |
| "learning_rate": 5.541708542713567e-06, | |
| "loss": 0.8519, | |
| "step": 98200 | |
| }, | |
| { | |
| "epoch": 4.92, | |
| "grad_norm": 2.476835012435913, | |
| "learning_rate": 5.240201005025126e-06, | |
| "loss": 0.8841, | |
| "step": 98300 | |
| }, | |
| { | |
| "epoch": 4.92, | |
| "grad_norm": 4.992676258087158, | |
| "learning_rate": 4.938693467336683e-06, | |
| "loss": 0.8409, | |
| "step": 98400 | |
| }, | |
| { | |
| "epoch": 4.92, | |
| "grad_norm": 2.4756906032562256, | |
| "learning_rate": 4.637185929648241e-06, | |
| "loss": 0.8527, | |
| "step": 98500 | |
| }, | |
| { | |
| "epoch": 4.93, | |
| "grad_norm": 2.157059669494629, | |
| "learning_rate": 4.335678391959798e-06, | |
| "loss": 0.8605, | |
| "step": 98600 | |
| }, | |
| { | |
| "epoch": 4.94, | |
| "grad_norm": 2.8840818405151367, | |
| "learning_rate": 4.034170854271356e-06, | |
| "loss": 0.87, | |
| "step": 98700 | |
| }, | |
| { | |
| "epoch": 4.94, | |
| "grad_norm": 4.124537944793701, | |
| "learning_rate": 3.7326633165829143e-06, | |
| "loss": 0.8318, | |
| "step": 98800 | |
| }, | |
| { | |
| "epoch": 4.95, | |
| "grad_norm": 4.684917449951172, | |
| "learning_rate": 3.431155778894472e-06, | |
| "loss": 0.8479, | |
| "step": 98900 | |
| }, | |
| { | |
| "epoch": 4.95, | |
| "grad_norm": 2.413602590560913, | |
| "learning_rate": 3.12964824120603e-06, | |
| "loss": 0.8452, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 4.95, | |
| "eval_loss": 0.8957632780075073, | |
| "eval_runtime": 38.1658, | |
| "eval_samples_per_second": 26.201, | |
| "eval_steps_per_second": 3.275, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 4.96, | |
| "grad_norm": 3.240213394165039, | |
| "learning_rate": 2.828140703517588e-06, | |
| "loss": 0.8303, | |
| "step": 99100 | |
| }, | |
| { | |
| "epoch": 4.96, | |
| "grad_norm": 4.0827555656433105, | |
| "learning_rate": 2.5266331658291453e-06, | |
| "loss": 0.8872, | |
| "step": 99200 | |
| }, | |
| { | |
| "epoch": 4.96, | |
| "grad_norm": 2.948489189147949, | |
| "learning_rate": 2.2251256281407035e-06, | |
| "loss": 0.8707, | |
| "step": 99300 | |
| }, | |
| { | |
| "epoch": 4.97, | |
| "grad_norm": 6.414693832397461, | |
| "learning_rate": 1.9236180904522612e-06, | |
| "loss": 0.837, | |
| "step": 99400 | |
| }, | |
| { | |
| "epoch": 4.97, | |
| "grad_norm": 5.013907432556152, | |
| "learning_rate": 1.622110552763819e-06, | |
| "loss": 0.8443, | |
| "step": 99500 | |
| }, | |
| { | |
| "epoch": 4.98, | |
| "grad_norm": 2.487205743789673, | |
| "learning_rate": 1.3206030150753765e-06, | |
| "loss": 0.8425, | |
| "step": 99600 | |
| }, | |
| { | |
| "epoch": 4.99, | |
| "grad_norm": 5.77063512802124, | |
| "learning_rate": 1.0190954773869345e-06, | |
| "loss": 0.8509, | |
| "step": 99700 | |
| }, | |
| { | |
| "epoch": 4.99, | |
| "grad_norm": 3.125368356704712, | |
| "learning_rate": 7.175879396984924e-07, | |
| "loss": 0.8874, | |
| "step": 99800 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 8.932684898376465, | |
| "learning_rate": 4.160804020100502e-07, | |
| "loss": 0.858, | |
| "step": 99900 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 5.0273756980896, | |
| "learning_rate": 1.1457286432160803e-07, | |
| "loss": 0.8394, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": 0.9212185144424438, | |
| "eval_runtime": 38.102, | |
| "eval_samples_per_second": 26.245, | |
| "eval_steps_per_second": 3.281, | |
| "step": 100000 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 100000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 1000, | |
| "total_flos": 1.2076594495488e+18, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
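
The object above follows the Hugging Face Trainer `trainer_state.json` layout: `log_history` interleaves training records (with `loss`, `grad_norm`, `learning_rate`) and evaluation records (with `eval_loss`, logged every 1000 steps). A minimal sketch of how one might pull the evaluation-loss trajectory back out of such a file is below; it assumes the pipe-wrapped rendering is restored to plain JSON and saved under the conventional (here hypothetical) name `trainer_state.json`.

```python
import json

# Minimal sketch: load a Trainer state file (assumed saved as plain JSON under
# the conventional name "trainer_state.json") and extract the eval_loss curve.
with open("trainer_state.json") as f:
    state = json.load(f)

# log_history mixes training records ("loss") and evaluation records ("eval_loss");
# keep only the evaluation records, keyed by their global step.
eval_points = [
    (entry["step"], entry["eval_loss"])
    for entry in state["log_history"]
    if "eval_loss" in entry
]

best_step, best_loss = min(eval_points, key=lambda p: p[1])
last_step, last_loss = eval_points[-1]
print(f"evaluations logged: {len(eval_points)}")
print(f"final eval_loss:    {last_loss:.4f} at step {last_step}")
print(f"best eval_loss:     {best_loss:.4f} at step {best_step}")
```

Such a scan can be useful here: in the tail of this log the lowest eval_loss (0.8923 at step 98000) precedes the final evaluation (0.9212 at step 100000), so the last checkpoint is not necessarily the best one.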