| { |
| "best_metric": 0.21740412712097168, |
| "best_model_checkpoint": "learning_source_20260316/compounds/bert-output/compounds-large/checkpoint-57000", |
| "epoch": 5.783313694163914, |
| "eval_steps": 100, |
| "global_step": 60000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.009638856156939856, |
| "grad_norm": 12.63675594329834, |
| "learning_rate": 3e-06, |
| "loss": 3.5958, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.009638856156939856, |
| "eval_loss": 2.634005546569824, |
| "eval_runtime": 59.03, |
| "eval_samples_per_second": 169.405, |
| "eval_steps_per_second": 21.176, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.019277712313879713, |
| "grad_norm": 7.811080455780029, |
| "learning_rate": 6e-06, |
| "loss": 2.6098, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.019277712313879713, |
| "eval_loss": 2.495374917984009, |
| "eval_runtime": 59.2811, |
| "eval_samples_per_second": 168.688, |
| "eval_steps_per_second": 21.086, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.028916568470819567, |
| "grad_norm": 3.372339963912964, |
| "learning_rate": 5.989966555183947e-06, |
| "loss": 2.4015, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.028916568470819567, |
| "eval_loss": 2.288954734802246, |
| "eval_runtime": 59.465, |
| "eval_samples_per_second": 168.166, |
| "eval_steps_per_second": 21.021, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.038555424627759426, |
| "grad_norm": 28.15775489807129, |
| "learning_rate": 5.979933110367893e-06, |
| "loss": 2.2342, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.038555424627759426, |
| "eval_loss": 2.1951370239257812, |
| "eval_runtime": 59.6014, |
| "eval_samples_per_second": 167.781, |
| "eval_steps_per_second": 20.973, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.04819428078469928, |
| "grad_norm": 8.421930313110352, |
| "learning_rate": 5.96989966555184e-06, |
| "loss": 2.1235, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.04819428078469928, |
| "eval_loss": 2.0739598274230957, |
| "eval_runtime": 59.6726, |
| "eval_samples_per_second": 167.581, |
| "eval_steps_per_second": 20.948, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.057833136941639135, |
| "grad_norm": 5.261558532714844, |
| "learning_rate": 5.959866220735786e-06, |
| "loss": 2.0483, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.057833136941639135, |
| "eval_loss": 2.0082836151123047, |
| "eval_runtime": 59.569, |
| "eval_samples_per_second": 167.873, |
| "eval_steps_per_second": 20.984, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.06747199309857899, |
| "grad_norm": 6.819973945617676, |
| "learning_rate": 5.949832775919732e-06, |
| "loss": 1.9985, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.06747199309857899, |
| "eval_loss": 1.995147466659546, |
| "eval_runtime": 59.5165, |
| "eval_samples_per_second": 168.021, |
| "eval_steps_per_second": 21.003, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.07711084925551885, |
| "grad_norm": 10.216808319091797, |
| "learning_rate": 5.939799331103679e-06, |
| "loss": 1.9442, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.07711084925551885, |
| "eval_loss": 1.901943325996399, |
| "eval_runtime": 59.7859, |
| "eval_samples_per_second": 167.264, |
| "eval_steps_per_second": 20.908, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.0867497054124587, |
| "grad_norm": 4.163369178771973, |
| "learning_rate": 5.929765886287626e-06, |
| "loss": 1.9063, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.0867497054124587, |
| "eval_loss": 1.8962064981460571, |
| "eval_runtime": 59.5545, |
| "eval_samples_per_second": 167.913, |
| "eval_steps_per_second": 20.989, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.09638856156939855, |
| "grad_norm": 6.629236698150635, |
| "learning_rate": 5.919732441471572e-06, |
| "loss": 1.865, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.09638856156939855, |
| "eval_loss": 1.8345894813537598, |
| "eval_runtime": 59.5791, |
| "eval_samples_per_second": 167.844, |
| "eval_steps_per_second": 20.981, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.10602741772633842, |
| "grad_norm": 5.353513240814209, |
| "learning_rate": 5.9096989966555185e-06, |
| "loss": 1.829, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.10602741772633842, |
| "eval_loss": 1.8244454860687256, |
| "eval_runtime": 59.7272, |
| "eval_samples_per_second": 167.428, |
| "eval_steps_per_second": 20.928, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.11566627388327827, |
| "grad_norm": 8.38032341003418, |
| "learning_rate": 5.899665551839465e-06, |
| "loss": 1.7982, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.11566627388327827, |
| "eval_loss": 1.8015681505203247, |
| "eval_runtime": 59.5571, |
| "eval_samples_per_second": 167.906, |
| "eval_steps_per_second": 20.988, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.12530513004021812, |
| "grad_norm": 4.500089645385742, |
| "learning_rate": 5.889632107023412e-06, |
| "loss": 1.7623, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.12530513004021812, |
| "eval_loss": 1.7761175632476807, |
| "eval_runtime": 59.5755, |
| "eval_samples_per_second": 167.854, |
| "eval_steps_per_second": 20.982, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.13494398619715797, |
| "grad_norm": 4.475337028503418, |
| "learning_rate": 5.879598662207358e-06, |
| "loss": 1.741, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.13494398619715797, |
| "eval_loss": 1.7422441244125366, |
| "eval_runtime": 59.526, |
| "eval_samples_per_second": 167.994, |
| "eval_steps_per_second": 20.999, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.14458284235409785, |
| "grad_norm": 4.09720516204834, |
| "learning_rate": 5.869565217391305e-06, |
| "loss": 1.715, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.14458284235409785, |
| "eval_loss": 1.7092541456222534, |
| "eval_runtime": 59.7218, |
| "eval_samples_per_second": 167.443, |
| "eval_steps_per_second": 20.93, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.1542216985110377, |
| "grad_norm": 3.275571584701538, |
| "learning_rate": 5.8595317725752514e-06, |
| "loss": 1.6867, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.1542216985110377, |
| "eval_loss": 1.7210519313812256, |
| "eval_runtime": 59.5686, |
| "eval_samples_per_second": 167.874, |
| "eval_steps_per_second": 20.984, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.16386055466797755, |
| "grad_norm": 3.5458016395568848, |
| "learning_rate": 5.849498327759197e-06, |
| "loss": 1.6636, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.16386055466797755, |
| "eval_loss": 1.6789331436157227, |
| "eval_runtime": 59.7455, |
| "eval_samples_per_second": 167.376, |
| "eval_steps_per_second": 20.922, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.1734994108249174, |
| "grad_norm": 3.0849575996398926, |
| "learning_rate": 5.839464882943144e-06, |
| "loss": 1.6401, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.1734994108249174, |
| "eval_loss": 1.6254615783691406, |
| "eval_runtime": 59.5834, |
| "eval_samples_per_second": 167.832, |
| "eval_steps_per_second": 20.979, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.18313826698185726, |
| "grad_norm": 3.4666318893432617, |
| "learning_rate": 5.829431438127091e-06, |
| "loss": 1.6181, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.18313826698185726, |
| "eval_loss": 1.6075893640518188, |
| "eval_runtime": 59.5639, |
| "eval_samples_per_second": 167.887, |
| "eval_steps_per_second": 20.986, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.1927771231387971, |
| "grad_norm": 3.3370141983032227, |
| "learning_rate": 5.819397993311037e-06, |
| "loss": 1.597, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.1927771231387971, |
| "eval_loss": 1.6024377346038818, |
| "eval_runtime": 59.7218, |
| "eval_samples_per_second": 167.443, |
| "eval_steps_per_second": 20.93, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.202415979295737, |
| "grad_norm": 4.16128396987915, |
| "learning_rate": 5.8093645484949836e-06, |
| "loss": 1.5706, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.202415979295737, |
| "eval_loss": 1.5627351999282837, |
| "eval_runtime": 58.7245, |
| "eval_samples_per_second": 170.287, |
| "eval_steps_per_second": 21.286, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.21205483545267684, |
| "grad_norm": 4.344303131103516, |
| "learning_rate": 5.79933110367893e-06, |
| "loss": 1.5538, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.21205483545267684, |
| "eval_loss": 1.5229161977767944, |
| "eval_runtime": 58.5407, |
| "eval_samples_per_second": 170.821, |
| "eval_steps_per_second": 21.353, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.2216936916096167, |
| "grad_norm": 3.6315417289733887, |
| "learning_rate": 5.789297658862876e-06, |
| "loss": 1.5317, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.2216936916096167, |
| "eval_loss": 1.5036700963974, |
| "eval_runtime": 58.5025, |
| "eval_samples_per_second": 170.933, |
| "eval_steps_per_second": 21.367, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.23133254776655654, |
| "grad_norm": 3.0974552631378174, |
| "learning_rate": 5.779264214046823e-06, |
| "loss": 1.5018, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.23133254776655654, |
| "eval_loss": 1.4860730171203613, |
| "eval_runtime": 58.715, |
| "eval_samples_per_second": 170.314, |
| "eval_steps_per_second": 21.289, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.2409714039234964, |
| "grad_norm": 3.808483362197876, |
| "learning_rate": 5.76923076923077e-06, |
| "loss": 1.4821, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.2409714039234964, |
| "eval_loss": 1.446873664855957, |
| "eval_runtime": 60.847, |
| "eval_samples_per_second": 164.347, |
| "eval_steps_per_second": 20.543, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.25061026008043624, |
| "grad_norm": 4.335411071777344, |
| "learning_rate": 5.759197324414716e-06, |
| "loss": 1.4527, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.25061026008043624, |
| "eval_loss": 1.4302533864974976, |
| "eval_runtime": 58.726, |
| "eval_samples_per_second": 170.282, |
| "eval_steps_per_second": 21.285, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.2602491162373761, |
| "grad_norm": 3.0945868492126465, |
| "learning_rate": 5.7491638795986624e-06, |
| "loss": 1.4247, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.2602491162373761, |
| "eval_loss": 1.386735200881958, |
| "eval_runtime": 58.8022, |
| "eval_samples_per_second": 170.062, |
| "eval_steps_per_second": 21.258, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.26988797239431594, |
| "grad_norm": 3.442647933959961, |
| "learning_rate": 5.739130434782609e-06, |
| "loss": 1.3978, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.26988797239431594, |
| "eval_loss": 1.3386584520339966, |
| "eval_runtime": 58.8842, |
| "eval_samples_per_second": 169.825, |
| "eval_steps_per_second": 21.228, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.2795268285512558, |
| "grad_norm": 3.443277597427368, |
| "learning_rate": 5.729096989966555e-06, |
| "loss": 1.3653, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.2795268285512558, |
| "eval_loss": 1.3010079860687256, |
| "eval_runtime": 58.9775, |
| "eval_samples_per_second": 169.556, |
| "eval_steps_per_second": 21.195, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.2891656847081957, |
| "grad_norm": 2.790172576904297, |
| "learning_rate": 5.719063545150502e-06, |
| "loss": 1.3367, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.2891656847081957, |
| "eval_loss": 1.2636170387268066, |
| "eval_runtime": 58.8313, |
| "eval_samples_per_second": 169.978, |
| "eval_steps_per_second": 21.247, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.2988045408651355, |
| "grad_norm": 3.3221867084503174, |
| "learning_rate": 5.709030100334449e-06, |
| "loss": 1.2931, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.2988045408651355, |
| "eval_loss": 1.2360334396362305, |
| "eval_runtime": 58.8407, |
| "eval_samples_per_second": 169.95, |
| "eval_steps_per_second": 21.244, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.3084433970220754, |
| "grad_norm": 3.623957395553589, |
| "learning_rate": 5.698996655518395e-06, |
| "loss": 1.2604, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.3084433970220754, |
| "eval_loss": 1.1867709159851074, |
| "eval_runtime": 59.044, |
| "eval_samples_per_second": 169.365, |
| "eval_steps_per_second": 21.171, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.31808225317901523, |
| "grad_norm": 3.344835042953491, |
| "learning_rate": 5.688963210702341e-06, |
| "loss": 1.2309, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.31808225317901523, |
| "eval_loss": 1.1461621522903442, |
| "eval_runtime": 58.853, |
| "eval_samples_per_second": 169.915, |
| "eval_steps_per_second": 21.239, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.3277211093359551, |
| "grad_norm": 3.0504872798919678, |
| "learning_rate": 5.678929765886288e-06, |
| "loss": 1.2023, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.3277211093359551, |
| "eval_loss": 1.1227951049804688, |
| "eval_runtime": 58.8532, |
| "eval_samples_per_second": 169.914, |
| "eval_steps_per_second": 21.239, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.33735996549289493, |
| "grad_norm": 3.358616352081299, |
| "learning_rate": 5.668896321070235e-06, |
| "loss": 1.1815, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.33735996549289493, |
| "eval_loss": 1.0813926458358765, |
| "eval_runtime": 59.0352, |
| "eval_samples_per_second": 169.39, |
| "eval_steps_per_second": 21.174, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.3469988216498348, |
| "grad_norm": 2.9233672618865967, |
| "learning_rate": 5.658862876254181e-06, |
| "loss": 1.1533, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.3469988216498348, |
| "eval_loss": 1.0554317235946655, |
| "eval_runtime": 58.8485, |
| "eval_samples_per_second": 169.928, |
| "eval_steps_per_second": 21.241, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.3566376778067747, |
| "grad_norm": 2.5221972465515137, |
| "learning_rate": 5.6488294314381275e-06, |
| "loss": 1.1312, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.3566376778067747, |
| "eval_loss": 1.040281891822815, |
| "eval_runtime": 58.8638, |
| "eval_samples_per_second": 169.884, |
| "eval_steps_per_second": 21.235, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.3662765339637145, |
| "grad_norm": 2.611217737197876, |
| "learning_rate": 5.638795986622074e-06, |
| "loss": 1.109, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.3662765339637145, |
| "eval_loss": 1.0209678411483765, |
| "eval_runtime": 59.0369, |
| "eval_samples_per_second": 169.385, |
| "eval_steps_per_second": 21.173, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.3759153901206544, |
| "grad_norm": 2.4788668155670166, |
| "learning_rate": 5.62876254180602e-06, |
| "loss": 1.0839, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.3759153901206544, |
| "eval_loss": 0.9792139530181885, |
| "eval_runtime": 58.8512, |
| "eval_samples_per_second": 169.92, |
| "eval_steps_per_second": 21.24, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.3855542462775942, |
| "grad_norm": 2.7431282997131348, |
| "learning_rate": 5.618729096989967e-06, |
| "loss": 1.0636, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.3855542462775942, |
| "eval_loss": 0.9677358269691467, |
| "eval_runtime": 58.6358, |
| "eval_samples_per_second": 170.544, |
| "eval_steps_per_second": 21.318, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.3951931024345341, |
| "grad_norm": 3.1817986965179443, |
| "learning_rate": 5.608695652173914e-06, |
| "loss": 1.0438, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.3951931024345341, |
| "eval_loss": 0.9539806842803955, |
| "eval_runtime": 59.0244, |
| "eval_samples_per_second": 169.422, |
| "eval_steps_per_second": 21.178, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.404831958591474, |
| "grad_norm": 2.875279664993286, |
| "learning_rate": 5.59866220735786e-06, |
| "loss": 1.0221, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.404831958591474, |
| "eval_loss": 0.9230886697769165, |
| "eval_runtime": 58.867, |
| "eval_samples_per_second": 169.874, |
| "eval_steps_per_second": 21.234, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.4144708147484138, |
| "grad_norm": 3.0002613067626953, |
| "learning_rate": 5.588628762541806e-06, |
| "loss": 1.0044, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.4144708147484138, |
| "eval_loss": 0.9111505150794983, |
| "eval_runtime": 59.1064, |
| "eval_samples_per_second": 169.186, |
| "eval_steps_per_second": 21.148, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.4241096709053537, |
| "grad_norm": 3.194329261779785, |
| "learning_rate": 5.578595317725753e-06, |
| "loss": 0.9883, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.4241096709053537, |
| "eval_loss": 0.887302815914154, |
| "eval_runtime": 58.9661, |
| "eval_samples_per_second": 169.589, |
| "eval_steps_per_second": 21.199, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.4337485270622935, |
| "grad_norm": 2.66054630279541, |
| "learning_rate": 5.568561872909699e-06, |
| "loss": 0.9736, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.4337485270622935, |
| "eval_loss": 0.8647195100784302, |
| "eval_runtime": 59.1105, |
| "eval_samples_per_second": 169.175, |
| "eval_steps_per_second": 21.147, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.4433873832192334, |
| "grad_norm": 2.8895978927612305, |
| "learning_rate": 5.558528428093646e-06, |
| "loss": 0.9549, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.4433873832192334, |
| "eval_loss": 0.8601205348968506, |
| "eval_runtime": 58.9049, |
| "eval_samples_per_second": 169.765, |
| "eval_steps_per_second": 21.221, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.45302623937617326, |
| "grad_norm": 3.127106189727783, |
| "learning_rate": 5.548494983277593e-06, |
| "loss": 0.941, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.45302623937617326, |
| "eval_loss": 0.8301200270652771, |
| "eval_runtime": 58.8868, |
| "eval_samples_per_second": 169.817, |
| "eval_steps_per_second": 21.227, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.4626650955331131, |
| "grad_norm": 2.7476322650909424, |
| "learning_rate": 5.5384615384615385e-06, |
| "loss": 0.9228, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.4626650955331131, |
| "eval_loss": 0.8239719867706299, |
| "eval_runtime": 59.0464, |
| "eval_samples_per_second": 169.358, |
| "eval_steps_per_second": 21.17, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.47230395169005296, |
| "grad_norm": 2.541245460510254, |
| "learning_rate": 5.528428093645485e-06, |
| "loss": 0.9153, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.47230395169005296, |
| "eval_loss": 0.8104541301727295, |
| "eval_runtime": 58.8715, |
| "eval_samples_per_second": 169.861, |
| "eval_steps_per_second": 21.233, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.4819428078469928, |
| "grad_norm": 2.851243495941162, |
| "learning_rate": 5.518394648829432e-06, |
| "loss": 0.8994, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.4819428078469928, |
| "eval_loss": 0.8006957769393921, |
| "eval_runtime": 58.8695, |
| "eval_samples_per_second": 169.867, |
| "eval_steps_per_second": 21.233, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.49158166400393266, |
| "grad_norm": 2.5873661041259766, |
| "learning_rate": 5.508361204013378e-06, |
| "loss": 0.8893, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.49158166400393266, |
| "eval_loss": 0.7788259983062744, |
| "eval_runtime": 59.0735, |
| "eval_samples_per_second": 169.281, |
| "eval_steps_per_second": 21.16, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.5012205201608725, |
| "grad_norm": 2.6130380630493164, |
| "learning_rate": 5.498327759197324e-06, |
| "loss": 0.8711, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.5012205201608725, |
| "eval_loss": 0.7742456793785095, |
| "eval_runtime": 58.8663, |
| "eval_samples_per_second": 169.877, |
| "eval_steps_per_second": 21.235, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.5108593763178124, |
| "grad_norm": 2.606877565383911, |
| "learning_rate": 5.488294314381271e-06, |
| "loss": 0.8602, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.5108593763178124, |
| "eval_loss": 0.7536377310752869, |
| "eval_runtime": 58.833, |
| "eval_samples_per_second": 169.973, |
| "eval_steps_per_second": 21.247, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.5204982324747522, |
| "grad_norm": 2.596773624420166, |
| "learning_rate": 5.478260869565217e-06, |
| "loss": 0.853, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.5204982324747522, |
| "eval_loss": 0.747475802898407, |
| "eval_runtime": 59.0592, |
| "eval_samples_per_second": 169.322, |
| "eval_steps_per_second": 21.165, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.5301370886316921, |
| "grad_norm": 2.5571069717407227, |
| "learning_rate": 5.468227424749163e-06, |
| "loss": 0.8383, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.5301370886316921, |
| "eval_loss": 0.7383442521095276, |
| "eval_runtime": 58.8303, |
| "eval_samples_per_second": 169.98, |
| "eval_steps_per_second": 21.248, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.5397759447886319, |
| "grad_norm": 2.3702540397644043, |
| "learning_rate": 5.45819397993311e-06, |
| "loss": 0.8259, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.5397759447886319, |
| "eval_loss": 0.7310738563537598, |
| "eval_runtime": 58.825, |
| "eval_samples_per_second": 169.996, |
| "eval_steps_per_second": 21.249, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.5494148009455718, |
| "grad_norm": 2.6348025798797607, |
| "learning_rate": 5.448160535117057e-06, |
| "loss": 0.8094, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.5494148009455718, |
| "eval_loss": 0.7255334854125977, |
| "eval_runtime": 58.9121, |
| "eval_samples_per_second": 169.744, |
| "eval_steps_per_second": 21.218, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.5590536571025116, |
| "grad_norm": 2.291001796722412, |
| "learning_rate": 5.438127090301003e-06, |
| "loss": 0.8038, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.5590536571025116, |
| "eval_loss": 0.7195360064506531, |
| "eval_runtime": 59.0355, |
| "eval_samples_per_second": 169.389, |
| "eval_steps_per_second": 21.174, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.5686925132594515, |
| "grad_norm": 2.163550853729248, |
| "learning_rate": 5.4280936454849495e-06, |
| "loss": 0.7985, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.5686925132594515, |
| "eval_loss": 0.7090545892715454, |
| "eval_runtime": 59.11, |
| "eval_samples_per_second": 169.176, |
| "eval_steps_per_second": 21.147, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.5783313694163914, |
| "grad_norm": 2.325115442276001, |
| "learning_rate": 5.418060200668896e-06, |
| "loss": 0.7814, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.5783313694163914, |
| "eval_loss": 0.6920502185821533, |
| "eval_runtime": 58.907, |
| "eval_samples_per_second": 169.759, |
| "eval_steps_per_second": 21.22, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.5879702255733312, |
| "grad_norm": 2.495279550552368, |
| "learning_rate": 5.408026755852843e-06, |
| "loss": 0.7771, |
| "step": 6100 |
| }, |
| { |
| "epoch": 0.5879702255733312, |
| "eval_loss": 0.6769218444824219, |
| "eval_runtime": 58.9261, |
| "eval_samples_per_second": 169.704, |
| "eval_steps_per_second": 21.213, |
| "step": 6100 |
| }, |
| { |
| "epoch": 0.597609081730271, |
| "grad_norm": 2.5387184619903564, |
| "learning_rate": 5.397993311036789e-06, |
| "loss": 0.763, |
| "step": 6200 |
| }, |
| { |
| "epoch": 0.597609081730271, |
| "eval_loss": 0.6797191500663757, |
| "eval_runtime": 58.9067, |
| "eval_samples_per_second": 169.76, |
| "eval_steps_per_second": 21.22, |
| "step": 6200 |
| }, |
| { |
| "epoch": 0.6072479378872109, |
| "grad_norm": 2.340493679046631, |
| "learning_rate": 5.387959866220736e-06, |
| "loss": 0.7531, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.6072479378872109, |
| "eval_loss": 0.6665938496589661, |
| "eval_runtime": 59.074, |
| "eval_samples_per_second": 169.279, |
| "eval_steps_per_second": 21.16, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.6168867940441508, |
| "grad_norm": 2.8370766639709473, |
| "learning_rate": 5.3779264214046825e-06, |
| "loss": 0.7482, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.6168867940441508, |
| "eval_loss": 0.6552098393440247, |
| "eval_runtime": 58.9034, |
| "eval_samples_per_second": 169.769, |
| "eval_steps_per_second": 21.221, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.6265256502010906, |
| "grad_norm": 2.3310625553131104, |
| "learning_rate": 5.367892976588628e-06, |
| "loss": 0.7362, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.6265256502010906, |
| "eval_loss": 0.6410496234893799, |
| "eval_runtime": 58.8769, |
| "eval_samples_per_second": 169.846, |
| "eval_steps_per_second": 21.231, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.6361645063580305, |
| "grad_norm": 2.431213855743408, |
| "learning_rate": 5.357859531772575e-06, |
| "loss": 0.7287, |
| "step": 6600 |
| }, |
| { |
| "epoch": 0.6361645063580305, |
| "eval_loss": 0.6448028087615967, |
| "eval_runtime": 58.914, |
| "eval_samples_per_second": 169.739, |
| "eval_steps_per_second": 21.217, |
| "step": 6600 |
| }, |
| { |
| "epoch": 0.6458033625149704, |
| "grad_norm": 2.526439905166626, |
| "learning_rate": 5.347826086956522e-06, |
| "loss": 0.7191, |
| "step": 6700 |
| }, |
| { |
| "epoch": 0.6458033625149704, |
| "eval_loss": 0.6372745037078857, |
| "eval_runtime": 59.064, |
| "eval_samples_per_second": 169.308, |
| "eval_steps_per_second": 21.163, |
| "step": 6700 |
| }, |
| { |
| "epoch": 0.6554422186719102, |
| "grad_norm": 2.177318811416626, |
| "learning_rate": 5.337792642140468e-06, |
| "loss": 0.7133, |
| "step": 6800 |
| }, |
| { |
| "epoch": 0.6554422186719102, |
| "eval_loss": 0.6324734091758728, |
| "eval_runtime": 58.894, |
| "eval_samples_per_second": 169.796, |
| "eval_steps_per_second": 21.225, |
| "step": 6800 |
| }, |
| { |
| "epoch": 0.66508107482885, |
| "grad_norm": 2.04829740524292, |
| "learning_rate": 5.327759197324415e-06, |
| "loss": 0.7106, |
| "step": 6900 |
| }, |
| { |
| "epoch": 0.66508107482885, |
| "eval_loss": 0.6275954246520996, |
| "eval_runtime": 59.0721, |
| "eval_samples_per_second": 169.285, |
| "eval_steps_per_second": 21.161, |
| "step": 6900 |
| }, |
| { |
| "epoch": 0.6747199309857899, |
| "grad_norm": 2.3506946563720703, |
| "learning_rate": 5.317725752508361e-06, |
| "loss": 0.7021, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.6747199309857899, |
| "eval_loss": 0.6138682961463928, |
| "eval_runtime": 58.6329, |
| "eval_samples_per_second": 170.553, |
| "eval_steps_per_second": 21.319, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.6843587871427298, |
| "grad_norm": 2.0266621112823486, |
| "learning_rate": 5.307692307692307e-06, |
| "loss": 0.6934, |
| "step": 7100 |
| }, |
| { |
| "epoch": 0.6843587871427298, |
| "eval_loss": 0.6087481379508972, |
| "eval_runtime": 58.7542, |
| "eval_samples_per_second": 170.2, |
| "eval_steps_per_second": 21.275, |
| "step": 7100 |
| }, |
| { |
| "epoch": 0.6939976432996696, |
| "grad_norm": 2.079177141189575, |
| "learning_rate": 5.297658862876254e-06, |
| "loss": 0.6899, |
| "step": 7200 |
| }, |
| { |
| "epoch": 0.6939976432996696, |
| "eval_loss": 0.6015520095825195, |
| "eval_runtime": 58.7595, |
| "eval_samples_per_second": 170.185, |
| "eval_steps_per_second": 21.273, |
| "step": 7200 |
| }, |
| { |
| "epoch": 0.7036364994566094, |
| "grad_norm": 1.8569824695587158, |
| "learning_rate": 5.287625418060201e-06, |
| "loss": 0.6771, |
| "step": 7300 |
| }, |
| { |
| "epoch": 0.7036364994566094, |
| "eval_loss": 0.5964910984039307, |
| "eval_runtime": 58.8357, |
| "eval_samples_per_second": 169.965, |
| "eval_steps_per_second": 21.246, |
| "step": 7300 |
| }, |
| { |
| "epoch": 0.7132753556135494, |
| "grad_norm": 2.9515089988708496, |
| "learning_rate": 5.277591973244147e-06, |
| "loss": 0.6715, |
| "step": 7400 |
| }, |
| { |
| "epoch": 0.7132753556135494, |
| "eval_loss": 0.5907161235809326, |
| "eval_runtime": 59.0755, |
| "eval_samples_per_second": 169.275, |
| "eval_steps_per_second": 21.159, |
| "step": 7400 |
| }, |
| { |
| "epoch": 0.7229142117704892, |
| "grad_norm": 2.322965383529663, |
| "learning_rate": 5.2675585284280935e-06, |
| "loss": 0.6703, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.7229142117704892, |
| "eval_loss": 0.5861114263534546, |
| "eval_runtime": 58.8744, |
| "eval_samples_per_second": 169.853, |
| "eval_steps_per_second": 21.232, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.732553067927429, |
| "grad_norm": 2.576846122741699, |
| "learning_rate": 5.25752508361204e-06, |
| "loss": 0.6655, |
| "step": 7600 |
| }, |
| { |
| "epoch": 0.732553067927429, |
| "eval_loss": 0.5830049514770508, |
| "eval_runtime": 58.8479, |
| "eval_samples_per_second": 169.93, |
| "eval_steps_per_second": 21.241, |
| "step": 7600 |
| }, |
| { |
| "epoch": 0.742191924084369, |
| "grad_norm": 2.485050678253174, |
| "learning_rate": 5.247491638795986e-06, |
| "loss": 0.6531, |
| "step": 7700 |
| }, |
| { |
| "epoch": 0.742191924084369, |
| "eval_loss": 0.58382648229599, |
| "eval_runtime": 59.0846, |
| "eval_samples_per_second": 169.249, |
| "eval_steps_per_second": 21.156, |
| "step": 7700 |
| }, |
| { |
| "epoch": 0.7518307802413088, |
| "grad_norm": 2.1257286071777344, |
| "learning_rate": 5.237458193979933e-06, |
| "loss": 0.6468, |
| "step": 7800 |
| }, |
| { |
| "epoch": 0.7518307802413088, |
| "eval_loss": 0.5759472846984863, |
| "eval_runtime": 58.9027, |
| "eval_samples_per_second": 169.771, |
| "eval_steps_per_second": 21.221, |
| "step": 7800 |
| }, |
| { |
| "epoch": 0.7614696363982486, |
| "grad_norm": 2.0705039501190186, |
| "learning_rate": 5.22742474916388e-06, |
| "loss": 0.64, |
| "step": 7900 |
| }, |
| { |
| "epoch": 0.7614696363982486, |
| "eval_loss": 0.5682920813560486, |
| "eval_runtime": 58.8786, |
| "eval_samples_per_second": 169.841, |
| "eval_steps_per_second": 21.23, |
| "step": 7900 |
| }, |
| { |
| "epoch": 0.7711084925551884, |
| "grad_norm": 2.1057567596435547, |
| "learning_rate": 5.2173913043478265e-06, |
| "loss": 0.642, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.7711084925551884, |
| "eval_loss": 0.5687887072563171, |
| "eval_runtime": 58.9975, |
| "eval_samples_per_second": 169.499, |
| "eval_steps_per_second": 21.187, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.7807473487121284, |
| "grad_norm": 2.056955337524414, |
| "learning_rate": 5.207357859531772e-06, |
| "loss": 0.6348, |
| "step": 8100 |
| }, |
| { |
| "epoch": 0.7807473487121284, |
| "eval_loss": 0.5623904466629028, |
| "eval_runtime": 58.9219, |
| "eval_samples_per_second": 169.716, |
| "eval_steps_per_second": 21.215, |
| "step": 8100 |
| }, |
| { |
| "epoch": 0.7903862048690682, |
| "grad_norm": 2.312119483947754, |
| "learning_rate": 5.197324414715719e-06, |
| "loss": 0.628, |
| "step": 8200 |
| }, |
| { |
| "epoch": 0.7903862048690682, |
| "eval_loss": 0.5485402345657349, |
| "eval_runtime": 58.8197, |
| "eval_samples_per_second": 170.011, |
| "eval_steps_per_second": 21.251, |
| "step": 8200 |
| }, |
| { |
| "epoch": 0.800025061026008, |
| "grad_norm": 2.316204309463501, |
| "learning_rate": 5.187290969899666e-06, |
| "loss": 0.6219, |
| "step": 8300 |
| }, |
| { |
| "epoch": 0.800025061026008, |
| "eval_loss": 0.5563015341758728, |
| "eval_runtime": 58.9997, |
| "eval_samples_per_second": 169.492, |
| "eval_steps_per_second": 21.187, |
| "step": 8300 |
| }, |
| { |
| "epoch": 0.809663917182948, |
| "grad_norm": 2.1646835803985596, |
| "learning_rate": 5.177257525083612e-06, |
| "loss": 0.6229, |
| "step": 8400 |
| }, |
| { |
| "epoch": 0.809663917182948, |
| "eval_loss": 0.5380101799964905, |
| "eval_runtime": 58.5918, |
| "eval_samples_per_second": 170.672, |
| "eval_steps_per_second": 21.334, |
| "step": 8400 |
| }, |
| { |
| "epoch": 0.8193027733398878, |
| "grad_norm": 1.8544220924377441, |
| "learning_rate": 5.167224080267559e-06, |
| "loss": 0.616, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.8193027733398878, |
| "eval_loss": 0.5434267520904541, |
| "eval_runtime": 58.8918, |
| "eval_samples_per_second": 169.803, |
| "eval_steps_per_second": 21.225, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.8289416294968276, |
| "grad_norm": 1.9332796335220337, |
| "learning_rate": 5.157190635451505e-06, |
| "loss": 0.6112, |
| "step": 8600 |
| }, |
| { |
| "epoch": 0.8289416294968276, |
| "eval_loss": 0.5379143357276917, |
| "eval_runtime": 59.0748, |
| "eval_samples_per_second": 169.277, |
| "eval_steps_per_second": 21.16, |
| "step": 8600 |
| }, |
| { |
| "epoch": 0.8385804856537674, |
| "grad_norm": 1.9109045267105103, |
| "learning_rate": 5.147157190635451e-06, |
| "loss": 0.6065, |
| "step": 8700 |
| }, |
| { |
| "epoch": 0.8385804856537674, |
| "eval_loss": 0.5346018075942993, |
| "eval_runtime": 58.8974, |
| "eval_samples_per_second": 169.787, |
| "eval_steps_per_second": 21.223, |
| "step": 8700 |
| }, |
| { |
| "epoch": 0.8482193418107074, |
| "grad_norm": 2.635373115539551, |
| "learning_rate": 5.137123745819398e-06, |
| "loss": 0.6007, |
| "step": 8800 |
| }, |
| { |
| "epoch": 0.8482193418107074, |
| "eval_loss": 0.5312322378158569, |
| "eval_runtime": 59.1058, |
| "eval_samples_per_second": 169.188, |
| "eval_steps_per_second": 21.149, |
| "step": 8800 |
| }, |
| { |
| "epoch": 0.8578581979676472, |
| "grad_norm": 2.175607919692993, |
| "learning_rate": 5.127090301003345e-06, |
| "loss": 0.5994, |
| "step": 8900 |
| }, |
| { |
| "epoch": 0.8578581979676472, |
| "eval_loss": 0.5221748948097229, |
| "eval_runtime": 58.9139, |
| "eval_samples_per_second": 169.739, |
| "eval_steps_per_second": 21.217, |
| "step": 8900 |
| }, |
| { |
| "epoch": 0.867497054124587, |
| "grad_norm": 2.22357439994812, |
| "learning_rate": 5.117056856187291e-06, |
| "loss": 0.5891, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.867497054124587, |
| "eval_loss": 0.523512065410614, |
| "eval_runtime": 59.0804, |
| "eval_samples_per_second": 169.261, |
| "eval_steps_per_second": 21.158, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.8771359102815269, |
| "grad_norm": 2.295689582824707, |
| "learning_rate": 5.1070234113712375e-06, |
| "loss": 0.5906, |
| "step": 9100 |
| }, |
| { |
| "epoch": 0.8771359102815269, |
| "eval_loss": 0.524342954158783, |
| "eval_runtime": 58.896, |
| "eval_samples_per_second": 169.791, |
| "eval_steps_per_second": 21.224, |
| "step": 9100 |
| }, |
| { |
| "epoch": 0.8867747664384668, |
| "grad_norm": 1.769615650177002, |
| "learning_rate": 5.096989966555184e-06, |
| "loss": 0.5879, |
| "step": 9200 |
| }, |
| { |
| "epoch": 0.8867747664384668, |
| "eval_loss": 0.5283530354499817, |
| "eval_runtime": 58.9193, |
| "eval_samples_per_second": 169.724, |
| "eval_steps_per_second": 21.215, |
| "step": 9200 |
| }, |
| { |
| "epoch": 0.8964136225954066, |
| "grad_norm": 1.7100142240524292, |
| "learning_rate": 5.08695652173913e-06, |
| "loss": 0.5838, |
| "step": 9300 |
| }, |
| { |
| "epoch": 0.8964136225954066, |
| "eval_loss": 0.5141860842704773, |
| "eval_runtime": 59.0717, |
| "eval_samples_per_second": 169.286, |
| "eval_steps_per_second": 21.161, |
| "step": 9300 |
| }, |
| { |
| "epoch": 0.9060524787523465, |
| "grad_norm": 2.1682353019714355, |
| "learning_rate": 5.076923076923077e-06, |
| "loss": 0.5743, |
| "step": 9400 |
| }, |
| { |
| "epoch": 0.9060524787523465, |
| "eval_loss": 0.5123055577278137, |
| "eval_runtime": 58.9232, |
| "eval_samples_per_second": 169.713, |
| "eval_steps_per_second": 21.214, |
| "step": 9400 |
| }, |
| { |
| "epoch": 0.9156913349092863, |
| "grad_norm": 1.9889121055603027, |
| "learning_rate": 5.066889632107024e-06, |
| "loss": 0.5745, |
| "step": 9500 |
| }, |
| { |
| "epoch": 0.9156913349092863, |
| "eval_loss": 0.503046989440918, |
| "eval_runtime": 59.0466, |
| "eval_samples_per_second": 169.358, |
| "eval_steps_per_second": 21.17, |
| "step": 9500 |
| }, |
| { |
| "epoch": 0.9253301910662262, |
| "grad_norm": 1.9235670566558838, |
| "learning_rate": 5.05685618729097e-06, |
| "loss": 0.5695, |
| "step": 9600 |
| }, |
| { |
| "epoch": 0.9253301910662262, |
| "eval_loss": 0.5009673833847046, |
| "eval_runtime": 58.906, |
| "eval_samples_per_second": 169.762, |
| "eval_steps_per_second": 21.22, |
| "step": 9600 |
| }, |
| { |
| "epoch": 0.934969047223166, |
| "grad_norm": 2.0465199947357178, |
| "learning_rate": 5.046822742474916e-06, |
| "loss": 0.5687, |
| "step": 9700 |
| }, |
| { |
| "epoch": 0.934969047223166, |
| "eval_loss": 0.5043101906776428, |
| "eval_runtime": 58.9009, |
| "eval_samples_per_second": 169.777, |
| "eval_steps_per_second": 21.222, |
| "step": 9700 |
| }, |
| { |
| "epoch": 0.9446079033801059, |
| "grad_norm": 1.7765299081802368, |
| "learning_rate": 5.036789297658863e-06, |
| "loss": 0.5577, |
| "step": 9800 |
| }, |
| { |
| "epoch": 0.9446079033801059, |
| "eval_loss": 0.4905799329280853, |
| "eval_runtime": 59.0953, |
| "eval_samples_per_second": 169.218, |
| "eval_steps_per_second": 21.152, |
| "step": 9800 |
| }, |
| { |
| "epoch": 0.9542467595370457, |
| "grad_norm": 1.8332940340042114, |
| "learning_rate": 5.02675585284281e-06, |
| "loss": 0.5616, |
| "step": 9900 |
| }, |
| { |
| "epoch": 0.9542467595370457, |
| "eval_loss": 0.48462817072868347, |
| "eval_runtime": 58.826, |
| "eval_samples_per_second": 169.993, |
| "eval_steps_per_second": 21.249, |
| "step": 9900 |
| }, |
| { |
| "epoch": 0.9638856156939856, |
| "grad_norm": 1.9404815435409546, |
| "learning_rate": 5.016722408026756e-06, |
| "loss": 0.5576, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.9638856156939856, |
| "eval_loss": 0.4888601005077362, |
| "eval_runtime": 58.8596, |
| "eval_samples_per_second": 169.896, |
| "eval_steps_per_second": 21.237, |
| "step": 10000 |
| }, |
| { |
| "epoch": 0.9735244718509255, |
| "grad_norm": 1.857668399810791, |
| "learning_rate": 5.0066889632107026e-06, |
| "loss": 0.5503, |
| "step": 10100 |
| }, |
| { |
| "epoch": 0.9735244718509255, |
| "eval_loss": 0.48140355944633484, |
| "eval_runtime": 58.9457, |
| "eval_samples_per_second": 169.648, |
| "eval_steps_per_second": 21.206, |
| "step": 10100 |
| }, |
| { |
| "epoch": 0.9831633280078653, |
| "grad_norm": 1.7843375205993652, |
| "learning_rate": 4.996655518394649e-06, |
| "loss": 0.5477, |
| "step": 10200 |
| }, |
| { |
| "epoch": 0.9831633280078653, |
| "eval_loss": 0.4800611436367035, |
| "eval_runtime": 59.1103, |
| "eval_samples_per_second": 169.175, |
| "eval_steps_per_second": 21.147, |
| "step": 10200 |
| }, |
| { |
| "epoch": 0.9928021841648051, |
| "grad_norm": 1.8434568643569946, |
| "learning_rate": 4.986622073578595e-06, |
| "loss": 0.5434, |
| "step": 10300 |
| }, |
| { |
| "epoch": 0.9928021841648051, |
| "eval_loss": 0.47122901678085327, |
| "eval_runtime": 58.9167, |
| "eval_samples_per_second": 169.731, |
| "eval_steps_per_second": 21.216, |
| "step": 10300 |
| }, |
| { |
| "epoch": 1.002441040321745, |
| "grad_norm": 1.779107928276062, |
| "learning_rate": 4.976588628762542e-06, |
| "loss": 0.5412, |
| "step": 10400 |
| }, |
| { |
| "epoch": 1.002441040321745, |
| "eval_loss": 0.477405846118927, |
| "eval_runtime": 58.9086, |
| "eval_samples_per_second": 169.754, |
| "eval_steps_per_second": 21.219, |
| "step": 10400 |
| }, |
| { |
| "epoch": 1.0120798964786848, |
| "grad_norm": 2.1223175525665283, |
| "learning_rate": 4.966555183946489e-06, |
| "loss": 0.5395, |
| "step": 10500 |
| }, |
| { |
| "epoch": 1.0120798964786848, |
| "eval_loss": 0.48660337924957275, |
| "eval_runtime": 58.9373, |
| "eval_samples_per_second": 169.672, |
| "eval_steps_per_second": 21.209, |
| "step": 10500 |
| }, |
| { |
| "epoch": 1.0217187526356248, |
| "grad_norm": 1.8705182075500488, |
| "learning_rate": 4.956521739130435e-06, |
| "loss": 0.5384, |
| "step": 10600 |
| }, |
| { |
| "epoch": 1.0217187526356248, |
| "eval_loss": 0.4735105335712433, |
| "eval_runtime": 59.0979, |
| "eval_samples_per_second": 169.211, |
| "eval_steps_per_second": 21.151, |
| "step": 10600 |
| }, |
| { |
| "epoch": 1.0313576087925647, |
| "grad_norm": 2.186157464981079, |
| "learning_rate": 4.9464882943143815e-06, |
| "loss": 0.5318, |
| "step": 10700 |
| }, |
| { |
| "epoch": 1.0313576087925647, |
| "eval_loss": 0.47351425886154175, |
| "eval_runtime": 58.9071, |
| "eval_samples_per_second": 169.759, |
| "eval_steps_per_second": 21.22, |
| "step": 10700 |
| }, |
| { |
| "epoch": 1.0409964649495045, |
| "grad_norm": 2.1391384601593018, |
| "learning_rate": 4.936454849498328e-06, |
| "loss": 0.5304, |
| "step": 10800 |
| }, |
| { |
| "epoch": 1.0409964649495045, |
| "eval_loss": 0.4776832163333893, |
| "eval_runtime": 58.9018, |
| "eval_samples_per_second": 169.774, |
| "eval_steps_per_second": 21.222, |
| "step": 10800 |
| }, |
| { |
| "epoch": 1.0506353211064443, |
| "grad_norm": 1.825791835784912, |
| "learning_rate": 4.926421404682274e-06, |
| "loss": 0.5243, |
| "step": 10900 |
| }, |
| { |
| "epoch": 1.0506353211064443, |
| "eval_loss": 0.4646834433078766, |
| "eval_runtime": 58.9286, |
| "eval_samples_per_second": 169.697, |
| "eval_steps_per_second": 21.212, |
| "step": 10900 |
| }, |
| { |
| "epoch": 1.0602741772633841, |
| "grad_norm": 1.7971795797348022, |
| "learning_rate": 4.916387959866221e-06, |
| "loss": 0.5247, |
| "step": 11000 |
| }, |
| { |
| "epoch": 1.0602741772633841, |
| "eval_loss": 0.4617900252342224, |
| "eval_runtime": 59.0814, |
| "eval_samples_per_second": 169.258, |
| "eval_steps_per_second": 21.157, |
| "step": 11000 |
| }, |
| { |
| "epoch": 1.069913033420324, |
| "grad_norm": 1.6506189107894897, |
| "learning_rate": 4.906354515050168e-06, |
| "loss": 0.5204, |
| "step": 11100 |
| }, |
| { |
| "epoch": 1.069913033420324, |
| "eval_loss": 0.4610148072242737, |
| "eval_runtime": 58.8704, |
| "eval_samples_per_second": 169.865, |
| "eval_steps_per_second": 21.233, |
| "step": 11100 |
| }, |
| { |
| "epoch": 1.0795518895772638, |
| "grad_norm": 1.682634949684143, |
| "learning_rate": 4.8963210702341136e-06, |
| "loss": 0.5147, |
| "step": 11200 |
| }, |
| { |
| "epoch": 1.0795518895772638, |
| "eval_loss": 0.4534702003002167, |
| "eval_runtime": 58.872, |
| "eval_samples_per_second": 169.86, |
| "eval_steps_per_second": 21.232, |
| "step": 11200 |
| }, |
| { |
| "epoch": 1.0891907457342038, |
| "grad_norm": 1.6429296731948853, |
| "learning_rate": 4.88628762541806e-06, |
| "loss": 0.5163, |
| "step": 11300 |
| }, |
| { |
| "epoch": 1.0891907457342038, |
| "eval_loss": 0.45869511365890503, |
| "eval_runtime": 59.0441, |
| "eval_samples_per_second": 169.365, |
| "eval_steps_per_second": 21.171, |
| "step": 11300 |
| }, |
| { |
| "epoch": 1.0988296018911436, |
| "grad_norm": 1.7855538129806519, |
| "learning_rate": 4.876254180602007e-06, |
| "loss": 0.5093, |
| "step": 11400 |
| }, |
| { |
| "epoch": 1.0988296018911436, |
| "eval_loss": 0.45184823870658875, |
| "eval_runtime": 58.8447, |
| "eval_samples_per_second": 169.939, |
| "eval_steps_per_second": 21.242, |
| "step": 11400 |
| }, |
| { |
| "epoch": 1.1084684580480835, |
| "grad_norm": 1.6376959085464478, |
| "learning_rate": 4.866220735785953e-06, |
| "loss": 0.5107, |
| "step": 11500 |
| }, |
| { |
| "epoch": 1.1084684580480835, |
| "eval_loss": 0.4540416896343231, |
| "eval_runtime": 58.9029, |
| "eval_samples_per_second": 169.771, |
| "eval_steps_per_second": 21.221, |
| "step": 11500 |
| }, |
| { |
| "epoch": 1.1181073142050233, |
| "grad_norm": 1.8928024768829346, |
| "learning_rate": 4.8561872909699e-06, |
| "loss": 0.5067, |
| "step": 11600 |
| }, |
| { |
| "epoch": 1.1181073142050233, |
| "eval_loss": 0.44899114966392517, |
| "eval_runtime": 58.9326, |
| "eval_samples_per_second": 169.685, |
| "eval_steps_per_second": 21.211, |
| "step": 11600 |
| }, |
| { |
| "epoch": 1.1277461703619631, |
| "grad_norm": 1.828368902206421, |
| "learning_rate": 4.8461538461538465e-06, |
| "loss": 0.5064, |
| "step": 11700 |
| }, |
| { |
| "epoch": 1.1277461703619631, |
| "eval_loss": 0.4536297917366028, |
| "eval_runtime": 59.1031, |
| "eval_samples_per_second": 169.196, |
| "eval_steps_per_second": 21.149, |
| "step": 11700 |
| }, |
| { |
| "epoch": 1.137385026518903, |
| "grad_norm": 1.7877882719039917, |
| "learning_rate": 4.8361204013377925e-06, |
| "loss": 0.5009, |
| "step": 11800 |
| }, |
| { |
| "epoch": 1.137385026518903, |
| "eval_loss": 0.4467906355857849, |
| "eval_runtime": 58.8742, |
| "eval_samples_per_second": 169.854, |
| "eval_steps_per_second": 21.232, |
| "step": 11800 |
| }, |
| { |
| "epoch": 1.1470238826758428, |
| "grad_norm": 1.7465142011642456, |
| "learning_rate": 4.826086956521739e-06, |
| "loss": 0.5003, |
| "step": 11900 |
| }, |
| { |
| "epoch": 1.1470238826758428, |
| "eval_loss": 0.43356460332870483, |
| "eval_runtime": 58.9139, |
| "eval_samples_per_second": 169.739, |
| "eval_steps_per_second": 21.217, |
| "step": 11900 |
| }, |
| { |
| "epoch": 1.1566627388327828, |
| "grad_norm": 1.816881775856018, |
| "learning_rate": 4.816053511705686e-06, |
| "loss": 0.4993, |
| "step": 12000 |
| }, |
| { |
| "epoch": 1.1566627388327828, |
| "eval_loss": 0.43435898423194885, |
| "eval_runtime": 59.0853, |
| "eval_samples_per_second": 169.247, |
| "eval_steps_per_second": 21.156, |
| "step": 12000 |
| }, |
| { |
| "epoch": 1.1663015949897226, |
| "grad_norm": 1.6352607011795044, |
| "learning_rate": 4.806020066889633e-06, |
| "loss": 0.4953, |
| "step": 12100 |
| }, |
| { |
| "epoch": 1.1663015949897226, |
| "eval_loss": 0.44119584560394287, |
| "eval_runtime": 58.8922, |
| "eval_samples_per_second": 169.802, |
| "eval_steps_per_second": 21.225, |
| "step": 12100 |
| }, |
| { |
| "epoch": 1.1759404511466625, |
| "grad_norm": 1.791181206703186, |
| "learning_rate": 4.795986622073579e-06, |
| "loss": 0.4953, |
| "step": 12200 |
| }, |
| { |
| "epoch": 1.1759404511466625, |
| "eval_loss": 0.43825873732566833, |
| "eval_runtime": 59.0873, |
| "eval_samples_per_second": 169.241, |
| "eval_steps_per_second": 21.155, |
| "step": 12200 |
| }, |
| { |
| "epoch": 1.1855793073036023, |
| "grad_norm": 1.5742549896240234, |
| "learning_rate": 4.785953177257525e-06, |
| "loss": 0.4922, |
| "step": 12300 |
| }, |
| { |
| "epoch": 1.1855793073036023, |
| "eval_loss": 0.42847940325737, |
| "eval_runtime": 58.906, |
| "eval_samples_per_second": 169.762, |
| "eval_steps_per_second": 21.22, |
| "step": 12300 |
| }, |
| { |
| "epoch": 1.195218163460542, |
| "grad_norm": 1.666723608970642, |
| "learning_rate": 4.775919732441472e-06, |
| "loss": 0.4896, |
| "step": 12400 |
| }, |
| { |
| "epoch": 1.195218163460542, |
| "eval_loss": 0.4272569417953491, |
| "eval_runtime": 58.9642, |
| "eval_samples_per_second": 169.594, |
| "eval_steps_per_second": 21.199, |
| "step": 12400 |
| }, |
| { |
| "epoch": 1.204857019617482, |
| "grad_norm": 1.5803587436676025, |
| "learning_rate": 4.765886287625418e-06, |
| "loss": 0.4848, |
| "step": 12500 |
| }, |
| { |
| "epoch": 1.204857019617482, |
| "eval_loss": 0.4343504011631012, |
| "eval_runtime": 59.102, |
| "eval_samples_per_second": 169.199, |
| "eval_steps_per_second": 21.15, |
| "step": 12500 |
| }, |
| { |
| "epoch": 1.2144958757744218, |
| "grad_norm": 1.6863902807235718, |
| "learning_rate": 4.755852842809365e-06, |
| "loss": 0.486, |
| "step": 12600 |
| }, |
| { |
| "epoch": 1.2144958757744218, |
| "eval_loss": 0.42895078659057617, |
| "eval_runtime": 58.8995, |
| "eval_samples_per_second": 169.781, |
| "eval_steps_per_second": 21.223, |
| "step": 12600 |
| }, |
| { |
| "epoch": 1.2241347319313618, |
| "grad_norm": 1.6389436721801758, |
| "learning_rate": 4.745819397993312e-06, |
| "loss": 0.4787, |
| "step": 12700 |
| }, |
| { |
| "epoch": 1.2241347319313618, |
| "eval_loss": 0.43419140577316284, |
| "eval_runtime": 59.0691, |
| "eval_samples_per_second": 169.293, |
| "eval_steps_per_second": 21.162, |
| "step": 12700 |
| }, |
| { |
| "epoch": 1.2337735880883016, |
| "grad_norm": 1.7968857288360596, |
| "learning_rate": 4.7357859531772575e-06, |
| "loss": 0.4802, |
| "step": 12800 |
| }, |
| { |
| "epoch": 1.2337735880883016, |
| "eval_loss": 0.4281242787837982, |
| "eval_runtime": 58.9246, |
| "eval_samples_per_second": 169.708, |
| "eval_steps_per_second": 21.214, |
| "step": 12800 |
| }, |
| { |
| "epoch": 1.2434124442452414, |
| "grad_norm": 1.577661156654358, |
| "learning_rate": 4.725752508361204e-06, |
| "loss": 0.4741, |
| "step": 12900 |
| }, |
| { |
| "epoch": 1.2434124442452414, |
| "eval_loss": 0.42887967824935913, |
| "eval_runtime": 58.6822, |
| "eval_samples_per_second": 170.409, |
| "eval_steps_per_second": 21.301, |
| "step": 12900 |
| }, |
| { |
| "epoch": 1.2530513004021813, |
| "grad_norm": 1.6630301475524902, |
| "learning_rate": 4.715719063545151e-06, |
| "loss": 0.4763, |
| "step": 13000 |
| }, |
| { |
| "epoch": 1.2530513004021813, |
| "eval_loss": 0.41134434938430786, |
| "eval_runtime": 59.0454, |
| "eval_samples_per_second": 169.361, |
| "eval_steps_per_second": 21.17, |
| "step": 13000 |
| }, |
| { |
| "epoch": 1.262690156559121, |
| "grad_norm": 1.6969964504241943, |
| "learning_rate": 4.705685618729097e-06, |
| "loss": 0.4695, |
| "step": 13100 |
| }, |
| { |
| "epoch": 1.262690156559121, |
| "eval_loss": 0.4251042902469635, |
| "eval_runtime": 58.8993, |
| "eval_samples_per_second": 169.781, |
| "eval_steps_per_second": 21.223, |
| "step": 13100 |
| }, |
| { |
| "epoch": 1.272329012716061, |
| "grad_norm": 1.5483554601669312, |
| "learning_rate": 4.695652173913044e-06, |
| "loss": 0.4724, |
| "step": 13200 |
| }, |
| { |
| "epoch": 1.272329012716061, |
| "eval_loss": 0.4123547375202179, |
| "eval_runtime": 58.9538, |
| "eval_samples_per_second": 169.624, |
| "eval_steps_per_second": 21.203, |
| "step": 13200 |
| }, |
| { |
| "epoch": 1.281967868873001, |
| "grad_norm": 1.6243360042572021, |
| "learning_rate": 4.6856187290969905e-06, |
| "loss": 0.467, |
| "step": 13300 |
| }, |
| { |
| "epoch": 1.281967868873001, |
| "eval_loss": 0.41819024085998535, |
| "eval_runtime": 59.0792, |
| "eval_samples_per_second": 169.264, |
| "eval_steps_per_second": 21.158, |
| "step": 13300 |
| }, |
| { |
| "epoch": 1.2916067250299408, |
| "grad_norm": 1.5376485586166382, |
| "learning_rate": 4.675585284280936e-06, |
| "loss": 0.4649, |
| "step": 13400 |
| }, |
| { |
| "epoch": 1.2916067250299408, |
| "eval_loss": 0.4179925322532654, |
| "eval_runtime": 58.8694, |
| "eval_samples_per_second": 169.868, |
| "eval_steps_per_second": 21.233, |
| "step": 13400 |
| }, |
| { |
| "epoch": 1.3012455811868806, |
| "grad_norm": 1.6273411512374878, |
| "learning_rate": 4.665551839464883e-06, |
| "loss": 0.4619, |
| "step": 13500 |
| }, |
| { |
| "epoch": 1.3012455811868806, |
| "eval_loss": 0.40902629494667053, |
| "eval_runtime": 59.0101, |
| "eval_samples_per_second": 169.463, |
| "eval_steps_per_second": 21.183, |
| "step": 13500 |
| }, |
| { |
| "epoch": 1.3108844373438204, |
| "grad_norm": 1.4825048446655273, |
| "learning_rate": 4.65551839464883e-06, |
| "loss": 0.4621, |
| "step": 13600 |
| }, |
| { |
| "epoch": 1.3108844373438204, |
| "eval_loss": 0.407672256231308, |
| "eval_runtime": 58.9292, |
| "eval_samples_per_second": 169.695, |
| "eval_steps_per_second": 21.212, |
| "step": 13600 |
| }, |
| { |
| "epoch": 1.3205232935007603, |
| "grad_norm": 1.4943957328796387, |
| "learning_rate": 4.645484949832776e-06, |
| "loss": 0.4627, |
| "step": 13700 |
| }, |
| { |
| "epoch": 1.3205232935007603, |
| "eval_loss": 0.4040186405181885, |
| "eval_runtime": 58.8579, |
| "eval_samples_per_second": 169.901, |
| "eval_steps_per_second": 21.238, |
| "step": 13700 |
| }, |
| { |
| "epoch": 1.3301621496577, |
| "grad_norm": 1.5842742919921875, |
| "learning_rate": 4.635451505016723e-06, |
| "loss": 0.4658, |
| "step": 13800 |
| }, |
| { |
| "epoch": 1.3301621496577, |
| "eval_loss": 0.39733609557151794, |
| "eval_runtime": 59.0877, |
| "eval_samples_per_second": 169.24, |
| "eval_steps_per_second": 21.155, |
| "step": 13800 |
| }, |
| { |
| "epoch": 1.33980100581464, |
| "grad_norm": 1.5690122842788696, |
| "learning_rate": 4.625418060200669e-06, |
| "loss": 0.4602, |
| "step": 13900 |
| }, |
| { |
| "epoch": 1.33980100581464, |
| "eval_loss": 0.4122777283191681, |
| "eval_runtime": 58.8805, |
| "eval_samples_per_second": 169.836, |
| "eval_steps_per_second": 21.229, |
| "step": 13900 |
| }, |
| { |
| "epoch": 1.3494398619715797, |
| "grad_norm": 1.5999127626419067, |
| "learning_rate": 4.615384615384616e-06, |
| "loss": 0.4527, |
| "step": 14000 |
| }, |
| { |
| "epoch": 1.3494398619715797, |
| "eval_loss": 0.4009742736816406, |
| "eval_runtime": 58.908, |
| "eval_samples_per_second": 169.756, |
| "eval_steps_per_second": 21.22, |
| "step": 14000 |
| }, |
| { |
| "epoch": 1.3590787181285198, |
| "grad_norm": 1.482398271560669, |
| "learning_rate": 4.605351170568562e-06, |
| "loss": 0.4542, |
| "step": 14100 |
| }, |
| { |
| "epoch": 1.3590787181285198, |
| "eval_loss": 0.39838653802871704, |
| "eval_runtime": 59.0666, |
| "eval_samples_per_second": 169.3, |
| "eval_steps_per_second": 21.163, |
| "step": 14100 |
| }, |
| { |
| "epoch": 1.3687175742854596, |
| "grad_norm": 1.7599948644638062, |
| "learning_rate": 4.595317725752509e-06, |
| "loss": 0.4493, |
| "step": 14200 |
| }, |
| { |
| "epoch": 1.3687175742854596, |
| "eval_loss": 0.40471258759498596, |
| "eval_runtime": 58.9227, |
| "eval_samples_per_second": 169.714, |
| "eval_steps_per_second": 21.214, |
| "step": 14200 |
| }, |
| { |
| "epoch": 1.3783564304423994, |
| "grad_norm": 1.6642801761627197, |
| "learning_rate": 4.585284280936456e-06, |
| "loss": 0.4469, |
| "step": 14300 |
| }, |
| { |
| "epoch": 1.3783564304423994, |
| "eval_loss": 0.40282630920410156, |
| "eval_runtime": 58.8961, |
| "eval_samples_per_second": 169.79, |
| "eval_steps_per_second": 21.224, |
| "step": 14300 |
| }, |
| { |
| "epoch": 1.3879952865993392, |
| "grad_norm": 1.6965595483779907, |
| "learning_rate": 4.5752508361204015e-06, |
| "loss": 0.4468, |
| "step": 14400 |
| }, |
| { |
| "epoch": 1.3879952865993392, |
| "eval_loss": 0.3986397087574005, |
| "eval_runtime": 58.865, |
| "eval_samples_per_second": 169.88, |
| "eval_steps_per_second": 21.235, |
| "step": 14400 |
| }, |
| { |
| "epoch": 1.397634142756279, |
| "grad_norm": 1.5238665342330933, |
| "learning_rate": 4.565217391304348e-06, |
| "loss": 0.4475, |
| "step": 14500 |
| }, |
| { |
| "epoch": 1.397634142756279, |
| "eval_loss": 0.3981897532939911, |
| "eval_runtime": 59.0627, |
| "eval_samples_per_second": 169.312, |
| "eval_steps_per_second": 21.164, |
| "step": 14500 |
| }, |
| { |
| "epoch": 1.4072729989132189, |
| "grad_norm": 1.5769461393356323, |
| "learning_rate": 4.555183946488295e-06, |
| "loss": 0.4482, |
| "step": 14600 |
| }, |
| { |
| "epoch": 1.4072729989132189, |
| "eval_loss": 0.39567071199417114, |
| "eval_runtime": 58.8331, |
| "eval_samples_per_second": 169.972, |
| "eval_steps_per_second": 21.247, |
| "step": 14600 |
| }, |
| { |
| "epoch": 1.416911855070159, |
| "grad_norm": 1.604698896408081, |
| "learning_rate": 4.545150501672241e-06, |
| "loss": 0.4462, |
| "step": 14700 |
| }, |
| { |
| "epoch": 1.416911855070159, |
| "eval_loss": 0.3909485340118408, |
| "eval_runtime": 58.8119, |
| "eval_samples_per_second": 170.034, |
| "eval_steps_per_second": 21.254, |
| "step": 14700 |
| }, |
| { |
| "epoch": 1.4265507112270988, |
| "grad_norm": 1.5470887422561646, |
| "learning_rate": 4.535117056856188e-06, |
| "loss": 0.4413, |
| "step": 14800 |
| }, |
| { |
| "epoch": 1.4265507112270988, |
| "eval_loss": 0.3952423334121704, |
| "eval_runtime": 59.0239, |
| "eval_samples_per_second": 169.423, |
| "eval_steps_per_second": 21.178, |
| "step": 14800 |
| }, |
| { |
| "epoch": 1.4361895673840386, |
| "grad_norm": 1.5020021200180054, |
| "learning_rate": 4.5250836120401345e-06, |
| "loss": 0.44, |
| "step": 14900 |
| }, |
| { |
| "epoch": 1.4361895673840386, |
| "eval_loss": 0.3845099210739136, |
| "eval_runtime": 58.9294, |
| "eval_samples_per_second": 169.695, |
| "eval_steps_per_second": 21.212, |
| "step": 14900 |
| }, |
| { |
| "epoch": 1.4458284235409784, |
| "grad_norm": 1.6386624574661255, |
| "learning_rate": 4.51505016722408e-06, |
| "loss": 0.4397, |
| "step": 15000 |
| }, |
| { |
| "epoch": 1.4458284235409784, |
| "eval_loss": 0.38025835156440735, |
| "eval_runtime": 58.818, |
| "eval_samples_per_second": 170.016, |
| "eval_steps_per_second": 21.252, |
| "step": 15000 |
| }, |
| { |
| "epoch": 1.4554672796979182, |
| "grad_norm": 1.4663512706756592, |
| "learning_rate": 4.505016722408027e-06, |
| "loss": 0.4373, |
| "step": 15100 |
| }, |
| { |
| "epoch": 1.4554672796979182, |
| "eval_loss": 0.3857060968875885, |
| "eval_runtime": 58.8494, |
| "eval_samples_per_second": 169.925, |
| "eval_steps_per_second": 21.241, |
| "step": 15100 |
| }, |
| { |
| "epoch": 1.465106135854858, |
| "grad_norm": 1.6452239751815796, |
| "learning_rate": 4.494983277591973e-06, |
| "loss": 0.4334, |
| "step": 15200 |
| }, |
| { |
| "epoch": 1.465106135854858, |
| "eval_loss": 0.3850370943546295, |
| "eval_runtime": 58.9917, |
| "eval_samples_per_second": 169.515, |
| "eval_steps_per_second": 21.189, |
| "step": 15200 |
| }, |
| { |
| "epoch": 1.4747449920117979, |
| "grad_norm": 1.6599483489990234, |
| "learning_rate": 4.48494983277592e-06, |
| "loss": 0.434, |
| "step": 15300 |
| }, |
| { |
| "epoch": 1.4747449920117979, |
| "eval_loss": 0.38289549946784973, |
| "eval_runtime": 59.0362, |
| "eval_samples_per_second": 169.388, |
| "eval_steps_per_second": 21.173, |
| "step": 15300 |
| }, |
| { |
| "epoch": 1.4843838481687377, |
| "grad_norm": 1.665375828742981, |
| "learning_rate": 4.474916387959866e-06, |
| "loss": 0.4325, |
| "step": 15400 |
| }, |
| { |
| "epoch": 1.4843838481687377, |
| "eval_loss": 0.38493427634239197, |
| "eval_runtime": 58.7929, |
| "eval_samples_per_second": 170.088, |
| "eval_steps_per_second": 21.261, |
| "step": 15400 |
| }, |
| { |
| "epoch": 1.4940227043256777, |
| "grad_norm": 1.5351532697677612, |
| "learning_rate": 4.4648829431438125e-06, |
| "loss": 0.4292, |
| "step": 15500 |
| }, |
| { |
| "epoch": 1.4940227043256777, |
| "eval_loss": 0.3742278814315796, |
| "eval_runtime": 58.8841, |
| "eval_samples_per_second": 169.825, |
| "eval_steps_per_second": 21.228, |
| "step": 15500 |
| }, |
| { |
| "epoch": 1.5036615604826176, |
| "grad_norm": 1.4911319017410278, |
| "learning_rate": 4.454849498327759e-06, |
| "loss": 0.4265, |
| "step": 15600 |
| }, |
| { |
| "epoch": 1.5036615604826176, |
| "eval_loss": 0.3842261731624603, |
| "eval_runtime": 59.0288, |
| "eval_samples_per_second": 169.409, |
| "eval_steps_per_second": 21.176, |
| "step": 15600 |
| }, |
| { |
| "epoch": 1.5133004166395574, |
| "grad_norm": 1.4675853252410889, |
| "learning_rate": 4.444816053511705e-06, |
| "loss": 0.4237, |
| "step": 15700 |
| }, |
| { |
| "epoch": 1.5133004166395574, |
| "eval_loss": 0.3784289062023163, |
| "eval_runtime": 58.8858, |
| "eval_samples_per_second": 169.82, |
| "eval_steps_per_second": 21.228, |
| "step": 15700 |
| }, |
| { |
| "epoch": 1.5229392727964972, |
| "grad_norm": 1.5063199996948242, |
| "learning_rate": 4.434782608695652e-06, |
| "loss": 0.4251, |
| "step": 15800 |
| }, |
| { |
| "epoch": 1.5229392727964972, |
| "eval_loss": 0.3759336471557617, |
| "eval_runtime": 59.0897, |
| "eval_samples_per_second": 169.234, |
| "eval_steps_per_second": 21.154, |
| "step": 15800 |
| }, |
| { |
| "epoch": 1.5325781289534373, |
| "grad_norm": 1.4472297430038452, |
| "learning_rate": 4.424749163879599e-06, |
| "loss": 0.4248, |
| "step": 15900 |
| }, |
| { |
| "epoch": 1.5325781289534373, |
| "eval_loss": 0.37335318326950073, |
| "eval_runtime": 58.6972, |
| "eval_samples_per_second": 170.366, |
| "eval_steps_per_second": 21.296, |
| "step": 15900 |
| }, |
| { |
| "epoch": 1.542216985110377, |
| "grad_norm": 1.5391136407852173, |
| "learning_rate": 4.414715719063545e-06, |
| "loss": 0.4236, |
| "step": 16000 |
| }, |
| { |
| "epoch": 1.542216985110377, |
| "eval_loss": 0.3732203543186188, |
| "eval_runtime": 58.7578, |
| "eval_samples_per_second": 170.19, |
| "eval_steps_per_second": 21.274, |
| "step": 16000 |
| }, |
| { |
| "epoch": 1.551855841267317, |
| "grad_norm": 1.4061559438705444, |
| "learning_rate": 4.404682274247491e-06, |
| "loss": 0.4198, |
| "step": 16100 |
| }, |
| { |
| "epoch": 1.551855841267317, |
| "eval_loss": 0.37672555446624756, |
| "eval_runtime": 58.9119, |
| "eval_samples_per_second": 169.745, |
| "eval_steps_per_second": 21.218, |
| "step": 16100 |
| }, |
| { |
| "epoch": 1.5614946974242567, |
| "grad_norm": 1.5127891302108765, |
| "learning_rate": 4.394648829431438e-06, |
| "loss": 0.4201, |
| "step": 16200 |
| }, |
| { |
| "epoch": 1.5614946974242567, |
| "eval_loss": 0.38093164563179016, |
| "eval_runtime": 59.0714, |
| "eval_samples_per_second": 169.287, |
| "eval_steps_per_second": 21.161, |
| "step": 16200 |
| }, |
| { |
| "epoch": 1.5711335535811966, |
| "grad_norm": 1.487502098083496, |
| "learning_rate": 4.384615384615384e-06, |
| "loss": 0.417, |
| "step": 16300 |
| }, |
| { |
| "epoch": 1.5711335535811966, |
| "eval_loss": 0.372848242521286, |
| "eval_runtime": 58.9058, |
| "eval_samples_per_second": 169.763, |
| "eval_steps_per_second": 21.22, |
| "step": 16300 |
| }, |
| { |
| "epoch": 1.5807724097381364, |
| "grad_norm": 1.5137122869491577, |
| "learning_rate": 4.374581939799331e-06, |
| "loss": 0.4151, |
| "step": 16400 |
| }, |
| { |
| "epoch": 1.5807724097381364, |
| "eval_loss": 0.37606263160705566, |
| "eval_runtime": 58.8935, |
| "eval_samples_per_second": 169.798, |
| "eval_steps_per_second": 21.225, |
| "step": 16400 |
| }, |
| { |
| "epoch": 1.5904112658950762, |
| "grad_norm": 1.7061500549316406, |
| "learning_rate": 4.364548494983278e-06, |
| "loss": 0.4178, |
| "step": 16500 |
| }, |
| { |
| "epoch": 1.5904112658950762, |
| "eval_loss": 0.37127670645713806, |
| "eval_runtime": 59.0158, |
| "eval_samples_per_second": 169.446, |
| "eval_steps_per_second": 21.181, |
| "step": 16500 |
| }, |
| { |
| "epoch": 1.600050122052016, |
| "grad_norm": 1.618320345878601, |
| "learning_rate": 4.354515050167224e-06, |
| "loss": 0.4169, |
| "step": 16600 |
| }, |
| { |
| "epoch": 1.600050122052016, |
| "eval_loss": 0.37215638160705566, |
| "eval_runtime": 58.8708, |
| "eval_samples_per_second": 169.864, |
| "eval_steps_per_second": 21.233, |
| "step": 16600 |
| }, |
| { |
| "epoch": 1.6096889782089558, |
| "grad_norm": 1.5095313787460327, |
| "learning_rate": 4.34448160535117e-06, |
| "loss": 0.4117, |
| "step": 16700 |
| }, |
| { |
| "epoch": 1.6096889782089558, |
| "eval_loss": 0.368650883436203, |
| "eval_runtime": 59.0494, |
| "eval_samples_per_second": 169.35, |
| "eval_steps_per_second": 21.169, |
| "step": 16700 |
| }, |
| { |
| "epoch": 1.6193278343658957, |
| "grad_norm": 1.56742525100708, |
| "learning_rate": 4.334448160535117e-06, |
| "loss": 0.4111, |
| "step": 16800 |
| }, |
| { |
| "epoch": 1.6193278343658957, |
| "eval_loss": 0.36101919412612915, |
| "eval_runtime": 58.9246, |
| "eval_samples_per_second": 169.708, |
| "eval_steps_per_second": 21.214, |
| "step": 16800 |
| }, |
| { |
| "epoch": 1.6289666905228355, |
| "grad_norm": 1.5075291395187378, |
| "learning_rate": 4.324414715719064e-06, |
| "loss": 0.4132, |
| "step": 16900 |
| }, |
| { |
| "epoch": 1.6289666905228355, |
| "eval_loss": 0.3604821264743805, |
| "eval_runtime": 59.0583, |
| "eval_samples_per_second": 169.324, |
| "eval_steps_per_second": 21.166, |
| "step": 16900 |
| }, |
| { |
| "epoch": 1.6386055466797755, |
| "grad_norm": 1.6991218328475952, |
| "learning_rate": 4.31438127090301e-06, |
| "loss": 0.4103, |
| "step": 17000 |
| }, |
| { |
| "epoch": 1.6386055466797755, |
| "eval_loss": 0.35928505659103394, |
| "eval_runtime": 58.8981, |
| "eval_samples_per_second": 169.785, |
| "eval_steps_per_second": 21.223, |
| "step": 17000 |
| }, |
| { |
| "epoch": 1.6482444028367154, |
| "grad_norm": 1.4736828804016113, |
| "learning_rate": 4.3043478260869565e-06, |
| "loss": 0.4079, |
| "step": 17100 |
| }, |
| { |
| "epoch": 1.6482444028367154, |
| "eval_loss": 0.35935309529304504, |
| "eval_runtime": 58.8933, |
| "eval_samples_per_second": 169.799, |
| "eval_steps_per_second": 21.225, |
| "step": 17100 |
| }, |
| { |
| "epoch": 1.6578832589936552, |
| "grad_norm": 1.4704461097717285, |
| "learning_rate": 4.294314381270903e-06, |
| "loss": 0.4041, |
| "step": 17200 |
| }, |
| { |
| "epoch": 1.6578832589936552, |
| "eval_loss": 0.35896405577659607, |
| "eval_runtime": 59.0753, |
| "eval_samples_per_second": 169.276, |
| "eval_steps_per_second": 21.159, |
| "step": 17200 |
| }, |
| { |
| "epoch": 1.6675221151505952, |
| "grad_norm": 1.5166268348693848, |
| "learning_rate": 4.284280936454849e-06, |
| "loss": 0.4053, |
| "step": 17300 |
| }, |
| { |
| "epoch": 1.6675221151505952, |
| "eval_loss": 0.35407283902168274, |
| "eval_runtime": 58.899, |
| "eval_samples_per_second": 169.782, |
| "eval_steps_per_second": 21.223, |
| "step": 17300 |
| }, |
| { |
| "epoch": 1.677160971307535, |
| "grad_norm": 1.4285303354263306, |
| "learning_rate": 4.274247491638796e-06, |
| "loss": 0.4023, |
| "step": 17400 |
| }, |
| { |
| "epoch": 1.677160971307535, |
| "eval_loss": 0.3566473424434662, |
| "eval_runtime": 58.8279, |
| "eval_samples_per_second": 169.987, |
| "eval_steps_per_second": 21.248, |
| "step": 17400 |
| }, |
| { |
| "epoch": 1.6867998274644749, |
| "grad_norm": 1.6289827823638916, |
| "learning_rate": 4.264214046822743e-06, |
| "loss": 0.4019, |
| "step": 17500 |
| }, |
| { |
| "epoch": 1.6867998274644749, |
| "eval_loss": 0.35341745615005493, |
| "eval_runtime": 58.9406, |
| "eval_samples_per_second": 169.662, |
| "eval_steps_per_second": 21.208, |
| "step": 17500 |
| }, |
| { |
| "epoch": 1.6964386836214147, |
| "grad_norm": 1.4782038927078247, |
| "learning_rate": 4.254180602006689e-06, |
| "loss": 0.4021, |
| "step": 17600 |
| }, |
| { |
| "epoch": 1.6964386836214147, |
| "eval_loss": 0.35872602462768555, |
| "eval_runtime": 58.8807, |
| "eval_samples_per_second": 169.835, |
| "eval_steps_per_second": 21.229, |
| "step": 17600 |
| }, |
| { |
| "epoch": 1.7060775397783545, |
| "grad_norm": 1.544632077217102, |
| "learning_rate": 4.244147157190635e-06, |
| "loss": 0.3984, |
| "step": 17700 |
| }, |
| { |
| "epoch": 1.7060775397783545, |
| "eval_loss": 0.3579792380332947, |
| "eval_runtime": 59.0883, |
| "eval_samples_per_second": 169.238, |
| "eval_steps_per_second": 21.155, |
| "step": 17700 |
| }, |
| { |
| "epoch": 1.7157163959352943, |
| "grad_norm": 1.4467884302139282, |
| "learning_rate": 4.234113712374582e-06, |
| "loss": 0.3982, |
| "step": 17800 |
| }, |
| { |
| "epoch": 1.7157163959352943, |
| "eval_loss": 0.3508288562297821, |
| "eval_runtime": 58.9148, |
| "eval_samples_per_second": 169.737, |
| "eval_steps_per_second": 21.217, |
| "step": 17800 |
| }, |
| { |
| "epoch": 1.7253552520922342, |
| "grad_norm": 1.4046787023544312, |
| "learning_rate": 4.224080267558528e-06, |
| "loss": 0.3967, |
| "step": 17900 |
| }, |
| { |
| "epoch": 1.7253552520922342, |
| "eval_loss": 0.3544318974018097, |
| "eval_runtime": 58.917, |
| "eval_samples_per_second": 169.73, |
| "eval_steps_per_second": 21.216, |
| "step": 17900 |
| }, |
| { |
| "epoch": 1.734994108249174, |
| "grad_norm": 1.5111027956008911, |
| "learning_rate": 4.214046822742475e-06, |
| "loss": 0.3962, |
| "step": 18000 |
| }, |
| { |
| "epoch": 1.734994108249174, |
| "eval_loss": 0.3524232804775238, |
| "eval_runtime": 59.0409, |
| "eval_samples_per_second": 169.374, |
| "eval_steps_per_second": 21.172, |
| "step": 18000 |
| }, |
| { |
| "epoch": 1.7446329644061138, |
| "grad_norm": 1.469098687171936, |
| "learning_rate": 4.2040133779264216e-06, |
| "loss": 0.3968, |
| "step": 18100 |
| }, |
| { |
| "epoch": 1.7446329644061138, |
| "eval_loss": 0.352710485458374, |
| "eval_runtime": 58.913, |
| "eval_samples_per_second": 169.742, |
| "eval_steps_per_second": 21.218, |
| "step": 18100 |
| }, |
| { |
| "epoch": 1.7542718205630536, |
| "grad_norm": 1.253414511680603, |
| "learning_rate": 4.1939799331103675e-06, |
| "loss": 0.3952, |
| "step": 18200 |
| }, |
| { |
| "epoch": 1.7542718205630536, |
| "eval_loss": 0.3524448871612549, |
| "eval_runtime": 59.0814, |
| "eval_samples_per_second": 169.258, |
| "eval_steps_per_second": 21.157, |
| "step": 18200 |
| }, |
| { |
| "epoch": 1.7639106767199937, |
| "grad_norm": 1.425121545791626, |
| "learning_rate": 4.183946488294314e-06, |
| "loss": 0.3897, |
| "step": 18300 |
| }, |
| { |
| "epoch": 1.7639106767199937, |
| "eval_loss": 0.3566833436489105, |
| "eval_runtime": 58.9319, |
| "eval_samples_per_second": 169.687, |
| "eval_steps_per_second": 21.211, |
| "step": 18300 |
| }, |
| { |
| "epoch": 1.7735495328769335, |
| "grad_norm": 1.4836667776107788, |
| "learning_rate": 4.173913043478261e-06, |
| "loss": 0.391, |
| "step": 18400 |
| }, |
| { |
| "epoch": 1.7735495328769335, |
| "eval_loss": 0.3474622666835785, |
| "eval_runtime": 59.0355, |
| "eval_samples_per_second": 169.389, |
| "eval_steps_per_second": 21.174, |
| "step": 18400 |
| }, |
| { |
| "epoch": 1.7831883890338733, |
| "grad_norm": 1.3883644342422485, |
| "learning_rate": 4.163879598662208e-06, |
| "loss": 0.3928, |
| "step": 18500 |
| }, |
| { |
| "epoch": 1.7831883890338733, |
| "eval_loss": 0.35389477014541626, |
| "eval_runtime": 58.8499, |
| "eval_samples_per_second": 169.924, |
| "eval_steps_per_second": 21.24, |
| "step": 18500 |
| }, |
| { |
| "epoch": 1.7928272451908132, |
| "grad_norm": 1.4131203889846802, |
| "learning_rate": 4.153846153846154e-06, |
| "loss": 0.3856, |
| "step": 18600 |
| }, |
| { |
| "epoch": 1.7928272451908132, |
| "eval_loss": 0.34588733315467834, |
| "eval_runtime": 58.8426, |
| "eval_samples_per_second": 169.945, |
| "eval_steps_per_second": 21.243, |
| "step": 18600 |
| }, |
| { |
| "epoch": 1.8024661013477532, |
| "grad_norm": 1.407915711402893, |
| "learning_rate": 4.1438127090301005e-06, |
| "loss": 0.386, |
| "step": 18700 |
| }, |
| { |
| "epoch": 1.8024661013477532, |
| "eval_loss": 0.3406791687011719, |
| "eval_runtime": 59.0271, |
| "eval_samples_per_second": 169.414, |
| "eval_steps_per_second": 21.177, |
| "step": 18700 |
| }, |
| { |
| "epoch": 1.812104957504693, |
| "grad_norm": 1.5311362743377686, |
| "learning_rate": 4.133779264214047e-06, |
| "loss": 0.3856, |
| "step": 18800 |
| }, |
| { |
| "epoch": 1.812104957504693, |
| "eval_loss": 0.3458515703678131, |
| "eval_runtime": 58.7489, |
| "eval_samples_per_second": 170.216, |
| "eval_steps_per_second": 21.277, |
| "step": 18800 |
| }, |
| { |
| "epoch": 1.8217438136616328, |
| "grad_norm": 1.3238445520401, |
| "learning_rate": 4.123745819397993e-06, |
| "loss": 0.3837, |
| "step": 18900 |
| }, |
| { |
| "epoch": 1.8217438136616328, |
| "eval_loss": 0.3374645411968231, |
| "eval_runtime": 58.8018, |
| "eval_samples_per_second": 170.063, |
| "eval_steps_per_second": 21.258, |
| "step": 18900 |
| }, |
| { |
| "epoch": 1.8313826698185727, |
| "grad_norm": 1.363261342048645, |
| "learning_rate": 4.11371237458194e-06, |
| "loss": 0.3885, |
| "step": 19000 |
| }, |
| { |
| "epoch": 1.8313826698185727, |
| "eval_loss": 0.3434898853302002, |
| "eval_runtime": 58.8257, |
| "eval_samples_per_second": 169.994, |
| "eval_steps_per_second": 21.249, |
| "step": 19000 |
| }, |
| { |
| "epoch": 1.8410215259755125, |
| "grad_norm": 1.4662117958068848, |
| "learning_rate": 4.103678929765887e-06, |
| "loss": 0.3844, |
| "step": 19100 |
| }, |
| { |
| "epoch": 1.8410215259755125, |
| "eval_loss": 0.3399724066257477, |
| "eval_runtime": 58.9861, |
| "eval_samples_per_second": 169.532, |
| "eval_steps_per_second": 21.191, |
| "step": 19100 |
| }, |
| { |
| "epoch": 1.8506603821324523, |
| "grad_norm": 1.4030060768127441, |
| "learning_rate": 4.0936454849498326e-06, |
| "loss": 0.3842, |
| "step": 19200 |
| }, |
| { |
| "epoch": 1.8506603821324523, |
| "eval_loss": 0.3475658595561981, |
| "eval_runtime": 59.1051, |
| "eval_samples_per_second": 169.19, |
| "eval_steps_per_second": 21.149, |
| "step": 19200 |
| }, |
| { |
| "epoch": 1.8602992382893921, |
| "grad_norm": 1.3595353364944458, |
| "learning_rate": 4.083612040133779e-06, |
| "loss": 0.3852, |
| "step": 19300 |
| }, |
| { |
| "epoch": 1.8602992382893921, |
| "eval_loss": 0.3436114192008972, |
| "eval_runtime": 58.8926, |
| "eval_samples_per_second": 169.801, |
| "eval_steps_per_second": 21.225, |
| "step": 19300 |
| }, |
| { |
| "epoch": 1.869938094446332, |
| "grad_norm": 1.29786217212677, |
| "learning_rate": 4.073578595317726e-06, |
| "loss": 0.3773, |
| "step": 19400 |
| }, |
| { |
| "epoch": 1.869938094446332, |
| "eval_loss": 0.3408568799495697, |
| "eval_runtime": 58.9024, |
| "eval_samples_per_second": 169.772, |
| "eval_steps_per_second": 21.222, |
| "step": 19400 |
| }, |
| { |
| "epoch": 1.8795769506032718, |
| "grad_norm": 1.3584972620010376, |
| "learning_rate": 4.063545150501672e-06, |
| "loss": 0.3797, |
| "step": 19500 |
| }, |
| { |
| "epoch": 1.8795769506032718, |
| "eval_loss": 0.33426791429519653, |
| "eval_runtime": 59.1051, |
| "eval_samples_per_second": 169.19, |
| "eval_steps_per_second": 21.149, |
| "step": 19500 |
| }, |
| { |
| "epoch": 1.8892158067602116, |
| "grad_norm": 1.424649953842163, |
| "learning_rate": 4.053511705685619e-06, |
| "loss": 0.3786, |
| "step": 19600 |
| }, |
| { |
| "epoch": 1.8892158067602116, |
| "eval_loss": 0.33714547753334045, |
| "eval_runtime": 58.9501, |
| "eval_samples_per_second": 169.635, |
| "eval_steps_per_second": 21.204, |
| "step": 19600 |
| }, |
| { |
| "epoch": 1.8988546629171517, |
| "grad_norm": 1.4854973554611206, |
| "learning_rate": 4.0434782608695655e-06, |
| "loss": 0.3795, |
| "step": 19700 |
| }, |
| { |
| "epoch": 1.8988546629171517, |
| "eval_loss": 0.33414432406425476, |
| "eval_runtime": 59.106, |
| "eval_samples_per_second": 169.188, |
| "eval_steps_per_second": 21.148, |
| "step": 19700 |
| }, |
| { |
| "epoch": 1.9084935190740915, |
| "grad_norm": 1.3664216995239258, |
| "learning_rate": 4.0334448160535115e-06, |
| "loss": 0.3785, |
| "step": 19800 |
| }, |
| { |
| "epoch": 1.9084935190740915, |
| "eval_loss": 0.344148188829422, |
| "eval_runtime": 58.8743, |
| "eval_samples_per_second": 169.854, |
| "eval_steps_per_second": 21.232, |
| "step": 19800 |
| }, |
| { |
| "epoch": 1.9181323752310313, |
| "grad_norm": 1.3980112075805664, |
| "learning_rate": 4.023411371237458e-06, |
| "loss": 0.3764, |
| "step": 19900 |
| }, |
| { |
| "epoch": 1.9181323752310313, |
| "eval_loss": 0.33199581503868103, |
| "eval_runtime": 58.8637, |
| "eval_samples_per_second": 169.884, |
| "eval_steps_per_second": 21.236, |
| "step": 19900 |
| }, |
| { |
| "epoch": 1.9277712313879714, |
| "grad_norm": 1.4668302536010742, |
| "learning_rate": 4.013377926421405e-06, |
| "loss": 0.376, |
| "step": 20000 |
| }, |
| { |
| "epoch": 1.9277712313879714, |
| "eval_loss": 0.3381511867046356, |
| "eval_runtime": 58.8277, |
| "eval_samples_per_second": 169.988, |
| "eval_steps_per_second": 21.248, |
| "step": 20000 |
| }, |
| { |
| "epoch": 1.9374100875449112, |
| "grad_norm": 1.46481454372406, |
| "learning_rate": 4.003344481605351e-06, |
| "loss": 0.3765, |
| "step": 20100 |
| }, |
| { |
| "epoch": 1.9374100875449112, |
| "eval_loss": 0.33972933888435364, |
| "eval_runtime": 59.083, |
| "eval_samples_per_second": 169.253, |
| "eval_steps_per_second": 21.157, |
| "step": 20100 |
| }, |
| { |
| "epoch": 1.947048943701851, |
| "grad_norm": 1.3771693706512451, |
| "learning_rate": 3.993311036789298e-06, |
| "loss": 0.3742, |
| "step": 20200 |
| }, |
| { |
| "epoch": 1.947048943701851, |
| "eval_loss": 0.3317793607711792, |
| "eval_runtime": 58.8425, |
| "eval_samples_per_second": 169.945, |
| "eval_steps_per_second": 21.243, |
| "step": 20200 |
| }, |
| { |
| "epoch": 1.9566877998587908, |
| "grad_norm": 1.3235597610473633, |
| "learning_rate": 3.9832775919732444e-06, |
| "loss": 0.3715, |
| "step": 20300 |
| }, |
| { |
| "epoch": 1.9566877998587908, |
| "eval_loss": 0.32993626594543457, |
| "eval_runtime": 58.6506, |
| "eval_samples_per_second": 170.501, |
| "eval_steps_per_second": 21.313, |
| "step": 20300 |
| }, |
| { |
| "epoch": 1.9663266560157306, |
| "grad_norm": 1.308266043663025, |
| "learning_rate": 3.97324414715719e-06, |
| "loss": 0.3723, |
| "step": 20400 |
| }, |
| { |
| "epoch": 1.9663266560157306, |
| "eval_loss": 0.3278212547302246, |
| "eval_runtime": 59.0497, |
| "eval_samples_per_second": 169.349, |
| "eval_steps_per_second": 21.169, |
| "step": 20400 |
| }, |
| { |
| "epoch": 1.9759655121726705, |
| "grad_norm": 1.32529878616333, |
| "learning_rate": 3.963210702341137e-06, |
| "loss": 0.3747, |
| "step": 20500 |
| }, |
| { |
| "epoch": 1.9759655121726705, |
| "eval_loss": 0.32411912083625793, |
| "eval_runtime": 59.0234, |
| "eval_samples_per_second": 169.424, |
| "eval_steps_per_second": 21.178, |
| "step": 20500 |
| }, |
| { |
| "epoch": 1.9856043683296103, |
| "grad_norm": 1.380763292312622, |
| "learning_rate": 3.953177257525084e-06, |
| "loss": 0.3685, |
| "step": 20600 |
| }, |
| { |
| "epoch": 1.9856043683296103, |
| "eval_loss": 0.33261290192604065, |
| "eval_runtime": 58.8622, |
| "eval_samples_per_second": 169.888, |
| "eval_steps_per_second": 21.236, |
| "step": 20600 |
| }, |
| { |
| "epoch": 1.9952432244865501, |
| "grad_norm": 1.405206322669983, |
| "learning_rate": 3.943143812709031e-06, |
| "loss": 0.371, |
| "step": 20700 |
| }, |
| { |
| "epoch": 1.9952432244865501, |
| "eval_loss": 0.3292562961578369, |
| "eval_runtime": 58.8554, |
| "eval_samples_per_second": 169.908, |
| "eval_steps_per_second": 21.239, |
| "step": 20700 |
| }, |
| { |
| "epoch": 2.00488208064349, |
| "grad_norm": 1.4182567596435547, |
| "learning_rate": 3.9331103678929765e-06, |
| "loss": 0.366, |
| "step": 20800 |
| }, |
| { |
| "epoch": 2.00488208064349, |
| "eval_loss": 0.32856816053390503, |
| "eval_runtime": 59.0258, |
| "eval_samples_per_second": 169.418, |
| "eval_steps_per_second": 21.177, |
| "step": 20800 |
| }, |
| { |
| "epoch": 2.0145209368004298, |
| "grad_norm": 1.3217737674713135, |
| "learning_rate": 3.923076923076923e-06, |
| "loss": 0.3662, |
| "step": 20900 |
| }, |
| { |
| "epoch": 2.0145209368004298, |
| "eval_loss": 0.3281511664390564, |
| "eval_runtime": 58.8512, |
| "eval_samples_per_second": 169.92, |
| "eval_steps_per_second": 21.24, |
| "step": 20900 |
| }, |
| { |
| "epoch": 2.0241597929573696, |
| "grad_norm": 1.3803727626800537, |
| "learning_rate": 3.91304347826087e-06, |
| "loss": 0.3643, |
| "step": 21000 |
| }, |
| { |
| "epoch": 2.0241597929573696, |
| "eval_loss": 0.31252560019493103, |
| "eval_runtime": 58.9054, |
| "eval_samples_per_second": 169.764, |
| "eval_steps_per_second": 21.22, |
| "step": 21000 |
| }, |
| { |
| "epoch": 2.0337986491143094, |
| "grad_norm": 1.3696268796920776, |
| "learning_rate": 3.903010033444816e-06, |
| "loss": 0.365, |
| "step": 21100 |
| }, |
| { |
| "epoch": 2.0337986491143094, |
| "eval_loss": 0.3254716694355011, |
| "eval_runtime": 59.0681, |
| "eval_samples_per_second": 169.296, |
| "eval_steps_per_second": 21.162, |
| "step": 21100 |
| }, |
| { |
| "epoch": 2.0434375052712497, |
| "grad_norm": 1.443435788154602, |
| "learning_rate": 3.892976588628763e-06, |
| "loss": 0.3616, |
| "step": 21200 |
| }, |
| { |
| "epoch": 2.0434375052712497, |
| "eval_loss": 0.3301333487033844, |
| "eval_runtime": 58.8781, |
| "eval_samples_per_second": 169.842, |
| "eval_steps_per_second": 21.23, |
| "step": 21200 |
| }, |
| { |
| "epoch": 2.0530763614281895, |
| "grad_norm": 1.2991629838943481, |
| "learning_rate": 3.8829431438127095e-06, |
| "loss": 0.3662, |
| "step": 21300 |
| }, |
| { |
| "epoch": 2.0530763614281895, |
| "eval_loss": 0.3216095566749573, |
| "eval_runtime": 58.8486, |
| "eval_samples_per_second": 169.928, |
| "eval_steps_per_second": 21.241, |
| "step": 21300 |
| }, |
| { |
| "epoch": 2.0627152175851293, |
| "grad_norm": 1.3617690801620483, |
| "learning_rate": 3.8729096989966554e-06, |
| "loss": 0.3609, |
| "step": 21400 |
| }, |
| { |
| "epoch": 2.0627152175851293, |
| "eval_loss": 0.33237212896347046, |
| "eval_runtime": 59.04, |
| "eval_samples_per_second": 169.377, |
| "eval_steps_per_second": 21.172, |
| "step": 21400 |
| }, |
| { |
| "epoch": 2.072354073742069, |
| "grad_norm": 1.4248161315917969, |
| "learning_rate": 3.862876254180602e-06, |
| "loss": 0.3627, |
| "step": 21500 |
| }, |
| { |
| "epoch": 2.072354073742069, |
| "eval_loss": 0.31593969464302063, |
| "eval_runtime": 58.8557, |
| "eval_samples_per_second": 169.907, |
| "eval_steps_per_second": 21.238, |
| "step": 21500 |
| }, |
| { |
| "epoch": 2.081992929899009, |
| "grad_norm": 1.3086020946502686, |
| "learning_rate": 3.852842809364549e-06, |
| "loss": 0.3594, |
| "step": 21600 |
| }, |
| { |
| "epoch": 2.081992929899009, |
| "eval_loss": 0.3166731595993042, |
| "eval_runtime": 58.9107, |
| "eval_samples_per_second": 169.749, |
| "eval_steps_per_second": 21.219, |
| "step": 21600 |
| }, |
| { |
| "epoch": 2.091631786055949, |
| "grad_norm": 1.3267004489898682, |
| "learning_rate": 3.842809364548495e-06, |
| "loss": 0.3594, |
| "step": 21700 |
| }, |
| { |
| "epoch": 2.091631786055949, |
| "eval_loss": 0.32076331973075867, |
| "eval_runtime": 59.1128, |
| "eval_samples_per_second": 169.168, |
| "eval_steps_per_second": 21.146, |
| "step": 21700 |
| }, |
| { |
| "epoch": 2.1012706422128886, |
| "grad_norm": 1.327438473701477, |
| "learning_rate": 3.832775919732442e-06, |
| "loss": 0.3567, |
| "step": 21800 |
| }, |
| { |
| "epoch": 2.1012706422128886, |
| "eval_loss": 0.3243319094181061, |
| "eval_runtime": 58.7006, |
| "eval_samples_per_second": 170.356, |
| "eval_steps_per_second": 21.295, |
| "step": 21800 |
| }, |
| { |
| "epoch": 2.1109094983698284, |
| "grad_norm": 1.3011143207550049, |
| "learning_rate": 3.822742474916388e-06, |
| "loss": 0.3605, |
| "step": 21900 |
| }, |
| { |
| "epoch": 2.1109094983698284, |
| "eval_loss": 0.3196314871311188, |
| "eval_runtime": 58.8983, |
| "eval_samples_per_second": 169.784, |
| "eval_steps_per_second": 21.223, |
| "step": 21900 |
| }, |
| { |
| "epoch": 2.1205483545267683, |
| "grad_norm": 1.337679386138916, |
| "learning_rate": 3.8127090301003347e-06, |
| "loss": 0.3605, |
| "step": 22000 |
| }, |
| { |
| "epoch": 2.1205483545267683, |
| "eval_loss": 0.32047805190086365, |
| "eval_runtime": 58.9713, |
| "eval_samples_per_second": 169.574, |
| "eval_steps_per_second": 21.197, |
| "step": 22000 |
| }, |
| { |
| "epoch": 2.130187210683708, |
| "grad_norm": 1.4192520380020142, |
| "learning_rate": 3.802675585284281e-06, |
| "loss": 0.3575, |
| "step": 22100 |
| }, |
| { |
| "epoch": 2.130187210683708, |
| "eval_loss": 0.31928718090057373, |
| "eval_runtime": 58.9231, |
| "eval_samples_per_second": 169.713, |
| "eval_steps_per_second": 21.214, |
| "step": 22100 |
| }, |
| { |
| "epoch": 2.139826066840648, |
| "grad_norm": 1.340483546257019, |
| "learning_rate": 3.792642140468228e-06, |
| "loss": 0.3582, |
| "step": 22200 |
| }, |
| { |
| "epoch": 2.139826066840648, |
| "eval_loss": 0.32130032777786255, |
| "eval_runtime": 58.8988, |
| "eval_samples_per_second": 169.783, |
| "eval_steps_per_second": 21.223, |
| "step": 22200 |
| }, |
| { |
| "epoch": 2.1494649229975877, |
| "grad_norm": 1.2616146802902222, |
| "learning_rate": 3.782608695652174e-06, |
| "loss": 0.3559, |
| "step": 22300 |
| }, |
| { |
| "epoch": 2.1494649229975877, |
| "eval_loss": 0.3182724118232727, |
| "eval_runtime": 59.0569, |
| "eval_samples_per_second": 169.328, |
| "eval_steps_per_second": 21.166, |
| "step": 22300 |
| }, |
| { |
| "epoch": 2.1591037791545276, |
| "grad_norm": 1.4252580404281616, |
| "learning_rate": 3.7725752508361205e-06, |
| "loss": 0.3529, |
| "step": 22400 |
| }, |
| { |
| "epoch": 2.1591037791545276, |
| "eval_loss": 0.3209792971611023, |
| "eval_runtime": 59.0164, |
| "eval_samples_per_second": 169.445, |
| "eval_steps_per_second": 21.181, |
| "step": 22400 |
| }, |
| { |
| "epoch": 2.1687426353114674, |
| "grad_norm": 1.3374176025390625, |
| "learning_rate": 3.7625418060200673e-06, |
| "loss": 0.3533, |
| "step": 22500 |
| }, |
| { |
| "epoch": 2.1687426353114674, |
| "eval_loss": 0.3203897476196289, |
| "eval_runtime": 58.852, |
| "eval_samples_per_second": 169.918, |
| "eval_steps_per_second": 21.24, |
| "step": 22500 |
| }, |
| { |
| "epoch": 2.1783814914684076, |
| "grad_norm": 1.366310715675354, |
| "learning_rate": 3.7525083612040136e-06, |
| "loss": 0.3518, |
| "step": 22600 |
| }, |
| { |
| "epoch": 2.1783814914684076, |
| "eval_loss": 0.31347817182540894, |
| "eval_runtime": 58.8483, |
| "eval_samples_per_second": 169.928, |
| "eval_steps_per_second": 21.241, |
| "step": 22600 |
| }, |
| { |
| "epoch": 2.1880203476253475, |
| "grad_norm": 1.4128994941711426, |
| "learning_rate": 3.74247491638796e-06, |
| "loss": 0.351, |
| "step": 22700 |
| }, |
| { |
| "epoch": 2.1880203476253475, |
| "eval_loss": 0.3184278905391693, |
| "eval_runtime": 59.0451, |
| "eval_samples_per_second": 169.362, |
| "eval_steps_per_second": 21.17, |
| "step": 22700 |
| }, |
| { |
| "epoch": 2.1976592037822873, |
| "grad_norm": 1.3148337602615356, |
| "learning_rate": 3.7324414715719067e-06, |
| "loss": 0.3523, |
| "step": 22800 |
| }, |
| { |
| "epoch": 2.1976592037822873, |
| "eval_loss": 0.31502121686935425, |
| "eval_runtime": 58.8414, |
| "eval_samples_per_second": 169.948, |
| "eval_steps_per_second": 21.244, |
| "step": 22800 |
| }, |
| { |
| "epoch": 2.207298059939227, |
| "grad_norm": 1.3226207494735718, |
| "learning_rate": 3.722408026755853e-06, |
| "loss": 0.3498, |
| "step": 22900 |
| }, |
| { |
| "epoch": 2.207298059939227, |
| "eval_loss": 0.32347571849823, |
| "eval_runtime": 58.8138, |
| "eval_samples_per_second": 170.028, |
| "eval_steps_per_second": 21.254, |
| "step": 22900 |
| }, |
| { |
| "epoch": 2.216936916096167, |
| "grad_norm": 1.4576925039291382, |
| "learning_rate": 3.7123745819398e-06, |
| "loss": 0.3487, |
| "step": 23000 |
| }, |
| { |
| "epoch": 2.216936916096167, |
| "eval_loss": 0.314643532037735, |
| "eval_runtime": 58.9415, |
| "eval_samples_per_second": 169.66, |
| "eval_steps_per_second": 21.207, |
| "step": 23000 |
| }, |
| { |
| "epoch": 2.2265757722531068, |
| "grad_norm": 1.3837294578552246, |
| "learning_rate": 3.702341137123746e-06, |
| "loss": 0.3513, |
| "step": 23100 |
| }, |
| { |
| "epoch": 2.2265757722531068, |
| "eval_loss": 0.3134578764438629, |
| "eval_runtime": 58.8134, |
| "eval_samples_per_second": 170.029, |
| "eval_steps_per_second": 21.254, |
| "step": 23100 |
| }, |
| { |
| "epoch": 2.2362146284100466, |
| "grad_norm": 1.2935632467269897, |
| "learning_rate": 3.6923076923076925e-06, |
| "loss": 0.349, |
| "step": 23200 |
| }, |
| { |
| "epoch": 2.2362146284100466, |
| "eval_loss": 0.31015080213546753, |
| "eval_runtime": 58.7672, |
| "eval_samples_per_second": 170.163, |
| "eval_steps_per_second": 21.27, |
| "step": 23200 |
| }, |
| { |
| "epoch": 2.2458534845669864, |
| "grad_norm": 1.2622168064117432, |
| "learning_rate": 3.6822742474916393e-06, |
| "loss": 0.3474, |
| "step": 23300 |
| }, |
| { |
| "epoch": 2.2458534845669864, |
| "eval_loss": 0.31574249267578125, |
| "eval_runtime": 58.725, |
| "eval_samples_per_second": 170.285, |
| "eval_steps_per_second": 21.286, |
| "step": 23300 |
| }, |
| { |
| "epoch": 2.2554923407239262, |
| "grad_norm": 1.5001434087753296, |
| "learning_rate": 3.6722408026755856e-06, |
| "loss": 0.3507, |
| "step": 23400 |
| }, |
| { |
| "epoch": 2.2554923407239262, |
| "eval_loss": 0.32215574383735657, |
| "eval_runtime": 58.7674, |
| "eval_samples_per_second": 170.162, |
| "eval_steps_per_second": 21.27, |
| "step": 23400 |
| }, |
| { |
| "epoch": 2.265131196880866, |
| "grad_norm": 1.3347852230072021, |
| "learning_rate": 3.662207357859532e-06, |
| "loss": 0.3453, |
| "step": 23500 |
| }, |
| { |
| "epoch": 2.265131196880866, |
| "eval_loss": 0.30782532691955566, |
| "eval_runtime": 58.7918, |
| "eval_samples_per_second": 170.092, |
| "eval_steps_per_second": 21.261, |
| "step": 23500 |
| }, |
| { |
| "epoch": 2.274770053037806, |
| "grad_norm": 1.2803244590759277, |
| "learning_rate": 3.6521739130434787e-06, |
| "loss": 0.3463, |
| "step": 23600 |
| }, |
| { |
| "epoch": 2.274770053037806, |
| "eval_loss": 0.3062191903591156, |
| "eval_runtime": 58.9104, |
| "eval_samples_per_second": 169.749, |
| "eval_steps_per_second": 21.219, |
| "step": 23600 |
| }, |
| { |
| "epoch": 2.2844089091947457, |
| "grad_norm": 1.2594212293624878, |
| "learning_rate": 3.642140468227425e-06, |
| "loss": 0.3449, |
| "step": 23700 |
| }, |
| { |
| "epoch": 2.2844089091947457, |
| "eval_loss": 0.30870264768600464, |
| "eval_runtime": 58.753, |
| "eval_samples_per_second": 170.204, |
| "eval_steps_per_second": 21.276, |
| "step": 23700 |
| }, |
| { |
| "epoch": 2.2940477653516855, |
| "grad_norm": 1.3449223041534424, |
| "learning_rate": 3.6321070234113714e-06, |
| "loss": 0.3421, |
| "step": 23800 |
| }, |
| { |
| "epoch": 2.2940477653516855, |
| "eval_loss": 0.30901628732681274, |
| "eval_runtime": 58.7205, |
| "eval_samples_per_second": 170.298, |
| "eval_steps_per_second": 21.287, |
| "step": 23800 |
| }, |
| { |
| "epoch": 2.303686621508626, |
| "grad_norm": 1.2479959726333618, |
| "learning_rate": 3.622073578595318e-06, |
| "loss": 0.343, |
| "step": 23900 |
| }, |
| { |
| "epoch": 2.303686621508626, |
| "eval_loss": 0.30690494179725647, |
| "eval_runtime": 58.9674, |
| "eval_samples_per_second": 169.585, |
| "eval_steps_per_second": 21.198, |
| "step": 23900 |
| }, |
| { |
| "epoch": 2.3133254776655656, |
| "grad_norm": 1.3553513288497925, |
| "learning_rate": 3.6120401337792645e-06, |
| "loss": 0.3429, |
| "step": 24000 |
| }, |
| { |
| "epoch": 2.3133254776655656, |
| "eval_loss": 0.3068706691265106, |
| "eval_runtime": 58.8271, |
| "eval_samples_per_second": 169.99, |
| "eval_steps_per_second": 21.249, |
| "step": 24000 |
| }, |
| { |
| "epoch": 2.3229643338225054, |
| "grad_norm": 1.3287960290908813, |
| "learning_rate": 3.6020066889632112e-06, |
| "loss": 0.3415, |
| "step": 24100 |
| }, |
| { |
| "epoch": 2.3229643338225054, |
| "eval_loss": 0.3021765947341919, |
| "eval_runtime": 58.7866, |
| "eval_samples_per_second": 170.107, |
| "eval_steps_per_second": 21.263, |
| "step": 24100 |
| }, |
| { |
| "epoch": 2.3326031899794453, |
| "grad_norm": 1.3244682550430298, |
| "learning_rate": 3.5919732441471576e-06, |
| "loss": 0.3394, |
| "step": 24200 |
| }, |
| { |
| "epoch": 2.3326031899794453, |
| "eval_loss": 0.30216312408447266, |
| "eval_runtime": 58.9521, |
| "eval_samples_per_second": 169.629, |
| "eval_steps_per_second": 21.204, |
| "step": 24200 |
| }, |
| { |
| "epoch": 2.342242046136385, |
| "grad_norm": 1.286342978477478, |
| "learning_rate": 3.581939799331104e-06, |
| "loss": 0.34, |
| "step": 24300 |
| }, |
| { |
| "epoch": 2.342242046136385, |
| "eval_loss": 0.3076727092266083, |
| "eval_runtime": 58.7703, |
| "eval_samples_per_second": 170.154, |
| "eval_steps_per_second": 21.269, |
| "step": 24300 |
| }, |
| { |
| "epoch": 2.351880902293325, |
| "grad_norm": 1.2560862302780151, |
| "learning_rate": 3.5719063545150507e-06, |
| "loss": 0.3344, |
| "step": 24400 |
| }, |
| { |
| "epoch": 2.351880902293325, |
| "eval_loss": 0.30005696415901184, |
| "eval_runtime": 58.9499, |
| "eval_samples_per_second": 169.635, |
| "eval_steps_per_second": 21.204, |
| "step": 24400 |
| }, |
| { |
| "epoch": 2.3615197584502647, |
| "grad_norm": 1.3560149669647217, |
| "learning_rate": 3.561872909698997e-06, |
| "loss": 0.3363, |
| "step": 24500 |
| }, |
| { |
| "epoch": 2.3615197584502647, |
| "eval_loss": 0.3021916449069977, |
| "eval_runtime": 58.7981, |
| "eval_samples_per_second": 170.073, |
| "eval_steps_per_second": 21.259, |
| "step": 24500 |
| }, |
| { |
| "epoch": 2.3711586146072046, |
| "grad_norm": 1.1849713325500488, |
| "learning_rate": 3.5518394648829434e-06, |
| "loss": 0.3367, |
| "step": 24600 |
| }, |
| { |
| "epoch": 2.3711586146072046, |
| "eval_loss": 0.3012247681617737, |
| "eval_runtime": 58.8396, |
| "eval_samples_per_second": 169.954, |
| "eval_steps_per_second": 21.244, |
| "step": 24600 |
| }, |
| { |
| "epoch": 2.3807974707641444, |
| "grad_norm": 1.2602722644805908, |
| "learning_rate": 3.54180602006689e-06, |
| "loss": 0.3376, |
| "step": 24700 |
| }, |
| { |
| "epoch": 2.3807974707641444, |
| "eval_loss": 0.304877907037735, |
| "eval_runtime": 59.0108, |
| "eval_samples_per_second": 169.46, |
| "eval_steps_per_second": 21.183, |
| "step": 24700 |
| }, |
| { |
| "epoch": 2.390436326921084, |
| "grad_norm": 1.194183111190796, |
| "learning_rate": 3.5317725752508365e-06, |
| "loss": 0.3354, |
| "step": 24800 |
| }, |
| { |
| "epoch": 2.390436326921084, |
| "eval_loss": 0.3006995618343353, |
| "eval_runtime": 58.5927, |
| "eval_samples_per_second": 170.67, |
| "eval_steps_per_second": 21.334, |
| "step": 24800 |
| }, |
| { |
| "epoch": 2.400075183078024, |
| "grad_norm": 1.3535934686660767, |
| "learning_rate": 3.521739130434783e-06, |
| "loss": 0.3381, |
| "step": 24900 |
| }, |
| { |
| "epoch": 2.400075183078024, |
| "eval_loss": 0.2990175485610962, |
| "eval_runtime": 58.785, |
| "eval_samples_per_second": 170.111, |
| "eval_steps_per_second": 21.264, |
| "step": 24900 |
| }, |
| { |
| "epoch": 2.409714039234964, |
| "grad_norm": 1.2365829944610596, |
| "learning_rate": 3.5117056856187296e-06, |
| "loss": 0.3381, |
| "step": 25000 |
| }, |
| { |
| "epoch": 2.409714039234964, |
| "eval_loss": 0.30268794298171997, |
| "eval_runtime": 58.955, |
| "eval_samples_per_second": 169.621, |
| "eval_steps_per_second": 21.203, |
| "step": 25000 |
| }, |
| { |
| "epoch": 2.4193528953919037, |
| "grad_norm": 1.3548235893249512, |
| "learning_rate": 3.501672240802676e-06, |
| "loss": 0.3381, |
| "step": 25100 |
| }, |
| { |
| "epoch": 2.4193528953919037, |
| "eval_loss": 0.3028050363063812, |
| "eval_runtime": 58.9666, |
| "eval_samples_per_second": 169.587, |
| "eval_steps_per_second": 21.198, |
| "step": 25100 |
| }, |
| { |
| "epoch": 2.4289917515488435, |
| "grad_norm": 1.3373470306396484, |
| "learning_rate": 3.491638795986622e-06, |
| "loss": 0.3347, |
| "step": 25200 |
| }, |
| { |
| "epoch": 2.4289917515488435, |
| "eval_loss": 0.3040294647216797, |
| "eval_runtime": 58.786, |
| "eval_samples_per_second": 170.109, |
| "eval_steps_per_second": 21.264, |
| "step": 25200 |
| }, |
| { |
| "epoch": 2.4386306077057833, |
| "grad_norm": 1.2805660963058472, |
| "learning_rate": 3.481605351170568e-06, |
| "loss": 0.3362, |
| "step": 25300 |
| }, |
| { |
| "epoch": 2.4386306077057833, |
| "eval_loss": 0.294087290763855, |
| "eval_runtime": 58.8166, |
| "eval_samples_per_second": 170.02, |
| "eval_steps_per_second": 21.252, |
| "step": 25300 |
| }, |
| { |
| "epoch": 2.4482694638627236, |
| "grad_norm": 1.2862471342086792, |
| "learning_rate": 3.471571906354515e-06, |
| "loss": 0.332, |
| "step": 25400 |
| }, |
| { |
| "epoch": 2.4482694638627236, |
| "eval_loss": 0.3031771183013916, |
| "eval_runtime": 58.8813, |
| "eval_samples_per_second": 169.833, |
| "eval_steps_per_second": 21.229, |
| "step": 25400 |
| }, |
| { |
| "epoch": 2.4579083200196634, |
| "grad_norm": 1.2386327981948853, |
| "learning_rate": 3.4615384615384613e-06, |
| "loss": 0.3361, |
| "step": 25500 |
| }, |
| { |
| "epoch": 2.4579083200196634, |
| "eval_loss": 0.30101174116134644, |
| "eval_runtime": 58.7288, |
| "eval_samples_per_second": 170.274, |
| "eval_steps_per_second": 21.284, |
| "step": 25500 |
| }, |
| { |
| "epoch": 2.4675471761766032, |
| "grad_norm": 1.2309975624084473, |
| "learning_rate": 3.4515050167224076e-06, |
| "loss": 0.3324, |
| "step": 25600 |
| }, |
| { |
| "epoch": 2.4675471761766032, |
| "eval_loss": 0.2973729968070984, |
| "eval_runtime": 58.9177, |
| "eval_samples_per_second": 169.728, |
| "eval_steps_per_second": 21.216, |
| "step": 25600 |
| }, |
| { |
| "epoch": 2.477186032333543, |
| "grad_norm": 1.2784621715545654, |
| "learning_rate": 3.4414715719063544e-06, |
| "loss": 0.3315, |
| "step": 25700 |
| }, |
| { |
| "epoch": 2.477186032333543, |
| "eval_loss": 0.2908300757408142, |
| "eval_runtime": 58.7497, |
| "eval_samples_per_second": 170.214, |
| "eval_steps_per_second": 21.277, |
| "step": 25700 |
| }, |
| { |
| "epoch": 2.486824888490483, |
| "grad_norm": 1.2238744497299194, |
| "learning_rate": 3.4314381270903007e-06, |
| "loss": 0.3295, |
| "step": 25800 |
| }, |
| { |
| "epoch": 2.486824888490483, |
| "eval_loss": 0.2972917854785919, |
| "eval_runtime": 58.8169, |
| "eval_samples_per_second": 170.019, |
| "eval_steps_per_second": 21.252, |
| "step": 25800 |
| }, |
| { |
| "epoch": 2.4964637446474227, |
| "grad_norm": 1.1721047163009644, |
| "learning_rate": 3.4214046822742475e-06, |
| "loss": 0.33, |
| "step": 25900 |
| }, |
| { |
| "epoch": 2.4964637446474227, |
| "eval_loss": 0.2929130494594574, |
| "eval_runtime": 58.9116, |
| "eval_samples_per_second": 169.746, |
| "eval_steps_per_second": 21.218, |
| "step": 25900 |
| }, |
| { |
| "epoch": 2.5061026008043625, |
| "grad_norm": 1.2451221942901611, |
| "learning_rate": 3.411371237458194e-06, |
| "loss": 0.3316, |
| "step": 26000 |
| }, |
| { |
| "epoch": 2.5061026008043625, |
| "eval_loss": 0.2947503328323364, |
| "eval_runtime": 58.7698, |
| "eval_samples_per_second": 170.155, |
| "eval_steps_per_second": 21.269, |
| "step": 26000 |
| }, |
| { |
| "epoch": 2.5157414569613024, |
| "grad_norm": 1.345779299736023, |
| "learning_rate": 3.40133779264214e-06, |
| "loss": 0.3313, |
| "step": 26100 |
| }, |
| { |
| "epoch": 2.5157414569613024, |
| "eval_loss": 0.2994622588157654, |
| "eval_runtime": 58.7514, |
| "eval_samples_per_second": 170.209, |
| "eval_steps_per_second": 21.276, |
| "step": 26100 |
| }, |
| { |
| "epoch": 2.525380313118242, |
| "grad_norm": 1.25875985622406, |
| "learning_rate": 3.391304347826087e-06, |
| "loss": 0.3308, |
| "step": 26200 |
| }, |
| { |
| "epoch": 2.525380313118242, |
| "eval_loss": 0.2986910343170166, |
| "eval_runtime": 58.7651, |
| "eval_samples_per_second": 170.169, |
| "eval_steps_per_second": 21.271, |
| "step": 26200 |
| }, |
| { |
| "epoch": 2.535019169275182, |
| "grad_norm": 1.1642955541610718, |
| "learning_rate": 3.3812709030100333e-06, |
| "loss": 0.3297, |
| "step": 26300 |
| }, |
| { |
| "epoch": 2.535019169275182, |
| "eval_loss": 0.29979461431503296, |
| "eval_runtime": 58.8621, |
| "eval_samples_per_second": 169.889, |
| "eval_steps_per_second": 21.236, |
| "step": 26300 |
| }, |
| { |
| "epoch": 2.544658025432122, |
| "grad_norm": 1.3095877170562744, |
| "learning_rate": 3.3712374581939796e-06, |
| "loss": 0.3259, |
| "step": 26400 |
| }, |
| { |
| "epoch": 2.544658025432122, |
| "eval_loss": 0.28874439001083374, |
| "eval_runtime": 58.8355, |
| "eval_samples_per_second": 169.965, |
| "eval_steps_per_second": 21.246, |
| "step": 26400 |
| }, |
| { |
| "epoch": 2.554296881589062, |
| "grad_norm": 1.3071867227554321, |
| "learning_rate": 3.3612040133779264e-06, |
| "loss": 0.3266, |
| "step": 26500 |
| }, |
| { |
| "epoch": 2.554296881589062, |
| "eval_loss": 0.28603237867355347, |
| "eval_runtime": 59.0324, |
| "eval_samples_per_second": 169.398, |
| "eval_steps_per_second": 21.175, |
| "step": 26500 |
| }, |
| { |
| "epoch": 2.563935737746002, |
| "grad_norm": 1.341209888458252, |
| "learning_rate": 3.3511705685618727e-06, |
| "loss": 0.3223, |
| "step": 26600 |
| }, |
| { |
| "epoch": 2.563935737746002, |
| "eval_loss": 0.29270127415657043, |
| "eval_runtime": 58.9317, |
| "eval_samples_per_second": 169.688, |
| "eval_steps_per_second": 21.211, |
| "step": 26600 |
| }, |
| { |
| "epoch": 2.5735745939029417, |
| "grad_norm": 1.2843323945999146, |
| "learning_rate": 3.3411371237458195e-06, |
| "loss": 0.3249, |
| "step": 26700 |
| }, |
| { |
| "epoch": 2.5735745939029417, |
| "eval_loss": 0.28523457050323486, |
| "eval_runtime": 58.7521, |
| "eval_samples_per_second": 170.207, |
| "eval_steps_per_second": 21.276, |
| "step": 26700 |
| }, |
| { |
| "epoch": 2.5832134500598816, |
| "grad_norm": 1.2155966758728027, |
| "learning_rate": 3.331103678929766e-06, |
| "loss": 0.3251, |
| "step": 26800 |
| }, |
| { |
| "epoch": 2.5832134500598816, |
| "eval_loss": 0.28888818621635437, |
| "eval_runtime": 58.7677, |
| "eval_samples_per_second": 170.161, |
| "eval_steps_per_second": 21.27, |
| "step": 26800 |
| }, |
| { |
| "epoch": 2.5928523062168214, |
| "grad_norm": 1.2257170677185059, |
| "learning_rate": 3.321070234113712e-06, |
| "loss": 0.3238, |
| "step": 26900 |
| }, |
| { |
| "epoch": 2.5928523062168214, |
| "eval_loss": 0.29029136896133423, |
| "eval_runtime": 58.9332, |
| "eval_samples_per_second": 169.684, |
| "eval_steps_per_second": 21.21, |
| "step": 26900 |
| }, |
| { |
| "epoch": 2.602491162373761, |
| "grad_norm": 1.2092607021331787, |
| "learning_rate": 3.311036789297659e-06, |
| "loss": 0.3229, |
| "step": 27000 |
| }, |
| { |
| "epoch": 2.602491162373761, |
| "eval_loss": 0.2846786379814148, |
| "eval_runtime": 58.8241, |
| "eval_samples_per_second": 169.998, |
| "eval_steps_per_second": 21.25, |
| "step": 27000 |
| }, |
| { |
| "epoch": 2.612130018530701, |
| "grad_norm": 1.3890790939331055, |
| "learning_rate": 3.3010033444816052e-06, |
| "loss": 0.3252, |
| "step": 27100 |
| }, |
| { |
| "epoch": 2.612130018530701, |
| "eval_loss": 0.28705713152885437, |
| "eval_runtime": 58.8105, |
| "eval_samples_per_second": 170.038, |
| "eval_steps_per_second": 21.255, |
| "step": 27100 |
| }, |
| { |
| "epoch": 2.621768874687641, |
| "grad_norm": 1.2755820751190186, |
| "learning_rate": 3.2909698996655516e-06, |
| "loss": 0.3222, |
| "step": 27200 |
| }, |
| { |
| "epoch": 2.621768874687641, |
| "eval_loss": 0.28242212533950806, |
| "eval_runtime": 58.8787, |
| "eval_samples_per_second": 169.841, |
| "eval_steps_per_second": 21.23, |
| "step": 27200 |
| }, |
| { |
| "epoch": 2.6314077308445807, |
| "grad_norm": 1.2242155075073242, |
| "learning_rate": 3.2809364548494983e-06, |
| "loss": 0.3214, |
| "step": 27300 |
| }, |
| { |
| "epoch": 2.6314077308445807, |
| "eval_loss": 0.2825804054737091, |
| "eval_runtime": 58.9281, |
| "eval_samples_per_second": 169.698, |
| "eval_steps_per_second": 21.212, |
| "step": 27300 |
| }, |
| { |
| "epoch": 2.6410465870015205, |
| "grad_norm": 1.198223352432251, |
| "learning_rate": 3.2709030100334447e-06, |
| "loss": 0.321, |
| "step": 27400 |
| }, |
| { |
| "epoch": 2.6410465870015205, |
| "eval_loss": 0.28742432594299316, |
| "eval_runtime": 58.8235, |
| "eval_samples_per_second": 170.0, |
| "eval_steps_per_second": 21.25, |
| "step": 27400 |
| }, |
| { |
| "epoch": 2.6506854431584603, |
| "grad_norm": 1.2096476554870605, |
| "learning_rate": 3.260869565217391e-06, |
| "loss": 0.3199, |
| "step": 27500 |
| }, |
| { |
| "epoch": 2.6506854431584603, |
| "eval_loss": 0.283199667930603, |
| "eval_runtime": 58.804, |
| "eval_samples_per_second": 170.057, |
| "eval_steps_per_second": 21.257, |
| "step": 27500 |
| }, |
| { |
| "epoch": 2.6603242993154, |
| "grad_norm": 1.3325072526931763, |
| "learning_rate": 3.2508361204013378e-06, |
| "loss": 0.3213, |
| "step": 27600 |
| }, |
| { |
| "epoch": 2.6603242993154, |
| "eval_loss": 0.287946879863739, |
| "eval_runtime": 58.9847, |
| "eval_samples_per_second": 169.535, |
| "eval_steps_per_second": 21.192, |
| "step": 27600 |
| }, |
| { |
| "epoch": 2.66996315547234, |
| "grad_norm": 1.2635695934295654, |
| "learning_rate": 3.240802675585284e-06, |
| "loss": 0.318, |
| "step": 27700 |
| }, |
| { |
| "epoch": 2.66996315547234, |
| "eval_loss": 0.2917640209197998, |
| "eval_runtime": 58.5959, |
| "eval_samples_per_second": 170.661, |
| "eval_steps_per_second": 21.333, |
| "step": 27700 |
| }, |
| { |
| "epoch": 2.67960201162928, |
| "grad_norm": 1.0959738492965698, |
| "learning_rate": 3.230769230769231e-06, |
| "loss": 0.3196, |
| "step": 27800 |
| }, |
| { |
| "epoch": 2.67960201162928, |
| "eval_loss": 0.28916123509407043, |
| "eval_runtime": 59.0359, |
| "eval_samples_per_second": 169.388, |
| "eval_steps_per_second": 21.174, |
| "step": 27800 |
| }, |
| { |
| "epoch": 2.6892408677862196, |
| "grad_norm": 1.2401530742645264, |
| "learning_rate": 3.2207357859531772e-06, |
| "loss": 0.3208, |
| "step": 27900 |
| }, |
| { |
| "epoch": 2.6892408677862196, |
| "eval_loss": 0.2888045012950897, |
| "eval_runtime": 58.8348, |
| "eval_samples_per_second": 169.967, |
| "eval_steps_per_second": 21.246, |
| "step": 27900 |
| }, |
| { |
| "epoch": 2.6988797239431594, |
| "grad_norm": 1.3379608392715454, |
| "learning_rate": 3.2107023411371236e-06, |
| "loss": 0.3184, |
| "step": 28000 |
| }, |
| { |
| "epoch": 2.6988797239431594, |
| "eval_loss": 0.28211450576782227, |
| "eval_runtime": 58.8639, |
| "eval_samples_per_second": 169.884, |
| "eval_steps_per_second": 21.235, |
| "step": 28000 |
| }, |
| { |
| "epoch": 2.7085185801000993, |
| "grad_norm": 1.3035780191421509, |
| "learning_rate": 3.2006688963210703e-06, |
| "loss": 0.3158, |
| "step": 28100 |
| }, |
| { |
| "epoch": 2.7085185801000993, |
| "eval_loss": 0.28730255365371704, |
| "eval_runtime": 59.0338, |
| "eval_samples_per_second": 169.395, |
| "eval_steps_per_second": 21.174, |
| "step": 28100 |
| }, |
| { |
| "epoch": 2.7181574362570395, |
| "grad_norm": 1.1148771047592163, |
| "learning_rate": 3.1906354515050167e-06, |
| "loss": 0.3179, |
| "step": 28200 |
| }, |
| { |
| "epoch": 2.7181574362570395, |
| "eval_loss": 0.2847444415092468, |
| "eval_runtime": 58.8319, |
| "eval_samples_per_second": 169.976, |
| "eval_steps_per_second": 21.247, |
| "step": 28200 |
| }, |
| { |
| "epoch": 2.7277962924139794, |
| "grad_norm": 1.283491611480713, |
| "learning_rate": 3.180602006688963e-06, |
| "loss": 0.3182, |
| "step": 28300 |
| }, |
| { |
| "epoch": 2.7277962924139794, |
| "eval_loss": 0.27807116508483887, |
| "eval_runtime": 58.7961, |
| "eval_samples_per_second": 170.079, |
| "eval_steps_per_second": 21.26, |
| "step": 28300 |
| }, |
| { |
| "epoch": 2.737435148570919, |
| "grad_norm": 1.1780531406402588, |
| "learning_rate": 3.1705685618729098e-06, |
| "loss": 0.3156, |
| "step": 28400 |
| }, |
| { |
| "epoch": 2.737435148570919, |
| "eval_loss": 0.28270605206489563, |
| "eval_runtime": 58.9988, |
| "eval_samples_per_second": 169.495, |
| "eval_steps_per_second": 21.187, |
| "step": 28400 |
| }, |
| { |
| "epoch": 2.747074004727859, |
| "grad_norm": 1.3073583841323853, |
| "learning_rate": 3.160535117056856e-06, |
| "loss": 0.3157, |
| "step": 28500 |
| }, |
| { |
| "epoch": 2.747074004727859, |
| "eval_loss": 0.2826150059700012, |
| "eval_runtime": 58.8208, |
| "eval_samples_per_second": 170.008, |
| "eval_steps_per_second": 21.251, |
| "step": 28500 |
| }, |
| { |
| "epoch": 2.756712860884799, |
| "grad_norm": 1.3253241777420044, |
| "learning_rate": 3.1505016722408024e-06, |
| "loss": 0.3143, |
| "step": 28600 |
| }, |
| { |
| "epoch": 2.756712860884799, |
| "eval_loss": 0.27773648500442505, |
| "eval_runtime": 58.8497, |
| "eval_samples_per_second": 169.924, |
| "eval_steps_per_second": 21.241, |
| "step": 28600 |
| }, |
| { |
| "epoch": 2.7663517170417387, |
| "grad_norm": 1.3754655122756958, |
| "learning_rate": 3.140468227424749e-06, |
| "loss": 0.314, |
| "step": 28700 |
| }, |
| { |
| "epoch": 2.7663517170417387, |
| "eval_loss": 0.2878931760787964, |
| "eval_runtime": 58.9637, |
| "eval_samples_per_second": 169.596, |
| "eval_steps_per_second": 21.199, |
| "step": 28700 |
| }, |
| { |
| "epoch": 2.7759905731986785, |
| "grad_norm": 1.230859398841858, |
| "learning_rate": 3.1304347826086955e-06, |
| "loss": 0.3132, |
| "step": 28800 |
| }, |
| { |
| "epoch": 2.7759905731986785, |
| "eval_loss": 0.28596362471580505, |
| "eval_runtime": 58.8221, |
| "eval_samples_per_second": 170.004, |
| "eval_steps_per_second": 21.251, |
| "step": 28800 |
| }, |
| { |
| "epoch": 2.7856294293556183, |
| "grad_norm": 1.186421513557434, |
| "learning_rate": 3.1204013377926423e-06, |
| "loss": 0.3108, |
| "step": 28900 |
| }, |
| { |
| "epoch": 2.7856294293556183, |
| "eval_loss": 0.28029701113700867, |
| "eval_runtime": 58.7721, |
| "eval_samples_per_second": 170.149, |
| "eval_steps_per_second": 21.269, |
| "step": 28900 |
| }, |
| { |
| "epoch": 2.795268285512558, |
| "grad_norm": 1.1692681312561035, |
| "learning_rate": 3.1103678929765886e-06, |
| "loss": 0.3145, |
| "step": 29000 |
| }, |
| { |
| "epoch": 2.795268285512558, |
| "eval_loss": 0.2787197232246399, |
| "eval_runtime": 58.9813, |
| "eval_samples_per_second": 169.545, |
| "eval_steps_per_second": 21.193, |
| "step": 29000 |
| }, |
| { |
| "epoch": 2.804907141669498, |
| "grad_norm": 1.4014099836349487, |
| "learning_rate": 3.100334448160535e-06, |
| "loss": 0.3149, |
| "step": 29100 |
| }, |
| { |
| "epoch": 2.804907141669498, |
| "eval_loss": 0.2786557972431183, |
| "eval_runtime": 58.706, |
| "eval_samples_per_second": 170.34, |
| "eval_steps_per_second": 21.293, |
| "step": 29100 |
| }, |
| { |
| "epoch": 2.8145459978264378, |
| "grad_norm": 1.2638059854507446, |
| "learning_rate": 3.0903010033444818e-06, |
| "loss": 0.3122, |
| "step": 29200 |
| }, |
| { |
| "epoch": 2.8145459978264378, |
| "eval_loss": 0.2831690013408661, |
| "eval_runtime": 58.5787, |
| "eval_samples_per_second": 170.71, |
| "eval_steps_per_second": 21.339, |
| "step": 29200 |
| }, |
| { |
| "epoch": 2.824184853983378, |
| "grad_norm": 1.3350086212158203, |
| "learning_rate": 3.080267558528428e-06, |
| "loss": 0.3094, |
| "step": 29300 |
| }, |
| { |
| "epoch": 2.824184853983378, |
| "eval_loss": 0.2827381491661072, |
| "eval_runtime": 58.804, |
| "eval_samples_per_second": 170.057, |
| "eval_steps_per_second": 21.257, |
| "step": 29300 |
| }, |
| { |
| "epoch": 2.833823710140318, |
| "grad_norm": 1.162023663520813, |
| "learning_rate": 3.0702341137123744e-06, |
| "loss": 0.3128, |
| "step": 29400 |
| }, |
| { |
| "epoch": 2.833823710140318, |
| "eval_loss": 0.28242263197898865, |
| "eval_runtime": 58.6385, |
| "eval_samples_per_second": 170.537, |
| "eval_steps_per_second": 21.317, |
| "step": 29400 |
| }, |
| { |
| "epoch": 2.8434625662972577, |
| "grad_norm": 1.1858537197113037, |
| "learning_rate": 3.060200668896321e-06, |
| "loss": 0.3125, |
| "step": 29500 |
| }, |
| { |
| "epoch": 2.8434625662972577, |
| "eval_loss": 0.27721619606018066, |
| "eval_runtime": 58.5687, |
| "eval_samples_per_second": 170.74, |
| "eval_steps_per_second": 21.342, |
| "step": 29500 |
| }, |
| { |
| "epoch": 2.8531014224541975, |
| "grad_norm": 1.2944772243499756, |
| "learning_rate": 3.0501672240802675e-06, |
| "loss": 0.3107, |
| "step": 29600 |
| }, |
| { |
| "epoch": 2.8531014224541975, |
| "eval_loss": 0.2762606739997864, |
| "eval_runtime": 58.766, |
| "eval_samples_per_second": 170.166, |
| "eval_steps_per_second": 21.271, |
| "step": 29600 |
| }, |
| { |
| "epoch": 2.8627402786111373, |
| "grad_norm": 1.1597511768341064, |
| "learning_rate": 3.0401337792642143e-06, |
| "loss": 0.3087, |
| "step": 29700 |
| }, |
| { |
| "epoch": 2.8627402786111373, |
| "eval_loss": 0.2779894173145294, |
| "eval_runtime": 58.6408, |
| "eval_samples_per_second": 170.53, |
| "eval_steps_per_second": 21.316, |
| "step": 29700 |
| }, |
| { |
| "epoch": 2.872379134768077, |
| "grad_norm": 1.2793524265289307, |
| "learning_rate": 3.0301003344481606e-06, |
| "loss": 0.3103, |
| "step": 29800 |
| }, |
| { |
| "epoch": 2.872379134768077, |
| "eval_loss": 0.2801125645637512, |
| "eval_runtime": 58.7484, |
| "eval_samples_per_second": 170.217, |
| "eval_steps_per_second": 21.277, |
| "step": 29800 |
| }, |
| { |
| "epoch": 2.882017990925017, |
| "grad_norm": 1.2233638763427734, |
| "learning_rate": 3.020066889632107e-06, |
| "loss": 0.3097, |
| "step": 29900 |
| }, |
| { |
| "epoch": 2.882017990925017, |
| "eval_loss": 0.27292197942733765, |
| "eval_runtime": 58.9228, |
| "eval_samples_per_second": 169.714, |
| "eval_steps_per_second": 21.214, |
| "step": 29900 |
| }, |
| { |
| "epoch": 2.891656847081957, |
| "grad_norm": 1.2502061128616333, |
| "learning_rate": 3.0100334448160537e-06, |
| "loss": 0.3087, |
| "step": 30000 |
| }, |
| { |
| "epoch": 2.891656847081957, |
| "eval_loss": 0.27464571595191956, |
| "eval_runtime": 58.7608, |
| "eval_samples_per_second": 170.182, |
| "eval_steps_per_second": 21.273, |
| "step": 30000 |
| }, |
| { |
| "epoch": 2.9012957032388966, |
| "grad_norm": 1.1711479425430298, |
| "learning_rate": 3e-06, |
| "loss": 0.3079, |
| "step": 30100 |
| }, |
| { |
| "epoch": 2.9012957032388966, |
| "eval_loss": 0.2868233621120453, |
| "eval_runtime": 58.7507, |
| "eval_samples_per_second": 170.211, |
| "eval_steps_per_second": 21.276, |
| "step": 30100 |
| }, |
| { |
| "epoch": 2.9109345593958365, |
| "grad_norm": 1.2344684600830078, |
| "learning_rate": 2.9899665551839464e-06, |
| "loss": 0.305, |
| "step": 30200 |
| }, |
| { |
| "epoch": 2.9109345593958365, |
| "eval_loss": 0.27741649746894836, |
| "eval_runtime": 58.9988, |
| "eval_samples_per_second": 169.495, |
| "eval_steps_per_second": 21.187, |
| "step": 30200 |
| }, |
| { |
| "epoch": 2.9205734155527763, |
| "grad_norm": 1.1582242250442505, |
| "learning_rate": 2.979933110367893e-06, |
| "loss": 0.3087, |
| "step": 30300 |
| }, |
| { |
| "epoch": 2.9205734155527763, |
| "eval_loss": 0.2797699272632599, |
| "eval_runtime": 58.7984, |
| "eval_samples_per_second": 170.073, |
| "eval_steps_per_second": 21.259, |
| "step": 30300 |
| }, |
| { |
| "epoch": 2.930212271709716, |
| "grad_norm": 1.2283897399902344, |
| "learning_rate": 2.9698996655518395e-06, |
| "loss": 0.3049, |
| "step": 30400 |
| }, |
| { |
| "epoch": 2.930212271709716, |
| "eval_loss": 0.27813324332237244, |
| "eval_runtime": 58.7956, |
| "eval_samples_per_second": 170.081, |
| "eval_steps_per_second": 21.26, |
| "step": 30400 |
| }, |
| { |
| "epoch": 2.939851127866656, |
| "grad_norm": 1.3279707431793213, |
| "learning_rate": 2.959866220735786e-06, |
| "loss": 0.3063, |
| "step": 30500 |
| }, |
| { |
| "epoch": 2.939851127866656, |
| "eval_loss": 0.27100399136543274, |
| "eval_runtime": 58.9453, |
| "eval_samples_per_second": 169.649, |
| "eval_steps_per_second": 21.206, |
| "step": 30500 |
| }, |
| { |
| "epoch": 2.9494899840235957, |
| "grad_norm": 1.180038332939148, |
| "learning_rate": 2.9498327759197326e-06, |
| "loss": 0.3099, |
| "step": 30600 |
| }, |
| { |
| "epoch": 2.9494899840235957, |
| "eval_loss": 0.27709901332855225, |
| "eval_runtime": 58.5784, |
| "eval_samples_per_second": 170.711, |
| "eval_steps_per_second": 21.339, |
| "step": 30600 |
| }, |
| { |
| "epoch": 2.9591288401805356, |
| "grad_norm": 1.1865822076797485, |
| "learning_rate": 2.939799331103679e-06, |
| "loss": 0.3052, |
| "step": 30700 |
| }, |
| { |
| "epoch": 2.9591288401805356, |
| "eval_loss": 0.2758926749229431, |
| "eval_runtime": 58.9107, |
| "eval_samples_per_second": 169.749, |
| "eval_steps_per_second": 21.219, |
| "step": 30700 |
| }, |
| { |
| "epoch": 2.9687676963374754, |
| "grad_norm": 1.285046935081482, |
| "learning_rate": 2.9297658862876257e-06, |
| "loss": 0.3056, |
| "step": 30800 |
| }, |
| { |
| "epoch": 2.9687676963374754, |
| "eval_loss": 0.2717057168483734, |
| "eval_runtime": 58.973, |
| "eval_samples_per_second": 169.569, |
| "eval_steps_per_second": 21.196, |
| "step": 30800 |
| }, |
| { |
| "epoch": 2.978406552494415, |
| "grad_norm": 1.138985514640808, |
| "learning_rate": 2.919732441471572e-06, |
| "loss": 0.3026, |
| "step": 30900 |
| }, |
| { |
| "epoch": 2.978406552494415, |
| "eval_loss": 0.2726858854293823, |
| "eval_runtime": 58.8341, |
| "eval_samples_per_second": 169.969, |
| "eval_steps_per_second": 21.246, |
| "step": 30900 |
| }, |
| { |
| "epoch": 2.9880454086513555, |
| "grad_norm": 1.2184150218963623, |
| "learning_rate": 2.9096989966555184e-06, |
| "loss": 0.305, |
| "step": 31000 |
| }, |
| { |
| "epoch": 2.9880454086513555, |
| "eval_loss": 0.2747511863708496, |
| "eval_runtime": 58.8569, |
| "eval_samples_per_second": 169.904, |
| "eval_steps_per_second": 21.238, |
| "step": 31000 |
| }, |
| { |
| "epoch": 2.9976842648082953, |
| "grad_norm": 1.1389625072479248, |
| "learning_rate": 2.899665551839465e-06, |
| "loss": 0.3022, |
| "step": 31100 |
| }, |
| { |
| "epoch": 2.9976842648082953, |
| "eval_loss": 0.27139467000961304, |
| "eval_runtime": 59.0245, |
| "eval_samples_per_second": 169.421, |
| "eval_steps_per_second": 21.178, |
| "step": 31100 |
| }, |
| { |
| "epoch": 3.007323120965235, |
| "grad_norm": 1.2831661701202393, |
| "learning_rate": 2.8896321070234115e-06, |
| "loss": 0.3015, |
| "step": 31200 |
| }, |
| { |
| "epoch": 3.007323120965235, |
| "eval_loss": 0.268490195274353, |
| "eval_runtime": 58.8203, |
| "eval_samples_per_second": 170.009, |
| "eval_steps_per_second": 21.251, |
| "step": 31200 |
| }, |
| { |
| "epoch": 3.016961977122175, |
| "grad_norm": 1.3156960010528564, |
| "learning_rate": 2.879598662207358e-06, |
| "loss": 0.3034, |
| "step": 31300 |
| }, |
| { |
| "epoch": 3.016961977122175, |
| "eval_loss": 0.27620190382003784, |
| "eval_runtime": 58.8431, |
| "eval_samples_per_second": 169.944, |
| "eval_steps_per_second": 21.243, |
| "step": 31300 |
| }, |
| { |
| "epoch": 3.0266008332791148, |
| "grad_norm": 1.1496986150741577, |
| "learning_rate": 2.8695652173913046e-06, |
| "loss": 0.3014, |
| "step": 31400 |
| }, |
| { |
| "epoch": 3.0266008332791148, |
| "eval_loss": 0.26573923230171204, |
| "eval_runtime": 58.9994, |
| "eval_samples_per_second": 169.493, |
| "eval_steps_per_second": 21.187, |
| "step": 31400 |
| }, |
| { |
| "epoch": 3.0362396894360546, |
| "grad_norm": 1.2416408061981201, |
| "learning_rate": 2.859531772575251e-06, |
| "loss": 0.2996, |
| "step": 31500 |
| }, |
| { |
| "epoch": 3.0362396894360546, |
| "eval_loss": 0.26978152990341187, |
| "eval_runtime": 58.8544, |
| "eval_samples_per_second": 169.911, |
| "eval_steps_per_second": 21.239, |
| "step": 31500 |
| }, |
| { |
| "epoch": 3.0458785455929944, |
| "grad_norm": 1.2750462293624878, |
| "learning_rate": 2.8494983277591977e-06, |
| "loss": 0.3049, |
| "step": 31600 |
| }, |
| { |
| "epoch": 3.0458785455929944, |
| "eval_loss": 0.26833203434944153, |
| "eval_runtime": 58.8337, |
| "eval_samples_per_second": 169.971, |
| "eval_steps_per_second": 21.246, |
| "step": 31600 |
| }, |
| { |
| "epoch": 3.0555174017499342, |
| "grad_norm": 1.2201920747756958, |
| "learning_rate": 2.839464882943144e-06, |
| "loss": 0.3029, |
| "step": 31700 |
| }, |
| { |
| "epoch": 3.0555174017499342, |
| "eval_loss": 0.2752624452114105, |
| "eval_runtime": 59.0229, |
| "eval_samples_per_second": 169.426, |
| "eval_steps_per_second": 21.178, |
| "step": 31700 |
| }, |
| { |
| "epoch": 3.065156257906874, |
| "grad_norm": 1.3049486875534058, |
| "learning_rate": 2.8294314381270904e-06, |
| "loss": 0.299, |
| "step": 31800 |
| }, |
| { |
| "epoch": 3.065156257906874, |
| "eval_loss": 0.26851963996887207, |
| "eval_runtime": 58.8811, |
| "eval_samples_per_second": 169.834, |
| "eval_steps_per_second": 21.229, |
| "step": 31800 |
| }, |
| { |
| "epoch": 3.074795114063814, |
| "grad_norm": 1.2307597398757935, |
| "learning_rate": 2.819397993311037e-06, |
| "loss": 0.2998, |
| "step": 31900 |
| }, |
| { |
| "epoch": 3.074795114063814, |
| "eval_loss": 0.27132850885391235, |
| "eval_runtime": 58.8127, |
| "eval_samples_per_second": 170.031, |
| "eval_steps_per_second": 21.254, |
| "step": 31900 |
| }, |
| { |
| "epoch": 3.0844339702207537, |
| "grad_norm": 1.2379592657089233, |
| "learning_rate": 2.8093645484949835e-06, |
| "loss": 0.3009, |
| "step": 32000 |
| }, |
| { |
| "epoch": 3.0844339702207537, |
| "eval_loss": 0.2727653682231903, |
| "eval_runtime": 59.0543, |
| "eval_samples_per_second": 169.336, |
| "eval_steps_per_second": 21.167, |
| "step": 32000 |
| }, |
| { |
| "epoch": 3.094072826377694, |
| "grad_norm": 1.3900628089904785, |
| "learning_rate": 2.79933110367893e-06, |
| "loss": 0.2968, |
| "step": 32100 |
| }, |
| { |
| "epoch": 3.094072826377694, |
| "eval_loss": 0.26818564534187317, |
| "eval_runtime": 58.6043, |
| "eval_samples_per_second": 170.636, |
| "eval_steps_per_second": 21.329, |
| "step": 32100 |
| }, |
| { |
| "epoch": 3.103711682534634, |
| "grad_norm": 1.2621980905532837, |
| "learning_rate": 2.7892976588628766e-06, |
| "loss": 0.2987, |
| "step": 32200 |
| }, |
| { |
| "epoch": 3.103711682534634, |
| "eval_loss": 0.2698274552822113, |
| "eval_runtime": 59.0036, |
| "eval_samples_per_second": 169.481, |
| "eval_steps_per_second": 21.185, |
| "step": 32200 |
| }, |
| { |
| "epoch": 3.1133505386915736, |
| "grad_norm": 1.2347277402877808, |
| "learning_rate": 2.779264214046823e-06, |
| "loss": 0.2987, |
| "step": 32300 |
| }, |
| { |
| "epoch": 3.1133505386915736, |
| "eval_loss": 0.2685893177986145, |
| "eval_runtime": 58.8188, |
| "eval_samples_per_second": 170.014, |
| "eval_steps_per_second": 21.252, |
| "step": 32300 |
| }, |
| { |
| "epoch": 3.1229893948485135, |
| "grad_norm": 1.2878509759902954, |
| "learning_rate": 2.7692307692307693e-06, |
| "loss": 0.2993, |
| "step": 32400 |
| }, |
| { |
| "epoch": 3.1229893948485135, |
| "eval_loss": 0.2640542984008789, |
| "eval_runtime": 59.467, |
| "eval_samples_per_second": 168.161, |
| "eval_steps_per_second": 21.02, |
| "step": 32400 |
| }, |
| { |
| "epoch": 3.1326282510054533, |
| "grad_norm": 1.1735783815383911, |
| "learning_rate": 2.759197324414716e-06, |
| "loss": 0.2978, |
| "step": 32500 |
| }, |
| { |
| "epoch": 3.1326282510054533, |
| "eval_loss": 0.2660863697528839, |
| "eval_runtime": 59.3085, |
| "eval_samples_per_second": 168.61, |
| "eval_steps_per_second": 21.076, |
| "step": 32500 |
| }, |
| { |
| "epoch": 3.142267107162393, |
| "grad_norm": 1.1393107175827026, |
| "learning_rate": 2.749163879598662e-06, |
| "loss": 0.3021, |
| "step": 32600 |
| }, |
| { |
| "epoch": 3.142267107162393, |
| "eval_loss": 0.2683536410331726, |
| "eval_runtime": 59.2664, |
| "eval_samples_per_second": 168.73, |
| "eval_steps_per_second": 21.091, |
| "step": 32600 |
| }, |
| { |
| "epoch": 3.151905963319333, |
| "grad_norm": 1.1794265508651733, |
| "learning_rate": 2.7391304347826087e-06, |
| "loss": 0.3005, |
| "step": 32700 |
| }, |
| { |
| "epoch": 3.151905963319333, |
| "eval_loss": 0.26745280623435974, |
| "eval_runtime": 59.4235, |
| "eval_samples_per_second": 168.284, |
| "eval_steps_per_second": 21.035, |
| "step": 32700 |
| }, |
| { |
| "epoch": 3.1615448194762727, |
| "grad_norm": 1.2067482471466064, |
| "learning_rate": 2.729096989966555e-06, |
| "loss": 0.2968, |
| "step": 32800 |
| }, |
| { |
| "epoch": 3.1615448194762727, |
| "eval_loss": 0.2683298885822296, |
| "eval_runtime": 59.2787, |
| "eval_samples_per_second": 168.695, |
| "eval_steps_per_second": 21.087, |
| "step": 32800 |
| }, |
| { |
| "epoch": 3.1711836756332126, |
| "grad_norm": 1.2533990144729614, |
| "learning_rate": 2.7190635451505014e-06, |
| "loss": 0.2971, |
| "step": 32900 |
| }, |
| { |
| "epoch": 3.1711836756332126, |
| "eval_loss": 0.26645201444625854, |
| "eval_runtime": 59.2542, |
| "eval_samples_per_second": 168.764, |
| "eval_steps_per_second": 21.096, |
| "step": 32900 |
| }, |
| { |
| "epoch": 3.1808225317901524, |
| "grad_norm": 1.3016612529754639, |
| "learning_rate": 2.709030100334448e-06, |
| "loss": 0.2956, |
| "step": 33000 |
| }, |
| { |
| "epoch": 3.1808225317901524, |
| "eval_loss": 0.2659901976585388, |
| "eval_runtime": 59.4032, |
| "eval_samples_per_second": 168.341, |
| "eval_steps_per_second": 21.043, |
| "step": 33000 |
| }, |
| { |
| "epoch": 3.190461387947092, |
| "grad_norm": 1.1479302644729614, |
| "learning_rate": 2.6989966555183945e-06, |
| "loss": 0.294, |
| "step": 33100 |
| }, |
| { |
| "epoch": 3.190461387947092, |
| "eval_loss": 0.26020553708076477, |
| "eval_runtime": 59.4399, |
| "eval_samples_per_second": 168.237, |
| "eval_steps_per_second": 21.03, |
| "step": 33100 |
| }, |
| { |
| "epoch": 3.200100244104032, |
| "grad_norm": 1.220609426498413, |
| "learning_rate": 2.6889632107023413e-06, |
| "loss": 0.2936, |
| "step": 33200 |
| }, |
| { |
| "epoch": 3.200100244104032, |
| "eval_loss": 0.2678441107273102, |
| "eval_runtime": 59.2829, |
| "eval_samples_per_second": 168.683, |
| "eval_steps_per_second": 21.085, |
| "step": 33200 |
| }, |
| { |
| "epoch": 3.209739100260972, |
| "grad_norm": 1.3181507587432861, |
| "learning_rate": 2.6789297658862876e-06, |
| "loss": 0.2952, |
| "step": 33300 |
| }, |
| { |
| "epoch": 3.209739100260972, |
| "eval_loss": 0.26516881585121155, |
| "eval_runtime": 59.4504, |
| "eval_samples_per_second": 168.207, |
| "eval_steps_per_second": 21.026, |
| "step": 33300 |
| }, |
| { |
| "epoch": 3.2193779564179117, |
| "grad_norm": 1.234379529953003, |
| "learning_rate": 2.668896321070234e-06, |
| "loss": 0.297, |
| "step": 33400 |
| }, |
| { |
| "epoch": 3.2193779564179117, |
| "eval_loss": 0.265980064868927, |
| "eval_runtime": 59.3045, |
| "eval_samples_per_second": 168.621, |
| "eval_steps_per_second": 21.078, |
| "step": 33400 |
| }, |
| { |
| "epoch": 3.2290168125748515, |
| "grad_norm": 1.227977991104126, |
| "learning_rate": 2.6588628762541807e-06, |
| "loss": 0.2936, |
| "step": 33500 |
| }, |
| { |
| "epoch": 3.2290168125748515, |
| "eval_loss": 0.2639939785003662, |
| "eval_runtime": 59.2768, |
| "eval_samples_per_second": 168.7, |
| "eval_steps_per_second": 21.087, |
| "step": 33500 |
| }, |
| { |
| "epoch": 3.238655668731792, |
| "grad_norm": 1.1632543802261353, |
| "learning_rate": 2.648829431438127e-06, |
| "loss": 0.2922, |
| "step": 33600 |
| }, |
| { |
| "epoch": 3.238655668731792, |
| "eval_loss": 0.2577446401119232, |
| "eval_runtime": 59.456, |
| "eval_samples_per_second": 168.192, |
| "eval_steps_per_second": 21.024, |
| "step": 33600 |
| }, |
| { |
| "epoch": 3.2482945248887316, |
| "grad_norm": 1.23133385181427, |
| "learning_rate": 2.6387959866220734e-06, |
| "loss": 0.2944, |
| "step": 33700 |
| }, |
| { |
| "epoch": 3.2482945248887316, |
| "eval_loss": 0.2566361725330353, |
| "eval_runtime": 59.1356, |
| "eval_samples_per_second": 169.103, |
| "eval_steps_per_second": 21.138, |
| "step": 33700 |
| }, |
| { |
| "epoch": 3.2579333810456714, |
| "grad_norm": 1.3113288879394531, |
| "learning_rate": 2.62876254180602e-06, |
| "loss": 0.2922, |
| "step": 33800 |
| }, |
| { |
| "epoch": 3.2579333810456714, |
| "eval_loss": 0.2656908631324768, |
| "eval_runtime": 58.9864, |
| "eval_samples_per_second": 169.531, |
| "eval_steps_per_second": 21.191, |
| "step": 33800 |
| }, |
| { |
| "epoch": 3.2675722372026113, |
| "grad_norm": 1.1765536069869995, |
| "learning_rate": 2.6187290969899665e-06, |
| "loss": 0.2949, |
| "step": 33900 |
| }, |
| { |
| "epoch": 3.2675722372026113, |
| "eval_loss": 0.2684057056903839, |
| "eval_runtime": 58.9524, |
| "eval_samples_per_second": 169.628, |
| "eval_steps_per_second": 21.204, |
| "step": 33900 |
| }, |
| { |
| "epoch": 3.277211093359551, |
| "grad_norm": 1.336182951927185, |
| "learning_rate": 2.6086956521739132e-06, |
| "loss": 0.2919, |
| "step": 34000 |
| }, |
| { |
| "epoch": 3.277211093359551, |
| "eval_loss": 0.2648228406906128, |
| "eval_runtime": 59.1485, |
| "eval_samples_per_second": 169.066, |
| "eval_steps_per_second": 21.133, |
| "step": 34000 |
| }, |
| { |
| "epoch": 3.286849949516491, |
| "grad_norm": 1.1484243869781494, |
| "learning_rate": 2.5986622073578596e-06, |
| "loss": 0.2923, |
| "step": 34100 |
| }, |
| { |
| "epoch": 3.286849949516491, |
| "eval_loss": 0.268263041973114, |
| "eval_runtime": 58.9366, |
| "eval_samples_per_second": 169.674, |
| "eval_steps_per_second": 21.209, |
| "step": 34100 |
| }, |
| { |
| "epoch": 3.2964888056734307, |
| "grad_norm": 1.2312963008880615, |
| "learning_rate": 2.588628762541806e-06, |
| "loss": 0.2934, |
| "step": 34200 |
| }, |
| { |
| "epoch": 3.2964888056734307, |
| "eval_loss": 0.26405033469200134, |
| "eval_runtime": 58.9279, |
| "eval_samples_per_second": 169.699, |
| "eval_steps_per_second": 21.212, |
| "step": 34200 |
| }, |
| { |
| "epoch": 3.3061276618303705, |
| "grad_norm": 1.2409456968307495, |
| "learning_rate": 2.5785953177257527e-06, |
| "loss": 0.2917, |
| "step": 34300 |
| }, |
| { |
| "epoch": 3.3061276618303705, |
| "eval_loss": 0.26254796981811523, |
| "eval_runtime": 59.1307, |
| "eval_samples_per_second": 169.117, |
| "eval_steps_per_second": 21.14, |
| "step": 34300 |
| }, |
| { |
| "epoch": 3.3157665179873104, |
| "grad_norm": 1.1522307395935059, |
| "learning_rate": 2.568561872909699e-06, |
| "loss": 0.2887, |
| "step": 34400 |
| }, |
| { |
| "epoch": 3.3157665179873104, |
| "eval_loss": 0.25670239329338074, |
| "eval_runtime": 58.9257, |
| "eval_samples_per_second": 169.705, |
| "eval_steps_per_second": 21.213, |
| "step": 34400 |
| }, |
| { |
| "epoch": 3.32540537414425, |
| "grad_norm": 1.1980903148651123, |
| "learning_rate": 2.5585284280936454e-06, |
| "loss": 0.2938, |
| "step": 34500 |
| }, |
| { |
| "epoch": 3.32540537414425, |
| "eval_loss": 0.25677651166915894, |
| "eval_runtime": 58.9318, |
| "eval_samples_per_second": 169.688, |
| "eval_steps_per_second": 21.211, |
| "step": 34500 |
| }, |
| { |
| "epoch": 3.33504423030119, |
| "grad_norm": 1.169443964958191, |
| "learning_rate": 2.548494983277592e-06, |
| "loss": 0.2884, |
| "step": 34600 |
| }, |
| { |
| "epoch": 3.33504423030119, |
| "eval_loss": 0.2614832818508148, |
| "eval_runtime": 59.0599, |
| "eval_samples_per_second": 169.32, |
| "eval_steps_per_second": 21.165, |
| "step": 34600 |
| }, |
| { |
| "epoch": 3.34468308645813, |
| "grad_norm": 1.151646375656128, |
| "learning_rate": 2.5384615384615385e-06, |
| "loss": 0.2917, |
| "step": 34700 |
| }, |
| { |
| "epoch": 3.34468308645813, |
| "eval_loss": 0.2570364773273468, |
| "eval_runtime": 59.0246, |
| "eval_samples_per_second": 169.421, |
| "eval_steps_per_second": 21.178, |
| "step": 34700 |
| }, |
| { |
| "epoch": 3.35432194261507, |
| "grad_norm": 1.1321110725402832, |
| "learning_rate": 2.528428093645485e-06, |
| "loss": 0.2896, |
| "step": 34800 |
| }, |
| { |
| "epoch": 3.35432194261507, |
| "eval_loss": 0.26307597756385803, |
| "eval_runtime": 58.9477, |
| "eval_samples_per_second": 169.642, |
| "eval_steps_per_second": 21.205, |
| "step": 34800 |
| }, |
| { |
| "epoch": 3.36396079877201, |
| "grad_norm": 1.224995732307434, |
| "learning_rate": 2.5183946488294316e-06, |
| "loss": 0.2899, |
| "step": 34900 |
| }, |
| { |
| "epoch": 3.36396079877201, |
| "eval_loss": 0.25860831141471863, |
| "eval_runtime": 58.9452, |
| "eval_samples_per_second": 169.649, |
| "eval_steps_per_second": 21.206, |
| "step": 34900 |
| }, |
| { |
| "epoch": 3.3735996549289498, |
| "grad_norm": 1.2130616903305054, |
| "learning_rate": 2.508361204013378e-06, |
| "loss": 0.2918, |
| "step": 35000 |
| }, |
| { |
| "epoch": 3.3735996549289498, |
| "eval_loss": 0.26047125458717346, |
| "eval_runtime": 58.891, |
| "eval_samples_per_second": 169.805, |
| "eval_steps_per_second": 21.226, |
| "step": 35000 |
| }, |
| { |
| "epoch": 3.3832385110858896, |
| "grad_norm": 1.1943938732147217, |
| "learning_rate": 2.4983277591973247e-06, |
| "loss": 0.2899, |
| "step": 35100 |
| }, |
| { |
| "epoch": 3.3832385110858896, |
| "eval_loss": 0.2654706537723541, |
| "eval_runtime": 58.7593, |
| "eval_samples_per_second": 170.186, |
| "eval_steps_per_second": 21.273, |
| "step": 35100 |
| }, |
| { |
| "epoch": 3.3928773672428294, |
| "grad_norm": 1.1910595893859863, |
| "learning_rate": 2.488294314381271e-06, |
| "loss": 0.2905, |
| "step": 35200 |
| }, |
| { |
| "epoch": 3.3928773672428294, |
| "eval_loss": 0.252178430557251, |
| "eval_runtime": 58.9565, |
| "eval_samples_per_second": 169.617, |
| "eval_steps_per_second": 21.202, |
| "step": 35200 |
| }, |
| { |
| "epoch": 3.4025162233997692, |
| "grad_norm": 1.189275860786438, |
| "learning_rate": 2.4782608695652173e-06, |
| "loss": 0.2851, |
| "step": 35300 |
| }, |
| { |
| "epoch": 3.4025162233997692, |
| "eval_loss": 0.2556873559951782, |
| "eval_runtime": 59.1306, |
| "eval_samples_per_second": 169.117, |
| "eval_steps_per_second": 21.14, |
| "step": 35300 |
| }, |
| { |
| "epoch": 3.412155079556709, |
| "grad_norm": 1.2805149555206299, |
| "learning_rate": 2.468227424749164e-06, |
| "loss": 0.2878, |
| "step": 35400 |
| }, |
| { |
| "epoch": 3.412155079556709, |
| "eval_loss": 0.25642314553260803, |
| "eval_runtime": 59.0054, |
| "eval_samples_per_second": 169.476, |
| "eval_steps_per_second": 21.185, |
| "step": 35400 |
| }, |
| { |
| "epoch": 3.421793935713649, |
| "grad_norm": 1.1754244565963745, |
| "learning_rate": 2.4581939799331104e-06, |
| "loss": 0.2863, |
| "step": 35500 |
| }, |
| { |
| "epoch": 3.421793935713649, |
| "eval_loss": 0.2632581293582916, |
| "eval_runtime": 59.0769, |
| "eval_samples_per_second": 169.271, |
| "eval_steps_per_second": 21.159, |
| "step": 35500 |
| }, |
| { |
| "epoch": 3.4314327918705887, |
| "grad_norm": 1.2007780075073242, |
| "learning_rate": 2.4481605351170568e-06, |
| "loss": 0.2892, |
| "step": 35600 |
| }, |
| { |
| "epoch": 3.4314327918705887, |
| "eval_loss": 0.25143861770629883, |
| "eval_runtime": 58.9087, |
| "eval_samples_per_second": 169.754, |
| "eval_steps_per_second": 21.219, |
| "step": 35600 |
| }, |
| { |
| "epoch": 3.4410716480275285, |
| "grad_norm": 1.0942296981811523, |
| "learning_rate": 2.4381270903010035e-06, |
| "loss": 0.2865, |
| "step": 35700 |
| }, |
| { |
| "epoch": 3.4410716480275285, |
| "eval_loss": 0.25563955307006836, |
| "eval_runtime": 58.898, |
| "eval_samples_per_second": 169.785, |
| "eval_steps_per_second": 21.223, |
| "step": 35700 |
| }, |
| { |
| "epoch": 3.4507105041844683, |
| "grad_norm": 1.1834352016448975, |
| "learning_rate": 2.42809364548495e-06, |
| "loss": 0.2889, |
| "step": 35800 |
| }, |
| { |
| "epoch": 3.4507105041844683, |
| "eval_loss": 0.25440356135368347, |
| "eval_runtime": 59.1434, |
| "eval_samples_per_second": 169.08, |
| "eval_steps_per_second": 21.135, |
| "step": 35800 |
| }, |
| { |
| "epoch": 3.460349360341408, |
| "grad_norm": 1.3102777004241943, |
| "learning_rate": 2.4180602006688962e-06, |
| "loss": 0.2882, |
| "step": 35900 |
| }, |
| { |
| "epoch": 3.460349360341408, |
| "eval_loss": 0.25780776143074036, |
| "eval_runtime": 58.9453, |
| "eval_samples_per_second": 169.649, |
| "eval_steps_per_second": 21.206, |
| "step": 35900 |
| }, |
| { |
| "epoch": 3.469988216498348, |
| "grad_norm": 1.221442461013794, |
| "learning_rate": 2.408026755852843e-06, |
| "loss": 0.2868, |
| "step": 36000 |
| }, |
| { |
| "epoch": 3.469988216498348, |
| "eval_loss": 0.259140282869339, |
| "eval_runtime": 58.9616, |
| "eval_samples_per_second": 169.602, |
| "eval_steps_per_second": 21.2, |
| "step": 36000 |
| }, |
| { |
| "epoch": 3.479627072655288, |
| "grad_norm": 1.2383184432983398, |
| "learning_rate": 2.3979933110367893e-06, |
| "loss": 0.2869, |
| "step": 36100 |
| }, |
| { |
| "epoch": 3.479627072655288, |
| "eval_loss": 0.25708064436912537, |
| "eval_runtime": 59.1275, |
| "eval_samples_per_second": 169.126, |
| "eval_steps_per_second": 21.141, |
| "step": 36100 |
| }, |
| { |
| "epoch": 3.4892659288122276, |
| "grad_norm": 1.1699203252792358, |
| "learning_rate": 2.387959866220736e-06, |
| "loss": 0.2842, |
| "step": 36200 |
| }, |
| { |
| "epoch": 3.4892659288122276, |
| "eval_loss": 0.25702208280563354, |
| "eval_runtime": 58.9299, |
| "eval_samples_per_second": 169.693, |
| "eval_steps_per_second": 21.212, |
| "step": 36200 |
| }, |
| { |
| "epoch": 3.498904784969168, |
| "grad_norm": 1.2548960447311401, |
| "learning_rate": 2.3779264214046824e-06, |
| "loss": 0.2871, |
| "step": 36300 |
| }, |
| { |
| "epoch": 3.498904784969168, |
| "eval_loss": 0.2522522807121277, |
| "eval_runtime": 58.9452, |
| "eval_samples_per_second": 169.649, |
| "eval_steps_per_second": 21.206, |
| "step": 36300 |
| }, |
| { |
| "epoch": 3.5085436411261077, |
| "grad_norm": 1.2380112409591675, |
| "learning_rate": 2.3678929765886288e-06, |
| "loss": 0.2847, |
| "step": 36400 |
| }, |
| { |
| "epoch": 3.5085436411261077, |
| "eval_loss": 0.25519639253616333, |
| "eval_runtime": 59.1639, |
| "eval_samples_per_second": 169.022, |
| "eval_steps_per_second": 21.128, |
| "step": 36400 |
| }, |
| { |
| "epoch": 3.5181824972830475, |
| "grad_norm": 1.163103699684143, |
| "learning_rate": 2.3578595317725755e-06, |
| "loss": 0.2848, |
| "step": 36500 |
| }, |
| { |
| "epoch": 3.5181824972830475, |
| "eval_loss": 0.25967657566070557, |
| "eval_runtime": 58.7318, |
| "eval_samples_per_second": 170.265, |
| "eval_steps_per_second": 21.283, |
| "step": 36500 |
| }, |
| { |
| "epoch": 3.5278213534399874, |
| "grad_norm": 1.17255437374115, |
| "learning_rate": 2.347826086956522e-06, |
| "loss": 0.285, |
| "step": 36600 |
| }, |
| { |
| "epoch": 3.5278213534399874, |
| "eval_loss": 0.2591026723384857, |
| "eval_runtime": 58.5991, |
| "eval_samples_per_second": 170.651, |
| "eval_steps_per_second": 21.331, |
| "step": 36600 |
| }, |
| { |
| "epoch": 3.537460209596927, |
| "grad_norm": 1.2136497497558594, |
| "learning_rate": 2.337792642140468e-06, |
| "loss": 0.2843, |
| "step": 36700 |
| }, |
| { |
| "epoch": 3.537460209596927, |
| "eval_loss": 0.2578739523887634, |
| "eval_runtime": 59.0072, |
| "eval_samples_per_second": 169.471, |
| "eval_steps_per_second": 21.184, |
| "step": 36700 |
| }, |
| { |
| "epoch": 3.547099065753867, |
| "grad_norm": 1.1942510604858398, |
| "learning_rate": 2.327759197324415e-06, |
| "loss": 0.2806, |
| "step": 36800 |
| }, |
| { |
| "epoch": 3.547099065753867, |
| "eval_loss": 0.251567542552948, |
| "eval_runtime": 58.9372, |
| "eval_samples_per_second": 169.672, |
| "eval_steps_per_second": 21.209, |
| "step": 36800 |
| }, |
| { |
| "epoch": 3.556737921910807, |
| "grad_norm": 1.2007250785827637, |
| "learning_rate": 2.3177257525083613e-06, |
| "loss": 0.284, |
| "step": 36900 |
| }, |
| { |
| "epoch": 3.556737921910807, |
| "eval_loss": 0.25226566195487976, |
| "eval_runtime": 58.9498, |
| "eval_samples_per_second": 169.636, |
| "eval_steps_per_second": 21.204, |
| "step": 36900 |
| }, |
| { |
| "epoch": 3.5663767780677467, |
| "grad_norm": 1.2540044784545898, |
| "learning_rate": 2.307692307692308e-06, |
| "loss": 0.2806, |
| "step": 37000 |
| }, |
| { |
| "epoch": 3.5663767780677467, |
| "eval_loss": 0.24976973235607147, |
| "eval_runtime": 59.0781, |
| "eval_samples_per_second": 169.267, |
| "eval_steps_per_second": 21.158, |
| "step": 37000 |
| }, |
| { |
| "epoch": 3.5760156342246865, |
| "grad_norm": 1.2845237255096436, |
| "learning_rate": 2.2976588628762544e-06, |
| "loss": 0.2814, |
| "step": 37100 |
| }, |
| { |
| "epoch": 3.5760156342246865, |
| "eval_loss": 0.2551785707473755, |
| "eval_runtime": 58.947, |
| "eval_samples_per_second": 169.644, |
| "eval_steps_per_second": 21.205, |
| "step": 37100 |
| }, |
| { |
| "epoch": 3.5856544903816263, |
| "grad_norm": 1.0793906450271606, |
| "learning_rate": 2.2876254180602008e-06, |
| "loss": 0.2831, |
| "step": 37200 |
| }, |
| { |
| "epoch": 3.5856544903816263, |
| "eval_loss": 0.259166955947876, |
| "eval_runtime": 58.9587, |
| "eval_samples_per_second": 169.61, |
| "eval_steps_per_second": 21.201, |
| "step": 37200 |
| }, |
| { |
| "epoch": 3.595293346538566, |
| "grad_norm": 1.2562330961227417, |
| "learning_rate": 2.2775919732441475e-06, |
| "loss": 0.2816, |
| "step": 37300 |
| }, |
| { |
| "epoch": 3.595293346538566, |
| "eval_loss": 0.25133049488067627, |
| "eval_runtime": 59.0752, |
| "eval_samples_per_second": 169.276, |
| "eval_steps_per_second": 21.159, |
| "step": 37300 |
| }, |
| { |
| "epoch": 3.604932202695506, |
| "grad_norm": 1.2339458465576172, |
| "learning_rate": 2.267558528428094e-06, |
| "loss": 0.2811, |
| "step": 37400 |
| }, |
| { |
| "epoch": 3.604932202695506, |
| "eval_loss": 0.2517785131931305, |
| "eval_runtime": 59.0293, |
| "eval_samples_per_second": 169.407, |
| "eval_steps_per_second": 21.176, |
| "step": 37400 |
| }, |
| { |
| "epoch": 3.6145710588524462, |
| "grad_norm": 1.1703965663909912, |
| "learning_rate": 2.25752508361204e-06, |
| "loss": 0.285, |
| "step": 37500 |
| }, |
| { |
| "epoch": 3.6145710588524462, |
| "eval_loss": 0.24998454749584198, |
| "eval_runtime": 58.8551, |
| "eval_samples_per_second": 169.909, |
| "eval_steps_per_second": 21.239, |
| "step": 37500 |
| }, |
| { |
| "epoch": 3.624209915009386, |
| "grad_norm": 1.1811251640319824, |
| "learning_rate": 2.2474916387959865e-06, |
| "loss": 0.2853, |
| "step": 37600 |
| }, |
| { |
| "epoch": 3.624209915009386, |
| "eval_loss": 0.24660471081733704, |
| "eval_runtime": 58.8887, |
| "eval_samples_per_second": 169.812, |
| "eval_steps_per_second": 21.226, |
| "step": 37600 |
| }, |
| { |
| "epoch": 3.633848771166326, |
| "grad_norm": 1.2618868350982666, |
| "learning_rate": 2.237458193979933e-06, |
| "loss": 0.2795, |
| "step": 37700 |
| }, |
| { |
| "epoch": 3.633848771166326, |
| "eval_loss": 0.25702670216560364, |
| "eval_runtime": 59.0964, |
| "eval_samples_per_second": 169.215, |
| "eval_steps_per_second": 21.152, |
| "step": 37700 |
| }, |
| { |
| "epoch": 3.6434876273232657, |
| "grad_norm": 1.1779111623764038, |
| "learning_rate": 2.2274247491638796e-06, |
| "loss": 0.2807, |
| "step": 37800 |
| }, |
| { |
| "epoch": 3.6434876273232657, |
| "eval_loss": 0.24639862775802612, |
| "eval_runtime": 58.9536, |
| "eval_samples_per_second": 169.625, |
| "eval_steps_per_second": 21.203, |
| "step": 37800 |
| }, |
| { |
| "epoch": 3.6531264834802055, |
| "grad_norm": 1.1458935737609863, |
| "learning_rate": 2.217391304347826e-06, |
| "loss": 0.2781, |
| "step": 37900 |
| }, |
| { |
| "epoch": 3.6531264834802055, |
| "eval_loss": 0.25728264451026917, |
| "eval_runtime": 58.9434, |
| "eval_samples_per_second": 169.654, |
| "eval_steps_per_second": 21.207, |
| "step": 37900 |
| }, |
| { |
| "epoch": 3.6627653396371453, |
| "grad_norm": 1.2311447858810425, |
| "learning_rate": 2.2073578595317723e-06, |
| "loss": 0.2804, |
| "step": 38000 |
| }, |
| { |
| "epoch": 3.6627653396371453, |
| "eval_loss": 0.25558847188949585, |
| "eval_runtime": 58.8989, |
| "eval_samples_per_second": 169.782, |
| "eval_steps_per_second": 21.223, |
| "step": 38000 |
| }, |
| { |
| "epoch": 3.672404195794085, |
| "grad_norm": 1.1407544612884521, |
| "learning_rate": 2.197324414715719e-06, |
| "loss": 0.2765, |
| "step": 38100 |
| }, |
| { |
| "epoch": 3.672404195794085, |
| "eval_loss": 0.248101145029068, |
| "eval_runtime": 58.7355, |
| "eval_samples_per_second": 170.255, |
| "eval_steps_per_second": 21.282, |
| "step": 38100 |
| }, |
| { |
| "epoch": 3.682043051951025, |
| "grad_norm": 1.2951027154922485, |
| "learning_rate": 2.1872909698996654e-06, |
| "loss": 0.2802, |
| "step": 38200 |
| }, |
| { |
| "epoch": 3.682043051951025, |
| "eval_loss": 0.25361719727516174, |
| "eval_runtime": 58.7402, |
| "eval_samples_per_second": 170.241, |
| "eval_steps_per_second": 21.28, |
| "step": 38200 |
| }, |
| { |
| "epoch": 3.691681908107965, |
| "grad_norm": 1.1731014251708984, |
| "learning_rate": 2.177257525083612e-06, |
| "loss": 0.2773, |
| "step": 38300 |
| }, |
| { |
| "epoch": 3.691681908107965, |
| "eval_loss": 0.25366759300231934, |
| "eval_runtime": 58.9147, |
| "eval_samples_per_second": 169.737, |
| "eval_steps_per_second": 21.217, |
| "step": 38300 |
| }, |
| { |
| "epoch": 3.7013207642649046, |
| "grad_norm": 1.2044979333877563, |
| "learning_rate": 2.1672240802675585e-06, |
| "loss": 0.2782, |
| "step": 38400 |
| }, |
| { |
| "epoch": 3.7013207642649046, |
| "eval_loss": 0.2512661814689636, |
| "eval_runtime": 58.7469, |
| "eval_samples_per_second": 170.222, |
| "eval_steps_per_second": 21.278, |
| "step": 38400 |
| }, |
| { |
| "epoch": 3.7109596204218445, |
| "grad_norm": 1.1250883340835571, |
| "learning_rate": 2.157190635451505e-06, |
| "loss": 0.2766, |
| "step": 38500 |
| }, |
| { |
| "epoch": 3.7109596204218445, |
| "eval_loss": 0.25230294466018677, |
| "eval_runtime": 58.7174, |
| "eval_samples_per_second": 170.307, |
| "eval_steps_per_second": 21.288, |
| "step": 38500 |
| }, |
| { |
| "epoch": 3.7205984765787843, |
| "grad_norm": 1.1560463905334473, |
| "learning_rate": 2.1471571906354516e-06, |
| "loss": 0.2794, |
| "step": 38600 |
| }, |
| { |
| "epoch": 3.7205984765787843, |
| "eval_loss": 0.25185054540634155, |
| "eval_runtime": 58.9294, |
| "eval_samples_per_second": 169.695, |
| "eval_steps_per_second": 21.212, |
| "step": 38600 |
| }, |
| { |
| "epoch": 3.730237332735724, |
| "grad_norm": 1.104009747505188, |
| "learning_rate": 2.137123745819398e-06, |
| "loss": 0.2792, |
| "step": 38700 |
| }, |
| { |
| "epoch": 3.730237332735724, |
| "eval_loss": 0.25351107120513916, |
| "eval_runtime": 58.77, |
| "eval_samples_per_second": 170.155, |
| "eval_steps_per_second": 21.269, |
| "step": 38700 |
| }, |
| { |
| "epoch": 3.739876188892664, |
| "grad_norm": 1.1741303205490112, |
| "learning_rate": 2.1270903010033443e-06, |
| "loss": 0.2804, |
| "step": 38800 |
| }, |
| { |
| "epoch": 3.739876188892664, |
| "eval_loss": 0.2518996000289917, |
| "eval_runtime": 58.7376, |
| "eval_samples_per_second": 170.249, |
| "eval_steps_per_second": 21.281, |
| "step": 38800 |
| }, |
| { |
| "epoch": 3.7495150450496038, |
| "grad_norm": 1.218940258026123, |
| "learning_rate": 2.117056856187291e-06, |
| "loss": 0.2778, |
| "step": 38900 |
| }, |
| { |
| "epoch": 3.7495150450496038, |
| "eval_loss": 0.25092679262161255, |
| "eval_runtime": 58.9207, |
| "eval_samples_per_second": 169.72, |
| "eval_steps_per_second": 21.215, |
| "step": 38900 |
| }, |
| { |
| "epoch": 3.7591539012065436, |
| "grad_norm": 1.15857994556427, |
| "learning_rate": 2.1070234113712374e-06, |
| "loss": 0.2788, |
| "step": 39000 |
| }, |
| { |
| "epoch": 3.7591539012065436, |
| "eval_loss": 0.2451571226119995, |
| "eval_runtime": 58.789, |
| "eval_samples_per_second": 170.1, |
| "eval_steps_per_second": 21.262, |
| "step": 39000 |
| }, |
| { |
| "epoch": 3.7687927573634834, |
| "grad_norm": 1.1558825969696045, |
| "learning_rate": 2.0969899665551837e-06, |
| "loss": 0.2782, |
| "step": 39100 |
| }, |
| { |
| "epoch": 3.7687927573634834, |
| "eval_loss": 0.2468053698539734, |
| "eval_runtime": 58.7539, |
| "eval_samples_per_second": 170.202, |
| "eval_steps_per_second": 21.275, |
| "step": 39100 |
| }, |
| { |
| "epoch": 3.7784316135204237, |
| "grad_norm": 1.0610604286193848, |
| "learning_rate": 2.0869565217391305e-06, |
| "loss": 0.2774, |
| "step": 39200 |
| }, |
| { |
| "epoch": 3.7784316135204237, |
| "eval_loss": 0.2472023367881775, |
| "eval_runtime": 58.9115, |
| "eval_samples_per_second": 169.746, |
| "eval_steps_per_second": 21.218, |
| "step": 39200 |
| }, |
| { |
| "epoch": 3.7880704696773635, |
| "grad_norm": 1.1503691673278809, |
| "learning_rate": 2.076923076923077e-06, |
| "loss": 0.278, |
| "step": 39300 |
| }, |
| { |
| "epoch": 3.7880704696773635, |
| "eval_loss": 0.2427283674478531, |
| "eval_runtime": 58.7581, |
| "eval_samples_per_second": 170.189, |
| "eval_steps_per_second": 21.274, |
| "step": 39300 |
| }, |
| { |
| "epoch": 3.7977093258343033, |
| "grad_norm": 1.0943491458892822, |
| "learning_rate": 2.0668896321070236e-06, |
| "loss": 0.2787, |
| "step": 39400 |
| }, |
| { |
| "epoch": 3.7977093258343033, |
| "eval_loss": 0.24747537076473236, |
| "eval_runtime": 58.7249, |
| "eval_samples_per_second": 170.286, |
| "eval_steps_per_second": 21.286, |
| "step": 39400 |
| }, |
| { |
| "epoch": 3.807348181991243, |
| "grad_norm": 1.185901403427124, |
| "learning_rate": 2.05685618729097e-06, |
| "loss": 0.277, |
| "step": 39500 |
| }, |
| { |
| "epoch": 3.807348181991243, |
| "eval_loss": 0.24593886733055115, |
| "eval_runtime": 58.9029, |
| "eval_samples_per_second": 169.771, |
| "eval_steps_per_second": 21.221, |
| "step": 39500 |
| }, |
| { |
| "epoch": 3.816987038148183, |
| "grad_norm": 1.1177955865859985, |
| "learning_rate": 2.0468227424749163e-06, |
| "loss": 0.2764, |
| "step": 39600 |
| }, |
| { |
| "epoch": 3.816987038148183, |
| "eval_loss": 0.25283169746398926, |
| "eval_runtime": 58.8362, |
| "eval_samples_per_second": 169.963, |
| "eval_steps_per_second": 21.245, |
| "step": 39600 |
| }, |
| { |
| "epoch": 3.826625894305123, |
| "grad_norm": 1.2729185819625854, |
| "learning_rate": 2.036789297658863e-06, |
| "loss": 0.2756, |
| "step": 39700 |
| }, |
| { |
| "epoch": 3.826625894305123, |
| "eval_loss": 0.24848853051662445, |
| "eval_runtime": 58.9454, |
| "eval_samples_per_second": 169.649, |
| "eval_steps_per_second": 21.206, |
| "step": 39700 |
| }, |
| { |
| "epoch": 3.8362647504620626, |
| "grad_norm": 1.056195855140686, |
| "learning_rate": 2.0267558528428094e-06, |
| "loss": 0.2755, |
| "step": 39800 |
| }, |
| { |
| "epoch": 3.8362647504620626, |
| "eval_loss": 0.24940501153469086, |
| "eval_runtime": 59.1051, |
| "eval_samples_per_second": 169.19, |
| "eval_steps_per_second": 21.149, |
| "step": 39800 |
| }, |
| { |
| "epoch": 3.8459036066190024, |
| "grad_norm": 1.1669950485229492, |
| "learning_rate": 2.0167224080267557e-06, |
| "loss": 0.2715, |
| "step": 39900 |
| }, |
| { |
| "epoch": 3.8459036066190024, |
| "eval_loss": 0.2444242686033249, |
| "eval_runtime": 58.9505, |
| "eval_samples_per_second": 169.634, |
| "eval_steps_per_second": 21.204, |
| "step": 39900 |
| }, |
| { |
| "epoch": 3.8555424627759423, |
| "grad_norm": 1.2285829782485962, |
| "learning_rate": 2.0066889632107025e-06, |
| "loss": 0.2764, |
| "step": 40000 |
| }, |
| { |
| "epoch": 3.8555424627759423, |
| "eval_loss": 0.2512260377407074, |
| "eval_runtime": 58.9442, |
| "eval_samples_per_second": 169.652, |
| "eval_steps_per_second": 21.207, |
| "step": 40000 |
| }, |
| { |
| "epoch": 3.865181318932882, |
| "grad_norm": 1.120784878730774, |
| "learning_rate": 1.996655518394649e-06, |
| "loss": 0.273, |
| "step": 40100 |
| }, |
| { |
| "epoch": 3.865181318932882, |
| "eval_loss": 0.24423950910568237, |
| "eval_runtime": 59.1371, |
| "eval_samples_per_second": 169.099, |
| "eval_steps_per_second": 21.137, |
| "step": 40100 |
| }, |
| { |
| "epoch": 3.8748201750898223, |
| "grad_norm": 1.1356536149978638, |
| "learning_rate": 1.986622073578595e-06, |
| "loss": 0.2742, |
| "step": 40200 |
| }, |
| { |
| "epoch": 3.8748201750898223, |
| "eval_loss": 0.24526259303092957, |
| "eval_runtime": 58.9383, |
| "eval_samples_per_second": 169.669, |
| "eval_steps_per_second": 21.209, |
| "step": 40200 |
| }, |
| { |
| "epoch": 3.884459031246762, |
| "grad_norm": 1.1070743799209595, |
| "learning_rate": 1.976588628762542e-06, |
| "loss": 0.2718, |
| "step": 40300 |
| }, |
| { |
| "epoch": 3.884459031246762, |
| "eval_loss": 0.2504149377346039, |
| "eval_runtime": 58.9412, |
| "eval_samples_per_second": 169.66, |
| "eval_steps_per_second": 21.208, |
| "step": 40300 |
| }, |
| { |
| "epoch": 3.894097887403702, |
| "grad_norm": 1.0678352117538452, |
| "learning_rate": 1.9665551839464883e-06, |
| "loss": 0.2731, |
| "step": 40400 |
| }, |
| { |
| "epoch": 3.894097887403702, |
| "eval_loss": 0.24410085380077362, |
| "eval_runtime": 59.1421, |
| "eval_samples_per_second": 169.084, |
| "eval_steps_per_second": 21.136, |
| "step": 40400 |
| }, |
| { |
| "epoch": 3.903736743560642, |
| "grad_norm": 1.212470293045044, |
| "learning_rate": 1.956521739130435e-06, |
| "loss": 0.2751, |
| "step": 40500 |
| }, |
| { |
| "epoch": 3.903736743560642, |
| "eval_loss": 0.24343110620975494, |
| "eval_runtime": 58.9168, |
| "eval_samples_per_second": 169.731, |
| "eval_steps_per_second": 21.216, |
| "step": 40500 |
| }, |
| { |
| "epoch": 3.9133755997175816, |
| "grad_norm": 1.2708877325057983, |
| "learning_rate": 1.9464882943143814e-06, |
| "loss": 0.2736, |
| "step": 40600 |
| }, |
| { |
| "epoch": 3.9133755997175816, |
| "eval_loss": 0.2406957447528839, |
| "eval_runtime": 58.9388, |
| "eval_samples_per_second": 169.668, |
| "eval_steps_per_second": 21.208, |
| "step": 40600 |
| }, |
| { |
| "epoch": 3.9230144558745215, |
| "grad_norm": 1.4610050916671753, |
| "learning_rate": 1.9364548494983277e-06, |
| "loss": 0.2732, |
| "step": 40700 |
| }, |
| { |
| "epoch": 3.9230144558745215, |
| "eval_loss": 0.23855151236057281, |
| "eval_runtime": 59.1412, |
| "eval_samples_per_second": 169.087, |
| "eval_steps_per_second": 21.136, |
| "step": 40700 |
| }, |
| { |
| "epoch": 3.9326533120314613, |
| "grad_norm": 1.3062421083450317, |
| "learning_rate": 1.9264214046822745e-06, |
| "loss": 0.2737, |
| "step": 40800 |
| }, |
| { |
| "epoch": 3.9326533120314613, |
| "eval_loss": 0.24466918408870697, |
| "eval_runtime": 58.926, |
| "eval_samples_per_second": 169.704, |
| "eval_steps_per_second": 21.213, |
| "step": 40800 |
| }, |
| { |
| "epoch": 3.942292168188401, |
| "grad_norm": 1.1947509050369263, |
| "learning_rate": 1.916387959866221e-06, |
| "loss": 0.2732, |
| "step": 40900 |
| }, |
| { |
| "epoch": 3.942292168188401, |
| "eval_loss": 0.2433861643075943, |
| "eval_runtime": 58.8674, |
| "eval_samples_per_second": 169.873, |
| "eval_steps_per_second": 21.234, |
| "step": 40900 |
| }, |
| { |
| "epoch": 3.951931024345341, |
| "grad_norm": 1.0160512924194336, |
| "learning_rate": 1.9063545150501674e-06, |
| "loss": 0.2734, |
| "step": 41000 |
| }, |
| { |
| "epoch": 3.951931024345341, |
| "eval_loss": 0.24299292266368866, |
| "eval_runtime": 58.6917, |
| "eval_samples_per_second": 170.382, |
| "eval_steps_per_second": 21.298, |
| "step": 41000 |
| }, |
| { |
| "epoch": 3.9615698805022808, |
| "grad_norm": 1.2853399515151978, |
| "learning_rate": 1.896321070234114e-06, |
| "loss": 0.2735, |
| "step": 41100 |
| }, |
| { |
| "epoch": 3.9615698805022808, |
| "eval_loss": 0.2398652285337448, |
| "eval_runtime": 58.63, |
| "eval_samples_per_second": 170.561, |
| "eval_steps_per_second": 21.32, |
| "step": 41100 |
| }, |
| { |
| "epoch": 3.9712087366592206, |
| "grad_norm": 1.2261525392532349, |
| "learning_rate": 1.8862876254180603e-06, |
| "loss": 0.2728, |
| "step": 41200 |
| }, |
| { |
| "epoch": 3.9712087366592206, |
| "eval_loss": 0.24188843369483948, |
| "eval_runtime": 58.8294, |
| "eval_samples_per_second": 169.983, |
| "eval_steps_per_second": 21.248, |
| "step": 41200 |
| }, |
| { |
| "epoch": 3.9808475928161604, |
| "grad_norm": 1.2449485063552856, |
| "learning_rate": 1.8762541806020068e-06, |
| "loss": 0.2703, |
| "step": 41300 |
| }, |
| { |
| "epoch": 3.9808475928161604, |
| "eval_loss": 0.24452358484268188, |
| "eval_runtime": 59.1226, |
| "eval_samples_per_second": 169.14, |
| "eval_steps_per_second": 21.143, |
| "step": 41300 |
| }, |
| { |
| "epoch": 3.9904864489731002, |
| "grad_norm": 1.1758934259414673, |
| "learning_rate": 1.8662207357859534e-06, |
| "loss": 0.276, |
| "step": 41400 |
| }, |
| { |
| "epoch": 3.9904864489731002, |
| "eval_loss": 0.24620556831359863, |
| "eval_runtime": 58.6943, |
| "eval_samples_per_second": 170.374, |
| "eval_steps_per_second": 21.297, |
| "step": 41400 |
| }, |
| { |
| "epoch": 4.00012530513004, |
| "grad_norm": 1.1423110961914062, |
| "learning_rate": 1.8561872909699e-06, |
| "loss": 0.2716, |
| "step": 41500 |
| }, |
| { |
| "epoch": 4.00012530513004, |
| "eval_loss": 0.24084354937076569, |
| "eval_runtime": 58.8706, |
| "eval_samples_per_second": 169.864, |
| "eval_steps_per_second": 21.233, |
| "step": 41500 |
| }, |
| { |
| "epoch": 4.00976416128698, |
| "grad_norm": 1.188114881515503, |
| "learning_rate": 1.8461538461538462e-06, |
| "loss": 0.2712, |
| "step": 41600 |
| }, |
| { |
| "epoch": 4.00976416128698, |
| "eval_loss": 0.2466341108083725, |
| "eval_runtime": 58.8443, |
| "eval_samples_per_second": 169.94, |
| "eval_steps_per_second": 21.242, |
| "step": 41600 |
| }, |
| { |
| "epoch": 4.01940301744392, |
| "grad_norm": 1.0706281661987305, |
| "learning_rate": 1.8361204013377928e-06, |
| "loss": 0.2719, |
| "step": 41700 |
| }, |
| { |
| "epoch": 4.01940301744392, |
| "eval_loss": 0.24489988386631012, |
| "eval_runtime": 58.7263, |
| "eval_samples_per_second": 170.281, |
| "eval_steps_per_second": 21.285, |
| "step": 41700 |
| }, |
| { |
| "epoch": 4.0290418736008595, |
| "grad_norm": 1.134295105934143, |
| "learning_rate": 1.8260869565217394e-06, |
| "loss": 0.2721, |
| "step": 41800 |
| }, |
| { |
| "epoch": 4.0290418736008595, |
| "eval_loss": 0.2406749725341797, |
| "eval_runtime": 59.0931, |
| "eval_samples_per_second": 169.225, |
| "eval_steps_per_second": 21.153, |
| "step": 41800 |
| }, |
| { |
| "epoch": 4.038680729757799, |
| "grad_norm": 1.140578269958496, |
| "learning_rate": 1.8160535117056857e-06, |
| "loss": 0.2703, |
| "step": 41900 |
| }, |
| { |
| "epoch": 4.038680729757799, |
| "eval_loss": 0.24536637961864471, |
| "eval_runtime": 58.9496, |
| "eval_samples_per_second": 169.636, |
| "eval_steps_per_second": 21.205, |
| "step": 41900 |
| }, |
| { |
| "epoch": 4.048319585914739, |
| "grad_norm": 1.1268982887268066, |
| "learning_rate": 1.8060200668896322e-06, |
| "loss": 0.2708, |
| "step": 42000 |
| }, |
| { |
| "epoch": 4.048319585914739, |
| "eval_loss": 0.24372392892837524, |
| "eval_runtime": 58.9564, |
| "eval_samples_per_second": 169.617, |
| "eval_steps_per_second": 21.202, |
| "step": 42000 |
| }, |
| { |
| "epoch": 4.057958442071679, |
| "grad_norm": 1.126116156578064, |
| "learning_rate": 1.7959866220735788e-06, |
| "loss": 0.2674, |
| "step": 42100 |
| }, |
| { |
| "epoch": 4.057958442071679, |
| "eval_loss": 0.24174726009368896, |
| "eval_runtime": 59.1178, |
| "eval_samples_per_second": 169.154, |
| "eval_steps_per_second": 21.144, |
| "step": 42100 |
| }, |
| { |
| "epoch": 4.067597298228619, |
| "grad_norm": 1.1760188341140747, |
| "learning_rate": 1.7859531772575253e-06, |
| "loss": 0.2697, |
| "step": 42200 |
| }, |
| { |
| "epoch": 4.067597298228619, |
| "eval_loss": 0.2439856231212616, |
| "eval_runtime": 58.9349, |
| "eval_samples_per_second": 169.679, |
| "eval_steps_per_second": 21.21, |
| "step": 42200 |
| }, |
| { |
| "epoch": 4.077236154385559, |
| "grad_norm": 0.9911255836486816, |
| "learning_rate": 1.7759197324414717e-06, |
| "loss": 0.2654, |
| "step": 42300 |
| }, |
| { |
| "epoch": 4.077236154385559, |
| "eval_loss": 0.23890195786952972, |
| "eval_runtime": 58.9228, |
| "eval_samples_per_second": 169.714, |
| "eval_steps_per_second": 21.214, |
| "step": 42300 |
| }, |
| { |
| "epoch": 4.086875010542499, |
| "grad_norm": 1.2863502502441406, |
| "learning_rate": 1.7658862876254182e-06, |
| "loss": 0.2692, |
| "step": 42400 |
| }, |
| { |
| "epoch": 4.086875010542499, |
| "eval_loss": 0.24417583644390106, |
| "eval_runtime": 58.8653, |
| "eval_samples_per_second": 169.879, |
| "eval_steps_per_second": 21.235, |
| "step": 42400 |
| }, |
| { |
| "epoch": 4.096513866699439, |
| "grad_norm": 1.1881786584854126, |
| "learning_rate": 1.7558528428093648e-06, |
| "loss": 0.2708, |
| "step": 42500 |
| }, |
| { |
| "epoch": 4.096513866699439, |
| "eval_loss": 0.24346549808979034, |
| "eval_runtime": 58.7861, |
| "eval_samples_per_second": 170.108, |
| "eval_steps_per_second": 21.264, |
| "step": 42500 |
| }, |
| { |
| "epoch": 4.106152722856379, |
| "grad_norm": 1.2108421325683594, |
| "learning_rate": 1.745819397993311e-06, |
| "loss": 0.2692, |
| "step": 42600 |
| }, |
| { |
| "epoch": 4.106152722856379, |
| "eval_loss": 0.2419823706150055, |
| "eval_runtime": 58.7844, |
| "eval_samples_per_second": 170.113, |
| "eval_steps_per_second": 21.264, |
| "step": 42600 |
| }, |
| { |
| "epoch": 4.115791579013319, |
| "grad_norm": 1.2227544784545898, |
| "learning_rate": 1.7357859531772575e-06, |
| "loss": 0.2695, |
| "step": 42700 |
| }, |
| { |
| "epoch": 4.115791579013319, |
| "eval_loss": 0.23794367909431458, |
| "eval_runtime": 59.1212, |
| "eval_samples_per_second": 169.144, |
| "eval_steps_per_second": 21.143, |
| "step": 42700 |
| }, |
| { |
| "epoch": 4.125430435170259, |
| "grad_norm": 1.0150911808013916, |
| "learning_rate": 1.7257525083612038e-06, |
| "loss": 0.2694, |
| "step": 42800 |
| }, |
| { |
| "epoch": 4.125430435170259, |
| "eval_loss": 0.24516792595386505, |
| "eval_runtime": 58.9164, |
| "eval_samples_per_second": 169.732, |
| "eval_steps_per_second": 21.217, |
| "step": 42800 |
| }, |
| { |
| "epoch": 4.1350692913271985, |
| "grad_norm": 1.1706311702728271, |
| "learning_rate": 1.7157190635451504e-06, |
| "loss": 0.2707, |
| "step": 42900 |
| }, |
| { |
| "epoch": 4.1350692913271985, |
| "eval_loss": 0.23993416130542755, |
| "eval_runtime": 58.8705, |
| "eval_samples_per_second": 169.864, |
| "eval_steps_per_second": 21.233, |
| "step": 42900 |
| }, |
| { |
| "epoch": 4.144708147484138, |
| "grad_norm": 1.1141585111618042, |
| "learning_rate": 1.705685618729097e-06, |
| "loss": 0.2696, |
| "step": 43000 |
| }, |
| { |
| "epoch": 4.144708147484138, |
| "eval_loss": 0.2433127909898758, |
| "eval_runtime": 59.0527, |
| "eval_samples_per_second": 169.34, |
| "eval_steps_per_second": 21.168, |
| "step": 43000 |
| }, |
| { |
| "epoch": 4.154347003641078, |
| "grad_norm": 1.11643385887146, |
| "learning_rate": 1.6956521739130435e-06, |
| "loss": 0.2669, |
| "step": 43100 |
| }, |
| { |
| "epoch": 4.154347003641078, |
| "eval_loss": 0.24249985814094543, |
| "eval_runtime": 58.8967, |
| "eval_samples_per_second": 169.789, |
| "eval_steps_per_second": 21.224, |
| "step": 43100 |
| }, |
| { |
| "epoch": 4.163985859798018, |
| "grad_norm": 1.1383628845214844, |
| "learning_rate": 1.6856187290969898e-06, |
| "loss": 0.2688, |
| "step": 43200 |
| }, |
| { |
| "epoch": 4.163985859798018, |
| "eval_loss": 0.24246694147586823, |
| "eval_runtime": 58.9245, |
| "eval_samples_per_second": 169.709, |
| "eval_steps_per_second": 21.214, |
| "step": 43200 |
| }, |
| { |
| "epoch": 4.173624715954958, |
| "grad_norm": 1.0870535373687744, |
| "learning_rate": 1.6755852842809363e-06, |
| "loss": 0.2669, |
| "step": 43300 |
| }, |
| { |
| "epoch": 4.173624715954958, |
| "eval_loss": 0.23730356991291046, |
| "eval_runtime": 59.0839, |
| "eval_samples_per_second": 169.251, |
| "eval_steps_per_second": 21.156, |
| "step": 43300 |
| }, |
| { |
| "epoch": 4.183263572111898, |
| "grad_norm": 1.062546730041504, |
| "learning_rate": 1.665551839464883e-06, |
| "loss": 0.268, |
| "step": 43400 |
| }, |
| { |
| "epoch": 4.183263572111898, |
| "eval_loss": 0.2389833778142929, |
| "eval_runtime": 58.9088, |
| "eval_samples_per_second": 169.754, |
| "eval_steps_per_second": 21.219, |
| "step": 43400 |
| }, |
| { |
| "epoch": 4.192902428268837, |
| "grad_norm": 1.2048453092575073, |
| "learning_rate": 1.6555183946488294e-06, |
| "loss": 0.2661, |
| "step": 43500 |
| }, |
| { |
| "epoch": 4.192902428268837, |
| "eval_loss": 0.24401752650737762, |
| "eval_runtime": 58.9403, |
| "eval_samples_per_second": 169.663, |
| "eval_steps_per_second": 21.208, |
| "step": 43500 |
| }, |
| { |
| "epoch": 4.202541284425777, |
| "grad_norm": 1.2191439867019653, |
| "learning_rate": 1.6454849498327758e-06, |
| "loss": 0.2679, |
| "step": 43600 |
| }, |
| { |
| "epoch": 4.202541284425777, |
| "eval_loss": 0.2426602989435196, |
| "eval_runtime": 59.1448, |
| "eval_samples_per_second": 169.077, |
| "eval_steps_per_second": 21.135, |
| "step": 43600 |
| }, |
| { |
| "epoch": 4.212180140582717, |
| "grad_norm": 1.168209195137024, |
| "learning_rate": 1.6354515050167223e-06, |
| "loss": 0.267, |
| "step": 43700 |
| }, |
| { |
| "epoch": 4.212180140582717, |
| "eval_loss": 0.2363247573375702, |
| "eval_runtime": 58.9517, |
| "eval_samples_per_second": 169.63, |
| "eval_steps_per_second": 21.204, |
| "step": 43700 |
| }, |
| { |
| "epoch": 4.221818996739657, |
| "grad_norm": 1.2940659523010254, |
| "learning_rate": 1.6254180602006689e-06, |
| "loss": 0.2675, |
| "step": 43800 |
| }, |
| { |
| "epoch": 4.221818996739657, |
| "eval_loss": 0.23338085412979126, |
| "eval_runtime": 59.1408, |
| "eval_samples_per_second": 169.088, |
| "eval_steps_per_second": 21.136, |
| "step": 43800 |
| }, |
| { |
| "epoch": 4.231457852896597, |
| "grad_norm": 1.2550647258758545, |
| "learning_rate": 1.6153846153846154e-06, |
| "loss": 0.2677, |
| "step": 43900 |
| }, |
| { |
| "epoch": 4.231457852896597, |
| "eval_loss": 0.24174867570400238, |
| "eval_runtime": 58.7543, |
| "eval_samples_per_second": 170.2, |
| "eval_steps_per_second": 21.275, |
| "step": 43900 |
| }, |
| { |
| "epoch": 4.2410967090535365, |
| "grad_norm": 1.1011639833450317, |
| "learning_rate": 1.6053511705685618e-06, |
| "loss": 0.2671, |
| "step": 44000 |
| }, |
| { |
| "epoch": 4.2410967090535365, |
| "eval_loss": 0.23765331506729126, |
| "eval_runtime": 59.0653, |
| "eval_samples_per_second": 169.304, |
| "eval_steps_per_second": 21.163, |
| "step": 44000 |
| }, |
| { |
| "epoch": 4.250735565210476, |
| "grad_norm": 1.282638430595398, |
| "learning_rate": 1.5953177257525083e-06, |
| "loss": 0.2674, |
| "step": 44100 |
| }, |
| { |
| "epoch": 4.250735565210476, |
| "eval_loss": 0.23663020133972168, |
| "eval_runtime": 58.989, |
| "eval_samples_per_second": 169.523, |
| "eval_steps_per_second": 21.19, |
| "step": 44100 |
| }, |
| { |
| "epoch": 4.260374421367416, |
| "grad_norm": 1.233346700668335, |
| "learning_rate": 1.5852842809364549e-06, |
| "loss": 0.2664, |
| "step": 44200 |
| }, |
| { |
| "epoch": 4.260374421367416, |
| "eval_loss": 0.2390340268611908, |
| "eval_runtime": 58.9592, |
| "eval_samples_per_second": 169.609, |
| "eval_steps_per_second": 21.201, |
| "step": 44200 |
| }, |
| { |
| "epoch": 4.270013277524356, |
| "grad_norm": 1.2142596244812012, |
| "learning_rate": 1.5752508361204012e-06, |
| "loss": 0.2673, |
| "step": 44300 |
| }, |
| { |
| "epoch": 4.270013277524356, |
| "eval_loss": 0.23672623932361603, |
| "eval_runtime": 59.1929, |
| "eval_samples_per_second": 168.939, |
| "eval_steps_per_second": 21.117, |
| "step": 44300 |
| }, |
| { |
| "epoch": 4.279652133681296, |
| "grad_norm": 1.1910849809646606, |
| "learning_rate": 1.5652173913043478e-06, |
| "loss": 0.2663, |
| "step": 44400 |
| }, |
| { |
| "epoch": 4.279652133681296, |
| "eval_loss": 0.23604613542556763, |
| "eval_runtime": 59.1767, |
| "eval_samples_per_second": 168.985, |
| "eval_steps_per_second": 21.123, |
| "step": 44400 |
| }, |
| { |
| "epoch": 4.289290989838236, |
| "grad_norm": 1.0741398334503174, |
| "learning_rate": 1.5551839464882943e-06, |
| "loss": 0.265, |
| "step": 44500 |
| }, |
| { |
| "epoch": 4.289290989838236, |
| "eval_loss": 0.24038757383823395, |
| "eval_runtime": 58.9675, |
| "eval_samples_per_second": 169.585, |
| "eval_steps_per_second": 21.198, |
| "step": 44500 |
| }, |
| { |
| "epoch": 4.2989298459951755, |
| "grad_norm": 1.1280308961868286, |
| "learning_rate": 1.5451505016722409e-06, |
| "loss": 0.2681, |
| "step": 44600 |
| }, |
| { |
| "epoch": 4.2989298459951755, |
| "eval_loss": 0.2376743108034134, |
| "eval_runtime": 59.012, |
| "eval_samples_per_second": 169.457, |
| "eval_steps_per_second": 21.182, |
| "step": 44600 |
| }, |
| { |
| "epoch": 4.308568702152115, |
| "grad_norm": 1.210562825202942, |
| "learning_rate": 1.5351170568561872e-06, |
| "loss": 0.2653, |
| "step": 44700 |
| }, |
| { |
| "epoch": 4.308568702152115, |
| "eval_loss": 0.2342677265405655, |
| "eval_runtime": 59.1509, |
| "eval_samples_per_second": 169.059, |
| "eval_steps_per_second": 21.132, |
| "step": 44700 |
| }, |
| { |
| "epoch": 4.318207558309055, |
| "grad_norm": 1.2077608108520508, |
| "learning_rate": 1.5250836120401338e-06, |
| "loss": 0.2659, |
| "step": 44800 |
| }, |
| { |
| "epoch": 4.318207558309055, |
| "eval_loss": 0.23438528180122375, |
| "eval_runtime": 58.9257, |
| "eval_samples_per_second": 169.705, |
| "eval_steps_per_second": 21.213, |
| "step": 44800 |
| }, |
| { |
| "epoch": 4.327846414465995, |
| "grad_norm": 1.2959463596343994, |
| "learning_rate": 1.5150501672240803e-06, |
| "loss": 0.266, |
| "step": 44900 |
| }, |
| { |
| "epoch": 4.327846414465995, |
| "eval_loss": 0.2376752495765686, |
| "eval_runtime": 58.9319, |
| "eval_samples_per_second": 169.687, |
| "eval_steps_per_second": 21.211, |
| "step": 44900 |
| }, |
| { |
| "epoch": 4.337485270622935, |
| "grad_norm": 1.135292410850525, |
| "learning_rate": 1.5050167224080269e-06, |
| "loss": 0.2649, |
| "step": 45000 |
| }, |
| { |
| "epoch": 4.337485270622935, |
| "eval_loss": 0.24188388884067535, |
| "eval_runtime": 59.1492, |
| "eval_samples_per_second": 169.064, |
| "eval_steps_per_second": 21.133, |
| "step": 45000 |
| }, |
| { |
| "epoch": 4.347124126779875, |
| "grad_norm": 1.1070986986160278, |
| "learning_rate": 1.4949832775919732e-06, |
| "loss": 0.2663, |
| "step": 45100 |
| }, |
| { |
| "epoch": 4.347124126779875, |
| "eval_loss": 0.23631098866462708, |
| "eval_runtime": 58.8991, |
| "eval_samples_per_second": 169.782, |
| "eval_steps_per_second": 21.223, |
| "step": 45100 |
| }, |
| { |
| "epoch": 4.356762982936815, |
| "grad_norm": 1.0398037433624268, |
| "learning_rate": 1.4849498327759198e-06, |
| "loss": 0.2644, |
| "step": 45200 |
| }, |
| { |
| "epoch": 4.356762982936815, |
| "eval_loss": 0.23711133003234863, |
| "eval_runtime": 58.8817, |
| "eval_samples_per_second": 169.832, |
| "eval_steps_per_second": 21.229, |
| "step": 45200 |
| }, |
| { |
| "epoch": 4.366401839093755, |
| "grad_norm": 1.2478735446929932, |
| "learning_rate": 1.4749163879598663e-06, |
| "loss": 0.2654, |
| "step": 45300 |
| }, |
| { |
| "epoch": 4.366401839093755, |
| "eval_loss": 0.2371368706226349, |
| "eval_runtime": 59.1317, |
| "eval_samples_per_second": 169.114, |
| "eval_steps_per_second": 21.139, |
| "step": 45300 |
| }, |
| { |
| "epoch": 4.376040695250695, |
| "grad_norm": 1.1391520500183105, |
| "learning_rate": 1.4648829431438129e-06, |
| "loss": 0.2627, |
| "step": 45400 |
| }, |
| { |
| "epoch": 4.376040695250695, |
| "eval_loss": 0.23908434808254242, |
| "eval_runtime": 58.7291, |
| "eval_samples_per_second": 170.273, |
| "eval_steps_per_second": 21.284, |
| "step": 45400 |
| }, |
| { |
| "epoch": 4.385679551407635, |
| "grad_norm": 1.0501779317855835, |
| "learning_rate": 1.4548494983277592e-06, |
| "loss": 0.265, |
| "step": 45500 |
| }, |
| { |
| "epoch": 4.385679551407635, |
| "eval_loss": 0.2357652336359024, |
| "eval_runtime": 58.7539, |
| "eval_samples_per_second": 170.202, |
| "eval_steps_per_second": 21.275, |
| "step": 45500 |
| }, |
| { |
| "epoch": 4.395318407564575, |
| "grad_norm": 1.1620495319366455, |
| "learning_rate": 1.4448160535117058e-06, |
| "loss": 0.263, |
| "step": 45600 |
| }, |
| { |
| "epoch": 4.395318407564575, |
| "eval_loss": 0.23429590463638306, |
| "eval_runtime": 58.9861, |
| "eval_samples_per_second": 169.532, |
| "eval_steps_per_second": 21.191, |
| "step": 45600 |
| }, |
| { |
| "epoch": 4.404957263721514, |
| "grad_norm": 1.365100383758545, |
| "learning_rate": 1.4347826086956523e-06, |
| "loss": 0.2627, |
| "step": 45700 |
| }, |
| { |
| "epoch": 4.404957263721514, |
| "eval_loss": 0.23306749761104584, |
| "eval_runtime": 58.8225, |
| "eval_samples_per_second": 170.003, |
| "eval_steps_per_second": 21.25, |
| "step": 45700 |
| }, |
| { |
| "epoch": 4.414596119878454, |
| "grad_norm": 1.2309627532958984, |
| "learning_rate": 1.4247491638795989e-06, |
| "loss": 0.2632, |
| "step": 45800 |
| }, |
| { |
| "epoch": 4.414596119878454, |
| "eval_loss": 0.23629753291606903, |
| "eval_runtime": 58.8174, |
| "eval_samples_per_second": 170.018, |
| "eval_steps_per_second": 21.252, |
| "step": 45800 |
| }, |
| { |
| "epoch": 4.424234976035394, |
| "grad_norm": 1.277610421180725, |
| "learning_rate": 1.4147157190635452e-06, |
| "loss": 0.2655, |
| "step": 45900 |
| }, |
| { |
| "epoch": 4.424234976035394, |
| "eval_loss": 0.23689965903759003, |
| "eval_runtime": 58.9758, |
| "eval_samples_per_second": 169.561, |
| "eval_steps_per_second": 21.195, |
| "step": 45900 |
| }, |
| { |
| "epoch": 4.433873832192334, |
| "grad_norm": 1.1359058618545532, |
| "learning_rate": 1.4046822742474917e-06, |
| "loss": 0.2638, |
| "step": 46000 |
| }, |
| { |
| "epoch": 4.433873832192334, |
| "eval_loss": 0.23394687473773956, |
| "eval_runtime": 58.9538, |
| "eval_samples_per_second": 169.624, |
| "eval_steps_per_second": 21.203, |
| "step": 46000 |
| }, |
| { |
| "epoch": 4.443512688349274, |
| "grad_norm": 1.0680986642837524, |
| "learning_rate": 1.3946488294314383e-06, |
| "loss": 0.2611, |
| "step": 46100 |
| }, |
| { |
| "epoch": 4.443512688349274, |
| "eval_loss": 0.23477770388126373, |
| "eval_runtime": 58.934, |
| "eval_samples_per_second": 169.681, |
| "eval_steps_per_second": 21.21, |
| "step": 46100 |
| }, |
| { |
| "epoch": 4.4531515445062135, |
| "grad_norm": 1.0783350467681885, |
| "learning_rate": 1.3846153846153846e-06, |
| "loss": 0.2599, |
| "step": 46200 |
| }, |
| { |
| "epoch": 4.4531515445062135, |
| "eval_loss": 0.23568202555179596, |
| "eval_runtime": 59.1167, |
| "eval_samples_per_second": 169.157, |
| "eval_steps_per_second": 21.145, |
| "step": 46200 |
| }, |
| { |
| "epoch": 4.462790400663153, |
| "grad_norm": 1.2353805303573608, |
| "learning_rate": 1.374581939799331e-06, |
| "loss": 0.2602, |
| "step": 46300 |
| }, |
| { |
| "epoch": 4.462790400663153, |
| "eval_loss": 0.23270440101623535, |
| "eval_runtime": 58.9332, |
| "eval_samples_per_second": 169.684, |
| "eval_steps_per_second": 21.21, |
| "step": 46300 |
| }, |
| { |
| "epoch": 4.472429256820093, |
| "grad_norm": 1.1992213726043701, |
| "learning_rate": 1.3645484949832775e-06, |
| "loss": 0.2618, |
| "step": 46400 |
| }, |
| { |
| "epoch": 4.472429256820093, |
| "eval_loss": 0.23463425040245056, |
| "eval_runtime": 58.97, |
| "eval_samples_per_second": 169.578, |
| "eval_steps_per_second": 21.197, |
| "step": 46400 |
| }, |
| { |
| "epoch": 4.482068112977033, |
| "grad_norm": 1.2045105695724487, |
| "learning_rate": 1.354515050167224e-06, |
| "loss": 0.2607, |
| "step": 46500 |
| }, |
| { |
| "epoch": 4.482068112977033, |
| "eval_loss": 0.2355393022298813, |
| "eval_runtime": 59.1174, |
| "eval_samples_per_second": 169.155, |
| "eval_steps_per_second": 21.144, |
| "step": 46500 |
| }, |
| { |
| "epoch": 4.491706969133973, |
| "grad_norm": 1.1414119005203247, |
| "learning_rate": 1.3444816053511706e-06, |
| "loss": 0.2617, |
| "step": 46600 |
| }, |
| { |
| "epoch": 4.491706969133973, |
| "eval_loss": 0.23534570634365082, |
| "eval_runtime": 58.9211, |
| "eval_samples_per_second": 169.718, |
| "eval_steps_per_second": 21.215, |
| "step": 46600 |
| }, |
| { |
| "epoch": 4.501345825290913, |
| "grad_norm": 1.2172667980194092, |
| "learning_rate": 1.334448160535117e-06, |
| "loss": 0.2627, |
| "step": 46700 |
| }, |
| { |
| "epoch": 4.501345825290913, |
| "eval_loss": 0.2336340993642807, |
| "eval_runtime": 58.9514, |
| "eval_samples_per_second": 169.631, |
| "eval_steps_per_second": 21.204, |
| "step": 46700 |
| }, |
| { |
| "epoch": 4.5109846814478525, |
| "grad_norm": 1.0742268562316895, |
| "learning_rate": 1.3244147157190635e-06, |
| "loss": 0.2567, |
| "step": 46800 |
| }, |
| { |
| "epoch": 4.5109846814478525, |
| "eval_loss": 0.2309028059244156, |
| "eval_runtime": 59.1411, |
| "eval_samples_per_second": 169.087, |
| "eval_steps_per_second": 21.136, |
| "step": 46800 |
| }, |
| { |
| "epoch": 4.520623537604792, |
| "grad_norm": 1.1084363460540771, |
| "learning_rate": 1.31438127090301e-06, |
| "loss": 0.2605, |
| "step": 46900 |
| }, |
| { |
| "epoch": 4.520623537604792, |
| "eval_loss": 0.23382265865802765, |
| "eval_runtime": 58.7079, |
| "eval_samples_per_second": 170.335, |
| "eval_steps_per_second": 21.292, |
| "step": 46900 |
| }, |
| { |
| "epoch": 4.530262393761732, |
| "grad_norm": 1.171460747718811, |
| "learning_rate": 1.3043478260869566e-06, |
| "loss": 0.2591, |
| "step": 47000 |
| }, |
| { |
| "epoch": 4.530262393761732, |
| "eval_loss": 0.23248232901096344, |
| "eval_runtime": 58.9691, |
| "eval_samples_per_second": 169.58, |
| "eval_steps_per_second": 21.198, |
| "step": 47000 |
| }, |
| { |
| "epoch": 4.539901249918672, |
| "grad_norm": 1.2160496711730957, |
| "learning_rate": 1.294314381270903e-06, |
| "loss": 0.2584, |
| "step": 47100 |
| }, |
| { |
| "epoch": 4.539901249918672, |
| "eval_loss": 0.23489028215408325, |
| "eval_runtime": 59.1758, |
| "eval_samples_per_second": 168.988, |
| "eval_steps_per_second": 21.123, |
| "step": 47100 |
| }, |
| { |
| "epoch": 4.549540106075612, |
| "grad_norm": 1.0126835107803345, |
| "learning_rate": 1.2842809364548495e-06, |
| "loss": 0.2608, |
| "step": 47200 |
| }, |
| { |
| "epoch": 4.549540106075612, |
| "eval_loss": 0.22762079536914825, |
| "eval_runtime": 58.9452, |
| "eval_samples_per_second": 169.649, |
| "eval_steps_per_second": 21.206, |
| "step": 47200 |
| }, |
| { |
| "epoch": 4.559178962232552, |
| "grad_norm": 1.160570502281189, |
| "learning_rate": 1.274247491638796e-06, |
| "loss": 0.2615, |
| "step": 47300 |
| }, |
| { |
| "epoch": 4.559178962232552, |
| "eval_loss": 0.23083405196666718, |
| "eval_runtime": 58.9807, |
| "eval_samples_per_second": 169.547, |
| "eval_steps_per_second": 21.193, |
| "step": 47300 |
| }, |
| { |
| "epoch": 4.568817818389491, |
| "grad_norm": 1.1063063144683838, |
| "learning_rate": 1.2642140468227424e-06, |
| "loss": 0.2592, |
| "step": 47400 |
| }, |
| { |
| "epoch": 4.568817818389491, |
| "eval_loss": 0.2323395013809204, |
| "eval_runtime": 59.1709, |
| "eval_samples_per_second": 169.002, |
| "eval_steps_per_second": 21.125, |
| "step": 47400 |
| }, |
| { |
| "epoch": 4.578456674546431, |
| "grad_norm": 1.0993156433105469, |
| "learning_rate": 1.254180602006689e-06, |
| "loss": 0.2604, |
| "step": 47500 |
| }, |
| { |
| "epoch": 4.578456674546431, |
| "eval_loss": 0.2317042052745819, |
| "eval_runtime": 59.0198, |
| "eval_samples_per_second": 169.435, |
| "eval_steps_per_second": 21.179, |
| "step": 47500 |
| }, |
| { |
| "epoch": 4.588095530703371, |
| "grad_norm": 1.2433979511260986, |
| "learning_rate": 1.2441471571906355e-06, |
| "loss": 0.2595, |
| "step": 47600 |
| }, |
| { |
| "epoch": 4.588095530703371, |
| "eval_loss": 0.23653364181518555, |
| "eval_runtime": 58.9575, |
| "eval_samples_per_second": 169.614, |
| "eval_steps_per_second": 21.202, |
| "step": 47600 |
| }, |
| { |
| "epoch": 4.597734386860312, |
| "grad_norm": 1.1454936265945435, |
| "learning_rate": 1.234113712374582e-06, |
| "loss": 0.2623, |
| "step": 47700 |
| }, |
| { |
| "epoch": 4.597734386860312, |
| "eval_loss": 0.23131422698497772, |
| "eval_runtime": 59.0739, |
| "eval_samples_per_second": 169.279, |
| "eval_steps_per_second": 21.16, |
| "step": 47700 |
| }, |
| { |
| "epoch": 4.607373243017252, |
| "grad_norm": 1.0795280933380127, |
| "learning_rate": 1.2240802675585284e-06, |
| "loss": 0.2594, |
| "step": 47800 |
| }, |
| { |
| "epoch": 4.607373243017252, |
| "eval_loss": 0.22916051745414734, |
| "eval_runtime": 58.9155, |
| "eval_samples_per_second": 169.735, |
| "eval_steps_per_second": 21.217, |
| "step": 47800 |
| }, |
| { |
| "epoch": 4.617012099174191, |
| "grad_norm": 1.2649006843566895, |
| "learning_rate": 1.214046822742475e-06, |
| "loss": 0.259, |
| "step": 47900 |
| }, |
| { |
| "epoch": 4.617012099174191, |
| "eval_loss": 0.2325269728899002, |
| "eval_runtime": 58.9759, |
| "eval_samples_per_second": 169.561, |
| "eval_steps_per_second": 21.195, |
| "step": 47900 |
| }, |
| { |
| "epoch": 4.626650955331131, |
| "grad_norm": 1.181906819343567, |
| "learning_rate": 1.2040133779264215e-06, |
| "loss": 0.2588, |
| "step": 48000 |
| }, |
| { |
| "epoch": 4.626650955331131, |
| "eval_loss": 0.23951946198940277, |
| "eval_runtime": 59.1457, |
| "eval_samples_per_second": 169.074, |
| "eval_steps_per_second": 21.134, |
| "step": 48000 |
| }, |
| { |
| "epoch": 4.636289811488071, |
| "grad_norm": 1.1375948190689087, |
| "learning_rate": 1.193979933110368e-06, |
| "loss": 0.2596, |
| "step": 48100 |
| }, |
| { |
| "epoch": 4.636289811488071, |
| "eval_loss": 0.23047949373722076, |
| "eval_runtime": 58.9661, |
| "eval_samples_per_second": 169.589, |
| "eval_steps_per_second": 21.199, |
| "step": 48100 |
| }, |
| { |
| "epoch": 4.645928667645011, |
| "grad_norm": 1.180855393409729, |
| "learning_rate": 1.1839464882943144e-06, |
| "loss": 0.2604, |
| "step": 48200 |
| }, |
| { |
| "epoch": 4.645928667645011, |
| "eval_loss": 0.23023301362991333, |
| "eval_runtime": 58.9745, |
| "eval_samples_per_second": 169.565, |
| "eval_steps_per_second": 21.196, |
| "step": 48200 |
| }, |
| { |
| "epoch": 4.655567523801951, |
| "grad_norm": 1.1689326763153076, |
| "learning_rate": 1.173913043478261e-06, |
| "loss": 0.2577, |
| "step": 48300 |
| }, |
| { |
| "epoch": 4.655567523801951, |
| "eval_loss": 0.23206885159015656, |
| "eval_runtime": 58.8991, |
| "eval_samples_per_second": 169.782, |
| "eval_steps_per_second": 21.223, |
| "step": 48300 |
| }, |
| { |
| "epoch": 4.6652063799588905, |
| "grad_norm": 1.190232276916504, |
| "learning_rate": 1.1638795986622075e-06, |
| "loss": 0.2577, |
| "step": 48400 |
| }, |
| { |
| "epoch": 4.6652063799588905, |
| "eval_loss": 0.23331387341022491, |
| "eval_runtime": 59.0925, |
| "eval_samples_per_second": 169.226, |
| "eval_steps_per_second": 21.153, |
| "step": 48400 |
| }, |
| { |
| "epoch": 4.67484523611583, |
| "grad_norm": 1.231204628944397, |
| "learning_rate": 1.153846153846154e-06, |
| "loss": 0.2585, |
| "step": 48500 |
| }, |
| { |
| "epoch": 4.67484523611583, |
| "eval_loss": 0.22776982188224792, |
| "eval_runtime": 58.9385, |
| "eval_samples_per_second": 169.668, |
| "eval_steps_per_second": 21.209, |
| "step": 48500 |
| }, |
| { |
| "epoch": 4.68448409227277, |
| "grad_norm": 1.1207915544509888, |
| "learning_rate": 1.1438127090301004e-06, |
| "loss": 0.2564, |
| "step": 48600 |
| }, |
| { |
| "epoch": 4.68448409227277, |
| "eval_loss": 0.23199844360351562, |
| "eval_runtime": 58.9559, |
| "eval_samples_per_second": 169.618, |
| "eval_steps_per_second": 21.202, |
| "step": 48600 |
| }, |
| { |
| "epoch": 4.69412294842971, |
| "grad_norm": 1.1697341203689575, |
| "learning_rate": 1.133779264214047e-06, |
| "loss": 0.2588, |
| "step": 48700 |
| }, |
| { |
| "epoch": 4.69412294842971, |
| "eval_loss": 0.2301999032497406, |
| "eval_runtime": 59.1168, |
| "eval_samples_per_second": 169.157, |
| "eval_steps_per_second": 21.145, |
| "step": 48700 |
| }, |
| { |
| "epoch": 4.70376180458665, |
| "grad_norm": 1.206182837486267, |
| "learning_rate": 1.1237458193979933e-06, |
| "loss": 0.257, |
| "step": 48800 |
| }, |
| { |
| "epoch": 4.70376180458665, |
| "eval_loss": 0.2264467477798462, |
| "eval_runtime": 58.8913, |
| "eval_samples_per_second": 169.804, |
| "eval_steps_per_second": 21.226, |
| "step": 48800 |
| }, |
| { |
| "epoch": 4.71340066074359, |
| "grad_norm": 1.2088978290557861, |
| "learning_rate": 1.1137123745819398e-06, |
| "loss": 0.2567, |
| "step": 48900 |
| }, |
| { |
| "epoch": 4.71340066074359, |
| "eval_loss": 0.22952109575271606, |
| "eval_runtime": 58.8906, |
| "eval_samples_per_second": 169.806, |
| "eval_steps_per_second": 21.226, |
| "step": 48900 |
| }, |
| { |
| "epoch": 4.7230395169005295, |
| "grad_norm": 1.1703099012374878, |
| "learning_rate": 1.1036789297658862e-06, |
| "loss": 0.2589, |
| "step": 49000 |
| }, |
| { |
| "epoch": 4.7230395169005295, |
| "eval_loss": 0.23127800226211548, |
| "eval_runtime": 59.0382, |
| "eval_samples_per_second": 169.382, |
| "eval_steps_per_second": 21.173, |
| "step": 49000 |
| }, |
| { |
| "epoch": 4.732678373057469, |
| "grad_norm": 1.1088894605636597, |
| "learning_rate": 1.0936454849498327e-06, |
| "loss": 0.2576, |
| "step": 49100 |
| }, |
| { |
| "epoch": 4.732678373057469, |
| "eval_loss": 0.22928257286548615, |
| "eval_runtime": 58.9548, |
| "eval_samples_per_second": 169.622, |
| "eval_steps_per_second": 21.203, |
| "step": 49100 |
| }, |
| { |
| "epoch": 4.742317229214409, |
| "grad_norm": 1.1792631149291992, |
| "learning_rate": 1.0836120401337793e-06, |
| "loss": 0.256, |
| "step": 49200 |
| }, |
| { |
| "epoch": 4.742317229214409, |
| "eval_loss": 0.2260976880788803, |
| "eval_runtime": 58.9553, |
| "eval_samples_per_second": 169.62, |
| "eval_steps_per_second": 21.202, |
| "step": 49200 |
| }, |
| { |
| "epoch": 4.751956085371349, |
| "grad_norm": 1.115937352180481, |
| "learning_rate": 1.0735785953177258e-06, |
| "loss": 0.2584, |
| "step": 49300 |
| }, |
| { |
| "epoch": 4.751956085371349, |
| "eval_loss": 0.2344072461128235, |
| "eval_runtime": 59.1056, |
| "eval_samples_per_second": 169.189, |
| "eval_steps_per_second": 21.149, |
| "step": 49300 |
| }, |
| { |
| "epoch": 4.761594941528289, |
| "grad_norm": 1.2252027988433838, |
| "learning_rate": 1.0635451505016722e-06, |
| "loss": 0.2577, |
| "step": 49400 |
| }, |
| { |
| "epoch": 4.761594941528289, |
| "eval_loss": 0.2281453162431717, |
| "eval_runtime": 58.9641, |
| "eval_samples_per_second": 169.595, |
| "eval_steps_per_second": 21.199, |
| "step": 49400 |
| }, |
| { |
| "epoch": 4.771233797685229, |
| "grad_norm": 1.164642572402954, |
| "learning_rate": 1.0535117056856187e-06, |
| "loss": 0.2566, |
| "step": 49500 |
| }, |
| { |
| "epoch": 4.771233797685229, |
| "eval_loss": 0.23026709258556366, |
| "eval_runtime": 59.1894, |
| "eval_samples_per_second": 168.949, |
| "eval_steps_per_second": 21.119, |
| "step": 49500 |
| }, |
| { |
| "epoch": 4.780872653842168, |
| "grad_norm": 1.0098907947540283, |
| "learning_rate": 1.0434782608695653e-06, |
| "loss": 0.2589, |
| "step": 49600 |
| }, |
| { |
| "epoch": 4.780872653842168, |
| "eval_loss": 0.23416157066822052, |
| "eval_runtime": 59.0058, |
| "eval_samples_per_second": 169.475, |
| "eval_steps_per_second": 21.184, |
| "step": 49600 |
| }, |
| { |
| "epoch": 4.790511509999108, |
| "grad_norm": 1.1912959814071655, |
| "learning_rate": 1.0334448160535118e-06, |
| "loss": 0.2579, |
| "step": 49700 |
| }, |
| { |
| "epoch": 4.790511509999108, |
| "eval_loss": 0.2309984415769577, |
| "eval_runtime": 58.9325, |
| "eval_samples_per_second": 169.686, |
| "eval_steps_per_second": 21.211, |
| "step": 49700 |
| }, |
| { |
| "epoch": 4.800150366156048, |
| "grad_norm": 1.2718689441680908, |
| "learning_rate": 1.0234113712374581e-06, |
| "loss": 0.2575, |
| "step": 49800 |
| }, |
| { |
| "epoch": 4.800150366156048, |
| "eval_loss": 0.2315024882555008, |
| "eval_runtime": 58.6814, |
| "eval_samples_per_second": 170.412, |
| "eval_steps_per_second": 21.301, |
| "step": 49800 |
| }, |
| { |
| "epoch": 4.809789222312988, |
| "grad_norm": 1.0510720014572144, |
| "learning_rate": 1.0133779264214047e-06, |
| "loss": 0.2567, |
| "step": 49900 |
| }, |
| { |
| "epoch": 4.809789222312988, |
| "eval_loss": 0.2308981865644455, |
| "eval_runtime": 58.7968, |
| "eval_samples_per_second": 170.077, |
| "eval_steps_per_second": 21.26, |
| "step": 49900 |
| }, |
| { |
| "epoch": 4.819428078469928, |
| "grad_norm": 1.1117866039276123, |
| "learning_rate": 1.0033444816053512e-06, |
| "loss": 0.2577, |
| "step": 50000 |
| }, |
| { |
| "epoch": 4.819428078469928, |
| "eval_loss": 0.2323577105998993, |
| "eval_runtime": 58.6609, |
| "eval_samples_per_second": 170.471, |
| "eval_steps_per_second": 21.309, |
| "step": 50000 |
| }, |
| { |
| "epoch": 4.8290669346268675, |
| "grad_norm": 1.1479214429855347, |
| "learning_rate": 9.933110367892976e-07, |
| "loss": 0.2569, |
| "step": 50100 |
| }, |
| { |
| "epoch": 4.8290669346268675, |
| "eval_loss": 0.2267085760831833, |
| "eval_runtime": 58.6686, |
| "eval_samples_per_second": 170.449, |
| "eval_steps_per_second": 21.306, |
| "step": 50100 |
| }, |
| { |
| "epoch": 4.838705790783807, |
| "grad_norm": 1.0486712455749512, |
| "learning_rate": 9.832775919732441e-07, |
| "loss": 0.2565, |
| "step": 50200 |
| }, |
| { |
| "epoch": 4.838705790783807, |
| "eval_loss": 0.233329638838768, |
| "eval_runtime": 58.7371, |
| "eval_samples_per_second": 170.25, |
| "eval_steps_per_second": 21.281, |
| "step": 50200 |
| }, |
| { |
| "epoch": 4.848344646940747, |
| "grad_norm": 1.209004282951355, |
| "learning_rate": 9.732441471571907e-07, |
| "loss": 0.2551, |
| "step": 50300 |
| }, |
| { |
| "epoch": 4.848344646940747, |
| "eval_loss": 0.22935496270656586, |
| "eval_runtime": 58.9021, |
| "eval_samples_per_second": 169.773, |
| "eval_steps_per_second": 21.222, |
| "step": 50300 |
| }, |
| { |
| "epoch": 4.857983503097687, |
| "grad_norm": 1.1527981758117676, |
| "learning_rate": 9.632107023411372e-07, |
| "loss": 0.2578, |
| "step": 50400 |
| }, |
| { |
| "epoch": 4.857983503097687, |
| "eval_loss": 0.2258923351764679, |
| "eval_runtime": 58.7394, |
| "eval_samples_per_second": 170.244, |
| "eval_steps_per_second": 21.28, |
| "step": 50400 |
| }, |
| { |
| "epoch": 4.867622359254627, |
| "grad_norm": 1.131640911102295, |
| "learning_rate": 9.531772575250837e-07, |
| "loss": 0.2569, |
| "step": 50500 |
| }, |
| { |
| "epoch": 4.867622359254627, |
| "eval_loss": 0.22396264970302582, |
| "eval_runtime": 58.5609, |
| "eval_samples_per_second": 170.762, |
| "eval_steps_per_second": 21.345, |
| "step": 50500 |
| }, |
| { |
| "epoch": 4.877261215411567, |
| "grad_norm": 1.0976239442825317, |
| "learning_rate": 9.431438127090301e-07, |
| "loss": 0.2551, |
| "step": 50600 |
| }, |
| { |
| "epoch": 4.877261215411567, |
| "eval_loss": 0.22849638760089874, |
| "eval_runtime": 59.0715, |
| "eval_samples_per_second": 169.286, |
| "eval_steps_per_second": 21.161, |
| "step": 50600 |
| }, |
| { |
| "epoch": 4.8869000715685065, |
| "grad_norm": 1.0808651447296143, |
| "learning_rate": 9.331103678929767e-07, |
| "loss": 0.2557, |
| "step": 50700 |
| }, |
| { |
| "epoch": 4.8869000715685065, |
| "eval_loss": 0.2275507152080536, |
| "eval_runtime": 59.1396, |
| "eval_samples_per_second": 169.092, |
| "eval_steps_per_second": 21.136, |
| "step": 50700 |
| }, |
| { |
| "epoch": 4.896538927725447, |
| "grad_norm": 1.1022206544876099, |
| "learning_rate": 9.230769230769231e-07, |
| "loss": 0.2546, |
| "step": 50800 |
| }, |
| { |
| "epoch": 4.896538927725447, |
| "eval_loss": 0.23217861354351044, |
| "eval_runtime": 58.9856, |
| "eval_samples_per_second": 169.533, |
| "eval_steps_per_second": 21.192, |
| "step": 50800 |
| }, |
| { |
| "epoch": 4.906177783882387, |
| "grad_norm": 1.1429067850112915, |
| "learning_rate": 9.130434782608697e-07, |
| "loss": 0.2556, |
| "step": 50900 |
| }, |
| { |
| "epoch": 4.906177783882387, |
| "eval_loss": 0.23015516996383667, |
| "eval_runtime": 58.9649, |
| "eval_samples_per_second": 169.593, |
| "eval_steps_per_second": 21.199, |
| "step": 50900 |
| }, |
| { |
| "epoch": 4.915816640039327, |
| "grad_norm": 1.1336737871170044, |
| "learning_rate": 9.030100334448161e-07, |
| "loss": 0.2557, |
| "step": 51000 |
| }, |
| { |
| "epoch": 4.915816640039327, |
| "eval_loss": 0.22632111608982086, |
| "eval_runtime": 59.164, |
| "eval_samples_per_second": 169.022, |
| "eval_steps_per_second": 21.128, |
| "step": 51000 |
| }, |
| { |
| "epoch": 4.925455496196267, |
| "grad_norm": 1.0541027784347534, |
| "learning_rate": 8.929765886287627e-07, |
| "loss": 0.2534, |
| "step": 51100 |
| }, |
| { |
| "epoch": 4.925455496196267, |
| "eval_loss": 0.2284763604402542, |
| "eval_runtime": 58.9383, |
| "eval_samples_per_second": 169.669, |
| "eval_steps_per_second": 21.209, |
| "step": 51100 |
| }, |
| { |
| "epoch": 4.9350943523532065, |
| "grad_norm": 1.2004669904708862, |
| "learning_rate": 8.829431438127091e-07, |
| "loss": 0.253, |
| "step": 51200 |
| }, |
| { |
| "epoch": 4.9350943523532065, |
| "eval_loss": 0.22993648052215576, |
| "eval_runtime": 58.9372, |
| "eval_samples_per_second": 169.672, |
| "eval_steps_per_second": 21.209, |
| "step": 51200 |
| }, |
| { |
| "epoch": 4.944733208510146, |
| "grad_norm": 1.3005701303482056, |
| "learning_rate": 8.729096989966555e-07, |
| "loss": 0.2551, |
| "step": 51300 |
| }, |
| { |
| "epoch": 4.944733208510146, |
| "eval_loss": 0.22793498635292053, |
| "eval_runtime": 58.9009, |
| "eval_samples_per_second": 169.777, |
| "eval_steps_per_second": 21.222, |
| "step": 51300 |
| }, |
| { |
| "epoch": 4.954372064667086, |
| "grad_norm": 1.110014796257019, |
| "learning_rate": 8.628762541806019e-07, |
| "loss": 0.255, |
| "step": 51400 |
| }, |
| { |
| "epoch": 4.954372064667086, |
| "eval_loss": 0.23050156235694885, |
| "eval_runtime": 59.1011, |
| "eval_samples_per_second": 169.202, |
| "eval_steps_per_second": 21.15, |
| "step": 51400 |
| }, |
| { |
| "epoch": 4.964010920824026, |
| "grad_norm": 1.448729395866394, |
| "learning_rate": 8.528428093645485e-07, |
| "loss": 0.2523, |
| "step": 51500 |
| }, |
| { |
| "epoch": 4.964010920824026, |
| "eval_loss": 0.22872421145439148, |
| "eval_runtime": 58.8714, |
| "eval_samples_per_second": 169.862, |
| "eval_steps_per_second": 21.233, |
| "step": 51500 |
| }, |
| { |
| "epoch": 4.973649776980966, |
| "grad_norm": 1.3377678394317627, |
| "learning_rate": 8.428093645484949e-07, |
| "loss": 0.2566, |
| "step": 51600 |
| }, |
| { |
| "epoch": 4.973649776980966, |
| "eval_loss": 0.22439132630825043, |
| "eval_runtime": 58.9474, |
| "eval_samples_per_second": 169.643, |
| "eval_steps_per_second": 21.205, |
| "step": 51600 |
| }, |
| { |
| "epoch": 4.983288633137906, |
| "grad_norm": 1.082703709602356, |
| "learning_rate": 8.327759197324414e-07, |
| "loss": 0.2551, |
| "step": 51700 |
| }, |
| { |
| "epoch": 4.983288633137906, |
| "eval_loss": 0.2283206284046173, |
| "eval_runtime": 59.1758, |
| "eval_samples_per_second": 168.988, |
| "eval_steps_per_second": 21.124, |
| "step": 51700 |
| }, |
| { |
| "epoch": 4.992927489294845, |
| "grad_norm": 1.1986274719238281, |
| "learning_rate": 8.227424749163879e-07, |
| "loss": 0.2545, |
| "step": 51800 |
| }, |
| { |
| "epoch": 4.992927489294845, |
| "eval_loss": 0.22571945190429688, |
| "eval_runtime": 59.0014, |
| "eval_samples_per_second": 169.488, |
| "eval_steps_per_second": 21.186, |
| "step": 51800 |
| }, |
| { |
| "epoch": 5.002566345451785, |
| "grad_norm": 1.231094479560852, |
| "learning_rate": 8.127090301003344e-07, |
| "loss": 0.2536, |
| "step": 51900 |
| }, |
| { |
| "epoch": 5.002566345451785, |
| "eval_loss": 0.23076747357845306, |
| "eval_runtime": 58.9604, |
| "eval_samples_per_second": 169.605, |
| "eval_steps_per_second": 21.201, |
| "step": 51900 |
| }, |
| { |
| "epoch": 5.012205201608725, |
| "grad_norm": 1.411320686340332, |
| "learning_rate": 8.026755852842809e-07, |
| "loss": 0.2537, |
| "step": 52000 |
| }, |
| { |
| "epoch": 5.012205201608725, |
| "eval_loss": 0.23237231373786926, |
| "eval_runtime": 59.1237, |
| "eval_samples_per_second": 169.137, |
| "eval_steps_per_second": 21.142, |
| "step": 52000 |
| }, |
| { |
| "epoch": 5.021844057765665, |
| "grad_norm": 1.2033196687698364, |
| "learning_rate": 7.926421404682274e-07, |
| "loss": 0.257, |
| "step": 52100 |
| }, |
| { |
| "epoch": 5.021844057765665, |
| "eval_loss": 0.22967660427093506, |
| "eval_runtime": 59.0144, |
| "eval_samples_per_second": 169.45, |
| "eval_steps_per_second": 21.181, |
| "step": 52100 |
| }, |
| { |
| "epoch": 5.031482913922605, |
| "grad_norm": 1.2692068815231323, |
| "learning_rate": 7.826086956521739e-07, |
| "loss": 0.2555, |
| "step": 52200 |
| }, |
| { |
| "epoch": 5.031482913922605, |
| "eval_loss": 0.22523514926433563, |
| "eval_runtime": 58.9471, |
| "eval_samples_per_second": 169.644, |
| "eval_steps_per_second": 21.205, |
| "step": 52200 |
| }, |
| { |
| "epoch": 5.0411217700795445, |
| "grad_norm": 1.094571590423584, |
| "learning_rate": 7.725752508361204e-07, |
| "loss": 0.2515, |
| "step": 52300 |
| }, |
| { |
| "epoch": 5.0411217700795445, |
| "eval_loss": 0.22601811587810516, |
| "eval_runtime": 59.0812, |
| "eval_samples_per_second": 169.259, |
| "eval_steps_per_second": 21.157, |
| "step": 52300 |
| }, |
| { |
| "epoch": 5.050760626236484, |
| "grad_norm": 1.1147675514221191, |
| "learning_rate": 7.625418060200669e-07, |
| "loss": 0.2521, |
| "step": 52400 |
| }, |
| { |
| "epoch": 5.050760626236484, |
| "eval_loss": 0.22480392456054688, |
| "eval_runtime": 58.9665, |
| "eval_samples_per_second": 169.588, |
| "eval_steps_per_second": 21.198, |
| "step": 52400 |
| }, |
| { |
| "epoch": 5.060399482393424, |
| "grad_norm": 1.1311498880386353, |
| "learning_rate": 7.525083612040134e-07, |
| "loss": 0.2553, |
| "step": 52500 |
| }, |
| { |
| "epoch": 5.060399482393424, |
| "eval_loss": 0.22257648408412933, |
| "eval_runtime": 58.9526, |
| "eval_samples_per_second": 169.628, |
| "eval_steps_per_second": 21.203, |
| "step": 52500 |
| }, |
| { |
| "epoch": 5.070038338550364, |
| "grad_norm": 1.062983512878418, |
| "learning_rate": 7.424749163879599e-07, |
| "loss": 0.2523, |
| "step": 52600 |
| }, |
| { |
| "epoch": 5.070038338550364, |
| "eval_loss": 0.2277909219264984, |
| "eval_runtime": 59.1272, |
| "eval_samples_per_second": 169.127, |
| "eval_steps_per_second": 21.141, |
| "step": 52600 |
| }, |
| { |
| "epoch": 5.079677194707304, |
| "grad_norm": 1.1180320978164673, |
| "learning_rate": 7.324414715719064e-07, |
| "loss": 0.2527, |
| "step": 52700 |
| }, |
| { |
| "epoch": 5.079677194707304, |
| "eval_loss": 0.2237856686115265, |
| "eval_runtime": 59.051, |
| "eval_samples_per_second": 169.345, |
| "eval_steps_per_second": 21.168, |
| "step": 52700 |
| }, |
| { |
| "epoch": 5.089316050864244, |
| "grad_norm": 1.1766620874404907, |
| "learning_rate": 7.224080267558529e-07, |
| "loss": 0.2548, |
| "step": 52800 |
| }, |
| { |
| "epoch": 5.089316050864244, |
| "eval_loss": 0.23022255301475525, |
| "eval_runtime": 58.7083, |
| "eval_samples_per_second": 170.334, |
| "eval_steps_per_second": 21.292, |
| "step": 52800 |
| }, |
| { |
| "epoch": 5.0989549070211835, |
| "grad_norm": 1.159372091293335, |
| "learning_rate": 7.123745819397994e-07, |
| "loss": 0.2544, |
| "step": 52900 |
| }, |
| { |
| "epoch": 5.0989549070211835, |
| "eval_loss": 0.227079376578331, |
| "eval_runtime": 58.9425, |
| "eval_samples_per_second": 169.657, |
| "eval_steps_per_second": 21.207, |
| "step": 52900 |
| }, |
| { |
| "epoch": 5.108593763178123, |
| "grad_norm": 1.1550517082214355, |
| "learning_rate": 7.023411371237459e-07, |
| "loss": 0.253, |
| "step": 53000 |
| }, |
| { |
| "epoch": 5.108593763178123, |
| "eval_loss": 0.22270356118679047, |
| "eval_runtime": 59.1037, |
| "eval_samples_per_second": 169.194, |
| "eval_steps_per_second": 21.149, |
| "step": 53000 |
| }, |
| { |
| "epoch": 5.118232619335063, |
| "grad_norm": 1.164044976234436, |
| "learning_rate": 6.923076923076923e-07, |
| "loss": 0.2525, |
| "step": 53100 |
| }, |
| { |
| "epoch": 5.118232619335063, |
| "eval_loss": 0.22594866156578064, |
| "eval_runtime": 58.9608, |
| "eval_samples_per_second": 169.604, |
| "eval_steps_per_second": 21.201, |
| "step": 53100 |
| }, |
| { |
| "epoch": 5.127871475492003, |
| "grad_norm": 1.2590959072113037, |
| "learning_rate": 6.822742474916388e-07, |
| "loss": 0.2546, |
| "step": 53200 |
| }, |
| { |
| "epoch": 5.127871475492003, |
| "eval_loss": 0.2225504219532013, |
| "eval_runtime": 58.9634, |
| "eval_samples_per_second": 169.597, |
| "eval_steps_per_second": 21.2, |
| "step": 53200 |
| }, |
| { |
| "epoch": 5.137510331648944, |
| "grad_norm": 1.1613073348999023, |
| "learning_rate": 6.722408026755853e-07, |
| "loss": 0.2541, |
| "step": 53300 |
| }, |
| { |
| "epoch": 5.137510331648944, |
| "eval_loss": 0.2252965271472931, |
| "eval_runtime": 59.1154, |
| "eval_samples_per_second": 169.161, |
| "eval_steps_per_second": 21.145, |
| "step": 53300 |
| }, |
| { |
| "epoch": 5.1471491878058835, |
| "grad_norm": 1.0958082675933838, |
| "learning_rate": 6.622073578595318e-07, |
| "loss": 0.2535, |
| "step": 53400 |
| }, |
| { |
| "epoch": 5.1471491878058835, |
| "eval_loss": 0.2281467318534851, |
| "eval_runtime": 58.9189, |
| "eval_samples_per_second": 169.725, |
| "eval_steps_per_second": 21.216, |
| "step": 53400 |
| }, |
| { |
| "epoch": 5.156788043962823, |
| "grad_norm": 1.1621479988098145, |
| "learning_rate": 6.521739130434783e-07, |
| "loss": 0.2545, |
| "step": 53500 |
| }, |
| { |
| "epoch": 5.156788043962823, |
| "eval_loss": 0.22485679388046265, |
| "eval_runtime": 59.1361, |
| "eval_samples_per_second": 169.101, |
| "eval_steps_per_second": 21.138, |
| "step": 53500 |
| }, |
| { |
| "epoch": 5.166426900119763, |
| "grad_norm": 1.081497311592102, |
| "learning_rate": 6.421404682274248e-07, |
| "loss": 0.256, |
| "step": 53600 |
| }, |
| { |
| "epoch": 5.166426900119763, |
| "eval_loss": 0.22066909074783325, |
| "eval_runtime": 58.9601, |
| "eval_samples_per_second": 169.606, |
| "eval_steps_per_second": 21.201, |
| "step": 53600 |
| }, |
| { |
| "epoch": 5.176065756276703, |
| "grad_norm": 1.11661696434021, |
| "learning_rate": 6.321070234113712e-07, |
| "loss": 0.2537, |
| "step": 53700 |
| }, |
| { |
| "epoch": 5.176065756276703, |
| "eval_loss": 0.22867466509342194, |
| "eval_runtime": 58.9203, |
| "eval_samples_per_second": 169.721, |
| "eval_steps_per_second": 21.215, |
| "step": 53700 |
| }, |
| { |
| "epoch": 5.185704612433643, |
| "grad_norm": 1.3185731172561646, |
| "learning_rate": 6.220735785953178e-07, |
| "loss": 0.2535, |
| "step": 53800 |
| }, |
| { |
| "epoch": 5.185704612433643, |
| "eval_loss": 0.22424526512622833, |
| "eval_runtime": 59.0971, |
| "eval_samples_per_second": 169.213, |
| "eval_steps_per_second": 21.152, |
| "step": 53800 |
| }, |
| { |
| "epoch": 5.195343468590583, |
| "grad_norm": 1.1543822288513184, |
| "learning_rate": 6.120401337792642e-07, |
| "loss": 0.2525, |
| "step": 53900 |
| }, |
| { |
| "epoch": 5.195343468590583, |
| "eval_loss": 0.22214870154857635, |
| "eval_runtime": 58.93, |
| "eval_samples_per_second": 169.693, |
| "eval_steps_per_second": 21.212, |
| "step": 53900 |
| }, |
| { |
| "epoch": 5.204982324747522, |
| "grad_norm": 1.2196881771087646, |
| "learning_rate": 6.020066889632107e-07, |
| "loss": 0.2519, |
| "step": 54000 |
| }, |
| { |
| "epoch": 5.204982324747522, |
| "eval_loss": 0.22731679677963257, |
| "eval_runtime": 58.9214, |
| "eval_samples_per_second": 169.718, |
| "eval_steps_per_second": 21.215, |
| "step": 54000 |
| }, |
| { |
| "epoch": 5.214621180904462, |
| "grad_norm": 1.3042584657669067, |
| "learning_rate": 5.919732441471572e-07, |
| "loss": 0.2499, |
| "step": 54100 |
| }, |
| { |
| "epoch": 5.214621180904462, |
| "eval_loss": 0.22149991989135742, |
| "eval_runtime": 59.1436, |
| "eval_samples_per_second": 169.08, |
| "eval_steps_per_second": 21.135, |
| "step": 54100 |
| }, |
| { |
| "epoch": 5.224260037061402, |
| "grad_norm": 1.1391198635101318, |
| "learning_rate": 5.819397993311037e-07, |
| "loss": 0.2541, |
| "step": 54200 |
| }, |
| { |
| "epoch": 5.224260037061402, |
| "eval_loss": 0.22547002136707306, |
| "eval_runtime": 58.734, |
| "eval_samples_per_second": 170.259, |
| "eval_steps_per_second": 21.282, |
| "step": 54200 |
| }, |
| { |
| "epoch": 5.233898893218342, |
| "grad_norm": 1.2511396408081055, |
| "learning_rate": 5.719063545150502e-07, |
| "loss": 0.2534, |
| "step": 54300 |
| }, |
| { |
| "epoch": 5.233898893218342, |
| "eval_loss": 0.22946836054325104, |
| "eval_runtime": 58.9665, |
| "eval_samples_per_second": 169.588, |
| "eval_steps_per_second": 21.198, |
| "step": 54300 |
| }, |
| { |
| "epoch": 5.243537749375282, |
| "grad_norm": 1.0912216901779175, |
| "learning_rate": 5.618729096989966e-07, |
| "loss": 0.2506, |
| "step": 54400 |
| }, |
| { |
| "epoch": 5.243537749375282, |
| "eval_loss": 0.22805634140968323, |
| "eval_runtime": 59.1357, |
| "eval_samples_per_second": 169.103, |
| "eval_steps_per_second": 21.138, |
| "step": 54400 |
| }, |
| { |
| "epoch": 5.2531766055322215, |
| "grad_norm": 1.208542823791504, |
| "learning_rate": 5.518394648829431e-07, |
| "loss": 0.2526, |
| "step": 54500 |
| }, |
| { |
| "epoch": 5.2531766055322215, |
| "eval_loss": 0.21685273945331573, |
| "eval_runtime": 58.9643, |
| "eval_samples_per_second": 169.594, |
| "eval_steps_per_second": 21.199, |
| "step": 54500 |
| }, |
| { |
| "epoch": 5.262815461689161, |
| "grad_norm": 1.152645230293274, |
| "learning_rate": 5.418060200668896e-07, |
| "loss": 0.2496, |
| "step": 54600 |
| }, |
| { |
| "epoch": 5.262815461689161, |
| "eval_loss": 0.2244671881198883, |
| "eval_runtime": 59.2534, |
| "eval_samples_per_second": 168.767, |
| "eval_steps_per_second": 21.096, |
| "step": 54600 |
| }, |
| { |
| "epoch": 5.272454317846101, |
| "grad_norm": 1.2127925157546997, |
| "learning_rate": 5.317725752508361e-07, |
| "loss": 0.2501, |
| "step": 54700 |
| }, |
| { |
| "epoch": 5.272454317846101, |
| "eval_loss": 0.2252870351076126, |
| "eval_runtime": 59.0351, |
| "eval_samples_per_second": 169.391, |
| "eval_steps_per_second": 21.174, |
| "step": 54700 |
| }, |
| { |
| "epoch": 5.282093174003041, |
| "grad_norm": 1.1584402322769165, |
| "learning_rate": 5.217391304347826e-07, |
| "loss": 0.2508, |
| "step": 54800 |
| }, |
| { |
| "epoch": 5.282093174003041, |
| "eval_loss": 0.21999992430210114, |
| "eval_runtime": 59.2774, |
| "eval_samples_per_second": 168.698, |
| "eval_steps_per_second": 21.087, |
| "step": 54800 |
| }, |
| { |
| "epoch": 5.291732030159981, |
| "grad_norm": 1.129681944847107, |
| "learning_rate": 5.117056856187291e-07, |
| "loss": 0.2497, |
| "step": 54900 |
| }, |
| { |
| "epoch": 5.291732030159981, |
| "eval_loss": 0.21917253732681274, |
| "eval_runtime": 59.2951, |
| "eval_samples_per_second": 168.648, |
| "eval_steps_per_second": 21.081, |
| "step": 54900 |
| }, |
| { |
| "epoch": 5.301370886316921, |
| "grad_norm": 1.2014105319976807, |
| "learning_rate": 5.016722408026756e-07, |
| "loss": 0.2512, |
| "step": 55000 |
| }, |
| { |
| "epoch": 5.301370886316921, |
| "eval_loss": 0.2266959398984909, |
| "eval_runtime": 59.486, |
| "eval_samples_per_second": 168.107, |
| "eval_steps_per_second": 21.013, |
| "step": 55000 |
| }, |
| { |
| "epoch": 5.3110097424738605, |
| "grad_norm": 1.09111750125885, |
| "learning_rate": 4.916387959866221e-07, |
| "loss": 0.2505, |
| "step": 55100 |
| }, |
| { |
| "epoch": 5.3110097424738605, |
| "eval_loss": 0.22630225121974945, |
| "eval_runtime": 59.3384, |
| "eval_samples_per_second": 168.525, |
| "eval_steps_per_second": 21.066, |
| "step": 55100 |
| }, |
| { |
| "epoch": 5.3206485986308, |
| "grad_norm": 1.3559210300445557, |
| "learning_rate": 4.816053511705686e-07, |
| "loss": 0.2528, |
| "step": 55200 |
| }, |
| { |
| "epoch": 5.3206485986308, |
| "eval_loss": 0.22501616179943085, |
| "eval_runtime": 59.4826, |
| "eval_samples_per_second": 168.117, |
| "eval_steps_per_second": 21.015, |
| "step": 55200 |
| }, |
| { |
| "epoch": 5.33028745478774, |
| "grad_norm": 1.1638729572296143, |
| "learning_rate": 4.7157190635451506e-07, |
| "loss": 0.2494, |
| "step": 55300 |
| }, |
| { |
| "epoch": 5.33028745478774, |
| "eval_loss": 0.2276040017604828, |
| "eval_runtime": 59.3183, |
| "eval_samples_per_second": 168.582, |
| "eval_steps_per_second": 21.073, |
| "step": 55300 |
| }, |
| { |
| "epoch": 5.33992631094468, |
| "grad_norm": 1.1465582847595215, |
| "learning_rate": 4.6153846153846156e-07, |
| "loss": 0.2511, |
| "step": 55400 |
| }, |
| { |
| "epoch": 5.33992631094468, |
| "eval_loss": 0.2249348908662796, |
| "eval_runtime": 59.2762, |
| "eval_samples_per_second": 168.702, |
| "eval_steps_per_second": 21.088, |
| "step": 55400 |
| }, |
| { |
| "epoch": 5.34956516710162, |
| "grad_norm": 1.1704530715942383, |
| "learning_rate": 4.5150501672240806e-07, |
| "loss": 0.251, |
| "step": 55500 |
| }, |
| { |
| "epoch": 5.34956516710162, |
| "eval_loss": 0.22359701991081238, |
| "eval_runtime": 59.4584, |
| "eval_samples_per_second": 168.185, |
| "eval_steps_per_second": 21.023, |
| "step": 55500 |
| }, |
| { |
| "epoch": 5.35920402325856, |
| "grad_norm": 1.1347342729568481, |
| "learning_rate": 4.4147157190635456e-07, |
| "loss": 0.25, |
| "step": 55600 |
| }, |
| { |
| "epoch": 5.35920402325856, |
| "eval_loss": 0.2178959995508194, |
| "eval_runtime": 59.2741, |
| "eval_samples_per_second": 168.708, |
| "eval_steps_per_second": 21.088, |
| "step": 55600 |
| }, |
| { |
| "epoch": 5.368842879415499, |
| "grad_norm": 1.2167783975601196, |
| "learning_rate": 4.3143812709030095e-07, |
| "loss": 0.2485, |
| "step": 55700 |
| }, |
| { |
| "epoch": 5.368842879415499, |
| "eval_loss": 0.22190746665000916, |
| "eval_runtime": 59.42, |
| "eval_samples_per_second": 168.294, |
| "eval_steps_per_second": 21.037, |
| "step": 55700 |
| }, |
| { |
| "epoch": 5.378481735572439, |
| "grad_norm": 1.1554456949234009, |
| "learning_rate": 4.2140468227424745e-07, |
| "loss": 0.2494, |
| "step": 55800 |
| }, |
| { |
| "epoch": 5.378481735572439, |
| "eval_loss": 0.22227539122104645, |
| "eval_runtime": 59.369, |
| "eval_samples_per_second": 168.438, |
| "eval_steps_per_second": 21.055, |
| "step": 55800 |
| }, |
| { |
| "epoch": 5.388120591729379, |
| "grad_norm": 1.052170991897583, |
| "learning_rate": 4.1137123745819395e-07, |
| "loss": 0.2519, |
| "step": 55900 |
| }, |
| { |
| "epoch": 5.388120591729379, |
| "eval_loss": 0.22797919809818268, |
| "eval_runtime": 58.9622, |
| "eval_samples_per_second": 169.6, |
| "eval_steps_per_second": 21.2, |
| "step": 55900 |
| }, |
| { |
| "epoch": 5.397759447886319, |
| "grad_norm": 1.0489306449890137, |
| "learning_rate": 4.0133779264214045e-07, |
| "loss": 0.2501, |
| "step": 56000 |
| }, |
| { |
| "epoch": 5.397759447886319, |
| "eval_loss": 0.2253742218017578, |
| "eval_runtime": 59.1206, |
| "eval_samples_per_second": 169.146, |
| "eval_steps_per_second": 21.143, |
| "step": 56000 |
| }, |
| { |
| "epoch": 5.407398304043259, |
| "grad_norm": 1.158197045326233, |
| "learning_rate": 3.9130434782608694e-07, |
| "loss": 0.2502, |
| "step": 56100 |
| }, |
| { |
| "epoch": 5.407398304043259, |
| "eval_loss": 0.23163315653800964, |
| "eval_runtime": 59.1282, |
| "eval_samples_per_second": 169.124, |
| "eval_steps_per_second": 21.141, |
| "step": 56100 |
| }, |
| { |
| "epoch": 5.417037160200199, |
| "grad_norm": 1.1179620027542114, |
| "learning_rate": 3.8127090301003344e-07, |
| "loss": 0.2516, |
| "step": 56200 |
| }, |
| { |
| "epoch": 5.417037160200199, |
| "eval_loss": 0.22414067387580872, |
| "eval_runtime": 58.9601, |
| "eval_samples_per_second": 169.606, |
| "eval_steps_per_second": 21.201, |
| "step": 56200 |
| }, |
| { |
| "epoch": 5.426676016357139, |
| "grad_norm": 1.0244063138961792, |
| "learning_rate": 3.7123745819397994e-07, |
| "loss": 0.2493, |
| "step": 56300 |
| }, |
| { |
| "epoch": 5.426676016357139, |
| "eval_loss": 0.22705614566802979, |
| "eval_runtime": 58.9344, |
| "eval_samples_per_second": 169.68, |
| "eval_steps_per_second": 21.21, |
| "step": 56300 |
| }, |
| { |
| "epoch": 5.436314872514079, |
| "grad_norm": 1.1290541887283325, |
| "learning_rate": 3.6120401337792644e-07, |
| "loss": 0.2502, |
| "step": 56400 |
| }, |
| { |
| "epoch": 5.436314872514079, |
| "eval_loss": 0.2250737100839615, |
| "eval_runtime": 59.1308, |
| "eval_samples_per_second": 169.117, |
| "eval_steps_per_second": 21.14, |
| "step": 56400 |
| }, |
| { |
| "epoch": 5.445953728671019, |
| "grad_norm": 1.2744230031967163, |
| "learning_rate": 3.5117056856187294e-07, |
| "loss": 0.2511, |
| "step": 56500 |
| }, |
| { |
| "epoch": 5.445953728671019, |
| "eval_loss": 0.22477617859840393, |
| "eval_runtime": 58.9688, |
| "eval_samples_per_second": 169.581, |
| "eval_steps_per_second": 21.198, |
| "step": 56500 |
| }, |
| { |
| "epoch": 5.455592584827959, |
| "grad_norm": 1.1114057302474976, |
| "learning_rate": 3.411371237458194e-07, |
| "loss": 0.2484, |
| "step": 56600 |
| }, |
| { |
| "epoch": 5.455592584827959, |
| "eval_loss": 0.226850688457489, |
| "eval_runtime": 58.9713, |
| "eval_samples_per_second": 169.574, |
| "eval_steps_per_second": 21.197, |
| "step": 56600 |
| }, |
| { |
| "epoch": 5.4652314409848985, |
| "grad_norm": 1.1400434970855713, |
| "learning_rate": 3.311036789297659e-07, |
| "loss": 0.2508, |
| "step": 56700 |
| }, |
| { |
| "epoch": 5.4652314409848985, |
| "eval_loss": 0.22591835260391235, |
| "eval_runtime": 59.1265, |
| "eval_samples_per_second": 169.129, |
| "eval_steps_per_second": 21.141, |
| "step": 56700 |
| }, |
| { |
| "epoch": 5.474870297141838, |
| "grad_norm": 1.2677631378173828, |
| "learning_rate": 3.210702341137124e-07, |
| "loss": 0.249, |
| "step": 56800 |
| }, |
| { |
| "epoch": 5.474870297141838, |
| "eval_loss": 0.22445721924304962, |
| "eval_runtime": 58.9365, |
| "eval_samples_per_second": 169.674, |
| "eval_steps_per_second": 21.209, |
| "step": 56800 |
| }, |
| { |
| "epoch": 5.484509153298778, |
| "grad_norm": 1.1901522874832153, |
| "learning_rate": 3.110367892976589e-07, |
| "loss": 0.2516, |
| "step": 56900 |
| }, |
| { |
| "epoch": 5.484509153298778, |
| "eval_loss": 0.22300176322460175, |
| "eval_runtime": 59.0008, |
| "eval_samples_per_second": 169.489, |
| "eval_steps_per_second": 21.186, |
| "step": 56900 |
| }, |
| { |
| "epoch": 5.494148009455718, |
| "grad_norm": 1.1611310243606567, |
| "learning_rate": 3.010033444816054e-07, |
| "loss": 0.2497, |
| "step": 57000 |
| }, |
| { |
| "epoch": 5.494148009455718, |
| "eval_loss": 0.21740412712097168, |
| "eval_runtime": 59.137, |
| "eval_samples_per_second": 169.099, |
| "eval_steps_per_second": 21.137, |
| "step": 57000 |
| }, |
| { |
| "epoch": 5.503786865612658, |
| "grad_norm": 1.2610938549041748, |
| "learning_rate": 2.9096989966555187e-07, |
| "loss": 0.2476, |
| "step": 57100 |
| }, |
| { |
| "epoch": 5.503786865612658, |
| "eval_loss": 0.22454555332660675, |
| "eval_runtime": 58.7081, |
| "eval_samples_per_second": 170.334, |
| "eval_steps_per_second": 21.292, |
| "step": 57100 |
| }, |
| { |
| "epoch": 5.513425721769598, |
| "grad_norm": 1.2325226068496704, |
| "learning_rate": 2.809364548494983e-07, |
| "loss": 0.2484, |
| "step": 57200 |
| }, |
| { |
| "epoch": 5.513425721769598, |
| "eval_loss": 0.22510981559753418, |
| "eval_runtime": 58.6381, |
| "eval_samples_per_second": 170.538, |
| "eval_steps_per_second": 21.317, |
| "step": 57200 |
| }, |
| { |
| "epoch": 5.5230645779265375, |
| "grad_norm": 1.0809872150421143, |
| "learning_rate": 2.709030100334448e-07, |
| "loss": 0.2482, |
| "step": 57300 |
| }, |
| { |
| "epoch": 5.5230645779265375, |
| "eval_loss": 0.22954057157039642, |
| "eval_runtime": 58.8227, |
| "eval_samples_per_second": 170.002, |
| "eval_steps_per_second": 21.25, |
| "step": 57300 |
| }, |
| { |
| "epoch": 5.532703434083477, |
| "grad_norm": 1.1554908752441406, |
| "learning_rate": 2.608695652173913e-07, |
| "loss": 0.2489, |
| "step": 57400 |
| }, |
| { |
| "epoch": 5.532703434083477, |
| "eval_loss": 0.21920810639858246, |
| "eval_runtime": 58.641, |
| "eval_samples_per_second": 170.529, |
| "eval_steps_per_second": 21.316, |
| "step": 57400 |
| }, |
| { |
| "epoch": 5.542342290240417, |
| "grad_norm": 1.113728404045105, |
| "learning_rate": 2.508361204013378e-07, |
| "loss": 0.249, |
| "step": 57500 |
| }, |
| { |
| "epoch": 5.542342290240417, |
| "eval_loss": 0.22062529623508453, |
| "eval_runtime": 58.656, |
| "eval_samples_per_second": 170.485, |
| "eval_steps_per_second": 21.311, |
| "step": 57500 |
| }, |
| { |
| "epoch": 5.551981146397357, |
| "grad_norm": 1.179052710533142, |
| "learning_rate": 2.408026755852843e-07, |
| "loss": 0.2502, |
| "step": 57600 |
| }, |
| { |
| "epoch": 5.551981146397357, |
| "eval_loss": 0.22298461198806763, |
| "eval_runtime": 58.8967, |
| "eval_samples_per_second": 169.789, |
| "eval_steps_per_second": 21.224, |
| "step": 57600 |
| }, |
| { |
| "epoch": 5.561620002554297, |
| "grad_norm": 1.1483044624328613, |
| "learning_rate": 2.3076923076923078e-07, |
| "loss": 0.2483, |
| "step": 57700 |
| }, |
| { |
| "epoch": 5.561620002554297, |
| "eval_loss": 0.21685902774333954, |
| "eval_runtime": 58.7747, |
| "eval_samples_per_second": 170.141, |
| "eval_steps_per_second": 21.268, |
| "step": 57700 |
| }, |
| { |
| "epoch": 5.571258858711237, |
| "grad_norm": 1.2611775398254395, |
| "learning_rate": 2.2073578595317728e-07, |
| "loss": 0.25, |
| "step": 57800 |
| }, |
| { |
| "epoch": 5.571258858711237, |
| "eval_loss": 0.22439409792423248, |
| "eval_runtime": 58.7479, |
| "eval_samples_per_second": 170.219, |
| "eval_steps_per_second": 21.277, |
| "step": 57800 |
| }, |
| { |
| "epoch": 5.580897714868176, |
| "grad_norm": 1.1589492559432983, |
| "learning_rate": 2.1070234113712372e-07, |
| "loss": 0.2505, |
| "step": 57900 |
| }, |
| { |
| "epoch": 5.580897714868176, |
| "eval_loss": 0.22764070332050323, |
| "eval_runtime": 59.0825, |
| "eval_samples_per_second": 169.255, |
| "eval_steps_per_second": 21.157, |
| "step": 57900 |
| }, |
| { |
| "epoch": 5.590536571025116, |
| "grad_norm": 1.0722376108169556, |
| "learning_rate": 2.0066889632107022e-07, |
| "loss": 0.2493, |
| "step": 58000 |
| }, |
| { |
| "epoch": 5.590536571025116, |
| "eval_loss": 0.22544412314891815, |
| "eval_runtime": 58.7925, |
| "eval_samples_per_second": 170.09, |
| "eval_steps_per_second": 21.261, |
| "step": 58000 |
| }, |
| { |
| "epoch": 5.600175427182056, |
| "grad_norm": 1.146055817604065, |
| "learning_rate": 1.9063545150501672e-07, |
| "loss": 0.2499, |
| "step": 58100 |
| }, |
| { |
| "epoch": 5.600175427182056, |
| "eval_loss": 0.22268901765346527, |
| "eval_runtime": 58.7727, |
| "eval_samples_per_second": 170.147, |
| "eval_steps_per_second": 21.268, |
| "step": 58100 |
| }, |
| { |
| "epoch": 5.609814283338996, |
| "grad_norm": 1.1639162302017212, |
| "learning_rate": 1.8060200668896322e-07, |
| "loss": 0.2499, |
| "step": 58200 |
| }, |
| { |
| "epoch": 5.609814283338996, |
| "eval_loss": 0.21973471343517303, |
| "eval_runtime": 58.9457, |
| "eval_samples_per_second": 169.648, |
| "eval_steps_per_second": 21.206, |
| "step": 58200 |
| }, |
| { |
| "epoch": 5.619453139495936, |
| "grad_norm": 1.1292901039123535, |
| "learning_rate": 1.705685618729097e-07, |
| "loss": 0.2489, |
| "step": 58300 |
| }, |
| { |
| "epoch": 5.619453139495936, |
| "eval_loss": 0.21718740463256836, |
| "eval_runtime": 58.726, |
| "eval_samples_per_second": 170.282, |
| "eval_steps_per_second": 21.285, |
| "step": 58300 |
| }, |
| { |
| "epoch": 5.6290919956528755, |
| "grad_norm": 1.2977691888809204, |
| "learning_rate": 1.605351170568562e-07, |
| "loss": 0.25, |
| "step": 58400 |
| }, |
| { |
| "epoch": 5.6290919956528755, |
| "eval_loss": 0.21809880435466766, |
| "eval_runtime": 58.7411, |
| "eval_samples_per_second": 170.239, |
| "eval_steps_per_second": 21.28, |
| "step": 58400 |
| }, |
| { |
| "epoch": 5.638730851809815, |
| "grad_norm": 1.0982277393341064, |
| "learning_rate": 1.505016722408027e-07, |
| "loss": 0.2482, |
| "step": 58500 |
| }, |
| { |
| "epoch": 5.638730851809815, |
| "eval_loss": 0.22168171405792236, |
| "eval_runtime": 58.7029, |
| "eval_samples_per_second": 170.349, |
| "eval_steps_per_second": 21.294, |
| "step": 58500 |
| }, |
| { |
| "epoch": 5.648369707966756, |
| "grad_norm": 1.0531952381134033, |
| "learning_rate": 1.4046822742474916e-07, |
| "loss": 0.2499, |
| "step": 58600 |
| }, |
| { |
| "epoch": 5.648369707966756, |
| "eval_loss": 0.22686386108398438, |
| "eval_runtime": 58.7385, |
| "eval_samples_per_second": 170.246, |
| "eval_steps_per_second": 21.281, |
| "step": 58600 |
| }, |
| { |
| "epoch": 5.658008564123696, |
| "grad_norm": 1.147268295288086, |
| "learning_rate": 1.3043478260869566e-07, |
| "loss": 0.2518, |
| "step": 58700 |
| }, |
| { |
| "epoch": 5.658008564123696, |
| "eval_loss": 0.2252393513917923, |
| "eval_runtime": 58.8234, |
| "eval_samples_per_second": 170.0, |
| "eval_steps_per_second": 21.25, |
| "step": 58700 |
| }, |
| { |
| "epoch": 5.667647420280636, |
| "grad_norm": 1.221027135848999, |
| "learning_rate": 1.2040133779264215e-07, |
| "loss": 0.2484, |
| "step": 58800 |
| }, |
| { |
| "epoch": 5.667647420280636, |
| "eval_loss": 0.22393923997879028, |
| "eval_runtime": 59.1929, |
| "eval_samples_per_second": 168.939, |
| "eval_steps_per_second": 21.117, |
| "step": 58800 |
| }, |
| { |
| "epoch": 5.6772862764375756, |
| "grad_norm": 1.302181363105774, |
| "learning_rate": 1.1036789297658864e-07, |
| "loss": 0.2477, |
| "step": 58900 |
| }, |
| { |
| "epoch": 5.6772862764375756, |
| "eval_loss": 0.2217906266450882, |
| "eval_runtime": 58.9206, |
| "eval_samples_per_second": 169.72, |
| "eval_steps_per_second": 21.215, |
| "step": 58900 |
| }, |
| { |
| "epoch": 5.686925132594515, |
| "grad_norm": 1.0798256397247314, |
| "learning_rate": 1.0033444816053511e-07, |
| "loss": 0.2489, |
| "step": 59000 |
| }, |
| { |
| "epoch": 5.686925132594515, |
| "eval_loss": 0.21835671365261078, |
| "eval_runtime": 58.9332, |
| "eval_samples_per_second": 169.684, |
| "eval_steps_per_second": 21.21, |
| "step": 59000 |
| }, |
| { |
| "epoch": 5.696563988751455, |
| "grad_norm": 1.053399682044983, |
| "learning_rate": 9.030100334448161e-08, |
| "loss": 0.2489, |
| "step": 59100 |
| }, |
| { |
| "epoch": 5.696563988751455, |
| "eval_loss": 0.2251000553369522, |
| "eval_runtime": 59.1145, |
| "eval_samples_per_second": 169.163, |
| "eval_steps_per_second": 21.145, |
| "step": 59100 |
| }, |
| { |
| "epoch": 5.706202844908395, |
| "grad_norm": 1.1299549341201782, |
| "learning_rate": 8.02675585284281e-08, |
| "loss": 0.2516, |
| "step": 59200 |
| }, |
| { |
| "epoch": 5.706202844908395, |
| "eval_loss": 0.2261471450328827, |
| "eval_runtime": 58.9787, |
| "eval_samples_per_second": 169.553, |
| "eval_steps_per_second": 21.194, |
| "step": 59200 |
| }, |
| { |
| "epoch": 5.715841701065335, |
| "grad_norm": 1.219848394393921, |
| "learning_rate": 7.023411371237458e-08, |
| "loss": 0.2477, |
| "step": 59300 |
| }, |
| { |
| "epoch": 5.715841701065335, |
| "eval_loss": 0.22193090617656708, |
| "eval_runtime": 58.9498, |
| "eval_samples_per_second": 169.636, |
| "eval_steps_per_second": 21.204, |
| "step": 59300 |
| }, |
| { |
| "epoch": 5.725480557222275, |
| "grad_norm": 1.1641128063201904, |
| "learning_rate": 6.020066889632108e-08, |
| "loss": 0.2497, |
| "step": 59400 |
| }, |
| { |
| "epoch": 5.725480557222275, |
| "eval_loss": 0.22315214574337006, |
| "eval_runtime": 59.123, |
| "eval_samples_per_second": 169.139, |
| "eval_steps_per_second": 21.142, |
| "step": 59400 |
| }, |
| { |
| "epoch": 5.7351194133792145, |
| "grad_norm": 1.2197539806365967, |
| "learning_rate": 5.0167224080267556e-08, |
| "loss": 0.2499, |
| "step": 59500 |
| }, |
| { |
| "epoch": 5.7351194133792145, |
| "eval_loss": 0.22189626097679138, |
| "eval_runtime": 58.965, |
| "eval_samples_per_second": 169.592, |
| "eval_steps_per_second": 21.199, |
| "step": 59500 |
| }, |
| { |
| "epoch": 5.744758269536154, |
| "grad_norm": 1.1124192476272583, |
| "learning_rate": 4.013377926421405e-08, |
| "loss": 0.2476, |
| "step": 59600 |
| }, |
| { |
| "epoch": 5.744758269536154, |
| "eval_loss": 0.2227565199136734, |
| "eval_runtime": 58.9658, |
| "eval_samples_per_second": 169.59, |
| "eval_steps_per_second": 21.199, |
| "step": 59600 |
| }, |
| { |
| "epoch": 5.754397125693094, |
| "grad_norm": 1.182589054107666, |
| "learning_rate": 3.010033444816054e-08, |
| "loss": 0.2482, |
| "step": 59700 |
| }, |
| { |
| "epoch": 5.754397125693094, |
| "eval_loss": 0.22147627174854279, |
| "eval_runtime": 59.1135, |
| "eval_samples_per_second": 169.166, |
| "eval_steps_per_second": 21.146, |
| "step": 59700 |
| }, |
| { |
| "epoch": 5.764035981850034, |
| "grad_norm": 1.1507868766784668, |
| "learning_rate": 2.0066889632107024e-08, |
| "loss": 0.2471, |
| "step": 59800 |
| }, |
| { |
| "epoch": 5.764035981850034, |
| "eval_loss": 0.22146035730838776, |
| "eval_runtime": 58.9479, |
| "eval_samples_per_second": 169.641, |
| "eval_steps_per_second": 21.205, |
| "step": 59800 |
| }, |
| { |
| "epoch": 5.773674838006974, |
| "grad_norm": 1.1648634672164917, |
| "learning_rate": 1.0033444816053512e-08, |
| "loss": 0.2476, |
| "step": 59900 |
| }, |
| { |
| "epoch": 5.773674838006974, |
| "eval_loss": 0.22508075833320618, |
| "eval_runtime": 58.9459, |
| "eval_samples_per_second": 169.647, |
| "eval_steps_per_second": 21.206, |
| "step": 59900 |
| }, |
| { |
| "epoch": 5.783313694163914, |
| "grad_norm": 1.0512384176254272, |
| "learning_rate": 0.0, |
| "loss": 0.2481, |
| "step": 60000 |
| }, |
| { |
| "epoch": 5.783313694163914, |
| "eval_loss": 0.2277609407901764, |
| "eval_runtime": 59.3544, |
| "eval_samples_per_second": 168.48, |
| "eval_steps_per_second": 21.06, |
| "step": 60000 |
| } |
| ], |
| "logging_steps": 100, |
| "max_steps": 60000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 6, |
| "save_steps": 1000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.3925574648642826e+19, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|