{ "best_metric": 0.21740412712097168, "best_model_checkpoint": "learning_source_20260316/compounds/bert-output/compounds-large/checkpoint-57000", "epoch": 5.783313694163914, "eval_steps": 100, "global_step": 60000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009638856156939856, "grad_norm": 12.63675594329834, "learning_rate": 3e-06, "loss": 3.5958, "step": 100 }, { "epoch": 0.009638856156939856, "eval_loss": 2.634005546569824, "eval_runtime": 59.03, "eval_samples_per_second": 169.405, "eval_steps_per_second": 21.176, "step": 100 }, { "epoch": 0.019277712313879713, "grad_norm": 7.811080455780029, "learning_rate": 6e-06, "loss": 2.6098, "step": 200 }, { "epoch": 0.019277712313879713, "eval_loss": 2.495374917984009, "eval_runtime": 59.2811, "eval_samples_per_second": 168.688, "eval_steps_per_second": 21.086, "step": 200 }, { "epoch": 0.028916568470819567, "grad_norm": 3.372339963912964, "learning_rate": 5.989966555183947e-06, "loss": 2.4015, "step": 300 }, { "epoch": 0.028916568470819567, "eval_loss": 2.288954734802246, "eval_runtime": 59.465, "eval_samples_per_second": 168.166, "eval_steps_per_second": 21.021, "step": 300 }, { "epoch": 0.038555424627759426, "grad_norm": 28.15775489807129, "learning_rate": 5.979933110367893e-06, "loss": 2.2342, "step": 400 }, { "epoch": 0.038555424627759426, "eval_loss": 2.1951370239257812, "eval_runtime": 59.6014, "eval_samples_per_second": 167.781, "eval_steps_per_second": 20.973, "step": 400 }, { "epoch": 0.04819428078469928, "grad_norm": 8.421930313110352, "learning_rate": 5.96989966555184e-06, "loss": 2.1235, "step": 500 }, { "epoch": 0.04819428078469928, "eval_loss": 2.0739598274230957, "eval_runtime": 59.6726, "eval_samples_per_second": 167.581, "eval_steps_per_second": 20.948, "step": 500 }, { "epoch": 0.057833136941639135, "grad_norm": 5.261558532714844, "learning_rate": 5.959866220735786e-06, "loss": 2.0483, "step": 600 }, { "epoch": 0.057833136941639135, "eval_loss": 2.0082836151123047, "eval_runtime": 59.569, "eval_samples_per_second": 167.873, "eval_steps_per_second": 20.984, "step": 600 }, { "epoch": 0.06747199309857899, "grad_norm": 6.819973945617676, "learning_rate": 5.949832775919732e-06, "loss": 1.9985, "step": 700 }, { "epoch": 0.06747199309857899, "eval_loss": 1.995147466659546, "eval_runtime": 59.5165, "eval_samples_per_second": 168.021, "eval_steps_per_second": 21.003, "step": 700 }, { "epoch": 0.07711084925551885, "grad_norm": 10.216808319091797, "learning_rate": 5.939799331103679e-06, "loss": 1.9442, "step": 800 }, { "epoch": 0.07711084925551885, "eval_loss": 1.901943325996399, "eval_runtime": 59.7859, "eval_samples_per_second": 167.264, "eval_steps_per_second": 20.908, "step": 800 }, { "epoch": 0.0867497054124587, "grad_norm": 4.163369178771973, "learning_rate": 5.929765886287626e-06, "loss": 1.9063, "step": 900 }, { "epoch": 0.0867497054124587, "eval_loss": 1.8962064981460571, "eval_runtime": 59.5545, "eval_samples_per_second": 167.913, "eval_steps_per_second": 20.989, "step": 900 }, { "epoch": 0.09638856156939855, "grad_norm": 6.629236698150635, "learning_rate": 5.919732441471572e-06, "loss": 1.865, "step": 1000 }, { "epoch": 0.09638856156939855, "eval_loss": 1.8345894813537598, "eval_runtime": 59.5791, "eval_samples_per_second": 167.844, "eval_steps_per_second": 20.981, "step": 1000 }, { "epoch": 0.10602741772633842, "grad_norm": 5.353513240814209, "learning_rate": 5.9096989966555185e-06, "loss": 1.829, "step": 1100 }, { "epoch": 0.10602741772633842, "eval_loss": 1.8244454860687256, "eval_runtime": 59.7272, "eval_samples_per_second": 167.428, "eval_steps_per_second": 20.928, "step": 1100 }, { "epoch": 0.11566627388327827, "grad_norm": 8.38032341003418, "learning_rate": 5.899665551839465e-06, "loss": 1.7982, "step": 1200 }, { "epoch": 0.11566627388327827, "eval_loss": 1.8015681505203247, "eval_runtime": 59.5571, "eval_samples_per_second": 167.906, "eval_steps_per_second": 20.988, "step": 1200 }, { "epoch": 0.12530513004021812, "grad_norm": 4.500089645385742, "learning_rate": 5.889632107023412e-06, "loss": 1.7623, "step": 1300 }, { "epoch": 0.12530513004021812, "eval_loss": 1.7761175632476807, "eval_runtime": 59.5755, "eval_samples_per_second": 167.854, "eval_steps_per_second": 20.982, "step": 1300 }, { "epoch": 0.13494398619715797, "grad_norm": 4.475337028503418, "learning_rate": 5.879598662207358e-06, "loss": 1.741, "step": 1400 }, { "epoch": 0.13494398619715797, "eval_loss": 1.7422441244125366, "eval_runtime": 59.526, "eval_samples_per_second": 167.994, "eval_steps_per_second": 20.999, "step": 1400 }, { "epoch": 0.14458284235409785, "grad_norm": 4.09720516204834, "learning_rate": 5.869565217391305e-06, "loss": 1.715, "step": 1500 }, { "epoch": 0.14458284235409785, "eval_loss": 1.7092541456222534, "eval_runtime": 59.7218, "eval_samples_per_second": 167.443, "eval_steps_per_second": 20.93, "step": 1500 }, { "epoch": 0.1542216985110377, "grad_norm": 3.275571584701538, "learning_rate": 5.8595317725752514e-06, "loss": 1.6867, "step": 1600 }, { "epoch": 0.1542216985110377, "eval_loss": 1.7210519313812256, "eval_runtime": 59.5686, "eval_samples_per_second": 167.874, "eval_steps_per_second": 20.984, "step": 1600 }, { "epoch": 0.16386055466797755, "grad_norm": 3.5458016395568848, "learning_rate": 5.849498327759197e-06, "loss": 1.6636, "step": 1700 }, { "epoch": 0.16386055466797755, "eval_loss": 1.6789331436157227, "eval_runtime": 59.7455, "eval_samples_per_second": 167.376, "eval_steps_per_second": 20.922, "step": 1700 }, { "epoch": 0.1734994108249174, "grad_norm": 3.0849575996398926, "learning_rate": 5.839464882943144e-06, "loss": 1.6401, "step": 1800 }, { "epoch": 0.1734994108249174, "eval_loss": 1.6254615783691406, "eval_runtime": 59.5834, "eval_samples_per_second": 167.832, "eval_steps_per_second": 20.979, "step": 1800 }, { "epoch": 0.18313826698185726, "grad_norm": 3.4666318893432617, "learning_rate": 5.829431438127091e-06, "loss": 1.6181, "step": 1900 }, { "epoch": 0.18313826698185726, "eval_loss": 1.6075893640518188, "eval_runtime": 59.5639, "eval_samples_per_second": 167.887, "eval_steps_per_second": 20.986, "step": 1900 }, { "epoch": 0.1927771231387971, "grad_norm": 3.3370141983032227, "learning_rate": 5.819397993311037e-06, "loss": 1.597, "step": 2000 }, { "epoch": 0.1927771231387971, "eval_loss": 1.6024377346038818, "eval_runtime": 59.7218, "eval_samples_per_second": 167.443, "eval_steps_per_second": 20.93, "step": 2000 }, { "epoch": 0.202415979295737, "grad_norm": 4.16128396987915, "learning_rate": 5.8093645484949836e-06, "loss": 1.5706, "step": 2100 }, { "epoch": 0.202415979295737, "eval_loss": 1.5627351999282837, "eval_runtime": 58.7245, "eval_samples_per_second": 170.287, "eval_steps_per_second": 21.286, "step": 2100 }, { "epoch": 0.21205483545267684, "grad_norm": 4.344303131103516, "learning_rate": 5.79933110367893e-06, "loss": 1.5538, "step": 2200 }, { "epoch": 0.21205483545267684, "eval_loss": 1.5229161977767944, "eval_runtime": 58.5407, "eval_samples_per_second": 170.821, "eval_steps_per_second": 21.353, "step": 2200 }, { "epoch": 0.2216936916096167, "grad_norm": 3.6315417289733887, "learning_rate": 5.789297658862876e-06, "loss": 1.5317, "step": 2300 }, { "epoch": 0.2216936916096167, "eval_loss": 1.5036700963974, "eval_runtime": 58.5025, "eval_samples_per_second": 170.933, "eval_steps_per_second": 21.367, "step": 2300 }, { "epoch": 0.23133254776655654, "grad_norm": 3.0974552631378174, "learning_rate": 5.779264214046823e-06, "loss": 1.5018, "step": 2400 }, { "epoch": 0.23133254776655654, "eval_loss": 1.4860730171203613, "eval_runtime": 58.715, "eval_samples_per_second": 170.314, "eval_steps_per_second": 21.289, "step": 2400 }, { "epoch": 0.2409714039234964, "grad_norm": 3.808483362197876, "learning_rate": 5.76923076923077e-06, "loss": 1.4821, "step": 2500 }, { "epoch": 0.2409714039234964, "eval_loss": 1.446873664855957, "eval_runtime": 60.847, "eval_samples_per_second": 164.347, "eval_steps_per_second": 20.543, "step": 2500 }, { "epoch": 0.25061026008043624, "grad_norm": 4.335411071777344, "learning_rate": 5.759197324414716e-06, "loss": 1.4527, "step": 2600 }, { "epoch": 0.25061026008043624, "eval_loss": 1.4302533864974976, "eval_runtime": 58.726, "eval_samples_per_second": 170.282, "eval_steps_per_second": 21.285, "step": 2600 }, { "epoch": 0.2602491162373761, "grad_norm": 3.0945868492126465, "learning_rate": 5.7491638795986624e-06, "loss": 1.4247, "step": 2700 }, { "epoch": 0.2602491162373761, "eval_loss": 1.386735200881958, "eval_runtime": 58.8022, "eval_samples_per_second": 170.062, "eval_steps_per_second": 21.258, "step": 2700 }, { "epoch": 0.26988797239431594, "grad_norm": 3.442647933959961, "learning_rate": 5.739130434782609e-06, "loss": 1.3978, "step": 2800 }, { "epoch": 0.26988797239431594, "eval_loss": 1.3386584520339966, "eval_runtime": 58.8842, "eval_samples_per_second": 169.825, "eval_steps_per_second": 21.228, "step": 2800 }, { "epoch": 0.2795268285512558, "grad_norm": 3.443277597427368, "learning_rate": 5.729096989966555e-06, "loss": 1.3653, "step": 2900 }, { "epoch": 0.2795268285512558, "eval_loss": 1.3010079860687256, "eval_runtime": 58.9775, "eval_samples_per_second": 169.556, "eval_steps_per_second": 21.195, "step": 2900 }, { "epoch": 0.2891656847081957, "grad_norm": 2.790172576904297, "learning_rate": 5.719063545150502e-06, "loss": 1.3367, "step": 3000 }, { "epoch": 0.2891656847081957, "eval_loss": 1.2636170387268066, "eval_runtime": 58.8313, "eval_samples_per_second": 169.978, "eval_steps_per_second": 21.247, "step": 3000 }, { "epoch": 0.2988045408651355, "grad_norm": 3.3221867084503174, "learning_rate": 5.709030100334449e-06, "loss": 1.2931, "step": 3100 }, { "epoch": 0.2988045408651355, "eval_loss": 1.2360334396362305, "eval_runtime": 58.8407, "eval_samples_per_second": 169.95, "eval_steps_per_second": 21.244, "step": 3100 }, { "epoch": 0.3084433970220754, "grad_norm": 3.623957395553589, "learning_rate": 5.698996655518395e-06, "loss": 1.2604, "step": 3200 }, { "epoch": 0.3084433970220754, "eval_loss": 1.1867709159851074, "eval_runtime": 59.044, "eval_samples_per_second": 169.365, "eval_steps_per_second": 21.171, "step": 3200 }, { "epoch": 0.31808225317901523, "grad_norm": 3.344835042953491, "learning_rate": 5.688963210702341e-06, "loss": 1.2309, "step": 3300 }, { "epoch": 0.31808225317901523, "eval_loss": 1.1461621522903442, "eval_runtime": 58.853, "eval_samples_per_second": 169.915, "eval_steps_per_second": 21.239, "step": 3300 }, { "epoch": 0.3277211093359551, "grad_norm": 3.0504872798919678, "learning_rate": 5.678929765886288e-06, "loss": 1.2023, "step": 3400 }, { "epoch": 0.3277211093359551, "eval_loss": 1.1227951049804688, "eval_runtime": 58.8532, "eval_samples_per_second": 169.914, "eval_steps_per_second": 21.239, "step": 3400 }, { "epoch": 0.33735996549289493, "grad_norm": 3.358616352081299, "learning_rate": 5.668896321070235e-06, "loss": 1.1815, "step": 3500 }, { "epoch": 0.33735996549289493, "eval_loss": 1.0813926458358765, "eval_runtime": 59.0352, "eval_samples_per_second": 169.39, "eval_steps_per_second": 21.174, "step": 3500 }, { "epoch": 0.3469988216498348, "grad_norm": 2.9233672618865967, "learning_rate": 5.658862876254181e-06, "loss": 1.1533, "step": 3600 }, { "epoch": 0.3469988216498348, "eval_loss": 1.0554317235946655, "eval_runtime": 58.8485, "eval_samples_per_second": 169.928, "eval_steps_per_second": 21.241, "step": 3600 }, { "epoch": 0.3566376778067747, "grad_norm": 2.5221972465515137, "learning_rate": 5.6488294314381275e-06, "loss": 1.1312, "step": 3700 }, { "epoch": 0.3566376778067747, "eval_loss": 1.040281891822815, "eval_runtime": 58.8638, "eval_samples_per_second": 169.884, "eval_steps_per_second": 21.235, "step": 3700 }, { "epoch": 0.3662765339637145, "grad_norm": 2.611217737197876, "learning_rate": 5.638795986622074e-06, "loss": 1.109, "step": 3800 }, { "epoch": 0.3662765339637145, "eval_loss": 1.0209678411483765, "eval_runtime": 59.0369, "eval_samples_per_second": 169.385, "eval_steps_per_second": 21.173, "step": 3800 }, { "epoch": 0.3759153901206544, "grad_norm": 2.4788668155670166, "learning_rate": 5.62876254180602e-06, "loss": 1.0839, "step": 3900 }, { "epoch": 0.3759153901206544, "eval_loss": 0.9792139530181885, "eval_runtime": 58.8512, "eval_samples_per_second": 169.92, "eval_steps_per_second": 21.24, "step": 3900 }, { "epoch": 0.3855542462775942, "grad_norm": 2.7431282997131348, "learning_rate": 5.618729096989967e-06, "loss": 1.0636, "step": 4000 }, { "epoch": 0.3855542462775942, "eval_loss": 0.9677358269691467, "eval_runtime": 58.6358, "eval_samples_per_second": 170.544, "eval_steps_per_second": 21.318, "step": 4000 }, { "epoch": 0.3951931024345341, "grad_norm": 3.1817986965179443, "learning_rate": 5.608695652173914e-06, "loss": 1.0438, "step": 4100 }, { "epoch": 0.3951931024345341, "eval_loss": 0.9539806842803955, "eval_runtime": 59.0244, "eval_samples_per_second": 169.422, "eval_steps_per_second": 21.178, "step": 4100 }, { "epoch": 0.404831958591474, "grad_norm": 2.875279664993286, "learning_rate": 5.59866220735786e-06, "loss": 1.0221, "step": 4200 }, { "epoch": 0.404831958591474, "eval_loss": 0.9230886697769165, "eval_runtime": 58.867, "eval_samples_per_second": 169.874, "eval_steps_per_second": 21.234, "step": 4200 }, { "epoch": 0.4144708147484138, "grad_norm": 3.0002613067626953, "learning_rate": 5.588628762541806e-06, "loss": 1.0044, "step": 4300 }, { "epoch": 0.4144708147484138, "eval_loss": 0.9111505150794983, "eval_runtime": 59.1064, "eval_samples_per_second": 169.186, "eval_steps_per_second": 21.148, "step": 4300 }, { "epoch": 0.4241096709053537, "grad_norm": 3.194329261779785, "learning_rate": 5.578595317725753e-06, "loss": 0.9883, "step": 4400 }, { "epoch": 0.4241096709053537, "eval_loss": 0.887302815914154, "eval_runtime": 58.9661, "eval_samples_per_second": 169.589, "eval_steps_per_second": 21.199, "step": 4400 }, { "epoch": 0.4337485270622935, "grad_norm": 2.66054630279541, "learning_rate": 5.568561872909699e-06, "loss": 0.9736, "step": 4500 }, { "epoch": 0.4337485270622935, "eval_loss": 0.8647195100784302, "eval_runtime": 59.1105, "eval_samples_per_second": 169.175, "eval_steps_per_second": 21.147, "step": 4500 }, { "epoch": 0.4433873832192334, "grad_norm": 2.8895978927612305, "learning_rate": 5.558528428093646e-06, "loss": 0.9549, "step": 4600 }, { "epoch": 0.4433873832192334, "eval_loss": 0.8601205348968506, "eval_runtime": 58.9049, "eval_samples_per_second": 169.765, "eval_steps_per_second": 21.221, "step": 4600 }, { "epoch": 0.45302623937617326, "grad_norm": 3.127106189727783, "learning_rate": 5.548494983277593e-06, "loss": 0.941, "step": 4700 }, { "epoch": 0.45302623937617326, "eval_loss": 0.8301200270652771, "eval_runtime": 58.8868, "eval_samples_per_second": 169.817, "eval_steps_per_second": 21.227, "step": 4700 }, { "epoch": 0.4626650955331131, "grad_norm": 2.7476322650909424, "learning_rate": 5.5384615384615385e-06, "loss": 0.9228, "step": 4800 }, { "epoch": 0.4626650955331131, "eval_loss": 0.8239719867706299, "eval_runtime": 59.0464, "eval_samples_per_second": 169.358, "eval_steps_per_second": 21.17, "step": 4800 }, { "epoch": 0.47230395169005296, "grad_norm": 2.541245460510254, "learning_rate": 5.528428093645485e-06, "loss": 0.9153, "step": 4900 }, { "epoch": 0.47230395169005296, "eval_loss": 0.8104541301727295, "eval_runtime": 58.8715, "eval_samples_per_second": 169.861, "eval_steps_per_second": 21.233, "step": 4900 }, { "epoch": 0.4819428078469928, "grad_norm": 2.851243495941162, "learning_rate": 5.518394648829432e-06, "loss": 0.8994, "step": 5000 }, { "epoch": 0.4819428078469928, "eval_loss": 0.8006957769393921, "eval_runtime": 58.8695, "eval_samples_per_second": 169.867, "eval_steps_per_second": 21.233, "step": 5000 }, { "epoch": 0.49158166400393266, "grad_norm": 2.5873661041259766, "learning_rate": 5.508361204013378e-06, "loss": 0.8893, "step": 5100 }, { "epoch": 0.49158166400393266, "eval_loss": 0.7788259983062744, "eval_runtime": 59.0735, "eval_samples_per_second": 169.281, "eval_steps_per_second": 21.16, "step": 5100 }, { "epoch": 0.5012205201608725, "grad_norm": 2.6130380630493164, "learning_rate": 5.498327759197324e-06, "loss": 0.8711, "step": 5200 }, { "epoch": 0.5012205201608725, "eval_loss": 0.7742456793785095, "eval_runtime": 58.8663, "eval_samples_per_second": 169.877, "eval_steps_per_second": 21.235, "step": 5200 }, { "epoch": 0.5108593763178124, "grad_norm": 2.606877565383911, "learning_rate": 5.488294314381271e-06, "loss": 0.8602, "step": 5300 }, { "epoch": 0.5108593763178124, "eval_loss": 0.7536377310752869, "eval_runtime": 58.833, "eval_samples_per_second": 169.973, "eval_steps_per_second": 21.247, "step": 5300 }, { "epoch": 0.5204982324747522, "grad_norm": 2.596773624420166, "learning_rate": 5.478260869565217e-06, "loss": 0.853, "step": 5400 }, { "epoch": 0.5204982324747522, "eval_loss": 0.747475802898407, "eval_runtime": 59.0592, "eval_samples_per_second": 169.322, "eval_steps_per_second": 21.165, "step": 5400 }, { "epoch": 0.5301370886316921, "grad_norm": 2.5571069717407227, "learning_rate": 5.468227424749163e-06, "loss": 0.8383, "step": 5500 }, { "epoch": 0.5301370886316921, "eval_loss": 0.7383442521095276, "eval_runtime": 58.8303, "eval_samples_per_second": 169.98, "eval_steps_per_second": 21.248, "step": 5500 }, { "epoch": 0.5397759447886319, "grad_norm": 2.3702540397644043, "learning_rate": 5.45819397993311e-06, "loss": 0.8259, "step": 5600 }, { "epoch": 0.5397759447886319, "eval_loss": 0.7310738563537598, "eval_runtime": 58.825, "eval_samples_per_second": 169.996, "eval_steps_per_second": 21.249, "step": 5600 }, { "epoch": 0.5494148009455718, "grad_norm": 2.6348025798797607, "learning_rate": 5.448160535117057e-06, "loss": 0.8094, "step": 5700 }, { "epoch": 0.5494148009455718, "eval_loss": 0.7255334854125977, "eval_runtime": 58.9121, "eval_samples_per_second": 169.744, "eval_steps_per_second": 21.218, "step": 5700 }, { "epoch": 0.5590536571025116, "grad_norm": 2.291001796722412, "learning_rate": 5.438127090301003e-06, "loss": 0.8038, "step": 5800 }, { "epoch": 0.5590536571025116, "eval_loss": 0.7195360064506531, "eval_runtime": 59.0355, "eval_samples_per_second": 169.389, "eval_steps_per_second": 21.174, "step": 5800 }, { "epoch": 0.5686925132594515, "grad_norm": 2.163550853729248, "learning_rate": 5.4280936454849495e-06, "loss": 0.7985, "step": 5900 }, { "epoch": 0.5686925132594515, "eval_loss": 0.7090545892715454, "eval_runtime": 59.11, "eval_samples_per_second": 169.176, "eval_steps_per_second": 21.147, "step": 5900 }, { "epoch": 0.5783313694163914, "grad_norm": 2.325115442276001, "learning_rate": 5.418060200668896e-06, "loss": 0.7814, "step": 6000 }, { "epoch": 0.5783313694163914, "eval_loss": 0.6920502185821533, "eval_runtime": 58.907, "eval_samples_per_second": 169.759, "eval_steps_per_second": 21.22, "step": 6000 }, { "epoch": 0.5879702255733312, "grad_norm": 2.495279550552368, "learning_rate": 5.408026755852843e-06, "loss": 0.7771, "step": 6100 }, { "epoch": 0.5879702255733312, "eval_loss": 0.6769218444824219, "eval_runtime": 58.9261, "eval_samples_per_second": 169.704, "eval_steps_per_second": 21.213, "step": 6100 }, { "epoch": 0.597609081730271, "grad_norm": 2.5387184619903564, "learning_rate": 5.397993311036789e-06, "loss": 0.763, "step": 6200 }, { "epoch": 0.597609081730271, "eval_loss": 0.6797191500663757, "eval_runtime": 58.9067, "eval_samples_per_second": 169.76, "eval_steps_per_second": 21.22, "step": 6200 }, { "epoch": 0.6072479378872109, "grad_norm": 2.340493679046631, "learning_rate": 5.387959866220736e-06, "loss": 0.7531, "step": 6300 }, { "epoch": 0.6072479378872109, "eval_loss": 0.6665938496589661, "eval_runtime": 59.074, "eval_samples_per_second": 169.279, "eval_steps_per_second": 21.16, "step": 6300 }, { "epoch": 0.6168867940441508, "grad_norm": 2.8370766639709473, "learning_rate": 5.3779264214046825e-06, "loss": 0.7482, "step": 6400 }, { "epoch": 0.6168867940441508, "eval_loss": 0.6552098393440247, "eval_runtime": 58.9034, "eval_samples_per_second": 169.769, "eval_steps_per_second": 21.221, "step": 6400 }, { "epoch": 0.6265256502010906, "grad_norm": 2.3310625553131104, "learning_rate": 5.367892976588628e-06, "loss": 0.7362, "step": 6500 }, { "epoch": 0.6265256502010906, "eval_loss": 0.6410496234893799, "eval_runtime": 58.8769, "eval_samples_per_second": 169.846, "eval_steps_per_second": 21.231, "step": 6500 }, { "epoch": 0.6361645063580305, "grad_norm": 2.431213855743408, "learning_rate": 5.357859531772575e-06, "loss": 0.7287, "step": 6600 }, { "epoch": 0.6361645063580305, "eval_loss": 0.6448028087615967, "eval_runtime": 58.914, "eval_samples_per_second": 169.739, "eval_steps_per_second": 21.217, "step": 6600 }, { "epoch": 0.6458033625149704, "grad_norm": 2.526439905166626, "learning_rate": 5.347826086956522e-06, "loss": 0.7191, "step": 6700 }, { "epoch": 0.6458033625149704, "eval_loss": 0.6372745037078857, "eval_runtime": 59.064, "eval_samples_per_second": 169.308, "eval_steps_per_second": 21.163, "step": 6700 }, { "epoch": 0.6554422186719102, "grad_norm": 2.177318811416626, "learning_rate": 5.337792642140468e-06, "loss": 0.7133, "step": 6800 }, { "epoch": 0.6554422186719102, "eval_loss": 0.6324734091758728, "eval_runtime": 58.894, "eval_samples_per_second": 169.796, "eval_steps_per_second": 21.225, "step": 6800 }, { "epoch": 0.66508107482885, "grad_norm": 2.04829740524292, "learning_rate": 5.327759197324415e-06, "loss": 0.7106, "step": 6900 }, { "epoch": 0.66508107482885, "eval_loss": 0.6275954246520996, "eval_runtime": 59.0721, "eval_samples_per_second": 169.285, "eval_steps_per_second": 21.161, "step": 6900 }, { "epoch": 0.6747199309857899, "grad_norm": 2.3506946563720703, "learning_rate": 5.317725752508361e-06, "loss": 0.7021, "step": 7000 }, { "epoch": 0.6747199309857899, "eval_loss": 0.6138682961463928, "eval_runtime": 58.6329, "eval_samples_per_second": 170.553, "eval_steps_per_second": 21.319, "step": 7000 }, { "epoch": 0.6843587871427298, "grad_norm": 2.0266621112823486, "learning_rate": 5.307692307692307e-06, "loss": 0.6934, "step": 7100 }, { "epoch": 0.6843587871427298, "eval_loss": 0.6087481379508972, "eval_runtime": 58.7542, "eval_samples_per_second": 170.2, "eval_steps_per_second": 21.275, "step": 7100 }, { "epoch": 0.6939976432996696, "grad_norm": 2.079177141189575, "learning_rate": 5.297658862876254e-06, "loss": 0.6899, "step": 7200 }, { "epoch": 0.6939976432996696, "eval_loss": 0.6015520095825195, "eval_runtime": 58.7595, "eval_samples_per_second": 170.185, "eval_steps_per_second": 21.273, "step": 7200 }, { "epoch": 0.7036364994566094, "grad_norm": 1.8569824695587158, "learning_rate": 5.287625418060201e-06, "loss": 0.6771, "step": 7300 }, { "epoch": 0.7036364994566094, "eval_loss": 0.5964910984039307, "eval_runtime": 58.8357, "eval_samples_per_second": 169.965, "eval_steps_per_second": 21.246, "step": 7300 }, { "epoch": 0.7132753556135494, "grad_norm": 2.9515089988708496, "learning_rate": 5.277591973244147e-06, "loss": 0.6715, "step": 7400 }, { "epoch": 0.7132753556135494, "eval_loss": 0.5907161235809326, "eval_runtime": 59.0755, "eval_samples_per_second": 169.275, "eval_steps_per_second": 21.159, "step": 7400 }, { "epoch": 0.7229142117704892, "grad_norm": 2.322965383529663, "learning_rate": 5.2675585284280935e-06, "loss": 0.6703, "step": 7500 }, { "epoch": 0.7229142117704892, "eval_loss": 0.5861114263534546, "eval_runtime": 58.8744, "eval_samples_per_second": 169.853, "eval_steps_per_second": 21.232, "step": 7500 }, { "epoch": 0.732553067927429, "grad_norm": 2.576846122741699, "learning_rate": 5.25752508361204e-06, "loss": 0.6655, "step": 7600 }, { "epoch": 0.732553067927429, "eval_loss": 0.5830049514770508, "eval_runtime": 58.8479, "eval_samples_per_second": 169.93, "eval_steps_per_second": 21.241, "step": 7600 }, { "epoch": 0.742191924084369, "grad_norm": 2.485050678253174, "learning_rate": 5.247491638795986e-06, "loss": 0.6531, "step": 7700 }, { "epoch": 0.742191924084369, "eval_loss": 0.58382648229599, "eval_runtime": 59.0846, "eval_samples_per_second": 169.249, "eval_steps_per_second": 21.156, "step": 7700 }, { "epoch": 0.7518307802413088, "grad_norm": 2.1257286071777344, "learning_rate": 5.237458193979933e-06, "loss": 0.6468, "step": 7800 }, { "epoch": 0.7518307802413088, "eval_loss": 0.5759472846984863, "eval_runtime": 58.9027, "eval_samples_per_second": 169.771, "eval_steps_per_second": 21.221, "step": 7800 }, { "epoch": 0.7614696363982486, "grad_norm": 2.0705039501190186, "learning_rate": 5.22742474916388e-06, "loss": 0.64, "step": 7900 }, { "epoch": 0.7614696363982486, "eval_loss": 0.5682920813560486, "eval_runtime": 58.8786, "eval_samples_per_second": 169.841, "eval_steps_per_second": 21.23, "step": 7900 }, { "epoch": 0.7711084925551884, "grad_norm": 2.1057567596435547, "learning_rate": 5.2173913043478265e-06, "loss": 0.642, "step": 8000 }, { "epoch": 0.7711084925551884, "eval_loss": 0.5687887072563171, "eval_runtime": 58.9975, "eval_samples_per_second": 169.499, "eval_steps_per_second": 21.187, "step": 8000 }, { "epoch": 0.7807473487121284, "grad_norm": 2.056955337524414, "learning_rate": 5.207357859531772e-06, "loss": 0.6348, "step": 8100 }, { "epoch": 0.7807473487121284, "eval_loss": 0.5623904466629028, "eval_runtime": 58.9219, "eval_samples_per_second": 169.716, "eval_steps_per_second": 21.215, "step": 8100 }, { "epoch": 0.7903862048690682, "grad_norm": 2.312119483947754, "learning_rate": 5.197324414715719e-06, "loss": 0.628, "step": 8200 }, { "epoch": 0.7903862048690682, "eval_loss": 0.5485402345657349, "eval_runtime": 58.8197, "eval_samples_per_second": 170.011, "eval_steps_per_second": 21.251, "step": 8200 }, { "epoch": 0.800025061026008, "grad_norm": 2.316204309463501, "learning_rate": 5.187290969899666e-06, "loss": 0.6219, "step": 8300 }, { "epoch": 0.800025061026008, "eval_loss": 0.5563015341758728, "eval_runtime": 58.9997, "eval_samples_per_second": 169.492, "eval_steps_per_second": 21.187, "step": 8300 }, { "epoch": 0.809663917182948, "grad_norm": 2.1646835803985596, "learning_rate": 5.177257525083612e-06, "loss": 0.6229, "step": 8400 }, { "epoch": 0.809663917182948, "eval_loss": 0.5380101799964905, "eval_runtime": 58.5918, "eval_samples_per_second": 170.672, "eval_steps_per_second": 21.334, "step": 8400 }, { "epoch": 0.8193027733398878, "grad_norm": 1.8544220924377441, "learning_rate": 5.167224080267559e-06, "loss": 0.616, "step": 8500 }, { "epoch": 0.8193027733398878, "eval_loss": 0.5434267520904541, "eval_runtime": 58.8918, "eval_samples_per_second": 169.803, "eval_steps_per_second": 21.225, "step": 8500 }, { "epoch": 0.8289416294968276, "grad_norm": 1.9332796335220337, "learning_rate": 5.157190635451505e-06, "loss": 0.6112, "step": 8600 }, { "epoch": 0.8289416294968276, "eval_loss": 0.5379143357276917, "eval_runtime": 59.0748, "eval_samples_per_second": 169.277, "eval_steps_per_second": 21.16, "step": 8600 }, { "epoch": 0.8385804856537674, "grad_norm": 1.9109045267105103, "learning_rate": 5.147157190635451e-06, "loss": 0.6065, "step": 8700 }, { "epoch": 0.8385804856537674, "eval_loss": 0.5346018075942993, "eval_runtime": 58.8974, "eval_samples_per_second": 169.787, "eval_steps_per_second": 21.223, "step": 8700 }, { "epoch": 0.8482193418107074, "grad_norm": 2.635373115539551, "learning_rate": 5.137123745819398e-06, "loss": 0.6007, "step": 8800 }, { "epoch": 0.8482193418107074, "eval_loss": 0.5312322378158569, "eval_runtime": 59.1058, "eval_samples_per_second": 169.188, "eval_steps_per_second": 21.149, "step": 8800 }, { "epoch": 0.8578581979676472, "grad_norm": 2.175607919692993, "learning_rate": 5.127090301003345e-06, "loss": 0.5994, "step": 8900 }, { "epoch": 0.8578581979676472, "eval_loss": 0.5221748948097229, "eval_runtime": 58.9139, "eval_samples_per_second": 169.739, "eval_steps_per_second": 21.217, "step": 8900 }, { "epoch": 0.867497054124587, "grad_norm": 2.22357439994812, "learning_rate": 5.117056856187291e-06, "loss": 0.5891, "step": 9000 }, { "epoch": 0.867497054124587, "eval_loss": 0.523512065410614, "eval_runtime": 59.0804, "eval_samples_per_second": 169.261, "eval_steps_per_second": 21.158, "step": 9000 }, { "epoch": 0.8771359102815269, "grad_norm": 2.295689582824707, "learning_rate": 5.1070234113712375e-06, "loss": 0.5906, "step": 9100 }, { "epoch": 0.8771359102815269, "eval_loss": 0.524342954158783, "eval_runtime": 58.896, "eval_samples_per_second": 169.791, "eval_steps_per_second": 21.224, "step": 9100 }, { "epoch": 0.8867747664384668, "grad_norm": 1.769615650177002, "learning_rate": 5.096989966555184e-06, "loss": 0.5879, "step": 9200 }, { "epoch": 0.8867747664384668, "eval_loss": 0.5283530354499817, "eval_runtime": 58.9193, "eval_samples_per_second": 169.724, "eval_steps_per_second": 21.215, "step": 9200 }, { "epoch": 0.8964136225954066, "grad_norm": 1.7100142240524292, "learning_rate": 5.08695652173913e-06, "loss": 0.5838, "step": 9300 }, { "epoch": 0.8964136225954066, "eval_loss": 0.5141860842704773, "eval_runtime": 59.0717, "eval_samples_per_second": 169.286, "eval_steps_per_second": 21.161, "step": 9300 }, { "epoch": 0.9060524787523465, "grad_norm": 2.1682353019714355, "learning_rate": 5.076923076923077e-06, "loss": 0.5743, "step": 9400 }, { "epoch": 0.9060524787523465, "eval_loss": 0.5123055577278137, "eval_runtime": 58.9232, "eval_samples_per_second": 169.713, "eval_steps_per_second": 21.214, "step": 9400 }, { "epoch": 0.9156913349092863, "grad_norm": 1.9889121055603027, "learning_rate": 5.066889632107024e-06, "loss": 0.5745, "step": 9500 }, { "epoch": 0.9156913349092863, "eval_loss": 0.503046989440918, "eval_runtime": 59.0466, "eval_samples_per_second": 169.358, "eval_steps_per_second": 21.17, "step": 9500 }, { "epoch": 0.9253301910662262, "grad_norm": 1.9235670566558838, "learning_rate": 5.05685618729097e-06, "loss": 0.5695, "step": 9600 }, { "epoch": 0.9253301910662262, "eval_loss": 0.5009673833847046, "eval_runtime": 58.906, "eval_samples_per_second": 169.762, "eval_steps_per_second": 21.22, "step": 9600 }, { "epoch": 0.934969047223166, "grad_norm": 2.0465199947357178, "learning_rate": 5.046822742474916e-06, "loss": 0.5687, "step": 9700 }, { "epoch": 0.934969047223166, "eval_loss": 0.5043101906776428, "eval_runtime": 58.9009, "eval_samples_per_second": 169.777, "eval_steps_per_second": 21.222, "step": 9700 }, { "epoch": 0.9446079033801059, "grad_norm": 1.7765299081802368, "learning_rate": 5.036789297658863e-06, "loss": 0.5577, "step": 9800 }, { "epoch": 0.9446079033801059, "eval_loss": 0.4905799329280853, "eval_runtime": 59.0953, "eval_samples_per_second": 169.218, "eval_steps_per_second": 21.152, "step": 9800 }, { "epoch": 0.9542467595370457, "grad_norm": 1.8332940340042114, "learning_rate": 5.02675585284281e-06, "loss": 0.5616, "step": 9900 }, { "epoch": 0.9542467595370457, "eval_loss": 0.48462817072868347, "eval_runtime": 58.826, "eval_samples_per_second": 169.993, "eval_steps_per_second": 21.249, "step": 9900 }, { "epoch": 0.9638856156939856, "grad_norm": 1.9404815435409546, "learning_rate": 5.016722408026756e-06, "loss": 0.5576, "step": 10000 }, { "epoch": 0.9638856156939856, "eval_loss": 0.4888601005077362, "eval_runtime": 58.8596, "eval_samples_per_second": 169.896, "eval_steps_per_second": 21.237, "step": 10000 }, { "epoch": 0.9735244718509255, "grad_norm": 1.857668399810791, "learning_rate": 5.0066889632107026e-06, "loss": 0.5503, "step": 10100 }, { "epoch": 0.9735244718509255, "eval_loss": 0.48140355944633484, "eval_runtime": 58.9457, "eval_samples_per_second": 169.648, "eval_steps_per_second": 21.206, "step": 10100 }, { "epoch": 0.9831633280078653, "grad_norm": 1.7843375205993652, "learning_rate": 4.996655518394649e-06, "loss": 0.5477, "step": 10200 }, { "epoch": 0.9831633280078653, "eval_loss": 0.4800611436367035, "eval_runtime": 59.1103, "eval_samples_per_second": 169.175, "eval_steps_per_second": 21.147, "step": 10200 }, { "epoch": 0.9928021841648051, "grad_norm": 1.8434568643569946, "learning_rate": 4.986622073578595e-06, "loss": 0.5434, "step": 10300 }, { "epoch": 0.9928021841648051, "eval_loss": 0.47122901678085327, "eval_runtime": 58.9167, "eval_samples_per_second": 169.731, "eval_steps_per_second": 21.216, "step": 10300 }, { "epoch": 1.002441040321745, "grad_norm": 1.779107928276062, "learning_rate": 4.976588628762542e-06, "loss": 0.5412, "step": 10400 }, { "epoch": 1.002441040321745, "eval_loss": 0.477405846118927, "eval_runtime": 58.9086, "eval_samples_per_second": 169.754, "eval_steps_per_second": 21.219, "step": 10400 }, { "epoch": 1.0120798964786848, "grad_norm": 2.1223175525665283, "learning_rate": 4.966555183946489e-06, "loss": 0.5395, "step": 10500 }, { "epoch": 1.0120798964786848, "eval_loss": 0.48660337924957275, "eval_runtime": 58.9373, "eval_samples_per_second": 169.672, "eval_steps_per_second": 21.209, "step": 10500 }, { "epoch": 1.0217187526356248, "grad_norm": 1.8705182075500488, "learning_rate": 4.956521739130435e-06, "loss": 0.5384, "step": 10600 }, { "epoch": 1.0217187526356248, "eval_loss": 0.4735105335712433, "eval_runtime": 59.0979, "eval_samples_per_second": 169.211, "eval_steps_per_second": 21.151, "step": 10600 }, { "epoch": 1.0313576087925647, "grad_norm": 2.186157464981079, "learning_rate": 4.9464882943143815e-06, "loss": 0.5318, "step": 10700 }, { "epoch": 1.0313576087925647, "eval_loss": 0.47351425886154175, "eval_runtime": 58.9071, "eval_samples_per_second": 169.759, "eval_steps_per_second": 21.22, "step": 10700 }, { "epoch": 1.0409964649495045, "grad_norm": 2.1391384601593018, "learning_rate": 4.936454849498328e-06, "loss": 0.5304, "step": 10800 }, { "epoch": 1.0409964649495045, "eval_loss": 0.4776832163333893, "eval_runtime": 58.9018, "eval_samples_per_second": 169.774, "eval_steps_per_second": 21.222, "step": 10800 }, { "epoch": 1.0506353211064443, "grad_norm": 1.825791835784912, "learning_rate": 4.926421404682274e-06, "loss": 0.5243, "step": 10900 }, { "epoch": 1.0506353211064443, "eval_loss": 0.4646834433078766, "eval_runtime": 58.9286, "eval_samples_per_second": 169.697, "eval_steps_per_second": 21.212, "step": 10900 }, { "epoch": 1.0602741772633841, "grad_norm": 1.7971795797348022, "learning_rate": 4.916387959866221e-06, "loss": 0.5247, "step": 11000 }, { "epoch": 1.0602741772633841, "eval_loss": 0.4617900252342224, "eval_runtime": 59.0814, "eval_samples_per_second": 169.258, "eval_steps_per_second": 21.157, "step": 11000 }, { "epoch": 1.069913033420324, "grad_norm": 1.6506189107894897, "learning_rate": 4.906354515050168e-06, "loss": 0.5204, "step": 11100 }, { "epoch": 1.069913033420324, "eval_loss": 0.4610148072242737, "eval_runtime": 58.8704, "eval_samples_per_second": 169.865, "eval_steps_per_second": 21.233, "step": 11100 }, { "epoch": 1.0795518895772638, "grad_norm": 1.682634949684143, "learning_rate": 4.8963210702341136e-06, "loss": 0.5147, "step": 11200 }, { "epoch": 1.0795518895772638, "eval_loss": 0.4534702003002167, "eval_runtime": 58.872, "eval_samples_per_second": 169.86, "eval_steps_per_second": 21.232, "step": 11200 }, { "epoch": 1.0891907457342038, "grad_norm": 1.6429296731948853, "learning_rate": 4.88628762541806e-06, "loss": 0.5163, "step": 11300 }, { "epoch": 1.0891907457342038, "eval_loss": 0.45869511365890503, "eval_runtime": 59.0441, "eval_samples_per_second": 169.365, "eval_steps_per_second": 21.171, "step": 11300 }, { "epoch": 1.0988296018911436, "grad_norm": 1.7855538129806519, "learning_rate": 4.876254180602007e-06, "loss": 0.5093, "step": 11400 }, { "epoch": 1.0988296018911436, "eval_loss": 0.45184823870658875, "eval_runtime": 58.8447, "eval_samples_per_second": 169.939, "eval_steps_per_second": 21.242, "step": 11400 }, { "epoch": 1.1084684580480835, "grad_norm": 1.6376959085464478, "learning_rate": 4.866220735785953e-06, "loss": 0.5107, "step": 11500 }, { "epoch": 1.1084684580480835, "eval_loss": 0.4540416896343231, "eval_runtime": 58.9029, "eval_samples_per_second": 169.771, "eval_steps_per_second": 21.221, "step": 11500 }, { "epoch": 1.1181073142050233, "grad_norm": 1.8928024768829346, "learning_rate": 4.8561872909699e-06, "loss": 0.5067, "step": 11600 }, { "epoch": 1.1181073142050233, "eval_loss": 0.44899114966392517, "eval_runtime": 58.9326, "eval_samples_per_second": 169.685, "eval_steps_per_second": 21.211, "step": 11600 }, { "epoch": 1.1277461703619631, "grad_norm": 1.828368902206421, "learning_rate": 4.8461538461538465e-06, "loss": 0.5064, "step": 11700 }, { "epoch": 1.1277461703619631, "eval_loss": 0.4536297917366028, "eval_runtime": 59.1031, "eval_samples_per_second": 169.196, "eval_steps_per_second": 21.149, "step": 11700 }, { "epoch": 1.137385026518903, "grad_norm": 1.7877882719039917, "learning_rate": 4.8361204013377925e-06, "loss": 0.5009, "step": 11800 }, { "epoch": 1.137385026518903, "eval_loss": 0.4467906355857849, "eval_runtime": 58.8742, "eval_samples_per_second": 169.854, "eval_steps_per_second": 21.232, "step": 11800 }, { "epoch": 1.1470238826758428, "grad_norm": 1.7465142011642456, "learning_rate": 4.826086956521739e-06, "loss": 0.5003, "step": 11900 }, { "epoch": 1.1470238826758428, "eval_loss": 0.43356460332870483, "eval_runtime": 58.9139, "eval_samples_per_second": 169.739, "eval_steps_per_second": 21.217, "step": 11900 }, { "epoch": 1.1566627388327828, "grad_norm": 1.816881775856018, "learning_rate": 4.816053511705686e-06, "loss": 0.4993, "step": 12000 }, { "epoch": 1.1566627388327828, "eval_loss": 0.43435898423194885, "eval_runtime": 59.0853, "eval_samples_per_second": 169.247, "eval_steps_per_second": 21.156, "step": 12000 }, { "epoch": 1.1663015949897226, "grad_norm": 1.6352607011795044, "learning_rate": 4.806020066889633e-06, "loss": 0.4953, "step": 12100 }, { "epoch": 1.1663015949897226, "eval_loss": 0.44119584560394287, "eval_runtime": 58.8922, "eval_samples_per_second": 169.802, "eval_steps_per_second": 21.225, "step": 12100 }, { "epoch": 1.1759404511466625, "grad_norm": 1.791181206703186, "learning_rate": 4.795986622073579e-06, "loss": 0.4953, "step": 12200 }, { "epoch": 1.1759404511466625, "eval_loss": 0.43825873732566833, "eval_runtime": 59.0873, "eval_samples_per_second": 169.241, "eval_steps_per_second": 21.155, "step": 12200 }, { "epoch": 1.1855793073036023, "grad_norm": 1.5742549896240234, "learning_rate": 4.785953177257525e-06, "loss": 0.4922, "step": 12300 }, { "epoch": 1.1855793073036023, "eval_loss": 0.42847940325737, "eval_runtime": 58.906, "eval_samples_per_second": 169.762, "eval_steps_per_second": 21.22, "step": 12300 }, { "epoch": 1.195218163460542, "grad_norm": 1.666723608970642, "learning_rate": 4.775919732441472e-06, "loss": 0.4896, "step": 12400 }, { "epoch": 1.195218163460542, "eval_loss": 0.4272569417953491, "eval_runtime": 58.9642, "eval_samples_per_second": 169.594, "eval_steps_per_second": 21.199, "step": 12400 }, { "epoch": 1.204857019617482, "grad_norm": 1.5803587436676025, "learning_rate": 4.765886287625418e-06, "loss": 0.4848, "step": 12500 }, { "epoch": 1.204857019617482, "eval_loss": 0.4343504011631012, "eval_runtime": 59.102, "eval_samples_per_second": 169.199, "eval_steps_per_second": 21.15, "step": 12500 }, { "epoch": 1.2144958757744218, "grad_norm": 1.6863902807235718, "learning_rate": 4.755852842809365e-06, "loss": 0.486, "step": 12600 }, { "epoch": 1.2144958757744218, "eval_loss": 0.42895078659057617, "eval_runtime": 58.8995, "eval_samples_per_second": 169.781, "eval_steps_per_second": 21.223, "step": 12600 }, { "epoch": 1.2241347319313618, "grad_norm": 1.6389436721801758, "learning_rate": 4.745819397993312e-06, "loss": 0.4787, "step": 12700 }, { "epoch": 1.2241347319313618, "eval_loss": 0.43419140577316284, "eval_runtime": 59.0691, "eval_samples_per_second": 169.293, "eval_steps_per_second": 21.162, "step": 12700 }, { "epoch": 1.2337735880883016, "grad_norm": 1.7968857288360596, "learning_rate": 4.7357859531772575e-06, "loss": 0.4802, "step": 12800 }, { "epoch": 1.2337735880883016, "eval_loss": 0.4281242787837982, "eval_runtime": 58.9246, "eval_samples_per_second": 169.708, "eval_steps_per_second": 21.214, "step": 12800 }, { "epoch": 1.2434124442452414, "grad_norm": 1.577661156654358, "learning_rate": 4.725752508361204e-06, "loss": 0.4741, "step": 12900 }, { "epoch": 1.2434124442452414, "eval_loss": 0.42887967824935913, "eval_runtime": 58.6822, "eval_samples_per_second": 170.409, "eval_steps_per_second": 21.301, "step": 12900 }, { "epoch": 1.2530513004021813, "grad_norm": 1.6630301475524902, "learning_rate": 4.715719063545151e-06, "loss": 0.4763, "step": 13000 }, { "epoch": 1.2530513004021813, "eval_loss": 0.41134434938430786, "eval_runtime": 59.0454, "eval_samples_per_second": 169.361, "eval_steps_per_second": 21.17, "step": 13000 }, { "epoch": 1.262690156559121, "grad_norm": 1.6969964504241943, "learning_rate": 4.705685618729097e-06, "loss": 0.4695, "step": 13100 }, { "epoch": 1.262690156559121, "eval_loss": 0.4251042902469635, "eval_runtime": 58.8993, "eval_samples_per_second": 169.781, "eval_steps_per_second": 21.223, "step": 13100 }, { "epoch": 1.272329012716061, "grad_norm": 1.5483554601669312, "learning_rate": 4.695652173913044e-06, "loss": 0.4724, "step": 13200 }, { "epoch": 1.272329012716061, "eval_loss": 0.4123547375202179, "eval_runtime": 58.9538, "eval_samples_per_second": 169.624, "eval_steps_per_second": 21.203, "step": 13200 }, { "epoch": 1.281967868873001, "grad_norm": 1.6243360042572021, "learning_rate": 4.6856187290969905e-06, "loss": 0.467, "step": 13300 }, { "epoch": 1.281967868873001, "eval_loss": 0.41819024085998535, "eval_runtime": 59.0792, "eval_samples_per_second": 169.264, "eval_steps_per_second": 21.158, "step": 13300 }, { "epoch": 1.2916067250299408, "grad_norm": 1.5376485586166382, "learning_rate": 4.675585284280936e-06, "loss": 0.4649, "step": 13400 }, { "epoch": 1.2916067250299408, "eval_loss": 0.4179925322532654, "eval_runtime": 58.8694, "eval_samples_per_second": 169.868, "eval_steps_per_second": 21.233, "step": 13400 }, { "epoch": 1.3012455811868806, "grad_norm": 1.6273411512374878, "learning_rate": 4.665551839464883e-06, "loss": 0.4619, "step": 13500 }, { "epoch": 1.3012455811868806, "eval_loss": 0.40902629494667053, "eval_runtime": 59.0101, "eval_samples_per_second": 169.463, "eval_steps_per_second": 21.183, "step": 13500 }, { "epoch": 1.3108844373438204, "grad_norm": 1.4825048446655273, "learning_rate": 4.65551839464883e-06, "loss": 0.4621, "step": 13600 }, { "epoch": 1.3108844373438204, "eval_loss": 0.407672256231308, "eval_runtime": 58.9292, "eval_samples_per_second": 169.695, "eval_steps_per_second": 21.212, "step": 13600 }, { "epoch": 1.3205232935007603, "grad_norm": 1.4943957328796387, "learning_rate": 4.645484949832776e-06, "loss": 0.4627, "step": 13700 }, { "epoch": 1.3205232935007603, "eval_loss": 0.4040186405181885, "eval_runtime": 58.8579, "eval_samples_per_second": 169.901, "eval_steps_per_second": 21.238, "step": 13700 }, { "epoch": 1.3301621496577, "grad_norm": 1.5842742919921875, "learning_rate": 4.635451505016723e-06, "loss": 0.4658, "step": 13800 }, { "epoch": 1.3301621496577, "eval_loss": 0.39733609557151794, "eval_runtime": 59.0877, "eval_samples_per_second": 169.24, "eval_steps_per_second": 21.155, "step": 13800 }, { "epoch": 1.33980100581464, "grad_norm": 1.5690122842788696, "learning_rate": 4.625418060200669e-06, "loss": 0.4602, "step": 13900 }, { "epoch": 1.33980100581464, "eval_loss": 0.4122777283191681, "eval_runtime": 58.8805, "eval_samples_per_second": 169.836, "eval_steps_per_second": 21.229, "step": 13900 }, { "epoch": 1.3494398619715797, "grad_norm": 1.5999127626419067, "learning_rate": 4.615384615384616e-06, "loss": 0.4527, "step": 14000 }, { "epoch": 1.3494398619715797, "eval_loss": 0.4009742736816406, "eval_runtime": 58.908, "eval_samples_per_second": 169.756, "eval_steps_per_second": 21.22, "step": 14000 }, { "epoch": 1.3590787181285198, "grad_norm": 1.482398271560669, "learning_rate": 4.605351170568562e-06, "loss": 0.4542, "step": 14100 }, { "epoch": 1.3590787181285198, "eval_loss": 0.39838653802871704, "eval_runtime": 59.0666, "eval_samples_per_second": 169.3, "eval_steps_per_second": 21.163, "step": 14100 }, { "epoch": 1.3687175742854596, "grad_norm": 1.7599948644638062, "learning_rate": 4.595317725752509e-06, "loss": 0.4493, "step": 14200 }, { "epoch": 1.3687175742854596, "eval_loss": 0.40471258759498596, "eval_runtime": 58.9227, "eval_samples_per_second": 169.714, "eval_steps_per_second": 21.214, "step": 14200 }, { "epoch": 1.3783564304423994, "grad_norm": 1.6642801761627197, "learning_rate": 4.585284280936456e-06, "loss": 0.4469, "step": 14300 }, { "epoch": 1.3783564304423994, "eval_loss": 0.40282630920410156, "eval_runtime": 58.8961, "eval_samples_per_second": 169.79, "eval_steps_per_second": 21.224, "step": 14300 }, { "epoch": 1.3879952865993392, "grad_norm": 1.6965595483779907, "learning_rate": 4.5752508361204015e-06, "loss": 0.4468, "step": 14400 }, { "epoch": 1.3879952865993392, "eval_loss": 0.3986397087574005, "eval_runtime": 58.865, "eval_samples_per_second": 169.88, "eval_steps_per_second": 21.235, "step": 14400 }, { "epoch": 1.397634142756279, "grad_norm": 1.5238665342330933, "learning_rate": 4.565217391304348e-06, "loss": 0.4475, "step": 14500 }, { "epoch": 1.397634142756279, "eval_loss": 0.3981897532939911, "eval_runtime": 59.0627, "eval_samples_per_second": 169.312, "eval_steps_per_second": 21.164, "step": 14500 }, { "epoch": 1.4072729989132189, "grad_norm": 1.5769461393356323, "learning_rate": 4.555183946488295e-06, "loss": 0.4482, "step": 14600 }, { "epoch": 1.4072729989132189, "eval_loss": 0.39567071199417114, "eval_runtime": 58.8331, "eval_samples_per_second": 169.972, "eval_steps_per_second": 21.247, "step": 14600 }, { "epoch": 1.416911855070159, "grad_norm": 1.604698896408081, "learning_rate": 4.545150501672241e-06, "loss": 0.4462, "step": 14700 }, { "epoch": 1.416911855070159, "eval_loss": 0.3909485340118408, "eval_runtime": 58.8119, "eval_samples_per_second": 170.034, "eval_steps_per_second": 21.254, "step": 14700 }, { "epoch": 1.4265507112270988, "grad_norm": 1.5470887422561646, "learning_rate": 4.535117056856188e-06, "loss": 0.4413, "step": 14800 }, { "epoch": 1.4265507112270988, "eval_loss": 0.3952423334121704, "eval_runtime": 59.0239, "eval_samples_per_second": 169.423, "eval_steps_per_second": 21.178, "step": 14800 }, { "epoch": 1.4361895673840386, "grad_norm": 1.5020021200180054, "learning_rate": 4.5250836120401345e-06, "loss": 0.44, "step": 14900 }, { "epoch": 1.4361895673840386, "eval_loss": 0.3845099210739136, "eval_runtime": 58.9294, "eval_samples_per_second": 169.695, "eval_steps_per_second": 21.212, "step": 14900 }, { "epoch": 1.4458284235409784, "grad_norm": 1.6386624574661255, "learning_rate": 4.51505016722408e-06, "loss": 0.4397, "step": 15000 }, { "epoch": 1.4458284235409784, "eval_loss": 0.38025835156440735, "eval_runtime": 58.818, "eval_samples_per_second": 170.016, "eval_steps_per_second": 21.252, "step": 15000 }, { "epoch": 1.4554672796979182, "grad_norm": 1.4663512706756592, "learning_rate": 4.505016722408027e-06, "loss": 0.4373, "step": 15100 }, { "epoch": 1.4554672796979182, "eval_loss": 0.3857060968875885, "eval_runtime": 58.8494, "eval_samples_per_second": 169.925, "eval_steps_per_second": 21.241, "step": 15100 }, { "epoch": 1.465106135854858, "grad_norm": 1.6452239751815796, "learning_rate": 4.494983277591973e-06, "loss": 0.4334, "step": 15200 }, { "epoch": 1.465106135854858, "eval_loss": 0.3850370943546295, "eval_runtime": 58.9917, "eval_samples_per_second": 169.515, "eval_steps_per_second": 21.189, "step": 15200 }, { "epoch": 1.4747449920117979, "grad_norm": 1.6599483489990234, "learning_rate": 4.48494983277592e-06, "loss": 0.434, "step": 15300 }, { "epoch": 1.4747449920117979, "eval_loss": 0.38289549946784973, "eval_runtime": 59.0362, "eval_samples_per_second": 169.388, "eval_steps_per_second": 21.173, "step": 15300 }, { "epoch": 1.4843838481687377, "grad_norm": 1.665375828742981, "learning_rate": 4.474916387959866e-06, "loss": 0.4325, "step": 15400 }, { "epoch": 1.4843838481687377, "eval_loss": 0.38493427634239197, "eval_runtime": 58.7929, "eval_samples_per_second": 170.088, "eval_steps_per_second": 21.261, "step": 15400 }, { "epoch": 1.4940227043256777, "grad_norm": 1.5351532697677612, "learning_rate": 4.4648829431438125e-06, "loss": 0.4292, "step": 15500 }, { "epoch": 1.4940227043256777, "eval_loss": 0.3742278814315796, "eval_runtime": 58.8841, "eval_samples_per_second": 169.825, "eval_steps_per_second": 21.228, "step": 15500 }, { "epoch": 1.5036615604826176, "grad_norm": 1.4911319017410278, "learning_rate": 4.454849498327759e-06, "loss": 0.4265, "step": 15600 }, { "epoch": 1.5036615604826176, "eval_loss": 0.3842261731624603, "eval_runtime": 59.0288, "eval_samples_per_second": 169.409, "eval_steps_per_second": 21.176, "step": 15600 }, { "epoch": 1.5133004166395574, "grad_norm": 1.4675853252410889, "learning_rate": 4.444816053511705e-06, "loss": 0.4237, "step": 15700 }, { "epoch": 1.5133004166395574, "eval_loss": 0.3784289062023163, "eval_runtime": 58.8858, "eval_samples_per_second": 169.82, "eval_steps_per_second": 21.228, "step": 15700 }, { "epoch": 1.5229392727964972, "grad_norm": 1.5063199996948242, "learning_rate": 4.434782608695652e-06, "loss": 0.4251, "step": 15800 }, { "epoch": 1.5229392727964972, "eval_loss": 0.3759336471557617, "eval_runtime": 59.0897, "eval_samples_per_second": 169.234, "eval_steps_per_second": 21.154, "step": 15800 }, { "epoch": 1.5325781289534373, "grad_norm": 1.4472297430038452, "learning_rate": 4.424749163879599e-06, "loss": 0.4248, "step": 15900 }, { "epoch": 1.5325781289534373, "eval_loss": 0.37335318326950073, "eval_runtime": 58.6972, "eval_samples_per_second": 170.366, "eval_steps_per_second": 21.296, "step": 15900 }, { "epoch": 1.542216985110377, "grad_norm": 1.5391136407852173, "learning_rate": 4.414715719063545e-06, "loss": 0.4236, "step": 16000 }, { "epoch": 1.542216985110377, "eval_loss": 0.3732203543186188, "eval_runtime": 58.7578, "eval_samples_per_second": 170.19, "eval_steps_per_second": 21.274, "step": 16000 }, { "epoch": 1.551855841267317, "grad_norm": 1.4061559438705444, "learning_rate": 4.404682274247491e-06, "loss": 0.4198, "step": 16100 }, { "epoch": 1.551855841267317, "eval_loss": 0.37672555446624756, "eval_runtime": 58.9119, "eval_samples_per_second": 169.745, "eval_steps_per_second": 21.218, "step": 16100 }, { "epoch": 1.5614946974242567, "grad_norm": 1.5127891302108765, "learning_rate": 4.394648829431438e-06, "loss": 0.4201, "step": 16200 }, { "epoch": 1.5614946974242567, "eval_loss": 0.38093164563179016, "eval_runtime": 59.0714, "eval_samples_per_second": 169.287, "eval_steps_per_second": 21.161, "step": 16200 }, { "epoch": 1.5711335535811966, "grad_norm": 1.487502098083496, "learning_rate": 4.384615384615384e-06, "loss": 0.417, "step": 16300 }, { "epoch": 1.5711335535811966, "eval_loss": 0.372848242521286, "eval_runtime": 58.9058, "eval_samples_per_second": 169.763, "eval_steps_per_second": 21.22, "step": 16300 }, { "epoch": 1.5807724097381364, "grad_norm": 1.5137122869491577, "learning_rate": 4.374581939799331e-06, "loss": 0.4151, "step": 16400 }, { "epoch": 1.5807724097381364, "eval_loss": 0.37606263160705566, "eval_runtime": 58.8935, "eval_samples_per_second": 169.798, "eval_steps_per_second": 21.225, "step": 16400 }, { "epoch": 1.5904112658950762, "grad_norm": 1.7061500549316406, "learning_rate": 4.364548494983278e-06, "loss": 0.4178, "step": 16500 }, { "epoch": 1.5904112658950762, "eval_loss": 0.37127670645713806, "eval_runtime": 59.0158, "eval_samples_per_second": 169.446, "eval_steps_per_second": 21.181, "step": 16500 }, { "epoch": 1.600050122052016, "grad_norm": 1.618320345878601, "learning_rate": 4.354515050167224e-06, "loss": 0.4169, "step": 16600 }, { "epoch": 1.600050122052016, "eval_loss": 0.37215638160705566, "eval_runtime": 58.8708, "eval_samples_per_second": 169.864, "eval_steps_per_second": 21.233, "step": 16600 }, { "epoch": 1.6096889782089558, "grad_norm": 1.5095313787460327, "learning_rate": 4.34448160535117e-06, "loss": 0.4117, "step": 16700 }, { "epoch": 1.6096889782089558, "eval_loss": 0.368650883436203, "eval_runtime": 59.0494, "eval_samples_per_second": 169.35, "eval_steps_per_second": 21.169, "step": 16700 }, { "epoch": 1.6193278343658957, "grad_norm": 1.56742525100708, "learning_rate": 4.334448160535117e-06, "loss": 0.4111, "step": 16800 }, { "epoch": 1.6193278343658957, "eval_loss": 0.36101919412612915, "eval_runtime": 58.9246, "eval_samples_per_second": 169.708, "eval_steps_per_second": 21.214, "step": 16800 }, { "epoch": 1.6289666905228355, "grad_norm": 1.5075291395187378, "learning_rate": 4.324414715719064e-06, "loss": 0.4132, "step": 16900 }, { "epoch": 1.6289666905228355, "eval_loss": 0.3604821264743805, "eval_runtime": 59.0583, "eval_samples_per_second": 169.324, "eval_steps_per_second": 21.166, "step": 16900 }, { "epoch": 1.6386055466797755, "grad_norm": 1.6991218328475952, "learning_rate": 4.31438127090301e-06, "loss": 0.4103, "step": 17000 }, { "epoch": 1.6386055466797755, "eval_loss": 0.35928505659103394, "eval_runtime": 58.8981, "eval_samples_per_second": 169.785, "eval_steps_per_second": 21.223, "step": 17000 }, { "epoch": 1.6482444028367154, "grad_norm": 1.4736828804016113, "learning_rate": 4.3043478260869565e-06, "loss": 0.4079, "step": 17100 }, { "epoch": 1.6482444028367154, "eval_loss": 0.35935309529304504, "eval_runtime": 58.8933, "eval_samples_per_second": 169.799, "eval_steps_per_second": 21.225, "step": 17100 }, { "epoch": 1.6578832589936552, "grad_norm": 1.4704461097717285, "learning_rate": 4.294314381270903e-06, "loss": 0.4041, "step": 17200 }, { "epoch": 1.6578832589936552, "eval_loss": 0.35896405577659607, "eval_runtime": 59.0753, "eval_samples_per_second": 169.276, "eval_steps_per_second": 21.159, "step": 17200 }, { "epoch": 1.6675221151505952, "grad_norm": 1.5166268348693848, "learning_rate": 4.284280936454849e-06, "loss": 0.4053, "step": 17300 }, { "epoch": 1.6675221151505952, "eval_loss": 0.35407283902168274, "eval_runtime": 58.899, "eval_samples_per_second": 169.782, "eval_steps_per_second": 21.223, "step": 17300 }, { "epoch": 1.677160971307535, "grad_norm": 1.4285303354263306, "learning_rate": 4.274247491638796e-06, "loss": 0.4023, "step": 17400 }, { "epoch": 1.677160971307535, "eval_loss": 0.3566473424434662, "eval_runtime": 58.8279, "eval_samples_per_second": 169.987, "eval_steps_per_second": 21.248, "step": 17400 }, { "epoch": 1.6867998274644749, "grad_norm": 1.6289827823638916, "learning_rate": 4.264214046822743e-06, "loss": 0.4019, "step": 17500 }, { "epoch": 1.6867998274644749, "eval_loss": 0.35341745615005493, "eval_runtime": 58.9406, "eval_samples_per_second": 169.662, "eval_steps_per_second": 21.208, "step": 17500 }, { "epoch": 1.6964386836214147, "grad_norm": 1.4782038927078247, "learning_rate": 4.254180602006689e-06, "loss": 0.4021, "step": 17600 }, { "epoch": 1.6964386836214147, "eval_loss": 0.35872602462768555, "eval_runtime": 58.8807, "eval_samples_per_second": 169.835, "eval_steps_per_second": 21.229, "step": 17600 }, { "epoch": 1.7060775397783545, "grad_norm": 1.544632077217102, "learning_rate": 4.244147157190635e-06, "loss": 0.3984, "step": 17700 }, { "epoch": 1.7060775397783545, "eval_loss": 0.3579792380332947, "eval_runtime": 59.0883, "eval_samples_per_second": 169.238, "eval_steps_per_second": 21.155, "step": 17700 }, { "epoch": 1.7157163959352943, "grad_norm": 1.4467884302139282, "learning_rate": 4.234113712374582e-06, "loss": 0.3982, "step": 17800 }, { "epoch": 1.7157163959352943, "eval_loss": 0.3508288562297821, "eval_runtime": 58.9148, "eval_samples_per_second": 169.737, "eval_steps_per_second": 21.217, "step": 17800 }, { "epoch": 1.7253552520922342, "grad_norm": 1.4046787023544312, "learning_rate": 4.224080267558528e-06, "loss": 0.3967, "step": 17900 }, { "epoch": 1.7253552520922342, "eval_loss": 0.3544318974018097, "eval_runtime": 58.917, "eval_samples_per_second": 169.73, "eval_steps_per_second": 21.216, "step": 17900 }, { "epoch": 1.734994108249174, "grad_norm": 1.5111027956008911, "learning_rate": 4.214046822742475e-06, "loss": 0.3962, "step": 18000 }, { "epoch": 1.734994108249174, "eval_loss": 0.3524232804775238, "eval_runtime": 59.0409, "eval_samples_per_second": 169.374, "eval_steps_per_second": 21.172, "step": 18000 }, { "epoch": 1.7446329644061138, "grad_norm": 1.469098687171936, "learning_rate": 4.2040133779264216e-06, "loss": 0.3968, "step": 18100 }, { "epoch": 1.7446329644061138, "eval_loss": 0.352710485458374, "eval_runtime": 58.913, "eval_samples_per_second": 169.742, "eval_steps_per_second": 21.218, "step": 18100 }, { "epoch": 1.7542718205630536, "grad_norm": 1.253414511680603, "learning_rate": 4.1939799331103675e-06, "loss": 0.3952, "step": 18200 }, { "epoch": 1.7542718205630536, "eval_loss": 0.3524448871612549, "eval_runtime": 59.0814, "eval_samples_per_second": 169.258, "eval_steps_per_second": 21.157, "step": 18200 }, { "epoch": 1.7639106767199937, "grad_norm": 1.425121545791626, "learning_rate": 4.183946488294314e-06, "loss": 0.3897, "step": 18300 }, { "epoch": 1.7639106767199937, "eval_loss": 0.3566833436489105, "eval_runtime": 58.9319, "eval_samples_per_second": 169.687, "eval_steps_per_second": 21.211, "step": 18300 }, { "epoch": 1.7735495328769335, "grad_norm": 1.4836667776107788, "learning_rate": 4.173913043478261e-06, "loss": 0.391, "step": 18400 }, { "epoch": 1.7735495328769335, "eval_loss": 0.3474622666835785, "eval_runtime": 59.0355, "eval_samples_per_second": 169.389, "eval_steps_per_second": 21.174, "step": 18400 }, { "epoch": 1.7831883890338733, "grad_norm": 1.3883644342422485, "learning_rate": 4.163879598662208e-06, "loss": 0.3928, "step": 18500 }, { "epoch": 1.7831883890338733, "eval_loss": 0.35389477014541626, "eval_runtime": 58.8499, "eval_samples_per_second": 169.924, "eval_steps_per_second": 21.24, "step": 18500 }, { "epoch": 1.7928272451908132, "grad_norm": 1.4131203889846802, "learning_rate": 4.153846153846154e-06, "loss": 0.3856, "step": 18600 }, { "epoch": 1.7928272451908132, "eval_loss": 0.34588733315467834, "eval_runtime": 58.8426, "eval_samples_per_second": 169.945, "eval_steps_per_second": 21.243, "step": 18600 }, { "epoch": 1.8024661013477532, "grad_norm": 1.407915711402893, "learning_rate": 4.1438127090301005e-06, "loss": 0.386, "step": 18700 }, { "epoch": 1.8024661013477532, "eval_loss": 0.3406791687011719, "eval_runtime": 59.0271, "eval_samples_per_second": 169.414, "eval_steps_per_second": 21.177, "step": 18700 }, { "epoch": 1.812104957504693, "grad_norm": 1.5311362743377686, "learning_rate": 4.133779264214047e-06, "loss": 0.3856, "step": 18800 }, { "epoch": 1.812104957504693, "eval_loss": 0.3458515703678131, "eval_runtime": 58.7489, "eval_samples_per_second": 170.216, "eval_steps_per_second": 21.277, "step": 18800 }, { "epoch": 1.8217438136616328, "grad_norm": 1.3238445520401, "learning_rate": 4.123745819397993e-06, "loss": 0.3837, "step": 18900 }, { "epoch": 1.8217438136616328, "eval_loss": 0.3374645411968231, "eval_runtime": 58.8018, "eval_samples_per_second": 170.063, "eval_steps_per_second": 21.258, "step": 18900 }, { "epoch": 1.8313826698185727, "grad_norm": 1.363261342048645, "learning_rate": 4.11371237458194e-06, "loss": 0.3885, "step": 19000 }, { "epoch": 1.8313826698185727, "eval_loss": 0.3434898853302002, "eval_runtime": 58.8257, "eval_samples_per_second": 169.994, "eval_steps_per_second": 21.249, "step": 19000 }, { "epoch": 1.8410215259755125, "grad_norm": 1.4662117958068848, "learning_rate": 4.103678929765887e-06, "loss": 0.3844, "step": 19100 }, { "epoch": 1.8410215259755125, "eval_loss": 0.3399724066257477, "eval_runtime": 58.9861, "eval_samples_per_second": 169.532, "eval_steps_per_second": 21.191, "step": 19100 }, { "epoch": 1.8506603821324523, "grad_norm": 1.4030060768127441, "learning_rate": 4.0936454849498326e-06, "loss": 0.3842, "step": 19200 }, { "epoch": 1.8506603821324523, "eval_loss": 0.3475658595561981, "eval_runtime": 59.1051, "eval_samples_per_second": 169.19, "eval_steps_per_second": 21.149, "step": 19200 }, { "epoch": 1.8602992382893921, "grad_norm": 1.3595353364944458, "learning_rate": 4.083612040133779e-06, "loss": 0.3852, "step": 19300 }, { "epoch": 1.8602992382893921, "eval_loss": 0.3436114192008972, "eval_runtime": 58.8926, "eval_samples_per_second": 169.801, "eval_steps_per_second": 21.225, "step": 19300 }, { "epoch": 1.869938094446332, "grad_norm": 1.29786217212677, "learning_rate": 4.073578595317726e-06, "loss": 0.3773, "step": 19400 }, { "epoch": 1.869938094446332, "eval_loss": 0.3408568799495697, "eval_runtime": 58.9024, "eval_samples_per_second": 169.772, "eval_steps_per_second": 21.222, "step": 19400 }, { "epoch": 1.8795769506032718, "grad_norm": 1.3584972620010376, "learning_rate": 4.063545150501672e-06, "loss": 0.3797, "step": 19500 }, { "epoch": 1.8795769506032718, "eval_loss": 0.33426791429519653, "eval_runtime": 59.1051, "eval_samples_per_second": 169.19, "eval_steps_per_second": 21.149, "step": 19500 }, { "epoch": 1.8892158067602116, "grad_norm": 1.424649953842163, "learning_rate": 4.053511705685619e-06, "loss": 0.3786, "step": 19600 }, { "epoch": 1.8892158067602116, "eval_loss": 0.33714547753334045, "eval_runtime": 58.9501, "eval_samples_per_second": 169.635, "eval_steps_per_second": 21.204, "step": 19600 }, { "epoch": 1.8988546629171517, "grad_norm": 1.4854973554611206, "learning_rate": 4.0434782608695655e-06, "loss": 0.3795, "step": 19700 }, { "epoch": 1.8988546629171517, "eval_loss": 0.33414432406425476, "eval_runtime": 59.106, "eval_samples_per_second": 169.188, "eval_steps_per_second": 21.148, "step": 19700 }, { "epoch": 1.9084935190740915, "grad_norm": 1.3664216995239258, "learning_rate": 4.0334448160535115e-06, "loss": 0.3785, "step": 19800 }, { "epoch": 1.9084935190740915, "eval_loss": 0.344148188829422, "eval_runtime": 58.8743, "eval_samples_per_second": 169.854, "eval_steps_per_second": 21.232, "step": 19800 }, { "epoch": 1.9181323752310313, "grad_norm": 1.3980112075805664, "learning_rate": 4.023411371237458e-06, "loss": 0.3764, "step": 19900 }, { "epoch": 1.9181323752310313, "eval_loss": 0.33199581503868103, "eval_runtime": 58.8637, "eval_samples_per_second": 169.884, "eval_steps_per_second": 21.236, "step": 19900 }, { "epoch": 1.9277712313879714, "grad_norm": 1.4668302536010742, "learning_rate": 4.013377926421405e-06, "loss": 0.376, "step": 20000 }, { "epoch": 1.9277712313879714, "eval_loss": 0.3381511867046356, "eval_runtime": 58.8277, "eval_samples_per_second": 169.988, "eval_steps_per_second": 21.248, "step": 20000 }, { "epoch": 1.9374100875449112, "grad_norm": 1.46481454372406, "learning_rate": 4.003344481605351e-06, "loss": 0.3765, "step": 20100 }, { "epoch": 1.9374100875449112, "eval_loss": 0.33972933888435364, "eval_runtime": 59.083, "eval_samples_per_second": 169.253, "eval_steps_per_second": 21.157, "step": 20100 }, { "epoch": 1.947048943701851, "grad_norm": 1.3771693706512451, "learning_rate": 3.993311036789298e-06, "loss": 0.3742, "step": 20200 }, { "epoch": 1.947048943701851, "eval_loss": 0.3317793607711792, "eval_runtime": 58.8425, "eval_samples_per_second": 169.945, "eval_steps_per_second": 21.243, "step": 20200 }, { "epoch": 1.9566877998587908, "grad_norm": 1.3235597610473633, "learning_rate": 3.9832775919732444e-06, "loss": 0.3715, "step": 20300 }, { "epoch": 1.9566877998587908, "eval_loss": 0.32993626594543457, "eval_runtime": 58.6506, "eval_samples_per_second": 170.501, "eval_steps_per_second": 21.313, "step": 20300 }, { "epoch": 1.9663266560157306, "grad_norm": 1.308266043663025, "learning_rate": 3.97324414715719e-06, "loss": 0.3723, "step": 20400 }, { "epoch": 1.9663266560157306, "eval_loss": 0.3278212547302246, "eval_runtime": 59.0497, "eval_samples_per_second": 169.349, "eval_steps_per_second": 21.169, "step": 20400 }, { "epoch": 1.9759655121726705, "grad_norm": 1.32529878616333, "learning_rate": 3.963210702341137e-06, "loss": 0.3747, "step": 20500 }, { "epoch": 1.9759655121726705, "eval_loss": 0.32411912083625793, "eval_runtime": 59.0234, "eval_samples_per_second": 169.424, "eval_steps_per_second": 21.178, "step": 20500 }, { "epoch": 1.9856043683296103, "grad_norm": 1.380763292312622, "learning_rate": 3.953177257525084e-06, "loss": 0.3685, "step": 20600 }, { "epoch": 1.9856043683296103, "eval_loss": 0.33261290192604065, "eval_runtime": 58.8622, "eval_samples_per_second": 169.888, "eval_steps_per_second": 21.236, "step": 20600 }, { "epoch": 1.9952432244865501, "grad_norm": 1.405206322669983, "learning_rate": 3.943143812709031e-06, "loss": 0.371, "step": 20700 }, { "epoch": 1.9952432244865501, "eval_loss": 0.3292562961578369, "eval_runtime": 58.8554, "eval_samples_per_second": 169.908, "eval_steps_per_second": 21.239, "step": 20700 }, { "epoch": 2.00488208064349, "grad_norm": 1.4182567596435547, "learning_rate": 3.9331103678929765e-06, "loss": 0.366, "step": 20800 }, { "epoch": 2.00488208064349, "eval_loss": 0.32856816053390503, "eval_runtime": 59.0258, "eval_samples_per_second": 169.418, "eval_steps_per_second": 21.177, "step": 20800 }, { "epoch": 2.0145209368004298, "grad_norm": 1.3217737674713135, "learning_rate": 3.923076923076923e-06, "loss": 0.3662, "step": 20900 }, { "epoch": 2.0145209368004298, "eval_loss": 0.3281511664390564, "eval_runtime": 58.8512, "eval_samples_per_second": 169.92, "eval_steps_per_second": 21.24, "step": 20900 }, { "epoch": 2.0241597929573696, "grad_norm": 1.3803727626800537, "learning_rate": 3.91304347826087e-06, "loss": 0.3643, "step": 21000 }, { "epoch": 2.0241597929573696, "eval_loss": 0.31252560019493103, "eval_runtime": 58.9054, "eval_samples_per_second": 169.764, "eval_steps_per_second": 21.22, "step": 21000 }, { "epoch": 2.0337986491143094, "grad_norm": 1.3696268796920776, "learning_rate": 3.903010033444816e-06, "loss": 0.365, "step": 21100 }, { "epoch": 2.0337986491143094, "eval_loss": 0.3254716694355011, "eval_runtime": 59.0681, "eval_samples_per_second": 169.296, "eval_steps_per_second": 21.162, "step": 21100 }, { "epoch": 2.0434375052712497, "grad_norm": 1.443435788154602, "learning_rate": 3.892976588628763e-06, "loss": 0.3616, "step": 21200 }, { "epoch": 2.0434375052712497, "eval_loss": 0.3301333487033844, "eval_runtime": 58.8781, "eval_samples_per_second": 169.842, "eval_steps_per_second": 21.23, "step": 21200 }, { "epoch": 2.0530763614281895, "grad_norm": 1.2991629838943481, "learning_rate": 3.8829431438127095e-06, "loss": 0.3662, "step": 21300 }, { "epoch": 2.0530763614281895, "eval_loss": 0.3216095566749573, "eval_runtime": 58.8486, "eval_samples_per_second": 169.928, "eval_steps_per_second": 21.241, "step": 21300 }, { "epoch": 2.0627152175851293, "grad_norm": 1.3617690801620483, "learning_rate": 3.8729096989966554e-06, "loss": 0.3609, "step": 21400 }, { "epoch": 2.0627152175851293, "eval_loss": 0.33237212896347046, "eval_runtime": 59.04, "eval_samples_per_second": 169.377, "eval_steps_per_second": 21.172, "step": 21400 }, { "epoch": 2.072354073742069, "grad_norm": 1.4248161315917969, "learning_rate": 3.862876254180602e-06, "loss": 0.3627, "step": 21500 }, { "epoch": 2.072354073742069, "eval_loss": 0.31593969464302063, "eval_runtime": 58.8557, "eval_samples_per_second": 169.907, "eval_steps_per_second": 21.238, "step": 21500 }, { "epoch": 2.081992929899009, "grad_norm": 1.3086020946502686, "learning_rate": 3.852842809364549e-06, "loss": 0.3594, "step": 21600 }, { "epoch": 2.081992929899009, "eval_loss": 0.3166731595993042, "eval_runtime": 58.9107, "eval_samples_per_second": 169.749, "eval_steps_per_second": 21.219, "step": 21600 }, { "epoch": 2.091631786055949, "grad_norm": 1.3267004489898682, "learning_rate": 3.842809364548495e-06, "loss": 0.3594, "step": 21700 }, { "epoch": 2.091631786055949, "eval_loss": 0.32076331973075867, "eval_runtime": 59.1128, "eval_samples_per_second": 169.168, "eval_steps_per_second": 21.146, "step": 21700 }, { "epoch": 2.1012706422128886, "grad_norm": 1.327438473701477, "learning_rate": 3.832775919732442e-06, "loss": 0.3567, "step": 21800 }, { "epoch": 2.1012706422128886, "eval_loss": 0.3243319094181061, "eval_runtime": 58.7006, "eval_samples_per_second": 170.356, "eval_steps_per_second": 21.295, "step": 21800 }, { "epoch": 2.1109094983698284, "grad_norm": 1.3011143207550049, "learning_rate": 3.822742474916388e-06, "loss": 0.3605, "step": 21900 }, { "epoch": 2.1109094983698284, "eval_loss": 0.3196314871311188, "eval_runtime": 58.8983, "eval_samples_per_second": 169.784, "eval_steps_per_second": 21.223, "step": 21900 }, { "epoch": 2.1205483545267683, "grad_norm": 1.337679386138916, "learning_rate": 3.8127090301003347e-06, "loss": 0.3605, "step": 22000 }, { "epoch": 2.1205483545267683, "eval_loss": 0.32047805190086365, "eval_runtime": 58.9713, "eval_samples_per_second": 169.574, "eval_steps_per_second": 21.197, "step": 22000 }, { "epoch": 2.130187210683708, "grad_norm": 1.4192520380020142, "learning_rate": 3.802675585284281e-06, "loss": 0.3575, "step": 22100 }, { "epoch": 2.130187210683708, "eval_loss": 0.31928718090057373, "eval_runtime": 58.9231, "eval_samples_per_second": 169.713, "eval_steps_per_second": 21.214, "step": 22100 }, { "epoch": 2.139826066840648, "grad_norm": 1.340483546257019, "learning_rate": 3.792642140468228e-06, "loss": 0.3582, "step": 22200 }, { "epoch": 2.139826066840648, "eval_loss": 0.32130032777786255, "eval_runtime": 58.8988, "eval_samples_per_second": 169.783, "eval_steps_per_second": 21.223, "step": 22200 }, { "epoch": 2.1494649229975877, "grad_norm": 1.2616146802902222, "learning_rate": 3.782608695652174e-06, "loss": 0.3559, "step": 22300 }, { "epoch": 2.1494649229975877, "eval_loss": 0.3182724118232727, "eval_runtime": 59.0569, "eval_samples_per_second": 169.328, "eval_steps_per_second": 21.166, "step": 22300 }, { "epoch": 2.1591037791545276, "grad_norm": 1.4252580404281616, "learning_rate": 3.7725752508361205e-06, "loss": 0.3529, "step": 22400 }, { "epoch": 2.1591037791545276, "eval_loss": 0.3209792971611023, "eval_runtime": 59.0164, "eval_samples_per_second": 169.445, "eval_steps_per_second": 21.181, "step": 22400 }, { "epoch": 2.1687426353114674, "grad_norm": 1.3374176025390625, "learning_rate": 3.7625418060200673e-06, "loss": 0.3533, "step": 22500 }, { "epoch": 2.1687426353114674, "eval_loss": 0.3203897476196289, "eval_runtime": 58.852, "eval_samples_per_second": 169.918, "eval_steps_per_second": 21.24, "step": 22500 }, { "epoch": 2.1783814914684076, "grad_norm": 1.366310715675354, "learning_rate": 3.7525083612040136e-06, "loss": 0.3518, "step": 22600 }, { "epoch": 2.1783814914684076, "eval_loss": 0.31347817182540894, "eval_runtime": 58.8483, "eval_samples_per_second": 169.928, "eval_steps_per_second": 21.241, "step": 22600 }, { "epoch": 2.1880203476253475, "grad_norm": 1.4128994941711426, "learning_rate": 3.74247491638796e-06, "loss": 0.351, "step": 22700 }, { "epoch": 2.1880203476253475, "eval_loss": 0.3184278905391693, "eval_runtime": 59.0451, "eval_samples_per_second": 169.362, "eval_steps_per_second": 21.17, "step": 22700 }, { "epoch": 2.1976592037822873, "grad_norm": 1.3148337602615356, "learning_rate": 3.7324414715719067e-06, "loss": 0.3523, "step": 22800 }, { "epoch": 2.1976592037822873, "eval_loss": 0.31502121686935425, "eval_runtime": 58.8414, "eval_samples_per_second": 169.948, "eval_steps_per_second": 21.244, "step": 22800 }, { "epoch": 2.207298059939227, "grad_norm": 1.3226207494735718, "learning_rate": 3.722408026755853e-06, "loss": 0.3498, "step": 22900 }, { "epoch": 2.207298059939227, "eval_loss": 0.32347571849823, "eval_runtime": 58.8138, "eval_samples_per_second": 170.028, "eval_steps_per_second": 21.254, "step": 22900 }, { "epoch": 2.216936916096167, "grad_norm": 1.4576925039291382, "learning_rate": 3.7123745819398e-06, "loss": 0.3487, "step": 23000 }, { "epoch": 2.216936916096167, "eval_loss": 0.314643532037735, "eval_runtime": 58.9415, "eval_samples_per_second": 169.66, "eval_steps_per_second": 21.207, "step": 23000 }, { "epoch": 2.2265757722531068, "grad_norm": 1.3837294578552246, "learning_rate": 3.702341137123746e-06, "loss": 0.3513, "step": 23100 }, { "epoch": 2.2265757722531068, "eval_loss": 0.3134578764438629, "eval_runtime": 58.8134, "eval_samples_per_second": 170.029, "eval_steps_per_second": 21.254, "step": 23100 }, { "epoch": 2.2362146284100466, "grad_norm": 1.2935632467269897, "learning_rate": 3.6923076923076925e-06, "loss": 0.349, "step": 23200 }, { "epoch": 2.2362146284100466, "eval_loss": 0.31015080213546753, "eval_runtime": 58.7672, "eval_samples_per_second": 170.163, "eval_steps_per_second": 21.27, "step": 23200 }, { "epoch": 2.2458534845669864, "grad_norm": 1.2622168064117432, "learning_rate": 3.6822742474916393e-06, "loss": 0.3474, "step": 23300 }, { "epoch": 2.2458534845669864, "eval_loss": 0.31574249267578125, "eval_runtime": 58.725, "eval_samples_per_second": 170.285, "eval_steps_per_second": 21.286, "step": 23300 }, { "epoch": 2.2554923407239262, "grad_norm": 1.5001434087753296, "learning_rate": 3.6722408026755856e-06, "loss": 0.3507, "step": 23400 }, { "epoch": 2.2554923407239262, "eval_loss": 0.32215574383735657, "eval_runtime": 58.7674, "eval_samples_per_second": 170.162, "eval_steps_per_second": 21.27, "step": 23400 }, { "epoch": 2.265131196880866, "grad_norm": 1.3347852230072021, "learning_rate": 3.662207357859532e-06, "loss": 0.3453, "step": 23500 }, { "epoch": 2.265131196880866, "eval_loss": 0.30782532691955566, "eval_runtime": 58.7918, "eval_samples_per_second": 170.092, "eval_steps_per_second": 21.261, "step": 23500 }, { "epoch": 2.274770053037806, "grad_norm": 1.2803244590759277, "learning_rate": 3.6521739130434787e-06, "loss": 0.3463, "step": 23600 }, { "epoch": 2.274770053037806, "eval_loss": 0.3062191903591156, "eval_runtime": 58.9104, "eval_samples_per_second": 169.749, "eval_steps_per_second": 21.219, "step": 23600 }, { "epoch": 2.2844089091947457, "grad_norm": 1.2594212293624878, "learning_rate": 3.642140468227425e-06, "loss": 0.3449, "step": 23700 }, { "epoch": 2.2844089091947457, "eval_loss": 0.30870264768600464, "eval_runtime": 58.753, "eval_samples_per_second": 170.204, "eval_steps_per_second": 21.276, "step": 23700 }, { "epoch": 2.2940477653516855, "grad_norm": 1.3449223041534424, "learning_rate": 3.6321070234113714e-06, "loss": 0.3421, "step": 23800 }, { "epoch": 2.2940477653516855, "eval_loss": 0.30901628732681274, "eval_runtime": 58.7205, "eval_samples_per_second": 170.298, "eval_steps_per_second": 21.287, "step": 23800 }, { "epoch": 2.303686621508626, "grad_norm": 1.2479959726333618, "learning_rate": 3.622073578595318e-06, "loss": 0.343, "step": 23900 }, { "epoch": 2.303686621508626, "eval_loss": 0.30690494179725647, "eval_runtime": 58.9674, "eval_samples_per_second": 169.585, "eval_steps_per_second": 21.198, "step": 23900 }, { "epoch": 2.3133254776655656, "grad_norm": 1.3553513288497925, "learning_rate": 3.6120401337792645e-06, "loss": 0.3429, "step": 24000 }, { "epoch": 2.3133254776655656, "eval_loss": 0.3068706691265106, "eval_runtime": 58.8271, "eval_samples_per_second": 169.99, "eval_steps_per_second": 21.249, "step": 24000 }, { "epoch": 2.3229643338225054, "grad_norm": 1.3287960290908813, "learning_rate": 3.6020066889632112e-06, "loss": 0.3415, "step": 24100 }, { "epoch": 2.3229643338225054, "eval_loss": 0.3021765947341919, "eval_runtime": 58.7866, "eval_samples_per_second": 170.107, "eval_steps_per_second": 21.263, "step": 24100 }, { "epoch": 2.3326031899794453, "grad_norm": 1.3244682550430298, "learning_rate": 3.5919732441471576e-06, "loss": 0.3394, "step": 24200 }, { "epoch": 2.3326031899794453, "eval_loss": 0.30216312408447266, "eval_runtime": 58.9521, "eval_samples_per_second": 169.629, "eval_steps_per_second": 21.204, "step": 24200 }, { "epoch": 2.342242046136385, "grad_norm": 1.286342978477478, "learning_rate": 3.581939799331104e-06, "loss": 0.34, "step": 24300 }, { "epoch": 2.342242046136385, "eval_loss": 0.3076727092266083, "eval_runtime": 58.7703, "eval_samples_per_second": 170.154, "eval_steps_per_second": 21.269, "step": 24300 }, { "epoch": 2.351880902293325, "grad_norm": 1.2560862302780151, "learning_rate": 3.5719063545150507e-06, "loss": 0.3344, "step": 24400 }, { "epoch": 2.351880902293325, "eval_loss": 0.30005696415901184, "eval_runtime": 58.9499, "eval_samples_per_second": 169.635, "eval_steps_per_second": 21.204, "step": 24400 }, { "epoch": 2.3615197584502647, "grad_norm": 1.3560149669647217, "learning_rate": 3.561872909698997e-06, "loss": 0.3363, "step": 24500 }, { "epoch": 2.3615197584502647, "eval_loss": 0.3021916449069977, "eval_runtime": 58.7981, "eval_samples_per_second": 170.073, "eval_steps_per_second": 21.259, "step": 24500 }, { "epoch": 2.3711586146072046, "grad_norm": 1.1849713325500488, "learning_rate": 3.5518394648829434e-06, "loss": 0.3367, "step": 24600 }, { "epoch": 2.3711586146072046, "eval_loss": 0.3012247681617737, "eval_runtime": 58.8396, "eval_samples_per_second": 169.954, "eval_steps_per_second": 21.244, "step": 24600 }, { "epoch": 2.3807974707641444, "grad_norm": 1.2602722644805908, "learning_rate": 3.54180602006689e-06, "loss": 0.3376, "step": 24700 }, { "epoch": 2.3807974707641444, "eval_loss": 0.304877907037735, "eval_runtime": 59.0108, "eval_samples_per_second": 169.46, "eval_steps_per_second": 21.183, "step": 24700 }, { "epoch": 2.390436326921084, "grad_norm": 1.194183111190796, "learning_rate": 3.5317725752508365e-06, "loss": 0.3354, "step": 24800 }, { "epoch": 2.390436326921084, "eval_loss": 0.3006995618343353, "eval_runtime": 58.5927, "eval_samples_per_second": 170.67, "eval_steps_per_second": 21.334, "step": 24800 }, { "epoch": 2.400075183078024, "grad_norm": 1.3535934686660767, "learning_rate": 3.521739130434783e-06, "loss": 0.3381, "step": 24900 }, { "epoch": 2.400075183078024, "eval_loss": 0.2990175485610962, "eval_runtime": 58.785, "eval_samples_per_second": 170.111, "eval_steps_per_second": 21.264, "step": 24900 }, { "epoch": 2.409714039234964, "grad_norm": 1.2365829944610596, "learning_rate": 3.5117056856187296e-06, "loss": 0.3381, "step": 25000 }, { "epoch": 2.409714039234964, "eval_loss": 0.30268794298171997, "eval_runtime": 58.955, "eval_samples_per_second": 169.621, "eval_steps_per_second": 21.203, "step": 25000 }, { "epoch": 2.4193528953919037, "grad_norm": 1.3548235893249512, "learning_rate": 3.501672240802676e-06, "loss": 0.3381, "step": 25100 }, { "epoch": 2.4193528953919037, "eval_loss": 0.3028050363063812, "eval_runtime": 58.9666, "eval_samples_per_second": 169.587, "eval_steps_per_second": 21.198, "step": 25100 }, { "epoch": 2.4289917515488435, "grad_norm": 1.3373470306396484, "learning_rate": 3.491638795986622e-06, "loss": 0.3347, "step": 25200 }, { "epoch": 2.4289917515488435, "eval_loss": 0.3040294647216797, "eval_runtime": 58.786, "eval_samples_per_second": 170.109, "eval_steps_per_second": 21.264, "step": 25200 }, { "epoch": 2.4386306077057833, "grad_norm": 1.2805660963058472, "learning_rate": 3.481605351170568e-06, "loss": 0.3362, "step": 25300 }, { "epoch": 2.4386306077057833, "eval_loss": 0.294087290763855, "eval_runtime": 58.8166, "eval_samples_per_second": 170.02, "eval_steps_per_second": 21.252, "step": 25300 }, { "epoch": 2.4482694638627236, "grad_norm": 1.2862471342086792, "learning_rate": 3.471571906354515e-06, "loss": 0.332, "step": 25400 }, { "epoch": 2.4482694638627236, "eval_loss": 0.3031771183013916, "eval_runtime": 58.8813, "eval_samples_per_second": 169.833, "eval_steps_per_second": 21.229, "step": 25400 }, { "epoch": 2.4579083200196634, "grad_norm": 1.2386327981948853, "learning_rate": 3.4615384615384613e-06, "loss": 0.3361, "step": 25500 }, { "epoch": 2.4579083200196634, "eval_loss": 0.30101174116134644, "eval_runtime": 58.7288, "eval_samples_per_second": 170.274, "eval_steps_per_second": 21.284, "step": 25500 }, { "epoch": 2.4675471761766032, "grad_norm": 1.2309975624084473, "learning_rate": 3.4515050167224076e-06, "loss": 0.3324, "step": 25600 }, { "epoch": 2.4675471761766032, "eval_loss": 0.2973729968070984, "eval_runtime": 58.9177, "eval_samples_per_second": 169.728, "eval_steps_per_second": 21.216, "step": 25600 }, { "epoch": 2.477186032333543, "grad_norm": 1.2784621715545654, "learning_rate": 3.4414715719063544e-06, "loss": 0.3315, "step": 25700 }, { "epoch": 2.477186032333543, "eval_loss": 0.2908300757408142, "eval_runtime": 58.7497, "eval_samples_per_second": 170.214, "eval_steps_per_second": 21.277, "step": 25700 }, { "epoch": 2.486824888490483, "grad_norm": 1.2238744497299194, "learning_rate": 3.4314381270903007e-06, "loss": 0.3295, "step": 25800 }, { "epoch": 2.486824888490483, "eval_loss": 0.2972917854785919, "eval_runtime": 58.8169, "eval_samples_per_second": 170.019, "eval_steps_per_second": 21.252, "step": 25800 }, { "epoch": 2.4964637446474227, "grad_norm": 1.1721047163009644, "learning_rate": 3.4214046822742475e-06, "loss": 0.33, "step": 25900 }, { "epoch": 2.4964637446474227, "eval_loss": 0.2929130494594574, "eval_runtime": 58.9116, "eval_samples_per_second": 169.746, "eval_steps_per_second": 21.218, "step": 25900 }, { "epoch": 2.5061026008043625, "grad_norm": 1.2451221942901611, "learning_rate": 3.411371237458194e-06, "loss": 0.3316, "step": 26000 }, { "epoch": 2.5061026008043625, "eval_loss": 0.2947503328323364, "eval_runtime": 58.7698, "eval_samples_per_second": 170.155, "eval_steps_per_second": 21.269, "step": 26000 }, { "epoch": 2.5157414569613024, "grad_norm": 1.345779299736023, "learning_rate": 3.40133779264214e-06, "loss": 0.3313, "step": 26100 }, { "epoch": 2.5157414569613024, "eval_loss": 0.2994622588157654, "eval_runtime": 58.7514, "eval_samples_per_second": 170.209, "eval_steps_per_second": 21.276, "step": 26100 }, { "epoch": 2.525380313118242, "grad_norm": 1.25875985622406, "learning_rate": 3.391304347826087e-06, "loss": 0.3308, "step": 26200 }, { "epoch": 2.525380313118242, "eval_loss": 0.2986910343170166, "eval_runtime": 58.7651, "eval_samples_per_second": 170.169, "eval_steps_per_second": 21.271, "step": 26200 }, { "epoch": 2.535019169275182, "grad_norm": 1.1642955541610718, "learning_rate": 3.3812709030100333e-06, "loss": 0.3297, "step": 26300 }, { "epoch": 2.535019169275182, "eval_loss": 0.29979461431503296, "eval_runtime": 58.8621, "eval_samples_per_second": 169.889, "eval_steps_per_second": 21.236, "step": 26300 }, { "epoch": 2.544658025432122, "grad_norm": 1.3095877170562744, "learning_rate": 3.3712374581939796e-06, "loss": 0.3259, "step": 26400 }, { "epoch": 2.544658025432122, "eval_loss": 0.28874439001083374, "eval_runtime": 58.8355, "eval_samples_per_second": 169.965, "eval_steps_per_second": 21.246, "step": 26400 }, { "epoch": 2.554296881589062, "grad_norm": 1.3071867227554321, "learning_rate": 3.3612040133779264e-06, "loss": 0.3266, "step": 26500 }, { "epoch": 2.554296881589062, "eval_loss": 0.28603237867355347, "eval_runtime": 59.0324, "eval_samples_per_second": 169.398, "eval_steps_per_second": 21.175, "step": 26500 }, { "epoch": 2.563935737746002, "grad_norm": 1.341209888458252, "learning_rate": 3.3511705685618727e-06, "loss": 0.3223, "step": 26600 }, { "epoch": 2.563935737746002, "eval_loss": 0.29270127415657043, "eval_runtime": 58.9317, "eval_samples_per_second": 169.688, "eval_steps_per_second": 21.211, "step": 26600 }, { "epoch": 2.5735745939029417, "grad_norm": 1.2843323945999146, "learning_rate": 3.3411371237458195e-06, "loss": 0.3249, "step": 26700 }, { "epoch": 2.5735745939029417, "eval_loss": 0.28523457050323486, "eval_runtime": 58.7521, "eval_samples_per_second": 170.207, "eval_steps_per_second": 21.276, "step": 26700 }, { "epoch": 2.5832134500598816, "grad_norm": 1.2155966758728027, "learning_rate": 3.331103678929766e-06, "loss": 0.3251, "step": 26800 }, { "epoch": 2.5832134500598816, "eval_loss": 0.28888818621635437, "eval_runtime": 58.7677, "eval_samples_per_second": 170.161, "eval_steps_per_second": 21.27, "step": 26800 }, { "epoch": 2.5928523062168214, "grad_norm": 1.2257170677185059, "learning_rate": 3.321070234113712e-06, "loss": 0.3238, "step": 26900 }, { "epoch": 2.5928523062168214, "eval_loss": 0.29029136896133423, "eval_runtime": 58.9332, "eval_samples_per_second": 169.684, "eval_steps_per_second": 21.21, "step": 26900 }, { "epoch": 2.602491162373761, "grad_norm": 1.2092607021331787, "learning_rate": 3.311036789297659e-06, "loss": 0.3229, "step": 27000 }, { "epoch": 2.602491162373761, "eval_loss": 0.2846786379814148, "eval_runtime": 58.8241, "eval_samples_per_second": 169.998, "eval_steps_per_second": 21.25, "step": 27000 }, { "epoch": 2.612130018530701, "grad_norm": 1.3890790939331055, "learning_rate": 3.3010033444816052e-06, "loss": 0.3252, "step": 27100 }, { "epoch": 2.612130018530701, "eval_loss": 0.28705713152885437, "eval_runtime": 58.8105, "eval_samples_per_second": 170.038, "eval_steps_per_second": 21.255, "step": 27100 }, { "epoch": 2.621768874687641, "grad_norm": 1.2755820751190186, "learning_rate": 3.2909698996655516e-06, "loss": 0.3222, "step": 27200 }, { "epoch": 2.621768874687641, "eval_loss": 0.28242212533950806, "eval_runtime": 58.8787, "eval_samples_per_second": 169.841, "eval_steps_per_second": 21.23, "step": 27200 }, { "epoch": 2.6314077308445807, "grad_norm": 1.2242155075073242, "learning_rate": 3.2809364548494983e-06, "loss": 0.3214, "step": 27300 }, { "epoch": 2.6314077308445807, "eval_loss": 0.2825804054737091, "eval_runtime": 58.9281, "eval_samples_per_second": 169.698, "eval_steps_per_second": 21.212, "step": 27300 }, { "epoch": 2.6410465870015205, "grad_norm": 1.198223352432251, "learning_rate": 3.2709030100334447e-06, "loss": 0.321, "step": 27400 }, { "epoch": 2.6410465870015205, "eval_loss": 0.28742432594299316, "eval_runtime": 58.8235, "eval_samples_per_second": 170.0, "eval_steps_per_second": 21.25, "step": 27400 }, { "epoch": 2.6506854431584603, "grad_norm": 1.2096476554870605, "learning_rate": 3.260869565217391e-06, "loss": 0.3199, "step": 27500 }, { "epoch": 2.6506854431584603, "eval_loss": 0.283199667930603, "eval_runtime": 58.804, "eval_samples_per_second": 170.057, "eval_steps_per_second": 21.257, "step": 27500 }, { "epoch": 2.6603242993154, "grad_norm": 1.3325072526931763, "learning_rate": 3.2508361204013378e-06, "loss": 0.3213, "step": 27600 }, { "epoch": 2.6603242993154, "eval_loss": 0.287946879863739, "eval_runtime": 58.9847, "eval_samples_per_second": 169.535, "eval_steps_per_second": 21.192, "step": 27600 }, { "epoch": 2.66996315547234, "grad_norm": 1.2635695934295654, "learning_rate": 3.240802675585284e-06, "loss": 0.318, "step": 27700 }, { "epoch": 2.66996315547234, "eval_loss": 0.2917640209197998, "eval_runtime": 58.5959, "eval_samples_per_second": 170.661, "eval_steps_per_second": 21.333, "step": 27700 }, { "epoch": 2.67960201162928, "grad_norm": 1.0959738492965698, "learning_rate": 3.230769230769231e-06, "loss": 0.3196, "step": 27800 }, { "epoch": 2.67960201162928, "eval_loss": 0.28916123509407043, "eval_runtime": 59.0359, "eval_samples_per_second": 169.388, "eval_steps_per_second": 21.174, "step": 27800 }, { "epoch": 2.6892408677862196, "grad_norm": 1.2401530742645264, "learning_rate": 3.2207357859531772e-06, "loss": 0.3208, "step": 27900 }, { "epoch": 2.6892408677862196, "eval_loss": 0.2888045012950897, "eval_runtime": 58.8348, "eval_samples_per_second": 169.967, "eval_steps_per_second": 21.246, "step": 27900 }, { "epoch": 2.6988797239431594, "grad_norm": 1.3379608392715454, "learning_rate": 3.2107023411371236e-06, "loss": 0.3184, "step": 28000 }, { "epoch": 2.6988797239431594, "eval_loss": 0.28211450576782227, "eval_runtime": 58.8639, "eval_samples_per_second": 169.884, "eval_steps_per_second": 21.235, "step": 28000 }, { "epoch": 2.7085185801000993, "grad_norm": 1.3035780191421509, "learning_rate": 3.2006688963210703e-06, "loss": 0.3158, "step": 28100 }, { "epoch": 2.7085185801000993, "eval_loss": 0.28730255365371704, "eval_runtime": 59.0338, "eval_samples_per_second": 169.395, "eval_steps_per_second": 21.174, "step": 28100 }, { "epoch": 2.7181574362570395, "grad_norm": 1.1148771047592163, "learning_rate": 3.1906354515050167e-06, "loss": 0.3179, "step": 28200 }, { "epoch": 2.7181574362570395, "eval_loss": 0.2847444415092468, "eval_runtime": 58.8319, "eval_samples_per_second": 169.976, "eval_steps_per_second": 21.247, "step": 28200 }, { "epoch": 2.7277962924139794, "grad_norm": 1.283491611480713, "learning_rate": 3.180602006688963e-06, "loss": 0.3182, "step": 28300 }, { "epoch": 2.7277962924139794, "eval_loss": 0.27807116508483887, "eval_runtime": 58.7961, "eval_samples_per_second": 170.079, "eval_steps_per_second": 21.26, "step": 28300 }, { "epoch": 2.737435148570919, "grad_norm": 1.1780531406402588, "learning_rate": 3.1705685618729098e-06, "loss": 0.3156, "step": 28400 }, { "epoch": 2.737435148570919, "eval_loss": 0.28270605206489563, "eval_runtime": 58.9988, "eval_samples_per_second": 169.495, "eval_steps_per_second": 21.187, "step": 28400 }, { "epoch": 2.747074004727859, "grad_norm": 1.3073583841323853, "learning_rate": 3.160535117056856e-06, "loss": 0.3157, "step": 28500 }, { "epoch": 2.747074004727859, "eval_loss": 0.2826150059700012, "eval_runtime": 58.8208, "eval_samples_per_second": 170.008, "eval_steps_per_second": 21.251, "step": 28500 }, { "epoch": 2.756712860884799, "grad_norm": 1.3253241777420044, "learning_rate": 3.1505016722408024e-06, "loss": 0.3143, "step": 28600 }, { "epoch": 2.756712860884799, "eval_loss": 0.27773648500442505, "eval_runtime": 58.8497, "eval_samples_per_second": 169.924, "eval_steps_per_second": 21.241, "step": 28600 }, { "epoch": 2.7663517170417387, "grad_norm": 1.3754655122756958, "learning_rate": 3.140468227424749e-06, "loss": 0.314, "step": 28700 }, { "epoch": 2.7663517170417387, "eval_loss": 0.2878931760787964, "eval_runtime": 58.9637, "eval_samples_per_second": 169.596, "eval_steps_per_second": 21.199, "step": 28700 }, { "epoch": 2.7759905731986785, "grad_norm": 1.230859398841858, "learning_rate": 3.1304347826086955e-06, "loss": 0.3132, "step": 28800 }, { "epoch": 2.7759905731986785, "eval_loss": 0.28596362471580505, "eval_runtime": 58.8221, "eval_samples_per_second": 170.004, "eval_steps_per_second": 21.251, "step": 28800 }, { "epoch": 2.7856294293556183, "grad_norm": 1.186421513557434, "learning_rate": 3.1204013377926423e-06, "loss": 0.3108, "step": 28900 }, { "epoch": 2.7856294293556183, "eval_loss": 0.28029701113700867, "eval_runtime": 58.7721, "eval_samples_per_second": 170.149, "eval_steps_per_second": 21.269, "step": 28900 }, { "epoch": 2.795268285512558, "grad_norm": 1.1692681312561035, "learning_rate": 3.1103678929765886e-06, "loss": 0.3145, "step": 29000 }, { "epoch": 2.795268285512558, "eval_loss": 0.2787197232246399, "eval_runtime": 58.9813, "eval_samples_per_second": 169.545, "eval_steps_per_second": 21.193, "step": 29000 }, { "epoch": 2.804907141669498, "grad_norm": 1.4014099836349487, "learning_rate": 3.100334448160535e-06, "loss": 0.3149, "step": 29100 }, { "epoch": 2.804907141669498, "eval_loss": 0.2786557972431183, "eval_runtime": 58.706, "eval_samples_per_second": 170.34, "eval_steps_per_second": 21.293, "step": 29100 }, { "epoch": 2.8145459978264378, "grad_norm": 1.2638059854507446, "learning_rate": 3.0903010033444818e-06, "loss": 0.3122, "step": 29200 }, { "epoch": 2.8145459978264378, "eval_loss": 0.2831690013408661, "eval_runtime": 58.5787, "eval_samples_per_second": 170.71, "eval_steps_per_second": 21.339, "step": 29200 }, { "epoch": 2.824184853983378, "grad_norm": 1.3350086212158203, "learning_rate": 3.080267558528428e-06, "loss": 0.3094, "step": 29300 }, { "epoch": 2.824184853983378, "eval_loss": 0.2827381491661072, "eval_runtime": 58.804, "eval_samples_per_second": 170.057, "eval_steps_per_second": 21.257, "step": 29300 }, { "epoch": 2.833823710140318, "grad_norm": 1.162023663520813, "learning_rate": 3.0702341137123744e-06, "loss": 0.3128, "step": 29400 }, { "epoch": 2.833823710140318, "eval_loss": 0.28242263197898865, "eval_runtime": 58.6385, "eval_samples_per_second": 170.537, "eval_steps_per_second": 21.317, "step": 29400 }, { "epoch": 2.8434625662972577, "grad_norm": 1.1858537197113037, "learning_rate": 3.060200668896321e-06, "loss": 0.3125, "step": 29500 }, { "epoch": 2.8434625662972577, "eval_loss": 0.27721619606018066, "eval_runtime": 58.5687, "eval_samples_per_second": 170.74, "eval_steps_per_second": 21.342, "step": 29500 }, { "epoch": 2.8531014224541975, "grad_norm": 1.2944772243499756, "learning_rate": 3.0501672240802675e-06, "loss": 0.3107, "step": 29600 }, { "epoch": 2.8531014224541975, "eval_loss": 0.2762606739997864, "eval_runtime": 58.766, "eval_samples_per_second": 170.166, "eval_steps_per_second": 21.271, "step": 29600 }, { "epoch": 2.8627402786111373, "grad_norm": 1.1597511768341064, "learning_rate": 3.0401337792642143e-06, "loss": 0.3087, "step": 29700 }, { "epoch": 2.8627402786111373, "eval_loss": 0.2779894173145294, "eval_runtime": 58.6408, "eval_samples_per_second": 170.53, "eval_steps_per_second": 21.316, "step": 29700 }, { "epoch": 2.872379134768077, "grad_norm": 1.2793524265289307, "learning_rate": 3.0301003344481606e-06, "loss": 0.3103, "step": 29800 }, { "epoch": 2.872379134768077, "eval_loss": 0.2801125645637512, "eval_runtime": 58.7484, "eval_samples_per_second": 170.217, "eval_steps_per_second": 21.277, "step": 29800 }, { "epoch": 2.882017990925017, "grad_norm": 1.2233638763427734, "learning_rate": 3.020066889632107e-06, "loss": 0.3097, "step": 29900 }, { "epoch": 2.882017990925017, "eval_loss": 0.27292197942733765, "eval_runtime": 58.9228, "eval_samples_per_second": 169.714, "eval_steps_per_second": 21.214, "step": 29900 }, { "epoch": 2.891656847081957, "grad_norm": 1.2502061128616333, "learning_rate": 3.0100334448160537e-06, "loss": 0.3087, "step": 30000 }, { "epoch": 2.891656847081957, "eval_loss": 0.27464571595191956, "eval_runtime": 58.7608, "eval_samples_per_second": 170.182, "eval_steps_per_second": 21.273, "step": 30000 }, { "epoch": 2.9012957032388966, "grad_norm": 1.1711479425430298, "learning_rate": 3e-06, "loss": 0.3079, "step": 30100 }, { "epoch": 2.9012957032388966, "eval_loss": 0.2868233621120453, "eval_runtime": 58.7507, "eval_samples_per_second": 170.211, "eval_steps_per_second": 21.276, "step": 30100 }, { "epoch": 2.9109345593958365, "grad_norm": 1.2344684600830078, "learning_rate": 2.9899665551839464e-06, "loss": 0.305, "step": 30200 }, { "epoch": 2.9109345593958365, "eval_loss": 0.27741649746894836, "eval_runtime": 58.9988, "eval_samples_per_second": 169.495, "eval_steps_per_second": 21.187, "step": 30200 }, { "epoch": 2.9205734155527763, "grad_norm": 1.1582242250442505, "learning_rate": 2.979933110367893e-06, "loss": 0.3087, "step": 30300 }, { "epoch": 2.9205734155527763, "eval_loss": 0.2797699272632599, "eval_runtime": 58.7984, "eval_samples_per_second": 170.073, "eval_steps_per_second": 21.259, "step": 30300 }, { "epoch": 2.930212271709716, "grad_norm": 1.2283897399902344, "learning_rate": 2.9698996655518395e-06, "loss": 0.3049, "step": 30400 }, { "epoch": 2.930212271709716, "eval_loss": 0.27813324332237244, "eval_runtime": 58.7956, "eval_samples_per_second": 170.081, "eval_steps_per_second": 21.26, "step": 30400 }, { "epoch": 2.939851127866656, "grad_norm": 1.3279707431793213, "learning_rate": 2.959866220735786e-06, "loss": 0.3063, "step": 30500 }, { "epoch": 2.939851127866656, "eval_loss": 0.27100399136543274, "eval_runtime": 58.9453, "eval_samples_per_second": 169.649, "eval_steps_per_second": 21.206, "step": 30500 }, { "epoch": 2.9494899840235957, "grad_norm": 1.180038332939148, "learning_rate": 2.9498327759197326e-06, "loss": 0.3099, "step": 30600 }, { "epoch": 2.9494899840235957, "eval_loss": 0.27709901332855225, "eval_runtime": 58.5784, "eval_samples_per_second": 170.711, "eval_steps_per_second": 21.339, "step": 30600 }, { "epoch": 2.9591288401805356, "grad_norm": 1.1865822076797485, "learning_rate": 2.939799331103679e-06, "loss": 0.3052, "step": 30700 }, { "epoch": 2.9591288401805356, "eval_loss": 0.2758926749229431, "eval_runtime": 58.9107, "eval_samples_per_second": 169.749, "eval_steps_per_second": 21.219, "step": 30700 }, { "epoch": 2.9687676963374754, "grad_norm": 1.285046935081482, "learning_rate": 2.9297658862876257e-06, "loss": 0.3056, "step": 30800 }, { "epoch": 2.9687676963374754, "eval_loss": 0.2717057168483734, "eval_runtime": 58.973, "eval_samples_per_second": 169.569, "eval_steps_per_second": 21.196, "step": 30800 }, { "epoch": 2.978406552494415, "grad_norm": 1.138985514640808, "learning_rate": 2.919732441471572e-06, "loss": 0.3026, "step": 30900 }, { "epoch": 2.978406552494415, "eval_loss": 0.2726858854293823, "eval_runtime": 58.8341, "eval_samples_per_second": 169.969, "eval_steps_per_second": 21.246, "step": 30900 }, { "epoch": 2.9880454086513555, "grad_norm": 1.2184150218963623, "learning_rate": 2.9096989966555184e-06, "loss": 0.305, "step": 31000 }, { "epoch": 2.9880454086513555, "eval_loss": 0.2747511863708496, "eval_runtime": 58.8569, "eval_samples_per_second": 169.904, "eval_steps_per_second": 21.238, "step": 31000 }, { "epoch": 2.9976842648082953, "grad_norm": 1.1389625072479248, "learning_rate": 2.899665551839465e-06, "loss": 0.3022, "step": 31100 }, { "epoch": 2.9976842648082953, "eval_loss": 0.27139467000961304, "eval_runtime": 59.0245, "eval_samples_per_second": 169.421, "eval_steps_per_second": 21.178, "step": 31100 }, { "epoch": 3.007323120965235, "grad_norm": 1.2831661701202393, "learning_rate": 2.8896321070234115e-06, "loss": 0.3015, "step": 31200 }, { "epoch": 3.007323120965235, "eval_loss": 0.268490195274353, "eval_runtime": 58.8203, "eval_samples_per_second": 170.009, "eval_steps_per_second": 21.251, "step": 31200 }, { "epoch": 3.016961977122175, "grad_norm": 1.3156960010528564, "learning_rate": 2.879598662207358e-06, "loss": 0.3034, "step": 31300 }, { "epoch": 3.016961977122175, "eval_loss": 0.27620190382003784, "eval_runtime": 58.8431, "eval_samples_per_second": 169.944, "eval_steps_per_second": 21.243, "step": 31300 }, { "epoch": 3.0266008332791148, "grad_norm": 1.1496986150741577, "learning_rate": 2.8695652173913046e-06, "loss": 0.3014, "step": 31400 }, { "epoch": 3.0266008332791148, "eval_loss": 0.26573923230171204, "eval_runtime": 58.9994, "eval_samples_per_second": 169.493, "eval_steps_per_second": 21.187, "step": 31400 }, { "epoch": 3.0362396894360546, "grad_norm": 1.2416408061981201, "learning_rate": 2.859531772575251e-06, "loss": 0.2996, "step": 31500 }, { "epoch": 3.0362396894360546, "eval_loss": 0.26978152990341187, "eval_runtime": 58.8544, "eval_samples_per_second": 169.911, "eval_steps_per_second": 21.239, "step": 31500 }, { "epoch": 3.0458785455929944, "grad_norm": 1.2750462293624878, "learning_rate": 2.8494983277591977e-06, "loss": 0.3049, "step": 31600 }, { "epoch": 3.0458785455929944, "eval_loss": 0.26833203434944153, "eval_runtime": 58.8337, "eval_samples_per_second": 169.971, "eval_steps_per_second": 21.246, "step": 31600 }, { "epoch": 3.0555174017499342, "grad_norm": 1.2201920747756958, "learning_rate": 2.839464882943144e-06, "loss": 0.3029, "step": 31700 }, { "epoch": 3.0555174017499342, "eval_loss": 0.2752624452114105, "eval_runtime": 59.0229, "eval_samples_per_second": 169.426, "eval_steps_per_second": 21.178, "step": 31700 }, { "epoch": 3.065156257906874, "grad_norm": 1.3049486875534058, "learning_rate": 2.8294314381270904e-06, "loss": 0.299, "step": 31800 }, { "epoch": 3.065156257906874, "eval_loss": 0.26851963996887207, "eval_runtime": 58.8811, "eval_samples_per_second": 169.834, "eval_steps_per_second": 21.229, "step": 31800 }, { "epoch": 3.074795114063814, "grad_norm": 1.2307597398757935, "learning_rate": 2.819397993311037e-06, "loss": 0.2998, "step": 31900 }, { "epoch": 3.074795114063814, "eval_loss": 0.27132850885391235, "eval_runtime": 58.8127, "eval_samples_per_second": 170.031, "eval_steps_per_second": 21.254, "step": 31900 }, { "epoch": 3.0844339702207537, "grad_norm": 1.2379592657089233, "learning_rate": 2.8093645484949835e-06, "loss": 0.3009, "step": 32000 }, { "epoch": 3.0844339702207537, "eval_loss": 0.2727653682231903, "eval_runtime": 59.0543, "eval_samples_per_second": 169.336, "eval_steps_per_second": 21.167, "step": 32000 }, { "epoch": 3.094072826377694, "grad_norm": 1.3900628089904785, "learning_rate": 2.79933110367893e-06, "loss": 0.2968, "step": 32100 }, { "epoch": 3.094072826377694, "eval_loss": 0.26818564534187317, "eval_runtime": 58.6043, "eval_samples_per_second": 170.636, "eval_steps_per_second": 21.329, "step": 32100 }, { "epoch": 3.103711682534634, "grad_norm": 1.2621980905532837, "learning_rate": 2.7892976588628766e-06, "loss": 0.2987, "step": 32200 }, { "epoch": 3.103711682534634, "eval_loss": 0.2698274552822113, "eval_runtime": 59.0036, "eval_samples_per_second": 169.481, "eval_steps_per_second": 21.185, "step": 32200 }, { "epoch": 3.1133505386915736, "grad_norm": 1.2347277402877808, "learning_rate": 2.779264214046823e-06, "loss": 0.2987, "step": 32300 }, { "epoch": 3.1133505386915736, "eval_loss": 0.2685893177986145, "eval_runtime": 58.8188, "eval_samples_per_second": 170.014, "eval_steps_per_second": 21.252, "step": 32300 }, { "epoch": 3.1229893948485135, "grad_norm": 1.2878509759902954, "learning_rate": 2.7692307692307693e-06, "loss": 0.2993, "step": 32400 }, { "epoch": 3.1229893948485135, "eval_loss": 0.2640542984008789, "eval_runtime": 59.467, "eval_samples_per_second": 168.161, "eval_steps_per_second": 21.02, "step": 32400 }, { "epoch": 3.1326282510054533, "grad_norm": 1.1735783815383911, "learning_rate": 2.759197324414716e-06, "loss": 0.2978, "step": 32500 }, { "epoch": 3.1326282510054533, "eval_loss": 0.2660863697528839, "eval_runtime": 59.3085, "eval_samples_per_second": 168.61, "eval_steps_per_second": 21.076, "step": 32500 }, { "epoch": 3.142267107162393, "grad_norm": 1.1393107175827026, "learning_rate": 2.749163879598662e-06, "loss": 0.3021, "step": 32600 }, { "epoch": 3.142267107162393, "eval_loss": 0.2683536410331726, "eval_runtime": 59.2664, "eval_samples_per_second": 168.73, "eval_steps_per_second": 21.091, "step": 32600 }, { "epoch": 3.151905963319333, "grad_norm": 1.1794265508651733, "learning_rate": 2.7391304347826087e-06, "loss": 0.3005, "step": 32700 }, { "epoch": 3.151905963319333, "eval_loss": 0.26745280623435974, "eval_runtime": 59.4235, "eval_samples_per_second": 168.284, "eval_steps_per_second": 21.035, "step": 32700 }, { "epoch": 3.1615448194762727, "grad_norm": 1.2067482471466064, "learning_rate": 2.729096989966555e-06, "loss": 0.2968, "step": 32800 }, { "epoch": 3.1615448194762727, "eval_loss": 0.2683298885822296, "eval_runtime": 59.2787, "eval_samples_per_second": 168.695, "eval_steps_per_second": 21.087, "step": 32800 }, { "epoch": 3.1711836756332126, "grad_norm": 1.2533990144729614, "learning_rate": 2.7190635451505014e-06, "loss": 0.2971, "step": 32900 }, { "epoch": 3.1711836756332126, "eval_loss": 0.26645201444625854, "eval_runtime": 59.2542, "eval_samples_per_second": 168.764, "eval_steps_per_second": 21.096, "step": 32900 }, { "epoch": 3.1808225317901524, "grad_norm": 1.3016612529754639, "learning_rate": 2.709030100334448e-06, "loss": 0.2956, "step": 33000 }, { "epoch": 3.1808225317901524, "eval_loss": 0.2659901976585388, "eval_runtime": 59.4032, "eval_samples_per_second": 168.341, "eval_steps_per_second": 21.043, "step": 33000 }, { "epoch": 3.190461387947092, "grad_norm": 1.1479302644729614, "learning_rate": 2.6989966555183945e-06, "loss": 0.294, "step": 33100 }, { "epoch": 3.190461387947092, "eval_loss": 0.26020553708076477, "eval_runtime": 59.4399, "eval_samples_per_second": 168.237, "eval_steps_per_second": 21.03, "step": 33100 }, { "epoch": 3.200100244104032, "grad_norm": 1.220609426498413, "learning_rate": 2.6889632107023413e-06, "loss": 0.2936, "step": 33200 }, { "epoch": 3.200100244104032, "eval_loss": 0.2678441107273102, "eval_runtime": 59.2829, "eval_samples_per_second": 168.683, "eval_steps_per_second": 21.085, "step": 33200 }, { "epoch": 3.209739100260972, "grad_norm": 1.3181507587432861, "learning_rate": 2.6789297658862876e-06, "loss": 0.2952, "step": 33300 }, { "epoch": 3.209739100260972, "eval_loss": 0.26516881585121155, "eval_runtime": 59.4504, "eval_samples_per_second": 168.207, "eval_steps_per_second": 21.026, "step": 33300 }, { "epoch": 3.2193779564179117, "grad_norm": 1.234379529953003, "learning_rate": 2.668896321070234e-06, "loss": 0.297, "step": 33400 }, { "epoch": 3.2193779564179117, "eval_loss": 0.265980064868927, "eval_runtime": 59.3045, "eval_samples_per_second": 168.621, "eval_steps_per_second": 21.078, "step": 33400 }, { "epoch": 3.2290168125748515, "grad_norm": 1.227977991104126, "learning_rate": 2.6588628762541807e-06, "loss": 0.2936, "step": 33500 }, { "epoch": 3.2290168125748515, "eval_loss": 0.2639939785003662, "eval_runtime": 59.2768, "eval_samples_per_second": 168.7, "eval_steps_per_second": 21.087, "step": 33500 }, { "epoch": 3.238655668731792, "grad_norm": 1.1632543802261353, "learning_rate": 2.648829431438127e-06, "loss": 0.2922, "step": 33600 }, { "epoch": 3.238655668731792, "eval_loss": 0.2577446401119232, "eval_runtime": 59.456, "eval_samples_per_second": 168.192, "eval_steps_per_second": 21.024, "step": 33600 }, { "epoch": 3.2482945248887316, "grad_norm": 1.23133385181427, "learning_rate": 2.6387959866220734e-06, "loss": 0.2944, "step": 33700 }, { "epoch": 3.2482945248887316, "eval_loss": 0.2566361725330353, "eval_runtime": 59.1356, "eval_samples_per_second": 169.103, "eval_steps_per_second": 21.138, "step": 33700 }, { "epoch": 3.2579333810456714, "grad_norm": 1.3113288879394531, "learning_rate": 2.62876254180602e-06, "loss": 0.2922, "step": 33800 }, { "epoch": 3.2579333810456714, "eval_loss": 0.2656908631324768, "eval_runtime": 58.9864, "eval_samples_per_second": 169.531, "eval_steps_per_second": 21.191, "step": 33800 }, { "epoch": 3.2675722372026113, "grad_norm": 1.1765536069869995, "learning_rate": 2.6187290969899665e-06, "loss": 0.2949, "step": 33900 }, { "epoch": 3.2675722372026113, "eval_loss": 0.2684057056903839, "eval_runtime": 58.9524, "eval_samples_per_second": 169.628, "eval_steps_per_second": 21.204, "step": 33900 }, { "epoch": 3.277211093359551, "grad_norm": 1.336182951927185, "learning_rate": 2.6086956521739132e-06, "loss": 0.2919, "step": 34000 }, { "epoch": 3.277211093359551, "eval_loss": 0.2648228406906128, "eval_runtime": 59.1485, "eval_samples_per_second": 169.066, "eval_steps_per_second": 21.133, "step": 34000 }, { "epoch": 3.286849949516491, "grad_norm": 1.1484243869781494, "learning_rate": 2.5986622073578596e-06, "loss": 0.2923, "step": 34100 }, { "epoch": 3.286849949516491, "eval_loss": 0.268263041973114, "eval_runtime": 58.9366, "eval_samples_per_second": 169.674, "eval_steps_per_second": 21.209, "step": 34100 }, { "epoch": 3.2964888056734307, "grad_norm": 1.2312963008880615, "learning_rate": 2.588628762541806e-06, "loss": 0.2934, "step": 34200 }, { "epoch": 3.2964888056734307, "eval_loss": 0.26405033469200134, "eval_runtime": 58.9279, "eval_samples_per_second": 169.699, "eval_steps_per_second": 21.212, "step": 34200 }, { "epoch": 3.3061276618303705, "grad_norm": 1.2409456968307495, "learning_rate": 2.5785953177257527e-06, "loss": 0.2917, "step": 34300 }, { "epoch": 3.3061276618303705, "eval_loss": 0.26254796981811523, "eval_runtime": 59.1307, "eval_samples_per_second": 169.117, "eval_steps_per_second": 21.14, "step": 34300 }, { "epoch": 3.3157665179873104, "grad_norm": 1.1522307395935059, "learning_rate": 2.568561872909699e-06, "loss": 0.2887, "step": 34400 }, { "epoch": 3.3157665179873104, "eval_loss": 0.25670239329338074, "eval_runtime": 58.9257, "eval_samples_per_second": 169.705, "eval_steps_per_second": 21.213, "step": 34400 }, { "epoch": 3.32540537414425, "grad_norm": 1.1980903148651123, "learning_rate": 2.5585284280936454e-06, "loss": 0.2938, "step": 34500 }, { "epoch": 3.32540537414425, "eval_loss": 0.25677651166915894, "eval_runtime": 58.9318, "eval_samples_per_second": 169.688, "eval_steps_per_second": 21.211, "step": 34500 }, { "epoch": 3.33504423030119, "grad_norm": 1.169443964958191, "learning_rate": 2.548494983277592e-06, "loss": 0.2884, "step": 34600 }, { "epoch": 3.33504423030119, "eval_loss": 0.2614832818508148, "eval_runtime": 59.0599, "eval_samples_per_second": 169.32, "eval_steps_per_second": 21.165, "step": 34600 }, { "epoch": 3.34468308645813, "grad_norm": 1.151646375656128, "learning_rate": 2.5384615384615385e-06, "loss": 0.2917, "step": 34700 }, { "epoch": 3.34468308645813, "eval_loss": 0.2570364773273468, "eval_runtime": 59.0246, "eval_samples_per_second": 169.421, "eval_steps_per_second": 21.178, "step": 34700 }, { "epoch": 3.35432194261507, "grad_norm": 1.1321110725402832, "learning_rate": 2.528428093645485e-06, "loss": 0.2896, "step": 34800 }, { "epoch": 3.35432194261507, "eval_loss": 0.26307597756385803, "eval_runtime": 58.9477, "eval_samples_per_second": 169.642, "eval_steps_per_second": 21.205, "step": 34800 }, { "epoch": 3.36396079877201, "grad_norm": 1.224995732307434, "learning_rate": 2.5183946488294316e-06, "loss": 0.2899, "step": 34900 }, { "epoch": 3.36396079877201, "eval_loss": 0.25860831141471863, "eval_runtime": 58.9452, "eval_samples_per_second": 169.649, "eval_steps_per_second": 21.206, "step": 34900 }, { "epoch": 3.3735996549289498, "grad_norm": 1.2130616903305054, "learning_rate": 2.508361204013378e-06, "loss": 0.2918, "step": 35000 }, { "epoch": 3.3735996549289498, "eval_loss": 0.26047125458717346, "eval_runtime": 58.891, "eval_samples_per_second": 169.805, "eval_steps_per_second": 21.226, "step": 35000 }, { "epoch": 3.3832385110858896, "grad_norm": 1.1943938732147217, "learning_rate": 2.4983277591973247e-06, "loss": 0.2899, "step": 35100 }, { "epoch": 3.3832385110858896, "eval_loss": 0.2654706537723541, "eval_runtime": 58.7593, "eval_samples_per_second": 170.186, "eval_steps_per_second": 21.273, "step": 35100 }, { "epoch": 3.3928773672428294, "grad_norm": 1.1910595893859863, "learning_rate": 2.488294314381271e-06, "loss": 0.2905, "step": 35200 }, { "epoch": 3.3928773672428294, "eval_loss": 0.252178430557251, "eval_runtime": 58.9565, "eval_samples_per_second": 169.617, "eval_steps_per_second": 21.202, "step": 35200 }, { "epoch": 3.4025162233997692, "grad_norm": 1.189275860786438, "learning_rate": 2.4782608695652173e-06, "loss": 0.2851, "step": 35300 }, { "epoch": 3.4025162233997692, "eval_loss": 0.2556873559951782, "eval_runtime": 59.1306, "eval_samples_per_second": 169.117, "eval_steps_per_second": 21.14, "step": 35300 }, { "epoch": 3.412155079556709, "grad_norm": 1.2805149555206299, "learning_rate": 2.468227424749164e-06, "loss": 0.2878, "step": 35400 }, { "epoch": 3.412155079556709, "eval_loss": 0.25642314553260803, "eval_runtime": 59.0054, "eval_samples_per_second": 169.476, "eval_steps_per_second": 21.185, "step": 35400 }, { "epoch": 3.421793935713649, "grad_norm": 1.1754244565963745, "learning_rate": 2.4581939799331104e-06, "loss": 0.2863, "step": 35500 }, { "epoch": 3.421793935713649, "eval_loss": 0.2632581293582916, "eval_runtime": 59.0769, "eval_samples_per_second": 169.271, "eval_steps_per_second": 21.159, "step": 35500 }, { "epoch": 3.4314327918705887, "grad_norm": 1.2007780075073242, "learning_rate": 2.4481605351170568e-06, "loss": 0.2892, "step": 35600 }, { "epoch": 3.4314327918705887, "eval_loss": 0.25143861770629883, "eval_runtime": 58.9087, "eval_samples_per_second": 169.754, "eval_steps_per_second": 21.219, "step": 35600 }, { "epoch": 3.4410716480275285, "grad_norm": 1.0942296981811523, "learning_rate": 2.4381270903010035e-06, "loss": 0.2865, "step": 35700 }, { "epoch": 3.4410716480275285, "eval_loss": 0.25563955307006836, "eval_runtime": 58.898, "eval_samples_per_second": 169.785, "eval_steps_per_second": 21.223, "step": 35700 }, { "epoch": 3.4507105041844683, "grad_norm": 1.1834352016448975, "learning_rate": 2.42809364548495e-06, "loss": 0.2889, "step": 35800 }, { "epoch": 3.4507105041844683, "eval_loss": 0.25440356135368347, "eval_runtime": 59.1434, "eval_samples_per_second": 169.08, "eval_steps_per_second": 21.135, "step": 35800 }, { "epoch": 3.460349360341408, "grad_norm": 1.3102777004241943, "learning_rate": 2.4180602006688962e-06, "loss": 0.2882, "step": 35900 }, { "epoch": 3.460349360341408, "eval_loss": 0.25780776143074036, "eval_runtime": 58.9453, "eval_samples_per_second": 169.649, "eval_steps_per_second": 21.206, "step": 35900 }, { "epoch": 3.469988216498348, "grad_norm": 1.221442461013794, "learning_rate": 2.408026755852843e-06, "loss": 0.2868, "step": 36000 }, { "epoch": 3.469988216498348, "eval_loss": 0.259140282869339, "eval_runtime": 58.9616, "eval_samples_per_second": 169.602, "eval_steps_per_second": 21.2, "step": 36000 }, { "epoch": 3.479627072655288, "grad_norm": 1.2383184432983398, "learning_rate": 2.3979933110367893e-06, "loss": 0.2869, "step": 36100 }, { "epoch": 3.479627072655288, "eval_loss": 0.25708064436912537, "eval_runtime": 59.1275, "eval_samples_per_second": 169.126, "eval_steps_per_second": 21.141, "step": 36100 }, { "epoch": 3.4892659288122276, "grad_norm": 1.1699203252792358, "learning_rate": 2.387959866220736e-06, "loss": 0.2842, "step": 36200 }, { "epoch": 3.4892659288122276, "eval_loss": 0.25702208280563354, "eval_runtime": 58.9299, "eval_samples_per_second": 169.693, "eval_steps_per_second": 21.212, "step": 36200 }, { "epoch": 3.498904784969168, "grad_norm": 1.2548960447311401, "learning_rate": 2.3779264214046824e-06, "loss": 0.2871, "step": 36300 }, { "epoch": 3.498904784969168, "eval_loss": 0.2522522807121277, "eval_runtime": 58.9452, "eval_samples_per_second": 169.649, "eval_steps_per_second": 21.206, "step": 36300 }, { "epoch": 3.5085436411261077, "grad_norm": 1.2380112409591675, "learning_rate": 2.3678929765886288e-06, "loss": 0.2847, "step": 36400 }, { "epoch": 3.5085436411261077, "eval_loss": 0.25519639253616333, "eval_runtime": 59.1639, "eval_samples_per_second": 169.022, "eval_steps_per_second": 21.128, "step": 36400 }, { "epoch": 3.5181824972830475, "grad_norm": 1.163103699684143, "learning_rate": 2.3578595317725755e-06, "loss": 0.2848, "step": 36500 }, { "epoch": 3.5181824972830475, "eval_loss": 0.25967657566070557, "eval_runtime": 58.7318, "eval_samples_per_second": 170.265, "eval_steps_per_second": 21.283, "step": 36500 }, { "epoch": 3.5278213534399874, "grad_norm": 1.17255437374115, "learning_rate": 2.347826086956522e-06, "loss": 0.285, "step": 36600 }, { "epoch": 3.5278213534399874, "eval_loss": 0.2591026723384857, "eval_runtime": 58.5991, "eval_samples_per_second": 170.651, "eval_steps_per_second": 21.331, "step": 36600 }, { "epoch": 3.537460209596927, "grad_norm": 1.2136497497558594, "learning_rate": 2.337792642140468e-06, "loss": 0.2843, "step": 36700 }, { "epoch": 3.537460209596927, "eval_loss": 0.2578739523887634, "eval_runtime": 59.0072, "eval_samples_per_second": 169.471, "eval_steps_per_second": 21.184, "step": 36700 }, { "epoch": 3.547099065753867, "grad_norm": 1.1942510604858398, "learning_rate": 2.327759197324415e-06, "loss": 0.2806, "step": 36800 }, { "epoch": 3.547099065753867, "eval_loss": 0.251567542552948, "eval_runtime": 58.9372, "eval_samples_per_second": 169.672, "eval_steps_per_second": 21.209, "step": 36800 }, { "epoch": 3.556737921910807, "grad_norm": 1.2007250785827637, "learning_rate": 2.3177257525083613e-06, "loss": 0.284, "step": 36900 }, { "epoch": 3.556737921910807, "eval_loss": 0.25226566195487976, "eval_runtime": 58.9498, "eval_samples_per_second": 169.636, "eval_steps_per_second": 21.204, "step": 36900 }, { "epoch": 3.5663767780677467, "grad_norm": 1.2540044784545898, "learning_rate": 2.307692307692308e-06, "loss": 0.2806, "step": 37000 }, { "epoch": 3.5663767780677467, "eval_loss": 0.24976973235607147, "eval_runtime": 59.0781, "eval_samples_per_second": 169.267, "eval_steps_per_second": 21.158, "step": 37000 }, { "epoch": 3.5760156342246865, "grad_norm": 1.2845237255096436, "learning_rate": 2.2976588628762544e-06, "loss": 0.2814, "step": 37100 }, { "epoch": 3.5760156342246865, "eval_loss": 0.2551785707473755, "eval_runtime": 58.947, "eval_samples_per_second": 169.644, "eval_steps_per_second": 21.205, "step": 37100 }, { "epoch": 3.5856544903816263, "grad_norm": 1.0793906450271606, "learning_rate": 2.2876254180602008e-06, "loss": 0.2831, "step": 37200 }, { "epoch": 3.5856544903816263, "eval_loss": 0.259166955947876, "eval_runtime": 58.9587, "eval_samples_per_second": 169.61, "eval_steps_per_second": 21.201, "step": 37200 }, { "epoch": 3.595293346538566, "grad_norm": 1.2562330961227417, "learning_rate": 2.2775919732441475e-06, "loss": 0.2816, "step": 37300 }, { "epoch": 3.595293346538566, "eval_loss": 0.25133049488067627, "eval_runtime": 59.0752, "eval_samples_per_second": 169.276, "eval_steps_per_second": 21.159, "step": 37300 }, { "epoch": 3.604932202695506, "grad_norm": 1.2339458465576172, "learning_rate": 2.267558528428094e-06, "loss": 0.2811, "step": 37400 }, { "epoch": 3.604932202695506, "eval_loss": 0.2517785131931305, "eval_runtime": 59.0293, "eval_samples_per_second": 169.407, "eval_steps_per_second": 21.176, "step": 37400 }, { "epoch": 3.6145710588524462, "grad_norm": 1.1703965663909912, "learning_rate": 2.25752508361204e-06, "loss": 0.285, "step": 37500 }, { "epoch": 3.6145710588524462, "eval_loss": 0.24998454749584198, "eval_runtime": 58.8551, "eval_samples_per_second": 169.909, "eval_steps_per_second": 21.239, "step": 37500 }, { "epoch": 3.624209915009386, "grad_norm": 1.1811251640319824, "learning_rate": 2.2474916387959865e-06, "loss": 0.2853, "step": 37600 }, { "epoch": 3.624209915009386, "eval_loss": 0.24660471081733704, "eval_runtime": 58.8887, "eval_samples_per_second": 169.812, "eval_steps_per_second": 21.226, "step": 37600 }, { "epoch": 3.633848771166326, "grad_norm": 1.2618868350982666, "learning_rate": 2.237458193979933e-06, "loss": 0.2795, "step": 37700 }, { "epoch": 3.633848771166326, "eval_loss": 0.25702670216560364, "eval_runtime": 59.0964, "eval_samples_per_second": 169.215, "eval_steps_per_second": 21.152, "step": 37700 }, { "epoch": 3.6434876273232657, "grad_norm": 1.1779111623764038, "learning_rate": 2.2274247491638796e-06, "loss": 0.2807, "step": 37800 }, { "epoch": 3.6434876273232657, "eval_loss": 0.24639862775802612, "eval_runtime": 58.9536, "eval_samples_per_second": 169.625, "eval_steps_per_second": 21.203, "step": 37800 }, { "epoch": 3.6531264834802055, "grad_norm": 1.1458935737609863, "learning_rate": 2.217391304347826e-06, "loss": 0.2781, "step": 37900 }, { "epoch": 3.6531264834802055, "eval_loss": 0.25728264451026917, "eval_runtime": 58.9434, "eval_samples_per_second": 169.654, "eval_steps_per_second": 21.207, "step": 37900 }, { "epoch": 3.6627653396371453, "grad_norm": 1.2311447858810425, "learning_rate": 2.2073578595317723e-06, "loss": 0.2804, "step": 38000 }, { "epoch": 3.6627653396371453, "eval_loss": 0.25558847188949585, "eval_runtime": 58.8989, "eval_samples_per_second": 169.782, "eval_steps_per_second": 21.223, "step": 38000 }, { "epoch": 3.672404195794085, "grad_norm": 1.1407544612884521, "learning_rate": 2.197324414715719e-06, "loss": 0.2765, "step": 38100 }, { "epoch": 3.672404195794085, "eval_loss": 0.248101145029068, "eval_runtime": 58.7355, "eval_samples_per_second": 170.255, "eval_steps_per_second": 21.282, "step": 38100 }, { "epoch": 3.682043051951025, "grad_norm": 1.2951027154922485, "learning_rate": 2.1872909698996654e-06, "loss": 0.2802, "step": 38200 }, { "epoch": 3.682043051951025, "eval_loss": 0.25361719727516174, "eval_runtime": 58.7402, "eval_samples_per_second": 170.241, "eval_steps_per_second": 21.28, "step": 38200 }, { "epoch": 3.691681908107965, "grad_norm": 1.1731014251708984, "learning_rate": 2.177257525083612e-06, "loss": 0.2773, "step": 38300 }, { "epoch": 3.691681908107965, "eval_loss": 0.25366759300231934, "eval_runtime": 58.9147, "eval_samples_per_second": 169.737, "eval_steps_per_second": 21.217, "step": 38300 }, { "epoch": 3.7013207642649046, "grad_norm": 1.2044979333877563, "learning_rate": 2.1672240802675585e-06, "loss": 0.2782, "step": 38400 }, { "epoch": 3.7013207642649046, "eval_loss": 0.2512661814689636, "eval_runtime": 58.7469, "eval_samples_per_second": 170.222, "eval_steps_per_second": 21.278, "step": 38400 }, { "epoch": 3.7109596204218445, "grad_norm": 1.1250883340835571, "learning_rate": 2.157190635451505e-06, "loss": 0.2766, "step": 38500 }, { "epoch": 3.7109596204218445, "eval_loss": 0.25230294466018677, "eval_runtime": 58.7174, "eval_samples_per_second": 170.307, "eval_steps_per_second": 21.288, "step": 38500 }, { "epoch": 3.7205984765787843, "grad_norm": 1.1560463905334473, "learning_rate": 2.1471571906354516e-06, "loss": 0.2794, "step": 38600 }, { "epoch": 3.7205984765787843, "eval_loss": 0.25185054540634155, "eval_runtime": 58.9294, "eval_samples_per_second": 169.695, "eval_steps_per_second": 21.212, "step": 38600 }, { "epoch": 3.730237332735724, "grad_norm": 1.104009747505188, "learning_rate": 2.137123745819398e-06, "loss": 0.2792, "step": 38700 }, { "epoch": 3.730237332735724, "eval_loss": 0.25351107120513916, "eval_runtime": 58.77, "eval_samples_per_second": 170.155, "eval_steps_per_second": 21.269, "step": 38700 }, { "epoch": 3.739876188892664, "grad_norm": 1.1741303205490112, "learning_rate": 2.1270903010033443e-06, "loss": 0.2804, "step": 38800 }, { "epoch": 3.739876188892664, "eval_loss": 0.2518996000289917, "eval_runtime": 58.7376, "eval_samples_per_second": 170.249, "eval_steps_per_second": 21.281, "step": 38800 }, { "epoch": 3.7495150450496038, "grad_norm": 1.218940258026123, "learning_rate": 2.117056856187291e-06, "loss": 0.2778, "step": 38900 }, { "epoch": 3.7495150450496038, "eval_loss": 0.25092679262161255, "eval_runtime": 58.9207, "eval_samples_per_second": 169.72, "eval_steps_per_second": 21.215, "step": 38900 }, { "epoch": 3.7591539012065436, "grad_norm": 1.15857994556427, "learning_rate": 2.1070234113712374e-06, "loss": 0.2788, "step": 39000 }, { "epoch": 3.7591539012065436, "eval_loss": 0.2451571226119995, "eval_runtime": 58.789, "eval_samples_per_second": 170.1, "eval_steps_per_second": 21.262, "step": 39000 }, { "epoch": 3.7687927573634834, "grad_norm": 1.1558825969696045, "learning_rate": 2.0969899665551837e-06, "loss": 0.2782, "step": 39100 }, { "epoch": 3.7687927573634834, "eval_loss": 0.2468053698539734, "eval_runtime": 58.7539, "eval_samples_per_second": 170.202, "eval_steps_per_second": 21.275, "step": 39100 }, { "epoch": 3.7784316135204237, "grad_norm": 1.0610604286193848, "learning_rate": 2.0869565217391305e-06, "loss": 0.2774, "step": 39200 }, { "epoch": 3.7784316135204237, "eval_loss": 0.2472023367881775, "eval_runtime": 58.9115, "eval_samples_per_second": 169.746, "eval_steps_per_second": 21.218, "step": 39200 }, { "epoch": 3.7880704696773635, "grad_norm": 1.1503691673278809, "learning_rate": 2.076923076923077e-06, "loss": 0.278, "step": 39300 }, { "epoch": 3.7880704696773635, "eval_loss": 0.2427283674478531, "eval_runtime": 58.7581, "eval_samples_per_second": 170.189, "eval_steps_per_second": 21.274, "step": 39300 }, { "epoch": 3.7977093258343033, "grad_norm": 1.0943491458892822, "learning_rate": 2.0668896321070236e-06, "loss": 0.2787, "step": 39400 }, { "epoch": 3.7977093258343033, "eval_loss": 0.24747537076473236, "eval_runtime": 58.7249, "eval_samples_per_second": 170.286, "eval_steps_per_second": 21.286, "step": 39400 }, { "epoch": 3.807348181991243, "grad_norm": 1.185901403427124, "learning_rate": 2.05685618729097e-06, "loss": 0.277, "step": 39500 }, { "epoch": 3.807348181991243, "eval_loss": 0.24593886733055115, "eval_runtime": 58.9029, "eval_samples_per_second": 169.771, "eval_steps_per_second": 21.221, "step": 39500 }, { "epoch": 3.816987038148183, "grad_norm": 1.1177955865859985, "learning_rate": 2.0468227424749163e-06, "loss": 0.2764, "step": 39600 }, { "epoch": 3.816987038148183, "eval_loss": 0.25283169746398926, "eval_runtime": 58.8362, "eval_samples_per_second": 169.963, "eval_steps_per_second": 21.245, "step": 39600 }, { "epoch": 3.826625894305123, "grad_norm": 1.2729185819625854, "learning_rate": 2.036789297658863e-06, "loss": 0.2756, "step": 39700 }, { "epoch": 3.826625894305123, "eval_loss": 0.24848853051662445, "eval_runtime": 58.9454, "eval_samples_per_second": 169.649, "eval_steps_per_second": 21.206, "step": 39700 }, { "epoch": 3.8362647504620626, "grad_norm": 1.056195855140686, "learning_rate": 2.0267558528428094e-06, "loss": 0.2755, "step": 39800 }, { "epoch": 3.8362647504620626, "eval_loss": 0.24940501153469086, "eval_runtime": 59.1051, "eval_samples_per_second": 169.19, "eval_steps_per_second": 21.149, "step": 39800 }, { "epoch": 3.8459036066190024, "grad_norm": 1.1669950485229492, "learning_rate": 2.0167224080267557e-06, "loss": 0.2715, "step": 39900 }, { "epoch": 3.8459036066190024, "eval_loss": 0.2444242686033249, "eval_runtime": 58.9505, "eval_samples_per_second": 169.634, "eval_steps_per_second": 21.204, "step": 39900 }, { "epoch": 3.8555424627759423, "grad_norm": 1.2285829782485962, "learning_rate": 2.0066889632107025e-06, "loss": 0.2764, "step": 40000 }, { "epoch": 3.8555424627759423, "eval_loss": 0.2512260377407074, "eval_runtime": 58.9442, "eval_samples_per_second": 169.652, "eval_steps_per_second": 21.207, "step": 40000 }, { "epoch": 3.865181318932882, "grad_norm": 1.120784878730774, "learning_rate": 1.996655518394649e-06, "loss": 0.273, "step": 40100 }, { "epoch": 3.865181318932882, "eval_loss": 0.24423950910568237, "eval_runtime": 59.1371, "eval_samples_per_second": 169.099, "eval_steps_per_second": 21.137, "step": 40100 }, { "epoch": 3.8748201750898223, "grad_norm": 1.1356536149978638, "learning_rate": 1.986622073578595e-06, "loss": 0.2742, "step": 40200 }, { "epoch": 3.8748201750898223, "eval_loss": 0.24526259303092957, "eval_runtime": 58.9383, "eval_samples_per_second": 169.669, "eval_steps_per_second": 21.209, "step": 40200 }, { "epoch": 3.884459031246762, "grad_norm": 1.1070743799209595, "learning_rate": 1.976588628762542e-06, "loss": 0.2718, "step": 40300 }, { "epoch": 3.884459031246762, "eval_loss": 0.2504149377346039, "eval_runtime": 58.9412, "eval_samples_per_second": 169.66, "eval_steps_per_second": 21.208, "step": 40300 }, { "epoch": 3.894097887403702, "grad_norm": 1.0678352117538452, "learning_rate": 1.9665551839464883e-06, "loss": 0.2731, "step": 40400 }, { "epoch": 3.894097887403702, "eval_loss": 0.24410085380077362, "eval_runtime": 59.1421, "eval_samples_per_second": 169.084, "eval_steps_per_second": 21.136, "step": 40400 }, { "epoch": 3.903736743560642, "grad_norm": 1.212470293045044, "learning_rate": 1.956521739130435e-06, "loss": 0.2751, "step": 40500 }, { "epoch": 3.903736743560642, "eval_loss": 0.24343110620975494, "eval_runtime": 58.9168, "eval_samples_per_second": 169.731, "eval_steps_per_second": 21.216, "step": 40500 }, { "epoch": 3.9133755997175816, "grad_norm": 1.2708877325057983, "learning_rate": 1.9464882943143814e-06, "loss": 0.2736, "step": 40600 }, { "epoch": 3.9133755997175816, "eval_loss": 0.2406957447528839, "eval_runtime": 58.9388, "eval_samples_per_second": 169.668, "eval_steps_per_second": 21.208, "step": 40600 }, { "epoch": 3.9230144558745215, "grad_norm": 1.4610050916671753, "learning_rate": 1.9364548494983277e-06, "loss": 0.2732, "step": 40700 }, { "epoch": 3.9230144558745215, "eval_loss": 0.23855151236057281, "eval_runtime": 59.1412, "eval_samples_per_second": 169.087, "eval_steps_per_second": 21.136, "step": 40700 }, { "epoch": 3.9326533120314613, "grad_norm": 1.3062421083450317, "learning_rate": 1.9264214046822745e-06, "loss": 0.2737, "step": 40800 }, { "epoch": 3.9326533120314613, "eval_loss": 0.24466918408870697, "eval_runtime": 58.926, "eval_samples_per_second": 169.704, "eval_steps_per_second": 21.213, "step": 40800 }, { "epoch": 3.942292168188401, "grad_norm": 1.1947509050369263, "learning_rate": 1.916387959866221e-06, "loss": 0.2732, "step": 40900 }, { "epoch": 3.942292168188401, "eval_loss": 0.2433861643075943, "eval_runtime": 58.8674, "eval_samples_per_second": 169.873, "eval_steps_per_second": 21.234, "step": 40900 }, { "epoch": 3.951931024345341, "grad_norm": 1.0160512924194336, "learning_rate": 1.9063545150501674e-06, "loss": 0.2734, "step": 41000 }, { "epoch": 3.951931024345341, "eval_loss": 0.24299292266368866, "eval_runtime": 58.6917, "eval_samples_per_second": 170.382, "eval_steps_per_second": 21.298, "step": 41000 }, { "epoch": 3.9615698805022808, "grad_norm": 1.2853399515151978, "learning_rate": 1.896321070234114e-06, "loss": 0.2735, "step": 41100 }, { "epoch": 3.9615698805022808, "eval_loss": 0.2398652285337448, "eval_runtime": 58.63, "eval_samples_per_second": 170.561, "eval_steps_per_second": 21.32, "step": 41100 }, { "epoch": 3.9712087366592206, "grad_norm": 1.2261525392532349, "learning_rate": 1.8862876254180603e-06, "loss": 0.2728, "step": 41200 }, { "epoch": 3.9712087366592206, "eval_loss": 0.24188843369483948, "eval_runtime": 58.8294, "eval_samples_per_second": 169.983, "eval_steps_per_second": 21.248, "step": 41200 }, { "epoch": 3.9808475928161604, "grad_norm": 1.2449485063552856, "learning_rate": 1.8762541806020068e-06, "loss": 0.2703, "step": 41300 }, { "epoch": 3.9808475928161604, "eval_loss": 0.24452358484268188, "eval_runtime": 59.1226, "eval_samples_per_second": 169.14, "eval_steps_per_second": 21.143, "step": 41300 }, { "epoch": 3.9904864489731002, "grad_norm": 1.1758934259414673, "learning_rate": 1.8662207357859534e-06, "loss": 0.276, "step": 41400 }, { "epoch": 3.9904864489731002, "eval_loss": 0.24620556831359863, "eval_runtime": 58.6943, "eval_samples_per_second": 170.374, "eval_steps_per_second": 21.297, "step": 41400 }, { "epoch": 4.00012530513004, "grad_norm": 1.1423110961914062, "learning_rate": 1.8561872909699e-06, "loss": 0.2716, "step": 41500 }, { "epoch": 4.00012530513004, "eval_loss": 0.24084354937076569, "eval_runtime": 58.8706, "eval_samples_per_second": 169.864, "eval_steps_per_second": 21.233, "step": 41500 }, { "epoch": 4.00976416128698, "grad_norm": 1.188114881515503, "learning_rate": 1.8461538461538462e-06, "loss": 0.2712, "step": 41600 }, { "epoch": 4.00976416128698, "eval_loss": 0.2466341108083725, "eval_runtime": 58.8443, "eval_samples_per_second": 169.94, "eval_steps_per_second": 21.242, "step": 41600 }, { "epoch": 4.01940301744392, "grad_norm": 1.0706281661987305, "learning_rate": 1.8361204013377928e-06, "loss": 0.2719, "step": 41700 }, { "epoch": 4.01940301744392, "eval_loss": 0.24489988386631012, "eval_runtime": 58.7263, "eval_samples_per_second": 170.281, "eval_steps_per_second": 21.285, "step": 41700 }, { "epoch": 4.0290418736008595, "grad_norm": 1.134295105934143, "learning_rate": 1.8260869565217394e-06, "loss": 0.2721, "step": 41800 }, { "epoch": 4.0290418736008595, "eval_loss": 0.2406749725341797, "eval_runtime": 59.0931, "eval_samples_per_second": 169.225, "eval_steps_per_second": 21.153, "step": 41800 }, { "epoch": 4.038680729757799, "grad_norm": 1.140578269958496, "learning_rate": 1.8160535117056857e-06, "loss": 0.2703, "step": 41900 }, { "epoch": 4.038680729757799, "eval_loss": 0.24536637961864471, "eval_runtime": 58.9496, "eval_samples_per_second": 169.636, "eval_steps_per_second": 21.205, "step": 41900 }, { "epoch": 4.048319585914739, "grad_norm": 1.1268982887268066, "learning_rate": 1.8060200668896322e-06, "loss": 0.2708, "step": 42000 }, { "epoch": 4.048319585914739, "eval_loss": 0.24372392892837524, "eval_runtime": 58.9564, "eval_samples_per_second": 169.617, "eval_steps_per_second": 21.202, "step": 42000 }, { "epoch": 4.057958442071679, "grad_norm": 1.126116156578064, "learning_rate": 1.7959866220735788e-06, "loss": 0.2674, "step": 42100 }, { "epoch": 4.057958442071679, "eval_loss": 0.24174726009368896, "eval_runtime": 59.1178, "eval_samples_per_second": 169.154, "eval_steps_per_second": 21.144, "step": 42100 }, { "epoch": 4.067597298228619, "grad_norm": 1.1760188341140747, "learning_rate": 1.7859531772575253e-06, "loss": 0.2697, "step": 42200 }, { "epoch": 4.067597298228619, "eval_loss": 0.2439856231212616, "eval_runtime": 58.9349, "eval_samples_per_second": 169.679, "eval_steps_per_second": 21.21, "step": 42200 }, { "epoch": 4.077236154385559, "grad_norm": 0.9911255836486816, "learning_rate": 1.7759197324414717e-06, "loss": 0.2654, "step": 42300 }, { "epoch": 4.077236154385559, "eval_loss": 0.23890195786952972, "eval_runtime": 58.9228, "eval_samples_per_second": 169.714, "eval_steps_per_second": 21.214, "step": 42300 }, { "epoch": 4.086875010542499, "grad_norm": 1.2863502502441406, "learning_rate": 1.7658862876254182e-06, "loss": 0.2692, "step": 42400 }, { "epoch": 4.086875010542499, "eval_loss": 0.24417583644390106, "eval_runtime": 58.8653, "eval_samples_per_second": 169.879, "eval_steps_per_second": 21.235, "step": 42400 }, { "epoch": 4.096513866699439, "grad_norm": 1.1881786584854126, "learning_rate": 1.7558528428093648e-06, "loss": 0.2708, "step": 42500 }, { "epoch": 4.096513866699439, "eval_loss": 0.24346549808979034, "eval_runtime": 58.7861, "eval_samples_per_second": 170.108, "eval_steps_per_second": 21.264, "step": 42500 }, { "epoch": 4.106152722856379, "grad_norm": 1.2108421325683594, "learning_rate": 1.745819397993311e-06, "loss": 0.2692, "step": 42600 }, { "epoch": 4.106152722856379, "eval_loss": 0.2419823706150055, "eval_runtime": 58.7844, "eval_samples_per_second": 170.113, "eval_steps_per_second": 21.264, "step": 42600 }, { "epoch": 4.115791579013319, "grad_norm": 1.2227544784545898, "learning_rate": 1.7357859531772575e-06, "loss": 0.2695, "step": 42700 }, { "epoch": 4.115791579013319, "eval_loss": 0.23794367909431458, "eval_runtime": 59.1212, "eval_samples_per_second": 169.144, "eval_steps_per_second": 21.143, "step": 42700 }, { "epoch": 4.125430435170259, "grad_norm": 1.0150911808013916, "learning_rate": 1.7257525083612038e-06, "loss": 0.2694, "step": 42800 }, { "epoch": 4.125430435170259, "eval_loss": 0.24516792595386505, "eval_runtime": 58.9164, "eval_samples_per_second": 169.732, "eval_steps_per_second": 21.217, "step": 42800 }, { "epoch": 4.1350692913271985, "grad_norm": 1.1706311702728271, "learning_rate": 1.7157190635451504e-06, "loss": 0.2707, "step": 42900 }, { "epoch": 4.1350692913271985, "eval_loss": 0.23993416130542755, "eval_runtime": 58.8705, "eval_samples_per_second": 169.864, "eval_steps_per_second": 21.233, "step": 42900 }, { "epoch": 4.144708147484138, "grad_norm": 1.1141585111618042, "learning_rate": 1.705685618729097e-06, "loss": 0.2696, "step": 43000 }, { "epoch": 4.144708147484138, "eval_loss": 0.2433127909898758, "eval_runtime": 59.0527, "eval_samples_per_second": 169.34, "eval_steps_per_second": 21.168, "step": 43000 }, { "epoch": 4.154347003641078, "grad_norm": 1.11643385887146, "learning_rate": 1.6956521739130435e-06, "loss": 0.2669, "step": 43100 }, { "epoch": 4.154347003641078, "eval_loss": 0.24249985814094543, "eval_runtime": 58.8967, "eval_samples_per_second": 169.789, "eval_steps_per_second": 21.224, "step": 43100 }, { "epoch": 4.163985859798018, "grad_norm": 1.1383628845214844, "learning_rate": 1.6856187290969898e-06, "loss": 0.2688, "step": 43200 }, { "epoch": 4.163985859798018, "eval_loss": 0.24246694147586823, "eval_runtime": 58.9245, "eval_samples_per_second": 169.709, "eval_steps_per_second": 21.214, "step": 43200 }, { "epoch": 4.173624715954958, "grad_norm": 1.0870535373687744, "learning_rate": 1.6755852842809363e-06, "loss": 0.2669, "step": 43300 }, { "epoch": 4.173624715954958, "eval_loss": 0.23730356991291046, "eval_runtime": 59.0839, "eval_samples_per_second": 169.251, "eval_steps_per_second": 21.156, "step": 43300 }, { "epoch": 4.183263572111898, "grad_norm": 1.062546730041504, "learning_rate": 1.665551839464883e-06, "loss": 0.268, "step": 43400 }, { "epoch": 4.183263572111898, "eval_loss": 0.2389833778142929, "eval_runtime": 58.9088, "eval_samples_per_second": 169.754, "eval_steps_per_second": 21.219, "step": 43400 }, { "epoch": 4.192902428268837, "grad_norm": 1.2048453092575073, "learning_rate": 1.6555183946488294e-06, "loss": 0.2661, "step": 43500 }, { "epoch": 4.192902428268837, "eval_loss": 0.24401752650737762, "eval_runtime": 58.9403, "eval_samples_per_second": 169.663, "eval_steps_per_second": 21.208, "step": 43500 }, { "epoch": 4.202541284425777, "grad_norm": 1.2191439867019653, "learning_rate": 1.6454849498327758e-06, "loss": 0.2679, "step": 43600 }, { "epoch": 4.202541284425777, "eval_loss": 0.2426602989435196, "eval_runtime": 59.1448, "eval_samples_per_second": 169.077, "eval_steps_per_second": 21.135, "step": 43600 }, { "epoch": 4.212180140582717, "grad_norm": 1.168209195137024, "learning_rate": 1.6354515050167223e-06, "loss": 0.267, "step": 43700 }, { "epoch": 4.212180140582717, "eval_loss": 0.2363247573375702, "eval_runtime": 58.9517, "eval_samples_per_second": 169.63, "eval_steps_per_second": 21.204, "step": 43700 }, { "epoch": 4.221818996739657, "grad_norm": 1.2940659523010254, "learning_rate": 1.6254180602006689e-06, "loss": 0.2675, "step": 43800 }, { "epoch": 4.221818996739657, "eval_loss": 0.23338085412979126, "eval_runtime": 59.1408, "eval_samples_per_second": 169.088, "eval_steps_per_second": 21.136, "step": 43800 }, { "epoch": 4.231457852896597, "grad_norm": 1.2550647258758545, "learning_rate": 1.6153846153846154e-06, "loss": 0.2677, "step": 43900 }, { "epoch": 4.231457852896597, "eval_loss": 0.24174867570400238, "eval_runtime": 58.7543, "eval_samples_per_second": 170.2, "eval_steps_per_second": 21.275, "step": 43900 }, { "epoch": 4.2410967090535365, "grad_norm": 1.1011639833450317, "learning_rate": 1.6053511705685618e-06, "loss": 0.2671, "step": 44000 }, { "epoch": 4.2410967090535365, "eval_loss": 0.23765331506729126, "eval_runtime": 59.0653, "eval_samples_per_second": 169.304, "eval_steps_per_second": 21.163, "step": 44000 }, { "epoch": 4.250735565210476, "grad_norm": 1.282638430595398, "learning_rate": 1.5953177257525083e-06, "loss": 0.2674, "step": 44100 }, { "epoch": 4.250735565210476, "eval_loss": 0.23663020133972168, "eval_runtime": 58.989, "eval_samples_per_second": 169.523, "eval_steps_per_second": 21.19, "step": 44100 }, { "epoch": 4.260374421367416, "grad_norm": 1.233346700668335, "learning_rate": 1.5852842809364549e-06, "loss": 0.2664, "step": 44200 }, { "epoch": 4.260374421367416, "eval_loss": 0.2390340268611908, "eval_runtime": 58.9592, "eval_samples_per_second": 169.609, "eval_steps_per_second": 21.201, "step": 44200 }, { "epoch": 4.270013277524356, "grad_norm": 1.2142596244812012, "learning_rate": 1.5752508361204012e-06, "loss": 0.2673, "step": 44300 }, { "epoch": 4.270013277524356, "eval_loss": 0.23672623932361603, "eval_runtime": 59.1929, "eval_samples_per_second": 168.939, "eval_steps_per_second": 21.117, "step": 44300 }, { "epoch": 4.279652133681296, "grad_norm": 1.1910849809646606, "learning_rate": 1.5652173913043478e-06, "loss": 0.2663, "step": 44400 }, { "epoch": 4.279652133681296, "eval_loss": 0.23604613542556763, "eval_runtime": 59.1767, "eval_samples_per_second": 168.985, "eval_steps_per_second": 21.123, "step": 44400 }, { "epoch": 4.289290989838236, "grad_norm": 1.0741398334503174, "learning_rate": 1.5551839464882943e-06, "loss": 0.265, "step": 44500 }, { "epoch": 4.289290989838236, "eval_loss": 0.24038757383823395, "eval_runtime": 58.9675, "eval_samples_per_second": 169.585, "eval_steps_per_second": 21.198, "step": 44500 }, { "epoch": 4.2989298459951755, "grad_norm": 1.1280308961868286, "learning_rate": 1.5451505016722409e-06, "loss": 0.2681, "step": 44600 }, { "epoch": 4.2989298459951755, "eval_loss": 0.2376743108034134, "eval_runtime": 59.012, "eval_samples_per_second": 169.457, "eval_steps_per_second": 21.182, "step": 44600 }, { "epoch": 4.308568702152115, "grad_norm": 1.210562825202942, "learning_rate": 1.5351170568561872e-06, "loss": 0.2653, "step": 44700 }, { "epoch": 4.308568702152115, "eval_loss": 0.2342677265405655, "eval_runtime": 59.1509, "eval_samples_per_second": 169.059, "eval_steps_per_second": 21.132, "step": 44700 }, { "epoch": 4.318207558309055, "grad_norm": 1.2077608108520508, "learning_rate": 1.5250836120401338e-06, "loss": 0.2659, "step": 44800 }, { "epoch": 4.318207558309055, "eval_loss": 0.23438528180122375, "eval_runtime": 58.9257, "eval_samples_per_second": 169.705, "eval_steps_per_second": 21.213, "step": 44800 }, { "epoch": 4.327846414465995, "grad_norm": 1.2959463596343994, "learning_rate": 1.5150501672240803e-06, "loss": 0.266, "step": 44900 }, { "epoch": 4.327846414465995, "eval_loss": 0.2376752495765686, "eval_runtime": 58.9319, "eval_samples_per_second": 169.687, "eval_steps_per_second": 21.211, "step": 44900 }, { "epoch": 4.337485270622935, "grad_norm": 1.135292410850525, "learning_rate": 1.5050167224080269e-06, "loss": 0.2649, "step": 45000 }, { "epoch": 4.337485270622935, "eval_loss": 0.24188388884067535, "eval_runtime": 59.1492, "eval_samples_per_second": 169.064, "eval_steps_per_second": 21.133, "step": 45000 }, { "epoch": 4.347124126779875, "grad_norm": 1.1070986986160278, "learning_rate": 1.4949832775919732e-06, "loss": 0.2663, "step": 45100 }, { "epoch": 4.347124126779875, "eval_loss": 0.23631098866462708, "eval_runtime": 58.8991, "eval_samples_per_second": 169.782, "eval_steps_per_second": 21.223, "step": 45100 }, { "epoch": 4.356762982936815, "grad_norm": 1.0398037433624268, "learning_rate": 1.4849498327759198e-06, "loss": 0.2644, "step": 45200 }, { "epoch": 4.356762982936815, "eval_loss": 0.23711133003234863, "eval_runtime": 58.8817, "eval_samples_per_second": 169.832, "eval_steps_per_second": 21.229, "step": 45200 }, { "epoch": 4.366401839093755, "grad_norm": 1.2478735446929932, "learning_rate": 1.4749163879598663e-06, "loss": 0.2654, "step": 45300 }, { "epoch": 4.366401839093755, "eval_loss": 0.2371368706226349, "eval_runtime": 59.1317, "eval_samples_per_second": 169.114, "eval_steps_per_second": 21.139, "step": 45300 }, { "epoch": 4.376040695250695, "grad_norm": 1.1391520500183105, "learning_rate": 1.4648829431438129e-06, "loss": 0.2627, "step": 45400 }, { "epoch": 4.376040695250695, "eval_loss": 0.23908434808254242, "eval_runtime": 58.7291, "eval_samples_per_second": 170.273, "eval_steps_per_second": 21.284, "step": 45400 }, { "epoch": 4.385679551407635, "grad_norm": 1.0501779317855835, "learning_rate": 1.4548494983277592e-06, "loss": 0.265, "step": 45500 }, { "epoch": 4.385679551407635, "eval_loss": 0.2357652336359024, "eval_runtime": 58.7539, "eval_samples_per_second": 170.202, "eval_steps_per_second": 21.275, "step": 45500 }, { "epoch": 4.395318407564575, "grad_norm": 1.1620495319366455, "learning_rate": 1.4448160535117058e-06, "loss": 0.263, "step": 45600 }, { "epoch": 4.395318407564575, "eval_loss": 0.23429590463638306, "eval_runtime": 58.9861, "eval_samples_per_second": 169.532, "eval_steps_per_second": 21.191, "step": 45600 }, { "epoch": 4.404957263721514, "grad_norm": 1.365100383758545, "learning_rate": 1.4347826086956523e-06, "loss": 0.2627, "step": 45700 }, { "epoch": 4.404957263721514, "eval_loss": 0.23306749761104584, "eval_runtime": 58.8225, "eval_samples_per_second": 170.003, "eval_steps_per_second": 21.25, "step": 45700 }, { "epoch": 4.414596119878454, "grad_norm": 1.2309627532958984, "learning_rate": 1.4247491638795989e-06, "loss": 0.2632, "step": 45800 }, { "epoch": 4.414596119878454, "eval_loss": 0.23629753291606903, "eval_runtime": 58.8174, "eval_samples_per_second": 170.018, "eval_steps_per_second": 21.252, "step": 45800 }, { "epoch": 4.424234976035394, "grad_norm": 1.277610421180725, "learning_rate": 1.4147157190635452e-06, "loss": 0.2655, "step": 45900 }, { "epoch": 4.424234976035394, "eval_loss": 0.23689965903759003, "eval_runtime": 58.9758, "eval_samples_per_second": 169.561, "eval_steps_per_second": 21.195, "step": 45900 }, { "epoch": 4.433873832192334, "grad_norm": 1.1359058618545532, "learning_rate": 1.4046822742474917e-06, "loss": 0.2638, "step": 46000 }, { "epoch": 4.433873832192334, "eval_loss": 0.23394687473773956, "eval_runtime": 58.9538, "eval_samples_per_second": 169.624, "eval_steps_per_second": 21.203, "step": 46000 }, { "epoch": 4.443512688349274, "grad_norm": 1.0680986642837524, "learning_rate": 1.3946488294314383e-06, "loss": 0.2611, "step": 46100 }, { "epoch": 4.443512688349274, "eval_loss": 0.23477770388126373, "eval_runtime": 58.934, "eval_samples_per_second": 169.681, "eval_steps_per_second": 21.21, "step": 46100 }, { "epoch": 4.4531515445062135, "grad_norm": 1.0783350467681885, "learning_rate": 1.3846153846153846e-06, "loss": 0.2599, "step": 46200 }, { "epoch": 4.4531515445062135, "eval_loss": 0.23568202555179596, "eval_runtime": 59.1167, "eval_samples_per_second": 169.157, "eval_steps_per_second": 21.145, "step": 46200 }, { "epoch": 4.462790400663153, "grad_norm": 1.2353805303573608, "learning_rate": 1.374581939799331e-06, "loss": 0.2602, "step": 46300 }, { "epoch": 4.462790400663153, "eval_loss": 0.23270440101623535, "eval_runtime": 58.9332, "eval_samples_per_second": 169.684, "eval_steps_per_second": 21.21, "step": 46300 }, { "epoch": 4.472429256820093, "grad_norm": 1.1992213726043701, "learning_rate": 1.3645484949832775e-06, "loss": 0.2618, "step": 46400 }, { "epoch": 4.472429256820093, "eval_loss": 0.23463425040245056, "eval_runtime": 58.97, "eval_samples_per_second": 169.578, "eval_steps_per_second": 21.197, "step": 46400 }, { "epoch": 4.482068112977033, "grad_norm": 1.2045105695724487, "learning_rate": 1.354515050167224e-06, "loss": 0.2607, "step": 46500 }, { "epoch": 4.482068112977033, "eval_loss": 0.2355393022298813, "eval_runtime": 59.1174, "eval_samples_per_second": 169.155, "eval_steps_per_second": 21.144, "step": 46500 }, { "epoch": 4.491706969133973, "grad_norm": 1.1414119005203247, "learning_rate": 1.3444816053511706e-06, "loss": 0.2617, "step": 46600 }, { "epoch": 4.491706969133973, "eval_loss": 0.23534570634365082, "eval_runtime": 58.9211, "eval_samples_per_second": 169.718, "eval_steps_per_second": 21.215, "step": 46600 }, { "epoch": 4.501345825290913, "grad_norm": 1.2172667980194092, "learning_rate": 1.334448160535117e-06, "loss": 0.2627, "step": 46700 }, { "epoch": 4.501345825290913, "eval_loss": 0.2336340993642807, "eval_runtime": 58.9514, "eval_samples_per_second": 169.631, "eval_steps_per_second": 21.204, "step": 46700 }, { "epoch": 4.5109846814478525, "grad_norm": 1.0742268562316895, "learning_rate": 1.3244147157190635e-06, "loss": 0.2567, "step": 46800 }, { "epoch": 4.5109846814478525, "eval_loss": 0.2309028059244156, "eval_runtime": 59.1411, "eval_samples_per_second": 169.087, "eval_steps_per_second": 21.136, "step": 46800 }, { "epoch": 4.520623537604792, "grad_norm": 1.1084363460540771, "learning_rate": 1.31438127090301e-06, "loss": 0.2605, "step": 46900 }, { "epoch": 4.520623537604792, "eval_loss": 0.23382265865802765, "eval_runtime": 58.7079, "eval_samples_per_second": 170.335, "eval_steps_per_second": 21.292, "step": 46900 }, { "epoch": 4.530262393761732, "grad_norm": 1.171460747718811, "learning_rate": 1.3043478260869566e-06, "loss": 0.2591, "step": 47000 }, { "epoch": 4.530262393761732, "eval_loss": 0.23248232901096344, "eval_runtime": 58.9691, "eval_samples_per_second": 169.58, "eval_steps_per_second": 21.198, "step": 47000 }, { "epoch": 4.539901249918672, "grad_norm": 1.2160496711730957, "learning_rate": 1.294314381270903e-06, "loss": 0.2584, "step": 47100 }, { "epoch": 4.539901249918672, "eval_loss": 0.23489028215408325, "eval_runtime": 59.1758, "eval_samples_per_second": 168.988, "eval_steps_per_second": 21.123, "step": 47100 }, { "epoch": 4.549540106075612, "grad_norm": 1.0126835107803345, "learning_rate": 1.2842809364548495e-06, "loss": 0.2608, "step": 47200 }, { "epoch": 4.549540106075612, "eval_loss": 0.22762079536914825, "eval_runtime": 58.9452, "eval_samples_per_second": 169.649, "eval_steps_per_second": 21.206, "step": 47200 }, { "epoch": 4.559178962232552, "grad_norm": 1.160570502281189, "learning_rate": 1.274247491638796e-06, "loss": 0.2615, "step": 47300 }, { "epoch": 4.559178962232552, "eval_loss": 0.23083405196666718, "eval_runtime": 58.9807, "eval_samples_per_second": 169.547, "eval_steps_per_second": 21.193, "step": 47300 }, { "epoch": 4.568817818389491, "grad_norm": 1.1063063144683838, "learning_rate": 1.2642140468227424e-06, "loss": 0.2592, "step": 47400 }, { "epoch": 4.568817818389491, "eval_loss": 0.2323395013809204, "eval_runtime": 59.1709, "eval_samples_per_second": 169.002, "eval_steps_per_second": 21.125, "step": 47400 }, { "epoch": 4.578456674546431, "grad_norm": 1.0993156433105469, "learning_rate": 1.254180602006689e-06, "loss": 0.2604, "step": 47500 }, { "epoch": 4.578456674546431, "eval_loss": 0.2317042052745819, "eval_runtime": 59.0198, "eval_samples_per_second": 169.435, "eval_steps_per_second": 21.179, "step": 47500 }, { "epoch": 4.588095530703371, "grad_norm": 1.2433979511260986, "learning_rate": 1.2441471571906355e-06, "loss": 0.2595, "step": 47600 }, { "epoch": 4.588095530703371, "eval_loss": 0.23653364181518555, "eval_runtime": 58.9575, "eval_samples_per_second": 169.614, "eval_steps_per_second": 21.202, "step": 47600 }, { "epoch": 4.597734386860312, "grad_norm": 1.1454936265945435, "learning_rate": 1.234113712374582e-06, "loss": 0.2623, "step": 47700 }, { "epoch": 4.597734386860312, "eval_loss": 0.23131422698497772, "eval_runtime": 59.0739, "eval_samples_per_second": 169.279, "eval_steps_per_second": 21.16, "step": 47700 }, { "epoch": 4.607373243017252, "grad_norm": 1.0795280933380127, "learning_rate": 1.2240802675585284e-06, "loss": 0.2594, "step": 47800 }, { "epoch": 4.607373243017252, "eval_loss": 0.22916051745414734, "eval_runtime": 58.9155, "eval_samples_per_second": 169.735, "eval_steps_per_second": 21.217, "step": 47800 }, { "epoch": 4.617012099174191, "grad_norm": 1.2649006843566895, "learning_rate": 1.214046822742475e-06, "loss": 0.259, "step": 47900 }, { "epoch": 4.617012099174191, "eval_loss": 0.2325269728899002, "eval_runtime": 58.9759, "eval_samples_per_second": 169.561, "eval_steps_per_second": 21.195, "step": 47900 }, { "epoch": 4.626650955331131, "grad_norm": 1.181906819343567, "learning_rate": 1.2040133779264215e-06, "loss": 0.2588, "step": 48000 }, { "epoch": 4.626650955331131, "eval_loss": 0.23951946198940277, "eval_runtime": 59.1457, "eval_samples_per_second": 169.074, "eval_steps_per_second": 21.134, "step": 48000 }, { "epoch": 4.636289811488071, "grad_norm": 1.1375948190689087, "learning_rate": 1.193979933110368e-06, "loss": 0.2596, "step": 48100 }, { "epoch": 4.636289811488071, "eval_loss": 0.23047949373722076, "eval_runtime": 58.9661, "eval_samples_per_second": 169.589, "eval_steps_per_second": 21.199, "step": 48100 }, { "epoch": 4.645928667645011, "grad_norm": 1.180855393409729, "learning_rate": 1.1839464882943144e-06, "loss": 0.2604, "step": 48200 }, { "epoch": 4.645928667645011, "eval_loss": 0.23023301362991333, "eval_runtime": 58.9745, "eval_samples_per_second": 169.565, "eval_steps_per_second": 21.196, "step": 48200 }, { "epoch": 4.655567523801951, "grad_norm": 1.1689326763153076, "learning_rate": 1.173913043478261e-06, "loss": 0.2577, "step": 48300 }, { "epoch": 4.655567523801951, "eval_loss": 0.23206885159015656, "eval_runtime": 58.8991, "eval_samples_per_second": 169.782, "eval_steps_per_second": 21.223, "step": 48300 }, { "epoch": 4.6652063799588905, "grad_norm": 1.190232276916504, "learning_rate": 1.1638795986622075e-06, "loss": 0.2577, "step": 48400 }, { "epoch": 4.6652063799588905, "eval_loss": 0.23331387341022491, "eval_runtime": 59.0925, "eval_samples_per_second": 169.226, "eval_steps_per_second": 21.153, "step": 48400 }, { "epoch": 4.67484523611583, "grad_norm": 1.231204628944397, "learning_rate": 1.153846153846154e-06, "loss": 0.2585, "step": 48500 }, { "epoch": 4.67484523611583, "eval_loss": 0.22776982188224792, "eval_runtime": 58.9385, "eval_samples_per_second": 169.668, "eval_steps_per_second": 21.209, "step": 48500 }, { "epoch": 4.68448409227277, "grad_norm": 1.1207915544509888, "learning_rate": 1.1438127090301004e-06, "loss": 0.2564, "step": 48600 }, { "epoch": 4.68448409227277, "eval_loss": 0.23199844360351562, "eval_runtime": 58.9559, "eval_samples_per_second": 169.618, "eval_steps_per_second": 21.202, "step": 48600 }, { "epoch": 4.69412294842971, "grad_norm": 1.1697341203689575, "learning_rate": 1.133779264214047e-06, "loss": 0.2588, "step": 48700 }, { "epoch": 4.69412294842971, "eval_loss": 0.2301999032497406, "eval_runtime": 59.1168, "eval_samples_per_second": 169.157, "eval_steps_per_second": 21.145, "step": 48700 }, { "epoch": 4.70376180458665, "grad_norm": 1.206182837486267, "learning_rate": 1.1237458193979933e-06, "loss": 0.257, "step": 48800 }, { "epoch": 4.70376180458665, "eval_loss": 0.2264467477798462, "eval_runtime": 58.8913, "eval_samples_per_second": 169.804, "eval_steps_per_second": 21.226, "step": 48800 }, { "epoch": 4.71340066074359, "grad_norm": 1.2088978290557861, "learning_rate": 1.1137123745819398e-06, "loss": 0.2567, "step": 48900 }, { "epoch": 4.71340066074359, "eval_loss": 0.22952109575271606, "eval_runtime": 58.8906, "eval_samples_per_second": 169.806, "eval_steps_per_second": 21.226, "step": 48900 }, { "epoch": 4.7230395169005295, "grad_norm": 1.1703099012374878, "learning_rate": 1.1036789297658862e-06, "loss": 0.2589, "step": 49000 }, { "epoch": 4.7230395169005295, "eval_loss": 0.23127800226211548, "eval_runtime": 59.0382, "eval_samples_per_second": 169.382, "eval_steps_per_second": 21.173, "step": 49000 }, { "epoch": 4.732678373057469, "grad_norm": 1.1088894605636597, "learning_rate": 1.0936454849498327e-06, "loss": 0.2576, "step": 49100 }, { "epoch": 4.732678373057469, "eval_loss": 0.22928257286548615, "eval_runtime": 58.9548, "eval_samples_per_second": 169.622, "eval_steps_per_second": 21.203, "step": 49100 }, { "epoch": 4.742317229214409, "grad_norm": 1.1792631149291992, "learning_rate": 1.0836120401337793e-06, "loss": 0.256, "step": 49200 }, { "epoch": 4.742317229214409, "eval_loss": 0.2260976880788803, "eval_runtime": 58.9553, "eval_samples_per_second": 169.62, "eval_steps_per_second": 21.202, "step": 49200 }, { "epoch": 4.751956085371349, "grad_norm": 1.115937352180481, "learning_rate": 1.0735785953177258e-06, "loss": 0.2584, "step": 49300 }, { "epoch": 4.751956085371349, "eval_loss": 0.2344072461128235, "eval_runtime": 59.1056, "eval_samples_per_second": 169.189, "eval_steps_per_second": 21.149, "step": 49300 }, { "epoch": 4.761594941528289, "grad_norm": 1.2252027988433838, "learning_rate": 1.0635451505016722e-06, "loss": 0.2577, "step": 49400 }, { "epoch": 4.761594941528289, "eval_loss": 0.2281453162431717, "eval_runtime": 58.9641, "eval_samples_per_second": 169.595, "eval_steps_per_second": 21.199, "step": 49400 }, { "epoch": 4.771233797685229, "grad_norm": 1.164642572402954, "learning_rate": 1.0535117056856187e-06, "loss": 0.2566, "step": 49500 }, { "epoch": 4.771233797685229, "eval_loss": 0.23026709258556366, "eval_runtime": 59.1894, "eval_samples_per_second": 168.949, "eval_steps_per_second": 21.119, "step": 49500 }, { "epoch": 4.780872653842168, "grad_norm": 1.0098907947540283, "learning_rate": 1.0434782608695653e-06, "loss": 0.2589, "step": 49600 }, { "epoch": 4.780872653842168, "eval_loss": 0.23416157066822052, "eval_runtime": 59.0058, "eval_samples_per_second": 169.475, "eval_steps_per_second": 21.184, "step": 49600 }, { "epoch": 4.790511509999108, "grad_norm": 1.1912959814071655, "learning_rate": 1.0334448160535118e-06, "loss": 0.2579, "step": 49700 }, { "epoch": 4.790511509999108, "eval_loss": 0.2309984415769577, "eval_runtime": 58.9325, "eval_samples_per_second": 169.686, "eval_steps_per_second": 21.211, "step": 49700 }, { "epoch": 4.800150366156048, "grad_norm": 1.2718689441680908, "learning_rate": 1.0234113712374581e-06, "loss": 0.2575, "step": 49800 }, { "epoch": 4.800150366156048, "eval_loss": 0.2315024882555008, "eval_runtime": 58.6814, "eval_samples_per_second": 170.412, "eval_steps_per_second": 21.301, "step": 49800 }, { "epoch": 4.809789222312988, "grad_norm": 1.0510720014572144, "learning_rate": 1.0133779264214047e-06, "loss": 0.2567, "step": 49900 }, { "epoch": 4.809789222312988, "eval_loss": 0.2308981865644455, "eval_runtime": 58.7968, "eval_samples_per_second": 170.077, "eval_steps_per_second": 21.26, "step": 49900 }, { "epoch": 4.819428078469928, "grad_norm": 1.1117866039276123, "learning_rate": 1.0033444816053512e-06, "loss": 0.2577, "step": 50000 }, { "epoch": 4.819428078469928, "eval_loss": 0.2323577105998993, "eval_runtime": 58.6609, "eval_samples_per_second": 170.471, "eval_steps_per_second": 21.309, "step": 50000 }, { "epoch": 4.8290669346268675, "grad_norm": 1.1479214429855347, "learning_rate": 9.933110367892976e-07, "loss": 0.2569, "step": 50100 }, { "epoch": 4.8290669346268675, "eval_loss": 0.2267085760831833, "eval_runtime": 58.6686, "eval_samples_per_second": 170.449, "eval_steps_per_second": 21.306, "step": 50100 }, { "epoch": 4.838705790783807, "grad_norm": 1.0486712455749512, "learning_rate": 9.832775919732441e-07, "loss": 0.2565, "step": 50200 }, { "epoch": 4.838705790783807, "eval_loss": 0.233329638838768, "eval_runtime": 58.7371, "eval_samples_per_second": 170.25, "eval_steps_per_second": 21.281, "step": 50200 }, { "epoch": 4.848344646940747, "grad_norm": 1.209004282951355, "learning_rate": 9.732441471571907e-07, "loss": 0.2551, "step": 50300 }, { "epoch": 4.848344646940747, "eval_loss": 0.22935496270656586, "eval_runtime": 58.9021, "eval_samples_per_second": 169.773, "eval_steps_per_second": 21.222, "step": 50300 }, { "epoch": 4.857983503097687, "grad_norm": 1.1527981758117676, "learning_rate": 9.632107023411372e-07, "loss": 0.2578, "step": 50400 }, { "epoch": 4.857983503097687, "eval_loss": 0.2258923351764679, "eval_runtime": 58.7394, "eval_samples_per_second": 170.244, "eval_steps_per_second": 21.28, "step": 50400 }, { "epoch": 4.867622359254627, "grad_norm": 1.131640911102295, "learning_rate": 9.531772575250837e-07, "loss": 0.2569, "step": 50500 }, { "epoch": 4.867622359254627, "eval_loss": 0.22396264970302582, "eval_runtime": 58.5609, "eval_samples_per_second": 170.762, "eval_steps_per_second": 21.345, "step": 50500 }, { "epoch": 4.877261215411567, "grad_norm": 1.0976239442825317, "learning_rate": 9.431438127090301e-07, "loss": 0.2551, "step": 50600 }, { "epoch": 4.877261215411567, "eval_loss": 0.22849638760089874, "eval_runtime": 59.0715, "eval_samples_per_second": 169.286, "eval_steps_per_second": 21.161, "step": 50600 }, { "epoch": 4.8869000715685065, "grad_norm": 1.0808651447296143, "learning_rate": 9.331103678929767e-07, "loss": 0.2557, "step": 50700 }, { "epoch": 4.8869000715685065, "eval_loss": 0.2275507152080536, "eval_runtime": 59.1396, "eval_samples_per_second": 169.092, "eval_steps_per_second": 21.136, "step": 50700 }, { "epoch": 4.896538927725447, "grad_norm": 1.1022206544876099, "learning_rate": 9.230769230769231e-07, "loss": 0.2546, "step": 50800 }, { "epoch": 4.896538927725447, "eval_loss": 0.23217861354351044, "eval_runtime": 58.9856, "eval_samples_per_second": 169.533, "eval_steps_per_second": 21.192, "step": 50800 }, { "epoch": 4.906177783882387, "grad_norm": 1.1429067850112915, "learning_rate": 9.130434782608697e-07, "loss": 0.2556, "step": 50900 }, { "epoch": 4.906177783882387, "eval_loss": 0.23015516996383667, "eval_runtime": 58.9649, "eval_samples_per_second": 169.593, "eval_steps_per_second": 21.199, "step": 50900 }, { "epoch": 4.915816640039327, "grad_norm": 1.1336737871170044, "learning_rate": 9.030100334448161e-07, "loss": 0.2557, "step": 51000 }, { "epoch": 4.915816640039327, "eval_loss": 0.22632111608982086, "eval_runtime": 59.164, "eval_samples_per_second": 169.022, "eval_steps_per_second": 21.128, "step": 51000 }, { "epoch": 4.925455496196267, "grad_norm": 1.0541027784347534, "learning_rate": 8.929765886287627e-07, "loss": 0.2534, "step": 51100 }, { "epoch": 4.925455496196267, "eval_loss": 0.2284763604402542, "eval_runtime": 58.9383, "eval_samples_per_second": 169.669, "eval_steps_per_second": 21.209, "step": 51100 }, { "epoch": 4.9350943523532065, "grad_norm": 1.2004669904708862, "learning_rate": 8.829431438127091e-07, "loss": 0.253, "step": 51200 }, { "epoch": 4.9350943523532065, "eval_loss": 0.22993648052215576, "eval_runtime": 58.9372, "eval_samples_per_second": 169.672, "eval_steps_per_second": 21.209, "step": 51200 }, { "epoch": 4.944733208510146, "grad_norm": 1.3005701303482056, "learning_rate": 8.729096989966555e-07, "loss": 0.2551, "step": 51300 }, { "epoch": 4.944733208510146, "eval_loss": 0.22793498635292053, "eval_runtime": 58.9009, "eval_samples_per_second": 169.777, "eval_steps_per_second": 21.222, "step": 51300 }, { "epoch": 4.954372064667086, "grad_norm": 1.110014796257019, "learning_rate": 8.628762541806019e-07, "loss": 0.255, "step": 51400 }, { "epoch": 4.954372064667086, "eval_loss": 0.23050156235694885, "eval_runtime": 59.1011, "eval_samples_per_second": 169.202, "eval_steps_per_second": 21.15, "step": 51400 }, { "epoch": 4.964010920824026, "grad_norm": 1.448729395866394, "learning_rate": 8.528428093645485e-07, "loss": 0.2523, "step": 51500 }, { "epoch": 4.964010920824026, "eval_loss": 0.22872421145439148, "eval_runtime": 58.8714, "eval_samples_per_second": 169.862, "eval_steps_per_second": 21.233, "step": 51500 }, { "epoch": 4.973649776980966, "grad_norm": 1.3377678394317627, "learning_rate": 8.428093645484949e-07, "loss": 0.2566, "step": 51600 }, { "epoch": 4.973649776980966, "eval_loss": 0.22439132630825043, "eval_runtime": 58.9474, "eval_samples_per_second": 169.643, "eval_steps_per_second": 21.205, "step": 51600 }, { "epoch": 4.983288633137906, "grad_norm": 1.082703709602356, "learning_rate": 8.327759197324414e-07, "loss": 0.2551, "step": 51700 }, { "epoch": 4.983288633137906, "eval_loss": 0.2283206284046173, "eval_runtime": 59.1758, "eval_samples_per_second": 168.988, "eval_steps_per_second": 21.124, "step": 51700 }, { "epoch": 4.992927489294845, "grad_norm": 1.1986274719238281, "learning_rate": 8.227424749163879e-07, "loss": 0.2545, "step": 51800 }, { "epoch": 4.992927489294845, "eval_loss": 0.22571945190429688, "eval_runtime": 59.0014, "eval_samples_per_second": 169.488, "eval_steps_per_second": 21.186, "step": 51800 }, { "epoch": 5.002566345451785, "grad_norm": 1.231094479560852, "learning_rate": 8.127090301003344e-07, "loss": 0.2536, "step": 51900 }, { "epoch": 5.002566345451785, "eval_loss": 0.23076747357845306, "eval_runtime": 58.9604, "eval_samples_per_second": 169.605, "eval_steps_per_second": 21.201, "step": 51900 }, { "epoch": 5.012205201608725, "grad_norm": 1.411320686340332, "learning_rate": 8.026755852842809e-07, "loss": 0.2537, "step": 52000 }, { "epoch": 5.012205201608725, "eval_loss": 0.23237231373786926, "eval_runtime": 59.1237, "eval_samples_per_second": 169.137, "eval_steps_per_second": 21.142, "step": 52000 }, { "epoch": 5.021844057765665, "grad_norm": 1.2033196687698364, "learning_rate": 7.926421404682274e-07, "loss": 0.257, "step": 52100 }, { "epoch": 5.021844057765665, "eval_loss": 0.22967660427093506, "eval_runtime": 59.0144, "eval_samples_per_second": 169.45, "eval_steps_per_second": 21.181, "step": 52100 }, { "epoch": 5.031482913922605, "grad_norm": 1.2692068815231323, "learning_rate": 7.826086956521739e-07, "loss": 0.2555, "step": 52200 }, { "epoch": 5.031482913922605, "eval_loss": 0.22523514926433563, "eval_runtime": 58.9471, "eval_samples_per_second": 169.644, "eval_steps_per_second": 21.205, "step": 52200 }, { "epoch": 5.0411217700795445, "grad_norm": 1.094571590423584, "learning_rate": 7.725752508361204e-07, "loss": 0.2515, "step": 52300 }, { "epoch": 5.0411217700795445, "eval_loss": 0.22601811587810516, "eval_runtime": 59.0812, "eval_samples_per_second": 169.259, "eval_steps_per_second": 21.157, "step": 52300 }, { "epoch": 5.050760626236484, "grad_norm": 1.1147675514221191, "learning_rate": 7.625418060200669e-07, "loss": 0.2521, "step": 52400 }, { "epoch": 5.050760626236484, "eval_loss": 0.22480392456054688, "eval_runtime": 58.9665, "eval_samples_per_second": 169.588, "eval_steps_per_second": 21.198, "step": 52400 }, { "epoch": 5.060399482393424, "grad_norm": 1.1311498880386353, "learning_rate": 7.525083612040134e-07, "loss": 0.2553, "step": 52500 }, { "epoch": 5.060399482393424, "eval_loss": 0.22257648408412933, "eval_runtime": 58.9526, "eval_samples_per_second": 169.628, "eval_steps_per_second": 21.203, "step": 52500 }, { "epoch": 5.070038338550364, "grad_norm": 1.062983512878418, "learning_rate": 7.424749163879599e-07, "loss": 0.2523, "step": 52600 }, { "epoch": 5.070038338550364, "eval_loss": 0.2277909219264984, "eval_runtime": 59.1272, "eval_samples_per_second": 169.127, "eval_steps_per_second": 21.141, "step": 52600 }, { "epoch": 5.079677194707304, "grad_norm": 1.1180320978164673, "learning_rate": 7.324414715719064e-07, "loss": 0.2527, "step": 52700 }, { "epoch": 5.079677194707304, "eval_loss": 0.2237856686115265, "eval_runtime": 59.051, "eval_samples_per_second": 169.345, "eval_steps_per_second": 21.168, "step": 52700 }, { "epoch": 5.089316050864244, "grad_norm": 1.1766620874404907, "learning_rate": 7.224080267558529e-07, "loss": 0.2548, "step": 52800 }, { "epoch": 5.089316050864244, "eval_loss": 0.23022255301475525, "eval_runtime": 58.7083, "eval_samples_per_second": 170.334, "eval_steps_per_second": 21.292, "step": 52800 }, { "epoch": 5.0989549070211835, "grad_norm": 1.159372091293335, "learning_rate": 7.123745819397994e-07, "loss": 0.2544, "step": 52900 }, { "epoch": 5.0989549070211835, "eval_loss": 0.227079376578331, "eval_runtime": 58.9425, "eval_samples_per_second": 169.657, "eval_steps_per_second": 21.207, "step": 52900 }, { "epoch": 5.108593763178123, "grad_norm": 1.1550517082214355, "learning_rate": 7.023411371237459e-07, "loss": 0.253, "step": 53000 }, { "epoch": 5.108593763178123, "eval_loss": 0.22270356118679047, "eval_runtime": 59.1037, "eval_samples_per_second": 169.194, "eval_steps_per_second": 21.149, "step": 53000 }, { "epoch": 5.118232619335063, "grad_norm": 1.164044976234436, "learning_rate": 6.923076923076923e-07, "loss": 0.2525, "step": 53100 }, { "epoch": 5.118232619335063, "eval_loss": 0.22594866156578064, "eval_runtime": 58.9608, "eval_samples_per_second": 169.604, "eval_steps_per_second": 21.201, "step": 53100 }, { "epoch": 5.127871475492003, "grad_norm": 1.2590959072113037, "learning_rate": 6.822742474916388e-07, "loss": 0.2546, "step": 53200 }, { "epoch": 5.127871475492003, "eval_loss": 0.2225504219532013, "eval_runtime": 58.9634, "eval_samples_per_second": 169.597, "eval_steps_per_second": 21.2, "step": 53200 }, { "epoch": 5.137510331648944, "grad_norm": 1.1613073348999023, "learning_rate": 6.722408026755853e-07, "loss": 0.2541, "step": 53300 }, { "epoch": 5.137510331648944, "eval_loss": 0.2252965271472931, "eval_runtime": 59.1154, "eval_samples_per_second": 169.161, "eval_steps_per_second": 21.145, "step": 53300 }, { "epoch": 5.1471491878058835, "grad_norm": 1.0958082675933838, "learning_rate": 6.622073578595318e-07, "loss": 0.2535, "step": 53400 }, { "epoch": 5.1471491878058835, "eval_loss": 0.2281467318534851, "eval_runtime": 58.9189, "eval_samples_per_second": 169.725, "eval_steps_per_second": 21.216, "step": 53400 }, { "epoch": 5.156788043962823, "grad_norm": 1.1621479988098145, "learning_rate": 6.521739130434783e-07, "loss": 0.2545, "step": 53500 }, { "epoch": 5.156788043962823, "eval_loss": 0.22485679388046265, "eval_runtime": 59.1361, "eval_samples_per_second": 169.101, "eval_steps_per_second": 21.138, "step": 53500 }, { "epoch": 5.166426900119763, "grad_norm": 1.081497311592102, "learning_rate": 6.421404682274248e-07, "loss": 0.256, "step": 53600 }, { "epoch": 5.166426900119763, "eval_loss": 0.22066909074783325, "eval_runtime": 58.9601, "eval_samples_per_second": 169.606, "eval_steps_per_second": 21.201, "step": 53600 }, { "epoch": 5.176065756276703, "grad_norm": 1.11661696434021, "learning_rate": 6.321070234113712e-07, "loss": 0.2537, "step": 53700 }, { "epoch": 5.176065756276703, "eval_loss": 0.22867466509342194, "eval_runtime": 58.9203, "eval_samples_per_second": 169.721, "eval_steps_per_second": 21.215, "step": 53700 }, { "epoch": 5.185704612433643, "grad_norm": 1.3185731172561646, "learning_rate": 6.220735785953178e-07, "loss": 0.2535, "step": 53800 }, { "epoch": 5.185704612433643, "eval_loss": 0.22424526512622833, "eval_runtime": 59.0971, "eval_samples_per_second": 169.213, "eval_steps_per_second": 21.152, "step": 53800 }, { "epoch": 5.195343468590583, "grad_norm": 1.1543822288513184, "learning_rate": 6.120401337792642e-07, "loss": 0.2525, "step": 53900 }, { "epoch": 5.195343468590583, "eval_loss": 0.22214870154857635, "eval_runtime": 58.93, "eval_samples_per_second": 169.693, "eval_steps_per_second": 21.212, "step": 53900 }, { "epoch": 5.204982324747522, "grad_norm": 1.2196881771087646, "learning_rate": 6.020066889632107e-07, "loss": 0.2519, "step": 54000 }, { "epoch": 5.204982324747522, "eval_loss": 0.22731679677963257, "eval_runtime": 58.9214, "eval_samples_per_second": 169.718, "eval_steps_per_second": 21.215, "step": 54000 }, { "epoch": 5.214621180904462, "grad_norm": 1.3042584657669067, "learning_rate": 5.919732441471572e-07, "loss": 0.2499, "step": 54100 }, { "epoch": 5.214621180904462, "eval_loss": 0.22149991989135742, "eval_runtime": 59.1436, "eval_samples_per_second": 169.08, "eval_steps_per_second": 21.135, "step": 54100 }, { "epoch": 5.224260037061402, "grad_norm": 1.1391198635101318, "learning_rate": 5.819397993311037e-07, "loss": 0.2541, "step": 54200 }, { "epoch": 5.224260037061402, "eval_loss": 0.22547002136707306, "eval_runtime": 58.734, "eval_samples_per_second": 170.259, "eval_steps_per_second": 21.282, "step": 54200 }, { "epoch": 5.233898893218342, "grad_norm": 1.2511396408081055, "learning_rate": 5.719063545150502e-07, "loss": 0.2534, "step": 54300 }, { "epoch": 5.233898893218342, "eval_loss": 0.22946836054325104, "eval_runtime": 58.9665, "eval_samples_per_second": 169.588, "eval_steps_per_second": 21.198, "step": 54300 }, { "epoch": 5.243537749375282, "grad_norm": 1.0912216901779175, "learning_rate": 5.618729096989966e-07, "loss": 0.2506, "step": 54400 }, { "epoch": 5.243537749375282, "eval_loss": 0.22805634140968323, "eval_runtime": 59.1357, "eval_samples_per_second": 169.103, "eval_steps_per_second": 21.138, "step": 54400 }, { "epoch": 5.2531766055322215, "grad_norm": 1.208542823791504, "learning_rate": 5.518394648829431e-07, "loss": 0.2526, "step": 54500 }, { "epoch": 5.2531766055322215, "eval_loss": 0.21685273945331573, "eval_runtime": 58.9643, "eval_samples_per_second": 169.594, "eval_steps_per_second": 21.199, "step": 54500 }, { "epoch": 5.262815461689161, "grad_norm": 1.152645230293274, "learning_rate": 5.418060200668896e-07, "loss": 0.2496, "step": 54600 }, { "epoch": 5.262815461689161, "eval_loss": 0.2244671881198883, "eval_runtime": 59.2534, "eval_samples_per_second": 168.767, "eval_steps_per_second": 21.096, "step": 54600 }, { "epoch": 5.272454317846101, "grad_norm": 1.2127925157546997, "learning_rate": 5.317725752508361e-07, "loss": 0.2501, "step": 54700 }, { "epoch": 5.272454317846101, "eval_loss": 0.2252870351076126, "eval_runtime": 59.0351, "eval_samples_per_second": 169.391, "eval_steps_per_second": 21.174, "step": 54700 }, { "epoch": 5.282093174003041, "grad_norm": 1.1584402322769165, "learning_rate": 5.217391304347826e-07, "loss": 0.2508, "step": 54800 }, { "epoch": 5.282093174003041, "eval_loss": 0.21999992430210114, "eval_runtime": 59.2774, "eval_samples_per_second": 168.698, "eval_steps_per_second": 21.087, "step": 54800 }, { "epoch": 5.291732030159981, "grad_norm": 1.129681944847107, "learning_rate": 5.117056856187291e-07, "loss": 0.2497, "step": 54900 }, { "epoch": 5.291732030159981, "eval_loss": 0.21917253732681274, "eval_runtime": 59.2951, "eval_samples_per_second": 168.648, "eval_steps_per_second": 21.081, "step": 54900 }, { "epoch": 5.301370886316921, "grad_norm": 1.2014105319976807, "learning_rate": 5.016722408026756e-07, "loss": 0.2512, "step": 55000 }, { "epoch": 5.301370886316921, "eval_loss": 0.2266959398984909, "eval_runtime": 59.486, "eval_samples_per_second": 168.107, "eval_steps_per_second": 21.013, "step": 55000 }, { "epoch": 5.3110097424738605, "grad_norm": 1.09111750125885, "learning_rate": 4.916387959866221e-07, "loss": 0.2505, "step": 55100 }, { "epoch": 5.3110097424738605, "eval_loss": 0.22630225121974945, "eval_runtime": 59.3384, "eval_samples_per_second": 168.525, "eval_steps_per_second": 21.066, "step": 55100 }, { "epoch": 5.3206485986308, "grad_norm": 1.3559210300445557, "learning_rate": 4.816053511705686e-07, "loss": 0.2528, "step": 55200 }, { "epoch": 5.3206485986308, "eval_loss": 0.22501616179943085, "eval_runtime": 59.4826, "eval_samples_per_second": 168.117, "eval_steps_per_second": 21.015, "step": 55200 }, { "epoch": 5.33028745478774, "grad_norm": 1.1638729572296143, "learning_rate": 4.7157190635451506e-07, "loss": 0.2494, "step": 55300 }, { "epoch": 5.33028745478774, "eval_loss": 0.2276040017604828, "eval_runtime": 59.3183, "eval_samples_per_second": 168.582, "eval_steps_per_second": 21.073, "step": 55300 }, { "epoch": 5.33992631094468, "grad_norm": 1.1465582847595215, "learning_rate": 4.6153846153846156e-07, "loss": 0.2511, "step": 55400 }, { "epoch": 5.33992631094468, "eval_loss": 0.2249348908662796, "eval_runtime": 59.2762, "eval_samples_per_second": 168.702, "eval_steps_per_second": 21.088, "step": 55400 }, { "epoch": 5.34956516710162, "grad_norm": 1.1704530715942383, "learning_rate": 4.5150501672240806e-07, "loss": 0.251, "step": 55500 }, { "epoch": 5.34956516710162, "eval_loss": 0.22359701991081238, "eval_runtime": 59.4584, "eval_samples_per_second": 168.185, "eval_steps_per_second": 21.023, "step": 55500 }, { "epoch": 5.35920402325856, "grad_norm": 1.1347342729568481, "learning_rate": 4.4147157190635456e-07, "loss": 0.25, "step": 55600 }, { "epoch": 5.35920402325856, "eval_loss": 0.2178959995508194, "eval_runtime": 59.2741, "eval_samples_per_second": 168.708, "eval_steps_per_second": 21.088, "step": 55600 }, { "epoch": 5.368842879415499, "grad_norm": 1.2167783975601196, "learning_rate": 4.3143812709030095e-07, "loss": 0.2485, "step": 55700 }, { "epoch": 5.368842879415499, "eval_loss": 0.22190746665000916, "eval_runtime": 59.42, "eval_samples_per_second": 168.294, "eval_steps_per_second": 21.037, "step": 55700 }, { "epoch": 5.378481735572439, "grad_norm": 1.1554456949234009, "learning_rate": 4.2140468227424745e-07, "loss": 0.2494, "step": 55800 }, { "epoch": 5.378481735572439, "eval_loss": 0.22227539122104645, "eval_runtime": 59.369, "eval_samples_per_second": 168.438, "eval_steps_per_second": 21.055, "step": 55800 }, { "epoch": 5.388120591729379, "grad_norm": 1.052170991897583, "learning_rate": 4.1137123745819395e-07, "loss": 0.2519, "step": 55900 }, { "epoch": 5.388120591729379, "eval_loss": 0.22797919809818268, "eval_runtime": 58.9622, "eval_samples_per_second": 169.6, "eval_steps_per_second": 21.2, "step": 55900 }, { "epoch": 5.397759447886319, "grad_norm": 1.0489306449890137, "learning_rate": 4.0133779264214045e-07, "loss": 0.2501, "step": 56000 }, { "epoch": 5.397759447886319, "eval_loss": 0.2253742218017578, "eval_runtime": 59.1206, "eval_samples_per_second": 169.146, "eval_steps_per_second": 21.143, "step": 56000 }, { "epoch": 5.407398304043259, "grad_norm": 1.158197045326233, "learning_rate": 3.9130434782608694e-07, "loss": 0.2502, "step": 56100 }, { "epoch": 5.407398304043259, "eval_loss": 0.23163315653800964, "eval_runtime": 59.1282, "eval_samples_per_second": 169.124, "eval_steps_per_second": 21.141, "step": 56100 }, { "epoch": 5.417037160200199, "grad_norm": 1.1179620027542114, "learning_rate": 3.8127090301003344e-07, "loss": 0.2516, "step": 56200 }, { "epoch": 5.417037160200199, "eval_loss": 0.22414067387580872, "eval_runtime": 58.9601, "eval_samples_per_second": 169.606, "eval_steps_per_second": 21.201, "step": 56200 }, { "epoch": 5.426676016357139, "grad_norm": 1.0244063138961792, "learning_rate": 3.7123745819397994e-07, "loss": 0.2493, "step": 56300 }, { "epoch": 5.426676016357139, "eval_loss": 0.22705614566802979, "eval_runtime": 58.9344, "eval_samples_per_second": 169.68, "eval_steps_per_second": 21.21, "step": 56300 }, { "epoch": 5.436314872514079, "grad_norm": 1.1290541887283325, "learning_rate": 3.6120401337792644e-07, "loss": 0.2502, "step": 56400 }, { "epoch": 5.436314872514079, "eval_loss": 0.2250737100839615, "eval_runtime": 59.1308, "eval_samples_per_second": 169.117, "eval_steps_per_second": 21.14, "step": 56400 }, { "epoch": 5.445953728671019, "grad_norm": 1.2744230031967163, "learning_rate": 3.5117056856187294e-07, "loss": 0.2511, "step": 56500 }, { "epoch": 5.445953728671019, "eval_loss": 0.22477617859840393, "eval_runtime": 58.9688, "eval_samples_per_second": 169.581, "eval_steps_per_second": 21.198, "step": 56500 }, { "epoch": 5.455592584827959, "grad_norm": 1.1114057302474976, "learning_rate": 3.411371237458194e-07, "loss": 0.2484, "step": 56600 }, { "epoch": 5.455592584827959, "eval_loss": 0.226850688457489, "eval_runtime": 58.9713, "eval_samples_per_second": 169.574, "eval_steps_per_second": 21.197, "step": 56600 }, { "epoch": 5.4652314409848985, "grad_norm": 1.1400434970855713, "learning_rate": 3.311036789297659e-07, "loss": 0.2508, "step": 56700 }, { "epoch": 5.4652314409848985, "eval_loss": 0.22591835260391235, "eval_runtime": 59.1265, "eval_samples_per_second": 169.129, "eval_steps_per_second": 21.141, "step": 56700 }, { "epoch": 5.474870297141838, "grad_norm": 1.2677631378173828, "learning_rate": 3.210702341137124e-07, "loss": 0.249, "step": 56800 }, { "epoch": 5.474870297141838, "eval_loss": 0.22445721924304962, "eval_runtime": 58.9365, "eval_samples_per_second": 169.674, "eval_steps_per_second": 21.209, "step": 56800 }, { "epoch": 5.484509153298778, "grad_norm": 1.1901522874832153, "learning_rate": 3.110367892976589e-07, "loss": 0.2516, "step": 56900 }, { "epoch": 5.484509153298778, "eval_loss": 0.22300176322460175, "eval_runtime": 59.0008, "eval_samples_per_second": 169.489, "eval_steps_per_second": 21.186, "step": 56900 }, { "epoch": 5.494148009455718, "grad_norm": 1.1611310243606567, "learning_rate": 3.010033444816054e-07, "loss": 0.2497, "step": 57000 }, { "epoch": 5.494148009455718, "eval_loss": 0.21740412712097168, "eval_runtime": 59.137, "eval_samples_per_second": 169.099, "eval_steps_per_second": 21.137, "step": 57000 }, { "epoch": 5.503786865612658, "grad_norm": 1.2610938549041748, "learning_rate": 2.9096989966555187e-07, "loss": 0.2476, "step": 57100 }, { "epoch": 5.503786865612658, "eval_loss": 0.22454555332660675, "eval_runtime": 58.7081, "eval_samples_per_second": 170.334, "eval_steps_per_second": 21.292, "step": 57100 }, { "epoch": 5.513425721769598, "grad_norm": 1.2325226068496704, "learning_rate": 2.809364548494983e-07, "loss": 0.2484, "step": 57200 }, { "epoch": 5.513425721769598, "eval_loss": 0.22510981559753418, "eval_runtime": 58.6381, "eval_samples_per_second": 170.538, "eval_steps_per_second": 21.317, "step": 57200 }, { "epoch": 5.5230645779265375, "grad_norm": 1.0809872150421143, "learning_rate": 2.709030100334448e-07, "loss": 0.2482, "step": 57300 }, { "epoch": 5.5230645779265375, "eval_loss": 0.22954057157039642, "eval_runtime": 58.8227, "eval_samples_per_second": 170.002, "eval_steps_per_second": 21.25, "step": 57300 }, { "epoch": 5.532703434083477, "grad_norm": 1.1554908752441406, "learning_rate": 2.608695652173913e-07, "loss": 0.2489, "step": 57400 }, { "epoch": 5.532703434083477, "eval_loss": 0.21920810639858246, "eval_runtime": 58.641, "eval_samples_per_second": 170.529, "eval_steps_per_second": 21.316, "step": 57400 }, { "epoch": 5.542342290240417, "grad_norm": 1.113728404045105, "learning_rate": 2.508361204013378e-07, "loss": 0.249, "step": 57500 }, { "epoch": 5.542342290240417, "eval_loss": 0.22062529623508453, "eval_runtime": 58.656, "eval_samples_per_second": 170.485, "eval_steps_per_second": 21.311, "step": 57500 }, { "epoch": 5.551981146397357, "grad_norm": 1.179052710533142, "learning_rate": 2.408026755852843e-07, "loss": 0.2502, "step": 57600 }, { "epoch": 5.551981146397357, "eval_loss": 0.22298461198806763, "eval_runtime": 58.8967, "eval_samples_per_second": 169.789, "eval_steps_per_second": 21.224, "step": 57600 }, { "epoch": 5.561620002554297, "grad_norm": 1.1483044624328613, "learning_rate": 2.3076923076923078e-07, "loss": 0.2483, "step": 57700 }, { "epoch": 5.561620002554297, "eval_loss": 0.21685902774333954, "eval_runtime": 58.7747, "eval_samples_per_second": 170.141, "eval_steps_per_second": 21.268, "step": 57700 }, { "epoch": 5.571258858711237, "grad_norm": 1.2611775398254395, "learning_rate": 2.2073578595317728e-07, "loss": 0.25, "step": 57800 }, { "epoch": 5.571258858711237, "eval_loss": 0.22439409792423248, "eval_runtime": 58.7479, "eval_samples_per_second": 170.219, "eval_steps_per_second": 21.277, "step": 57800 }, { "epoch": 5.580897714868176, "grad_norm": 1.1589492559432983, "learning_rate": 2.1070234113712372e-07, "loss": 0.2505, "step": 57900 }, { "epoch": 5.580897714868176, "eval_loss": 0.22764070332050323, "eval_runtime": 59.0825, "eval_samples_per_second": 169.255, "eval_steps_per_second": 21.157, "step": 57900 }, { "epoch": 5.590536571025116, "grad_norm": 1.0722376108169556, "learning_rate": 2.0066889632107022e-07, "loss": 0.2493, "step": 58000 }, { "epoch": 5.590536571025116, "eval_loss": 0.22544412314891815, "eval_runtime": 58.7925, "eval_samples_per_second": 170.09, "eval_steps_per_second": 21.261, "step": 58000 }, { "epoch": 5.600175427182056, "grad_norm": 1.146055817604065, "learning_rate": 1.9063545150501672e-07, "loss": 0.2499, "step": 58100 }, { "epoch": 5.600175427182056, "eval_loss": 0.22268901765346527, "eval_runtime": 58.7727, "eval_samples_per_second": 170.147, "eval_steps_per_second": 21.268, "step": 58100 }, { "epoch": 5.609814283338996, "grad_norm": 1.1639162302017212, "learning_rate": 1.8060200668896322e-07, "loss": 0.2499, "step": 58200 }, { "epoch": 5.609814283338996, "eval_loss": 0.21973471343517303, "eval_runtime": 58.9457, "eval_samples_per_second": 169.648, "eval_steps_per_second": 21.206, "step": 58200 }, { "epoch": 5.619453139495936, "grad_norm": 1.1292901039123535, "learning_rate": 1.705685618729097e-07, "loss": 0.2489, "step": 58300 }, { "epoch": 5.619453139495936, "eval_loss": 0.21718740463256836, "eval_runtime": 58.726, "eval_samples_per_second": 170.282, "eval_steps_per_second": 21.285, "step": 58300 }, { "epoch": 5.6290919956528755, "grad_norm": 1.2977691888809204, "learning_rate": 1.605351170568562e-07, "loss": 0.25, "step": 58400 }, { "epoch": 5.6290919956528755, "eval_loss": 0.21809880435466766, "eval_runtime": 58.7411, "eval_samples_per_second": 170.239, "eval_steps_per_second": 21.28, "step": 58400 }, { "epoch": 5.638730851809815, "grad_norm": 1.0982277393341064, "learning_rate": 1.505016722408027e-07, "loss": 0.2482, "step": 58500 }, { "epoch": 5.638730851809815, "eval_loss": 0.22168171405792236, "eval_runtime": 58.7029, "eval_samples_per_second": 170.349, "eval_steps_per_second": 21.294, "step": 58500 }, { "epoch": 5.648369707966756, "grad_norm": 1.0531952381134033, "learning_rate": 1.4046822742474916e-07, "loss": 0.2499, "step": 58600 }, { "epoch": 5.648369707966756, "eval_loss": 0.22686386108398438, "eval_runtime": 58.7385, "eval_samples_per_second": 170.246, "eval_steps_per_second": 21.281, "step": 58600 }, { "epoch": 5.658008564123696, "grad_norm": 1.147268295288086, "learning_rate": 1.3043478260869566e-07, "loss": 0.2518, "step": 58700 }, { "epoch": 5.658008564123696, "eval_loss": 0.2252393513917923, "eval_runtime": 58.8234, "eval_samples_per_second": 170.0, "eval_steps_per_second": 21.25, "step": 58700 }, { "epoch": 5.667647420280636, "grad_norm": 1.221027135848999, "learning_rate": 1.2040133779264215e-07, "loss": 0.2484, "step": 58800 }, { "epoch": 5.667647420280636, "eval_loss": 0.22393923997879028, "eval_runtime": 59.1929, "eval_samples_per_second": 168.939, "eval_steps_per_second": 21.117, "step": 58800 }, { "epoch": 5.6772862764375756, "grad_norm": 1.302181363105774, "learning_rate": 1.1036789297658864e-07, "loss": 0.2477, "step": 58900 }, { "epoch": 5.6772862764375756, "eval_loss": 0.2217906266450882, "eval_runtime": 58.9206, "eval_samples_per_second": 169.72, "eval_steps_per_second": 21.215, "step": 58900 }, { "epoch": 5.686925132594515, "grad_norm": 1.0798256397247314, "learning_rate": 1.0033444816053511e-07, "loss": 0.2489, "step": 59000 }, { "epoch": 5.686925132594515, "eval_loss": 0.21835671365261078, "eval_runtime": 58.9332, "eval_samples_per_second": 169.684, "eval_steps_per_second": 21.21, "step": 59000 }, { "epoch": 5.696563988751455, "grad_norm": 1.053399682044983, "learning_rate": 9.030100334448161e-08, "loss": 0.2489, "step": 59100 }, { "epoch": 5.696563988751455, "eval_loss": 0.2251000553369522, "eval_runtime": 59.1145, "eval_samples_per_second": 169.163, "eval_steps_per_second": 21.145, "step": 59100 }, { "epoch": 5.706202844908395, "grad_norm": 1.1299549341201782, "learning_rate": 8.02675585284281e-08, "loss": 0.2516, "step": 59200 }, { "epoch": 5.706202844908395, "eval_loss": 0.2261471450328827, "eval_runtime": 58.9787, "eval_samples_per_second": 169.553, "eval_steps_per_second": 21.194, "step": 59200 }, { "epoch": 5.715841701065335, "grad_norm": 1.219848394393921, "learning_rate": 7.023411371237458e-08, "loss": 0.2477, "step": 59300 }, { "epoch": 5.715841701065335, "eval_loss": 0.22193090617656708, "eval_runtime": 58.9498, "eval_samples_per_second": 169.636, "eval_steps_per_second": 21.204, "step": 59300 }, { "epoch": 5.725480557222275, "grad_norm": 1.1641128063201904, "learning_rate": 6.020066889632108e-08, "loss": 0.2497, "step": 59400 }, { "epoch": 5.725480557222275, "eval_loss": 0.22315214574337006, "eval_runtime": 59.123, "eval_samples_per_second": 169.139, "eval_steps_per_second": 21.142, "step": 59400 }, { "epoch": 5.7351194133792145, "grad_norm": 1.2197539806365967, "learning_rate": 5.0167224080267556e-08, "loss": 0.2499, "step": 59500 }, { "epoch": 5.7351194133792145, "eval_loss": 0.22189626097679138, "eval_runtime": 58.965, "eval_samples_per_second": 169.592, "eval_steps_per_second": 21.199, "step": 59500 }, { "epoch": 5.744758269536154, "grad_norm": 1.1124192476272583, "learning_rate": 4.013377926421405e-08, "loss": 0.2476, "step": 59600 }, { "epoch": 5.744758269536154, "eval_loss": 0.2227565199136734, "eval_runtime": 58.9658, "eval_samples_per_second": 169.59, "eval_steps_per_second": 21.199, "step": 59600 }, { "epoch": 5.754397125693094, "grad_norm": 1.182589054107666, "learning_rate": 3.010033444816054e-08, "loss": 0.2482, "step": 59700 }, { "epoch": 5.754397125693094, "eval_loss": 0.22147627174854279, "eval_runtime": 59.1135, "eval_samples_per_second": 169.166, "eval_steps_per_second": 21.146, "step": 59700 }, { "epoch": 5.764035981850034, "grad_norm": 1.1507868766784668, "learning_rate": 2.0066889632107024e-08, "loss": 0.2471, "step": 59800 }, { "epoch": 5.764035981850034, "eval_loss": 0.22146035730838776, "eval_runtime": 58.9479, "eval_samples_per_second": 169.641, "eval_steps_per_second": 21.205, "step": 59800 }, { "epoch": 5.773674838006974, "grad_norm": 1.1648634672164917, "learning_rate": 1.0033444816053512e-08, "loss": 0.2476, "step": 59900 }, { "epoch": 5.773674838006974, "eval_loss": 0.22508075833320618, "eval_runtime": 58.9459, "eval_samples_per_second": 169.647, "eval_steps_per_second": 21.206, "step": 59900 }, { "epoch": 5.783313694163914, "grad_norm": 1.0512384176254272, "learning_rate": 0.0, "loss": 0.2481, "step": 60000 }, { "epoch": 5.783313694163914, "eval_loss": 0.2277609407901764, "eval_runtime": 59.3544, "eval_samples_per_second": 168.48, "eval_steps_per_second": 21.06, "step": 60000 } ], "logging_steps": 100, "max_steps": 60000, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.3925574648642826e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }