diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,83538 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 11928, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00025150905432595576, + "grad_norm": 5.952406406402588, + "learning_rate": 8.382229673093043e-09, + "loss": 0.8367, + "step": 1 + }, + { + "epoch": 0.0005030181086519115, + "grad_norm": 6.153665542602539, + "learning_rate": 1.6764459346186086e-08, + "loss": 0.8394, + "step": 2 + }, + { + "epoch": 0.0007545271629778672, + "grad_norm": 5.872622966766357, + "learning_rate": 2.5146689019279132e-08, + "loss": 0.8047, + "step": 3 + }, + { + "epoch": 0.001006036217303823, + "grad_norm": 6.132771968841553, + "learning_rate": 3.352891869237217e-08, + "loss": 0.8308, + "step": 4 + }, + { + "epoch": 0.0012575452716297787, + "grad_norm": 6.068148612976074, + "learning_rate": 4.191114836546522e-08, + "loss": 0.7871, + "step": 5 + }, + { + "epoch": 0.0015090543259557343, + "grad_norm": 5.759831428527832, + "learning_rate": 5.0293378038558264e-08, + "loss": 0.8009, + "step": 6 + }, + { + "epoch": 0.0017605633802816902, + "grad_norm": 6.06758975982666, + "learning_rate": 5.8675607711651307e-08, + "loss": 0.8295, + "step": 7 + }, + { + "epoch": 0.002012072434607646, + "grad_norm": 6.002471923828125, + "learning_rate": 6.705783738474434e-08, + "loss": 0.8253, + "step": 8 + }, + { + "epoch": 0.0022635814889336017, + "grad_norm": 6.0556440353393555, + "learning_rate": 7.544006705783739e-08, + "loss": 0.8259, + "step": 9 + }, + { + "epoch": 0.0025150905432595573, + "grad_norm": 5.968860626220703, + "learning_rate": 8.382229673093044e-08, + "loss": 0.8368, + "step": 10 + }, + { + "epoch": 0.002766599597585513, + "grad_norm": 5.972998142242432, + "learning_rate": 9.220452640402346e-08, + "loss": 0.8222, + "step": 11 + }, + { + "epoch": 0.0030181086519114686, + "grad_norm": 5.935672760009766, + "learning_rate": 1.0058675607711653e-07, + "loss": 0.8099, + "step": 12 + }, + { + "epoch": 0.0032696177062374247, + "grad_norm": 6.067246913909912, + "learning_rate": 1.0896898575020955e-07, + "loss": 0.832, + "step": 13 + }, + { + "epoch": 0.0035211267605633804, + "grad_norm": 6.090888977050781, + "learning_rate": 1.1735121542330261e-07, + "loss": 0.8276, + "step": 14 + }, + { + "epoch": 0.003772635814889336, + "grad_norm": 5.8601837158203125, + "learning_rate": 1.2573344509639564e-07, + "loss": 0.8273, + "step": 15 + }, + { + "epoch": 0.004024144869215292, + "grad_norm": 6.011853218078613, + "learning_rate": 1.3411567476948869e-07, + "loss": 0.8194, + "step": 16 + }, + { + "epoch": 0.004275653923541248, + "grad_norm": 5.5620808601379395, + "learning_rate": 1.4249790444258174e-07, + "loss": 0.791, + "step": 17 + }, + { + "epoch": 0.004527162977867203, + "grad_norm": 5.906956672668457, + "learning_rate": 1.5088013411567478e-07, + "loss": 0.8452, + "step": 18 + }, + { + "epoch": 0.004778672032193159, + "grad_norm": 5.789212226867676, + "learning_rate": 1.592623637887678e-07, + "loss": 0.8181, + "step": 19 + }, + { + "epoch": 0.005030181086519115, + "grad_norm": 5.891463756561279, + "learning_rate": 1.6764459346186088e-07, + "loss": 0.8255, + "step": 20 + }, + { + "epoch": 0.00528169014084507, + "grad_norm": 5.891812801361084, + "learning_rate": 1.7602682313495393e-07, + "loss": 0.8251, + "step": 21 + }, + { + "epoch": 0.005533199195171026, + "grad_norm": 5.88001823425293, + "learning_rate": 1.8440905280804693e-07, + "loss": 0.8264, + "step": 22 + }, + { + "epoch": 0.005784708249496982, + "grad_norm": 5.622618675231934, + "learning_rate": 1.9279128248113998e-07, + "loss": 0.7947, + "step": 23 + }, + { + "epoch": 0.006036217303822937, + "grad_norm": 5.888658046722412, + "learning_rate": 2.0117351215423305e-07, + "loss": 0.843, + "step": 24 + }, + { + "epoch": 0.006287726358148894, + "grad_norm": 5.512774467468262, + "learning_rate": 2.095557418273261e-07, + "loss": 0.7908, + "step": 25 + }, + { + "epoch": 0.006539235412474849, + "grad_norm": 5.594388484954834, + "learning_rate": 2.179379715004191e-07, + "loss": 0.8071, + "step": 26 + }, + { + "epoch": 0.006790744466800805, + "grad_norm": 5.4865617752075195, + "learning_rate": 2.2632020117351218e-07, + "loss": 0.7797, + "step": 27 + }, + { + "epoch": 0.007042253521126761, + "grad_norm": 5.4088969230651855, + "learning_rate": 2.3470243084660523e-07, + "loss": 0.7925, + "step": 28 + }, + { + "epoch": 0.007293762575452716, + "grad_norm": 5.587332248687744, + "learning_rate": 2.430846605196983e-07, + "loss": 0.7799, + "step": 29 + }, + { + "epoch": 0.007545271629778672, + "grad_norm": 5.692927837371826, + "learning_rate": 2.5146689019279127e-07, + "loss": 0.8255, + "step": 30 + }, + { + "epoch": 0.007796780684104628, + "grad_norm": 5.381591796875, + "learning_rate": 2.598491198658843e-07, + "loss": 0.8173, + "step": 31 + }, + { + "epoch": 0.008048289738430584, + "grad_norm": 5.045378684997559, + "learning_rate": 2.6823134953897737e-07, + "loss": 0.7774, + "step": 32 + }, + { + "epoch": 0.00829979879275654, + "grad_norm": 4.887575626373291, + "learning_rate": 2.766135792120704e-07, + "loss": 0.7928, + "step": 33 + }, + { + "epoch": 0.008551307847082495, + "grad_norm": 4.738620281219482, + "learning_rate": 2.8499580888516347e-07, + "loss": 0.768, + "step": 34 + }, + { + "epoch": 0.008802816901408451, + "grad_norm": 4.525857448577881, + "learning_rate": 2.933780385582565e-07, + "loss": 0.7588, + "step": 35 + }, + { + "epoch": 0.009054325955734407, + "grad_norm": 4.540877342224121, + "learning_rate": 3.0176026823134957e-07, + "loss": 0.7746, + "step": 36 + }, + { + "epoch": 0.009305835010060362, + "grad_norm": 4.293127059936523, + "learning_rate": 3.101424979044426e-07, + "loss": 0.7452, + "step": 37 + }, + { + "epoch": 0.009557344064386318, + "grad_norm": 4.619227409362793, + "learning_rate": 3.185247275775356e-07, + "loss": 0.764, + "step": 38 + }, + { + "epoch": 0.009808853118712274, + "grad_norm": 4.370224475860596, + "learning_rate": 3.269069572506287e-07, + "loss": 0.7549, + "step": 39 + }, + { + "epoch": 0.01006036217303823, + "grad_norm": 4.469829082489014, + "learning_rate": 3.3528918692372177e-07, + "loss": 0.7507, + "step": 40 + }, + { + "epoch": 0.010311871227364185, + "grad_norm": 4.509310722351074, + "learning_rate": 3.4367141659681476e-07, + "loss": 0.753, + "step": 41 + }, + { + "epoch": 0.01056338028169014, + "grad_norm": 4.393963813781738, + "learning_rate": 3.5205364626990787e-07, + "loss": 0.7567, + "step": 42 + }, + { + "epoch": 0.010814889336016096, + "grad_norm": 4.315878391265869, + "learning_rate": 3.6043587594300086e-07, + "loss": 0.7331, + "step": 43 + }, + { + "epoch": 0.011066398390342052, + "grad_norm": 3.5298051834106445, + "learning_rate": 3.6881810561609386e-07, + "loss": 0.7346, + "step": 44 + }, + { + "epoch": 0.011317907444668008, + "grad_norm": 2.8748891353607178, + "learning_rate": 3.7720033528918696e-07, + "loss": 0.6869, + "step": 45 + }, + { + "epoch": 0.011569416498993963, + "grad_norm": 2.6105713844299316, + "learning_rate": 3.8558256496227996e-07, + "loss": 0.7293, + "step": 46 + }, + { + "epoch": 0.011820925553319919, + "grad_norm": 2.637141704559326, + "learning_rate": 3.9396479463537306e-07, + "loss": 0.7522, + "step": 47 + }, + { + "epoch": 0.012072434607645875, + "grad_norm": 2.4601848125457764, + "learning_rate": 4.023470243084661e-07, + "loss": 0.7251, + "step": 48 + }, + { + "epoch": 0.01232394366197183, + "grad_norm": 2.4165666103363037, + "learning_rate": 4.107292539815591e-07, + "loss": 0.6959, + "step": 49 + }, + { + "epoch": 0.012575452716297788, + "grad_norm": 2.4845943450927734, + "learning_rate": 4.191114836546522e-07, + "loss": 0.7036, + "step": 50 + }, + { + "epoch": 0.012826961770623743, + "grad_norm": 2.3925764560699463, + "learning_rate": 4.274937133277452e-07, + "loss": 0.719, + "step": 51 + }, + { + "epoch": 0.013078470824949699, + "grad_norm": 2.3824775218963623, + "learning_rate": 4.358759430008382e-07, + "loss": 0.673, + "step": 52 + }, + { + "epoch": 0.013329979879275655, + "grad_norm": 2.228027820587158, + "learning_rate": 4.442581726739313e-07, + "loss": 0.7147, + "step": 53 + }, + { + "epoch": 0.01358148893360161, + "grad_norm": 2.108171224594116, + "learning_rate": 4.5264040234702435e-07, + "loss": 0.712, + "step": 54 + }, + { + "epoch": 0.013832997987927566, + "grad_norm": 1.9733554124832153, + "learning_rate": 4.610226320201174e-07, + "loss": 0.6986, + "step": 55 + }, + { + "epoch": 0.014084507042253521, + "grad_norm": 1.9992328882217407, + "learning_rate": 4.6940486169321045e-07, + "loss": 0.6853, + "step": 56 + }, + { + "epoch": 0.014336016096579477, + "grad_norm": 1.991721749305725, + "learning_rate": 4.777870913663035e-07, + "loss": 0.6935, + "step": 57 + }, + { + "epoch": 0.014587525150905433, + "grad_norm": 1.8876029253005981, + "learning_rate": 4.861693210393966e-07, + "loss": 0.686, + "step": 58 + }, + { + "epoch": 0.014839034205231388, + "grad_norm": 1.6275702714920044, + "learning_rate": 4.945515507124896e-07, + "loss": 0.6991, + "step": 59 + }, + { + "epoch": 0.015090543259557344, + "grad_norm": 1.4490710496902466, + "learning_rate": 5.029337803855825e-07, + "loss": 0.7004, + "step": 60 + }, + { + "epoch": 0.0153420523138833, + "grad_norm": 1.453107476234436, + "learning_rate": 5.113160100586757e-07, + "loss": 0.6786, + "step": 61 + }, + { + "epoch": 0.015593561368209255, + "grad_norm": 1.5337774753570557, + "learning_rate": 5.196982397317686e-07, + "loss": 0.6438, + "step": 62 + }, + { + "epoch": 0.01584507042253521, + "grad_norm": 1.4964603185653687, + "learning_rate": 5.280804694048618e-07, + "loss": 0.6416, + "step": 63 + }, + { + "epoch": 0.01609657947686117, + "grad_norm": 1.6416826248168945, + "learning_rate": 5.364626990779547e-07, + "loss": 0.6863, + "step": 64 + }, + { + "epoch": 0.016348088531187122, + "grad_norm": 1.6651757955551147, + "learning_rate": 5.448449287510478e-07, + "loss": 0.6805, + "step": 65 + }, + { + "epoch": 0.01659959758551308, + "grad_norm": 1.5026111602783203, + "learning_rate": 5.532271584241408e-07, + "loss": 0.6329, + "step": 66 + }, + { + "epoch": 0.016851106639839034, + "grad_norm": 1.5439717769622803, + "learning_rate": 5.616093880972339e-07, + "loss": 0.6221, + "step": 67 + }, + { + "epoch": 0.01710261569416499, + "grad_norm": 1.5488256216049194, + "learning_rate": 5.699916177703269e-07, + "loss": 0.657, + "step": 68 + }, + { + "epoch": 0.017354124748490945, + "grad_norm": 1.4593708515167236, + "learning_rate": 5.7837384744342e-07, + "loss": 0.6557, + "step": 69 + }, + { + "epoch": 0.017605633802816902, + "grad_norm": 1.368078589439392, + "learning_rate": 5.86756077116513e-07, + "loss": 0.6378, + "step": 70 + }, + { + "epoch": 0.017857142857142856, + "grad_norm": 1.2166882753372192, + "learning_rate": 5.951383067896061e-07, + "loss": 0.6182, + "step": 71 + }, + { + "epoch": 0.018108651911468814, + "grad_norm": 1.1713978052139282, + "learning_rate": 6.035205364626991e-07, + "loss": 0.6465, + "step": 72 + }, + { + "epoch": 0.018360160965794767, + "grad_norm": 1.0981969833374023, + "learning_rate": 6.119027661357922e-07, + "loss": 0.6062, + "step": 73 + }, + { + "epoch": 0.018611670020120725, + "grad_norm": 1.0137531757354736, + "learning_rate": 6.202849958088852e-07, + "loss": 0.6123, + "step": 74 + }, + { + "epoch": 0.01886317907444668, + "grad_norm": 1.0327494144439697, + "learning_rate": 6.286672254819783e-07, + "loss": 0.6315, + "step": 75 + }, + { + "epoch": 0.019114688128772636, + "grad_norm": 0.9916748404502869, + "learning_rate": 6.370494551550712e-07, + "loss": 0.6504, + "step": 76 + }, + { + "epoch": 0.01936619718309859, + "grad_norm": 0.9969479441642761, + "learning_rate": 6.454316848281643e-07, + "loss": 0.6598, + "step": 77 + }, + { + "epoch": 0.019617706237424547, + "grad_norm": 0.9439221620559692, + "learning_rate": 6.538139145012574e-07, + "loss": 0.6136, + "step": 78 + }, + { + "epoch": 0.0198692152917505, + "grad_norm": 0.9918829202651978, + "learning_rate": 6.621961441743505e-07, + "loss": 0.6273, + "step": 79 + }, + { + "epoch": 0.02012072434607646, + "grad_norm": 0.8537020683288574, + "learning_rate": 6.705783738474435e-07, + "loss": 0.6015, + "step": 80 + }, + { + "epoch": 0.020372233400402416, + "grad_norm": 0.8386716246604919, + "learning_rate": 6.789606035205365e-07, + "loss": 0.6091, + "step": 81 + }, + { + "epoch": 0.02062374245472837, + "grad_norm": 0.8146318793296814, + "learning_rate": 6.873428331936295e-07, + "loss": 0.6225, + "step": 82 + }, + { + "epoch": 0.020875251509054327, + "grad_norm": 0.7194071412086487, + "learning_rate": 6.957250628667227e-07, + "loss": 0.6023, + "step": 83 + }, + { + "epoch": 0.02112676056338028, + "grad_norm": 0.672926127910614, + "learning_rate": 7.041072925398157e-07, + "loss": 0.572, + "step": 84 + }, + { + "epoch": 0.02137826961770624, + "grad_norm": 0.729033887386322, + "learning_rate": 7.124895222129087e-07, + "loss": 0.6156, + "step": 85 + }, + { + "epoch": 0.021629778672032193, + "grad_norm": 0.6650065779685974, + "learning_rate": 7.208717518860017e-07, + "loss": 0.5969, + "step": 86 + }, + { + "epoch": 0.02188128772635815, + "grad_norm": 0.7430527806282043, + "learning_rate": 7.292539815590948e-07, + "loss": 0.583, + "step": 87 + }, + { + "epoch": 0.022132796780684104, + "grad_norm": 0.7079744338989258, + "learning_rate": 7.376362112321877e-07, + "loss": 0.5836, + "step": 88 + }, + { + "epoch": 0.02238430583501006, + "grad_norm": 0.6756497025489807, + "learning_rate": 7.460184409052809e-07, + "loss": 0.5745, + "step": 89 + }, + { + "epoch": 0.022635814889336015, + "grad_norm": 0.655807614326477, + "learning_rate": 7.544006705783739e-07, + "loss": 0.5946, + "step": 90 + }, + { + "epoch": 0.022887323943661973, + "grad_norm": 0.5830637216567993, + "learning_rate": 7.62782900251467e-07, + "loss": 0.5515, + "step": 91 + }, + { + "epoch": 0.023138832997987926, + "grad_norm": 0.6083541512489319, + "learning_rate": 7.711651299245599e-07, + "loss": 0.6012, + "step": 92 + }, + { + "epoch": 0.023390342052313884, + "grad_norm": 0.5965361595153809, + "learning_rate": 7.79547359597653e-07, + "loss": 0.6039, + "step": 93 + }, + { + "epoch": 0.023641851106639838, + "grad_norm": 0.5302997827529907, + "learning_rate": 7.879295892707461e-07, + "loss": 0.5848, + "step": 94 + }, + { + "epoch": 0.023893360160965795, + "grad_norm": 0.6046608090400696, + "learning_rate": 7.963118189438392e-07, + "loss": 0.5951, + "step": 95 + }, + { + "epoch": 0.02414486921529175, + "grad_norm": 0.5611419677734375, + "learning_rate": 8.046940486169322e-07, + "loss": 0.545, + "step": 96 + }, + { + "epoch": 0.024396378269617706, + "grad_norm": 0.5234445929527283, + "learning_rate": 8.130762782900252e-07, + "loss": 0.5226, + "step": 97 + }, + { + "epoch": 0.02464788732394366, + "grad_norm": 0.564545750617981, + "learning_rate": 8.214585079631182e-07, + "loss": 0.5596, + "step": 98 + }, + { + "epoch": 0.024899396378269618, + "grad_norm": 0.5597972273826599, + "learning_rate": 8.298407376362114e-07, + "loss": 0.5828, + "step": 99 + }, + { + "epoch": 0.025150905432595575, + "grad_norm": 0.5264250040054321, + "learning_rate": 8.382229673093044e-07, + "loss": 0.5531, + "step": 100 + }, + { + "epoch": 0.02540241448692153, + "grad_norm": 0.5365298390388489, + "learning_rate": 8.466051969823974e-07, + "loss": 0.5539, + "step": 101 + }, + { + "epoch": 0.025653923541247486, + "grad_norm": 0.5420358180999756, + "learning_rate": 8.549874266554904e-07, + "loss": 0.5675, + "step": 102 + }, + { + "epoch": 0.02590543259557344, + "grad_norm": 0.48730742931365967, + "learning_rate": 8.633696563285835e-07, + "loss": 0.5321, + "step": 103 + }, + { + "epoch": 0.026156941649899398, + "grad_norm": 0.47938722372055054, + "learning_rate": 8.717518860016764e-07, + "loss": 0.5236, + "step": 104 + }, + { + "epoch": 0.02640845070422535, + "grad_norm": 0.4556962549686432, + "learning_rate": 8.801341156747697e-07, + "loss": 0.5348, + "step": 105 + }, + { + "epoch": 0.02665995975855131, + "grad_norm": 0.5305030941963196, + "learning_rate": 8.885163453478626e-07, + "loss": 0.5567, + "step": 106 + }, + { + "epoch": 0.026911468812877263, + "grad_norm": 0.5030955672264099, + "learning_rate": 8.968985750209557e-07, + "loss": 0.5235, + "step": 107 + }, + { + "epoch": 0.02716297786720322, + "grad_norm": 0.45537424087524414, + "learning_rate": 9.052808046940487e-07, + "loss": 0.5289, + "step": 108 + }, + { + "epoch": 0.027414486921529174, + "grad_norm": 0.4804135859012604, + "learning_rate": 9.136630343671417e-07, + "loss": 0.5481, + "step": 109 + }, + { + "epoch": 0.02766599597585513, + "grad_norm": 0.4467982351779938, + "learning_rate": 9.220452640402348e-07, + "loss": 0.5437, + "step": 110 + }, + { + "epoch": 0.027917505030181086, + "grad_norm": 0.5055573582649231, + "learning_rate": 9.304274937133279e-07, + "loss": 0.5313, + "step": 111 + }, + { + "epoch": 0.028169014084507043, + "grad_norm": 0.5437252521514893, + "learning_rate": 9.388097233864209e-07, + "loss": 0.538, + "step": 112 + }, + { + "epoch": 0.028420523138832997, + "grad_norm": 0.44438403844833374, + "learning_rate": 9.471919530595138e-07, + "loss": 0.5469, + "step": 113 + }, + { + "epoch": 0.028672032193158954, + "grad_norm": 0.45675307512283325, + "learning_rate": 9.55574182732607e-07, + "loss": 0.5007, + "step": 114 + }, + { + "epoch": 0.028923541247484908, + "grad_norm": 0.44954171776771545, + "learning_rate": 9.639564124056998e-07, + "loss": 0.5704, + "step": 115 + }, + { + "epoch": 0.029175050301810865, + "grad_norm": 0.4356625974178314, + "learning_rate": 9.72338642078793e-07, + "loss": 0.5339, + "step": 116 + }, + { + "epoch": 0.02942655935613682, + "grad_norm": 0.44852906465530396, + "learning_rate": 9.807208717518862e-07, + "loss": 0.5344, + "step": 117 + }, + { + "epoch": 0.029678068410462777, + "grad_norm": 0.5255141258239746, + "learning_rate": 9.891031014249792e-07, + "loss": 0.5457, + "step": 118 + }, + { + "epoch": 0.02992957746478873, + "grad_norm": 0.4613071084022522, + "learning_rate": 9.97485331098072e-07, + "loss": 0.566, + "step": 119 + }, + { + "epoch": 0.030181086519114688, + "grad_norm": 0.4343860149383545, + "learning_rate": 1.005867560771165e-06, + "loss": 0.5314, + "step": 120 + }, + { + "epoch": 0.030432595573440645, + "grad_norm": 0.4519299864768982, + "learning_rate": 1.0142497904442584e-06, + "loss": 0.5543, + "step": 121 + }, + { + "epoch": 0.0306841046277666, + "grad_norm": 0.45609790086746216, + "learning_rate": 1.0226320201173514e-06, + "loss": 0.5292, + "step": 122 + }, + { + "epoch": 0.030935613682092557, + "grad_norm": 0.449398010969162, + "learning_rate": 1.0310142497904444e-06, + "loss": 0.5476, + "step": 123 + }, + { + "epoch": 0.03118712273641851, + "grad_norm": 0.45544400811195374, + "learning_rate": 1.0393964794635373e-06, + "loss": 0.5264, + "step": 124 + }, + { + "epoch": 0.031438631790744465, + "grad_norm": 0.4554864466190338, + "learning_rate": 1.0477787091366303e-06, + "loss": 0.5316, + "step": 125 + }, + { + "epoch": 0.03169014084507042, + "grad_norm": 0.42402562499046326, + "learning_rate": 1.0561609388097236e-06, + "loss": 0.5422, + "step": 126 + }, + { + "epoch": 0.03194164989939638, + "grad_norm": 0.47918954491615295, + "learning_rate": 1.0645431684828166e-06, + "loss": 0.5639, + "step": 127 + }, + { + "epoch": 0.03219315895372234, + "grad_norm": 0.44865623116493225, + "learning_rate": 1.0729253981559095e-06, + "loss": 0.5292, + "step": 128 + }, + { + "epoch": 0.03244466800804829, + "grad_norm": 0.4049345850944519, + "learning_rate": 1.0813076278290025e-06, + "loss": 0.5318, + "step": 129 + }, + { + "epoch": 0.032696177062374245, + "grad_norm": 0.4720096290111542, + "learning_rate": 1.0896898575020956e-06, + "loss": 0.5059, + "step": 130 + }, + { + "epoch": 0.0329476861167002, + "grad_norm": 0.4590936005115509, + "learning_rate": 1.0980720871751886e-06, + "loss": 0.4943, + "step": 131 + }, + { + "epoch": 0.03319919517102616, + "grad_norm": 0.4278355538845062, + "learning_rate": 1.1064543168482817e-06, + "loss": 0.5273, + "step": 132 + }, + { + "epoch": 0.03345070422535211, + "grad_norm": 0.4701170325279236, + "learning_rate": 1.1148365465213747e-06, + "loss": 0.5292, + "step": 133 + }, + { + "epoch": 0.03370221327967807, + "grad_norm": 0.46045827865600586, + "learning_rate": 1.1232187761944678e-06, + "loss": 0.5208, + "step": 134 + }, + { + "epoch": 0.033953722334004025, + "grad_norm": 0.4302027225494385, + "learning_rate": 1.1316010058675608e-06, + "loss": 0.5287, + "step": 135 + }, + { + "epoch": 0.03420523138832998, + "grad_norm": 0.510970413684845, + "learning_rate": 1.1399832355406539e-06, + "loss": 0.5514, + "step": 136 + }, + { + "epoch": 0.03445674044265593, + "grad_norm": 0.4021950364112854, + "learning_rate": 1.148365465213747e-06, + "loss": 0.5356, + "step": 137 + }, + { + "epoch": 0.03470824949698189, + "grad_norm": 0.4260016083717346, + "learning_rate": 1.15674769488684e-06, + "loss": 0.5668, + "step": 138 + }, + { + "epoch": 0.03495975855130785, + "grad_norm": 0.4197461009025574, + "learning_rate": 1.165129924559933e-06, + "loss": 0.5274, + "step": 139 + }, + { + "epoch": 0.035211267605633804, + "grad_norm": 0.4209143817424774, + "learning_rate": 1.173512154233026e-06, + "loss": 0.5337, + "step": 140 + }, + { + "epoch": 0.03546277665995976, + "grad_norm": 0.4232824742794037, + "learning_rate": 1.1818943839061191e-06, + "loss": 0.5264, + "step": 141 + }, + { + "epoch": 0.03571428571428571, + "grad_norm": 0.45739907026290894, + "learning_rate": 1.1902766135792122e-06, + "loss": 0.5232, + "step": 142 + }, + { + "epoch": 0.03596579476861167, + "grad_norm": 0.4441063702106476, + "learning_rate": 1.1986588432523052e-06, + "loss": 0.5409, + "step": 143 + }, + { + "epoch": 0.03621730382293763, + "grad_norm": 0.43283092975616455, + "learning_rate": 1.2070410729253983e-06, + "loss": 0.5167, + "step": 144 + }, + { + "epoch": 0.036468812877263584, + "grad_norm": 0.4424374997615814, + "learning_rate": 1.2154233025984913e-06, + "loss": 0.5366, + "step": 145 + }, + { + "epoch": 0.036720321931589535, + "grad_norm": 0.5044413805007935, + "learning_rate": 1.2238055322715844e-06, + "loss": 0.5398, + "step": 146 + }, + { + "epoch": 0.03697183098591549, + "grad_norm": 0.46940240263938904, + "learning_rate": 1.2321877619446772e-06, + "loss": 0.499, + "step": 147 + }, + { + "epoch": 0.03722334004024145, + "grad_norm": 0.44987642765045166, + "learning_rate": 1.2405699916177705e-06, + "loss": 0.5116, + "step": 148 + }, + { + "epoch": 0.03747484909456741, + "grad_norm": 0.4250459671020508, + "learning_rate": 1.2489522212908635e-06, + "loss": 0.5055, + "step": 149 + }, + { + "epoch": 0.03772635814889336, + "grad_norm": 0.4617857038974762, + "learning_rate": 1.2573344509639566e-06, + "loss": 0.5347, + "step": 150 + }, + { + "epoch": 0.037977867203219315, + "grad_norm": 0.48853859305381775, + "learning_rate": 1.2657166806370496e-06, + "loss": 0.5067, + "step": 151 + }, + { + "epoch": 0.03822937625754527, + "grad_norm": 0.4387054145336151, + "learning_rate": 1.2740989103101425e-06, + "loss": 0.5091, + "step": 152 + }, + { + "epoch": 0.03848088531187123, + "grad_norm": 0.41969773173332214, + "learning_rate": 1.2824811399832357e-06, + "loss": 0.495, + "step": 153 + }, + { + "epoch": 0.03873239436619718, + "grad_norm": 0.43173882365226746, + "learning_rate": 1.2908633696563286e-06, + "loss": 0.4991, + "step": 154 + }, + { + "epoch": 0.03898390342052314, + "grad_norm": 0.4111224412918091, + "learning_rate": 1.2992455993294218e-06, + "loss": 0.5335, + "step": 155 + }, + { + "epoch": 0.039235412474849095, + "grad_norm": 0.40816769003868103, + "learning_rate": 1.3076278290025149e-06, + "loss": 0.5024, + "step": 156 + }, + { + "epoch": 0.03948692152917505, + "grad_norm": 0.41955387592315674, + "learning_rate": 1.3160100586756077e-06, + "loss": 0.5378, + "step": 157 + }, + { + "epoch": 0.039738430583501, + "grad_norm": 0.4666993021965027, + "learning_rate": 1.324392288348701e-06, + "loss": 0.5112, + "step": 158 + }, + { + "epoch": 0.03998993963782696, + "grad_norm": 0.41977325081825256, + "learning_rate": 1.3327745180217938e-06, + "loss": 0.5198, + "step": 159 + }, + { + "epoch": 0.04024144869215292, + "grad_norm": 0.4063010811805725, + "learning_rate": 1.341156747694887e-06, + "loss": 0.4893, + "step": 160 + }, + { + "epoch": 0.040492957746478875, + "grad_norm": 0.42728182673454285, + "learning_rate": 1.3495389773679801e-06, + "loss": 0.5049, + "step": 161 + }, + { + "epoch": 0.04074446680080483, + "grad_norm": 0.4450185298919678, + "learning_rate": 1.357921207041073e-06, + "loss": 0.5015, + "step": 162 + }, + { + "epoch": 0.04099597585513078, + "grad_norm": 0.3846238851547241, + "learning_rate": 1.3663034367141662e-06, + "loss": 0.515, + "step": 163 + }, + { + "epoch": 0.04124748490945674, + "grad_norm": 0.4138552248477936, + "learning_rate": 1.374685666387259e-06, + "loss": 0.5249, + "step": 164 + }, + { + "epoch": 0.0414989939637827, + "grad_norm": 0.4613569378852844, + "learning_rate": 1.383067896060352e-06, + "loss": 0.5409, + "step": 165 + }, + { + "epoch": 0.041750503018108655, + "grad_norm": 0.40616121888160706, + "learning_rate": 1.3914501257334454e-06, + "loss": 0.503, + "step": 166 + }, + { + "epoch": 0.042002012072434605, + "grad_norm": 0.4463784098625183, + "learning_rate": 1.3998323554065382e-06, + "loss": 0.5177, + "step": 167 + }, + { + "epoch": 0.04225352112676056, + "grad_norm": 0.40528833866119385, + "learning_rate": 1.4082145850796315e-06, + "loss": 0.4995, + "step": 168 + }, + { + "epoch": 0.04250503018108652, + "grad_norm": 0.41246384382247925, + "learning_rate": 1.4165968147527243e-06, + "loss": 0.5055, + "step": 169 + }, + { + "epoch": 0.04275653923541248, + "grad_norm": 0.4155355989933014, + "learning_rate": 1.4249790444258174e-06, + "loss": 0.5046, + "step": 170 + }, + { + "epoch": 0.04300804828973843, + "grad_norm": 0.4172171354293823, + "learning_rate": 1.4333612740989102e-06, + "loss": 0.4889, + "step": 171 + }, + { + "epoch": 0.043259557344064385, + "grad_norm": 0.40003055334091187, + "learning_rate": 1.4417435037720035e-06, + "loss": 0.4965, + "step": 172 + }, + { + "epoch": 0.04351106639839034, + "grad_norm": 0.48593074083328247, + "learning_rate": 1.4501257334450967e-06, + "loss": 0.4937, + "step": 173 + }, + { + "epoch": 0.0437625754527163, + "grad_norm": 0.41993674635887146, + "learning_rate": 1.4585079631181895e-06, + "loss": 0.4663, + "step": 174 + }, + { + "epoch": 0.04401408450704225, + "grad_norm": 0.40402930974960327, + "learning_rate": 1.4668901927912826e-06, + "loss": 0.5113, + "step": 175 + }, + { + "epoch": 0.04426559356136821, + "grad_norm": 0.4269852340221405, + "learning_rate": 1.4752724224643754e-06, + "loss": 0.5034, + "step": 176 + }, + { + "epoch": 0.044517102615694165, + "grad_norm": 0.41009941697120667, + "learning_rate": 1.4836546521374687e-06, + "loss": 0.51, + "step": 177 + }, + { + "epoch": 0.04476861167002012, + "grad_norm": 0.40047401189804077, + "learning_rate": 1.4920368818105617e-06, + "loss": 0.4719, + "step": 178 + }, + { + "epoch": 0.04502012072434608, + "grad_norm": 0.389114648103714, + "learning_rate": 1.5004191114836548e-06, + "loss": 0.4971, + "step": 179 + }, + { + "epoch": 0.04527162977867203, + "grad_norm": 0.41636624932289124, + "learning_rate": 1.5088013411567478e-06, + "loss": 0.4907, + "step": 180 + }, + { + "epoch": 0.04552313883299799, + "grad_norm": 0.41309916973114014, + "learning_rate": 1.5171835708298407e-06, + "loss": 0.509, + "step": 181 + }, + { + "epoch": 0.045774647887323945, + "grad_norm": 0.4215018153190613, + "learning_rate": 1.525565800502934e-06, + "loss": 0.4977, + "step": 182 + }, + { + "epoch": 0.0460261569416499, + "grad_norm": 0.4160129427909851, + "learning_rate": 1.533948030176027e-06, + "loss": 0.5073, + "step": 183 + }, + { + "epoch": 0.04627766599597585, + "grad_norm": 0.492434024810791, + "learning_rate": 1.5423302598491198e-06, + "loss": 0.4979, + "step": 184 + }, + { + "epoch": 0.04652917505030181, + "grad_norm": 0.3985655605792999, + "learning_rate": 1.550712489522213e-06, + "loss": 0.5291, + "step": 185 + }, + { + "epoch": 0.04678068410462777, + "grad_norm": 0.4607956111431122, + "learning_rate": 1.559094719195306e-06, + "loss": 0.5082, + "step": 186 + }, + { + "epoch": 0.047032193158953725, + "grad_norm": 0.4652690887451172, + "learning_rate": 1.5674769488683992e-06, + "loss": 0.5108, + "step": 187 + }, + { + "epoch": 0.047283702213279676, + "grad_norm": 0.43926411867141724, + "learning_rate": 1.5758591785414922e-06, + "loss": 0.5274, + "step": 188 + }, + { + "epoch": 0.04753521126760563, + "grad_norm": 0.4484438896179199, + "learning_rate": 1.584241408214585e-06, + "loss": 0.5397, + "step": 189 + }, + { + "epoch": 0.04778672032193159, + "grad_norm": 0.45970824360847473, + "learning_rate": 1.5926236378876783e-06, + "loss": 0.5249, + "step": 190 + }, + { + "epoch": 0.04803822937625755, + "grad_norm": 0.44468578696250916, + "learning_rate": 1.6010058675607712e-06, + "loss": 0.4686, + "step": 191 + }, + { + "epoch": 0.0482897384305835, + "grad_norm": 0.4506581127643585, + "learning_rate": 1.6093880972338644e-06, + "loss": 0.4777, + "step": 192 + }, + { + "epoch": 0.048541247484909456, + "grad_norm": 0.39604270458221436, + "learning_rate": 1.6177703269069575e-06, + "loss": 0.4839, + "step": 193 + }, + { + "epoch": 0.04879275653923541, + "grad_norm": 0.41182881593704224, + "learning_rate": 1.6261525565800503e-06, + "loss": 0.4888, + "step": 194 + }, + { + "epoch": 0.04904426559356137, + "grad_norm": 0.44034087657928467, + "learning_rate": 1.6345347862531436e-06, + "loss": 0.547, + "step": 195 + }, + { + "epoch": 0.04929577464788732, + "grad_norm": 0.453494131565094, + "learning_rate": 1.6429170159262364e-06, + "loss": 0.4995, + "step": 196 + }, + { + "epoch": 0.04954728370221328, + "grad_norm": 0.41761818528175354, + "learning_rate": 1.6512992455993297e-06, + "loss": 0.4797, + "step": 197 + }, + { + "epoch": 0.049798792756539235, + "grad_norm": 0.4482254385948181, + "learning_rate": 1.6596814752724227e-06, + "loss": 0.4607, + "step": 198 + }, + { + "epoch": 0.05005030181086519, + "grad_norm": 0.41836273670196533, + "learning_rate": 1.6680637049455156e-06, + "loss": 0.4702, + "step": 199 + }, + { + "epoch": 0.05030181086519115, + "grad_norm": 0.46722131967544556, + "learning_rate": 1.6764459346186088e-06, + "loss": 0.5276, + "step": 200 + }, + { + "epoch": 0.0505533199195171, + "grad_norm": 0.43717604875564575, + "learning_rate": 1.6848281642917017e-06, + "loss": 0.4681, + "step": 201 + }, + { + "epoch": 0.05080482897384306, + "grad_norm": 0.3926369249820709, + "learning_rate": 1.6932103939647947e-06, + "loss": 0.5017, + "step": 202 + }, + { + "epoch": 0.051056338028169015, + "grad_norm": 0.4253596365451813, + "learning_rate": 1.7015926236378878e-06, + "loss": 0.507, + "step": 203 + }, + { + "epoch": 0.05130784708249497, + "grad_norm": 0.4381323456764221, + "learning_rate": 1.7099748533109808e-06, + "loss": 0.4898, + "step": 204 + }, + { + "epoch": 0.05155935613682092, + "grad_norm": 0.4161489009857178, + "learning_rate": 1.718357082984074e-06, + "loss": 0.4943, + "step": 205 + }, + { + "epoch": 0.05181086519114688, + "grad_norm": 0.4436219036579132, + "learning_rate": 1.726739312657167e-06, + "loss": 0.4665, + "step": 206 + }, + { + "epoch": 0.05206237424547284, + "grad_norm": 0.43012455105781555, + "learning_rate": 1.73512154233026e-06, + "loss": 0.515, + "step": 207 + }, + { + "epoch": 0.052313883299798795, + "grad_norm": 0.4141829311847687, + "learning_rate": 1.7435037720033528e-06, + "loss": 0.5038, + "step": 208 + }, + { + "epoch": 0.052565392354124746, + "grad_norm": 0.41762787103652954, + "learning_rate": 1.751886001676446e-06, + "loss": 0.5124, + "step": 209 + }, + { + "epoch": 0.0528169014084507, + "grad_norm": 0.39720189571380615, + "learning_rate": 1.7602682313495393e-06, + "loss": 0.4578, + "step": 210 + }, + { + "epoch": 0.05306841046277666, + "grad_norm": 0.41066601872444153, + "learning_rate": 1.7686504610226322e-06, + "loss": 0.4708, + "step": 211 + }, + { + "epoch": 0.05331991951710262, + "grad_norm": 0.43149372935295105, + "learning_rate": 1.7770326906957252e-06, + "loss": 0.4894, + "step": 212 + }, + { + "epoch": 0.05357142857142857, + "grad_norm": 0.42833876609802246, + "learning_rate": 1.785414920368818e-06, + "loss": 0.4971, + "step": 213 + }, + { + "epoch": 0.053822937625754526, + "grad_norm": 0.43520480394363403, + "learning_rate": 1.7937971500419113e-06, + "loss": 0.518, + "step": 214 + }, + { + "epoch": 0.05407444668008048, + "grad_norm": 0.432537317276001, + "learning_rate": 1.8021793797150044e-06, + "loss": 0.4551, + "step": 215 + }, + { + "epoch": 0.05432595573440644, + "grad_norm": 0.4245152771472931, + "learning_rate": 1.8105616093880974e-06, + "loss": 0.4727, + "step": 216 + }, + { + "epoch": 0.05457746478873239, + "grad_norm": 0.4602232873439789, + "learning_rate": 1.8189438390611905e-06, + "loss": 0.4905, + "step": 217 + }, + { + "epoch": 0.05482897384305835, + "grad_norm": 0.4128243625164032, + "learning_rate": 1.8273260687342833e-06, + "loss": 0.4866, + "step": 218 + }, + { + "epoch": 0.055080482897384306, + "grad_norm": 0.44259390234947205, + "learning_rate": 1.8357082984073766e-06, + "loss": 0.4894, + "step": 219 + }, + { + "epoch": 0.05533199195171026, + "grad_norm": 0.46606069803237915, + "learning_rate": 1.8440905280804696e-06, + "loss": 0.5172, + "step": 220 + }, + { + "epoch": 0.05558350100603622, + "grad_norm": 0.4518153667449951, + "learning_rate": 1.8524727577535625e-06, + "loss": 0.5033, + "step": 221 + }, + { + "epoch": 0.05583501006036217, + "grad_norm": 0.4819628894329071, + "learning_rate": 1.8608549874266557e-06, + "loss": 0.488, + "step": 222 + }, + { + "epoch": 0.05608651911468813, + "grad_norm": 0.44667211174964905, + "learning_rate": 1.8692372170997485e-06, + "loss": 0.5062, + "step": 223 + }, + { + "epoch": 0.056338028169014086, + "grad_norm": 0.4287307560443878, + "learning_rate": 1.8776194467728418e-06, + "loss": 0.4432, + "step": 224 + }, + { + "epoch": 0.05658953722334004, + "grad_norm": 0.46748512983322144, + "learning_rate": 1.8860016764459349e-06, + "loss": 0.4407, + "step": 225 + }, + { + "epoch": 0.056841046277665994, + "grad_norm": 0.44293415546417236, + "learning_rate": 1.8943839061190277e-06, + "loss": 0.4634, + "step": 226 + }, + { + "epoch": 0.05709255533199195, + "grad_norm": 0.41884273290634155, + "learning_rate": 1.902766135792121e-06, + "loss": 0.4978, + "step": 227 + }, + { + "epoch": 0.05734406438631791, + "grad_norm": 0.4945652186870575, + "learning_rate": 1.911148365465214e-06, + "loss": 0.493, + "step": 228 + }, + { + "epoch": 0.057595573440643866, + "grad_norm": 0.4238063097000122, + "learning_rate": 1.919530595138307e-06, + "loss": 0.4558, + "step": 229 + }, + { + "epoch": 0.057847082494969816, + "grad_norm": 0.39406704902648926, + "learning_rate": 1.9279128248113997e-06, + "loss": 0.479, + "step": 230 + }, + { + "epoch": 0.058098591549295774, + "grad_norm": 0.4454141855239868, + "learning_rate": 1.936295054484493e-06, + "loss": 0.4817, + "step": 231 + }, + { + "epoch": 0.05835010060362173, + "grad_norm": 0.45866918563842773, + "learning_rate": 1.944677284157586e-06, + "loss": 0.461, + "step": 232 + }, + { + "epoch": 0.05860160965794769, + "grad_norm": 0.41936853528022766, + "learning_rate": 1.953059513830679e-06, + "loss": 0.4745, + "step": 233 + }, + { + "epoch": 0.05885311871227364, + "grad_norm": 0.4199956953525543, + "learning_rate": 1.9614417435037723e-06, + "loss": 0.486, + "step": 234 + }, + { + "epoch": 0.059104627766599596, + "grad_norm": 0.4122017025947571, + "learning_rate": 1.969823973176865e-06, + "loss": 0.5059, + "step": 235 + }, + { + "epoch": 0.059356136820925554, + "grad_norm": 0.4136897027492523, + "learning_rate": 1.9782062028499584e-06, + "loss": 0.4689, + "step": 236 + }, + { + "epoch": 0.05960764587525151, + "grad_norm": 0.44036155939102173, + "learning_rate": 1.9865884325230515e-06, + "loss": 0.4885, + "step": 237 + }, + { + "epoch": 0.05985915492957746, + "grad_norm": 0.39697256684303284, + "learning_rate": 1.994970662196144e-06, + "loss": 0.4923, + "step": 238 + }, + { + "epoch": 0.06011066398390342, + "grad_norm": 0.3980274200439453, + "learning_rate": 2.0033528918692376e-06, + "loss": 0.4702, + "step": 239 + }, + { + "epoch": 0.060362173038229376, + "grad_norm": 0.4443422257900238, + "learning_rate": 2.01173512154233e-06, + "loss": 0.4845, + "step": 240 + }, + { + "epoch": 0.060613682092555334, + "grad_norm": 0.39754951000213623, + "learning_rate": 2.0201173512154237e-06, + "loss": 0.4652, + "step": 241 + }, + { + "epoch": 0.06086519114688129, + "grad_norm": 0.41801464557647705, + "learning_rate": 2.0284995808885167e-06, + "loss": 0.4821, + "step": 242 + }, + { + "epoch": 0.06111670020120724, + "grad_norm": 0.45718705654144287, + "learning_rate": 2.0368818105616093e-06, + "loss": 0.5005, + "step": 243 + }, + { + "epoch": 0.0613682092555332, + "grad_norm": 0.4293982684612274, + "learning_rate": 2.045264040234703e-06, + "loss": 0.5131, + "step": 244 + }, + { + "epoch": 0.061619718309859156, + "grad_norm": 0.4417547285556793, + "learning_rate": 2.0536462699077954e-06, + "loss": 0.4852, + "step": 245 + }, + { + "epoch": 0.061871227364185113, + "grad_norm": 0.43375566601753235, + "learning_rate": 2.062028499580889e-06, + "loss": 0.474, + "step": 246 + }, + { + "epoch": 0.062122736418511064, + "grad_norm": 0.4546149969100952, + "learning_rate": 2.070410729253982e-06, + "loss": 0.4667, + "step": 247 + }, + { + "epoch": 0.06237424547283702, + "grad_norm": 0.4586324095726013, + "learning_rate": 2.0787929589270746e-06, + "loss": 0.514, + "step": 248 + }, + { + "epoch": 0.06262575452716297, + "grad_norm": 0.40271812677383423, + "learning_rate": 2.087175188600168e-06, + "loss": 0.4615, + "step": 249 + }, + { + "epoch": 0.06287726358148893, + "grad_norm": 0.44942378997802734, + "learning_rate": 2.0955574182732607e-06, + "loss": 0.4823, + "step": 250 + }, + { + "epoch": 0.06312877263581489, + "grad_norm": 0.3924196660518646, + "learning_rate": 2.1039396479463537e-06, + "loss": 0.4828, + "step": 251 + }, + { + "epoch": 0.06338028169014084, + "grad_norm": 0.4237384796142578, + "learning_rate": 2.112321877619447e-06, + "loss": 0.4931, + "step": 252 + }, + { + "epoch": 0.0636317907444668, + "grad_norm": 0.40384024381637573, + "learning_rate": 2.12070410729254e-06, + "loss": 0.473, + "step": 253 + }, + { + "epoch": 0.06388329979879276, + "grad_norm": 0.4171174466609955, + "learning_rate": 2.1290863369656333e-06, + "loss": 0.4971, + "step": 254 + }, + { + "epoch": 0.06413480885311872, + "grad_norm": 0.43529245257377625, + "learning_rate": 2.137468566638726e-06, + "loss": 0.4433, + "step": 255 + }, + { + "epoch": 0.06438631790744467, + "grad_norm": 0.46728262305259705, + "learning_rate": 2.145850796311819e-06, + "loss": 0.4561, + "step": 256 + }, + { + "epoch": 0.06463782696177062, + "grad_norm": 0.5025231242179871, + "learning_rate": 2.1542330259849124e-06, + "loss": 0.4916, + "step": 257 + }, + { + "epoch": 0.06488933601609657, + "grad_norm": 0.3811725378036499, + "learning_rate": 2.162615255658005e-06, + "loss": 0.4842, + "step": 258 + }, + { + "epoch": 0.06514084507042253, + "grad_norm": 0.4305613040924072, + "learning_rate": 2.1709974853310985e-06, + "loss": 0.4609, + "step": 259 + }, + { + "epoch": 0.06539235412474849, + "grad_norm": 0.42544025182724, + "learning_rate": 2.179379715004191e-06, + "loss": 0.4745, + "step": 260 + }, + { + "epoch": 0.06564386317907445, + "grad_norm": 0.42461109161376953, + "learning_rate": 2.1877619446772842e-06, + "loss": 0.4945, + "step": 261 + }, + { + "epoch": 0.0658953722334004, + "grad_norm": 0.42680662870407104, + "learning_rate": 2.1961441743503773e-06, + "loss": 0.4918, + "step": 262 + }, + { + "epoch": 0.06614688128772636, + "grad_norm": 0.42169326543807983, + "learning_rate": 2.2045264040234703e-06, + "loss": 0.4724, + "step": 263 + }, + { + "epoch": 0.06639839034205232, + "grad_norm": 0.4097406268119812, + "learning_rate": 2.2129086336965634e-06, + "loss": 0.4682, + "step": 264 + }, + { + "epoch": 0.06664989939637828, + "grad_norm": 0.5153217911720276, + "learning_rate": 2.2212908633696564e-06, + "loss": 0.4742, + "step": 265 + }, + { + "epoch": 0.06690140845070422, + "grad_norm": 0.4730076491832733, + "learning_rate": 2.2296730930427495e-06, + "loss": 0.4757, + "step": 266 + }, + { + "epoch": 0.06715291750503018, + "grad_norm": 0.48586782813072205, + "learning_rate": 2.2380553227158425e-06, + "loss": 0.4851, + "step": 267 + }, + { + "epoch": 0.06740442655935613, + "grad_norm": 0.4455511271953583, + "learning_rate": 2.2464375523889356e-06, + "loss": 0.4448, + "step": 268 + }, + { + "epoch": 0.06765593561368209, + "grad_norm": 0.5135384798049927, + "learning_rate": 2.2548197820620286e-06, + "loss": 0.4944, + "step": 269 + }, + { + "epoch": 0.06790744466800805, + "grad_norm": 0.4233526289463043, + "learning_rate": 2.2632020117351217e-06, + "loss": 0.4523, + "step": 270 + }, + { + "epoch": 0.068158953722334, + "grad_norm": 0.4129721522331238, + "learning_rate": 2.2715842414082147e-06, + "loss": 0.4809, + "step": 271 + }, + { + "epoch": 0.06841046277665996, + "grad_norm": 0.4507489800453186, + "learning_rate": 2.2799664710813078e-06, + "loss": 0.4521, + "step": 272 + }, + { + "epoch": 0.06866197183098592, + "grad_norm": 0.5152677297592163, + "learning_rate": 2.288348700754401e-06, + "loss": 0.4878, + "step": 273 + }, + { + "epoch": 0.06891348088531186, + "grad_norm": 0.4466034471988678, + "learning_rate": 2.296730930427494e-06, + "loss": 0.4493, + "step": 274 + }, + { + "epoch": 0.06916498993963782, + "grad_norm": 0.4332417845726013, + "learning_rate": 2.305113160100587e-06, + "loss": 0.4904, + "step": 275 + }, + { + "epoch": 0.06941649899396378, + "grad_norm": 0.4686703085899353, + "learning_rate": 2.31349538977368e-06, + "loss": 0.5085, + "step": 276 + }, + { + "epoch": 0.06966800804828974, + "grad_norm": 0.46179959177970886, + "learning_rate": 2.321877619446773e-06, + "loss": 0.4841, + "step": 277 + }, + { + "epoch": 0.0699195171026157, + "grad_norm": 0.43604379892349243, + "learning_rate": 2.330259849119866e-06, + "loss": 0.4666, + "step": 278 + }, + { + "epoch": 0.07017102615694165, + "grad_norm": 0.4303692877292633, + "learning_rate": 2.338642078792959e-06, + "loss": 0.4745, + "step": 279 + }, + { + "epoch": 0.07042253521126761, + "grad_norm": 0.46457597613334656, + "learning_rate": 2.347024308466052e-06, + "loss": 0.5055, + "step": 280 + }, + { + "epoch": 0.07067404426559357, + "grad_norm": 0.3907102942466736, + "learning_rate": 2.355406538139145e-06, + "loss": 0.4719, + "step": 281 + }, + { + "epoch": 0.07092555331991952, + "grad_norm": 0.4557718336582184, + "learning_rate": 2.3637887678122383e-06, + "loss": 0.4796, + "step": 282 + }, + { + "epoch": 0.07117706237424547, + "grad_norm": 0.4310013949871063, + "learning_rate": 2.3721709974853313e-06, + "loss": 0.4753, + "step": 283 + }, + { + "epoch": 0.07142857142857142, + "grad_norm": 0.4076017737388611, + "learning_rate": 2.3805532271584244e-06, + "loss": 0.4697, + "step": 284 + }, + { + "epoch": 0.07168008048289738, + "grad_norm": 0.4143841564655304, + "learning_rate": 2.3889354568315174e-06, + "loss": 0.4839, + "step": 285 + }, + { + "epoch": 0.07193158953722334, + "grad_norm": 0.4165581166744232, + "learning_rate": 2.3973176865046105e-06, + "loss": 0.4777, + "step": 286 + }, + { + "epoch": 0.0721830985915493, + "grad_norm": 0.3928240239620209, + "learning_rate": 2.4056999161777035e-06, + "loss": 0.4581, + "step": 287 + }, + { + "epoch": 0.07243460764587525, + "grad_norm": 0.45718878507614136, + "learning_rate": 2.4140821458507966e-06, + "loss": 0.4856, + "step": 288 + }, + { + "epoch": 0.07268611670020121, + "grad_norm": 0.40149986743927, + "learning_rate": 2.4224643755238896e-06, + "loss": 0.4889, + "step": 289 + }, + { + "epoch": 0.07293762575452717, + "grad_norm": 0.40798652172088623, + "learning_rate": 2.4308466051969827e-06, + "loss": 0.4864, + "step": 290 + }, + { + "epoch": 0.07318913480885311, + "grad_norm": 0.4065083861351013, + "learning_rate": 2.4392288348700757e-06, + "loss": 0.4788, + "step": 291 + }, + { + "epoch": 0.07344064386317907, + "grad_norm": 0.4597596824169159, + "learning_rate": 2.4476110645431688e-06, + "loss": 0.4578, + "step": 292 + }, + { + "epoch": 0.07369215291750503, + "grad_norm": 0.4461594521999359, + "learning_rate": 2.455993294216262e-06, + "loss": 0.4879, + "step": 293 + }, + { + "epoch": 0.07394366197183098, + "grad_norm": 0.3977876901626587, + "learning_rate": 2.4643755238893544e-06, + "loss": 0.4687, + "step": 294 + }, + { + "epoch": 0.07419517102615694, + "grad_norm": 0.4070913791656494, + "learning_rate": 2.472757753562448e-06, + "loss": 0.4712, + "step": 295 + }, + { + "epoch": 0.0744466800804829, + "grad_norm": 0.495626300573349, + "learning_rate": 2.481139983235541e-06, + "loss": 0.4682, + "step": 296 + }, + { + "epoch": 0.07469818913480886, + "grad_norm": 0.42833268642425537, + "learning_rate": 2.489522212908634e-06, + "loss": 0.4579, + "step": 297 + }, + { + "epoch": 0.07494969818913481, + "grad_norm": 0.4342649579048157, + "learning_rate": 2.497904442581727e-06, + "loss": 0.4299, + "step": 298 + }, + { + "epoch": 0.07520120724346077, + "grad_norm": 0.48605650663375854, + "learning_rate": 2.50628667225482e-06, + "loss": 0.4821, + "step": 299 + }, + { + "epoch": 0.07545271629778671, + "grad_norm": 0.45871275663375854, + "learning_rate": 2.514668901927913e-06, + "loss": 0.4783, + "step": 300 + }, + { + "epoch": 0.07570422535211267, + "grad_norm": 0.42935168743133545, + "learning_rate": 2.5230511316010058e-06, + "loss": 0.4676, + "step": 301 + }, + { + "epoch": 0.07595573440643863, + "grad_norm": 0.429830938577652, + "learning_rate": 2.5314333612740992e-06, + "loss": 0.4846, + "step": 302 + }, + { + "epoch": 0.07620724346076459, + "grad_norm": 0.4596365988254547, + "learning_rate": 2.5398155909471923e-06, + "loss": 0.47, + "step": 303 + }, + { + "epoch": 0.07645875251509054, + "grad_norm": 0.49432510137557983, + "learning_rate": 2.548197820620285e-06, + "loss": 0.4693, + "step": 304 + }, + { + "epoch": 0.0767102615694165, + "grad_norm": 0.44141435623168945, + "learning_rate": 2.5565800502933784e-06, + "loss": 0.4631, + "step": 305 + }, + { + "epoch": 0.07696177062374246, + "grad_norm": 0.43977904319763184, + "learning_rate": 2.5649622799664714e-06, + "loss": 0.4513, + "step": 306 + }, + { + "epoch": 0.07721327967806842, + "grad_norm": 0.4588465988636017, + "learning_rate": 2.573344509639564e-06, + "loss": 0.4577, + "step": 307 + }, + { + "epoch": 0.07746478873239436, + "grad_norm": 0.454364538192749, + "learning_rate": 2.581726739312657e-06, + "loss": 0.4682, + "step": 308 + }, + { + "epoch": 0.07771629778672032, + "grad_norm": 0.4163168668746948, + "learning_rate": 2.5901089689857506e-06, + "loss": 0.4905, + "step": 309 + }, + { + "epoch": 0.07796780684104627, + "grad_norm": 0.4823428988456726, + "learning_rate": 2.5984911986588436e-06, + "loss": 0.4737, + "step": 310 + }, + { + "epoch": 0.07821931589537223, + "grad_norm": 0.46709927916526794, + "learning_rate": 2.6068734283319363e-06, + "loss": 0.4498, + "step": 311 + }, + { + "epoch": 0.07847082494969819, + "grad_norm": 0.45121198892593384, + "learning_rate": 2.6152556580050297e-06, + "loss": 0.4373, + "step": 312 + }, + { + "epoch": 0.07872233400402415, + "grad_norm": 0.41081297397613525, + "learning_rate": 2.623637887678123e-06, + "loss": 0.4669, + "step": 313 + }, + { + "epoch": 0.0789738430583501, + "grad_norm": 0.4702714681625366, + "learning_rate": 2.6320201173512154e-06, + "loss": 0.45, + "step": 314 + }, + { + "epoch": 0.07922535211267606, + "grad_norm": 0.4380156099796295, + "learning_rate": 2.6404023470243085e-06, + "loss": 0.4857, + "step": 315 + }, + { + "epoch": 0.079476861167002, + "grad_norm": 0.4465760290622711, + "learning_rate": 2.648784576697402e-06, + "loss": 0.495, + "step": 316 + }, + { + "epoch": 0.07972837022132796, + "grad_norm": 0.4858304262161255, + "learning_rate": 2.6571668063704946e-06, + "loss": 0.4662, + "step": 317 + }, + { + "epoch": 0.07997987927565392, + "grad_norm": 0.44153428077697754, + "learning_rate": 2.6655490360435876e-06, + "loss": 0.4861, + "step": 318 + }, + { + "epoch": 0.08023138832997988, + "grad_norm": 0.44172754883766174, + "learning_rate": 2.673931265716681e-06, + "loss": 0.4725, + "step": 319 + }, + { + "epoch": 0.08048289738430583, + "grad_norm": 0.4511873126029968, + "learning_rate": 2.682313495389774e-06, + "loss": 0.4707, + "step": 320 + }, + { + "epoch": 0.08073440643863179, + "grad_norm": 0.4447844326496124, + "learning_rate": 2.6906957250628668e-06, + "loss": 0.4294, + "step": 321 + }, + { + "epoch": 0.08098591549295775, + "grad_norm": 0.4373539686203003, + "learning_rate": 2.6990779547359602e-06, + "loss": 0.4257, + "step": 322 + }, + { + "epoch": 0.08123742454728371, + "grad_norm": 0.4205957055091858, + "learning_rate": 2.7074601844090533e-06, + "loss": 0.4782, + "step": 323 + }, + { + "epoch": 0.08148893360160966, + "grad_norm": 0.4675666093826294, + "learning_rate": 2.715842414082146e-06, + "loss": 0.4852, + "step": 324 + }, + { + "epoch": 0.08174044265593561, + "grad_norm": 0.4324727952480316, + "learning_rate": 2.724224643755239e-06, + "loss": 0.4666, + "step": 325 + }, + { + "epoch": 0.08199195171026157, + "grad_norm": 0.49073663353919983, + "learning_rate": 2.7326068734283324e-06, + "loss": 0.4565, + "step": 326 + }, + { + "epoch": 0.08224346076458752, + "grad_norm": 0.38460773229599, + "learning_rate": 2.740989103101425e-06, + "loss": 0.4608, + "step": 327 + }, + { + "epoch": 0.08249496981891348, + "grad_norm": 0.4604051113128662, + "learning_rate": 2.749371332774518e-06, + "loss": 0.5168, + "step": 328 + }, + { + "epoch": 0.08274647887323944, + "grad_norm": 0.4031774401664734, + "learning_rate": 2.7577535624476116e-06, + "loss": 0.4719, + "step": 329 + }, + { + "epoch": 0.0829979879275654, + "grad_norm": 0.40108442306518555, + "learning_rate": 2.766135792120704e-06, + "loss": 0.4358, + "step": 330 + }, + { + "epoch": 0.08324949698189135, + "grad_norm": 0.533218502998352, + "learning_rate": 2.7745180217937973e-06, + "loss": 0.4492, + "step": 331 + }, + { + "epoch": 0.08350100603621731, + "grad_norm": 0.43500781059265137, + "learning_rate": 2.7829002514668907e-06, + "loss": 0.4561, + "step": 332 + }, + { + "epoch": 0.08375251509054325, + "grad_norm": 0.47511211037635803, + "learning_rate": 2.7912824811399838e-06, + "loss": 0.4575, + "step": 333 + }, + { + "epoch": 0.08400402414486921, + "grad_norm": 0.41863763332366943, + "learning_rate": 2.7996647108130764e-06, + "loss": 0.4525, + "step": 334 + }, + { + "epoch": 0.08425553319919517, + "grad_norm": 0.4359985589981079, + "learning_rate": 2.8080469404861695e-06, + "loss": 0.4666, + "step": 335 + }, + { + "epoch": 0.08450704225352113, + "grad_norm": 0.44898366928100586, + "learning_rate": 2.816429170159263e-06, + "loss": 0.4736, + "step": 336 + }, + { + "epoch": 0.08475855130784708, + "grad_norm": 0.44528570771217346, + "learning_rate": 2.8248113998323556e-06, + "loss": 0.4477, + "step": 337 + }, + { + "epoch": 0.08501006036217304, + "grad_norm": 0.4499231278896332, + "learning_rate": 2.8331936295054486e-06, + "loss": 0.4482, + "step": 338 + }, + { + "epoch": 0.085261569416499, + "grad_norm": 0.4471151828765869, + "learning_rate": 2.841575859178542e-06, + "loss": 0.4732, + "step": 339 + }, + { + "epoch": 0.08551307847082495, + "grad_norm": 0.42974570393562317, + "learning_rate": 2.8499580888516347e-06, + "loss": 0.4857, + "step": 340 + }, + { + "epoch": 0.08576458752515091, + "grad_norm": 0.40349292755126953, + "learning_rate": 2.8583403185247278e-06, + "loss": 0.4637, + "step": 341 + }, + { + "epoch": 0.08601609657947686, + "grad_norm": 0.38529425859451294, + "learning_rate": 2.8667225481978204e-06, + "loss": 0.4581, + "step": 342 + }, + { + "epoch": 0.08626760563380281, + "grad_norm": 0.4391642212867737, + "learning_rate": 2.875104777870914e-06, + "loss": 0.4549, + "step": 343 + }, + { + "epoch": 0.08651911468812877, + "grad_norm": 0.43910202383995056, + "learning_rate": 2.883487007544007e-06, + "loss": 0.45, + "step": 344 + }, + { + "epoch": 0.08677062374245473, + "grad_norm": 0.4166280925273895, + "learning_rate": 2.8918692372171e-06, + "loss": 0.4512, + "step": 345 + }, + { + "epoch": 0.08702213279678069, + "grad_norm": 0.40169304609298706, + "learning_rate": 2.9002514668901934e-06, + "loss": 0.4629, + "step": 346 + }, + { + "epoch": 0.08727364185110664, + "grad_norm": 0.45463281869888306, + "learning_rate": 2.908633696563286e-06, + "loss": 0.4465, + "step": 347 + }, + { + "epoch": 0.0875251509054326, + "grad_norm": 0.4328685700893402, + "learning_rate": 2.917015926236379e-06, + "loss": 0.4352, + "step": 348 + }, + { + "epoch": 0.08777665995975856, + "grad_norm": 0.4192608892917633, + "learning_rate": 2.9253981559094726e-06, + "loss": 0.483, + "step": 349 + }, + { + "epoch": 0.0880281690140845, + "grad_norm": 0.4864928126335144, + "learning_rate": 2.933780385582565e-06, + "loss": 0.464, + "step": 350 + }, + { + "epoch": 0.08827967806841046, + "grad_norm": 0.4200999140739441, + "learning_rate": 2.9421626152556582e-06, + "loss": 0.4781, + "step": 351 + }, + { + "epoch": 0.08853118712273642, + "grad_norm": 0.430940181016922, + "learning_rate": 2.950544844928751e-06, + "loss": 0.5017, + "step": 352 + }, + { + "epoch": 0.08878269617706237, + "grad_norm": 0.4381304085254669, + "learning_rate": 2.9589270746018443e-06, + "loss": 0.4565, + "step": 353 + }, + { + "epoch": 0.08903420523138833, + "grad_norm": 0.4690648019313812, + "learning_rate": 2.9673093042749374e-06, + "loss": 0.4408, + "step": 354 + }, + { + "epoch": 0.08928571428571429, + "grad_norm": 0.45738598704338074, + "learning_rate": 2.97569153394803e-06, + "loss": 0.4732, + "step": 355 + }, + { + "epoch": 0.08953722334004025, + "grad_norm": 0.4283290505409241, + "learning_rate": 2.9840737636211235e-06, + "loss": 0.4854, + "step": 356 + }, + { + "epoch": 0.0897887323943662, + "grad_norm": 0.4096241891384125, + "learning_rate": 2.9924559932942165e-06, + "loss": 0.4874, + "step": 357 + }, + { + "epoch": 0.09004024144869216, + "grad_norm": 0.47754043340682983, + "learning_rate": 3.0008382229673096e-06, + "loss": 0.4878, + "step": 358 + }, + { + "epoch": 0.0902917505030181, + "grad_norm": 0.43061840534210205, + "learning_rate": 3.009220452640403e-06, + "loss": 0.4499, + "step": 359 + }, + { + "epoch": 0.09054325955734406, + "grad_norm": 0.4733719825744629, + "learning_rate": 3.0176026823134957e-06, + "loss": 0.4817, + "step": 360 + }, + { + "epoch": 0.09079476861167002, + "grad_norm": 0.44821831583976746, + "learning_rate": 3.0259849119865887e-06, + "loss": 0.4783, + "step": 361 + }, + { + "epoch": 0.09104627766599598, + "grad_norm": 0.4314734935760498, + "learning_rate": 3.0343671416596814e-06, + "loss": 0.4892, + "step": 362 + }, + { + "epoch": 0.09129778672032193, + "grad_norm": 0.4240245223045349, + "learning_rate": 3.042749371332775e-06, + "loss": 0.448, + "step": 363 + }, + { + "epoch": 0.09154929577464789, + "grad_norm": 0.41133949160575867, + "learning_rate": 3.051131601005868e-06, + "loss": 0.4519, + "step": 364 + }, + { + "epoch": 0.09180080482897385, + "grad_norm": 0.4324391782283783, + "learning_rate": 3.0595138306789605e-06, + "loss": 0.4609, + "step": 365 + }, + { + "epoch": 0.0920523138832998, + "grad_norm": 0.3948531150817871, + "learning_rate": 3.067896060352054e-06, + "loss": 0.4525, + "step": 366 + }, + { + "epoch": 0.09230382293762575, + "grad_norm": 0.4043181836605072, + "learning_rate": 3.076278290025147e-06, + "loss": 0.4684, + "step": 367 + }, + { + "epoch": 0.0925553319919517, + "grad_norm": 0.49464017152786255, + "learning_rate": 3.0846605196982397e-06, + "loss": 0.4863, + "step": 368 + }, + { + "epoch": 0.09280684104627766, + "grad_norm": 0.4050739109516144, + "learning_rate": 3.0930427493713327e-06, + "loss": 0.4399, + "step": 369 + }, + { + "epoch": 0.09305835010060362, + "grad_norm": 0.46191224455833435, + "learning_rate": 3.101424979044426e-06, + "loss": 0.4729, + "step": 370 + }, + { + "epoch": 0.09330985915492958, + "grad_norm": 0.4379456043243408, + "learning_rate": 3.1098072087175192e-06, + "loss": 0.4765, + "step": 371 + }, + { + "epoch": 0.09356136820925554, + "grad_norm": 0.443185031414032, + "learning_rate": 3.118189438390612e-06, + "loss": 0.4497, + "step": 372 + }, + { + "epoch": 0.09381287726358149, + "grad_norm": 0.4520972967147827, + "learning_rate": 3.1265716680637053e-06, + "loss": 0.4766, + "step": 373 + }, + { + "epoch": 0.09406438631790745, + "grad_norm": 0.4627498984336853, + "learning_rate": 3.1349538977367984e-06, + "loss": 0.4559, + "step": 374 + }, + { + "epoch": 0.0943158953722334, + "grad_norm": 0.42422908544540405, + "learning_rate": 3.143336127409891e-06, + "loss": 0.442, + "step": 375 + }, + { + "epoch": 0.09456740442655935, + "grad_norm": 0.4159485399723053, + "learning_rate": 3.1517183570829845e-06, + "loss": 0.4841, + "step": 376 + }, + { + "epoch": 0.09481891348088531, + "grad_norm": 0.3887682855129242, + "learning_rate": 3.1601005867560775e-06, + "loss": 0.4615, + "step": 377 + }, + { + "epoch": 0.09507042253521127, + "grad_norm": 0.4549719989299774, + "learning_rate": 3.16848281642917e-06, + "loss": 0.4507, + "step": 378 + }, + { + "epoch": 0.09532193158953722, + "grad_norm": 0.4081493020057678, + "learning_rate": 3.176865046102263e-06, + "loss": 0.4658, + "step": 379 + }, + { + "epoch": 0.09557344064386318, + "grad_norm": 0.4539159834384918, + "learning_rate": 3.1852472757753567e-06, + "loss": 0.4618, + "step": 380 + }, + { + "epoch": 0.09582494969818914, + "grad_norm": 0.4574434459209442, + "learning_rate": 3.1936295054484493e-06, + "loss": 0.4952, + "step": 381 + }, + { + "epoch": 0.0960764587525151, + "grad_norm": 0.4301118552684784, + "learning_rate": 3.2020117351215424e-06, + "loss": 0.4533, + "step": 382 + }, + { + "epoch": 0.09632796780684105, + "grad_norm": 0.422446072101593, + "learning_rate": 3.210393964794636e-06, + "loss": 0.4402, + "step": 383 + }, + { + "epoch": 0.096579476861167, + "grad_norm": 0.43976905941963196, + "learning_rate": 3.218776194467729e-06, + "loss": 0.4586, + "step": 384 + }, + { + "epoch": 0.09683098591549295, + "grad_norm": 0.4008273482322693, + "learning_rate": 3.2271584241408215e-06, + "loss": 0.4547, + "step": 385 + }, + { + "epoch": 0.09708249496981891, + "grad_norm": 0.3887328803539276, + "learning_rate": 3.235540653813915e-06, + "loss": 0.4525, + "step": 386 + }, + { + "epoch": 0.09733400402414487, + "grad_norm": 0.4399072229862213, + "learning_rate": 3.243922883487008e-06, + "loss": 0.4558, + "step": 387 + }, + { + "epoch": 0.09758551307847083, + "grad_norm": 0.42582908272743225, + "learning_rate": 3.2523051131601007e-06, + "loss": 0.4602, + "step": 388 + }, + { + "epoch": 0.09783702213279678, + "grad_norm": 0.41464588046073914, + "learning_rate": 3.2606873428331937e-06, + "loss": 0.48, + "step": 389 + }, + { + "epoch": 0.09808853118712274, + "grad_norm": 0.4072147309780121, + "learning_rate": 3.269069572506287e-06, + "loss": 0.461, + "step": 390 + }, + { + "epoch": 0.0983400402414487, + "grad_norm": 0.39760878682136536, + "learning_rate": 3.27745180217938e-06, + "loss": 0.4555, + "step": 391 + }, + { + "epoch": 0.09859154929577464, + "grad_norm": 0.42994043231010437, + "learning_rate": 3.285834031852473e-06, + "loss": 0.4648, + "step": 392 + }, + { + "epoch": 0.0988430583501006, + "grad_norm": 0.4006013870239258, + "learning_rate": 3.2942162615255663e-06, + "loss": 0.4379, + "step": 393 + }, + { + "epoch": 0.09909456740442656, + "grad_norm": 0.44781169295310974, + "learning_rate": 3.3025984911986594e-06, + "loss": 0.4628, + "step": 394 + }, + { + "epoch": 0.09934607645875251, + "grad_norm": 0.45750489830970764, + "learning_rate": 3.310980720871752e-06, + "loss": 0.4313, + "step": 395 + }, + { + "epoch": 0.09959758551307847, + "grad_norm": 0.4136182963848114, + "learning_rate": 3.3193629505448455e-06, + "loss": 0.4463, + "step": 396 + }, + { + "epoch": 0.09984909456740443, + "grad_norm": 0.45176711678504944, + "learning_rate": 3.3277451802179385e-06, + "loss": 0.4713, + "step": 397 + }, + { + "epoch": 0.10010060362173039, + "grad_norm": 0.4363192021846771, + "learning_rate": 3.336127409891031e-06, + "loss": 0.4316, + "step": 398 + }, + { + "epoch": 0.10035211267605634, + "grad_norm": 0.4392482340335846, + "learning_rate": 3.344509639564124e-06, + "loss": 0.4388, + "step": 399 + }, + { + "epoch": 0.1006036217303823, + "grad_norm": 0.43276289105415344, + "learning_rate": 3.3528918692372177e-06, + "loss": 0.481, + "step": 400 + }, + { + "epoch": 0.10085513078470824, + "grad_norm": 0.40245917439460754, + "learning_rate": 3.3612740989103103e-06, + "loss": 0.4321, + "step": 401 + }, + { + "epoch": 0.1011066398390342, + "grad_norm": 0.4259881377220154, + "learning_rate": 3.3696563285834033e-06, + "loss": 0.4465, + "step": 402 + }, + { + "epoch": 0.10135814889336016, + "grad_norm": 0.452953964471817, + "learning_rate": 3.378038558256497e-06, + "loss": 0.4421, + "step": 403 + }, + { + "epoch": 0.10160965794768612, + "grad_norm": 0.39972519874572754, + "learning_rate": 3.3864207879295894e-06, + "loss": 0.431, + "step": 404 + }, + { + "epoch": 0.10186116700201207, + "grad_norm": 0.48080217838287354, + "learning_rate": 3.3948030176026825e-06, + "loss": 0.4602, + "step": 405 + }, + { + "epoch": 0.10211267605633803, + "grad_norm": 0.3960705101490021, + "learning_rate": 3.4031852472757755e-06, + "loss": 0.4666, + "step": 406 + }, + { + "epoch": 0.10236418511066399, + "grad_norm": 0.40246954560279846, + "learning_rate": 3.411567476948869e-06, + "loss": 0.4266, + "step": 407 + }, + { + "epoch": 0.10261569416498995, + "grad_norm": 0.43389272689819336, + "learning_rate": 3.4199497066219616e-06, + "loss": 0.4464, + "step": 408 + }, + { + "epoch": 0.10286720321931589, + "grad_norm": 0.3892342746257782, + "learning_rate": 3.4283319362950547e-06, + "loss": 0.4403, + "step": 409 + }, + { + "epoch": 0.10311871227364185, + "grad_norm": 0.45472291111946106, + "learning_rate": 3.436714165968148e-06, + "loss": 0.479, + "step": 410 + }, + { + "epoch": 0.1033702213279678, + "grad_norm": 0.4327657222747803, + "learning_rate": 3.445096395641241e-06, + "loss": 0.4696, + "step": 411 + }, + { + "epoch": 0.10362173038229376, + "grad_norm": 0.44970017671585083, + "learning_rate": 3.453478625314334e-06, + "loss": 0.4425, + "step": 412 + }, + { + "epoch": 0.10387323943661972, + "grad_norm": 0.4211669862270355, + "learning_rate": 3.4618608549874273e-06, + "loss": 0.4674, + "step": 413 + }, + { + "epoch": 0.10412474849094568, + "grad_norm": 0.3977251946926117, + "learning_rate": 3.47024308466052e-06, + "loss": 0.4427, + "step": 414 + }, + { + "epoch": 0.10437625754527163, + "grad_norm": 0.41054680943489075, + "learning_rate": 3.478625314333613e-06, + "loss": 0.4539, + "step": 415 + }, + { + "epoch": 0.10462776659959759, + "grad_norm": 0.42676669359207153, + "learning_rate": 3.4870075440067056e-06, + "loss": 0.4568, + "step": 416 + }, + { + "epoch": 0.10487927565392353, + "grad_norm": 0.4475442171096802, + "learning_rate": 3.495389773679799e-06, + "loss": 0.4512, + "step": 417 + }, + { + "epoch": 0.10513078470824949, + "grad_norm": 0.406112402677536, + "learning_rate": 3.503772003352892e-06, + "loss": 0.3903, + "step": 418 + }, + { + "epoch": 0.10538229376257545, + "grad_norm": 0.4194413125514984, + "learning_rate": 3.512154233025985e-06, + "loss": 0.4447, + "step": 419 + }, + { + "epoch": 0.1056338028169014, + "grad_norm": 0.4342004954814911, + "learning_rate": 3.5205364626990787e-06, + "loss": 0.4612, + "step": 420 + }, + { + "epoch": 0.10588531187122736, + "grad_norm": 0.408413290977478, + "learning_rate": 3.5289186923721713e-06, + "loss": 0.4595, + "step": 421 + }, + { + "epoch": 0.10613682092555332, + "grad_norm": 0.45207443833351135, + "learning_rate": 3.5373009220452643e-06, + "loss": 0.4918, + "step": 422 + }, + { + "epoch": 0.10638832997987928, + "grad_norm": 0.4605708718299866, + "learning_rate": 3.545683151718358e-06, + "loss": 0.4277, + "step": 423 + }, + { + "epoch": 0.10663983903420524, + "grad_norm": 0.45539170503616333, + "learning_rate": 3.5540653813914504e-06, + "loss": 0.4672, + "step": 424 + }, + { + "epoch": 0.1068913480885312, + "grad_norm": 0.4451138973236084, + "learning_rate": 3.5624476110645435e-06, + "loss": 0.4436, + "step": 425 + }, + { + "epoch": 0.10714285714285714, + "grad_norm": 0.4522078335285187, + "learning_rate": 3.570829840737636e-06, + "loss": 0.4035, + "step": 426 + }, + { + "epoch": 0.1073943661971831, + "grad_norm": 0.48479580879211426, + "learning_rate": 3.5792120704107296e-06, + "loss": 0.4759, + "step": 427 + }, + { + "epoch": 0.10764587525150905, + "grad_norm": 0.44372591376304626, + "learning_rate": 3.5875943000838226e-06, + "loss": 0.4588, + "step": 428 + }, + { + "epoch": 0.10789738430583501, + "grad_norm": 0.42548373341560364, + "learning_rate": 3.5959765297569153e-06, + "loss": 0.4506, + "step": 429 + }, + { + "epoch": 0.10814889336016097, + "grad_norm": 0.49597272276878357, + "learning_rate": 3.6043587594300087e-06, + "loss": 0.4831, + "step": 430 + }, + { + "epoch": 0.10840040241448692, + "grad_norm": 0.44530630111694336, + "learning_rate": 3.6127409891031018e-06, + "loss": 0.4139, + "step": 431 + }, + { + "epoch": 0.10865191146881288, + "grad_norm": 0.40483203530311584, + "learning_rate": 3.621123218776195e-06, + "loss": 0.4462, + "step": 432 + }, + { + "epoch": 0.10890342052313884, + "grad_norm": 0.38956066966056824, + "learning_rate": 3.6295054484492875e-06, + "loss": 0.4374, + "step": 433 + }, + { + "epoch": 0.10915492957746478, + "grad_norm": 0.46306437253952026, + "learning_rate": 3.637887678122381e-06, + "loss": 0.4532, + "step": 434 + }, + { + "epoch": 0.10940643863179074, + "grad_norm": 0.43688806891441345, + "learning_rate": 3.646269907795474e-06, + "loss": 0.4223, + "step": 435 + }, + { + "epoch": 0.1096579476861167, + "grad_norm": 0.43097516894340515, + "learning_rate": 3.6546521374685666e-06, + "loss": 0.4335, + "step": 436 + }, + { + "epoch": 0.10990945674044265, + "grad_norm": 0.4253983199596405, + "learning_rate": 3.66303436714166e-06, + "loss": 0.439, + "step": 437 + }, + { + "epoch": 0.11016096579476861, + "grad_norm": 0.4934920072555542, + "learning_rate": 3.671416596814753e-06, + "loss": 0.4308, + "step": 438 + }, + { + "epoch": 0.11041247484909457, + "grad_norm": 0.42577728629112244, + "learning_rate": 3.6797988264878458e-06, + "loss": 0.4365, + "step": 439 + }, + { + "epoch": 0.11066398390342053, + "grad_norm": 0.3953479826450348, + "learning_rate": 3.6881810561609392e-06, + "loss": 0.4392, + "step": 440 + }, + { + "epoch": 0.11091549295774648, + "grad_norm": 0.4147731363773346, + "learning_rate": 3.6965632858340323e-06, + "loss": 0.4588, + "step": 441 + }, + { + "epoch": 0.11116700201207244, + "grad_norm": 0.529985249042511, + "learning_rate": 3.704945515507125e-06, + "loss": 0.4484, + "step": 442 + }, + { + "epoch": 0.11141851106639838, + "grad_norm": 0.4451605975627899, + "learning_rate": 3.713327745180218e-06, + "loss": 0.4692, + "step": 443 + }, + { + "epoch": 0.11167002012072434, + "grad_norm": 0.46244457364082336, + "learning_rate": 3.7217099748533114e-06, + "loss": 0.4871, + "step": 444 + }, + { + "epoch": 0.1119215291750503, + "grad_norm": 0.39799419045448303, + "learning_rate": 3.7300922045264045e-06, + "loss": 0.4153, + "step": 445 + }, + { + "epoch": 0.11217303822937626, + "grad_norm": 0.48522502183914185, + "learning_rate": 3.738474434199497e-06, + "loss": 0.4595, + "step": 446 + }, + { + "epoch": 0.11242454728370221, + "grad_norm": 0.4234901964664459, + "learning_rate": 3.7468566638725906e-06, + "loss": 0.4008, + "step": 447 + }, + { + "epoch": 0.11267605633802817, + "grad_norm": 0.4186321198940277, + "learning_rate": 3.7552388935456836e-06, + "loss": 0.4629, + "step": 448 + }, + { + "epoch": 0.11292756539235413, + "grad_norm": 0.4625905156135559, + "learning_rate": 3.7636211232187762e-06, + "loss": 0.4222, + "step": 449 + }, + { + "epoch": 0.11317907444668009, + "grad_norm": 0.44527822732925415, + "learning_rate": 3.7720033528918697e-06, + "loss": 0.4633, + "step": 450 + }, + { + "epoch": 0.11343058350100603, + "grad_norm": 0.47749683260917664, + "learning_rate": 3.7803855825649628e-06, + "loss": 0.4123, + "step": 451 + }, + { + "epoch": 0.11368209255533199, + "grad_norm": 0.4436284601688385, + "learning_rate": 3.7887678122380554e-06, + "loss": 0.4425, + "step": 452 + }, + { + "epoch": 0.11393360160965794, + "grad_norm": 0.41336312890052795, + "learning_rate": 3.7971500419111484e-06, + "loss": 0.4376, + "step": 453 + }, + { + "epoch": 0.1141851106639839, + "grad_norm": 0.43879643082618713, + "learning_rate": 3.805532271584242e-06, + "loss": 0.4315, + "step": 454 + }, + { + "epoch": 0.11443661971830986, + "grad_norm": 0.44419604539871216, + "learning_rate": 3.8139145012573345e-06, + "loss": 0.4351, + "step": 455 + }, + { + "epoch": 0.11468812877263582, + "grad_norm": 0.46073997020721436, + "learning_rate": 3.822296730930428e-06, + "loss": 0.4417, + "step": 456 + }, + { + "epoch": 0.11493963782696177, + "grad_norm": 0.538664698600769, + "learning_rate": 3.8306789606035215e-06, + "loss": 0.4817, + "step": 457 + }, + { + "epoch": 0.11519114688128773, + "grad_norm": 0.49703216552734375, + "learning_rate": 3.839061190276614e-06, + "loss": 0.4625, + "step": 458 + }, + { + "epoch": 0.11544265593561369, + "grad_norm": 0.4681941568851471, + "learning_rate": 3.847443419949707e-06, + "loss": 0.4454, + "step": 459 + }, + { + "epoch": 0.11569416498993963, + "grad_norm": 0.512788712978363, + "learning_rate": 3.855825649622799e-06, + "loss": 0.4434, + "step": 460 + }, + { + "epoch": 0.11594567404426559, + "grad_norm": 0.5022100210189819, + "learning_rate": 3.864207879295893e-06, + "loss": 0.4626, + "step": 461 + }, + { + "epoch": 0.11619718309859155, + "grad_norm": 0.48100098967552185, + "learning_rate": 3.872590108968986e-06, + "loss": 0.4431, + "step": 462 + }, + { + "epoch": 0.1164486921529175, + "grad_norm": 0.4972802698612213, + "learning_rate": 3.880972338642079e-06, + "loss": 0.4199, + "step": 463 + }, + { + "epoch": 0.11670020120724346, + "grad_norm": 0.492680162191391, + "learning_rate": 3.889354568315172e-06, + "loss": 0.4546, + "step": 464 + }, + { + "epoch": 0.11695171026156942, + "grad_norm": 0.4608025848865509, + "learning_rate": 3.897736797988265e-06, + "loss": 0.4149, + "step": 465 + }, + { + "epoch": 0.11720321931589538, + "grad_norm": 0.4747462868690491, + "learning_rate": 3.906119027661358e-06, + "loss": 0.4254, + "step": 466 + }, + { + "epoch": 0.11745472837022133, + "grad_norm": 0.5062370300292969, + "learning_rate": 3.914501257334451e-06, + "loss": 0.4295, + "step": 467 + }, + { + "epoch": 0.11770623742454728, + "grad_norm": 0.4820232093334198, + "learning_rate": 3.922883487007545e-06, + "loss": 0.4812, + "step": 468 + }, + { + "epoch": 0.11795774647887323, + "grad_norm": 0.5073891282081604, + "learning_rate": 3.931265716680637e-06, + "loss": 0.4456, + "step": 469 + }, + { + "epoch": 0.11820925553319919, + "grad_norm": 0.4702168107032776, + "learning_rate": 3.93964794635373e-06, + "loss": 0.4479, + "step": 470 + }, + { + "epoch": 0.11846076458752515, + "grad_norm": 0.4862816035747528, + "learning_rate": 3.948030176026823e-06, + "loss": 0.4485, + "step": 471 + }, + { + "epoch": 0.11871227364185111, + "grad_norm": 0.49707740545272827, + "learning_rate": 3.956412405699917e-06, + "loss": 0.4292, + "step": 472 + }, + { + "epoch": 0.11896378269617706, + "grad_norm": 0.4554164707660675, + "learning_rate": 3.9647946353730094e-06, + "loss": 0.4791, + "step": 473 + }, + { + "epoch": 0.11921529175050302, + "grad_norm": 0.4359682500362396, + "learning_rate": 3.973176865046103e-06, + "loss": 0.465, + "step": 474 + }, + { + "epoch": 0.11946680080482898, + "grad_norm": 0.47159573435783386, + "learning_rate": 3.9815590947191955e-06, + "loss": 0.46, + "step": 475 + }, + { + "epoch": 0.11971830985915492, + "grad_norm": 0.47286924719810486, + "learning_rate": 3.989941324392288e-06, + "loss": 0.4639, + "step": 476 + }, + { + "epoch": 0.11996981891348088, + "grad_norm": 0.46829286217689514, + "learning_rate": 3.998323554065382e-06, + "loss": 0.4678, + "step": 477 + }, + { + "epoch": 0.12022132796780684, + "grad_norm": 0.44564419984817505, + "learning_rate": 4.006705783738475e-06, + "loss": 0.4401, + "step": 478 + }, + { + "epoch": 0.1204728370221328, + "grad_norm": 0.4393386244773865, + "learning_rate": 4.015088013411568e-06, + "loss": 0.46, + "step": 479 + }, + { + "epoch": 0.12072434607645875, + "grad_norm": 0.4447605311870575, + "learning_rate": 4.02347024308466e-06, + "loss": 0.4711, + "step": 480 + }, + { + "epoch": 0.12097585513078471, + "grad_norm": 0.4248807430267334, + "learning_rate": 4.031852472757754e-06, + "loss": 0.4239, + "step": 481 + }, + { + "epoch": 0.12122736418511067, + "grad_norm": 0.4450969398021698, + "learning_rate": 4.040234702430847e-06, + "loss": 0.4406, + "step": 482 + }, + { + "epoch": 0.12147887323943662, + "grad_norm": 0.5100280046463013, + "learning_rate": 4.04861693210394e-06, + "loss": 0.4406, + "step": 483 + }, + { + "epoch": 0.12173038229376258, + "grad_norm": 0.47610798478126526, + "learning_rate": 4.056999161777033e-06, + "loss": 0.4112, + "step": 484 + }, + { + "epoch": 0.12198189134808853, + "grad_norm": 0.43737804889678955, + "learning_rate": 4.065381391450126e-06, + "loss": 0.4757, + "step": 485 + }, + { + "epoch": 0.12223340040241448, + "grad_norm": 0.4593689441680908, + "learning_rate": 4.073763621123219e-06, + "loss": 0.4445, + "step": 486 + }, + { + "epoch": 0.12248490945674044, + "grad_norm": 0.46989330649375916, + "learning_rate": 4.082145850796312e-06, + "loss": 0.4559, + "step": 487 + }, + { + "epoch": 0.1227364185110664, + "grad_norm": 0.48121753334999084, + "learning_rate": 4.090528080469406e-06, + "loss": 0.4492, + "step": 488 + }, + { + "epoch": 0.12298792756539235, + "grad_norm": 0.4634334444999695, + "learning_rate": 4.098910310142498e-06, + "loss": 0.4346, + "step": 489 + }, + { + "epoch": 0.12323943661971831, + "grad_norm": 0.4351527988910675, + "learning_rate": 4.107292539815591e-06, + "loss": 0.4539, + "step": 490 + }, + { + "epoch": 0.12349094567404427, + "grad_norm": 0.47944945096969604, + "learning_rate": 4.115674769488684e-06, + "loss": 0.4655, + "step": 491 + }, + { + "epoch": 0.12374245472837023, + "grad_norm": 0.45442450046539307, + "learning_rate": 4.124056999161778e-06, + "loss": 0.4277, + "step": 492 + }, + { + "epoch": 0.12399396378269617, + "grad_norm": 0.4729836583137512, + "learning_rate": 4.1324392288348704e-06, + "loss": 0.4299, + "step": 493 + }, + { + "epoch": 0.12424547283702213, + "grad_norm": 0.40985509753227234, + "learning_rate": 4.140821458507964e-06, + "loss": 0.393, + "step": 494 + }, + { + "epoch": 0.12449698189134809, + "grad_norm": 0.49194350838661194, + "learning_rate": 4.1492036881810565e-06, + "loss": 0.4523, + "step": 495 + }, + { + "epoch": 0.12474849094567404, + "grad_norm": 0.47581902146339417, + "learning_rate": 4.157585917854149e-06, + "loss": 0.4494, + "step": 496 + }, + { + "epoch": 0.125, + "grad_norm": 0.4293755888938904, + "learning_rate": 4.165968147527243e-06, + "loss": 0.4087, + "step": 497 + }, + { + "epoch": 0.12525150905432594, + "grad_norm": 0.41463249921798706, + "learning_rate": 4.174350377200336e-06, + "loss": 0.4436, + "step": 498 + }, + { + "epoch": 0.12550301810865191, + "grad_norm": 0.4227687120437622, + "learning_rate": 4.182732606873429e-06, + "loss": 0.4367, + "step": 499 + }, + { + "epoch": 0.12575452716297786, + "grad_norm": 0.4326210916042328, + "learning_rate": 4.191114836546521e-06, + "loss": 0.4265, + "step": 500 + }, + { + "epoch": 0.12600603621730383, + "grad_norm": 0.4279535412788391, + "learning_rate": 4.199497066219615e-06, + "loss": 0.4702, + "step": 501 + }, + { + "epoch": 0.12625754527162977, + "grad_norm": 0.4595158100128174, + "learning_rate": 4.2078792958927074e-06, + "loss": 0.4425, + "step": 502 + }, + { + "epoch": 0.12650905432595574, + "grad_norm": 0.41721752285957336, + "learning_rate": 4.216261525565801e-06, + "loss": 0.4491, + "step": 503 + }, + { + "epoch": 0.1267605633802817, + "grad_norm": 0.43518510460853577, + "learning_rate": 4.224643755238894e-06, + "loss": 0.4387, + "step": 504 + }, + { + "epoch": 0.12701207243460766, + "grad_norm": 0.4557619094848633, + "learning_rate": 4.233025984911987e-06, + "loss": 0.4852, + "step": 505 + }, + { + "epoch": 0.1272635814889336, + "grad_norm": 0.4129253327846527, + "learning_rate": 4.24140821458508e-06, + "loss": 0.4359, + "step": 506 + }, + { + "epoch": 0.12751509054325955, + "grad_norm": 0.4084147810935974, + "learning_rate": 4.249790444258173e-06, + "loss": 0.4316, + "step": 507 + }, + { + "epoch": 0.12776659959758552, + "grad_norm": 0.4409344792366028, + "learning_rate": 4.258172673931267e-06, + "loss": 0.4375, + "step": 508 + }, + { + "epoch": 0.12801810865191146, + "grad_norm": 0.4991019368171692, + "learning_rate": 4.266554903604359e-06, + "loss": 0.4551, + "step": 509 + }, + { + "epoch": 0.12826961770623743, + "grad_norm": 0.4926038682460785, + "learning_rate": 4.274937133277452e-06, + "loss": 0.4643, + "step": 510 + }, + { + "epoch": 0.12852112676056338, + "grad_norm": 0.46267032623291016, + "learning_rate": 4.283319362950545e-06, + "loss": 0.4648, + "step": 511 + }, + { + "epoch": 0.12877263581488935, + "grad_norm": 0.45014211535453796, + "learning_rate": 4.291701592623638e-06, + "loss": 0.4574, + "step": 512 + }, + { + "epoch": 0.1290241448692153, + "grad_norm": 0.5043582320213318, + "learning_rate": 4.300083822296731e-06, + "loss": 0.463, + "step": 513 + }, + { + "epoch": 0.12927565392354123, + "grad_norm": 0.41695088148117065, + "learning_rate": 4.308466051969825e-06, + "loss": 0.4247, + "step": 514 + }, + { + "epoch": 0.1295271629778672, + "grad_norm": 0.4031003713607788, + "learning_rate": 4.3168482816429175e-06, + "loss": 0.4563, + "step": 515 + }, + { + "epoch": 0.12977867203219315, + "grad_norm": 0.4944654405117035, + "learning_rate": 4.32523051131601e-06, + "loss": 0.4323, + "step": 516 + }, + { + "epoch": 0.13003018108651912, + "grad_norm": 0.5555236339569092, + "learning_rate": 4.333612740989104e-06, + "loss": 0.4263, + "step": 517 + }, + { + "epoch": 0.13028169014084506, + "grad_norm": 0.4785892963409424, + "learning_rate": 4.341994970662197e-06, + "loss": 0.4331, + "step": 518 + }, + { + "epoch": 0.13053319919517103, + "grad_norm": 0.4259514808654785, + "learning_rate": 4.35037720033529e-06, + "loss": 0.4495, + "step": 519 + }, + { + "epoch": 0.13078470824949698, + "grad_norm": 0.4405849575996399, + "learning_rate": 4.358759430008382e-06, + "loss": 0.4482, + "step": 520 + }, + { + "epoch": 0.13103621730382295, + "grad_norm": 0.4837403893470764, + "learning_rate": 4.367141659681476e-06, + "loss": 0.4452, + "step": 521 + }, + { + "epoch": 0.1312877263581489, + "grad_norm": 0.44248178601264954, + "learning_rate": 4.3755238893545684e-06, + "loss": 0.4755, + "step": 522 + }, + { + "epoch": 0.13153923541247484, + "grad_norm": 0.4511367678642273, + "learning_rate": 4.383906119027662e-06, + "loss": 0.4325, + "step": 523 + }, + { + "epoch": 0.1317907444668008, + "grad_norm": 0.4366004168987274, + "learning_rate": 4.3922883487007545e-06, + "loss": 0.4221, + "step": 524 + }, + { + "epoch": 0.13204225352112675, + "grad_norm": 0.4173247218132019, + "learning_rate": 4.400670578373848e-06, + "loss": 0.4719, + "step": 525 + }, + { + "epoch": 0.13229376257545272, + "grad_norm": 0.4634314179420471, + "learning_rate": 4.409052808046941e-06, + "loss": 0.4307, + "step": 526 + }, + { + "epoch": 0.13254527162977867, + "grad_norm": 0.464970201253891, + "learning_rate": 4.417435037720033e-06, + "loss": 0.4247, + "step": 527 + }, + { + "epoch": 0.13279678068410464, + "grad_norm": 0.49737635254859924, + "learning_rate": 4.425817267393127e-06, + "loss": 0.4293, + "step": 528 + }, + { + "epoch": 0.13304828973843058, + "grad_norm": 0.4311039447784424, + "learning_rate": 4.43419949706622e-06, + "loss": 0.4513, + "step": 529 + }, + { + "epoch": 0.13329979879275655, + "grad_norm": 0.46590545773506165, + "learning_rate": 4.442581726739313e-06, + "loss": 0.4163, + "step": 530 + }, + { + "epoch": 0.1335513078470825, + "grad_norm": 0.4511418640613556, + "learning_rate": 4.450963956412406e-06, + "loss": 0.4741, + "step": 531 + }, + { + "epoch": 0.13380281690140844, + "grad_norm": 0.4536955952644348, + "learning_rate": 4.459346186085499e-06, + "loss": 0.4525, + "step": 532 + }, + { + "epoch": 0.1340543259557344, + "grad_norm": 0.4565694332122803, + "learning_rate": 4.467728415758592e-06, + "loss": 0.4405, + "step": 533 + }, + { + "epoch": 0.13430583501006035, + "grad_norm": 0.4305288791656494, + "learning_rate": 4.476110645431685e-06, + "loss": 0.4585, + "step": 534 + }, + { + "epoch": 0.13455734406438632, + "grad_norm": 0.4427644610404968, + "learning_rate": 4.4844928751047785e-06, + "loss": 0.4338, + "step": 535 + }, + { + "epoch": 0.13480885311871227, + "grad_norm": 0.4909707307815552, + "learning_rate": 4.492875104777871e-06, + "loss": 0.4331, + "step": 536 + }, + { + "epoch": 0.13506036217303824, + "grad_norm": 0.5035997629165649, + "learning_rate": 4.501257334450964e-06, + "loss": 0.4287, + "step": 537 + }, + { + "epoch": 0.13531187122736418, + "grad_norm": 0.45080435276031494, + "learning_rate": 4.509639564124057e-06, + "loss": 0.452, + "step": 538 + }, + { + "epoch": 0.13556338028169015, + "grad_norm": 0.4062177836894989, + "learning_rate": 4.518021793797151e-06, + "loss": 0.421, + "step": 539 + }, + { + "epoch": 0.1358148893360161, + "grad_norm": 0.47915151715278625, + "learning_rate": 4.526404023470243e-06, + "loss": 0.4678, + "step": 540 + }, + { + "epoch": 0.13606639839034204, + "grad_norm": 0.4649512469768524, + "learning_rate": 4.534786253143337e-06, + "loss": 0.4532, + "step": 541 + }, + { + "epoch": 0.136317907444668, + "grad_norm": 0.5145220756530762, + "learning_rate": 4.5431684828164294e-06, + "loss": 0.4529, + "step": 542 + }, + { + "epoch": 0.13656941649899396, + "grad_norm": 0.39938247203826904, + "learning_rate": 4.551550712489523e-06, + "loss": 0.4457, + "step": 543 + }, + { + "epoch": 0.13682092555331993, + "grad_norm": 0.5199249386787415, + "learning_rate": 4.5599329421626155e-06, + "loss": 0.442, + "step": 544 + }, + { + "epoch": 0.13707243460764587, + "grad_norm": 0.43543845415115356, + "learning_rate": 4.568315171835709e-06, + "loss": 0.4312, + "step": 545 + }, + { + "epoch": 0.13732394366197184, + "grad_norm": 0.44674405455589294, + "learning_rate": 4.576697401508802e-06, + "loss": 0.4456, + "step": 546 + }, + { + "epoch": 0.13757545271629779, + "grad_norm": 0.45393526554107666, + "learning_rate": 4.585079631181894e-06, + "loss": 0.4537, + "step": 547 + }, + { + "epoch": 0.13782696177062373, + "grad_norm": 0.43489089608192444, + "learning_rate": 4.593461860854988e-06, + "loss": 0.398, + "step": 548 + }, + { + "epoch": 0.1380784708249497, + "grad_norm": 0.41286396980285645, + "learning_rate": 4.601844090528081e-06, + "loss": 0.4434, + "step": 549 + }, + { + "epoch": 0.13832997987927564, + "grad_norm": 0.44615626335144043, + "learning_rate": 4.610226320201174e-06, + "loss": 0.4542, + "step": 550 + }, + { + "epoch": 0.13858148893360162, + "grad_norm": 0.4532471001148224, + "learning_rate": 4.6186085498742664e-06, + "loss": 0.462, + "step": 551 + }, + { + "epoch": 0.13883299798792756, + "grad_norm": 0.42889168858528137, + "learning_rate": 4.62699077954736e-06, + "loss": 0.4172, + "step": 552 + }, + { + "epoch": 0.13908450704225353, + "grad_norm": 0.46256425976753235, + "learning_rate": 4.6353730092204525e-06, + "loss": 0.4216, + "step": 553 + }, + { + "epoch": 0.13933601609657947, + "grad_norm": 0.4532136023044586, + "learning_rate": 4.643755238893546e-06, + "loss": 0.4269, + "step": 554 + }, + { + "epoch": 0.13958752515090544, + "grad_norm": 0.43736207485198975, + "learning_rate": 4.6521374685666395e-06, + "loss": 0.4318, + "step": 555 + }, + { + "epoch": 0.1398390342052314, + "grad_norm": 0.40762320160865784, + "learning_rate": 4.660519698239732e-06, + "loss": 0.4601, + "step": 556 + }, + { + "epoch": 0.14009054325955733, + "grad_norm": 0.4910624027252197, + "learning_rate": 4.668901927912825e-06, + "loss": 0.4335, + "step": 557 + }, + { + "epoch": 0.1403420523138833, + "grad_norm": 0.4497995972633362, + "learning_rate": 4.677284157585918e-06, + "loss": 0.4311, + "step": 558 + }, + { + "epoch": 0.14059356136820925, + "grad_norm": 0.4066920876502991, + "learning_rate": 4.685666387259012e-06, + "loss": 0.4364, + "step": 559 + }, + { + "epoch": 0.14084507042253522, + "grad_norm": 0.4641585350036621, + "learning_rate": 4.694048616932104e-06, + "loss": 0.4406, + "step": 560 + }, + { + "epoch": 0.14109657947686116, + "grad_norm": 0.46958115696907043, + "learning_rate": 4.702430846605197e-06, + "loss": 0.4608, + "step": 561 + }, + { + "epoch": 0.14134808853118713, + "grad_norm": 0.45457375049591064, + "learning_rate": 4.71081307627829e-06, + "loss": 0.4342, + "step": 562 + }, + { + "epoch": 0.14159959758551308, + "grad_norm": 0.4222368597984314, + "learning_rate": 4.719195305951383e-06, + "loss": 0.4364, + "step": 563 + }, + { + "epoch": 0.14185110663983905, + "grad_norm": 0.4284665882587433, + "learning_rate": 4.7275775356244765e-06, + "loss": 0.4101, + "step": 564 + }, + { + "epoch": 0.142102615694165, + "grad_norm": 0.44402772188186646, + "learning_rate": 4.73595976529757e-06, + "loss": 0.3907, + "step": 565 + }, + { + "epoch": 0.14235412474849093, + "grad_norm": 0.4303772747516632, + "learning_rate": 4.744341994970663e-06, + "loss": 0.4507, + "step": 566 + }, + { + "epoch": 0.1426056338028169, + "grad_norm": 0.4649859070777893, + "learning_rate": 4.752724224643755e-06, + "loss": 0.4351, + "step": 567 + }, + { + "epoch": 0.14285714285714285, + "grad_norm": 0.4507344365119934, + "learning_rate": 4.761106454316849e-06, + "loss": 0.4358, + "step": 568 + }, + { + "epoch": 0.14310865191146882, + "grad_norm": 0.41727519035339355, + "learning_rate": 4.769488683989942e-06, + "loss": 0.4509, + "step": 569 + }, + { + "epoch": 0.14336016096579476, + "grad_norm": 0.4347597658634186, + "learning_rate": 4.777870913663035e-06, + "loss": 0.4301, + "step": 570 + }, + { + "epoch": 0.14361167002012074, + "grad_norm": 0.46765145659446716, + "learning_rate": 4.7862531433361274e-06, + "loss": 0.4384, + "step": 571 + }, + { + "epoch": 0.14386317907444668, + "grad_norm": 0.46579083800315857, + "learning_rate": 4.794635373009221e-06, + "loss": 0.4452, + "step": 572 + }, + { + "epoch": 0.14411468812877262, + "grad_norm": 0.4726623296737671, + "learning_rate": 4.8030176026823135e-06, + "loss": 0.4457, + "step": 573 + }, + { + "epoch": 0.1443661971830986, + "grad_norm": 0.4563959240913391, + "learning_rate": 4.811399832355407e-06, + "loss": 0.4354, + "step": 574 + }, + { + "epoch": 0.14461770623742454, + "grad_norm": 0.4445574879646301, + "learning_rate": 4.8197820620285005e-06, + "loss": 0.447, + "step": 575 + }, + { + "epoch": 0.1448692152917505, + "grad_norm": 0.43714749813079834, + "learning_rate": 4.828164291701593e-06, + "loss": 0.4659, + "step": 576 + }, + { + "epoch": 0.14512072434607645, + "grad_norm": 0.4672676920890808, + "learning_rate": 4.836546521374686e-06, + "loss": 0.3998, + "step": 577 + }, + { + "epoch": 0.14537223340040242, + "grad_norm": 0.4701994061470032, + "learning_rate": 4.844928751047779e-06, + "loss": 0.4256, + "step": 578 + }, + { + "epoch": 0.14562374245472837, + "grad_norm": 0.44352054595947266, + "learning_rate": 4.853310980720873e-06, + "loss": 0.4475, + "step": 579 + }, + { + "epoch": 0.14587525150905434, + "grad_norm": 0.44086214900016785, + "learning_rate": 4.861693210393965e-06, + "loss": 0.4671, + "step": 580 + }, + { + "epoch": 0.14612676056338028, + "grad_norm": 0.4606599509716034, + "learning_rate": 4.870075440067058e-06, + "loss": 0.4333, + "step": 581 + }, + { + "epoch": 0.14637826961770622, + "grad_norm": 0.42309969663619995, + "learning_rate": 4.878457669740151e-06, + "loss": 0.4494, + "step": 582 + }, + { + "epoch": 0.1466297786720322, + "grad_norm": 0.48204946517944336, + "learning_rate": 4.886839899413244e-06, + "loss": 0.4518, + "step": 583 + }, + { + "epoch": 0.14688128772635814, + "grad_norm": 0.4570547044277191, + "learning_rate": 4.8952221290863375e-06, + "loss": 0.46, + "step": 584 + }, + { + "epoch": 0.1471327967806841, + "grad_norm": 0.4400968551635742, + "learning_rate": 4.903604358759431e-06, + "loss": 0.415, + "step": 585 + }, + { + "epoch": 0.14738430583501005, + "grad_norm": 0.48668766021728516, + "learning_rate": 4.911986588432524e-06, + "loss": 0.4209, + "step": 586 + }, + { + "epoch": 0.14763581488933603, + "grad_norm": 0.4713693857192993, + "learning_rate": 4.920368818105616e-06, + "loss": 0.4527, + "step": 587 + }, + { + "epoch": 0.14788732394366197, + "grad_norm": 0.4478401839733124, + "learning_rate": 4.928751047778709e-06, + "loss": 0.4484, + "step": 588 + }, + { + "epoch": 0.14813883299798794, + "grad_norm": 0.46621429920196533, + "learning_rate": 4.937133277451802e-06, + "loss": 0.4337, + "step": 589 + }, + { + "epoch": 0.14839034205231388, + "grad_norm": 0.4544297456741333, + "learning_rate": 4.945515507124896e-06, + "loss": 0.4596, + "step": 590 + }, + { + "epoch": 0.14864185110663983, + "grad_norm": 0.4180624186992645, + "learning_rate": 4.9538977367979884e-06, + "loss": 0.4492, + "step": 591 + }, + { + "epoch": 0.1488933601609658, + "grad_norm": 0.4281826615333557, + "learning_rate": 4.962279966471082e-06, + "loss": 0.4149, + "step": 592 + }, + { + "epoch": 0.14914486921529174, + "grad_norm": 0.4696996808052063, + "learning_rate": 4.9706621961441745e-06, + "loss": 0.4426, + "step": 593 + }, + { + "epoch": 0.1493963782696177, + "grad_norm": 0.42568251490592957, + "learning_rate": 4.979044425817268e-06, + "loss": 0.4312, + "step": 594 + }, + { + "epoch": 0.14964788732394366, + "grad_norm": 0.45928239822387695, + "learning_rate": 4.9874266554903615e-06, + "loss": 0.4318, + "step": 595 + }, + { + "epoch": 0.14989939637826963, + "grad_norm": 0.45567813515663147, + "learning_rate": 4.995808885163454e-06, + "loss": 0.457, + "step": 596 + }, + { + "epoch": 0.15015090543259557, + "grad_norm": 0.42233970761299133, + "learning_rate": 5.0041911148365476e-06, + "loss": 0.4411, + "step": 597 + }, + { + "epoch": 0.15040241448692154, + "grad_norm": 0.5426906943321228, + "learning_rate": 5.01257334450964e-06, + "loss": 0.4288, + "step": 598 + }, + { + "epoch": 0.1506539235412475, + "grad_norm": 0.43617966771125793, + "learning_rate": 5.020955574182733e-06, + "loss": 0.4557, + "step": 599 + }, + { + "epoch": 0.15090543259557343, + "grad_norm": 0.42684197425842285, + "learning_rate": 5.029337803855826e-06, + "loss": 0.4727, + "step": 600 + }, + { + "epoch": 0.1511569416498994, + "grad_norm": 0.4799075126647949, + "learning_rate": 5.037720033528919e-06, + "loss": 0.4247, + "step": 601 + }, + { + "epoch": 0.15140845070422534, + "grad_norm": 0.47558286786079407, + "learning_rate": 5.0461022632020115e-06, + "loss": 0.4396, + "step": 602 + }, + { + "epoch": 0.15165995975855132, + "grad_norm": 0.6897980570793152, + "learning_rate": 5.054484492875105e-06, + "loss": 0.4226, + "step": 603 + }, + { + "epoch": 0.15191146881287726, + "grad_norm": 0.49807223677635193, + "learning_rate": 5.0628667225481985e-06, + "loss": 0.4555, + "step": 604 + }, + { + "epoch": 0.15216297786720323, + "grad_norm": 0.4609929025173187, + "learning_rate": 5.071248952221292e-06, + "loss": 0.44, + "step": 605 + }, + { + "epoch": 0.15241448692152917, + "grad_norm": 0.42947277426719666, + "learning_rate": 5.079631181894385e-06, + "loss": 0.4162, + "step": 606 + }, + { + "epoch": 0.15266599597585512, + "grad_norm": 0.4348846971988678, + "learning_rate": 5.088013411567477e-06, + "loss": 0.4073, + "step": 607 + }, + { + "epoch": 0.1529175050301811, + "grad_norm": 0.5616949200630188, + "learning_rate": 5.09639564124057e-06, + "loss": 0.4516, + "step": 608 + }, + { + "epoch": 0.15316901408450703, + "grad_norm": 0.4498026371002197, + "learning_rate": 5.104777870913663e-06, + "loss": 0.3877, + "step": 609 + }, + { + "epoch": 0.153420523138833, + "grad_norm": 0.4951980412006378, + "learning_rate": 5.113160100586757e-06, + "loss": 0.4378, + "step": 610 + }, + { + "epoch": 0.15367203219315895, + "grad_norm": 0.5059778094291687, + "learning_rate": 5.12154233025985e-06, + "loss": 0.4475, + "step": 611 + }, + { + "epoch": 0.15392354124748492, + "grad_norm": 0.4149338901042938, + "learning_rate": 5.129924559932943e-06, + "loss": 0.451, + "step": 612 + }, + { + "epoch": 0.15417505030181086, + "grad_norm": 0.4755608141422272, + "learning_rate": 5.1383067896060355e-06, + "loss": 0.4458, + "step": 613 + }, + { + "epoch": 0.15442655935613683, + "grad_norm": 0.4765431582927704, + "learning_rate": 5.146689019279128e-06, + "loss": 0.4514, + "step": 614 + }, + { + "epoch": 0.15467806841046278, + "grad_norm": 0.445700079202652, + "learning_rate": 5.155071248952222e-06, + "loss": 0.4362, + "step": 615 + }, + { + "epoch": 0.15492957746478872, + "grad_norm": 0.46473878622055054, + "learning_rate": 5.163453478625314e-06, + "loss": 0.4453, + "step": 616 + }, + { + "epoch": 0.1551810865191147, + "grad_norm": 0.471645712852478, + "learning_rate": 5.1718357082984086e-06, + "loss": 0.4708, + "step": 617 + }, + { + "epoch": 0.15543259557344064, + "grad_norm": 0.4333302676677704, + "learning_rate": 5.180217937971501e-06, + "loss": 0.436, + "step": 618 + }, + { + "epoch": 0.1556841046277666, + "grad_norm": 0.414048433303833, + "learning_rate": 5.188600167644594e-06, + "loss": 0.4313, + "step": 619 + }, + { + "epoch": 0.15593561368209255, + "grad_norm": 0.4397493302822113, + "learning_rate": 5.196982397317687e-06, + "loss": 0.4555, + "step": 620 + }, + { + "epoch": 0.15618712273641852, + "grad_norm": 0.43406859040260315, + "learning_rate": 5.20536462699078e-06, + "loss": 0.4276, + "step": 621 + }, + { + "epoch": 0.15643863179074446, + "grad_norm": 0.39688730239868164, + "learning_rate": 5.2137468566638725e-06, + "loss": 0.4435, + "step": 622 + }, + { + "epoch": 0.15669014084507044, + "grad_norm": 0.4431197941303253, + "learning_rate": 5.222129086336965e-06, + "loss": 0.4676, + "step": 623 + }, + { + "epoch": 0.15694164989939638, + "grad_norm": 0.3907865881919861, + "learning_rate": 5.2305113160100595e-06, + "loss": 0.4142, + "step": 624 + }, + { + "epoch": 0.15719315895372232, + "grad_norm": 0.48109179735183716, + "learning_rate": 5.238893545683152e-06, + "loss": 0.4503, + "step": 625 + }, + { + "epoch": 0.1574446680080483, + "grad_norm": 0.42568618059158325, + "learning_rate": 5.247275775356246e-06, + "loss": 0.4298, + "step": 626 + }, + { + "epoch": 0.15769617706237424, + "grad_norm": 0.4349900186061859, + "learning_rate": 5.255658005029338e-06, + "loss": 0.4306, + "step": 627 + }, + { + "epoch": 0.1579476861167002, + "grad_norm": 0.45168644189834595, + "learning_rate": 5.264040234702431e-06, + "loss": 0.4328, + "step": 628 + }, + { + "epoch": 0.15819919517102615, + "grad_norm": 0.4641101360321045, + "learning_rate": 5.272422464375524e-06, + "loss": 0.4293, + "step": 629 + }, + { + "epoch": 0.15845070422535212, + "grad_norm": 0.450825035572052, + "learning_rate": 5.280804694048617e-06, + "loss": 0.4415, + "step": 630 + }, + { + "epoch": 0.15870221327967807, + "grad_norm": 0.4262521266937256, + "learning_rate": 5.289186923721711e-06, + "loss": 0.4678, + "step": 631 + }, + { + "epoch": 0.158953722334004, + "grad_norm": 0.4274531304836273, + "learning_rate": 5.297569153394804e-06, + "loss": 0.4443, + "step": 632 + }, + { + "epoch": 0.15920523138832998, + "grad_norm": 0.46715742349624634, + "learning_rate": 5.3059513830678965e-06, + "loss": 0.4056, + "step": 633 + }, + { + "epoch": 0.15945674044265593, + "grad_norm": 0.4867947995662689, + "learning_rate": 5.314333612740989e-06, + "loss": 0.4539, + "step": 634 + }, + { + "epoch": 0.1597082494969819, + "grad_norm": 0.44424811005592346, + "learning_rate": 5.322715842414083e-06, + "loss": 0.4558, + "step": 635 + }, + { + "epoch": 0.15995975855130784, + "grad_norm": 0.4673738181591034, + "learning_rate": 5.331098072087175e-06, + "loss": 0.4272, + "step": 636 + }, + { + "epoch": 0.1602112676056338, + "grad_norm": 0.4130132794380188, + "learning_rate": 5.3394803017602695e-06, + "loss": 0.4446, + "step": 637 + }, + { + "epoch": 0.16046277665995975, + "grad_norm": 0.43853166699409485, + "learning_rate": 5.347862531433362e-06, + "loss": 0.4472, + "step": 638 + }, + { + "epoch": 0.16071428571428573, + "grad_norm": 0.4459279477596283, + "learning_rate": 5.356244761106455e-06, + "loss": 0.4636, + "step": 639 + }, + { + "epoch": 0.16096579476861167, + "grad_norm": 0.4516228437423706, + "learning_rate": 5.364626990779548e-06, + "loss": 0.4081, + "step": 640 + }, + { + "epoch": 0.1612173038229376, + "grad_norm": 0.41011399030685425, + "learning_rate": 5.373009220452641e-06, + "loss": 0.4401, + "step": 641 + }, + { + "epoch": 0.16146881287726358, + "grad_norm": 0.3807559311389923, + "learning_rate": 5.3813914501257335e-06, + "loss": 0.4051, + "step": 642 + }, + { + "epoch": 0.16172032193158953, + "grad_norm": 0.41934090852737427, + "learning_rate": 5.389773679798826e-06, + "loss": 0.4234, + "step": 643 + }, + { + "epoch": 0.1619718309859155, + "grad_norm": 0.39974445104599, + "learning_rate": 5.3981559094719205e-06, + "loss": 0.421, + "step": 644 + }, + { + "epoch": 0.16222334004024144, + "grad_norm": 0.4237842261791229, + "learning_rate": 5.406538139145013e-06, + "loss": 0.4301, + "step": 645 + }, + { + "epoch": 0.16247484909456741, + "grad_norm": 0.4234157204627991, + "learning_rate": 5.4149203688181066e-06, + "loss": 0.414, + "step": 646 + }, + { + "epoch": 0.16272635814889336, + "grad_norm": 0.4288012385368347, + "learning_rate": 5.423302598491199e-06, + "loss": 0.4019, + "step": 647 + }, + { + "epoch": 0.16297786720321933, + "grad_norm": 0.4307977855205536, + "learning_rate": 5.431684828164292e-06, + "loss": 0.4164, + "step": 648 + }, + { + "epoch": 0.16322937625754527, + "grad_norm": 0.4072883725166321, + "learning_rate": 5.4400670578373844e-06, + "loss": 0.4237, + "step": 649 + }, + { + "epoch": 0.16348088531187122, + "grad_norm": 0.44802358746528625, + "learning_rate": 5.448449287510478e-06, + "loss": 0.4257, + "step": 650 + }, + { + "epoch": 0.1637323943661972, + "grad_norm": 0.44640132784843445, + "learning_rate": 5.456831517183571e-06, + "loss": 0.4128, + "step": 651 + }, + { + "epoch": 0.16398390342052313, + "grad_norm": 0.45102569460868835, + "learning_rate": 5.465213746856665e-06, + "loss": 0.4205, + "step": 652 + }, + { + "epoch": 0.1642354124748491, + "grad_norm": 0.4356350004673004, + "learning_rate": 5.4735959765297575e-06, + "loss": 0.4444, + "step": 653 + }, + { + "epoch": 0.16448692152917505, + "grad_norm": 0.4442964196205139, + "learning_rate": 5.48197820620285e-06, + "loss": 0.4406, + "step": 654 + }, + { + "epoch": 0.16473843058350102, + "grad_norm": 0.5060700178146362, + "learning_rate": 5.490360435875944e-06, + "loss": 0.4425, + "step": 655 + }, + { + "epoch": 0.16498993963782696, + "grad_norm": 0.45222505927085876, + "learning_rate": 5.498742665549036e-06, + "loss": 0.4296, + "step": 656 + }, + { + "epoch": 0.16524144869215293, + "grad_norm": 0.47446000576019287, + "learning_rate": 5.507124895222129e-06, + "loss": 0.4272, + "step": 657 + }, + { + "epoch": 0.16549295774647887, + "grad_norm": 0.4703046977519989, + "learning_rate": 5.515507124895223e-06, + "loss": 0.4174, + "step": 658 + }, + { + "epoch": 0.16574446680080482, + "grad_norm": 0.48616233468055725, + "learning_rate": 5.523889354568316e-06, + "loss": 0.4248, + "step": 659 + }, + { + "epoch": 0.1659959758551308, + "grad_norm": 0.4568127989768982, + "learning_rate": 5.532271584241408e-06, + "loss": 0.4402, + "step": 660 + }, + { + "epoch": 0.16624748490945673, + "grad_norm": 0.44677838683128357, + "learning_rate": 5.540653813914502e-06, + "loss": 0.4516, + "step": 661 + }, + { + "epoch": 0.1664989939637827, + "grad_norm": 0.42325931787490845, + "learning_rate": 5.5490360435875945e-06, + "loss": 0.4393, + "step": 662 + }, + { + "epoch": 0.16675050301810865, + "grad_norm": 0.4119207561016083, + "learning_rate": 5.557418273260687e-06, + "loss": 0.3994, + "step": 663 + }, + { + "epoch": 0.16700201207243462, + "grad_norm": 0.45278915762901306, + "learning_rate": 5.5658005029337815e-06, + "loss": 0.4034, + "step": 664 + }, + { + "epoch": 0.16725352112676056, + "grad_norm": 0.5348659157752991, + "learning_rate": 5.574182732606874e-06, + "loss": 0.4471, + "step": 665 + }, + { + "epoch": 0.1675050301810865, + "grad_norm": 0.44384071230888367, + "learning_rate": 5.5825649622799676e-06, + "loss": 0.4334, + "step": 666 + }, + { + "epoch": 0.16775653923541248, + "grad_norm": 0.4300418496131897, + "learning_rate": 5.59094719195306e-06, + "loss": 0.4094, + "step": 667 + }, + { + "epoch": 0.16800804828973842, + "grad_norm": 0.4564829170703888, + "learning_rate": 5.599329421626153e-06, + "loss": 0.4138, + "step": 668 + }, + { + "epoch": 0.1682595573440644, + "grad_norm": 0.5108182430267334, + "learning_rate": 5.6077116512992454e-06, + "loss": 0.4135, + "step": 669 + }, + { + "epoch": 0.16851106639839034, + "grad_norm": 0.4513753652572632, + "learning_rate": 5.616093880972339e-06, + "loss": 0.456, + "step": 670 + }, + { + "epoch": 0.1687625754527163, + "grad_norm": 0.4487459659576416, + "learning_rate": 5.624476110645432e-06, + "loss": 0.4374, + "step": 671 + }, + { + "epoch": 0.16901408450704225, + "grad_norm": 0.48467063903808594, + "learning_rate": 5.632858340318526e-06, + "loss": 0.4575, + "step": 672 + }, + { + "epoch": 0.16926559356136822, + "grad_norm": 0.47948938608169556, + "learning_rate": 5.6412405699916185e-06, + "loss": 0.4399, + "step": 673 + }, + { + "epoch": 0.16951710261569417, + "grad_norm": 0.4448109269142151, + "learning_rate": 5.649622799664711e-06, + "loss": 0.4395, + "step": 674 + }, + { + "epoch": 0.1697686116700201, + "grad_norm": 0.44593971967697144, + "learning_rate": 5.658005029337804e-06, + "loss": 0.4382, + "step": 675 + }, + { + "epoch": 0.17002012072434608, + "grad_norm": 0.4504484236240387, + "learning_rate": 5.666387259010897e-06, + "loss": 0.4252, + "step": 676 + }, + { + "epoch": 0.17027162977867202, + "grad_norm": 0.5128602981567383, + "learning_rate": 5.67476948868399e-06, + "loss": 0.4444, + "step": 677 + }, + { + "epoch": 0.170523138832998, + "grad_norm": 0.4349728226661682, + "learning_rate": 5.683151718357084e-06, + "loss": 0.4152, + "step": 678 + }, + { + "epoch": 0.17077464788732394, + "grad_norm": 0.4326830804347992, + "learning_rate": 5.691533948030177e-06, + "loss": 0.4416, + "step": 679 + }, + { + "epoch": 0.1710261569416499, + "grad_norm": 0.42152220010757446, + "learning_rate": 5.699916177703269e-06, + "loss": 0.4363, + "step": 680 + }, + { + "epoch": 0.17127766599597585, + "grad_norm": 0.4485848844051361, + "learning_rate": 5.708298407376363e-06, + "loss": 0.4467, + "step": 681 + }, + { + "epoch": 0.17152917505030182, + "grad_norm": 0.42400580644607544, + "learning_rate": 5.7166806370494555e-06, + "loss": 0.4293, + "step": 682 + }, + { + "epoch": 0.17178068410462777, + "grad_norm": 0.4600382447242737, + "learning_rate": 5.725062866722548e-06, + "loss": 0.4293, + "step": 683 + }, + { + "epoch": 0.1720321931589537, + "grad_norm": 0.5220768451690674, + "learning_rate": 5.733445096395641e-06, + "loss": 0.3961, + "step": 684 + }, + { + "epoch": 0.17228370221327968, + "grad_norm": 0.5387700796127319, + "learning_rate": 5.741827326068735e-06, + "loss": 0.468, + "step": 685 + }, + { + "epoch": 0.17253521126760563, + "grad_norm": 0.4986119270324707, + "learning_rate": 5.750209555741828e-06, + "loss": 0.4452, + "step": 686 + }, + { + "epoch": 0.1727867203219316, + "grad_norm": 0.4661785066127777, + "learning_rate": 5.758591785414921e-06, + "loss": 0.4413, + "step": 687 + }, + { + "epoch": 0.17303822937625754, + "grad_norm": 0.4665091335773468, + "learning_rate": 5.766974015088014e-06, + "loss": 0.4327, + "step": 688 + }, + { + "epoch": 0.1732897384305835, + "grad_norm": 0.46862325072288513, + "learning_rate": 5.7753562447611064e-06, + "loss": 0.4422, + "step": 689 + }, + { + "epoch": 0.17354124748490946, + "grad_norm": 0.47405147552490234, + "learning_rate": 5.7837384744342e-06, + "loss": 0.4377, + "step": 690 + }, + { + "epoch": 0.1737927565392354, + "grad_norm": 0.49012836813926697, + "learning_rate": 5.792120704107293e-06, + "loss": 0.45, + "step": 691 + }, + { + "epoch": 0.17404426559356137, + "grad_norm": 0.49610865116119385, + "learning_rate": 5.800502933780387e-06, + "loss": 0.405, + "step": 692 + }, + { + "epoch": 0.1742957746478873, + "grad_norm": 0.5310260653495789, + "learning_rate": 5.8088851634534795e-06, + "loss": 0.4148, + "step": 693 + }, + { + "epoch": 0.17454728370221329, + "grad_norm": 0.4385521411895752, + "learning_rate": 5.817267393126572e-06, + "loss": 0.3932, + "step": 694 + }, + { + "epoch": 0.17479879275653923, + "grad_norm": 0.44616925716400146, + "learning_rate": 5.825649622799665e-06, + "loss": 0.4482, + "step": 695 + }, + { + "epoch": 0.1750503018108652, + "grad_norm": 0.5454944372177124, + "learning_rate": 5.834031852472758e-06, + "loss": 0.4579, + "step": 696 + }, + { + "epoch": 0.17530181086519114, + "grad_norm": 0.4732443690299988, + "learning_rate": 5.842414082145851e-06, + "loss": 0.4198, + "step": 697 + }, + { + "epoch": 0.17555331991951711, + "grad_norm": 0.5170097947120667, + "learning_rate": 5.850796311818945e-06, + "loss": 0.4551, + "step": 698 + }, + { + "epoch": 0.17580482897384306, + "grad_norm": 0.4894830584526062, + "learning_rate": 5.859178541492038e-06, + "loss": 0.411, + "step": 699 + }, + { + "epoch": 0.176056338028169, + "grad_norm": 0.47865331172943115, + "learning_rate": 5.86756077116513e-06, + "loss": 0.4162, + "step": 700 + }, + { + "epoch": 0.17630784708249497, + "grad_norm": 0.5109310150146484, + "learning_rate": 5.875943000838223e-06, + "loss": 0.4255, + "step": 701 + }, + { + "epoch": 0.17655935613682092, + "grad_norm": 0.4590068459510803, + "learning_rate": 5.8843252305113165e-06, + "loss": 0.4096, + "step": 702 + }, + { + "epoch": 0.1768108651911469, + "grad_norm": 0.5353825092315674, + "learning_rate": 5.892707460184409e-06, + "loss": 0.4477, + "step": 703 + }, + { + "epoch": 0.17706237424547283, + "grad_norm": 0.4406241178512573, + "learning_rate": 5.901089689857502e-06, + "loss": 0.4037, + "step": 704 + }, + { + "epoch": 0.1773138832997988, + "grad_norm": 0.4536992609500885, + "learning_rate": 5.909471919530596e-06, + "loss": 0.4397, + "step": 705 + }, + { + "epoch": 0.17756539235412475, + "grad_norm": 0.4446880519390106, + "learning_rate": 5.917854149203689e-06, + "loss": 0.4454, + "step": 706 + }, + { + "epoch": 0.17781690140845072, + "grad_norm": 0.4549560546875, + "learning_rate": 5.926236378876782e-06, + "loss": 0.4199, + "step": 707 + }, + { + "epoch": 0.17806841046277666, + "grad_norm": 0.4807390570640564, + "learning_rate": 5.934618608549875e-06, + "loss": 0.4336, + "step": 708 + }, + { + "epoch": 0.1783199195171026, + "grad_norm": 0.4760938882827759, + "learning_rate": 5.943000838222967e-06, + "loss": 0.4095, + "step": 709 + }, + { + "epoch": 0.17857142857142858, + "grad_norm": 0.4646598696708679, + "learning_rate": 5.95138306789606e-06, + "loss": 0.3987, + "step": 710 + }, + { + "epoch": 0.17882293762575452, + "grad_norm": 0.5018036961555481, + "learning_rate": 5.9597652975691535e-06, + "loss": 0.4004, + "step": 711 + }, + { + "epoch": 0.1790744466800805, + "grad_norm": 0.4650225341320038, + "learning_rate": 5.968147527242247e-06, + "loss": 0.4507, + "step": 712 + }, + { + "epoch": 0.17932595573440643, + "grad_norm": 0.44860950112342834, + "learning_rate": 5.9765297569153405e-06, + "loss": 0.44, + "step": 713 + }, + { + "epoch": 0.1795774647887324, + "grad_norm": 0.46424388885498047, + "learning_rate": 5.984911986588433e-06, + "loss": 0.4355, + "step": 714 + }, + { + "epoch": 0.17982897384305835, + "grad_norm": 0.49789562821388245, + "learning_rate": 5.993294216261526e-06, + "loss": 0.4256, + "step": 715 + }, + { + "epoch": 0.18008048289738432, + "grad_norm": 0.4016691744327545, + "learning_rate": 6.001676445934619e-06, + "loss": 0.4454, + "step": 716 + }, + { + "epoch": 0.18033199195171026, + "grad_norm": 0.47032442688941956, + "learning_rate": 6.010058675607712e-06, + "loss": 0.4023, + "step": 717 + }, + { + "epoch": 0.1805835010060362, + "grad_norm": 0.4652996063232422, + "learning_rate": 6.018440905280806e-06, + "loss": 0.4393, + "step": 718 + }, + { + "epoch": 0.18083501006036218, + "grad_norm": 0.4808005392551422, + "learning_rate": 6.026823134953899e-06, + "loss": 0.4102, + "step": 719 + }, + { + "epoch": 0.18108651911468812, + "grad_norm": 0.42760953307151794, + "learning_rate": 6.035205364626991e-06, + "loss": 0.4416, + "step": 720 + }, + { + "epoch": 0.1813380281690141, + "grad_norm": 0.4750773012638092, + "learning_rate": 6.043587594300084e-06, + "loss": 0.4321, + "step": 721 + }, + { + "epoch": 0.18158953722334004, + "grad_norm": 0.4393182098865509, + "learning_rate": 6.0519698239731775e-06, + "loss": 0.4264, + "step": 722 + }, + { + "epoch": 0.181841046277666, + "grad_norm": 0.47229018807411194, + "learning_rate": 6.06035205364627e-06, + "loss": 0.4255, + "step": 723 + }, + { + "epoch": 0.18209255533199195, + "grad_norm": 0.4402623772621155, + "learning_rate": 6.068734283319363e-06, + "loss": 0.4308, + "step": 724 + }, + { + "epoch": 0.1823440643863179, + "grad_norm": 0.4287785589694977, + "learning_rate": 6.077116512992457e-06, + "loss": 0.4413, + "step": 725 + }, + { + "epoch": 0.18259557344064387, + "grad_norm": 0.49795329570770264, + "learning_rate": 6.08549874266555e-06, + "loss": 0.4398, + "step": 726 + }, + { + "epoch": 0.1828470824949698, + "grad_norm": 0.4231499433517456, + "learning_rate": 6.093880972338643e-06, + "loss": 0.4207, + "step": 727 + }, + { + "epoch": 0.18309859154929578, + "grad_norm": 0.4155465066432953, + "learning_rate": 6.102263202011736e-06, + "loss": 0.4195, + "step": 728 + }, + { + "epoch": 0.18335010060362172, + "grad_norm": 0.4875541925430298, + "learning_rate": 6.110645431684828e-06, + "loss": 0.4675, + "step": 729 + }, + { + "epoch": 0.1836016096579477, + "grad_norm": 0.46131083369255066, + "learning_rate": 6.119027661357921e-06, + "loss": 0.4296, + "step": 730 + }, + { + "epoch": 0.18385311871227364, + "grad_norm": 0.46306151151657104, + "learning_rate": 6.1274098910310145e-06, + "loss": 0.4315, + "step": 731 + }, + { + "epoch": 0.1841046277665996, + "grad_norm": 0.4583171308040619, + "learning_rate": 6.135792120704108e-06, + "loss": 0.3973, + "step": 732 + }, + { + "epoch": 0.18435613682092555, + "grad_norm": 0.4396132528781891, + "learning_rate": 6.1441743503772014e-06, + "loss": 0.4166, + "step": 733 + }, + { + "epoch": 0.1846076458752515, + "grad_norm": 0.45620885491371155, + "learning_rate": 6.152556580050294e-06, + "loss": 0.4134, + "step": 734 + }, + { + "epoch": 0.18485915492957747, + "grad_norm": 0.48990169167518616, + "learning_rate": 6.160938809723387e-06, + "loss": 0.4615, + "step": 735 + }, + { + "epoch": 0.1851106639839034, + "grad_norm": 0.4548690617084503, + "learning_rate": 6.169321039396479e-06, + "loss": 0.4019, + "step": 736 + }, + { + "epoch": 0.18536217303822938, + "grad_norm": 0.4465884864330292, + "learning_rate": 6.177703269069573e-06, + "loss": 0.4115, + "step": 737 + }, + { + "epoch": 0.18561368209255533, + "grad_norm": 0.5192622542381287, + "learning_rate": 6.1860854987426654e-06, + "loss": 0.4071, + "step": 738 + }, + { + "epoch": 0.1858651911468813, + "grad_norm": 0.5081340074539185, + "learning_rate": 6.19446772841576e-06, + "loss": 0.4464, + "step": 739 + }, + { + "epoch": 0.18611670020120724, + "grad_norm": 0.6187810301780701, + "learning_rate": 6.202849958088852e-06, + "loss": 0.4386, + "step": 740 + }, + { + "epoch": 0.1863682092555332, + "grad_norm": 0.45214521884918213, + "learning_rate": 6.211232187761945e-06, + "loss": 0.4626, + "step": 741 + }, + { + "epoch": 0.18661971830985916, + "grad_norm": 0.5023926496505737, + "learning_rate": 6.2196144174350385e-06, + "loss": 0.4768, + "step": 742 + }, + { + "epoch": 0.1868712273641851, + "grad_norm": 0.49080634117126465, + "learning_rate": 6.227996647108131e-06, + "loss": 0.4381, + "step": 743 + }, + { + "epoch": 0.18712273641851107, + "grad_norm": 0.5521237850189209, + "learning_rate": 6.236378876781224e-06, + "loss": 0.4458, + "step": 744 + }, + { + "epoch": 0.18737424547283701, + "grad_norm": 0.46638408303260803, + "learning_rate": 6.244761106454318e-06, + "loss": 0.4433, + "step": 745 + }, + { + "epoch": 0.18762575452716299, + "grad_norm": 0.5203446745872498, + "learning_rate": 6.253143336127411e-06, + "loss": 0.4255, + "step": 746 + }, + { + "epoch": 0.18787726358148893, + "grad_norm": 0.5413115620613098, + "learning_rate": 6.261525565800503e-06, + "loss": 0.4302, + "step": 747 + }, + { + "epoch": 0.1881287726358149, + "grad_norm": 0.5347134470939636, + "learning_rate": 6.269907795473597e-06, + "loss": 0.4414, + "step": 748 + }, + { + "epoch": 0.18838028169014084, + "grad_norm": 0.43149664998054504, + "learning_rate": 6.278290025146689e-06, + "loss": 0.4167, + "step": 749 + }, + { + "epoch": 0.1886317907444668, + "grad_norm": 0.522585391998291, + "learning_rate": 6.286672254819782e-06, + "loss": 0.4172, + "step": 750 + }, + { + "epoch": 0.18888329979879276, + "grad_norm": 0.5557539463043213, + "learning_rate": 6.2950544844928755e-06, + "loss": 0.436, + "step": 751 + }, + { + "epoch": 0.1891348088531187, + "grad_norm": 0.47711122035980225, + "learning_rate": 6.303436714165969e-06, + "loss": 0.4602, + "step": 752 + }, + { + "epoch": 0.18938631790744467, + "grad_norm": 0.5225254893302917, + "learning_rate": 6.3118189438390624e-06, + "loss": 0.438, + "step": 753 + }, + { + "epoch": 0.18963782696177062, + "grad_norm": 0.4202134907245636, + "learning_rate": 6.320201173512155e-06, + "loss": 0.4453, + "step": 754 + }, + { + "epoch": 0.1898893360160966, + "grad_norm": 0.48253220319747925, + "learning_rate": 6.328583403185248e-06, + "loss": 0.4296, + "step": 755 + }, + { + "epoch": 0.19014084507042253, + "grad_norm": 0.525970995426178, + "learning_rate": 6.33696563285834e-06, + "loss": 0.4248, + "step": 756 + }, + { + "epoch": 0.1903923541247485, + "grad_norm": 0.49865517020225525, + "learning_rate": 6.345347862531434e-06, + "loss": 0.4083, + "step": 757 + }, + { + "epoch": 0.19064386317907445, + "grad_norm": 0.5443746447563171, + "learning_rate": 6.353730092204526e-06, + "loss": 0.4326, + "step": 758 + }, + { + "epoch": 0.1908953722334004, + "grad_norm": 0.5639643669128418, + "learning_rate": 6.362112321877621e-06, + "loss": 0.4212, + "step": 759 + }, + { + "epoch": 0.19114688128772636, + "grad_norm": 0.4444926381111145, + "learning_rate": 6.370494551550713e-06, + "loss": 0.407, + "step": 760 + }, + { + "epoch": 0.1913983903420523, + "grad_norm": 0.5601978898048401, + "learning_rate": 6.378876781223806e-06, + "loss": 0.4492, + "step": 761 + }, + { + "epoch": 0.19164989939637828, + "grad_norm": 0.5076690912246704, + "learning_rate": 6.387259010896899e-06, + "loss": 0.4241, + "step": 762 + }, + { + "epoch": 0.19190140845070422, + "grad_norm": 0.45043453574180603, + "learning_rate": 6.395641240569992e-06, + "loss": 0.4345, + "step": 763 + }, + { + "epoch": 0.1921529175050302, + "grad_norm": 0.413898229598999, + "learning_rate": 6.404023470243085e-06, + "loss": 0.4396, + "step": 764 + }, + { + "epoch": 0.19240442655935613, + "grad_norm": 0.43958157300949097, + "learning_rate": 6.412405699916177e-06, + "loss": 0.4294, + "step": 765 + }, + { + "epoch": 0.1926559356136821, + "grad_norm": 0.46111080050468445, + "learning_rate": 6.420787929589272e-06, + "loss": 0.4508, + "step": 766 + }, + { + "epoch": 0.19290744466800805, + "grad_norm": 0.4380728602409363, + "learning_rate": 6.429170159262364e-06, + "loss": 0.4308, + "step": 767 + }, + { + "epoch": 0.193158953722334, + "grad_norm": 0.47174325585365295, + "learning_rate": 6.437552388935458e-06, + "loss": 0.4047, + "step": 768 + }, + { + "epoch": 0.19341046277665996, + "grad_norm": 0.5069427490234375, + "learning_rate": 6.44593461860855e-06, + "loss": 0.4623, + "step": 769 + }, + { + "epoch": 0.1936619718309859, + "grad_norm": 0.47728949785232544, + "learning_rate": 6.454316848281643e-06, + "loss": 0.4226, + "step": 770 + }, + { + "epoch": 0.19391348088531188, + "grad_norm": 0.4369402229785919, + "learning_rate": 6.462699077954736e-06, + "loss": 0.4355, + "step": 771 + }, + { + "epoch": 0.19416498993963782, + "grad_norm": 0.49592912197113037, + "learning_rate": 6.47108130762783e-06, + "loss": 0.4504, + "step": 772 + }, + { + "epoch": 0.1944164989939638, + "grad_norm": 0.4935101270675659, + "learning_rate": 6.479463537300923e-06, + "loss": 0.4434, + "step": 773 + }, + { + "epoch": 0.19466800804828974, + "grad_norm": 0.4729015827178955, + "learning_rate": 6.487845766974016e-06, + "loss": 0.4066, + "step": 774 + }, + { + "epoch": 0.19491951710261568, + "grad_norm": 0.5029162168502808, + "learning_rate": 6.496227996647109e-06, + "loss": 0.4262, + "step": 775 + }, + { + "epoch": 0.19517102615694165, + "grad_norm": 0.6650319695472717, + "learning_rate": 6.504610226320201e-06, + "loss": 0.434, + "step": 776 + }, + { + "epoch": 0.1954225352112676, + "grad_norm": 0.464813768863678, + "learning_rate": 6.512992455993295e-06, + "loss": 0.4368, + "step": 777 + }, + { + "epoch": 0.19567404426559357, + "grad_norm": 0.5334668755531311, + "learning_rate": 6.521374685666387e-06, + "loss": 0.4279, + "step": 778 + }, + { + "epoch": 0.1959255533199195, + "grad_norm": 0.6014412641525269, + "learning_rate": 6.529756915339482e-06, + "loss": 0.4236, + "step": 779 + }, + { + "epoch": 0.19617706237424548, + "grad_norm": 0.48810648918151855, + "learning_rate": 6.538139145012574e-06, + "loss": 0.4528, + "step": 780 + }, + { + "epoch": 0.19642857142857142, + "grad_norm": 0.5004664063453674, + "learning_rate": 6.546521374685667e-06, + "loss": 0.4335, + "step": 781 + }, + { + "epoch": 0.1966800804828974, + "grad_norm": 0.5810479521751404, + "learning_rate": 6.55490360435876e-06, + "loss": 0.4348, + "step": 782 + }, + { + "epoch": 0.19693158953722334, + "grad_norm": 0.5041718482971191, + "learning_rate": 6.563285834031853e-06, + "loss": 0.4341, + "step": 783 + }, + { + "epoch": 0.19718309859154928, + "grad_norm": 0.498522013425827, + "learning_rate": 6.571668063704946e-06, + "loss": 0.4269, + "step": 784 + }, + { + "epoch": 0.19743460764587525, + "grad_norm": 0.45353996753692627, + "learning_rate": 6.580050293378038e-06, + "loss": 0.4199, + "step": 785 + }, + { + "epoch": 0.1976861167002012, + "grad_norm": 0.45865288376808167, + "learning_rate": 6.588432523051133e-06, + "loss": 0.4335, + "step": 786 + }, + { + "epoch": 0.19793762575452717, + "grad_norm": 0.4978802800178528, + "learning_rate": 6.596814752724225e-06, + "loss": 0.424, + "step": 787 + }, + { + "epoch": 0.1981891348088531, + "grad_norm": 0.45598986744880676, + "learning_rate": 6.605196982397319e-06, + "loss": 0.4375, + "step": 788 + }, + { + "epoch": 0.19844064386317908, + "grad_norm": 0.448933869600296, + "learning_rate": 6.613579212070411e-06, + "loss": 0.4242, + "step": 789 + }, + { + "epoch": 0.19869215291750503, + "grad_norm": 0.4256827235221863, + "learning_rate": 6.621961441743504e-06, + "loss": 0.4396, + "step": 790 + }, + { + "epoch": 0.198943661971831, + "grad_norm": 0.4087207317352295, + "learning_rate": 6.630343671416597e-06, + "loss": 0.3984, + "step": 791 + }, + { + "epoch": 0.19919517102615694, + "grad_norm": 0.5045828819274902, + "learning_rate": 6.638725901089691e-06, + "loss": 0.4333, + "step": 792 + }, + { + "epoch": 0.19944668008048289, + "grad_norm": 0.4137192368507385, + "learning_rate": 6.6471081307627836e-06, + "loss": 0.4048, + "step": 793 + }, + { + "epoch": 0.19969818913480886, + "grad_norm": 0.43266329169273376, + "learning_rate": 6.655490360435877e-06, + "loss": 0.4228, + "step": 794 + }, + { + "epoch": 0.1999496981891348, + "grad_norm": 0.43893131613731384, + "learning_rate": 6.66387259010897e-06, + "loss": 0.4284, + "step": 795 + }, + { + "epoch": 0.20020120724346077, + "grad_norm": 0.4277397096157074, + "learning_rate": 6.672254819782062e-06, + "loss": 0.4379, + "step": 796 + }, + { + "epoch": 0.20045271629778671, + "grad_norm": 0.4732038974761963, + "learning_rate": 6.680637049455155e-06, + "loss": 0.4393, + "step": 797 + }, + { + "epoch": 0.2007042253521127, + "grad_norm": 0.40471675992012024, + "learning_rate": 6.689019279128248e-06, + "loss": 0.4128, + "step": 798 + }, + { + "epoch": 0.20095573440643863, + "grad_norm": 0.47634950280189514, + "learning_rate": 6.697401508801342e-06, + "loss": 0.4163, + "step": 799 + }, + { + "epoch": 0.2012072434607646, + "grad_norm": 0.44929447770118713, + "learning_rate": 6.705783738474435e-06, + "loss": 0.4569, + "step": 800 + }, + { + "epoch": 0.20145875251509054, + "grad_norm": 0.40945762395858765, + "learning_rate": 6.714165968147528e-06, + "loss": 0.4021, + "step": 801 + }, + { + "epoch": 0.2017102615694165, + "grad_norm": 0.4612343907356262, + "learning_rate": 6.722548197820621e-06, + "loss": 0.4204, + "step": 802 + }, + { + "epoch": 0.20196177062374246, + "grad_norm": 0.40259966254234314, + "learning_rate": 6.730930427493714e-06, + "loss": 0.4317, + "step": 803 + }, + { + "epoch": 0.2022132796780684, + "grad_norm": 0.4034963548183441, + "learning_rate": 6.739312657166807e-06, + "loss": 0.4112, + "step": 804 + }, + { + "epoch": 0.20246478873239437, + "grad_norm": 0.4619353413581848, + "learning_rate": 6.747694886839899e-06, + "loss": 0.4341, + "step": 805 + }, + { + "epoch": 0.20271629778672032, + "grad_norm": 0.40814584493637085, + "learning_rate": 6.756077116512994e-06, + "loss": 0.4438, + "step": 806 + }, + { + "epoch": 0.2029678068410463, + "grad_norm": 0.47560909390449524, + "learning_rate": 6.764459346186086e-06, + "loss": 0.4254, + "step": 807 + }, + { + "epoch": 0.20321931589537223, + "grad_norm": 0.43871134519577026, + "learning_rate": 6.772841575859179e-06, + "loss": 0.4295, + "step": 808 + }, + { + "epoch": 0.20347082494969818, + "grad_norm": 0.49143895506858826, + "learning_rate": 6.781223805532272e-06, + "loss": 0.4294, + "step": 809 + }, + { + "epoch": 0.20372233400402415, + "grad_norm": 0.4383901059627533, + "learning_rate": 6.789606035205365e-06, + "loss": 0.4183, + "step": 810 + }, + { + "epoch": 0.2039738430583501, + "grad_norm": 0.4648225009441376, + "learning_rate": 6.797988264878458e-06, + "loss": 0.4105, + "step": 811 + }, + { + "epoch": 0.20422535211267606, + "grad_norm": 0.4313584566116333, + "learning_rate": 6.806370494551551e-06, + "loss": 0.4132, + "step": 812 + }, + { + "epoch": 0.204476861167002, + "grad_norm": 0.4867895543575287, + "learning_rate": 6.8147527242246446e-06, + "loss": 0.4601, + "step": 813 + }, + { + "epoch": 0.20472837022132798, + "grad_norm": 0.4108380675315857, + "learning_rate": 6.823134953897738e-06, + "loss": 0.4437, + "step": 814 + }, + { + "epoch": 0.20497987927565392, + "grad_norm": 0.46014007925987244, + "learning_rate": 6.831517183570831e-06, + "loss": 0.4335, + "step": 815 + }, + { + "epoch": 0.2052313883299799, + "grad_norm": 0.42462071776390076, + "learning_rate": 6.839899413243923e-06, + "loss": 0.4316, + "step": 816 + }, + { + "epoch": 0.20548289738430583, + "grad_norm": 0.4779759645462036, + "learning_rate": 6.848281642917016e-06, + "loss": 0.3952, + "step": 817 + }, + { + "epoch": 0.20573440643863178, + "grad_norm": 0.4665524363517761, + "learning_rate": 6.856663872590109e-06, + "loss": 0.4114, + "step": 818 + }, + { + "epoch": 0.20598591549295775, + "grad_norm": 0.5013046264648438, + "learning_rate": 6.865046102263203e-06, + "loss": 0.4086, + "step": 819 + }, + { + "epoch": 0.2062374245472837, + "grad_norm": 0.4999219477176666, + "learning_rate": 6.873428331936296e-06, + "loss": 0.433, + "step": 820 + }, + { + "epoch": 0.20648893360160966, + "grad_norm": 0.46619290113449097, + "learning_rate": 6.881810561609389e-06, + "loss": 0.4102, + "step": 821 + }, + { + "epoch": 0.2067404426559356, + "grad_norm": 0.4653915464878082, + "learning_rate": 6.890192791282482e-06, + "loss": 0.42, + "step": 822 + }, + { + "epoch": 0.20699195171026158, + "grad_norm": 0.4532758295536041, + "learning_rate": 6.898575020955574e-06, + "loss": 0.4216, + "step": 823 + }, + { + "epoch": 0.20724346076458752, + "grad_norm": 0.45343294739723206, + "learning_rate": 6.906957250628668e-06, + "loss": 0.4259, + "step": 824 + }, + { + "epoch": 0.2074949698189135, + "grad_norm": 0.4827394485473633, + "learning_rate": 6.91533948030176e-06, + "loss": 0.4252, + "step": 825 + }, + { + "epoch": 0.20774647887323944, + "grad_norm": 0.48318737745285034, + "learning_rate": 6.923721709974855e-06, + "loss": 0.4241, + "step": 826 + }, + { + "epoch": 0.20799798792756538, + "grad_norm": 0.4679957330226898, + "learning_rate": 6.932103939647947e-06, + "loss": 0.4154, + "step": 827 + }, + { + "epoch": 0.20824949698189135, + "grad_norm": 0.43381795287132263, + "learning_rate": 6.94048616932104e-06, + "loss": 0.4172, + "step": 828 + }, + { + "epoch": 0.2085010060362173, + "grad_norm": 0.45988166332244873, + "learning_rate": 6.948868398994133e-06, + "loss": 0.4147, + "step": 829 + }, + { + "epoch": 0.20875251509054327, + "grad_norm": 0.400395005941391, + "learning_rate": 6.957250628667226e-06, + "loss": 0.4308, + "step": 830 + }, + { + "epoch": 0.2090040241448692, + "grad_norm": 0.4453587532043457, + "learning_rate": 6.965632858340319e-06, + "loss": 0.4018, + "step": 831 + }, + { + "epoch": 0.20925553319919518, + "grad_norm": 0.4165091812610626, + "learning_rate": 6.974015088013411e-06, + "loss": 0.4022, + "step": 832 + }, + { + "epoch": 0.20950704225352113, + "grad_norm": 0.49082446098327637, + "learning_rate": 6.9823973176865055e-06, + "loss": 0.4378, + "step": 833 + }, + { + "epoch": 0.20975855130784707, + "grad_norm": 0.4576517641544342, + "learning_rate": 6.990779547359598e-06, + "loss": 0.4107, + "step": 834 + }, + { + "epoch": 0.21001006036217304, + "grad_norm": 0.43755435943603516, + "learning_rate": 6.999161777032692e-06, + "loss": 0.4226, + "step": 835 + }, + { + "epoch": 0.21026156941649898, + "grad_norm": 0.45865172147750854, + "learning_rate": 7.007544006705784e-06, + "loss": 0.4089, + "step": 836 + }, + { + "epoch": 0.21051307847082495, + "grad_norm": 0.5721135139465332, + "learning_rate": 7.015926236378877e-06, + "loss": 0.4173, + "step": 837 + }, + { + "epoch": 0.2107645875251509, + "grad_norm": 0.43587735295295715, + "learning_rate": 7.02430846605197e-06, + "loss": 0.4121, + "step": 838 + }, + { + "epoch": 0.21101609657947687, + "grad_norm": 0.5028086304664612, + "learning_rate": 7.032690695725063e-06, + "loss": 0.4339, + "step": 839 + }, + { + "epoch": 0.2112676056338028, + "grad_norm": 0.5480897426605225, + "learning_rate": 7.041072925398157e-06, + "loss": 0.422, + "step": 840 + }, + { + "epoch": 0.21151911468812878, + "grad_norm": 0.4621444642543793, + "learning_rate": 7.04945515507125e-06, + "loss": 0.396, + "step": 841 + }, + { + "epoch": 0.21177062374245473, + "grad_norm": 0.4964749813079834, + "learning_rate": 7.0578373847443426e-06, + "loss": 0.4597, + "step": 842 + }, + { + "epoch": 0.21202213279678067, + "grad_norm": 0.48463791608810425, + "learning_rate": 7.066219614417435e-06, + "loss": 0.421, + "step": 843 + }, + { + "epoch": 0.21227364185110664, + "grad_norm": 0.4578920900821686, + "learning_rate": 7.074601844090529e-06, + "loss": 0.4552, + "step": 844 + }, + { + "epoch": 0.21252515090543259, + "grad_norm": 0.5604377388954163, + "learning_rate": 7.082984073763621e-06, + "loss": 0.4275, + "step": 845 + }, + { + "epoch": 0.21277665995975856, + "grad_norm": 0.424990177154541, + "learning_rate": 7.091366303436716e-06, + "loss": 0.4237, + "step": 846 + }, + { + "epoch": 0.2130281690140845, + "grad_norm": 0.5051470398902893, + "learning_rate": 7.099748533109808e-06, + "loss": 0.4197, + "step": 847 + }, + { + "epoch": 0.21327967806841047, + "grad_norm": 0.5595173239707947, + "learning_rate": 7.108130762782901e-06, + "loss": 0.4045, + "step": 848 + }, + { + "epoch": 0.21353118712273642, + "grad_norm": 0.43428587913513184, + "learning_rate": 7.116512992455994e-06, + "loss": 0.416, + "step": 849 + }, + { + "epoch": 0.2137826961770624, + "grad_norm": 0.5479362607002258, + "learning_rate": 7.124895222129087e-06, + "loss": 0.4212, + "step": 850 + }, + { + "epoch": 0.21403420523138833, + "grad_norm": 0.4589390754699707, + "learning_rate": 7.13327745180218e-06, + "loss": 0.4336, + "step": 851 + }, + { + "epoch": 0.21428571428571427, + "grad_norm": 0.4126336872577667, + "learning_rate": 7.141659681475272e-06, + "loss": 0.4356, + "step": 852 + }, + { + "epoch": 0.21453722334004025, + "grad_norm": 0.5102934241294861, + "learning_rate": 7.1500419111483665e-06, + "loss": 0.4416, + "step": 853 + }, + { + "epoch": 0.2147887323943662, + "grad_norm": 0.4380166232585907, + "learning_rate": 7.158424140821459e-06, + "loss": 0.4087, + "step": 854 + }, + { + "epoch": 0.21504024144869216, + "grad_norm": 0.4341869652271271, + "learning_rate": 7.166806370494553e-06, + "loss": 0.4436, + "step": 855 + }, + { + "epoch": 0.2152917505030181, + "grad_norm": 0.42801982164382935, + "learning_rate": 7.175188600167645e-06, + "loss": 0.4048, + "step": 856 + }, + { + "epoch": 0.21554325955734407, + "grad_norm": 0.46920889616012573, + "learning_rate": 7.183570829840738e-06, + "loss": 0.4304, + "step": 857 + }, + { + "epoch": 0.21579476861167002, + "grad_norm": 0.4114355444908142, + "learning_rate": 7.1919530595138305e-06, + "loss": 0.4258, + "step": 858 + }, + { + "epoch": 0.216046277665996, + "grad_norm": 0.47151094675064087, + "learning_rate": 7.200335289186924e-06, + "loss": 0.3901, + "step": 859 + }, + { + "epoch": 0.21629778672032193, + "grad_norm": 0.4899015724658966, + "learning_rate": 7.2087175188600175e-06, + "loss": 0.4144, + "step": 860 + }, + { + "epoch": 0.21654929577464788, + "grad_norm": 0.4297754764556885, + "learning_rate": 7.217099748533111e-06, + "loss": 0.4041, + "step": 861 + }, + { + "epoch": 0.21680080482897385, + "grad_norm": 0.5144082307815552, + "learning_rate": 7.2254819782062036e-06, + "loss": 0.4448, + "step": 862 + }, + { + "epoch": 0.2170523138832998, + "grad_norm": 0.4766099750995636, + "learning_rate": 7.233864207879296e-06, + "loss": 0.3988, + "step": 863 + }, + { + "epoch": 0.21730382293762576, + "grad_norm": 0.46279922127723694, + "learning_rate": 7.24224643755239e-06, + "loss": 0.3987, + "step": 864 + }, + { + "epoch": 0.2175553319919517, + "grad_norm": 0.5055639743804932, + "learning_rate": 7.250628667225482e-06, + "loss": 0.4175, + "step": 865 + }, + { + "epoch": 0.21780684104627768, + "grad_norm": 0.5239527821540833, + "learning_rate": 7.259010896898575e-06, + "loss": 0.4355, + "step": 866 + }, + { + "epoch": 0.21805835010060362, + "grad_norm": 0.4184260964393616, + "learning_rate": 7.267393126571669e-06, + "loss": 0.4454, + "step": 867 + }, + { + "epoch": 0.21830985915492956, + "grad_norm": 0.5631728768348694, + "learning_rate": 7.275775356244762e-06, + "loss": 0.4466, + "step": 868 + }, + { + "epoch": 0.21856136820925554, + "grad_norm": 0.4764256775379181, + "learning_rate": 7.2841575859178545e-06, + "loss": 0.4278, + "step": 869 + }, + { + "epoch": 0.21881287726358148, + "grad_norm": 0.4316968619823456, + "learning_rate": 7.292539815590948e-06, + "loss": 0.4642, + "step": 870 + }, + { + "epoch": 0.21906438631790745, + "grad_norm": 0.4983559846878052, + "learning_rate": 7.300922045264041e-06, + "loss": 0.4296, + "step": 871 + }, + { + "epoch": 0.2193158953722334, + "grad_norm": 0.49155017733573914, + "learning_rate": 7.309304274937133e-06, + "loss": 0.452, + "step": 872 + }, + { + "epoch": 0.21956740442655936, + "grad_norm": 0.5096904635429382, + "learning_rate": 7.3176865046102275e-06, + "loss": 0.4112, + "step": 873 + }, + { + "epoch": 0.2198189134808853, + "grad_norm": 0.41947141289711, + "learning_rate": 7.32606873428332e-06, + "loss": 0.426, + "step": 874 + }, + { + "epoch": 0.22007042253521128, + "grad_norm": 0.49421125650405884, + "learning_rate": 7.334450963956414e-06, + "loss": 0.4014, + "step": 875 + }, + { + "epoch": 0.22032193158953722, + "grad_norm": 0.532922089099884, + "learning_rate": 7.342833193629506e-06, + "loss": 0.4217, + "step": 876 + }, + { + "epoch": 0.22057344064386317, + "grad_norm": 0.4467262923717499, + "learning_rate": 7.351215423302599e-06, + "loss": 0.4348, + "step": 877 + }, + { + "epoch": 0.22082494969818914, + "grad_norm": 0.4734607934951782, + "learning_rate": 7.3595976529756915e-06, + "loss": 0.432, + "step": 878 + }, + { + "epoch": 0.22107645875251508, + "grad_norm": 0.4447082579135895, + "learning_rate": 7.367979882648785e-06, + "loss": 0.4272, + "step": 879 + }, + { + "epoch": 0.22132796780684105, + "grad_norm": 0.4721175730228424, + "learning_rate": 7.3763621123218785e-06, + "loss": 0.4091, + "step": 880 + }, + { + "epoch": 0.221579476861167, + "grad_norm": 0.45364779233932495, + "learning_rate": 7.384744341994972e-06, + "loss": 0.422, + "step": 881 + }, + { + "epoch": 0.22183098591549297, + "grad_norm": 0.5278758406639099, + "learning_rate": 7.3931265716680646e-06, + "loss": 0.4253, + "step": 882 + }, + { + "epoch": 0.2220824949698189, + "grad_norm": 0.4425363540649414, + "learning_rate": 7.401508801341157e-06, + "loss": 0.4227, + "step": 883 + }, + { + "epoch": 0.22233400402414488, + "grad_norm": 0.4397623836994171, + "learning_rate": 7.40989103101425e-06, + "loss": 0.4274, + "step": 884 + }, + { + "epoch": 0.22258551307847083, + "grad_norm": 0.5451933741569519, + "learning_rate": 7.418273260687343e-06, + "loss": 0.4258, + "step": 885 + }, + { + "epoch": 0.22283702213279677, + "grad_norm": 0.5051050186157227, + "learning_rate": 7.426655490360436e-06, + "loss": 0.4037, + "step": 886 + }, + { + "epoch": 0.22308853118712274, + "grad_norm": 0.4557664096355438, + "learning_rate": 7.43503772003353e-06, + "loss": 0.4222, + "step": 887 + }, + { + "epoch": 0.22334004024144868, + "grad_norm": 0.5401166677474976, + "learning_rate": 7.443419949706623e-06, + "loss": 0.4399, + "step": 888 + }, + { + "epoch": 0.22359154929577466, + "grad_norm": 0.48601412773132324, + "learning_rate": 7.4518021793797155e-06, + "loss": 0.4292, + "step": 889 + }, + { + "epoch": 0.2238430583501006, + "grad_norm": 0.5143569707870483, + "learning_rate": 7.460184409052809e-06, + "loss": 0.4331, + "step": 890 + }, + { + "epoch": 0.22409456740442657, + "grad_norm": 0.4917342960834503, + "learning_rate": 7.4685666387259016e-06, + "loss": 0.4079, + "step": 891 + }, + { + "epoch": 0.2243460764587525, + "grad_norm": 0.5398588180541992, + "learning_rate": 7.476948868398994e-06, + "loss": 0.4244, + "step": 892 + }, + { + "epoch": 0.22459758551307846, + "grad_norm": 0.4740265905857086, + "learning_rate": 7.485331098072087e-06, + "loss": 0.4131, + "step": 893 + }, + { + "epoch": 0.22484909456740443, + "grad_norm": 0.5333375334739685, + "learning_rate": 7.493713327745181e-06, + "loss": 0.3979, + "step": 894 + }, + { + "epoch": 0.22510060362173037, + "grad_norm": 0.47554007172584534, + "learning_rate": 7.502095557418274e-06, + "loss": 0.4184, + "step": 895 + }, + { + "epoch": 0.22535211267605634, + "grad_norm": 0.5044271945953369, + "learning_rate": 7.510477787091367e-06, + "loss": 0.4374, + "step": 896 + }, + { + "epoch": 0.2256036217303823, + "grad_norm": 0.4042213261127472, + "learning_rate": 7.51886001676446e-06, + "loss": 0.4121, + "step": 897 + }, + { + "epoch": 0.22585513078470826, + "grad_norm": 0.47659948468208313, + "learning_rate": 7.5272422464375525e-06, + "loss": 0.411, + "step": 898 + }, + { + "epoch": 0.2261066398390342, + "grad_norm": 0.48039042949676514, + "learning_rate": 7.535624476110646e-06, + "loss": 0.4294, + "step": 899 + }, + { + "epoch": 0.22635814889336017, + "grad_norm": 0.468188613653183, + "learning_rate": 7.5440067057837394e-06, + "loss": 0.4076, + "step": 900 + }, + { + "epoch": 0.22660965794768612, + "grad_norm": 0.47702232003211975, + "learning_rate": 7.552388935456833e-06, + "loss": 0.423, + "step": 901 + }, + { + "epoch": 0.22686116700201206, + "grad_norm": 0.4733107089996338, + "learning_rate": 7.5607711651299255e-06, + "loss": 0.4356, + "step": 902 + }, + { + "epoch": 0.22711267605633803, + "grad_norm": 0.4408296048641205, + "learning_rate": 7.569153394803018e-06, + "loss": 0.4132, + "step": 903 + }, + { + "epoch": 0.22736418511066397, + "grad_norm": 0.4489647150039673, + "learning_rate": 7.577535624476111e-06, + "loss": 0.4259, + "step": 904 + }, + { + "epoch": 0.22761569416498995, + "grad_norm": 0.42070379853248596, + "learning_rate": 7.585917854149204e-06, + "loss": 0.4326, + "step": 905 + }, + { + "epoch": 0.2278672032193159, + "grad_norm": 0.43029969930648804, + "learning_rate": 7.594300083822297e-06, + "loss": 0.4053, + "step": 906 + }, + { + "epoch": 0.22811871227364186, + "grad_norm": 0.38450947403907776, + "learning_rate": 7.602682313495391e-06, + "loss": 0.4256, + "step": 907 + }, + { + "epoch": 0.2283702213279678, + "grad_norm": 0.45817065238952637, + "learning_rate": 7.611064543168484e-06, + "loss": 0.4427, + "step": 908 + }, + { + "epoch": 0.22862173038229378, + "grad_norm": 0.44824421405792236, + "learning_rate": 7.6194467728415765e-06, + "loss": 0.4002, + "step": 909 + }, + { + "epoch": 0.22887323943661972, + "grad_norm": 0.4120684862136841, + "learning_rate": 7.627829002514669e-06, + "loss": 0.4054, + "step": 910 + }, + { + "epoch": 0.22912474849094566, + "grad_norm": 0.4775068759918213, + "learning_rate": 7.636211232187762e-06, + "loss": 0.4149, + "step": 911 + }, + { + "epoch": 0.22937625754527163, + "grad_norm": 0.5041594505310059, + "learning_rate": 7.644593461860856e-06, + "loss": 0.4529, + "step": 912 + }, + { + "epoch": 0.22962776659959758, + "grad_norm": 0.4362272620201111, + "learning_rate": 7.652975691533949e-06, + "loss": 0.4144, + "step": 913 + }, + { + "epoch": 0.22987927565392355, + "grad_norm": 0.4660468101501465, + "learning_rate": 7.661357921207043e-06, + "loss": 0.4065, + "step": 914 + }, + { + "epoch": 0.2301307847082495, + "grad_norm": 0.4543038606643677, + "learning_rate": 7.669740150880136e-06, + "loss": 0.4356, + "step": 915 + }, + { + "epoch": 0.23038229376257546, + "grad_norm": 0.4837052524089813, + "learning_rate": 7.678122380553228e-06, + "loss": 0.4341, + "step": 916 + }, + { + "epoch": 0.2306338028169014, + "grad_norm": 0.47712114453315735, + "learning_rate": 7.686504610226321e-06, + "loss": 0.4127, + "step": 917 + }, + { + "epoch": 0.23088531187122738, + "grad_norm": 0.44308313727378845, + "learning_rate": 7.694886839899413e-06, + "loss": 0.4173, + "step": 918 + }, + { + "epoch": 0.23113682092555332, + "grad_norm": 0.4565160274505615, + "learning_rate": 7.703269069572506e-06, + "loss": 0.4009, + "step": 919 + }, + { + "epoch": 0.23138832997987926, + "grad_norm": 0.4511704742908478, + "learning_rate": 7.711651299245599e-06, + "loss": 0.4337, + "step": 920 + }, + { + "epoch": 0.23163983903420524, + "grad_norm": 0.46274664998054504, + "learning_rate": 7.720033528918693e-06, + "loss": 0.4184, + "step": 921 + }, + { + "epoch": 0.23189134808853118, + "grad_norm": 0.4714046120643616, + "learning_rate": 7.728415758591786e-06, + "loss": 0.4313, + "step": 922 + }, + { + "epoch": 0.23214285714285715, + "grad_norm": 0.46814224123954773, + "learning_rate": 7.73679798826488e-06, + "loss": 0.4318, + "step": 923 + }, + { + "epoch": 0.2323943661971831, + "grad_norm": 0.47005271911621094, + "learning_rate": 7.745180217937973e-06, + "loss": 0.4237, + "step": 924 + }, + { + "epoch": 0.23264587525150907, + "grad_norm": 0.48916304111480713, + "learning_rate": 7.753562447611065e-06, + "loss": 0.3893, + "step": 925 + }, + { + "epoch": 0.232897384305835, + "grad_norm": 0.4641698896884918, + "learning_rate": 7.761944677284158e-06, + "loss": 0.4322, + "step": 926 + }, + { + "epoch": 0.23314889336016095, + "grad_norm": 0.44672325253486633, + "learning_rate": 7.770326906957252e-06, + "loss": 0.4581, + "step": 927 + }, + { + "epoch": 0.23340040241448692, + "grad_norm": 0.43729367852211, + "learning_rate": 7.778709136630345e-06, + "loss": 0.4139, + "step": 928 + }, + { + "epoch": 0.23365191146881287, + "grad_norm": 0.45290911197662354, + "learning_rate": 7.787091366303437e-06, + "loss": 0.418, + "step": 929 + }, + { + "epoch": 0.23390342052313884, + "grad_norm": 0.42791497707366943, + "learning_rate": 7.79547359597653e-06, + "loss": 0.4188, + "step": 930 + }, + { + "epoch": 0.23415492957746478, + "grad_norm": 0.5161230564117432, + "learning_rate": 7.803855825649623e-06, + "loss": 0.4236, + "step": 931 + }, + { + "epoch": 0.23440643863179075, + "grad_norm": 0.4312325119972229, + "learning_rate": 7.812238055322715e-06, + "loss": 0.4313, + "step": 932 + }, + { + "epoch": 0.2346579476861167, + "grad_norm": 0.47641661763191223, + "learning_rate": 7.82062028499581e-06, + "loss": 0.4152, + "step": 933 + }, + { + "epoch": 0.23490945674044267, + "grad_norm": 0.4813452363014221, + "learning_rate": 7.829002514668902e-06, + "loss": 0.4111, + "step": 934 + }, + { + "epoch": 0.2351609657947686, + "grad_norm": 0.49906906485557556, + "learning_rate": 7.837384744341997e-06, + "loss": 0.3846, + "step": 935 + }, + { + "epoch": 0.23541247484909456, + "grad_norm": 0.48748451471328735, + "learning_rate": 7.84576697401509e-06, + "loss": 0.4329, + "step": 936 + }, + { + "epoch": 0.23566398390342053, + "grad_norm": 0.5034842491149902, + "learning_rate": 7.854149203688182e-06, + "loss": 0.4054, + "step": 937 + }, + { + "epoch": 0.23591549295774647, + "grad_norm": 0.45687928795814514, + "learning_rate": 7.862531433361274e-06, + "loss": 0.4466, + "step": 938 + }, + { + "epoch": 0.23616700201207244, + "grad_norm": 0.4433618485927582, + "learning_rate": 7.870913663034367e-06, + "loss": 0.3867, + "step": 939 + }, + { + "epoch": 0.23641851106639838, + "grad_norm": 0.4428368806838989, + "learning_rate": 7.87929589270746e-06, + "loss": 0.3938, + "step": 940 + }, + { + "epoch": 0.23667002012072436, + "grad_norm": 0.49069783091545105, + "learning_rate": 7.887678122380554e-06, + "loss": 0.3964, + "step": 941 + }, + { + "epoch": 0.2369215291750503, + "grad_norm": 0.45439690351486206, + "learning_rate": 7.896060352053647e-06, + "loss": 0.4115, + "step": 942 + }, + { + "epoch": 0.23717303822937627, + "grad_norm": 0.4886208772659302, + "learning_rate": 7.90444258172674e-06, + "loss": 0.4415, + "step": 943 + }, + { + "epoch": 0.23742454728370221, + "grad_norm": 0.47720471024513245, + "learning_rate": 7.912824811399834e-06, + "loss": 0.3896, + "step": 944 + }, + { + "epoch": 0.23767605633802816, + "grad_norm": 0.47643229365348816, + "learning_rate": 7.921207041072926e-06, + "loss": 0.4226, + "step": 945 + }, + { + "epoch": 0.23792756539235413, + "grad_norm": 0.4700299799442291, + "learning_rate": 7.929589270746019e-06, + "loss": 0.4165, + "step": 946 + }, + { + "epoch": 0.23817907444668007, + "grad_norm": 0.41711124777793884, + "learning_rate": 7.937971500419113e-06, + "loss": 0.4345, + "step": 947 + }, + { + "epoch": 0.23843058350100604, + "grad_norm": 0.45945024490356445, + "learning_rate": 7.946353730092206e-06, + "loss": 0.4379, + "step": 948 + }, + { + "epoch": 0.238682092555332, + "grad_norm": 0.4340158998966217, + "learning_rate": 7.954735959765298e-06, + "loss": 0.391, + "step": 949 + }, + { + "epoch": 0.23893360160965796, + "grad_norm": 0.4647141396999359, + "learning_rate": 7.963118189438391e-06, + "loss": 0.4351, + "step": 950 + }, + { + "epoch": 0.2391851106639839, + "grad_norm": 0.43337324261665344, + "learning_rate": 7.971500419111484e-06, + "loss": 0.4201, + "step": 951 + }, + { + "epoch": 0.23943661971830985, + "grad_norm": 0.45250189304351807, + "learning_rate": 7.979882648784576e-06, + "loss": 0.4328, + "step": 952 + }, + { + "epoch": 0.23968812877263582, + "grad_norm": 0.47799962759017944, + "learning_rate": 7.98826487845767e-06, + "loss": 0.4089, + "step": 953 + }, + { + "epoch": 0.23993963782696176, + "grad_norm": 0.4971179962158203, + "learning_rate": 7.996647108130763e-06, + "loss": 0.4312, + "step": 954 + }, + { + "epoch": 0.24019114688128773, + "grad_norm": 0.4583699703216553, + "learning_rate": 8.005029337803858e-06, + "loss": 0.4179, + "step": 955 + }, + { + "epoch": 0.24044265593561368, + "grad_norm": 0.3827430009841919, + "learning_rate": 8.01341156747695e-06, + "loss": 0.4026, + "step": 956 + }, + { + "epoch": 0.24069416498993965, + "grad_norm": 0.4374185800552368, + "learning_rate": 8.021793797150043e-06, + "loss": 0.4196, + "step": 957 + }, + { + "epoch": 0.2409456740442656, + "grad_norm": 0.4754588305950165, + "learning_rate": 8.030176026823135e-06, + "loss": 0.4214, + "step": 958 + }, + { + "epoch": 0.24119718309859156, + "grad_norm": 0.44250696897506714, + "learning_rate": 8.038558256496228e-06, + "loss": 0.3991, + "step": 959 + }, + { + "epoch": 0.2414486921529175, + "grad_norm": 0.5119554400444031, + "learning_rate": 8.04694048616932e-06, + "loss": 0.428, + "step": 960 + }, + { + "epoch": 0.24170020120724345, + "grad_norm": 0.5002796649932861, + "learning_rate": 8.055322715842415e-06, + "loss": 0.4342, + "step": 961 + }, + { + "epoch": 0.24195171026156942, + "grad_norm": 0.470553457736969, + "learning_rate": 8.063704945515508e-06, + "loss": 0.3983, + "step": 962 + }, + { + "epoch": 0.24220321931589536, + "grad_norm": 0.4792780876159668, + "learning_rate": 8.0720871751886e-06, + "loss": 0.4156, + "step": 963 + }, + { + "epoch": 0.24245472837022133, + "grad_norm": 0.44463014602661133, + "learning_rate": 8.080469404861695e-06, + "loss": 0.4143, + "step": 964 + }, + { + "epoch": 0.24270623742454728, + "grad_norm": 0.4440299868583679, + "learning_rate": 8.088851634534787e-06, + "loss": 0.3994, + "step": 965 + }, + { + "epoch": 0.24295774647887325, + "grad_norm": 0.45312440395355225, + "learning_rate": 8.09723386420788e-06, + "loss": 0.4236, + "step": 966 + }, + { + "epoch": 0.2432092555331992, + "grad_norm": 0.4270833134651184, + "learning_rate": 8.105616093880972e-06, + "loss": 0.4166, + "step": 967 + }, + { + "epoch": 0.24346076458752516, + "grad_norm": 0.46889373660087585, + "learning_rate": 8.113998323554067e-06, + "loss": 0.4467, + "step": 968 + }, + { + "epoch": 0.2437122736418511, + "grad_norm": 0.43110471963882446, + "learning_rate": 8.12238055322716e-06, + "loss": 0.4207, + "step": 969 + }, + { + "epoch": 0.24396378269617705, + "grad_norm": 0.500971257686615, + "learning_rate": 8.130762782900252e-06, + "loss": 0.4096, + "step": 970 + }, + { + "epoch": 0.24421529175050302, + "grad_norm": 0.43988287448883057, + "learning_rate": 8.139145012573345e-06, + "loss": 0.4236, + "step": 971 + }, + { + "epoch": 0.24446680080482897, + "grad_norm": 0.43207594752311707, + "learning_rate": 8.147527242246437e-06, + "loss": 0.4139, + "step": 972 + }, + { + "epoch": 0.24471830985915494, + "grad_norm": 0.6875976920127869, + "learning_rate": 8.155909471919532e-06, + "loss": 0.4108, + "step": 973 + }, + { + "epoch": 0.24496981891348088, + "grad_norm": 0.4743395447731018, + "learning_rate": 8.164291701592624e-06, + "loss": 0.423, + "step": 974 + }, + { + "epoch": 0.24522132796780685, + "grad_norm": 0.45225653052330017, + "learning_rate": 8.172673931265719e-06, + "loss": 0.4037, + "step": 975 + }, + { + "epoch": 0.2454728370221328, + "grad_norm": 0.45756688714027405, + "learning_rate": 8.181056160938811e-06, + "loss": 0.4419, + "step": 976 + }, + { + "epoch": 0.24572434607645877, + "grad_norm": 0.48021066188812256, + "learning_rate": 8.189438390611904e-06, + "loss": 0.4449, + "step": 977 + }, + { + "epoch": 0.2459758551307847, + "grad_norm": 0.44351431727409363, + "learning_rate": 8.197820620284996e-06, + "loss": 0.4226, + "step": 978 + }, + { + "epoch": 0.24622736418511065, + "grad_norm": 0.4529029428958893, + "learning_rate": 8.206202849958089e-06, + "loss": 0.4384, + "step": 979 + }, + { + "epoch": 0.24647887323943662, + "grad_norm": 0.5040020942687988, + "learning_rate": 8.214585079631182e-06, + "loss": 0.4464, + "step": 980 + }, + { + "epoch": 0.24673038229376257, + "grad_norm": 0.45902392268180847, + "learning_rate": 8.222967309304276e-06, + "loss": 0.3818, + "step": 981 + }, + { + "epoch": 0.24698189134808854, + "grad_norm": 0.4509660601615906, + "learning_rate": 8.231349538977369e-06, + "loss": 0.3941, + "step": 982 + }, + { + "epoch": 0.24723340040241448, + "grad_norm": 0.38173332810401917, + "learning_rate": 8.239731768650461e-06, + "loss": 0.4172, + "step": 983 + }, + { + "epoch": 0.24748490945674045, + "grad_norm": 0.45830610394477844, + "learning_rate": 8.248113998323556e-06, + "loss": 0.407, + "step": 984 + }, + { + "epoch": 0.2477364185110664, + "grad_norm": 0.49054500460624695, + "learning_rate": 8.256496227996648e-06, + "loss": 0.4212, + "step": 985 + }, + { + "epoch": 0.24798792756539234, + "grad_norm": 0.4405127763748169, + "learning_rate": 8.264878457669741e-06, + "loss": 0.4381, + "step": 986 + }, + { + "epoch": 0.2482394366197183, + "grad_norm": 0.4861133098602295, + "learning_rate": 8.273260687342833e-06, + "loss": 0.4282, + "step": 987 + }, + { + "epoch": 0.24849094567404426, + "grad_norm": 0.39729857444763184, + "learning_rate": 8.281642917015928e-06, + "loss": 0.4062, + "step": 988 + }, + { + "epoch": 0.24874245472837023, + "grad_norm": 0.48022037744522095, + "learning_rate": 8.29002514668902e-06, + "loss": 0.425, + "step": 989 + }, + { + "epoch": 0.24899396378269617, + "grad_norm": 0.46410584449768066, + "learning_rate": 8.298407376362113e-06, + "loss": 0.3949, + "step": 990 + }, + { + "epoch": 0.24924547283702214, + "grad_norm": 0.48868152499198914, + "learning_rate": 8.306789606035206e-06, + "loss": 0.4166, + "step": 991 + }, + { + "epoch": 0.24949698189134809, + "grad_norm": 0.46738943457603455, + "learning_rate": 8.315171835708298e-06, + "loss": 0.4221, + "step": 992 + }, + { + "epoch": 0.24974849094567406, + "grad_norm": 0.42502716183662415, + "learning_rate": 8.323554065381391e-06, + "loss": 0.4006, + "step": 993 + }, + { + "epoch": 0.25, + "grad_norm": 0.41025519371032715, + "learning_rate": 8.331936295054485e-06, + "loss": 0.4027, + "step": 994 + }, + { + "epoch": 0.25025150905432597, + "grad_norm": 0.46180248260498047, + "learning_rate": 8.340318524727578e-06, + "loss": 0.4375, + "step": 995 + }, + { + "epoch": 0.2505030181086519, + "grad_norm": 0.4456992447376251, + "learning_rate": 8.348700754400672e-06, + "loss": 0.4348, + "step": 996 + }, + { + "epoch": 0.25075452716297786, + "grad_norm": 0.4408442974090576, + "learning_rate": 8.357082984073765e-06, + "loss": 0.4062, + "step": 997 + }, + { + "epoch": 0.25100603621730383, + "grad_norm": 0.49061262607574463, + "learning_rate": 8.365465213746857e-06, + "loss": 0.4509, + "step": 998 + }, + { + "epoch": 0.2512575452716298, + "grad_norm": 0.5399837493896484, + "learning_rate": 8.37384744341995e-06, + "loss": 0.4313, + "step": 999 + }, + { + "epoch": 0.2515090543259557, + "grad_norm": 0.5015750527381897, + "learning_rate": 8.382229673093043e-06, + "loss": 0.4199, + "step": 1000 + }, + { + "epoch": 0.2517605633802817, + "grad_norm": 0.550068199634552, + "learning_rate": 8.390611902766137e-06, + "loss": 0.4691, + "step": 1001 + }, + { + "epoch": 0.25201207243460766, + "grad_norm": 0.4467526376247406, + "learning_rate": 8.39899413243923e-06, + "loss": 0.4382, + "step": 1002 + }, + { + "epoch": 0.2522635814889336, + "grad_norm": 0.5115616321563721, + "learning_rate": 8.407376362112322e-06, + "loss": 0.4377, + "step": 1003 + }, + { + "epoch": 0.25251509054325955, + "grad_norm": 0.4908027946949005, + "learning_rate": 8.415758591785415e-06, + "loss": 0.4417, + "step": 1004 + }, + { + "epoch": 0.2527665995975855, + "grad_norm": 0.5211309790611267, + "learning_rate": 8.42414082145851e-06, + "loss": 0.4143, + "step": 1005 + }, + { + "epoch": 0.2530181086519115, + "grad_norm": 0.5352211594581604, + "learning_rate": 8.432523051131602e-06, + "loss": 0.3923, + "step": 1006 + }, + { + "epoch": 0.2532696177062374, + "grad_norm": 0.511669397354126, + "learning_rate": 8.440905280804694e-06, + "loss": 0.4113, + "step": 1007 + }, + { + "epoch": 0.2535211267605634, + "grad_norm": 0.5156270861625671, + "learning_rate": 8.449287510477789e-06, + "loss": 0.4233, + "step": 1008 + }, + { + "epoch": 0.25377263581488935, + "grad_norm": 0.49817922711372375, + "learning_rate": 8.457669740150881e-06, + "loss": 0.3877, + "step": 1009 + }, + { + "epoch": 0.2540241448692153, + "grad_norm": 0.4637084901332855, + "learning_rate": 8.466051969823974e-06, + "loss": 0.4413, + "step": 1010 + }, + { + "epoch": 0.25427565392354123, + "grad_norm": 0.5281439423561096, + "learning_rate": 8.474434199497067e-06, + "loss": 0.4063, + "step": 1011 + }, + { + "epoch": 0.2545271629778672, + "grad_norm": 0.45103931427001953, + "learning_rate": 8.48281642917016e-06, + "loss": 0.4049, + "step": 1012 + }, + { + "epoch": 0.2547786720321932, + "grad_norm": 0.47929713129997253, + "learning_rate": 8.491198658843252e-06, + "loss": 0.413, + "step": 1013 + }, + { + "epoch": 0.2550301810865191, + "grad_norm": 0.4654606580734253, + "learning_rate": 8.499580888516346e-06, + "loss": 0.4246, + "step": 1014 + }, + { + "epoch": 0.25528169014084506, + "grad_norm": 0.4492524266242981, + "learning_rate": 8.507963118189439e-06, + "loss": 0.4328, + "step": 1015 + }, + { + "epoch": 0.25553319919517103, + "grad_norm": 0.5223946571350098, + "learning_rate": 8.516345347862533e-06, + "loss": 0.3964, + "step": 1016 + }, + { + "epoch": 0.255784708249497, + "grad_norm": 0.5574830770492554, + "learning_rate": 8.524727577535626e-06, + "loss": 0.4041, + "step": 1017 + }, + { + "epoch": 0.2560362173038229, + "grad_norm": 0.5438414216041565, + "learning_rate": 8.533109807208718e-06, + "loss": 0.422, + "step": 1018 + }, + { + "epoch": 0.2562877263581489, + "grad_norm": 0.49950259923934937, + "learning_rate": 8.541492036881811e-06, + "loss": 0.4054, + "step": 1019 + }, + { + "epoch": 0.25653923541247486, + "grad_norm": 0.5470491051673889, + "learning_rate": 8.549874266554904e-06, + "loss": 0.4237, + "step": 1020 + }, + { + "epoch": 0.2567907444668008, + "grad_norm": 0.4732471704483032, + "learning_rate": 8.558256496227996e-06, + "loss": 0.4189, + "step": 1021 + }, + { + "epoch": 0.25704225352112675, + "grad_norm": 0.4920424818992615, + "learning_rate": 8.56663872590109e-06, + "loss": 0.4443, + "step": 1022 + }, + { + "epoch": 0.2572937625754527, + "grad_norm": 0.4606372117996216, + "learning_rate": 8.575020955574183e-06, + "loss": 0.4187, + "step": 1023 + }, + { + "epoch": 0.2575452716297787, + "grad_norm": 0.5176842212677002, + "learning_rate": 8.583403185247276e-06, + "loss": 0.4096, + "step": 1024 + }, + { + "epoch": 0.2577967806841046, + "grad_norm": 0.4598933458328247, + "learning_rate": 8.59178541492037e-06, + "loss": 0.4443, + "step": 1025 + }, + { + "epoch": 0.2580482897384306, + "grad_norm": 0.4918026030063629, + "learning_rate": 8.600167644593463e-06, + "loss": 0.4208, + "step": 1026 + }, + { + "epoch": 0.25829979879275655, + "grad_norm": 0.46681898832321167, + "learning_rate": 8.608549874266555e-06, + "loss": 0.4168, + "step": 1027 + }, + { + "epoch": 0.25855130784708247, + "grad_norm": 0.4654182493686676, + "learning_rate": 8.61693210393965e-06, + "loss": 0.4186, + "step": 1028 + }, + { + "epoch": 0.25880281690140844, + "grad_norm": 0.5116965770721436, + "learning_rate": 8.625314333612742e-06, + "loss": 0.43, + "step": 1029 + }, + { + "epoch": 0.2590543259557344, + "grad_norm": 0.4793979823589325, + "learning_rate": 8.633696563285835e-06, + "loss": 0.4371, + "step": 1030 + }, + { + "epoch": 0.2593058350100604, + "grad_norm": 0.49715420603752136, + "learning_rate": 8.642078792958928e-06, + "loss": 0.4197, + "step": 1031 + }, + { + "epoch": 0.2595573440643863, + "grad_norm": 0.43482688069343567, + "learning_rate": 8.65046102263202e-06, + "loss": 0.4, + "step": 1032 + }, + { + "epoch": 0.25980885311871227, + "grad_norm": 0.5626974701881409, + "learning_rate": 8.658843252305113e-06, + "loss": 0.4498, + "step": 1033 + }, + { + "epoch": 0.26006036217303824, + "grad_norm": 0.4670769274234772, + "learning_rate": 8.667225481978207e-06, + "loss": 0.4038, + "step": 1034 + }, + { + "epoch": 0.2603118712273642, + "grad_norm": 0.4788265824317932, + "learning_rate": 8.6756077116513e-06, + "loss": 0.4186, + "step": 1035 + }, + { + "epoch": 0.2605633802816901, + "grad_norm": 0.5014175772666931, + "learning_rate": 8.683989941324394e-06, + "loss": 0.4175, + "step": 1036 + }, + { + "epoch": 0.2608148893360161, + "grad_norm": 0.4646727740764618, + "learning_rate": 8.692372170997487e-06, + "loss": 0.4149, + "step": 1037 + }, + { + "epoch": 0.26106639839034207, + "grad_norm": 0.4624409079551697, + "learning_rate": 8.70075440067058e-06, + "loss": 0.376, + "step": 1038 + }, + { + "epoch": 0.261317907444668, + "grad_norm": 0.4742669463157654, + "learning_rate": 8.709136630343672e-06, + "loss": 0.4228, + "step": 1039 + }, + { + "epoch": 0.26156941649899396, + "grad_norm": 0.551210880279541, + "learning_rate": 8.717518860016765e-06, + "loss": 0.4292, + "step": 1040 + }, + { + "epoch": 0.2618209255533199, + "grad_norm": 0.48132550716400146, + "learning_rate": 8.725901089689857e-06, + "loss": 0.4313, + "step": 1041 + }, + { + "epoch": 0.2620724346076459, + "grad_norm": 0.5049678683280945, + "learning_rate": 8.734283319362952e-06, + "loss": 0.4193, + "step": 1042 + }, + { + "epoch": 0.2623239436619718, + "grad_norm": 0.5476779341697693, + "learning_rate": 8.742665549036044e-06, + "loss": 0.4106, + "step": 1043 + }, + { + "epoch": 0.2625754527162978, + "grad_norm": 0.4799102246761322, + "learning_rate": 8.751047778709137e-06, + "loss": 0.4198, + "step": 1044 + }, + { + "epoch": 0.26282696177062376, + "grad_norm": 0.46002301573753357, + "learning_rate": 8.75943000838223e-06, + "loss": 0.4007, + "step": 1045 + }, + { + "epoch": 0.2630784708249497, + "grad_norm": 0.5139147043228149, + "learning_rate": 8.767812238055324e-06, + "loss": 0.399, + "step": 1046 + }, + { + "epoch": 0.26332997987927564, + "grad_norm": 0.4995320439338684, + "learning_rate": 8.776194467728416e-06, + "loss": 0.4379, + "step": 1047 + }, + { + "epoch": 0.2635814889336016, + "grad_norm": 0.4521893560886383, + "learning_rate": 8.784576697401509e-06, + "loss": 0.4338, + "step": 1048 + }, + { + "epoch": 0.2638329979879276, + "grad_norm": 0.4362622797489166, + "learning_rate": 8.792958927074603e-06, + "loss": 0.4217, + "step": 1049 + }, + { + "epoch": 0.2640845070422535, + "grad_norm": 0.4953138828277588, + "learning_rate": 8.801341156747696e-06, + "loss": 0.4167, + "step": 1050 + }, + { + "epoch": 0.2643360160965795, + "grad_norm": 0.4723548889160156, + "learning_rate": 8.809723386420789e-06, + "loss": 0.4537, + "step": 1051 + }, + { + "epoch": 0.26458752515090544, + "grad_norm": 0.5056288242340088, + "learning_rate": 8.818105616093881e-06, + "loss": 0.4412, + "step": 1052 + }, + { + "epoch": 0.26483903420523136, + "grad_norm": 0.5119395852088928, + "learning_rate": 8.826487845766974e-06, + "loss": 0.4184, + "step": 1053 + }, + { + "epoch": 0.26509054325955733, + "grad_norm": 0.45162543654441833, + "learning_rate": 8.834870075440067e-06, + "loss": 0.4135, + "step": 1054 + }, + { + "epoch": 0.2653420523138833, + "grad_norm": 0.5046412348747253, + "learning_rate": 8.84325230511316e-06, + "loss": 0.3995, + "step": 1055 + }, + { + "epoch": 0.2655935613682093, + "grad_norm": 0.42225155234336853, + "learning_rate": 8.851634534786253e-06, + "loss": 0.4316, + "step": 1056 + }, + { + "epoch": 0.2658450704225352, + "grad_norm": 0.5072396397590637, + "learning_rate": 8.860016764459348e-06, + "loss": 0.4268, + "step": 1057 + }, + { + "epoch": 0.26609657947686116, + "grad_norm": 0.4895479083061218, + "learning_rate": 8.86839899413244e-06, + "loss": 0.4227, + "step": 1058 + }, + { + "epoch": 0.26634808853118713, + "grad_norm": 0.42159146070480347, + "learning_rate": 8.876781223805533e-06, + "loss": 0.4053, + "step": 1059 + }, + { + "epoch": 0.2665995975855131, + "grad_norm": 0.4871593713760376, + "learning_rate": 8.885163453478626e-06, + "loss": 0.4027, + "step": 1060 + }, + { + "epoch": 0.266851106639839, + "grad_norm": 0.4512350559234619, + "learning_rate": 8.893545683151718e-06, + "loss": 0.4113, + "step": 1061 + }, + { + "epoch": 0.267102615694165, + "grad_norm": 0.4652332365512848, + "learning_rate": 8.901927912824813e-06, + "loss": 0.4308, + "step": 1062 + }, + { + "epoch": 0.26735412474849096, + "grad_norm": 0.4900280833244324, + "learning_rate": 8.910310142497905e-06, + "loss": 0.399, + "step": 1063 + }, + { + "epoch": 0.2676056338028169, + "grad_norm": 0.42869365215301514, + "learning_rate": 8.918692372170998e-06, + "loss": 0.4127, + "step": 1064 + }, + { + "epoch": 0.26785714285714285, + "grad_norm": 0.4814034700393677, + "learning_rate": 8.92707460184409e-06, + "loss": 0.4181, + "step": 1065 + }, + { + "epoch": 0.2681086519114688, + "grad_norm": 0.47565895318984985, + "learning_rate": 8.935456831517185e-06, + "loss": 0.4279, + "step": 1066 + }, + { + "epoch": 0.2683601609657948, + "grad_norm": 0.4539470374584198, + "learning_rate": 8.943839061190277e-06, + "loss": 0.4044, + "step": 1067 + }, + { + "epoch": 0.2686116700201207, + "grad_norm": 0.44740232825279236, + "learning_rate": 8.95222129086337e-06, + "loss": 0.4206, + "step": 1068 + }, + { + "epoch": 0.2688631790744467, + "grad_norm": 0.46885591745376587, + "learning_rate": 8.960603520536464e-06, + "loss": 0.3829, + "step": 1069 + }, + { + "epoch": 0.26911468812877265, + "grad_norm": 0.5142282247543335, + "learning_rate": 8.968985750209557e-06, + "loss": 0.4, + "step": 1070 + }, + { + "epoch": 0.26936619718309857, + "grad_norm": 0.406148225069046, + "learning_rate": 8.97736797988265e-06, + "loss": 0.3931, + "step": 1071 + }, + { + "epoch": 0.26961770623742454, + "grad_norm": 0.449266254901886, + "learning_rate": 8.985750209555742e-06, + "loss": 0.3975, + "step": 1072 + }, + { + "epoch": 0.2698692152917505, + "grad_norm": 0.530730664730072, + "learning_rate": 8.994132439228835e-06, + "loss": 0.4193, + "step": 1073 + }, + { + "epoch": 0.2701207243460765, + "grad_norm": 0.495766282081604, + "learning_rate": 9.002514668901928e-06, + "loss": 0.4453, + "step": 1074 + }, + { + "epoch": 0.2703722334004024, + "grad_norm": 0.43788525462150574, + "learning_rate": 9.010896898575022e-06, + "loss": 0.4432, + "step": 1075 + }, + { + "epoch": 0.27062374245472837, + "grad_norm": 0.5969604253768921, + "learning_rate": 9.019279128248114e-06, + "loss": 0.4155, + "step": 1076 + }, + { + "epoch": 0.27087525150905434, + "grad_norm": 0.5051907896995544, + "learning_rate": 9.027661357921209e-06, + "loss": 0.4363, + "step": 1077 + }, + { + "epoch": 0.2711267605633803, + "grad_norm": 0.48711901903152466, + "learning_rate": 9.036043587594301e-06, + "loss": 0.4054, + "step": 1078 + }, + { + "epoch": 0.2713782696177062, + "grad_norm": 0.745858907699585, + "learning_rate": 9.044425817267394e-06, + "loss": 0.436, + "step": 1079 + }, + { + "epoch": 0.2716297786720322, + "grad_norm": 0.558805525302887, + "learning_rate": 9.052808046940487e-06, + "loss": 0.3979, + "step": 1080 + }, + { + "epoch": 0.27188128772635817, + "grad_norm": 0.5273854732513428, + "learning_rate": 9.06119027661358e-06, + "loss": 0.4198, + "step": 1081 + }, + { + "epoch": 0.2721327967806841, + "grad_norm": 0.5657176375389099, + "learning_rate": 9.069572506286674e-06, + "loss": 0.4124, + "step": 1082 + }, + { + "epoch": 0.27238430583501005, + "grad_norm": 0.46423089504241943, + "learning_rate": 9.077954735959766e-06, + "loss": 0.426, + "step": 1083 + }, + { + "epoch": 0.272635814889336, + "grad_norm": 0.5339906811714172, + "learning_rate": 9.086336965632859e-06, + "loss": 0.4062, + "step": 1084 + }, + { + "epoch": 0.272887323943662, + "grad_norm": 0.46117010712623596, + "learning_rate": 9.094719195305951e-06, + "loss": 0.4059, + "step": 1085 + }, + { + "epoch": 0.2731388329979879, + "grad_norm": 0.4512513279914856, + "learning_rate": 9.103101424979046e-06, + "loss": 0.4119, + "step": 1086 + }, + { + "epoch": 0.2733903420523139, + "grad_norm": 0.5174707770347595, + "learning_rate": 9.111483654652138e-06, + "loss": 0.436, + "step": 1087 + }, + { + "epoch": 0.27364185110663986, + "grad_norm": 0.4645453989505768, + "learning_rate": 9.119865884325231e-06, + "loss": 0.4332, + "step": 1088 + }, + { + "epoch": 0.27389336016096577, + "grad_norm": 0.5541778206825256, + "learning_rate": 9.128248113998325e-06, + "loss": 0.4114, + "step": 1089 + }, + { + "epoch": 0.27414486921529174, + "grad_norm": 0.47235891222953796, + "learning_rate": 9.136630343671418e-06, + "loss": 0.4452, + "step": 1090 + }, + { + "epoch": 0.2743963782696177, + "grad_norm": 0.4705214500427246, + "learning_rate": 9.14501257334451e-06, + "loss": 0.4028, + "step": 1091 + }, + { + "epoch": 0.2746478873239437, + "grad_norm": 0.4457109272480011, + "learning_rate": 9.153394803017603e-06, + "loss": 0.3873, + "step": 1092 + }, + { + "epoch": 0.2748993963782696, + "grad_norm": 0.4580100178718567, + "learning_rate": 9.161777032690696e-06, + "loss": 0.4188, + "step": 1093 + }, + { + "epoch": 0.27515090543259557, + "grad_norm": 0.50628662109375, + "learning_rate": 9.170159262363788e-06, + "loss": 0.4683, + "step": 1094 + }, + { + "epoch": 0.27540241448692154, + "grad_norm": 0.5623934864997864, + "learning_rate": 9.178541492036883e-06, + "loss": 0.4115, + "step": 1095 + }, + { + "epoch": 0.27565392354124746, + "grad_norm": 0.45169952511787415, + "learning_rate": 9.186923721709975e-06, + "loss": 0.4382, + "step": 1096 + }, + { + "epoch": 0.27590543259557343, + "grad_norm": 0.4219382107257843, + "learning_rate": 9.19530595138307e-06, + "loss": 0.3918, + "step": 1097 + }, + { + "epoch": 0.2761569416498994, + "grad_norm": 0.47983744740486145, + "learning_rate": 9.203688181056162e-06, + "loss": 0.4257, + "step": 1098 + }, + { + "epoch": 0.2764084507042254, + "grad_norm": 0.462584525346756, + "learning_rate": 9.212070410729255e-06, + "loss": 0.4289, + "step": 1099 + }, + { + "epoch": 0.2766599597585513, + "grad_norm": 0.4568127393722534, + "learning_rate": 9.220452640402348e-06, + "loss": 0.4181, + "step": 1100 + }, + { + "epoch": 0.27691146881287726, + "grad_norm": 0.42234259843826294, + "learning_rate": 9.22883487007544e-06, + "loss": 0.4076, + "step": 1101 + }, + { + "epoch": 0.27716297786720323, + "grad_norm": 0.5175244212150574, + "learning_rate": 9.237217099748533e-06, + "loss": 0.4207, + "step": 1102 + }, + { + "epoch": 0.2774144869215292, + "grad_norm": 0.4390292763710022, + "learning_rate": 9.245599329421627e-06, + "loss": 0.4413, + "step": 1103 + }, + { + "epoch": 0.2776659959758551, + "grad_norm": 0.5344337821006775, + "learning_rate": 9.25398155909472e-06, + "loss": 0.4002, + "step": 1104 + }, + { + "epoch": 0.2779175050301811, + "grad_norm": 0.5546042919158936, + "learning_rate": 9.262363788767812e-06, + "loss": 0.3978, + "step": 1105 + }, + { + "epoch": 0.27816901408450706, + "grad_norm": 0.45198217034339905, + "learning_rate": 9.270746018440905e-06, + "loss": 0.4228, + "step": 1106 + }, + { + "epoch": 0.278420523138833, + "grad_norm": 0.5244150757789612, + "learning_rate": 9.279128248114e-06, + "loss": 0.4018, + "step": 1107 + }, + { + "epoch": 0.27867203219315895, + "grad_norm": 0.44966578483581543, + "learning_rate": 9.287510477787092e-06, + "loss": 0.388, + "step": 1108 + }, + { + "epoch": 0.2789235412474849, + "grad_norm": 0.5236296653747559, + "learning_rate": 9.295892707460186e-06, + "loss": 0.4182, + "step": 1109 + }, + { + "epoch": 0.2791750503018109, + "grad_norm": 0.46590518951416016, + "learning_rate": 9.304274937133279e-06, + "loss": 0.4121, + "step": 1110 + }, + { + "epoch": 0.2794265593561368, + "grad_norm": 0.4703758955001831, + "learning_rate": 9.312657166806372e-06, + "loss": 0.4208, + "step": 1111 + }, + { + "epoch": 0.2796780684104628, + "grad_norm": 0.431341290473938, + "learning_rate": 9.321039396479464e-06, + "loss": 0.4101, + "step": 1112 + }, + { + "epoch": 0.27992957746478875, + "grad_norm": 0.5059375762939453, + "learning_rate": 9.329421626152557e-06, + "loss": 0.4478, + "step": 1113 + }, + { + "epoch": 0.28018108651911466, + "grad_norm": 0.4744166433811188, + "learning_rate": 9.33780385582565e-06, + "loss": 0.4076, + "step": 1114 + }, + { + "epoch": 0.28043259557344064, + "grad_norm": 0.4455539584159851, + "learning_rate": 9.346186085498742e-06, + "loss": 0.4235, + "step": 1115 + }, + { + "epoch": 0.2806841046277666, + "grad_norm": 0.5029807090759277, + "learning_rate": 9.354568315171836e-06, + "loss": 0.4215, + "step": 1116 + }, + { + "epoch": 0.2809356136820926, + "grad_norm": 0.4714094400405884, + "learning_rate": 9.362950544844929e-06, + "loss": 0.3958, + "step": 1117 + }, + { + "epoch": 0.2811871227364185, + "grad_norm": 0.4298758804798126, + "learning_rate": 9.371332774518023e-06, + "loss": 0.4283, + "step": 1118 + }, + { + "epoch": 0.28143863179074446, + "grad_norm": 0.4799964427947998, + "learning_rate": 9.379715004191116e-06, + "loss": 0.4207, + "step": 1119 + }, + { + "epoch": 0.28169014084507044, + "grad_norm": 0.5128260850906372, + "learning_rate": 9.388097233864209e-06, + "loss": 0.4336, + "step": 1120 + }, + { + "epoch": 0.28194164989939635, + "grad_norm": 0.44608020782470703, + "learning_rate": 9.396479463537301e-06, + "loss": 0.4154, + "step": 1121 + }, + { + "epoch": 0.2821931589537223, + "grad_norm": 0.4927178621292114, + "learning_rate": 9.404861693210394e-06, + "loss": 0.4369, + "step": 1122 + }, + { + "epoch": 0.2824446680080483, + "grad_norm": 0.42505180835723877, + "learning_rate": 9.413243922883488e-06, + "loss": 0.4089, + "step": 1123 + }, + { + "epoch": 0.28269617706237427, + "grad_norm": 0.4519820213317871, + "learning_rate": 9.42162615255658e-06, + "loss": 0.4242, + "step": 1124 + }, + { + "epoch": 0.2829476861167002, + "grad_norm": 0.4712681770324707, + "learning_rate": 9.430008382229673e-06, + "loss": 0.4362, + "step": 1125 + }, + { + "epoch": 0.28319919517102615, + "grad_norm": 0.45173585414886475, + "learning_rate": 9.438390611902766e-06, + "loss": 0.4332, + "step": 1126 + }, + { + "epoch": 0.2834507042253521, + "grad_norm": 0.4184447228908539, + "learning_rate": 9.44677284157586e-06, + "loss": 0.4312, + "step": 1127 + }, + { + "epoch": 0.2837022132796781, + "grad_norm": 0.4533083736896515, + "learning_rate": 9.455155071248953e-06, + "loss": 0.4127, + "step": 1128 + }, + { + "epoch": 0.283953722334004, + "grad_norm": 0.4317009449005127, + "learning_rate": 9.463537300922047e-06, + "loss": 0.4158, + "step": 1129 + }, + { + "epoch": 0.28420523138833, + "grad_norm": 0.4305717945098877, + "learning_rate": 9.47191953059514e-06, + "loss": 0.4266, + "step": 1130 + }, + { + "epoch": 0.28445674044265595, + "grad_norm": 0.4272315204143524, + "learning_rate": 9.480301760268233e-06, + "loss": 0.4434, + "step": 1131 + }, + { + "epoch": 0.28470824949698187, + "grad_norm": 0.4827929735183716, + "learning_rate": 9.488683989941325e-06, + "loss": 0.4404, + "step": 1132 + }, + { + "epoch": 0.28495975855130784, + "grad_norm": 0.4054642617702484, + "learning_rate": 9.497066219614418e-06, + "loss": 0.41, + "step": 1133 + }, + { + "epoch": 0.2852112676056338, + "grad_norm": 0.4518846869468689, + "learning_rate": 9.50544844928751e-06, + "loss": 0.4361, + "step": 1134 + }, + { + "epoch": 0.2854627766599598, + "grad_norm": 0.4325503706932068, + "learning_rate": 9.513830678960603e-06, + "loss": 0.399, + "step": 1135 + }, + { + "epoch": 0.2857142857142857, + "grad_norm": 0.4354625642299652, + "learning_rate": 9.522212908633697e-06, + "loss": 0.4186, + "step": 1136 + }, + { + "epoch": 0.28596579476861167, + "grad_norm": 0.43451640009880066, + "learning_rate": 9.53059513830679e-06, + "loss": 0.4315, + "step": 1137 + }, + { + "epoch": 0.28621730382293764, + "grad_norm": 0.4460737109184265, + "learning_rate": 9.538977367979884e-06, + "loss": 0.3852, + "step": 1138 + }, + { + "epoch": 0.28646881287726356, + "grad_norm": 0.44246113300323486, + "learning_rate": 9.547359597652977e-06, + "loss": 0.4176, + "step": 1139 + }, + { + "epoch": 0.28672032193158953, + "grad_norm": 0.4565284550189972, + "learning_rate": 9.55574182732607e-06, + "loss": 0.4202, + "step": 1140 + }, + { + "epoch": 0.2869718309859155, + "grad_norm": 0.46833330392837524, + "learning_rate": 9.564124056999162e-06, + "loss": 0.4173, + "step": 1141 + }, + { + "epoch": 0.28722334004024147, + "grad_norm": 0.5017228722572327, + "learning_rate": 9.572506286672255e-06, + "loss": 0.3843, + "step": 1142 + }, + { + "epoch": 0.2874748490945674, + "grad_norm": 0.5037913918495178, + "learning_rate": 9.58088851634535e-06, + "loss": 0.4185, + "step": 1143 + }, + { + "epoch": 0.28772635814889336, + "grad_norm": 0.5422639846801758, + "learning_rate": 9.589270746018442e-06, + "loss": 0.4084, + "step": 1144 + }, + { + "epoch": 0.28797786720321933, + "grad_norm": 0.4788282513618469, + "learning_rate": 9.597652975691534e-06, + "loss": 0.3975, + "step": 1145 + }, + { + "epoch": 0.28822937625754524, + "grad_norm": 0.5363534092903137, + "learning_rate": 9.606035205364627e-06, + "loss": 0.4217, + "step": 1146 + }, + { + "epoch": 0.2884808853118712, + "grad_norm": 0.44976770877838135, + "learning_rate": 9.614417435037721e-06, + "loss": 0.3822, + "step": 1147 + }, + { + "epoch": 0.2887323943661972, + "grad_norm": 0.49459314346313477, + "learning_rate": 9.622799664710814e-06, + "loss": 0.4358, + "step": 1148 + }, + { + "epoch": 0.28898390342052316, + "grad_norm": 0.45928260684013367, + "learning_rate": 9.631181894383907e-06, + "loss": 0.4268, + "step": 1149 + }, + { + "epoch": 0.2892354124748491, + "grad_norm": 0.43904656171798706, + "learning_rate": 9.639564124057001e-06, + "loss": 0.4202, + "step": 1150 + }, + { + "epoch": 0.28948692152917505, + "grad_norm": 0.45102381706237793, + "learning_rate": 9.647946353730094e-06, + "loss": 0.4087, + "step": 1151 + }, + { + "epoch": 0.289738430583501, + "grad_norm": 0.4579281210899353, + "learning_rate": 9.656328583403186e-06, + "loss": 0.4153, + "step": 1152 + }, + { + "epoch": 0.289989939637827, + "grad_norm": 0.4361988604068756, + "learning_rate": 9.664710813076279e-06, + "loss": 0.4393, + "step": 1153 + }, + { + "epoch": 0.2902414486921529, + "grad_norm": 0.43042805790901184, + "learning_rate": 9.673093042749371e-06, + "loss": 0.4031, + "step": 1154 + }, + { + "epoch": 0.2904929577464789, + "grad_norm": 0.5227105021476746, + "learning_rate": 9.681475272422464e-06, + "loss": 0.433, + "step": 1155 + }, + { + "epoch": 0.29074446680080485, + "grad_norm": 0.48663586378097534, + "learning_rate": 9.689857502095558e-06, + "loss": 0.3878, + "step": 1156 + }, + { + "epoch": 0.29099597585513076, + "grad_norm": 0.5375685691833496, + "learning_rate": 9.698239731768651e-06, + "loss": 0.4254, + "step": 1157 + }, + { + "epoch": 0.29124748490945673, + "grad_norm": 0.5156546235084534, + "learning_rate": 9.706621961441745e-06, + "loss": 0.414, + "step": 1158 + }, + { + "epoch": 0.2914989939637827, + "grad_norm": 0.4819703698158264, + "learning_rate": 9.715004191114838e-06, + "loss": 0.4194, + "step": 1159 + }, + { + "epoch": 0.2917505030181087, + "grad_norm": 0.4525262117385864, + "learning_rate": 9.72338642078793e-06, + "loss": 0.421, + "step": 1160 + }, + { + "epoch": 0.2920020120724346, + "grad_norm": 0.5090615153312683, + "learning_rate": 9.731768650461023e-06, + "loss": 0.4238, + "step": 1161 + }, + { + "epoch": 0.29225352112676056, + "grad_norm": 0.42741072177886963, + "learning_rate": 9.740150880134116e-06, + "loss": 0.3865, + "step": 1162 + }, + { + "epoch": 0.29250503018108653, + "grad_norm": 0.5380824208259583, + "learning_rate": 9.74853310980721e-06, + "loss": 0.4206, + "step": 1163 + }, + { + "epoch": 0.29275653923541245, + "grad_norm": 0.44021573662757874, + "learning_rate": 9.756915339480303e-06, + "loss": 0.4082, + "step": 1164 + }, + { + "epoch": 0.2930080482897384, + "grad_norm": 0.4981013834476471, + "learning_rate": 9.765297569153395e-06, + "loss": 0.4137, + "step": 1165 + }, + { + "epoch": 0.2932595573440644, + "grad_norm": 0.5061078071594238, + "learning_rate": 9.773679798826488e-06, + "loss": 0.404, + "step": 1166 + }, + { + "epoch": 0.29351106639839036, + "grad_norm": 0.4524628818035126, + "learning_rate": 9.78206202849958e-06, + "loss": 0.4101, + "step": 1167 + }, + { + "epoch": 0.2937625754527163, + "grad_norm": 0.5405965447425842, + "learning_rate": 9.790444258172675e-06, + "loss": 0.4407, + "step": 1168 + }, + { + "epoch": 0.29401408450704225, + "grad_norm": 0.43946367502212524, + "learning_rate": 9.798826487845768e-06, + "loss": 0.4234, + "step": 1169 + }, + { + "epoch": 0.2942655935613682, + "grad_norm": 0.52604079246521, + "learning_rate": 9.807208717518862e-06, + "loss": 0.4164, + "step": 1170 + }, + { + "epoch": 0.29451710261569414, + "grad_norm": 0.46760255098342896, + "learning_rate": 9.815590947191955e-06, + "loss": 0.4186, + "step": 1171 + }, + { + "epoch": 0.2947686116700201, + "grad_norm": 0.5770349502563477, + "learning_rate": 9.823973176865047e-06, + "loss": 0.3987, + "step": 1172 + }, + { + "epoch": 0.2950201207243461, + "grad_norm": 0.5591742992401123, + "learning_rate": 9.83235540653814e-06, + "loss": 0.4338, + "step": 1173 + }, + { + "epoch": 0.29527162977867205, + "grad_norm": 0.5872971415519714, + "learning_rate": 9.840737636211232e-06, + "loss": 0.4077, + "step": 1174 + }, + { + "epoch": 0.29552313883299797, + "grad_norm": 0.5050575733184814, + "learning_rate": 9.849119865884325e-06, + "loss": 0.4163, + "step": 1175 + }, + { + "epoch": 0.29577464788732394, + "grad_norm": 0.5279958844184875, + "learning_rate": 9.857502095557418e-06, + "loss": 0.4493, + "step": 1176 + }, + { + "epoch": 0.2960261569416499, + "grad_norm": 0.4451628625392914, + "learning_rate": 9.865884325230512e-06, + "loss": 0.4077, + "step": 1177 + }, + { + "epoch": 0.2962776659959759, + "grad_norm": 0.42817094922065735, + "learning_rate": 9.874266554903605e-06, + "loss": 0.3958, + "step": 1178 + }, + { + "epoch": 0.2965291750503018, + "grad_norm": 0.5152884125709534, + "learning_rate": 9.882648784576699e-06, + "loss": 0.3644, + "step": 1179 + }, + { + "epoch": 0.29678068410462777, + "grad_norm": 0.5255314707756042, + "learning_rate": 9.891031014249792e-06, + "loss": 0.4299, + "step": 1180 + }, + { + "epoch": 0.29703219315895374, + "grad_norm": 0.45311322808265686, + "learning_rate": 9.899413243922884e-06, + "loss": 0.4234, + "step": 1181 + }, + { + "epoch": 0.29728370221327965, + "grad_norm": 0.5598365068435669, + "learning_rate": 9.907795473595977e-06, + "loss": 0.4224, + "step": 1182 + }, + { + "epoch": 0.2975352112676056, + "grad_norm": 0.45655763149261475, + "learning_rate": 9.916177703269071e-06, + "loss": 0.4283, + "step": 1183 + }, + { + "epoch": 0.2977867203219316, + "grad_norm": 0.5361334085464478, + "learning_rate": 9.924559932942164e-06, + "loss": 0.4216, + "step": 1184 + }, + { + "epoch": 0.29803822937625757, + "grad_norm": 0.45937347412109375, + "learning_rate": 9.932942162615256e-06, + "loss": 0.4372, + "step": 1185 + }, + { + "epoch": 0.2982897384305835, + "grad_norm": 0.5063474178314209, + "learning_rate": 9.941324392288349e-06, + "loss": 0.434, + "step": 1186 + }, + { + "epoch": 0.29854124748490946, + "grad_norm": 0.5321951508522034, + "learning_rate": 9.949706621961442e-06, + "loss": 0.4464, + "step": 1187 + }, + { + "epoch": 0.2987927565392354, + "grad_norm": 0.4343951344490051, + "learning_rate": 9.958088851634536e-06, + "loss": 0.4008, + "step": 1188 + }, + { + "epoch": 0.29904426559356134, + "grad_norm": 0.48022282123565674, + "learning_rate": 9.966471081307629e-06, + "loss": 0.4137, + "step": 1189 + }, + { + "epoch": 0.2992957746478873, + "grad_norm": 0.4919980466365814, + "learning_rate": 9.974853310980723e-06, + "loss": 0.4176, + "step": 1190 + }, + { + "epoch": 0.2995472837022133, + "grad_norm": 0.4395059049129486, + "learning_rate": 9.983235540653816e-06, + "loss": 0.3917, + "step": 1191 + }, + { + "epoch": 0.29979879275653926, + "grad_norm": 0.4927704930305481, + "learning_rate": 9.991617770326908e-06, + "loss": 0.3653, + "step": 1192 + }, + { + "epoch": 0.30005030181086517, + "grad_norm": 0.458484411239624, + "learning_rate": 1e-05, + "loss": 0.4349, + "step": 1193 + }, + { + "epoch": 0.30030181086519114, + "grad_norm": 0.44261494278907776, + "learning_rate": 9.999999785890641e-06, + "loss": 0.4293, + "step": 1194 + }, + { + "epoch": 0.3005533199195171, + "grad_norm": 0.5019251108169556, + "learning_rate": 9.999999143562579e-06, + "loss": 0.423, + "step": 1195 + }, + { + "epoch": 0.3008048289738431, + "grad_norm": 0.42558160424232483, + "learning_rate": 9.99999807301587e-06, + "loss": 0.3981, + "step": 1196 + }, + { + "epoch": 0.301056338028169, + "grad_norm": 0.44284799695014954, + "learning_rate": 9.999996574250606e-06, + "loss": 0.4336, + "step": 1197 + }, + { + "epoch": 0.301307847082495, + "grad_norm": 0.45541757345199585, + "learning_rate": 9.999994647266916e-06, + "loss": 0.4049, + "step": 1198 + }, + { + "epoch": 0.30155935613682094, + "grad_norm": 0.5015101432800293, + "learning_rate": 9.999992292064964e-06, + "loss": 0.3797, + "step": 1199 + }, + { + "epoch": 0.30181086519114686, + "grad_norm": 0.4215429723262787, + "learning_rate": 9.999989508644953e-06, + "loss": 0.4165, + "step": 1200 + }, + { + "epoch": 0.30206237424547283, + "grad_norm": 0.43915969133377075, + "learning_rate": 9.999986297007118e-06, + "loss": 0.4326, + "step": 1201 + }, + { + "epoch": 0.3023138832997988, + "grad_norm": 0.46269261837005615, + "learning_rate": 9.999982657151738e-06, + "loss": 0.4243, + "step": 1202 + }, + { + "epoch": 0.3025653923541248, + "grad_norm": 0.4217459261417389, + "learning_rate": 9.999978589079125e-06, + "loss": 0.4055, + "step": 1203 + }, + { + "epoch": 0.3028169014084507, + "grad_norm": 0.5113700032234192, + "learning_rate": 9.999974092789623e-06, + "loss": 0.4091, + "step": 1204 + }, + { + "epoch": 0.30306841046277666, + "grad_norm": 0.4101250171661377, + "learning_rate": 9.999969168283621e-06, + "loss": 0.401, + "step": 1205 + }, + { + "epoch": 0.30331991951710263, + "grad_norm": 0.5615075826644897, + "learning_rate": 9.999963815561538e-06, + "loss": 0.4226, + "step": 1206 + }, + { + "epoch": 0.30357142857142855, + "grad_norm": 0.4130154550075531, + "learning_rate": 9.999958034623836e-06, + "loss": 0.4115, + "step": 1207 + }, + { + "epoch": 0.3038229376257545, + "grad_norm": 0.4956052899360657, + "learning_rate": 9.999951825471005e-06, + "loss": 0.4305, + "step": 1208 + }, + { + "epoch": 0.3040744466800805, + "grad_norm": 0.5325097441673279, + "learning_rate": 9.999945188103582e-06, + "loss": 0.4063, + "step": 1209 + }, + { + "epoch": 0.30432595573440646, + "grad_norm": 0.45363694429397583, + "learning_rate": 9.999938122522133e-06, + "loss": 0.4244, + "step": 1210 + }, + { + "epoch": 0.3045774647887324, + "grad_norm": 0.41715431213378906, + "learning_rate": 9.999930628727264e-06, + "loss": 0.3687, + "step": 1211 + }, + { + "epoch": 0.30482897384305835, + "grad_norm": 0.49758753180503845, + "learning_rate": 9.999922706719614e-06, + "loss": 0.3923, + "step": 1212 + }, + { + "epoch": 0.3050804828973843, + "grad_norm": 0.4848545491695404, + "learning_rate": 9.999914356499864e-06, + "loss": 0.4266, + "step": 1213 + }, + { + "epoch": 0.30533199195171024, + "grad_norm": 0.3956749439239502, + "learning_rate": 9.99990557806873e-06, + "loss": 0.3876, + "step": 1214 + }, + { + "epoch": 0.3055835010060362, + "grad_norm": 0.4752289354801178, + "learning_rate": 9.999896371426962e-06, + "loss": 0.439, + "step": 1215 + }, + { + "epoch": 0.3058350100603622, + "grad_norm": 0.4210512936115265, + "learning_rate": 9.999886736575349e-06, + "loss": 0.4215, + "step": 1216 + }, + { + "epoch": 0.30608651911468815, + "grad_norm": 0.44492197036743164, + "learning_rate": 9.999876673514718e-06, + "loss": 0.436, + "step": 1217 + }, + { + "epoch": 0.30633802816901406, + "grad_norm": 0.41508495807647705, + "learning_rate": 9.999866182245926e-06, + "loss": 0.3962, + "step": 1218 + }, + { + "epoch": 0.30658953722334004, + "grad_norm": 0.5030407905578613, + "learning_rate": 9.999855262769875e-06, + "loss": 0.4369, + "step": 1219 + }, + { + "epoch": 0.306841046277666, + "grad_norm": 0.43382754921913147, + "learning_rate": 9.9998439150875e-06, + "loss": 0.4056, + "step": 1220 + }, + { + "epoch": 0.307092555331992, + "grad_norm": 0.4502340853214264, + "learning_rate": 9.999832139199775e-06, + "loss": 0.4028, + "step": 1221 + }, + { + "epoch": 0.3073440643863179, + "grad_norm": 0.46800076961517334, + "learning_rate": 9.999819935107705e-06, + "loss": 0.401, + "step": 1222 + }, + { + "epoch": 0.30759557344064387, + "grad_norm": 0.47623687982559204, + "learning_rate": 9.999807302812335e-06, + "loss": 0.394, + "step": 1223 + }, + { + "epoch": 0.30784708249496984, + "grad_norm": 0.4112638831138611, + "learning_rate": 9.99979424231475e-06, + "loss": 0.407, + "step": 1224 + }, + { + "epoch": 0.30809859154929575, + "grad_norm": 0.5141077041625977, + "learning_rate": 9.999780753616064e-06, + "loss": 0.4244, + "step": 1225 + }, + { + "epoch": 0.3083501006036217, + "grad_norm": 0.4799872636795044, + "learning_rate": 9.999766836717437e-06, + "loss": 0.4207, + "step": 1226 + }, + { + "epoch": 0.3086016096579477, + "grad_norm": 0.5113214254379272, + "learning_rate": 9.999752491620058e-06, + "loss": 0.4062, + "step": 1227 + }, + { + "epoch": 0.30885311871227367, + "grad_norm": 0.5039216876029968, + "learning_rate": 9.999737718325157e-06, + "loss": 0.42, + "step": 1228 + }, + { + "epoch": 0.3091046277665996, + "grad_norm": 0.44573184847831726, + "learning_rate": 9.999722516833999e-06, + "loss": 0.4212, + "step": 1229 + }, + { + "epoch": 0.30935613682092555, + "grad_norm": 0.4311496615409851, + "learning_rate": 9.999706887147884e-06, + "loss": 0.4031, + "step": 1230 + }, + { + "epoch": 0.3096076458752515, + "grad_norm": 0.5166363716125488, + "learning_rate": 9.999690829268154e-06, + "loss": 0.4214, + "step": 1231 + }, + { + "epoch": 0.30985915492957744, + "grad_norm": 0.44504567980766296, + "learning_rate": 9.999674343196182e-06, + "loss": 0.3918, + "step": 1232 + }, + { + "epoch": 0.3101106639839034, + "grad_norm": 0.44314002990722656, + "learning_rate": 9.99965742893338e-06, + "loss": 0.4428, + "step": 1233 + }, + { + "epoch": 0.3103621730382294, + "grad_norm": 0.4878178536891937, + "learning_rate": 9.999640086481198e-06, + "loss": 0.3875, + "step": 1234 + }, + { + "epoch": 0.31061368209255535, + "grad_norm": 0.45717740058898926, + "learning_rate": 9.99962231584112e-06, + "loss": 0.3977, + "step": 1235 + }, + { + "epoch": 0.31086519114688127, + "grad_norm": 0.4271807372570038, + "learning_rate": 9.999604117014667e-06, + "loss": 0.4027, + "step": 1236 + }, + { + "epoch": 0.31111670020120724, + "grad_norm": 0.4877811074256897, + "learning_rate": 9.999585490003399e-06, + "loss": 0.3856, + "step": 1237 + }, + { + "epoch": 0.3113682092555332, + "grad_norm": 0.5068530440330505, + "learning_rate": 9.999566434808912e-06, + "loss": 0.3895, + "step": 1238 + }, + { + "epoch": 0.31161971830985913, + "grad_norm": 0.4381490647792816, + "learning_rate": 9.999546951432837e-06, + "loss": 0.4222, + "step": 1239 + }, + { + "epoch": 0.3118712273641851, + "grad_norm": 0.44648122787475586, + "learning_rate": 9.999527039876843e-06, + "loss": 0.4092, + "step": 1240 + }, + { + "epoch": 0.31212273641851107, + "grad_norm": 0.5271798968315125, + "learning_rate": 9.999506700142633e-06, + "loss": 0.4265, + "step": 1241 + }, + { + "epoch": 0.31237424547283704, + "grad_norm": 0.467507928609848, + "learning_rate": 9.999485932231951e-06, + "loss": 0.4255, + "step": 1242 + }, + { + "epoch": 0.31262575452716296, + "grad_norm": 0.43331214785575867, + "learning_rate": 9.999464736146578e-06, + "loss": 0.4022, + "step": 1243 + }, + { + "epoch": 0.31287726358148893, + "grad_norm": 0.511960506439209, + "learning_rate": 9.999443111888325e-06, + "loss": 0.4229, + "step": 1244 + }, + { + "epoch": 0.3131287726358149, + "grad_norm": 0.4495149850845337, + "learning_rate": 9.999421059459047e-06, + "loss": 0.4109, + "step": 1245 + }, + { + "epoch": 0.31338028169014087, + "grad_norm": 0.4889768362045288, + "learning_rate": 9.999398578860631e-06, + "loss": 0.3932, + "step": 1246 + }, + { + "epoch": 0.3136317907444668, + "grad_norm": 0.4641020596027374, + "learning_rate": 9.999375670095003e-06, + "loss": 0.4236, + "step": 1247 + }, + { + "epoch": 0.31388329979879276, + "grad_norm": 0.47060105204582214, + "learning_rate": 9.999352333164125e-06, + "loss": 0.418, + "step": 1248 + }, + { + "epoch": 0.31413480885311873, + "grad_norm": 0.4797796308994293, + "learning_rate": 9.999328568069994e-06, + "loss": 0.4168, + "step": 1249 + }, + { + "epoch": 0.31438631790744465, + "grad_norm": 0.474357932806015, + "learning_rate": 9.99930437481465e-06, + "loss": 0.4113, + "step": 1250 + }, + { + "epoch": 0.3146378269617706, + "grad_norm": 0.4692329466342926, + "learning_rate": 9.99927975340016e-06, + "loss": 0.3967, + "step": 1251 + }, + { + "epoch": 0.3148893360160966, + "grad_norm": 0.5016288161277771, + "learning_rate": 9.999254703828634e-06, + "loss": 0.427, + "step": 1252 + }, + { + "epoch": 0.31514084507042256, + "grad_norm": 0.42063063383102417, + "learning_rate": 9.999229226102218e-06, + "loss": 0.423, + "step": 1253 + }, + { + "epoch": 0.3153923541247485, + "grad_norm": 0.4207392632961273, + "learning_rate": 9.999203320223095e-06, + "loss": 0.4237, + "step": 1254 + }, + { + "epoch": 0.31564386317907445, + "grad_norm": 0.5132213830947876, + "learning_rate": 9.999176986193481e-06, + "loss": 0.3826, + "step": 1255 + }, + { + "epoch": 0.3158953722334004, + "grad_norm": 0.4629036784172058, + "learning_rate": 9.999150224015634e-06, + "loss": 0.3976, + "step": 1256 + }, + { + "epoch": 0.31614688128772633, + "grad_norm": 0.45483675599098206, + "learning_rate": 9.999123033691844e-06, + "loss": 0.4291, + "step": 1257 + }, + { + "epoch": 0.3163983903420523, + "grad_norm": 0.45758309960365295, + "learning_rate": 9.999095415224443e-06, + "loss": 0.4088, + "step": 1258 + }, + { + "epoch": 0.3166498993963783, + "grad_norm": 0.49029773473739624, + "learning_rate": 9.999067368615791e-06, + "loss": 0.4092, + "step": 1259 + }, + { + "epoch": 0.31690140845070425, + "grad_norm": 0.4661732017993927, + "learning_rate": 9.999038893868293e-06, + "loss": 0.4094, + "step": 1260 + }, + { + "epoch": 0.31715291750503016, + "grad_norm": 0.43760454654693604, + "learning_rate": 9.999009990984389e-06, + "loss": 0.3962, + "step": 1261 + }, + { + "epoch": 0.31740442655935613, + "grad_norm": 0.46413469314575195, + "learning_rate": 9.998980659966553e-06, + "loss": 0.4161, + "step": 1262 + }, + { + "epoch": 0.3176559356136821, + "grad_norm": 0.46015673875808716, + "learning_rate": 9.998950900817297e-06, + "loss": 0.3893, + "step": 1263 + }, + { + "epoch": 0.317907444668008, + "grad_norm": 0.44548308849334717, + "learning_rate": 9.998920713539169e-06, + "loss": 0.4252, + "step": 1264 + }, + { + "epoch": 0.318158953722334, + "grad_norm": 0.452181875705719, + "learning_rate": 9.998890098134757e-06, + "loss": 0.4226, + "step": 1265 + }, + { + "epoch": 0.31841046277665996, + "grad_norm": 0.5392069220542908, + "learning_rate": 9.998859054606677e-06, + "loss": 0.4124, + "step": 1266 + }, + { + "epoch": 0.31866197183098594, + "grad_norm": 0.43742406368255615, + "learning_rate": 9.998827582957596e-06, + "loss": 0.4125, + "step": 1267 + }, + { + "epoch": 0.31891348088531185, + "grad_norm": 0.4227537512779236, + "learning_rate": 9.998795683190202e-06, + "loss": 0.4211, + "step": 1268 + }, + { + "epoch": 0.3191649899396378, + "grad_norm": 0.4836387038230896, + "learning_rate": 9.998763355307232e-06, + "loss": 0.4342, + "step": 1269 + }, + { + "epoch": 0.3194164989939638, + "grad_norm": 0.48729145526885986, + "learning_rate": 9.998730599311452e-06, + "loss": 0.4045, + "step": 1270 + }, + { + "epoch": 0.31966800804828976, + "grad_norm": 0.46449029445648193, + "learning_rate": 9.998697415205667e-06, + "loss": 0.4175, + "step": 1271 + }, + { + "epoch": 0.3199195171026157, + "grad_norm": 0.4990949332714081, + "learning_rate": 9.998663802992723e-06, + "loss": 0.435, + "step": 1272 + }, + { + "epoch": 0.32017102615694165, + "grad_norm": 0.42860570549964905, + "learning_rate": 9.998629762675493e-06, + "loss": 0.4114, + "step": 1273 + }, + { + "epoch": 0.3204225352112676, + "grad_norm": 0.5099340081214905, + "learning_rate": 9.998595294256897e-06, + "loss": 0.4395, + "step": 1274 + }, + { + "epoch": 0.32067404426559354, + "grad_norm": 0.5020102262496948, + "learning_rate": 9.998560397739885e-06, + "loss": 0.3879, + "step": 1275 + }, + { + "epoch": 0.3209255533199195, + "grad_norm": 0.41706791520118713, + "learning_rate": 9.998525073127445e-06, + "loss": 0.4166, + "step": 1276 + }, + { + "epoch": 0.3211770623742455, + "grad_norm": 0.43367356061935425, + "learning_rate": 9.998489320422604e-06, + "loss": 0.4152, + "step": 1277 + }, + { + "epoch": 0.32142857142857145, + "grad_norm": 0.3915824294090271, + "learning_rate": 9.998453139628422e-06, + "loss": 0.393, + "step": 1278 + }, + { + "epoch": 0.32168008048289737, + "grad_norm": 0.49054497480392456, + "learning_rate": 9.998416530748e-06, + "loss": 0.4235, + "step": 1279 + }, + { + "epoch": 0.32193158953722334, + "grad_norm": 0.4301794469356537, + "learning_rate": 9.998379493784472e-06, + "loss": 0.4096, + "step": 1280 + }, + { + "epoch": 0.3221830985915493, + "grad_norm": 0.5517282485961914, + "learning_rate": 9.998342028741009e-06, + "loss": 0.395, + "step": 1281 + }, + { + "epoch": 0.3224346076458752, + "grad_norm": 0.47454625368118286, + "learning_rate": 9.998304135620822e-06, + "loss": 0.4091, + "step": 1282 + }, + { + "epoch": 0.3226861167002012, + "grad_norm": 0.44431060552597046, + "learning_rate": 9.998265814427156e-06, + "loss": 0.4341, + "step": 1283 + }, + { + "epoch": 0.32293762575452717, + "grad_norm": 0.5420903563499451, + "learning_rate": 9.998227065163289e-06, + "loss": 0.3937, + "step": 1284 + }, + { + "epoch": 0.32318913480885314, + "grad_norm": 0.49979931116104126, + "learning_rate": 9.998187887832544e-06, + "loss": 0.4236, + "step": 1285 + }, + { + "epoch": 0.32344064386317906, + "grad_norm": 0.488714337348938, + "learning_rate": 9.998148282438276e-06, + "loss": 0.3862, + "step": 1286 + }, + { + "epoch": 0.323692152917505, + "grad_norm": 0.5104209184646606, + "learning_rate": 9.998108248983875e-06, + "loss": 0.4263, + "step": 1287 + }, + { + "epoch": 0.323943661971831, + "grad_norm": 0.5148292779922485, + "learning_rate": 9.998067787472772e-06, + "loss": 0.3773, + "step": 1288 + }, + { + "epoch": 0.3241951710261569, + "grad_norm": 0.46651163697242737, + "learning_rate": 9.998026897908429e-06, + "loss": 0.4054, + "step": 1289 + }, + { + "epoch": 0.3244466800804829, + "grad_norm": 0.5032572150230408, + "learning_rate": 9.99798558029435e-06, + "loss": 0.3935, + "step": 1290 + }, + { + "epoch": 0.32469818913480886, + "grad_norm": 0.4591807425022125, + "learning_rate": 9.997943834634074e-06, + "loss": 0.3915, + "step": 1291 + }, + { + "epoch": 0.32494969818913483, + "grad_norm": 0.4753274917602539, + "learning_rate": 9.997901660931175e-06, + "loss": 0.4095, + "step": 1292 + }, + { + "epoch": 0.32520120724346074, + "grad_norm": 0.4749658703804016, + "learning_rate": 9.997859059189266e-06, + "loss": 0.384, + "step": 1293 + }, + { + "epoch": 0.3254527162977867, + "grad_norm": 0.4379113018512726, + "learning_rate": 9.997816029411996e-06, + "loss": 0.4158, + "step": 1294 + }, + { + "epoch": 0.3257042253521127, + "grad_norm": 0.5521072745323181, + "learning_rate": 9.997772571603047e-06, + "loss": 0.4081, + "step": 1295 + }, + { + "epoch": 0.32595573440643866, + "grad_norm": 0.45863693952560425, + "learning_rate": 9.997728685766144e-06, + "loss": 0.4219, + "step": 1296 + }, + { + "epoch": 0.3262072434607646, + "grad_norm": 0.5103247165679932, + "learning_rate": 9.997684371905046e-06, + "loss": 0.428, + "step": 1297 + }, + { + "epoch": 0.32645875251509054, + "grad_norm": 0.4744894802570343, + "learning_rate": 9.997639630023546e-06, + "loss": 0.4249, + "step": 1298 + }, + { + "epoch": 0.3267102615694165, + "grad_norm": 0.47997725009918213, + "learning_rate": 9.997594460125477e-06, + "loss": 0.4097, + "step": 1299 + }, + { + "epoch": 0.32696177062374243, + "grad_norm": 0.48586708307266235, + "learning_rate": 9.997548862214708e-06, + "loss": 0.4036, + "step": 1300 + }, + { + "epoch": 0.3272132796780684, + "grad_norm": 0.4447525441646576, + "learning_rate": 9.99750283629514e-06, + "loss": 0.3932, + "step": 1301 + }, + { + "epoch": 0.3274647887323944, + "grad_norm": 0.4103879928588867, + "learning_rate": 9.997456382370723e-06, + "loss": 0.4181, + "step": 1302 + }, + { + "epoch": 0.32771629778672035, + "grad_norm": 0.4686889052391052, + "learning_rate": 9.99740950044543e-06, + "loss": 0.4132, + "step": 1303 + }, + { + "epoch": 0.32796780684104626, + "grad_norm": 0.41833746433258057, + "learning_rate": 9.997362190523275e-06, + "loss": 0.4269, + "step": 1304 + }, + { + "epoch": 0.32821931589537223, + "grad_norm": 0.4362890124320984, + "learning_rate": 9.997314452608313e-06, + "loss": 0.4028, + "step": 1305 + }, + { + "epoch": 0.3284708249496982, + "grad_norm": 0.47095364332199097, + "learning_rate": 9.99726628670463e-06, + "loss": 0.4157, + "step": 1306 + }, + { + "epoch": 0.3287223340040241, + "grad_norm": 0.44647061824798584, + "learning_rate": 9.997217692816355e-06, + "loss": 0.3963, + "step": 1307 + }, + { + "epoch": 0.3289738430583501, + "grad_norm": 0.44647330045700073, + "learning_rate": 9.997168670947644e-06, + "loss": 0.3993, + "step": 1308 + }, + { + "epoch": 0.32922535211267606, + "grad_norm": 0.46429726481437683, + "learning_rate": 9.9971192211027e-06, + "loss": 0.4067, + "step": 1309 + }, + { + "epoch": 0.32947686116700203, + "grad_norm": 0.465006560087204, + "learning_rate": 9.997069343285757e-06, + "loss": 0.3886, + "step": 1310 + }, + { + "epoch": 0.32972837022132795, + "grad_norm": 0.4865345358848572, + "learning_rate": 9.997019037501086e-06, + "loss": 0.4055, + "step": 1311 + }, + { + "epoch": 0.3299798792756539, + "grad_norm": 0.46681225299835205, + "learning_rate": 9.996968303752996e-06, + "loss": 0.4287, + "step": 1312 + }, + { + "epoch": 0.3302313883299799, + "grad_norm": 0.5172387361526489, + "learning_rate": 9.996917142045832e-06, + "loss": 0.4436, + "step": 1313 + }, + { + "epoch": 0.33048289738430586, + "grad_norm": 0.3964807689189911, + "learning_rate": 9.996865552383975e-06, + "loss": 0.4051, + "step": 1314 + }, + { + "epoch": 0.3307344064386318, + "grad_norm": 0.41110795736312866, + "learning_rate": 9.996813534771843e-06, + "loss": 0.3752, + "step": 1315 + }, + { + "epoch": 0.33098591549295775, + "grad_norm": 0.4638274908065796, + "learning_rate": 9.996761089213891e-06, + "loss": 0.4208, + "step": 1316 + }, + { + "epoch": 0.3312374245472837, + "grad_norm": 0.4156520962715149, + "learning_rate": 9.996708215714613e-06, + "loss": 0.4042, + "step": 1317 + }, + { + "epoch": 0.33148893360160964, + "grad_norm": 0.42104753851890564, + "learning_rate": 9.996654914278535e-06, + "loss": 0.4499, + "step": 1318 + }, + { + "epoch": 0.3317404426559356, + "grad_norm": 0.42806974053382874, + "learning_rate": 9.996601184910223e-06, + "loss": 0.3979, + "step": 1319 + }, + { + "epoch": 0.3319919517102616, + "grad_norm": 0.38782554864883423, + "learning_rate": 9.996547027614279e-06, + "loss": 0.392, + "step": 1320 + }, + { + "epoch": 0.33224346076458755, + "grad_norm": 0.4238336682319641, + "learning_rate": 9.996492442395338e-06, + "loss": 0.4193, + "step": 1321 + }, + { + "epoch": 0.33249496981891347, + "grad_norm": 0.4696759879589081, + "learning_rate": 9.996437429258079e-06, + "loss": 0.4137, + "step": 1322 + }, + { + "epoch": 0.33274647887323944, + "grad_norm": 0.4905252158641815, + "learning_rate": 9.996381988207211e-06, + "loss": 0.4354, + "step": 1323 + }, + { + "epoch": 0.3329979879275654, + "grad_norm": 0.4017488360404968, + "learning_rate": 9.996326119247484e-06, + "loss": 0.4061, + "step": 1324 + }, + { + "epoch": 0.3332494969818913, + "grad_norm": 0.4732036292552948, + "learning_rate": 9.99626982238368e-06, + "loss": 0.4296, + "step": 1325 + }, + { + "epoch": 0.3335010060362173, + "grad_norm": 0.4144299626350403, + "learning_rate": 9.996213097620623e-06, + "loss": 0.4169, + "step": 1326 + }, + { + "epoch": 0.33375251509054327, + "grad_norm": 0.45625752210617065, + "learning_rate": 9.996155944963173e-06, + "loss": 0.4019, + "step": 1327 + }, + { + "epoch": 0.33400402414486924, + "grad_norm": 0.3766085207462311, + "learning_rate": 9.996098364416219e-06, + "loss": 0.3802, + "step": 1328 + }, + { + "epoch": 0.33425553319919515, + "grad_norm": 0.40613994002342224, + "learning_rate": 9.996040355984697e-06, + "loss": 0.3963, + "step": 1329 + }, + { + "epoch": 0.3345070422535211, + "grad_norm": 0.4915297031402588, + "learning_rate": 9.995981919673571e-06, + "loss": 0.4075, + "step": 1330 + }, + { + "epoch": 0.3347585513078471, + "grad_norm": 0.40623193979263306, + "learning_rate": 9.995923055487853e-06, + "loss": 0.392, + "step": 1331 + }, + { + "epoch": 0.335010060362173, + "grad_norm": 0.47242090106010437, + "learning_rate": 9.995863763432577e-06, + "loss": 0.3792, + "step": 1332 + }, + { + "epoch": 0.335261569416499, + "grad_norm": 0.39874276518821716, + "learning_rate": 9.995804043512824e-06, + "loss": 0.3989, + "step": 1333 + }, + { + "epoch": 0.33551307847082495, + "grad_norm": 0.4230663776397705, + "learning_rate": 9.995743895733707e-06, + "loss": 0.4004, + "step": 1334 + }, + { + "epoch": 0.3357645875251509, + "grad_norm": 0.4566534161567688, + "learning_rate": 9.99568332010038e-06, + "loss": 0.4478, + "step": 1335 + }, + { + "epoch": 0.33601609657947684, + "grad_norm": 0.48702409863471985, + "learning_rate": 9.99562231661803e-06, + "loss": 0.4014, + "step": 1336 + }, + { + "epoch": 0.3362676056338028, + "grad_norm": 0.4124425947666168, + "learning_rate": 9.995560885291879e-06, + "loss": 0.4075, + "step": 1337 + }, + { + "epoch": 0.3365191146881288, + "grad_norm": 0.4736267626285553, + "learning_rate": 9.995499026127194e-06, + "loss": 0.381, + "step": 1338 + }, + { + "epoch": 0.33677062374245476, + "grad_norm": 0.41389134526252747, + "learning_rate": 9.995436739129267e-06, + "loss": 0.4299, + "step": 1339 + }, + { + "epoch": 0.33702213279678067, + "grad_norm": 0.49148082733154297, + "learning_rate": 9.995374024303433e-06, + "loss": 0.4016, + "step": 1340 + }, + { + "epoch": 0.33727364185110664, + "grad_norm": 0.4507843554019928, + "learning_rate": 9.995310881655066e-06, + "loss": 0.4305, + "step": 1341 + }, + { + "epoch": 0.3375251509054326, + "grad_norm": 0.41378965973854065, + "learning_rate": 9.995247311189573e-06, + "loss": 0.3896, + "step": 1342 + }, + { + "epoch": 0.33777665995975853, + "grad_norm": 0.511529803276062, + "learning_rate": 9.9951833129124e-06, + "loss": 0.4367, + "step": 1343 + }, + { + "epoch": 0.3380281690140845, + "grad_norm": 0.4993535578250885, + "learning_rate": 9.995118886829023e-06, + "loss": 0.4217, + "step": 1344 + }, + { + "epoch": 0.33827967806841047, + "grad_norm": 0.4538334012031555, + "learning_rate": 9.995054032944963e-06, + "loss": 0.4088, + "step": 1345 + }, + { + "epoch": 0.33853118712273644, + "grad_norm": 0.42592087388038635, + "learning_rate": 9.994988751265775e-06, + "loss": 0.4142, + "step": 1346 + }, + { + "epoch": 0.33878269617706236, + "grad_norm": 0.4514321982860565, + "learning_rate": 9.994923041797049e-06, + "loss": 0.4318, + "step": 1347 + }, + { + "epoch": 0.33903420523138833, + "grad_norm": 0.43856939673423767, + "learning_rate": 9.994856904544412e-06, + "loss": 0.4195, + "step": 1348 + }, + { + "epoch": 0.3392857142857143, + "grad_norm": 0.4376979470252991, + "learning_rate": 9.99479033951353e-06, + "loss": 0.391, + "step": 1349 + }, + { + "epoch": 0.3395372233400402, + "grad_norm": 0.43217065930366516, + "learning_rate": 9.9947233467101e-06, + "loss": 0.4241, + "step": 1350 + }, + { + "epoch": 0.3397887323943662, + "grad_norm": 0.44194620847702026, + "learning_rate": 9.994655926139864e-06, + "loss": 0.401, + "step": 1351 + }, + { + "epoch": 0.34004024144869216, + "grad_norm": 0.439430296421051, + "learning_rate": 9.994588077808595e-06, + "loss": 0.4185, + "step": 1352 + }, + { + "epoch": 0.34029175050301813, + "grad_norm": 0.4721353054046631, + "learning_rate": 9.994519801722103e-06, + "loss": 0.4053, + "step": 1353 + }, + { + "epoch": 0.34054325955734405, + "grad_norm": 0.4690324068069458, + "learning_rate": 9.994451097886236e-06, + "loss": 0.4294, + "step": 1354 + }, + { + "epoch": 0.34079476861167, + "grad_norm": 0.4453498125076294, + "learning_rate": 9.994381966306877e-06, + "loss": 0.4371, + "step": 1355 + }, + { + "epoch": 0.341046277665996, + "grad_norm": 0.43481093645095825, + "learning_rate": 9.994312406989947e-06, + "loss": 0.4068, + "step": 1356 + }, + { + "epoch": 0.3412977867203219, + "grad_norm": 0.45696157217025757, + "learning_rate": 9.994242419941403e-06, + "loss": 0.4137, + "step": 1357 + }, + { + "epoch": 0.3415492957746479, + "grad_norm": 0.5058369636535645, + "learning_rate": 9.994172005167242e-06, + "loss": 0.4273, + "step": 1358 + }, + { + "epoch": 0.34180080482897385, + "grad_norm": 0.43768513202667236, + "learning_rate": 9.994101162673491e-06, + "loss": 0.3701, + "step": 1359 + }, + { + "epoch": 0.3420523138832998, + "grad_norm": 0.447859525680542, + "learning_rate": 9.994029892466218e-06, + "loss": 0.4028, + "step": 1360 + }, + { + "epoch": 0.34230382293762573, + "grad_norm": 0.5294939279556274, + "learning_rate": 9.993958194551528e-06, + "loss": 0.4043, + "step": 1361 + }, + { + "epoch": 0.3425553319919517, + "grad_norm": 0.4057393968105316, + "learning_rate": 9.993886068935559e-06, + "loss": 0.4308, + "step": 1362 + }, + { + "epoch": 0.3428068410462777, + "grad_norm": 0.4801323413848877, + "learning_rate": 9.993813515624491e-06, + "loss": 0.3747, + "step": 1363 + }, + { + "epoch": 0.34305835010060365, + "grad_norm": 0.519101083278656, + "learning_rate": 9.993740534624536e-06, + "loss": 0.3993, + "step": 1364 + }, + { + "epoch": 0.34330985915492956, + "grad_norm": 0.44538334012031555, + "learning_rate": 9.993667125941946e-06, + "loss": 0.3992, + "step": 1365 + }, + { + "epoch": 0.34356136820925554, + "grad_norm": 0.5548309087753296, + "learning_rate": 9.993593289583005e-06, + "loss": 0.4186, + "step": 1366 + }, + { + "epoch": 0.3438128772635815, + "grad_norm": 0.46345213055610657, + "learning_rate": 9.993519025554041e-06, + "loss": 0.3935, + "step": 1367 + }, + { + "epoch": 0.3440643863179074, + "grad_norm": 0.5251536965370178, + "learning_rate": 9.993444333861411e-06, + "loss": 0.4056, + "step": 1368 + }, + { + "epoch": 0.3443158953722334, + "grad_norm": 0.5084334015846252, + "learning_rate": 9.993369214511512e-06, + "loss": 0.4039, + "step": 1369 + }, + { + "epoch": 0.34456740442655936, + "grad_norm": 0.5444664359092712, + "learning_rate": 9.99329366751078e-06, + "loss": 0.3857, + "step": 1370 + }, + { + "epoch": 0.34481891348088534, + "grad_norm": 0.4715754985809326, + "learning_rate": 9.993217692865683e-06, + "loss": 0.4328, + "step": 1371 + }, + { + "epoch": 0.34507042253521125, + "grad_norm": 0.5068243741989136, + "learning_rate": 9.993141290582726e-06, + "loss": 0.379, + "step": 1372 + }, + { + "epoch": 0.3453219315895372, + "grad_norm": 0.5259790420532227, + "learning_rate": 9.993064460668456e-06, + "loss": 0.4116, + "step": 1373 + }, + { + "epoch": 0.3455734406438632, + "grad_norm": 0.46770498156547546, + "learning_rate": 9.992987203129451e-06, + "loss": 0.4302, + "step": 1374 + }, + { + "epoch": 0.3458249496981891, + "grad_norm": 0.47269558906555176, + "learning_rate": 9.99290951797233e-06, + "loss": 0.4102, + "step": 1375 + }, + { + "epoch": 0.3460764587525151, + "grad_norm": 0.43284550309181213, + "learning_rate": 9.99283140520374e-06, + "loss": 0.4029, + "step": 1376 + }, + { + "epoch": 0.34632796780684105, + "grad_norm": 0.49771684408187866, + "learning_rate": 9.992752864830379e-06, + "loss": 0.414, + "step": 1377 + }, + { + "epoch": 0.346579476861167, + "grad_norm": 0.4438658356666565, + "learning_rate": 9.992673896858969e-06, + "loss": 0.4105, + "step": 1378 + }, + { + "epoch": 0.34683098591549294, + "grad_norm": 0.46066081523895264, + "learning_rate": 9.992594501296272e-06, + "loss": 0.4012, + "step": 1379 + }, + { + "epoch": 0.3470824949698189, + "grad_norm": 0.562817394733429, + "learning_rate": 9.992514678149092e-06, + "loss": 0.424, + "step": 1380 + }, + { + "epoch": 0.3473340040241449, + "grad_norm": 0.5239977240562439, + "learning_rate": 9.992434427424261e-06, + "loss": 0.3936, + "step": 1381 + }, + { + "epoch": 0.3475855130784708, + "grad_norm": 0.4289341866970062, + "learning_rate": 9.992353749128653e-06, + "loss": 0.4092, + "step": 1382 + }, + { + "epoch": 0.34783702213279677, + "grad_norm": 0.565528154373169, + "learning_rate": 9.992272643269181e-06, + "loss": 0.4239, + "step": 1383 + }, + { + "epoch": 0.34808853118712274, + "grad_norm": 0.44149690866470337, + "learning_rate": 9.992191109852788e-06, + "loss": 0.4068, + "step": 1384 + }, + { + "epoch": 0.3483400402414487, + "grad_norm": 0.4838801920413971, + "learning_rate": 9.992109148886457e-06, + "loss": 0.4237, + "step": 1385 + }, + { + "epoch": 0.3485915492957746, + "grad_norm": 0.5003640651702881, + "learning_rate": 9.992026760377207e-06, + "loss": 0.3963, + "step": 1386 + }, + { + "epoch": 0.3488430583501006, + "grad_norm": 0.4707315266132355, + "learning_rate": 9.991943944332097e-06, + "loss": 0.4227, + "step": 1387 + }, + { + "epoch": 0.34909456740442657, + "grad_norm": 0.4761320650577545, + "learning_rate": 9.991860700758217e-06, + "loss": 0.4195, + "step": 1388 + }, + { + "epoch": 0.34934607645875254, + "grad_norm": 0.39643487334251404, + "learning_rate": 9.991777029662698e-06, + "loss": 0.4183, + "step": 1389 + }, + { + "epoch": 0.34959758551307846, + "grad_norm": 0.4800592064857483, + "learning_rate": 9.991692931052703e-06, + "loss": 0.4301, + "step": 1390 + }, + { + "epoch": 0.34984909456740443, + "grad_norm": 0.42438507080078125, + "learning_rate": 9.991608404935435e-06, + "loss": 0.4011, + "step": 1391 + }, + { + "epoch": 0.3501006036217304, + "grad_norm": 0.3981070816516876, + "learning_rate": 9.991523451318137e-06, + "loss": 0.4184, + "step": 1392 + }, + { + "epoch": 0.3503521126760563, + "grad_norm": 0.4492408037185669, + "learning_rate": 9.991438070208082e-06, + "loss": 0.4009, + "step": 1393 + }, + { + "epoch": 0.3506036217303823, + "grad_norm": 0.41902977228164673, + "learning_rate": 9.991352261612583e-06, + "loss": 0.4174, + "step": 1394 + }, + { + "epoch": 0.35085513078470826, + "grad_norm": 0.4707157611846924, + "learning_rate": 9.991266025538988e-06, + "loss": 0.4226, + "step": 1395 + }, + { + "epoch": 0.35110663983903423, + "grad_norm": 0.4216676950454712, + "learning_rate": 9.991179361994683e-06, + "loss": 0.4397, + "step": 1396 + }, + { + "epoch": 0.35135814889336014, + "grad_norm": 0.47134819626808167, + "learning_rate": 9.99109227098709e-06, + "loss": 0.3967, + "step": 1397 + }, + { + "epoch": 0.3516096579476861, + "grad_norm": 0.4079870283603668, + "learning_rate": 9.991004752523668e-06, + "loss": 0.394, + "step": 1398 + }, + { + "epoch": 0.3518611670020121, + "grad_norm": 0.46213197708129883, + "learning_rate": 9.990916806611915e-06, + "loss": 0.413, + "step": 1399 + }, + { + "epoch": 0.352112676056338, + "grad_norm": 0.40814223885536194, + "learning_rate": 9.99082843325936e-06, + "loss": 0.4259, + "step": 1400 + }, + { + "epoch": 0.352364185110664, + "grad_norm": 0.39051973819732666, + "learning_rate": 9.99073963247357e-06, + "loss": 0.4197, + "step": 1401 + }, + { + "epoch": 0.35261569416498995, + "grad_norm": 0.4494209885597229, + "learning_rate": 9.990650404262152e-06, + "loss": 0.3972, + "step": 1402 + }, + { + "epoch": 0.3528672032193159, + "grad_norm": 0.43199771642684937, + "learning_rate": 9.99056074863275e-06, + "loss": 0.4034, + "step": 1403 + }, + { + "epoch": 0.35311871227364183, + "grad_norm": 0.46416175365448, + "learning_rate": 9.99047066559304e-06, + "loss": 0.4253, + "step": 1404 + }, + { + "epoch": 0.3533702213279678, + "grad_norm": 0.49905675649642944, + "learning_rate": 9.990380155150739e-06, + "loss": 0.4184, + "step": 1405 + }, + { + "epoch": 0.3536217303822938, + "grad_norm": 0.41556206345558167, + "learning_rate": 9.990289217313597e-06, + "loss": 0.3867, + "step": 1406 + }, + { + "epoch": 0.3538732394366197, + "grad_norm": 0.47178298234939575, + "learning_rate": 9.990197852089403e-06, + "loss": 0.4356, + "step": 1407 + }, + { + "epoch": 0.35412474849094566, + "grad_norm": 0.4404408037662506, + "learning_rate": 9.99010605948598e-06, + "loss": 0.4127, + "step": 1408 + }, + { + "epoch": 0.35437625754527163, + "grad_norm": 0.4431535601615906, + "learning_rate": 9.990013839511193e-06, + "loss": 0.3965, + "step": 1409 + }, + { + "epoch": 0.3546277665995976, + "grad_norm": 0.4630604684352875, + "learning_rate": 9.989921192172936e-06, + "loss": 0.4251, + "step": 1410 + }, + { + "epoch": 0.3548792756539235, + "grad_norm": 0.4784621298313141, + "learning_rate": 9.989828117479149e-06, + "loss": 0.4314, + "step": 1411 + }, + { + "epoch": 0.3551307847082495, + "grad_norm": 0.4682464301586151, + "learning_rate": 9.989734615437797e-06, + "loss": 0.3869, + "step": 1412 + }, + { + "epoch": 0.35538229376257546, + "grad_norm": 0.4220459759235382, + "learning_rate": 9.989640686056891e-06, + "loss": 0.4142, + "step": 1413 + }, + { + "epoch": 0.35563380281690143, + "grad_norm": 0.4825068712234497, + "learning_rate": 9.989546329344474e-06, + "loss": 0.3999, + "step": 1414 + }, + { + "epoch": 0.35588531187122735, + "grad_norm": 0.47478923201560974, + "learning_rate": 9.989451545308633e-06, + "loss": 0.4274, + "step": 1415 + }, + { + "epoch": 0.3561368209255533, + "grad_norm": 0.37347909808158875, + "learning_rate": 9.989356333957477e-06, + "loss": 0.4146, + "step": 1416 + }, + { + "epoch": 0.3563883299798793, + "grad_norm": 0.484246164560318, + "learning_rate": 9.989260695299165e-06, + "loss": 0.3937, + "step": 1417 + }, + { + "epoch": 0.3566398390342052, + "grad_norm": 0.47754985094070435, + "learning_rate": 9.989164629341889e-06, + "loss": 0.4186, + "step": 1418 + }, + { + "epoch": 0.3568913480885312, + "grad_norm": 0.45537054538726807, + "learning_rate": 9.989068136093873e-06, + "loss": 0.4436, + "step": 1419 + }, + { + "epoch": 0.35714285714285715, + "grad_norm": 0.4763503074645996, + "learning_rate": 9.988971215563383e-06, + "loss": 0.4262, + "step": 1420 + }, + { + "epoch": 0.3573943661971831, + "grad_norm": 0.41354265809059143, + "learning_rate": 9.98887386775872e-06, + "loss": 0.4016, + "step": 1421 + }, + { + "epoch": 0.35764587525150904, + "grad_norm": 0.4630916118621826, + "learning_rate": 9.988776092688221e-06, + "loss": 0.438, + "step": 1422 + }, + { + "epoch": 0.357897384305835, + "grad_norm": 0.40816864371299744, + "learning_rate": 9.988677890360258e-06, + "loss": 0.4204, + "step": 1423 + }, + { + "epoch": 0.358148893360161, + "grad_norm": 0.4877932071685791, + "learning_rate": 9.988579260783242e-06, + "loss": 0.3938, + "step": 1424 + }, + { + "epoch": 0.3584004024144869, + "grad_norm": 0.4669157862663269, + "learning_rate": 9.988480203965623e-06, + "loss": 0.4211, + "step": 1425 + }, + { + "epoch": 0.35865191146881287, + "grad_norm": 0.40130600333213806, + "learning_rate": 9.988380719915881e-06, + "loss": 0.4003, + "step": 1426 + }, + { + "epoch": 0.35890342052313884, + "grad_norm": 0.4261634647846222, + "learning_rate": 9.988280808642538e-06, + "loss": 0.4073, + "step": 1427 + }, + { + "epoch": 0.3591549295774648, + "grad_norm": 0.4852161705493927, + "learning_rate": 9.988180470154149e-06, + "loss": 0.4191, + "step": 1428 + }, + { + "epoch": 0.3594064386317907, + "grad_norm": 0.4650234878063202, + "learning_rate": 9.98807970445931e-06, + "loss": 0.4394, + "step": 1429 + }, + { + "epoch": 0.3596579476861167, + "grad_norm": 0.4383230209350586, + "learning_rate": 9.987978511566651e-06, + "loss": 0.3868, + "step": 1430 + }, + { + "epoch": 0.35990945674044267, + "grad_norm": 0.4849243760108948, + "learning_rate": 9.987876891484836e-06, + "loss": 0.4096, + "step": 1431 + }, + { + "epoch": 0.36016096579476864, + "grad_norm": 0.40414345264434814, + "learning_rate": 9.987774844222568e-06, + "loss": 0.4105, + "step": 1432 + }, + { + "epoch": 0.36041247484909456, + "grad_norm": 0.4532756507396698, + "learning_rate": 9.987672369788589e-06, + "loss": 0.4123, + "step": 1433 + }, + { + "epoch": 0.3606639839034205, + "grad_norm": 0.46058785915374756, + "learning_rate": 9.987569468191674e-06, + "loss": 0.4172, + "step": 1434 + }, + { + "epoch": 0.3609154929577465, + "grad_norm": 0.448282390832901, + "learning_rate": 9.987466139440636e-06, + "loss": 0.393, + "step": 1435 + }, + { + "epoch": 0.3611670020120724, + "grad_norm": 0.4629223644733429, + "learning_rate": 9.987362383544326e-06, + "loss": 0.4136, + "step": 1436 + }, + { + "epoch": 0.3614185110663984, + "grad_norm": 0.44559571146965027, + "learning_rate": 9.987258200511627e-06, + "loss": 0.4002, + "step": 1437 + }, + { + "epoch": 0.36167002012072436, + "grad_norm": 0.42130130529403687, + "learning_rate": 9.987153590351466e-06, + "loss": 0.4134, + "step": 1438 + }, + { + "epoch": 0.3619215291750503, + "grad_norm": 0.4000697731971741, + "learning_rate": 9.987048553072796e-06, + "loss": 0.3989, + "step": 1439 + }, + { + "epoch": 0.36217303822937624, + "grad_norm": 0.42225921154022217, + "learning_rate": 9.986943088684619e-06, + "loss": 0.4023, + "step": 1440 + }, + { + "epoch": 0.3624245472837022, + "grad_norm": 0.39401090145111084, + "learning_rate": 9.986837197195964e-06, + "loss": 0.4141, + "step": 1441 + }, + { + "epoch": 0.3626760563380282, + "grad_norm": 0.41057607531547546, + "learning_rate": 9.9867308786159e-06, + "loss": 0.4282, + "step": 1442 + }, + { + "epoch": 0.3629275653923541, + "grad_norm": 0.40181800723075867, + "learning_rate": 9.986624132953533e-06, + "loss": 0.3689, + "step": 1443 + }, + { + "epoch": 0.3631790744466801, + "grad_norm": 0.4301682710647583, + "learning_rate": 9.986516960218005e-06, + "loss": 0.4181, + "step": 1444 + }, + { + "epoch": 0.36343058350100604, + "grad_norm": 0.40248462557792664, + "learning_rate": 9.986409360418497e-06, + "loss": 0.4252, + "step": 1445 + }, + { + "epoch": 0.363682092555332, + "grad_norm": 0.451557993888855, + "learning_rate": 9.98630133356422e-06, + "loss": 0.4201, + "step": 1446 + }, + { + "epoch": 0.36393360160965793, + "grad_norm": 0.4034324586391449, + "learning_rate": 9.986192879664428e-06, + "loss": 0.4109, + "step": 1447 + }, + { + "epoch": 0.3641851106639839, + "grad_norm": 0.41685450077056885, + "learning_rate": 9.98608399872841e-06, + "loss": 0.4119, + "step": 1448 + }, + { + "epoch": 0.3644366197183099, + "grad_norm": 0.4460361897945404, + "learning_rate": 9.985974690765492e-06, + "loss": 0.4037, + "step": 1449 + }, + { + "epoch": 0.3646881287726358, + "grad_norm": 0.42670106887817383, + "learning_rate": 9.985864955785032e-06, + "loss": 0.3833, + "step": 1450 + }, + { + "epoch": 0.36493963782696176, + "grad_norm": 0.4803377091884613, + "learning_rate": 9.98575479379643e-06, + "loss": 0.3959, + "step": 1451 + }, + { + "epoch": 0.36519114688128773, + "grad_norm": 0.4145619869232178, + "learning_rate": 9.98564420480912e-06, + "loss": 0.3964, + "step": 1452 + }, + { + "epoch": 0.3654426559356137, + "grad_norm": 0.45921090245246887, + "learning_rate": 9.985533188832575e-06, + "loss": 0.3993, + "step": 1453 + }, + { + "epoch": 0.3656941649899396, + "grad_norm": 0.39491912722587585, + "learning_rate": 9.985421745876302e-06, + "loss": 0.3956, + "step": 1454 + }, + { + "epoch": 0.3659456740442656, + "grad_norm": 0.4830361306667328, + "learning_rate": 9.985309875949844e-06, + "loss": 0.4028, + "step": 1455 + }, + { + "epoch": 0.36619718309859156, + "grad_norm": 0.42572900652885437, + "learning_rate": 9.985197579062784e-06, + "loss": 0.422, + "step": 1456 + }, + { + "epoch": 0.36644869215291753, + "grad_norm": 0.4604795277118683, + "learning_rate": 9.98508485522474e-06, + "loss": 0.4295, + "step": 1457 + }, + { + "epoch": 0.36670020120724345, + "grad_norm": 0.4539150297641754, + "learning_rate": 9.984971704445363e-06, + "loss": 0.3922, + "step": 1458 + }, + { + "epoch": 0.3669517102615694, + "grad_norm": 0.4076423645019531, + "learning_rate": 9.984858126734345e-06, + "loss": 0.412, + "step": 1459 + }, + { + "epoch": 0.3672032193158954, + "grad_norm": 0.4578339457511902, + "learning_rate": 9.984744122101415e-06, + "loss": 0.4124, + "step": 1460 + }, + { + "epoch": 0.3674547283702213, + "grad_norm": 0.43359407782554626, + "learning_rate": 9.984629690556336e-06, + "loss": 0.4002, + "step": 1461 + }, + { + "epoch": 0.3677062374245473, + "grad_norm": 0.4007089138031006, + "learning_rate": 9.984514832108905e-06, + "loss": 0.413, + "step": 1462 + }, + { + "epoch": 0.36795774647887325, + "grad_norm": 0.4894416928291321, + "learning_rate": 9.984399546768964e-06, + "loss": 0.4383, + "step": 1463 + }, + { + "epoch": 0.3682092555331992, + "grad_norm": 0.45550626516342163, + "learning_rate": 9.984283834546383e-06, + "loss": 0.4012, + "step": 1464 + }, + { + "epoch": 0.36846076458752514, + "grad_norm": 0.4249536991119385, + "learning_rate": 9.984167695451075e-06, + "loss": 0.4238, + "step": 1465 + }, + { + "epoch": 0.3687122736418511, + "grad_norm": 0.43250423669815063, + "learning_rate": 9.984051129492982e-06, + "loss": 0.4172, + "step": 1466 + }, + { + "epoch": 0.3689637826961771, + "grad_norm": 0.42905688285827637, + "learning_rate": 9.983934136682092e-06, + "loss": 0.4134, + "step": 1467 + }, + { + "epoch": 0.369215291750503, + "grad_norm": 0.4656728208065033, + "learning_rate": 9.983816717028421e-06, + "loss": 0.4207, + "step": 1468 + }, + { + "epoch": 0.36946680080482897, + "grad_norm": 0.3909688889980316, + "learning_rate": 9.983698870542028e-06, + "loss": 0.4257, + "step": 1469 + }, + { + "epoch": 0.36971830985915494, + "grad_norm": 0.49250149726867676, + "learning_rate": 9.983580597233005e-06, + "loss": 0.3796, + "step": 1470 + }, + { + "epoch": 0.3699698189134809, + "grad_norm": 0.42419931292533875, + "learning_rate": 9.98346189711148e-06, + "loss": 0.4042, + "step": 1471 + }, + { + "epoch": 0.3702213279678068, + "grad_norm": 0.41211193799972534, + "learning_rate": 9.98334277018762e-06, + "loss": 0.3932, + "step": 1472 + }, + { + "epoch": 0.3704728370221328, + "grad_norm": 0.41742807626724243, + "learning_rate": 9.983223216471627e-06, + "loss": 0.3787, + "step": 1473 + }, + { + "epoch": 0.37072434607645877, + "grad_norm": 0.42407408356666565, + "learning_rate": 9.98310323597374e-06, + "loss": 0.4117, + "step": 1474 + }, + { + "epoch": 0.3709758551307847, + "grad_norm": 0.469777375459671, + "learning_rate": 9.982982828704237e-06, + "loss": 0.4397, + "step": 1475 + }, + { + "epoch": 0.37122736418511065, + "grad_norm": 0.4012710154056549, + "learning_rate": 9.982861994673427e-06, + "loss": 0.3935, + "step": 1476 + }, + { + "epoch": 0.3714788732394366, + "grad_norm": 0.5193198323249817, + "learning_rate": 9.982740733891661e-06, + "loss": 0.3994, + "step": 1477 + }, + { + "epoch": 0.3717303822937626, + "grad_norm": 0.4547984004020691, + "learning_rate": 9.982619046369321e-06, + "loss": 0.3832, + "step": 1478 + }, + { + "epoch": 0.3719818913480885, + "grad_norm": 0.43693944811820984, + "learning_rate": 9.982496932116835e-06, + "loss": 0.3778, + "step": 1479 + }, + { + "epoch": 0.3722334004024145, + "grad_norm": 0.4877944588661194, + "learning_rate": 9.982374391144653e-06, + "loss": 0.4425, + "step": 1480 + }, + { + "epoch": 0.37248490945674045, + "grad_norm": 0.4946407973766327, + "learning_rate": 9.982251423463275e-06, + "loss": 0.407, + "step": 1481 + }, + { + "epoch": 0.3727364185110664, + "grad_norm": 0.4377669394016266, + "learning_rate": 9.98212802908323e-06, + "loss": 0.3936, + "step": 1482 + }, + { + "epoch": 0.37298792756539234, + "grad_norm": 0.42440077662467957, + "learning_rate": 9.982004208015091e-06, + "loss": 0.4186, + "step": 1483 + }, + { + "epoch": 0.3732394366197183, + "grad_norm": 0.5275130867958069, + "learning_rate": 9.981879960269458e-06, + "loss": 0.4257, + "step": 1484 + }, + { + "epoch": 0.3734909456740443, + "grad_norm": 0.5148937106132507, + "learning_rate": 9.98175528585697e-06, + "loss": 0.398, + "step": 1485 + }, + { + "epoch": 0.3737424547283702, + "grad_norm": 0.44241297245025635, + "learning_rate": 9.981630184788311e-06, + "loss": 0.4011, + "step": 1486 + }, + { + "epoch": 0.37399396378269617, + "grad_norm": 0.4566717743873596, + "learning_rate": 9.98150465707419e-06, + "loss": 0.4122, + "step": 1487 + }, + { + "epoch": 0.37424547283702214, + "grad_norm": 0.5034930109977722, + "learning_rate": 9.981378702725359e-06, + "loss": 0.3888, + "step": 1488 + }, + { + "epoch": 0.3744969818913481, + "grad_norm": 0.41092514991760254, + "learning_rate": 9.981252321752606e-06, + "loss": 0.4067, + "step": 1489 + }, + { + "epoch": 0.37474849094567403, + "grad_norm": 0.47386881709098816, + "learning_rate": 9.981125514166755e-06, + "loss": 0.4153, + "step": 1490 + }, + { + "epoch": 0.375, + "grad_norm": 0.4686683714389801, + "learning_rate": 9.980998279978664e-06, + "loss": 0.4279, + "step": 1491 + }, + { + "epoch": 0.37525150905432597, + "grad_norm": 0.4030835032463074, + "learning_rate": 9.980870619199232e-06, + "loss": 0.4043, + "step": 1492 + }, + { + "epoch": 0.3755030181086519, + "grad_norm": 0.39192453026771545, + "learning_rate": 9.980742531839393e-06, + "loss": 0.3521, + "step": 1493 + }, + { + "epoch": 0.37575452716297786, + "grad_norm": 0.42179104685783386, + "learning_rate": 9.980614017910112e-06, + "loss": 0.3976, + "step": 1494 + }, + { + "epoch": 0.37600603621730383, + "grad_norm": 0.4499700665473938, + "learning_rate": 9.980485077422404e-06, + "loss": 0.4033, + "step": 1495 + }, + { + "epoch": 0.3762575452716298, + "grad_norm": 0.40622663497924805, + "learning_rate": 9.980355710387304e-06, + "loss": 0.3914, + "step": 1496 + }, + { + "epoch": 0.3765090543259557, + "grad_norm": 0.441628098487854, + "learning_rate": 9.980225916815894e-06, + "loss": 0.41, + "step": 1497 + }, + { + "epoch": 0.3767605633802817, + "grad_norm": 0.4532158672809601, + "learning_rate": 9.980095696719291e-06, + "loss": 0.4123, + "step": 1498 + }, + { + "epoch": 0.37701207243460766, + "grad_norm": 0.3944168984889984, + "learning_rate": 9.979965050108648e-06, + "loss": 0.3929, + "step": 1499 + }, + { + "epoch": 0.3772635814889336, + "grad_norm": 0.41683652997016907, + "learning_rate": 9.979833976995153e-06, + "loss": 0.3946, + "step": 1500 + }, + { + "epoch": 0.37751509054325955, + "grad_norm": 0.4310116469860077, + "learning_rate": 9.97970247739003e-06, + "loss": 0.4022, + "step": 1501 + }, + { + "epoch": 0.3777665995975855, + "grad_norm": 0.44122573733329773, + "learning_rate": 9.979570551304543e-06, + "loss": 0.403, + "step": 1502 + }, + { + "epoch": 0.3780181086519115, + "grad_norm": 0.4259469211101532, + "learning_rate": 9.979438198749991e-06, + "loss": 0.3935, + "step": 1503 + }, + { + "epoch": 0.3782696177062374, + "grad_norm": 0.4257745146751404, + "learning_rate": 9.979305419737709e-06, + "loss": 0.4151, + "step": 1504 + }, + { + "epoch": 0.3785211267605634, + "grad_norm": 0.44491034746170044, + "learning_rate": 9.979172214279067e-06, + "loss": 0.4164, + "step": 1505 + }, + { + "epoch": 0.37877263581488935, + "grad_norm": 0.40862441062927246, + "learning_rate": 9.979038582385475e-06, + "loss": 0.3876, + "step": 1506 + }, + { + "epoch": 0.3790241448692153, + "grad_norm": 0.3930971026420593, + "learning_rate": 9.978904524068378e-06, + "loss": 0.3938, + "step": 1507 + }, + { + "epoch": 0.37927565392354123, + "grad_norm": 0.3884839117527008, + "learning_rate": 9.978770039339256e-06, + "loss": 0.3696, + "step": 1508 + }, + { + "epoch": 0.3795271629778672, + "grad_norm": 0.43522050976753235, + "learning_rate": 9.978635128209626e-06, + "loss": 0.4352, + "step": 1509 + }, + { + "epoch": 0.3797786720321932, + "grad_norm": 0.3984873294830322, + "learning_rate": 9.978499790691045e-06, + "loss": 0.3988, + "step": 1510 + }, + { + "epoch": 0.3800301810865191, + "grad_norm": 0.4236599802970886, + "learning_rate": 9.978364026795102e-06, + "loss": 0.4121, + "step": 1511 + }, + { + "epoch": 0.38028169014084506, + "grad_norm": 0.40510380268096924, + "learning_rate": 9.978227836533424e-06, + "loss": 0.416, + "step": 1512 + }, + { + "epoch": 0.38053319919517103, + "grad_norm": 0.5215635299682617, + "learning_rate": 9.978091219917675e-06, + "loss": 0.419, + "step": 1513 + }, + { + "epoch": 0.380784708249497, + "grad_norm": 0.4424278140068054, + "learning_rate": 9.977954176959558e-06, + "loss": 0.4188, + "step": 1514 + }, + { + "epoch": 0.3810362173038229, + "grad_norm": 0.40322160720825195, + "learning_rate": 9.977816707670806e-06, + "loss": 0.4067, + "step": 1515 + }, + { + "epoch": 0.3812877263581489, + "grad_norm": 0.4065972864627838, + "learning_rate": 9.977678812063195e-06, + "loss": 0.3698, + "step": 1516 + }, + { + "epoch": 0.38153923541247486, + "grad_norm": 0.47318071126937866, + "learning_rate": 9.977540490148534e-06, + "loss": 0.3954, + "step": 1517 + }, + { + "epoch": 0.3817907444668008, + "grad_norm": 0.4260292947292328, + "learning_rate": 9.977401741938667e-06, + "loss": 0.4078, + "step": 1518 + }, + { + "epoch": 0.38204225352112675, + "grad_norm": 0.5069632530212402, + "learning_rate": 9.977262567445482e-06, + "loss": 0.4108, + "step": 1519 + }, + { + "epoch": 0.3822937625754527, + "grad_norm": 0.42806974053382874, + "learning_rate": 9.977122966680896e-06, + "loss": 0.4037, + "step": 1520 + }, + { + "epoch": 0.3825452716297787, + "grad_norm": 0.42634284496307373, + "learning_rate": 9.976982939656866e-06, + "loss": 0.3896, + "step": 1521 + }, + { + "epoch": 0.3827967806841046, + "grad_norm": 0.4124469757080078, + "learning_rate": 9.976842486385379e-06, + "loss": 0.3939, + "step": 1522 + }, + { + "epoch": 0.3830482897384306, + "grad_norm": 0.43972286581993103, + "learning_rate": 9.976701606878471e-06, + "loss": 0.4143, + "step": 1523 + }, + { + "epoch": 0.38329979879275655, + "grad_norm": 0.44509053230285645, + "learning_rate": 9.976560301148203e-06, + "loss": 0.4016, + "step": 1524 + }, + { + "epoch": 0.38355130784708247, + "grad_norm": 0.4027416408061981, + "learning_rate": 9.976418569206678e-06, + "loss": 0.3824, + "step": 1525 + }, + { + "epoch": 0.38380281690140844, + "grad_norm": 0.4699016511440277, + "learning_rate": 9.976276411066037e-06, + "loss": 0.391, + "step": 1526 + }, + { + "epoch": 0.3840543259557344, + "grad_norm": 0.44244781136512756, + "learning_rate": 9.976133826738452e-06, + "loss": 0.3873, + "step": 1527 + }, + { + "epoch": 0.3843058350100604, + "grad_norm": 0.4619574546813965, + "learning_rate": 9.975990816236135e-06, + "loss": 0.4172, + "step": 1528 + }, + { + "epoch": 0.3845573440643863, + "grad_norm": 0.45410341024398804, + "learning_rate": 9.975847379571336e-06, + "loss": 0.4099, + "step": 1529 + }, + { + "epoch": 0.38480885311871227, + "grad_norm": 0.4313986003398895, + "learning_rate": 9.975703516756334e-06, + "loss": 0.4118, + "step": 1530 + }, + { + "epoch": 0.38506036217303824, + "grad_norm": 0.4216327965259552, + "learning_rate": 9.975559227803458e-06, + "loss": 0.3965, + "step": 1531 + }, + { + "epoch": 0.3853118712273642, + "grad_norm": 0.46033933758735657, + "learning_rate": 9.975414512725058e-06, + "loss": 0.3729, + "step": 1532 + }, + { + "epoch": 0.3855633802816901, + "grad_norm": 0.42646700143814087, + "learning_rate": 9.97526937153353e-06, + "loss": 0.3818, + "step": 1533 + }, + { + "epoch": 0.3858148893360161, + "grad_norm": 0.4267144203186035, + "learning_rate": 9.975123804241309e-06, + "loss": 0.4088, + "step": 1534 + }, + { + "epoch": 0.38606639839034207, + "grad_norm": 0.46120572090148926, + "learning_rate": 9.974977810860858e-06, + "loss": 0.4, + "step": 1535 + }, + { + "epoch": 0.386317907444668, + "grad_norm": 0.41420215368270874, + "learning_rate": 9.97483139140468e-06, + "loss": 0.4145, + "step": 1536 + }, + { + "epoch": 0.38656941649899396, + "grad_norm": 0.514680802822113, + "learning_rate": 9.974684545885315e-06, + "loss": 0.4021, + "step": 1537 + }, + { + "epoch": 0.3868209255533199, + "grad_norm": 0.4189431369304657, + "learning_rate": 9.97453727431534e-06, + "loss": 0.3973, + "step": 1538 + }, + { + "epoch": 0.3870724346076459, + "grad_norm": 0.47960418462753296, + "learning_rate": 9.97438957670737e-06, + "loss": 0.3942, + "step": 1539 + }, + { + "epoch": 0.3873239436619718, + "grad_norm": 0.49763473868370056, + "learning_rate": 9.974241453074051e-06, + "loss": 0.3891, + "step": 1540 + }, + { + "epoch": 0.3875754527162978, + "grad_norm": 0.48309236764907837, + "learning_rate": 9.974092903428072e-06, + "loss": 0.416, + "step": 1541 + }, + { + "epoch": 0.38782696177062376, + "grad_norm": 0.48040205240249634, + "learning_rate": 9.973943927782152e-06, + "loss": 0.3942, + "step": 1542 + }, + { + "epoch": 0.3880784708249497, + "grad_norm": 0.4268017113208771, + "learning_rate": 9.973794526149051e-06, + "loss": 0.4178, + "step": 1543 + }, + { + "epoch": 0.38832997987927564, + "grad_norm": 0.4728141725063324, + "learning_rate": 9.973644698541567e-06, + "loss": 0.3855, + "step": 1544 + }, + { + "epoch": 0.3885814889336016, + "grad_norm": 0.46177420020103455, + "learning_rate": 9.973494444972527e-06, + "loss": 0.4203, + "step": 1545 + }, + { + "epoch": 0.3888329979879276, + "grad_norm": 0.47535744309425354, + "learning_rate": 9.973343765454803e-06, + "loss": 0.4344, + "step": 1546 + }, + { + "epoch": 0.3890845070422535, + "grad_norm": 0.4919487535953522, + "learning_rate": 9.973192660001299e-06, + "loss": 0.429, + "step": 1547 + }, + { + "epoch": 0.3893360160965795, + "grad_norm": 0.40322598814964294, + "learning_rate": 9.973041128624956e-06, + "loss": 0.4181, + "step": 1548 + }, + { + "epoch": 0.38958752515090544, + "grad_norm": 0.538712739944458, + "learning_rate": 9.972889171338752e-06, + "loss": 0.3997, + "step": 1549 + }, + { + "epoch": 0.38983903420523136, + "grad_norm": 0.43118852376937866, + "learning_rate": 9.9727367881557e-06, + "loss": 0.4037, + "step": 1550 + }, + { + "epoch": 0.39009054325955733, + "grad_norm": 0.4399496912956238, + "learning_rate": 9.97258397908885e-06, + "loss": 0.4109, + "step": 1551 + }, + { + "epoch": 0.3903420523138833, + "grad_norm": 0.42758503556251526, + "learning_rate": 9.972430744151292e-06, + "loss": 0.401, + "step": 1552 + }, + { + "epoch": 0.3905935613682093, + "grad_norm": 0.4662076234817505, + "learning_rate": 9.97227708335615e-06, + "loss": 0.4129, + "step": 1553 + }, + { + "epoch": 0.3908450704225352, + "grad_norm": 0.38940244913101196, + "learning_rate": 9.97212299671658e-06, + "loss": 0.3821, + "step": 1554 + }, + { + "epoch": 0.39109657947686116, + "grad_norm": 0.4017338156700134, + "learning_rate": 9.97196848424578e-06, + "loss": 0.4161, + "step": 1555 + }, + { + "epoch": 0.39134808853118713, + "grad_norm": 0.4648374319076538, + "learning_rate": 9.971813545956986e-06, + "loss": 0.3979, + "step": 1556 + }, + { + "epoch": 0.3915995975855131, + "grad_norm": 0.3893650472164154, + "learning_rate": 9.971658181863464e-06, + "loss": 0.3949, + "step": 1557 + }, + { + "epoch": 0.391851106639839, + "grad_norm": 0.47235551476478577, + "learning_rate": 9.971502391978523e-06, + "loss": 0.4114, + "step": 1558 + }, + { + "epoch": 0.392102615694165, + "grad_norm": 0.5151132941246033, + "learning_rate": 9.971346176315501e-06, + "loss": 0.4216, + "step": 1559 + }, + { + "epoch": 0.39235412474849096, + "grad_norm": 0.36058980226516724, + "learning_rate": 9.971189534887781e-06, + "loss": 0.3828, + "step": 1560 + }, + { + "epoch": 0.3926056338028169, + "grad_norm": 0.5667259693145752, + "learning_rate": 9.971032467708779e-06, + "loss": 0.3984, + "step": 1561 + }, + { + "epoch": 0.39285714285714285, + "grad_norm": 0.4725882112979889, + "learning_rate": 9.970874974791942e-06, + "loss": 0.4021, + "step": 1562 + }, + { + "epoch": 0.3931086519114688, + "grad_norm": 0.4235588014125824, + "learning_rate": 9.970717056150764e-06, + "loss": 0.3842, + "step": 1563 + }, + { + "epoch": 0.3933601609657948, + "grad_norm": 0.5390638709068298, + "learning_rate": 9.970558711798763e-06, + "loss": 0.4046, + "step": 1564 + }, + { + "epoch": 0.3936116700201207, + "grad_norm": 0.5337787866592407, + "learning_rate": 9.970399941749507e-06, + "loss": 0.3996, + "step": 1565 + }, + { + "epoch": 0.3938631790744467, + "grad_norm": 0.4832894504070282, + "learning_rate": 9.970240746016588e-06, + "loss": 0.3957, + "step": 1566 + }, + { + "epoch": 0.39411468812877265, + "grad_norm": 0.5507562160491943, + "learning_rate": 9.970081124613647e-06, + "loss": 0.395, + "step": 1567 + }, + { + "epoch": 0.39436619718309857, + "grad_norm": 0.4837891459465027, + "learning_rate": 9.969921077554347e-06, + "loss": 0.3878, + "step": 1568 + }, + { + "epoch": 0.39461770623742454, + "grad_norm": 0.4195319414138794, + "learning_rate": 9.969760604852399e-06, + "loss": 0.4046, + "step": 1569 + }, + { + "epoch": 0.3948692152917505, + "grad_norm": 0.5062696933746338, + "learning_rate": 9.969599706521546e-06, + "loss": 0.4181, + "step": 1570 + }, + { + "epoch": 0.3951207243460765, + "grad_norm": 0.5001570582389832, + "learning_rate": 9.969438382575569e-06, + "loss": 0.3919, + "step": 1571 + }, + { + "epoch": 0.3953722334004024, + "grad_norm": 0.47448521852493286, + "learning_rate": 9.969276633028281e-06, + "loss": 0.3904, + "step": 1572 + }, + { + "epoch": 0.39562374245472837, + "grad_norm": 0.5753097534179688, + "learning_rate": 9.96911445789354e-06, + "loss": 0.4031, + "step": 1573 + }, + { + "epoch": 0.39587525150905434, + "grad_norm": 0.4995958209037781, + "learning_rate": 9.96895185718523e-06, + "loss": 0.4172, + "step": 1574 + }, + { + "epoch": 0.3961267605633803, + "grad_norm": 0.42421847581863403, + "learning_rate": 9.96878883091728e-06, + "loss": 0.3958, + "step": 1575 + }, + { + "epoch": 0.3963782696177062, + "grad_norm": 0.42944973707199097, + "learning_rate": 9.968625379103651e-06, + "loss": 0.4098, + "step": 1576 + }, + { + "epoch": 0.3966297786720322, + "grad_norm": 0.5947927832603455, + "learning_rate": 9.968461501758343e-06, + "loss": 0.4095, + "step": 1577 + }, + { + "epoch": 0.39688128772635817, + "grad_norm": 0.43222859501838684, + "learning_rate": 9.968297198895388e-06, + "loss": 0.4088, + "step": 1578 + }, + { + "epoch": 0.3971327967806841, + "grad_norm": 0.4837366044521332, + "learning_rate": 9.968132470528862e-06, + "loss": 0.3992, + "step": 1579 + }, + { + "epoch": 0.39738430583501005, + "grad_norm": 0.46640220284461975, + "learning_rate": 9.967967316672869e-06, + "loss": 0.3845, + "step": 1580 + }, + { + "epoch": 0.397635814889336, + "grad_norm": 0.39366641640663147, + "learning_rate": 9.967801737341556e-06, + "loss": 0.4074, + "step": 1581 + }, + { + "epoch": 0.397887323943662, + "grad_norm": 0.4720052182674408, + "learning_rate": 9.9676357325491e-06, + "loss": 0.4037, + "step": 1582 + }, + { + "epoch": 0.3981388329979879, + "grad_norm": 0.43749743700027466, + "learning_rate": 9.967469302309722e-06, + "loss": 0.4145, + "step": 1583 + }, + { + "epoch": 0.3983903420523139, + "grad_norm": 0.39652055501937866, + "learning_rate": 9.967302446637677e-06, + "loss": 0.4018, + "step": 1584 + }, + { + "epoch": 0.39864185110663986, + "grad_norm": 0.4925057291984558, + "learning_rate": 9.96713516554725e-06, + "loss": 0.4028, + "step": 1585 + }, + { + "epoch": 0.39889336016096577, + "grad_norm": 0.448711633682251, + "learning_rate": 9.966967459052771e-06, + "loss": 0.4107, + "step": 1586 + }, + { + "epoch": 0.39914486921529174, + "grad_norm": 0.3823001980781555, + "learning_rate": 9.966799327168603e-06, + "loss": 0.4128, + "step": 1587 + }, + { + "epoch": 0.3993963782696177, + "grad_norm": 0.47937512397766113, + "learning_rate": 9.966630769909145e-06, + "loss": 0.4312, + "step": 1588 + }, + { + "epoch": 0.3996478873239437, + "grad_norm": 0.4800701439380646, + "learning_rate": 9.966461787288832e-06, + "loss": 0.4116, + "step": 1589 + }, + { + "epoch": 0.3998993963782696, + "grad_norm": 0.4551905691623688, + "learning_rate": 9.966292379322138e-06, + "loss": 0.4165, + "step": 1590 + }, + { + "epoch": 0.40015090543259557, + "grad_norm": 0.4783093333244324, + "learning_rate": 9.966122546023568e-06, + "loss": 0.3833, + "step": 1591 + }, + { + "epoch": 0.40040241448692154, + "grad_norm": 0.4122217893600464, + "learning_rate": 9.965952287407674e-06, + "loss": 0.3951, + "step": 1592 + }, + { + "epoch": 0.40065392354124746, + "grad_norm": 0.4596366584300995, + "learning_rate": 9.965781603489032e-06, + "loss": 0.3803, + "step": 1593 + }, + { + "epoch": 0.40090543259557343, + "grad_norm": 0.4768621027469635, + "learning_rate": 9.96561049428226e-06, + "loss": 0.3958, + "step": 1594 + }, + { + "epoch": 0.4011569416498994, + "grad_norm": 0.45015203952789307, + "learning_rate": 9.965438959802015e-06, + "loss": 0.3932, + "step": 1595 + }, + { + "epoch": 0.4014084507042254, + "grad_norm": 0.44677606225013733, + "learning_rate": 9.965267000062986e-06, + "loss": 0.4062, + "step": 1596 + }, + { + "epoch": 0.4016599597585513, + "grad_norm": 0.45594921708106995, + "learning_rate": 9.965094615079902e-06, + "loss": 0.3921, + "step": 1597 + }, + { + "epoch": 0.40191146881287726, + "grad_norm": 0.44499877095222473, + "learning_rate": 9.964921804867524e-06, + "loss": 0.4086, + "step": 1598 + }, + { + "epoch": 0.40216297786720323, + "grad_norm": 0.45410072803497314, + "learning_rate": 9.964748569440656e-06, + "loss": 0.4115, + "step": 1599 + }, + { + "epoch": 0.4024144869215292, + "grad_norm": 0.42579391598701477, + "learning_rate": 9.964574908814131e-06, + "loss": 0.4072, + "step": 1600 + }, + { + "epoch": 0.4026659959758551, + "grad_norm": 0.42585766315460205, + "learning_rate": 9.964400823002825e-06, + "loss": 0.3703, + "step": 1601 + }, + { + "epoch": 0.4029175050301811, + "grad_norm": 0.4478361904621124, + "learning_rate": 9.964226312021645e-06, + "loss": 0.4314, + "step": 1602 + }, + { + "epoch": 0.40316901408450706, + "grad_norm": 0.3969224691390991, + "learning_rate": 9.964051375885537e-06, + "loss": 0.3911, + "step": 1603 + }, + { + "epoch": 0.403420523138833, + "grad_norm": 0.4182923138141632, + "learning_rate": 9.963876014609484e-06, + "loss": 0.4009, + "step": 1604 + }, + { + "epoch": 0.40367203219315895, + "grad_norm": 0.43487611413002014, + "learning_rate": 9.963700228208503e-06, + "loss": 0.3828, + "step": 1605 + }, + { + "epoch": 0.4039235412474849, + "grad_norm": 0.4310348927974701, + "learning_rate": 9.963524016697651e-06, + "loss": 0.4142, + "step": 1606 + }, + { + "epoch": 0.4041750503018109, + "grad_norm": 0.4118841588497162, + "learning_rate": 9.96334738009202e-06, + "loss": 0.4051, + "step": 1607 + }, + { + "epoch": 0.4044265593561368, + "grad_norm": 0.4341716170310974, + "learning_rate": 9.963170318406737e-06, + "loss": 0.4171, + "step": 1608 + }, + { + "epoch": 0.4046780684104628, + "grad_norm": 0.4583278298377991, + "learning_rate": 9.962992831656964e-06, + "loss": 0.4026, + "step": 1609 + }, + { + "epoch": 0.40492957746478875, + "grad_norm": 0.38871103525161743, + "learning_rate": 9.962814919857903e-06, + "loss": 0.3959, + "step": 1610 + }, + { + "epoch": 0.40518108651911466, + "grad_norm": 0.4146328866481781, + "learning_rate": 9.962636583024792e-06, + "loss": 0.3994, + "step": 1611 + }, + { + "epoch": 0.40543259557344064, + "grad_norm": 0.371489554643631, + "learning_rate": 9.962457821172903e-06, + "loss": 0.3671, + "step": 1612 + }, + { + "epoch": 0.4056841046277666, + "grad_norm": 0.4303867816925049, + "learning_rate": 9.962278634317549e-06, + "loss": 0.3833, + "step": 1613 + }, + { + "epoch": 0.4059356136820926, + "grad_norm": 0.3755389451980591, + "learning_rate": 9.962099022474072e-06, + "loss": 0.4061, + "step": 1614 + }, + { + "epoch": 0.4061871227364185, + "grad_norm": 0.4392898678779602, + "learning_rate": 9.961918985657857e-06, + "loss": 0.4042, + "step": 1615 + }, + { + "epoch": 0.40643863179074446, + "grad_norm": 0.4400614798069, + "learning_rate": 9.961738523884322e-06, + "loss": 0.4054, + "step": 1616 + }, + { + "epoch": 0.40669014084507044, + "grad_norm": 0.4476352035999298, + "learning_rate": 9.961557637168924e-06, + "loss": 0.416, + "step": 1617 + }, + { + "epoch": 0.40694164989939635, + "grad_norm": 0.4568561613559723, + "learning_rate": 9.961376325527152e-06, + "loss": 0.3866, + "step": 1618 + }, + { + "epoch": 0.4071931589537223, + "grad_norm": 0.40012219548225403, + "learning_rate": 9.961194588974538e-06, + "loss": 0.3919, + "step": 1619 + }, + { + "epoch": 0.4074446680080483, + "grad_norm": 0.4288276135921478, + "learning_rate": 9.961012427526644e-06, + "loss": 0.3801, + "step": 1620 + }, + { + "epoch": 0.40769617706237427, + "grad_norm": 0.4316503703594208, + "learning_rate": 9.960829841199071e-06, + "loss": 0.4102, + "step": 1621 + }, + { + "epoch": 0.4079476861167002, + "grad_norm": 0.4526152014732361, + "learning_rate": 9.960646830007456e-06, + "loss": 0.4437, + "step": 1622 + }, + { + "epoch": 0.40819919517102615, + "grad_norm": 0.4108959138393402, + "learning_rate": 9.960463393967476e-06, + "loss": 0.3752, + "step": 1623 + }, + { + "epoch": 0.4084507042253521, + "grad_norm": 0.4516206383705139, + "learning_rate": 9.960279533094838e-06, + "loss": 0.4061, + "step": 1624 + }, + { + "epoch": 0.4087022132796781, + "grad_norm": 0.46021711826324463, + "learning_rate": 9.96009524740529e-06, + "loss": 0.4302, + "step": 1625 + }, + { + "epoch": 0.408953722334004, + "grad_norm": 0.3884812295436859, + "learning_rate": 9.959910536914614e-06, + "loss": 0.429, + "step": 1626 + }, + { + "epoch": 0.40920523138833, + "grad_norm": 0.4301206171512604, + "learning_rate": 9.95972540163863e-06, + "loss": 0.4074, + "step": 1627 + }, + { + "epoch": 0.40945674044265595, + "grad_norm": 0.4203178286552429, + "learning_rate": 9.959539841593192e-06, + "loss": 0.3939, + "step": 1628 + }, + { + "epoch": 0.40970824949698187, + "grad_norm": 0.4511706233024597, + "learning_rate": 9.959353856794194e-06, + "loss": 0.4382, + "step": 1629 + }, + { + "epoch": 0.40995975855130784, + "grad_norm": 0.42685285210609436, + "learning_rate": 9.959167447257563e-06, + "loss": 0.3805, + "step": 1630 + }, + { + "epoch": 0.4102112676056338, + "grad_norm": 0.4132843315601349, + "learning_rate": 9.958980612999265e-06, + "loss": 0.3986, + "step": 1631 + }, + { + "epoch": 0.4104627766599598, + "grad_norm": 0.3776272237300873, + "learning_rate": 9.9587933540353e-06, + "loss": 0.3944, + "step": 1632 + }, + { + "epoch": 0.4107142857142857, + "grad_norm": 0.5198211073875427, + "learning_rate": 9.958605670381709e-06, + "loss": 0.3953, + "step": 1633 + }, + { + "epoch": 0.41096579476861167, + "grad_norm": 0.4831434488296509, + "learning_rate": 9.958417562054561e-06, + "loss": 0.4026, + "step": 1634 + }, + { + "epoch": 0.41121730382293764, + "grad_norm": 0.43184301257133484, + "learning_rate": 9.958229029069969e-06, + "loss": 0.4047, + "step": 1635 + }, + { + "epoch": 0.41146881287726356, + "grad_norm": 0.46622034907341003, + "learning_rate": 9.95804007144408e-06, + "loss": 0.3992, + "step": 1636 + }, + { + "epoch": 0.41172032193158953, + "grad_norm": 0.4247199296951294, + "learning_rate": 9.957850689193075e-06, + "loss": 0.4125, + "step": 1637 + }, + { + "epoch": 0.4119718309859155, + "grad_norm": 0.4305685758590698, + "learning_rate": 9.957660882333176e-06, + "loss": 0.3811, + "step": 1638 + }, + { + "epoch": 0.41222334004024147, + "grad_norm": 0.45092910528182983, + "learning_rate": 9.957470650880636e-06, + "loss": 0.3988, + "step": 1639 + }, + { + "epoch": 0.4124748490945674, + "grad_norm": 0.40285438299179077, + "learning_rate": 9.957279994851751e-06, + "loss": 0.3985, + "step": 1640 + }, + { + "epoch": 0.41272635814889336, + "grad_norm": 0.3911204934120178, + "learning_rate": 9.957088914262844e-06, + "loss": 0.4334, + "step": 1641 + }, + { + "epoch": 0.41297786720321933, + "grad_norm": 0.4495960474014282, + "learning_rate": 9.956897409130286e-06, + "loss": 0.4241, + "step": 1642 + }, + { + "epoch": 0.41322937625754524, + "grad_norm": 0.4449693560600281, + "learning_rate": 9.956705479470473e-06, + "loss": 0.4062, + "step": 1643 + }, + { + "epoch": 0.4134808853118712, + "grad_norm": 0.4287528097629547, + "learning_rate": 9.956513125299847e-06, + "loss": 0.3907, + "step": 1644 + }, + { + "epoch": 0.4137323943661972, + "grad_norm": 0.40359970927238464, + "learning_rate": 9.956320346634877e-06, + "loss": 0.4134, + "step": 1645 + }, + { + "epoch": 0.41398390342052316, + "grad_norm": 0.48502546548843384, + "learning_rate": 9.956127143492077e-06, + "loss": 0.3796, + "step": 1646 + }, + { + "epoch": 0.4142354124748491, + "grad_norm": 0.4318941533565521, + "learning_rate": 9.955933515887992e-06, + "loss": 0.4273, + "step": 1647 + }, + { + "epoch": 0.41448692152917505, + "grad_norm": 0.4548565447330475, + "learning_rate": 9.955739463839206e-06, + "loss": 0.4136, + "step": 1648 + }, + { + "epoch": 0.414738430583501, + "grad_norm": 0.4548591077327728, + "learning_rate": 9.955544987362339e-06, + "loss": 0.4114, + "step": 1649 + }, + { + "epoch": 0.414989939637827, + "grad_norm": 0.4239116311073303, + "learning_rate": 9.955350086474045e-06, + "loss": 0.4106, + "step": 1650 + }, + { + "epoch": 0.4152414486921529, + "grad_norm": 0.40276509523391724, + "learning_rate": 9.955154761191017e-06, + "loss": 0.3762, + "step": 1651 + }, + { + "epoch": 0.4154929577464789, + "grad_norm": 0.513320803642273, + "learning_rate": 9.954959011529982e-06, + "loss": 0.4079, + "step": 1652 + }, + { + "epoch": 0.41574446680080485, + "grad_norm": 0.42079389095306396, + "learning_rate": 9.954762837507705e-06, + "loss": 0.3759, + "step": 1653 + }, + { + "epoch": 0.41599597585513076, + "grad_norm": 0.4646183252334595, + "learning_rate": 9.95456623914099e-06, + "loss": 0.4014, + "step": 1654 + }, + { + "epoch": 0.41624748490945673, + "grad_norm": 0.49453848600387573, + "learning_rate": 9.954369216446672e-06, + "loss": 0.4085, + "step": 1655 + }, + { + "epoch": 0.4164989939637827, + "grad_norm": 0.4750482738018036, + "learning_rate": 9.954171769441625e-06, + "loss": 0.3955, + "step": 1656 + }, + { + "epoch": 0.4167505030181087, + "grad_norm": 0.4437958598136902, + "learning_rate": 9.953973898142759e-06, + "loss": 0.3681, + "step": 1657 + }, + { + "epoch": 0.4170020120724346, + "grad_norm": 0.4792766571044922, + "learning_rate": 9.953775602567019e-06, + "loss": 0.3918, + "step": 1658 + }, + { + "epoch": 0.41725352112676056, + "grad_norm": 0.5086737275123596, + "learning_rate": 9.95357688273139e-06, + "loss": 0.3897, + "step": 1659 + }, + { + "epoch": 0.41750503018108653, + "grad_norm": 0.4676213264465332, + "learning_rate": 9.953377738652892e-06, + "loss": 0.3902, + "step": 1660 + }, + { + "epoch": 0.41775653923541245, + "grad_norm": 0.5371580719947815, + "learning_rate": 9.953178170348578e-06, + "loss": 0.4179, + "step": 1661 + }, + { + "epoch": 0.4180080482897384, + "grad_norm": 0.4958730638027191, + "learning_rate": 9.952978177835542e-06, + "loss": 0.3896, + "step": 1662 + }, + { + "epoch": 0.4182595573440644, + "grad_norm": 0.43639904260635376, + "learning_rate": 9.952777761130909e-06, + "loss": 0.3949, + "step": 1663 + }, + { + "epoch": 0.41851106639839036, + "grad_norm": 0.43316468596458435, + "learning_rate": 9.952576920251845e-06, + "loss": 0.3852, + "step": 1664 + }, + { + "epoch": 0.4187625754527163, + "grad_norm": 0.3840535879135132, + "learning_rate": 9.952375655215551e-06, + "loss": 0.3757, + "step": 1665 + }, + { + "epoch": 0.41901408450704225, + "grad_norm": 0.4622785151004791, + "learning_rate": 9.952173966039266e-06, + "loss": 0.3856, + "step": 1666 + }, + { + "epoch": 0.4192655935613682, + "grad_norm": 0.4158206880092621, + "learning_rate": 9.951971852740262e-06, + "loss": 0.3824, + "step": 1667 + }, + { + "epoch": 0.41951710261569414, + "grad_norm": 0.42178934812545776, + "learning_rate": 9.951769315335843e-06, + "loss": 0.3969, + "step": 1668 + }, + { + "epoch": 0.4197686116700201, + "grad_norm": 0.4934524595737457, + "learning_rate": 9.951566353843365e-06, + "loss": 0.3818, + "step": 1669 + }, + { + "epoch": 0.4200201207243461, + "grad_norm": 0.4420875608921051, + "learning_rate": 9.951362968280205e-06, + "loss": 0.4343, + "step": 1670 + }, + { + "epoch": 0.42027162977867205, + "grad_norm": 0.42826712131500244, + "learning_rate": 9.951159158663782e-06, + "loss": 0.3899, + "step": 1671 + }, + { + "epoch": 0.42052313883299797, + "grad_norm": 0.4212586283683777, + "learning_rate": 9.950954925011552e-06, + "loss": 0.3819, + "step": 1672 + }, + { + "epoch": 0.42077464788732394, + "grad_norm": 0.5084717869758606, + "learning_rate": 9.950750267341004e-06, + "loss": 0.4106, + "step": 1673 + }, + { + "epoch": 0.4210261569416499, + "grad_norm": 0.4205119013786316, + "learning_rate": 9.950545185669668e-06, + "loss": 0.3926, + "step": 1674 + }, + { + "epoch": 0.4212776659959759, + "grad_norm": 0.4944150447845459, + "learning_rate": 9.95033968001511e-06, + "loss": 0.4076, + "step": 1675 + }, + { + "epoch": 0.4215291750503018, + "grad_norm": 0.4203665852546692, + "learning_rate": 9.950133750394926e-06, + "loss": 0.3952, + "step": 1676 + }, + { + "epoch": 0.42178068410462777, + "grad_norm": 0.41072648763656616, + "learning_rate": 9.949927396826753e-06, + "loss": 0.3923, + "step": 1677 + }, + { + "epoch": 0.42203219315895374, + "grad_norm": 0.4945848882198334, + "learning_rate": 9.949720619328265e-06, + "loss": 0.4096, + "step": 1678 + }, + { + "epoch": 0.42228370221327965, + "grad_norm": 0.3855612576007843, + "learning_rate": 9.949513417917173e-06, + "loss": 0.3916, + "step": 1679 + }, + { + "epoch": 0.4225352112676056, + "grad_norm": 0.38849636912345886, + "learning_rate": 9.94930579261122e-06, + "loss": 0.3744, + "step": 1680 + }, + { + "epoch": 0.4227867203219316, + "grad_norm": 0.45562243461608887, + "learning_rate": 9.949097743428188e-06, + "loss": 0.3882, + "step": 1681 + }, + { + "epoch": 0.42303822937625757, + "grad_norm": 0.38708576560020447, + "learning_rate": 9.948889270385897e-06, + "loss": 0.4077, + "step": 1682 + }, + { + "epoch": 0.4232897384305835, + "grad_norm": 0.4899725317955017, + "learning_rate": 9.948680373502199e-06, + "loss": 0.4271, + "step": 1683 + }, + { + "epoch": 0.42354124748490946, + "grad_norm": 0.4218765199184418, + "learning_rate": 9.948471052794988e-06, + "loss": 0.4052, + "step": 1684 + }, + { + "epoch": 0.4237927565392354, + "grad_norm": 0.45039162039756775, + "learning_rate": 9.948261308282187e-06, + "loss": 0.398, + "step": 1685 + }, + { + "epoch": 0.42404426559356134, + "grad_norm": 0.3818122148513794, + "learning_rate": 9.94805113998176e-06, + "loss": 0.4196, + "step": 1686 + }, + { + "epoch": 0.4242957746478873, + "grad_norm": 0.4184150695800781, + "learning_rate": 9.94784054791171e-06, + "loss": 0.3955, + "step": 1687 + }, + { + "epoch": 0.4245472837022133, + "grad_norm": 0.4112408459186554, + "learning_rate": 9.94762953209007e-06, + "loss": 0.4208, + "step": 1688 + }, + { + "epoch": 0.42479879275653926, + "grad_norm": 0.44503697752952576, + "learning_rate": 9.947418092534912e-06, + "loss": 0.3858, + "step": 1689 + }, + { + "epoch": 0.42505030181086517, + "grad_norm": 0.42923927307128906, + "learning_rate": 9.947206229264346e-06, + "loss": 0.3939, + "step": 1690 + }, + { + "epoch": 0.42530181086519114, + "grad_norm": 0.4138977825641632, + "learning_rate": 9.946993942296517e-06, + "loss": 0.3967, + "step": 1691 + }, + { + "epoch": 0.4255533199195171, + "grad_norm": 0.4376116991043091, + "learning_rate": 9.946781231649605e-06, + "loss": 0.3883, + "step": 1692 + }, + { + "epoch": 0.4258048289738431, + "grad_norm": 0.3793240487575531, + "learning_rate": 9.946568097341827e-06, + "loss": 0.388, + "step": 1693 + }, + { + "epoch": 0.426056338028169, + "grad_norm": 0.4271707832813263, + "learning_rate": 9.946354539391436e-06, + "loss": 0.3985, + "step": 1694 + }, + { + "epoch": 0.426307847082495, + "grad_norm": 0.396963894367218, + "learning_rate": 9.946140557816724e-06, + "loss": 0.3966, + "step": 1695 + }, + { + "epoch": 0.42655935613682094, + "grad_norm": 0.39994847774505615, + "learning_rate": 9.945926152636017e-06, + "loss": 0.3922, + "step": 1696 + }, + { + "epoch": 0.42681086519114686, + "grad_norm": 0.38306140899658203, + "learning_rate": 9.945711323867674e-06, + "loss": 0.406, + "step": 1697 + }, + { + "epoch": 0.42706237424547283, + "grad_norm": 0.4517843723297119, + "learning_rate": 9.945496071530098e-06, + "loss": 0.3871, + "step": 1698 + }, + { + "epoch": 0.4273138832997988, + "grad_norm": 0.40096959471702576, + "learning_rate": 9.945280395641724e-06, + "loss": 0.4098, + "step": 1699 + }, + { + "epoch": 0.4275653923541248, + "grad_norm": 0.42314156889915466, + "learning_rate": 9.945064296221019e-06, + "loss": 0.3858, + "step": 1700 + }, + { + "epoch": 0.4278169014084507, + "grad_norm": 0.4481252431869507, + "learning_rate": 9.944847773286495e-06, + "loss": 0.4273, + "step": 1701 + }, + { + "epoch": 0.42806841046277666, + "grad_norm": 0.4047831892967224, + "learning_rate": 9.944630826856694e-06, + "loss": 0.3892, + "step": 1702 + }, + { + "epoch": 0.42831991951710263, + "grad_norm": 0.40450939536094666, + "learning_rate": 9.944413456950195e-06, + "loss": 0.4199, + "step": 1703 + }, + { + "epoch": 0.42857142857142855, + "grad_norm": 0.3987005352973938, + "learning_rate": 9.944195663585616e-06, + "loss": 0.4254, + "step": 1704 + }, + { + "epoch": 0.4288229376257545, + "grad_norm": 0.426527738571167, + "learning_rate": 9.94397744678161e-06, + "loss": 0.3992, + "step": 1705 + }, + { + "epoch": 0.4290744466800805, + "grad_norm": 0.4253356158733368, + "learning_rate": 9.943758806556864e-06, + "loss": 0.3876, + "step": 1706 + }, + { + "epoch": 0.42932595573440646, + "grad_norm": 0.4413727819919586, + "learning_rate": 9.943539742930105e-06, + "loss": 0.4213, + "step": 1707 + }, + { + "epoch": 0.4295774647887324, + "grad_norm": 0.4230106472969055, + "learning_rate": 9.943320255920093e-06, + "loss": 0.4092, + "step": 1708 + }, + { + "epoch": 0.42982897384305835, + "grad_norm": 0.4080696702003479, + "learning_rate": 9.943100345545627e-06, + "loss": 0.3789, + "step": 1709 + }, + { + "epoch": 0.4300804828973843, + "grad_norm": 0.4206973612308502, + "learning_rate": 9.94288001182554e-06, + "loss": 0.4128, + "step": 1710 + }, + { + "epoch": 0.43033199195171024, + "grad_norm": 0.41314277052879333, + "learning_rate": 9.942659254778703e-06, + "loss": 0.3883, + "step": 1711 + }, + { + "epoch": 0.4305835010060362, + "grad_norm": 0.42901384830474854, + "learning_rate": 9.942438074424024e-06, + "loss": 0.3917, + "step": 1712 + }, + { + "epoch": 0.4308350100603622, + "grad_norm": 0.39158564805984497, + "learning_rate": 9.942216470780441e-06, + "loss": 0.3778, + "step": 1713 + }, + { + "epoch": 0.43108651911468815, + "grad_norm": 0.423117995262146, + "learning_rate": 9.941994443866936e-06, + "loss": 0.408, + "step": 1714 + }, + { + "epoch": 0.43133802816901406, + "grad_norm": 0.44455990195274353, + "learning_rate": 9.941771993702524e-06, + "loss": 0.3849, + "step": 1715 + }, + { + "epoch": 0.43158953722334004, + "grad_norm": 0.41482892632484436, + "learning_rate": 9.941549120306257e-06, + "loss": 0.4121, + "step": 1716 + }, + { + "epoch": 0.431841046277666, + "grad_norm": 0.47499921917915344, + "learning_rate": 9.941325823697221e-06, + "loss": 0.3987, + "step": 1717 + }, + { + "epoch": 0.432092555331992, + "grad_norm": 0.40756213665008545, + "learning_rate": 9.941102103894541e-06, + "loss": 0.4043, + "step": 1718 + }, + { + "epoch": 0.4323440643863179, + "grad_norm": 0.42999786138534546, + "learning_rate": 9.94087796091738e-06, + "loss": 0.4134, + "step": 1719 + }, + { + "epoch": 0.43259557344064387, + "grad_norm": 0.469237744808197, + "learning_rate": 9.940653394784929e-06, + "loss": 0.3982, + "step": 1720 + }, + { + "epoch": 0.43284708249496984, + "grad_norm": 0.3991852402687073, + "learning_rate": 9.940428405516423e-06, + "loss": 0.3709, + "step": 1721 + }, + { + "epoch": 0.43309859154929575, + "grad_norm": 0.42819303274154663, + "learning_rate": 9.940202993131132e-06, + "loss": 0.3955, + "step": 1722 + }, + { + "epoch": 0.4333501006036217, + "grad_norm": 0.40518656373023987, + "learning_rate": 9.939977157648361e-06, + "loss": 0.3797, + "step": 1723 + }, + { + "epoch": 0.4336016096579477, + "grad_norm": 0.38709619641304016, + "learning_rate": 9.939750899087451e-06, + "loss": 0.4136, + "step": 1724 + }, + { + "epoch": 0.43385311871227367, + "grad_norm": 0.400952011346817, + "learning_rate": 9.939524217467779e-06, + "loss": 0.3917, + "step": 1725 + }, + { + "epoch": 0.4341046277665996, + "grad_norm": 0.4235305190086365, + "learning_rate": 9.93929711280876e-06, + "loss": 0.3936, + "step": 1726 + }, + { + "epoch": 0.43435613682092555, + "grad_norm": 0.4117744266986847, + "learning_rate": 9.939069585129841e-06, + "loss": 0.4083, + "step": 1727 + }, + { + "epoch": 0.4346076458752515, + "grad_norm": 0.37095022201538086, + "learning_rate": 9.938841634450513e-06, + "loss": 0.3841, + "step": 1728 + }, + { + "epoch": 0.43485915492957744, + "grad_norm": 0.4193072021007538, + "learning_rate": 9.938613260790295e-06, + "loss": 0.4061, + "step": 1729 + }, + { + "epoch": 0.4351106639839034, + "grad_norm": 0.45826607942581177, + "learning_rate": 9.938384464168748e-06, + "loss": 0.3937, + "step": 1730 + }, + { + "epoch": 0.4353621730382294, + "grad_norm": 0.3635403513908386, + "learning_rate": 9.938155244605467e-06, + "loss": 0.4176, + "step": 1731 + }, + { + "epoch": 0.43561368209255535, + "grad_norm": 0.47581997513771057, + "learning_rate": 9.937925602120083e-06, + "loss": 0.4053, + "step": 1732 + }, + { + "epoch": 0.43586519114688127, + "grad_norm": 0.4900721311569214, + "learning_rate": 9.937695536732259e-06, + "loss": 0.3965, + "step": 1733 + }, + { + "epoch": 0.43611670020120724, + "grad_norm": 0.4279528856277466, + "learning_rate": 9.937465048461705e-06, + "loss": 0.4019, + "step": 1734 + }, + { + "epoch": 0.4363682092555332, + "grad_norm": 0.42261025309562683, + "learning_rate": 9.937234137328157e-06, + "loss": 0.4054, + "step": 1735 + }, + { + "epoch": 0.43661971830985913, + "grad_norm": 0.4319506287574768, + "learning_rate": 9.937002803351394e-06, + "loss": 0.3924, + "step": 1736 + }, + { + "epoch": 0.4368712273641851, + "grad_norm": 0.4078814387321472, + "learning_rate": 9.936771046551225e-06, + "loss": 0.3855, + "step": 1737 + }, + { + "epoch": 0.43712273641851107, + "grad_norm": 0.4161885678768158, + "learning_rate": 9.936538866947501e-06, + "loss": 0.4115, + "step": 1738 + }, + { + "epoch": 0.43737424547283704, + "grad_norm": 0.3567904233932495, + "learning_rate": 9.936306264560107e-06, + "loss": 0.3882, + "step": 1739 + }, + { + "epoch": 0.43762575452716296, + "grad_norm": 0.43193182349205017, + "learning_rate": 9.93607323940896e-06, + "loss": 0.3699, + "step": 1740 + }, + { + "epoch": 0.43787726358148893, + "grad_norm": 0.3843153715133667, + "learning_rate": 9.935839791514024e-06, + "loss": 0.4166, + "step": 1741 + }, + { + "epoch": 0.4381287726358149, + "grad_norm": 0.3638061583042145, + "learning_rate": 9.935605920895286e-06, + "loss": 0.3891, + "step": 1742 + }, + { + "epoch": 0.43838028169014087, + "grad_norm": 0.3796941041946411, + "learning_rate": 9.93537162757278e-06, + "loss": 0.3806, + "step": 1743 + }, + { + "epoch": 0.4386317907444668, + "grad_norm": 0.3639376163482666, + "learning_rate": 9.935136911566566e-06, + "loss": 0.3785, + "step": 1744 + }, + { + "epoch": 0.43888329979879276, + "grad_norm": 0.4310847520828247, + "learning_rate": 9.93490177289675e-06, + "loss": 0.4118, + "step": 1745 + }, + { + "epoch": 0.43913480885311873, + "grad_norm": 0.4827233552932739, + "learning_rate": 9.934666211583472e-06, + "loss": 0.4192, + "step": 1746 + }, + { + "epoch": 0.43938631790744465, + "grad_norm": 0.36181917786598206, + "learning_rate": 9.934430227646904e-06, + "loss": 0.4225, + "step": 1747 + }, + { + "epoch": 0.4396378269617706, + "grad_norm": 0.4338870346546173, + "learning_rate": 9.934193821107256e-06, + "loss": 0.3857, + "step": 1748 + }, + { + "epoch": 0.4398893360160966, + "grad_norm": 0.454122930765152, + "learning_rate": 9.933956991984775e-06, + "loss": 0.4203, + "step": 1749 + }, + { + "epoch": 0.44014084507042256, + "grad_norm": 0.440476655960083, + "learning_rate": 9.933719740299745e-06, + "loss": 0.411, + "step": 1750 + }, + { + "epoch": 0.4403923541247485, + "grad_norm": 0.3782346546649933, + "learning_rate": 9.933482066072485e-06, + "loss": 0.383, + "step": 1751 + }, + { + "epoch": 0.44064386317907445, + "grad_norm": 0.4182790219783783, + "learning_rate": 9.93324396932335e-06, + "loss": 0.3782, + "step": 1752 + }, + { + "epoch": 0.4408953722334004, + "grad_norm": 0.40436455607414246, + "learning_rate": 9.93300545007273e-06, + "loss": 0.3773, + "step": 1753 + }, + { + "epoch": 0.44114688128772633, + "grad_norm": 0.448367714881897, + "learning_rate": 9.932766508341054e-06, + "loss": 0.4067, + "step": 1754 + }, + { + "epoch": 0.4413983903420523, + "grad_norm": 0.39983075857162476, + "learning_rate": 9.932527144148788e-06, + "loss": 0.3907, + "step": 1755 + }, + { + "epoch": 0.4416498993963783, + "grad_norm": 0.4192653000354767, + "learning_rate": 9.932287357516428e-06, + "loss": 0.3743, + "step": 1756 + }, + { + "epoch": 0.44190140845070425, + "grad_norm": 0.4609915614128113, + "learning_rate": 9.932047148464512e-06, + "loss": 0.3671, + "step": 1757 + }, + { + "epoch": 0.44215291750503016, + "grad_norm": 0.4244169294834137, + "learning_rate": 9.931806517013612e-06, + "loss": 0.3848, + "step": 1758 + }, + { + "epoch": 0.44240442655935613, + "grad_norm": 0.4056912064552307, + "learning_rate": 9.931565463184339e-06, + "loss": 0.4049, + "step": 1759 + }, + { + "epoch": 0.4426559356136821, + "grad_norm": 0.5338891744613647, + "learning_rate": 9.931323986997334e-06, + "loss": 0.3858, + "step": 1760 + }, + { + "epoch": 0.442907444668008, + "grad_norm": 0.4530712962150574, + "learning_rate": 9.93108208847328e-06, + "loss": 0.4024, + "step": 1761 + }, + { + "epoch": 0.443158953722334, + "grad_norm": 0.43134480714797974, + "learning_rate": 9.930839767632895e-06, + "loss": 0.4158, + "step": 1762 + }, + { + "epoch": 0.44341046277665996, + "grad_norm": 0.475333571434021, + "learning_rate": 9.930597024496933e-06, + "loss": 0.4289, + "step": 1763 + }, + { + "epoch": 0.44366197183098594, + "grad_norm": 0.46597325801849365, + "learning_rate": 9.930353859086177e-06, + "loss": 0.409, + "step": 1764 + }, + { + "epoch": 0.44391348088531185, + "grad_norm": 0.3584541976451874, + "learning_rate": 9.93011027142146e-06, + "loss": 0.3936, + "step": 1765 + }, + { + "epoch": 0.4441649899396378, + "grad_norm": 0.4492575526237488, + "learning_rate": 9.92986626152364e-06, + "loss": 0.4067, + "step": 1766 + }, + { + "epoch": 0.4444164989939638, + "grad_norm": 0.4518819749355316, + "learning_rate": 9.929621829413616e-06, + "loss": 0.3997, + "step": 1767 + }, + { + "epoch": 0.44466800804828976, + "grad_norm": 0.39923927187919617, + "learning_rate": 9.929376975112324e-06, + "loss": 0.4011, + "step": 1768 + }, + { + "epoch": 0.4449195171026157, + "grad_norm": 0.4502282440662384, + "learning_rate": 9.92913169864073e-06, + "loss": 0.4245, + "step": 1769 + }, + { + "epoch": 0.44517102615694165, + "grad_norm": 0.49698328971862793, + "learning_rate": 9.928886000019842e-06, + "loss": 0.409, + "step": 1770 + }, + { + "epoch": 0.4454225352112676, + "grad_norm": 0.43445685505867004, + "learning_rate": 9.928639879270705e-06, + "loss": 0.3875, + "step": 1771 + }, + { + "epoch": 0.44567404426559354, + "grad_norm": 0.43353793025016785, + "learning_rate": 9.928393336414394e-06, + "loss": 0.4041, + "step": 1772 + }, + { + "epoch": 0.4459255533199195, + "grad_norm": 0.5283091068267822, + "learning_rate": 9.928146371472027e-06, + "loss": 0.409, + "step": 1773 + }, + { + "epoch": 0.4461770623742455, + "grad_norm": 0.3890385925769806, + "learning_rate": 9.927898984464753e-06, + "loss": 0.3992, + "step": 1774 + }, + { + "epoch": 0.44642857142857145, + "grad_norm": 0.36789190769195557, + "learning_rate": 9.92765117541376e-06, + "loss": 0.3825, + "step": 1775 + }, + { + "epoch": 0.44668008048289737, + "grad_norm": 0.4484959840774536, + "learning_rate": 9.927402944340271e-06, + "loss": 0.3804, + "step": 1776 + }, + { + "epoch": 0.44693158953722334, + "grad_norm": 0.4116009473800659, + "learning_rate": 9.927154291265546e-06, + "loss": 0.4022, + "step": 1777 + }, + { + "epoch": 0.4471830985915493, + "grad_norm": 0.4052729308605194, + "learning_rate": 9.92690521621088e-06, + "loss": 0.4163, + "step": 1778 + }, + { + "epoch": 0.4474346076458752, + "grad_norm": 0.4312780797481537, + "learning_rate": 9.926655719197604e-06, + "loss": 0.4167, + "step": 1779 + }, + { + "epoch": 0.4476861167002012, + "grad_norm": 0.38401511311531067, + "learning_rate": 9.926405800247088e-06, + "loss": 0.3918, + "step": 1780 + }, + { + "epoch": 0.44793762575452717, + "grad_norm": 0.4729456305503845, + "learning_rate": 9.926155459380733e-06, + "loss": 0.416, + "step": 1781 + }, + { + "epoch": 0.44818913480885314, + "grad_norm": 0.43422985076904297, + "learning_rate": 9.925904696619983e-06, + "loss": 0.3793, + "step": 1782 + }, + { + "epoch": 0.44844064386317906, + "grad_norm": 0.3729407489299774, + "learning_rate": 9.92565351198631e-06, + "loss": 0.369, + "step": 1783 + }, + { + "epoch": 0.448692152917505, + "grad_norm": 0.3949672281742096, + "learning_rate": 9.92540190550123e-06, + "loss": 0.3661, + "step": 1784 + }, + { + "epoch": 0.448943661971831, + "grad_norm": 0.5043959617614746, + "learning_rate": 9.92514987718629e-06, + "loss": 0.388, + "step": 1785 + }, + { + "epoch": 0.4491951710261569, + "grad_norm": 0.4054137170314789, + "learning_rate": 9.924897427063074e-06, + "loss": 0.3962, + "step": 1786 + }, + { + "epoch": 0.4494466800804829, + "grad_norm": 0.426899790763855, + "learning_rate": 9.924644555153203e-06, + "loss": 0.4045, + "step": 1787 + }, + { + "epoch": 0.44969818913480886, + "grad_norm": 0.4646405577659607, + "learning_rate": 9.924391261478334e-06, + "loss": 0.3934, + "step": 1788 + }, + { + "epoch": 0.44994969818913483, + "grad_norm": 0.3873588442802429, + "learning_rate": 9.924137546060162e-06, + "loss": 0.4043, + "step": 1789 + }, + { + "epoch": 0.45020120724346074, + "grad_norm": 0.5013946890830994, + "learning_rate": 9.923883408920414e-06, + "loss": 0.3877, + "step": 1790 + }, + { + "epoch": 0.4504527162977867, + "grad_norm": 0.4803258776664734, + "learning_rate": 9.923628850080856e-06, + "loss": 0.4154, + "step": 1791 + }, + { + "epoch": 0.4507042253521127, + "grad_norm": 0.47052329778671265, + "learning_rate": 9.923373869563288e-06, + "loss": 0.4065, + "step": 1792 + }, + { + "epoch": 0.45095573440643866, + "grad_norm": 0.4442056119441986, + "learning_rate": 9.92311846738955e-06, + "loss": 0.3974, + "step": 1793 + }, + { + "epoch": 0.4512072434607646, + "grad_norm": 0.4476431906223297, + "learning_rate": 9.922862643581512e-06, + "loss": 0.4098, + "step": 1794 + }, + { + "epoch": 0.45145875251509054, + "grad_norm": 0.41717463731765747, + "learning_rate": 9.922606398161088e-06, + "loss": 0.4118, + "step": 1795 + }, + { + "epoch": 0.4517102615694165, + "grad_norm": 0.4191329777240753, + "learning_rate": 9.922349731150221e-06, + "loss": 0.403, + "step": 1796 + }, + { + "epoch": 0.45196177062374243, + "grad_norm": 0.41227200627326965, + "learning_rate": 9.922092642570894e-06, + "loss": 0.4162, + "step": 1797 + }, + { + "epoch": 0.4522132796780684, + "grad_norm": 0.3862850069999695, + "learning_rate": 9.921835132445124e-06, + "loss": 0.4071, + "step": 1798 + }, + { + "epoch": 0.4524647887323944, + "grad_norm": 0.42549851536750793, + "learning_rate": 9.921577200794968e-06, + "loss": 0.4289, + "step": 1799 + }, + { + "epoch": 0.45271629778672035, + "grad_norm": 0.40862515568733215, + "learning_rate": 9.921318847642511e-06, + "loss": 0.4009, + "step": 1800 + }, + { + "epoch": 0.45296780684104626, + "grad_norm": 0.3994438350200653, + "learning_rate": 9.921060073009884e-06, + "loss": 0.3995, + "step": 1801 + }, + { + "epoch": 0.45321931589537223, + "grad_norm": 0.45960313081741333, + "learning_rate": 9.920800876919248e-06, + "loss": 0.4031, + "step": 1802 + }, + { + "epoch": 0.4534708249496982, + "grad_norm": 0.42091697454452515, + "learning_rate": 9.9205412593928e-06, + "loss": 0.398, + "step": 1803 + }, + { + "epoch": 0.4537223340040241, + "grad_norm": 0.45224541425704956, + "learning_rate": 9.920281220452776e-06, + "loss": 0.3865, + "step": 1804 + }, + { + "epoch": 0.4539738430583501, + "grad_norm": 0.45991694927215576, + "learning_rate": 9.920020760121447e-06, + "loss": 0.3965, + "step": 1805 + }, + { + "epoch": 0.45422535211267606, + "grad_norm": 0.433249294757843, + "learning_rate": 9.919759878421121e-06, + "loss": 0.405, + "step": 1806 + }, + { + "epoch": 0.45447686116700203, + "grad_norm": 0.43937093019485474, + "learning_rate": 9.919498575374138e-06, + "loss": 0.4068, + "step": 1807 + }, + { + "epoch": 0.45472837022132795, + "grad_norm": 0.5110582709312439, + "learning_rate": 9.919236851002879e-06, + "loss": 0.4061, + "step": 1808 + }, + { + "epoch": 0.4549798792756539, + "grad_norm": 0.4061850607395172, + "learning_rate": 9.918974705329756e-06, + "loss": 0.3686, + "step": 1809 + }, + { + "epoch": 0.4552313883299799, + "grad_norm": 0.49449342489242554, + "learning_rate": 9.918712138377226e-06, + "loss": 0.3984, + "step": 1810 + }, + { + "epoch": 0.45548289738430586, + "grad_norm": 0.44511350989341736, + "learning_rate": 9.91844915016777e-06, + "loss": 0.3846, + "step": 1811 + }, + { + "epoch": 0.4557344064386318, + "grad_norm": 0.3825312852859497, + "learning_rate": 9.918185740723916e-06, + "loss": 0.4109, + "step": 1812 + }, + { + "epoch": 0.45598591549295775, + "grad_norm": 0.44606679677963257, + "learning_rate": 9.91792191006822e-06, + "loss": 0.3899, + "step": 1813 + }, + { + "epoch": 0.4562374245472837, + "grad_norm": 0.42780601978302, + "learning_rate": 9.917657658223278e-06, + "loss": 0.409, + "step": 1814 + }, + { + "epoch": 0.45648893360160964, + "grad_norm": 0.44149789214134216, + "learning_rate": 9.917392985211725e-06, + "loss": 0.4126, + "step": 1815 + }, + { + "epoch": 0.4567404426559356, + "grad_norm": 0.40314993262290955, + "learning_rate": 9.917127891056223e-06, + "loss": 0.3824, + "step": 1816 + }, + { + "epoch": 0.4569919517102616, + "grad_norm": 0.42890751361846924, + "learning_rate": 9.916862375779482e-06, + "loss": 0.4023, + "step": 1817 + }, + { + "epoch": 0.45724346076458755, + "grad_norm": 0.39626947045326233, + "learning_rate": 9.916596439404235e-06, + "loss": 0.4113, + "step": 1818 + }, + { + "epoch": 0.45749496981891347, + "grad_norm": 0.5269667506217957, + "learning_rate": 9.916330081953262e-06, + "loss": 0.3958, + "step": 1819 + }, + { + "epoch": 0.45774647887323944, + "grad_norm": 0.4348817765712738, + "learning_rate": 9.916063303449374e-06, + "loss": 0.373, + "step": 1820 + }, + { + "epoch": 0.4579979879275654, + "grad_norm": 0.36105191707611084, + "learning_rate": 9.91579610391542e-06, + "loss": 0.3845, + "step": 1821 + }, + { + "epoch": 0.4582494969818913, + "grad_norm": 0.5355477333068848, + "learning_rate": 9.915528483374283e-06, + "loss": 0.4226, + "step": 1822 + }, + { + "epoch": 0.4585010060362173, + "grad_norm": 0.45753616094589233, + "learning_rate": 9.915260441848883e-06, + "loss": 0.4067, + "step": 1823 + }, + { + "epoch": 0.45875251509054327, + "grad_norm": 0.41938281059265137, + "learning_rate": 9.914991979362173e-06, + "loss": 0.4038, + "step": 1824 + }, + { + "epoch": 0.45900402414486924, + "grad_norm": 0.41249436140060425, + "learning_rate": 9.91472309593715e-06, + "loss": 0.3903, + "step": 1825 + }, + { + "epoch": 0.45925553319919515, + "grad_norm": 0.40385809540748596, + "learning_rate": 9.914453791596841e-06, + "loss": 0.4094, + "step": 1826 + }, + { + "epoch": 0.4595070422535211, + "grad_norm": 0.4012143909931183, + "learning_rate": 9.914184066364308e-06, + "loss": 0.3886, + "step": 1827 + }, + { + "epoch": 0.4597585513078471, + "grad_norm": 0.41770312190055847, + "learning_rate": 9.913913920262654e-06, + "loss": 0.3926, + "step": 1828 + }, + { + "epoch": 0.460010060362173, + "grad_norm": 0.3697265684604645, + "learning_rate": 9.913643353315015e-06, + "loss": 0.3827, + "step": 1829 + }, + { + "epoch": 0.460261569416499, + "grad_norm": 0.4404751658439636, + "learning_rate": 9.91337236554456e-06, + "loss": 0.3875, + "step": 1830 + }, + { + "epoch": 0.46051307847082495, + "grad_norm": 0.43419405817985535, + "learning_rate": 9.9131009569745e-06, + "loss": 0.4164, + "step": 1831 + }, + { + "epoch": 0.4607645875251509, + "grad_norm": 0.4106455445289612, + "learning_rate": 9.91282912762808e-06, + "loss": 0.4161, + "step": 1832 + }, + { + "epoch": 0.46101609657947684, + "grad_norm": 0.48274555802345276, + "learning_rate": 9.912556877528582e-06, + "loss": 0.3943, + "step": 1833 + }, + { + "epoch": 0.4612676056338028, + "grad_norm": 0.39638254046440125, + "learning_rate": 9.912284206699317e-06, + "loss": 0.4098, + "step": 1834 + }, + { + "epoch": 0.4615191146881288, + "grad_norm": 0.4625331461429596, + "learning_rate": 9.912011115163642e-06, + "loss": 0.3682, + "step": 1835 + }, + { + "epoch": 0.46177062374245476, + "grad_norm": 0.4894983172416687, + "learning_rate": 9.911737602944943e-06, + "loss": 0.3913, + "step": 1836 + }, + { + "epoch": 0.46202213279678067, + "grad_norm": 0.3602517247200012, + "learning_rate": 9.911463670066648e-06, + "loss": 0.3814, + "step": 1837 + }, + { + "epoch": 0.46227364185110664, + "grad_norm": 0.4734443128108978, + "learning_rate": 9.911189316552217e-06, + "loss": 0.3926, + "step": 1838 + }, + { + "epoch": 0.4625251509054326, + "grad_norm": 0.4669021964073181, + "learning_rate": 9.910914542425143e-06, + "loss": 0.4003, + "step": 1839 + }, + { + "epoch": 0.46277665995975853, + "grad_norm": 0.3793289363384247, + "learning_rate": 9.91063934770896e-06, + "loss": 0.3894, + "step": 1840 + }, + { + "epoch": 0.4630281690140845, + "grad_norm": 0.4388173818588257, + "learning_rate": 9.910363732427241e-06, + "loss": 0.4081, + "step": 1841 + }, + { + "epoch": 0.46327967806841047, + "grad_norm": 0.44316378235816956, + "learning_rate": 9.910087696603585e-06, + "loss": 0.3768, + "step": 1842 + }, + { + "epoch": 0.46353118712273644, + "grad_norm": 0.4063831567764282, + "learning_rate": 9.909811240261635e-06, + "loss": 0.3986, + "step": 1843 + }, + { + "epoch": 0.46378269617706236, + "grad_norm": 0.40213075280189514, + "learning_rate": 9.90953436342507e-06, + "loss": 0.4181, + "step": 1844 + }, + { + "epoch": 0.46403420523138833, + "grad_norm": 0.38924410939216614, + "learning_rate": 9.909257066117599e-06, + "loss": 0.4157, + "step": 1845 + }, + { + "epoch": 0.4642857142857143, + "grad_norm": 0.3829781115055084, + "learning_rate": 9.908979348362974e-06, + "loss": 0.3814, + "step": 1846 + }, + { + "epoch": 0.4645372233400402, + "grad_norm": 0.3905456066131592, + "learning_rate": 9.908701210184976e-06, + "loss": 0.3843, + "step": 1847 + }, + { + "epoch": 0.4647887323943662, + "grad_norm": 0.3960774838924408, + "learning_rate": 9.90842265160743e-06, + "loss": 0.409, + "step": 1848 + }, + { + "epoch": 0.46504024144869216, + "grad_norm": 0.39505699276924133, + "learning_rate": 9.908143672654192e-06, + "loss": 0.3942, + "step": 1849 + }, + { + "epoch": 0.46529175050301813, + "grad_norm": 0.4073687791824341, + "learning_rate": 9.907864273349152e-06, + "loss": 0.4076, + "step": 1850 + }, + { + "epoch": 0.46554325955734405, + "grad_norm": 0.4546225368976593, + "learning_rate": 9.907584453716238e-06, + "loss": 0.3739, + "step": 1851 + }, + { + "epoch": 0.46579476861167, + "grad_norm": 0.4456266760826111, + "learning_rate": 9.907304213779422e-06, + "loss": 0.3742, + "step": 1852 + }, + { + "epoch": 0.466046277665996, + "grad_norm": 0.4133337438106537, + "learning_rate": 9.907023553562699e-06, + "loss": 0.3993, + "step": 1853 + }, + { + "epoch": 0.4662977867203219, + "grad_norm": 0.4463947117328644, + "learning_rate": 9.906742473090105e-06, + "loss": 0.3906, + "step": 1854 + }, + { + "epoch": 0.4665492957746479, + "grad_norm": 0.4527071714401245, + "learning_rate": 9.906460972385715e-06, + "loss": 0.3838, + "step": 1855 + }, + { + "epoch": 0.46680080482897385, + "grad_norm": 0.42571839690208435, + "learning_rate": 9.906179051473638e-06, + "loss": 0.3812, + "step": 1856 + }, + { + "epoch": 0.4670523138832998, + "grad_norm": 0.5096774697303772, + "learning_rate": 9.905896710378019e-06, + "loss": 0.4192, + "step": 1857 + }, + { + "epoch": 0.46730382293762573, + "grad_norm": 0.43383294343948364, + "learning_rate": 9.905613949123036e-06, + "loss": 0.3769, + "step": 1858 + }, + { + "epoch": 0.4675553319919517, + "grad_norm": 0.45289137959480286, + "learning_rate": 9.90533076773291e-06, + "loss": 0.4123, + "step": 1859 + }, + { + "epoch": 0.4678068410462777, + "grad_norm": 0.4052902162075043, + "learning_rate": 9.905047166231889e-06, + "loss": 0.4346, + "step": 1860 + }, + { + "epoch": 0.46805835010060365, + "grad_norm": 0.41848742961883545, + "learning_rate": 9.904763144644265e-06, + "loss": 0.4022, + "step": 1861 + }, + { + "epoch": 0.46830985915492956, + "grad_norm": 0.4163335859775543, + "learning_rate": 9.904478702994362e-06, + "loss": 0.4254, + "step": 1862 + }, + { + "epoch": 0.46856136820925554, + "grad_norm": 0.36754798889160156, + "learning_rate": 9.90419384130654e-06, + "loss": 0.4084, + "step": 1863 + }, + { + "epoch": 0.4688128772635815, + "grad_norm": 0.4322851002216339, + "learning_rate": 9.903908559605197e-06, + "loss": 0.3982, + "step": 1864 + }, + { + "epoch": 0.4690643863179074, + "grad_norm": 0.41940197348594666, + "learning_rate": 9.903622857914766e-06, + "loss": 0.4128, + "step": 1865 + }, + { + "epoch": 0.4693158953722334, + "grad_norm": 0.3755689859390259, + "learning_rate": 9.90333673625971e-06, + "loss": 0.4054, + "step": 1866 + }, + { + "epoch": 0.46956740442655936, + "grad_norm": 0.422498881816864, + "learning_rate": 9.903050194664541e-06, + "loss": 0.4145, + "step": 1867 + }, + { + "epoch": 0.46981891348088534, + "grad_norm": 0.3807189166545868, + "learning_rate": 9.902763233153796e-06, + "loss": 0.4154, + "step": 1868 + }, + { + "epoch": 0.47007042253521125, + "grad_norm": 0.4063322842121124, + "learning_rate": 9.90247585175205e-06, + "loss": 0.3964, + "step": 1869 + }, + { + "epoch": 0.4703219315895372, + "grad_norm": 0.42195388674736023, + "learning_rate": 9.902188050483918e-06, + "loss": 0.4153, + "step": 1870 + }, + { + "epoch": 0.4705734406438632, + "grad_norm": 0.44243067502975464, + "learning_rate": 9.901899829374048e-06, + "loss": 0.4107, + "step": 1871 + }, + { + "epoch": 0.4708249496981891, + "grad_norm": 0.39202213287353516, + "learning_rate": 9.901611188447123e-06, + "loss": 0.3929, + "step": 1872 + }, + { + "epoch": 0.4710764587525151, + "grad_norm": 0.4214573800563812, + "learning_rate": 9.901322127727864e-06, + "loss": 0.4043, + "step": 1873 + }, + { + "epoch": 0.47132796780684105, + "grad_norm": 0.4241698980331421, + "learning_rate": 9.901032647241028e-06, + "loss": 0.4463, + "step": 1874 + }, + { + "epoch": 0.471579476861167, + "grad_norm": 0.3987068831920624, + "learning_rate": 9.900742747011405e-06, + "loss": 0.4225, + "step": 1875 + }, + { + "epoch": 0.47183098591549294, + "grad_norm": 0.38595229387283325, + "learning_rate": 9.900452427063827e-06, + "loss": 0.401, + "step": 1876 + }, + { + "epoch": 0.4720824949698189, + "grad_norm": 0.4303046464920044, + "learning_rate": 9.900161687423155e-06, + "loss": 0.424, + "step": 1877 + }, + { + "epoch": 0.4723340040241449, + "grad_norm": 0.453784704208374, + "learning_rate": 9.89987052811429e-06, + "loss": 0.4054, + "step": 1878 + }, + { + "epoch": 0.4725855130784708, + "grad_norm": 0.41478443145751953, + "learning_rate": 9.899578949162167e-06, + "loss": 0.3934, + "step": 1879 + }, + { + "epoch": 0.47283702213279677, + "grad_norm": 0.4205837547779083, + "learning_rate": 9.899286950591758e-06, + "loss": 0.379, + "step": 1880 + }, + { + "epoch": 0.47308853118712274, + "grad_norm": 0.4482855498790741, + "learning_rate": 9.898994532428071e-06, + "loss": 0.3874, + "step": 1881 + }, + { + "epoch": 0.4733400402414487, + "grad_norm": 0.3712460994720459, + "learning_rate": 9.898701694696154e-06, + "loss": 0.423, + "step": 1882 + }, + { + "epoch": 0.4735915492957746, + "grad_norm": 0.4586082100868225, + "learning_rate": 9.89840843742108e-06, + "loss": 0.39, + "step": 1883 + }, + { + "epoch": 0.4738430583501006, + "grad_norm": 0.3899551033973694, + "learning_rate": 9.898114760627968e-06, + "loss": 0.4005, + "step": 1884 + }, + { + "epoch": 0.47409456740442657, + "grad_norm": 0.39340347051620483, + "learning_rate": 9.89782066434197e-06, + "loss": 0.4229, + "step": 1885 + }, + { + "epoch": 0.47434607645875254, + "grad_norm": 0.4018033742904663, + "learning_rate": 9.897526148588272e-06, + "loss": 0.3884, + "step": 1886 + }, + { + "epoch": 0.47459758551307846, + "grad_norm": 0.38298699259757996, + "learning_rate": 9.8972312133921e-06, + "loss": 0.414, + "step": 1887 + }, + { + "epoch": 0.47484909456740443, + "grad_norm": 0.36675944924354553, + "learning_rate": 9.896935858778708e-06, + "loss": 0.3942, + "step": 1888 + }, + { + "epoch": 0.4751006036217304, + "grad_norm": 0.4059602916240692, + "learning_rate": 9.896640084773399e-06, + "loss": 0.3728, + "step": 1889 + }, + { + "epoch": 0.4753521126760563, + "grad_norm": 0.4235600233078003, + "learning_rate": 9.896343891401498e-06, + "loss": 0.3886, + "step": 1890 + }, + { + "epoch": 0.4756036217303823, + "grad_norm": 0.3745618760585785, + "learning_rate": 9.896047278688375e-06, + "loss": 0.3832, + "step": 1891 + }, + { + "epoch": 0.47585513078470826, + "grad_norm": 0.41866034269332886, + "learning_rate": 9.89575024665943e-06, + "loss": 0.3678, + "step": 1892 + }, + { + "epoch": 0.47610663983903423, + "grad_norm": 0.39261144399642944, + "learning_rate": 9.895452795340106e-06, + "loss": 0.4036, + "step": 1893 + }, + { + "epoch": 0.47635814889336014, + "grad_norm": 0.43027186393737793, + "learning_rate": 9.895154924755875e-06, + "loss": 0.3915, + "step": 1894 + }, + { + "epoch": 0.4766096579476861, + "grad_norm": 0.42222538590431213, + "learning_rate": 9.894856634932249e-06, + "loss": 0.3949, + "step": 1895 + }, + { + "epoch": 0.4768611670020121, + "grad_norm": 0.3839363157749176, + "learning_rate": 9.894557925894775e-06, + "loss": 0.4005, + "step": 1896 + }, + { + "epoch": 0.477112676056338, + "grad_norm": 0.39426472783088684, + "learning_rate": 9.894258797669034e-06, + "loss": 0.3867, + "step": 1897 + }, + { + "epoch": 0.477364185110664, + "grad_norm": 0.3634321391582489, + "learning_rate": 9.893959250280646e-06, + "loss": 0.3862, + "step": 1898 + }, + { + "epoch": 0.47761569416498995, + "grad_norm": 0.4207385182380676, + "learning_rate": 9.893659283755264e-06, + "loss": 0.3887, + "step": 1899 + }, + { + "epoch": 0.4778672032193159, + "grad_norm": 0.3956140875816345, + "learning_rate": 9.89335889811858e-06, + "loss": 0.397, + "step": 1900 + }, + { + "epoch": 0.47811871227364183, + "grad_norm": 0.37039822340011597, + "learning_rate": 9.893058093396318e-06, + "loss": 0.386, + "step": 1901 + }, + { + "epoch": 0.4783702213279678, + "grad_norm": 0.38195037841796875, + "learning_rate": 9.892756869614242e-06, + "loss": 0.4047, + "step": 1902 + }, + { + "epoch": 0.4786217303822938, + "grad_norm": 0.39862626791000366, + "learning_rate": 9.892455226798148e-06, + "loss": 0.4128, + "step": 1903 + }, + { + "epoch": 0.4788732394366197, + "grad_norm": 0.3864774703979492, + "learning_rate": 9.892153164973873e-06, + "loss": 0.3959, + "step": 1904 + }, + { + "epoch": 0.47912474849094566, + "grad_norm": 0.40958353877067566, + "learning_rate": 9.891850684167284e-06, + "loss": 0.4151, + "step": 1905 + }, + { + "epoch": 0.47937625754527163, + "grad_norm": 0.3836343586444855, + "learning_rate": 9.891547784404285e-06, + "loss": 0.3945, + "step": 1906 + }, + { + "epoch": 0.4796277665995976, + "grad_norm": 0.37921276688575745, + "learning_rate": 9.891244465710822e-06, + "loss": 0.3856, + "step": 1907 + }, + { + "epoch": 0.4798792756539235, + "grad_norm": 0.39044657349586487, + "learning_rate": 9.890940728112869e-06, + "loss": 0.3913, + "step": 1908 + }, + { + "epoch": 0.4801307847082495, + "grad_norm": 0.38842642307281494, + "learning_rate": 9.89063657163644e-06, + "loss": 0.378, + "step": 1909 + }, + { + "epoch": 0.48038229376257546, + "grad_norm": 0.4392659068107605, + "learning_rate": 9.890331996307585e-06, + "loss": 0.3937, + "step": 1910 + }, + { + "epoch": 0.48063380281690143, + "grad_norm": 0.41104656457901, + "learning_rate": 9.89002700215239e-06, + "loss": 0.3849, + "step": 1911 + }, + { + "epoch": 0.48088531187122735, + "grad_norm": 0.3936188519001007, + "learning_rate": 9.88972158919697e-06, + "loss": 0.4185, + "step": 1912 + }, + { + "epoch": 0.4811368209255533, + "grad_norm": 0.39013659954071045, + "learning_rate": 9.88941575746749e-06, + "loss": 0.353, + "step": 1913 + }, + { + "epoch": 0.4813883299798793, + "grad_norm": 0.3972854018211365, + "learning_rate": 9.889109506990137e-06, + "loss": 0.3933, + "step": 1914 + }, + { + "epoch": 0.4816398390342052, + "grad_norm": 0.46189555525779724, + "learning_rate": 9.88880283779114e-06, + "loss": 0.4109, + "step": 1915 + }, + { + "epoch": 0.4818913480885312, + "grad_norm": 0.391872763633728, + "learning_rate": 9.888495749896766e-06, + "loss": 0.4003, + "step": 1916 + }, + { + "epoch": 0.48214285714285715, + "grad_norm": 0.49157020449638367, + "learning_rate": 9.888188243333313e-06, + "loss": 0.3938, + "step": 1917 + }, + { + "epoch": 0.4823943661971831, + "grad_norm": 0.6632839441299438, + "learning_rate": 9.887880318127116e-06, + "loss": 0.3952, + "step": 1918 + }, + { + "epoch": 0.48264587525150904, + "grad_norm": 0.42987990379333496, + "learning_rate": 9.88757197430455e-06, + "loss": 0.4032, + "step": 1919 + }, + { + "epoch": 0.482897384305835, + "grad_norm": 0.43732520937919617, + "learning_rate": 9.887263211892017e-06, + "loss": 0.4166, + "step": 1920 + }, + { + "epoch": 0.483148893360161, + "grad_norm": 0.45760130882263184, + "learning_rate": 9.886954030915969e-06, + "loss": 0.3951, + "step": 1921 + }, + { + "epoch": 0.4834004024144869, + "grad_norm": 0.4097820520401001, + "learning_rate": 9.886644431402879e-06, + "loss": 0.422, + "step": 1922 + }, + { + "epoch": 0.48365191146881287, + "grad_norm": 0.47169190645217896, + "learning_rate": 9.886334413379263e-06, + "loss": 0.4127, + "step": 1923 + }, + { + "epoch": 0.48390342052313884, + "grad_norm": 0.45416510105133057, + "learning_rate": 9.886023976871678e-06, + "loss": 0.3942, + "step": 1924 + }, + { + "epoch": 0.4841549295774648, + "grad_norm": 0.4500854015350342, + "learning_rate": 9.885713121906701e-06, + "loss": 0.3861, + "step": 1925 + }, + { + "epoch": 0.4844064386317907, + "grad_norm": 0.49403610825538635, + "learning_rate": 9.885401848510962e-06, + "loss": 0.4131, + "step": 1926 + }, + { + "epoch": 0.4846579476861167, + "grad_norm": 0.43086037039756775, + "learning_rate": 9.88509015671112e-06, + "loss": 0.3868, + "step": 1927 + }, + { + "epoch": 0.48490945674044267, + "grad_norm": 0.4836970269680023, + "learning_rate": 9.884778046533863e-06, + "loss": 0.3939, + "step": 1928 + }, + { + "epoch": 0.48516096579476864, + "grad_norm": 0.41725337505340576, + "learning_rate": 9.884465518005927e-06, + "loss": 0.4104, + "step": 1929 + }, + { + "epoch": 0.48541247484909456, + "grad_norm": 0.43721911311149597, + "learning_rate": 9.884152571154077e-06, + "loss": 0.4114, + "step": 1930 + }, + { + "epoch": 0.4856639839034205, + "grad_norm": 0.39998695254325867, + "learning_rate": 9.883839206005115e-06, + "loss": 0.386, + "step": 1931 + }, + { + "epoch": 0.4859154929577465, + "grad_norm": 0.41072046756744385, + "learning_rate": 9.883525422585877e-06, + "loss": 0.4145, + "step": 1932 + }, + { + "epoch": 0.4861670020120724, + "grad_norm": 0.40265727043151855, + "learning_rate": 9.883211220923237e-06, + "loss": 0.4205, + "step": 1933 + }, + { + "epoch": 0.4864185110663984, + "grad_norm": 0.4055761396884918, + "learning_rate": 9.882896601044107e-06, + "loss": 0.4011, + "step": 1934 + }, + { + "epoch": 0.48667002012072436, + "grad_norm": 0.43374502658843994, + "learning_rate": 9.882581562975431e-06, + "loss": 0.4031, + "step": 1935 + }, + { + "epoch": 0.4869215291750503, + "grad_norm": 0.3941003978252411, + "learning_rate": 9.882266106744187e-06, + "loss": 0.3767, + "step": 1936 + }, + { + "epoch": 0.48717303822937624, + "grad_norm": 0.41482222080230713, + "learning_rate": 9.881950232377397e-06, + "loss": 0.3977, + "step": 1937 + }, + { + "epoch": 0.4874245472837022, + "grad_norm": 0.39503028988838196, + "learning_rate": 9.88163393990211e-06, + "loss": 0.3972, + "step": 1938 + }, + { + "epoch": 0.4876760563380282, + "grad_norm": 0.43991711735725403, + "learning_rate": 9.881317229345414e-06, + "loss": 0.375, + "step": 1939 + }, + { + "epoch": 0.4879275653923541, + "grad_norm": 0.41871747374534607, + "learning_rate": 9.881000100734436e-06, + "loss": 0.4188, + "step": 1940 + }, + { + "epoch": 0.4881790744466801, + "grad_norm": 0.4454898238182068, + "learning_rate": 9.880682554096334e-06, + "loss": 0.3924, + "step": 1941 + }, + { + "epoch": 0.48843058350100604, + "grad_norm": 0.4203794598579407, + "learning_rate": 9.880364589458306e-06, + "loss": 0.3826, + "step": 1942 + }, + { + "epoch": 0.488682092555332, + "grad_norm": 0.4890785813331604, + "learning_rate": 9.880046206847581e-06, + "loss": 0.4006, + "step": 1943 + }, + { + "epoch": 0.48893360160965793, + "grad_norm": 0.9945582151412964, + "learning_rate": 9.879727406291429e-06, + "loss": 0.3835, + "step": 1944 + }, + { + "epoch": 0.4891851106639839, + "grad_norm": 0.44654008746147156, + "learning_rate": 9.879408187817153e-06, + "loss": 0.3997, + "step": 1945 + }, + { + "epoch": 0.4894366197183099, + "grad_norm": 0.5195226669311523, + "learning_rate": 9.879088551452088e-06, + "loss": 0.3954, + "step": 1946 + }, + { + "epoch": 0.4896881287726358, + "grad_norm": 0.4593251645565033, + "learning_rate": 9.878768497223614e-06, + "loss": 0.4029, + "step": 1947 + }, + { + "epoch": 0.48993963782696176, + "grad_norm": 0.5262354612350464, + "learning_rate": 9.87844802515914e-06, + "loss": 0.3896, + "step": 1948 + }, + { + "epoch": 0.49019114688128773, + "grad_norm": 0.5133786797523499, + "learning_rate": 9.878127135286112e-06, + "loss": 0.3685, + "step": 1949 + }, + { + "epoch": 0.4904426559356137, + "grad_norm": 0.43951818346977234, + "learning_rate": 9.877805827632013e-06, + "loss": 0.4082, + "step": 1950 + }, + { + "epoch": 0.4906941649899396, + "grad_norm": 0.5165718197822571, + "learning_rate": 9.877484102224359e-06, + "loss": 0.4198, + "step": 1951 + }, + { + "epoch": 0.4909456740442656, + "grad_norm": 0.47304123640060425, + "learning_rate": 9.877161959090704e-06, + "loss": 0.4279, + "step": 1952 + }, + { + "epoch": 0.49119718309859156, + "grad_norm": 0.41676169633865356, + "learning_rate": 9.87683939825864e-06, + "loss": 0.3885, + "step": 1953 + }, + { + "epoch": 0.49144869215291753, + "grad_norm": 0.4127172529697418, + "learning_rate": 9.876516419755793e-06, + "loss": 0.3738, + "step": 1954 + }, + { + "epoch": 0.49170020120724345, + "grad_norm": 0.4324229061603546, + "learning_rate": 9.87619302360982e-06, + "loss": 0.3703, + "step": 1955 + }, + { + "epoch": 0.4919517102615694, + "grad_norm": 0.3876967430114746, + "learning_rate": 9.875869209848418e-06, + "loss": 0.4076, + "step": 1956 + }, + { + "epoch": 0.4922032193158954, + "grad_norm": 0.4635095000267029, + "learning_rate": 9.875544978499326e-06, + "loss": 0.4025, + "step": 1957 + }, + { + "epoch": 0.4924547283702213, + "grad_norm": 0.410577654838562, + "learning_rate": 9.875220329590304e-06, + "loss": 0.4046, + "step": 1958 + }, + { + "epoch": 0.4927062374245473, + "grad_norm": 0.3948517441749573, + "learning_rate": 9.874895263149163e-06, + "loss": 0.4266, + "step": 1959 + }, + { + "epoch": 0.49295774647887325, + "grad_norm": 0.3900180757045746, + "learning_rate": 9.874569779203737e-06, + "loss": 0.3899, + "step": 1960 + }, + { + "epoch": 0.4932092555331992, + "grad_norm": 0.3953598737716675, + "learning_rate": 9.874243877781906e-06, + "loss": 0.4044, + "step": 1961 + }, + { + "epoch": 0.49346076458752514, + "grad_norm": 0.3972851037979126, + "learning_rate": 9.87391755891158e-06, + "loss": 0.4096, + "step": 1962 + }, + { + "epoch": 0.4937122736418511, + "grad_norm": 0.3685133457183838, + "learning_rate": 9.873590822620706e-06, + "loss": 0.4009, + "step": 1963 + }, + { + "epoch": 0.4939637826961771, + "grad_norm": 0.5181507468223572, + "learning_rate": 9.873263668937268e-06, + "loss": 0.4115, + "step": 1964 + }, + { + "epoch": 0.494215291750503, + "grad_norm": 0.38723844289779663, + "learning_rate": 9.872936097889284e-06, + "loss": 0.3957, + "step": 1965 + }, + { + "epoch": 0.49446680080482897, + "grad_norm": 0.42989009618759155, + "learning_rate": 9.872608109504807e-06, + "loss": 0.402, + "step": 1966 + }, + { + "epoch": 0.49471830985915494, + "grad_norm": 0.4205054044723511, + "learning_rate": 9.872279703811929e-06, + "loss": 0.4242, + "step": 1967 + }, + { + "epoch": 0.4949698189134809, + "grad_norm": 0.369878888130188, + "learning_rate": 9.871950880838774e-06, + "loss": 0.4199, + "step": 1968 + }, + { + "epoch": 0.4952213279678068, + "grad_norm": 0.3752254843711853, + "learning_rate": 9.871621640613506e-06, + "loss": 0.3765, + "step": 1969 + }, + { + "epoch": 0.4954728370221328, + "grad_norm": 0.4001399278640747, + "learning_rate": 9.871291983164322e-06, + "loss": 0.3697, + "step": 1970 + }, + { + "epoch": 0.49572434607645877, + "grad_norm": 0.38128310441970825, + "learning_rate": 9.870961908519454e-06, + "loss": 0.4113, + "step": 1971 + }, + { + "epoch": 0.4959758551307847, + "grad_norm": 0.3907400071620941, + "learning_rate": 9.87063141670717e-06, + "loss": 0.404, + "step": 1972 + }, + { + "epoch": 0.49622736418511065, + "grad_norm": 0.3958117961883545, + "learning_rate": 9.870300507755777e-06, + "loss": 0.3943, + "step": 1973 + }, + { + "epoch": 0.4964788732394366, + "grad_norm": 0.412076473236084, + "learning_rate": 9.869969181693613e-06, + "loss": 0.4207, + "step": 1974 + }, + { + "epoch": 0.4967303822937626, + "grad_norm": 0.3858659863471985, + "learning_rate": 9.869637438549056e-06, + "loss": 0.396, + "step": 1975 + }, + { + "epoch": 0.4969818913480885, + "grad_norm": 0.44061511754989624, + "learning_rate": 9.869305278350516e-06, + "loss": 0.4096, + "step": 1976 + }, + { + "epoch": 0.4972334004024145, + "grad_norm": 0.400973379611969, + "learning_rate": 9.868972701126442e-06, + "loss": 0.382, + "step": 1977 + }, + { + "epoch": 0.49748490945674045, + "grad_norm": 0.430945485830307, + "learning_rate": 9.868639706905314e-06, + "loss": 0.3994, + "step": 1978 + }, + { + "epoch": 0.4977364185110664, + "grad_norm": 0.36873745918273926, + "learning_rate": 9.868306295715656e-06, + "loss": 0.3846, + "step": 1979 + }, + { + "epoch": 0.49798792756539234, + "grad_norm": 0.3921675682067871, + "learning_rate": 9.86797246758602e-06, + "loss": 0.3977, + "step": 1980 + }, + { + "epoch": 0.4982394366197183, + "grad_norm": 0.3892073333263397, + "learning_rate": 9.867638222544994e-06, + "loss": 0.383, + "step": 1981 + }, + { + "epoch": 0.4984909456740443, + "grad_norm": 0.3832413852214813, + "learning_rate": 9.867303560621207e-06, + "loss": 0.3954, + "step": 1982 + }, + { + "epoch": 0.4987424547283702, + "grad_norm": 0.3576008677482605, + "learning_rate": 9.86696848184332e-06, + "loss": 0.3985, + "step": 1983 + }, + { + "epoch": 0.49899396378269617, + "grad_norm": 0.44481638073921204, + "learning_rate": 9.86663298624003e-06, + "loss": 0.3919, + "step": 1984 + }, + { + "epoch": 0.49924547283702214, + "grad_norm": 0.37005627155303955, + "learning_rate": 9.86629707384007e-06, + "loss": 0.3772, + "step": 1985 + }, + { + "epoch": 0.4994969818913481, + "grad_norm": 0.39425691962242126, + "learning_rate": 9.86596074467221e-06, + "loss": 0.4188, + "step": 1986 + }, + { + "epoch": 0.49974849094567403, + "grad_norm": 0.39733070135116577, + "learning_rate": 9.865623998765253e-06, + "loss": 0.4111, + "step": 1987 + }, + { + "epoch": 0.5, + "grad_norm": 0.40979063510894775, + "learning_rate": 9.865286836148039e-06, + "loss": 0.3874, + "step": 1988 + }, + { + "epoch": 0.5002515090543259, + "grad_norm": 0.361208438873291, + "learning_rate": 9.864949256849445e-06, + "loss": 0.4006, + "step": 1989 + }, + { + "epoch": 0.5005030181086519, + "grad_norm": 0.42786529660224915, + "learning_rate": 9.864611260898383e-06, + "loss": 0.397, + "step": 1990 + }, + { + "epoch": 0.5007545271629779, + "grad_norm": 0.40797531604766846, + "learning_rate": 9.8642728483238e-06, + "loss": 0.4219, + "step": 1991 + }, + { + "epoch": 0.5010060362173038, + "grad_norm": 0.3617155849933624, + "learning_rate": 9.863934019154676e-06, + "loss": 0.4213, + "step": 1992 + }, + { + "epoch": 0.5012575452716298, + "grad_norm": 0.39363011717796326, + "learning_rate": 9.863594773420033e-06, + "loss": 0.391, + "step": 1993 + }, + { + "epoch": 0.5015090543259557, + "grad_norm": 0.36764830350875854, + "learning_rate": 9.863255111148925e-06, + "loss": 0.393, + "step": 1994 + }, + { + "epoch": 0.5017605633802817, + "grad_norm": 0.377768337726593, + "learning_rate": 9.862915032370441e-06, + "loss": 0.4024, + "step": 1995 + }, + { + "epoch": 0.5020120724346077, + "grad_norm": 0.3849732577800751, + "learning_rate": 9.862574537113705e-06, + "loss": 0.3818, + "step": 1996 + }, + { + "epoch": 0.5022635814889336, + "grad_norm": 0.399882048368454, + "learning_rate": 9.862233625407882e-06, + "loss": 0.3862, + "step": 1997 + }, + { + "epoch": 0.5025150905432596, + "grad_norm": 0.439528226852417, + "learning_rate": 9.861892297282167e-06, + "loss": 0.3957, + "step": 1998 + }, + { + "epoch": 0.5027665995975855, + "grad_norm": 0.40875646471977234, + "learning_rate": 9.86155055276579e-06, + "loss": 0.3947, + "step": 1999 + }, + { + "epoch": 0.5030181086519114, + "grad_norm": 0.41643694043159485, + "learning_rate": 9.861208391888024e-06, + "loss": 0.4071, + "step": 2000 + }, + { + "epoch": 0.5032696177062375, + "grad_norm": 0.47003522515296936, + "learning_rate": 9.860865814678172e-06, + "loss": 0.3874, + "step": 2001 + }, + { + "epoch": 0.5035211267605634, + "grad_norm": 0.3645997643470764, + "learning_rate": 9.860522821165572e-06, + "loss": 0.3991, + "step": 2002 + }, + { + "epoch": 0.5037726358148893, + "grad_norm": 0.3944624960422516, + "learning_rate": 9.860179411379598e-06, + "loss": 0.3663, + "step": 2003 + }, + { + "epoch": 0.5040241448692153, + "grad_norm": 0.41858211159706116, + "learning_rate": 9.859835585349664e-06, + "loss": 0.4056, + "step": 2004 + }, + { + "epoch": 0.5042756539235412, + "grad_norm": 0.42395514249801636, + "learning_rate": 9.859491343105215e-06, + "loss": 0.4058, + "step": 2005 + }, + { + "epoch": 0.5045271629778671, + "grad_norm": 0.41400447487831116, + "learning_rate": 9.859146684675733e-06, + "loss": 0.4235, + "step": 2006 + }, + { + "epoch": 0.5047786720321932, + "grad_norm": 0.3939945101737976, + "learning_rate": 9.858801610090736e-06, + "loss": 0.3848, + "step": 2007 + }, + { + "epoch": 0.5050301810865191, + "grad_norm": 0.4896427094936371, + "learning_rate": 9.858456119379779e-06, + "loss": 0.3592, + "step": 2008 + }, + { + "epoch": 0.5052816901408451, + "grad_norm": 0.43367308378219604, + "learning_rate": 9.858110212572448e-06, + "loss": 0.4206, + "step": 2009 + }, + { + "epoch": 0.505533199195171, + "grad_norm": 0.36704427003860474, + "learning_rate": 9.85776388969837e-06, + "loss": 0.4041, + "step": 2010 + }, + { + "epoch": 0.505784708249497, + "grad_norm": 0.5114539265632629, + "learning_rate": 9.857417150787206e-06, + "loss": 0.3805, + "step": 2011 + }, + { + "epoch": 0.506036217303823, + "grad_norm": 0.4381665885448456, + "learning_rate": 9.857069995868648e-06, + "loss": 0.4056, + "step": 2012 + }, + { + "epoch": 0.5062877263581489, + "grad_norm": 0.450435072183609, + "learning_rate": 9.856722424972434e-06, + "loss": 0.4096, + "step": 2013 + }, + { + "epoch": 0.5065392354124748, + "grad_norm": 0.44745296239852905, + "learning_rate": 9.856374438128327e-06, + "loss": 0.4042, + "step": 2014 + }, + { + "epoch": 0.5067907444668008, + "grad_norm": 0.4272843301296234, + "learning_rate": 9.85602603536613e-06, + "loss": 0.3735, + "step": 2015 + }, + { + "epoch": 0.5070422535211268, + "grad_norm": 0.3728364109992981, + "learning_rate": 9.855677216715682e-06, + "loss": 0.4043, + "step": 2016 + }, + { + "epoch": 0.5072937625754527, + "grad_norm": 0.4467531740665436, + "learning_rate": 9.855327982206859e-06, + "loss": 0.3911, + "step": 2017 + }, + { + "epoch": 0.5075452716297787, + "grad_norm": 0.4462815225124359, + "learning_rate": 9.854978331869568e-06, + "loss": 0.4317, + "step": 2018 + }, + { + "epoch": 0.5077967806841046, + "grad_norm": 0.38220712542533875, + "learning_rate": 9.854628265733755e-06, + "loss": 0.4053, + "step": 2019 + }, + { + "epoch": 0.5080482897384306, + "grad_norm": 0.3900271952152252, + "learning_rate": 9.854277783829402e-06, + "loss": 0.3861, + "step": 2020 + }, + { + "epoch": 0.5082997987927566, + "grad_norm": 0.4110308587551117, + "learning_rate": 9.853926886186527e-06, + "loss": 0.3917, + "step": 2021 + }, + { + "epoch": 0.5085513078470825, + "grad_norm": 0.40754181146621704, + "learning_rate": 9.853575572835179e-06, + "loss": 0.4176, + "step": 2022 + }, + { + "epoch": 0.5088028169014085, + "grad_norm": 0.4448073208332062, + "learning_rate": 9.853223843805445e-06, + "loss": 0.3992, + "step": 2023 + }, + { + "epoch": 0.5090543259557344, + "grad_norm": 0.40429484844207764, + "learning_rate": 9.852871699127453e-06, + "loss": 0.4232, + "step": 2024 + }, + { + "epoch": 0.5093058350100603, + "grad_norm": 0.4180743992328644, + "learning_rate": 9.852519138831358e-06, + "loss": 0.4074, + "step": 2025 + }, + { + "epoch": 0.5095573440643864, + "grad_norm": 0.46965914964675903, + "learning_rate": 9.852166162947356e-06, + "loss": 0.3685, + "step": 2026 + }, + { + "epoch": 0.5098088531187123, + "grad_norm": 0.400019109249115, + "learning_rate": 9.851812771505678e-06, + "loss": 0.3915, + "step": 2027 + }, + { + "epoch": 0.5100603621730382, + "grad_norm": 0.39726099371910095, + "learning_rate": 9.851458964536589e-06, + "loss": 0.3435, + "step": 2028 + }, + { + "epoch": 0.5103118712273642, + "grad_norm": 0.41831856966018677, + "learning_rate": 9.85110474207039e-06, + "loss": 0.382, + "step": 2029 + }, + { + "epoch": 0.5105633802816901, + "grad_norm": 0.41029006242752075, + "learning_rate": 9.85075010413742e-06, + "loss": 0.4017, + "step": 2030 + }, + { + "epoch": 0.510814889336016, + "grad_norm": 0.42589080333709717, + "learning_rate": 9.850395050768047e-06, + "loss": 0.3784, + "step": 2031 + }, + { + "epoch": 0.5110663983903421, + "grad_norm": 0.4287776052951813, + "learning_rate": 9.850039581992683e-06, + "loss": 0.4288, + "step": 2032 + }, + { + "epoch": 0.511317907444668, + "grad_norm": 0.38227567076683044, + "learning_rate": 9.84968369784177e-06, + "loss": 0.4244, + "step": 2033 + }, + { + "epoch": 0.511569416498994, + "grad_norm": 0.40670591592788696, + "learning_rate": 9.849327398345788e-06, + "loss": 0.414, + "step": 2034 + }, + { + "epoch": 0.5118209255533199, + "grad_norm": 0.4272759258747101, + "learning_rate": 9.848970683535253e-06, + "loss": 0.3923, + "step": 2035 + }, + { + "epoch": 0.5120724346076458, + "grad_norm": 0.5142744779586792, + "learning_rate": 9.84861355344071e-06, + "loss": 0.3805, + "step": 2036 + }, + { + "epoch": 0.5123239436619719, + "grad_norm": 0.38634923100471497, + "learning_rate": 9.848256008092754e-06, + "loss": 0.3946, + "step": 2037 + }, + { + "epoch": 0.5125754527162978, + "grad_norm": 0.4425947964191437, + "learning_rate": 9.847898047522e-06, + "loss": 0.3706, + "step": 2038 + }, + { + "epoch": 0.5128269617706237, + "grad_norm": 0.4305839240550995, + "learning_rate": 9.847539671759105e-06, + "loss": 0.3941, + "step": 2039 + }, + { + "epoch": 0.5130784708249497, + "grad_norm": 0.4277026653289795, + "learning_rate": 9.847180880834764e-06, + "loss": 0.3712, + "step": 2040 + }, + { + "epoch": 0.5133299798792756, + "grad_norm": 0.3730026185512543, + "learning_rate": 9.846821674779705e-06, + "loss": 0.3905, + "step": 2041 + }, + { + "epoch": 0.5135814889336016, + "grad_norm": 0.39186757802963257, + "learning_rate": 9.846462053624691e-06, + "loss": 0.4005, + "step": 2042 + }, + { + "epoch": 0.5138329979879276, + "grad_norm": 0.39070603251457214, + "learning_rate": 9.846102017400523e-06, + "loss": 0.3957, + "step": 2043 + }, + { + "epoch": 0.5140845070422535, + "grad_norm": 0.3925558924674988, + "learning_rate": 9.845741566138031e-06, + "loss": 0.3974, + "step": 2044 + }, + { + "epoch": 0.5143360160965795, + "grad_norm": 0.4226996600627899, + "learning_rate": 9.84538069986809e-06, + "loss": 0.4011, + "step": 2045 + }, + { + "epoch": 0.5145875251509054, + "grad_norm": 0.39153674244880676, + "learning_rate": 9.845019418621606e-06, + "loss": 0.3668, + "step": 2046 + }, + { + "epoch": 0.5148390342052314, + "grad_norm": 0.4205423891544342, + "learning_rate": 9.844657722429518e-06, + "loss": 0.402, + "step": 2047 + }, + { + "epoch": 0.5150905432595574, + "grad_norm": 0.4090724289417267, + "learning_rate": 9.844295611322804e-06, + "loss": 0.4084, + "step": 2048 + }, + { + "epoch": 0.5153420523138833, + "grad_norm": 0.38471901416778564, + "learning_rate": 9.843933085332477e-06, + "loss": 0.3758, + "step": 2049 + }, + { + "epoch": 0.5155935613682092, + "grad_norm": 0.3938809037208557, + "learning_rate": 9.843570144489585e-06, + "loss": 0.3913, + "step": 2050 + }, + { + "epoch": 0.5158450704225352, + "grad_norm": 0.36798256635665894, + "learning_rate": 9.843206788825211e-06, + "loss": 0.4112, + "step": 2051 + }, + { + "epoch": 0.5160965794768612, + "grad_norm": 0.3941897749900818, + "learning_rate": 9.842843018370475e-06, + "loss": 0.4002, + "step": 2052 + }, + { + "epoch": 0.5163480885311871, + "grad_norm": 0.4066377580165863, + "learning_rate": 9.84247883315653e-06, + "loss": 0.4136, + "step": 2053 + }, + { + "epoch": 0.5165995975855131, + "grad_norm": 0.4132152795791626, + "learning_rate": 9.84211423321457e-06, + "loss": 0.4202, + "step": 2054 + }, + { + "epoch": 0.516851106639839, + "grad_norm": 0.3840596377849579, + "learning_rate": 9.841749218575815e-06, + "loss": 0.4002, + "step": 2055 + }, + { + "epoch": 0.5171026156941649, + "grad_norm": 0.4097898304462433, + "learning_rate": 9.841383789271533e-06, + "loss": 0.4072, + "step": 2056 + }, + { + "epoch": 0.517354124748491, + "grad_norm": 0.39788705110549927, + "learning_rate": 9.841017945333014e-06, + "loss": 0.4225, + "step": 2057 + }, + { + "epoch": 0.5176056338028169, + "grad_norm": 0.3664261996746063, + "learning_rate": 9.840651686791593e-06, + "loss": 0.4118, + "step": 2058 + }, + { + "epoch": 0.5178571428571429, + "grad_norm": 0.31850335001945496, + "learning_rate": 9.84028501367864e-06, + "loss": 0.3815, + "step": 2059 + }, + { + "epoch": 0.5181086519114688, + "grad_norm": 0.4274592697620392, + "learning_rate": 9.839917926025555e-06, + "loss": 0.4198, + "step": 2060 + }, + { + "epoch": 0.5183601609657947, + "grad_norm": 0.3651498854160309, + "learning_rate": 9.839550423863779e-06, + "loss": 0.3958, + "step": 2061 + }, + { + "epoch": 0.5186116700201208, + "grad_norm": 0.35910671949386597, + "learning_rate": 9.839182507224786e-06, + "loss": 0.3888, + "step": 2062 + }, + { + "epoch": 0.5188631790744467, + "grad_norm": 0.39101144671440125, + "learning_rate": 9.838814176140084e-06, + "loss": 0.4144, + "step": 2063 + }, + { + "epoch": 0.5191146881287726, + "grad_norm": 0.34476083517074585, + "learning_rate": 9.838445430641219e-06, + "loss": 0.4075, + "step": 2064 + }, + { + "epoch": 0.5193661971830986, + "grad_norm": 0.3631290793418884, + "learning_rate": 9.838076270759771e-06, + "loss": 0.3737, + "step": 2065 + }, + { + "epoch": 0.5196177062374245, + "grad_norm": 0.38996070623397827, + "learning_rate": 9.83770669652736e-06, + "loss": 0.3986, + "step": 2066 + }, + { + "epoch": 0.5198692152917505, + "grad_norm": 0.35395678877830505, + "learning_rate": 9.837336707975633e-06, + "loss": 0.384, + "step": 2067 + }, + { + "epoch": 0.5201207243460765, + "grad_norm": 0.3823234438896179, + "learning_rate": 9.83696630513628e-06, + "loss": 0.3907, + "step": 2068 + }, + { + "epoch": 0.5203722334004024, + "grad_norm": 0.40023064613342285, + "learning_rate": 9.836595488041022e-06, + "loss": 0.3846, + "step": 2069 + }, + { + "epoch": 0.5206237424547284, + "grad_norm": 0.4110654592514038, + "learning_rate": 9.83622425672162e-06, + "loss": 0.4082, + "step": 2070 + }, + { + "epoch": 0.5208752515090543, + "grad_norm": 0.3667696416378021, + "learning_rate": 9.835852611209865e-06, + "loss": 0.4005, + "step": 2071 + }, + { + "epoch": 0.5211267605633803, + "grad_norm": 0.39130160212516785, + "learning_rate": 9.835480551537587e-06, + "loss": 0.412, + "step": 2072 + }, + { + "epoch": 0.5213782696177063, + "grad_norm": 0.4086337685585022, + "learning_rate": 9.83510807773665e-06, + "loss": 0.4241, + "step": 2073 + }, + { + "epoch": 0.5216297786720322, + "grad_norm": 0.3477274179458618, + "learning_rate": 9.834735189838954e-06, + "loss": 0.4019, + "step": 2074 + }, + { + "epoch": 0.5218812877263581, + "grad_norm": 0.4180651605129242, + "learning_rate": 9.834361887876436e-06, + "loss": 0.4283, + "step": 2075 + }, + { + "epoch": 0.5221327967806841, + "grad_norm": 0.4306004047393799, + "learning_rate": 9.833988171881066e-06, + "loss": 0.4239, + "step": 2076 + }, + { + "epoch": 0.52238430583501, + "grad_norm": 0.38910388946533203, + "learning_rate": 9.83361404188485e-06, + "loss": 0.3697, + "step": 2077 + }, + { + "epoch": 0.522635814889336, + "grad_norm": 0.49108606576919556, + "learning_rate": 9.83323949791983e-06, + "loss": 0.4168, + "step": 2078 + }, + { + "epoch": 0.522887323943662, + "grad_norm": 0.42532941699028015, + "learning_rate": 9.832864540018083e-06, + "loss": 0.4159, + "step": 2079 + }, + { + "epoch": 0.5231388329979879, + "grad_norm": 0.3561785817146301, + "learning_rate": 9.832489168211723e-06, + "loss": 0.3945, + "step": 2080 + }, + { + "epoch": 0.5233903420523138, + "grad_norm": 0.3978264629840851, + "learning_rate": 9.832113382532899e-06, + "loss": 0.4115, + "step": 2081 + }, + { + "epoch": 0.5236418511066399, + "grad_norm": 0.38779211044311523, + "learning_rate": 9.831737183013792e-06, + "loss": 0.3627, + "step": 2082 + }, + { + "epoch": 0.5238933601609658, + "grad_norm": 0.3900914192199707, + "learning_rate": 9.831360569686623e-06, + "loss": 0.4184, + "step": 2083 + }, + { + "epoch": 0.5241448692152918, + "grad_norm": 0.44345182180404663, + "learning_rate": 9.830983542583647e-06, + "loss": 0.4029, + "step": 2084 + }, + { + "epoch": 0.5243963782696177, + "grad_norm": 0.3975575864315033, + "learning_rate": 9.830606101737153e-06, + "loss": 0.3758, + "step": 2085 + }, + { + "epoch": 0.5246478873239436, + "grad_norm": 0.4029177129268646, + "learning_rate": 9.830228247179465e-06, + "loss": 0.4035, + "step": 2086 + }, + { + "epoch": 0.5248993963782697, + "grad_norm": 0.44459268450737, + "learning_rate": 9.829849978942948e-06, + "loss": 0.4014, + "step": 2087 + }, + { + "epoch": 0.5251509054325956, + "grad_norm": 0.39327511191368103, + "learning_rate": 9.829471297059991e-06, + "loss": 0.391, + "step": 2088 + }, + { + "epoch": 0.5254024144869215, + "grad_norm": 0.37261277437210083, + "learning_rate": 9.829092201563035e-06, + "loss": 0.4013, + "step": 2089 + }, + { + "epoch": 0.5256539235412475, + "grad_norm": 0.4421737790107727, + "learning_rate": 9.828712692484541e-06, + "loss": 0.3947, + "step": 2090 + }, + { + "epoch": 0.5259054325955734, + "grad_norm": 0.35906025767326355, + "learning_rate": 9.828332769857014e-06, + "loss": 0.3556, + "step": 2091 + }, + { + "epoch": 0.5261569416498993, + "grad_norm": 0.3702889084815979, + "learning_rate": 9.82795243371299e-06, + "loss": 0.4143, + "step": 2092 + }, + { + "epoch": 0.5264084507042254, + "grad_norm": 0.41510313749313354, + "learning_rate": 9.827571684085045e-06, + "loss": 0.4048, + "step": 2093 + }, + { + "epoch": 0.5266599597585513, + "grad_norm": 0.3920029103755951, + "learning_rate": 9.827190521005786e-06, + "loss": 0.4078, + "step": 2094 + }, + { + "epoch": 0.5269114688128773, + "grad_norm": 0.4151366651058197, + "learning_rate": 9.826808944507855e-06, + "loss": 0.4112, + "step": 2095 + }, + { + "epoch": 0.5271629778672032, + "grad_norm": 0.40691617131233215, + "learning_rate": 9.826426954623937e-06, + "loss": 0.3947, + "step": 2096 + }, + { + "epoch": 0.5274144869215291, + "grad_norm": 0.3902684152126312, + "learning_rate": 9.826044551386743e-06, + "loss": 0.3756, + "step": 2097 + }, + { + "epoch": 0.5276659959758552, + "grad_norm": 0.436042457818985, + "learning_rate": 9.825661734829027e-06, + "loss": 0.3787, + "step": 2098 + }, + { + "epoch": 0.5279175050301811, + "grad_norm": 0.40818825364112854, + "learning_rate": 9.825278504983571e-06, + "loss": 0.408, + "step": 2099 + }, + { + "epoch": 0.528169014084507, + "grad_norm": 0.4207872152328491, + "learning_rate": 9.824894861883198e-06, + "loss": 0.3696, + "step": 2100 + }, + { + "epoch": 0.528420523138833, + "grad_norm": 0.40610066056251526, + "learning_rate": 9.824510805560765e-06, + "loss": 0.3853, + "step": 2101 + }, + { + "epoch": 0.528672032193159, + "grad_norm": 0.4684671461582184, + "learning_rate": 9.824126336049164e-06, + "loss": 0.3864, + "step": 2102 + }, + { + "epoch": 0.5289235412474849, + "grad_norm": 0.41243529319763184, + "learning_rate": 9.823741453381322e-06, + "loss": 0.4081, + "step": 2103 + }, + { + "epoch": 0.5291750503018109, + "grad_norm": 0.3892309069633484, + "learning_rate": 9.8233561575902e-06, + "loss": 0.3777, + "step": 2104 + }, + { + "epoch": 0.5294265593561368, + "grad_norm": 0.4003976285457611, + "learning_rate": 9.822970448708799e-06, + "loss": 0.3686, + "step": 2105 + }, + { + "epoch": 0.5296780684104627, + "grad_norm": 0.38374191522598267, + "learning_rate": 9.822584326770152e-06, + "loss": 0.3977, + "step": 2106 + }, + { + "epoch": 0.5299295774647887, + "grad_norm": 0.4218411147594452, + "learning_rate": 9.822197791807328e-06, + "loss": 0.3836, + "step": 2107 + }, + { + "epoch": 0.5301810865191147, + "grad_norm": 0.4624592959880829, + "learning_rate": 9.821810843853428e-06, + "loss": 0.4268, + "step": 2108 + }, + { + "epoch": 0.5304325955734407, + "grad_norm": 0.3895992040634155, + "learning_rate": 9.821423482941597e-06, + "loss": 0.3982, + "step": 2109 + }, + { + "epoch": 0.5306841046277666, + "grad_norm": 0.4353783428668976, + "learning_rate": 9.821035709105006e-06, + "loss": 0.4131, + "step": 2110 + }, + { + "epoch": 0.5309356136820925, + "grad_norm": 0.4408473074436188, + "learning_rate": 9.820647522376868e-06, + "loss": 0.3785, + "step": 2111 + }, + { + "epoch": 0.5311871227364185, + "grad_norm": 0.39889922738075256, + "learning_rate": 9.820258922790427e-06, + "loss": 0.4074, + "step": 2112 + }, + { + "epoch": 0.5314386317907445, + "grad_norm": 0.43087536096572876, + "learning_rate": 9.819869910378964e-06, + "loss": 0.3995, + "step": 2113 + }, + { + "epoch": 0.5316901408450704, + "grad_norm": 0.3881208002567291, + "learning_rate": 9.819480485175797e-06, + "loss": 0.4023, + "step": 2114 + }, + { + "epoch": 0.5319416498993964, + "grad_norm": 0.39766907691955566, + "learning_rate": 9.819090647214277e-06, + "loss": 0.42, + "step": 2115 + }, + { + "epoch": 0.5321931589537223, + "grad_norm": 0.42352211475372314, + "learning_rate": 9.818700396527791e-06, + "loss": 0.3988, + "step": 2116 + }, + { + "epoch": 0.5324446680080482, + "grad_norm": 0.48607903718948364, + "learning_rate": 9.818309733149762e-06, + "loss": 0.384, + "step": 2117 + }, + { + "epoch": 0.5326961770623743, + "grad_norm": 0.38936519622802734, + "learning_rate": 9.817918657113648e-06, + "loss": 0.4082, + "step": 2118 + }, + { + "epoch": 0.5329476861167002, + "grad_norm": 0.4536868929862976, + "learning_rate": 9.81752716845294e-06, + "loss": 0.399, + "step": 2119 + }, + { + "epoch": 0.5331991951710262, + "grad_norm": 0.42506319284439087, + "learning_rate": 9.81713526720117e-06, + "loss": 0.3987, + "step": 2120 + }, + { + "epoch": 0.5334507042253521, + "grad_norm": 0.42283493280410767, + "learning_rate": 9.8167429533919e-06, + "loss": 0.3804, + "step": 2121 + }, + { + "epoch": 0.533702213279678, + "grad_norm": 0.4490463137626648, + "learning_rate": 9.816350227058728e-06, + "loss": 0.4026, + "step": 2122 + }, + { + "epoch": 0.5339537223340041, + "grad_norm": 0.4041505455970764, + "learning_rate": 9.815957088235293e-06, + "loss": 0.3712, + "step": 2123 + }, + { + "epoch": 0.53420523138833, + "grad_norm": 0.3451807200908661, + "learning_rate": 9.81556353695526e-06, + "loss": 0.376, + "step": 2124 + }, + { + "epoch": 0.5344567404426559, + "grad_norm": 0.3851028382778168, + "learning_rate": 9.815169573252336e-06, + "loss": 0.3902, + "step": 2125 + }, + { + "epoch": 0.5347082494969819, + "grad_norm": 0.38843706250190735, + "learning_rate": 9.814775197160262e-06, + "loss": 0.4208, + "step": 2126 + }, + { + "epoch": 0.5349597585513078, + "grad_norm": 0.3881021738052368, + "learning_rate": 9.814380408712813e-06, + "loss": 0.4059, + "step": 2127 + }, + { + "epoch": 0.5352112676056338, + "grad_norm": 0.33903956413269043, + "learning_rate": 9.813985207943802e-06, + "loss": 0.3739, + "step": 2128 + }, + { + "epoch": 0.5354627766599598, + "grad_norm": 0.36936119198799133, + "learning_rate": 9.813589594887074e-06, + "loss": 0.416, + "step": 2129 + }, + { + "epoch": 0.5357142857142857, + "grad_norm": 0.38683557510375977, + "learning_rate": 9.81319356957651e-06, + "loss": 0.3924, + "step": 2130 + }, + { + "epoch": 0.5359657947686117, + "grad_norm": 0.3721410036087036, + "learning_rate": 9.812797132046028e-06, + "loss": 0.4107, + "step": 2131 + }, + { + "epoch": 0.5362173038229376, + "grad_norm": 0.34902095794677734, + "learning_rate": 9.812400282329579e-06, + "loss": 0.3693, + "step": 2132 + }, + { + "epoch": 0.5364688128772636, + "grad_norm": 0.39966878294944763, + "learning_rate": 9.812003020461155e-06, + "loss": 0.3925, + "step": 2133 + }, + { + "epoch": 0.5367203219315896, + "grad_norm": 0.36834654211997986, + "learning_rate": 9.811605346474775e-06, + "loss": 0.3942, + "step": 2134 + }, + { + "epoch": 0.5369718309859155, + "grad_norm": 0.37292787432670593, + "learning_rate": 9.811207260404499e-06, + "loss": 0.4042, + "step": 2135 + }, + { + "epoch": 0.5372233400402414, + "grad_norm": 0.35834237933158875, + "learning_rate": 9.810808762284419e-06, + "loss": 0.3952, + "step": 2136 + }, + { + "epoch": 0.5374748490945674, + "grad_norm": 0.40317603945732117, + "learning_rate": 9.810409852148665e-06, + "loss": 0.4013, + "step": 2137 + }, + { + "epoch": 0.5377263581488934, + "grad_norm": 0.4264310598373413, + "learning_rate": 9.8100105300314e-06, + "loss": 0.4076, + "step": 2138 + }, + { + "epoch": 0.5379778672032193, + "grad_norm": 0.42888686060905457, + "learning_rate": 9.809610795966826e-06, + "loss": 0.3646, + "step": 2139 + }, + { + "epoch": 0.5382293762575453, + "grad_norm": 0.42549654841423035, + "learning_rate": 9.809210649989175e-06, + "loss": 0.4074, + "step": 2140 + }, + { + "epoch": 0.5384808853118712, + "grad_norm": 0.4057452976703644, + "learning_rate": 9.80881009213272e-06, + "loss": 0.3669, + "step": 2141 + }, + { + "epoch": 0.5387323943661971, + "grad_norm": 0.41238072514533997, + "learning_rate": 9.808409122431764e-06, + "loss": 0.401, + "step": 2142 + }, + { + "epoch": 0.5389839034205232, + "grad_norm": 0.42542120814323425, + "learning_rate": 9.808007740920647e-06, + "loss": 0.3928, + "step": 2143 + }, + { + "epoch": 0.5392354124748491, + "grad_norm": 0.37695184350013733, + "learning_rate": 9.807605947633745e-06, + "loss": 0.3742, + "step": 2144 + }, + { + "epoch": 0.5394869215291751, + "grad_norm": 0.4471421539783478, + "learning_rate": 9.807203742605472e-06, + "loss": 0.3786, + "step": 2145 + }, + { + "epoch": 0.539738430583501, + "grad_norm": 0.45084428787231445, + "learning_rate": 9.80680112587027e-06, + "loss": 0.3869, + "step": 2146 + }, + { + "epoch": 0.5399899396378269, + "grad_norm": 0.37542101740837097, + "learning_rate": 9.806398097462624e-06, + "loss": 0.3566, + "step": 2147 + }, + { + "epoch": 0.540241448692153, + "grad_norm": 0.4376560151576996, + "learning_rate": 9.805994657417049e-06, + "loss": 0.4074, + "step": 2148 + }, + { + "epoch": 0.5404929577464789, + "grad_norm": 0.5008549094200134, + "learning_rate": 9.8055908057681e-06, + "loss": 0.3917, + "step": 2149 + }, + { + "epoch": 0.5407444668008048, + "grad_norm": 0.37436121702194214, + "learning_rate": 9.80518654255036e-06, + "loss": 0.3788, + "step": 2150 + }, + { + "epoch": 0.5409959758551308, + "grad_norm": 0.3661990463733673, + "learning_rate": 9.804781867798454e-06, + "loss": 0.3834, + "step": 2151 + }, + { + "epoch": 0.5412474849094567, + "grad_norm": 0.45389556884765625, + "learning_rate": 9.804376781547041e-06, + "loss": 0.3688, + "step": 2152 + }, + { + "epoch": 0.5414989939637826, + "grad_norm": 0.3825472593307495, + "learning_rate": 9.80397128383081e-06, + "loss": 0.3752, + "step": 2153 + }, + { + "epoch": 0.5417505030181087, + "grad_norm": 0.3918491005897522, + "learning_rate": 9.803565374684494e-06, + "loss": 0.391, + "step": 2154 + }, + { + "epoch": 0.5420020120724346, + "grad_norm": 0.3703051507472992, + "learning_rate": 9.803159054142855e-06, + "loss": 0.3866, + "step": 2155 + }, + { + "epoch": 0.5422535211267606, + "grad_norm": 0.4313543140888214, + "learning_rate": 9.802752322240692e-06, + "loss": 0.4013, + "step": 2156 + }, + { + "epoch": 0.5425050301810865, + "grad_norm": 0.3883597254753113, + "learning_rate": 9.802345179012837e-06, + "loss": 0.4049, + "step": 2157 + }, + { + "epoch": 0.5427565392354124, + "grad_norm": 0.37119433283805847, + "learning_rate": 9.801937624494161e-06, + "loss": 0.3798, + "step": 2158 + }, + { + "epoch": 0.5430080482897385, + "grad_norm": 0.3962958753108978, + "learning_rate": 9.801529658719568e-06, + "loss": 0.4036, + "step": 2159 + }, + { + "epoch": 0.5432595573440644, + "grad_norm": 0.37826666235923767, + "learning_rate": 9.801121281724e-06, + "loss": 0.3931, + "step": 2160 + }, + { + "epoch": 0.5435110663983903, + "grad_norm": 0.41150638461112976, + "learning_rate": 9.800712493542428e-06, + "loss": 0.3994, + "step": 2161 + }, + { + "epoch": 0.5437625754527163, + "grad_norm": 0.3833552896976471, + "learning_rate": 9.800303294209865e-06, + "loss": 0.4038, + "step": 2162 + }, + { + "epoch": 0.5440140845070423, + "grad_norm": 0.35805314779281616, + "learning_rate": 9.799893683761355e-06, + "loss": 0.3862, + "step": 2163 + }, + { + "epoch": 0.5442655935613682, + "grad_norm": 0.39856329560279846, + "learning_rate": 9.79948366223198e-06, + "loss": 0.3888, + "step": 2164 + }, + { + "epoch": 0.5445171026156942, + "grad_norm": 0.42184582352638245, + "learning_rate": 9.799073229656853e-06, + "loss": 0.3901, + "step": 2165 + }, + { + "epoch": 0.5447686116700201, + "grad_norm": 0.36999794840812683, + "learning_rate": 9.798662386071127e-06, + "loss": 0.3794, + "step": 2166 + }, + { + "epoch": 0.545020120724346, + "grad_norm": 0.36188700795173645, + "learning_rate": 9.79825113150999e-06, + "loss": 0.4028, + "step": 2167 + }, + { + "epoch": 0.545271629778672, + "grad_norm": 0.39296042919158936, + "learning_rate": 9.797839466008659e-06, + "loss": 0.3667, + "step": 2168 + }, + { + "epoch": 0.545523138832998, + "grad_norm": 0.38612136244773865, + "learning_rate": 9.797427389602393e-06, + "loss": 0.4027, + "step": 2169 + }, + { + "epoch": 0.545774647887324, + "grad_norm": 0.3653174042701721, + "learning_rate": 9.797014902326487e-06, + "loss": 0.3901, + "step": 2170 + }, + { + "epoch": 0.5460261569416499, + "grad_norm": 0.37336331605911255, + "learning_rate": 9.796602004216261e-06, + "loss": 0.3656, + "step": 2171 + }, + { + "epoch": 0.5462776659959758, + "grad_norm": 0.3571634888648987, + "learning_rate": 9.796188695307083e-06, + "loss": 0.3877, + "step": 2172 + }, + { + "epoch": 0.5465291750503019, + "grad_norm": 0.3366375267505646, + "learning_rate": 9.795774975634347e-06, + "loss": 0.3793, + "step": 2173 + }, + { + "epoch": 0.5467806841046278, + "grad_norm": 0.41412922739982605, + "learning_rate": 9.795360845233485e-06, + "loss": 0.395, + "step": 2174 + }, + { + "epoch": 0.5470321931589537, + "grad_norm": 0.3812969923019409, + "learning_rate": 9.794946304139969e-06, + "loss": 0.4144, + "step": 2175 + }, + { + "epoch": 0.5472837022132797, + "grad_norm": 0.3604353368282318, + "learning_rate": 9.794531352389298e-06, + "loss": 0.3636, + "step": 2176 + }, + { + "epoch": 0.5475352112676056, + "grad_norm": 0.3878379166126251, + "learning_rate": 9.794115990017012e-06, + "loss": 0.4151, + "step": 2177 + }, + { + "epoch": 0.5477867203219315, + "grad_norm": 0.3887823820114136, + "learning_rate": 9.793700217058683e-06, + "loss": 0.4009, + "step": 2178 + }, + { + "epoch": 0.5480382293762576, + "grad_norm": 0.37774476408958435, + "learning_rate": 9.793284033549919e-06, + "loss": 0.4105, + "step": 2179 + }, + { + "epoch": 0.5482897384305835, + "grad_norm": 0.37441837787628174, + "learning_rate": 9.792867439526366e-06, + "loss": 0.4019, + "step": 2180 + }, + { + "epoch": 0.5485412474849095, + "grad_norm": 0.37095755338668823, + "learning_rate": 9.792450435023699e-06, + "loss": 0.3687, + "step": 2181 + }, + { + "epoch": 0.5487927565392354, + "grad_norm": 0.40394505858421326, + "learning_rate": 9.792033020077634e-06, + "loss": 0.4008, + "step": 2182 + }, + { + "epoch": 0.5490442655935613, + "grad_norm": 0.38541585206985474, + "learning_rate": 9.79161519472392e-06, + "loss": 0.4145, + "step": 2183 + }, + { + "epoch": 0.5492957746478874, + "grad_norm": 0.3427443504333496, + "learning_rate": 9.79119695899834e-06, + "loss": 0.3838, + "step": 2184 + }, + { + "epoch": 0.5495472837022133, + "grad_norm": 0.3911786675453186, + "learning_rate": 9.790778312936715e-06, + "loss": 0.3893, + "step": 2185 + }, + { + "epoch": 0.5497987927565392, + "grad_norm": 0.36838027834892273, + "learning_rate": 9.790359256574899e-06, + "loss": 0.3763, + "step": 2186 + }, + { + "epoch": 0.5500503018108652, + "grad_norm": 0.3471270501613617, + "learning_rate": 9.78993978994878e-06, + "loss": 0.3924, + "step": 2187 + }, + { + "epoch": 0.5503018108651911, + "grad_norm": 0.33406156301498413, + "learning_rate": 9.789519913094286e-06, + "loss": 0.3775, + "step": 2188 + }, + { + "epoch": 0.5505533199195171, + "grad_norm": 0.36162400245666504, + "learning_rate": 9.789099626047372e-06, + "loss": 0.3834, + "step": 2189 + }, + { + "epoch": 0.5508048289738431, + "grad_norm": 0.4071558713912964, + "learning_rate": 9.788678928844036e-06, + "loss": 0.3924, + "step": 2190 + }, + { + "epoch": 0.551056338028169, + "grad_norm": 0.3942394256591797, + "learning_rate": 9.788257821520308e-06, + "loss": 0.3962, + "step": 2191 + }, + { + "epoch": 0.5513078470824949, + "grad_norm": 0.41246795654296875, + "learning_rate": 9.787836304112253e-06, + "loss": 0.3822, + "step": 2192 + }, + { + "epoch": 0.5515593561368209, + "grad_norm": 0.3739457130432129, + "learning_rate": 9.78741437665597e-06, + "loss": 0.3988, + "step": 2193 + }, + { + "epoch": 0.5518108651911469, + "grad_norm": 0.3873920440673828, + "learning_rate": 9.786992039187598e-06, + "loss": 0.4142, + "step": 2194 + }, + { + "epoch": 0.5520623742454729, + "grad_norm": 0.3712387979030609, + "learning_rate": 9.786569291743305e-06, + "loss": 0.3964, + "step": 2195 + }, + { + "epoch": 0.5523138832997988, + "grad_norm": 0.4185509979724884, + "learning_rate": 9.786146134359294e-06, + "loss": 0.4275, + "step": 2196 + }, + { + "epoch": 0.5525653923541247, + "grad_norm": 0.3870210647583008, + "learning_rate": 9.785722567071811e-06, + "loss": 0.4037, + "step": 2197 + }, + { + "epoch": 0.5528169014084507, + "grad_norm": 0.39058247208595276, + "learning_rate": 9.785298589917128e-06, + "loss": 0.4133, + "step": 2198 + }, + { + "epoch": 0.5530684104627767, + "grad_norm": 0.39365655183792114, + "learning_rate": 9.784874202931558e-06, + "loss": 0.4017, + "step": 2199 + }, + { + "epoch": 0.5533199195171026, + "grad_norm": 0.44448891282081604, + "learning_rate": 9.784449406151448e-06, + "loss": 0.3863, + "step": 2200 + }, + { + "epoch": 0.5535714285714286, + "grad_norm": 0.3912818133831024, + "learning_rate": 9.784024199613176e-06, + "loss": 0.3809, + "step": 2201 + }, + { + "epoch": 0.5538229376257545, + "grad_norm": 0.3555707633495331, + "learning_rate": 9.783598583353161e-06, + "loss": 0.4174, + "step": 2202 + }, + { + "epoch": 0.5540744466800804, + "grad_norm": 0.3489442765712738, + "learning_rate": 9.783172557407852e-06, + "loss": 0.4051, + "step": 2203 + }, + { + "epoch": 0.5543259557344065, + "grad_norm": 0.4021318554878235, + "learning_rate": 9.78274612181374e-06, + "loss": 0.3971, + "step": 2204 + }, + { + "epoch": 0.5545774647887324, + "grad_norm": 0.3958930969238281, + "learning_rate": 9.78231927660734e-06, + "loss": 0.408, + "step": 2205 + }, + { + "epoch": 0.5548289738430584, + "grad_norm": 0.38725799322128296, + "learning_rate": 9.781892021825215e-06, + "loss": 0.4053, + "step": 2206 + }, + { + "epoch": 0.5550804828973843, + "grad_norm": 0.3717680275440216, + "learning_rate": 9.781464357503951e-06, + "loss": 0.4101, + "step": 2207 + }, + { + "epoch": 0.5553319919517102, + "grad_norm": 0.42192167043685913, + "learning_rate": 9.781036283680179e-06, + "loss": 0.3914, + "step": 2208 + }, + { + "epoch": 0.5555835010060363, + "grad_norm": 0.3718080520629883, + "learning_rate": 9.78060780039056e-06, + "loss": 0.3633, + "step": 2209 + }, + { + "epoch": 0.5558350100603622, + "grad_norm": 0.38646015524864197, + "learning_rate": 9.780178907671788e-06, + "loss": 0.3974, + "step": 2210 + }, + { + "epoch": 0.5560865191146881, + "grad_norm": 0.36778998374938965, + "learning_rate": 9.7797496055606e-06, + "loss": 0.4049, + "step": 2211 + }, + { + "epoch": 0.5563380281690141, + "grad_norm": 0.3933306336402893, + "learning_rate": 9.779319894093759e-06, + "loss": 0.4053, + "step": 2212 + }, + { + "epoch": 0.55658953722334, + "grad_norm": 0.39641377329826355, + "learning_rate": 9.778889773308069e-06, + "loss": 0.4054, + "step": 2213 + }, + { + "epoch": 0.556841046277666, + "grad_norm": 0.44454824924468994, + "learning_rate": 9.778459243240365e-06, + "loss": 0.4066, + "step": 2214 + }, + { + "epoch": 0.557092555331992, + "grad_norm": 0.3420102894306183, + "learning_rate": 9.778028303927522e-06, + "loss": 0.3604, + "step": 2215 + }, + { + "epoch": 0.5573440643863179, + "grad_norm": 0.4014604091644287, + "learning_rate": 9.777596955406446e-06, + "loss": 0.4214, + "step": 2216 + }, + { + "epoch": 0.5575955734406438, + "grad_norm": 0.416327565908432, + "learning_rate": 9.77716519771408e-06, + "loss": 0.4317, + "step": 2217 + }, + { + "epoch": 0.5578470824949698, + "grad_norm": 0.43000099062919617, + "learning_rate": 9.7767330308874e-06, + "loss": 0.4061, + "step": 2218 + }, + { + "epoch": 0.5580985915492958, + "grad_norm": 0.3848433494567871, + "learning_rate": 9.776300454963417e-06, + "loss": 0.3885, + "step": 2219 + }, + { + "epoch": 0.5583501006036218, + "grad_norm": 0.3921235203742981, + "learning_rate": 9.775867469979184e-06, + "loss": 0.4042, + "step": 2220 + }, + { + "epoch": 0.5586016096579477, + "grad_norm": 0.3721983730792999, + "learning_rate": 9.775434075971777e-06, + "loss": 0.3859, + "step": 2221 + }, + { + "epoch": 0.5588531187122736, + "grad_norm": 0.38795045018196106, + "learning_rate": 9.77500027297832e-06, + "loss": 0.3851, + "step": 2222 + }, + { + "epoch": 0.5591046277665996, + "grad_norm": 0.3403348922729492, + "learning_rate": 9.774566061035957e-06, + "loss": 0.3788, + "step": 2223 + }, + { + "epoch": 0.5593561368209256, + "grad_norm": 0.4166750907897949, + "learning_rate": 9.774131440181884e-06, + "loss": 0.3765, + "step": 2224 + }, + { + "epoch": 0.5596076458752515, + "grad_norm": 0.4601471722126007, + "learning_rate": 9.77369641045332e-06, + "loss": 0.4047, + "step": 2225 + }, + { + "epoch": 0.5598591549295775, + "grad_norm": 0.3640338182449341, + "learning_rate": 9.77326097188752e-06, + "loss": 0.4057, + "step": 2226 + }, + { + "epoch": 0.5601106639839034, + "grad_norm": 0.49202394485473633, + "learning_rate": 9.77282512452178e-06, + "loss": 0.3838, + "step": 2227 + }, + { + "epoch": 0.5603621730382293, + "grad_norm": 0.4419405162334442, + "learning_rate": 9.77238886839343e-06, + "loss": 0.4005, + "step": 2228 + }, + { + "epoch": 0.5606136820925554, + "grad_norm": 0.43457552790641785, + "learning_rate": 9.771952203539826e-06, + "loss": 0.3764, + "step": 2229 + }, + { + "epoch": 0.5608651911468813, + "grad_norm": 0.41830188035964966, + "learning_rate": 9.77151512999837e-06, + "loss": 0.3886, + "step": 2230 + }, + { + "epoch": 0.5611167002012073, + "grad_norm": 0.42050424218177795, + "learning_rate": 9.771077647806494e-06, + "loss": 0.4007, + "step": 2231 + }, + { + "epoch": 0.5613682092555332, + "grad_norm": 0.3640553057193756, + "learning_rate": 9.770639757001665e-06, + "loss": 0.4145, + "step": 2232 + }, + { + "epoch": 0.5616197183098591, + "grad_norm": 0.3788280487060547, + "learning_rate": 9.770201457621386e-06, + "loss": 0.3933, + "step": 2233 + }, + { + "epoch": 0.5618712273641852, + "grad_norm": 0.4044826924800873, + "learning_rate": 9.769762749703194e-06, + "loss": 0.3897, + "step": 2234 + }, + { + "epoch": 0.5621227364185111, + "grad_norm": 0.3986685574054718, + "learning_rate": 9.769323633284662e-06, + "loss": 0.3978, + "step": 2235 + }, + { + "epoch": 0.562374245472837, + "grad_norm": 0.3790046274662018, + "learning_rate": 9.768884108403399e-06, + "loss": 0.3759, + "step": 2236 + }, + { + "epoch": 0.562625754527163, + "grad_norm": 0.382379412651062, + "learning_rate": 9.768444175097043e-06, + "loss": 0.3848, + "step": 2237 + }, + { + "epoch": 0.5628772635814889, + "grad_norm": 0.3763681650161743, + "learning_rate": 9.768003833403278e-06, + "loss": 0.3983, + "step": 2238 + }, + { + "epoch": 0.5631287726358148, + "grad_norm": 0.3796675503253937, + "learning_rate": 9.767563083359812e-06, + "loss": 0.4085, + "step": 2239 + }, + { + "epoch": 0.5633802816901409, + "grad_norm": 0.40289655327796936, + "learning_rate": 9.767121925004393e-06, + "loss": 0.4021, + "step": 2240 + }, + { + "epoch": 0.5636317907444668, + "grad_norm": 0.3628096878528595, + "learning_rate": 9.766680358374805e-06, + "loss": 0.3883, + "step": 2241 + }, + { + "epoch": 0.5638832997987927, + "grad_norm": 0.3843015730381012, + "learning_rate": 9.766238383508863e-06, + "loss": 0.4002, + "step": 2242 + }, + { + "epoch": 0.5641348088531187, + "grad_norm": 0.3902144432067871, + "learning_rate": 9.765796000444423e-06, + "loss": 0.3985, + "step": 2243 + }, + { + "epoch": 0.5643863179074446, + "grad_norm": 0.3879857063293457, + "learning_rate": 9.76535320921937e-06, + "loss": 0.396, + "step": 2244 + }, + { + "epoch": 0.5646378269617707, + "grad_norm": 0.38419634103775024, + "learning_rate": 9.764910009871626e-06, + "loss": 0.3882, + "step": 2245 + }, + { + "epoch": 0.5648893360160966, + "grad_norm": 0.4009936451911926, + "learning_rate": 9.76446640243915e-06, + "loss": 0.3931, + "step": 2246 + }, + { + "epoch": 0.5651408450704225, + "grad_norm": 0.35781916975975037, + "learning_rate": 9.764022386959931e-06, + "loss": 0.3602, + "step": 2247 + }, + { + "epoch": 0.5653923541247485, + "grad_norm": 0.3593212366104126, + "learning_rate": 9.763577963472e-06, + "loss": 0.3984, + "step": 2248 + }, + { + "epoch": 0.5656438631790744, + "grad_norm": 0.41306471824645996, + "learning_rate": 9.763133132013415e-06, + "loss": 0.4013, + "step": 2249 + }, + { + "epoch": 0.5658953722334004, + "grad_norm": 0.4159875810146332, + "learning_rate": 9.762687892622278e-06, + "loss": 0.3889, + "step": 2250 + }, + { + "epoch": 0.5661468812877264, + "grad_norm": 0.4286075532436371, + "learning_rate": 9.762242245336718e-06, + "loss": 0.418, + "step": 2251 + }, + { + "epoch": 0.5663983903420523, + "grad_norm": 0.41156652569770813, + "learning_rate": 9.7617961901949e-06, + "loss": 0.4001, + "step": 2252 + }, + { + "epoch": 0.5666498993963782, + "grad_norm": 0.4073083698749542, + "learning_rate": 9.76134972723503e-06, + "loss": 0.3782, + "step": 2253 + }, + { + "epoch": 0.5669014084507042, + "grad_norm": 0.43342074751853943, + "learning_rate": 9.76090285649534e-06, + "loss": 0.4144, + "step": 2254 + }, + { + "epoch": 0.5671529175050302, + "grad_norm": 0.37153980135917664, + "learning_rate": 9.760455578014107e-06, + "loss": 0.4146, + "step": 2255 + }, + { + "epoch": 0.5674044265593562, + "grad_norm": 0.4138156473636627, + "learning_rate": 9.760007891829635e-06, + "loss": 0.411, + "step": 2256 + }, + { + "epoch": 0.5676559356136821, + "grad_norm": 0.39025819301605225, + "learning_rate": 9.759559797980265e-06, + "loss": 0.3999, + "step": 2257 + }, + { + "epoch": 0.567907444668008, + "grad_norm": 0.3810308873653412, + "learning_rate": 9.759111296504374e-06, + "loss": 0.3973, + "step": 2258 + }, + { + "epoch": 0.568158953722334, + "grad_norm": 0.38749489188194275, + "learning_rate": 9.758662387440374e-06, + "loss": 0.3751, + "step": 2259 + }, + { + "epoch": 0.56841046277666, + "grad_norm": 0.42117783427238464, + "learning_rate": 9.75821307082671e-06, + "loss": 0.3943, + "step": 2260 + }, + { + "epoch": 0.5686619718309859, + "grad_norm": 0.38864991068840027, + "learning_rate": 9.757763346701863e-06, + "loss": 0.3931, + "step": 2261 + }, + { + "epoch": 0.5689134808853119, + "grad_norm": 0.4100189805030823, + "learning_rate": 9.757313215104352e-06, + "loss": 0.4236, + "step": 2262 + }, + { + "epoch": 0.5691649899396378, + "grad_norm": 0.36280521750450134, + "learning_rate": 9.756862676072724e-06, + "loss": 0.3773, + "step": 2263 + }, + { + "epoch": 0.5694164989939637, + "grad_norm": 0.4067562222480774, + "learning_rate": 9.756411729645567e-06, + "loss": 0.388, + "step": 2264 + }, + { + "epoch": 0.5696680080482898, + "grad_norm": 0.3898506164550781, + "learning_rate": 9.755960375861502e-06, + "loss": 0.3675, + "step": 2265 + }, + { + "epoch": 0.5699195171026157, + "grad_norm": 0.38783085346221924, + "learning_rate": 9.755508614759183e-06, + "loss": 0.4199, + "step": 2266 + }, + { + "epoch": 0.5701710261569416, + "grad_norm": 0.3513777256011963, + "learning_rate": 9.755056446377302e-06, + "loss": 0.3937, + "step": 2267 + }, + { + "epoch": 0.5704225352112676, + "grad_norm": 0.39856287837028503, + "learning_rate": 9.754603870754584e-06, + "loss": 0.407, + "step": 2268 + }, + { + "epoch": 0.5706740442655935, + "grad_norm": 0.39470675587654114, + "learning_rate": 9.754150887929789e-06, + "loss": 0.3876, + "step": 2269 + }, + { + "epoch": 0.5709255533199196, + "grad_norm": 0.43347692489624023, + "learning_rate": 9.753697497941713e-06, + "loss": 0.4072, + "step": 2270 + }, + { + "epoch": 0.5711770623742455, + "grad_norm": 0.41265973448753357, + "learning_rate": 9.753243700829185e-06, + "loss": 0.3898, + "step": 2271 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.4033988416194916, + "learning_rate": 9.75278949663107e-06, + "loss": 0.3958, + "step": 2272 + }, + { + "epoch": 0.5716800804828974, + "grad_norm": 0.4230313301086426, + "learning_rate": 9.752334885386268e-06, + "loss": 0.3657, + "step": 2273 + }, + { + "epoch": 0.5719315895372233, + "grad_norm": 0.4359712600708008, + "learning_rate": 9.751879867133713e-06, + "loss": 0.4302, + "step": 2274 + }, + { + "epoch": 0.5721830985915493, + "grad_norm": 0.3776133060455322, + "learning_rate": 9.751424441912376e-06, + "loss": 0.3786, + "step": 2275 + }, + { + "epoch": 0.5724346076458753, + "grad_norm": 0.39019957184791565, + "learning_rate": 9.75096860976126e-06, + "loss": 0.3824, + "step": 2276 + }, + { + "epoch": 0.5726861167002012, + "grad_norm": 0.38191524147987366, + "learning_rate": 9.750512370719404e-06, + "loss": 0.4106, + "step": 2277 + }, + { + "epoch": 0.5729376257545271, + "grad_norm": 0.35566771030426025, + "learning_rate": 9.750055724825885e-06, + "loss": 0.387, + "step": 2278 + }, + { + "epoch": 0.5731891348088531, + "grad_norm": 0.37126556038856506, + "learning_rate": 9.749598672119807e-06, + "loss": 0.4093, + "step": 2279 + }, + { + "epoch": 0.5734406438631791, + "grad_norm": 0.3403175473213196, + "learning_rate": 9.749141212640317e-06, + "loss": 0.3993, + "step": 2280 + }, + { + "epoch": 0.5736921529175051, + "grad_norm": 0.37190091609954834, + "learning_rate": 9.748683346426591e-06, + "loss": 0.3956, + "step": 2281 + }, + { + "epoch": 0.573943661971831, + "grad_norm": 0.3574543595314026, + "learning_rate": 9.748225073517845e-06, + "loss": 0.4014, + "step": 2282 + }, + { + "epoch": 0.5741951710261569, + "grad_norm": 0.4083276689052582, + "learning_rate": 9.747766393953327e-06, + "loss": 0.424, + "step": 2283 + }, + { + "epoch": 0.5744466800804829, + "grad_norm": 0.35851186513900757, + "learning_rate": 9.74730730777232e-06, + "loss": 0.3974, + "step": 2284 + }, + { + "epoch": 0.5746981891348089, + "grad_norm": 0.3459572494029999, + "learning_rate": 9.74684781501414e-06, + "loss": 0.384, + "step": 2285 + }, + { + "epoch": 0.5749496981891348, + "grad_norm": 0.38862690329551697, + "learning_rate": 9.746387915718139e-06, + "loss": 0.4025, + "step": 2286 + }, + { + "epoch": 0.5752012072434608, + "grad_norm": 0.3741775155067444, + "learning_rate": 9.745927609923709e-06, + "loss": 0.3985, + "step": 2287 + }, + { + "epoch": 0.5754527162977867, + "grad_norm": 0.33961108326911926, + "learning_rate": 9.745466897670268e-06, + "loss": 0.3606, + "step": 2288 + }, + { + "epoch": 0.5757042253521126, + "grad_norm": 0.3567553460597992, + "learning_rate": 9.745005778997277e-06, + "loss": 0.3874, + "step": 2289 + }, + { + "epoch": 0.5759557344064387, + "grad_norm": 0.4063310921192169, + "learning_rate": 9.744544253944223e-06, + "loss": 0.385, + "step": 2290 + }, + { + "epoch": 0.5762072434607646, + "grad_norm": 0.3446536660194397, + "learning_rate": 9.744082322550637e-06, + "loss": 0.4224, + "step": 2291 + }, + { + "epoch": 0.5764587525150905, + "grad_norm": 0.3623557984828949, + "learning_rate": 9.743619984856078e-06, + "loss": 0.3903, + "step": 2292 + }, + { + "epoch": 0.5767102615694165, + "grad_norm": 0.37968283891677856, + "learning_rate": 9.743157240900145e-06, + "loss": 0.404, + "step": 2293 + }, + { + "epoch": 0.5769617706237424, + "grad_norm": 0.37783414125442505, + "learning_rate": 9.742694090722466e-06, + "loss": 0.3811, + "step": 2294 + }, + { + "epoch": 0.5772132796780685, + "grad_norm": 0.36868754029273987, + "learning_rate": 9.742230534362708e-06, + "loss": 0.3869, + "step": 2295 + }, + { + "epoch": 0.5774647887323944, + "grad_norm": 0.4327180087566376, + "learning_rate": 9.741766571860573e-06, + "loss": 0.399, + "step": 2296 + }, + { + "epoch": 0.5777162977867203, + "grad_norm": 0.4020644724369049, + "learning_rate": 9.741302203255796e-06, + "loss": 0.4062, + "step": 2297 + }, + { + "epoch": 0.5779678068410463, + "grad_norm": 0.3767627477645874, + "learning_rate": 9.740837428588147e-06, + "loss": 0.3712, + "step": 2298 + }, + { + "epoch": 0.5782193158953722, + "grad_norm": 0.428296834230423, + "learning_rate": 9.740372247897429e-06, + "loss": 0.3948, + "step": 2299 + }, + { + "epoch": 0.5784708249496981, + "grad_norm": 0.3837886154651642, + "learning_rate": 9.739906661223485e-06, + "loss": 0.3894, + "step": 2300 + }, + { + "epoch": 0.5787223340040242, + "grad_norm": 0.38341113924980164, + "learning_rate": 9.739440668606188e-06, + "loss": 0.3709, + "step": 2301 + }, + { + "epoch": 0.5789738430583501, + "grad_norm": 0.4215813875198364, + "learning_rate": 9.738974270085447e-06, + "loss": 0.3978, + "step": 2302 + }, + { + "epoch": 0.579225352112676, + "grad_norm": 0.4352063834667206, + "learning_rate": 9.738507465701207e-06, + "loss": 0.3826, + "step": 2303 + }, + { + "epoch": 0.579476861167002, + "grad_norm": 0.36459967494010925, + "learning_rate": 9.738040255493446e-06, + "loss": 0.3813, + "step": 2304 + }, + { + "epoch": 0.579728370221328, + "grad_norm": 0.37163200974464417, + "learning_rate": 9.737572639502179e-06, + "loss": 0.3872, + "step": 2305 + }, + { + "epoch": 0.579979879275654, + "grad_norm": 0.4496228098869324, + "learning_rate": 9.737104617767454e-06, + "loss": 0.4004, + "step": 2306 + }, + { + "epoch": 0.5802313883299799, + "grad_norm": 0.4434552490711212, + "learning_rate": 9.736636190329355e-06, + "loss": 0.3779, + "step": 2307 + }, + { + "epoch": 0.5804828973843058, + "grad_norm": 0.3943467438220978, + "learning_rate": 9.736167357227995e-06, + "loss": 0.3992, + "step": 2308 + }, + { + "epoch": 0.5807344064386318, + "grad_norm": 0.41185152530670166, + "learning_rate": 9.735698118503531e-06, + "loss": 0.3785, + "step": 2309 + }, + { + "epoch": 0.5809859154929577, + "grad_norm": 0.4015391170978546, + "learning_rate": 9.735228474196152e-06, + "loss": 0.399, + "step": 2310 + }, + { + "epoch": 0.5812374245472837, + "grad_norm": 0.43395721912384033, + "learning_rate": 9.734758424346075e-06, + "loss": 0.3948, + "step": 2311 + }, + { + "epoch": 0.5814889336016097, + "grad_norm": 0.41467130184173584, + "learning_rate": 9.734287968993561e-06, + "loss": 0.4115, + "step": 2312 + }, + { + "epoch": 0.5817404426559356, + "grad_norm": 0.4236811697483063, + "learning_rate": 9.7338171081789e-06, + "loss": 0.4043, + "step": 2313 + }, + { + "epoch": 0.5819919517102615, + "grad_norm": 0.38736096024513245, + "learning_rate": 9.733345841942418e-06, + "loss": 0.3729, + "step": 2314 + }, + { + "epoch": 0.5822434607645876, + "grad_norm": 0.37845367193222046, + "learning_rate": 9.732874170324479e-06, + "loss": 0.4106, + "step": 2315 + }, + { + "epoch": 0.5824949698189135, + "grad_norm": 0.35245969891548157, + "learning_rate": 9.732402093365471e-06, + "loss": 0.4144, + "step": 2316 + }, + { + "epoch": 0.5827464788732394, + "grad_norm": 0.39204463362693787, + "learning_rate": 9.731929611105833e-06, + "loss": 0.4053, + "step": 2317 + }, + { + "epoch": 0.5829979879275654, + "grad_norm": 0.389548122882843, + "learning_rate": 9.731456723586026e-06, + "loss": 0.4192, + "step": 2318 + }, + { + "epoch": 0.5832494969818913, + "grad_norm": 0.38718488812446594, + "learning_rate": 9.73098343084655e-06, + "loss": 0.3687, + "step": 2319 + }, + { + "epoch": 0.5835010060362174, + "grad_norm": 0.409236341714859, + "learning_rate": 9.73050973292794e-06, + "loss": 0.3934, + "step": 2320 + }, + { + "epoch": 0.5837525150905433, + "grad_norm": 0.41710254549980164, + "learning_rate": 9.730035629870766e-06, + "loss": 0.375, + "step": 2321 + }, + { + "epoch": 0.5840040241448692, + "grad_norm": 0.421811580657959, + "learning_rate": 9.729561121715632e-06, + "loss": 0.3895, + "step": 2322 + }, + { + "epoch": 0.5842555331991952, + "grad_norm": 0.39268958568573, + "learning_rate": 9.729086208503174e-06, + "loss": 0.3992, + "step": 2323 + }, + { + "epoch": 0.5845070422535211, + "grad_norm": 0.3878178298473358, + "learning_rate": 9.728610890274068e-06, + "loss": 0.3971, + "step": 2324 + }, + { + "epoch": 0.584758551307847, + "grad_norm": 0.4158124029636383, + "learning_rate": 9.728135167069022e-06, + "loss": 0.3639, + "step": 2325 + }, + { + "epoch": 0.5850100603621731, + "grad_norm": 0.3816595673561096, + "learning_rate": 9.727659038928778e-06, + "loss": 0.3709, + "step": 2326 + }, + { + "epoch": 0.585261569416499, + "grad_norm": 0.40008431673049927, + "learning_rate": 9.727182505894112e-06, + "loss": 0.3762, + "step": 2327 + }, + { + "epoch": 0.5855130784708249, + "grad_norm": 0.40834906697273254, + "learning_rate": 9.72670556800584e-06, + "loss": 0.3712, + "step": 2328 + }, + { + "epoch": 0.5857645875251509, + "grad_norm": 0.37313729524612427, + "learning_rate": 9.726228225304806e-06, + "loss": 0.3801, + "step": 2329 + }, + { + "epoch": 0.5860160965794768, + "grad_norm": 0.4533138871192932, + "learning_rate": 9.72575047783189e-06, + "loss": 0.4047, + "step": 2330 + }, + { + "epoch": 0.5862676056338029, + "grad_norm": 0.3437679708003998, + "learning_rate": 9.72527232562801e-06, + "loss": 0.4038, + "step": 2331 + }, + { + "epoch": 0.5865191146881288, + "grad_norm": 0.45910757780075073, + "learning_rate": 9.724793768734117e-06, + "loss": 0.4086, + "step": 2332 + }, + { + "epoch": 0.5867706237424547, + "grad_norm": 0.41133296489715576, + "learning_rate": 9.724314807191197e-06, + "loss": 0.386, + "step": 2333 + }, + { + "epoch": 0.5870221327967807, + "grad_norm": 0.39628586173057556, + "learning_rate": 9.723835441040268e-06, + "loss": 0.4064, + "step": 2334 + }, + { + "epoch": 0.5872736418511066, + "grad_norm": 0.4965059459209442, + "learning_rate": 9.723355670322385e-06, + "loss": 0.416, + "step": 2335 + }, + { + "epoch": 0.5875251509054326, + "grad_norm": 0.4447380304336548, + "learning_rate": 9.722875495078638e-06, + "loss": 0.4007, + "step": 2336 + }, + { + "epoch": 0.5877766599597586, + "grad_norm": 0.3534230887889862, + "learning_rate": 9.722394915350153e-06, + "loss": 0.3665, + "step": 2337 + }, + { + "epoch": 0.5880281690140845, + "grad_norm": 0.5113065838813782, + "learning_rate": 9.721913931178084e-06, + "loss": 0.3697, + "step": 2338 + }, + { + "epoch": 0.5882796780684104, + "grad_norm": 0.37465915083885193, + "learning_rate": 9.72143254260363e-06, + "loss": 0.3943, + "step": 2339 + }, + { + "epoch": 0.5885311871227364, + "grad_norm": 0.4188118278980255, + "learning_rate": 9.720950749668013e-06, + "loss": 0.3736, + "step": 2340 + }, + { + "epoch": 0.5887826961770624, + "grad_norm": 0.4283120036125183, + "learning_rate": 9.720468552412501e-06, + "loss": 0.3916, + "step": 2341 + }, + { + "epoch": 0.5890342052313883, + "grad_norm": 0.4219818115234375, + "learning_rate": 9.719985950878386e-06, + "loss": 0.37, + "step": 2342 + }, + { + "epoch": 0.5892857142857143, + "grad_norm": 0.41337502002716064, + "learning_rate": 9.719502945107004e-06, + "loss": 0.3874, + "step": 2343 + }, + { + "epoch": 0.5895372233400402, + "grad_norm": 0.3993411660194397, + "learning_rate": 9.71901953513972e-06, + "loss": 0.3988, + "step": 2344 + }, + { + "epoch": 0.5897887323943662, + "grad_norm": 0.4108680188655853, + "learning_rate": 9.718535721017936e-06, + "loss": 0.4016, + "step": 2345 + }, + { + "epoch": 0.5900402414486922, + "grad_norm": 0.4107624888420105, + "learning_rate": 9.718051502783084e-06, + "loss": 0.3762, + "step": 2346 + }, + { + "epoch": 0.5902917505030181, + "grad_norm": 0.4297145903110504, + "learning_rate": 9.717566880476639e-06, + "loss": 0.3873, + "step": 2347 + }, + { + "epoch": 0.5905432595573441, + "grad_norm": 0.42795687913894653, + "learning_rate": 9.717081854140103e-06, + "loss": 0.4027, + "step": 2348 + }, + { + "epoch": 0.59079476861167, + "grad_norm": 0.42112451791763306, + "learning_rate": 9.716596423815016e-06, + "loss": 0.4161, + "step": 2349 + }, + { + "epoch": 0.5910462776659959, + "grad_norm": 0.46127063035964966, + "learning_rate": 9.716110589542952e-06, + "loss": 0.3833, + "step": 2350 + }, + { + "epoch": 0.591297786720322, + "grad_norm": 0.4687446355819702, + "learning_rate": 9.71562435136552e-06, + "loss": 0.391, + "step": 2351 + }, + { + "epoch": 0.5915492957746479, + "grad_norm": 0.4273424446582794, + "learning_rate": 9.715137709324363e-06, + "loss": 0.3899, + "step": 2352 + }, + { + "epoch": 0.5918008048289738, + "grad_norm": 0.4016727805137634, + "learning_rate": 9.71465066346116e-06, + "loss": 0.3874, + "step": 2353 + }, + { + "epoch": 0.5920523138832998, + "grad_norm": 0.4089299440383911, + "learning_rate": 9.714163213817621e-06, + "loss": 0.3946, + "step": 2354 + }, + { + "epoch": 0.5923038229376257, + "grad_norm": 0.3954598903656006, + "learning_rate": 9.713675360435495e-06, + "loss": 0.393, + "step": 2355 + }, + { + "epoch": 0.5925553319919518, + "grad_norm": 0.42446476221084595, + "learning_rate": 9.713187103356563e-06, + "loss": 0.3604, + "step": 2356 + }, + { + "epoch": 0.5928068410462777, + "grad_norm": 0.461434543132782, + "learning_rate": 9.71269844262264e-06, + "loss": 0.3915, + "step": 2357 + }, + { + "epoch": 0.5930583501006036, + "grad_norm": 0.3812006711959839, + "learning_rate": 9.712209378275581e-06, + "loss": 0.3874, + "step": 2358 + }, + { + "epoch": 0.5933098591549296, + "grad_norm": 0.44694066047668457, + "learning_rate": 9.711719910357267e-06, + "loss": 0.3899, + "step": 2359 + }, + { + "epoch": 0.5935613682092555, + "grad_norm": 0.39847058057785034, + "learning_rate": 9.711230038909619e-06, + "loss": 0.4042, + "step": 2360 + }, + { + "epoch": 0.5938128772635815, + "grad_norm": 0.3936542570590973, + "learning_rate": 9.71073976397459e-06, + "loss": 0.4044, + "step": 2361 + }, + { + "epoch": 0.5940643863179075, + "grad_norm": 0.38293659687042236, + "learning_rate": 9.710249085594171e-06, + "loss": 0.3467, + "step": 2362 + }, + { + "epoch": 0.5943158953722334, + "grad_norm": 0.4120056927204132, + "learning_rate": 9.709758003810388e-06, + "loss": 0.4156, + "step": 2363 + }, + { + "epoch": 0.5945674044265593, + "grad_norm": 0.40175196528434753, + "learning_rate": 9.709266518665293e-06, + "loss": 0.3832, + "step": 2364 + }, + { + "epoch": 0.5948189134808853, + "grad_norm": 0.3934299051761627, + "learning_rate": 9.708774630200983e-06, + "loss": 0.3808, + "step": 2365 + }, + { + "epoch": 0.5950704225352113, + "grad_norm": 0.3721473515033722, + "learning_rate": 9.708282338459582e-06, + "loss": 0.3742, + "step": 2366 + }, + { + "epoch": 0.5953219315895373, + "grad_norm": 0.34081289172172546, + "learning_rate": 9.707789643483256e-06, + "loss": 0.4101, + "step": 2367 + }, + { + "epoch": 0.5955734406438632, + "grad_norm": 0.36988210678100586, + "learning_rate": 9.707296545314197e-06, + "loss": 0.4075, + "step": 2368 + }, + { + "epoch": 0.5958249496981891, + "grad_norm": 0.3822750747203827, + "learning_rate": 9.70680304399464e-06, + "loss": 0.3974, + "step": 2369 + }, + { + "epoch": 0.5960764587525151, + "grad_norm": 0.3905336856842041, + "learning_rate": 9.706309139566847e-06, + "loss": 0.3861, + "step": 2370 + }, + { + "epoch": 0.596327967806841, + "grad_norm": 0.3887244760990143, + "learning_rate": 9.705814832073118e-06, + "loss": 0.3831, + "step": 2371 + }, + { + "epoch": 0.596579476861167, + "grad_norm": 0.38636234402656555, + "learning_rate": 9.705320121555789e-06, + "loss": 0.3969, + "step": 2372 + }, + { + "epoch": 0.596830985915493, + "grad_norm": 0.3817340135574341, + "learning_rate": 9.704825008057229e-06, + "loss": 0.3942, + "step": 2373 + }, + { + "epoch": 0.5970824949698189, + "grad_norm": 0.41212648153305054, + "learning_rate": 9.704329491619837e-06, + "loss": 0.3865, + "step": 2374 + }, + { + "epoch": 0.5973340040241448, + "grad_norm": 0.3645521402359009, + "learning_rate": 9.703833572286056e-06, + "loss": 0.389, + "step": 2375 + }, + { + "epoch": 0.5975855130784709, + "grad_norm": 0.4224928319454193, + "learning_rate": 9.703337250098357e-06, + "loss": 0.3855, + "step": 2376 + }, + { + "epoch": 0.5978370221327968, + "grad_norm": 0.40865787863731384, + "learning_rate": 9.702840525099247e-06, + "loss": 0.3671, + "step": 2377 + }, + { + "epoch": 0.5980885311871227, + "grad_norm": 0.348753422498703, + "learning_rate": 9.702343397331266e-06, + "loss": 0.3635, + "step": 2378 + }, + { + "epoch": 0.5983400402414487, + "grad_norm": 0.4527108669281006, + "learning_rate": 9.701845866836992e-06, + "loss": 0.4217, + "step": 2379 + }, + { + "epoch": 0.5985915492957746, + "grad_norm": 0.4157269597053528, + "learning_rate": 9.70134793365903e-06, + "loss": 0.3824, + "step": 2380 + }, + { + "epoch": 0.5988430583501007, + "grad_norm": 0.35614490509033203, + "learning_rate": 9.700849597840035e-06, + "loss": 0.4004, + "step": 2381 + }, + { + "epoch": 0.5990945674044266, + "grad_norm": 0.41033607721328735, + "learning_rate": 9.700350859422675e-06, + "loss": 0.3927, + "step": 2382 + }, + { + "epoch": 0.5993460764587525, + "grad_norm": 0.380023717880249, + "learning_rate": 9.699851718449672e-06, + "loss": 0.4076, + "step": 2383 + }, + { + "epoch": 0.5995975855130785, + "grad_norm": 0.3968260586261749, + "learning_rate": 9.699352174963772e-06, + "loss": 0.3977, + "step": 2384 + }, + { + "epoch": 0.5998490945674044, + "grad_norm": 0.3838002383708954, + "learning_rate": 9.698852229007756e-06, + "loss": 0.401, + "step": 2385 + }, + { + "epoch": 0.6001006036217303, + "grad_norm": 0.4010978043079376, + "learning_rate": 9.698351880624444e-06, + "loss": 0.3975, + "step": 2386 + }, + { + "epoch": 0.6003521126760564, + "grad_norm": 0.4017295241355896, + "learning_rate": 9.697851129856687e-06, + "loss": 0.3984, + "step": 2387 + }, + { + "epoch": 0.6006036217303823, + "grad_norm": 0.4557097554206848, + "learning_rate": 9.697349976747366e-06, + "loss": 0.388, + "step": 2388 + }, + { + "epoch": 0.6008551307847082, + "grad_norm": 0.3855472803115845, + "learning_rate": 9.696848421339409e-06, + "loss": 0.4105, + "step": 2389 + }, + { + "epoch": 0.6011066398390342, + "grad_norm": 0.37822774052619934, + "learning_rate": 9.696346463675767e-06, + "loss": 0.3676, + "step": 2390 + }, + { + "epoch": 0.6013581488933601, + "grad_norm": 0.41425397992134094, + "learning_rate": 9.695844103799432e-06, + "loss": 0.3996, + "step": 2391 + }, + { + "epoch": 0.6016096579476862, + "grad_norm": 0.3729078769683838, + "learning_rate": 9.695341341753426e-06, + "loss": 0.3881, + "step": 2392 + }, + { + "epoch": 0.6018611670020121, + "grad_norm": 0.41532474756240845, + "learning_rate": 9.69483817758081e-06, + "loss": 0.397, + "step": 2393 + }, + { + "epoch": 0.602112676056338, + "grad_norm": 0.389249324798584, + "learning_rate": 9.694334611324672e-06, + "loss": 0.3902, + "step": 2394 + }, + { + "epoch": 0.602364185110664, + "grad_norm": 0.3921426236629486, + "learning_rate": 9.693830643028142e-06, + "loss": 0.4204, + "step": 2395 + }, + { + "epoch": 0.60261569416499, + "grad_norm": 0.3730478584766388, + "learning_rate": 9.693326272734384e-06, + "loss": 0.384, + "step": 2396 + }, + { + "epoch": 0.6028672032193159, + "grad_norm": 0.3994380831718445, + "learning_rate": 9.692821500486592e-06, + "loss": 0.4142, + "step": 2397 + }, + { + "epoch": 0.6031187122736419, + "grad_norm": 0.3791671395301819, + "learning_rate": 9.692316326327995e-06, + "loss": 0.3905, + "step": 2398 + }, + { + "epoch": 0.6033702213279678, + "grad_norm": 0.3484344780445099, + "learning_rate": 9.69181075030186e-06, + "loss": 0.4205, + "step": 2399 + }, + { + "epoch": 0.6036217303822937, + "grad_norm": 0.3879513144493103, + "learning_rate": 9.691304772451487e-06, + "loss": 0.3693, + "step": 2400 + }, + { + "epoch": 0.6038732394366197, + "grad_norm": 0.4011459946632385, + "learning_rate": 9.690798392820208e-06, + "loss": 0.4037, + "step": 2401 + }, + { + "epoch": 0.6041247484909457, + "grad_norm": 0.3635562062263489, + "learning_rate": 9.690291611451394e-06, + "loss": 0.391, + "step": 2402 + }, + { + "epoch": 0.6043762575452716, + "grad_norm": 0.37556272745132446, + "learning_rate": 9.689784428388444e-06, + "loss": 0.4034, + "step": 2403 + }, + { + "epoch": 0.6046277665995976, + "grad_norm": 0.38869237899780273, + "learning_rate": 9.689276843674797e-06, + "loss": 0.3951, + "step": 2404 + }, + { + "epoch": 0.6048792756539235, + "grad_norm": 0.41964560747146606, + "learning_rate": 9.688768857353925e-06, + "loss": 0.4044, + "step": 2405 + }, + { + "epoch": 0.6051307847082495, + "grad_norm": 0.40390586853027344, + "learning_rate": 9.688260469469333e-06, + "loss": 0.4318, + "step": 2406 + }, + { + "epoch": 0.6053822937625755, + "grad_norm": 0.38943222165107727, + "learning_rate": 9.687751680064562e-06, + "loss": 0.4171, + "step": 2407 + }, + { + "epoch": 0.6056338028169014, + "grad_norm": 0.37889039516448975, + "learning_rate": 9.687242489183187e-06, + "loss": 0.3723, + "step": 2408 + }, + { + "epoch": 0.6058853118712274, + "grad_norm": 0.3625709116458893, + "learning_rate": 9.686732896868814e-06, + "loss": 0.3664, + "step": 2409 + }, + { + "epoch": 0.6061368209255533, + "grad_norm": 0.37902170419692993, + "learning_rate": 9.68622290316509e-06, + "loss": 0.3782, + "step": 2410 + }, + { + "epoch": 0.6063883299798792, + "grad_norm": 0.35425660014152527, + "learning_rate": 9.68571250811569e-06, + "loss": 0.3889, + "step": 2411 + }, + { + "epoch": 0.6066398390342053, + "grad_norm": 0.3910084068775177, + "learning_rate": 9.685201711764328e-06, + "loss": 0.3994, + "step": 2412 + }, + { + "epoch": 0.6068913480885312, + "grad_norm": 0.3474578261375427, + "learning_rate": 9.68469051415475e-06, + "loss": 0.3665, + "step": 2413 + }, + { + "epoch": 0.6071428571428571, + "grad_norm": 0.37786930799484253, + "learning_rate": 9.684178915330736e-06, + "loss": 0.4192, + "step": 2414 + }, + { + "epoch": 0.6073943661971831, + "grad_norm": 0.34964871406555176, + "learning_rate": 9.683666915336102e-06, + "loss": 0.3848, + "step": 2415 + }, + { + "epoch": 0.607645875251509, + "grad_norm": 0.37684234976768494, + "learning_rate": 9.683154514214698e-06, + "loss": 0.3607, + "step": 2416 + }, + { + "epoch": 0.6078973843058351, + "grad_norm": 0.41027969121932983, + "learning_rate": 9.68264171201041e-06, + "loss": 0.4109, + "step": 2417 + }, + { + "epoch": 0.608148893360161, + "grad_norm": 0.3731859028339386, + "learning_rate": 9.682128508767151e-06, + "loss": 0.4011, + "step": 2418 + }, + { + "epoch": 0.6084004024144869, + "grad_norm": 0.3846946954727173, + "learning_rate": 9.681614904528877e-06, + "loss": 0.3984, + "step": 2419 + }, + { + "epoch": 0.6086519114688129, + "grad_norm": 0.378128319978714, + "learning_rate": 9.681100899339574e-06, + "loss": 0.396, + "step": 2420 + }, + { + "epoch": 0.6089034205231388, + "grad_norm": 0.39976274967193604, + "learning_rate": 9.680586493243265e-06, + "loss": 0.3737, + "step": 2421 + }, + { + "epoch": 0.6091549295774648, + "grad_norm": 0.38559094071388245, + "learning_rate": 9.680071686284005e-06, + "loss": 0.385, + "step": 2422 + }, + { + "epoch": 0.6094064386317908, + "grad_norm": 0.43395066261291504, + "learning_rate": 9.679556478505882e-06, + "loss": 0.3969, + "step": 2423 + }, + { + "epoch": 0.6096579476861167, + "grad_norm": 0.3740188181400299, + "learning_rate": 9.679040869953023e-06, + "loss": 0.3783, + "step": 2424 + }, + { + "epoch": 0.6099094567404426, + "grad_norm": 0.38674283027648926, + "learning_rate": 9.678524860669584e-06, + "loss": 0.4026, + "step": 2425 + }, + { + "epoch": 0.6101609657947686, + "grad_norm": 0.399495929479599, + "learning_rate": 9.678008450699761e-06, + "loss": 0.3948, + "step": 2426 + }, + { + "epoch": 0.6104124748490946, + "grad_norm": 0.390152245759964, + "learning_rate": 9.677491640087779e-06, + "loss": 0.3892, + "step": 2427 + }, + { + "epoch": 0.6106639839034205, + "grad_norm": 0.36457139253616333, + "learning_rate": 9.6769744288779e-06, + "loss": 0.3923, + "step": 2428 + }, + { + "epoch": 0.6109154929577465, + "grad_norm": 0.3784310519695282, + "learning_rate": 9.676456817114423e-06, + "loss": 0.3828, + "step": 2429 + }, + { + "epoch": 0.6111670020120724, + "grad_norm": 0.41220057010650635, + "learning_rate": 9.675938804841673e-06, + "loss": 0.4132, + "step": 2430 + }, + { + "epoch": 0.6114185110663984, + "grad_norm": 0.3676017224788666, + "learning_rate": 9.675420392104016e-06, + "loss": 0.3921, + "step": 2431 + }, + { + "epoch": 0.6116700201207244, + "grad_norm": 0.3908953070640564, + "learning_rate": 9.674901578945853e-06, + "loss": 0.3877, + "step": 2432 + }, + { + "epoch": 0.6119215291750503, + "grad_norm": 0.38477277755737305, + "learning_rate": 9.674382365411617e-06, + "loss": 0.3855, + "step": 2433 + }, + { + "epoch": 0.6121730382293763, + "grad_norm": 0.40748539566993713, + "learning_rate": 9.673862751545773e-06, + "loss": 0.3879, + "step": 2434 + }, + { + "epoch": 0.6124245472837022, + "grad_norm": 0.389136403799057, + "learning_rate": 9.673342737392824e-06, + "loss": 0.4344, + "step": 2435 + }, + { + "epoch": 0.6126760563380281, + "grad_norm": 0.3874034583568573, + "learning_rate": 9.672822322997305e-06, + "loss": 0.4105, + "step": 2436 + }, + { + "epoch": 0.6129275653923542, + "grad_norm": 0.3671362102031708, + "learning_rate": 9.672301508403788e-06, + "loss": 0.383, + "step": 2437 + }, + { + "epoch": 0.6131790744466801, + "grad_norm": 0.35450032353401184, + "learning_rate": 9.671780293656876e-06, + "loss": 0.3807, + "step": 2438 + }, + { + "epoch": 0.613430583501006, + "grad_norm": 0.4165121018886566, + "learning_rate": 9.67125867880121e-06, + "loss": 0.3878, + "step": 2439 + }, + { + "epoch": 0.613682092555332, + "grad_norm": 0.40714791417121887, + "learning_rate": 9.67073666388146e-06, + "loss": 0.4076, + "step": 2440 + }, + { + "epoch": 0.6139336016096579, + "grad_norm": 0.42955923080444336, + "learning_rate": 9.670214248942335e-06, + "loss": 0.4108, + "step": 2441 + }, + { + "epoch": 0.614185110663984, + "grad_norm": 0.38127392530441284, + "learning_rate": 9.669691434028576e-06, + "loss": 0.4032, + "step": 2442 + }, + { + "epoch": 0.6144366197183099, + "grad_norm": 0.4305511713027954, + "learning_rate": 9.66916821918496e-06, + "loss": 0.3918, + "step": 2443 + }, + { + "epoch": 0.6146881287726358, + "grad_norm": 0.45002248883247375, + "learning_rate": 9.668644604456297e-06, + "loss": 0.3847, + "step": 2444 + }, + { + "epoch": 0.6149396378269618, + "grad_norm": 0.34926843643188477, + "learning_rate": 9.668120589887429e-06, + "loss": 0.3919, + "step": 2445 + }, + { + "epoch": 0.6151911468812877, + "grad_norm": 0.39543116092681885, + "learning_rate": 9.667596175523237e-06, + "loss": 0.3568, + "step": 2446 + }, + { + "epoch": 0.6154426559356136, + "grad_norm": 0.3891063630580902, + "learning_rate": 9.667071361408633e-06, + "loss": 0.3759, + "step": 2447 + }, + { + "epoch": 0.6156941649899397, + "grad_norm": 0.3981350064277649, + "learning_rate": 9.666546147588563e-06, + "loss": 0.3784, + "step": 2448 + }, + { + "epoch": 0.6159456740442656, + "grad_norm": 0.3683044910430908, + "learning_rate": 9.666020534108009e-06, + "loss": 0.3846, + "step": 2449 + }, + { + "epoch": 0.6161971830985915, + "grad_norm": 0.423318088054657, + "learning_rate": 9.665494521011988e-06, + "loss": 0.3747, + "step": 2450 + }, + { + "epoch": 0.6164486921529175, + "grad_norm": 0.3727003037929535, + "learning_rate": 9.664968108345549e-06, + "loss": 0.3517, + "step": 2451 + }, + { + "epoch": 0.6167002012072434, + "grad_norm": 0.3899252414703369, + "learning_rate": 9.664441296153775e-06, + "loss": 0.3964, + "step": 2452 + }, + { + "epoch": 0.6169517102615694, + "grad_norm": 0.415444940328598, + "learning_rate": 9.663914084481784e-06, + "loss": 0.3811, + "step": 2453 + }, + { + "epoch": 0.6172032193158954, + "grad_norm": 0.4385893940925598, + "learning_rate": 9.66338647337473e-06, + "loss": 0.4241, + "step": 2454 + }, + { + "epoch": 0.6174547283702213, + "grad_norm": 0.3797210454940796, + "learning_rate": 9.662858462877797e-06, + "loss": 0.3835, + "step": 2455 + }, + { + "epoch": 0.6177062374245473, + "grad_norm": 0.4100744426250458, + "learning_rate": 9.662330053036208e-06, + "loss": 0.408, + "step": 2456 + }, + { + "epoch": 0.6179577464788732, + "grad_norm": 0.3938503563404083, + "learning_rate": 9.66180124389522e-06, + "loss": 0.4077, + "step": 2457 + }, + { + "epoch": 0.6182092555331992, + "grad_norm": 0.3984260857105255, + "learning_rate": 9.661272035500115e-06, + "loss": 0.3775, + "step": 2458 + }, + { + "epoch": 0.6184607645875252, + "grad_norm": 0.39625781774520874, + "learning_rate": 9.660742427896224e-06, + "loss": 0.3969, + "step": 2459 + }, + { + "epoch": 0.6187122736418511, + "grad_norm": 0.4334189295768738, + "learning_rate": 9.6602124211289e-06, + "loss": 0.3946, + "step": 2460 + }, + { + "epoch": 0.618963782696177, + "grad_norm": 0.38942191004753113, + "learning_rate": 9.65968201524354e-06, + "loss": 0.3788, + "step": 2461 + }, + { + "epoch": 0.619215291750503, + "grad_norm": 0.36435267329216003, + "learning_rate": 9.659151210285562e-06, + "loss": 0.3685, + "step": 2462 + }, + { + "epoch": 0.619466800804829, + "grad_norm": 0.4146406650543213, + "learning_rate": 9.658620006300432e-06, + "loss": 0.3682, + "step": 2463 + }, + { + "epoch": 0.6197183098591549, + "grad_norm": 0.41353920102119446, + "learning_rate": 9.658088403333642e-06, + "loss": 0.3967, + "step": 2464 + }, + { + "epoch": 0.6199698189134809, + "grad_norm": 0.41589221358299255, + "learning_rate": 9.657556401430723e-06, + "loss": 0.4066, + "step": 2465 + }, + { + "epoch": 0.6202213279678068, + "grad_norm": 0.47424760460853577, + "learning_rate": 9.657024000637235e-06, + "loss": 0.3937, + "step": 2466 + }, + { + "epoch": 0.6204728370221329, + "grad_norm": 0.3641650080680847, + "learning_rate": 9.656491200998774e-06, + "loss": 0.3926, + "step": 2467 + }, + { + "epoch": 0.6207243460764588, + "grad_norm": 0.4509759247303009, + "learning_rate": 9.655958002560974e-06, + "loss": 0.4217, + "step": 2468 + }, + { + "epoch": 0.6209758551307847, + "grad_norm": 0.442841112613678, + "learning_rate": 9.655424405369497e-06, + "loss": 0.3757, + "step": 2469 + }, + { + "epoch": 0.6212273641851107, + "grad_norm": 0.3849235475063324, + "learning_rate": 9.654890409470047e-06, + "loss": 0.4041, + "step": 2470 + }, + { + "epoch": 0.6214788732394366, + "grad_norm": 0.39690494537353516, + "learning_rate": 9.654356014908352e-06, + "loss": 0.3915, + "step": 2471 + }, + { + "epoch": 0.6217303822937625, + "grad_norm": 0.4252414107322693, + "learning_rate": 9.653821221730183e-06, + "loss": 0.3808, + "step": 2472 + }, + { + "epoch": 0.6219818913480886, + "grad_norm": 0.38544967770576477, + "learning_rate": 9.65328602998134e-06, + "loss": 0.3856, + "step": 2473 + }, + { + "epoch": 0.6222334004024145, + "grad_norm": 0.3949087858200073, + "learning_rate": 9.65275043970766e-06, + "loss": 0.3938, + "step": 2474 + }, + { + "epoch": 0.6224849094567404, + "grad_norm": 0.3579958975315094, + "learning_rate": 9.65221445095501e-06, + "loss": 0.4041, + "step": 2475 + }, + { + "epoch": 0.6227364185110664, + "grad_norm": 0.40078824758529663, + "learning_rate": 9.6516780637693e-06, + "loss": 0.367, + "step": 2476 + }, + { + "epoch": 0.6229879275653923, + "grad_norm": 0.39681002497673035, + "learning_rate": 9.651141278196462e-06, + "loss": 0.4074, + "step": 2477 + }, + { + "epoch": 0.6232394366197183, + "grad_norm": 0.37867873907089233, + "learning_rate": 9.650604094282471e-06, + "loss": 0.3701, + "step": 2478 + }, + { + "epoch": 0.6234909456740443, + "grad_norm": 0.43801194429397583, + "learning_rate": 9.650066512073336e-06, + "loss": 0.3802, + "step": 2479 + }, + { + "epoch": 0.6237424547283702, + "grad_norm": 0.34844502806663513, + "learning_rate": 9.649528531615094e-06, + "loss": 0.3914, + "step": 2480 + }, + { + "epoch": 0.6239939637826962, + "grad_norm": 0.3765834867954254, + "learning_rate": 9.64899015295382e-06, + "loss": 0.3964, + "step": 2481 + }, + { + "epoch": 0.6242454728370221, + "grad_norm": 0.4189784526824951, + "learning_rate": 9.648451376135624e-06, + "loss": 0.3936, + "step": 2482 + }, + { + "epoch": 0.6244969818913481, + "grad_norm": 0.3877945840358734, + "learning_rate": 9.647912201206646e-06, + "loss": 0.3809, + "step": 2483 + }, + { + "epoch": 0.6247484909456741, + "grad_norm": 0.34489884972572327, + "learning_rate": 9.647372628213068e-06, + "loss": 0.3805, + "step": 2484 + }, + { + "epoch": 0.625, + "grad_norm": 0.3784612715244293, + "learning_rate": 9.646832657201097e-06, + "loss": 0.3824, + "step": 2485 + }, + { + "epoch": 0.6252515090543259, + "grad_norm": 0.3295539319515228, + "learning_rate": 9.646292288216978e-06, + "loss": 0.3783, + "step": 2486 + }, + { + "epoch": 0.6255030181086519, + "grad_norm": 0.3725329339504242, + "learning_rate": 9.645751521306994e-06, + "loss": 0.4002, + "step": 2487 + }, + { + "epoch": 0.6257545271629779, + "grad_norm": 0.386722207069397, + "learning_rate": 9.645210356517454e-06, + "loss": 0.3934, + "step": 2488 + }, + { + "epoch": 0.6260060362173038, + "grad_norm": 0.3421211838722229, + "learning_rate": 9.64466879389471e-06, + "loss": 0.3854, + "step": 2489 + }, + { + "epoch": 0.6262575452716298, + "grad_norm": 0.42979565262794495, + "learning_rate": 9.644126833485139e-06, + "loss": 0.3659, + "step": 2490 + }, + { + "epoch": 0.6265090543259557, + "grad_norm": 0.36490967869758606, + "learning_rate": 9.643584475335157e-06, + "loss": 0.384, + "step": 2491 + }, + { + "epoch": 0.6267605633802817, + "grad_norm": 0.4025133550167084, + "learning_rate": 9.643041719491218e-06, + "loss": 0.3844, + "step": 2492 + }, + { + "epoch": 0.6270120724346077, + "grad_norm": 0.36443737149238586, + "learning_rate": 9.6424985659998e-06, + "loss": 0.3851, + "step": 2493 + }, + { + "epoch": 0.6272635814889336, + "grad_norm": 0.33622828125953674, + "learning_rate": 9.641955014907425e-06, + "loss": 0.3756, + "step": 2494 + }, + { + "epoch": 0.6275150905432596, + "grad_norm": 0.3600637912750244, + "learning_rate": 9.64141106626064e-06, + "loss": 0.3833, + "step": 2495 + }, + { + "epoch": 0.6277665995975855, + "grad_norm": 0.37680599093437195, + "learning_rate": 9.640866720106037e-06, + "loss": 0.3976, + "step": 2496 + }, + { + "epoch": 0.6280181086519114, + "grad_norm": 0.380525141954422, + "learning_rate": 9.64032197649023e-06, + "loss": 0.3714, + "step": 2497 + }, + { + "epoch": 0.6282696177062375, + "grad_norm": 0.3835989534854889, + "learning_rate": 9.639776835459878e-06, + "loss": 0.4056, + "step": 2498 + }, + { + "epoch": 0.6285211267605634, + "grad_norm": 0.46692678332328796, + "learning_rate": 9.639231297061663e-06, + "loss": 0.4207, + "step": 2499 + }, + { + "epoch": 0.6287726358148893, + "grad_norm": 0.3480624258518219, + "learning_rate": 9.638685361342314e-06, + "loss": 0.4059, + "step": 2500 + }, + { + "epoch": 0.6290241448692153, + "grad_norm": 0.4090098440647125, + "learning_rate": 9.63813902834858e-06, + "loss": 0.3965, + "step": 2501 + }, + { + "epoch": 0.6292756539235412, + "grad_norm": 0.37328529357910156, + "learning_rate": 9.637592298127258e-06, + "loss": 0.3768, + "step": 2502 + }, + { + "epoch": 0.6295271629778671, + "grad_norm": 0.43410345911979675, + "learning_rate": 9.637045170725165e-06, + "loss": 0.3961, + "step": 2503 + }, + { + "epoch": 0.6297786720321932, + "grad_norm": 0.3536240756511688, + "learning_rate": 9.636497646189165e-06, + "loss": 0.3811, + "step": 2504 + }, + { + "epoch": 0.6300301810865191, + "grad_norm": 0.35403186082839966, + "learning_rate": 9.635949724566147e-06, + "loss": 0.3764, + "step": 2505 + }, + { + "epoch": 0.6302816901408451, + "grad_norm": 0.3755878508090973, + "learning_rate": 9.635401405903037e-06, + "loss": 0.378, + "step": 2506 + }, + { + "epoch": 0.630533199195171, + "grad_norm": 0.36756351590156555, + "learning_rate": 9.634852690246795e-06, + "loss": 0.387, + "step": 2507 + }, + { + "epoch": 0.630784708249497, + "grad_norm": 0.3477321267127991, + "learning_rate": 9.634303577644415e-06, + "loss": 0.3545, + "step": 2508 + }, + { + "epoch": 0.631036217303823, + "grad_norm": 0.33334577083587646, + "learning_rate": 9.633754068142928e-06, + "loss": 0.3817, + "step": 2509 + }, + { + "epoch": 0.6312877263581489, + "grad_norm": 0.34079161286354065, + "learning_rate": 9.633204161789392e-06, + "loss": 0.367, + "step": 2510 + }, + { + "epoch": 0.6315392354124748, + "grad_norm": 0.4175172746181488, + "learning_rate": 9.632653858630905e-06, + "loss": 0.3825, + "step": 2511 + }, + { + "epoch": 0.6317907444668008, + "grad_norm": 0.3560580015182495, + "learning_rate": 9.632103158714596e-06, + "loss": 0.3525, + "step": 2512 + }, + { + "epoch": 0.6320422535211268, + "grad_norm": 0.3939068913459778, + "learning_rate": 9.631552062087632e-06, + "loss": 0.4022, + "step": 2513 + }, + { + "epoch": 0.6322937625754527, + "grad_norm": 0.3403138518333435, + "learning_rate": 9.631000568797208e-06, + "loss": 0.4065, + "step": 2514 + }, + { + "epoch": 0.6325452716297787, + "grad_norm": 0.3796042799949646, + "learning_rate": 9.630448678890556e-06, + "loss": 0.4052, + "step": 2515 + }, + { + "epoch": 0.6327967806841046, + "grad_norm": 0.3852931559085846, + "learning_rate": 9.629896392414943e-06, + "loss": 0.3769, + "step": 2516 + }, + { + "epoch": 0.6330482897384306, + "grad_norm": 0.3552147448062897, + "learning_rate": 9.62934370941767e-06, + "loss": 0.3913, + "step": 2517 + }, + { + "epoch": 0.6332997987927566, + "grad_norm": 0.37440934777259827, + "learning_rate": 9.628790629946066e-06, + "loss": 0.387, + "step": 2518 + }, + { + "epoch": 0.6335513078470825, + "grad_norm": 0.4127063751220703, + "learning_rate": 9.628237154047504e-06, + "loss": 0.3924, + "step": 2519 + }, + { + "epoch": 0.6338028169014085, + "grad_norm": 0.3693302273750305, + "learning_rate": 9.627683281769384e-06, + "loss": 0.3824, + "step": 2520 + }, + { + "epoch": 0.6340543259557344, + "grad_norm": 0.3535366654396057, + "learning_rate": 9.627129013159142e-06, + "loss": 0.3505, + "step": 2521 + }, + { + "epoch": 0.6343058350100603, + "grad_norm": 0.43735992908477783, + "learning_rate": 9.626574348264246e-06, + "loss": 0.3739, + "step": 2522 + }, + { + "epoch": 0.6345573440643864, + "grad_norm": 0.41980883479118347, + "learning_rate": 9.626019287132202e-06, + "loss": 0.3887, + "step": 2523 + }, + { + "epoch": 0.6348088531187123, + "grad_norm": 0.3785690665245056, + "learning_rate": 9.625463829810547e-06, + "loss": 0.4176, + "step": 2524 + }, + { + "epoch": 0.6350603621730382, + "grad_norm": 0.42933017015457153, + "learning_rate": 9.62490797634685e-06, + "loss": 0.3994, + "step": 2525 + }, + { + "epoch": 0.6353118712273642, + "grad_norm": 0.3780410885810852, + "learning_rate": 9.624351726788719e-06, + "loss": 0.379, + "step": 2526 + }, + { + "epoch": 0.6355633802816901, + "grad_norm": 0.36872223019599915, + "learning_rate": 9.623795081183794e-06, + "loss": 0.4074, + "step": 2527 + }, + { + "epoch": 0.635814889336016, + "grad_norm": 0.3785404562950134, + "learning_rate": 9.623238039579742e-06, + "loss": 0.3602, + "step": 2528 + }, + { + "epoch": 0.6360663983903421, + "grad_norm": 0.3856281638145447, + "learning_rate": 9.622680602024278e-06, + "loss": 0.3694, + "step": 2529 + }, + { + "epoch": 0.636317907444668, + "grad_norm": 0.3590550124645233, + "learning_rate": 9.62212276856514e-06, + "loss": 0.3768, + "step": 2530 + }, + { + "epoch": 0.636569416498994, + "grad_norm": 0.4134266972541809, + "learning_rate": 9.621564539250103e-06, + "loss": 0.3803, + "step": 2531 + }, + { + "epoch": 0.6368209255533199, + "grad_norm": 0.3633298873901367, + "learning_rate": 9.621005914126974e-06, + "loss": 0.3774, + "step": 2532 + }, + { + "epoch": 0.6370724346076458, + "grad_norm": 0.4221361577510834, + "learning_rate": 9.6204468932436e-06, + "loss": 0.3943, + "step": 2533 + }, + { + "epoch": 0.6373239436619719, + "grad_norm": 0.3500104546546936, + "learning_rate": 9.619887476647854e-06, + "loss": 0.3893, + "step": 2534 + }, + { + "epoch": 0.6375754527162978, + "grad_norm": 0.365842342376709, + "learning_rate": 9.619327664387648e-06, + "loss": 0.3697, + "step": 2535 + }, + { + "epoch": 0.6378269617706237, + "grad_norm": 0.36636513471603394, + "learning_rate": 9.618767456510924e-06, + "loss": 0.3848, + "step": 2536 + }, + { + "epoch": 0.6380784708249497, + "grad_norm": 0.37152019143104553, + "learning_rate": 9.618206853065664e-06, + "loss": 0.377, + "step": 2537 + }, + { + "epoch": 0.6383299798792756, + "grad_norm": 0.37324413657188416, + "learning_rate": 9.617645854099878e-06, + "loss": 0.3837, + "step": 2538 + }, + { + "epoch": 0.6385814889336016, + "grad_norm": 0.3649263083934784, + "learning_rate": 9.617084459661612e-06, + "loss": 0.3921, + "step": 2539 + }, + { + "epoch": 0.6388329979879276, + "grad_norm": 0.4074450135231018, + "learning_rate": 9.616522669798947e-06, + "loss": 0.4087, + "step": 2540 + }, + { + "epoch": 0.6390845070422535, + "grad_norm": 0.3745105564594269, + "learning_rate": 9.615960484559995e-06, + "loss": 0.3926, + "step": 2541 + }, + { + "epoch": 0.6393360160965795, + "grad_norm": 0.42575785517692566, + "learning_rate": 9.615397903992906e-06, + "loss": 0.3892, + "step": 2542 + }, + { + "epoch": 0.6395875251509054, + "grad_norm": 0.3537648916244507, + "learning_rate": 9.61483492814586e-06, + "loss": 0.3679, + "step": 2543 + }, + { + "epoch": 0.6398390342052314, + "grad_norm": 0.37703290581703186, + "learning_rate": 9.614271557067072e-06, + "loss": 0.3567, + "step": 2544 + }, + { + "epoch": 0.6400905432595574, + "grad_norm": 0.40531283617019653, + "learning_rate": 9.613707790804794e-06, + "loss": 0.3992, + "step": 2545 + }, + { + "epoch": 0.6403420523138833, + "grad_norm": 0.37500283122062683, + "learning_rate": 9.613143629407305e-06, + "loss": 0.3933, + "step": 2546 + }, + { + "epoch": 0.6405935613682092, + "grad_norm": 0.41981109976768494, + "learning_rate": 9.612579072922926e-06, + "loss": 0.3811, + "step": 2547 + }, + { + "epoch": 0.6408450704225352, + "grad_norm": 0.416825532913208, + "learning_rate": 9.612014121400003e-06, + "loss": 0.3978, + "step": 2548 + }, + { + "epoch": 0.6410965794768612, + "grad_norm": 0.40258339047431946, + "learning_rate": 9.611448774886925e-06, + "loss": 0.3916, + "step": 2549 + }, + { + "epoch": 0.6413480885311871, + "grad_norm": 0.44783157110214233, + "learning_rate": 9.610883033432107e-06, + "loss": 0.3962, + "step": 2550 + }, + { + "epoch": 0.6415995975855131, + "grad_norm": 0.3728145360946655, + "learning_rate": 9.610316897084004e-06, + "loss": 0.3778, + "step": 2551 + }, + { + "epoch": 0.641851106639839, + "grad_norm": 0.3508928418159485, + "learning_rate": 9.6097503658911e-06, + "loss": 0.3749, + "step": 2552 + }, + { + "epoch": 0.6421026156941649, + "grad_norm": 0.38925930857658386, + "learning_rate": 9.609183439901917e-06, + "loss": 0.3726, + "step": 2553 + }, + { + "epoch": 0.642354124748491, + "grad_norm": 0.3719974458217621, + "learning_rate": 9.608616119165007e-06, + "loss": 0.3923, + "step": 2554 + }, + { + "epoch": 0.6426056338028169, + "grad_norm": 0.3723568618297577, + "learning_rate": 9.608048403728957e-06, + "loss": 0.3739, + "step": 2555 + }, + { + "epoch": 0.6428571428571429, + "grad_norm": 0.3961268365383148, + "learning_rate": 9.60748029364239e-06, + "loss": 0.3812, + "step": 2556 + }, + { + "epoch": 0.6431086519114688, + "grad_norm": 0.38643985986709595, + "learning_rate": 9.60691178895396e-06, + "loss": 0.4085, + "step": 2557 + }, + { + "epoch": 0.6433601609657947, + "grad_norm": 0.3590714931488037, + "learning_rate": 9.606342889712354e-06, + "loss": 0.3862, + "step": 2558 + }, + { + "epoch": 0.6436116700201208, + "grad_norm": 0.35285237431526184, + "learning_rate": 9.605773595966298e-06, + "loss": 0.4006, + "step": 2559 + }, + { + "epoch": 0.6438631790744467, + "grad_norm": 0.36502790451049805, + "learning_rate": 9.605203907764548e-06, + "loss": 0.3655, + "step": 2560 + }, + { + "epoch": 0.6441146881287726, + "grad_norm": 0.3753395676612854, + "learning_rate": 9.604633825155894e-06, + "loss": 0.4009, + "step": 2561 + }, + { + "epoch": 0.6443661971830986, + "grad_norm": 0.39320510625839233, + "learning_rate": 9.604063348189158e-06, + "loss": 0.3895, + "step": 2562 + }, + { + "epoch": 0.6446177062374245, + "grad_norm": 0.320444256067276, + "learning_rate": 9.603492476913199e-06, + "loss": 0.3756, + "step": 2563 + }, + { + "epoch": 0.6448692152917505, + "grad_norm": 0.349065899848938, + "learning_rate": 9.60292121137691e-06, + "loss": 0.4084, + "step": 2564 + }, + { + "epoch": 0.6451207243460765, + "grad_norm": 0.34628018736839294, + "learning_rate": 9.602349551629213e-06, + "loss": 0.4003, + "step": 2565 + }, + { + "epoch": 0.6453722334004024, + "grad_norm": 0.35905465483665466, + "learning_rate": 9.601777497719071e-06, + "loss": 0.3999, + "step": 2566 + }, + { + "epoch": 0.6456237424547284, + "grad_norm": 0.3582354187965393, + "learning_rate": 9.601205049695473e-06, + "loss": 0.3676, + "step": 2567 + }, + { + "epoch": 0.6458752515090543, + "grad_norm": 0.35198500752449036, + "learning_rate": 9.60063220760745e-06, + "loss": 0.4059, + "step": 2568 + }, + { + "epoch": 0.6461267605633803, + "grad_norm": 0.3506782352924347, + "learning_rate": 9.600058971504058e-06, + "loss": 0.3826, + "step": 2569 + }, + { + "epoch": 0.6463782696177063, + "grad_norm": 0.3575206398963928, + "learning_rate": 9.599485341434394e-06, + "loss": 0.3791, + "step": 2570 + }, + { + "epoch": 0.6466297786720322, + "grad_norm": 0.3570882976055145, + "learning_rate": 9.598911317447583e-06, + "loss": 0.3798, + "step": 2571 + }, + { + "epoch": 0.6468812877263581, + "grad_norm": 0.36667707562446594, + "learning_rate": 9.598336899592791e-06, + "loss": 0.3967, + "step": 2572 + }, + { + "epoch": 0.6471327967806841, + "grad_norm": 0.31375589966773987, + "learning_rate": 9.597762087919209e-06, + "loss": 0.3925, + "step": 2573 + }, + { + "epoch": 0.64738430583501, + "grad_norm": 0.3781939446926117, + "learning_rate": 9.597186882476069e-06, + "loss": 0.3934, + "step": 2574 + }, + { + "epoch": 0.647635814889336, + "grad_norm": 0.36152127385139465, + "learning_rate": 9.59661128331263e-06, + "loss": 0.4066, + "step": 2575 + }, + { + "epoch": 0.647887323943662, + "grad_norm": 0.3301715552806854, + "learning_rate": 9.596035290478192e-06, + "loss": 0.3804, + "step": 2576 + }, + { + "epoch": 0.6481388329979879, + "grad_norm": 0.4257327914237976, + "learning_rate": 9.595458904022086e-06, + "loss": 0.3964, + "step": 2577 + }, + { + "epoch": 0.6483903420523138, + "grad_norm": 0.3644980490207672, + "learning_rate": 9.594882123993671e-06, + "loss": 0.3798, + "step": 2578 + }, + { + "epoch": 0.6486418511066399, + "grad_norm": 0.3696765601634979, + "learning_rate": 9.59430495044235e-06, + "loss": 0.3466, + "step": 2579 + }, + { + "epoch": 0.6488933601609658, + "grad_norm": 0.450061172246933, + "learning_rate": 9.593727383417551e-06, + "loss": 0.3997, + "step": 2580 + }, + { + "epoch": 0.6491448692152918, + "grad_norm": 0.4249400794506073, + "learning_rate": 9.59314942296874e-06, + "loss": 0.3988, + "step": 2581 + }, + { + "epoch": 0.6493963782696177, + "grad_norm": 0.44383350014686584, + "learning_rate": 9.592571069145415e-06, + "loss": 0.3948, + "step": 2582 + }, + { + "epoch": 0.6496478873239436, + "grad_norm": 0.3797934353351593, + "learning_rate": 9.591992321997107e-06, + "loss": 0.3807, + "step": 2583 + }, + { + "epoch": 0.6498993963782697, + "grad_norm": 0.40666332840919495, + "learning_rate": 9.591413181573388e-06, + "loss": 0.4152, + "step": 2584 + }, + { + "epoch": 0.6501509054325956, + "grad_norm": 0.36471301317214966, + "learning_rate": 9.590833647923852e-06, + "loss": 0.3952, + "step": 2585 + }, + { + "epoch": 0.6504024144869215, + "grad_norm": 0.40786394476890564, + "learning_rate": 9.590253721098135e-06, + "loss": 0.3957, + "step": 2586 + }, + { + "epoch": 0.6506539235412475, + "grad_norm": 0.3736938238143921, + "learning_rate": 9.589673401145902e-06, + "loss": 0.3554, + "step": 2587 + }, + { + "epoch": 0.6509054325955734, + "grad_norm": 0.3855222761631012, + "learning_rate": 9.589092688116855e-06, + "loss": 0.4014, + "step": 2588 + }, + { + "epoch": 0.6511569416498993, + "grad_norm": 0.41750627756118774, + "learning_rate": 9.58851158206073e-06, + "loss": 0.3998, + "step": 2589 + }, + { + "epoch": 0.6514084507042254, + "grad_norm": 0.4044592082500458, + "learning_rate": 9.587930083027293e-06, + "loss": 0.3571, + "step": 2590 + }, + { + "epoch": 0.6516599597585513, + "grad_norm": 0.3685952126979828, + "learning_rate": 9.587348191066345e-06, + "loss": 0.3984, + "step": 2591 + }, + { + "epoch": 0.6519114688128773, + "grad_norm": 0.3659170866012573, + "learning_rate": 9.586765906227727e-06, + "loss": 0.3794, + "step": 2592 + }, + { + "epoch": 0.6521629778672032, + "grad_norm": 0.3769875466823578, + "learning_rate": 9.586183228561299e-06, + "loss": 0.3803, + "step": 2593 + }, + { + "epoch": 0.6524144869215291, + "grad_norm": 0.40973907709121704, + "learning_rate": 9.58560015811697e-06, + "loss": 0.3876, + "step": 2594 + }, + { + "epoch": 0.6526659959758552, + "grad_norm": 0.377444326877594, + "learning_rate": 9.585016694944676e-06, + "loss": 0.402, + "step": 2595 + }, + { + "epoch": 0.6529175050301811, + "grad_norm": 0.3654913902282715, + "learning_rate": 9.584432839094387e-06, + "loss": 0.3613, + "step": 2596 + }, + { + "epoch": 0.653169014084507, + "grad_norm": 0.3855876326560974, + "learning_rate": 9.583848590616102e-06, + "loss": 0.3833, + "step": 2597 + }, + { + "epoch": 0.653420523138833, + "grad_norm": 0.40650826692581177, + "learning_rate": 9.583263949559864e-06, + "loss": 0.3993, + "step": 2598 + }, + { + "epoch": 0.653672032193159, + "grad_norm": 0.4067690670490265, + "learning_rate": 9.582678915975741e-06, + "loss": 0.3818, + "step": 2599 + }, + { + "epoch": 0.6539235412474849, + "grad_norm": 0.38773733377456665, + "learning_rate": 9.582093489913838e-06, + "loss": 0.39, + "step": 2600 + }, + { + "epoch": 0.6541750503018109, + "grad_norm": 0.3709178864955902, + "learning_rate": 9.581507671424293e-06, + "loss": 0.3899, + "step": 2601 + }, + { + "epoch": 0.6544265593561368, + "grad_norm": 0.4474826157093048, + "learning_rate": 9.580921460557278e-06, + "loss": 0.406, + "step": 2602 + }, + { + "epoch": 0.6546780684104627, + "grad_norm": 0.3681783974170685, + "learning_rate": 9.580334857362997e-06, + "loss": 0.393, + "step": 2603 + }, + { + "epoch": 0.6549295774647887, + "grad_norm": 0.4004930555820465, + "learning_rate": 9.57974786189169e-06, + "loss": 0.4031, + "step": 2604 + }, + { + "epoch": 0.6551810865191147, + "grad_norm": 0.48355597257614136, + "learning_rate": 9.579160474193632e-06, + "loss": 0.3869, + "step": 2605 + }, + { + "epoch": 0.6554325955734407, + "grad_norm": 0.3602335751056671, + "learning_rate": 9.578572694319124e-06, + "loss": 0.4049, + "step": 2606 + }, + { + "epoch": 0.6556841046277666, + "grad_norm": 0.40569397807121277, + "learning_rate": 9.577984522318508e-06, + "loss": 0.3723, + "step": 2607 + }, + { + "epoch": 0.6559356136820925, + "grad_norm": 0.41683316230773926, + "learning_rate": 9.57739595824216e-06, + "loss": 0.3858, + "step": 2608 + }, + { + "epoch": 0.6561871227364185, + "grad_norm": 0.3983708322048187, + "learning_rate": 9.576807002140483e-06, + "loss": 0.4024, + "step": 2609 + }, + { + "epoch": 0.6564386317907445, + "grad_norm": 0.3469032347202301, + "learning_rate": 9.576217654063917e-06, + "loss": 0.3775, + "step": 2610 + }, + { + "epoch": 0.6566901408450704, + "grad_norm": 0.4117089509963989, + "learning_rate": 9.57562791406294e-06, + "loss": 0.4051, + "step": 2611 + }, + { + "epoch": 0.6569416498993964, + "grad_norm": 0.37733131647109985, + "learning_rate": 9.575037782188054e-06, + "loss": 0.3771, + "step": 2612 + }, + { + "epoch": 0.6571931589537223, + "grad_norm": 0.3928118944168091, + "learning_rate": 9.574447258489808e-06, + "loss": 0.3604, + "step": 2613 + }, + { + "epoch": 0.6574446680080482, + "grad_norm": 0.3928970396518707, + "learning_rate": 9.573856343018768e-06, + "loss": 0.3788, + "step": 2614 + }, + { + "epoch": 0.6576961770623743, + "grad_norm": 0.402646541595459, + "learning_rate": 9.573265035825548e-06, + "loss": 0.3922, + "step": 2615 + }, + { + "epoch": 0.6579476861167002, + "grad_norm": 0.3756962716579437, + "learning_rate": 9.572673336960787e-06, + "loss": 0.3839, + "step": 2616 + }, + { + "epoch": 0.6581991951710262, + "grad_norm": 0.3633159399032593, + "learning_rate": 9.572081246475162e-06, + "loss": 0.3558, + "step": 2617 + }, + { + "epoch": 0.6584507042253521, + "grad_norm": 0.39355260133743286, + "learning_rate": 9.571488764419381e-06, + "loss": 0.3855, + "step": 2618 + }, + { + "epoch": 0.658702213279678, + "grad_norm": 0.3599531054496765, + "learning_rate": 9.570895890844188e-06, + "loss": 0.3917, + "step": 2619 + }, + { + "epoch": 0.6589537223340041, + "grad_norm": 0.3619001507759094, + "learning_rate": 9.570302625800353e-06, + "loss": 0.3906, + "step": 2620 + }, + { + "epoch": 0.65920523138833, + "grad_norm": 0.35282886028289795, + "learning_rate": 9.569708969338694e-06, + "loss": 0.3902, + "step": 2621 + }, + { + "epoch": 0.6594567404426559, + "grad_norm": 0.36944806575775146, + "learning_rate": 9.569114921510048e-06, + "loss": 0.3758, + "step": 2622 + }, + { + "epoch": 0.6597082494969819, + "grad_norm": 0.3732403516769409, + "learning_rate": 9.568520482365293e-06, + "loss": 0.3795, + "step": 2623 + }, + { + "epoch": 0.6599597585513078, + "grad_norm": 0.35911738872528076, + "learning_rate": 9.56792565195534e-06, + "loss": 0.4018, + "step": 2624 + }, + { + "epoch": 0.6602112676056338, + "grad_norm": 0.385262131690979, + "learning_rate": 9.567330430331133e-06, + "loss": 0.3562, + "step": 2625 + }, + { + "epoch": 0.6604627766599598, + "grad_norm": 0.3803010582923889, + "learning_rate": 9.566734817543645e-06, + "loss": 0.3979, + "step": 2626 + }, + { + "epoch": 0.6607142857142857, + "grad_norm": 0.4101577401161194, + "learning_rate": 9.566138813643891e-06, + "loss": 0.4253, + "step": 2627 + }, + { + "epoch": 0.6609657947686117, + "grad_norm": 0.39751216769218445, + "learning_rate": 9.565542418682914e-06, + "loss": 0.3937, + "step": 2628 + }, + { + "epoch": 0.6612173038229376, + "grad_norm": 0.3846282958984375, + "learning_rate": 9.564945632711789e-06, + "loss": 0.403, + "step": 2629 + }, + { + "epoch": 0.6614688128772636, + "grad_norm": 0.3686662018299103, + "learning_rate": 9.56434845578163e-06, + "loss": 0.37, + "step": 2630 + }, + { + "epoch": 0.6617203219315896, + "grad_norm": 0.43031734228134155, + "learning_rate": 9.56375088794358e-06, + "loss": 0.381, + "step": 2631 + }, + { + "epoch": 0.6619718309859155, + "grad_norm": 0.36381101608276367, + "learning_rate": 9.563152929248817e-06, + "loss": 0.4052, + "step": 2632 + }, + { + "epoch": 0.6622233400402414, + "grad_norm": 0.3557662069797516, + "learning_rate": 9.562554579748553e-06, + "loss": 0.4099, + "step": 2633 + }, + { + "epoch": 0.6624748490945674, + "grad_norm": 0.3922789692878723, + "learning_rate": 9.561955839494032e-06, + "loss": 0.3999, + "step": 2634 + }, + { + "epoch": 0.6627263581488934, + "grad_norm": 0.39800694584846497, + "learning_rate": 9.561356708536532e-06, + "loss": 0.4047, + "step": 2635 + }, + { + "epoch": 0.6629778672032193, + "grad_norm": 0.35587430000305176, + "learning_rate": 9.560757186927367e-06, + "loss": 0.3867, + "step": 2636 + }, + { + "epoch": 0.6632293762575453, + "grad_norm": 0.4054100811481476, + "learning_rate": 9.56015727471788e-06, + "loss": 0.3606, + "step": 2637 + }, + { + "epoch": 0.6634808853118712, + "grad_norm": 0.3700953423976898, + "learning_rate": 9.559556971959452e-06, + "loss": 0.3743, + "step": 2638 + }, + { + "epoch": 0.6637323943661971, + "grad_norm": 0.3898264169692993, + "learning_rate": 9.558956278703493e-06, + "loss": 0.3565, + "step": 2639 + }, + { + "epoch": 0.6639839034205232, + "grad_norm": 0.4122966527938843, + "learning_rate": 9.55835519500145e-06, + "loss": 0.4195, + "step": 2640 + }, + { + "epoch": 0.6642354124748491, + "grad_norm": 0.375590443611145, + "learning_rate": 9.557753720904801e-06, + "loss": 0.409, + "step": 2641 + }, + { + "epoch": 0.6644869215291751, + "grad_norm": 0.3992207944393158, + "learning_rate": 9.557151856465059e-06, + "loss": 0.3778, + "step": 2642 + }, + { + "epoch": 0.664738430583501, + "grad_norm": 0.36784839630126953, + "learning_rate": 9.556549601733769e-06, + "loss": 0.3846, + "step": 2643 + }, + { + "epoch": 0.6649899396378269, + "grad_norm": 0.3585883677005768, + "learning_rate": 9.555946956762513e-06, + "loss": 0.3747, + "step": 2644 + }, + { + "epoch": 0.665241448692153, + "grad_norm": 0.4158109128475189, + "learning_rate": 9.555343921602901e-06, + "loss": 0.3686, + "step": 2645 + }, + { + "epoch": 0.6654929577464789, + "grad_norm": 0.41617828607559204, + "learning_rate": 9.55474049630658e-06, + "loss": 0.3826, + "step": 2646 + }, + { + "epoch": 0.6657444668008048, + "grad_norm": 0.3927265703678131, + "learning_rate": 9.554136680925232e-06, + "loss": 0.4062, + "step": 2647 + }, + { + "epoch": 0.6659959758551308, + "grad_norm": 0.3728967607021332, + "learning_rate": 9.553532475510565e-06, + "loss": 0.3673, + "step": 2648 + }, + { + "epoch": 0.6662474849094567, + "grad_norm": 0.3700278103351593, + "learning_rate": 9.55292788011433e-06, + "loss": 0.3934, + "step": 2649 + }, + { + "epoch": 0.6664989939637826, + "grad_norm": 0.4425957500934601, + "learning_rate": 9.552322894788306e-06, + "loss": 0.3672, + "step": 2650 + }, + { + "epoch": 0.6667505030181087, + "grad_norm": 0.36857491731643677, + "learning_rate": 9.551717519584303e-06, + "loss": 0.3733, + "step": 2651 + }, + { + "epoch": 0.6670020120724346, + "grad_norm": 0.4787845313549042, + "learning_rate": 9.551111754554172e-06, + "loss": 0.3793, + "step": 2652 + }, + { + "epoch": 0.6672535211267606, + "grad_norm": 0.3845168948173523, + "learning_rate": 9.55050559974979e-06, + "loss": 0.3796, + "step": 2653 + }, + { + "epoch": 0.6675050301810865, + "grad_norm": 0.38348764181137085, + "learning_rate": 9.54989905522307e-06, + "loss": 0.3925, + "step": 2654 + }, + { + "epoch": 0.6677565392354124, + "grad_norm": 0.41745656728744507, + "learning_rate": 9.549292121025961e-06, + "loss": 0.3836, + "step": 2655 + }, + { + "epoch": 0.6680080482897385, + "grad_norm": 0.37018293142318726, + "learning_rate": 9.548684797210444e-06, + "loss": 0.3806, + "step": 2656 + }, + { + "epoch": 0.6682595573440644, + "grad_norm": 0.3973758816719055, + "learning_rate": 9.548077083828528e-06, + "loss": 0.3908, + "step": 2657 + }, + { + "epoch": 0.6685110663983903, + "grad_norm": 0.3895440101623535, + "learning_rate": 9.547468980932263e-06, + "loss": 0.3907, + "step": 2658 + }, + { + "epoch": 0.6687625754527163, + "grad_norm": 0.32289016246795654, + "learning_rate": 9.546860488573729e-06, + "loss": 0.3738, + "step": 2659 + }, + { + "epoch": 0.6690140845070423, + "grad_norm": 0.4137352705001831, + "learning_rate": 9.54625160680504e-06, + "loss": 0.3909, + "step": 2660 + }, + { + "epoch": 0.6692655935613682, + "grad_norm": 0.34959176182746887, + "learning_rate": 9.545642335678341e-06, + "loss": 0.3639, + "step": 2661 + }, + { + "epoch": 0.6695171026156942, + "grad_norm": 0.36586570739746094, + "learning_rate": 9.545032675245814e-06, + "loss": 0.3729, + "step": 2662 + }, + { + "epoch": 0.6697686116700201, + "grad_norm": 0.39214131236076355, + "learning_rate": 9.544422625559671e-06, + "loss": 0.3859, + "step": 2663 + }, + { + "epoch": 0.670020120724346, + "grad_norm": 0.35369038581848145, + "learning_rate": 9.543812186672161e-06, + "loss": 0.3796, + "step": 2664 + }, + { + "epoch": 0.670271629778672, + "grad_norm": 0.40630146861076355, + "learning_rate": 9.543201358635564e-06, + "loss": 0.3971, + "step": 2665 + }, + { + "epoch": 0.670523138832998, + "grad_norm": 0.34926897287368774, + "learning_rate": 9.54259014150219e-06, + "loss": 0.3889, + "step": 2666 + }, + { + "epoch": 0.670774647887324, + "grad_norm": 0.4089818298816681, + "learning_rate": 9.54197853532439e-06, + "loss": 0.392, + "step": 2667 + }, + { + "epoch": 0.6710261569416499, + "grad_norm": 0.3891892433166504, + "learning_rate": 9.541366540154544e-06, + "loss": 0.4018, + "step": 2668 + }, + { + "epoch": 0.6712776659959758, + "grad_norm": 0.3339185118675232, + "learning_rate": 9.540754156045064e-06, + "loss": 0.3712, + "step": 2669 + }, + { + "epoch": 0.6715291750503019, + "grad_norm": 0.40440425276756287, + "learning_rate": 9.540141383048398e-06, + "loss": 0.4362, + "step": 2670 + }, + { + "epoch": 0.6717806841046278, + "grad_norm": 0.39212754368782043, + "learning_rate": 9.539528221217026e-06, + "loss": 0.4054, + "step": 2671 + }, + { + "epoch": 0.6720321931589537, + "grad_norm": 0.3525048494338989, + "learning_rate": 9.538914670603458e-06, + "loss": 0.3754, + "step": 2672 + }, + { + "epoch": 0.6722837022132797, + "grad_norm": 0.37717682123184204, + "learning_rate": 9.538300731260247e-06, + "loss": 0.4042, + "step": 2673 + }, + { + "epoch": 0.6725352112676056, + "grad_norm": 0.4224480986595154, + "learning_rate": 9.537686403239967e-06, + "loss": 0.359, + "step": 2674 + }, + { + "epoch": 0.6727867203219315, + "grad_norm": 0.4100128412246704, + "learning_rate": 9.537071686595237e-06, + "loss": 0.3911, + "step": 2675 + }, + { + "epoch": 0.6730382293762576, + "grad_norm": 0.34391623735427856, + "learning_rate": 9.536456581378699e-06, + "loss": 0.3899, + "step": 2676 + }, + { + "epoch": 0.6732897384305835, + "grad_norm": 0.40277257561683655, + "learning_rate": 9.535841087643036e-06, + "loss": 0.3973, + "step": 2677 + }, + { + "epoch": 0.6735412474849095, + "grad_norm": 0.3905733823776245, + "learning_rate": 9.535225205440958e-06, + "loss": 0.3931, + "step": 2678 + }, + { + "epoch": 0.6737927565392354, + "grad_norm": 0.3948303461074829, + "learning_rate": 9.534608934825217e-06, + "loss": 0.3773, + "step": 2679 + }, + { + "epoch": 0.6740442655935613, + "grad_norm": 0.4447704255580902, + "learning_rate": 9.533992275848587e-06, + "loss": 0.4014, + "step": 2680 + }, + { + "epoch": 0.6742957746478874, + "grad_norm": 0.3908079266548157, + "learning_rate": 9.533375228563883e-06, + "loss": 0.394, + "step": 2681 + }, + { + "epoch": 0.6745472837022133, + "grad_norm": 0.354306697845459, + "learning_rate": 9.532757793023952e-06, + "loss": 0.3703, + "step": 2682 + }, + { + "epoch": 0.6747987927565392, + "grad_norm": 0.3971775472164154, + "learning_rate": 9.532139969281673e-06, + "loss": 0.3872, + "step": 2683 + }, + { + "epoch": 0.6750503018108652, + "grad_norm": 0.3223625421524048, + "learning_rate": 9.531521757389957e-06, + "loss": 0.3702, + "step": 2684 + }, + { + "epoch": 0.6753018108651911, + "grad_norm": 0.4071231186389923, + "learning_rate": 9.530903157401755e-06, + "loss": 0.4038, + "step": 2685 + }, + { + "epoch": 0.6755533199195171, + "grad_norm": 0.3665800094604492, + "learning_rate": 9.530284169370039e-06, + "loss": 0.3836, + "step": 2686 + }, + { + "epoch": 0.6758048289738431, + "grad_norm": 0.3697899878025055, + "learning_rate": 9.529664793347827e-06, + "loss": 0.383, + "step": 2687 + }, + { + "epoch": 0.676056338028169, + "grad_norm": 0.4246238172054291, + "learning_rate": 9.529045029388162e-06, + "loss": 0.4008, + "step": 2688 + }, + { + "epoch": 0.6763078470824949, + "grad_norm": 0.3734142482280731, + "learning_rate": 9.528424877544125e-06, + "loss": 0.3812, + "step": 2689 + }, + { + "epoch": 0.6765593561368209, + "grad_norm": 0.35906723141670227, + "learning_rate": 9.527804337868827e-06, + "loss": 0.3821, + "step": 2690 + }, + { + "epoch": 0.6768108651911469, + "grad_norm": 0.42187240719795227, + "learning_rate": 9.527183410415413e-06, + "loss": 0.3884, + "step": 2691 + }, + { + "epoch": 0.6770623742454729, + "grad_norm": 0.3730980455875397, + "learning_rate": 9.526562095237061e-06, + "loss": 0.3747, + "step": 2692 + }, + { + "epoch": 0.6773138832997988, + "grad_norm": 0.44730526208877563, + "learning_rate": 9.525940392386985e-06, + "loss": 0.3753, + "step": 2693 + }, + { + "epoch": 0.6775653923541247, + "grad_norm": 0.38479888439178467, + "learning_rate": 9.525318301918427e-06, + "loss": 0.402, + "step": 2694 + }, + { + "epoch": 0.6778169014084507, + "grad_norm": 0.40130650997161865, + "learning_rate": 9.524695823884669e-06, + "loss": 0.3775, + "step": 2695 + }, + { + "epoch": 0.6780684104627767, + "grad_norm": 0.41748368740081787, + "learning_rate": 9.524072958339019e-06, + "loss": 0.4146, + "step": 2696 + }, + { + "epoch": 0.6783199195171026, + "grad_norm": 0.3546696901321411, + "learning_rate": 9.523449705334821e-06, + "loss": 0.3782, + "step": 2697 + }, + { + "epoch": 0.6785714285714286, + "grad_norm": 0.4278590679168701, + "learning_rate": 9.522826064925457e-06, + "loss": 0.4017, + "step": 2698 + }, + { + "epoch": 0.6788229376257545, + "grad_norm": 0.3383933901786804, + "learning_rate": 9.522202037164333e-06, + "loss": 0.3774, + "step": 2699 + }, + { + "epoch": 0.6790744466800804, + "grad_norm": 0.3658550977706909, + "learning_rate": 9.521577622104897e-06, + "loss": 0.3783, + "step": 2700 + }, + { + "epoch": 0.6793259557344065, + "grad_norm": 0.40239429473876953, + "learning_rate": 9.520952819800624e-06, + "loss": 0.3896, + "step": 2701 + }, + { + "epoch": 0.6795774647887324, + "grad_norm": 0.3836422562599182, + "learning_rate": 9.520327630305026e-06, + "loss": 0.3932, + "step": 2702 + }, + { + "epoch": 0.6798289738430584, + "grad_norm": 0.3793487548828125, + "learning_rate": 9.519702053671643e-06, + "loss": 0.3851, + "step": 2703 + }, + { + "epoch": 0.6800804828973843, + "grad_norm": 0.38546112179756165, + "learning_rate": 9.519076089954056e-06, + "loss": 0.3839, + "step": 2704 + }, + { + "epoch": 0.6803319919517102, + "grad_norm": 0.38321325182914734, + "learning_rate": 9.518449739205873e-06, + "loss": 0.3793, + "step": 2705 + }, + { + "epoch": 0.6805835010060363, + "grad_norm": 0.35967323184013367, + "learning_rate": 9.517823001480737e-06, + "loss": 0.3805, + "step": 2706 + }, + { + "epoch": 0.6808350100603622, + "grad_norm": 0.3868929445743561, + "learning_rate": 9.517195876832324e-06, + "loss": 0.3666, + "step": 2707 + }, + { + "epoch": 0.6810865191146881, + "grad_norm": 0.33470460772514343, + "learning_rate": 9.516568365314345e-06, + "loss": 0.3674, + "step": 2708 + }, + { + "epoch": 0.6813380281690141, + "grad_norm": 0.37914395332336426, + "learning_rate": 9.51594046698054e-06, + "loss": 0.3757, + "step": 2709 + }, + { + "epoch": 0.68158953722334, + "grad_norm": 0.3739543557167053, + "learning_rate": 9.515312181884685e-06, + "loss": 0.3861, + "step": 2710 + }, + { + "epoch": 0.681841046277666, + "grad_norm": 0.3794519007205963, + "learning_rate": 9.514683510080592e-06, + "loss": 0.3961, + "step": 2711 + }, + { + "epoch": 0.682092555331992, + "grad_norm": 0.39454448223114014, + "learning_rate": 9.514054451622098e-06, + "loss": 0.4207, + "step": 2712 + }, + { + "epoch": 0.6823440643863179, + "grad_norm": 0.38329020142555237, + "learning_rate": 9.51342500656308e-06, + "loss": 0.3901, + "step": 2713 + }, + { + "epoch": 0.6825955734406438, + "grad_norm": 0.40806901454925537, + "learning_rate": 9.512795174957445e-06, + "loss": 0.3576, + "step": 2714 + }, + { + "epoch": 0.6828470824949698, + "grad_norm": 0.4223353862762451, + "learning_rate": 9.512164956859138e-06, + "loss": 0.366, + "step": 2715 + }, + { + "epoch": 0.6830985915492958, + "grad_norm": 0.4174785614013672, + "learning_rate": 9.511534352322128e-06, + "loss": 0.3953, + "step": 2716 + }, + { + "epoch": 0.6833501006036218, + "grad_norm": 0.390242338180542, + "learning_rate": 9.510903361400426e-06, + "loss": 0.3757, + "step": 2717 + }, + { + "epoch": 0.6836016096579477, + "grad_norm": 0.45529499650001526, + "learning_rate": 9.510271984148071e-06, + "loss": 0.3877, + "step": 2718 + }, + { + "epoch": 0.6838531187122736, + "grad_norm": 0.38153916597366333, + "learning_rate": 9.509640220619136e-06, + "loss": 0.4038, + "step": 2719 + }, + { + "epoch": 0.6841046277665996, + "grad_norm": 0.4252956211566925, + "learning_rate": 9.50900807086773e-06, + "loss": 0.3811, + "step": 2720 + }, + { + "epoch": 0.6843561368209256, + "grad_norm": 0.3555954098701477, + "learning_rate": 9.50837553494799e-06, + "loss": 0.3916, + "step": 2721 + }, + { + "epoch": 0.6846076458752515, + "grad_norm": 0.3798430562019348, + "learning_rate": 9.50774261291409e-06, + "loss": 0.3667, + "step": 2722 + }, + { + "epoch": 0.6848591549295775, + "grad_norm": 0.3840332627296448, + "learning_rate": 9.507109304820234e-06, + "loss": 0.3989, + "step": 2723 + }, + { + "epoch": 0.6851106639839034, + "grad_norm": 0.38104766607284546, + "learning_rate": 9.506475610720665e-06, + "loss": 0.3793, + "step": 2724 + }, + { + "epoch": 0.6853621730382293, + "grad_norm": 0.4474826455116272, + "learning_rate": 9.505841530669652e-06, + "loss": 0.3764, + "step": 2725 + }, + { + "epoch": 0.6856136820925554, + "grad_norm": 0.37356293201446533, + "learning_rate": 9.505207064721499e-06, + "loss": 0.4024, + "step": 2726 + }, + { + "epoch": 0.6858651911468813, + "grad_norm": 0.4107537269592285, + "learning_rate": 9.504572212930544e-06, + "loss": 0.3951, + "step": 2727 + }, + { + "epoch": 0.6861167002012073, + "grad_norm": 0.40174853801727295, + "learning_rate": 9.503936975351164e-06, + "loss": 0.3964, + "step": 2728 + }, + { + "epoch": 0.6863682092555332, + "grad_norm": 0.3748626708984375, + "learning_rate": 9.503301352037756e-06, + "loss": 0.3891, + "step": 2729 + }, + { + "epoch": 0.6866197183098591, + "grad_norm": 0.357077032327652, + "learning_rate": 9.50266534304476e-06, + "loss": 0.3828, + "step": 2730 + }, + { + "epoch": 0.6868712273641852, + "grad_norm": 0.40127694606781006, + "learning_rate": 9.502028948426645e-06, + "loss": 0.3847, + "step": 2731 + }, + { + "epoch": 0.6871227364185111, + "grad_norm": 0.3611276149749756, + "learning_rate": 9.501392168237914e-06, + "loss": 0.3901, + "step": 2732 + }, + { + "epoch": 0.687374245472837, + "grad_norm": 0.35712650418281555, + "learning_rate": 9.500755002533109e-06, + "loss": 0.3725, + "step": 2733 + }, + { + "epoch": 0.687625754527163, + "grad_norm": 0.38140082359313965, + "learning_rate": 9.50011745136679e-06, + "loss": 0.4035, + "step": 2734 + }, + { + "epoch": 0.6878772635814889, + "grad_norm": 0.375500351190567, + "learning_rate": 9.499479514793568e-06, + "loss": 0.3949, + "step": 2735 + }, + { + "epoch": 0.6881287726358148, + "grad_norm": 0.3318289816379547, + "learning_rate": 9.49884119286807e-06, + "loss": 0.3663, + "step": 2736 + }, + { + "epoch": 0.6883802816901409, + "grad_norm": 0.4133455753326416, + "learning_rate": 9.498202485644972e-06, + "loss": 0.3892, + "step": 2737 + }, + { + "epoch": 0.6886317907444668, + "grad_norm": 0.34787869453430176, + "learning_rate": 9.49756339317897e-06, + "loss": 0.4052, + "step": 2738 + }, + { + "epoch": 0.6888832997987927, + "grad_norm": 0.3496398627758026, + "learning_rate": 9.4969239155248e-06, + "loss": 0.3995, + "step": 2739 + }, + { + "epoch": 0.6891348088531187, + "grad_norm": 0.3664000630378723, + "learning_rate": 9.49628405273723e-06, + "loss": 0.3926, + "step": 2740 + }, + { + "epoch": 0.6893863179074446, + "grad_norm": 0.38853469491004944, + "learning_rate": 9.49564380487106e-06, + "loss": 0.3909, + "step": 2741 + }, + { + "epoch": 0.6896378269617707, + "grad_norm": 0.3786064684391022, + "learning_rate": 9.495003171981122e-06, + "loss": 0.392, + "step": 2742 + }, + { + "epoch": 0.6898893360160966, + "grad_norm": 0.3976394534111023, + "learning_rate": 9.494362154122283e-06, + "loss": 0.3786, + "step": 2743 + }, + { + "epoch": 0.6901408450704225, + "grad_norm": 0.36655136942863464, + "learning_rate": 9.493720751349442e-06, + "loss": 0.3733, + "step": 2744 + }, + { + "epoch": 0.6903923541247485, + "grad_norm": 0.32457855343818665, + "learning_rate": 9.493078963717533e-06, + "loss": 0.3873, + "step": 2745 + }, + { + "epoch": 0.6906438631790744, + "grad_norm": 0.3550087511539459, + "learning_rate": 9.492436791281516e-06, + "loss": 0.3992, + "step": 2746 + }, + { + "epoch": 0.6908953722334004, + "grad_norm": 0.357652872800827, + "learning_rate": 9.491794234096396e-06, + "loss": 0.3859, + "step": 2747 + }, + { + "epoch": 0.6911468812877264, + "grad_norm": 0.3854414224624634, + "learning_rate": 9.491151292217198e-06, + "loss": 0.4151, + "step": 2748 + }, + { + "epoch": 0.6913983903420523, + "grad_norm": 0.34755367040634155, + "learning_rate": 9.490507965698988e-06, + "loss": 0.3522, + "step": 2749 + }, + { + "epoch": 0.6916498993963782, + "grad_norm": 0.35060808062553406, + "learning_rate": 9.489864254596866e-06, + "loss": 0.3856, + "step": 2750 + }, + { + "epoch": 0.6919014084507042, + "grad_norm": 0.3610481023788452, + "learning_rate": 9.489220158965957e-06, + "loss": 0.393, + "step": 2751 + }, + { + "epoch": 0.6921529175050302, + "grad_norm": 0.32515424489974976, + "learning_rate": 9.488575678861426e-06, + "loss": 0.3973, + "step": 2752 + }, + { + "epoch": 0.6924044265593562, + "grad_norm": 0.3888111412525177, + "learning_rate": 9.487930814338468e-06, + "loss": 0.3946, + "step": 2753 + }, + { + "epoch": 0.6926559356136821, + "grad_norm": 0.3749738931655884, + "learning_rate": 9.487285565452313e-06, + "loss": 0.3777, + "step": 2754 + }, + { + "epoch": 0.692907444668008, + "grad_norm": 0.41394150257110596, + "learning_rate": 9.486639932258223e-06, + "loss": 0.3777, + "step": 2755 + }, + { + "epoch": 0.693158953722334, + "grad_norm": 0.35159727931022644, + "learning_rate": 9.485993914811488e-06, + "loss": 0.3798, + "step": 2756 + }, + { + "epoch": 0.69341046277666, + "grad_norm": 0.36780303716659546, + "learning_rate": 9.485347513167443e-06, + "loss": 0.382, + "step": 2757 + }, + { + "epoch": 0.6936619718309859, + "grad_norm": 0.3663868308067322, + "learning_rate": 9.48470072738144e-06, + "loss": 0.3858, + "step": 2758 + }, + { + "epoch": 0.6939134808853119, + "grad_norm": 0.3358771502971649, + "learning_rate": 9.484053557508876e-06, + "loss": 0.4216, + "step": 2759 + }, + { + "epoch": 0.6941649899396378, + "grad_norm": 0.3996930718421936, + "learning_rate": 9.483406003605178e-06, + "loss": 0.3943, + "step": 2760 + }, + { + "epoch": 0.6944164989939637, + "grad_norm": 0.41368430852890015, + "learning_rate": 9.482758065725805e-06, + "loss": 0.3836, + "step": 2761 + }, + { + "epoch": 0.6946680080482898, + "grad_norm": 0.4018966555595398, + "learning_rate": 9.482109743926247e-06, + "loss": 0.3969, + "step": 2762 + }, + { + "epoch": 0.6949195171026157, + "grad_norm": 0.3682190477848053, + "learning_rate": 9.481461038262027e-06, + "loss": 0.3671, + "step": 2763 + }, + { + "epoch": 0.6951710261569416, + "grad_norm": 0.36615151166915894, + "learning_rate": 9.480811948788708e-06, + "loss": 0.378, + "step": 2764 + }, + { + "epoch": 0.6954225352112676, + "grad_norm": 0.3735343813896179, + "learning_rate": 9.480162475561877e-06, + "loss": 0.3756, + "step": 2765 + }, + { + "epoch": 0.6956740442655935, + "grad_norm": 0.37882131338119507, + "learning_rate": 9.479512618637156e-06, + "loss": 0.3736, + "step": 2766 + }, + { + "epoch": 0.6959255533199196, + "grad_norm": 0.355418860912323, + "learning_rate": 9.478862378070204e-06, + "loss": 0.3856, + "step": 2767 + }, + { + "epoch": 0.6961770623742455, + "grad_norm": 0.34449321031570435, + "learning_rate": 9.47821175391671e-06, + "loss": 0.3713, + "step": 2768 + }, + { + "epoch": 0.6964285714285714, + "grad_norm": 0.39640021324157715, + "learning_rate": 9.477560746232394e-06, + "loss": 0.379, + "step": 2769 + }, + { + "epoch": 0.6966800804828974, + "grad_norm": 0.3537764251232147, + "learning_rate": 9.476909355073012e-06, + "loss": 0.3901, + "step": 2770 + }, + { + "epoch": 0.6969315895372233, + "grad_norm": 0.34988686442375183, + "learning_rate": 9.47625758049435e-06, + "loss": 0.3643, + "step": 2771 + }, + { + "epoch": 0.6971830985915493, + "grad_norm": 0.39470940828323364, + "learning_rate": 9.47560542255223e-06, + "loss": 0.3841, + "step": 2772 + }, + { + "epoch": 0.6974346076458753, + "grad_norm": 0.3538375496864319, + "learning_rate": 9.474952881302506e-06, + "loss": 0.3895, + "step": 2773 + }, + { + "epoch": 0.6976861167002012, + "grad_norm": 0.34231036901474, + "learning_rate": 9.474299956801062e-06, + "loss": 0.385, + "step": 2774 + }, + { + "epoch": 0.6979376257545271, + "grad_norm": 0.3452184796333313, + "learning_rate": 9.473646649103819e-06, + "loss": 0.3913, + "step": 2775 + }, + { + "epoch": 0.6981891348088531, + "grad_norm": 0.3894374966621399, + "learning_rate": 9.472992958266725e-06, + "loss": 0.3922, + "step": 2776 + }, + { + "epoch": 0.6984406438631791, + "grad_norm": 0.35543131828308105, + "learning_rate": 9.47233888434577e-06, + "loss": 0.373, + "step": 2777 + }, + { + "epoch": 0.6986921529175051, + "grad_norm": 0.3584343194961548, + "learning_rate": 9.471684427396966e-06, + "loss": 0.3948, + "step": 2778 + }, + { + "epoch": 0.698943661971831, + "grad_norm": 0.35712918639183044, + "learning_rate": 9.471029587476367e-06, + "loss": 0.4008, + "step": 2779 + }, + { + "epoch": 0.6991951710261569, + "grad_norm": 0.36257344484329224, + "learning_rate": 9.470374364640054e-06, + "loss": 0.4058, + "step": 2780 + }, + { + "epoch": 0.6994466800804829, + "grad_norm": 0.37731629610061646, + "learning_rate": 9.469718758944144e-06, + "loss": 0.4053, + "step": 2781 + }, + { + "epoch": 0.6996981891348089, + "grad_norm": 0.3586489260196686, + "learning_rate": 9.469062770444784e-06, + "loss": 0.402, + "step": 2782 + }, + { + "epoch": 0.6999496981891348, + "grad_norm": 0.36410945653915405, + "learning_rate": 9.468406399198156e-06, + "loss": 0.3954, + "step": 2783 + }, + { + "epoch": 0.7002012072434608, + "grad_norm": 0.40461426973342896, + "learning_rate": 9.467749645260475e-06, + "loss": 0.406, + "step": 2784 + }, + { + "epoch": 0.7004527162977867, + "grad_norm": 0.36193108558654785, + "learning_rate": 9.467092508687987e-06, + "loss": 0.3755, + "step": 2785 + }, + { + "epoch": 0.7007042253521126, + "grad_norm": 0.32516640424728394, + "learning_rate": 9.46643498953697e-06, + "loss": 0.3889, + "step": 2786 + }, + { + "epoch": 0.7009557344064387, + "grad_norm": 0.3632727861404419, + "learning_rate": 9.46577708786374e-06, + "loss": 0.4103, + "step": 2787 + }, + { + "epoch": 0.7012072434607646, + "grad_norm": 0.3422258496284485, + "learning_rate": 9.46511880372464e-06, + "loss": 0.3833, + "step": 2788 + }, + { + "epoch": 0.7014587525150905, + "grad_norm": 0.386264830827713, + "learning_rate": 9.464460137176047e-06, + "loss": 0.3842, + "step": 2789 + }, + { + "epoch": 0.7017102615694165, + "grad_norm": 0.33963924646377563, + "learning_rate": 9.463801088274374e-06, + "loss": 0.3707, + "step": 2790 + }, + { + "epoch": 0.7019617706237424, + "grad_norm": 0.3681434094905853, + "learning_rate": 9.463141657076063e-06, + "loss": 0.3914, + "step": 2791 + }, + { + "epoch": 0.7022132796780685, + "grad_norm": 0.3675888180732727, + "learning_rate": 9.46248184363759e-06, + "loss": 0.3785, + "step": 2792 + }, + { + "epoch": 0.7024647887323944, + "grad_norm": 0.3703523278236389, + "learning_rate": 9.461821648015464e-06, + "loss": 0.4011, + "step": 2793 + }, + { + "epoch": 0.7027162977867203, + "grad_norm": 0.34886541962623596, + "learning_rate": 9.461161070266226e-06, + "loss": 0.3953, + "step": 2794 + }, + { + "epoch": 0.7029678068410463, + "grad_norm": 0.34929558634757996, + "learning_rate": 9.460500110446453e-06, + "loss": 0.4049, + "step": 2795 + }, + { + "epoch": 0.7032193158953722, + "grad_norm": 0.3256225883960724, + "learning_rate": 9.459838768612751e-06, + "loss": 0.3611, + "step": 2796 + }, + { + "epoch": 0.7034708249496981, + "grad_norm": 0.34201568365097046, + "learning_rate": 9.459177044821758e-06, + "loss": 0.3878, + "step": 2797 + }, + { + "epoch": 0.7037223340040242, + "grad_norm": 0.32223188877105713, + "learning_rate": 9.458514939130148e-06, + "loss": 0.3879, + "step": 2798 + }, + { + "epoch": 0.7039738430583501, + "grad_norm": 0.38301658630371094, + "learning_rate": 9.457852451594625e-06, + "loss": 0.3776, + "step": 2799 + }, + { + "epoch": 0.704225352112676, + "grad_norm": 0.3359746038913727, + "learning_rate": 9.457189582271928e-06, + "loss": 0.3815, + "step": 2800 + }, + { + "epoch": 0.704476861167002, + "grad_norm": 0.42329642176628113, + "learning_rate": 9.456526331218827e-06, + "loss": 0.3806, + "step": 2801 + }, + { + "epoch": 0.704728370221328, + "grad_norm": 0.43376052379608154, + "learning_rate": 9.455862698492127e-06, + "loss": 0.3921, + "step": 2802 + }, + { + "epoch": 0.704979879275654, + "grad_norm": 0.3609297275543213, + "learning_rate": 9.455198684148662e-06, + "loss": 0.3749, + "step": 2803 + }, + { + "epoch": 0.7052313883299799, + "grad_norm": 0.3790986239910126, + "learning_rate": 9.454534288245302e-06, + "loss": 0.3872, + "step": 2804 + }, + { + "epoch": 0.7054828973843058, + "grad_norm": 0.36633041501045227, + "learning_rate": 9.453869510838946e-06, + "loss": 0.4007, + "step": 2805 + }, + { + "epoch": 0.7057344064386318, + "grad_norm": 0.4222126305103302, + "learning_rate": 9.45320435198653e-06, + "loss": 0.3949, + "step": 2806 + }, + { + "epoch": 0.7059859154929577, + "grad_norm": 0.34727922081947327, + "learning_rate": 9.452538811745023e-06, + "loss": 0.3738, + "step": 2807 + }, + { + "epoch": 0.7062374245472837, + "grad_norm": 0.371955543756485, + "learning_rate": 9.451872890171419e-06, + "loss": 0.3913, + "step": 2808 + }, + { + "epoch": 0.7064889336016097, + "grad_norm": 0.3855418860912323, + "learning_rate": 9.451206587322754e-06, + "loss": 0.367, + "step": 2809 + }, + { + "epoch": 0.7067404426559356, + "grad_norm": 0.3608242869377136, + "learning_rate": 9.450539903256091e-06, + "loss": 0.3805, + "step": 2810 + }, + { + "epoch": 0.7069919517102615, + "grad_norm": 0.3822399079799652, + "learning_rate": 9.449872838028529e-06, + "loss": 0.3969, + "step": 2811 + }, + { + "epoch": 0.7072434607645876, + "grad_norm": 0.35486891865730286, + "learning_rate": 9.449205391697196e-06, + "loss": 0.3947, + "step": 2812 + }, + { + "epoch": 0.7074949698189135, + "grad_norm": 0.3677852749824524, + "learning_rate": 9.448537564319254e-06, + "loss": 0.3856, + "step": 2813 + }, + { + "epoch": 0.7077464788732394, + "grad_norm": 0.36130937933921814, + "learning_rate": 9.447869355951901e-06, + "loss": 0.401, + "step": 2814 + }, + { + "epoch": 0.7079979879275654, + "grad_norm": 0.35179662704467773, + "learning_rate": 9.447200766652363e-06, + "loss": 0.3734, + "step": 2815 + }, + { + "epoch": 0.7082494969818913, + "grad_norm": 0.3460598289966583, + "learning_rate": 9.446531796477901e-06, + "loss": 0.3591, + "step": 2816 + }, + { + "epoch": 0.7085010060362174, + "grad_norm": 0.34915661811828613, + "learning_rate": 9.445862445485808e-06, + "loss": 0.4048, + "step": 2817 + }, + { + "epoch": 0.7087525150905433, + "grad_norm": 0.38304591178894043, + "learning_rate": 9.44519271373341e-06, + "loss": 0.3861, + "step": 2818 + }, + { + "epoch": 0.7090040241448692, + "grad_norm": 0.4027135968208313, + "learning_rate": 9.444522601278065e-06, + "loss": 0.4149, + "step": 2819 + }, + { + "epoch": 0.7092555331991952, + "grad_norm": 0.3455667793750763, + "learning_rate": 9.443852108177164e-06, + "loss": 0.3671, + "step": 2820 + }, + { + "epoch": 0.7095070422535211, + "grad_norm": 0.4052219092845917, + "learning_rate": 9.44318123448813e-06, + "loss": 0.3654, + "step": 2821 + }, + { + "epoch": 0.709758551307847, + "grad_norm": 0.36053210496902466, + "learning_rate": 9.44250998026842e-06, + "loss": 0.4084, + "step": 2822 + }, + { + "epoch": 0.7100100603621731, + "grad_norm": 0.3680732846260071, + "learning_rate": 9.441838345575523e-06, + "loss": 0.3848, + "step": 2823 + }, + { + "epoch": 0.710261569416499, + "grad_norm": 0.36795133352279663, + "learning_rate": 9.441166330466959e-06, + "loss": 0.3666, + "step": 2824 + }, + { + "epoch": 0.7105130784708249, + "grad_norm": 0.3703378438949585, + "learning_rate": 9.440493935000283e-06, + "loss": 0.3959, + "step": 2825 + }, + { + "epoch": 0.7107645875251509, + "grad_norm": 0.37262654304504395, + "learning_rate": 9.439821159233083e-06, + "loss": 0.3584, + "step": 2826 + }, + { + "epoch": 0.7110160965794768, + "grad_norm": 0.35950973629951477, + "learning_rate": 9.439148003222973e-06, + "loss": 0.373, + "step": 2827 + }, + { + "epoch": 0.7112676056338029, + "grad_norm": 0.3705592751502991, + "learning_rate": 9.43847446702761e-06, + "loss": 0.4044, + "step": 2828 + }, + { + "epoch": 0.7115191146881288, + "grad_norm": 0.3991115987300873, + "learning_rate": 9.437800550704674e-06, + "loss": 0.3979, + "step": 2829 + }, + { + "epoch": 0.7117706237424547, + "grad_norm": 0.37263038754463196, + "learning_rate": 9.437126254311886e-06, + "loss": 0.3845, + "step": 2830 + }, + { + "epoch": 0.7120221327967807, + "grad_norm": 0.389017790555954, + "learning_rate": 9.436451577906991e-06, + "loss": 0.392, + "step": 2831 + }, + { + "epoch": 0.7122736418511066, + "grad_norm": 0.33222588896751404, + "learning_rate": 9.435776521547772e-06, + "loss": 0.3847, + "step": 2832 + }, + { + "epoch": 0.7125251509054326, + "grad_norm": 0.342219740152359, + "learning_rate": 9.435101085292047e-06, + "loss": 0.3885, + "step": 2833 + }, + { + "epoch": 0.7127766599597586, + "grad_norm": 0.3644763231277466, + "learning_rate": 9.434425269197658e-06, + "loss": 0.3635, + "step": 2834 + }, + { + "epoch": 0.7130281690140845, + "grad_norm": 0.37079551815986633, + "learning_rate": 9.433749073322487e-06, + "loss": 0.4046, + "step": 2835 + }, + { + "epoch": 0.7132796780684104, + "grad_norm": 0.39011722803115845, + "learning_rate": 9.433072497724445e-06, + "loss": 0.3589, + "step": 2836 + }, + { + "epoch": 0.7135311871227364, + "grad_norm": 0.36313149333000183, + "learning_rate": 9.432395542461476e-06, + "loss": 0.3964, + "step": 2837 + }, + { + "epoch": 0.7137826961770624, + "grad_norm": 0.35538873076438904, + "learning_rate": 9.431718207591559e-06, + "loss": 0.3914, + "step": 2838 + }, + { + "epoch": 0.7140342052313883, + "grad_norm": 0.3488622307777405, + "learning_rate": 9.431040493172702e-06, + "loss": 0.3841, + "step": 2839 + }, + { + "epoch": 0.7142857142857143, + "grad_norm": 0.4144222140312195, + "learning_rate": 9.430362399262947e-06, + "loss": 0.3963, + "step": 2840 + }, + { + "epoch": 0.7145372233400402, + "grad_norm": 0.34440502524375916, + "learning_rate": 9.429683925920369e-06, + "loss": 0.3667, + "step": 2841 + }, + { + "epoch": 0.7147887323943662, + "grad_norm": 0.33740341663360596, + "learning_rate": 9.429005073203075e-06, + "loss": 0.3964, + "step": 2842 + }, + { + "epoch": 0.7150402414486922, + "grad_norm": 0.38044601678848267, + "learning_rate": 9.428325841169203e-06, + "loss": 0.4139, + "step": 2843 + }, + { + "epoch": 0.7152917505030181, + "grad_norm": 0.39153364300727844, + "learning_rate": 9.427646229876927e-06, + "loss": 0.3849, + "step": 2844 + }, + { + "epoch": 0.7155432595573441, + "grad_norm": 0.3845551908016205, + "learning_rate": 9.42696623938445e-06, + "loss": 0.4071, + "step": 2845 + }, + { + "epoch": 0.71579476861167, + "grad_norm": 0.32743361592292786, + "learning_rate": 9.426285869750012e-06, + "loss": 0.3933, + "step": 2846 + }, + { + "epoch": 0.7160462776659959, + "grad_norm": 0.48608964681625366, + "learning_rate": 9.425605121031878e-06, + "loss": 0.4126, + "step": 2847 + }, + { + "epoch": 0.716297786720322, + "grad_norm": 0.42301592230796814, + "learning_rate": 9.424923993288352e-06, + "loss": 0.3774, + "step": 2848 + }, + { + "epoch": 0.7165492957746479, + "grad_norm": 0.3551022708415985, + "learning_rate": 9.424242486577768e-06, + "loss": 0.3884, + "step": 2849 + }, + { + "epoch": 0.7168008048289738, + "grad_norm": 0.4087948799133301, + "learning_rate": 9.423560600958493e-06, + "loss": 0.3816, + "step": 2850 + }, + { + "epoch": 0.7170523138832998, + "grad_norm": 0.37452781200408936, + "learning_rate": 9.422878336488928e-06, + "loss": 0.3793, + "step": 2851 + }, + { + "epoch": 0.7173038229376257, + "grad_norm": 0.35576769709587097, + "learning_rate": 9.422195693227501e-06, + "loss": 0.3753, + "step": 2852 + }, + { + "epoch": 0.7175553319919518, + "grad_norm": 0.3551032841205597, + "learning_rate": 9.42151267123268e-06, + "loss": 0.3923, + "step": 2853 + }, + { + "epoch": 0.7178068410462777, + "grad_norm": 0.37760961055755615, + "learning_rate": 9.420829270562956e-06, + "loss": 0.4031, + "step": 2854 + }, + { + "epoch": 0.7180583501006036, + "grad_norm": 0.3662315011024475, + "learning_rate": 9.420145491276864e-06, + "loss": 0.3539, + "step": 2855 + }, + { + "epoch": 0.7183098591549296, + "grad_norm": 0.3553292751312256, + "learning_rate": 9.419461333432965e-06, + "loss": 0.3812, + "step": 2856 + }, + { + "epoch": 0.7185613682092555, + "grad_norm": 0.4659683108329773, + "learning_rate": 9.418776797089848e-06, + "loss": 0.3933, + "step": 2857 + }, + { + "epoch": 0.7188128772635815, + "grad_norm": 0.37304118275642395, + "learning_rate": 9.418091882306141e-06, + "loss": 0.3758, + "step": 2858 + }, + { + "epoch": 0.7190643863179075, + "grad_norm": 0.36666375398635864, + "learning_rate": 9.417406589140507e-06, + "loss": 0.4104, + "step": 2859 + }, + { + "epoch": 0.7193158953722334, + "grad_norm": 0.42461302876472473, + "learning_rate": 9.416720917651631e-06, + "loss": 0.3893, + "step": 2860 + }, + { + "epoch": 0.7195674044265593, + "grad_norm": 0.4254009425640106, + "learning_rate": 9.416034867898243e-06, + "loss": 0.3885, + "step": 2861 + }, + { + "epoch": 0.7198189134808853, + "grad_norm": 0.3596349060535431, + "learning_rate": 9.415348439939091e-06, + "loss": 0.3596, + "step": 2862 + }, + { + "epoch": 0.7200704225352113, + "grad_norm": 0.3977017104625702, + "learning_rate": 9.41466163383297e-06, + "loss": 0.3848, + "step": 2863 + }, + { + "epoch": 0.7203219315895373, + "grad_norm": 0.40862926840782166, + "learning_rate": 9.4139744496387e-06, + "loss": 0.4016, + "step": 2864 + }, + { + "epoch": 0.7205734406438632, + "grad_norm": 0.3802616596221924, + "learning_rate": 9.413286887415128e-06, + "loss": 0.3909, + "step": 2865 + }, + { + "epoch": 0.7208249496981891, + "grad_norm": 0.34951651096343994, + "learning_rate": 9.412598947221146e-06, + "loss": 0.3707, + "step": 2866 + }, + { + "epoch": 0.7210764587525151, + "grad_norm": 0.36474528908729553, + "learning_rate": 9.411910629115667e-06, + "loss": 0.4124, + "step": 2867 + }, + { + "epoch": 0.721327967806841, + "grad_norm": 0.3925243020057678, + "learning_rate": 9.411221933157646e-06, + "loss": 0.3872, + "step": 2868 + }, + { + "epoch": 0.721579476861167, + "grad_norm": 0.3429969251155853, + "learning_rate": 9.41053285940606e-06, + "loss": 0.3867, + "step": 2869 + }, + { + "epoch": 0.721830985915493, + "grad_norm": 0.35117414593696594, + "learning_rate": 9.409843407919929e-06, + "loss": 0.3853, + "step": 2870 + }, + { + "epoch": 0.7220824949698189, + "grad_norm": 0.35899850726127625, + "learning_rate": 9.409153578758298e-06, + "loss": 0.3932, + "step": 2871 + }, + { + "epoch": 0.7223340040241448, + "grad_norm": 0.37349846959114075, + "learning_rate": 9.408463371980248e-06, + "loss": 0.3779, + "step": 2872 + }, + { + "epoch": 0.7225855130784709, + "grad_norm": 0.3280394375324249, + "learning_rate": 9.407772787644887e-06, + "loss": 0.3697, + "step": 2873 + }, + { + "epoch": 0.7228370221327968, + "grad_norm": 0.36341550946235657, + "learning_rate": 9.407081825811362e-06, + "loss": 0.3828, + "step": 2874 + }, + { + "epoch": 0.7230885311871227, + "grad_norm": 0.36753761768341064, + "learning_rate": 9.40639048653885e-06, + "loss": 0.384, + "step": 2875 + }, + { + "epoch": 0.7233400402414487, + "grad_norm": 0.3492330014705658, + "learning_rate": 9.405698769886557e-06, + "loss": 0.3842, + "step": 2876 + }, + { + "epoch": 0.7235915492957746, + "grad_norm": 0.40695223212242126, + "learning_rate": 9.405006675913729e-06, + "loss": 0.4184, + "step": 2877 + }, + { + "epoch": 0.7238430583501007, + "grad_norm": 0.342252641916275, + "learning_rate": 9.404314204679636e-06, + "loss": 0.3739, + "step": 2878 + }, + { + "epoch": 0.7240945674044266, + "grad_norm": 0.39723700284957886, + "learning_rate": 9.403621356243584e-06, + "loss": 0.4288, + "step": 2879 + }, + { + "epoch": 0.7243460764587525, + "grad_norm": 0.3767862021923065, + "learning_rate": 9.402928130664913e-06, + "loss": 0.3883, + "step": 2880 + }, + { + "epoch": 0.7245975855130785, + "grad_norm": 0.37945619225502014, + "learning_rate": 9.402234528002991e-06, + "loss": 0.406, + "step": 2881 + }, + { + "epoch": 0.7248490945674044, + "grad_norm": 0.3551163375377655, + "learning_rate": 9.401540548317223e-06, + "loss": 0.3808, + "step": 2882 + }, + { + "epoch": 0.7251006036217303, + "grad_norm": 0.3756506145000458, + "learning_rate": 9.400846191667043e-06, + "loss": 0.3665, + "step": 2883 + }, + { + "epoch": 0.7253521126760564, + "grad_norm": 0.38300275802612305, + "learning_rate": 9.400151458111918e-06, + "loss": 0.3968, + "step": 2884 + }, + { + "epoch": 0.7256036217303823, + "grad_norm": 0.37950319051742554, + "learning_rate": 9.399456347711348e-06, + "loss": 0.3843, + "step": 2885 + }, + { + "epoch": 0.7258551307847082, + "grad_norm": 0.3413698673248291, + "learning_rate": 9.398760860524865e-06, + "loss": 0.3965, + "step": 2886 + }, + { + "epoch": 0.7261066398390342, + "grad_norm": 0.37162476778030396, + "learning_rate": 9.398064996612032e-06, + "loss": 0.3786, + "step": 2887 + }, + { + "epoch": 0.7263581488933601, + "grad_norm": 0.3777274787425995, + "learning_rate": 9.397368756032445e-06, + "loss": 0.4001, + "step": 2888 + }, + { + "epoch": 0.7266096579476862, + "grad_norm": 0.3456476032733917, + "learning_rate": 9.396672138845737e-06, + "loss": 0.3708, + "step": 2889 + }, + { + "epoch": 0.7268611670020121, + "grad_norm": 0.41536620259284973, + "learning_rate": 9.395975145111565e-06, + "loss": 0.3886, + "step": 2890 + }, + { + "epoch": 0.727112676056338, + "grad_norm": 0.3782283067703247, + "learning_rate": 9.395277774889621e-06, + "loss": 0.3711, + "step": 2891 + }, + { + "epoch": 0.727364185110664, + "grad_norm": 0.31303125619888306, + "learning_rate": 9.394580028239633e-06, + "loss": 0.366, + "step": 2892 + }, + { + "epoch": 0.72761569416499, + "grad_norm": 0.3784593641757965, + "learning_rate": 9.39388190522136e-06, + "loss": 0.4001, + "step": 2893 + }, + { + "epoch": 0.7278672032193159, + "grad_norm": 0.43708619475364685, + "learning_rate": 9.393183405894589e-06, + "loss": 0.3977, + "step": 2894 + }, + { + "epoch": 0.7281187122736419, + "grad_norm": 0.34523478150367737, + "learning_rate": 9.39248453031914e-06, + "loss": 0.4209, + "step": 2895 + }, + { + "epoch": 0.7283702213279678, + "grad_norm": 0.37533217668533325, + "learning_rate": 9.391785278554875e-06, + "loss": 0.3937, + "step": 2896 + }, + { + "epoch": 0.7286217303822937, + "grad_norm": 0.42460107803344727, + "learning_rate": 9.391085650661672e-06, + "loss": 0.3885, + "step": 2897 + }, + { + "epoch": 0.7288732394366197, + "grad_norm": 0.3811191916465759, + "learning_rate": 9.390385646699457e-06, + "loss": 0.3899, + "step": 2898 + }, + { + "epoch": 0.7291247484909457, + "grad_norm": 0.41132891178131104, + "learning_rate": 9.389685266728175e-06, + "loss": 0.3725, + "step": 2899 + }, + { + "epoch": 0.7293762575452716, + "grad_norm": 0.4103216230869293, + "learning_rate": 9.388984510807812e-06, + "loss": 0.3838, + "step": 2900 + }, + { + "epoch": 0.7296277665995976, + "grad_norm": 0.399691641330719, + "learning_rate": 9.388283378998382e-06, + "loss": 0.3846, + "step": 2901 + }, + { + "epoch": 0.7298792756539235, + "grad_norm": 0.4203183650970459, + "learning_rate": 9.387581871359936e-06, + "loss": 0.3557, + "step": 2902 + }, + { + "epoch": 0.7301307847082495, + "grad_norm": 0.3951526880264282, + "learning_rate": 9.386879987952549e-06, + "loss": 0.3983, + "step": 2903 + }, + { + "epoch": 0.7303822937625755, + "grad_norm": 0.3411506414413452, + "learning_rate": 9.386177728836337e-06, + "loss": 0.3993, + "step": 2904 + }, + { + "epoch": 0.7306338028169014, + "grad_norm": 0.3646346628665924, + "learning_rate": 9.385475094071442e-06, + "loss": 0.3693, + "step": 2905 + }, + { + "epoch": 0.7308853118712274, + "grad_norm": 0.39517220854759216, + "learning_rate": 9.384772083718042e-06, + "loss": 0.3887, + "step": 2906 + }, + { + "epoch": 0.7311368209255533, + "grad_norm": 0.36498531699180603, + "learning_rate": 9.384068697836342e-06, + "loss": 0.3819, + "step": 2907 + }, + { + "epoch": 0.7313883299798792, + "grad_norm": 0.36910420656204224, + "learning_rate": 9.383364936486585e-06, + "loss": 0.3941, + "step": 2908 + }, + { + "epoch": 0.7316398390342053, + "grad_norm": 0.3762173652648926, + "learning_rate": 9.382660799729044e-06, + "loss": 0.3762, + "step": 2909 + }, + { + "epoch": 0.7318913480885312, + "grad_norm": 0.40564772486686707, + "learning_rate": 9.381956287624024e-06, + "loss": 0.3928, + "step": 2910 + }, + { + "epoch": 0.7321428571428571, + "grad_norm": 0.354449063539505, + "learning_rate": 9.381251400231859e-06, + "loss": 0.4233, + "step": 2911 + }, + { + "epoch": 0.7323943661971831, + "grad_norm": 0.3970174789428711, + "learning_rate": 9.380546137612922e-06, + "loss": 0.4486, + "step": 2912 + }, + { + "epoch": 0.732645875251509, + "grad_norm": 0.3722879886627197, + "learning_rate": 9.379840499827612e-06, + "loss": 0.3981, + "step": 2913 + }, + { + "epoch": 0.7328973843058351, + "grad_norm": 0.3905206024646759, + "learning_rate": 9.379134486936366e-06, + "loss": 0.3734, + "step": 2914 + }, + { + "epoch": 0.733148893360161, + "grad_norm": 0.3671495020389557, + "learning_rate": 9.378428098999645e-06, + "loss": 0.3537, + "step": 2915 + }, + { + "epoch": 0.7334004024144869, + "grad_norm": 0.3310393691062927, + "learning_rate": 9.37772133607795e-06, + "loss": 0.3933, + "step": 2916 + }, + { + "epoch": 0.7336519114688129, + "grad_norm": 0.3759323060512543, + "learning_rate": 9.377014198231807e-06, + "loss": 0.3828, + "step": 2917 + }, + { + "epoch": 0.7339034205231388, + "grad_norm": 0.36852389574050903, + "learning_rate": 9.376306685521784e-06, + "loss": 0.3723, + "step": 2918 + }, + { + "epoch": 0.7341549295774648, + "grad_norm": 0.3421872854232788, + "learning_rate": 9.375598798008468e-06, + "loss": 0.3778, + "step": 2919 + }, + { + "epoch": 0.7344064386317908, + "grad_norm": 0.37425142526626587, + "learning_rate": 9.37489053575249e-06, + "loss": 0.3804, + "step": 2920 + }, + { + "epoch": 0.7346579476861167, + "grad_norm": 0.3890654444694519, + "learning_rate": 9.374181898814508e-06, + "loss": 0.3768, + "step": 2921 + }, + { + "epoch": 0.7349094567404426, + "grad_norm": 0.35745400190353394, + "learning_rate": 9.373472887255209e-06, + "loss": 0.3843, + "step": 2922 + }, + { + "epoch": 0.7351609657947686, + "grad_norm": 0.3883761465549469, + "learning_rate": 9.372763501135319e-06, + "loss": 0.3916, + "step": 2923 + }, + { + "epoch": 0.7354124748490946, + "grad_norm": 0.393292635679245, + "learning_rate": 9.37205374051559e-06, + "loss": 0.3977, + "step": 2924 + }, + { + "epoch": 0.7356639839034205, + "grad_norm": 0.34699979424476624, + "learning_rate": 9.37134360545681e-06, + "loss": 0.383, + "step": 2925 + }, + { + "epoch": 0.7359154929577465, + "grad_norm": 0.40010884404182434, + "learning_rate": 9.370633096019799e-06, + "loss": 0.3855, + "step": 2926 + }, + { + "epoch": 0.7361670020120724, + "grad_norm": 0.36946627497673035, + "learning_rate": 9.369922212265403e-06, + "loss": 0.3852, + "step": 2927 + }, + { + "epoch": 0.7364185110663984, + "grad_norm": 0.4117673635482788, + "learning_rate": 9.36921095425451e-06, + "loss": 0.3849, + "step": 2928 + }, + { + "epoch": 0.7366700201207244, + "grad_norm": 0.36381223797798157, + "learning_rate": 9.368499322048031e-06, + "loss": 0.3921, + "step": 2929 + }, + { + "epoch": 0.7369215291750503, + "grad_norm": 0.3884536027908325, + "learning_rate": 9.367787315706916e-06, + "loss": 0.3812, + "step": 2930 + }, + { + "epoch": 0.7371730382293763, + "grad_norm": 0.35373517870903015, + "learning_rate": 9.36707493529214e-06, + "loss": 0.3656, + "step": 2931 + }, + { + "epoch": 0.7374245472837022, + "grad_norm": 0.3612503111362457, + "learning_rate": 9.366362180864718e-06, + "loss": 0.3782, + "step": 2932 + }, + { + "epoch": 0.7376760563380281, + "grad_norm": 0.37395453453063965, + "learning_rate": 9.36564905248569e-06, + "loss": 0.389, + "step": 2933 + }, + { + "epoch": 0.7379275653923542, + "grad_norm": 0.3890708088874817, + "learning_rate": 9.364935550216133e-06, + "loss": 0.3922, + "step": 2934 + }, + { + "epoch": 0.7381790744466801, + "grad_norm": 0.38554948568344116, + "learning_rate": 9.364221674117151e-06, + "loss": 0.4182, + "step": 2935 + }, + { + "epoch": 0.738430583501006, + "grad_norm": 0.3420090079307556, + "learning_rate": 9.363507424249887e-06, + "loss": 0.4097, + "step": 2936 + }, + { + "epoch": 0.738682092555332, + "grad_norm": 0.3459598124027252, + "learning_rate": 9.362792800675511e-06, + "loss": 0.3826, + "step": 2937 + }, + { + "epoch": 0.7389336016096579, + "grad_norm": 0.35301342606544495, + "learning_rate": 9.362077803455223e-06, + "loss": 0.3986, + "step": 2938 + }, + { + "epoch": 0.739185110663984, + "grad_norm": 0.37958618998527527, + "learning_rate": 9.361362432650261e-06, + "loss": 0.4021, + "step": 2939 + }, + { + "epoch": 0.7394366197183099, + "grad_norm": 0.362627238035202, + "learning_rate": 9.360646688321891e-06, + "loss": 0.3895, + "step": 2940 + }, + { + "epoch": 0.7396881287726358, + "grad_norm": 0.37005671858787537, + "learning_rate": 9.359930570531412e-06, + "loss": 0.3918, + "step": 2941 + }, + { + "epoch": 0.7399396378269618, + "grad_norm": 0.34973013401031494, + "learning_rate": 9.359214079340158e-06, + "loss": 0.3559, + "step": 2942 + }, + { + "epoch": 0.7401911468812877, + "grad_norm": 0.4032422602176666, + "learning_rate": 9.358497214809485e-06, + "loss": 0.3907, + "step": 2943 + }, + { + "epoch": 0.7404426559356136, + "grad_norm": 0.36071479320526123, + "learning_rate": 9.357779977000796e-06, + "loss": 0.4044, + "step": 2944 + }, + { + "epoch": 0.7406941649899397, + "grad_norm": 0.4488549828529358, + "learning_rate": 9.357062365975511e-06, + "loss": 0.3727, + "step": 2945 + }, + { + "epoch": 0.7409456740442656, + "grad_norm": 0.37173953652381897, + "learning_rate": 9.356344381795094e-06, + "loss": 0.3777, + "step": 2946 + }, + { + "epoch": 0.7411971830985915, + "grad_norm": 0.3299272954463959, + "learning_rate": 9.355626024521035e-06, + "loss": 0.3997, + "step": 2947 + }, + { + "epoch": 0.7414486921529175, + "grad_norm": 0.33907902240753174, + "learning_rate": 9.354907294214853e-06, + "loss": 0.3877, + "step": 2948 + }, + { + "epoch": 0.7417002012072434, + "grad_norm": 0.3761814832687378, + "learning_rate": 9.354188190938108e-06, + "loss": 0.3763, + "step": 2949 + }, + { + "epoch": 0.7419517102615694, + "grad_norm": 0.37312766909599304, + "learning_rate": 9.353468714752381e-06, + "loss": 0.3922, + "step": 2950 + }, + { + "epoch": 0.7422032193158954, + "grad_norm": 0.3855811059474945, + "learning_rate": 9.352748865719296e-06, + "loss": 0.3519, + "step": 2951 + }, + { + "epoch": 0.7424547283702213, + "grad_norm": 0.3510308861732483, + "learning_rate": 9.352028643900502e-06, + "loss": 0.3622, + "step": 2952 + }, + { + "epoch": 0.7427062374245473, + "grad_norm": 0.36363258957862854, + "learning_rate": 9.351308049357679e-06, + "loss": 0.403, + "step": 2953 + }, + { + "epoch": 0.7429577464788732, + "grad_norm": 0.35569775104522705, + "learning_rate": 9.350587082152544e-06, + "loss": 0.3811, + "step": 2954 + }, + { + "epoch": 0.7432092555331992, + "grad_norm": 0.32615721225738525, + "learning_rate": 9.349865742346842e-06, + "loss": 0.3984, + "step": 2955 + }, + { + "epoch": 0.7434607645875252, + "grad_norm": 0.3774353563785553, + "learning_rate": 9.349144030002353e-06, + "loss": 0.379, + "step": 2956 + }, + { + "epoch": 0.7437122736418511, + "grad_norm": 0.350462943315506, + "learning_rate": 9.348421945180885e-06, + "loss": 0.3865, + "step": 2957 + }, + { + "epoch": 0.743963782696177, + "grad_norm": 0.36871710419654846, + "learning_rate": 9.347699487944282e-06, + "loss": 0.3823, + "step": 2958 + }, + { + "epoch": 0.744215291750503, + "grad_norm": 0.35802212357521057, + "learning_rate": 9.346976658354417e-06, + "loss": 0.3924, + "step": 2959 + }, + { + "epoch": 0.744466800804829, + "grad_norm": 0.3417361080646515, + "learning_rate": 9.346253456473196e-06, + "loss": 0.3814, + "step": 2960 + }, + { + "epoch": 0.7447183098591549, + "grad_norm": 0.38555172085762024, + "learning_rate": 9.345529882362554e-06, + "loss": 0.3986, + "step": 2961 + }, + { + "epoch": 0.7449698189134809, + "grad_norm": 0.35595080256462097, + "learning_rate": 9.344805936084466e-06, + "loss": 0.3729, + "step": 2962 + }, + { + "epoch": 0.7452213279678068, + "grad_norm": 0.36647212505340576, + "learning_rate": 9.344081617700929e-06, + "loss": 0.3817, + "step": 2963 + }, + { + "epoch": 0.7454728370221329, + "grad_norm": 0.38703134655952454, + "learning_rate": 9.343356927273978e-06, + "loss": 0.3971, + "step": 2964 + }, + { + "epoch": 0.7457243460764588, + "grad_norm": 0.3738643229007721, + "learning_rate": 9.342631864865678e-06, + "loss": 0.3807, + "step": 2965 + }, + { + "epoch": 0.7459758551307847, + "grad_norm": 0.3684743046760559, + "learning_rate": 9.341906430538129e-06, + "loss": 0.3829, + "step": 2966 + }, + { + "epoch": 0.7462273641851107, + "grad_norm": 0.37854307889938354, + "learning_rate": 9.341180624353454e-06, + "loss": 0.3737, + "step": 2967 + }, + { + "epoch": 0.7464788732394366, + "grad_norm": 0.3924698829650879, + "learning_rate": 9.34045444637382e-06, + "loss": 0.3751, + "step": 2968 + }, + { + "epoch": 0.7467303822937625, + "grad_norm": 0.3787640333175659, + "learning_rate": 9.339727896661413e-06, + "loss": 0.3736, + "step": 2969 + }, + { + "epoch": 0.7469818913480886, + "grad_norm": 0.351458877325058, + "learning_rate": 9.339000975278463e-06, + "loss": 0.3986, + "step": 2970 + }, + { + "epoch": 0.7472334004024145, + "grad_norm": 0.3286757469177246, + "learning_rate": 9.338273682287222e-06, + "loss": 0.3831, + "step": 2971 + }, + { + "epoch": 0.7474849094567404, + "grad_norm": 0.3744126558303833, + "learning_rate": 9.337546017749981e-06, + "loss": 0.3868, + "step": 2972 + }, + { + "epoch": 0.7477364185110664, + "grad_norm": 0.3369508981704712, + "learning_rate": 9.33681798172906e-06, + "loss": 0.3785, + "step": 2973 + }, + { + "epoch": 0.7479879275653923, + "grad_norm": 0.3627220690250397, + "learning_rate": 9.33608957428681e-06, + "loss": 0.3718, + "step": 2974 + }, + { + "epoch": 0.7482394366197183, + "grad_norm": 0.36162272095680237, + "learning_rate": 9.335360795485615e-06, + "loss": 0.3863, + "step": 2975 + }, + { + "epoch": 0.7484909456740443, + "grad_norm": 0.3735608160495758, + "learning_rate": 9.334631645387888e-06, + "loss": 0.3625, + "step": 2976 + }, + { + "epoch": 0.7487424547283702, + "grad_norm": 0.34616851806640625, + "learning_rate": 9.33390212405608e-06, + "loss": 0.3572, + "step": 2977 + }, + { + "epoch": 0.7489939637826962, + "grad_norm": 0.35577431321144104, + "learning_rate": 9.333172231552666e-06, + "loss": 0.3677, + "step": 2978 + }, + { + "epoch": 0.7492454728370221, + "grad_norm": 0.3937419652938843, + "learning_rate": 9.332441967940161e-06, + "loss": 0.3848, + "step": 2979 + }, + { + "epoch": 0.7494969818913481, + "grad_norm": 0.3335328996181488, + "learning_rate": 9.331711333281101e-06, + "loss": 0.3998, + "step": 2980 + }, + { + "epoch": 0.7497484909456741, + "grad_norm": 0.37114816904067993, + "learning_rate": 9.330980327638068e-06, + "loss": 0.3803, + "step": 2981 + }, + { + "epoch": 0.75, + "grad_norm": 0.4506068229675293, + "learning_rate": 9.330248951073664e-06, + "loss": 0.4137, + "step": 2982 + }, + { + "epoch": 0.7502515090543259, + "grad_norm": 0.3454316556453705, + "learning_rate": 9.329517203650526e-06, + "loss": 0.3696, + "step": 2983 + }, + { + "epoch": 0.7505030181086519, + "grad_norm": 0.38480788469314575, + "learning_rate": 9.328785085431326e-06, + "loss": 0.3937, + "step": 2984 + }, + { + "epoch": 0.7507545271629779, + "grad_norm": 0.4474145472049713, + "learning_rate": 9.328052596478763e-06, + "loss": 0.3948, + "step": 2985 + }, + { + "epoch": 0.7510060362173038, + "grad_norm": 0.3649021089076996, + "learning_rate": 9.327319736855574e-06, + "loss": 0.3645, + "step": 2986 + }, + { + "epoch": 0.7512575452716298, + "grad_norm": 0.37888169288635254, + "learning_rate": 9.326586506624517e-06, + "loss": 0.3737, + "step": 2987 + }, + { + "epoch": 0.7515090543259557, + "grad_norm": 0.38091379404067993, + "learning_rate": 9.325852905848396e-06, + "loss": 0.421, + "step": 2988 + }, + { + "epoch": 0.7517605633802817, + "grad_norm": 0.38274237513542175, + "learning_rate": 9.325118934590036e-06, + "loss": 0.3658, + "step": 2989 + }, + { + "epoch": 0.7520120724346077, + "grad_norm": 0.38675278425216675, + "learning_rate": 9.324384592912295e-06, + "loss": 0.3835, + "step": 2990 + }, + { + "epoch": 0.7522635814889336, + "grad_norm": 0.3240179717540741, + "learning_rate": 9.323649880878069e-06, + "loss": 0.3706, + "step": 2991 + }, + { + "epoch": 0.7525150905432596, + "grad_norm": 0.4843953549861908, + "learning_rate": 9.322914798550277e-06, + "loss": 0.4072, + "step": 2992 + }, + { + "epoch": 0.7527665995975855, + "grad_norm": 0.40057262778282166, + "learning_rate": 9.32217934599188e-06, + "loss": 0.3835, + "step": 2993 + }, + { + "epoch": 0.7530181086519114, + "grad_norm": 0.3491225838661194, + "learning_rate": 9.321443523265858e-06, + "loss": 0.3848, + "step": 2994 + }, + { + "epoch": 0.7532696177062375, + "grad_norm": 0.4448799192905426, + "learning_rate": 9.320707330435235e-06, + "loss": 0.3585, + "step": 2995 + }, + { + "epoch": 0.7535211267605634, + "grad_norm": 0.3571915328502655, + "learning_rate": 9.319970767563061e-06, + "loss": 0.4124, + "step": 2996 + }, + { + "epoch": 0.7537726358148893, + "grad_norm": 0.3818497657775879, + "learning_rate": 9.319233834712413e-06, + "loss": 0.3861, + "step": 2997 + }, + { + "epoch": 0.7540241448692153, + "grad_norm": 0.4860544204711914, + "learning_rate": 9.318496531946411e-06, + "loss": 0.3887, + "step": 2998 + }, + { + "epoch": 0.7542756539235412, + "grad_norm": 0.3781794011592865, + "learning_rate": 9.317758859328194e-06, + "loss": 0.3826, + "step": 2999 + }, + { + "epoch": 0.7545271629778671, + "grad_norm": 0.3716043531894684, + "learning_rate": 9.317020816920945e-06, + "loss": 0.365, + "step": 3000 + }, + { + "epoch": 0.7547786720321932, + "grad_norm": 0.39104846119880676, + "learning_rate": 9.31628240478787e-06, + "loss": 0.3894, + "step": 3001 + }, + { + "epoch": 0.7550301810865191, + "grad_norm": 0.3662312924861908, + "learning_rate": 9.31554362299221e-06, + "loss": 0.3853, + "step": 3002 + }, + { + "epoch": 0.7552816901408451, + "grad_norm": 0.4046803414821625, + "learning_rate": 9.314804471597235e-06, + "loss": 0.3735, + "step": 3003 + }, + { + "epoch": 0.755533199195171, + "grad_norm": 0.36208009719848633, + "learning_rate": 9.314064950666252e-06, + "loss": 0.4042, + "step": 3004 + }, + { + "epoch": 0.755784708249497, + "grad_norm": 0.33379390835762024, + "learning_rate": 9.313325060262594e-06, + "loss": 0.3717, + "step": 3005 + }, + { + "epoch": 0.756036217303823, + "grad_norm": 0.3620927035808563, + "learning_rate": 9.312584800449629e-06, + "loss": 0.4094, + "step": 3006 + }, + { + "epoch": 0.7562877263581489, + "grad_norm": 0.3750810921192169, + "learning_rate": 9.311844171290755e-06, + "loss": 0.3692, + "step": 3007 + }, + { + "epoch": 0.7565392354124748, + "grad_norm": 0.35461145639419556, + "learning_rate": 9.311103172849404e-06, + "loss": 0.4094, + "step": 3008 + }, + { + "epoch": 0.7567907444668008, + "grad_norm": 0.37598538398742676, + "learning_rate": 9.310361805189033e-06, + "loss": 0.3806, + "step": 3009 + }, + { + "epoch": 0.7570422535211268, + "grad_norm": 0.4294974207878113, + "learning_rate": 9.309620068373143e-06, + "loss": 0.4, + "step": 3010 + }, + { + "epoch": 0.7572937625754527, + "grad_norm": 0.3354276716709137, + "learning_rate": 9.308877962465251e-06, + "loss": 0.3908, + "step": 3011 + }, + { + "epoch": 0.7575452716297787, + "grad_norm": 0.38359084725379944, + "learning_rate": 9.308135487528919e-06, + "loss": 0.3721, + "step": 3012 + }, + { + "epoch": 0.7577967806841046, + "grad_norm": 0.37038716673851013, + "learning_rate": 9.307392643627736e-06, + "loss": 0.3704, + "step": 3013 + }, + { + "epoch": 0.7580482897384306, + "grad_norm": 0.34647151827812195, + "learning_rate": 9.306649430825318e-06, + "loss": 0.3985, + "step": 3014 + }, + { + "epoch": 0.7582997987927566, + "grad_norm": 0.3945516347885132, + "learning_rate": 9.30590584918532e-06, + "loss": 0.3832, + "step": 3015 + }, + { + "epoch": 0.7585513078470825, + "grad_norm": 0.3935578763484955, + "learning_rate": 9.305161898771422e-06, + "loss": 0.3768, + "step": 3016 + }, + { + "epoch": 0.7588028169014085, + "grad_norm": 0.37610089778900146, + "learning_rate": 9.304417579647343e-06, + "loss": 0.3763, + "step": 3017 + }, + { + "epoch": 0.7590543259557344, + "grad_norm": 0.3979266881942749, + "learning_rate": 9.303672891876825e-06, + "loss": 0.4074, + "step": 3018 + }, + { + "epoch": 0.7593058350100603, + "grad_norm": 0.36932116746902466, + "learning_rate": 9.302927835523647e-06, + "loss": 0.4137, + "step": 3019 + }, + { + "epoch": 0.7595573440643864, + "grad_norm": 0.3783572018146515, + "learning_rate": 9.302182410651618e-06, + "loss": 0.3826, + "step": 3020 + }, + { + "epoch": 0.7598088531187123, + "grad_norm": 0.34661227464675903, + "learning_rate": 9.301436617324584e-06, + "loss": 0.3745, + "step": 3021 + }, + { + "epoch": 0.7600603621730382, + "grad_norm": 0.3569898307323456, + "learning_rate": 9.30069045560641e-06, + "loss": 0.3919, + "step": 3022 + }, + { + "epoch": 0.7603118712273642, + "grad_norm": 0.43195685744285583, + "learning_rate": 9.299943925561004e-06, + "loss": 0.3983, + "step": 3023 + }, + { + "epoch": 0.7605633802816901, + "grad_norm": 0.35552507638931274, + "learning_rate": 9.299197027252302e-06, + "loss": 0.3962, + "step": 3024 + }, + { + "epoch": 0.760814889336016, + "grad_norm": 0.3416507840156555, + "learning_rate": 9.29844976074427e-06, + "loss": 0.3906, + "step": 3025 + }, + { + "epoch": 0.7610663983903421, + "grad_norm": 0.3881553113460541, + "learning_rate": 9.297702126100906e-06, + "loss": 0.3863, + "step": 3026 + }, + { + "epoch": 0.761317907444668, + "grad_norm": 0.3564288914203644, + "learning_rate": 9.296954123386243e-06, + "loss": 0.364, + "step": 3027 + }, + { + "epoch": 0.761569416498994, + "grad_norm": 0.3937084674835205, + "learning_rate": 9.29620575266434e-06, + "loss": 0.3769, + "step": 3028 + }, + { + "epoch": 0.7618209255533199, + "grad_norm": 0.3818730115890503, + "learning_rate": 9.295457013999291e-06, + "loss": 0.3923, + "step": 3029 + }, + { + "epoch": 0.7620724346076458, + "grad_norm": 0.4041476547718048, + "learning_rate": 9.294707907455223e-06, + "loss": 0.3904, + "step": 3030 + }, + { + "epoch": 0.7623239436619719, + "grad_norm": 0.36864545941352844, + "learning_rate": 9.293958433096289e-06, + "loss": 0.419, + "step": 3031 + }, + { + "epoch": 0.7625754527162978, + "grad_norm": 0.37678250670433044, + "learning_rate": 9.293208590986676e-06, + "loss": 0.389, + "step": 3032 + }, + { + "epoch": 0.7628269617706237, + "grad_norm": 0.35496804118156433, + "learning_rate": 9.292458381190608e-06, + "loss": 0.3996, + "step": 3033 + }, + { + "epoch": 0.7630784708249497, + "grad_norm": 0.3487403392791748, + "learning_rate": 9.291707803772332e-06, + "loss": 0.3565, + "step": 3034 + }, + { + "epoch": 0.7633299798792756, + "grad_norm": 0.3654000461101532, + "learning_rate": 9.290956858796132e-06, + "loss": 0.3849, + "step": 3035 + }, + { + "epoch": 0.7635814889336016, + "grad_norm": 0.3704080879688263, + "learning_rate": 9.29020554632632e-06, + "loss": 0.37, + "step": 3036 + }, + { + "epoch": 0.7638329979879276, + "grad_norm": 0.3763080835342407, + "learning_rate": 9.289453866427245e-06, + "loss": 0.4109, + "step": 3037 + }, + { + "epoch": 0.7640845070422535, + "grad_norm": 0.38172775506973267, + "learning_rate": 9.288701819163279e-06, + "loss": 0.3791, + "step": 3038 + }, + { + "epoch": 0.7643360160965795, + "grad_norm": 0.34394824504852295, + "learning_rate": 9.287949404598833e-06, + "loss": 0.3799, + "step": 3039 + }, + { + "epoch": 0.7645875251509054, + "grad_norm": 0.34752610325813293, + "learning_rate": 9.287196622798346e-06, + "loss": 0.374, + "step": 3040 + }, + { + "epoch": 0.7648390342052314, + "grad_norm": 0.34505367279052734, + "learning_rate": 9.286443473826288e-06, + "loss": 0.3854, + "step": 3041 + }, + { + "epoch": 0.7650905432595574, + "grad_norm": 0.3505760431289673, + "learning_rate": 9.285689957747163e-06, + "loss": 0.3881, + "step": 3042 + }, + { + "epoch": 0.7653420523138833, + "grad_norm": 0.36895132064819336, + "learning_rate": 9.284936074625503e-06, + "loss": 0.3623, + "step": 3043 + }, + { + "epoch": 0.7655935613682092, + "grad_norm": 0.3329405188560486, + "learning_rate": 9.284181824525877e-06, + "loss": 0.3676, + "step": 3044 + }, + { + "epoch": 0.7658450704225352, + "grad_norm": 0.3532065749168396, + "learning_rate": 9.283427207512878e-06, + "loss": 0.3853, + "step": 3045 + }, + { + "epoch": 0.7660965794768612, + "grad_norm": 0.3908108174800873, + "learning_rate": 9.282672223651137e-06, + "loss": 0.3951, + "step": 3046 + }, + { + "epoch": 0.7663480885311871, + "grad_norm": 0.34189653396606445, + "learning_rate": 9.28191687300531e-06, + "loss": 0.3953, + "step": 3047 + }, + { + "epoch": 0.7665995975855131, + "grad_norm": 0.357767790555954, + "learning_rate": 9.281161155640093e-06, + "loss": 0.402, + "step": 3048 + }, + { + "epoch": 0.766851106639839, + "grad_norm": 0.361261248588562, + "learning_rate": 9.280405071620204e-06, + "loss": 0.3652, + "step": 3049 + }, + { + "epoch": 0.7671026156941649, + "grad_norm": 0.34182727336883545, + "learning_rate": 9.2796486210104e-06, + "loss": 0.3679, + "step": 3050 + }, + { + "epoch": 0.767354124748491, + "grad_norm": 0.3388752341270447, + "learning_rate": 9.278891803875466e-06, + "loss": 0.4031, + "step": 3051 + }, + { + "epoch": 0.7676056338028169, + "grad_norm": 0.3388610780239105, + "learning_rate": 9.278134620280215e-06, + "loss": 0.3881, + "step": 3052 + }, + { + "epoch": 0.7678571428571429, + "grad_norm": 0.32106414437294006, + "learning_rate": 9.277377070289498e-06, + "loss": 0.3708, + "step": 3053 + }, + { + "epoch": 0.7681086519114688, + "grad_norm": 0.402169406414032, + "learning_rate": 9.276619153968197e-06, + "loss": 0.3991, + "step": 3054 + }, + { + "epoch": 0.7683601609657947, + "grad_norm": 0.32417601346969604, + "learning_rate": 9.275860871381217e-06, + "loss": 0.3865, + "step": 3055 + }, + { + "epoch": 0.7686116700201208, + "grad_norm": 0.3850818872451782, + "learning_rate": 9.275102222593503e-06, + "loss": 0.3899, + "step": 3056 + }, + { + "epoch": 0.7688631790744467, + "grad_norm": 0.36075320839881897, + "learning_rate": 9.27434320767003e-06, + "loss": 0.3824, + "step": 3057 + }, + { + "epoch": 0.7691146881287726, + "grad_norm": 0.37227320671081543, + "learning_rate": 9.2735838266758e-06, + "loss": 0.3926, + "step": 3058 + }, + { + "epoch": 0.7693661971830986, + "grad_norm": 0.395997554063797, + "learning_rate": 9.272824079675854e-06, + "loss": 0.3692, + "step": 3059 + }, + { + "epoch": 0.7696177062374245, + "grad_norm": 0.3681703507900238, + "learning_rate": 9.272063966735253e-06, + "loss": 0.4038, + "step": 3060 + }, + { + "epoch": 0.7698692152917505, + "grad_norm": 0.3632868230342865, + "learning_rate": 9.2713034879191e-06, + "loss": 0.4239, + "step": 3061 + }, + { + "epoch": 0.7701207243460765, + "grad_norm": 0.38830870389938354, + "learning_rate": 9.270542643292523e-06, + "loss": 0.3763, + "step": 3062 + }, + { + "epoch": 0.7703722334004024, + "grad_norm": 0.35282665491104126, + "learning_rate": 9.269781432920688e-06, + "loss": 0.3827, + "step": 3063 + }, + { + "epoch": 0.7706237424547284, + "grad_norm": 0.3645845949649811, + "learning_rate": 9.269019856868784e-06, + "loss": 0.3984, + "step": 3064 + }, + { + "epoch": 0.7708752515090543, + "grad_norm": 0.35338932275772095, + "learning_rate": 9.268257915202037e-06, + "loss": 0.3992, + "step": 3065 + }, + { + "epoch": 0.7711267605633803, + "grad_norm": 0.3768799304962158, + "learning_rate": 9.2674956079857e-06, + "loss": 0.3858, + "step": 3066 + }, + { + "epoch": 0.7713782696177063, + "grad_norm": 0.3796723484992981, + "learning_rate": 9.26673293528506e-06, + "loss": 0.3858, + "step": 3067 + }, + { + "epoch": 0.7716297786720322, + "grad_norm": 0.3658340275287628, + "learning_rate": 9.26596989716544e-06, + "loss": 0.3844, + "step": 3068 + }, + { + "epoch": 0.7718812877263581, + "grad_norm": 0.41284385323524475, + "learning_rate": 9.265206493692185e-06, + "loss": 0.3697, + "step": 3069 + }, + { + "epoch": 0.7721327967806841, + "grad_norm": 0.4010927379131317, + "learning_rate": 9.264442724930675e-06, + "loss": 0.3509, + "step": 3070 + }, + { + "epoch": 0.77238430583501, + "grad_norm": 0.37611401081085205, + "learning_rate": 9.263678590946326e-06, + "loss": 0.3877, + "step": 3071 + }, + { + "epoch": 0.772635814889336, + "grad_norm": 0.4208700656890869, + "learning_rate": 9.26291409180458e-06, + "loss": 0.3599, + "step": 3072 + }, + { + "epoch": 0.772887323943662, + "grad_norm": 0.3623879849910736, + "learning_rate": 9.262149227570908e-06, + "loss": 0.4031, + "step": 3073 + }, + { + "epoch": 0.7731388329979879, + "grad_norm": 0.3639488220214844, + "learning_rate": 9.261383998310822e-06, + "loss": 0.3963, + "step": 3074 + }, + { + "epoch": 0.7733903420523138, + "grad_norm": 0.4167322814464569, + "learning_rate": 9.260618404089853e-06, + "loss": 0.376, + "step": 3075 + }, + { + "epoch": 0.7736418511066399, + "grad_norm": 0.33790600299835205, + "learning_rate": 9.259852444973573e-06, + "loss": 0.3803, + "step": 3076 + }, + { + "epoch": 0.7738933601609658, + "grad_norm": 0.4101136028766632, + "learning_rate": 9.25908612102758e-06, + "loss": 0.3665, + "step": 3077 + }, + { + "epoch": 0.7741448692152918, + "grad_norm": 0.3587418794631958, + "learning_rate": 9.258319432317506e-06, + "loss": 0.3875, + "step": 3078 + }, + { + "epoch": 0.7743963782696177, + "grad_norm": 0.36111775040626526, + "learning_rate": 9.257552378909013e-06, + "loss": 0.3797, + "step": 3079 + }, + { + "epoch": 0.7746478873239436, + "grad_norm": 0.4025454521179199, + "learning_rate": 9.256784960867793e-06, + "loss": 0.3968, + "step": 3080 + }, + { + "epoch": 0.7748993963782697, + "grad_norm": 0.37682798504829407, + "learning_rate": 9.256017178259572e-06, + "loss": 0.3583, + "step": 3081 + }, + { + "epoch": 0.7751509054325956, + "grad_norm": 0.40718135237693787, + "learning_rate": 9.255249031150106e-06, + "loss": 0.4151, + "step": 3082 + }, + { + "epoch": 0.7754024144869215, + "grad_norm": 0.45130354166030884, + "learning_rate": 9.25448051960518e-06, + "loss": 0.3853, + "step": 3083 + }, + { + "epoch": 0.7756539235412475, + "grad_norm": 0.3748539090156555, + "learning_rate": 9.253711643690612e-06, + "loss": 0.3902, + "step": 3084 + }, + { + "epoch": 0.7759054325955734, + "grad_norm": 0.4091399610042572, + "learning_rate": 9.252942403472256e-06, + "loss": 0.3685, + "step": 3085 + }, + { + "epoch": 0.7761569416498993, + "grad_norm": 0.5184744000434875, + "learning_rate": 9.252172799015989e-06, + "loss": 0.4028, + "step": 3086 + }, + { + "epoch": 0.7764084507042254, + "grad_norm": 0.3658330738544464, + "learning_rate": 9.251402830387721e-06, + "loss": 0.398, + "step": 3087 + }, + { + "epoch": 0.7766599597585513, + "grad_norm": 0.4008294641971588, + "learning_rate": 9.250632497653398e-06, + "loss": 0.392, + "step": 3088 + }, + { + "epoch": 0.7769114688128773, + "grad_norm": 0.42256030440330505, + "learning_rate": 9.249861800878995e-06, + "loss": 0.3954, + "step": 3089 + }, + { + "epoch": 0.7771629778672032, + "grad_norm": 0.38519713282585144, + "learning_rate": 9.249090740130515e-06, + "loss": 0.3897, + "step": 3090 + }, + { + "epoch": 0.7774144869215291, + "grad_norm": 0.41861796379089355, + "learning_rate": 9.248319315473995e-06, + "loss": 0.3863, + "step": 3091 + }, + { + "epoch": 0.7776659959758552, + "grad_norm": 0.41915813088417053, + "learning_rate": 9.247547526975505e-06, + "loss": 0.4048, + "step": 3092 + }, + { + "epoch": 0.7779175050301811, + "grad_norm": 0.34398457407951355, + "learning_rate": 9.246775374701139e-06, + "loss": 0.3496, + "step": 3093 + }, + { + "epoch": 0.778169014084507, + "grad_norm": 0.4545997679233551, + "learning_rate": 9.246002858717031e-06, + "loss": 0.3996, + "step": 3094 + }, + { + "epoch": 0.778420523138833, + "grad_norm": 0.34864526987075806, + "learning_rate": 9.245229979089341e-06, + "loss": 0.3708, + "step": 3095 + }, + { + "epoch": 0.778672032193159, + "grad_norm": 0.32429444789886475, + "learning_rate": 9.244456735884261e-06, + "loss": 0.3773, + "step": 3096 + }, + { + "epoch": 0.7789235412474849, + "grad_norm": 0.3772585093975067, + "learning_rate": 9.243683129168016e-06, + "loss": 0.3615, + "step": 3097 + }, + { + "epoch": 0.7791750503018109, + "grad_norm": 0.4083939492702484, + "learning_rate": 9.242909159006858e-06, + "loss": 0.4064, + "step": 3098 + }, + { + "epoch": 0.7794265593561368, + "grad_norm": 0.3335086703300476, + "learning_rate": 9.242134825467076e-06, + "loss": 0.346, + "step": 3099 + }, + { + "epoch": 0.7796780684104627, + "grad_norm": 0.3571106493473053, + "learning_rate": 9.241360128614984e-06, + "loss": 0.3438, + "step": 3100 + }, + { + "epoch": 0.7799295774647887, + "grad_norm": 0.3634457588195801, + "learning_rate": 9.24058506851693e-06, + "loss": 0.3714, + "step": 3101 + }, + { + "epoch": 0.7801810865191147, + "grad_norm": 0.36663031578063965, + "learning_rate": 9.239809645239295e-06, + "loss": 0.4045, + "step": 3102 + }, + { + "epoch": 0.7804325955734407, + "grad_norm": 0.35045841336250305, + "learning_rate": 9.239033858848487e-06, + "loss": 0.3919, + "step": 3103 + }, + { + "epoch": 0.7806841046277666, + "grad_norm": 0.4044874310493469, + "learning_rate": 9.238257709410949e-06, + "loss": 0.3807, + "step": 3104 + }, + { + "epoch": 0.7809356136820925, + "grad_norm": 0.3510379493236542, + "learning_rate": 9.237481196993152e-06, + "loss": 0.3711, + "step": 3105 + }, + { + "epoch": 0.7811871227364185, + "grad_norm": 0.39423373341560364, + "learning_rate": 9.2367043216616e-06, + "loss": 0.3912, + "step": 3106 + }, + { + "epoch": 0.7814386317907445, + "grad_norm": 0.3713517189025879, + "learning_rate": 9.23592708348283e-06, + "loss": 0.3774, + "step": 3107 + }, + { + "epoch": 0.7816901408450704, + "grad_norm": 0.3517390787601471, + "learning_rate": 9.235149482523402e-06, + "loss": 0.3597, + "step": 3108 + }, + { + "epoch": 0.7819416498993964, + "grad_norm": 0.38510939478874207, + "learning_rate": 9.234371518849918e-06, + "loss": 0.3465, + "step": 3109 + }, + { + "epoch": 0.7821931589537223, + "grad_norm": 0.3652108609676361, + "learning_rate": 9.233593192529002e-06, + "loss": 0.3745, + "step": 3110 + }, + { + "epoch": 0.7824446680080482, + "grad_norm": 0.39064309000968933, + "learning_rate": 9.232814503627316e-06, + "loss": 0.3825, + "step": 3111 + }, + { + "epoch": 0.7826961770623743, + "grad_norm": 0.34675848484039307, + "learning_rate": 9.232035452211546e-06, + "loss": 0.3928, + "step": 3112 + }, + { + "epoch": 0.7829476861167002, + "grad_norm": 0.36757782101631165, + "learning_rate": 9.231256038348418e-06, + "loss": 0.3966, + "step": 3113 + }, + { + "epoch": 0.7831991951710262, + "grad_norm": 0.36838653683662415, + "learning_rate": 9.230476262104678e-06, + "loss": 0.3835, + "step": 3114 + }, + { + "epoch": 0.7834507042253521, + "grad_norm": 0.33910053968429565, + "learning_rate": 9.229696123547114e-06, + "loss": 0.4, + "step": 3115 + }, + { + "epoch": 0.783702213279678, + "grad_norm": 0.3448108434677124, + "learning_rate": 9.228915622742536e-06, + "loss": 0.3819, + "step": 3116 + }, + { + "epoch": 0.7839537223340041, + "grad_norm": 0.3653436303138733, + "learning_rate": 9.228134759757791e-06, + "loss": 0.3806, + "step": 3117 + }, + { + "epoch": 0.78420523138833, + "grad_norm": 0.33516430854797363, + "learning_rate": 9.227353534659758e-06, + "loss": 0.3677, + "step": 3118 + }, + { + "epoch": 0.7844567404426559, + "grad_norm": 0.36953532695770264, + "learning_rate": 9.226571947515339e-06, + "loss": 0.3824, + "step": 3119 + }, + { + "epoch": 0.7847082494969819, + "grad_norm": 0.37254080176353455, + "learning_rate": 9.225789998391473e-06, + "loss": 0.3841, + "step": 3120 + }, + { + "epoch": 0.7849597585513078, + "grad_norm": 0.34900060296058655, + "learning_rate": 9.225007687355132e-06, + "loss": 0.3699, + "step": 3121 + }, + { + "epoch": 0.7852112676056338, + "grad_norm": 0.42499294877052307, + "learning_rate": 9.224225014473312e-06, + "loss": 0.377, + "step": 3122 + }, + { + "epoch": 0.7854627766599598, + "grad_norm": 0.4103010296821594, + "learning_rate": 9.223441979813049e-06, + "loss": 0.3935, + "step": 3123 + }, + { + "epoch": 0.7857142857142857, + "grad_norm": 0.34033411741256714, + "learning_rate": 9.222658583441399e-06, + "loss": 0.3798, + "step": 3124 + }, + { + "epoch": 0.7859657947686117, + "grad_norm": 0.45133984088897705, + "learning_rate": 9.221874825425461e-06, + "loss": 0.3996, + "step": 3125 + }, + { + "epoch": 0.7862173038229376, + "grad_norm": 0.3784712255001068, + "learning_rate": 9.221090705832353e-06, + "loss": 0.3937, + "step": 3126 + }, + { + "epoch": 0.7864688128772636, + "grad_norm": 0.36274659633636475, + "learning_rate": 9.220306224729237e-06, + "loss": 0.3899, + "step": 3127 + }, + { + "epoch": 0.7867203219315896, + "grad_norm": 0.36185523867607117, + "learning_rate": 9.219521382183291e-06, + "loss": 0.38, + "step": 3128 + }, + { + "epoch": 0.7869718309859155, + "grad_norm": 0.3780527710914612, + "learning_rate": 9.21873617826174e-06, + "loss": 0.3743, + "step": 3129 + }, + { + "epoch": 0.7872233400402414, + "grad_norm": 0.3563750088214874, + "learning_rate": 9.217950613031826e-06, + "loss": 0.3779, + "step": 3130 + }, + { + "epoch": 0.7874748490945674, + "grad_norm": 0.33744367957115173, + "learning_rate": 9.21716468656083e-06, + "loss": 0.3847, + "step": 3131 + }, + { + "epoch": 0.7877263581488934, + "grad_norm": 0.4224430322647095, + "learning_rate": 9.216378398916059e-06, + "loss": 0.4001, + "step": 3132 + }, + { + "epoch": 0.7879778672032193, + "grad_norm": 0.3875563144683838, + "learning_rate": 9.215591750164856e-06, + "loss": 0.3813, + "step": 3133 + }, + { + "epoch": 0.7882293762575453, + "grad_norm": 0.41393423080444336, + "learning_rate": 9.214804740374594e-06, + "loss": 0.3688, + "step": 3134 + }, + { + "epoch": 0.7884808853118712, + "grad_norm": 0.39912310242652893, + "learning_rate": 9.214017369612672e-06, + "loss": 0.3786, + "step": 3135 + }, + { + "epoch": 0.7887323943661971, + "grad_norm": 0.3480337858200073, + "learning_rate": 9.213229637946526e-06, + "loss": 0.3602, + "step": 3136 + }, + { + "epoch": 0.7889839034205232, + "grad_norm": 0.3411673605442047, + "learning_rate": 9.21244154544362e-06, + "loss": 0.3987, + "step": 3137 + }, + { + "epoch": 0.7892354124748491, + "grad_norm": 0.3513990342617035, + "learning_rate": 9.211653092171447e-06, + "loss": 0.3716, + "step": 3138 + }, + { + "epoch": 0.7894869215291751, + "grad_norm": 0.3667123317718506, + "learning_rate": 9.210864278197536e-06, + "loss": 0.3672, + "step": 3139 + }, + { + "epoch": 0.789738430583501, + "grad_norm": 0.3660683333873749, + "learning_rate": 9.210075103589443e-06, + "loss": 0.4099, + "step": 3140 + }, + { + "epoch": 0.7899899396378269, + "grad_norm": 0.3497997224330902, + "learning_rate": 9.209285568414755e-06, + "loss": 0.3661, + "step": 3141 + }, + { + "epoch": 0.790241448692153, + "grad_norm": 0.3466975688934326, + "learning_rate": 9.20849567274109e-06, + "loss": 0.3909, + "step": 3142 + }, + { + "epoch": 0.7904929577464789, + "grad_norm": 0.3633776009082794, + "learning_rate": 9.2077054166361e-06, + "loss": 0.3701, + "step": 3143 + }, + { + "epoch": 0.7907444668008048, + "grad_norm": 0.36884036660194397, + "learning_rate": 9.206914800167463e-06, + "loss": 0.3917, + "step": 3144 + }, + { + "epoch": 0.7909959758551308, + "grad_norm": 0.34670379757881165, + "learning_rate": 9.206123823402894e-06, + "loss": 0.4399, + "step": 3145 + }, + { + "epoch": 0.7912474849094567, + "grad_norm": 0.3583228588104248, + "learning_rate": 9.205332486410133e-06, + "loss": 0.4055, + "step": 3146 + }, + { + "epoch": 0.7914989939637826, + "grad_norm": 0.38067370653152466, + "learning_rate": 9.204540789256951e-06, + "loss": 0.4284, + "step": 3147 + }, + { + "epoch": 0.7917505030181087, + "grad_norm": 0.3454825282096863, + "learning_rate": 9.203748732011154e-06, + "loss": 0.4033, + "step": 3148 + }, + { + "epoch": 0.7920020120724346, + "grad_norm": 0.3998292088508606, + "learning_rate": 9.202956314740578e-06, + "loss": 0.3992, + "step": 3149 + }, + { + "epoch": 0.7922535211267606, + "grad_norm": 0.3713396489620209, + "learning_rate": 9.202163537513088e-06, + "loss": 0.3693, + "step": 3150 + }, + { + "epoch": 0.7925050301810865, + "grad_norm": 0.393930047750473, + "learning_rate": 9.201370400396578e-06, + "loss": 0.3837, + "step": 3151 + }, + { + "epoch": 0.7927565392354124, + "grad_norm": 0.388494610786438, + "learning_rate": 9.200576903458978e-06, + "loss": 0.3694, + "step": 3152 + }, + { + "epoch": 0.7930080482897385, + "grad_norm": 0.4377411901950836, + "learning_rate": 9.199783046768245e-06, + "loss": 0.4008, + "step": 3153 + }, + { + "epoch": 0.7932595573440644, + "grad_norm": 0.3396592438220978, + "learning_rate": 9.198988830392365e-06, + "loss": 0.3754, + "step": 3154 + }, + { + "epoch": 0.7935110663983903, + "grad_norm": 0.3519555926322937, + "learning_rate": 9.198194254399364e-06, + "loss": 0.3816, + "step": 3155 + }, + { + "epoch": 0.7937625754527163, + "grad_norm": 0.368377149105072, + "learning_rate": 9.197399318857288e-06, + "loss": 0.371, + "step": 3156 + }, + { + "epoch": 0.7940140845070423, + "grad_norm": 0.3477652370929718, + "learning_rate": 9.19660402383422e-06, + "loss": 0.3841, + "step": 3157 + }, + { + "epoch": 0.7942655935613682, + "grad_norm": 0.37322860956192017, + "learning_rate": 9.19580836939827e-06, + "loss": 0.3886, + "step": 3158 + }, + { + "epoch": 0.7945171026156942, + "grad_norm": 0.39691033959388733, + "learning_rate": 9.195012355617581e-06, + "loss": 0.373, + "step": 3159 + }, + { + "epoch": 0.7947686116700201, + "grad_norm": 0.355160117149353, + "learning_rate": 9.194215982560328e-06, + "loss": 0.3933, + "step": 3160 + }, + { + "epoch": 0.795020120724346, + "grad_norm": 0.36236023902893066, + "learning_rate": 9.193419250294717e-06, + "loss": 0.4166, + "step": 3161 + }, + { + "epoch": 0.795271629778672, + "grad_norm": 0.35770562291145325, + "learning_rate": 9.192622158888979e-06, + "loss": 0.3911, + "step": 3162 + }, + { + "epoch": 0.795523138832998, + "grad_norm": 0.36751723289489746, + "learning_rate": 9.191824708411384e-06, + "loss": 0.3973, + "step": 3163 + }, + { + "epoch": 0.795774647887324, + "grad_norm": 0.365925133228302, + "learning_rate": 9.191026898930224e-06, + "loss": 0.3763, + "step": 3164 + }, + { + "epoch": 0.7960261569416499, + "grad_norm": 0.3750317394733429, + "learning_rate": 9.190228730513832e-06, + "loss": 0.3783, + "step": 3165 + }, + { + "epoch": 0.7962776659959758, + "grad_norm": 0.343177855014801, + "learning_rate": 9.189430203230562e-06, + "loss": 0.3744, + "step": 3166 + }, + { + "epoch": 0.7965291750503019, + "grad_norm": 0.3460996150970459, + "learning_rate": 9.188631317148804e-06, + "loss": 0.4047, + "step": 3167 + }, + { + "epoch": 0.7967806841046278, + "grad_norm": 0.38344576954841614, + "learning_rate": 9.187832072336977e-06, + "loss": 0.3893, + "step": 3168 + }, + { + "epoch": 0.7970321931589537, + "grad_norm": 0.36154693365097046, + "learning_rate": 9.187032468863532e-06, + "loss": 0.3512, + "step": 3169 + }, + { + "epoch": 0.7972837022132797, + "grad_norm": 0.38037383556365967, + "learning_rate": 9.186232506796952e-06, + "loss": 0.382, + "step": 3170 + }, + { + "epoch": 0.7975352112676056, + "grad_norm": 0.33756163716316223, + "learning_rate": 9.185432186205744e-06, + "loss": 0.3945, + "step": 3171 + }, + { + "epoch": 0.7977867203219315, + "grad_norm": 0.394203245639801, + "learning_rate": 9.184631507158456e-06, + "loss": 0.3817, + "step": 3172 + }, + { + "epoch": 0.7980382293762576, + "grad_norm": 0.37978580594062805, + "learning_rate": 9.183830469723658e-06, + "loss": 0.3803, + "step": 3173 + }, + { + "epoch": 0.7982897384305835, + "grad_norm": 0.36637356877326965, + "learning_rate": 9.183029073969953e-06, + "loss": 0.3997, + "step": 3174 + }, + { + "epoch": 0.7985412474849095, + "grad_norm": 0.40284034609794617, + "learning_rate": 9.182227319965978e-06, + "loss": 0.3895, + "step": 3175 + }, + { + "epoch": 0.7987927565392354, + "grad_norm": 0.34037232398986816, + "learning_rate": 9.181425207780396e-06, + "loss": 0.3971, + "step": 3176 + }, + { + "epoch": 0.7990442655935613, + "grad_norm": 0.3634487986564636, + "learning_rate": 9.180622737481904e-06, + "loss": 0.3883, + "step": 3177 + }, + { + "epoch": 0.7992957746478874, + "grad_norm": 0.3719540238380432, + "learning_rate": 9.179819909139228e-06, + "loss": 0.3946, + "step": 3178 + }, + { + "epoch": 0.7995472837022133, + "grad_norm": 0.34387439489364624, + "learning_rate": 9.179016722821126e-06, + "loss": 0.3701, + "step": 3179 + }, + { + "epoch": 0.7997987927565392, + "grad_norm": 0.41140124201774597, + "learning_rate": 9.178213178596386e-06, + "loss": 0.3822, + "step": 3180 + }, + { + "epoch": 0.8000503018108652, + "grad_norm": 0.3771357238292694, + "learning_rate": 9.177409276533825e-06, + "loss": 0.4017, + "step": 3181 + }, + { + "epoch": 0.8003018108651911, + "grad_norm": 0.37716689705848694, + "learning_rate": 9.176605016702294e-06, + "loss": 0.3574, + "step": 3182 + }, + { + "epoch": 0.8005533199195171, + "grad_norm": 0.357952356338501, + "learning_rate": 9.175800399170673e-06, + "loss": 0.3653, + "step": 3183 + }, + { + "epoch": 0.8008048289738431, + "grad_norm": 0.3895781636238098, + "learning_rate": 9.17499542400787e-06, + "loss": 0.3879, + "step": 3184 + }, + { + "epoch": 0.801056338028169, + "grad_norm": 0.37393030524253845, + "learning_rate": 9.174190091282828e-06, + "loss": 0.3834, + "step": 3185 + }, + { + "epoch": 0.8013078470824949, + "grad_norm": 0.4014933407306671, + "learning_rate": 9.173384401064519e-06, + "loss": 0.3581, + "step": 3186 + }, + { + "epoch": 0.8015593561368209, + "grad_norm": 0.39869409799575806, + "learning_rate": 9.172578353421943e-06, + "loss": 0.3896, + "step": 3187 + }, + { + "epoch": 0.8018108651911469, + "grad_norm": 0.45484480261802673, + "learning_rate": 9.171771948424138e-06, + "loss": 0.3831, + "step": 3188 + }, + { + "epoch": 0.8020623742454729, + "grad_norm": 0.4879845380783081, + "learning_rate": 9.17096518614016e-06, + "loss": 0.3998, + "step": 3189 + }, + { + "epoch": 0.8023138832997988, + "grad_norm": 0.37346404790878296, + "learning_rate": 9.17015806663911e-06, + "loss": 0.3768, + "step": 3190 + }, + { + "epoch": 0.8025653923541247, + "grad_norm": 0.38394322991371155, + "learning_rate": 9.169350589990109e-06, + "loss": 0.37, + "step": 3191 + }, + { + "epoch": 0.8028169014084507, + "grad_norm": 0.42078500986099243, + "learning_rate": 9.168542756262313e-06, + "loss": 0.373, + "step": 3192 + }, + { + "epoch": 0.8030684104627767, + "grad_norm": 0.37299197912216187, + "learning_rate": 9.16773456552491e-06, + "loss": 0.3727, + "step": 3193 + }, + { + "epoch": 0.8033199195171026, + "grad_norm": 0.36001479625701904, + "learning_rate": 9.166926017847113e-06, + "loss": 0.3708, + "step": 3194 + }, + { + "epoch": 0.8035714285714286, + "grad_norm": 0.4601292014122009, + "learning_rate": 9.166117113298172e-06, + "loss": 0.3842, + "step": 3195 + }, + { + "epoch": 0.8038229376257545, + "grad_norm": 0.3911070227622986, + "learning_rate": 9.165307851947362e-06, + "loss": 0.3644, + "step": 3196 + }, + { + "epoch": 0.8040744466800804, + "grad_norm": 0.3885655701160431, + "learning_rate": 9.164498233863994e-06, + "loss": 0.3674, + "step": 3197 + }, + { + "epoch": 0.8043259557344065, + "grad_norm": 0.46075260639190674, + "learning_rate": 9.163688259117405e-06, + "loss": 0.3889, + "step": 3198 + }, + { + "epoch": 0.8045774647887324, + "grad_norm": 0.4196132719516754, + "learning_rate": 9.162877927776963e-06, + "loss": 0.428, + "step": 3199 + }, + { + "epoch": 0.8048289738430584, + "grad_norm": 0.38232311606407166, + "learning_rate": 9.162067239912072e-06, + "loss": 0.4044, + "step": 3200 + }, + { + "epoch": 0.8050804828973843, + "grad_norm": 0.39964160323143005, + "learning_rate": 9.161256195592157e-06, + "loss": 0.3878, + "step": 3201 + }, + { + "epoch": 0.8053319919517102, + "grad_norm": 0.3779206871986389, + "learning_rate": 9.160444794886682e-06, + "loss": 0.3822, + "step": 3202 + }, + { + "epoch": 0.8055835010060363, + "grad_norm": 0.3651765286922455, + "learning_rate": 9.15963303786514e-06, + "loss": 0.3952, + "step": 3203 + }, + { + "epoch": 0.8058350100603622, + "grad_norm": 0.3684598207473755, + "learning_rate": 9.15882092459705e-06, + "loss": 0.3693, + "step": 3204 + }, + { + "epoch": 0.8060865191146881, + "grad_norm": 0.35791802406311035, + "learning_rate": 9.158008455151965e-06, + "loss": 0.4058, + "step": 3205 + }, + { + "epoch": 0.8063380281690141, + "grad_norm": 0.3659406900405884, + "learning_rate": 9.157195629599468e-06, + "loss": 0.3879, + "step": 3206 + }, + { + "epoch": 0.80658953722334, + "grad_norm": 0.3828873932361603, + "learning_rate": 9.156382448009173e-06, + "loss": 0.3876, + "step": 3207 + }, + { + "epoch": 0.806841046277666, + "grad_norm": 0.33085519075393677, + "learning_rate": 9.155568910450722e-06, + "loss": 0.3745, + "step": 3208 + }, + { + "epoch": 0.807092555331992, + "grad_norm": 0.35317882895469666, + "learning_rate": 9.154755016993794e-06, + "loss": 0.3587, + "step": 3209 + }, + { + "epoch": 0.8073440643863179, + "grad_norm": 0.37781715393066406, + "learning_rate": 9.15394076770809e-06, + "loss": 0.4005, + "step": 3210 + }, + { + "epoch": 0.8075955734406438, + "grad_norm": 0.33460843563079834, + "learning_rate": 9.153126162663343e-06, + "loss": 0.3969, + "step": 3211 + }, + { + "epoch": 0.8078470824949698, + "grad_norm": 0.4079720377922058, + "learning_rate": 9.152311201929326e-06, + "loss": 0.3821, + "step": 3212 + }, + { + "epoch": 0.8080985915492958, + "grad_norm": 0.3783186972141266, + "learning_rate": 9.15149588557583e-06, + "loss": 0.3985, + "step": 3213 + }, + { + "epoch": 0.8083501006036218, + "grad_norm": 0.33954814076423645, + "learning_rate": 9.150680213672683e-06, + "loss": 0.4062, + "step": 3214 + }, + { + "epoch": 0.8086016096579477, + "grad_norm": 0.4284415543079376, + "learning_rate": 9.149864186289743e-06, + "loss": 0.3759, + "step": 3215 + }, + { + "epoch": 0.8088531187122736, + "grad_norm": 0.349122554063797, + "learning_rate": 9.149047803496896e-06, + "loss": 0.3752, + "step": 3216 + }, + { + "epoch": 0.8091046277665996, + "grad_norm": 0.3756011426448822, + "learning_rate": 9.148231065364062e-06, + "loss": 0.3826, + "step": 3217 + }, + { + "epoch": 0.8093561368209256, + "grad_norm": 0.42645543813705444, + "learning_rate": 9.147413971961187e-06, + "loss": 0.404, + "step": 3218 + }, + { + "epoch": 0.8096076458752515, + "grad_norm": 0.35553669929504395, + "learning_rate": 9.146596523358252e-06, + "loss": 0.3921, + "step": 3219 + }, + { + "epoch": 0.8098591549295775, + "grad_norm": 0.38775989413261414, + "learning_rate": 9.145778719625266e-06, + "loss": 0.3986, + "step": 3220 + }, + { + "epoch": 0.8101106639839034, + "grad_norm": 0.3600097596645355, + "learning_rate": 9.144960560832268e-06, + "loss": 0.385, + "step": 3221 + }, + { + "epoch": 0.8103621730382293, + "grad_norm": 0.3468279540538788, + "learning_rate": 9.144142047049329e-06, + "loss": 0.3786, + "step": 3222 + }, + { + "epoch": 0.8106136820925554, + "grad_norm": 0.3529747426509857, + "learning_rate": 9.14332317834655e-06, + "loss": 0.3729, + "step": 3223 + }, + { + "epoch": 0.8108651911468813, + "grad_norm": 0.31644824147224426, + "learning_rate": 9.14250395479406e-06, + "loss": 0.3905, + "step": 3224 + }, + { + "epoch": 0.8111167002012073, + "grad_norm": 0.3824257254600525, + "learning_rate": 9.141684376462024e-06, + "loss": 0.389, + "step": 3225 + }, + { + "epoch": 0.8113682092555332, + "grad_norm": 0.3192039132118225, + "learning_rate": 9.140864443420629e-06, + "loss": 0.3734, + "step": 3226 + }, + { + "epoch": 0.8116197183098591, + "grad_norm": 0.3341809809207916, + "learning_rate": 9.140044155740102e-06, + "loss": 0.383, + "step": 3227 + }, + { + "epoch": 0.8118712273641852, + "grad_norm": 0.33594390749931335, + "learning_rate": 9.139223513490692e-06, + "loss": 0.3706, + "step": 3228 + }, + { + "epoch": 0.8121227364185111, + "grad_norm": 0.34297066926956177, + "learning_rate": 9.138402516742681e-06, + "loss": 0.3923, + "step": 3229 + }, + { + "epoch": 0.812374245472837, + "grad_norm": 0.32909277081489563, + "learning_rate": 9.137581165566388e-06, + "loss": 0.3715, + "step": 3230 + }, + { + "epoch": 0.812625754527163, + "grad_norm": 0.35775429010391235, + "learning_rate": 9.13675946003215e-06, + "loss": 0.3597, + "step": 3231 + }, + { + "epoch": 0.8128772635814889, + "grad_norm": 0.3310528099536896, + "learning_rate": 9.135937400210345e-06, + "loss": 0.3929, + "step": 3232 + }, + { + "epoch": 0.8131287726358148, + "grad_norm": 0.35094767808914185, + "learning_rate": 9.135114986171373e-06, + "loss": 0.4054, + "step": 3233 + }, + { + "epoch": 0.8133802816901409, + "grad_norm": 0.38536086678504944, + "learning_rate": 9.134292217985675e-06, + "loss": 0.4061, + "step": 3234 + }, + { + "epoch": 0.8136317907444668, + "grad_norm": 0.35083386301994324, + "learning_rate": 9.133469095723712e-06, + "loss": 0.3607, + "step": 3235 + }, + { + "epoch": 0.8138832997987927, + "grad_norm": 0.34941115975379944, + "learning_rate": 9.13264561945598e-06, + "loss": 0.3522, + "step": 3236 + }, + { + "epoch": 0.8141348088531187, + "grad_norm": 0.3351636826992035, + "learning_rate": 9.131821789253003e-06, + "loss": 0.3788, + "step": 3237 + }, + { + "epoch": 0.8143863179074446, + "grad_norm": 0.3241204023361206, + "learning_rate": 9.130997605185338e-06, + "loss": 0.3929, + "step": 3238 + }, + { + "epoch": 0.8146378269617707, + "grad_norm": 0.35494330525398254, + "learning_rate": 9.130173067323575e-06, + "loss": 0.4019, + "step": 3239 + }, + { + "epoch": 0.8148893360160966, + "grad_norm": 0.3341788351535797, + "learning_rate": 9.129348175738324e-06, + "loss": 0.3821, + "step": 3240 + }, + { + "epoch": 0.8151408450704225, + "grad_norm": 0.34449446201324463, + "learning_rate": 9.128522930500237e-06, + "loss": 0.3671, + "step": 3241 + }, + { + "epoch": 0.8153923541247485, + "grad_norm": 0.31433650851249695, + "learning_rate": 9.127697331679988e-06, + "loss": 0.3803, + "step": 3242 + }, + { + "epoch": 0.8156438631790744, + "grad_norm": 0.3512547016143799, + "learning_rate": 9.126871379348284e-06, + "loss": 0.3744, + "step": 3243 + }, + { + "epoch": 0.8158953722334004, + "grad_norm": 0.3199286162853241, + "learning_rate": 9.126045073575865e-06, + "loss": 0.4147, + "step": 3244 + }, + { + "epoch": 0.8161468812877264, + "grad_norm": 0.37143221497535706, + "learning_rate": 9.125218414433498e-06, + "loss": 0.3738, + "step": 3245 + }, + { + "epoch": 0.8163983903420523, + "grad_norm": 0.3939398229122162, + "learning_rate": 9.124391401991981e-06, + "loss": 0.3829, + "step": 3246 + }, + { + "epoch": 0.8166498993963782, + "grad_norm": 0.34580913186073303, + "learning_rate": 9.123564036322143e-06, + "loss": 0.3956, + "step": 3247 + }, + { + "epoch": 0.8169014084507042, + "grad_norm": 0.32879477739334106, + "learning_rate": 9.122736317494842e-06, + "loss": 0.3746, + "step": 3248 + }, + { + "epoch": 0.8171529175050302, + "grad_norm": 0.375439316034317, + "learning_rate": 9.121908245580967e-06, + "loss": 0.3988, + "step": 3249 + }, + { + "epoch": 0.8174044265593562, + "grad_norm": 0.3741564452648163, + "learning_rate": 9.121079820651438e-06, + "loss": 0.4051, + "step": 3250 + }, + { + "epoch": 0.8176559356136821, + "grad_norm": 0.3896329700946808, + "learning_rate": 9.120251042777203e-06, + "loss": 0.38, + "step": 3251 + }, + { + "epoch": 0.817907444668008, + "grad_norm": 0.38490694761276245, + "learning_rate": 9.119421912029243e-06, + "loss": 0.352, + "step": 3252 + }, + { + "epoch": 0.818158953722334, + "grad_norm": 0.3421596884727478, + "learning_rate": 9.118592428478565e-06, + "loss": 0.3867, + "step": 3253 + }, + { + "epoch": 0.81841046277666, + "grad_norm": 0.36744779348373413, + "learning_rate": 9.117762592196214e-06, + "loss": 0.3598, + "step": 3254 + }, + { + "epoch": 0.8186619718309859, + "grad_norm": 0.33967792987823486, + "learning_rate": 9.116932403253257e-06, + "loss": 0.4086, + "step": 3255 + }, + { + "epoch": 0.8189134808853119, + "grad_norm": 0.35060232877731323, + "learning_rate": 9.116101861720793e-06, + "loss": 0.3724, + "step": 3256 + }, + { + "epoch": 0.8191649899396378, + "grad_norm": 0.3623863458633423, + "learning_rate": 9.115270967669958e-06, + "loss": 0.3924, + "step": 3257 + }, + { + "epoch": 0.8194164989939637, + "grad_norm": 0.3579019606113434, + "learning_rate": 9.114439721171909e-06, + "loss": 0.4078, + "step": 3258 + }, + { + "epoch": 0.8196680080482898, + "grad_norm": 0.3488744795322418, + "learning_rate": 9.113608122297836e-06, + "loss": 0.4064, + "step": 3259 + }, + { + "epoch": 0.8199195171026157, + "grad_norm": 0.38803979754447937, + "learning_rate": 9.112776171118964e-06, + "loss": 0.4151, + "step": 3260 + }, + { + "epoch": 0.8201710261569416, + "grad_norm": 0.3529983162879944, + "learning_rate": 9.11194386770654e-06, + "loss": 0.3722, + "step": 3261 + }, + { + "epoch": 0.8204225352112676, + "grad_norm": 0.39066073298454285, + "learning_rate": 9.111111212131851e-06, + "loss": 0.4216, + "step": 3262 + }, + { + "epoch": 0.8206740442655935, + "grad_norm": 0.34698736667633057, + "learning_rate": 9.110278204466203e-06, + "loss": 0.3725, + "step": 3263 + }, + { + "epoch": 0.8209255533199196, + "grad_norm": 0.337932288646698, + "learning_rate": 9.109444844780942e-06, + "loss": 0.4123, + "step": 3264 + }, + { + "epoch": 0.8211770623742455, + "grad_norm": 0.3363993465900421, + "learning_rate": 9.108611133147438e-06, + "loss": 0.3716, + "step": 3265 + }, + { + "epoch": 0.8214285714285714, + "grad_norm": 0.3734219968318939, + "learning_rate": 9.107777069637094e-06, + "loss": 0.3874, + "step": 3266 + }, + { + "epoch": 0.8216800804828974, + "grad_norm": 0.32347917556762695, + "learning_rate": 9.106942654321343e-06, + "loss": 0.358, + "step": 3267 + }, + { + "epoch": 0.8219315895372233, + "grad_norm": 0.3709772229194641, + "learning_rate": 9.106107887271647e-06, + "loss": 0.3634, + "step": 3268 + }, + { + "epoch": 0.8221830985915493, + "grad_norm": 0.3945581018924713, + "learning_rate": 9.105272768559496e-06, + "loss": 0.379, + "step": 3269 + }, + { + "epoch": 0.8224346076458753, + "grad_norm": 0.33245036005973816, + "learning_rate": 9.104437298256416e-06, + "loss": 0.3605, + "step": 3270 + }, + { + "epoch": 0.8226861167002012, + "grad_norm": 0.37672650814056396, + "learning_rate": 9.103601476433959e-06, + "loss": 0.389, + "step": 3271 + }, + { + "epoch": 0.8229376257545271, + "grad_norm": 0.37281307578086853, + "learning_rate": 9.102765303163708e-06, + "loss": 0.3872, + "step": 3272 + }, + { + "epoch": 0.8231891348088531, + "grad_norm": 0.3488701581954956, + "learning_rate": 9.101928778517275e-06, + "loss": 0.3543, + "step": 3273 + }, + { + "epoch": 0.8234406438631791, + "grad_norm": 0.33811309933662415, + "learning_rate": 9.101091902566303e-06, + "loss": 0.3833, + "step": 3274 + }, + { + "epoch": 0.8236921529175051, + "grad_norm": 0.40357568860054016, + "learning_rate": 9.100254675382467e-06, + "loss": 0.3806, + "step": 3275 + }, + { + "epoch": 0.823943661971831, + "grad_norm": 0.39533787965774536, + "learning_rate": 9.099417097037468e-06, + "loss": 0.3767, + "step": 3276 + }, + { + "epoch": 0.8241951710261569, + "grad_norm": 0.38113778829574585, + "learning_rate": 9.098579167603042e-06, + "loss": 0.3862, + "step": 3277 + }, + { + "epoch": 0.8244466800804829, + "grad_norm": 0.38164108991622925, + "learning_rate": 9.09774088715095e-06, + "loss": 0.3707, + "step": 3278 + }, + { + "epoch": 0.8246981891348089, + "grad_norm": 0.4421541094779968, + "learning_rate": 9.096902255752986e-06, + "loss": 0.3845, + "step": 3279 + }, + { + "epoch": 0.8249496981891348, + "grad_norm": 0.34088340401649475, + "learning_rate": 9.096063273480975e-06, + "loss": 0.3685, + "step": 3280 + }, + { + "epoch": 0.8252012072434608, + "grad_norm": 0.44785863161087036, + "learning_rate": 9.09522394040677e-06, + "loss": 0.3757, + "step": 3281 + }, + { + "epoch": 0.8254527162977867, + "grad_norm": 0.4285958409309387, + "learning_rate": 9.094384256602252e-06, + "loss": 0.3549, + "step": 3282 + }, + { + "epoch": 0.8257042253521126, + "grad_norm": 0.35675740242004395, + "learning_rate": 9.093544222139338e-06, + "loss": 0.364, + "step": 3283 + }, + { + "epoch": 0.8259557344064387, + "grad_norm": 0.3988295793533325, + "learning_rate": 9.09270383708997e-06, + "loss": 0.4011, + "step": 3284 + }, + { + "epoch": 0.8262072434607646, + "grad_norm": 0.4526250958442688, + "learning_rate": 9.091863101526124e-06, + "loss": 0.3882, + "step": 3285 + }, + { + "epoch": 0.8264587525150905, + "grad_norm": 0.416526734828949, + "learning_rate": 9.091022015519798e-06, + "loss": 0.4004, + "step": 3286 + }, + { + "epoch": 0.8267102615694165, + "grad_norm": 0.3291431665420532, + "learning_rate": 9.090180579143033e-06, + "loss": 0.3499, + "step": 3287 + }, + { + "epoch": 0.8269617706237424, + "grad_norm": 0.34957194328308105, + "learning_rate": 9.08933879246789e-06, + "loss": 0.36, + "step": 3288 + }, + { + "epoch": 0.8272132796780685, + "grad_norm": 0.35774147510528564, + "learning_rate": 9.08849665556646e-06, + "loss": 0.3683, + "step": 3289 + }, + { + "epoch": 0.8274647887323944, + "grad_norm": 0.34094980359077454, + "learning_rate": 9.087654168510871e-06, + "loss": 0.3622, + "step": 3290 + }, + { + "epoch": 0.8277162977867203, + "grad_norm": 0.3316769599914551, + "learning_rate": 9.086811331373273e-06, + "loss": 0.3855, + "step": 3291 + }, + { + "epoch": 0.8279678068410463, + "grad_norm": 0.35548505187034607, + "learning_rate": 9.085968144225853e-06, + "loss": 0.3918, + "step": 3292 + }, + { + "epoch": 0.8282193158953722, + "grad_norm": 0.38808438181877136, + "learning_rate": 9.085124607140822e-06, + "loss": 0.3791, + "step": 3293 + }, + { + "epoch": 0.8284708249496981, + "grad_norm": 0.37164920568466187, + "learning_rate": 9.084280720190426e-06, + "loss": 0.4012, + "step": 3294 + }, + { + "epoch": 0.8287223340040242, + "grad_norm": 0.35741862654685974, + "learning_rate": 9.083436483446937e-06, + "loss": 0.3659, + "step": 3295 + }, + { + "epoch": 0.8289738430583501, + "grad_norm": 0.3638781011104584, + "learning_rate": 9.082591896982658e-06, + "loss": 0.3967, + "step": 3296 + }, + { + "epoch": 0.829225352112676, + "grad_norm": 0.42987996339797974, + "learning_rate": 9.081746960869926e-06, + "loss": 0.418, + "step": 3297 + }, + { + "epoch": 0.829476861167002, + "grad_norm": 0.36449959874153137, + "learning_rate": 9.0809016751811e-06, + "loss": 0.3814, + "step": 3298 + }, + { + "epoch": 0.829728370221328, + "grad_norm": 0.37069931626319885, + "learning_rate": 9.080056039988576e-06, + "loss": 0.3668, + "step": 3299 + }, + { + "epoch": 0.829979879275654, + "grad_norm": 0.3759606182575226, + "learning_rate": 9.079210055364777e-06, + "loss": 0.3879, + "step": 3300 + }, + { + "epoch": 0.8302313883299799, + "grad_norm": 0.3514502942562103, + "learning_rate": 9.078363721382157e-06, + "loss": 0.3783, + "step": 3301 + }, + { + "epoch": 0.8304828973843058, + "grad_norm": 0.3794296979904175, + "learning_rate": 9.077517038113197e-06, + "loss": 0.3802, + "step": 3302 + }, + { + "epoch": 0.8307344064386318, + "grad_norm": 0.3764783442020416, + "learning_rate": 9.076670005630413e-06, + "loss": 0.3802, + "step": 3303 + }, + { + "epoch": 0.8309859154929577, + "grad_norm": 0.31016576290130615, + "learning_rate": 9.075822624006345e-06, + "loss": 0.3664, + "step": 3304 + }, + { + "epoch": 0.8312374245472837, + "grad_norm": 0.3612141013145447, + "learning_rate": 9.074974893313571e-06, + "loss": 0.3909, + "step": 3305 + }, + { + "epoch": 0.8314889336016097, + "grad_norm": 0.34043174982070923, + "learning_rate": 9.074126813624687e-06, + "loss": 0.3947, + "step": 3306 + }, + { + "epoch": 0.8317404426559356, + "grad_norm": 0.3125540614128113, + "learning_rate": 9.07327838501233e-06, + "loss": 0.3821, + "step": 3307 + }, + { + "epoch": 0.8319919517102615, + "grad_norm": 0.34392425417900085, + "learning_rate": 9.072429607549161e-06, + "loss": 0.3785, + "step": 3308 + }, + { + "epoch": 0.8322434607645876, + "grad_norm": 0.3548860251903534, + "learning_rate": 9.071580481307875e-06, + "loss": 0.394, + "step": 3309 + }, + { + "epoch": 0.8324949698189135, + "grad_norm": 0.34213536977767944, + "learning_rate": 9.070731006361191e-06, + "loss": 0.3685, + "step": 3310 + }, + { + "epoch": 0.8327464788732394, + "grad_norm": 0.40426772832870483, + "learning_rate": 9.069881182781864e-06, + "loss": 0.3613, + "step": 3311 + }, + { + "epoch": 0.8329979879275654, + "grad_norm": 0.34350600838661194, + "learning_rate": 9.069031010642673e-06, + "loss": 0.3672, + "step": 3312 + }, + { + "epoch": 0.8332494969818913, + "grad_norm": 0.3403952717781067, + "learning_rate": 9.068180490016432e-06, + "loss": 0.3847, + "step": 3313 + }, + { + "epoch": 0.8335010060362174, + "grad_norm": 0.4108610153198242, + "learning_rate": 9.067329620975983e-06, + "loss": 0.4024, + "step": 3314 + }, + { + "epoch": 0.8337525150905433, + "grad_norm": 0.38602137565612793, + "learning_rate": 9.066478403594196e-06, + "loss": 0.3723, + "step": 3315 + }, + { + "epoch": 0.8340040241448692, + "grad_norm": 0.36774957180023193, + "learning_rate": 9.065626837943977e-06, + "loss": 0.3747, + "step": 3316 + }, + { + "epoch": 0.8342555331991952, + "grad_norm": 0.39249497652053833, + "learning_rate": 9.06477492409825e-06, + "loss": 0.3835, + "step": 3317 + }, + { + "epoch": 0.8345070422535211, + "grad_norm": 0.3368522524833679, + "learning_rate": 9.063922662129981e-06, + "loss": 0.4076, + "step": 3318 + }, + { + "epoch": 0.834758551307847, + "grad_norm": 0.37408214807510376, + "learning_rate": 9.063070052112161e-06, + "loss": 0.398, + "step": 3319 + }, + { + "epoch": 0.8350100603621731, + "grad_norm": 0.35431167483329773, + "learning_rate": 9.06221709411781e-06, + "loss": 0.3663, + "step": 3320 + }, + { + "epoch": 0.835261569416499, + "grad_norm": 0.3219449818134308, + "learning_rate": 9.061363788219975e-06, + "loss": 0.3794, + "step": 3321 + }, + { + "epoch": 0.8355130784708249, + "grad_norm": 0.36822405457496643, + "learning_rate": 9.060510134491742e-06, + "loss": 0.3678, + "step": 3322 + }, + { + "epoch": 0.8357645875251509, + "grad_norm": 0.32559409737586975, + "learning_rate": 9.059656133006216e-06, + "loss": 0.3461, + "step": 3323 + }, + { + "epoch": 0.8360160965794768, + "grad_norm": 0.3630576431751251, + "learning_rate": 9.058801783836542e-06, + "loss": 0.3485, + "step": 3324 + }, + { + "epoch": 0.8362676056338029, + "grad_norm": 0.3687560260295868, + "learning_rate": 9.057947087055885e-06, + "loss": 0.3806, + "step": 3325 + }, + { + "epoch": 0.8365191146881288, + "grad_norm": 0.3402329981327057, + "learning_rate": 9.057092042737447e-06, + "loss": 0.3662, + "step": 3326 + }, + { + "epoch": 0.8367706237424547, + "grad_norm": 0.34224575757980347, + "learning_rate": 9.056236650954457e-06, + "loss": 0.3971, + "step": 3327 + }, + { + "epoch": 0.8370221327967807, + "grad_norm": 0.3538998067378998, + "learning_rate": 9.055380911780175e-06, + "loss": 0.3753, + "step": 3328 + }, + { + "epoch": 0.8372736418511066, + "grad_norm": 0.3665867745876312, + "learning_rate": 9.054524825287885e-06, + "loss": 0.3894, + "step": 3329 + }, + { + "epoch": 0.8375251509054326, + "grad_norm": 0.3492240309715271, + "learning_rate": 9.053668391550912e-06, + "loss": 0.385, + "step": 3330 + }, + { + "epoch": 0.8377766599597586, + "grad_norm": 0.3423280417919159, + "learning_rate": 9.052811610642599e-06, + "loss": 0.3453, + "step": 3331 + }, + { + "epoch": 0.8380281690140845, + "grad_norm": 0.4070049226284027, + "learning_rate": 9.051954482636327e-06, + "loss": 0.3987, + "step": 3332 + }, + { + "epoch": 0.8382796780684104, + "grad_norm": 0.3565402328968048, + "learning_rate": 9.051097007605501e-06, + "loss": 0.3752, + "step": 3333 + }, + { + "epoch": 0.8385311871227364, + "grad_norm": 0.31762558221817017, + "learning_rate": 9.050239185623562e-06, + "loss": 0.356, + "step": 3334 + }, + { + "epoch": 0.8387826961770624, + "grad_norm": 0.36530783772468567, + "learning_rate": 9.049381016763973e-06, + "loss": 0.4123, + "step": 3335 + }, + { + "epoch": 0.8390342052313883, + "grad_norm": 0.33472946286201477, + "learning_rate": 9.048522501100233e-06, + "loss": 0.3735, + "step": 3336 + }, + { + "epoch": 0.8392857142857143, + "grad_norm": 0.3728182315826416, + "learning_rate": 9.047663638705868e-06, + "loss": 0.3716, + "step": 3337 + }, + { + "epoch": 0.8395372233400402, + "grad_norm": 0.394871324300766, + "learning_rate": 9.046804429654437e-06, + "loss": 0.4137, + "step": 3338 + }, + { + "epoch": 0.8397887323943662, + "grad_norm": 0.3549323081970215, + "learning_rate": 9.045944874019522e-06, + "loss": 0.4062, + "step": 3339 + }, + { + "epoch": 0.8400402414486922, + "grad_norm": 0.34717488288879395, + "learning_rate": 9.045084971874738e-06, + "loss": 0.3685, + "step": 3340 + }, + { + "epoch": 0.8402917505030181, + "grad_norm": 0.34687772393226624, + "learning_rate": 9.044224723293734e-06, + "loss": 0.3554, + "step": 3341 + }, + { + "epoch": 0.8405432595573441, + "grad_norm": 0.4031140208244324, + "learning_rate": 9.043364128350183e-06, + "loss": 0.4139, + "step": 3342 + }, + { + "epoch": 0.84079476861167, + "grad_norm": 0.33681127429008484, + "learning_rate": 9.042503187117788e-06, + "loss": 0.3659, + "step": 3343 + }, + { + "epoch": 0.8410462776659959, + "grad_norm": 0.31947392225265503, + "learning_rate": 9.041641899670286e-06, + "loss": 0.3752, + "step": 3344 + }, + { + "epoch": 0.841297786720322, + "grad_norm": 0.4157194197177887, + "learning_rate": 9.04078026608144e-06, + "loss": 0.4241, + "step": 3345 + }, + { + "epoch": 0.8415492957746479, + "grad_norm": 0.340215802192688, + "learning_rate": 9.039918286425042e-06, + "loss": 0.3933, + "step": 3346 + }, + { + "epoch": 0.8418008048289738, + "grad_norm": 0.3351587951183319, + "learning_rate": 9.039055960774918e-06, + "loss": 0.3608, + "step": 3347 + }, + { + "epoch": 0.8420523138832998, + "grad_norm": 0.3640073537826538, + "learning_rate": 9.038193289204919e-06, + "loss": 0.3826, + "step": 3348 + }, + { + "epoch": 0.8423038229376257, + "grad_norm": 0.37294724583625793, + "learning_rate": 9.037330271788927e-06, + "loss": 0.3644, + "step": 3349 + }, + { + "epoch": 0.8425553319919518, + "grad_norm": 0.3982885777950287, + "learning_rate": 9.036466908600856e-06, + "loss": 0.3856, + "step": 3350 + }, + { + "epoch": 0.8428068410462777, + "grad_norm": 0.3464895486831665, + "learning_rate": 9.035603199714645e-06, + "loss": 0.3804, + "step": 3351 + }, + { + "epoch": 0.8430583501006036, + "grad_norm": 0.34546592831611633, + "learning_rate": 9.034739145204266e-06, + "loss": 0.3949, + "step": 3352 + }, + { + "epoch": 0.8433098591549296, + "grad_norm": 0.39847803115844727, + "learning_rate": 9.033874745143722e-06, + "loss": 0.3775, + "step": 3353 + }, + { + "epoch": 0.8435613682092555, + "grad_norm": 0.34165751934051514, + "learning_rate": 9.033009999607042e-06, + "loss": 0.3892, + "step": 3354 + }, + { + "epoch": 0.8438128772635815, + "grad_norm": 0.35674750804901123, + "learning_rate": 9.032144908668284e-06, + "loss": 0.3703, + "step": 3355 + }, + { + "epoch": 0.8440643863179075, + "grad_norm": 0.3607436418533325, + "learning_rate": 9.031279472401542e-06, + "loss": 0.402, + "step": 3356 + }, + { + "epoch": 0.8443158953722334, + "grad_norm": 0.35292503237724304, + "learning_rate": 9.03041369088093e-06, + "loss": 0.3808, + "step": 3357 + }, + { + "epoch": 0.8445674044265593, + "grad_norm": 0.35224297642707825, + "learning_rate": 9.029547564180602e-06, + "loss": 0.3921, + "step": 3358 + }, + { + "epoch": 0.8448189134808853, + "grad_norm": 0.3472045361995697, + "learning_rate": 9.028681092374733e-06, + "loss": 0.4116, + "step": 3359 + }, + { + "epoch": 0.8450704225352113, + "grad_norm": 0.3715015947818756, + "learning_rate": 9.027814275537533e-06, + "loss": 0.3823, + "step": 3360 + }, + { + "epoch": 0.8453219315895373, + "grad_norm": 0.3859766721725464, + "learning_rate": 9.026947113743237e-06, + "loss": 0.3872, + "step": 3361 + }, + { + "epoch": 0.8455734406438632, + "grad_norm": 0.36284056305885315, + "learning_rate": 9.026079607066112e-06, + "loss": 0.3781, + "step": 3362 + }, + { + "epoch": 0.8458249496981891, + "grad_norm": 0.35753417015075684, + "learning_rate": 9.025211755580458e-06, + "loss": 0.3545, + "step": 3363 + }, + { + "epoch": 0.8460764587525151, + "grad_norm": 0.4312700927257538, + "learning_rate": 9.024343559360597e-06, + "loss": 0.3925, + "step": 3364 + }, + { + "epoch": 0.846327967806841, + "grad_norm": 0.3738209903240204, + "learning_rate": 9.023475018480888e-06, + "loss": 0.3912, + "step": 3365 + }, + { + "epoch": 0.846579476861167, + "grad_norm": 0.3941510021686554, + "learning_rate": 9.022606133015713e-06, + "loss": 0.3617, + "step": 3366 + }, + { + "epoch": 0.846830985915493, + "grad_norm": 0.4269218146800995, + "learning_rate": 9.021736903039488e-06, + "loss": 0.3998, + "step": 3367 + }, + { + "epoch": 0.8470824949698189, + "grad_norm": 0.35374465584754944, + "learning_rate": 9.020867328626659e-06, + "loss": 0.3785, + "step": 3368 + }, + { + "epoch": 0.8473340040241448, + "grad_norm": 0.34942927956581116, + "learning_rate": 9.019997409851696e-06, + "loss": 0.374, + "step": 3369 + }, + { + "epoch": 0.8475855130784709, + "grad_norm": 0.4133439064025879, + "learning_rate": 9.019127146789106e-06, + "loss": 0.4161, + "step": 3370 + }, + { + "epoch": 0.8478370221327968, + "grad_norm": 0.38321179151535034, + "learning_rate": 9.018256539513417e-06, + "loss": 0.3675, + "step": 3371 + }, + { + "epoch": 0.8480885311871227, + "grad_norm": 0.3283655345439911, + "learning_rate": 9.017385588099195e-06, + "loss": 0.3838, + "step": 3372 + }, + { + "epoch": 0.8483400402414487, + "grad_norm": 0.3512178063392639, + "learning_rate": 9.016514292621027e-06, + "loss": 0.3827, + "step": 3373 + }, + { + "epoch": 0.8485915492957746, + "grad_norm": 0.4433364272117615, + "learning_rate": 9.015642653153542e-06, + "loss": 0.3925, + "step": 3374 + }, + { + "epoch": 0.8488430583501007, + "grad_norm": 0.3722871243953705, + "learning_rate": 9.014770669771383e-06, + "loss": 0.364, + "step": 3375 + }, + { + "epoch": 0.8490945674044266, + "grad_norm": 0.3842884600162506, + "learning_rate": 9.013898342549233e-06, + "loss": 0.3998, + "step": 3376 + }, + { + "epoch": 0.8493460764587525, + "grad_norm": 0.39675992727279663, + "learning_rate": 9.013025671561798e-06, + "loss": 0.4026, + "step": 3377 + }, + { + "epoch": 0.8495975855130785, + "grad_norm": 0.34844744205474854, + "learning_rate": 9.012152656883824e-06, + "loss": 0.3794, + "step": 3378 + }, + { + "epoch": 0.8498490945674044, + "grad_norm": 0.3376064598560333, + "learning_rate": 9.011279298590072e-06, + "loss": 0.3641, + "step": 3379 + }, + { + "epoch": 0.8501006036217303, + "grad_norm": 0.3386366665363312, + "learning_rate": 9.010405596755345e-06, + "loss": 0.3722, + "step": 3380 + }, + { + "epoch": 0.8503521126760564, + "grad_norm": 0.3699395954608917, + "learning_rate": 9.009531551454465e-06, + "loss": 0.39, + "step": 3381 + }, + { + "epoch": 0.8506036217303823, + "grad_norm": 0.35013461112976074, + "learning_rate": 9.008657162762293e-06, + "loss": 0.3577, + "step": 3382 + }, + { + "epoch": 0.8508551307847082, + "grad_norm": 0.3586879074573517, + "learning_rate": 9.007782430753712e-06, + "loss": 0.3754, + "step": 3383 + }, + { + "epoch": 0.8511066398390342, + "grad_norm": 0.3715461790561676, + "learning_rate": 9.006907355503639e-06, + "loss": 0.3569, + "step": 3384 + }, + { + "epoch": 0.8513581488933601, + "grad_norm": 0.4095889925956726, + "learning_rate": 9.006031937087018e-06, + "loss": 0.3823, + "step": 3385 + }, + { + "epoch": 0.8516096579476862, + "grad_norm": 0.34555843472480774, + "learning_rate": 9.005156175578823e-06, + "loss": 0.4092, + "step": 3386 + }, + { + "epoch": 0.8518611670020121, + "grad_norm": 0.3641471266746521, + "learning_rate": 9.004280071054058e-06, + "loss": 0.3735, + "step": 3387 + }, + { + "epoch": 0.852112676056338, + "grad_norm": 0.3723176419734955, + "learning_rate": 9.003403623587757e-06, + "loss": 0.3639, + "step": 3388 + }, + { + "epoch": 0.852364185110664, + "grad_norm": 0.3690603971481323, + "learning_rate": 9.002526833254979e-06, + "loss": 0.3734, + "step": 3389 + }, + { + "epoch": 0.85261569416499, + "grad_norm": 0.33589431643486023, + "learning_rate": 9.001649700130816e-06, + "loss": 0.3743, + "step": 3390 + }, + { + "epoch": 0.8528672032193159, + "grad_norm": 0.3753211498260498, + "learning_rate": 9.000772224290393e-06, + "loss": 0.4133, + "step": 3391 + }, + { + "epoch": 0.8531187122736419, + "grad_norm": 0.34786948561668396, + "learning_rate": 8.999894405808857e-06, + "loss": 0.3791, + "step": 3392 + }, + { + "epoch": 0.8533702213279678, + "grad_norm": 0.3724614977836609, + "learning_rate": 8.99901624476139e-06, + "loss": 0.3819, + "step": 3393 + }, + { + "epoch": 0.8536217303822937, + "grad_norm": 0.34503018856048584, + "learning_rate": 8.998137741223196e-06, + "loss": 0.3858, + "step": 3394 + }, + { + "epoch": 0.8538732394366197, + "grad_norm": 0.37502291798591614, + "learning_rate": 8.99725889526952e-06, + "loss": 0.4026, + "step": 3395 + }, + { + "epoch": 0.8541247484909457, + "grad_norm": 0.3725481927394867, + "learning_rate": 8.996379706975624e-06, + "loss": 0.353, + "step": 3396 + }, + { + "epoch": 0.8543762575452716, + "grad_norm": 0.3500644862651825, + "learning_rate": 8.995500176416809e-06, + "loss": 0.3909, + "step": 3397 + }, + { + "epoch": 0.8546277665995976, + "grad_norm": 0.3630251884460449, + "learning_rate": 8.9946203036684e-06, + "loss": 0.3797, + "step": 3398 + }, + { + "epoch": 0.8548792756539235, + "grad_norm": 0.3816511034965515, + "learning_rate": 8.99374008880575e-06, + "loss": 0.3622, + "step": 3399 + }, + { + "epoch": 0.8551307847082495, + "grad_norm": 0.37829670310020447, + "learning_rate": 8.992859531904247e-06, + "loss": 0.3791, + "step": 3400 + }, + { + "epoch": 0.8553822937625755, + "grad_norm": 0.3598082363605499, + "learning_rate": 8.991978633039305e-06, + "loss": 0.3779, + "step": 3401 + }, + { + "epoch": 0.8556338028169014, + "grad_norm": 0.3571262061595917, + "learning_rate": 8.991097392286368e-06, + "loss": 0.3634, + "step": 3402 + }, + { + "epoch": 0.8558853118712274, + "grad_norm": 0.38512668013572693, + "learning_rate": 8.990215809720905e-06, + "loss": 0.3692, + "step": 3403 + }, + { + "epoch": 0.8561368209255533, + "grad_norm": 0.35630372166633606, + "learning_rate": 8.989333885418423e-06, + "loss": 0.3878, + "step": 3404 + }, + { + "epoch": 0.8563883299798792, + "grad_norm": 0.37692365050315857, + "learning_rate": 8.988451619454449e-06, + "loss": 0.3886, + "step": 3405 + }, + { + "epoch": 0.8566398390342053, + "grad_norm": 0.3339127004146576, + "learning_rate": 8.987569011904547e-06, + "loss": 0.3821, + "step": 3406 + }, + { + "epoch": 0.8568913480885312, + "grad_norm": 0.3581376075744629, + "learning_rate": 8.986686062844303e-06, + "loss": 0.4086, + "step": 3407 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.39820095896720886, + "learning_rate": 8.985802772349341e-06, + "loss": 0.3902, + "step": 3408 + }, + { + "epoch": 0.8573943661971831, + "grad_norm": 0.3543420135974884, + "learning_rate": 8.984919140495305e-06, + "loss": 0.3623, + "step": 3409 + }, + { + "epoch": 0.857645875251509, + "grad_norm": 0.3442290723323822, + "learning_rate": 8.984035167357874e-06, + "loss": 0.3704, + "step": 3410 + }, + { + "epoch": 0.8578973843058351, + "grad_norm": 0.3457908034324646, + "learning_rate": 8.983150853012756e-06, + "loss": 0.3606, + "step": 3411 + }, + { + "epoch": 0.858148893360161, + "grad_norm": 0.36458566784858704, + "learning_rate": 8.982266197535685e-06, + "loss": 0.3841, + "step": 3412 + }, + { + "epoch": 0.8584004024144869, + "grad_norm": 0.3184933364391327, + "learning_rate": 8.981381201002428e-06, + "loss": 0.3728, + "step": 3413 + }, + { + "epoch": 0.8586519114688129, + "grad_norm": 0.3275623917579651, + "learning_rate": 8.98049586348878e-06, + "loss": 0.373, + "step": 3414 + }, + { + "epoch": 0.8589034205231388, + "grad_norm": 0.35574376583099365, + "learning_rate": 8.979610185070562e-06, + "loss": 0.3598, + "step": 3415 + }, + { + "epoch": 0.8591549295774648, + "grad_norm": 0.3995131850242615, + "learning_rate": 8.978724165823626e-06, + "loss": 0.3797, + "step": 3416 + }, + { + "epoch": 0.8594064386317908, + "grad_norm": 0.37157928943634033, + "learning_rate": 8.977837805823856e-06, + "loss": 0.4016, + "step": 3417 + }, + { + "epoch": 0.8596579476861167, + "grad_norm": 0.33222272992134094, + "learning_rate": 8.976951105147167e-06, + "loss": 0.3852, + "step": 3418 + }, + { + "epoch": 0.8599094567404426, + "grad_norm": 0.37010085582733154, + "learning_rate": 8.976064063869493e-06, + "loss": 0.3865, + "step": 3419 + }, + { + "epoch": 0.8601609657947686, + "grad_norm": 0.3240501582622528, + "learning_rate": 8.975176682066805e-06, + "loss": 0.4128, + "step": 3420 + }, + { + "epoch": 0.8604124748490946, + "grad_norm": 0.33426669239997864, + "learning_rate": 8.974288959815105e-06, + "loss": 0.3636, + "step": 3421 + }, + { + "epoch": 0.8606639839034205, + "grad_norm": 0.34247368574142456, + "learning_rate": 8.973400897190418e-06, + "loss": 0.3842, + "step": 3422 + }, + { + "epoch": 0.8609154929577465, + "grad_norm": 0.343246191740036, + "learning_rate": 8.9725124942688e-06, + "loss": 0.3869, + "step": 3423 + }, + { + "epoch": 0.8611670020120724, + "grad_norm": 0.3464204668998718, + "learning_rate": 8.971623751126341e-06, + "loss": 0.387, + "step": 3424 + }, + { + "epoch": 0.8614185110663984, + "grad_norm": 0.3398580551147461, + "learning_rate": 8.970734667839155e-06, + "loss": 0.3775, + "step": 3425 + }, + { + "epoch": 0.8616700201207244, + "grad_norm": 0.3092399835586548, + "learning_rate": 8.969845244483383e-06, + "loss": 0.3597, + "step": 3426 + }, + { + "epoch": 0.8619215291750503, + "grad_norm": 0.37892070412635803, + "learning_rate": 8.968955481135202e-06, + "loss": 0.4005, + "step": 3427 + }, + { + "epoch": 0.8621730382293763, + "grad_norm": 0.33688947558403015, + "learning_rate": 8.968065377870814e-06, + "loss": 0.3724, + "step": 3428 + }, + { + "epoch": 0.8624245472837022, + "grad_norm": 0.35349977016448975, + "learning_rate": 8.967174934766452e-06, + "loss": 0.3535, + "step": 3429 + }, + { + "epoch": 0.8626760563380281, + "grad_norm": 0.34343835711479187, + "learning_rate": 8.966284151898373e-06, + "loss": 0.3627, + "step": 3430 + }, + { + "epoch": 0.8629275653923542, + "grad_norm": 0.34207355976104736, + "learning_rate": 8.965393029342871e-06, + "loss": 0.3745, + "step": 3431 + }, + { + "epoch": 0.8631790744466801, + "grad_norm": 0.3593146502971649, + "learning_rate": 8.964501567176263e-06, + "loss": 0.3759, + "step": 3432 + }, + { + "epoch": 0.863430583501006, + "grad_norm": 0.37325525283813477, + "learning_rate": 8.963609765474897e-06, + "loss": 0.3783, + "step": 3433 + }, + { + "epoch": 0.863682092555332, + "grad_norm": 0.375765860080719, + "learning_rate": 8.96271762431515e-06, + "loss": 0.3963, + "step": 3434 + }, + { + "epoch": 0.8639336016096579, + "grad_norm": 0.3585033714771271, + "learning_rate": 8.96182514377343e-06, + "loss": 0.3527, + "step": 3435 + }, + { + "epoch": 0.864185110663984, + "grad_norm": 0.42358458042144775, + "learning_rate": 8.960932323926172e-06, + "loss": 0.3856, + "step": 3436 + }, + { + "epoch": 0.8644366197183099, + "grad_norm": 0.374779611825943, + "learning_rate": 8.96003916484984e-06, + "loss": 0.3643, + "step": 3437 + }, + { + "epoch": 0.8646881287726358, + "grad_norm": 0.3398546874523163, + "learning_rate": 8.959145666620928e-06, + "loss": 0.368, + "step": 3438 + }, + { + "epoch": 0.8649396378269618, + "grad_norm": 0.36778420209884644, + "learning_rate": 8.958251829315957e-06, + "loss": 0.3526, + "step": 3439 + }, + { + "epoch": 0.8651911468812877, + "grad_norm": 0.34406542778015137, + "learning_rate": 8.957357653011481e-06, + "loss": 0.3614, + "step": 3440 + }, + { + "epoch": 0.8654426559356136, + "grad_norm": 0.3503788411617279, + "learning_rate": 8.956463137784077e-06, + "loss": 0.3757, + "step": 3441 + }, + { + "epoch": 0.8656941649899397, + "grad_norm": 0.3319445550441742, + "learning_rate": 8.955568283710359e-06, + "loss": 0.3972, + "step": 3442 + }, + { + "epoch": 0.8659456740442656, + "grad_norm": 0.39669501781463623, + "learning_rate": 8.95467309086696e-06, + "loss": 0.3801, + "step": 3443 + }, + { + "epoch": 0.8661971830985915, + "grad_norm": 0.3425920605659485, + "learning_rate": 8.953777559330554e-06, + "loss": 0.3563, + "step": 3444 + }, + { + "epoch": 0.8664486921529175, + "grad_norm": 0.335559606552124, + "learning_rate": 8.952881689177834e-06, + "loss": 0.3907, + "step": 3445 + }, + { + "epoch": 0.8667002012072434, + "grad_norm": 0.3417853116989136, + "learning_rate": 8.951985480485528e-06, + "loss": 0.3652, + "step": 3446 + }, + { + "epoch": 0.8669517102615694, + "grad_norm": 0.3930087983608246, + "learning_rate": 8.951088933330387e-06, + "loss": 0.3849, + "step": 3447 + }, + { + "epoch": 0.8672032193158954, + "grad_norm": 0.3572435677051544, + "learning_rate": 8.950192047789198e-06, + "loss": 0.368, + "step": 3448 + }, + { + "epoch": 0.8674547283702213, + "grad_norm": 0.3223349153995514, + "learning_rate": 8.949294823938773e-06, + "loss": 0.4082, + "step": 3449 + }, + { + "epoch": 0.8677062374245473, + "grad_norm": 0.40850120782852173, + "learning_rate": 8.94839726185595e-06, + "loss": 0.3569, + "step": 3450 + }, + { + "epoch": 0.8679577464788732, + "grad_norm": 0.3549629747867584, + "learning_rate": 8.947499361617606e-06, + "loss": 0.3895, + "step": 3451 + }, + { + "epoch": 0.8682092555331992, + "grad_norm": 0.333429753780365, + "learning_rate": 8.946601123300636e-06, + "loss": 0.3784, + "step": 3452 + }, + { + "epoch": 0.8684607645875252, + "grad_norm": 0.34425967931747437, + "learning_rate": 8.94570254698197e-06, + "loss": 0.3644, + "step": 3453 + }, + { + "epoch": 0.8687122736418511, + "grad_norm": 0.4264945089817047, + "learning_rate": 8.944803632738563e-06, + "loss": 0.3787, + "step": 3454 + }, + { + "epoch": 0.868963782696177, + "grad_norm": 0.35965225100517273, + "learning_rate": 8.943904380647406e-06, + "loss": 0.3696, + "step": 3455 + }, + { + "epoch": 0.869215291750503, + "grad_norm": 0.39319103956222534, + "learning_rate": 8.94300479078551e-06, + "loss": 0.3785, + "step": 3456 + }, + { + "epoch": 0.869466800804829, + "grad_norm": 0.3534758985042572, + "learning_rate": 8.942104863229923e-06, + "loss": 0.3616, + "step": 3457 + }, + { + "epoch": 0.8697183098591549, + "grad_norm": 0.3524431586265564, + "learning_rate": 8.941204598057715e-06, + "loss": 0.3972, + "step": 3458 + }, + { + "epoch": 0.8699698189134809, + "grad_norm": 0.36104217171669006, + "learning_rate": 8.940303995345988e-06, + "loss": 0.37, + "step": 3459 + }, + { + "epoch": 0.8702213279678068, + "grad_norm": 0.39354491233825684, + "learning_rate": 8.939403055171877e-06, + "loss": 0.3858, + "step": 3460 + }, + { + "epoch": 0.8704728370221329, + "grad_norm": 0.3502269983291626, + "learning_rate": 8.938501777612538e-06, + "loss": 0.3702, + "step": 3461 + }, + { + "epoch": 0.8707243460764588, + "grad_norm": 0.3592958450317383, + "learning_rate": 8.937600162745159e-06, + "loss": 0.3917, + "step": 3462 + }, + { + "epoch": 0.8709758551307847, + "grad_norm": 0.3925688862800598, + "learning_rate": 8.936698210646962e-06, + "loss": 0.3878, + "step": 3463 + }, + { + "epoch": 0.8712273641851107, + "grad_norm": 0.35481512546539307, + "learning_rate": 8.93579592139519e-06, + "loss": 0.3642, + "step": 3464 + }, + { + "epoch": 0.8714788732394366, + "grad_norm": 0.35621413588523865, + "learning_rate": 8.93489329506712e-06, + "loss": 0.408, + "step": 3465 + }, + { + "epoch": 0.8717303822937625, + "grad_norm": 0.3342260420322418, + "learning_rate": 8.933990331740056e-06, + "loss": 0.3867, + "step": 3466 + }, + { + "epoch": 0.8719818913480886, + "grad_norm": 0.3696461617946625, + "learning_rate": 8.933087031491332e-06, + "loss": 0.3788, + "step": 3467 + }, + { + "epoch": 0.8722334004024145, + "grad_norm": 0.38092076778411865, + "learning_rate": 8.932183394398309e-06, + "loss": 0.375, + "step": 3468 + }, + { + "epoch": 0.8724849094567404, + "grad_norm": 0.34763234853744507, + "learning_rate": 8.931279420538377e-06, + "loss": 0.3722, + "step": 3469 + }, + { + "epoch": 0.8727364185110664, + "grad_norm": 0.3381394147872925, + "learning_rate": 8.930375109988956e-06, + "loss": 0.4076, + "step": 3470 + }, + { + "epoch": 0.8729879275653923, + "grad_norm": 0.34451600909233093, + "learning_rate": 8.929470462827496e-06, + "loss": 0.3787, + "step": 3471 + }, + { + "epoch": 0.8732394366197183, + "grad_norm": 0.33672693371772766, + "learning_rate": 8.928565479131473e-06, + "loss": 0.3889, + "step": 3472 + }, + { + "epoch": 0.8734909456740443, + "grad_norm": 0.3693379759788513, + "learning_rate": 8.927660158978392e-06, + "loss": 0.3873, + "step": 3473 + }, + { + "epoch": 0.8737424547283702, + "grad_norm": 0.32799556851387024, + "learning_rate": 8.926754502445794e-06, + "loss": 0.3619, + "step": 3474 + }, + { + "epoch": 0.8739939637826962, + "grad_norm": 0.36521029472351074, + "learning_rate": 8.925848509611237e-06, + "loss": 0.3807, + "step": 3475 + }, + { + "epoch": 0.8742454728370221, + "grad_norm": 0.3440057933330536, + "learning_rate": 8.924942180552315e-06, + "loss": 0.3797, + "step": 3476 + }, + { + "epoch": 0.8744969818913481, + "grad_norm": 0.35602283477783203, + "learning_rate": 8.924035515346648e-06, + "loss": 0.3678, + "step": 3477 + }, + { + "epoch": 0.8747484909456741, + "grad_norm": 0.34389829635620117, + "learning_rate": 8.923128514071888e-06, + "loss": 0.3683, + "step": 3478 + }, + { + "epoch": 0.875, + "grad_norm": 0.37036365270614624, + "learning_rate": 8.922221176805715e-06, + "loss": 0.3992, + "step": 3479 + }, + { + "epoch": 0.8752515090543259, + "grad_norm": 0.3461644649505615, + "learning_rate": 8.921313503625835e-06, + "loss": 0.3997, + "step": 3480 + }, + { + "epoch": 0.8755030181086519, + "grad_norm": 0.35094374418258667, + "learning_rate": 8.920405494609986e-06, + "loss": 0.3968, + "step": 3481 + }, + { + "epoch": 0.8757545271629779, + "grad_norm": 0.35670819878578186, + "learning_rate": 8.919497149835932e-06, + "loss": 0.3609, + "step": 3482 + }, + { + "epoch": 0.8760060362173038, + "grad_norm": 0.37578085064888, + "learning_rate": 8.918588469381468e-06, + "loss": 0.3756, + "step": 3483 + }, + { + "epoch": 0.8762575452716298, + "grad_norm": 0.35081595182418823, + "learning_rate": 8.917679453324415e-06, + "loss": 0.379, + "step": 3484 + }, + { + "epoch": 0.8765090543259557, + "grad_norm": 0.3629753291606903, + "learning_rate": 8.916770101742627e-06, + "loss": 0.4066, + "step": 3485 + }, + { + "epoch": 0.8767605633802817, + "grad_norm": 0.3397653102874756, + "learning_rate": 8.915860414713981e-06, + "loss": 0.4097, + "step": 3486 + }, + { + "epoch": 0.8770120724346077, + "grad_norm": 0.3421812951564789, + "learning_rate": 8.91495039231639e-06, + "loss": 0.4011, + "step": 3487 + }, + { + "epoch": 0.8772635814889336, + "grad_norm": 0.39413323998451233, + "learning_rate": 8.914040034627788e-06, + "loss": 0.4002, + "step": 3488 + }, + { + "epoch": 0.8775150905432596, + "grad_norm": 0.37600401043891907, + "learning_rate": 8.913129341726144e-06, + "loss": 0.3507, + "step": 3489 + }, + { + "epoch": 0.8777665995975855, + "grad_norm": 0.3659670352935791, + "learning_rate": 8.912218313689453e-06, + "loss": 0.3735, + "step": 3490 + }, + { + "epoch": 0.8780181086519114, + "grad_norm": 0.3923790156841278, + "learning_rate": 8.911306950595737e-06, + "loss": 0.358, + "step": 3491 + }, + { + "epoch": 0.8782696177062375, + "grad_norm": 0.37082934379577637, + "learning_rate": 8.910395252523053e-06, + "loss": 0.392, + "step": 3492 + }, + { + "epoch": 0.8785211267605634, + "grad_norm": 0.4046441912651062, + "learning_rate": 8.909483219549475e-06, + "loss": 0.3697, + "step": 3493 + }, + { + "epoch": 0.8787726358148893, + "grad_norm": 0.3740319311618805, + "learning_rate": 8.908570851753117e-06, + "loss": 0.3507, + "step": 3494 + }, + { + "epoch": 0.8790241448692153, + "grad_norm": 0.368447870016098, + "learning_rate": 8.907658149212119e-06, + "loss": 0.3536, + "step": 3495 + }, + { + "epoch": 0.8792756539235412, + "grad_norm": 0.3541277348995209, + "learning_rate": 8.906745112004646e-06, + "loss": 0.3628, + "step": 3496 + }, + { + "epoch": 0.8795271629778671, + "grad_norm": 0.36320897936820984, + "learning_rate": 8.905831740208896e-06, + "loss": 0.3541, + "step": 3497 + }, + { + "epoch": 0.8797786720321932, + "grad_norm": 0.3422798216342926, + "learning_rate": 8.904918033903091e-06, + "loss": 0.364, + "step": 3498 + }, + { + "epoch": 0.8800301810865191, + "grad_norm": 0.35075971484184265, + "learning_rate": 8.904003993165487e-06, + "loss": 0.3883, + "step": 3499 + }, + { + "epoch": 0.8802816901408451, + "grad_norm": 0.3443789780139923, + "learning_rate": 8.903089618074362e-06, + "loss": 0.3693, + "step": 3500 + }, + { + "epoch": 0.880533199195171, + "grad_norm": 0.3593481481075287, + "learning_rate": 8.902174908708032e-06, + "loss": 0.3749, + "step": 3501 + }, + { + "epoch": 0.880784708249497, + "grad_norm": 0.33803337812423706, + "learning_rate": 8.901259865144831e-06, + "loss": 0.3927, + "step": 3502 + }, + { + "epoch": 0.881036217303823, + "grad_norm": 0.4012623131275177, + "learning_rate": 8.900344487463128e-06, + "loss": 0.393, + "step": 3503 + }, + { + "epoch": 0.8812877263581489, + "grad_norm": 0.35286056995391846, + "learning_rate": 8.899428775741321e-06, + "loss": 0.377, + "step": 3504 + }, + { + "epoch": 0.8815392354124748, + "grad_norm": 0.3573596775531769, + "learning_rate": 8.898512730057835e-06, + "loss": 0.3808, + "step": 3505 + }, + { + "epoch": 0.8817907444668008, + "grad_norm": 0.33580759167671204, + "learning_rate": 8.897596350491122e-06, + "loss": 0.3601, + "step": 3506 + }, + { + "epoch": 0.8820422535211268, + "grad_norm": 0.32967039942741394, + "learning_rate": 8.896679637119665e-06, + "loss": 0.3586, + "step": 3507 + }, + { + "epoch": 0.8822937625754527, + "grad_norm": 0.3663707971572876, + "learning_rate": 8.895762590021973e-06, + "loss": 0.3816, + "step": 3508 + }, + { + "epoch": 0.8825452716297787, + "grad_norm": 0.33888426423072815, + "learning_rate": 8.89484520927659e-06, + "loss": 0.3904, + "step": 3509 + }, + { + "epoch": 0.8827967806841046, + "grad_norm": 0.334952175617218, + "learning_rate": 8.893927494962078e-06, + "loss": 0.3941, + "step": 3510 + }, + { + "epoch": 0.8830482897384306, + "grad_norm": 0.32887038588523865, + "learning_rate": 8.893009447157039e-06, + "loss": 0.3737, + "step": 3511 + }, + { + "epoch": 0.8832997987927566, + "grad_norm": 0.32995015382766724, + "learning_rate": 8.892091065940093e-06, + "loss": 0.3787, + "step": 3512 + }, + { + "epoch": 0.8835513078470825, + "grad_norm": 0.33268022537231445, + "learning_rate": 8.891172351389898e-06, + "loss": 0.3939, + "step": 3513 + }, + { + "epoch": 0.8838028169014085, + "grad_norm": 0.34591397643089294, + "learning_rate": 8.890253303585133e-06, + "loss": 0.3809, + "step": 3514 + }, + { + "epoch": 0.8840543259557344, + "grad_norm": 0.357835054397583, + "learning_rate": 8.88933392260451e-06, + "loss": 0.3565, + "step": 3515 + }, + { + "epoch": 0.8843058350100603, + "grad_norm": 0.3453388214111328, + "learning_rate": 8.888414208526768e-06, + "loss": 0.3891, + "step": 3516 + }, + { + "epoch": 0.8845573440643864, + "grad_norm": 0.35793086886405945, + "learning_rate": 8.887494161430676e-06, + "loss": 0.3938, + "step": 3517 + }, + { + "epoch": 0.8848088531187123, + "grad_norm": 0.32767254114151, + "learning_rate": 8.886573781395028e-06, + "loss": 0.3872, + "step": 3518 + }, + { + "epoch": 0.8850603621730382, + "grad_norm": 0.3634639084339142, + "learning_rate": 8.88565306849865e-06, + "loss": 0.4025, + "step": 3519 + }, + { + "epoch": 0.8853118712273642, + "grad_norm": 0.34274405241012573, + "learning_rate": 8.884732022820396e-06, + "loss": 0.4144, + "step": 3520 + }, + { + "epoch": 0.8855633802816901, + "grad_norm": 0.3598005771636963, + "learning_rate": 8.883810644439146e-06, + "loss": 0.3667, + "step": 3521 + }, + { + "epoch": 0.885814889336016, + "grad_norm": 0.3647115230560303, + "learning_rate": 8.882888933433813e-06, + "loss": 0.3797, + "step": 3522 + }, + { + "epoch": 0.8860663983903421, + "grad_norm": 0.3279248774051666, + "learning_rate": 8.881966889883334e-06, + "loss": 0.3718, + "step": 3523 + }, + { + "epoch": 0.886317907444668, + "grad_norm": 0.39648541808128357, + "learning_rate": 8.881044513866675e-06, + "loss": 0.3783, + "step": 3524 + }, + { + "epoch": 0.886569416498994, + "grad_norm": 0.3662944734096527, + "learning_rate": 8.880121805462834e-06, + "loss": 0.3796, + "step": 3525 + }, + { + "epoch": 0.8868209255533199, + "grad_norm": 0.3248366415500641, + "learning_rate": 8.879198764750834e-06, + "loss": 0.3804, + "step": 3526 + }, + { + "epoch": 0.8870724346076458, + "grad_norm": 0.3559374511241913, + "learning_rate": 8.878275391809727e-06, + "loss": 0.3701, + "step": 3527 + }, + { + "epoch": 0.8873239436619719, + "grad_norm": 0.37350761890411377, + "learning_rate": 8.877351686718596e-06, + "loss": 0.3769, + "step": 3528 + }, + { + "epoch": 0.8875754527162978, + "grad_norm": 0.37377414107322693, + "learning_rate": 8.876427649556549e-06, + "loss": 0.4021, + "step": 3529 + }, + { + "epoch": 0.8878269617706237, + "grad_norm": 0.3348924219608307, + "learning_rate": 8.875503280402727e-06, + "loss": 0.3881, + "step": 3530 + }, + { + "epoch": 0.8880784708249497, + "grad_norm": 0.33580613136291504, + "learning_rate": 8.874578579336293e-06, + "loss": 0.3735, + "step": 3531 + }, + { + "epoch": 0.8883299798792756, + "grad_norm": 0.35645267367362976, + "learning_rate": 8.873653546436442e-06, + "loss": 0.3988, + "step": 3532 + }, + { + "epoch": 0.8885814889336016, + "grad_norm": 0.33889126777648926, + "learning_rate": 8.872728181782399e-06, + "loss": 0.3856, + "step": 3533 + }, + { + "epoch": 0.8888329979879276, + "grad_norm": 0.329007089138031, + "learning_rate": 8.871802485453414e-06, + "loss": 0.37, + "step": 3534 + }, + { + "epoch": 0.8890845070422535, + "grad_norm": 0.31005194783210754, + "learning_rate": 8.87087645752877e-06, + "loss": 0.3784, + "step": 3535 + }, + { + "epoch": 0.8893360160965795, + "grad_norm": 0.3521806299686432, + "learning_rate": 8.86995009808777e-06, + "loss": 0.3909, + "step": 3536 + }, + { + "epoch": 0.8895875251509054, + "grad_norm": 0.3143180310726166, + "learning_rate": 8.869023407209758e-06, + "loss": 0.4024, + "step": 3537 + }, + { + "epoch": 0.8898390342052314, + "grad_norm": 0.32495957612991333, + "learning_rate": 8.868096384974094e-06, + "loss": 0.3827, + "step": 3538 + }, + { + "epoch": 0.8900905432595574, + "grad_norm": 0.30812323093414307, + "learning_rate": 8.867169031460175e-06, + "loss": 0.362, + "step": 3539 + }, + { + "epoch": 0.8903420523138833, + "grad_norm": 0.3465098440647125, + "learning_rate": 8.86624134674742e-06, + "loss": 0.3744, + "step": 3540 + }, + { + "epoch": 0.8905935613682092, + "grad_norm": 0.3428845703601837, + "learning_rate": 8.86531333091528e-06, + "loss": 0.3918, + "step": 3541 + }, + { + "epoch": 0.8908450704225352, + "grad_norm": 0.35687991976737976, + "learning_rate": 8.864384984043234e-06, + "loss": 0.3882, + "step": 3542 + }, + { + "epoch": 0.8910965794768612, + "grad_norm": 0.37335455417633057, + "learning_rate": 8.863456306210793e-06, + "loss": 0.3702, + "step": 3543 + }, + { + "epoch": 0.8913480885311871, + "grad_norm": 0.3432762622833252, + "learning_rate": 8.862527297497488e-06, + "loss": 0.3811, + "step": 3544 + }, + { + "epoch": 0.8915995975855131, + "grad_norm": 0.3809764087200165, + "learning_rate": 8.861597957982881e-06, + "loss": 0.3995, + "step": 3545 + }, + { + "epoch": 0.891851106639839, + "grad_norm": 0.3548218309879303, + "learning_rate": 8.86066828774657e-06, + "loss": 0.3965, + "step": 3546 + }, + { + "epoch": 0.8921026156941649, + "grad_norm": 0.3315393924713135, + "learning_rate": 8.859738286868172e-06, + "loss": 0.374, + "step": 3547 + }, + { + "epoch": 0.892354124748491, + "grad_norm": 0.35290321707725525, + "learning_rate": 8.858807955427335e-06, + "loss": 0.3589, + "step": 3548 + }, + { + "epoch": 0.8926056338028169, + "grad_norm": 0.368217796087265, + "learning_rate": 8.857877293503739e-06, + "loss": 0.3713, + "step": 3549 + }, + { + "epoch": 0.8928571428571429, + "grad_norm": 0.3560136556625366, + "learning_rate": 8.856946301177085e-06, + "loss": 0.3697, + "step": 3550 + }, + { + "epoch": 0.8931086519114688, + "grad_norm": 0.3632422089576721, + "learning_rate": 8.85601497852711e-06, + "loss": 0.3798, + "step": 3551 + }, + { + "epoch": 0.8933601609657947, + "grad_norm": 0.3566484749317169, + "learning_rate": 8.855083325633578e-06, + "loss": 0.3761, + "step": 3552 + }, + { + "epoch": 0.8936116700201208, + "grad_norm": 0.36059099435806274, + "learning_rate": 8.854151342576274e-06, + "loss": 0.3683, + "step": 3553 + }, + { + "epoch": 0.8938631790744467, + "grad_norm": 0.36407434940338135, + "learning_rate": 8.853219029435019e-06, + "loss": 0.3793, + "step": 3554 + }, + { + "epoch": 0.8941146881287726, + "grad_norm": 0.36072081327438354, + "learning_rate": 8.852286386289662e-06, + "loss": 0.3949, + "step": 3555 + }, + { + "epoch": 0.8943661971830986, + "grad_norm": 0.3494257628917694, + "learning_rate": 8.851353413220073e-06, + "loss": 0.3772, + "step": 3556 + }, + { + "epoch": 0.8946177062374245, + "grad_norm": 0.3464342951774597, + "learning_rate": 8.850420110306159e-06, + "loss": 0.4028, + "step": 3557 + }, + { + "epoch": 0.8948692152917505, + "grad_norm": 0.3632872402667999, + "learning_rate": 8.849486477627851e-06, + "loss": 0.378, + "step": 3558 + }, + { + "epoch": 0.8951207243460765, + "grad_norm": 0.3627418875694275, + "learning_rate": 8.848552515265108e-06, + "loss": 0.3887, + "step": 3559 + }, + { + "epoch": 0.8953722334004024, + "grad_norm": 0.3480847477912903, + "learning_rate": 8.84761822329792e-06, + "loss": 0.3831, + "step": 3560 + }, + { + "epoch": 0.8956237424547284, + "grad_norm": 0.3408527374267578, + "learning_rate": 8.8466836018063e-06, + "loss": 0.3716, + "step": 3561 + }, + { + "epoch": 0.8958752515090543, + "grad_norm": 0.35451194643974304, + "learning_rate": 8.845748650870297e-06, + "loss": 0.3733, + "step": 3562 + }, + { + "epoch": 0.8961267605633803, + "grad_norm": 0.3594439625740051, + "learning_rate": 8.844813370569978e-06, + "loss": 0.3822, + "step": 3563 + }, + { + "epoch": 0.8963782696177063, + "grad_norm": 0.33832788467407227, + "learning_rate": 8.843877760985447e-06, + "loss": 0.3858, + "step": 3564 + }, + { + "epoch": 0.8966297786720322, + "grad_norm": 0.33233413100242615, + "learning_rate": 8.842941822196835e-06, + "loss": 0.3995, + "step": 3565 + }, + { + "epoch": 0.8968812877263581, + "grad_norm": 0.3875477612018585, + "learning_rate": 8.842005554284296e-06, + "loss": 0.3922, + "step": 3566 + }, + { + "epoch": 0.8971327967806841, + "grad_norm": 0.32244715094566345, + "learning_rate": 8.841068957328018e-06, + "loss": 0.345, + "step": 3567 + }, + { + "epoch": 0.89738430583501, + "grad_norm": 0.36826932430267334, + "learning_rate": 8.84013203140821e-06, + "loss": 0.3665, + "step": 3568 + }, + { + "epoch": 0.897635814889336, + "grad_norm": 0.3594595789909363, + "learning_rate": 8.839194776605121e-06, + "loss": 0.3917, + "step": 3569 + }, + { + "epoch": 0.897887323943662, + "grad_norm": 0.31721267104148865, + "learning_rate": 8.838257192999016e-06, + "loss": 0.3672, + "step": 3570 + }, + { + "epoch": 0.8981388329979879, + "grad_norm": 0.37041252851486206, + "learning_rate": 8.837319280670196e-06, + "loss": 0.3852, + "step": 3571 + }, + { + "epoch": 0.8983903420523138, + "grad_norm": 0.3686506152153015, + "learning_rate": 8.836381039698983e-06, + "loss": 0.3628, + "step": 3572 + }, + { + "epoch": 0.8986418511066399, + "grad_norm": 0.3326662480831146, + "learning_rate": 8.835442470165736e-06, + "loss": 0.3954, + "step": 3573 + }, + { + "epoch": 0.8988933601609658, + "grad_norm": 0.35208654403686523, + "learning_rate": 8.834503572150835e-06, + "loss": 0.3983, + "step": 3574 + }, + { + "epoch": 0.8991448692152918, + "grad_norm": 0.34213465452194214, + "learning_rate": 8.833564345734693e-06, + "loss": 0.3492, + "step": 3575 + }, + { + "epoch": 0.8993963782696177, + "grad_norm": 0.3441331684589386, + "learning_rate": 8.832624790997747e-06, + "loss": 0.3914, + "step": 3576 + }, + { + "epoch": 0.8996478873239436, + "grad_norm": 0.33155056834220886, + "learning_rate": 8.831684908020463e-06, + "loss": 0.4002, + "step": 3577 + }, + { + "epoch": 0.8998993963782697, + "grad_norm": 0.36803004145622253, + "learning_rate": 8.83074469688334e-06, + "loss": 0.3868, + "step": 3578 + }, + { + "epoch": 0.9001509054325956, + "grad_norm": 0.35357141494750977, + "learning_rate": 8.829804157666896e-06, + "loss": 0.3788, + "step": 3579 + }, + { + "epoch": 0.9004024144869215, + "grad_norm": 0.3572876751422882, + "learning_rate": 8.828863290451689e-06, + "loss": 0.414, + "step": 3580 + }, + { + "epoch": 0.9006539235412475, + "grad_norm": 0.3837227523326874, + "learning_rate": 8.82792209531829e-06, + "loss": 0.3958, + "step": 3581 + }, + { + "epoch": 0.9009054325955734, + "grad_norm": 0.3402830958366394, + "learning_rate": 8.826980572347314e-06, + "loss": 0.3599, + "step": 3582 + }, + { + "epoch": 0.9011569416498993, + "grad_norm": 0.41772645711898804, + "learning_rate": 8.826038721619393e-06, + "loss": 0.3844, + "step": 3583 + }, + { + "epoch": 0.9014084507042254, + "grad_norm": 0.36479735374450684, + "learning_rate": 8.82509654321519e-06, + "loss": 0.3924, + "step": 3584 + }, + { + "epoch": 0.9016599597585513, + "grad_norm": 0.3720943033695221, + "learning_rate": 8.824154037215399e-06, + "loss": 0.3861, + "step": 3585 + }, + { + "epoch": 0.9019114688128773, + "grad_norm": 0.3853396475315094, + "learning_rate": 8.823211203700738e-06, + "loss": 0.4013, + "step": 3586 + }, + { + "epoch": 0.9021629778672032, + "grad_norm": 0.3603013753890991, + "learning_rate": 8.822268042751956e-06, + "loss": 0.379, + "step": 3587 + }, + { + "epoch": 0.9024144869215291, + "grad_norm": 0.3663939833641052, + "learning_rate": 8.821324554449826e-06, + "loss": 0.3767, + "step": 3588 + }, + { + "epoch": 0.9026659959758552, + "grad_norm": 0.3621923327445984, + "learning_rate": 8.820380738875156e-06, + "loss": 0.3731, + "step": 3589 + }, + { + "epoch": 0.9029175050301811, + "grad_norm": 0.333791583776474, + "learning_rate": 8.819436596108775e-06, + "loss": 0.384, + "step": 3590 + }, + { + "epoch": 0.903169014084507, + "grad_norm": 0.36841997504234314, + "learning_rate": 8.818492126231545e-06, + "loss": 0.3656, + "step": 3591 + }, + { + "epoch": 0.903420523138833, + "grad_norm": 0.35295945405960083, + "learning_rate": 8.817547329324352e-06, + "loss": 0.3935, + "step": 3592 + }, + { + "epoch": 0.903672032193159, + "grad_norm": 0.3341405987739563, + "learning_rate": 8.816602205468113e-06, + "loss": 0.3899, + "step": 3593 + }, + { + "epoch": 0.9039235412474849, + "grad_norm": 0.32092586159706116, + "learning_rate": 8.815656754743772e-06, + "loss": 0.3715, + "step": 3594 + }, + { + "epoch": 0.9041750503018109, + "grad_norm": 0.3715393841266632, + "learning_rate": 8.814710977232299e-06, + "loss": 0.3753, + "step": 3595 + }, + { + "epoch": 0.9044265593561368, + "grad_norm": 0.35690295696258545, + "learning_rate": 8.813764873014697e-06, + "loss": 0.3926, + "step": 3596 + }, + { + "epoch": 0.9046780684104627, + "grad_norm": 0.3435974419116974, + "learning_rate": 8.812818442171994e-06, + "loss": 0.3801, + "step": 3597 + }, + { + "epoch": 0.9049295774647887, + "grad_norm": 0.3631943166255951, + "learning_rate": 8.811871684785242e-06, + "loss": 0.3926, + "step": 3598 + }, + { + "epoch": 0.9051810865191147, + "grad_norm": 0.3382608890533447, + "learning_rate": 8.810924600935527e-06, + "loss": 0.3677, + "step": 3599 + }, + { + "epoch": 0.9054325955734407, + "grad_norm": 0.3581875264644623, + "learning_rate": 8.809977190703961e-06, + "loss": 0.367, + "step": 3600 + }, + { + "epoch": 0.9056841046277666, + "grad_norm": 0.3534909784793854, + "learning_rate": 8.809029454171684e-06, + "loss": 0.424, + "step": 3601 + }, + { + "epoch": 0.9059356136820925, + "grad_norm": 0.3113289475440979, + "learning_rate": 8.808081391419865e-06, + "loss": 0.3781, + "step": 3602 + }, + { + "epoch": 0.9061871227364185, + "grad_norm": 0.39388152956962585, + "learning_rate": 8.807133002529697e-06, + "loss": 0.4189, + "step": 3603 + }, + { + "epoch": 0.9064386317907445, + "grad_norm": 0.3407596945762634, + "learning_rate": 8.806184287582404e-06, + "loss": 0.3558, + "step": 3604 + }, + { + "epoch": 0.9066901408450704, + "grad_norm": 0.38331282138824463, + "learning_rate": 8.80523524665924e-06, + "loss": 0.3851, + "step": 3605 + }, + { + "epoch": 0.9069416498993964, + "grad_norm": 0.3627070188522339, + "learning_rate": 8.804285879841481e-06, + "loss": 0.3898, + "step": 3606 + }, + { + "epoch": 0.9071931589537223, + "grad_norm": 0.31576594710350037, + "learning_rate": 8.803336187210437e-06, + "loss": 0.3956, + "step": 3607 + }, + { + "epoch": 0.9074446680080482, + "grad_norm": 0.38593655824661255, + "learning_rate": 8.802386168847442e-06, + "loss": 0.3779, + "step": 3608 + }, + { + "epoch": 0.9076961770623743, + "grad_norm": 0.36420145630836487, + "learning_rate": 8.80143582483386e-06, + "loss": 0.3473, + "step": 3609 + }, + { + "epoch": 0.9079476861167002, + "grad_norm": 0.3500615656375885, + "learning_rate": 8.800485155251079e-06, + "loss": 0.3624, + "step": 3610 + }, + { + "epoch": 0.9081991951710262, + "grad_norm": 0.3793070614337921, + "learning_rate": 8.799534160180521e-06, + "loss": 0.3848, + "step": 3611 + }, + { + "epoch": 0.9084507042253521, + "grad_norm": 0.34537017345428467, + "learning_rate": 8.798582839703634e-06, + "loss": 0.3851, + "step": 3612 + }, + { + "epoch": 0.908702213279678, + "grad_norm": 0.3755810260772705, + "learning_rate": 8.797631193901888e-06, + "loss": 0.3578, + "step": 3613 + }, + { + "epoch": 0.9089537223340041, + "grad_norm": 0.36311477422714233, + "learning_rate": 8.79667922285679e-06, + "loss": 0.4059, + "step": 3614 + }, + { + "epoch": 0.90920523138833, + "grad_norm": 0.3528873920440674, + "learning_rate": 8.795726926649867e-06, + "loss": 0.3788, + "step": 3615 + }, + { + "epoch": 0.9094567404426559, + "grad_norm": 0.3925495445728302, + "learning_rate": 8.794774305362679e-06, + "loss": 0.3891, + "step": 3616 + }, + { + "epoch": 0.9097082494969819, + "grad_norm": 0.3574492931365967, + "learning_rate": 8.793821359076814e-06, + "loss": 0.4097, + "step": 3617 + }, + { + "epoch": 0.9099597585513078, + "grad_norm": 0.3412817716598511, + "learning_rate": 8.79286808787388e-06, + "loss": 0.3801, + "step": 3618 + }, + { + "epoch": 0.9102112676056338, + "grad_norm": 0.38012850284576416, + "learning_rate": 8.791914491835525e-06, + "loss": 0.3801, + "step": 3619 + }, + { + "epoch": 0.9104627766599598, + "grad_norm": 0.3861580789089203, + "learning_rate": 8.790960571043416e-06, + "loss": 0.4157, + "step": 3620 + }, + { + "epoch": 0.9107142857142857, + "grad_norm": 0.36277568340301514, + "learning_rate": 8.79000632557925e-06, + "loss": 0.3985, + "step": 3621 + }, + { + "epoch": 0.9109657947686117, + "grad_norm": 0.39433610439300537, + "learning_rate": 8.789051755524752e-06, + "loss": 0.4042, + "step": 3622 + }, + { + "epoch": 0.9112173038229376, + "grad_norm": 0.34599095582962036, + "learning_rate": 8.788096860961674e-06, + "loss": 0.3646, + "step": 3623 + }, + { + "epoch": 0.9114688128772636, + "grad_norm": 0.35807907581329346, + "learning_rate": 8.7871416419718e-06, + "loss": 0.3742, + "step": 3624 + }, + { + "epoch": 0.9117203219315896, + "grad_norm": 0.4068434536457062, + "learning_rate": 8.786186098636935e-06, + "loss": 0.3949, + "step": 3625 + }, + { + "epoch": 0.9119718309859155, + "grad_norm": 0.34174662828445435, + "learning_rate": 8.78523023103892e-06, + "loss": 0.3641, + "step": 3626 + }, + { + "epoch": 0.9122233400402414, + "grad_norm": 0.35758674144744873, + "learning_rate": 8.784274039259613e-06, + "loss": 0.3822, + "step": 3627 + }, + { + "epoch": 0.9124748490945674, + "grad_norm": 0.3813161849975586, + "learning_rate": 8.78331752338091e-06, + "loss": 0.398, + "step": 3628 + }, + { + "epoch": 0.9127263581488934, + "grad_norm": 0.37230536341667175, + "learning_rate": 8.78236068348473e-06, + "loss": 0.3727, + "step": 3629 + }, + { + "epoch": 0.9129778672032193, + "grad_norm": 0.3861142098903656, + "learning_rate": 8.781403519653018e-06, + "loss": 0.3595, + "step": 3630 + }, + { + "epoch": 0.9132293762575453, + "grad_norm": 0.35886576771736145, + "learning_rate": 8.780446031967753e-06, + "loss": 0.3457, + "step": 3631 + }, + { + "epoch": 0.9134808853118712, + "grad_norm": 0.4137284755706787, + "learning_rate": 8.779488220510935e-06, + "loss": 0.393, + "step": 3632 + }, + { + "epoch": 0.9137323943661971, + "grad_norm": 0.3399023115634918, + "learning_rate": 8.778530085364595e-06, + "loss": 0.3529, + "step": 3633 + }, + { + "epoch": 0.9139839034205232, + "grad_norm": 0.35265326499938965, + "learning_rate": 8.777571626610793e-06, + "loss": 0.3613, + "step": 3634 + }, + { + "epoch": 0.9142354124748491, + "grad_norm": 0.38770735263824463, + "learning_rate": 8.776612844331611e-06, + "loss": 0.3949, + "step": 3635 + }, + { + "epoch": 0.9144869215291751, + "grad_norm": 0.33681389689445496, + "learning_rate": 8.775653738609167e-06, + "loss": 0.3873, + "step": 3636 + }, + { + "epoch": 0.914738430583501, + "grad_norm": 0.36732831597328186, + "learning_rate": 8.7746943095256e-06, + "loss": 0.3771, + "step": 3637 + }, + { + "epoch": 0.9149899396378269, + "grad_norm": 0.35177841782569885, + "learning_rate": 8.77373455716308e-06, + "loss": 0.3578, + "step": 3638 + }, + { + "epoch": 0.915241448692153, + "grad_norm": 0.342661589384079, + "learning_rate": 8.772774481603805e-06, + "loss": 0.3895, + "step": 3639 + }, + { + "epoch": 0.9154929577464789, + "grad_norm": 0.3410513699054718, + "learning_rate": 8.771814082929997e-06, + "loss": 0.369, + "step": 3640 + }, + { + "epoch": 0.9157444668008048, + "grad_norm": 0.3464511036872864, + "learning_rate": 8.77085336122391e-06, + "loss": 0.3705, + "step": 3641 + }, + { + "epoch": 0.9159959758551308, + "grad_norm": 0.35824257135391235, + "learning_rate": 8.76989231656782e-06, + "loss": 0.3974, + "step": 3642 + }, + { + "epoch": 0.9162474849094567, + "grad_norm": 0.3505759835243225, + "learning_rate": 8.768930949044041e-06, + "loss": 0.3966, + "step": 3643 + }, + { + "epoch": 0.9164989939637826, + "grad_norm": 0.347253680229187, + "learning_rate": 8.767969258734903e-06, + "loss": 0.4126, + "step": 3644 + }, + { + "epoch": 0.9167505030181087, + "grad_norm": 0.36433228850364685, + "learning_rate": 8.767007245722769e-06, + "loss": 0.3819, + "step": 3645 + }, + { + "epoch": 0.9170020120724346, + "grad_norm": 0.34045785665512085, + "learning_rate": 8.766044910090033e-06, + "loss": 0.3647, + "step": 3646 + }, + { + "epoch": 0.9172535211267606, + "grad_norm": 0.38475969433784485, + "learning_rate": 8.76508225191911e-06, + "loss": 0.4221, + "step": 3647 + }, + { + "epoch": 0.9175050301810865, + "grad_norm": 0.3489360809326172, + "learning_rate": 8.764119271292446e-06, + "loss": 0.3675, + "step": 3648 + }, + { + "epoch": 0.9177565392354124, + "grad_norm": 0.33245396614074707, + "learning_rate": 8.763155968292517e-06, + "loss": 0.3854, + "step": 3649 + }, + { + "epoch": 0.9180080482897385, + "grad_norm": 0.3598446547985077, + "learning_rate": 8.762192343001818e-06, + "loss": 0.3766, + "step": 3650 + }, + { + "epoch": 0.9182595573440644, + "grad_norm": 0.3599037528038025, + "learning_rate": 8.761228395502883e-06, + "loss": 0.3813, + "step": 3651 + }, + { + "epoch": 0.9185110663983903, + "grad_norm": 0.3316107392311096, + "learning_rate": 8.760264125878266e-06, + "loss": 0.383, + "step": 3652 + }, + { + "epoch": 0.9187625754527163, + "grad_norm": 0.3711565136909485, + "learning_rate": 8.75929953421055e-06, + "loss": 0.3962, + "step": 3653 + }, + { + "epoch": 0.9190140845070423, + "grad_norm": 0.34968239068984985, + "learning_rate": 8.758334620582346e-06, + "loss": 0.3993, + "step": 3654 + }, + { + "epoch": 0.9192655935613682, + "grad_norm": 0.327274888753891, + "learning_rate": 8.757369385076296e-06, + "loss": 0.4015, + "step": 3655 + }, + { + "epoch": 0.9195171026156942, + "grad_norm": 0.38763725757598877, + "learning_rate": 8.756403827775063e-06, + "loss": 0.3636, + "step": 3656 + }, + { + "epoch": 0.9197686116700201, + "grad_norm": 0.3284519612789154, + "learning_rate": 8.755437948761344e-06, + "loss": 0.3446, + "step": 3657 + }, + { + "epoch": 0.920020120724346, + "grad_norm": 0.34053710103034973, + "learning_rate": 8.754471748117857e-06, + "loss": 0.3627, + "step": 3658 + }, + { + "epoch": 0.920271629778672, + "grad_norm": 0.3225838243961334, + "learning_rate": 8.753505225927352e-06, + "loss": 0.3911, + "step": 3659 + }, + { + "epoch": 0.920523138832998, + "grad_norm": 0.36908066272735596, + "learning_rate": 8.752538382272608e-06, + "loss": 0.3626, + "step": 3660 + }, + { + "epoch": 0.920774647887324, + "grad_norm": 0.3289790749549866, + "learning_rate": 8.751571217236426e-06, + "loss": 0.4104, + "step": 3661 + }, + { + "epoch": 0.9210261569416499, + "grad_norm": 0.35275307297706604, + "learning_rate": 8.75060373090164e-06, + "loss": 0.3679, + "step": 3662 + }, + { + "epoch": 0.9212776659959758, + "grad_norm": 0.381730318069458, + "learning_rate": 8.749635923351108e-06, + "loss": 0.3927, + "step": 3663 + }, + { + "epoch": 0.9215291750503019, + "grad_norm": 0.36701148748397827, + "learning_rate": 8.748667794667715e-06, + "loss": 0.3881, + "step": 3664 + }, + { + "epoch": 0.9217806841046278, + "grad_norm": 0.3680243492126465, + "learning_rate": 8.74769934493438e-06, + "loss": 0.3805, + "step": 3665 + }, + { + "epoch": 0.9220321931589537, + "grad_norm": 0.344265878200531, + "learning_rate": 8.74673057423404e-06, + "loss": 0.3879, + "step": 3666 + }, + { + "epoch": 0.9222837022132797, + "grad_norm": 0.3326249122619629, + "learning_rate": 8.745761482649667e-06, + "loss": 0.3726, + "step": 3667 + }, + { + "epoch": 0.9225352112676056, + "grad_norm": 0.3622082769870758, + "learning_rate": 8.744792070264254e-06, + "loss": 0.3893, + "step": 3668 + }, + { + "epoch": 0.9227867203219315, + "grad_norm": 0.4040569067001343, + "learning_rate": 8.743822337160829e-06, + "loss": 0.3559, + "step": 3669 + }, + { + "epoch": 0.9230382293762576, + "grad_norm": 0.34916210174560547, + "learning_rate": 8.742852283422443e-06, + "loss": 0.3637, + "step": 3670 + }, + { + "epoch": 0.9232897384305835, + "grad_norm": 0.34636667370796204, + "learning_rate": 8.741881909132171e-06, + "loss": 0.3768, + "step": 3671 + }, + { + "epoch": 0.9235412474849095, + "grad_norm": 0.344542533159256, + "learning_rate": 8.740911214373125e-06, + "loss": 0.3727, + "step": 3672 + }, + { + "epoch": 0.9237927565392354, + "grad_norm": 0.37723875045776367, + "learning_rate": 8.739940199228436e-06, + "loss": 0.3802, + "step": 3673 + }, + { + "epoch": 0.9240442655935613, + "grad_norm": 0.3496474027633667, + "learning_rate": 8.738968863781267e-06, + "loss": 0.3946, + "step": 3674 + }, + { + "epoch": 0.9242957746478874, + "grad_norm": 0.31747496128082275, + "learning_rate": 8.737997208114806e-06, + "loss": 0.3364, + "step": 3675 + }, + { + "epoch": 0.9245472837022133, + "grad_norm": 0.36128780245780945, + "learning_rate": 8.737025232312267e-06, + "loss": 0.3646, + "step": 3676 + }, + { + "epoch": 0.9247987927565392, + "grad_norm": 0.3360286056995392, + "learning_rate": 8.736052936456897e-06, + "loss": 0.3604, + "step": 3677 + }, + { + "epoch": 0.9250503018108652, + "grad_norm": 0.37256118655204773, + "learning_rate": 8.735080320631966e-06, + "loss": 0.3742, + "step": 3678 + }, + { + "epoch": 0.9253018108651911, + "grad_norm": 0.3662985861301422, + "learning_rate": 8.734107384920771e-06, + "loss": 0.4107, + "step": 3679 + }, + { + "epoch": 0.9255533199195171, + "grad_norm": 0.37539082765579224, + "learning_rate": 8.733134129406638e-06, + "loss": 0.3769, + "step": 3680 + }, + { + "epoch": 0.9258048289738431, + "grad_norm": 0.379936546087265, + "learning_rate": 8.732160554172923e-06, + "loss": 0.3548, + "step": 3681 + }, + { + "epoch": 0.926056338028169, + "grad_norm": 0.3667657673358917, + "learning_rate": 8.731186659303004e-06, + "loss": 0.3804, + "step": 3682 + }, + { + "epoch": 0.9263078470824949, + "grad_norm": 0.374489963054657, + "learning_rate": 8.73021244488029e-06, + "loss": 0.3722, + "step": 3683 + }, + { + "epoch": 0.9265593561368209, + "grad_norm": 0.37897610664367676, + "learning_rate": 8.729237910988218e-06, + "loss": 0.4052, + "step": 3684 + }, + { + "epoch": 0.9268108651911469, + "grad_norm": 0.332736611366272, + "learning_rate": 8.728263057710247e-06, + "loss": 0.3906, + "step": 3685 + }, + { + "epoch": 0.9270623742454729, + "grad_norm": 0.37332481145858765, + "learning_rate": 8.72728788512987e-06, + "loss": 0.3862, + "step": 3686 + }, + { + "epoch": 0.9273138832997988, + "grad_norm": 0.33331403136253357, + "learning_rate": 8.726312393330602e-06, + "loss": 0.3965, + "step": 3687 + }, + { + "epoch": 0.9275653923541247, + "grad_norm": 0.3460955321788788, + "learning_rate": 8.72533658239599e-06, + "loss": 0.3946, + "step": 3688 + }, + { + "epoch": 0.9278169014084507, + "grad_norm": 0.3659730553627014, + "learning_rate": 8.724360452409606e-06, + "loss": 0.3875, + "step": 3689 + }, + { + "epoch": 0.9280684104627767, + "grad_norm": 0.33877822756767273, + "learning_rate": 8.723384003455049e-06, + "loss": 0.3761, + "step": 3690 + }, + { + "epoch": 0.9283199195171026, + "grad_norm": 0.34681734442710876, + "learning_rate": 8.722407235615944e-06, + "loss": 0.3584, + "step": 3691 + }, + { + "epoch": 0.9285714285714286, + "grad_norm": 0.3557383716106415, + "learning_rate": 8.721430148975946e-06, + "loss": 0.39, + "step": 3692 + }, + { + "epoch": 0.9288229376257545, + "grad_norm": 0.3659672141075134, + "learning_rate": 8.72045274361874e-06, + "loss": 0.3935, + "step": 3693 + }, + { + "epoch": 0.9290744466800804, + "grad_norm": 0.3567541241645813, + "learning_rate": 8.71947501962803e-06, + "loss": 0.3621, + "step": 3694 + }, + { + "epoch": 0.9293259557344065, + "grad_norm": 0.3142370581626892, + "learning_rate": 8.718496977087554e-06, + "loss": 0.3623, + "step": 3695 + }, + { + "epoch": 0.9295774647887324, + "grad_norm": 0.3656524121761322, + "learning_rate": 8.717518616081073e-06, + "loss": 0.3803, + "step": 3696 + }, + { + "epoch": 0.9298289738430584, + "grad_norm": 0.3680150508880615, + "learning_rate": 8.716539936692381e-06, + "loss": 0.3862, + "step": 3697 + }, + { + "epoch": 0.9300804828973843, + "grad_norm": 0.32054194808006287, + "learning_rate": 8.715560939005293e-06, + "loss": 0.401, + "step": 3698 + }, + { + "epoch": 0.9303319919517102, + "grad_norm": 0.3696306049823761, + "learning_rate": 8.714581623103654e-06, + "loss": 0.3615, + "step": 3699 + }, + { + "epoch": 0.9305835010060363, + "grad_norm": 0.3776056170463562, + "learning_rate": 8.713601989071342e-06, + "loss": 0.3761, + "step": 3700 + }, + { + "epoch": 0.9308350100603622, + "grad_norm": 0.3336465656757355, + "learning_rate": 8.712622036992248e-06, + "loss": 0.3806, + "step": 3701 + }, + { + "epoch": 0.9310865191146881, + "grad_norm": 0.3547345995903015, + "learning_rate": 8.711641766950302e-06, + "loss": 0.3758, + "step": 3702 + }, + { + "epoch": 0.9313380281690141, + "grad_norm": 0.3893287181854248, + "learning_rate": 8.710661179029461e-06, + "loss": 0.3892, + "step": 3703 + }, + { + "epoch": 0.93158953722334, + "grad_norm": 0.36249130964279175, + "learning_rate": 8.709680273313703e-06, + "loss": 0.3726, + "step": 3704 + }, + { + "epoch": 0.931841046277666, + "grad_norm": 0.35625818371772766, + "learning_rate": 8.708699049887038e-06, + "loss": 0.3782, + "step": 3705 + }, + { + "epoch": 0.932092555331992, + "grad_norm": 0.3533684313297272, + "learning_rate": 8.707717508833499e-06, + "loss": 0.4038, + "step": 3706 + }, + { + "epoch": 0.9323440643863179, + "grad_norm": 0.3416036367416382, + "learning_rate": 8.706735650237153e-06, + "loss": 0.3865, + "step": 3707 + }, + { + "epoch": 0.9325955734406438, + "grad_norm": 0.3354741036891937, + "learning_rate": 8.705753474182085e-06, + "loss": 0.3464, + "step": 3708 + }, + { + "epoch": 0.9328470824949698, + "grad_norm": 0.37271448969841003, + "learning_rate": 8.704770980752417e-06, + "loss": 0.3944, + "step": 3709 + }, + { + "epoch": 0.9330985915492958, + "grad_norm": 0.345333993434906, + "learning_rate": 8.703788170032293e-06, + "loss": 0.3661, + "step": 3710 + }, + { + "epoch": 0.9333501006036218, + "grad_norm": 0.39637884497642517, + "learning_rate": 8.70280504210588e-06, + "loss": 0.3813, + "step": 3711 + }, + { + "epoch": 0.9336016096579477, + "grad_norm": 0.35547831654548645, + "learning_rate": 8.701821597057381e-06, + "loss": 0.4037, + "step": 3712 + }, + { + "epoch": 0.9338531187122736, + "grad_norm": 0.3298320472240448, + "learning_rate": 8.700837834971021e-06, + "loss": 0.3871, + "step": 3713 + }, + { + "epoch": 0.9341046277665996, + "grad_norm": 0.3607611358165741, + "learning_rate": 8.699853755931053e-06, + "loss": 0.3879, + "step": 3714 + }, + { + "epoch": 0.9343561368209256, + "grad_norm": 0.3281857967376709, + "learning_rate": 8.698869360021755e-06, + "loss": 0.3806, + "step": 3715 + }, + { + "epoch": 0.9346076458752515, + "grad_norm": 0.3486366271972656, + "learning_rate": 8.697884647327438e-06, + "loss": 0.3984, + "step": 3716 + }, + { + "epoch": 0.9348591549295775, + "grad_norm": 0.32354462146759033, + "learning_rate": 8.696899617932436e-06, + "loss": 0.3889, + "step": 3717 + }, + { + "epoch": 0.9351106639839034, + "grad_norm": 0.3479534387588501, + "learning_rate": 8.69591427192111e-06, + "loss": 0.3921, + "step": 3718 + }, + { + "epoch": 0.9353621730382293, + "grad_norm": 0.3232344090938568, + "learning_rate": 8.694928609377844e-06, + "loss": 0.3755, + "step": 3719 + }, + { + "epoch": 0.9356136820925554, + "grad_norm": 0.35283714532852173, + "learning_rate": 8.69394263038706e-06, + "loss": 0.3604, + "step": 3720 + }, + { + "epoch": 0.9358651911468813, + "grad_norm": 0.35240834951400757, + "learning_rate": 8.6929563350332e-06, + "loss": 0.3754, + "step": 3721 + }, + { + "epoch": 0.9361167002012073, + "grad_norm": 0.34090203046798706, + "learning_rate": 8.691969723400732e-06, + "loss": 0.4053, + "step": 3722 + }, + { + "epoch": 0.9363682092555332, + "grad_norm": 0.389652281999588, + "learning_rate": 8.690982795574155e-06, + "loss": 0.3918, + "step": 3723 + }, + { + "epoch": 0.9366197183098591, + "grad_norm": 0.3855276107788086, + "learning_rate": 8.689995551637992e-06, + "loss": 0.3933, + "step": 3724 + }, + { + "epoch": 0.9368712273641852, + "grad_norm": 0.3753201365470886, + "learning_rate": 8.689007991676795e-06, + "loss": 0.37, + "step": 3725 + }, + { + "epoch": 0.9371227364185111, + "grad_norm": 0.34677380323410034, + "learning_rate": 8.68802011577514e-06, + "loss": 0.3988, + "step": 3726 + }, + { + "epoch": 0.937374245472837, + "grad_norm": 0.3636135458946228, + "learning_rate": 8.687031924017635e-06, + "loss": 0.3825, + "step": 3727 + }, + { + "epoch": 0.937625754527163, + "grad_norm": 0.3638603389263153, + "learning_rate": 8.686043416488913e-06, + "loss": 0.3959, + "step": 3728 + }, + { + "epoch": 0.9378772635814889, + "grad_norm": 0.40067726373672485, + "learning_rate": 8.685054593273631e-06, + "loss": 0.3546, + "step": 3729 + }, + { + "epoch": 0.9381287726358148, + "grad_norm": 0.34523940086364746, + "learning_rate": 8.684065454456478e-06, + "loss": 0.3723, + "step": 3730 + }, + { + "epoch": 0.9383802816901409, + "grad_norm": 0.3261178135871887, + "learning_rate": 8.683076000122165e-06, + "loss": 0.3744, + "step": 3731 + }, + { + "epoch": 0.9386317907444668, + "grad_norm": 0.4006332457065582, + "learning_rate": 8.682086230355432e-06, + "loss": 0.357, + "step": 3732 + }, + { + "epoch": 0.9388832997987927, + "grad_norm": 0.33518972992897034, + "learning_rate": 8.681096145241053e-06, + "loss": 0.3654, + "step": 3733 + }, + { + "epoch": 0.9391348088531187, + "grad_norm": 0.3528534770011902, + "learning_rate": 8.680105744863817e-06, + "loss": 0.3879, + "step": 3734 + }, + { + "epoch": 0.9393863179074446, + "grad_norm": 0.3312220573425293, + "learning_rate": 8.679115029308543e-06, + "loss": 0.3973, + "step": 3735 + }, + { + "epoch": 0.9396378269617707, + "grad_norm": 0.38453420996665955, + "learning_rate": 8.678123998660087e-06, + "loss": 0.3724, + "step": 3736 + }, + { + "epoch": 0.9398893360160966, + "grad_norm": 0.3209126889705658, + "learning_rate": 8.677132653003318e-06, + "loss": 0.3678, + "step": 3737 + }, + { + "epoch": 0.9401408450704225, + "grad_norm": 0.39340800046920776, + "learning_rate": 8.676140992423143e-06, + "loss": 0.3451, + "step": 3738 + }, + { + "epoch": 0.9403923541247485, + "grad_norm": 0.38696199655532837, + "learning_rate": 8.67514901700449e-06, + "loss": 0.4004, + "step": 3739 + }, + { + "epoch": 0.9406438631790744, + "grad_norm": 0.35889217257499695, + "learning_rate": 8.674156726832315e-06, + "loss": 0.3793, + "step": 3740 + }, + { + "epoch": 0.9408953722334004, + "grad_norm": 0.403751015663147, + "learning_rate": 8.673164121991601e-06, + "loss": 0.4051, + "step": 3741 + }, + { + "epoch": 0.9411468812877264, + "grad_norm": 0.43993040919303894, + "learning_rate": 8.672171202567359e-06, + "loss": 0.3685, + "step": 3742 + }, + { + "epoch": 0.9413983903420523, + "grad_norm": 0.3885231614112854, + "learning_rate": 8.671177968644628e-06, + "loss": 0.3809, + "step": 3743 + }, + { + "epoch": 0.9416498993963782, + "grad_norm": 0.3906707167625427, + "learning_rate": 8.67018442030847e-06, + "loss": 0.3877, + "step": 3744 + }, + { + "epoch": 0.9419014084507042, + "grad_norm": 0.35688477754592896, + "learning_rate": 8.669190557643977e-06, + "loss": 0.3578, + "step": 3745 + }, + { + "epoch": 0.9421529175050302, + "grad_norm": 0.37239667773246765, + "learning_rate": 8.668196380736267e-06, + "loss": 0.3765, + "step": 3746 + }, + { + "epoch": 0.9424044265593562, + "grad_norm": 0.38432252407073975, + "learning_rate": 8.667201889670485e-06, + "loss": 0.375, + "step": 3747 + }, + { + "epoch": 0.9426559356136821, + "grad_norm": 0.36130064725875854, + "learning_rate": 8.666207084531804e-06, + "loss": 0.3808, + "step": 3748 + }, + { + "epoch": 0.942907444668008, + "grad_norm": 0.34920212626457214, + "learning_rate": 8.665211965405422e-06, + "loss": 0.3584, + "step": 3749 + }, + { + "epoch": 0.943158953722334, + "grad_norm": 0.34862497448921204, + "learning_rate": 8.664216532376563e-06, + "loss": 0.3585, + "step": 3750 + }, + { + "epoch": 0.94341046277666, + "grad_norm": 0.35228458046913147, + "learning_rate": 8.663220785530485e-06, + "loss": 0.3725, + "step": 3751 + }, + { + "epoch": 0.9436619718309859, + "grad_norm": 0.3779107928276062, + "learning_rate": 8.662224724952459e-06, + "loss": 0.3687, + "step": 3752 + }, + { + "epoch": 0.9439134808853119, + "grad_norm": 0.36739468574523926, + "learning_rate": 8.661228350727798e-06, + "loss": 0.3429, + "step": 3753 + }, + { + "epoch": 0.9441649899396378, + "grad_norm": 0.381713330745697, + "learning_rate": 8.660231662941834e-06, + "loss": 0.3805, + "step": 3754 + }, + { + "epoch": 0.9444164989939637, + "grad_norm": 0.35245978832244873, + "learning_rate": 8.659234661679926e-06, + "loss": 0.3795, + "step": 3755 + }, + { + "epoch": 0.9446680080482898, + "grad_norm": 0.3956359028816223, + "learning_rate": 8.658237347027461e-06, + "loss": 0.3569, + "step": 3756 + }, + { + "epoch": 0.9449195171026157, + "grad_norm": 0.3203680217266083, + "learning_rate": 8.657239719069854e-06, + "loss": 0.386, + "step": 3757 + }, + { + "epoch": 0.9451710261569416, + "grad_norm": 0.3383410573005676, + "learning_rate": 8.656241777892544e-06, + "loss": 0.3816, + "step": 3758 + }, + { + "epoch": 0.9454225352112676, + "grad_norm": 0.3426474928855896, + "learning_rate": 8.655243523580998e-06, + "loss": 0.3848, + "step": 3759 + }, + { + "epoch": 0.9456740442655935, + "grad_norm": 0.3572126030921936, + "learning_rate": 8.654244956220713e-06, + "loss": 0.3666, + "step": 3760 + }, + { + "epoch": 0.9459255533199196, + "grad_norm": 0.3556264340877533, + "learning_rate": 8.653246075897208e-06, + "loss": 0.4032, + "step": 3761 + }, + { + "epoch": 0.9461770623742455, + "grad_norm": 0.3631819486618042, + "learning_rate": 8.652246882696032e-06, + "loss": 0.3807, + "step": 3762 + }, + { + "epoch": 0.9464285714285714, + "grad_norm": 0.36357030272483826, + "learning_rate": 8.651247376702756e-06, + "loss": 0.3523, + "step": 3763 + }, + { + "epoch": 0.9466800804828974, + "grad_norm": 0.34328320622444153, + "learning_rate": 8.650247558002987e-06, + "loss": 0.3881, + "step": 3764 + }, + { + "epoch": 0.9469315895372233, + "grad_norm": 0.349572092294693, + "learning_rate": 8.64924742668235e-06, + "loss": 0.3589, + "step": 3765 + }, + { + "epoch": 0.9471830985915493, + "grad_norm": 0.3555934429168701, + "learning_rate": 8.6482469828265e-06, + "loss": 0.3848, + "step": 3766 + }, + { + "epoch": 0.9474346076458753, + "grad_norm": 0.32735902070999146, + "learning_rate": 8.64724622652112e-06, + "loss": 0.3698, + "step": 3767 + }, + { + "epoch": 0.9476861167002012, + "grad_norm": 0.35206887125968933, + "learning_rate": 8.646245157851918e-06, + "loss": 0.3755, + "step": 3768 + }, + { + "epoch": 0.9479376257545271, + "grad_norm": 0.34176838397979736, + "learning_rate": 8.645243776904629e-06, + "loss": 0.3976, + "step": 3769 + }, + { + "epoch": 0.9481891348088531, + "grad_norm": 0.3586902320384979, + "learning_rate": 8.644242083765014e-06, + "loss": 0.3693, + "step": 3770 + }, + { + "epoch": 0.9484406438631791, + "grad_norm": 0.33086657524108887, + "learning_rate": 8.643240078518865e-06, + "loss": 0.3722, + "step": 3771 + }, + { + "epoch": 0.9486921529175051, + "grad_norm": 0.384689062833786, + "learning_rate": 8.642237761251992e-06, + "loss": 0.3484, + "step": 3772 + }, + { + "epoch": 0.948943661971831, + "grad_norm": 0.3711892366409302, + "learning_rate": 8.641235132050243e-06, + "loss": 0.3786, + "step": 3773 + }, + { + "epoch": 0.9491951710261569, + "grad_norm": 0.325150728225708, + "learning_rate": 8.640232190999484e-06, + "loss": 0.4007, + "step": 3774 + }, + { + "epoch": 0.9494466800804829, + "grad_norm": 0.3467983603477478, + "learning_rate": 8.63922893818561e-06, + "loss": 0.367, + "step": 3775 + }, + { + "epoch": 0.9496981891348089, + "grad_norm": 0.40407732129096985, + "learning_rate": 8.638225373694546e-06, + "loss": 0.374, + "step": 3776 + }, + { + "epoch": 0.9499496981891348, + "grad_norm": 0.3339010775089264, + "learning_rate": 8.637221497612238e-06, + "loss": 0.3786, + "step": 3777 + }, + { + "epoch": 0.9502012072434608, + "grad_norm": 0.33294668793678284, + "learning_rate": 8.636217310024664e-06, + "loss": 0.4051, + "step": 3778 + }, + { + "epoch": 0.9504527162977867, + "grad_norm": 0.32620373368263245, + "learning_rate": 8.635212811017826e-06, + "loss": 0.3619, + "step": 3779 + }, + { + "epoch": 0.9507042253521126, + "grad_norm": 0.3283163011074066, + "learning_rate": 8.634208000677751e-06, + "loss": 0.3663, + "step": 3780 + }, + { + "epoch": 0.9509557344064387, + "grad_norm": 0.3436046838760376, + "learning_rate": 8.633202879090496e-06, + "loss": 0.3836, + "step": 3781 + }, + { + "epoch": 0.9512072434607646, + "grad_norm": 0.3591076731681824, + "learning_rate": 8.632197446342145e-06, + "loss": 0.3852, + "step": 3782 + }, + { + "epoch": 0.9514587525150905, + "grad_norm": 0.3302284777164459, + "learning_rate": 8.631191702518806e-06, + "loss": 0.4113, + "step": 3783 + }, + { + "epoch": 0.9517102615694165, + "grad_norm": 0.3252635896205902, + "learning_rate": 8.630185647706614e-06, + "loss": 0.3768, + "step": 3784 + }, + { + "epoch": 0.9519617706237424, + "grad_norm": 0.3460747003555298, + "learning_rate": 8.629179281991732e-06, + "loss": 0.4131, + "step": 3785 + }, + { + "epoch": 0.9522132796780685, + "grad_norm": 0.3621247112751007, + "learning_rate": 8.628172605460347e-06, + "loss": 0.393, + "step": 3786 + }, + { + "epoch": 0.9524647887323944, + "grad_norm": 0.3651982843875885, + "learning_rate": 8.627165618198676e-06, + "loss": 0.3871, + "step": 3787 + }, + { + "epoch": 0.9527162977867203, + "grad_norm": 0.3419380784034729, + "learning_rate": 8.626158320292963e-06, + "loss": 0.3815, + "step": 3788 + }, + { + "epoch": 0.9529678068410463, + "grad_norm": 0.3466692268848419, + "learning_rate": 8.625150711829475e-06, + "loss": 0.3587, + "step": 3789 + }, + { + "epoch": 0.9532193158953722, + "grad_norm": 0.3754306435585022, + "learning_rate": 8.624142792894505e-06, + "loss": 0.4, + "step": 3790 + }, + { + "epoch": 0.9534708249496981, + "grad_norm": 0.348619669675827, + "learning_rate": 8.62313456357438e-06, + "loss": 0.3605, + "step": 3791 + }, + { + "epoch": 0.9537223340040242, + "grad_norm": 0.34513843059539795, + "learning_rate": 8.622126023955446e-06, + "loss": 0.3467, + "step": 3792 + }, + { + "epoch": 0.9539738430583501, + "grad_norm": 0.3718356788158417, + "learning_rate": 8.621117174124076e-06, + "loss": 0.379, + "step": 3793 + }, + { + "epoch": 0.954225352112676, + "grad_norm": 0.38062798976898193, + "learning_rate": 8.620108014166674e-06, + "loss": 0.3619, + "step": 3794 + }, + { + "epoch": 0.954476861167002, + "grad_norm": 0.3671969473361969, + "learning_rate": 8.619098544169671e-06, + "loss": 0.3721, + "step": 3795 + }, + { + "epoch": 0.954728370221328, + "grad_norm": 0.40202799439430237, + "learning_rate": 8.618088764219514e-06, + "loss": 0.3779, + "step": 3796 + }, + { + "epoch": 0.954979879275654, + "grad_norm": 0.36368417739868164, + "learning_rate": 8.617078674402692e-06, + "loss": 0.3934, + "step": 3797 + }, + { + "epoch": 0.9552313883299799, + "grad_norm": 0.42696356773376465, + "learning_rate": 8.616068274805709e-06, + "loss": 0.3764, + "step": 3798 + }, + { + "epoch": 0.9554828973843058, + "grad_norm": 0.3894241750240326, + "learning_rate": 8.615057565515102e-06, + "loss": 0.3658, + "step": 3799 + }, + { + "epoch": 0.9557344064386318, + "grad_norm": 0.38357433676719666, + "learning_rate": 8.614046546617427e-06, + "loss": 0.3971, + "step": 3800 + }, + { + "epoch": 0.9559859154929577, + "grad_norm": 0.3617189824581146, + "learning_rate": 8.613035218199276e-06, + "loss": 0.3809, + "step": 3801 + }, + { + "epoch": 0.9562374245472837, + "grad_norm": 0.3896433711051941, + "learning_rate": 8.612023580347264e-06, + "loss": 0.3586, + "step": 3802 + }, + { + "epoch": 0.9564889336016097, + "grad_norm": 0.42416492104530334, + "learning_rate": 8.611011633148027e-06, + "loss": 0.4036, + "step": 3803 + }, + { + "epoch": 0.9567404426559356, + "grad_norm": 0.35320156812667847, + "learning_rate": 8.609999376688235e-06, + "loss": 0.3765, + "step": 3804 + }, + { + "epoch": 0.9569919517102615, + "grad_norm": 0.37311264872550964, + "learning_rate": 8.60898681105458e-06, + "loss": 0.3923, + "step": 3805 + }, + { + "epoch": 0.9572434607645876, + "grad_norm": 0.3776063621044159, + "learning_rate": 8.607973936333782e-06, + "loss": 0.3784, + "step": 3806 + }, + { + "epoch": 0.9574949698189135, + "grad_norm": 0.34867435693740845, + "learning_rate": 8.606960752612587e-06, + "loss": 0.3768, + "step": 3807 + }, + { + "epoch": 0.9577464788732394, + "grad_norm": 0.3809923529624939, + "learning_rate": 8.60594725997777e-06, + "loss": 0.3824, + "step": 3808 + }, + { + "epoch": 0.9579979879275654, + "grad_norm": 0.352938711643219, + "learning_rate": 8.604933458516129e-06, + "loss": 0.403, + "step": 3809 + }, + { + "epoch": 0.9582494969818913, + "grad_norm": 0.3831866681575775, + "learning_rate": 8.603919348314487e-06, + "loss": 0.37, + "step": 3810 + }, + { + "epoch": 0.9585010060362174, + "grad_norm": 0.3907982110977173, + "learning_rate": 8.602904929459702e-06, + "loss": 0.3907, + "step": 3811 + }, + { + "epoch": 0.9587525150905433, + "grad_norm": 0.3331465423107147, + "learning_rate": 8.601890202038648e-06, + "loss": 0.3946, + "step": 3812 + }, + { + "epoch": 0.9590040241448692, + "grad_norm": 0.3363270163536072, + "learning_rate": 8.600875166138232e-06, + "loss": 0.375, + "step": 3813 + }, + { + "epoch": 0.9592555331991952, + "grad_norm": 0.34430885314941406, + "learning_rate": 8.599859821845386e-06, + "loss": 0.3892, + "step": 3814 + }, + { + "epoch": 0.9595070422535211, + "grad_norm": 0.35445672273635864, + "learning_rate": 8.598844169247064e-06, + "loss": 0.3733, + "step": 3815 + }, + { + "epoch": 0.959758551307847, + "grad_norm": 0.3446792960166931, + "learning_rate": 8.597828208430257e-06, + "loss": 0.3627, + "step": 3816 + }, + { + "epoch": 0.9600100603621731, + "grad_norm": 0.39067867398262024, + "learning_rate": 8.596811939481971e-06, + "loss": 0.3893, + "step": 3817 + }, + { + "epoch": 0.960261569416499, + "grad_norm": 0.373501181602478, + "learning_rate": 8.59579536248924e-06, + "loss": 0.3754, + "step": 3818 + }, + { + "epoch": 0.9605130784708249, + "grad_norm": 0.33692559599876404, + "learning_rate": 8.594778477539136e-06, + "loss": 0.3615, + "step": 3819 + }, + { + "epoch": 0.9607645875251509, + "grad_norm": 0.38633298873901367, + "learning_rate": 8.593761284718742e-06, + "loss": 0.3761, + "step": 3820 + }, + { + "epoch": 0.9610160965794768, + "grad_norm": 0.37641310691833496, + "learning_rate": 8.592743784115178e-06, + "loss": 0.3895, + "step": 3821 + }, + { + "epoch": 0.9612676056338029, + "grad_norm": 0.34640181064605713, + "learning_rate": 8.591725975815584e-06, + "loss": 0.3954, + "step": 3822 + }, + { + "epoch": 0.9615191146881288, + "grad_norm": 0.3677440583705902, + "learning_rate": 8.59070785990713e-06, + "loss": 0.3547, + "step": 3823 + }, + { + "epoch": 0.9617706237424547, + "grad_norm": 0.3272249102592468, + "learning_rate": 8.589689436477011e-06, + "loss": 0.3579, + "step": 3824 + }, + { + "epoch": 0.9620221327967807, + "grad_norm": 0.3785829544067383, + "learning_rate": 8.58867070561245e-06, + "loss": 0.3946, + "step": 3825 + }, + { + "epoch": 0.9622736418511066, + "grad_norm": 0.36952534317970276, + "learning_rate": 8.587651667400692e-06, + "loss": 0.3967, + "step": 3826 + }, + { + "epoch": 0.9625251509054326, + "grad_norm": 0.3846037983894348, + "learning_rate": 8.586632321929013e-06, + "loss": 0.3898, + "step": 3827 + }, + { + "epoch": 0.9627766599597586, + "grad_norm": 0.38403981924057007, + "learning_rate": 8.585612669284715e-06, + "loss": 0.3846, + "step": 3828 + }, + { + "epoch": 0.9630281690140845, + "grad_norm": 0.3689204156398773, + "learning_rate": 8.584592709555125e-06, + "loss": 0.3439, + "step": 3829 + }, + { + "epoch": 0.9632796780684104, + "grad_norm": 0.34798160195350647, + "learning_rate": 8.58357244282759e-06, + "loss": 0.372, + "step": 3830 + }, + { + "epoch": 0.9635311871227364, + "grad_norm": 0.4079124629497528, + "learning_rate": 8.582551869189497e-06, + "loss": 0.3902, + "step": 3831 + }, + { + "epoch": 0.9637826961770624, + "grad_norm": 0.4098033010959625, + "learning_rate": 8.581530988728249e-06, + "loss": 0.3796, + "step": 3832 + }, + { + "epoch": 0.9640342052313883, + "grad_norm": 0.3369835615158081, + "learning_rate": 8.580509801531276e-06, + "loss": 0.3445, + "step": 3833 + }, + { + "epoch": 0.9642857142857143, + "grad_norm": 0.3969493806362152, + "learning_rate": 8.57948830768604e-06, + "loss": 0.3782, + "step": 3834 + }, + { + "epoch": 0.9645372233400402, + "grad_norm": 0.4460330903530121, + "learning_rate": 8.57846650728002e-06, + "loss": 0.3792, + "step": 3835 + }, + { + "epoch": 0.9647887323943662, + "grad_norm": 0.3550911843776703, + "learning_rate": 8.577444400400733e-06, + "loss": 0.3809, + "step": 3836 + }, + { + "epoch": 0.9650402414486922, + "grad_norm": 0.4082079827785492, + "learning_rate": 8.576421987135716e-06, + "loss": 0.4033, + "step": 3837 + }, + { + "epoch": 0.9652917505030181, + "grad_norm": 0.40458518266677856, + "learning_rate": 8.575399267572527e-06, + "loss": 0.3572, + "step": 3838 + }, + { + "epoch": 0.9655432595573441, + "grad_norm": 0.3640364110469818, + "learning_rate": 8.574376241798758e-06, + "loss": 0.3959, + "step": 3839 + }, + { + "epoch": 0.96579476861167, + "grad_norm": 0.3785410523414612, + "learning_rate": 8.573352909902027e-06, + "loss": 0.3834, + "step": 3840 + }, + { + "epoch": 0.9660462776659959, + "grad_norm": 0.35065385699272156, + "learning_rate": 8.572329271969972e-06, + "loss": 0.3953, + "step": 3841 + }, + { + "epoch": 0.966297786720322, + "grad_norm": 0.3185063600540161, + "learning_rate": 8.571305328090264e-06, + "loss": 0.3784, + "step": 3842 + }, + { + "epoch": 0.9665492957746479, + "grad_norm": 0.3544987142086029, + "learning_rate": 8.570281078350598e-06, + "loss": 0.3713, + "step": 3843 + }, + { + "epoch": 0.9668008048289738, + "grad_norm": 0.3804239332675934, + "learning_rate": 8.569256522838692e-06, + "loss": 0.3862, + "step": 3844 + }, + { + "epoch": 0.9670523138832998, + "grad_norm": 0.32411083579063416, + "learning_rate": 8.568231661642294e-06, + "loss": 0.364, + "step": 3845 + }, + { + "epoch": 0.9673038229376257, + "grad_norm": 0.3537362515926361, + "learning_rate": 8.567206494849178e-06, + "loss": 0.3677, + "step": 3846 + }, + { + "epoch": 0.9675553319919518, + "grad_norm": 0.3677324056625366, + "learning_rate": 8.56618102254714e-06, + "loss": 0.4157, + "step": 3847 + }, + { + "epoch": 0.9678068410462777, + "grad_norm": 0.35382696986198425, + "learning_rate": 8.56515524482401e-06, + "loss": 0.3653, + "step": 3848 + }, + { + "epoch": 0.9680583501006036, + "grad_norm": 0.35097870230674744, + "learning_rate": 8.564129161767636e-06, + "loss": 0.3956, + "step": 3849 + }, + { + "epoch": 0.9683098591549296, + "grad_norm": 0.35212019085884094, + "learning_rate": 8.563102773465894e-06, + "loss": 0.3872, + "step": 3850 + }, + { + "epoch": 0.9685613682092555, + "grad_norm": 0.36232250928878784, + "learning_rate": 8.562076080006693e-06, + "loss": 0.3922, + "step": 3851 + }, + { + "epoch": 0.9688128772635815, + "grad_norm": 0.33010706305503845, + "learning_rate": 8.561049081477958e-06, + "loss": 0.3726, + "step": 3852 + }, + { + "epoch": 0.9690643863179075, + "grad_norm": 0.3573167026042938, + "learning_rate": 8.56002177796765e-06, + "loss": 0.3629, + "step": 3853 + }, + { + "epoch": 0.9693158953722334, + "grad_norm": 0.372215211391449, + "learning_rate": 8.558994169563745e-06, + "loss": 0.3706, + "step": 3854 + }, + { + "epoch": 0.9695674044265593, + "grad_norm": 0.33711981773376465, + "learning_rate": 8.557966256354256e-06, + "loss": 0.3661, + "step": 3855 + }, + { + "epoch": 0.9698189134808853, + "grad_norm": 0.37138739228248596, + "learning_rate": 8.556938038427217e-06, + "loss": 0.3524, + "step": 3856 + }, + { + "epoch": 0.9700704225352113, + "grad_norm": 0.34503763914108276, + "learning_rate": 8.555909515870683e-06, + "loss": 0.3523, + "step": 3857 + }, + { + "epoch": 0.9703219315895373, + "grad_norm": 0.3223390281200409, + "learning_rate": 8.55488068877275e-06, + "loss": 0.3686, + "step": 3858 + }, + { + "epoch": 0.9705734406438632, + "grad_norm": 0.3484128713607788, + "learning_rate": 8.553851557221521e-06, + "loss": 0.3785, + "step": 3859 + }, + { + "epoch": 0.9708249496981891, + "grad_norm": 0.36130252480506897, + "learning_rate": 8.552822121305139e-06, + "loss": 0.408, + "step": 3860 + }, + { + "epoch": 0.9710764587525151, + "grad_norm": 0.3652491569519043, + "learning_rate": 8.551792381111771e-06, + "loss": 0.3706, + "step": 3861 + }, + { + "epoch": 0.971327967806841, + "grad_norm": 0.3538327217102051, + "learning_rate": 8.550762336729605e-06, + "loss": 0.3522, + "step": 3862 + }, + { + "epoch": 0.971579476861167, + "grad_norm": 0.35552993416786194, + "learning_rate": 8.549731988246858e-06, + "loss": 0.3924, + "step": 3863 + }, + { + "epoch": 0.971830985915493, + "grad_norm": 0.3610166013240814, + "learning_rate": 8.548701335751774e-06, + "loss": 0.3665, + "step": 3864 + }, + { + "epoch": 0.9720824949698189, + "grad_norm": 0.368213951587677, + "learning_rate": 8.54767037933262e-06, + "loss": 0.3747, + "step": 3865 + }, + { + "epoch": 0.9723340040241448, + "grad_norm": 0.3184708058834076, + "learning_rate": 8.546639119077693e-06, + "loss": 0.3717, + "step": 3866 + }, + { + "epoch": 0.9725855130784709, + "grad_norm": 0.35572683811187744, + "learning_rate": 8.545607555075313e-06, + "loss": 0.3767, + "step": 3867 + }, + { + "epoch": 0.9728370221327968, + "grad_norm": 0.33128035068511963, + "learning_rate": 8.544575687413826e-06, + "loss": 0.3522, + "step": 3868 + }, + { + "epoch": 0.9730885311871227, + "grad_norm": 0.3530205488204956, + "learning_rate": 8.543543516181607e-06, + "loss": 0.409, + "step": 3869 + }, + { + "epoch": 0.9733400402414487, + "grad_norm": 0.31648722290992737, + "learning_rate": 8.542511041467054e-06, + "loss": 0.3962, + "step": 3870 + }, + { + "epoch": 0.9735915492957746, + "grad_norm": 0.33272168040275574, + "learning_rate": 8.541478263358594e-06, + "loss": 0.3799, + "step": 3871 + }, + { + "epoch": 0.9738430583501007, + "grad_norm": 0.3218432366847992, + "learning_rate": 8.540445181944673e-06, + "loss": 0.3903, + "step": 3872 + }, + { + "epoch": 0.9740945674044266, + "grad_norm": 0.33532074093818665, + "learning_rate": 8.539411797313772e-06, + "loss": 0.3831, + "step": 3873 + }, + { + "epoch": 0.9743460764587525, + "grad_norm": 0.336195170879364, + "learning_rate": 8.538378109554395e-06, + "loss": 0.3744, + "step": 3874 + }, + { + "epoch": 0.9745975855130785, + "grad_norm": 0.364101380109787, + "learning_rate": 8.537344118755067e-06, + "loss": 0.4106, + "step": 3875 + }, + { + "epoch": 0.9748490945674044, + "grad_norm": 0.3507227301597595, + "learning_rate": 8.536309825004346e-06, + "loss": 0.3932, + "step": 3876 + }, + { + "epoch": 0.9751006036217303, + "grad_norm": 0.368144154548645, + "learning_rate": 8.53527522839081e-06, + "loss": 0.3728, + "step": 3877 + }, + { + "epoch": 0.9753521126760564, + "grad_norm": 0.3607654273509979, + "learning_rate": 8.53424032900307e-06, + "loss": 0.379, + "step": 3878 + }, + { + "epoch": 0.9756036217303823, + "grad_norm": 0.3326222598552704, + "learning_rate": 8.533205126929754e-06, + "loss": 0.3651, + "step": 3879 + }, + { + "epoch": 0.9758551307847082, + "grad_norm": 0.3740299344062805, + "learning_rate": 8.532169622259524e-06, + "loss": 0.3597, + "step": 3880 + }, + { + "epoch": 0.9761066398390342, + "grad_norm": 0.3337053060531616, + "learning_rate": 8.531133815081061e-06, + "loss": 0.4037, + "step": 3881 + }, + { + "epoch": 0.9763581488933601, + "grad_norm": 0.35001134872436523, + "learning_rate": 8.530097705483078e-06, + "loss": 0.3737, + "step": 3882 + }, + { + "epoch": 0.9766096579476862, + "grad_norm": 0.3661789000034332, + "learning_rate": 8.52906129355431e-06, + "loss": 0.3587, + "step": 3883 + }, + { + "epoch": 0.9768611670020121, + "grad_norm": 0.33241093158721924, + "learning_rate": 8.528024579383522e-06, + "loss": 0.377, + "step": 3884 + }, + { + "epoch": 0.977112676056338, + "grad_norm": 0.3616609573364258, + "learning_rate": 8.5269875630595e-06, + "loss": 0.3804, + "step": 3885 + }, + { + "epoch": 0.977364185110664, + "grad_norm": 0.3413762152194977, + "learning_rate": 8.525950244671056e-06, + "loss": 0.3983, + "step": 3886 + }, + { + "epoch": 0.97761569416499, + "grad_norm": 0.41827529668807983, + "learning_rate": 8.524912624307033e-06, + "loss": 0.3762, + "step": 3887 + }, + { + "epoch": 0.9778672032193159, + "grad_norm": 0.36674749851226807, + "learning_rate": 8.523874702056296e-06, + "loss": 0.37, + "step": 3888 + }, + { + "epoch": 0.9781187122736419, + "grad_norm": 0.3255644142627716, + "learning_rate": 8.522836478007734e-06, + "loss": 0.3585, + "step": 3889 + }, + { + "epoch": 0.9783702213279678, + "grad_norm": 0.3610647916793823, + "learning_rate": 8.521797952250269e-06, + "loss": 0.3807, + "step": 3890 + }, + { + "epoch": 0.9786217303822937, + "grad_norm": 0.37085309624671936, + "learning_rate": 8.52075912487284e-06, + "loss": 0.3336, + "step": 3891 + }, + { + "epoch": 0.9788732394366197, + "grad_norm": 0.3353044390678406, + "learning_rate": 8.519719995964419e-06, + "loss": 0.3723, + "step": 3892 + }, + { + "epoch": 0.9791247484909457, + "grad_norm": 0.3177686631679535, + "learning_rate": 8.518680565614e-06, + "loss": 0.3736, + "step": 3893 + }, + { + "epoch": 0.9793762575452716, + "grad_norm": 0.36374950408935547, + "learning_rate": 8.517640833910602e-06, + "loss": 0.4041, + "step": 3894 + }, + { + "epoch": 0.9796277665995976, + "grad_norm": 0.3766506314277649, + "learning_rate": 8.516600800943273e-06, + "loss": 0.3796, + "step": 3895 + }, + { + "epoch": 0.9798792756539235, + "grad_norm": 0.3212386667728424, + "learning_rate": 8.515560466801085e-06, + "loss": 0.3817, + "step": 3896 + }, + { + "epoch": 0.9801307847082495, + "grad_norm": 0.35744473338127136, + "learning_rate": 8.514519831573137e-06, + "loss": 0.395, + "step": 3897 + }, + { + "epoch": 0.9803822937625755, + "grad_norm": 0.3569160997867584, + "learning_rate": 8.513478895348552e-06, + "loss": 0.3832, + "step": 3898 + }, + { + "epoch": 0.9806338028169014, + "grad_norm": 0.3330937922000885, + "learning_rate": 8.512437658216479e-06, + "loss": 0.4121, + "step": 3899 + }, + { + "epoch": 0.9808853118712274, + "grad_norm": 0.3199692368507385, + "learning_rate": 8.511396120266095e-06, + "loss": 0.3629, + "step": 3900 + }, + { + "epoch": 0.9811368209255533, + "grad_norm": 0.35635682940483093, + "learning_rate": 8.510354281586601e-06, + "loss": 0.3798, + "step": 3901 + }, + { + "epoch": 0.9813883299798792, + "grad_norm": 0.3268703520298004, + "learning_rate": 8.509312142267223e-06, + "loss": 0.378, + "step": 3902 + }, + { + "epoch": 0.9816398390342053, + "grad_norm": 0.3214782774448395, + "learning_rate": 8.508269702397214e-06, + "loss": 0.3788, + "step": 3903 + }, + { + "epoch": 0.9818913480885312, + "grad_norm": 0.3744148015975952, + "learning_rate": 8.507226962065852e-06, + "loss": 0.39, + "step": 3904 + }, + { + "epoch": 0.9821428571428571, + "grad_norm": 0.33143913745880127, + "learning_rate": 8.506183921362443e-06, + "loss": 0.3522, + "step": 3905 + }, + { + "epoch": 0.9823943661971831, + "grad_norm": 0.3448927104473114, + "learning_rate": 8.505140580376317e-06, + "loss": 0.3731, + "step": 3906 + }, + { + "epoch": 0.982645875251509, + "grad_norm": 0.37213438749313354, + "learning_rate": 8.504096939196826e-06, + "loss": 0.3569, + "step": 3907 + }, + { + "epoch": 0.9828973843058351, + "grad_norm": 0.3362247943878174, + "learning_rate": 8.503052997913354e-06, + "loss": 0.3844, + "step": 3908 + }, + { + "epoch": 0.983148893360161, + "grad_norm": 0.35339227318763733, + "learning_rate": 8.50200875661531e-06, + "loss": 0.3926, + "step": 3909 + }, + { + "epoch": 0.9834004024144869, + "grad_norm": 0.3829168975353241, + "learning_rate": 8.500964215392122e-06, + "loss": 0.3594, + "step": 3910 + }, + { + "epoch": 0.9836519114688129, + "grad_norm": 0.338537335395813, + "learning_rate": 8.499919374333251e-06, + "loss": 0.3922, + "step": 3911 + }, + { + "epoch": 0.9839034205231388, + "grad_norm": 0.3299337923526764, + "learning_rate": 8.498874233528183e-06, + "loss": 0.3835, + "step": 3912 + }, + { + "epoch": 0.9841549295774648, + "grad_norm": 0.3599414527416229, + "learning_rate": 8.497828793066425e-06, + "loss": 0.3518, + "step": 3913 + }, + { + "epoch": 0.9844064386317908, + "grad_norm": 0.3576641380786896, + "learning_rate": 8.496783053037512e-06, + "loss": 0.3741, + "step": 3914 + }, + { + "epoch": 0.9846579476861167, + "grad_norm": 0.3295727074146271, + "learning_rate": 8.495737013531008e-06, + "loss": 0.3744, + "step": 3915 + }, + { + "epoch": 0.9849094567404426, + "grad_norm": 0.3430532217025757, + "learning_rate": 8.494690674636497e-06, + "loss": 0.3722, + "step": 3916 + }, + { + "epoch": 0.9851609657947686, + "grad_norm": 0.3892742395401001, + "learning_rate": 8.493644036443592e-06, + "loss": 0.3736, + "step": 3917 + }, + { + "epoch": 0.9854124748490946, + "grad_norm": 0.3252136707305908, + "learning_rate": 8.492597099041932e-06, + "loss": 0.3948, + "step": 3918 + }, + { + "epoch": 0.9856639839034205, + "grad_norm": 0.3606947660446167, + "learning_rate": 8.49154986252118e-06, + "loss": 0.3537, + "step": 3919 + }, + { + "epoch": 0.9859154929577465, + "grad_norm": 0.34790658950805664, + "learning_rate": 8.490502326971026e-06, + "loss": 0.3874, + "step": 3920 + }, + { + "epoch": 0.9861670020120724, + "grad_norm": 0.34207773208618164, + "learning_rate": 8.489454492481184e-06, + "loss": 0.4054, + "step": 3921 + }, + { + "epoch": 0.9864185110663984, + "grad_norm": 0.32859066128730774, + "learning_rate": 8.488406359141395e-06, + "loss": 0.3512, + "step": 3922 + }, + { + "epoch": 0.9866700201207244, + "grad_norm": 0.42876073718070984, + "learning_rate": 8.487357927041425e-06, + "loss": 0.3749, + "step": 3923 + }, + { + "epoch": 0.9869215291750503, + "grad_norm": 0.3597780764102936, + "learning_rate": 8.486309196271063e-06, + "loss": 0.3671, + "step": 3924 + }, + { + "epoch": 0.9871730382293763, + "grad_norm": 0.3601861596107483, + "learning_rate": 8.485260166920131e-06, + "loss": 0.3696, + "step": 3925 + }, + { + "epoch": 0.9874245472837022, + "grad_norm": 0.450916588306427, + "learning_rate": 8.484210839078467e-06, + "loss": 0.3995, + "step": 3926 + }, + { + "epoch": 0.9876760563380281, + "grad_norm": 0.38571393489837646, + "learning_rate": 8.483161212835944e-06, + "loss": 0.3746, + "step": 3927 + }, + { + "epoch": 0.9879275653923542, + "grad_norm": 0.32381007075309753, + "learning_rate": 8.482111288282452e-06, + "loss": 0.3762, + "step": 3928 + }, + { + "epoch": 0.9881790744466801, + "grad_norm": 0.4630524516105652, + "learning_rate": 8.481061065507915e-06, + "loss": 0.3895, + "step": 3929 + }, + { + "epoch": 0.988430583501006, + "grad_norm": 0.3516670763492584, + "learning_rate": 8.480010544602274e-06, + "loss": 0.3775, + "step": 3930 + }, + { + "epoch": 0.988682092555332, + "grad_norm": 0.3213813006877899, + "learning_rate": 8.4789597256555e-06, + "loss": 0.3751, + "step": 3931 + }, + { + "epoch": 0.9889336016096579, + "grad_norm": 0.37605127692222595, + "learning_rate": 8.47790860875759e-06, + "loss": 0.3625, + "step": 3932 + }, + { + "epoch": 0.989185110663984, + "grad_norm": 0.3587568700313568, + "learning_rate": 8.476857193998564e-06, + "loss": 0.372, + "step": 3933 + }, + { + "epoch": 0.9894366197183099, + "grad_norm": 0.3404126763343811, + "learning_rate": 8.475805481468472e-06, + "loss": 0.3751, + "step": 3934 + }, + { + "epoch": 0.9896881287726358, + "grad_norm": 0.3651595711708069, + "learning_rate": 8.474753471257385e-06, + "loss": 0.3587, + "step": 3935 + }, + { + "epoch": 0.9899396378269618, + "grad_norm": 0.3151306211948395, + "learning_rate": 8.473701163455401e-06, + "loss": 0.3889, + "step": 3936 + }, + { + "epoch": 0.9901911468812877, + "grad_norm": 0.327453076839447, + "learning_rate": 8.472648558152646e-06, + "loss": 0.3706, + "step": 3937 + }, + { + "epoch": 0.9904426559356136, + "grad_norm": 0.3796255588531494, + "learning_rate": 8.471595655439263e-06, + "loss": 0.3789, + "step": 3938 + }, + { + "epoch": 0.9906941649899397, + "grad_norm": 0.3350921869277954, + "learning_rate": 8.470542455405432e-06, + "loss": 0.3938, + "step": 3939 + }, + { + "epoch": 0.9909456740442656, + "grad_norm": 0.3343561291694641, + "learning_rate": 8.469488958141352e-06, + "loss": 0.3646, + "step": 3940 + }, + { + "epoch": 0.9911971830985915, + "grad_norm": 0.35207462310791016, + "learning_rate": 8.468435163737248e-06, + "loss": 0.3782, + "step": 3941 + }, + { + "epoch": 0.9914486921529175, + "grad_norm": 0.32531505823135376, + "learning_rate": 8.46738107228337e-06, + "loss": 0.3897, + "step": 3942 + }, + { + "epoch": 0.9917002012072434, + "grad_norm": 0.367502897977829, + "learning_rate": 8.466326683869994e-06, + "loss": 0.3878, + "step": 3943 + }, + { + "epoch": 0.9919517102615694, + "grad_norm": 0.37151041626930237, + "learning_rate": 8.465271998587424e-06, + "loss": 0.3696, + "step": 3944 + }, + { + "epoch": 0.9922032193158954, + "grad_norm": 0.3209855556488037, + "learning_rate": 8.464217016525985e-06, + "loss": 0.3534, + "step": 3945 + }, + { + "epoch": 0.9924547283702213, + "grad_norm": 0.3292153477668762, + "learning_rate": 8.463161737776031e-06, + "loss": 0.3612, + "step": 3946 + }, + { + "epoch": 0.9927062374245473, + "grad_norm": 0.35022595524787903, + "learning_rate": 8.46210616242794e-06, + "loss": 0.3583, + "step": 3947 + }, + { + "epoch": 0.9929577464788732, + "grad_norm": 0.3790688216686249, + "learning_rate": 8.461050290572114e-06, + "loss": 0.3809, + "step": 3948 + }, + { + "epoch": 0.9932092555331992, + "grad_norm": 0.3780459463596344, + "learning_rate": 8.459994122298985e-06, + "loss": 0.3793, + "step": 3949 + }, + { + "epoch": 0.9934607645875252, + "grad_norm": 0.325181782245636, + "learning_rate": 8.458937657699004e-06, + "loss": 0.3724, + "step": 3950 + }, + { + "epoch": 0.9937122736418511, + "grad_norm": 0.35834598541259766, + "learning_rate": 8.457880896862651e-06, + "loss": 0.3791, + "step": 3951 + }, + { + "epoch": 0.993963782696177, + "grad_norm": 0.37170320749282837, + "learning_rate": 8.456823839880433e-06, + "loss": 0.3852, + "step": 3952 + }, + { + "epoch": 0.994215291750503, + "grad_norm": 0.3402329385280609, + "learning_rate": 8.455766486842878e-06, + "loss": 0.3763, + "step": 3953 + }, + { + "epoch": 0.994466800804829, + "grad_norm": 0.3130355179309845, + "learning_rate": 8.454708837840543e-06, + "loss": 0.3747, + "step": 3954 + }, + { + "epoch": 0.9947183098591549, + "grad_norm": 0.32022619247436523, + "learning_rate": 8.453650892964008e-06, + "loss": 0.3611, + "step": 3955 + }, + { + "epoch": 0.9949698189134809, + "grad_norm": 0.35048943758010864, + "learning_rate": 8.45259265230388e-06, + "loss": 0.3663, + "step": 3956 + }, + { + "epoch": 0.9952213279678068, + "grad_norm": 0.3296698033809662, + "learning_rate": 8.45153411595079e-06, + "loss": 0.3886, + "step": 3957 + }, + { + "epoch": 0.9954728370221329, + "grad_norm": 0.3470214605331421, + "learning_rate": 8.450475283995398e-06, + "loss": 0.411, + "step": 3958 + }, + { + "epoch": 0.9957243460764588, + "grad_norm": 0.3476480543613434, + "learning_rate": 8.449416156528383e-06, + "loss": 0.3892, + "step": 3959 + }, + { + "epoch": 0.9959758551307847, + "grad_norm": 0.3552456796169281, + "learning_rate": 8.448356733640453e-06, + "loss": 0.4037, + "step": 3960 + }, + { + "epoch": 0.9962273641851107, + "grad_norm": 0.38896414637565613, + "learning_rate": 8.447297015422342e-06, + "loss": 0.3919, + "step": 3961 + }, + { + "epoch": 0.9964788732394366, + "grad_norm": 0.3473275899887085, + "learning_rate": 8.446237001964808e-06, + "loss": 0.3809, + "step": 3962 + }, + { + "epoch": 0.9967303822937625, + "grad_norm": 0.3539365530014038, + "learning_rate": 8.445176693358634e-06, + "loss": 0.3513, + "step": 3963 + }, + { + "epoch": 0.9969818913480886, + "grad_norm": 0.39693740010261536, + "learning_rate": 8.444116089694631e-06, + "loss": 0.3734, + "step": 3964 + }, + { + "epoch": 0.9972334004024145, + "grad_norm": 0.3378576934337616, + "learning_rate": 8.443055191063629e-06, + "loss": 0.3644, + "step": 3965 + }, + { + "epoch": 0.9974849094567404, + "grad_norm": 0.3627392053604126, + "learning_rate": 8.44199399755649e-06, + "loss": 0.3903, + "step": 3966 + }, + { + "epoch": 0.9977364185110664, + "grad_norm": 0.38984745740890503, + "learning_rate": 8.440932509264099e-06, + "loss": 0.3932, + "step": 3967 + }, + { + "epoch": 0.9979879275653923, + "grad_norm": 0.34223487973213196, + "learning_rate": 8.439870726277364e-06, + "loss": 0.3599, + "step": 3968 + }, + { + "epoch": 0.9982394366197183, + "grad_norm": 0.33947303891181946, + "learning_rate": 8.438808648687223e-06, + "loss": 0.381, + "step": 3969 + }, + { + "epoch": 0.9984909456740443, + "grad_norm": 0.3354572057723999, + "learning_rate": 8.437746276584631e-06, + "loss": 0.3726, + "step": 3970 + }, + { + "epoch": 0.9987424547283702, + "grad_norm": 0.36442792415618896, + "learning_rate": 8.43668361006058e-06, + "loss": 0.3951, + "step": 3971 + }, + { + "epoch": 0.9989939637826962, + "grad_norm": 0.3597300946712494, + "learning_rate": 8.435620649206076e-06, + "loss": 0.3476, + "step": 3972 + }, + { + "epoch": 0.9992454728370221, + "grad_norm": 0.3294813632965088, + "learning_rate": 8.434557394112156e-06, + "loss": 0.3753, + "step": 3973 + }, + { + "epoch": 0.9994969818913481, + "grad_norm": 0.32441380620002747, + "learning_rate": 8.433493844869883e-06, + "loss": 0.3672, + "step": 3974 + }, + { + "epoch": 0.9997484909456741, + "grad_norm": 0.3753267526626587, + "learning_rate": 8.432430001570343e-06, + "loss": 0.377, + "step": 3975 + }, + { + "epoch": 1.0, + "grad_norm": 0.3612028658390045, + "learning_rate": 8.431365864304645e-06, + "loss": 0.3862, + "step": 3976 + }, + { + "epoch": 1.000251509054326, + "grad_norm": 0.3513164818286896, + "learning_rate": 8.430301433163927e-06, + "loss": 0.3648, + "step": 3977 + }, + { + "epoch": 1.0005030181086518, + "grad_norm": 0.3320719599723816, + "learning_rate": 8.42923670823935e-06, + "loss": 0.3366, + "step": 3978 + }, + { + "epoch": 1.0007545271629779, + "grad_norm": 0.3477346897125244, + "learning_rate": 8.428171689622105e-06, + "loss": 0.3598, + "step": 3979 + }, + { + "epoch": 1.0010060362173039, + "grad_norm": 0.3291724920272827, + "learning_rate": 8.4271063774034e-06, + "loss": 0.3622, + "step": 3980 + }, + { + "epoch": 1.0012575452716297, + "grad_norm": 0.3617793917655945, + "learning_rate": 8.426040771674475e-06, + "loss": 0.3601, + "step": 3981 + }, + { + "epoch": 1.0015090543259557, + "grad_norm": 0.33502376079559326, + "learning_rate": 8.42497487252659e-06, + "loss": 0.3577, + "step": 3982 + }, + { + "epoch": 1.0017605633802817, + "grad_norm": 0.35225436091423035, + "learning_rate": 8.423908680051035e-06, + "loss": 0.3543, + "step": 3983 + }, + { + "epoch": 1.0020120724346075, + "grad_norm": 0.3484959304332733, + "learning_rate": 8.42284219433912e-06, + "loss": 0.3501, + "step": 3984 + }, + { + "epoch": 1.0022635814889336, + "grad_norm": 0.348874568939209, + "learning_rate": 8.421775415482183e-06, + "loss": 0.3557, + "step": 3985 + }, + { + "epoch": 1.0025150905432596, + "grad_norm": 0.36964142322540283, + "learning_rate": 8.42070834357159e-06, + "loss": 0.3564, + "step": 3986 + }, + { + "epoch": 1.0027665995975854, + "grad_norm": 0.3703876733779907, + "learning_rate": 8.419640978698728e-06, + "loss": 0.3379, + "step": 3987 + }, + { + "epoch": 1.0030181086519114, + "grad_norm": 0.35077622532844543, + "learning_rate": 8.418573320955008e-06, + "loss": 0.3577, + "step": 3988 + }, + { + "epoch": 1.0032696177062375, + "grad_norm": 0.35275188088417053, + "learning_rate": 8.417505370431869e-06, + "loss": 0.3819, + "step": 3989 + }, + { + "epoch": 1.0035211267605635, + "grad_norm": 0.3845551311969757, + "learning_rate": 8.416437127220777e-06, + "loss": 0.3294, + "step": 3990 + }, + { + "epoch": 1.0037726358148893, + "grad_norm": 0.3270184397697449, + "learning_rate": 8.415368591413218e-06, + "loss": 0.3455, + "step": 3991 + }, + { + "epoch": 1.0040241448692153, + "grad_norm": 0.3642314374446869, + "learning_rate": 8.414299763100704e-06, + "loss": 0.3649, + "step": 3992 + }, + { + "epoch": 1.0042756539235413, + "grad_norm": 0.39639967679977417, + "learning_rate": 8.413230642374776e-06, + "loss": 0.3558, + "step": 3993 + }, + { + "epoch": 1.0045271629778671, + "grad_norm": 0.3734232485294342, + "learning_rate": 8.412161229326997e-06, + "loss": 0.3575, + "step": 3994 + }, + { + "epoch": 1.0047786720321932, + "grad_norm": 0.3293455243110657, + "learning_rate": 8.411091524048953e-06, + "loss": 0.369, + "step": 3995 + }, + { + "epoch": 1.0050301810865192, + "grad_norm": 0.3649202883243561, + "learning_rate": 8.410021526632262e-06, + "loss": 0.3557, + "step": 3996 + }, + { + "epoch": 1.005281690140845, + "grad_norm": 0.3437403440475464, + "learning_rate": 8.408951237168559e-06, + "loss": 0.3742, + "step": 3997 + }, + { + "epoch": 1.005533199195171, + "grad_norm": 0.34628748893737793, + "learning_rate": 8.40788065574951e-06, + "loss": 0.3851, + "step": 3998 + }, + { + "epoch": 1.005784708249497, + "grad_norm": 0.40169525146484375, + "learning_rate": 8.4068097824668e-06, + "loss": 0.3603, + "step": 3999 + }, + { + "epoch": 1.0060362173038229, + "grad_norm": 0.3196563422679901, + "learning_rate": 8.405738617412148e-06, + "loss": 0.3477, + "step": 4000 + }, + { + "epoch": 1.006287726358149, + "grad_norm": 0.3684934675693512, + "learning_rate": 8.404667160677289e-06, + "loss": 0.3876, + "step": 4001 + }, + { + "epoch": 1.006539235412475, + "grad_norm": 0.35352370142936707, + "learning_rate": 8.403595412353987e-06, + "loss": 0.3676, + "step": 4002 + }, + { + "epoch": 1.0067907444668007, + "grad_norm": 0.3421543836593628, + "learning_rate": 8.40252337253403e-06, + "loss": 0.3632, + "step": 4003 + }, + { + "epoch": 1.0070422535211268, + "grad_norm": 0.34123724699020386, + "learning_rate": 8.401451041309233e-06, + "loss": 0.3834, + "step": 4004 + }, + { + "epoch": 1.0072937625754528, + "grad_norm": 0.3148738741874695, + "learning_rate": 8.400378418771434e-06, + "loss": 0.3571, + "step": 4005 + }, + { + "epoch": 1.0075452716297786, + "grad_norm": 0.3324401378631592, + "learning_rate": 8.399305505012496e-06, + "loss": 0.3645, + "step": 4006 + }, + { + "epoch": 1.0077967806841046, + "grad_norm": 0.3198019564151764, + "learning_rate": 8.398232300124307e-06, + "loss": 0.3457, + "step": 4007 + }, + { + "epoch": 1.0080482897384306, + "grad_norm": 0.3236132264137268, + "learning_rate": 8.39715880419878e-06, + "loss": 0.3419, + "step": 4008 + }, + { + "epoch": 1.0082997987927564, + "grad_norm": 0.3504725396633148, + "learning_rate": 8.396085017327854e-06, + "loss": 0.3376, + "step": 4009 + }, + { + "epoch": 1.0085513078470825, + "grad_norm": 0.3179847002029419, + "learning_rate": 8.395010939603493e-06, + "loss": 0.3549, + "step": 4010 + }, + { + "epoch": 1.0088028169014085, + "grad_norm": 0.31894704699516296, + "learning_rate": 8.393936571117685e-06, + "loss": 0.3486, + "step": 4011 + }, + { + "epoch": 1.0090543259557343, + "grad_norm": 0.3283448815345764, + "learning_rate": 8.392861911962441e-06, + "loss": 0.3577, + "step": 4012 + }, + { + "epoch": 1.0093058350100603, + "grad_norm": 0.33308663964271545, + "learning_rate": 8.3917869622298e-06, + "loss": 0.3494, + "step": 4013 + }, + { + "epoch": 1.0095573440643864, + "grad_norm": 0.33545833826065063, + "learning_rate": 8.390711722011825e-06, + "loss": 0.3131, + "step": 4014 + }, + { + "epoch": 1.0098088531187124, + "grad_norm": 0.36120879650115967, + "learning_rate": 8.389636191400603e-06, + "loss": 0.3673, + "step": 4015 + }, + { + "epoch": 1.0100603621730382, + "grad_norm": 0.3518035411834717, + "learning_rate": 8.388560370488247e-06, + "loss": 0.3616, + "step": 4016 + }, + { + "epoch": 1.0103118712273642, + "grad_norm": 0.3019152879714966, + "learning_rate": 8.387484259366894e-06, + "loss": 0.3508, + "step": 4017 + }, + { + "epoch": 1.0105633802816902, + "grad_norm": 0.3448386788368225, + "learning_rate": 8.386407858128707e-06, + "loss": 0.3544, + "step": 4018 + }, + { + "epoch": 1.010814889336016, + "grad_norm": 0.32667118310928345, + "learning_rate": 8.38533116686587e-06, + "loss": 0.3429, + "step": 4019 + }, + { + "epoch": 1.011066398390342, + "grad_norm": 0.3479123115539551, + "learning_rate": 8.384254185670599e-06, + "loss": 0.3632, + "step": 4020 + }, + { + "epoch": 1.011317907444668, + "grad_norm": 0.3146592378616333, + "learning_rate": 8.383176914635127e-06, + "loss": 0.3615, + "step": 4021 + }, + { + "epoch": 1.011569416498994, + "grad_norm": 0.36215662956237793, + "learning_rate": 8.38209935385172e-06, + "loss": 0.3659, + "step": 4022 + }, + { + "epoch": 1.01182092555332, + "grad_norm": 0.3197457790374756, + "learning_rate": 8.381021503412659e-06, + "loss": 0.3269, + "step": 4023 + }, + { + "epoch": 1.012072434607646, + "grad_norm": 0.34255650639533997, + "learning_rate": 8.379943363410259e-06, + "loss": 0.3715, + "step": 4024 + }, + { + "epoch": 1.0123239436619718, + "grad_norm": 0.34382399916648865, + "learning_rate": 8.378864933936856e-06, + "loss": 0.3318, + "step": 4025 + }, + { + "epoch": 1.0125754527162978, + "grad_norm": 0.3288285434246063, + "learning_rate": 8.37778621508481e-06, + "loss": 0.3737, + "step": 4026 + }, + { + "epoch": 1.0128269617706238, + "grad_norm": 0.33756786584854126, + "learning_rate": 8.376707206946503e-06, + "loss": 0.3459, + "step": 4027 + }, + { + "epoch": 1.0130784708249496, + "grad_norm": 0.37219181656837463, + "learning_rate": 8.37562790961435e-06, + "loss": 0.3549, + "step": 4028 + }, + { + "epoch": 1.0133299798792756, + "grad_norm": 0.3869669735431671, + "learning_rate": 8.374548323180783e-06, + "loss": 0.3594, + "step": 4029 + }, + { + "epoch": 1.0135814889336017, + "grad_norm": 0.3502846360206604, + "learning_rate": 8.373468447738265e-06, + "loss": 0.3629, + "step": 4030 + }, + { + "epoch": 1.0138329979879275, + "grad_norm": 0.34061774611473083, + "learning_rate": 8.372388283379277e-06, + "loss": 0.3644, + "step": 4031 + }, + { + "epoch": 1.0140845070422535, + "grad_norm": 0.3984410762786865, + "learning_rate": 8.37130783019633e-06, + "loss": 0.3692, + "step": 4032 + }, + { + "epoch": 1.0143360160965795, + "grad_norm": 0.3561273515224457, + "learning_rate": 8.370227088281962e-06, + "loss": 0.3375, + "step": 4033 + }, + { + "epoch": 1.0145875251509053, + "grad_norm": 0.3067105710506439, + "learning_rate": 8.369146057728726e-06, + "loss": 0.3403, + "step": 4034 + }, + { + "epoch": 1.0148390342052314, + "grad_norm": 0.39280879497528076, + "learning_rate": 8.368064738629205e-06, + "loss": 0.3513, + "step": 4035 + }, + { + "epoch": 1.0150905432595574, + "grad_norm": 0.3620704114437103, + "learning_rate": 8.366983131076012e-06, + "loss": 0.3579, + "step": 4036 + }, + { + "epoch": 1.0153420523138832, + "grad_norm": 0.3495025038719177, + "learning_rate": 8.365901235161778e-06, + "loss": 0.3641, + "step": 4037 + }, + { + "epoch": 1.0155935613682092, + "grad_norm": 0.350424587726593, + "learning_rate": 8.36481905097916e-06, + "loss": 0.3567, + "step": 4038 + }, + { + "epoch": 1.0158450704225352, + "grad_norm": 0.31671619415283203, + "learning_rate": 8.363736578620838e-06, + "loss": 0.3795, + "step": 4039 + }, + { + "epoch": 1.0160965794768613, + "grad_norm": 0.3311213552951813, + "learning_rate": 8.362653818179524e-06, + "loss": 0.3523, + "step": 4040 + }, + { + "epoch": 1.016348088531187, + "grad_norm": 0.3370438814163208, + "learning_rate": 8.361570769747948e-06, + "loss": 0.3614, + "step": 4041 + }, + { + "epoch": 1.016599597585513, + "grad_norm": 0.33826106786727905, + "learning_rate": 8.360487433418863e-06, + "loss": 0.3289, + "step": 4042 + }, + { + "epoch": 1.0168511066398391, + "grad_norm": 0.37292927503585815, + "learning_rate": 8.359403809285054e-06, + "loss": 0.3549, + "step": 4043 + }, + { + "epoch": 1.017102615694165, + "grad_norm": 0.3336230516433716, + "learning_rate": 8.358319897439324e-06, + "loss": 0.3397, + "step": 4044 + }, + { + "epoch": 1.017354124748491, + "grad_norm": 0.3715561032295227, + "learning_rate": 8.357235697974506e-06, + "loss": 0.335, + "step": 4045 + }, + { + "epoch": 1.017605633802817, + "grad_norm": 0.3317098617553711, + "learning_rate": 8.356151210983451e-06, + "loss": 0.3557, + "step": 4046 + }, + { + "epoch": 1.0178571428571428, + "grad_norm": 0.3567894697189331, + "learning_rate": 8.355066436559042e-06, + "loss": 0.3431, + "step": 4047 + }, + { + "epoch": 1.0181086519114688, + "grad_norm": 0.31306058168411255, + "learning_rate": 8.353981374794184e-06, + "loss": 0.3529, + "step": 4048 + }, + { + "epoch": 1.0183601609657948, + "grad_norm": 0.3463604748249054, + "learning_rate": 8.3528960257818e-06, + "loss": 0.3559, + "step": 4049 + }, + { + "epoch": 1.0186116700201207, + "grad_norm": 0.36035671830177307, + "learning_rate": 8.35181038961485e-06, + "loss": 0.3697, + "step": 4050 + }, + { + "epoch": 1.0188631790744467, + "grad_norm": 0.31114959716796875, + "learning_rate": 8.350724466386309e-06, + "loss": 0.3163, + "step": 4051 + }, + { + "epoch": 1.0191146881287727, + "grad_norm": 0.33771616220474243, + "learning_rate": 8.349638256189178e-06, + "loss": 0.3651, + "step": 4052 + }, + { + "epoch": 1.0193661971830985, + "grad_norm": 0.38065779209136963, + "learning_rate": 8.348551759116485e-06, + "loss": 0.337, + "step": 4053 + }, + { + "epoch": 1.0196177062374245, + "grad_norm": 0.3252953290939331, + "learning_rate": 8.347464975261283e-06, + "loss": 0.3479, + "step": 4054 + }, + { + "epoch": 1.0198692152917506, + "grad_norm": 0.3428071439266205, + "learning_rate": 8.346377904716649e-06, + "loss": 0.3571, + "step": 4055 + }, + { + "epoch": 1.0201207243460764, + "grad_norm": 0.3688924312591553, + "learning_rate": 8.34529054757568e-06, + "loss": 0.3772, + "step": 4056 + }, + { + "epoch": 1.0203722334004024, + "grad_norm": 0.3667786121368408, + "learning_rate": 8.344202903931504e-06, + "loss": 0.359, + "step": 4057 + }, + { + "epoch": 1.0206237424547284, + "grad_norm": 0.3390733003616333, + "learning_rate": 8.343114973877273e-06, + "loss": 0.3513, + "step": 4058 + }, + { + "epoch": 1.0208752515090542, + "grad_norm": 0.34325966238975525, + "learning_rate": 8.342026757506156e-06, + "loss": 0.3834, + "step": 4059 + }, + { + "epoch": 1.0211267605633803, + "grad_norm": 0.37722912430763245, + "learning_rate": 8.340938254911358e-06, + "loss": 0.3481, + "step": 4060 + }, + { + "epoch": 1.0213782696177063, + "grad_norm": 0.33139869570732117, + "learning_rate": 8.339849466186096e-06, + "loss": 0.3473, + "step": 4061 + }, + { + "epoch": 1.021629778672032, + "grad_norm": 0.3419877886772156, + "learning_rate": 8.338760391423623e-06, + "loss": 0.3419, + "step": 4062 + }, + { + "epoch": 1.0218812877263581, + "grad_norm": 0.37569326162338257, + "learning_rate": 8.33767103071721e-06, + "loss": 0.3518, + "step": 4063 + }, + { + "epoch": 1.0221327967806841, + "grad_norm": 0.3753376603126526, + "learning_rate": 8.336581384160152e-06, + "loss": 0.3399, + "step": 4064 + }, + { + "epoch": 1.0223843058350102, + "grad_norm": 0.3511211574077606, + "learning_rate": 8.335491451845774e-06, + "loss": 0.3732, + "step": 4065 + }, + { + "epoch": 1.022635814889336, + "grad_norm": 0.3783870339393616, + "learning_rate": 8.334401233867418e-06, + "loss": 0.365, + "step": 4066 + }, + { + "epoch": 1.022887323943662, + "grad_norm": 0.41309934854507446, + "learning_rate": 8.333310730318457e-06, + "loss": 0.3703, + "step": 4067 + }, + { + "epoch": 1.023138832997988, + "grad_norm": 0.5328013300895691, + "learning_rate": 8.332219941292286e-06, + "loss": 0.3598, + "step": 4068 + }, + { + "epoch": 1.0233903420523138, + "grad_norm": 0.3765561282634735, + "learning_rate": 8.331128866882323e-06, + "loss": 0.3709, + "step": 4069 + }, + { + "epoch": 1.0236418511066399, + "grad_norm": 0.37244337797164917, + "learning_rate": 8.330037507182012e-06, + "loss": 0.3685, + "step": 4070 + }, + { + "epoch": 1.0238933601609659, + "grad_norm": 0.3581307530403137, + "learning_rate": 8.328945862284821e-06, + "loss": 0.364, + "step": 4071 + }, + { + "epoch": 1.0241448692152917, + "grad_norm": 0.32666563987731934, + "learning_rate": 8.327853932284242e-06, + "loss": 0.3368, + "step": 4072 + }, + { + "epoch": 1.0243963782696177, + "grad_norm": 0.31926366686820984, + "learning_rate": 8.326761717273793e-06, + "loss": 0.3595, + "step": 4073 + }, + { + "epoch": 1.0246478873239437, + "grad_norm": 0.3939271867275238, + "learning_rate": 8.325669217347017e-06, + "loss": 0.3444, + "step": 4074 + }, + { + "epoch": 1.0248993963782695, + "grad_norm": 0.33755072951316833, + "learning_rate": 8.324576432597476e-06, + "loss": 0.3422, + "step": 4075 + }, + { + "epoch": 1.0251509054325956, + "grad_norm": 0.3223980963230133, + "learning_rate": 8.32348336311876e-06, + "loss": 0.3244, + "step": 4076 + }, + { + "epoch": 1.0254024144869216, + "grad_norm": 0.3657818138599396, + "learning_rate": 8.322390009004488e-06, + "loss": 0.3487, + "step": 4077 + }, + { + "epoch": 1.0256539235412474, + "grad_norm": 0.3748267590999603, + "learning_rate": 8.321296370348297e-06, + "loss": 0.353, + "step": 4078 + }, + { + "epoch": 1.0259054325955734, + "grad_norm": 0.3817254304885864, + "learning_rate": 8.320202447243851e-06, + "loss": 0.3505, + "step": 4079 + }, + { + "epoch": 1.0261569416498995, + "grad_norm": 0.32929888367652893, + "learning_rate": 8.319108239784834e-06, + "loss": 0.3457, + "step": 4080 + }, + { + "epoch": 1.0264084507042253, + "grad_norm": 0.377116322517395, + "learning_rate": 8.318013748064962e-06, + "loss": 0.349, + "step": 4081 + }, + { + "epoch": 1.0266599597585513, + "grad_norm": 0.34213367104530334, + "learning_rate": 8.316918972177968e-06, + "loss": 0.3627, + "step": 4082 + }, + { + "epoch": 1.0269114688128773, + "grad_norm": 0.3591276705265045, + "learning_rate": 8.315823912217615e-06, + "loss": 0.3497, + "step": 4083 + }, + { + "epoch": 1.0271629778672031, + "grad_norm": 0.33743128180503845, + "learning_rate": 8.314728568277691e-06, + "loss": 0.3589, + "step": 4084 + }, + { + "epoch": 1.0274144869215291, + "grad_norm": 0.34647324681282043, + "learning_rate": 8.313632940452e-06, + "loss": 0.3588, + "step": 4085 + }, + { + "epoch": 1.0276659959758552, + "grad_norm": 0.3492962718009949, + "learning_rate": 8.312537028834374e-06, + "loss": 0.3581, + "step": 4086 + }, + { + "epoch": 1.027917505030181, + "grad_norm": 0.3124542832374573, + "learning_rate": 8.311440833518678e-06, + "loss": 0.3418, + "step": 4087 + }, + { + "epoch": 1.028169014084507, + "grad_norm": 0.3951411545276642, + "learning_rate": 8.310344354598791e-06, + "loss": 0.3615, + "step": 4088 + }, + { + "epoch": 1.028420523138833, + "grad_norm": 0.3501111567020416, + "learning_rate": 8.30924759216862e-06, + "loss": 0.3331, + "step": 4089 + }, + { + "epoch": 1.028672032193159, + "grad_norm": 0.36356234550476074, + "learning_rate": 8.308150546322093e-06, + "loss": 0.3728, + "step": 4090 + }, + { + "epoch": 1.0289235412474849, + "grad_norm": 0.3362111747264862, + "learning_rate": 8.30705321715317e-06, + "loss": 0.3322, + "step": 4091 + }, + { + "epoch": 1.029175050301811, + "grad_norm": 0.3690354824066162, + "learning_rate": 8.305955604755827e-06, + "loss": 0.3943, + "step": 4092 + }, + { + "epoch": 1.029426559356137, + "grad_norm": 0.3664776682853699, + "learning_rate": 8.304857709224068e-06, + "loss": 0.3586, + "step": 4093 + }, + { + "epoch": 1.0296780684104627, + "grad_norm": 0.33852067589759827, + "learning_rate": 8.303759530651921e-06, + "loss": 0.3354, + "step": 4094 + }, + { + "epoch": 1.0299295774647887, + "grad_norm": 0.339894562959671, + "learning_rate": 8.30266106913344e-06, + "loss": 0.3487, + "step": 4095 + }, + { + "epoch": 1.0301810865191148, + "grad_norm": 0.34180840849876404, + "learning_rate": 8.301562324762698e-06, + "loss": 0.3661, + "step": 4096 + }, + { + "epoch": 1.0304325955734406, + "grad_norm": 0.35557109117507935, + "learning_rate": 8.300463297633798e-06, + "loss": 0.3469, + "step": 4097 + }, + { + "epoch": 1.0306841046277666, + "grad_norm": 0.32619279623031616, + "learning_rate": 8.299363987840864e-06, + "loss": 0.3601, + "step": 4098 + }, + { + "epoch": 1.0309356136820926, + "grad_norm": 0.32750555872917175, + "learning_rate": 8.298264395478046e-06, + "loss": 0.3436, + "step": 4099 + }, + { + "epoch": 1.0311871227364184, + "grad_norm": 0.3431573808193207, + "learning_rate": 8.297164520639515e-06, + "loss": 0.3605, + "step": 4100 + }, + { + "epoch": 1.0314386317907445, + "grad_norm": 0.3561439514160156, + "learning_rate": 8.29606436341947e-06, + "loss": 0.3392, + "step": 4101 + }, + { + "epoch": 1.0316901408450705, + "grad_norm": 0.3567664921283722, + "learning_rate": 8.294963923912134e-06, + "loss": 0.3511, + "step": 4102 + }, + { + "epoch": 1.0319416498993963, + "grad_norm": 0.35723668336868286, + "learning_rate": 8.293863202211751e-06, + "loss": 0.3727, + "step": 4103 + }, + { + "epoch": 1.0321931589537223, + "grad_norm": 0.34615108370780945, + "learning_rate": 8.292762198412591e-06, + "loss": 0.3533, + "step": 4104 + }, + { + "epoch": 1.0324446680080483, + "grad_norm": 0.345207542181015, + "learning_rate": 8.291660912608948e-06, + "loss": 0.3307, + "step": 4105 + }, + { + "epoch": 1.0326961770623742, + "grad_norm": 0.3570762276649475, + "learning_rate": 8.290559344895139e-06, + "loss": 0.3474, + "step": 4106 + }, + { + "epoch": 1.0329476861167002, + "grad_norm": 0.3682233393192291, + "learning_rate": 8.28945749536551e-06, + "loss": 0.3729, + "step": 4107 + }, + { + "epoch": 1.0331991951710262, + "grad_norm": 0.4089745879173279, + "learning_rate": 8.288355364114423e-06, + "loss": 0.3803, + "step": 4108 + }, + { + "epoch": 1.033450704225352, + "grad_norm": 0.3189091682434082, + "learning_rate": 8.287252951236272e-06, + "loss": 0.367, + "step": 4109 + }, + { + "epoch": 1.033702213279678, + "grad_norm": 0.3654859662055969, + "learning_rate": 8.28615025682547e-06, + "loss": 0.3674, + "step": 4110 + }, + { + "epoch": 1.033953722334004, + "grad_norm": 0.38779398798942566, + "learning_rate": 8.285047280976458e-06, + "loss": 0.3746, + "step": 4111 + }, + { + "epoch": 1.0342052313883299, + "grad_norm": 0.3446475863456726, + "learning_rate": 8.283944023783697e-06, + "loss": 0.3528, + "step": 4112 + }, + { + "epoch": 1.034456740442656, + "grad_norm": 0.33927372097969055, + "learning_rate": 8.282840485341675e-06, + "loss": 0.3567, + "step": 4113 + }, + { + "epoch": 1.034708249496982, + "grad_norm": 0.35856184363365173, + "learning_rate": 8.281736665744902e-06, + "loss": 0.3442, + "step": 4114 + }, + { + "epoch": 1.034959758551308, + "grad_norm": 0.35313889384269714, + "learning_rate": 8.280632565087913e-06, + "loss": 0.3552, + "step": 4115 + }, + { + "epoch": 1.0352112676056338, + "grad_norm": 0.33305633068084717, + "learning_rate": 8.27952818346527e-06, + "loss": 0.3299, + "step": 4116 + }, + { + "epoch": 1.0354627766599598, + "grad_norm": 0.3259493112564087, + "learning_rate": 8.278423520971556e-06, + "loss": 0.3376, + "step": 4117 + }, + { + "epoch": 1.0357142857142858, + "grad_norm": 0.3438626527786255, + "learning_rate": 8.277318577701375e-06, + "loss": 0.3475, + "step": 4118 + }, + { + "epoch": 1.0359657947686116, + "grad_norm": 0.31181490421295166, + "learning_rate": 8.27621335374936e-06, + "loss": 0.3567, + "step": 4119 + }, + { + "epoch": 1.0362173038229376, + "grad_norm": 0.3249053359031677, + "learning_rate": 8.275107849210168e-06, + "loss": 0.358, + "step": 4120 + }, + { + "epoch": 1.0364688128772637, + "grad_norm": 0.38430964946746826, + "learning_rate": 8.27400206417848e-06, + "loss": 0.3461, + "step": 4121 + }, + { + "epoch": 1.0367203219315895, + "grad_norm": 0.3271859884262085, + "learning_rate": 8.272895998748996e-06, + "loss": 0.349, + "step": 4122 + }, + { + "epoch": 1.0369718309859155, + "grad_norm": 0.32328277826309204, + "learning_rate": 8.271789653016445e-06, + "loss": 0.3598, + "step": 4123 + }, + { + "epoch": 1.0372233400402415, + "grad_norm": 0.3417379856109619, + "learning_rate": 8.270683027075576e-06, + "loss": 0.3591, + "step": 4124 + }, + { + "epoch": 1.0374748490945673, + "grad_norm": 0.31911101937294006, + "learning_rate": 8.26957612102117e-06, + "loss": 0.3416, + "step": 4125 + }, + { + "epoch": 1.0377263581488934, + "grad_norm": 0.31615710258483887, + "learning_rate": 8.268468934948023e-06, + "loss": 0.3325, + "step": 4126 + }, + { + "epoch": 1.0379778672032194, + "grad_norm": 0.34855955839157104, + "learning_rate": 8.267361468950958e-06, + "loss": 0.3408, + "step": 4127 + }, + { + "epoch": 1.0382293762575452, + "grad_norm": 0.3437637984752655, + "learning_rate": 8.266253723124825e-06, + "loss": 0.3463, + "step": 4128 + }, + { + "epoch": 1.0384808853118712, + "grad_norm": 0.33804404735565186, + "learning_rate": 8.265145697564493e-06, + "loss": 0.3452, + "step": 4129 + }, + { + "epoch": 1.0387323943661972, + "grad_norm": 0.3620853126049042, + "learning_rate": 8.26403739236486e-06, + "loss": 0.364, + "step": 4130 + }, + { + "epoch": 1.038983903420523, + "grad_norm": 0.3575562834739685, + "learning_rate": 8.262928807620843e-06, + "loss": 0.3741, + "step": 4131 + }, + { + "epoch": 1.039235412474849, + "grad_norm": 0.34148597717285156, + "learning_rate": 8.261819943427387e-06, + "loss": 0.3194, + "step": 4132 + }, + { + "epoch": 1.039486921529175, + "grad_norm": 0.3127982020378113, + "learning_rate": 8.26071079987946e-06, + "loss": 0.3364, + "step": 4133 + }, + { + "epoch": 1.039738430583501, + "grad_norm": 0.3347800374031067, + "learning_rate": 8.25960137707205e-06, + "loss": 0.3603, + "step": 4134 + }, + { + "epoch": 1.039989939637827, + "grad_norm": 0.3456893861293793, + "learning_rate": 8.258491675100175e-06, + "loss": 0.3734, + "step": 4135 + }, + { + "epoch": 1.040241448692153, + "grad_norm": 0.3339012861251831, + "learning_rate": 8.257381694058873e-06, + "loss": 0.3563, + "step": 4136 + }, + { + "epoch": 1.040492957746479, + "grad_norm": 0.31846827268600464, + "learning_rate": 8.256271434043206e-06, + "loss": 0.3469, + "step": 4137 + }, + { + "epoch": 1.0407444668008048, + "grad_norm": 0.3521193861961365, + "learning_rate": 8.255160895148263e-06, + "loss": 0.3225, + "step": 4138 + }, + { + "epoch": 1.0409959758551308, + "grad_norm": 0.34375718235969543, + "learning_rate": 8.254050077469153e-06, + "loss": 0.3366, + "step": 4139 + }, + { + "epoch": 1.0412474849094568, + "grad_norm": 0.3343333899974823, + "learning_rate": 8.252938981101011e-06, + "loss": 0.3674, + "step": 4140 + }, + { + "epoch": 1.0414989939637826, + "grad_norm": 0.32967597246170044, + "learning_rate": 8.251827606138996e-06, + "loss": 0.3581, + "step": 4141 + }, + { + "epoch": 1.0417505030181087, + "grad_norm": 0.3318265676498413, + "learning_rate": 8.25071595267829e-06, + "loss": 0.3474, + "step": 4142 + }, + { + "epoch": 1.0420020120724347, + "grad_norm": 0.29342010617256165, + "learning_rate": 8.249604020814099e-06, + "loss": 0.3322, + "step": 4143 + }, + { + "epoch": 1.0422535211267605, + "grad_norm": 0.3520354628562927, + "learning_rate": 8.248491810641655e-06, + "loss": 0.3457, + "step": 4144 + }, + { + "epoch": 1.0425050301810865, + "grad_norm": 0.3410925269126892, + "learning_rate": 8.247379322256206e-06, + "loss": 0.3429, + "step": 4145 + }, + { + "epoch": 1.0427565392354126, + "grad_norm": 0.31338703632354736, + "learning_rate": 8.246266555753036e-06, + "loss": 0.3464, + "step": 4146 + }, + { + "epoch": 1.0430080482897384, + "grad_norm": 0.31659606099128723, + "learning_rate": 8.245153511227443e-06, + "loss": 0.3355, + "step": 4147 + }, + { + "epoch": 1.0432595573440644, + "grad_norm": 0.34237006306648254, + "learning_rate": 8.244040188774755e-06, + "loss": 0.3729, + "step": 4148 + }, + { + "epoch": 1.0435110663983904, + "grad_norm": 0.3474802076816559, + "learning_rate": 8.24292658849032e-06, + "loss": 0.3643, + "step": 4149 + }, + { + "epoch": 1.0437625754527162, + "grad_norm": 0.31629714369773865, + "learning_rate": 8.241812710469507e-06, + "loss": 0.3424, + "step": 4150 + }, + { + "epoch": 1.0440140845070423, + "grad_norm": 0.3564571440219879, + "learning_rate": 8.240698554807717e-06, + "loss": 0.333, + "step": 4151 + }, + { + "epoch": 1.0442655935613683, + "grad_norm": 0.36076509952545166, + "learning_rate": 8.239584121600371e-06, + "loss": 0.3744, + "step": 4152 + }, + { + "epoch": 1.044517102615694, + "grad_norm": 0.33399519324302673, + "learning_rate": 8.238469410942911e-06, + "loss": 0.3488, + "step": 4153 + }, + { + "epoch": 1.04476861167002, + "grad_norm": 0.35272693634033203, + "learning_rate": 8.237354422930807e-06, + "loss": 0.3651, + "step": 4154 + }, + { + "epoch": 1.0450201207243461, + "grad_norm": 0.3702928125858307, + "learning_rate": 8.236239157659548e-06, + "loss": 0.3624, + "step": 4155 + }, + { + "epoch": 1.045271629778672, + "grad_norm": 0.3613063395023346, + "learning_rate": 8.235123615224651e-06, + "loss": 0.3579, + "step": 4156 + }, + { + "epoch": 1.045523138832998, + "grad_norm": 0.3584921956062317, + "learning_rate": 8.234007795721657e-06, + "loss": 0.3474, + "step": 4157 + }, + { + "epoch": 1.045774647887324, + "grad_norm": 0.3665167987346649, + "learning_rate": 8.232891699246126e-06, + "loss": 0.3479, + "step": 4158 + }, + { + "epoch": 1.0460261569416498, + "grad_norm": 0.3354139029979706, + "learning_rate": 8.231775325893646e-06, + "loss": 0.3608, + "step": 4159 + }, + { + "epoch": 1.0462776659959758, + "grad_norm": 0.31492966413497925, + "learning_rate": 8.230658675759827e-06, + "loss": 0.3536, + "step": 4160 + }, + { + "epoch": 1.0465291750503019, + "grad_norm": 0.3596387505531311, + "learning_rate": 8.229541748940301e-06, + "loss": 0.3646, + "step": 4161 + }, + { + "epoch": 1.0467806841046277, + "grad_norm": 0.3706818222999573, + "learning_rate": 8.22842454553073e-06, + "loss": 0.355, + "step": 4162 + }, + { + "epoch": 1.0470321931589537, + "grad_norm": 0.37192225456237793, + "learning_rate": 8.227307065626796e-06, + "loss": 0.3815, + "step": 4163 + }, + { + "epoch": 1.0472837022132797, + "grad_norm": 0.3455519378185272, + "learning_rate": 8.2261893093242e-06, + "loss": 0.363, + "step": 4164 + }, + { + "epoch": 1.0475352112676057, + "grad_norm": 0.33832308650016785, + "learning_rate": 8.225071276718672e-06, + "loss": 0.3476, + "step": 4165 + }, + { + "epoch": 1.0477867203219315, + "grad_norm": 0.3629179000854492, + "learning_rate": 8.223952967905967e-06, + "loss": 0.3711, + "step": 4166 + }, + { + "epoch": 1.0480382293762576, + "grad_norm": 0.3548809587955475, + "learning_rate": 8.222834382981858e-06, + "loss": 0.3507, + "step": 4167 + }, + { + "epoch": 1.0482897384305836, + "grad_norm": 0.3626018464565277, + "learning_rate": 8.221715522042146e-06, + "loss": 0.3621, + "step": 4168 + }, + { + "epoch": 1.0485412474849094, + "grad_norm": 0.310674786567688, + "learning_rate": 8.220596385182654e-06, + "loss": 0.3744, + "step": 4169 + }, + { + "epoch": 1.0487927565392354, + "grad_norm": 0.35073432326316833, + "learning_rate": 8.219476972499229e-06, + "loss": 0.3453, + "step": 4170 + }, + { + "epoch": 1.0490442655935615, + "grad_norm": 0.3294939696788788, + "learning_rate": 8.218357284087745e-06, + "loss": 0.3357, + "step": 4171 + }, + { + "epoch": 1.0492957746478873, + "grad_norm": 0.33905303478240967, + "learning_rate": 8.217237320044092e-06, + "loss": 0.3369, + "step": 4172 + }, + { + "epoch": 1.0495472837022133, + "grad_norm": 0.34386754035949707, + "learning_rate": 8.216117080464189e-06, + "loss": 0.3506, + "step": 4173 + }, + { + "epoch": 1.0497987927565393, + "grad_norm": 0.3818106949329376, + "learning_rate": 8.214996565443979e-06, + "loss": 0.3677, + "step": 4174 + }, + { + "epoch": 1.0500503018108651, + "grad_norm": 0.3344820439815521, + "learning_rate": 8.213875775079426e-06, + "loss": 0.3711, + "step": 4175 + }, + { + "epoch": 1.0503018108651911, + "grad_norm": 0.3421842157840729, + "learning_rate": 8.212754709466519e-06, + "loss": 0.3573, + "step": 4176 + }, + { + "epoch": 1.0505533199195172, + "grad_norm": 0.37867775559425354, + "learning_rate": 8.211633368701268e-06, + "loss": 0.3564, + "step": 4177 + }, + { + "epoch": 1.050804828973843, + "grad_norm": 0.3431204855442047, + "learning_rate": 8.210511752879713e-06, + "loss": 0.3577, + "step": 4178 + }, + { + "epoch": 1.051056338028169, + "grad_norm": 0.32079920172691345, + "learning_rate": 8.209389862097912e-06, + "loss": 0.3484, + "step": 4179 + }, + { + "epoch": 1.051307847082495, + "grad_norm": 0.3572438955307007, + "learning_rate": 8.208267696451947e-06, + "loss": 0.382, + "step": 4180 + }, + { + "epoch": 1.0515593561368208, + "grad_norm": 0.3327823579311371, + "learning_rate": 8.207145256037922e-06, + "loss": 0.332, + "step": 4181 + }, + { + "epoch": 1.0518108651911469, + "grad_norm": 0.35914427042007446, + "learning_rate": 8.206022540951972e-06, + "loss": 0.3365, + "step": 4182 + }, + { + "epoch": 1.0520623742454729, + "grad_norm": 0.35635095834732056, + "learning_rate": 8.204899551290246e-06, + "loss": 0.366, + "step": 4183 + }, + { + "epoch": 1.0523138832997987, + "grad_norm": 0.3339400291442871, + "learning_rate": 8.203776287148925e-06, + "loss": 0.342, + "step": 4184 + }, + { + "epoch": 1.0525653923541247, + "grad_norm": 0.3433733582496643, + "learning_rate": 8.202652748624208e-06, + "loss": 0.3418, + "step": 4185 + }, + { + "epoch": 1.0528169014084507, + "grad_norm": 0.34672051668167114, + "learning_rate": 8.201528935812318e-06, + "loss": 0.3415, + "step": 4186 + }, + { + "epoch": 1.0530684104627768, + "grad_norm": 0.3377094566822052, + "learning_rate": 8.200404848809504e-06, + "loss": 0.3579, + "step": 4187 + }, + { + "epoch": 1.0533199195171026, + "grad_norm": 0.3439509868621826, + "learning_rate": 8.199280487712035e-06, + "loss": 0.3485, + "step": 4188 + }, + { + "epoch": 1.0535714285714286, + "grad_norm": 0.3966297507286072, + "learning_rate": 8.198155852616208e-06, + "loss": 0.381, + "step": 4189 + }, + { + "epoch": 1.0538229376257546, + "grad_norm": 0.3165939450263977, + "learning_rate": 8.19703094361834e-06, + "loss": 0.3462, + "step": 4190 + }, + { + "epoch": 1.0540744466800804, + "grad_norm": 0.30367371439933777, + "learning_rate": 8.195905760814772e-06, + "loss": 0.3467, + "step": 4191 + }, + { + "epoch": 1.0543259557344065, + "grad_norm": 0.34282615780830383, + "learning_rate": 8.194780304301869e-06, + "loss": 0.3466, + "step": 4192 + }, + { + "epoch": 1.0545774647887325, + "grad_norm": 0.3848336338996887, + "learning_rate": 8.19365457417602e-06, + "loss": 0.3636, + "step": 4193 + }, + { + "epoch": 1.0548289738430583, + "grad_norm": 0.32148125767707825, + "learning_rate": 8.192528570533636e-06, + "loss": 0.3611, + "step": 4194 + }, + { + "epoch": 1.0550804828973843, + "grad_norm": 0.35009652376174927, + "learning_rate": 8.191402293471151e-06, + "loss": 0.3771, + "step": 4195 + }, + { + "epoch": 1.0553319919517103, + "grad_norm": 0.37090542912483215, + "learning_rate": 8.190275743085025e-06, + "loss": 0.3681, + "step": 4196 + }, + { + "epoch": 1.0555835010060362, + "grad_norm": 0.3471124470233917, + "learning_rate": 8.18914891947174e-06, + "loss": 0.3665, + "step": 4197 + }, + { + "epoch": 1.0558350100603622, + "grad_norm": 0.34268495440483093, + "learning_rate": 8.188021822727804e-06, + "loss": 0.3778, + "step": 4198 + }, + { + "epoch": 1.0560865191146882, + "grad_norm": 0.35175612568855286, + "learning_rate": 8.18689445294974e-06, + "loss": 0.3611, + "step": 4199 + }, + { + "epoch": 1.056338028169014, + "grad_norm": 0.3248383104801178, + "learning_rate": 8.185766810234106e-06, + "loss": 0.3624, + "step": 4200 + }, + { + "epoch": 1.05658953722334, + "grad_norm": 0.35728633403778076, + "learning_rate": 8.184638894677472e-06, + "loss": 0.345, + "step": 4201 + }, + { + "epoch": 1.056841046277666, + "grad_norm": 0.3417149782180786, + "learning_rate": 8.183510706376441e-06, + "loss": 0.3637, + "step": 4202 + }, + { + "epoch": 1.0570925553319919, + "grad_norm": 0.37575763463974, + "learning_rate": 8.182382245427634e-06, + "loss": 0.3408, + "step": 4203 + }, + { + "epoch": 1.057344064386318, + "grad_norm": 0.37389978766441345, + "learning_rate": 8.181253511927696e-06, + "loss": 0.3252, + "step": 4204 + }, + { + "epoch": 1.057595573440644, + "grad_norm": 0.35000425577163696, + "learning_rate": 8.180124505973299e-06, + "loss": 0.3451, + "step": 4205 + }, + { + "epoch": 1.0578470824949697, + "grad_norm": 0.3386198878288269, + "learning_rate": 8.17899522766113e-06, + "loss": 0.3179, + "step": 4206 + }, + { + "epoch": 1.0580985915492958, + "grad_norm": 0.3945431411266327, + "learning_rate": 8.177865677087908e-06, + "loss": 0.3231, + "step": 4207 + }, + { + "epoch": 1.0583501006036218, + "grad_norm": 0.34005576372146606, + "learning_rate": 8.176735854350373e-06, + "loss": 0.347, + "step": 4208 + }, + { + "epoch": 1.0586016096579476, + "grad_norm": 0.36175423860549927, + "learning_rate": 8.175605759545285e-06, + "loss": 0.3508, + "step": 4209 + }, + { + "epoch": 1.0588531187122736, + "grad_norm": 0.35928767919540405, + "learning_rate": 8.17447539276943e-06, + "loss": 0.3871, + "step": 4210 + }, + { + "epoch": 1.0591046277665996, + "grad_norm": 0.4204372763633728, + "learning_rate": 8.173344754119615e-06, + "loss": 0.3536, + "step": 4211 + }, + { + "epoch": 1.0593561368209254, + "grad_norm": 0.3679214119911194, + "learning_rate": 8.172213843692676e-06, + "loss": 0.3841, + "step": 4212 + }, + { + "epoch": 1.0596076458752515, + "grad_norm": 0.3367496132850647, + "learning_rate": 8.171082661585468e-06, + "loss": 0.3806, + "step": 4213 + }, + { + "epoch": 1.0598591549295775, + "grad_norm": 0.3853965401649475, + "learning_rate": 8.169951207894866e-06, + "loss": 0.35, + "step": 4214 + }, + { + "epoch": 1.0601106639839035, + "grad_norm": 0.3903757929801941, + "learning_rate": 8.168819482717775e-06, + "loss": 0.3529, + "step": 4215 + }, + { + "epoch": 1.0603621730382293, + "grad_norm": 0.35560932755470276, + "learning_rate": 8.167687486151119e-06, + "loss": 0.3368, + "step": 4216 + }, + { + "epoch": 1.0606136820925554, + "grad_norm": 0.3805678188800812, + "learning_rate": 8.166555218291847e-06, + "loss": 0.332, + "step": 4217 + }, + { + "epoch": 1.0608651911468814, + "grad_norm": 0.32614269852638245, + "learning_rate": 8.16542267923693e-06, + "loss": 0.3387, + "step": 4218 + }, + { + "epoch": 1.0611167002012072, + "grad_norm": 0.35080480575561523, + "learning_rate": 8.164289869083365e-06, + "loss": 0.3367, + "step": 4219 + }, + { + "epoch": 1.0613682092555332, + "grad_norm": 0.3625401556491852, + "learning_rate": 8.163156787928169e-06, + "loss": 0.3508, + "step": 4220 + }, + { + "epoch": 1.0616197183098592, + "grad_norm": 0.4160093367099762, + "learning_rate": 8.162023435868381e-06, + "loss": 0.3577, + "step": 4221 + }, + { + "epoch": 1.061871227364185, + "grad_norm": 0.3521963059902191, + "learning_rate": 8.160889813001066e-06, + "loss": 0.3468, + "step": 4222 + }, + { + "epoch": 1.062122736418511, + "grad_norm": 0.37421825528144836, + "learning_rate": 8.159755919423315e-06, + "loss": 0.3796, + "step": 4223 + }, + { + "epoch": 1.062374245472837, + "grad_norm": 0.34013405442237854, + "learning_rate": 8.158621755232237e-06, + "loss": 0.3479, + "step": 4224 + }, + { + "epoch": 1.062625754527163, + "grad_norm": 0.376768559217453, + "learning_rate": 8.157487320524964e-06, + "loss": 0.3701, + "step": 4225 + }, + { + "epoch": 1.062877263581489, + "grad_norm": 0.36440804600715637, + "learning_rate": 8.156352615398658e-06, + "loss": 0.386, + "step": 4226 + }, + { + "epoch": 1.063128772635815, + "grad_norm": 0.3305298089981079, + "learning_rate": 8.155217639950494e-06, + "loss": 0.3242, + "step": 4227 + }, + { + "epoch": 1.0633802816901408, + "grad_norm": 0.33112478256225586, + "learning_rate": 8.154082394277678e-06, + "loss": 0.3429, + "step": 4228 + }, + { + "epoch": 1.0636317907444668, + "grad_norm": 0.3222111165523529, + "learning_rate": 8.15294687847744e-06, + "loss": 0.3478, + "step": 4229 + }, + { + "epoch": 1.0638832997987928, + "grad_norm": 0.3655967116355896, + "learning_rate": 8.151811092647024e-06, + "loss": 0.3619, + "step": 4230 + }, + { + "epoch": 1.0641348088531186, + "grad_norm": 0.3364644944667816, + "learning_rate": 8.150675036883705e-06, + "loss": 0.3434, + "step": 4231 + }, + { + "epoch": 1.0643863179074446, + "grad_norm": 0.34147587418556213, + "learning_rate": 8.149538711284782e-06, + "loss": 0.3561, + "step": 4232 + }, + { + "epoch": 1.0646378269617707, + "grad_norm": 0.33940380811691284, + "learning_rate": 8.14840211594757e-06, + "loss": 0.3664, + "step": 4233 + }, + { + "epoch": 1.0648893360160965, + "grad_norm": 0.3298323154449463, + "learning_rate": 8.147265250969415e-06, + "loss": 0.3362, + "step": 4234 + }, + { + "epoch": 1.0651408450704225, + "grad_norm": 0.34013256430625916, + "learning_rate": 8.146128116447679e-06, + "loss": 0.3753, + "step": 4235 + }, + { + "epoch": 1.0653923541247485, + "grad_norm": 0.3295930325984955, + "learning_rate": 8.144990712479753e-06, + "loss": 0.3482, + "step": 4236 + }, + { + "epoch": 1.0656438631790746, + "grad_norm": 0.33826765418052673, + "learning_rate": 8.143853039163046e-06, + "loss": 0.3402, + "step": 4237 + }, + { + "epoch": 1.0658953722334004, + "grad_norm": 0.35167795419692993, + "learning_rate": 8.142715096594994e-06, + "loss": 0.3993, + "step": 4238 + }, + { + "epoch": 1.0661468812877264, + "grad_norm": 0.35461968183517456, + "learning_rate": 8.141576884873054e-06, + "loss": 0.3738, + "step": 4239 + }, + { + "epoch": 1.0663983903420524, + "grad_norm": 0.3144620358943939, + "learning_rate": 8.140438404094711e-06, + "loss": 0.3594, + "step": 4240 + }, + { + "epoch": 1.0666498993963782, + "grad_norm": 0.33328554034233093, + "learning_rate": 8.139299654357462e-06, + "loss": 0.3361, + "step": 4241 + }, + { + "epoch": 1.0669014084507042, + "grad_norm": 0.4244604706764221, + "learning_rate": 8.138160635758839e-06, + "loss": 0.3409, + "step": 4242 + }, + { + "epoch": 1.0671529175050303, + "grad_norm": 0.336699515581131, + "learning_rate": 8.137021348396389e-06, + "loss": 0.3496, + "step": 4243 + }, + { + "epoch": 1.067404426559356, + "grad_norm": 0.3206990361213684, + "learning_rate": 8.135881792367686e-06, + "loss": 0.3369, + "step": 4244 + }, + { + "epoch": 1.067655935613682, + "grad_norm": 0.34667348861694336, + "learning_rate": 8.134741967770325e-06, + "loss": 0.3572, + "step": 4245 + }, + { + "epoch": 1.0679074446680081, + "grad_norm": 0.3545694351196289, + "learning_rate": 8.133601874701926e-06, + "loss": 0.3533, + "step": 4246 + }, + { + "epoch": 1.068158953722334, + "grad_norm": 0.33755382895469666, + "learning_rate": 8.13246151326013e-06, + "loss": 0.3517, + "step": 4247 + }, + { + "epoch": 1.06841046277666, + "grad_norm": 0.33736327290534973, + "learning_rate": 8.131320883542601e-06, + "loss": 0.3281, + "step": 4248 + }, + { + "epoch": 1.068661971830986, + "grad_norm": 0.3781490623950958, + "learning_rate": 8.13017998564703e-06, + "loss": 0.3733, + "step": 4249 + }, + { + "epoch": 1.0689134808853118, + "grad_norm": 0.35155022144317627, + "learning_rate": 8.129038819671122e-06, + "loss": 0.3716, + "step": 4250 + }, + { + "epoch": 1.0691649899396378, + "grad_norm": 0.36461761593818665, + "learning_rate": 8.127897385712616e-06, + "loss": 0.3367, + "step": 4251 + }, + { + "epoch": 1.0694164989939638, + "grad_norm": 0.3492373526096344, + "learning_rate": 8.126755683869267e-06, + "loss": 0.3376, + "step": 4252 + }, + { + "epoch": 1.0696680080482897, + "grad_norm": 0.3286789357662201, + "learning_rate": 8.125613714238855e-06, + "loss": 0.3334, + "step": 4253 + }, + { + "epoch": 1.0699195171026157, + "grad_norm": 0.33751797676086426, + "learning_rate": 8.124471476919183e-06, + "loss": 0.3599, + "step": 4254 + }, + { + "epoch": 1.0701710261569417, + "grad_norm": 0.37270689010620117, + "learning_rate": 8.123328972008075e-06, + "loss": 0.3753, + "step": 4255 + }, + { + "epoch": 1.0704225352112675, + "grad_norm": 0.3367967903614044, + "learning_rate": 8.122186199603378e-06, + "loss": 0.3578, + "step": 4256 + }, + { + "epoch": 1.0706740442655935, + "grad_norm": 0.3527868986129761, + "learning_rate": 8.121043159802969e-06, + "loss": 0.3491, + "step": 4257 + }, + { + "epoch": 1.0709255533199196, + "grad_norm": 0.3186793029308319, + "learning_rate": 8.119899852704736e-06, + "loss": 0.3345, + "step": 4258 + }, + { + "epoch": 1.0711770623742454, + "grad_norm": 0.34226033091545105, + "learning_rate": 8.1187562784066e-06, + "loss": 0.371, + "step": 4259 + }, + { + "epoch": 1.0714285714285714, + "grad_norm": 0.36788326501846313, + "learning_rate": 8.1176124370065e-06, + "loss": 0.3486, + "step": 4260 + }, + { + "epoch": 1.0716800804828974, + "grad_norm": 0.33759674429893494, + "learning_rate": 8.116468328602397e-06, + "loss": 0.3845, + "step": 4261 + }, + { + "epoch": 1.0719315895372232, + "grad_norm": 0.32118430733680725, + "learning_rate": 8.115323953292278e-06, + "loss": 0.3362, + "step": 4262 + }, + { + "epoch": 1.0721830985915493, + "grad_norm": 0.3653360605239868, + "learning_rate": 8.114179311174154e-06, + "loss": 0.3502, + "step": 4263 + }, + { + "epoch": 1.0724346076458753, + "grad_norm": 0.32888534665107727, + "learning_rate": 8.113034402346052e-06, + "loss": 0.3454, + "step": 4264 + }, + { + "epoch": 1.0726861167002013, + "grad_norm": 0.3332807719707489, + "learning_rate": 8.11188922690603e-06, + "loss": 0.3384, + "step": 4265 + }, + { + "epoch": 1.0729376257545271, + "grad_norm": 0.36372309923171997, + "learning_rate": 8.110743784952162e-06, + "loss": 0.3606, + "step": 4266 + }, + { + "epoch": 1.0731891348088531, + "grad_norm": 0.3299432396888733, + "learning_rate": 8.10959807658255e-06, + "loss": 0.3564, + "step": 4267 + }, + { + "epoch": 1.0734406438631792, + "grad_norm": 0.3273921012878418, + "learning_rate": 8.108452101895317e-06, + "loss": 0.33, + "step": 4268 + }, + { + "epoch": 1.073692152917505, + "grad_norm": 0.3181513547897339, + "learning_rate": 8.107305860988608e-06, + "loss": 0.3416, + "step": 4269 + }, + { + "epoch": 1.073943661971831, + "grad_norm": 0.3415665924549103, + "learning_rate": 8.10615935396059e-06, + "loss": 0.353, + "step": 4270 + }, + { + "epoch": 1.074195171026157, + "grad_norm": 0.34259921312332153, + "learning_rate": 8.105012580909457e-06, + "loss": 0.373, + "step": 4271 + }, + { + "epoch": 1.0744466800804828, + "grad_norm": 0.3158281147480011, + "learning_rate": 8.103865541933421e-06, + "loss": 0.3496, + "step": 4272 + }, + { + "epoch": 1.0746981891348089, + "grad_norm": 0.32061338424682617, + "learning_rate": 8.102718237130718e-06, + "loss": 0.3632, + "step": 4273 + }, + { + "epoch": 1.0749496981891349, + "grad_norm": 0.35798168182373047, + "learning_rate": 8.101570666599608e-06, + "loss": 0.3756, + "step": 4274 + }, + { + "epoch": 1.0752012072434607, + "grad_norm": 0.30440935492515564, + "learning_rate": 8.100422830438376e-06, + "loss": 0.359, + "step": 4275 + }, + { + "epoch": 1.0754527162977867, + "grad_norm": 0.31603163480758667, + "learning_rate": 8.099274728745324e-06, + "loss": 0.339, + "step": 4276 + }, + { + "epoch": 1.0757042253521127, + "grad_norm": 0.3540377914905548, + "learning_rate": 8.09812636161878e-06, + "loss": 0.3457, + "step": 4277 + }, + { + "epoch": 1.0759557344064385, + "grad_norm": 0.33938080072402954, + "learning_rate": 8.096977729157096e-06, + "loss": 0.3666, + "step": 4278 + }, + { + "epoch": 1.0762072434607646, + "grad_norm": 0.36812707781791687, + "learning_rate": 8.095828831458643e-06, + "loss": 0.3479, + "step": 4279 + }, + { + "epoch": 1.0764587525150906, + "grad_norm": 0.33766672015190125, + "learning_rate": 8.094679668621818e-06, + "loss": 0.3474, + "step": 4280 + }, + { + "epoch": 1.0767102615694164, + "grad_norm": 0.3450581133365631, + "learning_rate": 8.09353024074504e-06, + "loss": 0.3536, + "step": 4281 + }, + { + "epoch": 1.0769617706237424, + "grad_norm": 0.37925583124160767, + "learning_rate": 8.09238054792675e-06, + "loss": 0.347, + "step": 4282 + }, + { + "epoch": 1.0772132796780685, + "grad_norm": 0.35549673438072205, + "learning_rate": 8.091230590265411e-06, + "loss": 0.3493, + "step": 4283 + }, + { + "epoch": 1.0774647887323943, + "grad_norm": 0.317950040102005, + "learning_rate": 8.090080367859512e-06, + "loss": 0.3477, + "step": 4284 + }, + { + "epoch": 1.0777162977867203, + "grad_norm": 0.3614874482154846, + "learning_rate": 8.08892988080756e-06, + "loss": 0.36, + "step": 4285 + }, + { + "epoch": 1.0779678068410463, + "grad_norm": 0.35449376702308655, + "learning_rate": 8.087779129208088e-06, + "loss": 0.371, + "step": 4286 + }, + { + "epoch": 1.0782193158953723, + "grad_norm": 0.32577499747276306, + "learning_rate": 8.086628113159651e-06, + "loss": 0.3402, + "step": 4287 + }, + { + "epoch": 1.0784708249496981, + "grad_norm": 0.3384754955768585, + "learning_rate": 8.085476832760828e-06, + "loss": 0.3697, + "step": 4288 + }, + { + "epoch": 1.0787223340040242, + "grad_norm": 0.3504764139652252, + "learning_rate": 8.084325288110215e-06, + "loss": 0.3467, + "step": 4289 + }, + { + "epoch": 1.0789738430583502, + "grad_norm": 0.3410959243774414, + "learning_rate": 8.083173479306436e-06, + "loss": 0.3749, + "step": 4290 + }, + { + "epoch": 1.079225352112676, + "grad_norm": 0.34386518597602844, + "learning_rate": 8.082021406448137e-06, + "loss": 0.3713, + "step": 4291 + }, + { + "epoch": 1.079476861167002, + "grad_norm": 0.3209969699382782, + "learning_rate": 8.080869069633987e-06, + "loss": 0.339, + "step": 4292 + }, + { + "epoch": 1.079728370221328, + "grad_norm": 0.3502729833126068, + "learning_rate": 8.079716468962673e-06, + "loss": 0.3597, + "step": 4293 + }, + { + "epoch": 1.0799798792756539, + "grad_norm": 0.3301171064376831, + "learning_rate": 8.078563604532911e-06, + "loss": 0.3534, + "step": 4294 + }, + { + "epoch": 1.08023138832998, + "grad_norm": 0.38163283467292786, + "learning_rate": 8.077410476443436e-06, + "loss": 0.3326, + "step": 4295 + }, + { + "epoch": 1.080482897384306, + "grad_norm": 0.30207228660583496, + "learning_rate": 8.076257084793007e-06, + "loss": 0.3437, + "step": 4296 + }, + { + "epoch": 1.0807344064386317, + "grad_norm": 0.34427884221076965, + "learning_rate": 8.075103429680402e-06, + "loss": 0.3593, + "step": 4297 + }, + { + "epoch": 1.0809859154929577, + "grad_norm": 0.40237826108932495, + "learning_rate": 8.073949511204426e-06, + "loss": 0.3564, + "step": 4298 + }, + { + "epoch": 1.0812374245472838, + "grad_norm": 0.34032902121543884, + "learning_rate": 8.072795329463907e-06, + "loss": 0.3585, + "step": 4299 + }, + { + "epoch": 1.0814889336016096, + "grad_norm": 0.3582513630390167, + "learning_rate": 8.07164088455769e-06, + "loss": 0.3272, + "step": 4300 + }, + { + "epoch": 1.0817404426559356, + "grad_norm": 0.3856683075428009, + "learning_rate": 8.070486176584647e-06, + "loss": 0.3603, + "step": 4301 + }, + { + "epoch": 1.0819919517102616, + "grad_norm": 0.3110501766204834, + "learning_rate": 8.069331205643671e-06, + "loss": 0.3748, + "step": 4302 + }, + { + "epoch": 1.0822434607645874, + "grad_norm": 0.3185417950153351, + "learning_rate": 8.068175971833679e-06, + "loss": 0.3331, + "step": 4303 + }, + { + "epoch": 1.0824949698189135, + "grad_norm": 0.33823883533477783, + "learning_rate": 8.06702047525361e-06, + "loss": 0.3492, + "step": 4304 + }, + { + "epoch": 1.0827464788732395, + "grad_norm": 0.3247283399105072, + "learning_rate": 8.065864716002426e-06, + "loss": 0.3648, + "step": 4305 + }, + { + "epoch": 1.0829979879275653, + "grad_norm": 0.35124877095222473, + "learning_rate": 8.064708694179107e-06, + "loss": 0.345, + "step": 4306 + }, + { + "epoch": 1.0832494969818913, + "grad_norm": 0.3573397696018219, + "learning_rate": 8.063552409882662e-06, + "loss": 0.3676, + "step": 4307 + }, + { + "epoch": 1.0835010060362174, + "grad_norm": 0.33024418354034424, + "learning_rate": 8.06239586321212e-06, + "loss": 0.3826, + "step": 4308 + }, + { + "epoch": 1.0837525150905432, + "grad_norm": 0.33846285939216614, + "learning_rate": 8.06123905426653e-06, + "loss": 0.3585, + "step": 4309 + }, + { + "epoch": 1.0840040241448692, + "grad_norm": 0.34866371750831604, + "learning_rate": 8.060081983144964e-06, + "loss": 0.3492, + "step": 4310 + }, + { + "epoch": 1.0842555331991952, + "grad_norm": 0.3329204320907593, + "learning_rate": 8.058924649946523e-06, + "loss": 0.3892, + "step": 4311 + }, + { + "epoch": 1.084507042253521, + "grad_norm": 0.32447880506515503, + "learning_rate": 8.05776705477032e-06, + "loss": 0.3566, + "step": 4312 + }, + { + "epoch": 1.084758551307847, + "grad_norm": 0.33247438073158264, + "learning_rate": 8.0566091977155e-06, + "loss": 0.3458, + "step": 4313 + }, + { + "epoch": 1.085010060362173, + "grad_norm": 0.3221612572669983, + "learning_rate": 8.055451078881221e-06, + "loss": 0.3629, + "step": 4314 + }, + { + "epoch": 1.085261569416499, + "grad_norm": 0.3234616816043854, + "learning_rate": 8.054292698366674e-06, + "loss": 0.3566, + "step": 4315 + }, + { + "epoch": 1.085513078470825, + "grad_norm": 0.3308125436306, + "learning_rate": 8.053134056271064e-06, + "loss": 0.3519, + "step": 4316 + }, + { + "epoch": 1.085764587525151, + "grad_norm": 0.3306364119052887, + "learning_rate": 8.051975152693623e-06, + "loss": 0.3253, + "step": 4317 + }, + { + "epoch": 1.086016096579477, + "grad_norm": 0.343723326921463, + "learning_rate": 8.050815987733604e-06, + "loss": 0.3455, + "step": 4318 + }, + { + "epoch": 1.0862676056338028, + "grad_norm": 0.36126473546028137, + "learning_rate": 8.049656561490282e-06, + "loss": 0.3846, + "step": 4319 + }, + { + "epoch": 1.0865191146881288, + "grad_norm": 0.3083570897579193, + "learning_rate": 8.048496874062953e-06, + "loss": 0.3608, + "step": 4320 + }, + { + "epoch": 1.0867706237424548, + "grad_norm": 0.33808434009552, + "learning_rate": 8.04733692555094e-06, + "loss": 0.3466, + "step": 4321 + }, + { + "epoch": 1.0870221327967806, + "grad_norm": 0.3496091067790985, + "learning_rate": 8.04617671605358e-06, + "loss": 0.3795, + "step": 4322 + }, + { + "epoch": 1.0872736418511066, + "grad_norm": 0.34945693612098694, + "learning_rate": 8.045016245670243e-06, + "loss": 0.354, + "step": 4323 + }, + { + "epoch": 1.0875251509054327, + "grad_norm": 0.3567672669887543, + "learning_rate": 8.043855514500314e-06, + "loss": 0.3537, + "step": 4324 + }, + { + "epoch": 1.0877766599597585, + "grad_norm": 0.35311269760131836, + "learning_rate": 8.042694522643202e-06, + "loss": 0.3536, + "step": 4325 + }, + { + "epoch": 1.0880281690140845, + "grad_norm": 0.33748674392700195, + "learning_rate": 8.041533270198341e-06, + "loss": 0.371, + "step": 4326 + }, + { + "epoch": 1.0882796780684105, + "grad_norm": 0.35377809405326843, + "learning_rate": 8.04037175726518e-06, + "loss": 0.3517, + "step": 4327 + }, + { + "epoch": 1.0885311871227363, + "grad_norm": 0.3475686311721802, + "learning_rate": 8.039209983943201e-06, + "loss": 0.3564, + "step": 4328 + }, + { + "epoch": 1.0887826961770624, + "grad_norm": 0.35228431224823, + "learning_rate": 8.0380479503319e-06, + "loss": 0.3494, + "step": 4329 + }, + { + "epoch": 1.0890342052313884, + "grad_norm": 0.31928402185440063, + "learning_rate": 8.036885656530797e-06, + "loss": 0.3457, + "step": 4330 + }, + { + "epoch": 1.0892857142857142, + "grad_norm": 0.3187348246574402, + "learning_rate": 8.035723102639437e-06, + "loss": 0.3289, + "step": 4331 + }, + { + "epoch": 1.0895372233400402, + "grad_norm": 0.3454899191856384, + "learning_rate": 8.034560288757386e-06, + "loss": 0.3655, + "step": 4332 + }, + { + "epoch": 1.0897887323943662, + "grad_norm": 0.3134077787399292, + "learning_rate": 8.033397214984226e-06, + "loss": 0.35, + "step": 4333 + }, + { + "epoch": 1.0900402414486923, + "grad_norm": 0.326066792011261, + "learning_rate": 8.032233881419576e-06, + "loss": 0.3629, + "step": 4334 + }, + { + "epoch": 1.090291750503018, + "grad_norm": 0.34841668605804443, + "learning_rate": 8.031070288163061e-06, + "loss": 0.3792, + "step": 4335 + }, + { + "epoch": 1.090543259557344, + "grad_norm": 0.3101913034915924, + "learning_rate": 8.029906435314339e-06, + "loss": 0.3148, + "step": 4336 + }, + { + "epoch": 1.0907947686116701, + "grad_norm": 0.36832088232040405, + "learning_rate": 8.028742322973085e-06, + "loss": 0.3647, + "step": 4337 + }, + { + "epoch": 1.091046277665996, + "grad_norm": 0.3566989004611969, + "learning_rate": 8.027577951238999e-06, + "loss": 0.3651, + "step": 4338 + }, + { + "epoch": 1.091297786720322, + "grad_norm": 0.3333961069583893, + "learning_rate": 8.026413320211804e-06, + "loss": 0.3356, + "step": 4339 + }, + { + "epoch": 1.091549295774648, + "grad_norm": 0.38320064544677734, + "learning_rate": 8.02524842999124e-06, + "loss": 0.3819, + "step": 4340 + }, + { + "epoch": 1.0918008048289738, + "grad_norm": 0.3310604989528656, + "learning_rate": 8.024083280677073e-06, + "loss": 0.3626, + "step": 4341 + }, + { + "epoch": 1.0920523138832998, + "grad_norm": 0.3529944121837616, + "learning_rate": 8.02291787236909e-06, + "loss": 0.3707, + "step": 4342 + }, + { + "epoch": 1.0923038229376258, + "grad_norm": 0.3804549276828766, + "learning_rate": 8.021752205167108e-06, + "loss": 0.3516, + "step": 4343 + }, + { + "epoch": 1.0925553319919517, + "grad_norm": 0.3501472473144531, + "learning_rate": 8.02058627917095e-06, + "loss": 0.3521, + "step": 4344 + }, + { + "epoch": 1.0928068410462777, + "grad_norm": 0.36724236607551575, + "learning_rate": 8.019420094480475e-06, + "loss": 0.3649, + "step": 4345 + }, + { + "epoch": 1.0930583501006037, + "grad_norm": 0.3527858555316925, + "learning_rate": 8.018253651195556e-06, + "loss": 0.36, + "step": 4346 + }, + { + "epoch": 1.0933098591549295, + "grad_norm": 0.37071692943573, + "learning_rate": 8.017086949416095e-06, + "loss": 0.3735, + "step": 4347 + }, + { + "epoch": 1.0935613682092555, + "grad_norm": 0.3491380512714386, + "learning_rate": 8.015919989242014e-06, + "loss": 0.3577, + "step": 4348 + }, + { + "epoch": 1.0938128772635816, + "grad_norm": 0.36504706740379333, + "learning_rate": 8.014752770773252e-06, + "loss": 0.3314, + "step": 4349 + }, + { + "epoch": 1.0940643863179074, + "grad_norm": 0.36621904373168945, + "learning_rate": 8.013585294109773e-06, + "loss": 0.3417, + "step": 4350 + }, + { + "epoch": 1.0943158953722334, + "grad_norm": 0.35412460565567017, + "learning_rate": 8.012417559351569e-06, + "loss": 0.3774, + "step": 4351 + }, + { + "epoch": 1.0945674044265594, + "grad_norm": 0.37957510352134705, + "learning_rate": 8.011249566598647e-06, + "loss": 0.384, + "step": 4352 + }, + { + "epoch": 1.0948189134808852, + "grad_norm": 0.34848281741142273, + "learning_rate": 8.010081315951037e-06, + "loss": 0.3393, + "step": 4353 + }, + { + "epoch": 1.0950704225352113, + "grad_norm": 0.33474472165107727, + "learning_rate": 8.008912807508794e-06, + "loss": 0.351, + "step": 4354 + }, + { + "epoch": 1.0953219315895373, + "grad_norm": 0.34298956394195557, + "learning_rate": 8.007744041371993e-06, + "loss": 0.337, + "step": 4355 + }, + { + "epoch": 1.095573440643863, + "grad_norm": 0.3466607332229614, + "learning_rate": 8.00657501764073e-06, + "loss": 0.3387, + "step": 4356 + }, + { + "epoch": 1.095824949698189, + "grad_norm": 0.3456454575061798, + "learning_rate": 8.005405736415127e-06, + "loss": 0.3355, + "step": 4357 + }, + { + "epoch": 1.0960764587525151, + "grad_norm": 0.3142288327217102, + "learning_rate": 8.004236197795323e-06, + "loss": 0.3811, + "step": 4358 + }, + { + "epoch": 1.096327967806841, + "grad_norm": 0.3510921597480774, + "learning_rate": 8.003066401881484e-06, + "loss": 0.3682, + "step": 4359 + }, + { + "epoch": 1.096579476861167, + "grad_norm": 0.37072980403900146, + "learning_rate": 8.001896348773795e-06, + "loss": 0.3559, + "step": 4360 + }, + { + "epoch": 1.096830985915493, + "grad_norm": 0.3439767062664032, + "learning_rate": 8.000726038572463e-06, + "loss": 0.3517, + "step": 4361 + }, + { + "epoch": 1.0970824949698188, + "grad_norm": 0.3286254405975342, + "learning_rate": 7.999555471377719e-06, + "loss": 0.3587, + "step": 4362 + }, + { + "epoch": 1.0973340040241448, + "grad_norm": 0.33705952763557434, + "learning_rate": 7.998384647289813e-06, + "loss": 0.3666, + "step": 4363 + }, + { + "epoch": 1.0975855130784709, + "grad_norm": 0.3337515890598297, + "learning_rate": 7.997213566409022e-06, + "loss": 0.3502, + "step": 4364 + }, + { + "epoch": 1.0978370221327969, + "grad_norm": 0.3719703257083893, + "learning_rate": 7.996042228835637e-06, + "loss": 0.3815, + "step": 4365 + }, + { + "epoch": 1.0980885311871227, + "grad_norm": 0.35766953229904175, + "learning_rate": 7.994870634669978e-06, + "loss": 0.3295, + "step": 4366 + }, + { + "epoch": 1.0983400402414487, + "grad_norm": 0.33565643429756165, + "learning_rate": 7.993698784012387e-06, + "loss": 0.3499, + "step": 4367 + }, + { + "epoch": 1.0985915492957747, + "grad_norm": 0.34716835618019104, + "learning_rate": 7.992526676963222e-06, + "loss": 0.3523, + "step": 4368 + }, + { + "epoch": 1.0988430583501005, + "grad_norm": 0.33831220865249634, + "learning_rate": 7.991354313622868e-06, + "loss": 0.3546, + "step": 4369 + }, + { + "epoch": 1.0990945674044266, + "grad_norm": 0.36244910955429077, + "learning_rate": 7.990181694091733e-06, + "loss": 0.3547, + "step": 4370 + }, + { + "epoch": 1.0993460764587526, + "grad_norm": 0.33830562233924866, + "learning_rate": 7.98900881847024e-06, + "loss": 0.366, + "step": 4371 + }, + { + "epoch": 1.0995975855130784, + "grad_norm": 0.35777151584625244, + "learning_rate": 7.987835686858845e-06, + "loss": 0.336, + "step": 4372 + }, + { + "epoch": 1.0998490945674044, + "grad_norm": 0.36557304859161377, + "learning_rate": 7.986662299358012e-06, + "loss": 0.3743, + "step": 4373 + }, + { + "epoch": 1.1001006036217305, + "grad_norm": 0.33524492383003235, + "learning_rate": 7.985488656068238e-06, + "loss": 0.3942, + "step": 4374 + }, + { + "epoch": 1.1003521126760563, + "grad_norm": 0.370383620262146, + "learning_rate": 7.984314757090036e-06, + "loss": 0.3503, + "step": 4375 + }, + { + "epoch": 1.1006036217303823, + "grad_norm": 0.31732454895973206, + "learning_rate": 7.983140602523949e-06, + "loss": 0.3528, + "step": 4376 + }, + { + "epoch": 1.1008551307847083, + "grad_norm": 0.3776906728744507, + "learning_rate": 7.981966192470529e-06, + "loss": 0.3587, + "step": 4377 + }, + { + "epoch": 1.1011066398390341, + "grad_norm": 0.30028223991394043, + "learning_rate": 7.980791527030361e-06, + "loss": 0.3388, + "step": 4378 + }, + { + "epoch": 1.1013581488933601, + "grad_norm": 0.32881900668144226, + "learning_rate": 7.979616606304045e-06, + "loss": 0.3689, + "step": 4379 + }, + { + "epoch": 1.1016096579476862, + "grad_norm": 0.4046640694141388, + "learning_rate": 7.978441430392208e-06, + "loss": 0.374, + "step": 4380 + }, + { + "epoch": 1.101861167002012, + "grad_norm": 0.3668253719806671, + "learning_rate": 7.977265999395496e-06, + "loss": 0.3649, + "step": 4381 + }, + { + "epoch": 1.102112676056338, + "grad_norm": 0.34378865361213684, + "learning_rate": 7.976090313414575e-06, + "loss": 0.3674, + "step": 4382 + }, + { + "epoch": 1.102364185110664, + "grad_norm": 0.3307862877845764, + "learning_rate": 7.974914372550139e-06, + "loss": 0.3359, + "step": 4383 + }, + { + "epoch": 1.10261569416499, + "grad_norm": 0.3454669713973999, + "learning_rate": 7.973738176902897e-06, + "loss": 0.3663, + "step": 4384 + }, + { + "epoch": 1.1028672032193159, + "grad_norm": 0.3528616428375244, + "learning_rate": 7.972561726573584e-06, + "loss": 0.3359, + "step": 4385 + }, + { + "epoch": 1.1031187122736419, + "grad_norm": 0.35522276163101196, + "learning_rate": 7.971385021662956e-06, + "loss": 0.3597, + "step": 4386 + }, + { + "epoch": 1.103370221327968, + "grad_norm": 0.3870282769203186, + "learning_rate": 7.970208062271791e-06, + "loss": 0.3485, + "step": 4387 + }, + { + "epoch": 1.1036217303822937, + "grad_norm": 0.3756462037563324, + "learning_rate": 7.969030848500886e-06, + "loss": 0.3455, + "step": 4388 + }, + { + "epoch": 1.1038732394366197, + "grad_norm": 0.34227368235588074, + "learning_rate": 7.967853380451062e-06, + "loss": 0.3508, + "step": 4389 + }, + { + "epoch": 1.1041247484909458, + "grad_norm": 0.4221192002296448, + "learning_rate": 7.966675658223162e-06, + "loss": 0.3851, + "step": 4390 + }, + { + "epoch": 1.1043762575452716, + "grad_norm": 0.31191956996917725, + "learning_rate": 7.965497681918052e-06, + "loss": 0.3239, + "step": 4391 + }, + { + "epoch": 1.1046277665995976, + "grad_norm": 0.32340946793556213, + "learning_rate": 7.96431945163662e-06, + "loss": 0.3637, + "step": 4392 + }, + { + "epoch": 1.1048792756539236, + "grad_norm": 0.3376328945159912, + "learning_rate": 7.96314096747977e-06, + "loss": 0.3395, + "step": 4393 + }, + { + "epoch": 1.1051307847082494, + "grad_norm": 0.33500152826309204, + "learning_rate": 7.961962229548433e-06, + "loss": 0.3576, + "step": 4394 + }, + { + "epoch": 1.1053822937625755, + "grad_norm": 0.3390989601612091, + "learning_rate": 7.960783237943561e-06, + "loss": 0.3521, + "step": 4395 + }, + { + "epoch": 1.1056338028169015, + "grad_norm": 0.3361843228340149, + "learning_rate": 7.959603992766127e-06, + "loss": 0.3493, + "step": 4396 + }, + { + "epoch": 1.1058853118712273, + "grad_norm": 0.3171842098236084, + "learning_rate": 7.958424494117128e-06, + "loss": 0.3438, + "step": 4397 + }, + { + "epoch": 1.1061368209255533, + "grad_norm": 0.2926395535469055, + "learning_rate": 7.957244742097579e-06, + "loss": 0.343, + "step": 4398 + }, + { + "epoch": 1.1063883299798793, + "grad_norm": 0.34122782945632935, + "learning_rate": 7.956064736808516e-06, + "loss": 0.345, + "step": 4399 + }, + { + "epoch": 1.1066398390342052, + "grad_norm": 0.3118756115436554, + "learning_rate": 7.954884478351003e-06, + "loss": 0.338, + "step": 4400 + }, + { + "epoch": 1.1068913480885312, + "grad_norm": 0.3336293399333954, + "learning_rate": 7.953703966826118e-06, + "loss": 0.3618, + "step": 4401 + }, + { + "epoch": 1.1071428571428572, + "grad_norm": 0.3180750906467438, + "learning_rate": 7.95252320233497e-06, + "loss": 0.3661, + "step": 4402 + }, + { + "epoch": 1.107394366197183, + "grad_norm": 0.3299802541732788, + "learning_rate": 7.951342184978678e-06, + "loss": 0.3472, + "step": 4403 + }, + { + "epoch": 1.107645875251509, + "grad_norm": 0.34347137808799744, + "learning_rate": 7.950160914858392e-06, + "loss": 0.3272, + "step": 4404 + }, + { + "epoch": 1.107897384305835, + "grad_norm": 0.36151158809661865, + "learning_rate": 7.94897939207528e-06, + "loss": 0.3307, + "step": 4405 + }, + { + "epoch": 1.1081488933601609, + "grad_norm": 0.323889821767807, + "learning_rate": 7.947797616730532e-06, + "loss": 0.3571, + "step": 4406 + }, + { + "epoch": 1.108400402414487, + "grad_norm": 0.33470696210861206, + "learning_rate": 7.94661558892536e-06, + "loss": 0.354, + "step": 4407 + }, + { + "epoch": 1.108651911468813, + "grad_norm": 0.33871525526046753, + "learning_rate": 7.945433308760998e-06, + "loss": 0.3722, + "step": 4408 + }, + { + "epoch": 1.1089034205231387, + "grad_norm": 0.3187379837036133, + "learning_rate": 7.944250776338696e-06, + "loss": 0.3618, + "step": 4409 + }, + { + "epoch": 1.1091549295774648, + "grad_norm": 0.3083096444606781, + "learning_rate": 7.943067991759736e-06, + "loss": 0.3508, + "step": 4410 + }, + { + "epoch": 1.1094064386317908, + "grad_norm": 0.33438965678215027, + "learning_rate": 7.941884955125416e-06, + "loss": 0.3632, + "step": 4411 + }, + { + "epoch": 1.1096579476861166, + "grad_norm": 0.35571396350860596, + "learning_rate": 7.940701666537051e-06, + "loss": 0.3458, + "step": 4412 + }, + { + "epoch": 1.1099094567404426, + "grad_norm": 0.3360385298728943, + "learning_rate": 7.939518126095986e-06, + "loss": 0.3572, + "step": 4413 + }, + { + "epoch": 1.1101609657947686, + "grad_norm": 0.3844273090362549, + "learning_rate": 7.938334333903584e-06, + "loss": 0.3569, + "step": 4414 + }, + { + "epoch": 1.1104124748490947, + "grad_norm": 0.3142733573913574, + "learning_rate": 7.937150290061228e-06, + "loss": 0.3575, + "step": 4415 + }, + { + "epoch": 1.1106639839034205, + "grad_norm": 0.32374638319015503, + "learning_rate": 7.935965994670325e-06, + "loss": 0.3405, + "step": 4416 + }, + { + "epoch": 1.1109154929577465, + "grad_norm": 0.33620190620422363, + "learning_rate": 7.9347814478323e-06, + "loss": 0.3691, + "step": 4417 + }, + { + "epoch": 1.1111670020120725, + "grad_norm": 0.3400149345397949, + "learning_rate": 7.933596649648606e-06, + "loss": 0.3382, + "step": 4418 + }, + { + "epoch": 1.1114185110663983, + "grad_norm": 0.34986597299575806, + "learning_rate": 7.932411600220712e-06, + "loss": 0.3667, + "step": 4419 + }, + { + "epoch": 1.1116700201207244, + "grad_norm": 0.34316298365592957, + "learning_rate": 7.931226299650108e-06, + "loss": 0.3592, + "step": 4420 + }, + { + "epoch": 1.1119215291750504, + "grad_norm": 0.362543523311615, + "learning_rate": 7.930040748038309e-06, + "loss": 0.3544, + "step": 4421 + }, + { + "epoch": 1.1121730382293762, + "grad_norm": 0.35389113426208496, + "learning_rate": 7.92885494548685e-06, + "loss": 0.3427, + "step": 4422 + }, + { + "epoch": 1.1124245472837022, + "grad_norm": 0.32142454385757446, + "learning_rate": 7.927668892097288e-06, + "loss": 0.354, + "step": 4423 + }, + { + "epoch": 1.1126760563380282, + "grad_norm": 0.3471391499042511, + "learning_rate": 7.926482587971202e-06, + "loss": 0.3538, + "step": 4424 + }, + { + "epoch": 1.112927565392354, + "grad_norm": 0.33723774552345276, + "learning_rate": 7.925296033210191e-06, + "loss": 0.3634, + "step": 4425 + }, + { + "epoch": 1.11317907444668, + "grad_norm": 0.3441479206085205, + "learning_rate": 7.924109227915872e-06, + "loss": 0.3669, + "step": 4426 + }, + { + "epoch": 1.113430583501006, + "grad_norm": 0.3241642713546753, + "learning_rate": 7.922922172189892e-06, + "loss": 0.3629, + "step": 4427 + }, + { + "epoch": 1.113682092555332, + "grad_norm": 0.35414087772369385, + "learning_rate": 7.921734866133917e-06, + "loss": 0.3546, + "step": 4428 + }, + { + "epoch": 1.113933601609658, + "grad_norm": 0.3270585834980011, + "learning_rate": 7.920547309849626e-06, + "loss": 0.3449, + "step": 4429 + }, + { + "epoch": 1.114185110663984, + "grad_norm": 0.3540771007537842, + "learning_rate": 7.91935950343873e-06, + "loss": 0.3508, + "step": 4430 + }, + { + "epoch": 1.1144366197183098, + "grad_norm": 0.3404322862625122, + "learning_rate": 7.918171447002955e-06, + "loss": 0.3358, + "step": 4431 + }, + { + "epoch": 1.1146881287726358, + "grad_norm": 0.335871160030365, + "learning_rate": 7.916983140644052e-06, + "loss": 0.345, + "step": 4432 + }, + { + "epoch": 1.1149396378269618, + "grad_norm": 0.351862370967865, + "learning_rate": 7.915794584463792e-06, + "loss": 0.3417, + "step": 4433 + }, + { + "epoch": 1.1151911468812878, + "grad_norm": 0.3961928188800812, + "learning_rate": 7.914605778563965e-06, + "loss": 0.3346, + "step": 4434 + }, + { + "epoch": 1.1154426559356136, + "grad_norm": 0.33614158630371094, + "learning_rate": 7.913416723046387e-06, + "loss": 0.3679, + "step": 4435 + }, + { + "epoch": 1.1156941649899397, + "grad_norm": 0.3272840082645416, + "learning_rate": 7.912227418012895e-06, + "loss": 0.3413, + "step": 4436 + }, + { + "epoch": 1.1159456740442657, + "grad_norm": 0.3329838812351227, + "learning_rate": 7.911037863565344e-06, + "loss": 0.3587, + "step": 4437 + }, + { + "epoch": 1.1161971830985915, + "grad_norm": 0.41790270805358887, + "learning_rate": 7.90984805980561e-06, + "loss": 0.3805, + "step": 4438 + }, + { + "epoch": 1.1164486921529175, + "grad_norm": 0.3353562355041504, + "learning_rate": 7.908658006835593e-06, + "loss": 0.3301, + "step": 4439 + }, + { + "epoch": 1.1167002012072436, + "grad_norm": 0.3343071937561035, + "learning_rate": 7.907467704757214e-06, + "loss": 0.3614, + "step": 4440 + }, + { + "epoch": 1.1169517102615694, + "grad_norm": 0.37369048595428467, + "learning_rate": 7.906277153672417e-06, + "loss": 0.3467, + "step": 4441 + }, + { + "epoch": 1.1172032193158954, + "grad_norm": 0.3621070981025696, + "learning_rate": 7.905086353683162e-06, + "loss": 0.3337, + "step": 4442 + }, + { + "epoch": 1.1174547283702214, + "grad_norm": 0.33832234144210815, + "learning_rate": 7.903895304891436e-06, + "loss": 0.3742, + "step": 4443 + }, + { + "epoch": 1.1177062374245472, + "grad_norm": 0.3197494447231293, + "learning_rate": 7.902704007399243e-06, + "loss": 0.3868, + "step": 4444 + }, + { + "epoch": 1.1179577464788732, + "grad_norm": 0.3825930058956146, + "learning_rate": 7.901512461308612e-06, + "loss": 0.3603, + "step": 4445 + }, + { + "epoch": 1.1182092555331993, + "grad_norm": 0.3661872446537018, + "learning_rate": 7.90032066672159e-06, + "loss": 0.3633, + "step": 4446 + }, + { + "epoch": 1.118460764587525, + "grad_norm": 0.31290680170059204, + "learning_rate": 7.899128623740246e-06, + "loss": 0.3503, + "step": 4447 + }, + { + "epoch": 1.118712273641851, + "grad_norm": 0.35227593779563904, + "learning_rate": 7.897936332466674e-06, + "loss": 0.3781, + "step": 4448 + }, + { + "epoch": 1.1189637826961771, + "grad_norm": 0.3571607768535614, + "learning_rate": 7.896743793002983e-06, + "loss": 0.355, + "step": 4449 + }, + { + "epoch": 1.119215291750503, + "grad_norm": 0.35419657826423645, + "learning_rate": 7.89555100545131e-06, + "loss": 0.3337, + "step": 4450 + }, + { + "epoch": 1.119466800804829, + "grad_norm": 0.31918078660964966, + "learning_rate": 7.894357969913807e-06, + "loss": 0.3707, + "step": 4451 + }, + { + "epoch": 1.119718309859155, + "grad_norm": 0.33282384276390076, + "learning_rate": 7.893164686492652e-06, + "loss": 0.3861, + "step": 4452 + }, + { + "epoch": 1.1199698189134808, + "grad_norm": 0.3873891234397888, + "learning_rate": 7.891971155290039e-06, + "loss": 0.3788, + "step": 4453 + }, + { + "epoch": 1.1202213279678068, + "grad_norm": 0.3210555613040924, + "learning_rate": 7.89077737640819e-06, + "loss": 0.3426, + "step": 4454 + }, + { + "epoch": 1.1204728370221329, + "grad_norm": 0.33169567584991455, + "learning_rate": 7.889583349949341e-06, + "loss": 0.3279, + "step": 4455 + }, + { + "epoch": 1.1207243460764587, + "grad_norm": 0.36757755279541016, + "learning_rate": 7.88838907601576e-06, + "loss": 0.3631, + "step": 4456 + }, + { + "epoch": 1.1209758551307847, + "grad_norm": 0.32751592993736267, + "learning_rate": 7.88719455470972e-06, + "loss": 0.3331, + "step": 4457 + }, + { + "epoch": 1.1212273641851107, + "grad_norm": 0.3210444152355194, + "learning_rate": 7.88599978613353e-06, + "loss": 0.3527, + "step": 4458 + }, + { + "epoch": 1.1214788732394365, + "grad_norm": 0.37147364020347595, + "learning_rate": 7.884804770389514e-06, + "loss": 0.3601, + "step": 4459 + }, + { + "epoch": 1.1217303822937625, + "grad_norm": 0.33347150683403015, + "learning_rate": 7.883609507580016e-06, + "loss": 0.336, + "step": 4460 + }, + { + "epoch": 1.1219818913480886, + "grad_norm": 0.38581255078315735, + "learning_rate": 7.882413997807404e-06, + "loss": 0.3488, + "step": 4461 + }, + { + "epoch": 1.1222334004024144, + "grad_norm": 0.3418939411640167, + "learning_rate": 7.881218241174064e-06, + "loss": 0.3565, + "step": 4462 + }, + { + "epoch": 1.1224849094567404, + "grad_norm": 0.3888770043849945, + "learning_rate": 7.880022237782407e-06, + "loss": 0.3497, + "step": 4463 + }, + { + "epoch": 1.1227364185110664, + "grad_norm": 0.3226841688156128, + "learning_rate": 7.878825987734864e-06, + "loss": 0.3558, + "step": 4464 + }, + { + "epoch": 1.1229879275653925, + "grad_norm": 0.3461357355117798, + "learning_rate": 7.877629491133884e-06, + "loss": 0.3294, + "step": 4465 + }, + { + "epoch": 1.1232394366197183, + "grad_norm": 0.3348862826824188, + "learning_rate": 7.876432748081939e-06, + "loss": 0.3626, + "step": 4466 + }, + { + "epoch": 1.1234909456740443, + "grad_norm": 0.3597047030925751, + "learning_rate": 7.875235758681527e-06, + "loss": 0.3706, + "step": 4467 + }, + { + "epoch": 1.1237424547283703, + "grad_norm": 0.3321612477302551, + "learning_rate": 7.874038523035157e-06, + "loss": 0.3608, + "step": 4468 + }, + { + "epoch": 1.1239939637826961, + "grad_norm": 0.3340033292770386, + "learning_rate": 7.872841041245369e-06, + "loss": 0.3429, + "step": 4469 + }, + { + "epoch": 1.1242454728370221, + "grad_norm": 0.34898126125335693, + "learning_rate": 7.871643313414718e-06, + "loss": 0.3439, + "step": 4470 + }, + { + "epoch": 1.1244969818913482, + "grad_norm": 0.33899804949760437, + "learning_rate": 7.870445339645783e-06, + "loss": 0.3356, + "step": 4471 + }, + { + "epoch": 1.124748490945674, + "grad_norm": 0.30220794677734375, + "learning_rate": 7.869247120041161e-06, + "loss": 0.3427, + "step": 4472 + }, + { + "epoch": 1.125, + "grad_norm": 0.3090187907218933, + "learning_rate": 7.868048654703474e-06, + "loss": 0.344, + "step": 4473 + }, + { + "epoch": 1.125251509054326, + "grad_norm": 0.33787021040916443, + "learning_rate": 7.866849943735361e-06, + "loss": 0.3362, + "step": 4474 + }, + { + "epoch": 1.1255030181086518, + "grad_norm": 0.3474128842353821, + "learning_rate": 7.865650987239485e-06, + "loss": 0.3474, + "step": 4475 + }, + { + "epoch": 1.1257545271629779, + "grad_norm": 0.34391891956329346, + "learning_rate": 7.864451785318532e-06, + "loss": 0.3917, + "step": 4476 + }, + { + "epoch": 1.1260060362173039, + "grad_norm": 0.34758976101875305, + "learning_rate": 7.863252338075202e-06, + "loss": 0.3533, + "step": 4477 + }, + { + "epoch": 1.1262575452716297, + "grad_norm": 0.32106417417526245, + "learning_rate": 7.862052645612222e-06, + "loss": 0.3453, + "step": 4478 + }, + { + "epoch": 1.1265090543259557, + "grad_norm": 0.337090402841568, + "learning_rate": 7.860852708032337e-06, + "loss": 0.3541, + "step": 4479 + }, + { + "epoch": 1.1267605633802817, + "grad_norm": 0.3655794560909271, + "learning_rate": 7.859652525438314e-06, + "loss": 0.3466, + "step": 4480 + }, + { + "epoch": 1.1270120724346078, + "grad_norm": 0.32245180010795593, + "learning_rate": 7.858452097932945e-06, + "loss": 0.3464, + "step": 4481 + }, + { + "epoch": 1.1272635814889336, + "grad_norm": 0.32112833857536316, + "learning_rate": 7.857251425619034e-06, + "loss": 0.3705, + "step": 4482 + }, + { + "epoch": 1.1275150905432596, + "grad_norm": 0.3438814580440521, + "learning_rate": 7.856050508599413e-06, + "loss": 0.337, + "step": 4483 + }, + { + "epoch": 1.1277665995975856, + "grad_norm": 0.33978280425071716, + "learning_rate": 7.854849346976935e-06, + "loss": 0.3429, + "step": 4484 + }, + { + "epoch": 1.1280181086519114, + "grad_norm": 0.3462032079696655, + "learning_rate": 7.85364794085447e-06, + "loss": 0.3581, + "step": 4485 + }, + { + "epoch": 1.1282696177062375, + "grad_norm": 0.3374899923801422, + "learning_rate": 7.85244629033491e-06, + "loss": 0.3614, + "step": 4486 + }, + { + "epoch": 1.1285211267605635, + "grad_norm": 0.3162481486797333, + "learning_rate": 7.851244395521171e-06, + "loss": 0.3457, + "step": 4487 + }, + { + "epoch": 1.1287726358148893, + "grad_norm": 0.3583298921585083, + "learning_rate": 7.850042256516187e-06, + "loss": 0.3418, + "step": 4488 + }, + { + "epoch": 1.1290241448692153, + "grad_norm": 0.33638957142829895, + "learning_rate": 7.848839873422913e-06, + "loss": 0.3523, + "step": 4489 + }, + { + "epoch": 1.1292756539235413, + "grad_norm": 0.3432900905609131, + "learning_rate": 7.847637246344326e-06, + "loss": 0.3641, + "step": 4490 + }, + { + "epoch": 1.1295271629778671, + "grad_norm": 0.3538980185985565, + "learning_rate": 7.846434375383425e-06, + "loss": 0.347, + "step": 4491 + }, + { + "epoch": 1.1297786720321932, + "grad_norm": 0.3335748016834259, + "learning_rate": 7.845231260643226e-06, + "loss": 0.3426, + "step": 4492 + }, + { + "epoch": 1.1300301810865192, + "grad_norm": 0.3384160101413727, + "learning_rate": 7.84402790222677e-06, + "loss": 0.3483, + "step": 4493 + }, + { + "epoch": 1.130281690140845, + "grad_norm": 0.3617180585861206, + "learning_rate": 7.842824300237114e-06, + "loss": 0.3726, + "step": 4494 + }, + { + "epoch": 1.130533199195171, + "grad_norm": 0.3085126280784607, + "learning_rate": 7.841620454777344e-06, + "loss": 0.3614, + "step": 4495 + }, + { + "epoch": 1.130784708249497, + "grad_norm": 0.3673594892024994, + "learning_rate": 7.840416365950558e-06, + "loss": 0.3277, + "step": 4496 + }, + { + "epoch": 1.1310362173038229, + "grad_norm": 0.3403297960758209, + "learning_rate": 7.839212033859882e-06, + "loss": 0.36, + "step": 4497 + }, + { + "epoch": 1.131287726358149, + "grad_norm": 0.3315725028514862, + "learning_rate": 7.838007458608455e-06, + "loss": 0.3523, + "step": 4498 + }, + { + "epoch": 1.131539235412475, + "grad_norm": 0.3540287911891937, + "learning_rate": 7.836802640299442e-06, + "loss": 0.3584, + "step": 4499 + }, + { + "epoch": 1.1317907444668007, + "grad_norm": 0.32690343260765076, + "learning_rate": 7.835597579036031e-06, + "loss": 0.3393, + "step": 4500 + }, + { + "epoch": 1.1320422535211268, + "grad_norm": 0.37675777077674866, + "learning_rate": 7.83439227492143e-06, + "loss": 0.3682, + "step": 4501 + }, + { + "epoch": 1.1322937625754528, + "grad_norm": 0.35298097133636475, + "learning_rate": 7.833186728058859e-06, + "loss": 0.3618, + "step": 4502 + }, + { + "epoch": 1.1325452716297786, + "grad_norm": 0.33117878437042236, + "learning_rate": 7.831980938551572e-06, + "loss": 0.3414, + "step": 4503 + }, + { + "epoch": 1.1327967806841046, + "grad_norm": 0.32076895236968994, + "learning_rate": 7.83077490650283e-06, + "loss": 0.3558, + "step": 4504 + }, + { + "epoch": 1.1330482897384306, + "grad_norm": 0.3626091182231903, + "learning_rate": 7.829568632015932e-06, + "loss": 0.339, + "step": 4505 + }, + { + "epoch": 1.1332997987927564, + "grad_norm": 0.35690760612487793, + "learning_rate": 7.828362115194179e-06, + "loss": 0.349, + "step": 4506 + }, + { + "epoch": 1.1335513078470825, + "grad_norm": 0.32652193307876587, + "learning_rate": 7.827155356140905e-06, + "loss": 0.3566, + "step": 4507 + }, + { + "epoch": 1.1338028169014085, + "grad_norm": 0.3355286419391632, + "learning_rate": 7.825948354959464e-06, + "loss": 0.3516, + "step": 4508 + }, + { + "epoch": 1.1340543259557343, + "grad_norm": 0.35634520649909973, + "learning_rate": 7.824741111753223e-06, + "loss": 0.3398, + "step": 4509 + }, + { + "epoch": 1.1343058350100603, + "grad_norm": 0.35237765312194824, + "learning_rate": 7.823533626625577e-06, + "loss": 0.3559, + "step": 4510 + }, + { + "epoch": 1.1345573440643864, + "grad_norm": 0.3162923753261566, + "learning_rate": 7.822325899679941e-06, + "loss": 0.3601, + "step": 4511 + }, + { + "epoch": 1.1348088531187122, + "grad_norm": 0.33308616280555725, + "learning_rate": 7.821117931019749e-06, + "loss": 0.346, + "step": 4512 + }, + { + "epoch": 1.1350603621730382, + "grad_norm": 0.3171202540397644, + "learning_rate": 7.819909720748454e-06, + "loss": 0.3534, + "step": 4513 + }, + { + "epoch": 1.1353118712273642, + "grad_norm": 0.35061752796173096, + "learning_rate": 7.818701268969532e-06, + "loss": 0.3766, + "step": 4514 + }, + { + "epoch": 1.1355633802816902, + "grad_norm": 0.3866909444332123, + "learning_rate": 7.817492575786481e-06, + "loss": 0.3422, + "step": 4515 + }, + { + "epoch": 1.135814889336016, + "grad_norm": 0.3715068995952606, + "learning_rate": 7.816283641302815e-06, + "loss": 0.3729, + "step": 4516 + }, + { + "epoch": 1.136066398390342, + "grad_norm": 0.3531903922557831, + "learning_rate": 7.815074465622076e-06, + "loss": 0.3698, + "step": 4517 + }, + { + "epoch": 1.136317907444668, + "grad_norm": 0.4138471186161041, + "learning_rate": 7.81386504884782e-06, + "loss": 0.341, + "step": 4518 + }, + { + "epoch": 1.136569416498994, + "grad_norm": 0.350268691778183, + "learning_rate": 7.812655391083625e-06, + "loss": 0.3162, + "step": 4519 + }, + { + "epoch": 1.13682092555332, + "grad_norm": 0.36769789457321167, + "learning_rate": 7.811445492433091e-06, + "loss": 0.3529, + "step": 4520 + }, + { + "epoch": 1.137072434607646, + "grad_norm": 0.3521837890148163, + "learning_rate": 7.810235352999842e-06, + "loss": 0.3505, + "step": 4521 + }, + { + "epoch": 1.1373239436619718, + "grad_norm": 0.3225024342536926, + "learning_rate": 7.809024972887513e-06, + "loss": 0.3648, + "step": 4522 + }, + { + "epoch": 1.1375754527162978, + "grad_norm": 0.34223929047584534, + "learning_rate": 7.807814352199769e-06, + "loss": 0.3557, + "step": 4523 + }, + { + "epoch": 1.1378269617706238, + "grad_norm": 0.33616122603416443, + "learning_rate": 7.806603491040293e-06, + "loss": 0.348, + "step": 4524 + }, + { + "epoch": 1.1380784708249496, + "grad_norm": 0.36442843079566956, + "learning_rate": 7.805392389512785e-06, + "loss": 0.3607, + "step": 4525 + }, + { + "epoch": 1.1383299798792756, + "grad_norm": 0.3255125880241394, + "learning_rate": 7.80418104772097e-06, + "loss": 0.3719, + "step": 4526 + }, + { + "epoch": 1.1385814889336017, + "grad_norm": 0.31949490308761597, + "learning_rate": 7.802969465768588e-06, + "loss": 0.3558, + "step": 4527 + }, + { + "epoch": 1.1388329979879275, + "grad_norm": 0.32983100414276123, + "learning_rate": 7.801757643759408e-06, + "loss": 0.363, + "step": 4528 + }, + { + "epoch": 1.1390845070422535, + "grad_norm": 0.3546229898929596, + "learning_rate": 7.800545581797217e-06, + "loss": 0.3636, + "step": 4529 + }, + { + "epoch": 1.1393360160965795, + "grad_norm": 0.40820834040641785, + "learning_rate": 7.799333279985813e-06, + "loss": 0.3654, + "step": 4530 + }, + { + "epoch": 1.1395875251509056, + "grad_norm": 0.35177260637283325, + "learning_rate": 7.798120738429028e-06, + "loss": 0.357, + "step": 4531 + }, + { + "epoch": 1.1398390342052314, + "grad_norm": 0.3217819929122925, + "learning_rate": 7.796907957230706e-06, + "loss": 0.3425, + "step": 4532 + }, + { + "epoch": 1.1400905432595574, + "grad_norm": 0.4682590961456299, + "learning_rate": 7.795694936494715e-06, + "loss": 0.3726, + "step": 4533 + }, + { + "epoch": 1.1403420523138834, + "grad_norm": 0.4092445373535156, + "learning_rate": 7.794481676324944e-06, + "loss": 0.3628, + "step": 4534 + }, + { + "epoch": 1.1405935613682092, + "grad_norm": 0.31070423126220703, + "learning_rate": 7.793268176825297e-06, + "loss": 0.3421, + "step": 4535 + }, + { + "epoch": 1.1408450704225352, + "grad_norm": 0.34520095586776733, + "learning_rate": 7.792054438099706e-06, + "loss": 0.36, + "step": 4536 + }, + { + "epoch": 1.1410965794768613, + "grad_norm": 0.3309139609336853, + "learning_rate": 7.790840460252121e-06, + "loss": 0.3451, + "step": 4537 + }, + { + "epoch": 1.141348088531187, + "grad_norm": 0.36794281005859375, + "learning_rate": 7.789626243386508e-06, + "loss": 0.3752, + "step": 4538 + }, + { + "epoch": 1.141599597585513, + "grad_norm": 0.3274729251861572, + "learning_rate": 7.78841178760686e-06, + "loss": 0.3556, + "step": 4539 + }, + { + "epoch": 1.1418511066398391, + "grad_norm": 0.32525065541267395, + "learning_rate": 7.787197093017186e-06, + "loss": 0.3666, + "step": 4540 + }, + { + "epoch": 1.142102615694165, + "grad_norm": 0.3399212062358856, + "learning_rate": 7.78598215972152e-06, + "loss": 0.3457, + "step": 4541 + }, + { + "epoch": 1.142354124748491, + "grad_norm": 0.3338754177093506, + "learning_rate": 7.784766987823908e-06, + "loss": 0.3819, + "step": 4542 + }, + { + "epoch": 1.142605633802817, + "grad_norm": 0.3345630168914795, + "learning_rate": 7.783551577428427e-06, + "loss": 0.3374, + "step": 4543 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 0.3454863727092743, + "learning_rate": 7.782335928639167e-06, + "loss": 0.3487, + "step": 4544 + }, + { + "epoch": 1.1431086519114688, + "grad_norm": 0.34839943051338196, + "learning_rate": 7.781120041560241e-06, + "loss": 0.3635, + "step": 4545 + }, + { + "epoch": 1.1433601609657948, + "grad_norm": 0.3082118034362793, + "learning_rate": 7.779903916295781e-06, + "loss": 0.3468, + "step": 4546 + }, + { + "epoch": 1.1436116700201207, + "grad_norm": 0.32561156153678894, + "learning_rate": 7.778687552949944e-06, + "loss": 0.3237, + "step": 4547 + }, + { + "epoch": 1.1438631790744467, + "grad_norm": 0.3549325466156006, + "learning_rate": 7.777470951626899e-06, + "loss": 0.3679, + "step": 4548 + }, + { + "epoch": 1.1441146881287727, + "grad_norm": 0.33792486786842346, + "learning_rate": 7.776254112430843e-06, + "loss": 0.3508, + "step": 4549 + }, + { + "epoch": 1.1443661971830985, + "grad_norm": 0.3282465934753418, + "learning_rate": 7.775037035465992e-06, + "loss": 0.3501, + "step": 4550 + }, + { + "epoch": 1.1446177062374245, + "grad_norm": 0.3474048972129822, + "learning_rate": 7.773819720836577e-06, + "loss": 0.3573, + "step": 4551 + }, + { + "epoch": 1.1448692152917506, + "grad_norm": 0.36000293493270874, + "learning_rate": 7.77260216864686e-06, + "loss": 0.3291, + "step": 4552 + }, + { + "epoch": 1.1451207243460764, + "grad_norm": 0.3380999267101288, + "learning_rate": 7.771384379001107e-06, + "loss": 0.3461, + "step": 4553 + }, + { + "epoch": 1.1453722334004024, + "grad_norm": 0.3201408386230469, + "learning_rate": 7.770166352003622e-06, + "loss": 0.3633, + "step": 4554 + }, + { + "epoch": 1.1456237424547284, + "grad_norm": 0.3245110809803009, + "learning_rate": 7.768948087758721e-06, + "loss": 0.3486, + "step": 4555 + }, + { + "epoch": 1.1458752515090542, + "grad_norm": 0.33637115359306335, + "learning_rate": 7.767729586370738e-06, + "loss": 0.3535, + "step": 4556 + }, + { + "epoch": 1.1461267605633803, + "grad_norm": 0.33941903710365295, + "learning_rate": 7.76651084794403e-06, + "loss": 0.3553, + "step": 4557 + }, + { + "epoch": 1.1463782696177063, + "grad_norm": 0.3234095275402069, + "learning_rate": 7.765291872582975e-06, + "loss": 0.3674, + "step": 4558 + }, + { + "epoch": 1.146629778672032, + "grad_norm": 0.34868893027305603, + "learning_rate": 7.76407266039197e-06, + "loss": 0.349, + "step": 4559 + }, + { + "epoch": 1.1468812877263581, + "grad_norm": 0.3286401927471161, + "learning_rate": 7.762853211475436e-06, + "loss": 0.3362, + "step": 4560 + }, + { + "epoch": 1.1471327967806841, + "grad_norm": 0.3449994623661041, + "learning_rate": 7.761633525937808e-06, + "loss": 0.3408, + "step": 4561 + }, + { + "epoch": 1.14738430583501, + "grad_norm": 0.42614054679870605, + "learning_rate": 7.760413603883546e-06, + "loss": 0.3397, + "step": 4562 + }, + { + "epoch": 1.147635814889336, + "grad_norm": 0.34790942072868347, + "learning_rate": 7.759193445417126e-06, + "loss": 0.3705, + "step": 4563 + }, + { + "epoch": 1.147887323943662, + "grad_norm": 0.33942970633506775, + "learning_rate": 7.757973050643048e-06, + "loss": 0.3671, + "step": 4564 + }, + { + "epoch": 1.148138832997988, + "grad_norm": 0.34391435980796814, + "learning_rate": 7.756752419665833e-06, + "loss": 0.3341, + "step": 4565 + }, + { + "epoch": 1.1483903420523138, + "grad_norm": 0.3598215579986572, + "learning_rate": 7.755531552590021e-06, + "loss": 0.347, + "step": 4566 + }, + { + "epoch": 1.1486418511066399, + "grad_norm": 0.31856873631477356, + "learning_rate": 7.754310449520169e-06, + "loss": 0.3475, + "step": 4567 + }, + { + "epoch": 1.1488933601609659, + "grad_norm": 0.3364936411380768, + "learning_rate": 7.753089110560858e-06, + "loss": 0.3521, + "step": 4568 + }, + { + "epoch": 1.1491448692152917, + "grad_norm": 0.3098117709159851, + "learning_rate": 7.751867535816689e-06, + "loss": 0.3435, + "step": 4569 + }, + { + "epoch": 1.1493963782696177, + "grad_norm": 0.33782443404197693, + "learning_rate": 7.750645725392278e-06, + "loss": 0.3309, + "step": 4570 + }, + { + "epoch": 1.1496478873239437, + "grad_norm": 0.3181942403316498, + "learning_rate": 7.749423679392271e-06, + "loss": 0.3532, + "step": 4571 + }, + { + "epoch": 1.1498993963782695, + "grad_norm": 0.32375866174697876, + "learning_rate": 7.748201397921326e-06, + "loss": 0.3546, + "step": 4572 + }, + { + "epoch": 1.1501509054325956, + "grad_norm": 0.32787060737609863, + "learning_rate": 7.746978881084124e-06, + "loss": 0.3732, + "step": 4573 + }, + { + "epoch": 1.1504024144869216, + "grad_norm": 0.34005919098854065, + "learning_rate": 7.745756128985367e-06, + "loss": 0.3587, + "step": 4574 + }, + { + "epoch": 1.1506539235412474, + "grad_norm": 0.3388522267341614, + "learning_rate": 7.744533141729773e-06, + "loss": 0.3393, + "step": 4575 + }, + { + "epoch": 1.1509054325955734, + "grad_norm": 0.35405799746513367, + "learning_rate": 7.743309919422086e-06, + "loss": 0.3485, + "step": 4576 + }, + { + "epoch": 1.1511569416498995, + "grad_norm": 0.3198212683200836, + "learning_rate": 7.742086462167066e-06, + "loss": 0.3477, + "step": 4577 + }, + { + "epoch": 1.1514084507042253, + "grad_norm": 0.3119359612464905, + "learning_rate": 7.740862770069494e-06, + "loss": 0.3464, + "step": 4578 + }, + { + "epoch": 1.1516599597585513, + "grad_norm": 0.32057052850723267, + "learning_rate": 7.739638843234176e-06, + "loss": 0.3406, + "step": 4579 + }, + { + "epoch": 1.1519114688128773, + "grad_norm": 0.37748971581459045, + "learning_rate": 7.738414681765928e-06, + "loss": 0.3845, + "step": 4580 + }, + { + "epoch": 1.1521629778672033, + "grad_norm": 0.35111579298973083, + "learning_rate": 7.737190285769594e-06, + "loss": 0.3628, + "step": 4581 + }, + { + "epoch": 1.1524144869215291, + "grad_norm": 0.31299889087677, + "learning_rate": 7.735965655350034e-06, + "loss": 0.3171, + "step": 4582 + }, + { + "epoch": 1.1526659959758552, + "grad_norm": 0.3520580232143402, + "learning_rate": 7.734740790612137e-06, + "loss": 0.3537, + "step": 4583 + }, + { + "epoch": 1.1529175050301812, + "grad_norm": 0.3616025447845459, + "learning_rate": 7.733515691660795e-06, + "loss": 0.3483, + "step": 4584 + }, + { + "epoch": 1.153169014084507, + "grad_norm": 0.3729570508003235, + "learning_rate": 7.732290358600936e-06, + "loss": 0.3418, + "step": 4585 + }, + { + "epoch": 1.153420523138833, + "grad_norm": 0.3173588812351227, + "learning_rate": 7.731064791537501e-06, + "loss": 0.3572, + "step": 4586 + }, + { + "epoch": 1.153672032193159, + "grad_norm": 0.3116159439086914, + "learning_rate": 7.72983899057545e-06, + "loss": 0.3276, + "step": 4587 + }, + { + "epoch": 1.1539235412474849, + "grad_norm": 0.33578798174858093, + "learning_rate": 7.728612955819773e-06, + "loss": 0.3469, + "step": 4588 + }, + { + "epoch": 1.154175050301811, + "grad_norm": 0.35310569405555725, + "learning_rate": 7.727386687375461e-06, + "loss": 0.3701, + "step": 4589 + }, + { + "epoch": 1.154426559356137, + "grad_norm": 0.37188154458999634, + "learning_rate": 7.726160185347544e-06, + "loss": 0.4008, + "step": 4590 + }, + { + "epoch": 1.1546780684104627, + "grad_norm": 0.3462255597114563, + "learning_rate": 7.724933449841061e-06, + "loss": 0.34, + "step": 4591 + }, + { + "epoch": 1.1549295774647887, + "grad_norm": 0.31674739718437195, + "learning_rate": 7.723706480961078e-06, + "loss": 0.3469, + "step": 4592 + }, + { + "epoch": 1.1551810865191148, + "grad_norm": 0.3254792392253876, + "learning_rate": 7.722479278812672e-06, + "loss": 0.349, + "step": 4593 + }, + { + "epoch": 1.1554325955734406, + "grad_norm": 0.36106935143470764, + "learning_rate": 7.721251843500948e-06, + "loss": 0.347, + "step": 4594 + }, + { + "epoch": 1.1556841046277666, + "grad_norm": 0.3273780345916748, + "learning_rate": 7.720024175131027e-06, + "loss": 0.3775, + "step": 4595 + }, + { + "epoch": 1.1559356136820926, + "grad_norm": 0.31497684121131897, + "learning_rate": 7.71879627380805e-06, + "loss": 0.3642, + "step": 4596 + }, + { + "epoch": 1.1561871227364184, + "grad_norm": 0.360487163066864, + "learning_rate": 7.717568139637184e-06, + "loss": 0.3624, + "step": 4597 + }, + { + "epoch": 1.1564386317907445, + "grad_norm": 0.3374006748199463, + "learning_rate": 7.716339772723608e-06, + "loss": 0.3559, + "step": 4598 + }, + { + "epoch": 1.1566901408450705, + "grad_norm": 0.3451824486255646, + "learning_rate": 7.715111173172522e-06, + "loss": 0.3666, + "step": 4599 + }, + { + "epoch": 1.1569416498993963, + "grad_norm": 0.3282189667224884, + "learning_rate": 7.713882341089151e-06, + "loss": 0.3532, + "step": 4600 + }, + { + "epoch": 1.1571931589537223, + "grad_norm": 0.34830883145332336, + "learning_rate": 7.712653276578734e-06, + "loss": 0.3937, + "step": 4601 + }, + { + "epoch": 1.1574446680080483, + "grad_norm": 0.32040953636169434, + "learning_rate": 7.711423979746537e-06, + "loss": 0.3401, + "step": 4602 + }, + { + "epoch": 1.1576961770623742, + "grad_norm": 0.3415544927120209, + "learning_rate": 7.710194450697837e-06, + "loss": 0.3673, + "step": 4603 + }, + { + "epoch": 1.1579476861167002, + "grad_norm": 0.35662606358528137, + "learning_rate": 7.708964689537937e-06, + "loss": 0.3489, + "step": 4604 + }, + { + "epoch": 1.1581991951710262, + "grad_norm": 0.34960421919822693, + "learning_rate": 7.707734696372158e-06, + "loss": 0.3528, + "step": 4605 + }, + { + "epoch": 1.158450704225352, + "grad_norm": 0.30523577332496643, + "learning_rate": 7.706504471305843e-06, + "loss": 0.344, + "step": 4606 + }, + { + "epoch": 1.158702213279678, + "grad_norm": 0.33770856261253357, + "learning_rate": 7.70527401444435e-06, + "loss": 0.3737, + "step": 4607 + }, + { + "epoch": 1.158953722334004, + "grad_norm": 0.34141412377357483, + "learning_rate": 7.704043325893064e-06, + "loss": 0.3384, + "step": 4608 + }, + { + "epoch": 1.1592052313883299, + "grad_norm": 0.35698843002319336, + "learning_rate": 7.702812405757382e-06, + "loss": 0.3231, + "step": 4609 + }, + { + "epoch": 1.159456740442656, + "grad_norm": 0.37319764494895935, + "learning_rate": 7.701581254142728e-06, + "loss": 0.3508, + "step": 4610 + }, + { + "epoch": 1.159708249496982, + "grad_norm": 0.32721927762031555, + "learning_rate": 7.70034987115454e-06, + "loss": 0.3482, + "step": 4611 + }, + { + "epoch": 1.1599597585513077, + "grad_norm": 0.3475745618343353, + "learning_rate": 7.699118256898277e-06, + "loss": 0.3454, + "step": 4612 + }, + { + "epoch": 1.1602112676056338, + "grad_norm": 0.3759898245334625, + "learning_rate": 7.697886411479422e-06, + "loss": 0.3683, + "step": 4613 + }, + { + "epoch": 1.1604627766599598, + "grad_norm": 0.35182997584342957, + "learning_rate": 7.696654335003475e-06, + "loss": 0.3519, + "step": 4614 + }, + { + "epoch": 1.1607142857142858, + "grad_norm": 0.38022348284721375, + "learning_rate": 7.695422027575953e-06, + "loss": 0.3434, + "step": 4615 + }, + { + "epoch": 1.1609657947686116, + "grad_norm": 0.36011841893196106, + "learning_rate": 7.694189489302399e-06, + "loss": 0.3508, + "step": 4616 + }, + { + "epoch": 1.1612173038229376, + "grad_norm": 0.3434779942035675, + "learning_rate": 7.692956720288369e-06, + "loss": 0.3481, + "step": 4617 + }, + { + "epoch": 1.1614688128772637, + "grad_norm": 0.3736356198787689, + "learning_rate": 7.69172372063944e-06, + "loss": 0.3376, + "step": 4618 + }, + { + "epoch": 1.1617203219315895, + "grad_norm": 0.3600045144557953, + "learning_rate": 7.690490490461217e-06, + "loss": 0.3435, + "step": 4619 + }, + { + "epoch": 1.1619718309859155, + "grad_norm": 0.3338755965232849, + "learning_rate": 7.689257029859316e-06, + "loss": 0.3457, + "step": 4620 + }, + { + "epoch": 1.1622233400402415, + "grad_norm": 0.33322474360466003, + "learning_rate": 7.688023338939373e-06, + "loss": 0.3442, + "step": 4621 + }, + { + "epoch": 1.1624748490945673, + "grad_norm": 0.3295151889324188, + "learning_rate": 7.686789417807045e-06, + "loss": 0.329, + "step": 4622 + }, + { + "epoch": 1.1627263581488934, + "grad_norm": 0.36218923330307007, + "learning_rate": 7.685555266568014e-06, + "loss": 0.3676, + "step": 4623 + }, + { + "epoch": 1.1629778672032194, + "grad_norm": 0.3499116003513336, + "learning_rate": 7.684320885327976e-06, + "loss": 0.374, + "step": 4624 + }, + { + "epoch": 1.1632293762575452, + "grad_norm": 0.3172034025192261, + "learning_rate": 7.683086274192647e-06, + "loss": 0.3304, + "step": 4625 + }, + { + "epoch": 1.1634808853118712, + "grad_norm": 0.3143727481365204, + "learning_rate": 7.681851433267762e-06, + "loss": 0.35, + "step": 4626 + }, + { + "epoch": 1.1637323943661972, + "grad_norm": 0.3694257140159607, + "learning_rate": 7.680616362659082e-06, + "loss": 0.3583, + "step": 4627 + }, + { + "epoch": 1.163983903420523, + "grad_norm": 0.3408942222595215, + "learning_rate": 7.679381062472377e-06, + "loss": 0.3374, + "step": 4628 + }, + { + "epoch": 1.164235412474849, + "grad_norm": 0.29644355177879333, + "learning_rate": 7.678145532813448e-06, + "loss": 0.3226, + "step": 4629 + }, + { + "epoch": 1.164486921529175, + "grad_norm": 0.3752206265926361, + "learning_rate": 7.67690977378811e-06, + "loss": 0.3861, + "step": 4630 + }, + { + "epoch": 1.1647384305835011, + "grad_norm": 0.3474452495574951, + "learning_rate": 7.675673785502195e-06, + "loss": 0.3807, + "step": 4631 + }, + { + "epoch": 1.164989939637827, + "grad_norm": 0.3323638141155243, + "learning_rate": 7.674437568061559e-06, + "loss": 0.347, + "step": 4632 + }, + { + "epoch": 1.165241448692153, + "grad_norm": 0.322618305683136, + "learning_rate": 7.673201121572077e-06, + "loss": 0.3689, + "step": 4633 + }, + { + "epoch": 1.165492957746479, + "grad_norm": 0.3595317304134369, + "learning_rate": 7.671964446139643e-06, + "loss": 0.3701, + "step": 4634 + }, + { + "epoch": 1.1657444668008048, + "grad_norm": 0.33650046586990356, + "learning_rate": 7.67072754187017e-06, + "loss": 0.3491, + "step": 4635 + }, + { + "epoch": 1.1659959758551308, + "grad_norm": 0.3536091148853302, + "learning_rate": 7.66949040886959e-06, + "loss": 0.3488, + "step": 4636 + }, + { + "epoch": 1.1662474849094568, + "grad_norm": 0.3690861761569977, + "learning_rate": 7.668253047243856e-06, + "loss": 0.3511, + "step": 4637 + }, + { + "epoch": 1.1664989939637826, + "grad_norm": 0.33993828296661377, + "learning_rate": 7.667015457098944e-06, + "loss": 0.358, + "step": 4638 + }, + { + "epoch": 1.1667505030181087, + "grad_norm": 0.3575940728187561, + "learning_rate": 7.66577763854084e-06, + "loss": 0.3414, + "step": 4639 + }, + { + "epoch": 1.1670020120724347, + "grad_norm": 0.3681694567203522, + "learning_rate": 7.664539591675559e-06, + "loss": 0.3922, + "step": 4640 + }, + { + "epoch": 1.1672535211267605, + "grad_norm": 0.3579506576061249, + "learning_rate": 7.663301316609131e-06, + "loss": 0.341, + "step": 4641 + }, + { + "epoch": 1.1675050301810865, + "grad_norm": 0.3645816147327423, + "learning_rate": 7.662062813447608e-06, + "loss": 0.3502, + "step": 4642 + }, + { + "epoch": 1.1677565392354126, + "grad_norm": 0.34026220440864563, + "learning_rate": 7.660824082297057e-06, + "loss": 0.3656, + "step": 4643 + }, + { + "epoch": 1.1680080482897384, + "grad_norm": 0.3202768862247467, + "learning_rate": 7.659585123263571e-06, + "loss": 0.354, + "step": 4644 + }, + { + "epoch": 1.1682595573440644, + "grad_norm": 0.3285091519355774, + "learning_rate": 7.658345936453257e-06, + "loss": 0.3649, + "step": 4645 + }, + { + "epoch": 1.1685110663983904, + "grad_norm": 0.3168383538722992, + "learning_rate": 7.657106521972246e-06, + "loss": 0.3475, + "step": 4646 + }, + { + "epoch": 1.1687625754527162, + "grad_norm": 0.3457097113132477, + "learning_rate": 7.655866879926682e-06, + "loss": 0.3591, + "step": 4647 + }, + { + "epoch": 1.1690140845070423, + "grad_norm": 0.31571948528289795, + "learning_rate": 7.654627010422735e-06, + "loss": 0.3494, + "step": 4648 + }, + { + "epoch": 1.1692655935613683, + "grad_norm": 0.3494684398174286, + "learning_rate": 7.653386913566593e-06, + "loss": 0.3648, + "step": 4649 + }, + { + "epoch": 1.169517102615694, + "grad_norm": 0.3523896038532257, + "learning_rate": 7.65214658946446e-06, + "loss": 0.3614, + "step": 4650 + }, + { + "epoch": 1.16976861167002, + "grad_norm": 0.32975268363952637, + "learning_rate": 7.650906038222563e-06, + "loss": 0.3505, + "step": 4651 + }, + { + "epoch": 1.1700201207243461, + "grad_norm": 0.35798007249832153, + "learning_rate": 7.64966525994715e-06, + "loss": 0.355, + "step": 4652 + }, + { + "epoch": 1.170271629778672, + "grad_norm": 0.3435303568840027, + "learning_rate": 7.648424254744481e-06, + "loss": 0.3548, + "step": 4653 + }, + { + "epoch": 1.170523138832998, + "grad_norm": 0.3186478316783905, + "learning_rate": 7.647183022720846e-06, + "loss": 0.3469, + "step": 4654 + }, + { + "epoch": 1.170774647887324, + "grad_norm": 0.3599293529987335, + "learning_rate": 7.645941563982544e-06, + "loss": 0.3726, + "step": 4655 + }, + { + "epoch": 1.1710261569416498, + "grad_norm": 0.3607332706451416, + "learning_rate": 7.644699878635901e-06, + "loss": 0.339, + "step": 4656 + }, + { + "epoch": 1.1712776659959758, + "grad_norm": 0.3603340983390808, + "learning_rate": 7.643457966787258e-06, + "loss": 0.369, + "step": 4657 + }, + { + "epoch": 1.1715291750503019, + "grad_norm": 0.3286212980747223, + "learning_rate": 7.642215828542977e-06, + "loss": 0.3645, + "step": 4658 + }, + { + "epoch": 1.1717806841046277, + "grad_norm": 0.35398638248443604, + "learning_rate": 7.64097346400944e-06, + "loss": 0.3453, + "step": 4659 + }, + { + "epoch": 1.1720321931589537, + "grad_norm": 0.3555023968219757, + "learning_rate": 7.63973087329305e-06, + "loss": 0.3515, + "step": 4660 + }, + { + "epoch": 1.1722837022132797, + "grad_norm": 0.3314943015575409, + "learning_rate": 7.638488056500222e-06, + "loss": 0.3579, + "step": 4661 + }, + { + "epoch": 1.1725352112676055, + "grad_norm": 0.3424238860607147, + "learning_rate": 7.637245013737399e-06, + "loss": 0.3462, + "step": 4662 + }, + { + "epoch": 1.1727867203219315, + "grad_norm": 0.3617474138736725, + "learning_rate": 7.636001745111039e-06, + "loss": 0.3541, + "step": 4663 + }, + { + "epoch": 1.1730382293762576, + "grad_norm": 0.3585506081581116, + "learning_rate": 7.634758250727621e-06, + "loss": 0.3603, + "step": 4664 + }, + { + "epoch": 1.1732897384305836, + "grad_norm": 0.35511475801467896, + "learning_rate": 7.633514530693642e-06, + "loss": 0.3553, + "step": 4665 + }, + { + "epoch": 1.1735412474849094, + "grad_norm": 0.3293471038341522, + "learning_rate": 7.632270585115618e-06, + "loss": 0.3478, + "step": 4666 + }, + { + "epoch": 1.1737927565392354, + "grad_norm": 0.3938407599925995, + "learning_rate": 7.631026414100086e-06, + "loss": 0.3484, + "step": 4667 + }, + { + "epoch": 1.1740442655935615, + "grad_norm": 0.3182221055030823, + "learning_rate": 7.629782017753602e-06, + "loss": 0.3623, + "step": 4668 + }, + { + "epoch": 1.1742957746478873, + "grad_norm": 0.34111714363098145, + "learning_rate": 7.628537396182739e-06, + "loss": 0.3747, + "step": 4669 + }, + { + "epoch": 1.1745472837022133, + "grad_norm": 0.3676598072052002, + "learning_rate": 7.627292549494092e-06, + "loss": 0.3687, + "step": 4670 + }, + { + "epoch": 1.1747987927565393, + "grad_norm": 0.3540399670600891, + "learning_rate": 7.626047477794276e-06, + "loss": 0.3611, + "step": 4671 + }, + { + "epoch": 1.1750503018108651, + "grad_norm": 0.32811835408210754, + "learning_rate": 7.62480218118992e-06, + "loss": 0.3235, + "step": 4672 + }, + { + "epoch": 1.1753018108651911, + "grad_norm": 0.3579806089401245, + "learning_rate": 7.6235566597876786e-06, + "loss": 0.3429, + "step": 4673 + }, + { + "epoch": 1.1755533199195172, + "grad_norm": 0.3340808153152466, + "learning_rate": 7.622310913694222e-06, + "loss": 0.3554, + "step": 4674 + }, + { + "epoch": 1.175804828973843, + "grad_norm": 0.3324950933456421, + "learning_rate": 7.621064943016241e-06, + "loss": 0.3406, + "step": 4675 + }, + { + "epoch": 1.176056338028169, + "grad_norm": 0.3520958423614502, + "learning_rate": 7.6198187478604455e-06, + "loss": 0.3199, + "step": 4676 + }, + { + "epoch": 1.176307847082495, + "grad_norm": 0.3330570459365845, + "learning_rate": 7.618572328333565e-06, + "loss": 0.3447, + "step": 4677 + }, + { + "epoch": 1.1765593561368208, + "grad_norm": 0.3154620826244354, + "learning_rate": 7.617325684542344e-06, + "loss": 0.3549, + "step": 4678 + }, + { + "epoch": 1.1768108651911469, + "grad_norm": 0.29848769307136536, + "learning_rate": 7.6160788165935525e-06, + "loss": 0.3355, + "step": 4679 + }, + { + "epoch": 1.1770623742454729, + "grad_norm": 0.32562530040740967, + "learning_rate": 7.6148317245939766e-06, + "loss": 0.3314, + "step": 4680 + }, + { + "epoch": 1.177313883299799, + "grad_norm": 0.33761054277420044, + "learning_rate": 7.613584408650423e-06, + "loss": 0.3452, + "step": 4681 + }, + { + "epoch": 1.1775653923541247, + "grad_norm": 0.3247910737991333, + "learning_rate": 7.612336868869714e-06, + "loss": 0.3577, + "step": 4682 + }, + { + "epoch": 1.1778169014084507, + "grad_norm": 0.3061235845088959, + "learning_rate": 7.611089105358695e-06, + "loss": 0.3222, + "step": 4683 + }, + { + "epoch": 1.1780684104627768, + "grad_norm": 0.32039159536361694, + "learning_rate": 7.609841118224229e-06, + "loss": 0.3555, + "step": 4684 + }, + { + "epoch": 1.1783199195171026, + "grad_norm": 0.3344159722328186, + "learning_rate": 7.608592907573199e-06, + "loss": 0.3256, + "step": 4685 + }, + { + "epoch": 1.1785714285714286, + "grad_norm": 0.32991334795951843, + "learning_rate": 7.607344473512506e-06, + "loss": 0.3265, + "step": 4686 + }, + { + "epoch": 1.1788229376257546, + "grad_norm": 0.35629698634147644, + "learning_rate": 7.606095816149069e-06, + "loss": 0.3511, + "step": 4687 + }, + { + "epoch": 1.1790744466800804, + "grad_norm": 0.3231953978538513, + "learning_rate": 7.60484693558983e-06, + "loss": 0.3497, + "step": 4688 + }, + { + "epoch": 1.1793259557344065, + "grad_norm": 0.35513460636138916, + "learning_rate": 7.603597831941747e-06, + "loss": 0.3495, + "step": 4689 + }, + { + "epoch": 1.1795774647887325, + "grad_norm": 0.34873831272125244, + "learning_rate": 7.602348505311797e-06, + "loss": 0.3525, + "step": 4690 + }, + { + "epoch": 1.1798289738430583, + "grad_norm": 0.300342857837677, + "learning_rate": 7.601098955806978e-06, + "loss": 0.3704, + "step": 4691 + }, + { + "epoch": 1.1800804828973843, + "grad_norm": 0.3526897430419922, + "learning_rate": 7.5998491835343065e-06, + "loss": 0.3513, + "step": 4692 + }, + { + "epoch": 1.1803319919517103, + "grad_norm": 0.3581010103225708, + "learning_rate": 7.598599188600817e-06, + "loss": 0.3653, + "step": 4693 + }, + { + "epoch": 1.1805835010060362, + "grad_norm": 0.31560835242271423, + "learning_rate": 7.5973489711135625e-06, + "loss": 0.3527, + "step": 4694 + }, + { + "epoch": 1.1808350100603622, + "grad_norm": 0.3127973973751068, + "learning_rate": 7.596098531179619e-06, + "loss": 0.378, + "step": 4695 + }, + { + "epoch": 1.1810865191146882, + "grad_norm": 0.35242587327957153, + "learning_rate": 7.594847868906076e-06, + "loss": 0.3592, + "step": 4696 + }, + { + "epoch": 1.181338028169014, + "grad_norm": 0.33138447999954224, + "learning_rate": 7.593596984400048e-06, + "loss": 0.391, + "step": 4697 + }, + { + "epoch": 1.18158953722334, + "grad_norm": 0.3515087068080902, + "learning_rate": 7.592345877768663e-06, + "loss": 0.3641, + "step": 4698 + }, + { + "epoch": 1.181841046277666, + "grad_norm": 0.32551079988479614, + "learning_rate": 7.591094549119071e-06, + "loss": 0.3725, + "step": 4699 + }, + { + "epoch": 1.1820925553319919, + "grad_norm": 0.32543718814849854, + "learning_rate": 7.589842998558441e-06, + "loss": 0.3395, + "step": 4700 + }, + { + "epoch": 1.182344064386318, + "grad_norm": 0.33554258942604065, + "learning_rate": 7.58859122619396e-06, + "loss": 0.3503, + "step": 4701 + }, + { + "epoch": 1.182595573440644, + "grad_norm": 0.33632540702819824, + "learning_rate": 7.587339232132835e-06, + "loss": 0.3622, + "step": 4702 + }, + { + "epoch": 1.1828470824949697, + "grad_norm": 0.32022663950920105, + "learning_rate": 7.586087016482291e-06, + "loss": 0.3443, + "step": 4703 + }, + { + "epoch": 1.1830985915492958, + "grad_norm": 0.3406105637550354, + "learning_rate": 7.584834579349572e-06, + "loss": 0.3413, + "step": 4704 + }, + { + "epoch": 1.1833501006036218, + "grad_norm": 0.3901978135108948, + "learning_rate": 7.5835819208419425e-06, + "loss": 0.3665, + "step": 4705 + }, + { + "epoch": 1.1836016096579476, + "grad_norm": 0.3854331076145172, + "learning_rate": 7.5823290410666835e-06, + "loss": 0.3607, + "step": 4706 + }, + { + "epoch": 1.1838531187122736, + "grad_norm": 0.3867035210132599, + "learning_rate": 7.5810759401310975e-06, + "loss": 0.3537, + "step": 4707 + }, + { + "epoch": 1.1841046277665996, + "grad_norm": 0.34121641516685486, + "learning_rate": 7.579822618142505e-06, + "loss": 0.3496, + "step": 4708 + }, + { + "epoch": 1.1843561368209254, + "grad_norm": 0.3433249592781067, + "learning_rate": 7.578569075208244e-06, + "loss": 0.3553, + "step": 4709 + }, + { + "epoch": 1.1846076458752515, + "grad_norm": 0.34000903367996216, + "learning_rate": 7.577315311435674e-06, + "loss": 0.3435, + "step": 4710 + }, + { + "epoch": 1.1848591549295775, + "grad_norm": 0.3518748879432678, + "learning_rate": 7.5760613269321715e-06, + "loss": 0.382, + "step": 4711 + }, + { + "epoch": 1.1851106639839033, + "grad_norm": 0.30435121059417725, + "learning_rate": 7.574807121805131e-06, + "loss": 0.3234, + "step": 4712 + }, + { + "epoch": 1.1853621730382293, + "grad_norm": 0.33615031838417053, + "learning_rate": 7.573552696161969e-06, + "loss": 0.3528, + "step": 4713 + }, + { + "epoch": 1.1856136820925554, + "grad_norm": 0.3324757516384125, + "learning_rate": 7.572298050110118e-06, + "loss": 0.3725, + "step": 4714 + }, + { + "epoch": 1.1858651911468814, + "grad_norm": 0.37572944164276123, + "learning_rate": 7.571043183757032e-06, + "loss": 0.3328, + "step": 4715 + }, + { + "epoch": 1.1861167002012072, + "grad_norm": 0.31655603647232056, + "learning_rate": 7.56978809721018e-06, + "loss": 0.3458, + "step": 4716 + }, + { + "epoch": 1.1863682092555332, + "grad_norm": 0.33465415239334106, + "learning_rate": 7.568532790577057e-06, + "loss": 0.369, + "step": 4717 + }, + { + "epoch": 1.1866197183098592, + "grad_norm": 0.33323749899864197, + "learning_rate": 7.567277263965167e-06, + "loss": 0.3469, + "step": 4718 + }, + { + "epoch": 1.186871227364185, + "grad_norm": 0.31222718954086304, + "learning_rate": 7.566021517482041e-06, + "loss": 0.37, + "step": 4719 + }, + { + "epoch": 1.187122736418511, + "grad_norm": 0.3619459271430969, + "learning_rate": 7.5647655512352245e-06, + "loss": 0.3535, + "step": 4720 + }, + { + "epoch": 1.187374245472837, + "grad_norm": 0.3543090224266052, + "learning_rate": 7.563509365332285e-06, + "loss": 0.3578, + "step": 4721 + }, + { + "epoch": 1.187625754527163, + "grad_norm": 0.3527710437774658, + "learning_rate": 7.562252959880804e-06, + "loss": 0.356, + "step": 4722 + }, + { + "epoch": 1.187877263581489, + "grad_norm": 0.3505546748638153, + "learning_rate": 7.560996334988386e-06, + "loss": 0.3641, + "step": 4723 + }, + { + "epoch": 1.188128772635815, + "grad_norm": 0.3206879794597626, + "learning_rate": 7.5597394907626555e-06, + "loss": 0.361, + "step": 4724 + }, + { + "epoch": 1.1883802816901408, + "grad_norm": 0.3364570140838623, + "learning_rate": 7.55848242731125e-06, + "loss": 0.339, + "step": 4725 + }, + { + "epoch": 1.1886317907444668, + "grad_norm": 0.3259868919849396, + "learning_rate": 7.557225144741831e-06, + "loss": 0.3472, + "step": 4726 + }, + { + "epoch": 1.1888832997987928, + "grad_norm": 0.34462079405784607, + "learning_rate": 7.5559676431620745e-06, + "loss": 0.3461, + "step": 4727 + }, + { + "epoch": 1.1891348088531186, + "grad_norm": 0.34534603357315063, + "learning_rate": 7.554709922679681e-06, + "loss": 0.38, + "step": 4728 + }, + { + "epoch": 1.1893863179074446, + "grad_norm": 0.32316854596138, + "learning_rate": 7.553451983402364e-06, + "loss": 0.3291, + "step": 4729 + }, + { + "epoch": 1.1896378269617707, + "grad_norm": 0.34536492824554443, + "learning_rate": 7.552193825437861e-06, + "loss": 0.3607, + "step": 4730 + }, + { + "epoch": 1.1898893360160967, + "grad_norm": 0.34370651841163635, + "learning_rate": 7.550935448893921e-06, + "loss": 0.3472, + "step": 4731 + }, + { + "epoch": 1.1901408450704225, + "grad_norm": 0.3375087380409241, + "learning_rate": 7.54967685387832e-06, + "loss": 0.364, + "step": 4732 + }, + { + "epoch": 1.1903923541247485, + "grad_norm": 0.3438502848148346, + "learning_rate": 7.548418040498847e-06, + "loss": 0.3446, + "step": 4733 + }, + { + "epoch": 1.1906438631790746, + "grad_norm": 0.34919726848602295, + "learning_rate": 7.547159008863312e-06, + "loss": 0.3661, + "step": 4734 + }, + { + "epoch": 1.1908953722334004, + "grad_norm": 0.3322198987007141, + "learning_rate": 7.545899759079542e-06, + "loss": 0.3845, + "step": 4735 + }, + { + "epoch": 1.1911468812877264, + "grad_norm": 0.3598858416080475, + "learning_rate": 7.544640291255385e-06, + "loss": 0.3669, + "step": 4736 + }, + { + "epoch": 1.1913983903420524, + "grad_norm": 0.3647436797618866, + "learning_rate": 7.543380605498707e-06, + "loss": 0.3699, + "step": 4737 + }, + { + "epoch": 1.1916498993963782, + "grad_norm": 0.3301692605018616, + "learning_rate": 7.542120701917391e-06, + "loss": 0.3728, + "step": 4738 + }, + { + "epoch": 1.1919014084507042, + "grad_norm": 0.33648648858070374, + "learning_rate": 7.540860580619339e-06, + "loss": 0.3443, + "step": 4739 + }, + { + "epoch": 1.1921529175050303, + "grad_norm": 0.3432157337665558, + "learning_rate": 7.539600241712475e-06, + "loss": 0.3518, + "step": 4740 + }, + { + "epoch": 1.192404426559356, + "grad_norm": 0.32822272181510925, + "learning_rate": 7.538339685304737e-06, + "loss": 0.3609, + "step": 4741 + }, + { + "epoch": 1.192655935613682, + "grad_norm": 0.334031343460083, + "learning_rate": 7.537078911504087e-06, + "loss": 0.3529, + "step": 4742 + }, + { + "epoch": 1.1929074446680081, + "grad_norm": 0.3261743187904358, + "learning_rate": 7.5358179204184975e-06, + "loss": 0.3695, + "step": 4743 + }, + { + "epoch": 1.193158953722334, + "grad_norm": 0.3211628496646881, + "learning_rate": 7.53455671215597e-06, + "loss": 0.3529, + "step": 4744 + }, + { + "epoch": 1.19341046277666, + "grad_norm": 0.30509626865386963, + "learning_rate": 7.533295286824513e-06, + "loss": 0.3663, + "step": 4745 + }, + { + "epoch": 1.193661971830986, + "grad_norm": 0.3319106101989746, + "learning_rate": 7.532033644532166e-06, + "loss": 0.3598, + "step": 4746 + }, + { + "epoch": 1.1939134808853118, + "grad_norm": 0.34237736463546753, + "learning_rate": 7.530771785386976e-06, + "loss": 0.3661, + "step": 4747 + }, + { + "epoch": 1.1941649899396378, + "grad_norm": 0.3307172656059265, + "learning_rate": 7.5295097094970136e-06, + "loss": 0.3489, + "step": 4748 + }, + { + "epoch": 1.1944164989939638, + "grad_norm": 0.32638081908226013, + "learning_rate": 7.528247416970371e-06, + "loss": 0.3613, + "step": 4749 + }, + { + "epoch": 1.1946680080482897, + "grad_norm": 0.3481774628162384, + "learning_rate": 7.5269849079151535e-06, + "loss": 0.3557, + "step": 4750 + }, + { + "epoch": 1.1949195171026157, + "grad_norm": 0.3151308298110962, + "learning_rate": 7.525722182439488e-06, + "loss": 0.3375, + "step": 4751 + }, + { + "epoch": 1.1951710261569417, + "grad_norm": 0.32890570163726807, + "learning_rate": 7.524459240651518e-06, + "loss": 0.3387, + "step": 4752 + }, + { + "epoch": 1.1954225352112675, + "grad_norm": 0.3260030150413513, + "learning_rate": 7.523196082659408e-06, + "loss": 0.3428, + "step": 4753 + }, + { + "epoch": 1.1956740442655935, + "grad_norm": 0.3384077847003937, + "learning_rate": 7.521932708571338e-06, + "loss": 0.3585, + "step": 4754 + }, + { + "epoch": 1.1959255533199196, + "grad_norm": 0.3400633633136749, + "learning_rate": 7.520669118495507e-06, + "loss": 0.3382, + "step": 4755 + }, + { + "epoch": 1.1961770623742454, + "grad_norm": 0.3479855954647064, + "learning_rate": 7.519405312540137e-06, + "loss": 0.3566, + "step": 4756 + }, + { + "epoch": 1.1964285714285714, + "grad_norm": 0.3062722980976105, + "learning_rate": 7.518141290813463e-06, + "loss": 0.3361, + "step": 4757 + }, + { + "epoch": 1.1966800804828974, + "grad_norm": 0.33496060967445374, + "learning_rate": 7.5168770534237414e-06, + "loss": 0.3296, + "step": 4758 + }, + { + "epoch": 1.1969315895372232, + "grad_norm": 0.3257729709148407, + "learning_rate": 7.515612600479243e-06, + "loss": 0.3502, + "step": 4759 + }, + { + "epoch": 1.1971830985915493, + "grad_norm": 0.3281537890434265, + "learning_rate": 7.514347932088266e-06, + "loss": 0.3289, + "step": 4760 + }, + { + "epoch": 1.1974346076458753, + "grad_norm": 0.324022501707077, + "learning_rate": 7.513083048359117e-06, + "loss": 0.327, + "step": 4761 + }, + { + "epoch": 1.197686116700201, + "grad_norm": 0.3308146595954895, + "learning_rate": 7.511817949400126e-06, + "loss": 0.3437, + "step": 4762 + }, + { + "epoch": 1.1979376257545271, + "grad_norm": 0.36193880438804626, + "learning_rate": 7.510552635319643e-06, + "loss": 0.3438, + "step": 4763 + }, + { + "epoch": 1.1981891348088531, + "grad_norm": 0.3138233721256256, + "learning_rate": 7.509287106226033e-06, + "loss": 0.3297, + "step": 4764 + }, + { + "epoch": 1.1984406438631792, + "grad_norm": 0.3267454504966736, + "learning_rate": 7.50802136222768e-06, + "loss": 0.3373, + "step": 4765 + }, + { + "epoch": 1.198692152917505, + "grad_norm": 0.34192612767219543, + "learning_rate": 7.506755403432987e-06, + "loss": 0.366, + "step": 4766 + }, + { + "epoch": 1.198943661971831, + "grad_norm": 0.3394359350204468, + "learning_rate": 7.505489229950375e-06, + "loss": 0.3285, + "step": 4767 + }, + { + "epoch": 1.199195171026157, + "grad_norm": 0.3370779752731323, + "learning_rate": 7.504222841888287e-06, + "loss": 0.3262, + "step": 4768 + }, + { + "epoch": 1.1994466800804828, + "grad_norm": 0.33641576766967773, + "learning_rate": 7.502956239355178e-06, + "loss": 0.3301, + "step": 4769 + }, + { + "epoch": 1.1996981891348089, + "grad_norm": 0.33692795038223267, + "learning_rate": 7.5016894224595235e-06, + "loss": 0.3633, + "step": 4770 + }, + { + "epoch": 1.1999496981891349, + "grad_norm": 0.3135643005371094, + "learning_rate": 7.5004223913098224e-06, + "loss": 0.3528, + "step": 4771 + }, + { + "epoch": 1.2002012072434607, + "grad_norm": 0.37040475010871887, + "learning_rate": 7.4991551460145874e-06, + "loss": 0.3748, + "step": 4772 + }, + { + "epoch": 1.2004527162977867, + "grad_norm": 0.35265108942985535, + "learning_rate": 7.497887686682347e-06, + "loss": 0.3373, + "step": 4773 + }, + { + "epoch": 1.2007042253521127, + "grad_norm": 0.30251818895339966, + "learning_rate": 7.496620013421654e-06, + "loss": 0.348, + "step": 4774 + }, + { + "epoch": 1.2009557344064385, + "grad_norm": 0.37606722116470337, + "learning_rate": 7.495352126341074e-06, + "loss": 0.3535, + "step": 4775 + }, + { + "epoch": 1.2012072434607646, + "grad_norm": 0.3614901602268219, + "learning_rate": 7.4940840255491975e-06, + "loss": 0.3766, + "step": 4776 + }, + { + "epoch": 1.2014587525150906, + "grad_norm": 0.33923235535621643, + "learning_rate": 7.492815711154626e-06, + "loss": 0.3423, + "step": 4777 + }, + { + "epoch": 1.2017102615694164, + "grad_norm": 0.388129860162735, + "learning_rate": 7.4915471832659835e-06, + "loss": 0.372, + "step": 4778 + }, + { + "epoch": 1.2019617706237424, + "grad_norm": 0.3465493619441986, + "learning_rate": 7.490278441991911e-06, + "loss": 0.3534, + "step": 4779 + }, + { + "epoch": 1.2022132796780685, + "grad_norm": 0.3515889644622803, + "learning_rate": 7.489009487441071e-06, + "loss": 0.3313, + "step": 4780 + }, + { + "epoch": 1.2024647887323945, + "grad_norm": 0.375966340303421, + "learning_rate": 7.4877403197221385e-06, + "loss": 0.4004, + "step": 4781 + }, + { + "epoch": 1.2027162977867203, + "grad_norm": 0.3232581317424774, + "learning_rate": 7.48647093894381e-06, + "loss": 0.3247, + "step": 4782 + }, + { + "epoch": 1.2029678068410463, + "grad_norm": 0.3182356059551239, + "learning_rate": 7.485201345214803e-06, + "loss": 0.3493, + "step": 4783 + }, + { + "epoch": 1.2032193158953723, + "grad_norm": 0.340154230594635, + "learning_rate": 7.483931538643847e-06, + "loss": 0.3646, + "step": 4784 + }, + { + "epoch": 1.2034708249496981, + "grad_norm": 0.35035526752471924, + "learning_rate": 7.482661519339696e-06, + "loss": 0.3693, + "step": 4785 + }, + { + "epoch": 1.2037223340040242, + "grad_norm": 0.34506088495254517, + "learning_rate": 7.481391287411115e-06, + "loss": 0.343, + "step": 4786 + }, + { + "epoch": 1.2039738430583502, + "grad_norm": 0.34625160694122314, + "learning_rate": 7.480120842966895e-06, + "loss": 0.3267, + "step": 4787 + }, + { + "epoch": 1.204225352112676, + "grad_norm": 0.35969218611717224, + "learning_rate": 7.478850186115839e-06, + "loss": 0.3706, + "step": 4788 + }, + { + "epoch": 1.204476861167002, + "grad_norm": 0.3208048641681671, + "learning_rate": 7.477579316966773e-06, + "loss": 0.3509, + "step": 4789 + }, + { + "epoch": 1.204728370221328, + "grad_norm": 0.34545454382896423, + "learning_rate": 7.4763082356285385e-06, + "loss": 0.3524, + "step": 4790 + }, + { + "epoch": 1.2049798792756539, + "grad_norm": 0.37139612436294556, + "learning_rate": 7.4750369422099955e-06, + "loss": 0.3832, + "step": 4791 + }, + { + "epoch": 1.20523138832998, + "grad_norm": 0.3318977952003479, + "learning_rate": 7.473765436820021e-06, + "loss": 0.3348, + "step": 4792 + }, + { + "epoch": 1.205482897384306, + "grad_norm": 0.34390851855278015, + "learning_rate": 7.472493719567513e-06, + "loss": 0.3608, + "step": 4793 + }, + { + "epoch": 1.2057344064386317, + "grad_norm": 0.3598691523075104, + "learning_rate": 7.471221790561387e-06, + "loss": 0.3632, + "step": 4794 + }, + { + "epoch": 1.2059859154929577, + "grad_norm": 0.3379955291748047, + "learning_rate": 7.469949649910574e-06, + "loss": 0.3586, + "step": 4795 + }, + { + "epoch": 1.2062374245472838, + "grad_norm": 0.37648218870162964, + "learning_rate": 7.468677297724025e-06, + "loss": 0.3536, + "step": 4796 + }, + { + "epoch": 1.2064889336016096, + "grad_norm": 0.3806565999984741, + "learning_rate": 7.46740473411071e-06, + "loss": 0.3358, + "step": 4797 + }, + { + "epoch": 1.2067404426559356, + "grad_norm": 0.34232839941978455, + "learning_rate": 7.466131959179614e-06, + "loss": 0.3444, + "step": 4798 + }, + { + "epoch": 1.2069919517102616, + "grad_norm": 0.3516295254230499, + "learning_rate": 7.464858973039746e-06, + "loss": 0.3442, + "step": 4799 + }, + { + "epoch": 1.2072434607645874, + "grad_norm": 0.3554040491580963, + "learning_rate": 7.4635857758001266e-06, + "loss": 0.3613, + "step": 4800 + }, + { + "epoch": 1.2074949698189135, + "grad_norm": 0.3366535007953644, + "learning_rate": 7.4623123675697976e-06, + "loss": 0.3493, + "step": 4801 + }, + { + "epoch": 1.2077464788732395, + "grad_norm": 0.34996628761291504, + "learning_rate": 7.461038748457818e-06, + "loss": 0.3765, + "step": 4802 + }, + { + "epoch": 1.2079979879275653, + "grad_norm": 0.36713096499443054, + "learning_rate": 7.459764918573264e-06, + "loss": 0.3513, + "step": 4803 + }, + { + "epoch": 1.2082494969818913, + "grad_norm": 0.3152211308479309, + "learning_rate": 7.4584908780252365e-06, + "loss": 0.3413, + "step": 4804 + }, + { + "epoch": 1.2085010060362174, + "grad_norm": 0.33181753754615784, + "learning_rate": 7.457216626922843e-06, + "loss": 0.3507, + "step": 4805 + }, + { + "epoch": 1.2087525150905432, + "grad_norm": 0.32960614562034607, + "learning_rate": 7.45594216537522e-06, + "loss": 0.3585, + "step": 4806 + }, + { + "epoch": 1.2090040241448692, + "grad_norm": 0.3321514427661896, + "learning_rate": 7.454667493491513e-06, + "loss": 0.3329, + "step": 4807 + }, + { + "epoch": 1.2092555331991952, + "grad_norm": 0.33364543318748474, + "learning_rate": 7.453392611380892e-06, + "loss": 0.3285, + "step": 4808 + }, + { + "epoch": 1.209507042253521, + "grad_norm": 0.31527218222618103, + "learning_rate": 7.452117519152542e-06, + "loss": 0.3499, + "step": 4809 + }, + { + "epoch": 1.209758551307847, + "grad_norm": 0.31813716888427734, + "learning_rate": 7.450842216915667e-06, + "loss": 0.3498, + "step": 4810 + }, + { + "epoch": 1.210010060362173, + "grad_norm": 0.30781570076942444, + "learning_rate": 7.449566704779488e-06, + "loss": 0.3651, + "step": 4811 + }, + { + "epoch": 1.2102615694164989, + "grad_norm": 0.31299492716789246, + "learning_rate": 7.448290982853247e-06, + "loss": 0.3392, + "step": 4812 + }, + { + "epoch": 1.210513078470825, + "grad_norm": 0.3456888794898987, + "learning_rate": 7.4470150512461966e-06, + "loss": 0.3628, + "step": 4813 + }, + { + "epoch": 1.210764587525151, + "grad_norm": 0.3564990758895874, + "learning_rate": 7.445738910067618e-06, + "loss": 0.3645, + "step": 4814 + }, + { + "epoch": 1.211016096579477, + "grad_norm": 0.32649949193000793, + "learning_rate": 7.444462559426802e-06, + "loss": 0.3589, + "step": 4815 + }, + { + "epoch": 1.2112676056338028, + "grad_norm": 0.36671027541160583, + "learning_rate": 7.44318599943306e-06, + "loss": 0.3756, + "step": 4816 + }, + { + "epoch": 1.2115191146881288, + "grad_norm": 0.36808088421821594, + "learning_rate": 7.441909230195722e-06, + "loss": 0.3589, + "step": 4817 + }, + { + "epoch": 1.2117706237424548, + "grad_norm": 0.35420069098472595, + "learning_rate": 7.440632251824135e-06, + "loss": 0.332, + "step": 4818 + }, + { + "epoch": 1.2120221327967806, + "grad_norm": 0.3255870044231415, + "learning_rate": 7.4393550644276635e-06, + "loss": 0.3622, + "step": 4819 + }, + { + "epoch": 1.2122736418511066, + "grad_norm": 0.335366427898407, + "learning_rate": 7.438077668115692e-06, + "loss": 0.3587, + "step": 4820 + }, + { + "epoch": 1.2125251509054327, + "grad_norm": 0.3825639486312866, + "learning_rate": 7.43680006299762e-06, + "loss": 0.3776, + "step": 4821 + }, + { + "epoch": 1.2127766599597585, + "grad_norm": 0.3819475769996643, + "learning_rate": 7.435522249182868e-06, + "loss": 0.3309, + "step": 4822 + }, + { + "epoch": 1.2130281690140845, + "grad_norm": 0.3363218605518341, + "learning_rate": 7.434244226780871e-06, + "loss": 0.3539, + "step": 4823 + }, + { + "epoch": 1.2132796780684105, + "grad_norm": 0.4024132490158081, + "learning_rate": 7.432965995901085e-06, + "loss": 0.3538, + "step": 4824 + }, + { + "epoch": 1.2135311871227363, + "grad_norm": 0.35047945380210876, + "learning_rate": 7.4316875566529825e-06, + "loss": 0.3228, + "step": 4825 + }, + { + "epoch": 1.2137826961770624, + "grad_norm": 0.3095787763595581, + "learning_rate": 7.430408909146053e-06, + "loss": 0.3695, + "step": 4826 + }, + { + "epoch": 1.2140342052313884, + "grad_norm": 0.33355486392974854, + "learning_rate": 7.429130053489805e-06, + "loss": 0.3494, + "step": 4827 + }, + { + "epoch": 1.2142857142857142, + "grad_norm": 0.37087491154670715, + "learning_rate": 7.427850989793764e-06, + "loss": 0.3527, + "step": 4828 + }, + { + "epoch": 1.2145372233400402, + "grad_norm": 0.3255065977573395, + "learning_rate": 7.426571718167476e-06, + "loss": 0.3543, + "step": 4829 + }, + { + "epoch": 1.2147887323943662, + "grad_norm": 0.33966347575187683, + "learning_rate": 7.4252922387205e-06, + "loss": 0.3526, + "step": 4830 + }, + { + "epoch": 1.2150402414486923, + "grad_norm": 0.32921141386032104, + "learning_rate": 7.424012551562416e-06, + "loss": 0.3662, + "step": 4831 + }, + { + "epoch": 1.215291750503018, + "grad_norm": 0.3616410791873932, + "learning_rate": 7.422732656802821e-06, + "loss": 0.3537, + "step": 4832 + }, + { + "epoch": 1.215543259557344, + "grad_norm": 0.3477363586425781, + "learning_rate": 7.421452554551332e-06, + "loss": 0.3633, + "step": 4833 + }, + { + "epoch": 1.2157947686116701, + "grad_norm": 0.34547752141952515, + "learning_rate": 7.420172244917579e-06, + "loss": 0.3455, + "step": 4834 + }, + { + "epoch": 1.216046277665996, + "grad_norm": 0.36257532238960266, + "learning_rate": 7.418891728011214e-06, + "loss": 0.3381, + "step": 4835 + }, + { + "epoch": 1.216297786720322, + "grad_norm": 0.34526896476745605, + "learning_rate": 7.417611003941905e-06, + "loss": 0.3347, + "step": 4836 + }, + { + "epoch": 1.216549295774648, + "grad_norm": 0.35130295157432556, + "learning_rate": 7.416330072819338e-06, + "loss": 0.3579, + "step": 4837 + }, + { + "epoch": 1.2168008048289738, + "grad_norm": 0.3254670202732086, + "learning_rate": 7.415048934753217e-06, + "loss": 0.3455, + "step": 4838 + }, + { + "epoch": 1.2170523138832998, + "grad_norm": 0.3543333113193512, + "learning_rate": 7.413767589853264e-06, + "loss": 0.3465, + "step": 4839 + }, + { + "epoch": 1.2173038229376258, + "grad_norm": 0.3948500156402588, + "learning_rate": 7.412486038229217e-06, + "loss": 0.354, + "step": 4840 + }, + { + "epoch": 1.2175553319919517, + "grad_norm": 0.3444294035434723, + "learning_rate": 7.411204279990832e-06, + "loss": 0.3694, + "step": 4841 + }, + { + "epoch": 1.2178068410462777, + "grad_norm": 0.35417085886001587, + "learning_rate": 7.4099223152478865e-06, + "loss": 0.3593, + "step": 4842 + }, + { + "epoch": 1.2180583501006037, + "grad_norm": 0.35232633352279663, + "learning_rate": 7.408640144110171e-06, + "loss": 0.3407, + "step": 4843 + }, + { + "epoch": 1.2183098591549295, + "grad_norm": 0.35325491428375244, + "learning_rate": 7.407357766687495e-06, + "loss": 0.3534, + "step": 4844 + }, + { + "epoch": 1.2185613682092555, + "grad_norm": 0.31295645236968994, + "learning_rate": 7.406075183089686e-06, + "loss": 0.3598, + "step": 4845 + }, + { + "epoch": 1.2188128772635816, + "grad_norm": 0.3566473424434662, + "learning_rate": 7.40479239342659e-06, + "loss": 0.3381, + "step": 4846 + }, + { + "epoch": 1.2190643863179074, + "grad_norm": 0.36591169238090515, + "learning_rate": 7.403509397808071e-06, + "loss": 0.3401, + "step": 4847 + }, + { + "epoch": 1.2193158953722334, + "grad_norm": 0.33974406123161316, + "learning_rate": 7.402226196344008e-06, + "loss": 0.3635, + "step": 4848 + }, + { + "epoch": 1.2195674044265594, + "grad_norm": 0.33850404620170593, + "learning_rate": 7.400942789144299e-06, + "loss": 0.37, + "step": 4849 + }, + { + "epoch": 1.2198189134808852, + "grad_norm": 0.34571510553359985, + "learning_rate": 7.399659176318861e-06, + "loss": 0.3681, + "step": 4850 + }, + { + "epoch": 1.2200704225352113, + "grad_norm": 0.29631301760673523, + "learning_rate": 7.398375357977626e-06, + "loss": 0.3839, + "step": 4851 + }, + { + "epoch": 1.2203219315895373, + "grad_norm": 0.33825013041496277, + "learning_rate": 7.397091334230547e-06, + "loss": 0.3593, + "step": 4852 + }, + { + "epoch": 1.220573440643863, + "grad_norm": 0.3286561667919159, + "learning_rate": 7.395807105187591e-06, + "loss": 0.3398, + "step": 4853 + }, + { + "epoch": 1.220824949698189, + "grad_norm": 0.34450599551200867, + "learning_rate": 7.3945226709587434e-06, + "loss": 0.3356, + "step": 4854 + }, + { + "epoch": 1.2210764587525151, + "grad_norm": 0.3374083638191223, + "learning_rate": 7.393238031654011e-06, + "loss": 0.342, + "step": 4855 + }, + { + "epoch": 1.221327967806841, + "grad_norm": 0.3444235920906067, + "learning_rate": 7.391953187383411e-06, + "loss": 0.3222, + "step": 4856 + }, + { + "epoch": 1.221579476861167, + "grad_norm": 0.33307015895843506, + "learning_rate": 7.390668138256987e-06, + "loss": 0.3378, + "step": 4857 + }, + { + "epoch": 1.221830985915493, + "grad_norm": 0.362884521484375, + "learning_rate": 7.38938288438479e-06, + "loss": 0.3765, + "step": 4858 + }, + { + "epoch": 1.2220824949698188, + "grad_norm": 0.30631768703460693, + "learning_rate": 7.388097425876899e-06, + "loss": 0.3387, + "step": 4859 + }, + { + "epoch": 1.2223340040241448, + "grad_norm": 0.3113538920879364, + "learning_rate": 7.386811762843404e-06, + "loss": 0.3577, + "step": 4860 + }, + { + "epoch": 1.2225855130784709, + "grad_norm": 0.335956871509552, + "learning_rate": 7.385525895394412e-06, + "loss": 0.3272, + "step": 4861 + }, + { + "epoch": 1.2228370221327967, + "grad_norm": 0.3435831367969513, + "learning_rate": 7.384239823640052e-06, + "loss": 0.3509, + "step": 4862 + }, + { + "epoch": 1.2230885311871227, + "grad_norm": 0.31683871150016785, + "learning_rate": 7.382953547690465e-06, + "loss": 0.3362, + "step": 4863 + }, + { + "epoch": 1.2233400402414487, + "grad_norm": 0.33224907517433167, + "learning_rate": 7.381667067655815e-06, + "loss": 0.3766, + "step": 4864 + }, + { + "epoch": 1.2235915492957747, + "grad_norm": 0.34509238600730896, + "learning_rate": 7.380380383646282e-06, + "loss": 0.3812, + "step": 4865 + }, + { + "epoch": 1.2238430583501005, + "grad_norm": 0.36535876989364624, + "learning_rate": 7.379093495772059e-06, + "loss": 0.3212, + "step": 4866 + }, + { + "epoch": 1.2240945674044266, + "grad_norm": 0.3264424204826355, + "learning_rate": 7.377806404143362e-06, + "loss": 0.3563, + "step": 4867 + }, + { + "epoch": 1.2243460764587526, + "grad_norm": 0.332707017660141, + "learning_rate": 7.376519108870423e-06, + "loss": 0.3651, + "step": 4868 + }, + { + "epoch": 1.2245975855130784, + "grad_norm": 0.3452300429344177, + "learning_rate": 7.375231610063488e-06, + "loss": 0.3535, + "step": 4869 + }, + { + "epoch": 1.2248490945674044, + "grad_norm": 0.2937973439693451, + "learning_rate": 7.373943907832826e-06, + "loss": 0.354, + "step": 4870 + }, + { + "epoch": 1.2251006036217305, + "grad_norm": 0.33667927980422974, + "learning_rate": 7.37265600228872e-06, + "loss": 0.3762, + "step": 4871 + }, + { + "epoch": 1.2253521126760563, + "grad_norm": 0.32756221294403076, + "learning_rate": 7.37136789354147e-06, + "loss": 0.35, + "step": 4872 + }, + { + "epoch": 1.2256036217303823, + "grad_norm": 0.3230566680431366, + "learning_rate": 7.370079581701396e-06, + "loss": 0.3338, + "step": 4873 + }, + { + "epoch": 1.2258551307847083, + "grad_norm": 0.3240143358707428, + "learning_rate": 7.368791066878832e-06, + "loss": 0.3427, + "step": 4874 + }, + { + "epoch": 1.2261066398390341, + "grad_norm": 0.3279706835746765, + "learning_rate": 7.367502349184132e-06, + "loss": 0.3649, + "step": 4875 + }, + { + "epoch": 1.2263581488933601, + "grad_norm": 0.35657429695129395, + "learning_rate": 7.366213428727669e-06, + "loss": 0.3473, + "step": 4876 + }, + { + "epoch": 1.2266096579476862, + "grad_norm": 0.3800789713859558, + "learning_rate": 7.364924305619827e-06, + "loss": 0.3424, + "step": 4877 + }, + { + "epoch": 1.226861167002012, + "grad_norm": 0.33746159076690674, + "learning_rate": 7.363634979971012e-06, + "loss": 0.3587, + "step": 4878 + }, + { + "epoch": 1.227112676056338, + "grad_norm": 0.319251149892807, + "learning_rate": 7.362345451891649e-06, + "loss": 0.36, + "step": 4879 + }, + { + "epoch": 1.227364185110664, + "grad_norm": 0.3327750265598297, + "learning_rate": 7.361055721492177e-06, + "loss": 0.3533, + "step": 4880 + }, + { + "epoch": 1.22761569416499, + "grad_norm": 0.3240911364555359, + "learning_rate": 7.359765788883053e-06, + "loss": 0.3473, + "step": 4881 + }, + { + "epoch": 1.2278672032193159, + "grad_norm": 0.3320063054561615, + "learning_rate": 7.358475654174753e-06, + "loss": 0.3587, + "step": 4882 + }, + { + "epoch": 1.2281187122736419, + "grad_norm": 0.31429430842399597, + "learning_rate": 7.357185317477766e-06, + "loss": 0.3512, + "step": 4883 + }, + { + "epoch": 1.228370221327968, + "grad_norm": 0.33201098442077637, + "learning_rate": 7.355894778902605e-06, + "loss": 0.3469, + "step": 4884 + }, + { + "epoch": 1.2286217303822937, + "grad_norm": 0.31669896841049194, + "learning_rate": 7.354604038559794e-06, + "loss": 0.3443, + "step": 4885 + }, + { + "epoch": 1.2288732394366197, + "grad_norm": 0.34272029995918274, + "learning_rate": 7.3533130965598765e-06, + "loss": 0.3383, + "step": 4886 + }, + { + "epoch": 1.2291247484909458, + "grad_norm": 0.3486393988132477, + "learning_rate": 7.352021953013415e-06, + "loss": 0.3458, + "step": 4887 + }, + { + "epoch": 1.2293762575452716, + "grad_norm": 0.383306622505188, + "learning_rate": 7.350730608030987e-06, + "loss": 0.3301, + "step": 4888 + }, + { + "epoch": 1.2296277665995976, + "grad_norm": 0.3175544738769531, + "learning_rate": 7.3494390617231896e-06, + "loss": 0.3521, + "step": 4889 + }, + { + "epoch": 1.2298792756539236, + "grad_norm": 0.32851359248161316, + "learning_rate": 7.348147314200634e-06, + "loss": 0.3298, + "step": 4890 + }, + { + "epoch": 1.2301307847082494, + "grad_norm": 0.35966333746910095, + "learning_rate": 7.346855365573951e-06, + "loss": 0.3386, + "step": 4891 + }, + { + "epoch": 1.2303822937625755, + "grad_norm": 0.34041905403137207, + "learning_rate": 7.345563215953787e-06, + "loss": 0.3561, + "step": 4892 + }, + { + "epoch": 1.2306338028169015, + "grad_norm": 0.3270442485809326, + "learning_rate": 7.3442708654508085e-06, + "loss": 0.3536, + "step": 4893 + }, + { + "epoch": 1.2308853118712273, + "grad_norm": 0.3366083800792694, + "learning_rate": 7.342978314175695e-06, + "loss": 0.3454, + "step": 4894 + }, + { + "epoch": 1.2311368209255533, + "grad_norm": 0.3429853320121765, + "learning_rate": 7.341685562239148e-06, + "loss": 0.3413, + "step": 4895 + }, + { + "epoch": 1.2313883299798793, + "grad_norm": 0.3033541738986969, + "learning_rate": 7.3403926097518805e-06, + "loss": 0.3329, + "step": 4896 + }, + { + "epoch": 1.2316398390342052, + "grad_norm": 0.33623993396759033, + "learning_rate": 7.339099456824628e-06, + "loss": 0.3463, + "step": 4897 + }, + { + "epoch": 1.2318913480885312, + "grad_norm": 0.32286033034324646, + "learning_rate": 7.3378061035681415e-06, + "loss": 0.344, + "step": 4898 + }, + { + "epoch": 1.2321428571428572, + "grad_norm": 0.3481849431991577, + "learning_rate": 7.336512550093186e-06, + "loss": 0.3445, + "step": 4899 + }, + { + "epoch": 1.232394366197183, + "grad_norm": 0.3121998906135559, + "learning_rate": 7.335218796510548e-06, + "loss": 0.3416, + "step": 4900 + }, + { + "epoch": 1.232645875251509, + "grad_norm": 0.3597795367240906, + "learning_rate": 7.333924842931031e-06, + "loss": 0.3588, + "step": 4901 + }, + { + "epoch": 1.232897384305835, + "grad_norm": 0.31854021549224854, + "learning_rate": 7.332630689465449e-06, + "loss": 0.3392, + "step": 4902 + }, + { + "epoch": 1.2331488933601609, + "grad_norm": 0.35786551237106323, + "learning_rate": 7.331336336224643e-06, + "loss": 0.3464, + "step": 4903 + }, + { + "epoch": 1.233400402414487, + "grad_norm": 0.36554989218711853, + "learning_rate": 7.330041783319466e-06, + "loss": 0.3524, + "step": 4904 + }, + { + "epoch": 1.233651911468813, + "grad_norm": 0.3338761627674103, + "learning_rate": 7.328747030860786e-06, + "loss": 0.3245, + "step": 4905 + }, + { + "epoch": 1.2339034205231387, + "grad_norm": 0.33866626024246216, + "learning_rate": 7.327452078959491e-06, + "loss": 0.3601, + "step": 4906 + }, + { + "epoch": 1.2341549295774648, + "grad_norm": 0.3417625427246094, + "learning_rate": 7.326156927726485e-06, + "loss": 0.3723, + "step": 4907 + }, + { + "epoch": 1.2344064386317908, + "grad_norm": 0.402842253446579, + "learning_rate": 7.324861577272693e-06, + "loss": 0.3539, + "step": 4908 + }, + { + "epoch": 1.2346579476861166, + "grad_norm": 0.3404982089996338, + "learning_rate": 7.323566027709049e-06, + "loss": 0.3507, + "step": 4909 + }, + { + "epoch": 1.2349094567404426, + "grad_norm": 0.32714393734931946, + "learning_rate": 7.322270279146512e-06, + "loss": 0.3506, + "step": 4910 + }, + { + "epoch": 1.2351609657947686, + "grad_norm": 0.35655197501182556, + "learning_rate": 7.320974331696053e-06, + "loss": 0.3603, + "step": 4911 + }, + { + "epoch": 1.2354124748490944, + "grad_norm": 0.3561979830265045, + "learning_rate": 7.319678185468662e-06, + "loss": 0.3486, + "step": 4912 + }, + { + "epoch": 1.2356639839034205, + "grad_norm": 0.36184290051460266, + "learning_rate": 7.318381840575347e-06, + "loss": 0.3567, + "step": 4913 + }, + { + "epoch": 1.2359154929577465, + "grad_norm": 0.30532655119895935, + "learning_rate": 7.317085297127131e-06, + "loss": 0.3518, + "step": 4914 + }, + { + "epoch": 1.2361670020120725, + "grad_norm": 0.3469163477420807, + "learning_rate": 7.315788555235055e-06, + "loss": 0.3704, + "step": 4915 + }, + { + "epoch": 1.2364185110663983, + "grad_norm": 0.34762290120124817, + "learning_rate": 7.314491615010178e-06, + "loss": 0.3404, + "step": 4916 + }, + { + "epoch": 1.2366700201207244, + "grad_norm": 0.33168843388557434, + "learning_rate": 7.313194476563572e-06, + "loss": 0.3181, + "step": 4917 + }, + { + "epoch": 1.2369215291750504, + "grad_norm": 0.34730586409568787, + "learning_rate": 7.311897140006331e-06, + "loss": 0.37, + "step": 4918 + }, + { + "epoch": 1.2371730382293762, + "grad_norm": 0.3283190131187439, + "learning_rate": 7.310599605449563e-06, + "loss": 0.3246, + "step": 4919 + }, + { + "epoch": 1.2374245472837022, + "grad_norm": 0.34569719433784485, + "learning_rate": 7.309301873004394e-06, + "loss": 0.3726, + "step": 4920 + }, + { + "epoch": 1.2376760563380282, + "grad_norm": 0.33751311898231506, + "learning_rate": 7.308003942781966e-06, + "loss": 0.3524, + "step": 4921 + }, + { + "epoch": 1.237927565392354, + "grad_norm": 0.31516730785369873, + "learning_rate": 7.30670581489344e-06, + "loss": 0.3436, + "step": 4922 + }, + { + "epoch": 1.23817907444668, + "grad_norm": 0.40124109387397766, + "learning_rate": 7.305407489449991e-06, + "loss": 0.3557, + "step": 4923 + }, + { + "epoch": 1.238430583501006, + "grad_norm": 0.31682252883911133, + "learning_rate": 7.3041089665628125e-06, + "loss": 0.3739, + "step": 4924 + }, + { + "epoch": 1.238682092555332, + "grad_norm": 0.32461637258529663, + "learning_rate": 7.302810246343117e-06, + "loss": 0.3655, + "step": 4925 + }, + { + "epoch": 1.238933601609658, + "grad_norm": 0.32863402366638184, + "learning_rate": 7.30151132890213e-06, + "loss": 0.3388, + "step": 4926 + }, + { + "epoch": 1.239185110663984, + "grad_norm": 0.35274699330329895, + "learning_rate": 7.300212214351095e-06, + "loss": 0.3431, + "step": 4927 + }, + { + "epoch": 1.2394366197183098, + "grad_norm": 0.343986839056015, + "learning_rate": 7.298912902801274e-06, + "loss": 0.3763, + "step": 4928 + }, + { + "epoch": 1.2396881287726358, + "grad_norm": 0.3199062943458557, + "learning_rate": 7.297613394363946e-06, + "loss": 0.3223, + "step": 4929 + }, + { + "epoch": 1.2399396378269618, + "grad_norm": 0.3335794508457184, + "learning_rate": 7.296313689150404e-06, + "loss": 0.37, + "step": 4930 + }, + { + "epoch": 1.2401911468812878, + "grad_norm": 0.365524560213089, + "learning_rate": 7.295013787271959e-06, + "loss": 0.3746, + "step": 4931 + }, + { + "epoch": 1.2404426559356136, + "grad_norm": 0.3475268483161926, + "learning_rate": 7.293713688839941e-06, + "loss": 0.3658, + "step": 4932 + }, + { + "epoch": 1.2406941649899397, + "grad_norm": 0.3196357786655426, + "learning_rate": 7.292413393965696e-06, + "loss": 0.334, + "step": 4933 + }, + { + "epoch": 1.2409456740442657, + "grad_norm": 0.3176928460597992, + "learning_rate": 7.291112902760584e-06, + "loss": 0.3439, + "step": 4934 + }, + { + "epoch": 1.2411971830985915, + "grad_norm": 0.3524039685726166, + "learning_rate": 7.289812215335987e-06, + "loss": 0.3795, + "step": 4935 + }, + { + "epoch": 1.2414486921529175, + "grad_norm": 0.3586216866970062, + "learning_rate": 7.288511331803296e-06, + "loss": 0.3812, + "step": 4936 + }, + { + "epoch": 1.2417002012072436, + "grad_norm": 0.31926751136779785, + "learning_rate": 7.2872102522739286e-06, + "loss": 0.3386, + "step": 4937 + }, + { + "epoch": 1.2419517102615694, + "grad_norm": 0.338290274143219, + "learning_rate": 7.285908976859312e-06, + "loss": 0.3553, + "step": 4938 + }, + { + "epoch": 1.2422032193158954, + "grad_norm": 0.3404878079891205, + "learning_rate": 7.284607505670891e-06, + "loss": 0.3486, + "step": 4939 + }, + { + "epoch": 1.2424547283702214, + "grad_norm": 0.3355543613433838, + "learning_rate": 7.283305838820129e-06, + "loss": 0.3815, + "step": 4940 + }, + { + "epoch": 1.2427062374245472, + "grad_norm": 0.3367030620574951, + "learning_rate": 7.282003976418508e-06, + "loss": 0.3753, + "step": 4941 + }, + { + "epoch": 1.2429577464788732, + "grad_norm": 0.3236370086669922, + "learning_rate": 7.280701918577521e-06, + "loss": 0.3493, + "step": 4942 + }, + { + "epoch": 1.2432092555331993, + "grad_norm": 0.3462064862251282, + "learning_rate": 7.279399665408684e-06, + "loss": 0.3639, + "step": 4943 + }, + { + "epoch": 1.243460764587525, + "grad_norm": 0.35062482953071594, + "learning_rate": 7.278097217023523e-06, + "loss": 0.3644, + "step": 4944 + }, + { + "epoch": 1.243712273641851, + "grad_norm": 0.38055381178855896, + "learning_rate": 7.27679457353359e-06, + "loss": 0.3517, + "step": 4945 + }, + { + "epoch": 1.2439637826961771, + "grad_norm": 0.36633816361427307, + "learning_rate": 7.275491735050444e-06, + "loss": 0.36, + "step": 4946 + }, + { + "epoch": 1.244215291750503, + "grad_norm": 0.3013075590133667, + "learning_rate": 7.274188701685666e-06, + "loss": 0.3564, + "step": 4947 + }, + { + "epoch": 1.244466800804829, + "grad_norm": 0.35343462228775024, + "learning_rate": 7.272885473550855e-06, + "loss": 0.3335, + "step": 4948 + }, + { + "epoch": 1.244718309859155, + "grad_norm": 0.337429404258728, + "learning_rate": 7.271582050757619e-06, + "loss": 0.3329, + "step": 4949 + }, + { + "epoch": 1.2449698189134808, + "grad_norm": 0.3367510735988617, + "learning_rate": 7.2702784334175945e-06, + "loss": 0.3195, + "step": 4950 + }, + { + "epoch": 1.2452213279678068, + "grad_norm": 0.32875868678092957, + "learning_rate": 7.268974621642424e-06, + "loss": 0.3472, + "step": 4951 + }, + { + "epoch": 1.2454728370221329, + "grad_norm": 0.3070097863674164, + "learning_rate": 7.267670615543771e-06, + "loss": 0.3493, + "step": 4952 + }, + { + "epoch": 1.2457243460764587, + "grad_norm": 0.3756659924983978, + "learning_rate": 7.266366415233317e-06, + "loss": 0.3728, + "step": 4953 + }, + { + "epoch": 1.2459758551307847, + "grad_norm": 0.3073340356349945, + "learning_rate": 7.265062020822757e-06, + "loss": 0.3499, + "step": 4954 + }, + { + "epoch": 1.2462273641851107, + "grad_norm": 0.3308236300945282, + "learning_rate": 7.263757432423806e-06, + "loss": 0.385, + "step": 4955 + }, + { + "epoch": 1.2464788732394365, + "grad_norm": 0.3272673487663269, + "learning_rate": 7.262452650148193e-06, + "loss": 0.3626, + "step": 4956 + }, + { + "epoch": 1.2467303822937625, + "grad_norm": 0.33081069588661194, + "learning_rate": 7.2611476741076636e-06, + "loss": 0.3471, + "step": 4957 + }, + { + "epoch": 1.2469818913480886, + "grad_norm": 0.33813828229904175, + "learning_rate": 7.2598425044139835e-06, + "loss": 0.3423, + "step": 4958 + }, + { + "epoch": 1.2472334004024144, + "grad_norm": 0.319319486618042, + "learning_rate": 7.258537141178928e-06, + "loss": 0.3595, + "step": 4959 + }, + { + "epoch": 1.2474849094567404, + "grad_norm": 0.30126598477363586, + "learning_rate": 7.257231584514297e-06, + "loss": 0.3368, + "step": 4960 + }, + { + "epoch": 1.2477364185110664, + "grad_norm": 0.3376123309135437, + "learning_rate": 7.255925834531902e-06, + "loss": 0.34, + "step": 4961 + }, + { + "epoch": 1.2479879275653922, + "grad_norm": 0.3421916663646698, + "learning_rate": 7.254619891343572e-06, + "loss": 0.3507, + "step": 4962 + }, + { + "epoch": 1.2482394366197183, + "grad_norm": 0.29679960012435913, + "learning_rate": 7.253313755061153e-06, + "loss": 0.3465, + "step": 4963 + }, + { + "epoch": 1.2484909456740443, + "grad_norm": 0.3243803381919861, + "learning_rate": 7.252007425796508e-06, + "loss": 0.3479, + "step": 4964 + }, + { + "epoch": 1.2487424547283703, + "grad_norm": 0.3532167971134186, + "learning_rate": 7.2507009036615136e-06, + "loss": 0.346, + "step": 4965 + }, + { + "epoch": 1.2489939637826961, + "grad_norm": 0.30621767044067383, + "learning_rate": 7.24939418876807e-06, + "loss": 0.3443, + "step": 4966 + }, + { + "epoch": 1.2492454728370221, + "grad_norm": 0.3399735689163208, + "learning_rate": 7.248087281228085e-06, + "loss": 0.3302, + "step": 4967 + }, + { + "epoch": 1.2494969818913482, + "grad_norm": 0.34660354256629944, + "learning_rate": 7.246780181153489e-06, + "loss": 0.3406, + "step": 4968 + }, + { + "epoch": 1.249748490945674, + "grad_norm": 0.3369465470314026, + "learning_rate": 7.2454728886562255e-06, + "loss": 0.3625, + "step": 4969 + }, + { + "epoch": 1.25, + "grad_norm": 0.3431127369403839, + "learning_rate": 7.244165403848257e-06, + "loss": 0.3365, + "step": 4970 + }, + { + "epoch": 1.250251509054326, + "grad_norm": 0.3352718651294708, + "learning_rate": 7.242857726841561e-06, + "loss": 0.3753, + "step": 4971 + }, + { + "epoch": 1.2505030181086518, + "grad_norm": 0.3221033215522766, + "learning_rate": 7.2415498577481325e-06, + "loss": 0.342, + "step": 4972 + }, + { + "epoch": 1.2507545271629779, + "grad_norm": 0.33727914094924927, + "learning_rate": 7.240241796679981e-06, + "loss": 0.3419, + "step": 4973 + }, + { + "epoch": 1.2510060362173039, + "grad_norm": 0.3359692096710205, + "learning_rate": 7.238933543749135e-06, + "loss": 0.3671, + "step": 4974 + }, + { + "epoch": 1.25125754527163, + "grad_norm": 0.32565391063690186, + "learning_rate": 7.237625099067638e-06, + "loss": 0.3667, + "step": 4975 + }, + { + "epoch": 1.2515090543259557, + "grad_norm": 0.310983806848526, + "learning_rate": 7.236316462747548e-06, + "loss": 0.3488, + "step": 4976 + }, + { + "epoch": 1.2517605633802817, + "grad_norm": 0.3416151702404022, + "learning_rate": 7.235007634900945e-06, + "loss": 0.3496, + "step": 4977 + }, + { + "epoch": 1.2520120724346078, + "grad_norm": 0.3046923577785492, + "learning_rate": 7.23369861563992e-06, + "loss": 0.3299, + "step": 4978 + }, + { + "epoch": 1.2522635814889336, + "grad_norm": 0.3572548031806946, + "learning_rate": 7.232389405076583e-06, + "loss": 0.3538, + "step": 4979 + }, + { + "epoch": 1.2525150905432596, + "grad_norm": 0.32661741971969604, + "learning_rate": 7.231080003323059e-06, + "loss": 0.3715, + "step": 4980 + }, + { + "epoch": 1.2527665995975856, + "grad_norm": 0.3313870131969452, + "learning_rate": 7.22977041049149e-06, + "loss": 0.3579, + "step": 4981 + }, + { + "epoch": 1.2530181086519114, + "grad_norm": 0.3333343267440796, + "learning_rate": 7.228460626694034e-06, + "loss": 0.318, + "step": 4982 + }, + { + "epoch": 1.2532696177062375, + "grad_norm": 0.3657362163066864, + "learning_rate": 7.227150652042868e-06, + "loss": 0.3484, + "step": 4983 + }, + { + "epoch": 1.2535211267605635, + "grad_norm": 0.3520519733428955, + "learning_rate": 7.225840486650181e-06, + "loss": 0.3439, + "step": 4984 + }, + { + "epoch": 1.2537726358148893, + "grad_norm": 0.3073820173740387, + "learning_rate": 7.2245301306281815e-06, + "loss": 0.3389, + "step": 4985 + }, + { + "epoch": 1.2540241448692153, + "grad_norm": 0.3221682608127594, + "learning_rate": 7.223219584089091e-06, + "loss": 0.3708, + "step": 4986 + }, + { + "epoch": 1.2542756539235413, + "grad_norm": 0.32095620036125183, + "learning_rate": 7.221908847145153e-06, + "loss": 0.3656, + "step": 4987 + }, + { + "epoch": 1.2545271629778671, + "grad_norm": 0.3564129173755646, + "learning_rate": 7.220597919908624e-06, + "loss": 0.3757, + "step": 4988 + }, + { + "epoch": 1.2547786720321932, + "grad_norm": 0.33230066299438477, + "learning_rate": 7.219286802491774e-06, + "loss": 0.3458, + "step": 4989 + }, + { + "epoch": 1.2550301810865192, + "grad_norm": 0.33365389704704285, + "learning_rate": 7.217975495006892e-06, + "loss": 0.3577, + "step": 4990 + }, + { + "epoch": 1.255281690140845, + "grad_norm": 0.3553404211997986, + "learning_rate": 7.216663997566285e-06, + "loss": 0.3845, + "step": 4991 + }, + { + "epoch": 1.255533199195171, + "grad_norm": 0.3103611171245575, + "learning_rate": 7.215352310282275e-06, + "loss": 0.3408, + "step": 4992 + }, + { + "epoch": 1.255784708249497, + "grad_norm": 0.34521424770355225, + "learning_rate": 7.2140404332671986e-06, + "loss": 0.342, + "step": 4993 + }, + { + "epoch": 1.2560362173038229, + "grad_norm": 0.31152811646461487, + "learning_rate": 7.212728366633411e-06, + "loss": 0.3621, + "step": 4994 + }, + { + "epoch": 1.256287726358149, + "grad_norm": 0.335000604391098, + "learning_rate": 7.211416110493279e-06, + "loss": 0.3201, + "step": 4995 + }, + { + "epoch": 1.256539235412475, + "grad_norm": 0.3560849130153656, + "learning_rate": 7.210103664959194e-06, + "loss": 0.373, + "step": 4996 + }, + { + "epoch": 1.2567907444668007, + "grad_norm": 0.357250452041626, + "learning_rate": 7.2087910301435545e-06, + "loss": 0.343, + "step": 4997 + }, + { + "epoch": 1.2570422535211268, + "grad_norm": 0.32552990317344666, + "learning_rate": 7.207478206158782e-06, + "loss": 0.3473, + "step": 4998 + }, + { + "epoch": 1.2572937625754528, + "grad_norm": 0.3105219006538391, + "learning_rate": 7.2061651931173115e-06, + "loss": 0.3472, + "step": 4999 + }, + { + "epoch": 1.2575452716297786, + "grad_norm": 0.3435421586036682, + "learning_rate": 7.204851991131594e-06, + "loss": 0.3495, + "step": 5000 + }, + { + "epoch": 1.2577967806841046, + "grad_norm": 0.34035179018974304, + "learning_rate": 7.203538600314096e-06, + "loss": 0.3422, + "step": 5001 + }, + { + "epoch": 1.2580482897384306, + "grad_norm": 0.3376530408859253, + "learning_rate": 7.2022250207773035e-06, + "loss": 0.3356, + "step": 5002 + }, + { + "epoch": 1.2582997987927564, + "grad_norm": 0.3429863750934601, + "learning_rate": 7.200911252633714e-06, + "loss": 0.3598, + "step": 5003 + }, + { + "epoch": 1.2585513078470825, + "grad_norm": 0.3484230935573578, + "learning_rate": 7.199597295995846e-06, + "loss": 0.3519, + "step": 5004 + }, + { + "epoch": 1.2588028169014085, + "grad_norm": 0.35319894552230835, + "learning_rate": 7.1982831509762294e-06, + "loss": 0.3632, + "step": 5005 + }, + { + "epoch": 1.2590543259557343, + "grad_norm": 0.3312159478664398, + "learning_rate": 7.196968817687413e-06, + "loss": 0.3324, + "step": 5006 + }, + { + "epoch": 1.2593058350100603, + "grad_norm": 0.34530702233314514, + "learning_rate": 7.195654296241963e-06, + "loss": 0.3721, + "step": 5007 + }, + { + "epoch": 1.2595573440643864, + "grad_norm": 0.3831932842731476, + "learning_rate": 7.194339586752457e-06, + "loss": 0.3432, + "step": 5008 + }, + { + "epoch": 1.2598088531187122, + "grad_norm": 0.3508305847644806, + "learning_rate": 7.193024689331493e-06, + "loss": 0.3323, + "step": 5009 + }, + { + "epoch": 1.2600603621730382, + "grad_norm": 0.33183974027633667, + "learning_rate": 7.1917096040916835e-06, + "loss": 0.3467, + "step": 5010 + }, + { + "epoch": 1.2603118712273642, + "grad_norm": 0.33060377836227417, + "learning_rate": 7.190394331145659e-06, + "loss": 0.3598, + "step": 5011 + }, + { + "epoch": 1.26056338028169, + "grad_norm": 0.3584049940109253, + "learning_rate": 7.189078870606063e-06, + "loss": 0.333, + "step": 5012 + }, + { + "epoch": 1.260814889336016, + "grad_norm": 0.35449668765068054, + "learning_rate": 7.187763222585556e-06, + "loss": 0.37, + "step": 5013 + }, + { + "epoch": 1.261066398390342, + "grad_norm": 0.34245765209198, + "learning_rate": 7.186447387196815e-06, + "loss": 0.3424, + "step": 5014 + }, + { + "epoch": 1.2613179074446679, + "grad_norm": 0.3323255777359009, + "learning_rate": 7.1851313645525356e-06, + "loss": 0.3614, + "step": 5015 + }, + { + "epoch": 1.261569416498994, + "grad_norm": 0.35160186886787415, + "learning_rate": 7.183815154765423e-06, + "loss": 0.3396, + "step": 5016 + }, + { + "epoch": 1.26182092555332, + "grad_norm": 0.31636881828308105, + "learning_rate": 7.182498757948207e-06, + "loss": 0.3336, + "step": 5017 + }, + { + "epoch": 1.262072434607646, + "grad_norm": 0.36350032687187195, + "learning_rate": 7.181182174213623e-06, + "loss": 0.3448, + "step": 5018 + }, + { + "epoch": 1.2623239436619718, + "grad_norm": 0.3364773690700531, + "learning_rate": 7.179865403674433e-06, + "loss": 0.3263, + "step": 5019 + }, + { + "epoch": 1.2625754527162978, + "grad_norm": 0.32822203636169434, + "learning_rate": 7.178548446443407e-06, + "loss": 0.3504, + "step": 5020 + }, + { + "epoch": 1.2628269617706238, + "grad_norm": 0.33104854822158813, + "learning_rate": 7.177231302633337e-06, + "loss": 0.3321, + "step": 5021 + }, + { + "epoch": 1.2630784708249496, + "grad_norm": 0.34756192564964294, + "learning_rate": 7.175913972357025e-06, + "loss": 0.3538, + "step": 5022 + }, + { + "epoch": 1.2633299798792756, + "grad_norm": 0.35056936740875244, + "learning_rate": 7.174596455727295e-06, + "loss": 0.352, + "step": 5023 + }, + { + "epoch": 1.2635814889336017, + "grad_norm": 0.3366295099258423, + "learning_rate": 7.173278752856983e-06, + "loss": 0.3545, + "step": 5024 + }, + { + "epoch": 1.2638329979879277, + "grad_norm": 0.3299401104450226, + "learning_rate": 7.171960863858941e-06, + "loss": 0.3385, + "step": 5025 + }, + { + "epoch": 1.2640845070422535, + "grad_norm": 0.3155493140220642, + "learning_rate": 7.17064278884604e-06, + "loss": 0.3304, + "step": 5026 + }, + { + "epoch": 1.2643360160965795, + "grad_norm": 0.3616369068622589, + "learning_rate": 7.169324527931162e-06, + "loss": 0.3593, + "step": 5027 + }, + { + "epoch": 1.2645875251509056, + "grad_norm": 0.3170899748802185, + "learning_rate": 7.16800608122721e-06, + "loss": 0.3077, + "step": 5028 + }, + { + "epoch": 1.2648390342052314, + "grad_norm": 0.3360782563686371, + "learning_rate": 7.1666874488471e-06, + "loss": 0.3558, + "step": 5029 + }, + { + "epoch": 1.2650905432595574, + "grad_norm": 0.32805049419403076, + "learning_rate": 7.165368630903766e-06, + "loss": 0.332, + "step": 5030 + }, + { + "epoch": 1.2653420523138834, + "grad_norm": 0.34620416164398193, + "learning_rate": 7.164049627510154e-06, + "loss": 0.3559, + "step": 5031 + }, + { + "epoch": 1.2655935613682092, + "grad_norm": 0.33738458156585693, + "learning_rate": 7.1627304387792285e-06, + "loss": 0.355, + "step": 5032 + }, + { + "epoch": 1.2658450704225352, + "grad_norm": 0.3080560266971588, + "learning_rate": 7.161411064823973e-06, + "loss": 0.3521, + "step": 5033 + }, + { + "epoch": 1.2660965794768613, + "grad_norm": 0.3708006739616394, + "learning_rate": 7.160091505757381e-06, + "loss": 0.384, + "step": 5034 + }, + { + "epoch": 1.266348088531187, + "grad_norm": 0.34174585342407227, + "learning_rate": 7.158771761692464e-06, + "loss": 0.3298, + "step": 5035 + }, + { + "epoch": 1.266599597585513, + "grad_norm": 0.3278071880340576, + "learning_rate": 7.157451832742253e-06, + "loss": 0.3747, + "step": 5036 + }, + { + "epoch": 1.2668511066398391, + "grad_norm": 0.3372129797935486, + "learning_rate": 7.156131719019789e-06, + "loss": 0.371, + "step": 5037 + }, + { + "epoch": 1.267102615694165, + "grad_norm": 0.3328630030155182, + "learning_rate": 7.15481142063813e-06, + "loss": 0.3402, + "step": 5038 + }, + { + "epoch": 1.267354124748491, + "grad_norm": 0.3106217086315155, + "learning_rate": 7.1534909377103555e-06, + "loss": 0.3318, + "step": 5039 + }, + { + "epoch": 1.267605633802817, + "grad_norm": 0.3225652575492859, + "learning_rate": 7.152170270349553e-06, + "loss": 0.3657, + "step": 5040 + }, + { + "epoch": 1.2678571428571428, + "grad_norm": 0.3484492003917694, + "learning_rate": 7.1508494186688305e-06, + "loss": 0.3852, + "step": 5041 + }, + { + "epoch": 1.2681086519114688, + "grad_norm": 0.3207867741584778, + "learning_rate": 7.149528382781312e-06, + "loss": 0.3607, + "step": 5042 + }, + { + "epoch": 1.2683601609657948, + "grad_norm": 0.32192859053611755, + "learning_rate": 7.148207162800135e-06, + "loss": 0.3463, + "step": 5043 + }, + { + "epoch": 1.2686116700201207, + "grad_norm": 0.31769081950187683, + "learning_rate": 7.146885758838453e-06, + "loss": 0.3437, + "step": 5044 + }, + { + "epoch": 1.2688631790744467, + "grad_norm": 0.3407571315765381, + "learning_rate": 7.145564171009437e-06, + "loss": 0.3505, + "step": 5045 + }, + { + "epoch": 1.2691146881287727, + "grad_norm": 0.32075372338294983, + "learning_rate": 7.144242399426272e-06, + "loss": 0.3455, + "step": 5046 + }, + { + "epoch": 1.2693661971830985, + "grad_norm": 0.30012497305870056, + "learning_rate": 7.14292044420216e-06, + "loss": 0.33, + "step": 5047 + }, + { + "epoch": 1.2696177062374245, + "grad_norm": 0.3274124264717102, + "learning_rate": 7.141598305450319e-06, + "loss": 0.3383, + "step": 5048 + }, + { + "epoch": 1.2698692152917506, + "grad_norm": 0.3339790403842926, + "learning_rate": 7.1402759832839795e-06, + "loss": 0.3598, + "step": 5049 + }, + { + "epoch": 1.2701207243460764, + "grad_norm": 0.33048734068870544, + "learning_rate": 7.138953477816393e-06, + "loss": 0.3401, + "step": 5050 + }, + { + "epoch": 1.2703722334004024, + "grad_norm": 0.3411872386932373, + "learning_rate": 7.137630789160821e-06, + "loss": 0.3694, + "step": 5051 + }, + { + "epoch": 1.2706237424547284, + "grad_norm": 0.3702620267868042, + "learning_rate": 7.136307917430545e-06, + "loss": 0.3455, + "step": 5052 + }, + { + "epoch": 1.2708752515090542, + "grad_norm": 0.3102932274341583, + "learning_rate": 7.1349848627388616e-06, + "loss": 0.3652, + "step": 5053 + }, + { + "epoch": 1.2711267605633803, + "grad_norm": 0.3049952983856201, + "learning_rate": 7.13366162519908e-06, + "loss": 0.3456, + "step": 5054 + }, + { + "epoch": 1.2713782696177063, + "grad_norm": 0.34085172414779663, + "learning_rate": 7.132338204924529e-06, + "loss": 0.3461, + "step": 5055 + }, + { + "epoch": 1.271629778672032, + "grad_norm": 0.33940356969833374, + "learning_rate": 7.131014602028551e-06, + "loss": 0.3462, + "step": 5056 + }, + { + "epoch": 1.2718812877263581, + "grad_norm": 0.3196280896663666, + "learning_rate": 7.129690816624504e-06, + "loss": 0.357, + "step": 5057 + }, + { + "epoch": 1.2721327967806841, + "grad_norm": 0.3302951157093048, + "learning_rate": 7.128366848825761e-06, + "loss": 0.3587, + "step": 5058 + }, + { + "epoch": 1.27238430583501, + "grad_norm": 0.3771762549877167, + "learning_rate": 7.127042698745712e-06, + "loss": 0.3846, + "step": 5059 + }, + { + "epoch": 1.272635814889336, + "grad_norm": 0.3365668058395386, + "learning_rate": 7.125718366497763e-06, + "loss": 0.3886, + "step": 5060 + }, + { + "epoch": 1.272887323943662, + "grad_norm": 0.299969345331192, + "learning_rate": 7.124393852195335e-06, + "loss": 0.3545, + "step": 5061 + }, + { + "epoch": 1.2731388329979878, + "grad_norm": 0.3523898422718048, + "learning_rate": 7.123069155951864e-06, + "loss": 0.3674, + "step": 5062 + }, + { + "epoch": 1.2733903420523138, + "grad_norm": 0.3061320185661316, + "learning_rate": 7.1217442778808e-06, + "loss": 0.3479, + "step": 5063 + }, + { + "epoch": 1.2736418511066399, + "grad_norm": 0.37146908044815063, + "learning_rate": 7.120419218095614e-06, + "loss": 0.3602, + "step": 5064 + }, + { + "epoch": 1.2738933601609657, + "grad_norm": 0.32229381799697876, + "learning_rate": 7.119093976709785e-06, + "loss": 0.3616, + "step": 5065 + }, + { + "epoch": 1.2741448692152917, + "grad_norm": 0.3241557776927948, + "learning_rate": 7.117768553836816e-06, + "loss": 0.3269, + "step": 5066 + }, + { + "epoch": 1.2743963782696177, + "grad_norm": 0.33732870221138, + "learning_rate": 7.1164429495902185e-06, + "loss": 0.3396, + "step": 5067 + }, + { + "epoch": 1.2746478873239437, + "grad_norm": 0.34197482466697693, + "learning_rate": 7.115117164083522e-06, + "loss": 0.3481, + "step": 5068 + }, + { + "epoch": 1.2748993963782695, + "grad_norm": 0.32349589467048645, + "learning_rate": 7.113791197430275e-06, + "loss": 0.3505, + "step": 5069 + }, + { + "epoch": 1.2751509054325956, + "grad_norm": 0.31883949041366577, + "learning_rate": 7.112465049744033e-06, + "loss": 0.305, + "step": 5070 + }, + { + "epoch": 1.2754024144869216, + "grad_norm": 0.3327849507331848, + "learning_rate": 7.111138721138376e-06, + "loss": 0.3398, + "step": 5071 + }, + { + "epoch": 1.2756539235412474, + "grad_norm": 0.3398069143295288, + "learning_rate": 7.109812211726895e-06, + "loss": 0.3686, + "step": 5072 + }, + { + "epoch": 1.2759054325955734, + "grad_norm": 0.3131691515445709, + "learning_rate": 7.108485521623196e-06, + "loss": 0.3489, + "step": 5073 + }, + { + "epoch": 1.2761569416498995, + "grad_norm": 0.3182958662509918, + "learning_rate": 7.107158650940904e-06, + "loss": 0.349, + "step": 5074 + }, + { + "epoch": 1.2764084507042255, + "grad_norm": 0.2990667521953583, + "learning_rate": 7.105831599793655e-06, + "loss": 0.3483, + "step": 5075 + }, + { + "epoch": 1.2766599597585513, + "grad_norm": 0.3262629508972168, + "learning_rate": 7.104504368295105e-06, + "loss": 0.364, + "step": 5076 + }, + { + "epoch": 1.2769114688128773, + "grad_norm": 0.3283725082874298, + "learning_rate": 7.10317695655892e-06, + "loss": 0.356, + "step": 5077 + }, + { + "epoch": 1.2771629778672033, + "grad_norm": 0.31310954689979553, + "learning_rate": 7.101849364698786e-06, + "loss": 0.3504, + "step": 5078 + }, + { + "epoch": 1.2774144869215291, + "grad_norm": 0.3293515145778656, + "learning_rate": 7.100521592828405e-06, + "loss": 0.3362, + "step": 5079 + }, + { + "epoch": 1.2776659959758552, + "grad_norm": 0.31112322211265564, + "learning_rate": 7.0991936410614885e-06, + "loss": 0.3425, + "step": 5080 + }, + { + "epoch": 1.2779175050301812, + "grad_norm": 0.3442460298538208, + "learning_rate": 7.09786550951177e-06, + "loss": 0.3326, + "step": 5081 + }, + { + "epoch": 1.278169014084507, + "grad_norm": 0.3595294952392578, + "learning_rate": 7.096537198292994e-06, + "loss": 0.3586, + "step": 5082 + }, + { + "epoch": 1.278420523138833, + "grad_norm": 0.359210729598999, + "learning_rate": 7.0952087075189235e-06, + "loss": 0.382, + "step": 5083 + }, + { + "epoch": 1.278672032193159, + "grad_norm": 0.3556901216506958, + "learning_rate": 7.0938800373033355e-06, + "loss": 0.3755, + "step": 5084 + }, + { + "epoch": 1.2789235412474849, + "grad_norm": 0.3710108697414398, + "learning_rate": 7.0925511877600195e-06, + "loss": 0.3571, + "step": 5085 + }, + { + "epoch": 1.279175050301811, + "grad_norm": 0.3306417465209961, + "learning_rate": 7.091222159002786e-06, + "loss": 0.3409, + "step": 5086 + }, + { + "epoch": 1.279426559356137, + "grad_norm": 0.35231444239616394, + "learning_rate": 7.0898929511454585e-06, + "loss": 0.3652, + "step": 5087 + }, + { + "epoch": 1.2796780684104627, + "grad_norm": 0.35659119486808777, + "learning_rate": 7.088563564301874e-06, + "loss": 0.3515, + "step": 5088 + }, + { + "epoch": 1.2799295774647887, + "grad_norm": 0.35471397638320923, + "learning_rate": 7.0872339985858855e-06, + "loss": 0.3801, + "step": 5089 + }, + { + "epoch": 1.2801810865191148, + "grad_norm": 0.3614024221897125, + "learning_rate": 7.085904254111362e-06, + "loss": 0.3543, + "step": 5090 + }, + { + "epoch": 1.2804325955734406, + "grad_norm": 0.3554777503013611, + "learning_rate": 7.0845743309921896e-06, + "loss": 0.3349, + "step": 5091 + }, + { + "epoch": 1.2806841046277666, + "grad_norm": 0.33316969871520996, + "learning_rate": 7.083244229342266e-06, + "loss": 0.3471, + "step": 5092 + }, + { + "epoch": 1.2809356136820926, + "grad_norm": 0.3104395866394043, + "learning_rate": 7.081913949275508e-06, + "loss": 0.3418, + "step": 5093 + }, + { + "epoch": 1.2811871227364184, + "grad_norm": 0.3648764491081238, + "learning_rate": 7.080583490905845e-06, + "loss": 0.3543, + "step": 5094 + }, + { + "epoch": 1.2814386317907445, + "grad_norm": 0.36732056736946106, + "learning_rate": 7.07925285434722e-06, + "loss": 0.3516, + "step": 5095 + }, + { + "epoch": 1.2816901408450705, + "grad_norm": 0.31676825881004333, + "learning_rate": 7.077922039713596e-06, + "loss": 0.3175, + "step": 5096 + }, + { + "epoch": 1.2819416498993963, + "grad_norm": 0.35175764560699463, + "learning_rate": 7.07659104711895e-06, + "loss": 0.3602, + "step": 5097 + }, + { + "epoch": 1.2821931589537223, + "grad_norm": 0.3779788911342621, + "learning_rate": 7.075259876677272e-06, + "loss": 0.3549, + "step": 5098 + }, + { + "epoch": 1.2824446680080483, + "grad_norm": 0.35006478428840637, + "learning_rate": 7.073928528502569e-06, + "loss": 0.3774, + "step": 5099 + }, + { + "epoch": 1.2826961770623742, + "grad_norm": 0.3286049962043762, + "learning_rate": 7.07259700270886e-06, + "loss": 0.3319, + "step": 5100 + }, + { + "epoch": 1.2829476861167002, + "grad_norm": 0.3339432179927826, + "learning_rate": 7.071265299410185e-06, + "loss": 0.3614, + "step": 5101 + }, + { + "epoch": 1.2831991951710262, + "grad_norm": 0.34671711921691895, + "learning_rate": 7.069933418720594e-06, + "loss": 0.3644, + "step": 5102 + }, + { + "epoch": 1.283450704225352, + "grad_norm": 0.33326461911201477, + "learning_rate": 7.068601360754157e-06, + "loss": 0.3369, + "step": 5103 + }, + { + "epoch": 1.283702213279678, + "grad_norm": 0.3220560550689697, + "learning_rate": 7.0672691256249526e-06, + "loss": 0.3421, + "step": 5104 + }, + { + "epoch": 1.283953722334004, + "grad_norm": 0.3266070485115051, + "learning_rate": 7.065936713447081e-06, + "loss": 0.3523, + "step": 5105 + }, + { + "epoch": 1.2842052313883299, + "grad_norm": 0.33557188510894775, + "learning_rate": 7.064604124334655e-06, + "loss": 0.3568, + "step": 5106 + }, + { + "epoch": 1.284456740442656, + "grad_norm": 0.33787620067596436, + "learning_rate": 7.063271358401802e-06, + "loss": 0.3501, + "step": 5107 + }, + { + "epoch": 1.284708249496982, + "grad_norm": 0.31968948245048523, + "learning_rate": 7.061938415762664e-06, + "loss": 0.3282, + "step": 5108 + }, + { + "epoch": 1.2849597585513077, + "grad_norm": 0.309199720621109, + "learning_rate": 7.060605296531401e-06, + "loss": 0.3448, + "step": 5109 + }, + { + "epoch": 1.2852112676056338, + "grad_norm": 0.31305763125419617, + "learning_rate": 7.059272000822185e-06, + "loss": 0.3649, + "step": 5110 + }, + { + "epoch": 1.2854627766599598, + "grad_norm": 0.3350978195667267, + "learning_rate": 7.057938528749204e-06, + "loss": 0.3349, + "step": 5111 + }, + { + "epoch": 1.2857142857142856, + "grad_norm": 0.3685004413127899, + "learning_rate": 7.056604880426664e-06, + "loss": 0.3658, + "step": 5112 + }, + { + "epoch": 1.2859657947686116, + "grad_norm": 0.3085992634296417, + "learning_rate": 7.055271055968782e-06, + "loss": 0.3426, + "step": 5113 + }, + { + "epoch": 1.2862173038229376, + "grad_norm": 0.4011576175689697, + "learning_rate": 7.05393705548979e-06, + "loss": 0.3575, + "step": 5114 + }, + { + "epoch": 1.2864688128772634, + "grad_norm": 0.3109534680843353, + "learning_rate": 7.052602879103939e-06, + "loss": 0.3262, + "step": 5115 + }, + { + "epoch": 1.2867203219315895, + "grad_norm": 0.3395739197731018, + "learning_rate": 7.051268526925493e-06, + "loss": 0.3527, + "step": 5116 + }, + { + "epoch": 1.2869718309859155, + "grad_norm": 0.31294944882392883, + "learning_rate": 7.049933999068731e-06, + "loss": 0.3524, + "step": 5117 + }, + { + "epoch": 1.2872233400402415, + "grad_norm": 0.36359307169914246, + "learning_rate": 7.0485992956479466e-06, + "loss": 0.338, + "step": 5118 + }, + { + "epoch": 1.2874748490945673, + "grad_norm": 0.32687315344810486, + "learning_rate": 7.0472644167774474e-06, + "loss": 0.3405, + "step": 5119 + }, + { + "epoch": 1.2877263581488934, + "grad_norm": 0.28143706917762756, + "learning_rate": 7.045929362571559e-06, + "loss": 0.3903, + "step": 5120 + }, + { + "epoch": 1.2879778672032194, + "grad_norm": 0.32885175943374634, + "learning_rate": 7.04459413314462e-06, + "loss": 0.3788, + "step": 5121 + }, + { + "epoch": 1.2882293762575452, + "grad_norm": 0.31145739555358887, + "learning_rate": 7.0432587286109845e-06, + "loss": 0.3523, + "step": 5122 + }, + { + "epoch": 1.2884808853118712, + "grad_norm": 0.3306049108505249, + "learning_rate": 7.041923149085021e-06, + "loss": 0.3387, + "step": 5123 + }, + { + "epoch": 1.2887323943661972, + "grad_norm": 0.31860941648483276, + "learning_rate": 7.040587394681115e-06, + "loss": 0.3411, + "step": 5124 + }, + { + "epoch": 1.2889839034205233, + "grad_norm": 0.37950772047042847, + "learning_rate": 7.039251465513664e-06, + "loss": 0.3654, + "step": 5125 + }, + { + "epoch": 1.289235412474849, + "grad_norm": 0.3230748474597931, + "learning_rate": 7.037915361697082e-06, + "loss": 0.372, + "step": 5126 + }, + { + "epoch": 1.289486921529175, + "grad_norm": 0.3333132266998291, + "learning_rate": 7.036579083345799e-06, + "loss": 0.3265, + "step": 5127 + }, + { + "epoch": 1.2897384305835011, + "grad_norm": 0.3597933053970337, + "learning_rate": 7.035242630574257e-06, + "loss": 0.363, + "step": 5128 + }, + { + "epoch": 1.289989939637827, + "grad_norm": 0.3329184055328369, + "learning_rate": 7.033906003496917e-06, + "loss": 0.3719, + "step": 5129 + }, + { + "epoch": 1.290241448692153, + "grad_norm": 0.32776138186454773, + "learning_rate": 7.0325692022282535e-06, + "loss": 0.3565, + "step": 5130 + }, + { + "epoch": 1.290492957746479, + "grad_norm": 0.3282029628753662, + "learning_rate": 7.031232226882752e-06, + "loss": 0.345, + "step": 5131 + }, + { + "epoch": 1.2907444668008048, + "grad_norm": 0.36610522866249084, + "learning_rate": 7.029895077574918e-06, + "loss": 0.331, + "step": 5132 + }, + { + "epoch": 1.2909959758551308, + "grad_norm": 0.3236190974712372, + "learning_rate": 7.028557754419269e-06, + "loss": 0.3581, + "step": 5133 + }, + { + "epoch": 1.2912474849094568, + "grad_norm": 0.34408023953437805, + "learning_rate": 7.0272202575303395e-06, + "loss": 0.3365, + "step": 5134 + }, + { + "epoch": 1.2914989939637826, + "grad_norm": 0.32077619433403015, + "learning_rate": 7.025882587022676e-06, + "loss": 0.3443, + "step": 5135 + }, + { + "epoch": 1.2917505030181087, + "grad_norm": 0.3198366165161133, + "learning_rate": 7.024544743010845e-06, + "loss": 0.348, + "step": 5136 + }, + { + "epoch": 1.2920020120724347, + "grad_norm": 0.33954963088035583, + "learning_rate": 7.023206725609421e-06, + "loss": 0.3489, + "step": 5137 + }, + { + "epoch": 1.2922535211267605, + "grad_norm": 0.3308905065059662, + "learning_rate": 7.021868534932998e-06, + "loss": 0.367, + "step": 5138 + }, + { + "epoch": 1.2925050301810865, + "grad_norm": 0.3465730547904968, + "learning_rate": 7.0205301710961825e-06, + "loss": 0.3596, + "step": 5139 + }, + { + "epoch": 1.2927565392354126, + "grad_norm": 0.3291175067424774, + "learning_rate": 7.0191916342136e-06, + "loss": 0.3774, + "step": 5140 + }, + { + "epoch": 1.2930080482897384, + "grad_norm": 0.3781532645225525, + "learning_rate": 7.017852924399886e-06, + "loss": 0.3358, + "step": 5141 + }, + { + "epoch": 1.2932595573440644, + "grad_norm": 0.3416769504547119, + "learning_rate": 7.016514041769693e-06, + "loss": 0.34, + "step": 5142 + }, + { + "epoch": 1.2935110663983904, + "grad_norm": 0.31418657302856445, + "learning_rate": 7.015174986437687e-06, + "loss": 0.3507, + "step": 5143 + }, + { + "epoch": 1.2937625754527162, + "grad_norm": 0.394488662481308, + "learning_rate": 7.01383575851855e-06, + "loss": 0.3669, + "step": 5144 + }, + { + "epoch": 1.2940140845070423, + "grad_norm": 0.3305054008960724, + "learning_rate": 7.012496358126979e-06, + "loss": 0.37, + "step": 5145 + }, + { + "epoch": 1.2942655935613683, + "grad_norm": 0.33333835005760193, + "learning_rate": 7.011156785377686e-06, + "loss": 0.3599, + "step": 5146 + }, + { + "epoch": 1.294517102615694, + "grad_norm": 0.32395216822624207, + "learning_rate": 7.009817040385395e-06, + "loss": 0.3402, + "step": 5147 + }, + { + "epoch": 1.29476861167002, + "grad_norm": 0.35054081678390503, + "learning_rate": 7.008477123264849e-06, + "loss": 0.3328, + "step": 5148 + }, + { + "epoch": 1.2950201207243461, + "grad_norm": 0.32759636640548706, + "learning_rate": 7.007137034130801e-06, + "loss": 0.3594, + "step": 5149 + }, + { + "epoch": 1.295271629778672, + "grad_norm": 0.3191884160041809, + "learning_rate": 7.005796773098023e-06, + "loss": 0.3493, + "step": 5150 + }, + { + "epoch": 1.295523138832998, + "grad_norm": 0.33479100465774536, + "learning_rate": 7.0044563402813e-06, + "loss": 0.3783, + "step": 5151 + }, + { + "epoch": 1.295774647887324, + "grad_norm": 0.3412805199623108, + "learning_rate": 7.003115735795431e-06, + "loss": 0.3371, + "step": 5152 + }, + { + "epoch": 1.2960261569416498, + "grad_norm": 0.3714444041252136, + "learning_rate": 7.00177495975523e-06, + "loss": 0.3369, + "step": 5153 + }, + { + "epoch": 1.2962776659959758, + "grad_norm": 0.33331310749053955, + "learning_rate": 7.000434012275528e-06, + "loss": 0.3524, + "step": 5154 + }, + { + "epoch": 1.2965291750503019, + "grad_norm": 0.36643174290657043, + "learning_rate": 6.999092893471166e-06, + "loss": 0.3589, + "step": 5155 + }, + { + "epoch": 1.2967806841046277, + "grad_norm": 0.35542479157447815, + "learning_rate": 6.997751603457006e-06, + "loss": 0.3364, + "step": 5156 + }, + { + "epoch": 1.2970321931589537, + "grad_norm": 0.3719502389431, + "learning_rate": 6.996410142347918e-06, + "loss": 0.3643, + "step": 5157 + }, + { + "epoch": 1.2972837022132797, + "grad_norm": 0.3411712348461151, + "learning_rate": 6.995068510258791e-06, + "loss": 0.3813, + "step": 5158 + }, + { + "epoch": 1.2975352112676055, + "grad_norm": 0.3225451707839966, + "learning_rate": 6.993726707304527e-06, + "loss": 0.3343, + "step": 5159 + }, + { + "epoch": 1.2977867203219315, + "grad_norm": 0.3910596966743469, + "learning_rate": 6.992384733600044e-06, + "loss": 0.3578, + "step": 5160 + }, + { + "epoch": 1.2980382293762576, + "grad_norm": 0.3352847993373871, + "learning_rate": 6.991042589260271e-06, + "loss": 0.3369, + "step": 5161 + }, + { + "epoch": 1.2982897384305834, + "grad_norm": 0.3266632854938507, + "learning_rate": 6.989700274400157e-06, + "loss": 0.3751, + "step": 5162 + }, + { + "epoch": 1.2985412474849094, + "grad_norm": 0.36302119493484497, + "learning_rate": 6.988357789134662e-06, + "loss": 0.3549, + "step": 5163 + }, + { + "epoch": 1.2987927565392354, + "grad_norm": 0.3544050455093384, + "learning_rate": 6.987015133578763e-06, + "loss": 0.3655, + "step": 5164 + }, + { + "epoch": 1.2990442655935612, + "grad_norm": 0.3703245520591736, + "learning_rate": 6.985672307847447e-06, + "loss": 0.3511, + "step": 5165 + }, + { + "epoch": 1.2992957746478873, + "grad_norm": 0.3711421489715576, + "learning_rate": 6.98432931205572e-06, + "loss": 0.3394, + "step": 5166 + }, + { + "epoch": 1.2995472837022133, + "grad_norm": 0.37200912833213806, + "learning_rate": 6.982986146318602e-06, + "loss": 0.3642, + "step": 5167 + }, + { + "epoch": 1.2997987927565393, + "grad_norm": 0.35503825545310974, + "learning_rate": 6.981642810751126e-06, + "loss": 0.3262, + "step": 5168 + }, + { + "epoch": 1.3000503018108651, + "grad_norm": 0.35312941670417786, + "learning_rate": 6.980299305468341e-06, + "loss": 0.3773, + "step": 5169 + }, + { + "epoch": 1.3003018108651911, + "grad_norm": 0.35091546177864075, + "learning_rate": 6.978955630585309e-06, + "loss": 0.3409, + "step": 5170 + }, + { + "epoch": 1.3005533199195172, + "grad_norm": 0.30944791436195374, + "learning_rate": 6.9776117862171065e-06, + "loss": 0.3384, + "step": 5171 + }, + { + "epoch": 1.3008048289738432, + "grad_norm": 0.36516040563583374, + "learning_rate": 6.976267772478828e-06, + "loss": 0.3533, + "step": 5172 + }, + { + "epoch": 1.301056338028169, + "grad_norm": 0.34464800357818604, + "learning_rate": 6.974923589485577e-06, + "loss": 0.3366, + "step": 5173 + }, + { + "epoch": 1.301307847082495, + "grad_norm": 0.3472990095615387, + "learning_rate": 6.973579237352475e-06, + "loss": 0.3472, + "step": 5174 + }, + { + "epoch": 1.301559356136821, + "grad_norm": 0.33396056294441223, + "learning_rate": 6.97223471619466e-06, + "loss": 0.3405, + "step": 5175 + }, + { + "epoch": 1.3018108651911469, + "grad_norm": 0.31734970211982727, + "learning_rate": 6.970890026127278e-06, + "loss": 0.34, + "step": 5176 + }, + { + "epoch": 1.3020623742454729, + "grad_norm": 0.34466373920440674, + "learning_rate": 6.9695451672654965e-06, + "loss": 0.3438, + "step": 5177 + }, + { + "epoch": 1.302313883299799, + "grad_norm": 0.3734425902366638, + "learning_rate": 6.968200139724492e-06, + "loss": 0.3757, + "step": 5178 + }, + { + "epoch": 1.3025653923541247, + "grad_norm": 0.3214707374572754, + "learning_rate": 6.966854943619459e-06, + "loss": 0.3634, + "step": 5179 + }, + { + "epoch": 1.3028169014084507, + "grad_norm": 0.32335829734802246, + "learning_rate": 6.965509579065605e-06, + "loss": 0.3365, + "step": 5180 + }, + { + "epoch": 1.3030684104627768, + "grad_norm": 0.3490079641342163, + "learning_rate": 6.964164046178151e-06, + "loss": 0.3431, + "step": 5181 + }, + { + "epoch": 1.3033199195171026, + "grad_norm": 0.3411654531955719, + "learning_rate": 6.962818345072333e-06, + "loss": 0.365, + "step": 5182 + }, + { + "epoch": 1.3035714285714286, + "grad_norm": 0.3305014669895172, + "learning_rate": 6.961472475863406e-06, + "loss": 0.342, + "step": 5183 + }, + { + "epoch": 1.3038229376257546, + "grad_norm": 0.33585986495018005, + "learning_rate": 6.96012643866663e-06, + "loss": 0.336, + "step": 5184 + }, + { + "epoch": 1.3040744466800804, + "grad_norm": 0.3303409516811371, + "learning_rate": 6.958780233597289e-06, + "loss": 0.3481, + "step": 5185 + }, + { + "epoch": 1.3043259557344065, + "grad_norm": 0.362411230802536, + "learning_rate": 6.957433860770674e-06, + "loss": 0.3593, + "step": 5186 + }, + { + "epoch": 1.3045774647887325, + "grad_norm": 0.3479267656803131, + "learning_rate": 6.956087320302094e-06, + "loss": 0.3674, + "step": 5187 + }, + { + "epoch": 1.3048289738430583, + "grad_norm": 0.3524285554885864, + "learning_rate": 6.9547406123068724e-06, + "loss": 0.3559, + "step": 5188 + }, + { + "epoch": 1.3050804828973843, + "grad_norm": 0.3306998014450073, + "learning_rate": 6.953393736900346e-06, + "loss": 0.3547, + "step": 5189 + }, + { + "epoch": 1.3053319919517103, + "grad_norm": 0.35222721099853516, + "learning_rate": 6.9520466941978685e-06, + "loss": 0.3576, + "step": 5190 + }, + { + "epoch": 1.3055835010060362, + "grad_norm": 0.30763161182403564, + "learning_rate": 6.950699484314802e-06, + "loss": 0.359, + "step": 5191 + }, + { + "epoch": 1.3058350100603622, + "grad_norm": 0.3283224105834961, + "learning_rate": 6.949352107366528e-06, + "loss": 0.3612, + "step": 5192 + }, + { + "epoch": 1.3060865191146882, + "grad_norm": 0.32542482018470764, + "learning_rate": 6.9480045634684405e-06, + "loss": 0.3491, + "step": 5193 + }, + { + "epoch": 1.306338028169014, + "grad_norm": 0.3252429962158203, + "learning_rate": 6.94665685273595e-06, + "loss": 0.3556, + "step": 5194 + }, + { + "epoch": 1.30658953722334, + "grad_norm": 0.30148446559906006, + "learning_rate": 6.945308975284478e-06, + "loss": 0.347, + "step": 5195 + }, + { + "epoch": 1.306841046277666, + "grad_norm": 0.3339576721191406, + "learning_rate": 6.943960931229462e-06, + "loss": 0.3376, + "step": 5196 + }, + { + "epoch": 1.3070925553319919, + "grad_norm": 0.3315853476524353, + "learning_rate": 6.942612720686355e-06, + "loss": 0.3464, + "step": 5197 + }, + { + "epoch": 1.307344064386318, + "grad_norm": 0.3184394836425781, + "learning_rate": 6.9412643437706194e-06, + "loss": 0.3549, + "step": 5198 + }, + { + "epoch": 1.307595573440644, + "grad_norm": 0.34649431705474854, + "learning_rate": 6.939915800597738e-06, + "loss": 0.3625, + "step": 5199 + }, + { + "epoch": 1.3078470824949697, + "grad_norm": 0.34910690784454346, + "learning_rate": 6.938567091283205e-06, + "loss": 0.3417, + "step": 5200 + }, + { + "epoch": 1.3080985915492958, + "grad_norm": 0.34020349383354187, + "learning_rate": 6.937218215942527e-06, + "loss": 0.3706, + "step": 5201 + }, + { + "epoch": 1.3083501006036218, + "grad_norm": 0.30801448225975037, + "learning_rate": 6.935869174691229e-06, + "loss": 0.351, + "step": 5202 + }, + { + "epoch": 1.3086016096579476, + "grad_norm": 0.3194606900215149, + "learning_rate": 6.934519967644847e-06, + "loss": 0.3343, + "step": 5203 + }, + { + "epoch": 1.3088531187122736, + "grad_norm": 0.3294532001018524, + "learning_rate": 6.93317059491893e-06, + "loss": 0.3352, + "step": 5204 + }, + { + "epoch": 1.3091046277665996, + "grad_norm": 0.3065512180328369, + "learning_rate": 6.931821056629048e-06, + "loss": 0.362, + "step": 5205 + }, + { + "epoch": 1.3093561368209254, + "grad_norm": 0.31400826573371887, + "learning_rate": 6.930471352890777e-06, + "loss": 0.3369, + "step": 5206 + }, + { + "epoch": 1.3096076458752515, + "grad_norm": 0.32463496923446655, + "learning_rate": 6.9291214838197114e-06, + "loss": 0.3772, + "step": 5207 + }, + { + "epoch": 1.3098591549295775, + "grad_norm": 0.3142559230327606, + "learning_rate": 6.92777144953146e-06, + "loss": 0.3552, + "step": 5208 + }, + { + "epoch": 1.3101106639839033, + "grad_norm": 0.33447784185409546, + "learning_rate": 6.926421250141644e-06, + "loss": 0.372, + "step": 5209 + }, + { + "epoch": 1.3103621730382293, + "grad_norm": 0.34302520751953125, + "learning_rate": 6.925070885765899e-06, + "loss": 0.335, + "step": 5210 + }, + { + "epoch": 1.3106136820925554, + "grad_norm": 0.32417216897010803, + "learning_rate": 6.923720356519877e-06, + "loss": 0.3592, + "step": 5211 + }, + { + "epoch": 1.3108651911468812, + "grad_norm": 0.30292844772338867, + "learning_rate": 6.922369662519239e-06, + "loss": 0.338, + "step": 5212 + }, + { + "epoch": 1.3111167002012072, + "grad_norm": 0.32632318139076233, + "learning_rate": 6.921018803879667e-06, + "loss": 0.3547, + "step": 5213 + }, + { + "epoch": 1.3113682092555332, + "grad_norm": 0.3217123746871948, + "learning_rate": 6.919667780716852e-06, + "loss": 0.3661, + "step": 5214 + }, + { + "epoch": 1.311619718309859, + "grad_norm": 0.32875531911849976, + "learning_rate": 6.918316593146501e-06, + "loss": 0.3559, + "step": 5215 + }, + { + "epoch": 1.311871227364185, + "grad_norm": 0.33865031599998474, + "learning_rate": 6.916965241284335e-06, + "loss": 0.3564, + "step": 5216 + }, + { + "epoch": 1.312122736418511, + "grad_norm": 0.2909534275531769, + "learning_rate": 6.9156137252460885e-06, + "loss": 0.3394, + "step": 5217 + }, + { + "epoch": 1.312374245472837, + "grad_norm": 0.3667255640029907, + "learning_rate": 6.91426204514751e-06, + "loss": 0.355, + "step": 5218 + }, + { + "epoch": 1.312625754527163, + "grad_norm": 0.34232789278030396, + "learning_rate": 6.912910201104363e-06, + "loss": 0.343, + "step": 5219 + }, + { + "epoch": 1.312877263581489, + "grad_norm": 0.33543142676353455, + "learning_rate": 6.911558193232426e-06, + "loss": 0.3417, + "step": 5220 + }, + { + "epoch": 1.313128772635815, + "grad_norm": 0.3160308003425598, + "learning_rate": 6.910206021647487e-06, + "loss": 0.3491, + "step": 5221 + }, + { + "epoch": 1.313380281690141, + "grad_norm": 0.3830724060535431, + "learning_rate": 6.908853686465353e-06, + "loss": 0.3414, + "step": 5222 + }, + { + "epoch": 1.3136317907444668, + "grad_norm": 0.33467820286750793, + "learning_rate": 6.907501187801843e-06, + "loss": 0.3589, + "step": 5223 + }, + { + "epoch": 1.3138832997987928, + "grad_norm": 0.3496091961860657, + "learning_rate": 6.906148525772789e-06, + "loss": 0.3608, + "step": 5224 + }, + { + "epoch": 1.3141348088531188, + "grad_norm": 0.3401433229446411, + "learning_rate": 6.904795700494038e-06, + "loss": 0.3581, + "step": 5225 + }, + { + "epoch": 1.3143863179074446, + "grad_norm": 0.3567175567150116, + "learning_rate": 6.9034427120814505e-06, + "loss": 0.337, + "step": 5226 + }, + { + "epoch": 1.3146378269617707, + "grad_norm": 0.33114707469940186, + "learning_rate": 6.902089560650904e-06, + "loss": 0.3523, + "step": 5227 + }, + { + "epoch": 1.3148893360160967, + "grad_norm": 0.3312157988548279, + "learning_rate": 6.900736246318287e-06, + "loss": 0.353, + "step": 5228 + }, + { + "epoch": 1.3151408450704225, + "grad_norm": 0.3470541536808014, + "learning_rate": 6.899382769199501e-06, + "loss": 0.358, + "step": 5229 + }, + { + "epoch": 1.3153923541247485, + "grad_norm": 0.3177447021007538, + "learning_rate": 6.898029129410463e-06, + "loss": 0.3718, + "step": 5230 + }, + { + "epoch": 1.3156438631790746, + "grad_norm": 0.297389954328537, + "learning_rate": 6.896675327067104e-06, + "loss": 0.342, + "step": 5231 + }, + { + "epoch": 1.3158953722334004, + "grad_norm": 0.33685746788978577, + "learning_rate": 6.895321362285369e-06, + "loss": 0.3386, + "step": 5232 + }, + { + "epoch": 1.3161468812877264, + "grad_norm": 0.3213297724723816, + "learning_rate": 6.893967235181216e-06, + "loss": 0.3766, + "step": 5233 + }, + { + "epoch": 1.3163983903420524, + "grad_norm": 0.2946451008319855, + "learning_rate": 6.892612945870618e-06, + "loss": 0.3446, + "step": 5234 + }, + { + "epoch": 1.3166498993963782, + "grad_norm": 0.3347758650779724, + "learning_rate": 6.891258494469561e-06, + "loss": 0.3605, + "step": 5235 + }, + { + "epoch": 1.3169014084507042, + "grad_norm": 0.30589884519577026, + "learning_rate": 6.889903881094047e-06, + "loss": 0.3453, + "step": 5236 + }, + { + "epoch": 1.3171529175050303, + "grad_norm": 0.3220566511154175, + "learning_rate": 6.888549105860088e-06, + "loss": 0.3572, + "step": 5237 + }, + { + "epoch": 1.317404426559356, + "grad_norm": 0.3071398138999939, + "learning_rate": 6.887194168883713e-06, + "loss": 0.3602, + "step": 5238 + }, + { + "epoch": 1.317655935613682, + "grad_norm": 0.34958013892173767, + "learning_rate": 6.885839070280964e-06, + "loss": 0.3445, + "step": 5239 + }, + { + "epoch": 1.3179074446680081, + "grad_norm": 0.33036911487579346, + "learning_rate": 6.884483810167896e-06, + "loss": 0.3653, + "step": 5240 + }, + { + "epoch": 1.318158953722334, + "grad_norm": 0.3227456212043762, + "learning_rate": 6.883128388660578e-06, + "loss": 0.3573, + "step": 5241 + }, + { + "epoch": 1.31841046277666, + "grad_norm": 0.304959774017334, + "learning_rate": 6.881772805875095e-06, + "loss": 0.3421, + "step": 5242 + }, + { + "epoch": 1.318661971830986, + "grad_norm": 0.3372381627559662, + "learning_rate": 6.8804170619275445e-06, + "loss": 0.3234, + "step": 5243 + }, + { + "epoch": 1.3189134808853118, + "grad_norm": 0.3018898665904999, + "learning_rate": 6.879061156934035e-06, + "loss": 0.3658, + "step": 5244 + }, + { + "epoch": 1.3191649899396378, + "grad_norm": 0.3537410497665405, + "learning_rate": 6.877705091010693e-06, + "loss": 0.3786, + "step": 5245 + }, + { + "epoch": 1.3194164989939638, + "grad_norm": 0.3048062026500702, + "learning_rate": 6.8763488642736585e-06, + "loss": 0.3467, + "step": 5246 + }, + { + "epoch": 1.3196680080482897, + "grad_norm": 0.3114982545375824, + "learning_rate": 6.874992476839081e-06, + "loss": 0.3364, + "step": 5247 + }, + { + "epoch": 1.3199195171026157, + "grad_norm": 0.35760530829429626, + "learning_rate": 6.873635928823127e-06, + "loss": 0.3648, + "step": 5248 + }, + { + "epoch": 1.3201710261569417, + "grad_norm": 0.3398659825325012, + "learning_rate": 6.8722792203419775e-06, + "loss": 0.3682, + "step": 5249 + }, + { + "epoch": 1.3204225352112675, + "grad_norm": 0.29969504475593567, + "learning_rate": 6.8709223515118265e-06, + "loss": 0.3255, + "step": 5250 + }, + { + "epoch": 1.3206740442655935, + "grad_norm": 0.31497398018836975, + "learning_rate": 6.8695653224488805e-06, + "loss": 0.382, + "step": 5251 + }, + { + "epoch": 1.3209255533199196, + "grad_norm": 0.3121738135814667, + "learning_rate": 6.868208133269359e-06, + "loss": 0.3807, + "step": 5252 + }, + { + "epoch": 1.3211770623742454, + "grad_norm": 0.31807759404182434, + "learning_rate": 6.8668507840895005e-06, + "loss": 0.3508, + "step": 5253 + }, + { + "epoch": 1.3214285714285714, + "grad_norm": 0.3177856504917145, + "learning_rate": 6.86549327502555e-06, + "loss": 0.3825, + "step": 5254 + }, + { + "epoch": 1.3216800804828974, + "grad_norm": 0.3255070447921753, + "learning_rate": 6.864135606193771e-06, + "loss": 0.3538, + "step": 5255 + }, + { + "epoch": 1.3219315895372232, + "grad_norm": 0.33017322421073914, + "learning_rate": 6.862777777710441e-06, + "loss": 0.3318, + "step": 5256 + }, + { + "epoch": 1.3221830985915493, + "grad_norm": 0.3061206340789795, + "learning_rate": 6.861419789691845e-06, + "loss": 0.351, + "step": 5257 + }, + { + "epoch": 1.3224346076458753, + "grad_norm": 0.3208281993865967, + "learning_rate": 6.860061642254291e-06, + "loss": 0.3615, + "step": 5258 + }, + { + "epoch": 1.322686116700201, + "grad_norm": 0.31412699818611145, + "learning_rate": 6.858703335514093e-06, + "loss": 0.3814, + "step": 5259 + }, + { + "epoch": 1.3229376257545271, + "grad_norm": 0.3277878165245056, + "learning_rate": 6.857344869587583e-06, + "loss": 0.3539, + "step": 5260 + }, + { + "epoch": 1.3231891348088531, + "grad_norm": 0.34601694345474243, + "learning_rate": 6.855986244591104e-06, + "loss": 0.3516, + "step": 5261 + }, + { + "epoch": 1.323440643863179, + "grad_norm": 0.33867859840393066, + "learning_rate": 6.854627460641015e-06, + "loss": 0.3606, + "step": 5262 + }, + { + "epoch": 1.323692152917505, + "grad_norm": 0.3611641824245453, + "learning_rate": 6.8532685178536865e-06, + "loss": 0.3437, + "step": 5263 + }, + { + "epoch": 1.323943661971831, + "grad_norm": 0.3286331593990326, + "learning_rate": 6.851909416345502e-06, + "loss": 0.3623, + "step": 5264 + }, + { + "epoch": 1.3241951710261568, + "grad_norm": 0.3348291218280792, + "learning_rate": 6.850550156232862e-06, + "loss": 0.3634, + "step": 5265 + }, + { + "epoch": 1.3244466800804828, + "grad_norm": 0.3188924491405487, + "learning_rate": 6.849190737632179e-06, + "loss": 0.3462, + "step": 5266 + }, + { + "epoch": 1.3246981891348089, + "grad_norm": 0.35183659195899963, + "learning_rate": 6.847831160659877e-06, + "loss": 0.355, + "step": 5267 + }, + { + "epoch": 1.3249496981891349, + "grad_norm": 0.2944953739643097, + "learning_rate": 6.846471425432397e-06, + "loss": 0.3173, + "step": 5268 + }, + { + "epoch": 1.3252012072434607, + "grad_norm": 0.34920939803123474, + "learning_rate": 6.845111532066189e-06, + "loss": 0.3544, + "step": 5269 + }, + { + "epoch": 1.3254527162977867, + "grad_norm": 0.3055819571018219, + "learning_rate": 6.843751480677723e-06, + "loss": 0.3572, + "step": 5270 + }, + { + "epoch": 1.3257042253521127, + "grad_norm": 0.33443525433540344, + "learning_rate": 6.8423912713834765e-06, + "loss": 0.3686, + "step": 5271 + }, + { + "epoch": 1.3259557344064388, + "grad_norm": 0.3278621435165405, + "learning_rate": 6.841030904299943e-06, + "loss": 0.368, + "step": 5272 + }, + { + "epoch": 1.3262072434607646, + "grad_norm": 0.3702344000339508, + "learning_rate": 6.839670379543632e-06, + "loss": 0.3329, + "step": 5273 + }, + { + "epoch": 1.3264587525150906, + "grad_norm": 0.31350359320640564, + "learning_rate": 6.838309697231061e-06, + "loss": 0.3686, + "step": 5274 + }, + { + "epoch": 1.3267102615694166, + "grad_norm": 0.3589387834072113, + "learning_rate": 6.836948857478764e-06, + "loss": 0.3401, + "step": 5275 + }, + { + "epoch": 1.3269617706237424, + "grad_norm": 0.37206393480300903, + "learning_rate": 6.83558786040329e-06, + "loss": 0.3487, + "step": 5276 + }, + { + "epoch": 1.3272132796780685, + "grad_norm": 0.3218708336353302, + "learning_rate": 6.834226706121198e-06, + "loss": 0.3381, + "step": 5277 + }, + { + "epoch": 1.3274647887323945, + "grad_norm": 0.32571589946746826, + "learning_rate": 6.832865394749065e-06, + "loss": 0.349, + "step": 5278 + }, + { + "epoch": 1.3277162977867203, + "grad_norm": 0.3747791051864624, + "learning_rate": 6.831503926403476e-06, + "loss": 0.3726, + "step": 5279 + }, + { + "epoch": 1.3279678068410463, + "grad_norm": 0.33507394790649414, + "learning_rate": 6.830142301201035e-06, + "loss": 0.3544, + "step": 5280 + }, + { + "epoch": 1.3282193158953723, + "grad_norm": 0.35225021839141846, + "learning_rate": 6.8287805192583534e-06, + "loss": 0.3614, + "step": 5281 + }, + { + "epoch": 1.3284708249496981, + "grad_norm": 0.3295999765396118, + "learning_rate": 6.8274185806920625e-06, + "loss": 0.3604, + "step": 5282 + }, + { + "epoch": 1.3287223340040242, + "grad_norm": 0.3460395038127899, + "learning_rate": 6.826056485618803e-06, + "loss": 0.352, + "step": 5283 + }, + { + "epoch": 1.3289738430583502, + "grad_norm": 0.3388642370700836, + "learning_rate": 6.824694234155228e-06, + "loss": 0.3313, + "step": 5284 + }, + { + "epoch": 1.329225352112676, + "grad_norm": 0.3405665457248688, + "learning_rate": 6.823331826418008e-06, + "loss": 0.3362, + "step": 5285 + }, + { + "epoch": 1.329476861167002, + "grad_norm": 0.3302387297153473, + "learning_rate": 6.8219692625238236e-06, + "loss": 0.3406, + "step": 5286 + }, + { + "epoch": 1.329728370221328, + "grad_norm": 0.3334062695503235, + "learning_rate": 6.8206065425893695e-06, + "loss": 0.3437, + "step": 5287 + }, + { + "epoch": 1.3299798792756539, + "grad_norm": 0.3606247007846832, + "learning_rate": 6.819243666731356e-06, + "loss": 0.3348, + "step": 5288 + }, + { + "epoch": 1.33023138832998, + "grad_norm": 0.3403606712818146, + "learning_rate": 6.817880635066503e-06, + "loss": 0.3566, + "step": 5289 + }, + { + "epoch": 1.330482897384306, + "grad_norm": 0.29230329394340515, + "learning_rate": 6.816517447711546e-06, + "loss": 0.3524, + "step": 5290 + }, + { + "epoch": 1.3307344064386317, + "grad_norm": 0.35570812225341797, + "learning_rate": 6.815154104783233e-06, + "loss": 0.3324, + "step": 5291 + }, + { + "epoch": 1.3309859154929577, + "grad_norm": 0.3560788333415985, + "learning_rate": 6.813790606398327e-06, + "loss": 0.3826, + "step": 5292 + }, + { + "epoch": 1.3312374245472838, + "grad_norm": 0.34960126876831055, + "learning_rate": 6.8124269526736035e-06, + "loss": 0.3403, + "step": 5293 + }, + { + "epoch": 1.3314889336016096, + "grad_norm": 0.37958434224128723, + "learning_rate": 6.811063143725849e-06, + "loss": 0.3727, + "step": 5294 + }, + { + "epoch": 1.3317404426559356, + "grad_norm": 0.3619947135448456, + "learning_rate": 6.809699179671867e-06, + "loss": 0.3378, + "step": 5295 + }, + { + "epoch": 1.3319919517102616, + "grad_norm": 0.35475045442581177, + "learning_rate": 6.808335060628471e-06, + "loss": 0.3679, + "step": 5296 + }, + { + "epoch": 1.3322434607645874, + "grad_norm": 0.3520188331604004, + "learning_rate": 6.806970786712489e-06, + "loss": 0.3589, + "step": 5297 + }, + { + "epoch": 1.3324949698189135, + "grad_norm": 0.3598145842552185, + "learning_rate": 6.8056063580407636e-06, + "loss": 0.3664, + "step": 5298 + }, + { + "epoch": 1.3327464788732395, + "grad_norm": 0.3405790627002716, + "learning_rate": 6.804241774730152e-06, + "loss": 0.3671, + "step": 5299 + }, + { + "epoch": 1.3329979879275653, + "grad_norm": 0.33218327164649963, + "learning_rate": 6.802877036897518e-06, + "loss": 0.3589, + "step": 5300 + }, + { + "epoch": 1.3332494969818913, + "grad_norm": 0.33771365880966187, + "learning_rate": 6.801512144659745e-06, + "loss": 0.364, + "step": 5301 + }, + { + "epoch": 1.3335010060362174, + "grad_norm": 0.31353524327278137, + "learning_rate": 6.800147098133727e-06, + "loss": 0.3522, + "step": 5302 + }, + { + "epoch": 1.3337525150905432, + "grad_norm": 0.2938469350337982, + "learning_rate": 6.798781897436371e-06, + "loss": 0.3508, + "step": 5303 + }, + { + "epoch": 1.3340040241448692, + "grad_norm": 0.342989057302475, + "learning_rate": 6.7974165426845996e-06, + "loss": 0.3249, + "step": 5304 + }, + { + "epoch": 1.3342555331991952, + "grad_norm": 0.3309261202812195, + "learning_rate": 6.796051033995346e-06, + "loss": 0.3399, + "step": 5305 + }, + { + "epoch": 1.334507042253521, + "grad_norm": 0.3215269446372986, + "learning_rate": 6.7946853714855565e-06, + "loss": 0.3515, + "step": 5306 + }, + { + "epoch": 1.334758551307847, + "grad_norm": 0.33649203181266785, + "learning_rate": 6.793319555272192e-06, + "loss": 0.3635, + "step": 5307 + }, + { + "epoch": 1.335010060362173, + "grad_norm": 0.34016871452331543, + "learning_rate": 6.791953585472228e-06, + "loss": 0.3454, + "step": 5308 + }, + { + "epoch": 1.3352615694164989, + "grad_norm": 0.3312501609325409, + "learning_rate": 6.790587462202649e-06, + "loss": 0.3533, + "step": 5309 + }, + { + "epoch": 1.335513078470825, + "grad_norm": 0.33119526505470276, + "learning_rate": 6.789221185580456e-06, + "loss": 0.3511, + "step": 5310 + }, + { + "epoch": 1.335764587525151, + "grad_norm": 0.325152188539505, + "learning_rate": 6.7878547557226616e-06, + "loss": 0.3515, + "step": 5311 + }, + { + "epoch": 1.3360160965794767, + "grad_norm": 0.31850430369377136, + "learning_rate": 6.786488172746293e-06, + "loss": 0.3725, + "step": 5312 + }, + { + "epoch": 1.3362676056338028, + "grad_norm": 0.3241901397705078, + "learning_rate": 6.785121436768387e-06, + "loss": 0.3467, + "step": 5313 + }, + { + "epoch": 1.3365191146881288, + "grad_norm": 0.3087508976459503, + "learning_rate": 6.783754547905999e-06, + "loss": 0.3348, + "step": 5314 + }, + { + "epoch": 1.3367706237424548, + "grad_norm": 0.32791846990585327, + "learning_rate": 6.782387506276191e-06, + "loss": 0.3526, + "step": 5315 + }, + { + "epoch": 1.3370221327967806, + "grad_norm": 0.32451775670051575, + "learning_rate": 6.781020311996046e-06, + "loss": 0.3459, + "step": 5316 + }, + { + "epoch": 1.3372736418511066, + "grad_norm": 0.2936570644378662, + "learning_rate": 6.7796529651826525e-06, + "loss": 0.3415, + "step": 5317 + }, + { + "epoch": 1.3375251509054327, + "grad_norm": 0.3310757577419281, + "learning_rate": 6.778285465953116e-06, + "loss": 0.3567, + "step": 5318 + }, + { + "epoch": 1.3377766599597585, + "grad_norm": 0.3124331533908844, + "learning_rate": 6.776917814424555e-06, + "loss": 0.3571, + "step": 5319 + }, + { + "epoch": 1.3380281690140845, + "grad_norm": 0.3068905174732208, + "learning_rate": 6.775550010714099e-06, + "loss": 0.3553, + "step": 5320 + }, + { + "epoch": 1.3382796780684105, + "grad_norm": 0.31664854288101196, + "learning_rate": 6.774182054938893e-06, + "loss": 0.3275, + "step": 5321 + }, + { + "epoch": 1.3385311871227366, + "grad_norm": 0.3378714919090271, + "learning_rate": 6.772813947216092e-06, + "loss": 0.3547, + "step": 5322 + }, + { + "epoch": 1.3387826961770624, + "grad_norm": 0.3408490717411041, + "learning_rate": 6.771445687662868e-06, + "loss": 0.3551, + "step": 5323 + }, + { + "epoch": 1.3390342052313884, + "grad_norm": 0.3183440864086151, + "learning_rate": 6.770077276396402e-06, + "loss": 0.382, + "step": 5324 + }, + { + "epoch": 1.3392857142857144, + "grad_norm": 0.32740750908851624, + "learning_rate": 6.7687087135338915e-06, + "loss": 0.3654, + "step": 5325 + }, + { + "epoch": 1.3395372233400402, + "grad_norm": 0.3713664412498474, + "learning_rate": 6.7673399991925445e-06, + "loss": 0.3561, + "step": 5326 + }, + { + "epoch": 1.3397887323943662, + "grad_norm": 0.3032859265804291, + "learning_rate": 6.765971133489584e-06, + "loss": 0.3618, + "step": 5327 + }, + { + "epoch": 1.3400402414486923, + "grad_norm": 0.36480486392974854, + "learning_rate": 6.764602116542243e-06, + "loss": 0.3482, + "step": 5328 + }, + { + "epoch": 1.340291750503018, + "grad_norm": 0.3249639570713043, + "learning_rate": 6.763232948467769e-06, + "loss": 0.3664, + "step": 5329 + }, + { + "epoch": 1.340543259557344, + "grad_norm": 0.338945209980011, + "learning_rate": 6.761863629383425e-06, + "loss": 0.3656, + "step": 5330 + }, + { + "epoch": 1.3407947686116701, + "grad_norm": 0.31661343574523926, + "learning_rate": 6.760494159406483e-06, + "loss": 0.3439, + "step": 5331 + }, + { + "epoch": 1.341046277665996, + "grad_norm": 0.3573559522628784, + "learning_rate": 6.759124538654231e-06, + "loss": 0.3649, + "step": 5332 + }, + { + "epoch": 1.341297786720322, + "grad_norm": 0.3466895818710327, + "learning_rate": 6.757754767243966e-06, + "loss": 0.3554, + "step": 5333 + }, + { + "epoch": 1.341549295774648, + "grad_norm": 0.32429978251457214, + "learning_rate": 6.756384845293002e-06, + "loss": 0.3441, + "step": 5334 + }, + { + "epoch": 1.3418008048289738, + "grad_norm": 0.3625079393386841, + "learning_rate": 6.7550147729186635e-06, + "loss": 0.354, + "step": 5335 + }, + { + "epoch": 1.3420523138832998, + "grad_norm": 0.33437997102737427, + "learning_rate": 6.75364455023829e-06, + "loss": 0.3665, + "step": 5336 + }, + { + "epoch": 1.3423038229376258, + "grad_norm": 0.32188209891319275, + "learning_rate": 6.7522741773692305e-06, + "loss": 0.3652, + "step": 5337 + }, + { + "epoch": 1.3425553319919517, + "grad_norm": 0.3210011422634125, + "learning_rate": 6.75090365442885e-06, + "loss": 0.3763, + "step": 5338 + }, + { + "epoch": 1.3428068410462777, + "grad_norm": 0.32578200101852417, + "learning_rate": 6.749532981534526e-06, + "loss": 0.354, + "step": 5339 + }, + { + "epoch": 1.3430583501006037, + "grad_norm": 0.3230711817741394, + "learning_rate": 6.748162158803646e-06, + "loss": 0.3456, + "step": 5340 + }, + { + "epoch": 1.3433098591549295, + "grad_norm": 0.3201490342617035, + "learning_rate": 6.746791186353614e-06, + "loss": 0.3599, + "step": 5341 + }, + { + "epoch": 1.3435613682092555, + "grad_norm": 0.3256816267967224, + "learning_rate": 6.745420064301845e-06, + "loss": 0.3373, + "step": 5342 + }, + { + "epoch": 1.3438128772635816, + "grad_norm": 0.3285231590270996, + "learning_rate": 6.744048792765767e-06, + "loss": 0.3481, + "step": 5343 + }, + { + "epoch": 1.3440643863179074, + "grad_norm": 0.3109275698661804, + "learning_rate": 6.74267737186282e-06, + "loss": 0.3511, + "step": 5344 + }, + { + "epoch": 1.3443158953722334, + "grad_norm": 0.32050541043281555, + "learning_rate": 6.7413058017104585e-06, + "loss": 0.3641, + "step": 5345 + }, + { + "epoch": 1.3445674044265594, + "grad_norm": 0.3238290846347809, + "learning_rate": 6.739934082426149e-06, + "loss": 0.365, + "step": 5346 + }, + { + "epoch": 1.3448189134808852, + "grad_norm": 0.3411909341812134, + "learning_rate": 6.73856221412737e-06, + "loss": 0.3451, + "step": 5347 + }, + { + "epoch": 1.3450704225352113, + "grad_norm": 0.33590638637542725, + "learning_rate": 6.737190196931614e-06, + "loss": 0.3539, + "step": 5348 + }, + { + "epoch": 1.3453219315895373, + "grad_norm": 0.31567126512527466, + "learning_rate": 6.735818030956386e-06, + "loss": 0.3732, + "step": 5349 + }, + { + "epoch": 1.345573440643863, + "grad_norm": 0.3316844701766968, + "learning_rate": 6.734445716319202e-06, + "loss": 0.35, + "step": 5350 + }, + { + "epoch": 1.345824949698189, + "grad_norm": 0.2987160384654999, + "learning_rate": 6.733073253137593e-06, + "loss": 0.3509, + "step": 5351 + }, + { + "epoch": 1.3460764587525151, + "grad_norm": 0.3660013675689697, + "learning_rate": 6.731700641529103e-06, + "loss": 0.3718, + "step": 5352 + }, + { + "epoch": 1.346327967806841, + "grad_norm": 0.3536967635154724, + "learning_rate": 6.730327881611286e-06, + "loss": 0.37, + "step": 5353 + }, + { + "epoch": 1.346579476861167, + "grad_norm": 0.34046685695648193, + "learning_rate": 6.72895497350171e-06, + "loss": 0.3546, + "step": 5354 + }, + { + "epoch": 1.346830985915493, + "grad_norm": 0.3331921100616455, + "learning_rate": 6.727581917317958e-06, + "loss": 0.3619, + "step": 5355 + }, + { + "epoch": 1.3470824949698188, + "grad_norm": 0.3279572129249573, + "learning_rate": 6.726208713177622e-06, + "loss": 0.3664, + "step": 5356 + }, + { + "epoch": 1.3473340040241448, + "grad_norm": 0.3194010257720947, + "learning_rate": 6.7248353611983084e-06, + "loss": 0.3375, + "step": 5357 + }, + { + "epoch": 1.3475855130784709, + "grad_norm": 0.28898268938064575, + "learning_rate": 6.7234618614976375e-06, + "loss": 0.3384, + "step": 5358 + }, + { + "epoch": 1.3478370221327967, + "grad_norm": 0.33326342701911926, + "learning_rate": 6.72208821419324e-06, + "loss": 0.3417, + "step": 5359 + }, + { + "epoch": 1.3480885311871227, + "grad_norm": 0.36949121952056885, + "learning_rate": 6.7207144194027605e-06, + "loss": 0.379, + "step": 5360 + }, + { + "epoch": 1.3483400402414487, + "grad_norm": 0.3311922550201416, + "learning_rate": 6.719340477243854e-06, + "loss": 0.3505, + "step": 5361 + }, + { + "epoch": 1.3485915492957745, + "grad_norm": 0.30410414934158325, + "learning_rate": 6.717966387834194e-06, + "loss": 0.3436, + "step": 5362 + }, + { + "epoch": 1.3488430583501005, + "grad_norm": 0.3302040696144104, + "learning_rate": 6.716592151291459e-06, + "loss": 0.3396, + "step": 5363 + }, + { + "epoch": 1.3490945674044266, + "grad_norm": 0.3249216377735138, + "learning_rate": 6.715217767733346e-06, + "loss": 0.3363, + "step": 5364 + }, + { + "epoch": 1.3493460764587526, + "grad_norm": 0.30946671962738037, + "learning_rate": 6.713843237277562e-06, + "loss": 0.3726, + "step": 5365 + }, + { + "epoch": 1.3495975855130784, + "grad_norm": 0.3507625162601471, + "learning_rate": 6.712468560041825e-06, + "loss": 0.3618, + "step": 5366 + }, + { + "epoch": 1.3498490945674044, + "grad_norm": 0.3141408860683441, + "learning_rate": 6.711093736143869e-06, + "loss": 0.3566, + "step": 5367 + }, + { + "epoch": 1.3501006036217305, + "grad_norm": 0.31941407918930054, + "learning_rate": 6.7097187657014395e-06, + "loss": 0.3568, + "step": 5368 + }, + { + "epoch": 1.3503521126760563, + "grad_norm": 0.29959455132484436, + "learning_rate": 6.708343648832294e-06, + "loss": 0.3355, + "step": 5369 + }, + { + "epoch": 1.3506036217303823, + "grad_norm": 0.32126718759536743, + "learning_rate": 6.706968385654202e-06, + "loss": 0.3352, + "step": 5370 + }, + { + "epoch": 1.3508551307847083, + "grad_norm": 0.31553056836128235, + "learning_rate": 6.705592976284948e-06, + "loss": 0.3303, + "step": 5371 + }, + { + "epoch": 1.3511066398390343, + "grad_norm": 0.3731096088886261, + "learning_rate": 6.704217420842325e-06, + "loss": 0.3627, + "step": 5372 + }, + { + "epoch": 1.3513581488933601, + "grad_norm": 0.3548559248447418, + "learning_rate": 6.702841719444141e-06, + "loss": 0.3606, + "step": 5373 + }, + { + "epoch": 1.3516096579476862, + "grad_norm": 0.3334583342075348, + "learning_rate": 6.701465872208216e-06, + "loss": 0.3507, + "step": 5374 + }, + { + "epoch": 1.3518611670020122, + "grad_norm": 0.36840519309043884, + "learning_rate": 6.700089879252385e-06, + "loss": 0.3599, + "step": 5375 + }, + { + "epoch": 1.352112676056338, + "grad_norm": 0.33070552349090576, + "learning_rate": 6.69871374069449e-06, + "loss": 0.3544, + "step": 5376 + }, + { + "epoch": 1.352364185110664, + "grad_norm": 0.3327641487121582, + "learning_rate": 6.6973374566523904e-06, + "loss": 0.3508, + "step": 5377 + }, + { + "epoch": 1.35261569416499, + "grad_norm": 0.32900556921958923, + "learning_rate": 6.695961027243957e-06, + "loss": 0.3593, + "step": 5378 + }, + { + "epoch": 1.3528672032193159, + "grad_norm": 0.33715447783470154, + "learning_rate": 6.694584452587071e-06, + "loss": 0.3524, + "step": 5379 + }, + { + "epoch": 1.3531187122736419, + "grad_norm": 0.31155914068222046, + "learning_rate": 6.693207732799628e-06, + "loss": 0.3298, + "step": 5380 + }, + { + "epoch": 1.353370221327968, + "grad_norm": 0.3378448188304901, + "learning_rate": 6.691830867999536e-06, + "loss": 0.3213, + "step": 5381 + }, + { + "epoch": 1.3536217303822937, + "grad_norm": 0.31682777404785156, + "learning_rate": 6.690453858304713e-06, + "loss": 0.3306, + "step": 5382 + }, + { + "epoch": 1.3538732394366197, + "grad_norm": 0.3367248773574829, + "learning_rate": 6.689076703833093e-06, + "loss": 0.3662, + "step": 5383 + }, + { + "epoch": 1.3541247484909458, + "grad_norm": 0.3396073579788208, + "learning_rate": 6.687699404702621e-06, + "loss": 0.3576, + "step": 5384 + }, + { + "epoch": 1.3543762575452716, + "grad_norm": 0.33652400970458984, + "learning_rate": 6.686321961031252e-06, + "loss": 0.3369, + "step": 5385 + }, + { + "epoch": 1.3546277665995976, + "grad_norm": 0.33180204033851624, + "learning_rate": 6.684944372936958e-06, + "loss": 0.3447, + "step": 5386 + }, + { + "epoch": 1.3548792756539236, + "grad_norm": 0.32716917991638184, + "learning_rate": 6.6835666405377185e-06, + "loss": 0.352, + "step": 5387 + }, + { + "epoch": 1.3551307847082494, + "grad_norm": 0.33084622025489807, + "learning_rate": 6.682188763951528e-06, + "loss": 0.3252, + "step": 5388 + }, + { + "epoch": 1.3553822937625755, + "grad_norm": 0.3281020224094391, + "learning_rate": 6.680810743296394e-06, + "loss": 0.3865, + "step": 5389 + }, + { + "epoch": 1.3556338028169015, + "grad_norm": 0.3415309488773346, + "learning_rate": 6.6794325786903346e-06, + "loss": 0.3433, + "step": 5390 + }, + { + "epoch": 1.3558853118712273, + "grad_norm": 0.31689122319221497, + "learning_rate": 6.678054270251383e-06, + "loss": 0.3526, + "step": 5391 + }, + { + "epoch": 1.3561368209255533, + "grad_norm": 0.33708077669143677, + "learning_rate": 6.67667581809758e-06, + "loss": 0.326, + "step": 5392 + }, + { + "epoch": 1.3563883299798793, + "grad_norm": 0.3383912146091461, + "learning_rate": 6.6752972223469825e-06, + "loss": 0.3415, + "step": 5393 + }, + { + "epoch": 1.3566398390342052, + "grad_norm": 0.32202810049057007, + "learning_rate": 6.673918483117659e-06, + "loss": 0.3472, + "step": 5394 + }, + { + "epoch": 1.3568913480885312, + "grad_norm": 0.31360799074172974, + "learning_rate": 6.672539600527688e-06, + "loss": 0.3451, + "step": 5395 + }, + { + "epoch": 1.3571428571428572, + "grad_norm": 0.33597296476364136, + "learning_rate": 6.671160574695164e-06, + "loss": 0.3443, + "step": 5396 + }, + { + "epoch": 1.357394366197183, + "grad_norm": 0.35125431418418884, + "learning_rate": 6.669781405738193e-06, + "loss": 0.3528, + "step": 5397 + }, + { + "epoch": 1.357645875251509, + "grad_norm": 0.2997105121612549, + "learning_rate": 6.66840209377489e-06, + "loss": 0.3226, + "step": 5398 + }, + { + "epoch": 1.357897384305835, + "grad_norm": 0.3280561566352844, + "learning_rate": 6.667022638923384e-06, + "loss": 0.3512, + "step": 5399 + }, + { + "epoch": 1.3581488933601609, + "grad_norm": 0.3559575080871582, + "learning_rate": 6.665643041301818e-06, + "loss": 0.3466, + "step": 5400 + }, + { + "epoch": 1.358400402414487, + "grad_norm": 0.3210724890232086, + "learning_rate": 6.6642633010283464e-06, + "loss": 0.3509, + "step": 5401 + }, + { + "epoch": 1.358651911468813, + "grad_norm": 0.3236124813556671, + "learning_rate": 6.662883418221136e-06, + "loss": 0.3407, + "step": 5402 + }, + { + "epoch": 1.3589034205231387, + "grad_norm": 0.3242613971233368, + "learning_rate": 6.661503392998362e-06, + "loss": 0.3438, + "step": 5403 + }, + { + "epoch": 1.3591549295774648, + "grad_norm": 0.33517026901245117, + "learning_rate": 6.660123225478217e-06, + "loss": 0.3578, + "step": 5404 + }, + { + "epoch": 1.3594064386317908, + "grad_norm": 0.3171681761741638, + "learning_rate": 6.658742915778904e-06, + "loss": 0.339, + "step": 5405 + }, + { + "epoch": 1.3596579476861166, + "grad_norm": 0.3439024090766907, + "learning_rate": 6.6573624640186375e-06, + "loss": 0.3572, + "step": 5406 + }, + { + "epoch": 1.3599094567404426, + "grad_norm": 0.3413439691066742, + "learning_rate": 6.655981870315643e-06, + "loss": 0.3571, + "step": 5407 + }, + { + "epoch": 1.3601609657947686, + "grad_norm": 0.33437255024909973, + "learning_rate": 6.654601134788162e-06, + "loss": 0.3457, + "step": 5408 + }, + { + "epoch": 1.3604124748490944, + "grad_norm": 0.36181584000587463, + "learning_rate": 6.653220257554446e-06, + "loss": 0.3669, + "step": 5409 + }, + { + "epoch": 1.3606639839034205, + "grad_norm": 0.34758883714675903, + "learning_rate": 6.6518392387327545e-06, + "loss": 0.3664, + "step": 5410 + }, + { + "epoch": 1.3609154929577465, + "grad_norm": 0.3313416540622711, + "learning_rate": 6.650458078441368e-06, + "loss": 0.3483, + "step": 5411 + }, + { + "epoch": 1.3611670020120723, + "grad_norm": 0.3105969727039337, + "learning_rate": 6.649076776798573e-06, + "loss": 0.3585, + "step": 5412 + }, + { + "epoch": 1.3614185110663983, + "grad_norm": 0.3290698230266571, + "learning_rate": 6.647695333922668e-06, + "loss": 0.3359, + "step": 5413 + }, + { + "epoch": 1.3616700201207244, + "grad_norm": 0.3040936589241028, + "learning_rate": 6.646313749931965e-06, + "loss": 0.3348, + "step": 5414 + }, + { + "epoch": 1.3619215291750504, + "grad_norm": 0.3156396746635437, + "learning_rate": 6.6449320249447905e-06, + "loss": 0.3541, + "step": 5415 + }, + { + "epoch": 1.3621730382293762, + "grad_norm": 0.31559011340141296, + "learning_rate": 6.6435501590794775e-06, + "loss": 0.3304, + "step": 5416 + }, + { + "epoch": 1.3624245472837022, + "grad_norm": 0.30537793040275574, + "learning_rate": 6.642168152454375e-06, + "loss": 0.3714, + "step": 5417 + }, + { + "epoch": 1.3626760563380282, + "grad_norm": 0.3393206298351288, + "learning_rate": 6.640786005187844e-06, + "loss": 0.3662, + "step": 5418 + }, + { + "epoch": 1.362927565392354, + "grad_norm": 0.3266585171222687, + "learning_rate": 6.639403717398256e-06, + "loss": 0.369, + "step": 5419 + }, + { + "epoch": 1.36317907444668, + "grad_norm": 0.31395620107650757, + "learning_rate": 6.6380212892039954e-06, + "loss": 0.3556, + "step": 5420 + }, + { + "epoch": 1.363430583501006, + "grad_norm": 0.32085859775543213, + "learning_rate": 6.636638720723459e-06, + "loss": 0.3398, + "step": 5421 + }, + { + "epoch": 1.3636820925553321, + "grad_norm": 0.325742244720459, + "learning_rate": 6.635256012075056e-06, + "loss": 0.3437, + "step": 5422 + }, + { + "epoch": 1.363933601609658, + "grad_norm": 0.32642316818237305, + "learning_rate": 6.633873163377206e-06, + "loss": 0.3582, + "step": 5423 + }, + { + "epoch": 1.364185110663984, + "grad_norm": 0.3359222710132599, + "learning_rate": 6.63249017474834e-06, + "loss": 0.3304, + "step": 5424 + }, + { + "epoch": 1.36443661971831, + "grad_norm": 0.3578890562057495, + "learning_rate": 6.631107046306902e-06, + "loss": 0.3756, + "step": 5425 + }, + { + "epoch": 1.3646881287726358, + "grad_norm": 0.3200285732746124, + "learning_rate": 6.629723778171352e-06, + "loss": 0.3426, + "step": 5426 + }, + { + "epoch": 1.3649396378269618, + "grad_norm": 0.2880136966705322, + "learning_rate": 6.628340370460156e-06, + "loss": 0.3657, + "step": 5427 + }, + { + "epoch": 1.3651911468812878, + "grad_norm": 0.3454881012439728, + "learning_rate": 6.626956823291793e-06, + "loss": 0.3552, + "step": 5428 + }, + { + "epoch": 1.3654426559356136, + "grad_norm": 0.3579876124858856, + "learning_rate": 6.625573136784755e-06, + "loss": 0.3684, + "step": 5429 + }, + { + "epoch": 1.3656941649899397, + "grad_norm": 0.32106253504753113, + "learning_rate": 6.62418931105755e-06, + "loss": 0.379, + "step": 5430 + }, + { + "epoch": 1.3659456740442657, + "grad_norm": 0.34069597721099854, + "learning_rate": 6.6228053462286905e-06, + "loss": 0.3692, + "step": 5431 + }, + { + "epoch": 1.3661971830985915, + "grad_norm": 0.31019026041030884, + "learning_rate": 6.621421242416703e-06, + "loss": 0.3666, + "step": 5432 + }, + { + "epoch": 1.3664486921529175, + "grad_norm": 0.41305261850357056, + "learning_rate": 6.6200369997401325e-06, + "loss": 0.3689, + "step": 5433 + }, + { + "epoch": 1.3667002012072436, + "grad_norm": 0.3238224685192108, + "learning_rate": 6.618652618317527e-06, + "loss": 0.3469, + "step": 5434 + }, + { + "epoch": 1.3669517102615694, + "grad_norm": 0.3078828454017639, + "learning_rate": 6.617268098267451e-06, + "loss": 0.3354, + "step": 5435 + }, + { + "epoch": 1.3672032193158954, + "grad_norm": 0.29392170906066895, + "learning_rate": 6.615883439708481e-06, + "loss": 0.3254, + "step": 5436 + }, + { + "epoch": 1.3674547283702214, + "grad_norm": 0.34196245670318604, + "learning_rate": 6.6144986427592014e-06, + "loss": 0.3339, + "step": 5437 + }, + { + "epoch": 1.3677062374245472, + "grad_norm": 0.31988444924354553, + "learning_rate": 6.613113707538214e-06, + "loss": 0.3454, + "step": 5438 + }, + { + "epoch": 1.3679577464788732, + "grad_norm": 0.3208816349506378, + "learning_rate": 6.6117286341641305e-06, + "loss": 0.3436, + "step": 5439 + }, + { + "epoch": 1.3682092555331993, + "grad_norm": 0.30724069476127625, + "learning_rate": 6.610343422755572e-06, + "loss": 0.3624, + "step": 5440 + }, + { + "epoch": 1.368460764587525, + "grad_norm": 0.34737128019332886, + "learning_rate": 6.608958073431173e-06, + "loss": 0.3271, + "step": 5441 + }, + { + "epoch": 1.368712273641851, + "grad_norm": 0.3552662134170532, + "learning_rate": 6.607572586309581e-06, + "loss": 0.3809, + "step": 5442 + }, + { + "epoch": 1.3689637826961771, + "grad_norm": 0.2979397475719452, + "learning_rate": 6.606186961509452e-06, + "loss": 0.3306, + "step": 5443 + }, + { + "epoch": 1.369215291750503, + "grad_norm": 0.3502536118030548, + "learning_rate": 6.6048011991494595e-06, + "loss": 0.3469, + "step": 5444 + }, + { + "epoch": 1.369466800804829, + "grad_norm": 0.33274149894714355, + "learning_rate": 6.603415299348284e-06, + "loss": 0.3341, + "step": 5445 + }, + { + "epoch": 1.369718309859155, + "grad_norm": 0.3190041482448578, + "learning_rate": 6.60202926222462e-06, + "loss": 0.3562, + "step": 5446 + }, + { + "epoch": 1.3699698189134808, + "grad_norm": 0.31601500511169434, + "learning_rate": 6.600643087897171e-06, + "loss": 0.356, + "step": 5447 + }, + { + "epoch": 1.3702213279678068, + "grad_norm": 0.3234473764896393, + "learning_rate": 6.599256776484655e-06, + "loss": 0.3756, + "step": 5448 + }, + { + "epoch": 1.3704728370221329, + "grad_norm": 0.33312302827835083, + "learning_rate": 6.597870328105801e-06, + "loss": 0.3666, + "step": 5449 + }, + { + "epoch": 1.3707243460764587, + "grad_norm": 0.32716864347457886, + "learning_rate": 6.59648374287935e-06, + "loss": 0.3525, + "step": 5450 + }, + { + "epoch": 1.3709758551307847, + "grad_norm": 0.315767765045166, + "learning_rate": 6.595097020924054e-06, + "loss": 0.3242, + "step": 5451 + }, + { + "epoch": 1.3712273641851107, + "grad_norm": 0.3259473741054535, + "learning_rate": 6.593710162358676e-06, + "loss": 0.3392, + "step": 5452 + }, + { + "epoch": 1.3714788732394365, + "grad_norm": 0.3474818170070648, + "learning_rate": 6.592323167301994e-06, + "loss": 0.3662, + "step": 5453 + }, + { + "epoch": 1.3717303822937625, + "grad_norm": 0.3318646252155304, + "learning_rate": 6.590936035872792e-06, + "loss": 0.3754, + "step": 5454 + }, + { + "epoch": 1.3719818913480886, + "grad_norm": 0.3492967486381531, + "learning_rate": 6.589548768189875e-06, + "loss": 0.3491, + "step": 5455 + }, + { + "epoch": 1.3722334004024144, + "grad_norm": 0.33331018686294556, + "learning_rate": 6.588161364372047e-06, + "loss": 0.3543, + "step": 5456 + }, + { + "epoch": 1.3724849094567404, + "grad_norm": 0.3557076156139374, + "learning_rate": 6.586773824538136e-06, + "loss": 0.3616, + "step": 5457 + }, + { + "epoch": 1.3727364185110664, + "grad_norm": 0.34410160779953003, + "learning_rate": 6.585386148806974e-06, + "loss": 0.3588, + "step": 5458 + }, + { + "epoch": 1.3729879275653922, + "grad_norm": 0.32868391275405884, + "learning_rate": 6.5839983372974045e-06, + "loss": 0.3557, + "step": 5459 + }, + { + "epoch": 1.3732394366197183, + "grad_norm": 0.29760217666625977, + "learning_rate": 6.5826103901282875e-06, + "loss": 0.3299, + "step": 5460 + }, + { + "epoch": 1.3734909456740443, + "grad_norm": 0.33605876564979553, + "learning_rate": 6.581222307418492e-06, + "loss": 0.3432, + "step": 5461 + }, + { + "epoch": 1.37374245472837, + "grad_norm": 0.30532124638557434, + "learning_rate": 6.579834089286898e-06, + "loss": 0.3592, + "step": 5462 + }, + { + "epoch": 1.3739939637826961, + "grad_norm": 0.30395251512527466, + "learning_rate": 6.578445735852397e-06, + "loss": 0.341, + "step": 5463 + }, + { + "epoch": 1.3742454728370221, + "grad_norm": 0.32410097122192383, + "learning_rate": 6.5770572472338935e-06, + "loss": 0.3545, + "step": 5464 + }, + { + "epoch": 1.3744969818913482, + "grad_norm": 0.32119905948638916, + "learning_rate": 6.575668623550302e-06, + "loss": 0.346, + "step": 5465 + }, + { + "epoch": 1.374748490945674, + "grad_norm": 0.30863913893699646, + "learning_rate": 6.574279864920552e-06, + "loss": 0.363, + "step": 5466 + }, + { + "epoch": 1.375, + "grad_norm": 0.3214772939682007, + "learning_rate": 6.572890971463579e-06, + "loss": 0.3546, + "step": 5467 + }, + { + "epoch": 1.375251509054326, + "grad_norm": 0.30236709117889404, + "learning_rate": 6.571501943298335e-06, + "loss": 0.3459, + "step": 5468 + }, + { + "epoch": 1.3755030181086518, + "grad_norm": 0.35171201825141907, + "learning_rate": 6.57011278054378e-06, + "loss": 0.3439, + "step": 5469 + }, + { + "epoch": 1.3757545271629779, + "grad_norm": 0.3353353440761566, + "learning_rate": 6.568723483318889e-06, + "loss": 0.3565, + "step": 5470 + }, + { + "epoch": 1.3760060362173039, + "grad_norm": 0.3299080431461334, + "learning_rate": 6.567334051742645e-06, + "loss": 0.3654, + "step": 5471 + }, + { + "epoch": 1.37625754527163, + "grad_norm": 0.3008752167224884, + "learning_rate": 6.565944485934046e-06, + "loss": 0.3346, + "step": 5472 + }, + { + "epoch": 1.3765090543259557, + "grad_norm": 0.33441752195358276, + "learning_rate": 6.564554786012096e-06, + "loss": 0.3622, + "step": 5473 + }, + { + "epoch": 1.3767605633802817, + "grad_norm": 0.326872855424881, + "learning_rate": 6.563164952095818e-06, + "loss": 0.374, + "step": 5474 + }, + { + "epoch": 1.3770120724346078, + "grad_norm": 0.30647537112236023, + "learning_rate": 6.561774984304241e-06, + "loss": 0.3631, + "step": 5475 + }, + { + "epoch": 1.3772635814889336, + "grad_norm": 0.2820424735546112, + "learning_rate": 6.560384882756406e-06, + "loss": 0.3572, + "step": 5476 + }, + { + "epoch": 1.3775150905432596, + "grad_norm": 0.3259304463863373, + "learning_rate": 6.558994647571369e-06, + "loss": 0.3525, + "step": 5477 + }, + { + "epoch": 1.3777665995975856, + "grad_norm": 0.3217025399208069, + "learning_rate": 6.557604278868193e-06, + "loss": 0.3713, + "step": 5478 + }, + { + "epoch": 1.3780181086519114, + "grad_norm": 0.336544394493103, + "learning_rate": 6.5562137767659516e-06, + "loss": 0.3471, + "step": 5479 + }, + { + "epoch": 1.3782696177062375, + "grad_norm": 0.3470653295516968, + "learning_rate": 6.554823141383739e-06, + "loss": 0.3517, + "step": 5480 + }, + { + "epoch": 1.3785211267605635, + "grad_norm": 0.37579667568206787, + "learning_rate": 6.553432372840651e-06, + "loss": 0.3326, + "step": 5481 + }, + { + "epoch": 1.3787726358148893, + "grad_norm": 0.31945011019706726, + "learning_rate": 6.552041471255799e-06, + "loss": 0.343, + "step": 5482 + }, + { + "epoch": 1.3790241448692153, + "grad_norm": 0.3295787274837494, + "learning_rate": 6.550650436748304e-06, + "loss": 0.3246, + "step": 5483 + }, + { + "epoch": 1.3792756539235413, + "grad_norm": 0.392974317073822, + "learning_rate": 6.5492592694373e-06, + "loss": 0.3482, + "step": 5484 + }, + { + "epoch": 1.3795271629778671, + "grad_norm": 0.33396854996681213, + "learning_rate": 6.547867969441931e-06, + "loss": 0.3562, + "step": 5485 + }, + { + "epoch": 1.3797786720321932, + "grad_norm": 0.3102302551269531, + "learning_rate": 6.546476536881355e-06, + "loss": 0.3471, + "step": 5486 + }, + { + "epoch": 1.3800301810865192, + "grad_norm": 0.36257678270339966, + "learning_rate": 6.545084971874738e-06, + "loss": 0.3628, + "step": 5487 + }, + { + "epoch": 1.380281690140845, + "grad_norm": 0.3609892725944519, + "learning_rate": 6.543693274541259e-06, + "loss": 0.353, + "step": 5488 + }, + { + "epoch": 1.380533199195171, + "grad_norm": 0.3334091603755951, + "learning_rate": 6.542301445000108e-06, + "loss": 0.3528, + "step": 5489 + }, + { + "epoch": 1.380784708249497, + "grad_norm": 0.3559829890727997, + "learning_rate": 6.540909483370488e-06, + "loss": 0.342, + "step": 5490 + }, + { + "epoch": 1.3810362173038229, + "grad_norm": 0.35456085205078125, + "learning_rate": 6.539517389771609e-06, + "loss": 0.3803, + "step": 5491 + }, + { + "epoch": 1.381287726358149, + "grad_norm": 0.3310755491256714, + "learning_rate": 6.538125164322699e-06, + "loss": 0.3475, + "step": 5492 + }, + { + "epoch": 1.381539235412475, + "grad_norm": 0.35925358533859253, + "learning_rate": 6.536732807142989e-06, + "loss": 0.37, + "step": 5493 + }, + { + "epoch": 1.3817907444668007, + "grad_norm": 0.3170955777168274, + "learning_rate": 6.535340318351729e-06, + "loss": 0.3237, + "step": 5494 + }, + { + "epoch": 1.3820422535211268, + "grad_norm": 0.31947341561317444, + "learning_rate": 6.5339476980681746e-06, + "loss": 0.3396, + "step": 5495 + }, + { + "epoch": 1.3822937625754528, + "grad_norm": 0.3393300473690033, + "learning_rate": 6.532554946411598e-06, + "loss": 0.3373, + "step": 5496 + }, + { + "epoch": 1.3825452716297786, + "grad_norm": 0.3222068250179291, + "learning_rate": 6.531162063501275e-06, + "loss": 0.3653, + "step": 5497 + }, + { + "epoch": 1.3827967806841046, + "grad_norm": 0.3433172404766083, + "learning_rate": 6.529769049456502e-06, + "loss": 0.3526, + "step": 5498 + }, + { + "epoch": 1.3830482897384306, + "grad_norm": 0.31900420784950256, + "learning_rate": 6.52837590439658e-06, + "loss": 0.3523, + "step": 5499 + }, + { + "epoch": 1.3832997987927564, + "grad_norm": 0.3333498537540436, + "learning_rate": 6.5269826284408235e-06, + "loss": 0.3579, + "step": 5500 + }, + { + "epoch": 1.3835513078470825, + "grad_norm": 0.3352772891521454, + "learning_rate": 6.525589221708558e-06, + "loss": 0.3495, + "step": 5501 + }, + { + "epoch": 1.3838028169014085, + "grad_norm": 0.35834312438964844, + "learning_rate": 6.524195684319119e-06, + "loss": 0.3236, + "step": 5502 + }, + { + "epoch": 1.3840543259557343, + "grad_norm": 0.3257136940956116, + "learning_rate": 6.522802016391856e-06, + "loss": 0.3437, + "step": 5503 + }, + { + "epoch": 1.3843058350100603, + "grad_norm": 0.337438702583313, + "learning_rate": 6.521408218046126e-06, + "loss": 0.3477, + "step": 5504 + }, + { + "epoch": 1.3845573440643864, + "grad_norm": 0.33436325192451477, + "learning_rate": 6.5200142894012995e-06, + "loss": 0.3485, + "step": 5505 + }, + { + "epoch": 1.3848088531187122, + "grad_norm": 0.3181125521659851, + "learning_rate": 6.51862023057676e-06, + "loss": 0.3766, + "step": 5506 + }, + { + "epoch": 1.3850603621730382, + "grad_norm": 0.3215303421020508, + "learning_rate": 6.517226041691897e-06, + "loss": 0.3572, + "step": 5507 + }, + { + "epoch": 1.3853118712273642, + "grad_norm": 0.3357185125350952, + "learning_rate": 6.515831722866115e-06, + "loss": 0.343, + "step": 5508 + }, + { + "epoch": 1.38556338028169, + "grad_norm": 0.31294816732406616, + "learning_rate": 6.514437274218829e-06, + "loss": 0.3519, + "step": 5509 + }, + { + "epoch": 1.385814889336016, + "grad_norm": 0.30923759937286377, + "learning_rate": 6.513042695869465e-06, + "loss": 0.3427, + "step": 5510 + }, + { + "epoch": 1.386066398390342, + "grad_norm": 0.3177286982536316, + "learning_rate": 6.51164798793746e-06, + "loss": 0.3483, + "step": 5511 + }, + { + "epoch": 1.3863179074446679, + "grad_norm": 0.30468153953552246, + "learning_rate": 6.510253150542262e-06, + "loss": 0.3639, + "step": 5512 + }, + { + "epoch": 1.386569416498994, + "grad_norm": 0.33087295293807983, + "learning_rate": 6.508858183803328e-06, + "loss": 0.352, + "step": 5513 + }, + { + "epoch": 1.38682092555332, + "grad_norm": 0.32209211587905884, + "learning_rate": 6.507463087840133e-06, + "loss": 0.3568, + "step": 5514 + }, + { + "epoch": 1.387072434607646, + "grad_norm": 0.3076551854610443, + "learning_rate": 6.506067862772153e-06, + "loss": 0.3604, + "step": 5515 + }, + { + "epoch": 1.3873239436619718, + "grad_norm": 0.3314887285232544, + "learning_rate": 6.504672508718882e-06, + "loss": 0.3678, + "step": 5516 + }, + { + "epoch": 1.3875754527162978, + "grad_norm": 0.31186097860336304, + "learning_rate": 6.503277025799825e-06, + "loss": 0.3576, + "step": 5517 + }, + { + "epoch": 1.3878269617706238, + "grad_norm": 0.29338905215263367, + "learning_rate": 6.501881414134495e-06, + "loss": 0.3615, + "step": 5518 + }, + { + "epoch": 1.3880784708249496, + "grad_norm": 0.3048270046710968, + "learning_rate": 6.500485673842417e-06, + "loss": 0.3395, + "step": 5519 + }, + { + "epoch": 1.3883299798792756, + "grad_norm": 0.30701351165771484, + "learning_rate": 6.499089805043129e-06, + "loss": 0.3739, + "step": 5520 + }, + { + "epoch": 1.3885814889336017, + "grad_norm": 0.3141977787017822, + "learning_rate": 6.497693807856177e-06, + "loss": 0.3709, + "step": 5521 + }, + { + "epoch": 1.3888329979879277, + "grad_norm": 0.3316255509853363, + "learning_rate": 6.496297682401121e-06, + "loss": 0.3554, + "step": 5522 + }, + { + "epoch": 1.3890845070422535, + "grad_norm": 0.3405584990978241, + "learning_rate": 6.494901428797526e-06, + "loss": 0.3521, + "step": 5523 + }, + { + "epoch": 1.3893360160965795, + "grad_norm": 0.34003138542175293, + "learning_rate": 6.493505047164978e-06, + "loss": 0.3322, + "step": 5524 + }, + { + "epoch": 1.3895875251509056, + "grad_norm": 0.32138487696647644, + "learning_rate": 6.492108537623067e-06, + "loss": 0.3329, + "step": 5525 + }, + { + "epoch": 1.3898390342052314, + "grad_norm": 0.328475683927536, + "learning_rate": 6.490711900291393e-06, + "loss": 0.349, + "step": 5526 + }, + { + "epoch": 1.3900905432595574, + "grad_norm": 0.3196926414966583, + "learning_rate": 6.489315135289571e-06, + "loss": 0.3406, + "step": 5527 + }, + { + "epoch": 1.3903420523138834, + "grad_norm": 0.32950088381767273, + "learning_rate": 6.487918242737225e-06, + "loss": 0.3643, + "step": 5528 + }, + { + "epoch": 1.3905935613682092, + "grad_norm": 0.32452213764190674, + "learning_rate": 6.4865212227539895e-06, + "loss": 0.3614, + "step": 5529 + }, + { + "epoch": 1.3908450704225352, + "grad_norm": 0.3157281279563904, + "learning_rate": 6.4851240754595104e-06, + "loss": 0.3499, + "step": 5530 + }, + { + "epoch": 1.3910965794768613, + "grad_norm": 0.3089020252227783, + "learning_rate": 6.483726800973447e-06, + "loss": 0.351, + "step": 5531 + }, + { + "epoch": 1.391348088531187, + "grad_norm": 0.32950559258461, + "learning_rate": 6.482329399415463e-06, + "loss": 0.3476, + "step": 5532 + }, + { + "epoch": 1.391599597585513, + "grad_norm": 0.31183376908302307, + "learning_rate": 6.48093187090524e-06, + "loss": 0.3315, + "step": 5533 + }, + { + "epoch": 1.3918511066398391, + "grad_norm": 0.34056970477104187, + "learning_rate": 6.4795342155624685e-06, + "loss": 0.3579, + "step": 5534 + }, + { + "epoch": 1.392102615694165, + "grad_norm": 0.29875314235687256, + "learning_rate": 6.478136433506846e-06, + "loss": 0.3608, + "step": 5535 + }, + { + "epoch": 1.392354124748491, + "grad_norm": 0.32909777760505676, + "learning_rate": 6.4767385248580865e-06, + "loss": 0.3757, + "step": 5536 + }, + { + "epoch": 1.392605633802817, + "grad_norm": 0.33318114280700684, + "learning_rate": 6.4753404897359085e-06, + "loss": 0.3523, + "step": 5537 + }, + { + "epoch": 1.3928571428571428, + "grad_norm": 0.32883554697036743, + "learning_rate": 6.473942328260049e-06, + "loss": 0.3536, + "step": 5538 + }, + { + "epoch": 1.3931086519114688, + "grad_norm": 0.3212123215198517, + "learning_rate": 6.4725440405502495e-06, + "loss": 0.3598, + "step": 5539 + }, + { + "epoch": 1.3933601609657948, + "grad_norm": 0.3189206123352051, + "learning_rate": 6.471145626726265e-06, + "loss": 0.3533, + "step": 5540 + }, + { + "epoch": 1.3936116700201207, + "grad_norm": 0.3086141347885132, + "learning_rate": 6.469747086907862e-06, + "loss": 0.3797, + "step": 5541 + }, + { + "epoch": 1.3938631790744467, + "grad_norm": 0.32901155948638916, + "learning_rate": 6.468348421214814e-06, + "loss": 0.3595, + "step": 5542 + }, + { + "epoch": 1.3941146881287727, + "grad_norm": 0.33005937933921814, + "learning_rate": 6.466949629766911e-06, + "loss": 0.3243, + "step": 5543 + }, + { + "epoch": 1.3943661971830985, + "grad_norm": 0.3070705831050873, + "learning_rate": 6.465550712683949e-06, + "loss": 0.3423, + "step": 5544 + }, + { + "epoch": 1.3946177062374245, + "grad_norm": 0.32530298829078674, + "learning_rate": 6.464151670085738e-06, + "loss": 0.3656, + "step": 5545 + }, + { + "epoch": 1.3948692152917506, + "grad_norm": 0.31196725368499756, + "learning_rate": 6.4627525020920946e-06, + "loss": 0.375, + "step": 5546 + }, + { + "epoch": 1.3951207243460764, + "grad_norm": 0.32940876483917236, + "learning_rate": 6.461353208822851e-06, + "loss": 0.3649, + "step": 5547 + }, + { + "epoch": 1.3953722334004024, + "grad_norm": 0.31415605545043945, + "learning_rate": 6.459953790397847e-06, + "loss": 0.3396, + "step": 5548 + }, + { + "epoch": 1.3956237424547284, + "grad_norm": 0.34104838967323303, + "learning_rate": 6.458554246936934e-06, + "loss": 0.3251, + "step": 5549 + }, + { + "epoch": 1.3958752515090542, + "grad_norm": 0.33823341131210327, + "learning_rate": 6.457154578559975e-06, + "loss": 0.3594, + "step": 5550 + }, + { + "epoch": 1.3961267605633803, + "grad_norm": 0.34172114729881287, + "learning_rate": 6.455754785386843e-06, + "loss": 0.3679, + "step": 5551 + }, + { + "epoch": 1.3963782696177063, + "grad_norm": 0.3444267213344574, + "learning_rate": 6.454354867537418e-06, + "loss": 0.366, + "step": 5552 + }, + { + "epoch": 1.396629778672032, + "grad_norm": 0.32145577669143677, + "learning_rate": 6.452954825131599e-06, + "loss": 0.3459, + "step": 5553 + }, + { + "epoch": 1.3968812877263581, + "grad_norm": 0.3104569911956787, + "learning_rate": 6.4515546582892895e-06, + "loss": 0.338, + "step": 5554 + }, + { + "epoch": 1.3971327967806841, + "grad_norm": 0.3085978031158447, + "learning_rate": 6.450154367130403e-06, + "loss": 0.3378, + "step": 5555 + }, + { + "epoch": 1.39738430583501, + "grad_norm": 0.33246156573295593, + "learning_rate": 6.448753951774869e-06, + "loss": 0.34, + "step": 5556 + }, + { + "epoch": 1.397635814889336, + "grad_norm": 0.3080597519874573, + "learning_rate": 6.447353412342621e-06, + "loss": 0.3697, + "step": 5557 + }, + { + "epoch": 1.397887323943662, + "grad_norm": 0.3250534236431122, + "learning_rate": 6.445952748953607e-06, + "loss": 0.3371, + "step": 5558 + }, + { + "epoch": 1.3981388329979878, + "grad_norm": 0.3486895263195038, + "learning_rate": 6.4445519617277874e-06, + "loss": 0.3384, + "step": 5559 + }, + { + "epoch": 1.3983903420523138, + "grad_norm": 0.3366147577762604, + "learning_rate": 6.443151050785129e-06, + "loss": 0.3304, + "step": 5560 + }, + { + "epoch": 1.3986418511066399, + "grad_norm": 0.3010328412055969, + "learning_rate": 6.4417500162456114e-06, + "loss": 0.34, + "step": 5561 + }, + { + "epoch": 1.3988933601609657, + "grad_norm": 0.32444706559181213, + "learning_rate": 6.440348858229224e-06, + "loss": 0.3565, + "step": 5562 + }, + { + "epoch": 1.3991448692152917, + "grad_norm": 0.3689126968383789, + "learning_rate": 6.4389475768559675e-06, + "loss": 0.3513, + "step": 5563 + }, + { + "epoch": 1.3993963782696177, + "grad_norm": 0.3049902319908142, + "learning_rate": 6.437546172245855e-06, + "loss": 0.3215, + "step": 5564 + }, + { + "epoch": 1.3996478873239437, + "grad_norm": 0.3056788146495819, + "learning_rate": 6.436144644518905e-06, + "loss": 0.3364, + "step": 5565 + }, + { + "epoch": 1.3998993963782695, + "grad_norm": 0.33238446712493896, + "learning_rate": 6.434742993795149e-06, + "loss": 0.3486, + "step": 5566 + }, + { + "epoch": 1.4001509054325956, + "grad_norm": 0.3054902255535126, + "learning_rate": 6.433341220194633e-06, + "loss": 0.3234, + "step": 5567 + }, + { + "epoch": 1.4004024144869216, + "grad_norm": 0.3176266551017761, + "learning_rate": 6.431939323837409e-06, + "loss": 0.3447, + "step": 5568 + }, + { + "epoch": 1.4006539235412474, + "grad_norm": 0.3124770522117615, + "learning_rate": 6.430537304843539e-06, + "loss": 0.3541, + "step": 5569 + }, + { + "epoch": 1.4009054325955734, + "grad_norm": 0.2936578094959259, + "learning_rate": 6.429135163333099e-06, + "loss": 0.3353, + "step": 5570 + }, + { + "epoch": 1.4011569416498995, + "grad_norm": 0.32754024863243103, + "learning_rate": 6.427732899426172e-06, + "loss": 0.3473, + "step": 5571 + }, + { + "epoch": 1.4014084507042255, + "grad_norm": 0.31696975231170654, + "learning_rate": 6.426330513242855e-06, + "loss": 0.3386, + "step": 5572 + }, + { + "epoch": 1.4016599597585513, + "grad_norm": 0.33291855454444885, + "learning_rate": 6.424928004903252e-06, + "loss": 0.3386, + "step": 5573 + }, + { + "epoch": 1.4019114688128773, + "grad_norm": 0.3217375874519348, + "learning_rate": 6.423525374527479e-06, + "loss": 0.3495, + "step": 5574 + }, + { + "epoch": 1.4021629778672033, + "grad_norm": 0.2930941879749298, + "learning_rate": 6.422122622235665e-06, + "loss": 0.357, + "step": 5575 + }, + { + "epoch": 1.4024144869215291, + "grad_norm": 0.3463757336139679, + "learning_rate": 6.420719748147943e-06, + "loss": 0.3652, + "step": 5576 + }, + { + "epoch": 1.4026659959758552, + "grad_norm": 0.4066574275493622, + "learning_rate": 6.419316752384464e-06, + "loss": 0.3555, + "step": 5577 + }, + { + "epoch": 1.4029175050301812, + "grad_norm": 0.33143872022628784, + "learning_rate": 6.417913635065385e-06, + "loss": 0.3418, + "step": 5578 + }, + { + "epoch": 1.403169014084507, + "grad_norm": 0.30597761273384094, + "learning_rate": 6.4165103963108724e-06, + "loss": 0.3428, + "step": 5579 + }, + { + "epoch": 1.403420523138833, + "grad_norm": 0.33264756202697754, + "learning_rate": 6.415107036241106e-06, + "loss": 0.3635, + "step": 5580 + }, + { + "epoch": 1.403672032193159, + "grad_norm": 0.32489314675331116, + "learning_rate": 6.413703554976276e-06, + "loss": 0.3375, + "step": 5581 + }, + { + "epoch": 1.4039235412474849, + "grad_norm": 0.3370649218559265, + "learning_rate": 6.41229995263658e-06, + "loss": 0.3336, + "step": 5582 + }, + { + "epoch": 1.404175050301811, + "grad_norm": 0.34233397245407104, + "learning_rate": 6.410896229342228e-06, + "loss": 0.3634, + "step": 5583 + }, + { + "epoch": 1.404426559356137, + "grad_norm": 0.34717515110969543, + "learning_rate": 6.40949238521344e-06, + "loss": 0.3228, + "step": 5584 + }, + { + "epoch": 1.4046780684104627, + "grad_norm": 0.30195194482803345, + "learning_rate": 6.408088420370448e-06, + "loss": 0.3488, + "step": 5585 + }, + { + "epoch": 1.4049295774647887, + "grad_norm": 0.33940741419792175, + "learning_rate": 6.406684334933491e-06, + "loss": 0.3741, + "step": 5586 + }, + { + "epoch": 1.4051810865191148, + "grad_norm": 0.3448837101459503, + "learning_rate": 6.405280129022821e-06, + "loss": 0.3687, + "step": 5587 + }, + { + "epoch": 1.4054325955734406, + "grad_norm": 0.33431434631347656, + "learning_rate": 6.4038758027587e-06, + "loss": 0.3487, + "step": 5588 + }, + { + "epoch": 1.4056841046277666, + "grad_norm": 0.32833728194236755, + "learning_rate": 6.402471356261399e-06, + "loss": 0.3436, + "step": 5589 + }, + { + "epoch": 1.4059356136820926, + "grad_norm": 0.3193908929824829, + "learning_rate": 6.4010667896512e-06, + "loss": 0.3325, + "step": 5590 + }, + { + "epoch": 1.4061871227364184, + "grad_norm": 0.36166292428970337, + "learning_rate": 6.399662103048396e-06, + "loss": 0.3701, + "step": 5591 + }, + { + "epoch": 1.4064386317907445, + "grad_norm": 0.3254150152206421, + "learning_rate": 6.398257296573288e-06, + "loss": 0.3524, + "step": 5592 + }, + { + "epoch": 1.4066901408450705, + "grad_norm": 0.30729982256889343, + "learning_rate": 6.396852370346191e-06, + "loss": 0.3713, + "step": 5593 + }, + { + "epoch": 1.4069416498993963, + "grad_norm": 0.3343053162097931, + "learning_rate": 6.395447324487427e-06, + "loss": 0.3501, + "step": 5594 + }, + { + "epoch": 1.4071931589537223, + "grad_norm": 0.3478783369064331, + "learning_rate": 6.394042159117329e-06, + "loss": 0.3196, + "step": 5595 + }, + { + "epoch": 1.4074446680080483, + "grad_norm": 0.30152082443237305, + "learning_rate": 6.392636874356242e-06, + "loss": 0.3359, + "step": 5596 + }, + { + "epoch": 1.4076961770623742, + "grad_norm": 0.3307269513607025, + "learning_rate": 6.391231470324517e-06, + "loss": 0.3744, + "step": 5597 + }, + { + "epoch": 1.4079476861167002, + "grad_norm": 0.33799752593040466, + "learning_rate": 6.389825947142523e-06, + "loss": 0.3632, + "step": 5598 + }, + { + "epoch": 1.4081991951710262, + "grad_norm": 0.31849145889282227, + "learning_rate": 6.38842030493063e-06, + "loss": 0.3573, + "step": 5599 + }, + { + "epoch": 1.408450704225352, + "grad_norm": 0.3179837763309479, + "learning_rate": 6.387014543809224e-06, + "loss": 0.3578, + "step": 5600 + }, + { + "epoch": 1.408702213279678, + "grad_norm": 0.3246977627277374, + "learning_rate": 6.3856086638986995e-06, + "loss": 0.3751, + "step": 5601 + }, + { + "epoch": 1.408953722334004, + "grad_norm": 0.35665377974510193, + "learning_rate": 6.38420266531946e-06, + "loss": 0.3697, + "step": 5602 + }, + { + "epoch": 1.4092052313883299, + "grad_norm": 0.3288874924182892, + "learning_rate": 6.382796548191923e-06, + "loss": 0.3492, + "step": 5603 + }, + { + "epoch": 1.409456740442656, + "grad_norm": 0.3103671073913574, + "learning_rate": 6.381390312636513e-06, + "loss": 0.3453, + "step": 5604 + }, + { + "epoch": 1.409708249496982, + "grad_norm": 0.3277508616447449, + "learning_rate": 6.379983958773663e-06, + "loss": 0.348, + "step": 5605 + }, + { + "epoch": 1.4099597585513077, + "grad_norm": 0.2977486848831177, + "learning_rate": 6.378577486723821e-06, + "loss": 0.3576, + "step": 5606 + }, + { + "epoch": 1.4102112676056338, + "grad_norm": 0.30424800515174866, + "learning_rate": 6.377170896607442e-06, + "loss": 0.3413, + "step": 5607 + }, + { + "epoch": 1.4104627766599598, + "grad_norm": 0.35188329219818115, + "learning_rate": 6.3757641885449904e-06, + "loss": 0.3816, + "step": 5608 + }, + { + "epoch": 1.4107142857142856, + "grad_norm": 0.299070805311203, + "learning_rate": 6.374357362656944e-06, + "loss": 0.3635, + "step": 5609 + }, + { + "epoch": 1.4109657947686116, + "grad_norm": 0.32047006487846375, + "learning_rate": 6.372950419063787e-06, + "loss": 0.3641, + "step": 5610 + }, + { + "epoch": 1.4112173038229376, + "grad_norm": 0.32744547724723816, + "learning_rate": 6.3715433578860155e-06, + "loss": 0.358, + "step": 5611 + }, + { + "epoch": 1.4114688128772634, + "grad_norm": 0.34040847420692444, + "learning_rate": 6.3701361792441355e-06, + "loss": 0.3533, + "step": 5612 + }, + { + "epoch": 1.4117203219315895, + "grad_norm": 0.34646251797676086, + "learning_rate": 6.368728883258664e-06, + "loss": 0.3391, + "step": 5613 + }, + { + "epoch": 1.4119718309859155, + "grad_norm": 0.3157348930835724, + "learning_rate": 6.367321470050125e-06, + "loss": 0.349, + "step": 5614 + }, + { + "epoch": 1.4122233400402415, + "grad_norm": 0.3335605263710022, + "learning_rate": 6.365913939739057e-06, + "loss": 0.3467, + "step": 5615 + }, + { + "epoch": 1.4124748490945673, + "grad_norm": 0.35618653893470764, + "learning_rate": 6.364506292446005e-06, + "loss": 0.383, + "step": 5616 + }, + { + "epoch": 1.4127263581488934, + "grad_norm": 0.3262132406234741, + "learning_rate": 6.363098528291525e-06, + "loss": 0.3624, + "step": 5617 + }, + { + "epoch": 1.4129778672032194, + "grad_norm": 0.3189094662666321, + "learning_rate": 6.361690647396184e-06, + "loss": 0.3665, + "step": 5618 + }, + { + "epoch": 1.4132293762575452, + "grad_norm": 0.29632070660591125, + "learning_rate": 6.3602826498805585e-06, + "loss": 0.3518, + "step": 5619 + }, + { + "epoch": 1.4134808853118712, + "grad_norm": 0.35020795464515686, + "learning_rate": 6.358874535865233e-06, + "loss": 0.342, + "step": 5620 + }, + { + "epoch": 1.4137323943661972, + "grad_norm": 0.322587788105011, + "learning_rate": 6.357466305470805e-06, + "loss": 0.3361, + "step": 5621 + }, + { + "epoch": 1.4139839034205233, + "grad_norm": 0.31895244121551514, + "learning_rate": 6.356057958817879e-06, + "loss": 0.3561, + "step": 5622 + }, + { + "epoch": 1.414235412474849, + "grad_norm": 0.36562541127204895, + "learning_rate": 6.354649496027075e-06, + "loss": 0.3231, + "step": 5623 + }, + { + "epoch": 1.414486921529175, + "grad_norm": 0.3357252776622772, + "learning_rate": 6.353240917219014e-06, + "loss": 0.3615, + "step": 5624 + }, + { + "epoch": 1.4147384305835011, + "grad_norm": 0.3266746699810028, + "learning_rate": 6.351832222514335e-06, + "loss": 0.3515, + "step": 5625 + }, + { + "epoch": 1.414989939637827, + "grad_norm": 0.317125529050827, + "learning_rate": 6.350423412033683e-06, + "loss": 0.3172, + "step": 5626 + }, + { + "epoch": 1.415241448692153, + "grad_norm": 0.3371025621891022, + "learning_rate": 6.349014485897714e-06, + "loss": 0.351, + "step": 5627 + }, + { + "epoch": 1.415492957746479, + "grad_norm": 0.33615246415138245, + "learning_rate": 6.347605444227093e-06, + "loss": 0.3431, + "step": 5628 + }, + { + "epoch": 1.4157444668008048, + "grad_norm": 0.36137500405311584, + "learning_rate": 6.346196287142497e-06, + "loss": 0.3449, + "step": 5629 + }, + { + "epoch": 1.4159959758551308, + "grad_norm": 0.37134072184562683, + "learning_rate": 6.344787014764611e-06, + "loss": 0.3563, + "step": 5630 + }, + { + "epoch": 1.4162474849094568, + "grad_norm": 0.3492390811443329, + "learning_rate": 6.34337762721413e-06, + "loss": 0.3607, + "step": 5631 + }, + { + "epoch": 1.4164989939637826, + "grad_norm": 0.3468017578125, + "learning_rate": 6.341968124611759e-06, + "loss": 0.3554, + "step": 5632 + }, + { + "epoch": 1.4167505030181087, + "grad_norm": 0.2991659939289093, + "learning_rate": 6.340558507078215e-06, + "loss": 0.3202, + "step": 5633 + }, + { + "epoch": 1.4170020120724347, + "grad_norm": 0.3461068272590637, + "learning_rate": 6.339148774734221e-06, + "loss": 0.35, + "step": 5634 + }, + { + "epoch": 1.4172535211267605, + "grad_norm": 0.3426593542098999, + "learning_rate": 6.33773892770051e-06, + "loss": 0.3411, + "step": 5635 + }, + { + "epoch": 1.4175050301810865, + "grad_norm": 0.3247561752796173, + "learning_rate": 6.3363289660978315e-06, + "loss": 0.3484, + "step": 5636 + }, + { + "epoch": 1.4177565392354126, + "grad_norm": 0.3241268992424011, + "learning_rate": 6.334918890046935e-06, + "loss": 0.3471, + "step": 5637 + }, + { + "epoch": 1.4180080482897384, + "grad_norm": 0.36099836230278015, + "learning_rate": 6.333508699668587e-06, + "loss": 0.3545, + "step": 5638 + }, + { + "epoch": 1.4182595573440644, + "grad_norm": 0.31838127970695496, + "learning_rate": 6.332098395083562e-06, + "loss": 0.3412, + "step": 5639 + }, + { + "epoch": 1.4185110663983904, + "grad_norm": 0.3607694208621979, + "learning_rate": 6.330687976412642e-06, + "loss": 0.3937, + "step": 5640 + }, + { + "epoch": 1.4187625754527162, + "grad_norm": 0.3097631335258484, + "learning_rate": 6.329277443776623e-06, + "loss": 0.3405, + "step": 5641 + }, + { + "epoch": 1.4190140845070423, + "grad_norm": 0.3108190894126892, + "learning_rate": 6.327866797296306e-06, + "loss": 0.3474, + "step": 5642 + }, + { + "epoch": 1.4192655935613683, + "grad_norm": 0.32645246386528015, + "learning_rate": 6.326456037092505e-06, + "loss": 0.3367, + "step": 5643 + }, + { + "epoch": 1.419517102615694, + "grad_norm": 0.3206358850002289, + "learning_rate": 6.325045163286043e-06, + "loss": 0.3654, + "step": 5644 + }, + { + "epoch": 1.41976861167002, + "grad_norm": 0.3293161988258362, + "learning_rate": 6.323634175997753e-06, + "loss": 0.3539, + "step": 5645 + }, + { + "epoch": 1.4200201207243461, + "grad_norm": 0.3224540948867798, + "learning_rate": 6.322223075348475e-06, + "loss": 0.3403, + "step": 5646 + }, + { + "epoch": 1.420271629778672, + "grad_norm": 0.3604409694671631, + "learning_rate": 6.320811861459063e-06, + "loss": 0.3479, + "step": 5647 + }, + { + "epoch": 1.420523138832998, + "grad_norm": 0.337251216173172, + "learning_rate": 6.319400534450378e-06, + "loss": 0.3347, + "step": 5648 + }, + { + "epoch": 1.420774647887324, + "grad_norm": 0.2904793322086334, + "learning_rate": 6.317989094443291e-06, + "loss": 0.3252, + "step": 5649 + }, + { + "epoch": 1.4210261569416498, + "grad_norm": 0.31111976504325867, + "learning_rate": 6.316577541558683e-06, + "loss": 0.3448, + "step": 5650 + }, + { + "epoch": 1.4212776659959758, + "grad_norm": 0.3352508544921875, + "learning_rate": 6.315165875917446e-06, + "loss": 0.3272, + "step": 5651 + }, + { + "epoch": 1.4215291750503019, + "grad_norm": 0.33469197154045105, + "learning_rate": 6.313754097640479e-06, + "loss": 0.3502, + "step": 5652 + }, + { + "epoch": 1.4217806841046277, + "grad_norm": 0.3249984383583069, + "learning_rate": 6.312342206848693e-06, + "loss": 0.3638, + "step": 5653 + }, + { + "epoch": 1.4220321931589537, + "grad_norm": 0.3012884259223938, + "learning_rate": 6.310930203663006e-06, + "loss": 0.3517, + "step": 5654 + }, + { + "epoch": 1.4222837022132797, + "grad_norm": 0.32274380326271057, + "learning_rate": 6.309518088204349e-06, + "loss": 0.3494, + "step": 5655 + }, + { + "epoch": 1.4225352112676055, + "grad_norm": 0.3095703721046448, + "learning_rate": 6.3081058605936594e-06, + "loss": 0.3247, + "step": 5656 + }, + { + "epoch": 1.4227867203219315, + "grad_norm": 0.3313826024532318, + "learning_rate": 6.3066935209518875e-06, + "loss": 0.3503, + "step": 5657 + }, + { + "epoch": 1.4230382293762576, + "grad_norm": 0.3489868938922882, + "learning_rate": 6.305281069399989e-06, + "loss": 0.349, + "step": 5658 + }, + { + "epoch": 1.4232897384305834, + "grad_norm": 0.2928728759288788, + "learning_rate": 6.303868506058933e-06, + "loss": 0.3532, + "step": 5659 + }, + { + "epoch": 1.4235412474849094, + "grad_norm": 0.3247617781162262, + "learning_rate": 6.302455831049696e-06, + "loss": 0.3393, + "step": 5660 + }, + { + "epoch": 1.4237927565392354, + "grad_norm": 0.32944563031196594, + "learning_rate": 6.301043044493266e-06, + "loss": 0.3453, + "step": 5661 + }, + { + "epoch": 1.4240442655935612, + "grad_norm": 0.31871670484542847, + "learning_rate": 6.299630146510638e-06, + "loss": 0.3624, + "step": 5662 + }, + { + "epoch": 1.4242957746478873, + "grad_norm": 0.34367579221725464, + "learning_rate": 6.2982171372228196e-06, + "loss": 0.3523, + "step": 5663 + }, + { + "epoch": 1.4245472837022133, + "grad_norm": 0.3165755569934845, + "learning_rate": 6.296804016750824e-06, + "loss": 0.3379, + "step": 5664 + }, + { + "epoch": 1.4247987927565393, + "grad_norm": 0.31686344742774963, + "learning_rate": 6.295390785215677e-06, + "loss": 0.3231, + "step": 5665 + }, + { + "epoch": 1.4250503018108651, + "grad_norm": 0.3904785215854645, + "learning_rate": 6.293977442738414e-06, + "loss": 0.3522, + "step": 5666 + }, + { + "epoch": 1.4253018108651911, + "grad_norm": 0.336385041475296, + "learning_rate": 6.292563989440077e-06, + "loss": 0.358, + "step": 5667 + }, + { + "epoch": 1.4255533199195172, + "grad_norm": 0.3112278878688812, + "learning_rate": 6.291150425441721e-06, + "loss": 0.3522, + "step": 5668 + }, + { + "epoch": 1.4258048289738432, + "grad_norm": 0.3297823667526245, + "learning_rate": 6.289736750864409e-06, + "loss": 0.3437, + "step": 5669 + }, + { + "epoch": 1.426056338028169, + "grad_norm": 0.3268705904483795, + "learning_rate": 6.288322965829212e-06, + "loss": 0.3288, + "step": 5670 + }, + { + "epoch": 1.426307847082495, + "grad_norm": 0.3283686339855194, + "learning_rate": 6.286909070457213e-06, + "loss": 0.3501, + "step": 5671 + }, + { + "epoch": 1.426559356136821, + "grad_norm": 0.3172062635421753, + "learning_rate": 6.285495064869503e-06, + "loss": 0.3426, + "step": 5672 + }, + { + "epoch": 1.4268108651911469, + "grad_norm": 0.35733988881111145, + "learning_rate": 6.284080949187183e-06, + "loss": 0.3113, + "step": 5673 + }, + { + "epoch": 1.4270623742454729, + "grad_norm": 0.32833603024482727, + "learning_rate": 6.282666723531363e-06, + "loss": 0.345, + "step": 5674 + }, + { + "epoch": 1.427313883299799, + "grad_norm": 0.3474726378917694, + "learning_rate": 6.281252388023162e-06, + "loss": 0.351, + "step": 5675 + }, + { + "epoch": 1.4275653923541247, + "grad_norm": 0.3398433029651642, + "learning_rate": 6.279837942783711e-06, + "loss": 0.3523, + "step": 5676 + }, + { + "epoch": 1.4278169014084507, + "grad_norm": 0.35440170764923096, + "learning_rate": 6.278423387934145e-06, + "loss": 0.3421, + "step": 5677 + }, + { + "epoch": 1.4280684104627768, + "grad_norm": 0.3336888551712036, + "learning_rate": 6.277008723595615e-06, + "loss": 0.3525, + "step": 5678 + }, + { + "epoch": 1.4283199195171026, + "grad_norm": 0.2921485900878906, + "learning_rate": 6.275593949889276e-06, + "loss": 0.3576, + "step": 5679 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 0.3030543029308319, + "learning_rate": 6.274179066936294e-06, + "loss": 0.364, + "step": 5680 + }, + { + "epoch": 1.4288229376257546, + "grad_norm": 0.328914612531662, + "learning_rate": 6.272764074857848e-06, + "loss": 0.3624, + "step": 5681 + }, + { + "epoch": 1.4290744466800804, + "grad_norm": 0.3531835973262787, + "learning_rate": 6.27134897377512e-06, + "loss": 0.3486, + "step": 5682 + }, + { + "epoch": 1.4293259557344065, + "grad_norm": 0.3383164703845978, + "learning_rate": 6.269933763809306e-06, + "loss": 0.3471, + "step": 5683 + }, + { + "epoch": 1.4295774647887325, + "grad_norm": 0.31009918451309204, + "learning_rate": 6.268518445081611e-06, + "loss": 0.3755, + "step": 5684 + }, + { + "epoch": 1.4298289738430583, + "grad_norm": 0.29445788264274597, + "learning_rate": 6.2671030177132466e-06, + "loss": 0.3177, + "step": 5685 + }, + { + "epoch": 1.4300804828973843, + "grad_norm": 0.33102452754974365, + "learning_rate": 6.265687481825435e-06, + "loss": 0.3525, + "step": 5686 + }, + { + "epoch": 1.4303319919517103, + "grad_norm": 0.31349411606788635, + "learning_rate": 6.26427183753941e-06, + "loss": 0.3394, + "step": 5687 + }, + { + "epoch": 1.4305835010060362, + "grad_norm": 0.30602481961250305, + "learning_rate": 6.262856084976411e-06, + "loss": 0.3581, + "step": 5688 + }, + { + "epoch": 1.4308350100603622, + "grad_norm": 0.31970512866973877, + "learning_rate": 6.261440224257688e-06, + "loss": 0.3677, + "step": 5689 + }, + { + "epoch": 1.4310865191146882, + "grad_norm": 0.29605525732040405, + "learning_rate": 6.260024255504502e-06, + "loss": 0.3446, + "step": 5690 + }, + { + "epoch": 1.431338028169014, + "grad_norm": 0.3755524754524231, + "learning_rate": 6.258608178838122e-06, + "loss": 0.3658, + "step": 5691 + }, + { + "epoch": 1.43158953722334, + "grad_norm": 0.31317082047462463, + "learning_rate": 6.2571919943798235e-06, + "loss": 0.3382, + "step": 5692 + }, + { + "epoch": 1.431841046277666, + "grad_norm": 0.32691314816474915, + "learning_rate": 6.255775702250895e-06, + "loss": 0.349, + "step": 5693 + }, + { + "epoch": 1.4320925553319919, + "grad_norm": 0.31760266423225403, + "learning_rate": 6.254359302572635e-06, + "loss": 0.3833, + "step": 5694 + }, + { + "epoch": 1.432344064386318, + "grad_norm": 0.3521325886249542, + "learning_rate": 6.252942795466348e-06, + "loss": 0.3629, + "step": 5695 + }, + { + "epoch": 1.432595573440644, + "grad_norm": 0.30917102098464966, + "learning_rate": 6.251526181053349e-06, + "loss": 0.3508, + "step": 5696 + }, + { + "epoch": 1.4328470824949697, + "grad_norm": 0.3195309340953827, + "learning_rate": 6.250109459454963e-06, + "loss": 0.3367, + "step": 5697 + }, + { + "epoch": 1.4330985915492958, + "grad_norm": 0.32293376326560974, + "learning_rate": 6.248692630792521e-06, + "loss": 0.3347, + "step": 5698 + }, + { + "epoch": 1.4333501006036218, + "grad_norm": 0.34250807762145996, + "learning_rate": 6.247275695187368e-06, + "loss": 0.3414, + "step": 5699 + }, + { + "epoch": 1.4336016096579476, + "grad_norm": 0.3318396806716919, + "learning_rate": 6.245858652760854e-06, + "loss": 0.336, + "step": 5700 + }, + { + "epoch": 1.4338531187122736, + "grad_norm": 0.34146398305892944, + "learning_rate": 6.244441503634341e-06, + "loss": 0.357, + "step": 5701 + }, + { + "epoch": 1.4341046277665996, + "grad_norm": 0.2918093204498291, + "learning_rate": 6.243024247929198e-06, + "loss": 0.3461, + "step": 5702 + }, + { + "epoch": 1.4343561368209254, + "grad_norm": 0.32614874839782715, + "learning_rate": 6.2416068857668045e-06, + "loss": 0.3522, + "step": 5703 + }, + { + "epoch": 1.4346076458752515, + "grad_norm": 0.3218587040901184, + "learning_rate": 6.240189417268548e-06, + "loss": 0.3755, + "step": 5704 + }, + { + "epoch": 1.4348591549295775, + "grad_norm": 0.3028290271759033, + "learning_rate": 6.238771842555828e-06, + "loss": 0.3393, + "step": 5705 + }, + { + "epoch": 1.4351106639839033, + "grad_norm": 0.2967846393585205, + "learning_rate": 6.2373541617500475e-06, + "loss": 0.3598, + "step": 5706 + }, + { + "epoch": 1.4353621730382293, + "grad_norm": 0.3133407235145569, + "learning_rate": 6.235936374972626e-06, + "loss": 0.3657, + "step": 5707 + }, + { + "epoch": 1.4356136820925554, + "grad_norm": 0.30290549993515015, + "learning_rate": 6.234518482344985e-06, + "loss": 0.34, + "step": 5708 + }, + { + "epoch": 1.4358651911468812, + "grad_norm": 0.2992716133594513, + "learning_rate": 6.233100483988559e-06, + "loss": 0.3348, + "step": 5709 + }, + { + "epoch": 1.4361167002012072, + "grad_norm": 0.34080713987350464, + "learning_rate": 6.231682380024792e-06, + "loss": 0.3632, + "step": 5710 + }, + { + "epoch": 1.4363682092555332, + "grad_norm": 0.3630107641220093, + "learning_rate": 6.230264170575133e-06, + "loss": 0.3345, + "step": 5711 + }, + { + "epoch": 1.436619718309859, + "grad_norm": 0.31429681181907654, + "learning_rate": 6.228845855761044e-06, + "loss": 0.3585, + "step": 5712 + }, + { + "epoch": 1.436871227364185, + "grad_norm": 0.298191100358963, + "learning_rate": 6.227427435703997e-06, + "loss": 0.3379, + "step": 5713 + }, + { + "epoch": 1.437122736418511, + "grad_norm": 0.33849817514419556, + "learning_rate": 6.226008910525466e-06, + "loss": 0.3389, + "step": 5714 + }, + { + "epoch": 1.437374245472837, + "grad_norm": 0.3714556396007538, + "learning_rate": 6.224590280346944e-06, + "loss": 0.3591, + "step": 5715 + }, + { + "epoch": 1.437625754527163, + "grad_norm": 0.3296235501766205, + "learning_rate": 6.223171545289925e-06, + "loss": 0.3273, + "step": 5716 + }, + { + "epoch": 1.437877263581489, + "grad_norm": 0.34104108810424805, + "learning_rate": 6.221752705475915e-06, + "loss": 0.3745, + "step": 5717 + }, + { + "epoch": 1.438128772635815, + "grad_norm": 0.3797450065612793, + "learning_rate": 6.22033376102643e-06, + "loss": 0.3646, + "step": 5718 + }, + { + "epoch": 1.438380281690141, + "grad_norm": 0.3067338764667511, + "learning_rate": 6.218914712062992e-06, + "loss": 0.3526, + "step": 5719 + }, + { + "epoch": 1.4386317907444668, + "grad_norm": 0.30397993326187134, + "learning_rate": 6.217495558707135e-06, + "loss": 0.3417, + "step": 5720 + }, + { + "epoch": 1.4388832997987928, + "grad_norm": 0.34097644686698914, + "learning_rate": 6.2160763010803995e-06, + "loss": 0.3868, + "step": 5721 + }, + { + "epoch": 1.4391348088531188, + "grad_norm": 0.3221248686313629, + "learning_rate": 6.214656939304337e-06, + "loss": 0.3695, + "step": 5722 + }, + { + "epoch": 1.4393863179074446, + "grad_norm": 0.30336886644363403, + "learning_rate": 6.213237473500505e-06, + "loss": 0.3425, + "step": 5723 + }, + { + "epoch": 1.4396378269617707, + "grad_norm": 0.30754998326301575, + "learning_rate": 6.2118179037904755e-06, + "loss": 0.3354, + "step": 5724 + }, + { + "epoch": 1.4398893360160967, + "grad_norm": 0.35792577266693115, + "learning_rate": 6.2103982302958225e-06, + "loss": 0.3458, + "step": 5725 + }, + { + "epoch": 1.4401408450704225, + "grad_norm": 0.3202516436576843, + "learning_rate": 6.208978453138134e-06, + "loss": 0.3495, + "step": 5726 + }, + { + "epoch": 1.4403923541247485, + "grad_norm": 0.30896228551864624, + "learning_rate": 6.207558572439003e-06, + "loss": 0.3359, + "step": 5727 + }, + { + "epoch": 1.4406438631790746, + "grad_norm": 0.3637436330318451, + "learning_rate": 6.2061385883200365e-06, + "loss": 0.3406, + "step": 5728 + }, + { + "epoch": 1.4408953722334004, + "grad_norm": 0.32914838194847107, + "learning_rate": 6.204718500902845e-06, + "loss": 0.3735, + "step": 5729 + }, + { + "epoch": 1.4411468812877264, + "grad_norm": 0.3156082332134247, + "learning_rate": 6.20329831030905e-06, + "loss": 0.3455, + "step": 5730 + }, + { + "epoch": 1.4413983903420524, + "grad_norm": 0.35359394550323486, + "learning_rate": 6.201878016660282e-06, + "loss": 0.3381, + "step": 5731 + }, + { + "epoch": 1.4416498993963782, + "grad_norm": 0.3604298532009125, + "learning_rate": 6.200457620078182e-06, + "loss": 0.3768, + "step": 5732 + }, + { + "epoch": 1.4419014084507042, + "grad_norm": 0.31656792759895325, + "learning_rate": 6.199037120684396e-06, + "loss": 0.3786, + "step": 5733 + }, + { + "epoch": 1.4421529175050303, + "grad_norm": 0.32002219557762146, + "learning_rate": 6.1976165186005825e-06, + "loss": 0.3784, + "step": 5734 + }, + { + "epoch": 1.442404426559356, + "grad_norm": 0.33488380908966064, + "learning_rate": 6.196195813948406e-06, + "loss": 0.3552, + "step": 5735 + }, + { + "epoch": 1.442655935613682, + "grad_norm": 0.33732855319976807, + "learning_rate": 6.194775006849541e-06, + "loss": 0.3446, + "step": 5736 + }, + { + "epoch": 1.4429074446680081, + "grad_norm": 0.27075111865997314, + "learning_rate": 6.19335409742567e-06, + "loss": 0.3357, + "step": 5737 + }, + { + "epoch": 1.443158953722334, + "grad_norm": 0.3128473162651062, + "learning_rate": 6.191933085798488e-06, + "loss": 0.3443, + "step": 5738 + }, + { + "epoch": 1.44341046277666, + "grad_norm": 0.3088710606098175, + "learning_rate": 6.190511972089694e-06, + "loss": 0.3581, + "step": 5739 + }, + { + "epoch": 1.443661971830986, + "grad_norm": 0.29833412170410156, + "learning_rate": 6.189090756420997e-06, + "loss": 0.3203, + "step": 5740 + }, + { + "epoch": 1.4439134808853118, + "grad_norm": 0.318301260471344, + "learning_rate": 6.187669438914116e-06, + "loss": 0.3681, + "step": 5741 + }, + { + "epoch": 1.4441649899396378, + "grad_norm": 0.3490622341632843, + "learning_rate": 6.186248019690777e-06, + "loss": 0.3474, + "step": 5742 + }, + { + "epoch": 1.4444164989939638, + "grad_norm": 0.33258891105651855, + "learning_rate": 6.1848264988727165e-06, + "loss": 0.3604, + "step": 5743 + }, + { + "epoch": 1.4446680080482897, + "grad_norm": 0.354710191488266, + "learning_rate": 6.183404876581679e-06, + "loss": 0.3331, + "step": 5744 + }, + { + "epoch": 1.4449195171026157, + "grad_norm": 0.3129853904247284, + "learning_rate": 6.181983152939417e-06, + "loss": 0.3661, + "step": 5745 + }, + { + "epoch": 1.4451710261569417, + "grad_norm": 0.3277970850467682, + "learning_rate": 6.180561328067692e-06, + "loss": 0.3596, + "step": 5746 + }, + { + "epoch": 1.4454225352112675, + "grad_norm": 0.3437343239784241, + "learning_rate": 6.179139402088275e-06, + "loss": 0.3577, + "step": 5747 + }, + { + "epoch": 1.4456740442655935, + "grad_norm": 0.3663479685783386, + "learning_rate": 6.1777173751229445e-06, + "loss": 0.3585, + "step": 5748 + }, + { + "epoch": 1.4459255533199196, + "grad_norm": 0.3225582242012024, + "learning_rate": 6.176295247293489e-06, + "loss": 0.347, + "step": 5749 + }, + { + "epoch": 1.4461770623742454, + "grad_norm": 0.33006182312965393, + "learning_rate": 6.174873018721705e-06, + "loss": 0.3493, + "step": 5750 + }, + { + "epoch": 1.4464285714285714, + "grad_norm": 0.33908021450042725, + "learning_rate": 6.173450689529397e-06, + "loss": 0.3456, + "step": 5751 + }, + { + "epoch": 1.4466800804828974, + "grad_norm": 0.35512205958366394, + "learning_rate": 6.172028259838378e-06, + "loss": 0.3777, + "step": 5752 + }, + { + "epoch": 1.4469315895372232, + "grad_norm": 0.3256780207157135, + "learning_rate": 6.17060572977047e-06, + "loss": 0.3644, + "step": 5753 + }, + { + "epoch": 1.4471830985915493, + "grad_norm": 0.39678511023521423, + "learning_rate": 6.169183099447505e-06, + "loss": 0.3304, + "step": 5754 + }, + { + "epoch": 1.4474346076458753, + "grad_norm": 0.3706040680408478, + "learning_rate": 6.167760368991322e-06, + "loss": 0.3491, + "step": 5755 + }, + { + "epoch": 1.447686116700201, + "grad_norm": 0.3017033040523529, + "learning_rate": 6.16633753852377e-06, + "loss": 0.3393, + "step": 5756 + }, + { + "epoch": 1.4479376257545271, + "grad_norm": 0.32443055510520935, + "learning_rate": 6.164914608166703e-06, + "loss": 0.3369, + "step": 5757 + }, + { + "epoch": 1.4481891348088531, + "grad_norm": 0.33810749650001526, + "learning_rate": 6.163491578041988e-06, + "loss": 0.3254, + "step": 5758 + }, + { + "epoch": 1.448440643863179, + "grad_norm": 0.3356841504573822, + "learning_rate": 6.1620684482714975e-06, + "loss": 0.3624, + "step": 5759 + }, + { + "epoch": 1.448692152917505, + "grad_norm": 0.32188060879707336, + "learning_rate": 6.160645218977115e-06, + "loss": 0.327, + "step": 5760 + }, + { + "epoch": 1.448943661971831, + "grad_norm": 0.30660948157310486, + "learning_rate": 6.159221890280731e-06, + "loss": 0.3529, + "step": 5761 + }, + { + "epoch": 1.4491951710261568, + "grad_norm": 0.3086193799972534, + "learning_rate": 6.157798462304243e-06, + "loss": 0.3633, + "step": 5762 + }, + { + "epoch": 1.4494466800804828, + "grad_norm": 0.33312803506851196, + "learning_rate": 6.15637493516956e-06, + "loss": 0.3652, + "step": 5763 + }, + { + "epoch": 1.4496981891348089, + "grad_norm": 0.3129318058490753, + "learning_rate": 6.154951308998599e-06, + "loss": 0.378, + "step": 5764 + }, + { + "epoch": 1.4499496981891349, + "grad_norm": 0.31855911016464233, + "learning_rate": 6.153527583913284e-06, + "loss": 0.338, + "step": 5765 + }, + { + "epoch": 1.4502012072434607, + "grad_norm": 0.3277183473110199, + "learning_rate": 6.152103760035546e-06, + "loss": 0.3527, + "step": 5766 + }, + { + "epoch": 1.4504527162977867, + "grad_norm": 0.34277212619781494, + "learning_rate": 6.15067983748733e-06, + "loss": 0.3351, + "step": 5767 + }, + { + "epoch": 1.4507042253521127, + "grad_norm": 0.3247811198234558, + "learning_rate": 6.149255816390585e-06, + "loss": 0.3464, + "step": 5768 + }, + { + "epoch": 1.4509557344064388, + "grad_norm": 0.3350554406642914, + "learning_rate": 6.147831696867266e-06, + "loss": 0.3549, + "step": 5769 + }, + { + "epoch": 1.4512072434607646, + "grad_norm": 0.33898022770881653, + "learning_rate": 6.146407479039345e-06, + "loss": 0.3493, + "step": 5770 + }, + { + "epoch": 1.4514587525150906, + "grad_norm": 0.3372337818145752, + "learning_rate": 6.144983163028796e-06, + "loss": 0.3463, + "step": 5771 + }, + { + "epoch": 1.4517102615694166, + "grad_norm": 0.36416393518447876, + "learning_rate": 6.143558748957601e-06, + "loss": 0.3475, + "step": 5772 + }, + { + "epoch": 1.4519617706237424, + "grad_norm": 0.33351224660873413, + "learning_rate": 6.142134236947755e-06, + "loss": 0.358, + "step": 5773 + }, + { + "epoch": 1.4522132796780685, + "grad_norm": 0.29332223534584045, + "learning_rate": 6.140709627121255e-06, + "loss": 0.3688, + "step": 5774 + }, + { + "epoch": 1.4524647887323945, + "grad_norm": 0.34419727325439453, + "learning_rate": 6.1392849196001125e-06, + "loss": 0.3338, + "step": 5775 + }, + { + "epoch": 1.4527162977867203, + "grad_norm": 0.32147595286369324, + "learning_rate": 6.137860114506343e-06, + "loss": 0.3517, + "step": 5776 + }, + { + "epoch": 1.4529678068410463, + "grad_norm": 0.3297555148601532, + "learning_rate": 6.136435211961974e-06, + "loss": 0.3182, + "step": 5777 + }, + { + "epoch": 1.4532193158953723, + "grad_norm": 0.3482721745967865, + "learning_rate": 6.135010212089038e-06, + "loss": 0.3632, + "step": 5778 + }, + { + "epoch": 1.4534708249496981, + "grad_norm": 0.32314664125442505, + "learning_rate": 6.133585115009579e-06, + "loss": 0.3837, + "step": 5779 + }, + { + "epoch": 1.4537223340040242, + "grad_norm": 0.33287930488586426, + "learning_rate": 6.132159920845645e-06, + "loss": 0.3742, + "step": 5780 + }, + { + "epoch": 1.4539738430583502, + "grad_norm": 0.32902443408966064, + "learning_rate": 6.1307346297192984e-06, + "loss": 0.3698, + "step": 5781 + }, + { + "epoch": 1.454225352112676, + "grad_norm": 0.3330136835575104, + "learning_rate": 6.129309241752603e-06, + "loss": 0.358, + "step": 5782 + }, + { + "epoch": 1.454476861167002, + "grad_norm": 0.3234924376010895, + "learning_rate": 6.127883757067636e-06, + "loss": 0.3546, + "step": 5783 + }, + { + "epoch": 1.454728370221328, + "grad_norm": 0.3510439693927765, + "learning_rate": 6.126458175786483e-06, + "loss": 0.3506, + "step": 5784 + }, + { + "epoch": 1.4549798792756539, + "grad_norm": 0.32338663935661316, + "learning_rate": 6.125032498031234e-06, + "loss": 0.3674, + "step": 5785 + }, + { + "epoch": 1.45523138832998, + "grad_norm": 0.3300342857837677, + "learning_rate": 6.1236067239239885e-06, + "loss": 0.3535, + "step": 5786 + }, + { + "epoch": 1.455482897384306, + "grad_norm": 0.31873607635498047, + "learning_rate": 6.122180853586857e-06, + "loss": 0.3526, + "step": 5787 + }, + { + "epoch": 1.4557344064386317, + "grad_norm": 0.3264380991458893, + "learning_rate": 6.120754887141955e-06, + "loss": 0.3741, + "step": 5788 + }, + { + "epoch": 1.4559859154929577, + "grad_norm": 0.36225321888923645, + "learning_rate": 6.119328824711409e-06, + "loss": 0.3473, + "step": 5789 + }, + { + "epoch": 1.4562374245472838, + "grad_norm": 0.3539520800113678, + "learning_rate": 6.117902666417352e-06, + "loss": 0.3592, + "step": 5790 + }, + { + "epoch": 1.4564889336016096, + "grad_norm": 0.3481525182723999, + "learning_rate": 6.116476412381926e-06, + "loss": 0.364, + "step": 5791 + }, + { + "epoch": 1.4567404426559356, + "grad_norm": 0.3597382605075836, + "learning_rate": 6.115050062727278e-06, + "loss": 0.3502, + "step": 5792 + }, + { + "epoch": 1.4569919517102616, + "grad_norm": 0.3276865482330322, + "learning_rate": 6.113623617575568e-06, + "loss": 0.3589, + "step": 5793 + }, + { + "epoch": 1.4572434607645874, + "grad_norm": 0.35708415508270264, + "learning_rate": 6.112197077048963e-06, + "loss": 0.374, + "step": 5794 + }, + { + "epoch": 1.4574949698189135, + "grad_norm": 0.33358344435691833, + "learning_rate": 6.110770441269636e-06, + "loss": 0.3551, + "step": 5795 + }, + { + "epoch": 1.4577464788732395, + "grad_norm": 0.3249559998512268, + "learning_rate": 6.10934371035977e-06, + "loss": 0.3376, + "step": 5796 + }, + { + "epoch": 1.4579979879275653, + "grad_norm": 0.35899099707603455, + "learning_rate": 6.1079168844415535e-06, + "loss": 0.3644, + "step": 5797 + }, + { + "epoch": 1.4582494969818913, + "grad_norm": 0.33779412508010864, + "learning_rate": 6.106489963637189e-06, + "loss": 0.3545, + "step": 5798 + }, + { + "epoch": 1.4585010060362174, + "grad_norm": 0.3385300040245056, + "learning_rate": 6.105062948068881e-06, + "loss": 0.3381, + "step": 5799 + }, + { + "epoch": 1.4587525150905432, + "grad_norm": 0.3292633295059204, + "learning_rate": 6.103635837858844e-06, + "loss": 0.3323, + "step": 5800 + }, + { + "epoch": 1.4590040241448692, + "grad_norm": 0.3348362445831299, + "learning_rate": 6.1022086331293005e-06, + "loss": 0.3462, + "step": 5801 + }, + { + "epoch": 1.4592555331991952, + "grad_norm": 0.3152986764907837, + "learning_rate": 6.100781334002485e-06, + "loss": 0.3209, + "step": 5802 + }, + { + "epoch": 1.459507042253521, + "grad_norm": 0.3165169656276703, + "learning_rate": 6.099353940600634e-06, + "loss": 0.336, + "step": 5803 + }, + { + "epoch": 1.459758551307847, + "grad_norm": 0.36417675018310547, + "learning_rate": 6.097926453045996e-06, + "loss": 0.3508, + "step": 5804 + }, + { + "epoch": 1.460010060362173, + "grad_norm": 0.3556537926197052, + "learning_rate": 6.0964988714608255e-06, + "loss": 0.3644, + "step": 5805 + }, + { + "epoch": 1.4602615694164989, + "grad_norm": 0.3152537941932678, + "learning_rate": 6.0950711959673854e-06, + "loss": 0.3538, + "step": 5806 + }, + { + "epoch": 1.460513078470825, + "grad_norm": 0.3294949531555176, + "learning_rate": 6.093643426687949e-06, + "loss": 0.3665, + "step": 5807 + }, + { + "epoch": 1.460764587525151, + "grad_norm": 0.33261191844940186, + "learning_rate": 6.092215563744797e-06, + "loss": 0.3564, + "step": 5808 + }, + { + "epoch": 1.4610160965794767, + "grad_norm": 0.36204931139945984, + "learning_rate": 6.0907876072602126e-06, + "loss": 0.3704, + "step": 5809 + }, + { + "epoch": 1.4612676056338028, + "grad_norm": 0.34487199783325195, + "learning_rate": 6.0893595573564935e-06, + "loss": 0.3626, + "step": 5810 + }, + { + "epoch": 1.4615191146881288, + "grad_norm": 0.3406592607498169, + "learning_rate": 6.0879314141559434e-06, + "loss": 0.3619, + "step": 5811 + }, + { + "epoch": 1.4617706237424548, + "grad_norm": 0.3268572986125946, + "learning_rate": 6.086503177780874e-06, + "loss": 0.3383, + "step": 5812 + }, + { + "epoch": 1.4620221327967806, + "grad_norm": 0.33901724219322205, + "learning_rate": 6.085074848353604e-06, + "loss": 0.3209, + "step": 5813 + }, + { + "epoch": 1.4622736418511066, + "grad_norm": 0.2882567346096039, + "learning_rate": 6.083646425996462e-06, + "loss": 0.3496, + "step": 5814 + }, + { + "epoch": 1.4625251509054327, + "grad_norm": 0.30257371068000793, + "learning_rate": 6.082217910831784e-06, + "loss": 0.3685, + "step": 5815 + }, + { + "epoch": 1.4627766599597585, + "grad_norm": 0.34829580783843994, + "learning_rate": 6.080789302981911e-06, + "loss": 0.356, + "step": 5816 + }, + { + "epoch": 1.4630281690140845, + "grad_norm": 0.32623910903930664, + "learning_rate": 6.079360602569196e-06, + "loss": 0.3614, + "step": 5817 + }, + { + "epoch": 1.4632796780684105, + "grad_norm": 0.3189256191253662, + "learning_rate": 6.0779318097159965e-06, + "loss": 0.3708, + "step": 5818 + }, + { + "epoch": 1.4635311871227366, + "grad_norm": 0.32534220814704895, + "learning_rate": 6.076502924544683e-06, + "loss": 0.3314, + "step": 5819 + }, + { + "epoch": 1.4637826961770624, + "grad_norm": 0.3330930471420288, + "learning_rate": 6.075073947177628e-06, + "loss": 0.3503, + "step": 5820 + }, + { + "epoch": 1.4640342052313884, + "grad_norm": 0.32736390829086304, + "learning_rate": 6.073644877737215e-06, + "loss": 0.3712, + "step": 5821 + }, + { + "epoch": 1.4642857142857144, + "grad_norm": 0.3611101806163788, + "learning_rate": 6.072215716345835e-06, + "loss": 0.3876, + "step": 5822 + }, + { + "epoch": 1.4645372233400402, + "grad_norm": 0.33069029450416565, + "learning_rate": 6.070786463125885e-06, + "loss": 0.3498, + "step": 5823 + }, + { + "epoch": 1.4647887323943662, + "grad_norm": 0.32250040769577026, + "learning_rate": 6.069357118199775e-06, + "loss": 0.3651, + "step": 5824 + }, + { + "epoch": 1.4650402414486923, + "grad_norm": 0.3161625564098358, + "learning_rate": 6.067927681689917e-06, + "loss": 0.367, + "step": 5825 + }, + { + "epoch": 1.465291750503018, + "grad_norm": 0.3699602484703064, + "learning_rate": 6.066498153718735e-06, + "loss": 0.342, + "step": 5826 + }, + { + "epoch": 1.465543259557344, + "grad_norm": 0.33579832315444946, + "learning_rate": 6.065068534408657e-06, + "loss": 0.3465, + "step": 5827 + }, + { + "epoch": 1.4657947686116701, + "grad_norm": 0.3177899718284607, + "learning_rate": 6.063638823882123e-06, + "loss": 0.3512, + "step": 5828 + }, + { + "epoch": 1.466046277665996, + "grad_norm": 0.31834790110588074, + "learning_rate": 6.062209022261577e-06, + "loss": 0.3652, + "step": 5829 + }, + { + "epoch": 1.466297786720322, + "grad_norm": 0.30503425002098083, + "learning_rate": 6.060779129669474e-06, + "loss": 0.3319, + "step": 5830 + }, + { + "epoch": 1.466549295774648, + "grad_norm": 0.32431966066360474, + "learning_rate": 6.059349146228275e-06, + "loss": 0.35, + "step": 5831 + }, + { + "epoch": 1.4668008048289738, + "grad_norm": 0.3199666440486908, + "learning_rate": 6.057919072060448e-06, + "loss": 0.3537, + "step": 5832 + }, + { + "epoch": 1.4670523138832998, + "grad_norm": 0.3279325067996979, + "learning_rate": 6.0564889072884715e-06, + "loss": 0.3408, + "step": 5833 + }, + { + "epoch": 1.4673038229376258, + "grad_norm": 0.3161599338054657, + "learning_rate": 6.0550586520348285e-06, + "loss": 0.3576, + "step": 5834 + }, + { + "epoch": 1.4675553319919517, + "grad_norm": 0.32007861137390137, + "learning_rate": 6.053628306422014e-06, + "loss": 0.3436, + "step": 5835 + }, + { + "epoch": 1.4678068410462777, + "grad_norm": 0.3110705316066742, + "learning_rate": 6.052197870572525e-06, + "loss": 0.3723, + "step": 5836 + }, + { + "epoch": 1.4680583501006037, + "grad_norm": 0.33387455344200134, + "learning_rate": 6.0507673446088726e-06, + "loss": 0.3686, + "step": 5837 + }, + { + "epoch": 1.4683098591549295, + "grad_norm": 0.3183239996433258, + "learning_rate": 6.049336728653569e-06, + "loss": 0.367, + "step": 5838 + }, + { + "epoch": 1.4685613682092555, + "grad_norm": 0.3348864018917084, + "learning_rate": 6.0479060228291396e-06, + "loss": 0.3625, + "step": 5839 + }, + { + "epoch": 1.4688128772635816, + "grad_norm": 0.3361799120903015, + "learning_rate": 6.046475227258115e-06, + "loss": 0.3475, + "step": 5840 + }, + { + "epoch": 1.4690643863179074, + "grad_norm": 0.3200930655002594, + "learning_rate": 6.045044342063034e-06, + "loss": 0.3711, + "step": 5841 + }, + { + "epoch": 1.4693158953722334, + "grad_norm": 0.2997252345085144, + "learning_rate": 6.043613367366444e-06, + "loss": 0.3506, + "step": 5842 + }, + { + "epoch": 1.4695674044265594, + "grad_norm": 0.3714944124221802, + "learning_rate": 6.042182303290898e-06, + "loss": 0.3711, + "step": 5843 + }, + { + "epoch": 1.4698189134808852, + "grad_norm": 0.32264792919158936, + "learning_rate": 6.040751149958955e-06, + "loss": 0.3467, + "step": 5844 + }, + { + "epoch": 1.4700704225352113, + "grad_norm": 0.3230764865875244, + "learning_rate": 6.0393199074931886e-06, + "loss": 0.3556, + "step": 5845 + }, + { + "epoch": 1.4703219315895373, + "grad_norm": 0.31362685561180115, + "learning_rate": 6.037888576016174e-06, + "loss": 0.3421, + "step": 5846 + }, + { + "epoch": 1.470573440643863, + "grad_norm": 0.3665870130062103, + "learning_rate": 6.036457155650496e-06, + "loss": 0.3703, + "step": 5847 + }, + { + "epoch": 1.470824949698189, + "grad_norm": 0.327773779630661, + "learning_rate": 6.035025646518747e-06, + "loss": 0.357, + "step": 5848 + }, + { + "epoch": 1.4710764587525151, + "grad_norm": 0.32785624265670776, + "learning_rate": 6.033594048743525e-06, + "loss": 0.3694, + "step": 5849 + }, + { + "epoch": 1.471327967806841, + "grad_norm": 0.3332252502441406, + "learning_rate": 6.03216236244744e-06, + "loss": 0.3334, + "step": 5850 + }, + { + "epoch": 1.471579476861167, + "grad_norm": 0.3057732582092285, + "learning_rate": 6.030730587753106e-06, + "loss": 0.37, + "step": 5851 + }, + { + "epoch": 1.471830985915493, + "grad_norm": 0.28898757696151733, + "learning_rate": 6.0292987247831455e-06, + "loss": 0.3595, + "step": 5852 + }, + { + "epoch": 1.4720824949698188, + "grad_norm": 0.3072353005409241, + "learning_rate": 6.0278667736601885e-06, + "loss": 0.3623, + "step": 5853 + }, + { + "epoch": 1.4723340040241448, + "grad_norm": 0.32753127813339233, + "learning_rate": 6.026434734506872e-06, + "loss": 0.3604, + "step": 5854 + }, + { + "epoch": 1.4725855130784709, + "grad_norm": 0.33749762177467346, + "learning_rate": 6.025002607445842e-06, + "loss": 0.3586, + "step": 5855 + }, + { + "epoch": 1.4728370221327967, + "grad_norm": 0.3170812427997589, + "learning_rate": 6.023570392599751e-06, + "loss": 0.3351, + "step": 5856 + }, + { + "epoch": 1.4730885311871227, + "grad_norm": 0.3415549099445343, + "learning_rate": 6.02213809009126e-06, + "loss": 0.334, + "step": 5857 + }, + { + "epoch": 1.4733400402414487, + "grad_norm": 0.32296910881996155, + "learning_rate": 6.020705700043036e-06, + "loss": 0.3375, + "step": 5858 + }, + { + "epoch": 1.4735915492957745, + "grad_norm": 0.3227507770061493, + "learning_rate": 6.019273222577754e-06, + "loss": 0.3523, + "step": 5859 + }, + { + "epoch": 1.4738430583501005, + "grad_norm": 0.3127809166908264, + "learning_rate": 6.017840657818097e-06, + "loss": 0.3279, + "step": 5860 + }, + { + "epoch": 1.4740945674044266, + "grad_norm": 0.3176402449607849, + "learning_rate": 6.016408005886756e-06, + "loss": 0.3604, + "step": 5861 + }, + { + "epoch": 1.4743460764587526, + "grad_norm": 0.3369061350822449, + "learning_rate": 6.014975266906427e-06, + "loss": 0.3406, + "step": 5862 + }, + { + "epoch": 1.4745975855130784, + "grad_norm": 0.3053755462169647, + "learning_rate": 6.0135424409998156e-06, + "loss": 0.3587, + "step": 5863 + }, + { + "epoch": 1.4748490945674044, + "grad_norm": 0.30855026841163635, + "learning_rate": 6.0121095282896356e-06, + "loss": 0.3404, + "step": 5864 + }, + { + "epoch": 1.4751006036217305, + "grad_norm": 0.31946873664855957, + "learning_rate": 6.010676528898606e-06, + "loss": 0.334, + "step": 5865 + }, + { + "epoch": 1.4753521126760563, + "grad_norm": 0.34600701928138733, + "learning_rate": 6.009243442949454e-06, + "loss": 0.3573, + "step": 5866 + }, + { + "epoch": 1.4756036217303823, + "grad_norm": 0.32230931520462036, + "learning_rate": 6.007810270564916e-06, + "loss": 0.3549, + "step": 5867 + }, + { + "epoch": 1.4758551307847083, + "grad_norm": 0.3308599293231964, + "learning_rate": 6.006377011867732e-06, + "loss": 0.3662, + "step": 5868 + }, + { + "epoch": 1.4761066398390343, + "grad_norm": 0.3071610629558563, + "learning_rate": 6.004943666980654e-06, + "loss": 0.334, + "step": 5869 + }, + { + "epoch": 1.4763581488933601, + "grad_norm": 0.30614978075027466, + "learning_rate": 6.003510236026436e-06, + "loss": 0.3623, + "step": 5870 + }, + { + "epoch": 1.4766096579476862, + "grad_norm": 0.325398325920105, + "learning_rate": 6.0020767191278465e-06, + "loss": 0.3395, + "step": 5871 + }, + { + "epoch": 1.4768611670020122, + "grad_norm": 0.3325035572052002, + "learning_rate": 6.000643116407654e-06, + "loss": 0.3651, + "step": 5872 + }, + { + "epoch": 1.477112676056338, + "grad_norm": 0.30189478397369385, + "learning_rate": 5.999209427988638e-06, + "loss": 0.3646, + "step": 5873 + }, + { + "epoch": 1.477364185110664, + "grad_norm": 0.3175562918186188, + "learning_rate": 5.997775653993586e-06, + "loss": 0.3582, + "step": 5874 + }, + { + "epoch": 1.47761569416499, + "grad_norm": 0.3137342929840088, + "learning_rate": 5.996341794545292e-06, + "loss": 0.3407, + "step": 5875 + }, + { + "epoch": 1.4778672032193159, + "grad_norm": 0.3235321342945099, + "learning_rate": 5.9949078497665555e-06, + "loss": 0.3353, + "step": 5876 + }, + { + "epoch": 1.4781187122736419, + "grad_norm": 0.3290002942085266, + "learning_rate": 5.993473819780185e-06, + "loss": 0.3676, + "step": 5877 + }, + { + "epoch": 1.478370221327968, + "grad_norm": 0.3220723271369934, + "learning_rate": 5.992039704708998e-06, + "loss": 0.356, + "step": 5878 + }, + { + "epoch": 1.4786217303822937, + "grad_norm": 0.33870404958724976, + "learning_rate": 5.9906055046758174e-06, + "loss": 0.3572, + "step": 5879 + }, + { + "epoch": 1.4788732394366197, + "grad_norm": 0.3206806778907776, + "learning_rate": 5.989171219803471e-06, + "loss": 0.3593, + "step": 5880 + }, + { + "epoch": 1.4791247484909458, + "grad_norm": 0.35171839594841003, + "learning_rate": 5.987736850214798e-06, + "loss": 0.357, + "step": 5881 + }, + { + "epoch": 1.4793762575452716, + "grad_norm": 0.3402028977870941, + "learning_rate": 5.986302396032644e-06, + "loss": 0.3724, + "step": 5882 + }, + { + "epoch": 1.4796277665995976, + "grad_norm": 0.35204577445983887, + "learning_rate": 5.98486785737986e-06, + "loss": 0.35, + "step": 5883 + }, + { + "epoch": 1.4798792756539236, + "grad_norm": 0.3249172568321228, + "learning_rate": 5.983433234379306e-06, + "loss": 0.3657, + "step": 5884 + }, + { + "epoch": 1.4801307847082494, + "grad_norm": 0.3084418475627899, + "learning_rate": 5.9819985271538465e-06, + "loss": 0.3527, + "step": 5885 + }, + { + "epoch": 1.4803822937625755, + "grad_norm": 0.3349623382091522, + "learning_rate": 5.980563735826355e-06, + "loss": 0.3399, + "step": 5886 + }, + { + "epoch": 1.4806338028169015, + "grad_norm": 0.31369122862815857, + "learning_rate": 5.9791288605197175e-06, + "loss": 0.3405, + "step": 5887 + }, + { + "epoch": 1.4808853118712273, + "grad_norm": 0.3203195035457611, + "learning_rate": 5.9776939013568145e-06, + "loss": 0.3463, + "step": 5888 + }, + { + "epoch": 1.4811368209255533, + "grad_norm": 0.31735894083976746, + "learning_rate": 5.976258858460548e-06, + "loss": 0.3336, + "step": 5889 + }, + { + "epoch": 1.4813883299798793, + "grad_norm": 0.3369232416152954, + "learning_rate": 5.974823731953817e-06, + "loss": 0.3262, + "step": 5890 + }, + { + "epoch": 1.4816398390342052, + "grad_norm": 0.31136730313301086, + "learning_rate": 5.973388521959532e-06, + "loss": 0.3543, + "step": 5891 + }, + { + "epoch": 1.4818913480885312, + "grad_norm": 0.3076231777667999, + "learning_rate": 5.97195322860061e-06, + "loss": 0.3416, + "step": 5892 + }, + { + "epoch": 1.4821428571428572, + "grad_norm": 0.33634480834007263, + "learning_rate": 5.970517851999974e-06, + "loss": 0.345, + "step": 5893 + }, + { + "epoch": 1.482394366197183, + "grad_norm": 0.32039153575897217, + "learning_rate": 5.969082392280557e-06, + "loss": 0.3473, + "step": 5894 + }, + { + "epoch": 1.482645875251509, + "grad_norm": 0.3011680543422699, + "learning_rate": 5.967646849565294e-06, + "loss": 0.343, + "step": 5895 + }, + { + "epoch": 1.482897384305835, + "grad_norm": 0.33874598145484924, + "learning_rate": 5.966211223977132e-06, + "loss": 0.36, + "step": 5896 + }, + { + "epoch": 1.4831488933601609, + "grad_norm": 0.3423505127429962, + "learning_rate": 5.964775515639023e-06, + "loss": 0.3646, + "step": 5897 + }, + { + "epoch": 1.483400402414487, + "grad_norm": 0.33873575925827026, + "learning_rate": 5.963339724673928e-06, + "loss": 0.3458, + "step": 5898 + }, + { + "epoch": 1.483651911468813, + "grad_norm": 0.3190835416316986, + "learning_rate": 5.96190385120481e-06, + "loss": 0.3767, + "step": 5899 + }, + { + "epoch": 1.4839034205231387, + "grad_norm": 0.33720406889915466, + "learning_rate": 5.960467895354646e-06, + "loss": 0.3535, + "step": 5900 + }, + { + "epoch": 1.4841549295774648, + "grad_norm": 0.3207988440990448, + "learning_rate": 5.959031857246415e-06, + "loss": 0.3518, + "step": 5901 + }, + { + "epoch": 1.4844064386317908, + "grad_norm": 0.3778199255466461, + "learning_rate": 5.9575957370031065e-06, + "loss": 0.3516, + "step": 5902 + }, + { + "epoch": 1.4846579476861166, + "grad_norm": 0.3221379220485687, + "learning_rate": 5.956159534747713e-06, + "loss": 0.3216, + "step": 5903 + }, + { + "epoch": 1.4849094567404426, + "grad_norm": 0.296059787273407, + "learning_rate": 5.954723250603237e-06, + "loss": 0.3198, + "step": 5904 + }, + { + "epoch": 1.4851609657947686, + "grad_norm": 0.394718736410141, + "learning_rate": 5.953286884692688e-06, + "loss": 0.3623, + "step": 5905 + }, + { + "epoch": 1.4854124748490944, + "grad_norm": 0.3284577429294586, + "learning_rate": 5.9518504371390805e-06, + "loss": 0.3453, + "step": 5906 + }, + { + "epoch": 1.4856639839034205, + "grad_norm": 0.3131723999977112, + "learning_rate": 5.950413908065437e-06, + "loss": 0.3722, + "step": 5907 + }, + { + "epoch": 1.4859154929577465, + "grad_norm": 0.3282751142978668, + "learning_rate": 5.94897729759479e-06, + "loss": 0.3348, + "step": 5908 + }, + { + "epoch": 1.4861670020120723, + "grad_norm": 0.3348959684371948, + "learning_rate": 5.947540605850173e-06, + "loss": 0.3454, + "step": 5909 + }, + { + "epoch": 1.4864185110663983, + "grad_norm": 0.34094908833503723, + "learning_rate": 5.946103832954631e-06, + "loss": 0.3677, + "step": 5910 + }, + { + "epoch": 1.4866700201207244, + "grad_norm": 0.35626813769340515, + "learning_rate": 5.944666979031215e-06, + "loss": 0.3592, + "step": 5911 + }, + { + "epoch": 1.4869215291750504, + "grad_norm": 0.3323785960674286, + "learning_rate": 5.943230044202981e-06, + "loss": 0.3671, + "step": 5912 + }, + { + "epoch": 1.4871730382293762, + "grad_norm": 0.32840991020202637, + "learning_rate": 5.941793028592996e-06, + "loss": 0.3488, + "step": 5913 + }, + { + "epoch": 1.4874245472837022, + "grad_norm": 0.3254261314868927, + "learning_rate": 5.940355932324329e-06, + "loss": 0.343, + "step": 5914 + }, + { + "epoch": 1.4876760563380282, + "grad_norm": 0.3155461847782135, + "learning_rate": 5.938918755520059e-06, + "loss": 0.3398, + "step": 5915 + }, + { + "epoch": 1.487927565392354, + "grad_norm": 0.3168039619922638, + "learning_rate": 5.9374814983032724e-06, + "loss": 0.3664, + "step": 5916 + }, + { + "epoch": 1.48817907444668, + "grad_norm": 0.344101220369339, + "learning_rate": 5.9360441607970585e-06, + "loss": 0.3417, + "step": 5917 + }, + { + "epoch": 1.488430583501006, + "grad_norm": 0.3618612587451935, + "learning_rate": 5.93460674312452e-06, + "loss": 0.3684, + "step": 5918 + }, + { + "epoch": 1.4886820925553321, + "grad_norm": 0.32880085706710815, + "learning_rate": 5.933169245408761e-06, + "loss": 0.3381, + "step": 5919 + }, + { + "epoch": 1.488933601609658, + "grad_norm": 0.3657226860523224, + "learning_rate": 5.931731667772893e-06, + "loss": 0.3754, + "step": 5920 + }, + { + "epoch": 1.489185110663984, + "grad_norm": 0.3417291045188904, + "learning_rate": 5.930294010340035e-06, + "loss": 0.3586, + "step": 5921 + }, + { + "epoch": 1.48943661971831, + "grad_norm": 0.3205607235431671, + "learning_rate": 5.928856273233316e-06, + "loss": 0.3415, + "step": 5922 + }, + { + "epoch": 1.4896881287726358, + "grad_norm": 0.35127079486846924, + "learning_rate": 5.92741845657587e-06, + "loss": 0.3403, + "step": 5923 + }, + { + "epoch": 1.4899396378269618, + "grad_norm": 0.3201601207256317, + "learning_rate": 5.925980560490834e-06, + "loss": 0.3326, + "step": 5924 + }, + { + "epoch": 1.4901911468812878, + "grad_norm": 0.3274860978126526, + "learning_rate": 5.924542585101356e-06, + "loss": 0.3409, + "step": 5925 + }, + { + "epoch": 1.4904426559356136, + "grad_norm": 0.3243807852268219, + "learning_rate": 5.923104530530589e-06, + "loss": 0.3353, + "step": 5926 + }, + { + "epoch": 1.4906941649899397, + "grad_norm": 0.3296162486076355, + "learning_rate": 5.921666396901694e-06, + "loss": 0.3653, + "step": 5927 + }, + { + "epoch": 1.4909456740442657, + "grad_norm": 0.31820058822631836, + "learning_rate": 5.9202281843378385e-06, + "loss": 0.343, + "step": 5928 + }, + { + "epoch": 1.4911971830985915, + "grad_norm": 0.3398142457008362, + "learning_rate": 5.918789892962196e-06, + "loss": 0.3509, + "step": 5929 + }, + { + "epoch": 1.4914486921529175, + "grad_norm": 0.32767853140830994, + "learning_rate": 5.917351522897946e-06, + "loss": 0.3578, + "step": 5930 + }, + { + "epoch": 1.4917002012072436, + "grad_norm": 0.30963650345802307, + "learning_rate": 5.915913074268277e-06, + "loss": 0.3416, + "step": 5931 + }, + { + "epoch": 1.4919517102615694, + "grad_norm": 0.3078908920288086, + "learning_rate": 5.914474547196384e-06, + "loss": 0.3493, + "step": 5932 + }, + { + "epoch": 1.4922032193158954, + "grad_norm": 0.32465776801109314, + "learning_rate": 5.913035941805467e-06, + "loss": 0.3434, + "step": 5933 + }, + { + "epoch": 1.4924547283702214, + "grad_norm": 0.32990938425064087, + "learning_rate": 5.911597258218733e-06, + "loss": 0.3676, + "step": 5934 + }, + { + "epoch": 1.4927062374245472, + "grad_norm": 0.33850571513175964, + "learning_rate": 5.9101584965593975e-06, + "loss": 0.342, + "step": 5935 + }, + { + "epoch": 1.4929577464788732, + "grad_norm": 0.3495611846446991, + "learning_rate": 5.90871965695068e-06, + "loss": 0.3366, + "step": 5936 + }, + { + "epoch": 1.4932092555331993, + "grad_norm": 0.32349708676338196, + "learning_rate": 5.907280739515809e-06, + "loss": 0.3408, + "step": 5937 + }, + { + "epoch": 1.493460764587525, + "grad_norm": 0.3027772307395935, + "learning_rate": 5.905841744378019e-06, + "loss": 0.3582, + "step": 5938 + }, + { + "epoch": 1.493712273641851, + "grad_norm": 0.3265613913536072, + "learning_rate": 5.904402671660551e-06, + "loss": 0.3438, + "step": 5939 + }, + { + "epoch": 1.4939637826961771, + "grad_norm": 0.3158648908138275, + "learning_rate": 5.902963521486651e-06, + "loss": 0.3569, + "step": 5940 + }, + { + "epoch": 1.494215291750503, + "grad_norm": 0.3225020468235016, + "learning_rate": 5.901524293979575e-06, + "loss": 0.3403, + "step": 5941 + }, + { + "epoch": 1.494466800804829, + "grad_norm": 0.33680132031440735, + "learning_rate": 5.900084989262581e-06, + "loss": 0.3514, + "step": 5942 + }, + { + "epoch": 1.494718309859155, + "grad_norm": 0.30138635635375977, + "learning_rate": 5.898645607458941e-06, + "loss": 0.3465, + "step": 5943 + }, + { + "epoch": 1.4949698189134808, + "grad_norm": 0.3087744116783142, + "learning_rate": 5.897206148691925e-06, + "loss": 0.3362, + "step": 5944 + }, + { + "epoch": 1.4952213279678068, + "grad_norm": 0.3069364130496979, + "learning_rate": 5.895766613084817e-06, + "loss": 0.3122, + "step": 5945 + }, + { + "epoch": 1.4954728370221329, + "grad_norm": 0.289499968290329, + "learning_rate": 5.8943270007609026e-06, + "loss": 0.3384, + "step": 5946 + }, + { + "epoch": 1.4957243460764587, + "grad_norm": 0.3094853162765503, + "learning_rate": 5.892887311843474e-06, + "loss": 0.333, + "step": 5947 + }, + { + "epoch": 1.4959758551307847, + "grad_norm": 0.28878265619277954, + "learning_rate": 5.891447546455833e-06, + "loss": 0.32, + "step": 5948 + }, + { + "epoch": 1.4962273641851107, + "grad_norm": 0.32799869775772095, + "learning_rate": 5.890007704721288e-06, + "loss": 0.3462, + "step": 5949 + }, + { + "epoch": 1.4964788732394365, + "grad_norm": 0.30716651678085327, + "learning_rate": 5.88856778676315e-06, + "loss": 0.3626, + "step": 5950 + }, + { + "epoch": 1.4967303822937625, + "grad_norm": 0.31167250871658325, + "learning_rate": 5.88712779270474e-06, + "loss": 0.3524, + "step": 5951 + }, + { + "epoch": 1.4969818913480886, + "grad_norm": 0.31440672278404236, + "learning_rate": 5.885687722669384e-06, + "loss": 0.3277, + "step": 5952 + }, + { + "epoch": 1.4972334004024144, + "grad_norm": 0.28327929973602295, + "learning_rate": 5.884247576780416e-06, + "loss": 0.347, + "step": 5953 + }, + { + "epoch": 1.4974849094567404, + "grad_norm": 0.33180156350135803, + "learning_rate": 5.882807355161174e-06, + "loss": 0.3556, + "step": 5954 + }, + { + "epoch": 1.4977364185110664, + "grad_norm": 0.30268731713294983, + "learning_rate": 5.881367057935005e-06, + "loss": 0.337, + "step": 5955 + }, + { + "epoch": 1.4979879275653922, + "grad_norm": 0.32002800703048706, + "learning_rate": 5.879926685225264e-06, + "loss": 0.3529, + "step": 5956 + }, + { + "epoch": 1.4982394366197183, + "grad_norm": 0.31586262583732605, + "learning_rate": 5.878486237155304e-06, + "loss": 0.3336, + "step": 5957 + }, + { + "epoch": 1.4984909456740443, + "grad_norm": 0.3224563002586365, + "learning_rate": 5.877045713848495e-06, + "loss": 0.3587, + "step": 5958 + }, + { + "epoch": 1.49874245472837, + "grad_norm": 0.32255980372428894, + "learning_rate": 5.875605115428207e-06, + "loss": 0.3308, + "step": 5959 + }, + { + "epoch": 1.4989939637826961, + "grad_norm": 0.3053814172744751, + "learning_rate": 5.874164442017819e-06, + "loss": 0.3301, + "step": 5960 + }, + { + "epoch": 1.4992454728370221, + "grad_norm": 0.32673344016075134, + "learning_rate": 5.872723693740715e-06, + "loss": 0.3345, + "step": 5961 + }, + { + "epoch": 1.4994969818913482, + "grad_norm": 0.3168867230415344, + "learning_rate": 5.871282870720286e-06, + "loss": 0.3376, + "step": 5962 + }, + { + "epoch": 1.499748490945674, + "grad_norm": 0.3028486371040344, + "learning_rate": 5.869841973079931e-06, + "loss": 0.3365, + "step": 5963 + }, + { + "epoch": 1.5, + "grad_norm": 0.34289392828941345, + "learning_rate": 5.868401000943051e-06, + "loss": 0.348, + "step": 5964 + }, + { + "epoch": 1.5002515090543258, + "grad_norm": 0.3447049856185913, + "learning_rate": 5.866959954433058e-06, + "loss": 0.3576, + "step": 5965 + }, + { + "epoch": 1.500503018108652, + "grad_norm": 0.3279689848423004, + "learning_rate": 5.86551883367337e-06, + "loss": 0.3803, + "step": 5966 + }, + { + "epoch": 1.5007545271629779, + "grad_norm": 0.3094026744365692, + "learning_rate": 5.8640776387874085e-06, + "loss": 0.351, + "step": 5967 + }, + { + "epoch": 1.5010060362173037, + "grad_norm": 0.3381515443325043, + "learning_rate": 5.8626363698986025e-06, + "loss": 0.3711, + "step": 5968 + }, + { + "epoch": 1.50125754527163, + "grad_norm": 0.3412519693374634, + "learning_rate": 5.861195027130388e-06, + "loss": 0.3376, + "step": 5969 + }, + { + "epoch": 1.5015090543259557, + "grad_norm": 0.33772408962249756, + "learning_rate": 5.859753610606207e-06, + "loss": 0.3629, + "step": 5970 + }, + { + "epoch": 1.5017605633802817, + "grad_norm": 0.3265193700790405, + "learning_rate": 5.858312120449507e-06, + "loss": 0.347, + "step": 5971 + }, + { + "epoch": 1.5020120724346078, + "grad_norm": 0.3511604368686676, + "learning_rate": 5.856870556783746e-06, + "loss": 0.3296, + "step": 5972 + }, + { + "epoch": 1.5022635814889336, + "grad_norm": 0.3105302155017853, + "learning_rate": 5.85542891973238e-06, + "loss": 0.3374, + "step": 5973 + }, + { + "epoch": 1.5025150905432596, + "grad_norm": 0.34747567772865295, + "learning_rate": 5.85398720941888e-06, + "loss": 0.3545, + "step": 5974 + }, + { + "epoch": 1.5027665995975856, + "grad_norm": 0.3080263137817383, + "learning_rate": 5.852545425966717e-06, + "loss": 0.3281, + "step": 5975 + }, + { + "epoch": 1.5030181086519114, + "grad_norm": 0.34550175070762634, + "learning_rate": 5.851103569499372e-06, + "loss": 0.3526, + "step": 5976 + }, + { + "epoch": 1.5032696177062375, + "grad_norm": 0.3092076778411865, + "learning_rate": 5.849661640140332e-06, + "loss": 0.3381, + "step": 5977 + }, + { + "epoch": 1.5035211267605635, + "grad_norm": 0.31573188304901123, + "learning_rate": 5.848219638013086e-06, + "loss": 0.3599, + "step": 5978 + }, + { + "epoch": 1.5037726358148893, + "grad_norm": 0.2985522449016571, + "learning_rate": 5.846777563241136e-06, + "loss": 0.3504, + "step": 5979 + }, + { + "epoch": 1.5040241448692153, + "grad_norm": 0.2984470725059509, + "learning_rate": 5.845335415947985e-06, + "loss": 0.334, + "step": 5980 + }, + { + "epoch": 1.5042756539235413, + "grad_norm": 0.3081437349319458, + "learning_rate": 5.8438931962571435e-06, + "loss": 0.3274, + "step": 5981 + }, + { + "epoch": 1.5045271629778671, + "grad_norm": 0.31499478220939636, + "learning_rate": 5.842450904292128e-06, + "loss": 0.3468, + "step": 5982 + }, + { + "epoch": 1.5047786720321932, + "grad_norm": 0.30446550250053406, + "learning_rate": 5.841008540176465e-06, + "loss": 0.3299, + "step": 5983 + }, + { + "epoch": 1.5050301810865192, + "grad_norm": 0.2896031141281128, + "learning_rate": 5.83956610403368e-06, + "loss": 0.3371, + "step": 5984 + }, + { + "epoch": 1.505281690140845, + "grad_norm": 0.3181334435939789, + "learning_rate": 5.838123595987312e-06, + "loss": 0.3294, + "step": 5985 + }, + { + "epoch": 1.505533199195171, + "grad_norm": 0.32969263195991516, + "learning_rate": 5.8366810161609e-06, + "loss": 0.3314, + "step": 5986 + }, + { + "epoch": 1.505784708249497, + "grad_norm": 0.3354569375514984, + "learning_rate": 5.835238364677994e-06, + "loss": 0.3647, + "step": 5987 + }, + { + "epoch": 1.5060362173038229, + "grad_norm": 0.30361291766166687, + "learning_rate": 5.8337956416621465e-06, + "loss": 0.3302, + "step": 5988 + }, + { + "epoch": 1.506287726358149, + "grad_norm": 0.3423859775066376, + "learning_rate": 5.832352847236919e-06, + "loss": 0.3275, + "step": 5989 + }, + { + "epoch": 1.506539235412475, + "grad_norm": 0.3418254554271698, + "learning_rate": 5.830909981525879e-06, + "loss": 0.3161, + "step": 5990 + }, + { + "epoch": 1.5067907444668007, + "grad_norm": 0.33632370829582214, + "learning_rate": 5.829467044652595e-06, + "loss": 0.3405, + "step": 5991 + }, + { + "epoch": 1.5070422535211268, + "grad_norm": 0.31483152508735657, + "learning_rate": 5.828024036740649e-06, + "loss": 0.3588, + "step": 5992 + }, + { + "epoch": 1.5072937625754528, + "grad_norm": 0.3332071900367737, + "learning_rate": 5.826580957913624e-06, + "loss": 0.3365, + "step": 5993 + }, + { + "epoch": 1.5075452716297786, + "grad_norm": 0.3333156704902649, + "learning_rate": 5.825137808295111e-06, + "loss": 0.3691, + "step": 5994 + }, + { + "epoch": 1.5077967806841046, + "grad_norm": 0.32617276906967163, + "learning_rate": 5.823694588008707e-06, + "loss": 0.3303, + "step": 5995 + }, + { + "epoch": 1.5080482897384306, + "grad_norm": 0.3059340715408325, + "learning_rate": 5.822251297178014e-06, + "loss": 0.3357, + "step": 5996 + }, + { + "epoch": 1.5082997987927564, + "grad_norm": 0.3015100359916687, + "learning_rate": 5.82080793592664e-06, + "loss": 0.3366, + "step": 5997 + }, + { + "epoch": 1.5085513078470825, + "grad_norm": 0.3392370939254761, + "learning_rate": 5.819364504378203e-06, + "loss": 0.3774, + "step": 5998 + }, + { + "epoch": 1.5088028169014085, + "grad_norm": 0.34105855226516724, + "learning_rate": 5.817921002656323e-06, + "loss": 0.3507, + "step": 5999 + }, + { + "epoch": 1.5090543259557343, + "grad_norm": 0.3226035237312317, + "learning_rate": 5.816477430884625e-06, + "loss": 0.3672, + "step": 6000 + }, + { + "epoch": 1.5093058350100603, + "grad_norm": 0.32897132635116577, + "learning_rate": 5.815033789186743e-06, + "loss": 0.354, + "step": 6001 + }, + { + "epoch": 1.5095573440643864, + "grad_norm": 0.30205658078193665, + "learning_rate": 5.813590077686315e-06, + "loss": 0.3416, + "step": 6002 + }, + { + "epoch": 1.5098088531187122, + "grad_norm": 0.33135828375816345, + "learning_rate": 5.812146296506987e-06, + "loss": 0.3654, + "step": 6003 + }, + { + "epoch": 1.5100603621730382, + "grad_norm": 0.3252605199813843, + "learning_rate": 5.810702445772411e-06, + "loss": 0.3617, + "step": 6004 + }, + { + "epoch": 1.5103118712273642, + "grad_norm": 0.3258083760738373, + "learning_rate": 5.809258525606241e-06, + "loss": 0.3513, + "step": 6005 + }, + { + "epoch": 1.51056338028169, + "grad_norm": 0.3420291244983673, + "learning_rate": 5.807814536132141e-06, + "loss": 0.3803, + "step": 6006 + }, + { + "epoch": 1.510814889336016, + "grad_norm": 0.317022442817688, + "learning_rate": 5.806370477473778e-06, + "loss": 0.35, + "step": 6007 + }, + { + "epoch": 1.511066398390342, + "grad_norm": 0.30522531270980835, + "learning_rate": 5.80492634975483e-06, + "loss": 0.3376, + "step": 6008 + }, + { + "epoch": 1.5113179074446679, + "grad_norm": 0.3109252452850342, + "learning_rate": 5.803482153098976e-06, + "loss": 0.3218, + "step": 6009 + }, + { + "epoch": 1.5115694164989941, + "grad_norm": 0.3221626281738281, + "learning_rate": 5.802037887629902e-06, + "loss": 0.3244, + "step": 6010 + }, + { + "epoch": 1.51182092555332, + "grad_norm": 0.31988272070884705, + "learning_rate": 5.800593553471301e-06, + "loss": 0.3714, + "step": 6011 + }, + { + "epoch": 1.5120724346076457, + "grad_norm": 0.298286497592926, + "learning_rate": 5.79914915074687e-06, + "loss": 0.3457, + "step": 6012 + }, + { + "epoch": 1.512323943661972, + "grad_norm": 0.3138798773288727, + "learning_rate": 5.797704679580313e-06, + "loss": 0.3422, + "step": 6013 + }, + { + "epoch": 1.5125754527162978, + "grad_norm": 0.307769238948822, + "learning_rate": 5.796260140095343e-06, + "loss": 0.3435, + "step": 6014 + }, + { + "epoch": 1.5128269617706236, + "grad_norm": 0.30271801352500916, + "learning_rate": 5.79481553241567e-06, + "loss": 0.3565, + "step": 6015 + }, + { + "epoch": 1.5130784708249498, + "grad_norm": 0.31750625371932983, + "learning_rate": 5.793370856665021e-06, + "loss": 0.3732, + "step": 6016 + }, + { + "epoch": 1.5133299798792756, + "grad_norm": 0.3040974736213684, + "learning_rate": 5.791926112967122e-06, + "loss": 0.3527, + "step": 6017 + }, + { + "epoch": 1.5135814889336014, + "grad_norm": 0.2947345972061157, + "learning_rate": 5.790481301445704e-06, + "loss": 0.3503, + "step": 6018 + }, + { + "epoch": 1.5138329979879277, + "grad_norm": 0.3347560167312622, + "learning_rate": 5.789036422224508e-06, + "loss": 0.3437, + "step": 6019 + }, + { + "epoch": 1.5140845070422535, + "grad_norm": 0.2965729236602783, + "learning_rate": 5.78759147542728e-06, + "loss": 0.3459, + "step": 6020 + }, + { + "epoch": 1.5143360160965795, + "grad_norm": 0.2949177622795105, + "learning_rate": 5.786146461177769e-06, + "loss": 0.3359, + "step": 6021 + }, + { + "epoch": 1.5145875251509056, + "grad_norm": 0.3233572542667389, + "learning_rate": 5.7847013795997306e-06, + "loss": 0.376, + "step": 6022 + }, + { + "epoch": 1.5148390342052314, + "grad_norm": 0.3046778440475464, + "learning_rate": 5.7832562308169286e-06, + "loss": 0.339, + "step": 6023 + }, + { + "epoch": 1.5150905432595574, + "grad_norm": 0.2932363450527191, + "learning_rate": 5.78181101495313e-06, + "loss": 0.3297, + "step": 6024 + }, + { + "epoch": 1.5153420523138834, + "grad_norm": 0.29733753204345703, + "learning_rate": 5.78036573213211e-06, + "loss": 0.3531, + "step": 6025 + }, + { + "epoch": 1.5155935613682092, + "grad_norm": 0.2947341501712799, + "learning_rate": 5.778920382477647e-06, + "loss": 0.3486, + "step": 6026 + }, + { + "epoch": 1.5158450704225352, + "grad_norm": 0.3160892724990845, + "learning_rate": 5.7774749661135255e-06, + "loss": 0.3706, + "step": 6027 + }, + { + "epoch": 1.5160965794768613, + "grad_norm": 0.3119431734085083, + "learning_rate": 5.776029483163538e-06, + "loss": 0.3675, + "step": 6028 + }, + { + "epoch": 1.516348088531187, + "grad_norm": 0.30674445629119873, + "learning_rate": 5.774583933751477e-06, + "loss": 0.3458, + "step": 6029 + }, + { + "epoch": 1.516599597585513, + "grad_norm": 0.3203643560409546, + "learning_rate": 5.773138318001151e-06, + "loss": 0.3581, + "step": 6030 + }, + { + "epoch": 1.5168511066398391, + "grad_norm": 0.3252681493759155, + "learning_rate": 5.771692636036365e-06, + "loss": 0.3449, + "step": 6031 + }, + { + "epoch": 1.517102615694165, + "grad_norm": 0.30788522958755493, + "learning_rate": 5.77024688798093e-06, + "loss": 0.3391, + "step": 6032 + }, + { + "epoch": 1.517354124748491, + "grad_norm": 0.3264254331588745, + "learning_rate": 5.76880107395867e-06, + "loss": 0.3492, + "step": 6033 + }, + { + "epoch": 1.517605633802817, + "grad_norm": 0.31012457609176636, + "learning_rate": 5.767355194093407e-06, + "loss": 0.3388, + "step": 6034 + }, + { + "epoch": 1.5178571428571428, + "grad_norm": 0.2838296592235565, + "learning_rate": 5.765909248508972e-06, + "loss": 0.344, + "step": 6035 + }, + { + "epoch": 1.5181086519114688, + "grad_norm": 0.29018688201904297, + "learning_rate": 5.764463237329201e-06, + "loss": 0.3587, + "step": 6036 + }, + { + "epoch": 1.5183601609657948, + "grad_norm": 0.3031754195690155, + "learning_rate": 5.7630171606779365e-06, + "loss": 0.3667, + "step": 6037 + }, + { + "epoch": 1.5186116700201207, + "grad_norm": 0.3265339732170105, + "learning_rate": 5.761571018679025e-06, + "loss": 0.3765, + "step": 6038 + }, + { + "epoch": 1.5188631790744467, + "grad_norm": 0.3091321289539337, + "learning_rate": 5.760124811456322e-06, + "loss": 0.3388, + "step": 6039 + }, + { + "epoch": 1.5191146881287727, + "grad_norm": 0.32367005944252014, + "learning_rate": 5.758678539133682e-06, + "loss": 0.342, + "step": 6040 + }, + { + "epoch": 1.5193661971830985, + "grad_norm": 0.3425564765930176, + "learning_rate": 5.757232201834973e-06, + "loss": 0.3465, + "step": 6041 + }, + { + "epoch": 1.5196177062374245, + "grad_norm": 0.28837791085243225, + "learning_rate": 5.755785799684063e-06, + "loss": 0.3377, + "step": 6042 + }, + { + "epoch": 1.5198692152917506, + "grad_norm": 0.31125667691230774, + "learning_rate": 5.754339332804826e-06, + "loss": 0.3478, + "step": 6043 + }, + { + "epoch": 1.5201207243460764, + "grad_norm": 0.342917263507843, + "learning_rate": 5.752892801321146e-06, + "loss": 0.3409, + "step": 6044 + }, + { + "epoch": 1.5203722334004024, + "grad_norm": 0.3406009376049042, + "learning_rate": 5.751446205356906e-06, + "loss": 0.3429, + "step": 6045 + }, + { + "epoch": 1.5206237424547284, + "grad_norm": 0.32107996940612793, + "learning_rate": 5.749999545036001e-06, + "loss": 0.332, + "step": 6046 + }, + { + "epoch": 1.5208752515090542, + "grad_norm": 0.31515464186668396, + "learning_rate": 5.7485528204823275e-06, + "loss": 0.3561, + "step": 6047 + }, + { + "epoch": 1.5211267605633803, + "grad_norm": 0.2875670790672302, + "learning_rate": 5.7471060318197856e-06, + "loss": 0.3564, + "step": 6048 + }, + { + "epoch": 1.5213782696177063, + "grad_norm": 0.3263548016548157, + "learning_rate": 5.7456591791722875e-06, + "loss": 0.3738, + "step": 6049 + }, + { + "epoch": 1.521629778672032, + "grad_norm": 0.30848369002342224, + "learning_rate": 5.744212262663745e-06, + "loss": 0.351, + "step": 6050 + }, + { + "epoch": 1.5218812877263581, + "grad_norm": 0.29957401752471924, + "learning_rate": 5.742765282418077e-06, + "loss": 0.3695, + "step": 6051 + }, + { + "epoch": 1.5221327967806841, + "grad_norm": 0.30603325366973877, + "learning_rate": 5.74131823855921e-06, + "loss": 0.3635, + "step": 6052 + }, + { + "epoch": 1.52238430583501, + "grad_norm": 0.31580379605293274, + "learning_rate": 5.739871131211074e-06, + "loss": 0.3334, + "step": 6053 + }, + { + "epoch": 1.522635814889336, + "grad_norm": 0.31090569496154785, + "learning_rate": 5.738423960497604e-06, + "loss": 0.3362, + "step": 6054 + }, + { + "epoch": 1.522887323943662, + "grad_norm": 0.33442890644073486, + "learning_rate": 5.736976726542742e-06, + "loss": 0.363, + "step": 6055 + }, + { + "epoch": 1.5231388329979878, + "grad_norm": 0.3184659481048584, + "learning_rate": 5.735529429470433e-06, + "loss": 0.3529, + "step": 6056 + }, + { + "epoch": 1.5233903420523138, + "grad_norm": 0.3236134946346283, + "learning_rate": 5.734082069404631e-06, + "loss": 0.3506, + "step": 6057 + }, + { + "epoch": 1.5236418511066399, + "grad_norm": 0.29839229583740234, + "learning_rate": 5.732634646469291e-06, + "loss": 0.3242, + "step": 6058 + }, + { + "epoch": 1.5238933601609657, + "grad_norm": 0.30354323983192444, + "learning_rate": 5.731187160788377e-06, + "loss": 0.3657, + "step": 6059 + }, + { + "epoch": 1.524144869215292, + "grad_norm": 0.3145615756511688, + "learning_rate": 5.729739612485857e-06, + "loss": 0.3592, + "step": 6060 + }, + { + "epoch": 1.5243963782696177, + "grad_norm": 0.322421133518219, + "learning_rate": 5.728292001685706e-06, + "loss": 0.3392, + "step": 6061 + }, + { + "epoch": 1.5246478873239435, + "grad_norm": 0.3052886128425598, + "learning_rate": 5.726844328511898e-06, + "loss": 0.3416, + "step": 6062 + }, + { + "epoch": 1.5248993963782698, + "grad_norm": 0.2889304459095001, + "learning_rate": 5.725396593088423e-06, + "loss": 0.3424, + "step": 6063 + }, + { + "epoch": 1.5251509054325956, + "grad_norm": 0.3219023048877716, + "learning_rate": 5.723948795539267e-06, + "loss": 0.3305, + "step": 6064 + }, + { + "epoch": 1.5254024144869214, + "grad_norm": 0.3205178678035736, + "learning_rate": 5.722500935988425e-06, + "loss": 0.3336, + "step": 6065 + }, + { + "epoch": 1.5256539235412476, + "grad_norm": 0.3172166347503662, + "learning_rate": 5.721053014559898e-06, + "loss": 0.3496, + "step": 6066 + }, + { + "epoch": 1.5259054325955734, + "grad_norm": 0.30717623233795166, + "learning_rate": 5.719605031377693e-06, + "loss": 0.3197, + "step": 6067 + }, + { + "epoch": 1.5261569416498992, + "grad_norm": 0.33775895833969116, + "learning_rate": 5.718156986565817e-06, + "loss": 0.3463, + "step": 6068 + }, + { + "epoch": 1.5264084507042255, + "grad_norm": 0.31737133860588074, + "learning_rate": 5.71670888024829e-06, + "loss": 0.3468, + "step": 6069 + }, + { + "epoch": 1.5266599597585513, + "grad_norm": 0.33389195799827576, + "learning_rate": 5.715260712549129e-06, + "loss": 0.364, + "step": 6070 + }, + { + "epoch": 1.5269114688128773, + "grad_norm": 0.32125452160835266, + "learning_rate": 5.713812483592364e-06, + "loss": 0.331, + "step": 6071 + }, + { + "epoch": 1.5271629778672033, + "grad_norm": 0.319833904504776, + "learning_rate": 5.712364193502024e-06, + "loss": 0.3332, + "step": 6072 + }, + { + "epoch": 1.5274144869215291, + "grad_norm": 0.3113959729671478, + "learning_rate": 5.710915842402147e-06, + "loss": 0.3545, + "step": 6073 + }, + { + "epoch": 1.5276659959758552, + "grad_norm": 0.31851494312286377, + "learning_rate": 5.7094674304167766e-06, + "loss": 0.3371, + "step": 6074 + }, + { + "epoch": 1.5279175050301812, + "grad_norm": 0.3372229337692261, + "learning_rate": 5.708018957669959e-06, + "loss": 0.3374, + "step": 6075 + }, + { + "epoch": 1.528169014084507, + "grad_norm": 0.3625347316265106, + "learning_rate": 5.706570424285747e-06, + "loss": 0.3519, + "step": 6076 + }, + { + "epoch": 1.528420523138833, + "grad_norm": 0.31328415870666504, + "learning_rate": 5.705121830388199e-06, + "loss": 0.3275, + "step": 6077 + }, + { + "epoch": 1.528672032193159, + "grad_norm": 0.36509522795677185, + "learning_rate": 5.703673176101377e-06, + "loss": 0.3426, + "step": 6078 + }, + { + "epoch": 1.5289235412474849, + "grad_norm": 0.3331649601459503, + "learning_rate": 5.702224461549351e-06, + "loss": 0.348, + "step": 6079 + }, + { + "epoch": 1.529175050301811, + "grad_norm": 0.33841848373413086, + "learning_rate": 5.700775686856192e-06, + "loss": 0.3439, + "step": 6080 + }, + { + "epoch": 1.529426559356137, + "grad_norm": 0.35388943552970886, + "learning_rate": 5.699326852145979e-06, + "loss": 0.3504, + "step": 6081 + }, + { + "epoch": 1.5296780684104627, + "grad_norm": 0.31031563878059387, + "learning_rate": 5.697877957542795e-06, + "loss": 0.3449, + "step": 6082 + }, + { + "epoch": 1.5299295774647887, + "grad_norm": 0.34058165550231934, + "learning_rate": 5.696429003170732e-06, + "loss": 0.347, + "step": 6083 + }, + { + "epoch": 1.5301810865191148, + "grad_norm": 0.30972498655319214, + "learning_rate": 5.69497998915388e-06, + "loss": 0.3401, + "step": 6084 + }, + { + "epoch": 1.5304325955734406, + "grad_norm": 0.32734933495521545, + "learning_rate": 5.693530915616341e-06, + "loss": 0.3593, + "step": 6085 + }, + { + "epoch": 1.5306841046277666, + "grad_norm": 0.34089869260787964, + "learning_rate": 5.6920817826822176e-06, + "loss": 0.3545, + "step": 6086 + }, + { + "epoch": 1.5309356136820926, + "grad_norm": 0.3333452641963959, + "learning_rate": 5.690632590475618e-06, + "loss": 0.3751, + "step": 6087 + }, + { + "epoch": 1.5311871227364184, + "grad_norm": 0.3344314992427826, + "learning_rate": 5.6891833391206585e-06, + "loss": 0.3573, + "step": 6088 + }, + { + "epoch": 1.5314386317907445, + "grad_norm": 0.2930935323238373, + "learning_rate": 5.687734028741459e-06, + "loss": 0.3553, + "step": 6089 + }, + { + "epoch": 1.5316901408450705, + "grad_norm": 0.32319191098213196, + "learning_rate": 5.68628465946214e-06, + "loss": 0.3395, + "step": 6090 + }, + { + "epoch": 1.5319416498993963, + "grad_norm": 0.30757129192352295, + "learning_rate": 5.684835231406835e-06, + "loss": 0.3526, + "step": 6091 + }, + { + "epoch": 1.5321931589537223, + "grad_norm": 0.31132009625434875, + "learning_rate": 5.683385744699675e-06, + "loss": 0.3385, + "step": 6092 + }, + { + "epoch": 1.5324446680080483, + "grad_norm": 0.31980809569358826, + "learning_rate": 5.681936199464803e-06, + "loss": 0.3532, + "step": 6093 + }, + { + "epoch": 1.5326961770623742, + "grad_norm": 0.29808422923088074, + "learning_rate": 5.680486595826361e-06, + "loss": 0.3306, + "step": 6094 + }, + { + "epoch": 1.5329476861167002, + "grad_norm": 0.3331838548183441, + "learning_rate": 5.6790369339085e-06, + "loss": 0.3695, + "step": 6095 + }, + { + "epoch": 1.5331991951710262, + "grad_norm": 0.3027997612953186, + "learning_rate": 5.677587213835372e-06, + "loss": 0.3521, + "step": 6096 + }, + { + "epoch": 1.533450704225352, + "grad_norm": 0.3264061212539673, + "learning_rate": 5.676137435731139e-06, + "loss": 0.3425, + "step": 6097 + }, + { + "epoch": 1.533702213279678, + "grad_norm": 0.32032856345176697, + "learning_rate": 5.674687599719963e-06, + "loss": 0.343, + "step": 6098 + }, + { + "epoch": 1.533953722334004, + "grad_norm": 0.33043432235717773, + "learning_rate": 5.673237705926018e-06, + "loss": 0.3522, + "step": 6099 + }, + { + "epoch": 1.5342052313883299, + "grad_norm": 0.30190953612327576, + "learning_rate": 5.6717877544734735e-06, + "loss": 0.3511, + "step": 6100 + }, + { + "epoch": 1.534456740442656, + "grad_norm": 0.2926798462867737, + "learning_rate": 5.670337745486511e-06, + "loss": 0.3662, + "step": 6101 + }, + { + "epoch": 1.534708249496982, + "grad_norm": 0.289917528629303, + "learning_rate": 5.668887679089314e-06, + "loss": 0.3499, + "step": 6102 + }, + { + "epoch": 1.5349597585513077, + "grad_norm": 0.3212122619152069, + "learning_rate": 5.66743755540607e-06, + "loss": 0.3655, + "step": 6103 + }, + { + "epoch": 1.5352112676056338, + "grad_norm": 0.31146296858787537, + "learning_rate": 5.665987374560977e-06, + "loss": 0.3524, + "step": 6104 + }, + { + "epoch": 1.5354627766599598, + "grad_norm": 0.3085387647151947, + "learning_rate": 5.66453713667823e-06, + "loss": 0.3444, + "step": 6105 + }, + { + "epoch": 1.5357142857142856, + "grad_norm": 0.3076164126396179, + "learning_rate": 5.663086841882036e-06, + "loss": 0.3589, + "step": 6106 + }, + { + "epoch": 1.5359657947686118, + "grad_norm": 0.36159980297088623, + "learning_rate": 5.661636490296602e-06, + "loss": 0.3798, + "step": 6107 + }, + { + "epoch": 1.5362173038229376, + "grad_norm": 0.33638373017311096, + "learning_rate": 5.660186082046142e-06, + "loss": 0.3668, + "step": 6108 + }, + { + "epoch": 1.5364688128772634, + "grad_norm": 0.2957940697669983, + "learning_rate": 5.658735617254874e-06, + "loss": 0.3312, + "step": 6109 + }, + { + "epoch": 1.5367203219315897, + "grad_norm": 0.3306633532047272, + "learning_rate": 5.6572850960470215e-06, + "loss": 0.3716, + "step": 6110 + }, + { + "epoch": 1.5369718309859155, + "grad_norm": 0.2976609468460083, + "learning_rate": 5.655834518546813e-06, + "loss": 0.3471, + "step": 6111 + }, + { + "epoch": 1.5372233400402413, + "grad_norm": 0.34043240547180176, + "learning_rate": 5.654383884878481e-06, + "loss": 0.332, + "step": 6112 + }, + { + "epoch": 1.5374748490945676, + "grad_norm": 0.3192569315433502, + "learning_rate": 5.6529331951662615e-06, + "loss": 0.3513, + "step": 6113 + }, + { + "epoch": 1.5377263581488934, + "grad_norm": 0.2940548062324524, + "learning_rate": 5.6514824495344e-06, + "loss": 0.3441, + "step": 6114 + }, + { + "epoch": 1.5379778672032192, + "grad_norm": 0.3519013524055481, + "learning_rate": 5.650031648107142e-06, + "loss": 0.3306, + "step": 6115 + }, + { + "epoch": 1.5382293762575454, + "grad_norm": 0.3195810616016388, + "learning_rate": 5.648580791008739e-06, + "loss": 0.35, + "step": 6116 + }, + { + "epoch": 1.5384808853118712, + "grad_norm": 0.324769526720047, + "learning_rate": 5.647129878363449e-06, + "loss": 0.3416, + "step": 6117 + }, + { + "epoch": 1.538732394366197, + "grad_norm": 0.310882568359375, + "learning_rate": 5.645678910295533e-06, + "loss": 0.3592, + "step": 6118 + }, + { + "epoch": 1.5389839034205233, + "grad_norm": 0.3428153097629547, + "learning_rate": 5.64422788692926e-06, + "loss": 0.3694, + "step": 6119 + }, + { + "epoch": 1.539235412474849, + "grad_norm": 0.39914482831954956, + "learning_rate": 5.642776808388897e-06, + "loss": 0.3532, + "step": 6120 + }, + { + "epoch": 1.539486921529175, + "grad_norm": 0.3099226653575897, + "learning_rate": 5.641325674798722e-06, + "loss": 0.3303, + "step": 6121 + }, + { + "epoch": 1.5397384305835011, + "grad_norm": 0.32525354623794556, + "learning_rate": 5.639874486283015e-06, + "loss": 0.3348, + "step": 6122 + }, + { + "epoch": 1.539989939637827, + "grad_norm": 0.3201609253883362, + "learning_rate": 5.638423242966061e-06, + "loss": 0.3441, + "step": 6123 + }, + { + "epoch": 1.540241448692153, + "grad_norm": 0.31889769434928894, + "learning_rate": 5.63697194497215e-06, + "loss": 0.3829, + "step": 6124 + }, + { + "epoch": 1.540492957746479, + "grad_norm": 0.31827834248542786, + "learning_rate": 5.635520592425579e-06, + "loss": 0.3438, + "step": 6125 + }, + { + "epoch": 1.5407444668008048, + "grad_norm": 0.30125829577445984, + "learning_rate": 5.634069185450642e-06, + "loss": 0.3563, + "step": 6126 + }, + { + "epoch": 1.5409959758551308, + "grad_norm": 0.2951527237892151, + "learning_rate": 5.6326177241716466e-06, + "loss": 0.3313, + "step": 6127 + }, + { + "epoch": 1.5412474849094568, + "grad_norm": 0.32351893186569214, + "learning_rate": 5.631166208712902e-06, + "loss": 0.354, + "step": 6128 + }, + { + "epoch": 1.5414989939637826, + "grad_norm": 0.3129163384437561, + "learning_rate": 5.629714639198719e-06, + "loss": 0.3546, + "step": 6129 + }, + { + "epoch": 1.5417505030181087, + "grad_norm": 0.29270297288894653, + "learning_rate": 5.628263015753418e-06, + "loss": 0.3406, + "step": 6130 + }, + { + "epoch": 1.5420020120724347, + "grad_norm": 0.29806989431381226, + "learning_rate": 5.626811338501319e-06, + "loss": 0.3735, + "step": 6131 + }, + { + "epoch": 1.5422535211267605, + "grad_norm": 0.31030187010765076, + "learning_rate": 5.625359607566751e-06, + "loss": 0.3505, + "step": 6132 + }, + { + "epoch": 1.5425050301810865, + "grad_norm": 0.30579960346221924, + "learning_rate": 5.623907823074044e-06, + "loss": 0.3578, + "step": 6133 + }, + { + "epoch": 1.5427565392354126, + "grad_norm": 0.3121272325515747, + "learning_rate": 5.622455985147536e-06, + "loss": 0.3878, + "step": 6134 + }, + { + "epoch": 1.5430080482897384, + "grad_norm": 0.3233649730682373, + "learning_rate": 5.621004093911566e-06, + "loss": 0.3413, + "step": 6135 + }, + { + "epoch": 1.5432595573440644, + "grad_norm": 0.32058221101760864, + "learning_rate": 5.6195521494904815e-06, + "loss": 0.3535, + "step": 6136 + }, + { + "epoch": 1.5435110663983904, + "grad_norm": 0.3023648262023926, + "learning_rate": 5.61810015200863e-06, + "loss": 0.3657, + "step": 6137 + }, + { + "epoch": 1.5437625754527162, + "grad_norm": 0.3155832290649414, + "learning_rate": 5.616648101590367e-06, + "loss": 0.3481, + "step": 6138 + }, + { + "epoch": 1.5440140845070423, + "grad_norm": 0.33503812551498413, + "learning_rate": 5.615195998360053e-06, + "loss": 0.3623, + "step": 6139 + }, + { + "epoch": 1.5442655935613683, + "grad_norm": 0.31416836380958557, + "learning_rate": 5.61374384244205e-06, + "loss": 0.3691, + "step": 6140 + }, + { + "epoch": 1.544517102615694, + "grad_norm": 0.3015648424625397, + "learning_rate": 5.612291633960727e-06, + "loss": 0.3305, + "step": 6141 + }, + { + "epoch": 1.54476861167002, + "grad_norm": 0.33664670586586, + "learning_rate": 5.610839373040455e-06, + "loss": 0.3321, + "step": 6142 + }, + { + "epoch": 1.5450201207243461, + "grad_norm": 0.31471818685531616, + "learning_rate": 5.609387059805614e-06, + "loss": 0.3553, + "step": 6143 + }, + { + "epoch": 1.545271629778672, + "grad_norm": 0.3041958212852478, + "learning_rate": 5.607934694380581e-06, + "loss": 0.3485, + "step": 6144 + }, + { + "epoch": 1.545523138832998, + "grad_norm": 0.3049089312553406, + "learning_rate": 5.606482276889746e-06, + "loss": 0.3366, + "step": 6145 + }, + { + "epoch": 1.545774647887324, + "grad_norm": 0.2998283803462982, + "learning_rate": 5.605029807457499e-06, + "loss": 0.3243, + "step": 6146 + }, + { + "epoch": 1.5460261569416498, + "grad_norm": 0.32313594222068787, + "learning_rate": 5.603577286208234e-06, + "loss": 0.3821, + "step": 6147 + }, + { + "epoch": 1.5462776659959758, + "grad_norm": 0.3039160668849945, + "learning_rate": 5.602124713266349e-06, + "loss": 0.3721, + "step": 6148 + }, + { + "epoch": 1.5465291750503019, + "grad_norm": 0.31081444025039673, + "learning_rate": 5.60067208875625e-06, + "loss": 0.3554, + "step": 6149 + }, + { + "epoch": 1.5467806841046277, + "grad_norm": 0.3036777675151825, + "learning_rate": 5.599219412802344e-06, + "loss": 0.3459, + "step": 6150 + }, + { + "epoch": 1.5470321931589537, + "grad_norm": 0.2954856753349304, + "learning_rate": 5.597766685529043e-06, + "loss": 0.3439, + "step": 6151 + }, + { + "epoch": 1.5472837022132797, + "grad_norm": 0.3063904941082001, + "learning_rate": 5.596313907060766e-06, + "loss": 0.3452, + "step": 6152 + }, + { + "epoch": 1.5475352112676055, + "grad_norm": 0.33368048071861267, + "learning_rate": 5.594861077521935e-06, + "loss": 0.3523, + "step": 6153 + }, + { + "epoch": 1.5477867203219315, + "grad_norm": 0.31460052728652954, + "learning_rate": 5.593408197036973e-06, + "loss": 0.3389, + "step": 6154 + }, + { + "epoch": 1.5480382293762576, + "grad_norm": 0.32115140557289124, + "learning_rate": 5.59195526573031e-06, + "loss": 0.3714, + "step": 6155 + }, + { + "epoch": 1.5482897384305834, + "grad_norm": 0.3152860105037689, + "learning_rate": 5.590502283726383e-06, + "loss": 0.3539, + "step": 6156 + }, + { + "epoch": 1.5485412474849096, + "grad_norm": 0.33016350865364075, + "learning_rate": 5.5890492511496294e-06, + "loss": 0.366, + "step": 6157 + }, + { + "epoch": 1.5487927565392354, + "grad_norm": 0.3130359351634979, + "learning_rate": 5.587596168124493e-06, + "loss": 0.3789, + "step": 6158 + }, + { + "epoch": 1.5490442655935612, + "grad_norm": 0.3576628565788269, + "learning_rate": 5.5861430347754195e-06, + "loss": 0.3626, + "step": 6159 + }, + { + "epoch": 1.5492957746478875, + "grad_norm": 0.28835195302963257, + "learning_rate": 5.584689851226863e-06, + "loss": 0.3559, + "step": 6160 + }, + { + "epoch": 1.5495472837022133, + "grad_norm": 0.32597804069519043, + "learning_rate": 5.583236617603278e-06, + "loss": 0.3582, + "step": 6161 + }, + { + "epoch": 1.549798792756539, + "grad_norm": 0.29985523223876953, + "learning_rate": 5.5817833340291265e-06, + "loss": 0.3467, + "step": 6162 + }, + { + "epoch": 1.5500503018108653, + "grad_norm": 0.3118489384651184, + "learning_rate": 5.5803300006288704e-06, + "loss": 0.3401, + "step": 6163 + }, + { + "epoch": 1.5503018108651911, + "grad_norm": 0.302613228559494, + "learning_rate": 5.578876617526982e-06, + "loss": 0.3266, + "step": 6164 + }, + { + "epoch": 1.550553319919517, + "grad_norm": 0.3214384615421295, + "learning_rate": 5.577423184847932e-06, + "loss": 0.3369, + "step": 6165 + }, + { + "epoch": 1.5508048289738432, + "grad_norm": 0.2934117019176483, + "learning_rate": 5.575969702716199e-06, + "loss": 0.3559, + "step": 6166 + }, + { + "epoch": 1.551056338028169, + "grad_norm": 0.3137935698032379, + "learning_rate": 5.574516171256263e-06, + "loss": 0.3543, + "step": 6167 + }, + { + "epoch": 1.5513078470824948, + "grad_norm": 0.29603636264801025, + "learning_rate": 5.5730625905926114e-06, + "loss": 0.3514, + "step": 6168 + }, + { + "epoch": 1.551559356136821, + "grad_norm": 0.3102024793624878, + "learning_rate": 5.571608960849735e-06, + "loss": 0.3519, + "step": 6169 + }, + { + "epoch": 1.5518108651911469, + "grad_norm": 0.2983706593513489, + "learning_rate": 5.570155282152125e-06, + "loss": 0.3428, + "step": 6170 + }, + { + "epoch": 1.5520623742454729, + "grad_norm": 0.31096917390823364, + "learning_rate": 5.568701554624284e-06, + "loss": 0.3497, + "step": 6171 + }, + { + "epoch": 1.552313883299799, + "grad_norm": 0.28655806183815, + "learning_rate": 5.567247778390712e-06, + "loss": 0.3459, + "step": 6172 + }, + { + "epoch": 1.5525653923541247, + "grad_norm": 0.30332696437835693, + "learning_rate": 5.565793953575916e-06, + "loss": 0.3527, + "step": 6173 + }, + { + "epoch": 1.5528169014084507, + "grad_norm": 0.3258723020553589, + "learning_rate": 5.5643400803044075e-06, + "loss": 0.3454, + "step": 6174 + }, + { + "epoch": 1.5530684104627768, + "grad_norm": 0.34094491600990295, + "learning_rate": 5.5628861587007035e-06, + "loss": 0.3808, + "step": 6175 + }, + { + "epoch": 1.5533199195171026, + "grad_norm": 0.2914903461933136, + "learning_rate": 5.5614321888893195e-06, + "loss": 0.346, + "step": 6176 + }, + { + "epoch": 1.5535714285714286, + "grad_norm": 0.338826060295105, + "learning_rate": 5.559978170994781e-06, + "loss": 0.3567, + "step": 6177 + }, + { + "epoch": 1.5538229376257546, + "grad_norm": 0.3329686224460602, + "learning_rate": 5.558524105141616e-06, + "loss": 0.3439, + "step": 6178 + }, + { + "epoch": 1.5540744466800804, + "grad_norm": 0.3257056474685669, + "learning_rate": 5.557069991454356e-06, + "loss": 0.3378, + "step": 6179 + }, + { + "epoch": 1.5543259557344065, + "grad_norm": 0.3230164349079132, + "learning_rate": 5.5556158300575345e-06, + "loss": 0.3419, + "step": 6180 + }, + { + "epoch": 1.5545774647887325, + "grad_norm": 0.3321927487850189, + "learning_rate": 5.554161621075693e-06, + "loss": 0.3163, + "step": 6181 + }, + { + "epoch": 1.5548289738430583, + "grad_norm": 0.36797481775283813, + "learning_rate": 5.552707364633376e-06, + "loss": 0.3797, + "step": 6182 + }, + { + "epoch": 1.5550804828973843, + "grad_norm": 0.3352421820163727, + "learning_rate": 5.5512530608551315e-06, + "loss": 0.3408, + "step": 6183 + }, + { + "epoch": 1.5553319919517103, + "grad_norm": 0.2971833050251007, + "learning_rate": 5.549798709865512e-06, + "loss": 0.3406, + "step": 6184 + }, + { + "epoch": 1.5555835010060362, + "grad_norm": 0.30301129817962646, + "learning_rate": 5.5483443117890715e-06, + "loss": 0.3458, + "step": 6185 + }, + { + "epoch": 1.5558350100603622, + "grad_norm": 0.34085342288017273, + "learning_rate": 5.546889866750371e-06, + "loss": 0.3523, + "step": 6186 + }, + { + "epoch": 1.5560865191146882, + "grad_norm": 0.3280344307422638, + "learning_rate": 5.5454353748739755e-06, + "loss": 0.3563, + "step": 6187 + }, + { + "epoch": 1.556338028169014, + "grad_norm": 0.32044655084609985, + "learning_rate": 5.543980836284451e-06, + "loss": 0.3555, + "step": 6188 + }, + { + "epoch": 1.55658953722334, + "grad_norm": 0.30782240629196167, + "learning_rate": 5.542526251106372e-06, + "loss": 0.3402, + "step": 6189 + }, + { + "epoch": 1.556841046277666, + "grad_norm": 0.3036595284938812, + "learning_rate": 5.541071619464314e-06, + "loss": 0.3148, + "step": 6190 + }, + { + "epoch": 1.5570925553319919, + "grad_norm": 0.30741608142852783, + "learning_rate": 5.539616941482855e-06, + "loss": 0.3522, + "step": 6191 + }, + { + "epoch": 1.557344064386318, + "grad_norm": 0.32885679602622986, + "learning_rate": 5.538162217286581e-06, + "loss": 0.3384, + "step": 6192 + }, + { + "epoch": 1.557595573440644, + "grad_norm": 0.29733267426490784, + "learning_rate": 5.53670744700008e-06, + "loss": 0.3465, + "step": 6193 + }, + { + "epoch": 1.5578470824949697, + "grad_norm": 0.33164188265800476, + "learning_rate": 5.535252630747945e-06, + "loss": 0.3429, + "step": 6194 + }, + { + "epoch": 1.5580985915492958, + "grad_norm": 0.3217810392379761, + "learning_rate": 5.533797768654771e-06, + "loss": 0.3594, + "step": 6195 + }, + { + "epoch": 1.5583501006036218, + "grad_norm": 0.29132455587387085, + "learning_rate": 5.532342860845157e-06, + "loss": 0.3428, + "step": 6196 + }, + { + "epoch": 1.5586016096579476, + "grad_norm": 0.30485719442367554, + "learning_rate": 5.5308879074437065e-06, + "loss": 0.3193, + "step": 6197 + }, + { + "epoch": 1.5588531187122736, + "grad_norm": 0.310776025056839, + "learning_rate": 5.529432908575029e-06, + "loss": 0.3504, + "step": 6198 + }, + { + "epoch": 1.5591046277665996, + "grad_norm": 0.32021844387054443, + "learning_rate": 5.527977864363734e-06, + "loss": 0.3495, + "step": 6199 + }, + { + "epoch": 1.5593561368209254, + "grad_norm": 0.2971351146697998, + "learning_rate": 5.5265227749344385e-06, + "loss": 0.3765, + "step": 6200 + }, + { + "epoch": 1.5596076458752515, + "grad_norm": 0.30885204672813416, + "learning_rate": 5.525067640411761e-06, + "loss": 0.3426, + "step": 6201 + }, + { + "epoch": 1.5598591549295775, + "grad_norm": 0.31158149242401123, + "learning_rate": 5.523612460920326e-06, + "loss": 0.3468, + "step": 6202 + }, + { + "epoch": 1.5601106639839033, + "grad_norm": 0.3428870439529419, + "learning_rate": 5.5221572365847565e-06, + "loss": 0.3758, + "step": 6203 + }, + { + "epoch": 1.5603621730382293, + "grad_norm": 0.35247480869293213, + "learning_rate": 5.520701967529689e-06, + "loss": 0.3744, + "step": 6204 + }, + { + "epoch": 1.5606136820925554, + "grad_norm": 0.3332814872264862, + "learning_rate": 5.519246653879754e-06, + "loss": 0.3641, + "step": 6205 + }, + { + "epoch": 1.5608651911468812, + "grad_norm": 0.3046468198299408, + "learning_rate": 5.517791295759592e-06, + "loss": 0.3338, + "step": 6206 + }, + { + "epoch": 1.5611167002012074, + "grad_norm": 0.32283511757850647, + "learning_rate": 5.516335893293846e-06, + "loss": 0.3549, + "step": 6207 + }, + { + "epoch": 1.5613682092555332, + "grad_norm": 0.2913826107978821, + "learning_rate": 5.514880446607161e-06, + "loss": 0.319, + "step": 6208 + }, + { + "epoch": 1.561619718309859, + "grad_norm": 0.33693376183509827, + "learning_rate": 5.513424955824185e-06, + "loss": 0.3555, + "step": 6209 + }, + { + "epoch": 1.5618712273641853, + "grad_norm": 0.348818838596344, + "learning_rate": 5.511969421069574e-06, + "loss": 0.3493, + "step": 6210 + }, + { + "epoch": 1.562122736418511, + "grad_norm": 0.3071136474609375, + "learning_rate": 5.510513842467986e-06, + "loss": 0.3714, + "step": 6211 + }, + { + "epoch": 1.5623742454728369, + "grad_norm": 0.34389618039131165, + "learning_rate": 5.50905822014408e-06, + "loss": 0.3539, + "step": 6212 + }, + { + "epoch": 1.5626257545271631, + "grad_norm": 0.3115088641643524, + "learning_rate": 5.507602554222523e-06, + "loss": 0.3671, + "step": 6213 + }, + { + "epoch": 1.562877263581489, + "grad_norm": 0.3112759292125702, + "learning_rate": 5.506146844827981e-06, + "loss": 0.3616, + "step": 6214 + }, + { + "epoch": 1.5631287726358147, + "grad_norm": 0.32954081892967224, + "learning_rate": 5.50469109208513e-06, + "loss": 0.333, + "step": 6215 + }, + { + "epoch": 1.563380281690141, + "grad_norm": 0.3064148426055908, + "learning_rate": 5.503235296118643e-06, + "loss": 0.3339, + "step": 6216 + }, + { + "epoch": 1.5636317907444668, + "grad_norm": 0.29228824377059937, + "learning_rate": 5.501779457053202e-06, + "loss": 0.3435, + "step": 6217 + }, + { + "epoch": 1.5638832997987926, + "grad_norm": 0.32584089040756226, + "learning_rate": 5.50032357501349e-06, + "loss": 0.3494, + "step": 6218 + }, + { + "epoch": 1.5641348088531188, + "grad_norm": 0.29713818430900574, + "learning_rate": 5.498867650124193e-06, + "loss": 0.3463, + "step": 6219 + }, + { + "epoch": 1.5643863179074446, + "grad_norm": 0.31487134099006653, + "learning_rate": 5.4974116825100035e-06, + "loss": 0.3632, + "step": 6220 + }, + { + "epoch": 1.5646378269617707, + "grad_norm": 0.30101948976516724, + "learning_rate": 5.495955672295615e-06, + "loss": 0.3582, + "step": 6221 + }, + { + "epoch": 1.5648893360160967, + "grad_norm": 0.30046340823173523, + "learning_rate": 5.494499619605725e-06, + "loss": 0.3532, + "step": 6222 + }, + { + "epoch": 1.5651408450704225, + "grad_norm": 0.3313204050064087, + "learning_rate": 5.493043524565037e-06, + "loss": 0.3435, + "step": 6223 + }, + { + "epoch": 1.5653923541247485, + "grad_norm": 0.30352333188056946, + "learning_rate": 5.491587387298256e-06, + "loss": 0.3368, + "step": 6224 + }, + { + "epoch": 1.5656438631790746, + "grad_norm": 0.30736225843429565, + "learning_rate": 5.490131207930089e-06, + "loss": 0.3715, + "step": 6225 + }, + { + "epoch": 1.5658953722334004, + "grad_norm": 0.3192325234413147, + "learning_rate": 5.488674986585252e-06, + "loss": 0.3587, + "step": 6226 + }, + { + "epoch": 1.5661468812877264, + "grad_norm": 0.32336869835853577, + "learning_rate": 5.487218723388459e-06, + "loss": 0.3587, + "step": 6227 + }, + { + "epoch": 1.5663983903420524, + "grad_norm": 0.33127787709236145, + "learning_rate": 5.48576241846443e-06, + "loss": 0.3737, + "step": 6228 + }, + { + "epoch": 1.5666498993963782, + "grad_norm": 0.30386435985565186, + "learning_rate": 5.484306071937889e-06, + "loss": 0.3378, + "step": 6229 + }, + { + "epoch": 1.5669014084507042, + "grad_norm": 0.30401545763015747, + "learning_rate": 5.4828496839335635e-06, + "loss": 0.3511, + "step": 6230 + }, + { + "epoch": 1.5671529175050303, + "grad_norm": 0.327298641204834, + "learning_rate": 5.4813932545761815e-06, + "loss": 0.3497, + "step": 6231 + }, + { + "epoch": 1.567404426559356, + "grad_norm": 0.3015185296535492, + "learning_rate": 5.4799367839904805e-06, + "loss": 0.3462, + "step": 6232 + }, + { + "epoch": 1.567655935613682, + "grad_norm": 0.3208427429199219, + "learning_rate": 5.478480272301195e-06, + "loss": 0.3293, + "step": 6233 + }, + { + "epoch": 1.5679074446680081, + "grad_norm": 0.34195858240127563, + "learning_rate": 5.477023719633069e-06, + "loss": 0.3793, + "step": 6234 + }, + { + "epoch": 1.568158953722334, + "grad_norm": 0.31113073229789734, + "learning_rate": 5.4755671261108445e-06, + "loss": 0.3565, + "step": 6235 + }, + { + "epoch": 1.56841046277666, + "grad_norm": 0.3247259557247162, + "learning_rate": 5.474110491859272e-06, + "loss": 0.3352, + "step": 6236 + }, + { + "epoch": 1.568661971830986, + "grad_norm": 0.3409235179424286, + "learning_rate": 5.4726538170031e-06, + "loss": 0.3498, + "step": 6237 + }, + { + "epoch": 1.5689134808853118, + "grad_norm": 0.32643982768058777, + "learning_rate": 5.471197101667087e-06, + "loss": 0.3437, + "step": 6238 + }, + { + "epoch": 1.5691649899396378, + "grad_norm": 0.3240383267402649, + "learning_rate": 5.469740345975989e-06, + "loss": 0.3527, + "step": 6239 + }, + { + "epoch": 1.5694164989939638, + "grad_norm": 0.34534913301467896, + "learning_rate": 5.468283550054571e-06, + "loss": 0.3658, + "step": 6240 + }, + { + "epoch": 1.5696680080482897, + "grad_norm": 0.2972480356693268, + "learning_rate": 5.466826714027595e-06, + "loss": 0.3357, + "step": 6241 + }, + { + "epoch": 1.5699195171026157, + "grad_norm": 0.3179325461387634, + "learning_rate": 5.465369838019832e-06, + "loss": 0.3268, + "step": 6242 + }, + { + "epoch": 1.5701710261569417, + "grad_norm": 0.3161191940307617, + "learning_rate": 5.463912922156053e-06, + "loss": 0.3631, + "step": 6243 + }, + { + "epoch": 1.5704225352112675, + "grad_norm": 0.32680991291999817, + "learning_rate": 5.462455966561034e-06, + "loss": 0.3689, + "step": 6244 + }, + { + "epoch": 1.5706740442655935, + "grad_norm": 0.31402167677879333, + "learning_rate": 5.460998971359556e-06, + "loss": 0.3468, + "step": 6245 + }, + { + "epoch": 1.5709255533199196, + "grad_norm": 0.3343561589717865, + "learning_rate": 5.459541936676398e-06, + "loss": 0.3655, + "step": 6246 + }, + { + "epoch": 1.5711770623742454, + "grad_norm": 0.30885884165763855, + "learning_rate": 5.45808486263635e-06, + "loss": 0.3284, + "step": 6247 + }, + { + "epoch": 1.5714285714285714, + "grad_norm": 0.3167196214199066, + "learning_rate": 5.456627749364198e-06, + "loss": 0.3454, + "step": 6248 + }, + { + "epoch": 1.5716800804828974, + "grad_norm": 0.33008265495300293, + "learning_rate": 5.455170596984738e-06, + "loss": 0.3528, + "step": 6249 + }, + { + "epoch": 1.5719315895372232, + "grad_norm": 0.30799832940101624, + "learning_rate": 5.4537134056227626e-06, + "loss": 0.3536, + "step": 6250 + }, + { + "epoch": 1.5721830985915493, + "grad_norm": 0.29477211833000183, + "learning_rate": 5.452256175403072e-06, + "loss": 0.3448, + "step": 6251 + }, + { + "epoch": 1.5724346076458753, + "grad_norm": 0.3047988712787628, + "learning_rate": 5.4507989064504695e-06, + "loss": 0.3506, + "step": 6252 + }, + { + "epoch": 1.572686116700201, + "grad_norm": 0.32256054878234863, + "learning_rate": 5.4493415988897615e-06, + "loss": 0.3653, + "step": 6253 + }, + { + "epoch": 1.5729376257545271, + "grad_norm": 0.3178042471408844, + "learning_rate": 5.4478842528457565e-06, + "loss": 0.3709, + "step": 6254 + }, + { + "epoch": 1.5731891348088531, + "grad_norm": 0.329878568649292, + "learning_rate": 5.4464268684432664e-06, + "loss": 0.3329, + "step": 6255 + }, + { + "epoch": 1.573440643863179, + "grad_norm": 0.30290189385414124, + "learning_rate": 5.444969445807109e-06, + "loss": 0.3458, + "step": 6256 + }, + { + "epoch": 1.5736921529175052, + "grad_norm": 0.3331491947174072, + "learning_rate": 5.4435119850621e-06, + "loss": 0.3516, + "step": 6257 + }, + { + "epoch": 1.573943661971831, + "grad_norm": 0.2870064675807953, + "learning_rate": 5.442054486333066e-06, + "loss": 0.3636, + "step": 6258 + }, + { + "epoch": 1.5741951710261568, + "grad_norm": 0.3208286762237549, + "learning_rate": 5.440596949744831e-06, + "loss": 0.3486, + "step": 6259 + }, + { + "epoch": 1.574446680080483, + "grad_norm": 0.30544915795326233, + "learning_rate": 5.439139375422223e-06, + "loss": 0.3647, + "step": 6260 + }, + { + "epoch": 1.5746981891348089, + "grad_norm": 0.3159911036491394, + "learning_rate": 5.437681763490075e-06, + "loss": 0.3524, + "step": 6261 + }, + { + "epoch": 1.5749496981891347, + "grad_norm": 0.315897136926651, + "learning_rate": 5.4362241140732215e-06, + "loss": 0.3794, + "step": 6262 + }, + { + "epoch": 1.575201207243461, + "grad_norm": 0.3100701570510864, + "learning_rate": 5.434766427296502e-06, + "loss": 0.3537, + "step": 6263 + }, + { + "epoch": 1.5754527162977867, + "grad_norm": 0.327070951461792, + "learning_rate": 5.433308703284759e-06, + "loss": 0.3974, + "step": 6264 + }, + { + "epoch": 1.5757042253521125, + "grad_norm": 0.31196388602256775, + "learning_rate": 5.431850942162834e-06, + "loss": 0.3373, + "step": 6265 + }, + { + "epoch": 1.5759557344064388, + "grad_norm": 0.28920623660087585, + "learning_rate": 5.430393144055579e-06, + "loss": 0.3308, + "step": 6266 + }, + { + "epoch": 1.5762072434607646, + "grad_norm": 0.30474618077278137, + "learning_rate": 5.428935309087844e-06, + "loss": 0.3323, + "step": 6267 + }, + { + "epoch": 1.5764587525150904, + "grad_norm": 0.3304254412651062, + "learning_rate": 5.427477437384482e-06, + "loss": 0.3647, + "step": 6268 + }, + { + "epoch": 1.5767102615694166, + "grad_norm": 0.28863289952278137, + "learning_rate": 5.426019529070352e-06, + "loss": 0.362, + "step": 6269 + }, + { + "epoch": 1.5769617706237424, + "grad_norm": 0.31353408098220825, + "learning_rate": 5.4245615842703146e-06, + "loss": 0.3637, + "step": 6270 + }, + { + "epoch": 1.5772132796780685, + "grad_norm": 0.3224716782569885, + "learning_rate": 5.4231036031092345e-06, + "loss": 0.3273, + "step": 6271 + }, + { + "epoch": 1.5774647887323945, + "grad_norm": 0.31004685163497925, + "learning_rate": 5.4216455857119765e-06, + "loss": 0.354, + "step": 6272 + }, + { + "epoch": 1.5777162977867203, + "grad_norm": 0.32508277893066406, + "learning_rate": 5.420187532203413e-06, + "loss": 0.3552, + "step": 6273 + }, + { + "epoch": 1.5779678068410463, + "grad_norm": 0.33112356066703796, + "learning_rate": 5.418729442708416e-06, + "loss": 0.3846, + "step": 6274 + }, + { + "epoch": 1.5782193158953723, + "grad_norm": 0.32484641671180725, + "learning_rate": 5.417271317351861e-06, + "loss": 0.3589, + "step": 6275 + }, + { + "epoch": 1.5784708249496981, + "grad_norm": 0.3176564574241638, + "learning_rate": 5.415813156258628e-06, + "loss": 0.3525, + "step": 6276 + }, + { + "epoch": 1.5787223340040242, + "grad_norm": 0.31098705530166626, + "learning_rate": 5.4143549595536e-06, + "loss": 0.3355, + "step": 6277 + }, + { + "epoch": 1.5789738430583502, + "grad_norm": 0.34657394886016846, + "learning_rate": 5.412896727361663e-06, + "loss": 0.3566, + "step": 6278 + }, + { + "epoch": 1.579225352112676, + "grad_norm": 0.31053581833839417, + "learning_rate": 5.411438459807703e-06, + "loss": 0.3741, + "step": 6279 + }, + { + "epoch": 1.579476861167002, + "grad_norm": 0.28460362553596497, + "learning_rate": 5.4099801570166135e-06, + "loss": 0.3299, + "step": 6280 + }, + { + "epoch": 1.579728370221328, + "grad_norm": 0.3039803206920624, + "learning_rate": 5.408521819113287e-06, + "loss": 0.3297, + "step": 6281 + }, + { + "epoch": 1.5799798792756539, + "grad_norm": 0.31220072507858276, + "learning_rate": 5.407063446222623e-06, + "loss": 0.3368, + "step": 6282 + }, + { + "epoch": 1.58023138832998, + "grad_norm": 0.31574392318725586, + "learning_rate": 5.4056050384695225e-06, + "loss": 0.3582, + "step": 6283 + }, + { + "epoch": 1.580482897384306, + "grad_norm": 0.34987935423851013, + "learning_rate": 5.404146595978887e-06, + "loss": 0.3484, + "step": 6284 + }, + { + "epoch": 1.5807344064386317, + "grad_norm": 0.3038938641548157, + "learning_rate": 5.402688118875624e-06, + "loss": 0.3401, + "step": 6285 + }, + { + "epoch": 1.5809859154929577, + "grad_norm": 0.29709485173225403, + "learning_rate": 5.401229607284644e-06, + "loss": 0.3588, + "step": 6286 + }, + { + "epoch": 1.5812374245472838, + "grad_norm": 0.3312198221683502, + "learning_rate": 5.3997710613308565e-06, + "loss": 0.3632, + "step": 6287 + }, + { + "epoch": 1.5814889336016096, + "grad_norm": 0.30179715156555176, + "learning_rate": 5.398312481139179e-06, + "loss": 0.3404, + "step": 6288 + }, + { + "epoch": 1.5817404426559356, + "grad_norm": 0.3145090639591217, + "learning_rate": 5.396853866834529e-06, + "loss": 0.3781, + "step": 6289 + }, + { + "epoch": 1.5819919517102616, + "grad_norm": 0.33599305152893066, + "learning_rate": 5.395395218541829e-06, + "loss": 0.3496, + "step": 6290 + }, + { + "epoch": 1.5822434607645874, + "grad_norm": 0.30032995343208313, + "learning_rate": 5.393936536386001e-06, + "loss": 0.3498, + "step": 6291 + }, + { + "epoch": 1.5824949698189135, + "grad_norm": 0.30552947521209717, + "learning_rate": 5.392477820491974e-06, + "loss": 0.3472, + "step": 6292 + }, + { + "epoch": 1.5827464788732395, + "grad_norm": 0.29774901270866394, + "learning_rate": 5.391019070984676e-06, + "loss": 0.3306, + "step": 6293 + }, + { + "epoch": 1.5829979879275653, + "grad_norm": 0.3048769235610962, + "learning_rate": 5.389560287989043e-06, + "loss": 0.3284, + "step": 6294 + }, + { + "epoch": 1.5832494969818913, + "grad_norm": 0.33023905754089355, + "learning_rate": 5.388101471630006e-06, + "loss": 0.3384, + "step": 6295 + }, + { + "epoch": 1.5835010060362174, + "grad_norm": 0.30348482728004456, + "learning_rate": 5.3866426220325075e-06, + "loss": 0.3713, + "step": 6296 + }, + { + "epoch": 1.5837525150905432, + "grad_norm": 0.3093292713165283, + "learning_rate": 5.385183739321486e-06, + "loss": 0.3527, + "step": 6297 + }, + { + "epoch": 1.5840040241448692, + "grad_norm": 0.30583062767982483, + "learning_rate": 5.383724823621889e-06, + "loss": 0.3345, + "step": 6298 + }, + { + "epoch": 1.5842555331991952, + "grad_norm": 0.33037832379341125, + "learning_rate": 5.38226587505866e-06, + "loss": 0.3787, + "step": 6299 + }, + { + "epoch": 1.584507042253521, + "grad_norm": 0.312497615814209, + "learning_rate": 5.380806893756748e-06, + "loss": 0.3446, + "step": 6300 + }, + { + "epoch": 1.584758551307847, + "grad_norm": 0.3048454225063324, + "learning_rate": 5.3793478798411105e-06, + "loss": 0.3678, + "step": 6301 + }, + { + "epoch": 1.585010060362173, + "grad_norm": 0.290546178817749, + "learning_rate": 5.3778888334367e-06, + "loss": 0.3401, + "step": 6302 + }, + { + "epoch": 1.5852615694164989, + "grad_norm": 0.3122500479221344, + "learning_rate": 5.376429754668475e-06, + "loss": 0.3374, + "step": 6303 + }, + { + "epoch": 1.585513078470825, + "grad_norm": 0.3183901011943817, + "learning_rate": 5.374970643661397e-06, + "loss": 0.3668, + "step": 6304 + }, + { + "epoch": 1.585764587525151, + "grad_norm": 0.3228647708892822, + "learning_rate": 5.373511500540428e-06, + "loss": 0.3438, + "step": 6305 + }, + { + "epoch": 1.5860160965794767, + "grad_norm": 0.3075144290924072, + "learning_rate": 5.372052325430537e-06, + "loss": 0.3626, + "step": 6306 + }, + { + "epoch": 1.586267605633803, + "grad_norm": 0.3228583335876465, + "learning_rate": 5.37059311845669e-06, + "loss": 0.3396, + "step": 6307 + }, + { + "epoch": 1.5865191146881288, + "grad_norm": 0.33655041456222534, + "learning_rate": 5.3691338797438615e-06, + "loss": 0.3342, + "step": 6308 + }, + { + "epoch": 1.5867706237424546, + "grad_norm": 0.3635300397872925, + "learning_rate": 5.3676746094170265e-06, + "loss": 0.355, + "step": 6309 + }, + { + "epoch": 1.5870221327967808, + "grad_norm": 0.32811570167541504, + "learning_rate": 5.3662153076011614e-06, + "loss": 0.3313, + "step": 6310 + }, + { + "epoch": 1.5872736418511066, + "grad_norm": 0.3333826959133148, + "learning_rate": 5.364755974421244e-06, + "loss": 0.342, + "step": 6311 + }, + { + "epoch": 1.5875251509054324, + "grad_norm": 0.34751078486442566, + "learning_rate": 5.363296610002261e-06, + "loss": 0.348, + "step": 6312 + }, + { + "epoch": 1.5877766599597587, + "grad_norm": 0.3453575372695923, + "learning_rate": 5.361837214469197e-06, + "loss": 0.349, + "step": 6313 + }, + { + "epoch": 1.5880281690140845, + "grad_norm": 0.3269634544849396, + "learning_rate": 5.360377787947037e-06, + "loss": 0.3284, + "step": 6314 + }, + { + "epoch": 1.5882796780684103, + "grad_norm": 0.3597438335418701, + "learning_rate": 5.358918330560776e-06, + "loss": 0.3663, + "step": 6315 + }, + { + "epoch": 1.5885311871227366, + "grad_norm": 0.3321945369243622, + "learning_rate": 5.357458842435405e-06, + "loss": 0.3514, + "step": 6316 + }, + { + "epoch": 1.5887826961770624, + "grad_norm": 0.33890068531036377, + "learning_rate": 5.35599932369592e-06, + "loss": 0.3409, + "step": 6317 + }, + { + "epoch": 1.5890342052313882, + "grad_norm": 0.33739712834358215, + "learning_rate": 5.35453977446732e-06, + "loss": 0.3655, + "step": 6318 + }, + { + "epoch": 1.5892857142857144, + "grad_norm": 0.3216194808483124, + "learning_rate": 5.353080194874606e-06, + "loss": 0.325, + "step": 6319 + }, + { + "epoch": 1.5895372233400402, + "grad_norm": 0.2983427941799164, + "learning_rate": 5.351620585042783e-06, + "loss": 0.354, + "step": 6320 + }, + { + "epoch": 1.5897887323943662, + "grad_norm": 0.3386406898498535, + "learning_rate": 5.350160945096856e-06, + "loss": 0.3736, + "step": 6321 + }, + { + "epoch": 1.5900402414486923, + "grad_norm": 0.3367750346660614, + "learning_rate": 5.348701275161834e-06, + "loss": 0.3222, + "step": 6322 + }, + { + "epoch": 1.590291750503018, + "grad_norm": 0.3235786259174347, + "learning_rate": 5.347241575362729e-06, + "loss": 0.3701, + "step": 6323 + }, + { + "epoch": 1.590543259557344, + "grad_norm": 0.29877564311027527, + "learning_rate": 5.345781845824557e-06, + "loss": 0.3367, + "step": 6324 + }, + { + "epoch": 1.5907947686116701, + "grad_norm": 0.30587801337242126, + "learning_rate": 5.344322086672332e-06, + "loss": 0.3363, + "step": 6325 + }, + { + "epoch": 1.591046277665996, + "grad_norm": 0.31401360034942627, + "learning_rate": 5.3428622980310755e-06, + "loss": 0.3309, + "step": 6326 + }, + { + "epoch": 1.591297786720322, + "grad_norm": 0.2830161452293396, + "learning_rate": 5.341402480025808e-06, + "loss": 0.3416, + "step": 6327 + }, + { + "epoch": 1.591549295774648, + "grad_norm": 0.3165720999240875, + "learning_rate": 5.339942632781553e-06, + "loss": 0.3532, + "step": 6328 + }, + { + "epoch": 1.5918008048289738, + "grad_norm": 0.3214383125305176, + "learning_rate": 5.338482756423339e-06, + "loss": 0.3412, + "step": 6329 + }, + { + "epoch": 1.5920523138832998, + "grad_norm": 0.2742040455341339, + "learning_rate": 5.337022851076193e-06, + "loss": 0.3516, + "step": 6330 + }, + { + "epoch": 1.5923038229376258, + "grad_norm": 0.2959214150905609, + "learning_rate": 5.33556291686515e-06, + "loss": 0.3405, + "step": 6331 + }, + { + "epoch": 1.5925553319919517, + "grad_norm": 0.28972509503364563, + "learning_rate": 5.334102953915242e-06, + "loss": 0.3324, + "step": 6332 + }, + { + "epoch": 1.5928068410462777, + "grad_norm": 0.34085187315940857, + "learning_rate": 5.332642962351505e-06, + "loss": 0.3419, + "step": 6333 + }, + { + "epoch": 1.5930583501006037, + "grad_norm": 0.32344910502433777, + "learning_rate": 5.331182942298981e-06, + "loss": 0.3387, + "step": 6334 + }, + { + "epoch": 1.5933098591549295, + "grad_norm": 0.3067835867404938, + "learning_rate": 5.329722893882708e-06, + "loss": 0.3325, + "step": 6335 + }, + { + "epoch": 1.5935613682092555, + "grad_norm": 0.31385210156440735, + "learning_rate": 5.328262817227733e-06, + "loss": 0.3588, + "step": 6336 + }, + { + "epoch": 1.5938128772635816, + "grad_norm": 0.30561771988868713, + "learning_rate": 5.326802712459101e-06, + "loss": 0.3438, + "step": 6337 + }, + { + "epoch": 1.5940643863179074, + "grad_norm": 0.30326759815216064, + "learning_rate": 5.325342579701862e-06, + "loss": 0.3669, + "step": 6338 + }, + { + "epoch": 1.5943158953722334, + "grad_norm": 0.31566211581230164, + "learning_rate": 5.323882419081066e-06, + "loss": 0.3275, + "step": 6339 + }, + { + "epoch": 1.5945674044265594, + "grad_norm": 0.32427820563316345, + "learning_rate": 5.3224222307217665e-06, + "loss": 0.3351, + "step": 6340 + }, + { + "epoch": 1.5948189134808852, + "grad_norm": 0.33437636494636536, + "learning_rate": 5.32096201474902e-06, + "loss": 0.3573, + "step": 6341 + }, + { + "epoch": 1.5950704225352113, + "grad_norm": 0.3199196457862854, + "learning_rate": 5.319501771287885e-06, + "loss": 0.3591, + "step": 6342 + }, + { + "epoch": 1.5953219315895373, + "grad_norm": 0.332581102848053, + "learning_rate": 5.318041500463423e-06, + "loss": 0.3374, + "step": 6343 + }, + { + "epoch": 1.595573440643863, + "grad_norm": 0.3242298662662506, + "learning_rate": 5.316581202400694e-06, + "loss": 0.3266, + "step": 6344 + }, + { + "epoch": 1.595824949698189, + "grad_norm": 0.3219207525253296, + "learning_rate": 5.315120877224767e-06, + "loss": 0.3477, + "step": 6345 + }, + { + "epoch": 1.5960764587525151, + "grad_norm": 0.3285122215747833, + "learning_rate": 5.313660525060709e-06, + "loss": 0.3691, + "step": 6346 + }, + { + "epoch": 1.596327967806841, + "grad_norm": 0.32978108525276184, + "learning_rate": 5.312200146033588e-06, + "loss": 0.3434, + "step": 6347 + }, + { + "epoch": 1.596579476861167, + "grad_norm": 0.3488015830516815, + "learning_rate": 5.310739740268478e-06, + "loss": 0.3484, + "step": 6348 + }, + { + "epoch": 1.596830985915493, + "grad_norm": 0.31981801986694336, + "learning_rate": 5.309279307890453e-06, + "loss": 0.3259, + "step": 6349 + }, + { + "epoch": 1.5970824949698188, + "grad_norm": 0.31969207525253296, + "learning_rate": 5.3078188490245905e-06, + "loss": 0.3416, + "step": 6350 + }, + { + "epoch": 1.5973340040241448, + "grad_norm": 0.3058636784553528, + "learning_rate": 5.30635836379597e-06, + "loss": 0.3111, + "step": 6351 + }, + { + "epoch": 1.5975855130784709, + "grad_norm": 0.3224903345108032, + "learning_rate": 5.304897852329671e-06, + "loss": 0.3454, + "step": 6352 + }, + { + "epoch": 1.5978370221327967, + "grad_norm": 0.33611172437667847, + "learning_rate": 5.303437314750779e-06, + "loss": 0.3447, + "step": 6353 + }, + { + "epoch": 1.5980885311871227, + "grad_norm": 0.3095466196537018, + "learning_rate": 5.301976751184379e-06, + "loss": 0.3523, + "step": 6354 + }, + { + "epoch": 1.5983400402414487, + "grad_norm": 0.3098112642765045, + "learning_rate": 5.300516161755559e-06, + "loss": 0.3419, + "step": 6355 + }, + { + "epoch": 1.5985915492957745, + "grad_norm": 0.3331829309463501, + "learning_rate": 5.29905554658941e-06, + "loss": 0.3652, + "step": 6356 + }, + { + "epoch": 1.5988430583501008, + "grad_norm": 0.30637791752815247, + "learning_rate": 5.297594905811024e-06, + "loss": 0.3591, + "step": 6357 + }, + { + "epoch": 1.5990945674044266, + "grad_norm": 0.3300563395023346, + "learning_rate": 5.296134239545497e-06, + "loss": 0.3505, + "step": 6358 + }, + { + "epoch": 1.5993460764587524, + "grad_norm": 0.34214162826538086, + "learning_rate": 5.294673547917925e-06, + "loss": 0.3663, + "step": 6359 + }, + { + "epoch": 1.5995975855130786, + "grad_norm": 0.3289168179035187, + "learning_rate": 5.293212831053407e-06, + "loss": 0.3356, + "step": 6360 + }, + { + "epoch": 1.5998490945674044, + "grad_norm": 0.3313751816749573, + "learning_rate": 5.291752089077044e-06, + "loss": 0.3586, + "step": 6361 + }, + { + "epoch": 1.6001006036217302, + "grad_norm": 0.32884520292282104, + "learning_rate": 5.29029132211394e-06, + "loss": 0.3481, + "step": 6362 + }, + { + "epoch": 1.6003521126760565, + "grad_norm": 0.3014932870864868, + "learning_rate": 5.2888305302891996e-06, + "loss": 0.3677, + "step": 6363 + }, + { + "epoch": 1.6006036217303823, + "grad_norm": 0.3143293857574463, + "learning_rate": 5.287369713727933e-06, + "loss": 0.3352, + "step": 6364 + }, + { + "epoch": 1.600855130784708, + "grad_norm": 0.30233773589134216, + "learning_rate": 5.285908872555247e-06, + "loss": 0.3466, + "step": 6365 + }, + { + "epoch": 1.6011066398390343, + "grad_norm": 0.3170585036277771, + "learning_rate": 5.284448006896252e-06, + "loss": 0.3253, + "step": 6366 + }, + { + "epoch": 1.6013581488933601, + "grad_norm": 0.3214609920978546, + "learning_rate": 5.282987116876068e-06, + "loss": 0.3341, + "step": 6367 + }, + { + "epoch": 1.6016096579476862, + "grad_norm": 0.3339797854423523, + "learning_rate": 5.281526202619808e-06, + "loss": 0.3594, + "step": 6368 + }, + { + "epoch": 1.6018611670020122, + "grad_norm": 0.30358412861824036, + "learning_rate": 5.2800652642525885e-06, + "loss": 0.3265, + "step": 6369 + }, + { + "epoch": 1.602112676056338, + "grad_norm": 0.3258301615715027, + "learning_rate": 5.278604301899531e-06, + "loss": 0.335, + "step": 6370 + }, + { + "epoch": 1.602364185110664, + "grad_norm": 0.32706478238105774, + "learning_rate": 5.27714331568576e-06, + "loss": 0.3561, + "step": 6371 + }, + { + "epoch": 1.60261569416499, + "grad_norm": 0.3139190077781677, + "learning_rate": 5.275682305736396e-06, + "loss": 0.3447, + "step": 6372 + }, + { + "epoch": 1.6028672032193159, + "grad_norm": 0.30157265067100525, + "learning_rate": 5.274221272176569e-06, + "loss": 0.3914, + "step": 6373 + }, + { + "epoch": 1.6031187122736419, + "grad_norm": 0.311866819858551, + "learning_rate": 5.2727602151314035e-06, + "loss": 0.3594, + "step": 6374 + }, + { + "epoch": 1.603370221327968, + "grad_norm": 0.29979294538497925, + "learning_rate": 5.271299134726034e-06, + "loss": 0.3382, + "step": 6375 + }, + { + "epoch": 1.6036217303822937, + "grad_norm": 0.3057425320148468, + "learning_rate": 5.269838031085588e-06, + "loss": 0.3378, + "step": 6376 + }, + { + "epoch": 1.6038732394366197, + "grad_norm": 0.33399319648742676, + "learning_rate": 5.268376904335204e-06, + "loss": 0.3625, + "step": 6377 + }, + { + "epoch": 1.6041247484909458, + "grad_norm": 0.31966760754585266, + "learning_rate": 5.266915754600018e-06, + "loss": 0.3683, + "step": 6378 + }, + { + "epoch": 1.6043762575452716, + "grad_norm": 0.3125464916229248, + "learning_rate": 5.265454582005167e-06, + "loss": 0.3498, + "step": 6379 + }, + { + "epoch": 1.6046277665995976, + "grad_norm": 0.30603522062301636, + "learning_rate": 5.263993386675792e-06, + "loss": 0.3682, + "step": 6380 + }, + { + "epoch": 1.6048792756539236, + "grad_norm": 0.3024091422557831, + "learning_rate": 5.2625321687370345e-06, + "loss": 0.3384, + "step": 6381 + }, + { + "epoch": 1.6051307847082494, + "grad_norm": 0.2936726212501526, + "learning_rate": 5.261070928314039e-06, + "loss": 0.373, + "step": 6382 + }, + { + "epoch": 1.6053822937625755, + "grad_norm": 0.31139662861824036, + "learning_rate": 5.259609665531951e-06, + "loss": 0.3445, + "step": 6383 + }, + { + "epoch": 1.6056338028169015, + "grad_norm": 0.30623286962509155, + "learning_rate": 5.258148380515922e-06, + "loss": 0.3562, + "step": 6384 + }, + { + "epoch": 1.6058853118712273, + "grad_norm": 0.3077852129936218, + "learning_rate": 5.256687073391097e-06, + "loss": 0.3313, + "step": 6385 + }, + { + "epoch": 1.6061368209255533, + "grad_norm": 0.28741592168807983, + "learning_rate": 5.255225744282631e-06, + "loss": 0.354, + "step": 6386 + }, + { + "epoch": 1.6063883299798793, + "grad_norm": 0.32665884494781494, + "learning_rate": 5.253764393315674e-06, + "loss": 0.3458, + "step": 6387 + }, + { + "epoch": 1.6066398390342052, + "grad_norm": 0.31474220752716064, + "learning_rate": 5.252303020615387e-06, + "loss": 0.3425, + "step": 6388 + }, + { + "epoch": 1.6068913480885312, + "grad_norm": 0.3298763930797577, + "learning_rate": 5.250841626306924e-06, + "loss": 0.3634, + "step": 6389 + }, + { + "epoch": 1.6071428571428572, + "grad_norm": 0.2954833209514618, + "learning_rate": 5.249380210515446e-06, + "loss": 0.3702, + "step": 6390 + }, + { + "epoch": 1.607394366197183, + "grad_norm": 0.3127298057079315, + "learning_rate": 5.247918773366112e-06, + "loss": 0.348, + "step": 6391 + }, + { + "epoch": 1.607645875251509, + "grad_norm": 0.2956436574459076, + "learning_rate": 5.246457314984086e-06, + "loss": 0.3504, + "step": 6392 + }, + { + "epoch": 1.607897384305835, + "grad_norm": 0.35721251368522644, + "learning_rate": 5.2449958354945326e-06, + "loss": 0.3357, + "step": 6393 + }, + { + "epoch": 1.6081488933601609, + "grad_norm": 0.31743982434272766, + "learning_rate": 5.24353433502262e-06, + "loss": 0.3412, + "step": 6394 + }, + { + "epoch": 1.608400402414487, + "grad_norm": 0.31196045875549316, + "learning_rate": 5.242072813693514e-06, + "loss": 0.3253, + "step": 6395 + }, + { + "epoch": 1.608651911468813, + "grad_norm": 0.29665273427963257, + "learning_rate": 5.240611271632386e-06, + "loss": 0.3622, + "step": 6396 + }, + { + "epoch": 1.6089034205231387, + "grad_norm": 0.36455845832824707, + "learning_rate": 5.239149708964409e-06, + "loss": 0.3435, + "step": 6397 + }, + { + "epoch": 1.6091549295774648, + "grad_norm": 0.3024010956287384, + "learning_rate": 5.237688125814752e-06, + "loss": 0.3549, + "step": 6398 + }, + { + "epoch": 1.6094064386317908, + "grad_norm": 0.3012137711048126, + "learning_rate": 5.236226522308596e-06, + "loss": 0.3332, + "step": 6399 + }, + { + "epoch": 1.6096579476861166, + "grad_norm": 0.31425225734710693, + "learning_rate": 5.234764898571118e-06, + "loss": 0.3436, + "step": 6400 + }, + { + "epoch": 1.6099094567404426, + "grad_norm": 0.3358578383922577, + "learning_rate": 5.233303254727493e-06, + "loss": 0.3441, + "step": 6401 + }, + { + "epoch": 1.6101609657947686, + "grad_norm": 0.3274340033531189, + "learning_rate": 5.231841590902905e-06, + "loss": 0.3244, + "step": 6402 + }, + { + "epoch": 1.6104124748490944, + "grad_norm": 0.3106974959373474, + "learning_rate": 5.230379907222535e-06, + "loss": 0.3764, + "step": 6403 + }, + { + "epoch": 1.6106639839034205, + "grad_norm": 0.2865849435329437, + "learning_rate": 5.228918203811566e-06, + "loss": 0.3393, + "step": 6404 + }, + { + "epoch": 1.6109154929577465, + "grad_norm": 0.31629478931427, + "learning_rate": 5.227456480795187e-06, + "loss": 0.3581, + "step": 6405 + }, + { + "epoch": 1.6111670020120723, + "grad_norm": 0.3469734787940979, + "learning_rate": 5.225994738298582e-06, + "loss": 0.3523, + "step": 6406 + }, + { + "epoch": 1.6114185110663986, + "grad_norm": 0.31348875164985657, + "learning_rate": 5.224532976446941e-06, + "loss": 0.3381, + "step": 6407 + }, + { + "epoch": 1.6116700201207244, + "grad_norm": 0.29473093152046204, + "learning_rate": 5.223071195365456e-06, + "loss": 0.3483, + "step": 6408 + }, + { + "epoch": 1.6119215291750502, + "grad_norm": 0.3042793869972229, + "learning_rate": 5.221609395179319e-06, + "loss": 0.3646, + "step": 6409 + }, + { + "epoch": 1.6121730382293764, + "grad_norm": 0.3180801272392273, + "learning_rate": 5.220147576013724e-06, + "loss": 0.3661, + "step": 6410 + }, + { + "epoch": 1.6124245472837022, + "grad_norm": 0.318566232919693, + "learning_rate": 5.218685737993865e-06, + "loss": 0.3538, + "step": 6411 + }, + { + "epoch": 1.612676056338028, + "grad_norm": 0.3094564974308014, + "learning_rate": 5.217223881244942e-06, + "loss": 0.3289, + "step": 6412 + }, + { + "epoch": 1.6129275653923543, + "grad_norm": 0.3494105041027069, + "learning_rate": 5.215762005892151e-06, + "loss": 0.3474, + "step": 6413 + }, + { + "epoch": 1.61317907444668, + "grad_norm": 0.312987357378006, + "learning_rate": 5.214300112060695e-06, + "loss": 0.3601, + "step": 6414 + }, + { + "epoch": 1.6134305835010059, + "grad_norm": 0.30890733003616333, + "learning_rate": 5.212838199875775e-06, + "loss": 0.3478, + "step": 6415 + }, + { + "epoch": 1.6136820925553321, + "grad_norm": 0.3125660717487335, + "learning_rate": 5.211376269462594e-06, + "loss": 0.3657, + "step": 6416 + }, + { + "epoch": 1.613933601609658, + "grad_norm": 0.32715532183647156, + "learning_rate": 5.209914320946359e-06, + "loss": 0.3662, + "step": 6417 + }, + { + "epoch": 1.614185110663984, + "grad_norm": 0.3133423626422882, + "learning_rate": 5.208452354452275e-06, + "loss": 0.3472, + "step": 6418 + }, + { + "epoch": 1.61443661971831, + "grad_norm": 0.3222804069519043, + "learning_rate": 5.20699037010555e-06, + "loss": 0.3355, + "step": 6419 + }, + { + "epoch": 1.6146881287726358, + "grad_norm": 0.3086826205253601, + "learning_rate": 5.205528368031395e-06, + "loss": 0.3552, + "step": 6420 + }, + { + "epoch": 1.6149396378269618, + "grad_norm": 0.3171122968196869, + "learning_rate": 5.204066348355022e-06, + "loss": 0.3557, + "step": 6421 + }, + { + "epoch": 1.6151911468812878, + "grad_norm": 0.31034934520721436, + "learning_rate": 5.202604311201642e-06, + "loss": 0.3478, + "step": 6422 + }, + { + "epoch": 1.6154426559356136, + "grad_norm": 0.29711443185806274, + "learning_rate": 5.201142256696472e-06, + "loss": 0.349, + "step": 6423 + }, + { + "epoch": 1.6156941649899397, + "grad_norm": 0.3413085639476776, + "learning_rate": 5.199680184964725e-06, + "loss": 0.3472, + "step": 6424 + }, + { + "epoch": 1.6159456740442657, + "grad_norm": 0.2911038100719452, + "learning_rate": 5.198218096131619e-06, + "loss": 0.342, + "step": 6425 + }, + { + "epoch": 1.6161971830985915, + "grad_norm": 0.291364461183548, + "learning_rate": 5.196755990322373e-06, + "loss": 0.3439, + "step": 6426 + }, + { + "epoch": 1.6164486921529175, + "grad_norm": 0.3112189769744873, + "learning_rate": 5.195293867662208e-06, + "loss": 0.3266, + "step": 6427 + }, + { + "epoch": 1.6167002012072436, + "grad_norm": 0.33983948826789856, + "learning_rate": 5.193831728276345e-06, + "loss": 0.3386, + "step": 6428 + }, + { + "epoch": 1.6169517102615694, + "grad_norm": 0.3202919363975525, + "learning_rate": 5.192369572290007e-06, + "loss": 0.3472, + "step": 6429 + }, + { + "epoch": 1.6172032193158954, + "grad_norm": 0.31747010350227356, + "learning_rate": 5.190907399828418e-06, + "loss": 0.3429, + "step": 6430 + }, + { + "epoch": 1.6174547283702214, + "grad_norm": 0.3446648120880127, + "learning_rate": 5.189445211016804e-06, + "loss": 0.3562, + "step": 6431 + }, + { + "epoch": 1.6177062374245472, + "grad_norm": 0.3192874491214752, + "learning_rate": 5.187983005980393e-06, + "loss": 0.3591, + "step": 6432 + }, + { + "epoch": 1.6179577464788732, + "grad_norm": 0.3468160331249237, + "learning_rate": 5.186520784844416e-06, + "loss": 0.3513, + "step": 6433 + }, + { + "epoch": 1.6182092555331993, + "grad_norm": 0.32596200704574585, + "learning_rate": 5.185058547734098e-06, + "loss": 0.3382, + "step": 6434 + }, + { + "epoch": 1.618460764587525, + "grad_norm": 0.2944319546222687, + "learning_rate": 5.1835962947746744e-06, + "loss": 0.3301, + "step": 6435 + }, + { + "epoch": 1.618712273641851, + "grad_norm": 0.32907602190971375, + "learning_rate": 5.1821340260913765e-06, + "loss": 0.3355, + "step": 6436 + }, + { + "epoch": 1.6189637826961771, + "grad_norm": 0.3250274956226349, + "learning_rate": 5.180671741809439e-06, + "loss": 0.3566, + "step": 6437 + }, + { + "epoch": 1.619215291750503, + "grad_norm": 0.3051334321498871, + "learning_rate": 5.179209442054096e-06, + "loss": 0.3507, + "step": 6438 + }, + { + "epoch": 1.619466800804829, + "grad_norm": 0.3153745234012604, + "learning_rate": 5.177747126950587e-06, + "loss": 0.3558, + "step": 6439 + }, + { + "epoch": 1.619718309859155, + "grad_norm": 0.31299081444740295, + "learning_rate": 5.176284796624147e-06, + "loss": 0.3314, + "step": 6440 + }, + { + "epoch": 1.6199698189134808, + "grad_norm": 0.29618367552757263, + "learning_rate": 5.174822451200018e-06, + "loss": 0.3563, + "step": 6441 + }, + { + "epoch": 1.6202213279678068, + "grad_norm": 0.3016599118709564, + "learning_rate": 5.173360090803437e-06, + "loss": 0.3545, + "step": 6442 + }, + { + "epoch": 1.6204728370221329, + "grad_norm": 0.30213463306427, + "learning_rate": 5.1718977155596515e-06, + "loss": 0.348, + "step": 6443 + }, + { + "epoch": 1.6207243460764587, + "grad_norm": 0.3281251788139343, + "learning_rate": 5.170435325593902e-06, + "loss": 0.3423, + "step": 6444 + }, + { + "epoch": 1.6209758551307847, + "grad_norm": 0.33480846881866455, + "learning_rate": 5.168972921031433e-06, + "loss": 0.3396, + "step": 6445 + }, + { + "epoch": 1.6212273641851107, + "grad_norm": 0.32940176129341125, + "learning_rate": 5.1675105019974905e-06, + "loss": 0.3567, + "step": 6446 + }, + { + "epoch": 1.6214788732394365, + "grad_norm": 0.3426181674003601, + "learning_rate": 5.166048068617321e-06, + "loss": 0.3681, + "step": 6447 + }, + { + "epoch": 1.6217303822937625, + "grad_norm": 0.3707010746002197, + "learning_rate": 5.164585621016174e-06, + "loss": 0.3475, + "step": 6448 + }, + { + "epoch": 1.6219818913480886, + "grad_norm": 0.32672780752182007, + "learning_rate": 5.163123159319298e-06, + "loss": 0.3372, + "step": 6449 + }, + { + "epoch": 1.6222334004024144, + "grad_norm": 0.32823729515075684, + "learning_rate": 5.161660683651943e-06, + "loss": 0.3486, + "step": 6450 + }, + { + "epoch": 1.6224849094567404, + "grad_norm": 0.3079800009727478, + "learning_rate": 5.160198194139362e-06, + "loss": 0.3535, + "step": 6451 + }, + { + "epoch": 1.6227364185110664, + "grad_norm": 0.34681087732315063, + "learning_rate": 5.158735690906808e-06, + "loss": 0.3498, + "step": 6452 + }, + { + "epoch": 1.6229879275653922, + "grad_norm": 0.33672547340393066, + "learning_rate": 5.157273174079535e-06, + "loss": 0.3343, + "step": 6453 + }, + { + "epoch": 1.6232394366197183, + "grad_norm": 0.34650522470474243, + "learning_rate": 5.155810643782798e-06, + "loss": 0.3391, + "step": 6454 + }, + { + "epoch": 1.6234909456740443, + "grad_norm": 0.3248770236968994, + "learning_rate": 5.154348100141855e-06, + "loss": 0.3862, + "step": 6455 + }, + { + "epoch": 1.62374245472837, + "grad_norm": 0.3029992878437042, + "learning_rate": 5.152885543281964e-06, + "loss": 0.3286, + "step": 6456 + }, + { + "epoch": 1.6239939637826963, + "grad_norm": 0.32535600662231445, + "learning_rate": 5.151422973328381e-06, + "loss": 0.3105, + "step": 6457 + }, + { + "epoch": 1.6242454728370221, + "grad_norm": 0.3738095164299011, + "learning_rate": 5.149960390406368e-06, + "loss": 0.3534, + "step": 6458 + }, + { + "epoch": 1.624496981891348, + "grad_norm": 0.31076717376708984, + "learning_rate": 5.1484977946411855e-06, + "loss": 0.3666, + "step": 6459 + }, + { + "epoch": 1.6247484909456742, + "grad_norm": 0.32721608877182007, + "learning_rate": 5.147035186158096e-06, + "loss": 0.3678, + "step": 6460 + }, + { + "epoch": 1.625, + "grad_norm": 0.3344939947128296, + "learning_rate": 5.145572565082363e-06, + "loss": 0.3726, + "step": 6461 + }, + { + "epoch": 1.6252515090543258, + "grad_norm": 0.29688549041748047, + "learning_rate": 5.144109931539251e-06, + "loss": 0.3751, + "step": 6462 + }, + { + "epoch": 1.625503018108652, + "grad_norm": 0.29007625579833984, + "learning_rate": 5.142647285654023e-06, + "loss": 0.3259, + "step": 6463 + }, + { + "epoch": 1.6257545271629779, + "grad_norm": 0.3192233443260193, + "learning_rate": 5.14118462755195e-06, + "loss": 0.378, + "step": 6464 + }, + { + "epoch": 1.6260060362173037, + "grad_norm": 0.3485788106918335, + "learning_rate": 5.139721957358295e-06, + "loss": 0.3603, + "step": 6465 + }, + { + "epoch": 1.62625754527163, + "grad_norm": 0.30432939529418945, + "learning_rate": 5.138259275198329e-06, + "loss": 0.3284, + "step": 6466 + }, + { + "epoch": 1.6265090543259557, + "grad_norm": 0.3238847851753235, + "learning_rate": 5.1367965811973204e-06, + "loss": 0.3459, + "step": 6467 + }, + { + "epoch": 1.6267605633802817, + "grad_norm": 0.2998338043689728, + "learning_rate": 5.135333875480541e-06, + "loss": 0.3437, + "step": 6468 + }, + { + "epoch": 1.6270120724346078, + "grad_norm": 0.3004499673843384, + "learning_rate": 5.133871158173262e-06, + "loss": 0.3473, + "step": 6469 + }, + { + "epoch": 1.6272635814889336, + "grad_norm": 0.3121063709259033, + "learning_rate": 5.132408429400755e-06, + "loss": 0.3421, + "step": 6470 + }, + { + "epoch": 1.6275150905432596, + "grad_norm": 0.3265574276447296, + "learning_rate": 5.130945689288295e-06, + "loss": 0.3465, + "step": 6471 + }, + { + "epoch": 1.6277665995975856, + "grad_norm": 0.32837405800819397, + "learning_rate": 5.129482937961155e-06, + "loss": 0.3612, + "step": 6472 + }, + { + "epoch": 1.6280181086519114, + "grad_norm": 0.3113304078578949, + "learning_rate": 5.128020175544612e-06, + "loss": 0.3388, + "step": 6473 + }, + { + "epoch": 1.6282696177062375, + "grad_norm": 0.32058820128440857, + "learning_rate": 5.126557402163943e-06, + "loss": 0.3485, + "step": 6474 + }, + { + "epoch": 1.6285211267605635, + "grad_norm": 0.34431353211402893, + "learning_rate": 5.125094617944424e-06, + "loss": 0.3268, + "step": 6475 + }, + { + "epoch": 1.6287726358148893, + "grad_norm": 0.3116149604320526, + "learning_rate": 5.123631823011333e-06, + "loss": 0.3588, + "step": 6476 + }, + { + "epoch": 1.6290241448692153, + "grad_norm": 0.2909471392631531, + "learning_rate": 5.122169017489949e-06, + "loss": 0.3541, + "step": 6477 + }, + { + "epoch": 1.6292756539235413, + "grad_norm": 0.35828930139541626, + "learning_rate": 5.120706201505554e-06, + "loss": 0.3597, + "step": 6478 + }, + { + "epoch": 1.6295271629778671, + "grad_norm": 0.28314337134361267, + "learning_rate": 5.119243375183427e-06, + "loss": 0.3599, + "step": 6479 + }, + { + "epoch": 1.6297786720321932, + "grad_norm": 0.2970763146877289, + "learning_rate": 5.1177805386488525e-06, + "loss": 0.3418, + "step": 6480 + }, + { + "epoch": 1.6300301810865192, + "grad_norm": 0.3072241246700287, + "learning_rate": 5.116317692027111e-06, + "loss": 0.3745, + "step": 6481 + }, + { + "epoch": 1.630281690140845, + "grad_norm": 0.32199907302856445, + "learning_rate": 5.114854835443486e-06, + "loss": 0.355, + "step": 6482 + }, + { + "epoch": 1.630533199195171, + "grad_norm": 0.328183650970459, + "learning_rate": 5.113391969023264e-06, + "loss": 0.358, + "step": 6483 + }, + { + "epoch": 1.630784708249497, + "grad_norm": 0.3165997266769409, + "learning_rate": 5.1119290928917285e-06, + "loss": 0.3576, + "step": 6484 + }, + { + "epoch": 1.6310362173038229, + "grad_norm": 0.3254222273826599, + "learning_rate": 5.110466207174165e-06, + "loss": 0.3737, + "step": 6485 + }, + { + "epoch": 1.631287726358149, + "grad_norm": 0.3073231279850006, + "learning_rate": 5.109003311995864e-06, + "loss": 0.3435, + "step": 6486 + }, + { + "epoch": 1.631539235412475, + "grad_norm": 0.3206403851509094, + "learning_rate": 5.107540407482111e-06, + "loss": 0.3568, + "step": 6487 + }, + { + "epoch": 1.6317907444668007, + "grad_norm": 0.29873090982437134, + "learning_rate": 5.106077493758195e-06, + "loss": 0.3167, + "step": 6488 + }, + { + "epoch": 1.6320422535211268, + "grad_norm": 0.2889578342437744, + "learning_rate": 5.104614570949404e-06, + "loss": 0.3574, + "step": 6489 + }, + { + "epoch": 1.6322937625754528, + "grad_norm": 0.3086026608943939, + "learning_rate": 5.1031516391810306e-06, + "loss": 0.3079, + "step": 6490 + }, + { + "epoch": 1.6325452716297786, + "grad_norm": 0.2884649634361267, + "learning_rate": 5.101688698578364e-06, + "loss": 0.3466, + "step": 6491 + }, + { + "epoch": 1.6327967806841046, + "grad_norm": 0.31144869327545166, + "learning_rate": 5.100225749266698e-06, + "loss": 0.3572, + "step": 6492 + }, + { + "epoch": 1.6330482897384306, + "grad_norm": 0.3050963282585144, + "learning_rate": 5.098762791371322e-06, + "loss": 0.3376, + "step": 6493 + }, + { + "epoch": 1.6332997987927564, + "grad_norm": 0.2924579083919525, + "learning_rate": 5.097299825017532e-06, + "loss": 0.3409, + "step": 6494 + }, + { + "epoch": 1.6335513078470825, + "grad_norm": 0.31079205870628357, + "learning_rate": 5.09583685033062e-06, + "loss": 0.3827, + "step": 6495 + }, + { + "epoch": 1.6338028169014085, + "grad_norm": 0.30637675523757935, + "learning_rate": 5.09437386743588e-06, + "loss": 0.3354, + "step": 6496 + }, + { + "epoch": 1.6340543259557343, + "grad_norm": 0.29085665941238403, + "learning_rate": 5.09291087645861e-06, + "loss": 0.349, + "step": 6497 + }, + { + "epoch": 1.6343058350100603, + "grad_norm": 0.2851710915565491, + "learning_rate": 5.091447877524105e-06, + "loss": 0.3623, + "step": 6498 + }, + { + "epoch": 1.6345573440643864, + "grad_norm": 0.29473739862442017, + "learning_rate": 5.089984870757661e-06, + "loss": 0.3461, + "step": 6499 + }, + { + "epoch": 1.6348088531187122, + "grad_norm": 0.3163786828517914, + "learning_rate": 5.088521856284576e-06, + "loss": 0.3542, + "step": 6500 + }, + { + "epoch": 1.6350603621730382, + "grad_norm": 0.3166099786758423, + "learning_rate": 5.087058834230148e-06, + "loss": 0.3335, + "step": 6501 + }, + { + "epoch": 1.6353118712273642, + "grad_norm": 0.3093219995498657, + "learning_rate": 5.0855958047196744e-06, + "loss": 0.3258, + "step": 6502 + }, + { + "epoch": 1.63556338028169, + "grad_norm": 0.3152346611022949, + "learning_rate": 5.084132767878457e-06, + "loss": 0.3328, + "step": 6503 + }, + { + "epoch": 1.635814889336016, + "grad_norm": 0.2982290983200073, + "learning_rate": 5.082669723831793e-06, + "loss": 0.35, + "step": 6504 + }, + { + "epoch": 1.636066398390342, + "grad_norm": 0.3094666004180908, + "learning_rate": 5.081206672704986e-06, + "loss": 0.3647, + "step": 6505 + }, + { + "epoch": 1.6363179074446679, + "grad_norm": 0.31432268023490906, + "learning_rate": 5.079743614623334e-06, + "loss": 0.3449, + "step": 6506 + }, + { + "epoch": 1.6365694164989941, + "grad_norm": 0.32027164101600647, + "learning_rate": 5.07828054971214e-06, + "loss": 0.3704, + "step": 6507 + }, + { + "epoch": 1.63682092555332, + "grad_norm": 0.30292704701423645, + "learning_rate": 5.076817478096707e-06, + "loss": 0.3522, + "step": 6508 + }, + { + "epoch": 1.6370724346076457, + "grad_norm": 0.30981820821762085, + "learning_rate": 5.075354399902338e-06, + "loss": 0.3399, + "step": 6509 + }, + { + "epoch": 1.637323943661972, + "grad_norm": 0.3107950687408447, + "learning_rate": 5.073891315254337e-06, + "loss": 0.3439, + "step": 6510 + }, + { + "epoch": 1.6375754527162978, + "grad_norm": 0.29735639691352844, + "learning_rate": 5.072428224278005e-06, + "loss": 0.3326, + "step": 6511 + }, + { + "epoch": 1.6378269617706236, + "grad_norm": 0.34328925609588623, + "learning_rate": 5.07096512709865e-06, + "loss": 0.3414, + "step": 6512 + }, + { + "epoch": 1.6380784708249498, + "grad_norm": 0.306029736995697, + "learning_rate": 5.069502023841576e-06, + "loss": 0.3733, + "step": 6513 + }, + { + "epoch": 1.6383299798792756, + "grad_norm": 0.33722612261772156, + "learning_rate": 5.068038914632088e-06, + "loss": 0.3623, + "step": 6514 + }, + { + "epoch": 1.6385814889336014, + "grad_norm": 0.3304372727870941, + "learning_rate": 5.066575799595494e-06, + "loss": 0.3489, + "step": 6515 + }, + { + "epoch": 1.6388329979879277, + "grad_norm": 0.32422342896461487, + "learning_rate": 5.065112678857097e-06, + "loss": 0.3753, + "step": 6516 + }, + { + "epoch": 1.6390845070422535, + "grad_norm": 0.341206431388855, + "learning_rate": 5.063649552542208e-06, + "loss": 0.3352, + "step": 6517 + }, + { + "epoch": 1.6393360160965795, + "grad_norm": 0.30149486660957336, + "learning_rate": 5.062186420776132e-06, + "loss": 0.3636, + "step": 6518 + }, + { + "epoch": 1.6395875251509056, + "grad_norm": 0.31636372208595276, + "learning_rate": 5.060723283684178e-06, + "loss": 0.3196, + "step": 6519 + }, + { + "epoch": 1.6398390342052314, + "grad_norm": 0.3157700300216675, + "learning_rate": 5.0592601413916555e-06, + "loss": 0.3368, + "step": 6520 + }, + { + "epoch": 1.6400905432595574, + "grad_norm": 0.3472077548503876, + "learning_rate": 5.057796994023873e-06, + "loss": 0.3561, + "step": 6521 + }, + { + "epoch": 1.6403420523138834, + "grad_norm": 0.3037931025028229, + "learning_rate": 5.056333841706138e-06, + "loss": 0.3218, + "step": 6522 + }, + { + "epoch": 1.6405935613682092, + "grad_norm": 0.3278881907463074, + "learning_rate": 5.054870684563763e-06, + "loss": 0.3403, + "step": 6523 + }, + { + "epoch": 1.6408450704225352, + "grad_norm": 0.326613187789917, + "learning_rate": 5.053407522722057e-06, + "loss": 0.3386, + "step": 6524 + }, + { + "epoch": 1.6410965794768613, + "grad_norm": 0.31864145398139954, + "learning_rate": 5.0519443563063306e-06, + "loss": 0.3662, + "step": 6525 + }, + { + "epoch": 1.641348088531187, + "grad_norm": 0.29436439275741577, + "learning_rate": 5.0504811854418946e-06, + "loss": 0.3386, + "step": 6526 + }, + { + "epoch": 1.641599597585513, + "grad_norm": 0.3037160634994507, + "learning_rate": 5.049018010254062e-06, + "loss": 0.3429, + "step": 6527 + }, + { + "epoch": 1.6418511066398391, + "grad_norm": 0.29753682017326355, + "learning_rate": 5.047554830868142e-06, + "loss": 0.3608, + "step": 6528 + }, + { + "epoch": 1.642102615694165, + "grad_norm": 0.3260503113269806, + "learning_rate": 5.04609164740945e-06, + "loss": 0.3306, + "step": 6529 + }, + { + "epoch": 1.642354124748491, + "grad_norm": 0.34160879254341125, + "learning_rate": 5.044628460003296e-06, + "loss": 0.3614, + "step": 6530 + }, + { + "epoch": 1.642605633802817, + "grad_norm": 0.31074759364128113, + "learning_rate": 5.043165268774993e-06, + "loss": 0.3435, + "step": 6531 + }, + { + "epoch": 1.6428571428571428, + "grad_norm": 0.29596835374832153, + "learning_rate": 5.041702073849856e-06, + "loss": 0.3331, + "step": 6532 + }, + { + "epoch": 1.6431086519114688, + "grad_norm": 0.3086645305156708, + "learning_rate": 5.040238875353196e-06, + "loss": 0.3534, + "step": 6533 + }, + { + "epoch": 1.6433601609657948, + "grad_norm": 0.3223525285720825, + "learning_rate": 5.038775673410329e-06, + "loss": 0.3513, + "step": 6534 + }, + { + "epoch": 1.6436116700201207, + "grad_norm": 0.3153035044670105, + "learning_rate": 5.037312468146567e-06, + "loss": 0.3537, + "step": 6535 + }, + { + "epoch": 1.6438631790744467, + "grad_norm": 0.3239792287349701, + "learning_rate": 5.035849259687227e-06, + "loss": 0.3684, + "step": 6536 + }, + { + "epoch": 1.6441146881287727, + "grad_norm": 0.31517869234085083, + "learning_rate": 5.034386048157622e-06, + "loss": 0.3477, + "step": 6537 + }, + { + "epoch": 1.6443661971830985, + "grad_norm": 0.334823876619339, + "learning_rate": 5.032922833683066e-06, + "loss": 0.3537, + "step": 6538 + }, + { + "epoch": 1.6446177062374245, + "grad_norm": 0.2976732552051544, + "learning_rate": 5.031459616388874e-06, + "loss": 0.3599, + "step": 6539 + }, + { + "epoch": 1.6448692152917506, + "grad_norm": 0.3268083333969116, + "learning_rate": 5.029996396400365e-06, + "loss": 0.3449, + "step": 6540 + }, + { + "epoch": 1.6451207243460764, + "grad_norm": 0.3218528628349304, + "learning_rate": 5.028533173842851e-06, + "loss": 0.3338, + "step": 6541 + }, + { + "epoch": 1.6453722334004024, + "grad_norm": 0.28955259919166565, + "learning_rate": 5.02706994884165e-06, + "loss": 0.3325, + "step": 6542 + }, + { + "epoch": 1.6456237424547284, + "grad_norm": 0.2912788987159729, + "learning_rate": 5.025606721522077e-06, + "loss": 0.3465, + "step": 6543 + }, + { + "epoch": 1.6458752515090542, + "grad_norm": 0.33705586194992065, + "learning_rate": 5.024143492009449e-06, + "loss": 0.3647, + "step": 6544 + }, + { + "epoch": 1.6461267605633803, + "grad_norm": 0.32726725935935974, + "learning_rate": 5.022680260429082e-06, + "loss": 0.3435, + "step": 6545 + }, + { + "epoch": 1.6463782696177063, + "grad_norm": 0.32821711897850037, + "learning_rate": 5.021217026906292e-06, + "loss": 0.3583, + "step": 6546 + }, + { + "epoch": 1.646629778672032, + "grad_norm": 0.3105396330356598, + "learning_rate": 5.019753791566396e-06, + "loss": 0.34, + "step": 6547 + }, + { + "epoch": 1.6468812877263581, + "grad_norm": 0.3292068839073181, + "learning_rate": 5.0182905545347125e-06, + "loss": 0.3265, + "step": 6548 + }, + { + "epoch": 1.6471327967806841, + "grad_norm": 0.3053254783153534, + "learning_rate": 5.016827315936557e-06, + "loss": 0.3391, + "step": 6549 + }, + { + "epoch": 1.64738430583501, + "grad_norm": 0.3218199908733368, + "learning_rate": 5.015364075897246e-06, + "loss": 0.3596, + "step": 6550 + }, + { + "epoch": 1.647635814889336, + "grad_norm": 0.28854915499687195, + "learning_rate": 5.013900834542099e-06, + "loss": 0.3419, + "step": 6551 + }, + { + "epoch": 1.647887323943662, + "grad_norm": 0.33648034930229187, + "learning_rate": 5.012437591996432e-06, + "loss": 0.3496, + "step": 6552 + }, + { + "epoch": 1.6481388329979878, + "grad_norm": 0.2896369993686676, + "learning_rate": 5.010974348385565e-06, + "loss": 0.3423, + "step": 6553 + }, + { + "epoch": 1.6483903420523138, + "grad_norm": 0.31751370429992676, + "learning_rate": 5.009511103834811e-06, + "loss": 0.3562, + "step": 6554 + }, + { + "epoch": 1.6486418511066399, + "grad_norm": 0.2887558341026306, + "learning_rate": 5.008047858469492e-06, + "loss": 0.3316, + "step": 6555 + }, + { + "epoch": 1.6488933601609657, + "grad_norm": 0.3214130103588104, + "learning_rate": 5.006584612414924e-06, + "loss": 0.3598, + "step": 6556 + }, + { + "epoch": 1.649144869215292, + "grad_norm": 0.32030197978019714, + "learning_rate": 5.0051213657964245e-06, + "loss": 0.3675, + "step": 6557 + }, + { + "epoch": 1.6493963782696177, + "grad_norm": 0.3157503008842468, + "learning_rate": 5.003658118739313e-06, + "loss": 0.3495, + "step": 6558 + }, + { + "epoch": 1.6496478873239435, + "grad_norm": 0.31887757778167725, + "learning_rate": 5.0021948713689064e-06, + "loss": 0.3527, + "step": 6559 + }, + { + "epoch": 1.6498993963782698, + "grad_norm": 0.3290422558784485, + "learning_rate": 5.000731623810523e-06, + "loss": 0.3633, + "step": 6560 + }, + { + "epoch": 1.6501509054325956, + "grad_norm": 0.31094884872436523, + "learning_rate": 4.99926837618948e-06, + "loss": 0.3623, + "step": 6561 + }, + { + "epoch": 1.6504024144869214, + "grad_norm": 0.2901002764701843, + "learning_rate": 4.997805128631095e-06, + "loss": 0.355, + "step": 6562 + }, + { + "epoch": 1.6506539235412476, + "grad_norm": 0.28659021854400635, + "learning_rate": 4.996341881260689e-06, + "loss": 0.3392, + "step": 6563 + }, + { + "epoch": 1.6509054325955734, + "grad_norm": 0.324534147977829, + "learning_rate": 4.994878634203576e-06, + "loss": 0.3179, + "step": 6564 + }, + { + "epoch": 1.6511569416498992, + "grad_norm": 0.3127976357936859, + "learning_rate": 4.993415387585079e-06, + "loss": 0.3493, + "step": 6565 + }, + { + "epoch": 1.6514084507042255, + "grad_norm": 0.3226813077926636, + "learning_rate": 4.991952141530509e-06, + "loss": 0.3516, + "step": 6566 + }, + { + "epoch": 1.6516599597585513, + "grad_norm": 0.30921557545661926, + "learning_rate": 4.9904888961651895e-06, + "loss": 0.3334, + "step": 6567 + }, + { + "epoch": 1.6519114688128773, + "grad_norm": 0.31748196482658386, + "learning_rate": 4.989025651614438e-06, + "loss": 0.3315, + "step": 6568 + }, + { + "epoch": 1.6521629778672033, + "grad_norm": 0.3072637915611267, + "learning_rate": 4.987562408003568e-06, + "loss": 0.3572, + "step": 6569 + }, + { + "epoch": 1.6524144869215291, + "grad_norm": 0.3248019218444824, + "learning_rate": 4.9860991654579025e-06, + "loss": 0.3596, + "step": 6570 + }, + { + "epoch": 1.6526659959758552, + "grad_norm": 0.3171079754829407, + "learning_rate": 4.984635924102754e-06, + "loss": 0.3551, + "step": 6571 + }, + { + "epoch": 1.6529175050301812, + "grad_norm": 0.3320086598396301, + "learning_rate": 4.983172684063446e-06, + "loss": 0.3329, + "step": 6572 + }, + { + "epoch": 1.653169014084507, + "grad_norm": 0.3475695848464966, + "learning_rate": 4.981709445465288e-06, + "loss": 0.3538, + "step": 6573 + }, + { + "epoch": 1.653420523138833, + "grad_norm": 0.29529550671577454, + "learning_rate": 4.980246208433606e-06, + "loss": 0.315, + "step": 6574 + }, + { + "epoch": 1.653672032193159, + "grad_norm": 0.31380680203437805, + "learning_rate": 4.978782973093709e-06, + "loss": 0.3334, + "step": 6575 + }, + { + "epoch": 1.6539235412474849, + "grad_norm": 0.3308258354663849, + "learning_rate": 4.977319739570921e-06, + "loss": 0.3196, + "step": 6576 + }, + { + "epoch": 1.654175050301811, + "grad_norm": 0.3524826169013977, + "learning_rate": 4.975856507990552e-06, + "loss": 0.3344, + "step": 6577 + }, + { + "epoch": 1.654426559356137, + "grad_norm": 0.3204286992549896, + "learning_rate": 4.974393278477923e-06, + "loss": 0.3661, + "step": 6578 + }, + { + "epoch": 1.6546780684104627, + "grad_norm": 0.3220480978488922, + "learning_rate": 4.972930051158351e-06, + "loss": 0.3719, + "step": 6579 + }, + { + "epoch": 1.6549295774647887, + "grad_norm": 0.3321162760257721, + "learning_rate": 4.971466826157149e-06, + "loss": 0.3661, + "step": 6580 + }, + { + "epoch": 1.6551810865191148, + "grad_norm": 0.3444232642650604, + "learning_rate": 4.970003603599637e-06, + "loss": 0.3676, + "step": 6581 + }, + { + "epoch": 1.6554325955734406, + "grad_norm": 0.33695197105407715, + "learning_rate": 4.968540383611126e-06, + "loss": 0.3604, + "step": 6582 + }, + { + "epoch": 1.6556841046277666, + "grad_norm": 0.35238873958587646, + "learning_rate": 4.967077166316937e-06, + "loss": 0.3389, + "step": 6583 + }, + { + "epoch": 1.6559356136820926, + "grad_norm": 0.30956074595451355, + "learning_rate": 4.96561395184238e-06, + "loss": 0.3846, + "step": 6584 + }, + { + "epoch": 1.6561871227364184, + "grad_norm": 0.29510757327079773, + "learning_rate": 4.964150740312776e-06, + "loss": 0.3391, + "step": 6585 + }, + { + "epoch": 1.6564386317907445, + "grad_norm": 0.309314489364624, + "learning_rate": 4.962687531853434e-06, + "loss": 0.3519, + "step": 6586 + }, + { + "epoch": 1.6566901408450705, + "grad_norm": 0.3318527638912201, + "learning_rate": 4.961224326589674e-06, + "loss": 0.3607, + "step": 6587 + }, + { + "epoch": 1.6569416498993963, + "grad_norm": 0.3277136981487274, + "learning_rate": 4.959761124646805e-06, + "loss": 0.351, + "step": 6588 + }, + { + "epoch": 1.6571931589537223, + "grad_norm": 0.29636046290397644, + "learning_rate": 4.958297926150146e-06, + "loss": 0.3345, + "step": 6589 + }, + { + "epoch": 1.6574446680080483, + "grad_norm": 0.2861737310886383, + "learning_rate": 4.956834731225008e-06, + "loss": 0.3306, + "step": 6590 + }, + { + "epoch": 1.6576961770623742, + "grad_norm": 0.31142330169677734, + "learning_rate": 4.955371539996706e-06, + "loss": 0.3601, + "step": 6591 + }, + { + "epoch": 1.6579476861167002, + "grad_norm": 0.319341778755188, + "learning_rate": 4.953908352590552e-06, + "loss": 0.3525, + "step": 6592 + }, + { + "epoch": 1.6581991951710262, + "grad_norm": 0.295574426651001, + "learning_rate": 4.9524451691318585e-06, + "loss": 0.338, + "step": 6593 + }, + { + "epoch": 1.658450704225352, + "grad_norm": 0.3026335537433624, + "learning_rate": 4.95098198974594e-06, + "loss": 0.3506, + "step": 6594 + }, + { + "epoch": 1.658702213279678, + "grad_norm": 0.3181147873401642, + "learning_rate": 4.949518814558106e-06, + "loss": 0.3806, + "step": 6595 + }, + { + "epoch": 1.658953722334004, + "grad_norm": 0.30840447545051575, + "learning_rate": 4.948055643693671e-06, + "loss": 0.3635, + "step": 6596 + }, + { + "epoch": 1.6592052313883299, + "grad_norm": 0.3271403908729553, + "learning_rate": 4.946592477277945e-06, + "loss": 0.3833, + "step": 6597 + }, + { + "epoch": 1.659456740442656, + "grad_norm": 0.3057454228401184, + "learning_rate": 4.945129315436239e-06, + "loss": 0.3459, + "step": 6598 + }, + { + "epoch": 1.659708249496982, + "grad_norm": 0.30508214235305786, + "learning_rate": 4.943666158293864e-06, + "loss": 0.3659, + "step": 6599 + }, + { + "epoch": 1.6599597585513077, + "grad_norm": 0.3431748151779175, + "learning_rate": 4.942203005976128e-06, + "loss": 0.3873, + "step": 6600 + }, + { + "epoch": 1.6602112676056338, + "grad_norm": 0.3456651270389557, + "learning_rate": 4.940739858608346e-06, + "loss": 0.3472, + "step": 6601 + }, + { + "epoch": 1.6604627766599598, + "grad_norm": 0.3188634514808655, + "learning_rate": 4.939276716315822e-06, + "loss": 0.3422, + "step": 6602 + }, + { + "epoch": 1.6607142857142856, + "grad_norm": 0.31093767285346985, + "learning_rate": 4.937813579223871e-06, + "loss": 0.359, + "step": 6603 + }, + { + "epoch": 1.6609657947686118, + "grad_norm": 0.30429354310035706, + "learning_rate": 4.9363504474577936e-06, + "loss": 0.3307, + "step": 6604 + }, + { + "epoch": 1.6612173038229376, + "grad_norm": 0.31555190682411194, + "learning_rate": 4.934887321142905e-06, + "loss": 0.3468, + "step": 6605 + }, + { + "epoch": 1.6614688128772634, + "grad_norm": 0.30331486463546753, + "learning_rate": 4.933424200404508e-06, + "loss": 0.3412, + "step": 6606 + }, + { + "epoch": 1.6617203219315897, + "grad_norm": 0.3019322454929352, + "learning_rate": 4.9319610853679136e-06, + "loss": 0.3432, + "step": 6607 + }, + { + "epoch": 1.6619718309859155, + "grad_norm": 0.3192391097545624, + "learning_rate": 4.9304979761584256e-06, + "loss": 0.3571, + "step": 6608 + }, + { + "epoch": 1.6622233400402413, + "grad_norm": 0.31620514392852783, + "learning_rate": 4.929034872901352e-06, + "loss": 0.3415, + "step": 6609 + }, + { + "epoch": 1.6624748490945676, + "grad_norm": 0.32536083459854126, + "learning_rate": 4.927571775721996e-06, + "loss": 0.3433, + "step": 6610 + }, + { + "epoch": 1.6627263581488934, + "grad_norm": 0.2909981608390808, + "learning_rate": 4.926108684745664e-06, + "loss": 0.3614, + "step": 6611 + }, + { + "epoch": 1.6629778672032192, + "grad_norm": 0.30204445123672485, + "learning_rate": 4.924645600097663e-06, + "loss": 0.3639, + "step": 6612 + }, + { + "epoch": 1.6632293762575454, + "grad_norm": 0.3101191818714142, + "learning_rate": 4.923182521903293e-06, + "loss": 0.3451, + "step": 6613 + }, + { + "epoch": 1.6634808853118712, + "grad_norm": 0.3223671615123749, + "learning_rate": 4.9217194502878615e-06, + "loss": 0.353, + "step": 6614 + }, + { + "epoch": 1.663732394366197, + "grad_norm": 0.3006145656108856, + "learning_rate": 4.920256385376668e-06, + "loss": 0.3356, + "step": 6615 + }, + { + "epoch": 1.6639839034205233, + "grad_norm": 0.2919999957084656, + "learning_rate": 4.918793327295018e-06, + "loss": 0.3371, + "step": 6616 + }, + { + "epoch": 1.664235412474849, + "grad_norm": 0.3268287777900696, + "learning_rate": 4.917330276168208e-06, + "loss": 0.3501, + "step": 6617 + }, + { + "epoch": 1.664486921529175, + "grad_norm": 0.29942867159843445, + "learning_rate": 4.915867232121546e-06, + "loss": 0.3629, + "step": 6618 + }, + { + "epoch": 1.6647384305835011, + "grad_norm": 0.30160030722618103, + "learning_rate": 4.914404195280326e-06, + "loss": 0.3347, + "step": 6619 + }, + { + "epoch": 1.664989939637827, + "grad_norm": 0.30396658182144165, + "learning_rate": 4.912941165769855e-06, + "loss": 0.367, + "step": 6620 + }, + { + "epoch": 1.665241448692153, + "grad_norm": 0.34666627645492554, + "learning_rate": 4.9114781437154255e-06, + "loss": 0.3528, + "step": 6621 + }, + { + "epoch": 1.665492957746479, + "grad_norm": 0.324301153421402, + "learning_rate": 4.910015129242339e-06, + "loss": 0.3496, + "step": 6622 + }, + { + "epoch": 1.6657444668008048, + "grad_norm": 0.3196171224117279, + "learning_rate": 4.908552122475897e-06, + "loss": 0.338, + "step": 6623 + }, + { + "epoch": 1.6659959758551308, + "grad_norm": 0.2906225323677063, + "learning_rate": 4.90708912354139e-06, + "loss": 0.3447, + "step": 6624 + }, + { + "epoch": 1.6662474849094568, + "grad_norm": 0.3057789206504822, + "learning_rate": 4.905626132564121e-06, + "loss": 0.3315, + "step": 6625 + }, + { + "epoch": 1.6664989939637826, + "grad_norm": 0.34539157152175903, + "learning_rate": 4.904163149669382e-06, + "loss": 0.3487, + "step": 6626 + }, + { + "epoch": 1.6667505030181087, + "grad_norm": 0.3429698944091797, + "learning_rate": 4.902700174982471e-06, + "loss": 0.3373, + "step": 6627 + }, + { + "epoch": 1.6670020120724347, + "grad_norm": 0.29753679037094116, + "learning_rate": 4.901237208628679e-06, + "loss": 0.3371, + "step": 6628 + }, + { + "epoch": 1.6672535211267605, + "grad_norm": 0.3603936433792114, + "learning_rate": 4.899774250733305e-06, + "loss": 0.3302, + "step": 6629 + }, + { + "epoch": 1.6675050301810865, + "grad_norm": 0.315326452255249, + "learning_rate": 4.8983113014216365e-06, + "loss": 0.3192, + "step": 6630 + }, + { + "epoch": 1.6677565392354126, + "grad_norm": 0.28168460726737976, + "learning_rate": 4.896848360818971e-06, + "loss": 0.3242, + "step": 6631 + }, + { + "epoch": 1.6680080482897384, + "grad_norm": 0.3165770173072815, + "learning_rate": 4.895385429050597e-06, + "loss": 0.3665, + "step": 6632 + }, + { + "epoch": 1.6682595573440644, + "grad_norm": 0.3269800543785095, + "learning_rate": 4.893922506241806e-06, + "loss": 0.3463, + "step": 6633 + }, + { + "epoch": 1.6685110663983904, + "grad_norm": 0.3024964928627014, + "learning_rate": 4.8924595925178905e-06, + "loss": 0.3233, + "step": 6634 + }, + { + "epoch": 1.6687625754527162, + "grad_norm": 0.3181130290031433, + "learning_rate": 4.890996688004136e-06, + "loss": 0.3512, + "step": 6635 + }, + { + "epoch": 1.6690140845070423, + "grad_norm": 0.3129771053791046, + "learning_rate": 4.889533792825836e-06, + "loss": 0.3609, + "step": 6636 + }, + { + "epoch": 1.6692655935613683, + "grad_norm": 0.297387957572937, + "learning_rate": 4.888070907108273e-06, + "loss": 0.3696, + "step": 6637 + }, + { + "epoch": 1.669517102615694, + "grad_norm": 0.3127029836177826, + "learning_rate": 4.886608030976739e-06, + "loss": 0.3338, + "step": 6638 + }, + { + "epoch": 1.66976861167002, + "grad_norm": 0.326416939496994, + "learning_rate": 4.885145164556516e-06, + "loss": 0.3449, + "step": 6639 + }, + { + "epoch": 1.6700201207243461, + "grad_norm": 0.3048277795314789, + "learning_rate": 4.8836823079728925e-06, + "loss": 0.3643, + "step": 6640 + }, + { + "epoch": 1.670271629778672, + "grad_norm": 0.3344593644142151, + "learning_rate": 4.882219461351149e-06, + "loss": 0.3471, + "step": 6641 + }, + { + "epoch": 1.670523138832998, + "grad_norm": 0.3117375075817108, + "learning_rate": 4.880756624816574e-06, + "loss": 0.3416, + "step": 6642 + }, + { + "epoch": 1.670774647887324, + "grad_norm": 0.31037411093711853, + "learning_rate": 4.879293798494448e-06, + "loss": 0.3518, + "step": 6643 + }, + { + "epoch": 1.6710261569416498, + "grad_norm": 0.3362254500389099, + "learning_rate": 4.877830982510052e-06, + "loss": 0.3156, + "step": 6644 + }, + { + "epoch": 1.6712776659959758, + "grad_norm": 0.3232327699661255, + "learning_rate": 4.876368176988669e-06, + "loss": 0.3462, + "step": 6645 + }, + { + "epoch": 1.6715291750503019, + "grad_norm": 0.29828834533691406, + "learning_rate": 4.874905382055578e-06, + "loss": 0.3654, + "step": 6646 + }, + { + "epoch": 1.6717806841046277, + "grad_norm": 0.3095129430294037, + "learning_rate": 4.873442597836058e-06, + "loss": 0.3525, + "step": 6647 + }, + { + "epoch": 1.6720321931589537, + "grad_norm": 0.3279082775115967, + "learning_rate": 4.871979824455388e-06, + "loss": 0.3591, + "step": 6648 + }, + { + "epoch": 1.6722837022132797, + "grad_norm": 0.33682674169540405, + "learning_rate": 4.870517062038846e-06, + "loss": 0.3456, + "step": 6649 + }, + { + "epoch": 1.6725352112676055, + "grad_norm": 0.3893029987812042, + "learning_rate": 4.869054310711707e-06, + "loss": 0.3861, + "step": 6650 + }, + { + "epoch": 1.6727867203219315, + "grad_norm": 0.346723735332489, + "learning_rate": 4.867591570599247e-06, + "loss": 0.3562, + "step": 6651 + }, + { + "epoch": 1.6730382293762576, + "grad_norm": 0.3105736970901489, + "learning_rate": 4.8661288418267395e-06, + "loss": 0.3493, + "step": 6652 + }, + { + "epoch": 1.6732897384305834, + "grad_norm": 0.33905136585235596, + "learning_rate": 4.8646661245194605e-06, + "loss": 0.336, + "step": 6653 + }, + { + "epoch": 1.6735412474849096, + "grad_norm": 0.3350204527378082, + "learning_rate": 4.863203418802681e-06, + "loss": 0.3214, + "step": 6654 + }, + { + "epoch": 1.6737927565392354, + "grad_norm": 0.3009431064128876, + "learning_rate": 4.861740724801673e-06, + "loss": 0.336, + "step": 6655 + }, + { + "epoch": 1.6740442655935612, + "grad_norm": 0.3083348572254181, + "learning_rate": 4.860278042641707e-06, + "loss": 0.3652, + "step": 6656 + }, + { + "epoch": 1.6742957746478875, + "grad_norm": 0.30141690373420715, + "learning_rate": 4.858815372448053e-06, + "loss": 0.3533, + "step": 6657 + }, + { + "epoch": 1.6745472837022133, + "grad_norm": 0.2887999415397644, + "learning_rate": 4.857352714345978e-06, + "loss": 0.349, + "step": 6658 + }, + { + "epoch": 1.674798792756539, + "grad_norm": 0.28986456990242004, + "learning_rate": 4.8558900684607515e-06, + "loss": 0.3252, + "step": 6659 + }, + { + "epoch": 1.6750503018108653, + "grad_norm": 0.3002329468727112, + "learning_rate": 4.854427434917638e-06, + "loss": 0.3624, + "step": 6660 + }, + { + "epoch": 1.6753018108651911, + "grad_norm": 0.3201340138912201, + "learning_rate": 4.852964813841906e-06, + "loss": 0.3457, + "step": 6661 + }, + { + "epoch": 1.675553319919517, + "grad_norm": 0.30298128724098206, + "learning_rate": 4.851502205358816e-06, + "loss": 0.365, + "step": 6662 + }, + { + "epoch": 1.6758048289738432, + "grad_norm": 0.33912432193756104, + "learning_rate": 4.850039609593634e-06, + "loss": 0.3453, + "step": 6663 + }, + { + "epoch": 1.676056338028169, + "grad_norm": 0.3226577341556549, + "learning_rate": 4.84857702667162e-06, + "loss": 0.3462, + "step": 6664 + }, + { + "epoch": 1.6763078470824948, + "grad_norm": 0.2974978983402252, + "learning_rate": 4.847114456718039e-06, + "loss": 0.3518, + "step": 6665 + }, + { + "epoch": 1.676559356136821, + "grad_norm": 0.3077523708343506, + "learning_rate": 4.8456518998581445e-06, + "loss": 0.3547, + "step": 6666 + }, + { + "epoch": 1.6768108651911469, + "grad_norm": 0.32342010736465454, + "learning_rate": 4.844189356217203e-06, + "loss": 0.3691, + "step": 6667 + }, + { + "epoch": 1.6770623742454729, + "grad_norm": 0.33299824595451355, + "learning_rate": 4.842726825920466e-06, + "loss": 0.368, + "step": 6668 + }, + { + "epoch": 1.677313883299799, + "grad_norm": 0.2916051149368286, + "learning_rate": 4.8412643090931945e-06, + "loss": 0.3292, + "step": 6669 + }, + { + "epoch": 1.6775653923541247, + "grad_norm": 0.3211728632450104, + "learning_rate": 4.839801805860639e-06, + "loss": 0.3667, + "step": 6670 + }, + { + "epoch": 1.6778169014084507, + "grad_norm": 0.3185436725616455, + "learning_rate": 4.83833931634806e-06, + "loss": 0.3416, + "step": 6671 + }, + { + "epoch": 1.6780684104627768, + "grad_norm": 0.3072052001953125, + "learning_rate": 4.8368768406807045e-06, + "loss": 0.3526, + "step": 6672 + }, + { + "epoch": 1.6783199195171026, + "grad_norm": 0.29810407757759094, + "learning_rate": 4.8354143789838285e-06, + "loss": 0.339, + "step": 6673 + }, + { + "epoch": 1.6785714285714286, + "grad_norm": 0.3369450271129608, + "learning_rate": 4.83395193138268e-06, + "loss": 0.3568, + "step": 6674 + }, + { + "epoch": 1.6788229376257546, + "grad_norm": 0.33037352561950684, + "learning_rate": 4.83248949800251e-06, + "loss": 0.3522, + "step": 6675 + }, + { + "epoch": 1.6790744466800804, + "grad_norm": 0.36880165338516235, + "learning_rate": 4.831027078968568e-06, + "loss": 0.3664, + "step": 6676 + }, + { + "epoch": 1.6793259557344065, + "grad_norm": 0.3295780420303345, + "learning_rate": 4.829564674406098e-06, + "loss": 0.3236, + "step": 6677 + }, + { + "epoch": 1.6795774647887325, + "grad_norm": 0.327239066362381, + "learning_rate": 4.828102284440349e-06, + "loss": 0.3442, + "step": 6678 + }, + { + "epoch": 1.6798289738430583, + "grad_norm": 0.3397291898727417, + "learning_rate": 4.826639909196562e-06, + "loss": 0.3433, + "step": 6679 + }, + { + "epoch": 1.6800804828973843, + "grad_norm": 0.3069293797016144, + "learning_rate": 4.825177548799985e-06, + "loss": 0.3455, + "step": 6680 + }, + { + "epoch": 1.6803319919517103, + "grad_norm": 0.35793232917785645, + "learning_rate": 4.823715203375854e-06, + "loss": 0.3675, + "step": 6681 + }, + { + "epoch": 1.6805835010060362, + "grad_norm": 0.291510671377182, + "learning_rate": 4.822252873049416e-06, + "loss": 0.3461, + "step": 6682 + }, + { + "epoch": 1.6808350100603622, + "grad_norm": 0.32121041417121887, + "learning_rate": 4.8207905579459054e-06, + "loss": 0.3352, + "step": 6683 + }, + { + "epoch": 1.6810865191146882, + "grad_norm": 0.337119460105896, + "learning_rate": 4.819328258190564e-06, + "loss": 0.3171, + "step": 6684 + }, + { + "epoch": 1.681338028169014, + "grad_norm": 0.32898882031440735, + "learning_rate": 4.817865973908625e-06, + "loss": 0.35, + "step": 6685 + }, + { + "epoch": 1.68158953722334, + "grad_norm": 0.3288995623588562, + "learning_rate": 4.816403705225326e-06, + "loss": 0.3474, + "step": 6686 + }, + { + "epoch": 1.681841046277666, + "grad_norm": 0.3149915039539337, + "learning_rate": 4.814941452265903e-06, + "loss": 0.3283, + "step": 6687 + }, + { + "epoch": 1.6820925553319919, + "grad_norm": 0.3553139865398407, + "learning_rate": 4.813479215155585e-06, + "loss": 0.378, + "step": 6688 + }, + { + "epoch": 1.682344064386318, + "grad_norm": 0.3398416042327881, + "learning_rate": 4.812016994019607e-06, + "loss": 0.3614, + "step": 6689 + }, + { + "epoch": 1.682595573440644, + "grad_norm": 0.33960628509521484, + "learning_rate": 4.810554788983196e-06, + "loss": 0.348, + "step": 6690 + }, + { + "epoch": 1.6828470824949697, + "grad_norm": 0.3453803062438965, + "learning_rate": 4.809092600171584e-06, + "loss": 0.3615, + "step": 6691 + }, + { + "epoch": 1.6830985915492958, + "grad_norm": 0.3408578634262085, + "learning_rate": 4.807630427709995e-06, + "loss": 0.3595, + "step": 6692 + }, + { + "epoch": 1.6833501006036218, + "grad_norm": 0.3130299746990204, + "learning_rate": 4.806168271723657e-06, + "loss": 0.3441, + "step": 6693 + }, + { + "epoch": 1.6836016096579476, + "grad_norm": 0.3115828037261963, + "learning_rate": 4.804706132337793e-06, + "loss": 0.3505, + "step": 6694 + }, + { + "epoch": 1.6838531187122736, + "grad_norm": 0.34020087122917175, + "learning_rate": 4.803244009677629e-06, + "loss": 0.3569, + "step": 6695 + }, + { + "epoch": 1.6841046277665996, + "grad_norm": 0.3176514804363251, + "learning_rate": 4.801781903868383e-06, + "loss": 0.3452, + "step": 6696 + }, + { + "epoch": 1.6843561368209254, + "grad_norm": 0.33271244168281555, + "learning_rate": 4.8003198150352755e-06, + "loss": 0.3536, + "step": 6697 + }, + { + "epoch": 1.6846076458752515, + "grad_norm": 0.3202335834503174, + "learning_rate": 4.79885774330353e-06, + "loss": 0.3692, + "step": 6698 + }, + { + "epoch": 1.6848591549295775, + "grad_norm": 0.314801424741745, + "learning_rate": 4.797395688798358e-06, + "loss": 0.3468, + "step": 6699 + }, + { + "epoch": 1.6851106639839033, + "grad_norm": 0.35798442363739014, + "learning_rate": 4.7959336516449795e-06, + "loss": 0.3449, + "step": 6700 + }, + { + "epoch": 1.6853621730382293, + "grad_norm": 0.35215941071510315, + "learning_rate": 4.794471631968606e-06, + "loss": 0.3699, + "step": 6701 + }, + { + "epoch": 1.6856136820925554, + "grad_norm": 0.30209723114967346, + "learning_rate": 4.793009629894451e-06, + "loss": 0.3386, + "step": 6702 + }, + { + "epoch": 1.6858651911468812, + "grad_norm": 0.3090112507343292, + "learning_rate": 4.791547645547727e-06, + "loss": 0.3756, + "step": 6703 + }, + { + "epoch": 1.6861167002012074, + "grad_norm": 0.32377490401268005, + "learning_rate": 4.790085679053644e-06, + "loss": 0.3814, + "step": 6704 + }, + { + "epoch": 1.6863682092555332, + "grad_norm": 0.30719804763793945, + "learning_rate": 4.788623730537407e-06, + "loss": 0.3386, + "step": 6705 + }, + { + "epoch": 1.686619718309859, + "grad_norm": 0.3107820451259613, + "learning_rate": 4.787161800124228e-06, + "loss": 0.3603, + "step": 6706 + }, + { + "epoch": 1.6868712273641853, + "grad_norm": 0.30711448192596436, + "learning_rate": 4.785699887939307e-06, + "loss": 0.344, + "step": 6707 + }, + { + "epoch": 1.687122736418511, + "grad_norm": 0.30240827798843384, + "learning_rate": 4.78423799410785e-06, + "loss": 0.3494, + "step": 6708 + }, + { + "epoch": 1.6873742454728369, + "grad_norm": 0.30051419138908386, + "learning_rate": 4.782776118755061e-06, + "loss": 0.3477, + "step": 6709 + }, + { + "epoch": 1.6876257545271631, + "grad_norm": 0.3172437846660614, + "learning_rate": 4.7813142620061365e-06, + "loss": 0.3885, + "step": 6710 + }, + { + "epoch": 1.687877263581489, + "grad_norm": 0.30793458223342896, + "learning_rate": 4.779852423986278e-06, + "loss": 0.3312, + "step": 6711 + }, + { + "epoch": 1.6881287726358147, + "grad_norm": 0.31784626841545105, + "learning_rate": 4.778390604820683e-06, + "loss": 0.3646, + "step": 6712 + }, + { + "epoch": 1.688380281690141, + "grad_norm": 0.3140884041786194, + "learning_rate": 4.776928804634545e-06, + "loss": 0.3276, + "step": 6713 + }, + { + "epoch": 1.6886317907444668, + "grad_norm": 0.3013300895690918, + "learning_rate": 4.775467023553061e-06, + "loss": 0.3607, + "step": 6714 + }, + { + "epoch": 1.6888832997987926, + "grad_norm": 0.30073946714401245, + "learning_rate": 4.77400526170142e-06, + "loss": 0.348, + "step": 6715 + }, + { + "epoch": 1.6891348088531188, + "grad_norm": 0.3390004336833954, + "learning_rate": 4.7725435192048156e-06, + "loss": 0.324, + "step": 6716 + }, + { + "epoch": 1.6893863179074446, + "grad_norm": 0.31539881229400635, + "learning_rate": 4.771081796188435e-06, + "loss": 0.3331, + "step": 6717 + }, + { + "epoch": 1.6896378269617707, + "grad_norm": 0.34204819798469543, + "learning_rate": 4.7696200927774675e-06, + "loss": 0.3243, + "step": 6718 + }, + { + "epoch": 1.6898893360160967, + "grad_norm": 0.312477171421051, + "learning_rate": 4.768158409097096e-06, + "loss": 0.3378, + "step": 6719 + }, + { + "epoch": 1.6901408450704225, + "grad_norm": 0.32249870896339417, + "learning_rate": 4.766696745272508e-06, + "loss": 0.3498, + "step": 6720 + }, + { + "epoch": 1.6903923541247485, + "grad_norm": 0.31593847274780273, + "learning_rate": 4.765235101428883e-06, + "loss": 0.3244, + "step": 6721 + }, + { + "epoch": 1.6906438631790746, + "grad_norm": 0.31690171360969543, + "learning_rate": 4.7637734776914045e-06, + "loss": 0.3346, + "step": 6722 + }, + { + "epoch": 1.6908953722334004, + "grad_norm": 0.3169795274734497, + "learning_rate": 4.7623118741852484e-06, + "loss": 0.3551, + "step": 6723 + }, + { + "epoch": 1.6911468812877264, + "grad_norm": 0.2886812686920166, + "learning_rate": 4.760850291035595e-06, + "loss": 0.3425, + "step": 6724 + }, + { + "epoch": 1.6913983903420524, + "grad_norm": 0.3049076199531555, + "learning_rate": 4.759388728367615e-06, + "loss": 0.3521, + "step": 6725 + }, + { + "epoch": 1.6916498993963782, + "grad_norm": 0.2920475900173187, + "learning_rate": 4.757927186306489e-06, + "loss": 0.3332, + "step": 6726 + }, + { + "epoch": 1.6919014084507042, + "grad_norm": 0.3055409789085388, + "learning_rate": 4.756465664977381e-06, + "loss": 0.3586, + "step": 6727 + }, + { + "epoch": 1.6921529175050303, + "grad_norm": 0.3127298057079315, + "learning_rate": 4.75500416450547e-06, + "loss": 0.3309, + "step": 6728 + }, + { + "epoch": 1.692404426559356, + "grad_norm": 0.3161258101463318, + "learning_rate": 4.753542685015916e-06, + "loss": 0.3436, + "step": 6729 + }, + { + "epoch": 1.692655935613682, + "grad_norm": 0.3124731481075287, + "learning_rate": 4.752081226633888e-06, + "loss": 0.3495, + "step": 6730 + }, + { + "epoch": 1.6929074446680081, + "grad_norm": 0.30646365880966187, + "learning_rate": 4.750619789484556e-06, + "loss": 0.328, + "step": 6731 + }, + { + "epoch": 1.693158953722334, + "grad_norm": 0.30611762404441833, + "learning_rate": 4.749158373693076e-06, + "loss": 0.363, + "step": 6732 + }, + { + "epoch": 1.69341046277666, + "grad_norm": 0.3001714050769806, + "learning_rate": 4.7476969793846136e-06, + "loss": 0.3489, + "step": 6733 + }, + { + "epoch": 1.693661971830986, + "grad_norm": 0.30453163385391235, + "learning_rate": 4.746235606684326e-06, + "loss": 0.3424, + "step": 6734 + }, + { + "epoch": 1.6939134808853118, + "grad_norm": 0.31667834520339966, + "learning_rate": 4.744774255717372e-06, + "loss": 0.3638, + "step": 6735 + }, + { + "epoch": 1.6941649899396378, + "grad_norm": 0.30841580033302307, + "learning_rate": 4.7433129266089045e-06, + "loss": 0.3691, + "step": 6736 + }, + { + "epoch": 1.6944164989939638, + "grad_norm": 0.3112299144268036, + "learning_rate": 4.74185161948408e-06, + "loss": 0.3445, + "step": 6737 + }, + { + "epoch": 1.6946680080482897, + "grad_norm": 0.32629552483558655, + "learning_rate": 4.7403903344680495e-06, + "loss": 0.344, + "step": 6738 + }, + { + "epoch": 1.6949195171026157, + "grad_norm": 0.3291604220867157, + "learning_rate": 4.7389290716859634e-06, + "loss": 0.3614, + "step": 6739 + }, + { + "epoch": 1.6951710261569417, + "grad_norm": 0.3310836851596832, + "learning_rate": 4.737467831262967e-06, + "loss": 0.3459, + "step": 6740 + }, + { + "epoch": 1.6954225352112675, + "grad_norm": 0.3385966718196869, + "learning_rate": 4.736006613324209e-06, + "loss": 0.3583, + "step": 6741 + }, + { + "epoch": 1.6956740442655935, + "grad_norm": 0.29955390095710754, + "learning_rate": 4.734545417994834e-06, + "loss": 0.3815, + "step": 6742 + }, + { + "epoch": 1.6959255533199196, + "grad_norm": 0.30086326599121094, + "learning_rate": 4.7330842453999825e-06, + "loss": 0.3576, + "step": 6743 + }, + { + "epoch": 1.6961770623742454, + "grad_norm": 0.29754674434661865, + "learning_rate": 4.731623095664797e-06, + "loss": 0.3435, + "step": 6744 + }, + { + "epoch": 1.6964285714285714, + "grad_norm": 0.34076476097106934, + "learning_rate": 4.730161968914412e-06, + "loss": 0.3372, + "step": 6745 + }, + { + "epoch": 1.6966800804828974, + "grad_norm": 0.3299402594566345, + "learning_rate": 4.72870086527397e-06, + "loss": 0.34, + "step": 6746 + }, + { + "epoch": 1.6969315895372232, + "grad_norm": 0.3097115755081177, + "learning_rate": 4.727239784868597e-06, + "loss": 0.3463, + "step": 6747 + }, + { + "epoch": 1.6971830985915493, + "grad_norm": 0.30106401443481445, + "learning_rate": 4.725778727823434e-06, + "loss": 0.3376, + "step": 6748 + }, + { + "epoch": 1.6974346076458753, + "grad_norm": 0.3247431814670563, + "learning_rate": 4.724317694263605e-06, + "loss": 0.3387, + "step": 6749 + }, + { + "epoch": 1.697686116700201, + "grad_norm": 0.30322572588920593, + "learning_rate": 4.7228566843142426e-06, + "loss": 0.3589, + "step": 6750 + }, + { + "epoch": 1.6979376257545271, + "grad_norm": 0.3284371793270111, + "learning_rate": 4.72139569810047e-06, + "loss": 0.3592, + "step": 6751 + }, + { + "epoch": 1.6981891348088531, + "grad_norm": 0.3332383632659912, + "learning_rate": 4.7199347357474115e-06, + "loss": 0.357, + "step": 6752 + }, + { + "epoch": 1.698440643863179, + "grad_norm": 0.32514792680740356, + "learning_rate": 4.7184737973801945e-06, + "loss": 0.3224, + "step": 6753 + }, + { + "epoch": 1.6986921529175052, + "grad_norm": 0.2966492176055908, + "learning_rate": 4.717012883123932e-06, + "loss": 0.3646, + "step": 6754 + }, + { + "epoch": 1.698943661971831, + "grad_norm": 0.32919812202453613, + "learning_rate": 4.715551993103749e-06, + "loss": 0.3455, + "step": 6755 + }, + { + "epoch": 1.6991951710261568, + "grad_norm": 0.29699820280075073, + "learning_rate": 4.714091127444755e-06, + "loss": 0.3478, + "step": 6756 + }, + { + "epoch": 1.699446680080483, + "grad_norm": 0.33212509751319885, + "learning_rate": 4.712630286272071e-06, + "loss": 0.351, + "step": 6757 + }, + { + "epoch": 1.6996981891348089, + "grad_norm": 0.323261022567749, + "learning_rate": 4.711169469710802e-06, + "loss": 0.353, + "step": 6758 + }, + { + "epoch": 1.6999496981891347, + "grad_norm": 0.3002278804779053, + "learning_rate": 4.7097086778860625e-06, + "loss": 0.3433, + "step": 6759 + }, + { + "epoch": 1.700201207243461, + "grad_norm": 0.31812816858291626, + "learning_rate": 4.708247910922958e-06, + "loss": 0.3633, + "step": 6760 + }, + { + "epoch": 1.7004527162977867, + "grad_norm": 0.3050650656223297, + "learning_rate": 4.706787168946596e-06, + "loss": 0.3376, + "step": 6761 + }, + { + "epoch": 1.7007042253521125, + "grad_norm": 0.3200231194496155, + "learning_rate": 4.705326452082076e-06, + "loss": 0.3421, + "step": 6762 + }, + { + "epoch": 1.7009557344064388, + "grad_norm": 0.32024839520454407, + "learning_rate": 4.703865760454503e-06, + "loss": 0.3384, + "step": 6763 + }, + { + "epoch": 1.7012072434607646, + "grad_norm": 0.3096054494380951, + "learning_rate": 4.702405094188977e-06, + "loss": 0.3466, + "step": 6764 + }, + { + "epoch": 1.7014587525150904, + "grad_norm": 0.30760541558265686, + "learning_rate": 4.70094445341059e-06, + "loss": 0.3337, + "step": 6765 + }, + { + "epoch": 1.7017102615694166, + "grad_norm": 0.2908000349998474, + "learning_rate": 4.699483838244443e-06, + "loss": 0.3563, + "step": 6766 + }, + { + "epoch": 1.7019617706237424, + "grad_norm": 0.3102479875087738, + "learning_rate": 4.698023248815623e-06, + "loss": 0.34, + "step": 6767 + }, + { + "epoch": 1.7022132796780685, + "grad_norm": 0.317518413066864, + "learning_rate": 4.6965626852492235e-06, + "loss": 0.3706, + "step": 6768 + }, + { + "epoch": 1.7024647887323945, + "grad_norm": 0.33676648139953613, + "learning_rate": 4.6951021476703304e-06, + "loss": 0.348, + "step": 6769 + }, + { + "epoch": 1.7027162977867203, + "grad_norm": 0.30047839879989624, + "learning_rate": 4.6936416362040325e-06, + "loss": 0.3411, + "step": 6770 + }, + { + "epoch": 1.7029678068410463, + "grad_norm": 0.325817734003067, + "learning_rate": 4.69218115097541e-06, + "loss": 0.3657, + "step": 6771 + }, + { + "epoch": 1.7032193158953723, + "grad_norm": 0.28770050406455994, + "learning_rate": 4.690720692109549e-06, + "loss": 0.3375, + "step": 6772 + }, + { + "epoch": 1.7034708249496981, + "grad_norm": 0.30655109882354736, + "learning_rate": 4.689260259731523e-06, + "loss": 0.3397, + "step": 6773 + }, + { + "epoch": 1.7037223340040242, + "grad_norm": 0.3254563510417938, + "learning_rate": 4.687799853966413e-06, + "loss": 0.3299, + "step": 6774 + }, + { + "epoch": 1.7039738430583502, + "grad_norm": 0.3062305450439453, + "learning_rate": 4.686339474939293e-06, + "loss": 0.3617, + "step": 6775 + }, + { + "epoch": 1.704225352112676, + "grad_norm": 0.3447815179824829, + "learning_rate": 4.6848791227752335e-06, + "loss": 0.3491, + "step": 6776 + }, + { + "epoch": 1.704476861167002, + "grad_norm": 0.3192574083805084, + "learning_rate": 4.6834187975993065e-06, + "loss": 0.3619, + "step": 6777 + }, + { + "epoch": 1.704728370221328, + "grad_norm": 0.3120723068714142, + "learning_rate": 4.681958499536579e-06, + "loss": 0.3453, + "step": 6778 + }, + { + "epoch": 1.7049798792756539, + "grad_norm": 0.31706029176712036, + "learning_rate": 4.680498228712116e-06, + "loss": 0.3423, + "step": 6779 + }, + { + "epoch": 1.70523138832998, + "grad_norm": 0.3090144693851471, + "learning_rate": 4.679037985250981e-06, + "loss": 0.3368, + "step": 6780 + }, + { + "epoch": 1.705482897384306, + "grad_norm": 0.3182971179485321, + "learning_rate": 4.677577769278235e-06, + "loss": 0.3429, + "step": 6781 + }, + { + "epoch": 1.7057344064386317, + "grad_norm": 0.2918050289154053, + "learning_rate": 4.6761175809189366e-06, + "loss": 0.3477, + "step": 6782 + }, + { + "epoch": 1.7059859154929577, + "grad_norm": 0.3405851721763611, + "learning_rate": 4.67465742029814e-06, + "loss": 0.3507, + "step": 6783 + }, + { + "epoch": 1.7062374245472838, + "grad_norm": 0.2979619801044464, + "learning_rate": 4.6731972875409e-06, + "loss": 0.3183, + "step": 6784 + }, + { + "epoch": 1.7064889336016096, + "grad_norm": 0.3200087249279022, + "learning_rate": 4.671737182772267e-06, + "loss": 0.3588, + "step": 6785 + }, + { + "epoch": 1.7067404426559356, + "grad_norm": 0.3113258183002472, + "learning_rate": 4.6702771061172935e-06, + "loss": 0.3237, + "step": 6786 + }, + { + "epoch": 1.7069919517102616, + "grad_norm": 0.3095763325691223, + "learning_rate": 4.66881705770102e-06, + "loss": 0.3481, + "step": 6787 + }, + { + "epoch": 1.7072434607645874, + "grad_norm": 0.32275089621543884, + "learning_rate": 4.667357037648496e-06, + "loss": 0.3434, + "step": 6788 + }, + { + "epoch": 1.7074949698189135, + "grad_norm": 0.3030339777469635, + "learning_rate": 4.665897046084759e-06, + "loss": 0.3369, + "step": 6789 + }, + { + "epoch": 1.7077464788732395, + "grad_norm": 0.32319900393486023, + "learning_rate": 4.6644370831348524e-06, + "loss": 0.3355, + "step": 6790 + }, + { + "epoch": 1.7079979879275653, + "grad_norm": 0.2972315847873688, + "learning_rate": 4.662977148923808e-06, + "loss": 0.337, + "step": 6791 + }, + { + "epoch": 1.7082494969818913, + "grad_norm": 0.3062729239463806, + "learning_rate": 4.6615172435766636e-06, + "loss": 0.3504, + "step": 6792 + }, + { + "epoch": 1.7085010060362174, + "grad_norm": 0.3187355697154999, + "learning_rate": 4.660057367218448e-06, + "loss": 0.3579, + "step": 6793 + }, + { + "epoch": 1.7087525150905432, + "grad_norm": 0.33169421553611755, + "learning_rate": 4.658597519974193e-06, + "loss": 0.3504, + "step": 6794 + }, + { + "epoch": 1.7090040241448692, + "grad_norm": 0.3317626118659973, + "learning_rate": 4.657137701968925e-06, + "loss": 0.3571, + "step": 6795 + }, + { + "epoch": 1.7092555331991952, + "grad_norm": 0.31924623250961304, + "learning_rate": 4.655677913327668e-06, + "loss": 0.3457, + "step": 6796 + }, + { + "epoch": 1.709507042253521, + "grad_norm": 0.3141118288040161, + "learning_rate": 4.654218154175444e-06, + "loss": 0.3178, + "step": 6797 + }, + { + "epoch": 1.709758551307847, + "grad_norm": 0.3203374445438385, + "learning_rate": 4.652758424637271e-06, + "loss": 0.3472, + "step": 6798 + }, + { + "epoch": 1.710010060362173, + "grad_norm": 0.33453568816185, + "learning_rate": 4.651298724838168e-06, + "loss": 0.3485, + "step": 6799 + }, + { + "epoch": 1.7102615694164989, + "grad_norm": 0.3020557761192322, + "learning_rate": 4.649839054903146e-06, + "loss": 0.3516, + "step": 6800 + }, + { + "epoch": 1.710513078470825, + "grad_norm": 0.32627707719802856, + "learning_rate": 4.6483794149572196e-06, + "loss": 0.3663, + "step": 6801 + }, + { + "epoch": 1.710764587525151, + "grad_norm": 0.29445070028305054, + "learning_rate": 4.646919805125396e-06, + "loss": 0.3552, + "step": 6802 + }, + { + "epoch": 1.7110160965794767, + "grad_norm": 0.30278632044792175, + "learning_rate": 4.645460225532683e-06, + "loss": 0.3602, + "step": 6803 + }, + { + "epoch": 1.711267605633803, + "grad_norm": 0.3449458181858063, + "learning_rate": 4.644000676304082e-06, + "loss": 0.3329, + "step": 6804 + }, + { + "epoch": 1.7115191146881288, + "grad_norm": 0.32875609397888184, + "learning_rate": 4.642541157564596e-06, + "loss": 0.3313, + "step": 6805 + }, + { + "epoch": 1.7117706237424546, + "grad_norm": 0.3220992386341095, + "learning_rate": 4.641081669439226e-06, + "loss": 0.3739, + "step": 6806 + }, + { + "epoch": 1.7120221327967808, + "grad_norm": 0.30020537972450256, + "learning_rate": 4.6396222120529625e-06, + "loss": 0.33, + "step": 6807 + }, + { + "epoch": 1.7122736418511066, + "grad_norm": 0.3377690315246582, + "learning_rate": 4.638162785530805e-06, + "loss": 0.3478, + "step": 6808 + }, + { + "epoch": 1.7125251509054324, + "grad_norm": 0.30975767970085144, + "learning_rate": 4.636703389997739e-06, + "loss": 0.3479, + "step": 6809 + }, + { + "epoch": 1.7127766599597587, + "grad_norm": 0.31768518686294556, + "learning_rate": 4.635244025578757e-06, + "loss": 0.3301, + "step": 6810 + }, + { + "epoch": 1.7130281690140845, + "grad_norm": 0.28654173016548157, + "learning_rate": 4.63378469239884e-06, + "loss": 0.3402, + "step": 6811 + }, + { + "epoch": 1.7132796780684103, + "grad_norm": 0.33102425932884216, + "learning_rate": 4.632325390582976e-06, + "loss": 0.3487, + "step": 6812 + }, + { + "epoch": 1.7135311871227366, + "grad_norm": 0.30774351954460144, + "learning_rate": 4.630866120256139e-06, + "loss": 0.3409, + "step": 6813 + }, + { + "epoch": 1.7137826961770624, + "grad_norm": 0.3273797035217285, + "learning_rate": 4.629406881543312e-06, + "loss": 0.3329, + "step": 6814 + }, + { + "epoch": 1.7140342052313882, + "grad_norm": 0.31635209918022156, + "learning_rate": 4.6279476745694655e-06, + "loss": 0.3387, + "step": 6815 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 0.3162465989589691, + "learning_rate": 4.6264884994595725e-06, + "loss": 0.3405, + "step": 6816 + }, + { + "epoch": 1.7145372233400402, + "grad_norm": 0.35132935643196106, + "learning_rate": 4.625029356338605e-06, + "loss": 0.3522, + "step": 6817 + }, + { + "epoch": 1.7147887323943662, + "grad_norm": 0.31276100873947144, + "learning_rate": 4.623570245331525e-06, + "loss": 0.3402, + "step": 6818 + }, + { + "epoch": 1.7150402414486923, + "grad_norm": 0.2836771309375763, + "learning_rate": 4.622111166563301e-06, + "loss": 0.3389, + "step": 6819 + }, + { + "epoch": 1.715291750503018, + "grad_norm": 0.33196336030960083, + "learning_rate": 4.6206521201588894e-06, + "loss": 0.3398, + "step": 6820 + }, + { + "epoch": 1.715543259557344, + "grad_norm": 0.28593510389328003, + "learning_rate": 4.6191931062432526e-06, + "loss": 0.366, + "step": 6821 + }, + { + "epoch": 1.7157947686116701, + "grad_norm": 0.3059460520744324, + "learning_rate": 4.617734124941342e-06, + "loss": 0.3467, + "step": 6822 + }, + { + "epoch": 1.716046277665996, + "grad_norm": 0.3157053589820862, + "learning_rate": 4.6162751763781146e-06, + "loss": 0.309, + "step": 6823 + }, + { + "epoch": 1.716297786720322, + "grad_norm": 0.29486018419265747, + "learning_rate": 4.6148162606785144e-06, + "loss": 0.3461, + "step": 6824 + }, + { + "epoch": 1.716549295774648, + "grad_norm": 0.32476910948753357, + "learning_rate": 4.613357377967495e-06, + "loss": 0.3405, + "step": 6825 + }, + { + "epoch": 1.7168008048289738, + "grad_norm": 0.3237256705760956, + "learning_rate": 4.611898528369995e-06, + "loss": 0.3262, + "step": 6826 + }, + { + "epoch": 1.7170523138832998, + "grad_norm": 0.3211307227611542, + "learning_rate": 4.610439712010959e-06, + "loss": 0.3476, + "step": 6827 + }, + { + "epoch": 1.7173038229376258, + "grad_norm": 0.29768112301826477, + "learning_rate": 4.6089809290153245e-06, + "loss": 0.3629, + "step": 6828 + }, + { + "epoch": 1.7175553319919517, + "grad_norm": 0.2961174547672272, + "learning_rate": 4.607522179508027e-06, + "loss": 0.343, + "step": 6829 + }, + { + "epoch": 1.7178068410462777, + "grad_norm": 0.2991441488265991, + "learning_rate": 4.606063463614e-06, + "loss": 0.3571, + "step": 6830 + }, + { + "epoch": 1.7180583501006037, + "grad_norm": 0.2887977063655853, + "learning_rate": 4.604604781458173e-06, + "loss": 0.3516, + "step": 6831 + }, + { + "epoch": 1.7183098591549295, + "grad_norm": 0.31125563383102417, + "learning_rate": 4.6031461331654725e-06, + "loss": 0.3294, + "step": 6832 + }, + { + "epoch": 1.7185613682092555, + "grad_norm": 0.3178209960460663, + "learning_rate": 4.601687518860823e-06, + "loss": 0.3519, + "step": 6833 + }, + { + "epoch": 1.7188128772635816, + "grad_norm": 0.29017218947410583, + "learning_rate": 4.600228938669146e-06, + "loss": 0.3451, + "step": 6834 + }, + { + "epoch": 1.7190643863179074, + "grad_norm": 0.3055189549922943, + "learning_rate": 4.5987703927153575e-06, + "loss": 0.3346, + "step": 6835 + }, + { + "epoch": 1.7193158953722334, + "grad_norm": 0.3083680272102356, + "learning_rate": 4.597311881124378e-06, + "loss": 0.3386, + "step": 6836 + }, + { + "epoch": 1.7195674044265594, + "grad_norm": 0.32472705841064453, + "learning_rate": 4.595853404021114e-06, + "loss": 0.3384, + "step": 6837 + }, + { + "epoch": 1.7198189134808852, + "grad_norm": 0.3365086317062378, + "learning_rate": 4.594394961530479e-06, + "loss": 0.3496, + "step": 6838 + }, + { + "epoch": 1.7200704225352113, + "grad_norm": 0.33195602893829346, + "learning_rate": 4.592936553777378e-06, + "loss": 0.3715, + "step": 6839 + }, + { + "epoch": 1.7203219315895373, + "grad_norm": 0.3010450005531311, + "learning_rate": 4.591478180886714e-06, + "loss": 0.3476, + "step": 6840 + }, + { + "epoch": 1.720573440643863, + "grad_norm": 0.3121878206729889, + "learning_rate": 4.590019842983389e-06, + "loss": 0.3477, + "step": 6841 + }, + { + "epoch": 1.720824949698189, + "grad_norm": 0.3093717396259308, + "learning_rate": 4.588561540192299e-06, + "loss": 0.3335, + "step": 6842 + }, + { + "epoch": 1.7210764587525151, + "grad_norm": 0.3174624741077423, + "learning_rate": 4.587103272638339e-06, + "loss": 0.355, + "step": 6843 + }, + { + "epoch": 1.721327967806841, + "grad_norm": 0.3113724887371063, + "learning_rate": 4.585645040446401e-06, + "loss": 0.3357, + "step": 6844 + }, + { + "epoch": 1.721579476861167, + "grad_norm": 0.33507058024406433, + "learning_rate": 4.584186843741373e-06, + "loss": 0.3498, + "step": 6845 + }, + { + "epoch": 1.721830985915493, + "grad_norm": 0.34897923469543457, + "learning_rate": 4.58272868264814e-06, + "loss": 0.3539, + "step": 6846 + }, + { + "epoch": 1.7220824949698188, + "grad_norm": 0.3107760548591614, + "learning_rate": 4.581270557291586e-06, + "loss": 0.3618, + "step": 6847 + }, + { + "epoch": 1.7223340040241448, + "grad_norm": 0.3432888984680176, + "learning_rate": 4.579812467796588e-06, + "loss": 0.3521, + "step": 6848 + }, + { + "epoch": 1.7225855130784709, + "grad_norm": 0.35445329546928406, + "learning_rate": 4.5783544142880235e-06, + "loss": 0.3892, + "step": 6849 + }, + { + "epoch": 1.7228370221327967, + "grad_norm": 0.35715755820274353, + "learning_rate": 4.576896396890767e-06, + "loss": 0.3621, + "step": 6850 + }, + { + "epoch": 1.7230885311871227, + "grad_norm": 0.30892953276634216, + "learning_rate": 4.575438415729685e-06, + "loss": 0.3596, + "step": 6851 + }, + { + "epoch": 1.7233400402414487, + "grad_norm": 0.339541494846344, + "learning_rate": 4.573980470929649e-06, + "loss": 0.3262, + "step": 6852 + }, + { + "epoch": 1.7235915492957745, + "grad_norm": 0.33129116892814636, + "learning_rate": 4.572522562615519e-06, + "loss": 0.3431, + "step": 6853 + }, + { + "epoch": 1.7238430583501008, + "grad_norm": 0.3051982522010803, + "learning_rate": 4.5710646909121585e-06, + "loss": 0.3451, + "step": 6854 + }, + { + "epoch": 1.7240945674044266, + "grad_norm": 0.3270474076271057, + "learning_rate": 4.5696068559444225e-06, + "loss": 0.3626, + "step": 6855 + }, + { + "epoch": 1.7243460764587524, + "grad_norm": 0.3316960334777832, + "learning_rate": 4.568149057837168e-06, + "loss": 0.3443, + "step": 6856 + }, + { + "epoch": 1.7245975855130786, + "grad_norm": 0.32199814915657043, + "learning_rate": 4.5666912967152435e-06, + "loss": 0.3544, + "step": 6857 + }, + { + "epoch": 1.7248490945674044, + "grad_norm": 0.32749658823013306, + "learning_rate": 4.5652335727035e-06, + "loss": 0.3256, + "step": 6858 + }, + { + "epoch": 1.7251006036217302, + "grad_norm": 0.3357948362827301, + "learning_rate": 4.56377588592678e-06, + "loss": 0.3441, + "step": 6859 + }, + { + "epoch": 1.7253521126760565, + "grad_norm": 0.3046301603317261, + "learning_rate": 4.562318236509926e-06, + "loss": 0.3464, + "step": 6860 + }, + { + "epoch": 1.7256036217303823, + "grad_norm": 0.33477479219436646, + "learning_rate": 4.560860624577779e-06, + "loss": 0.3365, + "step": 6861 + }, + { + "epoch": 1.725855130784708, + "grad_norm": 0.3378863036632538, + "learning_rate": 4.559403050255169e-06, + "loss": 0.3526, + "step": 6862 + }, + { + "epoch": 1.7261066398390343, + "grad_norm": 0.3373473584651947, + "learning_rate": 4.557945513666935e-06, + "loss": 0.3376, + "step": 6863 + }, + { + "epoch": 1.7263581488933601, + "grad_norm": 0.33986005187034607, + "learning_rate": 4.5564880149378995e-06, + "loss": 0.3471, + "step": 6864 + }, + { + "epoch": 1.7266096579476862, + "grad_norm": 0.2948022782802582, + "learning_rate": 4.555030554192894e-06, + "loss": 0.3384, + "step": 6865 + }, + { + "epoch": 1.7268611670020122, + "grad_norm": 0.3061770796775818, + "learning_rate": 4.553573131556734e-06, + "loss": 0.3311, + "step": 6866 + }, + { + "epoch": 1.727112676056338, + "grad_norm": 0.32700714468955994, + "learning_rate": 4.552115747154247e-06, + "loss": 0.3456, + "step": 6867 + }, + { + "epoch": 1.727364185110664, + "grad_norm": 0.3140343725681305, + "learning_rate": 4.55065840111024e-06, + "loss": 0.3903, + "step": 6868 + }, + { + "epoch": 1.72761569416499, + "grad_norm": 0.32476603984832764, + "learning_rate": 4.549201093549533e-06, + "loss": 0.3226, + "step": 6869 + }, + { + "epoch": 1.7278672032193159, + "grad_norm": 0.32490959763526917, + "learning_rate": 4.547743824596929e-06, + "loss": 0.3408, + "step": 6870 + }, + { + "epoch": 1.7281187122736419, + "grad_norm": 0.3557870388031006, + "learning_rate": 4.546286594377238e-06, + "loss": 0.3438, + "step": 6871 + }, + { + "epoch": 1.728370221327968, + "grad_norm": 0.3313372731208801, + "learning_rate": 4.544829403015264e-06, + "loss": 0.3448, + "step": 6872 + }, + { + "epoch": 1.7286217303822937, + "grad_norm": 0.3483891189098358, + "learning_rate": 4.543372250635801e-06, + "loss": 0.3346, + "step": 6873 + }, + { + "epoch": 1.7288732394366197, + "grad_norm": 0.3317503035068512, + "learning_rate": 4.541915137363651e-06, + "loss": 0.3537, + "step": 6874 + }, + { + "epoch": 1.7291247484909458, + "grad_norm": 0.33802077174186707, + "learning_rate": 4.540458063323601e-06, + "loss": 0.3488, + "step": 6875 + }, + { + "epoch": 1.7293762575452716, + "grad_norm": 0.30749836564064026, + "learning_rate": 4.539001028640447e-06, + "loss": 0.35, + "step": 6876 + }, + { + "epoch": 1.7296277665995976, + "grad_norm": 0.3114013671875, + "learning_rate": 4.537544033438967e-06, + "loss": 0.3305, + "step": 6877 + }, + { + "epoch": 1.7298792756539236, + "grad_norm": 0.3545249104499817, + "learning_rate": 4.53608707784395e-06, + "loss": 0.347, + "step": 6878 + }, + { + "epoch": 1.7301307847082494, + "grad_norm": 0.3405088484287262, + "learning_rate": 4.534630161980171e-06, + "loss": 0.3498, + "step": 6879 + }, + { + "epoch": 1.7303822937625755, + "grad_norm": 0.3269940912723541, + "learning_rate": 4.533173285972408e-06, + "loss": 0.3648, + "step": 6880 + }, + { + "epoch": 1.7306338028169015, + "grad_norm": 0.30962666869163513, + "learning_rate": 4.531716449945431e-06, + "loss": 0.3474, + "step": 6881 + }, + { + "epoch": 1.7308853118712273, + "grad_norm": 0.3638971149921417, + "learning_rate": 4.530259654024011e-06, + "loss": 0.3424, + "step": 6882 + }, + { + "epoch": 1.7311368209255533, + "grad_norm": 0.38066476583480835, + "learning_rate": 4.528802898332914e-06, + "loss": 0.3332, + "step": 6883 + }, + { + "epoch": 1.7313883299798793, + "grad_norm": 0.30477452278137207, + "learning_rate": 4.5273461829969e-06, + "loss": 0.3625, + "step": 6884 + }, + { + "epoch": 1.7316398390342052, + "grad_norm": 0.3417944014072418, + "learning_rate": 4.525889508140731e-06, + "loss": 0.3661, + "step": 6885 + }, + { + "epoch": 1.7318913480885312, + "grad_norm": 0.340406209230423, + "learning_rate": 4.524432873889156e-06, + "loss": 0.3576, + "step": 6886 + }, + { + "epoch": 1.7321428571428572, + "grad_norm": 0.40321967005729675, + "learning_rate": 4.522976280366934e-06, + "loss": 0.346, + "step": 6887 + }, + { + "epoch": 1.732394366197183, + "grad_norm": 0.3876849412918091, + "learning_rate": 4.5215197276988055e-06, + "loss": 0.3598, + "step": 6888 + }, + { + "epoch": 1.732645875251509, + "grad_norm": 0.3189951479434967, + "learning_rate": 4.520063216009522e-06, + "loss": 0.3289, + "step": 6889 + }, + { + "epoch": 1.732897384305835, + "grad_norm": 0.3129585087299347, + "learning_rate": 4.518606745423819e-06, + "loss": 0.325, + "step": 6890 + }, + { + "epoch": 1.7331488933601609, + "grad_norm": 0.2990803122520447, + "learning_rate": 4.517150316066439e-06, + "loss": 0.3333, + "step": 6891 + }, + { + "epoch": 1.733400402414487, + "grad_norm": 0.31391507387161255, + "learning_rate": 4.515693928062112e-06, + "loss": 0.3527, + "step": 6892 + }, + { + "epoch": 1.733651911468813, + "grad_norm": 0.3259546756744385, + "learning_rate": 4.514237581535571e-06, + "loss": 0.3729, + "step": 6893 + }, + { + "epoch": 1.7339034205231387, + "grad_norm": 0.3225707411766052, + "learning_rate": 4.512781276611542e-06, + "loss": 0.3587, + "step": 6894 + }, + { + "epoch": 1.7341549295774648, + "grad_norm": 0.311168909072876, + "learning_rate": 4.511325013414749e-06, + "loss": 0.3533, + "step": 6895 + }, + { + "epoch": 1.7344064386317908, + "grad_norm": 0.30714693665504456, + "learning_rate": 4.509868792069912e-06, + "loss": 0.3373, + "step": 6896 + }, + { + "epoch": 1.7346579476861166, + "grad_norm": 0.31528759002685547, + "learning_rate": 4.508412612701746e-06, + "loss": 0.3632, + "step": 6897 + }, + { + "epoch": 1.7349094567404426, + "grad_norm": 0.29422181844711304, + "learning_rate": 4.506956475434964e-06, + "loss": 0.3536, + "step": 6898 + }, + { + "epoch": 1.7351609657947686, + "grad_norm": 0.2980017364025116, + "learning_rate": 4.505500380394276e-06, + "loss": 0.3205, + "step": 6899 + }, + { + "epoch": 1.7354124748490944, + "grad_norm": 0.32730206847190857, + "learning_rate": 4.504044327704387e-06, + "loss": 0.3645, + "step": 6900 + }, + { + "epoch": 1.7356639839034205, + "grad_norm": 0.3445815145969391, + "learning_rate": 4.502588317489997e-06, + "loss": 0.382, + "step": 6901 + }, + { + "epoch": 1.7359154929577465, + "grad_norm": 0.3304295241832733, + "learning_rate": 4.501132349875808e-06, + "loss": 0.3683, + "step": 6902 + }, + { + "epoch": 1.7361670020120723, + "grad_norm": 0.30265411734580994, + "learning_rate": 4.499676424986512e-06, + "loss": 0.3542, + "step": 6903 + }, + { + "epoch": 1.7364185110663986, + "grad_norm": 0.3029594421386719, + "learning_rate": 4.498220542946798e-06, + "loss": 0.3597, + "step": 6904 + }, + { + "epoch": 1.7366700201207244, + "grad_norm": 0.327571302652359, + "learning_rate": 4.4967647038813575e-06, + "loss": 0.353, + "step": 6905 + }, + { + "epoch": 1.7369215291750502, + "grad_norm": 0.2830412685871124, + "learning_rate": 4.495308907914871e-06, + "loss": 0.3255, + "step": 6906 + }, + { + "epoch": 1.7371730382293764, + "grad_norm": 0.32870203256607056, + "learning_rate": 4.49385315517202e-06, + "loss": 0.3445, + "step": 6907 + }, + { + "epoch": 1.7374245472837022, + "grad_norm": 0.34851449728012085, + "learning_rate": 4.492397445777479e-06, + "loss": 0.3283, + "step": 6908 + }, + { + "epoch": 1.737676056338028, + "grad_norm": 0.3187304735183716, + "learning_rate": 4.490941779855922e-06, + "loss": 0.3495, + "step": 6909 + }, + { + "epoch": 1.7379275653923543, + "grad_norm": 0.31370389461517334, + "learning_rate": 4.489486157532016e-06, + "loss": 0.3392, + "step": 6910 + }, + { + "epoch": 1.73817907444668, + "grad_norm": 0.3236584961414337, + "learning_rate": 4.488030578930428e-06, + "loss": 0.3764, + "step": 6911 + }, + { + "epoch": 1.7384305835010059, + "grad_norm": 0.3015926778316498, + "learning_rate": 4.486575044175817e-06, + "loss": 0.348, + "step": 6912 + }, + { + "epoch": 1.7386820925553321, + "grad_norm": 0.33164599537849426, + "learning_rate": 4.485119553392843e-06, + "loss": 0.3407, + "step": 6913 + }, + { + "epoch": 1.738933601609658, + "grad_norm": 0.3237990736961365, + "learning_rate": 4.483664106706155e-06, + "loss": 0.3614, + "step": 6914 + }, + { + "epoch": 1.739185110663984, + "grad_norm": 0.30540236830711365, + "learning_rate": 4.482208704240408e-06, + "loss": 0.3377, + "step": 6915 + }, + { + "epoch": 1.73943661971831, + "grad_norm": 0.32697343826293945, + "learning_rate": 4.480753346120247e-06, + "loss": 0.336, + "step": 6916 + }, + { + "epoch": 1.7396881287726358, + "grad_norm": 0.32882174849510193, + "learning_rate": 4.479298032470312e-06, + "loss": 0.3385, + "step": 6917 + }, + { + "epoch": 1.7399396378269618, + "grad_norm": 0.315578430891037, + "learning_rate": 4.477842763415244e-06, + "loss": 0.3481, + "step": 6918 + }, + { + "epoch": 1.7401911468812878, + "grad_norm": 0.2991701066493988, + "learning_rate": 4.476387539079676e-06, + "loss": 0.321, + "step": 6919 + }, + { + "epoch": 1.7404426559356136, + "grad_norm": 0.3147648274898529, + "learning_rate": 4.474932359588241e-06, + "loss": 0.3375, + "step": 6920 + }, + { + "epoch": 1.7406941649899397, + "grad_norm": 0.34548136591911316, + "learning_rate": 4.473477225065563e-06, + "loss": 0.3568, + "step": 6921 + }, + { + "epoch": 1.7409456740442657, + "grad_norm": 0.3195883631706238, + "learning_rate": 4.472022135636268e-06, + "loss": 0.3509, + "step": 6922 + }, + { + "epoch": 1.7411971830985915, + "grad_norm": 0.2943090498447418, + "learning_rate": 4.470567091424973e-06, + "loss": 0.3495, + "step": 6923 + }, + { + "epoch": 1.7414486921529175, + "grad_norm": 0.32778680324554443, + "learning_rate": 4.469112092556296e-06, + "loss": 0.3478, + "step": 6924 + }, + { + "epoch": 1.7417002012072436, + "grad_norm": 0.2987135052680969, + "learning_rate": 4.467657139154845e-06, + "loss": 0.3446, + "step": 6925 + }, + { + "epoch": 1.7419517102615694, + "grad_norm": 0.3135947287082672, + "learning_rate": 4.466202231345229e-06, + "loss": 0.3638, + "step": 6926 + }, + { + "epoch": 1.7422032193158954, + "grad_norm": 0.29023781418800354, + "learning_rate": 4.464747369252056e-06, + "loss": 0.3696, + "step": 6927 + }, + { + "epoch": 1.7424547283702214, + "grad_norm": 0.31927579641342163, + "learning_rate": 4.463292552999919e-06, + "loss": 0.3555, + "step": 6928 + }, + { + "epoch": 1.7427062374245472, + "grad_norm": 0.36215946078300476, + "learning_rate": 4.4618377827134205e-06, + "loss": 0.3435, + "step": 6929 + }, + { + "epoch": 1.7429577464788732, + "grad_norm": 0.30876463651657104, + "learning_rate": 4.460383058517146e-06, + "loss": 0.3416, + "step": 6930 + }, + { + "epoch": 1.7432092555331993, + "grad_norm": 0.2847077250480652, + "learning_rate": 4.458928380535689e-06, + "loss": 0.3375, + "step": 6931 + }, + { + "epoch": 1.743460764587525, + "grad_norm": 0.30569058656692505, + "learning_rate": 4.45747374889363e-06, + "loss": 0.3246, + "step": 6932 + }, + { + "epoch": 1.743712273641851, + "grad_norm": 0.2917396128177643, + "learning_rate": 4.456019163715552e-06, + "loss": 0.3466, + "step": 6933 + }, + { + "epoch": 1.7439637826961771, + "grad_norm": 0.29794490337371826, + "learning_rate": 4.454564625126026e-06, + "loss": 0.3593, + "step": 6934 + }, + { + "epoch": 1.744215291750503, + "grad_norm": 0.31993839144706726, + "learning_rate": 4.45311013324963e-06, + "loss": 0.3431, + "step": 6935 + }, + { + "epoch": 1.744466800804829, + "grad_norm": 0.2840554714202881, + "learning_rate": 4.45165568821093e-06, + "loss": 0.3485, + "step": 6936 + }, + { + "epoch": 1.744718309859155, + "grad_norm": 0.32249221205711365, + "learning_rate": 4.450201290134489e-06, + "loss": 0.3301, + "step": 6937 + }, + { + "epoch": 1.7449698189134808, + "grad_norm": 0.290263831615448, + "learning_rate": 4.448746939144869e-06, + "loss": 0.3413, + "step": 6938 + }, + { + "epoch": 1.7452213279678068, + "grad_norm": 0.30761563777923584, + "learning_rate": 4.447292635366623e-06, + "loss": 0.3572, + "step": 6939 + }, + { + "epoch": 1.7454728370221329, + "grad_norm": 0.3331648111343384, + "learning_rate": 4.4458383789243086e-06, + "loss": 0.3372, + "step": 6940 + }, + { + "epoch": 1.7457243460764587, + "grad_norm": 0.3228667080402374, + "learning_rate": 4.444384169942466e-06, + "loss": 0.3245, + "step": 6941 + }, + { + "epoch": 1.7459758551307847, + "grad_norm": 0.2900792360305786, + "learning_rate": 4.4429300085456475e-06, + "loss": 0.3437, + "step": 6942 + }, + { + "epoch": 1.7462273641851107, + "grad_norm": 0.31676149368286133, + "learning_rate": 4.4414758948583855e-06, + "loss": 0.3397, + "step": 6943 + }, + { + "epoch": 1.7464788732394365, + "grad_norm": 0.32281917333602905, + "learning_rate": 4.440021829005221e-06, + "loss": 0.3333, + "step": 6944 + }, + { + "epoch": 1.7467303822937625, + "grad_norm": 0.3074769377708435, + "learning_rate": 4.438567811110682e-06, + "loss": 0.3588, + "step": 6945 + }, + { + "epoch": 1.7469818913480886, + "grad_norm": 0.31130295991897583, + "learning_rate": 4.437113841299297e-06, + "loss": 0.3617, + "step": 6946 + }, + { + "epoch": 1.7472334004024144, + "grad_norm": 0.3043542802333832, + "learning_rate": 4.435659919695593e-06, + "loss": 0.3479, + "step": 6947 + }, + { + "epoch": 1.7474849094567404, + "grad_norm": 0.37658563256263733, + "learning_rate": 4.434206046424085e-06, + "loss": 0.3292, + "step": 6948 + }, + { + "epoch": 1.7477364185110664, + "grad_norm": 0.3038911521434784, + "learning_rate": 4.43275222160929e-06, + "loss": 0.3645, + "step": 6949 + }, + { + "epoch": 1.7479879275653922, + "grad_norm": 0.3258115351200104, + "learning_rate": 4.431298445375717e-06, + "loss": 0.3271, + "step": 6950 + }, + { + "epoch": 1.7482394366197183, + "grad_norm": 0.3135131001472473, + "learning_rate": 4.429844717847876e-06, + "loss": 0.349, + "step": 6951 + }, + { + "epoch": 1.7484909456740443, + "grad_norm": 0.32563337683677673, + "learning_rate": 4.428391039150266e-06, + "loss": 0.3503, + "step": 6952 + }, + { + "epoch": 1.74874245472837, + "grad_norm": 0.3328086733818054, + "learning_rate": 4.426937409407391e-06, + "loss": 0.3368, + "step": 6953 + }, + { + "epoch": 1.7489939637826963, + "grad_norm": 0.3279817998409271, + "learning_rate": 4.4254838287437386e-06, + "loss": 0.3378, + "step": 6954 + }, + { + "epoch": 1.7492454728370221, + "grad_norm": 0.30334845185279846, + "learning_rate": 4.424030297283805e-06, + "loss": 0.3261, + "step": 6955 + }, + { + "epoch": 1.749496981891348, + "grad_norm": 0.32851114869117737, + "learning_rate": 4.42257681515207e-06, + "loss": 0.3525, + "step": 6956 + }, + { + "epoch": 1.7497484909456742, + "grad_norm": 0.32159623503685, + "learning_rate": 4.42112338247302e-06, + "loss": 0.3465, + "step": 6957 + }, + { + "epoch": 1.75, + "grad_norm": 0.3375226855278015, + "learning_rate": 4.41966999937113e-06, + "loss": 0.3525, + "step": 6958 + }, + { + "epoch": 1.7502515090543258, + "grad_norm": 0.34559088945388794, + "learning_rate": 4.418216665970875e-06, + "loss": 0.3692, + "step": 6959 + }, + { + "epoch": 1.750503018108652, + "grad_norm": 0.2969364523887634, + "learning_rate": 4.416763382396723e-06, + "loss": 0.3436, + "step": 6960 + }, + { + "epoch": 1.7507545271629779, + "grad_norm": 0.3050820529460907, + "learning_rate": 4.4153101487731385e-06, + "loss": 0.3235, + "step": 6961 + }, + { + "epoch": 1.7510060362173037, + "grad_norm": 0.32389354705810547, + "learning_rate": 4.413856965224581e-06, + "loss": 0.3538, + "step": 6962 + }, + { + "epoch": 1.75125754527163, + "grad_norm": 0.36416080594062805, + "learning_rate": 4.412403831875509e-06, + "loss": 0.3653, + "step": 6963 + }, + { + "epoch": 1.7515090543259557, + "grad_norm": 0.31853196024894714, + "learning_rate": 4.410950748850372e-06, + "loss": 0.3283, + "step": 6964 + }, + { + "epoch": 1.7517605633802817, + "grad_norm": 0.29971179366111755, + "learning_rate": 4.409497716273618e-06, + "loss": 0.3397, + "step": 6965 + }, + { + "epoch": 1.7520120724346078, + "grad_norm": 0.3168172836303711, + "learning_rate": 4.408044734269692e-06, + "loss": 0.3361, + "step": 6966 + }, + { + "epoch": 1.7522635814889336, + "grad_norm": 0.3141171336174011, + "learning_rate": 4.40659180296303e-06, + "loss": 0.3341, + "step": 6967 + }, + { + "epoch": 1.7525150905432596, + "grad_norm": 0.3332131803035736, + "learning_rate": 4.405138922478066e-06, + "loss": 0.327, + "step": 6968 + }, + { + "epoch": 1.7527665995975856, + "grad_norm": 0.3187667727470398, + "learning_rate": 4.403686092939235e-06, + "loss": 0.3512, + "step": 6969 + }, + { + "epoch": 1.7530181086519114, + "grad_norm": 0.3000306487083435, + "learning_rate": 4.4022333144709566e-06, + "loss": 0.363, + "step": 6970 + }, + { + "epoch": 1.7532696177062375, + "grad_norm": 0.32261931896209717, + "learning_rate": 4.400780587197658e-06, + "loss": 0.3832, + "step": 6971 + }, + { + "epoch": 1.7535211267605635, + "grad_norm": 0.31589117646217346, + "learning_rate": 4.399327911243751e-06, + "loss": 0.3646, + "step": 6972 + }, + { + "epoch": 1.7537726358148893, + "grad_norm": 0.3187357187271118, + "learning_rate": 4.3978752867336536e-06, + "loss": 0.3653, + "step": 6973 + }, + { + "epoch": 1.7540241448692153, + "grad_norm": 0.3000258505344391, + "learning_rate": 4.396422713791768e-06, + "loss": 0.3428, + "step": 6974 + }, + { + "epoch": 1.7542756539235413, + "grad_norm": 0.30427271127700806, + "learning_rate": 4.394970192542504e-06, + "loss": 0.3423, + "step": 6975 + }, + { + "epoch": 1.7545271629778671, + "grad_norm": 0.3039720952510834, + "learning_rate": 4.3935177231102544e-06, + "loss": 0.3351, + "step": 6976 + }, + { + "epoch": 1.7547786720321932, + "grad_norm": 0.29543939232826233, + "learning_rate": 4.3920653056194205e-06, + "loss": 0.3216, + "step": 6977 + }, + { + "epoch": 1.7550301810865192, + "grad_norm": 0.3125627934932709, + "learning_rate": 4.390612940194388e-06, + "loss": 0.3434, + "step": 6978 + }, + { + "epoch": 1.755281690140845, + "grad_norm": 0.2896956205368042, + "learning_rate": 4.389160626959545e-06, + "loss": 0.3435, + "step": 6979 + }, + { + "epoch": 1.755533199195171, + "grad_norm": 0.3043994903564453, + "learning_rate": 4.387708366039275e-06, + "loss": 0.3524, + "step": 6980 + }, + { + "epoch": 1.755784708249497, + "grad_norm": 0.3003641664981842, + "learning_rate": 4.38625615755795e-06, + "loss": 0.321, + "step": 6981 + }, + { + "epoch": 1.7560362173038229, + "grad_norm": 0.33770516514778137, + "learning_rate": 4.384804001639948e-06, + "loss": 0.3518, + "step": 6982 + }, + { + "epoch": 1.756287726358149, + "grad_norm": 0.3379949927330017, + "learning_rate": 4.383351898409634e-06, + "loss": 0.34, + "step": 6983 + }, + { + "epoch": 1.756539235412475, + "grad_norm": 0.33452707529067993, + "learning_rate": 4.381899847991372e-06, + "loss": 0.385, + "step": 6984 + }, + { + "epoch": 1.7567907444668007, + "grad_norm": 0.30171477794647217, + "learning_rate": 4.38044785050952e-06, + "loss": 0.3481, + "step": 6985 + }, + { + "epoch": 1.7570422535211268, + "grad_norm": 0.2875131666660309, + "learning_rate": 4.378995906088436e-06, + "loss": 0.3327, + "step": 6986 + }, + { + "epoch": 1.7572937625754528, + "grad_norm": 0.2994171977043152, + "learning_rate": 4.377544014852466e-06, + "loss": 0.3355, + "step": 6987 + }, + { + "epoch": 1.7575452716297786, + "grad_norm": 0.3051261007785797, + "learning_rate": 4.3760921769259585e-06, + "loss": 0.3314, + "step": 6988 + }, + { + "epoch": 1.7577967806841046, + "grad_norm": 0.29593223333358765, + "learning_rate": 4.374640392433251e-06, + "loss": 0.3445, + "step": 6989 + }, + { + "epoch": 1.7580482897384306, + "grad_norm": 0.3013860285282135, + "learning_rate": 4.3731886614986815e-06, + "loss": 0.3661, + "step": 6990 + }, + { + "epoch": 1.7582997987927564, + "grad_norm": 0.2953547537326813, + "learning_rate": 4.371736984246584e-06, + "loss": 0.3256, + "step": 6991 + }, + { + "epoch": 1.7585513078470825, + "grad_norm": 0.3246706426143646, + "learning_rate": 4.370285360801281e-06, + "loss": 0.3298, + "step": 6992 + }, + { + "epoch": 1.7588028169014085, + "grad_norm": 0.3016349673271179, + "learning_rate": 4.3688337912871e-06, + "loss": 0.3447, + "step": 6993 + }, + { + "epoch": 1.7590543259557343, + "grad_norm": 0.295219361782074, + "learning_rate": 4.367382275828353e-06, + "loss": 0.3534, + "step": 6994 + }, + { + "epoch": 1.7593058350100603, + "grad_norm": 0.3270989954471588, + "learning_rate": 4.36593081454936e-06, + "loss": 0.3554, + "step": 6995 + }, + { + "epoch": 1.7595573440643864, + "grad_norm": 0.31494227051734924, + "learning_rate": 4.364479407574424e-06, + "loss": 0.3628, + "step": 6996 + }, + { + "epoch": 1.7598088531187122, + "grad_norm": 0.3002457022666931, + "learning_rate": 4.363028055027852e-06, + "loss": 0.3426, + "step": 6997 + }, + { + "epoch": 1.7600603621730382, + "grad_norm": 0.3095671236515045, + "learning_rate": 4.36157675703394e-06, + "loss": 0.3341, + "step": 6998 + }, + { + "epoch": 1.7603118712273642, + "grad_norm": 0.2987361252307892, + "learning_rate": 4.360125513716988e-06, + "loss": 0.3406, + "step": 6999 + }, + { + "epoch": 1.76056338028169, + "grad_norm": 0.30728045105934143, + "learning_rate": 4.35867432520128e-06, + "loss": 0.3648, + "step": 7000 + }, + { + "epoch": 1.760814889336016, + "grad_norm": 0.279835969209671, + "learning_rate": 4.357223191611103e-06, + "loss": 0.3338, + "step": 7001 + }, + { + "epoch": 1.761066398390342, + "grad_norm": 0.32082340121269226, + "learning_rate": 4.355772113070742e-06, + "loss": 0.364, + "step": 7002 + }, + { + "epoch": 1.7613179074446679, + "grad_norm": 0.2994687259197235, + "learning_rate": 4.354321089704466e-06, + "loss": 0.3723, + "step": 7003 + }, + { + "epoch": 1.7615694164989941, + "grad_norm": 0.3012014329433441, + "learning_rate": 4.352870121636553e-06, + "loss": 0.3351, + "step": 7004 + }, + { + "epoch": 1.76182092555332, + "grad_norm": 0.288789838552475, + "learning_rate": 4.351419208991262e-06, + "loss": 0.3324, + "step": 7005 + }, + { + "epoch": 1.7620724346076457, + "grad_norm": 0.2942191958427429, + "learning_rate": 4.349968351892861e-06, + "loss": 0.3425, + "step": 7006 + }, + { + "epoch": 1.762323943661972, + "grad_norm": 0.29075539112091064, + "learning_rate": 4.348517550465602e-06, + "loss": 0.3457, + "step": 7007 + }, + { + "epoch": 1.7625754527162978, + "grad_norm": 0.31270796060562134, + "learning_rate": 4.34706680483374e-06, + "loss": 0.3489, + "step": 7008 + }, + { + "epoch": 1.7628269617706236, + "grad_norm": 0.2807566821575165, + "learning_rate": 4.345616115121521e-06, + "loss": 0.3408, + "step": 7009 + }, + { + "epoch": 1.7630784708249498, + "grad_norm": 0.3200899064540863, + "learning_rate": 4.34416548145319e-06, + "loss": 0.3233, + "step": 7010 + }, + { + "epoch": 1.7633299798792756, + "grad_norm": 0.2903282642364502, + "learning_rate": 4.342714903952979e-06, + "loss": 0.3452, + "step": 7011 + }, + { + "epoch": 1.7635814889336014, + "grad_norm": 0.3277839720249176, + "learning_rate": 4.341264382745127e-06, + "loss": 0.3368, + "step": 7012 + }, + { + "epoch": 1.7638329979879277, + "grad_norm": 0.28098973631858826, + "learning_rate": 4.339813917953859e-06, + "loss": 0.3404, + "step": 7013 + }, + { + "epoch": 1.7640845070422535, + "grad_norm": 0.3015226423740387, + "learning_rate": 4.338363509703399e-06, + "loss": 0.3395, + "step": 7014 + }, + { + "epoch": 1.7643360160965795, + "grad_norm": 0.33274605870246887, + "learning_rate": 4.336913158117965e-06, + "loss": 0.3596, + "step": 7015 + }, + { + "epoch": 1.7645875251509056, + "grad_norm": 0.31674689054489136, + "learning_rate": 4.33546286332177e-06, + "loss": 0.3396, + "step": 7016 + }, + { + "epoch": 1.7648390342052314, + "grad_norm": 0.31130534410476685, + "learning_rate": 4.3340126254390255e-06, + "loss": 0.3906, + "step": 7017 + }, + { + "epoch": 1.7650905432595574, + "grad_norm": 0.30620113015174866, + "learning_rate": 4.3325624445939306e-06, + "loss": 0.3371, + "step": 7018 + }, + { + "epoch": 1.7653420523138834, + "grad_norm": 0.33001548051834106, + "learning_rate": 4.33111232091069e-06, + "loss": 0.3542, + "step": 7019 + }, + { + "epoch": 1.7655935613682092, + "grad_norm": 0.3208904266357422, + "learning_rate": 4.329662254513492e-06, + "loss": 0.3387, + "step": 7020 + }, + { + "epoch": 1.7658450704225352, + "grad_norm": 0.32378435134887695, + "learning_rate": 4.32821224552653e-06, + "loss": 0.3505, + "step": 7021 + }, + { + "epoch": 1.7660965794768613, + "grad_norm": 0.3116562068462372, + "learning_rate": 4.326762294073984e-06, + "loss": 0.3457, + "step": 7022 + }, + { + "epoch": 1.766348088531187, + "grad_norm": 0.3385581374168396, + "learning_rate": 4.3253124002800376e-06, + "loss": 0.3637, + "step": 7023 + }, + { + "epoch": 1.766599597585513, + "grad_norm": 0.29773086309432983, + "learning_rate": 4.323862564268862e-06, + "loss": 0.334, + "step": 7024 + }, + { + "epoch": 1.7668511066398391, + "grad_norm": 0.3357481062412262, + "learning_rate": 4.32241278616463e-06, + "loss": 0.3617, + "step": 7025 + }, + { + "epoch": 1.767102615694165, + "grad_norm": 0.32160642743110657, + "learning_rate": 4.320963066091503e-06, + "loss": 0.3617, + "step": 7026 + }, + { + "epoch": 1.767354124748491, + "grad_norm": 0.29699718952178955, + "learning_rate": 4.319513404173641e-06, + "loss": 0.3469, + "step": 7027 + }, + { + "epoch": 1.767605633802817, + "grad_norm": 0.31124669313430786, + "learning_rate": 4.318063800535199e-06, + "loss": 0.3523, + "step": 7028 + }, + { + "epoch": 1.7678571428571428, + "grad_norm": 0.3128409683704376, + "learning_rate": 4.316614255300326e-06, + "loss": 0.3102, + "step": 7029 + }, + { + "epoch": 1.7681086519114688, + "grad_norm": 0.2870144844055176, + "learning_rate": 4.315164768593167e-06, + "loss": 0.3445, + "step": 7030 + }, + { + "epoch": 1.7683601609657948, + "grad_norm": 0.3071323037147522, + "learning_rate": 4.313715340537861e-06, + "loss": 0.3552, + "step": 7031 + }, + { + "epoch": 1.7686116700201207, + "grad_norm": 0.33453187346458435, + "learning_rate": 4.312265971258544e-06, + "loss": 0.3461, + "step": 7032 + }, + { + "epoch": 1.7688631790744467, + "grad_norm": 0.3303498923778534, + "learning_rate": 4.310816660879342e-06, + "loss": 0.3508, + "step": 7033 + }, + { + "epoch": 1.7691146881287727, + "grad_norm": 0.30803173780441284, + "learning_rate": 4.3093674095243825e-06, + "loss": 0.3221, + "step": 7034 + }, + { + "epoch": 1.7693661971830985, + "grad_norm": 0.3237571716308594, + "learning_rate": 4.307918217317785e-06, + "loss": 0.3409, + "step": 7035 + }, + { + "epoch": 1.7696177062374245, + "grad_norm": 0.3842838704586029, + "learning_rate": 4.30646908438366e-06, + "loss": 0.3455, + "step": 7036 + }, + { + "epoch": 1.7698692152917506, + "grad_norm": 0.3077596426010132, + "learning_rate": 4.305020010846121e-06, + "loss": 0.3569, + "step": 7037 + }, + { + "epoch": 1.7701207243460764, + "grad_norm": 0.32035428285598755, + "learning_rate": 4.303570996829269e-06, + "loss": 0.3468, + "step": 7038 + }, + { + "epoch": 1.7703722334004024, + "grad_norm": 0.3082960546016693, + "learning_rate": 4.302122042457206e-06, + "loss": 0.3595, + "step": 7039 + }, + { + "epoch": 1.7706237424547284, + "grad_norm": 0.32630711793899536, + "learning_rate": 4.300673147854023e-06, + "loss": 0.323, + "step": 7040 + }, + { + "epoch": 1.7708752515090542, + "grad_norm": 0.3460298180580139, + "learning_rate": 4.299224313143811e-06, + "loss": 0.3253, + "step": 7041 + }, + { + "epoch": 1.7711267605633803, + "grad_norm": 0.30822697281837463, + "learning_rate": 4.297775538450651e-06, + "loss": 0.3232, + "step": 7042 + }, + { + "epoch": 1.7713782696177063, + "grad_norm": 0.3184284567832947, + "learning_rate": 4.296326823898625e-06, + "loss": 0.3431, + "step": 7043 + }, + { + "epoch": 1.771629778672032, + "grad_norm": 0.3198661804199219, + "learning_rate": 4.294878169611802e-06, + "loss": 0.3349, + "step": 7044 + }, + { + "epoch": 1.7718812877263581, + "grad_norm": 0.3085322976112366, + "learning_rate": 4.2934295757142526e-06, + "loss": 0.3351, + "step": 7045 + }, + { + "epoch": 1.7721327967806841, + "grad_norm": 0.31835395097732544, + "learning_rate": 4.291981042330042e-06, + "loss": 0.3702, + "step": 7046 + }, + { + "epoch": 1.77238430583501, + "grad_norm": 0.3139986991882324, + "learning_rate": 4.290532569583223e-06, + "loss": 0.33, + "step": 7047 + }, + { + "epoch": 1.772635814889336, + "grad_norm": 0.30537134408950806, + "learning_rate": 4.289084157597854e-06, + "loss": 0.3513, + "step": 7048 + }, + { + "epoch": 1.772887323943662, + "grad_norm": 0.2940613627433777, + "learning_rate": 4.287635806497977e-06, + "loss": 0.3523, + "step": 7049 + }, + { + "epoch": 1.7731388329979878, + "grad_norm": 0.3057369887828827, + "learning_rate": 4.2861875164076394e-06, + "loss": 0.3706, + "step": 7050 + }, + { + "epoch": 1.7733903420523138, + "grad_norm": 0.328139066696167, + "learning_rate": 4.284739287450873e-06, + "loss": 0.3603, + "step": 7051 + }, + { + "epoch": 1.7736418511066399, + "grad_norm": 0.32239678502082825, + "learning_rate": 4.283291119751714e-06, + "loss": 0.3415, + "step": 7052 + }, + { + "epoch": 1.7738933601609657, + "grad_norm": 0.3281613290309906, + "learning_rate": 4.2818430134341835e-06, + "loss": 0.3683, + "step": 7053 + }, + { + "epoch": 1.774144869215292, + "grad_norm": 0.29835787415504456, + "learning_rate": 4.280394968622309e-06, + "loss": 0.3449, + "step": 7054 + }, + { + "epoch": 1.7743963782696177, + "grad_norm": 0.3036617040634155, + "learning_rate": 4.2789469854401025e-06, + "loss": 0.3389, + "step": 7055 + }, + { + "epoch": 1.7746478873239435, + "grad_norm": 0.2952130138874054, + "learning_rate": 4.277499064011575e-06, + "loss": 0.3251, + "step": 7056 + }, + { + "epoch": 1.7748993963782698, + "grad_norm": 0.3045006990432739, + "learning_rate": 4.276051204460735e-06, + "loss": 0.3491, + "step": 7057 + }, + { + "epoch": 1.7751509054325956, + "grad_norm": 0.304092675447464, + "learning_rate": 4.274603406911578e-06, + "loss": 0.3373, + "step": 7058 + }, + { + "epoch": 1.7754024144869214, + "grad_norm": 0.3368243873119354, + "learning_rate": 4.273155671488103e-06, + "loss": 0.3633, + "step": 7059 + }, + { + "epoch": 1.7756539235412476, + "grad_norm": 0.3125411868095398, + "learning_rate": 4.271707998314296e-06, + "loss": 0.3279, + "step": 7060 + }, + { + "epoch": 1.7759054325955734, + "grad_norm": 0.31142720580101013, + "learning_rate": 4.270260387514145e-06, + "loss": 0.3408, + "step": 7061 + }, + { + "epoch": 1.7761569416498992, + "grad_norm": 0.3212254047393799, + "learning_rate": 4.268812839211624e-06, + "loss": 0.3604, + "step": 7062 + }, + { + "epoch": 1.7764084507042255, + "grad_norm": 0.3029959797859192, + "learning_rate": 4.267365353530711e-06, + "loss": 0.3625, + "step": 7063 + }, + { + "epoch": 1.7766599597585513, + "grad_norm": 0.30155423283576965, + "learning_rate": 4.265917930595371e-06, + "loss": 0.3484, + "step": 7064 + }, + { + "epoch": 1.7769114688128773, + "grad_norm": 0.3170860707759857, + "learning_rate": 4.264470570529569e-06, + "loss": 0.3534, + "step": 7065 + }, + { + "epoch": 1.7771629778672033, + "grad_norm": 0.3197275698184967, + "learning_rate": 4.2630232734572594e-06, + "loss": 0.3507, + "step": 7066 + }, + { + "epoch": 1.7774144869215291, + "grad_norm": 0.3387235999107361, + "learning_rate": 4.2615760395023956e-06, + "loss": 0.3531, + "step": 7067 + }, + { + "epoch": 1.7776659959758552, + "grad_norm": 0.30116546154022217, + "learning_rate": 4.260128868788927e-06, + "loss": 0.3455, + "step": 7068 + }, + { + "epoch": 1.7779175050301812, + "grad_norm": 0.32368162274360657, + "learning_rate": 4.25868176144079e-06, + "loss": 0.3489, + "step": 7069 + }, + { + "epoch": 1.778169014084507, + "grad_norm": 0.29791736602783203, + "learning_rate": 4.2572347175819245e-06, + "loss": 0.3504, + "step": 7070 + }, + { + "epoch": 1.778420523138833, + "grad_norm": 0.33331525325775146, + "learning_rate": 4.255787737336257e-06, + "loss": 0.3276, + "step": 7071 + }, + { + "epoch": 1.778672032193159, + "grad_norm": 0.28021112084388733, + "learning_rate": 4.254340820827715e-06, + "loss": 0.3205, + "step": 7072 + }, + { + "epoch": 1.7789235412474849, + "grad_norm": 0.29119399189949036, + "learning_rate": 4.252893968180215e-06, + "loss": 0.3536, + "step": 7073 + }, + { + "epoch": 1.779175050301811, + "grad_norm": 0.29139408469200134, + "learning_rate": 4.251447179517676e-06, + "loss": 0.3535, + "step": 7074 + }, + { + "epoch": 1.779426559356137, + "grad_norm": 0.3197605311870575, + "learning_rate": 4.250000454964001e-06, + "loss": 0.336, + "step": 7075 + }, + { + "epoch": 1.7796780684104627, + "grad_norm": 0.3005029261112213, + "learning_rate": 4.248553794643096e-06, + "loss": 0.3385, + "step": 7076 + }, + { + "epoch": 1.7799295774647887, + "grad_norm": 0.31606951355934143, + "learning_rate": 4.247107198678856e-06, + "loss": 0.3525, + "step": 7077 + }, + { + "epoch": 1.7801810865191148, + "grad_norm": 0.3125983476638794, + "learning_rate": 4.245660667195175e-06, + "loss": 0.3413, + "step": 7078 + }, + { + "epoch": 1.7804325955734406, + "grad_norm": 0.3108173906803131, + "learning_rate": 4.244214200315939e-06, + "loss": 0.3516, + "step": 7079 + }, + { + "epoch": 1.7806841046277666, + "grad_norm": 0.31591638922691345, + "learning_rate": 4.242767798165028e-06, + "loss": 0.3521, + "step": 7080 + }, + { + "epoch": 1.7809356136820926, + "grad_norm": 0.30157795548439026, + "learning_rate": 4.241321460866319e-06, + "loss": 0.3294, + "step": 7081 + }, + { + "epoch": 1.7811871227364184, + "grad_norm": 0.3349739611148834, + "learning_rate": 4.23987518854368e-06, + "loss": 0.3625, + "step": 7082 + }, + { + "epoch": 1.7814386317907445, + "grad_norm": 0.3226284086704254, + "learning_rate": 4.2384289813209754e-06, + "loss": 0.3247, + "step": 7083 + }, + { + "epoch": 1.7816901408450705, + "grad_norm": 0.3141878545284271, + "learning_rate": 4.236982839322064e-06, + "loss": 0.3171, + "step": 7084 + }, + { + "epoch": 1.7819416498993963, + "grad_norm": 0.3055501878261566, + "learning_rate": 4.235536762670801e-06, + "loss": 0.3376, + "step": 7085 + }, + { + "epoch": 1.7821931589537223, + "grad_norm": 0.31992486119270325, + "learning_rate": 4.23409075149103e-06, + "loss": 0.3444, + "step": 7086 + }, + { + "epoch": 1.7824446680080483, + "grad_norm": 0.3101184070110321, + "learning_rate": 4.2326448059065935e-06, + "loss": 0.3499, + "step": 7087 + }, + { + "epoch": 1.7826961770623742, + "grad_norm": 0.33392560482025146, + "learning_rate": 4.231198926041332e-06, + "loss": 0.3843, + "step": 7088 + }, + { + "epoch": 1.7829476861167002, + "grad_norm": 0.2991205155849457, + "learning_rate": 4.229753112019069e-06, + "loss": 0.3313, + "step": 7089 + }, + { + "epoch": 1.7831991951710262, + "grad_norm": 0.314127653837204, + "learning_rate": 4.2283073639636376e-06, + "loss": 0.3379, + "step": 7090 + }, + { + "epoch": 1.783450704225352, + "grad_norm": 0.31345435976982117, + "learning_rate": 4.226861681998849e-06, + "loss": 0.3603, + "step": 7091 + }, + { + "epoch": 1.783702213279678, + "grad_norm": 0.3256407082080841, + "learning_rate": 4.2254160662485236e-06, + "loss": 0.3277, + "step": 7092 + }, + { + "epoch": 1.783953722334004, + "grad_norm": 0.3122624158859253, + "learning_rate": 4.2239705168364636e-06, + "loss": 0.3499, + "step": 7093 + }, + { + "epoch": 1.7842052313883299, + "grad_norm": 0.2897671163082123, + "learning_rate": 4.222525033886476e-06, + "loss": 0.3465, + "step": 7094 + }, + { + "epoch": 1.784456740442656, + "grad_norm": 0.31568416953086853, + "learning_rate": 4.221079617522354e-06, + "loss": 0.3393, + "step": 7095 + }, + { + "epoch": 1.784708249496982, + "grad_norm": 0.31326398253440857, + "learning_rate": 4.219634267867892e-06, + "loss": 0.3521, + "step": 7096 + }, + { + "epoch": 1.7849597585513077, + "grad_norm": 0.346648246049881, + "learning_rate": 4.2181889850468704e-06, + "loss": 0.3414, + "step": 7097 + }, + { + "epoch": 1.7852112676056338, + "grad_norm": 0.29954788088798523, + "learning_rate": 4.216743769183071e-06, + "loss": 0.3569, + "step": 7098 + }, + { + "epoch": 1.7854627766599598, + "grad_norm": 0.3176994323730469, + "learning_rate": 4.215298620400271e-06, + "loss": 0.3513, + "step": 7099 + }, + { + "epoch": 1.7857142857142856, + "grad_norm": 0.3110980987548828, + "learning_rate": 4.213853538822232e-06, + "loss": 0.3463, + "step": 7100 + }, + { + "epoch": 1.7859657947686118, + "grad_norm": 0.3204716145992279, + "learning_rate": 4.212408524572722e-06, + "loss": 0.3722, + "step": 7101 + }, + { + "epoch": 1.7862173038229376, + "grad_norm": 0.3436601161956787, + "learning_rate": 4.210963577775492e-06, + "loss": 0.3489, + "step": 7102 + }, + { + "epoch": 1.7864688128772634, + "grad_norm": 0.31376877427101135, + "learning_rate": 4.209518698554298e-06, + "loss": 0.3449, + "step": 7103 + }, + { + "epoch": 1.7867203219315897, + "grad_norm": 0.31126469373703003, + "learning_rate": 4.2080738870328795e-06, + "loss": 0.3685, + "step": 7104 + }, + { + "epoch": 1.7869718309859155, + "grad_norm": 0.29963016510009766, + "learning_rate": 4.206629143334981e-06, + "loss": 0.3533, + "step": 7105 + }, + { + "epoch": 1.7872233400402413, + "grad_norm": 0.3086322546005249, + "learning_rate": 4.20518446758433e-06, + "loss": 0.3215, + "step": 7106 + }, + { + "epoch": 1.7874748490945676, + "grad_norm": 0.35310494899749756, + "learning_rate": 4.20373985990466e-06, + "loss": 0.3563, + "step": 7107 + }, + { + "epoch": 1.7877263581488934, + "grad_norm": 0.2881607413291931, + "learning_rate": 4.202295320419687e-06, + "loss": 0.3393, + "step": 7108 + }, + { + "epoch": 1.7879778672032192, + "grad_norm": 0.3135796785354614, + "learning_rate": 4.2008508492531305e-06, + "loss": 0.3605, + "step": 7109 + }, + { + "epoch": 1.7882293762575454, + "grad_norm": 0.3010595440864563, + "learning_rate": 4.1994064465287e-06, + "loss": 0.356, + "step": 7110 + }, + { + "epoch": 1.7884808853118712, + "grad_norm": 0.3232562839984894, + "learning_rate": 4.1979621123700976e-06, + "loss": 0.335, + "step": 7111 + }, + { + "epoch": 1.788732394366197, + "grad_norm": 0.3245859742164612, + "learning_rate": 4.196517846901025e-06, + "loss": 0.3427, + "step": 7112 + }, + { + "epoch": 1.7889839034205233, + "grad_norm": 0.323621928691864, + "learning_rate": 4.195073650245169e-06, + "loss": 0.3566, + "step": 7113 + }, + { + "epoch": 1.789235412474849, + "grad_norm": 0.3078625500202179, + "learning_rate": 4.193629522526223e-06, + "loss": 0.3304, + "step": 7114 + }, + { + "epoch": 1.789486921529175, + "grad_norm": 0.3693898320198059, + "learning_rate": 4.19218546386786e-06, + "loss": 0.3466, + "step": 7115 + }, + { + "epoch": 1.7897384305835011, + "grad_norm": 0.2945307493209839, + "learning_rate": 4.190741474393762e-06, + "loss": 0.3452, + "step": 7116 + }, + { + "epoch": 1.789989939637827, + "grad_norm": 0.3134191930294037, + "learning_rate": 4.18929755422759e-06, + "loss": 0.3609, + "step": 7117 + }, + { + "epoch": 1.790241448692153, + "grad_norm": 0.3266321122646332, + "learning_rate": 4.187853703493014e-06, + "loss": 0.3513, + "step": 7118 + }, + { + "epoch": 1.790492957746479, + "grad_norm": 0.3312418758869171, + "learning_rate": 4.186409922313686e-06, + "loss": 0.3385, + "step": 7119 + }, + { + "epoch": 1.7907444668008048, + "grad_norm": 0.30360767245292664, + "learning_rate": 4.184966210813258e-06, + "loss": 0.3557, + "step": 7120 + }, + { + "epoch": 1.7909959758551308, + "grad_norm": 0.3027835786342621, + "learning_rate": 4.183522569115377e-06, + "loss": 0.3553, + "step": 7121 + }, + { + "epoch": 1.7912474849094568, + "grad_norm": 0.317096084356308, + "learning_rate": 4.182078997343678e-06, + "loss": 0.3794, + "step": 7122 + }, + { + "epoch": 1.7914989939637826, + "grad_norm": 0.31175005435943604, + "learning_rate": 4.180635495621798e-06, + "loss": 0.3567, + "step": 7123 + }, + { + "epoch": 1.7917505030181087, + "grad_norm": 0.30733904242515564, + "learning_rate": 4.1791920640733596e-06, + "loss": 0.3446, + "step": 7124 + }, + { + "epoch": 1.7920020120724347, + "grad_norm": 0.3084380030632019, + "learning_rate": 4.177748702821988e-06, + "loss": 0.3611, + "step": 7125 + }, + { + "epoch": 1.7922535211267605, + "grad_norm": 0.3090236783027649, + "learning_rate": 4.176305411991295e-06, + "loss": 0.3313, + "step": 7126 + }, + { + "epoch": 1.7925050301810865, + "grad_norm": 0.2927131950855255, + "learning_rate": 4.174862191704892e-06, + "loss": 0.3476, + "step": 7127 + }, + { + "epoch": 1.7927565392354126, + "grad_norm": 0.3258267343044281, + "learning_rate": 4.173419042086377e-06, + "loss": 0.3354, + "step": 7128 + }, + { + "epoch": 1.7930080482897384, + "grad_norm": 0.2800391614437103, + "learning_rate": 4.171975963259354e-06, + "loss": 0.352, + "step": 7129 + }, + { + "epoch": 1.7932595573440644, + "grad_norm": 0.32517415285110474, + "learning_rate": 4.170532955347406e-06, + "loss": 0.3151, + "step": 7130 + }, + { + "epoch": 1.7935110663983904, + "grad_norm": 0.30167156457901, + "learning_rate": 4.169090018474122e-06, + "loss": 0.3481, + "step": 7131 + }, + { + "epoch": 1.7937625754527162, + "grad_norm": 0.31728750467300415, + "learning_rate": 4.1676471527630815e-06, + "loss": 0.3554, + "step": 7132 + }, + { + "epoch": 1.7940140845070423, + "grad_norm": 0.306044340133667, + "learning_rate": 4.1662043583378534e-06, + "loss": 0.3496, + "step": 7133 + }, + { + "epoch": 1.7942655935613683, + "grad_norm": 0.3083249628543854, + "learning_rate": 4.164761635322007e-06, + "loss": 0.3371, + "step": 7134 + }, + { + "epoch": 1.794517102615694, + "grad_norm": 0.2971269190311432, + "learning_rate": 4.163318983839101e-06, + "loss": 0.3446, + "step": 7135 + }, + { + "epoch": 1.79476861167002, + "grad_norm": 0.29189154505729675, + "learning_rate": 4.1618764040126905e-06, + "loss": 0.3526, + "step": 7136 + }, + { + "epoch": 1.7950201207243461, + "grad_norm": 0.31266725063323975, + "learning_rate": 4.1604338959663204e-06, + "loss": 0.3506, + "step": 7137 + }, + { + "epoch": 1.795271629778672, + "grad_norm": 0.294013112783432, + "learning_rate": 4.158991459823538e-06, + "loss": 0.3593, + "step": 7138 + }, + { + "epoch": 1.795523138832998, + "grad_norm": 0.31676429510116577, + "learning_rate": 4.1575490957078725e-06, + "loss": 0.3357, + "step": 7139 + }, + { + "epoch": 1.795774647887324, + "grad_norm": 0.3218402862548828, + "learning_rate": 4.15610680374286e-06, + "loss": 0.3462, + "step": 7140 + }, + { + "epoch": 1.7960261569416498, + "grad_norm": 0.2809396982192993, + "learning_rate": 4.154664584052018e-06, + "loss": 0.3368, + "step": 7141 + }, + { + "epoch": 1.7962776659959758, + "grad_norm": 0.30327579379081726, + "learning_rate": 4.153222436758866e-06, + "loss": 0.3773, + "step": 7142 + }, + { + "epoch": 1.7965291750503019, + "grad_norm": 0.2996901869773865, + "learning_rate": 4.151780361986915e-06, + "loss": 0.3433, + "step": 7143 + }, + { + "epoch": 1.7967806841046277, + "grad_norm": 0.30090850591659546, + "learning_rate": 4.1503383598596705e-06, + "loss": 0.3518, + "step": 7144 + }, + { + "epoch": 1.7970321931589537, + "grad_norm": 0.3128022849559784, + "learning_rate": 4.148896430500629e-06, + "loss": 0.3521, + "step": 7145 + }, + { + "epoch": 1.7972837022132797, + "grad_norm": 0.26742905378341675, + "learning_rate": 4.147454574033284e-06, + "loss": 0.3435, + "step": 7146 + }, + { + "epoch": 1.7975352112676055, + "grad_norm": 0.31213289499282837, + "learning_rate": 4.146012790581121e-06, + "loss": 0.3451, + "step": 7147 + }, + { + "epoch": 1.7977867203219315, + "grad_norm": 0.2901371419429779, + "learning_rate": 4.144571080267621e-06, + "loss": 0.3496, + "step": 7148 + }, + { + "epoch": 1.7980382293762576, + "grad_norm": 0.3160885274410248, + "learning_rate": 4.143129443216256e-06, + "loss": 0.3258, + "step": 7149 + }, + { + "epoch": 1.7982897384305834, + "grad_norm": 0.3092404007911682, + "learning_rate": 4.141687879550494e-06, + "loss": 0.3679, + "step": 7150 + }, + { + "epoch": 1.7985412474849096, + "grad_norm": 0.3174445629119873, + "learning_rate": 4.140246389393794e-06, + "loss": 0.3338, + "step": 7151 + }, + { + "epoch": 1.7987927565392354, + "grad_norm": 0.28808581829071045, + "learning_rate": 4.138804972869613e-06, + "loss": 0.3477, + "step": 7152 + }, + { + "epoch": 1.7990442655935612, + "grad_norm": 0.3383561670780182, + "learning_rate": 4.137363630101398e-06, + "loss": 0.3548, + "step": 7153 + }, + { + "epoch": 1.7992957746478875, + "grad_norm": 0.30686813592910767, + "learning_rate": 4.135922361212593e-06, + "loss": 0.3549, + "step": 7154 + }, + { + "epoch": 1.7995472837022133, + "grad_norm": 0.2914334833621979, + "learning_rate": 4.134481166326631e-06, + "loss": 0.3306, + "step": 7155 + }, + { + "epoch": 1.799798792756539, + "grad_norm": 0.3026004135608673, + "learning_rate": 4.133040045566942e-06, + "loss": 0.3397, + "step": 7156 + }, + { + "epoch": 1.8000503018108653, + "grad_norm": 0.3466000258922577, + "learning_rate": 4.13159899905695e-06, + "loss": 0.3348, + "step": 7157 + }, + { + "epoch": 1.8003018108651911, + "grad_norm": 0.30805760622024536, + "learning_rate": 4.130158026920072e-06, + "loss": 0.3512, + "step": 7158 + }, + { + "epoch": 1.800553319919517, + "grad_norm": 0.32429513335227966, + "learning_rate": 4.128717129279715e-06, + "loss": 0.3828, + "step": 7159 + }, + { + "epoch": 1.8008048289738432, + "grad_norm": 0.3071700930595398, + "learning_rate": 4.127276306259288e-06, + "loss": 0.3582, + "step": 7160 + }, + { + "epoch": 1.801056338028169, + "grad_norm": 0.31725555658340454, + "learning_rate": 4.125835557982183e-06, + "loss": 0.361, + "step": 7161 + }, + { + "epoch": 1.8013078470824948, + "grad_norm": 0.31528329849243164, + "learning_rate": 4.124394884571796e-06, + "loss": 0.3451, + "step": 7162 + }, + { + "epoch": 1.801559356136821, + "grad_norm": 0.3092787563800812, + "learning_rate": 4.122954286151507e-06, + "loss": 0.3431, + "step": 7163 + }, + { + "epoch": 1.8018108651911469, + "grad_norm": 0.3064475953578949, + "learning_rate": 4.121513762844696e-06, + "loss": 0.3313, + "step": 7164 + }, + { + "epoch": 1.8020623742454729, + "grad_norm": 0.30549487471580505, + "learning_rate": 4.120073314774739e-06, + "loss": 0.3576, + "step": 7165 + }, + { + "epoch": 1.802313883299799, + "grad_norm": 0.30158209800720215, + "learning_rate": 4.118632942064995e-06, + "loss": 0.3607, + "step": 7166 + }, + { + "epoch": 1.8025653923541247, + "grad_norm": 0.2950282394886017, + "learning_rate": 4.117192644838827e-06, + "loss": 0.3414, + "step": 7167 + }, + { + "epoch": 1.8028169014084507, + "grad_norm": 0.3308418393135071, + "learning_rate": 4.115752423219585e-06, + "loss": 0.3561, + "step": 7168 + }, + { + "epoch": 1.8030684104627768, + "grad_norm": 0.30721715092658997, + "learning_rate": 4.114312277330617e-06, + "loss": 0.3459, + "step": 7169 + }, + { + "epoch": 1.8033199195171026, + "grad_norm": 0.300238698720932, + "learning_rate": 4.112872207295262e-06, + "loss": 0.3266, + "step": 7170 + }, + { + "epoch": 1.8035714285714286, + "grad_norm": 0.3531527519226074, + "learning_rate": 4.1114322132368524e-06, + "loss": 0.3471, + "step": 7171 + }, + { + "epoch": 1.8038229376257546, + "grad_norm": 0.3371135890483856, + "learning_rate": 4.109992295278714e-06, + "loss": 0.3284, + "step": 7172 + }, + { + "epoch": 1.8040744466800804, + "grad_norm": 0.30015435814857483, + "learning_rate": 4.108552453544169e-06, + "loss": 0.3471, + "step": 7173 + }, + { + "epoch": 1.8043259557344065, + "grad_norm": 0.29010263085365295, + "learning_rate": 4.107112688156528e-06, + "loss": 0.3366, + "step": 7174 + }, + { + "epoch": 1.8045774647887325, + "grad_norm": 0.3293246626853943, + "learning_rate": 4.105672999239098e-06, + "loss": 0.3481, + "step": 7175 + }, + { + "epoch": 1.8048289738430583, + "grad_norm": 0.34103676676750183, + "learning_rate": 4.104233386915185e-06, + "loss": 0.341, + "step": 7176 + }, + { + "epoch": 1.8050804828973843, + "grad_norm": 0.2750874161720276, + "learning_rate": 4.102793851308074e-06, + "loss": 0.3327, + "step": 7177 + }, + { + "epoch": 1.8053319919517103, + "grad_norm": 0.30653923749923706, + "learning_rate": 4.101354392541061e-06, + "loss": 0.335, + "step": 7178 + }, + { + "epoch": 1.8055835010060362, + "grad_norm": 0.32137489318847656, + "learning_rate": 4.099915010737419e-06, + "loss": 0.3409, + "step": 7179 + }, + { + "epoch": 1.8058350100603622, + "grad_norm": 0.3056570291519165, + "learning_rate": 4.098475706020428e-06, + "loss": 0.3229, + "step": 7180 + }, + { + "epoch": 1.8060865191146882, + "grad_norm": 0.30447155237197876, + "learning_rate": 4.0970364785133506e-06, + "loss": 0.3495, + "step": 7181 + }, + { + "epoch": 1.806338028169014, + "grad_norm": 0.2888796925544739, + "learning_rate": 4.0955973283394525e-06, + "loss": 0.3445, + "step": 7182 + }, + { + "epoch": 1.80658953722334, + "grad_norm": 0.29918986558914185, + "learning_rate": 4.094158255621983e-06, + "loss": 0.3205, + "step": 7183 + }, + { + "epoch": 1.806841046277666, + "grad_norm": 0.31096193194389343, + "learning_rate": 4.0927192604841935e-06, + "loss": 0.3415, + "step": 7184 + }, + { + "epoch": 1.8070925553319919, + "grad_norm": 0.3002293109893799, + "learning_rate": 4.0912803430493215e-06, + "loss": 0.33, + "step": 7185 + }, + { + "epoch": 1.807344064386318, + "grad_norm": 0.2879883348941803, + "learning_rate": 4.089841503440603e-06, + "loss": 0.3347, + "step": 7186 + }, + { + "epoch": 1.807595573440644, + "grad_norm": 0.2978774607181549, + "learning_rate": 4.088402741781269e-06, + "loss": 0.3424, + "step": 7187 + }, + { + "epoch": 1.8078470824949697, + "grad_norm": 0.3050556480884552, + "learning_rate": 4.086964058194534e-06, + "loss": 0.3397, + "step": 7188 + }, + { + "epoch": 1.8080985915492958, + "grad_norm": 0.30784785747528076, + "learning_rate": 4.085525452803618e-06, + "loss": 0.3586, + "step": 7189 + }, + { + "epoch": 1.8083501006036218, + "grad_norm": 0.3016720712184906, + "learning_rate": 4.084086925731723e-06, + "loss": 0.3537, + "step": 7190 + }, + { + "epoch": 1.8086016096579476, + "grad_norm": 0.3045290410518646, + "learning_rate": 4.0826484771020565e-06, + "loss": 0.3393, + "step": 7191 + }, + { + "epoch": 1.8088531187122736, + "grad_norm": 0.2894311249256134, + "learning_rate": 4.081210107037806e-06, + "loss": 0.351, + "step": 7192 + }, + { + "epoch": 1.8091046277665996, + "grad_norm": 0.30208760499954224, + "learning_rate": 4.079771815662164e-06, + "loss": 0.3349, + "step": 7193 + }, + { + "epoch": 1.8093561368209254, + "grad_norm": 0.3042428195476532, + "learning_rate": 4.078333603098307e-06, + "loss": 0.3831, + "step": 7194 + }, + { + "epoch": 1.8096076458752515, + "grad_norm": 0.3321954905986786, + "learning_rate": 4.076895469469413e-06, + "loss": 0.3566, + "step": 7195 + }, + { + "epoch": 1.8098591549295775, + "grad_norm": 0.3012005686759949, + "learning_rate": 4.075457414898646e-06, + "loss": 0.3242, + "step": 7196 + }, + { + "epoch": 1.8101106639839033, + "grad_norm": 0.29138419032096863, + "learning_rate": 4.074019439509168e-06, + "loss": 0.3078, + "step": 7197 + }, + { + "epoch": 1.8103621730382293, + "grad_norm": 0.29968154430389404, + "learning_rate": 4.072581543424132e-06, + "loss": 0.3189, + "step": 7198 + }, + { + "epoch": 1.8106136820925554, + "grad_norm": 0.3333386778831482, + "learning_rate": 4.071143726766683e-06, + "loss": 0.3542, + "step": 7199 + }, + { + "epoch": 1.8108651911468812, + "grad_norm": 0.31147098541259766, + "learning_rate": 4.069705989659966e-06, + "loss": 0.3586, + "step": 7200 + }, + { + "epoch": 1.8111167002012074, + "grad_norm": 0.31270739436149597, + "learning_rate": 4.0682683322271086e-06, + "loss": 0.3408, + "step": 7201 + }, + { + "epoch": 1.8113682092555332, + "grad_norm": 0.30901169776916504, + "learning_rate": 4.066830754591242e-06, + "loss": 0.346, + "step": 7202 + }, + { + "epoch": 1.811619718309859, + "grad_norm": 0.30854368209838867, + "learning_rate": 4.065393256875481e-06, + "loss": 0.3383, + "step": 7203 + }, + { + "epoch": 1.8118712273641853, + "grad_norm": 0.304512083530426, + "learning_rate": 4.063955839202943e-06, + "loss": 0.312, + "step": 7204 + }, + { + "epoch": 1.812122736418511, + "grad_norm": 0.29708781838417053, + "learning_rate": 4.062518501696729e-06, + "loss": 0.3454, + "step": 7205 + }, + { + "epoch": 1.8123742454728369, + "grad_norm": 0.304839551448822, + "learning_rate": 4.061081244479943e-06, + "loss": 0.3573, + "step": 7206 + }, + { + "epoch": 1.8126257545271631, + "grad_norm": 0.3408084213733673, + "learning_rate": 4.059644067675673e-06, + "loss": 0.3337, + "step": 7207 + }, + { + "epoch": 1.812877263581489, + "grad_norm": 0.2825142443180084, + "learning_rate": 4.058206971407006e-06, + "loss": 0.378, + "step": 7208 + }, + { + "epoch": 1.8131287726358147, + "grad_norm": 0.285431444644928, + "learning_rate": 4.05676995579702e-06, + "loss": 0.3331, + "step": 7209 + }, + { + "epoch": 1.813380281690141, + "grad_norm": 0.32098081707954407, + "learning_rate": 4.055333020968787e-06, + "loss": 0.342, + "step": 7210 + }, + { + "epoch": 1.8136317907444668, + "grad_norm": 0.2935009002685547, + "learning_rate": 4.05389616704537e-06, + "loss": 0.3568, + "step": 7211 + }, + { + "epoch": 1.8138832997987926, + "grad_norm": 0.29505932331085205, + "learning_rate": 4.052459394149829e-06, + "loss": 0.351, + "step": 7212 + }, + { + "epoch": 1.8141348088531188, + "grad_norm": 0.30334311723709106, + "learning_rate": 4.0510227024052115e-06, + "loss": 0.3446, + "step": 7213 + }, + { + "epoch": 1.8143863179074446, + "grad_norm": 0.3207496404647827, + "learning_rate": 4.049586091934563e-06, + "loss": 0.3443, + "step": 7214 + }, + { + "epoch": 1.8146378269617707, + "grad_norm": 0.31141701340675354, + "learning_rate": 4.048149562860921e-06, + "loss": 0.3418, + "step": 7215 + }, + { + "epoch": 1.8148893360160967, + "grad_norm": 0.3144972622394562, + "learning_rate": 4.046713115307314e-06, + "loss": 0.3316, + "step": 7216 + }, + { + "epoch": 1.8151408450704225, + "grad_norm": 0.30265048146247864, + "learning_rate": 4.045276749396764e-06, + "loss": 0.3368, + "step": 7217 + }, + { + "epoch": 1.8153923541247485, + "grad_norm": 0.2881247401237488, + "learning_rate": 4.043840465252289e-06, + "loss": 0.3493, + "step": 7218 + }, + { + "epoch": 1.8156438631790746, + "grad_norm": 0.30661842226982117, + "learning_rate": 4.042404262996894e-06, + "loss": 0.3615, + "step": 7219 + }, + { + "epoch": 1.8158953722334004, + "grad_norm": 0.2946421205997467, + "learning_rate": 4.0409681427535855e-06, + "loss": 0.331, + "step": 7220 + }, + { + "epoch": 1.8161468812877264, + "grad_norm": 0.34275466203689575, + "learning_rate": 4.039532104645354e-06, + "loss": 0.3441, + "step": 7221 + }, + { + "epoch": 1.8163983903420524, + "grad_norm": 0.3059110641479492, + "learning_rate": 4.0380961487951915e-06, + "loss": 0.3422, + "step": 7222 + }, + { + "epoch": 1.8166498993963782, + "grad_norm": 0.29724571108818054, + "learning_rate": 4.0366602753260745e-06, + "loss": 0.3245, + "step": 7223 + }, + { + "epoch": 1.8169014084507042, + "grad_norm": 0.31005269289016724, + "learning_rate": 4.035224484360979e-06, + "loss": 0.335, + "step": 7224 + }, + { + "epoch": 1.8171529175050303, + "grad_norm": 0.31909066438674927, + "learning_rate": 4.03378877602287e-06, + "loss": 0.359, + "step": 7225 + }, + { + "epoch": 1.817404426559356, + "grad_norm": 0.2967131435871124, + "learning_rate": 4.032353150434709e-06, + "loss": 0.3227, + "step": 7226 + }, + { + "epoch": 1.817655935613682, + "grad_norm": 0.3029683828353882, + "learning_rate": 4.030917607719446e-06, + "loss": 0.3581, + "step": 7227 + }, + { + "epoch": 1.8179074446680081, + "grad_norm": 0.3194609582424164, + "learning_rate": 4.029482148000028e-06, + "loss": 0.3354, + "step": 7228 + }, + { + "epoch": 1.818158953722334, + "grad_norm": 0.3186091482639313, + "learning_rate": 4.028046771399391e-06, + "loss": 0.3477, + "step": 7229 + }, + { + "epoch": 1.81841046277666, + "grad_norm": 0.28328919410705566, + "learning_rate": 4.026611478040468e-06, + "loss": 0.3498, + "step": 7230 + }, + { + "epoch": 1.818661971830986, + "grad_norm": 0.28632745146751404, + "learning_rate": 4.025176268046184e-06, + "loss": 0.3525, + "step": 7231 + }, + { + "epoch": 1.8189134808853118, + "grad_norm": 0.310872346162796, + "learning_rate": 4.023741141539453e-06, + "loss": 0.3418, + "step": 7232 + }, + { + "epoch": 1.8191649899396378, + "grad_norm": 0.3138779401779175, + "learning_rate": 4.022306098643186e-06, + "loss": 0.3636, + "step": 7233 + }, + { + "epoch": 1.8194164989939638, + "grad_norm": 0.31211575865745544, + "learning_rate": 4.020871139480285e-06, + "loss": 0.3466, + "step": 7234 + }, + { + "epoch": 1.8196680080482897, + "grad_norm": 0.2843218445777893, + "learning_rate": 4.019436264173646e-06, + "loss": 0.3238, + "step": 7235 + }, + { + "epoch": 1.8199195171026157, + "grad_norm": 0.3115207552909851, + "learning_rate": 4.018001472846156e-06, + "loss": 0.342, + "step": 7236 + }, + { + "epoch": 1.8201710261569417, + "grad_norm": 0.3359984755516052, + "learning_rate": 4.0165667656206975e-06, + "loss": 0.3548, + "step": 7237 + }, + { + "epoch": 1.8204225352112675, + "grad_norm": 0.31638795137405396, + "learning_rate": 4.0151321426201414e-06, + "loss": 0.363, + "step": 7238 + }, + { + "epoch": 1.8206740442655935, + "grad_norm": 0.3151678442955017, + "learning_rate": 4.013697603967356e-06, + "loss": 0.3564, + "step": 7239 + }, + { + "epoch": 1.8209255533199196, + "grad_norm": 0.2856763005256653, + "learning_rate": 4.012263149785203e-06, + "loss": 0.3497, + "step": 7240 + }, + { + "epoch": 1.8211770623742454, + "grad_norm": 0.32092490792274475, + "learning_rate": 4.010828780196529e-06, + "loss": 0.3394, + "step": 7241 + }, + { + "epoch": 1.8214285714285714, + "grad_norm": 0.3086574971675873, + "learning_rate": 4.009394495324185e-06, + "loss": 0.3707, + "step": 7242 + }, + { + "epoch": 1.8216800804828974, + "grad_norm": 0.3423568606376648, + "learning_rate": 4.007960295291002e-06, + "loss": 0.3474, + "step": 7243 + }, + { + "epoch": 1.8219315895372232, + "grad_norm": 0.28429266810417175, + "learning_rate": 4.006526180219816e-06, + "loss": 0.3341, + "step": 7244 + }, + { + "epoch": 1.8221830985915493, + "grad_norm": 0.2965366244316101, + "learning_rate": 4.005092150233445e-06, + "loss": 0.3395, + "step": 7245 + }, + { + "epoch": 1.8224346076458753, + "grad_norm": 0.34402769804000854, + "learning_rate": 4.00365820545471e-06, + "loss": 0.3643, + "step": 7246 + }, + { + "epoch": 1.822686116700201, + "grad_norm": 0.30077889561653137, + "learning_rate": 4.002224346006415e-06, + "loss": 0.3334, + "step": 7247 + }, + { + "epoch": 1.8229376257545271, + "grad_norm": 0.31439444422721863, + "learning_rate": 4.000790572011365e-06, + "loss": 0.3382, + "step": 7248 + }, + { + "epoch": 1.8231891348088531, + "grad_norm": 0.34265372157096863, + "learning_rate": 3.999356883592348e-06, + "loss": 0.3576, + "step": 7249 + }, + { + "epoch": 1.823440643863179, + "grad_norm": 0.3146701753139496, + "learning_rate": 3.997923280872154e-06, + "loss": 0.3728, + "step": 7250 + }, + { + "epoch": 1.8236921529175052, + "grad_norm": 0.3109131455421448, + "learning_rate": 3.9964897639735644e-06, + "loss": 0.3297, + "step": 7251 + }, + { + "epoch": 1.823943661971831, + "grad_norm": 0.28482386469841003, + "learning_rate": 3.995056333019347e-06, + "loss": 0.33, + "step": 7252 + }, + { + "epoch": 1.8241951710261568, + "grad_norm": 0.3102104067802429, + "learning_rate": 3.993622988132269e-06, + "loss": 0.3738, + "step": 7253 + }, + { + "epoch": 1.824446680080483, + "grad_norm": 0.34564486145973206, + "learning_rate": 3.992189729435085e-06, + "loss": 0.3482, + "step": 7254 + }, + { + "epoch": 1.8246981891348089, + "grad_norm": 0.3167095482349396, + "learning_rate": 3.990756557050548e-06, + "loss": 0.3487, + "step": 7255 + }, + { + "epoch": 1.8249496981891347, + "grad_norm": 0.28097379207611084, + "learning_rate": 3.989323471101395e-06, + "loss": 0.3564, + "step": 7256 + }, + { + "epoch": 1.825201207243461, + "grad_norm": 0.29964274168014526, + "learning_rate": 3.987890471710367e-06, + "loss": 0.3434, + "step": 7257 + }, + { + "epoch": 1.8254527162977867, + "grad_norm": 0.32765746116638184, + "learning_rate": 3.986457559000185e-06, + "loss": 0.3327, + "step": 7258 + }, + { + "epoch": 1.8257042253521125, + "grad_norm": 0.3484762907028198, + "learning_rate": 3.985024733093576e-06, + "loss": 0.3504, + "step": 7259 + }, + { + "epoch": 1.8259557344064388, + "grad_norm": 0.3308793306350708, + "learning_rate": 3.9835919941132464e-06, + "loss": 0.3428, + "step": 7260 + }, + { + "epoch": 1.8262072434607646, + "grad_norm": 0.3149087429046631, + "learning_rate": 3.982159342181904e-06, + "loss": 0.3478, + "step": 7261 + }, + { + "epoch": 1.8264587525150904, + "grad_norm": 0.3215365707874298, + "learning_rate": 3.9807267774222475e-06, + "loss": 0.3417, + "step": 7262 + }, + { + "epoch": 1.8267102615694166, + "grad_norm": 0.3201581835746765, + "learning_rate": 3.979294299956965e-06, + "loss": 0.3392, + "step": 7263 + }, + { + "epoch": 1.8269617706237424, + "grad_norm": 0.3202977478504181, + "learning_rate": 3.977861909908741e-06, + "loss": 0.3425, + "step": 7264 + }, + { + "epoch": 1.8272132796780685, + "grad_norm": 0.31886807084083557, + "learning_rate": 3.976429607400249e-06, + "loss": 0.371, + "step": 7265 + }, + { + "epoch": 1.8274647887323945, + "grad_norm": 0.3250950872898102, + "learning_rate": 3.9749973925541585e-06, + "loss": 0.3473, + "step": 7266 + }, + { + "epoch": 1.8277162977867203, + "grad_norm": 0.30726975202560425, + "learning_rate": 3.973565265493129e-06, + "loss": 0.3644, + "step": 7267 + }, + { + "epoch": 1.8279678068410463, + "grad_norm": 0.3059992492198944, + "learning_rate": 3.972133226339812e-06, + "loss": 0.3607, + "step": 7268 + }, + { + "epoch": 1.8282193158953723, + "grad_norm": 0.32286500930786133, + "learning_rate": 3.970701275216855e-06, + "loss": 0.339, + "step": 7269 + }, + { + "epoch": 1.8284708249496981, + "grad_norm": 0.319062203168869, + "learning_rate": 3.969269412246895e-06, + "loss": 0.3551, + "step": 7270 + }, + { + "epoch": 1.8287223340040242, + "grad_norm": 0.3119877576828003, + "learning_rate": 3.967837637552561e-06, + "loss": 0.3506, + "step": 7271 + }, + { + "epoch": 1.8289738430583502, + "grad_norm": 0.32227200269699097, + "learning_rate": 3.966405951256475e-06, + "loss": 0.3581, + "step": 7272 + }, + { + "epoch": 1.829225352112676, + "grad_norm": 0.3127002716064453, + "learning_rate": 3.964974353481254e-06, + "loss": 0.3448, + "step": 7273 + }, + { + "epoch": 1.829476861167002, + "grad_norm": 0.3057059049606323, + "learning_rate": 3.963542844349505e-06, + "loss": 0.374, + "step": 7274 + }, + { + "epoch": 1.829728370221328, + "grad_norm": 0.29945117235183716, + "learning_rate": 3.962111423983827e-06, + "loss": 0.3443, + "step": 7275 + }, + { + "epoch": 1.8299798792756539, + "grad_norm": 0.30039167404174805, + "learning_rate": 3.960680092506812e-06, + "loss": 0.3194, + "step": 7276 + }, + { + "epoch": 1.83023138832998, + "grad_norm": 0.32396718859672546, + "learning_rate": 3.9592488500410465e-06, + "loss": 0.3649, + "step": 7277 + }, + { + "epoch": 1.830482897384306, + "grad_norm": 0.33545982837677, + "learning_rate": 3.957817696709104e-06, + "loss": 0.3568, + "step": 7278 + }, + { + "epoch": 1.8307344064386317, + "grad_norm": 0.2859671413898468, + "learning_rate": 3.9563866326335575e-06, + "loss": 0.3704, + "step": 7279 + }, + { + "epoch": 1.8309859154929577, + "grad_norm": 0.2837298512458801, + "learning_rate": 3.9549556579369665e-06, + "loss": 0.3228, + "step": 7280 + }, + { + "epoch": 1.8312374245472838, + "grad_norm": 0.2976028025150299, + "learning_rate": 3.953524772741886e-06, + "loss": 0.3342, + "step": 7281 + }, + { + "epoch": 1.8314889336016096, + "grad_norm": 0.30670884251594543, + "learning_rate": 3.952093977170861e-06, + "loss": 0.3578, + "step": 7282 + }, + { + "epoch": 1.8317404426559356, + "grad_norm": 0.2988281846046448, + "learning_rate": 3.950663271346432e-06, + "loss": 0.3489, + "step": 7283 + }, + { + "epoch": 1.8319919517102616, + "grad_norm": 0.276311993598938, + "learning_rate": 3.94923265539113e-06, + "loss": 0.3466, + "step": 7284 + }, + { + "epoch": 1.8322434607645874, + "grad_norm": 0.3069443106651306, + "learning_rate": 3.947802129427476e-06, + "loss": 0.3689, + "step": 7285 + }, + { + "epoch": 1.8324949698189135, + "grad_norm": 0.3089459240436554, + "learning_rate": 3.946371693577988e-06, + "loss": 0.3199, + "step": 7286 + }, + { + "epoch": 1.8327464788732395, + "grad_norm": 0.3473307192325592, + "learning_rate": 3.9449413479651715e-06, + "loss": 0.3832, + "step": 7287 + }, + { + "epoch": 1.8329979879275653, + "grad_norm": 0.2900719940662384, + "learning_rate": 3.94351109271153e-06, + "loss": 0.3357, + "step": 7288 + }, + { + "epoch": 1.8332494969818913, + "grad_norm": 0.29746437072753906, + "learning_rate": 3.9420809279395525e-06, + "loss": 0.3304, + "step": 7289 + }, + { + "epoch": 1.8335010060362174, + "grad_norm": 0.3016701340675354, + "learning_rate": 3.940650853771727e-06, + "loss": 0.371, + "step": 7290 + }, + { + "epoch": 1.8337525150905432, + "grad_norm": 0.2984379231929779, + "learning_rate": 3.939220870330527e-06, + "loss": 0.3555, + "step": 7291 + }, + { + "epoch": 1.8340040241448692, + "grad_norm": 0.3294828534126282, + "learning_rate": 3.937790977738425e-06, + "loss": 0.3452, + "step": 7292 + }, + { + "epoch": 1.8342555331991952, + "grad_norm": 0.3317073881626129, + "learning_rate": 3.936361176117879e-06, + "loss": 0.3459, + "step": 7293 + }, + { + "epoch": 1.834507042253521, + "grad_norm": 0.2974916994571686, + "learning_rate": 3.934931465591343e-06, + "loss": 0.3601, + "step": 7294 + }, + { + "epoch": 1.834758551307847, + "grad_norm": 0.30047857761383057, + "learning_rate": 3.9335018462812664e-06, + "loss": 0.3436, + "step": 7295 + }, + { + "epoch": 1.835010060362173, + "grad_norm": 0.3131597936153412, + "learning_rate": 3.9320723183100824e-06, + "loss": 0.3716, + "step": 7296 + }, + { + "epoch": 1.8352615694164989, + "grad_norm": 0.3238302171230316, + "learning_rate": 3.930642881800227e-06, + "loss": 0.3654, + "step": 7297 + }, + { + "epoch": 1.835513078470825, + "grad_norm": 0.31614577770233154, + "learning_rate": 3.9292135368741155e-06, + "loss": 0.3463, + "step": 7298 + }, + { + "epoch": 1.835764587525151, + "grad_norm": 0.31243595480918884, + "learning_rate": 3.927784283654168e-06, + "loss": 0.3459, + "step": 7299 + }, + { + "epoch": 1.8360160965794767, + "grad_norm": 0.2986874580383301, + "learning_rate": 3.926355122262787e-06, + "loss": 0.3263, + "step": 7300 + }, + { + "epoch": 1.836267605633803, + "grad_norm": 0.33278465270996094, + "learning_rate": 3.9249260528223745e-06, + "loss": 0.3431, + "step": 7301 + }, + { + "epoch": 1.8365191146881288, + "grad_norm": 0.3187868893146515, + "learning_rate": 3.923497075455319e-06, + "loss": 0.3582, + "step": 7302 + }, + { + "epoch": 1.8367706237424546, + "grad_norm": 0.33840519189834595, + "learning_rate": 3.922068190284005e-06, + "loss": 0.363, + "step": 7303 + }, + { + "epoch": 1.8370221327967808, + "grad_norm": 0.32831573486328125, + "learning_rate": 3.920639397430806e-06, + "loss": 0.3694, + "step": 7304 + }, + { + "epoch": 1.8372736418511066, + "grad_norm": 0.3055534362792969, + "learning_rate": 3.91921069701809e-06, + "loss": 0.3389, + "step": 7305 + }, + { + "epoch": 1.8375251509054324, + "grad_norm": 0.31282302737236023, + "learning_rate": 3.917782089168218e-06, + "loss": 0.3161, + "step": 7306 + }, + { + "epoch": 1.8377766599597587, + "grad_norm": 0.350875586271286, + "learning_rate": 3.916353574003538e-06, + "loss": 0.3565, + "step": 7307 + }, + { + "epoch": 1.8380281690140845, + "grad_norm": 0.3590453267097473, + "learning_rate": 3.914925151646397e-06, + "loss": 0.3605, + "step": 7308 + }, + { + "epoch": 1.8382796780684103, + "grad_norm": 0.3325250446796417, + "learning_rate": 3.913496822219127e-06, + "loss": 0.3876, + "step": 7309 + }, + { + "epoch": 1.8385311871227366, + "grad_norm": 0.2975671589374542, + "learning_rate": 3.912068585844059e-06, + "loss": 0.3617, + "step": 7310 + }, + { + "epoch": 1.8387826961770624, + "grad_norm": 0.30971285700798035, + "learning_rate": 3.910640442643508e-06, + "loss": 0.3263, + "step": 7311 + }, + { + "epoch": 1.8390342052313882, + "grad_norm": 0.34714826941490173, + "learning_rate": 3.909212392739791e-06, + "loss": 0.333, + "step": 7312 + }, + { + "epoch": 1.8392857142857144, + "grad_norm": 0.34102630615234375, + "learning_rate": 3.907784436255205e-06, + "loss": 0.3454, + "step": 7313 + }, + { + "epoch": 1.8395372233400402, + "grad_norm": 0.31585681438446045, + "learning_rate": 3.906356573312052e-06, + "loss": 0.3302, + "step": 7314 + }, + { + "epoch": 1.8397887323943662, + "grad_norm": 0.32975485920906067, + "learning_rate": 3.904928804032615e-06, + "loss": 0.3249, + "step": 7315 + }, + { + "epoch": 1.8400402414486923, + "grad_norm": 0.3263443112373352, + "learning_rate": 3.903501128539175e-06, + "loss": 0.331, + "step": 7316 + }, + { + "epoch": 1.840291750503018, + "grad_norm": 0.33371224999427795, + "learning_rate": 3.902073546954006e-06, + "loss": 0.3425, + "step": 7317 + }, + { + "epoch": 1.840543259557344, + "grad_norm": 0.35445427894592285, + "learning_rate": 3.900646059399367e-06, + "loss": 0.3474, + "step": 7318 + }, + { + "epoch": 1.8407947686116701, + "grad_norm": 0.3176020085811615, + "learning_rate": 3.899218665997517e-06, + "loss": 0.3538, + "step": 7319 + }, + { + "epoch": 1.841046277665996, + "grad_norm": 0.34081193804740906, + "learning_rate": 3.8977913668707e-06, + "loss": 0.3394, + "step": 7320 + }, + { + "epoch": 1.841297786720322, + "grad_norm": 0.35305309295654297, + "learning_rate": 3.896364162141159e-06, + "loss": 0.3447, + "step": 7321 + }, + { + "epoch": 1.841549295774648, + "grad_norm": 0.2991912364959717, + "learning_rate": 3.894937051931122e-06, + "loss": 0.3299, + "step": 7322 + }, + { + "epoch": 1.8418008048289738, + "grad_norm": 0.32608288526535034, + "learning_rate": 3.8935100363628135e-06, + "loss": 0.3316, + "step": 7323 + }, + { + "epoch": 1.8420523138832998, + "grad_norm": 0.3336857855319977, + "learning_rate": 3.892083115558447e-06, + "loss": 0.3469, + "step": 7324 + }, + { + "epoch": 1.8423038229376258, + "grad_norm": 0.3458016514778137, + "learning_rate": 3.890656289640233e-06, + "loss": 0.3666, + "step": 7325 + }, + { + "epoch": 1.8425553319919517, + "grad_norm": 0.3222261667251587, + "learning_rate": 3.889229558730365e-06, + "loss": 0.3593, + "step": 7326 + }, + { + "epoch": 1.8428068410462777, + "grad_norm": 0.31532028317451477, + "learning_rate": 3.887802922951038e-06, + "loss": 0.3456, + "step": 7327 + }, + { + "epoch": 1.8430583501006037, + "grad_norm": 0.306510865688324, + "learning_rate": 3.886376382424433e-06, + "loss": 0.3506, + "step": 7328 + }, + { + "epoch": 1.8433098591549295, + "grad_norm": 0.3103088438510895, + "learning_rate": 3.884949937272724e-06, + "loss": 0.3657, + "step": 7329 + }, + { + "epoch": 1.8435613682092555, + "grad_norm": 0.3240601718425751, + "learning_rate": 3.883523587618077e-06, + "loss": 0.34, + "step": 7330 + }, + { + "epoch": 1.8438128772635816, + "grad_norm": 0.30873456597328186, + "learning_rate": 3.8820973335826494e-06, + "loss": 0.3478, + "step": 7331 + }, + { + "epoch": 1.8440643863179074, + "grad_norm": 0.34828251600265503, + "learning_rate": 3.880671175288592e-06, + "loss": 0.3613, + "step": 7332 + }, + { + "epoch": 1.8443158953722334, + "grad_norm": 0.30683434009552, + "learning_rate": 3.879245112858046e-06, + "loss": 0.3606, + "step": 7333 + }, + { + "epoch": 1.8445674044265594, + "grad_norm": 0.2935948073863983, + "learning_rate": 3.877819146413144e-06, + "loss": 0.3395, + "step": 7334 + }, + { + "epoch": 1.8448189134808852, + "grad_norm": 0.2853759527206421, + "learning_rate": 3.876393276076013e-06, + "loss": 0.3525, + "step": 7335 + }, + { + "epoch": 1.8450704225352113, + "grad_norm": 0.31282341480255127, + "learning_rate": 3.8749675019687684e-06, + "loss": 0.3469, + "step": 7336 + }, + { + "epoch": 1.8453219315895373, + "grad_norm": 0.2779003083705902, + "learning_rate": 3.873541824213518e-06, + "loss": 0.3664, + "step": 7337 + }, + { + "epoch": 1.845573440643863, + "grad_norm": 0.3223724961280823, + "learning_rate": 3.872116242932363e-06, + "loss": 0.3689, + "step": 7338 + }, + { + "epoch": 1.845824949698189, + "grad_norm": 0.3017749786376953, + "learning_rate": 3.870690758247399e-06, + "loss": 0.3387, + "step": 7339 + }, + { + "epoch": 1.8460764587525151, + "grad_norm": 0.3117123544216156, + "learning_rate": 3.869265370280702e-06, + "loss": 0.318, + "step": 7340 + }, + { + "epoch": 1.846327967806841, + "grad_norm": 0.29091158509254456, + "learning_rate": 3.867840079154356e-06, + "loss": 0.3381, + "step": 7341 + }, + { + "epoch": 1.846579476861167, + "grad_norm": 0.3203681707382202, + "learning_rate": 3.866414884990422e-06, + "loss": 0.3435, + "step": 7342 + }, + { + "epoch": 1.846830985915493, + "grad_norm": 0.3184633255004883, + "learning_rate": 3.864989787910964e-06, + "loss": 0.3623, + "step": 7343 + }, + { + "epoch": 1.8470824949698188, + "grad_norm": 0.2918001413345337, + "learning_rate": 3.863564788038027e-06, + "loss": 0.37, + "step": 7344 + }, + { + "epoch": 1.8473340040241448, + "grad_norm": 0.3107215166091919, + "learning_rate": 3.862139885493659e-06, + "loss": 0.33, + "step": 7345 + }, + { + "epoch": 1.8475855130784709, + "grad_norm": 0.3049178719520569, + "learning_rate": 3.860715080399889e-06, + "loss": 0.3381, + "step": 7346 + }, + { + "epoch": 1.8478370221327967, + "grad_norm": 0.30147770047187805, + "learning_rate": 3.859290372878748e-06, + "loss": 0.3285, + "step": 7347 + }, + { + "epoch": 1.8480885311871227, + "grad_norm": 0.2963356375694275, + "learning_rate": 3.857865763052247e-06, + "loss": 0.3157, + "step": 7348 + }, + { + "epoch": 1.8483400402414487, + "grad_norm": 0.29773834347724915, + "learning_rate": 3.856441251042399e-06, + "loss": 0.3511, + "step": 7349 + }, + { + "epoch": 1.8485915492957745, + "grad_norm": 0.3260302245616913, + "learning_rate": 3.8550168369712055e-06, + "loss": 0.3778, + "step": 7350 + }, + { + "epoch": 1.8488430583501008, + "grad_norm": 0.30447903275489807, + "learning_rate": 3.8535925209606554e-06, + "loss": 0.3527, + "step": 7351 + }, + { + "epoch": 1.8490945674044266, + "grad_norm": 0.3211756944656372, + "learning_rate": 3.852168303132735e-06, + "loss": 0.3288, + "step": 7352 + }, + { + "epoch": 1.8493460764587524, + "grad_norm": 0.3038700819015503, + "learning_rate": 3.8507441836094175e-06, + "loss": 0.3574, + "step": 7353 + }, + { + "epoch": 1.8495975855130786, + "grad_norm": 0.3078562915325165, + "learning_rate": 3.849320162512672e-06, + "loss": 0.3248, + "step": 7354 + }, + { + "epoch": 1.8498490945674044, + "grad_norm": 0.31693726778030396, + "learning_rate": 3.847896239964455e-06, + "loss": 0.3387, + "step": 7355 + }, + { + "epoch": 1.8501006036217302, + "grad_norm": 0.32334980368614197, + "learning_rate": 3.8464724160867195e-06, + "loss": 0.3423, + "step": 7356 + }, + { + "epoch": 1.8503521126760565, + "grad_norm": 0.3063230812549591, + "learning_rate": 3.845048691001402e-06, + "loss": 0.3663, + "step": 7357 + }, + { + "epoch": 1.8506036217303823, + "grad_norm": 0.30338233709335327, + "learning_rate": 3.8436250648304415e-06, + "loss": 0.3548, + "step": 7358 + }, + { + "epoch": 1.850855130784708, + "grad_norm": 0.31649091839790344, + "learning_rate": 3.842201537695758e-06, + "loss": 0.3481, + "step": 7359 + }, + { + "epoch": 1.8511066398390343, + "grad_norm": 0.31132972240448, + "learning_rate": 3.84077810971927e-06, + "loss": 0.3467, + "step": 7360 + }, + { + "epoch": 1.8513581488933601, + "grad_norm": 0.3053782284259796, + "learning_rate": 3.839354781022886e-06, + "loss": 0.3211, + "step": 7361 + }, + { + "epoch": 1.8516096579476862, + "grad_norm": 0.314434677362442, + "learning_rate": 3.8379315517285025e-06, + "loss": 0.3406, + "step": 7362 + }, + { + "epoch": 1.8518611670020122, + "grad_norm": 0.29468590021133423, + "learning_rate": 3.836508421958014e-06, + "loss": 0.3442, + "step": 7363 + }, + { + "epoch": 1.852112676056338, + "grad_norm": 0.3091891407966614, + "learning_rate": 3.8350853918332974e-06, + "loss": 0.324, + "step": 7364 + }, + { + "epoch": 1.852364185110664, + "grad_norm": 0.30145037174224854, + "learning_rate": 3.833662461476233e-06, + "loss": 0.3225, + "step": 7365 + }, + { + "epoch": 1.85261569416499, + "grad_norm": 0.31025686860084534, + "learning_rate": 3.8322396310086785e-06, + "loss": 0.3593, + "step": 7366 + }, + { + "epoch": 1.8528672032193159, + "grad_norm": 0.32085564732551575, + "learning_rate": 3.8308169005524964e-06, + "loss": 0.3509, + "step": 7367 + }, + { + "epoch": 1.8531187122736419, + "grad_norm": 0.3120383322238922, + "learning_rate": 3.829394270229531e-06, + "loss": 0.3462, + "step": 7368 + }, + { + "epoch": 1.853370221327968, + "grad_norm": 0.3061147928237915, + "learning_rate": 3.827971740161625e-06, + "loss": 0.365, + "step": 7369 + }, + { + "epoch": 1.8536217303822937, + "grad_norm": 0.2813125550746918, + "learning_rate": 3.826549310470605e-06, + "loss": 0.3114, + "step": 7370 + }, + { + "epoch": 1.8538732394366197, + "grad_norm": 0.30677247047424316, + "learning_rate": 3.825126981278296e-06, + "loss": 0.368, + "step": 7371 + }, + { + "epoch": 1.8541247484909458, + "grad_norm": 0.3084120452404022, + "learning_rate": 3.823704752706512e-06, + "loss": 0.3603, + "step": 7372 + }, + { + "epoch": 1.8543762575452716, + "grad_norm": 0.29872360825538635, + "learning_rate": 3.8222826248770555e-06, + "loss": 0.3452, + "step": 7373 + }, + { + "epoch": 1.8546277665995976, + "grad_norm": 0.32304060459136963, + "learning_rate": 3.820860597911726e-06, + "loss": 0.3371, + "step": 7374 + }, + { + "epoch": 1.8548792756539236, + "grad_norm": 0.3058139681816101, + "learning_rate": 3.819438671932308e-06, + "loss": 0.3539, + "step": 7375 + }, + { + "epoch": 1.8551307847082494, + "grad_norm": 0.3089655041694641, + "learning_rate": 3.818016847060585e-06, + "loss": 0.3519, + "step": 7376 + }, + { + "epoch": 1.8553822937625755, + "grad_norm": 0.2986527681350708, + "learning_rate": 3.816595123418322e-06, + "loss": 0.3401, + "step": 7377 + }, + { + "epoch": 1.8556338028169015, + "grad_norm": 0.33130231499671936, + "learning_rate": 3.815173501127285e-06, + "loss": 0.3656, + "step": 7378 + }, + { + "epoch": 1.8558853118712273, + "grad_norm": 0.328773558139801, + "learning_rate": 3.813751980309224e-06, + "loss": 0.3466, + "step": 7379 + }, + { + "epoch": 1.8561368209255533, + "grad_norm": 0.32407474517822266, + "learning_rate": 3.8123305610858863e-06, + "loss": 0.35, + "step": 7380 + }, + { + "epoch": 1.8563883299798793, + "grad_norm": 0.3152269721031189, + "learning_rate": 3.810909243579004e-06, + "loss": 0.3328, + "step": 7381 + }, + { + "epoch": 1.8566398390342052, + "grad_norm": 0.32206395268440247, + "learning_rate": 3.8094880279103063e-06, + "loss": 0.353, + "step": 7382 + }, + { + "epoch": 1.8568913480885312, + "grad_norm": 0.3166285455226898, + "learning_rate": 3.808066914201513e-06, + "loss": 0.3548, + "step": 7383 + }, + { + "epoch": 1.8571428571428572, + "grad_norm": 0.3051060438156128, + "learning_rate": 3.80664590257433e-06, + "loss": 0.3394, + "step": 7384 + }, + { + "epoch": 1.857394366197183, + "grad_norm": 0.33410540223121643, + "learning_rate": 3.8052249931504614e-06, + "loss": 0.3551, + "step": 7385 + }, + { + "epoch": 1.857645875251509, + "grad_norm": 0.2926395535469055, + "learning_rate": 3.8038041860515962e-06, + "loss": 0.3339, + "step": 7386 + }, + { + "epoch": 1.857897384305835, + "grad_norm": 0.3076823949813843, + "learning_rate": 3.802383481399421e-06, + "loss": 0.3417, + "step": 7387 + }, + { + "epoch": 1.8581488933601609, + "grad_norm": 0.3127098083496094, + "learning_rate": 3.8009628793156056e-06, + "loss": 0.3316, + "step": 7388 + }, + { + "epoch": 1.858400402414487, + "grad_norm": 0.3178151845932007, + "learning_rate": 3.799542379921821e-06, + "loss": 0.3539, + "step": 7389 + }, + { + "epoch": 1.858651911468813, + "grad_norm": 0.3181746006011963, + "learning_rate": 3.79812198333972e-06, + "loss": 0.3584, + "step": 7390 + }, + { + "epoch": 1.8589034205231387, + "grad_norm": 0.33825597167015076, + "learning_rate": 3.7967016896909524e-06, + "loss": 0.3672, + "step": 7391 + }, + { + "epoch": 1.8591549295774648, + "grad_norm": 0.2978416085243225, + "learning_rate": 3.7952814990971575e-06, + "loss": 0.3398, + "step": 7392 + }, + { + "epoch": 1.8594064386317908, + "grad_norm": 0.3217383623123169, + "learning_rate": 3.7938614116799655e-06, + "loss": 0.3486, + "step": 7393 + }, + { + "epoch": 1.8596579476861166, + "grad_norm": 0.34177547693252563, + "learning_rate": 3.792441427560998e-06, + "loss": 0.3472, + "step": 7394 + }, + { + "epoch": 1.8599094567404426, + "grad_norm": 0.30881890654563904, + "learning_rate": 3.791021546861868e-06, + "loss": 0.3487, + "step": 7395 + }, + { + "epoch": 1.8601609657947686, + "grad_norm": 0.28196999430656433, + "learning_rate": 3.7896017697041788e-06, + "loss": 0.3386, + "step": 7396 + }, + { + "epoch": 1.8604124748490944, + "grad_norm": 0.2953253984451294, + "learning_rate": 3.788182096209526e-06, + "loss": 0.353, + "step": 7397 + }, + { + "epoch": 1.8606639839034205, + "grad_norm": 0.31603649258613586, + "learning_rate": 3.7867625264994954e-06, + "loss": 0.357, + "step": 7398 + }, + { + "epoch": 1.8609154929577465, + "grad_norm": 0.2854761481285095, + "learning_rate": 3.7853430606956647e-06, + "loss": 0.3246, + "step": 7399 + }, + { + "epoch": 1.8611670020120723, + "grad_norm": 0.3070826530456543, + "learning_rate": 3.783923698919602e-06, + "loss": 0.3417, + "step": 7400 + }, + { + "epoch": 1.8614185110663986, + "grad_norm": 0.3086259365081787, + "learning_rate": 3.782504441292867e-06, + "loss": 0.3459, + "step": 7401 + }, + { + "epoch": 1.8616700201207244, + "grad_norm": 0.30796676874160767, + "learning_rate": 3.7810852879370084e-06, + "loss": 0.3557, + "step": 7402 + }, + { + "epoch": 1.8619215291750502, + "grad_norm": 0.3138282001018524, + "learning_rate": 3.7796662389735718e-06, + "loss": 0.3609, + "step": 7403 + }, + { + "epoch": 1.8621730382293764, + "grad_norm": 0.29625728726387024, + "learning_rate": 3.778247294524085e-06, + "loss": 0.339, + "step": 7404 + }, + { + "epoch": 1.8624245472837022, + "grad_norm": 0.29120931029319763, + "learning_rate": 3.7768284547100763e-06, + "loss": 0.3245, + "step": 7405 + }, + { + "epoch": 1.862676056338028, + "grad_norm": 0.30197152495384216, + "learning_rate": 3.7754097196530566e-06, + "loss": 0.3421, + "step": 7406 + }, + { + "epoch": 1.8629275653923543, + "grad_norm": 0.2998182773590088, + "learning_rate": 3.7739910894745345e-06, + "loss": 0.3357, + "step": 7407 + }, + { + "epoch": 1.86317907444668, + "grad_norm": 0.33410176634788513, + "learning_rate": 3.7725725642960047e-06, + "loss": 0.3829, + "step": 7408 + }, + { + "epoch": 1.8634305835010059, + "grad_norm": 0.31223413348197937, + "learning_rate": 3.771154144238958e-06, + "loss": 0.3493, + "step": 7409 + }, + { + "epoch": 1.8636820925553321, + "grad_norm": 0.32594525814056396, + "learning_rate": 3.7697358294248687e-06, + "loss": 0.3278, + "step": 7410 + }, + { + "epoch": 1.863933601609658, + "grad_norm": 0.3015677034854889, + "learning_rate": 3.7683176199752115e-06, + "loss": 0.3588, + "step": 7411 + }, + { + "epoch": 1.864185110663984, + "grad_norm": 0.33221563696861267, + "learning_rate": 3.7668995160114424e-06, + "loss": 0.368, + "step": 7412 + }, + { + "epoch": 1.86443661971831, + "grad_norm": 0.3092586100101471, + "learning_rate": 3.765481517655015e-06, + "loss": 0.3353, + "step": 7413 + }, + { + "epoch": 1.8646881287726358, + "grad_norm": 0.3521566092967987, + "learning_rate": 3.7640636250273754e-06, + "loss": 0.3502, + "step": 7414 + }, + { + "epoch": 1.8649396378269618, + "grad_norm": 0.33217549324035645, + "learning_rate": 3.7626458382499525e-06, + "loss": 0.3509, + "step": 7415 + }, + { + "epoch": 1.8651911468812878, + "grad_norm": 0.30001208186149597, + "learning_rate": 3.7612281574441744e-06, + "loss": 0.3576, + "step": 7416 + }, + { + "epoch": 1.8654426559356136, + "grad_norm": 0.32171934843063354, + "learning_rate": 3.7598105827314524e-06, + "loss": 0.3404, + "step": 7417 + }, + { + "epoch": 1.8656941649899397, + "grad_norm": 0.31113913655281067, + "learning_rate": 3.758393114233198e-06, + "loss": 0.3632, + "step": 7418 + }, + { + "epoch": 1.8659456740442657, + "grad_norm": 0.31468674540519714, + "learning_rate": 3.7569757520708034e-06, + "loss": 0.3457, + "step": 7419 + }, + { + "epoch": 1.8661971830985915, + "grad_norm": 0.29202166199684143, + "learning_rate": 3.7555584963656615e-06, + "loss": 0.3184, + "step": 7420 + }, + { + "epoch": 1.8664486921529175, + "grad_norm": 0.3300865888595581, + "learning_rate": 3.7541413472391474e-06, + "loss": 0.3406, + "step": 7421 + }, + { + "epoch": 1.8667002012072436, + "grad_norm": 0.32692527770996094, + "learning_rate": 3.752724304812635e-06, + "loss": 0.3436, + "step": 7422 + }, + { + "epoch": 1.8669517102615694, + "grad_norm": 0.3171963393688202, + "learning_rate": 3.7513073692074802e-06, + "loss": 0.3179, + "step": 7423 + }, + { + "epoch": 1.8672032193158954, + "grad_norm": 0.31962621212005615, + "learning_rate": 3.749890540545038e-06, + "loss": 0.3683, + "step": 7424 + }, + { + "epoch": 1.8674547283702214, + "grad_norm": 0.3360038995742798, + "learning_rate": 3.748473818946652e-06, + "loss": 0.3318, + "step": 7425 + }, + { + "epoch": 1.8677062374245472, + "grad_norm": 0.3103548586368561, + "learning_rate": 3.7470572045336518e-06, + "loss": 0.3393, + "step": 7426 + }, + { + "epoch": 1.8679577464788732, + "grad_norm": 0.3288777470588684, + "learning_rate": 3.745640697427366e-06, + "loss": 0.3435, + "step": 7427 + }, + { + "epoch": 1.8682092555331993, + "grad_norm": 0.3266128897666931, + "learning_rate": 3.744224297749105e-06, + "loss": 0.328, + "step": 7428 + }, + { + "epoch": 1.868460764587525, + "grad_norm": 0.2997075617313385, + "learning_rate": 3.7428080056201794e-06, + "loss": 0.3406, + "step": 7429 + }, + { + "epoch": 1.868712273641851, + "grad_norm": 0.3005223870277405, + "learning_rate": 3.7413918211618804e-06, + "loss": 0.3256, + "step": 7430 + }, + { + "epoch": 1.8689637826961771, + "grad_norm": 0.3092925548553467, + "learning_rate": 3.739975744495501e-06, + "loss": 0.3591, + "step": 7431 + }, + { + "epoch": 1.869215291750503, + "grad_norm": 0.31816989183425903, + "learning_rate": 3.738559775742313e-06, + "loss": 0.3276, + "step": 7432 + }, + { + "epoch": 1.869466800804829, + "grad_norm": 0.29660165309906006, + "learning_rate": 3.7371439150235923e-06, + "loss": 0.3396, + "step": 7433 + }, + { + "epoch": 1.869718309859155, + "grad_norm": 0.3243357241153717, + "learning_rate": 3.735728162460591e-06, + "loss": 0.3516, + "step": 7434 + }, + { + "epoch": 1.8699698189134808, + "grad_norm": 0.29959172010421753, + "learning_rate": 3.734312518174565e-06, + "loss": 0.3762, + "step": 7435 + }, + { + "epoch": 1.8702213279678068, + "grad_norm": 0.29366421699523926, + "learning_rate": 3.732896982286755e-06, + "loss": 0.3441, + "step": 7436 + }, + { + "epoch": 1.8704728370221329, + "grad_norm": 0.29330679774284363, + "learning_rate": 3.731481554918389e-06, + "loss": 0.3299, + "step": 7437 + }, + { + "epoch": 1.8707243460764587, + "grad_norm": 0.3113592565059662, + "learning_rate": 3.7300662361906946e-06, + "loss": 0.3302, + "step": 7438 + }, + { + "epoch": 1.8709758551307847, + "grad_norm": 0.31629472970962524, + "learning_rate": 3.728651026224881e-06, + "loss": 0.3492, + "step": 7439 + }, + { + "epoch": 1.8712273641851107, + "grad_norm": 0.3160915672779083, + "learning_rate": 3.727235925142154e-06, + "loss": 0.3628, + "step": 7440 + }, + { + "epoch": 1.8714788732394365, + "grad_norm": 0.2999648153781891, + "learning_rate": 3.725820933063707e-06, + "loss": 0.342, + "step": 7441 + }, + { + "epoch": 1.8717303822937625, + "grad_norm": 0.2990424931049347, + "learning_rate": 3.724406050110727e-06, + "loss": 0.3564, + "step": 7442 + }, + { + "epoch": 1.8719818913480886, + "grad_norm": 0.29632776975631714, + "learning_rate": 3.7229912764043874e-06, + "loss": 0.3448, + "step": 7443 + }, + { + "epoch": 1.8722334004024144, + "grad_norm": 0.33509159088134766, + "learning_rate": 3.7215766120658568e-06, + "loss": 0.3618, + "step": 7444 + }, + { + "epoch": 1.8724849094567404, + "grad_norm": 0.328966349363327, + "learning_rate": 3.720162057216291e-06, + "loss": 0.3296, + "step": 7445 + }, + { + "epoch": 1.8727364185110664, + "grad_norm": 0.293011337518692, + "learning_rate": 3.7187476119768383e-06, + "loss": 0.3452, + "step": 7446 + }, + { + "epoch": 1.8729879275653922, + "grad_norm": 0.3068729043006897, + "learning_rate": 3.7173332764686375e-06, + "loss": 0.3468, + "step": 7447 + }, + { + "epoch": 1.8732394366197183, + "grad_norm": 0.3060571849346161, + "learning_rate": 3.715919050812817e-06, + "loss": 0.3509, + "step": 7448 + }, + { + "epoch": 1.8734909456740443, + "grad_norm": 0.31499195098876953, + "learning_rate": 3.7145049351304973e-06, + "loss": 0.3422, + "step": 7449 + }, + { + "epoch": 1.87374245472837, + "grad_norm": 0.2951659858226776, + "learning_rate": 3.7130909295427873e-06, + "loss": 0.331, + "step": 7450 + }, + { + "epoch": 1.8739939637826963, + "grad_norm": 0.30359193682670593, + "learning_rate": 3.7116770341707893e-06, + "loss": 0.3479, + "step": 7451 + }, + { + "epoch": 1.8742454728370221, + "grad_norm": 0.28634950518608093, + "learning_rate": 3.710263249135593e-06, + "loss": 0.3286, + "step": 7452 + }, + { + "epoch": 1.874496981891348, + "grad_norm": 0.30430909991264343, + "learning_rate": 3.7088495745582803e-06, + "loss": 0.3244, + "step": 7453 + }, + { + "epoch": 1.8747484909456742, + "grad_norm": 0.3412688970565796, + "learning_rate": 3.7074360105599246e-06, + "loss": 0.3657, + "step": 7454 + }, + { + "epoch": 1.875, + "grad_norm": 0.3176121413707733, + "learning_rate": 3.706022557261588e-06, + "loss": 0.3633, + "step": 7455 + }, + { + "epoch": 1.8752515090543258, + "grad_norm": 0.2989017069339752, + "learning_rate": 3.704609214784325e-06, + "loss": 0.3417, + "step": 7456 + }, + { + "epoch": 1.875503018108652, + "grad_norm": 0.31169548630714417, + "learning_rate": 3.703195983249177e-06, + "loss": 0.3268, + "step": 7457 + }, + { + "epoch": 1.8757545271629779, + "grad_norm": 0.306718647480011, + "learning_rate": 3.7017828627771825e-06, + "loss": 0.3479, + "step": 7458 + }, + { + "epoch": 1.8760060362173037, + "grad_norm": 0.29229748249053955, + "learning_rate": 3.7003698534893623e-06, + "loss": 0.3337, + "step": 7459 + }, + { + "epoch": 1.87625754527163, + "grad_norm": 0.3112187683582306, + "learning_rate": 3.6989569555067357e-06, + "loss": 0.3334, + "step": 7460 + }, + { + "epoch": 1.8765090543259557, + "grad_norm": 0.2908962368965149, + "learning_rate": 3.6975441689503044e-06, + "loss": 0.3319, + "step": 7461 + }, + { + "epoch": 1.8767605633802817, + "grad_norm": 0.2986694872379303, + "learning_rate": 3.6961314939410674e-06, + "loss": 0.3649, + "step": 7462 + }, + { + "epoch": 1.8770120724346078, + "grad_norm": 0.2823163568973541, + "learning_rate": 3.694718930600012e-06, + "loss": 0.3357, + "step": 7463 + }, + { + "epoch": 1.8772635814889336, + "grad_norm": 0.28905975818634033, + "learning_rate": 3.693306479048114e-06, + "loss": 0.3284, + "step": 7464 + }, + { + "epoch": 1.8775150905432596, + "grad_norm": 0.3094346821308136, + "learning_rate": 3.6918941394063414e-06, + "loss": 0.3834, + "step": 7465 + }, + { + "epoch": 1.8777665995975856, + "grad_norm": 0.30385082960128784, + "learning_rate": 3.6904819117956526e-06, + "loss": 0.3556, + "step": 7466 + }, + { + "epoch": 1.8780181086519114, + "grad_norm": 0.31416597962379456, + "learning_rate": 3.6890697963369947e-06, + "loss": 0.3567, + "step": 7467 + }, + { + "epoch": 1.8782696177062375, + "grad_norm": 0.2962150275707245, + "learning_rate": 3.6876577931513076e-06, + "loss": 0.3439, + "step": 7468 + }, + { + "epoch": 1.8785211267605635, + "grad_norm": 0.30000972747802734, + "learning_rate": 3.686245902359522e-06, + "loss": 0.3308, + "step": 7469 + }, + { + "epoch": 1.8787726358148893, + "grad_norm": 0.31348156929016113, + "learning_rate": 3.6848341240825548e-06, + "loss": 0.3402, + "step": 7470 + }, + { + "epoch": 1.8790241448692153, + "grad_norm": 0.34603050351142883, + "learning_rate": 3.6834224584413183e-06, + "loss": 0.3684, + "step": 7471 + }, + { + "epoch": 1.8792756539235413, + "grad_norm": 0.3139626681804657, + "learning_rate": 3.68201090555671e-06, + "loss": 0.3524, + "step": 7472 + }, + { + "epoch": 1.8795271629778671, + "grad_norm": 0.3043857216835022, + "learning_rate": 3.6805994655496245e-06, + "loss": 0.3485, + "step": 7473 + }, + { + "epoch": 1.8797786720321932, + "grad_norm": 0.33150768280029297, + "learning_rate": 3.6791881385409383e-06, + "loss": 0.3492, + "step": 7474 + }, + { + "epoch": 1.8800301810865192, + "grad_norm": 0.30486994981765747, + "learning_rate": 3.6777769246515275e-06, + "loss": 0.3585, + "step": 7475 + }, + { + "epoch": 1.880281690140845, + "grad_norm": 0.2925189733505249, + "learning_rate": 3.6763658240022495e-06, + "loss": 0.31, + "step": 7476 + }, + { + "epoch": 1.880533199195171, + "grad_norm": 0.2866535484790802, + "learning_rate": 3.67495483671396e-06, + "loss": 0.3426, + "step": 7477 + }, + { + "epoch": 1.880784708249497, + "grad_norm": 0.32085758447647095, + "learning_rate": 3.6735439629074964e-06, + "loss": 0.3441, + "step": 7478 + }, + { + "epoch": 1.8810362173038229, + "grad_norm": 0.3063550293445587, + "learning_rate": 3.672133202703694e-06, + "loss": 0.3211, + "step": 7479 + }, + { + "epoch": 1.881287726358149, + "grad_norm": 0.32170259952545166, + "learning_rate": 3.670722556223379e-06, + "loss": 0.3347, + "step": 7480 + }, + { + "epoch": 1.881539235412475, + "grad_norm": 0.30713140964508057, + "learning_rate": 3.669312023587358e-06, + "loss": 0.3515, + "step": 7481 + }, + { + "epoch": 1.8817907444668007, + "grad_norm": 0.3304310142993927, + "learning_rate": 3.66790160491644e-06, + "loss": 0.3393, + "step": 7482 + }, + { + "epoch": 1.8820422535211268, + "grad_norm": 0.3182964026927948, + "learning_rate": 3.666491300331414e-06, + "loss": 0.3511, + "step": 7483 + }, + { + "epoch": 1.8822937625754528, + "grad_norm": 0.2859238088130951, + "learning_rate": 3.6650811099530673e-06, + "loss": 0.3363, + "step": 7484 + }, + { + "epoch": 1.8825452716297786, + "grad_norm": 0.3283844292163849, + "learning_rate": 3.663671033902171e-06, + "loss": 0.3292, + "step": 7485 + }, + { + "epoch": 1.8827967806841046, + "grad_norm": 0.31571048498153687, + "learning_rate": 3.662261072299492e-06, + "loss": 0.359, + "step": 7486 + }, + { + "epoch": 1.8830482897384306, + "grad_norm": 0.32221272587776184, + "learning_rate": 3.660851225265781e-06, + "loss": 0.3513, + "step": 7487 + }, + { + "epoch": 1.8832997987927564, + "grad_norm": 0.29501035809516907, + "learning_rate": 3.659441492921788e-06, + "loss": 0.3411, + "step": 7488 + }, + { + "epoch": 1.8835513078470825, + "grad_norm": 0.31255054473876953, + "learning_rate": 3.6580318753882414e-06, + "loss": 0.3369, + "step": 7489 + }, + { + "epoch": 1.8838028169014085, + "grad_norm": 0.3012523055076599, + "learning_rate": 3.6566223727858697e-06, + "loss": 0.333, + "step": 7490 + }, + { + "epoch": 1.8840543259557343, + "grad_norm": 0.36300086975097656, + "learning_rate": 3.65521298523539e-06, + "loss": 0.3814, + "step": 7491 + }, + { + "epoch": 1.8843058350100603, + "grad_norm": 0.3129633069038391, + "learning_rate": 3.653803712857503e-06, + "loss": 0.3338, + "step": 7492 + }, + { + "epoch": 1.8845573440643864, + "grad_norm": 0.29744789004325867, + "learning_rate": 3.652394555772908e-06, + "loss": 0.3582, + "step": 7493 + }, + { + "epoch": 1.8848088531187122, + "grad_norm": 0.30003800988197327, + "learning_rate": 3.650985514102287e-06, + "loss": 0.3399, + "step": 7494 + }, + { + "epoch": 1.8850603621730382, + "grad_norm": 0.2842726707458496, + "learning_rate": 3.6495765879663194e-06, + "loss": 0.3373, + "step": 7495 + }, + { + "epoch": 1.8853118712273642, + "grad_norm": 0.2723131775856018, + "learning_rate": 3.6481677774856666e-06, + "loss": 0.3369, + "step": 7496 + }, + { + "epoch": 1.88556338028169, + "grad_norm": 0.3011725842952728, + "learning_rate": 3.6467590827809885e-06, + "loss": 0.3421, + "step": 7497 + }, + { + "epoch": 1.885814889336016, + "grad_norm": 0.31559064984321594, + "learning_rate": 3.6453505039729274e-06, + "loss": 0.3782, + "step": 7498 + }, + { + "epoch": 1.886066398390342, + "grad_norm": 0.29440516233444214, + "learning_rate": 3.6439420411821226e-06, + "loss": 0.3317, + "step": 7499 + }, + { + "epoch": 1.8863179074446679, + "grad_norm": 0.2825719714164734, + "learning_rate": 3.642533694529197e-06, + "loss": 0.3552, + "step": 7500 + }, + { + "epoch": 1.8865694164989941, + "grad_norm": 0.3134790360927582, + "learning_rate": 3.641125464134768e-06, + "loss": 0.3633, + "step": 7501 + }, + { + "epoch": 1.88682092555332, + "grad_norm": 0.33232244849205017, + "learning_rate": 3.6397173501194436e-06, + "loss": 0.3448, + "step": 7502 + }, + { + "epoch": 1.8870724346076457, + "grad_norm": 0.31418442726135254, + "learning_rate": 3.638309352603816e-06, + "loss": 0.3756, + "step": 7503 + }, + { + "epoch": 1.887323943661972, + "grad_norm": 0.31419721245765686, + "learning_rate": 3.6369014717084768e-06, + "loss": 0.3438, + "step": 7504 + }, + { + "epoch": 1.8875754527162978, + "grad_norm": 0.2999892234802246, + "learning_rate": 3.635493707553996e-06, + "loss": 0.3494, + "step": 7505 + }, + { + "epoch": 1.8878269617706236, + "grad_norm": 0.2898130714893341, + "learning_rate": 3.634086060260945e-06, + "loss": 0.3625, + "step": 7506 + }, + { + "epoch": 1.8880784708249498, + "grad_norm": 0.30754363536834717, + "learning_rate": 3.6326785299498758e-06, + "loss": 0.3224, + "step": 7507 + }, + { + "epoch": 1.8883299798792756, + "grad_norm": 0.3098406493663788, + "learning_rate": 3.6312711167413394e-06, + "loss": 0.3585, + "step": 7508 + }, + { + "epoch": 1.8885814889336014, + "grad_norm": 0.29480767250061035, + "learning_rate": 3.629863820755866e-06, + "loss": 0.3193, + "step": 7509 + }, + { + "epoch": 1.8888329979879277, + "grad_norm": 0.3022323548793793, + "learning_rate": 3.628456642113988e-06, + "loss": 0.3516, + "step": 7510 + }, + { + "epoch": 1.8890845070422535, + "grad_norm": 0.2908898591995239, + "learning_rate": 3.627049580936215e-06, + "loss": 0.3337, + "step": 7511 + }, + { + "epoch": 1.8893360160965795, + "grad_norm": 0.3488399088382721, + "learning_rate": 3.6256426373430577e-06, + "loss": 0.3307, + "step": 7512 + }, + { + "epoch": 1.8895875251509056, + "grad_norm": 0.33839771151542664, + "learning_rate": 3.6242358114550104e-06, + "loss": 0.3452, + "step": 7513 + }, + { + "epoch": 1.8898390342052314, + "grad_norm": 0.31028321385383606, + "learning_rate": 3.6228291033925596e-06, + "loss": 0.3599, + "step": 7514 + }, + { + "epoch": 1.8900905432595574, + "grad_norm": 0.3041622042655945, + "learning_rate": 3.6214225132761806e-06, + "loss": 0.3322, + "step": 7515 + }, + { + "epoch": 1.8903420523138834, + "grad_norm": 0.2997733950614929, + "learning_rate": 3.620016041226338e-06, + "loss": 0.3191, + "step": 7516 + }, + { + "epoch": 1.8905935613682092, + "grad_norm": 0.3369154632091522, + "learning_rate": 3.618609687363489e-06, + "loss": 0.35, + "step": 7517 + }, + { + "epoch": 1.8908450704225352, + "grad_norm": 0.3129628896713257, + "learning_rate": 3.6172034518080785e-06, + "loss": 0.3106, + "step": 7518 + }, + { + "epoch": 1.8910965794768613, + "grad_norm": 0.2884010672569275, + "learning_rate": 3.615797334680541e-06, + "loss": 0.3325, + "step": 7519 + }, + { + "epoch": 1.891348088531187, + "grad_norm": 0.2941190004348755, + "learning_rate": 3.6143913361013026e-06, + "loss": 0.3346, + "step": 7520 + }, + { + "epoch": 1.891599597585513, + "grad_norm": 0.314097136259079, + "learning_rate": 3.6129854561907786e-06, + "loss": 0.3595, + "step": 7521 + }, + { + "epoch": 1.8918511066398391, + "grad_norm": 0.30389559268951416, + "learning_rate": 3.611579695069372e-06, + "loss": 0.3565, + "step": 7522 + }, + { + "epoch": 1.892102615694165, + "grad_norm": 0.3276069760322571, + "learning_rate": 3.610174052857478e-06, + "loss": 0.3509, + "step": 7523 + }, + { + "epoch": 1.892354124748491, + "grad_norm": 0.3453889787197113, + "learning_rate": 3.608768529675484e-06, + "loss": 0.3276, + "step": 7524 + }, + { + "epoch": 1.892605633802817, + "grad_norm": 0.28621137142181396, + "learning_rate": 3.607363125643759e-06, + "loss": 0.3435, + "step": 7525 + }, + { + "epoch": 1.8928571428571428, + "grad_norm": 0.33448565006256104, + "learning_rate": 3.6059578408826734e-06, + "loss": 0.346, + "step": 7526 + }, + { + "epoch": 1.8931086519114688, + "grad_norm": 0.3352438509464264, + "learning_rate": 3.604552675512574e-06, + "loss": 0.3487, + "step": 7527 + }, + { + "epoch": 1.8933601609657948, + "grad_norm": 0.3266422152519226, + "learning_rate": 3.6031476296538113e-06, + "loss": 0.341, + "step": 7528 + }, + { + "epoch": 1.8936116700201207, + "grad_norm": 0.2940256595611572, + "learning_rate": 3.601742703426713e-06, + "loss": 0.3504, + "step": 7529 + }, + { + "epoch": 1.8938631790744467, + "grad_norm": 0.34617501497268677, + "learning_rate": 3.6003378969516067e-06, + "loss": 0.3384, + "step": 7530 + }, + { + "epoch": 1.8941146881287727, + "grad_norm": 0.33195579051971436, + "learning_rate": 3.5989332103488013e-06, + "loss": 0.3491, + "step": 7531 + }, + { + "epoch": 1.8943661971830985, + "grad_norm": 0.3452162742614746, + "learning_rate": 3.5975286437386014e-06, + "loss": 0.3862, + "step": 7532 + }, + { + "epoch": 1.8946177062374245, + "grad_norm": 0.2940990626811981, + "learning_rate": 3.5961241972413012e-06, + "loss": 0.3175, + "step": 7533 + }, + { + "epoch": 1.8948692152917506, + "grad_norm": 0.30977126955986023, + "learning_rate": 3.594719870977179e-06, + "loss": 0.3298, + "step": 7534 + }, + { + "epoch": 1.8951207243460764, + "grad_norm": 0.2974280118942261, + "learning_rate": 3.5933156650665102e-06, + "loss": 0.3208, + "step": 7535 + }, + { + "epoch": 1.8953722334004024, + "grad_norm": 0.323830246925354, + "learning_rate": 3.591911579629553e-06, + "loss": 0.363, + "step": 7536 + }, + { + "epoch": 1.8956237424547284, + "grad_norm": 0.308124840259552, + "learning_rate": 3.590507614786561e-06, + "loss": 0.3466, + "step": 7537 + }, + { + "epoch": 1.8958752515090542, + "grad_norm": 0.2933880090713501, + "learning_rate": 3.5891037706577736e-06, + "loss": 0.3256, + "step": 7538 + }, + { + "epoch": 1.8961267605633803, + "grad_norm": 0.30009788274765015, + "learning_rate": 3.5877000473634227e-06, + "loss": 0.3625, + "step": 7539 + }, + { + "epoch": 1.8963782696177063, + "grad_norm": 0.3329128324985504, + "learning_rate": 3.586296445023726e-06, + "loss": 0.3725, + "step": 7540 + }, + { + "epoch": 1.896629778672032, + "grad_norm": 0.2923327088356018, + "learning_rate": 3.584892963758896e-06, + "loss": 0.3407, + "step": 7541 + }, + { + "epoch": 1.8968812877263581, + "grad_norm": 0.336452454328537, + "learning_rate": 3.583489603689129e-06, + "loss": 0.3387, + "step": 7542 + }, + { + "epoch": 1.8971327967806841, + "grad_norm": 0.3008471131324768, + "learning_rate": 3.5820863649346162e-06, + "loss": 0.3415, + "step": 7543 + }, + { + "epoch": 1.89738430583501, + "grad_norm": 0.3129260241985321, + "learning_rate": 3.5806832476155373e-06, + "loss": 0.365, + "step": 7544 + }, + { + "epoch": 1.897635814889336, + "grad_norm": 0.3290889263153076, + "learning_rate": 3.579280251852057e-06, + "loss": 0.3611, + "step": 7545 + }, + { + "epoch": 1.897887323943662, + "grad_norm": 0.2925894856452942, + "learning_rate": 3.577877377764337e-06, + "loss": 0.3765, + "step": 7546 + }, + { + "epoch": 1.8981388329979878, + "grad_norm": 0.3327069878578186, + "learning_rate": 3.5764746254725213e-06, + "loss": 0.3524, + "step": 7547 + }, + { + "epoch": 1.8983903420523138, + "grad_norm": 0.3372514545917511, + "learning_rate": 3.5750719950967507e-06, + "loss": 0.365, + "step": 7548 + }, + { + "epoch": 1.8986418511066399, + "grad_norm": 0.2978571057319641, + "learning_rate": 3.5736694867571465e-06, + "loss": 0.3321, + "step": 7549 + }, + { + "epoch": 1.8988933601609657, + "grad_norm": 0.33006635308265686, + "learning_rate": 3.5722671005738303e-06, + "loss": 0.3283, + "step": 7550 + }, + { + "epoch": 1.899144869215292, + "grad_norm": 0.32429298758506775, + "learning_rate": 3.570864836666903e-06, + "loss": 0.3622, + "step": 7551 + }, + { + "epoch": 1.8993963782696177, + "grad_norm": 0.3145343065261841, + "learning_rate": 3.5694626951564637e-06, + "loss": 0.3564, + "step": 7552 + }, + { + "epoch": 1.8996478873239435, + "grad_norm": 0.30636870861053467, + "learning_rate": 3.5680606761625925e-06, + "loss": 0.3545, + "step": 7553 + }, + { + "epoch": 1.8998993963782698, + "grad_norm": 0.27128949761390686, + "learning_rate": 3.566658779805367e-06, + "loss": 0.351, + "step": 7554 + }, + { + "epoch": 1.9001509054325956, + "grad_norm": 0.318464070558548, + "learning_rate": 3.565257006204852e-06, + "loss": 0.3615, + "step": 7555 + }, + { + "epoch": 1.9004024144869214, + "grad_norm": 0.3404373824596405, + "learning_rate": 3.5638553554810963e-06, + "loss": 0.366, + "step": 7556 + }, + { + "epoch": 1.9006539235412476, + "grad_norm": 0.29662925004959106, + "learning_rate": 3.5624538277541474e-06, + "loss": 0.3436, + "step": 7557 + }, + { + "epoch": 1.9009054325955734, + "grad_norm": 0.2770460546016693, + "learning_rate": 3.5610524231440324e-06, + "loss": 0.3663, + "step": 7558 + }, + { + "epoch": 1.9011569416498992, + "grad_norm": 0.3167244493961334, + "learning_rate": 3.559651141770778e-06, + "loss": 0.3376, + "step": 7559 + }, + { + "epoch": 1.9014084507042255, + "grad_norm": 0.30453893542289734, + "learning_rate": 3.5582499837543894e-06, + "loss": 0.3545, + "step": 7560 + }, + { + "epoch": 1.9016599597585513, + "grad_norm": 0.3127380907535553, + "learning_rate": 3.5568489492148728e-06, + "loss": 0.3502, + "step": 7561 + }, + { + "epoch": 1.9019114688128773, + "grad_norm": 0.3051506280899048, + "learning_rate": 3.5554480382722134e-06, + "loss": 0.3508, + "step": 7562 + }, + { + "epoch": 1.9021629778672033, + "grad_norm": 0.28955167531967163, + "learning_rate": 3.5540472510463947e-06, + "loss": 0.3445, + "step": 7563 + }, + { + "epoch": 1.9024144869215291, + "grad_norm": 0.30066999793052673, + "learning_rate": 3.552646587657381e-06, + "loss": 0.3658, + "step": 7564 + }, + { + "epoch": 1.9026659959758552, + "grad_norm": 0.2882605791091919, + "learning_rate": 3.551246048225132e-06, + "loss": 0.362, + "step": 7565 + }, + { + "epoch": 1.9029175050301812, + "grad_norm": 0.29483652114868164, + "learning_rate": 3.549845632869598e-06, + "loss": 0.3343, + "step": 7566 + }, + { + "epoch": 1.903169014084507, + "grad_norm": 0.28981560468673706, + "learning_rate": 3.5484453417107113e-06, + "loss": 0.3553, + "step": 7567 + }, + { + "epoch": 1.903420523138833, + "grad_norm": 0.321000337600708, + "learning_rate": 3.547045174868402e-06, + "loss": 0.3451, + "step": 7568 + }, + { + "epoch": 1.903672032193159, + "grad_norm": 0.29945504665374756, + "learning_rate": 3.545645132462582e-06, + "loss": 0.3361, + "step": 7569 + }, + { + "epoch": 1.9039235412474849, + "grad_norm": 0.31492796540260315, + "learning_rate": 3.54424521461316e-06, + "loss": 0.3388, + "step": 7570 + }, + { + "epoch": 1.904175050301811, + "grad_norm": 0.28294259309768677, + "learning_rate": 3.5428454214400265e-06, + "loss": 0.3443, + "step": 7571 + }, + { + "epoch": 1.904426559356137, + "grad_norm": 0.3141169846057892, + "learning_rate": 3.541445753063068e-06, + "loss": 0.3318, + "step": 7572 + }, + { + "epoch": 1.9046780684104627, + "grad_norm": 0.2912863492965698, + "learning_rate": 3.5400462096021547e-06, + "loss": 0.3491, + "step": 7573 + }, + { + "epoch": 1.9049295774647887, + "grad_norm": 0.32144850492477417, + "learning_rate": 3.5386467911771518e-06, + "loss": 0.3553, + "step": 7574 + }, + { + "epoch": 1.9051810865191148, + "grad_norm": 0.33859172463417053, + "learning_rate": 3.5372474979079067e-06, + "loss": 0.3514, + "step": 7575 + }, + { + "epoch": 1.9054325955734406, + "grad_norm": 0.3066120743751526, + "learning_rate": 3.5358483299142645e-06, + "loss": 0.3434, + "step": 7576 + }, + { + "epoch": 1.9056841046277666, + "grad_norm": 0.35578417778015137, + "learning_rate": 3.534449287316052e-06, + "loss": 0.3699, + "step": 7577 + }, + { + "epoch": 1.9059356136820926, + "grad_norm": 0.30244138836860657, + "learning_rate": 3.5330503702330898e-06, + "loss": 0.3566, + "step": 7578 + }, + { + "epoch": 1.9061871227364184, + "grad_norm": 0.2991444170475006, + "learning_rate": 3.5316515787851867e-06, + "loss": 0.3412, + "step": 7579 + }, + { + "epoch": 1.9064386317907445, + "grad_norm": 0.29319190979003906, + "learning_rate": 3.53025291309214e-06, + "loss": 0.343, + "step": 7580 + }, + { + "epoch": 1.9066901408450705, + "grad_norm": 0.3237687945365906, + "learning_rate": 3.528854373273736e-06, + "loss": 0.339, + "step": 7581 + }, + { + "epoch": 1.9069416498993963, + "grad_norm": 0.31021758913993835, + "learning_rate": 3.5274559594497513e-06, + "loss": 0.3483, + "step": 7582 + }, + { + "epoch": 1.9071931589537223, + "grad_norm": 0.3291212320327759, + "learning_rate": 3.5260576717399518e-06, + "loss": 0.3483, + "step": 7583 + }, + { + "epoch": 1.9074446680080483, + "grad_norm": 0.2789864242076874, + "learning_rate": 3.5246595102640924e-06, + "loss": 0.3251, + "step": 7584 + }, + { + "epoch": 1.9076961770623742, + "grad_norm": 0.29820239543914795, + "learning_rate": 3.523261475141916e-06, + "loss": 0.3354, + "step": 7585 + }, + { + "epoch": 1.9079476861167002, + "grad_norm": 0.33340615034103394, + "learning_rate": 3.5218635664931556e-06, + "loss": 0.352, + "step": 7586 + }, + { + "epoch": 1.9081991951710262, + "grad_norm": 0.3217755854129791, + "learning_rate": 3.5204657844375323e-06, + "loss": 0.3632, + "step": 7587 + }, + { + "epoch": 1.908450704225352, + "grad_norm": 0.28968545794487, + "learning_rate": 3.5190681290947603e-06, + "loss": 0.3159, + "step": 7588 + }, + { + "epoch": 1.908702213279678, + "grad_norm": 0.2866925895214081, + "learning_rate": 3.517670600584537e-06, + "loss": 0.3443, + "step": 7589 + }, + { + "epoch": 1.908953722334004, + "grad_norm": 0.30135875940322876, + "learning_rate": 3.5162731990265553e-06, + "loss": 0.3629, + "step": 7590 + }, + { + "epoch": 1.9092052313883299, + "grad_norm": 0.28496021032333374, + "learning_rate": 3.5148759245404895e-06, + "loss": 0.3375, + "step": 7591 + }, + { + "epoch": 1.909456740442656, + "grad_norm": 0.29872947931289673, + "learning_rate": 3.513478777246012e-06, + "loss": 0.32, + "step": 7592 + }, + { + "epoch": 1.909708249496982, + "grad_norm": 0.3028433620929718, + "learning_rate": 3.5120817572627763e-06, + "loss": 0.3223, + "step": 7593 + }, + { + "epoch": 1.9099597585513077, + "grad_norm": 0.32273510098457336, + "learning_rate": 3.510684864710431e-06, + "loss": 0.3305, + "step": 7594 + }, + { + "epoch": 1.9102112676056338, + "grad_norm": 0.30309391021728516, + "learning_rate": 3.5092880997086076e-06, + "loss": 0.3564, + "step": 7595 + }, + { + "epoch": 1.9104627766599598, + "grad_norm": 0.3080253601074219, + "learning_rate": 3.5078914623769357e-06, + "loss": 0.3443, + "step": 7596 + }, + { + "epoch": 1.9107142857142856, + "grad_norm": 0.299723356962204, + "learning_rate": 3.506494952835022e-06, + "loss": 0.3494, + "step": 7597 + }, + { + "epoch": 1.9109657947686118, + "grad_norm": 0.3054339587688446, + "learning_rate": 3.505098571202473e-06, + "loss": 0.3533, + "step": 7598 + }, + { + "epoch": 1.9112173038229376, + "grad_norm": 0.2950621545314789, + "learning_rate": 3.5037023175988818e-06, + "loss": 0.3482, + "step": 7599 + }, + { + "epoch": 1.9114688128772634, + "grad_norm": 0.279410183429718, + "learning_rate": 3.502306192143824e-06, + "loss": 0.346, + "step": 7600 + }, + { + "epoch": 1.9117203219315897, + "grad_norm": 0.3337315320968628, + "learning_rate": 3.500910194956873e-06, + "loss": 0.3456, + "step": 7601 + }, + { + "epoch": 1.9119718309859155, + "grad_norm": 0.3044759929180145, + "learning_rate": 3.4995143261575835e-06, + "loss": 0.3345, + "step": 7602 + }, + { + "epoch": 1.9122233400402413, + "grad_norm": 0.33510109782218933, + "learning_rate": 3.4981185858655076e-06, + "loss": 0.3427, + "step": 7603 + }, + { + "epoch": 1.9124748490945676, + "grad_norm": 0.30306974053382874, + "learning_rate": 3.4967229742001764e-06, + "loss": 0.3364, + "step": 7604 + }, + { + "epoch": 1.9127263581488934, + "grad_norm": 0.31048643589019775, + "learning_rate": 3.4953274912811198e-06, + "loss": 0.3343, + "step": 7605 + }, + { + "epoch": 1.9129778672032192, + "grad_norm": 0.30860239267349243, + "learning_rate": 3.493932137227849e-06, + "loss": 0.3387, + "step": 7606 + }, + { + "epoch": 1.9132293762575454, + "grad_norm": 0.33586663007736206, + "learning_rate": 3.4925369121598708e-06, + "loss": 0.3428, + "step": 7607 + }, + { + "epoch": 1.9134808853118712, + "grad_norm": 0.2864849269390106, + "learning_rate": 3.4911418161966726e-06, + "loss": 0.3449, + "step": 7608 + }, + { + "epoch": 1.913732394366197, + "grad_norm": 0.3048544228076935, + "learning_rate": 3.489746849457739e-06, + "loss": 0.349, + "step": 7609 + }, + { + "epoch": 1.9139839034205233, + "grad_norm": 0.3124396502971649, + "learning_rate": 3.4883520120625414e-06, + "loss": 0.3296, + "step": 7610 + }, + { + "epoch": 1.914235412474849, + "grad_norm": 0.32257482409477234, + "learning_rate": 3.486957304130535e-06, + "loss": 0.3414, + "step": 7611 + }, + { + "epoch": 1.914486921529175, + "grad_norm": 0.30354657769203186, + "learning_rate": 3.4855627257811727e-06, + "loss": 0.3571, + "step": 7612 + }, + { + "epoch": 1.9147384305835011, + "grad_norm": 0.2927219867706299, + "learning_rate": 3.484168277133886e-06, + "loss": 0.3492, + "step": 7613 + }, + { + "epoch": 1.914989939637827, + "grad_norm": 0.3123704195022583, + "learning_rate": 3.4827739583081054e-06, + "loss": 0.3536, + "step": 7614 + }, + { + "epoch": 1.915241448692153, + "grad_norm": 0.3322194814682007, + "learning_rate": 3.481379769423242e-06, + "loss": 0.366, + "step": 7615 + }, + { + "epoch": 1.915492957746479, + "grad_norm": 0.3077843487262726, + "learning_rate": 3.479985710598702e-06, + "loss": 0.3643, + "step": 7616 + }, + { + "epoch": 1.9157444668008048, + "grad_norm": 0.33019596338272095, + "learning_rate": 3.4785917819538757e-06, + "loss": 0.3169, + "step": 7617 + }, + { + "epoch": 1.9159959758551308, + "grad_norm": 0.29199883341789246, + "learning_rate": 3.477197983608147e-06, + "loss": 0.3315, + "step": 7618 + }, + { + "epoch": 1.9162474849094568, + "grad_norm": 0.29495760798454285, + "learning_rate": 3.475804315680882e-06, + "loss": 0.3453, + "step": 7619 + }, + { + "epoch": 1.9164989939637826, + "grad_norm": 0.2928338646888733, + "learning_rate": 3.4744107782914425e-06, + "loss": 0.3661, + "step": 7620 + }, + { + "epoch": 1.9167505030181087, + "grad_norm": 0.3239118754863739, + "learning_rate": 3.4730173715591773e-06, + "loss": 0.3537, + "step": 7621 + }, + { + "epoch": 1.9170020120724347, + "grad_norm": 0.29765039682388306, + "learning_rate": 3.4716240956034197e-06, + "loss": 0.3481, + "step": 7622 + }, + { + "epoch": 1.9172535211267605, + "grad_norm": 0.2840639054775238, + "learning_rate": 3.4702309505434996e-06, + "loss": 0.3504, + "step": 7623 + }, + { + "epoch": 1.9175050301810865, + "grad_norm": 0.3031284511089325, + "learning_rate": 3.468837936498725e-06, + "loss": 0.3553, + "step": 7624 + }, + { + "epoch": 1.9177565392354126, + "grad_norm": 0.31842097640037537, + "learning_rate": 3.4674450535884053e-06, + "loss": 0.3413, + "step": 7625 + }, + { + "epoch": 1.9180080482897384, + "grad_norm": 0.3358910083770752, + "learning_rate": 3.4660523019318267e-06, + "loss": 0.3566, + "step": 7626 + }, + { + "epoch": 1.9182595573440644, + "grad_norm": 0.33151108026504517, + "learning_rate": 3.4646596816482743e-06, + "loss": 0.3433, + "step": 7627 + }, + { + "epoch": 1.9185110663983904, + "grad_norm": 0.31569045782089233, + "learning_rate": 3.4632671928570126e-06, + "loss": 0.3264, + "step": 7628 + }, + { + "epoch": 1.9187625754527162, + "grad_norm": 0.30424806475639343, + "learning_rate": 3.4618748356773046e-06, + "loss": 0.3341, + "step": 7629 + }, + { + "epoch": 1.9190140845070423, + "grad_norm": 0.29679644107818604, + "learning_rate": 3.460482610228392e-06, + "loss": 0.3414, + "step": 7630 + }, + { + "epoch": 1.9192655935613683, + "grad_norm": 0.3199297785758972, + "learning_rate": 3.459090516629514e-06, + "loss": 0.3571, + "step": 7631 + }, + { + "epoch": 1.919517102615694, + "grad_norm": 0.3305947482585907, + "learning_rate": 3.457698554999893e-06, + "loss": 0.353, + "step": 7632 + }, + { + "epoch": 1.91976861167002, + "grad_norm": 0.31420543789863586, + "learning_rate": 3.4563067254587424e-06, + "loss": 0.3556, + "step": 7633 + }, + { + "epoch": 1.9200201207243461, + "grad_norm": 0.2856319844722748, + "learning_rate": 3.4549150281252635e-06, + "loss": 0.3611, + "step": 7634 + }, + { + "epoch": 1.920271629778672, + "grad_norm": 0.30721721053123474, + "learning_rate": 3.4535234631186466e-06, + "loss": 0.324, + "step": 7635 + }, + { + "epoch": 1.920523138832998, + "grad_norm": 0.3075389266014099, + "learning_rate": 3.4521320305580697e-06, + "loss": 0.3438, + "step": 7636 + }, + { + "epoch": 1.920774647887324, + "grad_norm": 0.32247766852378845, + "learning_rate": 3.4507407305627018e-06, + "loss": 0.3569, + "step": 7637 + }, + { + "epoch": 1.9210261569416498, + "grad_norm": 0.32027557492256165, + "learning_rate": 3.449349563251697e-06, + "loss": 0.3591, + "step": 7638 + }, + { + "epoch": 1.9212776659959758, + "grad_norm": 0.292683482170105, + "learning_rate": 3.4479585287442025e-06, + "loss": 0.3264, + "step": 7639 + }, + { + "epoch": 1.9215291750503019, + "grad_norm": 0.3263826072216034, + "learning_rate": 3.4465676271593495e-06, + "loss": 0.3507, + "step": 7640 + }, + { + "epoch": 1.9217806841046277, + "grad_norm": 0.3174888491630554, + "learning_rate": 3.445176858616262e-06, + "loss": 0.346, + "step": 7641 + }, + { + "epoch": 1.9220321931589537, + "grad_norm": 0.301901638507843, + "learning_rate": 3.443786223234048e-06, + "loss": 0.357, + "step": 7642 + }, + { + "epoch": 1.9222837022132797, + "grad_norm": 0.3539835810661316, + "learning_rate": 3.4423957211318092e-06, + "loss": 0.3506, + "step": 7643 + }, + { + "epoch": 1.9225352112676055, + "grad_norm": 0.3143352270126343, + "learning_rate": 3.441005352428633e-06, + "loss": 0.3439, + "step": 7644 + }, + { + "epoch": 1.9227867203219315, + "grad_norm": 0.30168989300727844, + "learning_rate": 3.4396151172435954e-06, + "loss": 0.3677, + "step": 7645 + }, + { + "epoch": 1.9230382293762576, + "grad_norm": 0.3380453884601593, + "learning_rate": 3.4382250156957607e-06, + "loss": 0.3338, + "step": 7646 + }, + { + "epoch": 1.9232897384305834, + "grad_norm": 0.2964382767677307, + "learning_rate": 3.4368350479041836e-06, + "loss": 0.3358, + "step": 7647 + }, + { + "epoch": 1.9235412474849096, + "grad_norm": 0.2817126512527466, + "learning_rate": 3.4354452139879044e-06, + "loss": 0.3568, + "step": 7648 + }, + { + "epoch": 1.9237927565392354, + "grad_norm": 0.32317665219306946, + "learning_rate": 3.434055514065956e-06, + "loss": 0.358, + "step": 7649 + }, + { + "epoch": 1.9240442655935612, + "grad_norm": 0.3008497953414917, + "learning_rate": 3.4326659482573556e-06, + "loss": 0.3482, + "step": 7650 + }, + { + "epoch": 1.9242957746478875, + "grad_norm": 0.29205819964408875, + "learning_rate": 3.431276516681112e-06, + "loss": 0.3532, + "step": 7651 + }, + { + "epoch": 1.9245472837022133, + "grad_norm": 0.3241797983646393, + "learning_rate": 3.4298872194562203e-06, + "loss": 0.3549, + "step": 7652 + }, + { + "epoch": 1.924798792756539, + "grad_norm": 0.31223297119140625, + "learning_rate": 3.428498056701665e-06, + "loss": 0.3619, + "step": 7653 + }, + { + "epoch": 1.9250503018108653, + "grad_norm": 0.3212898075580597, + "learning_rate": 3.4271090285364216e-06, + "loss": 0.367, + "step": 7654 + }, + { + "epoch": 1.9253018108651911, + "grad_norm": 0.29246649146080017, + "learning_rate": 3.4257201350794487e-06, + "loss": 0.3404, + "step": 7655 + }, + { + "epoch": 1.925553319919517, + "grad_norm": 0.33508625626564026, + "learning_rate": 3.424331376449699e-06, + "loss": 0.3407, + "step": 7656 + }, + { + "epoch": 1.9258048289738432, + "grad_norm": 0.3068259358406067, + "learning_rate": 3.4229427527661074e-06, + "loss": 0.3378, + "step": 7657 + }, + { + "epoch": 1.926056338028169, + "grad_norm": 0.2951129972934723, + "learning_rate": 3.4215542641476053e-06, + "loss": 0.3404, + "step": 7658 + }, + { + "epoch": 1.9263078470824948, + "grad_norm": 0.28135162591934204, + "learning_rate": 3.4201659107131036e-06, + "loss": 0.3448, + "step": 7659 + }, + { + "epoch": 1.926559356136821, + "grad_norm": 0.2802700698375702, + "learning_rate": 3.4187776925815103e-06, + "loss": 0.3163, + "step": 7660 + }, + { + "epoch": 1.9268108651911469, + "grad_norm": 0.29353809356689453, + "learning_rate": 3.4173896098717134e-06, + "loss": 0.3673, + "step": 7661 + }, + { + "epoch": 1.9270623742454729, + "grad_norm": 0.3133329153060913, + "learning_rate": 3.4160016627025976e-06, + "loss": 0.3452, + "step": 7662 + }, + { + "epoch": 1.927313883299799, + "grad_norm": 0.29814791679382324, + "learning_rate": 3.414613851193028e-06, + "loss": 0.3653, + "step": 7663 + }, + { + "epoch": 1.9275653923541247, + "grad_norm": 0.28661277890205383, + "learning_rate": 3.4132261754618646e-06, + "loss": 0.3484, + "step": 7664 + }, + { + "epoch": 1.9278169014084507, + "grad_norm": 0.3048580586910248, + "learning_rate": 3.411838635627953e-06, + "loss": 0.3434, + "step": 7665 + }, + { + "epoch": 1.9280684104627768, + "grad_norm": 0.3117848336696625, + "learning_rate": 3.4104512318101256e-06, + "loss": 0.3471, + "step": 7666 + }, + { + "epoch": 1.9283199195171026, + "grad_norm": 0.31257185339927673, + "learning_rate": 3.4090639641272085e-06, + "loss": 0.3702, + "step": 7667 + }, + { + "epoch": 1.9285714285714286, + "grad_norm": 0.30880677700042725, + "learning_rate": 3.407676832698007e-06, + "loss": 0.337, + "step": 7668 + }, + { + "epoch": 1.9288229376257546, + "grad_norm": 0.294489324092865, + "learning_rate": 3.4062898376413257e-06, + "loss": 0.3519, + "step": 7669 + }, + { + "epoch": 1.9290744466800804, + "grad_norm": 0.2875889539718628, + "learning_rate": 3.404902979075948e-06, + "loss": 0.3557, + "step": 7670 + }, + { + "epoch": 1.9293259557344065, + "grad_norm": 0.33200985193252563, + "learning_rate": 3.4035162571206528e-06, + "loss": 0.364, + "step": 7671 + }, + { + "epoch": 1.9295774647887325, + "grad_norm": 0.31349629163742065, + "learning_rate": 3.4021296718942006e-06, + "loss": 0.3433, + "step": 7672 + }, + { + "epoch": 1.9298289738430583, + "grad_norm": 0.32347166538238525, + "learning_rate": 3.400743223515348e-06, + "loss": 0.3675, + "step": 7673 + }, + { + "epoch": 1.9300804828973843, + "grad_norm": 0.30907049775123596, + "learning_rate": 3.3993569121028306e-06, + "loss": 0.3577, + "step": 7674 + }, + { + "epoch": 1.9303319919517103, + "grad_norm": 0.28248360753059387, + "learning_rate": 3.397970737775381e-06, + "loss": 0.3327, + "step": 7675 + }, + { + "epoch": 1.9305835010060362, + "grad_norm": 0.3216158449649811, + "learning_rate": 3.396584700651717e-06, + "loss": 0.3264, + "step": 7676 + }, + { + "epoch": 1.9308350100603622, + "grad_norm": 0.3292233943939209, + "learning_rate": 3.395198800850541e-06, + "loss": 0.368, + "step": 7677 + }, + { + "epoch": 1.9310865191146882, + "grad_norm": 0.30121901631355286, + "learning_rate": 3.3938130384905495e-06, + "loss": 0.3396, + "step": 7678 + }, + { + "epoch": 1.931338028169014, + "grad_norm": 0.32539746165275574, + "learning_rate": 3.3924274136904214e-06, + "loss": 0.3689, + "step": 7679 + }, + { + "epoch": 1.93158953722334, + "grad_norm": 0.3132033944129944, + "learning_rate": 3.39104192656883e-06, + "loss": 0.3617, + "step": 7680 + }, + { + "epoch": 1.931841046277666, + "grad_norm": 0.30390465259552, + "learning_rate": 3.3896565772444303e-06, + "loss": 0.3432, + "step": 7681 + }, + { + "epoch": 1.9320925553319919, + "grad_norm": 0.3210819959640503, + "learning_rate": 3.3882713658358716e-06, + "loss": 0.3315, + "step": 7682 + }, + { + "epoch": 1.932344064386318, + "grad_norm": 0.2830940783023834, + "learning_rate": 3.3868862924617862e-06, + "loss": 0.319, + "step": 7683 + }, + { + "epoch": 1.932595573440644, + "grad_norm": 0.3306945264339447, + "learning_rate": 3.385501357240798e-06, + "loss": 0.3682, + "step": 7684 + }, + { + "epoch": 1.9328470824949697, + "grad_norm": 0.3089801073074341, + "learning_rate": 3.3841165602915206e-06, + "loss": 0.3496, + "step": 7685 + }, + { + "epoch": 1.9330985915492958, + "grad_norm": 0.31166577339172363, + "learning_rate": 3.3827319017325486e-06, + "loss": 0.3641, + "step": 7686 + }, + { + "epoch": 1.9333501006036218, + "grad_norm": 0.3063109517097473, + "learning_rate": 3.3813473816824743e-06, + "loss": 0.3454, + "step": 7687 + }, + { + "epoch": 1.9336016096579476, + "grad_norm": 0.29401347041130066, + "learning_rate": 3.3799630002598683e-06, + "loss": 0.3354, + "step": 7688 + }, + { + "epoch": 1.9338531187122736, + "grad_norm": 0.2971067726612091, + "learning_rate": 3.3785787575832974e-06, + "loss": 0.3436, + "step": 7689 + }, + { + "epoch": 1.9341046277665996, + "grad_norm": 0.28379392623901367, + "learning_rate": 3.377194653771311e-06, + "loss": 0.3655, + "step": 7690 + }, + { + "epoch": 1.9343561368209254, + "grad_norm": 0.3166361451148987, + "learning_rate": 3.3758106889424526e-06, + "loss": 0.3555, + "step": 7691 + }, + { + "epoch": 1.9346076458752515, + "grad_norm": 0.2777943015098572, + "learning_rate": 3.3744268632152454e-06, + "loss": 0.3448, + "step": 7692 + }, + { + "epoch": 1.9348591549295775, + "grad_norm": 0.3278443217277527, + "learning_rate": 3.37304317670821e-06, + "loss": 0.373, + "step": 7693 + }, + { + "epoch": 1.9351106639839033, + "grad_norm": 0.30939531326293945, + "learning_rate": 3.371659629539846e-06, + "loss": 0.3366, + "step": 7694 + }, + { + "epoch": 1.9353621730382293, + "grad_norm": 0.3135591447353363, + "learning_rate": 3.3702762218286487e-06, + "loss": 0.337, + "step": 7695 + }, + { + "epoch": 1.9356136820925554, + "grad_norm": 0.3024998903274536, + "learning_rate": 3.368892953693098e-06, + "loss": 0.3488, + "step": 7696 + }, + { + "epoch": 1.9358651911468812, + "grad_norm": 0.2925957143306732, + "learning_rate": 3.367509825251662e-06, + "loss": 0.3415, + "step": 7697 + }, + { + "epoch": 1.9361167002012074, + "grad_norm": 0.26913854479789734, + "learning_rate": 3.366126836622796e-06, + "loss": 0.3147, + "step": 7698 + }, + { + "epoch": 1.9363682092555332, + "grad_norm": 0.331188827753067, + "learning_rate": 3.3647439879249453e-06, + "loss": 0.3634, + "step": 7699 + }, + { + "epoch": 1.936619718309859, + "grad_norm": 0.3402562439441681, + "learning_rate": 3.363361279276541e-06, + "loss": 0.356, + "step": 7700 + }, + { + "epoch": 1.9368712273641853, + "grad_norm": 0.3227146863937378, + "learning_rate": 3.3619787107960054e-06, + "loss": 0.3238, + "step": 7701 + }, + { + "epoch": 1.937122736418511, + "grad_norm": 0.31124117970466614, + "learning_rate": 3.3605962826017457e-06, + "loss": 0.3313, + "step": 7702 + }, + { + "epoch": 1.9373742454728369, + "grad_norm": 0.303874671459198, + "learning_rate": 3.359213994812158e-06, + "loss": 0.338, + "step": 7703 + }, + { + "epoch": 1.9376257545271631, + "grad_norm": 0.3281771242618561, + "learning_rate": 3.357831847545627e-06, + "loss": 0.3628, + "step": 7704 + }, + { + "epoch": 1.937877263581489, + "grad_norm": 0.3109123408794403, + "learning_rate": 3.356449840920525e-06, + "loss": 0.3361, + "step": 7705 + }, + { + "epoch": 1.9381287726358147, + "grad_norm": 0.30376991629600525, + "learning_rate": 3.3550679750552107e-06, + "loss": 0.3625, + "step": 7706 + }, + { + "epoch": 1.938380281690141, + "grad_norm": 0.29432567954063416, + "learning_rate": 3.3536862500680354e-06, + "loss": 0.3326, + "step": 7707 + }, + { + "epoch": 1.9386317907444668, + "grad_norm": 0.3185214102268219, + "learning_rate": 3.3523046660773327e-06, + "loss": 0.3599, + "step": 7708 + }, + { + "epoch": 1.9388832997987926, + "grad_norm": 0.35394561290740967, + "learning_rate": 3.3509232232014287e-06, + "loss": 0.376, + "step": 7709 + }, + { + "epoch": 1.9391348088531188, + "grad_norm": 0.3000744581222534, + "learning_rate": 3.3495419215586324e-06, + "loss": 0.3304, + "step": 7710 + }, + { + "epoch": 1.9393863179074446, + "grad_norm": 0.29905176162719727, + "learning_rate": 3.3481607612672464e-06, + "loss": 0.3643, + "step": 7711 + }, + { + "epoch": 1.9396378269617707, + "grad_norm": 0.3088197410106659, + "learning_rate": 3.346779742445556e-06, + "loss": 0.3307, + "step": 7712 + }, + { + "epoch": 1.9398893360160967, + "grad_norm": 0.2897097170352936, + "learning_rate": 3.3453988652118398e-06, + "loss": 0.3296, + "step": 7713 + }, + { + "epoch": 1.9401408450704225, + "grad_norm": 0.31739160418510437, + "learning_rate": 3.344018129684358e-06, + "loss": 0.3387, + "step": 7714 + }, + { + "epoch": 1.9403923541247485, + "grad_norm": 0.315039724111557, + "learning_rate": 3.3426375359813655e-06, + "loss": 0.3463, + "step": 7715 + }, + { + "epoch": 1.9406438631790746, + "grad_norm": 0.28398001194000244, + "learning_rate": 3.341257084221098e-06, + "loss": 0.378, + "step": 7716 + }, + { + "epoch": 1.9408953722334004, + "grad_norm": 0.3012690246105194, + "learning_rate": 3.339876774521783e-06, + "loss": 0.3328, + "step": 7717 + }, + { + "epoch": 1.9411468812877264, + "grad_norm": 0.3159896731376648, + "learning_rate": 3.33849660700164e-06, + "loss": 0.3328, + "step": 7718 + }, + { + "epoch": 1.9413983903420524, + "grad_norm": 0.3031710088253021, + "learning_rate": 3.3371165817788655e-06, + "loss": 0.336, + "step": 7719 + }, + { + "epoch": 1.9416498993963782, + "grad_norm": 0.3224204182624817, + "learning_rate": 3.3357366989716544e-06, + "loss": 0.3553, + "step": 7720 + }, + { + "epoch": 1.9419014084507042, + "grad_norm": 0.31610408425331116, + "learning_rate": 3.3343569586981823e-06, + "loss": 0.307, + "step": 7721 + }, + { + "epoch": 1.9421529175050303, + "grad_norm": 0.32398226857185364, + "learning_rate": 3.332977361076618e-06, + "loss": 0.375, + "step": 7722 + }, + { + "epoch": 1.942404426559356, + "grad_norm": 0.3373188078403473, + "learning_rate": 3.331597906225112e-06, + "loss": 0.3535, + "step": 7723 + }, + { + "epoch": 1.942655935613682, + "grad_norm": 0.3140803575515747, + "learning_rate": 3.330218594261809e-06, + "loss": 0.3512, + "step": 7724 + }, + { + "epoch": 1.9429074446680081, + "grad_norm": 0.29968270659446716, + "learning_rate": 3.3288394253048365e-06, + "loss": 0.3476, + "step": 7725 + }, + { + "epoch": 1.943158953722334, + "grad_norm": 0.29010340571403503, + "learning_rate": 3.3274603994723144e-06, + "loss": 0.3188, + "step": 7726 + }, + { + "epoch": 1.94341046277666, + "grad_norm": 0.3180834949016571, + "learning_rate": 3.3260815168823433e-06, + "loss": 0.3317, + "step": 7727 + }, + { + "epoch": 1.943661971830986, + "grad_norm": 0.3009601831436157, + "learning_rate": 3.3247027776530183e-06, + "loss": 0.3546, + "step": 7728 + }, + { + "epoch": 1.9439134808853118, + "grad_norm": 0.29234665632247925, + "learning_rate": 3.323324181902422e-06, + "loss": 0.3357, + "step": 7729 + }, + { + "epoch": 1.9441649899396378, + "grad_norm": 0.2999419867992401, + "learning_rate": 3.321945729748618e-06, + "loss": 0.3627, + "step": 7730 + }, + { + "epoch": 1.9444164989939638, + "grad_norm": 0.2987615466117859, + "learning_rate": 3.3205674213096662e-06, + "loss": 0.3133, + "step": 7731 + }, + { + "epoch": 1.9446680080482897, + "grad_norm": 0.30129966139793396, + "learning_rate": 3.3191892567036065e-06, + "loss": 0.3278, + "step": 7732 + }, + { + "epoch": 1.9449195171026157, + "grad_norm": 0.2908097505569458, + "learning_rate": 3.317811236048474e-06, + "loss": 0.3454, + "step": 7733 + }, + { + "epoch": 1.9451710261569417, + "grad_norm": 0.30645039677619934, + "learning_rate": 3.316433359462283e-06, + "loss": 0.336, + "step": 7734 + }, + { + "epoch": 1.9454225352112675, + "grad_norm": 0.30733421444892883, + "learning_rate": 3.315055627063045e-06, + "loss": 0.3457, + "step": 7735 + }, + { + "epoch": 1.9456740442655935, + "grad_norm": 0.3086591362953186, + "learning_rate": 3.313678038968749e-06, + "loss": 0.3393, + "step": 7736 + }, + { + "epoch": 1.9459255533199196, + "grad_norm": 0.3118712902069092, + "learning_rate": 3.312300595297382e-06, + "loss": 0.3467, + "step": 7737 + }, + { + "epoch": 1.9461770623742454, + "grad_norm": 0.29625868797302246, + "learning_rate": 3.310923296166908e-06, + "loss": 0.3222, + "step": 7738 + }, + { + "epoch": 1.9464285714285714, + "grad_norm": 0.3383052349090576, + "learning_rate": 3.309546141695287e-06, + "loss": 0.3333, + "step": 7739 + }, + { + "epoch": 1.9466800804828974, + "grad_norm": 0.29828548431396484, + "learning_rate": 3.308169132000466e-06, + "loss": 0.3462, + "step": 7740 + }, + { + "epoch": 1.9469315895372232, + "grad_norm": 0.31392189860343933, + "learning_rate": 3.3067922672003727e-06, + "loss": 0.3538, + "step": 7741 + }, + { + "epoch": 1.9471830985915493, + "grad_norm": 0.28369051218032837, + "learning_rate": 3.3054155474129306e-06, + "loss": 0.3093, + "step": 7742 + }, + { + "epoch": 1.9474346076458753, + "grad_norm": 0.3025501072406769, + "learning_rate": 3.304038972756044e-06, + "loss": 0.3583, + "step": 7743 + }, + { + "epoch": 1.947686116700201, + "grad_norm": 0.3110698461532593, + "learning_rate": 3.3026625433476112e-06, + "loss": 0.3538, + "step": 7744 + }, + { + "epoch": 1.9479376257545271, + "grad_norm": 0.3204960823059082, + "learning_rate": 3.301286259305511e-06, + "loss": 0.3606, + "step": 7745 + }, + { + "epoch": 1.9481891348088531, + "grad_norm": 0.28646546602249146, + "learning_rate": 3.299910120747618e-06, + "loss": 0.3416, + "step": 7746 + }, + { + "epoch": 1.948440643863179, + "grad_norm": 0.31837978959083557, + "learning_rate": 3.298534127791785e-06, + "loss": 0.3218, + "step": 7747 + }, + { + "epoch": 1.9486921529175052, + "grad_norm": 0.2954898476600647, + "learning_rate": 3.2971582805558622e-06, + "loss": 0.37, + "step": 7748 + }, + { + "epoch": 1.948943661971831, + "grad_norm": 0.3241606652736664, + "learning_rate": 3.295782579157677e-06, + "loss": 0.3637, + "step": 7749 + }, + { + "epoch": 1.9491951710261568, + "grad_norm": 0.3172752559185028, + "learning_rate": 3.294407023715053e-06, + "loss": 0.3458, + "step": 7750 + }, + { + "epoch": 1.949446680080483, + "grad_norm": 0.3054000735282898, + "learning_rate": 3.2930316143457984e-06, + "loss": 0.3666, + "step": 7751 + }, + { + "epoch": 1.9496981891348089, + "grad_norm": 0.27285468578338623, + "learning_rate": 3.2916563511677057e-06, + "loss": 0.3397, + "step": 7752 + }, + { + "epoch": 1.9499496981891347, + "grad_norm": 0.32400456070899963, + "learning_rate": 3.2902812342985613e-06, + "loss": 0.3159, + "step": 7753 + }, + { + "epoch": 1.950201207243461, + "grad_norm": 0.30845338106155396, + "learning_rate": 3.2889062638561313e-06, + "loss": 0.3652, + "step": 7754 + }, + { + "epoch": 1.9504527162977867, + "grad_norm": 0.2901265621185303, + "learning_rate": 3.287531439958177e-06, + "loss": 0.3283, + "step": 7755 + }, + { + "epoch": 1.9507042253521125, + "grad_norm": 0.32865509390830994, + "learning_rate": 3.28615676272244e-06, + "loss": 0.3355, + "step": 7756 + }, + { + "epoch": 1.9509557344064388, + "grad_norm": 0.28665393590927124, + "learning_rate": 3.2847822322666564e-06, + "loss": 0.3413, + "step": 7757 + }, + { + "epoch": 1.9512072434607646, + "grad_norm": 0.3147144019603729, + "learning_rate": 3.283407848708542e-06, + "loss": 0.3519, + "step": 7758 + }, + { + "epoch": 1.9514587525150904, + "grad_norm": 0.32015910744667053, + "learning_rate": 3.2820336121658084e-06, + "loss": 0.3478, + "step": 7759 + }, + { + "epoch": 1.9517102615694166, + "grad_norm": 0.29439136385917664, + "learning_rate": 3.2806595227561464e-06, + "loss": 0.3274, + "step": 7760 + }, + { + "epoch": 1.9519617706237424, + "grad_norm": 0.303396075963974, + "learning_rate": 3.279285580597241e-06, + "loss": 0.3506, + "step": 7761 + }, + { + "epoch": 1.9522132796780685, + "grad_norm": 0.2776741087436676, + "learning_rate": 3.277911785806761e-06, + "loss": 0.344, + "step": 7762 + }, + { + "epoch": 1.9524647887323945, + "grad_norm": 0.3228535056114197, + "learning_rate": 3.2765381385023638e-06, + "loss": 0.3522, + "step": 7763 + }, + { + "epoch": 1.9527162977867203, + "grad_norm": 0.3100894093513489, + "learning_rate": 3.2751646388016924e-06, + "loss": 0.3582, + "step": 7764 + }, + { + "epoch": 1.9529678068410463, + "grad_norm": 0.3215596079826355, + "learning_rate": 3.27379128682238e-06, + "loss": 0.3395, + "step": 7765 + }, + { + "epoch": 1.9532193158953723, + "grad_norm": 0.30445247888565063, + "learning_rate": 3.2724180826820436e-06, + "loss": 0.3348, + "step": 7766 + }, + { + "epoch": 1.9534708249496981, + "grad_norm": 0.29653722047805786, + "learning_rate": 3.2710450264982906e-06, + "loss": 0.3616, + "step": 7767 + }, + { + "epoch": 1.9537223340040242, + "grad_norm": 0.31945815682411194, + "learning_rate": 3.269672118388716e-06, + "loss": 0.3714, + "step": 7768 + }, + { + "epoch": 1.9539738430583502, + "grad_norm": 0.3002147972583771, + "learning_rate": 3.2682993584708988e-06, + "loss": 0.3323, + "step": 7769 + }, + { + "epoch": 1.954225352112676, + "grad_norm": 0.3391788601875305, + "learning_rate": 3.2669267468624077e-06, + "loss": 0.3334, + "step": 7770 + }, + { + "epoch": 1.954476861167002, + "grad_norm": 0.28778892755508423, + "learning_rate": 3.2655542836807998e-06, + "loss": 0.3374, + "step": 7771 + }, + { + "epoch": 1.954728370221328, + "grad_norm": 0.3348093032836914, + "learning_rate": 3.264181969043615e-06, + "loss": 0.3448, + "step": 7772 + }, + { + "epoch": 1.9549798792756539, + "grad_norm": 0.3041500449180603, + "learning_rate": 3.2628098030683873e-06, + "loss": 0.3534, + "step": 7773 + }, + { + "epoch": 1.95523138832998, + "grad_norm": 0.32908380031585693, + "learning_rate": 3.26143778587263e-06, + "loss": 0.3498, + "step": 7774 + }, + { + "epoch": 1.955482897384306, + "grad_norm": 0.33852681517601013, + "learning_rate": 3.2600659175738524e-06, + "loss": 0.3642, + "step": 7775 + }, + { + "epoch": 1.9557344064386317, + "grad_norm": 0.3201028108596802, + "learning_rate": 3.2586941982895414e-06, + "loss": 0.3265, + "step": 7776 + }, + { + "epoch": 1.9559859154929577, + "grad_norm": 0.2781035602092743, + "learning_rate": 3.2573226281371817e-06, + "loss": 0.336, + "step": 7777 + }, + { + "epoch": 1.9562374245472838, + "grad_norm": 0.30619314312934875, + "learning_rate": 3.2559512072342342e-06, + "loss": 0.3426, + "step": 7778 + }, + { + "epoch": 1.9564889336016096, + "grad_norm": 0.31019851565361023, + "learning_rate": 3.2545799356981566e-06, + "loss": 0.3383, + "step": 7779 + }, + { + "epoch": 1.9567404426559356, + "grad_norm": 0.32283902168273926, + "learning_rate": 3.2532088136463867e-06, + "loss": 0.3363, + "step": 7780 + }, + { + "epoch": 1.9569919517102616, + "grad_norm": 0.2992129325866699, + "learning_rate": 3.2518378411963565e-06, + "loss": 0.328, + "step": 7781 + }, + { + "epoch": 1.9572434607645874, + "grad_norm": 0.2824527323246002, + "learning_rate": 3.2504670184654764e-06, + "loss": 0.3299, + "step": 7782 + }, + { + "epoch": 1.9574949698189135, + "grad_norm": 0.3210359811782837, + "learning_rate": 3.2490963455711506e-06, + "loss": 0.3574, + "step": 7783 + }, + { + "epoch": 1.9577464788732395, + "grad_norm": 0.3022248446941376, + "learning_rate": 3.2477258226307716e-06, + "loss": 0.3502, + "step": 7784 + }, + { + "epoch": 1.9579979879275653, + "grad_norm": 0.292578786611557, + "learning_rate": 3.2463554497617113e-06, + "loss": 0.3535, + "step": 7785 + }, + { + "epoch": 1.9582494969818913, + "grad_norm": 0.2895413935184479, + "learning_rate": 3.2449852270813386e-06, + "loss": 0.3292, + "step": 7786 + }, + { + "epoch": 1.9585010060362174, + "grad_norm": 0.3182920515537262, + "learning_rate": 3.243615154706999e-06, + "loss": 0.336, + "step": 7787 + }, + { + "epoch": 1.9587525150905432, + "grad_norm": 0.3026027977466583, + "learning_rate": 3.242245232756036e-06, + "loss": 0.345, + "step": 7788 + }, + { + "epoch": 1.9590040241448692, + "grad_norm": 0.28621864318847656, + "learning_rate": 3.2408754613457703e-06, + "loss": 0.3604, + "step": 7789 + }, + { + "epoch": 1.9592555331991952, + "grad_norm": 0.2878339886665344, + "learning_rate": 3.2395058405935186e-06, + "loss": 0.3392, + "step": 7790 + }, + { + "epoch": 1.959507042253521, + "grad_norm": 0.3039291203022003, + "learning_rate": 3.238136370616576e-06, + "loss": 0.3317, + "step": 7791 + }, + { + "epoch": 1.959758551307847, + "grad_norm": 0.30178239941596985, + "learning_rate": 3.2367670515322324e-06, + "loss": 0.3338, + "step": 7792 + }, + { + "epoch": 1.960010060362173, + "grad_norm": 0.2928571403026581, + "learning_rate": 3.2353978834577587e-06, + "loss": 0.3344, + "step": 7793 + }, + { + "epoch": 1.9602615694164989, + "grad_norm": 0.3058040142059326, + "learning_rate": 3.2340288665104167e-06, + "loss": 0.3786, + "step": 7794 + }, + { + "epoch": 1.960513078470825, + "grad_norm": 0.3003310263156891, + "learning_rate": 3.232660000807457e-06, + "loss": 0.351, + "step": 7795 + }, + { + "epoch": 1.960764587525151, + "grad_norm": 0.30328527092933655, + "learning_rate": 3.231291286466109e-06, + "loss": 0.3547, + "step": 7796 + }, + { + "epoch": 1.9610160965794767, + "grad_norm": 0.2882510721683502, + "learning_rate": 3.2299227236035996e-06, + "loss": 0.3301, + "step": 7797 + }, + { + "epoch": 1.961267605633803, + "grad_norm": 0.3310767114162445, + "learning_rate": 3.2285543123371333e-06, + "loss": 0.3515, + "step": 7798 + }, + { + "epoch": 1.9615191146881288, + "grad_norm": 0.31823989748954773, + "learning_rate": 3.2271860527839104e-06, + "loss": 0.3701, + "step": 7799 + }, + { + "epoch": 1.9617706237424546, + "grad_norm": 0.28764277696609497, + "learning_rate": 3.2258179450611086e-06, + "loss": 0.3319, + "step": 7800 + }, + { + "epoch": 1.9620221327967808, + "grad_norm": 0.26832863688468933, + "learning_rate": 3.2244499892859032e-06, + "loss": 0.3329, + "step": 7801 + }, + { + "epoch": 1.9622736418511066, + "grad_norm": 0.29785555601119995, + "learning_rate": 3.2230821855754464e-06, + "loss": 0.3667, + "step": 7802 + }, + { + "epoch": 1.9625251509054324, + "grad_norm": 0.3114742934703827, + "learning_rate": 3.221714534046886e-06, + "loss": 0.3327, + "step": 7803 + }, + { + "epoch": 1.9627766599597587, + "grad_norm": 0.32278159260749817, + "learning_rate": 3.2203470348173483e-06, + "loss": 0.3631, + "step": 7804 + }, + { + "epoch": 1.9630281690140845, + "grad_norm": 0.283717542886734, + "learning_rate": 3.2189796880039535e-06, + "loss": 0.3244, + "step": 7805 + }, + { + "epoch": 1.9632796780684103, + "grad_norm": 0.3093319833278656, + "learning_rate": 3.2176124937238094e-06, + "loss": 0.3288, + "step": 7806 + }, + { + "epoch": 1.9635311871227366, + "grad_norm": 0.295280396938324, + "learning_rate": 3.2162454520940024e-06, + "loss": 0.3391, + "step": 7807 + }, + { + "epoch": 1.9637826961770624, + "grad_norm": 0.2879166007041931, + "learning_rate": 3.214878563231615e-06, + "loss": 0.3515, + "step": 7808 + }, + { + "epoch": 1.9640342052313882, + "grad_norm": 0.32011473178863525, + "learning_rate": 3.2135118272537093e-06, + "loss": 0.3262, + "step": 7809 + }, + { + "epoch": 1.9642857142857144, + "grad_norm": 0.305389940738678, + "learning_rate": 3.2121452442773405e-06, + "loss": 0.3509, + "step": 7810 + }, + { + "epoch": 1.9645372233400402, + "grad_norm": 0.3013128936290741, + "learning_rate": 3.210778814419545e-06, + "loss": 0.3232, + "step": 7811 + }, + { + "epoch": 1.9647887323943662, + "grad_norm": 0.28125154972076416, + "learning_rate": 3.2094125377973534e-06, + "loss": 0.3342, + "step": 7812 + }, + { + "epoch": 1.9650402414486923, + "grad_norm": 0.30323272943496704, + "learning_rate": 3.2080464145277736e-06, + "loss": 0.3375, + "step": 7813 + }, + { + "epoch": 1.965291750503018, + "grad_norm": 0.29133641719818115, + "learning_rate": 3.20668044472781e-06, + "loss": 0.3335, + "step": 7814 + }, + { + "epoch": 1.965543259557344, + "grad_norm": 0.30253052711486816, + "learning_rate": 3.2053146285144456e-06, + "loss": 0.3318, + "step": 7815 + }, + { + "epoch": 1.9657947686116701, + "grad_norm": 0.2923412024974823, + "learning_rate": 3.2039489660046565e-06, + "loss": 0.33, + "step": 7816 + }, + { + "epoch": 1.966046277665996, + "grad_norm": 0.30035942792892456, + "learning_rate": 3.2025834573154025e-06, + "loss": 0.3195, + "step": 7817 + }, + { + "epoch": 1.966297786720322, + "grad_norm": 0.30417102575302124, + "learning_rate": 3.2012181025636303e-06, + "loss": 0.3386, + "step": 7818 + }, + { + "epoch": 1.966549295774648, + "grad_norm": 0.288310170173645, + "learning_rate": 3.1998529018662748e-06, + "loss": 0.3736, + "step": 7819 + }, + { + "epoch": 1.9668008048289738, + "grad_norm": 0.2810823321342468, + "learning_rate": 3.1984878553402566e-06, + "loss": 0.322, + "step": 7820 + }, + { + "epoch": 1.9670523138832998, + "grad_norm": 0.28224313259124756, + "learning_rate": 3.1971229631024836e-06, + "loss": 0.3359, + "step": 7821 + }, + { + "epoch": 1.9673038229376258, + "grad_norm": 0.3008633553981781, + "learning_rate": 3.19575822526985e-06, + "loss": 0.3528, + "step": 7822 + }, + { + "epoch": 1.9675553319919517, + "grad_norm": 0.3109699487686157, + "learning_rate": 3.194393641959237e-06, + "loss": 0.335, + "step": 7823 + }, + { + "epoch": 1.9678068410462777, + "grad_norm": 0.2929900288581848, + "learning_rate": 3.193029213287513e-06, + "loss": 0.3471, + "step": 7824 + }, + { + "epoch": 1.9680583501006037, + "grad_norm": 0.3091834485530853, + "learning_rate": 3.1916649393715314e-06, + "loss": 0.3395, + "step": 7825 + }, + { + "epoch": 1.9683098591549295, + "grad_norm": 0.2836105525493622, + "learning_rate": 3.190300820328135e-06, + "loss": 0.3434, + "step": 7826 + }, + { + "epoch": 1.9685613682092555, + "grad_norm": 0.331920325756073, + "learning_rate": 3.1889368562741527e-06, + "loss": 0.3478, + "step": 7827 + }, + { + "epoch": 1.9688128772635816, + "grad_norm": 0.34226587414741516, + "learning_rate": 3.187573047326398e-06, + "loss": 0.3698, + "step": 7828 + }, + { + "epoch": 1.9690643863179074, + "grad_norm": 0.31301939487457275, + "learning_rate": 3.186209393601674e-06, + "loss": 0.3542, + "step": 7829 + }, + { + "epoch": 1.9693158953722334, + "grad_norm": 0.31900009512901306, + "learning_rate": 3.184845895216768e-06, + "loss": 0.3532, + "step": 7830 + }, + { + "epoch": 1.9695674044265594, + "grad_norm": 0.29443228244781494, + "learning_rate": 3.183482552288456e-06, + "loss": 0.3383, + "step": 7831 + }, + { + "epoch": 1.9698189134808852, + "grad_norm": 0.31553661823272705, + "learning_rate": 3.1821193649334993e-06, + "loss": 0.3484, + "step": 7832 + }, + { + "epoch": 1.9700704225352113, + "grad_norm": 0.2916860580444336, + "learning_rate": 3.180756333268646e-06, + "loss": 0.3332, + "step": 7833 + }, + { + "epoch": 1.9703219315895373, + "grad_norm": 0.3077201247215271, + "learning_rate": 3.1793934574106317e-06, + "loss": 0.3442, + "step": 7834 + }, + { + "epoch": 1.970573440643863, + "grad_norm": 0.32148656249046326, + "learning_rate": 3.1780307374761777e-06, + "loss": 0.3529, + "step": 7835 + }, + { + "epoch": 1.970824949698189, + "grad_norm": 0.30224621295928955, + "learning_rate": 3.1766681735819926e-06, + "loss": 0.3389, + "step": 7836 + }, + { + "epoch": 1.9710764587525151, + "grad_norm": 0.3319030702114105, + "learning_rate": 3.1753057658447726e-06, + "loss": 0.3552, + "step": 7837 + }, + { + "epoch": 1.971327967806841, + "grad_norm": 0.28931450843811035, + "learning_rate": 3.173943514381198e-06, + "loss": 0.326, + "step": 7838 + }, + { + "epoch": 1.971579476861167, + "grad_norm": 0.3092653453350067, + "learning_rate": 3.1725814193079384e-06, + "loss": 0.3511, + "step": 7839 + }, + { + "epoch": 1.971830985915493, + "grad_norm": 0.2999701499938965, + "learning_rate": 3.171219480741646e-06, + "loss": 0.3566, + "step": 7840 + }, + { + "epoch": 1.9720824949698188, + "grad_norm": 0.29582321643829346, + "learning_rate": 3.1698576987989672e-06, + "loss": 0.3297, + "step": 7841 + }, + { + "epoch": 1.9723340040241448, + "grad_norm": 0.288314551115036, + "learning_rate": 3.168496073596524e-06, + "loss": 0.3499, + "step": 7842 + }, + { + "epoch": 1.9725855130784709, + "grad_norm": 0.292122483253479, + "learning_rate": 3.167134605250938e-06, + "loss": 0.3154, + "step": 7843 + }, + { + "epoch": 1.9728370221327967, + "grad_norm": 0.3173863887786865, + "learning_rate": 3.1657732938788033e-06, + "loss": 0.3427, + "step": 7844 + }, + { + "epoch": 1.9730885311871227, + "grad_norm": 0.3070237934589386, + "learning_rate": 3.164412139596713e-06, + "loss": 0.3399, + "step": 7845 + }, + { + "epoch": 1.9733400402414487, + "grad_norm": 0.30909422039985657, + "learning_rate": 3.163051142521238e-06, + "loss": 0.3436, + "step": 7846 + }, + { + "epoch": 1.9735915492957745, + "grad_norm": 0.3138062059879303, + "learning_rate": 3.1616903027689407e-06, + "loss": 0.3271, + "step": 7847 + }, + { + "epoch": 1.9738430583501008, + "grad_norm": 0.30243563652038574, + "learning_rate": 3.1603296204563707e-06, + "loss": 0.3389, + "step": 7848 + }, + { + "epoch": 1.9740945674044266, + "grad_norm": 0.31402406096458435, + "learning_rate": 3.158969095700057e-06, + "loss": 0.3529, + "step": 7849 + }, + { + "epoch": 1.9743460764587524, + "grad_norm": 0.32227790355682373, + "learning_rate": 3.157608728616525e-06, + "loss": 0.3545, + "step": 7850 + }, + { + "epoch": 1.9745975855130786, + "grad_norm": 0.301570862531662, + "learning_rate": 3.156248519322278e-06, + "loss": 0.337, + "step": 7851 + }, + { + "epoch": 1.9748490945674044, + "grad_norm": 0.27347323298454285, + "learning_rate": 3.154888467933812e-06, + "loss": 0.328, + "step": 7852 + }, + { + "epoch": 1.9751006036217302, + "grad_norm": 0.2885780930519104, + "learning_rate": 3.153528574567605e-06, + "loss": 0.3225, + "step": 7853 + }, + { + "epoch": 1.9753521126760565, + "grad_norm": 0.30733510851860046, + "learning_rate": 3.152168839340125e-06, + "loss": 0.348, + "step": 7854 + }, + { + "epoch": 1.9756036217303823, + "grad_norm": 0.30902665853500366, + "learning_rate": 3.1508092623678223e-06, + "loss": 0.3374, + "step": 7855 + }, + { + "epoch": 1.975855130784708, + "grad_norm": 0.28816184401512146, + "learning_rate": 3.14944984376714e-06, + "loss": 0.3434, + "step": 7856 + }, + { + "epoch": 1.9761066398390343, + "grad_norm": 0.315157026052475, + "learning_rate": 3.1480905836544996e-06, + "loss": 0.3565, + "step": 7857 + }, + { + "epoch": 1.9763581488933601, + "grad_norm": 0.3467448651790619, + "learning_rate": 3.1467314821463147e-06, + "loss": 0.3529, + "step": 7858 + }, + { + "epoch": 1.9766096579476862, + "grad_norm": 0.3209885358810425, + "learning_rate": 3.145372539358987e-06, + "loss": 0.34, + "step": 7859 + }, + { + "epoch": 1.9768611670020122, + "grad_norm": 0.3004145920276642, + "learning_rate": 3.1440137554088957e-06, + "loss": 0.3567, + "step": 7860 + }, + { + "epoch": 1.977112676056338, + "grad_norm": 0.31923210620880127, + "learning_rate": 3.1426551304124187e-06, + "loss": 0.3249, + "step": 7861 + }, + { + "epoch": 1.977364185110664, + "grad_norm": 0.2933943569660187, + "learning_rate": 3.1412966644859073e-06, + "loss": 0.3495, + "step": 7862 + }, + { + "epoch": 1.97761569416499, + "grad_norm": 0.31692373752593994, + "learning_rate": 3.139938357745711e-06, + "loss": 0.3705, + "step": 7863 + }, + { + "epoch": 1.9778672032193159, + "grad_norm": 0.30593782663345337, + "learning_rate": 3.138580210308155e-06, + "loss": 0.3254, + "step": 7864 + }, + { + "epoch": 1.9781187122736419, + "grad_norm": 0.34505024552345276, + "learning_rate": 3.137222222289562e-06, + "loss": 0.3704, + "step": 7865 + }, + { + "epoch": 1.978370221327968, + "grad_norm": 0.322317898273468, + "learning_rate": 3.1358643938062295e-06, + "loss": 0.3434, + "step": 7866 + }, + { + "epoch": 1.9786217303822937, + "grad_norm": 0.2983636260032654, + "learning_rate": 3.134506724974452e-06, + "loss": 0.3634, + "step": 7867 + }, + { + "epoch": 1.9788732394366197, + "grad_norm": 0.2886245846748352, + "learning_rate": 3.1331492159105007e-06, + "loss": 0.3471, + "step": 7868 + }, + { + "epoch": 1.9791247484909458, + "grad_norm": 0.2910291254520416, + "learning_rate": 3.1317918667306406e-06, + "loss": 0.3388, + "step": 7869 + }, + { + "epoch": 1.9793762575452716, + "grad_norm": 0.32111358642578125, + "learning_rate": 3.130434677551122e-06, + "loss": 0.3532, + "step": 7870 + }, + { + "epoch": 1.9796277665995976, + "grad_norm": 0.3142852783203125, + "learning_rate": 3.129077648488174e-06, + "loss": 0.3354, + "step": 7871 + }, + { + "epoch": 1.9798792756539236, + "grad_norm": 0.29093170166015625, + "learning_rate": 3.1277207796580237e-06, + "loss": 0.3475, + "step": 7872 + }, + { + "epoch": 1.9801307847082494, + "grad_norm": 0.3069877624511719, + "learning_rate": 3.126364071176874e-06, + "loss": 0.3158, + "step": 7873 + }, + { + "epoch": 1.9803822937625755, + "grad_norm": 0.3028368353843689, + "learning_rate": 3.125007523160921e-06, + "loss": 0.3464, + "step": 7874 + }, + { + "epoch": 1.9806338028169015, + "grad_norm": 0.31962573528289795, + "learning_rate": 3.123651135726343e-06, + "loss": 0.3508, + "step": 7875 + }, + { + "epoch": 1.9808853118712273, + "grad_norm": 0.28765198588371277, + "learning_rate": 3.1222949089893085e-06, + "loss": 0.3446, + "step": 7876 + }, + { + "epoch": 1.9811368209255533, + "grad_norm": 0.30417191982269287, + "learning_rate": 3.120938843065966e-06, + "loss": 0.3452, + "step": 7877 + }, + { + "epoch": 1.9813883299798793, + "grad_norm": 0.2888725996017456, + "learning_rate": 3.1195829380724585e-06, + "loss": 0.348, + "step": 7878 + }, + { + "epoch": 1.9816398390342052, + "grad_norm": 0.3087455630302429, + "learning_rate": 3.1182271941249054e-06, + "loss": 0.361, + "step": 7879 + }, + { + "epoch": 1.9818913480885312, + "grad_norm": 0.2925972044467926, + "learning_rate": 3.1168716113394224e-06, + "loss": 0.3458, + "step": 7880 + }, + { + "epoch": 1.9821428571428572, + "grad_norm": 0.3094989061355591, + "learning_rate": 3.1155161898321064e-06, + "loss": 0.3298, + "step": 7881 + }, + { + "epoch": 1.982394366197183, + "grad_norm": 0.29126372933387756, + "learning_rate": 3.114160929719038e-06, + "loss": 0.3506, + "step": 7882 + }, + { + "epoch": 1.982645875251509, + "grad_norm": 0.29920557141304016, + "learning_rate": 3.1128058311162885e-06, + "loss": 0.3524, + "step": 7883 + }, + { + "epoch": 1.982897384305835, + "grad_norm": 0.3097069263458252, + "learning_rate": 3.1114508941399135e-06, + "loss": 0.3685, + "step": 7884 + }, + { + "epoch": 1.9831488933601609, + "grad_norm": 0.28832173347473145, + "learning_rate": 3.110096118905954e-06, + "loss": 0.3692, + "step": 7885 + }, + { + "epoch": 1.983400402414487, + "grad_norm": 0.30503520369529724, + "learning_rate": 3.1087415055304392e-06, + "loss": 0.3686, + "step": 7886 + }, + { + "epoch": 1.983651911468813, + "grad_norm": 0.2966339886188507, + "learning_rate": 3.1073870541293834e-06, + "loss": 0.3295, + "step": 7887 + }, + { + "epoch": 1.9839034205231387, + "grad_norm": 0.30514541268348694, + "learning_rate": 3.1060327648187855e-06, + "loss": 0.3639, + "step": 7888 + }, + { + "epoch": 1.9841549295774648, + "grad_norm": 0.3116115629673004, + "learning_rate": 3.1046786377146332e-06, + "loss": 0.3457, + "step": 7889 + }, + { + "epoch": 1.9844064386317908, + "grad_norm": 0.31909826397895813, + "learning_rate": 3.103324672932898e-06, + "loss": 0.3306, + "step": 7890 + }, + { + "epoch": 1.9846579476861166, + "grad_norm": 0.3044300973415375, + "learning_rate": 3.101970870589538e-06, + "loss": 0.3527, + "step": 7891 + }, + { + "epoch": 1.9849094567404426, + "grad_norm": 0.29158785939216614, + "learning_rate": 3.1006172308005012e-06, + "loss": 0.3387, + "step": 7892 + }, + { + "epoch": 1.9851609657947686, + "grad_norm": 0.308321475982666, + "learning_rate": 3.099263753681714e-06, + "loss": 0.3492, + "step": 7893 + }, + { + "epoch": 1.9854124748490944, + "grad_norm": 0.2938082218170166, + "learning_rate": 3.0979104393490965e-06, + "loss": 0.3443, + "step": 7894 + }, + { + "epoch": 1.9856639839034205, + "grad_norm": 0.30831268429756165, + "learning_rate": 3.0965572879185495e-06, + "loss": 0.354, + "step": 7895 + }, + { + "epoch": 1.9859154929577465, + "grad_norm": 0.32580068707466125, + "learning_rate": 3.095204299505965e-06, + "loss": 0.3326, + "step": 7896 + }, + { + "epoch": 1.9861670020120723, + "grad_norm": 0.2983664572238922, + "learning_rate": 3.093851474227213e-06, + "loss": 0.3367, + "step": 7897 + }, + { + "epoch": 1.9864185110663986, + "grad_norm": 0.2757643759250641, + "learning_rate": 3.0924988121981604e-06, + "loss": 0.3575, + "step": 7898 + }, + { + "epoch": 1.9866700201207244, + "grad_norm": 0.2955786883831024, + "learning_rate": 3.0911463135346486e-06, + "loss": 0.3523, + "step": 7899 + }, + { + "epoch": 1.9869215291750502, + "grad_norm": 0.285170316696167, + "learning_rate": 3.0897939783525156e-06, + "loss": 0.3152, + "step": 7900 + }, + { + "epoch": 1.9871730382293764, + "grad_norm": 0.28338173031806946, + "learning_rate": 3.0884418067675755e-06, + "loss": 0.3367, + "step": 7901 + }, + { + "epoch": 1.9874245472837022, + "grad_norm": 0.3066754937171936, + "learning_rate": 3.0870897988956362e-06, + "loss": 0.3347, + "step": 7902 + }, + { + "epoch": 1.987676056338028, + "grad_norm": 0.30550864338874817, + "learning_rate": 3.0857379548524914e-06, + "loss": 0.3358, + "step": 7903 + }, + { + "epoch": 1.9879275653923543, + "grad_norm": 0.32733213901519775, + "learning_rate": 3.0843862747539123e-06, + "loss": 0.3602, + "step": 7904 + }, + { + "epoch": 1.98817907444668, + "grad_norm": 0.2825915813446045, + "learning_rate": 3.0830347587156667e-06, + "loss": 0.3433, + "step": 7905 + }, + { + "epoch": 1.9884305835010059, + "grad_norm": 0.3268314003944397, + "learning_rate": 3.0816834068534994e-06, + "loss": 0.3502, + "step": 7906 + }, + { + "epoch": 1.9886820925553321, + "grad_norm": 0.3060625195503235, + "learning_rate": 3.0803322192831496e-06, + "loss": 0.3397, + "step": 7907 + }, + { + "epoch": 1.988933601609658, + "grad_norm": 0.28180480003356934, + "learning_rate": 3.0789811961203342e-06, + "loss": 0.3592, + "step": 7908 + }, + { + "epoch": 1.989185110663984, + "grad_norm": 0.28377094864845276, + "learning_rate": 3.077630337480764e-06, + "loss": 0.3577, + "step": 7909 + }, + { + "epoch": 1.98943661971831, + "grad_norm": 0.28484049439430237, + "learning_rate": 3.076279643480126e-06, + "loss": 0.3419, + "step": 7910 + }, + { + "epoch": 1.9896881287726358, + "grad_norm": 0.3092370927333832, + "learning_rate": 3.0749291142341037e-06, + "loss": 0.3364, + "step": 7911 + }, + { + "epoch": 1.9899396378269618, + "grad_norm": 0.29064294695854187, + "learning_rate": 3.073578749858358e-06, + "loss": 0.3469, + "step": 7912 + }, + { + "epoch": 1.9901911468812878, + "grad_norm": 0.2736586630344391, + "learning_rate": 3.0722285504685405e-06, + "loss": 0.3434, + "step": 7913 + }, + { + "epoch": 1.9904426559356136, + "grad_norm": 0.3202298581600189, + "learning_rate": 3.0708785161802902e-06, + "loss": 0.3541, + "step": 7914 + }, + { + "epoch": 1.9906941649899397, + "grad_norm": 0.2941378355026245, + "learning_rate": 3.0695286471092235e-06, + "loss": 0.3528, + "step": 7915 + }, + { + "epoch": 1.9909456740442657, + "grad_norm": 0.28272390365600586, + "learning_rate": 3.0681789433709535e-06, + "loss": 0.3324, + "step": 7916 + }, + { + "epoch": 1.9911971830985915, + "grad_norm": 0.3015698492527008, + "learning_rate": 3.06682940508107e-06, + "loss": 0.3348, + "step": 7917 + }, + { + "epoch": 1.9914486921529175, + "grad_norm": 0.3570209741592407, + "learning_rate": 3.065480032355156e-06, + "loss": 0.3712, + "step": 7918 + }, + { + "epoch": 1.9917002012072436, + "grad_norm": 0.29426610469818115, + "learning_rate": 3.0641308253087722e-06, + "loss": 0.3465, + "step": 7919 + }, + { + "epoch": 1.9919517102615694, + "grad_norm": 0.3420696258544922, + "learning_rate": 3.0627817840574747e-06, + "loss": 0.3412, + "step": 7920 + }, + { + "epoch": 1.9922032193158954, + "grad_norm": 0.3334300220012665, + "learning_rate": 3.061432908716797e-06, + "loss": 0.3811, + "step": 7921 + }, + { + "epoch": 1.9924547283702214, + "grad_norm": 0.3178221881389618, + "learning_rate": 3.0600841994022645e-06, + "loss": 0.3462, + "step": 7922 + }, + { + "epoch": 1.9927062374245472, + "grad_norm": 0.2942996919155121, + "learning_rate": 3.058735656229382e-06, + "loss": 0.3455, + "step": 7923 + }, + { + "epoch": 1.9929577464788732, + "grad_norm": 0.3083072006702423, + "learning_rate": 3.057387279313646e-06, + "loss": 0.353, + "step": 7924 + }, + { + "epoch": 1.9932092555331993, + "grad_norm": 0.30187562108039856, + "learning_rate": 3.056039068770539e-06, + "loss": 0.3473, + "step": 7925 + }, + { + "epoch": 1.993460764587525, + "grad_norm": 0.3180212080478668, + "learning_rate": 3.0546910247155224e-06, + "loss": 0.34, + "step": 7926 + }, + { + "epoch": 1.993712273641851, + "grad_norm": 0.3074866235256195, + "learning_rate": 3.053343147264052e-06, + "loss": 0.3273, + "step": 7927 + }, + { + "epoch": 1.9939637826961771, + "grad_norm": 0.3176601231098175, + "learning_rate": 3.0519954365315595e-06, + "loss": 0.3411, + "step": 7928 + }, + { + "epoch": 1.994215291750503, + "grad_norm": 0.31003740429878235, + "learning_rate": 3.050647892633474e-06, + "loss": 0.3533, + "step": 7929 + }, + { + "epoch": 1.994466800804829, + "grad_norm": 0.2907339632511139, + "learning_rate": 3.0493005156851997e-06, + "loss": 0.3365, + "step": 7930 + }, + { + "epoch": 1.994718309859155, + "grad_norm": 0.31127139925956726, + "learning_rate": 3.0479533058021345e-06, + "loss": 0.3386, + "step": 7931 + }, + { + "epoch": 1.9949698189134808, + "grad_norm": 0.30205103754997253, + "learning_rate": 3.046606263099654e-06, + "loss": 0.3306, + "step": 7932 + }, + { + "epoch": 1.9952213279678068, + "grad_norm": 0.30589067935943604, + "learning_rate": 3.0452593876931296e-06, + "loss": 0.378, + "step": 7933 + }, + { + "epoch": 1.9954728370221329, + "grad_norm": 0.3329513669013977, + "learning_rate": 3.0439126796979074e-06, + "loss": 0.3734, + "step": 7934 + }, + { + "epoch": 1.9957243460764587, + "grad_norm": 0.2932153046131134, + "learning_rate": 3.042566139229327e-06, + "loss": 0.3653, + "step": 7935 + }, + { + "epoch": 1.9959758551307847, + "grad_norm": 0.2880392074584961, + "learning_rate": 3.041219766402713e-06, + "loss": 0.3714, + "step": 7936 + }, + { + "epoch": 1.9962273641851107, + "grad_norm": 0.31852367520332336, + "learning_rate": 3.03987356133337e-06, + "loss": 0.3305, + "step": 7937 + }, + { + "epoch": 1.9964788732394365, + "grad_norm": 0.3372204899787903, + "learning_rate": 3.0385275241365965e-06, + "loss": 0.334, + "step": 7938 + }, + { + "epoch": 1.9967303822937625, + "grad_norm": 0.3119423985481262, + "learning_rate": 3.0371816549276667e-06, + "loss": 0.3488, + "step": 7939 + }, + { + "epoch": 1.9969818913480886, + "grad_norm": 0.2920767366886139, + "learning_rate": 3.035835953821851e-06, + "loss": 0.3238, + "step": 7940 + }, + { + "epoch": 1.9972334004024144, + "grad_norm": 0.3219856321811676, + "learning_rate": 3.0344904209343962e-06, + "loss": 0.3492, + "step": 7941 + }, + { + "epoch": 1.9974849094567404, + "grad_norm": 0.2855657935142517, + "learning_rate": 3.0331450563805433e-06, + "loss": 0.3, + "step": 7942 + }, + { + "epoch": 1.9977364185110664, + "grad_norm": 0.3001307547092438, + "learning_rate": 3.0317998602755087e-06, + "loss": 0.3469, + "step": 7943 + }, + { + "epoch": 1.9979879275653922, + "grad_norm": 0.29662996530532837, + "learning_rate": 3.0304548327345056e-06, + "loss": 0.3425, + "step": 7944 + }, + { + "epoch": 1.9982394366197183, + "grad_norm": 0.28861871361732483, + "learning_rate": 3.0291099738727226e-06, + "loss": 0.3553, + "step": 7945 + }, + { + "epoch": 1.9984909456740443, + "grad_norm": 0.30145004391670227, + "learning_rate": 3.0277652838053416e-06, + "loss": 0.3693, + "step": 7946 + }, + { + "epoch": 1.99874245472837, + "grad_norm": 0.32663676142692566, + "learning_rate": 3.0264207626475254e-06, + "loss": 0.343, + "step": 7947 + }, + { + "epoch": 1.9989939637826963, + "grad_norm": 0.3272598385810852, + "learning_rate": 3.025076410514425e-06, + "loss": 0.3422, + "step": 7948 + }, + { + "epoch": 1.9992454728370221, + "grad_norm": 0.3103751540184021, + "learning_rate": 3.023732227521174e-06, + "loss": 0.3515, + "step": 7949 + }, + { + "epoch": 1.999496981891348, + "grad_norm": 0.297633558511734, + "learning_rate": 3.0223882137828947e-06, + "loss": 0.3398, + "step": 7950 + }, + { + "epoch": 1.9997484909456742, + "grad_norm": 0.3038352131843567, + "learning_rate": 3.021044369414693e-06, + "loss": 0.3405, + "step": 7951 + }, + { + "epoch": 2.0, + "grad_norm": 0.3005836606025696, + "learning_rate": 3.0197006945316604e-06, + "loss": 0.3303, + "step": 7952 + }, + { + "epoch": 2.000251509054326, + "grad_norm": 0.3221258819103241, + "learning_rate": 3.018357189248875e-06, + "loss": 0.3005, + "step": 7953 + }, + { + "epoch": 2.000503018108652, + "grad_norm": 0.31759047508239746, + "learning_rate": 3.0170138536813984e-06, + "loss": 0.3198, + "step": 7954 + }, + { + "epoch": 2.000754527162978, + "grad_norm": 0.327646940946579, + "learning_rate": 3.015670687944281e-06, + "loss": 0.3294, + "step": 7955 + }, + { + "epoch": 2.0010060362173037, + "grad_norm": 0.2987602949142456, + "learning_rate": 3.014327692152554e-06, + "loss": 0.3277, + "step": 7956 + }, + { + "epoch": 2.00125754527163, + "grad_norm": 0.31388530135154724, + "learning_rate": 3.012984866421238e-06, + "loss": 0.333, + "step": 7957 + }, + { + "epoch": 2.0015090543259557, + "grad_norm": 0.29405900835990906, + "learning_rate": 3.0116422108653387e-06, + "loss": 0.3272, + "step": 7958 + }, + { + "epoch": 2.0017605633802815, + "grad_norm": 0.3055470287799835, + "learning_rate": 3.0102997255998433e-06, + "loss": 0.3186, + "step": 7959 + }, + { + "epoch": 2.0020120724346078, + "grad_norm": 0.308438777923584, + "learning_rate": 3.0089574107397306e-06, + "loss": 0.3175, + "step": 7960 + }, + { + "epoch": 2.0022635814889336, + "grad_norm": 0.3100303113460541, + "learning_rate": 3.007615266399958e-06, + "loss": 0.2978, + "step": 7961 + }, + { + "epoch": 2.0025150905432594, + "grad_norm": 0.28166472911834717, + "learning_rate": 3.006273292695475e-06, + "loss": 0.3129, + "step": 7962 + }, + { + "epoch": 2.0027665995975856, + "grad_norm": 0.2950986921787262, + "learning_rate": 3.0049314897412106e-06, + "loss": 0.3231, + "step": 7963 + }, + { + "epoch": 2.0030181086519114, + "grad_norm": 0.3130662441253662, + "learning_rate": 3.0035898576520844e-06, + "loss": 0.3189, + "step": 7964 + }, + { + "epoch": 2.0032696177062372, + "grad_norm": 0.3303288221359253, + "learning_rate": 3.002248396542996e-06, + "loss": 0.3069, + "step": 7965 + }, + { + "epoch": 2.0035211267605635, + "grad_norm": 0.30254948139190674, + "learning_rate": 3.000907106528836e-06, + "loss": 0.3085, + "step": 7966 + }, + { + "epoch": 2.0037726358148893, + "grad_norm": 0.3039596676826477, + "learning_rate": 2.9995659877244736e-06, + "loss": 0.3117, + "step": 7967 + }, + { + "epoch": 2.004024144869215, + "grad_norm": 0.30226194858551025, + "learning_rate": 2.9982250402447706e-06, + "loss": 0.3352, + "step": 7968 + }, + { + "epoch": 2.0042756539235413, + "grad_norm": 0.29361942410469055, + "learning_rate": 2.9968842642045713e-06, + "loss": 0.332, + "step": 7969 + }, + { + "epoch": 2.004527162977867, + "grad_norm": 0.2824215888977051, + "learning_rate": 2.9955436597187016e-06, + "loss": 0.3018, + "step": 7970 + }, + { + "epoch": 2.004778672032193, + "grad_norm": 0.3032889664173126, + "learning_rate": 2.9942032269019792e-06, + "loss": 0.3291, + "step": 7971 + }, + { + "epoch": 2.005030181086519, + "grad_norm": 0.2901354432106018, + "learning_rate": 2.9928629658692006e-06, + "loss": 0.2918, + "step": 7972 + }, + { + "epoch": 2.005281690140845, + "grad_norm": 0.3217187225818634, + "learning_rate": 2.991522876735154e-06, + "loss": 0.3387, + "step": 7973 + }, + { + "epoch": 2.005533199195171, + "grad_norm": 0.326134592294693, + "learning_rate": 2.9901829596146057e-06, + "loss": 0.3327, + "step": 7974 + }, + { + "epoch": 2.005784708249497, + "grad_norm": 0.30647000670433044, + "learning_rate": 2.9888432146223167e-06, + "loss": 0.3048, + "step": 7975 + }, + { + "epoch": 2.006036217303823, + "grad_norm": 0.2938840389251709, + "learning_rate": 2.9875036418730218e-06, + "loss": 0.3367, + "step": 7976 + }, + { + "epoch": 2.006287726358149, + "grad_norm": 0.3004414737224579, + "learning_rate": 2.9861642414814502e-06, + "loss": 0.3116, + "step": 7977 + }, + { + "epoch": 2.006539235412475, + "grad_norm": 0.3037022650241852, + "learning_rate": 2.984825013562315e-06, + "loss": 0.3275, + "step": 7978 + }, + { + "epoch": 2.0067907444668007, + "grad_norm": 0.3147777318954468, + "learning_rate": 2.983485958230308e-06, + "loss": 0.3348, + "step": 7979 + }, + { + "epoch": 2.007042253521127, + "grad_norm": 0.30514058470726013, + "learning_rate": 2.9821470756001148e-06, + "loss": 0.3299, + "step": 7980 + }, + { + "epoch": 2.0072937625754528, + "grad_norm": 0.33660414814949036, + "learning_rate": 2.9808083657863994e-06, + "loss": 0.3422, + "step": 7981 + }, + { + "epoch": 2.0075452716297786, + "grad_norm": 0.29854559898376465, + "learning_rate": 2.9794698289038183e-06, + "loss": 0.319, + "step": 7982 + }, + { + "epoch": 2.007796780684105, + "grad_norm": 0.31165847182273865, + "learning_rate": 2.9781314650670033e-06, + "loss": 0.3134, + "step": 7983 + }, + { + "epoch": 2.0080482897384306, + "grad_norm": 0.3158718943595886, + "learning_rate": 2.9767932743905813e-06, + "loss": 0.3041, + "step": 7984 + }, + { + "epoch": 2.0082997987927564, + "grad_norm": 0.3166303336620331, + "learning_rate": 2.9754552569891566e-06, + "loss": 0.3155, + "step": 7985 + }, + { + "epoch": 2.0085513078470827, + "grad_norm": 0.3297767639160156, + "learning_rate": 2.9741174129773253e-06, + "loss": 0.3683, + "step": 7986 + }, + { + "epoch": 2.0088028169014085, + "grad_norm": 0.2979738414287567, + "learning_rate": 2.972779742469662e-06, + "loss": 0.2971, + "step": 7987 + }, + { + "epoch": 2.0090543259557343, + "grad_norm": 0.2994316518306732, + "learning_rate": 2.971442245580731e-06, + "loss": 0.3391, + "step": 7988 + }, + { + "epoch": 2.0093058350100605, + "grad_norm": 0.31496062874794006, + "learning_rate": 2.970104922425084e-06, + "loss": 0.3329, + "step": 7989 + }, + { + "epoch": 2.0095573440643864, + "grad_norm": 0.3337770700454712, + "learning_rate": 2.9687677731172486e-06, + "loss": 0.3214, + "step": 7990 + }, + { + "epoch": 2.009808853118712, + "grad_norm": 0.31632599234580994, + "learning_rate": 2.9674307977717486e-06, + "loss": 0.3217, + "step": 7991 + }, + { + "epoch": 2.0100603621730384, + "grad_norm": 0.33396008610725403, + "learning_rate": 2.9660939965030826e-06, + "loss": 0.3301, + "step": 7992 + }, + { + "epoch": 2.010311871227364, + "grad_norm": 0.3123500347137451, + "learning_rate": 2.9647573694257436e-06, + "loss": 0.3126, + "step": 7993 + }, + { + "epoch": 2.01056338028169, + "grad_norm": 0.31851232051849365, + "learning_rate": 2.963420916654202e-06, + "loss": 0.351, + "step": 7994 + }, + { + "epoch": 2.0108148893360163, + "grad_norm": 0.31659290194511414, + "learning_rate": 2.96208463830292e-06, + "loss": 0.3098, + "step": 7995 + }, + { + "epoch": 2.011066398390342, + "grad_norm": 0.31261491775512695, + "learning_rate": 2.9607485344863375e-06, + "loss": 0.3452, + "step": 7996 + }, + { + "epoch": 2.011317907444668, + "grad_norm": 0.3377598822116852, + "learning_rate": 2.9594126053188874e-06, + "loss": 0.3072, + "step": 7997 + }, + { + "epoch": 2.011569416498994, + "grad_norm": 0.3177874684333801, + "learning_rate": 2.95807685091498e-06, + "loss": 0.3158, + "step": 7998 + }, + { + "epoch": 2.01182092555332, + "grad_norm": 0.3485296368598938, + "learning_rate": 2.9567412713890163e-06, + "loss": 0.3123, + "step": 7999 + }, + { + "epoch": 2.0120724346076457, + "grad_norm": 0.3071430027484894, + "learning_rate": 2.955405866855381e-06, + "loss": 0.3191, + "step": 8000 + }, + { + "epoch": 2.012323943661972, + "grad_norm": 0.30622658133506775, + "learning_rate": 2.9540706374284423e-06, + "loss": 0.3353, + "step": 8001 + }, + { + "epoch": 2.012575452716298, + "grad_norm": 0.32716530561447144, + "learning_rate": 2.9527355832225542e-06, + "loss": 0.3129, + "step": 8002 + }, + { + "epoch": 2.0128269617706236, + "grad_norm": 0.31558316946029663, + "learning_rate": 2.9514007043520555e-06, + "loss": 0.3257, + "step": 8003 + }, + { + "epoch": 2.01307847082495, + "grad_norm": 0.3272193968296051, + "learning_rate": 2.9500660009312698e-06, + "loss": 0.3263, + "step": 8004 + }, + { + "epoch": 2.0133299798792756, + "grad_norm": 0.30284756422042847, + "learning_rate": 2.9487314730745075e-06, + "loss": 0.3047, + "step": 8005 + }, + { + "epoch": 2.0135814889336014, + "grad_norm": 0.3058840036392212, + "learning_rate": 2.947397120896062e-06, + "loss": 0.3387, + "step": 8006 + }, + { + "epoch": 2.0138329979879277, + "grad_norm": 0.31466108560562134, + "learning_rate": 2.9460629445102106e-06, + "loss": 0.3362, + "step": 8007 + }, + { + "epoch": 2.0140845070422535, + "grad_norm": 0.3358847200870514, + "learning_rate": 2.944728944031221e-06, + "loss": 0.3371, + "step": 8008 + }, + { + "epoch": 2.0143360160965793, + "grad_norm": 0.28841134905815125, + "learning_rate": 2.9433951195733374e-06, + "loss": 0.302, + "step": 8009 + }, + { + "epoch": 2.0145875251509056, + "grad_norm": 0.32336318492889404, + "learning_rate": 2.9420614712507966e-06, + "loss": 0.3229, + "step": 8010 + }, + { + "epoch": 2.0148390342052314, + "grad_norm": 0.3032439053058624, + "learning_rate": 2.940727999177817e-06, + "loss": 0.3102, + "step": 8011 + }, + { + "epoch": 2.015090543259557, + "grad_norm": 0.29863134026527405, + "learning_rate": 2.939394703468601e-06, + "loss": 0.3196, + "step": 8012 + }, + { + "epoch": 2.0153420523138834, + "grad_norm": 0.30820432305336, + "learning_rate": 2.9380615842373372e-06, + "loss": 0.3429, + "step": 8013 + }, + { + "epoch": 2.015593561368209, + "grad_norm": 0.32752588391304016, + "learning_rate": 2.9367286415982e-06, + "loss": 0.3339, + "step": 8014 + }, + { + "epoch": 2.015845070422535, + "grad_norm": 0.3171319365501404, + "learning_rate": 2.935395875665346e-06, + "loss": 0.3269, + "step": 8015 + }, + { + "epoch": 2.0160965794768613, + "grad_norm": 0.3042650520801544, + "learning_rate": 2.9340632865529194e-06, + "loss": 0.3174, + "step": 8016 + }, + { + "epoch": 2.016348088531187, + "grad_norm": 0.3020657002925873, + "learning_rate": 2.9327308743750483e-06, + "loss": 0.3195, + "step": 8017 + }, + { + "epoch": 2.016599597585513, + "grad_norm": 0.298667311668396, + "learning_rate": 2.931398639245845e-06, + "loss": 0.3408, + "step": 8018 + }, + { + "epoch": 2.016851106639839, + "grad_norm": 0.30726972222328186, + "learning_rate": 2.9300665812794073e-06, + "loss": 0.2885, + "step": 8019 + }, + { + "epoch": 2.017102615694165, + "grad_norm": 0.2989729642868042, + "learning_rate": 2.9287347005898162e-06, + "loss": 0.3166, + "step": 8020 + }, + { + "epoch": 2.0173541247484907, + "grad_norm": 0.305683970451355, + "learning_rate": 2.9274029972911404e-06, + "loss": 0.3279, + "step": 8021 + }, + { + "epoch": 2.017605633802817, + "grad_norm": 0.30576571822166443, + "learning_rate": 2.926071471497434e-06, + "loss": 0.3285, + "step": 8022 + }, + { + "epoch": 2.017857142857143, + "grad_norm": 0.3315543830394745, + "learning_rate": 2.9247401233227285e-06, + "loss": 0.336, + "step": 8023 + }, + { + "epoch": 2.0181086519114686, + "grad_norm": 0.3203072249889374, + "learning_rate": 2.923408952881051e-06, + "loss": 0.3441, + "step": 8024 + }, + { + "epoch": 2.018360160965795, + "grad_norm": 0.3002256751060486, + "learning_rate": 2.9220779602864035e-06, + "loss": 0.3027, + "step": 8025 + }, + { + "epoch": 2.0186116700201207, + "grad_norm": 0.3109540641307831, + "learning_rate": 2.920747145652782e-06, + "loss": 0.3432, + "step": 8026 + }, + { + "epoch": 2.0188631790744465, + "grad_norm": 0.3139438033103943, + "learning_rate": 2.9194165090941575e-06, + "loss": 0.321, + "step": 8027 + }, + { + "epoch": 2.0191146881287727, + "grad_norm": 0.32630038261413574, + "learning_rate": 2.9180860507244936e-06, + "loss": 0.3284, + "step": 8028 + }, + { + "epoch": 2.0193661971830985, + "grad_norm": 0.31174954771995544, + "learning_rate": 2.916755770657733e-06, + "loss": 0.2986, + "step": 8029 + }, + { + "epoch": 2.0196177062374248, + "grad_norm": 0.3617261052131653, + "learning_rate": 2.915425669007812e-06, + "loss": 0.323, + "step": 8030 + }, + { + "epoch": 2.0198692152917506, + "grad_norm": 0.28990018367767334, + "learning_rate": 2.914095745888638e-06, + "loss": 0.3193, + "step": 8031 + }, + { + "epoch": 2.0201207243460764, + "grad_norm": 0.2929520905017853, + "learning_rate": 2.912766001414116e-06, + "loss": 0.3319, + "step": 8032 + }, + { + "epoch": 2.0203722334004026, + "grad_norm": 0.3127540647983551, + "learning_rate": 2.9114364356981274e-06, + "loss": 0.3179, + "step": 8033 + }, + { + "epoch": 2.0206237424547284, + "grad_norm": 0.3118465840816498, + "learning_rate": 2.9101070488545424e-06, + "loss": 0.3215, + "step": 8034 + }, + { + "epoch": 2.0208752515090542, + "grad_norm": 0.31171420216560364, + "learning_rate": 2.9087778409972132e-06, + "loss": 0.3351, + "step": 8035 + }, + { + "epoch": 2.0211267605633805, + "grad_norm": 0.3322293758392334, + "learning_rate": 2.9074488122399813e-06, + "loss": 0.3329, + "step": 8036 + }, + { + "epoch": 2.0213782696177063, + "grad_norm": 0.28532707691192627, + "learning_rate": 2.906119962696666e-06, + "loss": 0.3257, + "step": 8037 + }, + { + "epoch": 2.021629778672032, + "grad_norm": 0.2843002676963806, + "learning_rate": 2.9047912924810786e-06, + "loss": 0.3388, + "step": 8038 + }, + { + "epoch": 2.0218812877263583, + "grad_norm": 0.3153465688228607, + "learning_rate": 2.9034628017070064e-06, + "loss": 0.3385, + "step": 8039 + }, + { + "epoch": 2.022132796780684, + "grad_norm": 0.29112154245376587, + "learning_rate": 2.9021344904882324e-06, + "loss": 0.3294, + "step": 8040 + }, + { + "epoch": 2.02238430583501, + "grad_norm": 0.307882696390152, + "learning_rate": 2.9008063589385127e-06, + "loss": 0.2989, + "step": 8041 + }, + { + "epoch": 2.022635814889336, + "grad_norm": 0.3012144863605499, + "learning_rate": 2.899478407171598e-06, + "loss": 0.3385, + "step": 8042 + }, + { + "epoch": 2.022887323943662, + "grad_norm": 0.26836854219436646, + "learning_rate": 2.8981506353012145e-06, + "loss": 0.3189, + "step": 8043 + }, + { + "epoch": 2.023138832997988, + "grad_norm": 0.29367396235466003, + "learning_rate": 2.896823043441083e-06, + "loss": 0.3315, + "step": 8044 + }, + { + "epoch": 2.023390342052314, + "grad_norm": 0.29954785108566284, + "learning_rate": 2.895495631704898e-06, + "loss": 0.3305, + "step": 8045 + }, + { + "epoch": 2.02364185110664, + "grad_norm": 0.28079143166542053, + "learning_rate": 2.8941684002063473e-06, + "loss": 0.3255, + "step": 8046 + }, + { + "epoch": 2.0238933601609657, + "grad_norm": 0.28768080472946167, + "learning_rate": 2.892841349059098e-06, + "loss": 0.3205, + "step": 8047 + }, + { + "epoch": 2.024144869215292, + "grad_norm": 0.29650044441223145, + "learning_rate": 2.8915144783768047e-06, + "loss": 0.3387, + "step": 8048 + }, + { + "epoch": 2.0243963782696177, + "grad_norm": 0.2993822991847992, + "learning_rate": 2.8901877882731076e-06, + "loss": 0.293, + "step": 8049 + }, + { + "epoch": 2.0246478873239435, + "grad_norm": 0.29370445013046265, + "learning_rate": 2.8888612788616256e-06, + "loss": 0.3055, + "step": 8050 + }, + { + "epoch": 2.0248993963782698, + "grad_norm": 0.29673752188682556, + "learning_rate": 2.887534950255969e-06, + "loss": 0.2896, + "step": 8051 + }, + { + "epoch": 2.0251509054325956, + "grad_norm": 0.2987542748451233, + "learning_rate": 2.886208802569728e-06, + "loss": 0.31, + "step": 8052 + }, + { + "epoch": 2.0254024144869214, + "grad_norm": 0.30512362718582153, + "learning_rate": 2.8848828359164797e-06, + "loss": 0.3223, + "step": 8053 + }, + { + "epoch": 2.0256539235412476, + "grad_norm": 0.3173840343952179, + "learning_rate": 2.883557050409783e-06, + "loss": 0.3265, + "step": 8054 + }, + { + "epoch": 2.0259054325955734, + "grad_norm": 0.3171699047088623, + "learning_rate": 2.882231446163187e-06, + "loss": 0.3339, + "step": 8055 + }, + { + "epoch": 2.0261569416498992, + "grad_norm": 0.3157178461551666, + "learning_rate": 2.8809060232902165e-06, + "loss": 0.3051, + "step": 8056 + }, + { + "epoch": 2.0264084507042255, + "grad_norm": 0.3298434615135193, + "learning_rate": 2.8795807819043898e-06, + "loss": 0.3176, + "step": 8057 + }, + { + "epoch": 2.0266599597585513, + "grad_norm": 0.290964812040329, + "learning_rate": 2.878255722119202e-06, + "loss": 0.3387, + "step": 8058 + }, + { + "epoch": 2.026911468812877, + "grad_norm": 0.3040768504142761, + "learning_rate": 2.87693084404814e-06, + "loss": 0.3093, + "step": 8059 + }, + { + "epoch": 2.0271629778672033, + "grad_norm": 0.3241004943847656, + "learning_rate": 2.875606147804667e-06, + "loss": 0.313, + "step": 8060 + }, + { + "epoch": 2.027414486921529, + "grad_norm": 0.296888142824173, + "learning_rate": 2.87428163350224e-06, + "loss": 0.3126, + "step": 8061 + }, + { + "epoch": 2.027665995975855, + "grad_norm": 0.29444047808647156, + "learning_rate": 2.87295730125429e-06, + "loss": 0.3275, + "step": 8062 + }, + { + "epoch": 2.027917505030181, + "grad_norm": 0.2897621989250183, + "learning_rate": 2.871633151174243e-06, + "loss": 0.3072, + "step": 8063 + }, + { + "epoch": 2.028169014084507, + "grad_norm": 0.31098416447639465, + "learning_rate": 2.8703091833754993e-06, + "loss": 0.3159, + "step": 8064 + }, + { + "epoch": 2.028420523138833, + "grad_norm": 0.2953088581562042, + "learning_rate": 2.8689853979714505e-06, + "loss": 0.314, + "step": 8065 + }, + { + "epoch": 2.028672032193159, + "grad_norm": 0.2950742840766907, + "learning_rate": 2.8676617950754733e-06, + "loss": 0.3236, + "step": 8066 + }, + { + "epoch": 2.028923541247485, + "grad_norm": 0.29954662919044495, + "learning_rate": 2.866338374800921e-06, + "loss": 0.3318, + "step": 8067 + }, + { + "epoch": 2.0291750503018107, + "grad_norm": 0.28929510712623596, + "learning_rate": 2.8650151372611414e-06, + "loss": 0.341, + "step": 8068 + }, + { + "epoch": 2.029426559356137, + "grad_norm": 0.32224392890930176, + "learning_rate": 2.8636920825694557e-06, + "loss": 0.3236, + "step": 8069 + }, + { + "epoch": 2.0296780684104627, + "grad_norm": 0.2966301739215851, + "learning_rate": 2.8623692108391808e-06, + "loss": 0.3327, + "step": 8070 + }, + { + "epoch": 2.0299295774647885, + "grad_norm": 0.2904188930988312, + "learning_rate": 2.8610465221836094e-06, + "loss": 0.3303, + "step": 8071 + }, + { + "epoch": 2.0301810865191148, + "grad_norm": 0.3125072419643402, + "learning_rate": 2.859724016716022e-06, + "loss": 0.3256, + "step": 8072 + }, + { + "epoch": 2.0304325955734406, + "grad_norm": 0.3028983771800995, + "learning_rate": 2.858401694549683e-06, + "loss": 0.3228, + "step": 8073 + }, + { + "epoch": 2.0306841046277664, + "grad_norm": 0.30204835534095764, + "learning_rate": 2.8570795557978413e-06, + "loss": 0.3266, + "step": 8074 + }, + { + "epoch": 2.0309356136820926, + "grad_norm": 0.29210224747657776, + "learning_rate": 2.8557576005737286e-06, + "loss": 0.3212, + "step": 8075 + }, + { + "epoch": 2.0311871227364184, + "grad_norm": 0.32311296463012695, + "learning_rate": 2.854435828990563e-06, + "loss": 0.3015, + "step": 8076 + }, + { + "epoch": 2.0314386317907447, + "grad_norm": 0.3028305470943451, + "learning_rate": 2.853114241161549e-06, + "loss": 0.3349, + "step": 8077 + }, + { + "epoch": 2.0316901408450705, + "grad_norm": 0.3027438223361969, + "learning_rate": 2.851792837199866e-06, + "loss": 0.3319, + "step": 8078 + }, + { + "epoch": 2.0319416498993963, + "grad_norm": 0.30897650122642517, + "learning_rate": 2.85047161721869e-06, + "loss": 0.3249, + "step": 8079 + }, + { + "epoch": 2.0321931589537225, + "grad_norm": 0.30179494619369507, + "learning_rate": 2.849150581331169e-06, + "loss": 0.3486, + "step": 8080 + }, + { + "epoch": 2.0324446680080483, + "grad_norm": 0.3074062466621399, + "learning_rate": 2.8478297296504487e-06, + "loss": 0.3201, + "step": 8081 + }, + { + "epoch": 2.032696177062374, + "grad_norm": 0.29912763833999634, + "learning_rate": 2.846509062289646e-06, + "loss": 0.3105, + "step": 8082 + }, + { + "epoch": 2.0329476861167004, + "grad_norm": 0.32092058658599854, + "learning_rate": 2.8451885793618716e-06, + "loss": 0.3287, + "step": 8083 + }, + { + "epoch": 2.033199195171026, + "grad_norm": 0.3172852694988251, + "learning_rate": 2.8438682809802133e-06, + "loss": 0.3266, + "step": 8084 + }, + { + "epoch": 2.033450704225352, + "grad_norm": 0.2866196036338806, + "learning_rate": 2.8425481672577494e-06, + "loss": 0.3152, + "step": 8085 + }, + { + "epoch": 2.0337022132796783, + "grad_norm": 0.3131830394268036, + "learning_rate": 2.8412282383075362e-06, + "loss": 0.3061, + "step": 8086 + }, + { + "epoch": 2.033953722334004, + "grad_norm": 0.308741956949234, + "learning_rate": 2.8399084942426193e-06, + "loss": 0.3321, + "step": 8087 + }, + { + "epoch": 2.03420523138833, + "grad_norm": 0.29598724842071533, + "learning_rate": 2.8385889351760283e-06, + "loss": 0.3332, + "step": 8088 + }, + { + "epoch": 2.034456740442656, + "grad_norm": 0.2867431640625, + "learning_rate": 2.8372695612207715e-06, + "loss": 0.3383, + "step": 8089 + }, + { + "epoch": 2.034708249496982, + "grad_norm": 0.30722376704216003, + "learning_rate": 2.8359503724898485e-06, + "loss": 0.3236, + "step": 8090 + }, + { + "epoch": 2.0349597585513077, + "grad_norm": 0.305992990732193, + "learning_rate": 2.8346313690962358e-06, + "loss": 0.3416, + "step": 8091 + }, + { + "epoch": 2.035211267605634, + "grad_norm": 0.30428627133369446, + "learning_rate": 2.8333125511529012e-06, + "loss": 0.3309, + "step": 8092 + }, + { + "epoch": 2.03546277665996, + "grad_norm": 0.3013574779033661, + "learning_rate": 2.8319939187727913e-06, + "loss": 0.3321, + "step": 8093 + }, + { + "epoch": 2.0357142857142856, + "grad_norm": 0.2905578315258026, + "learning_rate": 2.83067547206884e-06, + "loss": 0.2902, + "step": 8094 + }, + { + "epoch": 2.035965794768612, + "grad_norm": 0.3262609839439392, + "learning_rate": 2.8293572111539625e-06, + "loss": 0.3395, + "step": 8095 + }, + { + "epoch": 2.0362173038229376, + "grad_norm": 0.31515759229660034, + "learning_rate": 2.8280391361410614e-06, + "loss": 0.347, + "step": 8096 + }, + { + "epoch": 2.0364688128772634, + "grad_norm": 0.3180714547634125, + "learning_rate": 2.826721247143018e-06, + "loss": 0.3489, + "step": 8097 + }, + { + "epoch": 2.0367203219315897, + "grad_norm": 0.30598127841949463, + "learning_rate": 2.825403544272706e-06, + "loss": 0.3448, + "step": 8098 + }, + { + "epoch": 2.0369718309859155, + "grad_norm": 0.3095637261867523, + "learning_rate": 2.824086027642976e-06, + "loss": 0.3007, + "step": 8099 + }, + { + "epoch": 2.0372233400402413, + "grad_norm": 0.32037293910980225, + "learning_rate": 2.822768697366664e-06, + "loss": 0.3288, + "step": 8100 + }, + { + "epoch": 2.0374748490945676, + "grad_norm": 0.32354459166526794, + "learning_rate": 2.8214515535565946e-06, + "loss": 0.3091, + "step": 8101 + }, + { + "epoch": 2.0377263581488934, + "grad_norm": 0.28764376044273376, + "learning_rate": 2.820134596325568e-06, + "loss": 0.3336, + "step": 8102 + }, + { + "epoch": 2.037977867203219, + "grad_norm": 0.31268617510795593, + "learning_rate": 2.8188178257863784e-06, + "loss": 0.3147, + "step": 8103 + }, + { + "epoch": 2.0382293762575454, + "grad_norm": 0.3202211856842041, + "learning_rate": 2.8175012420517954e-06, + "loss": 0.3116, + "step": 8104 + }, + { + "epoch": 2.038480885311871, + "grad_norm": 0.32938629388809204, + "learning_rate": 2.8161848452345784e-06, + "loss": 0.3222, + "step": 8105 + }, + { + "epoch": 2.038732394366197, + "grad_norm": 0.31832754611968994, + "learning_rate": 2.8148686354474657e-06, + "loss": 0.3221, + "step": 8106 + }, + { + "epoch": 2.0389839034205233, + "grad_norm": 0.29280537366867065, + "learning_rate": 2.8135526128031864e-06, + "loss": 0.3317, + "step": 8107 + }, + { + "epoch": 2.039235412474849, + "grad_norm": 0.2781398594379425, + "learning_rate": 2.8122367774144454e-06, + "loss": 0.3139, + "step": 8108 + }, + { + "epoch": 2.039486921529175, + "grad_norm": 0.33072832226753235, + "learning_rate": 2.8109211293939376e-06, + "loss": 0.3226, + "step": 8109 + }, + { + "epoch": 2.039738430583501, + "grad_norm": 0.32349687814712524, + "learning_rate": 2.809605668854343e-06, + "loss": 0.3172, + "step": 8110 + }, + { + "epoch": 2.039989939637827, + "grad_norm": 0.2853679358959198, + "learning_rate": 2.8082903959083165e-06, + "loss": 0.3007, + "step": 8111 + }, + { + "epoch": 2.0402414486921527, + "grad_norm": 0.2898256778717041, + "learning_rate": 2.8069753106685093e-06, + "loss": 0.3371, + "step": 8112 + }, + { + "epoch": 2.040492957746479, + "grad_norm": 0.31160539388656616, + "learning_rate": 2.8056604132475445e-06, + "loss": 0.3333, + "step": 8113 + }, + { + "epoch": 2.040744466800805, + "grad_norm": 0.2927866578102112, + "learning_rate": 2.80434570375804e-06, + "loss": 0.307, + "step": 8114 + }, + { + "epoch": 2.0409959758551306, + "grad_norm": 0.34478503465652466, + "learning_rate": 2.8030311823125877e-06, + "loss": 0.3505, + "step": 8115 + }, + { + "epoch": 2.041247484909457, + "grad_norm": 0.29528501629829407, + "learning_rate": 2.8017168490237735e-06, + "loss": 0.328, + "step": 8116 + }, + { + "epoch": 2.0414989939637826, + "grad_norm": 0.30806538462638855, + "learning_rate": 2.8004027040041555e-06, + "loss": 0.3352, + "step": 8117 + }, + { + "epoch": 2.0417505030181085, + "grad_norm": 0.34178322553634644, + "learning_rate": 2.7990887473662875e-06, + "loss": 0.3195, + "step": 8118 + }, + { + "epoch": 2.0420020120724347, + "grad_norm": 0.3116021454334259, + "learning_rate": 2.7977749792226978e-06, + "loss": 0.3202, + "step": 8119 + }, + { + "epoch": 2.0422535211267605, + "grad_norm": 0.33535781502723694, + "learning_rate": 2.7964613996859037e-06, + "loss": 0.3406, + "step": 8120 + }, + { + "epoch": 2.0425050301810863, + "grad_norm": 0.3147551417350769, + "learning_rate": 2.795148008868408e-06, + "loss": 0.312, + "step": 8121 + }, + { + "epoch": 2.0427565392354126, + "grad_norm": 0.30535510182380676, + "learning_rate": 2.7938348068826893e-06, + "loss": 0.3357, + "step": 8122 + }, + { + "epoch": 2.0430080482897384, + "grad_norm": 0.27879422903060913, + "learning_rate": 2.79252179384122e-06, + "loss": 0.2919, + "step": 8123 + }, + { + "epoch": 2.043259557344064, + "grad_norm": 0.29287123680114746, + "learning_rate": 2.791208969856447e-06, + "loss": 0.3204, + "step": 8124 + }, + { + "epoch": 2.0435110663983904, + "grad_norm": 0.30926448106765747, + "learning_rate": 2.7898963350408093e-06, + "loss": 0.3161, + "step": 8125 + }, + { + "epoch": 2.0437625754527162, + "grad_norm": 0.29730355739593506, + "learning_rate": 2.788583889506722e-06, + "loss": 0.3248, + "step": 8126 + }, + { + "epoch": 2.044014084507042, + "grad_norm": 0.3108483850955963, + "learning_rate": 2.7872716333665928e-06, + "loss": 0.3357, + "step": 8127 + }, + { + "epoch": 2.0442655935613683, + "grad_norm": 0.3125215470790863, + "learning_rate": 2.7859595667328027e-06, + "loss": 0.3266, + "step": 8128 + }, + { + "epoch": 2.044517102615694, + "grad_norm": 0.31513240933418274, + "learning_rate": 2.784647689717725e-06, + "loss": 0.3317, + "step": 8129 + }, + { + "epoch": 2.0447686116700203, + "grad_norm": 0.344375342130661, + "learning_rate": 2.7833360024337152e-06, + "loss": 0.3385, + "step": 8130 + }, + { + "epoch": 2.045020120724346, + "grad_norm": 0.309125691652298, + "learning_rate": 2.782024504993108e-06, + "loss": 0.337, + "step": 8131 + }, + { + "epoch": 2.045271629778672, + "grad_norm": 0.3255023956298828, + "learning_rate": 2.780713197508228e-06, + "loss": 0.3483, + "step": 8132 + }, + { + "epoch": 2.045523138832998, + "grad_norm": 0.3013308644294739, + "learning_rate": 2.779402080091377e-06, + "loss": 0.3441, + "step": 8133 + }, + { + "epoch": 2.045774647887324, + "grad_norm": 0.34748101234436035, + "learning_rate": 2.778091152854847e-06, + "loss": 0.3485, + "step": 8134 + }, + { + "epoch": 2.04602615694165, + "grad_norm": 0.3031235337257385, + "learning_rate": 2.776780415910908e-06, + "loss": 0.3231, + "step": 8135 + }, + { + "epoch": 2.046277665995976, + "grad_norm": 0.3195328116416931, + "learning_rate": 2.7754698693718206e-06, + "loss": 0.308, + "step": 8136 + }, + { + "epoch": 2.046529175050302, + "grad_norm": 0.2759738266468048, + "learning_rate": 2.77415951334982e-06, + "loss": 0.3014, + "step": 8137 + }, + { + "epoch": 2.0467806841046277, + "grad_norm": 0.3147771656513214, + "learning_rate": 2.772849347957134e-06, + "loss": 0.3274, + "step": 8138 + }, + { + "epoch": 2.047032193158954, + "grad_norm": 0.2973617911338806, + "learning_rate": 2.7715393733059657e-06, + "loss": 0.3163, + "step": 8139 + }, + { + "epoch": 2.0472837022132797, + "grad_norm": 0.3099863529205322, + "learning_rate": 2.7702295895085097e-06, + "loss": 0.3241, + "step": 8140 + }, + { + "epoch": 2.0475352112676055, + "grad_norm": 0.2844668924808502, + "learning_rate": 2.768919996676942e-06, + "loss": 0.3176, + "step": 8141 + }, + { + "epoch": 2.0477867203219318, + "grad_norm": 0.34302541613578796, + "learning_rate": 2.7676105949234168e-06, + "loss": 0.3257, + "step": 8142 + }, + { + "epoch": 2.0480382293762576, + "grad_norm": 0.32527220249176025, + "learning_rate": 2.7663013843600805e-06, + "loss": 0.3046, + "step": 8143 + }, + { + "epoch": 2.0482897384305834, + "grad_norm": 0.31658607721328735, + "learning_rate": 2.764992365099054e-06, + "loss": 0.3138, + "step": 8144 + }, + { + "epoch": 2.0485412474849096, + "grad_norm": 0.31021907925605774, + "learning_rate": 2.7636835372524516e-06, + "loss": 0.3358, + "step": 8145 + }, + { + "epoch": 2.0487927565392354, + "grad_norm": 0.3204159736633301, + "learning_rate": 2.7623749009323626e-06, + "loss": 0.3216, + "step": 8146 + }, + { + "epoch": 2.0490442655935612, + "grad_norm": 0.2946203947067261, + "learning_rate": 2.761066456250866e-06, + "loss": 0.309, + "step": 8147 + }, + { + "epoch": 2.0492957746478875, + "grad_norm": 0.29937252402305603, + "learning_rate": 2.759758203320019e-06, + "loss": 0.326, + "step": 8148 + }, + { + "epoch": 2.0495472837022133, + "grad_norm": 0.30016854405403137, + "learning_rate": 2.7584501422518696e-06, + "loss": 0.3309, + "step": 8149 + }, + { + "epoch": 2.049798792756539, + "grad_norm": 0.3086334466934204, + "learning_rate": 2.75714227315844e-06, + "loss": 0.3039, + "step": 8150 + }, + { + "epoch": 2.0500503018108653, + "grad_norm": 0.30553144216537476, + "learning_rate": 2.7558345961517422e-06, + "loss": 0.3332, + "step": 8151 + }, + { + "epoch": 2.050301810865191, + "grad_norm": 0.32114413380622864, + "learning_rate": 2.754527111343775e-06, + "loss": 0.3279, + "step": 8152 + }, + { + "epoch": 2.050553319919517, + "grad_norm": 0.3155249357223511, + "learning_rate": 2.753219818846511e-06, + "loss": 0.3356, + "step": 8153 + }, + { + "epoch": 2.050804828973843, + "grad_norm": 0.30092206597328186, + "learning_rate": 2.751912718771915e-06, + "loss": 0.3133, + "step": 8154 + }, + { + "epoch": 2.051056338028169, + "grad_norm": 0.2950682044029236, + "learning_rate": 2.75060581123193e-06, + "loss": 0.3122, + "step": 8155 + }, + { + "epoch": 2.051307847082495, + "grad_norm": 0.3297138214111328, + "learning_rate": 2.749299096338486e-06, + "loss": 0.3259, + "step": 8156 + }, + { + "epoch": 2.051559356136821, + "grad_norm": 0.29814019799232483, + "learning_rate": 2.7479925742034926e-06, + "loss": 0.3146, + "step": 8157 + }, + { + "epoch": 2.051810865191147, + "grad_norm": 0.29181772470474243, + "learning_rate": 2.7466862449388483e-06, + "loss": 0.3268, + "step": 8158 + }, + { + "epoch": 2.0520623742454727, + "grad_norm": 0.31455108523368835, + "learning_rate": 2.7453801086564284e-06, + "loss": 0.3424, + "step": 8159 + }, + { + "epoch": 2.052313883299799, + "grad_norm": 0.2971799969673157, + "learning_rate": 2.7440741654680995e-06, + "loss": 0.3083, + "step": 8160 + }, + { + "epoch": 2.0525653923541247, + "grad_norm": 0.3010028898715973, + "learning_rate": 2.7427684154857036e-06, + "loss": 0.3197, + "step": 8161 + }, + { + "epoch": 2.0528169014084505, + "grad_norm": 0.30157679319381714, + "learning_rate": 2.7414628588210736e-06, + "loss": 0.3374, + "step": 8162 + }, + { + "epoch": 2.0530684104627768, + "grad_norm": 0.3262830078601837, + "learning_rate": 2.7401574955860177e-06, + "loss": 0.332, + "step": 8163 + }, + { + "epoch": 2.0533199195171026, + "grad_norm": 0.3156241178512573, + "learning_rate": 2.7388523258923373e-06, + "loss": 0.351, + "step": 8164 + }, + { + "epoch": 2.0535714285714284, + "grad_norm": 0.28493210673332214, + "learning_rate": 2.737547349851808e-06, + "loss": 0.3083, + "step": 8165 + }, + { + "epoch": 2.0538229376257546, + "grad_norm": 0.3349147140979767, + "learning_rate": 2.7362425675761955e-06, + "loss": 0.3097, + "step": 8166 + }, + { + "epoch": 2.0540744466800804, + "grad_norm": 0.32758259773254395, + "learning_rate": 2.7349379791772434e-06, + "loss": 0.3391, + "step": 8167 + }, + { + "epoch": 2.0543259557344062, + "grad_norm": 0.3020482361316681, + "learning_rate": 2.733633584766685e-06, + "loss": 0.3545, + "step": 8168 + }, + { + "epoch": 2.0545774647887325, + "grad_norm": 0.31516534090042114, + "learning_rate": 2.7323293844562305e-06, + "loss": 0.3087, + "step": 8169 + }, + { + "epoch": 2.0548289738430583, + "grad_norm": 0.2857208549976349, + "learning_rate": 2.731025378357579e-06, + "loss": 0.316, + "step": 8170 + }, + { + "epoch": 2.055080482897384, + "grad_norm": 0.3077753186225891, + "learning_rate": 2.729721566582407e-06, + "loss": 0.3417, + "step": 8171 + }, + { + "epoch": 2.0553319919517103, + "grad_norm": 0.31536969542503357, + "learning_rate": 2.7284179492423825e-06, + "loss": 0.3375, + "step": 8172 + }, + { + "epoch": 2.055583501006036, + "grad_norm": 0.3076031804084778, + "learning_rate": 2.7271145264491473e-06, + "loss": 0.3405, + "step": 8173 + }, + { + "epoch": 2.055835010060362, + "grad_norm": 0.30033233761787415, + "learning_rate": 2.725811298314336e-06, + "loss": 0.3091, + "step": 8174 + }, + { + "epoch": 2.056086519114688, + "grad_norm": 0.28484421968460083, + "learning_rate": 2.724508264949558e-06, + "loss": 0.3294, + "step": 8175 + }, + { + "epoch": 2.056338028169014, + "grad_norm": 0.29345014691352844, + "learning_rate": 2.723205426466413e-06, + "loss": 0.3031, + "step": 8176 + }, + { + "epoch": 2.0565895372233403, + "grad_norm": 0.3001652657985687, + "learning_rate": 2.7219027829764777e-06, + "loss": 0.3259, + "step": 8177 + }, + { + "epoch": 2.056841046277666, + "grad_norm": 0.29563117027282715, + "learning_rate": 2.72060033459132e-06, + "loss": 0.3285, + "step": 8178 + }, + { + "epoch": 2.057092555331992, + "grad_norm": 0.28922346234321594, + "learning_rate": 2.719298081422481e-06, + "loss": 0.323, + "step": 8179 + }, + { + "epoch": 2.057344064386318, + "grad_norm": 0.3013193905353546, + "learning_rate": 2.7179960235814963e-06, + "loss": 0.307, + "step": 8180 + }, + { + "epoch": 2.057595573440644, + "grad_norm": 0.301833838224411, + "learning_rate": 2.716694161179873e-06, + "loss": 0.3321, + "step": 8181 + }, + { + "epoch": 2.0578470824949697, + "grad_norm": 0.3148650825023651, + "learning_rate": 2.7153924943291125e-06, + "loss": 0.2919, + "step": 8182 + }, + { + "epoch": 2.058098591549296, + "grad_norm": 0.3130474090576172, + "learning_rate": 2.7140910231406915e-06, + "loss": 0.3154, + "step": 8183 + }, + { + "epoch": 2.058350100603622, + "grad_norm": 0.30896395444869995, + "learning_rate": 2.7127897477260723e-06, + "loss": 0.2989, + "step": 8184 + }, + { + "epoch": 2.0586016096579476, + "grad_norm": 0.3000273108482361, + "learning_rate": 2.711488668196706e-06, + "loss": 0.3157, + "step": 8185 + }, + { + "epoch": 2.058853118712274, + "grad_norm": 0.32448527216911316, + "learning_rate": 2.710187784664015e-06, + "loss": 0.3304, + "step": 8186 + }, + { + "epoch": 2.0591046277665996, + "grad_norm": 0.327384889125824, + "learning_rate": 2.708887097239418e-06, + "loss": 0.3002, + "step": 8187 + }, + { + "epoch": 2.0593561368209254, + "grad_norm": 0.340179443359375, + "learning_rate": 2.7075866060343057e-06, + "loss": 0.3246, + "step": 8188 + }, + { + "epoch": 2.0596076458752517, + "grad_norm": 0.35086414217948914, + "learning_rate": 2.706286311160061e-06, + "loss": 0.3246, + "step": 8189 + }, + { + "epoch": 2.0598591549295775, + "grad_norm": 0.2781917154788971, + "learning_rate": 2.704986212728043e-06, + "loss": 0.3393, + "step": 8190 + }, + { + "epoch": 2.0601106639839033, + "grad_norm": 0.3145841360092163, + "learning_rate": 2.7036863108495996e-06, + "loss": 0.2988, + "step": 8191 + }, + { + "epoch": 2.0603621730382295, + "grad_norm": 0.32019394636154175, + "learning_rate": 2.702386605636057e-06, + "loss": 0.3337, + "step": 8192 + }, + { + "epoch": 2.0606136820925554, + "grad_norm": 0.2975797951221466, + "learning_rate": 2.701087097198729e-06, + "loss": 0.296, + "step": 8193 + }, + { + "epoch": 2.060865191146881, + "grad_norm": 0.2773379385471344, + "learning_rate": 2.6997877856489073e-06, + "loss": 0.32, + "step": 8194 + }, + { + "epoch": 2.0611167002012074, + "grad_norm": 0.31435322761535645, + "learning_rate": 2.698488671097872e-06, + "loss": 0.3106, + "step": 8195 + }, + { + "epoch": 2.061368209255533, + "grad_norm": 0.32029151916503906, + "learning_rate": 2.6971897536568853e-06, + "loss": 0.3177, + "step": 8196 + }, + { + "epoch": 2.061619718309859, + "grad_norm": 0.3515484035015106, + "learning_rate": 2.695891033437188e-06, + "loss": 0.3305, + "step": 8197 + }, + { + "epoch": 2.0618712273641853, + "grad_norm": 0.29757028818130493, + "learning_rate": 2.6945925105500117e-06, + "loss": 0.3383, + "step": 8198 + }, + { + "epoch": 2.062122736418511, + "grad_norm": 0.32565537095069885, + "learning_rate": 2.693294185106562e-06, + "loss": 0.3371, + "step": 8199 + }, + { + "epoch": 2.062374245472837, + "grad_norm": 0.3038772642612457, + "learning_rate": 2.691996057218036e-06, + "loss": 0.3313, + "step": 8200 + }, + { + "epoch": 2.062625754527163, + "grad_norm": 0.3129657506942749, + "learning_rate": 2.6906981269956077e-06, + "loss": 0.3235, + "step": 8201 + }, + { + "epoch": 2.062877263581489, + "grad_norm": 0.3186679780483246, + "learning_rate": 2.6894003945504393e-06, + "loss": 0.3371, + "step": 8202 + }, + { + "epoch": 2.0631287726358147, + "grad_norm": 0.29854926466941833, + "learning_rate": 2.6881028599936705e-06, + "loss": 0.3208, + "step": 8203 + }, + { + "epoch": 2.063380281690141, + "grad_norm": 0.28703761100769043, + "learning_rate": 2.6868055234364304e-06, + "loss": 0.3137, + "step": 8204 + }, + { + "epoch": 2.063631790744467, + "grad_norm": 0.3135847747325897, + "learning_rate": 2.685508384989824e-06, + "loss": 0.3263, + "step": 8205 + }, + { + "epoch": 2.0638832997987926, + "grad_norm": 0.32722026109695435, + "learning_rate": 2.684211444764945e-06, + "loss": 0.3446, + "step": 8206 + }, + { + "epoch": 2.064134808853119, + "grad_norm": 0.2794181704521179, + "learning_rate": 2.6829147028728695e-06, + "loss": 0.3239, + "step": 8207 + }, + { + "epoch": 2.0643863179074446, + "grad_norm": 0.29036521911621094, + "learning_rate": 2.6816181594246534e-06, + "loss": 0.3363, + "step": 8208 + }, + { + "epoch": 2.0646378269617705, + "grad_norm": 0.29604461789131165, + "learning_rate": 2.6803218145313392e-06, + "loss": 0.2908, + "step": 8209 + }, + { + "epoch": 2.0648893360160967, + "grad_norm": 0.3000899851322174, + "learning_rate": 2.6790256683039485e-06, + "loss": 0.314, + "step": 8210 + }, + { + "epoch": 2.0651408450704225, + "grad_norm": 0.3037608563899994, + "learning_rate": 2.6777297208534903e-06, + "loss": 0.3712, + "step": 8211 + }, + { + "epoch": 2.0653923541247483, + "grad_norm": 0.30880311131477356, + "learning_rate": 2.6764339722909523e-06, + "loss": 0.3349, + "step": 8212 + }, + { + "epoch": 2.0656438631790746, + "grad_norm": 0.3168122470378876, + "learning_rate": 2.67513842272731e-06, + "loss": 0.307, + "step": 8213 + }, + { + "epoch": 2.0658953722334004, + "grad_norm": 0.29152563214302063, + "learning_rate": 2.6738430722735155e-06, + "loss": 0.3272, + "step": 8214 + }, + { + "epoch": 2.066146881287726, + "grad_norm": 0.29437023401260376, + "learning_rate": 2.6725479210405114e-06, + "loss": 0.3171, + "step": 8215 + }, + { + "epoch": 2.0663983903420524, + "grad_norm": 0.2880735397338867, + "learning_rate": 2.671252969139216e-06, + "loss": 0.3026, + "step": 8216 + }, + { + "epoch": 2.066649899396378, + "grad_norm": 0.3284808099269867, + "learning_rate": 2.669958216680535e-06, + "loss": 0.3173, + "step": 8217 + }, + { + "epoch": 2.066901408450704, + "grad_norm": 0.30216220021247864, + "learning_rate": 2.668663663775357e-06, + "loss": 0.3389, + "step": 8218 + }, + { + "epoch": 2.0671529175050303, + "grad_norm": 0.3077835738658905, + "learning_rate": 2.6673693105345506e-06, + "loss": 0.318, + "step": 8219 + }, + { + "epoch": 2.067404426559356, + "grad_norm": 0.3287010192871094, + "learning_rate": 2.6660751570689715e-06, + "loss": 0.3402, + "step": 8220 + }, + { + "epoch": 2.067655935613682, + "grad_norm": 0.28192028403282166, + "learning_rate": 2.6647812034894516e-06, + "loss": 0.301, + "step": 8221 + }, + { + "epoch": 2.067907444668008, + "grad_norm": 0.3122316598892212, + "learning_rate": 2.6634874499068154e-06, + "loss": 0.3272, + "step": 8222 + }, + { + "epoch": 2.068158953722334, + "grad_norm": 0.2841907739639282, + "learning_rate": 2.6621938964318593e-06, + "loss": 0.3069, + "step": 8223 + }, + { + "epoch": 2.0684104627766597, + "grad_norm": 0.2972943186759949, + "learning_rate": 2.6609005431753733e-06, + "loss": 0.3083, + "step": 8224 + }, + { + "epoch": 2.068661971830986, + "grad_norm": 0.29851382970809937, + "learning_rate": 2.65960739024812e-06, + "loss": 0.3166, + "step": 8225 + }, + { + "epoch": 2.068913480885312, + "grad_norm": 0.32675349712371826, + "learning_rate": 2.658314437760855e-06, + "loss": 0.3267, + "step": 8226 + }, + { + "epoch": 2.0691649899396376, + "grad_norm": 0.2977888584136963, + "learning_rate": 2.6570216858243057e-06, + "loss": 0.3502, + "step": 8227 + }, + { + "epoch": 2.069416498993964, + "grad_norm": 0.30917125940322876, + "learning_rate": 2.655729134549192e-06, + "loss": 0.3134, + "step": 8228 + }, + { + "epoch": 2.0696680080482897, + "grad_norm": 0.3009907603263855, + "learning_rate": 2.654436784046214e-06, + "loss": 0.3215, + "step": 8229 + }, + { + "epoch": 2.069919517102616, + "grad_norm": 0.29445791244506836, + "learning_rate": 2.6531446344260503e-06, + "loss": 0.3027, + "step": 8230 + }, + { + "epoch": 2.0701710261569417, + "grad_norm": 0.2821619212627411, + "learning_rate": 2.651852685799368e-06, + "loss": 0.2985, + "step": 8231 + }, + { + "epoch": 2.0704225352112675, + "grad_norm": 0.28280389308929443, + "learning_rate": 2.6505609382768117e-06, + "loss": 0.3142, + "step": 8232 + }, + { + "epoch": 2.0706740442655938, + "grad_norm": 0.3183631896972656, + "learning_rate": 2.649269391969015e-06, + "loss": 0.3409, + "step": 8233 + }, + { + "epoch": 2.0709255533199196, + "grad_norm": 0.30372148752212524, + "learning_rate": 2.6479780469865864e-06, + "loss": 0.3102, + "step": 8234 + }, + { + "epoch": 2.0711770623742454, + "grad_norm": 0.29154670238494873, + "learning_rate": 2.646686903440126e-06, + "loss": 0.3123, + "step": 8235 + }, + { + "epoch": 2.0714285714285716, + "grad_norm": 0.30079391598701477, + "learning_rate": 2.645395961440208e-06, + "loss": 0.3023, + "step": 8236 + }, + { + "epoch": 2.0716800804828974, + "grad_norm": 0.304799348115921, + "learning_rate": 2.6441052210973974e-06, + "loss": 0.3398, + "step": 8237 + }, + { + "epoch": 2.0719315895372232, + "grad_norm": 0.2910962998867035, + "learning_rate": 2.6428146825222344e-06, + "loss": 0.3074, + "step": 8238 + }, + { + "epoch": 2.0721830985915495, + "grad_norm": 0.28885748982429504, + "learning_rate": 2.641524345825248e-06, + "loss": 0.3215, + "step": 8239 + }, + { + "epoch": 2.0724346076458753, + "grad_norm": 0.28982120752334595, + "learning_rate": 2.6402342111169476e-06, + "loss": 0.324, + "step": 8240 + }, + { + "epoch": 2.072686116700201, + "grad_norm": 0.30088311433792114, + "learning_rate": 2.6389442785078227e-06, + "loss": 0.325, + "step": 8241 + }, + { + "epoch": 2.0729376257545273, + "grad_norm": 0.30647531151771545, + "learning_rate": 2.637654548108352e-06, + "loss": 0.3286, + "step": 8242 + }, + { + "epoch": 2.073189134808853, + "grad_norm": 0.3003426492214203, + "learning_rate": 2.636365020028988e-06, + "loss": 0.3363, + "step": 8243 + }, + { + "epoch": 2.073440643863179, + "grad_norm": 0.30914726853370667, + "learning_rate": 2.635075694380176e-06, + "loss": 0.3139, + "step": 8244 + }, + { + "epoch": 2.073692152917505, + "grad_norm": 0.3180095851421356, + "learning_rate": 2.633786571272333e-06, + "loss": 0.3075, + "step": 8245 + }, + { + "epoch": 2.073943661971831, + "grad_norm": 0.31122735142707825, + "learning_rate": 2.6324976508158697e-06, + "loss": 0.3222, + "step": 8246 + }, + { + "epoch": 2.074195171026157, + "grad_norm": 0.30536743998527527, + "learning_rate": 2.6312089331211693e-06, + "loss": 0.3163, + "step": 8247 + }, + { + "epoch": 2.074446680080483, + "grad_norm": 0.32808443903923035, + "learning_rate": 2.6299204182986072e-06, + "loss": 0.3442, + "step": 8248 + }, + { + "epoch": 2.074698189134809, + "grad_norm": 0.2878994941711426, + "learning_rate": 2.6286321064585315e-06, + "loss": 0.3282, + "step": 8249 + }, + { + "epoch": 2.0749496981891347, + "grad_norm": 0.303153395652771, + "learning_rate": 2.6273439977112803e-06, + "loss": 0.3128, + "step": 8250 + }, + { + "epoch": 2.075201207243461, + "grad_norm": 0.29553067684173584, + "learning_rate": 2.626056092167175e-06, + "loss": 0.3316, + "step": 8251 + }, + { + "epoch": 2.0754527162977867, + "grad_norm": 0.3366231918334961, + "learning_rate": 2.6247683899365117e-06, + "loss": 0.3172, + "step": 8252 + }, + { + "epoch": 2.0757042253521125, + "grad_norm": 0.33030417561531067, + "learning_rate": 2.623480891129579e-06, + "loss": 0.3245, + "step": 8253 + }, + { + "epoch": 2.0759557344064388, + "grad_norm": 0.30303242802619934, + "learning_rate": 2.622193595856638e-06, + "loss": 0.3321, + "step": 8254 + }, + { + "epoch": 2.0762072434607646, + "grad_norm": 0.29723554849624634, + "learning_rate": 2.6209065042279426e-06, + "loss": 0.3069, + "step": 8255 + }, + { + "epoch": 2.0764587525150904, + "grad_norm": 0.294047087430954, + "learning_rate": 2.619619616353719e-06, + "loss": 0.3519, + "step": 8256 + }, + { + "epoch": 2.0767102615694166, + "grad_norm": 0.31295087933540344, + "learning_rate": 2.618332932344185e-06, + "loss": 0.3249, + "step": 8257 + }, + { + "epoch": 2.0769617706237424, + "grad_norm": 0.3490563929080963, + "learning_rate": 2.617046452309535e-06, + "loss": 0.326, + "step": 8258 + }, + { + "epoch": 2.0772132796780682, + "grad_norm": 0.30121055245399475, + "learning_rate": 2.6157601763599504e-06, + "loss": 0.3128, + "step": 8259 + }, + { + "epoch": 2.0774647887323945, + "grad_norm": 0.29791775345802307, + "learning_rate": 2.614474104605589e-06, + "loss": 0.3224, + "step": 8260 + }, + { + "epoch": 2.0777162977867203, + "grad_norm": 0.3081105947494507, + "learning_rate": 2.613188237156596e-06, + "loss": 0.3093, + "step": 8261 + }, + { + "epoch": 2.077967806841046, + "grad_norm": 0.30400124192237854, + "learning_rate": 2.6119025741231007e-06, + "loss": 0.3172, + "step": 8262 + }, + { + "epoch": 2.0782193158953723, + "grad_norm": 0.3303559422492981, + "learning_rate": 2.610617115615208e-06, + "loss": 0.3163, + "step": 8263 + }, + { + "epoch": 2.078470824949698, + "grad_norm": 0.2974235415458679, + "learning_rate": 2.609331861743014e-06, + "loss": 0.3333, + "step": 8264 + }, + { + "epoch": 2.078722334004024, + "grad_norm": 0.32158637046813965, + "learning_rate": 2.608046812616588e-06, + "loss": 0.328, + "step": 8265 + }, + { + "epoch": 2.07897384305835, + "grad_norm": 0.3166216313838959, + "learning_rate": 2.6067619683459904e-06, + "loss": 0.317, + "step": 8266 + }, + { + "epoch": 2.079225352112676, + "grad_norm": 0.2718227803707123, + "learning_rate": 2.605477329041256e-06, + "loss": 0.3305, + "step": 8267 + }, + { + "epoch": 2.079476861167002, + "grad_norm": 0.33164918422698975, + "learning_rate": 2.6041928948124107e-06, + "loss": 0.322, + "step": 8268 + }, + { + "epoch": 2.079728370221328, + "grad_norm": 0.3085431158542633, + "learning_rate": 2.6029086657694537e-06, + "loss": 0.3276, + "step": 8269 + }, + { + "epoch": 2.079979879275654, + "grad_norm": 0.30075356364250183, + "learning_rate": 2.6016246420223744e-06, + "loss": 0.325, + "step": 8270 + }, + { + "epoch": 2.0802313883299797, + "grad_norm": 0.3089926540851593, + "learning_rate": 2.600340823681139e-06, + "loss": 0.3003, + "step": 8271 + }, + { + "epoch": 2.080482897384306, + "grad_norm": 0.3290434777736664, + "learning_rate": 2.5990572108557e-06, + "loss": 0.3306, + "step": 8272 + }, + { + "epoch": 2.0807344064386317, + "grad_norm": 0.3045661747455597, + "learning_rate": 2.597773803655993e-06, + "loss": 0.3234, + "step": 8273 + }, + { + "epoch": 2.080985915492958, + "grad_norm": 0.2929016053676605, + "learning_rate": 2.596490602191929e-06, + "loss": 0.312, + "step": 8274 + }, + { + "epoch": 2.0812374245472838, + "grad_norm": 0.2901984453201294, + "learning_rate": 2.59520760657341e-06, + "loss": 0.3147, + "step": 8275 + }, + { + "epoch": 2.0814889336016096, + "grad_norm": 0.33070576190948486, + "learning_rate": 2.5939248169103136e-06, + "loss": 0.3309, + "step": 8276 + }, + { + "epoch": 2.081740442655936, + "grad_norm": 0.3062666654586792, + "learning_rate": 2.5926422333125066e-06, + "loss": 0.3098, + "step": 8277 + }, + { + "epoch": 2.0819919517102616, + "grad_norm": 0.31025469303131104, + "learning_rate": 2.59135985588983e-06, + "loss": 0.3265, + "step": 8278 + }, + { + "epoch": 2.0822434607645874, + "grad_norm": 0.29651591181755066, + "learning_rate": 2.5900776847521148e-06, + "loss": 0.3186, + "step": 8279 + }, + { + "epoch": 2.0824949698189137, + "grad_norm": 0.31347307562828064, + "learning_rate": 2.588795720009168e-06, + "loss": 0.3182, + "step": 8280 + }, + { + "epoch": 2.0827464788732395, + "grad_norm": 0.3125113546848297, + "learning_rate": 2.587513961770785e-06, + "loss": 0.339, + "step": 8281 + }, + { + "epoch": 2.0829979879275653, + "grad_norm": 0.3104122281074524, + "learning_rate": 2.586232410146737e-06, + "loss": 0.3019, + "step": 8282 + }, + { + "epoch": 2.0832494969818915, + "grad_norm": 0.7815317511558533, + "learning_rate": 2.584951065246784e-06, + "loss": 0.3071, + "step": 8283 + }, + { + "epoch": 2.0835010060362174, + "grad_norm": 0.32731983065605164, + "learning_rate": 2.583669927180662e-06, + "loss": 0.3127, + "step": 8284 + }, + { + "epoch": 2.083752515090543, + "grad_norm": 0.3034411668777466, + "learning_rate": 2.5823889960580967e-06, + "loss": 0.3335, + "step": 8285 + }, + { + "epoch": 2.0840040241448694, + "grad_norm": 0.30236032605171204, + "learning_rate": 2.581108271988787e-06, + "loss": 0.3237, + "step": 8286 + }, + { + "epoch": 2.084255533199195, + "grad_norm": 0.3101537525653839, + "learning_rate": 2.5798277550824238e-06, + "loss": 0.2826, + "step": 8287 + }, + { + "epoch": 2.084507042253521, + "grad_norm": 0.2881515324115753, + "learning_rate": 2.5785474454486696e-06, + "loss": 0.3407, + "step": 8288 + }, + { + "epoch": 2.0847585513078473, + "grad_norm": 0.31731492280960083, + "learning_rate": 2.5772673431971805e-06, + "loss": 0.3017, + "step": 8289 + }, + { + "epoch": 2.085010060362173, + "grad_norm": 0.29667213559150696, + "learning_rate": 2.575987448437586e-06, + "loss": 0.3199, + "step": 8290 + }, + { + "epoch": 2.085261569416499, + "grad_norm": 0.29330283403396606, + "learning_rate": 2.574707761279503e-06, + "loss": 0.3388, + "step": 8291 + }, + { + "epoch": 2.085513078470825, + "grad_norm": 0.2838901877403259, + "learning_rate": 2.5734282818325256e-06, + "loss": 0.3097, + "step": 8292 + }, + { + "epoch": 2.085764587525151, + "grad_norm": 0.3035966753959656, + "learning_rate": 2.5721490102062373e-06, + "loss": 0.3277, + "step": 8293 + }, + { + "epoch": 2.0860160965794767, + "grad_norm": 0.32544568181037903, + "learning_rate": 2.570869946510196e-06, + "loss": 0.347, + "step": 8294 + }, + { + "epoch": 2.086267605633803, + "grad_norm": 0.30072399973869324, + "learning_rate": 2.5695910908539494e-06, + "loss": 0.3299, + "step": 8295 + }, + { + "epoch": 2.086519114688129, + "grad_norm": 0.3020775318145752, + "learning_rate": 2.568312443347019e-06, + "loss": 0.3178, + "step": 8296 + }, + { + "epoch": 2.0867706237424546, + "grad_norm": 0.3153866231441498, + "learning_rate": 2.567034004098917e-06, + "loss": 0.3386, + "step": 8297 + }, + { + "epoch": 2.087022132796781, + "grad_norm": 0.29892367124557495, + "learning_rate": 2.565755773219131e-06, + "loss": 0.3268, + "step": 8298 + }, + { + "epoch": 2.0872736418511066, + "grad_norm": 0.30764761567115784, + "learning_rate": 2.564477750817135e-06, + "loss": 0.3481, + "step": 8299 + }, + { + "epoch": 2.0875251509054324, + "grad_norm": 0.2927669882774353, + "learning_rate": 2.563199937002382e-06, + "loss": 0.3064, + "step": 8300 + }, + { + "epoch": 2.0877766599597587, + "grad_norm": 0.30873027443885803, + "learning_rate": 2.561922331884311e-06, + "loss": 0.3375, + "step": 8301 + }, + { + "epoch": 2.0880281690140845, + "grad_norm": 0.30623045563697815, + "learning_rate": 2.560644935572338e-06, + "loss": 0.3257, + "step": 8302 + }, + { + "epoch": 2.0882796780684103, + "grad_norm": 0.29799333214759827, + "learning_rate": 2.559367748175867e-06, + "loss": 0.3113, + "step": 8303 + }, + { + "epoch": 2.0885311871227366, + "grad_norm": 0.31221136450767517, + "learning_rate": 2.5580907698042802e-06, + "loss": 0.3235, + "step": 8304 + }, + { + "epoch": 2.0887826961770624, + "grad_norm": 0.30643758177757263, + "learning_rate": 2.5568140005669414e-06, + "loss": 0.3348, + "step": 8305 + }, + { + "epoch": 2.089034205231388, + "grad_norm": 0.3108401894569397, + "learning_rate": 2.5555374405732e-06, + "loss": 0.3332, + "step": 8306 + }, + { + "epoch": 2.0892857142857144, + "grad_norm": 0.31820324063301086, + "learning_rate": 2.5542610899323826e-06, + "loss": 0.3098, + "step": 8307 + }, + { + "epoch": 2.08953722334004, + "grad_norm": 0.30521127581596375, + "learning_rate": 2.552984948753805e-06, + "loss": 0.3333, + "step": 8308 + }, + { + "epoch": 2.089788732394366, + "grad_norm": 0.304718941450119, + "learning_rate": 2.5517090171467557e-06, + "loss": 0.3161, + "step": 8309 + }, + { + "epoch": 2.0900402414486923, + "grad_norm": 0.33663034439086914, + "learning_rate": 2.550433295220515e-06, + "loss": 0.3282, + "step": 8310 + }, + { + "epoch": 2.090291750503018, + "grad_norm": 0.3158341646194458, + "learning_rate": 2.549157783084335e-06, + "loss": 0.3157, + "step": 8311 + }, + { + "epoch": 2.090543259557344, + "grad_norm": 0.3130324184894562, + "learning_rate": 2.5478824808474613e-06, + "loss": 0.3192, + "step": 8312 + }, + { + "epoch": 2.09079476861167, + "grad_norm": 0.3025255799293518, + "learning_rate": 2.546607388619111e-06, + "loss": 0.3188, + "step": 8313 + }, + { + "epoch": 2.091046277665996, + "grad_norm": 0.3400377333164215, + "learning_rate": 2.5453325065084887e-06, + "loss": 0.3063, + "step": 8314 + }, + { + "epoch": 2.0912977867203217, + "grad_norm": 0.300741046667099, + "learning_rate": 2.5440578346247834e-06, + "loss": 0.3058, + "step": 8315 + }, + { + "epoch": 2.091549295774648, + "grad_norm": 0.31331273913383484, + "learning_rate": 2.5427833730771577e-06, + "loss": 0.3098, + "step": 8316 + }, + { + "epoch": 2.091800804828974, + "grad_norm": 0.2970178425312042, + "learning_rate": 2.541509121974766e-06, + "loss": 0.3035, + "step": 8317 + }, + { + "epoch": 2.0920523138832996, + "grad_norm": 0.31109151244163513, + "learning_rate": 2.5402350814267364e-06, + "loss": 0.3272, + "step": 8318 + }, + { + "epoch": 2.092303822937626, + "grad_norm": 0.29535481333732605, + "learning_rate": 2.538961251542185e-06, + "loss": 0.3169, + "step": 8319 + }, + { + "epoch": 2.0925553319919517, + "grad_norm": 0.30848875641822815, + "learning_rate": 2.5376876324302045e-06, + "loss": 0.3327, + "step": 8320 + }, + { + "epoch": 2.0928068410462775, + "grad_norm": 0.28316324949264526, + "learning_rate": 2.5364142241998755e-06, + "loss": 0.3399, + "step": 8321 + }, + { + "epoch": 2.0930583501006037, + "grad_norm": 0.3024502992630005, + "learning_rate": 2.535141026960255e-06, + "loss": 0.3063, + "step": 8322 + }, + { + "epoch": 2.0933098591549295, + "grad_norm": 0.33002567291259766, + "learning_rate": 2.5338680408203875e-06, + "loss": 0.3255, + "step": 8323 + }, + { + "epoch": 2.0935613682092553, + "grad_norm": 0.2928857207298279, + "learning_rate": 2.5325952658892916e-06, + "loss": 0.3213, + "step": 8324 + }, + { + "epoch": 2.0938128772635816, + "grad_norm": 0.30271077156066895, + "learning_rate": 2.531322702275976e-06, + "loss": 0.3324, + "step": 8325 + }, + { + "epoch": 2.0940643863179074, + "grad_norm": 0.2813225984573364, + "learning_rate": 2.530050350089428e-06, + "loss": 0.3231, + "step": 8326 + }, + { + "epoch": 2.094315895372233, + "grad_norm": 0.3095519542694092, + "learning_rate": 2.528778209438614e-06, + "loss": 0.3165, + "step": 8327 + }, + { + "epoch": 2.0945674044265594, + "grad_norm": 0.30373334884643555, + "learning_rate": 2.527506280432488e-06, + "loss": 0.3349, + "step": 8328 + }, + { + "epoch": 2.0948189134808852, + "grad_norm": 0.30294981598854065, + "learning_rate": 2.5262345631799794e-06, + "loss": 0.3178, + "step": 8329 + }, + { + "epoch": 2.0950704225352115, + "grad_norm": 0.3043777644634247, + "learning_rate": 2.524963057790007e-06, + "loss": 0.3115, + "step": 8330 + }, + { + "epoch": 2.0953219315895373, + "grad_norm": 0.288493275642395, + "learning_rate": 2.5236917643714628e-06, + "loss": 0.3383, + "step": 8331 + }, + { + "epoch": 2.095573440643863, + "grad_norm": 0.29598721861839294, + "learning_rate": 2.5224206830332286e-06, + "loss": 0.3202, + "step": 8332 + }, + { + "epoch": 2.0958249496981893, + "grad_norm": 0.299293577671051, + "learning_rate": 2.521149813884162e-06, + "loss": 0.3185, + "step": 8333 + }, + { + "epoch": 2.096076458752515, + "grad_norm": 0.2833903729915619, + "learning_rate": 2.5198791570331083e-06, + "loss": 0.3333, + "step": 8334 + }, + { + "epoch": 2.096327967806841, + "grad_norm": 0.33051058650016785, + "learning_rate": 2.5186087125888863e-06, + "loss": 0.3338, + "step": 8335 + }, + { + "epoch": 2.096579476861167, + "grad_norm": 0.293345183134079, + "learning_rate": 2.5173384806603052e-06, + "loss": 0.3244, + "step": 8336 + }, + { + "epoch": 2.096830985915493, + "grad_norm": 0.2857302725315094, + "learning_rate": 2.516068461356154e-06, + "loss": 0.2933, + "step": 8337 + }, + { + "epoch": 2.097082494969819, + "grad_norm": 0.3116956949234009, + "learning_rate": 2.514798654785197e-06, + "loss": 0.315, + "step": 8338 + }, + { + "epoch": 2.097334004024145, + "grad_norm": 0.278083473443985, + "learning_rate": 2.51352906105619e-06, + "loss": 0.3115, + "step": 8339 + }, + { + "epoch": 2.097585513078471, + "grad_norm": 0.30503949522972107, + "learning_rate": 2.512259680277862e-06, + "loss": 0.3144, + "step": 8340 + }, + { + "epoch": 2.0978370221327967, + "grad_norm": 0.3300677537918091, + "learning_rate": 2.510990512558931e-06, + "loss": 0.3183, + "step": 8341 + }, + { + "epoch": 2.098088531187123, + "grad_norm": 0.3177797496318817, + "learning_rate": 2.509721558008089e-06, + "loss": 0.3577, + "step": 8342 + }, + { + "epoch": 2.0983400402414487, + "grad_norm": 0.3060501515865326, + "learning_rate": 2.508452816734019e-06, + "loss": 0.3328, + "step": 8343 + }, + { + "epoch": 2.0985915492957745, + "grad_norm": 0.3117055594921112, + "learning_rate": 2.507184288845376e-06, + "loss": 0.314, + "step": 8344 + }, + { + "epoch": 2.0988430583501008, + "grad_norm": 0.31024450063705444, + "learning_rate": 2.5059159744508055e-06, + "loss": 0.2982, + "step": 8345 + }, + { + "epoch": 2.0990945674044266, + "grad_norm": 0.33131489157676697, + "learning_rate": 2.5046478736589264e-06, + "loss": 0.3093, + "step": 8346 + }, + { + "epoch": 2.0993460764587524, + "grad_norm": 0.3035459816455841, + "learning_rate": 2.503379986578347e-06, + "loss": 0.321, + "step": 8347 + }, + { + "epoch": 2.0995975855130786, + "grad_norm": 0.3351083993911743, + "learning_rate": 2.502112313317654e-06, + "loss": 0.3098, + "step": 8348 + }, + { + "epoch": 2.0998490945674044, + "grad_norm": 0.3084772825241089, + "learning_rate": 2.5008448539854134e-06, + "loss": 0.3327, + "step": 8349 + }, + { + "epoch": 2.1001006036217302, + "grad_norm": 0.3130403459072113, + "learning_rate": 2.499577608690178e-06, + "loss": 0.3342, + "step": 8350 + }, + { + "epoch": 2.1003521126760565, + "grad_norm": 0.32867324352264404, + "learning_rate": 2.498310577540476e-06, + "loss": 0.3216, + "step": 8351 + }, + { + "epoch": 2.1006036217303823, + "grad_norm": 0.3281696140766144, + "learning_rate": 2.4970437606448245e-06, + "loss": 0.3087, + "step": 8352 + }, + { + "epoch": 2.100855130784708, + "grad_norm": 0.3127487897872925, + "learning_rate": 2.495777158111714e-06, + "loss": 0.3194, + "step": 8353 + }, + { + "epoch": 2.1011066398390343, + "grad_norm": 0.29324328899383545, + "learning_rate": 2.4945107700496263e-06, + "loss": 0.3323, + "step": 8354 + }, + { + "epoch": 2.10135814889336, + "grad_norm": 0.27764442563056946, + "learning_rate": 2.4932445965670145e-06, + "loss": 0.2898, + "step": 8355 + }, + { + "epoch": 2.101609657947686, + "grad_norm": 0.3088850677013397, + "learning_rate": 2.4919786377723225e-06, + "loss": 0.3096, + "step": 8356 + }, + { + "epoch": 2.101861167002012, + "grad_norm": 0.29290542006492615, + "learning_rate": 2.490712893773968e-06, + "loss": 0.3038, + "step": 8357 + }, + { + "epoch": 2.102112676056338, + "grad_norm": 0.30638036131858826, + "learning_rate": 2.489447364680357e-06, + "loss": 0.3554, + "step": 8358 + }, + { + "epoch": 2.102364185110664, + "grad_norm": 0.3109561800956726, + "learning_rate": 2.4881820505998743e-06, + "loss": 0.319, + "step": 8359 + }, + { + "epoch": 2.10261569416499, + "grad_norm": 0.2938198745250702, + "learning_rate": 2.486916951640884e-06, + "loss": 0.319, + "step": 8360 + }, + { + "epoch": 2.102867203219316, + "grad_norm": 0.32792484760284424, + "learning_rate": 2.4856520679117357e-06, + "loss": 0.3347, + "step": 8361 + }, + { + "epoch": 2.1031187122736417, + "grad_norm": 0.3298247158527374, + "learning_rate": 2.4843873995207567e-06, + "loss": 0.3105, + "step": 8362 + }, + { + "epoch": 2.103370221327968, + "grad_norm": 0.32708612084388733, + "learning_rate": 2.483122946576262e-06, + "loss": 0.3043, + "step": 8363 + }, + { + "epoch": 2.1036217303822937, + "grad_norm": 0.31453192234039307, + "learning_rate": 2.4818587091865386e-06, + "loss": 0.3036, + "step": 8364 + }, + { + "epoch": 2.1038732394366195, + "grad_norm": 0.2795354425907135, + "learning_rate": 2.480594687459865e-06, + "loss": 0.3167, + "step": 8365 + }, + { + "epoch": 2.1041247484909458, + "grad_norm": 0.29401612281799316, + "learning_rate": 2.4793308815044943e-06, + "loss": 0.3266, + "step": 8366 + }, + { + "epoch": 2.1043762575452716, + "grad_norm": 0.3023243844509125, + "learning_rate": 2.4780672914286652e-06, + "loss": 0.322, + "step": 8367 + }, + { + "epoch": 2.1046277665995974, + "grad_norm": 0.3229201138019562, + "learning_rate": 2.476803917340594e-06, + "loss": 0.3315, + "step": 8368 + }, + { + "epoch": 2.1048792756539236, + "grad_norm": 0.3104352355003357, + "learning_rate": 2.475540759348482e-06, + "loss": 0.3132, + "step": 8369 + }, + { + "epoch": 2.1051307847082494, + "grad_norm": 0.30430683493614197, + "learning_rate": 2.474277817560513e-06, + "loss": 0.3303, + "step": 8370 + }, + { + "epoch": 2.1053822937625752, + "grad_norm": 0.296283483505249, + "learning_rate": 2.473015092084846e-06, + "loss": 0.3168, + "step": 8371 + }, + { + "epoch": 2.1056338028169015, + "grad_norm": 0.31060439348220825, + "learning_rate": 2.4717525830296295e-06, + "loss": 0.3165, + "step": 8372 + }, + { + "epoch": 2.1058853118712273, + "grad_norm": 0.3210805654525757, + "learning_rate": 2.470490290502986e-06, + "loss": 0.3222, + "step": 8373 + }, + { + "epoch": 2.1061368209255535, + "grad_norm": 0.3033636212348938, + "learning_rate": 2.4692282146130266e-06, + "loss": 0.3395, + "step": 8374 + }, + { + "epoch": 2.1063883299798793, + "grad_norm": 0.3115089535713196, + "learning_rate": 2.4679663554678357e-06, + "loss": 0.3245, + "step": 8375 + }, + { + "epoch": 2.106639839034205, + "grad_norm": 0.319614976644516, + "learning_rate": 2.4667047131754884e-06, + "loss": 0.3062, + "step": 8376 + }, + { + "epoch": 2.1068913480885314, + "grad_norm": 0.30483153462409973, + "learning_rate": 2.465443287844032e-06, + "loss": 0.3157, + "step": 8377 + }, + { + "epoch": 2.107142857142857, + "grad_norm": 0.30842599272727966, + "learning_rate": 2.464182079581504e-06, + "loss": 0.3497, + "step": 8378 + }, + { + "epoch": 2.107394366197183, + "grad_norm": 0.30280083417892456, + "learning_rate": 2.462921088495915e-06, + "loss": 0.3356, + "step": 8379 + }, + { + "epoch": 2.1076458752515093, + "grad_norm": 0.2851159870624542, + "learning_rate": 2.4616603146952628e-06, + "loss": 0.3035, + "step": 8380 + }, + { + "epoch": 2.107897384305835, + "grad_norm": 0.3186687231063843, + "learning_rate": 2.4603997582875266e-06, + "loss": 0.322, + "step": 8381 + }, + { + "epoch": 2.108148893360161, + "grad_norm": 0.2962452471256256, + "learning_rate": 2.4591394193806615e-06, + "loss": 0.3278, + "step": 8382 + }, + { + "epoch": 2.108400402414487, + "grad_norm": 0.29877567291259766, + "learning_rate": 2.4578792980826114e-06, + "loss": 0.3511, + "step": 8383 + }, + { + "epoch": 2.108651911468813, + "grad_norm": 0.2987919747829437, + "learning_rate": 2.4566193945012946e-06, + "loss": 0.3165, + "step": 8384 + }, + { + "epoch": 2.1089034205231387, + "grad_norm": 0.28243282437324524, + "learning_rate": 2.4553597087446163e-06, + "loss": 0.3505, + "step": 8385 + }, + { + "epoch": 2.109154929577465, + "grad_norm": 0.31294065713882446, + "learning_rate": 2.4541002409204584e-06, + "loss": 0.3179, + "step": 8386 + }, + { + "epoch": 2.109406438631791, + "grad_norm": 0.2762846052646637, + "learning_rate": 2.4528409911366897e-06, + "loss": 0.3258, + "step": 8387 + }, + { + "epoch": 2.1096579476861166, + "grad_norm": 0.28698083758354187, + "learning_rate": 2.4515819595011532e-06, + "loss": 0.3219, + "step": 8388 + }, + { + "epoch": 2.109909456740443, + "grad_norm": 0.32928720116615295, + "learning_rate": 2.450323146121681e-06, + "loss": 0.3351, + "step": 8389 + }, + { + "epoch": 2.1101609657947686, + "grad_norm": 0.30943191051483154, + "learning_rate": 2.4490645511060784e-06, + "loss": 0.3292, + "step": 8390 + }, + { + "epoch": 2.1104124748490944, + "grad_norm": 0.2983676493167877, + "learning_rate": 2.4478061745621383e-06, + "loss": 0.3354, + "step": 8391 + }, + { + "epoch": 2.1106639839034207, + "grad_norm": 0.29229018092155457, + "learning_rate": 2.446548016597635e-06, + "loss": 0.3172, + "step": 8392 + }, + { + "epoch": 2.1109154929577465, + "grad_norm": 0.30713313817977905, + "learning_rate": 2.4452900773203182e-06, + "loss": 0.3452, + "step": 8393 + }, + { + "epoch": 2.1111670020120723, + "grad_norm": 0.3006666600704193, + "learning_rate": 2.4440323568379255e-06, + "loss": 0.2936, + "step": 8394 + }, + { + "epoch": 2.1114185110663986, + "grad_norm": 0.30126234889030457, + "learning_rate": 2.4427748552581694e-06, + "loss": 0.3048, + "step": 8395 + }, + { + "epoch": 2.1116700201207244, + "grad_norm": 0.2818053960800171, + "learning_rate": 2.4415175726887513e-06, + "loss": 0.3103, + "step": 8396 + }, + { + "epoch": 2.11192152917505, + "grad_norm": 0.2906033992767334, + "learning_rate": 2.440260509237345e-06, + "loss": 0.3261, + "step": 8397 + }, + { + "epoch": 2.1121730382293764, + "grad_norm": 0.28882506489753723, + "learning_rate": 2.4390036650116144e-06, + "loss": 0.3137, + "step": 8398 + }, + { + "epoch": 2.112424547283702, + "grad_norm": 0.31607574224472046, + "learning_rate": 2.4377470401191965e-06, + "loss": 0.3077, + "step": 8399 + }, + { + "epoch": 2.112676056338028, + "grad_norm": 0.3000749945640564, + "learning_rate": 2.436490634667717e-06, + "loss": 0.3225, + "step": 8400 + }, + { + "epoch": 2.1129275653923543, + "grad_norm": 0.29554128646850586, + "learning_rate": 2.4352344487647755e-06, + "loss": 0.3399, + "step": 8401 + }, + { + "epoch": 2.11317907444668, + "grad_norm": 0.308362752199173, + "learning_rate": 2.4339784825179606e-06, + "loss": 0.3279, + "step": 8402 + }, + { + "epoch": 2.113430583501006, + "grad_norm": 0.28427183628082275, + "learning_rate": 2.4327227360348333e-06, + "loss": 0.3256, + "step": 8403 + }, + { + "epoch": 2.113682092555332, + "grad_norm": 0.3028574287891388, + "learning_rate": 2.431467209422945e-06, + "loss": 0.3003, + "step": 8404 + }, + { + "epoch": 2.113933601609658, + "grad_norm": 0.32093802094459534, + "learning_rate": 2.4302119027898195e-06, + "loss": 0.3018, + "step": 8405 + }, + { + "epoch": 2.1141851106639837, + "grad_norm": 0.3018190562725067, + "learning_rate": 2.42895681624297e-06, + "loss": 0.3397, + "step": 8406 + }, + { + "epoch": 2.11443661971831, + "grad_norm": 0.30792275071144104, + "learning_rate": 2.427701949889883e-06, + "loss": 0.3254, + "step": 8407 + }, + { + "epoch": 2.114688128772636, + "grad_norm": 0.2988547384738922, + "learning_rate": 2.426447303838033e-06, + "loss": 0.3281, + "step": 8408 + }, + { + "epoch": 2.1149396378269616, + "grad_norm": 0.2943371832370758, + "learning_rate": 2.4251928781948704e-06, + "loss": 0.3151, + "step": 8409 + }, + { + "epoch": 2.115191146881288, + "grad_norm": 0.31441688537597656, + "learning_rate": 2.423938673067831e-06, + "loss": 0.3286, + "step": 8410 + }, + { + "epoch": 2.1154426559356136, + "grad_norm": 0.31938257813453674, + "learning_rate": 2.422684688564327e-06, + "loss": 0.3401, + "step": 8411 + }, + { + "epoch": 2.1156941649899395, + "grad_norm": 0.29643604159355164, + "learning_rate": 2.4214309247917558e-06, + "loss": 0.3549, + "step": 8412 + }, + { + "epoch": 2.1159456740442657, + "grad_norm": 0.30069106817245483, + "learning_rate": 2.4201773818574956e-06, + "loss": 0.3058, + "step": 8413 + }, + { + "epoch": 2.1161971830985915, + "grad_norm": 0.2893010079860687, + "learning_rate": 2.4189240598689025e-06, + "loss": 0.3402, + "step": 8414 + }, + { + "epoch": 2.1164486921529173, + "grad_norm": 0.30323195457458496, + "learning_rate": 2.4176709589333173e-06, + "loss": 0.3333, + "step": 8415 + }, + { + "epoch": 2.1167002012072436, + "grad_norm": 0.29671111702919006, + "learning_rate": 2.4164180791580584e-06, + "loss": 0.3183, + "step": 8416 + }, + { + "epoch": 2.1169517102615694, + "grad_norm": 0.2824574112892151, + "learning_rate": 2.41516542065043e-06, + "loss": 0.3322, + "step": 8417 + }, + { + "epoch": 2.117203219315895, + "grad_norm": 0.30182933807373047, + "learning_rate": 2.4139129835177104e-06, + "loss": 0.313, + "step": 8418 + }, + { + "epoch": 2.1174547283702214, + "grad_norm": 0.325040727853775, + "learning_rate": 2.4126607678671672e-06, + "loss": 0.3391, + "step": 8419 + }, + { + "epoch": 2.1177062374245472, + "grad_norm": 0.3018413484096527, + "learning_rate": 2.411408773806041e-06, + "loss": 0.316, + "step": 8420 + }, + { + "epoch": 2.117957746478873, + "grad_norm": 0.29537421464920044, + "learning_rate": 2.410157001441561e-06, + "loss": 0.3042, + "step": 8421 + }, + { + "epoch": 2.1182092555331993, + "grad_norm": 0.2987903952598572, + "learning_rate": 2.40890545088093e-06, + "loss": 0.3171, + "step": 8422 + }, + { + "epoch": 2.118460764587525, + "grad_norm": 0.30431345105171204, + "learning_rate": 2.407654122231339e-06, + "loss": 0.3215, + "step": 8423 + }, + { + "epoch": 2.118712273641851, + "grad_norm": 0.277266263961792, + "learning_rate": 2.4064030155999534e-06, + "loss": 0.3201, + "step": 8424 + }, + { + "epoch": 2.118963782696177, + "grad_norm": 0.3193032443523407, + "learning_rate": 2.4051521310939258e-06, + "loss": 0.3517, + "step": 8425 + }, + { + "epoch": 2.119215291750503, + "grad_norm": 0.31553915143013, + "learning_rate": 2.4039014688203825e-06, + "loss": 0.3249, + "step": 8426 + }, + { + "epoch": 2.119466800804829, + "grad_norm": 0.2827959656715393, + "learning_rate": 2.4026510288864396e-06, + "loss": 0.3281, + "step": 8427 + }, + { + "epoch": 2.119718309859155, + "grad_norm": 0.28876793384552, + "learning_rate": 2.4014008113991855e-06, + "loss": 0.3331, + "step": 8428 + }, + { + "epoch": 2.119969818913481, + "grad_norm": 0.2849387526512146, + "learning_rate": 2.400150816465696e-06, + "loss": 0.3234, + "step": 8429 + }, + { + "epoch": 2.120221327967807, + "grad_norm": 0.3101980984210968, + "learning_rate": 2.398901044193023e-06, + "loss": 0.34, + "step": 8430 + }, + { + "epoch": 2.120472837022133, + "grad_norm": 0.2996865212917328, + "learning_rate": 2.3976514946882057e-06, + "loss": 0.3353, + "step": 8431 + }, + { + "epoch": 2.1207243460764587, + "grad_norm": 0.2950674593448639, + "learning_rate": 2.396402168058255e-06, + "loss": 0.3319, + "step": 8432 + }, + { + "epoch": 2.120975855130785, + "grad_norm": 0.28375566005706787, + "learning_rate": 2.395153064410171e-06, + "loss": 0.34, + "step": 8433 + }, + { + "epoch": 2.1212273641851107, + "grad_norm": 0.33272629976272583, + "learning_rate": 2.3939041838509324e-06, + "loss": 0.3399, + "step": 8434 + }, + { + "epoch": 2.1214788732394365, + "grad_norm": 0.28813275694847107, + "learning_rate": 2.3926555264874956e-06, + "loss": 0.3091, + "step": 8435 + }, + { + "epoch": 2.1217303822937628, + "grad_norm": 0.3202812075614929, + "learning_rate": 2.391407092426803e-06, + "loss": 0.3143, + "step": 8436 + }, + { + "epoch": 2.1219818913480886, + "grad_norm": 0.2992876470088959, + "learning_rate": 2.390158881775772e-06, + "loss": 0.3201, + "step": 8437 + }, + { + "epoch": 2.1222334004024144, + "grad_norm": 0.3007931113243103, + "learning_rate": 2.388910894641307e-06, + "loss": 0.322, + "step": 8438 + }, + { + "epoch": 2.1224849094567406, + "grad_norm": 0.2985740303993225, + "learning_rate": 2.387663131130288e-06, + "loss": 0.324, + "step": 8439 + }, + { + "epoch": 2.1227364185110664, + "grad_norm": 0.29932522773742676, + "learning_rate": 2.3864155913495803e-06, + "loss": 0.3135, + "step": 8440 + }, + { + "epoch": 2.1229879275653922, + "grad_norm": 0.3008473813533783, + "learning_rate": 2.3851682754060247e-06, + "loss": 0.3353, + "step": 8441 + }, + { + "epoch": 2.1232394366197185, + "grad_norm": 0.3180418610572815, + "learning_rate": 2.3839211834064496e-06, + "loss": 0.2981, + "step": 8442 + }, + { + "epoch": 2.1234909456740443, + "grad_norm": 0.29798197746276855, + "learning_rate": 2.3826743154576576e-06, + "loss": 0.333, + "step": 8443 + }, + { + "epoch": 2.12374245472837, + "grad_norm": 0.29840409755706787, + "learning_rate": 2.3814276716664365e-06, + "loss": 0.3147, + "step": 8444 + }, + { + "epoch": 2.1239939637826963, + "grad_norm": 0.30087023973464966, + "learning_rate": 2.3801812521395557e-06, + "loss": 0.3251, + "step": 8445 + }, + { + "epoch": 2.124245472837022, + "grad_norm": 0.27059686183929443, + "learning_rate": 2.3789350569837588e-06, + "loss": 0.3134, + "step": 8446 + }, + { + "epoch": 2.124496981891348, + "grad_norm": 0.2948879599571228, + "learning_rate": 2.377689086305779e-06, + "loss": 0.3309, + "step": 8447 + }, + { + "epoch": 2.124748490945674, + "grad_norm": 0.30460086464881897, + "learning_rate": 2.3764433402123223e-06, + "loss": 0.3201, + "step": 8448 + }, + { + "epoch": 2.125, + "grad_norm": 0.2858458161354065, + "learning_rate": 2.3751978188100816e-06, + "loss": 0.3085, + "step": 8449 + }, + { + "epoch": 2.125251509054326, + "grad_norm": 0.2818026542663574, + "learning_rate": 2.3739525222057257e-06, + "loss": 0.3384, + "step": 8450 + }, + { + "epoch": 2.125503018108652, + "grad_norm": 0.2949872612953186, + "learning_rate": 2.37270745050591e-06, + "loss": 0.3338, + "step": 8451 + }, + { + "epoch": 2.125754527162978, + "grad_norm": 0.33432409167289734, + "learning_rate": 2.3714626038172623e-06, + "loss": 0.3343, + "step": 8452 + }, + { + "epoch": 2.1260060362173037, + "grad_norm": 0.32601264119148254, + "learning_rate": 2.3702179822464006e-06, + "loss": 0.3158, + "step": 8453 + }, + { + "epoch": 2.12625754527163, + "grad_norm": 0.31106406450271606, + "learning_rate": 2.3689735858999152e-06, + "loss": 0.3377, + "step": 8454 + }, + { + "epoch": 2.1265090543259557, + "grad_norm": 0.3100818991661072, + "learning_rate": 2.367729414884383e-06, + "loss": 0.31, + "step": 8455 + }, + { + "epoch": 2.1267605633802815, + "grad_norm": 0.29893726110458374, + "learning_rate": 2.3664854693063598e-06, + "loss": 0.3122, + "step": 8456 + }, + { + "epoch": 2.1270120724346078, + "grad_norm": 0.2957155108451843, + "learning_rate": 2.3652417492723795e-06, + "loss": 0.3108, + "step": 8457 + }, + { + "epoch": 2.1272635814889336, + "grad_norm": 0.3101329803466797, + "learning_rate": 2.3639982548889623e-06, + "loss": 0.3182, + "step": 8458 + }, + { + "epoch": 2.1275150905432594, + "grad_norm": 0.29328593611717224, + "learning_rate": 2.3627549862626014e-06, + "loss": 0.324, + "step": 8459 + }, + { + "epoch": 2.1277665995975856, + "grad_norm": 0.30886930227279663, + "learning_rate": 2.3615119434997803e-06, + "loss": 0.312, + "step": 8460 + }, + { + "epoch": 2.1280181086519114, + "grad_norm": 0.339282751083374, + "learning_rate": 2.360269126706952e-06, + "loss": 0.3421, + "step": 8461 + }, + { + "epoch": 2.1282696177062372, + "grad_norm": 0.3135293126106262, + "learning_rate": 2.359026535990561e-06, + "loss": 0.2935, + "step": 8462 + }, + { + "epoch": 2.1285211267605635, + "grad_norm": 0.3130805492401123, + "learning_rate": 2.357784171457024e-06, + "loss": 0.3256, + "step": 8463 + }, + { + "epoch": 2.1287726358148893, + "grad_norm": 0.3069337010383606, + "learning_rate": 2.3565420332127447e-06, + "loss": 0.3241, + "step": 8464 + }, + { + "epoch": 2.129024144869215, + "grad_norm": 0.3093840181827545, + "learning_rate": 2.355300121364101e-06, + "loss": 0.31, + "step": 8465 + }, + { + "epoch": 2.1292756539235413, + "grad_norm": 0.2900824546813965, + "learning_rate": 2.354058436017456e-06, + "loss": 0.355, + "step": 8466 + }, + { + "epoch": 2.129527162977867, + "grad_norm": 0.3184683322906494, + "learning_rate": 2.352816977279156e-06, + "loss": 0.3043, + "step": 8467 + }, + { + "epoch": 2.129778672032193, + "grad_norm": 0.29197773337364197, + "learning_rate": 2.351575745255519e-06, + "loss": 0.3052, + "step": 8468 + }, + { + "epoch": 2.130030181086519, + "grad_norm": 0.2952864468097687, + "learning_rate": 2.350334740052852e-06, + "loss": 0.319, + "step": 8469 + }, + { + "epoch": 2.130281690140845, + "grad_norm": 0.29490870237350464, + "learning_rate": 2.349093961777437e-06, + "loss": 0.3397, + "step": 8470 + }, + { + "epoch": 2.1305331991951713, + "grad_norm": 0.33022499084472656, + "learning_rate": 2.3478534105355423e-06, + "loss": 0.3102, + "step": 8471 + }, + { + "epoch": 2.130784708249497, + "grad_norm": 0.3145847022533417, + "learning_rate": 2.3466130864334085e-06, + "loss": 0.3431, + "step": 8472 + }, + { + "epoch": 2.131036217303823, + "grad_norm": 0.2934006154537201, + "learning_rate": 2.345372989577267e-06, + "loss": 0.3187, + "step": 8473 + }, + { + "epoch": 2.131287726358149, + "grad_norm": 0.29067620635032654, + "learning_rate": 2.344133120073319e-06, + "loss": 0.2961, + "step": 8474 + }, + { + "epoch": 2.131539235412475, + "grad_norm": 0.28378283977508545, + "learning_rate": 2.3428934780277567e-06, + "loss": 0.3215, + "step": 8475 + }, + { + "epoch": 2.1317907444668007, + "grad_norm": 0.2955572009086609, + "learning_rate": 2.341654063546743e-06, + "loss": 0.3309, + "step": 8476 + }, + { + "epoch": 2.132042253521127, + "grad_norm": 0.31690630316734314, + "learning_rate": 2.340414876736429e-06, + "loss": 0.3254, + "step": 8477 + }, + { + "epoch": 2.1322937625754528, + "grad_norm": 0.297673761844635, + "learning_rate": 2.339175917702943e-06, + "loss": 0.3179, + "step": 8478 + }, + { + "epoch": 2.1325452716297786, + "grad_norm": 0.31934690475463867, + "learning_rate": 2.3379371865523926e-06, + "loss": 0.3356, + "step": 8479 + }, + { + "epoch": 2.132796780684105, + "grad_norm": 0.2896483540534973, + "learning_rate": 2.33669868339087e-06, + "loss": 0.3258, + "step": 8480 + }, + { + "epoch": 2.1330482897384306, + "grad_norm": 0.2923082113265991, + "learning_rate": 2.335460408324442e-06, + "loss": 0.326, + "step": 8481 + }, + { + "epoch": 2.1332997987927564, + "grad_norm": 0.30978333950042725, + "learning_rate": 2.3342223614591623e-06, + "loss": 0.3199, + "step": 8482 + }, + { + "epoch": 2.1335513078470827, + "grad_norm": 0.2992372512817383, + "learning_rate": 2.332984542901059e-06, + "loss": 0.3611, + "step": 8483 + }, + { + "epoch": 2.1338028169014085, + "grad_norm": 0.3130771517753601, + "learning_rate": 2.331746952756146e-06, + "loss": 0.3089, + "step": 8484 + }, + { + "epoch": 2.1340543259557343, + "grad_norm": 0.31406500935554504, + "learning_rate": 2.3305095911304123e-06, + "loss": 0.3132, + "step": 8485 + }, + { + "epoch": 2.1343058350100605, + "grad_norm": 0.30472251772880554, + "learning_rate": 2.3292724581298338e-06, + "loss": 0.3216, + "step": 8486 + }, + { + "epoch": 2.1345573440643864, + "grad_norm": 0.31913644075393677, + "learning_rate": 2.328035553860359e-06, + "loss": 0.3051, + "step": 8487 + }, + { + "epoch": 2.134808853118712, + "grad_norm": 0.3231859803199768, + "learning_rate": 2.326798878427924e-06, + "loss": 0.3293, + "step": 8488 + }, + { + "epoch": 2.1350603621730384, + "grad_norm": 0.2944040298461914, + "learning_rate": 2.325562431938442e-06, + "loss": 0.3373, + "step": 8489 + }, + { + "epoch": 2.135311871227364, + "grad_norm": 0.2909225523471832, + "learning_rate": 2.3243262144978063e-06, + "loss": 0.3285, + "step": 8490 + }, + { + "epoch": 2.13556338028169, + "grad_norm": 0.283944696187973, + "learning_rate": 2.3230902262118922e-06, + "loss": 0.3177, + "step": 8491 + }, + { + "epoch": 2.1358148893360163, + "grad_norm": 0.31086745858192444, + "learning_rate": 2.321854467186552e-06, + "loss": 0.3492, + "step": 8492 + }, + { + "epoch": 2.136066398390342, + "grad_norm": 0.3098077178001404, + "learning_rate": 2.320618937527624e-06, + "loss": 0.3144, + "step": 8493 + }, + { + "epoch": 2.136317907444668, + "grad_norm": 0.31440508365631104, + "learning_rate": 2.31938363734092e-06, + "loss": 0.3136, + "step": 8494 + }, + { + "epoch": 2.136569416498994, + "grad_norm": 0.3078277111053467, + "learning_rate": 2.3181485667322397e-06, + "loss": 0.3126, + "step": 8495 + }, + { + "epoch": 2.13682092555332, + "grad_norm": 0.3191978633403778, + "learning_rate": 2.316913725807355e-06, + "loss": 0.2969, + "step": 8496 + }, + { + "epoch": 2.1370724346076457, + "grad_norm": 0.2999849319458008, + "learning_rate": 2.3156791146720266e-06, + "loss": 0.3327, + "step": 8497 + }, + { + "epoch": 2.137323943661972, + "grad_norm": 0.2959345281124115, + "learning_rate": 2.3144447334319866e-06, + "loss": 0.3355, + "step": 8498 + }, + { + "epoch": 2.137575452716298, + "grad_norm": 0.29375961422920227, + "learning_rate": 2.313210582192954e-06, + "loss": 0.3151, + "step": 8499 + }, + { + "epoch": 2.1378269617706236, + "grad_norm": 0.2988927960395813, + "learning_rate": 2.3119766610606293e-06, + "loss": 0.3479, + "step": 8500 + }, + { + "epoch": 2.13807847082495, + "grad_norm": 0.32058557868003845, + "learning_rate": 2.3107429701406845e-06, + "loss": 0.3118, + "step": 8501 + }, + { + "epoch": 2.1383299798792756, + "grad_norm": 0.31546249985694885, + "learning_rate": 2.309509509538783e-06, + "loss": 0.3289, + "step": 8502 + }, + { + "epoch": 2.1385814889336014, + "grad_norm": 0.3331628739833832, + "learning_rate": 2.3082762793605582e-06, + "loss": 0.3276, + "step": 8503 + }, + { + "epoch": 2.1388329979879277, + "grad_norm": 0.3143128752708435, + "learning_rate": 2.307043279711633e-06, + "loss": 0.3154, + "step": 8504 + }, + { + "epoch": 2.1390845070422535, + "grad_norm": 0.33894532918930054, + "learning_rate": 2.3058105106976013e-06, + "loss": 0.3275, + "step": 8505 + }, + { + "epoch": 2.1393360160965793, + "grad_norm": 0.30815622210502625, + "learning_rate": 2.3045779724240468e-06, + "loss": 0.3163, + "step": 8506 + }, + { + "epoch": 2.1395875251509056, + "grad_norm": 0.29994967579841614, + "learning_rate": 2.3033456649965246e-06, + "loss": 0.3283, + "step": 8507 + }, + { + "epoch": 2.1398390342052314, + "grad_norm": 0.3090510666370392, + "learning_rate": 2.302113588520578e-06, + "loss": 0.3087, + "step": 8508 + }, + { + "epoch": 2.140090543259557, + "grad_norm": 0.3417157530784607, + "learning_rate": 2.3008817431017225e-06, + "loss": 0.3144, + "step": 8509 + }, + { + "epoch": 2.1403420523138834, + "grad_norm": 0.3093573749065399, + "learning_rate": 2.2996501288454606e-06, + "loss": 0.3099, + "step": 8510 + }, + { + "epoch": 2.140593561368209, + "grad_norm": 0.2987154722213745, + "learning_rate": 2.2984187458572727e-06, + "loss": 0.3293, + "step": 8511 + }, + { + "epoch": 2.140845070422535, + "grad_norm": 0.31684228777885437, + "learning_rate": 2.297187594242617e-06, + "loss": 0.3171, + "step": 8512 + }, + { + "epoch": 2.1410965794768613, + "grad_norm": 0.2938196063041687, + "learning_rate": 2.2959566741069365e-06, + "loss": 0.3196, + "step": 8513 + }, + { + "epoch": 2.141348088531187, + "grad_norm": 0.30638089776039124, + "learning_rate": 2.2947259855556493e-06, + "loss": 0.3256, + "step": 8514 + }, + { + "epoch": 2.141599597585513, + "grad_norm": 0.2842661738395691, + "learning_rate": 2.2934955286941583e-06, + "loss": 0.3199, + "step": 8515 + }, + { + "epoch": 2.141851106639839, + "grad_norm": 0.30369311571121216, + "learning_rate": 2.292265303627842e-06, + "loss": 0.3241, + "step": 8516 + }, + { + "epoch": 2.142102615694165, + "grad_norm": 0.3095688223838806, + "learning_rate": 2.2910353104620647e-06, + "loss": 0.3437, + "step": 8517 + }, + { + "epoch": 2.1423541247484907, + "grad_norm": 0.29922717809677124, + "learning_rate": 2.2898055493021644e-06, + "loss": 0.3326, + "step": 8518 + }, + { + "epoch": 2.142605633802817, + "grad_norm": 0.30769720673561096, + "learning_rate": 2.288576020253465e-06, + "loss": 0.3249, + "step": 8519 + }, + { + "epoch": 2.142857142857143, + "grad_norm": 0.2776482403278351, + "learning_rate": 2.2873467234212654e-06, + "loss": 0.327, + "step": 8520 + }, + { + "epoch": 2.1431086519114686, + "grad_norm": 0.30429205298423767, + "learning_rate": 2.2861176589108487e-06, + "loss": 0.3109, + "step": 8521 + }, + { + "epoch": 2.143360160965795, + "grad_norm": 0.2737998366355896, + "learning_rate": 2.284888826827478e-06, + "loss": 0.3143, + "step": 8522 + }, + { + "epoch": 2.1436116700201207, + "grad_norm": 0.29733067750930786, + "learning_rate": 2.2836602272763924e-06, + "loss": 0.3157, + "step": 8523 + }, + { + "epoch": 2.1438631790744465, + "grad_norm": 0.32914045453071594, + "learning_rate": 2.2824318603628163e-06, + "loss": 0.3138, + "step": 8524 + }, + { + "epoch": 2.1441146881287727, + "grad_norm": 0.3098900020122528, + "learning_rate": 2.2812037261919483e-06, + "loss": 0.3287, + "step": 8525 + }, + { + "epoch": 2.1443661971830985, + "grad_norm": 0.3037329316139221, + "learning_rate": 2.2799758248689747e-06, + "loss": 0.3236, + "step": 8526 + }, + { + "epoch": 2.1446177062374243, + "grad_norm": 0.334514856338501, + "learning_rate": 2.2787481564990533e-06, + "loss": 0.3309, + "step": 8527 + }, + { + "epoch": 2.1448692152917506, + "grad_norm": 0.28961655497550964, + "learning_rate": 2.27752072118733e-06, + "loss": 0.3469, + "step": 8528 + }, + { + "epoch": 2.1451207243460764, + "grad_norm": 0.31919920444488525, + "learning_rate": 2.2762935190389233e-06, + "loss": 0.3104, + "step": 8529 + }, + { + "epoch": 2.1453722334004026, + "grad_norm": 0.29657045006752014, + "learning_rate": 2.275066550158939e-06, + "loss": 0.3139, + "step": 8530 + }, + { + "epoch": 2.1456237424547284, + "grad_norm": 0.29360705614089966, + "learning_rate": 2.273839814652456e-06, + "loss": 0.3169, + "step": 8531 + }, + { + "epoch": 2.1458752515090542, + "grad_norm": 0.331571102142334, + "learning_rate": 2.27261331262454e-06, + "loss": 0.3003, + "step": 8532 + }, + { + "epoch": 2.1461267605633805, + "grad_norm": 0.28920167684555054, + "learning_rate": 2.2713870441802287e-06, + "loss": 0.3175, + "step": 8533 + }, + { + "epoch": 2.1463782696177063, + "grad_norm": 0.27948108315467834, + "learning_rate": 2.2701610094245496e-06, + "loss": 0.2903, + "step": 8534 + }, + { + "epoch": 2.146629778672032, + "grad_norm": 0.2875995934009552, + "learning_rate": 2.2689352084625e-06, + "loss": 0.3286, + "step": 8535 + }, + { + "epoch": 2.1468812877263583, + "grad_norm": 0.2751031219959259, + "learning_rate": 2.267709641399066e-06, + "loss": 0.2945, + "step": 8536 + }, + { + "epoch": 2.147132796780684, + "grad_norm": 0.3058602213859558, + "learning_rate": 2.2664843083392063e-06, + "loss": 0.3593, + "step": 8537 + }, + { + "epoch": 2.14738430583501, + "grad_norm": 0.31076836585998535, + "learning_rate": 2.265259209387867e-06, + "loss": 0.332, + "step": 8538 + }, + { + "epoch": 2.147635814889336, + "grad_norm": 0.2831321954727173, + "learning_rate": 2.2640343446499656e-06, + "loss": 0.3137, + "step": 8539 + }, + { + "epoch": 2.147887323943662, + "grad_norm": 0.2820286452770233, + "learning_rate": 2.2628097142304083e-06, + "loss": 0.3312, + "step": 8540 + }, + { + "epoch": 2.148138832997988, + "grad_norm": 0.29765576124191284, + "learning_rate": 2.2615853182340737e-06, + "loss": 0.3077, + "step": 8541 + }, + { + "epoch": 2.148390342052314, + "grad_norm": 0.3157854676246643, + "learning_rate": 2.2603611567658267e-06, + "loss": 0.3455, + "step": 8542 + }, + { + "epoch": 2.14864185110664, + "grad_norm": 0.30632486939430237, + "learning_rate": 2.259137229930506e-06, + "loss": 0.3374, + "step": 8543 + }, + { + "epoch": 2.1488933601609657, + "grad_norm": 0.312173068523407, + "learning_rate": 2.2579135378329357e-06, + "loss": 0.3233, + "step": 8544 + }, + { + "epoch": 2.149144869215292, + "grad_norm": 0.2874966263771057, + "learning_rate": 2.2566900805779157e-06, + "loss": 0.3079, + "step": 8545 + }, + { + "epoch": 2.1493963782696177, + "grad_norm": 0.30250847339630127, + "learning_rate": 2.2554668582702294e-06, + "loss": 0.3519, + "step": 8546 + }, + { + "epoch": 2.1496478873239435, + "grad_norm": 0.2963038384914398, + "learning_rate": 2.2542438710146354e-06, + "loss": 0.3195, + "step": 8547 + }, + { + "epoch": 2.1498993963782698, + "grad_norm": 0.299123615026474, + "learning_rate": 2.253021118915878e-06, + "loss": 0.3204, + "step": 8548 + }, + { + "epoch": 2.1501509054325956, + "grad_norm": 0.28678908944129944, + "learning_rate": 2.2517986020786745e-06, + "loss": 0.308, + "step": 8549 + }, + { + "epoch": 2.1504024144869214, + "grad_norm": 0.2993088662624359, + "learning_rate": 2.2505763206077306e-06, + "loss": 0.306, + "step": 8550 + }, + { + "epoch": 2.1506539235412476, + "grad_norm": 0.30504879355430603, + "learning_rate": 2.249354274607723e-06, + "loss": 0.3226, + "step": 8551 + }, + { + "epoch": 2.1509054325955734, + "grad_norm": 0.33471235632896423, + "learning_rate": 2.2481324641833146e-06, + "loss": 0.3101, + "step": 8552 + }, + { + "epoch": 2.1511569416498992, + "grad_norm": 0.30294346809387207, + "learning_rate": 2.246910889439144e-06, + "loss": 0.3211, + "step": 8553 + }, + { + "epoch": 2.1514084507042255, + "grad_norm": 0.2953152656555176, + "learning_rate": 2.2456895504798322e-06, + "loss": 0.3226, + "step": 8554 + }, + { + "epoch": 2.1516599597585513, + "grad_norm": 0.2956666052341461, + "learning_rate": 2.2444684474099807e-06, + "loss": 0.3312, + "step": 8555 + }, + { + "epoch": 2.151911468812877, + "grad_norm": 0.3139225244522095, + "learning_rate": 2.2432475803341674e-06, + "loss": 0.3122, + "step": 8556 + }, + { + "epoch": 2.1521629778672033, + "grad_norm": 0.3028757572174072, + "learning_rate": 2.242026949356954e-06, + "loss": 0.3361, + "step": 8557 + }, + { + "epoch": 2.152414486921529, + "grad_norm": 0.31237322092056274, + "learning_rate": 2.2408065545828765e-06, + "loss": 0.338, + "step": 8558 + }, + { + "epoch": 2.152665995975855, + "grad_norm": 0.3021441400051117, + "learning_rate": 2.239586396116458e-06, + "loss": 0.2959, + "step": 8559 + }, + { + "epoch": 2.152917505030181, + "grad_norm": 0.29930517077445984, + "learning_rate": 2.238366474062194e-06, + "loss": 0.3408, + "step": 8560 + }, + { + "epoch": 2.153169014084507, + "grad_norm": 0.2866148352622986, + "learning_rate": 2.2371467885245667e-06, + "loss": 0.3145, + "step": 8561 + }, + { + "epoch": 2.153420523138833, + "grad_norm": 0.3019120693206787, + "learning_rate": 2.2359273396080306e-06, + "loss": 0.3423, + "step": 8562 + }, + { + "epoch": 2.153672032193159, + "grad_norm": 0.31220588088035583, + "learning_rate": 2.2347081274170273e-06, + "loss": 0.3306, + "step": 8563 + }, + { + "epoch": 2.153923541247485, + "grad_norm": 0.31017324328422546, + "learning_rate": 2.233489152055972e-06, + "loss": 0.3089, + "step": 8564 + }, + { + "epoch": 2.1541750503018107, + "grad_norm": 0.2911800146102905, + "learning_rate": 2.2322704136292632e-06, + "loss": 0.3092, + "step": 8565 + }, + { + "epoch": 2.154426559356137, + "grad_norm": 0.2987365424633026, + "learning_rate": 2.231051912241281e-06, + "loss": 0.3316, + "step": 8566 + }, + { + "epoch": 2.1546780684104627, + "grad_norm": 0.30714505910873413, + "learning_rate": 2.229833647996378e-06, + "loss": 0.2903, + "step": 8567 + }, + { + "epoch": 2.1549295774647885, + "grad_norm": 0.31266728043556213, + "learning_rate": 2.228615620998894e-06, + "loss": 0.3185, + "step": 8568 + }, + { + "epoch": 2.1551810865191148, + "grad_norm": 0.27466437220573425, + "learning_rate": 2.2273978313531436e-06, + "loss": 0.294, + "step": 8569 + }, + { + "epoch": 2.1554325955734406, + "grad_norm": 0.3066222071647644, + "learning_rate": 2.2261802791634245e-06, + "loss": 0.3285, + "step": 8570 + }, + { + "epoch": 2.155684104627767, + "grad_norm": 0.31608250737190247, + "learning_rate": 2.2249629645340104e-06, + "loss": 0.3345, + "step": 8571 + }, + { + "epoch": 2.1559356136820926, + "grad_norm": 0.3242324888706207, + "learning_rate": 2.2237458875691592e-06, + "loss": 0.2912, + "step": 8572 + }, + { + "epoch": 2.1561871227364184, + "grad_norm": 0.2953372895717621, + "learning_rate": 2.2225290483731034e-06, + "loss": 0.3208, + "step": 8573 + }, + { + "epoch": 2.1564386317907447, + "grad_norm": 0.3065500259399414, + "learning_rate": 2.221312447050058e-06, + "loss": 0.333, + "step": 8574 + }, + { + "epoch": 2.1566901408450705, + "grad_norm": 0.2919135093688965, + "learning_rate": 2.2200960837042202e-06, + "loss": 0.3145, + "step": 8575 + }, + { + "epoch": 2.1569416498993963, + "grad_norm": 0.2694363594055176, + "learning_rate": 2.2188799584397604e-06, + "loss": 0.3248, + "step": 8576 + }, + { + "epoch": 2.1571931589537225, + "grad_norm": 0.29707813262939453, + "learning_rate": 2.2176640713608345e-06, + "loss": 0.2982, + "step": 8577 + }, + { + "epoch": 2.1574446680080483, + "grad_norm": 0.3023638129234314, + "learning_rate": 2.2164484225715734e-06, + "loss": 0.3199, + "step": 8578 + }, + { + "epoch": 2.157696177062374, + "grad_norm": 0.29433873295783997, + "learning_rate": 2.215233012176093e-06, + "loss": 0.3304, + "step": 8579 + }, + { + "epoch": 2.1579476861167004, + "grad_norm": 0.2980920672416687, + "learning_rate": 2.2140178402784814e-06, + "loss": 0.3326, + "step": 8580 + }, + { + "epoch": 2.158199195171026, + "grad_norm": 0.3132590651512146, + "learning_rate": 2.212802906982815e-06, + "loss": 0.3246, + "step": 8581 + }, + { + "epoch": 2.158450704225352, + "grad_norm": 0.29961881041526794, + "learning_rate": 2.2115882123931403e-06, + "loss": 0.3313, + "step": 8582 + }, + { + "epoch": 2.1587022132796783, + "grad_norm": 0.3205260634422302, + "learning_rate": 2.210373756613494e-06, + "loss": 0.3257, + "step": 8583 + }, + { + "epoch": 2.158953722334004, + "grad_norm": 0.2848275601863861, + "learning_rate": 2.209159539747881e-06, + "loss": 0.3306, + "step": 8584 + }, + { + "epoch": 2.15920523138833, + "grad_norm": 0.3031473457813263, + "learning_rate": 2.2079455619002936e-06, + "loss": 0.3369, + "step": 8585 + }, + { + "epoch": 2.159456740442656, + "grad_norm": 0.3208923935890198, + "learning_rate": 2.2067318231747047e-06, + "loss": 0.3092, + "step": 8586 + }, + { + "epoch": 2.159708249496982, + "grad_norm": 0.31306546926498413, + "learning_rate": 2.2055183236750577e-06, + "loss": 0.3051, + "step": 8587 + }, + { + "epoch": 2.1599597585513077, + "grad_norm": 0.3087550103664398, + "learning_rate": 2.2043050635052866e-06, + "loss": 0.3105, + "step": 8588 + }, + { + "epoch": 2.160211267605634, + "grad_norm": 0.2876415252685547, + "learning_rate": 2.2030920427692947e-06, + "loss": 0.3006, + "step": 8589 + }, + { + "epoch": 2.16046277665996, + "grad_norm": 0.3287197947502136, + "learning_rate": 2.201879261570974e-06, + "loss": 0.3298, + "step": 8590 + }, + { + "epoch": 2.1607142857142856, + "grad_norm": 0.30581316351890564, + "learning_rate": 2.2006667200141877e-06, + "loss": 0.3205, + "step": 8591 + }, + { + "epoch": 2.160965794768612, + "grad_norm": 0.30748024582862854, + "learning_rate": 2.199454418202786e-06, + "loss": 0.3159, + "step": 8592 + }, + { + "epoch": 2.1612173038229376, + "grad_norm": 0.32226288318634033, + "learning_rate": 2.1982423562405915e-06, + "loss": 0.3129, + "step": 8593 + }, + { + "epoch": 2.1614688128772634, + "grad_norm": 0.2897011935710907, + "learning_rate": 2.1970305342314135e-06, + "loss": 0.336, + "step": 8594 + }, + { + "epoch": 2.1617203219315897, + "grad_norm": 0.3219929039478302, + "learning_rate": 2.1958189522790325e-06, + "loss": 0.3294, + "step": 8595 + }, + { + "epoch": 2.1619718309859155, + "grad_norm": 0.3067570626735687, + "learning_rate": 2.194607610487216e-06, + "loss": 0.3422, + "step": 8596 + }, + { + "epoch": 2.1622233400402413, + "grad_norm": 0.2897886633872986, + "learning_rate": 2.1933965089597087e-06, + "loss": 0.3096, + "step": 8597 + }, + { + "epoch": 2.1624748490945676, + "grad_norm": 0.2907717525959015, + "learning_rate": 2.1921856478002302e-06, + "loss": 0.3176, + "step": 8598 + }, + { + "epoch": 2.1627263581488934, + "grad_norm": 0.31856968998908997, + "learning_rate": 2.1909750271124873e-06, + "loss": 0.324, + "step": 8599 + }, + { + "epoch": 2.162977867203219, + "grad_norm": 0.30686262249946594, + "learning_rate": 2.1897646470001588e-06, + "loss": 0.3286, + "step": 8600 + }, + { + "epoch": 2.1632293762575454, + "grad_norm": 0.30653515458106995, + "learning_rate": 2.188554507566909e-06, + "loss": 0.3303, + "step": 8601 + }, + { + "epoch": 2.163480885311871, + "grad_norm": 0.3120279908180237, + "learning_rate": 2.187344608916375e-06, + "loss": 0.3467, + "step": 8602 + }, + { + "epoch": 2.163732394366197, + "grad_norm": 0.32078132033348083, + "learning_rate": 2.1861349511521817e-06, + "loss": 0.3224, + "step": 8603 + }, + { + "epoch": 2.1639839034205233, + "grad_norm": 0.2890268862247467, + "learning_rate": 2.1849255343779246e-06, + "loss": 0.3249, + "step": 8604 + }, + { + "epoch": 2.164235412474849, + "grad_norm": 0.3207026720046997, + "learning_rate": 2.183716358697186e-06, + "loss": 0.3349, + "step": 8605 + }, + { + "epoch": 2.164486921529175, + "grad_norm": 0.30875587463378906, + "learning_rate": 2.1825074242135206e-06, + "loss": 0.2937, + "step": 8606 + }, + { + "epoch": 2.164738430583501, + "grad_norm": 0.27507445216178894, + "learning_rate": 2.181298731030469e-06, + "loss": 0.3357, + "step": 8607 + }, + { + "epoch": 2.164989939637827, + "grad_norm": 0.2858785092830658, + "learning_rate": 2.180090279251548e-06, + "loss": 0.3203, + "step": 8608 + }, + { + "epoch": 2.1652414486921527, + "grad_norm": 0.3202469050884247, + "learning_rate": 2.1788820689802524e-06, + "loss": 0.329, + "step": 8609 + }, + { + "epoch": 2.165492957746479, + "grad_norm": 0.30295151472091675, + "learning_rate": 2.1776741003200603e-06, + "loss": 0.3369, + "step": 8610 + }, + { + "epoch": 2.165744466800805, + "grad_norm": 0.3162943720817566, + "learning_rate": 2.1764663733744234e-06, + "loss": 0.3167, + "step": 8611 + }, + { + "epoch": 2.1659959758551306, + "grad_norm": 0.2848318815231323, + "learning_rate": 2.175258888246779e-06, + "loss": 0.311, + "step": 8612 + }, + { + "epoch": 2.166247484909457, + "grad_norm": 0.31551775336265564, + "learning_rate": 2.174051645040538e-06, + "loss": 0.3374, + "step": 8613 + }, + { + "epoch": 2.1664989939637826, + "grad_norm": 0.2893933057785034, + "learning_rate": 2.172844643859096e-06, + "loss": 0.3212, + "step": 8614 + }, + { + "epoch": 2.1667505030181085, + "grad_norm": 0.295808881521225, + "learning_rate": 2.1716378848058217e-06, + "loss": 0.3155, + "step": 8615 + }, + { + "epoch": 2.1670020120724347, + "grad_norm": 0.3089081346988678, + "learning_rate": 2.1704313679840706e-06, + "loss": 0.3483, + "step": 8616 + }, + { + "epoch": 2.1672535211267605, + "grad_norm": 0.3219684064388275, + "learning_rate": 2.169225093497169e-06, + "loss": 0.3358, + "step": 8617 + }, + { + "epoch": 2.1675050301810863, + "grad_norm": 0.3217262029647827, + "learning_rate": 2.1680190614484292e-06, + "loss": 0.3385, + "step": 8618 + }, + { + "epoch": 2.1677565392354126, + "grad_norm": 0.28920111060142517, + "learning_rate": 2.166813271941141e-06, + "loss": 0.3109, + "step": 8619 + }, + { + "epoch": 2.1680080482897384, + "grad_norm": 0.32903382182121277, + "learning_rate": 2.165607725078571e-06, + "loss": 0.3133, + "step": 8620 + }, + { + "epoch": 2.168259557344064, + "grad_norm": 0.3116111159324646, + "learning_rate": 2.1644024209639687e-06, + "loss": 0.3415, + "step": 8621 + }, + { + "epoch": 2.1685110663983904, + "grad_norm": 0.30731502175331116, + "learning_rate": 2.1631973597005574e-06, + "loss": 0.3426, + "step": 8622 + }, + { + "epoch": 2.1687625754527162, + "grad_norm": 0.29497724771499634, + "learning_rate": 2.1619925413915475e-06, + "loss": 0.3216, + "step": 8623 + }, + { + "epoch": 2.169014084507042, + "grad_norm": 0.30117177963256836, + "learning_rate": 2.16078796614012e-06, + "loss": 0.3228, + "step": 8624 + }, + { + "epoch": 2.1692655935613683, + "grad_norm": 0.29439517855644226, + "learning_rate": 2.159583634049443e-06, + "loss": 0.3107, + "step": 8625 + }, + { + "epoch": 2.169517102615694, + "grad_norm": 0.2719261944293976, + "learning_rate": 2.1583795452226563e-06, + "loss": 0.2919, + "step": 8626 + }, + { + "epoch": 2.16976861167002, + "grad_norm": 0.32442960143089294, + "learning_rate": 2.157175699762886e-06, + "loss": 0.3018, + "step": 8627 + }, + { + "epoch": 2.170020120724346, + "grad_norm": 0.2717970013618469, + "learning_rate": 2.155972097773231e-06, + "loss": 0.3407, + "step": 8628 + }, + { + "epoch": 2.170271629778672, + "grad_norm": 0.2874637842178345, + "learning_rate": 2.1547687393567736e-06, + "loss": 0.2937, + "step": 8629 + }, + { + "epoch": 2.170523138832998, + "grad_norm": 0.28952640295028687, + "learning_rate": 2.153565624616576e-06, + "loss": 0.3032, + "step": 8630 + }, + { + "epoch": 2.170774647887324, + "grad_norm": 0.30068325996398926, + "learning_rate": 2.1523627536556728e-06, + "loss": 0.344, + "step": 8631 + }, + { + "epoch": 2.17102615694165, + "grad_norm": 0.2989504933357239, + "learning_rate": 2.1511601265770876e-06, + "loss": 0.3452, + "step": 8632 + }, + { + "epoch": 2.171277665995976, + "grad_norm": 0.30707207322120667, + "learning_rate": 2.1499577434838132e-06, + "loss": 0.2972, + "step": 8633 + }, + { + "epoch": 2.171529175050302, + "grad_norm": 0.30241820216178894, + "learning_rate": 2.14875560447883e-06, + "loss": 0.3093, + "step": 8634 + }, + { + "epoch": 2.1717806841046277, + "grad_norm": 0.28262701630592346, + "learning_rate": 2.14755370966509e-06, + "loss": 0.3204, + "step": 8635 + }, + { + "epoch": 2.172032193158954, + "grad_norm": 0.3182533085346222, + "learning_rate": 2.146352059145532e-06, + "loss": 0.3132, + "step": 8636 + }, + { + "epoch": 2.1722837022132797, + "grad_norm": 0.30099421739578247, + "learning_rate": 2.1451506530230654e-06, + "loss": 0.3211, + "step": 8637 + }, + { + "epoch": 2.1725352112676055, + "grad_norm": 0.28418776392936707, + "learning_rate": 2.1439494914005877e-06, + "loss": 0.3228, + "step": 8638 + }, + { + "epoch": 2.1727867203219318, + "grad_norm": 0.31524962186813354, + "learning_rate": 2.1427485743809667e-06, + "loss": 0.3276, + "step": 8639 + }, + { + "epoch": 2.1730382293762576, + "grad_norm": 0.28617793321609497, + "learning_rate": 2.141547902067056e-06, + "loss": 0.3266, + "step": 8640 + }, + { + "epoch": 2.1732897384305834, + "grad_norm": 0.2999754548072815, + "learning_rate": 2.1403474745616863e-06, + "loss": 0.3383, + "step": 8641 + }, + { + "epoch": 2.1735412474849096, + "grad_norm": 0.3075141906738281, + "learning_rate": 2.1391472919676637e-06, + "loss": 0.313, + "step": 8642 + }, + { + "epoch": 2.1737927565392354, + "grad_norm": 0.3021905720233917, + "learning_rate": 2.13794735438778e-06, + "loss": 0.3151, + "step": 8643 + }, + { + "epoch": 2.1740442655935612, + "grad_norm": 0.3037421703338623, + "learning_rate": 2.136747661924799e-06, + "loss": 0.3213, + "step": 8644 + }, + { + "epoch": 2.1742957746478875, + "grad_norm": 0.3154190480709076, + "learning_rate": 2.1355482146814693e-06, + "loss": 0.3147, + "step": 8645 + }, + { + "epoch": 2.1745472837022133, + "grad_norm": 0.3106088638305664, + "learning_rate": 2.1343490127605136e-06, + "loss": 0.3102, + "step": 8646 + }, + { + "epoch": 2.174798792756539, + "grad_norm": 0.3053724765777588, + "learning_rate": 2.1331500562646396e-06, + "loss": 0.3177, + "step": 8647 + }, + { + "epoch": 2.1750503018108653, + "grad_norm": 0.30620327591896057, + "learning_rate": 2.1319513452965264e-06, + "loss": 0.314, + "step": 8648 + }, + { + "epoch": 2.175301810865191, + "grad_norm": 0.291100412607193, + "learning_rate": 2.13075287995884e-06, + "loss": 0.3018, + "step": 8649 + }, + { + "epoch": 2.175553319919517, + "grad_norm": 0.3090902864933014, + "learning_rate": 2.129554660354217e-06, + "loss": 0.3476, + "step": 8650 + }, + { + "epoch": 2.175804828973843, + "grad_norm": 0.27171772718429565, + "learning_rate": 2.1283566865852824e-06, + "loss": 0.3119, + "step": 8651 + }, + { + "epoch": 2.176056338028169, + "grad_norm": 0.31004947423934937, + "learning_rate": 2.1271589587546303e-06, + "loss": 0.3367, + "step": 8652 + }, + { + "epoch": 2.176307847082495, + "grad_norm": 0.2942144274711609, + "learning_rate": 2.1259614769648434e-06, + "loss": 0.3417, + "step": 8653 + }, + { + "epoch": 2.176559356136821, + "grad_norm": 0.29157233238220215, + "learning_rate": 2.124764241318474e-06, + "loss": 0.3177, + "step": 8654 + }, + { + "epoch": 2.176810865191147, + "grad_norm": 0.34676676988601685, + "learning_rate": 2.1235672519180615e-06, + "loss": 0.3086, + "step": 8655 + }, + { + "epoch": 2.1770623742454727, + "grad_norm": 0.30454134941101074, + "learning_rate": 2.1223705088661174e-06, + "loss": 0.3145, + "step": 8656 + }, + { + "epoch": 2.177313883299799, + "grad_norm": 0.2989632487297058, + "learning_rate": 2.121174012265138e-06, + "loss": 0.3293, + "step": 8657 + }, + { + "epoch": 2.1775653923541247, + "grad_norm": 0.3130037784576416, + "learning_rate": 2.119977762217594e-06, + "loss": 0.3147, + "step": 8658 + }, + { + "epoch": 2.1778169014084505, + "grad_norm": 0.28079167008399963, + "learning_rate": 2.118781758825938e-06, + "loss": 0.3225, + "step": 8659 + }, + { + "epoch": 2.1780684104627768, + "grad_norm": 0.30156850814819336, + "learning_rate": 2.117586002192598e-06, + "loss": 0.3157, + "step": 8660 + }, + { + "epoch": 2.1783199195171026, + "grad_norm": 0.3073710799217224, + "learning_rate": 2.1163904924199865e-06, + "loss": 0.3166, + "step": 8661 + }, + { + "epoch": 2.1785714285714284, + "grad_norm": 0.3458421528339386, + "learning_rate": 2.1151952296104876e-06, + "loss": 0.3273, + "step": 8662 + }, + { + "epoch": 2.1788229376257546, + "grad_norm": 0.3005492687225342, + "learning_rate": 2.1140002138664718e-06, + "loss": 0.3192, + "step": 8663 + }, + { + "epoch": 2.1790744466800804, + "grad_norm": 0.301626056432724, + "learning_rate": 2.1128054452902812e-06, + "loss": 0.2996, + "step": 8664 + }, + { + "epoch": 2.1793259557344062, + "grad_norm": 0.28979891538619995, + "learning_rate": 2.111610923984244e-06, + "loss": 0.3323, + "step": 8665 + }, + { + "epoch": 2.1795774647887325, + "grad_norm": 0.32507914304733276, + "learning_rate": 2.1104166500506596e-06, + "loss": 0.3134, + "step": 8666 + }, + { + "epoch": 2.1798289738430583, + "grad_norm": 0.30215296149253845, + "learning_rate": 2.1092226235918135e-06, + "loss": 0.343, + "step": 8667 + }, + { + "epoch": 2.1800804828973845, + "grad_norm": 0.3291065990924835, + "learning_rate": 2.1080288447099635e-06, + "loss": 0.3444, + "step": 8668 + }, + { + "epoch": 2.1803319919517103, + "grad_norm": 0.29154881834983826, + "learning_rate": 2.106835313507352e-06, + "loss": 0.3347, + "step": 8669 + }, + { + "epoch": 2.180583501006036, + "grad_norm": 0.3173246383666992, + "learning_rate": 2.1056420300861953e-06, + "loss": 0.3234, + "step": 8670 + }, + { + "epoch": 2.1808350100603624, + "grad_norm": 0.29975295066833496, + "learning_rate": 2.104448994548693e-06, + "loss": 0.3172, + "step": 8671 + }, + { + "epoch": 2.181086519114688, + "grad_norm": 0.29570671916007996, + "learning_rate": 2.103256206997018e-06, + "loss": 0.3228, + "step": 8672 + }, + { + "epoch": 2.181338028169014, + "grad_norm": 0.3010045289993286, + "learning_rate": 2.1020636675333273e-06, + "loss": 0.3299, + "step": 8673 + }, + { + "epoch": 2.1815895372233403, + "grad_norm": 0.32294243574142456, + "learning_rate": 2.1008713762597554e-06, + "loss": 0.3119, + "step": 8674 + }, + { + "epoch": 2.181841046277666, + "grad_norm": 0.3337547779083252, + "learning_rate": 2.0996793332784116e-06, + "loss": 0.3317, + "step": 8675 + }, + { + "epoch": 2.182092555331992, + "grad_norm": 0.3114114999771118, + "learning_rate": 2.0984875386913904e-06, + "loss": 0.3526, + "step": 8676 + }, + { + "epoch": 2.182344064386318, + "grad_norm": 0.31616508960723877, + "learning_rate": 2.097295992600758e-06, + "loss": 0.2981, + "step": 8677 + }, + { + "epoch": 2.182595573440644, + "grad_norm": 0.2805394232273102, + "learning_rate": 2.0961046951085662e-06, + "loss": 0.3403, + "step": 8678 + }, + { + "epoch": 2.1828470824949697, + "grad_norm": 0.28356558084487915, + "learning_rate": 2.094913646316839e-06, + "loss": 0.3279, + "step": 8679 + }, + { + "epoch": 2.183098591549296, + "grad_norm": 0.2977490723133087, + "learning_rate": 2.0937228463275854e-06, + "loss": 0.3316, + "step": 8680 + }, + { + "epoch": 2.183350100603622, + "grad_norm": 0.27247682213783264, + "learning_rate": 2.092532295242787e-06, + "loss": 0.3144, + "step": 8681 + }, + { + "epoch": 2.1836016096579476, + "grad_norm": 0.307102233171463, + "learning_rate": 2.0913419931644095e-06, + "loss": 0.3271, + "step": 8682 + }, + { + "epoch": 2.183853118712274, + "grad_norm": 0.2990267276763916, + "learning_rate": 2.0901519401943924e-06, + "loss": 0.2964, + "step": 8683 + }, + { + "epoch": 2.1841046277665996, + "grad_norm": 0.30906108021736145, + "learning_rate": 2.088962136434658e-06, + "loss": 0.3318, + "step": 8684 + }, + { + "epoch": 2.1843561368209254, + "grad_norm": 0.2764362394809723, + "learning_rate": 2.0877725819871065e-06, + "loss": 0.3023, + "step": 8685 + }, + { + "epoch": 2.1846076458752517, + "grad_norm": 0.3090154826641083, + "learning_rate": 2.0865832769536125e-06, + "loss": 0.346, + "step": 8686 + }, + { + "epoch": 2.1848591549295775, + "grad_norm": 0.3105575442314148, + "learning_rate": 2.0853942214360365e-06, + "loss": 0.3426, + "step": 8687 + }, + { + "epoch": 2.1851106639839033, + "grad_norm": 0.2777365744113922, + "learning_rate": 2.0842054155362105e-06, + "loss": 0.3196, + "step": 8688 + }, + { + "epoch": 2.1853621730382295, + "grad_norm": 0.29215675592422485, + "learning_rate": 2.0830168593559513e-06, + "loss": 0.3114, + "step": 8689 + }, + { + "epoch": 2.1856136820925554, + "grad_norm": 0.29614973068237305, + "learning_rate": 2.081828552997047e-06, + "loss": 0.3229, + "step": 8690 + }, + { + "epoch": 2.185865191146881, + "grad_norm": 0.30876439809799194, + "learning_rate": 2.0806404965612737e-06, + "loss": 0.3022, + "step": 8691 + }, + { + "epoch": 2.1861167002012074, + "grad_norm": 0.3041656017303467, + "learning_rate": 2.0794526901503757e-06, + "loss": 0.3057, + "step": 8692 + }, + { + "epoch": 2.186368209255533, + "grad_norm": 0.29282814264297485, + "learning_rate": 2.0782651338660862e-06, + "loss": 0.3257, + "step": 8693 + }, + { + "epoch": 2.186619718309859, + "grad_norm": 0.3103329837322235, + "learning_rate": 2.077077827810108e-06, + "loss": 0.3163, + "step": 8694 + }, + { + "epoch": 2.1868712273641853, + "grad_norm": 0.28378820419311523, + "learning_rate": 2.075890772084128e-06, + "loss": 0.3226, + "step": 8695 + }, + { + "epoch": 2.187122736418511, + "grad_norm": 0.30652692914009094, + "learning_rate": 2.074703966789812e-06, + "loss": 0.3087, + "step": 8696 + }, + { + "epoch": 2.187374245472837, + "grad_norm": 0.3030233681201935, + "learning_rate": 2.073517412028799e-06, + "loss": 0.3096, + "step": 8697 + }, + { + "epoch": 2.187625754527163, + "grad_norm": 0.28076255321502686, + "learning_rate": 2.072331107902713e-06, + "loss": 0.3153, + "step": 8698 + }, + { + "epoch": 2.187877263581489, + "grad_norm": 0.2828161418437958, + "learning_rate": 2.0711450545131505e-06, + "loss": 0.3112, + "step": 8699 + }, + { + "epoch": 2.1881287726358147, + "grad_norm": 0.29777970910072327, + "learning_rate": 2.0699592519616934e-06, + "loss": 0.3253, + "step": 8700 + }, + { + "epoch": 2.188380281690141, + "grad_norm": 0.32333430647850037, + "learning_rate": 2.0687737003498944e-06, + "loss": 0.3359, + "step": 8701 + }, + { + "epoch": 2.188631790744467, + "grad_norm": 0.30716678500175476, + "learning_rate": 2.0675883997792913e-06, + "loss": 0.3126, + "step": 8702 + }, + { + "epoch": 2.1888832997987926, + "grad_norm": 0.28720593452453613, + "learning_rate": 2.0664033503513953e-06, + "loss": 0.3196, + "step": 8703 + }, + { + "epoch": 2.189134808853119, + "grad_norm": 0.29374513030052185, + "learning_rate": 2.0652185521677016e-06, + "loss": 0.3285, + "step": 8704 + }, + { + "epoch": 2.1893863179074446, + "grad_norm": 0.30496105551719666, + "learning_rate": 2.064034005329677e-06, + "loss": 0.3008, + "step": 8705 + }, + { + "epoch": 2.1896378269617705, + "grad_norm": 0.29388949275016785, + "learning_rate": 2.0628497099387727e-06, + "loss": 0.3009, + "step": 8706 + }, + { + "epoch": 2.1898893360160967, + "grad_norm": 0.3176174461841583, + "learning_rate": 2.061665666096418e-06, + "loss": 0.3277, + "step": 8707 + }, + { + "epoch": 2.1901408450704225, + "grad_norm": 0.3095850646495819, + "learning_rate": 2.0604818739040143e-06, + "loss": 0.3092, + "step": 8708 + }, + { + "epoch": 2.1903923541247483, + "grad_norm": 0.28293004631996155, + "learning_rate": 2.0592983334629506e-06, + "loss": 0.3213, + "step": 8709 + }, + { + "epoch": 2.1906438631790746, + "grad_norm": 0.2949610650539398, + "learning_rate": 2.0581150448745863e-06, + "loss": 0.3194, + "step": 8710 + }, + { + "epoch": 2.1908953722334004, + "grad_norm": 0.299869179725647, + "learning_rate": 2.0569320082402654e-06, + "loss": 0.3376, + "step": 8711 + }, + { + "epoch": 2.191146881287726, + "grad_norm": 0.28059983253479004, + "learning_rate": 2.055749223661305e-06, + "loss": 0.3159, + "step": 8712 + }, + { + "epoch": 2.1913983903420524, + "grad_norm": 0.3038329780101776, + "learning_rate": 2.0545666912390053e-06, + "loss": 0.3033, + "step": 8713 + }, + { + "epoch": 2.191649899396378, + "grad_norm": 0.3102431893348694, + "learning_rate": 2.053384411074641e-06, + "loss": 0.3305, + "step": 8714 + }, + { + "epoch": 2.191901408450704, + "grad_norm": 0.30974650382995605, + "learning_rate": 2.0522023832694694e-06, + "loss": 0.3425, + "step": 8715 + }, + { + "epoch": 2.1921529175050303, + "grad_norm": 0.31163522601127625, + "learning_rate": 2.051020607924721e-06, + "loss": 0.3385, + "step": 8716 + }, + { + "epoch": 2.192404426559356, + "grad_norm": 0.29782193899154663, + "learning_rate": 2.049839085141608e-06, + "loss": 0.3237, + "step": 8717 + }, + { + "epoch": 2.192655935613682, + "grad_norm": 0.3062836229801178, + "learning_rate": 2.048657815021323e-06, + "loss": 0.3187, + "step": 8718 + }, + { + "epoch": 2.192907444668008, + "grad_norm": 0.32062938809394836, + "learning_rate": 2.0474767976650313e-06, + "loss": 0.2985, + "step": 8719 + }, + { + "epoch": 2.193158953722334, + "grad_norm": 0.3006410300731659, + "learning_rate": 2.0462960331738824e-06, + "loss": 0.3242, + "step": 8720 + }, + { + "epoch": 2.1934104627766597, + "grad_norm": 0.28875410556793213, + "learning_rate": 2.0451155216489983e-06, + "loss": 0.3338, + "step": 8721 + }, + { + "epoch": 2.193661971830986, + "grad_norm": 0.3076934218406677, + "learning_rate": 2.043935263191486e-06, + "loss": 0.3354, + "step": 8722 + }, + { + "epoch": 2.193913480885312, + "grad_norm": 0.301831990480423, + "learning_rate": 2.0427552579024234e-06, + "loss": 0.3407, + "step": 8723 + }, + { + "epoch": 2.1941649899396376, + "grad_norm": 0.31272050738334656, + "learning_rate": 2.041575505882874e-06, + "loss": 0.3296, + "step": 8724 + }, + { + "epoch": 2.194416498993964, + "grad_norm": 0.29415902495384216, + "learning_rate": 2.040396007233873e-06, + "loss": 0.315, + "step": 8725 + }, + { + "epoch": 2.1946680080482897, + "grad_norm": 0.32999396324157715, + "learning_rate": 2.039216762056439e-06, + "loss": 0.3289, + "step": 8726 + }, + { + "epoch": 2.1949195171026155, + "grad_norm": 0.3093070089817047, + "learning_rate": 2.0380377704515687e-06, + "loss": 0.3259, + "step": 8727 + }, + { + "epoch": 2.1951710261569417, + "grad_norm": 0.31014779210090637, + "learning_rate": 2.0368590325202315e-06, + "loss": 0.318, + "step": 8728 + }, + { + "epoch": 2.1954225352112675, + "grad_norm": 0.29741162061691284, + "learning_rate": 2.035680548363382e-06, + "loss": 0.3241, + "step": 8729 + }, + { + "epoch": 2.1956740442655938, + "grad_norm": 0.27723386883735657, + "learning_rate": 2.0345023180819474e-06, + "loss": 0.3286, + "step": 8730 + }, + { + "epoch": 2.1959255533199196, + "grad_norm": 0.31631845235824585, + "learning_rate": 2.033324341776839e-06, + "loss": 0.2967, + "step": 8731 + }, + { + "epoch": 2.1961770623742454, + "grad_norm": 0.3349016606807709, + "learning_rate": 2.03214661954894e-06, + "loss": 0.3171, + "step": 8732 + }, + { + "epoch": 2.1964285714285716, + "grad_norm": 0.3033004701137543, + "learning_rate": 2.030969151499117e-06, + "loss": 0.3221, + "step": 8733 + }, + { + "epoch": 2.1966800804828974, + "grad_norm": 0.29788362979888916, + "learning_rate": 2.0297919377282106e-06, + "loss": 0.3317, + "step": 8734 + }, + { + "epoch": 2.1969315895372232, + "grad_norm": 0.2985301911830902, + "learning_rate": 2.0286149783370453e-06, + "loss": 0.3315, + "step": 8735 + }, + { + "epoch": 2.1971830985915495, + "grad_norm": 0.30903440713882446, + "learning_rate": 2.027438273426416e-06, + "loss": 0.3339, + "step": 8736 + }, + { + "epoch": 2.1974346076458753, + "grad_norm": 0.33624449372291565, + "learning_rate": 2.0262618230971023e-06, + "loss": 0.3379, + "step": 8737 + }, + { + "epoch": 2.197686116700201, + "grad_norm": 0.31500041484832764, + "learning_rate": 2.0250856274498617e-06, + "loss": 0.3312, + "step": 8738 + }, + { + "epoch": 2.1979376257545273, + "grad_norm": 0.3018837571144104, + "learning_rate": 2.0239096865854243e-06, + "loss": 0.3333, + "step": 8739 + }, + { + "epoch": 2.198189134808853, + "grad_norm": 0.305803120136261, + "learning_rate": 2.0227340006045056e-06, + "loss": 0.3403, + "step": 8740 + }, + { + "epoch": 2.198440643863179, + "grad_norm": 0.2978799045085907, + "learning_rate": 2.021558569607792e-06, + "loss": 0.3093, + "step": 8741 + }, + { + "epoch": 2.198692152917505, + "grad_norm": 0.2844066023826599, + "learning_rate": 2.020383393695956e-06, + "loss": 0.3106, + "step": 8742 + }, + { + "epoch": 2.198943661971831, + "grad_norm": 0.30605459213256836, + "learning_rate": 2.01920847296964e-06, + "loss": 0.3195, + "step": 8743 + }, + { + "epoch": 2.199195171026157, + "grad_norm": 0.30778858065605164, + "learning_rate": 2.0180338075294726e-06, + "loss": 0.3221, + "step": 8744 + }, + { + "epoch": 2.199446680080483, + "grad_norm": 0.29119858145713806, + "learning_rate": 2.016859397476052e-06, + "loss": 0.3172, + "step": 8745 + }, + { + "epoch": 2.199698189134809, + "grad_norm": 0.3088303804397583, + "learning_rate": 2.0156852429099638e-06, + "loss": 0.3166, + "step": 8746 + }, + { + "epoch": 2.1999496981891347, + "grad_norm": 0.281408429145813, + "learning_rate": 2.014511343931763e-06, + "loss": 0.3352, + "step": 8747 + }, + { + "epoch": 2.200201207243461, + "grad_norm": 0.30251288414001465, + "learning_rate": 2.0133377006419885e-06, + "loss": 0.3042, + "step": 8748 + }, + { + "epoch": 2.2004527162977867, + "grad_norm": 0.3207908868789673, + "learning_rate": 2.0121643131411568e-06, + "loss": 0.3338, + "step": 8749 + }, + { + "epoch": 2.2007042253521125, + "grad_norm": 0.3297126889228821, + "learning_rate": 2.0109911815297585e-06, + "loss": 0.3362, + "step": 8750 + }, + { + "epoch": 2.2009557344064388, + "grad_norm": 0.29778042435646057, + "learning_rate": 2.0098183059082675e-06, + "loss": 0.3456, + "step": 8751 + }, + { + "epoch": 2.2012072434607646, + "grad_norm": 0.30129680037498474, + "learning_rate": 2.008645686377131e-06, + "loss": 0.3625, + "step": 8752 + }, + { + "epoch": 2.2014587525150904, + "grad_norm": 0.2970339059829712, + "learning_rate": 2.007473323036779e-06, + "loss": 0.3177, + "step": 8753 + }, + { + "epoch": 2.2017102615694166, + "grad_norm": 0.3131774961948395, + "learning_rate": 2.0063012159876138e-06, + "loss": 0.3363, + "step": 8754 + }, + { + "epoch": 2.2019617706237424, + "grad_norm": 0.31431764364242554, + "learning_rate": 2.005129365330023e-06, + "loss": 0.3446, + "step": 8755 + }, + { + "epoch": 2.2022132796780682, + "grad_norm": 0.2815970182418823, + "learning_rate": 2.0039577711643642e-06, + "loss": 0.3325, + "step": 8756 + }, + { + "epoch": 2.2024647887323945, + "grad_norm": 0.2978397309780121, + "learning_rate": 2.002786433590981e-06, + "loss": 0.3338, + "step": 8757 + }, + { + "epoch": 2.2027162977867203, + "grad_norm": 0.3014795780181885, + "learning_rate": 2.001615352710188e-06, + "loss": 0.3291, + "step": 8758 + }, + { + "epoch": 2.202967806841046, + "grad_norm": 0.3280890882015228, + "learning_rate": 2.0004445286222818e-06, + "loss": 0.3253, + "step": 8759 + }, + { + "epoch": 2.2032193158953723, + "grad_norm": 0.3170720636844635, + "learning_rate": 1.999273961427538e-06, + "loss": 0.3304, + "step": 8760 + }, + { + "epoch": 2.203470824949698, + "grad_norm": 0.31641075015068054, + "learning_rate": 1.9981036512262054e-06, + "loss": 0.3233, + "step": 8761 + }, + { + "epoch": 2.203722334004024, + "grad_norm": 0.30692046880722046, + "learning_rate": 1.9969335981185173e-06, + "loss": 0.3543, + "step": 8762 + }, + { + "epoch": 2.20397384305835, + "grad_norm": 0.31193432211875916, + "learning_rate": 1.9957638022046773e-06, + "loss": 0.3305, + "step": 8763 + }, + { + "epoch": 2.204225352112676, + "grad_norm": 0.3167397677898407, + "learning_rate": 1.9945942635848745e-06, + "loss": 0.3031, + "step": 8764 + }, + { + "epoch": 2.204476861167002, + "grad_norm": 0.29315993189811707, + "learning_rate": 1.9934249823592703e-06, + "loss": 0.3366, + "step": 8765 + }, + { + "epoch": 2.204728370221328, + "grad_norm": 0.2992209196090698, + "learning_rate": 1.992255958628009e-06, + "loss": 0.3327, + "step": 8766 + }, + { + "epoch": 2.204979879275654, + "grad_norm": 0.29840826988220215, + "learning_rate": 1.9910871924912063e-06, + "loss": 0.3068, + "step": 8767 + }, + { + "epoch": 2.20523138832998, + "grad_norm": 0.318357914686203, + "learning_rate": 1.989918684048964e-06, + "loss": 0.3294, + "step": 8768 + }, + { + "epoch": 2.205482897384306, + "grad_norm": 0.28239601850509644, + "learning_rate": 1.9887504334013534e-06, + "loss": 0.3296, + "step": 8769 + }, + { + "epoch": 2.2057344064386317, + "grad_norm": 0.32247522473335266, + "learning_rate": 1.9875824406484318e-06, + "loss": 0.332, + "step": 8770 + }, + { + "epoch": 2.205985915492958, + "grad_norm": 0.2952177822589874, + "learning_rate": 1.986414705890226e-06, + "loss": 0.3287, + "step": 8771 + }, + { + "epoch": 2.2062374245472838, + "grad_norm": 0.3215632438659668, + "learning_rate": 1.9852472292267505e-06, + "loss": 0.3458, + "step": 8772 + }, + { + "epoch": 2.2064889336016096, + "grad_norm": 0.30382493138313293, + "learning_rate": 1.9840800107579872e-06, + "loss": 0.3165, + "step": 8773 + }, + { + "epoch": 2.206740442655936, + "grad_norm": 0.31115108728408813, + "learning_rate": 1.9829130505839058e-06, + "loss": 0.3306, + "step": 8774 + }, + { + "epoch": 2.2069919517102616, + "grad_norm": 0.29950037598609924, + "learning_rate": 1.9817463488044446e-06, + "loss": 0.3125, + "step": 8775 + }, + { + "epoch": 2.2072434607645874, + "grad_norm": 0.2986210882663727, + "learning_rate": 1.9805799055195264e-06, + "loss": 0.3062, + "step": 8776 + }, + { + "epoch": 2.2074949698189137, + "grad_norm": 0.3211692273616791, + "learning_rate": 1.9794137208290516e-06, + "loss": 0.3129, + "step": 8777 + }, + { + "epoch": 2.2077464788732395, + "grad_norm": 0.29951202869415283, + "learning_rate": 1.9782477948328933e-06, + "loss": 0.3337, + "step": 8778 + }, + { + "epoch": 2.2079979879275653, + "grad_norm": 0.3311370611190796, + "learning_rate": 1.9770821276309093e-06, + "loss": 0.3185, + "step": 8779 + }, + { + "epoch": 2.2082494969818915, + "grad_norm": 0.31037113070487976, + "learning_rate": 1.9759167193229277e-06, + "loss": 0.3226, + "step": 8780 + }, + { + "epoch": 2.2085010060362174, + "grad_norm": 0.315602570772171, + "learning_rate": 1.974751570008762e-06, + "loss": 0.334, + "step": 8781 + }, + { + "epoch": 2.208752515090543, + "grad_norm": 0.2930223345756531, + "learning_rate": 1.9735866797881977e-06, + "loss": 0.2912, + "step": 8782 + }, + { + "epoch": 2.2090040241448694, + "grad_norm": 0.2838136553764343, + "learning_rate": 1.972422048761002e-06, + "loss": 0.3148, + "step": 8783 + }, + { + "epoch": 2.209255533199195, + "grad_norm": 0.3084590435028076, + "learning_rate": 1.9712576770269155e-06, + "loss": 0.3255, + "step": 8784 + }, + { + "epoch": 2.209507042253521, + "grad_norm": 0.3039499819278717, + "learning_rate": 1.9700935646856634e-06, + "loss": 0.341, + "step": 8785 + }, + { + "epoch": 2.2097585513078473, + "grad_norm": 0.30746522545814514, + "learning_rate": 1.9689297118369403e-06, + "loss": 0.3122, + "step": 8786 + }, + { + "epoch": 2.210010060362173, + "grad_norm": 0.31201913952827454, + "learning_rate": 1.967766118580427e-06, + "loss": 0.2904, + "step": 8787 + }, + { + "epoch": 2.210261569416499, + "grad_norm": 0.3182002007961273, + "learning_rate": 1.9666027850157745e-06, + "loss": 0.3133, + "step": 8788 + }, + { + "epoch": 2.210513078470825, + "grad_norm": 0.3027884066104889, + "learning_rate": 1.965439711242618e-06, + "loss": 0.307, + "step": 8789 + }, + { + "epoch": 2.210764587525151, + "grad_norm": 0.3081648051738739, + "learning_rate": 1.964276897360565e-06, + "loss": 0.3231, + "step": 8790 + }, + { + "epoch": 2.2110160965794767, + "grad_norm": 0.30565327405929565, + "learning_rate": 1.9631143434692054e-06, + "loss": 0.3237, + "step": 8791 + }, + { + "epoch": 2.211267605633803, + "grad_norm": 0.2992105484008789, + "learning_rate": 1.9619520496681015e-06, + "loss": 0.3361, + "step": 8792 + }, + { + "epoch": 2.211519114688129, + "grad_norm": 0.28107115626335144, + "learning_rate": 1.960790016056801e-06, + "loss": 0.3208, + "step": 8793 + }, + { + "epoch": 2.2117706237424546, + "grad_norm": 0.31731727719306946, + "learning_rate": 1.9596282427348206e-06, + "loss": 0.3032, + "step": 8794 + }, + { + "epoch": 2.212022132796781, + "grad_norm": 0.30339574813842773, + "learning_rate": 1.958466729801662e-06, + "loss": 0.3053, + "step": 8795 + }, + { + "epoch": 2.2122736418511066, + "grad_norm": 0.29197800159454346, + "learning_rate": 1.957305477356799e-06, + "loss": 0.3513, + "step": 8796 + }, + { + "epoch": 2.2125251509054324, + "grad_norm": 0.2851543426513672, + "learning_rate": 1.956144485499688e-06, + "loss": 0.3059, + "step": 8797 + }, + { + "epoch": 2.2127766599597587, + "grad_norm": 0.2786988615989685, + "learning_rate": 1.9549837543297585e-06, + "loss": 0.3259, + "step": 8798 + }, + { + "epoch": 2.2130281690140845, + "grad_norm": 0.2844609022140503, + "learning_rate": 1.953823283946422e-06, + "loss": 0.3198, + "step": 8799 + }, + { + "epoch": 2.2132796780684103, + "grad_norm": 0.32058948278427124, + "learning_rate": 1.952663074449063e-06, + "loss": 0.3098, + "step": 8800 + }, + { + "epoch": 2.2135311871227366, + "grad_norm": 0.2962084412574768, + "learning_rate": 1.9515031259370493e-06, + "loss": 0.3343, + "step": 8801 + }, + { + "epoch": 2.2137826961770624, + "grad_norm": 0.2991352081298828, + "learning_rate": 1.95034343850972e-06, + "loss": 0.3374, + "step": 8802 + }, + { + "epoch": 2.214034205231388, + "grad_norm": 0.3038289248943329, + "learning_rate": 1.9491840122663965e-06, + "loss": 0.3438, + "step": 8803 + }, + { + "epoch": 2.2142857142857144, + "grad_norm": 0.3067372143268585, + "learning_rate": 1.948024847306378e-06, + "loss": 0.302, + "step": 8804 + }, + { + "epoch": 2.21453722334004, + "grad_norm": 0.27954307198524475, + "learning_rate": 1.946865943728936e-06, + "loss": 0.3406, + "step": 8805 + }, + { + "epoch": 2.214788732394366, + "grad_norm": 0.2837284207344055, + "learning_rate": 1.945707301633328e-06, + "loss": 0.3044, + "step": 8806 + }, + { + "epoch": 2.2150402414486923, + "grad_norm": 0.29118677973747253, + "learning_rate": 1.94454892111878e-06, + "loss": 0.3169, + "step": 8807 + }, + { + "epoch": 2.215291750503018, + "grad_norm": 0.301509827375412, + "learning_rate": 1.9433908022845046e-06, + "loss": 0.3166, + "step": 8808 + }, + { + "epoch": 2.215543259557344, + "grad_norm": 0.3074178099632263, + "learning_rate": 1.9422329452296825e-06, + "loss": 0.3188, + "step": 8809 + }, + { + "epoch": 2.21579476861167, + "grad_norm": 0.2955106496810913, + "learning_rate": 1.941075350053481e-06, + "loss": 0.3116, + "step": 8810 + }, + { + "epoch": 2.216046277665996, + "grad_norm": 0.2759988009929657, + "learning_rate": 1.9399180168550374e-06, + "loss": 0.311, + "step": 8811 + }, + { + "epoch": 2.2162977867203217, + "grad_norm": 0.3274937570095062, + "learning_rate": 1.9387609457334734e-06, + "loss": 0.3431, + "step": 8812 + }, + { + "epoch": 2.216549295774648, + "grad_norm": 0.2869795858860016, + "learning_rate": 1.937604136787882e-06, + "loss": 0.3158, + "step": 8813 + }, + { + "epoch": 2.216800804828974, + "grad_norm": 0.33137309551239014, + "learning_rate": 1.936447590117338e-06, + "loss": 0.3242, + "step": 8814 + }, + { + "epoch": 2.2170523138832996, + "grad_norm": 0.2800137996673584, + "learning_rate": 1.935291305820894e-06, + "loss": 0.3108, + "step": 8815 + }, + { + "epoch": 2.217303822937626, + "grad_norm": 0.29524585604667664, + "learning_rate": 1.9341352839975753e-06, + "loss": 0.3126, + "step": 8816 + }, + { + "epoch": 2.2175553319919517, + "grad_norm": 0.30382615327835083, + "learning_rate": 1.9329795247463913e-06, + "loss": 0.3184, + "step": 8817 + }, + { + "epoch": 2.2178068410462775, + "grad_norm": 0.3121460974216461, + "learning_rate": 1.9318240281663215e-06, + "loss": 0.3162, + "step": 8818 + }, + { + "epoch": 2.2180583501006037, + "grad_norm": 0.28468015789985657, + "learning_rate": 1.930668794356331e-06, + "loss": 0.3268, + "step": 8819 + }, + { + "epoch": 2.2183098591549295, + "grad_norm": 0.3230193257331848, + "learning_rate": 1.929513823415356e-06, + "loss": 0.309, + "step": 8820 + }, + { + "epoch": 2.2185613682092553, + "grad_norm": 0.30588406324386597, + "learning_rate": 1.928359115442314e-06, + "loss": 0.3162, + "step": 8821 + }, + { + "epoch": 2.2188128772635816, + "grad_norm": 0.2859831750392914, + "learning_rate": 1.9272046705360958e-06, + "loss": 0.3161, + "step": 8822 + }, + { + "epoch": 2.2190643863179074, + "grad_norm": 0.2836097478866577, + "learning_rate": 1.926050488795576e-06, + "loss": 0.3085, + "step": 8823 + }, + { + "epoch": 2.219315895372233, + "grad_norm": 0.29898855090141296, + "learning_rate": 1.9248965703196e-06, + "loss": 0.3115, + "step": 8824 + }, + { + "epoch": 2.2195674044265594, + "grad_norm": 0.3057454228401184, + "learning_rate": 1.9237429152069948e-06, + "loss": 0.2977, + "step": 8825 + }, + { + "epoch": 2.2198189134808852, + "grad_norm": 0.30841681361198425, + "learning_rate": 1.922589523556565e-06, + "loss": 0.3491, + "step": 8826 + }, + { + "epoch": 2.2200704225352115, + "grad_norm": 0.3082001507282257, + "learning_rate": 1.9214363954670895e-06, + "loss": 0.3165, + "step": 8827 + }, + { + "epoch": 2.2203219315895373, + "grad_norm": 0.3125540614128113, + "learning_rate": 1.9202835310373285e-06, + "loss": 0.3291, + "step": 8828 + }, + { + "epoch": 2.220573440643863, + "grad_norm": 0.3130517303943634, + "learning_rate": 1.9191309303660145e-06, + "loss": 0.3229, + "step": 8829 + }, + { + "epoch": 2.2208249496981893, + "grad_norm": 0.3235696852207184, + "learning_rate": 1.9179785935518647e-06, + "loss": 0.328, + "step": 8830 + }, + { + "epoch": 2.221076458752515, + "grad_norm": 0.30910971760749817, + "learning_rate": 1.9168265206935655e-06, + "loss": 0.3185, + "step": 8831 + }, + { + "epoch": 2.221327967806841, + "grad_norm": 0.282240092754364, + "learning_rate": 1.9156747118897878e-06, + "loss": 0.32, + "step": 8832 + }, + { + "epoch": 2.221579476861167, + "grad_norm": 0.30179286003112793, + "learning_rate": 1.914523167239174e-06, + "loss": 0.3176, + "step": 8833 + }, + { + "epoch": 2.221830985915493, + "grad_norm": 0.3033350110054016, + "learning_rate": 1.91337188684035e-06, + "loss": 0.3161, + "step": 8834 + }, + { + "epoch": 2.222082494969819, + "grad_norm": 0.30857256054878235, + "learning_rate": 1.9122208707919125e-06, + "loss": 0.3243, + "step": 8835 + }, + { + "epoch": 2.222334004024145, + "grad_norm": 0.3050358295440674, + "learning_rate": 1.9110701191924403e-06, + "loss": 0.3213, + "step": 8836 + }, + { + "epoch": 2.222585513078471, + "grad_norm": 0.3051794171333313, + "learning_rate": 1.9099196321404895e-06, + "loss": 0.3034, + "step": 8837 + }, + { + "epoch": 2.2228370221327967, + "grad_norm": 0.30538561940193176, + "learning_rate": 1.9087694097345895e-06, + "loss": 0.3213, + "step": 8838 + }, + { + "epoch": 2.223088531187123, + "grad_norm": 0.3018161952495575, + "learning_rate": 1.9076194520732523e-06, + "loss": 0.3263, + "step": 8839 + }, + { + "epoch": 2.2233400402414487, + "grad_norm": 0.3152303397655487, + "learning_rate": 1.9064697592549613e-06, + "loss": 0.323, + "step": 8840 + }, + { + "epoch": 2.2235915492957745, + "grad_norm": 0.2975950539112091, + "learning_rate": 1.9053203313781843e-06, + "loss": 0.3556, + "step": 8841 + }, + { + "epoch": 2.2238430583501008, + "grad_norm": 0.3171286880970001, + "learning_rate": 1.9041711685413588e-06, + "loss": 0.3144, + "step": 8842 + }, + { + "epoch": 2.2240945674044266, + "grad_norm": 0.2995269298553467, + "learning_rate": 1.903022270842907e-06, + "loss": 0.3153, + "step": 8843 + }, + { + "epoch": 2.2243460764587524, + "grad_norm": 0.3209341764450073, + "learning_rate": 1.9018736383812214e-06, + "loss": 0.3278, + "step": 8844 + }, + { + "epoch": 2.2245975855130786, + "grad_norm": 0.3014548718929291, + "learning_rate": 1.9007252712546786e-06, + "loss": 0.3342, + "step": 8845 + }, + { + "epoch": 2.2248490945674044, + "grad_norm": 0.3181338310241699, + "learning_rate": 1.8995771695616255e-06, + "loss": 0.3352, + "step": 8846 + }, + { + "epoch": 2.2251006036217302, + "grad_norm": 0.28974583745002747, + "learning_rate": 1.8984293334003917e-06, + "loss": 0.3346, + "step": 8847 + }, + { + "epoch": 2.2253521126760565, + "grad_norm": 0.30787721276283264, + "learning_rate": 1.897281762869284e-06, + "loss": 0.3173, + "step": 8848 + }, + { + "epoch": 2.2256036217303823, + "grad_norm": 0.29022449254989624, + "learning_rate": 1.8961344580665808e-06, + "loss": 0.3134, + "step": 8849 + }, + { + "epoch": 2.225855130784708, + "grad_norm": 0.2811683416366577, + "learning_rate": 1.8949874190905453e-06, + "loss": 0.2889, + "step": 8850 + }, + { + "epoch": 2.2261066398390343, + "grad_norm": 0.2896963059902191, + "learning_rate": 1.893840646039411e-06, + "loss": 0.3004, + "step": 8851 + }, + { + "epoch": 2.22635814889336, + "grad_norm": 0.2802124619483948, + "learning_rate": 1.8926941390113946e-06, + "loss": 0.3045, + "step": 8852 + }, + { + "epoch": 2.226609657947686, + "grad_norm": 0.293868750333786, + "learning_rate": 1.8915478981046847e-06, + "loss": 0.3134, + "step": 8853 + }, + { + "epoch": 2.226861167002012, + "grad_norm": 0.28831586241722107, + "learning_rate": 1.8904019234174526e-06, + "loss": 0.3105, + "step": 8854 + }, + { + "epoch": 2.227112676056338, + "grad_norm": 0.2980342507362366, + "learning_rate": 1.88925621504784e-06, + "loss": 0.3228, + "step": 8855 + }, + { + "epoch": 2.227364185110664, + "grad_norm": 0.30006951093673706, + "learning_rate": 1.8881107730939734e-06, + "loss": 0.3245, + "step": 8856 + }, + { + "epoch": 2.22761569416499, + "grad_norm": 0.2884455919265747, + "learning_rate": 1.8869655976539502e-06, + "loss": 0.3142, + "step": 8857 + }, + { + "epoch": 2.227867203219316, + "grad_norm": 0.32939469814300537, + "learning_rate": 1.885820688825848e-06, + "loss": 0.3118, + "step": 8858 + }, + { + "epoch": 2.2281187122736417, + "grad_norm": 0.30000847578048706, + "learning_rate": 1.8846760467077236e-06, + "loss": 0.3248, + "step": 8859 + }, + { + "epoch": 2.228370221327968, + "grad_norm": 0.29850441217422485, + "learning_rate": 1.8835316713976043e-06, + "loss": 0.3101, + "step": 8860 + }, + { + "epoch": 2.2286217303822937, + "grad_norm": 0.2936405837535858, + "learning_rate": 1.882387562993503e-06, + "loss": 0.3186, + "step": 8861 + }, + { + "epoch": 2.2288732394366195, + "grad_norm": 0.2933601438999176, + "learning_rate": 1.881243721593401e-06, + "loss": 0.3265, + "step": 8862 + }, + { + "epoch": 2.2291247484909458, + "grad_norm": 0.2859182059764862, + "learning_rate": 1.8801001472952651e-06, + "loss": 0.3317, + "step": 8863 + }, + { + "epoch": 2.2293762575452716, + "grad_norm": 0.2935737371444702, + "learning_rate": 1.878956840197032e-06, + "loss": 0.3221, + "step": 8864 + }, + { + "epoch": 2.2296277665995974, + "grad_norm": 0.31956246495246887, + "learning_rate": 1.8778138003966218e-06, + "loss": 0.3337, + "step": 8865 + }, + { + "epoch": 2.2298792756539236, + "grad_norm": 0.3022806644439697, + "learning_rate": 1.876671027991926e-06, + "loss": 0.3045, + "step": 8866 + }, + { + "epoch": 2.2301307847082494, + "grad_norm": 0.29735061526298523, + "learning_rate": 1.8755285230808185e-06, + "loss": 0.3275, + "step": 8867 + }, + { + "epoch": 2.2303822937625757, + "grad_norm": 0.3103269934654236, + "learning_rate": 1.874386285761145e-06, + "loss": 0.3247, + "step": 8868 + }, + { + "epoch": 2.2306338028169015, + "grad_norm": 0.3051782250404358, + "learning_rate": 1.8732443161307323e-06, + "loss": 0.3151, + "step": 8869 + }, + { + "epoch": 2.2308853118712273, + "grad_norm": 0.2836689352989197, + "learning_rate": 1.8721026142873843e-06, + "loss": 0.2943, + "step": 8870 + }, + { + "epoch": 2.2311368209255535, + "grad_norm": 0.31249019503593445, + "learning_rate": 1.8709611803288779e-06, + "loss": 0.3151, + "step": 8871 + }, + { + "epoch": 2.2313883299798793, + "grad_norm": 0.289578378200531, + "learning_rate": 1.8698200143529733e-06, + "loss": 0.321, + "step": 8872 + }, + { + "epoch": 2.231639839034205, + "grad_norm": 0.3105667531490326, + "learning_rate": 1.8686791164573997e-06, + "loss": 0.3235, + "step": 8873 + }, + { + "epoch": 2.2318913480885314, + "grad_norm": 0.31519603729248047, + "learning_rate": 1.8675384867398722e-06, + "loss": 0.3308, + "step": 8874 + }, + { + "epoch": 2.232142857142857, + "grad_norm": 0.30213382840156555, + "learning_rate": 1.866398125298075e-06, + "loss": 0.2995, + "step": 8875 + }, + { + "epoch": 2.232394366197183, + "grad_norm": 0.30496177077293396, + "learning_rate": 1.8652580322296766e-06, + "loss": 0.3019, + "step": 8876 + }, + { + "epoch": 2.2326458752515093, + "grad_norm": 0.30280569195747375, + "learning_rate": 1.864118207632315e-06, + "loss": 0.355, + "step": 8877 + }, + { + "epoch": 2.232897384305835, + "grad_norm": 0.3069732189178467, + "learning_rate": 1.8629786516036109e-06, + "loss": 0.3714, + "step": 8878 + }, + { + "epoch": 2.233148893360161, + "grad_norm": 0.3117799162864685, + "learning_rate": 1.861839364241162e-06, + "loss": 0.3448, + "step": 8879 + }, + { + "epoch": 2.233400402414487, + "grad_norm": 0.29441627860069275, + "learning_rate": 1.860700345642537e-06, + "loss": 0.324, + "step": 8880 + }, + { + "epoch": 2.233651911468813, + "grad_norm": 0.29074686765670776, + "learning_rate": 1.8595615959052905e-06, + "loss": 0.323, + "step": 8881 + }, + { + "epoch": 2.2339034205231387, + "grad_norm": 0.2976393699645996, + "learning_rate": 1.8584231151269444e-06, + "loss": 0.3263, + "step": 8882 + }, + { + "epoch": 2.234154929577465, + "grad_norm": 0.3090425133705139, + "learning_rate": 1.8572849034050066e-06, + "loss": 0.347, + "step": 8883 + }, + { + "epoch": 2.234406438631791, + "grad_norm": 0.2783370316028595, + "learning_rate": 1.8561469608369547e-06, + "loss": 0.3137, + "step": 8884 + }, + { + "epoch": 2.2346579476861166, + "grad_norm": 0.2869153320789337, + "learning_rate": 1.8550092875202497e-06, + "loss": 0.3286, + "step": 8885 + }, + { + "epoch": 2.234909456740443, + "grad_norm": 0.3169291317462921, + "learning_rate": 1.8538718835523217e-06, + "loss": 0.3386, + "step": 8886 + }, + { + "epoch": 2.2351609657947686, + "grad_norm": 0.2993890643119812, + "learning_rate": 1.852734749030587e-06, + "loss": 0.3145, + "step": 8887 + }, + { + "epoch": 2.2354124748490944, + "grad_norm": 0.3018781244754791, + "learning_rate": 1.8515978840524302e-06, + "loss": 0.3272, + "step": 8888 + }, + { + "epoch": 2.2356639839034207, + "grad_norm": 0.29869964718818665, + "learning_rate": 1.850461288715218e-06, + "loss": 0.3214, + "step": 8889 + }, + { + "epoch": 2.2359154929577465, + "grad_norm": 0.2824086844921112, + "learning_rate": 1.8493249631162947e-06, + "loss": 0.3178, + "step": 8890 + }, + { + "epoch": 2.2361670020120723, + "grad_norm": 0.2874877452850342, + "learning_rate": 1.8481889073529762e-06, + "loss": 0.3227, + "step": 8891 + }, + { + "epoch": 2.2364185110663986, + "grad_norm": 0.29150328040122986, + "learning_rate": 1.8470531215225617e-06, + "loss": 0.3245, + "step": 8892 + }, + { + "epoch": 2.2366700201207244, + "grad_norm": 0.28248754143714905, + "learning_rate": 1.845917605722321e-06, + "loss": 0.325, + "step": 8893 + }, + { + "epoch": 2.23692152917505, + "grad_norm": 0.28792303800582886, + "learning_rate": 1.8447823600495068e-06, + "loss": 0.3175, + "step": 8894 + }, + { + "epoch": 2.2371730382293764, + "grad_norm": 0.28080111742019653, + "learning_rate": 1.8436473846013432e-06, + "loss": 0.3382, + "step": 8895 + }, + { + "epoch": 2.237424547283702, + "grad_norm": 0.28693875670433044, + "learning_rate": 1.842512679475037e-06, + "loss": 0.3263, + "step": 8896 + }, + { + "epoch": 2.237676056338028, + "grad_norm": 0.2953447103500366, + "learning_rate": 1.8413782447677641e-06, + "loss": 0.3322, + "step": 8897 + }, + { + "epoch": 2.2379275653923543, + "grad_norm": 0.3262675404548645, + "learning_rate": 1.8402440805766863e-06, + "loss": 0.317, + "step": 8898 + }, + { + "epoch": 2.23817907444668, + "grad_norm": 0.28162655234336853, + "learning_rate": 1.8391101869989341e-06, + "loss": 0.3345, + "step": 8899 + }, + { + "epoch": 2.238430583501006, + "grad_norm": 0.27264997363090515, + "learning_rate": 1.8379765641316216e-06, + "loss": 0.2932, + "step": 8900 + }, + { + "epoch": 2.238682092555332, + "grad_norm": 0.30955466628074646, + "learning_rate": 1.8368432120718332e-06, + "loss": 0.3388, + "step": 8901 + }, + { + "epoch": 2.238933601609658, + "grad_norm": 0.29133230447769165, + "learning_rate": 1.8357101309166364e-06, + "loss": 0.3317, + "step": 8902 + }, + { + "epoch": 2.2391851106639837, + "grad_norm": 0.3130471110343933, + "learning_rate": 1.8345773207630696e-06, + "loss": 0.31, + "step": 8903 + }, + { + "epoch": 2.23943661971831, + "grad_norm": 0.30992329120635986, + "learning_rate": 1.833444781708154e-06, + "loss": 0.3204, + "step": 8904 + }, + { + "epoch": 2.239688128772636, + "grad_norm": 0.2960168719291687, + "learning_rate": 1.8323125138488818e-06, + "loss": 0.3244, + "step": 8905 + }, + { + "epoch": 2.2399396378269616, + "grad_norm": 0.31481972336769104, + "learning_rate": 1.8311805172822272e-06, + "loss": 0.3318, + "step": 8906 + }, + { + "epoch": 2.240191146881288, + "grad_norm": 0.3201163113117218, + "learning_rate": 1.8300487921051352e-06, + "loss": 0.3213, + "step": 8907 + }, + { + "epoch": 2.2404426559356136, + "grad_norm": 0.31222665309906006, + "learning_rate": 1.8289173384145354e-06, + "loss": 0.2939, + "step": 8908 + }, + { + "epoch": 2.2406941649899395, + "grad_norm": 0.3136330246925354, + "learning_rate": 1.8277861563073252e-06, + "loss": 0.3286, + "step": 8909 + }, + { + "epoch": 2.2409456740442657, + "grad_norm": 0.3031679391860962, + "learning_rate": 1.8266552458803872e-06, + "loss": 0.3048, + "step": 8910 + }, + { + "epoch": 2.2411971830985915, + "grad_norm": 0.29609373211860657, + "learning_rate": 1.8255246072305727e-06, + "loss": 0.2845, + "step": 8911 + }, + { + "epoch": 2.2414486921529173, + "grad_norm": 0.32585108280181885, + "learning_rate": 1.8243942404547183e-06, + "loss": 0.3267, + "step": 8912 + }, + { + "epoch": 2.2417002012072436, + "grad_norm": 0.30860912799835205, + "learning_rate": 1.8232641456496292e-06, + "loss": 0.3092, + "step": 8913 + }, + { + "epoch": 2.2419517102615694, + "grad_norm": 0.30172842741012573, + "learning_rate": 1.822134322912094e-06, + "loss": 0.2914, + "step": 8914 + }, + { + "epoch": 2.242203219315895, + "grad_norm": 0.30393847823143005, + "learning_rate": 1.8210047723388718e-06, + "loss": 0.3236, + "step": 8915 + }, + { + "epoch": 2.2424547283702214, + "grad_norm": 0.2871536314487457, + "learning_rate": 1.8198754940267044e-06, + "loss": 0.3247, + "step": 8916 + }, + { + "epoch": 2.2427062374245472, + "grad_norm": 0.31319135427474976, + "learning_rate": 1.818746488072305e-06, + "loss": 0.3169, + "step": 8917 + }, + { + "epoch": 2.242957746478873, + "grad_norm": 0.3419165015220642, + "learning_rate": 1.8176177545723683e-06, + "loss": 0.3388, + "step": 8918 + }, + { + "epoch": 2.2432092555331993, + "grad_norm": 0.308427631855011, + "learning_rate": 1.8164892936235602e-06, + "loss": 0.3216, + "step": 8919 + }, + { + "epoch": 2.243460764587525, + "grad_norm": 0.33541321754455566, + "learning_rate": 1.81536110532253e-06, + "loss": 0.3406, + "step": 8920 + }, + { + "epoch": 2.243712273641851, + "grad_norm": 0.3217279314994812, + "learning_rate": 1.8142331897658967e-06, + "loss": 0.3293, + "step": 8921 + }, + { + "epoch": 2.243963782696177, + "grad_norm": 0.30454570055007935, + "learning_rate": 1.8131055470502601e-06, + "loss": 0.3415, + "step": 8922 + }, + { + "epoch": 2.244215291750503, + "grad_norm": 0.30971288681030273, + "learning_rate": 1.8119781772721984e-06, + "loss": 0.3416, + "step": 8923 + }, + { + "epoch": 2.2444668008048287, + "grad_norm": 0.3124922513961792, + "learning_rate": 1.8108510805282598e-06, + "loss": 0.3392, + "step": 8924 + }, + { + "epoch": 2.244718309859155, + "grad_norm": 0.29240474104881287, + "learning_rate": 1.8097242569149765e-06, + "loss": 0.3084, + "step": 8925 + }, + { + "epoch": 2.244969818913481, + "grad_norm": 0.2906413674354553, + "learning_rate": 1.8085977065288502e-06, + "loss": 0.3244, + "step": 8926 + }, + { + "epoch": 2.245221327967807, + "grad_norm": 0.27822086215019226, + "learning_rate": 1.807471429466367e-06, + "loss": 0.3249, + "step": 8927 + }, + { + "epoch": 2.245472837022133, + "grad_norm": 0.3114204704761505, + "learning_rate": 1.8063454258239821e-06, + "loss": 0.3361, + "step": 8928 + }, + { + "epoch": 2.2457243460764587, + "grad_norm": 0.3120749890804291, + "learning_rate": 1.8052196956981333e-06, + "loss": 0.3021, + "step": 8929 + }, + { + "epoch": 2.245975855130785, + "grad_norm": 0.31990909576416016, + "learning_rate": 1.8040942391852296e-06, + "loss": 0.3188, + "step": 8930 + }, + { + "epoch": 2.2462273641851107, + "grad_norm": 0.3102615475654602, + "learning_rate": 1.8029690563816626e-06, + "loss": 0.3298, + "step": 8931 + }, + { + "epoch": 2.2464788732394365, + "grad_norm": 0.29477155208587646, + "learning_rate": 1.8018441473837934e-06, + "loss": 0.3149, + "step": 8932 + }, + { + "epoch": 2.2467303822937628, + "grad_norm": 0.2773665189743042, + "learning_rate": 1.8007195122879656e-06, + "loss": 0.3093, + "step": 8933 + }, + { + "epoch": 2.2469818913480886, + "grad_norm": 0.31135937571525574, + "learning_rate": 1.7995951511904985e-06, + "loss": 0.334, + "step": 8934 + }, + { + "epoch": 2.2472334004024144, + "grad_norm": 0.28784048557281494, + "learning_rate": 1.7984710641876829e-06, + "loss": 0.3057, + "step": 8935 + }, + { + "epoch": 2.2474849094567406, + "grad_norm": 0.3066675364971161, + "learning_rate": 1.7973472513757945e-06, + "loss": 0.3211, + "step": 8936 + }, + { + "epoch": 2.2477364185110664, + "grad_norm": 0.30136793851852417, + "learning_rate": 1.7962237128510761e-06, + "loss": 0.3269, + "step": 8937 + }, + { + "epoch": 2.2479879275653922, + "grad_norm": 0.30192577838897705, + "learning_rate": 1.7951004487097557e-06, + "loss": 0.3124, + "step": 8938 + }, + { + "epoch": 2.2482394366197185, + "grad_norm": 0.29201701283454895, + "learning_rate": 1.7939774590480301e-06, + "loss": 0.3281, + "step": 8939 + }, + { + "epoch": 2.2484909456740443, + "grad_norm": 0.29174402356147766, + "learning_rate": 1.7928547439620808e-06, + "loss": 0.3287, + "step": 8940 + }, + { + "epoch": 2.24874245472837, + "grad_norm": 0.3136996924877167, + "learning_rate": 1.7917323035480567e-06, + "loss": 0.3081, + "step": 8941 + }, + { + "epoch": 2.2489939637826963, + "grad_norm": 0.2897532880306244, + "learning_rate": 1.7906101379020912e-06, + "loss": 0.3015, + "step": 8942 + }, + { + "epoch": 2.249245472837022, + "grad_norm": 0.30037471652030945, + "learning_rate": 1.7894882471202884e-06, + "loss": 0.344, + "step": 8943 + }, + { + "epoch": 2.249496981891348, + "grad_norm": 0.32366058230400085, + "learning_rate": 1.7883666312987319e-06, + "loss": 0.328, + "step": 8944 + }, + { + "epoch": 2.249748490945674, + "grad_norm": 0.295149564743042, + "learning_rate": 1.7872452905334836e-06, + "loss": 0.3338, + "step": 8945 + }, + { + "epoch": 2.25, + "grad_norm": 0.31807345151901245, + "learning_rate": 1.7861242249205752e-06, + "loss": 0.3177, + "step": 8946 + }, + { + "epoch": 2.250251509054326, + "grad_norm": 0.31685301661491394, + "learning_rate": 1.785003434556023e-06, + "loss": 0.3012, + "step": 8947 + }, + { + "epoch": 2.250503018108652, + "grad_norm": 0.3292388617992401, + "learning_rate": 1.783882919535812e-06, + "loss": 0.3129, + "step": 8948 + }, + { + "epoch": 2.250754527162978, + "grad_norm": 0.3227696716785431, + "learning_rate": 1.7827626799559105e-06, + "loss": 0.3408, + "step": 8949 + }, + { + "epoch": 2.2510060362173037, + "grad_norm": 0.2962954044342041, + "learning_rate": 1.7816427159122569e-06, + "loss": 0.339, + "step": 8950 + }, + { + "epoch": 2.25125754527163, + "grad_norm": 0.3421285152435303, + "learning_rate": 1.7805230275007724e-06, + "loss": 0.3196, + "step": 8951 + }, + { + "epoch": 2.2515090543259557, + "grad_norm": 0.3049623668193817, + "learning_rate": 1.7794036148173477e-06, + "loss": 0.344, + "step": 8952 + }, + { + "epoch": 2.2517605633802815, + "grad_norm": 0.29739803075790405, + "learning_rate": 1.7782844779578574e-06, + "loss": 0.3286, + "step": 8953 + }, + { + "epoch": 2.2520120724346078, + "grad_norm": 0.31580233573913574, + "learning_rate": 1.7771656170181445e-06, + "loss": 0.3177, + "step": 8954 + }, + { + "epoch": 2.2522635814889336, + "grad_norm": 0.321826308965683, + "learning_rate": 1.7760470320940348e-06, + "loss": 0.3254, + "step": 8955 + }, + { + "epoch": 2.2525150905432594, + "grad_norm": 0.2998609244823456, + "learning_rate": 1.7749287232813296e-06, + "loss": 0.3193, + "step": 8956 + }, + { + "epoch": 2.2527665995975856, + "grad_norm": 0.29203975200653076, + "learning_rate": 1.7738106906758013e-06, + "loss": 0.3187, + "step": 8957 + }, + { + "epoch": 2.2530181086519114, + "grad_norm": 0.30509132146835327, + "learning_rate": 1.7726929343732059e-06, + "loss": 0.3282, + "step": 8958 + }, + { + "epoch": 2.2532696177062372, + "grad_norm": 0.29113608598709106, + "learning_rate": 1.7715754544692692e-06, + "loss": 0.3167, + "step": 8959 + }, + { + "epoch": 2.2535211267605635, + "grad_norm": 0.322614848613739, + "learning_rate": 1.7704582510596996e-06, + "loss": 0.3343, + "step": 8960 + }, + { + "epoch": 2.2537726358148893, + "grad_norm": 0.31486186385154724, + "learning_rate": 1.7693413242401753e-06, + "loss": 0.3303, + "step": 8961 + }, + { + "epoch": 2.2540241448692155, + "grad_norm": 0.30907559394836426, + "learning_rate": 1.7682246741063568e-06, + "loss": 0.3308, + "step": 8962 + }, + { + "epoch": 2.2542756539235413, + "grad_norm": 0.3086276054382324, + "learning_rate": 1.7671083007538765e-06, + "loss": 0.3373, + "step": 8963 + }, + { + "epoch": 2.254527162977867, + "grad_norm": 0.31900346279144287, + "learning_rate": 1.7659922042783463e-06, + "loss": 0.3326, + "step": 8964 + }, + { + "epoch": 2.2547786720321934, + "grad_norm": 0.29411938786506653, + "learning_rate": 1.7648763847753497e-06, + "loss": 0.3219, + "step": 8965 + }, + { + "epoch": 2.255030181086519, + "grad_norm": 0.2990444004535675, + "learning_rate": 1.7637608423404524e-06, + "loss": 0.3265, + "step": 8966 + }, + { + "epoch": 2.255281690140845, + "grad_norm": 0.3038404881954193, + "learning_rate": 1.7626455770691947e-06, + "loss": 0.2958, + "step": 8967 + }, + { + "epoch": 2.2555331991951713, + "grad_norm": 0.2940180003643036, + "learning_rate": 1.7615305890570888e-06, + "loss": 0.3148, + "step": 8968 + }, + { + "epoch": 2.255784708249497, + "grad_norm": 0.31063640117645264, + "learning_rate": 1.76041587839963e-06, + "loss": 0.3319, + "step": 8969 + }, + { + "epoch": 2.256036217303823, + "grad_norm": 0.2945641875267029, + "learning_rate": 1.759301445192283e-06, + "loss": 0.3332, + "step": 8970 + }, + { + "epoch": 2.256287726358149, + "grad_norm": 0.2893473505973816, + "learning_rate": 1.7581872895304947e-06, + "loss": 0.3137, + "step": 8971 + }, + { + "epoch": 2.256539235412475, + "grad_norm": 0.2820425033569336, + "learning_rate": 1.7570734115096827e-06, + "loss": 0.3094, + "step": 8972 + }, + { + "epoch": 2.2567907444668007, + "grad_norm": 0.27644819021224976, + "learning_rate": 1.7559598112252475e-06, + "loss": 0.326, + "step": 8973 + }, + { + "epoch": 2.257042253521127, + "grad_norm": 0.276979923248291, + "learning_rate": 1.7548464887725576e-06, + "loss": 0.3127, + "step": 8974 + }, + { + "epoch": 2.2572937625754528, + "grad_norm": 0.3153163492679596, + "learning_rate": 1.753733444246966e-06, + "loss": 0.3206, + "step": 8975 + }, + { + "epoch": 2.2575452716297786, + "grad_norm": 0.286739706993103, + "learning_rate": 1.7526206777437948e-06, + "loss": 0.3228, + "step": 8976 + }, + { + "epoch": 2.257796780684105, + "grad_norm": 0.3067798912525177, + "learning_rate": 1.7515081893583469e-06, + "loss": 0.3415, + "step": 8977 + }, + { + "epoch": 2.2580482897384306, + "grad_norm": 0.28727710247039795, + "learning_rate": 1.7503959791859016e-06, + "loss": 0.3038, + "step": 8978 + }, + { + "epoch": 2.2582997987927564, + "grad_norm": 0.29552116990089417, + "learning_rate": 1.7492840473217099e-06, + "loss": 0.3148, + "step": 8979 + }, + { + "epoch": 2.2585513078470827, + "grad_norm": 0.2867220342159271, + "learning_rate": 1.7481723938610045e-06, + "loss": 0.3154, + "step": 8980 + }, + { + "epoch": 2.2588028169014085, + "grad_norm": 0.2924220860004425, + "learning_rate": 1.747061018898989e-06, + "loss": 0.288, + "step": 8981 + }, + { + "epoch": 2.2590543259557343, + "grad_norm": 0.29958319664001465, + "learning_rate": 1.745949922530848e-06, + "loss": 0.3132, + "step": 8982 + }, + { + "epoch": 2.2593058350100605, + "grad_norm": 0.3001472055912018, + "learning_rate": 1.7448391048517378e-06, + "loss": 0.3353, + "step": 8983 + }, + { + "epoch": 2.2595573440643864, + "grad_norm": 0.32707712054252625, + "learning_rate": 1.7437285659567954e-06, + "loss": 0.3129, + "step": 8984 + }, + { + "epoch": 2.259808853118712, + "grad_norm": 0.2877408266067505, + "learning_rate": 1.7426183059411284e-06, + "loss": 0.334, + "step": 8985 + }, + { + "epoch": 2.2600603621730384, + "grad_norm": 0.3208245635032654, + "learning_rate": 1.7415083248998271e-06, + "loss": 0.3472, + "step": 8986 + }, + { + "epoch": 2.260311871227364, + "grad_norm": 0.31004029512405396, + "learning_rate": 1.7403986229279506e-06, + "loss": 0.3382, + "step": 8987 + }, + { + "epoch": 2.26056338028169, + "grad_norm": 0.3009289503097534, + "learning_rate": 1.7392892001205409e-06, + "loss": 0.3078, + "step": 8988 + }, + { + "epoch": 2.2608148893360163, + "grad_norm": 0.29705020785331726, + "learning_rate": 1.7381800565726138e-06, + "loss": 0.3133, + "step": 8989 + }, + { + "epoch": 2.261066398390342, + "grad_norm": 0.30671584606170654, + "learning_rate": 1.7370711923791567e-06, + "loss": 0.3117, + "step": 8990 + }, + { + "epoch": 2.261317907444668, + "grad_norm": 0.2912043333053589, + "learning_rate": 1.735962607635141e-06, + "loss": 0.3099, + "step": 8991 + }, + { + "epoch": 2.261569416498994, + "grad_norm": 0.3244048058986664, + "learning_rate": 1.7348543024355068e-06, + "loss": 0.329, + "step": 8992 + }, + { + "epoch": 2.26182092555332, + "grad_norm": 0.28510650992393494, + "learning_rate": 1.7337462768751766e-06, + "loss": 0.3192, + "step": 8993 + }, + { + "epoch": 2.2620724346076457, + "grad_norm": 0.3037286698818207, + "learning_rate": 1.7326385310490424e-06, + "loss": 0.3109, + "step": 8994 + }, + { + "epoch": 2.262323943661972, + "grad_norm": 0.3154160678386688, + "learning_rate": 1.731531065051979e-06, + "loss": 0.3267, + "step": 8995 + }, + { + "epoch": 2.262575452716298, + "grad_norm": 0.2968575358390808, + "learning_rate": 1.7304238789788308e-06, + "loss": 0.3137, + "step": 8996 + }, + { + "epoch": 2.2628269617706236, + "grad_norm": 0.2917513847351074, + "learning_rate": 1.7293169729244247e-06, + "loss": 0.2899, + "step": 8997 + }, + { + "epoch": 2.26307847082495, + "grad_norm": 0.29885080456733704, + "learning_rate": 1.728210346983557e-06, + "loss": 0.3232, + "step": 8998 + }, + { + "epoch": 2.2633299798792756, + "grad_norm": 0.2979438006877899, + "learning_rate": 1.7271040012510044e-06, + "loss": 0.3017, + "step": 8999 + }, + { + "epoch": 2.2635814889336014, + "grad_norm": 0.2976253628730774, + "learning_rate": 1.7259979358215213e-06, + "loss": 0.3289, + "step": 9000 + }, + { + "epoch": 2.2638329979879277, + "grad_norm": 0.2976281940937042, + "learning_rate": 1.7248921507898304e-06, + "loss": 0.3013, + "step": 9001 + }, + { + "epoch": 2.2640845070422535, + "grad_norm": 0.31368160247802734, + "learning_rate": 1.7237866462506398e-06, + "loss": 0.2938, + "step": 9002 + }, + { + "epoch": 2.2643360160965793, + "grad_norm": 0.30961135029792786, + "learning_rate": 1.7226814222986255e-06, + "loss": 0.3168, + "step": 9003 + }, + { + "epoch": 2.2645875251509056, + "grad_norm": 0.2974013686180115, + "learning_rate": 1.7215764790284462e-06, + "loss": 0.3062, + "step": 9004 + }, + { + "epoch": 2.2648390342052314, + "grad_norm": 0.2913673520088196, + "learning_rate": 1.7204718165347301e-06, + "loss": 0.3068, + "step": 9005 + }, + { + "epoch": 2.265090543259557, + "grad_norm": 0.3057686686515808, + "learning_rate": 1.7193674349120877e-06, + "loss": 0.3078, + "step": 9006 + }, + { + "epoch": 2.2653420523138834, + "grad_norm": 0.29526326060295105, + "learning_rate": 1.7182633342550991e-06, + "loss": 0.2993, + "step": 9007 + }, + { + "epoch": 2.265593561368209, + "grad_norm": 0.30530017614364624, + "learning_rate": 1.7171595146583275e-06, + "loss": 0.3163, + "step": 9008 + }, + { + "epoch": 2.265845070422535, + "grad_norm": 0.31532004475593567, + "learning_rate": 1.7160559762163042e-06, + "loss": 0.3276, + "step": 9009 + }, + { + "epoch": 2.2660965794768613, + "grad_norm": 0.30409690737724304, + "learning_rate": 1.714952719023542e-06, + "loss": 0.3166, + "step": 9010 + }, + { + "epoch": 2.266348088531187, + "grad_norm": 0.3003282845020294, + "learning_rate": 1.7138497431745304e-06, + "loss": 0.3452, + "step": 9011 + }, + { + "epoch": 2.266599597585513, + "grad_norm": 0.29305264353752136, + "learning_rate": 1.7127470487637282e-06, + "loss": 0.3278, + "step": 9012 + }, + { + "epoch": 2.266851106639839, + "grad_norm": 0.27770164608955383, + "learning_rate": 1.7116446358855781e-06, + "loss": 0.3196, + "step": 9013 + }, + { + "epoch": 2.267102615694165, + "grad_norm": 0.2922765016555786, + "learning_rate": 1.7105425046344914e-06, + "loss": 0.3444, + "step": 9014 + }, + { + "epoch": 2.2673541247484907, + "grad_norm": 0.32455912232398987, + "learning_rate": 1.709440655104862e-06, + "loss": 0.2895, + "step": 9015 + }, + { + "epoch": 2.267605633802817, + "grad_norm": 0.2932599186897278, + "learning_rate": 1.7083390873910533e-06, + "loss": 0.3285, + "step": 9016 + }, + { + "epoch": 2.267857142857143, + "grad_norm": 0.29908427596092224, + "learning_rate": 1.7072378015874107e-06, + "loss": 0.3236, + "step": 9017 + }, + { + "epoch": 2.2681086519114686, + "grad_norm": 0.31389617919921875, + "learning_rate": 1.7061367977882493e-06, + "loss": 0.3431, + "step": 9018 + }, + { + "epoch": 2.268360160965795, + "grad_norm": 0.2915651798248291, + "learning_rate": 1.7050360760878665e-06, + "loss": 0.3442, + "step": 9019 + }, + { + "epoch": 2.2686116700201207, + "grad_norm": 0.3061988949775696, + "learning_rate": 1.7039356365805286e-06, + "loss": 0.3118, + "step": 9020 + }, + { + "epoch": 2.2688631790744465, + "grad_norm": 0.3040686249732971, + "learning_rate": 1.7028354793604857e-06, + "loss": 0.3257, + "step": 9021 + }, + { + "epoch": 2.2691146881287727, + "grad_norm": 0.3051724135875702, + "learning_rate": 1.7017356045219545e-06, + "loss": 0.3085, + "step": 9022 + }, + { + "epoch": 2.2693661971830985, + "grad_norm": 0.32119080424308777, + "learning_rate": 1.700636012159137e-06, + "loss": 0.3363, + "step": 9023 + }, + { + "epoch": 2.2696177062374243, + "grad_norm": 0.28925296664237976, + "learning_rate": 1.699536702366203e-06, + "loss": 0.3113, + "step": 9024 + }, + { + "epoch": 2.2698692152917506, + "grad_norm": 0.30859729647636414, + "learning_rate": 1.6984376752373038e-06, + "loss": 0.2988, + "step": 9025 + }, + { + "epoch": 2.2701207243460764, + "grad_norm": 0.3168920576572418, + "learning_rate": 1.6973389308665617e-06, + "loss": 0.3182, + "step": 9026 + }, + { + "epoch": 2.270372233400402, + "grad_norm": 0.3211313784122467, + "learning_rate": 1.6962404693480805e-06, + "loss": 0.3277, + "step": 9027 + }, + { + "epoch": 2.2706237424547284, + "grad_norm": 0.30997171998023987, + "learning_rate": 1.6951422907759336e-06, + "loss": 0.3244, + "step": 9028 + }, + { + "epoch": 2.2708752515090542, + "grad_norm": 0.3075638711452484, + "learning_rate": 1.6940443952441755e-06, + "loss": 0.3263, + "step": 9029 + }, + { + "epoch": 2.2711267605633805, + "grad_norm": 0.31429892778396606, + "learning_rate": 1.6929467828468316e-06, + "loss": 0.3171, + "step": 9030 + }, + { + "epoch": 2.2713782696177063, + "grad_norm": 0.2928819954395294, + "learning_rate": 1.6918494536779084e-06, + "loss": 0.2967, + "step": 9031 + }, + { + "epoch": 2.271629778672032, + "grad_norm": 0.2912546694278717, + "learning_rate": 1.690752407831382e-06, + "loss": 0.3185, + "step": 9032 + }, + { + "epoch": 2.2718812877263583, + "grad_norm": 0.31666284799575806, + "learning_rate": 1.689655645401211e-06, + "loss": 0.351, + "step": 9033 + }, + { + "epoch": 2.272132796780684, + "grad_norm": 0.30313748121261597, + "learning_rate": 1.6885591664813228e-06, + "loss": 0.3183, + "step": 9034 + }, + { + "epoch": 2.27238430583501, + "grad_norm": 0.2988080084323883, + "learning_rate": 1.6874629711656275e-06, + "loss": 0.3295, + "step": 9035 + }, + { + "epoch": 2.272635814889336, + "grad_norm": 0.2955930531024933, + "learning_rate": 1.6863670595480042e-06, + "loss": 0.3343, + "step": 9036 + }, + { + "epoch": 2.272887323943662, + "grad_norm": 0.33139654994010925, + "learning_rate": 1.6852714317223118e-06, + "loss": 0.3049, + "step": 9037 + }, + { + "epoch": 2.273138832997988, + "grad_norm": 0.29346272349357605, + "learning_rate": 1.684176087782386e-06, + "loss": 0.3158, + "step": 9038 + }, + { + "epoch": 2.273390342052314, + "grad_norm": 0.3030097186565399, + "learning_rate": 1.6830810278220327e-06, + "loss": 0.3183, + "step": 9039 + }, + { + "epoch": 2.27364185110664, + "grad_norm": 0.28629571199417114, + "learning_rate": 1.681986251935041e-06, + "loss": 0.3128, + "step": 9040 + }, + { + "epoch": 2.2738933601609657, + "grad_norm": 0.31391236186027527, + "learning_rate": 1.6808917602151676e-06, + "loss": 0.3154, + "step": 9041 + }, + { + "epoch": 2.274144869215292, + "grad_norm": 0.27841538190841675, + "learning_rate": 1.6797975527561522e-06, + "loss": 0.3176, + "step": 9042 + }, + { + "epoch": 2.2743963782696177, + "grad_norm": 0.2939876914024353, + "learning_rate": 1.6787036296517034e-06, + "loss": 0.3298, + "step": 9043 + }, + { + "epoch": 2.2746478873239435, + "grad_norm": 0.3155568540096283, + "learning_rate": 1.677609990995513e-06, + "loss": 0.3457, + "step": 9044 + }, + { + "epoch": 2.2748993963782698, + "grad_norm": 0.30136945843696594, + "learning_rate": 1.6765166368812403e-06, + "loss": 0.3086, + "step": 9045 + }, + { + "epoch": 2.2751509054325956, + "grad_norm": 0.2959335744380951, + "learning_rate": 1.6754235674025271e-06, + "loss": 0.3092, + "step": 9046 + }, + { + "epoch": 2.2754024144869214, + "grad_norm": 0.2871185541152954, + "learning_rate": 1.6743307826529858e-06, + "loss": 0.3144, + "step": 9047 + }, + { + "epoch": 2.2756539235412476, + "grad_norm": 0.30033811926841736, + "learning_rate": 1.6732382827262089e-06, + "loss": 0.31, + "step": 9048 + }, + { + "epoch": 2.2759054325955734, + "grad_norm": 0.3050321936607361, + "learning_rate": 1.6721460677157591e-06, + "loss": 0.3331, + "step": 9049 + }, + { + "epoch": 2.2761569416498992, + "grad_norm": 0.33643415570259094, + "learning_rate": 1.6710541377151818e-06, + "loss": 0.3273, + "step": 9050 + }, + { + "epoch": 2.2764084507042255, + "grad_norm": 0.32750841975212097, + "learning_rate": 1.6699624928179897e-06, + "loss": 0.3269, + "step": 9051 + }, + { + "epoch": 2.2766599597585513, + "grad_norm": 0.29840290546417236, + "learning_rate": 1.6688711331176777e-06, + "loss": 0.313, + "step": 9052 + }, + { + "epoch": 2.276911468812877, + "grad_norm": 0.3018534481525421, + "learning_rate": 1.6677800587077153e-06, + "loss": 0.3041, + "step": 9053 + }, + { + "epoch": 2.2771629778672033, + "grad_norm": 0.297966867685318, + "learning_rate": 1.6666892696815428e-06, + "loss": 0.3085, + "step": 9054 + }, + { + "epoch": 2.277414486921529, + "grad_norm": 0.29202499985694885, + "learning_rate": 1.6655987661325835e-06, + "loss": 0.313, + "step": 9055 + }, + { + "epoch": 2.277665995975855, + "grad_norm": 0.27722373604774475, + "learning_rate": 1.6645085481542273e-06, + "loss": 0.3335, + "step": 9056 + }, + { + "epoch": 2.277917505030181, + "grad_norm": 0.3057959973812103, + "learning_rate": 1.6634186158398496e-06, + "loss": 0.3158, + "step": 9057 + }, + { + "epoch": 2.278169014084507, + "grad_norm": 0.29990729689598083, + "learning_rate": 1.6623289692827916e-06, + "loss": 0.3162, + "step": 9058 + }, + { + "epoch": 2.278420523138833, + "grad_norm": 0.3051862418651581, + "learning_rate": 1.6612396085763794e-06, + "loss": 0.2969, + "step": 9059 + }, + { + "epoch": 2.278672032193159, + "grad_norm": 0.30120375752449036, + "learning_rate": 1.660150533813905e-06, + "loss": 0.3275, + "step": 9060 + }, + { + "epoch": 2.278923541247485, + "grad_norm": 0.3087018132209778, + "learning_rate": 1.6590617450886453e-06, + "loss": 0.3146, + "step": 9061 + }, + { + "epoch": 2.279175050301811, + "grad_norm": 0.3511127531528473, + "learning_rate": 1.657973242493845e-06, + "loss": 0.3205, + "step": 9062 + }, + { + "epoch": 2.279426559356137, + "grad_norm": 0.29214683175086975, + "learning_rate": 1.6568850261227282e-06, + "loss": 0.3271, + "step": 9063 + }, + { + "epoch": 2.2796780684104627, + "grad_norm": 0.3279742896556854, + "learning_rate": 1.6557970960684965e-06, + "loss": 0.3187, + "step": 9064 + }, + { + "epoch": 2.279929577464789, + "grad_norm": 0.2921777367591858, + "learning_rate": 1.6547094524243207e-06, + "loss": 0.3241, + "step": 9065 + }, + { + "epoch": 2.2801810865191148, + "grad_norm": 0.2994241714477539, + "learning_rate": 1.6536220952833536e-06, + "loss": 0.3162, + "step": 9066 + }, + { + "epoch": 2.2804325955734406, + "grad_norm": 0.29721885919570923, + "learning_rate": 1.6525350247387178e-06, + "loss": 0.3329, + "step": 9067 + }, + { + "epoch": 2.280684104627767, + "grad_norm": 0.3295932412147522, + "learning_rate": 1.651448240883517e-06, + "loss": 0.312, + "step": 9068 + }, + { + "epoch": 2.2809356136820926, + "grad_norm": 0.3002535402774811, + "learning_rate": 1.6503617438108243e-06, + "loss": 0.3275, + "step": 9069 + }, + { + "epoch": 2.2811871227364184, + "grad_norm": 0.2872998118400574, + "learning_rate": 1.6492755336136945e-06, + "loss": 0.306, + "step": 9070 + }, + { + "epoch": 2.2814386317907447, + "grad_norm": 0.29634666442871094, + "learning_rate": 1.6481896103851513e-06, + "loss": 0.3245, + "step": 9071 + }, + { + "epoch": 2.2816901408450705, + "grad_norm": 0.28420212864875793, + "learning_rate": 1.647103974218201e-06, + "loss": 0.3308, + "step": 9072 + }, + { + "epoch": 2.2819416498993963, + "grad_norm": 0.3027746081352234, + "learning_rate": 1.6460186252058181e-06, + "loss": 0.3234, + "step": 9073 + }, + { + "epoch": 2.2821931589537225, + "grad_norm": 0.3052777945995331, + "learning_rate": 1.6449335634409575e-06, + "loss": 0.3114, + "step": 9074 + }, + { + "epoch": 2.2824446680080483, + "grad_norm": 0.299187570810318, + "learning_rate": 1.6438487890165494e-06, + "loss": 0.3206, + "step": 9075 + }, + { + "epoch": 2.282696177062374, + "grad_norm": 0.2914719879627228, + "learning_rate": 1.6427643020254951e-06, + "loss": 0.3431, + "step": 9076 + }, + { + "epoch": 2.2829476861167004, + "grad_norm": 0.29015323519706726, + "learning_rate": 1.641680102560677e-06, + "loss": 0.3297, + "step": 9077 + }, + { + "epoch": 2.283199195171026, + "grad_norm": 0.28477156162261963, + "learning_rate": 1.640596190714947e-06, + "loss": 0.3376, + "step": 9078 + }, + { + "epoch": 2.283450704225352, + "grad_norm": 0.2982317805290222, + "learning_rate": 1.6395125665811385e-06, + "loss": 0.3274, + "step": 9079 + }, + { + "epoch": 2.2837022132796783, + "grad_norm": 0.3212500512599945, + "learning_rate": 1.6384292302520537e-06, + "loss": 0.3396, + "step": 9080 + }, + { + "epoch": 2.283953722334004, + "grad_norm": 0.2954564392566681, + "learning_rate": 1.6373461818204773e-06, + "loss": 0.3362, + "step": 9081 + }, + { + "epoch": 2.28420523138833, + "grad_norm": 0.306144118309021, + "learning_rate": 1.6362634213791617e-06, + "loss": 0.34, + "step": 9082 + }, + { + "epoch": 2.284456740442656, + "grad_norm": 0.28161266446113586, + "learning_rate": 1.6351809490208426e-06, + "loss": 0.3189, + "step": 9083 + }, + { + "epoch": 2.284708249496982, + "grad_norm": 0.3151404857635498, + "learning_rate": 1.6340987648382233e-06, + "loss": 0.3294, + "step": 9084 + }, + { + "epoch": 2.2849597585513077, + "grad_norm": 0.28344619274139404, + "learning_rate": 1.6330168689239879e-06, + "loss": 0.317, + "step": 9085 + }, + { + "epoch": 2.285211267605634, + "grad_norm": 0.2821776568889618, + "learning_rate": 1.6319352613707956e-06, + "loss": 0.3065, + "step": 9086 + }, + { + "epoch": 2.28546277665996, + "grad_norm": 0.29392242431640625, + "learning_rate": 1.6308539422712756e-06, + "loss": 0.3496, + "step": 9087 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.2900041937828064, + "learning_rate": 1.62977291171804e-06, + "loss": 0.303, + "step": 9088 + }, + { + "epoch": 2.285965794768612, + "grad_norm": 0.30908921360969543, + "learning_rate": 1.6286921698036685e-06, + "loss": 0.3404, + "step": 9089 + }, + { + "epoch": 2.2862173038229376, + "grad_norm": 0.28658002614974976, + "learning_rate": 1.627611716620724e-06, + "loss": 0.3158, + "step": 9090 + }, + { + "epoch": 2.2864688128772634, + "grad_norm": 0.2914651930332184, + "learning_rate": 1.6265315522617365e-06, + "loss": 0.3237, + "step": 9091 + }, + { + "epoch": 2.2867203219315897, + "grad_norm": 0.3053009808063507, + "learning_rate": 1.6254516768192185e-06, + "loss": 0.3383, + "step": 9092 + }, + { + "epoch": 2.2869718309859155, + "grad_norm": 0.2992967367172241, + "learning_rate": 1.6243720903856518e-06, + "loss": 0.3272, + "step": 9093 + }, + { + "epoch": 2.2872233400402413, + "grad_norm": 0.2941541075706482, + "learning_rate": 1.6232927930534997e-06, + "loss": 0.3164, + "step": 9094 + }, + { + "epoch": 2.2874748490945676, + "grad_norm": 0.28971970081329346, + "learning_rate": 1.6222137849151932e-06, + "loss": 0.3291, + "step": 9095 + }, + { + "epoch": 2.2877263581488934, + "grad_norm": 0.27451595664024353, + "learning_rate": 1.6211350660631448e-06, + "loss": 0.2967, + "step": 9096 + }, + { + "epoch": 2.287977867203219, + "grad_norm": 0.2858002483844757, + "learning_rate": 1.6200566365897414e-06, + "loss": 0.3311, + "step": 9097 + }, + { + "epoch": 2.2882293762575454, + "grad_norm": 0.28279751539230347, + "learning_rate": 1.6189784965873407e-06, + "loss": 0.3334, + "step": 9098 + }, + { + "epoch": 2.288480885311871, + "grad_norm": 0.29283228516578674, + "learning_rate": 1.617900646148282e-06, + "loss": 0.3158, + "step": 9099 + }, + { + "epoch": 2.288732394366197, + "grad_norm": 0.29870137572288513, + "learning_rate": 1.6168230853648725e-06, + "loss": 0.3458, + "step": 9100 + }, + { + "epoch": 2.2889839034205233, + "grad_norm": 0.2867933213710785, + "learning_rate": 1.615745814329403e-06, + "loss": 0.3067, + "step": 9101 + }, + { + "epoch": 2.289235412474849, + "grad_norm": 0.27168020606040955, + "learning_rate": 1.6146688331341303e-06, + "loss": 0.3111, + "step": 9102 + }, + { + "epoch": 2.289486921529175, + "grad_norm": 0.2864697277545929, + "learning_rate": 1.6135921418712959e-06, + "loss": 0.3208, + "step": 9103 + }, + { + "epoch": 2.289738430583501, + "grad_norm": 0.2777710556983948, + "learning_rate": 1.6125157406331065e-06, + "loss": 0.3309, + "step": 9104 + }, + { + "epoch": 2.289989939637827, + "grad_norm": 0.27852359414100647, + "learning_rate": 1.6114396295117547e-06, + "loss": 0.3057, + "step": 9105 + }, + { + "epoch": 2.2902414486921527, + "grad_norm": 0.30460140109062195, + "learning_rate": 1.6103638085993972e-06, + "loss": 0.3214, + "step": 9106 + }, + { + "epoch": 2.290492957746479, + "grad_norm": 0.3020101487636566, + "learning_rate": 1.6092882779881746e-06, + "loss": 0.3219, + "step": 9107 + }, + { + "epoch": 2.290744466800805, + "grad_norm": 0.29373854398727417, + "learning_rate": 1.6082130377702004e-06, + "loss": 0.3454, + "step": 9108 + }, + { + "epoch": 2.2909959758551306, + "grad_norm": 0.27670902013778687, + "learning_rate": 1.6071380880375586e-06, + "loss": 0.3154, + "step": 9109 + }, + { + "epoch": 2.291247484909457, + "grad_norm": 0.3227442502975464, + "learning_rate": 1.6060634288823158e-06, + "loss": 0.3082, + "step": 9110 + }, + { + "epoch": 2.2914989939637826, + "grad_norm": 0.3054077923297882, + "learning_rate": 1.6049890603965063e-06, + "loss": 0.3121, + "step": 9111 + }, + { + "epoch": 2.2917505030181085, + "grad_norm": 0.2847161293029785, + "learning_rate": 1.6039149826721462e-06, + "loss": 0.2923, + "step": 9112 + }, + { + "epoch": 2.2920020120724347, + "grad_norm": 0.3265150487422943, + "learning_rate": 1.6028411958012203e-06, + "loss": 0.3319, + "step": 9113 + }, + { + "epoch": 2.2922535211267605, + "grad_norm": 0.2997869551181793, + "learning_rate": 1.6017676998756947e-06, + "loss": 0.3079, + "step": 9114 + }, + { + "epoch": 2.2925050301810863, + "grad_norm": 0.3061138391494751, + "learning_rate": 1.6006944949875052e-06, + "loss": 0.3217, + "step": 9115 + }, + { + "epoch": 2.2927565392354126, + "grad_norm": 0.2861214876174927, + "learning_rate": 1.5996215812285682e-06, + "loss": 0.3015, + "step": 9116 + }, + { + "epoch": 2.2930080482897384, + "grad_norm": 0.31791844964027405, + "learning_rate": 1.5985489586907676e-06, + "loss": 0.3303, + "step": 9117 + }, + { + "epoch": 2.293259557344064, + "grad_norm": 0.2969111502170563, + "learning_rate": 1.5974766274659697e-06, + "loss": 0.3213, + "step": 9118 + }, + { + "epoch": 2.2935110663983904, + "grad_norm": 0.2913528084754944, + "learning_rate": 1.5964045876460143e-06, + "loss": 0.3531, + "step": 9119 + }, + { + "epoch": 2.2937625754527162, + "grad_norm": 0.30313563346862793, + "learning_rate": 1.5953328393227113e-06, + "loss": 0.3048, + "step": 9120 + }, + { + "epoch": 2.294014084507042, + "grad_norm": 0.3050040304660797, + "learning_rate": 1.5942613825878527e-06, + "loss": 0.3299, + "step": 9121 + }, + { + "epoch": 2.2942655935613683, + "grad_norm": 0.2819101810455322, + "learning_rate": 1.5931902175331986e-06, + "loss": 0.3041, + "step": 9122 + }, + { + "epoch": 2.294517102615694, + "grad_norm": 0.28545475006103516, + "learning_rate": 1.5921193442504918e-06, + "loss": 0.3063, + "step": 9123 + }, + { + "epoch": 2.29476861167002, + "grad_norm": 0.33305785059928894, + "learning_rate": 1.5910487628314414e-06, + "loss": 0.3053, + "step": 9124 + }, + { + "epoch": 2.295020120724346, + "grad_norm": 0.30768680572509766, + "learning_rate": 1.5899784733677397e-06, + "loss": 0.3215, + "step": 9125 + }, + { + "epoch": 2.295271629778672, + "grad_norm": 0.28684544563293457, + "learning_rate": 1.588908475951047e-06, + "loss": 0.3256, + "step": 9126 + }, + { + "epoch": 2.2955231388329977, + "grad_norm": 0.2967512011528015, + "learning_rate": 1.5878387706730053e-06, + "loss": 0.3377, + "step": 9127 + }, + { + "epoch": 2.295774647887324, + "grad_norm": 0.29876625537872314, + "learning_rate": 1.5867693576252252e-06, + "loss": 0.3176, + "step": 9128 + }, + { + "epoch": 2.29602615694165, + "grad_norm": 0.3200327754020691, + "learning_rate": 1.5857002368992963e-06, + "loss": 0.3297, + "step": 9129 + }, + { + "epoch": 2.296277665995976, + "grad_norm": 0.31204643845558167, + "learning_rate": 1.5846314085867836e-06, + "loss": 0.3412, + "step": 9130 + }, + { + "epoch": 2.296529175050302, + "grad_norm": 0.29177501797676086, + "learning_rate": 1.5835628727792228e-06, + "loss": 0.3191, + "step": 9131 + }, + { + "epoch": 2.2967806841046277, + "grad_norm": 0.2729922831058502, + "learning_rate": 1.582494629568131e-06, + "loss": 0.3133, + "step": 9132 + }, + { + "epoch": 2.297032193158954, + "grad_norm": 0.3033842146396637, + "learning_rate": 1.5814266790449922e-06, + "loss": 0.313, + "step": 9133 + }, + { + "epoch": 2.2972837022132797, + "grad_norm": 0.29062163829803467, + "learning_rate": 1.5803590213012738e-06, + "loss": 0.2842, + "step": 9134 + }, + { + "epoch": 2.2975352112676055, + "grad_norm": 0.2890336215496063, + "learning_rate": 1.57929165642841e-06, + "loss": 0.3044, + "step": 9135 + }, + { + "epoch": 2.2977867203219318, + "grad_norm": 0.2879352569580078, + "learning_rate": 1.578224584517818e-06, + "loss": 0.3019, + "step": 9136 + }, + { + "epoch": 2.2980382293762576, + "grad_norm": 0.3032044768333435, + "learning_rate": 1.5771578056608816e-06, + "loss": 0.3173, + "step": 9137 + }, + { + "epoch": 2.2982897384305834, + "grad_norm": 0.3006070852279663, + "learning_rate": 1.5760913199489674e-06, + "loss": 0.3308, + "step": 9138 + }, + { + "epoch": 2.2985412474849096, + "grad_norm": 0.3287603259086609, + "learning_rate": 1.5750251274734107e-06, + "loss": 0.3182, + "step": 9139 + }, + { + "epoch": 2.2987927565392354, + "grad_norm": 0.2758319675922394, + "learning_rate": 1.5739592283255251e-06, + "loss": 0.3166, + "step": 9140 + }, + { + "epoch": 2.2990442655935612, + "grad_norm": 0.29488644003868103, + "learning_rate": 1.5728936225966002e-06, + "loss": 0.293, + "step": 9141 + }, + { + "epoch": 2.2992957746478875, + "grad_norm": 0.31970569491386414, + "learning_rate": 1.5718283103778941e-06, + "loss": 0.313, + "step": 9142 + }, + { + "epoch": 2.2995472837022133, + "grad_norm": 0.29525116086006165, + "learning_rate": 1.5707632917606491e-06, + "loss": 0.3212, + "step": 9143 + }, + { + "epoch": 2.299798792756539, + "grad_norm": 0.29070043563842773, + "learning_rate": 1.569698566836073e-06, + "loss": 0.3151, + "step": 9144 + }, + { + "epoch": 2.3000503018108653, + "grad_norm": 0.3086007237434387, + "learning_rate": 1.5686341356953566e-06, + "loss": 0.3343, + "step": 9145 + }, + { + "epoch": 2.300301810865191, + "grad_norm": 0.3106892704963684, + "learning_rate": 1.5675699984296584e-06, + "loss": 0.3108, + "step": 9146 + }, + { + "epoch": 2.300553319919517, + "grad_norm": 0.2823115289211273, + "learning_rate": 1.5665061551301175e-06, + "loss": 0.3255, + "step": 9147 + }, + { + "epoch": 2.300804828973843, + "grad_norm": 0.3261948227882385, + "learning_rate": 1.5654426058878436e-06, + "loss": 0.3265, + "step": 9148 + }, + { + "epoch": 2.301056338028169, + "grad_norm": 0.30333390831947327, + "learning_rate": 1.5643793507939253e-06, + "loss": 0.3367, + "step": 9149 + }, + { + "epoch": 2.301307847082495, + "grad_norm": 0.28751394152641296, + "learning_rate": 1.5633163899394211e-06, + "loss": 0.3166, + "step": 9150 + }, + { + "epoch": 2.301559356136821, + "grad_norm": 0.2967854142189026, + "learning_rate": 1.5622537234153695e-06, + "loss": 0.3093, + "step": 9151 + }, + { + "epoch": 2.301810865191147, + "grad_norm": 0.2856552004814148, + "learning_rate": 1.561191351312779e-06, + "loss": 0.3194, + "step": 9152 + }, + { + "epoch": 2.3020623742454727, + "grad_norm": 0.30799177289009094, + "learning_rate": 1.560129273722637e-06, + "loss": 0.313, + "step": 9153 + }, + { + "epoch": 2.302313883299799, + "grad_norm": 0.2922762334346771, + "learning_rate": 1.559067490735902e-06, + "loss": 0.3232, + "step": 9154 + }, + { + "epoch": 2.3025653923541247, + "grad_norm": 0.28383079171180725, + "learning_rate": 1.5580060024435112e-06, + "loss": 0.3039, + "step": 9155 + }, + { + "epoch": 2.3028169014084505, + "grad_norm": 0.2966978847980499, + "learning_rate": 1.556944808936372e-06, + "loss": 0.3438, + "step": 9156 + }, + { + "epoch": 2.3030684104627768, + "grad_norm": 0.3017922639846802, + "learning_rate": 1.5558839103053713e-06, + "loss": 0.3177, + "step": 9157 + }, + { + "epoch": 2.3033199195171026, + "grad_norm": 0.2967662513256073, + "learning_rate": 1.554823306641366e-06, + "loss": 0.2977, + "step": 9158 + }, + { + "epoch": 2.3035714285714284, + "grad_norm": 0.2964570224285126, + "learning_rate": 1.5537629980351932e-06, + "loss": 0.3134, + "step": 9159 + }, + { + "epoch": 2.3038229376257546, + "grad_norm": 0.307412713766098, + "learning_rate": 1.5527029845776587e-06, + "loss": 0.3429, + "step": 9160 + }, + { + "epoch": 2.3040744466800804, + "grad_norm": 0.32412174344062805, + "learning_rate": 1.5516432663595483e-06, + "loss": 0.3201, + "step": 9161 + }, + { + "epoch": 2.3043259557344067, + "grad_norm": 0.3185696601867676, + "learning_rate": 1.550583843471618e-06, + "loss": 0.3388, + "step": 9162 + }, + { + "epoch": 2.3045774647887325, + "grad_norm": 0.2888423502445221, + "learning_rate": 1.5495247160046039e-06, + "loss": 0.3304, + "step": 9163 + }, + { + "epoch": 2.3048289738430583, + "grad_norm": 0.2889467477798462, + "learning_rate": 1.54846588404921e-06, + "loss": 0.3088, + "step": 9164 + }, + { + "epoch": 2.3050804828973845, + "grad_norm": 0.3054656982421875, + "learning_rate": 1.5474073476961216e-06, + "loss": 0.3256, + "step": 9165 + }, + { + "epoch": 2.3053319919517103, + "grad_norm": 0.30939677357673645, + "learning_rate": 1.5463491070359937e-06, + "loss": 0.3316, + "step": 9166 + }, + { + "epoch": 2.305583501006036, + "grad_norm": 0.31722491979599, + "learning_rate": 1.5452911621594596e-06, + "loss": 0.314, + "step": 9167 + }, + { + "epoch": 2.3058350100603624, + "grad_norm": 0.31227657198905945, + "learning_rate": 1.544233513157124e-06, + "loss": 0.3222, + "step": 9168 + }, + { + "epoch": 2.306086519114688, + "grad_norm": 0.2999171018600464, + "learning_rate": 1.54317616011957e-06, + "loss": 0.3119, + "step": 9169 + }, + { + "epoch": 2.306338028169014, + "grad_norm": 0.30300676822662354, + "learning_rate": 1.54211910313735e-06, + "loss": 0.2988, + "step": 9170 + }, + { + "epoch": 2.3065895372233403, + "grad_norm": 0.31904926896095276, + "learning_rate": 1.541062342300997e-06, + "loss": 0.3285, + "step": 9171 + }, + { + "epoch": 2.306841046277666, + "grad_norm": 0.29842498898506165, + "learning_rate": 1.5400058777010169e-06, + "loss": 0.3163, + "step": 9172 + }, + { + "epoch": 2.307092555331992, + "grad_norm": 0.3134821951389313, + "learning_rate": 1.5389497094278861e-06, + "loss": 0.3168, + "step": 9173 + }, + { + "epoch": 2.307344064386318, + "grad_norm": 0.3033539950847626, + "learning_rate": 1.537893837572062e-06, + "loss": 0.2992, + "step": 9174 + }, + { + "epoch": 2.307595573440644, + "grad_norm": 0.2625449001789093, + "learning_rate": 1.5368382622239703e-06, + "loss": 0.3167, + "step": 9175 + }, + { + "epoch": 2.3078470824949697, + "grad_norm": 0.3125070035457611, + "learning_rate": 1.5357829834740174e-06, + "loss": 0.3273, + "step": 9176 + }, + { + "epoch": 2.308098591549296, + "grad_norm": 0.3032437264919281, + "learning_rate": 1.5347280014125782e-06, + "loss": 0.3069, + "step": 9177 + }, + { + "epoch": 2.308350100603622, + "grad_norm": 0.3202145993709564, + "learning_rate": 1.5336733161300088e-06, + "loss": 0.3211, + "step": 9178 + }, + { + "epoch": 2.3086016096579476, + "grad_norm": 0.30158209800720215, + "learning_rate": 1.5326189277166325e-06, + "loss": 0.3295, + "step": 9179 + }, + { + "epoch": 2.308853118712274, + "grad_norm": 0.3015068769454956, + "learning_rate": 1.5315648362627556e-06, + "loss": 0.3257, + "step": 9180 + }, + { + "epoch": 2.3091046277665996, + "grad_norm": 0.30036771297454834, + "learning_rate": 1.53051104185865e-06, + "loss": 0.3132, + "step": 9181 + }, + { + "epoch": 2.3093561368209254, + "grad_norm": 0.2994939386844635, + "learning_rate": 1.5294575445945687e-06, + "loss": 0.3275, + "step": 9182 + }, + { + "epoch": 2.3096076458752517, + "grad_norm": 0.3159419298171997, + "learning_rate": 1.5284043445607383e-06, + "loss": 0.2978, + "step": 9183 + }, + { + "epoch": 2.3098591549295775, + "grad_norm": 0.3039325475692749, + "learning_rate": 1.5273514418473566e-06, + "loss": 0.3127, + "step": 9184 + }, + { + "epoch": 2.3101106639839033, + "grad_norm": 0.32907170057296753, + "learning_rate": 1.5262988365446002e-06, + "loss": 0.3283, + "step": 9185 + }, + { + "epoch": 2.3103621730382295, + "grad_norm": 0.30235663056373596, + "learning_rate": 1.5252465287426154e-06, + "loss": 0.3055, + "step": 9186 + }, + { + "epoch": 2.3106136820925554, + "grad_norm": 0.30011188983917236, + "learning_rate": 1.5241945185315292e-06, + "loss": 0.3435, + "step": 9187 + }, + { + "epoch": 2.310865191146881, + "grad_norm": 0.3140828311443329, + "learning_rate": 1.5231428060014363e-06, + "loss": 0.3229, + "step": 9188 + }, + { + "epoch": 2.3111167002012074, + "grad_norm": 0.30959945917129517, + "learning_rate": 1.5220913912424128e-06, + "loss": 0.3224, + "step": 9189 + }, + { + "epoch": 2.311368209255533, + "grad_norm": 0.2961542010307312, + "learning_rate": 1.5210402743445018e-06, + "loss": 0.343, + "step": 9190 + }, + { + "epoch": 2.311619718309859, + "grad_norm": 0.31126269698143005, + "learning_rate": 1.519989455397729e-06, + "loss": 0.2833, + "step": 9191 + }, + { + "epoch": 2.3118712273641853, + "grad_norm": 0.2786300480365753, + "learning_rate": 1.518938934492087e-06, + "loss": 0.3245, + "step": 9192 + }, + { + "epoch": 2.312122736418511, + "grad_norm": 0.30616483092308044, + "learning_rate": 1.5178887117175472e-06, + "loss": 0.3253, + "step": 9193 + }, + { + "epoch": 2.312374245472837, + "grad_norm": 0.27918025851249695, + "learning_rate": 1.5168387871640572e-06, + "loss": 0.325, + "step": 9194 + }, + { + "epoch": 2.312625754527163, + "grad_norm": 0.2911374866962433, + "learning_rate": 1.5157891609215331e-06, + "loss": 0.3209, + "step": 9195 + }, + { + "epoch": 2.312877263581489, + "grad_norm": 0.28203797340393066, + "learning_rate": 1.5147398330798712e-06, + "loss": 0.3063, + "step": 9196 + }, + { + "epoch": 2.3131287726358147, + "grad_norm": 0.29836559295654297, + "learning_rate": 1.5136908037289377e-06, + "loss": 0.3154, + "step": 9197 + }, + { + "epoch": 2.313380281690141, + "grad_norm": 0.3110038936138153, + "learning_rate": 1.5126420729585784e-06, + "loss": 0.3066, + "step": 9198 + }, + { + "epoch": 2.313631790744467, + "grad_norm": 0.3166778087615967, + "learning_rate": 1.5115936408586069e-06, + "loss": 0.3624, + "step": 9199 + }, + { + "epoch": 2.3138832997987926, + "grad_norm": 0.2844400107860565, + "learning_rate": 1.510545507518818e-06, + "loss": 0.3125, + "step": 9200 + }, + { + "epoch": 2.314134808853119, + "grad_norm": 0.3151662051677704, + "learning_rate": 1.5094976730289751e-06, + "loss": 0.3391, + "step": 9201 + }, + { + "epoch": 2.3143863179074446, + "grad_norm": 0.30355745553970337, + "learning_rate": 1.5084501374788213e-06, + "loss": 0.3132, + "step": 9202 + }, + { + "epoch": 2.3146378269617705, + "grad_norm": 0.3106285631656647, + "learning_rate": 1.5074029009580687e-06, + "loss": 0.3235, + "step": 9203 + }, + { + "epoch": 2.3148893360160967, + "grad_norm": 0.31202206015586853, + "learning_rate": 1.5063559635564078e-06, + "loss": 0.362, + "step": 9204 + }, + { + "epoch": 2.3151408450704225, + "grad_norm": 0.2972472012042999, + "learning_rate": 1.5053093253635043e-06, + "loss": 0.3213, + "step": 9205 + }, + { + "epoch": 2.3153923541247483, + "grad_norm": 0.29215607047080994, + "learning_rate": 1.5042629864689927e-06, + "loss": 0.3171, + "step": 9206 + }, + { + "epoch": 2.3156438631790746, + "grad_norm": 0.3004598617553711, + "learning_rate": 1.5032169469624892e-06, + "loss": 0.3167, + "step": 9207 + }, + { + "epoch": 2.3158953722334004, + "grad_norm": 0.2856040894985199, + "learning_rate": 1.502171206933576e-06, + "loss": 0.3203, + "step": 9208 + }, + { + "epoch": 2.316146881287726, + "grad_norm": 0.2829803228378296, + "learning_rate": 1.5011257664718187e-06, + "loss": 0.315, + "step": 9209 + }, + { + "epoch": 2.3163983903420524, + "grad_norm": 0.28959813714027405, + "learning_rate": 1.500080625666749e-06, + "loss": 0.3013, + "step": 9210 + }, + { + "epoch": 2.316649899396378, + "grad_norm": 0.3246660530567169, + "learning_rate": 1.4990357846078795e-06, + "loss": 0.3109, + "step": 9211 + }, + { + "epoch": 2.316901408450704, + "grad_norm": 0.30734169483184814, + "learning_rate": 1.4979912433846917e-06, + "loss": 0.3233, + "step": 9212 + }, + { + "epoch": 2.3171529175050303, + "grad_norm": 0.3003253638744354, + "learning_rate": 1.4969470020866467e-06, + "loss": 0.3286, + "step": 9213 + }, + { + "epoch": 2.317404426559356, + "grad_norm": 0.31494301557540894, + "learning_rate": 1.4959030608031749e-06, + "loss": 0.3244, + "step": 9214 + }, + { + "epoch": 2.317655935613682, + "grad_norm": 0.3177368640899658, + "learning_rate": 1.4948594196236838e-06, + "loss": 0.2982, + "step": 9215 + }, + { + "epoch": 2.317907444668008, + "grad_norm": 0.2995319664478302, + "learning_rate": 1.4938160786375571e-06, + "loss": 0.3289, + "step": 9216 + }, + { + "epoch": 2.318158953722334, + "grad_norm": 0.32530516386032104, + "learning_rate": 1.4927730379341476e-06, + "loss": 0.3391, + "step": 9217 + }, + { + "epoch": 2.3184104627766597, + "grad_norm": 0.2980848252773285, + "learning_rate": 1.491730297602787e-06, + "loss": 0.3546, + "step": 9218 + }, + { + "epoch": 2.318661971830986, + "grad_norm": 0.2969314455986023, + "learning_rate": 1.4906878577327776e-06, + "loss": 0.3178, + "step": 9219 + }, + { + "epoch": 2.318913480885312, + "grad_norm": 0.2775447368621826, + "learning_rate": 1.4896457184134005e-06, + "loss": 0.3162, + "step": 9220 + }, + { + "epoch": 2.3191649899396376, + "grad_norm": 0.2855249047279358, + "learning_rate": 1.4886038797339058e-06, + "loss": 0.3313, + "step": 9221 + }, + { + "epoch": 2.319416498993964, + "grad_norm": 0.3028770089149475, + "learning_rate": 1.4875623417835227e-06, + "loss": 0.3522, + "step": 9222 + }, + { + "epoch": 2.3196680080482897, + "grad_norm": 0.2914048731327057, + "learning_rate": 1.4865211046514494e-06, + "loss": 0.2839, + "step": 9223 + }, + { + "epoch": 2.3199195171026155, + "grad_norm": 0.3016049563884735, + "learning_rate": 1.4854801684268655e-06, + "loss": 0.3268, + "step": 9224 + }, + { + "epoch": 2.3201710261569417, + "grad_norm": 0.3333890438079834, + "learning_rate": 1.4844395331989164e-06, + "loss": 0.324, + "step": 9225 + }, + { + "epoch": 2.3204225352112675, + "grad_norm": 0.2845839858055115, + "learning_rate": 1.4833991990567281e-06, + "loss": 0.3258, + "step": 9226 + }, + { + "epoch": 2.3206740442655933, + "grad_norm": 0.30603617429733276, + "learning_rate": 1.4823591660894e-06, + "loss": 0.3091, + "step": 9227 + }, + { + "epoch": 2.3209255533199196, + "grad_norm": 0.2999797761440277, + "learning_rate": 1.4813194343860015e-06, + "loss": 0.3111, + "step": 9228 + }, + { + "epoch": 2.3211770623742454, + "grad_norm": 0.29560771584510803, + "learning_rate": 1.4802800040355825e-06, + "loss": 0.3128, + "step": 9229 + }, + { + "epoch": 2.3214285714285716, + "grad_norm": 0.28773796558380127, + "learning_rate": 1.4792408751271603e-06, + "loss": 0.3219, + "step": 9230 + }, + { + "epoch": 2.3216800804828974, + "grad_norm": 0.3093893826007843, + "learning_rate": 1.4782020477497328e-06, + "loss": 0.3332, + "step": 9231 + }, + { + "epoch": 2.3219315895372232, + "grad_norm": 0.3124389350414276, + "learning_rate": 1.4771635219922658e-06, + "loss": 0.3106, + "step": 9232 + }, + { + "epoch": 2.3221830985915495, + "grad_norm": 0.29394763708114624, + "learning_rate": 1.4761252979437062e-06, + "loss": 0.3145, + "step": 9233 + }, + { + "epoch": 2.3224346076458753, + "grad_norm": 0.29244881868362427, + "learning_rate": 1.475087375692968e-06, + "loss": 0.3295, + "step": 9234 + }, + { + "epoch": 2.322686116700201, + "grad_norm": 0.3140466511249542, + "learning_rate": 1.4740497553289456e-06, + "loss": 0.3344, + "step": 9235 + }, + { + "epoch": 2.3229376257545273, + "grad_norm": 0.2974216341972351, + "learning_rate": 1.473012436940502e-06, + "loss": 0.3238, + "step": 9236 + }, + { + "epoch": 2.323189134808853, + "grad_norm": 0.29725712537765503, + "learning_rate": 1.4719754206164782e-06, + "loss": 0.3307, + "step": 9237 + }, + { + "epoch": 2.323440643863179, + "grad_norm": 0.3167237937450409, + "learning_rate": 1.4709387064456899e-06, + "loss": 0.3255, + "step": 9238 + }, + { + "epoch": 2.323692152917505, + "grad_norm": 0.28957289457321167, + "learning_rate": 1.4699022945169221e-06, + "loss": 0.3095, + "step": 9239 + }, + { + "epoch": 2.323943661971831, + "grad_norm": 0.2918396294116974, + "learning_rate": 1.4688661849189407e-06, + "loss": 0.34, + "step": 9240 + }, + { + "epoch": 2.324195171026157, + "grad_norm": 0.29980701208114624, + "learning_rate": 1.4678303777404778e-06, + "loss": 0.3276, + "step": 9241 + }, + { + "epoch": 2.324446680080483, + "grad_norm": 0.2971269488334656, + "learning_rate": 1.4667948730702474e-06, + "loss": 0.3113, + "step": 9242 + }, + { + "epoch": 2.324698189134809, + "grad_norm": 0.29910194873809814, + "learning_rate": 1.4657596709969313e-06, + "loss": 0.3286, + "step": 9243 + }, + { + "epoch": 2.3249496981891347, + "grad_norm": 0.2877163887023926, + "learning_rate": 1.46472477160919e-06, + "loss": 0.3216, + "step": 9244 + }, + { + "epoch": 2.325201207243461, + "grad_norm": 0.308214396238327, + "learning_rate": 1.4636901749956544e-06, + "loss": 0.3337, + "step": 9245 + }, + { + "epoch": 2.3254527162977867, + "grad_norm": 0.3051459491252899, + "learning_rate": 1.4626558812449337e-06, + "loss": 0.3487, + "step": 9246 + }, + { + "epoch": 2.3257042253521125, + "grad_norm": 0.29465314745903015, + "learning_rate": 1.461621890445606e-06, + "loss": 0.3044, + "step": 9247 + }, + { + "epoch": 2.3259557344064388, + "grad_norm": 0.3071502447128296, + "learning_rate": 1.4605882026862267e-06, + "loss": 0.3422, + "step": 9248 + }, + { + "epoch": 2.3262072434607646, + "grad_norm": 0.3024512529373169, + "learning_rate": 1.4595548180553275e-06, + "loss": 0.3562, + "step": 9249 + }, + { + "epoch": 2.3264587525150904, + "grad_norm": 0.2800012528896332, + "learning_rate": 1.4585217366414072e-06, + "loss": 0.3417, + "step": 9250 + }, + { + "epoch": 2.3267102615694166, + "grad_norm": 0.30198147892951965, + "learning_rate": 1.4574889585329466e-06, + "loss": 0.3352, + "step": 9251 + }, + { + "epoch": 2.3269617706237424, + "grad_norm": 0.2976208031177521, + "learning_rate": 1.456456483818393e-06, + "loss": 0.3333, + "step": 9252 + }, + { + "epoch": 2.3272132796780682, + "grad_norm": 0.30566921830177307, + "learning_rate": 1.455424312586175e-06, + "loss": 0.3108, + "step": 9253 + }, + { + "epoch": 2.3274647887323945, + "grad_norm": 0.2987402379512787, + "learning_rate": 1.4543924449246882e-06, + "loss": 0.3167, + "step": 9254 + }, + { + "epoch": 2.3277162977867203, + "grad_norm": 0.3012560307979584, + "learning_rate": 1.4533608809223087e-06, + "loss": 0.3464, + "step": 9255 + }, + { + "epoch": 2.327967806841046, + "grad_norm": 0.3095744848251343, + "learning_rate": 1.452329620667381e-06, + "loss": 0.3325, + "step": 9256 + }, + { + "epoch": 2.3282193158953723, + "grad_norm": 0.29700353741645813, + "learning_rate": 1.4512986642482279e-06, + "loss": 0.3131, + "step": 9257 + }, + { + "epoch": 2.328470824949698, + "grad_norm": 0.2864930033683777, + "learning_rate": 1.4502680117531425e-06, + "loss": 0.334, + "step": 9258 + }, + { + "epoch": 2.328722334004024, + "grad_norm": 0.3204570710659027, + "learning_rate": 1.4492376632703947e-06, + "loss": 0.345, + "step": 9259 + }, + { + "epoch": 2.32897384305835, + "grad_norm": 0.2854207158088684, + "learning_rate": 1.4482076188882293e-06, + "loss": 0.3135, + "step": 9260 + }, + { + "epoch": 2.329225352112676, + "grad_norm": 0.2826644480228424, + "learning_rate": 1.4471778786948598e-06, + "loss": 0.3473, + "step": 9261 + }, + { + "epoch": 2.3294768611670023, + "grad_norm": 0.29891934990882874, + "learning_rate": 1.44614844277848e-06, + "loss": 0.3143, + "step": 9262 + }, + { + "epoch": 2.329728370221328, + "grad_norm": 0.3028249144554138, + "learning_rate": 1.4451193112272515e-06, + "loss": 0.3143, + "step": 9263 + }, + { + "epoch": 2.329979879275654, + "grad_norm": 0.30594244599342346, + "learning_rate": 1.4440904841293168e-06, + "loss": 0.3291, + "step": 9264 + }, + { + "epoch": 2.33023138832998, + "grad_norm": 0.3240032196044922, + "learning_rate": 1.4430619615727842e-06, + "loss": 0.3143, + "step": 9265 + }, + { + "epoch": 2.330482897384306, + "grad_norm": 0.2798107862472534, + "learning_rate": 1.442033743645745e-06, + "loss": 0.3249, + "step": 9266 + }, + { + "epoch": 2.3307344064386317, + "grad_norm": 0.3072926104068756, + "learning_rate": 1.4410058304362546e-06, + "loss": 0.347, + "step": 9267 + }, + { + "epoch": 2.330985915492958, + "grad_norm": 0.3058062493801117, + "learning_rate": 1.4399782220323515e-06, + "loss": 0.3252, + "step": 9268 + }, + { + "epoch": 2.3312374245472838, + "grad_norm": 0.2971736490726471, + "learning_rate": 1.4389509185220412e-06, + "loss": 0.3435, + "step": 9269 + }, + { + "epoch": 2.3314889336016096, + "grad_norm": 0.2837081253528595, + "learning_rate": 1.4379239199933082e-06, + "loss": 0.3381, + "step": 9270 + }, + { + "epoch": 2.331740442655936, + "grad_norm": 0.3149973452091217, + "learning_rate": 1.4368972265341052e-06, + "loss": 0.3382, + "step": 9271 + }, + { + "epoch": 2.3319919517102616, + "grad_norm": 0.3013942837715149, + "learning_rate": 1.435870838232366e-06, + "loss": 0.3117, + "step": 9272 + }, + { + "epoch": 2.3322434607645874, + "grad_norm": 0.3119725286960602, + "learning_rate": 1.4348447551759908e-06, + "loss": 0.3132, + "step": 9273 + }, + { + "epoch": 2.3324949698189137, + "grad_norm": 0.3200342059135437, + "learning_rate": 1.4338189774528605e-06, + "loss": 0.3183, + "step": 9274 + }, + { + "epoch": 2.3327464788732395, + "grad_norm": 0.2716279923915863, + "learning_rate": 1.432793505150823e-06, + "loss": 0.3251, + "step": 9275 + }, + { + "epoch": 2.3329979879275653, + "grad_norm": 0.31028926372528076, + "learning_rate": 1.4317683383577074e-06, + "loss": 0.3335, + "step": 9276 + }, + { + "epoch": 2.3332494969818915, + "grad_norm": 0.2938306927680969, + "learning_rate": 1.4307434771613087e-06, + "loss": 0.3327, + "step": 9277 + }, + { + "epoch": 2.3335010060362174, + "grad_norm": 0.30416056513786316, + "learning_rate": 1.429718921649404e-06, + "loss": 0.3397, + "step": 9278 + }, + { + "epoch": 2.333752515090543, + "grad_norm": 0.3303104341030121, + "learning_rate": 1.428694671909736e-06, + "loss": 0.3197, + "step": 9279 + }, + { + "epoch": 2.3340040241448694, + "grad_norm": 0.29095789790153503, + "learning_rate": 1.4276707280300295e-06, + "loss": 0.3101, + "step": 9280 + }, + { + "epoch": 2.334255533199195, + "grad_norm": 0.308417946100235, + "learning_rate": 1.4266470900979746e-06, + "loss": 0.2987, + "step": 9281 + }, + { + "epoch": 2.334507042253521, + "grad_norm": 0.33862265944480896, + "learning_rate": 1.4256237582012433e-06, + "loss": 0.3286, + "step": 9282 + }, + { + "epoch": 2.3347585513078473, + "grad_norm": 0.28483128547668457, + "learning_rate": 1.4246007324274747e-06, + "loss": 0.3082, + "step": 9283 + }, + { + "epoch": 2.335010060362173, + "grad_norm": 0.3114943504333496, + "learning_rate": 1.4235780128642867e-06, + "loss": 0.3211, + "step": 9284 + }, + { + "epoch": 2.335261569416499, + "grad_norm": 0.30490976572036743, + "learning_rate": 1.4225555995992668e-06, + "loss": 0.3289, + "step": 9285 + }, + { + "epoch": 2.335513078470825, + "grad_norm": 0.29261600971221924, + "learning_rate": 1.4215334927199808e-06, + "loss": 0.3234, + "step": 9286 + }, + { + "epoch": 2.335764587525151, + "grad_norm": 0.30190035700798035, + "learning_rate": 1.4205116923139628e-06, + "loss": 0.3004, + "step": 9287 + }, + { + "epoch": 2.3360160965794767, + "grad_norm": 0.30952534079551697, + "learning_rate": 1.4194901984687266e-06, + "loss": 0.3236, + "step": 9288 + }, + { + "epoch": 2.336267605633803, + "grad_norm": 0.3120158910751343, + "learning_rate": 1.4184690112717536e-06, + "loss": 0.3316, + "step": 9289 + }, + { + "epoch": 2.336519114688129, + "grad_norm": 0.2920396029949188, + "learning_rate": 1.417448130810506e-06, + "loss": 0.3116, + "step": 9290 + }, + { + "epoch": 2.3367706237424546, + "grad_norm": 0.31028714776039124, + "learning_rate": 1.4164275571724112e-06, + "loss": 0.3198, + "step": 9291 + }, + { + "epoch": 2.337022132796781, + "grad_norm": 0.33299851417541504, + "learning_rate": 1.4154072904448778e-06, + "loss": 0.3254, + "step": 9292 + }, + { + "epoch": 2.3372736418511066, + "grad_norm": 0.2730899155139923, + "learning_rate": 1.4143873307152867e-06, + "loss": 0.3093, + "step": 9293 + }, + { + "epoch": 2.3375251509054324, + "grad_norm": 0.30700913071632385, + "learning_rate": 1.413367678070987e-06, + "loss": 0.322, + "step": 9294 + }, + { + "epoch": 2.3377766599597587, + "grad_norm": 0.31583860516548157, + "learning_rate": 1.41234833259931e-06, + "loss": 0.3232, + "step": 9295 + }, + { + "epoch": 2.3380281690140845, + "grad_norm": 0.31791380047798157, + "learning_rate": 1.4113292943875518e-06, + "loss": 0.3248, + "step": 9296 + }, + { + "epoch": 2.3382796780684103, + "grad_norm": 0.2840316593647003, + "learning_rate": 1.4103105635229907e-06, + "loss": 0.311, + "step": 9297 + }, + { + "epoch": 2.3385311871227366, + "grad_norm": 0.31344839930534363, + "learning_rate": 1.4092921400928717e-06, + "loss": 0.3031, + "step": 9298 + }, + { + "epoch": 2.3387826961770624, + "grad_norm": 0.29697540402412415, + "learning_rate": 1.4082740241844185e-06, + "loss": 0.3319, + "step": 9299 + }, + { + "epoch": 2.339034205231388, + "grad_norm": 0.30100035667419434, + "learning_rate": 1.4072562158848241e-06, + "loss": 0.2954, + "step": 9300 + }, + { + "epoch": 2.3392857142857144, + "grad_norm": 0.31279945373535156, + "learning_rate": 1.4062387152812595e-06, + "loss": 0.3215, + "step": 9301 + }, + { + "epoch": 2.33953722334004, + "grad_norm": 0.29303503036499023, + "learning_rate": 1.4052215224608656e-06, + "loss": 0.3127, + "step": 9302 + }, + { + "epoch": 2.339788732394366, + "grad_norm": 0.3038380444049835, + "learning_rate": 1.4042046375107592e-06, + "loss": 0.3176, + "step": 9303 + }, + { + "epoch": 2.3400402414486923, + "grad_norm": 0.3016948103904724, + "learning_rate": 1.4031880605180325e-06, + "loss": 0.3134, + "step": 9304 + }, + { + "epoch": 2.340291750503018, + "grad_norm": 0.3166118264198303, + "learning_rate": 1.4021717915697448e-06, + "loss": 0.3081, + "step": 9305 + }, + { + "epoch": 2.340543259557344, + "grad_norm": 0.31366658210754395, + "learning_rate": 1.4011558307529366e-06, + "loss": 0.3166, + "step": 9306 + }, + { + "epoch": 2.34079476861167, + "grad_norm": 0.3000653088092804, + "learning_rate": 1.400140178154616e-06, + "loss": 0.3153, + "step": 9307 + }, + { + "epoch": 2.341046277665996, + "grad_norm": 0.2916873097419739, + "learning_rate": 1.3991248338617697e-06, + "loss": 0.3262, + "step": 9308 + }, + { + "epoch": 2.3412977867203217, + "grad_norm": 0.29192009568214417, + "learning_rate": 1.3981097979613528e-06, + "loss": 0.323, + "step": 9309 + }, + { + "epoch": 2.341549295774648, + "grad_norm": 0.31591734290122986, + "learning_rate": 1.3970950705403003e-06, + "loss": 0.3382, + "step": 9310 + }, + { + "epoch": 2.341800804828974, + "grad_norm": 0.32917144894599915, + "learning_rate": 1.3960806516855136e-06, + "loss": 0.3127, + "step": 9311 + }, + { + "epoch": 2.3420523138832996, + "grad_norm": 0.29949262738227844, + "learning_rate": 1.3950665414838744e-06, + "loss": 0.3415, + "step": 9312 + }, + { + "epoch": 2.342303822937626, + "grad_norm": 0.31109297275543213, + "learning_rate": 1.394052740022232e-06, + "loss": 0.3125, + "step": 9313 + }, + { + "epoch": 2.3425553319919517, + "grad_norm": 0.31198766827583313, + "learning_rate": 1.3930392473874138e-06, + "loss": 0.3121, + "step": 9314 + }, + { + "epoch": 2.3428068410462775, + "grad_norm": 0.30766913294792175, + "learning_rate": 1.3920260636662208e-06, + "loss": 0.3147, + "step": 9315 + }, + { + "epoch": 2.3430583501006037, + "grad_norm": 0.3334919512271881, + "learning_rate": 1.391013188945422e-06, + "loss": 0.3141, + "step": 9316 + }, + { + "epoch": 2.3433098591549295, + "grad_norm": 0.3124428391456604, + "learning_rate": 1.3900006233117675e-06, + "loss": 0.341, + "step": 9317 + }, + { + "epoch": 2.3435613682092553, + "grad_norm": 0.29679644107818604, + "learning_rate": 1.3889883668519744e-06, + "loss": 0.304, + "step": 9318 + }, + { + "epoch": 2.3438128772635816, + "grad_norm": 0.2995522916316986, + "learning_rate": 1.3879764196527384e-06, + "loss": 0.321, + "step": 9319 + }, + { + "epoch": 2.3440643863179074, + "grad_norm": 0.31305447220802307, + "learning_rate": 1.3869647818007236e-06, + "loss": 0.2999, + "step": 9320 + }, + { + "epoch": 2.344315895372233, + "grad_norm": 0.2792012095451355, + "learning_rate": 1.385953453382574e-06, + "loss": 0.3074, + "step": 9321 + }, + { + "epoch": 2.3445674044265594, + "grad_norm": 0.29924044013023376, + "learning_rate": 1.3849424344849e-06, + "loss": 0.2991, + "step": 9322 + }, + { + "epoch": 2.3448189134808852, + "grad_norm": 0.30037921667099, + "learning_rate": 1.3839317251942907e-06, + "loss": 0.3035, + "step": 9323 + }, + { + "epoch": 2.345070422535211, + "grad_norm": 0.3079024851322174, + "learning_rate": 1.3829213255973089e-06, + "loss": 0.321, + "step": 9324 + }, + { + "epoch": 2.3453219315895373, + "grad_norm": 0.2921197712421417, + "learning_rate": 1.3819112357804859e-06, + "loss": 0.3223, + "step": 9325 + }, + { + "epoch": 2.345573440643863, + "grad_norm": 0.28539982438087463, + "learning_rate": 1.3809014558303319e-06, + "loss": 0.321, + "step": 9326 + }, + { + "epoch": 2.345824949698189, + "grad_norm": 0.2913080155849457, + "learning_rate": 1.3798919858333254e-06, + "loss": 0.3126, + "step": 9327 + }, + { + "epoch": 2.346076458752515, + "grad_norm": 0.31199610233306885, + "learning_rate": 1.3788828258759251e-06, + "loss": 0.3611, + "step": 9328 + }, + { + "epoch": 2.346327967806841, + "grad_norm": 0.301520973443985, + "learning_rate": 1.3778739760445552e-06, + "loss": 0.3339, + "step": 9329 + }, + { + "epoch": 2.346579476861167, + "grad_norm": 0.29440373182296753, + "learning_rate": 1.3768654364256212e-06, + "loss": 0.3115, + "step": 9330 + }, + { + "epoch": 2.346830985915493, + "grad_norm": 0.29134684801101685, + "learning_rate": 1.375857207105495e-06, + "loss": 0.3267, + "step": 9331 + }, + { + "epoch": 2.347082494969819, + "grad_norm": 0.2973969280719757, + "learning_rate": 1.3748492881705272e-06, + "loss": 0.3272, + "step": 9332 + }, + { + "epoch": 2.347334004024145, + "grad_norm": 0.30077022314071655, + "learning_rate": 1.373841679707038e-06, + "loss": 0.3198, + "step": 9333 + }, + { + "epoch": 2.347585513078471, + "grad_norm": 0.27227941155433655, + "learning_rate": 1.3728343818013233e-06, + "loss": 0.282, + "step": 9334 + }, + { + "epoch": 2.3478370221327967, + "grad_norm": 0.30526816844940186, + "learning_rate": 1.3718273945396542e-06, + "loss": 0.3189, + "step": 9335 + }, + { + "epoch": 2.348088531187123, + "grad_norm": 0.27349480986595154, + "learning_rate": 1.3708207180082694e-06, + "loss": 0.3208, + "step": 9336 + }, + { + "epoch": 2.3483400402414487, + "grad_norm": 0.30724623799324036, + "learning_rate": 1.3698143522933876e-06, + "loss": 0.3169, + "step": 9337 + }, + { + "epoch": 2.3485915492957745, + "grad_norm": 0.2816196084022522, + "learning_rate": 1.368808297481195e-06, + "loss": 0.3152, + "step": 9338 + }, + { + "epoch": 2.3488430583501008, + "grad_norm": 0.2888517379760742, + "learning_rate": 1.3678025536578559e-06, + "loss": 0.3093, + "step": 9339 + }, + { + "epoch": 2.3490945674044266, + "grad_norm": 0.2849167287349701, + "learning_rate": 1.3667971209095039e-06, + "loss": 0.3124, + "step": 9340 + }, + { + "epoch": 2.3493460764587524, + "grad_norm": 0.29156047105789185, + "learning_rate": 1.3657919993222507e-06, + "loss": 0.3193, + "step": 9341 + }, + { + "epoch": 2.3495975855130786, + "grad_norm": 0.30401670932769775, + "learning_rate": 1.3647871889821762e-06, + "loss": 0.3127, + "step": 9342 + }, + { + "epoch": 2.3498490945674044, + "grad_norm": 0.2958643436431885, + "learning_rate": 1.363782689975338e-06, + "loss": 0.3034, + "step": 9343 + }, + { + "epoch": 2.3501006036217302, + "grad_norm": 0.2841004729270935, + "learning_rate": 1.3627785023877633e-06, + "loss": 0.3383, + "step": 9344 + }, + { + "epoch": 2.3503521126760565, + "grad_norm": 0.2793041169643402, + "learning_rate": 1.3617746263054548e-06, + "loss": 0.3506, + "step": 9345 + }, + { + "epoch": 2.3506036217303823, + "grad_norm": 0.29398274421691895, + "learning_rate": 1.360771061814391e-06, + "loss": 0.3331, + "step": 9346 + }, + { + "epoch": 2.350855130784708, + "grad_norm": 0.3107350468635559, + "learning_rate": 1.3597678090005168e-06, + "loss": 0.3286, + "step": 9347 + }, + { + "epoch": 2.3511066398390343, + "grad_norm": 0.2840394973754883, + "learning_rate": 1.3587648679497583e-06, + "loss": 0.3182, + "step": 9348 + }, + { + "epoch": 2.35135814889336, + "grad_norm": 0.3232376277446747, + "learning_rate": 1.3577622387480082e-06, + "loss": 0.3361, + "step": 9349 + }, + { + "epoch": 2.351609657947686, + "grad_norm": 0.30280598998069763, + "learning_rate": 1.3567599214811379e-06, + "loss": 0.3451, + "step": 9350 + }, + { + "epoch": 2.351861167002012, + "grad_norm": 0.2855769693851471, + "learning_rate": 1.3557579162349864e-06, + "loss": 0.2765, + "step": 9351 + }, + { + "epoch": 2.352112676056338, + "grad_norm": 0.30225083231925964, + "learning_rate": 1.3547562230953726e-06, + "loss": 0.3393, + "step": 9352 + }, + { + "epoch": 2.352364185110664, + "grad_norm": 0.29712530970573425, + "learning_rate": 1.353754842148083e-06, + "loss": 0.316, + "step": 9353 + }, + { + "epoch": 2.35261569416499, + "grad_norm": 0.3048205077648163, + "learning_rate": 1.352753773478881e-06, + "loss": 0.3233, + "step": 9354 + }, + { + "epoch": 2.352867203219316, + "grad_norm": 0.3305099308490753, + "learning_rate": 1.3517530171735e-06, + "loss": 0.3363, + "step": 9355 + }, + { + "epoch": 2.3531187122736417, + "grad_norm": 0.27165278792381287, + "learning_rate": 1.3507525733176497e-06, + "loss": 0.331, + "step": 9356 + }, + { + "epoch": 2.353370221327968, + "grad_norm": 0.3076144754886627, + "learning_rate": 1.3497524419970132e-06, + "loss": 0.3293, + "step": 9357 + }, + { + "epoch": 2.3536217303822937, + "grad_norm": 0.31551241874694824, + "learning_rate": 1.348752623297243e-06, + "loss": 0.3077, + "step": 9358 + }, + { + "epoch": 2.3538732394366195, + "grad_norm": 0.3101344108581543, + "learning_rate": 1.3477531173039697e-06, + "loss": 0.3015, + "step": 9359 + }, + { + "epoch": 2.3541247484909458, + "grad_norm": 0.3111810088157654, + "learning_rate": 1.3467539241027922e-06, + "loss": 0.3292, + "step": 9360 + }, + { + "epoch": 2.3543762575452716, + "grad_norm": 0.32074761390686035, + "learning_rate": 1.3457550437792876e-06, + "loss": 0.3281, + "step": 9361 + }, + { + "epoch": 2.354627766599598, + "grad_norm": 0.2921229600906372, + "learning_rate": 1.344756476419002e-06, + "loss": 0.3177, + "step": 9362 + }, + { + "epoch": 2.3548792756539236, + "grad_norm": 0.28211191296577454, + "learning_rate": 1.3437582221074574e-06, + "loss": 0.3583, + "step": 9363 + }, + { + "epoch": 2.3551307847082494, + "grad_norm": 0.31738191843032837, + "learning_rate": 1.342760280930147e-06, + "loss": 0.3441, + "step": 9364 + }, + { + "epoch": 2.3553822937625757, + "grad_norm": 0.28729522228240967, + "learning_rate": 1.3417626529725402e-06, + "loss": 0.3326, + "step": 9365 + }, + { + "epoch": 2.3556338028169015, + "grad_norm": 0.3042784333229065, + "learning_rate": 1.3407653383200747e-06, + "loss": 0.3347, + "step": 9366 + }, + { + "epoch": 2.3558853118712273, + "grad_norm": 0.2783034145832062, + "learning_rate": 1.339768337058166e-06, + "loss": 0.2803, + "step": 9367 + }, + { + "epoch": 2.3561368209255535, + "grad_norm": 0.2911806106567383, + "learning_rate": 1.3387716492722025e-06, + "loss": 0.303, + "step": 9368 + }, + { + "epoch": 2.3563883299798793, + "grad_norm": 0.2815414071083069, + "learning_rate": 1.337775275047541e-06, + "loss": 0.3222, + "step": 9369 + }, + { + "epoch": 2.356639839034205, + "grad_norm": 0.2898373007774353, + "learning_rate": 1.336779214469518e-06, + "loss": 0.3224, + "step": 9370 + }, + { + "epoch": 2.3568913480885314, + "grad_norm": 0.3051788806915283, + "learning_rate": 1.3357834676234366e-06, + "loss": 0.3103, + "step": 9371 + }, + { + "epoch": 2.357142857142857, + "grad_norm": 0.31733277440071106, + "learning_rate": 1.3347880345945796e-06, + "loss": 0.3446, + "step": 9372 + }, + { + "epoch": 2.357394366197183, + "grad_norm": 0.30566954612731934, + "learning_rate": 1.333792915468196e-06, + "loss": 0.3109, + "step": 9373 + }, + { + "epoch": 2.3576458752515093, + "grad_norm": 0.2991909682750702, + "learning_rate": 1.3327981103295156e-06, + "loss": 0.3123, + "step": 9374 + }, + { + "epoch": 2.357897384305835, + "grad_norm": 0.2768964469432831, + "learning_rate": 1.3318036192637334e-06, + "loss": 0.3224, + "step": 9375 + }, + { + "epoch": 2.358148893360161, + "grad_norm": 0.2982604205608368, + "learning_rate": 1.3308094423560242e-06, + "loss": 0.3151, + "step": 9376 + }, + { + "epoch": 2.358400402414487, + "grad_norm": 0.31598472595214844, + "learning_rate": 1.3298155796915307e-06, + "loss": 0.3188, + "step": 9377 + }, + { + "epoch": 2.358651911468813, + "grad_norm": 0.29461905360221863, + "learning_rate": 1.3288220313553723e-06, + "loss": 0.3171, + "step": 9378 + }, + { + "epoch": 2.3589034205231387, + "grad_norm": 0.2900139391422272, + "learning_rate": 1.3278287974326415e-06, + "loss": 0.3277, + "step": 9379 + }, + { + "epoch": 2.359154929577465, + "grad_norm": 0.2897052764892578, + "learning_rate": 1.3268358780083995e-06, + "loss": 0.3078, + "step": 9380 + }, + { + "epoch": 2.359406438631791, + "grad_norm": 0.2946465015411377, + "learning_rate": 1.3258432731676867e-06, + "loss": 0.3077, + "step": 9381 + }, + { + "epoch": 2.3596579476861166, + "grad_norm": 0.29683101177215576, + "learning_rate": 1.324850982995511e-06, + "loss": 0.3171, + "step": 9382 + }, + { + "epoch": 2.359909456740443, + "grad_norm": 0.32809895277023315, + "learning_rate": 1.3238590075768582e-06, + "loss": 0.3467, + "step": 9383 + }, + { + "epoch": 2.3601609657947686, + "grad_norm": 0.28296294808387756, + "learning_rate": 1.3228673469966819e-06, + "loss": 0.3348, + "step": 9384 + }, + { + "epoch": 2.3604124748490944, + "grad_norm": 0.31060582399368286, + "learning_rate": 1.321876001339915e-06, + "loss": 0.317, + "step": 9385 + }, + { + "epoch": 2.3606639839034207, + "grad_norm": 0.2970585525035858, + "learning_rate": 1.3208849706914567e-06, + "loss": 0.3378, + "step": 9386 + }, + { + "epoch": 2.3609154929577465, + "grad_norm": 0.310907781124115, + "learning_rate": 1.319894255136186e-06, + "loss": 0.3151, + "step": 9387 + }, + { + "epoch": 2.3611670020120723, + "grad_norm": 0.3074939250946045, + "learning_rate": 1.3189038547589479e-06, + "loss": 0.3049, + "step": 9388 + }, + { + "epoch": 2.3614185110663986, + "grad_norm": 0.2910417318344116, + "learning_rate": 1.317913769644567e-06, + "loss": 0.325, + "step": 9389 + }, + { + "epoch": 2.3616700201207244, + "grad_norm": 0.29934588074684143, + "learning_rate": 1.3169239998778361e-06, + "loss": 0.3217, + "step": 9390 + }, + { + "epoch": 2.36192152917505, + "grad_norm": 0.3142791986465454, + "learning_rate": 1.3159345455435241e-06, + "loss": 0.3297, + "step": 9391 + }, + { + "epoch": 2.3621730382293764, + "grad_norm": 0.30525627732276917, + "learning_rate": 1.3149454067263696e-06, + "loss": 0.3088, + "step": 9392 + }, + { + "epoch": 2.362424547283702, + "grad_norm": 0.2881489396095276, + "learning_rate": 1.3139565835110884e-06, + "loss": 0.308, + "step": 9393 + }, + { + "epoch": 2.362676056338028, + "grad_norm": 0.32259997725486755, + "learning_rate": 1.3129680759823648e-06, + "loss": 0.3287, + "step": 9394 + }, + { + "epoch": 2.3629275653923543, + "grad_norm": 0.30433225631713867, + "learning_rate": 1.311979884224861e-06, + "loss": 0.3147, + "step": 9395 + }, + { + "epoch": 2.36317907444668, + "grad_norm": 0.3119969964027405, + "learning_rate": 1.3109920083232064e-06, + "loss": 0.3184, + "step": 9396 + }, + { + "epoch": 2.363430583501006, + "grad_norm": 0.2946832776069641, + "learning_rate": 1.3100044483620094e-06, + "loss": 0.318, + "step": 9397 + }, + { + "epoch": 2.363682092555332, + "grad_norm": 0.282425194978714, + "learning_rate": 1.3090172044258458e-06, + "loss": 0.3321, + "step": 9398 + }, + { + "epoch": 2.363933601609658, + "grad_norm": 0.3065638244152069, + "learning_rate": 1.3080302765992692e-06, + "loss": 0.3006, + "step": 9399 + }, + { + "epoch": 2.3641851106639837, + "grad_norm": 0.2910194993019104, + "learning_rate": 1.3070436649668006e-06, + "loss": 0.3033, + "step": 9400 + }, + { + "epoch": 2.36443661971831, + "grad_norm": 0.306165874004364, + "learning_rate": 1.3060573696129396e-06, + "loss": 0.3083, + "step": 9401 + }, + { + "epoch": 2.364688128772636, + "grad_norm": 0.3192187249660492, + "learning_rate": 1.305071390622157e-06, + "loss": 0.302, + "step": 9402 + }, + { + "epoch": 2.3649396378269616, + "grad_norm": 0.3046712577342987, + "learning_rate": 1.3040857280788927e-06, + "loss": 0.302, + "step": 9403 + }, + { + "epoch": 2.365191146881288, + "grad_norm": 0.2873310446739197, + "learning_rate": 1.3031003820675659e-06, + "loss": 0.2988, + "step": 9404 + }, + { + "epoch": 2.3654426559356136, + "grad_norm": 0.31300392746925354, + "learning_rate": 1.3021153526725615e-06, + "loss": 0.317, + "step": 9405 + }, + { + "epoch": 2.3656941649899395, + "grad_norm": 0.29510292410850525, + "learning_rate": 1.3011306399782458e-06, + "loss": 0.2967, + "step": 9406 + }, + { + "epoch": 2.3659456740442657, + "grad_norm": 0.3187848627567291, + "learning_rate": 1.3001462440689488e-06, + "loss": 0.3349, + "step": 9407 + }, + { + "epoch": 2.3661971830985915, + "grad_norm": 0.2933845818042755, + "learning_rate": 1.2991621650289809e-06, + "loss": 0.2987, + "step": 9408 + }, + { + "epoch": 2.3664486921529173, + "grad_norm": 0.2807075083255768, + "learning_rate": 1.2981784029426203e-06, + "loss": 0.2975, + "step": 9409 + }, + { + "epoch": 2.3667002012072436, + "grad_norm": 0.2867673635482788, + "learning_rate": 1.2971949578941217e-06, + "loss": 0.3161, + "step": 9410 + }, + { + "epoch": 2.3669517102615694, + "grad_norm": 0.2991779148578644, + "learning_rate": 1.2962118299677095e-06, + "loss": 0.3163, + "step": 9411 + }, + { + "epoch": 2.367203219315895, + "grad_norm": 0.3035879135131836, + "learning_rate": 1.2952290192475848e-06, + "loss": 0.29, + "step": 9412 + }, + { + "epoch": 2.3674547283702214, + "grad_norm": 0.3021244704723358, + "learning_rate": 1.2942465258179155e-06, + "loss": 0.3111, + "step": 9413 + }, + { + "epoch": 2.3677062374245472, + "grad_norm": 0.2943519949913025, + "learning_rate": 1.29326434976285e-06, + "loss": 0.3085, + "step": 9414 + }, + { + "epoch": 2.367957746478873, + "grad_norm": 0.29150792956352234, + "learning_rate": 1.2922824911665021e-06, + "loss": 0.3083, + "step": 9415 + }, + { + "epoch": 2.3682092555331993, + "grad_norm": 0.3039129972457886, + "learning_rate": 1.2913009501129653e-06, + "loss": 0.3093, + "step": 9416 + }, + { + "epoch": 2.368460764587525, + "grad_norm": 0.28910478949546814, + "learning_rate": 1.2903197266862989e-06, + "loss": 0.3038, + "step": 9417 + }, + { + "epoch": 2.368712273641851, + "grad_norm": 0.29740893840789795, + "learning_rate": 1.289338820970541e-06, + "loss": 0.3274, + "step": 9418 + }, + { + "epoch": 2.368963782696177, + "grad_norm": 0.2945566773414612, + "learning_rate": 1.2883582330496986e-06, + "loss": 0.3546, + "step": 9419 + }, + { + "epoch": 2.369215291750503, + "grad_norm": 0.2992318272590637, + "learning_rate": 1.287377963007755e-06, + "loss": 0.3222, + "step": 9420 + }, + { + "epoch": 2.3694668008048287, + "grad_norm": 0.2896125614643097, + "learning_rate": 1.2863980109286605e-06, + "loss": 0.3324, + "step": 9421 + }, + { + "epoch": 2.369718309859155, + "grad_norm": 0.31392091512680054, + "learning_rate": 1.2854183768963453e-06, + "loss": 0.318, + "step": 9422 + }, + { + "epoch": 2.369969818913481, + "grad_norm": 0.2963343560695648, + "learning_rate": 1.2844390609947082e-06, + "loss": 0.3077, + "step": 9423 + }, + { + "epoch": 2.3702213279678066, + "grad_norm": 0.32114240527153015, + "learning_rate": 1.2834600633076205e-06, + "loss": 0.332, + "step": 9424 + }, + { + "epoch": 2.370472837022133, + "grad_norm": 0.28790682554244995, + "learning_rate": 1.2824813839189288e-06, + "loss": 0.3242, + "step": 9425 + }, + { + "epoch": 2.3707243460764587, + "grad_norm": 0.31781139969825745, + "learning_rate": 1.2815030229124481e-06, + "loss": 0.3148, + "step": 9426 + }, + { + "epoch": 2.3709758551307845, + "grad_norm": 0.3021380305290222, + "learning_rate": 1.2805249803719722e-06, + "loss": 0.3122, + "step": 9427 + }, + { + "epoch": 2.3712273641851107, + "grad_norm": 0.3446398973464966, + "learning_rate": 1.2795472563812617e-06, + "loss": 0.3437, + "step": 9428 + }, + { + "epoch": 2.3714788732394365, + "grad_norm": 0.2881867587566376, + "learning_rate": 1.2785698510240546e-06, + "loss": 0.3348, + "step": 9429 + }, + { + "epoch": 2.3717303822937628, + "grad_norm": 0.28368863463401794, + "learning_rate": 1.2775927643840575e-06, + "loss": 0.3034, + "step": 9430 + }, + { + "epoch": 2.3719818913480886, + "grad_norm": 0.281934529542923, + "learning_rate": 1.2766159965449543e-06, + "loss": 0.3107, + "step": 9431 + }, + { + "epoch": 2.3722334004024144, + "grad_norm": 0.2791815996170044, + "learning_rate": 1.275639547590396e-06, + "loss": 0.3286, + "step": 9432 + }, + { + "epoch": 2.3724849094567406, + "grad_norm": 0.29985862970352173, + "learning_rate": 1.2746634176040107e-06, + "loss": 0.3342, + "step": 9433 + }, + { + "epoch": 2.3727364185110664, + "grad_norm": 0.31482386589050293, + "learning_rate": 1.2736876066693999e-06, + "loss": 0.3313, + "step": 9434 + }, + { + "epoch": 2.3729879275653922, + "grad_norm": 0.2999832332134247, + "learning_rate": 1.2727121148701322e-06, + "loss": 0.3334, + "step": 9435 + }, + { + "epoch": 2.3732394366197185, + "grad_norm": 0.3035413920879364, + "learning_rate": 1.2717369422897552e-06, + "loss": 0.3196, + "step": 9436 + }, + { + "epoch": 2.3734909456740443, + "grad_norm": 0.28583768010139465, + "learning_rate": 1.270762089011784e-06, + "loss": 0.3175, + "step": 9437 + }, + { + "epoch": 2.37374245472837, + "grad_norm": 0.2991746962070465, + "learning_rate": 1.2697875551197113e-06, + "loss": 0.3391, + "step": 9438 + }, + { + "epoch": 2.3739939637826963, + "grad_norm": 0.29370561242103577, + "learning_rate": 1.268813340696997e-06, + "loss": 0.3131, + "step": 9439 + }, + { + "epoch": 2.374245472837022, + "grad_norm": 0.31026825308799744, + "learning_rate": 1.2678394458270794e-06, + "loss": 0.3207, + "step": 9440 + }, + { + "epoch": 2.374496981891348, + "grad_norm": 0.31360548734664917, + "learning_rate": 1.2668658705933628e-06, + "loss": 0.314, + "step": 9441 + }, + { + "epoch": 2.374748490945674, + "grad_norm": 0.31127217411994934, + "learning_rate": 1.2658926150792321e-06, + "loss": 0.3276, + "step": 9442 + }, + { + "epoch": 2.375, + "grad_norm": 0.31766191124916077, + "learning_rate": 1.2649196793680367e-06, + "loss": 0.328, + "step": 9443 + }, + { + "epoch": 2.375251509054326, + "grad_norm": 0.32087278366088867, + "learning_rate": 1.2639470635431044e-06, + "loss": 0.3228, + "step": 9444 + }, + { + "epoch": 2.375503018108652, + "grad_norm": 0.29224544763565063, + "learning_rate": 1.2629747676877347e-06, + "loss": 0.3348, + "step": 9445 + }, + { + "epoch": 2.375754527162978, + "grad_norm": 0.3009779155254364, + "learning_rate": 1.2620027918851956e-06, + "loss": 0.3244, + "step": 9446 + }, + { + "epoch": 2.3760060362173037, + "grad_norm": 0.304565966129303, + "learning_rate": 1.2610311362187343e-06, + "loss": 0.3357, + "step": 9447 + }, + { + "epoch": 2.37625754527163, + "grad_norm": 0.3026162087917328, + "learning_rate": 1.260059800771564e-06, + "loss": 0.3098, + "step": 9448 + }, + { + "epoch": 2.3765090543259557, + "grad_norm": 0.3093135952949524, + "learning_rate": 1.2590887856268764e-06, + "loss": 0.3289, + "step": 9449 + }, + { + "epoch": 2.3767605633802815, + "grad_norm": 0.2858860194683075, + "learning_rate": 1.2581180908678291e-06, + "loss": 0.3005, + "step": 9450 + }, + { + "epoch": 2.3770120724346078, + "grad_norm": 0.28006115555763245, + "learning_rate": 1.2571477165775597e-06, + "loss": 0.3127, + "step": 9451 + }, + { + "epoch": 2.3772635814889336, + "grad_norm": 0.2762000560760498, + "learning_rate": 1.2561776628391725e-06, + "loss": 0.3349, + "step": 9452 + }, + { + "epoch": 2.3775150905432594, + "grad_norm": 0.3034118413925171, + "learning_rate": 1.2552079297357478e-06, + "loss": 0.318, + "step": 9453 + }, + { + "epoch": 2.3777665995975856, + "grad_norm": 0.29778799414634705, + "learning_rate": 1.2542385173503359e-06, + "loss": 0.3035, + "step": 9454 + }, + { + "epoch": 2.3780181086519114, + "grad_norm": 0.30300748348236084, + "learning_rate": 1.2532694257659605e-06, + "loss": 0.3182, + "step": 9455 + }, + { + "epoch": 2.3782696177062372, + "grad_norm": 0.27301889657974243, + "learning_rate": 1.252300655065622e-06, + "loss": 0.3322, + "step": 9456 + }, + { + "epoch": 2.3785211267605635, + "grad_norm": 0.3212936222553253, + "learning_rate": 1.2513322053322847e-06, + "loss": 0.3413, + "step": 9457 + }, + { + "epoch": 2.3787726358148893, + "grad_norm": 0.3001827597618103, + "learning_rate": 1.250364076648894e-06, + "loss": 0.3137, + "step": 9458 + }, + { + "epoch": 2.3790241448692155, + "grad_norm": 0.29002684354782104, + "learning_rate": 1.2493962690983608e-06, + "loss": 0.296, + "step": 9459 + }, + { + "epoch": 2.3792756539235413, + "grad_norm": 0.30628740787506104, + "learning_rate": 1.248428782763575e-06, + "loss": 0.3288, + "step": 9460 + }, + { + "epoch": 2.379527162977867, + "grad_norm": 0.30561593174934387, + "learning_rate": 1.2474616177273928e-06, + "loss": 0.3324, + "step": 9461 + }, + { + "epoch": 2.3797786720321934, + "grad_norm": 0.3026500642299652, + "learning_rate": 1.2464947740726491e-06, + "loss": 0.3171, + "step": 9462 + }, + { + "epoch": 2.380030181086519, + "grad_norm": 0.2937304377555847, + "learning_rate": 1.2455282518821442e-06, + "loss": 0.3395, + "step": 9463 + }, + { + "epoch": 2.380281690140845, + "grad_norm": 0.30851301550865173, + "learning_rate": 1.244562051238658e-06, + "loss": 0.3576, + "step": 9464 + }, + { + "epoch": 2.3805331991951713, + "grad_norm": 0.3095259666442871, + "learning_rate": 1.2435961722249374e-06, + "loss": 0.332, + "step": 9465 + }, + { + "epoch": 2.380784708249497, + "grad_norm": 0.2995477616786957, + "learning_rate": 1.2426306149237039e-06, + "loss": 0.3151, + "step": 9466 + }, + { + "epoch": 2.381036217303823, + "grad_norm": 0.2909550964832306, + "learning_rate": 1.2416653794176542e-06, + "loss": 0.3098, + "step": 9467 + }, + { + "epoch": 2.381287726358149, + "grad_norm": 0.28925126791000366, + "learning_rate": 1.2407004657894505e-06, + "loss": 0.3065, + "step": 9468 + }, + { + "epoch": 2.381539235412475, + "grad_norm": 0.28470999002456665, + "learning_rate": 1.2397358741217359e-06, + "loss": 0.3151, + "step": 9469 + }, + { + "epoch": 2.3817907444668007, + "grad_norm": 0.3109789192676544, + "learning_rate": 1.2387716044971181e-06, + "loss": 0.3137, + "step": 9470 + }, + { + "epoch": 2.382042253521127, + "grad_norm": 0.3258463740348816, + "learning_rate": 1.2378076569981833e-06, + "loss": 0.3246, + "step": 9471 + }, + { + "epoch": 2.3822937625754528, + "grad_norm": 0.3138425052165985, + "learning_rate": 1.2368440317074854e-06, + "loss": 0.3248, + "step": 9472 + }, + { + "epoch": 2.3825452716297786, + "grad_norm": 0.31474605202674866, + "learning_rate": 1.2358807287075553e-06, + "loss": 0.3273, + "step": 9473 + }, + { + "epoch": 2.382796780684105, + "grad_norm": 0.30715155601501465, + "learning_rate": 1.234917748080891e-06, + "loss": 0.3247, + "step": 9474 + }, + { + "epoch": 2.3830482897384306, + "grad_norm": 0.29947811365127563, + "learning_rate": 1.2339550899099673e-06, + "loss": 0.319, + "step": 9475 + }, + { + "epoch": 2.3832997987927564, + "grad_norm": 0.27288398146629333, + "learning_rate": 1.2329927542772314e-06, + "loss": 0.338, + "step": 9476 + }, + { + "epoch": 2.3835513078470827, + "grad_norm": 0.30378079414367676, + "learning_rate": 1.2320307412650978e-06, + "loss": 0.3123, + "step": 9477 + }, + { + "epoch": 2.3838028169014085, + "grad_norm": 0.2979941666126251, + "learning_rate": 1.2310690509559609e-06, + "loss": 0.3077, + "step": 9478 + }, + { + "epoch": 2.3840543259557343, + "grad_norm": 0.3131090998649597, + "learning_rate": 1.2301076834321796e-06, + "loss": 0.3274, + "step": 9479 + }, + { + "epoch": 2.3843058350100605, + "grad_norm": 0.2883933186531067, + "learning_rate": 1.2291466387760925e-06, + "loss": 0.3344, + "step": 9480 + }, + { + "epoch": 2.3845573440643864, + "grad_norm": 0.3163345158100128, + "learning_rate": 1.2281859170700039e-06, + "loss": 0.3107, + "step": 9481 + }, + { + "epoch": 2.384808853118712, + "grad_norm": 0.2901724874973297, + "learning_rate": 1.2272255183961968e-06, + "loss": 0.3282, + "step": 9482 + }, + { + "epoch": 2.3850603621730384, + "grad_norm": 0.31056809425354004, + "learning_rate": 1.2262654428369198e-06, + "loss": 0.3323, + "step": 9483 + }, + { + "epoch": 2.385311871227364, + "grad_norm": 0.299684077501297, + "learning_rate": 1.225305690474401e-06, + "loss": 0.3099, + "step": 9484 + }, + { + "epoch": 2.38556338028169, + "grad_norm": 0.31283077597618103, + "learning_rate": 1.2243462613908336e-06, + "loss": 0.3079, + "step": 9485 + }, + { + "epoch": 2.3858148893360163, + "grad_norm": 0.28932955861091614, + "learning_rate": 1.2233871556683891e-06, + "loss": 0.2907, + "step": 9486 + }, + { + "epoch": 2.386066398390342, + "grad_norm": 0.30000898241996765, + "learning_rate": 1.222428373389209e-06, + "loss": 0.3257, + "step": 9487 + }, + { + "epoch": 2.386317907444668, + "grad_norm": 0.3037915825843811, + "learning_rate": 1.2214699146354054e-06, + "loss": 0.3359, + "step": 9488 + }, + { + "epoch": 2.386569416498994, + "grad_norm": 0.30737778544425964, + "learning_rate": 1.2205117794890665e-06, + "loss": 0.3159, + "step": 9489 + }, + { + "epoch": 2.38682092555332, + "grad_norm": 0.318010538816452, + "learning_rate": 1.2195539680322476e-06, + "loss": 0.3197, + "step": 9490 + }, + { + "epoch": 2.3870724346076457, + "grad_norm": 0.32174649834632874, + "learning_rate": 1.2185964803469824e-06, + "loss": 0.3168, + "step": 9491 + }, + { + "epoch": 2.387323943661972, + "grad_norm": 0.28798073530197144, + "learning_rate": 1.2176393165152712e-06, + "loss": 0.3204, + "step": 9492 + }, + { + "epoch": 2.387575452716298, + "grad_norm": 0.3134564757347107, + "learning_rate": 1.2166824766190916e-06, + "loss": 0.3338, + "step": 9493 + }, + { + "epoch": 2.3878269617706236, + "grad_norm": 0.2944769859313965, + "learning_rate": 1.2157259607403877e-06, + "loss": 0.3366, + "step": 9494 + }, + { + "epoch": 2.38807847082495, + "grad_norm": 0.28981396555900574, + "learning_rate": 1.2147697689610826e-06, + "loss": 0.3254, + "step": 9495 + }, + { + "epoch": 2.3883299798792756, + "grad_norm": 0.28675341606140137, + "learning_rate": 1.213813901363065e-06, + "loss": 0.3123, + "step": 9496 + }, + { + "epoch": 2.3885814889336014, + "grad_norm": 0.29373446106910706, + "learning_rate": 1.2128583580282005e-06, + "loss": 0.3495, + "step": 9497 + }, + { + "epoch": 2.3888329979879277, + "grad_norm": 0.286771684885025, + "learning_rate": 1.2119031390383268e-06, + "loss": 0.3153, + "step": 9498 + }, + { + "epoch": 2.3890845070422535, + "grad_norm": 0.29528728127479553, + "learning_rate": 1.210948244475249e-06, + "loss": 0.3318, + "step": 9499 + }, + { + "epoch": 2.3893360160965793, + "grad_norm": 0.2970670759677887, + "learning_rate": 1.209993674420752e-06, + "loss": 0.3444, + "step": 9500 + }, + { + "epoch": 2.3895875251509056, + "grad_norm": 0.2939586639404297, + "learning_rate": 1.2090394289565849e-06, + "loss": 0.3132, + "step": 9501 + }, + { + "epoch": 2.3898390342052314, + "grad_norm": 0.2839318513870239, + "learning_rate": 1.208085508164476e-06, + "loss": 0.3193, + "step": 9502 + }, + { + "epoch": 2.390090543259557, + "grad_norm": 0.31266748905181885, + "learning_rate": 1.2071319121261194e-06, + "loss": 0.3341, + "step": 9503 + }, + { + "epoch": 2.3903420523138834, + "grad_norm": 0.29940277338027954, + "learning_rate": 1.2061786409231884e-06, + "loss": 0.3222, + "step": 9504 + }, + { + "epoch": 2.390593561368209, + "grad_norm": 0.2878960967063904, + "learning_rate": 1.2052256946373209e-06, + "loss": 0.3252, + "step": 9505 + }, + { + "epoch": 2.390845070422535, + "grad_norm": 0.3191094398498535, + "learning_rate": 1.204273073350134e-06, + "loss": 0.3082, + "step": 9506 + }, + { + "epoch": 2.3910965794768613, + "grad_norm": 0.3221496343612671, + "learning_rate": 1.203320777143211e-06, + "loss": 0.3168, + "step": 9507 + }, + { + "epoch": 2.391348088531187, + "grad_norm": 0.30812880396842957, + "learning_rate": 1.202368806098112e-06, + "loss": 0.3156, + "step": 9508 + }, + { + "epoch": 2.391599597585513, + "grad_norm": 0.28987714648246765, + "learning_rate": 1.2014171602963676e-06, + "loss": 0.3236, + "step": 9509 + }, + { + "epoch": 2.391851106639839, + "grad_norm": 0.2956389784812927, + "learning_rate": 1.2004658398194786e-06, + "loss": 0.3101, + "step": 9510 + }, + { + "epoch": 2.392102615694165, + "grad_norm": 0.2824747860431671, + "learning_rate": 1.199514844748922e-06, + "loss": 0.3145, + "step": 9511 + }, + { + "epoch": 2.3923541247484907, + "grad_norm": 0.3140904903411865, + "learning_rate": 1.1985641751661415e-06, + "loss": 0.3101, + "step": 9512 + }, + { + "epoch": 2.392605633802817, + "grad_norm": 0.288084477186203, + "learning_rate": 1.1976138311525592e-06, + "loss": 0.3343, + "step": 9513 + }, + { + "epoch": 2.392857142857143, + "grad_norm": 0.27887189388275146, + "learning_rate": 1.196663812789563e-06, + "loss": 0.3242, + "step": 9514 + }, + { + "epoch": 2.3931086519114686, + "grad_norm": 0.30040696263313293, + "learning_rate": 1.1957141201585193e-06, + "loss": 0.3303, + "step": 9515 + }, + { + "epoch": 2.393360160965795, + "grad_norm": 0.2744455635547638, + "learning_rate": 1.1947647533407602e-06, + "loss": 0.3199, + "step": 9516 + }, + { + "epoch": 2.3936116700201207, + "grad_norm": 0.3067541718482971, + "learning_rate": 1.1938157124175959e-06, + "loss": 0.3125, + "step": 9517 + }, + { + "epoch": 2.3938631790744465, + "grad_norm": 0.29640766978263855, + "learning_rate": 1.1928669974703033e-06, + "loss": 0.3321, + "step": 9518 + }, + { + "epoch": 2.3941146881287727, + "grad_norm": 0.30596280097961426, + "learning_rate": 1.191918608580136e-06, + "loss": 0.3078, + "step": 9519 + }, + { + "epoch": 2.3943661971830985, + "grad_norm": 0.27518826723098755, + "learning_rate": 1.1909705458283155e-06, + "loss": 0.3197, + "step": 9520 + }, + { + "epoch": 2.3946177062374243, + "grad_norm": 0.27755483984947205, + "learning_rate": 1.1900228092960398e-06, + "loss": 0.3256, + "step": 9521 + }, + { + "epoch": 2.3948692152917506, + "grad_norm": 0.30145710706710815, + "learning_rate": 1.1890753990644738e-06, + "loss": 0.3207, + "step": 9522 + }, + { + "epoch": 2.3951207243460764, + "grad_norm": 0.3164133131504059, + "learning_rate": 1.1881283152147606e-06, + "loss": 0.3314, + "step": 9523 + }, + { + "epoch": 2.395372233400402, + "grad_norm": 0.3069988787174225, + "learning_rate": 1.1871815578280083e-06, + "loss": 0.3108, + "step": 9524 + }, + { + "epoch": 2.3956237424547284, + "grad_norm": 0.3104003965854645, + "learning_rate": 1.186235126985304e-06, + "loss": 0.3224, + "step": 9525 + }, + { + "epoch": 2.3958752515090542, + "grad_norm": 0.2976137399673462, + "learning_rate": 1.185289022767701e-06, + "loss": 0.3175, + "step": 9526 + }, + { + "epoch": 2.3961267605633805, + "grad_norm": 0.3276063799858093, + "learning_rate": 1.1843432452562303e-06, + "loss": 0.3342, + "step": 9527 + }, + { + "epoch": 2.3963782696177063, + "grad_norm": 0.3153345584869385, + "learning_rate": 1.183397794531888e-06, + "loss": 0.3397, + "step": 9528 + }, + { + "epoch": 2.396629778672032, + "grad_norm": 0.3022550940513611, + "learning_rate": 1.18245267067565e-06, + "loss": 0.3021, + "step": 9529 + }, + { + "epoch": 2.3968812877263583, + "grad_norm": 0.3086456060409546, + "learning_rate": 1.1815078737684566e-06, + "loss": 0.3211, + "step": 9530 + }, + { + "epoch": 2.397132796780684, + "grad_norm": 0.3157150447368622, + "learning_rate": 1.1805634038912268e-06, + "loss": 0.3374, + "step": 9531 + }, + { + "epoch": 2.39738430583501, + "grad_norm": 0.2981681823730469, + "learning_rate": 1.1796192611248452e-06, + "loss": 0.3354, + "step": 9532 + }, + { + "epoch": 2.397635814889336, + "grad_norm": 0.2994699478149414, + "learning_rate": 1.1786754455501759e-06, + "loss": 0.3359, + "step": 9533 + }, + { + "epoch": 2.397887323943662, + "grad_norm": 0.3035818338394165, + "learning_rate": 1.1777319572480468e-06, + "loss": 0.3408, + "step": 9534 + }, + { + "epoch": 2.398138832997988, + "grad_norm": 0.2822018265724182, + "learning_rate": 1.1767887962992647e-06, + "loss": 0.3129, + "step": 9535 + }, + { + "epoch": 2.398390342052314, + "grad_norm": 0.2857167720794678, + "learning_rate": 1.1758459627846031e-06, + "loss": 0.3171, + "step": 9536 + }, + { + "epoch": 2.39864185110664, + "grad_norm": 0.2896609306335449, + "learning_rate": 1.1749034567848122e-06, + "loss": 0.3178, + "step": 9537 + }, + { + "epoch": 2.3988933601609657, + "grad_norm": 0.3051433563232422, + "learning_rate": 1.1739612783806092e-06, + "loss": 0.3435, + "step": 9538 + }, + { + "epoch": 2.399144869215292, + "grad_norm": 0.3050413727760315, + "learning_rate": 1.1730194276526885e-06, + "loss": 0.3223, + "step": 9539 + }, + { + "epoch": 2.3993963782696177, + "grad_norm": 0.30371037125587463, + "learning_rate": 1.1720779046817104e-06, + "loss": 0.3187, + "step": 9540 + }, + { + "epoch": 2.3996478873239435, + "grad_norm": 0.28884759545326233, + "learning_rate": 1.1711367095483134e-06, + "loss": 0.3246, + "step": 9541 + }, + { + "epoch": 2.3998993963782698, + "grad_norm": 0.3120008409023285, + "learning_rate": 1.1701958423331044e-06, + "loss": 0.3344, + "step": 9542 + }, + { + "epoch": 2.4001509054325956, + "grad_norm": 0.28870880603790283, + "learning_rate": 1.1692553031166616e-06, + "loss": 0.3225, + "step": 9543 + }, + { + "epoch": 2.4004024144869214, + "grad_norm": 0.28078019618988037, + "learning_rate": 1.1683150919795378e-06, + "loss": 0.2904, + "step": 9544 + }, + { + "epoch": 2.4006539235412476, + "grad_norm": 0.28186681866645813, + "learning_rate": 1.1673752090022544e-06, + "loss": 0.3233, + "step": 9545 + }, + { + "epoch": 2.4009054325955734, + "grad_norm": 0.2807130813598633, + "learning_rate": 1.1664356542653088e-06, + "loss": 0.3163, + "step": 9546 + }, + { + "epoch": 2.4011569416498992, + "grad_norm": 0.28840017318725586, + "learning_rate": 1.1654964278491653e-06, + "loss": 0.3297, + "step": 9547 + }, + { + "epoch": 2.4014084507042255, + "grad_norm": 0.2861834466457367, + "learning_rate": 1.1645575298342659e-06, + "loss": 0.3246, + "step": 9548 + }, + { + "epoch": 2.4016599597585513, + "grad_norm": 0.2845926582813263, + "learning_rate": 1.1636189603010179e-06, + "loss": 0.3242, + "step": 9549 + }, + { + "epoch": 2.401911468812877, + "grad_norm": 0.28834056854248047, + "learning_rate": 1.1626807193298073e-06, + "loss": 0.3234, + "step": 9550 + }, + { + "epoch": 2.4021629778672033, + "grad_norm": 0.31181180477142334, + "learning_rate": 1.161742807000985e-06, + "loss": 0.3505, + "step": 9551 + }, + { + "epoch": 2.402414486921529, + "grad_norm": 0.30017805099487305, + "learning_rate": 1.1608052233948797e-06, + "loss": 0.332, + "step": 9552 + }, + { + "epoch": 2.402665995975855, + "grad_norm": 0.2945021986961365, + "learning_rate": 1.1598679685917901e-06, + "loss": 0.3185, + "step": 9553 + }, + { + "epoch": 2.402917505030181, + "grad_norm": 0.29077422618865967, + "learning_rate": 1.158931042671984e-06, + "loss": 0.3215, + "step": 9554 + }, + { + "epoch": 2.403169014084507, + "grad_norm": 0.28947049379348755, + "learning_rate": 1.157994445715706e-06, + "loss": 0.3264, + "step": 9555 + }, + { + "epoch": 2.403420523138833, + "grad_norm": 0.2847462296485901, + "learning_rate": 1.1570581778031665e-06, + "loss": 0.3298, + "step": 9556 + }, + { + "epoch": 2.403672032193159, + "grad_norm": 0.29877012968063354, + "learning_rate": 1.1561222390145543e-06, + "loss": 0.324, + "step": 9557 + }, + { + "epoch": 2.403923541247485, + "grad_norm": 0.2849181294441223, + "learning_rate": 1.1551866294300234e-06, + "loss": 0.321, + "step": 9558 + }, + { + "epoch": 2.404175050301811, + "grad_norm": 0.3224382698535919, + "learning_rate": 1.1542513491297063e-06, + "loss": 0.3194, + "step": 9559 + }, + { + "epoch": 2.404426559356137, + "grad_norm": 0.2912129759788513, + "learning_rate": 1.1533163981937012e-06, + "loss": 0.3128, + "step": 9560 + }, + { + "epoch": 2.4046780684104627, + "grad_norm": 0.2958779036998749, + "learning_rate": 1.1523817767020829e-06, + "loss": 0.3136, + "step": 9561 + }, + { + "epoch": 2.404929577464789, + "grad_norm": 0.300225168466568, + "learning_rate": 1.1514474847348934e-06, + "loss": 0.338, + "step": 9562 + }, + { + "epoch": 2.4051810865191148, + "grad_norm": 0.31059449911117554, + "learning_rate": 1.1505135223721498e-06, + "loss": 0.3326, + "step": 9563 + }, + { + "epoch": 2.4054325955734406, + "grad_norm": 0.293338805437088, + "learning_rate": 1.1495798896938426e-06, + "loss": 0.3294, + "step": 9564 + }, + { + "epoch": 2.405684104627767, + "grad_norm": 0.3122572898864746, + "learning_rate": 1.1486465867799284e-06, + "loss": 0.3319, + "step": 9565 + }, + { + "epoch": 2.4059356136820926, + "grad_norm": 0.2884367108345032, + "learning_rate": 1.147713613710341e-06, + "loss": 0.2962, + "step": 9566 + }, + { + "epoch": 2.4061871227364184, + "grad_norm": 0.3003442585468292, + "learning_rate": 1.1467809705649817e-06, + "loss": 0.3139, + "step": 9567 + }, + { + "epoch": 2.4064386317907447, + "grad_norm": 0.2922108471393585, + "learning_rate": 1.1458486574237281e-06, + "loss": 0.3144, + "step": 9568 + }, + { + "epoch": 2.4066901408450705, + "grad_norm": 0.2977858781814575, + "learning_rate": 1.144916674366424e-06, + "loss": 0.3182, + "step": 9569 + }, + { + "epoch": 2.4069416498993963, + "grad_norm": 0.2883601486682892, + "learning_rate": 1.1439850214728908e-06, + "loss": 0.3377, + "step": 9570 + }, + { + "epoch": 2.4071931589537225, + "grad_norm": 0.28815150260925293, + "learning_rate": 1.1430536988229157e-06, + "loss": 0.3322, + "step": 9571 + }, + { + "epoch": 2.4074446680080483, + "grad_norm": 0.3212835192680359, + "learning_rate": 1.1421227064962641e-06, + "loss": 0.303, + "step": 9572 + }, + { + "epoch": 2.407696177062374, + "grad_norm": 0.2907494008541107, + "learning_rate": 1.1411920445726666e-06, + "loss": 0.3007, + "step": 9573 + }, + { + "epoch": 2.4079476861167004, + "grad_norm": 0.3153952360153198, + "learning_rate": 1.1402617131318295e-06, + "loss": 0.3223, + "step": 9574 + }, + { + "epoch": 2.408199195171026, + "grad_norm": 0.2907307744026184, + "learning_rate": 1.1393317122534315e-06, + "loss": 0.3325, + "step": 9575 + }, + { + "epoch": 2.408450704225352, + "grad_norm": 0.30540433526039124, + "learning_rate": 1.1384020420171194e-06, + "loss": 0.3305, + "step": 9576 + }, + { + "epoch": 2.4087022132796783, + "grad_norm": 0.2847049832344055, + "learning_rate": 1.137472702502515e-06, + "loss": 0.3229, + "step": 9577 + }, + { + "epoch": 2.408953722334004, + "grad_norm": 0.29284361004829407, + "learning_rate": 1.1365436937892082e-06, + "loss": 0.3176, + "step": 9578 + }, + { + "epoch": 2.40920523138833, + "grad_norm": 0.28661638498306274, + "learning_rate": 1.1356150159567664e-06, + "loss": 0.338, + "step": 9579 + }, + { + "epoch": 2.409456740442656, + "grad_norm": 0.2953725755214691, + "learning_rate": 1.1346866690847214e-06, + "loss": 0.3218, + "step": 9580 + }, + { + "epoch": 2.409708249496982, + "grad_norm": 0.30523139238357544, + "learning_rate": 1.133758653252583e-06, + "loss": 0.3014, + "step": 9581 + }, + { + "epoch": 2.4099597585513077, + "grad_norm": 0.2769249379634857, + "learning_rate": 1.1328309685398275e-06, + "loss": 0.3114, + "step": 9582 + }, + { + "epoch": 2.410211267605634, + "grad_norm": 0.2869117856025696, + "learning_rate": 1.1319036150259078e-06, + "loss": 0.339, + "step": 9583 + }, + { + "epoch": 2.41046277665996, + "grad_norm": 0.2989134192466736, + "learning_rate": 1.1309765927902439e-06, + "loss": 0.2997, + "step": 9584 + }, + { + "epoch": 2.4107142857142856, + "grad_norm": 0.29306572675704956, + "learning_rate": 1.1300499019122295e-06, + "loss": 0.321, + "step": 9585 + }, + { + "epoch": 2.410965794768612, + "grad_norm": 0.27259159088134766, + "learning_rate": 1.1291235424712328e-06, + "loss": 0.3093, + "step": 9586 + }, + { + "epoch": 2.4112173038229376, + "grad_norm": 0.3309045732021332, + "learning_rate": 1.1281975145465867e-06, + "loss": 0.316, + "step": 9587 + }, + { + "epoch": 2.4114688128772634, + "grad_norm": 0.3107389211654663, + "learning_rate": 1.1272718182176034e-06, + "loss": 0.3225, + "step": 9588 + }, + { + "epoch": 2.4117203219315897, + "grad_norm": 0.29425039887428284, + "learning_rate": 1.1263464535635594e-06, + "loss": 0.336, + "step": 9589 + }, + { + "epoch": 2.4119718309859155, + "grad_norm": 0.29913732409477234, + "learning_rate": 1.1254214206637099e-06, + "loss": 0.3135, + "step": 9590 + }, + { + "epoch": 2.4122233400402413, + "grad_norm": 0.2878449261188507, + "learning_rate": 1.1244967195972745e-06, + "loss": 0.3308, + "step": 9591 + }, + { + "epoch": 2.4124748490945676, + "grad_norm": 0.2862037122249603, + "learning_rate": 1.123572350443452e-06, + "loss": 0.3186, + "step": 9592 + }, + { + "epoch": 2.4127263581488934, + "grad_norm": 0.30815479159355164, + "learning_rate": 1.1226483132814048e-06, + "loss": 0.3305, + "step": 9593 + }, + { + "epoch": 2.412977867203219, + "grad_norm": 0.29880401492118835, + "learning_rate": 1.121724608190275e-06, + "loss": 0.3288, + "step": 9594 + }, + { + "epoch": 2.4132293762575454, + "grad_norm": 0.3055490553379059, + "learning_rate": 1.1208012352491681e-06, + "loss": 0.3273, + "step": 9595 + }, + { + "epoch": 2.413480885311871, + "grad_norm": 0.31553953886032104, + "learning_rate": 1.1198781945371673e-06, + "loss": 0.2943, + "step": 9596 + }, + { + "epoch": 2.413732394366197, + "grad_norm": 0.293268084526062, + "learning_rate": 1.118955486133327e-06, + "loss": 0.323, + "step": 9597 + }, + { + "epoch": 2.4139839034205233, + "grad_norm": 0.304035484790802, + "learning_rate": 1.1180331101166675e-06, + "loss": 0.3229, + "step": 9598 + }, + { + "epoch": 2.414235412474849, + "grad_norm": 0.2874673008918762, + "learning_rate": 1.1171110665661888e-06, + "loss": 0.3199, + "step": 9599 + }, + { + "epoch": 2.414486921529175, + "grad_norm": 0.2967674434185028, + "learning_rate": 1.1161893555608538e-06, + "loss": 0.3396, + "step": 9600 + }, + { + "epoch": 2.414738430583501, + "grad_norm": 0.3168094754219055, + "learning_rate": 1.1152679771796054e-06, + "loss": 0.3291, + "step": 9601 + }, + { + "epoch": 2.414989939637827, + "grad_norm": 0.3081880509853363, + "learning_rate": 1.1143469315013505e-06, + "loss": 0.3123, + "step": 9602 + }, + { + "epoch": 2.4152414486921527, + "grad_norm": 0.3019590377807617, + "learning_rate": 1.1134262186049732e-06, + "loss": 0.3103, + "step": 9603 + }, + { + "epoch": 2.415492957746479, + "grad_norm": 0.3133490979671478, + "learning_rate": 1.1125058385693255e-06, + "loss": 0.3362, + "step": 9604 + }, + { + "epoch": 2.415744466800805, + "grad_norm": 0.2901817262172699, + "learning_rate": 1.1115857914732332e-06, + "loss": 0.3293, + "step": 9605 + }, + { + "epoch": 2.4159959758551306, + "grad_norm": 0.2860255241394043, + "learning_rate": 1.1106660773954908e-06, + "loss": 0.3074, + "step": 9606 + }, + { + "epoch": 2.416247484909457, + "grad_norm": 0.29888466000556946, + "learning_rate": 1.109746696414868e-06, + "loss": 0.3458, + "step": 9607 + }, + { + "epoch": 2.4164989939637826, + "grad_norm": 0.30433139204978943, + "learning_rate": 1.1088276486101034e-06, + "loss": 0.3235, + "step": 9608 + }, + { + "epoch": 2.4167505030181085, + "grad_norm": 0.30524951219558716, + "learning_rate": 1.107908934059907e-06, + "loss": 0.3208, + "step": 9609 + }, + { + "epoch": 2.4170020120724347, + "grad_norm": 0.29082682728767395, + "learning_rate": 1.1069905528429631e-06, + "loss": 0.3234, + "step": 9610 + }, + { + "epoch": 2.4172535211267605, + "grad_norm": 0.2806370258331299, + "learning_rate": 1.1060725050379223e-06, + "loss": 0.3113, + "step": 9611 + }, + { + "epoch": 2.4175050301810863, + "grad_norm": 0.2872363328933716, + "learning_rate": 1.1051547907234122e-06, + "loss": 0.3013, + "step": 9612 + }, + { + "epoch": 2.4177565392354126, + "grad_norm": 0.2908620834350586, + "learning_rate": 1.1042374099780268e-06, + "loss": 0.3258, + "step": 9613 + }, + { + "epoch": 2.4180080482897384, + "grad_norm": 0.2825472354888916, + "learning_rate": 1.1033203628803369e-06, + "loss": 0.3147, + "step": 9614 + }, + { + "epoch": 2.418259557344064, + "grad_norm": 0.29791221022605896, + "learning_rate": 1.1024036495088792e-06, + "loss": 0.3144, + "step": 9615 + }, + { + "epoch": 2.4185110663983904, + "grad_norm": 0.3021547198295593, + "learning_rate": 1.1014872699421669e-06, + "loss": 0.3039, + "step": 9616 + }, + { + "epoch": 2.4187625754527162, + "grad_norm": 0.2768807113170624, + "learning_rate": 1.100571224258679e-06, + "loss": 0.3238, + "step": 9617 + }, + { + "epoch": 2.419014084507042, + "grad_norm": 0.29537469148635864, + "learning_rate": 1.099655512536872e-06, + "loss": 0.3087, + "step": 9618 + }, + { + "epoch": 2.4192655935613683, + "grad_norm": 0.28753364086151123, + "learning_rate": 1.0987401348551702e-06, + "loss": 0.3266, + "step": 9619 + }, + { + "epoch": 2.419517102615694, + "grad_norm": 0.30412012338638306, + "learning_rate": 1.097825091291969e-06, + "loss": 0.3196, + "step": 9620 + }, + { + "epoch": 2.41976861167002, + "grad_norm": 0.29906362295150757, + "learning_rate": 1.0969103819256377e-06, + "loss": 0.3135, + "step": 9621 + }, + { + "epoch": 2.420020120724346, + "grad_norm": 0.3171037435531616, + "learning_rate": 1.0959960068345139e-06, + "loss": 0.3056, + "step": 9622 + }, + { + "epoch": 2.420271629778672, + "grad_norm": 0.2972031831741333, + "learning_rate": 1.0950819660969092e-06, + "loss": 0.3344, + "step": 9623 + }, + { + "epoch": 2.4205231388329977, + "grad_norm": 0.29488030076026917, + "learning_rate": 1.094168259791104e-06, + "loss": 0.3244, + "step": 9624 + }, + { + "epoch": 2.420774647887324, + "grad_norm": 0.290930837392807, + "learning_rate": 1.0932548879953543e-06, + "loss": 0.3018, + "step": 9625 + }, + { + "epoch": 2.42102615694165, + "grad_norm": 0.29524073004722595, + "learning_rate": 1.0923418507878807e-06, + "loss": 0.3232, + "step": 9626 + }, + { + "epoch": 2.421277665995976, + "grad_norm": 0.30457374453544617, + "learning_rate": 1.091429148246882e-06, + "loss": 0.3264, + "step": 9627 + }, + { + "epoch": 2.421529175050302, + "grad_norm": 0.27748963236808777, + "learning_rate": 1.0905167804505263e-06, + "loss": 0.3311, + "step": 9628 + }, + { + "epoch": 2.4217806841046277, + "grad_norm": 0.29854872822761536, + "learning_rate": 1.0896047474769489e-06, + "loss": 0.3514, + "step": 9629 + }, + { + "epoch": 2.422032193158954, + "grad_norm": 0.3030698597431183, + "learning_rate": 1.088693049404263e-06, + "loss": 0.3006, + "step": 9630 + }, + { + "epoch": 2.4222837022132797, + "grad_norm": 0.31721991300582886, + "learning_rate": 1.0877816863105473e-06, + "loss": 0.3031, + "step": 9631 + }, + { + "epoch": 2.4225352112676055, + "grad_norm": 0.2863737642765045, + "learning_rate": 1.0868706582738563e-06, + "loss": 0.3057, + "step": 9632 + }, + { + "epoch": 2.4227867203219318, + "grad_norm": 0.303460031747818, + "learning_rate": 1.085959965372212e-06, + "loss": 0.3319, + "step": 9633 + }, + { + "epoch": 2.4230382293762576, + "grad_norm": 0.29836776852607727, + "learning_rate": 1.085049607683612e-06, + "loss": 0.3197, + "step": 9634 + }, + { + "epoch": 2.4232897384305834, + "grad_norm": 0.2951622009277344, + "learning_rate": 1.0841395852860194e-06, + "loss": 0.2886, + "step": 9635 + }, + { + "epoch": 2.4235412474849096, + "grad_norm": 0.29916274547576904, + "learning_rate": 1.0832298982573753e-06, + "loss": 0.3294, + "step": 9636 + }, + { + "epoch": 2.4237927565392354, + "grad_norm": 0.29172125458717346, + "learning_rate": 1.0823205466755858e-06, + "loss": 0.3011, + "step": 9637 + }, + { + "epoch": 2.4240442655935612, + "grad_norm": 0.28251975774765015, + "learning_rate": 1.0814115306185342e-06, + "loss": 0.3193, + "step": 9638 + }, + { + "epoch": 2.4242957746478875, + "grad_norm": 0.31814032793045044, + "learning_rate": 1.0805028501640686e-06, + "loss": 0.3236, + "step": 9639 + }, + { + "epoch": 2.4245472837022133, + "grad_norm": 0.313716858625412, + "learning_rate": 1.0795945053900153e-06, + "loss": 0.2972, + "step": 9640 + }, + { + "epoch": 2.424798792756539, + "grad_norm": 0.29684916138648987, + "learning_rate": 1.078686496374165e-06, + "loss": 0.3449, + "step": 9641 + }, + { + "epoch": 2.4250503018108653, + "grad_norm": 0.2841853201389313, + "learning_rate": 1.0777788231942859e-06, + "loss": 0.3192, + "step": 9642 + }, + { + "epoch": 2.425301810865191, + "grad_norm": 0.29728415608406067, + "learning_rate": 1.076871485928112e-06, + "loss": 0.3058, + "step": 9643 + }, + { + "epoch": 2.425553319919517, + "grad_norm": 0.29951390624046326, + "learning_rate": 1.075964484653354e-06, + "loss": 0.3084, + "step": 9644 + }, + { + "epoch": 2.425804828973843, + "grad_norm": 0.3010987937450409, + "learning_rate": 1.0750578194476875e-06, + "loss": 0.3049, + "step": 9645 + }, + { + "epoch": 2.426056338028169, + "grad_norm": 0.29705509543418884, + "learning_rate": 1.0741514903887657e-06, + "loss": 0.3206, + "step": 9646 + }, + { + "epoch": 2.426307847082495, + "grad_norm": 0.28296321630477905, + "learning_rate": 1.0732454975542079e-06, + "loss": 0.3216, + "step": 9647 + }, + { + "epoch": 2.426559356136821, + "grad_norm": 0.2731419503688812, + "learning_rate": 1.0723398410216085e-06, + "loss": 0.3335, + "step": 9648 + }, + { + "epoch": 2.426810865191147, + "grad_norm": 0.29805314540863037, + "learning_rate": 1.071434520868529e-06, + "loss": 0.3268, + "step": 9649 + }, + { + "epoch": 2.4270623742454727, + "grad_norm": 0.27815183997154236, + "learning_rate": 1.0705295371725066e-06, + "loss": 0.3374, + "step": 9650 + }, + { + "epoch": 2.427313883299799, + "grad_norm": 0.31619152426719666, + "learning_rate": 1.0696248900110461e-06, + "loss": 0.332, + "step": 9651 + }, + { + "epoch": 2.4275653923541247, + "grad_norm": 0.27347874641418457, + "learning_rate": 1.0687205794616262e-06, + "loss": 0.2954, + "step": 9652 + }, + { + "epoch": 2.4278169014084505, + "grad_norm": 0.323209285736084, + "learning_rate": 1.0678166056016936e-06, + "loss": 0.3199, + "step": 9653 + }, + { + "epoch": 2.4280684104627768, + "grad_norm": 0.2880942225456238, + "learning_rate": 1.0669129685086705e-06, + "loss": 0.3555, + "step": 9654 + }, + { + "epoch": 2.4283199195171026, + "grad_norm": 0.30394867062568665, + "learning_rate": 1.0660096682599453e-06, + "loss": 0.3058, + "step": 9655 + }, + { + "epoch": 2.4285714285714284, + "grad_norm": 0.29747632145881653, + "learning_rate": 1.0651067049328818e-06, + "loss": 0.324, + "step": 9656 + }, + { + "epoch": 2.4288229376257546, + "grad_norm": 0.305722177028656, + "learning_rate": 1.0642040786048113e-06, + "loss": 0.3026, + "step": 9657 + }, + { + "epoch": 2.4290744466800804, + "grad_norm": 0.30282852053642273, + "learning_rate": 1.0633017893530407e-06, + "loss": 0.3272, + "step": 9658 + }, + { + "epoch": 2.4293259557344067, + "grad_norm": 0.29805997014045715, + "learning_rate": 1.0623998372548423e-06, + "loss": 0.3306, + "step": 9659 + }, + { + "epoch": 2.4295774647887325, + "grad_norm": 0.2963772118091583, + "learning_rate": 1.0614982223874642e-06, + "loss": 0.3272, + "step": 9660 + }, + { + "epoch": 2.4298289738430583, + "grad_norm": 0.3043665587902069, + "learning_rate": 1.0605969448281257e-06, + "loss": 0.3119, + "step": 9661 + }, + { + "epoch": 2.4300804828973845, + "grad_norm": 0.2912227511405945, + "learning_rate": 1.0596960046540129e-06, + "loss": 0.3362, + "step": 9662 + }, + { + "epoch": 2.4303319919517103, + "grad_norm": 0.3193990886211395, + "learning_rate": 1.0587954019422874e-06, + "loss": 0.3238, + "step": 9663 + }, + { + "epoch": 2.430583501006036, + "grad_norm": 0.2979458272457123, + "learning_rate": 1.057895136770079e-06, + "loss": 0.3141, + "step": 9664 + }, + { + "epoch": 2.4308350100603624, + "grad_norm": 0.2856670916080475, + "learning_rate": 1.056995209214492e-06, + "loss": 0.3083, + "step": 9665 + }, + { + "epoch": 2.431086519114688, + "grad_norm": 0.3085869550704956, + "learning_rate": 1.0560956193525961e-06, + "loss": 0.3193, + "step": 9666 + }, + { + "epoch": 2.431338028169014, + "grad_norm": 0.30326932668685913, + "learning_rate": 1.0551963672614385e-06, + "loss": 0.3145, + "step": 9667 + }, + { + "epoch": 2.4315895372233403, + "grad_norm": 0.30177420377731323, + "learning_rate": 1.0542974530180327e-06, + "loss": 0.3422, + "step": 9668 + }, + { + "epoch": 2.431841046277666, + "grad_norm": 0.28768646717071533, + "learning_rate": 1.0533988766993668e-06, + "loss": 0.3213, + "step": 9669 + }, + { + "epoch": 2.432092555331992, + "grad_norm": 0.3230489194393158, + "learning_rate": 1.052500638382396e-06, + "loss": 0.3326, + "step": 9670 + }, + { + "epoch": 2.432344064386318, + "grad_norm": 0.3124891519546509, + "learning_rate": 1.0516027381440502e-06, + "loss": 0.3219, + "step": 9671 + }, + { + "epoch": 2.432595573440644, + "grad_norm": 0.2979585826396942, + "learning_rate": 1.0507051760612302e-06, + "loss": 0.304, + "step": 9672 + }, + { + "epoch": 2.4328470824949697, + "grad_norm": 0.2868764400482178, + "learning_rate": 1.0498079522108034e-06, + "loss": 0.349, + "step": 9673 + }, + { + "epoch": 2.433098591549296, + "grad_norm": 0.32363319396972656, + "learning_rate": 1.0489110666696144e-06, + "loss": 0.3378, + "step": 9674 + }, + { + "epoch": 2.433350100603622, + "grad_norm": 0.28327086567878723, + "learning_rate": 1.0480145195144736e-06, + "loss": 0.3041, + "step": 9675 + }, + { + "epoch": 2.4336016096579476, + "grad_norm": 0.30430731177330017, + "learning_rate": 1.0471183108221673e-06, + "loss": 0.3099, + "step": 9676 + }, + { + "epoch": 2.433853118712274, + "grad_norm": 0.3252510130405426, + "learning_rate": 1.0462224406694471e-06, + "loss": 0.3302, + "step": 9677 + }, + { + "epoch": 2.4341046277665996, + "grad_norm": 0.29767587780952454, + "learning_rate": 1.045326909133041e-06, + "loss": 0.2966, + "step": 9678 + }, + { + "epoch": 2.4343561368209254, + "grad_norm": 0.3064614534378052, + "learning_rate": 1.0444317162896433e-06, + "loss": 0.3123, + "step": 9679 + }, + { + "epoch": 2.4346076458752517, + "grad_norm": 0.30733683705329895, + "learning_rate": 1.0435368622159254e-06, + "loss": 0.2995, + "step": 9680 + }, + { + "epoch": 2.4348591549295775, + "grad_norm": 0.31988590955734253, + "learning_rate": 1.0426423469885216e-06, + "loss": 0.3176, + "step": 9681 + }, + { + "epoch": 2.4351106639839033, + "grad_norm": 0.29145556688308716, + "learning_rate": 1.0417481706840439e-06, + "loss": 0.3056, + "step": 9682 + }, + { + "epoch": 2.4353621730382295, + "grad_norm": 0.2926798462867737, + "learning_rate": 1.0408543333790738e-06, + "loss": 0.3149, + "step": 9683 + }, + { + "epoch": 2.4356136820925554, + "grad_norm": 0.2882416844367981, + "learning_rate": 1.0399608351501606e-06, + "loss": 0.3247, + "step": 9684 + }, + { + "epoch": 2.435865191146881, + "grad_norm": 0.3121548593044281, + "learning_rate": 1.0390676760738289e-06, + "loss": 0.336, + "step": 9685 + }, + { + "epoch": 2.4361167002012074, + "grad_norm": 0.2849773168563843, + "learning_rate": 1.0381748562265704e-06, + "loss": 0.3229, + "step": 9686 + }, + { + "epoch": 2.436368209255533, + "grad_norm": 0.30011287331581116, + "learning_rate": 1.037282375684851e-06, + "loss": 0.3302, + "step": 9687 + }, + { + "epoch": 2.436619718309859, + "grad_norm": 0.298657089471817, + "learning_rate": 1.0363902345251048e-06, + "loss": 0.3117, + "step": 9688 + }, + { + "epoch": 2.4368712273641853, + "grad_norm": 0.3116760551929474, + "learning_rate": 1.0354984328237399e-06, + "loss": 0.3293, + "step": 9689 + }, + { + "epoch": 2.437122736418511, + "grad_norm": 0.30939438939094543, + "learning_rate": 1.034606970657131e-06, + "loss": 0.3208, + "step": 9690 + }, + { + "epoch": 2.437374245472837, + "grad_norm": 0.3038461208343506, + "learning_rate": 1.0337158481016285e-06, + "loss": 0.3762, + "step": 9691 + }, + { + "epoch": 2.437625754527163, + "grad_norm": 0.2885994613170624, + "learning_rate": 1.0328250652335497e-06, + "loss": 0.3434, + "step": 9692 + }, + { + "epoch": 2.437877263581489, + "grad_norm": 0.28599095344543457, + "learning_rate": 1.031934622129186e-06, + "loss": 0.3283, + "step": 9693 + }, + { + "epoch": 2.4381287726358147, + "grad_norm": 0.2979143559932709, + "learning_rate": 1.0310445188647983e-06, + "loss": 0.3298, + "step": 9694 + }, + { + "epoch": 2.438380281690141, + "grad_norm": 0.300065815448761, + "learning_rate": 1.030154755516617e-06, + "loss": 0.3153, + "step": 9695 + }, + { + "epoch": 2.438631790744467, + "grad_norm": 0.29073888063430786, + "learning_rate": 1.029265332160847e-06, + "loss": 0.3157, + "step": 9696 + }, + { + "epoch": 2.4388832997987926, + "grad_norm": 0.28464123606681824, + "learning_rate": 1.0283762488736588e-06, + "loss": 0.2994, + "step": 9697 + }, + { + "epoch": 2.439134808853119, + "grad_norm": 0.299254447221756, + "learning_rate": 1.0274875057312001e-06, + "loss": 0.3058, + "step": 9698 + }, + { + "epoch": 2.4393863179074446, + "grad_norm": 0.29035767912864685, + "learning_rate": 1.0265991028095828e-06, + "loss": 0.3407, + "step": 9699 + }, + { + "epoch": 2.4396378269617705, + "grad_norm": 0.30013376474380493, + "learning_rate": 1.0257110401848963e-06, + "loss": 0.3296, + "step": 9700 + }, + { + "epoch": 2.4398893360160967, + "grad_norm": 0.30098024010658264, + "learning_rate": 1.0248233179331952e-06, + "loss": 0.3096, + "step": 9701 + }, + { + "epoch": 2.4401408450704225, + "grad_norm": 0.3151046633720398, + "learning_rate": 1.0239359361305091e-06, + "loss": 0.2879, + "step": 9702 + }, + { + "epoch": 2.4403923541247483, + "grad_norm": 0.273389607667923, + "learning_rate": 1.023048894852835e-06, + "loss": 0.3119, + "step": 9703 + }, + { + "epoch": 2.4406438631790746, + "grad_norm": 0.31045857071876526, + "learning_rate": 1.0221621941761428e-06, + "loss": 0.3154, + "step": 9704 + }, + { + "epoch": 2.4408953722334004, + "grad_norm": 0.27846378087997437, + "learning_rate": 1.0212758341763752e-06, + "loss": 0.3276, + "step": 9705 + }, + { + "epoch": 2.441146881287726, + "grad_norm": 0.3135693371295929, + "learning_rate": 1.02038981492944e-06, + "loss": 0.3199, + "step": 9706 + }, + { + "epoch": 2.4413983903420524, + "grad_norm": 0.30014142394065857, + "learning_rate": 1.0195041365112224e-06, + "loss": 0.3388, + "step": 9707 + }, + { + "epoch": 2.441649899396378, + "grad_norm": 0.30678755044937134, + "learning_rate": 1.0186187989975722e-06, + "loss": 0.3294, + "step": 9708 + }, + { + "epoch": 2.441901408450704, + "grad_norm": 0.2942301332950592, + "learning_rate": 1.0177338024643157e-06, + "loss": 0.3288, + "step": 9709 + }, + { + "epoch": 2.4421529175050303, + "grad_norm": 0.2993881404399872, + "learning_rate": 1.0168491469872444e-06, + "loss": 0.3219, + "step": 9710 + }, + { + "epoch": 2.442404426559356, + "grad_norm": 0.322645366191864, + "learning_rate": 1.0159648326421268e-06, + "loss": 0.2956, + "step": 9711 + }, + { + "epoch": 2.442655935613682, + "grad_norm": 0.2843986749649048, + "learning_rate": 1.0150808595046963e-06, + "loss": 0.3088, + "step": 9712 + }, + { + "epoch": 2.442907444668008, + "grad_norm": 0.2960312068462372, + "learning_rate": 1.0141972276506612e-06, + "loss": 0.3394, + "step": 9713 + }, + { + "epoch": 2.443158953722334, + "grad_norm": 0.27722686529159546, + "learning_rate": 1.013313937155697e-06, + "loss": 0.3286, + "step": 9714 + }, + { + "epoch": 2.4434104627766597, + "grad_norm": 0.31136950850486755, + "learning_rate": 1.012430988095454e-06, + "loss": 0.2942, + "step": 9715 + }, + { + "epoch": 2.443661971830986, + "grad_norm": 0.28949692845344543, + "learning_rate": 1.0115483805455522e-06, + "loss": 0.3309, + "step": 9716 + }, + { + "epoch": 2.443913480885312, + "grad_norm": 0.29630956053733826, + "learning_rate": 1.0106661145815783e-06, + "loss": 0.3099, + "step": 9717 + }, + { + "epoch": 2.4441649899396376, + "grad_norm": 0.3057538866996765, + "learning_rate": 1.0097841902790955e-06, + "loss": 0.3265, + "step": 9718 + }, + { + "epoch": 2.444416498993964, + "grad_norm": 0.2971940338611603, + "learning_rate": 1.0089026077136333e-06, + "loss": 0.3257, + "step": 9719 + }, + { + "epoch": 2.4446680080482897, + "grad_norm": 0.30409669876098633, + "learning_rate": 1.0080213669606958e-06, + "loss": 0.3457, + "step": 9720 + }, + { + "epoch": 2.4449195171026155, + "grad_norm": 0.30436453223228455, + "learning_rate": 1.007140468095753e-06, + "loss": 0.3052, + "step": 9721 + }, + { + "epoch": 2.4451710261569417, + "grad_norm": 0.28796908259391785, + "learning_rate": 1.006259911194251e-06, + "loss": 0.3238, + "step": 9722 + }, + { + "epoch": 2.4454225352112675, + "grad_norm": 0.3076968491077423, + "learning_rate": 1.005379696331602e-06, + "loss": 0.3161, + "step": 9723 + }, + { + "epoch": 2.4456740442655933, + "grad_norm": 0.2974853813648224, + "learning_rate": 1.0044998235831927e-06, + "loss": 0.3177, + "step": 9724 + }, + { + "epoch": 2.4459255533199196, + "grad_norm": 0.2881965637207031, + "learning_rate": 1.0036202930243766e-06, + "loss": 0.3106, + "step": 9725 + }, + { + "epoch": 2.4461770623742454, + "grad_norm": 0.2931132912635803, + "learning_rate": 1.0027411047304808e-06, + "loss": 0.319, + "step": 9726 + }, + { + "epoch": 2.4464285714285716, + "grad_norm": 0.2949577569961548, + "learning_rate": 1.001862258776804e-06, + "loss": 0.3085, + "step": 9727 + }, + { + "epoch": 2.4466800804828974, + "grad_norm": 0.2667643129825592, + "learning_rate": 1.0009837552386114e-06, + "loss": 0.3312, + "step": 9728 + }, + { + "epoch": 2.4469315895372232, + "grad_norm": 0.30988672375679016, + "learning_rate": 1.0001055941911437e-06, + "loss": 0.3215, + "step": 9729 + }, + { + "epoch": 2.4471830985915495, + "grad_norm": 0.3098249137401581, + "learning_rate": 9.992277757096069e-07, + "loss": 0.303, + "step": 9730 + }, + { + "epoch": 2.4474346076458753, + "grad_norm": 0.27878841757774353, + "learning_rate": 9.98350299869184e-07, + "loss": 0.322, + "step": 9731 + }, + { + "epoch": 2.447686116700201, + "grad_norm": 0.306535005569458, + "learning_rate": 9.974731667450227e-07, + "loss": 0.3044, + "step": 9732 + }, + { + "epoch": 2.4479376257545273, + "grad_norm": 0.3038245737552643, + "learning_rate": 9.965963764122455e-07, + "loss": 0.342, + "step": 9733 + }, + { + "epoch": 2.448189134808853, + "grad_norm": 0.2671661972999573, + "learning_rate": 9.957199289459424e-07, + "loss": 0.3162, + "step": 9734 + }, + { + "epoch": 2.448440643863179, + "grad_norm": 0.3104800879955292, + "learning_rate": 9.948438244211784e-07, + "loss": 0.3096, + "step": 9735 + }, + { + "epoch": 2.448692152917505, + "grad_norm": 0.3143140971660614, + "learning_rate": 9.939680629129828e-07, + "loss": 0.318, + "step": 9736 + }, + { + "epoch": 2.448943661971831, + "grad_norm": 0.2797510325908661, + "learning_rate": 9.930926444963612e-07, + "loss": 0.311, + "step": 9737 + }, + { + "epoch": 2.449195171026157, + "grad_norm": 0.2918142080307007, + "learning_rate": 9.922175692462887e-07, + "loss": 0.3394, + "step": 9738 + }, + { + "epoch": 2.449446680080483, + "grad_norm": 0.3032245337963104, + "learning_rate": 9.913428372377077e-07, + "loss": 0.3326, + "step": 9739 + }, + { + "epoch": 2.449698189134809, + "grad_norm": 0.2985784411430359, + "learning_rate": 9.904684485455358e-07, + "loss": 0.3224, + "step": 9740 + }, + { + "epoch": 2.4499496981891347, + "grad_norm": 0.30142176151275635, + "learning_rate": 9.895944032446563e-07, + "loss": 0.335, + "step": 9741 + }, + { + "epoch": 2.450201207243461, + "grad_norm": 0.3030637204647064, + "learning_rate": 9.887207014099288e-07, + "loss": 0.3076, + "step": 9742 + }, + { + "epoch": 2.4504527162977867, + "grad_norm": 0.2667844593524933, + "learning_rate": 9.878473431161767e-07, + "loss": 0.3327, + "step": 9743 + }, + { + "epoch": 2.4507042253521125, + "grad_norm": 0.3067658245563507, + "learning_rate": 9.869743284382016e-07, + "loss": 0.3115, + "step": 9744 + }, + { + "epoch": 2.4509557344064388, + "grad_norm": 0.2902940511703491, + "learning_rate": 9.861016574507686e-07, + "loss": 0.3084, + "step": 9745 + }, + { + "epoch": 2.4512072434607646, + "grad_norm": 0.28992483019828796, + "learning_rate": 9.852293302286186e-07, + "loss": 0.3156, + "step": 9746 + }, + { + "epoch": 2.4514587525150904, + "grad_norm": 0.3165602385997772, + "learning_rate": 9.843573468464596e-07, + "loss": 0.3362, + "step": 9747 + }, + { + "epoch": 2.4517102615694166, + "grad_norm": 0.2829355299472809, + "learning_rate": 9.834857073789716e-07, + "loss": 0.3233, + "step": 9748 + }, + { + "epoch": 2.4519617706237424, + "grad_norm": 0.3075374960899353, + "learning_rate": 9.826144119008068e-07, + "loss": 0.3281, + "step": 9749 + }, + { + "epoch": 2.4522132796780682, + "grad_norm": 0.32207030057907104, + "learning_rate": 9.817434604865833e-07, + "loss": 0.332, + "step": 9750 + }, + { + "epoch": 2.4524647887323945, + "grad_norm": 0.29507938027381897, + "learning_rate": 9.808728532108963e-07, + "loss": 0.3202, + "step": 9751 + }, + { + "epoch": 2.4527162977867203, + "grad_norm": 0.29577136039733887, + "learning_rate": 9.80002590148304e-07, + "loss": 0.349, + "step": 9752 + }, + { + "epoch": 2.452967806841046, + "grad_norm": 0.2732732594013214, + "learning_rate": 9.791326713733424e-07, + "loss": 0.3364, + "step": 9753 + }, + { + "epoch": 2.4532193158953723, + "grad_norm": 0.2873142957687378, + "learning_rate": 9.782630969605116e-07, + "loss": 0.3449, + "step": 9754 + }, + { + "epoch": 2.453470824949698, + "grad_norm": 0.28171807527542114, + "learning_rate": 9.77393866984288e-07, + "loss": 0.3179, + "step": 9755 + }, + { + "epoch": 2.453722334004024, + "grad_norm": 0.28104138374328613, + "learning_rate": 9.76524981519113e-07, + "loss": 0.3137, + "step": 9756 + }, + { + "epoch": 2.45397384305835, + "grad_norm": 0.3065677881240845, + "learning_rate": 9.756564406394042e-07, + "loss": 0.3079, + "step": 9757 + }, + { + "epoch": 2.454225352112676, + "grad_norm": 0.2809961438179016, + "learning_rate": 9.747882444195434e-07, + "loss": 0.3275, + "step": 9758 + }, + { + "epoch": 2.4544768611670023, + "grad_norm": 0.2860454022884369, + "learning_rate": 9.739203929338892e-07, + "loss": 0.3225, + "step": 9759 + }, + { + "epoch": 2.454728370221328, + "grad_norm": 0.29092496633529663, + "learning_rate": 9.730528862567645e-07, + "loss": 0.3164, + "step": 9760 + }, + { + "epoch": 2.454979879275654, + "grad_norm": 0.31237539649009705, + "learning_rate": 9.721857244624695e-07, + "loss": 0.3227, + "step": 9761 + }, + { + "epoch": 2.45523138832998, + "grad_norm": 0.28859943151474, + "learning_rate": 9.713189076252676e-07, + "loss": 0.3339, + "step": 9762 + }, + { + "epoch": 2.455482897384306, + "grad_norm": 0.2893824875354767, + "learning_rate": 9.704524358193996e-07, + "loss": 0.3166, + "step": 9763 + }, + { + "epoch": 2.4557344064386317, + "grad_norm": 0.2801450788974762, + "learning_rate": 9.6958630911907e-07, + "loss": 0.3077, + "step": 9764 + }, + { + "epoch": 2.455985915492958, + "grad_norm": 0.2814953029155731, + "learning_rate": 9.68720527598459e-07, + "loss": 0.3253, + "step": 9765 + }, + { + "epoch": 2.4562374245472838, + "grad_norm": 0.2909027338027954, + "learning_rate": 9.678550913317169e-07, + "loss": 0.3334, + "step": 9766 + }, + { + "epoch": 2.4564889336016096, + "grad_norm": 0.3029521703720093, + "learning_rate": 9.669900003929595e-07, + "loss": 0.321, + "step": 9767 + }, + { + "epoch": 2.456740442655936, + "grad_norm": 0.2950665056705475, + "learning_rate": 9.661252548562794e-07, + "loss": 0.3358, + "step": 9768 + }, + { + "epoch": 2.4569919517102616, + "grad_norm": 0.2943587005138397, + "learning_rate": 9.652608547957343e-07, + "loss": 0.3022, + "step": 9769 + }, + { + "epoch": 2.4572434607645874, + "grad_norm": 0.2907446622848511, + "learning_rate": 9.643968002853566e-07, + "loss": 0.3282, + "step": 9770 + }, + { + "epoch": 2.4574949698189137, + "grad_norm": 0.2860673666000366, + "learning_rate": 9.635330913991453e-07, + "loss": 0.3258, + "step": 9771 + }, + { + "epoch": 2.4577464788732395, + "grad_norm": 0.2849615514278412, + "learning_rate": 9.626697282110743e-07, + "loss": 0.3418, + "step": 9772 + }, + { + "epoch": 2.4579979879275653, + "grad_norm": 0.2882292866706848, + "learning_rate": 9.61806710795082e-07, + "loss": 0.3055, + "step": 9773 + }, + { + "epoch": 2.4582494969818915, + "grad_norm": 0.28438448905944824, + "learning_rate": 9.609440392250829e-07, + "loss": 0.3548, + "step": 9774 + }, + { + "epoch": 2.4585010060362174, + "grad_norm": 0.28415539860725403, + "learning_rate": 9.600817135749579e-07, + "loss": 0.3328, + "step": 9775 + }, + { + "epoch": 2.458752515090543, + "grad_norm": 0.2894395887851715, + "learning_rate": 9.592197339185617e-07, + "loss": 0.3477, + "step": 9776 + }, + { + "epoch": 2.4590040241448694, + "grad_norm": 0.2790740728378296, + "learning_rate": 9.583581003297148e-07, + "loss": 0.326, + "step": 9777 + }, + { + "epoch": 2.459255533199195, + "grad_norm": 0.29626601934432983, + "learning_rate": 9.574968128822138e-07, + "loss": 0.3037, + "step": 9778 + }, + { + "epoch": 2.459507042253521, + "grad_norm": 0.29274749755859375, + "learning_rate": 9.566358716498192e-07, + "loss": 0.3609, + "step": 9779 + }, + { + "epoch": 2.4597585513078473, + "grad_norm": 0.2905212640762329, + "learning_rate": 9.557752767062683e-07, + "loss": 0.3246, + "step": 9780 + }, + { + "epoch": 2.460010060362173, + "grad_norm": 0.2953580915927887, + "learning_rate": 9.549150281252633e-07, + "loss": 0.3493, + "step": 9781 + }, + { + "epoch": 2.460261569416499, + "grad_norm": 0.2971348464488983, + "learning_rate": 9.540551259804814e-07, + "loss": 0.3454, + "step": 9782 + }, + { + "epoch": 2.460513078470825, + "grad_norm": 0.2992364168167114, + "learning_rate": 9.531955703455654e-07, + "loss": 0.3294, + "step": 9783 + }, + { + "epoch": 2.460764587525151, + "grad_norm": 0.3017844259738922, + "learning_rate": 9.523363612941333e-07, + "loss": 0.3127, + "step": 9784 + }, + { + "epoch": 2.4610160965794767, + "grad_norm": 0.2864784300327301, + "learning_rate": 9.514774988997683e-07, + "loss": 0.3399, + "step": 9785 + }, + { + "epoch": 2.461267605633803, + "grad_norm": 0.29324662685394287, + "learning_rate": 9.506189832360296e-07, + "loss": 0.3104, + "step": 9786 + }, + { + "epoch": 2.461519114688129, + "grad_norm": 0.30676597356796265, + "learning_rate": 9.497608143764403e-07, + "loss": 0.298, + "step": 9787 + }, + { + "epoch": 2.4617706237424546, + "grad_norm": 0.2943531274795532, + "learning_rate": 9.489029923945009e-07, + "loss": 0.3265, + "step": 9788 + }, + { + "epoch": 2.462022132796781, + "grad_norm": 0.30594533681869507, + "learning_rate": 9.480455173636754e-07, + "loss": 0.3198, + "step": 9789 + }, + { + "epoch": 2.4622736418511066, + "grad_norm": 0.2777751684188843, + "learning_rate": 9.471883893574019e-07, + "loss": 0.3289, + "step": 9790 + }, + { + "epoch": 2.4625251509054324, + "grad_norm": 0.3079993426799774, + "learning_rate": 9.463316084490903e-07, + "loss": 0.3278, + "step": 9791 + }, + { + "epoch": 2.4627766599597587, + "grad_norm": 0.29290157556533813, + "learning_rate": 9.454751747121149e-07, + "loss": 0.3278, + "step": 9792 + }, + { + "epoch": 2.4630281690140845, + "grad_norm": 0.2852899432182312, + "learning_rate": 9.446190882198275e-07, + "loss": 0.3066, + "step": 9793 + }, + { + "epoch": 2.4632796780684103, + "grad_norm": 0.2716057300567627, + "learning_rate": 9.437633490455434e-07, + "loss": 0.3427, + "step": 9794 + }, + { + "epoch": 2.4635311871227366, + "grad_norm": 0.2801603674888611, + "learning_rate": 9.429079572625543e-07, + "loss": 0.331, + "step": 9795 + }, + { + "epoch": 2.4637826961770624, + "grad_norm": 0.2798592150211334, + "learning_rate": 9.420529129441159e-07, + "loss": 0.3137, + "step": 9796 + }, + { + "epoch": 2.464034205231388, + "grad_norm": 0.2886095345020294, + "learning_rate": 9.411982161634603e-07, + "loss": 0.2978, + "step": 9797 + }, + { + "epoch": 2.4642857142857144, + "grad_norm": 0.31059303879737854, + "learning_rate": 9.403438669937848e-07, + "loss": 0.3069, + "step": 9798 + }, + { + "epoch": 2.46453722334004, + "grad_norm": 0.2892918884754181, + "learning_rate": 9.394898655082607e-07, + "loss": 0.3412, + "step": 9799 + }, + { + "epoch": 2.464788732394366, + "grad_norm": 0.29814252257347107, + "learning_rate": 9.386362117800262e-07, + "loss": 0.313, + "step": 9800 + }, + { + "epoch": 2.4650402414486923, + "grad_norm": 0.30424758791923523, + "learning_rate": 9.377829058821924e-07, + "loss": 0.3106, + "step": 9801 + }, + { + "epoch": 2.465291750503018, + "grad_norm": 0.3049749732017517, + "learning_rate": 9.369299478878408e-07, + "loss": 0.3122, + "step": 9802 + }, + { + "epoch": 2.465543259557344, + "grad_norm": 0.2787761092185974, + "learning_rate": 9.360773378700194e-07, + "loss": 0.319, + "step": 9803 + }, + { + "epoch": 2.46579476861167, + "grad_norm": 0.3029969334602356, + "learning_rate": 9.352250759017517e-07, + "loss": 0.3046, + "step": 9804 + }, + { + "epoch": 2.466046277665996, + "grad_norm": 0.2986536920070648, + "learning_rate": 9.343731620560254e-07, + "loss": 0.3045, + "step": 9805 + }, + { + "epoch": 2.4662977867203217, + "grad_norm": 0.3007206618785858, + "learning_rate": 9.335215964058047e-07, + "loss": 0.3246, + "step": 9806 + }, + { + "epoch": 2.466549295774648, + "grad_norm": 0.31535202264785767, + "learning_rate": 9.326703790240183e-07, + "loss": 0.3106, + "step": 9807 + }, + { + "epoch": 2.466800804828974, + "grad_norm": 0.29556044936180115, + "learning_rate": 9.318195099835697e-07, + "loss": 0.3332, + "step": 9808 + }, + { + "epoch": 2.4670523138832996, + "grad_norm": 0.2860323190689087, + "learning_rate": 9.309689893573287e-07, + "loss": 0.3225, + "step": 9809 + }, + { + "epoch": 2.467303822937626, + "grad_norm": 0.3032556176185608, + "learning_rate": 9.301188172181391e-07, + "loss": 0.2963, + "step": 9810 + }, + { + "epoch": 2.4675553319919517, + "grad_norm": 0.29713839292526245, + "learning_rate": 9.292689936388106e-07, + "loss": 0.327, + "step": 9811 + }, + { + "epoch": 2.4678068410462775, + "grad_norm": 0.2727225124835968, + "learning_rate": 9.284195186921263e-07, + "loss": 0.3128, + "step": 9812 + }, + { + "epoch": 2.4680583501006037, + "grad_norm": 0.2951027452945709, + "learning_rate": 9.2757039245084e-07, + "loss": 0.3165, + "step": 9813 + }, + { + "epoch": 2.4683098591549295, + "grad_norm": 0.31366926431655884, + "learning_rate": 9.267216149876712e-07, + "loss": 0.3337, + "step": 9814 + }, + { + "epoch": 2.4685613682092553, + "grad_norm": 0.30813443660736084, + "learning_rate": 9.258731863753145e-07, + "loss": 0.3278, + "step": 9815 + }, + { + "epoch": 2.4688128772635816, + "grad_norm": 0.30502599477767944, + "learning_rate": 9.250251066864308e-07, + "loss": 0.3154, + "step": 9816 + }, + { + "epoch": 2.4690643863179074, + "grad_norm": 0.2972979247570038, + "learning_rate": 9.241773759936551e-07, + "loss": 0.3481, + "step": 9817 + }, + { + "epoch": 2.469315895372233, + "grad_norm": 0.32507577538490295, + "learning_rate": 9.233299943695878e-07, + "loss": 0.3105, + "step": 9818 + }, + { + "epoch": 2.4695674044265594, + "grad_norm": 0.30700263381004333, + "learning_rate": 9.224829618868037e-07, + "loss": 0.3226, + "step": 9819 + }, + { + "epoch": 2.4698189134808852, + "grad_norm": 0.3043920397758484, + "learning_rate": 9.21636278617844e-07, + "loss": 0.3128, + "step": 9820 + }, + { + "epoch": 2.470070422535211, + "grad_norm": 0.2863234579563141, + "learning_rate": 9.207899446352242e-07, + "loss": 0.3091, + "step": 9821 + }, + { + "epoch": 2.4703219315895373, + "grad_norm": 0.3094150722026825, + "learning_rate": 9.199439600114247e-07, + "loss": 0.3213, + "step": 9822 + }, + { + "epoch": 2.470573440643863, + "grad_norm": 0.2926952838897705, + "learning_rate": 9.190983248189006e-07, + "loss": 0.3025, + "step": 9823 + }, + { + "epoch": 2.470824949698189, + "grad_norm": 0.3109200596809387, + "learning_rate": 9.18253039130076e-07, + "loss": 0.3124, + "step": 9824 + }, + { + "epoch": 2.471076458752515, + "grad_norm": 0.32803231477737427, + "learning_rate": 9.174081030173421e-07, + "loss": 0.2898, + "step": 9825 + }, + { + "epoch": 2.471327967806841, + "grad_norm": 0.30386537313461304, + "learning_rate": 9.165635165530645e-07, + "loss": 0.3086, + "step": 9826 + }, + { + "epoch": 2.471579476861167, + "grad_norm": 0.27869728207588196, + "learning_rate": 9.157192798095748e-07, + "loss": 0.341, + "step": 9827 + }, + { + "epoch": 2.471830985915493, + "grad_norm": 0.2737056612968445, + "learning_rate": 9.148753928591791e-07, + "loss": 0.318, + "step": 9828 + }, + { + "epoch": 2.472082494969819, + "grad_norm": 0.3147308826446533, + "learning_rate": 9.140318557741479e-07, + "loss": 0.309, + "step": 9829 + }, + { + "epoch": 2.472334004024145, + "grad_norm": 0.324812650680542, + "learning_rate": 9.131886686267277e-07, + "loss": 0.3185, + "step": 9830 + }, + { + "epoch": 2.472585513078471, + "grad_norm": 0.29894959926605225, + "learning_rate": 9.123458314891304e-07, + "loss": 0.3313, + "step": 9831 + }, + { + "epoch": 2.4728370221327967, + "grad_norm": 0.278921514749527, + "learning_rate": 9.115033444335408e-07, + "loss": 0.3172, + "step": 9832 + }, + { + "epoch": 2.473088531187123, + "grad_norm": 0.27208349108695984, + "learning_rate": 9.106612075321114e-07, + "loss": 0.3138, + "step": 9833 + }, + { + "epoch": 2.4733400402414487, + "grad_norm": 0.27421247959136963, + "learning_rate": 9.098194208569666e-07, + "loss": 0.3112, + "step": 9834 + }, + { + "epoch": 2.4735915492957745, + "grad_norm": 0.3077579736709595, + "learning_rate": 9.089779844802016e-07, + "loss": 0.3335, + "step": 9835 + }, + { + "epoch": 2.4738430583501008, + "grad_norm": 0.2903458774089813, + "learning_rate": 9.081368984738781e-07, + "loss": 0.3324, + "step": 9836 + }, + { + "epoch": 2.4740945674044266, + "grad_norm": 0.2951434254646301, + "learning_rate": 9.072961629100313e-07, + "loss": 0.3341, + "step": 9837 + }, + { + "epoch": 2.4743460764587524, + "grad_norm": 0.28142881393432617, + "learning_rate": 9.064557778606631e-07, + "loss": 0.3054, + "step": 9838 + }, + { + "epoch": 2.4745975855130786, + "grad_norm": 0.2758212387561798, + "learning_rate": 9.056157433977497e-07, + "loss": 0.3227, + "step": 9839 + }, + { + "epoch": 2.4748490945674044, + "grad_norm": 0.2981395125389099, + "learning_rate": 9.04776059593232e-07, + "loss": 0.3408, + "step": 9840 + }, + { + "epoch": 2.4751006036217302, + "grad_norm": 0.2696145176887512, + "learning_rate": 9.039367265190268e-07, + "loss": 0.3244, + "step": 9841 + }, + { + "epoch": 2.4753521126760565, + "grad_norm": 0.2982428967952728, + "learning_rate": 9.030977442470146e-07, + "loss": 0.3094, + "step": 9842 + }, + { + "epoch": 2.4756036217303823, + "grad_norm": 0.2985045611858368, + "learning_rate": 9.022591128490521e-07, + "loss": 0.3323, + "step": 9843 + }, + { + "epoch": 2.475855130784708, + "grad_norm": 0.2910267114639282, + "learning_rate": 9.014208323969598e-07, + "loss": 0.3386, + "step": 9844 + }, + { + "epoch": 2.4761066398390343, + "grad_norm": 0.290507972240448, + "learning_rate": 9.005829029625324e-07, + "loss": 0.3003, + "step": 9845 + }, + { + "epoch": 2.47635814889336, + "grad_norm": 0.2682448923587799, + "learning_rate": 8.997453246175347e-07, + "loss": 0.3272, + "step": 9846 + }, + { + "epoch": 2.476609657947686, + "grad_norm": 0.2868577241897583, + "learning_rate": 8.989080974336972e-07, + "loss": 0.3217, + "step": 9847 + }, + { + "epoch": 2.476861167002012, + "grad_norm": 0.28790926933288574, + "learning_rate": 8.98071221482727e-07, + "loss": 0.3042, + "step": 9848 + }, + { + "epoch": 2.477112676056338, + "grad_norm": 0.29729732871055603, + "learning_rate": 8.972346968362932e-07, + "loss": 0.304, + "step": 9849 + }, + { + "epoch": 2.477364185110664, + "grad_norm": 0.28235840797424316, + "learning_rate": 8.963985235660422e-07, + "loss": 0.325, + "step": 9850 + }, + { + "epoch": 2.47761569416499, + "grad_norm": 0.2943761646747589, + "learning_rate": 8.955627017435841e-07, + "loss": 0.3345, + "step": 9851 + }, + { + "epoch": 2.477867203219316, + "grad_norm": 0.29644346237182617, + "learning_rate": 8.947272314405048e-07, + "loss": 0.326, + "step": 9852 + }, + { + "epoch": 2.4781187122736417, + "grad_norm": 0.29010266065597534, + "learning_rate": 8.938921127283545e-07, + "loss": 0.337, + "step": 9853 + }, + { + "epoch": 2.478370221327968, + "grad_norm": 0.31179898977279663, + "learning_rate": 8.930573456786584e-07, + "loss": 0.3329, + "step": 9854 + }, + { + "epoch": 2.4786217303822937, + "grad_norm": 0.30490022897720337, + "learning_rate": 8.922229303629059e-07, + "loss": 0.3509, + "step": 9855 + }, + { + "epoch": 2.4788732394366195, + "grad_norm": 0.2934214472770691, + "learning_rate": 8.913888668525616e-07, + "loss": 0.3173, + "step": 9856 + }, + { + "epoch": 2.4791247484909458, + "grad_norm": 0.29519298672676086, + "learning_rate": 8.905551552190589e-07, + "loss": 0.3348, + "step": 9857 + }, + { + "epoch": 2.4793762575452716, + "grad_norm": 0.2930413484573364, + "learning_rate": 8.897217955337967e-07, + "loss": 0.3564, + "step": 9858 + }, + { + "epoch": 2.479627766599598, + "grad_norm": 0.2853700816631317, + "learning_rate": 8.888887878681507e-07, + "loss": 0.3438, + "step": 9859 + }, + { + "epoch": 2.4798792756539236, + "grad_norm": 0.30569860339164734, + "learning_rate": 8.880561322934595e-07, + "loss": 0.3153, + "step": 9860 + }, + { + "epoch": 2.4801307847082494, + "grad_norm": 0.30227360129356384, + "learning_rate": 8.872238288810375e-07, + "loss": 0.3207, + "step": 9861 + }, + { + "epoch": 2.4803822937625757, + "grad_norm": 0.2963835895061493, + "learning_rate": 8.86391877702164e-07, + "loss": 0.3277, + "step": 9862 + }, + { + "epoch": 2.4806338028169015, + "grad_norm": 0.29063740372657776, + "learning_rate": 8.855602788280926e-07, + "loss": 0.3126, + "step": 9863 + }, + { + "epoch": 2.4808853118712273, + "grad_norm": 0.2818237245082855, + "learning_rate": 8.847290323300423e-07, + "loss": 0.3289, + "step": 9864 + }, + { + "epoch": 2.4811368209255535, + "grad_norm": 0.2938326299190521, + "learning_rate": 8.838981382792067e-07, + "loss": 0.3429, + "step": 9865 + }, + { + "epoch": 2.4813883299798793, + "grad_norm": 0.30507832765579224, + "learning_rate": 8.830675967467439e-07, + "loss": 0.3219, + "step": 9866 + }, + { + "epoch": 2.481639839034205, + "grad_norm": 0.2869739234447479, + "learning_rate": 8.822374078037859e-07, + "loss": 0.305, + "step": 9867 + }, + { + "epoch": 2.4818913480885314, + "grad_norm": 0.3017198145389557, + "learning_rate": 8.814075715214348e-07, + "loss": 0.3455, + "step": 9868 + }, + { + "epoch": 2.482142857142857, + "grad_norm": 0.2792239189147949, + "learning_rate": 8.805780879707582e-07, + "loss": 0.3077, + "step": 9869 + }, + { + "epoch": 2.482394366197183, + "grad_norm": 0.2923182249069214, + "learning_rate": 8.797489572227985e-07, + "loss": 0.3207, + "step": 9870 + }, + { + "epoch": 2.4826458752515093, + "grad_norm": 0.3037014901638031, + "learning_rate": 8.78920179348563e-07, + "loss": 0.3091, + "step": 9871 + }, + { + "epoch": 2.482897384305835, + "grad_norm": 0.312212735414505, + "learning_rate": 8.780917544190337e-07, + "loss": 0.3114, + "step": 9872 + }, + { + "epoch": 2.483148893360161, + "grad_norm": 0.29248368740081787, + "learning_rate": 8.772636825051584e-07, + "loss": 0.3165, + "step": 9873 + }, + { + "epoch": 2.483400402414487, + "grad_norm": 0.291723370552063, + "learning_rate": 8.76435963677858e-07, + "loss": 0.3269, + "step": 9874 + }, + { + "epoch": 2.483651911468813, + "grad_norm": 0.26432013511657715, + "learning_rate": 8.756085980080193e-07, + "loss": 0.3193, + "step": 9875 + }, + { + "epoch": 2.4839034205231387, + "grad_norm": 0.2941466271877289, + "learning_rate": 8.747815855665026e-07, + "loss": 0.3385, + "step": 9876 + }, + { + "epoch": 2.484154929577465, + "grad_norm": 0.2962441146373749, + "learning_rate": 8.739549264241353e-07, + "loss": 0.3287, + "step": 9877 + }, + { + "epoch": 2.484406438631791, + "grad_norm": 0.29611530900001526, + "learning_rate": 8.731286206517158e-07, + "loss": 0.3197, + "step": 9878 + }, + { + "epoch": 2.4846579476861166, + "grad_norm": 0.2853570878505707, + "learning_rate": 8.723026683200136e-07, + "loss": 0.3227, + "step": 9879 + }, + { + "epoch": 2.484909456740443, + "grad_norm": 0.3189356029033661, + "learning_rate": 8.714770694997637e-07, + "loss": 0.351, + "step": 9880 + }, + { + "epoch": 2.4851609657947686, + "grad_norm": 0.2843788266181946, + "learning_rate": 8.706518242616762e-07, + "loss": 0.357, + "step": 9881 + }, + { + "epoch": 2.4854124748490944, + "grad_norm": 0.2974276542663574, + "learning_rate": 8.698269326764258e-07, + "loss": 0.3195, + "step": 9882 + }, + { + "epoch": 2.4856639839034207, + "grad_norm": 0.28935566544532776, + "learning_rate": 8.690023948146614e-07, + "loss": 0.3518, + "step": 9883 + }, + { + "epoch": 2.4859154929577465, + "grad_norm": 0.29735007882118225, + "learning_rate": 8.681782107469971e-07, + "loss": 0.3126, + "step": 9884 + }, + { + "epoch": 2.4861670020120723, + "grad_norm": 0.29163575172424316, + "learning_rate": 8.673543805440216e-07, + "loss": 0.3077, + "step": 9885 + }, + { + "epoch": 2.4864185110663986, + "grad_norm": 0.29763737320899963, + "learning_rate": 8.665309042762888e-07, + "loss": 0.3097, + "step": 9886 + }, + { + "epoch": 2.4866700201207244, + "grad_norm": 0.3058236539363861, + "learning_rate": 8.657077820143262e-07, + "loss": 0.296, + "step": 9887 + }, + { + "epoch": 2.48692152917505, + "grad_norm": 0.2766462564468384, + "learning_rate": 8.648850138286263e-07, + "loss": 0.3218, + "step": 9888 + }, + { + "epoch": 2.4871730382293764, + "grad_norm": 0.3000984489917755, + "learning_rate": 8.640625997896579e-07, + "loss": 0.3407, + "step": 9889 + }, + { + "epoch": 2.487424547283702, + "grad_norm": 0.3003200888633728, + "learning_rate": 8.632405399678518e-07, + "loss": 0.3151, + "step": 9890 + }, + { + "epoch": 2.487676056338028, + "grad_norm": 0.288412481546402, + "learning_rate": 8.624188344336148e-07, + "loss": 0.3449, + "step": 9891 + }, + { + "epoch": 2.4879275653923543, + "grad_norm": 0.3167831599712372, + "learning_rate": 8.615974832573193e-07, + "loss": 0.3157, + "step": 9892 + }, + { + "epoch": 2.48817907444668, + "grad_norm": 0.27706781029701233, + "learning_rate": 8.607764865093104e-07, + "loss": 0.3028, + "step": 9893 + }, + { + "epoch": 2.488430583501006, + "grad_norm": 0.2621653079986572, + "learning_rate": 8.599558442598998e-07, + "loss": 0.3303, + "step": 9894 + }, + { + "epoch": 2.488682092555332, + "grad_norm": 0.29237544536590576, + "learning_rate": 8.591355565793724e-07, + "loss": 0.3163, + "step": 9895 + }, + { + "epoch": 2.488933601609658, + "grad_norm": 0.31671497225761414, + "learning_rate": 8.583156235379774e-07, + "loss": 0.3258, + "step": 9896 + }, + { + "epoch": 2.4891851106639837, + "grad_norm": 0.3017057478427887, + "learning_rate": 8.574960452059411e-07, + "loss": 0.3216, + "step": 9897 + }, + { + "epoch": 2.48943661971831, + "grad_norm": 0.2894316017627716, + "learning_rate": 8.566768216534516e-07, + "loss": 0.3205, + "step": 9898 + }, + { + "epoch": 2.489688128772636, + "grad_norm": 0.28005433082580566, + "learning_rate": 8.558579529506728e-07, + "loss": 0.3178, + "step": 9899 + }, + { + "epoch": 2.4899396378269616, + "grad_norm": 0.3050450086593628, + "learning_rate": 8.55039439167733e-07, + "loss": 0.3008, + "step": 9900 + }, + { + "epoch": 2.490191146881288, + "grad_norm": 0.2795218527317047, + "learning_rate": 8.542212803747363e-07, + "loss": 0.3197, + "step": 9901 + }, + { + "epoch": 2.4904426559356136, + "grad_norm": 0.2750023901462555, + "learning_rate": 8.534034766417498e-07, + "loss": 0.3155, + "step": 9902 + }, + { + "epoch": 2.4906941649899395, + "grad_norm": 0.30676746368408203, + "learning_rate": 8.525860280388154e-07, + "loss": 0.31, + "step": 9903 + }, + { + "epoch": 2.4909456740442657, + "grad_norm": 0.28755173087120056, + "learning_rate": 8.517689346359409e-07, + "loss": 0.3182, + "step": 9904 + }, + { + "epoch": 2.4911971830985915, + "grad_norm": 0.30206596851348877, + "learning_rate": 8.509521965031064e-07, + "loss": 0.299, + "step": 9905 + }, + { + "epoch": 2.4914486921529173, + "grad_norm": 0.28528329730033875, + "learning_rate": 8.501358137102589e-07, + "loss": 0.3261, + "step": 9906 + }, + { + "epoch": 2.4917002012072436, + "grad_norm": 0.27517518401145935, + "learning_rate": 8.493197863273189e-07, + "loss": 0.3225, + "step": 9907 + }, + { + "epoch": 2.4919517102615694, + "grad_norm": 0.29692593216896057, + "learning_rate": 8.485041144241712e-07, + "loss": 0.3461, + "step": 9908 + }, + { + "epoch": 2.492203219315895, + "grad_norm": 0.288084477186203, + "learning_rate": 8.476887980706761e-07, + "loss": 0.3184, + "step": 9909 + }, + { + "epoch": 2.4924547283702214, + "grad_norm": 0.2816907465457916, + "learning_rate": 8.468738373366569e-07, + "loss": 0.3151, + "step": 9910 + }, + { + "epoch": 2.4927062374245472, + "grad_norm": 0.3093920946121216, + "learning_rate": 8.46059232291912e-07, + "loss": 0.3172, + "step": 9911 + }, + { + "epoch": 2.492957746478873, + "grad_norm": 0.30304044485092163, + "learning_rate": 8.452449830062082e-07, + "loss": 0.3061, + "step": 9912 + }, + { + "epoch": 2.4932092555331993, + "grad_norm": 0.2883797287940979, + "learning_rate": 8.44431089549278e-07, + "loss": 0.3292, + "step": 9913 + }, + { + "epoch": 2.493460764587525, + "grad_norm": 0.2927064597606659, + "learning_rate": 8.436175519908291e-07, + "loss": 0.3416, + "step": 9914 + }, + { + "epoch": 2.493712273641851, + "grad_norm": 0.29678410291671753, + "learning_rate": 8.428043704005334e-07, + "loss": 0.3239, + "step": 9915 + }, + { + "epoch": 2.493963782696177, + "grad_norm": 0.299121230840683, + "learning_rate": 8.419915448480376e-07, + "loss": 0.3275, + "step": 9916 + }, + { + "epoch": 2.494215291750503, + "grad_norm": 0.2699803411960602, + "learning_rate": 8.41179075402952e-07, + "loss": 0.3286, + "step": 9917 + }, + { + "epoch": 2.4944668008048287, + "grad_norm": 0.29386457800865173, + "learning_rate": 8.40366962134862e-07, + "loss": 0.3343, + "step": 9918 + }, + { + "epoch": 2.494718309859155, + "grad_norm": 0.29827818274497986, + "learning_rate": 8.395552051133182e-07, + "loss": 0.3185, + "step": 9919 + }, + { + "epoch": 2.494969818913481, + "grad_norm": 0.3134204149246216, + "learning_rate": 8.387438044078439e-07, + "loss": 0.3283, + "step": 9920 + }, + { + "epoch": 2.4952213279678066, + "grad_norm": 0.2914014458656311, + "learning_rate": 8.379327600879306e-07, + "loss": 0.3361, + "step": 9921 + }, + { + "epoch": 2.495472837022133, + "grad_norm": 0.29312896728515625, + "learning_rate": 8.371220722230378e-07, + "loss": 0.3204, + "step": 9922 + }, + { + "epoch": 2.4957243460764587, + "grad_norm": 0.29510068893432617, + "learning_rate": 8.363117408825972e-07, + "loss": 0.3093, + "step": 9923 + }, + { + "epoch": 2.4959758551307845, + "grad_norm": 0.29542407393455505, + "learning_rate": 8.355017661360077e-07, + "loss": 0.3169, + "step": 9924 + }, + { + "epoch": 2.4962273641851107, + "grad_norm": 0.3483355939388275, + "learning_rate": 8.346921480526393e-07, + "loss": 0.3323, + "step": 9925 + }, + { + "epoch": 2.4964788732394365, + "grad_norm": 0.302891343832016, + "learning_rate": 8.338828867018295e-07, + "loss": 0.3261, + "step": 9926 + }, + { + "epoch": 2.4967303822937628, + "grad_norm": 0.28868502378463745, + "learning_rate": 8.330739821528888e-07, + "loss": 0.3307, + "step": 9927 + }, + { + "epoch": 2.4969818913480886, + "grad_norm": 0.2844146192073822, + "learning_rate": 8.322654344750914e-07, + "loss": 0.3226, + "step": 9928 + }, + { + "epoch": 2.4972334004024144, + "grad_norm": 0.30544909834861755, + "learning_rate": 8.314572437376883e-07, + "loss": 0.3061, + "step": 9929 + }, + { + "epoch": 2.4974849094567406, + "grad_norm": 0.2997104525566101, + "learning_rate": 8.306494100098922e-07, + "loss": 0.3298, + "step": 9930 + }, + { + "epoch": 2.4977364185110664, + "grad_norm": 0.311212956905365, + "learning_rate": 8.298419333608909e-07, + "loss": 0.3136, + "step": 9931 + }, + { + "epoch": 2.4979879275653922, + "grad_norm": 0.3107351064682007, + "learning_rate": 8.290348138598408e-07, + "loss": 0.3482, + "step": 9932 + }, + { + "epoch": 2.4982394366197185, + "grad_norm": 0.3184583783149719, + "learning_rate": 8.282280515758639e-07, + "loss": 0.321, + "step": 9933 + }, + { + "epoch": 2.4984909456740443, + "grad_norm": 0.3004460036754608, + "learning_rate": 8.274216465780577e-07, + "loss": 0.3046, + "step": 9934 + }, + { + "epoch": 2.49874245472837, + "grad_norm": 0.26762136816978455, + "learning_rate": 8.266155989354823e-07, + "loss": 0.3174, + "step": 9935 + }, + { + "epoch": 2.4989939637826963, + "grad_norm": 0.31421777606010437, + "learning_rate": 8.258099087171734e-07, + "loss": 0.2931, + "step": 9936 + }, + { + "epoch": 2.499245472837022, + "grad_norm": 0.311367005109787, + "learning_rate": 8.25004575992131e-07, + "loss": 0.3506, + "step": 9937 + }, + { + "epoch": 2.499496981891348, + "grad_norm": 0.28332027792930603, + "learning_rate": 8.241996008293295e-07, + "loss": 0.307, + "step": 9938 + }, + { + "epoch": 2.499748490945674, + "grad_norm": 0.2985660433769226, + "learning_rate": 8.233949832977067e-07, + "loss": 0.3112, + "step": 9939 + }, + { + "epoch": 2.5, + "grad_norm": 0.30060288310050964, + "learning_rate": 8.225907234661767e-07, + "loss": 0.344, + "step": 9940 + }, + { + "epoch": 2.500251509054326, + "grad_norm": 0.2916584610939026, + "learning_rate": 8.217868214036156e-07, + "loss": 0.3283, + "step": 9941 + }, + { + "epoch": 2.500503018108652, + "grad_norm": 0.3100978434085846, + "learning_rate": 8.20983277178875e-07, + "loss": 0.3318, + "step": 9942 + }, + { + "epoch": 2.500754527162978, + "grad_norm": 0.27890169620513916, + "learning_rate": 8.201800908607738e-07, + "loss": 0.3292, + "step": 9943 + }, + { + "epoch": 2.5010060362173037, + "grad_norm": 0.29491671919822693, + "learning_rate": 8.193772625180974e-07, + "loss": 0.3101, + "step": 9944 + }, + { + "epoch": 2.50125754527163, + "grad_norm": 0.2769851088523865, + "learning_rate": 8.185747922196063e-07, + "loss": 0.3225, + "step": 9945 + }, + { + "epoch": 2.5015090543259557, + "grad_norm": 0.2963421046733856, + "learning_rate": 8.177726800340235e-07, + "loss": 0.318, + "step": 9946 + }, + { + "epoch": 2.501760563380282, + "grad_norm": 0.2658748924732208, + "learning_rate": 8.169709260300485e-07, + "loss": 0.319, + "step": 9947 + }, + { + "epoch": 2.5020120724346078, + "grad_norm": 0.2909032106399536, + "learning_rate": 8.161695302763434e-07, + "loss": 0.3245, + "step": 9948 + }, + { + "epoch": 2.5022635814889336, + "grad_norm": 0.28727254271507263, + "learning_rate": 8.15368492841545e-07, + "loss": 0.3029, + "step": 9949 + }, + { + "epoch": 2.50251509054326, + "grad_norm": 0.29924967885017395, + "learning_rate": 8.145678137942553e-07, + "loss": 0.3283, + "step": 9950 + }, + { + "epoch": 2.5027665995975856, + "grad_norm": 0.28090178966522217, + "learning_rate": 8.137674932030499e-07, + "loss": 0.3244, + "step": 9951 + }, + { + "epoch": 2.5030181086519114, + "grad_norm": 0.29318609833717346, + "learning_rate": 8.129675311364682e-07, + "loss": 0.334, + "step": 9952 + }, + { + "epoch": 2.5032696177062377, + "grad_norm": 0.28759893774986267, + "learning_rate": 8.121679276630235e-07, + "loss": 0.3267, + "step": 9953 + }, + { + "epoch": 2.5035211267605635, + "grad_norm": 0.3086696267127991, + "learning_rate": 8.113686828511974e-07, + "loss": 0.3411, + "step": 9954 + }, + { + "epoch": 2.5037726358148893, + "grad_norm": 0.298218697309494, + "learning_rate": 8.105697967694393e-07, + "loss": 0.3243, + "step": 9955 + }, + { + "epoch": 2.5040241448692155, + "grad_norm": 0.31472504138946533, + "learning_rate": 8.097712694861698e-07, + "loss": 0.3349, + "step": 9956 + }, + { + "epoch": 2.5042756539235413, + "grad_norm": 0.3033403754234314, + "learning_rate": 8.08973101069776e-07, + "loss": 0.3157, + "step": 9957 + }, + { + "epoch": 2.504527162977867, + "grad_norm": 0.25785255432128906, + "learning_rate": 8.081752915886182e-07, + "loss": 0.3112, + "step": 9958 + }, + { + "epoch": 2.5047786720321934, + "grad_norm": 0.29093462228775024, + "learning_rate": 8.073778411110216e-07, + "loss": 0.33, + "step": 9959 + }, + { + "epoch": 2.505030181086519, + "grad_norm": 0.28467297554016113, + "learning_rate": 8.065807497052852e-07, + "loss": 0.3359, + "step": 9960 + }, + { + "epoch": 2.505281690140845, + "grad_norm": 0.28814148902893066, + "learning_rate": 8.057840174396725e-07, + "loss": 0.3229, + "step": 9961 + }, + { + "epoch": 2.5055331991951713, + "grad_norm": 0.301241934299469, + "learning_rate": 8.049876443824212e-07, + "loss": 0.3187, + "step": 9962 + }, + { + "epoch": 2.505784708249497, + "grad_norm": 0.2975336015224457, + "learning_rate": 8.041916306017322e-07, + "loss": 0.3204, + "step": 9963 + }, + { + "epoch": 2.506036217303823, + "grad_norm": 0.30471301078796387, + "learning_rate": 8.033959761657817e-07, + "loss": 0.3381, + "step": 9964 + }, + { + "epoch": 2.506287726358149, + "grad_norm": 0.2744012176990509, + "learning_rate": 8.026006811427134e-07, + "loss": 0.3128, + "step": 9965 + }, + { + "epoch": 2.506539235412475, + "grad_norm": 0.2855711877346039, + "learning_rate": 8.018057456006362e-07, + "loss": 0.32, + "step": 9966 + }, + { + "epoch": 2.5067907444668007, + "grad_norm": 0.28882476687431335, + "learning_rate": 8.010111696076344e-07, + "loss": 0.3253, + "step": 9967 + }, + { + "epoch": 2.507042253521127, + "grad_norm": 0.29218101501464844, + "learning_rate": 8.002169532317566e-07, + "loss": 0.3323, + "step": 9968 + }, + { + "epoch": 2.5072937625754528, + "grad_norm": 0.3086097836494446, + "learning_rate": 7.994230965410232e-07, + "loss": 0.3173, + "step": 9969 + }, + { + "epoch": 2.5075452716297786, + "grad_norm": 0.2888603210449219, + "learning_rate": 7.986295996034221e-07, + "loss": 0.3351, + "step": 9970 + }, + { + "epoch": 2.507796780684105, + "grad_norm": 0.27783140540122986, + "learning_rate": 7.978364624869134e-07, + "loss": 0.3177, + "step": 9971 + }, + { + "epoch": 2.5080482897384306, + "grad_norm": 0.2933598756790161, + "learning_rate": 7.970436852594221e-07, + "loss": 0.3139, + "step": 9972 + }, + { + "epoch": 2.5082997987927564, + "grad_norm": 0.2918246388435364, + "learning_rate": 7.962512679888462e-07, + "loss": 0.3066, + "step": 9973 + }, + { + "epoch": 2.5085513078470827, + "grad_norm": 0.28106623888015747, + "learning_rate": 7.954592107430498e-07, + "loss": 0.3278, + "step": 9974 + }, + { + "epoch": 2.5088028169014085, + "grad_norm": 0.3085310459136963, + "learning_rate": 7.946675135898679e-07, + "loss": 0.2984, + "step": 9975 + }, + { + "epoch": 2.5090543259557343, + "grad_norm": 0.2928948402404785, + "learning_rate": 7.938761765971065e-07, + "loss": 0.3113, + "step": 9976 + }, + { + "epoch": 2.5093058350100605, + "grad_norm": 0.29120776057243347, + "learning_rate": 7.930851998325362e-07, + "loss": 0.3081, + "step": 9977 + }, + { + "epoch": 2.5095573440643864, + "grad_norm": 0.289899080991745, + "learning_rate": 7.922945833639012e-07, + "loss": 0.3236, + "step": 9978 + }, + { + "epoch": 2.509808853118712, + "grad_norm": 0.27034732699394226, + "learning_rate": 7.915043272589106e-07, + "loss": 0.3194, + "step": 9979 + }, + { + "epoch": 2.5100603621730384, + "grad_norm": 0.2954181730747223, + "learning_rate": 7.907144315852472e-07, + "loss": 0.3179, + "step": 9980 + }, + { + "epoch": 2.510311871227364, + "grad_norm": 0.2821013033390045, + "learning_rate": 7.899248964105583e-07, + "loss": 0.3298, + "step": 9981 + }, + { + "epoch": 2.51056338028169, + "grad_norm": 0.2877882122993469, + "learning_rate": 7.891357218024653e-07, + "loss": 0.3265, + "step": 9982 + }, + { + "epoch": 2.5108148893360163, + "grad_norm": 0.32784876227378845, + "learning_rate": 7.883469078285533e-07, + "loss": 0.3191, + "step": 9983 + }, + { + "epoch": 2.511066398390342, + "grad_norm": 0.28723639249801636, + "learning_rate": 7.875584545563819e-07, + "loss": 0.346, + "step": 9984 + }, + { + "epoch": 2.511317907444668, + "grad_norm": 0.2926671504974365, + "learning_rate": 7.867703620534744e-07, + "loss": 0.2905, + "step": 9985 + }, + { + "epoch": 2.511569416498994, + "grad_norm": 0.2825871706008911, + "learning_rate": 7.85982630387328e-07, + "loss": 0.3087, + "step": 9986 + }, + { + "epoch": 2.51182092555332, + "grad_norm": 0.28117263317108154, + "learning_rate": 7.851952596254076e-07, + "loss": 0.3221, + "step": 9987 + }, + { + "epoch": 2.5120724346076457, + "grad_norm": 0.2999492883682251, + "learning_rate": 7.844082498351441e-07, + "loss": 0.3078, + "step": 9988 + }, + { + "epoch": 2.512323943661972, + "grad_norm": 0.2990661859512329, + "learning_rate": 7.836216010839426e-07, + "loss": 0.3162, + "step": 9989 + }, + { + "epoch": 2.512575452716298, + "grad_norm": 0.30873674154281616, + "learning_rate": 7.82835313439172e-07, + "loss": 0.312, + "step": 9990 + }, + { + "epoch": 2.5128269617706236, + "grad_norm": 0.300627201795578, + "learning_rate": 7.820493869681761e-07, + "loss": 0.318, + "step": 9991 + }, + { + "epoch": 2.51307847082495, + "grad_norm": 0.30525755882263184, + "learning_rate": 7.812638217382612e-07, + "loss": 0.3613, + "step": 9992 + }, + { + "epoch": 2.5133299798792756, + "grad_norm": 0.30198240280151367, + "learning_rate": 7.804786178167085e-07, + "loss": 0.3029, + "step": 9993 + }, + { + "epoch": 2.5135814889336014, + "grad_norm": 0.3166106343269348, + "learning_rate": 7.796937752707639e-07, + "loss": 0.3309, + "step": 9994 + }, + { + "epoch": 2.5138329979879277, + "grad_norm": 0.3065146207809448, + "learning_rate": 7.789092941676468e-07, + "loss": 0.345, + "step": 9995 + }, + { + "epoch": 2.5140845070422535, + "grad_norm": 0.3031245470046997, + "learning_rate": 7.781251745745405e-07, + "loss": 0.3172, + "step": 9996 + }, + { + "epoch": 2.5143360160965793, + "grad_norm": 0.29413849115371704, + "learning_rate": 7.773414165586007e-07, + "loss": 0.322, + "step": 9997 + }, + { + "epoch": 2.5145875251509056, + "grad_norm": 0.29748326539993286, + "learning_rate": 7.765580201869527e-07, + "loss": 0.2931, + "step": 9998 + }, + { + "epoch": 2.5148390342052314, + "grad_norm": 0.3018585443496704, + "learning_rate": 7.757749855266878e-07, + "loss": 0.3339, + "step": 9999 + }, + { + "epoch": 2.515090543259557, + "grad_norm": 0.30707988142967224, + "learning_rate": 7.749923126448694e-07, + "loss": 0.3281, + "step": 10000 + }, + { + "epoch": 2.5153420523138834, + "grad_norm": 0.3032652735710144, + "learning_rate": 7.742100016085269e-07, + "loss": 0.3183, + "step": 10001 + }, + { + "epoch": 2.515593561368209, + "grad_norm": 0.30448129773139954, + "learning_rate": 7.734280524846627e-07, + "loss": 0.298, + "step": 10002 + }, + { + "epoch": 2.515845070422535, + "grad_norm": 0.3087362051010132, + "learning_rate": 7.726464653402432e-07, + "loss": 0.3155, + "step": 10003 + }, + { + "epoch": 2.5160965794768613, + "grad_norm": 0.2881805896759033, + "learning_rate": 7.718652402422088e-07, + "loss": 0.3456, + "step": 10004 + }, + { + "epoch": 2.516348088531187, + "grad_norm": 0.301011323928833, + "learning_rate": 7.710843772574644e-07, + "loss": 0.3288, + "step": 10005 + }, + { + "epoch": 2.516599597585513, + "grad_norm": 0.31224364042282104, + "learning_rate": 7.70303876452888e-07, + "loss": 0.3128, + "step": 10006 + }, + { + "epoch": 2.516851106639839, + "grad_norm": 0.2763179838657379, + "learning_rate": 7.695237378953224e-07, + "loss": 0.3003, + "step": 10007 + }, + { + "epoch": 2.517102615694165, + "grad_norm": 0.3021027445793152, + "learning_rate": 7.687439616515846e-07, + "loss": 0.3198, + "step": 10008 + }, + { + "epoch": 2.5173541247484907, + "grad_norm": 0.2829072177410126, + "learning_rate": 7.67964547788454e-07, + "loss": 0.2887, + "step": 10009 + }, + { + "epoch": 2.517605633802817, + "grad_norm": 0.31430134177207947, + "learning_rate": 7.67185496372686e-07, + "loss": 0.3293, + "step": 10010 + }, + { + "epoch": 2.517857142857143, + "grad_norm": 0.30119824409484863, + "learning_rate": 7.664068074709985e-07, + "loss": 0.3175, + "step": 10011 + }, + { + "epoch": 2.5181086519114686, + "grad_norm": 0.2803148329257965, + "learning_rate": 7.656284811500842e-07, + "loss": 0.3335, + "step": 10012 + }, + { + "epoch": 2.518360160965795, + "grad_norm": 0.28627124428749084, + "learning_rate": 7.648505174765986e-07, + "loss": 0.3127, + "step": 10013 + }, + { + "epoch": 2.5186116700201207, + "grad_norm": 0.2865979075431824, + "learning_rate": 7.640729165171723e-07, + "loss": 0.2974, + "step": 10014 + }, + { + "epoch": 2.5188631790744465, + "grad_norm": 0.2760888636112213, + "learning_rate": 7.632956783383999e-07, + "loss": 0.3136, + "step": 10015 + }, + { + "epoch": 2.5191146881287727, + "grad_norm": 0.2837379276752472, + "learning_rate": 7.625188030068492e-07, + "loss": 0.324, + "step": 10016 + }, + { + "epoch": 2.5193661971830985, + "grad_norm": 0.3180212676525116, + "learning_rate": 7.617422905890521e-07, + "loss": 0.3085, + "step": 10017 + }, + { + "epoch": 2.5196177062374243, + "grad_norm": 0.29850107431411743, + "learning_rate": 7.609661411515146e-07, + "loss": 0.3149, + "step": 10018 + }, + { + "epoch": 2.5198692152917506, + "grad_norm": 0.32031044363975525, + "learning_rate": 7.601903547607064e-07, + "loss": 0.3302, + "step": 10019 + }, + { + "epoch": 2.5201207243460764, + "grad_norm": 0.30878177285194397, + "learning_rate": 7.594149314830717e-07, + "loss": 0.3003, + "step": 10020 + }, + { + "epoch": 2.520372233400402, + "grad_norm": 0.2950962781906128, + "learning_rate": 7.586398713850179e-07, + "loss": 0.327, + "step": 10021 + }, + { + "epoch": 2.5206237424547284, + "grad_norm": 0.29020336270332336, + "learning_rate": 7.578651745329263e-07, + "loss": 0.3268, + "step": 10022 + }, + { + "epoch": 2.5208752515090542, + "grad_norm": 0.3015232980251312, + "learning_rate": 7.570908409931427e-07, + "loss": 0.3087, + "step": 10023 + }, + { + "epoch": 2.52112676056338, + "grad_norm": 0.29202380776405334, + "learning_rate": 7.563168708319857e-07, + "loss": 0.3301, + "step": 10024 + }, + { + "epoch": 2.5213782696177063, + "grad_norm": 0.2775900065898895, + "learning_rate": 7.555432641157396e-07, + "loss": 0.3149, + "step": 10025 + }, + { + "epoch": 2.521629778672032, + "grad_norm": 0.2708669900894165, + "learning_rate": 7.547700209106606e-07, + "loss": 0.3238, + "step": 10026 + }, + { + "epoch": 2.521881287726358, + "grad_norm": 0.30167317390441895, + "learning_rate": 7.539971412829705e-07, + "loss": 0.3001, + "step": 10027 + }, + { + "epoch": 2.522132796780684, + "grad_norm": 0.3165789246559143, + "learning_rate": 7.532246252988617e-07, + "loss": 0.3382, + "step": 10028 + }, + { + "epoch": 2.52238430583501, + "grad_norm": 0.29492056369781494, + "learning_rate": 7.524524730244975e-07, + "loss": 0.3383, + "step": 10029 + }, + { + "epoch": 2.5226358148893357, + "grad_norm": 0.2822633981704712, + "learning_rate": 7.516806845260055e-07, + "loss": 0.3003, + "step": 10030 + }, + { + "epoch": 2.522887323943662, + "grad_norm": 0.3063894212245941, + "learning_rate": 7.509092598694861e-07, + "loss": 0.3138, + "step": 10031 + }, + { + "epoch": 2.523138832997988, + "grad_norm": 0.32523077726364136, + "learning_rate": 7.501381991210061e-07, + "loss": 0.3164, + "step": 10032 + }, + { + "epoch": 2.5233903420523136, + "grad_norm": 0.28941982984542847, + "learning_rate": 7.493675023466024e-07, + "loss": 0.3347, + "step": 10033 + }, + { + "epoch": 2.52364185110664, + "grad_norm": 0.29666292667388916, + "learning_rate": 7.485971696122796e-07, + "loss": 0.324, + "step": 10034 + }, + { + "epoch": 2.5238933601609657, + "grad_norm": 0.3100225031375885, + "learning_rate": 7.478272009840137e-07, + "loss": 0.3254, + "step": 10035 + }, + { + "epoch": 2.524144869215292, + "grad_norm": 0.28848564624786377, + "learning_rate": 7.470575965277455e-07, + "loss": 0.3452, + "step": 10036 + }, + { + "epoch": 2.5243963782696177, + "grad_norm": 0.2774829864501953, + "learning_rate": 7.462883563093887e-07, + "loss": 0.3218, + "step": 10037 + }, + { + "epoch": 2.5246478873239435, + "grad_norm": 0.3099637031555176, + "learning_rate": 7.455194803948218e-07, + "loss": 0.2875, + "step": 10038 + }, + { + "epoch": 2.5248993963782698, + "grad_norm": 0.28145939111709595, + "learning_rate": 7.447509688498971e-07, + "loss": 0.3332, + "step": 10039 + }, + { + "epoch": 2.5251509054325956, + "grad_norm": 0.296622633934021, + "learning_rate": 7.439828217404293e-07, + "loss": 0.3478, + "step": 10040 + }, + { + "epoch": 2.5254024144869214, + "grad_norm": 0.27812138199806213, + "learning_rate": 7.432150391322079e-07, + "loss": 0.3302, + "step": 10041 + }, + { + "epoch": 2.5256539235412476, + "grad_norm": 0.26120641827583313, + "learning_rate": 7.424476210909893e-07, + "loss": 0.2886, + "step": 10042 + }, + { + "epoch": 2.5259054325955734, + "grad_norm": 0.29901203513145447, + "learning_rate": 7.41680567682495e-07, + "loss": 0.3021, + "step": 10043 + }, + { + "epoch": 2.5261569416498992, + "grad_norm": 0.28617119789123535, + "learning_rate": 7.409138789724213e-07, + "loss": 0.321, + "step": 10044 + }, + { + "epoch": 2.5264084507042255, + "grad_norm": 0.28953617811203003, + "learning_rate": 7.401475550264286e-07, + "loss": 0.346, + "step": 10045 + }, + { + "epoch": 2.5266599597585513, + "grad_norm": 0.28805407881736755, + "learning_rate": 7.393815959101491e-07, + "loss": 0.3206, + "step": 10046 + }, + { + "epoch": 2.5269114688128775, + "grad_norm": 0.29841816425323486, + "learning_rate": 7.386160016891802e-07, + "loss": 0.3242, + "step": 10047 + }, + { + "epoch": 2.5271629778672033, + "grad_norm": 0.2983440160751343, + "learning_rate": 7.378507724290929e-07, + "loss": 0.3347, + "step": 10048 + }, + { + "epoch": 2.527414486921529, + "grad_norm": 0.2765296697616577, + "learning_rate": 7.370859081954219e-07, + "loss": 0.3093, + "step": 10049 + }, + { + "epoch": 2.5276659959758554, + "grad_norm": 0.2982845902442932, + "learning_rate": 7.363214090536752e-07, + "loss": 0.3454, + "step": 10050 + }, + { + "epoch": 2.527917505030181, + "grad_norm": 0.2902871370315552, + "learning_rate": 7.355572750693252e-07, + "loss": 0.3436, + "step": 10051 + }, + { + "epoch": 2.528169014084507, + "grad_norm": 0.319418340921402, + "learning_rate": 7.347935063078165e-07, + "loss": 0.3195, + "step": 10052 + }, + { + "epoch": 2.5284205231388333, + "grad_norm": 0.2917003631591797, + "learning_rate": 7.34030102834562e-07, + "loss": 0.3125, + "step": 10053 + }, + { + "epoch": 2.528672032193159, + "grad_norm": 0.30017781257629395, + "learning_rate": 7.332670647149398e-07, + "loss": 0.3478, + "step": 10054 + }, + { + "epoch": 2.528923541247485, + "grad_norm": 0.3094007968902588, + "learning_rate": 7.325043920143027e-07, + "loss": 0.3284, + "step": 10055 + }, + { + "epoch": 2.529175050301811, + "grad_norm": 0.3049699366092682, + "learning_rate": 7.317420847979656e-07, + "loss": 0.3051, + "step": 10056 + }, + { + "epoch": 2.529426559356137, + "grad_norm": 0.29030466079711914, + "learning_rate": 7.30980143131218e-07, + "loss": 0.3203, + "step": 10057 + }, + { + "epoch": 2.5296780684104627, + "grad_norm": 0.2822028398513794, + "learning_rate": 7.302185670793132e-07, + "loss": 0.3209, + "step": 10058 + }, + { + "epoch": 2.529929577464789, + "grad_norm": 0.28473353385925293, + "learning_rate": 7.294573567074776e-07, + "loss": 0.3205, + "step": 10059 + }, + { + "epoch": 2.5301810865191148, + "grad_norm": 0.2997729778289795, + "learning_rate": 7.286965120809014e-07, + "loss": 0.3134, + "step": 10060 + }, + { + "epoch": 2.5304325955734406, + "grad_norm": 0.2938189208507538, + "learning_rate": 7.279360332647495e-07, + "loss": 0.313, + "step": 10061 + }, + { + "epoch": 2.530684104627767, + "grad_norm": 0.291318416595459, + "learning_rate": 7.271759203241485e-07, + "loss": 0.2945, + "step": 10062 + }, + { + "epoch": 2.5309356136820926, + "grad_norm": 0.29242825508117676, + "learning_rate": 7.264161733241998e-07, + "loss": 0.2912, + "step": 10063 + }, + { + "epoch": 2.5311871227364184, + "grad_norm": 0.2778705358505249, + "learning_rate": 7.256567923299712e-07, + "loss": 0.3547, + "step": 10064 + }, + { + "epoch": 2.5314386317907447, + "grad_norm": 0.29071563482284546, + "learning_rate": 7.248977774064975e-07, + "loss": 0.3357, + "step": 10065 + }, + { + "epoch": 2.5316901408450705, + "grad_norm": 0.2635411322116852, + "learning_rate": 7.241391286187849e-07, + "loss": 0.3257, + "step": 10066 + }, + { + "epoch": 2.5319416498993963, + "grad_norm": 0.2990947961807251, + "learning_rate": 7.233808460318054e-07, + "loss": 0.3175, + "step": 10067 + }, + { + "epoch": 2.5321931589537225, + "grad_norm": 0.2650429904460907, + "learning_rate": 7.226229297105031e-07, + "loss": 0.3322, + "step": 10068 + }, + { + "epoch": 2.5324446680080483, + "grad_norm": 0.2894965410232544, + "learning_rate": 7.218653797197861e-07, + "loss": 0.307, + "step": 10069 + }, + { + "epoch": 2.532696177062374, + "grad_norm": 0.26556655764579773, + "learning_rate": 7.21108196124537e-07, + "loss": 0.3372, + "step": 10070 + }, + { + "epoch": 2.5329476861167004, + "grad_norm": 0.3001581132411957, + "learning_rate": 7.20351378989601e-07, + "loss": 0.3338, + "step": 10071 + }, + { + "epoch": 2.533199195171026, + "grad_norm": 0.3141966164112091, + "learning_rate": 7.195949283797959e-07, + "loss": 0.3287, + "step": 10072 + }, + { + "epoch": 2.533450704225352, + "grad_norm": 0.2945497930049896, + "learning_rate": 7.188388443599081e-07, + "loss": 0.3309, + "step": 10073 + }, + { + "epoch": 2.5337022132796783, + "grad_norm": 0.3076508641242981, + "learning_rate": 7.180831269946898e-07, + "loss": 0.335, + "step": 10074 + }, + { + "epoch": 2.533953722334004, + "grad_norm": 0.2986814081668854, + "learning_rate": 7.173277763488646e-07, + "loss": 0.3138, + "step": 10075 + }, + { + "epoch": 2.53420523138833, + "grad_norm": 0.2792844772338867, + "learning_rate": 7.165727924871224e-07, + "loss": 0.3147, + "step": 10076 + }, + { + "epoch": 2.534456740442656, + "grad_norm": 0.3163931667804718, + "learning_rate": 7.158181754741239e-07, + "loss": 0.3245, + "step": 10077 + }, + { + "epoch": 2.534708249496982, + "grad_norm": 0.2911807894706726, + "learning_rate": 7.150639253744967e-07, + "loss": 0.3191, + "step": 10078 + }, + { + "epoch": 2.5349597585513077, + "grad_norm": 0.284397691488266, + "learning_rate": 7.143100422528382e-07, + "loss": 0.3186, + "step": 10079 + }, + { + "epoch": 2.535211267605634, + "grad_norm": 0.2803077697753906, + "learning_rate": 7.135565261737126e-07, + "loss": 0.3276, + "step": 10080 + }, + { + "epoch": 2.53546277665996, + "grad_norm": 0.27358850836753845, + "learning_rate": 7.128033772016557e-07, + "loss": 0.3073, + "step": 10081 + }, + { + "epoch": 2.5357142857142856, + "grad_norm": 0.30038154125213623, + "learning_rate": 7.120505954011681e-07, + "loss": 0.3076, + "step": 10082 + }, + { + "epoch": 2.535965794768612, + "grad_norm": 0.3066074848175049, + "learning_rate": 7.112981808367214e-07, + "loss": 0.3354, + "step": 10083 + }, + { + "epoch": 2.5362173038229376, + "grad_norm": 0.30170679092407227, + "learning_rate": 7.105461335727564e-07, + "loss": 0.3102, + "step": 10084 + }, + { + "epoch": 2.5364688128772634, + "grad_norm": 0.3071909546852112, + "learning_rate": 7.097944536736795e-07, + "loss": 0.3112, + "step": 10085 + }, + { + "epoch": 2.5367203219315897, + "grad_norm": 0.2948451638221741, + "learning_rate": 7.09043141203869e-07, + "loss": 0.3097, + "step": 10086 + }, + { + "epoch": 2.5369718309859155, + "grad_norm": 0.31133005023002625, + "learning_rate": 7.082921962276684e-07, + "loss": 0.3196, + "step": 10087 + }, + { + "epoch": 2.5372233400402413, + "grad_norm": 0.29081660509109497, + "learning_rate": 7.075416188093936e-07, + "loss": 0.2977, + "step": 10088 + }, + { + "epoch": 2.5374748490945676, + "grad_norm": 0.2829488515853882, + "learning_rate": 7.067914090133244e-07, + "loss": 0.3158, + "step": 10089 + }, + { + "epoch": 2.5377263581488934, + "grad_norm": 0.2930876612663269, + "learning_rate": 7.060415669037135e-07, + "loss": 0.3419, + "step": 10090 + }, + { + "epoch": 2.537977867203219, + "grad_norm": 0.32287168502807617, + "learning_rate": 7.052920925447792e-07, + "loss": 0.3263, + "step": 10091 + }, + { + "epoch": 2.5382293762575454, + "grad_norm": 0.31697362661361694, + "learning_rate": 7.045429860007102e-07, + "loss": 0.2973, + "step": 10092 + }, + { + "epoch": 2.538480885311871, + "grad_norm": 0.28679776191711426, + "learning_rate": 7.037942473356607e-07, + "loss": 0.3167, + "step": 10093 + }, + { + "epoch": 2.538732394366197, + "grad_norm": 0.3053102195262909, + "learning_rate": 7.030458766137577e-07, + "loss": 0.3345, + "step": 10094 + }, + { + "epoch": 2.5389839034205233, + "grad_norm": 0.2991425395011902, + "learning_rate": 7.022978738990943e-07, + "loss": 0.3406, + "step": 10095 + }, + { + "epoch": 2.539235412474849, + "grad_norm": 0.3642655909061432, + "learning_rate": 7.015502392557305e-07, + "loss": 0.3412, + "step": 10096 + }, + { + "epoch": 2.539486921529175, + "grad_norm": 0.31416499614715576, + "learning_rate": 7.00802972747699e-07, + "loss": 0.3052, + "step": 10097 + }, + { + "epoch": 2.539738430583501, + "grad_norm": 0.296009361743927, + "learning_rate": 7.000560744389962e-07, + "loss": 0.3015, + "step": 10098 + }, + { + "epoch": 2.539989939637827, + "grad_norm": 0.31365057826042175, + "learning_rate": 6.99309544393591e-07, + "loss": 0.3134, + "step": 10099 + }, + { + "epoch": 2.5402414486921527, + "grad_norm": 0.2986956536769867, + "learning_rate": 6.985633826754173e-07, + "loss": 0.3184, + "step": 10100 + }, + { + "epoch": 2.540492957746479, + "grad_norm": 0.2894130349159241, + "learning_rate": 6.978175893483812e-07, + "loss": 0.312, + "step": 10101 + }, + { + "epoch": 2.540744466800805, + "grad_norm": 0.29604774713516235, + "learning_rate": 6.970721644763534e-07, + "loss": 0.3198, + "step": 10102 + }, + { + "epoch": 2.5409959758551306, + "grad_norm": 0.28080493211746216, + "learning_rate": 6.963271081231765e-07, + "loss": 0.3098, + "step": 10103 + }, + { + "epoch": 2.541247484909457, + "grad_norm": 0.2949555218219757, + "learning_rate": 6.955824203526585e-07, + "loss": 0.3102, + "step": 10104 + }, + { + "epoch": 2.5414989939637826, + "grad_norm": 0.304453581571579, + "learning_rate": 6.948381012285771e-07, + "loss": 0.3332, + "step": 10105 + }, + { + "epoch": 2.5417505030181085, + "grad_norm": 0.3298734128475189, + "learning_rate": 6.940941508146809e-07, + "loss": 0.3199, + "step": 10106 + }, + { + "epoch": 2.5420020120724347, + "grad_norm": 0.2873101830482483, + "learning_rate": 6.933505691746816e-07, + "loss": 0.3262, + "step": 10107 + }, + { + "epoch": 2.5422535211267605, + "grad_norm": 0.2953130900859833, + "learning_rate": 6.926073563722652e-07, + "loss": 0.3264, + "step": 10108 + }, + { + "epoch": 2.5425050301810863, + "grad_norm": 0.3112882971763611, + "learning_rate": 6.918645124710805e-07, + "loss": 0.3349, + "step": 10109 + }, + { + "epoch": 2.5427565392354126, + "grad_norm": 0.28245386481285095, + "learning_rate": 6.911220375347499e-07, + "loss": 0.2928, + "step": 10110 + }, + { + "epoch": 2.5430080482897384, + "grad_norm": 0.30176132917404175, + "learning_rate": 6.903799316268595e-07, + "loss": 0.3107, + "step": 10111 + }, + { + "epoch": 2.543259557344064, + "grad_norm": 0.29093387722969055, + "learning_rate": 6.896381948109682e-07, + "loss": 0.2997, + "step": 10112 + }, + { + "epoch": 2.5435110663983904, + "grad_norm": 0.2985793650150299, + "learning_rate": 6.888968271505986e-07, + "loss": 0.317, + "step": 10113 + }, + { + "epoch": 2.5437625754527162, + "grad_norm": 0.3139978349208832, + "learning_rate": 6.881558287092466e-07, + "loss": 0.3097, + "step": 10114 + }, + { + "epoch": 2.544014084507042, + "grad_norm": 0.30869972705841064, + "learning_rate": 6.87415199550372e-07, + "loss": 0.3236, + "step": 10115 + }, + { + "epoch": 2.5442655935613683, + "grad_norm": 0.2910330593585968, + "learning_rate": 6.866749397374062e-07, + "loss": 0.303, + "step": 10116 + }, + { + "epoch": 2.544517102615694, + "grad_norm": 0.31802523136138916, + "learning_rate": 6.859350493337491e-07, + "loss": 0.3032, + "step": 10117 + }, + { + "epoch": 2.54476861167002, + "grad_norm": 0.2831653356552124, + "learning_rate": 6.85195528402765e-07, + "loss": 0.3226, + "step": 10118 + }, + { + "epoch": 2.545020120724346, + "grad_norm": 0.29345449805259705, + "learning_rate": 6.844563770077917e-07, + "loss": 0.3364, + "step": 10119 + }, + { + "epoch": 2.545271629778672, + "grad_norm": 0.30557841062545776, + "learning_rate": 6.837175952121305e-07, + "loss": 0.3013, + "step": 10120 + }, + { + "epoch": 2.5455231388329977, + "grad_norm": 0.2902613580226898, + "learning_rate": 6.829791830790555e-07, + "loss": 0.3354, + "step": 10121 + }, + { + "epoch": 2.545774647887324, + "grad_norm": 0.3239326775074005, + "learning_rate": 6.82241140671806e-07, + "loss": 0.3009, + "step": 10122 + }, + { + "epoch": 2.54602615694165, + "grad_norm": 0.30113399028778076, + "learning_rate": 6.815034680535915e-07, + "loss": 0.3348, + "step": 10123 + }, + { + "epoch": 2.5462776659959756, + "grad_norm": 0.30587083101272583, + "learning_rate": 6.807661652875875e-07, + "loss": 0.3163, + "step": 10124 + }, + { + "epoch": 2.546529175050302, + "grad_norm": 0.2883608639240265, + "learning_rate": 6.800292324369417e-07, + "loss": 0.3324, + "step": 10125 + }, + { + "epoch": 2.5467806841046277, + "grad_norm": 0.30463874340057373, + "learning_rate": 6.79292669564765e-07, + "loss": 0.3423, + "step": 10126 + }, + { + "epoch": 2.5470321931589535, + "grad_norm": 0.28268906474113464, + "learning_rate": 6.785564767341423e-07, + "loss": 0.301, + "step": 10127 + }, + { + "epoch": 2.5472837022132797, + "grad_norm": 0.29041969776153564, + "learning_rate": 6.778206540081211e-07, + "loss": 0.3189, + "step": 10128 + }, + { + "epoch": 2.5475352112676055, + "grad_norm": 0.2949490547180176, + "learning_rate": 6.77085201449722e-07, + "loss": 0.3236, + "step": 10129 + }, + { + "epoch": 2.5477867203219313, + "grad_norm": 0.31995368003845215, + "learning_rate": 6.763501191219319e-07, + "loss": 0.3262, + "step": 10130 + }, + { + "epoch": 2.5480382293762576, + "grad_norm": 0.29409512877464294, + "learning_rate": 6.756154070877047e-07, + "loss": 0.3152, + "step": 10131 + }, + { + "epoch": 2.5482897384305834, + "grad_norm": 0.3013671040534973, + "learning_rate": 6.748810654099652e-07, + "loss": 0.3187, + "step": 10132 + }, + { + "epoch": 2.5485412474849096, + "grad_norm": 0.3068297207355499, + "learning_rate": 6.741470941516043e-07, + "loss": 0.302, + "step": 10133 + }, + { + "epoch": 2.5487927565392354, + "grad_norm": 0.2889004945755005, + "learning_rate": 6.734134933754826e-07, + "loss": 0.3019, + "step": 10134 + }, + { + "epoch": 2.5490442655935612, + "grad_norm": 0.3107074499130249, + "learning_rate": 6.726802631444274e-07, + "loss": 0.3373, + "step": 10135 + }, + { + "epoch": 2.5492957746478875, + "grad_norm": 0.30381691455841064, + "learning_rate": 6.719474035212376e-07, + "loss": 0.3, + "step": 10136 + }, + { + "epoch": 2.5495472837022133, + "grad_norm": 0.29392245411872864, + "learning_rate": 6.712149145686748e-07, + "loss": 0.3328, + "step": 10137 + }, + { + "epoch": 2.549798792756539, + "grad_norm": 0.2984176576137543, + "learning_rate": 6.704827963494748e-07, + "loss": 0.2996, + "step": 10138 + }, + { + "epoch": 2.5500503018108653, + "grad_norm": 0.31401655077934265, + "learning_rate": 6.697510489263371e-07, + "loss": 0.3237, + "step": 10139 + }, + { + "epoch": 2.550301810865191, + "grad_norm": 0.2747150659561157, + "learning_rate": 6.69019672361933e-07, + "loss": 0.323, + "step": 10140 + }, + { + "epoch": 2.550553319919517, + "grad_norm": 0.29566800594329834, + "learning_rate": 6.682886667188987e-07, + "loss": 0.3063, + "step": 10141 + }, + { + "epoch": 2.550804828973843, + "grad_norm": 0.3180108964443207, + "learning_rate": 6.675580320598418e-07, + "loss": 0.3439, + "step": 10142 + }, + { + "epoch": 2.551056338028169, + "grad_norm": 0.2787119150161743, + "learning_rate": 6.668277684473346e-07, + "loss": 0.3225, + "step": 10143 + }, + { + "epoch": 2.551307847082495, + "grad_norm": 0.2893412709236145, + "learning_rate": 6.660978759439219e-07, + "loss": 0.3334, + "step": 10144 + }, + { + "epoch": 2.551559356136821, + "grad_norm": 0.30765849351882935, + "learning_rate": 6.653683546121126e-07, + "loss": 0.3181, + "step": 10145 + }, + { + "epoch": 2.551810865191147, + "grad_norm": 0.29015520215034485, + "learning_rate": 6.646392045143868e-07, + "loss": 0.327, + "step": 10146 + }, + { + "epoch": 2.552062374245473, + "grad_norm": 0.29602178931236267, + "learning_rate": 6.639104257131907e-07, + "loss": 0.3178, + "step": 10147 + }, + { + "epoch": 2.552313883299799, + "grad_norm": 0.3179301917552948, + "learning_rate": 6.631820182709409e-07, + "loss": 0.334, + "step": 10148 + }, + { + "epoch": 2.5525653923541247, + "grad_norm": 0.315472811460495, + "learning_rate": 6.624539822500192e-07, + "loss": 0.3132, + "step": 10149 + }, + { + "epoch": 2.552816901408451, + "grad_norm": 0.3242339491844177, + "learning_rate": 6.617263177127797e-07, + "loss": 0.3154, + "step": 10150 + }, + { + "epoch": 2.5530684104627768, + "grad_norm": 0.26861920952796936, + "learning_rate": 6.609990247215393e-07, + "loss": 0.3406, + "step": 10151 + }, + { + "epoch": 2.5533199195171026, + "grad_norm": 0.29673323035240173, + "learning_rate": 6.602721033385889e-07, + "loss": 0.322, + "step": 10152 + }, + { + "epoch": 2.553571428571429, + "grad_norm": 0.32738587260246277, + "learning_rate": 6.59545553626183e-07, + "loss": 0.3157, + "step": 10153 + }, + { + "epoch": 2.5538229376257546, + "grad_norm": 0.3015281558036804, + "learning_rate": 6.588193756465472e-07, + "loss": 0.3277, + "step": 10154 + }, + { + "epoch": 2.5540744466800804, + "grad_norm": 0.2911074161529541, + "learning_rate": 6.580935694618728e-07, + "loss": 0.3128, + "step": 10155 + }, + { + "epoch": 2.5543259557344067, + "grad_norm": 0.29950693249702454, + "learning_rate": 6.573681351343226e-07, + "loss": 0.3063, + "step": 10156 + }, + { + "epoch": 2.5545774647887325, + "grad_norm": 0.28311774134635925, + "learning_rate": 6.566430727260226e-07, + "loss": 0.2949, + "step": 10157 + }, + { + "epoch": 2.5548289738430583, + "grad_norm": 0.27676114439964294, + "learning_rate": 6.559183822990727e-07, + "loss": 0.3299, + "step": 10158 + }, + { + "epoch": 2.5550804828973845, + "grad_norm": 0.2759498059749603, + "learning_rate": 6.551940639155357e-07, + "loss": 0.3193, + "step": 10159 + }, + { + "epoch": 2.5553319919517103, + "grad_norm": 0.29512062668800354, + "learning_rate": 6.544701176374462e-07, + "loss": 0.3159, + "step": 10160 + }, + { + "epoch": 2.555583501006036, + "grad_norm": 0.2883027195930481, + "learning_rate": 6.537465435268065e-07, + "loss": 0.3145, + "step": 10161 + }, + { + "epoch": 2.5558350100603624, + "grad_norm": 0.2832835614681244, + "learning_rate": 6.530233416455845e-07, + "loss": 0.3234, + "step": 10162 + }, + { + "epoch": 2.556086519114688, + "grad_norm": 0.28715407848358154, + "learning_rate": 6.523005120557197e-07, + "loss": 0.3323, + "step": 10163 + }, + { + "epoch": 2.556338028169014, + "grad_norm": 0.3091478645801544, + "learning_rate": 6.515780548191159e-07, + "loss": 0.3248, + "step": 10164 + }, + { + "epoch": 2.5565895372233403, + "grad_norm": 0.2894414961338043, + "learning_rate": 6.508559699976486e-07, + "loss": 0.3113, + "step": 10165 + }, + { + "epoch": 2.556841046277666, + "grad_norm": 0.2907220721244812, + "learning_rate": 6.501342576531589e-07, + "loss": 0.3341, + "step": 10166 + }, + { + "epoch": 2.557092555331992, + "grad_norm": 0.31087636947631836, + "learning_rate": 6.494129178474579e-07, + "loss": 0.338, + "step": 10167 + }, + { + "epoch": 2.557344064386318, + "grad_norm": 0.2967066168785095, + "learning_rate": 6.486919506423228e-07, + "loss": 0.3225, + "step": 10168 + }, + { + "epoch": 2.557595573440644, + "grad_norm": 0.31147968769073486, + "learning_rate": 6.47971356099501e-07, + "loss": 0.3556, + "step": 10169 + }, + { + "epoch": 2.5578470824949697, + "grad_norm": 0.3024539053440094, + "learning_rate": 6.472511342807052e-07, + "loss": 0.3199, + "step": 10170 + }, + { + "epoch": 2.558098591549296, + "grad_norm": 0.28230634331703186, + "learning_rate": 6.465312852476197e-07, + "loss": 0.3253, + "step": 10171 + }, + { + "epoch": 2.558350100603622, + "grad_norm": 0.30267390608787537, + "learning_rate": 6.458118090618948e-07, + "loss": 0.3296, + "step": 10172 + }, + { + "epoch": 2.5586016096579476, + "grad_norm": 0.27803835272789, + "learning_rate": 6.450927057851481e-07, + "loss": 0.296, + "step": 10173 + }, + { + "epoch": 2.558853118712274, + "grad_norm": 0.2754371166229248, + "learning_rate": 6.44373975478968e-07, + "loss": 0.3001, + "step": 10174 + }, + { + "epoch": 2.5591046277665996, + "grad_norm": 0.2934453785419464, + "learning_rate": 6.436556182049069e-07, + "loss": 0.3301, + "step": 10175 + }, + { + "epoch": 2.5593561368209254, + "grad_norm": 0.29431700706481934, + "learning_rate": 6.429376340244897e-07, + "loss": 0.3223, + "step": 10176 + }, + { + "epoch": 2.5596076458752517, + "grad_norm": 0.26826512813568115, + "learning_rate": 6.42220022999206e-07, + "loss": 0.3048, + "step": 10177 + }, + { + "epoch": 2.5598591549295775, + "grad_norm": 0.2905735671520233, + "learning_rate": 6.415027851905159e-07, + "loss": 0.3369, + "step": 10178 + }, + { + "epoch": 2.5601106639839033, + "grad_norm": 0.2780333161354065, + "learning_rate": 6.407859206598443e-07, + "loss": 0.3325, + "step": 10179 + }, + { + "epoch": 2.5603621730382295, + "grad_norm": 0.3008924126625061, + "learning_rate": 6.400694294685889e-07, + "loss": 0.3048, + "step": 10180 + }, + { + "epoch": 2.5606136820925554, + "grad_norm": 0.2865042984485626, + "learning_rate": 6.393533116781098e-07, + "loss": 0.3104, + "step": 10181 + }, + { + "epoch": 2.560865191146881, + "grad_norm": 0.28145965933799744, + "learning_rate": 6.386375673497397e-07, + "loss": 0.3036, + "step": 10182 + }, + { + "epoch": 2.5611167002012074, + "grad_norm": 0.2906494438648224, + "learning_rate": 6.379221965447785e-07, + "loss": 0.312, + "step": 10183 + }, + { + "epoch": 2.561368209255533, + "grad_norm": 0.2934057116508484, + "learning_rate": 6.37207199324491e-07, + "loss": 0.3203, + "step": 10184 + }, + { + "epoch": 2.561619718309859, + "grad_norm": 0.3017788827419281, + "learning_rate": 6.364925757501139e-07, + "loss": 0.3339, + "step": 10185 + }, + { + "epoch": 2.5618712273641853, + "grad_norm": 0.2982634902000427, + "learning_rate": 6.357783258828493e-07, + "loss": 0.3241, + "step": 10186 + }, + { + "epoch": 2.562122736418511, + "grad_norm": 0.28276070952415466, + "learning_rate": 6.350644497838692e-07, + "loss": 0.3201, + "step": 10187 + }, + { + "epoch": 2.562374245472837, + "grad_norm": 0.27246373891830444, + "learning_rate": 6.343509475143112e-07, + "loss": 0.3196, + "step": 10188 + }, + { + "epoch": 2.562625754527163, + "grad_norm": 0.2898280918598175, + "learning_rate": 6.336378191352838e-07, + "loss": 0.3311, + "step": 10189 + }, + { + "epoch": 2.562877263581489, + "grad_norm": 0.29259827733039856, + "learning_rate": 6.329250647078605e-07, + "loss": 0.3205, + "step": 10190 + }, + { + "epoch": 2.5631287726358147, + "grad_norm": 0.3051588237285614, + "learning_rate": 6.322126842930864e-07, + "loss": 0.3302, + "step": 10191 + }, + { + "epoch": 2.563380281690141, + "grad_norm": 0.28429844975471497, + "learning_rate": 6.3150067795197e-07, + "loss": 0.3169, + "step": 10192 + }, + { + "epoch": 2.563631790744467, + "grad_norm": 0.2904362082481384, + "learning_rate": 6.307890457454907e-07, + "loss": 0.3111, + "step": 10193 + }, + { + "epoch": 2.5638832997987926, + "grad_norm": 0.28321024775505066, + "learning_rate": 6.300777877345976e-07, + "loss": 0.3277, + "step": 10194 + }, + { + "epoch": 2.564134808853119, + "grad_norm": 0.2912009656429291, + "learning_rate": 6.293669039802025e-07, + "loss": 0.3177, + "step": 10195 + }, + { + "epoch": 2.5643863179074446, + "grad_norm": 0.29499733448028564, + "learning_rate": 6.286563945431906e-07, + "loss": 0.3204, + "step": 10196 + }, + { + "epoch": 2.5646378269617705, + "grad_norm": 0.2899915874004364, + "learning_rate": 6.279462594844105e-07, + "loss": 0.3003, + "step": 10197 + }, + { + "epoch": 2.5648893360160967, + "grad_norm": 0.29533299803733826, + "learning_rate": 6.272364988646828e-07, + "loss": 0.3252, + "step": 10198 + }, + { + "epoch": 2.5651408450704225, + "grad_norm": 0.2786716818809509, + "learning_rate": 6.26527112744792e-07, + "loss": 0.3268, + "step": 10199 + }, + { + "epoch": 2.5653923541247483, + "grad_norm": 0.28114524483680725, + "learning_rate": 6.258181011854947e-07, + "loss": 0.3183, + "step": 10200 + }, + { + "epoch": 2.5656438631790746, + "grad_norm": 0.30359089374542236, + "learning_rate": 6.251094642475108e-07, + "loss": 0.3293, + "step": 10201 + }, + { + "epoch": 2.5658953722334004, + "grad_norm": 0.295454204082489, + "learning_rate": 6.244012019915335e-07, + "loss": 0.3227, + "step": 10202 + }, + { + "epoch": 2.566146881287726, + "grad_norm": 0.29742130637168884, + "learning_rate": 6.236933144782187e-07, + "loss": 0.3293, + "step": 10203 + }, + { + "epoch": 2.5663983903420524, + "grad_norm": 0.29131677746772766, + "learning_rate": 6.229858017681933e-07, + "loss": 0.3234, + "step": 10204 + }, + { + "epoch": 2.566649899396378, + "grad_norm": 0.280320942401886, + "learning_rate": 6.222786639220524e-07, + "loss": 0.3104, + "step": 10205 + }, + { + "epoch": 2.566901408450704, + "grad_norm": 0.33856505155563354, + "learning_rate": 6.215719010003557e-07, + "loss": 0.3169, + "step": 10206 + }, + { + "epoch": 2.5671529175050303, + "grad_norm": 0.2823481559753418, + "learning_rate": 6.208655130636354e-07, + "loss": 0.3126, + "step": 10207 + }, + { + "epoch": 2.567404426559356, + "grad_norm": 0.28553295135498047, + "learning_rate": 6.201595001723876e-07, + "loss": 0.3226, + "step": 10208 + }, + { + "epoch": 2.567655935613682, + "grad_norm": 0.28577616810798645, + "learning_rate": 6.194538623870794e-07, + "loss": 0.3168, + "step": 10209 + }, + { + "epoch": 2.567907444668008, + "grad_norm": 0.29407092928886414, + "learning_rate": 6.187485997681419e-07, + "loss": 0.31, + "step": 10210 + }, + { + "epoch": 2.568158953722334, + "grad_norm": 0.29761141538619995, + "learning_rate": 6.180437123759786e-07, + "loss": 0.2955, + "step": 10211 + }, + { + "epoch": 2.5684104627766597, + "grad_norm": 0.27446889877319336, + "learning_rate": 6.173392002709572e-07, + "loss": 0.3325, + "step": 10212 + }, + { + "epoch": 2.568661971830986, + "grad_norm": 0.2877654731273651, + "learning_rate": 6.166350635134166e-07, + "loss": 0.3209, + "step": 10213 + }, + { + "epoch": 2.568913480885312, + "grad_norm": 0.30045285820961, + "learning_rate": 6.159313021636593e-07, + "loss": 0.3168, + "step": 10214 + }, + { + "epoch": 2.5691649899396376, + "grad_norm": 0.28202614188194275, + "learning_rate": 6.152279162819597e-07, + "loss": 0.3216, + "step": 10215 + }, + { + "epoch": 2.569416498993964, + "grad_norm": 0.33453133702278137, + "learning_rate": 6.145249059285585e-07, + "loss": 0.3074, + "step": 10216 + }, + { + "epoch": 2.5696680080482897, + "grad_norm": 0.28775471448898315, + "learning_rate": 6.138222711636632e-07, + "loss": 0.3357, + "step": 10217 + }, + { + "epoch": 2.5699195171026155, + "grad_norm": 0.29132527112960815, + "learning_rate": 6.131200120474512e-07, + "loss": 0.3223, + "step": 10218 + }, + { + "epoch": 2.5701710261569417, + "grad_norm": 0.29309695959091187, + "learning_rate": 6.124181286400649e-07, + "loss": 0.3157, + "step": 10219 + }, + { + "epoch": 2.5704225352112675, + "grad_norm": 0.2960768938064575, + "learning_rate": 6.117166210016184e-07, + "loss": 0.3223, + "step": 10220 + }, + { + "epoch": 2.5706740442655933, + "grad_norm": 0.2980702817440033, + "learning_rate": 6.110154891921894e-07, + "loss": 0.3371, + "step": 10221 + }, + { + "epoch": 2.5709255533199196, + "grad_norm": 0.2795586884021759, + "learning_rate": 6.103147332718274e-07, + "loss": 0.3141, + "step": 10222 + }, + { + "epoch": 2.5711770623742454, + "grad_norm": 0.2727452516555786, + "learning_rate": 6.096143533005455e-07, + "loss": 0.3137, + "step": 10223 + }, + { + "epoch": 2.571428571428571, + "grad_norm": 0.28229624032974243, + "learning_rate": 6.089143493383281e-07, + "loss": 0.3211, + "step": 10224 + }, + { + "epoch": 2.5716800804828974, + "grad_norm": 0.3388809263706207, + "learning_rate": 6.082147214451272e-07, + "loss": 0.3339, + "step": 10225 + }, + { + "epoch": 2.5719315895372232, + "grad_norm": 0.2950616776943207, + "learning_rate": 6.075154696808594e-07, + "loss": 0.3158, + "step": 10226 + }, + { + "epoch": 2.572183098591549, + "grad_norm": 0.2900426983833313, + "learning_rate": 6.068165941054133e-07, + "loss": 0.3371, + "step": 10227 + }, + { + "epoch": 2.5724346076458753, + "grad_norm": 0.29088470339775085, + "learning_rate": 6.061180947786411e-07, + "loss": 0.3193, + "step": 10228 + }, + { + "epoch": 2.572686116700201, + "grad_norm": 0.30321744084358215, + "learning_rate": 6.054199717603671e-07, + "loss": 0.3315, + "step": 10229 + }, + { + "epoch": 2.572937625754527, + "grad_norm": 0.3024834990501404, + "learning_rate": 6.047222251103796e-07, + "loss": 0.3233, + "step": 10230 + }, + { + "epoch": 2.573189134808853, + "grad_norm": 0.2948141396045685, + "learning_rate": 6.04024854888437e-07, + "loss": 0.3318, + "step": 10231 + }, + { + "epoch": 2.573440643863179, + "grad_norm": 0.323097825050354, + "learning_rate": 6.033278611542642e-07, + "loss": 0.3179, + "step": 10232 + }, + { + "epoch": 2.573692152917505, + "grad_norm": 0.2814940810203552, + "learning_rate": 6.026312439675553e-07, + "loss": 0.3333, + "step": 10233 + }, + { + "epoch": 2.573943661971831, + "grad_norm": 0.2813478708267212, + "learning_rate": 6.01935003387969e-07, + "loss": 0.3158, + "step": 10234 + }, + { + "epoch": 2.574195171026157, + "grad_norm": 0.27595117688179016, + "learning_rate": 6.01239139475136e-07, + "loss": 0.2883, + "step": 10235 + }, + { + "epoch": 2.574446680080483, + "grad_norm": 0.278484970331192, + "learning_rate": 6.005436522886532e-07, + "loss": 0.308, + "step": 10236 + }, + { + "epoch": 2.574698189134809, + "grad_norm": 0.285978227853775, + "learning_rate": 5.998485418880822e-07, + "loss": 0.3014, + "step": 10237 + }, + { + "epoch": 2.5749496981891347, + "grad_norm": 0.3101520538330078, + "learning_rate": 5.991538083329579e-07, + "loss": 0.3149, + "step": 10238 + }, + { + "epoch": 2.575201207243461, + "grad_norm": 0.2651280462741852, + "learning_rate": 5.984594516827769e-07, + "loss": 0.3223, + "step": 10239 + }, + { + "epoch": 2.5754527162977867, + "grad_norm": 0.307229608297348, + "learning_rate": 5.977654719970088e-07, + "loss": 0.3224, + "step": 10240 + }, + { + "epoch": 2.5757042253521125, + "grad_norm": 0.28054678440093994, + "learning_rate": 5.970718693350874e-07, + "loss": 0.3274, + "step": 10241 + }, + { + "epoch": 2.5759557344064388, + "grad_norm": 0.2816450297832489, + "learning_rate": 5.963786437564161e-07, + "loss": 0.3536, + "step": 10242 + }, + { + "epoch": 2.5762072434607646, + "grad_norm": 0.281474769115448, + "learning_rate": 5.956857953203643e-07, + "loss": 0.324, + "step": 10243 + }, + { + "epoch": 2.5764587525150904, + "grad_norm": 0.3121716380119324, + "learning_rate": 5.949933240862715e-07, + "loss": 0.3248, + "step": 10244 + }, + { + "epoch": 2.5767102615694166, + "grad_norm": 0.28747543692588806, + "learning_rate": 5.94301230113442e-07, + "loss": 0.3225, + "step": 10245 + }, + { + "epoch": 2.5769617706237424, + "grad_norm": 0.29833775758743286, + "learning_rate": 5.936095134611508e-07, + "loss": 0.2994, + "step": 10246 + }, + { + "epoch": 2.5772132796780687, + "grad_norm": 0.2817133069038391, + "learning_rate": 5.929181741886386e-07, + "loss": 0.3039, + "step": 10247 + }, + { + "epoch": 2.5774647887323945, + "grad_norm": 0.28056350350379944, + "learning_rate": 5.922272123551137e-07, + "loss": 0.3196, + "step": 10248 + }, + { + "epoch": 2.5777162977867203, + "grad_norm": 0.28009307384490967, + "learning_rate": 5.915366280197537e-07, + "loss": 0.3259, + "step": 10249 + }, + { + "epoch": 2.5779678068410465, + "grad_norm": 0.303177148103714, + "learning_rate": 5.908464212417014e-07, + "loss": 0.3336, + "step": 10250 + }, + { + "epoch": 2.5782193158953723, + "grad_norm": 0.2899198532104492, + "learning_rate": 5.901565920800711e-07, + "loss": 0.2883, + "step": 10251 + }, + { + "epoch": 2.578470824949698, + "grad_norm": 0.28011229634284973, + "learning_rate": 5.894671405939389e-07, + "loss": 0.331, + "step": 10252 + }, + { + "epoch": 2.5787223340040244, + "grad_norm": 0.2783331274986267, + "learning_rate": 5.887780668423553e-07, + "loss": 0.3173, + "step": 10253 + }, + { + "epoch": 2.57897384305835, + "grad_norm": 0.31792569160461426, + "learning_rate": 5.880893708843332e-07, + "loss": 0.3361, + "step": 10254 + }, + { + "epoch": 2.579225352112676, + "grad_norm": 0.2781875729560852, + "learning_rate": 5.87401052778856e-07, + "loss": 0.3333, + "step": 10255 + }, + { + "epoch": 2.5794768611670023, + "grad_norm": 0.29065370559692383, + "learning_rate": 5.867131125848729e-07, + "loss": 0.3436, + "step": 10256 + }, + { + "epoch": 2.579728370221328, + "grad_norm": 0.2745533585548401, + "learning_rate": 5.860255503613033e-07, + "loss": 0.3263, + "step": 10257 + }, + { + "epoch": 2.579979879275654, + "grad_norm": 0.2924485206604004, + "learning_rate": 5.853383661670303e-07, + "loss": 0.3291, + "step": 10258 + }, + { + "epoch": 2.58023138832998, + "grad_norm": 0.288885235786438, + "learning_rate": 5.846515600609093e-07, + "loss": 0.3057, + "step": 10259 + }, + { + "epoch": 2.580482897384306, + "grad_norm": 0.2955508530139923, + "learning_rate": 5.839651321017586e-07, + "loss": 0.3173, + "step": 10260 + }, + { + "epoch": 2.5807344064386317, + "grad_norm": 0.289850115776062, + "learning_rate": 5.832790823483691e-07, + "loss": 0.3112, + "step": 10261 + }, + { + "epoch": 2.580985915492958, + "grad_norm": 0.30250605940818787, + "learning_rate": 5.825934108594938e-07, + "loss": 0.3184, + "step": 10262 + }, + { + "epoch": 2.5812374245472838, + "grad_norm": 0.28620943427085876, + "learning_rate": 5.81908117693859e-07, + "loss": 0.3132, + "step": 10263 + }, + { + "epoch": 2.5814889336016096, + "grad_norm": 0.2834034264087677, + "learning_rate": 5.812232029101533e-07, + "loss": 0.3057, + "step": 10264 + }, + { + "epoch": 2.581740442655936, + "grad_norm": 0.2939947545528412, + "learning_rate": 5.805386665670376e-07, + "loss": 0.3077, + "step": 10265 + }, + { + "epoch": 2.5819919517102616, + "grad_norm": 0.3035759925842285, + "learning_rate": 5.79854508723136e-07, + "loss": 0.3171, + "step": 10266 + }, + { + "epoch": 2.5822434607645874, + "grad_norm": 0.28236502408981323, + "learning_rate": 5.791707294370447e-07, + "loss": 0.3288, + "step": 10267 + }, + { + "epoch": 2.5824949698189137, + "grad_norm": 0.2845684587955475, + "learning_rate": 5.784873287673226e-07, + "loss": 0.3225, + "step": 10268 + }, + { + "epoch": 2.5827464788732395, + "grad_norm": 0.31281614303588867, + "learning_rate": 5.778043067725009e-07, + "loss": 0.3177, + "step": 10269 + }, + { + "epoch": 2.5829979879275653, + "grad_norm": 0.33208024501800537, + "learning_rate": 5.771216635110738e-07, + "loss": 0.2962, + "step": 10270 + }, + { + "epoch": 2.5832494969818915, + "grad_norm": 0.28506436944007874, + "learning_rate": 5.764393990415079e-07, + "loss": 0.3374, + "step": 10271 + }, + { + "epoch": 2.5835010060362174, + "grad_norm": 0.29046157002449036, + "learning_rate": 5.757575134222332e-07, + "loss": 0.3216, + "step": 10272 + }, + { + "epoch": 2.583752515090543, + "grad_norm": 0.2891055941581726, + "learning_rate": 5.750760067116501e-07, + "loss": 0.3129, + "step": 10273 + }, + { + "epoch": 2.5840040241448694, + "grad_norm": 0.2932874858379364, + "learning_rate": 5.743948789681236e-07, + "loss": 0.2921, + "step": 10274 + }, + { + "epoch": 2.584255533199195, + "grad_norm": 0.3003051280975342, + "learning_rate": 5.73714130249991e-07, + "loss": 0.3158, + "step": 10275 + }, + { + "epoch": 2.584507042253521, + "grad_norm": 0.278804749250412, + "learning_rate": 5.730337606155506e-07, + "loss": 0.3074, + "step": 10276 + }, + { + "epoch": 2.5847585513078473, + "grad_norm": 0.297124445438385, + "learning_rate": 5.723537701230747e-07, + "loss": 0.3316, + "step": 10277 + }, + { + "epoch": 2.585010060362173, + "grad_norm": 0.28858664631843567, + "learning_rate": 5.716741588307983e-07, + "loss": 0.3064, + "step": 10278 + }, + { + "epoch": 2.585261569416499, + "grad_norm": 0.30511143803596497, + "learning_rate": 5.709949267969267e-07, + "loss": 0.3197, + "step": 10279 + }, + { + "epoch": 2.585513078470825, + "grad_norm": 0.2882242500782013, + "learning_rate": 5.703160740796332e-07, + "loss": 0.3344, + "step": 10280 + }, + { + "epoch": 2.585764587525151, + "grad_norm": 0.32658329606056213, + "learning_rate": 5.696376007370541e-07, + "loss": 0.3257, + "step": 10281 + }, + { + "epoch": 2.5860160965794767, + "grad_norm": 0.3152960538864136, + "learning_rate": 5.689595068273002e-07, + "loss": 0.3103, + "step": 10282 + }, + { + "epoch": 2.586267605633803, + "grad_norm": 0.2786172330379486, + "learning_rate": 5.682817924084422e-07, + "loss": 0.3174, + "step": 10283 + }, + { + "epoch": 2.586519114688129, + "grad_norm": 0.29683375358581543, + "learning_rate": 5.676044575385254e-07, + "loss": 0.3426, + "step": 10284 + }, + { + "epoch": 2.5867706237424546, + "grad_norm": 0.28705403208732605, + "learning_rate": 5.669275022755566e-07, + "loss": 0.3055, + "step": 10285 + }, + { + "epoch": 2.587022132796781, + "grad_norm": 0.29597151279449463, + "learning_rate": 5.662509266775151e-07, + "loss": 0.3117, + "step": 10286 + }, + { + "epoch": 2.5872736418511066, + "grad_norm": 0.29061004519462585, + "learning_rate": 5.655747308023434e-07, + "loss": 0.3414, + "step": 10287 + }, + { + "epoch": 2.5875251509054324, + "grad_norm": 0.3225204646587372, + "learning_rate": 5.648989147079553e-07, + "loss": 0.3319, + "step": 10288 + }, + { + "epoch": 2.5877766599597587, + "grad_norm": 0.27866053581237793, + "learning_rate": 5.642234784522282e-07, + "loss": 0.3036, + "step": 10289 + }, + { + "epoch": 2.5880281690140845, + "grad_norm": 0.3126038610935211, + "learning_rate": 5.635484220930098e-07, + "loss": 0.3094, + "step": 10290 + }, + { + "epoch": 2.5882796780684103, + "grad_norm": 0.2966510057449341, + "learning_rate": 5.628737456881161e-07, + "loss": 0.3066, + "step": 10291 + }, + { + "epoch": 2.5885311871227366, + "grad_norm": 0.2831152677536011, + "learning_rate": 5.621994492953264e-07, + "loss": 0.3296, + "step": 10292 + }, + { + "epoch": 2.5887826961770624, + "grad_norm": 0.2895983159542084, + "learning_rate": 5.615255329723917e-07, + "loss": 0.3145, + "step": 10293 + }, + { + "epoch": 2.589034205231388, + "grad_norm": 0.29133400321006775, + "learning_rate": 5.608519967770276e-07, + "loss": 0.3194, + "step": 10294 + }, + { + "epoch": 2.5892857142857144, + "grad_norm": 0.2729475200176239, + "learning_rate": 5.601788407669196e-07, + "loss": 0.3218, + "step": 10295 + }, + { + "epoch": 2.58953722334004, + "grad_norm": 0.2915710508823395, + "learning_rate": 5.595060649997175e-07, + "loss": 0.3297, + "step": 10296 + }, + { + "epoch": 2.589788732394366, + "grad_norm": 0.31030383706092834, + "learning_rate": 5.588336695330421e-07, + "loss": 0.3357, + "step": 10297 + }, + { + "epoch": 2.5900402414486923, + "grad_norm": 0.3029539883136749, + "learning_rate": 5.581616544244778e-07, + "loss": 0.2825, + "step": 10298 + }, + { + "epoch": 2.590291750503018, + "grad_norm": 0.30874502658843994, + "learning_rate": 5.574900197315814e-07, + "loss": 0.3133, + "step": 10299 + }, + { + "epoch": 2.590543259557344, + "grad_norm": 0.3056495785713196, + "learning_rate": 5.568187655118712e-07, + "loss": 0.3328, + "step": 10300 + }, + { + "epoch": 2.59079476861167, + "grad_norm": 0.29649117588996887, + "learning_rate": 5.561478918228369e-07, + "loss": 0.31, + "step": 10301 + }, + { + "epoch": 2.591046277665996, + "grad_norm": 0.2896314263343811, + "learning_rate": 5.55477398721937e-07, + "loss": 0.3237, + "step": 10302 + }, + { + "epoch": 2.5912977867203217, + "grad_norm": 0.2920903265476227, + "learning_rate": 5.548072862665909e-07, + "loss": 0.2982, + "step": 10303 + }, + { + "epoch": 2.591549295774648, + "grad_norm": 0.309803307056427, + "learning_rate": 5.541375545141936e-07, + "loss": 0.3153, + "step": 10304 + }, + { + "epoch": 2.591800804828974, + "grad_norm": 0.3066963255405426, + "learning_rate": 5.534682035221e-07, + "loss": 0.3182, + "step": 10305 + }, + { + "epoch": 2.5920523138832996, + "grad_norm": 0.27324411273002625, + "learning_rate": 5.527992333476389e-07, + "loss": 0.3304, + "step": 10306 + }, + { + "epoch": 2.592303822937626, + "grad_norm": 0.3093414604663849, + "learning_rate": 5.521306440481005e-07, + "loss": 0.331, + "step": 10307 + }, + { + "epoch": 2.5925553319919517, + "grad_norm": 0.29584071040153503, + "learning_rate": 5.514624356807474e-07, + "loss": 0.3306, + "step": 10308 + }, + { + "epoch": 2.5928068410462775, + "grad_norm": 0.27764448523521423, + "learning_rate": 5.507946083028059e-07, + "loss": 0.3037, + "step": 10309 + }, + { + "epoch": 2.5930583501006037, + "grad_norm": 0.30795976519584656, + "learning_rate": 5.501271619714732e-07, + "loss": 0.311, + "step": 10310 + }, + { + "epoch": 2.5933098591549295, + "grad_norm": 0.29350659251213074, + "learning_rate": 5.494600967439095e-07, + "loss": 0.3163, + "step": 10311 + }, + { + "epoch": 2.5935613682092553, + "grad_norm": 0.3054122030735016, + "learning_rate": 5.487934126772465e-07, + "loss": 0.327, + "step": 10312 + }, + { + "epoch": 2.5938128772635816, + "grad_norm": 0.29255709052085876, + "learning_rate": 5.481271098285818e-07, + "loss": 0.3198, + "step": 10313 + }, + { + "epoch": 2.5940643863179074, + "grad_norm": 0.26492074131965637, + "learning_rate": 5.474611882549785e-07, + "loss": 0.3185, + "step": 10314 + }, + { + "epoch": 2.594315895372233, + "grad_norm": 0.306562215089798, + "learning_rate": 5.4679564801347e-07, + "loss": 0.3028, + "step": 10315 + }, + { + "epoch": 2.5945674044265594, + "grad_norm": 0.3125389814376831, + "learning_rate": 5.461304891610541e-07, + "loss": 0.3101, + "step": 10316 + }, + { + "epoch": 2.5948189134808852, + "grad_norm": 0.31623390316963196, + "learning_rate": 5.454657117546996e-07, + "loss": 0.3141, + "step": 10317 + }, + { + "epoch": 2.595070422535211, + "grad_norm": 0.2654491066932678, + "learning_rate": 5.448013158513388e-07, + "loss": 0.3211, + "step": 10318 + }, + { + "epoch": 2.5953219315895373, + "grad_norm": 0.30861228704452515, + "learning_rate": 5.441373015078744e-07, + "loss": 0.3259, + "step": 10319 + }, + { + "epoch": 2.595573440643863, + "grad_norm": 0.26960867643356323, + "learning_rate": 5.434736687811731e-07, + "loss": 0.3155, + "step": 10320 + }, + { + "epoch": 2.595824949698189, + "grad_norm": 0.26485928893089294, + "learning_rate": 5.428104177280735e-07, + "loss": 0.3251, + "step": 10321 + }, + { + "epoch": 2.596076458752515, + "grad_norm": 0.29853057861328125, + "learning_rate": 5.421475484053762e-07, + "loss": 0.3669, + "step": 10322 + }, + { + "epoch": 2.596327967806841, + "grad_norm": 0.29327720403671265, + "learning_rate": 5.414850608698535e-07, + "loss": 0.3355, + "step": 10323 + }, + { + "epoch": 2.5965794768611667, + "grad_norm": 0.2685185372829437, + "learning_rate": 5.408229551782435e-07, + "loss": 0.3319, + "step": 10324 + }, + { + "epoch": 2.596830985915493, + "grad_norm": 0.3062201142311096, + "learning_rate": 5.4016123138725e-07, + "loss": 0.3081, + "step": 10325 + }, + { + "epoch": 2.597082494969819, + "grad_norm": 0.3145555555820465, + "learning_rate": 5.394998895535475e-07, + "loss": 0.312, + "step": 10326 + }, + { + "epoch": 2.5973340040241446, + "grad_norm": 0.30714958906173706, + "learning_rate": 5.388389297337737e-07, + "loss": 0.3083, + "step": 10327 + }, + { + "epoch": 2.597585513078471, + "grad_norm": 0.28293806314468384, + "learning_rate": 5.381783519845374e-07, + "loss": 0.3066, + "step": 10328 + }, + { + "epoch": 2.5978370221327967, + "grad_norm": 0.2907370626926422, + "learning_rate": 5.375181563624116e-07, + "loss": 0.3577, + "step": 10329 + }, + { + "epoch": 2.5980885311871225, + "grad_norm": 0.3050544559955597, + "learning_rate": 5.368583429239394e-07, + "loss": 0.3285, + "step": 10330 + }, + { + "epoch": 2.5983400402414487, + "grad_norm": 0.3092591166496277, + "learning_rate": 5.361989117256277e-07, + "loss": 0.3384, + "step": 10331 + }, + { + "epoch": 2.5985915492957745, + "grad_norm": 0.27648666501045227, + "learning_rate": 5.355398628239544e-07, + "loss": 0.3422, + "step": 10332 + }, + { + "epoch": 2.5988430583501008, + "grad_norm": 0.3023586869239807, + "learning_rate": 5.34881196275362e-07, + "loss": 0.3052, + "step": 10333 + }, + { + "epoch": 2.5990945674044266, + "grad_norm": 0.2839665710926056, + "learning_rate": 5.342229121362607e-07, + "loss": 0.2988, + "step": 10334 + }, + { + "epoch": 2.5993460764587524, + "grad_norm": 0.3002241849899292, + "learning_rate": 5.335650104630308e-07, + "loss": 0.3427, + "step": 10335 + }, + { + "epoch": 2.5995975855130786, + "grad_norm": 0.27492570877075195, + "learning_rate": 5.329074913120141e-07, + "loss": 0.3233, + "step": 10336 + }, + { + "epoch": 2.5998490945674044, + "grad_norm": 0.29533281922340393, + "learning_rate": 5.322503547395263e-07, + "loss": 0.3129, + "step": 10337 + }, + { + "epoch": 2.6001006036217302, + "grad_norm": 0.285576194524765, + "learning_rate": 5.31593600801844e-07, + "loss": 0.314, + "step": 10338 + }, + { + "epoch": 2.6003521126760565, + "grad_norm": 0.2748103439807892, + "learning_rate": 5.309372295552173e-07, + "loss": 0.3299, + "step": 10339 + }, + { + "epoch": 2.6006036217303823, + "grad_norm": 0.2951182425022125, + "learning_rate": 5.302812410558567e-07, + "loss": 0.3202, + "step": 10340 + }, + { + "epoch": 2.600855130784708, + "grad_norm": 0.30300065875053406, + "learning_rate": 5.296256353599466e-07, + "loss": 0.3153, + "step": 10341 + }, + { + "epoch": 2.6011066398390343, + "grad_norm": 0.3175194561481476, + "learning_rate": 5.289704125236333e-07, + "loss": 0.3119, + "step": 10342 + }, + { + "epoch": 2.60135814889336, + "grad_norm": 0.2759941518306732, + "learning_rate": 5.283155726030348e-07, + "loss": 0.3194, + "step": 10343 + }, + { + "epoch": 2.6016096579476864, + "grad_norm": 0.30253252387046814, + "learning_rate": 5.276611156542316e-07, + "loss": 0.2918, + "step": 10344 + }, + { + "epoch": 2.601861167002012, + "grad_norm": 0.2967860698699951, + "learning_rate": 5.270070417332745e-07, + "loss": 0.3106, + "step": 10345 + }, + { + "epoch": 2.602112676056338, + "grad_norm": 0.2763741612434387, + "learning_rate": 5.263533508961827e-07, + "loss": 0.3111, + "step": 10346 + }, + { + "epoch": 2.6023641851106643, + "grad_norm": 0.28950735926628113, + "learning_rate": 5.257000431989384e-07, + "loss": 0.3368, + "step": 10347 + }, + { + "epoch": 2.60261569416499, + "grad_norm": 0.2865469753742218, + "learning_rate": 5.250471186974954e-07, + "loss": 0.3163, + "step": 10348 + }, + { + "epoch": 2.602867203219316, + "grad_norm": 0.27980104088783264, + "learning_rate": 5.243945774477699e-07, + "loss": 0.333, + "step": 10349 + }, + { + "epoch": 2.603118712273642, + "grad_norm": 0.2943830192089081, + "learning_rate": 5.237424195056512e-07, + "loss": 0.3181, + "step": 10350 + }, + { + "epoch": 2.603370221327968, + "grad_norm": 0.2930108606815338, + "learning_rate": 5.230906449269895e-07, + "loss": 0.3005, + "step": 10351 + }, + { + "epoch": 2.6036217303822937, + "grad_norm": 0.291398823261261, + "learning_rate": 5.224392537676077e-07, + "loss": 0.3232, + "step": 10352 + }, + { + "epoch": 2.60387323943662, + "grad_norm": 0.30126404762268066, + "learning_rate": 5.217882460832912e-07, + "loss": 0.3264, + "step": 10353 + }, + { + "epoch": 2.6041247484909458, + "grad_norm": 0.28334009647369385, + "learning_rate": 5.21137621929797e-07, + "loss": 0.3262, + "step": 10354 + }, + { + "epoch": 2.6043762575452716, + "grad_norm": 0.2912385165691376, + "learning_rate": 5.204873813628447e-07, + "loss": 0.3388, + "step": 10355 + }, + { + "epoch": 2.604627766599598, + "grad_norm": 0.29965826869010925, + "learning_rate": 5.198375244381243e-07, + "loss": 0.324, + "step": 10356 + }, + { + "epoch": 2.6048792756539236, + "grad_norm": 0.2777501046657562, + "learning_rate": 5.191880512112934e-07, + "loss": 0.3292, + "step": 10357 + }, + { + "epoch": 2.6051307847082494, + "grad_norm": 0.2996978163719177, + "learning_rate": 5.185389617379727e-07, + "loss": 0.337, + "step": 10358 + }, + { + "epoch": 2.6053822937625757, + "grad_norm": 0.2818640470504761, + "learning_rate": 5.178902560737554e-07, + "loss": 0.3316, + "step": 10359 + }, + { + "epoch": 2.6056338028169015, + "grad_norm": 0.2935522794723511, + "learning_rate": 5.172419342741963e-07, + "loss": 0.3255, + "step": 10360 + }, + { + "epoch": 2.6058853118712273, + "grad_norm": 0.27610835433006287, + "learning_rate": 5.165939963948225e-07, + "loss": 0.3366, + "step": 10361 + }, + { + "epoch": 2.6061368209255535, + "grad_norm": 0.3033030331134796, + "learning_rate": 5.159464424911242e-07, + "loss": 0.3233, + "step": 10362 + }, + { + "epoch": 2.6063883299798793, + "grad_norm": 0.29915034770965576, + "learning_rate": 5.152992726185619e-07, + "loss": 0.3066, + "step": 10363 + }, + { + "epoch": 2.606639839034205, + "grad_norm": 0.3014090955257416, + "learning_rate": 5.146524868325592e-07, + "loss": 0.3178, + "step": 10364 + }, + { + "epoch": 2.6068913480885314, + "grad_norm": 0.31432896852493286, + "learning_rate": 5.140060851885109e-07, + "loss": 0.3133, + "step": 10365 + }, + { + "epoch": 2.607142857142857, + "grad_norm": 0.28758054971694946, + "learning_rate": 5.133600677417782e-07, + "loss": 0.3389, + "step": 10366 + }, + { + "epoch": 2.607394366197183, + "grad_norm": 0.2802176773548126, + "learning_rate": 5.127144345476865e-07, + "loss": 0.2957, + "step": 10367 + }, + { + "epoch": 2.6076458752515093, + "grad_norm": 0.30097344517707825, + "learning_rate": 5.120691856615323e-07, + "loss": 0.3412, + "step": 10368 + }, + { + "epoch": 2.607897384305835, + "grad_norm": 0.28366386890411377, + "learning_rate": 5.114243211385744e-07, + "loss": 0.3119, + "step": 10369 + }, + { + "epoch": 2.608148893360161, + "grad_norm": 0.2719464898109436, + "learning_rate": 5.107798410340442e-07, + "loss": 0.3054, + "step": 10370 + }, + { + "epoch": 2.608400402414487, + "grad_norm": 0.2885161340236664, + "learning_rate": 5.101357454031352e-07, + "loss": 0.3178, + "step": 10371 + }, + { + "epoch": 2.608651911468813, + "grad_norm": 0.29005712270736694, + "learning_rate": 5.094920343010124e-07, + "loss": 0.3128, + "step": 10372 + }, + { + "epoch": 2.6089034205231387, + "grad_norm": 0.27925872802734375, + "learning_rate": 5.08848707782803e-07, + "loss": 0.3333, + "step": 10373 + }, + { + "epoch": 2.609154929577465, + "grad_norm": 0.272777795791626, + "learning_rate": 5.082057659036061e-07, + "loss": 0.3407, + "step": 10374 + }, + { + "epoch": 2.609406438631791, + "grad_norm": 0.2825217843055725, + "learning_rate": 5.07563208718484e-07, + "loss": 0.3213, + "step": 10375 + }, + { + "epoch": 2.6096579476861166, + "grad_norm": 0.2770220637321472, + "learning_rate": 5.069210362824694e-07, + "loss": 0.3095, + "step": 10376 + }, + { + "epoch": 2.609909456740443, + "grad_norm": 0.296953946352005, + "learning_rate": 5.062792486505586e-07, + "loss": 0.3356, + "step": 10377 + }, + { + "epoch": 2.6101609657947686, + "grad_norm": 0.305620402097702, + "learning_rate": 5.056378458777183e-07, + "loss": 0.3265, + "step": 10378 + }, + { + "epoch": 2.6104124748490944, + "grad_norm": 0.2921883463859558, + "learning_rate": 5.049968280188788e-07, + "loss": 0.3275, + "step": 10379 + }, + { + "epoch": 2.6106639839034207, + "grad_norm": 0.31497353315353394, + "learning_rate": 5.043561951289411e-07, + "loss": 0.3168, + "step": 10380 + }, + { + "epoch": 2.6109154929577465, + "grad_norm": 0.2880299687385559, + "learning_rate": 5.0371594726277e-07, + "loss": 0.3245, + "step": 10381 + }, + { + "epoch": 2.6111670020120723, + "grad_norm": 0.2965189814567566, + "learning_rate": 5.030760844752003e-07, + "loss": 0.3289, + "step": 10382 + }, + { + "epoch": 2.6114185110663986, + "grad_norm": 0.29246169328689575, + "learning_rate": 5.024366068210307e-07, + "loss": 0.3146, + "step": 10383 + }, + { + "epoch": 2.6116700201207244, + "grad_norm": 0.2918647229671478, + "learning_rate": 5.017975143550296e-07, + "loss": 0.3293, + "step": 10384 + }, + { + "epoch": 2.61192152917505, + "grad_norm": 0.30444636940956116, + "learning_rate": 5.011588071319295e-07, + "loss": 0.3029, + "step": 10385 + }, + { + "epoch": 2.6121730382293764, + "grad_norm": 0.3097932040691376, + "learning_rate": 5.005204852064344e-07, + "loss": 0.3443, + "step": 10386 + }, + { + "epoch": 2.612424547283702, + "grad_norm": 0.2824104428291321, + "learning_rate": 4.998825486332098e-07, + "loss": 0.3114, + "step": 10387 + }, + { + "epoch": 2.612676056338028, + "grad_norm": 0.27352190017700195, + "learning_rate": 4.992449974668933e-07, + "loss": 0.2861, + "step": 10388 + }, + { + "epoch": 2.6129275653923543, + "grad_norm": 0.31323835253715515, + "learning_rate": 4.986078317620852e-07, + "loss": 0.3132, + "step": 10389 + }, + { + "epoch": 2.61317907444668, + "grad_norm": 0.3092406690120697, + "learning_rate": 4.979710515733566e-07, + "loss": 0.2916, + "step": 10390 + }, + { + "epoch": 2.613430583501006, + "grad_norm": 0.28336817026138306, + "learning_rate": 4.973346569552417e-07, + "loss": 0.314, + "step": 10391 + }, + { + "epoch": 2.613682092555332, + "grad_norm": 0.27843061089515686, + "learning_rate": 4.966986479622454e-07, + "loss": 0.3101, + "step": 10392 + }, + { + "epoch": 2.613933601609658, + "grad_norm": 0.2896105945110321, + "learning_rate": 4.96063024648838e-07, + "loss": 0.3245, + "step": 10393 + }, + { + "epoch": 2.6141851106639837, + "grad_norm": 0.2984098792076111, + "learning_rate": 4.954277870694552e-07, + "loss": 0.3126, + "step": 10394 + }, + { + "epoch": 2.61443661971831, + "grad_norm": 0.3082030117511749, + "learning_rate": 4.947929352785024e-07, + "loss": 0.298, + "step": 10395 + }, + { + "epoch": 2.614688128772636, + "grad_norm": 0.3350622355937958, + "learning_rate": 4.941584693303497e-07, + "loss": 0.3467, + "step": 10396 + }, + { + "epoch": 2.6149396378269616, + "grad_norm": 0.2979256510734558, + "learning_rate": 4.935243892793362e-07, + "loss": 0.3165, + "step": 10397 + }, + { + "epoch": 2.615191146881288, + "grad_norm": 0.3071826100349426, + "learning_rate": 4.928906951797657e-07, + "loss": 0.3366, + "step": 10398 + }, + { + "epoch": 2.6154426559356136, + "grad_norm": 0.2692726254463196, + "learning_rate": 4.922573870859115e-07, + "loss": 0.2949, + "step": 10399 + }, + { + "epoch": 2.6156941649899395, + "grad_norm": 0.2713705003261566, + "learning_rate": 4.916244650520108e-07, + "loss": 0.3231, + "step": 10400 + }, + { + "epoch": 2.6159456740442657, + "grad_norm": 0.282787561416626, + "learning_rate": 4.909919291322718e-07, + "loss": 0.3244, + "step": 10401 + }, + { + "epoch": 2.6161971830985915, + "grad_norm": 0.2967943549156189, + "learning_rate": 4.90359779380864e-07, + "loss": 0.3158, + "step": 10402 + }, + { + "epoch": 2.6164486921529173, + "grad_norm": 0.30457451939582825, + "learning_rate": 4.897280158519307e-07, + "loss": 0.3176, + "step": 10403 + }, + { + "epoch": 2.6167002012072436, + "grad_norm": 0.3144105076789856, + "learning_rate": 4.890966385995754e-07, + "loss": 0.3444, + "step": 10404 + }, + { + "epoch": 2.6169517102615694, + "grad_norm": 0.28179723024368286, + "learning_rate": 4.884656476778738e-07, + "loss": 0.3197, + "step": 10405 + }, + { + "epoch": 2.617203219315895, + "grad_norm": 0.2976565361022949, + "learning_rate": 4.878350431408641e-07, + "loss": 0.32, + "step": 10406 + }, + { + "epoch": 2.6174547283702214, + "grad_norm": 0.2884789705276489, + "learning_rate": 4.872048250425565e-07, + "loss": 0.3244, + "step": 10407 + }, + { + "epoch": 2.6177062374245472, + "grad_norm": 0.30798858404159546, + "learning_rate": 4.865749934369224e-07, + "loss": 0.3102, + "step": 10408 + }, + { + "epoch": 2.617957746478873, + "grad_norm": 0.2883943021297455, + "learning_rate": 4.859455483779041e-07, + "loss": 0.3245, + "step": 10409 + }, + { + "epoch": 2.6182092555331993, + "grad_norm": 0.28204745054244995, + "learning_rate": 4.853164899194107e-07, + "loss": 0.3222, + "step": 10410 + }, + { + "epoch": 2.618460764587525, + "grad_norm": 0.28429892659187317, + "learning_rate": 4.846878181153153e-07, + "loss": 0.3085, + "step": 10411 + }, + { + "epoch": 2.618712273641851, + "grad_norm": 0.2938331663608551, + "learning_rate": 4.840595330194614e-07, + "loss": 0.3001, + "step": 10412 + }, + { + "epoch": 2.618963782696177, + "grad_norm": 0.28804659843444824, + "learning_rate": 4.834316346856565e-07, + "loss": 0.352, + "step": 10413 + }, + { + "epoch": 2.619215291750503, + "grad_norm": 0.2770189642906189, + "learning_rate": 4.828041231676766e-07, + "loss": 0.3067, + "step": 10414 + }, + { + "epoch": 2.6194668008048287, + "grad_norm": 0.2969238758087158, + "learning_rate": 4.821769985192637e-07, + "loss": 0.3361, + "step": 10415 + }, + { + "epoch": 2.619718309859155, + "grad_norm": 0.2775512933731079, + "learning_rate": 4.815502607941286e-07, + "loss": 0.3031, + "step": 10416 + }, + { + "epoch": 2.619969818913481, + "grad_norm": 0.287936806678772, + "learning_rate": 4.809239100459451e-07, + "loss": 0.3292, + "step": 10417 + }, + { + "epoch": 2.6202213279678066, + "grad_norm": 0.2935434877872467, + "learning_rate": 4.80297946328358e-07, + "loss": 0.2995, + "step": 10418 + }, + { + "epoch": 2.620472837022133, + "grad_norm": 0.3020229637622833, + "learning_rate": 4.796723696949762e-07, + "loss": 0.3263, + "step": 10419 + }, + { + "epoch": 2.6207243460764587, + "grad_norm": 0.29107797145843506, + "learning_rate": 4.790471801993768e-07, + "loss": 0.2952, + "step": 10420 + }, + { + "epoch": 2.6209758551307845, + "grad_norm": 0.29532748460769653, + "learning_rate": 4.784223778951042e-07, + "loss": 0.3259, + "step": 10421 + }, + { + "epoch": 2.6212273641851107, + "grad_norm": 0.29053303599357605, + "learning_rate": 4.777979628356672e-07, + "loss": 0.3274, + "step": 10422 + }, + { + "epoch": 2.6214788732394365, + "grad_norm": 0.3015473783016205, + "learning_rate": 4.771739350745447e-07, + "loss": 0.3069, + "step": 10423 + }, + { + "epoch": 2.6217303822937623, + "grad_norm": 0.29153281450271606, + "learning_rate": 4.7655029466517897e-07, + "loss": 0.3218, + "step": 10424 + }, + { + "epoch": 2.6219818913480886, + "grad_norm": 0.30175068974494934, + "learning_rate": 4.759270416609829e-07, + "loss": 0.3358, + "step": 10425 + }, + { + "epoch": 2.6222334004024144, + "grad_norm": 0.2931424081325531, + "learning_rate": 4.753041761153326e-07, + "loss": 0.3102, + "step": 10426 + }, + { + "epoch": 2.62248490945674, + "grad_norm": 0.2768230438232422, + "learning_rate": 4.746816980815738e-07, + "loss": 0.3365, + "step": 10427 + }, + { + "epoch": 2.6227364185110664, + "grad_norm": 0.2788086235523224, + "learning_rate": 4.7405960761301606e-07, + "loss": 0.3253, + "step": 10428 + }, + { + "epoch": 2.6229879275653922, + "grad_norm": 0.27975523471832275, + "learning_rate": 4.7343790476294005e-07, + "loss": 0.3132, + "step": 10429 + }, + { + "epoch": 2.623239436619718, + "grad_norm": 0.28822508454322815, + "learning_rate": 4.7281658958458877e-07, + "loss": 0.3114, + "step": 10430 + }, + { + "epoch": 2.6234909456740443, + "grad_norm": 0.29682502150535583, + "learning_rate": 4.7219566213117406e-07, + "loss": 0.326, + "step": 10431 + }, + { + "epoch": 2.62374245472837, + "grad_norm": 0.290158748626709, + "learning_rate": 4.7157512245587623e-07, + "loss": 0.3218, + "step": 10432 + }, + { + "epoch": 2.6239939637826963, + "grad_norm": 0.3001720607280731, + "learning_rate": 4.7095497061183826e-07, + "loss": 0.3339, + "step": 10433 + }, + { + "epoch": 2.624245472837022, + "grad_norm": 0.2894241511821747, + "learning_rate": 4.703352066521749e-07, + "loss": 0.3424, + "step": 10434 + }, + { + "epoch": 2.624496981891348, + "grad_norm": 0.2992819547653198, + "learning_rate": 4.697158306299621e-07, + "loss": 0.2913, + "step": 10435 + }, + { + "epoch": 2.624748490945674, + "grad_norm": 0.2854137420654297, + "learning_rate": 4.6909684259824785e-07, + "loss": 0.3385, + "step": 10436 + }, + { + "epoch": 2.625, + "grad_norm": 0.28658536076545715, + "learning_rate": 4.6847824261004313e-07, + "loss": 0.3146, + "step": 10437 + }, + { + "epoch": 2.625251509054326, + "grad_norm": 0.2668541371822357, + "learning_rate": 4.6786003071832895e-07, + "loss": 0.3246, + "step": 10438 + }, + { + "epoch": 2.625503018108652, + "grad_norm": 0.3052661120891571, + "learning_rate": 4.6724220697604904e-07, + "loss": 0.3195, + "step": 10439 + }, + { + "epoch": 2.625754527162978, + "grad_norm": 0.2899504005908966, + "learning_rate": 4.666247714361183e-07, + "loss": 0.3266, + "step": 10440 + }, + { + "epoch": 2.6260060362173037, + "grad_norm": 0.3089952766895294, + "learning_rate": 4.6600772415141437e-07, + "loss": 0.3129, + "step": 10441 + }, + { + "epoch": 2.62625754527163, + "grad_norm": 0.281039834022522, + "learning_rate": 4.6539106517478394e-07, + "loss": 0.2899, + "step": 10442 + }, + { + "epoch": 2.6265090543259557, + "grad_norm": 0.2719120383262634, + "learning_rate": 4.647747945590414e-07, + "loss": 0.3245, + "step": 10443 + }, + { + "epoch": 2.626760563380282, + "grad_norm": 0.2826182544231415, + "learning_rate": 4.6415891235696453e-07, + "loss": 0.3268, + "step": 10444 + }, + { + "epoch": 2.6270120724346078, + "grad_norm": 0.30722248554229736, + "learning_rate": 4.635434186213017e-07, + "loss": 0.3347, + "step": 10445 + }, + { + "epoch": 2.6272635814889336, + "grad_norm": 0.2918623685836792, + "learning_rate": 4.6292831340476406e-07, + "loss": 0.3351, + "step": 10446 + }, + { + "epoch": 2.62751509054326, + "grad_norm": 0.29145464301109314, + "learning_rate": 4.623135967600334e-07, + "loss": 0.3048, + "step": 10447 + }, + { + "epoch": 2.6277665995975856, + "grad_norm": 0.298068106174469, + "learning_rate": 4.616992687397548e-07, + "loss": 0.2789, + "step": 10448 + }, + { + "epoch": 2.6280181086519114, + "grad_norm": 0.3069632649421692, + "learning_rate": 4.610853293965434e-07, + "loss": 0.3214, + "step": 10449 + }, + { + "epoch": 2.6282696177062377, + "grad_norm": 0.28947851061820984, + "learning_rate": 4.6047177878297654e-07, + "loss": 0.3121, + "step": 10450 + }, + { + "epoch": 2.6285211267605635, + "grad_norm": 0.2965468168258667, + "learning_rate": 4.5985861695160393e-07, + "loss": 0.3294, + "step": 10451 + }, + { + "epoch": 2.6287726358148893, + "grad_norm": 0.2832191288471222, + "learning_rate": 4.592458439549369e-07, + "loss": 0.3213, + "step": 10452 + }, + { + "epoch": 2.6290241448692155, + "grad_norm": 0.3064347803592682, + "learning_rate": 4.586334598454567e-07, + "loss": 0.3124, + "step": 10453 + }, + { + "epoch": 2.6292756539235413, + "grad_norm": 0.3119833469390869, + "learning_rate": 4.5802146467561035e-07, + "loss": 0.3171, + "step": 10454 + }, + { + "epoch": 2.629527162977867, + "grad_norm": 0.2910575568675995, + "learning_rate": 4.574098584978104e-07, + "loss": 0.2897, + "step": 10455 + }, + { + "epoch": 2.6297786720321934, + "grad_norm": 0.30219748616218567, + "learning_rate": 4.5679864136443874e-07, + "loss": 0.3135, + "step": 10456 + }, + { + "epoch": 2.630030181086519, + "grad_norm": 0.30393391847610474, + "learning_rate": 4.5618781332784026e-07, + "loss": 0.343, + "step": 10457 + }, + { + "epoch": 2.630281690140845, + "grad_norm": 0.3024074137210846, + "learning_rate": 4.5557737444033025e-07, + "loss": 0.3359, + "step": 10458 + }, + { + "epoch": 2.6305331991951713, + "grad_norm": 0.2811712324619293, + "learning_rate": 4.549673247541875e-07, + "loss": 0.3059, + "step": 10459 + }, + { + "epoch": 2.630784708249497, + "grad_norm": 0.2942906320095062, + "learning_rate": 4.543576643216607e-07, + "loss": 0.3207, + "step": 10460 + }, + { + "epoch": 2.631036217303823, + "grad_norm": 0.2751673758029938, + "learning_rate": 4.5374839319496156e-07, + "loss": 0.3229, + "step": 10461 + }, + { + "epoch": 2.631287726358149, + "grad_norm": 0.2935747504234314, + "learning_rate": 4.531395114262721e-07, + "loss": 0.3112, + "step": 10462 + }, + { + "epoch": 2.631539235412475, + "grad_norm": 0.27842676639556885, + "learning_rate": 4.525310190677379e-07, + "loss": 0.3132, + "step": 10463 + }, + { + "epoch": 2.6317907444668007, + "grad_norm": 0.27436089515686035, + "learning_rate": 4.5192291617147274e-07, + "loss": 0.3149, + "step": 10464 + }, + { + "epoch": 2.632042253521127, + "grad_norm": 0.27890241146087646, + "learning_rate": 4.5131520278955785e-07, + "loss": 0.2893, + "step": 10465 + }, + { + "epoch": 2.6322937625754528, + "grad_norm": 0.27693304419517517, + "learning_rate": 4.507078789740388e-07, + "loss": 0.315, + "step": 10466 + }, + { + "epoch": 2.6325452716297786, + "grad_norm": 0.309488445520401, + "learning_rate": 4.5010094477693057e-07, + "loss": 0.3331, + "step": 10467 + }, + { + "epoch": 2.632796780684105, + "grad_norm": 0.29617559909820557, + "learning_rate": 4.4949440025021105e-07, + "loss": 0.3234, + "step": 10468 + }, + { + "epoch": 2.6330482897384306, + "grad_norm": 0.2881315350532532, + "learning_rate": 4.488882454458299e-07, + "loss": 0.3431, + "step": 10469 + }, + { + "epoch": 2.6332997987927564, + "grad_norm": 0.3251035213470459, + "learning_rate": 4.482824804156971e-07, + "loss": 0.3014, + "step": 10470 + }, + { + "epoch": 2.6335513078470827, + "grad_norm": 0.3029896020889282, + "learning_rate": 4.476771052116957e-07, + "loss": 0.2974, + "step": 10471 + }, + { + "epoch": 2.6338028169014085, + "grad_norm": 0.2852858901023865, + "learning_rate": 4.4707211988567036e-07, + "loss": 0.2965, + "step": 10472 + }, + { + "epoch": 2.6340543259557343, + "grad_norm": 0.2950040400028229, + "learning_rate": 4.464675244894351e-07, + "loss": 0.3049, + "step": 10473 + }, + { + "epoch": 2.6343058350100605, + "grad_norm": 0.3013119399547577, + "learning_rate": 4.458633190747691e-07, + "loss": 0.3099, + "step": 10474 + }, + { + "epoch": 2.6345573440643864, + "grad_norm": 0.30011510848999023, + "learning_rate": 4.452595036934193e-07, + "loss": 0.3137, + "step": 10475 + }, + { + "epoch": 2.634808853118712, + "grad_norm": 0.2907141149044037, + "learning_rate": 4.4465607839709934e-07, + "loss": 0.3165, + "step": 10476 + }, + { + "epoch": 2.6350603621730384, + "grad_norm": 0.28316307067871094, + "learning_rate": 4.440530432374873e-07, + "loss": 0.3043, + "step": 10477 + }, + { + "epoch": 2.635311871227364, + "grad_norm": 0.315193772315979, + "learning_rate": 4.434503982662314e-07, + "loss": 0.3224, + "step": 10478 + }, + { + "epoch": 2.63556338028169, + "grad_norm": 0.27484557032585144, + "learning_rate": 4.4284814353494187e-07, + "loss": 0.2935, + "step": 10479 + }, + { + "epoch": 2.6358148893360163, + "grad_norm": 0.30151981115341187, + "learning_rate": 4.4224627909520034e-07, + "loss": 0.3119, + "step": 10480 + }, + { + "epoch": 2.636066398390342, + "grad_norm": 0.2999078631401062, + "learning_rate": 4.4164480499855114e-07, + "loss": 0.3054, + "step": 10481 + }, + { + "epoch": 2.636317907444668, + "grad_norm": 0.30388715863227844, + "learning_rate": 4.410437212965085e-07, + "loss": 0.3041, + "step": 10482 + }, + { + "epoch": 2.636569416498994, + "grad_norm": 0.2887566387653351, + "learning_rate": 4.404430280405492e-07, + "loss": 0.2967, + "step": 10483 + }, + { + "epoch": 2.63682092555332, + "grad_norm": 0.27546751499176025, + "learning_rate": 4.3984272528212077e-07, + "loss": 0.3144, + "step": 10484 + }, + { + "epoch": 2.6370724346076457, + "grad_norm": 0.2938852310180664, + "learning_rate": 4.3924281307263397e-07, + "loss": 0.3284, + "step": 10485 + }, + { + "epoch": 2.637323943661972, + "grad_norm": 0.30135706067085266, + "learning_rate": 4.3864329146346804e-07, + "loss": 0.3305, + "step": 10486 + }, + { + "epoch": 2.637575452716298, + "grad_norm": 0.2923177182674408, + "learning_rate": 4.3804416050596933e-07, + "loss": 0.3229, + "step": 10487 + }, + { + "epoch": 2.6378269617706236, + "grad_norm": 0.28316444158554077, + "learning_rate": 4.374454202514483e-07, + "loss": 0.3427, + "step": 10488 + }, + { + "epoch": 2.63807847082495, + "grad_norm": 0.28909602761268616, + "learning_rate": 4.3684707075118403e-07, + "loss": 0.3228, + "step": 10489 + }, + { + "epoch": 2.6383299798792756, + "grad_norm": 0.28920796513557434, + "learning_rate": 4.362491120564205e-07, + "loss": 0.3346, + "step": 10490 + }, + { + "epoch": 2.6385814889336014, + "grad_norm": 0.2695193290710449, + "learning_rate": 4.356515442183712e-07, + "loss": 0.3424, + "step": 10491 + }, + { + "epoch": 2.6388329979879277, + "grad_norm": 0.28552255034446716, + "learning_rate": 4.3505436728821125e-07, + "loss": 0.3054, + "step": 10492 + }, + { + "epoch": 2.6390845070422535, + "grad_norm": 0.27092212438583374, + "learning_rate": 4.344575813170876e-07, + "loss": 0.3152, + "step": 10493 + }, + { + "epoch": 2.6393360160965793, + "grad_norm": 0.29936766624450684, + "learning_rate": 4.3386118635610875e-07, + "loss": 0.3311, + "step": 10494 + }, + { + "epoch": 2.6395875251509056, + "grad_norm": 0.2581084966659546, + "learning_rate": 4.3326518245635494e-07, + "loss": 0.3198, + "step": 10495 + }, + { + "epoch": 2.6398390342052314, + "grad_norm": 0.27823901176452637, + "learning_rate": 4.326695696688682e-07, + "loss": 0.3174, + "step": 10496 + }, + { + "epoch": 2.640090543259557, + "grad_norm": 0.2890242636203766, + "learning_rate": 4.320743480446593e-07, + "loss": 0.3019, + "step": 10497 + }, + { + "epoch": 2.6403420523138834, + "grad_norm": 0.2868576645851135, + "learning_rate": 4.31479517634707e-07, + "loss": 0.3138, + "step": 10498 + }, + { + "epoch": 2.640593561368209, + "grad_norm": 0.3102249205112457, + "learning_rate": 4.308850784899521e-07, + "loss": 0.3264, + "step": 10499 + }, + { + "epoch": 2.640845070422535, + "grad_norm": 0.27739331126213074, + "learning_rate": 4.3029103066130673e-07, + "loss": 0.3361, + "step": 10500 + }, + { + "epoch": 2.6410965794768613, + "grad_norm": 0.28932255506515503, + "learning_rate": 4.296973741996463e-07, + "loss": 0.3229, + "step": 10501 + }, + { + "epoch": 2.641348088531187, + "grad_norm": 0.2557661235332489, + "learning_rate": 4.291041091558146e-07, + "loss": 0.3158, + "step": 10502 + }, + { + "epoch": 2.641599597585513, + "grad_norm": 0.282735675573349, + "learning_rate": 4.2851123558061927e-07, + "loss": 0.3341, + "step": 10503 + }, + { + "epoch": 2.641851106639839, + "grad_norm": 0.2887149155139923, + "learning_rate": 4.2791875352483857e-07, + "loss": 0.3283, + "step": 10504 + }, + { + "epoch": 2.642102615694165, + "grad_norm": 0.29206418991088867, + "learning_rate": 4.273266630392131e-07, + "loss": 0.3085, + "step": 10505 + }, + { + "epoch": 2.6423541247484907, + "grad_norm": 0.26142215728759766, + "learning_rate": 4.267349641744534e-07, + "loss": 0.3334, + "step": 10506 + }, + { + "epoch": 2.642605633802817, + "grad_norm": 0.2744672894477844, + "learning_rate": 4.261436569812322e-07, + "loss": 0.3323, + "step": 10507 + }, + { + "epoch": 2.642857142857143, + "grad_norm": 0.2745734453201294, + "learning_rate": 4.2555274151019456e-07, + "loss": 0.335, + "step": 10508 + }, + { + "epoch": 2.6431086519114686, + "grad_norm": 0.2822088897228241, + "learning_rate": 4.2496221781194557e-07, + "loss": 0.3042, + "step": 10509 + }, + { + "epoch": 2.643360160965795, + "grad_norm": 0.2912130653858185, + "learning_rate": 4.2437208593706204e-07, + "loss": 0.3195, + "step": 10510 + }, + { + "epoch": 2.6436116700201207, + "grad_norm": 0.284824401140213, + "learning_rate": 4.2378234593608346e-07, + "loss": 0.3012, + "step": 10511 + }, + { + "epoch": 2.6438631790744465, + "grad_norm": 0.28413233160972595, + "learning_rate": 4.231929978595195e-07, + "loss": 0.3298, + "step": 10512 + }, + { + "epoch": 2.6441146881287727, + "grad_norm": 0.3148362636566162, + "learning_rate": 4.226040417578414e-07, + "loss": 0.3313, + "step": 10513 + }, + { + "epoch": 2.6443661971830985, + "grad_norm": 0.3029150366783142, + "learning_rate": 4.2201547768149277e-07, + "loss": 0.319, + "step": 10514 + }, + { + "epoch": 2.6446177062374243, + "grad_norm": 0.3129327595233917, + "learning_rate": 4.214273056808771e-07, + "loss": 0.2861, + "step": 10515 + }, + { + "epoch": 2.6448692152917506, + "grad_norm": 0.2981908917427063, + "learning_rate": 4.208395258063702e-07, + "loss": 0.3343, + "step": 10516 + }, + { + "epoch": 2.6451207243460764, + "grad_norm": 0.2968314588069916, + "learning_rate": 4.202521381083102e-07, + "loss": 0.3344, + "step": 10517 + }, + { + "epoch": 2.645372233400402, + "grad_norm": 0.30850720405578613, + "learning_rate": 4.196651426370041e-07, + "loss": 0.3308, + "step": 10518 + }, + { + "epoch": 2.6456237424547284, + "grad_norm": 0.2996951937675476, + "learning_rate": 4.190785394427238e-07, + "loss": 0.3206, + "step": 10519 + }, + { + "epoch": 2.6458752515090542, + "grad_norm": 0.2981989085674286, + "learning_rate": 4.184923285757092e-07, + "loss": 0.334, + "step": 10520 + }, + { + "epoch": 2.64612676056338, + "grad_norm": 0.26357463002204895, + "learning_rate": 4.1790651008616343e-07, + "loss": 0.335, + "step": 10521 + }, + { + "epoch": 2.6463782696177063, + "grad_norm": 0.2741795480251312, + "learning_rate": 4.1732108402426087e-07, + "loss": 0.3304, + "step": 10522 + }, + { + "epoch": 2.646629778672032, + "grad_norm": 0.292894572019577, + "learning_rate": 4.167360504401374e-07, + "loss": 0.3099, + "step": 10523 + }, + { + "epoch": 2.646881287726358, + "grad_norm": 0.30272337794303894, + "learning_rate": 4.161514093838992e-07, + "loss": 0.3106, + "step": 10524 + }, + { + "epoch": 2.647132796780684, + "grad_norm": 0.31396594643592834, + "learning_rate": 4.155671609056156e-07, + "loss": 0.314, + "step": 10525 + }, + { + "epoch": 2.64738430583501, + "grad_norm": 0.29578897356987, + "learning_rate": 4.1498330505532533e-07, + "loss": 0.3251, + "step": 10526 + }, + { + "epoch": 2.6476358148893357, + "grad_norm": 0.3138672709465027, + "learning_rate": 4.1439984188303027e-07, + "loss": 0.3185, + "step": 10527 + }, + { + "epoch": 2.647887323943662, + "grad_norm": 0.28872033953666687, + "learning_rate": 4.1381677143870147e-07, + "loss": 0.3277, + "step": 10528 + }, + { + "epoch": 2.648138832997988, + "grad_norm": 0.28270837664604187, + "learning_rate": 4.1323409377227565e-07, + "loss": 0.3312, + "step": 10529 + }, + { + "epoch": 2.6483903420523136, + "grad_norm": 0.3057926595211029, + "learning_rate": 4.1265180893365453e-07, + "loss": 0.3222, + "step": 10530 + }, + { + "epoch": 2.64864185110664, + "grad_norm": 0.30965495109558105, + "learning_rate": 4.1206991697270825e-07, + "loss": 0.3254, + "step": 10531 + }, + { + "epoch": 2.6488933601609657, + "grad_norm": 0.2973712086677551, + "learning_rate": 4.114884179392709e-07, + "loss": 0.332, + "step": 10532 + }, + { + "epoch": 2.649144869215292, + "grad_norm": 0.26278215646743774, + "learning_rate": 4.1090731188314583e-07, + "loss": 0.32, + "step": 10533 + }, + { + "epoch": 2.6493963782696177, + "grad_norm": 0.28730037808418274, + "learning_rate": 4.103265988540989e-07, + "loss": 0.3395, + "step": 10534 + }, + { + "epoch": 2.6496478873239435, + "grad_norm": 0.2832060754299164, + "learning_rate": 4.0974627890186745e-07, + "loss": 0.2925, + "step": 10535 + }, + { + "epoch": 2.6498993963782698, + "grad_norm": 0.28094425797462463, + "learning_rate": 4.0916635207614906e-07, + "loss": 0.3131, + "step": 10536 + }, + { + "epoch": 2.6501509054325956, + "grad_norm": 0.2945941984653473, + "learning_rate": 4.0858681842661395e-07, + "loss": 0.3151, + "step": 10537 + }, + { + "epoch": 2.6504024144869214, + "grad_norm": 0.28159478306770325, + "learning_rate": 4.080076780028924e-07, + "loss": 0.3046, + "step": 10538 + }, + { + "epoch": 2.6506539235412476, + "grad_norm": 0.3074738085269928, + "learning_rate": 4.0742893085458644e-07, + "loss": 0.3226, + "step": 10539 + }, + { + "epoch": 2.6509054325955734, + "grad_norm": 0.28362908959388733, + "learning_rate": 4.068505770312625e-07, + "loss": 0.3322, + "step": 10540 + }, + { + "epoch": 2.6511569416498992, + "grad_norm": 0.3007882833480835, + "learning_rate": 4.062726165824504e-07, + "loss": 0.3386, + "step": 10541 + }, + { + "epoch": 2.6514084507042255, + "grad_norm": 0.3021930158138275, + "learning_rate": 4.0569504955765227e-07, + "loss": 0.3079, + "step": 10542 + }, + { + "epoch": 2.6516599597585513, + "grad_norm": 0.2768537700176239, + "learning_rate": 4.0511787600632955e-07, + "loss": 0.3495, + "step": 10543 + }, + { + "epoch": 2.6519114688128775, + "grad_norm": 0.2638303339481354, + "learning_rate": 4.045410959779167e-07, + "loss": 0.3116, + "step": 10544 + }, + { + "epoch": 2.6521629778672033, + "grad_norm": 0.27994871139526367, + "learning_rate": 4.0396470952180857e-07, + "loss": 0.3143, + "step": 10545 + }, + { + "epoch": 2.652414486921529, + "grad_norm": 0.286160409450531, + "learning_rate": 4.033887166873712e-07, + "loss": 0.3084, + "step": 10546 + }, + { + "epoch": 2.6526659959758554, + "grad_norm": 0.28356102108955383, + "learning_rate": 4.028131175239336e-07, + "loss": 0.3367, + "step": 10547 + }, + { + "epoch": 2.652917505030181, + "grad_norm": 0.31378769874572754, + "learning_rate": 4.022379120807929e-07, + "loss": 0.3136, + "step": 10548 + }, + { + "epoch": 2.653169014084507, + "grad_norm": 0.29306289553642273, + "learning_rate": 4.016631004072108e-07, + "loss": 0.3319, + "step": 10549 + }, + { + "epoch": 2.6534205231388333, + "grad_norm": 0.2860982418060303, + "learning_rate": 4.010886825524174e-07, + "loss": 0.3403, + "step": 10550 + }, + { + "epoch": 2.653672032193159, + "grad_norm": 0.29843586683273315, + "learning_rate": 4.0051465856560836e-07, + "loss": 0.2994, + "step": 10551 + }, + { + "epoch": 2.653923541247485, + "grad_norm": 0.29828014969825745, + "learning_rate": 3.999410284959432e-07, + "loss": 0.3126, + "step": 10552 + }, + { + "epoch": 2.654175050301811, + "grad_norm": 0.2804122567176819, + "learning_rate": 3.9936779239255207e-07, + "loss": 0.3178, + "step": 10553 + }, + { + "epoch": 2.654426559356137, + "grad_norm": 0.29577210545539856, + "learning_rate": 3.987949503045274e-07, + "loss": 0.326, + "step": 10554 + }, + { + "epoch": 2.6546780684104627, + "grad_norm": 0.2835969030857086, + "learning_rate": 3.982225022809311e-07, + "loss": 0.3026, + "step": 10555 + }, + { + "epoch": 2.654929577464789, + "grad_norm": 0.27092668414115906, + "learning_rate": 3.9765044837078825e-07, + "loss": 0.2987, + "step": 10556 + }, + { + "epoch": 2.6551810865191148, + "grad_norm": 0.28689372539520264, + "learning_rate": 3.970787886230926e-07, + "loss": 0.3423, + "step": 10557 + }, + { + "epoch": 2.6554325955734406, + "grad_norm": 0.294495165348053, + "learning_rate": 3.965075230868026e-07, + "loss": 0.3245, + "step": 10558 + }, + { + "epoch": 2.655684104627767, + "grad_norm": 0.2807731628417969, + "learning_rate": 3.9593665181084427e-07, + "loss": 0.3214, + "step": 10559 + }, + { + "epoch": 2.6559356136820926, + "grad_norm": 0.29218292236328125, + "learning_rate": 3.953661748441079e-07, + "loss": 0.2978, + "step": 10560 + }, + { + "epoch": 2.6561871227364184, + "grad_norm": 0.3011685013771057, + "learning_rate": 3.947960922354527e-07, + "loss": 0.3007, + "step": 10561 + }, + { + "epoch": 2.6564386317907447, + "grad_norm": 0.2898184359073639, + "learning_rate": 3.9422640403370236e-07, + "loss": 0.3316, + "step": 10562 + }, + { + "epoch": 2.6566901408450705, + "grad_norm": 0.28175485134124756, + "learning_rate": 3.936571102876463e-07, + "loss": 0.319, + "step": 10563 + }, + { + "epoch": 2.6569416498993963, + "grad_norm": 0.2649247944355011, + "learning_rate": 3.930882110460421e-07, + "loss": 0.3151, + "step": 10564 + }, + { + "epoch": 2.6571931589537225, + "grad_norm": 0.2886830270290375, + "learning_rate": 3.925197063576114e-07, + "loss": 0.3035, + "step": 10565 + }, + { + "epoch": 2.6574446680080483, + "grad_norm": 0.28880345821380615, + "learning_rate": 3.9195159627104465e-07, + "loss": 0.3268, + "step": 10566 + }, + { + "epoch": 2.657696177062374, + "grad_norm": 0.2972525954246521, + "learning_rate": 3.913838808349946e-07, + "loss": 0.3078, + "step": 10567 + }, + { + "epoch": 2.6579476861167004, + "grad_norm": 0.298945814371109, + "learning_rate": 3.908165600980845e-07, + "loss": 0.3142, + "step": 10568 + }, + { + "epoch": 2.658199195171026, + "grad_norm": 0.3075450658798218, + "learning_rate": 3.9024963410890015e-07, + "loss": 0.3152, + "step": 10569 + }, + { + "epoch": 2.658450704225352, + "grad_norm": 0.31689146161079407, + "learning_rate": 3.8968310291599753e-07, + "loss": 0.3172, + "step": 10570 + }, + { + "epoch": 2.6587022132796783, + "grad_norm": 0.2927457094192505, + "learning_rate": 3.89116966567894e-07, + "loss": 0.3291, + "step": 10571 + }, + { + "epoch": 2.658953722334004, + "grad_norm": 0.2956581115722656, + "learning_rate": 3.885512251130763e-07, + "loss": 0.3041, + "step": 10572 + }, + { + "epoch": 2.65920523138833, + "grad_norm": 0.29633039236068726, + "learning_rate": 3.879858785999979e-07, + "loss": 0.3256, + "step": 10573 + }, + { + "epoch": 2.659456740442656, + "grad_norm": 0.2878909707069397, + "learning_rate": 3.874209270770762e-07, + "loss": 0.3356, + "step": 10574 + }, + { + "epoch": 2.659708249496982, + "grad_norm": 0.3077471852302551, + "learning_rate": 3.8685637059269587e-07, + "loss": 0.3131, + "step": 10575 + }, + { + "epoch": 2.6599597585513077, + "grad_norm": 0.28095880150794983, + "learning_rate": 3.862922091952076e-07, + "loss": 0.3165, + "step": 10576 + }, + { + "epoch": 2.660211267605634, + "grad_norm": 0.3052351772785187, + "learning_rate": 3.857284429329289e-07, + "loss": 0.3114, + "step": 10577 + }, + { + "epoch": 2.66046277665996, + "grad_norm": 0.3149491250514984, + "learning_rate": 3.851650718541411e-07, + "loss": 0.3186, + "step": 10578 + }, + { + "epoch": 2.6607142857142856, + "grad_norm": 0.30292919278144836, + "learning_rate": 3.846020960070956e-07, + "loss": 0.3441, + "step": 10579 + }, + { + "epoch": 2.660965794768612, + "grad_norm": 0.3036801218986511, + "learning_rate": 3.8403951544000617e-07, + "loss": 0.3152, + "step": 10580 + }, + { + "epoch": 2.6612173038229376, + "grad_norm": 0.2941226661205292, + "learning_rate": 3.834773302010553e-07, + "loss": 0.3112, + "step": 10581 + }, + { + "epoch": 2.6614688128772634, + "grad_norm": 0.30887505412101746, + "learning_rate": 3.829155403383894e-07, + "loss": 0.3293, + "step": 10582 + }, + { + "epoch": 2.6617203219315897, + "grad_norm": 0.28087976574897766, + "learning_rate": 3.823541459001234e-07, + "loss": 0.3246, + "step": 10583 + }, + { + "epoch": 2.6619718309859155, + "grad_norm": 0.28773432970046997, + "learning_rate": 3.8179314693433775e-07, + "loss": 0.3033, + "step": 10584 + }, + { + "epoch": 2.6622233400402413, + "grad_norm": 0.27361011505126953, + "learning_rate": 3.8123254348907676e-07, + "loss": 0.3338, + "step": 10585 + }, + { + "epoch": 2.6624748490945676, + "grad_norm": 0.28717923164367676, + "learning_rate": 3.806723356123543e-07, + "loss": 0.3316, + "step": 10586 + }, + { + "epoch": 2.6627263581488934, + "grad_norm": 0.2967682182788849, + "learning_rate": 3.801125233521469e-07, + "loss": 0.3631, + "step": 10587 + }, + { + "epoch": 2.662977867203219, + "grad_norm": 0.25790077447891235, + "learning_rate": 3.7955310675640066e-07, + "loss": 0.3155, + "step": 10588 + }, + { + "epoch": 2.6632293762575454, + "grad_norm": 0.25773194432258606, + "learning_rate": 3.789940858730251e-07, + "loss": 0.2978, + "step": 10589 + }, + { + "epoch": 2.663480885311871, + "grad_norm": 0.295448899269104, + "learning_rate": 3.7843546074989747e-07, + "loss": 0.3467, + "step": 10590 + }, + { + "epoch": 2.663732394366197, + "grad_norm": 0.27285072207450867, + "learning_rate": 3.778772314348594e-07, + "loss": 0.3165, + "step": 10591 + }, + { + "epoch": 2.6639839034205233, + "grad_norm": 0.2875916361808777, + "learning_rate": 3.773193979757217e-07, + "loss": 0.2996, + "step": 10592 + }, + { + "epoch": 2.664235412474849, + "grad_norm": 0.3071140646934509, + "learning_rate": 3.7676196042025715e-07, + "loss": 0.3157, + "step": 10593 + }, + { + "epoch": 2.664486921529175, + "grad_norm": 0.3103017807006836, + "learning_rate": 3.7620491881620814e-07, + "loss": 0.3411, + "step": 10594 + }, + { + "epoch": 2.664738430583501, + "grad_norm": 0.3201421797275543, + "learning_rate": 3.7564827321128203e-07, + "loss": 0.3409, + "step": 10595 + }, + { + "epoch": 2.664989939637827, + "grad_norm": 0.2804863154888153, + "learning_rate": 3.750920236531502e-07, + "loss": 0.3188, + "step": 10596 + }, + { + "epoch": 2.6652414486921527, + "grad_norm": 0.30754902958869934, + "learning_rate": 3.7453617018945453e-07, + "loss": 0.3183, + "step": 10597 + }, + { + "epoch": 2.665492957746479, + "grad_norm": 0.2903851866722107, + "learning_rate": 3.739807128677986e-07, + "loss": 0.3096, + "step": 10598 + }, + { + "epoch": 2.665744466800805, + "grad_norm": 0.2837523818016052, + "learning_rate": 3.734256517357543e-07, + "loss": 0.3035, + "step": 10599 + }, + { + "epoch": 2.6659959758551306, + "grad_norm": 0.3047906160354614, + "learning_rate": 3.7287098684085867e-07, + "loss": 0.3157, + "step": 10600 + }, + { + "epoch": 2.666247484909457, + "grad_norm": 0.2783104181289673, + "learning_rate": 3.72316718230617e-07, + "loss": 0.3306, + "step": 10601 + }, + { + "epoch": 2.6664989939637826, + "grad_norm": 0.27863505482673645, + "learning_rate": 3.717628459524963e-07, + "loss": 0.3356, + "step": 10602 + }, + { + "epoch": 2.6667505030181085, + "grad_norm": 0.2805593013763428, + "learning_rate": 3.7120937005393487e-07, + "loss": 0.3201, + "step": 10603 + }, + { + "epoch": 2.6670020120724347, + "grad_norm": 0.30254265666007996, + "learning_rate": 3.7065629058233245e-07, + "loss": 0.2921, + "step": 10604 + }, + { + "epoch": 2.6672535211267605, + "grad_norm": 0.30693769454956055, + "learning_rate": 3.701036075850578e-07, + "loss": 0.3277, + "step": 10605 + }, + { + "epoch": 2.6675050301810863, + "grad_norm": 0.29164475202560425, + "learning_rate": 3.695513211094448e-07, + "loss": 0.3082, + "step": 10606 + }, + { + "epoch": 2.6677565392354126, + "grad_norm": 0.2975282371044159, + "learning_rate": 3.689994312027928e-07, + "loss": 0.3147, + "step": 10607 + }, + { + "epoch": 2.6680080482897384, + "grad_norm": 0.2964397072792053, + "learning_rate": 3.6844793791236897e-07, + "loss": 0.3245, + "step": 10608 + }, + { + "epoch": 2.668259557344064, + "grad_norm": 0.29004722833633423, + "learning_rate": 3.678968412854034e-07, + "loss": 0.3261, + "step": 10609 + }, + { + "epoch": 2.6685110663983904, + "grad_norm": 0.27896592020988464, + "learning_rate": 3.67346141369096e-07, + "loss": 0.3172, + "step": 10610 + }, + { + "epoch": 2.6687625754527162, + "grad_norm": 0.2943844497203827, + "learning_rate": 3.6679583821060904e-07, + "loss": 0.3059, + "step": 10611 + }, + { + "epoch": 2.669014084507042, + "grad_norm": 0.3017599880695343, + "learning_rate": 3.662459318570738e-07, + "loss": 0.3441, + "step": 10612 + }, + { + "epoch": 2.6692655935613683, + "grad_norm": 0.29490116238594055, + "learning_rate": 3.656964223555848e-07, + "loss": 0.3213, + "step": 10613 + }, + { + "epoch": 2.669517102615694, + "grad_norm": 0.30418360233306885, + "learning_rate": 3.6514730975320657e-07, + "loss": 0.3226, + "step": 10614 + }, + { + "epoch": 2.66976861167002, + "grad_norm": 0.2704751789569855, + "learning_rate": 3.645985940969643e-07, + "loss": 0.3592, + "step": 10615 + }, + { + "epoch": 2.670020120724346, + "grad_norm": 0.27038201689720154, + "learning_rate": 3.6405027543385374e-07, + "loss": 0.3128, + "step": 10616 + }, + { + "epoch": 2.670271629778672, + "grad_norm": 0.3021993637084961, + "learning_rate": 3.6350235381083563e-07, + "loss": 0.3064, + "step": 10617 + }, + { + "epoch": 2.6705231388329977, + "grad_norm": 0.2776448428630829, + "learning_rate": 3.629548292748342e-07, + "loss": 0.3311, + "step": 10618 + }, + { + "epoch": 2.670774647887324, + "grad_norm": 0.2895011901855469, + "learning_rate": 3.624077018727429e-07, + "loss": 0.2909, + "step": 10619 + }, + { + "epoch": 2.67102615694165, + "grad_norm": 0.27862903475761414, + "learning_rate": 3.6186097165141875e-07, + "loss": 0.3213, + "step": 10620 + }, + { + "epoch": 2.6712776659959756, + "grad_norm": 0.3124517798423767, + "learning_rate": 3.6131463865768714e-07, + "loss": 0.3084, + "step": 10621 + }, + { + "epoch": 2.671529175050302, + "grad_norm": 0.28285762667655945, + "learning_rate": 3.607687029383361e-07, + "loss": 0.3213, + "step": 10622 + }, + { + "epoch": 2.6717806841046277, + "grad_norm": 0.30333417654037476, + "learning_rate": 3.6022316454012387e-07, + "loss": 0.3243, + "step": 10623 + }, + { + "epoch": 2.6720321931589535, + "grad_norm": 0.2860763370990753, + "learning_rate": 3.5967802350977024e-07, + "loss": 0.3055, + "step": 10624 + }, + { + "epoch": 2.6722837022132797, + "grad_norm": 0.3174950182437897, + "learning_rate": 3.591332798939651e-07, + "loss": 0.3369, + "step": 10625 + }, + { + "epoch": 2.6725352112676055, + "grad_norm": 0.3028983175754547, + "learning_rate": 3.5858893373936e-07, + "loss": 0.3352, + "step": 10626 + }, + { + "epoch": 2.6727867203219313, + "grad_norm": 0.27620789408683777, + "learning_rate": 3.5804498509257766e-07, + "loss": 0.3241, + "step": 10627 + }, + { + "epoch": 2.6730382293762576, + "grad_norm": 0.2942667007446289, + "learning_rate": 3.575014340002009e-07, + "loss": 0.3004, + "step": 10628 + }, + { + "epoch": 2.6732897384305834, + "grad_norm": 0.300465852022171, + "learning_rate": 3.56958280508784e-07, + "loss": 0.3163, + "step": 10629 + }, + { + "epoch": 2.6735412474849096, + "grad_norm": 0.30546244978904724, + "learning_rate": 3.564155246648432e-07, + "loss": 0.3497, + "step": 10630 + }, + { + "epoch": 2.6737927565392354, + "grad_norm": 0.27925771474838257, + "learning_rate": 3.558731665148629e-07, + "loss": 0.3144, + "step": 10631 + }, + { + "epoch": 2.6740442655935612, + "grad_norm": 0.2924439013004303, + "learning_rate": 3.5533120610529157e-07, + "loss": 0.3182, + "step": 10632 + }, + { + "epoch": 2.6742957746478875, + "grad_norm": 0.27821362018585205, + "learning_rate": 3.547896434825465e-07, + "loss": 0.3016, + "step": 10633 + }, + { + "epoch": 2.6745472837022133, + "grad_norm": 0.3052501976490021, + "learning_rate": 3.542484786930073e-07, + "loss": 0.3394, + "step": 10634 + }, + { + "epoch": 2.674798792756539, + "grad_norm": 0.3052050769329071, + "learning_rate": 3.537077117830229e-07, + "loss": 0.2972, + "step": 10635 + }, + { + "epoch": 2.6750503018108653, + "grad_norm": 0.2863207757472992, + "learning_rate": 3.531673427989046e-07, + "loss": 0.3262, + "step": 10636 + }, + { + "epoch": 2.675301810865191, + "grad_norm": 0.26985007524490356, + "learning_rate": 3.526273717869344e-07, + "loss": 0.3168, + "step": 10637 + }, + { + "epoch": 2.675553319919517, + "grad_norm": 0.31778794527053833, + "learning_rate": 3.5208779879335465e-07, + "loss": 0.3115, + "step": 10638 + }, + { + "epoch": 2.675804828973843, + "grad_norm": 0.2770874500274658, + "learning_rate": 3.5154862386437894e-07, + "loss": 0.3377, + "step": 10639 + }, + { + "epoch": 2.676056338028169, + "grad_norm": 0.2914242148399353, + "learning_rate": 3.5100984704618145e-07, + "loss": 0.3152, + "step": 10640 + }, + { + "epoch": 2.676307847082495, + "grad_norm": 0.313717246055603, + "learning_rate": 3.504714683849081e-07, + "loss": 0.3311, + "step": 10641 + }, + { + "epoch": 2.676559356136821, + "grad_norm": 0.31820422410964966, + "learning_rate": 3.499334879266653e-07, + "loss": 0.3342, + "step": 10642 + }, + { + "epoch": 2.676810865191147, + "grad_norm": 0.2878066897392273, + "learning_rate": 3.4939590571752893e-07, + "loss": 0.3092, + "step": 10643 + }, + { + "epoch": 2.677062374245473, + "grad_norm": 0.32320746779441833, + "learning_rate": 3.488587218035383e-07, + "loss": 0.3276, + "step": 10644 + }, + { + "epoch": 2.677313883299799, + "grad_norm": 0.26483622193336487, + "learning_rate": 3.4832193623070167e-07, + "loss": 0.2976, + "step": 10645 + }, + { + "epoch": 2.6775653923541247, + "grad_norm": 0.29713913798332214, + "learning_rate": 3.4778554904498996e-07, + "loss": 0.3227, + "step": 10646 + }, + { + "epoch": 2.677816901408451, + "grad_norm": 0.2784908711910248, + "learning_rate": 3.47249560292342e-07, + "loss": 0.3209, + "step": 10647 + }, + { + "epoch": 2.6780684104627768, + "grad_norm": 0.3123219609260559, + "learning_rate": 3.4671397001866116e-07, + "loss": 0.3209, + "step": 10648 + }, + { + "epoch": 2.6783199195171026, + "grad_norm": 0.27715906500816345, + "learning_rate": 3.4617877826981785e-07, + "loss": 0.3067, + "step": 10649 + }, + { + "epoch": 2.678571428571429, + "grad_norm": 0.2937963902950287, + "learning_rate": 3.4564398509164877e-07, + "loss": 0.3111, + "step": 10650 + }, + { + "epoch": 2.6788229376257546, + "grad_norm": 0.2874857783317566, + "learning_rate": 3.451095905299545e-07, + "loss": 0.3135, + "step": 10651 + }, + { + "epoch": 2.6790744466800804, + "grad_norm": 0.2816859781742096, + "learning_rate": 3.4457559463050293e-07, + "loss": 0.3099, + "step": 10652 + }, + { + "epoch": 2.6793259557344067, + "grad_norm": 0.28699028491973877, + "learning_rate": 3.440419974390269e-07, + "loss": 0.3174, + "step": 10653 + }, + { + "epoch": 2.6795774647887325, + "grad_norm": 0.30524781346321106, + "learning_rate": 3.43508799001227e-07, + "loss": 0.3135, + "step": 10654 + }, + { + "epoch": 2.6798289738430583, + "grad_norm": 0.2902679741382599, + "learning_rate": 3.429759993627674e-07, + "loss": 0.3229, + "step": 10655 + }, + { + "epoch": 2.6800804828973845, + "grad_norm": 0.331782728433609, + "learning_rate": 3.424435985692792e-07, + "loss": 0.3208, + "step": 10656 + }, + { + "epoch": 2.6803319919517103, + "grad_norm": 0.2915107309818268, + "learning_rate": 3.419115966663583e-07, + "loss": 0.3151, + "step": 10657 + }, + { + "epoch": 2.680583501006036, + "grad_norm": 0.3128476142883301, + "learning_rate": 3.413799936995699e-07, + "loss": 0.3463, + "step": 10658 + }, + { + "epoch": 2.6808350100603624, + "grad_norm": 0.2803356349468231, + "learning_rate": 3.4084878971443914e-07, + "loss": 0.3348, + "step": 10659 + }, + { + "epoch": 2.681086519114688, + "grad_norm": 0.2853963375091553, + "learning_rate": 3.403179847564625e-07, + "loss": 0.3282, + "step": 10660 + }, + { + "epoch": 2.681338028169014, + "grad_norm": 0.28760576248168945, + "learning_rate": 3.397875788711003e-07, + "loss": 0.3046, + "step": 10661 + }, + { + "epoch": 2.6815895372233403, + "grad_norm": 0.298556923866272, + "learning_rate": 3.3925757210377664e-07, + "loss": 0.2909, + "step": 10662 + }, + { + "epoch": 2.681841046277666, + "grad_norm": 0.2910586893558502, + "learning_rate": 3.387279644998853e-07, + "loss": 0.3091, + "step": 10663 + }, + { + "epoch": 2.682092555331992, + "grad_norm": 0.3114005923271179, + "learning_rate": 3.3819875610478225e-07, + "loss": 0.3163, + "step": 10664 + }, + { + "epoch": 2.682344064386318, + "grad_norm": 0.29191139340400696, + "learning_rate": 3.3766994696379283e-07, + "loss": 0.3394, + "step": 10665 + }, + { + "epoch": 2.682595573440644, + "grad_norm": 0.2941957116127014, + "learning_rate": 3.3714153712220364e-07, + "loss": 0.3197, + "step": 10666 + }, + { + "epoch": 2.6828470824949697, + "grad_norm": 0.269555002450943, + "learning_rate": 3.3661352662527234e-07, + "loss": 0.2953, + "step": 10667 + }, + { + "epoch": 2.683098591549296, + "grad_norm": 0.30141013860702515, + "learning_rate": 3.3608591551821724e-07, + "loss": 0.3079, + "step": 10668 + }, + { + "epoch": 2.683350100603622, + "grad_norm": 0.3198448121547699, + "learning_rate": 3.35558703846226e-07, + "loss": 0.3246, + "step": 10669 + }, + { + "epoch": 2.6836016096579476, + "grad_norm": 0.3114519715309143, + "learning_rate": 3.3503189165445205e-07, + "loss": 0.3223, + "step": 10670 + }, + { + "epoch": 2.683853118712274, + "grad_norm": 0.2955555319786072, + "learning_rate": 3.34505478988012e-07, + "loss": 0.3222, + "step": 10671 + }, + { + "epoch": 2.6841046277665996, + "grad_norm": 0.31283295154571533, + "learning_rate": 3.339794658919915e-07, + "loss": 0.3082, + "step": 10672 + }, + { + "epoch": 2.6843561368209254, + "grad_norm": 0.2927229702472687, + "learning_rate": 3.334538524114378e-07, + "loss": 0.3144, + "step": 10673 + }, + { + "epoch": 2.6846076458752517, + "grad_norm": 0.2944906949996948, + "learning_rate": 3.329286385913688e-07, + "loss": 0.3193, + "step": 10674 + }, + { + "epoch": 2.6848591549295775, + "grad_norm": 0.2984817326068878, + "learning_rate": 3.3240382447676413e-07, + "loss": 0.3145, + "step": 10675 + }, + { + "epoch": 2.6851106639839033, + "grad_norm": 0.28597378730773926, + "learning_rate": 3.3187941011257217e-07, + "loss": 0.3065, + "step": 10676 + }, + { + "epoch": 2.6853621730382295, + "grad_norm": 0.2896621525287628, + "learning_rate": 3.313553955437049e-07, + "loss": 0.3322, + "step": 10677 + }, + { + "epoch": 2.6856136820925554, + "grad_norm": 0.29271525144577026, + "learning_rate": 3.308317808150413e-07, + "loss": 0.3155, + "step": 10678 + }, + { + "epoch": 2.685865191146881, + "grad_norm": 0.2880185544490814, + "learning_rate": 3.303085659714245e-07, + "loss": 0.3204, + "step": 10679 + }, + { + "epoch": 2.6861167002012074, + "grad_norm": 0.29037216305732727, + "learning_rate": 3.297857510576658e-07, + "loss": 0.3253, + "step": 10680 + }, + { + "epoch": 2.686368209255533, + "grad_norm": 0.2900163531303406, + "learning_rate": 3.29263336118541e-07, + "loss": 0.3198, + "step": 10681 + }, + { + "epoch": 2.686619718309859, + "grad_norm": 0.28541356325149536, + "learning_rate": 3.287413211987911e-07, + "loss": 0.3171, + "step": 10682 + }, + { + "epoch": 2.6868712273641853, + "grad_norm": 0.3070562183856964, + "learning_rate": 3.2821970634312474e-07, + "loss": 0.3107, + "step": 10683 + }, + { + "epoch": 2.687122736418511, + "grad_norm": 0.296577513217926, + "learning_rate": 3.276984915962128e-07, + "loss": 0.3507, + "step": 10684 + }, + { + "epoch": 2.687374245472837, + "grad_norm": 0.286769300699234, + "learning_rate": 3.271776770026963e-07, + "loss": 0.3122, + "step": 10685 + }, + { + "epoch": 2.687625754527163, + "grad_norm": 0.31535083055496216, + "learning_rate": 3.266572626071773e-07, + "loss": 0.3166, + "step": 10686 + }, + { + "epoch": 2.687877263581489, + "grad_norm": 0.2796669602394104, + "learning_rate": 3.261372484542291e-07, + "loss": 0.3233, + "step": 10687 + }, + { + "epoch": 2.6881287726358147, + "grad_norm": 0.29072704911231995, + "learning_rate": 3.2561763458838436e-07, + "loss": 0.3206, + "step": 10688 + }, + { + "epoch": 2.688380281690141, + "grad_norm": 0.291552871465683, + "learning_rate": 3.25098421054148e-07, + "loss": 0.3319, + "step": 10689 + }, + { + "epoch": 2.688631790744467, + "grad_norm": 0.2940013110637665, + "learning_rate": 3.2457960789598453e-07, + "loss": 0.3319, + "step": 10690 + }, + { + "epoch": 2.6888832997987926, + "grad_norm": 0.2845515310764313, + "learning_rate": 3.2406119515832777e-07, + "loss": 0.3211, + "step": 10691 + }, + { + "epoch": 2.689134808853119, + "grad_norm": 0.30485305190086365, + "learning_rate": 3.235431828855784e-07, + "loss": 0.3121, + "step": 10692 + }, + { + "epoch": 2.6893863179074446, + "grad_norm": 0.2907279431819916, + "learning_rate": 3.230255711220992e-07, + "loss": 0.3204, + "step": 10693 + }, + { + "epoch": 2.6896378269617705, + "grad_norm": 0.2880786061286926, + "learning_rate": 3.2250835991222084e-07, + "loss": 0.3015, + "step": 10694 + }, + { + "epoch": 2.6898893360160967, + "grad_norm": 0.28251487016677856, + "learning_rate": 3.2199154930023903e-07, + "loss": 0.3232, + "step": 10695 + }, + { + "epoch": 2.6901408450704225, + "grad_norm": 0.27719196677207947, + "learning_rate": 3.214751393304155e-07, + "loss": 0.3025, + "step": 10696 + }, + { + "epoch": 2.6903923541247483, + "grad_norm": 0.2866508364677429, + "learning_rate": 3.209591300469772e-07, + "loss": 0.3137, + "step": 10697 + }, + { + "epoch": 2.6906438631790746, + "grad_norm": 0.269786536693573, + "learning_rate": 3.204435214941182e-07, + "loss": 0.3266, + "step": 10698 + }, + { + "epoch": 2.6908953722334004, + "grad_norm": 0.27028945088386536, + "learning_rate": 3.1992831371599584e-07, + "loss": 0.3135, + "step": 10699 + }, + { + "epoch": 2.691146881287726, + "grad_norm": 0.2978735864162445, + "learning_rate": 3.1941350675673557e-07, + "loss": 0.3323, + "step": 10700 + }, + { + "epoch": 2.6913983903420524, + "grad_norm": 0.3069797456264496, + "learning_rate": 3.1889910066042587e-07, + "loss": 0.3128, + "step": 10701 + }, + { + "epoch": 2.691649899396378, + "grad_norm": 0.2964140474796295, + "learning_rate": 3.183850954711232e-07, + "loss": 0.3123, + "step": 10702 + }, + { + "epoch": 2.691901408450704, + "grad_norm": 0.3021068572998047, + "learning_rate": 3.178714912328501e-07, + "loss": 0.287, + "step": 10703 + }, + { + "epoch": 2.6921529175050303, + "grad_norm": 0.2956176698207855, + "learning_rate": 3.1735828798959146e-07, + "loss": 0.3147, + "step": 10704 + }, + { + "epoch": 2.692404426559356, + "grad_norm": 0.27685198187828064, + "learning_rate": 3.168454857853015e-07, + "loss": 0.3162, + "step": 10705 + }, + { + "epoch": 2.692655935613682, + "grad_norm": 0.285156786441803, + "learning_rate": 3.163330846638979e-07, + "loss": 0.324, + "step": 10706 + }, + { + "epoch": 2.692907444668008, + "grad_norm": 0.31111207604408264, + "learning_rate": 3.1582108466926497e-07, + "loss": 0.3137, + "step": 10707 + }, + { + "epoch": 2.693158953722334, + "grad_norm": 0.32243606448173523, + "learning_rate": 3.15309485845251e-07, + "loss": 0.3186, + "step": 10708 + }, + { + "epoch": 2.6934104627766597, + "grad_norm": 0.2985250651836395, + "learning_rate": 3.147982882356732e-07, + "loss": 0.3265, + "step": 10709 + }, + { + "epoch": 2.693661971830986, + "grad_norm": 0.2669808566570282, + "learning_rate": 3.1428749188431086e-07, + "loss": 0.3053, + "step": 10710 + }, + { + "epoch": 2.693913480885312, + "grad_norm": 0.3001328706741333, + "learning_rate": 3.1377709683491185e-07, + "loss": 0.3213, + "step": 10711 + }, + { + "epoch": 2.6941649899396376, + "grad_norm": 0.2827167809009552, + "learning_rate": 3.132671031311868e-07, + "loss": 0.3212, + "step": 10712 + }, + { + "epoch": 2.694416498993964, + "grad_norm": 0.27102166414260864, + "learning_rate": 3.127575108168146e-07, + "loss": 0.3306, + "step": 10713 + }, + { + "epoch": 2.6946680080482897, + "grad_norm": 0.2781563997268677, + "learning_rate": 3.1224831993543867e-07, + "loss": 0.3274, + "step": 10714 + }, + { + "epoch": 2.6949195171026155, + "grad_norm": 0.3081703782081604, + "learning_rate": 3.11739530530667e-07, + "loss": 0.3186, + "step": 10715 + }, + { + "epoch": 2.6951710261569417, + "grad_norm": 0.31926673650741577, + "learning_rate": 3.1123114264607515e-07, + "loss": 0.3128, + "step": 10716 + }, + { + "epoch": 2.6954225352112675, + "grad_norm": 0.28970199823379517, + "learning_rate": 3.1072315632520277e-07, + "loss": 0.2909, + "step": 10717 + }, + { + "epoch": 2.6956740442655933, + "grad_norm": 0.28923845291137695, + "learning_rate": 3.102155716115568e-07, + "loss": 0.3278, + "step": 10718 + }, + { + "epoch": 2.6959255533199196, + "grad_norm": 0.3150719106197357, + "learning_rate": 3.097083885486074e-07, + "loss": 0.3314, + "step": 10719 + }, + { + "epoch": 2.6961770623742454, + "grad_norm": 0.2829613983631134, + "learning_rate": 3.0920160717979264e-07, + "loss": 0.3328, + "step": 10720 + }, + { + "epoch": 2.696428571428571, + "grad_norm": 0.2828682065010071, + "learning_rate": 3.0869522754851387e-07, + "loss": 0.3164, + "step": 10721 + }, + { + "epoch": 2.6966800804828974, + "grad_norm": 0.2709435522556305, + "learning_rate": 3.081892496981409e-07, + "loss": 0.3154, + "step": 10722 + }, + { + "epoch": 2.6969315895372232, + "grad_norm": 0.3398783504962921, + "learning_rate": 3.0768367367200624e-07, + "loss": 0.3052, + "step": 10723 + }, + { + "epoch": 2.697183098591549, + "grad_norm": 0.30523836612701416, + "learning_rate": 3.0717849951340973e-07, + "loss": 0.3241, + "step": 10724 + }, + { + "epoch": 2.6974346076458753, + "grad_norm": 0.2881937623023987, + "learning_rate": 3.066737272656173e-07, + "loss": 0.3551, + "step": 10725 + }, + { + "epoch": 2.697686116700201, + "grad_norm": 0.28372272849082947, + "learning_rate": 3.061693569718577e-07, + "loss": 0.3136, + "step": 10726 + }, + { + "epoch": 2.697937625754527, + "grad_norm": 0.2901321053504944, + "learning_rate": 3.056653886753297e-07, + "loss": 0.3264, + "step": 10727 + }, + { + "epoch": 2.698189134808853, + "grad_norm": 0.2705398499965668, + "learning_rate": 3.051618224191921e-07, + "loss": 0.3243, + "step": 10728 + }, + { + "epoch": 2.698440643863179, + "grad_norm": 0.2847079336643219, + "learning_rate": 3.046586582465744e-07, + "loss": 0.3083, + "step": 10729 + }, + { + "epoch": 2.698692152917505, + "grad_norm": 0.29345259070396423, + "learning_rate": 3.0415589620056807e-07, + "loss": 0.3141, + "step": 10730 + }, + { + "epoch": 2.698943661971831, + "grad_norm": 0.2997452914714813, + "learning_rate": 3.036535363242327e-07, + "loss": 0.2915, + "step": 10731 + }, + { + "epoch": 2.699195171026157, + "grad_norm": 0.2906875014305115, + "learning_rate": 3.0315157866059154e-07, + "loss": 0.3336, + "step": 10732 + }, + { + "epoch": 2.699446680080483, + "grad_norm": 0.2780359387397766, + "learning_rate": 3.0265002325263415e-07, + "loss": 0.3313, + "step": 10733 + }, + { + "epoch": 2.699698189134809, + "grad_norm": 0.2860005497932434, + "learning_rate": 3.021488701433156e-07, + "loss": 0.3464, + "step": 10734 + }, + { + "epoch": 2.6999496981891347, + "grad_norm": 0.30452266335487366, + "learning_rate": 3.016481193755566e-07, + "loss": 0.3243, + "step": 10735 + }, + { + "epoch": 2.700201207243461, + "grad_norm": 0.28733357787132263, + "learning_rate": 3.011477709922439e-07, + "loss": 0.3178, + "step": 10736 + }, + { + "epoch": 2.7004527162977867, + "grad_norm": 0.26170480251312256, + "learning_rate": 3.0064782503622827e-07, + "loss": 0.3241, + "step": 10737 + }, + { + "epoch": 2.7007042253521125, + "grad_norm": 0.27204710245132446, + "learning_rate": 3.001482815503276e-07, + "loss": 0.3139, + "step": 10738 + }, + { + "epoch": 2.7009557344064388, + "grad_norm": 0.3037412464618683, + "learning_rate": 2.9964914057732386e-07, + "loss": 0.3181, + "step": 10739 + }, + { + "epoch": 2.7012072434607646, + "grad_norm": 0.27614688873291016, + "learning_rate": 2.9915040215996726e-07, + "loss": 0.3135, + "step": 10740 + }, + { + "epoch": 2.7014587525150904, + "grad_norm": 0.2942797839641571, + "learning_rate": 2.986520663409687e-07, + "loss": 0.3279, + "step": 10741 + }, + { + "epoch": 2.7017102615694166, + "grad_norm": 0.2742573320865631, + "learning_rate": 2.9815413316301055e-07, + "loss": 0.3289, + "step": 10742 + }, + { + "epoch": 2.7019617706237424, + "grad_norm": 0.27266111969947815, + "learning_rate": 2.97656602668735e-07, + "loss": 0.3331, + "step": 10743 + }, + { + "epoch": 2.7022132796780687, + "grad_norm": 0.29972144961357117, + "learning_rate": 2.9715947490075447e-07, + "loss": 0.3313, + "step": 10744 + }, + { + "epoch": 2.7024647887323945, + "grad_norm": 0.30087292194366455, + "learning_rate": 2.9666274990164336e-07, + "loss": 0.3195, + "step": 10745 + }, + { + "epoch": 2.7027162977867203, + "grad_norm": 0.28101980686187744, + "learning_rate": 2.961664277139448e-07, + "loss": 0.3034, + "step": 10746 + }, + { + "epoch": 2.7029678068410465, + "grad_norm": 0.2926780879497528, + "learning_rate": 2.956705083801636e-07, + "loss": 0.3363, + "step": 10747 + }, + { + "epoch": 2.7032193158953723, + "grad_norm": 0.30072760581970215, + "learning_rate": 2.951749919427738e-07, + "loss": 0.3311, + "step": 10748 + }, + { + "epoch": 2.703470824949698, + "grad_norm": 0.3085022568702698, + "learning_rate": 2.9467987844421173e-07, + "loss": 0.3272, + "step": 10749 + }, + { + "epoch": 2.7037223340040244, + "grad_norm": 0.28719770908355713, + "learning_rate": 2.941851679268831e-07, + "loss": 0.3186, + "step": 10750 + }, + { + "epoch": 2.70397384305835, + "grad_norm": 0.30047592520713806, + "learning_rate": 2.93690860433154e-07, + "loss": 0.308, + "step": 10751 + }, + { + "epoch": 2.704225352112676, + "grad_norm": 0.29841721057891846, + "learning_rate": 2.93196956005361e-07, + "loss": 0.315, + "step": 10752 + }, + { + "epoch": 2.7044768611670023, + "grad_norm": 0.3019961416721344, + "learning_rate": 2.927034546858026e-07, + "loss": 0.3273, + "step": 10753 + }, + { + "epoch": 2.704728370221328, + "grad_norm": 0.2860671579837799, + "learning_rate": 2.92210356516745e-07, + "loss": 0.3405, + "step": 10754 + }, + { + "epoch": 2.704979879275654, + "grad_norm": 0.2890463173389435, + "learning_rate": 2.917176615404177e-07, + "loss": 0.3264, + "step": 10755 + }, + { + "epoch": 2.70523138832998, + "grad_norm": 0.27873101830482483, + "learning_rate": 2.9122536979901807e-07, + "loss": 0.3134, + "step": 10756 + }, + { + "epoch": 2.705482897384306, + "grad_norm": 0.28098541498184204, + "learning_rate": 2.9073348133470844e-07, + "loss": 0.3087, + "step": 10757 + }, + { + "epoch": 2.7057344064386317, + "grad_norm": 0.27469319105148315, + "learning_rate": 2.902419961896136e-07, + "loss": 0.3253, + "step": 10758 + }, + { + "epoch": 2.705985915492958, + "grad_norm": 0.295611172914505, + "learning_rate": 2.8975091440582915e-07, + "loss": 0.3224, + "step": 10759 + }, + { + "epoch": 2.7062374245472838, + "grad_norm": 0.2876059114933014, + "learning_rate": 2.892602360254104e-07, + "loss": 0.3465, + "step": 10760 + }, + { + "epoch": 2.7064889336016096, + "grad_norm": 0.285182923078537, + "learning_rate": 2.887699610903827e-07, + "loss": 0.3092, + "step": 10761 + }, + { + "epoch": 2.706740442655936, + "grad_norm": 0.2798616290092468, + "learning_rate": 2.8828008964273404e-07, + "loss": 0.3096, + "step": 10762 + }, + { + "epoch": 2.7069919517102616, + "grad_norm": 0.2900584042072296, + "learning_rate": 2.877906217244203e-07, + "loss": 0.3103, + "step": 10763 + }, + { + "epoch": 2.7072434607645874, + "grad_norm": 0.2769961357116699, + "learning_rate": 2.873015573773591e-07, + "loss": 0.3217, + "step": 10764 + }, + { + "epoch": 2.7074949698189137, + "grad_norm": 0.2770218849182129, + "learning_rate": 2.86812896643438e-07, + "loss": 0.3292, + "step": 10765 + }, + { + "epoch": 2.7077464788732395, + "grad_norm": 0.29273056983947754, + "learning_rate": 2.8632463956450583e-07, + "loss": 0.3286, + "step": 10766 + }, + { + "epoch": 2.7079979879275653, + "grad_norm": 0.2939911186695099, + "learning_rate": 2.858367861823802e-07, + "loss": 0.3022, + "step": 10767 + }, + { + "epoch": 2.7082494969818915, + "grad_norm": 0.3015715777873993, + "learning_rate": 2.8534933653884157e-07, + "loss": 0.3051, + "step": 10768 + }, + { + "epoch": 2.7085010060362174, + "grad_norm": 0.28284311294555664, + "learning_rate": 2.848622906756382e-07, + "loss": 0.3136, + "step": 10769 + }, + { + "epoch": 2.708752515090543, + "grad_norm": 0.2819089889526367, + "learning_rate": 2.843756486344812e-07, + "loss": 0.3146, + "step": 10770 + }, + { + "epoch": 2.7090040241448694, + "grad_norm": 0.3178308606147766, + "learning_rate": 2.8388941045705e-07, + "loss": 0.3401, + "step": 10771 + }, + { + "epoch": 2.709255533199195, + "grad_norm": 0.2823493182659149, + "learning_rate": 2.834035761849857e-07, + "loss": 0.3334, + "step": 10772 + }, + { + "epoch": 2.709507042253521, + "grad_norm": 0.2783769369125366, + "learning_rate": 2.8291814585989894e-07, + "loss": 0.3232, + "step": 10773 + }, + { + "epoch": 2.7097585513078473, + "grad_norm": 0.2950974404811859, + "learning_rate": 2.824331195233626e-07, + "loss": 0.3237, + "step": 10774 + }, + { + "epoch": 2.710010060362173, + "grad_norm": 0.2713472247123718, + "learning_rate": 2.8194849721691673e-07, + "loss": 0.2991, + "step": 10775 + }, + { + "epoch": 2.710261569416499, + "grad_norm": 0.2863728702068329, + "learning_rate": 2.81464278982066e-07, + "loss": 0.3167, + "step": 10776 + }, + { + "epoch": 2.710513078470825, + "grad_norm": 0.285849392414093, + "learning_rate": 2.80980464860281e-07, + "loss": 0.3086, + "step": 10777 + }, + { + "epoch": 2.710764587525151, + "grad_norm": 0.31550291180610657, + "learning_rate": 2.804970548929964e-07, + "loss": 0.3127, + "step": 10778 + }, + { + "epoch": 2.7110160965794767, + "grad_norm": 0.29624804854393005, + "learning_rate": 2.8001404912161413e-07, + "loss": 0.3238, + "step": 10779 + }, + { + "epoch": 2.711267605633803, + "grad_norm": 0.2810332179069519, + "learning_rate": 2.7953144758750107e-07, + "loss": 0.3298, + "step": 10780 + }, + { + "epoch": 2.711519114688129, + "grad_norm": 0.29689785838127136, + "learning_rate": 2.790492503319875e-07, + "loss": 0.3364, + "step": 10781 + }, + { + "epoch": 2.7117706237424546, + "grad_norm": 0.30866745114326477, + "learning_rate": 2.7856745739637206e-07, + "loss": 0.3216, + "step": 10782 + }, + { + "epoch": 2.712022132796781, + "grad_norm": 0.26995787024497986, + "learning_rate": 2.7808606882191615e-07, + "loss": 0.328, + "step": 10783 + }, + { + "epoch": 2.7122736418511066, + "grad_norm": 0.2834140956401825, + "learning_rate": 2.7760508464984904e-07, + "loss": 0.3015, + "step": 10784 + }, + { + "epoch": 2.7125251509054324, + "grad_norm": 0.28610095381736755, + "learning_rate": 2.771245049213628e-07, + "loss": 0.3174, + "step": 10785 + }, + { + "epoch": 2.7127766599597587, + "grad_norm": 0.2732725441455841, + "learning_rate": 2.7664432967761667e-07, + "loss": 0.3077, + "step": 10786 + }, + { + "epoch": 2.7130281690140845, + "grad_norm": 0.2708452641963959, + "learning_rate": 2.7616455895973393e-07, + "loss": 0.3036, + "step": 10787 + }, + { + "epoch": 2.7132796780684103, + "grad_norm": 0.28488287329673767, + "learning_rate": 2.756851928088056e-07, + "loss": 0.3226, + "step": 10788 + }, + { + "epoch": 2.7135311871227366, + "grad_norm": 0.3026772737503052, + "learning_rate": 2.752062312658838e-07, + "loss": 0.3404, + "step": 10789 + }, + { + "epoch": 2.7137826961770624, + "grad_norm": 0.28403767943382263, + "learning_rate": 2.7472767437199067e-07, + "loss": 0.325, + "step": 10790 + }, + { + "epoch": 2.714034205231388, + "grad_norm": 0.2924075424671173, + "learning_rate": 2.742495221681113e-07, + "loss": 0.322, + "step": 10791 + }, + { + "epoch": 2.7142857142857144, + "grad_norm": 0.28228023648262024, + "learning_rate": 2.7377177469519565e-07, + "loss": 0.3184, + "step": 10792 + }, + { + "epoch": 2.71453722334004, + "grad_norm": 0.29336363077163696, + "learning_rate": 2.7329443199416105e-07, + "loss": 0.3155, + "step": 10793 + }, + { + "epoch": 2.714788732394366, + "grad_norm": 0.31586167216300964, + "learning_rate": 2.7281749410588753e-07, + "loss": 0.3242, + "step": 10794 + }, + { + "epoch": 2.7150402414486923, + "grad_norm": 0.30801641941070557, + "learning_rate": 2.7234096107122357e-07, + "loss": 0.3315, + "step": 10795 + }, + { + "epoch": 2.715291750503018, + "grad_norm": 0.29662755131721497, + "learning_rate": 2.7186483293097863e-07, + "loss": 0.3071, + "step": 10796 + }, + { + "epoch": 2.715543259557344, + "grad_norm": 0.28053396940231323, + "learning_rate": 2.71389109725933e-07, + "loss": 0.3257, + "step": 10797 + }, + { + "epoch": 2.71579476861167, + "grad_norm": 0.29082953929901123, + "learning_rate": 2.7091379149682683e-07, + "loss": 0.2968, + "step": 10798 + }, + { + "epoch": 2.716046277665996, + "grad_norm": 0.30169904232025146, + "learning_rate": 2.7043887828437033e-07, + "loss": 0.3146, + "step": 10799 + }, + { + "epoch": 2.7162977867203217, + "grad_norm": 0.3031311333179474, + "learning_rate": 2.699643701292348e-07, + "loss": 0.3383, + "step": 10800 + }, + { + "epoch": 2.716549295774648, + "grad_norm": 0.28714466094970703, + "learning_rate": 2.694902670720606e-07, + "loss": 0.3333, + "step": 10801 + }, + { + "epoch": 2.716800804828974, + "grad_norm": 0.2786526679992676, + "learning_rate": 2.690165691534513e-07, + "loss": 0.3273, + "step": 10802 + }, + { + "epoch": 2.7170523138832996, + "grad_norm": 0.2807994484901428, + "learning_rate": 2.6854327641397504e-07, + "loss": 0.2979, + "step": 10803 + }, + { + "epoch": 2.717303822937626, + "grad_norm": 0.3144852817058563, + "learning_rate": 2.6807038889416824e-07, + "loss": 0.3153, + "step": 10804 + }, + { + "epoch": 2.7175553319919517, + "grad_norm": 0.2780417501926422, + "learning_rate": 2.675979066345291e-07, + "loss": 0.3198, + "step": 10805 + }, + { + "epoch": 2.7178068410462775, + "grad_norm": 0.27267181873321533, + "learning_rate": 2.671258296755241e-07, + "loss": 0.2898, + "step": 10806 + }, + { + "epoch": 2.7180583501006037, + "grad_norm": 0.29780054092407227, + "learning_rate": 2.6665415805758264e-07, + "loss": 0.3345, + "step": 10807 + }, + { + "epoch": 2.7183098591549295, + "grad_norm": 0.286492258310318, + "learning_rate": 2.661828918211012e-07, + "loss": 0.3259, + "step": 10808 + }, + { + "epoch": 2.7185613682092553, + "grad_norm": 0.2818504571914673, + "learning_rate": 2.657120310064393e-07, + "loss": 0.32, + "step": 10809 + }, + { + "epoch": 2.7188128772635816, + "grad_norm": 0.28963735699653625, + "learning_rate": 2.6524157565392506e-07, + "loss": 0.3339, + "step": 10810 + }, + { + "epoch": 2.7190643863179074, + "grad_norm": 0.268822580575943, + "learning_rate": 2.647715258038497e-07, + "loss": 0.3148, + "step": 10811 + }, + { + "epoch": 2.719315895372233, + "grad_norm": 0.304812490940094, + "learning_rate": 2.643018814964687e-07, + "loss": 0.3164, + "step": 10812 + }, + { + "epoch": 2.7195674044265594, + "grad_norm": 0.28156810998916626, + "learning_rate": 2.6383264277200616e-07, + "loss": 0.3312, + "step": 10813 + }, + { + "epoch": 2.7198189134808852, + "grad_norm": 0.2927444875240326, + "learning_rate": 2.6336380967064754e-07, + "loss": 0.3059, + "step": 10814 + }, + { + "epoch": 2.720070422535211, + "grad_norm": 0.2759106159210205, + "learning_rate": 2.6289538223254695e-07, + "loss": 0.3408, + "step": 10815 + }, + { + "epoch": 2.7203219315895373, + "grad_norm": 0.2832070589065552, + "learning_rate": 2.624273604978211e-07, + "loss": 0.3276, + "step": 10816 + }, + { + "epoch": 2.720573440643863, + "grad_norm": 0.27949801087379456, + "learning_rate": 2.6195974450655415e-07, + "loss": 0.3654, + "step": 10817 + }, + { + "epoch": 2.720824949698189, + "grad_norm": 0.29671213030815125, + "learning_rate": 2.6149253429879397e-07, + "loss": 0.3069, + "step": 10818 + }, + { + "epoch": 2.721076458752515, + "grad_norm": 0.31909680366516113, + "learning_rate": 2.610257299145541e-07, + "loss": 0.3114, + "step": 10819 + }, + { + "epoch": 2.721327967806841, + "grad_norm": 0.3081822395324707, + "learning_rate": 2.6055933139381315e-07, + "loss": 0.3447, + "step": 10820 + }, + { + "epoch": 2.7215794768611667, + "grad_norm": 0.30881619453430176, + "learning_rate": 2.600933387765159e-07, + "loss": 0.2926, + "step": 10821 + }, + { + "epoch": 2.721830985915493, + "grad_norm": 0.28827062249183655, + "learning_rate": 2.596277521025714e-07, + "loss": 0.326, + "step": 10822 + }, + { + "epoch": 2.722082494969819, + "grad_norm": 0.30842483043670654, + "learning_rate": 2.5916257141185395e-07, + "loss": 0.3216, + "step": 10823 + }, + { + "epoch": 2.7223340040241446, + "grad_norm": 0.3006271421909332, + "learning_rate": 2.586977967442045e-07, + "loss": 0.3144, + "step": 10824 + }, + { + "epoch": 2.722585513078471, + "grad_norm": 0.2951270341873169, + "learning_rate": 2.5823342813942665e-07, + "loss": 0.329, + "step": 10825 + }, + { + "epoch": 2.7228370221327967, + "grad_norm": 0.2893744111061096, + "learning_rate": 2.57769465637292e-07, + "loss": 0.3137, + "step": 10826 + }, + { + "epoch": 2.7230885311871225, + "grad_norm": 0.2886696457862854, + "learning_rate": 2.573059092775343e-07, + "loss": 0.3488, + "step": 10827 + }, + { + "epoch": 2.7233400402414487, + "grad_norm": 0.3092022240161896, + "learning_rate": 2.568427590998557e-07, + "loss": 0.3293, + "step": 10828 + }, + { + "epoch": 2.7235915492957745, + "grad_norm": 0.2914911210536957, + "learning_rate": 2.563800151439216e-07, + "loss": 0.3183, + "step": 10829 + }, + { + "epoch": 2.7238430583501008, + "grad_norm": 0.29686030745506287, + "learning_rate": 2.559176774493638e-07, + "loss": 0.3196, + "step": 10830 + }, + { + "epoch": 2.7240945674044266, + "grad_norm": 0.303915411233902, + "learning_rate": 2.554557460557772e-07, + "loss": 0.3234, + "step": 10831 + }, + { + "epoch": 2.7243460764587524, + "grad_norm": 0.3081934452056885, + "learning_rate": 2.54994221002724e-07, + "loss": 0.3263, + "step": 10832 + }, + { + "epoch": 2.7245975855130786, + "grad_norm": 0.290477454662323, + "learning_rate": 2.5453310232973205e-07, + "loss": 0.3441, + "step": 10833 + }, + { + "epoch": 2.7248490945674044, + "grad_norm": 0.2770725190639496, + "learning_rate": 2.5407239007629145e-07, + "loss": 0.3166, + "step": 10834 + }, + { + "epoch": 2.7251006036217302, + "grad_norm": 0.28399229049682617, + "learning_rate": 2.536120842818612e-07, + "loss": 0.2891, + "step": 10835 + }, + { + "epoch": 2.7253521126760565, + "grad_norm": 0.2806536853313446, + "learning_rate": 2.53152184985862e-07, + "loss": 0.3178, + "step": 10836 + }, + { + "epoch": 2.7256036217303823, + "grad_norm": 0.27946868538856506, + "learning_rate": 2.5269269222768234e-07, + "loss": 0.3259, + "step": 10837 + }, + { + "epoch": 2.725855130784708, + "grad_norm": 0.2891778349876404, + "learning_rate": 2.5223360604667404e-07, + "loss": 0.3271, + "step": 10838 + }, + { + "epoch": 2.7261066398390343, + "grad_norm": 0.2895258069038391, + "learning_rate": 2.517749264821556e-07, + "loss": 0.3193, + "step": 10839 + }, + { + "epoch": 2.72635814889336, + "grad_norm": 0.2947310507297516, + "learning_rate": 2.5131665357340963e-07, + "loss": 0.3362, + "step": 10840 + }, + { + "epoch": 2.7266096579476864, + "grad_norm": 0.28745022416114807, + "learning_rate": 2.508587873596857e-07, + "loss": 0.2958, + "step": 10841 + }, + { + "epoch": 2.726861167002012, + "grad_norm": 0.2851249873638153, + "learning_rate": 2.504013278801948e-07, + "loss": 0.327, + "step": 10842 + }, + { + "epoch": 2.727112676056338, + "grad_norm": 0.28858688473701477, + "learning_rate": 2.499442751741171e-07, + "loss": 0.3361, + "step": 10843 + }, + { + "epoch": 2.7273641851106643, + "grad_norm": 0.29522156715393066, + "learning_rate": 2.4948762928059647e-07, + "loss": 0.3238, + "step": 10844 + }, + { + "epoch": 2.72761569416499, + "grad_norm": 0.2840255796909332, + "learning_rate": 2.490313902387409e-07, + "loss": 0.3464, + "step": 10845 + }, + { + "epoch": 2.727867203219316, + "grad_norm": 0.29043489694595337, + "learning_rate": 2.485755580876248e-07, + "loss": 0.3275, + "step": 10846 + }, + { + "epoch": 2.728118712273642, + "grad_norm": 0.29880261421203613, + "learning_rate": 2.4812013286628747e-07, + "loss": 0.3568, + "step": 10847 + }, + { + "epoch": 2.728370221327968, + "grad_norm": 0.31489261984825134, + "learning_rate": 2.4766511461373324e-07, + "loss": 0.3404, + "step": 10848 + }, + { + "epoch": 2.7286217303822937, + "grad_norm": 0.28683874011039734, + "learning_rate": 2.4721050336893094e-07, + "loss": 0.3227, + "step": 10849 + }, + { + "epoch": 2.72887323943662, + "grad_norm": 0.27683448791503906, + "learning_rate": 2.467562991708161e-07, + "loss": 0.3275, + "step": 10850 + }, + { + "epoch": 2.7291247484909458, + "grad_norm": 0.28187745809555054, + "learning_rate": 2.4630250205828767e-07, + "loss": 0.331, + "step": 10851 + }, + { + "epoch": 2.7293762575452716, + "grad_norm": 0.30056044459342957, + "learning_rate": 2.458491120702117e-07, + "loss": 0.2948, + "step": 10852 + }, + { + "epoch": 2.729627766599598, + "grad_norm": 0.287810742855072, + "learning_rate": 2.453961292454166e-07, + "loss": 0.327, + "step": 10853 + }, + { + "epoch": 2.7298792756539236, + "grad_norm": 0.2935195565223694, + "learning_rate": 2.4494355362269796e-07, + "loss": 0.3268, + "step": 10854 + }, + { + "epoch": 2.7301307847082494, + "grad_norm": 0.2871311902999878, + "learning_rate": 2.444913852408176e-07, + "loss": 0.3373, + "step": 10855 + }, + { + "epoch": 2.7303822937625757, + "grad_norm": 0.2980871796607971, + "learning_rate": 2.440396241384985e-07, + "loss": 0.3115, + "step": 10856 + }, + { + "epoch": 2.7306338028169015, + "grad_norm": 0.29173511266708374, + "learning_rate": 2.435882703544334e-07, + "loss": 0.3228, + "step": 10857 + }, + { + "epoch": 2.7308853118712273, + "grad_norm": 0.3014184534549713, + "learning_rate": 2.4313732392727664e-07, + "loss": 0.3397, + "step": 10858 + }, + { + "epoch": 2.7311368209255535, + "grad_norm": 0.29081061482429504, + "learning_rate": 2.4268678489564935e-07, + "loss": 0.3183, + "step": 10859 + }, + { + "epoch": 2.7313883299798793, + "grad_norm": 0.3027068078517914, + "learning_rate": 2.422366532981368e-07, + "loss": 0.3241, + "step": 10860 + }, + { + "epoch": 2.731639839034205, + "grad_norm": 0.29450279474258423, + "learning_rate": 2.4178692917329106e-07, + "loss": 0.3406, + "step": 10861 + }, + { + "epoch": 2.7318913480885314, + "grad_norm": 0.26403874158859253, + "learning_rate": 2.413376125596267e-07, + "loss": 0.2853, + "step": 10862 + }, + { + "epoch": 2.732142857142857, + "grad_norm": 0.3012741804122925, + "learning_rate": 2.4088870349562644e-07, + "loss": 0.3511, + "step": 10863 + }, + { + "epoch": 2.732394366197183, + "grad_norm": 0.2905932068824768, + "learning_rate": 2.404402020197355e-07, + "loss": 0.303, + "step": 10864 + }, + { + "epoch": 2.7326458752515093, + "grad_norm": 0.2958056926727295, + "learning_rate": 2.399921081703654e-07, + "loss": 0.3264, + "step": 10865 + }, + { + "epoch": 2.732897384305835, + "grad_norm": 0.2873068153858185, + "learning_rate": 2.3954442198589334e-07, + "loss": 0.2966, + "step": 10866 + }, + { + "epoch": 2.733148893360161, + "grad_norm": 0.3035353720188141, + "learning_rate": 2.390971435046596e-07, + "loss": 0.3086, + "step": 10867 + }, + { + "epoch": 2.733400402414487, + "grad_norm": 0.3155568838119507, + "learning_rate": 2.38650272764972e-07, + "loss": 0.3157, + "step": 10868 + }, + { + "epoch": 2.733651911468813, + "grad_norm": 0.2832005023956299, + "learning_rate": 2.3820380980510093e-07, + "loss": 0.3133, + "step": 10869 + }, + { + "epoch": 2.7339034205231387, + "grad_norm": 0.28932392597198486, + "learning_rate": 2.3775775466328422e-07, + "loss": 0.3163, + "step": 10870 + }, + { + "epoch": 2.734154929577465, + "grad_norm": 0.30421337485313416, + "learning_rate": 2.3731210737772293e-07, + "loss": 0.3355, + "step": 10871 + }, + { + "epoch": 2.734406438631791, + "grad_norm": 0.2934373915195465, + "learning_rate": 2.3686686798658543e-07, + "loss": 0.3051, + "step": 10872 + }, + { + "epoch": 2.7346579476861166, + "grad_norm": 0.28928595781326294, + "learning_rate": 2.364220365280012e-07, + "loss": 0.3199, + "step": 10873 + }, + { + "epoch": 2.734909456740443, + "grad_norm": 0.30236467719078064, + "learning_rate": 2.3597761304006984e-07, + "loss": 0.3511, + "step": 10874 + }, + { + "epoch": 2.7351609657947686, + "grad_norm": 0.29965662956237793, + "learning_rate": 2.3553359756085192e-07, + "loss": 0.2945, + "step": 10875 + }, + { + "epoch": 2.7354124748490944, + "grad_norm": 0.31335750222206116, + "learning_rate": 2.3508999012837484e-07, + "loss": 0.3408, + "step": 10876 + }, + { + "epoch": 2.7356639839034207, + "grad_norm": 0.28357815742492676, + "learning_rate": 2.3464679078063102e-07, + "loss": 0.3215, + "step": 10877 + }, + { + "epoch": 2.7359154929577465, + "grad_norm": 0.30339914560317993, + "learning_rate": 2.3420399955557782e-07, + "loss": 0.3115, + "step": 10878 + }, + { + "epoch": 2.7361670020120723, + "grad_norm": 0.31013423204421997, + "learning_rate": 2.337616164911366e-07, + "loss": 0.3253, + "step": 10879 + }, + { + "epoch": 2.7364185110663986, + "grad_norm": 0.30664506554603577, + "learning_rate": 2.333196416251965e-07, + "loss": 0.306, + "step": 10880 + }, + { + "epoch": 2.7366700201207244, + "grad_norm": 0.27958711981773376, + "learning_rate": 2.3287807499560777e-07, + "loss": 0.3085, + "step": 10881 + }, + { + "epoch": 2.73692152917505, + "grad_norm": 0.27122312784194946, + "learning_rate": 2.3243691664018964e-07, + "loss": 0.3313, + "step": 10882 + }, + { + "epoch": 2.7371730382293764, + "grad_norm": 0.2907143533229828, + "learning_rate": 2.3199616659672352e-07, + "loss": 0.3293, + "step": 10883 + }, + { + "epoch": 2.737424547283702, + "grad_norm": 0.29304689168930054, + "learning_rate": 2.315558249029576e-07, + "loss": 0.3059, + "step": 10884 + }, + { + "epoch": 2.737676056338028, + "grad_norm": 0.2985115945339203, + "learning_rate": 2.311158915966033e-07, + "loss": 0.3035, + "step": 10885 + }, + { + "epoch": 2.7379275653923543, + "grad_norm": 0.3043895661830902, + "learning_rate": 2.3067636671533944e-07, + "loss": 0.3192, + "step": 10886 + }, + { + "epoch": 2.73817907444668, + "grad_norm": 0.2754703462123871, + "learning_rate": 2.30237250296807e-07, + "loss": 0.3498, + "step": 10887 + }, + { + "epoch": 2.738430583501006, + "grad_norm": 0.27213260531425476, + "learning_rate": 2.2979854237861588e-07, + "loss": 0.3159, + "step": 10888 + }, + { + "epoch": 2.738682092555332, + "grad_norm": 0.29739394783973694, + "learning_rate": 2.2936024299833605e-07, + "loss": 0.3277, + "step": 10889 + }, + { + "epoch": 2.738933601609658, + "grad_norm": 0.2812947928905487, + "learning_rate": 2.2892235219350745e-07, + "loss": 0.3192, + "step": 10890 + }, + { + "epoch": 2.7391851106639837, + "grad_norm": 0.29224130511283875, + "learning_rate": 2.2848487000163067e-07, + "loss": 0.3248, + "step": 10891 + }, + { + "epoch": 2.73943661971831, + "grad_norm": 0.2976722717285156, + "learning_rate": 2.2804779646017517e-07, + "loss": 0.3271, + "step": 10892 + }, + { + "epoch": 2.739688128772636, + "grad_norm": 0.28829827904701233, + "learning_rate": 2.276111316065721e-07, + "loss": 0.3171, + "step": 10893 + }, + { + "epoch": 2.7399396378269616, + "grad_norm": 0.29329490661621094, + "learning_rate": 2.2717487547821992e-07, + "loss": 0.3202, + "step": 10894 + }, + { + "epoch": 2.740191146881288, + "grad_norm": 0.29280152916908264, + "learning_rate": 2.267390281124804e-07, + "loss": 0.305, + "step": 10895 + }, + { + "epoch": 2.7404426559356136, + "grad_norm": 0.2917034924030304, + "learning_rate": 2.2630358954668253e-07, + "loss": 0.3526, + "step": 10896 + }, + { + "epoch": 2.7406941649899395, + "grad_norm": 0.2969276010990143, + "learning_rate": 2.258685598181176e-07, + "loss": 0.333, + "step": 10897 + }, + { + "epoch": 2.7409456740442657, + "grad_norm": 0.2692883610725403, + "learning_rate": 2.2543393896404308e-07, + "loss": 0.329, + "step": 10898 + }, + { + "epoch": 2.7411971830985915, + "grad_norm": 0.29979148507118225, + "learning_rate": 2.2499972702168304e-07, + "loss": 0.3182, + "step": 10899 + }, + { + "epoch": 2.7414486921529173, + "grad_norm": 0.2951495349407196, + "learning_rate": 2.245659240282233e-07, + "loss": 0.3015, + "step": 10900 + }, + { + "epoch": 2.7417002012072436, + "grad_norm": 0.2918504774570465, + "learning_rate": 2.2413253002081803e-07, + "loss": 0.3195, + "step": 10901 + }, + { + "epoch": 2.7419517102615694, + "grad_norm": 0.28916507959365845, + "learning_rate": 2.2369954503658308e-07, + "loss": 0.3268, + "step": 10902 + }, + { + "epoch": 2.742203219315895, + "grad_norm": 0.29391786456108093, + "learning_rate": 2.2326696911260215e-07, + "loss": 0.3068, + "step": 10903 + }, + { + "epoch": 2.7424547283702214, + "grad_norm": 0.28322654962539673, + "learning_rate": 2.2283480228592168e-07, + "loss": 0.3066, + "step": 10904 + }, + { + "epoch": 2.7427062374245472, + "grad_norm": 0.3049914538860321, + "learning_rate": 2.2240304459355544e-07, + "loss": 0.3235, + "step": 10905 + }, + { + "epoch": 2.742957746478873, + "grad_norm": 0.26875460147857666, + "learning_rate": 2.2197169607247882e-07, + "loss": 0.3386, + "step": 10906 + }, + { + "epoch": 2.7432092555331993, + "grad_norm": 0.29541999101638794, + "learning_rate": 2.2154075675963617e-07, + "loss": 0.3232, + "step": 10907 + }, + { + "epoch": 2.743460764587525, + "grad_norm": 0.26899954676628113, + "learning_rate": 2.2111022669193293e-07, + "loss": 0.3177, + "step": 10908 + }, + { + "epoch": 2.743712273641851, + "grad_norm": 0.27981895208358765, + "learning_rate": 2.206801059062419e-07, + "loss": 0.3132, + "step": 10909 + }, + { + "epoch": 2.743963782696177, + "grad_norm": 0.28188595175743103, + "learning_rate": 2.2025039443940134e-07, + "loss": 0.3502, + "step": 10910 + }, + { + "epoch": 2.744215291750503, + "grad_norm": 0.27629154920578003, + "learning_rate": 2.198210923282118e-07, + "loss": 0.324, + "step": 10911 + }, + { + "epoch": 2.7444668008048287, + "grad_norm": 0.2914685904979706, + "learning_rate": 2.1939219960944168e-07, + "loss": 0.3242, + "step": 10912 + }, + { + "epoch": 2.744718309859155, + "grad_norm": 0.2767096757888794, + "learning_rate": 2.1896371631982162e-07, + "loss": 0.3063, + "step": 10913 + }, + { + "epoch": 2.744969818913481, + "grad_norm": 0.28286007046699524, + "learning_rate": 2.1853564249604996e-07, + "loss": 0.3, + "step": 10914 + }, + { + "epoch": 2.7452213279678066, + "grad_norm": 0.28479135036468506, + "learning_rate": 2.181079781747869e-07, + "loss": 0.3035, + "step": 10915 + }, + { + "epoch": 2.745472837022133, + "grad_norm": 0.27827152609825134, + "learning_rate": 2.176807233926609e-07, + "loss": 0.3327, + "step": 10916 + }, + { + "epoch": 2.7457243460764587, + "grad_norm": 0.29502740502357483, + "learning_rate": 2.172538781862621e-07, + "loss": 0.3055, + "step": 10917 + }, + { + "epoch": 2.7459758551307845, + "grad_norm": 0.2725423276424408, + "learning_rate": 2.168274425921485e-07, + "loss": 0.3063, + "step": 10918 + }, + { + "epoch": 2.7462273641851107, + "grad_norm": 0.29838913679122925, + "learning_rate": 2.1640141664684034e-07, + "loss": 0.2981, + "step": 10919 + }, + { + "epoch": 2.7464788732394365, + "grad_norm": 0.29055625200271606, + "learning_rate": 2.1597580038682453e-07, + "loss": 0.32, + "step": 10920 + }, + { + "epoch": 2.7467303822937623, + "grad_norm": 0.27832192182540894, + "learning_rate": 2.1555059384855358e-07, + "loss": 0.3281, + "step": 10921 + }, + { + "epoch": 2.7469818913480886, + "grad_norm": 0.27420729398727417, + "learning_rate": 2.1512579706844227e-07, + "loss": 0.3345, + "step": 10922 + }, + { + "epoch": 2.7472334004024144, + "grad_norm": 0.3057540953159332, + "learning_rate": 2.1470141008287316e-07, + "loss": 0.3079, + "step": 10923 + }, + { + "epoch": 2.74748490945674, + "grad_norm": 0.27829062938690186, + "learning_rate": 2.1427743292819047e-07, + "loss": 0.3347, + "step": 10924 + }, + { + "epoch": 2.7477364185110664, + "grad_norm": 0.3032315969467163, + "learning_rate": 2.1385386564070688e-07, + "loss": 0.3091, + "step": 10925 + }, + { + "epoch": 2.7479879275653922, + "grad_norm": 0.30215707421302795, + "learning_rate": 2.1343070825669776e-07, + "loss": 0.3233, + "step": 10926 + }, + { + "epoch": 2.748239436619718, + "grad_norm": 0.3079783320426941, + "learning_rate": 2.130079608124036e-07, + "loss": 0.3209, + "step": 10927 + }, + { + "epoch": 2.7484909456740443, + "grad_norm": 0.3027101457118988, + "learning_rate": 2.1258562334402987e-07, + "loss": 0.3379, + "step": 10928 + }, + { + "epoch": 2.74874245472837, + "grad_norm": 0.3107944130897522, + "learning_rate": 2.121636958877482e-07, + "loss": 0.3205, + "step": 10929 + }, + { + "epoch": 2.7489939637826963, + "grad_norm": 0.2983773946762085, + "learning_rate": 2.1174217847969302e-07, + "loss": 0.3062, + "step": 10930 + }, + { + "epoch": 2.749245472837022, + "grad_norm": 0.29391881823539734, + "learning_rate": 2.1132107115596434e-07, + "loss": 0.3116, + "step": 10931 + }, + { + "epoch": 2.749496981891348, + "grad_norm": 0.3001260459423065, + "learning_rate": 2.1090037395262941e-07, + "loss": 0.3488, + "step": 10932 + }, + { + "epoch": 2.749748490945674, + "grad_norm": 0.3142930269241333, + "learning_rate": 2.104800869057161e-07, + "loss": 0.3339, + "step": 10933 + }, + { + "epoch": 2.75, + "grad_norm": 0.28879308700561523, + "learning_rate": 2.1006021005122057e-07, + "loss": 0.291, + "step": 10934 + }, + { + "epoch": 2.750251509054326, + "grad_norm": 0.3108893930912018, + "learning_rate": 2.0964074342510187e-07, + "loss": 0.3433, + "step": 10935 + }, + { + "epoch": 2.750503018108652, + "grad_norm": 0.30376648902893066, + "learning_rate": 2.0922168706328572e-07, + "loss": 0.3144, + "step": 10936 + }, + { + "epoch": 2.750754527162978, + "grad_norm": 0.2809644639492035, + "learning_rate": 2.0880304100166004e-07, + "loss": 0.3439, + "step": 10937 + }, + { + "epoch": 2.7510060362173037, + "grad_norm": 0.29834315180778503, + "learning_rate": 2.0838480527608118e-07, + "loss": 0.3044, + "step": 10938 + }, + { + "epoch": 2.75125754527163, + "grad_norm": 0.28084221482276917, + "learning_rate": 2.0796697992236713e-07, + "loss": 0.3288, + "step": 10939 + }, + { + "epoch": 2.7515090543259557, + "grad_norm": 0.2842169404029846, + "learning_rate": 2.0754956497630262e-07, + "loss": 0.3257, + "step": 10940 + }, + { + "epoch": 2.751760563380282, + "grad_norm": 0.2855300009250641, + "learning_rate": 2.0713256047363573e-07, + "loss": 0.3319, + "step": 10941 + }, + { + "epoch": 2.7520120724346078, + "grad_norm": 0.2749643921852112, + "learning_rate": 2.067159664500812e-07, + "loss": 0.3204, + "step": 10942 + }, + { + "epoch": 2.7522635814889336, + "grad_norm": 0.2852446436882019, + "learning_rate": 2.0629978294131824e-07, + "loss": 0.3053, + "step": 10943 + }, + { + "epoch": 2.75251509054326, + "grad_norm": 0.2823444604873657, + "learning_rate": 2.058840099829884e-07, + "loss": 0.3115, + "step": 10944 + }, + { + "epoch": 2.7527665995975856, + "grad_norm": 0.28557664155960083, + "learning_rate": 2.0546864761070262e-07, + "loss": 0.3398, + "step": 10945 + }, + { + "epoch": 2.7530181086519114, + "grad_norm": 0.28932204842567444, + "learning_rate": 2.050536958600313e-07, + "loss": 0.3197, + "step": 10946 + }, + { + "epoch": 2.7532696177062377, + "grad_norm": 0.27391183376312256, + "learning_rate": 2.0463915476651496e-07, + "loss": 0.3173, + "step": 10947 + }, + { + "epoch": 2.7535211267605635, + "grad_norm": 0.28741297125816345, + "learning_rate": 2.0422502436565462e-07, + "loss": 0.3155, + "step": 10948 + }, + { + "epoch": 2.7537726358148893, + "grad_norm": 0.2882639169692993, + "learning_rate": 2.038113046929191e-07, + "loss": 0.3166, + "step": 10949 + }, + { + "epoch": 2.7540241448692155, + "grad_norm": 0.27754950523376465, + "learning_rate": 2.0339799578373954e-07, + "loss": 0.3292, + "step": 10950 + }, + { + "epoch": 2.7542756539235413, + "grad_norm": 0.2686426043510437, + "learning_rate": 2.0298509767351538e-07, + "loss": 0.3227, + "step": 10951 + }, + { + "epoch": 2.754527162977867, + "grad_norm": 0.3171316981315613, + "learning_rate": 2.0257261039760667e-07, + "loss": 0.351, + "step": 10952 + }, + { + "epoch": 2.7547786720321934, + "grad_norm": 0.3160313367843628, + "learning_rate": 2.0216053399134127e-07, + "loss": 0.3276, + "step": 10953 + }, + { + "epoch": 2.755030181086519, + "grad_norm": 0.3004869520664215, + "learning_rate": 2.0174886849001207e-07, + "loss": 0.2926, + "step": 10954 + }, + { + "epoch": 2.755281690140845, + "grad_norm": 0.2918458580970764, + "learning_rate": 2.0133761392887308e-07, + "loss": 0.3214, + "step": 10955 + }, + { + "epoch": 2.7555331991951713, + "grad_norm": 0.279135137796402, + "learning_rate": 2.0092677034314834e-07, + "loss": 0.3276, + "step": 10956 + }, + { + "epoch": 2.755784708249497, + "grad_norm": 0.28750714659690857, + "learning_rate": 2.0051633776802192e-07, + "loss": 0.3329, + "step": 10957 + }, + { + "epoch": 2.756036217303823, + "grad_norm": 0.29169023036956787, + "learning_rate": 2.001063162386463e-07, + "loss": 0.3052, + "step": 10958 + }, + { + "epoch": 2.756287726358149, + "grad_norm": 0.3238070011138916, + "learning_rate": 1.996967057901361e-07, + "loss": 0.339, + "step": 10959 + }, + { + "epoch": 2.756539235412475, + "grad_norm": 0.31088271737098694, + "learning_rate": 1.9928750645757332e-07, + "loss": 0.3617, + "step": 10960 + }, + { + "epoch": 2.7567907444668007, + "grad_norm": 0.28064385056495667, + "learning_rate": 1.9887871827600158e-07, + "loss": 0.3263, + "step": 10961 + }, + { + "epoch": 2.757042253521127, + "grad_norm": 0.2962087094783783, + "learning_rate": 1.9847034128043175e-07, + "loss": 0.3306, + "step": 10962 + }, + { + "epoch": 2.7572937625754528, + "grad_norm": 0.2970735728740692, + "learning_rate": 1.9806237550583974e-07, + "loss": 0.3042, + "step": 10963 + }, + { + "epoch": 2.7575452716297786, + "grad_norm": 0.30512532591819763, + "learning_rate": 1.976548209871637e-07, + "loss": 0.2982, + "step": 10964 + }, + { + "epoch": 2.757796780684105, + "grad_norm": 0.3158949315547943, + "learning_rate": 1.9724767775930965e-07, + "loss": 0.3253, + "step": 10965 + }, + { + "epoch": 2.7580482897384306, + "grad_norm": 0.30506396293640137, + "learning_rate": 1.9684094585714575e-07, + "loss": 0.3229, + "step": 10966 + }, + { + "epoch": 2.7582997987927564, + "grad_norm": 0.2968839704990387, + "learning_rate": 1.964346253155064e-07, + "loss": 0.3332, + "step": 10967 + }, + { + "epoch": 2.7585513078470827, + "grad_norm": 0.28158625960350037, + "learning_rate": 1.9602871616918985e-07, + "loss": 0.3014, + "step": 10968 + }, + { + "epoch": 2.7588028169014085, + "grad_norm": 0.30511099100112915, + "learning_rate": 1.9562321845296106e-07, + "loss": 0.3046, + "step": 10969 + }, + { + "epoch": 2.7590543259557343, + "grad_norm": 0.3017299175262451, + "learning_rate": 1.9521813220154672e-07, + "loss": 0.2923, + "step": 10970 + }, + { + "epoch": 2.7593058350100605, + "grad_norm": 0.3004690706729889, + "learning_rate": 1.948134574496413e-07, + "loss": 0.3107, + "step": 10971 + }, + { + "epoch": 2.7595573440643864, + "grad_norm": 0.28788694739341736, + "learning_rate": 1.9440919423190208e-07, + "loss": 0.3172, + "step": 10972 + }, + { + "epoch": 2.759808853118712, + "grad_norm": 0.265638530254364, + "learning_rate": 1.9400534258295078e-07, + "loss": 0.3227, + "step": 10973 + }, + { + "epoch": 2.7600603621730384, + "grad_norm": 0.32271093130111694, + "learning_rate": 1.9360190253737698e-07, + "loss": 0.297, + "step": 10974 + }, + { + "epoch": 2.760311871227364, + "grad_norm": 0.30242830514907837, + "learning_rate": 1.9319887412973083e-07, + "loss": 0.3182, + "step": 10975 + }, + { + "epoch": 2.76056338028169, + "grad_norm": 0.2825239300727844, + "learning_rate": 1.9279625739453022e-07, + "loss": 0.3349, + "step": 10976 + }, + { + "epoch": 2.7608148893360163, + "grad_norm": 0.2780284285545349, + "learning_rate": 1.923940523662554e-07, + "loss": 0.3108, + "step": 10977 + }, + { + "epoch": 2.761066398390342, + "grad_norm": 0.2897617816925049, + "learning_rate": 1.9199225907935492e-07, + "loss": 0.3242, + "step": 10978 + }, + { + "epoch": 2.761317907444668, + "grad_norm": 0.287583589553833, + "learning_rate": 1.9159087756823792e-07, + "loss": 0.3197, + "step": 10979 + }, + { + "epoch": 2.761569416498994, + "grad_norm": 0.27631574869155884, + "learning_rate": 1.911899078672813e-07, + "loss": 0.3144, + "step": 10980 + }, + { + "epoch": 2.76182092555332, + "grad_norm": 0.2759281396865845, + "learning_rate": 1.9078935001082487e-07, + "loss": 0.3438, + "step": 10981 + }, + { + "epoch": 2.7620724346076457, + "grad_norm": 0.30593448877334595, + "learning_rate": 1.9038920403317507e-07, + "loss": 0.3062, + "step": 10982 + }, + { + "epoch": 2.762323943661972, + "grad_norm": 0.30097198486328125, + "learning_rate": 1.8998946996860002e-07, + "loss": 0.316, + "step": 10983 + }, + { + "epoch": 2.762575452716298, + "grad_norm": 0.28567543625831604, + "learning_rate": 1.8959014785133621e-07, + "loss": 0.3111, + "step": 10984 + }, + { + "epoch": 2.7628269617706236, + "grad_norm": 0.2774016857147217, + "learning_rate": 1.8919123771558246e-07, + "loss": 0.3204, + "step": 10985 + }, + { + "epoch": 2.76307847082495, + "grad_norm": 0.317721962928772, + "learning_rate": 1.8879273959550248e-07, + "loss": 0.3089, + "step": 10986 + }, + { + "epoch": 2.7633299798792756, + "grad_norm": 0.28425851464271545, + "learning_rate": 1.8839465352522623e-07, + "loss": 0.3328, + "step": 10987 + }, + { + "epoch": 2.7635814889336014, + "grad_norm": 0.2964561879634857, + "learning_rate": 1.8799697953884587e-07, + "loss": 0.3114, + "step": 10988 + }, + { + "epoch": 2.7638329979879277, + "grad_norm": 0.2809069752693176, + "learning_rate": 1.8759971767042085e-07, + "loss": 0.3146, + "step": 10989 + }, + { + "epoch": 2.7640845070422535, + "grad_norm": 0.305742084980011, + "learning_rate": 1.8720286795397335e-07, + "loss": 0.3204, + "step": 10990 + }, + { + "epoch": 2.7643360160965793, + "grad_norm": 0.298260360956192, + "learning_rate": 1.868064304234918e-07, + "loss": 0.3394, + "step": 10991 + }, + { + "epoch": 2.7645875251509056, + "grad_norm": 0.2834104299545288, + "learning_rate": 1.8641040511292786e-07, + "loss": 0.3074, + "step": 10992 + }, + { + "epoch": 2.7648390342052314, + "grad_norm": 0.2777165472507477, + "learning_rate": 1.8601479205619945e-07, + "loss": 0.3359, + "step": 10993 + }, + { + "epoch": 2.765090543259557, + "grad_norm": 0.29171082377433777, + "learning_rate": 1.856195912871872e-07, + "loss": 0.3155, + "step": 10994 + }, + { + "epoch": 2.7653420523138834, + "grad_norm": 0.29711124300956726, + "learning_rate": 1.8522480283973908e-07, + "loss": 0.3459, + "step": 10995 + }, + { + "epoch": 2.765593561368209, + "grad_norm": 0.32558995485305786, + "learning_rate": 1.8483042674766527e-07, + "loss": 0.3161, + "step": 10996 + }, + { + "epoch": 2.765845070422535, + "grad_norm": 0.29762402176856995, + "learning_rate": 1.8443646304474206e-07, + "loss": 0.3293, + "step": 10997 + }, + { + "epoch": 2.7660965794768613, + "grad_norm": 0.2941772937774658, + "learning_rate": 1.8404291176470857e-07, + "loss": 0.3248, + "step": 10998 + }, + { + "epoch": 2.766348088531187, + "grad_norm": 0.2893567681312561, + "learning_rate": 1.836497729412723e-07, + "loss": 0.3181, + "step": 10999 + }, + { + "epoch": 2.766599597585513, + "grad_norm": 0.28409847617149353, + "learning_rate": 1.8325704660810128e-07, + "loss": 0.3346, + "step": 11000 + }, + { + "epoch": 2.766851106639839, + "grad_norm": 0.2963768541812897, + "learning_rate": 1.8286473279883142e-07, + "loss": 0.3117, + "step": 11001 + }, + { + "epoch": 2.767102615694165, + "grad_norm": 0.30031341314315796, + "learning_rate": 1.8247283154706085e-07, + "loss": 0.3261, + "step": 11002 + }, + { + "epoch": 2.7673541247484907, + "grad_norm": 0.292661190032959, + "learning_rate": 1.8208134288635438e-07, + "loss": 0.3137, + "step": 11003 + }, + { + "epoch": 2.767605633802817, + "grad_norm": 0.2892690598964691, + "learning_rate": 1.816902668502396e-07, + "loss": 0.3256, + "step": 11004 + }, + { + "epoch": 2.767857142857143, + "grad_norm": 0.2964717745780945, + "learning_rate": 1.812996034722103e-07, + "loss": 0.3335, + "step": 11005 + }, + { + "epoch": 2.7681086519114686, + "grad_norm": 0.26202383637428284, + "learning_rate": 1.809093527857242e-07, + "loss": 0.3231, + "step": 11006 + }, + { + "epoch": 2.768360160965795, + "grad_norm": 0.26758965849876404, + "learning_rate": 1.805195148242045e-07, + "loss": 0.3121, + "step": 11007 + }, + { + "epoch": 2.7686116700201207, + "grad_norm": 0.2875462472438812, + "learning_rate": 1.8013008962103674e-07, + "loss": 0.3221, + "step": 11008 + }, + { + "epoch": 2.7688631790744465, + "grad_norm": 0.2862987220287323, + "learning_rate": 1.7974107720957478e-07, + "loss": 0.33, + "step": 11009 + }, + { + "epoch": 2.7691146881287727, + "grad_norm": 0.289815753698349, + "learning_rate": 1.7935247762313312e-07, + "loss": 0.318, + "step": 11010 + }, + { + "epoch": 2.7693661971830985, + "grad_norm": 0.2913775146007538, + "learning_rate": 1.7896429089499455e-07, + "loss": 0.3035, + "step": 11011 + }, + { + "epoch": 2.7696177062374243, + "grad_norm": 0.2775018513202667, + "learning_rate": 1.7857651705840419e-07, + "loss": 0.318, + "step": 11012 + }, + { + "epoch": 2.7698692152917506, + "grad_norm": 0.27448347210884094, + "learning_rate": 1.781891561465726e-07, + "loss": 0.339, + "step": 11013 + }, + { + "epoch": 2.7701207243460764, + "grad_norm": 0.31805574893951416, + "learning_rate": 1.778022081926739e-07, + "loss": 0.3193, + "step": 11014 + }, + { + "epoch": 2.770372233400402, + "grad_norm": 0.292459100484848, + "learning_rate": 1.774156732298493e-07, + "loss": 0.3134, + "step": 11015 + }, + { + "epoch": 2.7706237424547284, + "grad_norm": 0.28790146112442017, + "learning_rate": 1.7702955129120125e-07, + "loss": 0.3145, + "step": 11016 + }, + { + "epoch": 2.7708752515090542, + "grad_norm": 0.26574012637138367, + "learning_rate": 1.7664384240979993e-07, + "loss": 0.3131, + "step": 11017 + }, + { + "epoch": 2.77112676056338, + "grad_norm": 0.2886962890625, + "learning_rate": 1.7625854661867947e-07, + "loss": 0.2903, + "step": 11018 + }, + { + "epoch": 2.7713782696177063, + "grad_norm": 0.2845294773578644, + "learning_rate": 1.7587366395083683e-07, + "loss": 0.3289, + "step": 11019 + }, + { + "epoch": 2.771629778672032, + "grad_norm": 0.27203091979026794, + "learning_rate": 1.754891944392356e-07, + "loss": 0.3224, + "step": 11020 + }, + { + "epoch": 2.771881287726358, + "grad_norm": 0.27143409848213196, + "learning_rate": 1.751051381168023e-07, + "loss": 0.3132, + "step": 11021 + }, + { + "epoch": 2.772132796780684, + "grad_norm": 0.2910093665122986, + "learning_rate": 1.7472149501643e-07, + "loss": 0.3269, + "step": 11022 + }, + { + "epoch": 2.77238430583501, + "grad_norm": 0.29057279229164124, + "learning_rate": 1.7433826517097407e-07, + "loss": 0.3368, + "step": 11023 + }, + { + "epoch": 2.7726358148893357, + "grad_norm": 0.2875846326351166, + "learning_rate": 1.7395544861325718e-07, + "loss": 0.3195, + "step": 11024 + }, + { + "epoch": 2.772887323943662, + "grad_norm": 0.2830538749694824, + "learning_rate": 1.7357304537606367e-07, + "loss": 0.2901, + "step": 11025 + }, + { + "epoch": 2.773138832997988, + "grad_norm": 0.29084324836730957, + "learning_rate": 1.7319105549214564e-07, + "loss": 0.3114, + "step": 11026 + }, + { + "epoch": 2.7733903420523136, + "grad_norm": 0.29110777378082275, + "learning_rate": 1.7280947899421695e-07, + "loss": 0.3129, + "step": 11027 + }, + { + "epoch": 2.77364185110664, + "grad_norm": 0.2726239562034607, + "learning_rate": 1.7242831591495701e-07, + "loss": 0.3164, + "step": 11028 + }, + { + "epoch": 2.7738933601609657, + "grad_norm": 0.28240305185317993, + "learning_rate": 1.7204756628701192e-07, + "loss": 0.3127, + "step": 11029 + }, + { + "epoch": 2.774144869215292, + "grad_norm": 0.29125747084617615, + "learning_rate": 1.716672301429878e-07, + "loss": 0.3089, + "step": 11030 + }, + { + "epoch": 2.7743963782696177, + "grad_norm": 0.2842182219028473, + "learning_rate": 1.7128730751546086e-07, + "loss": 0.3137, + "step": 11031 + }, + { + "epoch": 2.7746478873239435, + "grad_norm": 0.28499892354011536, + "learning_rate": 1.709077984369667e-07, + "loss": 0.3305, + "step": 11032 + }, + { + "epoch": 2.7748993963782698, + "grad_norm": 0.29583901166915894, + "learning_rate": 1.7052870294000933e-07, + "loss": 0.3467, + "step": 11033 + }, + { + "epoch": 2.7751509054325956, + "grad_norm": 0.2947319746017456, + "learning_rate": 1.7015002105705502e-07, + "loss": 0.3388, + "step": 11034 + }, + { + "epoch": 2.7754024144869214, + "grad_norm": 0.28705930709838867, + "learning_rate": 1.6977175282053672e-07, + "loss": 0.3503, + "step": 11035 + }, + { + "epoch": 2.7756539235412476, + "grad_norm": 0.27521228790283203, + "learning_rate": 1.6939389826284903e-07, + "loss": 0.3195, + "step": 11036 + }, + { + "epoch": 2.7759054325955734, + "grad_norm": 0.2797527313232422, + "learning_rate": 1.690164574163544e-07, + "loss": 0.3026, + "step": 11037 + }, + { + "epoch": 2.7761569416498992, + "grad_norm": 0.29513728618621826, + "learning_rate": 1.686394303133776e-07, + "loss": 0.3057, + "step": 11038 + }, + { + "epoch": 2.7764084507042255, + "grad_norm": 0.27043822407722473, + "learning_rate": 1.6826281698620827e-07, + "loss": 0.3386, + "step": 11039 + }, + { + "epoch": 2.7766599597585513, + "grad_norm": 0.2766996920108795, + "learning_rate": 1.6788661746710178e-07, + "loss": 0.3178, + "step": 11040 + }, + { + "epoch": 2.7769114688128775, + "grad_norm": 0.2824966609477997, + "learning_rate": 1.6751083178827675e-07, + "loss": 0.3275, + "step": 11041 + }, + { + "epoch": 2.7771629778672033, + "grad_norm": 0.27287888526916504, + "learning_rate": 1.6713545998191748e-07, + "loss": 0.3203, + "step": 11042 + }, + { + "epoch": 2.777414486921529, + "grad_norm": 0.28277119994163513, + "learning_rate": 1.6676050208017102e-07, + "loss": 0.3313, + "step": 11043 + }, + { + "epoch": 2.7776659959758554, + "grad_norm": 0.2877527177333832, + "learning_rate": 1.663859581151517e-07, + "loss": 0.3023, + "step": 11044 + }, + { + "epoch": 2.777917505030181, + "grad_norm": 0.2927461266517639, + "learning_rate": 1.6601182811893545e-07, + "loss": 0.334, + "step": 11045 + }, + { + "epoch": 2.778169014084507, + "grad_norm": 0.28662925958633423, + "learning_rate": 1.6563811212356506e-07, + "loss": 0.32, + "step": 11046 + }, + { + "epoch": 2.7784205231388333, + "grad_norm": 0.300090491771698, + "learning_rate": 1.6526481016104655e-07, + "loss": 0.3101, + "step": 11047 + }, + { + "epoch": 2.778672032193159, + "grad_norm": 0.28218507766723633, + "learning_rate": 1.6489192226335104e-07, + "loss": 0.3205, + "step": 11048 + }, + { + "epoch": 2.778923541247485, + "grad_norm": 0.2887854278087616, + "learning_rate": 1.6451944846241408e-07, + "loss": 0.3097, + "step": 11049 + }, + { + "epoch": 2.779175050301811, + "grad_norm": 0.28003057837486267, + "learning_rate": 1.641473887901357e-07, + "loss": 0.3098, + "step": 11050 + }, + { + "epoch": 2.779426559356137, + "grad_norm": 0.2808135449886322, + "learning_rate": 1.6377574327838041e-07, + "loss": 0.319, + "step": 11051 + }, + { + "epoch": 2.7796780684104627, + "grad_norm": 0.2711102366447449, + "learning_rate": 1.6340451195897722e-07, + "loss": 0.3376, + "step": 11052 + }, + { + "epoch": 2.779929577464789, + "grad_norm": 0.30259403586387634, + "learning_rate": 1.6303369486372067e-07, + "loss": 0.3333, + "step": 11053 + }, + { + "epoch": 2.7801810865191148, + "grad_norm": 0.27301356196403503, + "learning_rate": 1.6266329202436758e-07, + "loss": 0.318, + "step": 11054 + }, + { + "epoch": 2.7804325955734406, + "grad_norm": 0.28594970703125, + "learning_rate": 1.622933034726415e-07, + "loss": 0.3366, + "step": 11055 + }, + { + "epoch": 2.780684104627767, + "grad_norm": 0.2891000509262085, + "learning_rate": 1.6192372924022925e-07, + "loss": 0.3187, + "step": 11056 + }, + { + "epoch": 2.7809356136820926, + "grad_norm": 0.299022376537323, + "learning_rate": 1.6155456935878277e-07, + "loss": 0.3196, + "step": 11057 + }, + { + "epoch": 2.7811871227364184, + "grad_norm": 0.27598682045936584, + "learning_rate": 1.6118582385991787e-07, + "loss": 0.3121, + "step": 11058 + }, + { + "epoch": 2.7814386317907447, + "grad_norm": 0.29095298051834106, + "learning_rate": 1.6081749277521598e-07, + "loss": 0.2999, + "step": 11059 + }, + { + "epoch": 2.7816901408450705, + "grad_norm": 0.28317791223526, + "learning_rate": 1.604495761362218e-07, + "loss": 0.3121, + "step": 11060 + }, + { + "epoch": 2.7819416498993963, + "grad_norm": 0.2773258686065674, + "learning_rate": 1.600820739744452e-07, + "loss": 0.3309, + "step": 11061 + }, + { + "epoch": 2.7821931589537225, + "grad_norm": 0.28628113865852356, + "learning_rate": 1.5971498632136096e-07, + "loss": 0.3194, + "step": 11062 + }, + { + "epoch": 2.7824446680080483, + "grad_norm": 0.28664273023605347, + "learning_rate": 1.5934831320840672e-07, + "loss": 0.3022, + "step": 11063 + }, + { + "epoch": 2.782696177062374, + "grad_norm": 0.2975166141986847, + "learning_rate": 1.5898205466698736e-07, + "loss": 0.3185, + "step": 11064 + }, + { + "epoch": 2.7829476861167004, + "grad_norm": 0.27436116337776184, + "learning_rate": 1.586162107284689e-07, + "loss": 0.3214, + "step": 11065 + }, + { + "epoch": 2.783199195171026, + "grad_norm": 0.2933892607688904, + "learning_rate": 1.5825078142418516e-07, + "loss": 0.3318, + "step": 11066 + }, + { + "epoch": 2.783450704225352, + "grad_norm": 0.2979046702384949, + "learning_rate": 1.578857667854311e-07, + "loss": 0.3252, + "step": 11067 + }, + { + "epoch": 2.7837022132796783, + "grad_norm": 0.2885100841522217, + "learning_rate": 1.5752116684347008e-07, + "loss": 0.3235, + "step": 11068 + }, + { + "epoch": 2.783953722334004, + "grad_norm": 0.2883824110031128, + "learning_rate": 1.5715698162952597e-07, + "loss": 0.3425, + "step": 11069 + }, + { + "epoch": 2.78420523138833, + "grad_norm": 0.2968808114528656, + "learning_rate": 1.5679321117478995e-07, + "loss": 0.3058, + "step": 11070 + }, + { + "epoch": 2.784456740442656, + "grad_norm": 0.27541083097457886, + "learning_rate": 1.56429855510416e-07, + "loss": 0.3413, + "step": 11071 + }, + { + "epoch": 2.784708249496982, + "grad_norm": 0.26949572563171387, + "learning_rate": 1.5606691466752366e-07, + "loss": 0.3173, + "step": 11072 + }, + { + "epoch": 2.7849597585513077, + "grad_norm": 0.28457048535346985, + "learning_rate": 1.5570438867719695e-07, + "loss": 0.3225, + "step": 11073 + }, + { + "epoch": 2.785211267605634, + "grad_norm": 0.27355846762657166, + "learning_rate": 1.553422775704827e-07, + "loss": 0.322, + "step": 11074 + }, + { + "epoch": 2.78546277665996, + "grad_norm": 0.2736510634422302, + "learning_rate": 1.5498058137839555e-07, + "loss": 0.3038, + "step": 11075 + }, + { + "epoch": 2.7857142857142856, + "grad_norm": 0.2820262908935547, + "learning_rate": 1.5461930013191018e-07, + "loss": 0.3123, + "step": 11076 + }, + { + "epoch": 2.785965794768612, + "grad_norm": 0.2771058976650238, + "learning_rate": 1.5425843386197015e-07, + "loss": 0.3065, + "step": 11077 + }, + { + "epoch": 2.7862173038229376, + "grad_norm": 0.2988390326499939, + "learning_rate": 1.5389798259947908e-07, + "loss": 0.3116, + "step": 11078 + }, + { + "epoch": 2.7864688128772634, + "grad_norm": 0.27741125226020813, + "learning_rate": 1.5353794637531005e-07, + "loss": 0.305, + "step": 11079 + }, + { + "epoch": 2.7867203219315897, + "grad_norm": 0.3002147078514099, + "learning_rate": 1.5317832522029563e-07, + "loss": 0.3438, + "step": 11080 + }, + { + "epoch": 2.7869718309859155, + "grad_norm": 0.2924443483352661, + "learning_rate": 1.5281911916523672e-07, + "loss": 0.3429, + "step": 11081 + }, + { + "epoch": 2.7872233400402413, + "grad_norm": 0.2645252048969269, + "learning_rate": 1.524603282408954e-07, + "loss": 0.3502, + "step": 11082 + }, + { + "epoch": 2.7874748490945676, + "grad_norm": 0.26844504475593567, + "learning_rate": 1.5210195247800153e-07, + "loss": 0.3112, + "step": 11083 + }, + { + "epoch": 2.7877263581488934, + "grad_norm": 0.2783125638961792, + "learning_rate": 1.517439919072472e-07, + "loss": 0.3137, + "step": 11084 + }, + { + "epoch": 2.787977867203219, + "grad_norm": 0.29496461153030396, + "learning_rate": 1.5138644655928848e-07, + "loss": 0.3234, + "step": 11085 + }, + { + "epoch": 2.7882293762575454, + "grad_norm": 0.3056720793247223, + "learning_rate": 1.5102931646474917e-07, + "loss": 0.3139, + "step": 11086 + }, + { + "epoch": 2.788480885311871, + "grad_norm": 0.2965206801891327, + "learning_rate": 1.506726016542126e-07, + "loss": 0.3233, + "step": 11087 + }, + { + "epoch": 2.788732394366197, + "grad_norm": 0.27822843194007874, + "learning_rate": 1.50316302158231e-07, + "loss": 0.3161, + "step": 11088 + }, + { + "epoch": 2.7889839034205233, + "grad_norm": 0.2970999479293823, + "learning_rate": 1.4996041800731832e-07, + "loss": 0.3139, + "step": 11089 + }, + { + "epoch": 2.789235412474849, + "grad_norm": 0.2687007188796997, + "learning_rate": 1.4960494923195457e-07, + "loss": 0.3199, + "step": 11090 + }, + { + "epoch": 2.789486921529175, + "grad_norm": 0.31234511733055115, + "learning_rate": 1.4924989586258265e-07, + "loss": 0.3306, + "step": 11091 + }, + { + "epoch": 2.789738430583501, + "grad_norm": 0.312133252620697, + "learning_rate": 1.4889525792961103e-07, + "loss": 0.3264, + "step": 11092 + }, + { + "epoch": 2.789989939637827, + "grad_norm": 0.28549107909202576, + "learning_rate": 1.4854103546341204e-07, + "loss": 0.3121, + "step": 11093 + }, + { + "epoch": 2.7902414486921527, + "grad_norm": 0.28895583748817444, + "learning_rate": 1.4818722849432253e-07, + "loss": 0.3081, + "step": 11094 + }, + { + "epoch": 2.790492957746479, + "grad_norm": 0.2902131974697113, + "learning_rate": 1.4783383705264444e-07, + "loss": 0.3254, + "step": 11095 + }, + { + "epoch": 2.790744466800805, + "grad_norm": 0.2827642858028412, + "learning_rate": 1.474808611686429e-07, + "loss": 0.3218, + "step": 11096 + }, + { + "epoch": 2.7909959758551306, + "grad_norm": 0.3049061894416809, + "learning_rate": 1.4712830087254825e-07, + "loss": 0.3173, + "step": 11097 + }, + { + "epoch": 2.791247484909457, + "grad_norm": 0.2864426076412201, + "learning_rate": 1.467761561945552e-07, + "loss": 0.326, + "step": 11098 + }, + { + "epoch": 2.7914989939637826, + "grad_norm": 0.28067395091056824, + "learning_rate": 1.4642442716482298e-07, + "loss": 0.3218, + "step": 11099 + }, + { + "epoch": 2.7917505030181085, + "grad_norm": 0.28718701004981995, + "learning_rate": 1.4607311381347467e-07, + "loss": 0.3006, + "step": 11100 + }, + { + "epoch": 2.7920020120724347, + "grad_norm": 0.30900463461875916, + "learning_rate": 1.457222161705979e-07, + "loss": 0.3336, + "step": 11101 + }, + { + "epoch": 2.7922535211267605, + "grad_norm": 0.31647589802742004, + "learning_rate": 1.453717342662453e-07, + "loss": 0.3224, + "step": 11102 + }, + { + "epoch": 2.7925050301810863, + "grad_norm": 0.27756398916244507, + "learning_rate": 1.4502166813043283e-07, + "loss": 0.3268, + "step": 11103 + }, + { + "epoch": 2.7927565392354126, + "grad_norm": 0.2882534861564636, + "learning_rate": 1.446720177931421e-07, + "loss": 0.3331, + "step": 11104 + }, + { + "epoch": 2.7930080482897384, + "grad_norm": 0.2910817861557007, + "learning_rate": 1.4432278328431748e-07, + "loss": 0.3222, + "step": 11105 + }, + { + "epoch": 2.793259557344064, + "grad_norm": 0.26894426345825195, + "learning_rate": 1.4397396463387059e-07, + "loss": 0.3128, + "step": 11106 + }, + { + "epoch": 2.7935110663983904, + "grad_norm": 0.28713172674179077, + "learning_rate": 1.4362556187167365e-07, + "loss": 0.3278, + "step": 11107 + }, + { + "epoch": 2.7937625754527162, + "grad_norm": 0.28895851969718933, + "learning_rate": 1.4327757502756668e-07, + "loss": 0.3292, + "step": 11108 + }, + { + "epoch": 2.794014084507042, + "grad_norm": 0.285756379365921, + "learning_rate": 1.4293000413135084e-07, + "loss": 0.3139, + "step": 11109 + }, + { + "epoch": 2.7942655935613683, + "grad_norm": 0.3005828857421875, + "learning_rate": 1.4258284921279565e-07, + "loss": 0.3371, + "step": 11110 + }, + { + "epoch": 2.794517102615694, + "grad_norm": 0.282633900642395, + "learning_rate": 1.4223611030163064e-07, + "loss": 0.3343, + "step": 11111 + }, + { + "epoch": 2.79476861167002, + "grad_norm": 0.29902184009552, + "learning_rate": 1.4188978742755322e-07, + "loss": 0.2928, + "step": 11112 + }, + { + "epoch": 2.795020120724346, + "grad_norm": 0.3080330491065979, + "learning_rate": 1.415438806202224e-07, + "loss": 0.3043, + "step": 11113 + }, + { + "epoch": 2.795271629778672, + "grad_norm": 0.2750524878501892, + "learning_rate": 1.4119838990926448e-07, + "loss": 0.3241, + "step": 11114 + }, + { + "epoch": 2.7955231388329977, + "grad_norm": 0.27586209774017334, + "learning_rate": 1.4085331532426748e-07, + "loss": 0.326, + "step": 11115 + }, + { + "epoch": 2.795774647887324, + "grad_norm": 0.29278305172920227, + "learning_rate": 1.405086568947861e-07, + "loss": 0.3371, + "step": 11116 + }, + { + "epoch": 2.79602615694165, + "grad_norm": 0.30196496844291687, + "learning_rate": 1.401644146503367e-07, + "loss": 0.3354, + "step": 11117 + }, + { + "epoch": 2.7962776659959756, + "grad_norm": 0.2952038645744324, + "learning_rate": 1.3982058862040238e-07, + "loss": 0.2915, + "step": 11118 + }, + { + "epoch": 2.796529175050302, + "grad_norm": 0.29630571603775024, + "learning_rate": 1.3947717883442903e-07, + "loss": 0.3196, + "step": 11119 + }, + { + "epoch": 2.7967806841046277, + "grad_norm": 0.27728769183158875, + "learning_rate": 1.391341853218281e-07, + "loss": 0.3405, + "step": 11120 + }, + { + "epoch": 2.7970321931589535, + "grad_norm": 0.272607684135437, + "learning_rate": 1.3879160811197556e-07, + "loss": 0.3274, + "step": 11121 + }, + { + "epoch": 2.7972837022132797, + "grad_norm": 0.29455098509788513, + "learning_rate": 1.3844944723420906e-07, + "loss": 0.3242, + "step": 11122 + }, + { + "epoch": 2.7975352112676055, + "grad_norm": 0.2698933184146881, + "learning_rate": 1.3810770271783457e-07, + "loss": 0.3153, + "step": 11123 + }, + { + "epoch": 2.7977867203219313, + "grad_norm": 0.2703859806060791, + "learning_rate": 1.3776637459211872e-07, + "loss": 0.3044, + "step": 11124 + }, + { + "epoch": 2.7980382293762576, + "grad_norm": 0.29078856110572815, + "learning_rate": 1.3742546288629532e-07, + "loss": 0.3257, + "step": 11125 + }, + { + "epoch": 2.7982897384305834, + "grad_norm": 0.2826457917690277, + "learning_rate": 1.3708496762956047e-07, + "loss": 0.3087, + "step": 11126 + }, + { + "epoch": 2.7985412474849096, + "grad_norm": 0.2665698826313019, + "learning_rate": 1.367448888510764e-07, + "loss": 0.3194, + "step": 11127 + }, + { + "epoch": 2.7987927565392354, + "grad_norm": 0.27445247769355774, + "learning_rate": 1.3640522657996757e-07, + "loss": 0.3108, + "step": 11128 + }, + { + "epoch": 2.7990442655935612, + "grad_norm": 0.2883208096027374, + "learning_rate": 1.3606598084532517e-07, + "loss": 0.3259, + "step": 11129 + }, + { + "epoch": 2.7992957746478875, + "grad_norm": 0.2990553677082062, + "learning_rate": 1.357271516762021e-07, + "loss": 0.3369, + "step": 11130 + }, + { + "epoch": 2.7995472837022133, + "grad_norm": 0.2815016806125641, + "learning_rate": 1.3538873910161788e-07, + "loss": 0.3235, + "step": 11131 + }, + { + "epoch": 2.799798792756539, + "grad_norm": 0.2700446844100952, + "learning_rate": 1.3505074315055545e-07, + "loss": 0.3025, + "step": 11132 + }, + { + "epoch": 2.8000503018108653, + "grad_norm": 0.2885153889656067, + "learning_rate": 1.347131638519622e-07, + "loss": 0.3216, + "step": 11133 + }, + { + "epoch": 2.800301810865191, + "grad_norm": 0.2758094370365143, + "learning_rate": 1.3437600123474837e-07, + "loss": 0.3108, + "step": 11134 + }, + { + "epoch": 2.800553319919517, + "grad_norm": 0.2886246144771576, + "learning_rate": 1.3403925532779137e-07, + "loss": 0.3076, + "step": 11135 + }, + { + "epoch": 2.800804828973843, + "grad_norm": 0.28727173805236816, + "learning_rate": 1.3370292615993098e-07, + "loss": 0.3063, + "step": 11136 + }, + { + "epoch": 2.801056338028169, + "grad_norm": 0.2814168632030487, + "learning_rate": 1.333670137599713e-07, + "loss": 0.3179, + "step": 11137 + }, + { + "epoch": 2.801307847082495, + "grad_norm": 0.28780585527420044, + "learning_rate": 1.3303151815668103e-07, + "loss": 0.3431, + "step": 11138 + }, + { + "epoch": 2.801559356136821, + "grad_norm": 0.296987384557724, + "learning_rate": 1.3269643937879384e-07, + "loss": 0.3119, + "step": 11139 + }, + { + "epoch": 2.801810865191147, + "grad_norm": 0.2830405831336975, + "learning_rate": 1.323617774550068e-07, + "loss": 0.3119, + "step": 11140 + }, + { + "epoch": 2.802062374245473, + "grad_norm": 0.29808276891708374, + "learning_rate": 1.3202753241398192e-07, + "loss": 0.3218, + "step": 11141 + }, + { + "epoch": 2.802313883299799, + "grad_norm": 0.27036961913108826, + "learning_rate": 1.316937042843447e-07, + "loss": 0.3352, + "step": 11142 + }, + { + "epoch": 2.8025653923541247, + "grad_norm": 0.2818096876144409, + "learning_rate": 1.3136029309468612e-07, + "loss": 0.3312, + "step": 11143 + }, + { + "epoch": 2.802816901408451, + "grad_norm": 0.3205699026584625, + "learning_rate": 1.3102729887355947e-07, + "loss": 0.3306, + "step": 11144 + }, + { + "epoch": 2.8030684104627768, + "grad_norm": 0.28567755222320557, + "learning_rate": 1.3069472164948526e-07, + "loss": 0.3176, + "step": 11145 + }, + { + "epoch": 2.8033199195171026, + "grad_norm": 0.30168774724006653, + "learning_rate": 1.3036256145094516e-07, + "loss": 0.3242, + "step": 11146 + }, + { + "epoch": 2.803571428571429, + "grad_norm": 0.304684579372406, + "learning_rate": 1.3003081830638752e-07, + "loss": 0.3341, + "step": 11147 + }, + { + "epoch": 2.8038229376257546, + "grad_norm": 0.29443493485450745, + "learning_rate": 1.2969949224422407e-07, + "loss": 0.3268, + "step": 11148 + }, + { + "epoch": 2.8040744466800804, + "grad_norm": 0.2812149226665497, + "learning_rate": 1.2936858329283043e-07, + "loss": 0.3193, + "step": 11149 + }, + { + "epoch": 2.8043259557344067, + "grad_norm": 0.3072992265224457, + "learning_rate": 1.290380914805478e-07, + "loss": 0.3232, + "step": 11150 + }, + { + "epoch": 2.8045774647887325, + "grad_norm": 0.2899894416332245, + "learning_rate": 1.2870801683567913e-07, + "loss": 0.3289, + "step": 11151 + }, + { + "epoch": 2.8048289738430583, + "grad_norm": 0.29002153873443604, + "learning_rate": 1.2837835938649456e-07, + "loss": 0.3479, + "step": 11152 + }, + { + "epoch": 2.8050804828973845, + "grad_norm": 0.30149370431900024, + "learning_rate": 1.2804911916122596e-07, + "loss": 0.305, + "step": 11153 + }, + { + "epoch": 2.8053319919517103, + "grad_norm": 0.2866424322128296, + "learning_rate": 1.2772029618807247e-07, + "loss": 0.3309, + "step": 11154 + }, + { + "epoch": 2.805583501006036, + "grad_norm": 0.3057340383529663, + "learning_rate": 1.2739189049519429e-07, + "loss": 0.2923, + "step": 11155 + }, + { + "epoch": 2.8058350100603624, + "grad_norm": 0.27844423055648804, + "learning_rate": 1.2706390211071784e-07, + "loss": 0.3382, + "step": 11156 + }, + { + "epoch": 2.806086519114688, + "grad_norm": 0.28191810846328735, + "learning_rate": 1.2673633106273286e-07, + "loss": 0.3181, + "step": 11157 + }, + { + "epoch": 2.806338028169014, + "grad_norm": 0.2744201719760895, + "learning_rate": 1.2640917737929414e-07, + "loss": 0.3168, + "step": 11158 + }, + { + "epoch": 2.8065895372233403, + "grad_norm": 0.28745532035827637, + "learning_rate": 1.2608244108842094e-07, + "loss": 0.3212, + "step": 11159 + }, + { + "epoch": 2.806841046277666, + "grad_norm": 0.2886742055416107, + "learning_rate": 1.2575612221809476e-07, + "loss": 0.3253, + "step": 11160 + }, + { + "epoch": 2.807092555331992, + "grad_norm": 0.2701511085033417, + "learning_rate": 1.2543022079626376e-07, + "loss": 0.3295, + "step": 11161 + }, + { + "epoch": 2.807344064386318, + "grad_norm": 0.2751189172267914, + "learning_rate": 1.25104736850839e-07, + "loss": 0.3468, + "step": 11162 + }, + { + "epoch": 2.807595573440644, + "grad_norm": 0.2670914828777313, + "learning_rate": 1.2477967040969708e-07, + "loss": 0.3246, + "step": 11163 + }, + { + "epoch": 2.8078470824949697, + "grad_norm": 0.27149543166160583, + "learning_rate": 1.2445502150067623e-07, + "loss": 0.3093, + "step": 11164 + }, + { + "epoch": 2.808098591549296, + "grad_norm": 0.3040313124656677, + "learning_rate": 1.2413079015158202e-07, + "loss": 0.3027, + "step": 11165 + }, + { + "epoch": 2.808350100603622, + "grad_norm": 0.2989967465400696, + "learning_rate": 1.238069763901817e-07, + "loss": 0.3109, + "step": 11166 + }, + { + "epoch": 2.8086016096579476, + "grad_norm": 0.2808802127838135, + "learning_rate": 1.2348358024420914e-07, + "loss": 0.3019, + "step": 11167 + }, + { + "epoch": 2.808853118712274, + "grad_norm": 0.28532177209854126, + "learning_rate": 1.2316060174136e-07, + "loss": 0.3205, + "step": 11168 + }, + { + "epoch": 2.8091046277665996, + "grad_norm": 0.29730409383773804, + "learning_rate": 1.2283804090929608e-07, + "loss": 0.3281, + "step": 11169 + }, + { + "epoch": 2.8093561368209254, + "grad_norm": 0.28388866782188416, + "learning_rate": 1.2251589777564242e-07, + "loss": 0.3285, + "step": 11170 + }, + { + "epoch": 2.8096076458752517, + "grad_norm": 0.2799801230430603, + "learning_rate": 1.221941723679887e-07, + "loss": 0.3403, + "step": 11171 + }, + { + "epoch": 2.8098591549295775, + "grad_norm": 0.2923330068588257, + "learning_rate": 1.2187286471388893e-07, + "loss": 0.36, + "step": 11172 + }, + { + "epoch": 2.8101106639839033, + "grad_norm": 0.31289908289909363, + "learning_rate": 1.2155197484086055e-07, + "loss": 0.3146, + "step": 11173 + }, + { + "epoch": 2.8103621730382295, + "grad_norm": 0.30059537291526794, + "learning_rate": 1.2123150277638662e-07, + "loss": 0.3224, + "step": 11174 + }, + { + "epoch": 2.8106136820925554, + "grad_norm": 0.27516844868659973, + "learning_rate": 1.2091144854791237e-07, + "loss": 0.3201, + "step": 11175 + }, + { + "epoch": 2.810865191146881, + "grad_norm": 0.2999837398529053, + "learning_rate": 1.2059181218284922e-07, + "loss": 0.3061, + "step": 11176 + }, + { + "epoch": 2.8111167002012074, + "grad_norm": 0.2801435887813568, + "learning_rate": 1.2027259370857193e-07, + "loss": 0.3218, + "step": 11177 + }, + { + "epoch": 2.811368209255533, + "grad_norm": 0.28390711545944214, + "learning_rate": 1.1995379315241972e-07, + "loss": 0.3263, + "step": 11178 + }, + { + "epoch": 2.811619718309859, + "grad_norm": 0.28144189715385437, + "learning_rate": 1.1963541054169526e-07, + "loss": 0.3042, + "step": 11179 + }, + { + "epoch": 2.8118712273641853, + "grad_norm": 0.3061021566390991, + "learning_rate": 1.193174459036661e-07, + "loss": 0.3118, + "step": 11180 + }, + { + "epoch": 2.812122736418511, + "grad_norm": 0.267780601978302, + "learning_rate": 1.1899989926556498e-07, + "loss": 0.334, + "step": 11181 + }, + { + "epoch": 2.812374245472837, + "grad_norm": 0.301758736371994, + "learning_rate": 1.1868277065458677e-07, + "loss": 0.3052, + "step": 11182 + }, + { + "epoch": 2.812625754527163, + "grad_norm": 0.2828764319419861, + "learning_rate": 1.1836606009789198e-07, + "loss": 0.3152, + "step": 11183 + }, + { + "epoch": 2.812877263581489, + "grad_norm": 0.27735185623168945, + "learning_rate": 1.1804976762260445e-07, + "loss": 0.3104, + "step": 11184 + }, + { + "epoch": 2.8131287726358147, + "grad_norm": 0.2653750777244568, + "learning_rate": 1.1773389325581363e-07, + "loss": 0.3207, + "step": 11185 + }, + { + "epoch": 2.813380281690141, + "grad_norm": 0.2903718948364258, + "learning_rate": 1.1741843702457068e-07, + "loss": 0.3296, + "step": 11186 + }, + { + "epoch": 2.813631790744467, + "grad_norm": 0.2752559185028076, + "learning_rate": 1.171033989558934e-07, + "loss": 0.3011, + "step": 11187 + }, + { + "epoch": 2.8138832997987926, + "grad_norm": 0.3056381642818451, + "learning_rate": 1.16788779076763e-07, + "loss": 0.3116, + "step": 11188 + }, + { + "epoch": 2.814134808853119, + "grad_norm": 0.28672274947166443, + "learning_rate": 1.164745774141246e-07, + "loss": 0.3503, + "step": 11189 + }, + { + "epoch": 2.8143863179074446, + "grad_norm": 0.2924738824367523, + "learning_rate": 1.1616079399488667e-07, + "loss": 0.3123, + "step": 11190 + }, + { + "epoch": 2.8146378269617705, + "grad_norm": 0.2763361930847168, + "learning_rate": 1.1584742884592382e-07, + "loss": 0.2954, + "step": 11191 + }, + { + "epoch": 2.8148893360160967, + "grad_norm": 0.28140226006507874, + "learning_rate": 1.1553448199407346e-07, + "loss": 0.3254, + "step": 11192 + }, + { + "epoch": 2.8151408450704225, + "grad_norm": 0.281065434217453, + "learning_rate": 1.1522195346613752e-07, + "loss": 0.3213, + "step": 11193 + }, + { + "epoch": 2.8153923541247483, + "grad_norm": 0.2887207269668579, + "learning_rate": 1.1490984328888288e-07, + "loss": 0.3029, + "step": 11194 + }, + { + "epoch": 2.8156438631790746, + "grad_norm": 0.26352542638778687, + "learning_rate": 1.1459815148903819e-07, + "loss": 0.3126, + "step": 11195 + }, + { + "epoch": 2.8158953722334004, + "grad_norm": 0.2894093692302704, + "learning_rate": 1.1428687809329986e-07, + "loss": 0.3023, + "step": 11196 + }, + { + "epoch": 2.816146881287726, + "grad_norm": 0.285803884267807, + "learning_rate": 1.139760231283249e-07, + "loss": 0.3372, + "step": 11197 + }, + { + "epoch": 2.8163983903420524, + "grad_norm": 0.2947225868701935, + "learning_rate": 1.1366558662073646e-07, + "loss": 0.3078, + "step": 11198 + }, + { + "epoch": 2.816649899396378, + "grad_norm": 0.3021707236766815, + "learning_rate": 1.1335556859712216e-07, + "loss": 0.3293, + "step": 11199 + }, + { + "epoch": 2.816901408450704, + "grad_norm": 0.27112025022506714, + "learning_rate": 1.1304596908403243e-07, + "loss": 0.3064, + "step": 11200 + }, + { + "epoch": 2.8171529175050303, + "grad_norm": 0.2846040427684784, + "learning_rate": 1.1273678810798272e-07, + "loss": 0.3141, + "step": 11201 + }, + { + "epoch": 2.817404426559356, + "grad_norm": 0.28327932953834534, + "learning_rate": 1.1242802569545241e-07, + "loss": 0.334, + "step": 11202 + }, + { + "epoch": 2.817655935613682, + "grad_norm": 0.2747574746608734, + "learning_rate": 1.1211968187288535e-07, + "loss": 0.3375, + "step": 11203 + }, + { + "epoch": 2.817907444668008, + "grad_norm": 0.2933041751384735, + "learning_rate": 1.1181175666668931e-07, + "loss": 0.3076, + "step": 11204 + }, + { + "epoch": 2.818158953722334, + "grad_norm": 0.288150817155838, + "learning_rate": 1.1150425010323541e-07, + "loss": 0.3362, + "step": 11205 + }, + { + "epoch": 2.8184104627766597, + "grad_norm": 0.27248165011405945, + "learning_rate": 1.1119716220886034e-07, + "loss": 0.3292, + "step": 11206 + }, + { + "epoch": 2.818661971830986, + "grad_norm": 0.2919750511646271, + "learning_rate": 1.1089049300986421e-07, + "loss": 0.3198, + "step": 11207 + }, + { + "epoch": 2.818913480885312, + "grad_norm": 0.28943032026290894, + "learning_rate": 1.1058424253251099e-07, + "loss": 0.3201, + "step": 11208 + }, + { + "epoch": 2.8191649899396376, + "grad_norm": 0.28583675622940063, + "learning_rate": 1.102784108030297e-07, + "loss": 0.3293, + "step": 11209 + }, + { + "epoch": 2.819416498993964, + "grad_norm": 0.2858569025993347, + "learning_rate": 1.0997299784761218e-07, + "loss": 0.3149, + "step": 11210 + }, + { + "epoch": 2.8196680080482897, + "grad_norm": 0.3017948269844055, + "learning_rate": 1.0966800369241526e-07, + "loss": 0.3415, + "step": 11211 + }, + { + "epoch": 2.8199195171026155, + "grad_norm": 0.2693357765674591, + "learning_rate": 1.0936342836356028e-07, + "loss": 0.3366, + "step": 11212 + }, + { + "epoch": 2.8201710261569417, + "grad_norm": 0.29551607370376587, + "learning_rate": 1.0905927188713195e-07, + "loss": 0.3274, + "step": 11213 + }, + { + "epoch": 2.8204225352112675, + "grad_norm": 0.3079835772514343, + "learning_rate": 1.0875553428917939e-07, + "loss": 0.3176, + "step": 11214 + }, + { + "epoch": 2.8206740442655933, + "grad_norm": 0.28076887130737305, + "learning_rate": 1.0845221559571517e-07, + "loss": 0.3163, + "step": 11215 + }, + { + "epoch": 2.8209255533199196, + "grad_norm": 0.2994937598705292, + "learning_rate": 1.0814931583271848e-07, + "loss": 0.3265, + "step": 11216 + }, + { + "epoch": 2.8211770623742454, + "grad_norm": 0.293682724237442, + "learning_rate": 1.0784683502612858e-07, + "loss": 0.3358, + "step": 11217 + }, + { + "epoch": 2.821428571428571, + "grad_norm": 0.3048734664916992, + "learning_rate": 1.0754477320185253e-07, + "loss": 0.3298, + "step": 11218 + }, + { + "epoch": 2.8216800804828974, + "grad_norm": 0.27152693271636963, + "learning_rate": 1.0724313038575906e-07, + "loss": 0.3252, + "step": 11219 + }, + { + "epoch": 2.8219315895372232, + "grad_norm": 0.3005521893501282, + "learning_rate": 1.0694190660368309e-07, + "loss": 0.3093, + "step": 11220 + }, + { + "epoch": 2.822183098591549, + "grad_norm": 0.281059205532074, + "learning_rate": 1.0664110188142118e-07, + "loss": 0.3335, + "step": 11221 + }, + { + "epoch": 2.8224346076458753, + "grad_norm": 0.2841849625110626, + "learning_rate": 1.0634071624473719e-07, + "loss": 0.3297, + "step": 11222 + }, + { + "epoch": 2.822686116700201, + "grad_norm": 0.29039302468299866, + "learning_rate": 1.0604074971935497e-07, + "loss": 0.323, + "step": 11223 + }, + { + "epoch": 2.822937625754527, + "grad_norm": 0.2728997766971588, + "learning_rate": 1.0574120233096674e-07, + "loss": 0.3357, + "step": 11224 + }, + { + "epoch": 2.823189134808853, + "grad_norm": 0.2747051417827606, + "learning_rate": 1.0544207410522644e-07, + "loss": 0.324, + "step": 11225 + }, + { + "epoch": 2.823440643863179, + "grad_norm": 0.286855548620224, + "learning_rate": 1.0514336506775135e-07, + "loss": 0.2887, + "step": 11226 + }, + { + "epoch": 2.823692152917505, + "grad_norm": 0.28169578313827515, + "learning_rate": 1.0484507524412602e-07, + "loss": 0.3123, + "step": 11227 + }, + { + "epoch": 2.823943661971831, + "grad_norm": 0.2875922918319702, + "learning_rate": 1.0454720465989498e-07, + "loss": 0.3339, + "step": 11228 + }, + { + "epoch": 2.824195171026157, + "grad_norm": 0.31413838267326355, + "learning_rate": 1.0424975334057064e-07, + "loss": 0.3335, + "step": 11229 + }, + { + "epoch": 2.824446680080483, + "grad_norm": 0.2952876091003418, + "learning_rate": 1.0395272131162703e-07, + "loss": 0.3128, + "step": 11230 + }, + { + "epoch": 2.824698189134809, + "grad_norm": 0.2815072238445282, + "learning_rate": 1.0365610859850328e-07, + "loss": 0.324, + "step": 11231 + }, + { + "epoch": 2.8249496981891347, + "grad_norm": 0.2810538709163666, + "learning_rate": 1.0335991522660239e-07, + "loss": 0.3117, + "step": 11232 + }, + { + "epoch": 2.825201207243461, + "grad_norm": 0.2817157804965973, + "learning_rate": 1.0306414122129127e-07, + "loss": 0.3148, + "step": 11233 + }, + { + "epoch": 2.8254527162977867, + "grad_norm": 0.2859732210636139, + "learning_rate": 1.0276878660790135e-07, + "loss": 0.3146, + "step": 11234 + }, + { + "epoch": 2.8257042253521125, + "grad_norm": 0.2922956347465515, + "learning_rate": 1.0247385141172794e-07, + "loss": 0.3036, + "step": 11235 + }, + { + "epoch": 2.8259557344064388, + "grad_norm": 0.2939150631427765, + "learning_rate": 1.0217933565803085e-07, + "loss": 0.3114, + "step": 11236 + }, + { + "epoch": 2.8262072434607646, + "grad_norm": 0.2930050492286682, + "learning_rate": 1.018852393720321e-07, + "loss": 0.3278, + "step": 11237 + }, + { + "epoch": 2.8264587525150904, + "grad_norm": 0.2771678566932678, + "learning_rate": 1.01591562578921e-07, + "loss": 0.3314, + "step": 11238 + }, + { + "epoch": 2.8267102615694166, + "grad_norm": 0.2838783264160156, + "learning_rate": 1.0129830530384743e-07, + "loss": 0.302, + "step": 11239 + }, + { + "epoch": 2.8269617706237424, + "grad_norm": 0.28282177448272705, + "learning_rate": 1.0100546757192853e-07, + "loss": 0.3397, + "step": 11240 + }, + { + "epoch": 2.8272132796780687, + "grad_norm": 0.28225719928741455, + "learning_rate": 1.0071304940824255e-07, + "loss": 0.3246, + "step": 11241 + }, + { + "epoch": 2.8274647887323945, + "grad_norm": 0.29289302229881287, + "learning_rate": 1.0042105083783449e-07, + "loss": 0.312, + "step": 11242 + }, + { + "epoch": 2.8277162977867203, + "grad_norm": 0.2851783335208893, + "learning_rate": 1.0012947188571154e-07, + "loss": 0.3026, + "step": 11243 + }, + { + "epoch": 2.8279678068410465, + "grad_norm": 0.3011089265346527, + "learning_rate": 9.983831257684651e-08, + "loss": 0.3134, + "step": 11244 + }, + { + "epoch": 2.8282193158953723, + "grad_norm": 0.28961464762687683, + "learning_rate": 9.95475729361739e-08, + "loss": 0.3119, + "step": 11245 + }, + { + "epoch": 2.828470824949698, + "grad_norm": 0.30916205048561096, + "learning_rate": 9.92572529885949e-08, + "loss": 0.3098, + "step": 11246 + }, + { + "epoch": 2.8287223340040244, + "grad_norm": 0.29237234592437744, + "learning_rate": 9.896735275897296e-08, + "loss": 0.3082, + "step": 11247 + }, + { + "epoch": 2.82897384305835, + "grad_norm": 0.3051128387451172, + "learning_rate": 9.867787227213655e-08, + "loss": 0.3121, + "step": 11248 + }, + { + "epoch": 2.829225352112676, + "grad_norm": 0.2796797454357147, + "learning_rate": 9.838881155287806e-08, + "loss": 0.3058, + "step": 11249 + }, + { + "epoch": 2.8294768611670023, + "grad_norm": 0.28661051392555237, + "learning_rate": 9.810017062595322e-08, + "loss": 0.3158, + "step": 11250 + }, + { + "epoch": 2.829728370221328, + "grad_norm": 0.28525805473327637, + "learning_rate": 9.781194951608286e-08, + "loss": 0.3155, + "step": 11251 + }, + { + "epoch": 2.829979879275654, + "grad_norm": 0.2928963601589203, + "learning_rate": 9.75241482479511e-08, + "loss": 0.3174, + "step": 11252 + }, + { + "epoch": 2.83023138832998, + "grad_norm": 0.28580477833747864, + "learning_rate": 9.723676684620542e-08, + "loss": 0.3443, + "step": 11253 + }, + { + "epoch": 2.830482897384306, + "grad_norm": 0.29559531807899475, + "learning_rate": 9.694980533546005e-08, + "loss": 0.3303, + "step": 11254 + }, + { + "epoch": 2.8307344064386317, + "grad_norm": 0.27034255862236023, + "learning_rate": 9.666326374028979e-08, + "loss": 0.3093, + "step": 11255 + }, + { + "epoch": 2.830985915492958, + "grad_norm": 0.29185721278190613, + "learning_rate": 9.637714208523664e-08, + "loss": 0.331, + "step": 11256 + }, + { + "epoch": 2.8312374245472838, + "grad_norm": 0.2903611660003662, + "learning_rate": 9.609144039480323e-08, + "loss": 0.3167, + "step": 11257 + }, + { + "epoch": 2.8314889336016096, + "grad_norm": 0.28841114044189453, + "learning_rate": 9.580615869345999e-08, + "loss": 0.3126, + "step": 11258 + }, + { + "epoch": 2.831740442655936, + "grad_norm": 0.2874075174331665, + "learning_rate": 9.55212970056385e-08, + "loss": 0.3075, + "step": 11259 + }, + { + "epoch": 2.8319919517102616, + "grad_norm": 0.2626476287841797, + "learning_rate": 9.52368553557359e-08, + "loss": 0.3212, + "step": 11260 + }, + { + "epoch": 2.8322434607645874, + "grad_norm": 0.28571265935897827, + "learning_rate": 9.49528337681116e-08, + "loss": 0.3141, + "step": 11261 + }, + { + "epoch": 2.8324949698189137, + "grad_norm": 0.27694886922836304, + "learning_rate": 9.466923226709223e-08, + "loss": 0.3197, + "step": 11262 + }, + { + "epoch": 2.8327464788732395, + "grad_norm": 0.30933791399002075, + "learning_rate": 9.43860508769645e-08, + "loss": 0.333, + "step": 11263 + }, + { + "epoch": 2.8329979879275653, + "grad_norm": 0.28589534759521484, + "learning_rate": 9.410328962198289e-08, + "loss": 0.3427, + "step": 11264 + }, + { + "epoch": 2.8332494969818915, + "grad_norm": 0.2972225248813629, + "learning_rate": 9.382094852636303e-08, + "loss": 0.3231, + "step": 11265 + }, + { + "epoch": 2.8335010060362174, + "grad_norm": 0.2830668091773987, + "learning_rate": 9.353902761428557e-08, + "loss": 0.3156, + "step": 11266 + }, + { + "epoch": 2.833752515090543, + "grad_norm": 0.30200186371803284, + "learning_rate": 9.325752690989676e-08, + "loss": 0.3334, + "step": 11267 + }, + { + "epoch": 2.8340040241448694, + "grad_norm": 0.27993977069854736, + "learning_rate": 9.297644643730342e-08, + "loss": 0.3043, + "step": 11268 + }, + { + "epoch": 2.834255533199195, + "grad_norm": 0.27513226866722107, + "learning_rate": 9.269578622057962e-08, + "loss": 0.3329, + "step": 11269 + }, + { + "epoch": 2.834507042253521, + "grad_norm": 0.2780003249645233, + "learning_rate": 9.241554628376115e-08, + "loss": 0.3006, + "step": 11270 + }, + { + "epoch": 2.8347585513078473, + "grad_norm": 0.2845858633518219, + "learning_rate": 9.213572665084991e-08, + "loss": 0.3041, + "step": 11271 + }, + { + "epoch": 2.835010060362173, + "grad_norm": 0.27587416768074036, + "learning_rate": 9.185632734581008e-08, + "loss": 0.3032, + "step": 11272 + }, + { + "epoch": 2.835261569416499, + "grad_norm": 0.28815388679504395, + "learning_rate": 9.157734839257026e-08, + "loss": 0.3105, + "step": 11273 + }, + { + "epoch": 2.835513078470825, + "grad_norm": 0.279449999332428, + "learning_rate": 9.12987898150236e-08, + "loss": 0.3063, + "step": 11274 + }, + { + "epoch": 2.835764587525151, + "grad_norm": 0.2929026484489441, + "learning_rate": 9.102065163702767e-08, + "loss": 0.3263, + "step": 11275 + }, + { + "epoch": 2.8360160965794767, + "grad_norm": 0.28392016887664795, + "learning_rate": 9.074293388240118e-08, + "loss": 0.3048, + "step": 11276 + }, + { + "epoch": 2.836267605633803, + "grad_norm": 0.27088797092437744, + "learning_rate": 9.046563657493068e-08, + "loss": 0.3295, + "step": 11277 + }, + { + "epoch": 2.836519114688129, + "grad_norm": 0.28667935729026794, + "learning_rate": 9.018875973836493e-08, + "loss": 0.3453, + "step": 11278 + }, + { + "epoch": 2.8367706237424546, + "grad_norm": 0.29696646332740784, + "learning_rate": 8.991230339641554e-08, + "loss": 0.286, + "step": 11279 + }, + { + "epoch": 2.837022132796781, + "grad_norm": 0.2868374288082123, + "learning_rate": 8.963626757276078e-08, + "loss": 0.3175, + "step": 11280 + }, + { + "epoch": 2.8372736418511066, + "grad_norm": 0.2921113967895508, + "learning_rate": 8.936065229104007e-08, + "loss": 0.3318, + "step": 11281 + }, + { + "epoch": 2.8375251509054324, + "grad_norm": 0.2965297996997833, + "learning_rate": 8.908545757485843e-08, + "loss": 0.3135, + "step": 11282 + }, + { + "epoch": 2.8377766599597587, + "grad_norm": 0.2960835099220276, + "learning_rate": 8.881068344778476e-08, + "loss": 0.3107, + "step": 11283 + }, + { + "epoch": 2.8380281690140845, + "grad_norm": 0.27558842301368713, + "learning_rate": 8.853632993335248e-08, + "loss": 0.3091, + "step": 11284 + }, + { + "epoch": 2.8382796780684103, + "grad_norm": 0.28575974702835083, + "learning_rate": 8.826239705505668e-08, + "loss": 0.3177, + "step": 11285 + }, + { + "epoch": 2.8385311871227366, + "grad_norm": 0.2759111225605011, + "learning_rate": 8.79888848363597e-08, + "loss": 0.3333, + "step": 11286 + }, + { + "epoch": 2.8387826961770624, + "grad_norm": 0.2808525264263153, + "learning_rate": 8.771579330068447e-08, + "loss": 0.3325, + "step": 11287 + }, + { + "epoch": 2.839034205231388, + "grad_norm": 0.2843991816043854, + "learning_rate": 8.744312247142005e-08, + "loss": 0.297, + "step": 11288 + }, + { + "epoch": 2.8392857142857144, + "grad_norm": 0.28934431076049805, + "learning_rate": 8.717087237192057e-08, + "loss": 0.3257, + "step": 11289 + }, + { + "epoch": 2.83953722334004, + "grad_norm": 0.3172331750392914, + "learning_rate": 8.689904302550011e-08, + "loss": 0.3271, + "step": 11290 + }, + { + "epoch": 2.839788732394366, + "grad_norm": 0.27618086338043213, + "learning_rate": 8.662763445544121e-08, + "loss": 0.34, + "step": 11291 + }, + { + "epoch": 2.8400402414486923, + "grad_norm": 0.29746612906455994, + "learning_rate": 8.635664668498744e-08, + "loss": 0.3051, + "step": 11292 + }, + { + "epoch": 2.840291750503018, + "grad_norm": 0.3068857789039612, + "learning_rate": 8.608607973734695e-08, + "loss": 0.3271, + "step": 11293 + }, + { + "epoch": 2.840543259557344, + "grad_norm": 0.28964129090309143, + "learning_rate": 8.58159336356923e-08, + "loss": 0.3228, + "step": 11294 + }, + { + "epoch": 2.84079476861167, + "grad_norm": 0.28352829813957214, + "learning_rate": 8.554620840315997e-08, + "loss": 0.3148, + "step": 11295 + }, + { + "epoch": 2.841046277665996, + "grad_norm": 0.3184901475906372, + "learning_rate": 8.527690406285039e-08, + "loss": 0.3446, + "step": 11296 + }, + { + "epoch": 2.8412977867203217, + "grad_norm": 0.2846779525279999, + "learning_rate": 8.500802063782732e-08, + "loss": 0.3211, + "step": 11297 + }, + { + "epoch": 2.841549295774648, + "grad_norm": 0.2861534655094147, + "learning_rate": 8.4739558151119e-08, + "loss": 0.3138, + "step": 11298 + }, + { + "epoch": 2.841800804828974, + "grad_norm": 0.3019251227378845, + "learning_rate": 8.447151662571762e-08, + "loss": 0.3251, + "step": 11299 + }, + { + "epoch": 2.8420523138832996, + "grad_norm": 0.27594611048698425, + "learning_rate": 8.420389608458035e-08, + "loss": 0.3178, + "step": 11300 + }, + { + "epoch": 2.842303822937626, + "grad_norm": 0.28519585728645325, + "learning_rate": 8.393669655062553e-08, + "loss": 0.3354, + "step": 11301 + }, + { + "epoch": 2.8425553319919517, + "grad_norm": 0.31750738620758057, + "learning_rate": 8.366991804673818e-08, + "loss": 0.3143, + "step": 11302 + }, + { + "epoch": 2.8428068410462775, + "grad_norm": 0.3031100332736969, + "learning_rate": 8.340356059576505e-08, + "loss": 0.3148, + "step": 11303 + }, + { + "epoch": 2.8430583501006037, + "grad_norm": 0.2872277498245239, + "learning_rate": 8.313762422052008e-08, + "loss": 0.321, + "step": 11304 + }, + { + "epoch": 2.8433098591549295, + "grad_norm": 0.2903568744659424, + "learning_rate": 8.287210894377673e-08, + "loss": 0.312, + "step": 11305 + }, + { + "epoch": 2.8435613682092553, + "grad_norm": 0.2857918441295624, + "learning_rate": 8.260701478827626e-08, + "loss": 0.3355, + "step": 11306 + }, + { + "epoch": 2.8438128772635816, + "grad_norm": 0.2868918180465698, + "learning_rate": 8.234234177672217e-08, + "loss": 0.3188, + "step": 11307 + }, + { + "epoch": 2.8440643863179074, + "grad_norm": 0.2801552414894104, + "learning_rate": 8.207808993178134e-08, + "loss": 0.3317, + "step": 11308 + }, + { + "epoch": 2.844315895372233, + "grad_norm": 0.28546252846717834, + "learning_rate": 8.181425927608566e-08, + "loss": 0.3225, + "step": 11309 + }, + { + "epoch": 2.8445674044265594, + "grad_norm": 0.3166808485984802, + "learning_rate": 8.155084983223038e-08, + "loss": 0.3293, + "step": 11310 + }, + { + "epoch": 2.8448189134808852, + "grad_norm": 0.287341445684433, + "learning_rate": 8.12878616227758e-08, + "loss": 0.3176, + "step": 11311 + }, + { + "epoch": 2.845070422535211, + "grad_norm": 0.27534937858581543, + "learning_rate": 8.102529467024389e-08, + "loss": 0.3313, + "step": 11312 + }, + { + "epoch": 2.8453219315895373, + "grad_norm": 0.28979775309562683, + "learning_rate": 8.076314899712279e-08, + "loss": 0.3389, + "step": 11313 + }, + { + "epoch": 2.845573440643863, + "grad_norm": 0.2847757339477539, + "learning_rate": 8.050142462586285e-08, + "loss": 0.3058, + "step": 11314 + }, + { + "epoch": 2.845824949698189, + "grad_norm": 0.2654733657836914, + "learning_rate": 8.024012157888062e-08, + "loss": 0.3408, + "step": 11315 + }, + { + "epoch": 2.846076458752515, + "grad_norm": 0.2785297632217407, + "learning_rate": 7.997923987855316e-08, + "loss": 0.3114, + "step": 11316 + }, + { + "epoch": 2.846327967806841, + "grad_norm": 0.27078497409820557, + "learning_rate": 7.971877954722429e-08, + "loss": 0.305, + "step": 11317 + }, + { + "epoch": 2.8465794768611667, + "grad_norm": 0.29301923513412476, + "learning_rate": 7.945874060720116e-08, + "loss": 0.3411, + "step": 11318 + }, + { + "epoch": 2.846830985915493, + "grad_norm": 0.2895153760910034, + "learning_rate": 7.919912308075428e-08, + "loss": 0.2994, + "step": 11319 + }, + { + "epoch": 2.847082494969819, + "grad_norm": 0.3017180263996124, + "learning_rate": 7.893992699011754e-08, + "loss": 0.3418, + "step": 11320 + }, + { + "epoch": 2.8473340040241446, + "grad_norm": 0.3073883652687073, + "learning_rate": 7.868115235748986e-08, + "loss": 0.3158, + "step": 11321 + }, + { + "epoch": 2.847585513078471, + "grad_norm": 0.28526899218559265, + "learning_rate": 7.842279920503404e-08, + "loss": 0.3253, + "step": 11322 + }, + { + "epoch": 2.8478370221327967, + "grad_norm": 0.2872920036315918, + "learning_rate": 7.81648675548763e-08, + "loss": 0.3285, + "step": 11323 + }, + { + "epoch": 2.8480885311871225, + "grad_norm": 0.2851753532886505, + "learning_rate": 7.790735742910671e-08, + "loss": 0.3204, + "step": 11324 + }, + { + "epoch": 2.8483400402414487, + "grad_norm": 0.28143948316574097, + "learning_rate": 7.765026884977934e-08, + "loss": 0.3232, + "step": 11325 + }, + { + "epoch": 2.8485915492957745, + "grad_norm": 0.28405773639678955, + "learning_rate": 7.739360183891265e-08, + "loss": 0.3175, + "step": 11326 + }, + { + "epoch": 2.8488430583501008, + "grad_norm": 0.3014451265335083, + "learning_rate": 7.713735641848796e-08, + "loss": 0.3247, + "step": 11327 + }, + { + "epoch": 2.8490945674044266, + "grad_norm": 0.2807544469833374, + "learning_rate": 7.688153261045161e-08, + "loss": 0.3086, + "step": 11328 + }, + { + "epoch": 2.8493460764587524, + "grad_norm": 0.2870287001132965, + "learning_rate": 7.662613043671274e-08, + "loss": 0.3077, + "step": 11329 + }, + { + "epoch": 2.8495975855130786, + "grad_norm": 0.2920602560043335, + "learning_rate": 7.637114991914552e-08, + "loss": 0.3255, + "step": 11330 + }, + { + "epoch": 2.8498490945674044, + "grad_norm": 0.29726293683052063, + "learning_rate": 7.611659107958692e-08, + "loss": 0.3174, + "step": 11331 + }, + { + "epoch": 2.8501006036217302, + "grad_norm": 0.2930297553539276, + "learning_rate": 7.586245393983837e-08, + "loss": 0.3149, + "step": 11332 + }, + { + "epoch": 2.8503521126760565, + "grad_norm": 0.29084110260009766, + "learning_rate": 7.560873852166584e-08, + "loss": 0.3169, + "step": 11333 + }, + { + "epoch": 2.8506036217303823, + "grad_norm": 0.29585593938827515, + "learning_rate": 7.535544484679747e-08, + "loss": 0.3074, + "step": 11334 + }, + { + "epoch": 2.850855130784708, + "grad_norm": 0.29283884167671204, + "learning_rate": 7.510257293692702e-08, + "loss": 0.3137, + "step": 11335 + }, + { + "epoch": 2.8511066398390343, + "grad_norm": 0.31102699041366577, + "learning_rate": 7.485012281371107e-08, + "loss": 0.3049, + "step": 11336 + }, + { + "epoch": 2.85135814889336, + "grad_norm": 0.3003372848033905, + "learning_rate": 7.45980944987712e-08, + "loss": 0.3204, + "step": 11337 + }, + { + "epoch": 2.8516096579476864, + "grad_norm": 0.2767249643802643, + "learning_rate": 7.434648801369015e-08, + "loss": 0.3144, + "step": 11338 + }, + { + "epoch": 2.851861167002012, + "grad_norm": 0.2756442725658417, + "learning_rate": 7.409530338001846e-08, + "loss": 0.3237, + "step": 11339 + }, + { + "epoch": 2.852112676056338, + "grad_norm": 0.29652678966522217, + "learning_rate": 7.384454061926727e-08, + "loss": 0.3038, + "step": 11340 + }, + { + "epoch": 2.8523641851106643, + "grad_norm": 0.266886830329895, + "learning_rate": 7.359419975291326e-08, + "loss": 0.3405, + "step": 11341 + }, + { + "epoch": 2.85261569416499, + "grad_norm": 0.284145325422287, + "learning_rate": 7.334428080239653e-08, + "loss": 0.3099, + "step": 11342 + }, + { + "epoch": 2.852867203219316, + "grad_norm": 0.2697547674179077, + "learning_rate": 7.309478378912105e-08, + "loss": 0.309, + "step": 11343 + }, + { + "epoch": 2.853118712273642, + "grad_norm": 0.29870113730430603, + "learning_rate": 7.28457087344553e-08, + "loss": 0.3209, + "step": 11344 + }, + { + "epoch": 2.853370221327968, + "grad_norm": 0.2862483561038971, + "learning_rate": 7.259705565972941e-08, + "loss": 0.3286, + "step": 11345 + }, + { + "epoch": 2.8536217303822937, + "grad_norm": 0.29490405321121216, + "learning_rate": 7.234882458624081e-08, + "loss": 0.3179, + "step": 11346 + }, + { + "epoch": 2.85387323943662, + "grad_norm": 0.2868093252182007, + "learning_rate": 7.210101553524751e-08, + "loss": 0.3436, + "step": 11347 + }, + { + "epoch": 2.8541247484909458, + "grad_norm": 0.2712860703468323, + "learning_rate": 7.185362852797417e-08, + "loss": 0.331, + "step": 11348 + }, + { + "epoch": 2.8543762575452716, + "grad_norm": 0.2829849421977997, + "learning_rate": 7.160666358560664e-08, + "loss": 0.32, + "step": 11349 + }, + { + "epoch": 2.854627766599598, + "grad_norm": 0.28862541913986206, + "learning_rate": 7.136012072929632e-08, + "loss": 0.3101, + "step": 11350 + }, + { + "epoch": 2.8548792756539236, + "grad_norm": 0.27747079730033875, + "learning_rate": 7.111399998015856e-08, + "loss": 0.3098, + "step": 11351 + }, + { + "epoch": 2.8551307847082494, + "grad_norm": 0.2933838963508606, + "learning_rate": 7.086830135927148e-08, + "loss": 0.3195, + "step": 11352 + }, + { + "epoch": 2.8553822937625757, + "grad_norm": 0.2843649685382843, + "learning_rate": 7.06230248876777e-08, + "loss": 0.3176, + "step": 11353 + }, + { + "epoch": 2.8556338028169015, + "grad_norm": 0.2857533395290375, + "learning_rate": 7.037817058638375e-08, + "loss": 0.2968, + "step": 11354 + }, + { + "epoch": 2.8558853118712273, + "grad_norm": 0.27660998702049255, + "learning_rate": 7.013373847636007e-08, + "loss": 0.3313, + "step": 11355 + }, + { + "epoch": 2.8561368209255535, + "grad_norm": 0.28614291548728943, + "learning_rate": 6.988972857853993e-08, + "loss": 0.3525, + "step": 11356 + }, + { + "epoch": 2.8563883299798793, + "grad_norm": 0.2815568447113037, + "learning_rate": 6.964614091382272e-08, + "loss": 0.318, + "step": 11357 + }, + { + "epoch": 2.856639839034205, + "grad_norm": 0.2796972393989563, + "learning_rate": 6.940297550306895e-08, + "loss": 0.3181, + "step": 11358 + }, + { + "epoch": 2.8568913480885314, + "grad_norm": 0.2727015018463135, + "learning_rate": 6.916023236710478e-08, + "loss": 0.339, + "step": 11359 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.2959345281124115, + "learning_rate": 6.891791152671912e-08, + "loss": 0.3253, + "step": 11360 + }, + { + "epoch": 2.857394366197183, + "grad_norm": 0.29117295145988464, + "learning_rate": 6.867601300266647e-08, + "loss": 0.3153, + "step": 11361 + }, + { + "epoch": 2.8576458752515093, + "grad_norm": 0.31557777523994446, + "learning_rate": 6.843453681566192e-08, + "loss": 0.3248, + "step": 11362 + }, + { + "epoch": 2.857897384305835, + "grad_norm": 0.28794246912002563, + "learning_rate": 6.819348298638839e-08, + "loss": 0.3368, + "step": 11363 + }, + { + "epoch": 2.858148893360161, + "grad_norm": 0.2903740108013153, + "learning_rate": 6.79528515354888e-08, + "loss": 0.3016, + "step": 11364 + }, + { + "epoch": 2.858400402414487, + "grad_norm": 0.2773711681365967, + "learning_rate": 6.771264248357389e-08, + "loss": 0.3324, + "step": 11365 + }, + { + "epoch": 2.858651911468813, + "grad_norm": 0.3121846318244934, + "learning_rate": 6.747285585121388e-08, + "loss": 0.3131, + "step": 11366 + }, + { + "epoch": 2.8589034205231387, + "grad_norm": 0.27057939767837524, + "learning_rate": 6.723349165894621e-08, + "loss": 0.3408, + "step": 11367 + }, + { + "epoch": 2.859154929577465, + "grad_norm": 0.3090129494667053, + "learning_rate": 6.699454992727061e-08, + "loss": 0.3223, + "step": 11368 + }, + { + "epoch": 2.859406438631791, + "grad_norm": 0.2833699882030487, + "learning_rate": 6.675603067665182e-08, + "loss": 0.3158, + "step": 11369 + }, + { + "epoch": 2.8596579476861166, + "grad_norm": 0.29948365688323975, + "learning_rate": 6.651793392751571e-08, + "loss": 0.325, + "step": 11370 + }, + { + "epoch": 2.859909456740443, + "grad_norm": 0.27591371536254883, + "learning_rate": 6.628025970025542e-08, + "loss": 0.3463, + "step": 11371 + }, + { + "epoch": 2.8601609657947686, + "grad_norm": 0.3021274209022522, + "learning_rate": 6.604300801522523e-08, + "loss": 0.2923, + "step": 11372 + }, + { + "epoch": 2.8604124748490944, + "grad_norm": 0.26336097717285156, + "learning_rate": 6.580617889274498e-08, + "loss": 0.3211, + "step": 11373 + }, + { + "epoch": 2.8606639839034207, + "grad_norm": 0.26855531334877014, + "learning_rate": 6.55697723530968e-08, + "loss": 0.3167, + "step": 11374 + }, + { + "epoch": 2.8609154929577465, + "grad_norm": 0.28841209411621094, + "learning_rate": 6.533378841652893e-08, + "loss": 0.3344, + "step": 11375 + }, + { + "epoch": 2.8611670020120723, + "grad_norm": 0.28485679626464844, + "learning_rate": 6.509822710324964e-08, + "loss": 0.3074, + "step": 11376 + }, + { + "epoch": 2.8614185110663986, + "grad_norm": 0.3013753294944763, + "learning_rate": 6.486308843343558e-08, + "loss": 0.3302, + "step": 11377 + }, + { + "epoch": 2.8616700201207244, + "grad_norm": 0.29902198910713196, + "learning_rate": 6.462837242722342e-08, + "loss": 0.341, + "step": 11378 + }, + { + "epoch": 2.86192152917505, + "grad_norm": 0.29432061314582825, + "learning_rate": 6.43940791047154e-08, + "loss": 0.3172, + "step": 11379 + }, + { + "epoch": 2.8621730382293764, + "grad_norm": 0.28921979665756226, + "learning_rate": 6.41602084859777e-08, + "loss": 0.3307, + "step": 11380 + }, + { + "epoch": 2.862424547283702, + "grad_norm": 0.3158515989780426, + "learning_rate": 6.392676059103986e-08, + "loss": 0.3208, + "step": 11381 + }, + { + "epoch": 2.862676056338028, + "grad_norm": 0.27476269006729126, + "learning_rate": 6.36937354398942e-08, + "loss": 0.3132, + "step": 11382 + }, + { + "epoch": 2.8629275653923543, + "grad_norm": 0.2913151681423187, + "learning_rate": 6.346113305249923e-08, + "loss": 0.2988, + "step": 11383 + }, + { + "epoch": 2.86317907444668, + "grad_norm": 0.3120485544204712, + "learning_rate": 6.322895344877566e-08, + "loss": 0.3129, + "step": 11384 + }, + { + "epoch": 2.863430583501006, + "grad_norm": 0.2824515402317047, + "learning_rate": 6.299719664860704e-08, + "loss": 0.301, + "step": 11385 + }, + { + "epoch": 2.863682092555332, + "grad_norm": 0.28883475065231323, + "learning_rate": 6.27658626718436e-08, + "loss": 0.3207, + "step": 11386 + }, + { + "epoch": 2.863933601609658, + "grad_norm": 0.3073241710662842, + "learning_rate": 6.253495153829614e-08, + "loss": 0.3431, + "step": 11387 + }, + { + "epoch": 2.8641851106639837, + "grad_norm": 0.2900136709213257, + "learning_rate": 6.230446326774165e-08, + "loss": 0.3236, + "step": 11388 + }, + { + "epoch": 2.86443661971831, + "grad_norm": 0.3047274351119995, + "learning_rate": 6.207439787991986e-08, + "loss": 0.3098, + "step": 11389 + }, + { + "epoch": 2.864688128772636, + "grad_norm": 0.2926124930381775, + "learning_rate": 6.184475539453394e-08, + "loss": 0.3156, + "step": 11390 + }, + { + "epoch": 2.8649396378269616, + "grad_norm": 0.290855348110199, + "learning_rate": 6.161553583125202e-08, + "loss": 0.3187, + "step": 11391 + }, + { + "epoch": 2.865191146881288, + "grad_norm": 0.30072471499443054, + "learning_rate": 6.138673920970505e-08, + "loss": 0.3089, + "step": 11392 + }, + { + "epoch": 2.8654426559356136, + "grad_norm": 0.2740122675895691, + "learning_rate": 6.115836554948795e-08, + "loss": 0.3369, + "step": 11393 + }, + { + "epoch": 2.8656941649899395, + "grad_norm": 0.2824988067150116, + "learning_rate": 6.09304148701595e-08, + "loss": 0.3327, + "step": 11394 + }, + { + "epoch": 2.8659456740442657, + "grad_norm": 0.28075578808784485, + "learning_rate": 6.07028871912424e-08, + "loss": 0.3295, + "step": 11395 + }, + { + "epoch": 2.8661971830985915, + "grad_norm": 0.32692331075668335, + "learning_rate": 6.047578253222274e-08, + "loss": 0.3303, + "step": 11396 + }, + { + "epoch": 2.8664486921529173, + "grad_norm": 0.27784934639930725, + "learning_rate": 6.024910091255054e-08, + "loss": 0.3183, + "step": 11397 + }, + { + "epoch": 2.8667002012072436, + "grad_norm": 0.30311983823776245, + "learning_rate": 6.002284235164024e-08, + "loss": 0.3179, + "step": 11398 + }, + { + "epoch": 2.8669517102615694, + "grad_norm": 0.2793339490890503, + "learning_rate": 5.979700686886914e-08, + "loss": 0.3189, + "step": 11399 + }, + { + "epoch": 2.867203219315895, + "grad_norm": 0.3019621968269348, + "learning_rate": 5.9571594483577855e-08, + "loss": 0.3127, + "step": 11400 + }, + { + "epoch": 2.8674547283702214, + "grad_norm": 0.28011688590049744, + "learning_rate": 5.9346605215073185e-08, + "loss": 0.3086, + "step": 11401 + }, + { + "epoch": 2.8677062374245472, + "grad_norm": 0.31461960077285767, + "learning_rate": 5.9122039082622486e-08, + "loss": 0.3178, + "step": 11402 + }, + { + "epoch": 2.867957746478873, + "grad_norm": 0.27287060022354126, + "learning_rate": 5.889789610545982e-08, + "loss": 0.3329, + "step": 11403 + }, + { + "epoch": 2.8682092555331993, + "grad_norm": 0.28381481766700745, + "learning_rate": 5.867417630277983e-08, + "loss": 0.3345, + "step": 11404 + }, + { + "epoch": 2.868460764587525, + "grad_norm": 0.29278185963630676, + "learning_rate": 5.845087969374497e-08, + "loss": 0.3284, + "step": 11405 + }, + { + "epoch": 2.868712273641851, + "grad_norm": 0.27680209279060364, + "learning_rate": 5.8228006297477156e-08, + "loss": 0.3217, + "step": 11406 + }, + { + "epoch": 2.868963782696177, + "grad_norm": 0.2894262373447418, + "learning_rate": 5.8005556133065575e-08, + "loss": 0.2931, + "step": 11407 + }, + { + "epoch": 2.869215291750503, + "grad_norm": 0.26620855927467346, + "learning_rate": 5.7783529219560544e-08, + "loss": 0.3274, + "step": 11408 + }, + { + "epoch": 2.8694668008048287, + "grad_norm": 0.2928333878517151, + "learning_rate": 5.756192557597795e-08, + "loss": 0.314, + "step": 11409 + }, + { + "epoch": 2.869718309859155, + "grad_norm": 0.2882349193096161, + "learning_rate": 5.734074522129707e-08, + "loss": 0.34, + "step": 11410 + }, + { + "epoch": 2.869969818913481, + "grad_norm": 0.2803259789943695, + "learning_rate": 5.711998817445996e-08, + "loss": 0.3263, + "step": 11411 + }, + { + "epoch": 2.8702213279678066, + "grad_norm": 0.3050937354564667, + "learning_rate": 5.689965445437318e-08, + "loss": 0.3102, + "step": 11412 + }, + { + "epoch": 2.870472837022133, + "grad_norm": 0.2841440439224243, + "learning_rate": 5.6679744079907176e-08, + "loss": 0.3356, + "step": 11413 + }, + { + "epoch": 2.8707243460764587, + "grad_norm": 0.28401076793670654, + "learning_rate": 5.646025706989577e-08, + "loss": 0.3222, + "step": 11414 + }, + { + "epoch": 2.8709758551307845, + "grad_norm": 0.3013293147087097, + "learning_rate": 5.624119344313672e-08, + "loss": 0.3261, + "step": 11415 + }, + { + "epoch": 2.8712273641851107, + "grad_norm": 0.2837771773338318, + "learning_rate": 5.6022553218391674e-08, + "loss": 0.313, + "step": 11416 + }, + { + "epoch": 2.8714788732394365, + "grad_norm": 0.28893938660621643, + "learning_rate": 5.580433641438454e-08, + "loss": 0.3093, + "step": 11417 + }, + { + "epoch": 2.8717303822937623, + "grad_norm": 0.2678500711917877, + "learning_rate": 5.558654304980593e-08, + "loss": 0.3037, + "step": 11418 + }, + { + "epoch": 2.8719818913480886, + "grad_norm": 0.29921528697013855, + "learning_rate": 5.536917314330759e-08, + "loss": 0.3199, + "step": 11419 + }, + { + "epoch": 2.8722334004024144, + "grad_norm": 0.2949788570404053, + "learning_rate": 5.5152226713506285e-08, + "loss": 0.311, + "step": 11420 + }, + { + "epoch": 2.87248490945674, + "grad_norm": 0.31055039167404175, + "learning_rate": 5.493570377898161e-08, + "loss": 0.3221, + "step": 11421 + }, + { + "epoch": 2.8727364185110664, + "grad_norm": 0.28630343079566956, + "learning_rate": 5.4719604358277615e-08, + "loss": 0.33, + "step": 11422 + }, + { + "epoch": 2.8729879275653922, + "grad_norm": 0.30379021167755127, + "learning_rate": 5.450392846990227e-08, + "loss": 0.3115, + "step": 11423 + }, + { + "epoch": 2.873239436619718, + "grad_norm": 0.2785795032978058, + "learning_rate": 5.428867613232636e-08, + "loss": 0.2873, + "step": 11424 + }, + { + "epoch": 2.8734909456740443, + "grad_norm": 0.271603524684906, + "learning_rate": 5.407384736398513e-08, + "loss": 0.3243, + "step": 11425 + }, + { + "epoch": 2.87374245472837, + "grad_norm": 0.2870146930217743, + "learning_rate": 5.385944218327721e-08, + "loss": 0.3185, + "step": 11426 + }, + { + "epoch": 2.8739939637826963, + "grad_norm": 0.2926969528198242, + "learning_rate": 5.3645460608565124e-08, + "loss": 0.301, + "step": 11427 + }, + { + "epoch": 2.874245472837022, + "grad_norm": 0.2959466278553009, + "learning_rate": 5.3431902658174776e-08, + "loss": 0.3264, + "step": 11428 + }, + { + "epoch": 2.874496981891348, + "grad_norm": 0.3116055130958557, + "learning_rate": 5.321876835039652e-08, + "loss": 0.2992, + "step": 11429 + }, + { + "epoch": 2.874748490945674, + "grad_norm": 0.28724607825279236, + "learning_rate": 5.30060577034841e-08, + "loss": 0.3454, + "step": 11430 + }, + { + "epoch": 2.875, + "grad_norm": 0.29540908336639404, + "learning_rate": 5.279377073565406e-08, + "loss": 0.2933, + "step": 11431 + }, + { + "epoch": 2.875251509054326, + "grad_norm": 0.3000968098640442, + "learning_rate": 5.258190746508796e-08, + "loss": 0.3071, + "step": 11432 + }, + { + "epoch": 2.875503018108652, + "grad_norm": 0.29470768570899963, + "learning_rate": 5.237046790993072e-08, + "loss": 0.324, + "step": 11433 + }, + { + "epoch": 2.875754527162978, + "grad_norm": 0.29645785689353943, + "learning_rate": 5.2159452088290654e-08, + "loss": 0.3147, + "step": 11434 + }, + { + "epoch": 2.8760060362173037, + "grad_norm": 0.27087098360061646, + "learning_rate": 5.1948860018239954e-08, + "loss": 0.3253, + "step": 11435 + }, + { + "epoch": 2.87625754527163, + "grad_norm": 0.30059030652046204, + "learning_rate": 5.17386917178142e-08, + "loss": 0.3133, + "step": 11436 + }, + { + "epoch": 2.8765090543259557, + "grad_norm": 0.27557048201560974, + "learning_rate": 5.152894720501345e-08, + "loss": 0.312, + "step": 11437 + }, + { + "epoch": 2.876760563380282, + "grad_norm": 0.2941986322402954, + "learning_rate": 5.131962649780109e-08, + "loss": 0.3058, + "step": 11438 + }, + { + "epoch": 2.8770120724346078, + "grad_norm": 0.2895359694957733, + "learning_rate": 5.111072961410335e-08, + "loss": 0.3238, + "step": 11439 + }, + { + "epoch": 2.8772635814889336, + "grad_norm": 0.3026026785373688, + "learning_rate": 5.090225657181203e-08, + "loss": 0.3418, + "step": 11440 + }, + { + "epoch": 2.87751509054326, + "grad_norm": 0.29785895347595215, + "learning_rate": 5.069420738878061e-08, + "loss": 0.3185, + "step": 11441 + }, + { + "epoch": 2.8777665995975856, + "grad_norm": 0.3039131164550781, + "learning_rate": 5.048658208282764e-08, + "loss": 0.3096, + "step": 11442 + }, + { + "epoch": 2.8780181086519114, + "grad_norm": 0.29077792167663574, + "learning_rate": 5.027938067173499e-08, + "loss": 0.3351, + "step": 11443 + }, + { + "epoch": 2.8782696177062377, + "grad_norm": 0.2855316698551178, + "learning_rate": 5.0072603173247914e-08, + "loss": 0.3141, + "step": 11444 + }, + { + "epoch": 2.8785211267605635, + "grad_norm": 0.2709047496318817, + "learning_rate": 4.986624960507558e-08, + "loss": 0.3076, + "step": 11445 + }, + { + "epoch": 2.8787726358148893, + "grad_norm": 0.27553707361221313, + "learning_rate": 4.966031998489107e-08, + "loss": 0.3167, + "step": 11446 + }, + { + "epoch": 2.8790241448692155, + "grad_norm": 0.29868531227111816, + "learning_rate": 4.945481433033139e-08, + "loss": 0.3118, + "step": 11447 + }, + { + "epoch": 2.8792756539235413, + "grad_norm": 0.28663668036460876, + "learning_rate": 4.92497326589958e-08, + "loss": 0.3408, + "step": 11448 + }, + { + "epoch": 2.879527162977867, + "grad_norm": 0.2877984046936035, + "learning_rate": 4.9045074988449125e-08, + "loss": 0.2728, + "step": 11449 + }, + { + "epoch": 2.8797786720321934, + "grad_norm": 0.2943393290042877, + "learning_rate": 4.884084133621847e-08, + "loss": 0.3304, + "step": 11450 + }, + { + "epoch": 2.880030181086519, + "grad_norm": 0.28473910689353943, + "learning_rate": 4.863703171979539e-08, + "loss": 0.3062, + "step": 11451 + }, + { + "epoch": 2.880281690140845, + "grad_norm": 0.2870289981365204, + "learning_rate": 4.843364615663537e-08, + "loss": 0.3047, + "step": 11452 + }, + { + "epoch": 2.8805331991951713, + "grad_norm": 0.2811976671218872, + "learning_rate": 4.823068466415615e-08, + "loss": 0.3187, + "step": 11453 + }, + { + "epoch": 2.880784708249497, + "grad_norm": 0.2888941466808319, + "learning_rate": 4.802814725974048e-08, + "loss": 0.3171, + "step": 11454 + }, + { + "epoch": 2.881036217303823, + "grad_norm": 0.2838338613510132, + "learning_rate": 4.782603396073504e-08, + "loss": 0.3013, + "step": 11455 + }, + { + "epoch": 2.881287726358149, + "grad_norm": 0.29402175545692444, + "learning_rate": 4.7624344784448774e-08, + "loss": 0.3276, + "step": 11456 + }, + { + "epoch": 2.881539235412475, + "grad_norm": 0.28359174728393555, + "learning_rate": 4.742307974815563e-08, + "loss": 0.3127, + "step": 11457 + }, + { + "epoch": 2.8817907444668007, + "grad_norm": 0.3000732660293579, + "learning_rate": 4.7222238869092386e-08, + "loss": 0.3278, + "step": 11458 + }, + { + "epoch": 2.882042253521127, + "grad_norm": 0.2840462327003479, + "learning_rate": 4.702182216445972e-08, + "loss": 0.3335, + "step": 11459 + }, + { + "epoch": 2.8822937625754528, + "grad_norm": 0.3007800281047821, + "learning_rate": 4.682182965142279e-08, + "loss": 0.3129, + "step": 11460 + }, + { + "epoch": 2.8825452716297786, + "grad_norm": 0.291432648897171, + "learning_rate": 4.6622261347108456e-08, + "loss": 0.3206, + "step": 11461 + }, + { + "epoch": 2.882796780684105, + "grad_norm": 0.2831004858016968, + "learning_rate": 4.6423117268609704e-08, + "loss": 0.3142, + "step": 11462 + }, + { + "epoch": 2.8830482897384306, + "grad_norm": 0.29433655738830566, + "learning_rate": 4.622439743298124e-08, + "loss": 0.3303, + "step": 11463 + }, + { + "epoch": 2.8832997987927564, + "grad_norm": 0.28008735179901123, + "learning_rate": 4.6026101857242765e-08, + "loss": 0.3134, + "step": 11464 + }, + { + "epoch": 2.8835513078470827, + "grad_norm": 0.26906129717826843, + "learning_rate": 4.582823055837626e-08, + "loss": 0.3519, + "step": 11465 + }, + { + "epoch": 2.8838028169014085, + "grad_norm": 0.27299466729164124, + "learning_rate": 4.563078355332873e-08, + "loss": 0.3303, + "step": 11466 + }, + { + "epoch": 2.8840543259557343, + "grad_norm": 0.29059502482414246, + "learning_rate": 4.543376085901052e-08, + "loss": 0.3287, + "step": 11467 + }, + { + "epoch": 2.8843058350100605, + "grad_norm": 0.28941747546195984, + "learning_rate": 4.523716249229426e-08, + "loss": 0.3179, + "step": 11468 + }, + { + "epoch": 2.8845573440643864, + "grad_norm": 0.2766706645488739, + "learning_rate": 4.504098847001925e-08, + "loss": 0.307, + "step": 11469 + }, + { + "epoch": 2.884808853118712, + "grad_norm": 0.28143224120140076, + "learning_rate": 4.484523880898428e-08, + "loss": 0.3123, + "step": 11470 + }, + { + "epoch": 2.8850603621730384, + "grad_norm": 0.30837175250053406, + "learning_rate": 4.464991352595593e-08, + "loss": 0.2922, + "step": 11471 + }, + { + "epoch": 2.885311871227364, + "grad_norm": 0.26661986112594604, + "learning_rate": 4.445501263766194e-08, + "loss": 0.3325, + "step": 11472 + }, + { + "epoch": 2.88556338028169, + "grad_norm": 0.2966688275337219, + "learning_rate": 4.426053616079395e-08, + "loss": 0.3129, + "step": 11473 + }, + { + "epoch": 2.8858148893360163, + "grad_norm": 0.2732764184474945, + "learning_rate": 4.406648411200809e-08, + "loss": 0.2998, + "step": 11474 + }, + { + "epoch": 2.886066398390342, + "grad_norm": 0.3051884174346924, + "learning_rate": 4.3872856507923835e-08, + "loss": 0.3071, + "step": 11475 + }, + { + "epoch": 2.886317907444668, + "grad_norm": 0.2977435290813446, + "learning_rate": 4.367965336512403e-08, + "loss": 0.3247, + "step": 11476 + }, + { + "epoch": 2.886569416498994, + "grad_norm": 0.27795737981796265, + "learning_rate": 4.348687470015489e-08, + "loss": 0.3144, + "step": 11477 + }, + { + "epoch": 2.88682092555332, + "grad_norm": 0.29412925243377686, + "learning_rate": 4.329452052952765e-08, + "loss": 0.3354, + "step": 11478 + }, + { + "epoch": 2.8870724346076457, + "grad_norm": 0.30797427892684937, + "learning_rate": 4.3102590869715246e-08, + "loss": 0.3209, + "step": 11479 + }, + { + "epoch": 2.887323943661972, + "grad_norm": 0.3010835349559784, + "learning_rate": 4.291108573715563e-08, + "loss": 0.3181, + "step": 11480 + }, + { + "epoch": 2.887575452716298, + "grad_norm": 0.27650344371795654, + "learning_rate": 4.272000514825014e-08, + "loss": 0.3232, + "step": 11481 + }, + { + "epoch": 2.8878269617706236, + "grad_norm": 0.284249871969223, + "learning_rate": 4.2529349119364014e-08, + "loss": 0.3218, + "step": 11482 + }, + { + "epoch": 2.88807847082495, + "grad_norm": 0.27574625611305237, + "learning_rate": 4.233911766682475e-08, + "loss": 0.3078, + "step": 11483 + }, + { + "epoch": 2.8883299798792756, + "grad_norm": 0.31248676776885986, + "learning_rate": 4.214931080692486e-08, + "loss": 0.3008, + "step": 11484 + }, + { + "epoch": 2.8885814889336014, + "grad_norm": 0.2914111316204071, + "learning_rate": 4.195992855592079e-08, + "loss": 0.3229, + "step": 11485 + }, + { + "epoch": 2.8888329979879277, + "grad_norm": 0.31353798508644104, + "learning_rate": 4.177097093003124e-08, + "loss": 0.3246, + "step": 11486 + }, + { + "epoch": 2.8890845070422535, + "grad_norm": 0.2765747308731079, + "learning_rate": 4.158243794543992e-08, + "loss": 0.3021, + "step": 11487 + }, + { + "epoch": 2.8893360160965793, + "grad_norm": 0.2932724058628082, + "learning_rate": 4.1394329618292265e-08, + "loss": 0.3156, + "step": 11488 + }, + { + "epoch": 2.8895875251509056, + "grad_norm": 0.2794465720653534, + "learning_rate": 4.120664596469981e-08, + "loss": 0.3164, + "step": 11489 + }, + { + "epoch": 2.8898390342052314, + "grad_norm": 0.2837032973766327, + "learning_rate": 4.101938700073582e-08, + "loss": 0.3202, + "step": 11490 + }, + { + "epoch": 2.890090543259557, + "grad_norm": 0.27074161171913147, + "learning_rate": 4.083255274243858e-08, + "loss": 0.3186, + "step": 11491 + }, + { + "epoch": 2.8903420523138834, + "grad_norm": 0.290571391582489, + "learning_rate": 4.0646143205808063e-08, + "loss": 0.3222, + "step": 11492 + }, + { + "epoch": 2.890593561368209, + "grad_norm": 0.2796350419521332, + "learning_rate": 4.046015840680984e-08, + "loss": 0.3166, + "step": 11493 + }, + { + "epoch": 2.890845070422535, + "grad_norm": 0.30049675703048706, + "learning_rate": 4.0274598361372266e-08, + "loss": 0.3055, + "step": 11494 + }, + { + "epoch": 2.8910965794768613, + "grad_norm": 0.3056521415710449, + "learning_rate": 4.008946308538764e-08, + "loss": 0.3097, + "step": 11495 + }, + { + "epoch": 2.891348088531187, + "grad_norm": 0.2760073244571686, + "learning_rate": 3.990475259471105e-08, + "loss": 0.3126, + "step": 11496 + }, + { + "epoch": 2.891599597585513, + "grad_norm": 0.2556244134902954, + "learning_rate": 3.9720466905162625e-08, + "loss": 0.3084, + "step": 11497 + }, + { + "epoch": 2.891851106639839, + "grad_norm": 0.2833138704299927, + "learning_rate": 3.953660603252474e-08, + "loss": 0.3218, + "step": 11498 + }, + { + "epoch": 2.892102615694165, + "grad_norm": 0.27805668115615845, + "learning_rate": 3.9353169992543684e-08, + "loss": 0.3332, + "step": 11499 + }, + { + "epoch": 2.8923541247484907, + "grad_norm": 0.30059728026390076, + "learning_rate": 3.917015880092967e-08, + "loss": 0.3212, + "step": 11500 + }, + { + "epoch": 2.892605633802817, + "grad_norm": 0.27901148796081543, + "learning_rate": 3.898757247335738e-08, + "loss": 0.3197, + "step": 11501 + }, + { + "epoch": 2.892857142857143, + "grad_norm": 0.29673993587493896, + "learning_rate": 3.8805411025463204e-08, + "loss": 0.3502, + "step": 11502 + }, + { + "epoch": 2.8931086519114686, + "grad_norm": 0.2697283923625946, + "learning_rate": 3.862367447284854e-08, + "loss": 0.2868, + "step": 11503 + }, + { + "epoch": 2.893360160965795, + "grad_norm": 0.27969419956207275, + "learning_rate": 3.8442362831077603e-08, + "loss": 0.3168, + "step": 11504 + }, + { + "epoch": 2.8936116700201207, + "grad_norm": 0.29848575592041016, + "learning_rate": 3.82614761156791e-08, + "loss": 0.3243, + "step": 11505 + }, + { + "epoch": 2.8938631790744465, + "grad_norm": 0.2888948619365692, + "learning_rate": 3.8081014342144506e-08, + "loss": 0.3113, + "step": 11506 + }, + { + "epoch": 2.8941146881287727, + "grad_norm": 0.2911056876182556, + "learning_rate": 3.790097752592925e-08, + "loss": 0.3319, + "step": 11507 + }, + { + "epoch": 2.8943661971830985, + "grad_norm": 0.2891317903995514, + "learning_rate": 3.772136568245266e-08, + "loss": 0.3201, + "step": 11508 + }, + { + "epoch": 2.8946177062374243, + "grad_norm": 0.3018838167190552, + "learning_rate": 3.754217882709743e-08, + "loss": 0.2983, + "step": 11509 + }, + { + "epoch": 2.8948692152917506, + "grad_norm": 0.30178946256637573, + "learning_rate": 3.7363416975209065e-08, + "loss": 0.3322, + "step": 11510 + }, + { + "epoch": 2.8951207243460764, + "grad_norm": 0.28581055998802185, + "learning_rate": 3.718508014209809e-08, + "loss": 0.3176, + "step": 11511 + }, + { + "epoch": 2.895372233400402, + "grad_norm": 0.3123716711997986, + "learning_rate": 3.700716834303786e-08, + "loss": 0.328, + "step": 11512 + }, + { + "epoch": 2.8956237424547284, + "grad_norm": 0.2836924195289612, + "learning_rate": 3.682968159326505e-08, + "loss": 0.3073, + "step": 11513 + }, + { + "epoch": 2.8958752515090542, + "grad_norm": 0.2993314266204834, + "learning_rate": 3.665261990798086e-08, + "loss": 0.327, + "step": 11514 + }, + { + "epoch": 2.89612676056338, + "grad_norm": 0.2848893105983734, + "learning_rate": 3.6475983302348695e-08, + "loss": 0.3163, + "step": 11515 + }, + { + "epoch": 2.8963782696177063, + "grad_norm": 0.30268558859825134, + "learning_rate": 3.629977179149702e-08, + "loss": 0.3199, + "step": 11516 + }, + { + "epoch": 2.896629778672032, + "grad_norm": 0.28077784180641174, + "learning_rate": 3.6123985390517094e-08, + "loss": 0.3212, + "step": 11517 + }, + { + "epoch": 2.896881287726358, + "grad_norm": 0.31762558221817017, + "learning_rate": 3.5948624114464094e-08, + "loss": 0.3415, + "step": 11518 + }, + { + "epoch": 2.897132796780684, + "grad_norm": 0.267600953578949, + "learning_rate": 3.577368797835601e-08, + "loss": 0.3174, + "step": 11519 + }, + { + "epoch": 2.89738430583501, + "grad_norm": 0.29275646805763245, + "learning_rate": 3.5599176997175853e-08, + "loss": 0.308, + "step": 11520 + }, + { + "epoch": 2.8976358148893357, + "grad_norm": 0.29874688386917114, + "learning_rate": 3.54250911858689e-08, + "loss": 0.2987, + "step": 11521 + }, + { + "epoch": 2.897887323943662, + "grad_norm": 0.29817038774490356, + "learning_rate": 3.525143055934488e-08, + "loss": 0.3041, + "step": 11522 + }, + { + "epoch": 2.898138832997988, + "grad_norm": 0.28878530859947205, + "learning_rate": 3.507819513247579e-08, + "loss": 0.3076, + "step": 11523 + }, + { + "epoch": 2.8983903420523136, + "grad_norm": 0.2917855679988861, + "learning_rate": 3.4905384920099204e-08, + "loss": 0.3071, + "step": 11524 + }, + { + "epoch": 2.89864185110664, + "grad_norm": 0.28223884105682373, + "learning_rate": 3.473299993701496e-08, + "loss": 0.3096, + "step": 11525 + }, + { + "epoch": 2.8988933601609657, + "grad_norm": 0.2846827805042267, + "learning_rate": 3.4561040197986785e-08, + "loss": 0.3293, + "step": 11526 + }, + { + "epoch": 2.899144869215292, + "grad_norm": 0.31608250737190247, + "learning_rate": 3.4389505717741246e-08, + "loss": 0.3281, + "step": 11527 + }, + { + "epoch": 2.8993963782696177, + "grad_norm": 0.29066160321235657, + "learning_rate": 3.421839651096992e-08, + "loss": 0.3134, + "step": 11528 + }, + { + "epoch": 2.8996478873239435, + "grad_norm": 0.28401198983192444, + "learning_rate": 3.4047712592327753e-08, + "loss": 0.3228, + "step": 11529 + }, + { + "epoch": 2.8998993963782698, + "grad_norm": 0.27312031388282776, + "learning_rate": 3.3877453976431386e-08, + "loss": 0.3142, + "step": 11530 + }, + { + "epoch": 2.9001509054325956, + "grad_norm": 0.27587342262268066, + "learning_rate": 3.37076206778636e-08, + "loss": 0.3202, + "step": 11531 + }, + { + "epoch": 2.9004024144869214, + "grad_norm": 0.2901318073272705, + "learning_rate": 3.353821271116886e-08, + "loss": 0.3086, + "step": 11532 + }, + { + "epoch": 2.9006539235412476, + "grad_norm": 0.28231751918792725, + "learning_rate": 3.336923009085613e-08, + "loss": 0.3257, + "step": 11533 + }, + { + "epoch": 2.9009054325955734, + "grad_norm": 0.27121472358703613, + "learning_rate": 3.320067283139772e-08, + "loss": 0.3256, + "step": 11534 + }, + { + "epoch": 2.9011569416498992, + "grad_norm": 0.2887016832828522, + "learning_rate": 3.3032540947229296e-08, + "loss": 0.3162, + "step": 11535 + }, + { + "epoch": 2.9014084507042255, + "grad_norm": 0.2676198184490204, + "learning_rate": 3.286483445275046e-08, + "loss": 0.3181, + "step": 11536 + }, + { + "epoch": 2.9016599597585513, + "grad_norm": 0.2852703630924225, + "learning_rate": 3.269755336232472e-08, + "loss": 0.2963, + "step": 11537 + }, + { + "epoch": 2.9019114688128775, + "grad_norm": 0.2976604402065277, + "learning_rate": 3.2530697690277835e-08, + "loss": 0.348, + "step": 11538 + }, + { + "epoch": 2.9021629778672033, + "grad_norm": 0.27679896354675293, + "learning_rate": 3.236426745090004e-08, + "loss": 0.3239, + "step": 11539 + }, + { + "epoch": 2.902414486921529, + "grad_norm": 0.2862226068973541, + "learning_rate": 3.219826265844606e-08, + "loss": 0.3037, + "step": 11540 + }, + { + "epoch": 2.9026659959758554, + "grad_norm": 0.27300092577934265, + "learning_rate": 3.203268332713172e-08, + "loss": 0.328, + "step": 11541 + }, + { + "epoch": 2.902917505030181, + "grad_norm": 0.2864827513694763, + "learning_rate": 3.1867529471139025e-08, + "loss": 0.3227, + "step": 11542 + }, + { + "epoch": 2.903169014084507, + "grad_norm": 0.2821025848388672, + "learning_rate": 3.1702801104611655e-08, + "loss": 0.312, + "step": 11543 + }, + { + "epoch": 2.9034205231388333, + "grad_norm": 0.27062085270881653, + "learning_rate": 3.153849824165778e-08, + "loss": 0.2998, + "step": 11544 + }, + { + "epoch": 2.903672032193159, + "grad_norm": 0.2897660434246063, + "learning_rate": 3.1374620896348905e-08, + "loss": 0.3393, + "step": 11545 + }, + { + "epoch": 2.903923541247485, + "grad_norm": 0.3015936017036438, + "learning_rate": 3.121116908272048e-08, + "loss": 0.315, + "step": 11546 + }, + { + "epoch": 2.904175050301811, + "grad_norm": 0.28271710872650146, + "learning_rate": 3.10481428147702e-08, + "loss": 0.3177, + "step": 11547 + }, + { + "epoch": 2.904426559356137, + "grad_norm": 0.2824450135231018, + "learning_rate": 3.088554210646133e-08, + "loss": 0.3422, + "step": 11548 + }, + { + "epoch": 2.9046780684104627, + "grad_norm": 0.289212703704834, + "learning_rate": 3.072336697171885e-08, + "loss": 0.3493, + "step": 11549 + }, + { + "epoch": 2.904929577464789, + "grad_norm": 0.29587748646736145, + "learning_rate": 3.05616174244322e-08, + "loss": 0.2915, + "step": 11550 + }, + { + "epoch": 2.9051810865191148, + "grad_norm": 0.26594677567481995, + "learning_rate": 3.040029347845419e-08, + "loss": 0.3406, + "step": 11551 + }, + { + "epoch": 2.9054325955734406, + "grad_norm": 0.28409308195114136, + "learning_rate": 3.0239395147601547e-08, + "loss": 0.3331, + "step": 11552 + }, + { + "epoch": 2.905684104627767, + "grad_norm": 0.29760369658470154, + "learning_rate": 3.007892244565436e-08, + "loss": 0.3406, + "step": 11553 + }, + { + "epoch": 2.9059356136820926, + "grad_norm": 0.276475191116333, + "learning_rate": 2.991887538635496e-08, + "loss": 0.3105, + "step": 11554 + }, + { + "epoch": 2.9061871227364184, + "grad_norm": 0.29626041650772095, + "learning_rate": 2.975925398341184e-08, + "loss": 0.3304, + "step": 11555 + }, + { + "epoch": 2.9064386317907447, + "grad_norm": 0.29270699620246887, + "learning_rate": 2.960005825049461e-08, + "loss": 0.342, + "step": 11556 + }, + { + "epoch": 2.9066901408450705, + "grad_norm": 0.2765859365463257, + "learning_rate": 2.944128820123737e-08, + "loss": 0.326, + "step": 11557 + }, + { + "epoch": 2.9069416498993963, + "grad_norm": 0.2837342917919159, + "learning_rate": 2.9282943849238687e-08, + "loss": 0.3028, + "step": 11558 + }, + { + "epoch": 2.9071931589537225, + "grad_norm": 0.3145996630191803, + "learning_rate": 2.9125025208058823e-08, + "loss": 0.3317, + "step": 11559 + }, + { + "epoch": 2.9074446680080483, + "grad_norm": 0.28416410088539124, + "learning_rate": 2.8967532291222512e-08, + "loss": 0.3253, + "step": 11560 + }, + { + "epoch": 2.907696177062374, + "grad_norm": 0.2724005877971649, + "learning_rate": 2.8810465112218965e-08, + "loss": 0.3177, + "step": 11561 + }, + { + "epoch": 2.9079476861167004, + "grad_norm": 0.2829074561595917, + "learning_rate": 2.8653823684499093e-08, + "loss": 0.3286, + "step": 11562 + }, + { + "epoch": 2.908199195171026, + "grad_norm": 0.2702285349369049, + "learning_rate": 2.849760802147883e-08, + "loss": 0.3161, + "step": 11563 + }, + { + "epoch": 2.908450704225352, + "grad_norm": 0.2694869637489319, + "learning_rate": 2.8341818136536915e-08, + "loss": 0.3177, + "step": 11564 + }, + { + "epoch": 2.9087022132796783, + "grad_norm": 0.302658349275589, + "learning_rate": 2.8186454043014898e-08, + "loss": 0.3182, + "step": 11565 + }, + { + "epoch": 2.908953722334004, + "grad_norm": 0.2830633521080017, + "learning_rate": 2.8031515754220473e-08, + "loss": 0.3262, + "step": 11566 + }, + { + "epoch": 2.90920523138833, + "grad_norm": 0.25605013966560364, + "learning_rate": 2.7877003283421356e-08, + "loss": 0.3138, + "step": 11567 + }, + { + "epoch": 2.909456740442656, + "grad_norm": 0.30849385261535645, + "learning_rate": 2.772291664385196e-08, + "loss": 0.3408, + "step": 11568 + }, + { + "epoch": 2.909708249496982, + "grad_norm": 0.2905694246292114, + "learning_rate": 2.7569255848708397e-08, + "loss": 0.332, + "step": 11569 + }, + { + "epoch": 2.9099597585513077, + "grad_norm": 0.31673821806907654, + "learning_rate": 2.7416020911150144e-08, + "loss": 0.3283, + "step": 11570 + }, + { + "epoch": 2.910211267605634, + "grad_norm": 0.28510385751724243, + "learning_rate": 2.7263211844301695e-08, + "loss": 0.3045, + "step": 11571 + }, + { + "epoch": 2.91046277665996, + "grad_norm": 0.2822716534137726, + "learning_rate": 2.7110828661249255e-08, + "loss": 0.3522, + "step": 11572 + }, + { + "epoch": 2.9107142857142856, + "grad_norm": 0.3001321256160736, + "learning_rate": 2.6958871375044605e-08, + "loss": 0.3117, + "step": 11573 + }, + { + "epoch": 2.910965794768612, + "grad_norm": 0.2811189889907837, + "learning_rate": 2.680733999870122e-08, + "loss": 0.3131, + "step": 11574 + }, + { + "epoch": 2.9112173038229376, + "grad_norm": 0.3301267921924591, + "learning_rate": 2.6656234545197057e-08, + "loss": 0.3362, + "step": 11575 + }, + { + "epoch": 2.9114688128772634, + "grad_norm": 0.2918173372745514, + "learning_rate": 2.6505555027472875e-08, + "loss": 0.3034, + "step": 11576 + }, + { + "epoch": 2.9117203219315897, + "grad_norm": 0.29059723019599915, + "learning_rate": 2.6355301458434457e-08, + "loss": 0.3262, + "step": 11577 + }, + { + "epoch": 2.9119718309859155, + "grad_norm": 0.29665839672088623, + "learning_rate": 2.620547385094929e-08, + "loss": 0.3288, + "step": 11578 + }, + { + "epoch": 2.9122233400402413, + "grad_norm": 0.275056391954422, + "learning_rate": 2.6056072217848783e-08, + "loss": 0.3151, + "step": 11579 + }, + { + "epoch": 2.9124748490945676, + "grad_norm": 0.2742392122745514, + "learning_rate": 2.5907096571929357e-08, + "loss": 0.3128, + "step": 11580 + }, + { + "epoch": 2.9127263581488934, + "grad_norm": 0.2982200086116791, + "learning_rate": 2.5758546925949148e-08, + "loss": 0.3303, + "step": 11581 + }, + { + "epoch": 2.912977867203219, + "grad_norm": 0.284332275390625, + "learning_rate": 2.5610423292630195e-08, + "loss": 0.335, + "step": 11582 + }, + { + "epoch": 2.9132293762575454, + "grad_norm": 0.289644718170166, + "learning_rate": 2.5462725684659573e-08, + "loss": 0.3368, + "step": 11583 + }, + { + "epoch": 2.913480885311871, + "grad_norm": 0.27438437938690186, + "learning_rate": 2.53154541146855e-08, + "loss": 0.3279, + "step": 11584 + }, + { + "epoch": 2.913732394366197, + "grad_norm": 0.2902214527130127, + "learning_rate": 2.516860859532122e-08, + "loss": 0.3175, + "step": 11585 + }, + { + "epoch": 2.9139839034205233, + "grad_norm": 0.28719666600227356, + "learning_rate": 2.5022189139143338e-08, + "loss": 0.3358, + "step": 11586 + }, + { + "epoch": 2.914235412474849, + "grad_norm": 0.298453688621521, + "learning_rate": 2.487619575869127e-08, + "loss": 0.3301, + "step": 11587 + }, + { + "epoch": 2.914486921529175, + "grad_norm": 0.2861069142818451, + "learning_rate": 2.4730628466468898e-08, + "loss": 0.288, + "step": 11588 + }, + { + "epoch": 2.914738430583501, + "grad_norm": 0.3006594479084015, + "learning_rate": 2.4585487274942922e-08, + "loss": 0.3185, + "step": 11589 + }, + { + "epoch": 2.914989939637827, + "grad_norm": 0.2652274966239929, + "learning_rate": 2.444077219654395e-08, + "loss": 0.3174, + "step": 11590 + }, + { + "epoch": 2.9152414486921527, + "grad_norm": 0.29787126183509827, + "learning_rate": 2.4296483243665958e-08, + "loss": 0.3119, + "step": 11591 + }, + { + "epoch": 2.915492957746479, + "grad_norm": 0.26815494894981384, + "learning_rate": 2.4152620428666284e-08, + "loss": 0.3128, + "step": 11592 + }, + { + "epoch": 2.915744466800805, + "grad_norm": 0.28812792897224426, + "learning_rate": 2.400918376386563e-08, + "loss": 0.3161, + "step": 11593 + }, + { + "epoch": 2.9159959758551306, + "grad_norm": 0.27825263142585754, + "learning_rate": 2.386617326154861e-08, + "loss": 0.2885, + "step": 11594 + }, + { + "epoch": 2.916247484909457, + "grad_norm": 0.30951130390167236, + "learning_rate": 2.372358893396376e-08, + "loss": 0.3131, + "step": 11595 + }, + { + "epoch": 2.9164989939637826, + "grad_norm": 0.3053039610385895, + "learning_rate": 2.358143079332187e-08, + "loss": 0.3242, + "step": 11596 + }, + { + "epoch": 2.9167505030181085, + "grad_norm": 0.2770836651325226, + "learning_rate": 2.3439698851797643e-08, + "loss": 0.33, + "step": 11597 + }, + { + "epoch": 2.9170020120724347, + "grad_norm": 0.29645803570747375, + "learning_rate": 2.329839312153026e-08, + "loss": 0.3375, + "step": 11598 + }, + { + "epoch": 2.9172535211267605, + "grad_norm": 0.26674768328666687, + "learning_rate": 2.3157513614621706e-08, + "loss": 0.3247, + "step": 11599 + }, + { + "epoch": 2.9175050301810863, + "grad_norm": 0.2916862666606903, + "learning_rate": 2.3017060343136223e-08, + "loss": 0.322, + "step": 11600 + }, + { + "epoch": 2.9177565392354126, + "grad_norm": 0.2905628979206085, + "learning_rate": 2.287703331910418e-08, + "loss": 0.2903, + "step": 11601 + }, + { + "epoch": 2.9180080482897384, + "grad_norm": 0.28413960337638855, + "learning_rate": 2.273743255451766e-08, + "loss": 0.3085, + "step": 11602 + }, + { + "epoch": 2.918259557344064, + "grad_norm": 0.290414035320282, + "learning_rate": 2.2598258061331536e-08, + "loss": 0.3394, + "step": 11603 + }, + { + "epoch": 2.9185110663983904, + "grad_norm": 0.2848317623138428, + "learning_rate": 2.2459509851466833e-08, + "loss": 0.3058, + "step": 11604 + }, + { + "epoch": 2.9187625754527162, + "grad_norm": 0.2922884225845337, + "learning_rate": 2.232118793680571e-08, + "loss": 0.3092, + "step": 11605 + }, + { + "epoch": 2.919014084507042, + "grad_norm": 0.2839473485946655, + "learning_rate": 2.218329232919425e-08, + "loss": 0.3318, + "step": 11606 + }, + { + "epoch": 2.9192655935613683, + "grad_norm": 0.28309670090675354, + "learning_rate": 2.2045823040443005e-08, + "loss": 0.31, + "step": 11607 + }, + { + "epoch": 2.919517102615694, + "grad_norm": 0.2878527045249939, + "learning_rate": 2.1908780082324777e-08, + "loss": 0.3089, + "step": 11608 + }, + { + "epoch": 2.91976861167002, + "grad_norm": 0.3030252158641815, + "learning_rate": 2.177216346657629e-08, + "loss": 0.2988, + "step": 11609 + }, + { + "epoch": 2.920020120724346, + "grad_norm": 0.2812774181365967, + "learning_rate": 2.1635973204899296e-08, + "loss": 0.3232, + "step": 11610 + }, + { + "epoch": 2.920271629778672, + "grad_norm": 0.2975054383277893, + "learning_rate": 2.1500209308956132e-08, + "loss": 0.3178, + "step": 11611 + }, + { + "epoch": 2.9205231388329977, + "grad_norm": 0.262926310300827, + "learning_rate": 2.136487179037472e-08, + "loss": 0.3276, + "step": 11612 + }, + { + "epoch": 2.920774647887324, + "grad_norm": 0.30193185806274414, + "learning_rate": 2.122996066074523e-08, + "loss": 0.3339, + "step": 11613 + }, + { + "epoch": 2.92102615694165, + "grad_norm": 0.28565213084220886, + "learning_rate": 2.1095475931623422e-08, + "loss": 0.3043, + "step": 11614 + }, + { + "epoch": 2.9212776659959756, + "grad_norm": 0.32027721405029297, + "learning_rate": 2.0961417614525638e-08, + "loss": 0.2899, + "step": 11615 + }, + { + "epoch": 2.921529175050302, + "grad_norm": 0.29877468943595886, + "learning_rate": 2.0827785720933803e-08, + "loss": 0.3295, + "step": 11616 + }, + { + "epoch": 2.9217806841046277, + "grad_norm": 0.291421502828598, + "learning_rate": 2.0694580262292096e-08, + "loss": 0.3109, + "step": 11617 + }, + { + "epoch": 2.9220321931589535, + "grad_norm": 0.2898477613925934, + "learning_rate": 2.0561801250009727e-08, + "loss": 0.3352, + "step": 11618 + }, + { + "epoch": 2.9222837022132797, + "grad_norm": 0.27962225675582886, + "learning_rate": 2.04294486954576e-08, + "loss": 0.329, + "step": 11619 + }, + { + "epoch": 2.9225352112676055, + "grad_norm": 0.31196466088294983, + "learning_rate": 2.0297522609971087e-08, + "loss": 0.3335, + "step": 11620 + }, + { + "epoch": 2.9227867203219313, + "grad_norm": 0.2565595507621765, + "learning_rate": 2.0166023004848934e-08, + "loss": 0.2932, + "step": 11621 + }, + { + "epoch": 2.9230382293762576, + "grad_norm": 0.2930534780025482, + "learning_rate": 2.003494989135324e-08, + "loss": 0.329, + "step": 11622 + }, + { + "epoch": 2.9232897384305834, + "grad_norm": 0.2888554632663727, + "learning_rate": 1.990430328070947e-08, + "loss": 0.3193, + "step": 11623 + }, + { + "epoch": 2.9235412474849096, + "grad_norm": 0.27373868227005005, + "learning_rate": 1.977408318410645e-08, + "loss": 0.3213, + "step": 11624 + }, + { + "epoch": 2.9237927565392354, + "grad_norm": 0.2846567630767822, + "learning_rate": 1.9644289612697487e-08, + "loss": 0.3173, + "step": 11625 + }, + { + "epoch": 2.9240442655935612, + "grad_norm": 0.25684884190559387, + "learning_rate": 1.951492257759757e-08, + "loss": 0.3325, + "step": 11626 + }, + { + "epoch": 2.9242957746478875, + "grad_norm": 0.2954108417034149, + "learning_rate": 1.938598208988729e-08, + "loss": 0.3258, + "step": 11627 + }, + { + "epoch": 2.9245472837022133, + "grad_norm": 0.28941529989242554, + "learning_rate": 1.9257468160608917e-08, + "loss": 0.3436, + "step": 11628 + }, + { + "epoch": 2.924798792756539, + "grad_norm": 0.310966432094574, + "learning_rate": 1.912938080076865e-08, + "loss": 0.3186, + "step": 11629 + }, + { + "epoch": 2.9250503018108653, + "grad_norm": 0.29559624195098877, + "learning_rate": 1.90017200213366e-08, + "loss": 0.3081, + "step": 11630 + }, + { + "epoch": 2.925301810865191, + "grad_norm": 0.2723540663719177, + "learning_rate": 1.887448583324625e-08, + "loss": 0.3291, + "step": 11631 + }, + { + "epoch": 2.925553319919517, + "grad_norm": 0.290840208530426, + "learning_rate": 1.8747678247394984e-08, + "loss": 0.3344, + "step": 11632 + }, + { + "epoch": 2.925804828973843, + "grad_norm": 0.31380322575569153, + "learning_rate": 1.8621297274641904e-08, + "loss": 0.2994, + "step": 11633 + }, + { + "epoch": 2.926056338028169, + "grad_norm": 0.279624342918396, + "learning_rate": 1.8495342925811122e-08, + "loss": 0.3212, + "step": 11634 + }, + { + "epoch": 2.926307847082495, + "grad_norm": 0.27431485056877136, + "learning_rate": 1.8369815211690678e-08, + "loss": 0.2976, + "step": 11635 + }, + { + "epoch": 2.926559356136821, + "grad_norm": 0.27514395117759705, + "learning_rate": 1.8244714143029752e-08, + "loss": 0.305, + "step": 11636 + }, + { + "epoch": 2.926810865191147, + "grad_norm": 0.27842938899993896, + "learning_rate": 1.8120039730544214e-08, + "loss": 0.3311, + "step": 11637 + }, + { + "epoch": 2.927062374245473, + "grad_norm": 0.26175856590270996, + "learning_rate": 1.799579198490997e-08, + "loss": 0.323, + "step": 11638 + }, + { + "epoch": 2.927313883299799, + "grad_norm": 0.3041093647480011, + "learning_rate": 1.7871970916769067e-08, + "loss": 0.3209, + "step": 11639 + }, + { + "epoch": 2.9275653923541247, + "grad_norm": 0.28131407499313354, + "learning_rate": 1.774857653672579e-08, + "loss": 0.3049, + "step": 11640 + }, + { + "epoch": 2.927816901408451, + "grad_norm": 0.3082786500453949, + "learning_rate": 1.7625608855348365e-08, + "loss": 0.302, + "step": 11641 + }, + { + "epoch": 2.9280684104627768, + "grad_norm": 0.28668394684791565, + "learning_rate": 1.7503067883167247e-08, + "loss": 0.3034, + "step": 11642 + }, + { + "epoch": 2.9283199195171026, + "grad_norm": 0.2656381130218506, + "learning_rate": 1.7380953630678488e-08, + "loss": 0.3191, + "step": 11643 + }, + { + "epoch": 2.928571428571429, + "grad_norm": 0.30544763803482056, + "learning_rate": 1.7259266108339833e-08, + "loss": 0.3299, + "step": 11644 + }, + { + "epoch": 2.9288229376257546, + "grad_norm": 0.2998693287372589, + "learning_rate": 1.713800532657295e-08, + "loss": 0.3154, + "step": 11645 + }, + { + "epoch": 2.9290744466800804, + "grad_norm": 0.2941758334636688, + "learning_rate": 1.7017171295763412e-08, + "loss": 0.3054, + "step": 11646 + }, + { + "epoch": 2.9293259557344067, + "grad_norm": 0.29245221614837646, + "learning_rate": 1.6896764026259616e-08, + "loss": 0.326, + "step": 11647 + }, + { + "epoch": 2.9295774647887325, + "grad_norm": 0.2588028907775879, + "learning_rate": 1.6776783528373864e-08, + "loss": 0.2949, + "step": 11648 + }, + { + "epoch": 2.9298289738430583, + "grad_norm": 0.27226272225379944, + "learning_rate": 1.665722981238127e-08, + "loss": 0.3068, + "step": 11649 + }, + { + "epoch": 2.9300804828973845, + "grad_norm": 0.27903351187705994, + "learning_rate": 1.6538102888521423e-08, + "loss": 0.3074, + "step": 11650 + }, + { + "epoch": 2.9303319919517103, + "grad_norm": 0.2830352187156677, + "learning_rate": 1.6419402766996717e-08, + "loss": 0.3179, + "step": 11651 + }, + { + "epoch": 2.930583501006036, + "grad_norm": 0.29141873121261597, + "learning_rate": 1.630112945797291e-08, + "loss": 0.3243, + "step": 11652 + }, + { + "epoch": 2.9308350100603624, + "grad_norm": 0.28866273164749146, + "learning_rate": 1.6183282971579673e-08, + "loss": 0.3409, + "step": 11653 + }, + { + "epoch": 2.931086519114688, + "grad_norm": 0.27328038215637207, + "learning_rate": 1.606586331790949e-08, + "loss": 0.3436, + "step": 11654 + }, + { + "epoch": 2.931338028169014, + "grad_norm": 0.2887606620788574, + "learning_rate": 1.5948870507018766e-08, + "loss": 0.3276, + "step": 11655 + }, + { + "epoch": 2.9315895372233403, + "grad_norm": 0.2820006012916565, + "learning_rate": 1.5832304548926703e-08, + "loss": 0.2983, + "step": 11656 + }, + { + "epoch": 2.931841046277666, + "grad_norm": 0.2761184573173523, + "learning_rate": 1.571616545361754e-08, + "loss": 0.3168, + "step": 11657 + }, + { + "epoch": 2.932092555331992, + "grad_norm": 0.3011208474636078, + "learning_rate": 1.560045323103665e-08, + "loss": 0.3198, + "step": 11658 + }, + { + "epoch": 2.932344064386318, + "grad_norm": 0.27800092101097107, + "learning_rate": 1.5485167891095e-08, + "loss": 0.321, + "step": 11659 + }, + { + "epoch": 2.932595573440644, + "grad_norm": 0.2734206020832062, + "learning_rate": 1.53703094436658e-08, + "loss": 0.3157, + "step": 11660 + }, + { + "epoch": 2.9328470824949697, + "grad_norm": 0.26894569396972656, + "learning_rate": 1.5255877898585624e-08, + "loss": 0.3178, + "step": 11661 + }, + { + "epoch": 2.933098591549296, + "grad_norm": 0.2924859821796417, + "learning_rate": 1.5141873265654973e-08, + "loss": 0.3364, + "step": 11662 + }, + { + "epoch": 2.933350100603622, + "grad_norm": 0.2777864634990692, + "learning_rate": 1.5028295554637695e-08, + "loss": 0.3277, + "step": 11663 + }, + { + "epoch": 2.9336016096579476, + "grad_norm": 0.3020499348640442, + "learning_rate": 1.4915144775261014e-08, + "loss": 0.3211, + "step": 11664 + }, + { + "epoch": 2.933853118712274, + "grad_norm": 0.2888851761817932, + "learning_rate": 1.4802420937216066e-08, + "loss": 0.3298, + "step": 11665 + }, + { + "epoch": 2.9341046277665996, + "grad_norm": 0.27491968870162964, + "learning_rate": 1.4690124050155686e-08, + "loss": 0.3193, + "step": 11666 + }, + { + "epoch": 2.9343561368209254, + "grad_norm": 0.29931673407554626, + "learning_rate": 1.4578254123698844e-08, + "loss": 0.299, + "step": 11667 + }, + { + "epoch": 2.9346076458752517, + "grad_norm": 0.2893234193325043, + "learning_rate": 1.4466811167425655e-08, + "loss": 0.3203, + "step": 11668 + }, + { + "epoch": 2.9348591549295775, + "grad_norm": 0.2912534475326538, + "learning_rate": 1.4355795190880707e-08, + "loss": 0.3138, + "step": 11669 + }, + { + "epoch": 2.9351106639839033, + "grad_norm": 0.28769242763519287, + "learning_rate": 1.4245206203571393e-08, + "loss": 0.3178, + "step": 11670 + }, + { + "epoch": 2.9353621730382295, + "grad_norm": 0.31816333532333374, + "learning_rate": 1.4135044214969585e-08, + "loss": 0.3455, + "step": 11671 + }, + { + "epoch": 2.9356136820925554, + "grad_norm": 0.3172784149646759, + "learning_rate": 1.4025309234510509e-08, + "loss": 0.3227, + "step": 11672 + }, + { + "epoch": 2.935865191146881, + "grad_norm": 0.31336894631385803, + "learning_rate": 1.391600127159054e-08, + "loss": 0.3134, + "step": 11673 + }, + { + "epoch": 2.9361167002012074, + "grad_norm": 0.2920610010623932, + "learning_rate": 1.3807120335572743e-08, + "loss": 0.3017, + "step": 11674 + }, + { + "epoch": 2.936368209255533, + "grad_norm": 0.27723726630210876, + "learning_rate": 1.3698666435781327e-08, + "loss": 0.3341, + "step": 11675 + }, + { + "epoch": 2.936619718309859, + "grad_norm": 0.28226420283317566, + "learning_rate": 1.3590639581504971e-08, + "loss": 0.3079, + "step": 11676 + }, + { + "epoch": 2.9368712273641853, + "grad_norm": 0.3186614215373993, + "learning_rate": 1.3483039781995721e-08, + "loss": 0.3145, + "step": 11677 + }, + { + "epoch": 2.937122736418511, + "grad_norm": 0.27414506673812866, + "learning_rate": 1.3375867046468427e-08, + "loss": 0.2952, + "step": 11678 + }, + { + "epoch": 2.937374245472837, + "grad_norm": 0.28623631596565247, + "learning_rate": 1.3269121384101857e-08, + "loss": 0.34, + "step": 11679 + }, + { + "epoch": 2.937625754527163, + "grad_norm": 0.29475000500679016, + "learning_rate": 1.316280280403759e-08, + "loss": 0.3194, + "step": 11680 + }, + { + "epoch": 2.937877263581489, + "grad_norm": 0.3122510015964508, + "learning_rate": 1.3056911315382226e-08, + "loss": 0.3142, + "step": 11681 + }, + { + "epoch": 2.9381287726358147, + "grad_norm": 0.29271435737609863, + "learning_rate": 1.2951446927204625e-08, + "loss": 0.3127, + "step": 11682 + }, + { + "epoch": 2.938380281690141, + "grad_norm": 0.3412143886089325, + "learning_rate": 1.2846409648535896e-08, + "loss": 0.3269, + "step": 11683 + }, + { + "epoch": 2.938631790744467, + "grad_norm": 0.2909950613975525, + "learning_rate": 1.2741799488373285e-08, + "loss": 0.3061, + "step": 11684 + }, + { + "epoch": 2.9388832997987926, + "grad_norm": 0.276928573846817, + "learning_rate": 1.2637616455675183e-08, + "loss": 0.3038, + "step": 11685 + }, + { + "epoch": 2.939134808853119, + "grad_norm": 0.2644422948360443, + "learning_rate": 1.2533860559363897e-08, + "loss": 0.332, + "step": 11686 + }, + { + "epoch": 2.9393863179074446, + "grad_norm": 0.28948256373405457, + "learning_rate": 1.2430531808326763e-08, + "loss": 0.325, + "step": 11687 + }, + { + "epoch": 2.9396378269617705, + "grad_norm": 0.2958967387676239, + "learning_rate": 1.23276302114117e-08, + "loss": 0.3028, + "step": 11688 + }, + { + "epoch": 2.9398893360160967, + "grad_norm": 0.2886587977409363, + "learning_rate": 1.2225155777432773e-08, + "loss": 0.3312, + "step": 11689 + }, + { + "epoch": 2.9401408450704225, + "grad_norm": 0.3025323748588562, + "learning_rate": 1.2123108515165738e-08, + "loss": 0.3017, + "step": 11690 + }, + { + "epoch": 2.9403923541247483, + "grad_norm": 0.28794747591018677, + "learning_rate": 1.202148843335027e-08, + "loss": 0.3094, + "step": 11691 + }, + { + "epoch": 2.9406438631790746, + "grad_norm": 0.287518709897995, + "learning_rate": 1.1920295540689964e-08, + "loss": 0.3325, + "step": 11692 + }, + { + "epoch": 2.9408953722334004, + "grad_norm": 0.3010087013244629, + "learning_rate": 1.1819529845850664e-08, + "loss": 0.3359, + "step": 11693 + }, + { + "epoch": 2.941146881287726, + "grad_norm": 0.28999802470207214, + "learning_rate": 1.1719191357463245e-08, + "loss": 0.3375, + "step": 11694 + }, + { + "epoch": 2.9413983903420524, + "grad_norm": 0.2955481708049774, + "learning_rate": 1.1619280084119722e-08, + "loss": 0.3295, + "step": 11695 + }, + { + "epoch": 2.941649899396378, + "grad_norm": 0.28115126490592957, + "learning_rate": 1.151979603437825e-08, + "loss": 0.3115, + "step": 11696 + }, + { + "epoch": 2.941901408450704, + "grad_norm": 0.2838100492954254, + "learning_rate": 1.1420739216758125e-08, + "loss": 0.3389, + "step": 11697 + }, + { + "epoch": 2.9421529175050303, + "grad_norm": 0.291797935962677, + "learning_rate": 1.1322109639743117e-08, + "loss": 0.3357, + "step": 11698 + }, + { + "epoch": 2.942404426559356, + "grad_norm": 0.2834652066230774, + "learning_rate": 1.1223907311780358e-08, + "loss": 0.303, + "step": 11699 + }, + { + "epoch": 2.942655935613682, + "grad_norm": 0.2929726243019104, + "learning_rate": 1.1126132241280342e-08, + "loss": 0.3303, + "step": 11700 + }, + { + "epoch": 2.942907444668008, + "grad_norm": 0.290096253156662, + "learning_rate": 1.1028784436616923e-08, + "loss": 0.323, + "step": 11701 + }, + { + "epoch": 2.943158953722334, + "grad_norm": 0.2806238830089569, + "learning_rate": 1.0931863906127327e-08, + "loss": 0.3357, + "step": 11702 + }, + { + "epoch": 2.9434104627766597, + "grad_norm": 0.2727678716182709, + "learning_rate": 1.0835370658111577e-08, + "loss": 0.3191, + "step": 11703 + }, + { + "epoch": 2.943661971830986, + "grad_norm": 0.3112581670284271, + "learning_rate": 1.073930470083473e-08, + "loss": 0.314, + "step": 11704 + }, + { + "epoch": 2.943913480885312, + "grad_norm": 0.29780250787734985, + "learning_rate": 1.0643666042523537e-08, + "loss": 0.3292, + "step": 11705 + }, + { + "epoch": 2.9441649899396376, + "grad_norm": 0.283771276473999, + "learning_rate": 1.054845469136867e-08, + "loss": 0.3173, + "step": 11706 + }, + { + "epoch": 2.944416498993964, + "grad_norm": 0.2684139311313629, + "learning_rate": 1.0453670655525273e-08, + "loss": 0.3129, + "step": 11707 + }, + { + "epoch": 2.9446680080482897, + "grad_norm": 0.29727786779403687, + "learning_rate": 1.0359313943110183e-08, + "loss": 0.3189, + "step": 11708 + }, + { + "epoch": 2.9449195171026155, + "grad_norm": 0.28367939591407776, + "learning_rate": 1.0265384562205272e-08, + "loss": 0.3273, + "step": 11709 + }, + { + "epoch": 2.9451710261569417, + "grad_norm": 0.28062888979911804, + "learning_rate": 1.0171882520853548e-08, + "loss": 0.3059, + "step": 11710 + }, + { + "epoch": 2.9454225352112675, + "grad_norm": 0.28939807415008545, + "learning_rate": 1.0078807827064718e-08, + "loss": 0.3318, + "step": 11711 + }, + { + "epoch": 2.9456740442655933, + "grad_norm": 0.27794498205184937, + "learning_rate": 9.986160488808517e-09, + "loss": 0.3177, + "step": 11712 + }, + { + "epoch": 2.9459255533199196, + "grad_norm": 0.3011113107204437, + "learning_rate": 9.893940514020817e-09, + "loss": 0.3293, + "step": 11713 + }, + { + "epoch": 2.9461770623742454, + "grad_norm": 0.2947550117969513, + "learning_rate": 9.802147910598637e-09, + "loss": 0.3097, + "step": 11714 + }, + { + "epoch": 2.946428571428571, + "grad_norm": 0.2753145694732666, + "learning_rate": 9.71078268640402e-09, + "loss": 0.3261, + "step": 11715 + }, + { + "epoch": 2.9466800804828974, + "grad_norm": 0.29693037271499634, + "learning_rate": 9.619844849261816e-09, + "loss": 0.3104, + "step": 11716 + }, + { + "epoch": 2.9469315895372232, + "grad_norm": 0.2914050221443176, + "learning_rate": 9.529334406960245e-09, + "loss": 0.3201, + "step": 11717 + }, + { + "epoch": 2.947183098591549, + "grad_norm": 0.3129412829875946, + "learning_rate": 9.439251367250879e-09, + "loss": 0.3052, + "step": 11718 + }, + { + "epoch": 2.9474346076458753, + "grad_norm": 0.26742586493492126, + "learning_rate": 9.349595737848105e-09, + "loss": 0.3316, + "step": 11719 + }, + { + "epoch": 2.947686116700201, + "grad_norm": 0.2630806565284729, + "learning_rate": 9.260367526431891e-09, + "loss": 0.3085, + "step": 11720 + }, + { + "epoch": 2.947937625754527, + "grad_norm": 0.28567931056022644, + "learning_rate": 9.17156674064279e-09, + "loss": 0.3279, + "step": 11721 + }, + { + "epoch": 2.948189134808853, + "grad_norm": 0.2721531391143799, + "learning_rate": 9.083193388086941e-09, + "loss": 0.3058, + "step": 11722 + }, + { + "epoch": 2.948440643863179, + "grad_norm": 0.31495407223701477, + "learning_rate": 8.99524747633218e-09, + "loss": 0.3336, + "step": 11723 + }, + { + "epoch": 2.948692152917505, + "grad_norm": 0.26969239115715027, + "learning_rate": 8.907729012910814e-09, + "loss": 0.3234, + "step": 11724 + }, + { + "epoch": 2.948943661971831, + "grad_norm": 0.3003351092338562, + "learning_rate": 8.820638005317961e-09, + "loss": 0.3474, + "step": 11725 + }, + { + "epoch": 2.949195171026157, + "grad_norm": 0.2833467125892639, + "learning_rate": 8.733974461013207e-09, + "loss": 0.3048, + "step": 11726 + }, + { + "epoch": 2.949446680080483, + "grad_norm": 0.2887994050979614, + "learning_rate": 8.647738387418391e-09, + "loss": 0.3077, + "step": 11727 + }, + { + "epoch": 2.949698189134809, + "grad_norm": 0.2787559926509857, + "learning_rate": 8.561929791918722e-09, + "loss": 0.3414, + "step": 11728 + }, + { + "epoch": 2.9499496981891347, + "grad_norm": 0.2879605293273926, + "learning_rate": 8.476548681863316e-09, + "loss": 0.3144, + "step": 11729 + }, + { + "epoch": 2.950201207243461, + "grad_norm": 0.2889518141746521, + "learning_rate": 8.391595064564661e-09, + "loss": 0.3077, + "step": 11730 + }, + { + "epoch": 2.9504527162977867, + "grad_norm": 0.2559886872768402, + "learning_rate": 8.307068947299157e-09, + "loss": 0.3284, + "step": 11731 + }, + { + "epoch": 2.9507042253521125, + "grad_norm": 0.3104651868343353, + "learning_rate": 8.222970337304347e-09, + "loss": 0.3091, + "step": 11732 + }, + { + "epoch": 2.9509557344064388, + "grad_norm": 0.2882402241230011, + "learning_rate": 8.139299241783916e-09, + "loss": 0.3232, + "step": 11733 + }, + { + "epoch": 2.9512072434607646, + "grad_norm": 0.2959415018558502, + "learning_rate": 8.056055667903795e-09, + "loss": 0.3191, + "step": 11734 + }, + { + "epoch": 2.9514587525150904, + "grad_norm": 0.2943558394908905, + "learning_rate": 7.973239622792728e-09, + "loss": 0.3241, + "step": 11735 + }, + { + "epoch": 2.9517102615694166, + "grad_norm": 0.271892786026001, + "learning_rate": 7.89085111354393e-09, + "loss": 0.3026, + "step": 11736 + }, + { + "epoch": 2.9519617706237424, + "grad_norm": 0.2949487268924713, + "learning_rate": 7.808890147213422e-09, + "loss": 0.3119, + "step": 11737 + }, + { + "epoch": 2.9522132796780687, + "grad_norm": 0.2683142125606537, + "learning_rate": 7.727356730820035e-09, + "loss": 0.3042, + "step": 11738 + }, + { + "epoch": 2.9524647887323945, + "grad_norm": 0.3025915026664734, + "learning_rate": 7.646250871347072e-09, + "loss": 0.3253, + "step": 11739 + }, + { + "epoch": 2.9527162977867203, + "grad_norm": 0.2929423451423645, + "learning_rate": 7.565572575740087e-09, + "loss": 0.3055, + "step": 11740 + }, + { + "epoch": 2.9529678068410465, + "grad_norm": 0.27529823780059814, + "learning_rate": 7.485321850910221e-09, + "loss": 0.3363, + "step": 11741 + }, + { + "epoch": 2.9532193158953723, + "grad_norm": 0.28709059953689575, + "learning_rate": 7.405498703728642e-09, + "loss": 0.3161, + "step": 11742 + }, + { + "epoch": 2.953470824949698, + "grad_norm": 0.28413447737693787, + "learning_rate": 7.326103141033214e-09, + "loss": 0.2844, + "step": 11743 + }, + { + "epoch": 2.9537223340040244, + "grad_norm": 0.2665967643260956, + "learning_rate": 7.247135169622388e-09, + "loss": 0.2992, + "step": 11744 + }, + { + "epoch": 2.95397384305835, + "grad_norm": 0.29413825273513794, + "learning_rate": 7.1685947962601976e-09, + "loss": 0.3074, + "step": 11745 + }, + { + "epoch": 2.954225352112676, + "grad_norm": 0.29388710856437683, + "learning_rate": 7.0904820276729294e-09, + "loss": 0.311, + "step": 11746 + }, + { + "epoch": 2.9544768611670023, + "grad_norm": 0.30368688702583313, + "learning_rate": 7.012796870549676e-09, + "loss": 0.3278, + "step": 11747 + }, + { + "epoch": 2.954728370221328, + "grad_norm": 0.30217698216438293, + "learning_rate": 6.935539331545116e-09, + "loss": 0.3486, + "step": 11748 + }, + { + "epoch": 2.954979879275654, + "grad_norm": 0.30664876103401184, + "learning_rate": 6.858709417274512e-09, + "loss": 0.3202, + "step": 11749 + }, + { + "epoch": 2.95523138832998, + "grad_norm": 0.2738303244113922, + "learning_rate": 6.7823071343187106e-09, + "loss": 0.3249, + "step": 11750 + }, + { + "epoch": 2.955482897384306, + "grad_norm": 0.30232658982276917, + "learning_rate": 6.7063324892208125e-09, + "loss": 0.3419, + "step": 11751 + }, + { + "epoch": 2.9557344064386317, + "grad_norm": 0.2941600978374481, + "learning_rate": 6.630785488487834e-09, + "loss": 0.3004, + "step": 11752 + }, + { + "epoch": 2.955985915492958, + "grad_norm": 0.27397286891937256, + "learning_rate": 6.5556661385896005e-09, + "loss": 0.3072, + "step": 11753 + }, + { + "epoch": 2.9562374245472838, + "grad_norm": 0.2965390980243683, + "learning_rate": 6.480974445959298e-09, + "loss": 0.345, + "step": 11754 + }, + { + "epoch": 2.9564889336016096, + "grad_norm": 0.27063122391700745, + "learning_rate": 6.406710416994588e-09, + "loss": 0.2997, + "step": 11755 + }, + { + "epoch": 2.956740442655936, + "grad_norm": 0.2771882712841034, + "learning_rate": 6.3328740580548275e-09, + "loss": 0.3317, + "step": 11756 + }, + { + "epoch": 2.9569919517102616, + "grad_norm": 0.2760043442249298, + "learning_rate": 6.259465375464402e-09, + "loss": 0.3197, + "step": 11757 + }, + { + "epoch": 2.9572434607645874, + "grad_norm": 0.30299848318099976, + "learning_rate": 6.1864843755099495e-09, + "loss": 0.3115, + "step": 11758 + }, + { + "epoch": 2.9574949698189137, + "grad_norm": 0.2945834994316101, + "learning_rate": 6.113931064442025e-09, + "loss": 0.3084, + "step": 11759 + }, + { + "epoch": 2.9577464788732395, + "grad_norm": 0.2729983329772949, + "learning_rate": 6.041805448474547e-09, + "loss": 0.3012, + "step": 11760 + }, + { + "epoch": 2.9579979879275653, + "grad_norm": 0.29064232110977173, + "learning_rate": 5.970107533783687e-09, + "loss": 0.3246, + "step": 11761 + }, + { + "epoch": 2.9582494969818915, + "grad_norm": 0.29307058453559875, + "learning_rate": 5.898837326511197e-09, + "loss": 0.3051, + "step": 11762 + }, + { + "epoch": 2.9585010060362174, + "grad_norm": 0.2878889739513397, + "learning_rate": 5.8279948327599754e-09, + "loss": 0.3432, + "step": 11763 + }, + { + "epoch": 2.958752515090543, + "grad_norm": 0.272480309009552, + "learning_rate": 5.75758005859739e-09, + "loss": 0.3215, + "step": 11764 + }, + { + "epoch": 2.9590040241448694, + "grad_norm": 0.280304878950119, + "learning_rate": 5.6875930100541706e-09, + "loss": 0.3251, + "step": 11765 + }, + { + "epoch": 2.959255533199195, + "grad_norm": 0.2817912697792053, + "learning_rate": 5.618033693124414e-09, + "loss": 0.3462, + "step": 11766 + }, + { + "epoch": 2.959507042253521, + "grad_norm": 0.2780134677886963, + "learning_rate": 5.548902113765575e-09, + "loss": 0.299, + "step": 11767 + }, + { + "epoch": 2.9597585513078473, + "grad_norm": 0.28865960240364075, + "learning_rate": 5.480198277897919e-09, + "loss": 0.3421, + "step": 11768 + }, + { + "epoch": 2.960010060362173, + "grad_norm": 0.2794874608516693, + "learning_rate": 5.411922191405627e-09, + "loss": 0.3387, + "step": 11769 + }, + { + "epoch": 2.960261569416499, + "grad_norm": 0.27201390266418457, + "learning_rate": 5.34407386013569e-09, + "loss": 0.3346, + "step": 11770 + }, + { + "epoch": 2.960513078470825, + "grad_norm": 0.28588712215423584, + "learning_rate": 5.276653289900124e-09, + "loss": 0.3482, + "step": 11771 + }, + { + "epoch": 2.960764587525151, + "grad_norm": 0.2717914283275604, + "learning_rate": 5.20966048647209e-09, + "loss": 0.3096, + "step": 11772 + }, + { + "epoch": 2.9610160965794767, + "grad_norm": 0.2592178285121918, + "learning_rate": 5.14309545558922e-09, + "loss": 0.2934, + "step": 11773 + }, + { + "epoch": 2.961267605633803, + "grad_norm": 0.2764432728290558, + "learning_rate": 5.076958202952509e-09, + "loss": 0.3238, + "step": 11774 + }, + { + "epoch": 2.961519114688129, + "grad_norm": 0.3098163902759552, + "learning_rate": 5.01124873422576e-09, + "loss": 0.3151, + "step": 11775 + }, + { + "epoch": 2.9617706237424546, + "grad_norm": 0.2991572916507721, + "learning_rate": 4.945967055037803e-09, + "loss": 0.3173, + "step": 11776 + }, + { + "epoch": 2.962022132796781, + "grad_norm": 0.26591500639915466, + "learning_rate": 4.881113170978058e-09, + "loss": 0.3266, + "step": 11777 + }, + { + "epoch": 2.9622736418511066, + "grad_norm": 0.29749444127082825, + "learning_rate": 4.8166870876020786e-09, + "loss": 0.3228, + "step": 11778 + }, + { + "epoch": 2.9625251509054324, + "grad_norm": 0.2841416001319885, + "learning_rate": 4.75268881042712e-09, + "loss": 0.3343, + "step": 11779 + }, + { + "epoch": 2.9627766599597587, + "grad_norm": 0.26449331641197205, + "learning_rate": 4.689118344933796e-09, + "loss": 0.3179, + "step": 11780 + }, + { + "epoch": 2.9630281690140845, + "grad_norm": 0.3073911666870117, + "learning_rate": 4.625975696567197e-09, + "loss": 0.3033, + "step": 11781 + }, + { + "epoch": 2.9632796780684103, + "grad_norm": 0.28680184483528137, + "learning_rate": 4.56326087073522e-09, + "loss": 0.2853, + "step": 11782 + }, + { + "epoch": 2.9635311871227366, + "grad_norm": 0.2971070408821106, + "learning_rate": 4.500973872808012e-09, + "loss": 0.3466, + "step": 11783 + }, + { + "epoch": 2.9637826961770624, + "grad_norm": 0.2824273705482483, + "learning_rate": 4.439114708120751e-09, + "loss": 0.3272, + "step": 11784 + }, + { + "epoch": 2.964034205231388, + "grad_norm": 0.30176204442977905, + "learning_rate": 4.37768338197142e-09, + "loss": 0.3077, + "step": 11785 + }, + { + "epoch": 2.9642857142857144, + "grad_norm": 0.27803835272789, + "learning_rate": 4.3166798996208125e-09, + "loss": 0.3356, + "step": 11786 + }, + { + "epoch": 2.96453722334004, + "grad_norm": 0.30758580565452576, + "learning_rate": 4.256104266293637e-09, + "loss": 0.3263, + "step": 11787 + }, + { + "epoch": 2.964788732394366, + "grad_norm": 0.27492862939834595, + "learning_rate": 4.19595648717741e-09, + "loss": 0.3243, + "step": 11788 + }, + { + "epoch": 2.9650402414486923, + "grad_norm": 0.2788017988204956, + "learning_rate": 4.136236567424679e-09, + "loss": 0.3405, + "step": 11789 + }, + { + "epoch": 2.965291750503018, + "grad_norm": 0.2948797047138214, + "learning_rate": 4.076944512148573e-09, + "loss": 0.3143, + "step": 11790 + }, + { + "epoch": 2.965543259557344, + "grad_norm": 0.2663567364215851, + "learning_rate": 4.018080326428364e-09, + "loss": 0.2989, + "step": 11791 + }, + { + "epoch": 2.96579476861167, + "grad_norm": 0.2669561803340912, + "learning_rate": 3.9596440153044645e-09, + "loss": 0.3026, + "step": 11792 + }, + { + "epoch": 2.966046277665996, + "grad_norm": 0.28775742650032043, + "learning_rate": 3.901635583782315e-09, + "loss": 0.333, + "step": 11793 + }, + { + "epoch": 2.9662977867203217, + "grad_norm": 0.293270468711853, + "learning_rate": 3.844055036829053e-09, + "loss": 0.3608, + "step": 11794 + }, + { + "epoch": 2.966549295774648, + "grad_norm": 0.2894499897956848, + "learning_rate": 3.786902379376844e-09, + "loss": 0.3251, + "step": 11795 + }, + { + "epoch": 2.966800804828974, + "grad_norm": 0.2831835448741913, + "learning_rate": 3.730177616320108e-09, + "loss": 0.3216, + "step": 11796 + }, + { + "epoch": 2.9670523138832996, + "grad_norm": 0.2966955900192261, + "learning_rate": 3.6738807525171784e-09, + "loss": 0.3252, + "step": 11797 + }, + { + "epoch": 2.967303822937626, + "grad_norm": 0.2994356155395508, + "learning_rate": 3.618011792789755e-09, + "loss": 0.3243, + "step": 11798 + }, + { + "epoch": 2.9675553319919517, + "grad_norm": 0.2818380892276764, + "learning_rate": 3.562570741921789e-09, + "loss": 0.3147, + "step": 11799 + }, + { + "epoch": 2.9678068410462775, + "grad_norm": 0.29389482736587524, + "learning_rate": 3.5075576046628145e-09, + "loss": 0.3545, + "step": 11800 + }, + { + "epoch": 2.9680583501006037, + "grad_norm": 0.2830759584903717, + "learning_rate": 3.4529723857229526e-09, + "loss": 0.3221, + "step": 11801 + }, + { + "epoch": 2.9683098591549295, + "grad_norm": 0.28202909231185913, + "learning_rate": 3.3988150897779073e-09, + "loss": 0.3163, + "step": 11802 + }, + { + "epoch": 2.9685613682092553, + "grad_norm": 0.3006686866283417, + "learning_rate": 3.345085721465635e-09, + "loss": 0.3173, + "step": 11803 + }, + { + "epoch": 2.9688128772635816, + "grad_norm": 0.28550851345062256, + "learning_rate": 3.291784285387456e-09, + "loss": 0.3222, + "step": 11804 + }, + { + "epoch": 2.9690643863179074, + "grad_norm": 0.2890929877758026, + "learning_rate": 3.238910786109162e-09, + "loss": 0.305, + "step": 11805 + }, + { + "epoch": 2.969315895372233, + "grad_norm": 0.27377450466156006, + "learning_rate": 3.186465228158242e-09, + "loss": 0.3526, + "step": 11806 + }, + { + "epoch": 2.9695674044265594, + "grad_norm": 0.285478413105011, + "learning_rate": 3.1344476160266592e-09, + "loss": 0.3079, + "step": 11807 + }, + { + "epoch": 2.9698189134808852, + "grad_norm": 0.29703089594841003, + "learning_rate": 3.082857954169738e-09, + "loss": 0.2883, + "step": 11808 + }, + { + "epoch": 2.970070422535211, + "grad_norm": 0.2617098391056061, + "learning_rate": 3.031696247005056e-09, + "loss": 0.3055, + "step": 11809 + }, + { + "epoch": 2.9703219315895373, + "grad_norm": 0.2684277594089508, + "learning_rate": 2.9809624989146633e-09, + "loss": 0.3213, + "step": 11810 + }, + { + "epoch": 2.970573440643863, + "grad_norm": 0.28313401341438293, + "learning_rate": 2.9306567142434183e-09, + "loss": 0.3155, + "step": 11811 + }, + { + "epoch": 2.970824949698189, + "grad_norm": 0.2776716351509094, + "learning_rate": 2.8807788973000962e-09, + "loss": 0.3116, + "step": 11812 + }, + { + "epoch": 2.971076458752515, + "grad_norm": 0.29418906569480896, + "learning_rate": 2.83132905235628e-09, + "loss": 0.2839, + "step": 11813 + }, + { + "epoch": 2.971327967806841, + "grad_norm": 0.2912454903125763, + "learning_rate": 2.78230718364636e-09, + "loss": 0.3275, + "step": 11814 + }, + { + "epoch": 2.9715794768611667, + "grad_norm": 0.2744319438934326, + "learning_rate": 2.7337132953697555e-09, + "loss": 0.3291, + "step": 11815 + }, + { + "epoch": 2.971830985915493, + "grad_norm": 0.2988802194595337, + "learning_rate": 2.685547391688137e-09, + "loss": 0.3212, + "step": 11816 + }, + { + "epoch": 2.972082494969819, + "grad_norm": 0.28176239132881165, + "learning_rate": 2.6378094767259833e-09, + "loss": 0.3297, + "step": 11817 + }, + { + "epoch": 2.9723340040241446, + "grad_norm": 0.30423882603645325, + "learning_rate": 2.5904995545716903e-09, + "loss": 0.3004, + "step": 11818 + }, + { + "epoch": 2.972585513078471, + "grad_norm": 0.27662205696105957, + "learning_rate": 2.5436176292781277e-09, + "loss": 0.3055, + "step": 11819 + }, + { + "epoch": 2.9728370221327967, + "grad_norm": 0.28385183215141296, + "learning_rate": 2.497163704859307e-09, + "loss": 0.2986, + "step": 11820 + }, + { + "epoch": 2.9730885311871225, + "grad_norm": 0.2943600118160248, + "learning_rate": 2.4511377852937114e-09, + "loss": 0.3284, + "step": 11821 + }, + { + "epoch": 2.9733400402414487, + "grad_norm": 0.27290356159210205, + "learning_rate": 2.4055398745242987e-09, + "loss": 0.3114, + "step": 11822 + }, + { + "epoch": 2.9735915492957745, + "grad_norm": 0.2819903790950775, + "learning_rate": 2.360369976455168e-09, + "loss": 0.3206, + "step": 11823 + }, + { + "epoch": 2.9738430583501008, + "grad_norm": 0.28330057859420776, + "learning_rate": 2.3156280949554465e-09, + "loss": 0.3173, + "step": 11824 + }, + { + "epoch": 2.9740945674044266, + "grad_norm": 0.3153158724308014, + "learning_rate": 2.2713142338565142e-09, + "loss": 0.3447, + "step": 11825 + }, + { + "epoch": 2.9743460764587524, + "grad_norm": 0.278213232755661, + "learning_rate": 2.2274283969542233e-09, + "loss": 0.3002, + "step": 11826 + }, + { + "epoch": 2.9745975855130786, + "grad_norm": 0.29102084040641785, + "learning_rate": 2.1839705880061235e-09, + "loss": 0.3253, + "step": 11827 + }, + { + "epoch": 2.9748490945674044, + "grad_norm": 0.28229936957359314, + "learning_rate": 2.1409408107353478e-09, + "loss": 0.3337, + "step": 11828 + }, + { + "epoch": 2.9751006036217302, + "grad_norm": 0.28082865476608276, + "learning_rate": 2.0983390688261718e-09, + "loss": 0.3255, + "step": 11829 + }, + { + "epoch": 2.9753521126760565, + "grad_norm": 0.27319207787513733, + "learning_rate": 2.056165365927343e-09, + "loss": 0.3132, + "step": 11830 + }, + { + "epoch": 2.9756036217303823, + "grad_norm": 0.2930520474910736, + "learning_rate": 2.0144197056509717e-09, + "loss": 0.3183, + "step": 11831 + }, + { + "epoch": 2.975855130784708, + "grad_norm": 0.28141433000564575, + "learning_rate": 1.9731020915725317e-09, + "loss": 0.3456, + "step": 11832 + }, + { + "epoch": 2.9761066398390343, + "grad_norm": 0.28628331422805786, + "learning_rate": 1.9322125272297488e-09, + "loss": 0.3227, + "step": 11833 + }, + { + "epoch": 2.97635814889336, + "grad_norm": 0.30301791429519653, + "learning_rate": 1.8917510161259312e-09, + "loss": 0.315, + "step": 11834 + }, + { + "epoch": 2.9766096579476864, + "grad_norm": 0.28687921166419983, + "learning_rate": 1.851717561724975e-09, + "loss": 0.3036, + "step": 11835 + }, + { + "epoch": 2.976861167002012, + "grad_norm": 0.26738491654396057, + "learning_rate": 1.812112167456359e-09, + "loss": 0.3211, + "step": 11836 + }, + { + "epoch": 2.977112676056338, + "grad_norm": 0.3000773787498474, + "learning_rate": 1.7729348367118148e-09, + "loss": 0.3202, + "step": 11837 + }, + { + "epoch": 2.9773641851106643, + "grad_norm": 0.28814077377319336, + "learning_rate": 1.7341855728464363e-09, + "loss": 0.2894, + "step": 11838 + }, + { + "epoch": 2.97761569416499, + "grad_norm": 0.26308709383010864, + "learning_rate": 1.695864379179235e-09, + "loss": 0.3039, + "step": 11839 + }, + { + "epoch": 2.977867203219316, + "grad_norm": 0.28434452414512634, + "learning_rate": 1.6579712589914754e-09, + "loss": 0.322, + "step": 11840 + }, + { + "epoch": 2.978118712273642, + "grad_norm": 0.2894355058670044, + "learning_rate": 1.6205062155294494e-09, + "loss": 0.3197, + "step": 11841 + }, + { + "epoch": 2.978370221327968, + "grad_norm": 0.31105583906173706, + "learning_rate": 1.5834692520011462e-09, + "loss": 0.3124, + "step": 11842 + }, + { + "epoch": 2.9786217303822937, + "grad_norm": 0.2865217924118042, + "learning_rate": 1.5468603715784736e-09, + "loss": 0.3336, + "step": 11843 + }, + { + "epoch": 2.97887323943662, + "grad_norm": 0.3048990070819855, + "learning_rate": 1.510679577397256e-09, + "loss": 0.3265, + "step": 11844 + }, + { + "epoch": 2.9791247484909458, + "grad_norm": 0.27794545888900757, + "learning_rate": 1.4749268725555709e-09, + "loss": 0.3072, + "step": 11845 + }, + { + "epoch": 2.9793762575452716, + "grad_norm": 0.31118738651275635, + "learning_rate": 1.4396022601159687e-09, + "loss": 0.3398, + "step": 11846 + }, + { + "epoch": 2.979627766599598, + "grad_norm": 0.28356632590293884, + "learning_rate": 1.404705743103807e-09, + "loss": 0.3246, + "step": 11847 + }, + { + "epoch": 2.9798792756539236, + "grad_norm": 0.28750303387641907, + "learning_rate": 1.370237324507251e-09, + "loss": 0.3162, + "step": 11848 + }, + { + "epoch": 2.9801307847082494, + "grad_norm": 0.346623957157135, + "learning_rate": 1.3361970072783836e-09, + "loss": 0.2966, + "step": 11849 + }, + { + "epoch": 2.9803822937625757, + "grad_norm": 0.309396892786026, + "learning_rate": 1.3025847943326508e-09, + "loss": 0.3277, + "step": 11850 + }, + { + "epoch": 2.9806338028169015, + "grad_norm": 0.2721833884716034, + "learning_rate": 1.2694006885488609e-09, + "loss": 0.3412, + "step": 11851 + }, + { + "epoch": 2.9808853118712273, + "grad_norm": 0.3036994934082031, + "learning_rate": 1.2366446927691844e-09, + "loss": 0.2995, + "step": 11852 + }, + { + "epoch": 2.9811368209255535, + "grad_norm": 0.2673596441745758, + "learning_rate": 1.2043168097986002e-09, + "loss": 0.3089, + "step": 11853 + }, + { + "epoch": 2.9813883299798793, + "grad_norm": 0.3089800179004669, + "learning_rate": 1.1724170424054493e-09, + "loss": 0.3185, + "step": 11854 + }, + { + "epoch": 2.981639839034205, + "grad_norm": 0.26984110474586487, + "learning_rate": 1.1409453933225457e-09, + "loss": 0.3277, + "step": 11855 + }, + { + "epoch": 2.9818913480885314, + "grad_norm": 0.2922016382217407, + "learning_rate": 1.1099018652449557e-09, + "loss": 0.3058, + "step": 11856 + }, + { + "epoch": 2.982142857142857, + "grad_norm": 0.2652096152305603, + "learning_rate": 1.0792864608316634e-09, + "loss": 0.3217, + "step": 11857 + }, + { + "epoch": 2.982394366197183, + "grad_norm": 0.3048350512981415, + "learning_rate": 1.0490991827039055e-09, + "loss": 0.2961, + "step": 11858 + }, + { + "epoch": 2.9826458752515093, + "grad_norm": 0.26128822565078735, + "learning_rate": 1.0193400334473913e-09, + "loss": 0.3407, + "step": 11859 + }, + { + "epoch": 2.982897384305835, + "grad_norm": 0.28069064021110535, + "learning_rate": 9.90009015611193e-10, + "loss": 0.3106, + "step": 11860 + }, + { + "epoch": 2.983148893360161, + "grad_norm": 0.2878744602203369, + "learning_rate": 9.61106131706635e-10, + "loss": 0.3249, + "step": 11861 + }, + { + "epoch": 2.983400402414487, + "grad_norm": 0.2882971167564392, + "learning_rate": 9.326313842100698e-10, + "loss": 0.3099, + "step": 11862 + }, + { + "epoch": 2.983651911468813, + "grad_norm": 0.2869325578212738, + "learning_rate": 9.045847755589921e-10, + "loss": 0.3298, + "step": 11863 + }, + { + "epoch": 2.9839034205231387, + "grad_norm": 0.29187971353530884, + "learning_rate": 8.769663081559243e-10, + "loss": 0.3112, + "step": 11864 + }, + { + "epoch": 2.984154929577465, + "grad_norm": 0.2968631684780121, + "learning_rate": 8.497759843667519e-10, + "loss": 0.3326, + "step": 11865 + }, + { + "epoch": 2.984406438631791, + "grad_norm": 0.3074916899204254, + "learning_rate": 8.230138065196125e-10, + "loss": 0.3237, + "step": 11866 + }, + { + "epoch": 2.9846579476861166, + "grad_norm": 0.2778856158256531, + "learning_rate": 7.966797769065615e-10, + "loss": 0.3291, + "step": 11867 + }, + { + "epoch": 2.984909456740443, + "grad_norm": 0.2621375620365143, + "learning_rate": 7.707738977824619e-10, + "loss": 0.3146, + "step": 11868 + }, + { + "epoch": 2.9851609657947686, + "grad_norm": 0.25857844948768616, + "learning_rate": 7.452961713672046e-10, + "loss": 0.3171, + "step": 11869 + }, + { + "epoch": 2.9854124748490944, + "grad_norm": 0.2810332775115967, + "learning_rate": 7.202465998412678e-10, + "loss": 0.3306, + "step": 11870 + }, + { + "epoch": 2.9856639839034207, + "grad_norm": 0.2880176603794098, + "learning_rate": 6.956251853512675e-10, + "loss": 0.3065, + "step": 11871 + }, + { + "epoch": 2.9859154929577465, + "grad_norm": 0.2959819436073303, + "learning_rate": 6.714319300055172e-10, + "loss": 0.332, + "step": 11872 + }, + { + "epoch": 2.9861670020120723, + "grad_norm": 0.2593652606010437, + "learning_rate": 6.476668358762483e-10, + "loss": 0.3388, + "step": 11873 + }, + { + "epoch": 2.9864185110663986, + "grad_norm": 0.28062987327575684, + "learning_rate": 6.243299049979445e-10, + "loss": 0.3382, + "step": 11874 + }, + { + "epoch": 2.9866700201207244, + "grad_norm": 0.27674224972724915, + "learning_rate": 6.014211393695623e-10, + "loss": 0.3121, + "step": 11875 + }, + { + "epoch": 2.98692152917505, + "grad_norm": 0.2833153009414673, + "learning_rate": 5.789405409539761e-10, + "loss": 0.3147, + "step": 11876 + }, + { + "epoch": 2.9871730382293764, + "grad_norm": 0.2868058383464813, + "learning_rate": 5.568881116752023e-10, + "loss": 0.3182, + "step": 11877 + }, + { + "epoch": 2.987424547283702, + "grad_norm": 0.2811480760574341, + "learning_rate": 5.352638534228405e-10, + "loss": 0.2941, + "step": 11878 + }, + { + "epoch": 2.987676056338028, + "grad_norm": 0.29555627703666687, + "learning_rate": 5.140677680487427e-10, + "loss": 0.3058, + "step": 11879 + }, + { + "epoch": 2.9879275653923543, + "grad_norm": 0.2923000752925873, + "learning_rate": 4.932998573681236e-10, + "loss": 0.3494, + "step": 11880 + }, + { + "epoch": 2.98817907444668, + "grad_norm": 0.27389127016067505, + "learning_rate": 4.729601231590053e-10, + "loss": 0.3071, + "step": 11881 + }, + { + "epoch": 2.988430583501006, + "grad_norm": 0.27962055802345276, + "learning_rate": 4.5304856716443803e-10, + "loss": 0.3109, + "step": 11882 + }, + { + "epoch": 2.988682092555332, + "grad_norm": 0.30079811811447144, + "learning_rate": 4.3356519108916914e-10, + "loss": 0.3221, + "step": 11883 + }, + { + "epoch": 2.988933601609658, + "grad_norm": 0.2800407409667969, + "learning_rate": 4.14509996601864e-10, + "loss": 0.3115, + "step": 11884 + }, + { + "epoch": 2.9891851106639837, + "grad_norm": 0.27782750129699707, + "learning_rate": 3.9588298533399515e-10, + "loss": 0.3386, + "step": 11885 + }, + { + "epoch": 2.98943661971831, + "grad_norm": 0.2804793119430542, + "learning_rate": 3.7768415888150835e-10, + "loss": 0.3244, + "step": 11886 + }, + { + "epoch": 2.989688128772636, + "grad_norm": 0.2901947796344757, + "learning_rate": 3.5991351880315663e-10, + "loss": 0.3037, + "step": 11887 + }, + { + "epoch": 2.9899396378269616, + "grad_norm": 0.2956394553184509, + "learning_rate": 3.4257106662050066e-10, + "loss": 0.3442, + "step": 11888 + }, + { + "epoch": 2.990191146881288, + "grad_norm": 0.3086378574371338, + "learning_rate": 3.2565680381846377e-10, + "loss": 0.3066, + "step": 11889 + }, + { + "epoch": 2.9904426559356136, + "grad_norm": 0.30673331022262573, + "learning_rate": 3.091707318464421e-10, + "loss": 0.3219, + "step": 11890 + }, + { + "epoch": 2.9906941649899395, + "grad_norm": 0.28686726093292236, + "learning_rate": 2.931128521160842e-10, + "loss": 0.3265, + "step": 11891 + }, + { + "epoch": 2.9909456740442657, + "grad_norm": 0.2747303247451782, + "learning_rate": 2.774831660018462e-10, + "loss": 0.3198, + "step": 11892 + }, + { + "epoch": 2.9911971830985915, + "grad_norm": 0.27951404452323914, + "learning_rate": 2.622816748437673e-10, + "loss": 0.3252, + "step": 11893 + }, + { + "epoch": 2.9914486921529173, + "grad_norm": 0.27234378457069397, + "learning_rate": 2.475083799424738e-10, + "loss": 0.2957, + "step": 11894 + }, + { + "epoch": 2.9917002012072436, + "grad_norm": 0.28382253646850586, + "learning_rate": 2.33163282564175e-10, + "loss": 0.3286, + "step": 11895 + }, + { + "epoch": 2.9919517102615694, + "grad_norm": 0.28595641255378723, + "learning_rate": 2.1924638393677755e-10, + "loss": 0.3089, + "step": 11896 + }, + { + "epoch": 2.992203219315895, + "grad_norm": 0.28392264246940613, + "learning_rate": 2.0575768525266105e-10, + "loss": 0.3039, + "step": 11897 + }, + { + "epoch": 2.9924547283702214, + "grad_norm": 0.307187020778656, + "learning_rate": 1.926971876664574e-10, + "loss": 0.3291, + "step": 11898 + }, + { + "epoch": 2.9927062374245472, + "grad_norm": 0.29181286692619324, + "learning_rate": 1.800648922967163e-10, + "loss": 0.3418, + "step": 11899 + }, + { + "epoch": 2.992957746478873, + "grad_norm": 0.2649078965187073, + "learning_rate": 1.6786080022646034e-10, + "loss": 0.3342, + "step": 11900 + }, + { + "epoch": 2.9932092555331993, + "grad_norm": 0.2709779739379883, + "learning_rate": 1.5608491249929913e-10, + "loss": 0.3176, + "step": 11901 + }, + { + "epoch": 2.993460764587525, + "grad_norm": 0.2838386595249176, + "learning_rate": 1.4473723012498053e-10, + "loss": 0.3059, + "step": 11902 + }, + { + "epoch": 2.993712273641851, + "grad_norm": 0.2875151038169861, + "learning_rate": 1.338177540749497e-10, + "loss": 0.3346, + "step": 11903 + }, + { + "epoch": 2.993963782696177, + "grad_norm": 0.29026007652282715, + "learning_rate": 1.2332648528401436e-10, + "loss": 0.3377, + "step": 11904 + }, + { + "epoch": 2.994215291750503, + "grad_norm": 0.29916590452194214, + "learning_rate": 1.1326342465145523e-10, + "loss": 0.3209, + "step": 11905 + }, + { + "epoch": 2.9944668008048287, + "grad_norm": 0.28423720598220825, + "learning_rate": 1.0362857303825025e-10, + "loss": 0.3078, + "step": 11906 + }, + { + "epoch": 2.994718309859155, + "grad_norm": 0.2835725247859955, + "learning_rate": 9.442193127040533e-11, + "loss": 0.318, + "step": 11907 + }, + { + "epoch": 2.994969818913481, + "grad_norm": 0.27757924795150757, + "learning_rate": 8.564350013617884e-11, + "loss": 0.3137, + "step": 11908 + }, + { + "epoch": 2.9952213279678066, + "grad_norm": 0.28157174587249756, + "learning_rate": 7.729328038663663e-11, + "loss": 0.3522, + "step": 11909 + }, + { + "epoch": 2.995472837022133, + "grad_norm": 0.3048500716686249, + "learning_rate": 6.937127273787258e-11, + "loss": 0.3175, + "step": 11910 + }, + { + "epoch": 2.9957243460764587, + "grad_norm": 0.28835025429725647, + "learning_rate": 6.187747786767783e-11, + "loss": 0.3322, + "step": 11911 + }, + { + "epoch": 2.9959758551307845, + "grad_norm": 0.2930144965648651, + "learning_rate": 5.4811896418316414e-11, + "loss": 0.3168, + "step": 11912 + }, + { + "epoch": 2.9962273641851107, + "grad_norm": 0.2962915599346161, + "learning_rate": 4.817452899485986e-11, + "loss": 0.3037, + "step": 11913 + }, + { + "epoch": 2.9964788732394365, + "grad_norm": 0.2874841094017029, + "learning_rate": 4.1965376165742365e-11, + "loss": 0.3398, + "step": 11914 + }, + { + "epoch": 2.9967303822937623, + "grad_norm": 0.2807866632938385, + "learning_rate": 3.618443846276076e-11, + "loss": 0.3102, + "step": 11915 + }, + { + "epoch": 2.9969818913480886, + "grad_norm": 0.27110809087753296, + "learning_rate": 3.0831716380519406e-11, + "loss": 0.3245, + "step": 11916 + }, + { + "epoch": 2.9972334004024144, + "grad_norm": 0.30204376578330994, + "learning_rate": 2.5907210378095514e-11, + "loss": 0.3111, + "step": 11917 + }, + { + "epoch": 2.99748490945674, + "grad_norm": 0.29356303811073303, + "learning_rate": 2.141092087681873e-11, + "loss": 0.3641, + "step": 11918 + }, + { + "epoch": 2.9977364185110664, + "grad_norm": 0.3049434721469879, + "learning_rate": 1.734284826193644e-11, + "loss": 0.3259, + "step": 11919 + }, + { + "epoch": 2.9979879275653922, + "grad_norm": 0.28138816356658936, + "learning_rate": 1.3702992882058675e-11, + "loss": 0.3219, + "step": 11920 + }, + { + "epoch": 2.998239436619718, + "grad_norm": 0.29172345995903015, + "learning_rate": 1.049135504804788e-11, + "loss": 0.3215, + "step": 11921 + }, + { + "epoch": 2.9984909456740443, + "grad_norm": 0.28759902715682983, + "learning_rate": 7.70793503634959e-12, + "loss": 0.3142, + "step": 11922 + }, + { + "epoch": 2.99874245472837, + "grad_norm": 0.3014575242996216, + "learning_rate": 5.352733084551531e-12, + "loss": 0.3054, + "step": 11923 + }, + { + "epoch": 2.9989939637826963, + "grad_norm": 0.2920631766319275, + "learning_rate": 3.4257493941591837e-12, + "loss": 0.3298, + "step": 11924 + }, + { + "epoch": 2.999245472837022, + "grad_norm": 0.2961787283420563, + "learning_rate": 1.926984130595777e-12, + "loss": 0.3011, + "step": 11925 + }, + { + "epoch": 2.999496981891348, + "grad_norm": 0.28382429480552673, + "learning_rate": 8.564374220920713e-13, + "loss": 0.3406, + "step": 11926 + }, + { + "epoch": 2.999748490945674, + "grad_norm": 0.2791459858417511, + "learning_rate": 2.1410936024146568e-13, + "loss": 0.3243, + "step": 11927 + }, + { + "epoch": 3.0, + "grad_norm": 0.2965925633907318, + "learning_rate": 0.0, + "loss": 0.2831, + "step": 11928 + }, + { + "epoch": 3.0, + "step": 11928, + "total_flos": 1.1291892680425472e+16, + "train_loss": 0.36148936700982826, + "train_runtime": 214787.5482, + "train_samples_per_second": 5.331, + "train_steps_per_second": 0.056 + } + ], + "logging_steps": 1.0, + "max_steps": 11928, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.1291892680425472e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}