diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100755--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,25256 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.3508515815085158, + "eval_steps": 500, + "global_step": 3605, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 9.732360097323601e-05, + "grad_norm": 16.13226202148005, + "learning_rate": 3.2362459546925574e-08, + "loss": 1.1997, + "step": 1 + }, + { + "epoch": 0.00019464720194647202, + "grad_norm": 15.765097511926365, + "learning_rate": 6.472491909385115e-08, + "loss": 1.384, + "step": 2 + }, + { + "epoch": 0.00029197080291970805, + "grad_norm": 16.64113665586635, + "learning_rate": 9.70873786407767e-08, + "loss": 1.2291, + "step": 3 + }, + { + "epoch": 0.00038929440389294404, + "grad_norm": 20.34864047521242, + "learning_rate": 1.294498381877023e-07, + "loss": 0.9025, + "step": 4 + }, + { + "epoch": 0.00048661800486618007, + "grad_norm": 28.710711096046108, + "learning_rate": 1.6181229773462782e-07, + "loss": 1.0305, + "step": 5 + }, + { + "epoch": 0.0005839416058394161, + "grad_norm": 21.945801992582915, + "learning_rate": 1.941747572815534e-07, + "loss": 1.0979, + "step": 6 + }, + { + "epoch": 0.0006812652068126521, + "grad_norm": 23.947905905644966, + "learning_rate": 2.26537216828479e-07, + "loss": 1.1909, + "step": 7 + }, + { + "epoch": 0.0007785888077858881, + "grad_norm": 19.835016686730835, + "learning_rate": 2.588996763754046e-07, + "loss": 1.2083, + "step": 8 + }, + { + "epoch": 0.0008759124087591241, + "grad_norm": 16.926846352507788, + "learning_rate": 2.9126213592233014e-07, + "loss": 1.2369, + "step": 9 + }, + { + "epoch": 0.0009732360097323601, + "grad_norm": 21.349924470647284, + "learning_rate": 3.2362459546925565e-07, + "loss": 1.0052, + "step": 10 + }, + { + "epoch": 0.0010705596107055961, + "grad_norm": 25.127579741628022, + "learning_rate": 3.5598705501618125e-07, + "loss": 1.2631, + "step": 11 + }, + { + "epoch": 0.0011678832116788322, + "grad_norm": 12.524049131196549, + "learning_rate": 3.883495145631068e-07, + "loss": 1.0884, + "step": 12 + }, + { + "epoch": 0.001265206812652068, + "grad_norm": 20.706648432487587, + "learning_rate": 4.207119741100324e-07, + "loss": 1.1469, + "step": 13 + }, + { + "epoch": 0.0013625304136253042, + "grad_norm": 17.655230197318655, + "learning_rate": 4.53074433656958e-07, + "loss": 1.2922, + "step": 14 + }, + { + "epoch": 0.00145985401459854, + "grad_norm": 16.550170455008725, + "learning_rate": 4.854368932038835e-07, + "loss": 1.1792, + "step": 15 + }, + { + "epoch": 0.0015571776155717761, + "grad_norm": 24.456798845425887, + "learning_rate": 5.177993527508092e-07, + "loss": 1.0804, + "step": 16 + }, + { + "epoch": 0.0016545012165450122, + "grad_norm": 14.659117460865279, + "learning_rate": 5.501618122977346e-07, + "loss": 1.0973, + "step": 17 + }, + { + "epoch": 0.0017518248175182481, + "grad_norm": 15.324823146378344, + "learning_rate": 5.825242718446603e-07, + "loss": 0.9791, + "step": 18 + }, + { + "epoch": 0.0018491484184914842, + "grad_norm": 12.483869597287145, + "learning_rate": 6.148867313915858e-07, + "loss": 1.0829, + "step": 19 + }, + { + "epoch": 0.0019464720194647203, + "grad_norm": 11.921211994178957, + "learning_rate": 6.472491909385113e-07, + "loss": 0.6862, + "step": 20 + }, + { + "epoch": 0.0020437956204379564, + "grad_norm": 14.53279456676939, + "learning_rate": 6.79611650485437e-07, + "loss": 0.7814, + "step": 21 + }, + { + "epoch": 0.0021411192214111923, + "grad_norm": 15.68359520937104, + "learning_rate": 7.119741100323625e-07, + "loss": 0.883, + "step": 22 + }, + { + "epoch": 0.002238442822384428, + "grad_norm": 14.062468532950906, + "learning_rate": 7.443365695792882e-07, + "loss": 1.0087, + "step": 23 + }, + { + "epoch": 0.0023357664233576644, + "grad_norm": 11.150778403716444, + "learning_rate": 7.766990291262136e-07, + "loss": 0.4884, + "step": 24 + }, + { + "epoch": 0.0024330900243309003, + "grad_norm": 7.740982223602688, + "learning_rate": 8.090614886731392e-07, + "loss": 0.8543, + "step": 25 + }, + { + "epoch": 0.002530413625304136, + "grad_norm": 6.4338060169141915, + "learning_rate": 8.414239482200648e-07, + "loss": 0.7948, + "step": 26 + }, + { + "epoch": 0.002627737226277372, + "grad_norm": 6.227022582398367, + "learning_rate": 8.737864077669904e-07, + "loss": 0.7814, + "step": 27 + }, + { + "epoch": 0.0027250608272506084, + "grad_norm": 7.989531820662516, + "learning_rate": 9.06148867313916e-07, + "loss": 0.5645, + "step": 28 + }, + { + "epoch": 0.0028223844282238442, + "grad_norm": 6.4745089193753, + "learning_rate": 9.385113268608415e-07, + "loss": 0.6802, + "step": 29 + }, + { + "epoch": 0.00291970802919708, + "grad_norm": 8.23650018531745, + "learning_rate": 9.70873786407767e-07, + "loss": 0.6218, + "step": 30 + }, + { + "epoch": 0.0030170316301703164, + "grad_norm": 4.915479010119541, + "learning_rate": 1.0032362459546926e-06, + "loss": 0.8879, + "step": 31 + }, + { + "epoch": 0.0031143552311435523, + "grad_norm": 4.288138757396447, + "learning_rate": 1.0355987055016184e-06, + "loss": 0.5917, + "step": 32 + }, + { + "epoch": 0.003211678832116788, + "grad_norm": 4.230901102531741, + "learning_rate": 1.0679611650485437e-06, + "loss": 0.7373, + "step": 33 + }, + { + "epoch": 0.0033090024330900245, + "grad_norm": 4.714303656539792, + "learning_rate": 1.1003236245954693e-06, + "loss": 0.5886, + "step": 34 + }, + { + "epoch": 0.0034063260340632603, + "grad_norm": 4.1204943469600925, + "learning_rate": 1.132686084142395e-06, + "loss": 0.5991, + "step": 35 + }, + { + "epoch": 0.0035036496350364962, + "grad_norm": 3.124375547961107, + "learning_rate": 1.1650485436893206e-06, + "loss": 0.432, + "step": 36 + }, + { + "epoch": 0.0036009732360097325, + "grad_norm": 3.741153837090354, + "learning_rate": 1.197411003236246e-06, + "loss": 0.6379, + "step": 37 + }, + { + "epoch": 0.0036982968369829684, + "grad_norm": 3.7740270813504506, + "learning_rate": 1.2297734627831717e-06, + "loss": 0.5595, + "step": 38 + }, + { + "epoch": 0.0037956204379562043, + "grad_norm": 4.783986424289694, + "learning_rate": 1.2621359223300972e-06, + "loss": 0.8717, + "step": 39 + }, + { + "epoch": 0.0038929440389294406, + "grad_norm": 4.242597978097827, + "learning_rate": 1.2944983818770226e-06, + "loss": 0.6632, + "step": 40 + }, + { + "epoch": 0.0039902676399026765, + "grad_norm": 4.309602952976607, + "learning_rate": 1.3268608414239483e-06, + "loss": 0.7191, + "step": 41 + }, + { + "epoch": 0.004087591240875913, + "grad_norm": 4.136462382872819, + "learning_rate": 1.359223300970874e-06, + "loss": 0.6782, + "step": 42 + }, + { + "epoch": 0.004184914841849148, + "grad_norm": 4.2148643401229, + "learning_rate": 1.3915857605177997e-06, + "loss": 0.8932, + "step": 43 + }, + { + "epoch": 0.0042822384428223845, + "grad_norm": 3.829331188520966, + "learning_rate": 1.423948220064725e-06, + "loss": 0.4697, + "step": 44 + }, + { + "epoch": 0.004379562043795621, + "grad_norm": 3.4564347781684557, + "learning_rate": 1.4563106796116506e-06, + "loss": 0.3377, + "step": 45 + }, + { + "epoch": 0.004476885644768856, + "grad_norm": 3.319649807488789, + "learning_rate": 1.4886731391585763e-06, + "loss": 0.4589, + "step": 46 + }, + { + "epoch": 0.0045742092457420926, + "grad_norm": 3.8856546910308034, + "learning_rate": 1.5210355987055017e-06, + "loss": 0.8413, + "step": 47 + }, + { + "epoch": 0.004671532846715329, + "grad_norm": 3.7955924171570605, + "learning_rate": 1.5533980582524272e-06, + "loss": 0.588, + "step": 48 + }, + { + "epoch": 0.004768856447688564, + "grad_norm": 4.5762685715882805, + "learning_rate": 1.585760517799353e-06, + "loss": 0.6472, + "step": 49 + }, + { + "epoch": 0.004866180048661801, + "grad_norm": 4.284420204063246, + "learning_rate": 1.6181229773462783e-06, + "loss": 0.5233, + "step": 50 + }, + { + "epoch": 0.004963503649635037, + "grad_norm": 4.0399534913964645, + "learning_rate": 1.650485436893204e-06, + "loss": 0.6737, + "step": 51 + }, + { + "epoch": 0.005060827250608272, + "grad_norm": 4.850258079033273, + "learning_rate": 1.6828478964401297e-06, + "loss": 0.5017, + "step": 52 + }, + { + "epoch": 0.005158150851581509, + "grad_norm": 3.289730774319516, + "learning_rate": 1.715210355987055e-06, + "loss": 0.6378, + "step": 53 + }, + { + "epoch": 0.005255474452554744, + "grad_norm": 3.116783938182044, + "learning_rate": 1.7475728155339808e-06, + "loss": 0.5681, + "step": 54 + }, + { + "epoch": 0.00535279805352798, + "grad_norm": 3.5896487509946677, + "learning_rate": 1.7799352750809063e-06, + "loss": 0.5222, + "step": 55 + }, + { + "epoch": 0.005450121654501217, + "grad_norm": 3.3627737905222146, + "learning_rate": 1.812297734627832e-06, + "loss": 0.351, + "step": 56 + }, + { + "epoch": 0.005547445255474452, + "grad_norm": 3.405981770724818, + "learning_rate": 1.8446601941747574e-06, + "loss": 0.5832, + "step": 57 + }, + { + "epoch": 0.0056447688564476885, + "grad_norm": 3.231134680455488, + "learning_rate": 1.877022653721683e-06, + "loss": 0.5558, + "step": 58 + }, + { + "epoch": 0.005742092457420925, + "grad_norm": 4.2963387449464605, + "learning_rate": 1.9093851132686085e-06, + "loss": 0.7544, + "step": 59 + }, + { + "epoch": 0.00583941605839416, + "grad_norm": 3.3678084152804315, + "learning_rate": 1.941747572815534e-06, + "loss": 0.554, + "step": 60 + }, + { + "epoch": 0.0059367396593673965, + "grad_norm": 3.635756089652443, + "learning_rate": 1.9741100323624596e-06, + "loss": 0.5312, + "step": 61 + }, + { + "epoch": 0.006034063260340633, + "grad_norm": 3.91764256649437, + "learning_rate": 2.006472491909385e-06, + "loss": 0.4329, + "step": 62 + }, + { + "epoch": 0.006131386861313868, + "grad_norm": 3.4866607421863565, + "learning_rate": 2.0388349514563107e-06, + "loss": 0.4453, + "step": 63 + }, + { + "epoch": 0.006228710462287105, + "grad_norm": 2.9369425143161147, + "learning_rate": 2.0711974110032367e-06, + "loss": 0.467, + "step": 64 + }, + { + "epoch": 0.006326034063260341, + "grad_norm": 3.0906723589687024, + "learning_rate": 2.103559870550162e-06, + "loss": 0.3917, + "step": 65 + }, + { + "epoch": 0.006423357664233576, + "grad_norm": 3.5121616512799747, + "learning_rate": 2.1359223300970874e-06, + "loss": 0.6428, + "step": 66 + }, + { + "epoch": 0.006520681265206813, + "grad_norm": 3.470270871630247, + "learning_rate": 2.1682847896440134e-06, + "loss": 0.586, + "step": 67 + }, + { + "epoch": 0.006618004866180049, + "grad_norm": 2.8689679430782498, + "learning_rate": 2.2006472491909385e-06, + "loss": 0.2938, + "step": 68 + }, + { + "epoch": 0.006715328467153284, + "grad_norm": 4.115573400175418, + "learning_rate": 2.2330097087378645e-06, + "loss": 0.3855, + "step": 69 + }, + { + "epoch": 0.006812652068126521, + "grad_norm": 3.903319335204406, + "learning_rate": 2.26537216828479e-06, + "loss": 0.6272, + "step": 70 + }, + { + "epoch": 0.006909975669099757, + "grad_norm": 2.649165320750572, + "learning_rate": 2.297734627831715e-06, + "loss": 0.5229, + "step": 71 + }, + { + "epoch": 0.0070072992700729924, + "grad_norm": 2.8543884488184235, + "learning_rate": 2.330097087378641e-06, + "loss": 0.4006, + "step": 72 + }, + { + "epoch": 0.007104622871046229, + "grad_norm": 2.9817247056794134, + "learning_rate": 2.3624595469255667e-06, + "loss": 0.2331, + "step": 73 + }, + { + "epoch": 0.007201946472019465, + "grad_norm": 3.592880940053797, + "learning_rate": 2.394822006472492e-06, + "loss": 0.4889, + "step": 74 + }, + { + "epoch": 0.0072992700729927005, + "grad_norm": 2.89844013224274, + "learning_rate": 2.427184466019418e-06, + "loss": 0.4711, + "step": 75 + }, + { + "epoch": 0.007396593673965937, + "grad_norm": 2.6071345596032134, + "learning_rate": 2.4595469255663434e-06, + "loss": 0.4844, + "step": 76 + }, + { + "epoch": 0.007493917274939173, + "grad_norm": 2.9053930844585776, + "learning_rate": 2.491909385113269e-06, + "loss": 0.5163, + "step": 77 + }, + { + "epoch": 0.0075912408759124085, + "grad_norm": 3.4016540038418115, + "learning_rate": 2.5242718446601945e-06, + "loss": 0.5852, + "step": 78 + }, + { + "epoch": 0.007688564476885645, + "grad_norm": 2.7133170026932887, + "learning_rate": 2.55663430420712e-06, + "loss": 0.4934, + "step": 79 + }, + { + "epoch": 0.007785888077858881, + "grad_norm": 3.2321439410345585, + "learning_rate": 2.588996763754045e-06, + "loss": 0.62, + "step": 80 + }, + { + "epoch": 0.007883211678832117, + "grad_norm": 2.6835948161160545, + "learning_rate": 2.621359223300971e-06, + "loss": 0.4689, + "step": 81 + }, + { + "epoch": 0.007980535279805353, + "grad_norm": 4.716894934604404, + "learning_rate": 2.6537216828478967e-06, + "loss": 0.3364, + "step": 82 + }, + { + "epoch": 0.00807785888077859, + "grad_norm": 2.6507857723180646, + "learning_rate": 2.686084142394822e-06, + "loss": 0.3785, + "step": 83 + }, + { + "epoch": 0.008175182481751826, + "grad_norm": 2.356714630861861, + "learning_rate": 2.718446601941748e-06, + "loss": 0.2591, + "step": 84 + }, + { + "epoch": 0.00827250608272506, + "grad_norm": 2.755477478688418, + "learning_rate": 2.7508090614886734e-06, + "loss": 0.4762, + "step": 85 + }, + { + "epoch": 0.008369829683698296, + "grad_norm": 3.7771581783688837, + "learning_rate": 2.7831715210355993e-06, + "loss": 0.4627, + "step": 86 + }, + { + "epoch": 0.008467153284671533, + "grad_norm": 2.8568450908810257, + "learning_rate": 2.8155339805825245e-06, + "loss": 0.4322, + "step": 87 + }, + { + "epoch": 0.008564476885644769, + "grad_norm": 2.914756058289183, + "learning_rate": 2.84789644012945e-06, + "loss": 0.4835, + "step": 88 + }, + { + "epoch": 0.008661800486618005, + "grad_norm": 2.414182197047686, + "learning_rate": 2.880258899676376e-06, + "loss": 0.493, + "step": 89 + }, + { + "epoch": 0.008759124087591242, + "grad_norm": 2.8597853736106975, + "learning_rate": 2.912621359223301e-06, + "loss": 0.6063, + "step": 90 + }, + { + "epoch": 0.008856447688564476, + "grad_norm": 2.4567808863650007, + "learning_rate": 2.9449838187702267e-06, + "loss": 0.5874, + "step": 91 + }, + { + "epoch": 0.008953771289537713, + "grad_norm": 2.819434031784131, + "learning_rate": 2.9773462783171527e-06, + "loss": 0.552, + "step": 92 + }, + { + "epoch": 0.009051094890510949, + "grad_norm": 1.9840396387462764, + "learning_rate": 3.0097087378640778e-06, + "loss": 0.3736, + "step": 93 + }, + { + "epoch": 0.009148418491484185, + "grad_norm": 2.52047300259283, + "learning_rate": 3.0420711974110033e-06, + "loss": 0.407, + "step": 94 + }, + { + "epoch": 0.009245742092457421, + "grad_norm": 3.140839526692518, + "learning_rate": 3.0744336569579293e-06, + "loss": 0.6513, + "step": 95 + }, + { + "epoch": 0.009343065693430658, + "grad_norm": 3.1368865731879554, + "learning_rate": 3.1067961165048544e-06, + "loss": 0.4804, + "step": 96 + }, + { + "epoch": 0.009440389294403892, + "grad_norm": 2.6987222968513196, + "learning_rate": 3.13915857605178e-06, + "loss": 0.4228, + "step": 97 + }, + { + "epoch": 0.009537712895377129, + "grad_norm": 2.5779408707034026, + "learning_rate": 3.171521035598706e-06, + "loss": 0.4654, + "step": 98 + }, + { + "epoch": 0.009635036496350365, + "grad_norm": 2.5189587792888934, + "learning_rate": 3.2038834951456315e-06, + "loss": 0.5465, + "step": 99 + }, + { + "epoch": 0.009732360097323601, + "grad_norm": 2.457408493992738, + "learning_rate": 3.2362459546925567e-06, + "loss": 0.5077, + "step": 100 + }, + { + "epoch": 0.009829683698296838, + "grad_norm": 2.445932328031196, + "learning_rate": 3.2686084142394826e-06, + "loss": 0.492, + "step": 101 + }, + { + "epoch": 0.009927007299270074, + "grad_norm": 2.3199141960061915, + "learning_rate": 3.300970873786408e-06, + "loss": 0.4432, + "step": 102 + }, + { + "epoch": 0.010024330900243308, + "grad_norm": 3.88769555780582, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.3684, + "step": 103 + }, + { + "epoch": 0.010121654501216545, + "grad_norm": 2.63905676146042, + "learning_rate": 3.3656957928802593e-06, + "loss": 0.4238, + "step": 104 + }, + { + "epoch": 0.010218978102189781, + "grad_norm": 3.0073749174392885, + "learning_rate": 3.398058252427185e-06, + "loss": 0.4655, + "step": 105 + }, + { + "epoch": 0.010316301703163017, + "grad_norm": 2.613524831872459, + "learning_rate": 3.43042071197411e-06, + "loss": 0.4948, + "step": 106 + }, + { + "epoch": 0.010413625304136254, + "grad_norm": 2.4293628733346764, + "learning_rate": 3.462783171521036e-06, + "loss": 0.3717, + "step": 107 + }, + { + "epoch": 0.010510948905109488, + "grad_norm": 3.3036504610837016, + "learning_rate": 3.4951456310679615e-06, + "loss": 0.4939, + "step": 108 + }, + { + "epoch": 0.010608272506082725, + "grad_norm": 2.6808221933664846, + "learning_rate": 3.5275080906148866e-06, + "loss": 0.4809, + "step": 109 + }, + { + "epoch": 0.01070559610705596, + "grad_norm": 2.853958419293739, + "learning_rate": 3.5598705501618126e-06, + "loss": 0.4066, + "step": 110 + }, + { + "epoch": 0.010802919708029197, + "grad_norm": 5.3412930378250145, + "learning_rate": 3.592233009708738e-06, + "loss": 0.3599, + "step": 111 + }, + { + "epoch": 0.010900243309002433, + "grad_norm": 2.983669976646381, + "learning_rate": 3.624595469255664e-06, + "loss": 0.6187, + "step": 112 + }, + { + "epoch": 0.01099756690997567, + "grad_norm": 3.388543821878077, + "learning_rate": 3.6569579288025893e-06, + "loss": 0.717, + "step": 113 + }, + { + "epoch": 0.011094890510948904, + "grad_norm": 3.0720120062792127, + "learning_rate": 3.689320388349515e-06, + "loss": 0.5057, + "step": 114 + }, + { + "epoch": 0.01119221411192214, + "grad_norm": 2.521868238475485, + "learning_rate": 3.721682847896441e-06, + "loss": 0.4308, + "step": 115 + }, + { + "epoch": 0.011289537712895377, + "grad_norm": 2.641085251645149, + "learning_rate": 3.754045307443366e-06, + "loss": 0.4047, + "step": 116 + }, + { + "epoch": 0.011386861313868613, + "grad_norm": 2.6936547530255828, + "learning_rate": 3.7864077669902915e-06, + "loss": 0.5548, + "step": 117 + }, + { + "epoch": 0.01148418491484185, + "grad_norm": 5.599830434139348, + "learning_rate": 3.818770226537217e-06, + "loss": 0.5338, + "step": 118 + }, + { + "epoch": 0.011581508515815086, + "grad_norm": 2.6372065340185378, + "learning_rate": 3.851132686084142e-06, + "loss": 0.4833, + "step": 119 + }, + { + "epoch": 0.01167883211678832, + "grad_norm": 2.555049765563167, + "learning_rate": 3.883495145631068e-06, + "loss": 0.4295, + "step": 120 + }, + { + "epoch": 0.011776155717761557, + "grad_norm": 2.22725048478721, + "learning_rate": 3.915857605177994e-06, + "loss": 0.4074, + "step": 121 + }, + { + "epoch": 0.011873479318734793, + "grad_norm": 3.0093045583939984, + "learning_rate": 3.948220064724919e-06, + "loss": 0.7168, + "step": 122 + }, + { + "epoch": 0.01197080291970803, + "grad_norm": 2.8800338131191223, + "learning_rate": 3.980582524271845e-06, + "loss": 0.3826, + "step": 123 + }, + { + "epoch": 0.012068126520681266, + "grad_norm": 2.3197904571086974, + "learning_rate": 4.01294498381877e-06, + "loss": 0.2584, + "step": 124 + }, + { + "epoch": 0.012165450121654502, + "grad_norm": 2.929540360888414, + "learning_rate": 4.045307443365696e-06, + "loss": 0.4617, + "step": 125 + }, + { + "epoch": 0.012262773722627737, + "grad_norm": 2.5602803735383137, + "learning_rate": 4.0776699029126215e-06, + "loss": 0.2561, + "step": 126 + }, + { + "epoch": 0.012360097323600973, + "grad_norm": 2.676345297957673, + "learning_rate": 4.1100323624595475e-06, + "loss": 0.2996, + "step": 127 + }, + { + "epoch": 0.01245742092457421, + "grad_norm": 1.9047794610871986, + "learning_rate": 4.1423948220064734e-06, + "loss": 0.3475, + "step": 128 + }, + { + "epoch": 0.012554744525547445, + "grad_norm": 2.9014607006450555, + "learning_rate": 4.1747572815533986e-06, + "loss": 0.4748, + "step": 129 + }, + { + "epoch": 0.012652068126520682, + "grad_norm": 2.2992367182815987, + "learning_rate": 4.207119741100324e-06, + "loss": 0.3465, + "step": 130 + }, + { + "epoch": 0.012749391727493918, + "grad_norm": 2.668874383033437, + "learning_rate": 4.23948220064725e-06, + "loss": 0.6119, + "step": 131 + }, + { + "epoch": 0.012846715328467153, + "grad_norm": 2.69106703615133, + "learning_rate": 4.271844660194175e-06, + "loss": 0.4743, + "step": 132 + }, + { + "epoch": 0.012944038929440389, + "grad_norm": 2.972314561813759, + "learning_rate": 4.304207119741101e-06, + "loss": 0.5766, + "step": 133 + }, + { + "epoch": 0.013041362530413625, + "grad_norm": 2.7487017428059635, + "learning_rate": 4.336569579288027e-06, + "loss": 0.5818, + "step": 134 + }, + { + "epoch": 0.013138686131386862, + "grad_norm": 3.1117207482379663, + "learning_rate": 4.368932038834952e-06, + "loss": 0.6468, + "step": 135 + }, + { + "epoch": 0.013236009732360098, + "grad_norm": 2.781796948090657, + "learning_rate": 4.401294498381877e-06, + "loss": 0.7209, + "step": 136 + }, + { + "epoch": 0.013333333333333334, + "grad_norm": 2.5480533986327556, + "learning_rate": 4.433656957928803e-06, + "loss": 0.5907, + "step": 137 + }, + { + "epoch": 0.013430656934306569, + "grad_norm": 2.054397852683208, + "learning_rate": 4.466019417475729e-06, + "loss": 0.4079, + "step": 138 + }, + { + "epoch": 0.013527980535279805, + "grad_norm": 2.2564046621809037, + "learning_rate": 4.498381877022654e-06, + "loss": 0.4, + "step": 139 + }, + { + "epoch": 0.013625304136253041, + "grad_norm": 2.8739841159071022, + "learning_rate": 4.53074433656958e-06, + "loss": 0.5819, + "step": 140 + }, + { + "epoch": 0.013722627737226278, + "grad_norm": 2.6418540847993657, + "learning_rate": 4.563106796116505e-06, + "loss": 0.589, + "step": 141 + }, + { + "epoch": 0.013819951338199514, + "grad_norm": 2.431908870746442, + "learning_rate": 4.59546925566343e-06, + "loss": 0.5468, + "step": 142 + }, + { + "epoch": 0.013917274939172749, + "grad_norm": 4.44933942542394, + "learning_rate": 4.627831715210356e-06, + "loss": 0.3846, + "step": 143 + }, + { + "epoch": 0.014014598540145985, + "grad_norm": 2.2469929628351126, + "learning_rate": 4.660194174757282e-06, + "loss": 0.3047, + "step": 144 + }, + { + "epoch": 0.014111922141119221, + "grad_norm": 2.8361034502388205, + "learning_rate": 4.6925566343042074e-06, + "loss": 0.4186, + "step": 145 + }, + { + "epoch": 0.014209245742092457, + "grad_norm": 2.485184255788147, + "learning_rate": 4.724919093851133e-06, + "loss": 0.455, + "step": 146 + }, + { + "epoch": 0.014306569343065694, + "grad_norm": 2.677307495548506, + "learning_rate": 4.7572815533980585e-06, + "loss": 0.6346, + "step": 147 + }, + { + "epoch": 0.01440389294403893, + "grad_norm": 2.9440091029213034, + "learning_rate": 4.789644012944984e-06, + "loss": 0.4961, + "step": 148 + }, + { + "epoch": 0.014501216545012165, + "grad_norm": 2.6810327828724723, + "learning_rate": 4.82200647249191e-06, + "loss": 0.3754, + "step": 149 + }, + { + "epoch": 0.014598540145985401, + "grad_norm": 2.519257002697837, + "learning_rate": 4.854368932038836e-06, + "loss": 0.249, + "step": 150 + }, + { + "epoch": 0.014695863746958637, + "grad_norm": 2.8041238457488578, + "learning_rate": 4.886731391585761e-06, + "loss": 0.3117, + "step": 151 + }, + { + "epoch": 0.014793187347931874, + "grad_norm": 2.363481194731433, + "learning_rate": 4.919093851132687e-06, + "loss": 0.3325, + "step": 152 + }, + { + "epoch": 0.01489051094890511, + "grad_norm": 3.078347599868747, + "learning_rate": 4.951456310679612e-06, + "loss": 0.3569, + "step": 153 + }, + { + "epoch": 0.014987834549878346, + "grad_norm": 3.2926461094535515, + "learning_rate": 4.983818770226538e-06, + "loss": 0.716, + "step": 154 + }, + { + "epoch": 0.01508515815085158, + "grad_norm": 2.340052421830345, + "learning_rate": 5.016181229773464e-06, + "loss": 0.2642, + "step": 155 + }, + { + "epoch": 0.015182481751824817, + "grad_norm": 1.8915730140906823, + "learning_rate": 5.048543689320389e-06, + "loss": 0.3523, + "step": 156 + }, + { + "epoch": 0.015279805352798053, + "grad_norm": 4.2448533254564484, + "learning_rate": 5.080906148867314e-06, + "loss": 0.5185, + "step": 157 + }, + { + "epoch": 0.01537712895377129, + "grad_norm": 2.1172922256300333, + "learning_rate": 5.11326860841424e-06, + "loss": 0.3341, + "step": 158 + }, + { + "epoch": 0.015474452554744526, + "grad_norm": 2.7414250631657113, + "learning_rate": 5.145631067961165e-06, + "loss": 0.5965, + "step": 159 + }, + { + "epoch": 0.015571776155717762, + "grad_norm": 1.977804344185745, + "learning_rate": 5.17799352750809e-06, + "loss": 0.239, + "step": 160 + }, + { + "epoch": 0.015669099756690997, + "grad_norm": 2.771807640315191, + "learning_rate": 5.210355987055017e-06, + "loss": 0.4122, + "step": 161 + }, + { + "epoch": 0.015766423357664233, + "grad_norm": 1.9977073642008174, + "learning_rate": 5.242718446601942e-06, + "loss": 0.3423, + "step": 162 + }, + { + "epoch": 0.01586374695863747, + "grad_norm": 3.222730527079622, + "learning_rate": 5.275080906148867e-06, + "loss": 0.5647, + "step": 163 + }, + { + "epoch": 0.015961070559610706, + "grad_norm": 2.95441646694508, + "learning_rate": 5.307443365695793e-06, + "loss": 0.5198, + "step": 164 + }, + { + "epoch": 0.016058394160583942, + "grad_norm": 2.3346384576429116, + "learning_rate": 5.3398058252427185e-06, + "loss": 0.3516, + "step": 165 + }, + { + "epoch": 0.01615571776155718, + "grad_norm": 2.089159587923689, + "learning_rate": 5.372168284789644e-06, + "loss": 0.3704, + "step": 166 + }, + { + "epoch": 0.016253041362530415, + "grad_norm": 2.8135820638465088, + "learning_rate": 5.4045307443365705e-06, + "loss": 0.3729, + "step": 167 + }, + { + "epoch": 0.01635036496350365, + "grad_norm": 2.991259557993277, + "learning_rate": 5.436893203883496e-06, + "loss": 0.5622, + "step": 168 + }, + { + "epoch": 0.016447688564476887, + "grad_norm": 3.1512644455187857, + "learning_rate": 5.4692556634304216e-06, + "loss": 0.5915, + "step": 169 + }, + { + "epoch": 0.01654501216545012, + "grad_norm": 2.616126184062516, + "learning_rate": 5.501618122977347e-06, + "loss": 0.4252, + "step": 170 + }, + { + "epoch": 0.016642335766423356, + "grad_norm": 1.9958281517625203, + "learning_rate": 5.533980582524272e-06, + "loss": 0.3704, + "step": 171 + }, + { + "epoch": 0.016739659367396593, + "grad_norm": 2.470731302334384, + "learning_rate": 5.566343042071199e-06, + "loss": 0.4373, + "step": 172 + }, + { + "epoch": 0.01683698296836983, + "grad_norm": 2.583270308023139, + "learning_rate": 5.598705501618124e-06, + "loss": 0.4125, + "step": 173 + }, + { + "epoch": 0.016934306569343065, + "grad_norm": 1.9644684632241667, + "learning_rate": 5.631067961165049e-06, + "loss": 0.3522, + "step": 174 + }, + { + "epoch": 0.0170316301703163, + "grad_norm": 2.4207097357376046, + "learning_rate": 5.663430420711975e-06, + "loss": 0.3579, + "step": 175 + }, + { + "epoch": 0.017128953771289538, + "grad_norm": 2.3511041847292034, + "learning_rate": 5.6957928802589e-06, + "loss": 0.5412, + "step": 176 + }, + { + "epoch": 0.017226277372262774, + "grad_norm": 2.274427899539275, + "learning_rate": 5.728155339805825e-06, + "loss": 0.5353, + "step": 177 + }, + { + "epoch": 0.01732360097323601, + "grad_norm": 2.133749284526256, + "learning_rate": 5.760517799352752e-06, + "loss": 0.4392, + "step": 178 + }, + { + "epoch": 0.017420924574209247, + "grad_norm": 2.3097462109285787, + "learning_rate": 5.792880258899677e-06, + "loss": 0.4442, + "step": 179 + }, + { + "epoch": 0.017518248175182483, + "grad_norm": 2.2128802818602056, + "learning_rate": 5.825242718446602e-06, + "loss": 0.5635, + "step": 180 + }, + { + "epoch": 0.017615571776155716, + "grad_norm": 2.103405792854256, + "learning_rate": 5.857605177993528e-06, + "loss": 0.4533, + "step": 181 + }, + { + "epoch": 0.017712895377128952, + "grad_norm": 2.0565661990183597, + "learning_rate": 5.889967637540453e-06, + "loss": 0.3806, + "step": 182 + }, + { + "epoch": 0.01781021897810219, + "grad_norm": 2.179649872267064, + "learning_rate": 5.9223300970873785e-06, + "loss": 0.3842, + "step": 183 + }, + { + "epoch": 0.017907542579075425, + "grad_norm": 3.8333244047199146, + "learning_rate": 5.954692556634305e-06, + "loss": 0.3876, + "step": 184 + }, + { + "epoch": 0.01800486618004866, + "grad_norm": 2.2893517217095716, + "learning_rate": 5.9870550161812304e-06, + "loss": 0.4781, + "step": 185 + }, + { + "epoch": 0.018102189781021898, + "grad_norm": 1.6022498167897639, + "learning_rate": 6.0194174757281556e-06, + "loss": 0.2306, + "step": 186 + }, + { + "epoch": 0.018199513381995134, + "grad_norm": 2.32863493589546, + "learning_rate": 6.0517799352750815e-06, + "loss": 0.5139, + "step": 187 + }, + { + "epoch": 0.01829683698296837, + "grad_norm": 2.0789478938631314, + "learning_rate": 6.084142394822007e-06, + "loss": 0.2824, + "step": 188 + }, + { + "epoch": 0.018394160583941607, + "grad_norm": 1.7544615955949223, + "learning_rate": 6.116504854368932e-06, + "loss": 0.4172, + "step": 189 + }, + { + "epoch": 0.018491484184914843, + "grad_norm": 1.931043696572374, + "learning_rate": 6.148867313915859e-06, + "loss": 0.3584, + "step": 190 + }, + { + "epoch": 0.01858880778588808, + "grad_norm": 2.467258437370788, + "learning_rate": 6.181229773462784e-06, + "loss": 0.462, + "step": 191 + }, + { + "epoch": 0.018686131386861315, + "grad_norm": 2.1541091684996965, + "learning_rate": 6.213592233009709e-06, + "loss": 0.3967, + "step": 192 + }, + { + "epoch": 0.01878345498783455, + "grad_norm": 2.2330486922808395, + "learning_rate": 6.245954692556635e-06, + "loss": 0.5316, + "step": 193 + }, + { + "epoch": 0.018880778588807785, + "grad_norm": 2.3498262097642395, + "learning_rate": 6.27831715210356e-06, + "loss": 0.4815, + "step": 194 + }, + { + "epoch": 0.01897810218978102, + "grad_norm": 1.7045092076002246, + "learning_rate": 6.310679611650487e-06, + "loss": 0.3, + "step": 195 + }, + { + "epoch": 0.019075425790754257, + "grad_norm": 2.5703331850837023, + "learning_rate": 6.343042071197412e-06, + "loss": 0.4143, + "step": 196 + }, + { + "epoch": 0.019172749391727494, + "grad_norm": 2.6940646171495133, + "learning_rate": 6.375404530744337e-06, + "loss": 0.5463, + "step": 197 + }, + { + "epoch": 0.01927007299270073, + "grad_norm": 2.4185580273524847, + "learning_rate": 6.407766990291263e-06, + "loss": 0.5215, + "step": 198 + }, + { + "epoch": 0.019367396593673966, + "grad_norm": 2.6509824694985946, + "learning_rate": 6.440129449838188e-06, + "loss": 0.5286, + "step": 199 + }, + { + "epoch": 0.019464720194647202, + "grad_norm": 2.4807219128312767, + "learning_rate": 6.472491909385113e-06, + "loss": 0.3996, + "step": 200 + }, + { + "epoch": 0.01956204379562044, + "grad_norm": 2.651883834043772, + "learning_rate": 6.50485436893204e-06, + "loss": 0.3499, + "step": 201 + }, + { + "epoch": 0.019659367396593675, + "grad_norm": 2.670759179984812, + "learning_rate": 6.537216828478965e-06, + "loss": 0.552, + "step": 202 + }, + { + "epoch": 0.01975669099756691, + "grad_norm": 2.51305850829245, + "learning_rate": 6.56957928802589e-06, + "loss": 0.3806, + "step": 203 + }, + { + "epoch": 0.019854014598540148, + "grad_norm": 2.435954851305265, + "learning_rate": 6.601941747572816e-06, + "loss": 0.6093, + "step": 204 + }, + { + "epoch": 0.01995133819951338, + "grad_norm": 2.091315833022872, + "learning_rate": 6.6343042071197415e-06, + "loss": 0.3573, + "step": 205 + }, + { + "epoch": 0.020048661800486617, + "grad_norm": 2.205515437184344, + "learning_rate": 6.666666666666667e-06, + "loss": 0.2892, + "step": 206 + }, + { + "epoch": 0.020145985401459853, + "grad_norm": 2.314981932930035, + "learning_rate": 6.6990291262135935e-06, + "loss": 0.4184, + "step": 207 + }, + { + "epoch": 0.02024330900243309, + "grad_norm": 1.9102474885146974, + "learning_rate": 6.731391585760519e-06, + "loss": 0.2287, + "step": 208 + }, + { + "epoch": 0.020340632603406326, + "grad_norm": 1.9408029275065433, + "learning_rate": 6.763754045307444e-06, + "loss": 0.3958, + "step": 209 + }, + { + "epoch": 0.020437956204379562, + "grad_norm": 2.1006467731485823, + "learning_rate": 6.79611650485437e-06, + "loss": 0.3764, + "step": 210 + }, + { + "epoch": 0.0205352798053528, + "grad_norm": 2.0927447282795146, + "learning_rate": 6.828478964401295e-06, + "loss": 0.531, + "step": 211 + }, + { + "epoch": 0.020632603406326035, + "grad_norm": 3.4830081465453633, + "learning_rate": 6.86084142394822e-06, + "loss": 0.4887, + "step": 212 + }, + { + "epoch": 0.02072992700729927, + "grad_norm": 2.253360993066953, + "learning_rate": 6.893203883495147e-06, + "loss": 0.4587, + "step": 213 + }, + { + "epoch": 0.020827250608272507, + "grad_norm": 3.3751096354443852, + "learning_rate": 6.925566343042072e-06, + "loss": 0.3427, + "step": 214 + }, + { + "epoch": 0.020924574209245744, + "grad_norm": 1.9729713112803993, + "learning_rate": 6.957928802588997e-06, + "loss": 0.384, + "step": 215 + }, + { + "epoch": 0.021021897810218976, + "grad_norm": 2.761285167796522, + "learning_rate": 6.990291262135923e-06, + "loss": 0.3512, + "step": 216 + }, + { + "epoch": 0.021119221411192213, + "grad_norm": 2.431882400442612, + "learning_rate": 7.022653721682848e-06, + "loss": 0.3971, + "step": 217 + }, + { + "epoch": 0.02121654501216545, + "grad_norm": 3.659254877088116, + "learning_rate": 7.055016181229773e-06, + "loss": 0.4115, + "step": 218 + }, + { + "epoch": 0.021313868613138685, + "grad_norm": 2.5501534359714655, + "learning_rate": 7.0873786407767e-06, + "loss": 0.4963, + "step": 219 + }, + { + "epoch": 0.02141119221411192, + "grad_norm": 4.296894309260591, + "learning_rate": 7.119741100323625e-06, + "loss": 0.5203, + "step": 220 + }, + { + "epoch": 0.021508515815085158, + "grad_norm": 2.5489854552137237, + "learning_rate": 7.152103559870551e-06, + "loss": 0.4343, + "step": 221 + }, + { + "epoch": 0.021605839416058394, + "grad_norm": 2.00955207958064, + "learning_rate": 7.184466019417476e-06, + "loss": 0.3603, + "step": 222 + }, + { + "epoch": 0.02170316301703163, + "grad_norm": 2.2675038932590224, + "learning_rate": 7.2168284789644015e-06, + "loss": 0.3968, + "step": 223 + }, + { + "epoch": 0.021800486618004867, + "grad_norm": 2.4690586331753277, + "learning_rate": 7.249190938511328e-06, + "loss": 0.5883, + "step": 224 + }, + { + "epoch": 0.021897810218978103, + "grad_norm": 2.141328682063472, + "learning_rate": 7.2815533980582534e-06, + "loss": 0.3547, + "step": 225 + }, + { + "epoch": 0.02199513381995134, + "grad_norm": 2.223927434368622, + "learning_rate": 7.3139158576051786e-06, + "loss": 0.5031, + "step": 226 + }, + { + "epoch": 0.022092457420924576, + "grad_norm": 2.8602320319532346, + "learning_rate": 7.3462783171521046e-06, + "loss": 0.4226, + "step": 227 + }, + { + "epoch": 0.02218978102189781, + "grad_norm": 2.8852449405031835, + "learning_rate": 7.37864077669903e-06, + "loss": 0.4298, + "step": 228 + }, + { + "epoch": 0.022287104622871045, + "grad_norm": 1.7370522944561966, + "learning_rate": 7.411003236245955e-06, + "loss": 0.3827, + "step": 229 + }, + { + "epoch": 0.02238442822384428, + "grad_norm": 2.3907908463140584, + "learning_rate": 7.443365695792882e-06, + "loss": 0.4139, + "step": 230 + }, + { + "epoch": 0.022481751824817518, + "grad_norm": 2.27581306432663, + "learning_rate": 7.475728155339807e-06, + "loss": 0.4736, + "step": 231 + }, + { + "epoch": 0.022579075425790754, + "grad_norm": 2.1861094823645675, + "learning_rate": 7.508090614886732e-06, + "loss": 0.4809, + "step": 232 + }, + { + "epoch": 0.02267639902676399, + "grad_norm": 1.9626208371421419, + "learning_rate": 7.540453074433658e-06, + "loss": 0.3436, + "step": 233 + }, + { + "epoch": 0.022773722627737226, + "grad_norm": 1.7092390993202267, + "learning_rate": 7.572815533980583e-06, + "loss": 0.3224, + "step": 234 + }, + { + "epoch": 0.022871046228710463, + "grad_norm": 3.0168693228526546, + "learning_rate": 7.605177993527508e-06, + "loss": 0.6366, + "step": 235 + }, + { + "epoch": 0.0229683698296837, + "grad_norm": 2.424919921496664, + "learning_rate": 7.637540453074434e-06, + "loss": 0.4483, + "step": 236 + }, + { + "epoch": 0.023065693430656935, + "grad_norm": 2.4586833984787626, + "learning_rate": 7.66990291262136e-06, + "loss": 0.4031, + "step": 237 + }, + { + "epoch": 0.02316301703163017, + "grad_norm": 2.092010230715883, + "learning_rate": 7.702265372168284e-06, + "loss": 0.4257, + "step": 238 + }, + { + "epoch": 0.023260340632603408, + "grad_norm": 2.3360188447701655, + "learning_rate": 7.734627831715211e-06, + "loss": 0.4684, + "step": 239 + }, + { + "epoch": 0.02335766423357664, + "grad_norm": 2.087175894606599, + "learning_rate": 7.766990291262136e-06, + "loss": 0.4272, + "step": 240 + }, + { + "epoch": 0.023454987834549877, + "grad_norm": 2.598684557686617, + "learning_rate": 7.799352750809061e-06, + "loss": 0.5401, + "step": 241 + }, + { + "epoch": 0.023552311435523113, + "grad_norm": 2.025117037181364, + "learning_rate": 7.831715210355988e-06, + "loss": 0.372, + "step": 242 + }, + { + "epoch": 0.02364963503649635, + "grad_norm": 2.2467324584398405, + "learning_rate": 7.864077669902913e-06, + "loss": 0.5891, + "step": 243 + }, + { + "epoch": 0.023746958637469586, + "grad_norm": 2.38036373195977, + "learning_rate": 7.896440129449839e-06, + "loss": 0.5133, + "step": 244 + }, + { + "epoch": 0.023844282238442822, + "grad_norm": 2.052700924442009, + "learning_rate": 7.928802588996765e-06, + "loss": 0.5161, + "step": 245 + }, + { + "epoch": 0.02394160583941606, + "grad_norm": 3.4299018810240254, + "learning_rate": 7.96116504854369e-06, + "loss": 0.5314, + "step": 246 + }, + { + "epoch": 0.024038929440389295, + "grad_norm": 1.3903956706369247, + "learning_rate": 7.993527508090616e-06, + "loss": 0.3539, + "step": 247 + }, + { + "epoch": 0.02413625304136253, + "grad_norm": 2.4599878810180873, + "learning_rate": 8.02588996763754e-06, + "loss": 0.4876, + "step": 248 + }, + { + "epoch": 0.024233576642335768, + "grad_norm": 2.4053308291912083, + "learning_rate": 8.058252427184466e-06, + "loss": 0.5185, + "step": 249 + }, + { + "epoch": 0.024330900243309004, + "grad_norm": 1.6624263546342495, + "learning_rate": 8.090614886731393e-06, + "loss": 0.2909, + "step": 250 + }, + { + "epoch": 0.024428223844282237, + "grad_norm": 2.4091367373679597, + "learning_rate": 8.122977346278318e-06, + "loss": 0.6192, + "step": 251 + }, + { + "epoch": 0.024525547445255473, + "grad_norm": 2.4595313520548427, + "learning_rate": 8.155339805825243e-06, + "loss": 0.3444, + "step": 252 + }, + { + "epoch": 0.02462287104622871, + "grad_norm": 2.3200411140153174, + "learning_rate": 8.18770226537217e-06, + "loss": 0.6112, + "step": 253 + }, + { + "epoch": 0.024720194647201946, + "grad_norm": 2.029624875741936, + "learning_rate": 8.220064724919095e-06, + "loss": 0.4524, + "step": 254 + }, + { + "epoch": 0.024817518248175182, + "grad_norm": 1.8862765408033388, + "learning_rate": 8.25242718446602e-06, + "loss": 0.2173, + "step": 255 + }, + { + "epoch": 0.02491484184914842, + "grad_norm": 2.575687620331568, + "learning_rate": 8.284789644012947e-06, + "loss": 0.4599, + "step": 256 + }, + { + "epoch": 0.025012165450121655, + "grad_norm": 2.373530485379713, + "learning_rate": 8.317152103559872e-06, + "loss": 0.5326, + "step": 257 + }, + { + "epoch": 0.02510948905109489, + "grad_norm": 2.4086353319447262, + "learning_rate": 8.349514563106797e-06, + "loss": 0.6275, + "step": 258 + }, + { + "epoch": 0.025206812652068127, + "grad_norm": 2.1075725625285697, + "learning_rate": 8.381877022653722e-06, + "loss": 0.44, + "step": 259 + }, + { + "epoch": 0.025304136253041364, + "grad_norm": 2.0285700798989614, + "learning_rate": 8.414239482200647e-06, + "loss": 0.3489, + "step": 260 + }, + { + "epoch": 0.0254014598540146, + "grad_norm": 2.5592973746241, + "learning_rate": 8.446601941747573e-06, + "loss": 0.4403, + "step": 261 + }, + { + "epoch": 0.025498783454987836, + "grad_norm": 2.470930078509074, + "learning_rate": 8.4789644012945e-06, + "loss": 0.4985, + "step": 262 + }, + { + "epoch": 0.02559610705596107, + "grad_norm": 2.099638103909556, + "learning_rate": 8.511326860841424e-06, + "loss": 0.4194, + "step": 263 + }, + { + "epoch": 0.025693430656934305, + "grad_norm": 1.6030834140551835, + "learning_rate": 8.54368932038835e-06, + "loss": 0.3382, + "step": 264 + }, + { + "epoch": 0.02579075425790754, + "grad_norm": 1.8960928547169034, + "learning_rate": 8.576051779935276e-06, + "loss": 0.2838, + "step": 265 + }, + { + "epoch": 0.025888077858880778, + "grad_norm": 2.4306930963261966, + "learning_rate": 8.608414239482202e-06, + "loss": 0.4956, + "step": 266 + }, + { + "epoch": 0.025985401459854014, + "grad_norm": 2.374430136325354, + "learning_rate": 8.640776699029127e-06, + "loss": 0.5083, + "step": 267 + }, + { + "epoch": 0.02608272506082725, + "grad_norm": 2.410095115145934, + "learning_rate": 8.673139158576054e-06, + "loss": 0.4247, + "step": 268 + }, + { + "epoch": 0.026180048661800487, + "grad_norm": 2.41271065696519, + "learning_rate": 8.705501618122979e-06, + "loss": 0.6946, + "step": 269 + }, + { + "epoch": 0.026277372262773723, + "grad_norm": 1.752688930628829, + "learning_rate": 8.737864077669904e-06, + "loss": 0.2662, + "step": 270 + }, + { + "epoch": 0.02637469586374696, + "grad_norm": 1.9842034213162434, + "learning_rate": 8.770226537216829e-06, + "loss": 0.3611, + "step": 271 + }, + { + "epoch": 0.026472019464720196, + "grad_norm": 2.4137979998327497, + "learning_rate": 8.802588996763754e-06, + "loss": 0.501, + "step": 272 + }, + { + "epoch": 0.026569343065693432, + "grad_norm": 2.929650064864996, + "learning_rate": 8.834951456310681e-06, + "loss": 0.6153, + "step": 273 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 2.281738020025263, + "learning_rate": 8.867313915857606e-06, + "loss": 0.5395, + "step": 274 + }, + { + "epoch": 0.0267639902676399, + "grad_norm": 2.1406726692627975, + "learning_rate": 8.899676375404531e-06, + "loss": 0.4039, + "step": 275 + }, + { + "epoch": 0.026861313868613138, + "grad_norm": 3.2366954201371523, + "learning_rate": 8.932038834951458e-06, + "loss": 0.5414, + "step": 276 + }, + { + "epoch": 0.026958637469586374, + "grad_norm": 2.1900667662872513, + "learning_rate": 8.964401294498383e-06, + "loss": 0.3815, + "step": 277 + }, + { + "epoch": 0.02705596107055961, + "grad_norm": 2.5301939091612216, + "learning_rate": 8.996763754045308e-06, + "loss": 0.8016, + "step": 278 + }, + { + "epoch": 0.027153284671532846, + "grad_norm": 2.2552758985680907, + "learning_rate": 9.029126213592233e-06, + "loss": 0.4133, + "step": 279 + }, + { + "epoch": 0.027250608272506083, + "grad_norm": 2.309545536997134, + "learning_rate": 9.06148867313916e-06, + "loss": 0.5346, + "step": 280 + }, + { + "epoch": 0.02734793187347932, + "grad_norm": 2.585578916644781, + "learning_rate": 9.093851132686085e-06, + "loss": 0.407, + "step": 281 + }, + { + "epoch": 0.027445255474452555, + "grad_norm": 1.8503464194025006, + "learning_rate": 9.12621359223301e-06, + "loss": 0.4674, + "step": 282 + }, + { + "epoch": 0.02754257907542579, + "grad_norm": 2.431490115980846, + "learning_rate": 9.158576051779936e-06, + "loss": 0.6026, + "step": 283 + }, + { + "epoch": 0.027639902676399028, + "grad_norm": 1.916233248702735, + "learning_rate": 9.19093851132686e-06, + "loss": 0.4949, + "step": 284 + }, + { + "epoch": 0.027737226277372264, + "grad_norm": 2.2160236640245072, + "learning_rate": 9.223300970873788e-06, + "loss": 0.4765, + "step": 285 + }, + { + "epoch": 0.027834549878345497, + "grad_norm": 2.0764827118780143, + "learning_rate": 9.255663430420713e-06, + "loss": 0.472, + "step": 286 + }, + { + "epoch": 0.027931873479318733, + "grad_norm": 2.638286661284288, + "learning_rate": 9.288025889967638e-06, + "loss": 0.6312, + "step": 287 + }, + { + "epoch": 0.02802919708029197, + "grad_norm": 1.940011273577467, + "learning_rate": 9.320388349514565e-06, + "loss": 0.4555, + "step": 288 + }, + { + "epoch": 0.028126520681265206, + "grad_norm": 1.8760624736314784, + "learning_rate": 9.35275080906149e-06, + "loss": 0.3625, + "step": 289 + }, + { + "epoch": 0.028223844282238442, + "grad_norm": 1.3468692859077058, + "learning_rate": 9.385113268608415e-06, + "loss": 0.2442, + "step": 290 + }, + { + "epoch": 0.02832116788321168, + "grad_norm": 2.1497394847504014, + "learning_rate": 9.41747572815534e-06, + "loss": 0.5227, + "step": 291 + }, + { + "epoch": 0.028418491484184915, + "grad_norm": 2.1233743171190014, + "learning_rate": 9.449838187702267e-06, + "loss": 0.6184, + "step": 292 + }, + { + "epoch": 0.02851581508515815, + "grad_norm": 2.337806183860394, + "learning_rate": 9.482200647249192e-06, + "loss": 0.5491, + "step": 293 + }, + { + "epoch": 0.028613138686131388, + "grad_norm": 2.015000594070385, + "learning_rate": 9.514563106796117e-06, + "loss": 0.5137, + "step": 294 + }, + { + "epoch": 0.028710462287104624, + "grad_norm": 2.0267324830753766, + "learning_rate": 9.546925566343042e-06, + "loss": 0.4117, + "step": 295 + }, + { + "epoch": 0.02880778588807786, + "grad_norm": 1.732639028192012, + "learning_rate": 9.579288025889967e-06, + "loss": 0.3156, + "step": 296 + }, + { + "epoch": 0.028905109489051097, + "grad_norm": 2.1204056159243923, + "learning_rate": 9.611650485436894e-06, + "loss": 0.6056, + "step": 297 + }, + { + "epoch": 0.02900243309002433, + "grad_norm": 1.7868071753968195, + "learning_rate": 9.64401294498382e-06, + "loss": 0.3417, + "step": 298 + }, + { + "epoch": 0.029099756690997566, + "grad_norm": 1.9477439300595292, + "learning_rate": 9.676375404530746e-06, + "loss": 0.3631, + "step": 299 + }, + { + "epoch": 0.029197080291970802, + "grad_norm": 1.7688147839655162, + "learning_rate": 9.708737864077671e-06, + "loss": 0.3605, + "step": 300 + }, + { + "epoch": 0.029294403892944038, + "grad_norm": 1.9162335597538034, + "learning_rate": 9.741100323624596e-06, + "loss": 0.2498, + "step": 301 + }, + { + "epoch": 0.029391727493917275, + "grad_norm": 2.9282579520055756, + "learning_rate": 9.773462783171522e-06, + "loss": 0.4286, + "step": 302 + }, + { + "epoch": 0.02948905109489051, + "grad_norm": 1.9744499285549086, + "learning_rate": 9.805825242718447e-06, + "loss": 0.3391, + "step": 303 + }, + { + "epoch": 0.029586374695863747, + "grad_norm": 2.2116032868392455, + "learning_rate": 9.838187702265373e-06, + "loss": 0.3414, + "step": 304 + }, + { + "epoch": 0.029683698296836983, + "grad_norm": 1.9159144570242486, + "learning_rate": 9.870550161812299e-06, + "loss": 0.2915, + "step": 305 + }, + { + "epoch": 0.02978102189781022, + "grad_norm": 2.671718838238437, + "learning_rate": 9.902912621359224e-06, + "loss": 0.79, + "step": 306 + }, + { + "epoch": 0.029878345498783456, + "grad_norm": 2.093937424199301, + "learning_rate": 9.935275080906149e-06, + "loss": 0.576, + "step": 307 + }, + { + "epoch": 0.029975669099756692, + "grad_norm": 1.895574286512308, + "learning_rate": 9.967637540453076e-06, + "loss": 0.4223, + "step": 308 + }, + { + "epoch": 0.03007299270072993, + "grad_norm": 2.142643554578675, + "learning_rate": 1e-05, + "loss": 0.4719, + "step": 309 + }, + { + "epoch": 0.03017031630170316, + "grad_norm": 2.2548613483238378, + "learning_rate": 9.999999751573464e-06, + "loss": 0.5547, + "step": 310 + }, + { + "epoch": 0.030267639902676398, + "grad_norm": 2.375146158639999, + "learning_rate": 9.99999900629388e-06, + "loss": 0.3864, + "step": 311 + }, + { + "epoch": 0.030364963503649634, + "grad_norm": 1.558937895217452, + "learning_rate": 9.99999776416132e-06, + "loss": 0.3409, + "step": 312 + }, + { + "epoch": 0.03046228710462287, + "grad_norm": 2.7508940543848115, + "learning_rate": 9.99999602517591e-06, + "loss": 0.3641, + "step": 313 + }, + { + "epoch": 0.030559610705596107, + "grad_norm": 2.228096737889712, + "learning_rate": 9.99999378933782e-06, + "loss": 0.6464, + "step": 314 + }, + { + "epoch": 0.030656934306569343, + "grad_norm": 1.5612763763472, + "learning_rate": 9.999991056647274e-06, + "loss": 0.3124, + "step": 315 + }, + { + "epoch": 0.03075425790754258, + "grad_norm": 2.3203527787434104, + "learning_rate": 9.999987827104544e-06, + "loss": 0.5893, + "step": 316 + }, + { + "epoch": 0.030851581508515816, + "grad_norm": 1.8472611567410342, + "learning_rate": 9.999984100709951e-06, + "loss": 0.3732, + "step": 317 + }, + { + "epoch": 0.030948905109489052, + "grad_norm": 2.269778108549014, + "learning_rate": 9.999979877463866e-06, + "loss": 0.5537, + "step": 318 + }, + { + "epoch": 0.03104622871046229, + "grad_norm": 2.381498581134022, + "learning_rate": 9.999975157366705e-06, + "loss": 0.7179, + "step": 319 + }, + { + "epoch": 0.031143552311435525, + "grad_norm": 1.7030655036823346, + "learning_rate": 9.99996994041894e-06, + "loss": 0.4256, + "step": 320 + }, + { + "epoch": 0.031240875912408757, + "grad_norm": 1.8361141038730153, + "learning_rate": 9.999964226621089e-06, + "loss": 0.4648, + "step": 321 + }, + { + "epoch": 0.031338199513381994, + "grad_norm": 1.7985459229558753, + "learning_rate": 9.99995801597372e-06, + "loss": 0.3031, + "step": 322 + }, + { + "epoch": 0.031435523114355234, + "grad_norm": 2.4309020119442915, + "learning_rate": 9.99995130847745e-06, + "loss": 0.5011, + "step": 323 + }, + { + "epoch": 0.031532846715328466, + "grad_norm": 2.048514022969095, + "learning_rate": 9.999944104132944e-06, + "loss": 0.6152, + "step": 324 + }, + { + "epoch": 0.031630170316301706, + "grad_norm": 1.8892667320795724, + "learning_rate": 9.99993640294092e-06, + "loss": 0.4738, + "step": 325 + }, + { + "epoch": 0.03172749391727494, + "grad_norm": 2.081179331819785, + "learning_rate": 9.999928204902141e-06, + "loss": 0.5192, + "step": 326 + }, + { + "epoch": 0.03182481751824817, + "grad_norm": 2.410280889073595, + "learning_rate": 9.999919510017424e-06, + "loss": 0.3314, + "step": 327 + }, + { + "epoch": 0.03192214111922141, + "grad_norm": 1.663034255724975, + "learning_rate": 9.999910318287632e-06, + "loss": 0.3342, + "step": 328 + }, + { + "epoch": 0.032019464720194644, + "grad_norm": 1.7874391345352068, + "learning_rate": 9.999900629713679e-06, + "loss": 0.3189, + "step": 329 + }, + { + "epoch": 0.032116788321167884, + "grad_norm": 2.1098429973097805, + "learning_rate": 9.999890444296528e-06, + "loss": 0.4561, + "step": 330 + }, + { + "epoch": 0.03221411192214112, + "grad_norm": 2.4678279265353558, + "learning_rate": 9.999879762037187e-06, + "loss": 0.5831, + "step": 331 + }, + { + "epoch": 0.03231143552311436, + "grad_norm": 1.6643716587630457, + "learning_rate": 9.999868582936726e-06, + "loss": 0.4371, + "step": 332 + }, + { + "epoch": 0.03240875912408759, + "grad_norm": 2.088466639768523, + "learning_rate": 9.999856906996246e-06, + "loss": 0.3904, + "step": 333 + }, + { + "epoch": 0.03250608272506083, + "grad_norm": 2.0023651443392256, + "learning_rate": 9.999844734216914e-06, + "loss": 0.4802, + "step": 334 + }, + { + "epoch": 0.03260340632603406, + "grad_norm": 2.161282844007076, + "learning_rate": 9.99983206459994e-06, + "loss": 0.5187, + "step": 335 + }, + { + "epoch": 0.0327007299270073, + "grad_norm": 2.10212671583593, + "learning_rate": 9.999818898146576e-06, + "loss": 0.4618, + "step": 336 + }, + { + "epoch": 0.032798053527980535, + "grad_norm": 2.2142899508809286, + "learning_rate": 9.999805234858137e-06, + "loss": 0.2387, + "step": 337 + }, + { + "epoch": 0.032895377128953775, + "grad_norm": 2.1084763484693023, + "learning_rate": 9.999791074735981e-06, + "loss": 0.5652, + "step": 338 + }, + { + "epoch": 0.03299270072992701, + "grad_norm": 2.261838498017328, + "learning_rate": 9.99977641778151e-06, + "loss": 0.7224, + "step": 339 + }, + { + "epoch": 0.03309002433090024, + "grad_norm": 1.612816030006559, + "learning_rate": 9.999761263996184e-06, + "loss": 0.377, + "step": 340 + }, + { + "epoch": 0.03318734793187348, + "grad_norm": 2.1209830295615832, + "learning_rate": 9.999745613381507e-06, + "loss": 0.614, + "step": 341 + }, + { + "epoch": 0.03328467153284671, + "grad_norm": 1.7938764015879674, + "learning_rate": 9.999729465939036e-06, + "loss": 0.3983, + "step": 342 + }, + { + "epoch": 0.03338199513381995, + "grad_norm": 1.943418875698731, + "learning_rate": 9.999712821670375e-06, + "loss": 0.4708, + "step": 343 + }, + { + "epoch": 0.033479318734793186, + "grad_norm": 1.9787546900237571, + "learning_rate": 9.99969568057718e-06, + "loss": 0.578, + "step": 344 + }, + { + "epoch": 0.033576642335766425, + "grad_norm": 1.4798263726328331, + "learning_rate": 9.99967804266115e-06, + "loss": 0.394, + "step": 345 + }, + { + "epoch": 0.03367396593673966, + "grad_norm": 2.1936298043995484, + "learning_rate": 9.99965990792404e-06, + "loss": 0.6316, + "step": 346 + }, + { + "epoch": 0.0337712895377129, + "grad_norm": 2.2799650780195133, + "learning_rate": 9.99964127636765e-06, + "loss": 0.3985, + "step": 347 + }, + { + "epoch": 0.03386861313868613, + "grad_norm": 1.8519049219191819, + "learning_rate": 9.999622147993837e-06, + "loss": 0.3853, + "step": 348 + }, + { + "epoch": 0.03396593673965937, + "grad_norm": 1.5111895282974241, + "learning_rate": 9.999602522804497e-06, + "loss": 0.4201, + "step": 349 + }, + { + "epoch": 0.0340632603406326, + "grad_norm": 1.8605769784283237, + "learning_rate": 9.99958240080158e-06, + "loss": 0.5225, + "step": 350 + }, + { + "epoch": 0.034160583941605836, + "grad_norm": 1.6063240538866903, + "learning_rate": 9.999561781987087e-06, + "loss": 0.3165, + "step": 351 + }, + { + "epoch": 0.034257907542579076, + "grad_norm": 1.4751976204077173, + "learning_rate": 9.999540666363068e-06, + "loss": 0.3156, + "step": 352 + }, + { + "epoch": 0.03435523114355231, + "grad_norm": 2.1029966771511757, + "learning_rate": 9.99951905393162e-06, + "loss": 0.5336, + "step": 353 + }, + { + "epoch": 0.03445255474452555, + "grad_norm": 2.1419054642874267, + "learning_rate": 9.99949694469489e-06, + "loss": 0.5253, + "step": 354 + }, + { + "epoch": 0.03454987834549878, + "grad_norm": 2.169397271826959, + "learning_rate": 9.999474338655075e-06, + "loss": 0.5567, + "step": 355 + }, + { + "epoch": 0.03464720194647202, + "grad_norm": 2.2972412855327797, + "learning_rate": 9.999451235814422e-06, + "loss": 0.5233, + "step": 356 + }, + { + "epoch": 0.034744525547445254, + "grad_norm": 1.830377999961128, + "learning_rate": 9.999427636175228e-06, + "loss": 0.4297, + "step": 357 + }, + { + "epoch": 0.034841849148418494, + "grad_norm": 2.1217123292302875, + "learning_rate": 9.999403539739837e-06, + "loss": 0.3605, + "step": 358 + }, + { + "epoch": 0.03493917274939173, + "grad_norm": 2.001599625802253, + "learning_rate": 9.999378946510642e-06, + "loss": 0.5237, + "step": 359 + }, + { + "epoch": 0.035036496350364967, + "grad_norm": 1.6719956399048532, + "learning_rate": 9.99935385649009e-06, + "loss": 0.424, + "step": 360 + }, + { + "epoch": 0.0351338199513382, + "grad_norm": 1.5962062682133515, + "learning_rate": 9.99932826968067e-06, + "loss": 0.4228, + "step": 361 + }, + { + "epoch": 0.03523114355231143, + "grad_norm": 1.9754274750693919, + "learning_rate": 9.999302186084929e-06, + "loss": 0.4333, + "step": 362 + }, + { + "epoch": 0.03532846715328467, + "grad_norm": 1.8248617929879183, + "learning_rate": 9.999275605705457e-06, + "loss": 0.4985, + "step": 363 + }, + { + "epoch": 0.035425790754257905, + "grad_norm": 2.5923075514224982, + "learning_rate": 9.999248528544895e-06, + "loss": 0.4829, + "step": 364 + }, + { + "epoch": 0.035523114355231145, + "grad_norm": 1.9900801938638135, + "learning_rate": 9.999220954605932e-06, + "loss": 0.587, + "step": 365 + }, + { + "epoch": 0.03562043795620438, + "grad_norm": 1.731558772897005, + "learning_rate": 9.999192883891314e-06, + "loss": 0.3299, + "step": 366 + }, + { + "epoch": 0.03571776155717762, + "grad_norm": 2.339577788711278, + "learning_rate": 9.999164316403823e-06, + "loss": 0.4845, + "step": 367 + }, + { + "epoch": 0.03581508515815085, + "grad_norm": 1.9784113864985955, + "learning_rate": 9.999135252146302e-06, + "loss": 0.5776, + "step": 368 + }, + { + "epoch": 0.03591240875912409, + "grad_norm": 1.5555461256937277, + "learning_rate": 9.999105691121638e-06, + "loss": 0.3563, + "step": 369 + }, + { + "epoch": 0.03600973236009732, + "grad_norm": 1.7905677559908044, + "learning_rate": 9.99907563333277e-06, + "loss": 0.546, + "step": 370 + }, + { + "epoch": 0.03610705596107056, + "grad_norm": 2.0490894714600287, + "learning_rate": 9.999045078782684e-06, + "loss": 0.6836, + "step": 371 + }, + { + "epoch": 0.036204379562043795, + "grad_norm": 2.216601446334751, + "learning_rate": 9.999014027474413e-06, + "loss": 0.5237, + "step": 372 + }, + { + "epoch": 0.036301703163017035, + "grad_norm": 1.5937926342815392, + "learning_rate": 9.998982479411047e-06, + "loss": 0.3539, + "step": 373 + }, + { + "epoch": 0.03639902676399027, + "grad_norm": 2.3941848280266864, + "learning_rate": 9.99895043459572e-06, + "loss": 0.6249, + "step": 374 + }, + { + "epoch": 0.0364963503649635, + "grad_norm": 2.072859669066288, + "learning_rate": 9.998917893031615e-06, + "loss": 0.5415, + "step": 375 + }, + { + "epoch": 0.03659367396593674, + "grad_norm": 1.670908711065728, + "learning_rate": 9.998884854721968e-06, + "loss": 0.3034, + "step": 376 + }, + { + "epoch": 0.03669099756690997, + "grad_norm": 1.9880303784818283, + "learning_rate": 9.998851319670057e-06, + "loss": 0.5025, + "step": 377 + }, + { + "epoch": 0.03678832116788321, + "grad_norm": 1.3517666701087396, + "learning_rate": 9.99881728787922e-06, + "loss": 0.2775, + "step": 378 + }, + { + "epoch": 0.036885644768856446, + "grad_norm": 1.8952553535268069, + "learning_rate": 9.998782759352839e-06, + "loss": 0.5306, + "step": 379 + }, + { + "epoch": 0.036982968369829686, + "grad_norm": 1.8730537486024816, + "learning_rate": 9.998747734094338e-06, + "loss": 0.386, + "step": 380 + }, + { + "epoch": 0.03708029197080292, + "grad_norm": 2.058996056292158, + "learning_rate": 9.998712212107205e-06, + "loss": 0.5641, + "step": 381 + }, + { + "epoch": 0.03717761557177616, + "grad_norm": 1.9837834234853275, + "learning_rate": 9.998676193394966e-06, + "loss": 0.2628, + "step": 382 + }, + { + "epoch": 0.03727493917274939, + "grad_norm": 2.189700953999047, + "learning_rate": 9.998639677961203e-06, + "loss": 0.6024, + "step": 383 + }, + { + "epoch": 0.03737226277372263, + "grad_norm": 2.060696593716547, + "learning_rate": 9.99860266580954e-06, + "loss": 0.5377, + "step": 384 + }, + { + "epoch": 0.037469586374695864, + "grad_norm": 2.0831966609629227, + "learning_rate": 9.99856515694366e-06, + "loss": 0.5063, + "step": 385 + }, + { + "epoch": 0.0375669099756691, + "grad_norm": 2.2950496556846227, + "learning_rate": 9.998527151367288e-06, + "loss": 0.6484, + "step": 386 + }, + { + "epoch": 0.037664233576642336, + "grad_norm": 2.2597922123273873, + "learning_rate": 9.9984886490842e-06, + "loss": 0.6617, + "step": 387 + }, + { + "epoch": 0.03776155717761557, + "grad_norm": 2.071575887731456, + "learning_rate": 9.99844965009822e-06, + "loss": 0.5405, + "step": 388 + }, + { + "epoch": 0.03785888077858881, + "grad_norm": 2.004249587957457, + "learning_rate": 9.99841015441323e-06, + "loss": 0.4306, + "step": 389 + }, + { + "epoch": 0.03795620437956204, + "grad_norm": 1.9297023880727862, + "learning_rate": 9.99837016203315e-06, + "loss": 0.4083, + "step": 390 + }, + { + "epoch": 0.03805352798053528, + "grad_norm": 2.001337081282171, + "learning_rate": 9.998329672961952e-06, + "loss": 0.4999, + "step": 391 + }, + { + "epoch": 0.038150851581508514, + "grad_norm": 1.7630230797021285, + "learning_rate": 9.998288687203665e-06, + "loss": 0.4267, + "step": 392 + }, + { + "epoch": 0.038248175182481754, + "grad_norm": 1.4413546421147376, + "learning_rate": 9.998247204762358e-06, + "loss": 0.3028, + "step": 393 + }, + { + "epoch": 0.03834549878345499, + "grad_norm": 2.032450629241147, + "learning_rate": 9.998205225642154e-06, + "loss": 0.4216, + "step": 394 + }, + { + "epoch": 0.03844282238442823, + "grad_norm": 1.8288270303352272, + "learning_rate": 9.998162749847224e-06, + "loss": 0.451, + "step": 395 + }, + { + "epoch": 0.03854014598540146, + "grad_norm": 1.5869427581540143, + "learning_rate": 9.998119777381791e-06, + "loss": 0.4896, + "step": 396 + }, + { + "epoch": 0.03863746958637469, + "grad_norm": 1.9312614168983935, + "learning_rate": 9.998076308250122e-06, + "loss": 0.351, + "step": 397 + }, + { + "epoch": 0.03873479318734793, + "grad_norm": 2.182734939846557, + "learning_rate": 9.99803234245654e-06, + "loss": 0.4456, + "step": 398 + }, + { + "epoch": 0.038832116788321165, + "grad_norm": 1.6075130172605856, + "learning_rate": 9.997987880005412e-06, + "loss": 0.3333, + "step": 399 + }, + { + "epoch": 0.038929440389294405, + "grad_norm": 2.0206579020801048, + "learning_rate": 9.997942920901154e-06, + "loss": 0.4662, + "step": 400 + }, + { + "epoch": 0.03902676399026764, + "grad_norm": 2.0019154912621246, + "learning_rate": 9.997897465148236e-06, + "loss": 0.588, + "step": 401 + }, + { + "epoch": 0.03912408759124088, + "grad_norm": 1.9556688755730123, + "learning_rate": 9.997851512751178e-06, + "loss": 0.5364, + "step": 402 + }, + { + "epoch": 0.03922141119221411, + "grad_norm": 2.1735940620422687, + "learning_rate": 9.997805063714541e-06, + "loss": 0.4155, + "step": 403 + }, + { + "epoch": 0.03931873479318735, + "grad_norm": 1.893104755523836, + "learning_rate": 9.997758118042945e-06, + "loss": 0.2835, + "step": 404 + }, + { + "epoch": 0.03941605839416058, + "grad_norm": 1.892857392200546, + "learning_rate": 9.99771067574105e-06, + "loss": 0.317, + "step": 405 + }, + { + "epoch": 0.03951338199513382, + "grad_norm": 2.194365925195629, + "learning_rate": 9.997662736813575e-06, + "loss": 0.5972, + "step": 406 + }, + { + "epoch": 0.039610705596107056, + "grad_norm": 2.3359516870584547, + "learning_rate": 9.997614301265281e-06, + "loss": 0.3505, + "step": 407 + }, + { + "epoch": 0.039708029197080295, + "grad_norm": 1.8041349283411827, + "learning_rate": 9.997565369100983e-06, + "loss": 0.4003, + "step": 408 + }, + { + "epoch": 0.03980535279805353, + "grad_norm": 2.2199870140108273, + "learning_rate": 9.997515940325542e-06, + "loss": 0.4428, + "step": 409 + }, + { + "epoch": 0.03990267639902676, + "grad_norm": 2.193796849633566, + "learning_rate": 9.997466014943871e-06, + "loss": 0.3906, + "step": 410 + }, + { + "epoch": 0.04, + "grad_norm": 2.7309920828616168, + "learning_rate": 9.99741559296093e-06, + "loss": 0.6283, + "step": 411 + }, + { + "epoch": 0.040097323600973234, + "grad_norm": 2.220745639846989, + "learning_rate": 9.99736467438173e-06, + "loss": 0.4568, + "step": 412 + }, + { + "epoch": 0.04019464720194647, + "grad_norm": 1.905067765139487, + "learning_rate": 9.99731325921133e-06, + "loss": 0.3198, + "step": 413 + }, + { + "epoch": 0.040291970802919706, + "grad_norm": 2.0461180940034116, + "learning_rate": 9.997261347454841e-06, + "loss": 0.3783, + "step": 414 + }, + { + "epoch": 0.040389294403892946, + "grad_norm": 1.9732614929529544, + "learning_rate": 9.99720893911742e-06, + "loss": 0.5211, + "step": 415 + }, + { + "epoch": 0.04048661800486618, + "grad_norm": 2.341156401873798, + "learning_rate": 9.997156034204276e-06, + "loss": 0.5094, + "step": 416 + }, + { + "epoch": 0.04058394160583942, + "grad_norm": 2.2588135503158138, + "learning_rate": 9.997102632720664e-06, + "loss": 0.591, + "step": 417 + }, + { + "epoch": 0.04068126520681265, + "grad_norm": 2.187795564574772, + "learning_rate": 9.997048734671893e-06, + "loss": 0.3811, + "step": 418 + }, + { + "epoch": 0.04077858880778589, + "grad_norm": 2.2570398189900938, + "learning_rate": 9.996994340063314e-06, + "loss": 0.4494, + "step": 419 + }, + { + "epoch": 0.040875912408759124, + "grad_norm": 2.3267878846596597, + "learning_rate": 9.996939448900341e-06, + "loss": 0.5254, + "step": 420 + }, + { + "epoch": 0.04097323600973236, + "grad_norm": 1.9149387144635641, + "learning_rate": 9.99688406118842e-06, + "loss": 0.4281, + "step": 421 + }, + { + "epoch": 0.0410705596107056, + "grad_norm": 2.4052095021382285, + "learning_rate": 9.996828176933062e-06, + "loss": 0.61, + "step": 422 + }, + { + "epoch": 0.04116788321167883, + "grad_norm": 2.8744864627123237, + "learning_rate": 9.996771796139814e-06, + "loss": 0.4708, + "step": 423 + }, + { + "epoch": 0.04126520681265207, + "grad_norm": 2.0334953222734513, + "learning_rate": 9.996714918814284e-06, + "loss": 0.2697, + "step": 424 + }, + { + "epoch": 0.0413625304136253, + "grad_norm": 2.1314093477075486, + "learning_rate": 9.996657544962119e-06, + "loss": 0.3026, + "step": 425 + }, + { + "epoch": 0.04145985401459854, + "grad_norm": 1.7241742631767316, + "learning_rate": 9.996599674589022e-06, + "loss": 0.3624, + "step": 426 + }, + { + "epoch": 0.041557177615571775, + "grad_norm": 2.417754377928955, + "learning_rate": 9.996541307700746e-06, + "loss": 0.6682, + "step": 427 + }, + { + "epoch": 0.041654501216545015, + "grad_norm": 2.2126055245100256, + "learning_rate": 9.99648244430309e-06, + "loss": 0.3705, + "step": 428 + }, + { + "epoch": 0.04175182481751825, + "grad_norm": 1.8224510106748588, + "learning_rate": 9.996423084401901e-06, + "loss": 0.4318, + "step": 429 + }, + { + "epoch": 0.04184914841849149, + "grad_norm": 1.6786428352287364, + "learning_rate": 9.996363228003079e-06, + "loss": 0.4662, + "step": 430 + }, + { + "epoch": 0.04194647201946472, + "grad_norm": 1.9342922605897592, + "learning_rate": 9.99630287511257e-06, + "loss": 0.4874, + "step": 431 + }, + { + "epoch": 0.04204379562043795, + "grad_norm": 1.9444011100602645, + "learning_rate": 9.996242025736377e-06, + "loss": 0.3711, + "step": 432 + }, + { + "epoch": 0.04214111922141119, + "grad_norm": 3.114184163688958, + "learning_rate": 9.99618067988054e-06, + "loss": 0.5342, + "step": 433 + }, + { + "epoch": 0.042238442822384425, + "grad_norm": 1.993932460938173, + "learning_rate": 9.99611883755116e-06, + "loss": 0.465, + "step": 434 + }, + { + "epoch": 0.042335766423357665, + "grad_norm": 1.5062408953506277, + "learning_rate": 9.99605649875438e-06, + "loss": 0.3862, + "step": 435 + }, + { + "epoch": 0.0424330900243309, + "grad_norm": 2.5287447175721733, + "learning_rate": 9.995993663496394e-06, + "loss": 0.5638, + "step": 436 + }, + { + "epoch": 0.04253041362530414, + "grad_norm": 1.7215400937807486, + "learning_rate": 9.995930331783448e-06, + "loss": 0.3507, + "step": 437 + }, + { + "epoch": 0.04262773722627737, + "grad_norm": 1.5105936757865817, + "learning_rate": 9.995866503621834e-06, + "loss": 0.4086, + "step": 438 + }, + { + "epoch": 0.04272506082725061, + "grad_norm": 1.828501540310894, + "learning_rate": 9.995802179017893e-06, + "loss": 0.3477, + "step": 439 + }, + { + "epoch": 0.04282238442822384, + "grad_norm": 1.6658361590948114, + "learning_rate": 9.995737357978022e-06, + "loss": 0.4006, + "step": 440 + }, + { + "epoch": 0.04291970802919708, + "grad_norm": 1.6434395036324305, + "learning_rate": 9.995672040508656e-06, + "loss": 0.4349, + "step": 441 + }, + { + "epoch": 0.043017031630170316, + "grad_norm": 1.9913424027071125, + "learning_rate": 9.99560622661629e-06, + "loss": 0.3415, + "step": 442 + }, + { + "epoch": 0.043114355231143556, + "grad_norm": 1.6487474195389296, + "learning_rate": 9.995539916307463e-06, + "loss": 0.4804, + "step": 443 + }, + { + "epoch": 0.04321167883211679, + "grad_norm": 1.4861266391850032, + "learning_rate": 9.995473109588764e-06, + "loss": 0.411, + "step": 444 + }, + { + "epoch": 0.04330900243309002, + "grad_norm": 1.4390762643228305, + "learning_rate": 9.995405806466831e-06, + "loss": 0.3806, + "step": 445 + }, + { + "epoch": 0.04340632603406326, + "grad_norm": 1.7775332171720517, + "learning_rate": 9.995338006948353e-06, + "loss": 0.3332, + "step": 446 + }, + { + "epoch": 0.043503649635036494, + "grad_norm": 1.7312883283317864, + "learning_rate": 9.995269711040067e-06, + "loss": 0.2736, + "step": 447 + }, + { + "epoch": 0.043600973236009734, + "grad_norm": 1.7973901424872405, + "learning_rate": 9.995200918748759e-06, + "loss": 0.5597, + "step": 448 + }, + { + "epoch": 0.04369829683698297, + "grad_norm": 2.0409413301370334, + "learning_rate": 9.995131630081265e-06, + "loss": 0.6045, + "step": 449 + }, + { + "epoch": 0.043795620437956206, + "grad_norm": 3.2708903670147347, + "learning_rate": 9.995061845044473e-06, + "loss": 0.6245, + "step": 450 + }, + { + "epoch": 0.04389294403892944, + "grad_norm": 1.744466889932859, + "learning_rate": 9.994991563645314e-06, + "loss": 0.4129, + "step": 451 + }, + { + "epoch": 0.04399026763990268, + "grad_norm": 1.8775864246251477, + "learning_rate": 9.994920785890771e-06, + "loss": 0.414, + "step": 452 + }, + { + "epoch": 0.04408759124087591, + "grad_norm": 1.3868286948878126, + "learning_rate": 9.994849511787881e-06, + "loss": 0.3164, + "step": 453 + }, + { + "epoch": 0.04418491484184915, + "grad_norm": 1.6888257223301795, + "learning_rate": 9.994777741343727e-06, + "loss": 0.3241, + "step": 454 + }, + { + "epoch": 0.044282238442822384, + "grad_norm": 1.5029594314338663, + "learning_rate": 9.994705474565436e-06, + "loss": 0.4148, + "step": 455 + }, + { + "epoch": 0.04437956204379562, + "grad_norm": 1.7159996915963702, + "learning_rate": 9.994632711460193e-06, + "loss": 0.3387, + "step": 456 + }, + { + "epoch": 0.04447688564476886, + "grad_norm": 1.7717997513120352, + "learning_rate": 9.994559452035228e-06, + "loss": 0.4547, + "step": 457 + }, + { + "epoch": 0.04457420924574209, + "grad_norm": 1.887765282184233, + "learning_rate": 9.99448569629782e-06, + "loss": 0.5919, + "step": 458 + }, + { + "epoch": 0.04467153284671533, + "grad_norm": 2.0151049512314585, + "learning_rate": 9.994411444255298e-06, + "loss": 0.4556, + "step": 459 + }, + { + "epoch": 0.04476885644768856, + "grad_norm": 1.5706463359289826, + "learning_rate": 9.994336695915041e-06, + "loss": 0.3443, + "step": 460 + }, + { + "epoch": 0.0448661800486618, + "grad_norm": 1.9067884841542395, + "learning_rate": 9.994261451284477e-06, + "loss": 0.5862, + "step": 461 + }, + { + "epoch": 0.044963503649635035, + "grad_norm": 1.7346846845298518, + "learning_rate": 9.994185710371083e-06, + "loss": 0.3588, + "step": 462 + }, + { + "epoch": 0.045060827250608275, + "grad_norm": 1.5593715629463312, + "learning_rate": 9.994109473182385e-06, + "loss": 0.2891, + "step": 463 + }, + { + "epoch": 0.04515815085158151, + "grad_norm": 2.326736753149576, + "learning_rate": 9.994032739725959e-06, + "loss": 0.6517, + "step": 464 + }, + { + "epoch": 0.04525547445255475, + "grad_norm": 2.2142852132770305, + "learning_rate": 9.99395551000943e-06, + "loss": 0.3571, + "step": 465 + }, + { + "epoch": 0.04535279805352798, + "grad_norm": 1.7351954813390544, + "learning_rate": 9.993877784040474e-06, + "loss": 0.3849, + "step": 466 + }, + { + "epoch": 0.04545012165450121, + "grad_norm": 1.3962336815381617, + "learning_rate": 9.993799561826811e-06, + "loss": 0.311, + "step": 467 + }, + { + "epoch": 0.04554744525547445, + "grad_norm": 1.878958465421645, + "learning_rate": 9.993720843376216e-06, + "loss": 0.5602, + "step": 468 + }, + { + "epoch": 0.045644768856447686, + "grad_norm": 1.519160992933857, + "learning_rate": 9.993641628696513e-06, + "loss": 0.2379, + "step": 469 + }, + { + "epoch": 0.045742092457420926, + "grad_norm": 2.5345930464298885, + "learning_rate": 9.99356191779557e-06, + "loss": 0.4239, + "step": 470 + }, + { + "epoch": 0.04583941605839416, + "grad_norm": 1.3153911718041251, + "learning_rate": 9.993481710681314e-06, + "loss": 0.3454, + "step": 471 + }, + { + "epoch": 0.0459367396593674, + "grad_norm": 2.16208125563947, + "learning_rate": 9.993401007361707e-06, + "loss": 0.5386, + "step": 472 + }, + { + "epoch": 0.04603406326034063, + "grad_norm": 1.8150842593472827, + "learning_rate": 9.993319807844775e-06, + "loss": 0.3077, + "step": 473 + }, + { + "epoch": 0.04613138686131387, + "grad_norm": 1.6656864462678063, + "learning_rate": 9.993238112138584e-06, + "loss": 0.4927, + "step": 474 + }, + { + "epoch": 0.046228710462287104, + "grad_norm": 1.3429917702468868, + "learning_rate": 9.993155920251252e-06, + "loss": 0.2433, + "step": 475 + }, + { + "epoch": 0.04632603406326034, + "grad_norm": 1.3651155739367906, + "learning_rate": 9.993073232190949e-06, + "loss": 0.2947, + "step": 476 + }, + { + "epoch": 0.046423357664233576, + "grad_norm": 1.7815516701613203, + "learning_rate": 9.992990047965887e-06, + "loss": 0.5372, + "step": 477 + }, + { + "epoch": 0.046520681265206816, + "grad_norm": 1.846696342179327, + "learning_rate": 9.992906367584337e-06, + "loss": 0.5127, + "step": 478 + }, + { + "epoch": 0.04661800486618005, + "grad_norm": 1.7511253825578088, + "learning_rate": 9.992822191054612e-06, + "loss": 0.4074, + "step": 479 + }, + { + "epoch": 0.04671532846715328, + "grad_norm": 1.8105635986872588, + "learning_rate": 9.992737518385076e-06, + "loss": 0.4998, + "step": 480 + }, + { + "epoch": 0.04681265206812652, + "grad_norm": 2.2743597617900746, + "learning_rate": 9.992652349584147e-06, + "loss": 0.6249, + "step": 481 + }, + { + "epoch": 0.046909975669099754, + "grad_norm": 1.93948496382319, + "learning_rate": 9.992566684660282e-06, + "loss": 0.5411, + "step": 482 + }, + { + "epoch": 0.047007299270072994, + "grad_norm": 1.4073760716303516, + "learning_rate": 9.992480523621999e-06, + "loss": 0.3506, + "step": 483 + }, + { + "epoch": 0.04710462287104623, + "grad_norm": 1.388293079160528, + "learning_rate": 9.992393866477856e-06, + "loss": 0.3304, + "step": 484 + }, + { + "epoch": 0.04720194647201947, + "grad_norm": 2.082643572745618, + "learning_rate": 9.992306713236467e-06, + "loss": 0.5653, + "step": 485 + }, + { + "epoch": 0.0472992700729927, + "grad_norm": 1.7104664332606834, + "learning_rate": 9.992219063906492e-06, + "loss": 0.3317, + "step": 486 + }, + { + "epoch": 0.04739659367396594, + "grad_norm": 1.7575848919482624, + "learning_rate": 9.992130918496638e-06, + "loss": 0.4109, + "step": 487 + }, + { + "epoch": 0.04749391727493917, + "grad_norm": 1.7351379091271637, + "learning_rate": 9.992042277015668e-06, + "loss": 0.5065, + "step": 488 + }, + { + "epoch": 0.04759124087591241, + "grad_norm": 1.4444570948381004, + "learning_rate": 9.991953139472387e-06, + "loss": 0.4023, + "step": 489 + }, + { + "epoch": 0.047688564476885645, + "grad_norm": 1.4697709289140384, + "learning_rate": 9.991863505875656e-06, + "loss": 0.3364, + "step": 490 + }, + { + "epoch": 0.04778588807785888, + "grad_norm": 1.9428205960506804, + "learning_rate": 9.99177337623438e-06, + "loss": 0.4303, + "step": 491 + }, + { + "epoch": 0.04788321167883212, + "grad_norm": 1.931152158561148, + "learning_rate": 9.991682750557516e-06, + "loss": 0.2857, + "step": 492 + }, + { + "epoch": 0.04798053527980535, + "grad_norm": 1.9301394655308035, + "learning_rate": 9.991591628854067e-06, + "loss": 0.5998, + "step": 493 + }, + { + "epoch": 0.04807785888077859, + "grad_norm": 1.7788293016868693, + "learning_rate": 9.99150001113309e-06, + "loss": 0.4595, + "step": 494 + }, + { + "epoch": 0.04817518248175182, + "grad_norm": 2.0641225732440134, + "learning_rate": 9.99140789740369e-06, + "loss": 0.3848, + "step": 495 + }, + { + "epoch": 0.04827250608272506, + "grad_norm": 2.2832955373527044, + "learning_rate": 9.99131528767502e-06, + "loss": 0.6396, + "step": 496 + }, + { + "epoch": 0.048369829683698295, + "grad_norm": 1.6658790952812916, + "learning_rate": 9.99122218195628e-06, + "loss": 0.5429, + "step": 497 + }, + { + "epoch": 0.048467153284671535, + "grad_norm": 1.6568038302360257, + "learning_rate": 9.991128580256725e-06, + "loss": 0.4532, + "step": 498 + }, + { + "epoch": 0.04856447688564477, + "grad_norm": 1.8451659374514144, + "learning_rate": 9.991034482585656e-06, + "loss": 0.5845, + "step": 499 + }, + { + "epoch": 0.04866180048661801, + "grad_norm": 1.9103948838029656, + "learning_rate": 9.99093988895242e-06, + "loss": 0.5508, + "step": 500 + }, + { + "epoch": 0.04875912408759124, + "grad_norm": 1.9691733858712537, + "learning_rate": 9.990844799366422e-06, + "loss": 0.6374, + "step": 501 + }, + { + "epoch": 0.048856447688564474, + "grad_norm": 2.1278472226161846, + "learning_rate": 9.990749213837108e-06, + "loss": 0.572, + "step": 502 + }, + { + "epoch": 0.04895377128953771, + "grad_norm": 1.9704028865885994, + "learning_rate": 9.990653132373977e-06, + "loss": 0.6282, + "step": 503 + }, + { + "epoch": 0.049051094890510946, + "grad_norm": 1.8965741341561362, + "learning_rate": 9.990556554986577e-06, + "loss": 0.5749, + "step": 504 + }, + { + "epoch": 0.049148418491484186, + "grad_norm": 1.5425018763105707, + "learning_rate": 9.990459481684504e-06, + "loss": 0.4236, + "step": 505 + }, + { + "epoch": 0.04924574209245742, + "grad_norm": 1.736669998068125, + "learning_rate": 9.990361912477405e-06, + "loss": 0.4275, + "step": 506 + }, + { + "epoch": 0.04934306569343066, + "grad_norm": 2.049335776858506, + "learning_rate": 9.990263847374976e-06, + "loss": 0.6897, + "step": 507 + }, + { + "epoch": 0.04944038929440389, + "grad_norm": 1.8544975871268152, + "learning_rate": 9.990165286386961e-06, + "loss": 0.4811, + "step": 508 + }, + { + "epoch": 0.04953771289537713, + "grad_norm": 1.5709178763522822, + "learning_rate": 9.990066229523155e-06, + "loss": 0.4585, + "step": 509 + }, + { + "epoch": 0.049635036496350364, + "grad_norm": 2.1410068811754153, + "learning_rate": 9.989966676793399e-06, + "loss": 0.4773, + "step": 510 + }, + { + "epoch": 0.049732360097323604, + "grad_norm": 1.760724042734433, + "learning_rate": 9.989866628207589e-06, + "loss": 0.3144, + "step": 511 + }, + { + "epoch": 0.04982968369829684, + "grad_norm": 1.8521560168370175, + "learning_rate": 9.989766083775662e-06, + "loss": 0.4656, + "step": 512 + }, + { + "epoch": 0.049927007299270076, + "grad_norm": 1.544987615640627, + "learning_rate": 9.989665043507616e-06, + "loss": 0.4089, + "step": 513 + }, + { + "epoch": 0.05002433090024331, + "grad_norm": 1.9122960249889975, + "learning_rate": 9.989563507413487e-06, + "loss": 0.4535, + "step": 514 + }, + { + "epoch": 0.05012165450121654, + "grad_norm": 1.5187134098621655, + "learning_rate": 9.989461475503363e-06, + "loss": 0.31, + "step": 515 + }, + { + "epoch": 0.05021897810218978, + "grad_norm": 1.562160455050312, + "learning_rate": 9.989358947787389e-06, + "loss": 0.4009, + "step": 516 + }, + { + "epoch": 0.050316301703163015, + "grad_norm": 1.738084966314413, + "learning_rate": 9.989255924275746e-06, + "loss": 0.4723, + "step": 517 + }, + { + "epoch": 0.050413625304136254, + "grad_norm": 2.156580581755068, + "learning_rate": 9.989152404978678e-06, + "loss": 0.4407, + "step": 518 + }, + { + "epoch": 0.05051094890510949, + "grad_norm": 1.8652302207700793, + "learning_rate": 9.989048389906469e-06, + "loss": 0.587, + "step": 519 + }, + { + "epoch": 0.05060827250608273, + "grad_norm": 1.5934369396830426, + "learning_rate": 9.988943879069452e-06, + "loss": 0.3961, + "step": 520 + }, + { + "epoch": 0.05070559610705596, + "grad_norm": 1.4294562647861604, + "learning_rate": 9.988838872478017e-06, + "loss": 0.3382, + "step": 521 + }, + { + "epoch": 0.0508029197080292, + "grad_norm": 1.5693240874435923, + "learning_rate": 9.988733370142598e-06, + "loss": 0.3876, + "step": 522 + }, + { + "epoch": 0.05090024330900243, + "grad_norm": 1.6720738515514542, + "learning_rate": 9.988627372073678e-06, + "loss": 0.448, + "step": 523 + }, + { + "epoch": 0.05099756690997567, + "grad_norm": 2.0438207304961367, + "learning_rate": 9.988520878281787e-06, + "loss": 0.5724, + "step": 524 + }, + { + "epoch": 0.051094890510948905, + "grad_norm": 2.0003463921985456, + "learning_rate": 9.988413888777512e-06, + "loss": 0.4506, + "step": 525 + }, + { + "epoch": 0.05119221411192214, + "grad_norm": 2.11812759304704, + "learning_rate": 9.988306403571482e-06, + "loss": 0.757, + "step": 526 + }, + { + "epoch": 0.05128953771289538, + "grad_norm": 1.5594386055307068, + "learning_rate": 9.98819842267438e-06, + "loss": 0.4145, + "step": 527 + }, + { + "epoch": 0.05138686131386861, + "grad_norm": 1.917978943216931, + "learning_rate": 9.988089946096933e-06, + "loss": 0.5363, + "step": 528 + }, + { + "epoch": 0.05148418491484185, + "grad_norm": 1.3212282063862113, + "learning_rate": 9.987980973849924e-06, + "loss": 0.3132, + "step": 529 + }, + { + "epoch": 0.05158150851581508, + "grad_norm": 1.2285769982465171, + "learning_rate": 9.987871505944177e-06, + "loss": 0.2287, + "step": 530 + }, + { + "epoch": 0.05167883211678832, + "grad_norm": 1.849610792922833, + "learning_rate": 9.987761542390574e-06, + "loss": 0.6487, + "step": 531 + }, + { + "epoch": 0.051776155717761556, + "grad_norm": 1.158461389164102, + "learning_rate": 9.987651083200044e-06, + "loss": 0.2111, + "step": 532 + }, + { + "epoch": 0.051873479318734796, + "grad_norm": 1.8450520976911682, + "learning_rate": 9.987540128383556e-06, + "loss": 0.5579, + "step": 533 + }, + { + "epoch": 0.05197080291970803, + "grad_norm": 1.9047794610871986, + "learning_rate": 9.98742867795214e-06, + "loss": 0.4542, + "step": 534 + }, + { + "epoch": 0.05206812652068127, + "grad_norm": 1.5564676952152843, + "learning_rate": 9.987316731916872e-06, + "loss": 0.4467, + "step": 535 + }, + { + "epoch": 0.0521654501216545, + "grad_norm": 1.403952395827601, + "learning_rate": 9.987204290288876e-06, + "loss": 0.3761, + "step": 536 + }, + { + "epoch": 0.052262773722627734, + "grad_norm": 1.948151749349848, + "learning_rate": 9.987091353079323e-06, + "loss": 0.5782, + "step": 537 + }, + { + "epoch": 0.052360097323600974, + "grad_norm": 1.6211222818460531, + "learning_rate": 9.986977920299437e-06, + "loss": 0.4047, + "step": 538 + }, + { + "epoch": 0.052457420924574207, + "grad_norm": 1.4911900726837217, + "learning_rate": 9.986863991960491e-06, + "loss": 0.3817, + "step": 539 + }, + { + "epoch": 0.052554744525547446, + "grad_norm": 1.530872687739145, + "learning_rate": 9.986749568073804e-06, + "loss": 0.4639, + "step": 540 + }, + { + "epoch": 0.05265206812652068, + "grad_norm": 1.766399180057757, + "learning_rate": 9.986634648650746e-06, + "loss": 0.5132, + "step": 541 + }, + { + "epoch": 0.05274939172749392, + "grad_norm": 1.7318370911583716, + "learning_rate": 9.98651923370274e-06, + "loss": 0.5845, + "step": 542 + }, + { + "epoch": 0.05284671532846715, + "grad_norm": 1.4523428175637472, + "learning_rate": 9.986403323241252e-06, + "loss": 0.3817, + "step": 543 + }, + { + "epoch": 0.05294403892944039, + "grad_norm": 1.3085205057626972, + "learning_rate": 9.9862869172778e-06, + "loss": 0.294, + "step": 544 + }, + { + "epoch": 0.053041362530413624, + "grad_norm": 1.749260064779093, + "learning_rate": 9.986170015823953e-06, + "loss": 0.3885, + "step": 545 + }, + { + "epoch": 0.053138686131386864, + "grad_norm": 1.9224820302612053, + "learning_rate": 9.986052618891326e-06, + "loss": 0.5841, + "step": 546 + }, + { + "epoch": 0.0532360097323601, + "grad_norm": 1.6019594770490224, + "learning_rate": 9.985934726491587e-06, + "loss": 0.5602, + "step": 547 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 1.63788543125369, + "learning_rate": 9.98581633863645e-06, + "loss": 0.4913, + "step": 548 + }, + { + "epoch": 0.05343065693430657, + "grad_norm": 1.7751230304686407, + "learning_rate": 9.985697455337677e-06, + "loss": 0.4575, + "step": 549 + }, + { + "epoch": 0.0535279805352798, + "grad_norm": 1.4813830287768246, + "learning_rate": 9.985578076607086e-06, + "loss": 0.2811, + "step": 550 + }, + { + "epoch": 0.05362530413625304, + "grad_norm": 1.8047180833743464, + "learning_rate": 9.985458202456534e-06, + "loss": 0.5564, + "step": 551 + }, + { + "epoch": 0.053722627737226275, + "grad_norm": 1.4776771818705197, + "learning_rate": 9.985337832897938e-06, + "loss": 0.2842, + "step": 552 + }, + { + "epoch": 0.053819951338199515, + "grad_norm": 1.800973083472876, + "learning_rate": 9.985216967943256e-06, + "loss": 0.4017, + "step": 553 + }, + { + "epoch": 0.05391727493917275, + "grad_norm": 1.4167019147788764, + "learning_rate": 9.985095607604502e-06, + "loss": 0.2676, + "step": 554 + }, + { + "epoch": 0.05401459854014599, + "grad_norm": 1.462279330828973, + "learning_rate": 9.984973751893732e-06, + "loss": 0.342, + "step": 555 + }, + { + "epoch": 0.05411192214111922, + "grad_norm": 1.7941608662857766, + "learning_rate": 9.984851400823056e-06, + "loss": 0.4851, + "step": 556 + }, + { + "epoch": 0.05420924574209246, + "grad_norm": 1.865163176610701, + "learning_rate": 9.984728554404632e-06, + "loss": 0.5938, + "step": 557 + }, + { + "epoch": 0.05430656934306569, + "grad_norm": 1.9578700904261006, + "learning_rate": 9.984605212650669e-06, + "loss": 0.5846, + "step": 558 + }, + { + "epoch": 0.05440389294403893, + "grad_norm": 1.7615345522382602, + "learning_rate": 9.98448137557342e-06, + "loss": 0.5517, + "step": 559 + }, + { + "epoch": 0.054501216545012166, + "grad_norm": 1.7987507193579173, + "learning_rate": 9.984357043185195e-06, + "loss": 0.4511, + "step": 560 + }, + { + "epoch": 0.0545985401459854, + "grad_norm": 1.8966136067258859, + "learning_rate": 9.984232215498347e-06, + "loss": 0.3339, + "step": 561 + }, + { + "epoch": 0.05469586374695864, + "grad_norm": 1.760439118311743, + "learning_rate": 9.98410689252528e-06, + "loss": 0.4797, + "step": 562 + }, + { + "epoch": 0.05479318734793187, + "grad_norm": 1.7467534741216573, + "learning_rate": 9.983981074278448e-06, + "loss": 0.3854, + "step": 563 + }, + { + "epoch": 0.05489051094890511, + "grad_norm": 1.638747457914032, + "learning_rate": 9.983854760770353e-06, + "loss": 0.3215, + "step": 564 + }, + { + "epoch": 0.054987834549878344, + "grad_norm": 1.565721167011275, + "learning_rate": 9.983727952013546e-06, + "loss": 0.3573, + "step": 565 + }, + { + "epoch": 0.05508515815085158, + "grad_norm": 1.819373023432736, + "learning_rate": 9.98360064802063e-06, + "loss": 0.304, + "step": 566 + }, + { + "epoch": 0.055182481751824816, + "grad_norm": 2.219648367380945, + "learning_rate": 9.983472848804254e-06, + "loss": 0.7398, + "step": 567 + }, + { + "epoch": 0.055279805352798056, + "grad_norm": 1.7935096739228122, + "learning_rate": 9.98334455437712e-06, + "loss": 0.3257, + "step": 568 + }, + { + "epoch": 0.05537712895377129, + "grad_norm": 2.085379879601924, + "learning_rate": 9.983215764751971e-06, + "loss": 0.3477, + "step": 569 + }, + { + "epoch": 0.05547445255474453, + "grad_norm": 1.528881264990704, + "learning_rate": 9.98308647994161e-06, + "loss": 0.4173, + "step": 570 + }, + { + "epoch": 0.05557177615571776, + "grad_norm": 1.282510416609492, + "learning_rate": 9.982956699958883e-06, + "loss": 0.3513, + "step": 571 + }, + { + "epoch": 0.055669099756690994, + "grad_norm": 1.6035600811723405, + "learning_rate": 9.982826424816688e-06, + "loss": 0.3318, + "step": 572 + }, + { + "epoch": 0.055766423357664234, + "grad_norm": 1.9455996381881653, + "learning_rate": 9.982695654527966e-06, + "loss": 0.4991, + "step": 573 + }, + { + "epoch": 0.05586374695863747, + "grad_norm": 1.8397262762514839, + "learning_rate": 9.982564389105714e-06, + "loss": 0.345, + "step": 574 + }, + { + "epoch": 0.05596107055961071, + "grad_norm": 1.7997461351876956, + "learning_rate": 9.982432628562978e-06, + "loss": 0.5384, + "step": 575 + }, + { + "epoch": 0.05605839416058394, + "grad_norm": 1.6246101205121968, + "learning_rate": 9.982300372912848e-06, + "loss": 0.5499, + "step": 576 + }, + { + "epoch": 0.05615571776155718, + "grad_norm": 1.9184631207748861, + "learning_rate": 9.982167622168467e-06, + "loss": 0.449, + "step": 577 + }, + { + "epoch": 0.05625304136253041, + "grad_norm": 1.5368079698239796, + "learning_rate": 9.982034376343029e-06, + "loss": 0.3311, + "step": 578 + }, + { + "epoch": 0.05635036496350365, + "grad_norm": 1.9061539422519105, + "learning_rate": 9.98190063544977e-06, + "loss": 0.4182, + "step": 579 + }, + { + "epoch": 0.056447688564476885, + "grad_norm": 1.6727227174184238, + "learning_rate": 9.981766399501984e-06, + "loss": 0.482, + "step": 580 + }, + { + "epoch": 0.056545012165450124, + "grad_norm": 1.8546055763617424, + "learning_rate": 9.98163166851301e-06, + "loss": 0.5758, + "step": 581 + }, + { + "epoch": 0.05664233576642336, + "grad_norm": 2.0350303098403706, + "learning_rate": 9.981496442496234e-06, + "loss": 0.5236, + "step": 582 + }, + { + "epoch": 0.0567396593673966, + "grad_norm": 1.3907379790284926, + "learning_rate": 9.981360721465095e-06, + "loss": 0.3375, + "step": 583 + }, + { + "epoch": 0.05683698296836983, + "grad_norm": 2.0168702766261486, + "learning_rate": 9.98122450543308e-06, + "loss": 0.595, + "step": 584 + }, + { + "epoch": 0.05693430656934306, + "grad_norm": 1.7248754760467295, + "learning_rate": 9.981087794413722e-06, + "loss": 0.3747, + "step": 585 + }, + { + "epoch": 0.0570316301703163, + "grad_norm": 1.8918865818240052, + "learning_rate": 9.98095058842061e-06, + "loss": 0.5805, + "step": 586 + }, + { + "epoch": 0.057128953771289535, + "grad_norm": 1.8691153689026438, + "learning_rate": 9.980812887467377e-06, + "loss": 0.3451, + "step": 587 + }, + { + "epoch": 0.057226277372262775, + "grad_norm": 1.7475224395533677, + "learning_rate": 9.980674691567705e-06, + "loss": 0.2789, + "step": 588 + }, + { + "epoch": 0.05732360097323601, + "grad_norm": 1.876124489873064, + "learning_rate": 9.980536000735328e-06, + "loss": 0.5917, + "step": 589 + }, + { + "epoch": 0.05742092457420925, + "grad_norm": 1.6438847446693803, + "learning_rate": 9.980396814984025e-06, + "loss": 0.3063, + "step": 590 + }, + { + "epoch": 0.05751824817518248, + "grad_norm": 1.7609146888426583, + "learning_rate": 9.980257134327634e-06, + "loss": 0.4177, + "step": 591 + }, + { + "epoch": 0.05761557177615572, + "grad_norm": 3.1047413099950445, + "learning_rate": 9.980116958780027e-06, + "loss": 0.2793, + "step": 592 + }, + { + "epoch": 0.05771289537712895, + "grad_norm": 1.3365913263494138, + "learning_rate": 9.979976288355137e-06, + "loss": 0.2754, + "step": 593 + }, + { + "epoch": 0.05781021897810219, + "grad_norm": 1.7378721977452198, + "learning_rate": 9.979835123066943e-06, + "loss": 0.4156, + "step": 594 + }, + { + "epoch": 0.057907542579075426, + "grad_norm": 1.7652517953930271, + "learning_rate": 9.979693462929472e-06, + "loss": 0.3768, + "step": 595 + }, + { + "epoch": 0.05800486618004866, + "grad_norm": 2.4155692425963675, + "learning_rate": 9.979551307956801e-06, + "loss": 0.6409, + "step": 596 + }, + { + "epoch": 0.0581021897810219, + "grad_norm": 2.2339995809091913, + "learning_rate": 9.979408658163055e-06, + "loss": 0.3134, + "step": 597 + }, + { + "epoch": 0.05819951338199513, + "grad_norm": 1.9788468018769068, + "learning_rate": 9.97926551356241e-06, + "loss": 0.2509, + "step": 598 + }, + { + "epoch": 0.05829683698296837, + "grad_norm": 4.0668515887714385, + "learning_rate": 9.979121874169091e-06, + "loss": 0.3322, + "step": 599 + }, + { + "epoch": 0.058394160583941604, + "grad_norm": 2.0552497355613264, + "learning_rate": 9.97897773999737e-06, + "loss": 0.2732, + "step": 600 + }, + { + "epoch": 0.058491484184914844, + "grad_norm": 1.7372746328291984, + "learning_rate": 9.978833111061573e-06, + "loss": 0.3021, + "step": 601 + }, + { + "epoch": 0.058588807785888077, + "grad_norm": 1.8426989129405926, + "learning_rate": 9.978687987376067e-06, + "loss": 0.3147, + "step": 602 + }, + { + "epoch": 0.058686131386861316, + "grad_norm": 1.456816302033054, + "learning_rate": 9.978542368955278e-06, + "loss": 0.3669, + "step": 603 + }, + { + "epoch": 0.05878345498783455, + "grad_norm": 2.1398878847147973, + "learning_rate": 9.978396255813672e-06, + "loss": 0.457, + "step": 604 + }, + { + "epoch": 0.05888077858880779, + "grad_norm": 1.860652260495742, + "learning_rate": 9.978249647965769e-06, + "loss": 0.5567, + "step": 605 + }, + { + "epoch": 0.05897810218978102, + "grad_norm": 1.7559525207322428, + "learning_rate": 9.97810254542614e-06, + "loss": 0.439, + "step": 606 + }, + { + "epoch": 0.059075425790754255, + "grad_norm": 1.4912680944094816, + "learning_rate": 9.977954948209402e-06, + "loss": 0.4431, + "step": 607 + }, + { + "epoch": 0.059172749391727494, + "grad_norm": 1.766690700595448, + "learning_rate": 9.97780685633022e-06, + "loss": 0.3187, + "step": 608 + }, + { + "epoch": 0.05927007299270073, + "grad_norm": 2.169180646458804, + "learning_rate": 9.977658269803312e-06, + "loss": 0.5042, + "step": 609 + }, + { + "epoch": 0.05936739659367397, + "grad_norm": 1.623119439845207, + "learning_rate": 9.977509188643441e-06, + "loss": 0.3632, + "step": 610 + }, + { + "epoch": 0.0594647201946472, + "grad_norm": 2.0976883017366226, + "learning_rate": 9.977359612865424e-06, + "loss": 0.6465, + "step": 611 + }, + { + "epoch": 0.05956204379562044, + "grad_norm": 1.59126192242755, + "learning_rate": 9.977209542484123e-06, + "loss": 0.4335, + "step": 612 + }, + { + "epoch": 0.05965936739659367, + "grad_norm": 1.6532378246551842, + "learning_rate": 9.97705897751445e-06, + "loss": 0.3462, + "step": 613 + }, + { + "epoch": 0.05975669099756691, + "grad_norm": 1.6478059833585124, + "learning_rate": 9.976907917971365e-06, + "loss": 0.4063, + "step": 614 + }, + { + "epoch": 0.059854014598540145, + "grad_norm": 1.750559308727237, + "learning_rate": 9.976756363869884e-06, + "loss": 0.5062, + "step": 615 + }, + { + "epoch": 0.059951338199513385, + "grad_norm": 1.6400113365898012, + "learning_rate": 9.976604315225063e-06, + "loss": 0.3699, + "step": 616 + }, + { + "epoch": 0.06004866180048662, + "grad_norm": 1.5449283565959169, + "learning_rate": 9.976451772052013e-06, + "loss": 0.3635, + "step": 617 + }, + { + "epoch": 0.06014598540145986, + "grad_norm": 1.3799772345005799, + "learning_rate": 9.97629873436589e-06, + "loss": 0.2747, + "step": 618 + }, + { + "epoch": 0.06024330900243309, + "grad_norm": 1.9454941262632244, + "learning_rate": 9.976145202181905e-06, + "loss": 0.4963, + "step": 619 + }, + { + "epoch": 0.06034063260340632, + "grad_norm": 1.5274916477255973, + "learning_rate": 9.975991175515311e-06, + "loss": 0.3348, + "step": 620 + }, + { + "epoch": 0.06043795620437956, + "grad_norm": 1.9623540496009142, + "learning_rate": 9.975836654381416e-06, + "loss": 0.5373, + "step": 621 + }, + { + "epoch": 0.060535279805352796, + "grad_norm": 1.4248144765181632, + "learning_rate": 9.975681638795575e-06, + "loss": 0.3137, + "step": 622 + }, + { + "epoch": 0.060632603406326036, + "grad_norm": 1.4366236793713136, + "learning_rate": 9.975526128773192e-06, + "loss": 0.3519, + "step": 623 + }, + { + "epoch": 0.06072992700729927, + "grad_norm": 1.8458441140553945, + "learning_rate": 9.97537012432972e-06, + "loss": 0.3937, + "step": 624 + }, + { + "epoch": 0.06082725060827251, + "grad_norm": 1.868271580826056, + "learning_rate": 9.975213625480658e-06, + "loss": 0.4567, + "step": 625 + }, + { + "epoch": 0.06092457420924574, + "grad_norm": 2.4613001964869223, + "learning_rate": 9.97505663224156e-06, + "loss": 0.5607, + "step": 626 + }, + { + "epoch": 0.06102189781021898, + "grad_norm": 1.6709839772769468, + "learning_rate": 9.974899144628027e-06, + "loss": 0.3233, + "step": 627 + }, + { + "epoch": 0.061119221411192214, + "grad_norm": 1.8046591620263965, + "learning_rate": 9.97474116265571e-06, + "loss": 0.3929, + "step": 628 + }, + { + "epoch": 0.06121654501216545, + "grad_norm": 1.7182161369033975, + "learning_rate": 9.974582686340304e-06, + "loss": 0.3804, + "step": 629 + }, + { + "epoch": 0.061313868613138686, + "grad_norm": 2.435940855169524, + "learning_rate": 9.974423715697558e-06, + "loss": 0.7453, + "step": 630 + }, + { + "epoch": 0.06141119221411192, + "grad_norm": 1.401143104634322, + "learning_rate": 9.974264250743272e-06, + "loss": 0.306, + "step": 631 + }, + { + "epoch": 0.06150851581508516, + "grad_norm": 1.540550326071636, + "learning_rate": 9.97410429149329e-06, + "loss": 0.3582, + "step": 632 + }, + { + "epoch": 0.06160583941605839, + "grad_norm": 4.038520112503673, + "learning_rate": 9.973943837963507e-06, + "loss": 0.2688, + "step": 633 + }, + { + "epoch": 0.06170316301703163, + "grad_norm": 2.032927304778425, + "learning_rate": 9.973782890169867e-06, + "loss": 0.6952, + "step": 634 + }, + { + "epoch": 0.061800486618004864, + "grad_norm": 1.5242884680104736, + "learning_rate": 9.973621448128364e-06, + "loss": 0.3957, + "step": 635 + }, + { + "epoch": 0.061897810218978104, + "grad_norm": 1.599953340803732, + "learning_rate": 9.973459511855042e-06, + "loss": 0.3783, + "step": 636 + }, + { + "epoch": 0.06199513381995134, + "grad_norm": 2.1886899708740697, + "learning_rate": 9.973297081365988e-06, + "loss": 0.5426, + "step": 637 + }, + { + "epoch": 0.06209245742092458, + "grad_norm": 1.363421719809718, + "learning_rate": 9.973134156677349e-06, + "loss": 0.2707, + "step": 638 + }, + { + "epoch": 0.06218978102189781, + "grad_norm": 1.883218491971664, + "learning_rate": 9.972970737805312e-06, + "loss": 0.543, + "step": 639 + }, + { + "epoch": 0.06228710462287105, + "grad_norm": 1.6336178778276322, + "learning_rate": 9.972806824766117e-06, + "loss": 0.4833, + "step": 640 + }, + { + "epoch": 0.06238442822384428, + "grad_norm": 1.74145478719615, + "learning_rate": 9.972642417576049e-06, + "loss": 0.5456, + "step": 641 + }, + { + "epoch": 0.062481751824817515, + "grad_norm": 1.3939447959630629, + "learning_rate": 9.972477516251448e-06, + "loss": 0.2935, + "step": 642 + }, + { + "epoch": 0.06257907542579075, + "grad_norm": 1.9741261661680443, + "learning_rate": 9.9723121208087e-06, + "loss": 0.4377, + "step": 643 + }, + { + "epoch": 0.06267639902676399, + "grad_norm": 2.214700253529172, + "learning_rate": 9.972146231264242e-06, + "loss": 0.6711, + "step": 644 + }, + { + "epoch": 0.06277372262773723, + "grad_norm": 1.7399845992974294, + "learning_rate": 9.971979847634554e-06, + "loss": 0.5327, + "step": 645 + }, + { + "epoch": 0.06287104622871047, + "grad_norm": 1.3552365502663122, + "learning_rate": 9.971812969936174e-06, + "loss": 0.3553, + "step": 646 + }, + { + "epoch": 0.06296836982968369, + "grad_norm": 1.8378075997453163, + "learning_rate": 9.971645598185685e-06, + "loss": 0.3709, + "step": 647 + }, + { + "epoch": 0.06306569343065693, + "grad_norm": 1.7441350204189767, + "learning_rate": 9.971477732399714e-06, + "loss": 0.489, + "step": 648 + }, + { + "epoch": 0.06316301703163017, + "grad_norm": 2.083031963167252, + "learning_rate": 9.971309372594947e-06, + "loss": 0.6196, + "step": 649 + }, + { + "epoch": 0.06326034063260341, + "grad_norm": 1.5678236487001533, + "learning_rate": 9.971140518788112e-06, + "loss": 0.3202, + "step": 650 + }, + { + "epoch": 0.06335766423357664, + "grad_norm": 1.7281008810115812, + "learning_rate": 9.970971170995988e-06, + "loss": 0.4169, + "step": 651 + }, + { + "epoch": 0.06345498783454988, + "grad_norm": 1.5626981990993993, + "learning_rate": 9.970801329235402e-06, + "loss": 0.4238, + "step": 652 + }, + { + "epoch": 0.06355231143552312, + "grad_norm": 1.5338214380715702, + "learning_rate": 9.970630993523234e-06, + "loss": 0.278, + "step": 653 + }, + { + "epoch": 0.06364963503649634, + "grad_norm": 1.7806299033721755, + "learning_rate": 9.970460163876409e-06, + "loss": 0.5649, + "step": 654 + }, + { + "epoch": 0.06374695863746958, + "grad_norm": 1.9349681554929028, + "learning_rate": 9.9702888403119e-06, + "loss": 0.3297, + "step": 655 + }, + { + "epoch": 0.06384428223844282, + "grad_norm": 1.4947723050696704, + "learning_rate": 9.970117022846736e-06, + "loss": 0.4077, + "step": 656 + }, + { + "epoch": 0.06394160583941606, + "grad_norm": 1.5696774237596223, + "learning_rate": 9.96994471149799e-06, + "loss": 0.4681, + "step": 657 + }, + { + "epoch": 0.06403892944038929, + "grad_norm": 1.7662095984112474, + "learning_rate": 9.969771906282781e-06, + "loss": 0.539, + "step": 658 + }, + { + "epoch": 0.06413625304136253, + "grad_norm": 2.926336951253308, + "learning_rate": 9.969598607218285e-06, + "loss": 0.4196, + "step": 659 + }, + { + "epoch": 0.06423357664233577, + "grad_norm": 3.148192138198314, + "learning_rate": 9.96942481432172e-06, + "loss": 0.4827, + "step": 660 + }, + { + "epoch": 0.06433090024330901, + "grad_norm": 1.790436662552377, + "learning_rate": 9.969250527610356e-06, + "loss": 0.4972, + "step": 661 + }, + { + "epoch": 0.06442822384428223, + "grad_norm": 1.4712739725679773, + "learning_rate": 9.969075747101514e-06, + "loss": 0.4112, + "step": 662 + }, + { + "epoch": 0.06452554744525547, + "grad_norm": 1.4521996617982842, + "learning_rate": 9.96890047281256e-06, + "loss": 0.3729, + "step": 663 + }, + { + "epoch": 0.06462287104622871, + "grad_norm": 1.5457088814513262, + "learning_rate": 9.96872470476091e-06, + "loss": 0.4294, + "step": 664 + }, + { + "epoch": 0.06472019464720194, + "grad_norm": 1.7644033340951866, + "learning_rate": 9.968548442964034e-06, + "loss": 0.4487, + "step": 665 + }, + { + "epoch": 0.06481751824817518, + "grad_norm": 1.632555708701406, + "learning_rate": 9.968371687439446e-06, + "loss": 0.3929, + "step": 666 + }, + { + "epoch": 0.06491484184914842, + "grad_norm": 1.8990302396780172, + "learning_rate": 9.968194438204708e-06, + "loss": 0.4101, + "step": 667 + }, + { + "epoch": 0.06501216545012166, + "grad_norm": 2.092762728551112, + "learning_rate": 9.968016695277436e-06, + "loss": 0.5712, + "step": 668 + }, + { + "epoch": 0.06510948905109488, + "grad_norm": 1.5876668887386824, + "learning_rate": 9.967838458675292e-06, + "loss": 0.494, + "step": 669 + }, + { + "epoch": 0.06520681265206812, + "grad_norm": 1.7536517597940893, + "learning_rate": 9.967659728415985e-06, + "loss": 0.6121, + "step": 670 + }, + { + "epoch": 0.06530413625304136, + "grad_norm": 1.9021294255711243, + "learning_rate": 9.96748050451728e-06, + "loss": 0.3634, + "step": 671 + }, + { + "epoch": 0.0654014598540146, + "grad_norm": 1.4457078547633553, + "learning_rate": 9.96730078699698e-06, + "loss": 0.4586, + "step": 672 + }, + { + "epoch": 0.06549878345498783, + "grad_norm": 1.6474950184261972, + "learning_rate": 9.967120575872952e-06, + "loss": 0.5028, + "step": 673 + }, + { + "epoch": 0.06559610705596107, + "grad_norm": 1.9901979572232373, + "learning_rate": 9.966939871163098e-06, + "loss": 0.6986, + "step": 674 + }, + { + "epoch": 0.06569343065693431, + "grad_norm": 1.3671458210722949, + "learning_rate": 9.966758672885375e-06, + "loss": 0.3945, + "step": 675 + }, + { + "epoch": 0.06579075425790755, + "grad_norm": 1.8371332697903162, + "learning_rate": 9.96657698105779e-06, + "loss": 0.6782, + "step": 676 + }, + { + "epoch": 0.06588807785888078, + "grad_norm": 1.1955013749239556, + "learning_rate": 9.966394795698397e-06, + "loss": 0.242, + "step": 677 + }, + { + "epoch": 0.06598540145985402, + "grad_norm": 1.5330975344313047, + "learning_rate": 9.966212116825302e-06, + "loss": 0.4351, + "step": 678 + }, + { + "epoch": 0.06608272506082725, + "grad_norm": 1.539581985713935, + "learning_rate": 9.966028944456657e-06, + "loss": 0.3512, + "step": 679 + }, + { + "epoch": 0.06618004866180048, + "grad_norm": 1.9573455375443363, + "learning_rate": 9.965845278610661e-06, + "loss": 0.4859, + "step": 680 + }, + { + "epoch": 0.06627737226277372, + "grad_norm": 1.8387055004344444, + "learning_rate": 9.96566111930557e-06, + "loss": 0.3831, + "step": 681 + }, + { + "epoch": 0.06637469586374696, + "grad_norm": 1.7056154014174738, + "learning_rate": 9.96547646655968e-06, + "loss": 0.4675, + "step": 682 + }, + { + "epoch": 0.0664720194647202, + "grad_norm": 1.881602931580563, + "learning_rate": 9.965291320391342e-06, + "loss": 0.5955, + "step": 683 + }, + { + "epoch": 0.06656934306569343, + "grad_norm": 2.9885065529853416, + "learning_rate": 9.965105680818955e-06, + "loss": 0.393, + "step": 684 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 1.7363492709096229, + "learning_rate": 9.964919547860963e-06, + "loss": 0.4903, + "step": 685 + }, + { + "epoch": 0.0667639902676399, + "grad_norm": 1.8182376939684146, + "learning_rate": 9.964732921535863e-06, + "loss": 0.5443, + "step": 686 + }, + { + "epoch": 0.06686131386861315, + "grad_norm": 1.6914779965026407, + "learning_rate": 9.964545801862202e-06, + "loss": 0.5119, + "step": 687 + }, + { + "epoch": 0.06695863746958637, + "grad_norm": 1.2736843314571082, + "learning_rate": 9.964358188858573e-06, + "loss": 0.2495, + "step": 688 + }, + { + "epoch": 0.06705596107055961, + "grad_norm": 1.5831736266585599, + "learning_rate": 9.96417008254362e-06, + "loss": 0.4489, + "step": 689 + }, + { + "epoch": 0.06715328467153285, + "grad_norm": 2.2148297560046224, + "learning_rate": 9.963981482936034e-06, + "loss": 0.5415, + "step": 690 + }, + { + "epoch": 0.06725060827250608, + "grad_norm": 1.5025934211262992, + "learning_rate": 9.963792390054558e-06, + "loss": 0.3903, + "step": 691 + }, + { + "epoch": 0.06734793187347932, + "grad_norm": 1.4602374679322867, + "learning_rate": 9.96360280391798e-06, + "loss": 0.3199, + "step": 692 + }, + { + "epoch": 0.06744525547445256, + "grad_norm": 1.5813416284844282, + "learning_rate": 9.963412724545142e-06, + "loss": 0.3213, + "step": 693 + }, + { + "epoch": 0.0675425790754258, + "grad_norm": 1.246883512769049, + "learning_rate": 9.96322215195493e-06, + "loss": 0.2644, + "step": 694 + }, + { + "epoch": 0.06763990267639902, + "grad_norm": 1.7094335347253355, + "learning_rate": 9.963031086166282e-06, + "loss": 0.4761, + "step": 695 + }, + { + "epoch": 0.06773722627737226, + "grad_norm": 1.6516611118524773, + "learning_rate": 9.962839527198184e-06, + "loss": 0.4823, + "step": 696 + }, + { + "epoch": 0.0678345498783455, + "grad_norm": 1.3531669839243998, + "learning_rate": 9.962647475069672e-06, + "loss": 0.4272, + "step": 697 + }, + { + "epoch": 0.06793187347931874, + "grad_norm": 1.9430916606586504, + "learning_rate": 9.962454929799829e-06, + "loss": 0.5776, + "step": 698 + }, + { + "epoch": 0.06802919708029197, + "grad_norm": 1.8772536403383466, + "learning_rate": 9.962261891407792e-06, + "loss": 0.6338, + "step": 699 + }, + { + "epoch": 0.0681265206812652, + "grad_norm": 1.3972932620324034, + "learning_rate": 9.96206835991274e-06, + "loss": 0.3671, + "step": 700 + }, + { + "epoch": 0.06822384428223845, + "grad_norm": 1.287329601381866, + "learning_rate": 9.961874335333904e-06, + "loss": 0.2744, + "step": 701 + }, + { + "epoch": 0.06832116788321167, + "grad_norm": 1.5600519457751545, + "learning_rate": 9.961679817690566e-06, + "loss": 0.4433, + "step": 702 + }, + { + "epoch": 0.06841849148418491, + "grad_norm": 1.3898736874388666, + "learning_rate": 9.961484807002056e-06, + "loss": 0.4197, + "step": 703 + }, + { + "epoch": 0.06851581508515815, + "grad_norm": 1.672202746628868, + "learning_rate": 9.961289303287749e-06, + "loss": 0.4601, + "step": 704 + }, + { + "epoch": 0.06861313868613139, + "grad_norm": 1.7427655274680753, + "learning_rate": 9.961093306567076e-06, + "loss": 0.5845, + "step": 705 + }, + { + "epoch": 0.06871046228710462, + "grad_norm": 1.794570108008766, + "learning_rate": 9.960896816859512e-06, + "loss": 0.3459, + "step": 706 + }, + { + "epoch": 0.06880778588807786, + "grad_norm": 1.6024314197975584, + "learning_rate": 9.960699834184582e-06, + "loss": 0.4441, + "step": 707 + }, + { + "epoch": 0.0689051094890511, + "grad_norm": 1.619306935418848, + "learning_rate": 9.960502358561858e-06, + "loss": 0.4647, + "step": 708 + }, + { + "epoch": 0.06900243309002434, + "grad_norm": 1.5009190604836247, + "learning_rate": 9.960304390010968e-06, + "loss": 0.373, + "step": 709 + }, + { + "epoch": 0.06909975669099756, + "grad_norm": 1.8613999824223078, + "learning_rate": 9.960105928551583e-06, + "loss": 0.3926, + "step": 710 + }, + { + "epoch": 0.0691970802919708, + "grad_norm": 2.8907340364253757, + "learning_rate": 9.959906974203422e-06, + "loss": 0.5451, + "step": 711 + }, + { + "epoch": 0.06929440389294404, + "grad_norm": 1.826374356881247, + "learning_rate": 9.959707526986256e-06, + "loss": 0.4341, + "step": 712 + }, + { + "epoch": 0.06939172749391727, + "grad_norm": 2.5001373253299133, + "learning_rate": 9.959507586919903e-06, + "loss": 0.6643, + "step": 713 + }, + { + "epoch": 0.06948905109489051, + "grad_norm": 1.769427365923108, + "learning_rate": 9.959307154024234e-06, + "loss": 0.5431, + "step": 714 + }, + { + "epoch": 0.06958637469586375, + "grad_norm": 2.3285358245695322, + "learning_rate": 9.959106228319166e-06, + "loss": 0.5274, + "step": 715 + }, + { + "epoch": 0.06968369829683699, + "grad_norm": 1.4070234926508725, + "learning_rate": 9.958904809824663e-06, + "loss": 0.3257, + "step": 716 + }, + { + "epoch": 0.06978102189781021, + "grad_norm": 1.9284568290872997, + "learning_rate": 9.958702898560742e-06, + "loss": 0.5648, + "step": 717 + }, + { + "epoch": 0.06987834549878345, + "grad_norm": 2.092543866644565, + "learning_rate": 9.958500494547465e-06, + "loss": 0.6256, + "step": 718 + }, + { + "epoch": 0.0699756690997567, + "grad_norm": 1.5948763588365042, + "learning_rate": 9.958297597804947e-06, + "loss": 0.4011, + "step": 719 + }, + { + "epoch": 0.07007299270072993, + "grad_norm": 1.2246362905267065, + "learning_rate": 9.958094208353348e-06, + "loss": 0.2444, + "step": 720 + }, + { + "epoch": 0.07017031630170316, + "grad_norm": 1.2302916868666773, + "learning_rate": 9.95789032621288e-06, + "loss": 0.3191, + "step": 721 + }, + { + "epoch": 0.0702676399026764, + "grad_norm": 1.5504396768673763, + "learning_rate": 9.957685951403803e-06, + "loss": 0.3112, + "step": 722 + }, + { + "epoch": 0.07036496350364964, + "grad_norm": 2.1205819146422438, + "learning_rate": 9.957481083946427e-06, + "loss": 0.3453, + "step": 723 + }, + { + "epoch": 0.07046228710462286, + "grad_norm": 2.048519725880563, + "learning_rate": 9.957275723861108e-06, + "loss": 0.5266, + "step": 724 + }, + { + "epoch": 0.0705596107055961, + "grad_norm": 1.4453693275620771, + "learning_rate": 9.957069871168253e-06, + "loss": 0.3082, + "step": 725 + }, + { + "epoch": 0.07065693430656934, + "grad_norm": 1.8824931146868138, + "learning_rate": 9.956863525888318e-06, + "loss": 0.588, + "step": 726 + }, + { + "epoch": 0.07075425790754258, + "grad_norm": 1.6143333569692804, + "learning_rate": 9.956656688041807e-06, + "loss": 0.4126, + "step": 727 + }, + { + "epoch": 0.07085158150851581, + "grad_norm": 1.7905307392122496, + "learning_rate": 9.956449357649276e-06, + "loss": 0.521, + "step": 728 + }, + { + "epoch": 0.07094890510948905, + "grad_norm": 1.3295021098228834, + "learning_rate": 9.956241534731325e-06, + "loss": 0.31, + "step": 729 + }, + { + "epoch": 0.07104622871046229, + "grad_norm": 1.5783278835300563, + "learning_rate": 9.956033219308607e-06, + "loss": 0.3091, + "step": 730 + }, + { + "epoch": 0.07114355231143553, + "grad_norm": 1.9905003004076265, + "learning_rate": 9.955824411401822e-06, + "loss": 0.3843, + "step": 731 + }, + { + "epoch": 0.07124087591240875, + "grad_norm": 1.7644558301646922, + "learning_rate": 9.955615111031717e-06, + "loss": 0.4288, + "step": 732 + }, + { + "epoch": 0.071338199513382, + "grad_norm": 1.5922207695027908, + "learning_rate": 9.955405318219096e-06, + "loss": 0.4767, + "step": 733 + }, + { + "epoch": 0.07143552311435523, + "grad_norm": 1.7054240956141933, + "learning_rate": 9.955195032984798e-06, + "loss": 0.4082, + "step": 734 + }, + { + "epoch": 0.07153284671532846, + "grad_norm": 1.3954970063738148, + "learning_rate": 9.954984255349729e-06, + "loss": 0.318, + "step": 735 + }, + { + "epoch": 0.0716301703163017, + "grad_norm": 1.7287069268697828, + "learning_rate": 9.954772985334825e-06, + "loss": 0.4998, + "step": 736 + }, + { + "epoch": 0.07172749391727494, + "grad_norm": 1.4535895804720915, + "learning_rate": 9.954561222961086e-06, + "loss": 0.2489, + "step": 737 + }, + { + "epoch": 0.07182481751824818, + "grad_norm": 1.7113518757446542, + "learning_rate": 9.954348968249552e-06, + "loss": 0.4578, + "step": 738 + }, + { + "epoch": 0.0719221411192214, + "grad_norm": 1.6741613993254088, + "learning_rate": 9.954136221221316e-06, + "loss": 0.4907, + "step": 739 + }, + { + "epoch": 0.07201946472019465, + "grad_norm": 1.590982465166657, + "learning_rate": 9.95392298189752e-06, + "loss": 0.4116, + "step": 740 + }, + { + "epoch": 0.07211678832116789, + "grad_norm": 1.422974716648181, + "learning_rate": 9.953709250299351e-06, + "loss": 0.3501, + "step": 741 + }, + { + "epoch": 0.07221411192214112, + "grad_norm": 1.8424007198547667, + "learning_rate": 9.953495026448048e-06, + "loss": 0.5647, + "step": 742 + }, + { + "epoch": 0.07231143552311435, + "grad_norm": 1.6572484299897867, + "learning_rate": 9.953280310364902e-06, + "loss": 0.3937, + "step": 743 + }, + { + "epoch": 0.07240875912408759, + "grad_norm": 1.6027770112754065, + "learning_rate": 9.953065102071245e-06, + "loss": 0.3845, + "step": 744 + }, + { + "epoch": 0.07250608272506083, + "grad_norm": 1.3618658637431431, + "learning_rate": 9.952849401588464e-06, + "loss": 0.3946, + "step": 745 + }, + { + "epoch": 0.07260340632603407, + "grad_norm": 1.63075572158439, + "learning_rate": 9.952633208937997e-06, + "loss": 0.4506, + "step": 746 + }, + { + "epoch": 0.0727007299270073, + "grad_norm": 1.483187632244976, + "learning_rate": 9.95241652414132e-06, + "loss": 0.3908, + "step": 747 + }, + { + "epoch": 0.07279805352798054, + "grad_norm": 2.147960263046311, + "learning_rate": 9.952199347219972e-06, + "loss": 0.5249, + "step": 748 + }, + { + "epoch": 0.07289537712895378, + "grad_norm": 1.5046941105429004, + "learning_rate": 9.951981678195529e-06, + "loss": 0.3592, + "step": 749 + }, + { + "epoch": 0.072992700729927, + "grad_norm": 1.1457618113072725, + "learning_rate": 9.951763517089624e-06, + "loss": 0.2197, + "step": 750 + }, + { + "epoch": 0.07309002433090024, + "grad_norm": 1.9275946136488011, + "learning_rate": 9.951544863923934e-06, + "loss": 0.5692, + "step": 751 + }, + { + "epoch": 0.07318734793187348, + "grad_norm": 1.9590929330277462, + "learning_rate": 9.95132571872019e-06, + "loss": 0.7243, + "step": 752 + }, + { + "epoch": 0.07328467153284672, + "grad_norm": 2.1368780826391283, + "learning_rate": 9.951106081500162e-06, + "loss": 0.7601, + "step": 753 + }, + { + "epoch": 0.07338199513381995, + "grad_norm": 2.0085695969306396, + "learning_rate": 9.950885952285682e-06, + "loss": 0.5541, + "step": 754 + }, + { + "epoch": 0.07347931873479319, + "grad_norm": 1.9283983503616706, + "learning_rate": 9.950665331098622e-06, + "loss": 0.3832, + "step": 755 + }, + { + "epoch": 0.07357664233576643, + "grad_norm": 1.4173732379297153, + "learning_rate": 9.950444217960902e-06, + "loss": 0.379, + "step": 756 + }, + { + "epoch": 0.07367396593673967, + "grad_norm": 1.5015176407129935, + "learning_rate": 9.9502226128945e-06, + "loss": 0.4696, + "step": 757 + }, + { + "epoch": 0.07377128953771289, + "grad_norm": 1.6746905852394565, + "learning_rate": 9.950000515921434e-06, + "loss": 0.2984, + "step": 758 + }, + { + "epoch": 0.07386861313868613, + "grad_norm": 1.4429847737048944, + "learning_rate": 9.949777927063776e-06, + "loss": 0.3748, + "step": 759 + }, + { + "epoch": 0.07396593673965937, + "grad_norm": 1.1895632638034424, + "learning_rate": 9.94955484634364e-06, + "loss": 0.3014, + "step": 760 + }, + { + "epoch": 0.0740632603406326, + "grad_norm": 1.5497241513071458, + "learning_rate": 9.949331273783198e-06, + "loss": 0.5458, + "step": 761 + }, + { + "epoch": 0.07416058394160584, + "grad_norm": 1.5531214201672936, + "learning_rate": 9.949107209404664e-06, + "loss": 0.4575, + "step": 762 + }, + { + "epoch": 0.07425790754257908, + "grad_norm": 1.3336107839559097, + "learning_rate": 9.948882653230306e-06, + "loss": 0.4227, + "step": 763 + }, + { + "epoch": 0.07435523114355232, + "grad_norm": 1.7418209768074853, + "learning_rate": 9.948657605282437e-06, + "loss": 0.659, + "step": 764 + }, + { + "epoch": 0.07445255474452554, + "grad_norm": 1.462439433090815, + "learning_rate": 9.94843206558342e-06, + "loss": 0.445, + "step": 765 + }, + { + "epoch": 0.07454987834549878, + "grad_norm": 1.0856086178050317, + "learning_rate": 9.948206034155666e-06, + "loss": 0.2245, + "step": 766 + }, + { + "epoch": 0.07464720194647202, + "grad_norm": 1.458503858496447, + "learning_rate": 9.947979511021638e-06, + "loss": 0.3009, + "step": 767 + }, + { + "epoch": 0.07474452554744526, + "grad_norm": 1.1921292471996519, + "learning_rate": 9.947752496203844e-06, + "loss": 0.2988, + "step": 768 + }, + { + "epoch": 0.07484184914841849, + "grad_norm": 1.6693024138876786, + "learning_rate": 9.947524989724844e-06, + "loss": 0.4783, + "step": 769 + }, + { + "epoch": 0.07493917274939173, + "grad_norm": 1.4928671202909605, + "learning_rate": 9.947296991607244e-06, + "loss": 0.4161, + "step": 770 + }, + { + "epoch": 0.07503649635036497, + "grad_norm": 1.4549005796935413, + "learning_rate": 9.947068501873702e-06, + "loss": 0.4186, + "step": 771 + }, + { + "epoch": 0.0751338199513382, + "grad_norm": 1.7544781744298734, + "learning_rate": 9.946839520546923e-06, + "loss": 0.5593, + "step": 772 + }, + { + "epoch": 0.07523114355231143, + "grad_norm": 1.561541454027553, + "learning_rate": 9.946610047649659e-06, + "loss": 0.5097, + "step": 773 + }, + { + "epoch": 0.07532846715328467, + "grad_norm": 1.598616630831168, + "learning_rate": 9.946380083204714e-06, + "loss": 0.3744, + "step": 774 + }, + { + "epoch": 0.07542579075425791, + "grad_norm": 1.6915556597188157, + "learning_rate": 9.94614962723494e-06, + "loss": 0.439, + "step": 775 + }, + { + "epoch": 0.07552311435523114, + "grad_norm": 1.220024420697048, + "learning_rate": 9.945918679763237e-06, + "loss": 0.2339, + "step": 776 + }, + { + "epoch": 0.07562043795620438, + "grad_norm": 1.6061445238682988, + "learning_rate": 9.945687240812556e-06, + "loss": 0.4493, + "step": 777 + }, + { + "epoch": 0.07571776155717762, + "grad_norm": 1.400813806243779, + "learning_rate": 9.945455310405895e-06, + "loss": 0.4513, + "step": 778 + }, + { + "epoch": 0.07581508515815086, + "grad_norm": 1.753751480308555, + "learning_rate": 9.945222888566298e-06, + "loss": 0.5379, + "step": 779 + }, + { + "epoch": 0.07591240875912408, + "grad_norm": 1.4421667558329163, + "learning_rate": 9.944989975316862e-06, + "loss": 0.4118, + "step": 780 + }, + { + "epoch": 0.07600973236009732, + "grad_norm": 1.4411974086247974, + "learning_rate": 9.944756570680733e-06, + "loss": 0.3295, + "step": 781 + }, + { + "epoch": 0.07610705596107056, + "grad_norm": 1.5545586767450623, + "learning_rate": 9.944522674681107e-06, + "loss": 0.4146, + "step": 782 + }, + { + "epoch": 0.07620437956204379, + "grad_norm": 2.0019900434858084, + "learning_rate": 9.944288287341222e-06, + "loss": 0.4945, + "step": 783 + }, + { + "epoch": 0.07630170316301703, + "grad_norm": 1.5834930071710975, + "learning_rate": 9.944053408684371e-06, + "loss": 0.3781, + "step": 784 + }, + { + "epoch": 0.07639902676399027, + "grad_norm": 1.5272521164667598, + "learning_rate": 9.943818038733894e-06, + "loss": 0.3865, + "step": 785 + }, + { + "epoch": 0.07649635036496351, + "grad_norm": 1.8005925077547513, + "learning_rate": 9.94358217751318e-06, + "loss": 0.3951, + "step": 786 + }, + { + "epoch": 0.07659367396593673, + "grad_norm": 2.0471085276865995, + "learning_rate": 9.943345825045664e-06, + "loss": 0.6391, + "step": 787 + }, + { + "epoch": 0.07669099756690997, + "grad_norm": 1.7893386028077656, + "learning_rate": 9.943108981354839e-06, + "loss": 0.6373, + "step": 788 + }, + { + "epoch": 0.07678832116788321, + "grad_norm": 1.6529186502183046, + "learning_rate": 9.942871646464234e-06, + "loss": 0.4901, + "step": 789 + }, + { + "epoch": 0.07688564476885645, + "grad_norm": 1.8449837387732961, + "learning_rate": 9.942633820397436e-06, + "loss": 0.4444, + "step": 790 + }, + { + "epoch": 0.07698296836982968, + "grad_norm": 1.5278738521461448, + "learning_rate": 9.942395503178077e-06, + "loss": 0.3701, + "step": 791 + }, + { + "epoch": 0.07708029197080292, + "grad_norm": 1.8197808533034088, + "learning_rate": 9.942156694829838e-06, + "loss": 0.6142, + "step": 792 + }, + { + "epoch": 0.07717761557177616, + "grad_norm": 1.8496691201700692, + "learning_rate": 9.941917395376452e-06, + "loss": 0.2021, + "step": 793 + }, + { + "epoch": 0.07727493917274939, + "grad_norm": 1.8762664332677217, + "learning_rate": 9.941677604841696e-06, + "loss": 0.6742, + "step": 794 + }, + { + "epoch": 0.07737226277372262, + "grad_norm": 1.5933514264940258, + "learning_rate": 9.9414373232494e-06, + "loss": 0.5156, + "step": 795 + }, + { + "epoch": 0.07746958637469586, + "grad_norm": 1.538651154827247, + "learning_rate": 9.94119655062344e-06, + "loss": 0.446, + "step": 796 + }, + { + "epoch": 0.0775669099756691, + "grad_norm": 3.7300878200470926, + "learning_rate": 9.94095528698774e-06, + "loss": 0.2745, + "step": 797 + }, + { + "epoch": 0.07766423357664233, + "grad_norm": 1.685774804326696, + "learning_rate": 9.940713532366277e-06, + "loss": 0.4236, + "step": 798 + }, + { + "epoch": 0.07776155717761557, + "grad_norm": 1.2528388212678458, + "learning_rate": 9.940471286783074e-06, + "loss": 0.308, + "step": 799 + }, + { + "epoch": 0.07785888077858881, + "grad_norm": 1.5082779398207746, + "learning_rate": 9.940228550262203e-06, + "loss": 0.4925, + "step": 800 + }, + { + "epoch": 0.07795620437956205, + "grad_norm": 1.544326069333433, + "learning_rate": 9.939985322827784e-06, + "loss": 0.4341, + "step": 801 + }, + { + "epoch": 0.07805352798053528, + "grad_norm": 1.4959220289677864, + "learning_rate": 9.939741604503987e-06, + "loss": 0.4548, + "step": 802 + }, + { + "epoch": 0.07815085158150852, + "grad_norm": 1.682287714178995, + "learning_rate": 9.93949739531503e-06, + "loss": 0.5277, + "step": 803 + }, + { + "epoch": 0.07824817518248176, + "grad_norm": 1.6519496438708445, + "learning_rate": 9.93925269528518e-06, + "loss": 0.3074, + "step": 804 + }, + { + "epoch": 0.07834549878345498, + "grad_norm": 1.4379883641500402, + "learning_rate": 9.939007504438756e-06, + "loss": 0.3069, + "step": 805 + }, + { + "epoch": 0.07844282238442822, + "grad_norm": 2.0644552037743793, + "learning_rate": 9.93876182280012e-06, + "loss": 0.4479, + "step": 806 + }, + { + "epoch": 0.07854014598540146, + "grad_norm": 1.4791313310441092, + "learning_rate": 9.938515650393685e-06, + "loss": 0.4255, + "step": 807 + }, + { + "epoch": 0.0786374695863747, + "grad_norm": 1.4280736600967436, + "learning_rate": 9.938268987243914e-06, + "loss": 0.466, + "step": 808 + }, + { + "epoch": 0.07873479318734793, + "grad_norm": 1.610976672135659, + "learning_rate": 9.93802183337532e-06, + "loss": 0.4327, + "step": 809 + }, + { + "epoch": 0.07883211678832117, + "grad_norm": 1.5447130604673693, + "learning_rate": 9.93777418881246e-06, + "loss": 0.4931, + "step": 810 + }, + { + "epoch": 0.0789294403892944, + "grad_norm": 1.3831325957946852, + "learning_rate": 9.937526053579944e-06, + "loss": 0.3877, + "step": 811 + }, + { + "epoch": 0.07902676399026765, + "grad_norm": 1.4247112282736865, + "learning_rate": 9.93727742770243e-06, + "loss": 0.4168, + "step": 812 + }, + { + "epoch": 0.07912408759124087, + "grad_norm": 1.5074130304911886, + "learning_rate": 9.937028311204624e-06, + "loss": 0.4747, + "step": 813 + }, + { + "epoch": 0.07922141119221411, + "grad_norm": 1.4955958242475926, + "learning_rate": 9.936778704111278e-06, + "loss": 0.2999, + "step": 814 + }, + { + "epoch": 0.07931873479318735, + "grad_norm": 1.6038468607718186, + "learning_rate": 9.9365286064472e-06, + "loss": 0.4897, + "step": 815 + }, + { + "epoch": 0.07941605839416059, + "grad_norm": 1.8040845780349017, + "learning_rate": 9.93627801823724e-06, + "loss": 0.6413, + "step": 816 + }, + { + "epoch": 0.07951338199513382, + "grad_norm": 1.4598215502284355, + "learning_rate": 9.936026939506298e-06, + "loss": 0.3687, + "step": 817 + }, + { + "epoch": 0.07961070559610706, + "grad_norm": 1.340412030499075, + "learning_rate": 9.935775370279324e-06, + "loss": 0.3833, + "step": 818 + }, + { + "epoch": 0.0797080291970803, + "grad_norm": 1.6913032059853774, + "learning_rate": 9.935523310581318e-06, + "loss": 0.5857, + "step": 819 + }, + { + "epoch": 0.07980535279805352, + "grad_norm": 1.9970663728185467, + "learning_rate": 9.93527076043733e-06, + "loss": 0.6843, + "step": 820 + }, + { + "epoch": 0.07990267639902676, + "grad_norm": 1.4408921562941295, + "learning_rate": 9.93501771987245e-06, + "loss": 0.4385, + "step": 821 + }, + { + "epoch": 0.08, + "grad_norm": 1.5184490203891443, + "learning_rate": 9.934764188911827e-06, + "loss": 0.4708, + "step": 822 + }, + { + "epoch": 0.08009732360097324, + "grad_norm": 1.8501562903086661, + "learning_rate": 9.934510167580654e-06, + "loss": 0.6431, + "step": 823 + }, + { + "epoch": 0.08019464720194647, + "grad_norm": 1.6997829158405129, + "learning_rate": 9.934255655904172e-06, + "loss": 0.5188, + "step": 824 + }, + { + "epoch": 0.08029197080291971, + "grad_norm": 1.8510241792275326, + "learning_rate": 9.934000653907674e-06, + "loss": 0.5457, + "step": 825 + }, + { + "epoch": 0.08038929440389295, + "grad_norm": 1.6853569692908912, + "learning_rate": 9.933745161616498e-06, + "loss": 0.5062, + "step": 826 + }, + { + "epoch": 0.08048661800486619, + "grad_norm": 1.3066104263898661, + "learning_rate": 9.93348917905603e-06, + "loss": 0.404, + "step": 827 + }, + { + "epoch": 0.08058394160583941, + "grad_norm": 1.2788244408859646, + "learning_rate": 9.933232706251712e-06, + "loss": 0.3253, + "step": 828 + }, + { + "epoch": 0.08068126520681265, + "grad_norm": 2.2690800072126325, + "learning_rate": 9.932975743229027e-06, + "loss": 0.3405, + "step": 829 + }, + { + "epoch": 0.08077858880778589, + "grad_norm": 1.9113871035353245, + "learning_rate": 9.932718290013512e-06, + "loss": 0.5989, + "step": 830 + }, + { + "epoch": 0.08087591240875912, + "grad_norm": 1.3655256798283997, + "learning_rate": 9.932460346630748e-06, + "loss": 0.2942, + "step": 831 + }, + { + "epoch": 0.08097323600973236, + "grad_norm": 1.5234864838378999, + "learning_rate": 9.932201913106366e-06, + "loss": 0.3913, + "step": 832 + }, + { + "epoch": 0.0810705596107056, + "grad_norm": 1.3752195876516826, + "learning_rate": 9.93194298946605e-06, + "loss": 0.3293, + "step": 833 + }, + { + "epoch": 0.08116788321167884, + "grad_norm": 1.4842622412969824, + "learning_rate": 9.931683575735527e-06, + "loss": 0.4157, + "step": 834 + }, + { + "epoch": 0.08126520681265206, + "grad_norm": 4.003685207313109, + "learning_rate": 9.931423671940577e-06, + "loss": 0.3276, + "step": 835 + }, + { + "epoch": 0.0813625304136253, + "grad_norm": 1.509943035011216, + "learning_rate": 9.931163278107023e-06, + "loss": 0.4045, + "step": 836 + }, + { + "epoch": 0.08145985401459854, + "grad_norm": 1.4382523765338775, + "learning_rate": 9.930902394260746e-06, + "loss": 0.2709, + "step": 837 + }, + { + "epoch": 0.08155717761557178, + "grad_norm": 1.4492711471586157, + "learning_rate": 9.930641020427665e-06, + "loss": 0.3957, + "step": 838 + }, + { + "epoch": 0.08165450121654501, + "grad_norm": 1.7428876214187694, + "learning_rate": 9.930379156633758e-06, + "loss": 0.5257, + "step": 839 + }, + { + "epoch": 0.08175182481751825, + "grad_norm": 1.5652514836380926, + "learning_rate": 9.930116802905042e-06, + "loss": 0.4948, + "step": 840 + }, + { + "epoch": 0.08184914841849149, + "grad_norm": 2.4133112951540494, + "learning_rate": 9.929853959267589e-06, + "loss": 0.5455, + "step": 841 + }, + { + "epoch": 0.08194647201946471, + "grad_norm": 1.4309460046419233, + "learning_rate": 9.929590625747518e-06, + "loss": 0.4057, + "step": 842 + }, + { + "epoch": 0.08204379562043795, + "grad_norm": 1.0450296792009146, + "learning_rate": 9.929326802370995e-06, + "loss": 0.2332, + "step": 843 + }, + { + "epoch": 0.0821411192214112, + "grad_norm": 1.1201933325217828, + "learning_rate": 9.92906248916424e-06, + "loss": 0.3264, + "step": 844 + }, + { + "epoch": 0.08223844282238443, + "grad_norm": 1.6243579769967154, + "learning_rate": 9.928797686153515e-06, + "loss": 0.5385, + "step": 845 + }, + { + "epoch": 0.08233576642335766, + "grad_norm": 1.3496069901220336, + "learning_rate": 9.928532393365136e-06, + "loss": 0.3875, + "step": 846 + }, + { + "epoch": 0.0824330900243309, + "grad_norm": 1.4862888245769246, + "learning_rate": 9.928266610825462e-06, + "loss": 0.4493, + "step": 847 + }, + { + "epoch": 0.08253041362530414, + "grad_norm": 1.8305160014899666, + "learning_rate": 9.928000338560906e-06, + "loss": 0.4582, + "step": 848 + }, + { + "epoch": 0.08262773722627738, + "grad_norm": 1.642584946989029, + "learning_rate": 9.927733576597926e-06, + "loss": 0.3347, + "step": 849 + }, + { + "epoch": 0.0827250608272506, + "grad_norm": 1.5413363162928122, + "learning_rate": 9.927466324963033e-06, + "loss": 0.4607, + "step": 850 + }, + { + "epoch": 0.08282238442822384, + "grad_norm": 1.7093263469236866, + "learning_rate": 9.927198583682784e-06, + "loss": 0.5706, + "step": 851 + }, + { + "epoch": 0.08291970802919708, + "grad_norm": 1.531714933227777, + "learning_rate": 9.926930352783781e-06, + "loss": 0.533, + "step": 852 + }, + { + "epoch": 0.08301703163017031, + "grad_norm": 1.8181822267445191, + "learning_rate": 9.926661632292683e-06, + "loss": 0.5946, + "step": 853 + }, + { + "epoch": 0.08311435523114355, + "grad_norm": 1.8304662465930317, + "learning_rate": 9.926392422236189e-06, + "loss": 0.3746, + "step": 854 + }, + { + "epoch": 0.08321167883211679, + "grad_norm": 1.3135536142885351, + "learning_rate": 9.926122722641051e-06, + "loss": 0.429, + "step": 855 + }, + { + "epoch": 0.08330900243309003, + "grad_norm": 1.714390027755308, + "learning_rate": 9.925852533534071e-06, + "loss": 0.6806, + "step": 856 + }, + { + "epoch": 0.08340632603406326, + "grad_norm": 1.3399957064659453, + "learning_rate": 9.925581854942099e-06, + "loss": 0.2824, + "step": 857 + }, + { + "epoch": 0.0835036496350365, + "grad_norm": 1.3705351036499993, + "learning_rate": 9.925310686892026e-06, + "loss": 0.3085, + "step": 858 + }, + { + "epoch": 0.08360097323600973, + "grad_norm": 1.5064665959171673, + "learning_rate": 9.925039029410807e-06, + "loss": 0.4445, + "step": 859 + }, + { + "epoch": 0.08369829683698297, + "grad_norm": 1.725614330530946, + "learning_rate": 9.924766882525433e-06, + "loss": 0.4704, + "step": 860 + }, + { + "epoch": 0.0837956204379562, + "grad_norm": 1.765372064078189, + "learning_rate": 9.924494246262944e-06, + "loss": 0.6383, + "step": 861 + }, + { + "epoch": 0.08389294403892944, + "grad_norm": 2.085503007877936, + "learning_rate": 9.924221120650434e-06, + "loss": 0.296, + "step": 862 + }, + { + "epoch": 0.08399026763990268, + "grad_norm": 1.7898541160892734, + "learning_rate": 9.923947505715046e-06, + "loss": 0.5991, + "step": 863 + }, + { + "epoch": 0.0840875912408759, + "grad_norm": 1.6476104975968628, + "learning_rate": 9.923673401483968e-06, + "loss": 0.4734, + "step": 864 + }, + { + "epoch": 0.08418491484184915, + "grad_norm": 1.5502768976775265, + "learning_rate": 9.923398807984439e-06, + "loss": 0.2764, + "step": 865 + }, + { + "epoch": 0.08428223844282239, + "grad_norm": 1.2398437846135097, + "learning_rate": 9.923123725243744e-06, + "loss": 0.2705, + "step": 866 + }, + { + "epoch": 0.08437956204379563, + "grad_norm": 1.5290591078236662, + "learning_rate": 9.922848153289217e-06, + "loss": 0.4228, + "step": 867 + }, + { + "epoch": 0.08447688564476885, + "grad_norm": 1.134889947118225, + "learning_rate": 9.922572092148244e-06, + "loss": 0.2953, + "step": 868 + }, + { + "epoch": 0.08457420924574209, + "grad_norm": 1.6307620082274505, + "learning_rate": 9.922295541848257e-06, + "loss": 0.3363, + "step": 869 + }, + { + "epoch": 0.08467153284671533, + "grad_norm": 1.373015271795792, + "learning_rate": 9.922018502416736e-06, + "loss": 0.3593, + "step": 870 + }, + { + "epoch": 0.08476885644768857, + "grad_norm": 1.7500724096304088, + "learning_rate": 9.921740973881211e-06, + "loss": 0.5236, + "step": 871 + }, + { + "epoch": 0.0848661800486618, + "grad_norm": 1.6167507595463353, + "learning_rate": 9.92146295626926e-06, + "loss": 0.5138, + "step": 872 + }, + { + "epoch": 0.08496350364963504, + "grad_norm": 1.0398007401901226, + "learning_rate": 9.92118444960851e-06, + "loss": 0.295, + "step": 873 + }, + { + "epoch": 0.08506082725060828, + "grad_norm": 1.4140920056378707, + "learning_rate": 9.920905453926637e-06, + "loss": 0.4192, + "step": 874 + }, + { + "epoch": 0.0851581508515815, + "grad_norm": 1.8785238213855096, + "learning_rate": 9.920625969251365e-06, + "loss": 0.4228, + "step": 875 + }, + { + "epoch": 0.08525547445255474, + "grad_norm": 1.719991686268608, + "learning_rate": 9.920345995610465e-06, + "loss": 0.5026, + "step": 876 + }, + { + "epoch": 0.08535279805352798, + "grad_norm": 1.7112372148926476, + "learning_rate": 9.92006553303176e-06, + "loss": 0.3157, + "step": 877 + }, + { + "epoch": 0.08545012165450122, + "grad_norm": 2.5105720144829116, + "learning_rate": 9.919784581543117e-06, + "loss": 0.4777, + "step": 878 + }, + { + "epoch": 0.08554744525547445, + "grad_norm": 1.42848630379055, + "learning_rate": 9.919503141172458e-06, + "loss": 0.3998, + "step": 879 + }, + { + "epoch": 0.08564476885644769, + "grad_norm": 1.4246136626839867, + "learning_rate": 9.919221211947748e-06, + "loss": 0.4415, + "step": 880 + }, + { + "epoch": 0.08574209245742093, + "grad_norm": 1.939970471855472, + "learning_rate": 9.918938793897002e-06, + "loss": 0.5887, + "step": 881 + }, + { + "epoch": 0.08583941605839417, + "grad_norm": 1.5467402852284964, + "learning_rate": 9.918655887048285e-06, + "loss": 0.3726, + "step": 882 + }, + { + "epoch": 0.08593673965936739, + "grad_norm": 1.6261636529000345, + "learning_rate": 9.918372491429708e-06, + "loss": 0.3382, + "step": 883 + }, + { + "epoch": 0.08603406326034063, + "grad_norm": 1.4859289768748727, + "learning_rate": 9.918088607069434e-06, + "loss": 0.4837, + "step": 884 + }, + { + "epoch": 0.08613138686131387, + "grad_norm": 1.8534453271170916, + "learning_rate": 9.917804233995673e-06, + "loss": 0.5948, + "step": 885 + }, + { + "epoch": 0.08622871046228711, + "grad_norm": 1.3491809126204122, + "learning_rate": 9.917519372236684e-06, + "loss": 0.381, + "step": 886 + }, + { + "epoch": 0.08632603406326034, + "grad_norm": 1.4913268478302555, + "learning_rate": 9.91723402182077e-06, + "loss": 0.2872, + "step": 887 + }, + { + "epoch": 0.08642335766423358, + "grad_norm": 1.5345667515291348, + "learning_rate": 9.916948182776289e-06, + "loss": 0.4426, + "step": 888 + }, + { + "epoch": 0.08652068126520682, + "grad_norm": 1.9142340135608018, + "learning_rate": 9.916661855131646e-06, + "loss": 0.467, + "step": 889 + }, + { + "epoch": 0.08661800486618004, + "grad_norm": 1.7451883652681546, + "learning_rate": 9.916375038915291e-06, + "loss": 0.3579, + "step": 890 + }, + { + "epoch": 0.08671532846715328, + "grad_norm": 3.3675828599824618, + "learning_rate": 9.916087734155728e-06, + "loss": 0.3965, + "step": 891 + }, + { + "epoch": 0.08681265206812652, + "grad_norm": 1.6430989821947144, + "learning_rate": 9.915799940881504e-06, + "loss": 0.5089, + "step": 892 + }, + { + "epoch": 0.08690997566909976, + "grad_norm": 1.8434153107573372, + "learning_rate": 9.915511659121219e-06, + "loss": 0.6513, + "step": 893 + }, + { + "epoch": 0.08700729927007299, + "grad_norm": 1.7259560464984558, + "learning_rate": 9.91522288890352e-06, + "loss": 0.5963, + "step": 894 + }, + { + "epoch": 0.08710462287104623, + "grad_norm": 1.4417036209809253, + "learning_rate": 9.9149336302571e-06, + "loss": 0.4076, + "step": 895 + }, + { + "epoch": 0.08720194647201947, + "grad_norm": 1.4565626930182671, + "learning_rate": 9.914643883210704e-06, + "loss": 0.3548, + "step": 896 + }, + { + "epoch": 0.08729927007299271, + "grad_norm": 1.8286482885292266, + "learning_rate": 9.914353647793126e-06, + "loss": 0.5158, + "step": 897 + }, + { + "epoch": 0.08739659367396593, + "grad_norm": 1.573235746781315, + "learning_rate": 9.914062924033204e-06, + "loss": 0.4804, + "step": 898 + }, + { + "epoch": 0.08749391727493917, + "grad_norm": 1.7725042500734154, + "learning_rate": 9.91377171195983e-06, + "loss": 0.4037, + "step": 899 + }, + { + "epoch": 0.08759124087591241, + "grad_norm": 1.5572801757524644, + "learning_rate": 9.913480011601939e-06, + "loss": 0.2757, + "step": 900 + }, + { + "epoch": 0.08768856447688564, + "grad_norm": 1.690990088453521, + "learning_rate": 9.91318782298852e-06, + "loss": 0.624, + "step": 901 + }, + { + "epoch": 0.08778588807785888, + "grad_norm": 1.5797017595834213, + "learning_rate": 9.912895146148609e-06, + "loss": 0.418, + "step": 902 + }, + { + "epoch": 0.08788321167883212, + "grad_norm": 1.722754374021215, + "learning_rate": 9.912601981111287e-06, + "loss": 0.5991, + "step": 903 + }, + { + "epoch": 0.08798053527980536, + "grad_norm": 1.2395740583484196, + "learning_rate": 9.912308327905683e-06, + "loss": 0.3632, + "step": 904 + }, + { + "epoch": 0.08807785888077858, + "grad_norm": 1.8637568028899596, + "learning_rate": 9.912014186560985e-06, + "loss": 0.5766, + "step": 905 + }, + { + "epoch": 0.08817518248175182, + "grad_norm": 1.8489319991981024, + "learning_rate": 9.911719557106418e-06, + "loss": 0.6834, + "step": 906 + }, + { + "epoch": 0.08827250608272506, + "grad_norm": 1.6692858460733677, + "learning_rate": 9.911424439571258e-06, + "loss": 0.5067, + "step": 907 + }, + { + "epoch": 0.0883698296836983, + "grad_norm": 1.4727605888984552, + "learning_rate": 9.911128833984834e-06, + "loss": 0.3141, + "step": 908 + }, + { + "epoch": 0.08846715328467153, + "grad_norm": 1.644393806422472, + "learning_rate": 9.910832740376518e-06, + "loss": 0.4599, + "step": 909 + }, + { + "epoch": 0.08856447688564477, + "grad_norm": 1.730275300452632, + "learning_rate": 9.910536158775734e-06, + "loss": 0.3908, + "step": 910 + }, + { + "epoch": 0.08866180048661801, + "grad_norm": 1.7281903494262714, + "learning_rate": 9.910239089211955e-06, + "loss": 0.5919, + "step": 911 + }, + { + "epoch": 0.08875912408759123, + "grad_norm": 1.7234172913238917, + "learning_rate": 9.909941531714699e-06, + "loss": 0.609, + "step": 912 + }, + { + "epoch": 0.08885644768856447, + "grad_norm": 1.4594702058569258, + "learning_rate": 9.909643486313533e-06, + "loss": 0.4399, + "step": 913 + }, + { + "epoch": 0.08895377128953771, + "grad_norm": 1.4625782448468165, + "learning_rate": 9.90934495303808e-06, + "loss": 0.4011, + "step": 914 + }, + { + "epoch": 0.08905109489051095, + "grad_norm": 1.7262645481609784, + "learning_rate": 9.909045931918e-06, + "loss": 0.4992, + "step": 915 + }, + { + "epoch": 0.08914841849148418, + "grad_norm": 1.6255222361700263, + "learning_rate": 9.908746422983007e-06, + "loss": 0.4909, + "step": 916 + }, + { + "epoch": 0.08924574209245742, + "grad_norm": 1.7512982185254946, + "learning_rate": 9.908446426262865e-06, + "loss": 0.5527, + "step": 917 + }, + { + "epoch": 0.08934306569343066, + "grad_norm": 1.617605772613541, + "learning_rate": 9.908145941787386e-06, + "loss": 0.3228, + "step": 918 + }, + { + "epoch": 0.0894403892944039, + "grad_norm": 1.489706963519404, + "learning_rate": 9.907844969586427e-06, + "loss": 0.4838, + "step": 919 + }, + { + "epoch": 0.08953771289537713, + "grad_norm": 1.193837371345013, + "learning_rate": 9.907543509689896e-06, + "loss": 0.284, + "step": 920 + }, + { + "epoch": 0.08963503649635036, + "grad_norm": 1.5855787651349198, + "learning_rate": 9.907241562127752e-06, + "loss": 0.4641, + "step": 921 + }, + { + "epoch": 0.0897323600973236, + "grad_norm": 1.2401284480478103, + "learning_rate": 9.906939126929998e-06, + "loss": 0.246, + "step": 922 + }, + { + "epoch": 0.08982968369829683, + "grad_norm": 1.503842201355298, + "learning_rate": 9.906636204126685e-06, + "loss": 0.4031, + "step": 923 + }, + { + "epoch": 0.08992700729927007, + "grad_norm": 1.9138265658958267, + "learning_rate": 9.906332793747917e-06, + "loss": 0.587, + "step": 924 + }, + { + "epoch": 0.09002433090024331, + "grad_norm": 1.5381184892388742, + "learning_rate": 9.906028895823844e-06, + "loss": 0.4119, + "step": 925 + }, + { + "epoch": 0.09012165450121655, + "grad_norm": 1.5769181877690257, + "learning_rate": 9.905724510384664e-06, + "loss": 0.4071, + "step": 926 + }, + { + "epoch": 0.09021897810218978, + "grad_norm": 1.4644408625641083, + "learning_rate": 9.905419637460625e-06, + "loss": 0.3656, + "step": 927 + }, + { + "epoch": 0.09031630170316302, + "grad_norm": 2.043739071504731, + "learning_rate": 9.90511427708202e-06, + "loss": 0.6317, + "step": 928 + }, + { + "epoch": 0.09041362530413626, + "grad_norm": 1.8397228419915481, + "learning_rate": 9.904808429279195e-06, + "loss": 0.6656, + "step": 929 + }, + { + "epoch": 0.0905109489051095, + "grad_norm": 1.6689588837493128, + "learning_rate": 9.904502094082542e-06, + "loss": 0.4603, + "step": 930 + }, + { + "epoch": 0.09060827250608272, + "grad_norm": 1.7157610479724803, + "learning_rate": 9.9041952715225e-06, + "loss": 0.3566, + "step": 931 + }, + { + "epoch": 0.09070559610705596, + "grad_norm": 1.5797548847560638, + "learning_rate": 9.90388796162956e-06, + "loss": 0.527, + "step": 932 + }, + { + "epoch": 0.0908029197080292, + "grad_norm": 1.3861944362556795, + "learning_rate": 9.903580164434262e-06, + "loss": 0.3555, + "step": 933 + }, + { + "epoch": 0.09090024330900243, + "grad_norm": 1.4873043668950738, + "learning_rate": 9.903271879967185e-06, + "loss": 0.3606, + "step": 934 + }, + { + "epoch": 0.09099756690997567, + "grad_norm": 1.5471770637050817, + "learning_rate": 9.90296310825897e-06, + "loss": 0.5407, + "step": 935 + }, + { + "epoch": 0.0910948905109489, + "grad_norm": 1.7410898214633266, + "learning_rate": 9.902653849340296e-06, + "loss": 0.5604, + "step": 936 + }, + { + "epoch": 0.09119221411192215, + "grad_norm": 1.490257412993615, + "learning_rate": 9.902344103241897e-06, + "loss": 0.4293, + "step": 937 + }, + { + "epoch": 0.09128953771289537, + "grad_norm": 1.3076716120407041, + "learning_rate": 9.90203386999455e-06, + "loss": 0.4311, + "step": 938 + }, + { + "epoch": 0.09138686131386861, + "grad_norm": 1.63883307554104, + "learning_rate": 9.901723149629085e-06, + "loss": 0.5026, + "step": 939 + }, + { + "epoch": 0.09148418491484185, + "grad_norm": 1.460694807977355, + "learning_rate": 9.901411942176377e-06, + "loss": 0.4449, + "step": 940 + }, + { + "epoch": 0.09158150851581509, + "grad_norm": 1.631318499416747, + "learning_rate": 9.901100247667352e-06, + "loss": 0.4762, + "step": 941 + }, + { + "epoch": 0.09167883211678832, + "grad_norm": 1.472942456024595, + "learning_rate": 9.900788066132982e-06, + "loss": 0.4208, + "step": 942 + }, + { + "epoch": 0.09177615571776156, + "grad_norm": 1.9471723252943203, + "learning_rate": 9.900475397604292e-06, + "loss": 0.4887, + "step": 943 + }, + { + "epoch": 0.0918734793187348, + "grad_norm": 1.4192635165617975, + "learning_rate": 9.900162242112348e-06, + "loss": 0.4753, + "step": 944 + }, + { + "epoch": 0.09197080291970802, + "grad_norm": 1.7864248496903834, + "learning_rate": 9.89984859968827e-06, + "loss": 0.6063, + "step": 945 + }, + { + "epoch": 0.09206812652068126, + "grad_norm": 1.402919088092856, + "learning_rate": 9.899534470363225e-06, + "loss": 0.3561, + "step": 946 + }, + { + "epoch": 0.0921654501216545, + "grad_norm": 1.15011785152118, + "learning_rate": 9.89921985416843e-06, + "loss": 0.2605, + "step": 947 + }, + { + "epoch": 0.09226277372262774, + "grad_norm": 1.2940536511249239, + "learning_rate": 9.898904751135145e-06, + "loss": 0.2503, + "step": 948 + }, + { + "epoch": 0.09236009732360097, + "grad_norm": 1.5093308152075566, + "learning_rate": 9.898589161294684e-06, + "loss": 0.4185, + "step": 949 + }, + { + "epoch": 0.09245742092457421, + "grad_norm": 1.5826010349075055, + "learning_rate": 9.898273084678406e-06, + "loss": 0.536, + "step": 950 + }, + { + "epoch": 0.09255474452554745, + "grad_norm": 1.5672518381317015, + "learning_rate": 9.897956521317724e-06, + "loss": 0.5068, + "step": 951 + }, + { + "epoch": 0.09265206812652069, + "grad_norm": 1.784767292144658, + "learning_rate": 9.89763947124409e-06, + "loss": 0.6601, + "step": 952 + }, + { + "epoch": 0.09274939172749391, + "grad_norm": 1.620681747107968, + "learning_rate": 9.897321934489011e-06, + "loss": 0.5402, + "step": 953 + }, + { + "epoch": 0.09284671532846715, + "grad_norm": 1.7479722673062432, + "learning_rate": 9.897003911084042e-06, + "loss": 0.6593, + "step": 954 + }, + { + "epoch": 0.09294403892944039, + "grad_norm": 1.6618363798373263, + "learning_rate": 9.896685401060783e-06, + "loss": 0.6086, + "step": 955 + }, + { + "epoch": 0.09304136253041363, + "grad_norm": 1.3782603882872615, + "learning_rate": 9.896366404450888e-06, + "loss": 0.3431, + "step": 956 + }, + { + "epoch": 0.09313868613138686, + "grad_norm": 1.6607836446620106, + "learning_rate": 9.896046921286053e-06, + "loss": 0.4015, + "step": 957 + }, + { + "epoch": 0.0932360097323601, + "grad_norm": 1.372535143543006, + "learning_rate": 9.895726951598026e-06, + "loss": 0.3627, + "step": 958 + }, + { + "epoch": 0.09333333333333334, + "grad_norm": 1.965175835699204, + "learning_rate": 9.895406495418602e-06, + "loss": 0.434, + "step": 959 + }, + { + "epoch": 0.09343065693430656, + "grad_norm": 1.6072227382486934, + "learning_rate": 9.895085552779626e-06, + "loss": 0.3666, + "step": 960 + }, + { + "epoch": 0.0935279805352798, + "grad_norm": 1.8680414138630521, + "learning_rate": 9.894764123712991e-06, + "loss": 0.6182, + "step": 961 + }, + { + "epoch": 0.09362530413625304, + "grad_norm": 1.7249394724081422, + "learning_rate": 9.894442208250636e-06, + "loss": 0.569, + "step": 962 + }, + { + "epoch": 0.09372262773722628, + "grad_norm": 1.7887658285510963, + "learning_rate": 9.894119806424549e-06, + "loss": 0.4825, + "step": 963 + }, + { + "epoch": 0.09381995133819951, + "grad_norm": 1.4470695743772581, + "learning_rate": 9.89379691826677e-06, + "loss": 0.4036, + "step": 964 + }, + { + "epoch": 0.09391727493917275, + "grad_norm": 1.739037372856574, + "learning_rate": 9.893473543809383e-06, + "loss": 0.3734, + "step": 965 + }, + { + "epoch": 0.09401459854014599, + "grad_norm": 1.2401623802615098, + "learning_rate": 9.893149683084522e-06, + "loss": 0.2892, + "step": 966 + }, + { + "epoch": 0.09411192214111923, + "grad_norm": 1.632367817316159, + "learning_rate": 9.892825336124369e-06, + "loss": 0.3324, + "step": 967 + }, + { + "epoch": 0.09420924574209245, + "grad_norm": 1.4553279790204596, + "learning_rate": 9.892500502961156e-06, + "loss": 0.4518, + "step": 968 + }, + { + "epoch": 0.0943065693430657, + "grad_norm": 2.0184949211791867, + "learning_rate": 9.892175183627161e-06, + "loss": 0.496, + "step": 969 + }, + { + "epoch": 0.09440389294403893, + "grad_norm": 1.3847811204395728, + "learning_rate": 9.89184937815471e-06, + "loss": 0.3908, + "step": 970 + }, + { + "epoch": 0.09450121654501216, + "grad_norm": 1.7325451795183482, + "learning_rate": 9.89152308657618e-06, + "loss": 0.5813, + "step": 971 + }, + { + "epoch": 0.0945985401459854, + "grad_norm": 1.3485480854398895, + "learning_rate": 9.891196308923994e-06, + "loss": 0.2773, + "step": 972 + }, + { + "epoch": 0.09469586374695864, + "grad_norm": 1.6137214411092917, + "learning_rate": 9.890869045230625e-06, + "loss": 0.573, + "step": 973 + }, + { + "epoch": 0.09479318734793188, + "grad_norm": 1.8098732560393935, + "learning_rate": 9.890541295528593e-06, + "loss": 0.5765, + "step": 974 + }, + { + "epoch": 0.0948905109489051, + "grad_norm": 1.7169741386061155, + "learning_rate": 9.890213059850467e-06, + "loss": 0.5463, + "step": 975 + }, + { + "epoch": 0.09498783454987834, + "grad_norm": 1.6226425677233698, + "learning_rate": 9.889884338228861e-06, + "loss": 0.459, + "step": 976 + }, + { + "epoch": 0.09508515815085158, + "grad_norm": 1.5712338302132318, + "learning_rate": 9.889555130696445e-06, + "loss": 0.2926, + "step": 977 + }, + { + "epoch": 0.09518248175182482, + "grad_norm": 2.368668096329164, + "learning_rate": 9.88922543728593e-06, + "loss": 0.4602, + "step": 978 + }, + { + "epoch": 0.09527980535279805, + "grad_norm": 1.5481463515619227, + "learning_rate": 9.888895258030077e-06, + "loss": 0.382, + "step": 979 + }, + { + "epoch": 0.09537712895377129, + "grad_norm": 1.5566394762827083, + "learning_rate": 9.888564592961698e-06, + "loss": 0.4432, + "step": 980 + }, + { + "epoch": 0.09547445255474453, + "grad_norm": 1.2929219586068095, + "learning_rate": 9.888233442113651e-06, + "loss": 0.2986, + "step": 981 + }, + { + "epoch": 0.09557177615571776, + "grad_norm": 1.7926346211976876, + "learning_rate": 9.887901805518841e-06, + "loss": 0.4536, + "step": 982 + }, + { + "epoch": 0.095669099756691, + "grad_norm": 1.5810862037952855, + "learning_rate": 9.887569683210225e-06, + "loss": 0.5143, + "step": 983 + }, + { + "epoch": 0.09576642335766423, + "grad_norm": 1.486412737689962, + "learning_rate": 9.887237075220805e-06, + "loss": 0.4422, + "step": 984 + }, + { + "epoch": 0.09586374695863747, + "grad_norm": 1.5634292890846626, + "learning_rate": 9.886903981583633e-06, + "loss": 0.5158, + "step": 985 + }, + { + "epoch": 0.0959610705596107, + "grad_norm": 1.4911106877832496, + "learning_rate": 9.88657040233181e-06, + "loss": 0.3584, + "step": 986 + }, + { + "epoch": 0.09605839416058394, + "grad_norm": 1.8920202230134835, + "learning_rate": 9.886236337498481e-06, + "loss": 0.7059, + "step": 987 + }, + { + "epoch": 0.09615571776155718, + "grad_norm": 1.9765830057761664, + "learning_rate": 9.885901787116844e-06, + "loss": 0.3363, + "step": 988 + }, + { + "epoch": 0.09625304136253042, + "grad_norm": 1.7412713212065478, + "learning_rate": 9.885566751220144e-06, + "loss": 0.6238, + "step": 989 + }, + { + "epoch": 0.09635036496350365, + "grad_norm": 1.4558500764026314, + "learning_rate": 9.885231229841675e-06, + "loss": 0.5033, + "step": 990 + }, + { + "epoch": 0.09644768856447689, + "grad_norm": 1.5722863237428275, + "learning_rate": 9.884895223014772e-06, + "loss": 0.3026, + "step": 991 + }, + { + "epoch": 0.09654501216545013, + "grad_norm": 1.7850396516814273, + "learning_rate": 9.88455873077283e-06, + "loss": 0.6797, + "step": 992 + }, + { + "epoch": 0.09664233576642335, + "grad_norm": 1.5907642595826164, + "learning_rate": 9.884221753149286e-06, + "loss": 0.5051, + "step": 993 + }, + { + "epoch": 0.09673965936739659, + "grad_norm": 1.383326117178851, + "learning_rate": 9.883884290177623e-06, + "loss": 0.394, + "step": 994 + }, + { + "epoch": 0.09683698296836983, + "grad_norm": 1.5330791836349085, + "learning_rate": 9.883546341891375e-06, + "loss": 0.4531, + "step": 995 + }, + { + "epoch": 0.09693430656934307, + "grad_norm": 1.3858453283442664, + "learning_rate": 9.883207908324126e-06, + "loss": 0.4674, + "step": 996 + }, + { + "epoch": 0.0970316301703163, + "grad_norm": 1.2633519423598012, + "learning_rate": 9.882868989509507e-06, + "loss": 0.3053, + "step": 997 + }, + { + "epoch": 0.09712895377128954, + "grad_norm": 1.5725755469000553, + "learning_rate": 9.882529585481194e-06, + "loss": 0.5382, + "step": 998 + }, + { + "epoch": 0.09722627737226278, + "grad_norm": 1.594807816051373, + "learning_rate": 9.882189696272916e-06, + "loss": 0.5027, + "step": 999 + }, + { + "epoch": 0.09732360097323602, + "grad_norm": 1.7855937930735857, + "learning_rate": 9.881849321918446e-06, + "loss": 0.6336, + "step": 1000 + }, + { + "epoch": 0.09742092457420924, + "grad_norm": 1.8161736452208326, + "learning_rate": 9.88150846245161e-06, + "loss": 0.5432, + "step": 1001 + }, + { + "epoch": 0.09751824817518248, + "grad_norm": 1.2323791206307224, + "learning_rate": 9.881167117906276e-06, + "loss": 0.3361, + "step": 1002 + }, + { + "epoch": 0.09761557177615572, + "grad_norm": 1.6720448345305876, + "learning_rate": 9.880825288316367e-06, + "loss": 0.3583, + "step": 1003 + }, + { + "epoch": 0.09771289537712895, + "grad_norm": 1.408364549926656, + "learning_rate": 9.880482973715846e-06, + "loss": 0.3847, + "step": 1004 + }, + { + "epoch": 0.09781021897810219, + "grad_norm": 1.493256031544701, + "learning_rate": 9.880140174138735e-06, + "loss": 0.3611, + "step": 1005 + }, + { + "epoch": 0.09790754257907543, + "grad_norm": 1.3658283125944337, + "learning_rate": 9.879796889619093e-06, + "loss": 0.3555, + "step": 1006 + }, + { + "epoch": 0.09800486618004867, + "grad_norm": 1.7346143127846696, + "learning_rate": 9.879453120191037e-06, + "loss": 0.5028, + "step": 1007 + }, + { + "epoch": 0.09810218978102189, + "grad_norm": 1.9094090784905724, + "learning_rate": 9.879108865888724e-06, + "loss": 0.4799, + "step": 1008 + }, + { + "epoch": 0.09819951338199513, + "grad_norm": 1.1235415223499565, + "learning_rate": 9.878764126746364e-06, + "loss": 0.2181, + "step": 1009 + }, + { + "epoch": 0.09829683698296837, + "grad_norm": 1.494557121918356, + "learning_rate": 9.878418902798215e-06, + "loss": 0.4548, + "step": 1010 + }, + { + "epoch": 0.09839416058394161, + "grad_norm": 1.5340021274706077, + "learning_rate": 9.87807319407858e-06, + "loss": 0.4952, + "step": 1011 + }, + { + "epoch": 0.09849148418491484, + "grad_norm": 1.2523545024978981, + "learning_rate": 9.877727000621815e-06, + "loss": 0.2887, + "step": 1012 + }, + { + "epoch": 0.09858880778588808, + "grad_norm": 1.424446798325285, + "learning_rate": 9.877380322462317e-06, + "loss": 0.3628, + "step": 1013 + }, + { + "epoch": 0.09868613138686132, + "grad_norm": 1.6382574528105933, + "learning_rate": 9.877033159634542e-06, + "loss": 0.5396, + "step": 1014 + }, + { + "epoch": 0.09878345498783454, + "grad_norm": 1.544256440771578, + "learning_rate": 9.876685512172982e-06, + "loss": 0.4031, + "step": 1015 + }, + { + "epoch": 0.09888077858880778, + "grad_norm": 1.620162733287423, + "learning_rate": 9.876337380112185e-06, + "loss": 0.4925, + "step": 1016 + }, + { + "epoch": 0.09897810218978102, + "grad_norm": 1.6140460771461889, + "learning_rate": 9.875988763486746e-06, + "loss": 0.5549, + "step": 1017 + }, + { + "epoch": 0.09907542579075426, + "grad_norm": 1.6187864498320685, + "learning_rate": 9.875639662331307e-06, + "loss": 0.5034, + "step": 1018 + }, + { + "epoch": 0.09917274939172749, + "grad_norm": 1.249422512171971, + "learning_rate": 9.875290076680557e-06, + "loss": 0.236, + "step": 1019 + }, + { + "epoch": 0.09927007299270073, + "grad_norm": 1.5835572971087337, + "learning_rate": 9.874940006569236e-06, + "loss": 0.5309, + "step": 1020 + }, + { + "epoch": 0.09936739659367397, + "grad_norm": 0.8658795502351594, + "learning_rate": 9.874589452032131e-06, + "loss": 0.1911, + "step": 1021 + }, + { + "epoch": 0.09946472019464721, + "grad_norm": 1.3171385587421753, + "learning_rate": 9.874238413104076e-06, + "loss": 0.3486, + "step": 1022 + }, + { + "epoch": 0.09956204379562043, + "grad_norm": 1.4498439375980756, + "learning_rate": 9.873886889819953e-06, + "loss": 0.1986, + "step": 1023 + }, + { + "epoch": 0.09965936739659367, + "grad_norm": 1.5991307847988792, + "learning_rate": 9.873534882214692e-06, + "loss": 0.6397, + "step": 1024 + }, + { + "epoch": 0.09975669099756691, + "grad_norm": 1.6135151765084201, + "learning_rate": 9.873182390323277e-06, + "loss": 0.4338, + "step": 1025 + }, + { + "epoch": 0.09985401459854015, + "grad_norm": 1.465261170994732, + "learning_rate": 9.872829414180733e-06, + "loss": 0.4692, + "step": 1026 + }, + { + "epoch": 0.09995133819951338, + "grad_norm": 1.6964068418559575, + "learning_rate": 9.872475953822134e-06, + "loss": 0.4763, + "step": 1027 + }, + { + "epoch": 0.10004866180048662, + "grad_norm": 1.5209137969308788, + "learning_rate": 9.872122009282604e-06, + "loss": 0.4266, + "step": 1028 + }, + { + "epoch": 0.10014598540145986, + "grad_norm": 1.4495568716439686, + "learning_rate": 9.871767580597316e-06, + "loss": 0.4087, + "step": 1029 + }, + { + "epoch": 0.10024330900243308, + "grad_norm": 1.344434785457905, + "learning_rate": 9.871412667801488e-06, + "loss": 0.3797, + "step": 1030 + }, + { + "epoch": 0.10034063260340632, + "grad_norm": 1.5794908259633444, + "learning_rate": 9.871057270930392e-06, + "loss": 0.3939, + "step": 1031 + }, + { + "epoch": 0.10043795620437956, + "grad_norm": 1.5876979734473795, + "learning_rate": 9.870701390019337e-06, + "loss": 0.484, + "step": 1032 + }, + { + "epoch": 0.1005352798053528, + "grad_norm": 1.8773231101994967, + "learning_rate": 9.870345025103694e-06, + "loss": 0.5893, + "step": 1033 + }, + { + "epoch": 0.10063260340632603, + "grad_norm": 1.4927383125242464, + "learning_rate": 9.869988176218871e-06, + "loss": 0.4138, + "step": 1034 + }, + { + "epoch": 0.10072992700729927, + "grad_norm": 1.4766306382054422, + "learning_rate": 9.869630843400331e-06, + "loss": 0.4125, + "step": 1035 + }, + { + "epoch": 0.10082725060827251, + "grad_norm": 2.1872385141217388, + "learning_rate": 9.86927302668358e-06, + "loss": 0.4581, + "step": 1036 + }, + { + "epoch": 0.10092457420924575, + "grad_norm": 1.4275090865666056, + "learning_rate": 9.868914726104174e-06, + "loss": 0.2393, + "step": 1037 + }, + { + "epoch": 0.10102189781021897, + "grad_norm": 1.6989614006808447, + "learning_rate": 9.868555941697721e-06, + "loss": 0.4941, + "step": 1038 + }, + { + "epoch": 0.10111922141119221, + "grad_norm": 1.4357333730365565, + "learning_rate": 9.86819667349987e-06, + "loss": 0.4907, + "step": 1039 + }, + { + "epoch": 0.10121654501216545, + "grad_norm": 2.0026376735495055, + "learning_rate": 9.867836921546326e-06, + "loss": 0.8695, + "step": 1040 + }, + { + "epoch": 0.10131386861313868, + "grad_norm": 1.6951372609783342, + "learning_rate": 9.867476685872833e-06, + "loss": 0.6236, + "step": 1041 + }, + { + "epoch": 0.10141119221411192, + "grad_norm": 1.6963236381946833, + "learning_rate": 9.86711596651519e-06, + "loss": 0.6358, + "step": 1042 + }, + { + "epoch": 0.10150851581508516, + "grad_norm": 1.5189733584329748, + "learning_rate": 9.866754763509242e-06, + "loss": 0.4374, + "step": 1043 + }, + { + "epoch": 0.1016058394160584, + "grad_norm": 1.2748045341406278, + "learning_rate": 9.866393076890881e-06, + "loss": 0.4213, + "step": 1044 + }, + { + "epoch": 0.10170316301703163, + "grad_norm": 1.7405552081322075, + "learning_rate": 9.866030906696051e-06, + "loss": 0.6708, + "step": 1045 + }, + { + "epoch": 0.10180048661800487, + "grad_norm": 1.3495682131815454, + "learning_rate": 9.865668252960737e-06, + "loss": 0.3531, + "step": 1046 + }, + { + "epoch": 0.1018978102189781, + "grad_norm": 1.5653185028552046, + "learning_rate": 9.86530511572098e-06, + "loss": 0.4331, + "step": 1047 + }, + { + "epoch": 0.10199513381995134, + "grad_norm": 1.3992858529840162, + "learning_rate": 9.864941495012861e-06, + "loss": 0.3388, + "step": 1048 + }, + { + "epoch": 0.10209245742092457, + "grad_norm": 1.6270586325333123, + "learning_rate": 9.864577390872516e-06, + "loss": 0.4234, + "step": 1049 + }, + { + "epoch": 0.10218978102189781, + "grad_norm": 1.8656971621974168, + "learning_rate": 9.864212803336126e-06, + "loss": 0.718, + "step": 1050 + }, + { + "epoch": 0.10228710462287105, + "grad_norm": 1.4029758909387644, + "learning_rate": 9.86384773243992e-06, + "loss": 0.3892, + "step": 1051 + }, + { + "epoch": 0.10238442822384428, + "grad_norm": 1.1023559958942302, + "learning_rate": 9.863482178220176e-06, + "loss": 0.2453, + "step": 1052 + }, + { + "epoch": 0.10248175182481752, + "grad_norm": 1.5775869982106272, + "learning_rate": 9.863116140713219e-06, + "loss": 0.5324, + "step": 1053 + }, + { + "epoch": 0.10257907542579076, + "grad_norm": 1.603675899324949, + "learning_rate": 9.86274961995542e-06, + "loss": 0.4521, + "step": 1054 + }, + { + "epoch": 0.102676399026764, + "grad_norm": 1.6020699046167006, + "learning_rate": 9.862382615983203e-06, + "loss": 0.4545, + "step": 1055 + }, + { + "epoch": 0.10277372262773722, + "grad_norm": 1.474718021659803, + "learning_rate": 9.862015128833036e-06, + "loss": 0.4822, + "step": 1056 + }, + { + "epoch": 0.10287104622871046, + "grad_norm": 1.6033514684549, + "learning_rate": 9.861647158541438e-06, + "loss": 0.5069, + "step": 1057 + }, + { + "epoch": 0.1029683698296837, + "grad_norm": 1.4841655382640788, + "learning_rate": 9.861278705144974e-06, + "loss": 0.3865, + "step": 1058 + }, + { + "epoch": 0.10306569343065694, + "grad_norm": 1.1425556408878823, + "learning_rate": 9.860909768680259e-06, + "loss": 0.2443, + "step": 1059 + }, + { + "epoch": 0.10316301703163017, + "grad_norm": 1.5288676978753954, + "learning_rate": 9.86054034918395e-06, + "loss": 0.3652, + "step": 1060 + }, + { + "epoch": 0.1032603406326034, + "grad_norm": 1.5264484093473076, + "learning_rate": 9.860170446692758e-06, + "loss": 0.3318, + "step": 1061 + }, + { + "epoch": 0.10335766423357665, + "grad_norm": 1.4476258605632986, + "learning_rate": 9.859800061243443e-06, + "loss": 0.4518, + "step": 1062 + }, + { + "epoch": 0.10345498783454987, + "grad_norm": 1.336933590040686, + "learning_rate": 9.859429192872809e-06, + "loss": 0.2652, + "step": 1063 + }, + { + "epoch": 0.10355231143552311, + "grad_norm": 1.6050187197155075, + "learning_rate": 9.859057841617709e-06, + "loss": 0.5383, + "step": 1064 + }, + { + "epoch": 0.10364963503649635, + "grad_norm": 1.3472405276196469, + "learning_rate": 9.858686007515045e-06, + "loss": 0.4483, + "step": 1065 + }, + { + "epoch": 0.10374695863746959, + "grad_norm": 1.4838970820374793, + "learning_rate": 9.858313690601767e-06, + "loss": 0.3506, + "step": 1066 + }, + { + "epoch": 0.10384428223844282, + "grad_norm": 1.5911831099601979, + "learning_rate": 9.857940890914868e-06, + "loss": 0.3995, + "step": 1067 + }, + { + "epoch": 0.10394160583941606, + "grad_norm": 1.415577451063168, + "learning_rate": 9.8575676084914e-06, + "loss": 0.4773, + "step": 1068 + }, + { + "epoch": 0.1040389294403893, + "grad_norm": 1.7250253730787564, + "learning_rate": 9.857193843368451e-06, + "loss": 0.4456, + "step": 1069 + }, + { + "epoch": 0.10413625304136254, + "grad_norm": 1.5066269873708278, + "learning_rate": 9.856819595583166e-06, + "loss": 0.5481, + "step": 1070 + }, + { + "epoch": 0.10423357664233576, + "grad_norm": 1.5626665408071483, + "learning_rate": 9.856444865172732e-06, + "loss": 0.5382, + "step": 1071 + }, + { + "epoch": 0.104330900243309, + "grad_norm": 1.9089561390061884, + "learning_rate": 9.856069652174385e-06, + "loss": 0.5533, + "step": 1072 + }, + { + "epoch": 0.10442822384428224, + "grad_norm": 1.2757688373398666, + "learning_rate": 9.855693956625414e-06, + "loss": 0.3065, + "step": 1073 + }, + { + "epoch": 0.10452554744525547, + "grad_norm": 1.7230598513214688, + "learning_rate": 9.85531777856315e-06, + "loss": 0.5367, + "step": 1074 + }, + { + "epoch": 0.10462287104622871, + "grad_norm": 1.8368494244508635, + "learning_rate": 9.854941118024973e-06, + "loss": 0.4587, + "step": 1075 + }, + { + "epoch": 0.10472019464720195, + "grad_norm": 1.418583003899538, + "learning_rate": 9.854563975048314e-06, + "loss": 0.405, + "step": 1076 + }, + { + "epoch": 0.10481751824817519, + "grad_norm": 1.555078045275604, + "learning_rate": 9.854186349670648e-06, + "loss": 0.5572, + "step": 1077 + }, + { + "epoch": 0.10491484184914841, + "grad_norm": 1.5414220083120458, + "learning_rate": 9.853808241929502e-06, + "loss": 0.3382, + "step": 1078 + }, + { + "epoch": 0.10501216545012165, + "grad_norm": 1.2895897451723073, + "learning_rate": 9.853429651862445e-06, + "loss": 0.4342, + "step": 1079 + }, + { + "epoch": 0.10510948905109489, + "grad_norm": 1.3117010773132232, + "learning_rate": 9.853050579507104e-06, + "loss": 0.3751, + "step": 1080 + }, + { + "epoch": 0.10520681265206813, + "grad_norm": 1.5440994948167002, + "learning_rate": 9.852671024901141e-06, + "loss": 0.4971, + "step": 1081 + }, + { + "epoch": 0.10530413625304136, + "grad_norm": 1.2028388141262132, + "learning_rate": 9.852290988082278e-06, + "loss": 0.3933, + "step": 1082 + }, + { + "epoch": 0.1054014598540146, + "grad_norm": 1.6199890049219825, + "learning_rate": 9.851910469088275e-06, + "loss": 0.5394, + "step": 1083 + }, + { + "epoch": 0.10549878345498784, + "grad_norm": 1.4805170620003079, + "learning_rate": 9.851529467956946e-06, + "loss": 0.2421, + "step": 1084 + }, + { + "epoch": 0.10559610705596106, + "grad_norm": 1.432802486072686, + "learning_rate": 9.851147984726154e-06, + "loss": 0.479, + "step": 1085 + }, + { + "epoch": 0.1056934306569343, + "grad_norm": 1.7662999036343905, + "learning_rate": 9.850766019433803e-06, + "loss": 0.706, + "step": 1086 + }, + { + "epoch": 0.10579075425790754, + "grad_norm": 1.9136497208168854, + "learning_rate": 9.850383572117853e-06, + "loss": 0.7672, + "step": 1087 + }, + { + "epoch": 0.10588807785888078, + "grad_norm": 1.1667281997438979, + "learning_rate": 9.850000642816306e-06, + "loss": 0.2263, + "step": 1088 + }, + { + "epoch": 0.10598540145985401, + "grad_norm": 1.3133144576431575, + "learning_rate": 9.849617231567213e-06, + "loss": 0.2211, + "step": 1089 + }, + { + "epoch": 0.10608272506082725, + "grad_norm": 1.411642205718121, + "learning_rate": 9.849233338408674e-06, + "loss": 0.4379, + "step": 1090 + }, + { + "epoch": 0.10618004866180049, + "grad_norm": 1.7114110143353651, + "learning_rate": 9.84884896337884e-06, + "loss": 0.462, + "step": 1091 + }, + { + "epoch": 0.10627737226277373, + "grad_norm": 1.4035875335457177, + "learning_rate": 9.848464106515903e-06, + "loss": 0.317, + "step": 1092 + }, + { + "epoch": 0.10637469586374695, + "grad_norm": 1.5988244446936477, + "learning_rate": 9.848078767858107e-06, + "loss": 0.5254, + "step": 1093 + }, + { + "epoch": 0.1064720194647202, + "grad_norm": 1.6336010940510732, + "learning_rate": 9.847692947443745e-06, + "loss": 0.4979, + "step": 1094 + }, + { + "epoch": 0.10656934306569343, + "grad_norm": 1.68747146017171, + "learning_rate": 9.847306645311154e-06, + "loss": 0.5515, + "step": 1095 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 1.497709273552353, + "learning_rate": 9.846919861498724e-06, + "loss": 0.4221, + "step": 1096 + }, + { + "epoch": 0.1067639902676399, + "grad_norm": 1.4761873606313476, + "learning_rate": 9.846532596044887e-06, + "loss": 0.4296, + "step": 1097 + }, + { + "epoch": 0.10686131386861314, + "grad_norm": 1.1441862868877024, + "learning_rate": 9.846144848988127e-06, + "loss": 0.2816, + "step": 1098 + }, + { + "epoch": 0.10695863746958638, + "grad_norm": 1.7272604657837642, + "learning_rate": 9.845756620366976e-06, + "loss": 0.5916, + "step": 1099 + }, + { + "epoch": 0.1070559610705596, + "grad_norm": 1.3799505412872324, + "learning_rate": 9.84536791022001e-06, + "loss": 0.3947, + "step": 1100 + }, + { + "epoch": 0.10715328467153284, + "grad_norm": 1.6943818099878132, + "learning_rate": 9.844978718585855e-06, + "loss": 0.4737, + "step": 1101 + }, + { + "epoch": 0.10725060827250608, + "grad_norm": 1.5405614688920448, + "learning_rate": 9.84458904550319e-06, + "loss": 0.4152, + "step": 1102 + }, + { + "epoch": 0.10734793187347932, + "grad_norm": 1.6335292867295117, + "learning_rate": 9.844198891010733e-06, + "loss": 0.5677, + "step": 1103 + }, + { + "epoch": 0.10744525547445255, + "grad_norm": 1.302603147379972, + "learning_rate": 9.843808255147253e-06, + "loss": 0.4283, + "step": 1104 + }, + { + "epoch": 0.10754257907542579, + "grad_norm": 1.7967506033919078, + "learning_rate": 9.84341713795157e-06, + "loss": 0.6995, + "step": 1105 + }, + { + "epoch": 0.10763990267639903, + "grad_norm": 1.7320527346822367, + "learning_rate": 9.84302553946255e-06, + "loss": 0.5369, + "step": 1106 + }, + { + "epoch": 0.10773722627737227, + "grad_norm": 1.2124746103676287, + "learning_rate": 9.842633459719104e-06, + "loss": 0.296, + "step": 1107 + }, + { + "epoch": 0.1078345498783455, + "grad_norm": 1.6638227119864655, + "learning_rate": 9.842240898760195e-06, + "loss": 0.5632, + "step": 1108 + }, + { + "epoch": 0.10793187347931874, + "grad_norm": 1.5728826792836543, + "learning_rate": 9.841847856624833e-06, + "loss": 0.3407, + "step": 1109 + }, + { + "epoch": 0.10802919708029197, + "grad_norm": 1.4855225795030034, + "learning_rate": 9.841454333352073e-06, + "loss": 0.534, + "step": 1110 + }, + { + "epoch": 0.1081265206812652, + "grad_norm": 1.741747608159628, + "learning_rate": 9.841060328981019e-06, + "loss": 0.5739, + "step": 1111 + }, + { + "epoch": 0.10822384428223844, + "grad_norm": 1.2765533148109443, + "learning_rate": 9.840665843550825e-06, + "loss": 0.335, + "step": 1112 + }, + { + "epoch": 0.10832116788321168, + "grad_norm": 1.9391527817309226, + "learning_rate": 9.840270877100692e-06, + "loss": 0.5604, + "step": 1113 + }, + { + "epoch": 0.10841849148418492, + "grad_norm": 1.2570937099076989, + "learning_rate": 9.839875429669865e-06, + "loss": 0.3098, + "step": 1114 + }, + { + "epoch": 0.10851581508515815, + "grad_norm": 1.6345857910998665, + "learning_rate": 9.839479501297643e-06, + "loss": 0.4665, + "step": 1115 + }, + { + "epoch": 0.10861313868613139, + "grad_norm": 2.1039943309751075, + "learning_rate": 9.839083092023368e-06, + "loss": 0.8597, + "step": 1116 + }, + { + "epoch": 0.10871046228710463, + "grad_norm": 1.634678554608885, + "learning_rate": 9.838686201886432e-06, + "loss": 0.4907, + "step": 1117 + }, + { + "epoch": 0.10880778588807787, + "grad_norm": 1.328229383966676, + "learning_rate": 9.838288830926274e-06, + "loss": 0.3255, + "step": 1118 + }, + { + "epoch": 0.10890510948905109, + "grad_norm": 1.3587359099021656, + "learning_rate": 9.837890979182381e-06, + "loss": 0.4224, + "step": 1119 + }, + { + "epoch": 0.10900243309002433, + "grad_norm": 1.6242900911620413, + "learning_rate": 9.837492646694287e-06, + "loss": 0.4338, + "step": 1120 + }, + { + "epoch": 0.10909975669099757, + "grad_norm": 1.5901048900387273, + "learning_rate": 9.837093833501576e-06, + "loss": 0.5168, + "step": 1121 + }, + { + "epoch": 0.1091970802919708, + "grad_norm": 1.34172908606168, + "learning_rate": 9.836694539643878e-06, + "loss": 0.3233, + "step": 1122 + }, + { + "epoch": 0.10929440389294404, + "grad_norm": 1.4724714330159256, + "learning_rate": 9.83629476516087e-06, + "loss": 0.3652, + "step": 1123 + }, + { + "epoch": 0.10939172749391728, + "grad_norm": 1.4884050773310515, + "learning_rate": 9.835894510092279e-06, + "loss": 0.4622, + "step": 1124 + }, + { + "epoch": 0.10948905109489052, + "grad_norm": 1.3181328020728609, + "learning_rate": 9.835493774477877e-06, + "loss": 0.4531, + "step": 1125 + }, + { + "epoch": 0.10958637469586374, + "grad_norm": 1.5414298966880746, + "learning_rate": 9.835092558357488e-06, + "loss": 0.3659, + "step": 1126 + }, + { + "epoch": 0.10968369829683698, + "grad_norm": 1.3248299507567909, + "learning_rate": 9.834690861770979e-06, + "loss": 0.3207, + "step": 1127 + }, + { + "epoch": 0.10978102189781022, + "grad_norm": 1.5527535683267375, + "learning_rate": 9.834288684758269e-06, + "loss": 0.4938, + "step": 1128 + }, + { + "epoch": 0.10987834549878346, + "grad_norm": 1.3342131255187983, + "learning_rate": 9.83388602735932e-06, + "loss": 0.4451, + "step": 1129 + }, + { + "epoch": 0.10997566909975669, + "grad_norm": 1.0500905202266426, + "learning_rate": 9.833482889614143e-06, + "loss": 0.2408, + "step": 1130 + }, + { + "epoch": 0.11007299270072993, + "grad_norm": 1.377353907486564, + "learning_rate": 9.833079271562802e-06, + "loss": 0.3945, + "step": 1131 + }, + { + "epoch": 0.11017031630170317, + "grad_norm": 1.5823324787969848, + "learning_rate": 9.832675173245404e-06, + "loss": 0.6066, + "step": 1132 + }, + { + "epoch": 0.11026763990267639, + "grad_norm": 1.7266167679625446, + "learning_rate": 9.832270594702102e-06, + "loss": 0.6417, + "step": 1133 + }, + { + "epoch": 0.11036496350364963, + "grad_norm": 1.4091165783577269, + "learning_rate": 9.831865535973103e-06, + "loss": 0.2661, + "step": 1134 + }, + { + "epoch": 0.11046228710462287, + "grad_norm": 0.9959339686876645, + "learning_rate": 9.831459997098654e-06, + "loss": 0.1744, + "step": 1135 + }, + { + "epoch": 0.11055961070559611, + "grad_norm": 1.4748243970921762, + "learning_rate": 9.831053978119056e-06, + "loss": 0.4011, + "step": 1136 + }, + { + "epoch": 0.11065693430656934, + "grad_norm": 1.5879686249629044, + "learning_rate": 9.830647479074656e-06, + "loss": 0.3021, + "step": 1137 + }, + { + "epoch": 0.11075425790754258, + "grad_norm": 1.5057704716227702, + "learning_rate": 9.830240500005845e-06, + "loss": 0.2962, + "step": 1138 + }, + { + "epoch": 0.11085158150851582, + "grad_norm": 1.7497051535586357, + "learning_rate": 9.829833040953068e-06, + "loss": 0.4717, + "step": 1139 + }, + { + "epoch": 0.11094890510948906, + "grad_norm": 1.7819946472609902, + "learning_rate": 9.829425101956812e-06, + "loss": 0.6113, + "step": 1140 + }, + { + "epoch": 0.11104622871046228, + "grad_norm": 1.7680522472506797, + "learning_rate": 9.829016683057615e-06, + "loss": 0.4672, + "step": 1141 + }, + { + "epoch": 0.11114355231143552, + "grad_norm": 1.8291787265156998, + "learning_rate": 9.828607784296063e-06, + "loss": 0.5148, + "step": 1142 + }, + { + "epoch": 0.11124087591240876, + "grad_norm": 1.4119536127948566, + "learning_rate": 9.828198405712788e-06, + "loss": 0.2698, + "step": 1143 + }, + { + "epoch": 0.11133819951338199, + "grad_norm": 1.67600232780131, + "learning_rate": 9.827788547348469e-06, + "loss": 0.4912, + "step": 1144 + }, + { + "epoch": 0.11143552311435523, + "grad_norm": 1.9367616538665617, + "learning_rate": 9.827378209243835e-06, + "loss": 0.3781, + "step": 1145 + }, + { + "epoch": 0.11153284671532847, + "grad_norm": 1.7032208896905794, + "learning_rate": 9.826967391439662e-06, + "loss": 0.5816, + "step": 1146 + }, + { + "epoch": 0.11163017031630171, + "grad_norm": 1.60872896431165, + "learning_rate": 9.826556093976769e-06, + "loss": 0.4654, + "step": 1147 + }, + { + "epoch": 0.11172749391727493, + "grad_norm": 1.5752275514466696, + "learning_rate": 9.826144316896033e-06, + "loss": 0.3177, + "step": 1148 + }, + { + "epoch": 0.11182481751824817, + "grad_norm": 1.8207599924827627, + "learning_rate": 9.82573206023837e-06, + "loss": 0.5701, + "step": 1149 + }, + { + "epoch": 0.11192214111922141, + "grad_norm": 1.5850279506541385, + "learning_rate": 9.825319324044745e-06, + "loss": 0.5616, + "step": 1150 + }, + { + "epoch": 0.11201946472019465, + "grad_norm": 1.360496233978723, + "learning_rate": 9.824906108356174e-06, + "loss": 0.3407, + "step": 1151 + }, + { + "epoch": 0.11211678832116788, + "grad_norm": 1.6595565610362801, + "learning_rate": 9.824492413213717e-06, + "loss": 0.6641, + "step": 1152 + }, + { + "epoch": 0.11221411192214112, + "grad_norm": 1.6031792644515102, + "learning_rate": 9.824078238658483e-06, + "loss": 0.4779, + "step": 1153 + }, + { + "epoch": 0.11231143552311436, + "grad_norm": 1.0762751645680708, + "learning_rate": 9.82366358473163e-06, + "loss": 0.2739, + "step": 1154 + }, + { + "epoch": 0.11240875912408758, + "grad_norm": 1.3660129842713564, + "learning_rate": 9.82324845147436e-06, + "loss": 0.5043, + "step": 1155 + }, + { + "epoch": 0.11250608272506082, + "grad_norm": 1.6273408315616833, + "learning_rate": 9.822832838927929e-06, + "loss": 0.6159, + "step": 1156 + }, + { + "epoch": 0.11260340632603406, + "grad_norm": 1.4216921342906768, + "learning_rate": 9.822416747133634e-06, + "loss": 0.4093, + "step": 1157 + }, + { + "epoch": 0.1127007299270073, + "grad_norm": 1.8899721642114575, + "learning_rate": 9.822000176132822e-06, + "loss": 0.5586, + "step": 1158 + }, + { + "epoch": 0.11279805352798053, + "grad_norm": 1.5144459966059345, + "learning_rate": 9.821583125966889e-06, + "loss": 0.3806, + "step": 1159 + }, + { + "epoch": 0.11289537712895377, + "grad_norm": 1.61041803725934, + "learning_rate": 9.821165596677278e-06, + "loss": 0.4064, + "step": 1160 + }, + { + "epoch": 0.11299270072992701, + "grad_norm": 1.5410637406837986, + "learning_rate": 9.820747588305477e-06, + "loss": 0.3526, + "step": 1161 + }, + { + "epoch": 0.11309002433090025, + "grad_norm": 1.5545393523360629, + "learning_rate": 9.820329100893026e-06, + "loss": 0.3834, + "step": 1162 + }, + { + "epoch": 0.11318734793187347, + "grad_norm": 1.6391567381322345, + "learning_rate": 9.819910134481508e-06, + "loss": 0.3849, + "step": 1163 + }, + { + "epoch": 0.11328467153284671, + "grad_norm": 1.5204183543600032, + "learning_rate": 9.819490689112559e-06, + "loss": 0.4712, + "step": 1164 + }, + { + "epoch": 0.11338199513381995, + "grad_norm": 1.5168954302933022, + "learning_rate": 9.819070764827858e-06, + "loss": 0.4662, + "step": 1165 + }, + { + "epoch": 0.1134793187347932, + "grad_norm": 1.4412304117107342, + "learning_rate": 9.818650361669133e-06, + "loss": 0.3515, + "step": 1166 + }, + { + "epoch": 0.11357664233576642, + "grad_norm": 1.5419710047923603, + "learning_rate": 9.81822947967816e-06, + "loss": 0.383, + "step": 1167 + }, + { + "epoch": 0.11367396593673966, + "grad_norm": 1.59211707141906, + "learning_rate": 9.817808118896759e-06, + "loss": 0.5101, + "step": 1168 + }, + { + "epoch": 0.1137712895377129, + "grad_norm": 1.9315831066859817, + "learning_rate": 9.817386279366808e-06, + "loss": 0.6179, + "step": 1169 + }, + { + "epoch": 0.11386861313868613, + "grad_norm": 1.3153157684002792, + "learning_rate": 9.816963961130218e-06, + "loss": 0.2382, + "step": 1170 + }, + { + "epoch": 0.11396593673965937, + "grad_norm": 1.3579619945410324, + "learning_rate": 9.81654116422896e-06, + "loss": 0.4424, + "step": 1171 + }, + { + "epoch": 0.1140632603406326, + "grad_norm": 1.479330223962703, + "learning_rate": 9.816117888705046e-06, + "loss": 0.3647, + "step": 1172 + }, + { + "epoch": 0.11416058394160584, + "grad_norm": 1.5031676224913018, + "learning_rate": 9.815694134600537e-06, + "loss": 0.3686, + "step": 1173 + }, + { + "epoch": 0.11425790754257907, + "grad_norm": 1.6106095254885215, + "learning_rate": 9.815269901957543e-06, + "loss": 0.5309, + "step": 1174 + }, + { + "epoch": 0.11435523114355231, + "grad_norm": 1.4367590943688036, + "learning_rate": 9.814845190818218e-06, + "loss": 0.3786, + "step": 1175 + }, + { + "epoch": 0.11445255474452555, + "grad_norm": 2.0513510648109636, + "learning_rate": 9.814420001224767e-06, + "loss": 0.8885, + "step": 1176 + }, + { + "epoch": 0.11454987834549879, + "grad_norm": 1.3799990465326748, + "learning_rate": 9.813994333219443e-06, + "loss": 0.3511, + "step": 1177 + }, + { + "epoch": 0.11464720194647202, + "grad_norm": 1.2354207015762353, + "learning_rate": 9.813568186844541e-06, + "loss": 0.3571, + "step": 1178 + }, + { + "epoch": 0.11474452554744526, + "grad_norm": 2.0501383618438678, + "learning_rate": 9.813141562142409e-06, + "loss": 0.4485, + "step": 1179 + }, + { + "epoch": 0.1148418491484185, + "grad_norm": 1.351584991091541, + "learning_rate": 9.812714459155444e-06, + "loss": 0.2894, + "step": 1180 + }, + { + "epoch": 0.11493917274939172, + "grad_norm": 1.3568994189032655, + "learning_rate": 9.812286877926085e-06, + "loss": 0.4016, + "step": 1181 + }, + { + "epoch": 0.11503649635036496, + "grad_norm": 1.4949546840268106, + "learning_rate": 9.81185881849682e-06, + "loss": 0.527, + "step": 1182 + }, + { + "epoch": 0.1151338199513382, + "grad_norm": 1.5053242129518953, + "learning_rate": 9.811430280910186e-06, + "loss": 0.4324, + "step": 1183 + }, + { + "epoch": 0.11523114355231144, + "grad_norm": 1.2995408017430223, + "learning_rate": 9.811001265208768e-06, + "loss": 0.4592, + "step": 1184 + }, + { + "epoch": 0.11532846715328467, + "grad_norm": 1.4103061247668216, + "learning_rate": 9.810571771435197e-06, + "loss": 0.4615, + "step": 1185 + }, + { + "epoch": 0.1154257907542579, + "grad_norm": 1.3694132099540144, + "learning_rate": 9.810141799632153e-06, + "loss": 0.4224, + "step": 1186 + }, + { + "epoch": 0.11552311435523115, + "grad_norm": 1.4494836775882813, + "learning_rate": 9.809711349842363e-06, + "loss": 0.4189, + "step": 1187 + }, + { + "epoch": 0.11562043795620439, + "grad_norm": 1.5100099037805617, + "learning_rate": 9.809280422108598e-06, + "loss": 0.495, + "step": 1188 + }, + { + "epoch": 0.11571776155717761, + "grad_norm": 1.449093301695385, + "learning_rate": 9.808849016473682e-06, + "loss": 0.345, + "step": 1189 + }, + { + "epoch": 0.11581508515815085, + "grad_norm": 1.501093862959825, + "learning_rate": 9.808417132980484e-06, + "loss": 0.4624, + "step": 1190 + }, + { + "epoch": 0.11591240875912409, + "grad_norm": 1.4567657310588336, + "learning_rate": 9.807984771671919e-06, + "loss": 0.2836, + "step": 1191 + }, + { + "epoch": 0.11600973236009732, + "grad_norm": 1.6666134190000732, + "learning_rate": 9.807551932590952e-06, + "loss": 0.3341, + "step": 1192 + }, + { + "epoch": 0.11610705596107056, + "grad_norm": 1.7534770482902293, + "learning_rate": 9.807118615780595e-06, + "loss": 0.6021, + "step": 1193 + }, + { + "epoch": 0.1162043795620438, + "grad_norm": 1.744738707996039, + "learning_rate": 9.806684821283908e-06, + "loss": 0.4593, + "step": 1194 + }, + { + "epoch": 0.11630170316301704, + "grad_norm": 1.7519974888996959, + "learning_rate": 9.806250549143994e-06, + "loss": 0.5433, + "step": 1195 + }, + { + "epoch": 0.11639902676399026, + "grad_norm": 1.6094009249182397, + "learning_rate": 9.805815799404008e-06, + "loss": 0.6053, + "step": 1196 + }, + { + "epoch": 0.1164963503649635, + "grad_norm": 1.4291146386614342, + "learning_rate": 9.805380572107153e-06, + "loss": 0.4377, + "step": 1197 + }, + { + "epoch": 0.11659367396593674, + "grad_norm": 1.6092739629047335, + "learning_rate": 9.804944867296678e-06, + "loss": 0.5708, + "step": 1198 + }, + { + "epoch": 0.11669099756690998, + "grad_norm": 1.3856208861087336, + "learning_rate": 9.804508685015876e-06, + "loss": 0.3677, + "step": 1199 + }, + { + "epoch": 0.11678832116788321, + "grad_norm": 1.52110832375871, + "learning_rate": 9.804072025308096e-06, + "loss": 0.3076, + "step": 1200 + }, + { + "epoch": 0.11688564476885645, + "grad_norm": 1.3072729020716074, + "learning_rate": 9.803634888216724e-06, + "loss": 0.2673, + "step": 1201 + }, + { + "epoch": 0.11698296836982969, + "grad_norm": 1.9045471339964295, + "learning_rate": 9.8031972737852e-06, + "loss": 0.7326, + "step": 1202 + }, + { + "epoch": 0.11708029197080291, + "grad_norm": 1.3351659498760804, + "learning_rate": 9.802759182057013e-06, + "loss": 0.4193, + "step": 1203 + }, + { + "epoch": 0.11717761557177615, + "grad_norm": 1.4664570380446003, + "learning_rate": 9.80232061307569e-06, + "loss": 0.358, + "step": 1204 + }, + { + "epoch": 0.11727493917274939, + "grad_norm": 1.1764722042212887, + "learning_rate": 9.80188156688482e-06, + "loss": 0.3093, + "step": 1205 + }, + { + "epoch": 0.11737226277372263, + "grad_norm": 1.5415184448059258, + "learning_rate": 9.801442043528026e-06, + "loss": 0.4667, + "step": 1206 + }, + { + "epoch": 0.11746958637469586, + "grad_norm": 1.4827166479100118, + "learning_rate": 9.801002043048984e-06, + "loss": 0.4876, + "step": 1207 + }, + { + "epoch": 0.1175669099756691, + "grad_norm": 1.6786553338713377, + "learning_rate": 9.80056156549142e-06, + "loss": 0.5076, + "step": 1208 + }, + { + "epoch": 0.11766423357664234, + "grad_norm": 1.2161366736688597, + "learning_rate": 9.8001206108991e-06, + "loss": 0.2247, + "step": 1209 + }, + { + "epoch": 0.11776155717761558, + "grad_norm": 1.4015628266094937, + "learning_rate": 9.799679179315846e-06, + "loss": 0.4327, + "step": 1210 + }, + { + "epoch": 0.1178588807785888, + "grad_norm": 1.5420255844625947, + "learning_rate": 9.799237270785522e-06, + "loss": 0.438, + "step": 1211 + }, + { + "epoch": 0.11795620437956204, + "grad_norm": 1.5978057716745744, + "learning_rate": 9.79879488535204e-06, + "loss": 0.4203, + "step": 1212 + }, + { + "epoch": 0.11805352798053528, + "grad_norm": 1.8973070083198396, + "learning_rate": 9.79835202305936e-06, + "loss": 0.7404, + "step": 1213 + }, + { + "epoch": 0.11815085158150851, + "grad_norm": 1.5331088091760856, + "learning_rate": 9.797908683951492e-06, + "loss": 0.5378, + "step": 1214 + }, + { + "epoch": 0.11824817518248175, + "grad_norm": 1.9627839775910105, + "learning_rate": 9.797464868072489e-06, + "loss": 0.6298, + "step": 1215 + }, + { + "epoch": 0.11834549878345499, + "grad_norm": 1.5059209630421948, + "learning_rate": 9.797020575466452e-06, + "loss": 0.4233, + "step": 1216 + }, + { + "epoch": 0.11844282238442823, + "grad_norm": 1.4714593450262028, + "learning_rate": 9.796575806177531e-06, + "loss": 0.4078, + "step": 1217 + }, + { + "epoch": 0.11854014598540145, + "grad_norm": 1.812199008547911, + "learning_rate": 9.796130560249926e-06, + "loss": 0.6636, + "step": 1218 + }, + { + "epoch": 0.1186374695863747, + "grad_norm": 1.330364448549248, + "learning_rate": 9.795684837727878e-06, + "loss": 0.2597, + "step": 1219 + }, + { + "epoch": 0.11873479318734793, + "grad_norm": 1.1642089342014024, + "learning_rate": 9.795238638655681e-06, + "loss": 0.2669, + "step": 1220 + }, + { + "epoch": 0.11883211678832117, + "grad_norm": 1.0578785975666756, + "learning_rate": 9.794791963077672e-06, + "loss": 0.2138, + "step": 1221 + }, + { + "epoch": 0.1189294403892944, + "grad_norm": 1.2810119779981208, + "learning_rate": 9.794344811038239e-06, + "loss": 0.3426, + "step": 1222 + }, + { + "epoch": 0.11902676399026764, + "grad_norm": 1.6109574325023976, + "learning_rate": 9.793897182581816e-06, + "loss": 0.4931, + "step": 1223 + }, + { + "epoch": 0.11912408759124088, + "grad_norm": 1.8314564663365431, + "learning_rate": 9.793449077752882e-06, + "loss": 0.5424, + "step": 1224 + }, + { + "epoch": 0.1192214111922141, + "grad_norm": 1.3266514401224994, + "learning_rate": 9.793000496595968e-06, + "loss": 0.3123, + "step": 1225 + }, + { + "epoch": 0.11931873479318734, + "grad_norm": 1.624792232435884, + "learning_rate": 9.792551439155649e-06, + "loss": 0.3635, + "step": 1226 + }, + { + "epoch": 0.11941605839416058, + "grad_norm": 1.306535519875853, + "learning_rate": 9.792101905476547e-06, + "loss": 0.3252, + "step": 1227 + }, + { + "epoch": 0.11951338199513382, + "grad_norm": 1.591218471169796, + "learning_rate": 9.791651895603333e-06, + "loss": 0.5493, + "step": 1228 + }, + { + "epoch": 0.11961070559610705, + "grad_norm": 1.8218114354346657, + "learning_rate": 9.791201409580725e-06, + "loss": 0.6988, + "step": 1229 + }, + { + "epoch": 0.11970802919708029, + "grad_norm": 1.7366783724272585, + "learning_rate": 9.790750447453487e-06, + "loss": 0.4285, + "step": 1230 + }, + { + "epoch": 0.11980535279805353, + "grad_norm": 1.9439764988659998, + "learning_rate": 9.790299009266434e-06, + "loss": 0.2787, + "step": 1231 + }, + { + "epoch": 0.11990267639902677, + "grad_norm": 1.4894849660267724, + "learning_rate": 9.789847095064425e-06, + "loss": 0.2531, + "step": 1232 + }, + { + "epoch": 0.12, + "grad_norm": 1.6270936536604101, + "learning_rate": 9.789394704892364e-06, + "loss": 0.5309, + "step": 1233 + }, + { + "epoch": 0.12009732360097324, + "grad_norm": 1.4144832764840753, + "learning_rate": 9.788941838795209e-06, + "loss": 0.298, + "step": 1234 + }, + { + "epoch": 0.12019464720194648, + "grad_norm": 1.546926786538444, + "learning_rate": 9.788488496817958e-06, + "loss": 0.4751, + "step": 1235 + }, + { + "epoch": 0.12029197080291971, + "grad_norm": 1.5827216255031866, + "learning_rate": 9.788034679005664e-06, + "loss": 0.4576, + "step": 1236 + }, + { + "epoch": 0.12038929440389294, + "grad_norm": 1.6103699210596951, + "learning_rate": 9.78758038540342e-06, + "loss": 0.4637, + "step": 1237 + }, + { + "epoch": 0.12048661800486618, + "grad_norm": 1.4918367462943103, + "learning_rate": 9.78712561605637e-06, + "loss": 0.4998, + "step": 1238 + }, + { + "epoch": 0.12058394160583942, + "grad_norm": 1.5775409788682337, + "learning_rate": 9.786670371009706e-06, + "loss": 0.4415, + "step": 1239 + }, + { + "epoch": 0.12068126520681265, + "grad_norm": 1.5427286854632911, + "learning_rate": 9.786214650308666e-06, + "loss": 0.4606, + "step": 1240 + }, + { + "epoch": 0.12077858880778589, + "grad_norm": 1.523821034203494, + "learning_rate": 9.78575845399853e-06, + "loss": 0.3918, + "step": 1241 + }, + { + "epoch": 0.12087591240875913, + "grad_norm": 1.950297391662121, + "learning_rate": 9.785301782124638e-06, + "loss": 0.5579, + "step": 1242 + }, + { + "epoch": 0.12097323600973237, + "grad_norm": 1.5957141815138678, + "learning_rate": 9.784844634732367e-06, + "loss": 0.3814, + "step": 1243 + }, + { + "epoch": 0.12107055961070559, + "grad_norm": 1.3924341327971197, + "learning_rate": 9.784387011867145e-06, + "loss": 0.3576, + "step": 1244 + }, + { + "epoch": 0.12116788321167883, + "grad_norm": 1.670661057733516, + "learning_rate": 9.783928913574442e-06, + "loss": 0.5307, + "step": 1245 + }, + { + "epoch": 0.12126520681265207, + "grad_norm": 1.9162789104592521, + "learning_rate": 9.783470339899783e-06, + "loss": 0.2309, + "step": 1246 + }, + { + "epoch": 0.12136253041362531, + "grad_norm": 1.4323883393925967, + "learning_rate": 9.783011290888737e-06, + "loss": 0.4816, + "step": 1247 + }, + { + "epoch": 0.12145985401459854, + "grad_norm": 1.133557304990043, + "learning_rate": 9.78255176658692e-06, + "loss": 0.259, + "step": 1248 + }, + { + "epoch": 0.12155717761557178, + "grad_norm": 1.6381613262272003, + "learning_rate": 9.782091767039992e-06, + "loss": 0.535, + "step": 1249 + }, + { + "epoch": 0.12165450121654502, + "grad_norm": 1.521879132713644, + "learning_rate": 9.781631292293668e-06, + "loss": 0.5299, + "step": 1250 + }, + { + "epoch": 0.12175182481751824, + "grad_norm": 1.2965362290198492, + "learning_rate": 9.781170342393702e-06, + "loss": 0.4161, + "step": 1251 + }, + { + "epoch": 0.12184914841849148, + "grad_norm": 1.4753461399295356, + "learning_rate": 9.780708917385901e-06, + "loss": 0.5379, + "step": 1252 + }, + { + "epoch": 0.12194647201946472, + "grad_norm": 0.9509628974965367, + "learning_rate": 9.780247017316115e-06, + "loss": 0.2681, + "step": 1253 + }, + { + "epoch": 0.12204379562043796, + "grad_norm": 1.3308735848114122, + "learning_rate": 9.779784642230246e-06, + "loss": 0.4247, + "step": 1254 + }, + { + "epoch": 0.12214111922141119, + "grad_norm": 1.1206835484781008, + "learning_rate": 9.779321792174239e-06, + "loss": 0.2301, + "step": 1255 + }, + { + "epoch": 0.12223844282238443, + "grad_norm": 1.2598096263209464, + "learning_rate": 9.778858467194087e-06, + "loss": 0.3163, + "step": 1256 + }, + { + "epoch": 0.12233576642335767, + "grad_norm": 1.4871998460052394, + "learning_rate": 9.778394667335834e-06, + "loss": 0.3433, + "step": 1257 + }, + { + "epoch": 0.1224330900243309, + "grad_norm": 1.384245738588718, + "learning_rate": 9.777930392645565e-06, + "loss": 0.2111, + "step": 1258 + }, + { + "epoch": 0.12253041362530413, + "grad_norm": 1.4369061113982475, + "learning_rate": 9.777465643169417e-06, + "loss": 0.3895, + "step": 1259 + }, + { + "epoch": 0.12262773722627737, + "grad_norm": 1.8558638944994366, + "learning_rate": 9.777000418953568e-06, + "loss": 0.3388, + "step": 1260 + }, + { + "epoch": 0.12272506082725061, + "grad_norm": 1.512984108492842, + "learning_rate": 9.776534720044255e-06, + "loss": 0.4726, + "step": 1261 + }, + { + "epoch": 0.12282238442822384, + "grad_norm": 1.367540412040702, + "learning_rate": 9.77606854648775e-06, + "loss": 0.2684, + "step": 1262 + }, + { + "epoch": 0.12291970802919708, + "grad_norm": 1.2042550068870583, + "learning_rate": 9.775601898330377e-06, + "loss": 0.2173, + "step": 1263 + }, + { + "epoch": 0.12301703163017032, + "grad_norm": 1.5842484372844456, + "learning_rate": 9.775134775618509e-06, + "loss": 0.5608, + "step": 1264 + }, + { + "epoch": 0.12311435523114356, + "grad_norm": 1.397447971201202, + "learning_rate": 9.774667178398562e-06, + "loss": 0.4632, + "step": 1265 + }, + { + "epoch": 0.12321167883211678, + "grad_norm": 1.3468996882112099, + "learning_rate": 9.774199106717004e-06, + "loss": 0.3697, + "step": 1266 + }, + { + "epoch": 0.12330900243309002, + "grad_norm": 1.252677053550249, + "learning_rate": 9.773730560620345e-06, + "loss": 0.2377, + "step": 1267 + }, + { + "epoch": 0.12340632603406326, + "grad_norm": 1.4179546260918483, + "learning_rate": 9.773261540155148e-06, + "loss": 0.4857, + "step": 1268 + }, + { + "epoch": 0.1235036496350365, + "grad_norm": 1.3092572252570605, + "learning_rate": 9.772792045368015e-06, + "loss": 0.2969, + "step": 1269 + }, + { + "epoch": 0.12360097323600973, + "grad_norm": 1.7901486760202572, + "learning_rate": 9.772322076305607e-06, + "loss": 0.6935, + "step": 1270 + }, + { + "epoch": 0.12369829683698297, + "grad_norm": 1.5982523135009328, + "learning_rate": 9.771851633014618e-06, + "loss": 0.4368, + "step": 1271 + }, + { + "epoch": 0.12379562043795621, + "grad_norm": 1.195950207110724, + "learning_rate": 9.7713807155418e-06, + "loss": 0.3202, + "step": 1272 + }, + { + "epoch": 0.12389294403892943, + "grad_norm": 1.352519407714817, + "learning_rate": 9.770909323933947e-06, + "loss": 0.4284, + "step": 1273 + }, + { + "epoch": 0.12399026763990267, + "grad_norm": 1.4231425912579843, + "learning_rate": 9.770437458237903e-06, + "loss": 0.434, + "step": 1274 + }, + { + "epoch": 0.12408759124087591, + "grad_norm": 1.2825234760121222, + "learning_rate": 9.769965118500555e-06, + "loss": 0.3817, + "step": 1275 + }, + { + "epoch": 0.12418491484184915, + "grad_norm": 1.8250797045299043, + "learning_rate": 9.769492304768843e-06, + "loss": 0.7366, + "step": 1276 + }, + { + "epoch": 0.12428223844282238, + "grad_norm": 1.3974167065714918, + "learning_rate": 9.769019017089748e-06, + "loss": 0.2804, + "step": 1277 + }, + { + "epoch": 0.12437956204379562, + "grad_norm": 1.2933017267383033, + "learning_rate": 9.768545255510302e-06, + "loss": 0.3495, + "step": 1278 + }, + { + "epoch": 0.12447688564476886, + "grad_norm": 1.2423501538003798, + "learning_rate": 9.768071020077584e-06, + "loss": 0.2908, + "step": 1279 + }, + { + "epoch": 0.1245742092457421, + "grad_norm": 1.8228975858143868, + "learning_rate": 9.767596310838718e-06, + "loss": 0.4222, + "step": 1280 + }, + { + "epoch": 0.12467153284671532, + "grad_norm": 1.5510872411682606, + "learning_rate": 9.767121127840874e-06, + "loss": 0.5058, + "step": 1281 + }, + { + "epoch": 0.12476885644768856, + "grad_norm": 1.6665778692750302, + "learning_rate": 9.766645471131278e-06, + "loss": 0.3592, + "step": 1282 + }, + { + "epoch": 0.1248661800486618, + "grad_norm": 1.5396481092124317, + "learning_rate": 9.766169340757187e-06, + "loss": 0.2737, + "step": 1283 + }, + { + "epoch": 0.12496350364963503, + "grad_norm": 1.5555229817491858, + "learning_rate": 9.765692736765922e-06, + "loss": 0.5466, + "step": 1284 + }, + { + "epoch": 0.12506082725060827, + "grad_norm": 1.5351601326386175, + "learning_rate": 9.765215659204838e-06, + "loss": 0.4733, + "step": 1285 + }, + { + "epoch": 0.1251581508515815, + "grad_norm": 1.2793773363741519, + "learning_rate": 9.764738108121347e-06, + "loss": 0.3056, + "step": 1286 + }, + { + "epoch": 0.12525547445255475, + "grad_norm": 1.6331577939793205, + "learning_rate": 9.764260083562902e-06, + "loss": 0.5883, + "step": 1287 + }, + { + "epoch": 0.12535279805352798, + "grad_norm": 1.3363728845544067, + "learning_rate": 9.763781585577003e-06, + "loss": 0.2904, + "step": 1288 + }, + { + "epoch": 0.12545012165450123, + "grad_norm": 1.360818732035961, + "learning_rate": 9.763302614211199e-06, + "loss": 0.4202, + "step": 1289 + }, + { + "epoch": 0.12554744525547445, + "grad_norm": 1.3103877737057137, + "learning_rate": 9.762823169513089e-06, + "loss": 0.4694, + "step": 1290 + }, + { + "epoch": 0.12564476885644768, + "grad_norm": 1.1848446118808063, + "learning_rate": 9.76234325153031e-06, + "loss": 0.2265, + "step": 1291 + }, + { + "epoch": 0.12574209245742093, + "grad_norm": 1.3494947194310234, + "learning_rate": 9.761862860310558e-06, + "loss": 0.2382, + "step": 1292 + }, + { + "epoch": 0.12583941605839416, + "grad_norm": 1.7062717031139596, + "learning_rate": 9.761381995901564e-06, + "loss": 0.7254, + "step": 1293 + }, + { + "epoch": 0.12593673965936739, + "grad_norm": 1.208337515242783, + "learning_rate": 9.760900658351117e-06, + "loss": 0.326, + "step": 1294 + }, + { + "epoch": 0.12603406326034064, + "grad_norm": 1.3159841432369768, + "learning_rate": 9.760418847707043e-06, + "loss": 0.3438, + "step": 1295 + }, + { + "epoch": 0.12613138686131387, + "grad_norm": 1.3809255020300633, + "learning_rate": 9.759936564017223e-06, + "loss": 0.2716, + "step": 1296 + }, + { + "epoch": 0.1262287104622871, + "grad_norm": 1.3382917039666673, + "learning_rate": 9.759453807329582e-06, + "loss": 0.2882, + "step": 1297 + }, + { + "epoch": 0.12632603406326035, + "grad_norm": 1.3572918507167704, + "learning_rate": 9.75897057769209e-06, + "loss": 0.4181, + "step": 1298 + }, + { + "epoch": 0.12642335766423357, + "grad_norm": 1.4433440128897468, + "learning_rate": 9.758486875152766e-06, + "loss": 0.4883, + "step": 1299 + }, + { + "epoch": 0.12652068126520682, + "grad_norm": 1.1934091211117765, + "learning_rate": 9.758002699759677e-06, + "loss": 0.3828, + "step": 1300 + }, + { + "epoch": 0.12661800486618005, + "grad_norm": 1.4647925609545562, + "learning_rate": 9.757518051560935e-06, + "loss": 0.402, + "step": 1301 + }, + { + "epoch": 0.12671532846715328, + "grad_norm": 1.658517832372951, + "learning_rate": 9.7570329306047e-06, + "loss": 0.6752, + "step": 1302 + }, + { + "epoch": 0.12681265206812653, + "grad_norm": 1.2682494280043264, + "learning_rate": 9.75654733693918e-06, + "loss": 0.2786, + "step": 1303 + }, + { + "epoch": 0.12690997566909976, + "grad_norm": 1.3919267883395627, + "learning_rate": 9.756061270612625e-06, + "loss": 0.4806, + "step": 1304 + }, + { + "epoch": 0.12700729927007298, + "grad_norm": 1.160118847382142, + "learning_rate": 9.75557473167334e-06, + "loss": 0.2458, + "step": 1305 + }, + { + "epoch": 0.12710462287104624, + "grad_norm": 1.482640427472728, + "learning_rate": 9.755087720169672e-06, + "loss": 0.527, + "step": 1306 + }, + { + "epoch": 0.12720194647201946, + "grad_norm": 1.5068875178509769, + "learning_rate": 9.75460023615001e-06, + "loss": 0.4985, + "step": 1307 + }, + { + "epoch": 0.1272992700729927, + "grad_norm": 1.2878541774064265, + "learning_rate": 9.754112279662805e-06, + "loss": 0.3478, + "step": 1308 + }, + { + "epoch": 0.12739659367396594, + "grad_norm": 1.1398490461157162, + "learning_rate": 9.75362385075654e-06, + "loss": 0.3084, + "step": 1309 + }, + { + "epoch": 0.12749391727493917, + "grad_norm": 1.2924420070365765, + "learning_rate": 9.75313494947975e-06, + "loss": 0.3919, + "step": 1310 + }, + { + "epoch": 0.12759124087591242, + "grad_norm": 1.4558696462945964, + "learning_rate": 9.752645575881018e-06, + "loss": 0.225, + "step": 1311 + }, + { + "epoch": 0.12768856447688565, + "grad_norm": 1.677251779693783, + "learning_rate": 9.752155730008974e-06, + "loss": 0.4831, + "step": 1312 + }, + { + "epoch": 0.12778588807785887, + "grad_norm": 1.3350720195417478, + "learning_rate": 9.751665411912294e-06, + "loss": 0.4371, + "step": 1313 + }, + { + "epoch": 0.12788321167883213, + "grad_norm": 1.3653039655289896, + "learning_rate": 9.751174621639702e-06, + "loss": 0.4051, + "step": 1314 + }, + { + "epoch": 0.12798053527980535, + "grad_norm": 2.0214110135389927, + "learning_rate": 9.75068335923997e-06, + "loss": 0.4971, + "step": 1315 + }, + { + "epoch": 0.12807785888077858, + "grad_norm": 1.7144522600221743, + "learning_rate": 9.750191624761909e-06, + "loss": 0.6353, + "step": 1316 + }, + { + "epoch": 0.12817518248175183, + "grad_norm": 1.61491787633751, + "learning_rate": 9.749699418254388e-06, + "loss": 0.5408, + "step": 1317 + }, + { + "epoch": 0.12827250608272506, + "grad_norm": 1.3029361322695596, + "learning_rate": 9.749206739766317e-06, + "loss": 0.407, + "step": 1318 + }, + { + "epoch": 0.12836982968369828, + "grad_norm": 1.2453189940624274, + "learning_rate": 9.748713589346652e-06, + "loss": 0.3254, + "step": 1319 + }, + { + "epoch": 0.12846715328467154, + "grad_norm": 1.4117795102544664, + "learning_rate": 9.748219967044398e-06, + "loss": 0.3941, + "step": 1320 + }, + { + "epoch": 0.12856447688564476, + "grad_norm": 1.4197813276706028, + "learning_rate": 9.74772587290861e-06, + "loss": 0.3454, + "step": 1321 + }, + { + "epoch": 0.12866180048661802, + "grad_norm": 1.3133599325252279, + "learning_rate": 9.747231306988381e-06, + "loss": 0.3389, + "step": 1322 + }, + { + "epoch": 0.12875912408759124, + "grad_norm": 1.3432229805022793, + "learning_rate": 9.746736269332861e-06, + "loss": 0.469, + "step": 1323 + }, + { + "epoch": 0.12885644768856447, + "grad_norm": 1.1244292400820686, + "learning_rate": 9.746240759991241e-06, + "loss": 0.3674, + "step": 1324 + }, + { + "epoch": 0.12895377128953772, + "grad_norm": 1.4966792860681473, + "learning_rate": 9.745744779012758e-06, + "loss": 0.4308, + "step": 1325 + }, + { + "epoch": 0.12905109489051095, + "grad_norm": 1.5238028846181695, + "learning_rate": 9.745248326446699e-06, + "loss": 0.4213, + "step": 1326 + }, + { + "epoch": 0.12914841849148417, + "grad_norm": 1.3633303920337936, + "learning_rate": 9.744751402342398e-06, + "loss": 0.438, + "step": 1327 + }, + { + "epoch": 0.12924574209245743, + "grad_norm": 1.3260493495785517, + "learning_rate": 9.744254006749235e-06, + "loss": 0.4762, + "step": 1328 + }, + { + "epoch": 0.12934306569343065, + "grad_norm": 1.705738477220435, + "learning_rate": 9.743756139716634e-06, + "loss": 0.5861, + "step": 1329 + }, + { + "epoch": 0.12944038929440388, + "grad_norm": 1.5829201544013396, + "learning_rate": 9.743257801294069e-06, + "loss": 0.469, + "step": 1330 + }, + { + "epoch": 0.12953771289537713, + "grad_norm": 1.1445128143179795, + "learning_rate": 9.74275899153106e-06, + "loss": 0.4018, + "step": 1331 + }, + { + "epoch": 0.12963503649635036, + "grad_norm": 1.2900129109113572, + "learning_rate": 9.742259710477178e-06, + "loss": 0.3802, + "step": 1332 + }, + { + "epoch": 0.1297323600973236, + "grad_norm": 1.3212461161488713, + "learning_rate": 9.74175995818203e-06, + "loss": 0.3725, + "step": 1333 + }, + { + "epoch": 0.12982968369829684, + "grad_norm": 1.3979706650986563, + "learning_rate": 9.741259734695283e-06, + "loss": 0.3961, + "step": 1334 + }, + { + "epoch": 0.12992700729927006, + "grad_norm": 1.2642819849441118, + "learning_rate": 9.740759040066642e-06, + "loss": 0.3528, + "step": 1335 + }, + { + "epoch": 0.13002433090024332, + "grad_norm": 1.7776493019463793, + "learning_rate": 9.74025787434586e-06, + "loss": 0.6424, + "step": 1336 + }, + { + "epoch": 0.13012165450121654, + "grad_norm": 1.1885806737857232, + "learning_rate": 9.73975623758274e-06, + "loss": 0.3163, + "step": 1337 + }, + { + "epoch": 0.13021897810218977, + "grad_norm": 1.7443954093720497, + "learning_rate": 9.739254129827131e-06, + "loss": 0.7263, + "step": 1338 + }, + { + "epoch": 0.13031630170316302, + "grad_norm": 1.7005058938305366, + "learning_rate": 9.738751551128924e-06, + "loss": 0.5204, + "step": 1339 + }, + { + "epoch": 0.13041362530413625, + "grad_norm": 6.598521165184121, + "learning_rate": 9.738248501538063e-06, + "loss": 0.5113, + "step": 1340 + }, + { + "epoch": 0.1305109489051095, + "grad_norm": 1.6203066466178853, + "learning_rate": 9.737744981104536e-06, + "loss": 0.625, + "step": 1341 + }, + { + "epoch": 0.13060827250608273, + "grad_norm": 1.548111392574701, + "learning_rate": 9.73724098987838e-06, + "loss": 0.3952, + "step": 1342 + }, + { + "epoch": 0.13070559610705595, + "grad_norm": 1.4871418112966692, + "learning_rate": 9.736736527909674e-06, + "loss": 0.5084, + "step": 1343 + }, + { + "epoch": 0.1308029197080292, + "grad_norm": 1.0723677900938815, + "learning_rate": 9.736231595248546e-06, + "loss": 0.255, + "step": 1344 + }, + { + "epoch": 0.13090024330900243, + "grad_norm": 1.5695490713137843, + "learning_rate": 9.735726191945176e-06, + "loss": 0.3438, + "step": 1345 + }, + { + "epoch": 0.13099756690997566, + "grad_norm": 2.059617079542521, + "learning_rate": 9.73522031804978e-06, + "loss": 0.5249, + "step": 1346 + }, + { + "epoch": 0.1310948905109489, + "grad_norm": 1.5301765260275246, + "learning_rate": 9.734713973612633e-06, + "loss": 0.3667, + "step": 1347 + }, + { + "epoch": 0.13119221411192214, + "grad_norm": 1.7431028553023509, + "learning_rate": 9.734207158684048e-06, + "loss": 0.5551, + "step": 1348 + }, + { + "epoch": 0.13128953771289537, + "grad_norm": 1.2916959738739295, + "learning_rate": 9.733699873314388e-06, + "loss": 0.278, + "step": 1349 + }, + { + "epoch": 0.13138686131386862, + "grad_norm": 1.5891072584842363, + "learning_rate": 9.733192117554062e-06, + "loss": 0.4139, + "step": 1350 + }, + { + "epoch": 0.13148418491484185, + "grad_norm": 1.6366778166029219, + "learning_rate": 9.732683891453528e-06, + "loss": 0.4888, + "step": 1351 + }, + { + "epoch": 0.1315815085158151, + "grad_norm": 1.6763551525158185, + "learning_rate": 9.732175195063283e-06, + "loss": 0.5432, + "step": 1352 + }, + { + "epoch": 0.13167883211678832, + "grad_norm": 1.551593865483807, + "learning_rate": 9.731666028433882e-06, + "loss": 0.5634, + "step": 1353 + }, + { + "epoch": 0.13177615571776155, + "grad_norm": 1.693219206573502, + "learning_rate": 9.731156391615919e-06, + "loss": 0.4554, + "step": 1354 + }, + { + "epoch": 0.1318734793187348, + "grad_norm": 1.4894832853139421, + "learning_rate": 9.730646284660037e-06, + "loss": 0.4286, + "step": 1355 + }, + { + "epoch": 0.13197080291970803, + "grad_norm": 1.20058966692396, + "learning_rate": 9.730135707616927e-06, + "loss": 0.2519, + "step": 1356 + }, + { + "epoch": 0.13206812652068126, + "grad_norm": 1.395115321325138, + "learning_rate": 9.729624660537324e-06, + "loss": 0.3718, + "step": 1357 + }, + { + "epoch": 0.1321654501216545, + "grad_norm": 1.3441869335850034, + "learning_rate": 9.729113143472011e-06, + "loss": 0.43, + "step": 1358 + }, + { + "epoch": 0.13226277372262774, + "grad_norm": 1.31865416445236, + "learning_rate": 9.72860115647182e-06, + "loss": 0.296, + "step": 1359 + }, + { + "epoch": 0.13236009732360096, + "grad_norm": 1.3998148863889133, + "learning_rate": 9.728088699587623e-06, + "loss": 0.2642, + "step": 1360 + }, + { + "epoch": 0.13245742092457422, + "grad_norm": 1.5917388343760925, + "learning_rate": 9.727575772870347e-06, + "loss": 0.5999, + "step": 1361 + }, + { + "epoch": 0.13255474452554744, + "grad_norm": 1.6062441992747731, + "learning_rate": 9.727062376370962e-06, + "loss": 0.6017, + "step": 1362 + }, + { + "epoch": 0.1326520681265207, + "grad_norm": 1.756325054261889, + "learning_rate": 9.72654851014048e-06, + "loss": 0.5855, + "step": 1363 + }, + { + "epoch": 0.13274939172749392, + "grad_norm": 1.5782112626775713, + "learning_rate": 9.72603417422997e-06, + "loss": 0.5643, + "step": 1364 + }, + { + "epoch": 0.13284671532846715, + "grad_norm": 1.6280008631148617, + "learning_rate": 9.725519368690539e-06, + "loss": 0.3918, + "step": 1365 + }, + { + "epoch": 0.1329440389294404, + "grad_norm": 1.731476294535625, + "learning_rate": 9.725004093573343e-06, + "loss": 0.6909, + "step": 1366 + }, + { + "epoch": 0.13304136253041363, + "grad_norm": 1.7012591859680217, + "learning_rate": 9.724488348929587e-06, + "loss": 0.3206, + "step": 1367 + }, + { + "epoch": 0.13313868613138685, + "grad_norm": 1.5539166250213363, + "learning_rate": 9.723972134810519e-06, + "loss": 0.3735, + "step": 1368 + }, + { + "epoch": 0.1332360097323601, + "grad_norm": 1.2431527472113675, + "learning_rate": 9.723455451267436e-06, + "loss": 0.4023, + "step": 1369 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 1.4147928785913308, + "learning_rate": 9.722938298351682e-06, + "loss": 0.4501, + "step": 1370 + }, + { + "epoch": 0.13343065693430656, + "grad_norm": 1.5209109752466956, + "learning_rate": 9.722420676114646e-06, + "loss": 0.4504, + "step": 1371 + }, + { + "epoch": 0.1335279805352798, + "grad_norm": 1.6031977794999224, + "learning_rate": 9.721902584607766e-06, + "loss": 0.4036, + "step": 1372 + }, + { + "epoch": 0.13362530413625304, + "grad_norm": 1.3752266957066934, + "learning_rate": 9.721384023882524e-06, + "loss": 0.4008, + "step": 1373 + }, + { + "epoch": 0.1337226277372263, + "grad_norm": 1.2289957585024915, + "learning_rate": 9.720864993990448e-06, + "loss": 0.3214, + "step": 1374 + }, + { + "epoch": 0.13381995133819952, + "grad_norm": 1.5334770200964671, + "learning_rate": 9.720345494983117e-06, + "loss": 0.4103, + "step": 1375 + }, + { + "epoch": 0.13391727493917274, + "grad_norm": 1.4428318489533865, + "learning_rate": 9.719825526912152e-06, + "loss": 0.4314, + "step": 1376 + }, + { + "epoch": 0.134014598540146, + "grad_norm": 1.6794168653476527, + "learning_rate": 9.719305089829224e-06, + "loss": 0.6027, + "step": 1377 + }, + { + "epoch": 0.13411192214111922, + "grad_norm": 1.4695816931820398, + "learning_rate": 9.718784183786048e-06, + "loss": 0.5337, + "step": 1378 + }, + { + "epoch": 0.13420924574209245, + "grad_norm": 1.428180254445363, + "learning_rate": 9.718262808834386e-06, + "loss": 0.3636, + "step": 1379 + }, + { + "epoch": 0.1343065693430657, + "grad_norm": 1.4446624118640763, + "learning_rate": 9.717740965026051e-06, + "loss": 0.4213, + "step": 1380 + }, + { + "epoch": 0.13440389294403893, + "grad_norm": 1.0145899854020284, + "learning_rate": 9.717218652412896e-06, + "loss": 0.292, + "step": 1381 + }, + { + "epoch": 0.13450121654501215, + "grad_norm": 1.4589445831305994, + "learning_rate": 9.716695871046824e-06, + "loss": 0.4787, + "step": 1382 + }, + { + "epoch": 0.1345985401459854, + "grad_norm": 1.5462880417778382, + "learning_rate": 9.716172620979783e-06, + "loss": 0.4716, + "step": 1383 + }, + { + "epoch": 0.13469586374695863, + "grad_norm": 1.5411526965931406, + "learning_rate": 9.71564890226377e-06, + "loss": 0.5311, + "step": 1384 + }, + { + "epoch": 0.1347931873479319, + "grad_norm": 1.1928924795974905, + "learning_rate": 9.71512471495083e-06, + "loss": 0.2724, + "step": 1385 + }, + { + "epoch": 0.1348905109489051, + "grad_norm": 1.3201585939530793, + "learning_rate": 9.714600059093045e-06, + "loss": 0.2987, + "step": 1386 + }, + { + "epoch": 0.13498783454987834, + "grad_norm": 1.5687746327202647, + "learning_rate": 9.714074934742556e-06, + "loss": 0.363, + "step": 1387 + }, + { + "epoch": 0.1350851581508516, + "grad_norm": 1.5338932500845779, + "learning_rate": 9.713549341951543e-06, + "loss": 0.5661, + "step": 1388 + }, + { + "epoch": 0.13518248175182482, + "grad_norm": 1.1601153536694444, + "learning_rate": 9.713023280772236e-06, + "loss": 0.3079, + "step": 1389 + }, + { + "epoch": 0.13527980535279804, + "grad_norm": 1.3983637614495477, + "learning_rate": 9.712496751256907e-06, + "loss": 0.4741, + "step": 1390 + }, + { + "epoch": 0.1353771289537713, + "grad_norm": 1.2378967843544995, + "learning_rate": 9.71196975345788e-06, + "loss": 0.3467, + "step": 1391 + }, + { + "epoch": 0.13547445255474452, + "grad_norm": 1.3128641622430697, + "learning_rate": 9.711442287427523e-06, + "loss": 0.413, + "step": 1392 + }, + { + "epoch": 0.13557177615571775, + "grad_norm": 1.6638151172989781, + "learning_rate": 9.71091435321825e-06, + "loss": 0.4844, + "step": 1393 + }, + { + "epoch": 0.135669099756691, + "grad_norm": 1.5023430961651105, + "learning_rate": 9.710385950882522e-06, + "loss": 0.3639, + "step": 1394 + }, + { + "epoch": 0.13576642335766423, + "grad_norm": 1.3286069107884302, + "learning_rate": 9.709857080472847e-06, + "loss": 0.4055, + "step": 1395 + }, + { + "epoch": 0.13586374695863748, + "grad_norm": 1.2934746343236392, + "learning_rate": 9.709327742041776e-06, + "loss": 0.2837, + "step": 1396 + }, + { + "epoch": 0.1359610705596107, + "grad_norm": 1.698077360010743, + "learning_rate": 9.708797935641915e-06, + "loss": 0.3687, + "step": 1397 + }, + { + "epoch": 0.13605839416058393, + "grad_norm": 1.412271661785088, + "learning_rate": 9.70826766132591e-06, + "loss": 0.3577, + "step": 1398 + }, + { + "epoch": 0.1361557177615572, + "grad_norm": 1.5421055950766074, + "learning_rate": 9.707736919146453e-06, + "loss": 0.5394, + "step": 1399 + }, + { + "epoch": 0.1362530413625304, + "grad_norm": 2.554386599490806, + "learning_rate": 9.707205709156285e-06, + "loss": 0.212, + "step": 1400 + }, + { + "epoch": 0.13635036496350364, + "grad_norm": 1.7436805109650844, + "learning_rate": 9.70667403140819e-06, + "loss": 0.6893, + "step": 1401 + }, + { + "epoch": 0.1364476885644769, + "grad_norm": 1.613527884115612, + "learning_rate": 9.706141885955006e-06, + "loss": 0.42, + "step": 1402 + }, + { + "epoch": 0.13654501216545012, + "grad_norm": 1.711619341430359, + "learning_rate": 9.70560927284961e-06, + "loss": 0.7025, + "step": 1403 + }, + { + "epoch": 0.13664233576642335, + "grad_norm": 1.5376532439489434, + "learning_rate": 9.705076192144927e-06, + "loss": 0.5201, + "step": 1404 + }, + { + "epoch": 0.1367396593673966, + "grad_norm": 1.492510855426001, + "learning_rate": 9.704542643893931e-06, + "loss": 0.4281, + "step": 1405 + }, + { + "epoch": 0.13683698296836982, + "grad_norm": 1.5678573317920237, + "learning_rate": 9.704008628149641e-06, + "loss": 0.506, + "step": 1406 + }, + { + "epoch": 0.13693430656934308, + "grad_norm": 1.3237691920747017, + "learning_rate": 9.703474144965123e-06, + "loss": 0.4114, + "step": 1407 + }, + { + "epoch": 0.1370316301703163, + "grad_norm": 1.4134135574988251, + "learning_rate": 9.702939194393489e-06, + "loss": 0.3806, + "step": 1408 + }, + { + "epoch": 0.13712895377128953, + "grad_norm": 1.5544258549266206, + "learning_rate": 9.702403776487895e-06, + "loss": 0.4863, + "step": 1409 + }, + { + "epoch": 0.13722627737226278, + "grad_norm": 1.3619063912879554, + "learning_rate": 9.701867891301548e-06, + "loss": 0.3692, + "step": 1410 + }, + { + "epoch": 0.137323600973236, + "grad_norm": 1.5146665393724075, + "learning_rate": 9.701331538887699e-06, + "loss": 0.3311, + "step": 1411 + }, + { + "epoch": 0.13742092457420924, + "grad_norm": 1.5674647990142176, + "learning_rate": 9.700794719299644e-06, + "loss": 0.5292, + "step": 1412 + }, + { + "epoch": 0.1375182481751825, + "grad_norm": 1.4711236643775818, + "learning_rate": 9.700257432590729e-06, + "loss": 0.466, + "step": 1413 + }, + { + "epoch": 0.13761557177615572, + "grad_norm": 1.4410106250389758, + "learning_rate": 9.699719678814345e-06, + "loss": 0.3276, + "step": 1414 + }, + { + "epoch": 0.13771289537712894, + "grad_norm": 1.652937978394441, + "learning_rate": 9.699181458023927e-06, + "loss": 0.5391, + "step": 1415 + }, + { + "epoch": 0.1378102189781022, + "grad_norm": 1.7285587973510355, + "learning_rate": 9.698642770272959e-06, + "loss": 0.5707, + "step": 1416 + }, + { + "epoch": 0.13790754257907542, + "grad_norm": 1.325058257423692, + "learning_rate": 9.698103615614972e-06, + "loss": 0.3429, + "step": 1417 + }, + { + "epoch": 0.13800486618004867, + "grad_norm": 1.5653351048996198, + "learning_rate": 9.69756399410354e-06, + "loss": 0.4132, + "step": 1418 + }, + { + "epoch": 0.1381021897810219, + "grad_norm": 1.603805088396393, + "learning_rate": 9.697023905792287e-06, + "loss": 0.4983, + "step": 1419 + }, + { + "epoch": 0.13819951338199513, + "grad_norm": 1.5052443063346659, + "learning_rate": 9.69648335073488e-06, + "loss": 0.2713, + "step": 1420 + }, + { + "epoch": 0.13829683698296838, + "grad_norm": 1.30196768692164, + "learning_rate": 9.695942328985037e-06, + "loss": 0.27, + "step": 1421 + }, + { + "epoch": 0.1383941605839416, + "grad_norm": 1.1542739478608208, + "learning_rate": 9.695400840596519e-06, + "loss": 0.3309, + "step": 1422 + }, + { + "epoch": 0.13849148418491483, + "grad_norm": 1.1029138054910885, + "learning_rate": 9.694858885623132e-06, + "loss": 0.3262, + "step": 1423 + }, + { + "epoch": 0.13858880778588809, + "grad_norm": 1.581389120261872, + "learning_rate": 9.694316464118732e-06, + "loss": 0.4663, + "step": 1424 + }, + { + "epoch": 0.1386861313868613, + "grad_norm": 1.2966198038166061, + "learning_rate": 9.69377357613722e-06, + "loss": 0.336, + "step": 1425 + }, + { + "epoch": 0.13878345498783454, + "grad_norm": 1.505634533514273, + "learning_rate": 9.693230221732544e-06, + "loss": 0.4269, + "step": 1426 + }, + { + "epoch": 0.1388807785888078, + "grad_norm": 1.274453115047599, + "learning_rate": 9.692686400958695e-06, + "loss": 0.3978, + "step": 1427 + }, + { + "epoch": 0.13897810218978102, + "grad_norm": 1.2126154933077449, + "learning_rate": 9.692142113869714e-06, + "loss": 0.2754, + "step": 1428 + }, + { + "epoch": 0.13907542579075427, + "grad_norm": 1.4884313472642259, + "learning_rate": 9.691597360519686e-06, + "loss": 0.4661, + "step": 1429 + }, + { + "epoch": 0.1391727493917275, + "grad_norm": 1.5680101511782372, + "learning_rate": 9.691052140962747e-06, + "loss": 0.4237, + "step": 1430 + }, + { + "epoch": 0.13927007299270072, + "grad_norm": 1.325640699841282, + "learning_rate": 9.690506455253073e-06, + "loss": 0.3988, + "step": 1431 + }, + { + "epoch": 0.13936739659367398, + "grad_norm": 1.3107002270910884, + "learning_rate": 9.689960303444887e-06, + "loss": 0.4268, + "step": 1432 + }, + { + "epoch": 0.1394647201946472, + "grad_norm": 1.9246823036308274, + "learning_rate": 9.689413685592465e-06, + "loss": 0.3733, + "step": 1433 + }, + { + "epoch": 0.13956204379562043, + "grad_norm": 1.3731854343094059, + "learning_rate": 9.688866601750122e-06, + "loss": 0.4215, + "step": 1434 + }, + { + "epoch": 0.13965936739659368, + "grad_norm": 1.368964734934982, + "learning_rate": 9.688319051972224e-06, + "loss": 0.4697, + "step": 1435 + }, + { + "epoch": 0.1397566909975669, + "grad_norm": 1.3451140821212522, + "learning_rate": 9.687771036313178e-06, + "loss": 0.3741, + "step": 1436 + }, + { + "epoch": 0.13985401459854013, + "grad_norm": 1.5372748667563303, + "learning_rate": 9.687222554827444e-06, + "loss": 0.4199, + "step": 1437 + }, + { + "epoch": 0.1399513381995134, + "grad_norm": 1.1780522614950486, + "learning_rate": 9.686673607569526e-06, + "loss": 0.3602, + "step": 1438 + }, + { + "epoch": 0.1400486618004866, + "grad_norm": 1.20778383169165, + "learning_rate": 9.686124194593967e-06, + "loss": 0.23, + "step": 1439 + }, + { + "epoch": 0.14014598540145987, + "grad_norm": 1.6760972087501165, + "learning_rate": 9.685574315955368e-06, + "loss": 0.5089, + "step": 1440 + }, + { + "epoch": 0.1402433090024331, + "grad_norm": 1.7963497555189056, + "learning_rate": 9.68502397170837e-06, + "loss": 0.3932, + "step": 1441 + }, + { + "epoch": 0.14034063260340632, + "grad_norm": 1.401968265514402, + "learning_rate": 9.68447316190766e-06, + "loss": 0.4272, + "step": 1442 + }, + { + "epoch": 0.14043795620437957, + "grad_norm": 1.1461895591250986, + "learning_rate": 9.683921886607973e-06, + "loss": 0.3003, + "step": 1443 + }, + { + "epoch": 0.1405352798053528, + "grad_norm": 1.8257595963636586, + "learning_rate": 9.683370145864089e-06, + "loss": 0.4454, + "step": 1444 + }, + { + "epoch": 0.14063260340632602, + "grad_norm": 1.3483599166387192, + "learning_rate": 9.682817939730833e-06, + "loss": 0.3708, + "step": 1445 + }, + { + "epoch": 0.14072992700729928, + "grad_norm": 1.4560700792487955, + "learning_rate": 9.682265268263083e-06, + "loss": 0.4321, + "step": 1446 + }, + { + "epoch": 0.1408272506082725, + "grad_norm": 1.4364952224667933, + "learning_rate": 9.681712131515753e-06, + "loss": 0.3812, + "step": 1447 + }, + { + "epoch": 0.14092457420924573, + "grad_norm": 1.6808986821455574, + "learning_rate": 9.681158529543812e-06, + "loss": 0.3939, + "step": 1448 + }, + { + "epoch": 0.14102189781021898, + "grad_norm": 1.5327313322922438, + "learning_rate": 9.68060446240227e-06, + "loss": 0.3617, + "step": 1449 + }, + { + "epoch": 0.1411192214111922, + "grad_norm": 1.9055650449775412, + "learning_rate": 9.680049930146186e-06, + "loss": 0.4984, + "step": 1450 + }, + { + "epoch": 0.14121654501216546, + "grad_norm": 1.8971706606162055, + "learning_rate": 9.679494932830664e-06, + "loss": 0.4196, + "step": 1451 + }, + { + "epoch": 0.1413138686131387, + "grad_norm": 1.7337796675846617, + "learning_rate": 9.678939470510856e-06, + "loss": 0.4282, + "step": 1452 + }, + { + "epoch": 0.1414111922141119, + "grad_norm": 1.6436762455975924, + "learning_rate": 9.678383543241954e-06, + "loss": 0.425, + "step": 1453 + }, + { + "epoch": 0.14150851581508517, + "grad_norm": 1.3304471527694197, + "learning_rate": 9.677827151079205e-06, + "loss": 0.346, + "step": 1454 + }, + { + "epoch": 0.1416058394160584, + "grad_norm": 1.3162532004293022, + "learning_rate": 9.677270294077895e-06, + "loss": 0.4492, + "step": 1455 + }, + { + "epoch": 0.14170316301703162, + "grad_norm": 1.2299075830057253, + "learning_rate": 9.676712972293363e-06, + "loss": 0.3525, + "step": 1456 + }, + { + "epoch": 0.14180048661800487, + "grad_norm": 1.7174455721253266, + "learning_rate": 9.676155185780989e-06, + "loss": 0.763, + "step": 1457 + }, + { + "epoch": 0.1418978102189781, + "grad_norm": 0.9624475539149472, + "learning_rate": 9.675596934596198e-06, + "loss": 0.2234, + "step": 1458 + }, + { + "epoch": 0.14199513381995132, + "grad_norm": 1.380722751360302, + "learning_rate": 9.675038218794469e-06, + "loss": 0.3539, + "step": 1459 + }, + { + "epoch": 0.14209245742092458, + "grad_norm": 1.3595616004290971, + "learning_rate": 9.674479038431314e-06, + "loss": 0.4356, + "step": 1460 + }, + { + "epoch": 0.1421897810218978, + "grad_norm": 1.2777542247997187, + "learning_rate": 9.673919393562308e-06, + "loss": 0.3233, + "step": 1461 + }, + { + "epoch": 0.14228710462287106, + "grad_norm": 1.23752096524445, + "learning_rate": 9.673359284243055e-06, + "loss": 0.405, + "step": 1462 + }, + { + "epoch": 0.14238442822384428, + "grad_norm": 1.4547729172095425, + "learning_rate": 9.672798710529222e-06, + "loss": 0.5356, + "step": 1463 + }, + { + "epoch": 0.1424817518248175, + "grad_norm": 1.5976011084855026, + "learning_rate": 9.672237672476506e-06, + "loss": 0.571, + "step": 1464 + }, + { + "epoch": 0.14257907542579076, + "grad_norm": 1.4454139467669962, + "learning_rate": 9.67167617014066e-06, + "loss": 0.4556, + "step": 1465 + }, + { + "epoch": 0.142676399026764, + "grad_norm": 1.5296734849172828, + "learning_rate": 9.671114203577485e-06, + "loss": 0.5791, + "step": 1466 + }, + { + "epoch": 0.14277372262773722, + "grad_norm": 1.0140913901893902, + "learning_rate": 9.670551772842818e-06, + "loss": 0.2732, + "step": 1467 + }, + { + "epoch": 0.14287104622871047, + "grad_norm": 1.5600773149062541, + "learning_rate": 9.669988877992551e-06, + "loss": 0.3902, + "step": 1468 + }, + { + "epoch": 0.1429683698296837, + "grad_norm": 1.4767158872669255, + "learning_rate": 9.66942551908262e-06, + "loss": 0.5531, + "step": 1469 + }, + { + "epoch": 0.14306569343065692, + "grad_norm": 1.1134570066684917, + "learning_rate": 9.668861696169003e-06, + "loss": 0.278, + "step": 1470 + }, + { + "epoch": 0.14316301703163017, + "grad_norm": 0.9776488708344422, + "learning_rate": 9.66829740930773e-06, + "loss": 0.231, + "step": 1471 + }, + { + "epoch": 0.1432603406326034, + "grad_norm": 1.4647496714581032, + "learning_rate": 9.667732658554875e-06, + "loss": 0.485, + "step": 1472 + }, + { + "epoch": 0.14335766423357665, + "grad_norm": 1.2234301570511203, + "learning_rate": 9.667167443966557e-06, + "loss": 0.3944, + "step": 1473 + }, + { + "epoch": 0.14345498783454988, + "grad_norm": 1.3655487702696618, + "learning_rate": 9.66660176559894e-06, + "loss": 0.3989, + "step": 1474 + }, + { + "epoch": 0.1435523114355231, + "grad_norm": 1.4690108372007447, + "learning_rate": 9.666035623508238e-06, + "loss": 0.4311, + "step": 1475 + }, + { + "epoch": 0.14364963503649636, + "grad_norm": 1.1910374305057687, + "learning_rate": 9.665469017750708e-06, + "loss": 0.3002, + "step": 1476 + }, + { + "epoch": 0.14374695863746959, + "grad_norm": 1.678176413091249, + "learning_rate": 9.664901948382654e-06, + "loss": 0.6143, + "step": 1477 + }, + { + "epoch": 0.1438442822384428, + "grad_norm": 1.817046546881487, + "learning_rate": 9.664334415460426e-06, + "loss": 0.7811, + "step": 1478 + }, + { + "epoch": 0.14394160583941606, + "grad_norm": 1.4955026439922687, + "learning_rate": 9.663766419040422e-06, + "loss": 0.411, + "step": 1479 + }, + { + "epoch": 0.1440389294403893, + "grad_norm": 1.4198677231066263, + "learning_rate": 9.66319795917908e-06, + "loss": 0.4245, + "step": 1480 + }, + { + "epoch": 0.14413625304136254, + "grad_norm": 1.5199876898969789, + "learning_rate": 9.662629035932892e-06, + "loss": 0.438, + "step": 1481 + }, + { + "epoch": 0.14423357664233577, + "grad_norm": 1.4859771113526168, + "learning_rate": 9.662059649358388e-06, + "loss": 0.3949, + "step": 1482 + }, + { + "epoch": 0.144330900243309, + "grad_norm": 1.5386966328542977, + "learning_rate": 9.661489799512155e-06, + "loss": 0.4679, + "step": 1483 + }, + { + "epoch": 0.14442822384428225, + "grad_norm": 1.2872766782537612, + "learning_rate": 9.660919486450813e-06, + "loss": 0.2624, + "step": 1484 + }, + { + "epoch": 0.14452554744525548, + "grad_norm": 1.3276179523832277, + "learning_rate": 9.660348710231037e-06, + "loss": 0.5476, + "step": 1485 + }, + { + "epoch": 0.1446228710462287, + "grad_norm": 0.9490583621811937, + "learning_rate": 9.659777470909547e-06, + "loss": 0.2354, + "step": 1486 + }, + { + "epoch": 0.14472019464720196, + "grad_norm": 1.3763558898436123, + "learning_rate": 9.659205768543104e-06, + "loss": 0.4327, + "step": 1487 + }, + { + "epoch": 0.14481751824817518, + "grad_norm": 1.178366926128956, + "learning_rate": 9.658633603188521e-06, + "loss": 0.3839, + "step": 1488 + }, + { + "epoch": 0.1449148418491484, + "grad_norm": 1.3255542333725456, + "learning_rate": 9.658060974902653e-06, + "loss": 0.3068, + "step": 1489 + }, + { + "epoch": 0.14501216545012166, + "grad_norm": 1.5998345706772108, + "learning_rate": 9.657487883742403e-06, + "loss": 0.5432, + "step": 1490 + }, + { + "epoch": 0.1451094890510949, + "grad_norm": 1.8804975658787435, + "learning_rate": 9.656914329764718e-06, + "loss": 0.5268, + "step": 1491 + }, + { + "epoch": 0.14520681265206814, + "grad_norm": 1.5841269093835124, + "learning_rate": 9.656340313026595e-06, + "loss": 0.6304, + "step": 1492 + }, + { + "epoch": 0.14530413625304137, + "grad_norm": 1.5832299483159056, + "learning_rate": 9.655765833585072e-06, + "loss": 0.4417, + "step": 1493 + }, + { + "epoch": 0.1454014598540146, + "grad_norm": 1.2541361090062475, + "learning_rate": 9.655190891497237e-06, + "loss": 0.2956, + "step": 1494 + }, + { + "epoch": 0.14549878345498785, + "grad_norm": 1.4549578520972333, + "learning_rate": 9.654615486820223e-06, + "loss": 0.5352, + "step": 1495 + }, + { + "epoch": 0.14559610705596107, + "grad_norm": 1.4797996277102474, + "learning_rate": 9.654039619611205e-06, + "loss": 0.4915, + "step": 1496 + }, + { + "epoch": 0.1456934306569343, + "grad_norm": 1.2281886698207842, + "learning_rate": 9.65346328992741e-06, + "loss": 0.1901, + "step": 1497 + }, + { + "epoch": 0.14579075425790755, + "grad_norm": 1.4478972545758728, + "learning_rate": 9.652886497826109e-06, + "loss": 0.4142, + "step": 1498 + }, + { + "epoch": 0.14588807785888078, + "grad_norm": 1.5883286963945868, + "learning_rate": 9.652309243364614e-06, + "loss": 0.3576, + "step": 1499 + }, + { + "epoch": 0.145985401459854, + "grad_norm": 1.5369489845441549, + "learning_rate": 9.651731526600293e-06, + "loss": 0.5479, + "step": 1500 + }, + { + "epoch": 0.14608272506082726, + "grad_norm": 1.5655077404950533, + "learning_rate": 9.651153347590549e-06, + "loss": 0.3464, + "step": 1501 + }, + { + "epoch": 0.14618004866180048, + "grad_norm": 1.6426065013852054, + "learning_rate": 9.65057470639284e-06, + "loss": 0.5038, + "step": 1502 + }, + { + "epoch": 0.14627737226277374, + "grad_norm": 1.8088684532537898, + "learning_rate": 9.649995603064664e-06, + "loss": 0.5731, + "step": 1503 + }, + { + "epoch": 0.14637469586374696, + "grad_norm": 1.2493389766016731, + "learning_rate": 9.649416037663564e-06, + "loss": 0.3306, + "step": 1504 + }, + { + "epoch": 0.1464720194647202, + "grad_norm": 1.5964615139072293, + "learning_rate": 9.648836010247137e-06, + "loss": 0.4169, + "step": 1505 + }, + { + "epoch": 0.14656934306569344, + "grad_norm": 1.3925830899828215, + "learning_rate": 9.648255520873018e-06, + "loss": 0.3092, + "step": 1506 + }, + { + "epoch": 0.14666666666666667, + "grad_norm": 1.4256860988690832, + "learning_rate": 9.647674569598889e-06, + "loss": 0.3201, + "step": 1507 + }, + { + "epoch": 0.1467639902676399, + "grad_norm": 1.6614703553660697, + "learning_rate": 9.647093156482483e-06, + "loss": 0.6078, + "step": 1508 + }, + { + "epoch": 0.14686131386861315, + "grad_norm": 1.4357097092225446, + "learning_rate": 9.646511281581575e-06, + "loss": 0.4004, + "step": 1509 + }, + { + "epoch": 0.14695863746958637, + "grad_norm": 1.4562846462074024, + "learning_rate": 9.645928944953981e-06, + "loss": 0.4601, + "step": 1510 + }, + { + "epoch": 0.1470559610705596, + "grad_norm": 1.3277232740610976, + "learning_rate": 9.645346146657575e-06, + "loss": 0.4015, + "step": 1511 + }, + { + "epoch": 0.14715328467153285, + "grad_norm": 1.5964514332978577, + "learning_rate": 9.644762886750267e-06, + "loss": 0.4556, + "step": 1512 + }, + { + "epoch": 0.14725060827250608, + "grad_norm": 1.4663379423625913, + "learning_rate": 9.644179165290015e-06, + "loss": 0.4353, + "step": 1513 + }, + { + "epoch": 0.14734793187347933, + "grad_norm": 1.0949765548634744, + "learning_rate": 9.643594982334826e-06, + "loss": 0.2276, + "step": 1514 + }, + { + "epoch": 0.14744525547445256, + "grad_norm": 1.563845779693575, + "learning_rate": 9.643010337942749e-06, + "loss": 0.6694, + "step": 1515 + }, + { + "epoch": 0.14754257907542578, + "grad_norm": 1.024413538842663, + "learning_rate": 9.642425232171881e-06, + "loss": 0.3047, + "step": 1516 + }, + { + "epoch": 0.14763990267639904, + "grad_norm": 1.712866405633365, + "learning_rate": 9.641839665080363e-06, + "loss": 0.6729, + "step": 1517 + }, + { + "epoch": 0.14773722627737226, + "grad_norm": 1.4526258041869373, + "learning_rate": 9.641253636726386e-06, + "loss": 0.5037, + "step": 1518 + }, + { + "epoch": 0.1478345498783455, + "grad_norm": 1.7375410582816389, + "learning_rate": 9.640667147168182e-06, + "loss": 0.6717, + "step": 1519 + }, + { + "epoch": 0.14793187347931874, + "grad_norm": 1.736227335112512, + "learning_rate": 9.640080196464032e-06, + "loss": 0.6677, + "step": 1520 + }, + { + "epoch": 0.14802919708029197, + "grad_norm": 1.5194007013329096, + "learning_rate": 9.63949278467226e-06, + "loss": 0.4288, + "step": 1521 + }, + { + "epoch": 0.1481265206812652, + "grad_norm": 1.5063763039212688, + "learning_rate": 9.638904911851237e-06, + "loss": 0.4529, + "step": 1522 + }, + { + "epoch": 0.14822384428223845, + "grad_norm": 1.6414876214129155, + "learning_rate": 9.638316578059384e-06, + "loss": 0.5482, + "step": 1523 + }, + { + "epoch": 0.14832116788321167, + "grad_norm": 1.3122113228270877, + "learning_rate": 9.63772778335516e-06, + "loss": 0.3903, + "step": 1524 + }, + { + "epoch": 0.14841849148418493, + "grad_norm": 1.6417051120393822, + "learning_rate": 9.637138527797075e-06, + "loss": 0.654, + "step": 1525 + }, + { + "epoch": 0.14851581508515815, + "grad_norm": 1.2700043251684836, + "learning_rate": 9.636548811443685e-06, + "loss": 0.3338, + "step": 1526 + }, + { + "epoch": 0.14861313868613138, + "grad_norm": 1.4124836827913858, + "learning_rate": 9.63595863435359e-06, + "loss": 0.3551, + "step": 1527 + }, + { + "epoch": 0.14871046228710463, + "grad_norm": 1.3732601776051463, + "learning_rate": 9.635367996585436e-06, + "loss": 0.4212, + "step": 1528 + }, + { + "epoch": 0.14880778588807786, + "grad_norm": 1.4785898006079692, + "learning_rate": 9.634776898197916e-06, + "loss": 0.416, + "step": 1529 + }, + { + "epoch": 0.14890510948905109, + "grad_norm": 1.5889313350171215, + "learning_rate": 9.634185339249766e-06, + "loss": 0.5277, + "step": 1530 + }, + { + "epoch": 0.14900243309002434, + "grad_norm": 1.7475817866143981, + "learning_rate": 9.63359331979977e-06, + "loss": 0.5202, + "step": 1531 + }, + { + "epoch": 0.14909975669099756, + "grad_norm": 1.5329427899001755, + "learning_rate": 9.633000839906758e-06, + "loss": 0.4283, + "step": 1532 + }, + { + "epoch": 0.1491970802919708, + "grad_norm": 1.3789605408265815, + "learning_rate": 9.632407899629606e-06, + "loss": 0.41, + "step": 1533 + }, + { + "epoch": 0.14929440389294404, + "grad_norm": 1.725959361785896, + "learning_rate": 9.631814499027233e-06, + "loss": 0.6289, + "step": 1534 + }, + { + "epoch": 0.14939172749391727, + "grad_norm": 1.5432692609357797, + "learning_rate": 9.631220638158605e-06, + "loss": 0.5, + "step": 1535 + }, + { + "epoch": 0.14948905109489052, + "grad_norm": 1.6556108789068573, + "learning_rate": 9.630626317082737e-06, + "loss": 0.3819, + "step": 1536 + }, + { + "epoch": 0.14958637469586375, + "grad_norm": 1.4498977098887442, + "learning_rate": 9.630031535858686e-06, + "loss": 0.4317, + "step": 1537 + }, + { + "epoch": 0.14968369829683698, + "grad_norm": 1.1232180788321369, + "learning_rate": 9.629436294545555e-06, + "loss": 0.4004, + "step": 1538 + }, + { + "epoch": 0.14978102189781023, + "grad_norm": 0.9949950497949807, + "learning_rate": 9.628840593202494e-06, + "loss": 0.2008, + "step": 1539 + }, + { + "epoch": 0.14987834549878346, + "grad_norm": 1.431426278132333, + "learning_rate": 9.628244431888699e-06, + "loss": 0.3689, + "step": 1540 + }, + { + "epoch": 0.14997566909975668, + "grad_norm": 1.575987397356523, + "learning_rate": 9.627647810663407e-06, + "loss": 0.5513, + "step": 1541 + }, + { + "epoch": 0.15007299270072993, + "grad_norm": 1.5419042077794642, + "learning_rate": 9.627050729585911e-06, + "loss": 0.4614, + "step": 1542 + }, + { + "epoch": 0.15017031630170316, + "grad_norm": 1.6695059275012083, + "learning_rate": 9.626453188715539e-06, + "loss": 0.5111, + "step": 1543 + }, + { + "epoch": 0.1502676399026764, + "grad_norm": 1.5402255238707527, + "learning_rate": 9.625855188111668e-06, + "loss": 0.4209, + "step": 1544 + }, + { + "epoch": 0.15036496350364964, + "grad_norm": 1.4645797288107798, + "learning_rate": 9.625256727833726e-06, + "loss": 0.4852, + "step": 1545 + }, + { + "epoch": 0.15046228710462287, + "grad_norm": 2.0138530187225845, + "learning_rate": 9.62465780794118e-06, + "loss": 0.4272, + "step": 1546 + }, + { + "epoch": 0.15055961070559612, + "grad_norm": 1.7939871096323345, + "learning_rate": 9.624058428493543e-06, + "loss": 0.3864, + "step": 1547 + }, + { + "epoch": 0.15065693430656935, + "grad_norm": 1.5936734798237622, + "learning_rate": 9.62345858955038e-06, + "loss": 0.5951, + "step": 1548 + }, + { + "epoch": 0.15075425790754257, + "grad_norm": 1.381736638575513, + "learning_rate": 9.622858291171295e-06, + "loss": 0.5078, + "step": 1549 + }, + { + "epoch": 0.15085158150851583, + "grad_norm": 1.2680468052820635, + "learning_rate": 9.622257533415939e-06, + "loss": 0.3314, + "step": 1550 + }, + { + "epoch": 0.15094890510948905, + "grad_norm": 1.5886359348363517, + "learning_rate": 9.621656316344011e-06, + "loss": 0.5985, + "step": 1551 + }, + { + "epoch": 0.15104622871046228, + "grad_norm": 1.631001321245941, + "learning_rate": 9.621054640015255e-06, + "loss": 0.6297, + "step": 1552 + }, + { + "epoch": 0.15114355231143553, + "grad_norm": 1.7004985330783402, + "learning_rate": 9.62045250448946e-06, + "loss": 0.5153, + "step": 1553 + }, + { + "epoch": 0.15124087591240876, + "grad_norm": 1.414050528965644, + "learning_rate": 9.619849909826457e-06, + "loss": 0.2651, + "step": 1554 + }, + { + "epoch": 0.15133819951338198, + "grad_norm": 1.3361950007111751, + "learning_rate": 9.61924685608613e-06, + "loss": 0.4179, + "step": 1555 + }, + { + "epoch": 0.15143552311435524, + "grad_norm": 1.2305020766816175, + "learning_rate": 9.618643343328404e-06, + "loss": 0.3342, + "step": 1556 + }, + { + "epoch": 0.15153284671532846, + "grad_norm": 1.3364057110807985, + "learning_rate": 9.618039371613251e-06, + "loss": 0.357, + "step": 1557 + }, + { + "epoch": 0.15163017031630172, + "grad_norm": 0.9846564904659728, + "learning_rate": 9.617434941000685e-06, + "loss": 0.2278, + "step": 1558 + }, + { + "epoch": 0.15172749391727494, + "grad_norm": 1.4874184978820846, + "learning_rate": 9.616830051550772e-06, + "loss": 0.4467, + "step": 1559 + }, + { + "epoch": 0.15182481751824817, + "grad_norm": 1.79907754997464, + "learning_rate": 9.61622470332362e-06, + "loss": 0.5501, + "step": 1560 + }, + { + "epoch": 0.15192214111922142, + "grad_norm": 1.2290536645357835, + "learning_rate": 9.61561889637938e-06, + "loss": 0.3149, + "step": 1561 + }, + { + "epoch": 0.15201946472019465, + "grad_norm": 1.5048179340178087, + "learning_rate": 9.615012630778254e-06, + "loss": 0.5367, + "step": 1562 + }, + { + "epoch": 0.15211678832116787, + "grad_norm": 1.387431259858161, + "learning_rate": 9.614405906580486e-06, + "loss": 0.4953, + "step": 1563 + }, + { + "epoch": 0.15221411192214113, + "grad_norm": 1.4159610711473967, + "learning_rate": 9.613798723846368e-06, + "loss": 0.454, + "step": 1564 + }, + { + "epoch": 0.15231143552311435, + "grad_norm": 1.2005509919566202, + "learning_rate": 9.613191082636235e-06, + "loss": 0.3945, + "step": 1565 + }, + { + "epoch": 0.15240875912408758, + "grad_norm": 1.518451218591156, + "learning_rate": 9.612582983010468e-06, + "loss": 0.42, + "step": 1566 + }, + { + "epoch": 0.15250608272506083, + "grad_norm": 1.2817177267697137, + "learning_rate": 9.611974425029494e-06, + "loss": 0.4119, + "step": 1567 + }, + { + "epoch": 0.15260340632603406, + "grad_norm": 1.3182769071429, + "learning_rate": 9.611365408753787e-06, + "loss": 0.4301, + "step": 1568 + }, + { + "epoch": 0.1527007299270073, + "grad_norm": 1.2668371165350867, + "learning_rate": 9.610755934243864e-06, + "loss": 0.3415, + "step": 1569 + }, + { + "epoch": 0.15279805352798054, + "grad_norm": 1.334265705787435, + "learning_rate": 9.610146001560293e-06, + "loss": 0.325, + "step": 1570 + }, + { + "epoch": 0.15289537712895376, + "grad_norm": 1.405628575667756, + "learning_rate": 9.609535610763678e-06, + "loss": 0.4, + "step": 1571 + }, + { + "epoch": 0.15299270072992702, + "grad_norm": 1.5931859233666277, + "learning_rate": 9.608924761914677e-06, + "loss": 0.643, + "step": 1572 + }, + { + "epoch": 0.15309002433090024, + "grad_norm": 1.323715339329346, + "learning_rate": 9.608313455073989e-06, + "loss": 0.4832, + "step": 1573 + }, + { + "epoch": 0.15318734793187347, + "grad_norm": 1.1603088792271297, + "learning_rate": 9.60770169030236e-06, + "loss": 0.2684, + "step": 1574 + }, + { + "epoch": 0.15328467153284672, + "grad_norm": 1.4578030666688024, + "learning_rate": 9.607089467660581e-06, + "loss": 0.4418, + "step": 1575 + }, + { + "epoch": 0.15338199513381995, + "grad_norm": 1.2739086566679132, + "learning_rate": 9.606476787209493e-06, + "loss": 0.3847, + "step": 1576 + }, + { + "epoch": 0.15347931873479317, + "grad_norm": 1.4031538044918876, + "learning_rate": 9.605863649009973e-06, + "loss": 0.3672, + "step": 1577 + }, + { + "epoch": 0.15357664233576643, + "grad_norm": 1.473592849907526, + "learning_rate": 9.605250053122951e-06, + "loss": 0.3955, + "step": 1578 + }, + { + "epoch": 0.15367396593673965, + "grad_norm": 1.6950520258208177, + "learning_rate": 9.604635999609402e-06, + "loss": 0.6923, + "step": 1579 + }, + { + "epoch": 0.1537712895377129, + "grad_norm": 1.6074239515288835, + "learning_rate": 9.604021488530342e-06, + "loss": 0.4771, + "step": 1580 + }, + { + "epoch": 0.15386861313868613, + "grad_norm": 1.5289432511411145, + "learning_rate": 9.603406519946838e-06, + "loss": 0.5881, + "step": 1581 + }, + { + "epoch": 0.15396593673965936, + "grad_norm": 1.3225323677068181, + "learning_rate": 9.602791093919998e-06, + "loss": 0.3128, + "step": 1582 + }, + { + "epoch": 0.1540632603406326, + "grad_norm": 1.467417498061456, + "learning_rate": 9.60217521051098e-06, + "loss": 0.545, + "step": 1583 + }, + { + "epoch": 0.15416058394160584, + "grad_norm": 1.7568491012309082, + "learning_rate": 9.60155886978098e-06, + "loss": 0.7054, + "step": 1584 + }, + { + "epoch": 0.15425790754257906, + "grad_norm": 1.5606257069028109, + "learning_rate": 9.600942071791248e-06, + "loss": 0.4329, + "step": 1585 + }, + { + "epoch": 0.15435523114355232, + "grad_norm": 1.5727160833264413, + "learning_rate": 9.600324816603074e-06, + "loss": 0.6128, + "step": 1586 + }, + { + "epoch": 0.15445255474452554, + "grad_norm": 1.3864503412663605, + "learning_rate": 9.599707104277796e-06, + "loss": 0.573, + "step": 1587 + }, + { + "epoch": 0.15454987834549877, + "grad_norm": 1.4232761061254342, + "learning_rate": 9.599088934876794e-06, + "loss": 0.4136, + "step": 1588 + }, + { + "epoch": 0.15464720194647202, + "grad_norm": 1.3399427727677786, + "learning_rate": 9.598470308461499e-06, + "loss": 0.257, + "step": 1589 + }, + { + "epoch": 0.15474452554744525, + "grad_norm": 1.61635763649276, + "learning_rate": 9.597851225093382e-06, + "loss": 0.566, + "step": 1590 + }, + { + "epoch": 0.1548418491484185, + "grad_norm": 1.6304164262097627, + "learning_rate": 9.597231684833964e-06, + "loss": 0.3673, + "step": 1591 + }, + { + "epoch": 0.15493917274939173, + "grad_norm": 1.4592987498064005, + "learning_rate": 9.596611687744807e-06, + "loss": 0.5193, + "step": 1592 + }, + { + "epoch": 0.15503649635036496, + "grad_norm": 1.4397292060019447, + "learning_rate": 9.595991233887523e-06, + "loss": 0.3236, + "step": 1593 + }, + { + "epoch": 0.1551338199513382, + "grad_norm": 1.2246835494507005, + "learning_rate": 9.595370323323763e-06, + "loss": 0.2397, + "step": 1594 + }, + { + "epoch": 0.15523114355231143, + "grad_norm": 1.530797619071646, + "learning_rate": 9.59474895611523e-06, + "loss": 0.4537, + "step": 1595 + }, + { + "epoch": 0.15532846715328466, + "grad_norm": 0.9400393110536889, + "learning_rate": 9.594127132323669e-06, + "loss": 0.1899, + "step": 1596 + }, + { + "epoch": 0.15542579075425791, + "grad_norm": 1.167634539806263, + "learning_rate": 9.593504852010872e-06, + "loss": 0.353, + "step": 1597 + }, + { + "epoch": 0.15552311435523114, + "grad_norm": 1.6772160290018319, + "learning_rate": 9.592882115238675e-06, + "loss": 0.4194, + "step": 1598 + }, + { + "epoch": 0.15562043795620437, + "grad_norm": 1.4391641520861267, + "learning_rate": 9.592258922068958e-06, + "loss": 0.4767, + "step": 1599 + }, + { + "epoch": 0.15571776155717762, + "grad_norm": 1.544673007447179, + "learning_rate": 9.591635272563648e-06, + "loss": 0.3175, + "step": 1600 + }, + { + "epoch": 0.15581508515815085, + "grad_norm": 1.4189512773822923, + "learning_rate": 9.591011166784721e-06, + "loss": 0.4834, + "step": 1601 + }, + { + "epoch": 0.1559124087591241, + "grad_norm": 1.2414753149853184, + "learning_rate": 9.590386604794191e-06, + "loss": 0.3657, + "step": 1602 + }, + { + "epoch": 0.15600973236009733, + "grad_norm": 1.0236785255419305, + "learning_rate": 9.589761586654122e-06, + "loss": 0.2011, + "step": 1603 + }, + { + "epoch": 0.15610705596107055, + "grad_norm": 2.3461369884265357, + "learning_rate": 9.589136112426625e-06, + "loss": 0.4024, + "step": 1604 + }, + { + "epoch": 0.1562043795620438, + "grad_norm": 1.2849479900774115, + "learning_rate": 9.588510182173851e-06, + "loss": 0.3527, + "step": 1605 + }, + { + "epoch": 0.15630170316301703, + "grad_norm": 1.4153286655317308, + "learning_rate": 9.587883795958001e-06, + "loss": 0.4149, + "step": 1606 + }, + { + "epoch": 0.15639902676399026, + "grad_norm": 1.6599870662874754, + "learning_rate": 9.587256953841317e-06, + "loss": 0.6479, + "step": 1607 + }, + { + "epoch": 0.1564963503649635, + "grad_norm": 1.6670860080877101, + "learning_rate": 9.58662965588609e-06, + "loss": 0.5825, + "step": 1608 + }, + { + "epoch": 0.15659367396593674, + "grad_norm": 1.7776280437765584, + "learning_rate": 9.586001902154655e-06, + "loss": 0.5798, + "step": 1609 + }, + { + "epoch": 0.15669099756690996, + "grad_norm": 1.5456297515043347, + "learning_rate": 9.585373692709391e-06, + "loss": 0.4583, + "step": 1610 + }, + { + "epoch": 0.15678832116788322, + "grad_norm": 1.8806083091738082, + "learning_rate": 9.584745027612728e-06, + "loss": 0.4736, + "step": 1611 + }, + { + "epoch": 0.15688564476885644, + "grad_norm": 1.4790926453601037, + "learning_rate": 9.584115906927131e-06, + "loss": 0.4172, + "step": 1612 + }, + { + "epoch": 0.1569829683698297, + "grad_norm": 3.3021500316987633, + "learning_rate": 9.58348633071512e-06, + "loss": 0.472, + "step": 1613 + }, + { + "epoch": 0.15708029197080292, + "grad_norm": 1.860435632122749, + "learning_rate": 9.582856299039253e-06, + "loss": 0.4743, + "step": 1614 + }, + { + "epoch": 0.15717761557177615, + "grad_norm": 1.7557070181222967, + "learning_rate": 9.58222581196214e-06, + "loss": 0.2907, + "step": 1615 + }, + { + "epoch": 0.1572749391727494, + "grad_norm": 1.5588238003780286, + "learning_rate": 9.581594869546433e-06, + "loss": 0.3803, + "step": 1616 + }, + { + "epoch": 0.15737226277372263, + "grad_norm": 1.5265824940366777, + "learning_rate": 9.580963471854825e-06, + "loss": 0.3163, + "step": 1617 + }, + { + "epoch": 0.15746958637469585, + "grad_norm": 1.5425233608560427, + "learning_rate": 9.580331618950063e-06, + "loss": 0.3884, + "step": 1618 + }, + { + "epoch": 0.1575669099756691, + "grad_norm": 1.4123635386488018, + "learning_rate": 9.579699310894932e-06, + "loss": 0.382, + "step": 1619 + }, + { + "epoch": 0.15766423357664233, + "grad_norm": 1.578019469103596, + "learning_rate": 9.579066547752266e-06, + "loss": 0.4293, + "step": 1620 + }, + { + "epoch": 0.15776155717761559, + "grad_norm": 1.6566990657429592, + "learning_rate": 9.578433329584943e-06, + "loss": 0.2878, + "step": 1621 + }, + { + "epoch": 0.1578588807785888, + "grad_norm": 1.5290043771605026, + "learning_rate": 9.577799656455886e-06, + "loss": 0.4483, + "step": 1622 + }, + { + "epoch": 0.15795620437956204, + "grad_norm": 1.7268752423292135, + "learning_rate": 9.577165528428063e-06, + "loss": 0.4805, + "step": 1623 + }, + { + "epoch": 0.1580535279805353, + "grad_norm": 1.3495189675110832, + "learning_rate": 9.576530945564488e-06, + "loss": 0.3161, + "step": 1624 + }, + { + "epoch": 0.15815085158150852, + "grad_norm": 1.4763829359235794, + "learning_rate": 9.575895907928218e-06, + "loss": 0.4825, + "step": 1625 + }, + { + "epoch": 0.15824817518248174, + "grad_norm": 1.686991367686583, + "learning_rate": 9.575260415582362e-06, + "loss": 0.3016, + "step": 1626 + }, + { + "epoch": 0.158345498783455, + "grad_norm": 1.3390220591470878, + "learning_rate": 9.574624468590065e-06, + "loss": 0.4523, + "step": 1627 + }, + { + "epoch": 0.15844282238442822, + "grad_norm": 1.8698808087393168, + "learning_rate": 9.573988067014523e-06, + "loss": 0.5203, + "step": 1628 + }, + { + "epoch": 0.15854014598540145, + "grad_norm": 1.4032165021732874, + "learning_rate": 9.573351210918976e-06, + "loss": 0.3678, + "step": 1629 + }, + { + "epoch": 0.1586374695863747, + "grad_norm": 1.4017015011859046, + "learning_rate": 9.572713900366707e-06, + "loss": 0.2798, + "step": 1630 + }, + { + "epoch": 0.15873479318734793, + "grad_norm": 1.4441030854971395, + "learning_rate": 9.572076135421048e-06, + "loss": 0.3514, + "step": 1631 + }, + { + "epoch": 0.15883211678832118, + "grad_norm": 1.3629792761623065, + "learning_rate": 9.571437916145373e-06, + "loss": 0.4604, + "step": 1632 + }, + { + "epoch": 0.1589294403892944, + "grad_norm": 1.376972344446985, + "learning_rate": 9.570799242603101e-06, + "loss": 0.4603, + "step": 1633 + }, + { + "epoch": 0.15902676399026763, + "grad_norm": 1.5637421057827365, + "learning_rate": 9.5701601148577e-06, + "loss": 0.5575, + "step": 1634 + }, + { + "epoch": 0.1591240875912409, + "grad_norm": 1.4338457681188446, + "learning_rate": 9.56952053297268e-06, + "loss": 0.532, + "step": 1635 + }, + { + "epoch": 0.1592214111922141, + "grad_norm": 1.4858651962900338, + "learning_rate": 9.568880497011597e-06, + "loss": 0.4951, + "step": 1636 + }, + { + "epoch": 0.15931873479318734, + "grad_norm": 1.543423201839799, + "learning_rate": 9.568240007038048e-06, + "loss": 0.5278, + "step": 1637 + }, + { + "epoch": 0.1594160583941606, + "grad_norm": 1.408319688012345, + "learning_rate": 9.567599063115683e-06, + "loss": 0.4474, + "step": 1638 + }, + { + "epoch": 0.15951338199513382, + "grad_norm": 1.2680346779127702, + "learning_rate": 9.566957665308192e-06, + "loss": 0.3351, + "step": 1639 + }, + { + "epoch": 0.15961070559610704, + "grad_norm": 1.6277797838197976, + "learning_rate": 9.56631581367931e-06, + "loss": 0.3966, + "step": 1640 + }, + { + "epoch": 0.1597080291970803, + "grad_norm": 1.5248977314161354, + "learning_rate": 9.565673508292818e-06, + "loss": 0.5211, + "step": 1641 + }, + { + "epoch": 0.15980535279805352, + "grad_norm": 1.7164012466100764, + "learning_rate": 9.565030749212546e-06, + "loss": 0.5428, + "step": 1642 + }, + { + "epoch": 0.15990267639902678, + "grad_norm": 1.6687081549609284, + "learning_rate": 9.56438753650236e-06, + "loss": 0.2936, + "step": 1643 + }, + { + "epoch": 0.16, + "grad_norm": 1.5678110268585723, + "learning_rate": 9.56374387022618e-06, + "loss": 0.5166, + "step": 1644 + }, + { + "epoch": 0.16009732360097323, + "grad_norm": 1.6983019931785335, + "learning_rate": 9.563099750447966e-06, + "loss": 0.4822, + "step": 1645 + }, + { + "epoch": 0.16019464720194648, + "grad_norm": 1.4431824530543444, + "learning_rate": 9.562455177231726e-06, + "loss": 0.3212, + "step": 1646 + }, + { + "epoch": 0.1602919708029197, + "grad_norm": 3.712828208723791, + "learning_rate": 9.56181015064151e-06, + "loss": 0.4286, + "step": 1647 + }, + { + "epoch": 0.16038929440389293, + "grad_norm": 1.4388083433357408, + "learning_rate": 9.561164670741416e-06, + "loss": 0.3757, + "step": 1648 + }, + { + "epoch": 0.1604866180048662, + "grad_norm": 2.4878081586110117, + "learning_rate": 9.560518737595586e-06, + "loss": 0.3494, + "step": 1649 + }, + { + "epoch": 0.16058394160583941, + "grad_norm": 2.3091262745384706, + "learning_rate": 9.559872351268205e-06, + "loss": 0.4607, + "step": 1650 + }, + { + "epoch": 0.16068126520681264, + "grad_norm": 1.6632563827899045, + "learning_rate": 9.559225511823504e-06, + "loss": 0.5718, + "step": 1651 + }, + { + "epoch": 0.1607785888077859, + "grad_norm": 1.6138862417611177, + "learning_rate": 9.558578219325763e-06, + "loss": 0.325, + "step": 1652 + }, + { + "epoch": 0.16087591240875912, + "grad_norm": 1.1933317040764397, + "learning_rate": 9.557930473839303e-06, + "loss": 0.339, + "step": 1653 + }, + { + "epoch": 0.16097323600973237, + "grad_norm": 0.9728312200944081, + "learning_rate": 9.55728227542849e-06, + "loss": 0.2395, + "step": 1654 + }, + { + "epoch": 0.1610705596107056, + "grad_norm": 1.5521742092214053, + "learning_rate": 9.556633624157735e-06, + "loss": 0.4613, + "step": 1655 + }, + { + "epoch": 0.16116788321167883, + "grad_norm": 1.639740187603822, + "learning_rate": 9.555984520091497e-06, + "loss": 0.5146, + "step": 1656 + }, + { + "epoch": 0.16126520681265208, + "grad_norm": 1.5387772039120604, + "learning_rate": 9.555334963294277e-06, + "loss": 0.4879, + "step": 1657 + }, + { + "epoch": 0.1613625304136253, + "grad_norm": 1.2788374913210725, + "learning_rate": 9.554684953830622e-06, + "loss": 0.2115, + "step": 1658 + }, + { + "epoch": 0.16145985401459853, + "grad_norm": 1.2466060338770748, + "learning_rate": 9.554034491765123e-06, + "loss": 0.4057, + "step": 1659 + }, + { + "epoch": 0.16155717761557178, + "grad_norm": 1.3626765355526065, + "learning_rate": 9.553383577162418e-06, + "loss": 0.3922, + "step": 1660 + }, + { + "epoch": 0.161654501216545, + "grad_norm": 1.4993759287568524, + "learning_rate": 9.552732210087188e-06, + "loss": 0.5101, + "step": 1661 + }, + { + "epoch": 0.16175182481751824, + "grad_norm": 1.4132678080310175, + "learning_rate": 9.55208039060416e-06, + "loss": 0.4098, + "step": 1662 + }, + { + "epoch": 0.1618491484184915, + "grad_norm": 1.3072203759845393, + "learning_rate": 9.551428118778105e-06, + "loss": 0.4437, + "step": 1663 + }, + { + "epoch": 0.16194647201946472, + "grad_norm": 1.4197615961970556, + "learning_rate": 9.550775394673841e-06, + "loss": 0.4855, + "step": 1664 + }, + { + "epoch": 0.16204379562043797, + "grad_norm": 1.1443578178578404, + "learning_rate": 9.550122218356228e-06, + "loss": 0.2651, + "step": 1665 + }, + { + "epoch": 0.1621411192214112, + "grad_norm": 1.6274953169982382, + "learning_rate": 9.549468589890173e-06, + "loss": 0.5702, + "step": 1666 + }, + { + "epoch": 0.16223844282238442, + "grad_norm": 1.5542252970145625, + "learning_rate": 9.548814509340631e-06, + "loss": 0.3618, + "step": 1667 + }, + { + "epoch": 0.16233576642335767, + "grad_norm": 1.5872588267319008, + "learning_rate": 9.548159976772593e-06, + "loss": 0.5261, + "step": 1668 + }, + { + "epoch": 0.1624330900243309, + "grad_norm": 1.1735078752446053, + "learning_rate": 9.547504992251102e-06, + "loss": 0.2709, + "step": 1669 + }, + { + "epoch": 0.16253041362530413, + "grad_norm": 1.8057871189139236, + "learning_rate": 9.546849555841247e-06, + "loss": 0.3383, + "step": 1670 + }, + { + "epoch": 0.16262773722627738, + "grad_norm": 1.4181568031561294, + "learning_rate": 9.546193667608155e-06, + "loss": 0.4654, + "step": 1671 + }, + { + "epoch": 0.1627250608272506, + "grad_norm": 1.3372190697374011, + "learning_rate": 9.545537327617004e-06, + "loss": 0.4098, + "step": 1672 + }, + { + "epoch": 0.16282238442822383, + "grad_norm": 1.4054977948345526, + "learning_rate": 9.544880535933015e-06, + "loss": 0.488, + "step": 1673 + }, + { + "epoch": 0.16291970802919709, + "grad_norm": 1.8103202340533562, + "learning_rate": 9.544223292621456e-06, + "loss": 0.2989, + "step": 1674 + }, + { + "epoch": 0.1630170316301703, + "grad_norm": 1.4424657055300307, + "learning_rate": 9.543565597747633e-06, + "loss": 0.3545, + "step": 1675 + }, + { + "epoch": 0.16311435523114357, + "grad_norm": 1.712897793310079, + "learning_rate": 9.542907451376904e-06, + "loss": 0.4372, + "step": 1676 + }, + { + "epoch": 0.1632116788321168, + "grad_norm": 1.5856342495538354, + "learning_rate": 9.542248853574669e-06, + "loss": 0.3552, + "step": 1677 + }, + { + "epoch": 0.16330900243309002, + "grad_norm": 1.6070757988154845, + "learning_rate": 9.541589804406373e-06, + "loss": 0.6297, + "step": 1678 + }, + { + "epoch": 0.16340632603406327, + "grad_norm": 1.4030835423791206, + "learning_rate": 9.540930303937508e-06, + "loss": 0.5304, + "step": 1679 + }, + { + "epoch": 0.1635036496350365, + "grad_norm": 1.1629420270697914, + "learning_rate": 9.540270352233607e-06, + "loss": 0.3196, + "step": 1680 + }, + { + "epoch": 0.16360097323600972, + "grad_norm": 1.6438421767465334, + "learning_rate": 9.53960994936025e-06, + "loss": 0.5718, + "step": 1681 + }, + { + "epoch": 0.16369829683698298, + "grad_norm": 1.4972655485667212, + "learning_rate": 9.538949095383064e-06, + "loss": 0.5411, + "step": 1682 + }, + { + "epoch": 0.1637956204379562, + "grad_norm": 1.6855463092047138, + "learning_rate": 9.538287790367715e-06, + "loss": 0.4072, + "step": 1683 + }, + { + "epoch": 0.16389294403892943, + "grad_norm": 1.3024464622228382, + "learning_rate": 9.537626034379918e-06, + "loss": 0.3779, + "step": 1684 + }, + { + "epoch": 0.16399026763990268, + "grad_norm": 1.295189693137423, + "learning_rate": 9.536963827485435e-06, + "loss": 0.3687, + "step": 1685 + }, + { + "epoch": 0.1640875912408759, + "grad_norm": 1.4535138830119652, + "learning_rate": 9.536301169750068e-06, + "loss": 0.4548, + "step": 1686 + }, + { + "epoch": 0.16418491484184916, + "grad_norm": 1.199213729997, + "learning_rate": 9.535638061239663e-06, + "loss": 0.2053, + "step": 1687 + }, + { + "epoch": 0.1642822384428224, + "grad_norm": 1.5567691993981325, + "learning_rate": 9.534974502020117e-06, + "loss": 0.4098, + "step": 1688 + }, + { + "epoch": 0.1643795620437956, + "grad_norm": 1.5701473016338705, + "learning_rate": 9.534310492157368e-06, + "loss": 0.4663, + "step": 1689 + }, + { + "epoch": 0.16447688564476887, + "grad_norm": 1.4652608455665965, + "learning_rate": 9.533646031717398e-06, + "loss": 0.423, + "step": 1690 + }, + { + "epoch": 0.1645742092457421, + "grad_norm": 1.556818972222242, + "learning_rate": 9.532981120766235e-06, + "loss": 0.5823, + "step": 1691 + }, + { + "epoch": 0.16467153284671532, + "grad_norm": 1.3176167070500389, + "learning_rate": 9.532315759369953e-06, + "loss": 0.3369, + "step": 1692 + }, + { + "epoch": 0.16476885644768857, + "grad_norm": 1.710131590392248, + "learning_rate": 9.531649947594668e-06, + "loss": 0.6235, + "step": 1693 + }, + { + "epoch": 0.1648661800486618, + "grad_norm": 1.316452070848038, + "learning_rate": 9.53098368550654e-06, + "loss": 0.2773, + "step": 1694 + }, + { + "epoch": 0.16496350364963502, + "grad_norm": 1.3144552108952152, + "learning_rate": 9.53031697317178e-06, + "loss": 0.4008, + "step": 1695 + }, + { + "epoch": 0.16506082725060828, + "grad_norm": 1.6242845867808264, + "learning_rate": 9.529649810656638e-06, + "loss": 0.4994, + "step": 1696 + }, + { + "epoch": 0.1651581508515815, + "grad_norm": 1.285181340955318, + "learning_rate": 9.52898219802741e-06, + "loss": 0.3565, + "step": 1697 + }, + { + "epoch": 0.16525547445255476, + "grad_norm": 1.5859120183692204, + "learning_rate": 9.528314135350439e-06, + "loss": 0.6057, + "step": 1698 + }, + { + "epoch": 0.16535279805352798, + "grad_norm": 1.2413369391689792, + "learning_rate": 9.527645622692105e-06, + "loss": 0.2912, + "step": 1699 + }, + { + "epoch": 0.1654501216545012, + "grad_norm": 1.5626898078072964, + "learning_rate": 9.526976660118846e-06, + "loss": 0.4912, + "step": 1700 + }, + { + "epoch": 0.16554744525547446, + "grad_norm": 1.355302168314411, + "learning_rate": 9.526307247697133e-06, + "loss": 0.4066, + "step": 1701 + }, + { + "epoch": 0.1656447688564477, + "grad_norm": 1.6754743388370108, + "learning_rate": 9.525637385493485e-06, + "loss": 0.4402, + "step": 1702 + }, + { + "epoch": 0.16574209245742091, + "grad_norm": 1.4378330010865907, + "learning_rate": 9.524967073574468e-06, + "loss": 0.3896, + "step": 1703 + }, + { + "epoch": 0.16583941605839417, + "grad_norm": 1.5562357645264613, + "learning_rate": 9.524296312006696e-06, + "loss": 0.7178, + "step": 1704 + }, + { + "epoch": 0.1659367396593674, + "grad_norm": 1.4997676033555023, + "learning_rate": 9.523625100856814e-06, + "loss": 0.5203, + "step": 1705 + }, + { + "epoch": 0.16603406326034062, + "grad_norm": 1.39039181243628, + "learning_rate": 9.522953440191528e-06, + "loss": 0.4804, + "step": 1706 + }, + { + "epoch": 0.16613138686131387, + "grad_norm": 1.2594698773182105, + "learning_rate": 9.522281330077579e-06, + "loss": 0.31, + "step": 1707 + }, + { + "epoch": 0.1662287104622871, + "grad_norm": 1.5394103920539104, + "learning_rate": 9.521608770581751e-06, + "loss": 0.4579, + "step": 1708 + }, + { + "epoch": 0.16632603406326035, + "grad_norm": 1.4703967014570463, + "learning_rate": 9.520935761770885e-06, + "loss": 0.4732, + "step": 1709 + }, + { + "epoch": 0.16642335766423358, + "grad_norm": 1.0444153315520046, + "learning_rate": 9.520262303711851e-06, + "loss": 0.2468, + "step": 1710 + }, + { + "epoch": 0.1665206812652068, + "grad_norm": 1.4440019594110525, + "learning_rate": 9.519588396471572e-06, + "loss": 0.4979, + "step": 1711 + }, + { + "epoch": 0.16661800486618006, + "grad_norm": 1.6467368949298022, + "learning_rate": 9.518914040117018e-06, + "loss": 0.603, + "step": 1712 + }, + { + "epoch": 0.16671532846715328, + "grad_norm": 1.656027868957794, + "learning_rate": 9.518239234715198e-06, + "loss": 0.3534, + "step": 1713 + }, + { + "epoch": 0.1668126520681265, + "grad_norm": 1.409360793352949, + "learning_rate": 9.517563980333169e-06, + "loss": 0.4442, + "step": 1714 + }, + { + "epoch": 0.16690997566909976, + "grad_norm": 1.4429795690770129, + "learning_rate": 9.51688827703803e-06, + "loss": 0.4347, + "step": 1715 + }, + { + "epoch": 0.167007299270073, + "grad_norm": 1.2256612199861667, + "learning_rate": 9.516212124896926e-06, + "loss": 0.3582, + "step": 1716 + }, + { + "epoch": 0.16710462287104622, + "grad_norm": 1.340106815948813, + "learning_rate": 9.515535523977047e-06, + "loss": 0.4494, + "step": 1717 + }, + { + "epoch": 0.16720194647201947, + "grad_norm": 1.8033632646616307, + "learning_rate": 9.514858474345628e-06, + "loss": 0.7254, + "step": 1718 + }, + { + "epoch": 0.1672992700729927, + "grad_norm": 1.461471704742246, + "learning_rate": 9.514180976069948e-06, + "loss": 0.4431, + "step": 1719 + }, + { + "epoch": 0.16739659367396595, + "grad_norm": 1.8149337871023152, + "learning_rate": 9.513503029217329e-06, + "loss": 0.6808, + "step": 1720 + }, + { + "epoch": 0.16749391727493917, + "grad_norm": 1.4317488687976054, + "learning_rate": 9.51282463385514e-06, + "loss": 0.3969, + "step": 1721 + }, + { + "epoch": 0.1675912408759124, + "grad_norm": 1.406660867644435, + "learning_rate": 9.512145790050793e-06, + "loss": 0.4466, + "step": 1722 + }, + { + "epoch": 0.16768856447688565, + "grad_norm": 1.5087949092220858, + "learning_rate": 9.511466497871747e-06, + "loss": 0.3588, + "step": 1723 + }, + { + "epoch": 0.16778588807785888, + "grad_norm": 1.3780878680496882, + "learning_rate": 9.5107867573855e-06, + "loss": 0.4136, + "step": 1724 + }, + { + "epoch": 0.1678832116788321, + "grad_norm": 1.1785521443758606, + "learning_rate": 9.510106568659601e-06, + "loss": 0.3319, + "step": 1725 + }, + { + "epoch": 0.16798053527980536, + "grad_norm": 1.4150065437408217, + "learning_rate": 9.50942593176164e-06, + "loss": 0.3619, + "step": 1726 + }, + { + "epoch": 0.16807785888077859, + "grad_norm": 1.5810685607791577, + "learning_rate": 9.508744846759254e-06, + "loss": 0.5204, + "step": 1727 + }, + { + "epoch": 0.1681751824817518, + "grad_norm": 1.5507123725258296, + "learning_rate": 9.50806331372012e-06, + "loss": 0.3017, + "step": 1728 + }, + { + "epoch": 0.16827250608272507, + "grad_norm": 1.7448176899198176, + "learning_rate": 9.507381332711963e-06, + "loss": 0.6488, + "step": 1729 + }, + { + "epoch": 0.1683698296836983, + "grad_norm": 2.0203041353812243, + "learning_rate": 9.506698903802553e-06, + "loss": 0.2868, + "step": 1730 + }, + { + "epoch": 0.16846715328467154, + "grad_norm": 1.425557408986151, + "learning_rate": 9.506016027059703e-06, + "loss": 0.4181, + "step": 1731 + }, + { + "epoch": 0.16856447688564477, + "grad_norm": 1.658389742609111, + "learning_rate": 9.505332702551272e-06, + "loss": 0.4834, + "step": 1732 + }, + { + "epoch": 0.168661800486618, + "grad_norm": 1.6313220070332846, + "learning_rate": 9.50464893034516e-06, + "loss": 0.6351, + "step": 1733 + }, + { + "epoch": 0.16875912408759125, + "grad_norm": 1.4860828412814417, + "learning_rate": 9.503964710509314e-06, + "loss": 0.384, + "step": 1734 + }, + { + "epoch": 0.16885644768856448, + "grad_norm": 1.5665989326823084, + "learning_rate": 9.503280043111729e-06, + "loss": 0.5031, + "step": 1735 + }, + { + "epoch": 0.1689537712895377, + "grad_norm": 1.2627591310970376, + "learning_rate": 9.502594928220437e-06, + "loss": 0.3557, + "step": 1736 + }, + { + "epoch": 0.16905109489051096, + "grad_norm": 1.6101827723851228, + "learning_rate": 9.50190936590352e-06, + "loss": 0.3886, + "step": 1737 + }, + { + "epoch": 0.16914841849148418, + "grad_norm": 1.190927027644026, + "learning_rate": 9.5012233562291e-06, + "loss": 0.3, + "step": 1738 + }, + { + "epoch": 0.1692457420924574, + "grad_norm": 1.6452233677093766, + "learning_rate": 9.50053689926535e-06, + "loss": 0.5808, + "step": 1739 + }, + { + "epoch": 0.16934306569343066, + "grad_norm": 1.607284224817037, + "learning_rate": 9.499849995080482e-06, + "loss": 0.5726, + "step": 1740 + }, + { + "epoch": 0.1694403892944039, + "grad_norm": 1.360873175063302, + "learning_rate": 9.499162643742754e-06, + "loss": 0.3294, + "step": 1741 + }, + { + "epoch": 0.16953771289537714, + "grad_norm": 1.6205396325650636, + "learning_rate": 9.49847484532047e-06, + "loss": 0.5496, + "step": 1742 + }, + { + "epoch": 0.16963503649635037, + "grad_norm": 1.6677491090337848, + "learning_rate": 9.497786599881973e-06, + "loss": 0.5745, + "step": 1743 + }, + { + "epoch": 0.1697323600973236, + "grad_norm": 1.4765151889225172, + "learning_rate": 9.497097907495658e-06, + "loss": 0.3552, + "step": 1744 + }, + { + "epoch": 0.16982968369829685, + "grad_norm": 1.4991516257283077, + "learning_rate": 9.496408768229962e-06, + "loss": 0.6004, + "step": 1745 + }, + { + "epoch": 0.16992700729927007, + "grad_norm": 1.394241003611109, + "learning_rate": 9.49571918215336e-06, + "loss": 0.4166, + "step": 1746 + }, + { + "epoch": 0.1700243309002433, + "grad_norm": 1.2418310265706307, + "learning_rate": 9.495029149334381e-06, + "loss": 0.3754, + "step": 1747 + }, + { + "epoch": 0.17012165450121655, + "grad_norm": 1.7344174079178016, + "learning_rate": 9.494338669841592e-06, + "loss": 0.6136, + "step": 1748 + }, + { + "epoch": 0.17021897810218978, + "grad_norm": 1.689754745813109, + "learning_rate": 9.493647743743605e-06, + "loss": 0.3066, + "step": 1749 + }, + { + "epoch": 0.170316301703163, + "grad_norm": 1.5986274434851808, + "learning_rate": 9.492956371109083e-06, + "loss": 0.6476, + "step": 1750 + }, + { + "epoch": 0.17041362530413626, + "grad_norm": 1.3892856963539753, + "learning_rate": 9.492264552006725e-06, + "loss": 0.2438, + "step": 1751 + }, + { + "epoch": 0.17051094890510948, + "grad_norm": 1.3744062095245357, + "learning_rate": 9.491572286505275e-06, + "loss": 0.4154, + "step": 1752 + }, + { + "epoch": 0.17060827250608274, + "grad_norm": 1.3041989445373636, + "learning_rate": 9.490879574673528e-06, + "loss": 0.3603, + "step": 1753 + }, + { + "epoch": 0.17070559610705596, + "grad_norm": 1.2198251236981021, + "learning_rate": 9.490186416580317e-06, + "loss": 0.3382, + "step": 1754 + }, + { + "epoch": 0.1708029197080292, + "grad_norm": 1.0699077871285796, + "learning_rate": 9.489492812294521e-06, + "loss": 0.2805, + "step": 1755 + }, + { + "epoch": 0.17090024330900244, + "grad_norm": 1.8289792925797566, + "learning_rate": 9.488798761885064e-06, + "loss": 0.2551, + "step": 1756 + }, + { + "epoch": 0.17099756690997567, + "grad_norm": 1.5156970449411904, + "learning_rate": 9.488104265420917e-06, + "loss": 0.5468, + "step": 1757 + }, + { + "epoch": 0.1710948905109489, + "grad_norm": 1.3669899498040559, + "learning_rate": 9.487409322971089e-06, + "loss": 0.4705, + "step": 1758 + }, + { + "epoch": 0.17119221411192215, + "grad_norm": 1.4212977316967985, + "learning_rate": 9.486713934604638e-06, + "loss": 0.5259, + "step": 1759 + }, + { + "epoch": 0.17128953771289537, + "grad_norm": 1.3256503218660822, + "learning_rate": 9.486018100390668e-06, + "loss": 0.3825, + "step": 1760 + }, + { + "epoch": 0.17138686131386863, + "grad_norm": 1.3448672418414023, + "learning_rate": 9.485321820398321e-06, + "loss": 0.4984, + "step": 1761 + }, + { + "epoch": 0.17148418491484185, + "grad_norm": 1.3293122762885854, + "learning_rate": 9.484625094696788e-06, + "loss": 0.4419, + "step": 1762 + }, + { + "epoch": 0.17158150851581508, + "grad_norm": 1.5749728003681251, + "learning_rate": 9.483927923355303e-06, + "loss": 0.4512, + "step": 1763 + }, + { + "epoch": 0.17167883211678833, + "grad_norm": 2.1875449039755, + "learning_rate": 9.483230306443144e-06, + "loss": 0.4606, + "step": 1764 + }, + { + "epoch": 0.17177615571776156, + "grad_norm": 1.4675466599593059, + "learning_rate": 9.482532244029632e-06, + "loss": 0.5098, + "step": 1765 + }, + { + "epoch": 0.17187347931873478, + "grad_norm": 1.4395657499189969, + "learning_rate": 9.481833736184137e-06, + "loss": 0.4196, + "step": 1766 + }, + { + "epoch": 0.17197080291970804, + "grad_norm": 1.6202346179751734, + "learning_rate": 9.48113478297607e-06, + "loss": 0.4083, + "step": 1767 + }, + { + "epoch": 0.17206812652068126, + "grad_norm": 1.943359375, + "learning_rate": 9.480435384474884e-06, + "loss": 0.3829, + "step": 1768 + }, + { + "epoch": 0.1721654501216545, + "grad_norm": 1.3957800309361543, + "learning_rate": 9.47973554075008e-06, + "loss": 0.4776, + "step": 1769 + }, + { + "epoch": 0.17226277372262774, + "grad_norm": 1.3277740014703983, + "learning_rate": 9.479035251871202e-06, + "loss": 0.2944, + "step": 1770 + }, + { + "epoch": 0.17236009732360097, + "grad_norm": 1.5955109684829234, + "learning_rate": 9.478334517907838e-06, + "loss": 0.4713, + "step": 1771 + }, + { + "epoch": 0.17245742092457422, + "grad_norm": 1.209763677864614, + "learning_rate": 9.477633338929621e-06, + "loss": 0.2925, + "step": 1772 + }, + { + "epoch": 0.17255474452554745, + "grad_norm": 1.6082316661319236, + "learning_rate": 9.476931715006225e-06, + "loss": 0.6037, + "step": 1773 + }, + { + "epoch": 0.17265206812652067, + "grad_norm": 1.5310145605828824, + "learning_rate": 9.476229646207375e-06, + "loss": 0.427, + "step": 1774 + }, + { + "epoch": 0.17274939172749393, + "grad_norm": 1.5161322305327478, + "learning_rate": 9.475527132602833e-06, + "loss": 0.5765, + "step": 1775 + }, + { + "epoch": 0.17284671532846715, + "grad_norm": 1.5515912532543141, + "learning_rate": 9.47482417426241e-06, + "loss": 0.4693, + "step": 1776 + }, + { + "epoch": 0.17294403892944038, + "grad_norm": 1.273583152257964, + "learning_rate": 9.474120771255956e-06, + "loss": 0.401, + "step": 1777 + }, + { + "epoch": 0.17304136253041363, + "grad_norm": 1.3058387108850102, + "learning_rate": 9.473416923653373e-06, + "loss": 0.4651, + "step": 1778 + }, + { + "epoch": 0.17313868613138686, + "grad_norm": 1.4876685295483647, + "learning_rate": 9.472712631524599e-06, + "loss": 0.5423, + "step": 1779 + }, + { + "epoch": 0.17323600973236009, + "grad_norm": 1.4134646674679987, + "learning_rate": 9.472007894939624e-06, + "loss": 0.448, + "step": 1780 + }, + { + "epoch": 0.17333333333333334, + "grad_norm": 1.3805564537318322, + "learning_rate": 9.471302713968473e-06, + "loss": 0.2429, + "step": 1781 + }, + { + "epoch": 0.17343065693430657, + "grad_norm": 1.4256414475552066, + "learning_rate": 9.470597088681225e-06, + "loss": 0.4821, + "step": 1782 + }, + { + "epoch": 0.17352798053527982, + "grad_norm": 1.2857804565204727, + "learning_rate": 9.469891019147996e-06, + "loss": 0.3177, + "step": 1783 + }, + { + "epoch": 0.17362530413625304, + "grad_norm": 1.7384422656290006, + "learning_rate": 9.46918450543895e-06, + "loss": 0.6144, + "step": 1784 + }, + { + "epoch": 0.17372262773722627, + "grad_norm": 1.3733947226466707, + "learning_rate": 9.46847754762429e-06, + "loss": 0.3777, + "step": 1785 + }, + { + "epoch": 0.17381995133819952, + "grad_norm": 1.090627736959876, + "learning_rate": 9.467770145774271e-06, + "loss": 0.307, + "step": 1786 + }, + { + "epoch": 0.17391727493917275, + "grad_norm": 1.5306242617959314, + "learning_rate": 9.467062299959187e-06, + "loss": 0.4652, + "step": 1787 + }, + { + "epoch": 0.17401459854014598, + "grad_norm": 1.6335244702718128, + "learning_rate": 9.466354010249375e-06, + "loss": 0.5127, + "step": 1788 + }, + { + "epoch": 0.17411192214111923, + "grad_norm": 1.3582351114688258, + "learning_rate": 9.465645276715221e-06, + "loss": 0.4213, + "step": 1789 + }, + { + "epoch": 0.17420924574209246, + "grad_norm": 1.4962342995542501, + "learning_rate": 9.464936099427151e-06, + "loss": 0.4327, + "step": 1790 + }, + { + "epoch": 0.17430656934306568, + "grad_norm": 1.513533102257641, + "learning_rate": 9.464226478455636e-06, + "loss": 0.4527, + "step": 1791 + }, + { + "epoch": 0.17440389294403894, + "grad_norm": 1.4174664240767785, + "learning_rate": 9.463516413871193e-06, + "loss": 0.4986, + "step": 1792 + }, + { + "epoch": 0.17450121654501216, + "grad_norm": 1.283758777636687, + "learning_rate": 9.46280590574438e-06, + "loss": 0.4648, + "step": 1793 + }, + { + "epoch": 0.17459854014598541, + "grad_norm": 1.3960565511895506, + "learning_rate": 9.4620949541458e-06, + "loss": 0.3587, + "step": 1794 + }, + { + "epoch": 0.17469586374695864, + "grad_norm": 1.6199676647020385, + "learning_rate": 9.461383559146104e-06, + "loss": 0.5292, + "step": 1795 + }, + { + "epoch": 0.17479318734793187, + "grad_norm": 1.5028051531717803, + "learning_rate": 9.46067172081598e-06, + "loss": 0.4903, + "step": 1796 + }, + { + "epoch": 0.17489051094890512, + "grad_norm": 1.683063467822515, + "learning_rate": 9.459959439226165e-06, + "loss": 0.3106, + "step": 1797 + }, + { + "epoch": 0.17498783454987835, + "grad_norm": 1.3296224342860092, + "learning_rate": 9.459246714447439e-06, + "loss": 0.409, + "step": 1798 + }, + { + "epoch": 0.17508515815085157, + "grad_norm": 1.5847151231323486, + "learning_rate": 9.458533546550628e-06, + "loss": 0.4169, + "step": 1799 + }, + { + "epoch": 0.17518248175182483, + "grad_norm": 1.495253204796384, + "learning_rate": 9.457819935606596e-06, + "loss": 0.3753, + "step": 1800 + }, + { + "epoch": 0.17527980535279805, + "grad_norm": 1.4723876384358174, + "learning_rate": 9.45710588168626e-06, + "loss": 0.2437, + "step": 1801 + }, + { + "epoch": 0.17537712895377128, + "grad_norm": 1.5610295815557715, + "learning_rate": 9.45639138486057e-06, + "loss": 0.5651, + "step": 1802 + }, + { + "epoch": 0.17547445255474453, + "grad_norm": 1.5876154549734276, + "learning_rate": 9.45567644520053e-06, + "loss": 0.4835, + "step": 1803 + }, + { + "epoch": 0.17557177615571776, + "grad_norm": 1.5619759252942187, + "learning_rate": 9.454961062777181e-06, + "loss": 0.3036, + "step": 1804 + }, + { + "epoch": 0.175669099756691, + "grad_norm": 1.0144029160546408, + "learning_rate": 9.454245237661617e-06, + "loss": 0.219, + "step": 1805 + }, + { + "epoch": 0.17576642335766424, + "grad_norm": 1.717922774563162, + "learning_rate": 9.453528969924963e-06, + "loss": 0.5388, + "step": 1806 + }, + { + "epoch": 0.17586374695863746, + "grad_norm": 1.288743598100688, + "learning_rate": 9.452812259638399e-06, + "loss": 0.4171, + "step": 1807 + }, + { + "epoch": 0.17596107055961072, + "grad_norm": 1.951279890184611, + "learning_rate": 9.452095106873142e-06, + "loss": 0.3823, + "step": 1808 + }, + { + "epoch": 0.17605839416058394, + "grad_norm": 1.352467757455935, + "learning_rate": 9.45137751170046e-06, + "loss": 0.3137, + "step": 1809 + }, + { + "epoch": 0.17615571776155717, + "grad_norm": 1.3883395327139227, + "learning_rate": 9.450659474191658e-06, + "loss": 0.4878, + "step": 1810 + }, + { + "epoch": 0.17625304136253042, + "grad_norm": 1.5658708451700805, + "learning_rate": 9.449940994418088e-06, + "loss": 0.523, + "step": 1811 + }, + { + "epoch": 0.17635036496350365, + "grad_norm": 1.215080164631292, + "learning_rate": 9.449222072451147e-06, + "loss": 0.3773, + "step": 1812 + }, + { + "epoch": 0.17644768856447687, + "grad_norm": 1.524331324578441, + "learning_rate": 9.448502708362273e-06, + "loss": 0.539, + "step": 1813 + }, + { + "epoch": 0.17654501216545013, + "grad_norm": 1.6985132616371517, + "learning_rate": 9.447782902222951e-06, + "loss": 0.6344, + "step": 1814 + }, + { + "epoch": 0.17664233576642335, + "grad_norm": 1.394156226586294, + "learning_rate": 9.447062654104708e-06, + "loss": 0.4136, + "step": 1815 + }, + { + "epoch": 0.1767396593673966, + "grad_norm": 1.0359913462457855, + "learning_rate": 9.446341964079116e-06, + "loss": 0.2471, + "step": 1816 + }, + { + "epoch": 0.17683698296836983, + "grad_norm": 1.6379291001324041, + "learning_rate": 9.44562083221779e-06, + "loss": 0.4648, + "step": 1817 + }, + { + "epoch": 0.17693430656934306, + "grad_norm": 1.0926982727654353, + "learning_rate": 9.44489925859239e-06, + "loss": 0.253, + "step": 1818 + }, + { + "epoch": 0.1770316301703163, + "grad_norm": 1.3396314447206463, + "learning_rate": 9.444177243274619e-06, + "loss": 0.4053, + "step": 1819 + }, + { + "epoch": 0.17712895377128954, + "grad_norm": 1.2170984864894128, + "learning_rate": 9.44345478633622e-06, + "loss": 0.3483, + "step": 1820 + }, + { + "epoch": 0.17722627737226276, + "grad_norm": 1.9241463489982464, + "learning_rate": 9.442731887848993e-06, + "loss": 0.7875, + "step": 1821 + }, + { + "epoch": 0.17732360097323602, + "grad_norm": 1.7367037011857493, + "learning_rate": 9.442008547884765e-06, + "loss": 0.5423, + "step": 1822 + }, + { + "epoch": 0.17742092457420924, + "grad_norm": 1.7768925691501514, + "learning_rate": 9.441284766515417e-06, + "loss": 0.5332, + "step": 1823 + }, + { + "epoch": 0.17751824817518247, + "grad_norm": 1.544872490519166, + "learning_rate": 9.440560543812872e-06, + "loss": 0.4797, + "step": 1824 + }, + { + "epoch": 0.17761557177615572, + "grad_norm": 1.3959412272112985, + "learning_rate": 9.439835879849097e-06, + "loss": 0.2813, + "step": 1825 + }, + { + "epoch": 0.17771289537712895, + "grad_norm": 1.4333698815114406, + "learning_rate": 9.439110774696101e-06, + "loss": 0.4623, + "step": 1826 + }, + { + "epoch": 0.1778102189781022, + "grad_norm": 1.4483549520432324, + "learning_rate": 9.43838522842594e-06, + "loss": 0.3718, + "step": 1827 + }, + { + "epoch": 0.17790754257907543, + "grad_norm": 1.1321375447475677, + "learning_rate": 9.43765924111071e-06, + "loss": 0.3035, + "step": 1828 + }, + { + "epoch": 0.17800486618004865, + "grad_norm": 1.362326738732822, + "learning_rate": 9.436932812822554e-06, + "loss": 0.316, + "step": 1829 + }, + { + "epoch": 0.1781021897810219, + "grad_norm": 1.460799021966237, + "learning_rate": 9.436205943633656e-06, + "loss": 0.3911, + "step": 1830 + }, + { + "epoch": 0.17819951338199513, + "grad_norm": 1.5389161016090995, + "learning_rate": 9.435478633616247e-06, + "loss": 0.5521, + "step": 1831 + }, + { + "epoch": 0.17829683698296836, + "grad_norm": 1.5219868331018827, + "learning_rate": 9.4347508828426e-06, + "loss": 0.5027, + "step": 1832 + }, + { + "epoch": 0.1783941605839416, + "grad_norm": 1.245087028586955, + "learning_rate": 9.434022691385034e-06, + "loss": 0.2981, + "step": 1833 + }, + { + "epoch": 0.17849148418491484, + "grad_norm": 1.4557548434245557, + "learning_rate": 9.433294059315905e-06, + "loss": 0.2293, + "step": 1834 + }, + { + "epoch": 0.17858880778588807, + "grad_norm": 1.3081558633618169, + "learning_rate": 9.432564986707621e-06, + "loss": 0.4217, + "step": 1835 + }, + { + "epoch": 0.17868613138686132, + "grad_norm": 1.3513560054673133, + "learning_rate": 9.43183547363263e-06, + "loss": 0.4318, + "step": 1836 + }, + { + "epoch": 0.17878345498783454, + "grad_norm": 1.3315264956466353, + "learning_rate": 9.431105520163426e-06, + "loss": 0.3781, + "step": 1837 + }, + { + "epoch": 0.1788807785888078, + "grad_norm": 1.0550787306059675, + "learning_rate": 9.430375126372542e-06, + "loss": 0.3104, + "step": 1838 + }, + { + "epoch": 0.17897810218978102, + "grad_norm": 1.3337629142786684, + "learning_rate": 9.429644292332557e-06, + "loss": 0.3455, + "step": 1839 + }, + { + "epoch": 0.17907542579075425, + "grad_norm": 1.6239197882024916, + "learning_rate": 9.428913018116098e-06, + "loss": 0.5855, + "step": 1840 + }, + { + "epoch": 0.1791727493917275, + "grad_norm": 1.3780162846249417, + "learning_rate": 9.428181303795828e-06, + "loss": 0.3643, + "step": 1841 + }, + { + "epoch": 0.17927007299270073, + "grad_norm": 1.3478310292007554, + "learning_rate": 9.42744914944446e-06, + "loss": 0.3962, + "step": 1842 + }, + { + "epoch": 0.17936739659367396, + "grad_norm": 1.5440243743593307, + "learning_rate": 9.426716555134751e-06, + "loss": 0.6193, + "step": 1843 + }, + { + "epoch": 0.1794647201946472, + "grad_norm": 1.4878960058265709, + "learning_rate": 9.425983520939495e-06, + "loss": 0.473, + "step": 1844 + }, + { + "epoch": 0.17956204379562044, + "grad_norm": 1.672460221871015, + "learning_rate": 9.425250046931539e-06, + "loss": 0.6429, + "step": 1845 + }, + { + "epoch": 0.17965936739659366, + "grad_norm": 1.6015212635221012, + "learning_rate": 9.424516133183762e-06, + "loss": 0.3195, + "step": 1846 + }, + { + "epoch": 0.17975669099756691, + "grad_norm": 1.385761715171386, + "learning_rate": 9.4237817797691e-06, + "loss": 0.4054, + "step": 1847 + }, + { + "epoch": 0.17985401459854014, + "grad_norm": 1.386847906411032, + "learning_rate": 9.423046986760522e-06, + "loss": 0.3825, + "step": 1848 + }, + { + "epoch": 0.1799513381995134, + "grad_norm": 1.087510047515406, + "learning_rate": 9.422311754231047e-06, + "loss": 0.3213, + "step": 1849 + }, + { + "epoch": 0.18004866180048662, + "grad_norm": 1.6065416301387576, + "learning_rate": 9.421576082253734e-06, + "loss": 0.5062, + "step": 1850 + }, + { + "epoch": 0.18014598540145985, + "grad_norm": 1.34096451308299, + "learning_rate": 9.42083997090169e-06, + "loss": 0.4036, + "step": 1851 + }, + { + "epoch": 0.1802433090024331, + "grad_norm": 1.2557739418598393, + "learning_rate": 9.42010342024806e-06, + "loss": 0.3595, + "step": 1852 + }, + { + "epoch": 0.18034063260340633, + "grad_norm": 1.5281441778996137, + "learning_rate": 9.419366430366035e-06, + "loss": 0.604, + "step": 1853 + }, + { + "epoch": 0.18043795620437955, + "grad_norm": 1.2665309724570952, + "learning_rate": 9.418629001328852e-06, + "loss": 0.4205, + "step": 1854 + }, + { + "epoch": 0.1805352798053528, + "grad_norm": 1.3442942382162348, + "learning_rate": 9.417891133209789e-06, + "loss": 0.3457, + "step": 1855 + }, + { + "epoch": 0.18063260340632603, + "grad_norm": 1.4106593198915445, + "learning_rate": 9.417152826082169e-06, + "loss": 0.4812, + "step": 1856 + }, + { + "epoch": 0.18072992700729926, + "grad_norm": 1.4377180846268287, + "learning_rate": 9.416414080019359e-06, + "loss": 0.4618, + "step": 1857 + }, + { + "epoch": 0.1808272506082725, + "grad_norm": 3.1493721230250182, + "learning_rate": 9.415674895094765e-06, + "loss": 0.4636, + "step": 1858 + }, + { + "epoch": 0.18092457420924574, + "grad_norm": 1.2019926414899231, + "learning_rate": 9.414935271381844e-06, + "loss": 0.3081, + "step": 1859 + }, + { + "epoch": 0.181021897810219, + "grad_norm": 2.6470194483303042, + "learning_rate": 9.41419520895409e-06, + "loss": 0.545, + "step": 1860 + }, + { + "epoch": 0.18111922141119222, + "grad_norm": 1.2980614715591199, + "learning_rate": 9.413454707885048e-06, + "loss": 0.2964, + "step": 1861 + }, + { + "epoch": 0.18121654501216544, + "grad_norm": 1.0776172492719038, + "learning_rate": 9.412713768248296e-06, + "loss": 0.3014, + "step": 1862 + }, + { + "epoch": 0.1813138686131387, + "grad_norm": 1.6105644497131084, + "learning_rate": 9.411972390117466e-06, + "loss": 0.2939, + "step": 1863 + }, + { + "epoch": 0.18141119221411192, + "grad_norm": 1.5656908641978677, + "learning_rate": 9.411230573566227e-06, + "loss": 0.5202, + "step": 1864 + }, + { + "epoch": 0.18150851581508515, + "grad_norm": 1.303806212869287, + "learning_rate": 9.410488318668294e-06, + "loss": 0.333, + "step": 1865 + }, + { + "epoch": 0.1816058394160584, + "grad_norm": 1.6655746538236336, + "learning_rate": 9.409745625497427e-06, + "loss": 0.432, + "step": 1866 + }, + { + "epoch": 0.18170316301703163, + "grad_norm": 1.3843667729738216, + "learning_rate": 9.409002494127427e-06, + "loss": 0.3721, + "step": 1867 + }, + { + "epoch": 0.18180048661800485, + "grad_norm": 1.119511993732411, + "learning_rate": 9.408258924632139e-06, + "loss": 0.3344, + "step": 1868 + }, + { + "epoch": 0.1818978102189781, + "grad_norm": 1.402581324947916, + "learning_rate": 9.407514917085451e-06, + "loss": 0.4016, + "step": 1869 + }, + { + "epoch": 0.18199513381995133, + "grad_norm": 1.424239738841203, + "learning_rate": 9.406770471561298e-06, + "loss": 0.4043, + "step": 1870 + }, + { + "epoch": 0.18209245742092459, + "grad_norm": 1.4825401610777273, + "learning_rate": 9.406025588133654e-06, + "loss": 0.5446, + "step": 1871 + }, + { + "epoch": 0.1821897810218978, + "grad_norm": 1.1812973154269832, + "learning_rate": 9.405280266876539e-06, + "loss": 0.3086, + "step": 1872 + }, + { + "epoch": 0.18228710462287104, + "grad_norm": 1.458454653825207, + "learning_rate": 9.404534507864015e-06, + "loss": 0.426, + "step": 1873 + }, + { + "epoch": 0.1823844282238443, + "grad_norm": 1.4345175445802738, + "learning_rate": 9.403788311170193e-06, + "loss": 0.4826, + "step": 1874 + }, + { + "epoch": 0.18248175182481752, + "grad_norm": 1.636664123351898, + "learning_rate": 9.403041676869217e-06, + "loss": 0.5861, + "step": 1875 + }, + { + "epoch": 0.18257907542579074, + "grad_norm": 1.4112207510715695, + "learning_rate": 9.402294605035285e-06, + "loss": 0.3575, + "step": 1876 + }, + { + "epoch": 0.182676399026764, + "grad_norm": 1.5632317164864975, + "learning_rate": 9.401547095742631e-06, + "loss": 0.5798, + "step": 1877 + }, + { + "epoch": 0.18277372262773722, + "grad_norm": 1.2700759423445944, + "learning_rate": 9.400799149065538e-06, + "loss": 0.3928, + "step": 1878 + }, + { + "epoch": 0.18287104622871045, + "grad_norm": 1.1318646905388465, + "learning_rate": 9.400050765078327e-06, + "loss": 0.2783, + "step": 1879 + }, + { + "epoch": 0.1829683698296837, + "grad_norm": 1.1697084872304198, + "learning_rate": 9.399301943855368e-06, + "loss": 0.2715, + "step": 1880 + }, + { + "epoch": 0.18306569343065693, + "grad_norm": 1.4137887426273796, + "learning_rate": 9.39855268547107e-06, + "loss": 0.3049, + "step": 1881 + }, + { + "epoch": 0.18316301703163018, + "grad_norm": 1.3869164554267486, + "learning_rate": 9.397802989999888e-06, + "loss": 0.3526, + "step": 1882 + }, + { + "epoch": 0.1832603406326034, + "grad_norm": 1.3336674996684654, + "learning_rate": 9.39705285751632e-06, + "loss": 0.3914, + "step": 1883 + }, + { + "epoch": 0.18335766423357663, + "grad_norm": 1.2095628873380657, + "learning_rate": 9.396302288094907e-06, + "loss": 0.3577, + "step": 1884 + }, + { + "epoch": 0.1834549878345499, + "grad_norm": 1.4741118747641506, + "learning_rate": 9.395551281810233e-06, + "loss": 0.4753, + "step": 1885 + }, + { + "epoch": 0.1835523114355231, + "grad_norm": 1.5440799623052803, + "learning_rate": 9.394799838736928e-06, + "loss": 0.5143, + "step": 1886 + }, + { + "epoch": 0.18364963503649634, + "grad_norm": 1.6461828641301555, + "learning_rate": 9.394047958949661e-06, + "loss": 0.5046, + "step": 1887 + }, + { + "epoch": 0.1837469586374696, + "grad_norm": 1.3077272649446732, + "learning_rate": 9.393295642523147e-06, + "loss": 0.4505, + "step": 1888 + }, + { + "epoch": 0.18384428223844282, + "grad_norm": 1.3954964938282017, + "learning_rate": 9.392542889532146e-06, + "loss": 0.3752, + "step": 1889 + }, + { + "epoch": 0.18394160583941604, + "grad_norm": 1.4332674159188397, + "learning_rate": 9.391789700051457e-06, + "loss": 0.4102, + "step": 1890 + }, + { + "epoch": 0.1840389294403893, + "grad_norm": 1.5291760471205262, + "learning_rate": 9.391036074155926e-06, + "loss": 0.3892, + "step": 1891 + }, + { + "epoch": 0.18413625304136252, + "grad_norm": 1.3194046059109847, + "learning_rate": 9.390282011920442e-06, + "loss": 0.3402, + "step": 1892 + }, + { + "epoch": 0.18423357664233578, + "grad_norm": 1.1218553674196712, + "learning_rate": 9.389527513419935e-06, + "loss": 0.2705, + "step": 1893 + }, + { + "epoch": 0.184330900243309, + "grad_norm": 1.4415924034763155, + "learning_rate": 9.388772578729382e-06, + "loss": 0.4153, + "step": 1894 + }, + { + "epoch": 0.18442822384428223, + "grad_norm": 1.1449469634853555, + "learning_rate": 9.3880172079238e-06, + "loss": 0.2464, + "step": 1895 + }, + { + "epoch": 0.18452554744525548, + "grad_norm": 1.3609647553229742, + "learning_rate": 9.38726140107825e-06, + "loss": 0.4167, + "step": 1896 + }, + { + "epoch": 0.1846228710462287, + "grad_norm": 1.5005607351629322, + "learning_rate": 9.38650515826784e-06, + "loss": 0.5496, + "step": 1897 + }, + { + "epoch": 0.18472019464720194, + "grad_norm": 1.2988771816540412, + "learning_rate": 9.385748479567715e-06, + "loss": 0.3746, + "step": 1898 + }, + { + "epoch": 0.1848175182481752, + "grad_norm": 1.6297457427665438, + "learning_rate": 9.384991365053066e-06, + "loss": 0.5329, + "step": 1899 + }, + { + "epoch": 0.18491484184914841, + "grad_norm": 1.4260746902123356, + "learning_rate": 9.384233814799133e-06, + "loss": 0.5495, + "step": 1900 + }, + { + "epoch": 0.18501216545012167, + "grad_norm": 1.6131616876000299, + "learning_rate": 9.38347582888119e-06, + "loss": 0.4956, + "step": 1901 + }, + { + "epoch": 0.1851094890510949, + "grad_norm": 1.2427047036633028, + "learning_rate": 9.382717407374559e-06, + "loss": 0.3527, + "step": 1902 + }, + { + "epoch": 0.18520681265206812, + "grad_norm": 1.1650358905093554, + "learning_rate": 9.381958550354607e-06, + "loss": 0.3282, + "step": 1903 + }, + { + "epoch": 0.18530413625304137, + "grad_norm": 1.2422827918011654, + "learning_rate": 9.381199257896738e-06, + "loss": 0.3954, + "step": 1904 + }, + { + "epoch": 0.1854014598540146, + "grad_norm": 1.3772059864511268, + "learning_rate": 9.38043953007641e-06, + "loss": 0.2519, + "step": 1905 + }, + { + "epoch": 0.18549878345498783, + "grad_norm": 1.2627132972091453, + "learning_rate": 9.379679366969108e-06, + "loss": 0.3748, + "step": 1906 + }, + { + "epoch": 0.18559610705596108, + "grad_norm": 1.7742544300786764, + "learning_rate": 9.378918768650379e-06, + "loss": 0.4627, + "step": 1907 + }, + { + "epoch": 0.1856934306569343, + "grad_norm": 1.3460661864821146, + "learning_rate": 9.3781577351958e-06, + "loss": 0.3769, + "step": 1908 + }, + { + "epoch": 0.18579075425790753, + "grad_norm": 1.2948363493096455, + "learning_rate": 9.377396266680993e-06, + "loss": 0.255, + "step": 1909 + }, + { + "epoch": 0.18588807785888078, + "grad_norm": 1.4260435934265066, + "learning_rate": 9.376634363181631e-06, + "loss": 0.4158, + "step": 1910 + }, + { + "epoch": 0.185985401459854, + "grad_norm": 1.4136193355548345, + "learning_rate": 9.375872024773423e-06, + "loss": 0.3764, + "step": 1911 + }, + { + "epoch": 0.18608272506082726, + "grad_norm": 1.2338333390059972, + "learning_rate": 9.375109251532121e-06, + "loss": 0.3785, + "step": 1912 + }, + { + "epoch": 0.1861800486618005, + "grad_norm": 1.535249430616727, + "learning_rate": 9.374346043533524e-06, + "loss": 0.5252, + "step": 1913 + }, + { + "epoch": 0.18627737226277372, + "grad_norm": 1.215284604855692, + "learning_rate": 9.373582400853472e-06, + "loss": 0.3295, + "step": 1914 + }, + { + "epoch": 0.18637469586374697, + "grad_norm": 1.331605367733698, + "learning_rate": 9.372818323567847e-06, + "loss": 0.2818, + "step": 1915 + }, + { + "epoch": 0.1864720194647202, + "grad_norm": 1.3700650260278666, + "learning_rate": 9.37205381175258e-06, + "loss": 0.5125, + "step": 1916 + }, + { + "epoch": 0.18656934306569342, + "grad_norm": 1.0730618437287824, + "learning_rate": 9.371288865483637e-06, + "loss": 0.3608, + "step": 1917 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 1.6775147335354874, + "learning_rate": 9.370523484837033e-06, + "loss": 0.4555, + "step": 1918 + }, + { + "epoch": 0.1867639902676399, + "grad_norm": 1.531630799569193, + "learning_rate": 9.369757669888822e-06, + "loss": 0.502, + "step": 1919 + }, + { + "epoch": 0.18686131386861313, + "grad_norm": 0.924734272033398, + "learning_rate": 9.368991420715109e-06, + "loss": 0.2117, + "step": 1920 + }, + { + "epoch": 0.18695863746958638, + "grad_norm": 1.3568146369682141, + "learning_rate": 9.36822473739203e-06, + "loss": 0.4311, + "step": 1921 + }, + { + "epoch": 0.1870559610705596, + "grad_norm": 1.2577909858711795, + "learning_rate": 9.367457619995776e-06, + "loss": 0.405, + "step": 1922 + }, + { + "epoch": 0.18715328467153286, + "grad_norm": 1.5933524739274278, + "learning_rate": 9.366690068602573e-06, + "loss": 0.627, + "step": 1923 + }, + { + "epoch": 0.18725060827250609, + "grad_norm": 1.279419778059805, + "learning_rate": 9.365922083288694e-06, + "loss": 0.2814, + "step": 1924 + }, + { + "epoch": 0.1873479318734793, + "grad_norm": 1.6336124778487715, + "learning_rate": 9.365153664130454e-06, + "loss": 0.6461, + "step": 1925 + }, + { + "epoch": 0.18744525547445257, + "grad_norm": 5.906434394339674, + "learning_rate": 9.364384811204212e-06, + "loss": 0.5628, + "step": 1926 + }, + { + "epoch": 0.1875425790754258, + "grad_norm": 1.2770793302804129, + "learning_rate": 9.363615524586368e-06, + "loss": 0.303, + "step": 1927 + }, + { + "epoch": 0.18763990267639902, + "grad_norm": 1.2695156624644028, + "learning_rate": 9.362845804353367e-06, + "loss": 0.3592, + "step": 1928 + }, + { + "epoch": 0.18773722627737227, + "grad_norm": 1.4443375056776053, + "learning_rate": 9.362075650581698e-06, + "loss": 0.4701, + "step": 1929 + }, + { + "epoch": 0.1878345498783455, + "grad_norm": 1.4330727776563095, + "learning_rate": 9.36130506334789e-06, + "loss": 0.5163, + "step": 1930 + }, + { + "epoch": 0.18793187347931872, + "grad_norm": 1.326934280688427, + "learning_rate": 9.360534042728517e-06, + "loss": 0.289, + "step": 1931 + }, + { + "epoch": 0.18802919708029198, + "grad_norm": 1.0531370847104877, + "learning_rate": 9.359762588800195e-06, + "loss": 0.1994, + "step": 1932 + }, + { + "epoch": 0.1881265206812652, + "grad_norm": 1.4998435892573359, + "learning_rate": 9.358990701639585e-06, + "loss": 0.4064, + "step": 1933 + }, + { + "epoch": 0.18822384428223846, + "grad_norm": 2.65155925581941, + "learning_rate": 9.358218381323391e-06, + "loss": 0.3513, + "step": 1934 + }, + { + "epoch": 0.18832116788321168, + "grad_norm": 1.280523326704506, + "learning_rate": 9.357445627928356e-06, + "loss": 0.3132, + "step": 1935 + }, + { + "epoch": 0.1884184914841849, + "grad_norm": 1.347047087613105, + "learning_rate": 9.356672441531273e-06, + "loss": 0.3334, + "step": 1936 + }, + { + "epoch": 0.18851581508515816, + "grad_norm": 1.2987558904079175, + "learning_rate": 9.35589882220897e-06, + "loss": 0.3224, + "step": 1937 + }, + { + "epoch": 0.1886131386861314, + "grad_norm": 0.9974048438134153, + "learning_rate": 9.355124770038323e-06, + "loss": 0.2764, + "step": 1938 + }, + { + "epoch": 0.1887104622871046, + "grad_norm": 2.544180913694316, + "learning_rate": 9.354350285096255e-06, + "loss": 0.495, + "step": 1939 + }, + { + "epoch": 0.18880778588807787, + "grad_norm": 1.613510595834776, + "learning_rate": 9.353575367459718e-06, + "loss": 0.5269, + "step": 1940 + }, + { + "epoch": 0.1889051094890511, + "grad_norm": 1.1663508101189002, + "learning_rate": 9.352800017205724e-06, + "loss": 0.3936, + "step": 1941 + }, + { + "epoch": 0.18900243309002432, + "grad_norm": 1.3673811421181858, + "learning_rate": 9.352024234411315e-06, + "loss": 0.4448, + "step": 1942 + }, + { + "epoch": 0.18909975669099757, + "grad_norm": 1.1481373712644614, + "learning_rate": 9.351248019153582e-06, + "loss": 0.3226, + "step": 1943 + }, + { + "epoch": 0.1891970802919708, + "grad_norm": 1.025014870233366, + "learning_rate": 9.350471371509659e-06, + "loss": 0.2095, + "step": 1944 + }, + { + "epoch": 0.18929440389294405, + "grad_norm": 1.6587902238420225, + "learning_rate": 9.349694291556723e-06, + "loss": 0.3805, + "step": 1945 + }, + { + "epoch": 0.18939172749391728, + "grad_norm": 1.568770301353131, + "learning_rate": 9.348916779371993e-06, + "loss": 0.3902, + "step": 1946 + }, + { + "epoch": 0.1894890510948905, + "grad_norm": 1.4274566422005779, + "learning_rate": 9.348138835032727e-06, + "loss": 0.3644, + "step": 1947 + }, + { + "epoch": 0.18958637469586376, + "grad_norm": 1.6590398647584288, + "learning_rate": 9.347360458616233e-06, + "loss": 0.3522, + "step": 1948 + }, + { + "epoch": 0.18968369829683698, + "grad_norm": 1.5905934658559544, + "learning_rate": 9.346581650199859e-06, + "loss": 0.3784, + "step": 1949 + }, + { + "epoch": 0.1897810218978102, + "grad_norm": 1.358850838464726, + "learning_rate": 9.345802409860995e-06, + "loss": 0.3407, + "step": 1950 + }, + { + "epoch": 0.18987834549878346, + "grad_norm": 1.5906740312195304, + "learning_rate": 9.345022737677073e-06, + "loss": 0.4735, + "step": 1951 + }, + { + "epoch": 0.1899756690997567, + "grad_norm": 1.419279223309371, + "learning_rate": 9.344242633725573e-06, + "loss": 0.4677, + "step": 1952 + }, + { + "epoch": 0.19007299270072991, + "grad_norm": 2.368125402390624, + "learning_rate": 9.34346209808401e-06, + "loss": 0.4341, + "step": 1953 + }, + { + "epoch": 0.19017031630170317, + "grad_norm": 1.6018933954570558, + "learning_rate": 9.342681130829949e-06, + "loss": 0.4348, + "step": 1954 + }, + { + "epoch": 0.1902676399026764, + "grad_norm": 1.4757982324740848, + "learning_rate": 9.341899732040996e-06, + "loss": 0.393, + "step": 1955 + }, + { + "epoch": 0.19036496350364965, + "grad_norm": 1.463093762624457, + "learning_rate": 9.341117901794797e-06, + "loss": 0.3787, + "step": 1956 + }, + { + "epoch": 0.19046228710462287, + "grad_norm": 1.5507561900230402, + "learning_rate": 9.340335640169045e-06, + "loss": 0.4715, + "step": 1957 + }, + { + "epoch": 0.1905596107055961, + "grad_norm": 1.4207468273121375, + "learning_rate": 9.339552947241471e-06, + "loss": 0.3938, + "step": 1958 + }, + { + "epoch": 0.19065693430656935, + "grad_norm": 1.407596113402629, + "learning_rate": 9.338769823089853e-06, + "loss": 0.4965, + "step": 1959 + }, + { + "epoch": 0.19075425790754258, + "grad_norm": 1.5505869092648736, + "learning_rate": 9.337986267792014e-06, + "loss": 0.3699, + "step": 1960 + }, + { + "epoch": 0.1908515815085158, + "grad_norm": 1.4558635051434323, + "learning_rate": 9.33720228142581e-06, + "loss": 0.3436, + "step": 1961 + }, + { + "epoch": 0.19094890510948906, + "grad_norm": 1.4210127007858437, + "learning_rate": 9.336417864069152e-06, + "loss": 0.3959, + "step": 1962 + }, + { + "epoch": 0.19104622871046228, + "grad_norm": 1.5797691467496429, + "learning_rate": 9.335633015799983e-06, + "loss": 0.5438, + "step": 1963 + }, + { + "epoch": 0.1911435523114355, + "grad_norm": 1.200940613853037, + "learning_rate": 9.334847736696297e-06, + "loss": 0.3037, + "step": 1964 + }, + { + "epoch": 0.19124087591240876, + "grad_norm": 1.6206966051553, + "learning_rate": 9.334062026836128e-06, + "loss": 0.6412, + "step": 1965 + }, + { + "epoch": 0.191338199513382, + "grad_norm": 1.3678147539203456, + "learning_rate": 9.33327588629755e-06, + "loss": 0.328, + "step": 1966 + }, + { + "epoch": 0.19143552311435524, + "grad_norm": 1.425436568728509, + "learning_rate": 9.332489315158685e-06, + "loss": 0.42, + "step": 1967 + }, + { + "epoch": 0.19153284671532847, + "grad_norm": 1.4740185495034979, + "learning_rate": 9.331702313497693e-06, + "loss": 0.3563, + "step": 1968 + }, + { + "epoch": 0.1916301703163017, + "grad_norm": 1.4865130636524604, + "learning_rate": 9.33091488139278e-06, + "loss": 0.3452, + "step": 1969 + }, + { + "epoch": 0.19172749391727495, + "grad_norm": 1.595704917953399, + "learning_rate": 9.330127018922195e-06, + "loss": 0.6593, + "step": 1970 + }, + { + "epoch": 0.19182481751824818, + "grad_norm": 1.4305855687191487, + "learning_rate": 9.329338726164225e-06, + "loss": 0.4935, + "step": 1971 + }, + { + "epoch": 0.1919221411192214, + "grad_norm": 1.4810316480182457, + "learning_rate": 9.328550003197203e-06, + "loss": 0.4303, + "step": 1972 + }, + { + "epoch": 0.19201946472019465, + "grad_norm": 1.1937939840472271, + "learning_rate": 9.32776085009951e-06, + "loss": 0.3178, + "step": 1973 + }, + { + "epoch": 0.19211678832116788, + "grad_norm": 1.3344201288029265, + "learning_rate": 9.326971266949558e-06, + "loss": 0.3469, + "step": 1974 + }, + { + "epoch": 0.1922141119221411, + "grad_norm": 1.5818137690503504, + "learning_rate": 9.326181253825813e-06, + "loss": 0.505, + "step": 1975 + }, + { + "epoch": 0.19231143552311436, + "grad_norm": 1.263126969220317, + "learning_rate": 9.325390810806778e-06, + "loss": 0.3967, + "step": 1976 + }, + { + "epoch": 0.19240875912408759, + "grad_norm": 1.6967730581105949, + "learning_rate": 9.324599937971e-06, + "loss": 0.7353, + "step": 1977 + }, + { + "epoch": 0.19250608272506084, + "grad_norm": 1.4550804189369502, + "learning_rate": 9.323808635397067e-06, + "loss": 0.3326, + "step": 1978 + }, + { + "epoch": 0.19260340632603407, + "grad_norm": 1.594493767215082, + "learning_rate": 9.323016903163612e-06, + "loss": 0.4547, + "step": 1979 + }, + { + "epoch": 0.1927007299270073, + "grad_norm": 1.4855552398261571, + "learning_rate": 9.322224741349313e-06, + "loss": 0.5095, + "step": 1980 + }, + { + "epoch": 0.19279805352798055, + "grad_norm": 1.3769945503658922, + "learning_rate": 9.321432150032884e-06, + "loss": 0.3853, + "step": 1981 + }, + { + "epoch": 0.19289537712895377, + "grad_norm": 1.3138128708042736, + "learning_rate": 9.320639129293083e-06, + "loss": 0.4129, + "step": 1982 + }, + { + "epoch": 0.192992700729927, + "grad_norm": 1.4617598559962484, + "learning_rate": 9.319845679208719e-06, + "loss": 0.449, + "step": 1983 + }, + { + "epoch": 0.19309002433090025, + "grad_norm": 1.6332060417216765, + "learning_rate": 9.319051799858633e-06, + "loss": 0.594, + "step": 1984 + }, + { + "epoch": 0.19318734793187348, + "grad_norm": 1.5432637765560855, + "learning_rate": 9.318257491321714e-06, + "loss": 0.3465, + "step": 1985 + }, + { + "epoch": 0.1932846715328467, + "grad_norm": 1.4536395238750577, + "learning_rate": 9.317462753676895e-06, + "loss": 0.4212, + "step": 1986 + }, + { + "epoch": 0.19338199513381996, + "grad_norm": 1.3985266204226148, + "learning_rate": 9.31666758700315e-06, + "loss": 0.5313, + "step": 1987 + }, + { + "epoch": 0.19347931873479318, + "grad_norm": 1.4329939166816383, + "learning_rate": 9.315871991379493e-06, + "loss": 0.3958, + "step": 1988 + }, + { + "epoch": 0.19357664233576644, + "grad_norm": 1.3666417803863316, + "learning_rate": 9.315075966884984e-06, + "loss": 0.462, + "step": 1989 + }, + { + "epoch": 0.19367396593673966, + "grad_norm": 1.6059064802064114, + "learning_rate": 9.314279513598721e-06, + "loss": 0.5734, + "step": 1990 + }, + { + "epoch": 0.1937712895377129, + "grad_norm": 1.521730062801285, + "learning_rate": 9.313482631599854e-06, + "loss": 0.3479, + "step": 1991 + }, + { + "epoch": 0.19386861313868614, + "grad_norm": 1.5212897395363751, + "learning_rate": 9.312685320967566e-06, + "loss": 0.4328, + "step": 1992 + }, + { + "epoch": 0.19396593673965937, + "grad_norm": 1.669365255826549, + "learning_rate": 9.311887581781086e-06, + "loss": 0.6153, + "step": 1993 + }, + { + "epoch": 0.1940632603406326, + "grad_norm": 1.1692329123053622, + "learning_rate": 9.311089414119688e-06, + "loss": 0.3149, + "step": 1994 + }, + { + "epoch": 0.19416058394160585, + "grad_norm": 1.4724909439197027, + "learning_rate": 9.310290818062683e-06, + "loss": 0.478, + "step": 1995 + }, + { + "epoch": 0.19425790754257907, + "grad_norm": 1.667688851021317, + "learning_rate": 9.309491793689431e-06, + "loss": 0.6192, + "step": 1996 + }, + { + "epoch": 0.1943552311435523, + "grad_norm": 1.2423474670669281, + "learning_rate": 9.30869234107933e-06, + "loss": 0.4242, + "step": 1997 + }, + { + "epoch": 0.19445255474452555, + "grad_norm": 1.4117486896728357, + "learning_rate": 9.307892460311825e-06, + "loss": 0.4417, + "step": 1998 + }, + { + "epoch": 0.19454987834549878, + "grad_norm": 1.6605518542896853, + "learning_rate": 9.307092151466397e-06, + "loss": 0.5289, + "step": 1999 + }, + { + "epoch": 0.19464720194647203, + "grad_norm": 1.661933360658536, + "learning_rate": 9.306291414622575e-06, + "loss": 0.3357, + "step": 2000 + }, + { + "epoch": 0.19474452554744526, + "grad_norm": 1.4409618985011814, + "learning_rate": 9.305490249859927e-06, + "loss": 0.4563, + "step": 2001 + }, + { + "epoch": 0.19484184914841848, + "grad_norm": 1.9082899591217046, + "learning_rate": 9.304688657258068e-06, + "loss": 0.3445, + "step": 2002 + }, + { + "epoch": 0.19493917274939174, + "grad_norm": 1.2157434891172034, + "learning_rate": 9.303886636896649e-06, + "loss": 0.3719, + "step": 2003 + }, + { + "epoch": 0.19503649635036496, + "grad_norm": 1.57236888854409, + "learning_rate": 9.303084188855371e-06, + "loss": 0.4399, + "step": 2004 + }, + { + "epoch": 0.1951338199513382, + "grad_norm": 1.4041570559360463, + "learning_rate": 9.302281313213973e-06, + "loss": 0.4442, + "step": 2005 + }, + { + "epoch": 0.19523114355231144, + "grad_norm": 1.595081147428658, + "learning_rate": 9.301478010052237e-06, + "loss": 0.4225, + "step": 2006 + }, + { + "epoch": 0.19532846715328467, + "grad_norm": 1.562924823229517, + "learning_rate": 9.300674279449986e-06, + "loss": 0.3739, + "step": 2007 + }, + { + "epoch": 0.1954257907542579, + "grad_norm": 1.6925679153497177, + "learning_rate": 9.299870121487088e-06, + "loss": 0.4465, + "step": 2008 + }, + { + "epoch": 0.19552311435523115, + "grad_norm": 1.4955175500348226, + "learning_rate": 9.299065536243453e-06, + "loss": 0.5055, + "step": 2009 + }, + { + "epoch": 0.19562043795620437, + "grad_norm": 1.5602814755448668, + "learning_rate": 9.298260523799035e-06, + "loss": 0.4214, + "step": 2010 + }, + { + "epoch": 0.19571776155717763, + "grad_norm": 1.4678189187481074, + "learning_rate": 9.297455084233826e-06, + "loss": 0.4221, + "step": 2011 + }, + { + "epoch": 0.19581508515815085, + "grad_norm": 1.1014848505883976, + "learning_rate": 9.296649217627863e-06, + "loss": 0.2531, + "step": 2012 + }, + { + "epoch": 0.19591240875912408, + "grad_norm": 1.553421501855423, + "learning_rate": 9.295842924061227e-06, + "loss": 0.5409, + "step": 2013 + }, + { + "epoch": 0.19600973236009733, + "grad_norm": 1.598118050761176, + "learning_rate": 9.295036203614039e-06, + "loss": 0.4084, + "step": 2014 + }, + { + "epoch": 0.19610705596107056, + "grad_norm": 1.6278848716274248, + "learning_rate": 9.294229056366464e-06, + "loss": 0.5842, + "step": 2015 + }, + { + "epoch": 0.19620437956204378, + "grad_norm": 1.243515264701947, + "learning_rate": 9.293421482398708e-06, + "loss": 0.3504, + "step": 2016 + }, + { + "epoch": 0.19630170316301704, + "grad_norm": 1.4687425329140307, + "learning_rate": 9.29261348179102e-06, + "loss": 0.2732, + "step": 2017 + }, + { + "epoch": 0.19639902676399026, + "grad_norm": 1.8000259635960119, + "learning_rate": 9.291805054623691e-06, + "loss": 0.7865, + "step": 2018 + }, + { + "epoch": 0.1964963503649635, + "grad_norm": 1.5721673591186547, + "learning_rate": 9.290996200977058e-06, + "loss": 0.5686, + "step": 2019 + }, + { + "epoch": 0.19659367396593674, + "grad_norm": 1.4634877349944297, + "learning_rate": 9.290186920931493e-06, + "loss": 0.4884, + "step": 2020 + }, + { + "epoch": 0.19669099756690997, + "grad_norm": 1.8795352763168436, + "learning_rate": 9.289377214567418e-06, + "loss": 0.279, + "step": 2021 + }, + { + "epoch": 0.19678832116788322, + "grad_norm": 1.2525962570268505, + "learning_rate": 9.288567081965292e-06, + "loss": 0.3003, + "step": 2022 + }, + { + "epoch": 0.19688564476885645, + "grad_norm": 1.4414518188882164, + "learning_rate": 9.28775652320562e-06, + "loss": 0.2883, + "step": 2023 + }, + { + "epoch": 0.19698296836982968, + "grad_norm": 1.1469869990322892, + "learning_rate": 9.286945538368946e-06, + "loss": 0.301, + "step": 2024 + }, + { + "epoch": 0.19708029197080293, + "grad_norm": 1.4386800814955665, + "learning_rate": 9.286134127535859e-06, + "loss": 0.417, + "step": 2025 + }, + { + "epoch": 0.19717761557177615, + "grad_norm": 1.4334168701816348, + "learning_rate": 9.28532229078699e-06, + "loss": 0.4694, + "step": 2026 + }, + { + "epoch": 0.19727493917274938, + "grad_norm": 1.2925159318336792, + "learning_rate": 9.28451002820301e-06, + "loss": 0.4438, + "step": 2027 + }, + { + "epoch": 0.19737226277372263, + "grad_norm": 1.1608723700468837, + "learning_rate": 9.283697339864635e-06, + "loss": 0.3899, + "step": 2028 + }, + { + "epoch": 0.19746958637469586, + "grad_norm": 1.0831308664734243, + "learning_rate": 9.282884225852625e-06, + "loss": 0.3594, + "step": 2029 + }, + { + "epoch": 0.19756690997566909, + "grad_norm": 1.3854325468066278, + "learning_rate": 9.282070686247773e-06, + "loss": 0.5111, + "step": 2030 + }, + { + "epoch": 0.19766423357664234, + "grad_norm": 1.2843702051671877, + "learning_rate": 9.281256721130927e-06, + "loss": 0.3298, + "step": 2031 + }, + { + "epoch": 0.19776155717761557, + "grad_norm": 1.4725158786403292, + "learning_rate": 9.280442330582968e-06, + "loss": 0.4776, + "step": 2032 + }, + { + "epoch": 0.19785888077858882, + "grad_norm": 1.2748346913452204, + "learning_rate": 9.279627514684826e-06, + "loss": 0.4438, + "step": 2033 + }, + { + "epoch": 0.19795620437956205, + "grad_norm": 1.406716290626126, + "learning_rate": 9.278812273517465e-06, + "loss": 0.2814, + "step": 2034 + }, + { + "epoch": 0.19805352798053527, + "grad_norm": 1.3303438388967537, + "learning_rate": 9.2779966071619e-06, + "loss": 0.4314, + "step": 2035 + }, + { + "epoch": 0.19815085158150852, + "grad_norm": 1.4134730169408085, + "learning_rate": 9.277180515699183e-06, + "loss": 0.2764, + "step": 2036 + }, + { + "epoch": 0.19824817518248175, + "grad_norm": 1.3255645305073551, + "learning_rate": 9.276363999210407e-06, + "loss": 0.4347, + "step": 2037 + }, + { + "epoch": 0.19834549878345498, + "grad_norm": 1.4369644328356708, + "learning_rate": 9.275547057776713e-06, + "loss": 0.3551, + "step": 2038 + }, + { + "epoch": 0.19844282238442823, + "grad_norm": 1.748281657046459, + "learning_rate": 9.27472969147928e-06, + "loss": 0.4372, + "step": 2039 + }, + { + "epoch": 0.19854014598540146, + "grad_norm": 1.2795189118800725, + "learning_rate": 9.273911900399331e-06, + "loss": 0.4431, + "step": 2040 + }, + { + "epoch": 0.1986374695863747, + "grad_norm": 1.165526474375854, + "learning_rate": 9.273093684618129e-06, + "loss": 0.2936, + "step": 2041 + }, + { + "epoch": 0.19873479318734794, + "grad_norm": 1.6068781771010836, + "learning_rate": 9.272275044216981e-06, + "loss": 0.5125, + "step": 2042 + }, + { + "epoch": 0.19883211678832116, + "grad_norm": 1.4210491087425543, + "learning_rate": 9.271455979277234e-06, + "loss": 0.4142, + "step": 2043 + }, + { + "epoch": 0.19892944038929442, + "grad_norm": 1.6609287753373938, + "learning_rate": 9.270636489880283e-06, + "loss": 0.6728, + "step": 2044 + }, + { + "epoch": 0.19902676399026764, + "grad_norm": 1.3902108507987736, + "learning_rate": 9.26981657610756e-06, + "loss": 0.3492, + "step": 2045 + }, + { + "epoch": 0.19912408759124087, + "grad_norm": 1.6316422644879316, + "learning_rate": 9.268996238040537e-06, + "loss": 0.5029, + "step": 2046 + }, + { + "epoch": 0.19922141119221412, + "grad_norm": 1.2841836791466006, + "learning_rate": 9.268175475760734e-06, + "loss": 0.3849, + "step": 2047 + }, + { + "epoch": 0.19931873479318735, + "grad_norm": 1.319713524379575, + "learning_rate": 9.267354289349712e-06, + "loss": 0.4439, + "step": 2048 + }, + { + "epoch": 0.19941605839416057, + "grad_norm": 1.3549935774985267, + "learning_rate": 9.266532678889071e-06, + "loss": 0.4382, + "step": 2049 + }, + { + "epoch": 0.19951338199513383, + "grad_norm": 1.8518976479625036, + "learning_rate": 9.265710644460455e-06, + "loss": 0.8216, + "step": 2050 + }, + { + "epoch": 0.19961070559610705, + "grad_norm": 1.9509154982810264, + "learning_rate": 9.26488818614555e-06, + "loss": 0.4607, + "step": 2051 + }, + { + "epoch": 0.1997080291970803, + "grad_norm": 1.2954164138913125, + "learning_rate": 9.264065304026087e-06, + "loss": 0.4257, + "step": 2052 + }, + { + "epoch": 0.19980535279805353, + "grad_norm": 1.925685176039115, + "learning_rate": 9.26324199818383e-06, + "loss": 0.6025, + "step": 2053 + }, + { + "epoch": 0.19990267639902676, + "grad_norm": 1.533947029174009, + "learning_rate": 9.262418268700596e-06, + "loss": 0.5443, + "step": 2054 + }, + { + "epoch": 0.2, + "grad_norm": 1.4995274594175463, + "learning_rate": 9.26159411565824e-06, + "loss": 0.5023, + "step": 2055 + }, + { + "epoch": 0.20009732360097324, + "grad_norm": 1.4350182215101954, + "learning_rate": 9.26076953913866e-06, + "loss": 0.3726, + "step": 2056 + }, + { + "epoch": 0.20019464720194646, + "grad_norm": 1.3019491914952392, + "learning_rate": 9.259944539223788e-06, + "loss": 0.4765, + "step": 2057 + }, + { + "epoch": 0.20029197080291972, + "grad_norm": 1.3884509805578256, + "learning_rate": 9.25911911599561e-06, + "loss": 0.338, + "step": 2058 + }, + { + "epoch": 0.20038929440389294, + "grad_norm": 1.488048064619486, + "learning_rate": 9.258293269536146e-06, + "loss": 0.5872, + "step": 2059 + }, + { + "epoch": 0.20048661800486617, + "grad_norm": 1.1548733119099643, + "learning_rate": 9.257466999927464e-06, + "loss": 0.3242, + "step": 2060 + }, + { + "epoch": 0.20058394160583942, + "grad_norm": 1.048222542797774, + "learning_rate": 9.25664030725167e-06, + "loss": 0.3253, + "step": 2061 + }, + { + "epoch": 0.20068126520681265, + "grad_norm": 1.211590892113714, + "learning_rate": 9.255813191590912e-06, + "loss": 0.3414, + "step": 2062 + }, + { + "epoch": 0.2007785888077859, + "grad_norm": 1.3770802107798175, + "learning_rate": 9.254985653027382e-06, + "loss": 0.4031, + "step": 2063 + }, + { + "epoch": 0.20087591240875913, + "grad_norm": 1.4503315973945832, + "learning_rate": 9.25415769164331e-06, + "loss": 0.4799, + "step": 2064 + }, + { + "epoch": 0.20097323600973235, + "grad_norm": 1.3613570222565128, + "learning_rate": 9.253329307520976e-06, + "loss": 0.3932, + "step": 2065 + }, + { + "epoch": 0.2010705596107056, + "grad_norm": 1.436956883536887, + "learning_rate": 9.252500500742692e-06, + "loss": 0.51, + "step": 2066 + }, + { + "epoch": 0.20116788321167883, + "grad_norm": 1.3042874208229347, + "learning_rate": 9.25167127139082e-06, + "loss": 0.3702, + "step": 2067 + }, + { + "epoch": 0.20126520681265206, + "grad_norm": 1.4601934649693376, + "learning_rate": 9.250841619547762e-06, + "loss": 0.3927, + "step": 2068 + }, + { + "epoch": 0.2013625304136253, + "grad_norm": 1.4877017036692342, + "learning_rate": 9.250011545295959e-06, + "loss": 0.5463, + "step": 2069 + }, + { + "epoch": 0.20145985401459854, + "grad_norm": 1.3385891837902342, + "learning_rate": 9.249181048717895e-06, + "loss": 0.3052, + "step": 2070 + }, + { + "epoch": 0.20155717761557176, + "grad_norm": 1.111892744483471, + "learning_rate": 9.2483501298961e-06, + "loss": 0.2342, + "step": 2071 + }, + { + "epoch": 0.20165450121654502, + "grad_norm": 1.4336755713622584, + "learning_rate": 9.247518788913141e-06, + "loss": 0.4416, + "step": 2072 + }, + { + "epoch": 0.20175182481751824, + "grad_norm": 1.4682039909825075, + "learning_rate": 9.246687025851629e-06, + "loss": 0.3044, + "step": 2073 + }, + { + "epoch": 0.2018491484184915, + "grad_norm": 1.1356161216510552, + "learning_rate": 9.245854840794217e-06, + "loss": 0.2913, + "step": 2074 + }, + { + "epoch": 0.20194647201946472, + "grad_norm": 1.2497989015941582, + "learning_rate": 9.2450222338236e-06, + "loss": 0.356, + "step": 2075 + }, + { + "epoch": 0.20204379562043795, + "grad_norm": 1.4662802201560914, + "learning_rate": 9.244189205022514e-06, + "loss": 0.5234, + "step": 2076 + }, + { + "epoch": 0.2021411192214112, + "grad_norm": 1.1493994388606168, + "learning_rate": 9.243355754473738e-06, + "loss": 0.3862, + "step": 2077 + }, + { + "epoch": 0.20223844282238443, + "grad_norm": 1.1352456631925198, + "learning_rate": 9.242521882260093e-06, + "loss": 0.3693, + "step": 2078 + }, + { + "epoch": 0.20233576642335765, + "grad_norm": 1.4112847797443164, + "learning_rate": 9.24168758846444e-06, + "loss": 0.4667, + "step": 2079 + }, + { + "epoch": 0.2024330900243309, + "grad_norm": 1.9587086933310962, + "learning_rate": 9.240852873169686e-06, + "loss": 0.5446, + "step": 2080 + }, + { + "epoch": 0.20253041362530413, + "grad_norm": 1.4532595336328356, + "learning_rate": 9.240017736458772e-06, + "loss": 0.56, + "step": 2081 + }, + { + "epoch": 0.20262773722627736, + "grad_norm": 1.1373158358211433, + "learning_rate": 9.239182178414694e-06, + "loss": 0.3998, + "step": 2082 + }, + { + "epoch": 0.2027250608272506, + "grad_norm": 1.4892855081953407, + "learning_rate": 9.238346199120473e-06, + "loss": 0.5564, + "step": 2083 + }, + { + "epoch": 0.20282238442822384, + "grad_norm": 1.4122351541601532, + "learning_rate": 9.237509798659188e-06, + "loss": 0.4407, + "step": 2084 + }, + { + "epoch": 0.2029197080291971, + "grad_norm": 1.266747153803517, + "learning_rate": 9.236672977113948e-06, + "loss": 0.3898, + "step": 2085 + }, + { + "epoch": 0.20301703163017032, + "grad_norm": 1.3972737248894866, + "learning_rate": 9.23583573456791e-06, + "loss": 0.4855, + "step": 2086 + }, + { + "epoch": 0.20311435523114355, + "grad_norm": 1.6424190339871019, + "learning_rate": 9.234998071104272e-06, + "loss": 0.732, + "step": 2087 + }, + { + "epoch": 0.2032116788321168, + "grad_norm": 1.4973722328869334, + "learning_rate": 9.234159986806275e-06, + "loss": 0.4796, + "step": 2088 + }, + { + "epoch": 0.20330900243309002, + "grad_norm": 1.5629802728678386, + "learning_rate": 9.233321481757196e-06, + "loss": 0.4762, + "step": 2089 + }, + { + "epoch": 0.20340632603406325, + "grad_norm": 1.5273353205689704, + "learning_rate": 9.23248255604036e-06, + "loss": 0.6446, + "step": 2090 + }, + { + "epoch": 0.2035036496350365, + "grad_norm": 1.3835329237350877, + "learning_rate": 9.231643209739128e-06, + "loss": 0.5297, + "step": 2091 + }, + { + "epoch": 0.20360097323600973, + "grad_norm": 1.2187102873763251, + "learning_rate": 9.230803442936911e-06, + "loss": 0.3727, + "step": 2092 + }, + { + "epoch": 0.20369829683698296, + "grad_norm": 1.325749011032711, + "learning_rate": 9.229963255717156e-06, + "loss": 0.5476, + "step": 2093 + }, + { + "epoch": 0.2037956204379562, + "grad_norm": 1.1246093495598513, + "learning_rate": 9.229122648163351e-06, + "loss": 0.3309, + "step": 2094 + }, + { + "epoch": 0.20389294403892944, + "grad_norm": 1.3415111254139396, + "learning_rate": 9.22828162035903e-06, + "loss": 0.4226, + "step": 2095 + }, + { + "epoch": 0.2039902676399027, + "grad_norm": 1.2431047519820402, + "learning_rate": 9.227440172387766e-06, + "loss": 0.2364, + "step": 2096 + }, + { + "epoch": 0.20408759124087592, + "grad_norm": 1.59824202042343, + "learning_rate": 9.226598304333175e-06, + "loss": 0.5713, + "step": 2097 + }, + { + "epoch": 0.20418491484184914, + "grad_norm": 1.3718145057357327, + "learning_rate": 9.22575601627891e-06, + "loss": 0.4366, + "step": 2098 + }, + { + "epoch": 0.2042822384428224, + "grad_norm": 1.8310954422547832, + "learning_rate": 9.224913308308672e-06, + "loss": 0.4098, + "step": 2099 + }, + { + "epoch": 0.20437956204379562, + "grad_norm": 1.3433956299970118, + "learning_rate": 9.224070180506202e-06, + "loss": 0.2959, + "step": 2100 + }, + { + "epoch": 0.20447688564476885, + "grad_norm": 1.0277615122037833, + "learning_rate": 9.223226632955283e-06, + "loss": 0.265, + "step": 2101 + }, + { + "epoch": 0.2045742092457421, + "grad_norm": 1.2285380399323877, + "learning_rate": 9.222382665739737e-06, + "loss": 0.3844, + "step": 2102 + }, + { + "epoch": 0.20467153284671533, + "grad_norm": 1.1151094116106592, + "learning_rate": 9.221538278943432e-06, + "loss": 0.2461, + "step": 2103 + }, + { + "epoch": 0.20476885644768855, + "grad_norm": 1.5239102143699876, + "learning_rate": 9.22069347265027e-06, + "loss": 0.4239, + "step": 2104 + }, + { + "epoch": 0.2048661800486618, + "grad_norm": 1.6502658051911525, + "learning_rate": 9.219848246944206e-06, + "loss": 0.6723, + "step": 2105 + }, + { + "epoch": 0.20496350364963503, + "grad_norm": 1.638974040274465, + "learning_rate": 9.219002601909229e-06, + "loss": 0.5068, + "step": 2106 + }, + { + "epoch": 0.20506082725060829, + "grad_norm": 1.4649352184984061, + "learning_rate": 9.218156537629368e-06, + "loss": 0.4698, + "step": 2107 + }, + { + "epoch": 0.2051581508515815, + "grad_norm": 1.5070786345583258, + "learning_rate": 9.217310054188699e-06, + "loss": 0.4654, + "step": 2108 + }, + { + "epoch": 0.20525547445255474, + "grad_norm": 1.2480947756940115, + "learning_rate": 9.216463151671338e-06, + "loss": 0.3614, + "step": 2109 + }, + { + "epoch": 0.205352798053528, + "grad_norm": 1.6536121595263205, + "learning_rate": 9.215615830161443e-06, + "loss": 0.5872, + "step": 2110 + }, + { + "epoch": 0.20545012165450122, + "grad_norm": 1.5559546132859907, + "learning_rate": 9.214768089743211e-06, + "loss": 0.5098, + "step": 2111 + }, + { + "epoch": 0.20554744525547444, + "grad_norm": 1.5691593927804695, + "learning_rate": 9.213919930500884e-06, + "loss": 0.3845, + "step": 2112 + }, + { + "epoch": 0.2056447688564477, + "grad_norm": 1.4385010923740136, + "learning_rate": 9.213071352518744e-06, + "loss": 0.4035, + "step": 2113 + }, + { + "epoch": 0.20574209245742092, + "grad_norm": 1.2415148755341134, + "learning_rate": 9.212222355881111e-06, + "loss": 0.2503, + "step": 2114 + }, + { + "epoch": 0.20583941605839415, + "grad_norm": 1.597224767194554, + "learning_rate": 9.211372940672356e-06, + "loss": 0.3831, + "step": 2115 + }, + { + "epoch": 0.2059367396593674, + "grad_norm": 1.3936071245663937, + "learning_rate": 9.210523106976884e-06, + "loss": 0.3664, + "step": 2116 + }, + { + "epoch": 0.20603406326034063, + "grad_norm": 1.4335641468120297, + "learning_rate": 9.209672854879142e-06, + "loss": 0.3182, + "step": 2117 + }, + { + "epoch": 0.20613138686131388, + "grad_norm": 1.2544256067640176, + "learning_rate": 9.20882218446362e-06, + "loss": 0.2678, + "step": 2118 + }, + { + "epoch": 0.2062287104622871, + "grad_norm": 1.4867246001264303, + "learning_rate": 9.207971095814852e-06, + "loss": 0.4934, + "step": 2119 + }, + { + "epoch": 0.20632603406326033, + "grad_norm": 1.5387304887069146, + "learning_rate": 9.207119589017408e-06, + "loss": 0.4552, + "step": 2120 + }, + { + "epoch": 0.2064233576642336, + "grad_norm": 1.507156387441411, + "learning_rate": 9.206267664155906e-06, + "loss": 0.4209, + "step": 2121 + }, + { + "epoch": 0.2065206812652068, + "grad_norm": 1.3407732350308024, + "learning_rate": 9.205415321315e-06, + "loss": 0.4256, + "step": 2122 + }, + { + "epoch": 0.20661800486618004, + "grad_norm": 1.6313949345186305, + "learning_rate": 9.20456256057939e-06, + "loss": 0.4727, + "step": 2123 + }, + { + "epoch": 0.2067153284671533, + "grad_norm": 1.695026004332969, + "learning_rate": 9.203709382033814e-06, + "loss": 0.6547, + "step": 2124 + }, + { + "epoch": 0.20681265206812652, + "grad_norm": 1.5677721722384952, + "learning_rate": 9.202855785763053e-06, + "loss": 0.4469, + "step": 2125 + }, + { + "epoch": 0.20690997566909974, + "grad_norm": 1.4276579746412523, + "learning_rate": 9.202001771851928e-06, + "loss": 0.4511, + "step": 2126 + }, + { + "epoch": 0.207007299270073, + "grad_norm": 1.365652083209099, + "learning_rate": 9.201147340385304e-06, + "loss": 0.4435, + "step": 2127 + }, + { + "epoch": 0.20710462287104622, + "grad_norm": 1.4014399599326692, + "learning_rate": 9.200292491448086e-06, + "loss": 0.4017, + "step": 2128 + }, + { + "epoch": 0.20720194647201948, + "grad_norm": 1.4131798281318602, + "learning_rate": 9.199437225125223e-06, + "loss": 0.2781, + "step": 2129 + }, + { + "epoch": 0.2072992700729927, + "grad_norm": 1.3392698432345278, + "learning_rate": 9.198581541501702e-06, + "loss": 0.3576, + "step": 2130 + }, + { + "epoch": 0.20739659367396593, + "grad_norm": 1.2859171090531423, + "learning_rate": 9.197725440662552e-06, + "loss": 0.4505, + "step": 2131 + }, + { + "epoch": 0.20749391727493918, + "grad_norm": 1.3075221898254676, + "learning_rate": 9.196868922692845e-06, + "loss": 0.42, + "step": 2132 + }, + { + "epoch": 0.2075912408759124, + "grad_norm": 1.3120969425940014, + "learning_rate": 9.196011987677693e-06, + "loss": 0.3918, + "step": 2133 + }, + { + "epoch": 0.20768856447688563, + "grad_norm": 1.2917866907447901, + "learning_rate": 9.19515463570225e-06, + "loss": 0.4515, + "step": 2134 + }, + { + "epoch": 0.2077858880778589, + "grad_norm": 1.4964227937052923, + "learning_rate": 9.194296866851714e-06, + "loss": 0.4007, + "step": 2135 + }, + { + "epoch": 0.2078832116788321, + "grad_norm": 1.4096694486456338, + "learning_rate": 9.19343868121132e-06, + "loss": 0.5684, + "step": 2136 + }, + { + "epoch": 0.20798053527980534, + "grad_norm": 1.1303877036272907, + "learning_rate": 9.192580078866346e-06, + "loss": 0.2661, + "step": 2137 + }, + { + "epoch": 0.2080778588807786, + "grad_norm": 1.4056619474271335, + "learning_rate": 9.191721059902112e-06, + "loss": 0.4174, + "step": 2138 + }, + { + "epoch": 0.20817518248175182, + "grad_norm": 1.7142064467904727, + "learning_rate": 9.190861624403981e-06, + "loss": 0.4453, + "step": 2139 + }, + { + "epoch": 0.20827250608272507, + "grad_norm": 1.3293557691236777, + "learning_rate": 9.190001772457356e-06, + "loss": 0.4541, + "step": 2140 + }, + { + "epoch": 0.2083698296836983, + "grad_norm": 1.6131133576379075, + "learning_rate": 9.189141504147676e-06, + "loss": 0.3751, + "step": 2141 + }, + { + "epoch": 0.20846715328467152, + "grad_norm": 1.509737357483189, + "learning_rate": 9.188280819560431e-06, + "loss": 0.4757, + "step": 2142 + }, + { + "epoch": 0.20856447688564478, + "grad_norm": 1.479538114231473, + "learning_rate": 9.187419718781149e-06, + "loss": 0.3243, + "step": 2143 + }, + { + "epoch": 0.208661800486618, + "grad_norm": 1.4973982658919327, + "learning_rate": 9.186558201895395e-06, + "loss": 0.3732, + "step": 2144 + }, + { + "epoch": 0.20875912408759123, + "grad_norm": 1.5121453838943797, + "learning_rate": 9.185696268988777e-06, + "loss": 0.5435, + "step": 2145 + }, + { + "epoch": 0.20885644768856448, + "grad_norm": 1.7349033410138828, + "learning_rate": 9.18483392014695e-06, + "loss": 0.6415, + "step": 2146 + }, + { + "epoch": 0.2089537712895377, + "grad_norm": 1.4812330220855032, + "learning_rate": 9.183971155455602e-06, + "loss": 0.4961, + "step": 2147 + }, + { + "epoch": 0.20905109489051094, + "grad_norm": 1.5121767597167877, + "learning_rate": 9.183107975000472e-06, + "loss": 0.5298, + "step": 2148 + }, + { + "epoch": 0.2091484184914842, + "grad_norm": 1.5424817825799644, + "learning_rate": 9.18224437886733e-06, + "loss": 0.4577, + "step": 2149 + }, + { + "epoch": 0.20924574209245742, + "grad_norm": 1.2733853569354763, + "learning_rate": 9.181380367141991e-06, + "loss": 0.3306, + "step": 2150 + }, + { + "epoch": 0.20934306569343067, + "grad_norm": 1.1384650904715041, + "learning_rate": 9.180515939910317e-06, + "loss": 0.3831, + "step": 2151 + }, + { + "epoch": 0.2094403892944039, + "grad_norm": 1.3798308474076018, + "learning_rate": 9.179651097258204e-06, + "loss": 0.4629, + "step": 2152 + }, + { + "epoch": 0.20953771289537712, + "grad_norm": 1.4059733648531154, + "learning_rate": 9.178785839271593e-06, + "loss": 0.4526, + "step": 2153 + }, + { + "epoch": 0.20963503649635037, + "grad_norm": 1.581039004516103, + "learning_rate": 9.177920166036464e-06, + "loss": 0.5397, + "step": 2154 + }, + { + "epoch": 0.2097323600973236, + "grad_norm": 1.4851118969101265, + "learning_rate": 9.17705407763884e-06, + "loss": 0.5052, + "step": 2155 + }, + { + "epoch": 0.20982968369829683, + "grad_norm": 1.3633687775503893, + "learning_rate": 9.176187574164785e-06, + "loss": 0.4427, + "step": 2156 + }, + { + "epoch": 0.20992700729927008, + "grad_norm": 1.360319094739405, + "learning_rate": 9.175320655700407e-06, + "loss": 0.3649, + "step": 2157 + }, + { + "epoch": 0.2100243309002433, + "grad_norm": 1.3829673206277566, + "learning_rate": 9.174453322331844e-06, + "loss": 0.3536, + "step": 2158 + }, + { + "epoch": 0.21012165450121653, + "grad_norm": 1.5804059757696094, + "learning_rate": 9.173585574145292e-06, + "loss": 0.5937, + "step": 2159 + }, + { + "epoch": 0.21021897810218979, + "grad_norm": 1.4991084469228289, + "learning_rate": 9.172717411226975e-06, + "loss": 0.3523, + "step": 2160 + }, + { + "epoch": 0.210316301703163, + "grad_norm": 1.4762289487935065, + "learning_rate": 9.171848833663165e-06, + "loss": 0.4991, + "step": 2161 + }, + { + "epoch": 0.21041362530413626, + "grad_norm": 1.4858484283610454, + "learning_rate": 9.17097984154017e-06, + "loss": 0.5153, + "step": 2162 + }, + { + "epoch": 0.2105109489051095, + "grad_norm": 1.2647097068290445, + "learning_rate": 9.170110434944345e-06, + "loss": 0.3193, + "step": 2163 + }, + { + "epoch": 0.21060827250608272, + "grad_norm": 1.6889738075479466, + "learning_rate": 9.169240613962086e-06, + "loss": 0.4755, + "step": 2164 + }, + { + "epoch": 0.21070559610705597, + "grad_norm": 1.6464662019172414, + "learning_rate": 9.168370378679821e-06, + "loss": 0.5303, + "step": 2165 + }, + { + "epoch": 0.2108029197080292, + "grad_norm": 1.287927301108519, + "learning_rate": 9.16749972918403e-06, + "loss": 0.3231, + "step": 2166 + }, + { + "epoch": 0.21090024330900242, + "grad_norm": 1.378935902738664, + "learning_rate": 9.16662866556123e-06, + "loss": 0.4654, + "step": 2167 + }, + { + "epoch": 0.21099756690997568, + "grad_norm": 1.415652566603492, + "learning_rate": 9.16575718789798e-06, + "loss": 0.42, + "step": 2168 + }, + { + "epoch": 0.2110948905109489, + "grad_norm": 1.189498123796033, + "learning_rate": 9.164885296280875e-06, + "loss": 0.3529, + "step": 2169 + }, + { + "epoch": 0.21119221411192213, + "grad_norm": 1.5371351227791108, + "learning_rate": 9.16401299079656e-06, + "loss": 0.4679, + "step": 2170 + }, + { + "epoch": 0.21128953771289538, + "grad_norm": 1.2493790037654902, + "learning_rate": 9.163140271531714e-06, + "loss": 0.3793, + "step": 2171 + }, + { + "epoch": 0.2113868613138686, + "grad_norm": 1.3836947713855836, + "learning_rate": 9.16226713857306e-06, + "loss": 0.436, + "step": 2172 + }, + { + "epoch": 0.21148418491484186, + "grad_norm": 1.583280621035993, + "learning_rate": 9.161393592007364e-06, + "loss": 0.5673, + "step": 2173 + }, + { + "epoch": 0.2115815085158151, + "grad_norm": 1.336076606512916, + "learning_rate": 9.160519631921427e-06, + "loss": 0.418, + "step": 2174 + }, + { + "epoch": 0.2116788321167883, + "grad_norm": 1.5539773056945747, + "learning_rate": 9.159645258402098e-06, + "loss": 0.4417, + "step": 2175 + }, + { + "epoch": 0.21177615571776157, + "grad_norm": 1.35099904216899, + "learning_rate": 9.158770471536261e-06, + "loss": 0.4389, + "step": 2176 + }, + { + "epoch": 0.2118734793187348, + "grad_norm": 1.5960801985245197, + "learning_rate": 9.157895271410848e-06, + "loss": 0.4444, + "step": 2177 + }, + { + "epoch": 0.21197080291970802, + "grad_norm": 1.343338393224711, + "learning_rate": 9.157019658112825e-06, + "loss": 0.3867, + "step": 2178 + }, + { + "epoch": 0.21206812652068127, + "grad_norm": 1.573040163695098, + "learning_rate": 9.156143631729205e-06, + "loss": 0.5564, + "step": 2179 + }, + { + "epoch": 0.2121654501216545, + "grad_norm": 1.477194998770335, + "learning_rate": 9.155267192347037e-06, + "loss": 0.5053, + "step": 2180 + }, + { + "epoch": 0.21226277372262772, + "grad_norm": 1.4697445687746653, + "learning_rate": 9.154390340053414e-06, + "loss": 0.4462, + "step": 2181 + }, + { + "epoch": 0.21236009732360098, + "grad_norm": 1.2383233673923462, + "learning_rate": 9.15351307493547e-06, + "loss": 0.4023, + "step": 2182 + }, + { + "epoch": 0.2124574209245742, + "grad_norm": 1.73929160255024, + "learning_rate": 9.152635397080377e-06, + "loss": 0.456, + "step": 2183 + }, + { + "epoch": 0.21255474452554746, + "grad_norm": 1.814215933055299, + "learning_rate": 9.151757306575354e-06, + "loss": 0.5283, + "step": 2184 + }, + { + "epoch": 0.21265206812652068, + "grad_norm": 1.440140413882406, + "learning_rate": 9.150878803507655e-06, + "loss": 0.4754, + "step": 2185 + }, + { + "epoch": 0.2127493917274939, + "grad_norm": 1.4991761170210094, + "learning_rate": 9.149999887964577e-06, + "loss": 0.4244, + "step": 2186 + }, + { + "epoch": 0.21284671532846716, + "grad_norm": 1.6045542244692401, + "learning_rate": 9.149120560033461e-06, + "loss": 0.4149, + "step": 2187 + }, + { + "epoch": 0.2129440389294404, + "grad_norm": 1.6999406355422166, + "learning_rate": 9.148240819801684e-06, + "loss": 0.7227, + "step": 2188 + }, + { + "epoch": 0.2130413625304136, + "grad_norm": 1.5383336234101048, + "learning_rate": 9.147360667356667e-06, + "loss": 0.4102, + "step": 2189 + }, + { + "epoch": 0.21313868613138687, + "grad_norm": 1.3100772476716567, + "learning_rate": 9.146480102785871e-06, + "loss": 0.4001, + "step": 2190 + }, + { + "epoch": 0.2132360097323601, + "grad_norm": 1.2113504505529646, + "learning_rate": 9.1455991261768e-06, + "loss": 0.3906, + "step": 2191 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 2.1524156395732996, + "learning_rate": 9.144717737616994e-06, + "loss": 0.3722, + "step": 2192 + }, + { + "epoch": 0.21343065693430657, + "grad_norm": 1.3156410053212892, + "learning_rate": 9.143835937194039e-06, + "loss": 0.414, + "step": 2193 + }, + { + "epoch": 0.2135279805352798, + "grad_norm": 1.382537469614808, + "learning_rate": 9.14295372499556e-06, + "loss": 0.3687, + "step": 2194 + }, + { + "epoch": 0.21362530413625305, + "grad_norm": 1.4106617705657403, + "learning_rate": 9.142071101109224e-06, + "loss": 0.2515, + "step": 2195 + }, + { + "epoch": 0.21372262773722628, + "grad_norm": 1.4292530170893925, + "learning_rate": 9.141188065622736e-06, + "loss": 0.4671, + "step": 2196 + }, + { + "epoch": 0.2138199513381995, + "grad_norm": 1.371262803483025, + "learning_rate": 9.140304618623844e-06, + "loss": 0.4397, + "step": 2197 + }, + { + "epoch": 0.21391727493917276, + "grad_norm": 1.3337172412513854, + "learning_rate": 9.13942076020034e-06, + "loss": 0.4518, + "step": 2198 + }, + { + "epoch": 0.21401459854014598, + "grad_norm": 1.195478639712577, + "learning_rate": 9.138536490440046e-06, + "loss": 0.3236, + "step": 2199 + }, + { + "epoch": 0.2141119221411192, + "grad_norm": 1.6207375008593756, + "learning_rate": 9.13765180943084e-06, + "loss": 0.5147, + "step": 2200 + }, + { + "epoch": 0.21420924574209246, + "grad_norm": 1.457360033672521, + "learning_rate": 9.136766717260631e-06, + "loss": 0.3228, + "step": 2201 + }, + { + "epoch": 0.2143065693430657, + "grad_norm": 1.2314544120773039, + "learning_rate": 9.13588121401737e-06, + "loss": 0.3413, + "step": 2202 + }, + { + "epoch": 0.21440389294403894, + "grad_norm": 1.3614880154600904, + "learning_rate": 9.13499529978905e-06, + "loss": 0.3902, + "step": 2203 + }, + { + "epoch": 0.21450121654501217, + "grad_norm": 1.3431981306372034, + "learning_rate": 9.134108974663707e-06, + "loss": 0.4893, + "step": 2204 + }, + { + "epoch": 0.2145985401459854, + "grad_norm": 1.346114362934121, + "learning_rate": 9.133222238729414e-06, + "loss": 0.4195, + "step": 2205 + }, + { + "epoch": 0.21469586374695865, + "grad_norm": 1.2405202461045035, + "learning_rate": 9.132335092074285e-06, + "loss": 0.4373, + "step": 2206 + }, + { + "epoch": 0.21479318734793187, + "grad_norm": 1.2952176269832685, + "learning_rate": 9.131447534786478e-06, + "loss": 0.3253, + "step": 2207 + }, + { + "epoch": 0.2148905109489051, + "grad_norm": 1.3497804127584312, + "learning_rate": 9.130559566954191e-06, + "loss": 0.4401, + "step": 2208 + }, + { + "epoch": 0.21498783454987835, + "grad_norm": 1.6094605506454212, + "learning_rate": 9.129671188665661e-06, + "loss": 0.5943, + "step": 2209 + }, + { + "epoch": 0.21508515815085158, + "grad_norm": 1.7393737788578179, + "learning_rate": 9.128782400009167e-06, + "loss": 0.6832, + "step": 2210 + }, + { + "epoch": 0.2151824817518248, + "grad_norm": 1.2888456219960003, + "learning_rate": 9.127893201073028e-06, + "loss": 0.4449, + "step": 2211 + }, + { + "epoch": 0.21527980535279806, + "grad_norm": 1.6231451452368957, + "learning_rate": 9.127003591945605e-06, + "loss": 0.6579, + "step": 2212 + }, + { + "epoch": 0.21537712895377129, + "grad_norm": 1.4013330754504585, + "learning_rate": 9.126113572715296e-06, + "loss": 0.5072, + "step": 2213 + }, + { + "epoch": 0.21547445255474454, + "grad_norm": 1.1928349667862592, + "learning_rate": 9.125223143470547e-06, + "loss": 0.2896, + "step": 2214 + }, + { + "epoch": 0.21557177615571776, + "grad_norm": 1.3027255903002162, + "learning_rate": 9.124332304299838e-06, + "loss": 0.3076, + "step": 2215 + }, + { + "epoch": 0.215669099756691, + "grad_norm": 1.6527022746103417, + "learning_rate": 9.123441055291694e-06, + "loss": 0.4688, + "step": 2216 + }, + { + "epoch": 0.21576642335766424, + "grad_norm": 1.3197927862863625, + "learning_rate": 9.122549396534676e-06, + "loss": 0.318, + "step": 2217 + }, + { + "epoch": 0.21586374695863747, + "grad_norm": 1.5297610770776902, + "learning_rate": 9.121657328117392e-06, + "loss": 0.6176, + "step": 2218 + }, + { + "epoch": 0.2159610705596107, + "grad_norm": 1.338041823259507, + "learning_rate": 9.120764850128486e-06, + "loss": 0.3941, + "step": 2219 + }, + { + "epoch": 0.21605839416058395, + "grad_norm": 1.200858421054794, + "learning_rate": 9.119871962656644e-06, + "loss": 0.3758, + "step": 2220 + }, + { + "epoch": 0.21615571776155718, + "grad_norm": 1.5023816592412242, + "learning_rate": 9.118978665790592e-06, + "loss": 0.5032, + "step": 2221 + }, + { + "epoch": 0.2162530413625304, + "grad_norm": 1.2258656459952086, + "learning_rate": 9.118084959619099e-06, + "loss": 0.4489, + "step": 2222 + }, + { + "epoch": 0.21635036496350366, + "grad_norm": 1.717063075964899, + "learning_rate": 9.117190844230971e-06, + "loss": 0.7762, + "step": 2223 + }, + { + "epoch": 0.21644768856447688, + "grad_norm": 1.210140433555958, + "learning_rate": 9.11629631971506e-06, + "loss": 0.4431, + "step": 2224 + }, + { + "epoch": 0.21654501216545013, + "grad_norm": 1.4188251693910732, + "learning_rate": 9.115401386160252e-06, + "loss": 0.3495, + "step": 2225 + }, + { + "epoch": 0.21664233576642336, + "grad_norm": 2.073136961715272, + "learning_rate": 9.11450604365548e-06, + "loss": 0.4268, + "step": 2226 + }, + { + "epoch": 0.2167396593673966, + "grad_norm": 1.5265588328884594, + "learning_rate": 9.113610292289714e-06, + "loss": 0.4303, + "step": 2227 + }, + { + "epoch": 0.21683698296836984, + "grad_norm": 1.3220401272995868, + "learning_rate": 9.112714132151963e-06, + "loss": 0.4221, + "step": 2228 + }, + { + "epoch": 0.21693430656934307, + "grad_norm": 1.4088441022230214, + "learning_rate": 9.111817563331282e-06, + "loss": 0.1886, + "step": 2229 + }, + { + "epoch": 0.2170316301703163, + "grad_norm": 1.3947572498286958, + "learning_rate": 9.110920585916763e-06, + "loss": 0.353, + "step": 2230 + }, + { + "epoch": 0.21712895377128955, + "grad_norm": 1.2369368803593181, + "learning_rate": 9.110023199997537e-06, + "loss": 0.2576, + "step": 2231 + }, + { + "epoch": 0.21722627737226277, + "grad_norm": 1.1860471672244592, + "learning_rate": 9.10912540566278e-06, + "loss": 0.3994, + "step": 2232 + }, + { + "epoch": 0.217323600973236, + "grad_norm": 1.309576411449957, + "learning_rate": 9.108227203001708e-06, + "loss": 0.4453, + "step": 2233 + }, + { + "epoch": 0.21742092457420925, + "grad_norm": 1.6554896930775824, + "learning_rate": 9.10732859210357e-06, + "loss": 0.589, + "step": 2234 + }, + { + "epoch": 0.21751824817518248, + "grad_norm": 1.761859219992272, + "learning_rate": 9.106429573057666e-06, + "loss": 0.726, + "step": 2235 + }, + { + "epoch": 0.21761557177615573, + "grad_norm": 1.35833156484165, + "learning_rate": 9.105530145953335e-06, + "loss": 0.4012, + "step": 2236 + }, + { + "epoch": 0.21771289537712896, + "grad_norm": 3.4502529438559884, + "learning_rate": 9.104630310879944e-06, + "loss": 0.4621, + "step": 2237 + }, + { + "epoch": 0.21781021897810218, + "grad_norm": 1.3357957463599541, + "learning_rate": 9.103730067926922e-06, + "loss": 0.317, + "step": 2238 + }, + { + "epoch": 0.21790754257907544, + "grad_norm": 1.3566642568052916, + "learning_rate": 9.102829417183716e-06, + "loss": 0.4245, + "step": 2239 + }, + { + "epoch": 0.21800486618004866, + "grad_norm": 1.673808040965782, + "learning_rate": 9.10192835873983e-06, + "loss": 0.6908, + "step": 2240 + }, + { + "epoch": 0.2181021897810219, + "grad_norm": 1.8194308130790637, + "learning_rate": 9.101026892684804e-06, + "loss": 0.5157, + "step": 2241 + }, + { + "epoch": 0.21819951338199514, + "grad_norm": 1.4443029228393756, + "learning_rate": 9.100125019108214e-06, + "loss": 0.5417, + "step": 2242 + }, + { + "epoch": 0.21829683698296837, + "grad_norm": 1.4594341846039764, + "learning_rate": 9.099222738099682e-06, + "loss": 0.4297, + "step": 2243 + }, + { + "epoch": 0.2183941605839416, + "grad_norm": 1.3121064822320374, + "learning_rate": 9.098320049748864e-06, + "loss": 0.4646, + "step": 2244 + }, + { + "epoch": 0.21849148418491485, + "grad_norm": 1.5596348242175504, + "learning_rate": 9.097416954145467e-06, + "loss": 0.4877, + "step": 2245 + }, + { + "epoch": 0.21858880778588807, + "grad_norm": 1.1835003302943965, + "learning_rate": 9.096513451379225e-06, + "loss": 0.3548, + "step": 2246 + }, + { + "epoch": 0.21868613138686133, + "grad_norm": 1.4956699498169375, + "learning_rate": 9.095609541539925e-06, + "loss": 0.3958, + "step": 2247 + }, + { + "epoch": 0.21878345498783455, + "grad_norm": 1.3761247023142853, + "learning_rate": 9.094705224717388e-06, + "loss": 0.4076, + "step": 2248 + }, + { + "epoch": 0.21888077858880778, + "grad_norm": 1.2940624946938768, + "learning_rate": 9.093800501001476e-06, + "loss": 0.4989, + "step": 2249 + }, + { + "epoch": 0.21897810218978103, + "grad_norm": 1.1389229499303237, + "learning_rate": 9.092895370482091e-06, + "loss": 0.332, + "step": 2250 + }, + { + "epoch": 0.21907542579075426, + "grad_norm": 1.5338979130860617, + "learning_rate": 9.091989833249179e-06, + "loss": 0.5609, + "step": 2251 + }, + { + "epoch": 0.21917274939172748, + "grad_norm": 1.3736786128370664, + "learning_rate": 9.091083889392721e-06, + "loss": 0.3767, + "step": 2252 + }, + { + "epoch": 0.21927007299270074, + "grad_norm": 1.6001218689759074, + "learning_rate": 9.090177539002743e-06, + "loss": 0.5709, + "step": 2253 + }, + { + "epoch": 0.21936739659367396, + "grad_norm": 1.2578364778685514, + "learning_rate": 9.089270782169308e-06, + "loss": 0.3796, + "step": 2254 + }, + { + "epoch": 0.2194647201946472, + "grad_norm": 1.5508865589735865, + "learning_rate": 9.088363618982523e-06, + "loss": 0.5947, + "step": 2255 + }, + { + "epoch": 0.21956204379562044, + "grad_norm": 1.2646857650137902, + "learning_rate": 9.08745604953253e-06, + "loss": 0.3024, + "step": 2256 + }, + { + "epoch": 0.21965936739659367, + "grad_norm": 1.1168071392771144, + "learning_rate": 9.08654807390952e-06, + "loss": 0.3113, + "step": 2257 + }, + { + "epoch": 0.21975669099756692, + "grad_norm": 1.238369237619726, + "learning_rate": 9.085639692203713e-06, + "loss": 0.2179, + "step": 2258 + }, + { + "epoch": 0.21985401459854015, + "grad_norm": 1.2485790759653945, + "learning_rate": 9.084730904505381e-06, + "loss": 0.3763, + "step": 2259 + }, + { + "epoch": 0.21995133819951337, + "grad_norm": 1.6082877032407055, + "learning_rate": 9.083821710904827e-06, + "loss": 0.3831, + "step": 2260 + }, + { + "epoch": 0.22004866180048663, + "grad_norm": 1.3213256018887491, + "learning_rate": 9.082912111492401e-06, + "loss": 0.4091, + "step": 2261 + }, + { + "epoch": 0.22014598540145985, + "grad_norm": 1.5899440724355371, + "learning_rate": 9.08200210635849e-06, + "loss": 0.4491, + "step": 2262 + }, + { + "epoch": 0.22024330900243308, + "grad_norm": 1.30089465497526, + "learning_rate": 9.081091695593518e-06, + "loss": 0.3762, + "step": 2263 + }, + { + "epoch": 0.22034063260340633, + "grad_norm": 1.5403984971127525, + "learning_rate": 9.080180879287957e-06, + "loss": 0.438, + "step": 2264 + }, + { + "epoch": 0.22043795620437956, + "grad_norm": 1.5500984898931875, + "learning_rate": 9.079269657532312e-06, + "loss": 0.398, + "step": 2265 + }, + { + "epoch": 0.22053527980535279, + "grad_norm": 1.4834461719298844, + "learning_rate": 9.078358030417136e-06, + "loss": 0.6175, + "step": 2266 + }, + { + "epoch": 0.22063260340632604, + "grad_norm": 1.3553003212010182, + "learning_rate": 9.077445998033015e-06, + "loss": 0.2719, + "step": 2267 + }, + { + "epoch": 0.22072992700729926, + "grad_norm": 1.573783871238475, + "learning_rate": 9.07653356047058e-06, + "loss": 0.2328, + "step": 2268 + }, + { + "epoch": 0.22082725060827252, + "grad_norm": 1.54928316645126, + "learning_rate": 9.075620717820498e-06, + "loss": 0.3514, + "step": 2269 + }, + { + "epoch": 0.22092457420924574, + "grad_norm": 1.3616253433976528, + "learning_rate": 9.07470747017348e-06, + "loss": 0.4636, + "step": 2270 + }, + { + "epoch": 0.22102189781021897, + "grad_norm": 1.6741713680481711, + "learning_rate": 9.073793817620277e-06, + "loss": 0.6321, + "step": 2271 + }, + { + "epoch": 0.22111922141119222, + "grad_norm": 1.3794305685281492, + "learning_rate": 9.07287976025168e-06, + "loss": 0.3172, + "step": 2272 + }, + { + "epoch": 0.22121654501216545, + "grad_norm": 1.362894347632133, + "learning_rate": 9.071965298158516e-06, + "loss": 0.3989, + "step": 2273 + }, + { + "epoch": 0.22131386861313868, + "grad_norm": 1.4233131262232992, + "learning_rate": 9.071050431431658e-06, + "loss": 0.4922, + "step": 2274 + }, + { + "epoch": 0.22141119221411193, + "grad_norm": 1.4905332812995968, + "learning_rate": 9.070135160162016e-06, + "loss": 0.3952, + "step": 2275 + }, + { + "epoch": 0.22150851581508516, + "grad_norm": 1.4389307945528345, + "learning_rate": 9.069219484440541e-06, + "loss": 0.4364, + "step": 2276 + }, + { + "epoch": 0.22160583941605838, + "grad_norm": 1.4796907096594347, + "learning_rate": 9.068303404358226e-06, + "loss": 0.4842, + "step": 2277 + }, + { + "epoch": 0.22170316301703163, + "grad_norm": 1.6561415294899449, + "learning_rate": 9.0673869200061e-06, + "loss": 0.5595, + "step": 2278 + }, + { + "epoch": 0.22180048661800486, + "grad_norm": 1.4198474890784685, + "learning_rate": 9.066470031475236e-06, + "loss": 0.4762, + "step": 2279 + }, + { + "epoch": 0.22189781021897811, + "grad_norm": 1.437724469115563, + "learning_rate": 9.065552738856745e-06, + "loss": 0.3687, + "step": 2280 + }, + { + "epoch": 0.22199513381995134, + "grad_norm": 1.2431258010669888, + "learning_rate": 9.06463504224178e-06, + "loss": 0.3854, + "step": 2281 + }, + { + "epoch": 0.22209245742092457, + "grad_norm": 1.362042407967867, + "learning_rate": 9.063716941721534e-06, + "loss": 0.3981, + "step": 2282 + }, + { + "epoch": 0.22218978102189782, + "grad_norm": 1.3260780267557537, + "learning_rate": 9.062798437387236e-06, + "loss": 0.4304, + "step": 2283 + }, + { + "epoch": 0.22228710462287105, + "grad_norm": 1.2009742636293355, + "learning_rate": 9.06187952933016e-06, + "loss": 0.3441, + "step": 2284 + }, + { + "epoch": 0.22238442822384427, + "grad_norm": 1.7089934430562992, + "learning_rate": 9.060960217641618e-06, + "loss": 0.3488, + "step": 2285 + }, + { + "epoch": 0.22248175182481753, + "grad_norm": 1.3539106224768682, + "learning_rate": 9.060040502412965e-06, + "loss": 0.3617, + "step": 2286 + }, + { + "epoch": 0.22257907542579075, + "grad_norm": 1.3952537396094973, + "learning_rate": 9.05912038373559e-06, + "loss": 0.4507, + "step": 2287 + }, + { + "epoch": 0.22267639902676398, + "grad_norm": 1.201207552744405, + "learning_rate": 9.058199861700928e-06, + "loss": 0.3074, + "step": 2288 + }, + { + "epoch": 0.22277372262773723, + "grad_norm": 1.1918182161083974, + "learning_rate": 9.057278936400453e-06, + "loss": 0.3713, + "step": 2289 + }, + { + "epoch": 0.22287104622871046, + "grad_norm": 1.5864015097741249, + "learning_rate": 9.056357607925674e-06, + "loss": 0.4651, + "step": 2290 + }, + { + "epoch": 0.2229683698296837, + "grad_norm": 1.0855034708664277, + "learning_rate": 9.055435876368148e-06, + "loss": 0.2361, + "step": 2291 + }, + { + "epoch": 0.22306569343065694, + "grad_norm": 1.1945153364440069, + "learning_rate": 9.054513741819466e-06, + "loss": 0.2803, + "step": 2292 + }, + { + "epoch": 0.22316301703163016, + "grad_norm": 1.3734264039165323, + "learning_rate": 9.053591204371262e-06, + "loss": 0.3709, + "step": 2293 + }, + { + "epoch": 0.22326034063260342, + "grad_norm": 1.662571628719731, + "learning_rate": 9.052668264115206e-06, + "loss": 0.6615, + "step": 2294 + }, + { + "epoch": 0.22335766423357664, + "grad_norm": 1.4371203045482563, + "learning_rate": 9.051744921143015e-06, + "loss": 0.4082, + "step": 2295 + }, + { + "epoch": 0.22345498783454987, + "grad_norm": 1.5571182647752952, + "learning_rate": 9.050821175546442e-06, + "loss": 0.5338, + "step": 2296 + }, + { + "epoch": 0.22355231143552312, + "grad_norm": 1.4022335338581293, + "learning_rate": 9.049897027417277e-06, + "loss": 0.3933, + "step": 2297 + }, + { + "epoch": 0.22364963503649635, + "grad_norm": 1.2815006290096387, + "learning_rate": 9.048972476847356e-06, + "loss": 0.4662, + "step": 2298 + }, + { + "epoch": 0.22374695863746957, + "grad_norm": 1.4344706750679865, + "learning_rate": 9.04804752392855e-06, + "loss": 0.4422, + "step": 2299 + }, + { + "epoch": 0.22384428223844283, + "grad_norm": 1.2984999163116793, + "learning_rate": 9.047122168752775e-06, + "loss": 0.3659, + "step": 2300 + }, + { + "epoch": 0.22394160583941605, + "grad_norm": 1.1587669196843096, + "learning_rate": 9.046196411411982e-06, + "loss": 0.2974, + "step": 2301 + }, + { + "epoch": 0.2240389294403893, + "grad_norm": 2.322228254141064, + "learning_rate": 9.045270251998166e-06, + "loss": 0.5667, + "step": 2302 + }, + { + "epoch": 0.22413625304136253, + "grad_norm": 1.5137300738559605, + "learning_rate": 9.044343690603358e-06, + "loss": 0.3889, + "step": 2303 + }, + { + "epoch": 0.22423357664233576, + "grad_norm": 1.472679239189759, + "learning_rate": 9.04341672731963e-06, + "loss": 0.4875, + "step": 2304 + }, + { + "epoch": 0.224330900243309, + "grad_norm": 1.391957619608358, + "learning_rate": 9.042489362239097e-06, + "loss": 0.4513, + "step": 2305 + }, + { + "epoch": 0.22442822384428224, + "grad_norm": 1.5752423841676473, + "learning_rate": 9.041561595453914e-06, + "loss": 0.6021, + "step": 2306 + }, + { + "epoch": 0.22452554744525546, + "grad_norm": 1.340696458312585, + "learning_rate": 9.040633427056268e-06, + "loss": 0.36, + "step": 2307 + }, + { + "epoch": 0.22462287104622872, + "grad_norm": 1.319309191993897, + "learning_rate": 9.039704857138396e-06, + "loss": 0.2632, + "step": 2308 + }, + { + "epoch": 0.22472019464720194, + "grad_norm": 1.3567748798839634, + "learning_rate": 9.03877588579257e-06, + "loss": 0.4085, + "step": 2309 + }, + { + "epoch": 0.22481751824817517, + "grad_norm": 1.7234931003044007, + "learning_rate": 9.0378465131111e-06, + "loss": 0.5366, + "step": 2310 + }, + { + "epoch": 0.22491484184914842, + "grad_norm": 1.3431964443797024, + "learning_rate": 9.036916739186341e-06, + "loss": 0.3406, + "step": 2311 + }, + { + "epoch": 0.22501216545012165, + "grad_norm": 1.6143507102825565, + "learning_rate": 9.035986564110685e-06, + "loss": 0.6322, + "step": 2312 + }, + { + "epoch": 0.2251094890510949, + "grad_norm": 1.421713348254314, + "learning_rate": 9.035055987976563e-06, + "loss": 0.3963, + "step": 2313 + }, + { + "epoch": 0.22520681265206813, + "grad_norm": 1.5860325075452377, + "learning_rate": 9.034125010876447e-06, + "loss": 0.4722, + "step": 2314 + }, + { + "epoch": 0.22530413625304135, + "grad_norm": 1.633700480684755, + "learning_rate": 9.03319363290285e-06, + "loss": 0.2649, + "step": 2315 + }, + { + "epoch": 0.2254014598540146, + "grad_norm": 1.5598775600409591, + "learning_rate": 9.03226185414832e-06, + "loss": 0.4778, + "step": 2316 + }, + { + "epoch": 0.22549878345498783, + "grad_norm": 1.4413798673536165, + "learning_rate": 9.031329674705455e-06, + "loss": 0.3182, + "step": 2317 + }, + { + "epoch": 0.22559610705596106, + "grad_norm": 1.437989358950148, + "learning_rate": 9.03039709466688e-06, + "loss": 0.4297, + "step": 2318 + }, + { + "epoch": 0.2256934306569343, + "grad_norm": 1.3355568683760275, + "learning_rate": 9.029464114125267e-06, + "loss": 0.3393, + "step": 2319 + }, + { + "epoch": 0.22579075425790754, + "grad_norm": 1.353161962413978, + "learning_rate": 9.028530733173332e-06, + "loss": 0.3362, + "step": 2320 + }, + { + "epoch": 0.22588807785888076, + "grad_norm": 1.1699742479017108, + "learning_rate": 9.027596951903819e-06, + "loss": 0.3674, + "step": 2321 + }, + { + "epoch": 0.22598540145985402, + "grad_norm": 1.1235278882417843, + "learning_rate": 9.026662770409524e-06, + "loss": 0.3209, + "step": 2322 + }, + { + "epoch": 0.22608272506082724, + "grad_norm": 1.4951135995374567, + "learning_rate": 9.025728188783273e-06, + "loss": 0.4297, + "step": 2323 + }, + { + "epoch": 0.2261800486618005, + "grad_norm": 1.3046514997255336, + "learning_rate": 9.024793207117937e-06, + "loss": 0.3765, + "step": 2324 + }, + { + "epoch": 0.22627737226277372, + "grad_norm": 1.3346554142143854, + "learning_rate": 9.023857825506426e-06, + "loss": 0.5228, + "step": 2325 + }, + { + "epoch": 0.22637469586374695, + "grad_norm": 1.4309619163867682, + "learning_rate": 9.022922044041691e-06, + "loss": 0.4605, + "step": 2326 + }, + { + "epoch": 0.2264720194647202, + "grad_norm": 1.5152634651556307, + "learning_rate": 9.021985862816718e-06, + "loss": 0.5553, + "step": 2327 + }, + { + "epoch": 0.22656934306569343, + "grad_norm": 1.3885182055556289, + "learning_rate": 9.02104928192454e-06, + "loss": 0.4831, + "step": 2328 + }, + { + "epoch": 0.22666666666666666, + "grad_norm": 1.2729317064328092, + "learning_rate": 9.020112301458221e-06, + "loss": 0.4314, + "step": 2329 + }, + { + "epoch": 0.2267639902676399, + "grad_norm": 0.9679503678492228, + "learning_rate": 9.019174921510874e-06, + "loss": 0.1925, + "step": 2330 + }, + { + "epoch": 0.22686131386861313, + "grad_norm": 1.4513146393120597, + "learning_rate": 9.018237142175643e-06, + "loss": 0.5487, + "step": 2331 + }, + { + "epoch": 0.2269586374695864, + "grad_norm": 1.5377065039176208, + "learning_rate": 9.017298963545718e-06, + "loss": 0.4063, + "step": 2332 + }, + { + "epoch": 0.22705596107055961, + "grad_norm": 1.0180180453516632, + "learning_rate": 9.016360385714324e-06, + "loss": 0.2101, + "step": 2333 + }, + { + "epoch": 0.22715328467153284, + "grad_norm": 1.3145676629552665, + "learning_rate": 9.015421408774732e-06, + "loss": 0.4575, + "step": 2334 + }, + { + "epoch": 0.2272506082725061, + "grad_norm": 1.3213351651174845, + "learning_rate": 9.014482032820247e-06, + "loss": 0.3924, + "step": 2335 + }, + { + "epoch": 0.22734793187347932, + "grad_norm": 1.9370834148842127, + "learning_rate": 9.013542257944212e-06, + "loss": 0.4332, + "step": 2336 + }, + { + "epoch": 0.22744525547445255, + "grad_norm": 1.4754695985325648, + "learning_rate": 9.012602084240018e-06, + "loss": 0.4014, + "step": 2337 + }, + { + "epoch": 0.2275425790754258, + "grad_norm": 1.1124893316550342, + "learning_rate": 9.011661511801088e-06, + "loss": 0.2957, + "step": 2338 + }, + { + "epoch": 0.22763990267639903, + "grad_norm": 1.2537185195667433, + "learning_rate": 9.010720540720888e-06, + "loss": 0.3004, + "step": 2339 + }, + { + "epoch": 0.22773722627737225, + "grad_norm": 1.4597689601256807, + "learning_rate": 9.009779171092923e-06, + "loss": 0.2555, + "step": 2340 + }, + { + "epoch": 0.2278345498783455, + "grad_norm": 1.4737791439989423, + "learning_rate": 9.008837403010736e-06, + "loss": 0.5355, + "step": 2341 + }, + { + "epoch": 0.22793187347931873, + "grad_norm": 1.3795639069131398, + "learning_rate": 9.007895236567913e-06, + "loss": 0.3961, + "step": 2342 + }, + { + "epoch": 0.22802919708029198, + "grad_norm": 1.6364796903185053, + "learning_rate": 9.006952671858078e-06, + "loss": 0.444, + "step": 2343 + }, + { + "epoch": 0.2281265206812652, + "grad_norm": 1.1964346909925698, + "learning_rate": 9.006009708974892e-06, + "loss": 0.3297, + "step": 2344 + }, + { + "epoch": 0.22822384428223844, + "grad_norm": 1.343808771666808, + "learning_rate": 9.00506634801206e-06, + "loss": 0.4537, + "step": 2345 + }, + { + "epoch": 0.2283211678832117, + "grad_norm": 1.4003110727355261, + "learning_rate": 9.004122589063323e-06, + "loss": 0.3883, + "step": 2346 + }, + { + "epoch": 0.22841849148418492, + "grad_norm": 1.2435101838594087, + "learning_rate": 9.003178432222462e-06, + "loss": 0.4238, + "step": 2347 + }, + { + "epoch": 0.22851581508515814, + "grad_norm": 1.324643227390155, + "learning_rate": 9.0022338775833e-06, + "loss": 0.4139, + "step": 2348 + }, + { + "epoch": 0.2286131386861314, + "grad_norm": 1.7692069120616638, + "learning_rate": 9.001288925239698e-06, + "loss": 0.4719, + "step": 2349 + }, + { + "epoch": 0.22871046228710462, + "grad_norm": 1.223562422765287, + "learning_rate": 9.000343575285555e-06, + "loss": 0.3256, + "step": 2350 + }, + { + "epoch": 0.22880778588807785, + "grad_norm": 1.3407025045830592, + "learning_rate": 8.999397827814812e-06, + "loss": 0.3788, + "step": 2351 + }, + { + "epoch": 0.2289051094890511, + "grad_norm": 1.5281139100341292, + "learning_rate": 8.99845168292145e-06, + "loss": 0.5565, + "step": 2352 + }, + { + "epoch": 0.22900243309002433, + "grad_norm": 1.560155712083658, + "learning_rate": 8.997505140699488e-06, + "loss": 0.4957, + "step": 2353 + }, + { + "epoch": 0.22909975669099758, + "grad_norm": 1.290422773797366, + "learning_rate": 8.996558201242981e-06, + "loss": 0.4011, + "step": 2354 + }, + { + "epoch": 0.2291970802919708, + "grad_norm": 1.2847680894150124, + "learning_rate": 8.99561086464603e-06, + "loss": 0.4419, + "step": 2355 + }, + { + "epoch": 0.22929440389294403, + "grad_norm": 1.4625413220428547, + "learning_rate": 8.99466313100277e-06, + "loss": 0.2511, + "step": 2356 + }, + { + "epoch": 0.22939172749391729, + "grad_norm": 1.2882840667194135, + "learning_rate": 8.99371500040738e-06, + "loss": 0.3992, + "step": 2357 + }, + { + "epoch": 0.2294890510948905, + "grad_norm": 1.1997126453782205, + "learning_rate": 8.992766472954077e-06, + "loss": 0.2639, + "step": 2358 + }, + { + "epoch": 0.22958637469586374, + "grad_norm": 1.6688893120724655, + "learning_rate": 8.991817548737114e-06, + "loss": 0.3103, + "step": 2359 + }, + { + "epoch": 0.229683698296837, + "grad_norm": 1.4031771252981649, + "learning_rate": 8.990868227850788e-06, + "loss": 0.4245, + "step": 2360 + }, + { + "epoch": 0.22978102189781022, + "grad_norm": 1.4825462721346627, + "learning_rate": 8.989918510389432e-06, + "loss": 0.3973, + "step": 2361 + }, + { + "epoch": 0.22987834549878344, + "grad_norm": 1.7756990641125774, + "learning_rate": 8.988968396447424e-06, + "loss": 0.6091, + "step": 2362 + }, + { + "epoch": 0.2299756690997567, + "grad_norm": 1.5519381803018173, + "learning_rate": 8.988017886119172e-06, + "loss": 0.5849, + "step": 2363 + }, + { + "epoch": 0.23007299270072992, + "grad_norm": 1.5288537407748173, + "learning_rate": 8.987066979499133e-06, + "loss": 0.594, + "step": 2364 + }, + { + "epoch": 0.23017031630170318, + "grad_norm": 1.2519254160654887, + "learning_rate": 8.986115676681797e-06, + "loss": 0.3781, + "step": 2365 + }, + { + "epoch": 0.2302676399026764, + "grad_norm": 1.2118409754918265, + "learning_rate": 8.985163977761697e-06, + "loss": 0.3761, + "step": 2366 + }, + { + "epoch": 0.23036496350364963, + "grad_norm": 1.3123505825187787, + "learning_rate": 8.984211882833402e-06, + "loss": 0.405, + "step": 2367 + }, + { + "epoch": 0.23046228710462288, + "grad_norm": 1.6027642184293107, + "learning_rate": 8.983259391991524e-06, + "loss": 0.597, + "step": 2368 + }, + { + "epoch": 0.2305596107055961, + "grad_norm": 1.3646497443348367, + "learning_rate": 8.982306505330712e-06, + "loss": 0.4036, + "step": 2369 + }, + { + "epoch": 0.23065693430656933, + "grad_norm": 1.2894115553392402, + "learning_rate": 8.981353222945653e-06, + "loss": 0.2778, + "step": 2370 + }, + { + "epoch": 0.2307542579075426, + "grad_norm": 1.27883786418869, + "learning_rate": 8.98039954493108e-06, + "loss": 0.3803, + "step": 2371 + }, + { + "epoch": 0.2308515815085158, + "grad_norm": 1.5863647637061415, + "learning_rate": 8.979445471381755e-06, + "loss": 0.4716, + "step": 2372 + }, + { + "epoch": 0.23094890510948904, + "grad_norm": 1.1874137646332688, + "learning_rate": 8.97849100239249e-06, + "loss": 0.2846, + "step": 2373 + }, + { + "epoch": 0.2310462287104623, + "grad_norm": 1.6936318641369774, + "learning_rate": 8.977536138058126e-06, + "loss": 0.3418, + "step": 2374 + }, + { + "epoch": 0.23114355231143552, + "grad_norm": 1.0526167143851337, + "learning_rate": 8.976580878473553e-06, + "loss": 0.258, + "step": 2375 + }, + { + "epoch": 0.23124087591240877, + "grad_norm": 1.753799809070063, + "learning_rate": 8.975625223733693e-06, + "loss": 0.4764, + "step": 2376 + }, + { + "epoch": 0.231338199513382, + "grad_norm": 1.3814482775299988, + "learning_rate": 8.97466917393351e-06, + "loss": 0.3811, + "step": 2377 + }, + { + "epoch": 0.23143552311435522, + "grad_norm": 1.575424754678499, + "learning_rate": 8.97371272916801e-06, + "loss": 0.5028, + "step": 2378 + }, + { + "epoch": 0.23153284671532848, + "grad_norm": 1.5163540217481704, + "learning_rate": 8.972755889532234e-06, + "loss": 0.4055, + "step": 2379 + }, + { + "epoch": 0.2316301703163017, + "grad_norm": 1.1877796947964157, + "learning_rate": 8.971798655121264e-06, + "loss": 0.2978, + "step": 2380 + }, + { + "epoch": 0.23172749391727493, + "grad_norm": 1.6274909221671408, + "learning_rate": 8.970841026030218e-06, + "loss": 0.4319, + "step": 2381 + }, + { + "epoch": 0.23182481751824818, + "grad_norm": 1.413480143472021, + "learning_rate": 8.969883002354259e-06, + "loss": 0.4015, + "step": 2382 + }, + { + "epoch": 0.2319221411192214, + "grad_norm": 1.451327617189514, + "learning_rate": 8.968924584188587e-06, + "loss": 0.5107, + "step": 2383 + }, + { + "epoch": 0.23201946472019463, + "grad_norm": 1.4288160659587352, + "learning_rate": 8.96796577162844e-06, + "loss": 0.369, + "step": 2384 + }, + { + "epoch": 0.2321167883211679, + "grad_norm": 1.6469132304956866, + "learning_rate": 8.967006564769094e-06, + "loss": 0.5982, + "step": 2385 + }, + { + "epoch": 0.23221411192214111, + "grad_norm": 1.4887239693800984, + "learning_rate": 8.966046963705869e-06, + "loss": 0.4967, + "step": 2386 + }, + { + "epoch": 0.23231143552311437, + "grad_norm": 1.2469481884120308, + "learning_rate": 8.965086968534116e-06, + "loss": 0.4022, + "step": 2387 + }, + { + "epoch": 0.2324087591240876, + "grad_norm": 2.6320603198934047, + "learning_rate": 8.964126579349237e-06, + "loss": 0.2489, + "step": 2388 + }, + { + "epoch": 0.23250608272506082, + "grad_norm": 1.2339093742509784, + "learning_rate": 8.963165796246663e-06, + "loss": 0.3694, + "step": 2389 + }, + { + "epoch": 0.23260340632603407, + "grad_norm": 1.4634162966788549, + "learning_rate": 8.962204619321866e-06, + "loss": 0.5646, + "step": 2390 + }, + { + "epoch": 0.2327007299270073, + "grad_norm": 1.2919651066139786, + "learning_rate": 8.961243048670363e-06, + "loss": 0.3833, + "step": 2391 + }, + { + "epoch": 0.23279805352798053, + "grad_norm": 1.5273773111622013, + "learning_rate": 8.960281084387701e-06, + "loss": 0.5724, + "step": 2392 + }, + { + "epoch": 0.23289537712895378, + "grad_norm": 1.4704498843019616, + "learning_rate": 8.959318726569475e-06, + "loss": 0.5232, + "step": 2393 + }, + { + "epoch": 0.232992700729927, + "grad_norm": 1.52947786509823, + "learning_rate": 8.958355975311314e-06, + "loss": 0.5014, + "step": 2394 + }, + { + "epoch": 0.23309002433090023, + "grad_norm": 1.457234959002331, + "learning_rate": 8.957392830708886e-06, + "loss": 0.5401, + "step": 2395 + }, + { + "epoch": 0.23318734793187348, + "grad_norm": 1.5878948291380384, + "learning_rate": 8.9564292928579e-06, + "loss": 0.4481, + "step": 2396 + }, + { + "epoch": 0.2332846715328467, + "grad_norm": 1.3353181262068508, + "learning_rate": 8.955465361854103e-06, + "loss": 0.3668, + "step": 2397 + }, + { + "epoch": 0.23338199513381996, + "grad_norm": 2.023729457927684, + "learning_rate": 8.954501037793282e-06, + "loss": 0.256, + "step": 2398 + }, + { + "epoch": 0.2334793187347932, + "grad_norm": 1.3501136378744423, + "learning_rate": 8.953536320771264e-06, + "loss": 0.4288, + "step": 2399 + }, + { + "epoch": 0.23357664233576642, + "grad_norm": 0.9695156209886321, + "learning_rate": 8.95257121088391e-06, + "loss": 0.3313, + "step": 2400 + }, + { + "epoch": 0.23367396593673967, + "grad_norm": 1.6268089203048999, + "learning_rate": 8.951605708227125e-06, + "loss": 0.5031, + "step": 2401 + }, + { + "epoch": 0.2337712895377129, + "grad_norm": 1.3327356528771297, + "learning_rate": 8.950639812896852e-06, + "loss": 0.352, + "step": 2402 + }, + { + "epoch": 0.23386861313868612, + "grad_norm": 1.646158604731562, + "learning_rate": 8.949673524989074e-06, + "loss": 0.6143, + "step": 2403 + }, + { + "epoch": 0.23396593673965937, + "grad_norm": 1.4459398712277267, + "learning_rate": 8.948706844599809e-06, + "loss": 0.301, + "step": 2404 + }, + { + "epoch": 0.2340632603406326, + "grad_norm": 1.242464142709881, + "learning_rate": 8.947739771825118e-06, + "loss": 0.3867, + "step": 2405 + }, + { + "epoch": 0.23416058394160583, + "grad_norm": 1.283369590610404, + "learning_rate": 8.946772306761099e-06, + "loss": 0.3396, + "step": 2406 + }, + { + "epoch": 0.23425790754257908, + "grad_norm": 1.659051576981879, + "learning_rate": 8.94580444950389e-06, + "loss": 0.2985, + "step": 2407 + }, + { + "epoch": 0.2343552311435523, + "grad_norm": 1.5811557183787177, + "learning_rate": 8.944836200149669e-06, + "loss": 0.5412, + "step": 2408 + }, + { + "epoch": 0.23445255474452556, + "grad_norm": 1.5937284580345608, + "learning_rate": 8.943867558794648e-06, + "loss": 0.4562, + "step": 2409 + }, + { + "epoch": 0.23454987834549879, + "grad_norm": 1.179539450140548, + "learning_rate": 8.942898525535085e-06, + "loss": 0.2436, + "step": 2410 + }, + { + "epoch": 0.234647201946472, + "grad_norm": 1.2115140465312926, + "learning_rate": 8.941929100467272e-06, + "loss": 0.325, + "step": 2411 + }, + { + "epoch": 0.23474452554744527, + "grad_norm": 1.3228525862104779, + "learning_rate": 8.94095928368754e-06, + "loss": 0.4001, + "step": 2412 + }, + { + "epoch": 0.2348418491484185, + "grad_norm": 1.5093562470528878, + "learning_rate": 8.939989075292263e-06, + "loss": 0.3554, + "step": 2413 + }, + { + "epoch": 0.23493917274939172, + "grad_norm": 1.629660086616085, + "learning_rate": 8.93901847537785e-06, + "loss": 0.6349, + "step": 2414 + }, + { + "epoch": 0.23503649635036497, + "grad_norm": 1.0826348229158524, + "learning_rate": 8.938047484040749e-06, + "loss": 0.2681, + "step": 2415 + }, + { + "epoch": 0.2351338199513382, + "grad_norm": 1.2841520241198179, + "learning_rate": 8.93707610137745e-06, + "loss": 0.4081, + "step": 2416 + }, + { + "epoch": 0.23523114355231142, + "grad_norm": 1.913465881785096, + "learning_rate": 8.936104327484479e-06, + "loss": 0.7043, + "step": 2417 + }, + { + "epoch": 0.23532846715328468, + "grad_norm": 1.386306701477425, + "learning_rate": 8.935132162458401e-06, + "loss": 0.341, + "step": 2418 + }, + { + "epoch": 0.2354257907542579, + "grad_norm": 1.1278547518059516, + "learning_rate": 8.934159606395821e-06, + "loss": 0.3151, + "step": 2419 + }, + { + "epoch": 0.23552311435523116, + "grad_norm": 1.5540265542588236, + "learning_rate": 8.933186659393384e-06, + "loss": 0.6514, + "step": 2420 + }, + { + "epoch": 0.23562043795620438, + "grad_norm": 1.278787339635804, + "learning_rate": 8.932213321547769e-06, + "loss": 0.3423, + "step": 2421 + }, + { + "epoch": 0.2357177615571776, + "grad_norm": 1.2885094583361822, + "learning_rate": 8.931239592955701e-06, + "loss": 0.2958, + "step": 2422 + }, + { + "epoch": 0.23581508515815086, + "grad_norm": 1.5181901598500283, + "learning_rate": 8.930265473713939e-06, + "loss": 0.4212, + "step": 2423 + }, + { + "epoch": 0.2359124087591241, + "grad_norm": 1.2136160482551297, + "learning_rate": 8.92929096391928e-06, + "loss": 0.3982, + "step": 2424 + }, + { + "epoch": 0.2360097323600973, + "grad_norm": 1.5487072814518004, + "learning_rate": 8.928316063668562e-06, + "loss": 0.5676, + "step": 2425 + }, + { + "epoch": 0.23610705596107057, + "grad_norm": 1.430432818475582, + "learning_rate": 8.927340773058664e-06, + "loss": 0.4735, + "step": 2426 + }, + { + "epoch": 0.2362043795620438, + "grad_norm": 1.4586841524588252, + "learning_rate": 8.926365092186498e-06, + "loss": 0.5637, + "step": 2427 + }, + { + "epoch": 0.23630170316301702, + "grad_norm": 1.5364014523424565, + "learning_rate": 8.92538902114902e-06, + "loss": 0.4783, + "step": 2428 + }, + { + "epoch": 0.23639902676399027, + "grad_norm": 1.3896600182614345, + "learning_rate": 8.924412560043223e-06, + "loss": 0.3748, + "step": 2429 + }, + { + "epoch": 0.2364963503649635, + "grad_norm": 1.304447540327908, + "learning_rate": 8.923435708966135e-06, + "loss": 0.3373, + "step": 2430 + }, + { + "epoch": 0.23659367396593675, + "grad_norm": 1.3383082719469825, + "learning_rate": 8.922458468014833e-06, + "loss": 0.3089, + "step": 2431 + }, + { + "epoch": 0.23669099756690998, + "grad_norm": 1.4376693294142868, + "learning_rate": 8.921480837286418e-06, + "loss": 0.2665, + "step": 2432 + }, + { + "epoch": 0.2367883211678832, + "grad_norm": 1.3948368197200884, + "learning_rate": 8.920502816878045e-06, + "loss": 0.4349, + "step": 2433 + }, + { + "epoch": 0.23688564476885646, + "grad_norm": 1.5583938814663865, + "learning_rate": 8.919524406886897e-06, + "loss": 0.4528, + "step": 2434 + }, + { + "epoch": 0.23698296836982968, + "grad_norm": 1.455016515054737, + "learning_rate": 8.918545607410199e-06, + "loss": 0.416, + "step": 2435 + }, + { + "epoch": 0.2370802919708029, + "grad_norm": 1.5707414335423742, + "learning_rate": 8.917566418545215e-06, + "loss": 0.4269, + "step": 2436 + }, + { + "epoch": 0.23717761557177616, + "grad_norm": 1.6214497738286784, + "learning_rate": 8.916586840389248e-06, + "loss": 0.5531, + "step": 2437 + }, + { + "epoch": 0.2372749391727494, + "grad_norm": 1.5231468510302828, + "learning_rate": 8.91560687303964e-06, + "loss": 0.5464, + "step": 2438 + }, + { + "epoch": 0.23737226277372261, + "grad_norm": 1.5631657517225734, + "learning_rate": 8.91462651659377e-06, + "loss": 0.4098, + "step": 2439 + }, + { + "epoch": 0.23746958637469587, + "grad_norm": 1.5003582208774642, + "learning_rate": 8.913645771149058e-06, + "loss": 0.342, + "step": 2440 + }, + { + "epoch": 0.2375669099756691, + "grad_norm": 1.2703591332316027, + "learning_rate": 8.91266463680296e-06, + "loss": 0.3195, + "step": 2441 + }, + { + "epoch": 0.23766423357664235, + "grad_norm": 1.3910967851640175, + "learning_rate": 8.91168311365297e-06, + "loss": 0.334, + "step": 2442 + }, + { + "epoch": 0.23776155717761557, + "grad_norm": 1.5001053773105038, + "learning_rate": 8.910701201796625e-06, + "loss": 0.4665, + "step": 2443 + }, + { + "epoch": 0.2378588807785888, + "grad_norm": 1.6142849143926903, + "learning_rate": 8.9097189013315e-06, + "loss": 0.5276, + "step": 2444 + }, + { + "epoch": 0.23795620437956205, + "grad_norm": 1.2059866820401877, + "learning_rate": 8.908736212355202e-06, + "loss": 0.2936, + "step": 2445 + }, + { + "epoch": 0.23805352798053528, + "grad_norm": 1.4496663268694052, + "learning_rate": 8.907753134965387e-06, + "loss": 0.475, + "step": 2446 + }, + { + "epoch": 0.2381508515815085, + "grad_norm": 1.4184456855989886, + "learning_rate": 8.90676966925974e-06, + "loss": 0.4477, + "step": 2447 + }, + { + "epoch": 0.23824817518248176, + "grad_norm": 1.7126804340284862, + "learning_rate": 8.90578581533599e-06, + "loss": 0.6392, + "step": 2448 + }, + { + "epoch": 0.23834549878345498, + "grad_norm": 1.6085356958926766, + "learning_rate": 8.904801573291901e-06, + "loss": 0.4428, + "step": 2449 + }, + { + "epoch": 0.2384428223844282, + "grad_norm": 1.1724096477321129, + "learning_rate": 8.903816943225281e-06, + "loss": 0.23, + "step": 2450 + }, + { + "epoch": 0.23854014598540146, + "grad_norm": 1.2849397331978023, + "learning_rate": 8.902831925233972e-06, + "loss": 0.4315, + "step": 2451 + }, + { + "epoch": 0.2386374695863747, + "grad_norm": 1.3479724015628292, + "learning_rate": 8.901846519415856e-06, + "loss": 0.4528, + "step": 2452 + }, + { + "epoch": 0.23873479318734794, + "grad_norm": 1.5241447958707557, + "learning_rate": 8.900860725868852e-06, + "loss": 0.5638, + "step": 2453 + }, + { + "epoch": 0.23883211678832117, + "grad_norm": 1.3008951589753057, + "learning_rate": 8.899874544690921e-06, + "loss": 0.4364, + "step": 2454 + }, + { + "epoch": 0.2389294403892944, + "grad_norm": 1.3889516127516133, + "learning_rate": 8.89888797598006e-06, + "loss": 0.5968, + "step": 2455 + }, + { + "epoch": 0.23902676399026765, + "grad_norm": 1.3382384356293548, + "learning_rate": 8.8979010198343e-06, + "loss": 0.3423, + "step": 2456 + }, + { + "epoch": 0.23912408759124087, + "grad_norm": 1.3927455122024084, + "learning_rate": 8.896913676351726e-06, + "loss": 0.5291, + "step": 2457 + }, + { + "epoch": 0.2392214111922141, + "grad_norm": 1.3654704619725508, + "learning_rate": 8.895925945630441e-06, + "loss": 0.3224, + "step": 2458 + }, + { + "epoch": 0.23931873479318735, + "grad_norm": 2.420859240745107, + "learning_rate": 8.8949378277686e-06, + "loss": 0.4526, + "step": 2459 + }, + { + "epoch": 0.23941605839416058, + "grad_norm": 1.279171164356654, + "learning_rate": 8.893949322864394e-06, + "loss": 0.3452, + "step": 2460 + }, + { + "epoch": 0.2395133819951338, + "grad_norm": 1.4336845514712926, + "learning_rate": 8.89296043101605e-06, + "loss": 0.3891, + "step": 2461 + }, + { + "epoch": 0.23961070559610706, + "grad_norm": 1.2391493008048138, + "learning_rate": 8.891971152321836e-06, + "loss": 0.5135, + "step": 2462 + }, + { + "epoch": 0.23970802919708029, + "grad_norm": 1.2398633987802397, + "learning_rate": 8.890981486880057e-06, + "loss": 0.2688, + "step": 2463 + }, + { + "epoch": 0.23980535279805354, + "grad_norm": 1.1975725536626207, + "learning_rate": 8.889991434789054e-06, + "loss": 0.4181, + "step": 2464 + }, + { + "epoch": 0.23990267639902677, + "grad_norm": 1.5121790458693565, + "learning_rate": 8.889000996147213e-06, + "loss": 0.667, + "step": 2465 + }, + { + "epoch": 0.24, + "grad_norm": 1.2980809407294283, + "learning_rate": 8.888010171052951e-06, + "loss": 0.4025, + "step": 2466 + }, + { + "epoch": 0.24009732360097324, + "grad_norm": 1.3683247659037883, + "learning_rate": 8.887018959604731e-06, + "loss": 0.4195, + "step": 2467 + }, + { + "epoch": 0.24019464720194647, + "grad_norm": 1.6392091000056277, + "learning_rate": 8.886027361901045e-06, + "loss": 0.4464, + "step": 2468 + }, + { + "epoch": 0.2402919708029197, + "grad_norm": 1.4286158146093557, + "learning_rate": 8.885035378040435e-06, + "loss": 0.503, + "step": 2469 + }, + { + "epoch": 0.24038929440389295, + "grad_norm": 1.6249203295617591, + "learning_rate": 8.884043008121468e-06, + "loss": 0.5875, + "step": 2470 + }, + { + "epoch": 0.24048661800486618, + "grad_norm": 1.316531393288964, + "learning_rate": 8.883050252242762e-06, + "loss": 0.3225, + "step": 2471 + }, + { + "epoch": 0.24058394160583943, + "grad_norm": 1.3738066957140371, + "learning_rate": 8.882057110502964e-06, + "loss": 0.3863, + "step": 2472 + }, + { + "epoch": 0.24068126520681266, + "grad_norm": 1.6149562610100578, + "learning_rate": 8.881063583000766e-06, + "loss": 0.6899, + "step": 2473 + }, + { + "epoch": 0.24077858880778588, + "grad_norm": 1.1978996054498634, + "learning_rate": 8.880069669834895e-06, + "loss": 0.4647, + "step": 2474 + }, + { + "epoch": 0.24087591240875914, + "grad_norm": 1.3737195294986575, + "learning_rate": 8.879075371104114e-06, + "loss": 0.3404, + "step": 2475 + }, + { + "epoch": 0.24097323600973236, + "grad_norm": 1.3242090275500389, + "learning_rate": 8.878080686907231e-06, + "loss": 0.4923, + "step": 2476 + }, + { + "epoch": 0.2410705596107056, + "grad_norm": 1.295191211796917, + "learning_rate": 8.877085617343085e-06, + "loss": 0.4449, + "step": 2477 + }, + { + "epoch": 0.24116788321167884, + "grad_norm": 1.5068542914468723, + "learning_rate": 8.87609016251056e-06, + "loss": 0.5506, + "step": 2478 + }, + { + "epoch": 0.24126520681265207, + "grad_norm": 1.650040845654398, + "learning_rate": 8.87509432250857e-06, + "loss": 0.5715, + "step": 2479 + }, + { + "epoch": 0.2413625304136253, + "grad_norm": 1.5289429392674028, + "learning_rate": 8.874098097436078e-06, + "loss": 0.5626, + "step": 2480 + }, + { + "epoch": 0.24145985401459855, + "grad_norm": 1.3609358059405043, + "learning_rate": 8.873101487392078e-06, + "loss": 0.4096, + "step": 2481 + }, + { + "epoch": 0.24155717761557177, + "grad_norm": 1.5725676631470524, + "learning_rate": 8.8721044924756e-06, + "loss": 0.6597, + "step": 2482 + }, + { + "epoch": 0.24165450121654503, + "grad_norm": 1.094002939677081, + "learning_rate": 8.87110711278572e-06, + "loss": 0.3206, + "step": 2483 + }, + { + "epoch": 0.24175182481751825, + "grad_norm": 1.4551979783640236, + "learning_rate": 8.870109348421544e-06, + "loss": 0.445, + "step": 2484 + }, + { + "epoch": 0.24184914841849148, + "grad_norm": 1.534219781362636, + "learning_rate": 8.869111199482227e-06, + "loss": 0.6666, + "step": 2485 + }, + { + "epoch": 0.24194647201946473, + "grad_norm": 0.9530847884904149, + "learning_rate": 8.86811266606695e-06, + "loss": 0.2756, + "step": 2486 + }, + { + "epoch": 0.24204379562043796, + "grad_norm": 1.4859819247146357, + "learning_rate": 8.86711374827494e-06, + "loss": 0.4626, + "step": 2487 + }, + { + "epoch": 0.24214111922141118, + "grad_norm": 1.5336983239407425, + "learning_rate": 8.86611444620546e-06, + "loss": 0.5383, + "step": 2488 + }, + { + "epoch": 0.24223844282238444, + "grad_norm": 1.4073640437212571, + "learning_rate": 8.865114759957812e-06, + "loss": 0.4675, + "step": 2489 + }, + { + "epoch": 0.24233576642335766, + "grad_norm": 1.562895534043348, + "learning_rate": 8.864114689631334e-06, + "loss": 0.5641, + "step": 2490 + }, + { + "epoch": 0.2424330900243309, + "grad_norm": 1.643145414496213, + "learning_rate": 8.863114235325405e-06, + "loss": 0.5749, + "step": 2491 + }, + { + "epoch": 0.24253041362530414, + "grad_norm": 1.226721686463078, + "learning_rate": 8.862113397139437e-06, + "loss": 0.3432, + "step": 2492 + }, + { + "epoch": 0.24262773722627737, + "grad_norm": 1.2699959241996903, + "learning_rate": 8.86111217517289e-06, + "loss": 0.4203, + "step": 2493 + }, + { + "epoch": 0.24272506082725062, + "grad_norm": 1.4233705808484327, + "learning_rate": 8.860110569525253e-06, + "loss": 0.2601, + "step": 2494 + }, + { + "epoch": 0.24282238442822385, + "grad_norm": 1.3784035260656315, + "learning_rate": 8.859108580296055e-06, + "loss": 0.4973, + "step": 2495 + }, + { + "epoch": 0.24291970802919707, + "grad_norm": 1.2790024746236357, + "learning_rate": 8.858106207584864e-06, + "loss": 0.4067, + "step": 2496 + }, + { + "epoch": 0.24301703163017033, + "grad_norm": 1.4041054798155945, + "learning_rate": 8.857103451491292e-06, + "loss": 0.5228, + "step": 2497 + }, + { + "epoch": 0.24311435523114355, + "grad_norm": 1.6788565066048042, + "learning_rate": 8.856100312114975e-06, + "loss": 0.7133, + "step": 2498 + }, + { + "epoch": 0.24321167883211678, + "grad_norm": 1.2024623978380433, + "learning_rate": 8.855096789555602e-06, + "loss": 0.2507, + "step": 2499 + }, + { + "epoch": 0.24330900243309003, + "grad_norm": 1.3828470689148782, + "learning_rate": 8.85409288391289e-06, + "loss": 0.3993, + "step": 2500 + }, + { + "epoch": 0.24340632603406326, + "grad_norm": 1.427484284296059, + "learning_rate": 8.8530885952866e-06, + "loss": 0.3926, + "step": 2501 + }, + { + "epoch": 0.24350364963503648, + "grad_norm": 1.3193446567792235, + "learning_rate": 8.852083923776529e-06, + "loss": 0.2152, + "step": 2502 + }, + { + "epoch": 0.24360097323600974, + "grad_norm": 1.3297823718570532, + "learning_rate": 8.851078869482509e-06, + "loss": 0.4772, + "step": 2503 + }, + { + "epoch": 0.24369829683698296, + "grad_norm": 1.3044660003313646, + "learning_rate": 8.850073432504416e-06, + "loss": 0.3589, + "step": 2504 + }, + { + "epoch": 0.24379562043795622, + "grad_norm": 1.4488096250914715, + "learning_rate": 8.84906761294216e-06, + "loss": 0.3261, + "step": 2505 + }, + { + "epoch": 0.24389294403892944, + "grad_norm": 1.2778329641523152, + "learning_rate": 8.848061410895687e-06, + "loss": 0.3047, + "step": 2506 + }, + { + "epoch": 0.24399026763990267, + "grad_norm": 1.135638375757245, + "learning_rate": 8.847054826464988e-06, + "loss": 0.3173, + "step": 2507 + }, + { + "epoch": 0.24408759124087592, + "grad_norm": 1.5033745953013864, + "learning_rate": 8.846047859750086e-06, + "loss": 0.4813, + "step": 2508 + }, + { + "epoch": 0.24418491484184915, + "grad_norm": 1.1189501535394493, + "learning_rate": 8.845040510851044e-06, + "loss": 0.3359, + "step": 2509 + }, + { + "epoch": 0.24428223844282237, + "grad_norm": 1.4743455663494507, + "learning_rate": 8.844032779867966e-06, + "loss": 0.5354, + "step": 2510 + }, + { + "epoch": 0.24437956204379563, + "grad_norm": 1.2644405709657818, + "learning_rate": 8.843024666900983e-06, + "loss": 0.4019, + "step": 2511 + }, + { + "epoch": 0.24447688564476885, + "grad_norm": 1.5585250648144962, + "learning_rate": 8.84201617205028e-06, + "loss": 0.4977, + "step": 2512 + }, + { + "epoch": 0.24457420924574208, + "grad_norm": 1.5187811483320863, + "learning_rate": 8.841007295416069e-06, + "loss": 0.6282, + "step": 2513 + }, + { + "epoch": 0.24467153284671533, + "grad_norm": 1.461783750506842, + "learning_rate": 8.839998037098601e-06, + "loss": 0.6085, + "step": 2514 + }, + { + "epoch": 0.24476885644768856, + "grad_norm": 1.4235036556142022, + "learning_rate": 8.838988397198167e-06, + "loss": 0.5696, + "step": 2515 + }, + { + "epoch": 0.2448661800486618, + "grad_norm": 1.6731038078758624, + "learning_rate": 8.837978375815097e-06, + "loss": 0.5026, + "step": 2516 + }, + { + "epoch": 0.24496350364963504, + "grad_norm": 1.2803102163564937, + "learning_rate": 8.836967973049757e-06, + "loss": 0.2605, + "step": 2517 + }, + { + "epoch": 0.24506082725060827, + "grad_norm": 1.2869808613318177, + "learning_rate": 8.835957189002551e-06, + "loss": 0.3073, + "step": 2518 + }, + { + "epoch": 0.24515815085158152, + "grad_norm": 1.4129342483481067, + "learning_rate": 8.834946023773921e-06, + "loss": 0.334, + "step": 2519 + }, + { + "epoch": 0.24525547445255474, + "grad_norm": 1.6342111830003216, + "learning_rate": 8.833934477464348e-06, + "loss": 0.6127, + "step": 2520 + }, + { + "epoch": 0.24535279805352797, + "grad_norm": 1.6465764681762454, + "learning_rate": 8.83292255017435e-06, + "loss": 0.6432, + "step": 2521 + }, + { + "epoch": 0.24545012165450122, + "grad_norm": 1.4262158711234114, + "learning_rate": 8.83191024200448e-06, + "loss": 0.5224, + "step": 2522 + }, + { + "epoch": 0.24554744525547445, + "grad_norm": 1.593193256147642, + "learning_rate": 8.830897553055337e-06, + "loss": 0.5211, + "step": 2523 + }, + { + "epoch": 0.24564476885644768, + "grad_norm": 1.624031218270973, + "learning_rate": 8.829884483427547e-06, + "loss": 0.5128, + "step": 2524 + }, + { + "epoch": 0.24574209245742093, + "grad_norm": 1.991662408778961, + "learning_rate": 8.828871033221783e-06, + "loss": 0.3025, + "step": 2525 + }, + { + "epoch": 0.24583941605839416, + "grad_norm": 1.4390691402915812, + "learning_rate": 8.82785720253875e-06, + "loss": 0.5088, + "step": 2526 + }, + { + "epoch": 0.2459367396593674, + "grad_norm": 1.4179406701872763, + "learning_rate": 8.826842991479197e-06, + "loss": 0.3887, + "step": 2527 + }, + { + "epoch": 0.24603406326034064, + "grad_norm": 1.460230365502962, + "learning_rate": 8.825828400143902e-06, + "loss": 0.3316, + "step": 2528 + }, + { + "epoch": 0.24613138686131386, + "grad_norm": 1.4924241123043909, + "learning_rate": 8.824813428633685e-06, + "loss": 0.4989, + "step": 2529 + }, + { + "epoch": 0.24622871046228711, + "grad_norm": 1.593556186634644, + "learning_rate": 8.82379807704941e-06, + "loss": 0.525, + "step": 2530 + }, + { + "epoch": 0.24632603406326034, + "grad_norm": 1.6809566227650843, + "learning_rate": 8.822782345491968e-06, + "loss": 0.3421, + "step": 2531 + }, + { + "epoch": 0.24642335766423357, + "grad_norm": 1.4773288736144092, + "learning_rate": 8.821766234062294e-06, + "loss": 0.534, + "step": 2532 + }, + { + "epoch": 0.24652068126520682, + "grad_norm": 1.4001059355846526, + "learning_rate": 8.820749742861363e-06, + "loss": 0.3887, + "step": 2533 + }, + { + "epoch": 0.24661800486618005, + "grad_norm": 1.349012582441713, + "learning_rate": 8.81973287199018e-06, + "loss": 0.2852, + "step": 2534 + }, + { + "epoch": 0.24671532846715327, + "grad_norm": 1.651550318908522, + "learning_rate": 8.818715621549794e-06, + "loss": 0.4967, + "step": 2535 + }, + { + "epoch": 0.24681265206812653, + "grad_norm": 1.5932669562049986, + "learning_rate": 8.817697991641289e-06, + "loss": 0.4173, + "step": 2536 + }, + { + "epoch": 0.24690997566909975, + "grad_norm": 1.3550488264007063, + "learning_rate": 8.816679982365787e-06, + "loss": 0.3404, + "step": 2537 + }, + { + "epoch": 0.247007299270073, + "grad_norm": 1.571341106532058, + "learning_rate": 8.815661593824451e-06, + "loss": 0.5666, + "step": 2538 + }, + { + "epoch": 0.24710462287104623, + "grad_norm": 1.5685299297246114, + "learning_rate": 8.814642826118477e-06, + "loss": 0.4521, + "step": 2539 + }, + { + "epoch": 0.24720194647201946, + "grad_norm": 1.5355691524375334, + "learning_rate": 8.8136236793491e-06, + "loss": 0.3452, + "step": 2540 + }, + { + "epoch": 0.2472992700729927, + "grad_norm": 1.4490992247448509, + "learning_rate": 8.812604153617594e-06, + "loss": 0.3046, + "step": 2541 + }, + { + "epoch": 0.24739659367396594, + "grad_norm": 1.5790493967738255, + "learning_rate": 8.81158424902527e-06, + "loss": 0.5957, + "step": 2542 + }, + { + "epoch": 0.24749391727493916, + "grad_norm": 1.8299083651337236, + "learning_rate": 8.810563965673478e-06, + "loss": 0.529, + "step": 2543 + }, + { + "epoch": 0.24759124087591242, + "grad_norm": 1.336357630649535, + "learning_rate": 8.8095433036636e-06, + "loss": 0.2498, + "step": 2544 + }, + { + "epoch": 0.24768856447688564, + "grad_norm": 3.272954864246679, + "learning_rate": 8.808522263097065e-06, + "loss": 0.3439, + "step": 2545 + }, + { + "epoch": 0.24778588807785887, + "grad_norm": 1.5948700054852, + "learning_rate": 8.80750084407533e-06, + "loss": 0.5754, + "step": 2546 + }, + { + "epoch": 0.24788321167883212, + "grad_norm": 1.2457293034288246, + "learning_rate": 8.806479046699896e-06, + "loss": 0.3355, + "step": 2547 + }, + { + "epoch": 0.24798053527980535, + "grad_norm": 1.4118835775208534, + "learning_rate": 8.8054568710723e-06, + "loss": 0.4843, + "step": 2548 + }, + { + "epoch": 0.2480778588807786, + "grad_norm": 2.0167817337794745, + "learning_rate": 8.804434317294115e-06, + "loss": 0.4781, + "step": 2549 + }, + { + "epoch": 0.24817518248175183, + "grad_norm": 1.630746510877536, + "learning_rate": 8.803411385466954e-06, + "loss": 0.5226, + "step": 2550 + }, + { + "epoch": 0.24827250608272505, + "grad_norm": 1.0942598516950242, + "learning_rate": 8.802388075692465e-06, + "loss": 0.1843, + "step": 2551 + }, + { + "epoch": 0.2483698296836983, + "grad_norm": 1.8060042956650721, + "learning_rate": 8.801364388072336e-06, + "loss": 0.705, + "step": 2552 + }, + { + "epoch": 0.24846715328467153, + "grad_norm": 1.632331667833736, + "learning_rate": 8.800340322708291e-06, + "loss": 0.4964, + "step": 2553 + }, + { + "epoch": 0.24856447688564476, + "grad_norm": 1.539098206701319, + "learning_rate": 8.799315879702095e-06, + "loss": 0.3962, + "step": 2554 + }, + { + "epoch": 0.248661800486618, + "grad_norm": 1.2219114137184675, + "learning_rate": 8.798291059155543e-06, + "loss": 0.2497, + "step": 2555 + }, + { + "epoch": 0.24875912408759124, + "grad_norm": 1.4540964796439875, + "learning_rate": 8.797265861170471e-06, + "loss": 0.5159, + "step": 2556 + }, + { + "epoch": 0.24885644768856446, + "grad_norm": 1.554150512584087, + "learning_rate": 8.796240285848761e-06, + "loss": 0.4412, + "step": 2557 + }, + { + "epoch": 0.24895377128953772, + "grad_norm": 1.7004545782091594, + "learning_rate": 8.795214333292318e-06, + "loss": 0.5179, + "step": 2558 + }, + { + "epoch": 0.24905109489051094, + "grad_norm": 1.726524110945535, + "learning_rate": 8.794188003603095e-06, + "loss": 0.4071, + "step": 2559 + }, + { + "epoch": 0.2491484184914842, + "grad_norm": 1.27126477948415, + "learning_rate": 8.793161296883077e-06, + "loss": 0.2268, + "step": 2560 + }, + { + "epoch": 0.24924574209245742, + "grad_norm": 1.9752049062158858, + "learning_rate": 8.79213421323429e-06, + "loss": 0.3632, + "step": 2561 + }, + { + "epoch": 0.24934306569343065, + "grad_norm": 1.0556825817929254, + "learning_rate": 8.791106752758796e-06, + "loss": 0.3627, + "step": 2562 + }, + { + "epoch": 0.2494403892944039, + "grad_norm": 1.6452772754401714, + "learning_rate": 8.790078915558693e-06, + "loss": 0.6043, + "step": 2563 + }, + { + "epoch": 0.24953771289537713, + "grad_norm": 1.278547180886592, + "learning_rate": 8.789050701736117e-06, + "loss": 0.3768, + "step": 2564 + }, + { + "epoch": 0.24963503649635035, + "grad_norm": 1.3443028399521961, + "learning_rate": 8.788022111393247e-06, + "loss": 0.3856, + "step": 2565 + }, + { + "epoch": 0.2497323600973236, + "grad_norm": 1.2774166354695482, + "learning_rate": 8.78699314463229e-06, + "loss": 0.4391, + "step": 2566 + }, + { + "epoch": 0.24982968369829683, + "grad_norm": 1.2231715277397497, + "learning_rate": 8.785963801555497e-06, + "loss": 0.4128, + "step": 2567 + }, + { + "epoch": 0.24992700729927006, + "grad_norm": 1.4012153782510572, + "learning_rate": 8.784934082265154e-06, + "loss": 0.4683, + "step": 2568 + }, + { + "epoch": 0.2500243309002433, + "grad_norm": 1.1954060436870173, + "learning_rate": 8.783903986863583e-06, + "loss": 0.2786, + "step": 2569 + }, + { + "epoch": 0.25012165450121654, + "grad_norm": 1.7116998515615807, + "learning_rate": 8.782873515453148e-06, + "loss": 0.6004, + "step": 2570 + }, + { + "epoch": 0.2502189781021898, + "grad_norm": 1.5712719922889962, + "learning_rate": 8.781842668136247e-06, + "loss": 0.6172, + "step": 2571 + }, + { + "epoch": 0.250316301703163, + "grad_norm": 1.246915874910697, + "learning_rate": 8.780811445015316e-06, + "loss": 0.4335, + "step": 2572 + }, + { + "epoch": 0.25041362530413624, + "grad_norm": 1.341456518636559, + "learning_rate": 8.779779846192827e-06, + "loss": 0.4187, + "step": 2573 + }, + { + "epoch": 0.2505109489051095, + "grad_norm": 1.1323562755710477, + "learning_rate": 8.778747871771293e-06, + "loss": 0.2832, + "step": 2574 + }, + { + "epoch": 0.25060827250608275, + "grad_norm": 1.4401083791532063, + "learning_rate": 8.777715521853258e-06, + "loss": 0.3779, + "step": 2575 + }, + { + "epoch": 0.25070559610705595, + "grad_norm": 1.4784987737181619, + "learning_rate": 8.77668279654131e-06, + "loss": 0.3129, + "step": 2576 + }, + { + "epoch": 0.2508029197080292, + "grad_norm": 1.1394717513462493, + "learning_rate": 8.775649695938074e-06, + "loss": 0.3162, + "step": 2577 + }, + { + "epoch": 0.25090024330900246, + "grad_norm": 1.4625556674372375, + "learning_rate": 8.774616220146204e-06, + "loss": 0.4605, + "step": 2578 + }, + { + "epoch": 0.25099756690997566, + "grad_norm": 3.1521808341091875, + "learning_rate": 8.773582369268402e-06, + "loss": 0.3485, + "step": 2579 + }, + { + "epoch": 0.2510948905109489, + "grad_norm": 1.3578124438111323, + "learning_rate": 8.7725481434074e-06, + "loss": 0.4693, + "step": 2580 + }, + { + "epoch": 0.25119221411192216, + "grad_norm": 1.63411664215404, + "learning_rate": 8.771513542665969e-06, + "loss": 0.4956, + "step": 2581 + }, + { + "epoch": 0.25128953771289536, + "grad_norm": 1.5098765580454843, + "learning_rate": 8.77047856714692e-06, + "loss": 0.4657, + "step": 2582 + }, + { + "epoch": 0.2513868613138686, + "grad_norm": 1.2801786921613054, + "learning_rate": 8.7694432169531e-06, + "loss": 0.3369, + "step": 2583 + }, + { + "epoch": 0.25148418491484187, + "grad_norm": 1.4360422953324754, + "learning_rate": 8.768407492187388e-06, + "loss": 0.4907, + "step": 2584 + }, + { + "epoch": 0.25158150851581507, + "grad_norm": 1.4560406874169747, + "learning_rate": 8.767371392952708e-06, + "loss": 0.3157, + "step": 2585 + }, + { + "epoch": 0.2516788321167883, + "grad_norm": 1.934211832538441, + "learning_rate": 8.766334919352018e-06, + "loss": 0.7151, + "step": 2586 + }, + { + "epoch": 0.2517761557177616, + "grad_norm": 1.6767044903158872, + "learning_rate": 8.76529807148831e-06, + "loss": 0.331, + "step": 2587 + }, + { + "epoch": 0.25187347931873477, + "grad_norm": 1.4698852047894042, + "learning_rate": 8.76426084946462e-06, + "loss": 0.3951, + "step": 2588 + }, + { + "epoch": 0.251970802919708, + "grad_norm": 1.3539539414605721, + "learning_rate": 8.763223253384015e-06, + "loss": 0.4011, + "step": 2589 + }, + { + "epoch": 0.2520681265206813, + "grad_norm": 1.506242240790805, + "learning_rate": 8.762185283349603e-06, + "loss": 0.5274, + "step": 2590 + }, + { + "epoch": 0.2521654501216545, + "grad_norm": 1.3140936667503142, + "learning_rate": 8.761146939464527e-06, + "loss": 0.3198, + "step": 2591 + }, + { + "epoch": 0.25226277372262773, + "grad_norm": 1.1404767919952752, + "learning_rate": 8.760108221831967e-06, + "loss": 0.4013, + "step": 2592 + }, + { + "epoch": 0.252360097323601, + "grad_norm": 1.4693477307137552, + "learning_rate": 8.759069130555142e-06, + "loss": 0.4783, + "step": 2593 + }, + { + "epoch": 0.2524574209245742, + "grad_norm": 1.3352582665983712, + "learning_rate": 8.75802966573731e-06, + "loss": 0.4617, + "step": 2594 + }, + { + "epoch": 0.25255474452554744, + "grad_norm": 1.2824428866870197, + "learning_rate": 8.756989827481756e-06, + "loss": 0.3352, + "step": 2595 + }, + { + "epoch": 0.2526520681265207, + "grad_norm": 1.4774059328965283, + "learning_rate": 8.755949615891814e-06, + "loss": 0.4635, + "step": 2596 + }, + { + "epoch": 0.25274939172749394, + "grad_norm": 1.6875827910282526, + "learning_rate": 8.754909031070852e-06, + "loss": 0.6222, + "step": 2597 + }, + { + "epoch": 0.25284671532846714, + "grad_norm": 1.2063205441417741, + "learning_rate": 8.75386807312227e-06, + "loss": 0.2455, + "step": 2598 + }, + { + "epoch": 0.2529440389294404, + "grad_norm": 1.3021547323360578, + "learning_rate": 8.752826742149512e-06, + "loss": 0.4329, + "step": 2599 + }, + { + "epoch": 0.25304136253041365, + "grad_norm": 1.1835878076183852, + "learning_rate": 8.751785038256054e-06, + "loss": 0.3662, + "step": 2600 + }, + { + "epoch": 0.25313868613138685, + "grad_norm": 1.544717999496196, + "learning_rate": 8.750742961545409e-06, + "loss": 0.3971, + "step": 2601 + }, + { + "epoch": 0.2532360097323601, + "grad_norm": 1.3629505007649398, + "learning_rate": 8.749700512121131e-06, + "loss": 0.5107, + "step": 2602 + }, + { + "epoch": 0.25333333333333335, + "grad_norm": 1.5737001686599814, + "learning_rate": 8.74865769008681e-06, + "loss": 0.5279, + "step": 2603 + }, + { + "epoch": 0.25343065693430655, + "grad_norm": 1.4784815997261378, + "learning_rate": 8.747614495546069e-06, + "loss": 0.4792, + "step": 2604 + }, + { + "epoch": 0.2535279805352798, + "grad_norm": 1.3236076722973804, + "learning_rate": 8.74657092860257e-06, + "loss": 0.3975, + "step": 2605 + }, + { + "epoch": 0.25362530413625306, + "grad_norm": 1.0968595172191475, + "learning_rate": 8.745526989360018e-06, + "loss": 0.269, + "step": 2606 + }, + { + "epoch": 0.25372262773722626, + "grad_norm": 1.562505340567045, + "learning_rate": 8.744482677922147e-06, + "loss": 0.5157, + "step": 2607 + }, + { + "epoch": 0.2538199513381995, + "grad_norm": 1.656826278908397, + "learning_rate": 8.743437994392729e-06, + "loss": 0.4867, + "step": 2608 + }, + { + "epoch": 0.25391727493917277, + "grad_norm": 1.3948672448161548, + "learning_rate": 8.742392938875577e-06, + "loss": 0.5279, + "step": 2609 + }, + { + "epoch": 0.25401459854014596, + "grad_norm": 1.5892338813179163, + "learning_rate": 8.741347511474539e-06, + "loss": 0.5611, + "step": 2610 + }, + { + "epoch": 0.2541119221411192, + "grad_norm": 1.6074249897923194, + "learning_rate": 8.740301712293498e-06, + "loss": 0.351, + "step": 2611 + }, + { + "epoch": 0.25420924574209247, + "grad_norm": 1.6505540033315536, + "learning_rate": 8.739255541436379e-06, + "loss": 0.5747, + "step": 2612 + }, + { + "epoch": 0.25430656934306567, + "grad_norm": 1.3314247428577628, + "learning_rate": 8.738208999007137e-06, + "loss": 0.3779, + "step": 2613 + }, + { + "epoch": 0.2544038929440389, + "grad_norm": 1.2796270745139389, + "learning_rate": 8.737162085109768e-06, + "loss": 0.3557, + "step": 2614 + }, + { + "epoch": 0.2545012165450122, + "grad_norm": 1.602637102567401, + "learning_rate": 8.736114799848307e-06, + "loss": 0.2882, + "step": 2615 + }, + { + "epoch": 0.2545985401459854, + "grad_norm": 1.4207119219562419, + "learning_rate": 8.735067143326821e-06, + "loss": 0.3881, + "step": 2616 + }, + { + "epoch": 0.25469586374695863, + "grad_norm": 1.4305110706379638, + "learning_rate": 8.73401911564942e-06, + "loss": 0.3486, + "step": 2617 + }, + { + "epoch": 0.2547931873479319, + "grad_norm": 1.434428707272536, + "learning_rate": 8.732970716920242e-06, + "loss": 0.3169, + "step": 2618 + }, + { + "epoch": 0.25489051094890514, + "grad_norm": 1.3228470441064362, + "learning_rate": 8.73192194724347e-06, + "loss": 0.4485, + "step": 2619 + }, + { + "epoch": 0.25498783454987833, + "grad_norm": 1.3897030806906485, + "learning_rate": 8.730872806723318e-06, + "loss": 0.4172, + "step": 2620 + }, + { + "epoch": 0.2550851581508516, + "grad_norm": 1.3840681937318722, + "learning_rate": 8.729823295464045e-06, + "loss": 0.251, + "step": 2621 + }, + { + "epoch": 0.25518248175182484, + "grad_norm": 1.775278354364079, + "learning_rate": 8.728773413569938e-06, + "loss": 0.4811, + "step": 2622 + }, + { + "epoch": 0.25527980535279804, + "grad_norm": 1.2701408917829737, + "learning_rate": 8.727723161145325e-06, + "loss": 0.2827, + "step": 2623 + }, + { + "epoch": 0.2553771289537713, + "grad_norm": 1.5528362504659363, + "learning_rate": 8.72667253829457e-06, + "loss": 0.5084, + "step": 2624 + }, + { + "epoch": 0.25547445255474455, + "grad_norm": 1.3793988523162408, + "learning_rate": 8.725621545122074e-06, + "loss": 0.3979, + "step": 2625 + }, + { + "epoch": 0.25557177615571774, + "grad_norm": 1.70282889775673, + "learning_rate": 8.724570181732275e-06, + "loss": 0.5983, + "step": 2626 + }, + { + "epoch": 0.255669099756691, + "grad_norm": 1.28105292316495, + "learning_rate": 8.723518448229649e-06, + "loss": 0.4756, + "step": 2627 + }, + { + "epoch": 0.25576642335766425, + "grad_norm": 1.3826686116158597, + "learning_rate": 8.722466344718705e-06, + "loss": 0.2978, + "step": 2628 + }, + { + "epoch": 0.25586374695863745, + "grad_norm": 1.460242284502631, + "learning_rate": 8.721413871303992e-06, + "loss": 0.4036, + "step": 2629 + }, + { + "epoch": 0.2559610705596107, + "grad_norm": 1.4181157816170762, + "learning_rate": 8.720361028090095e-06, + "loss": 0.4224, + "step": 2630 + }, + { + "epoch": 0.25605839416058396, + "grad_norm": 1.7898330028782403, + "learning_rate": 8.719307815181638e-06, + "loss": 0.7314, + "step": 2631 + }, + { + "epoch": 0.25615571776155716, + "grad_norm": 1.6886124652733636, + "learning_rate": 8.718254232683276e-06, + "loss": 0.3513, + "step": 2632 + }, + { + "epoch": 0.2562530413625304, + "grad_norm": 1.2562027575971086, + "learning_rate": 8.717200280699705e-06, + "loss": 0.284, + "step": 2633 + }, + { + "epoch": 0.25635036496350366, + "grad_norm": 1.4899596514775177, + "learning_rate": 8.716145959335658e-06, + "loss": 0.2778, + "step": 2634 + }, + { + "epoch": 0.25644768856447686, + "grad_norm": 1.1699021581347986, + "learning_rate": 8.715091268695903e-06, + "loss": 0.3163, + "step": 2635 + }, + { + "epoch": 0.2565450121654501, + "grad_norm": 1.020653527182934, + "learning_rate": 8.714036208885243e-06, + "loss": 0.2191, + "step": 2636 + }, + { + "epoch": 0.25664233576642337, + "grad_norm": 1.5373942827305265, + "learning_rate": 8.712980780008526e-06, + "loss": 0.4183, + "step": 2637 + }, + { + "epoch": 0.25673965936739657, + "grad_norm": 1.1268355971062876, + "learning_rate": 8.711924982170623e-06, + "loss": 0.2851, + "step": 2638 + }, + { + "epoch": 0.2568369829683698, + "grad_norm": 1.25228244300652, + "learning_rate": 8.710868815476456e-06, + "loss": 0.1963, + "step": 2639 + }, + { + "epoch": 0.2569343065693431, + "grad_norm": 1.3905442460862172, + "learning_rate": 8.709812280030971e-06, + "loss": 0.3648, + "step": 2640 + }, + { + "epoch": 0.2570316301703163, + "grad_norm": 1.5078176389616522, + "learning_rate": 8.708755375939162e-06, + "loss": 0.4131, + "step": 2641 + }, + { + "epoch": 0.2571289537712895, + "grad_norm": 1.4441200079463874, + "learning_rate": 8.70769810330605e-06, + "loss": 0.4047, + "step": 2642 + }, + { + "epoch": 0.2572262773722628, + "grad_norm": 1.3883503516178042, + "learning_rate": 8.7066404622367e-06, + "loss": 0.3308, + "step": 2643 + }, + { + "epoch": 0.25732360097323603, + "grad_norm": 1.7851696055640995, + "learning_rate": 8.705582452836208e-06, + "loss": 0.336, + "step": 2644 + }, + { + "epoch": 0.25742092457420923, + "grad_norm": 1.309628752016819, + "learning_rate": 8.70452407520971e-06, + "loss": 0.3462, + "step": 2645 + }, + { + "epoch": 0.2575182481751825, + "grad_norm": 1.3618437175125289, + "learning_rate": 8.703465329462379e-06, + "loss": 0.3047, + "step": 2646 + }, + { + "epoch": 0.25761557177615574, + "grad_norm": 1.5821297320572192, + "learning_rate": 8.702406215699421e-06, + "loss": 0.2318, + "step": 2647 + }, + { + "epoch": 0.25771289537712894, + "grad_norm": 1.4729014225467234, + "learning_rate": 8.701346734026082e-06, + "loss": 0.3147, + "step": 2648 + }, + { + "epoch": 0.2578102189781022, + "grad_norm": 1.6287249640343295, + "learning_rate": 8.700286884547642e-06, + "loss": 0.5808, + "step": 2649 + }, + { + "epoch": 0.25790754257907544, + "grad_norm": 1.2824109098190504, + "learning_rate": 8.69922666736942e-06, + "loss": 0.3836, + "step": 2650 + }, + { + "epoch": 0.25800486618004864, + "grad_norm": 1.5096397594183033, + "learning_rate": 8.69816608259677e-06, + "loss": 0.3804, + "step": 2651 + }, + { + "epoch": 0.2581021897810219, + "grad_norm": 1.7247008216261863, + "learning_rate": 8.697105130335084e-06, + "loss": 0.3378, + "step": 2652 + }, + { + "epoch": 0.25819951338199515, + "grad_norm": 1.5872130127065738, + "learning_rate": 8.69604381068979e-06, + "loss": 0.4369, + "step": 2653 + }, + { + "epoch": 0.25829683698296835, + "grad_norm": 1.5909295650502344, + "learning_rate": 8.694982123766348e-06, + "loss": 0.3554, + "step": 2654 + }, + { + "epoch": 0.2583941605839416, + "grad_norm": 1.7135035115393307, + "learning_rate": 8.693920069670265e-06, + "loss": 0.4869, + "step": 2655 + }, + { + "epoch": 0.25849148418491485, + "grad_norm": 1.3366492087792976, + "learning_rate": 8.692857648507071e-06, + "loss": 0.3102, + "step": 2656 + }, + { + "epoch": 0.25858880778588805, + "grad_norm": 1.2478048122674565, + "learning_rate": 8.691794860382345e-06, + "loss": 0.3722, + "step": 2657 + }, + { + "epoch": 0.2586861313868613, + "grad_norm": 1.5080776475601503, + "learning_rate": 8.690731705401694e-06, + "loss": 0.316, + "step": 2658 + }, + { + "epoch": 0.25878345498783456, + "grad_norm": 1.443811575497146, + "learning_rate": 8.689668183670763e-06, + "loss": 0.2875, + "step": 2659 + }, + { + "epoch": 0.25888077858880776, + "grad_norm": 1.7036441396737687, + "learning_rate": 8.688604295295238e-06, + "loss": 0.4025, + "step": 2660 + }, + { + "epoch": 0.258978102189781, + "grad_norm": 1.4234806259439374, + "learning_rate": 8.687540040380838e-06, + "loss": 0.4452, + "step": 2661 + }, + { + "epoch": 0.25907542579075427, + "grad_norm": 1.2741393980838642, + "learning_rate": 8.686475419033315e-06, + "loss": 0.2237, + "step": 2662 + }, + { + "epoch": 0.2591727493917275, + "grad_norm": 1.1826384563722763, + "learning_rate": 8.685410431358464e-06, + "loss": 0.3398, + "step": 2663 + }, + { + "epoch": 0.2592700729927007, + "grad_norm": 1.5757741509023746, + "learning_rate": 8.684345077462117e-06, + "loss": 0.3846, + "step": 2664 + }, + { + "epoch": 0.25936739659367397, + "grad_norm": 1.475707275733763, + "learning_rate": 8.683279357450131e-06, + "loss": 0.2804, + "step": 2665 + }, + { + "epoch": 0.2594647201946472, + "grad_norm": 1.4241797244636094, + "learning_rate": 8.682213271428415e-06, + "loss": 0.2553, + "step": 2666 + }, + { + "epoch": 0.2595620437956204, + "grad_norm": 1.1548194283365685, + "learning_rate": 8.6811468195029e-06, + "loss": 0.3118, + "step": 2667 + }, + { + "epoch": 0.2596593673965937, + "grad_norm": 1.5918458521510486, + "learning_rate": 8.680080001779564e-06, + "loss": 0.4525, + "step": 2668 + }, + { + "epoch": 0.25975669099756693, + "grad_norm": 1.5508802560099362, + "learning_rate": 8.679012818364416e-06, + "loss": 0.4163, + "step": 2669 + }, + { + "epoch": 0.25985401459854013, + "grad_norm": 2.4434630008376232, + "learning_rate": 8.677945269363504e-06, + "loss": 0.4372, + "step": 2670 + }, + { + "epoch": 0.2599513381995134, + "grad_norm": 1.5324792404386718, + "learning_rate": 8.676877354882907e-06, + "loss": 0.3514, + "step": 2671 + }, + { + "epoch": 0.26004866180048664, + "grad_norm": 2.0012246197360493, + "learning_rate": 8.67580907502875e-06, + "loss": 0.5067, + "step": 2672 + }, + { + "epoch": 0.26014598540145983, + "grad_norm": 1.5232176793280576, + "learning_rate": 8.674740429907186e-06, + "loss": 0.4174, + "step": 2673 + }, + { + "epoch": 0.2602433090024331, + "grad_norm": 1.3322865976928646, + "learning_rate": 8.673671419624405e-06, + "loss": 0.4095, + "step": 2674 + }, + { + "epoch": 0.26034063260340634, + "grad_norm": 1.514406481268828, + "learning_rate": 8.672602044286638e-06, + "loss": 0.5915, + "step": 2675 + }, + { + "epoch": 0.26043795620437954, + "grad_norm": 1.528467413797325, + "learning_rate": 8.67153230400015e-06, + "loss": 0.4018, + "step": 2676 + }, + { + "epoch": 0.2605352798053528, + "grad_norm": 1.4367698805538582, + "learning_rate": 8.670462198871237e-06, + "loss": 0.4115, + "step": 2677 + }, + { + "epoch": 0.26063260340632605, + "grad_norm": 1.6984444092554742, + "learning_rate": 8.66939172900624e-06, + "loss": 0.59, + "step": 2678 + }, + { + "epoch": 0.26072992700729924, + "grad_norm": 1.4698751482200727, + "learning_rate": 8.668320894511534e-06, + "loss": 0.4144, + "step": 2679 + }, + { + "epoch": 0.2608272506082725, + "grad_norm": 1.5003641004534345, + "learning_rate": 8.667249695493525e-06, + "loss": 0.4294, + "step": 2680 + }, + { + "epoch": 0.26092457420924575, + "grad_norm": 1.3123452231563197, + "learning_rate": 8.666178132058659e-06, + "loss": 0.3408, + "step": 2681 + }, + { + "epoch": 0.261021897810219, + "grad_norm": 1.5184535738040659, + "learning_rate": 8.665106204313418e-06, + "loss": 0.3662, + "step": 2682 + }, + { + "epoch": 0.2611192214111922, + "grad_norm": 1.0623024588559944, + "learning_rate": 8.664033912364321e-06, + "loss": 0.2953, + "step": 2683 + }, + { + "epoch": 0.26121654501216546, + "grad_norm": 1.4112725317400583, + "learning_rate": 8.662961256317923e-06, + "loss": 0.3825, + "step": 2684 + }, + { + "epoch": 0.2613138686131387, + "grad_norm": 2.2729536767377065, + "learning_rate": 8.661888236280813e-06, + "loss": 0.5791, + "step": 2685 + }, + { + "epoch": 0.2614111922141119, + "grad_norm": 2.2747614305768504, + "learning_rate": 8.660814852359617e-06, + "loss": 0.4859, + "step": 2686 + }, + { + "epoch": 0.26150851581508516, + "grad_norm": 1.6069562939941755, + "learning_rate": 8.659741104661002e-06, + "loss": 0.5254, + "step": 2687 + }, + { + "epoch": 0.2616058394160584, + "grad_norm": 1.3624858995460438, + "learning_rate": 8.658666993291662e-06, + "loss": 0.3904, + "step": 2688 + }, + { + "epoch": 0.2617031630170316, + "grad_norm": 1.2954398797770197, + "learning_rate": 8.657592518358332e-06, + "loss": 0.3789, + "step": 2689 + }, + { + "epoch": 0.26180048661800487, + "grad_norm": 1.4158991903907718, + "learning_rate": 8.656517679967788e-06, + "loss": 0.3732, + "step": 2690 + }, + { + "epoch": 0.2618978102189781, + "grad_norm": 1.3754641009755615, + "learning_rate": 8.655442478226835e-06, + "loss": 0.3035, + "step": 2691 + }, + { + "epoch": 0.2619951338199513, + "grad_norm": 1.3522608722257456, + "learning_rate": 8.654366913242316e-06, + "loss": 0.347, + "step": 2692 + }, + { + "epoch": 0.2620924574209246, + "grad_norm": 1.2764013704656585, + "learning_rate": 8.65329098512111e-06, + "loss": 0.4207, + "step": 2693 + }, + { + "epoch": 0.2621897810218978, + "grad_norm": 1.4009476621873176, + "learning_rate": 8.652214693970133e-06, + "loss": 0.4628, + "step": 2694 + }, + { + "epoch": 0.262287104622871, + "grad_norm": 1.3860597575903169, + "learning_rate": 8.65113803989634e-06, + "loss": 0.3844, + "step": 2695 + }, + { + "epoch": 0.2623844282238443, + "grad_norm": 1.5636622874346966, + "learning_rate": 8.650061023006711e-06, + "loss": 0.6239, + "step": 2696 + }, + { + "epoch": 0.26248175182481753, + "grad_norm": 1.3677003606993399, + "learning_rate": 8.648983643408276e-06, + "loss": 0.4319, + "step": 2697 + }, + { + "epoch": 0.26257907542579073, + "grad_norm": 1.4720449620822884, + "learning_rate": 8.647905901208096e-06, + "loss": 0.4824, + "step": 2698 + }, + { + "epoch": 0.262676399026764, + "grad_norm": 1.4180687903221385, + "learning_rate": 8.646827796513262e-06, + "loss": 0.539, + "step": 2699 + }, + { + "epoch": 0.26277372262773724, + "grad_norm": 1.3679667840460958, + "learning_rate": 8.64574932943091e-06, + "loss": 0.4588, + "step": 2700 + }, + { + "epoch": 0.26287104622871044, + "grad_norm": 1.125542933529368, + "learning_rate": 8.644670500068205e-06, + "loss": 0.3441, + "step": 2701 + }, + { + "epoch": 0.2629683698296837, + "grad_norm": 1.5641789380613262, + "learning_rate": 8.643591308532353e-06, + "loss": 0.4998, + "step": 2702 + }, + { + "epoch": 0.26306569343065694, + "grad_norm": 1.3425870342919086, + "learning_rate": 8.642511754930592e-06, + "loss": 0.4678, + "step": 2703 + }, + { + "epoch": 0.2631630170316302, + "grad_norm": 1.3010588112101855, + "learning_rate": 8.641431839370199e-06, + "loss": 0.4005, + "step": 2704 + }, + { + "epoch": 0.2632603406326034, + "grad_norm": 1.0067860306988832, + "learning_rate": 8.640351561958487e-06, + "loss": 0.2243, + "step": 2705 + }, + { + "epoch": 0.26335766423357665, + "grad_norm": 1.4713856201410829, + "learning_rate": 8.639270922802802e-06, + "loss": 0.4325, + "step": 2706 + }, + { + "epoch": 0.2634549878345499, + "grad_norm": 1.55962351192921, + "learning_rate": 8.63818992201053e-06, + "loss": 0.5307, + "step": 2707 + }, + { + "epoch": 0.2635523114355231, + "grad_norm": 1.4073629002175063, + "learning_rate": 8.637108559689088e-06, + "loss": 0.3329, + "step": 2708 + }, + { + "epoch": 0.26364963503649635, + "grad_norm": 1.2827086170801953, + "learning_rate": 8.636026835945933e-06, + "loss": 0.3095, + "step": 2709 + }, + { + "epoch": 0.2637469586374696, + "grad_norm": 1.4100209855486194, + "learning_rate": 8.634944750888556e-06, + "loss": 0.3033, + "step": 2710 + }, + { + "epoch": 0.2638442822384428, + "grad_norm": 1.343279822840104, + "learning_rate": 8.633862304624484e-06, + "loss": 0.402, + "step": 2711 + }, + { + "epoch": 0.26394160583941606, + "grad_norm": 1.4374516520455163, + "learning_rate": 8.632779497261284e-06, + "loss": 0.4574, + "step": 2712 + }, + { + "epoch": 0.2640389294403893, + "grad_norm": 1.1554648336740065, + "learning_rate": 8.63169632890655e-06, + "loss": 0.3091, + "step": 2713 + }, + { + "epoch": 0.2641362530413625, + "grad_norm": 1.5304191047752203, + "learning_rate": 8.630612799667923e-06, + "loss": 0.5392, + "step": 2714 + }, + { + "epoch": 0.26423357664233577, + "grad_norm": 1.9364214941018973, + "learning_rate": 8.629528909653067e-06, + "loss": 0.4705, + "step": 2715 + }, + { + "epoch": 0.264330900243309, + "grad_norm": 1.5176007479008755, + "learning_rate": 8.628444658969694e-06, + "loss": 0.3969, + "step": 2716 + }, + { + "epoch": 0.2644282238442822, + "grad_norm": 1.3882529784475808, + "learning_rate": 8.627360047725543e-06, + "loss": 0.4672, + "step": 2717 + }, + { + "epoch": 0.26452554744525547, + "grad_norm": 1.0419873824719341, + "learning_rate": 8.626275076028397e-06, + "loss": 0.2247, + "step": 2718 + }, + { + "epoch": 0.2646228710462287, + "grad_norm": 1.4147177174052021, + "learning_rate": 8.625189743986068e-06, + "loss": 0.3922, + "step": 2719 + }, + { + "epoch": 0.2647201946472019, + "grad_norm": 1.3513629744004096, + "learning_rate": 8.624104051706405e-06, + "loss": 0.415, + "step": 2720 + }, + { + "epoch": 0.2648175182481752, + "grad_norm": 1.3701041364422066, + "learning_rate": 8.623017999297294e-06, + "loss": 0.4329, + "step": 2721 + }, + { + "epoch": 0.26491484184914843, + "grad_norm": 1.5102917148163044, + "learning_rate": 8.621931586866658e-06, + "loss": 0.4104, + "step": 2722 + }, + { + "epoch": 0.26501216545012163, + "grad_norm": 1.4836677874290423, + "learning_rate": 8.620844814522455e-06, + "loss": 0.5131, + "step": 2723 + }, + { + "epoch": 0.2651094890510949, + "grad_norm": 1.2607364196409017, + "learning_rate": 8.619757682372675e-06, + "loss": 0.3856, + "step": 2724 + }, + { + "epoch": 0.26520681265206814, + "grad_norm": 1.4082529003642341, + "learning_rate": 8.61867019052535e-06, + "loss": 0.4719, + "step": 2725 + }, + { + "epoch": 0.2653041362530414, + "grad_norm": 1.4276001080419702, + "learning_rate": 8.617582339088545e-06, + "loss": 0.2825, + "step": 2726 + }, + { + "epoch": 0.2654014598540146, + "grad_norm": 1.4331001450603844, + "learning_rate": 8.61649412817036e-06, + "loss": 0.5104, + "step": 2727 + }, + { + "epoch": 0.26549878345498784, + "grad_norm": 1.358868383954866, + "learning_rate": 8.615405557878929e-06, + "loss": 0.4359, + "step": 2728 + }, + { + "epoch": 0.2655961070559611, + "grad_norm": 1.678463370024911, + "learning_rate": 8.614316628322427e-06, + "loss": 0.4658, + "step": 2729 + }, + { + "epoch": 0.2656934306569343, + "grad_norm": 1.2268291596580612, + "learning_rate": 8.61322733960906e-06, + "loss": 0.2337, + "step": 2730 + }, + { + "epoch": 0.26579075425790755, + "grad_norm": 0.9437944818586388, + "learning_rate": 8.61213769184707e-06, + "loss": 0.2525, + "step": 2731 + }, + { + "epoch": 0.2658880778588808, + "grad_norm": 1.2480121542051432, + "learning_rate": 8.611047685144737e-06, + "loss": 0.2656, + "step": 2732 + }, + { + "epoch": 0.265985401459854, + "grad_norm": 1.5255853623894704, + "learning_rate": 8.609957319610377e-06, + "loss": 0.5071, + "step": 2733 + }, + { + "epoch": 0.26608272506082725, + "grad_norm": 1.5847632660353408, + "learning_rate": 8.60886659535234e-06, + "loss": 0.4018, + "step": 2734 + }, + { + "epoch": 0.2661800486618005, + "grad_norm": 1.3469310633769445, + "learning_rate": 8.60777551247901e-06, + "loss": 0.451, + "step": 2735 + }, + { + "epoch": 0.2662773722627737, + "grad_norm": 1.3995570810499534, + "learning_rate": 8.60668407109881e-06, + "loss": 0.4991, + "step": 2736 + }, + { + "epoch": 0.26637469586374696, + "grad_norm": 1.5198269828404072, + "learning_rate": 8.605592271320199e-06, + "loss": 0.4266, + "step": 2737 + }, + { + "epoch": 0.2664720194647202, + "grad_norm": 1.3040716122405567, + "learning_rate": 8.604500113251666e-06, + "loss": 0.3465, + "step": 2738 + }, + { + "epoch": 0.2665693430656934, + "grad_norm": 1.3643506509353014, + "learning_rate": 8.60340759700174e-06, + "loss": 0.4355, + "step": 2739 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 1.026074804296968, + "learning_rate": 8.602314722678989e-06, + "loss": 0.2507, + "step": 2740 + }, + { + "epoch": 0.2667639902676399, + "grad_norm": 1.3894972782664292, + "learning_rate": 8.601221490392009e-06, + "loss": 0.3981, + "step": 2741 + }, + { + "epoch": 0.2668613138686131, + "grad_norm": 1.3071238902768438, + "learning_rate": 8.600127900249435e-06, + "loss": 0.5138, + "step": 2742 + }, + { + "epoch": 0.26695863746958637, + "grad_norm": 1.61583752885221, + "learning_rate": 8.59903395235994e-06, + "loss": 0.5072, + "step": 2743 + }, + { + "epoch": 0.2670559610705596, + "grad_norm": 1.3679578518174673, + "learning_rate": 8.597939646832227e-06, + "loss": 0.3754, + "step": 2744 + }, + { + "epoch": 0.2671532846715328, + "grad_norm": 1.0943121419181938, + "learning_rate": 8.596844983775042e-06, + "loss": 0.2457, + "step": 2745 + }, + { + "epoch": 0.2672506082725061, + "grad_norm": 1.176479145152164, + "learning_rate": 8.59574996329716e-06, + "loss": 0.3687, + "step": 2746 + }, + { + "epoch": 0.2673479318734793, + "grad_norm": 1.2666642902167933, + "learning_rate": 8.594654585507393e-06, + "loss": 0.2664, + "step": 2747 + }, + { + "epoch": 0.2674452554744526, + "grad_norm": 1.3951377938692817, + "learning_rate": 8.59355885051459e-06, + "loss": 0.4035, + "step": 2748 + }, + { + "epoch": 0.2675425790754258, + "grad_norm": 1.2722832533001889, + "learning_rate": 8.592462758427635e-06, + "loss": 0.4643, + "step": 2749 + }, + { + "epoch": 0.26763990267639903, + "grad_norm": 1.2157588835981379, + "learning_rate": 8.59136630935545e-06, + "loss": 0.3612, + "step": 2750 + }, + { + "epoch": 0.2677372262773723, + "grad_norm": 1.0785566378114326, + "learning_rate": 8.590269503406986e-06, + "loss": 0.3403, + "step": 2751 + }, + { + "epoch": 0.2678345498783455, + "grad_norm": 1.2447292785758555, + "learning_rate": 8.589172340691235e-06, + "loss": 0.3873, + "step": 2752 + }, + { + "epoch": 0.26793187347931874, + "grad_norm": 1.166378916722292, + "learning_rate": 8.588074821317222e-06, + "loss": 0.3264, + "step": 2753 + }, + { + "epoch": 0.268029197080292, + "grad_norm": 1.2197572995933224, + "learning_rate": 8.586976945394008e-06, + "loss": 0.3793, + "step": 2754 + }, + { + "epoch": 0.2681265206812652, + "grad_norm": 1.6234832434134598, + "learning_rate": 8.58587871303069e-06, + "loss": 0.5521, + "step": 2755 + }, + { + "epoch": 0.26822384428223844, + "grad_norm": 1.4760533014923396, + "learning_rate": 8.584780124336403e-06, + "loss": 0.5024, + "step": 2756 + }, + { + "epoch": 0.2683211678832117, + "grad_norm": 1.4156240197993037, + "learning_rate": 8.58368117942031e-06, + "loss": 0.2848, + "step": 2757 + }, + { + "epoch": 0.2684184914841849, + "grad_norm": 1.9092848960981135, + "learning_rate": 8.582581878391614e-06, + "loss": 0.4053, + "step": 2758 + }, + { + "epoch": 0.26851581508515815, + "grad_norm": 1.2158050168465575, + "learning_rate": 8.581482221359557e-06, + "loss": 0.2709, + "step": 2759 + }, + { + "epoch": 0.2686131386861314, + "grad_norm": 1.5515245630825936, + "learning_rate": 8.580382208433408e-06, + "loss": 0.4549, + "step": 2760 + }, + { + "epoch": 0.2687104622871046, + "grad_norm": 1.6603384837941395, + "learning_rate": 8.57928183972248e-06, + "loss": 0.3316, + "step": 2761 + }, + { + "epoch": 0.26880778588807785, + "grad_norm": 1.5595744401068579, + "learning_rate": 8.578181115336114e-06, + "loss": 0.5733, + "step": 2762 + }, + { + "epoch": 0.2689051094890511, + "grad_norm": 1.3547786308004384, + "learning_rate": 8.577080035383693e-06, + "loss": 0.5295, + "step": 2763 + }, + { + "epoch": 0.2690024330900243, + "grad_norm": 1.2889595684224195, + "learning_rate": 8.57597859997463e-06, + "loss": 0.3876, + "step": 2764 + }, + { + "epoch": 0.26909975669099756, + "grad_norm": 1.5401948742368967, + "learning_rate": 8.574876809218375e-06, + "loss": 0.4847, + "step": 2765 + }, + { + "epoch": 0.2691970802919708, + "grad_norm": 1.5886773556984544, + "learning_rate": 8.573774663224414e-06, + "loss": 0.4746, + "step": 2766 + }, + { + "epoch": 0.269294403892944, + "grad_norm": 1.2747463684628804, + "learning_rate": 8.572672162102269e-06, + "loss": 0.2568, + "step": 2767 + }, + { + "epoch": 0.26939172749391727, + "grad_norm": 1.1674673988315882, + "learning_rate": 8.571569305961495e-06, + "loss": 0.4329, + "step": 2768 + }, + { + "epoch": 0.2694890510948905, + "grad_norm": 1.6882113617461265, + "learning_rate": 8.570466094911684e-06, + "loss": 0.6891, + "step": 2769 + }, + { + "epoch": 0.2695863746958638, + "grad_norm": 1.6660737969996857, + "learning_rate": 8.569362529062461e-06, + "loss": 0.5887, + "step": 2770 + }, + { + "epoch": 0.26968369829683697, + "grad_norm": 1.1653044559020052, + "learning_rate": 8.568258608523491e-06, + "loss": 0.2452, + "step": 2771 + }, + { + "epoch": 0.2697810218978102, + "grad_norm": 1.5681206888540218, + "learning_rate": 8.567154333404471e-06, + "loss": 0.4952, + "step": 2772 + }, + { + "epoch": 0.2698783454987835, + "grad_norm": 1.3994591247160806, + "learning_rate": 8.56604970381513e-06, + "loss": 0.2848, + "step": 2773 + }, + { + "epoch": 0.2699756690997567, + "grad_norm": 1.300192393224716, + "learning_rate": 8.564944719865238e-06, + "loss": 0.3924, + "step": 2774 + }, + { + "epoch": 0.27007299270072993, + "grad_norm": 1.4412015443912716, + "learning_rate": 8.5638393816646e-06, + "loss": 0.4531, + "step": 2775 + }, + { + "epoch": 0.2701703163017032, + "grad_norm": 1.4360872043281558, + "learning_rate": 8.56273368932305e-06, + "loss": 0.4571, + "step": 2776 + }, + { + "epoch": 0.2702676399026764, + "grad_norm": 1.5811581309774965, + "learning_rate": 8.561627642950465e-06, + "loss": 0.4638, + "step": 2777 + }, + { + "epoch": 0.27036496350364964, + "grad_norm": 1.7924696283680308, + "learning_rate": 8.560521242656751e-06, + "loss": 0.2922, + "step": 2778 + }, + { + "epoch": 0.2704622871046229, + "grad_norm": 1.7929283253885162, + "learning_rate": 8.559414488551854e-06, + "loss": 0.6197, + "step": 2779 + }, + { + "epoch": 0.2705596107055961, + "grad_norm": 1.5593955671219286, + "learning_rate": 8.558307380745751e-06, + "loss": 0.5448, + "step": 2780 + }, + { + "epoch": 0.27065693430656934, + "grad_norm": 1.3760682204767343, + "learning_rate": 8.557199919348455e-06, + "loss": 0.4434, + "step": 2781 + }, + { + "epoch": 0.2707542579075426, + "grad_norm": 3.203989647256839, + "learning_rate": 8.556092104470019e-06, + "loss": 0.4323, + "step": 2782 + }, + { + "epoch": 0.2708515815085158, + "grad_norm": 1.3460764595466628, + "learning_rate": 8.554983936220525e-06, + "loss": 0.3367, + "step": 2783 + }, + { + "epoch": 0.27094890510948905, + "grad_norm": 1.6160732245190643, + "learning_rate": 8.553875414710088e-06, + "loss": 0.5301, + "step": 2784 + }, + { + "epoch": 0.2710462287104623, + "grad_norm": 1.5749454761331767, + "learning_rate": 8.552766540048872e-06, + "loss": 0.3741, + "step": 2785 + }, + { + "epoch": 0.2711435523114355, + "grad_norm": 1.150423059184381, + "learning_rate": 8.551657312347057e-06, + "loss": 0.2796, + "step": 2786 + }, + { + "epoch": 0.27124087591240875, + "grad_norm": 1.4217054664233575, + "learning_rate": 8.550547731714874e-06, + "loss": 0.4543, + "step": 2787 + }, + { + "epoch": 0.271338199513382, + "grad_norm": 1.470206005686861, + "learning_rate": 8.54943779826258e-06, + "loss": 0.438, + "step": 2788 + }, + { + "epoch": 0.2714355231143552, + "grad_norm": 1.5766219733733982, + "learning_rate": 8.54832751210047e-06, + "loss": 0.4966, + "step": 2789 + }, + { + "epoch": 0.27153284671532846, + "grad_norm": 1.2135102045567707, + "learning_rate": 8.547216873338876e-06, + "loss": 0.358, + "step": 2790 + }, + { + "epoch": 0.2716301703163017, + "grad_norm": 1.4595225616938101, + "learning_rate": 8.546105882088158e-06, + "loss": 0.2225, + "step": 2791 + }, + { + "epoch": 0.27172749391727496, + "grad_norm": 1.3363330099445299, + "learning_rate": 8.54499453845872e-06, + "loss": 0.3914, + "step": 2792 + }, + { + "epoch": 0.27182481751824816, + "grad_norm": 1.3646141902938869, + "learning_rate": 8.543882842560997e-06, + "loss": 0.4558, + "step": 2793 + }, + { + "epoch": 0.2719221411192214, + "grad_norm": 1.3464180828493995, + "learning_rate": 8.542770794505456e-06, + "loss": 0.4786, + "step": 2794 + }, + { + "epoch": 0.27201946472019467, + "grad_norm": 1.044551377255888, + "learning_rate": 8.541658394402606e-06, + "loss": 0.303, + "step": 2795 + }, + { + "epoch": 0.27211678832116787, + "grad_norm": 1.6706499263846184, + "learning_rate": 8.540545642362982e-06, + "loss": 0.4033, + "step": 2796 + }, + { + "epoch": 0.2722141119221411, + "grad_norm": 1.3164784669169094, + "learning_rate": 8.539432538497162e-06, + "loss": 0.4343, + "step": 2797 + }, + { + "epoch": 0.2723114355231144, + "grad_norm": 1.6044535524867656, + "learning_rate": 8.538319082915757e-06, + "loss": 0.3641, + "step": 2798 + }, + { + "epoch": 0.2724087591240876, + "grad_norm": 1.9897822202433566, + "learning_rate": 8.537205275729406e-06, + "loss": 0.48, + "step": 2799 + }, + { + "epoch": 0.2725060827250608, + "grad_norm": 1.4110579632506512, + "learning_rate": 8.536091117048794e-06, + "loss": 0.4798, + "step": 2800 + }, + { + "epoch": 0.2726034063260341, + "grad_norm": 1.4415607317920478, + "learning_rate": 8.534976606984636e-06, + "loss": 0.343, + "step": 2801 + }, + { + "epoch": 0.2727007299270073, + "grad_norm": 1.6363482727427716, + "learning_rate": 8.53386174564768e-06, + "loss": 0.6087, + "step": 2802 + }, + { + "epoch": 0.27279805352798053, + "grad_norm": 1.1272383780084416, + "learning_rate": 8.532746533148708e-06, + "loss": 0.2444, + "step": 2803 + }, + { + "epoch": 0.2728953771289538, + "grad_norm": 1.712140222332907, + "learning_rate": 8.531630969598544e-06, + "loss": 0.6702, + "step": 2804 + }, + { + "epoch": 0.272992700729927, + "grad_norm": 1.474485197586056, + "learning_rate": 8.530515055108038e-06, + "loss": 0.3876, + "step": 2805 + }, + { + "epoch": 0.27309002433090024, + "grad_norm": 1.2926370708159094, + "learning_rate": 8.529398789788082e-06, + "loss": 0.3239, + "step": 2806 + }, + { + "epoch": 0.2731873479318735, + "grad_norm": 1.1171205940753008, + "learning_rate": 8.528282173749599e-06, + "loss": 0.3135, + "step": 2807 + }, + { + "epoch": 0.2732846715328467, + "grad_norm": 1.3561762741371761, + "learning_rate": 8.527165207103546e-06, + "loss": 0.4686, + "step": 2808 + }, + { + "epoch": 0.27338199513381994, + "grad_norm": 1.3082129080843141, + "learning_rate": 8.52604788996092e-06, + "loss": 0.4274, + "step": 2809 + }, + { + "epoch": 0.2734793187347932, + "grad_norm": 1.2958697823961909, + "learning_rate": 8.524930222432748e-06, + "loss": 0.4334, + "step": 2810 + }, + { + "epoch": 0.2735766423357664, + "grad_norm": 1.4541266485936315, + "learning_rate": 8.523812204630093e-06, + "loss": 0.5685, + "step": 2811 + }, + { + "epoch": 0.27367396593673965, + "grad_norm": 1.3303596097899522, + "learning_rate": 8.522693836664052e-06, + "loss": 0.4305, + "step": 2812 + }, + { + "epoch": 0.2737712895377129, + "grad_norm": 1.220005269273729, + "learning_rate": 8.521575118645761e-06, + "loss": 0.4281, + "step": 2813 + }, + { + "epoch": 0.27386861313868616, + "grad_norm": 1.0981673276035366, + "learning_rate": 8.520456050686384e-06, + "loss": 0.3641, + "step": 2814 + }, + { + "epoch": 0.27396593673965935, + "grad_norm": 1.4310281439998578, + "learning_rate": 8.519336632897128e-06, + "loss": 0.557, + "step": 2815 + }, + { + "epoch": 0.2740632603406326, + "grad_norm": 1.345841620727785, + "learning_rate": 8.518216865389227e-06, + "loss": 0.3991, + "step": 2816 + }, + { + "epoch": 0.27416058394160586, + "grad_norm": 1.6650753610183784, + "learning_rate": 8.517096748273951e-06, + "loss": 0.3624, + "step": 2817 + }, + { + "epoch": 0.27425790754257906, + "grad_norm": 1.2633026385457689, + "learning_rate": 8.515976281662613e-06, + "loss": 0.349, + "step": 2818 + }, + { + "epoch": 0.2743552311435523, + "grad_norm": 1.392024932172172, + "learning_rate": 8.514855465666546e-06, + "loss": 0.4514, + "step": 2819 + }, + { + "epoch": 0.27445255474452557, + "grad_norm": 1.4295145565971665, + "learning_rate": 8.513734300397135e-06, + "loss": 0.5668, + "step": 2820 + }, + { + "epoch": 0.27454987834549877, + "grad_norm": 1.0967459926110283, + "learning_rate": 8.512612785965787e-06, + "loss": 0.1808, + "step": 2821 + }, + { + "epoch": 0.274647201946472, + "grad_norm": 1.4843839946273536, + "learning_rate": 8.511490922483946e-06, + "loss": 0.4352, + "step": 2822 + }, + { + "epoch": 0.2747445255474453, + "grad_norm": 1.339649820333997, + "learning_rate": 8.510368710063093e-06, + "loss": 0.3137, + "step": 2823 + }, + { + "epoch": 0.27484184914841847, + "grad_norm": 1.32567882782868, + "learning_rate": 8.509246148814745e-06, + "loss": 0.4089, + "step": 2824 + }, + { + "epoch": 0.2749391727493917, + "grad_norm": 1.2497731956714773, + "learning_rate": 8.50812323885045e-06, + "loss": 0.382, + "step": 2825 + }, + { + "epoch": 0.275036496350365, + "grad_norm": 1.5771259884963846, + "learning_rate": 8.506999980281791e-06, + "loss": 0.501, + "step": 2826 + }, + { + "epoch": 0.2751338199513382, + "grad_norm": 1.3295615561309837, + "learning_rate": 8.505876373220393e-06, + "loss": 0.3635, + "step": 2827 + }, + { + "epoch": 0.27523114355231143, + "grad_norm": 1.55543645713159, + "learning_rate": 8.504752417777899e-06, + "loss": 0.2986, + "step": 2828 + }, + { + "epoch": 0.2753284671532847, + "grad_norm": 1.421283473121396, + "learning_rate": 8.503628114066008e-06, + "loss": 0.4931, + "step": 2829 + }, + { + "epoch": 0.2754257907542579, + "grad_norm": 1.1988827610585986, + "learning_rate": 8.502503462196435e-06, + "loss": 0.3272, + "step": 2830 + }, + { + "epoch": 0.27552311435523114, + "grad_norm": 1.6163491550131937, + "learning_rate": 8.501378462280941e-06, + "loss": 0.5794, + "step": 2831 + }, + { + "epoch": 0.2756204379562044, + "grad_norm": 1.6499795796835799, + "learning_rate": 8.500253114431316e-06, + "loss": 0.3668, + "step": 2832 + }, + { + "epoch": 0.27571776155717764, + "grad_norm": 1.7305434923413188, + "learning_rate": 8.499127418759388e-06, + "loss": 0.5291, + "step": 2833 + }, + { + "epoch": 0.27581508515815084, + "grad_norm": 1.4062980643641485, + "learning_rate": 8.498001375377018e-06, + "loss": 0.4645, + "step": 2834 + }, + { + "epoch": 0.2759124087591241, + "grad_norm": 1.2961260919749351, + "learning_rate": 8.496874984396101e-06, + "loss": 0.2517, + "step": 2835 + }, + { + "epoch": 0.27600973236009735, + "grad_norm": 1.4273972641674804, + "learning_rate": 8.495748245928568e-06, + "loss": 0.4705, + "step": 2836 + }, + { + "epoch": 0.27610705596107055, + "grad_norm": 1.1525746776855315, + "learning_rate": 8.494621160086383e-06, + "loss": 0.3747, + "step": 2837 + }, + { + "epoch": 0.2762043795620438, + "grad_norm": 1.6083708658269757, + "learning_rate": 8.493493726981545e-06, + "loss": 0.5754, + "step": 2838 + }, + { + "epoch": 0.27630170316301705, + "grad_norm": 1.6380932846987073, + "learning_rate": 8.492365946726087e-06, + "loss": 0.4668, + "step": 2839 + }, + { + "epoch": 0.27639902676399025, + "grad_norm": 1.3587028332396105, + "learning_rate": 8.491237819432081e-06, + "loss": 0.3466, + "step": 2840 + }, + { + "epoch": 0.2764963503649635, + "grad_norm": 1.5812508624530597, + "learning_rate": 8.490109345211625e-06, + "loss": 0.628, + "step": 2841 + }, + { + "epoch": 0.27659367396593676, + "grad_norm": 1.359461682943084, + "learning_rate": 8.48898052417686e-06, + "loss": 0.4799, + "step": 2842 + }, + { + "epoch": 0.27669099756690996, + "grad_norm": 1.3773089875645015, + "learning_rate": 8.487851356439953e-06, + "loss": 0.3064, + "step": 2843 + }, + { + "epoch": 0.2767883211678832, + "grad_norm": 1.445505572645753, + "learning_rate": 8.486721842113114e-06, + "loss": 0.4629, + "step": 2844 + }, + { + "epoch": 0.27688564476885646, + "grad_norm": 2.1729540442826796, + "learning_rate": 8.485591981308584e-06, + "loss": 0.501, + "step": 2845 + }, + { + "epoch": 0.27698296836982966, + "grad_norm": 1.2698072866971275, + "learning_rate": 8.484461774138635e-06, + "loss": 0.3354, + "step": 2846 + }, + { + "epoch": 0.2770802919708029, + "grad_norm": 1.2270792461817257, + "learning_rate": 8.483331220715578e-06, + "loss": 0.2925, + "step": 2847 + }, + { + "epoch": 0.27717761557177617, + "grad_norm": 1.4982940191444252, + "learning_rate": 8.482200321151757e-06, + "loss": 0.4372, + "step": 2848 + }, + { + "epoch": 0.27727493917274937, + "grad_norm": 1.7962422459275051, + "learning_rate": 8.48106907555955e-06, + "loss": 0.2514, + "step": 2849 + }, + { + "epoch": 0.2773722627737226, + "grad_norm": 1.1765428275481227, + "learning_rate": 8.479937484051368e-06, + "loss": 0.2466, + "step": 2850 + }, + { + "epoch": 0.2774695863746959, + "grad_norm": 1.3671035304850088, + "learning_rate": 8.47880554673966e-06, + "loss": 0.4388, + "step": 2851 + }, + { + "epoch": 0.2775669099756691, + "grad_norm": 1.584083262413021, + "learning_rate": 8.477673263736908e-06, + "loss": 0.3117, + "step": 2852 + }, + { + "epoch": 0.2776642335766423, + "grad_norm": 1.6251518472003594, + "learning_rate": 8.476540635155623e-06, + "loss": 0.4661, + "step": 2853 + }, + { + "epoch": 0.2777615571776156, + "grad_norm": 1.6392857489539867, + "learning_rate": 8.475407661108361e-06, + "loss": 0.354, + "step": 2854 + }, + { + "epoch": 0.27785888077858883, + "grad_norm": 1.3195625296951223, + "learning_rate": 8.474274341707702e-06, + "loss": 0.3744, + "step": 2855 + }, + { + "epoch": 0.27795620437956203, + "grad_norm": 1.34410915454318, + "learning_rate": 8.473140677066267e-06, + "loss": 0.4069, + "step": 2856 + }, + { + "epoch": 0.2780535279805353, + "grad_norm": 1.0527413957181246, + "learning_rate": 8.472006667296709e-06, + "loss": 0.2776, + "step": 2857 + }, + { + "epoch": 0.27815085158150854, + "grad_norm": 1.496471387248685, + "learning_rate": 8.470872312511714e-06, + "loss": 0.3642, + "step": 2858 + }, + { + "epoch": 0.27824817518248174, + "grad_norm": 1.532429299396127, + "learning_rate": 8.469737612824001e-06, + "loss": 0.44, + "step": 2859 + }, + { + "epoch": 0.278345498783455, + "grad_norm": 1.601112711944827, + "learning_rate": 8.468602568346332e-06, + "loss": 0.421, + "step": 2860 + }, + { + "epoch": 0.27844282238442825, + "grad_norm": 1.5148720198103927, + "learning_rate": 8.467467179191493e-06, + "loss": 0.5258, + "step": 2861 + }, + { + "epoch": 0.27854014598540144, + "grad_norm": 1.573048120862393, + "learning_rate": 8.466331445472308e-06, + "loss": 0.4507, + "step": 2862 + }, + { + "epoch": 0.2786374695863747, + "grad_norm": 1.3938890789758775, + "learning_rate": 8.465195367301639e-06, + "loss": 0.3365, + "step": 2863 + }, + { + "epoch": 0.27873479318734795, + "grad_norm": 1.6895380781567202, + "learning_rate": 8.464058944792375e-06, + "loss": 0.4132, + "step": 2864 + }, + { + "epoch": 0.27883211678832115, + "grad_norm": 1.6880546647255488, + "learning_rate": 8.462922178057444e-06, + "loss": 0.2605, + "step": 2865 + }, + { + "epoch": 0.2789294403892944, + "grad_norm": 1.491755717654464, + "learning_rate": 8.46178506720981e-06, + "loss": 0.3983, + "step": 2866 + }, + { + "epoch": 0.27902676399026766, + "grad_norm": 1.5848666178901887, + "learning_rate": 8.460647612362464e-06, + "loss": 0.5101, + "step": 2867 + }, + { + "epoch": 0.27912408759124085, + "grad_norm": 1.3442317187907376, + "learning_rate": 8.459509813628437e-06, + "loss": 0.458, + "step": 2868 + }, + { + "epoch": 0.2792214111922141, + "grad_norm": 1.8095809186860319, + "learning_rate": 8.458371671120795e-06, + "loss": 0.382, + "step": 2869 + }, + { + "epoch": 0.27931873479318736, + "grad_norm": 0.9909926300929587, + "learning_rate": 8.457233184952635e-06, + "loss": 0.2292, + "step": 2870 + }, + { + "epoch": 0.27941605839416056, + "grad_norm": 1.7013118787018624, + "learning_rate": 8.456094355237086e-06, + "loss": 0.6861, + "step": 2871 + }, + { + "epoch": 0.2795133819951338, + "grad_norm": 3.4293212695090025, + "learning_rate": 8.45495518208732e-06, + "loss": 0.3233, + "step": 2872 + }, + { + "epoch": 0.27961070559610707, + "grad_norm": 1.4903797163776311, + "learning_rate": 8.45381566561653e-06, + "loss": 0.3231, + "step": 2873 + }, + { + "epoch": 0.27970802919708027, + "grad_norm": 1.5615177882070261, + "learning_rate": 8.452675805937956e-06, + "loss": 0.4125, + "step": 2874 + }, + { + "epoch": 0.2798053527980535, + "grad_norm": 1.4099046900170047, + "learning_rate": 8.451535603164865e-06, + "loss": 0.4967, + "step": 2875 + }, + { + "epoch": 0.2799026763990268, + "grad_norm": 1.383217014263479, + "learning_rate": 8.450395057410561e-06, + "loss": 0.3411, + "step": 2876 + }, + { + "epoch": 0.28, + "grad_norm": 1.2661588037606646, + "learning_rate": 8.449254168788377e-06, + "loss": 0.3734, + "step": 2877 + }, + { + "epoch": 0.2800973236009732, + "grad_norm": 1.4107359648240771, + "learning_rate": 8.448112937411689e-06, + "loss": 0.4765, + "step": 2878 + }, + { + "epoch": 0.2801946472019465, + "grad_norm": 1.567373989947911, + "learning_rate": 8.446971363393897e-06, + "loss": 0.5806, + "step": 2879 + }, + { + "epoch": 0.28029197080291973, + "grad_norm": 1.5980994022663064, + "learning_rate": 8.445829446848442e-06, + "loss": 0.3765, + "step": 2880 + }, + { + "epoch": 0.28038929440389293, + "grad_norm": 1.5582627635759285, + "learning_rate": 8.444687187888798e-06, + "loss": 0.3838, + "step": 2881 + }, + { + "epoch": 0.2804866180048662, + "grad_norm": 2.097365147798996, + "learning_rate": 8.44354458662847e-06, + "loss": 0.6467, + "step": 2882 + }, + { + "epoch": 0.28058394160583944, + "grad_norm": 1.5302257615618868, + "learning_rate": 8.442401643181e-06, + "loss": 0.4415, + "step": 2883 + }, + { + "epoch": 0.28068126520681264, + "grad_norm": 1.1646338986978766, + "learning_rate": 8.441258357659962e-06, + "loss": 0.3176, + "step": 2884 + }, + { + "epoch": 0.2807785888077859, + "grad_norm": 1.2287928718701633, + "learning_rate": 8.440114730178968e-06, + "loss": 0.4175, + "step": 2885 + }, + { + "epoch": 0.28087591240875914, + "grad_norm": 1.4416072881006319, + "learning_rate": 8.438970760851658e-06, + "loss": 0.4838, + "step": 2886 + }, + { + "epoch": 0.28097323600973234, + "grad_norm": 1.319870372533973, + "learning_rate": 8.437826449791709e-06, + "loss": 0.3421, + "step": 2887 + }, + { + "epoch": 0.2810705596107056, + "grad_norm": 1.6261475252650914, + "learning_rate": 8.436681797112833e-06, + "loss": 0.5019, + "step": 2888 + }, + { + "epoch": 0.28116788321167885, + "grad_norm": 1.6203143716652342, + "learning_rate": 8.435536802928774e-06, + "loss": 0.4282, + "step": 2889 + }, + { + "epoch": 0.28126520681265205, + "grad_norm": 1.4127079920263665, + "learning_rate": 8.434391467353312e-06, + "loss": 0.4542, + "step": 2890 + }, + { + "epoch": 0.2813625304136253, + "grad_norm": 1.1756885783532405, + "learning_rate": 8.433245790500258e-06, + "loss": 0.3563, + "step": 2891 + }, + { + "epoch": 0.28145985401459855, + "grad_norm": 1.1824997482138238, + "learning_rate": 8.43209977248346e-06, + "loss": 0.3628, + "step": 2892 + }, + { + "epoch": 0.28155717761557175, + "grad_norm": 1.4280724079623635, + "learning_rate": 8.430953413416798e-06, + "loss": 0.446, + "step": 2893 + }, + { + "epoch": 0.281654501216545, + "grad_norm": 1.0710350994410123, + "learning_rate": 8.429806713414188e-06, + "loss": 0.2016, + "step": 2894 + }, + { + "epoch": 0.28175182481751826, + "grad_norm": 1.453985226232095, + "learning_rate": 8.428659672589574e-06, + "loss": 0.4325, + "step": 2895 + }, + { + "epoch": 0.28184914841849146, + "grad_norm": 1.3045306996673216, + "learning_rate": 8.427512291056943e-06, + "loss": 0.3838, + "step": 2896 + }, + { + "epoch": 0.2819464720194647, + "grad_norm": 1.483337521636422, + "learning_rate": 8.426364568930309e-06, + "loss": 0.4212, + "step": 2897 + }, + { + "epoch": 0.28204379562043796, + "grad_norm": 1.0901324802348065, + "learning_rate": 8.425216506323721e-06, + "loss": 0.2392, + "step": 2898 + }, + { + "epoch": 0.2821411192214112, + "grad_norm": 1.3761268679827663, + "learning_rate": 8.424068103351264e-06, + "loss": 0.4459, + "step": 2899 + }, + { + "epoch": 0.2822384428223844, + "grad_norm": 1.461105500215717, + "learning_rate": 8.422919360127053e-06, + "loss": 0.5018, + "step": 2900 + }, + { + "epoch": 0.28233576642335767, + "grad_norm": 1.4314465150478046, + "learning_rate": 8.421770276765245e-06, + "loss": 0.4474, + "step": 2901 + }, + { + "epoch": 0.2824330900243309, + "grad_norm": 1.6060806185106393, + "learning_rate": 8.420620853380018e-06, + "loss": 0.5798, + "step": 2902 + }, + { + "epoch": 0.2825304136253041, + "grad_norm": 1.4468000025910832, + "learning_rate": 8.419471090085596e-06, + "loss": 0.5597, + "step": 2903 + }, + { + "epoch": 0.2826277372262774, + "grad_norm": 5.585104457387235, + "learning_rate": 8.41832098699623e-06, + "loss": 0.3493, + "step": 2904 + }, + { + "epoch": 0.28272506082725063, + "grad_norm": 1.3577816273786794, + "learning_rate": 8.417170544226205e-06, + "loss": 0.3262, + "step": 2905 + }, + { + "epoch": 0.2828223844282238, + "grad_norm": 1.1546363912171016, + "learning_rate": 8.416019761889845e-06, + "loss": 0.3691, + "step": 2906 + }, + { + "epoch": 0.2829197080291971, + "grad_norm": 1.3224407401265832, + "learning_rate": 8.4148686401015e-06, + "loss": 0.3079, + "step": 2907 + }, + { + "epoch": 0.28301703163017033, + "grad_norm": 1.5947860641264806, + "learning_rate": 8.413717178975558e-06, + "loss": 0.277, + "step": 2908 + }, + { + "epoch": 0.28311435523114353, + "grad_norm": 1.343045870800707, + "learning_rate": 8.412565378626442e-06, + "loss": 0.3448, + "step": 2909 + }, + { + "epoch": 0.2832116788321168, + "grad_norm": 1.5567901041780798, + "learning_rate": 8.411413239168609e-06, + "loss": 0.3954, + "step": 2910 + }, + { + "epoch": 0.28330900243309004, + "grad_norm": 1.5232536009297208, + "learning_rate": 8.410260760716545e-06, + "loss": 0.5103, + "step": 2911 + }, + { + "epoch": 0.28340632603406324, + "grad_norm": 1.2493384040941995, + "learning_rate": 8.409107943384773e-06, + "loss": 0.3671, + "step": 2912 + }, + { + "epoch": 0.2835036496350365, + "grad_norm": 1.246217249188392, + "learning_rate": 8.407954787287848e-06, + "loss": 0.4112, + "step": 2913 + }, + { + "epoch": 0.28360097323600975, + "grad_norm": 1.2012340002353967, + "learning_rate": 8.406801292540364e-06, + "loss": 0.3769, + "step": 2914 + }, + { + "epoch": 0.28369829683698294, + "grad_norm": 1.51749407168492, + "learning_rate": 8.405647459256939e-06, + "loss": 0.5515, + "step": 2915 + }, + { + "epoch": 0.2837956204379562, + "grad_norm": 1.1589770762667257, + "learning_rate": 8.404493287552232e-06, + "loss": 0.2577, + "step": 2916 + }, + { + "epoch": 0.28389294403892945, + "grad_norm": 1.5139932402052954, + "learning_rate": 8.403338777540936e-06, + "loss": 0.4796, + "step": 2917 + }, + { + "epoch": 0.28399026763990265, + "grad_norm": 1.5544290759133006, + "learning_rate": 8.402183929337774e-06, + "loss": 0.4594, + "step": 2918 + }, + { + "epoch": 0.2840875912408759, + "grad_norm": 1.3525572627526583, + "learning_rate": 8.401028743057503e-06, + "loss": 0.3978, + "step": 2919 + }, + { + "epoch": 0.28418491484184916, + "grad_norm": 1.3610916698563846, + "learning_rate": 8.399873218814916e-06, + "loss": 0.4308, + "step": 2920 + }, + { + "epoch": 0.2842822384428224, + "grad_norm": 1.2060322500759533, + "learning_rate": 8.398717356724837e-06, + "loss": 0.482, + "step": 2921 + }, + { + "epoch": 0.2843795620437956, + "grad_norm": 1.152727586861314, + "learning_rate": 8.397561156902126e-06, + "loss": 0.3862, + "step": 2922 + }, + { + "epoch": 0.28447688564476886, + "grad_norm": 1.6371195081735355, + "learning_rate": 8.396404619461673e-06, + "loss": 0.684, + "step": 2923 + }, + { + "epoch": 0.2845742092457421, + "grad_norm": 1.4756480619833048, + "learning_rate": 8.395247744518407e-06, + "loss": 0.4432, + "step": 2924 + }, + { + "epoch": 0.2846715328467153, + "grad_norm": 1.3495353534897125, + "learning_rate": 8.394090532187286e-06, + "loss": 0.4574, + "step": 2925 + }, + { + "epoch": 0.28476885644768857, + "grad_norm": 1.361248347874279, + "learning_rate": 8.392932982583301e-06, + "loss": 0.3117, + "step": 2926 + }, + { + "epoch": 0.2848661800486618, + "grad_norm": 1.5493409509214389, + "learning_rate": 8.391775095821481e-06, + "loss": 0.5949, + "step": 2927 + }, + { + "epoch": 0.284963503649635, + "grad_norm": 1.5159424124979992, + "learning_rate": 8.390616872016886e-06, + "loss": 0.612, + "step": 2928 + }, + { + "epoch": 0.2850608272506083, + "grad_norm": 0.9819694068633834, + "learning_rate": 8.389458311284606e-06, + "loss": 0.2407, + "step": 2929 + }, + { + "epoch": 0.2851581508515815, + "grad_norm": 1.4338313849048412, + "learning_rate": 8.388299413739772e-06, + "loss": 0.504, + "step": 2930 + }, + { + "epoch": 0.2852554744525547, + "grad_norm": 1.6033282710660985, + "learning_rate": 8.387140179497541e-06, + "loss": 0.4686, + "step": 2931 + }, + { + "epoch": 0.285352798053528, + "grad_norm": 1.4339139409278308, + "learning_rate": 8.38598060867311e-06, + "loss": 0.5885, + "step": 2932 + }, + { + "epoch": 0.28545012165450123, + "grad_norm": 1.6962944035069916, + "learning_rate": 8.384820701381705e-06, + "loss": 0.6325, + "step": 2933 + }, + { + "epoch": 0.28554744525547443, + "grad_norm": 1.2380931242026982, + "learning_rate": 8.383660457738585e-06, + "loss": 0.3528, + "step": 2934 + }, + { + "epoch": 0.2856447688564477, + "grad_norm": 1.4958548492045998, + "learning_rate": 8.382499877859046e-06, + "loss": 0.5261, + "step": 2935 + }, + { + "epoch": 0.28574209245742094, + "grad_norm": 1.2493863506860636, + "learning_rate": 8.381338961858417e-06, + "loss": 0.295, + "step": 2936 + }, + { + "epoch": 0.28583941605839414, + "grad_norm": 1.0264542939220365, + "learning_rate": 8.380177709852055e-06, + "loss": 0.2736, + "step": 2937 + }, + { + "epoch": 0.2859367396593674, + "grad_norm": 1.3694559515073481, + "learning_rate": 8.379016121955358e-06, + "loss": 0.2437, + "step": 2938 + }, + { + "epoch": 0.28603406326034064, + "grad_norm": 1.3958652644514353, + "learning_rate": 8.377854198283751e-06, + "loss": 0.5162, + "step": 2939 + }, + { + "epoch": 0.28613138686131384, + "grad_norm": 1.3188642877167738, + "learning_rate": 8.376691938952694e-06, + "loss": 0.4403, + "step": 2940 + }, + { + "epoch": 0.2862287104622871, + "grad_norm": 1.5563883463328907, + "learning_rate": 8.375529344077686e-06, + "loss": 0.3871, + "step": 2941 + }, + { + "epoch": 0.28632603406326035, + "grad_norm": 1.7106139691477682, + "learning_rate": 8.37436641377425e-06, + "loss": 0.5998, + "step": 2942 + }, + { + "epoch": 0.2864233576642336, + "grad_norm": 1.8227768617334648, + "learning_rate": 8.373203148157953e-06, + "loss": 0.4192, + "step": 2943 + }, + { + "epoch": 0.2865206812652068, + "grad_norm": 1.3645142496496503, + "learning_rate": 8.372039547344383e-06, + "loss": 0.4301, + "step": 2944 + }, + { + "epoch": 0.28661800486618005, + "grad_norm": 1.4644520960794265, + "learning_rate": 8.370875611449173e-06, + "loss": 0.4333, + "step": 2945 + }, + { + "epoch": 0.2867153284671533, + "grad_norm": 1.3686778637415178, + "learning_rate": 8.369711340587981e-06, + "loss": 0.4735, + "step": 2946 + }, + { + "epoch": 0.2868126520681265, + "grad_norm": 1.7752150982830557, + "learning_rate": 8.368546734876499e-06, + "loss": 0.605, + "step": 2947 + }, + { + "epoch": 0.28690997566909976, + "grad_norm": 1.6349896239905135, + "learning_rate": 8.36738179443046e-06, + "loss": 0.4521, + "step": 2948 + }, + { + "epoch": 0.287007299270073, + "grad_norm": 1.7001103309282906, + "learning_rate": 8.366216519365623e-06, + "loss": 0.5243, + "step": 2949 + }, + { + "epoch": 0.2871046228710462, + "grad_norm": 1.3288526449094853, + "learning_rate": 8.365050909797779e-06, + "loss": 0.4226, + "step": 2950 + }, + { + "epoch": 0.28720194647201946, + "grad_norm": 1.0609308885865543, + "learning_rate": 8.36388496584276e-06, + "loss": 0.2761, + "step": 2951 + }, + { + "epoch": 0.2872992700729927, + "grad_norm": 1.3048762567541314, + "learning_rate": 8.362718687616422e-06, + "loss": 0.3166, + "step": 2952 + }, + { + "epoch": 0.2873965936739659, + "grad_norm": 1.5602591658770568, + "learning_rate": 8.361552075234664e-06, + "loss": 0.1814, + "step": 2953 + }, + { + "epoch": 0.28749391727493917, + "grad_norm": 1.261612878851385, + "learning_rate": 8.360385128813409e-06, + "loss": 0.3431, + "step": 2954 + }, + { + "epoch": 0.2875912408759124, + "grad_norm": 1.6502840086679433, + "learning_rate": 8.359217848468617e-06, + "loss": 0.5688, + "step": 2955 + }, + { + "epoch": 0.2876885644768856, + "grad_norm": 1.1758618501430975, + "learning_rate": 8.358050234316283e-06, + "loss": 0.376, + "step": 2956 + }, + { + "epoch": 0.2877858880778589, + "grad_norm": 1.3748216513361973, + "learning_rate": 8.356882286472433e-06, + "loss": 0.4893, + "step": 2957 + }, + { + "epoch": 0.28788321167883213, + "grad_norm": 1.490557754247365, + "learning_rate": 8.35571400505313e-06, + "loss": 0.4322, + "step": 2958 + }, + { + "epoch": 0.2879805352798053, + "grad_norm": 1.2474734521766377, + "learning_rate": 8.35454539017446e-06, + "loss": 0.249, + "step": 2959 + }, + { + "epoch": 0.2880778588807786, + "grad_norm": 1.3041956082790018, + "learning_rate": 8.353376441952554e-06, + "loss": 0.3629, + "step": 2960 + }, + { + "epoch": 0.28817518248175183, + "grad_norm": 1.1813542799359134, + "learning_rate": 8.352207160503572e-06, + "loss": 0.2541, + "step": 2961 + }, + { + "epoch": 0.2882725060827251, + "grad_norm": 1.6196703441196314, + "learning_rate": 8.351037545943702e-06, + "loss": 0.5863, + "step": 2962 + }, + { + "epoch": 0.2883698296836983, + "grad_norm": 1.6020435634219072, + "learning_rate": 8.34986759838917e-06, + "loss": 0.5539, + "step": 2963 + }, + { + "epoch": 0.28846715328467154, + "grad_norm": 1.6170521555116952, + "learning_rate": 8.348697317956238e-06, + "loss": 0.4234, + "step": 2964 + }, + { + "epoch": 0.2885644768856448, + "grad_norm": 1.2300623631368495, + "learning_rate": 8.347526704761193e-06, + "loss": 0.2784, + "step": 2965 + }, + { + "epoch": 0.288661800486618, + "grad_norm": 2.179168092375873, + "learning_rate": 8.346355758920364e-06, + "loss": 0.4561, + "step": 2966 + }, + { + "epoch": 0.28875912408759125, + "grad_norm": 1.5135423174141494, + "learning_rate": 8.345184480550104e-06, + "loss": 0.3807, + "step": 2967 + }, + { + "epoch": 0.2888564476885645, + "grad_norm": 1.7005351963186346, + "learning_rate": 8.344012869766808e-06, + "loss": 0.538, + "step": 2968 + }, + { + "epoch": 0.2889537712895377, + "grad_norm": 1.2789157911351394, + "learning_rate": 8.342840926686898e-06, + "loss": 0.2623, + "step": 2969 + }, + { + "epoch": 0.28905109489051095, + "grad_norm": 1.304761873055631, + "learning_rate": 8.34166865142683e-06, + "loss": 0.4219, + "step": 2970 + }, + { + "epoch": 0.2891484184914842, + "grad_norm": 1.6192760894025877, + "learning_rate": 8.340496044103095e-06, + "loss": 0.4378, + "step": 2971 + }, + { + "epoch": 0.2892457420924574, + "grad_norm": 1.4363442626245757, + "learning_rate": 8.339323104832214e-06, + "loss": 0.3819, + "step": 2972 + }, + { + "epoch": 0.28934306569343066, + "grad_norm": 1.5094300127764981, + "learning_rate": 8.338149833730742e-06, + "loss": 0.2769, + "step": 2973 + }, + { + "epoch": 0.2894403892944039, + "grad_norm": 1.6047897202306092, + "learning_rate": 8.33697623091527e-06, + "loss": 0.424, + "step": 2974 + }, + { + "epoch": 0.2895377128953771, + "grad_norm": 1.3129110600868221, + "learning_rate": 8.33580229650242e-06, + "loss": 0.5053, + "step": 2975 + }, + { + "epoch": 0.28963503649635036, + "grad_norm": 1.1812562932245452, + "learning_rate": 8.334628030608845e-06, + "loss": 0.3835, + "step": 2976 + }, + { + "epoch": 0.2897323600973236, + "grad_norm": 1.2211203388582414, + "learning_rate": 8.333453433351233e-06, + "loss": 0.3531, + "step": 2977 + }, + { + "epoch": 0.2898296836982968, + "grad_norm": 1.4620903484748373, + "learning_rate": 8.332278504846303e-06, + "loss": 0.4771, + "step": 2978 + }, + { + "epoch": 0.28992700729927007, + "grad_norm": 0.9704255718501243, + "learning_rate": 8.331103245210812e-06, + "loss": 0.2618, + "step": 2979 + }, + { + "epoch": 0.2900243309002433, + "grad_norm": 1.2827724622455963, + "learning_rate": 8.329927654561544e-06, + "loss": 0.3052, + "step": 2980 + }, + { + "epoch": 0.2901216545012165, + "grad_norm": 1.378581411338256, + "learning_rate": 8.328751733015316e-06, + "loss": 0.3568, + "step": 2981 + }, + { + "epoch": 0.2902189781021898, + "grad_norm": 1.769807570821765, + "learning_rate": 8.327575480688985e-06, + "loss": 0.3102, + "step": 2982 + }, + { + "epoch": 0.290316301703163, + "grad_norm": 1.4326301683333176, + "learning_rate": 8.32639889769943e-06, + "loss": 0.3218, + "step": 2983 + }, + { + "epoch": 0.2904136253041363, + "grad_norm": 1.5418816322088151, + "learning_rate": 8.325221984163575e-06, + "loss": 0.3257, + "step": 2984 + }, + { + "epoch": 0.2905109489051095, + "grad_norm": 1.573484642436306, + "learning_rate": 8.324044740198366e-06, + "loss": 0.5401, + "step": 2985 + }, + { + "epoch": 0.29060827250608273, + "grad_norm": 1.2270555416429247, + "learning_rate": 8.322867165920789e-06, + "loss": 0.3914, + "step": 2986 + }, + { + "epoch": 0.290705596107056, + "grad_norm": 1.1838846887742434, + "learning_rate": 8.321689261447858e-06, + "loss": 0.3282, + "step": 2987 + }, + { + "epoch": 0.2908029197080292, + "grad_norm": 1.5077214188811954, + "learning_rate": 8.320511026896624e-06, + "loss": 0.5279, + "step": 2988 + }, + { + "epoch": 0.29090024330900244, + "grad_norm": 1.1784061774291985, + "learning_rate": 8.31933246238417e-06, + "loss": 0.403, + "step": 2989 + }, + { + "epoch": 0.2909975669099757, + "grad_norm": 1.2176703537151474, + "learning_rate": 8.318153568027607e-06, + "loss": 0.4213, + "step": 2990 + }, + { + "epoch": 0.2910948905109489, + "grad_norm": 1.3475262123063816, + "learning_rate": 8.316974343944085e-06, + "loss": 0.4059, + "step": 2991 + }, + { + "epoch": 0.29119221411192214, + "grad_norm": 1.2398233047847593, + "learning_rate": 8.315794790250784e-06, + "loss": 0.2626, + "step": 2992 + }, + { + "epoch": 0.2912895377128954, + "grad_norm": 1.3862498175549538, + "learning_rate": 8.314614907064915e-06, + "loss": 0.4535, + "step": 2993 + }, + { + "epoch": 0.2913868613138686, + "grad_norm": 1.455622096437578, + "learning_rate": 8.313434694503727e-06, + "loss": 0.4067, + "step": 2994 + }, + { + "epoch": 0.29148418491484185, + "grad_norm": 1.4755183973829757, + "learning_rate": 8.312254152684496e-06, + "loss": 0.6493, + "step": 2995 + }, + { + "epoch": 0.2915815085158151, + "grad_norm": 1.0399713771806027, + "learning_rate": 8.311073281724536e-06, + "loss": 0.3051, + "step": 2996 + }, + { + "epoch": 0.2916788321167883, + "grad_norm": 1.3151300509583979, + "learning_rate": 8.309892081741186e-06, + "loss": 0.3982, + "step": 2997 + }, + { + "epoch": 0.29177615571776155, + "grad_norm": 1.376541833798208, + "learning_rate": 8.308710552851826e-06, + "loss": 0.4749, + "step": 2998 + }, + { + "epoch": 0.2918734793187348, + "grad_norm": 1.2551786912554768, + "learning_rate": 8.307528695173865e-06, + "loss": 0.3118, + "step": 2999 + }, + { + "epoch": 0.291970802919708, + "grad_norm": 2.1707038191553463, + "learning_rate": 8.306346508824746e-06, + "loss": 0.3438, + "step": 3000 + }, + { + "epoch": 0.29206812652068126, + "grad_norm": 1.4299459588569998, + "learning_rate": 8.30516399392194e-06, + "loss": 0.4838, + "step": 3001 + }, + { + "epoch": 0.2921654501216545, + "grad_norm": 1.378341342959643, + "learning_rate": 8.303981150582958e-06, + "loss": 0.5055, + "step": 3002 + }, + { + "epoch": 0.2922627737226277, + "grad_norm": 1.4826508798742193, + "learning_rate": 8.302797978925338e-06, + "loss": 0.3737, + "step": 3003 + }, + { + "epoch": 0.29236009732360096, + "grad_norm": 1.222513403789782, + "learning_rate": 8.301614479066653e-06, + "loss": 0.4587, + "step": 3004 + }, + { + "epoch": 0.2924574209245742, + "grad_norm": 1.3819233250029228, + "learning_rate": 8.300430651124508e-06, + "loss": 0.4021, + "step": 3005 + }, + { + "epoch": 0.29255474452554747, + "grad_norm": 1.2846536784172882, + "learning_rate": 8.29924649521654e-06, + "loss": 0.3609, + "step": 3006 + }, + { + "epoch": 0.29265206812652067, + "grad_norm": 1.4274226525457885, + "learning_rate": 8.298062011460419e-06, + "loss": 0.5267, + "step": 3007 + }, + { + "epoch": 0.2927493917274939, + "grad_norm": 1.4642655922839627, + "learning_rate": 8.296877199973849e-06, + "loss": 0.3499, + "step": 3008 + }, + { + "epoch": 0.2928467153284672, + "grad_norm": 1.4317302181421974, + "learning_rate": 8.295692060874568e-06, + "loss": 0.4979, + "step": 3009 + }, + { + "epoch": 0.2929440389294404, + "grad_norm": 1.3191877461185262, + "learning_rate": 8.294506594280338e-06, + "loss": 0.2835, + "step": 3010 + }, + { + "epoch": 0.29304136253041363, + "grad_norm": 1.0943861065294986, + "learning_rate": 8.293320800308964e-06, + "loss": 0.2138, + "step": 3011 + }, + { + "epoch": 0.2931386861313869, + "grad_norm": 1.2621219805575281, + "learning_rate": 8.292134679078277e-06, + "loss": 0.3027, + "step": 3012 + }, + { + "epoch": 0.2932360097323601, + "grad_norm": 1.556172337566686, + "learning_rate": 8.290948230706145e-06, + "loss": 0.4462, + "step": 3013 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 1.3363658374504028, + "learning_rate": 8.289761455310463e-06, + "loss": 0.373, + "step": 3014 + }, + { + "epoch": 0.2934306569343066, + "grad_norm": 1.4458593210455408, + "learning_rate": 8.288574353009164e-06, + "loss": 0.5566, + "step": 3015 + }, + { + "epoch": 0.2935279805352798, + "grad_norm": 1.5034274044899172, + "learning_rate": 8.287386923920211e-06, + "loss": 0.3837, + "step": 3016 + }, + { + "epoch": 0.29362530413625304, + "grad_norm": 1.484769748600726, + "learning_rate": 8.286199168161598e-06, + "loss": 0.3173, + "step": 3017 + }, + { + "epoch": 0.2937226277372263, + "grad_norm": 1.4336064725306121, + "learning_rate": 8.285011085851353e-06, + "loss": 0.4005, + "step": 3018 + }, + { + "epoch": 0.2938199513381995, + "grad_norm": 1.3857231757141482, + "learning_rate": 8.283822677107539e-06, + "loss": 0.481, + "step": 3019 + }, + { + "epoch": 0.29391727493917275, + "grad_norm": 1.4086307294395457, + "learning_rate": 8.282633942048244e-06, + "loss": 0.4181, + "step": 3020 + }, + { + "epoch": 0.294014598540146, + "grad_norm": 1.4701075671537391, + "learning_rate": 8.2814448807916e-06, + "loss": 0.4041, + "step": 3021 + }, + { + "epoch": 0.2941119221411192, + "grad_norm": 1.5925621393078395, + "learning_rate": 8.28025549345576e-06, + "loss": 0.3062, + "step": 3022 + }, + { + "epoch": 0.29420924574209245, + "grad_norm": 1.6058911141553376, + "learning_rate": 8.279065780158914e-06, + "loss": 0.5534, + "step": 3023 + }, + { + "epoch": 0.2943065693430657, + "grad_norm": 1.4134575830281486, + "learning_rate": 8.277875741019289e-06, + "loss": 0.5017, + "step": 3024 + }, + { + "epoch": 0.2944038929440389, + "grad_norm": 1.6163740830610969, + "learning_rate": 8.276685376155133e-06, + "loss": 0.5513, + "step": 3025 + }, + { + "epoch": 0.29450121654501216, + "grad_norm": 1.3415920762045879, + "learning_rate": 8.275494685684739e-06, + "loss": 0.4209, + "step": 3026 + }, + { + "epoch": 0.2945985401459854, + "grad_norm": 1.699522776097275, + "learning_rate": 8.274303669726427e-06, + "loss": 0.2444, + "step": 3027 + }, + { + "epoch": 0.29469586374695866, + "grad_norm": 1.3118143561432465, + "learning_rate": 8.273112328398545e-06, + "loss": 0.3282, + "step": 3028 + }, + { + "epoch": 0.29479318734793186, + "grad_norm": 1.3608335365502384, + "learning_rate": 8.271920661819479e-06, + "loss": 0.4625, + "step": 3029 + }, + { + "epoch": 0.2948905109489051, + "grad_norm": 1.320965035708582, + "learning_rate": 8.270728670107645e-06, + "loss": 0.4161, + "step": 3030 + }, + { + "epoch": 0.29498783454987837, + "grad_norm": 1.2315684415049128, + "learning_rate": 8.269536353381493e-06, + "loss": 0.3264, + "step": 3031 + }, + { + "epoch": 0.29508515815085157, + "grad_norm": 1.2397754210481065, + "learning_rate": 8.268343711759505e-06, + "loss": 0.3184, + "step": 3032 + }, + { + "epoch": 0.2951824817518248, + "grad_norm": 1.4717261820272485, + "learning_rate": 8.267150745360194e-06, + "loss": 0.381, + "step": 3033 + }, + { + "epoch": 0.2952798053527981, + "grad_norm": 1.7364842416546407, + "learning_rate": 8.265957454302102e-06, + "loss": 0.3639, + "step": 3034 + }, + { + "epoch": 0.2953771289537713, + "grad_norm": 1.6249980192905973, + "learning_rate": 8.264763838703813e-06, + "loss": 0.5112, + "step": 3035 + }, + { + "epoch": 0.2954744525547445, + "grad_norm": 1.682249094263979, + "learning_rate": 8.263569898683934e-06, + "loss": 0.4894, + "step": 3036 + }, + { + "epoch": 0.2955717761557178, + "grad_norm": 1.9200248186176307, + "learning_rate": 8.262375634361108e-06, + "loss": 0.529, + "step": 3037 + }, + { + "epoch": 0.295669099756691, + "grad_norm": 1.4426650259998133, + "learning_rate": 8.261181045854011e-06, + "loss": 0.5037, + "step": 3038 + }, + { + "epoch": 0.29576642335766423, + "grad_norm": 1.6904227765149746, + "learning_rate": 8.259986133281348e-06, + "loss": 0.3632, + "step": 3039 + }, + { + "epoch": 0.2958637469586375, + "grad_norm": 1.3863799205056755, + "learning_rate": 8.25879089676186e-06, + "loss": 0.4148, + "step": 3040 + }, + { + "epoch": 0.2959610705596107, + "grad_norm": 1.627436205526306, + "learning_rate": 8.257595336414317e-06, + "loss": 0.4558, + "step": 3041 + }, + { + "epoch": 0.29605839416058394, + "grad_norm": 1.3163567598814478, + "learning_rate": 8.256399452357524e-06, + "loss": 0.2713, + "step": 3042 + }, + { + "epoch": 0.2961557177615572, + "grad_norm": 1.6072179171276018, + "learning_rate": 8.255203244710316e-06, + "loss": 0.353, + "step": 3043 + }, + { + "epoch": 0.2962530413625304, + "grad_norm": 1.4217719575203627, + "learning_rate": 8.254006713591559e-06, + "loss": 0.3744, + "step": 3044 + }, + { + "epoch": 0.29635036496350364, + "grad_norm": 1.9013012922141048, + "learning_rate": 8.252809859120154e-06, + "loss": 0.209, + "step": 3045 + }, + { + "epoch": 0.2964476885644769, + "grad_norm": 1.390657831725977, + "learning_rate": 8.251612681415035e-06, + "loss": 0.3722, + "step": 3046 + }, + { + "epoch": 0.2965450121654501, + "grad_norm": 1.4478686848472833, + "learning_rate": 8.250415180595167e-06, + "loss": 0.3869, + "step": 3047 + }, + { + "epoch": 0.29664233576642335, + "grad_norm": 1.1443911522017596, + "learning_rate": 8.249217356779544e-06, + "loss": 0.3385, + "step": 3048 + }, + { + "epoch": 0.2967396593673966, + "grad_norm": 1.7245119786652503, + "learning_rate": 8.248019210087195e-06, + "loss": 0.3023, + "step": 3049 + }, + { + "epoch": 0.29683698296836986, + "grad_norm": 1.8030337728763741, + "learning_rate": 8.24682074063718e-06, + "loss": 0.3784, + "step": 3050 + }, + { + "epoch": 0.29693430656934305, + "grad_norm": 1.299417141317702, + "learning_rate": 8.245621948548593e-06, + "loss": 0.2963, + "step": 3051 + }, + { + "epoch": 0.2970316301703163, + "grad_norm": 1.3334468356141627, + "learning_rate": 8.244422833940558e-06, + "loss": 0.3671, + "step": 3052 + }, + { + "epoch": 0.29712895377128956, + "grad_norm": 1.6168488226188178, + "learning_rate": 8.24322339693223e-06, + "loss": 0.5497, + "step": 3053 + }, + { + "epoch": 0.29722627737226276, + "grad_norm": 1.49700230831562, + "learning_rate": 8.242023637642802e-06, + "loss": 0.4567, + "step": 3054 + }, + { + "epoch": 0.297323600973236, + "grad_norm": 1.0494586888942983, + "learning_rate": 8.24082355619149e-06, + "loss": 0.2186, + "step": 3055 + }, + { + "epoch": 0.29742092457420927, + "grad_norm": 1.372792205417397, + "learning_rate": 8.239623152697553e-06, + "loss": 0.5083, + "step": 3056 + }, + { + "epoch": 0.29751824817518246, + "grad_norm": 1.266230497219453, + "learning_rate": 8.238422427280269e-06, + "loss": 0.461, + "step": 3057 + }, + { + "epoch": 0.2976155717761557, + "grad_norm": 1.5041389582539588, + "learning_rate": 8.237221380058959e-06, + "loss": 0.3813, + "step": 3058 + }, + { + "epoch": 0.29771289537712897, + "grad_norm": 1.4593593621079823, + "learning_rate": 8.23602001115297e-06, + "loss": 0.473, + "step": 3059 + }, + { + "epoch": 0.29781021897810217, + "grad_norm": 1.3666083716931219, + "learning_rate": 8.234818320681685e-06, + "loss": 0.4822, + "step": 3060 + }, + { + "epoch": 0.2979075425790754, + "grad_norm": 1.407870228183954, + "learning_rate": 8.233616308764513e-06, + "loss": 0.4012, + "step": 3061 + }, + { + "epoch": 0.2980048661800487, + "grad_norm": 1.4404350668596586, + "learning_rate": 8.232413975520903e-06, + "loss": 0.5057, + "step": 3062 + }, + { + "epoch": 0.2981021897810219, + "grad_norm": 1.3912456713229528, + "learning_rate": 8.231211321070329e-06, + "loss": 0.4578, + "step": 3063 + }, + { + "epoch": 0.29819951338199513, + "grad_norm": 1.3191795228165797, + "learning_rate": 8.2300083455323e-06, + "loss": 0.3888, + "step": 3064 + }, + { + "epoch": 0.2982968369829684, + "grad_norm": 1.4258248936492355, + "learning_rate": 8.228805049026355e-06, + "loss": 0.5108, + "step": 3065 + }, + { + "epoch": 0.2983941605839416, + "grad_norm": 1.4850835614825084, + "learning_rate": 8.22760143167207e-06, + "loss": 0.5968, + "step": 3066 + }, + { + "epoch": 0.29849148418491483, + "grad_norm": 1.2696050534436827, + "learning_rate": 8.226397493589044e-06, + "loss": 0.3328, + "step": 3067 + }, + { + "epoch": 0.2985888077858881, + "grad_norm": 1.1993181516723008, + "learning_rate": 8.225193234896918e-06, + "loss": 0.2682, + "step": 3068 + }, + { + "epoch": 0.2986861313868613, + "grad_norm": 1.3420953543565923, + "learning_rate": 8.223988655715355e-06, + "loss": 0.3865, + "step": 3069 + }, + { + "epoch": 0.29878345498783454, + "grad_norm": 1.305913976862295, + "learning_rate": 8.222783756164061e-06, + "loss": 0.3551, + "step": 3070 + }, + { + "epoch": 0.2988807785888078, + "grad_norm": 1.3385899852932626, + "learning_rate": 8.221578536362764e-06, + "loss": 0.4203, + "step": 3071 + }, + { + "epoch": 0.29897810218978105, + "grad_norm": 1.189534251886867, + "learning_rate": 8.220372996431228e-06, + "loss": 0.2937, + "step": 3072 + }, + { + "epoch": 0.29907542579075425, + "grad_norm": 1.5982329206910104, + "learning_rate": 8.219167136489245e-06, + "loss": 0.6064, + "step": 3073 + }, + { + "epoch": 0.2991727493917275, + "grad_norm": 1.775024980718492, + "learning_rate": 8.217960956656648e-06, + "loss": 0.5517, + "step": 3074 + }, + { + "epoch": 0.29927007299270075, + "grad_norm": 1.4818012612095348, + "learning_rate": 8.216754457053291e-06, + "loss": 0.3574, + "step": 3075 + }, + { + "epoch": 0.29936739659367395, + "grad_norm": 1.5621403089409462, + "learning_rate": 8.215547637799068e-06, + "loss": 0.4108, + "step": 3076 + }, + { + "epoch": 0.2994647201946472, + "grad_norm": 1.4983847186167278, + "learning_rate": 8.214340499013899e-06, + "loss": 0.4644, + "step": 3077 + }, + { + "epoch": 0.29956204379562046, + "grad_norm": 1.5897848132407382, + "learning_rate": 8.213133040817738e-06, + "loss": 0.4894, + "step": 3078 + }, + { + "epoch": 0.29965936739659366, + "grad_norm": 1.6354640621760643, + "learning_rate": 8.211925263330573e-06, + "loss": 0.4583, + "step": 3079 + }, + { + "epoch": 0.2997566909975669, + "grad_norm": 1.4952024987397354, + "learning_rate": 8.21071716667242e-06, + "loss": 0.5976, + "step": 3080 + }, + { + "epoch": 0.29985401459854016, + "grad_norm": 1.0095340308225043, + "learning_rate": 8.20950875096333e-06, + "loss": 0.2524, + "step": 3081 + }, + { + "epoch": 0.29995133819951336, + "grad_norm": 1.4197678935056404, + "learning_rate": 8.208300016323381e-06, + "loss": 0.5514, + "step": 3082 + }, + { + "epoch": 0.3000486618004866, + "grad_norm": 1.249287306745543, + "learning_rate": 8.207090962872688e-06, + "loss": 0.2683, + "step": 3083 + }, + { + "epoch": 0.30014598540145987, + "grad_norm": 1.2420194980085992, + "learning_rate": 8.205881590731394e-06, + "loss": 0.3941, + "step": 3084 + }, + { + "epoch": 0.30024330900243307, + "grad_norm": 1.0228818593574307, + "learning_rate": 8.204671900019676e-06, + "loss": 0.2158, + "step": 3085 + }, + { + "epoch": 0.3003406326034063, + "grad_norm": 1.4988207950368069, + "learning_rate": 8.203461890857743e-06, + "loss": 0.4833, + "step": 3086 + }, + { + "epoch": 0.3004379562043796, + "grad_norm": 1.3402746636459373, + "learning_rate": 8.20225156336583e-06, + "loss": 0.437, + "step": 3087 + }, + { + "epoch": 0.3005352798053528, + "grad_norm": 1.3071666622302105, + "learning_rate": 8.201040917664214e-06, + "loss": 0.3667, + "step": 3088 + }, + { + "epoch": 0.300632603406326, + "grad_norm": 2.001934665501785, + "learning_rate": 8.199829953873192e-06, + "loss": 0.346, + "step": 3089 + }, + { + "epoch": 0.3007299270072993, + "grad_norm": 1.50451394225963, + "learning_rate": 8.198618672113104e-06, + "loss": 0.4897, + "step": 3090 + }, + { + "epoch": 0.3008272506082725, + "grad_norm": 1.5127622960173581, + "learning_rate": 8.197407072504309e-06, + "loss": 0.4301, + "step": 3091 + }, + { + "epoch": 0.30092457420924573, + "grad_norm": 1.409495275402236, + "learning_rate": 8.196195155167211e-06, + "loss": 0.4954, + "step": 3092 + }, + { + "epoch": 0.301021897810219, + "grad_norm": 1.3458224438835962, + "learning_rate": 8.194982920222233e-06, + "loss": 0.5023, + "step": 3093 + }, + { + "epoch": 0.30111922141119224, + "grad_norm": 1.484836707336815, + "learning_rate": 8.19377036778984e-06, + "loss": 0.4471, + "step": 3094 + }, + { + "epoch": 0.30121654501216544, + "grad_norm": 1.4314600061658445, + "learning_rate": 8.192557497990522e-06, + "loss": 0.4519, + "step": 3095 + }, + { + "epoch": 0.3013138686131387, + "grad_norm": 1.228152077257465, + "learning_rate": 8.191344310944803e-06, + "loss": 0.2338, + "step": 3096 + }, + { + "epoch": 0.30141119221411194, + "grad_norm": 1.4025619039626473, + "learning_rate": 8.19013080677324e-06, + "loss": 0.3748, + "step": 3097 + }, + { + "epoch": 0.30150851581508514, + "grad_norm": 1.535338102251852, + "learning_rate": 8.188916985596415e-06, + "loss": 0.3129, + "step": 3098 + }, + { + "epoch": 0.3016058394160584, + "grad_norm": 1.7024230210298346, + "learning_rate": 8.187702847534952e-06, + "loss": 0.5525, + "step": 3099 + }, + { + "epoch": 0.30170316301703165, + "grad_norm": 1.4950690283515784, + "learning_rate": 8.186488392709495e-06, + "loss": 0.5258, + "step": 3100 + }, + { + "epoch": 0.30180048661800485, + "grad_norm": 1.589216178732189, + "learning_rate": 8.18527362124073e-06, + "loss": 0.5745, + "step": 3101 + }, + { + "epoch": 0.3018978102189781, + "grad_norm": 1.5942675928105552, + "learning_rate": 8.184058533249367e-06, + "loss": 0.6344, + "step": 3102 + }, + { + "epoch": 0.30199513381995136, + "grad_norm": 1.3981131065521017, + "learning_rate": 8.18284312885615e-06, + "loss": 0.3369, + "step": 3103 + }, + { + "epoch": 0.30209245742092455, + "grad_norm": 1.6180199585993311, + "learning_rate": 8.181627408181854e-06, + "loss": 0.4014, + "step": 3104 + }, + { + "epoch": 0.3021897810218978, + "grad_norm": 1.6338683004824879, + "learning_rate": 8.180411371347288e-06, + "loss": 0.4983, + "step": 3105 + }, + { + "epoch": 0.30228710462287106, + "grad_norm": 1.5225224020676915, + "learning_rate": 8.17919501847329e-06, + "loss": 0.5016, + "step": 3106 + }, + { + "epoch": 0.30238442822384426, + "grad_norm": 1.23190340238718, + "learning_rate": 8.177978349680727e-06, + "loss": 0.3644, + "step": 3107 + }, + { + "epoch": 0.3024817518248175, + "grad_norm": 1.4496645177592962, + "learning_rate": 8.176761365090503e-06, + "loss": 0.526, + "step": 3108 + }, + { + "epoch": 0.30257907542579077, + "grad_norm": 1.5209859048615393, + "learning_rate": 8.17554406482355e-06, + "loss": 0.3034, + "step": 3109 + }, + { + "epoch": 0.30267639902676396, + "grad_norm": 1.4404359772108442, + "learning_rate": 8.17432644900083e-06, + "loss": 0.4735, + "step": 3110 + }, + { + "epoch": 0.3027737226277372, + "grad_norm": 1.2693525922216498, + "learning_rate": 8.173108517743343e-06, + "loss": 0.4021, + "step": 3111 + }, + { + "epoch": 0.30287104622871047, + "grad_norm": 1.3995736051817393, + "learning_rate": 8.171890271172109e-06, + "loss": 0.3084, + "step": 3112 + }, + { + "epoch": 0.3029683698296837, + "grad_norm": 1.5690384436250255, + "learning_rate": 8.17067170940819e-06, + "loss": 0.4097, + "step": 3113 + }, + { + "epoch": 0.3030656934306569, + "grad_norm": 1.270566641334736, + "learning_rate": 8.169452832572676e-06, + "loss": 0.3813, + "step": 3114 + }, + { + "epoch": 0.3031630170316302, + "grad_norm": 1.1690990375599999, + "learning_rate": 8.168233640786682e-06, + "loss": 0.2898, + "step": 3115 + }, + { + "epoch": 0.30326034063260343, + "grad_norm": 1.5367454476066444, + "learning_rate": 8.167014134171367e-06, + "loss": 0.4167, + "step": 3116 + }, + { + "epoch": 0.30335766423357663, + "grad_norm": 1.113322849500334, + "learning_rate": 8.165794312847912e-06, + "loss": 0.3274, + "step": 3117 + }, + { + "epoch": 0.3034549878345499, + "grad_norm": 1.4711999953076527, + "learning_rate": 8.164574176937527e-06, + "loss": 0.368, + "step": 3118 + }, + { + "epoch": 0.30355231143552314, + "grad_norm": 1.4465621082621003, + "learning_rate": 8.163353726561462e-06, + "loss": 0.2719, + "step": 3119 + }, + { + "epoch": 0.30364963503649633, + "grad_norm": 1.5224694722189016, + "learning_rate": 8.162132961840994e-06, + "loss": 0.3296, + "step": 3120 + }, + { + "epoch": 0.3037469586374696, + "grad_norm": 1.3713377819635104, + "learning_rate": 8.160911882897429e-06, + "loss": 0.3064, + "step": 3121 + }, + { + "epoch": 0.30384428223844284, + "grad_norm": 1.6461819951429466, + "learning_rate": 8.159690489852108e-06, + "loss": 0.3646, + "step": 3122 + }, + { + "epoch": 0.30394160583941604, + "grad_norm": 1.4328493269552467, + "learning_rate": 8.1584687828264e-06, + "loss": 0.4363, + "step": 3123 + }, + { + "epoch": 0.3040389294403893, + "grad_norm": 1.2811535124384867, + "learning_rate": 8.157246761941708e-06, + "loss": 0.4582, + "step": 3124 + }, + { + "epoch": 0.30413625304136255, + "grad_norm": 1.2144846492785035, + "learning_rate": 8.156024427319464e-06, + "loss": 0.2413, + "step": 3125 + }, + { + "epoch": 0.30423357664233575, + "grad_norm": 1.4112474441167293, + "learning_rate": 8.154801779081135e-06, + "loss": 0.4762, + "step": 3126 + }, + { + "epoch": 0.304330900243309, + "grad_norm": 1.4641495751020401, + "learning_rate": 8.153578817348213e-06, + "loss": 0.4905, + "step": 3127 + }, + { + "epoch": 0.30442822384428225, + "grad_norm": 1.7041758523831827, + "learning_rate": 8.152355542242226e-06, + "loss": 0.5396, + "step": 3128 + }, + { + "epoch": 0.30452554744525545, + "grad_norm": 1.2349793608481423, + "learning_rate": 8.151131953884728e-06, + "loss": 0.3847, + "step": 3129 + }, + { + "epoch": 0.3046228710462287, + "grad_norm": 1.4859954822671841, + "learning_rate": 8.149908052397314e-06, + "loss": 0.5907, + "step": 3130 + }, + { + "epoch": 0.30472019464720196, + "grad_norm": 1.2693166698162108, + "learning_rate": 8.148683837901599e-06, + "loss": 0.2636, + "step": 3131 + }, + { + "epoch": 0.30481751824817516, + "grad_norm": 1.1084842598244526, + "learning_rate": 8.147459310519238e-06, + "loss": 0.3103, + "step": 3132 + }, + { + "epoch": 0.3049148418491484, + "grad_norm": 1.2151450124002459, + "learning_rate": 8.146234470371908e-06, + "loss": 0.2734, + "step": 3133 + }, + { + "epoch": 0.30501216545012166, + "grad_norm": 1.3625938628176788, + "learning_rate": 8.145009317581328e-06, + "loss": 0.3757, + "step": 3134 + }, + { + "epoch": 0.3051094890510949, + "grad_norm": 1.3367991206467007, + "learning_rate": 8.143783852269239e-06, + "loss": 0.3469, + "step": 3135 + }, + { + "epoch": 0.3052068126520681, + "grad_norm": 1.5485997458565015, + "learning_rate": 8.142558074557413e-06, + "loss": 0.6068, + "step": 3136 + }, + { + "epoch": 0.30530413625304137, + "grad_norm": 1.4327175362669387, + "learning_rate": 8.141331984567661e-06, + "loss": 0.4495, + "step": 3137 + }, + { + "epoch": 0.3054014598540146, + "grad_norm": 1.4648525390361329, + "learning_rate": 8.140105582421819e-06, + "loss": 0.4855, + "step": 3138 + }, + { + "epoch": 0.3054987834549878, + "grad_norm": 1.147269868566856, + "learning_rate": 8.138878868241755e-06, + "loss": 0.3671, + "step": 3139 + }, + { + "epoch": 0.3055961070559611, + "grad_norm": 1.4274559741070532, + "learning_rate": 8.13765184214937e-06, + "loss": 0.4246, + "step": 3140 + }, + { + "epoch": 0.30569343065693433, + "grad_norm": 1.3864373579762495, + "learning_rate": 8.13642450426659e-06, + "loss": 0.5252, + "step": 3141 + }, + { + "epoch": 0.3057907542579075, + "grad_norm": 1.3508219371380046, + "learning_rate": 8.135196854715382e-06, + "loss": 0.4022, + "step": 3142 + }, + { + "epoch": 0.3058880778588808, + "grad_norm": 1.3974501891292628, + "learning_rate": 8.133968893617734e-06, + "loss": 0.4903, + "step": 3143 + }, + { + "epoch": 0.30598540145985403, + "grad_norm": 1.379672607479744, + "learning_rate": 8.132740621095672e-06, + "loss": 0.4389, + "step": 3144 + }, + { + "epoch": 0.30608272506082723, + "grad_norm": 1.5338640282858476, + "learning_rate": 8.131512037271248e-06, + "loss": 0.5719, + "step": 3145 + }, + { + "epoch": 0.3061800486618005, + "grad_norm": 1.754315640729903, + "learning_rate": 8.130283142266549e-06, + "loss": 0.4684, + "step": 3146 + }, + { + "epoch": 0.30627737226277374, + "grad_norm": 1.3260485404955915, + "learning_rate": 8.129053936203688e-06, + "loss": 0.3967, + "step": 3147 + }, + { + "epoch": 0.30637469586374694, + "grad_norm": 1.3926987348333701, + "learning_rate": 8.127824419204818e-06, + "loss": 0.3916, + "step": 3148 + }, + { + "epoch": 0.3064720194647202, + "grad_norm": 1.4794110467900325, + "learning_rate": 8.126594591392108e-06, + "loss": 0.4127, + "step": 3149 + }, + { + "epoch": 0.30656934306569344, + "grad_norm": 1.5256531082933278, + "learning_rate": 8.125364452887775e-06, + "loss": 0.4219, + "step": 3150 + }, + { + "epoch": 0.30666666666666664, + "grad_norm": 1.2765687697220431, + "learning_rate": 8.124134003814054e-06, + "loss": 0.3482, + "step": 3151 + }, + { + "epoch": 0.3067639902676399, + "grad_norm": 1.345009147300955, + "learning_rate": 8.122903244293217e-06, + "loss": 0.2419, + "step": 3152 + }, + { + "epoch": 0.30686131386861315, + "grad_norm": 1.243655794496096, + "learning_rate": 8.121672174447566e-06, + "loss": 0.3132, + "step": 3153 + }, + { + "epoch": 0.30695863746958635, + "grad_norm": 1.4526276096090445, + "learning_rate": 8.120440794399432e-06, + "loss": 0.5369, + "step": 3154 + }, + { + "epoch": 0.3070559610705596, + "grad_norm": 1.4088986355103132, + "learning_rate": 8.119209104271177e-06, + "loss": 0.331, + "step": 3155 + }, + { + "epoch": 0.30715328467153286, + "grad_norm": 1.4915113265208189, + "learning_rate": 8.117977104185198e-06, + "loss": 0.6195, + "step": 3156 + }, + { + "epoch": 0.3072506082725061, + "grad_norm": 1.143472956889321, + "learning_rate": 8.116744794263916e-06, + "loss": 0.2632, + "step": 3157 + }, + { + "epoch": 0.3073479318734793, + "grad_norm": 1.3575606240914238, + "learning_rate": 8.11551217462979e-06, + "loss": 0.3927, + "step": 3158 + }, + { + "epoch": 0.30744525547445256, + "grad_norm": 1.2891794787417357, + "learning_rate": 8.114279245405301e-06, + "loss": 0.3766, + "step": 3159 + }, + { + "epoch": 0.3075425790754258, + "grad_norm": 1.1671813045475161, + "learning_rate": 8.113046006712973e-06, + "loss": 0.3527, + "step": 3160 + }, + { + "epoch": 0.307639902676399, + "grad_norm": 1.2346428689153102, + "learning_rate": 8.111812458675348e-06, + "loss": 0.456, + "step": 3161 + }, + { + "epoch": 0.30773722627737227, + "grad_norm": 1.543619139526521, + "learning_rate": 8.110578601415007e-06, + "loss": 0.419, + "step": 3162 + }, + { + "epoch": 0.3078345498783455, + "grad_norm": 1.5361457722305751, + "learning_rate": 8.109344435054557e-06, + "loss": 0.4477, + "step": 3163 + }, + { + "epoch": 0.3079318734793187, + "grad_norm": 1.0948111699644958, + "learning_rate": 8.108109959716641e-06, + "loss": 0.3469, + "step": 3164 + }, + { + "epoch": 0.30802919708029197, + "grad_norm": 1.4900262169803653, + "learning_rate": 8.106875175523928e-06, + "loss": 0.5066, + "step": 3165 + }, + { + "epoch": 0.3081265206812652, + "grad_norm": 1.2143378476964652, + "learning_rate": 8.105640082599118e-06, + "loss": 0.4016, + "step": 3166 + }, + { + "epoch": 0.3082238442822384, + "grad_norm": 1.227404068812886, + "learning_rate": 8.104404681064943e-06, + "loss": 0.3408, + "step": 3167 + }, + { + "epoch": 0.3083211678832117, + "grad_norm": 1.273486832675327, + "learning_rate": 8.10316897104417e-06, + "loss": 0.3819, + "step": 3168 + }, + { + "epoch": 0.30841849148418493, + "grad_norm": 1.390509439874599, + "learning_rate": 8.101932952659586e-06, + "loss": 0.5108, + "step": 3169 + }, + { + "epoch": 0.30851581508515813, + "grad_norm": 1.1910701089910116, + "learning_rate": 8.100696626034019e-06, + "loss": 0.3579, + "step": 3170 + }, + { + "epoch": 0.3086131386861314, + "grad_norm": 1.6328619260471173, + "learning_rate": 8.099459991290324e-06, + "loss": 0.666, + "step": 3171 + }, + { + "epoch": 0.30871046228710464, + "grad_norm": 1.4085180007277236, + "learning_rate": 8.09822304855138e-06, + "loss": 0.3684, + "step": 3172 + }, + { + "epoch": 0.30880778588807783, + "grad_norm": 1.5920530522626664, + "learning_rate": 8.096985797940111e-06, + "loss": 0.4499, + "step": 3173 + }, + { + "epoch": 0.3089051094890511, + "grad_norm": 1.2895313219583247, + "learning_rate": 8.09574823957946e-06, + "loss": 0.4939, + "step": 3174 + }, + { + "epoch": 0.30900243309002434, + "grad_norm": 1.5242111980147517, + "learning_rate": 8.094510373592403e-06, + "loss": 0.3223, + "step": 3175 + }, + { + "epoch": 0.30909975669099754, + "grad_norm": 1.3628460645839475, + "learning_rate": 8.093272200101946e-06, + "loss": 0.507, + "step": 3176 + }, + { + "epoch": 0.3091970802919708, + "grad_norm": 1.45716247785806, + "learning_rate": 8.092033719231134e-06, + "loss": 0.2011, + "step": 3177 + }, + { + "epoch": 0.30929440389294405, + "grad_norm": 1.0824220688323085, + "learning_rate": 8.090794931103026e-06, + "loss": 0.2127, + "step": 3178 + }, + { + "epoch": 0.3093917274939173, + "grad_norm": 1.3637285345889945, + "learning_rate": 8.089555835840728e-06, + "loss": 0.3567, + "step": 3179 + }, + { + "epoch": 0.3094890510948905, + "grad_norm": 1.352681485055594, + "learning_rate": 8.088316433567369e-06, + "loss": 0.4403, + "step": 3180 + }, + { + "epoch": 0.30958637469586375, + "grad_norm": 1.5330943463849844, + "learning_rate": 8.087076724406106e-06, + "loss": 0.3379, + "step": 3181 + }, + { + "epoch": 0.309683698296837, + "grad_norm": 1.4102464472701088, + "learning_rate": 8.08583670848013e-06, + "loss": 0.5173, + "step": 3182 + }, + { + "epoch": 0.3097810218978102, + "grad_norm": 1.3268465957799758, + "learning_rate": 8.084596385912666e-06, + "loss": 0.2684, + "step": 3183 + }, + { + "epoch": 0.30987834549878346, + "grad_norm": 1.0612856713580272, + "learning_rate": 8.083355756826962e-06, + "loss": 0.2057, + "step": 3184 + }, + { + "epoch": 0.3099756690997567, + "grad_norm": 1.2705392445129486, + "learning_rate": 8.082114821346302e-06, + "loss": 0.4234, + "step": 3185 + }, + { + "epoch": 0.3100729927007299, + "grad_norm": 1.502946661443704, + "learning_rate": 8.080873579593997e-06, + "loss": 0.5134, + "step": 3186 + }, + { + "epoch": 0.31017031630170316, + "grad_norm": 1.4788874331253061, + "learning_rate": 8.079632031693392e-06, + "loss": 0.6157, + "step": 3187 + }, + { + "epoch": 0.3102676399026764, + "grad_norm": 1.0642136610663042, + "learning_rate": 8.078390177767858e-06, + "loss": 0.2667, + "step": 3188 + }, + { + "epoch": 0.3103649635036496, + "grad_norm": 1.2694637339318475, + "learning_rate": 8.0771480179408e-06, + "loss": 0.4189, + "step": 3189 + }, + { + "epoch": 0.31046228710462287, + "grad_norm": 1.40127369638598, + "learning_rate": 8.075905552335652e-06, + "loss": 0.6007, + "step": 3190 + }, + { + "epoch": 0.3105596107055961, + "grad_norm": 1.4654952978073685, + "learning_rate": 8.07466278107588e-06, + "loss": 0.524, + "step": 3191 + }, + { + "epoch": 0.3106569343065693, + "grad_norm": 1.468706008576981, + "learning_rate": 8.073419704284977e-06, + "loss": 0.5511, + "step": 3192 + }, + { + "epoch": 0.3107542579075426, + "grad_norm": 1.3171427220237197, + "learning_rate": 8.072176322086468e-06, + "loss": 0.4903, + "step": 3193 + }, + { + "epoch": 0.31085158150851583, + "grad_norm": 1.5864389312753313, + "learning_rate": 8.07093263460391e-06, + "loss": 0.7036, + "step": 3194 + }, + { + "epoch": 0.310948905109489, + "grad_norm": 1.3892836370042843, + "learning_rate": 8.06968864196089e-06, + "loss": 0.4277, + "step": 3195 + }, + { + "epoch": 0.3110462287104623, + "grad_norm": 1.1676123850477602, + "learning_rate": 8.06844434428102e-06, + "loss": 0.2693, + "step": 3196 + }, + { + "epoch": 0.31114355231143553, + "grad_norm": 1.506360397408636, + "learning_rate": 8.067199741687951e-06, + "loss": 0.4425, + "step": 3197 + }, + { + "epoch": 0.31124087591240873, + "grad_norm": 1.9902872035508865, + "learning_rate": 8.065954834305359e-06, + "loss": 0.4464, + "step": 3198 + }, + { + "epoch": 0.311338199513382, + "grad_norm": 1.3738283887856941, + "learning_rate": 8.06470962225695e-06, + "loss": 0.2504, + "step": 3199 + }, + { + "epoch": 0.31143552311435524, + "grad_norm": 1.5173657046475728, + "learning_rate": 8.063464105666462e-06, + "loss": 0.4145, + "step": 3200 + }, + { + "epoch": 0.3115328467153285, + "grad_norm": 1.5348915862029098, + "learning_rate": 8.062218284657663e-06, + "loss": 0.4182, + "step": 3201 + }, + { + "epoch": 0.3116301703163017, + "grad_norm": 1.6415893626851852, + "learning_rate": 8.06097215935435e-06, + "loss": 0.5928, + "step": 3202 + }, + { + "epoch": 0.31172749391727494, + "grad_norm": 1.2746939048596826, + "learning_rate": 8.059725729880354e-06, + "loss": 0.2945, + "step": 3203 + }, + { + "epoch": 0.3118248175182482, + "grad_norm": 1.354856369246625, + "learning_rate": 8.05847899635953e-06, + "loss": 0.445, + "step": 3204 + }, + { + "epoch": 0.3119221411192214, + "grad_norm": 1.3006695307047205, + "learning_rate": 8.057231958915767e-06, + "loss": 0.3558, + "step": 3205 + }, + { + "epoch": 0.31201946472019465, + "grad_norm": 1.294215405183638, + "learning_rate": 8.05598461767299e-06, + "loss": 0.3764, + "step": 3206 + }, + { + "epoch": 0.3121167883211679, + "grad_norm": 1.4713454344723826, + "learning_rate": 8.054736972755138e-06, + "loss": 0.4945, + "step": 3207 + }, + { + "epoch": 0.3122141119221411, + "grad_norm": 1.1708979284493786, + "learning_rate": 8.053489024286198e-06, + "loss": 0.2419, + "step": 3208 + }, + { + "epoch": 0.31231143552311436, + "grad_norm": 1.2933153684751388, + "learning_rate": 8.052240772390176e-06, + "loss": 0.4624, + "step": 3209 + }, + { + "epoch": 0.3124087591240876, + "grad_norm": 1.2538957446837722, + "learning_rate": 8.050992217191114e-06, + "loss": 0.3305, + "step": 3210 + }, + { + "epoch": 0.3125060827250608, + "grad_norm": 1.5362890980077515, + "learning_rate": 8.049743358813078e-06, + "loss": 0.5151, + "step": 3211 + }, + { + "epoch": 0.31260340632603406, + "grad_norm": 1.3913400504692353, + "learning_rate": 8.04849419738017e-06, + "loss": 0.355, + "step": 3212 + }, + { + "epoch": 0.3127007299270073, + "grad_norm": 1.3089021233990763, + "learning_rate": 8.04724473301652e-06, + "loss": 0.262, + "step": 3213 + }, + { + "epoch": 0.3127980535279805, + "grad_norm": 1.0542807465796966, + "learning_rate": 8.045994965846288e-06, + "loss": 0.3133, + "step": 3214 + }, + { + "epoch": 0.31289537712895377, + "grad_norm": 1.4749126602435412, + "learning_rate": 8.044744895993666e-06, + "loss": 0.46, + "step": 3215 + }, + { + "epoch": 0.312992700729927, + "grad_norm": 1.3746833003058674, + "learning_rate": 8.043494523582871e-06, + "loss": 0.427, + "step": 3216 + }, + { + "epoch": 0.3130900243309002, + "grad_norm": 1.3205796823520726, + "learning_rate": 8.042243848738153e-06, + "loss": 0.3354, + "step": 3217 + }, + { + "epoch": 0.31318734793187347, + "grad_norm": 1.2865057356076828, + "learning_rate": 8.040992871583797e-06, + "loss": 0.3941, + "step": 3218 + }, + { + "epoch": 0.3132846715328467, + "grad_norm": 1.1248365389432082, + "learning_rate": 8.039741592244108e-06, + "loss": 0.2628, + "step": 3219 + }, + { + "epoch": 0.3133819951338199, + "grad_norm": 1.4652698761705358, + "learning_rate": 8.03849001084343e-06, + "loss": 0.3564, + "step": 3220 + }, + { + "epoch": 0.3134793187347932, + "grad_norm": 1.3953599364279132, + "learning_rate": 8.037238127506128e-06, + "loss": 0.4163, + "step": 3221 + }, + { + "epoch": 0.31357664233576643, + "grad_norm": 1.2419892638751415, + "learning_rate": 8.035985942356612e-06, + "loss": 0.354, + "step": 3222 + }, + { + "epoch": 0.3136739659367397, + "grad_norm": 1.715104485156596, + "learning_rate": 8.034733455519303e-06, + "loss": 0.2963, + "step": 3223 + }, + { + "epoch": 0.3137712895377129, + "grad_norm": 1.470040424076559, + "learning_rate": 8.033480667118667e-06, + "loss": 0.4648, + "step": 3224 + }, + { + "epoch": 0.31386861313868614, + "grad_norm": 1.4565317560800817, + "learning_rate": 8.032227577279191e-06, + "loss": 0.512, + "step": 3225 + }, + { + "epoch": 0.3139659367396594, + "grad_norm": 1.3742041452053566, + "learning_rate": 8.030974186125397e-06, + "loss": 0.3956, + "step": 3226 + }, + { + "epoch": 0.3140632603406326, + "grad_norm": 1.3788492769137004, + "learning_rate": 8.029720493781838e-06, + "loss": 0.4509, + "step": 3227 + }, + { + "epoch": 0.31416058394160584, + "grad_norm": 1.3858125546461868, + "learning_rate": 8.028466500373089e-06, + "loss": 0.2106, + "step": 3228 + }, + { + "epoch": 0.3142579075425791, + "grad_norm": 1.3840076863400683, + "learning_rate": 8.027212206023762e-06, + "loss": 0.3038, + "step": 3229 + }, + { + "epoch": 0.3143552311435523, + "grad_norm": 1.3152632009702614, + "learning_rate": 8.0259576108585e-06, + "loss": 0.4801, + "step": 3230 + }, + { + "epoch": 0.31445255474452555, + "grad_norm": 1.3788629368363385, + "learning_rate": 8.024702715001968e-06, + "loss": 0.4245, + "step": 3231 + }, + { + "epoch": 0.3145498783454988, + "grad_norm": 1.814337226771454, + "learning_rate": 8.023447518578868e-06, + "loss": 0.5632, + "step": 3232 + }, + { + "epoch": 0.314647201946472, + "grad_norm": 1.4232062511008752, + "learning_rate": 8.02219202171393e-06, + "loss": 0.3286, + "step": 3233 + }, + { + "epoch": 0.31474452554744525, + "grad_norm": 1.280903932308868, + "learning_rate": 8.020936224531912e-06, + "loss": 0.3626, + "step": 3234 + }, + { + "epoch": 0.3148418491484185, + "grad_norm": 1.0510702763807533, + "learning_rate": 8.019680127157607e-06, + "loss": 0.2524, + "step": 3235 + }, + { + "epoch": 0.3149391727493917, + "grad_norm": 1.7478480050109064, + "learning_rate": 8.018423729715832e-06, + "loss": 0.4348, + "step": 3236 + }, + { + "epoch": 0.31503649635036496, + "grad_norm": 1.5359033994100462, + "learning_rate": 8.017167032331434e-06, + "loss": 0.4124, + "step": 3237 + }, + { + "epoch": 0.3151338199513382, + "grad_norm": 1.506288459888311, + "learning_rate": 8.015910035129294e-06, + "loss": 0.3261, + "step": 3238 + }, + { + "epoch": 0.3152311435523114, + "grad_norm": 1.2745124165082422, + "learning_rate": 8.01465273823432e-06, + "loss": 0.464, + "step": 3239 + }, + { + "epoch": 0.31532846715328466, + "grad_norm": 1.338919807585357, + "learning_rate": 8.01339514177145e-06, + "loss": 0.4172, + "step": 3240 + }, + { + "epoch": 0.3154257907542579, + "grad_norm": 1.486780887173232, + "learning_rate": 8.012137245865654e-06, + "loss": 0.5408, + "step": 3241 + }, + { + "epoch": 0.31552311435523117, + "grad_norm": 1.3714950278620026, + "learning_rate": 8.010879050641927e-06, + "loss": 0.3436, + "step": 3242 + }, + { + "epoch": 0.31562043795620437, + "grad_norm": 1.631965892281063, + "learning_rate": 8.009620556225298e-06, + "loss": 0.4727, + "step": 3243 + }, + { + "epoch": 0.3157177615571776, + "grad_norm": 1.6149761911532763, + "learning_rate": 8.008361762740825e-06, + "loss": 0.4924, + "step": 3244 + }, + { + "epoch": 0.3158150851581509, + "grad_norm": 1.3425533377851226, + "learning_rate": 8.007102670313596e-06, + "loss": 0.3844, + "step": 3245 + }, + { + "epoch": 0.3159124087591241, + "grad_norm": 1.5650571116791179, + "learning_rate": 8.005843279068724e-06, + "loss": 0.5109, + "step": 3246 + }, + { + "epoch": 0.31600973236009733, + "grad_norm": 1.469317468630812, + "learning_rate": 8.004583589131359e-06, + "loss": 0.3981, + "step": 3247 + }, + { + "epoch": 0.3161070559610706, + "grad_norm": 1.3520956586482695, + "learning_rate": 8.003323600626675e-06, + "loss": 0.3628, + "step": 3248 + }, + { + "epoch": 0.3162043795620438, + "grad_norm": 1.306843877245721, + "learning_rate": 8.002063313679881e-06, + "loss": 0.3738, + "step": 3249 + }, + { + "epoch": 0.31630170316301703, + "grad_norm": 1.2062559137545288, + "learning_rate": 8.000802728416209e-06, + "loss": 0.3603, + "step": 3250 + }, + { + "epoch": 0.3163990267639903, + "grad_norm": 1.2270301850514787, + "learning_rate": 7.999541844960926e-06, + "loss": 0.3444, + "step": 3251 + }, + { + "epoch": 0.3164963503649635, + "grad_norm": 1.6301989651574331, + "learning_rate": 7.998280663439325e-06, + "loss": 0.5442, + "step": 3252 + }, + { + "epoch": 0.31659367396593674, + "grad_norm": 1.3633091002846736, + "learning_rate": 7.997019183976732e-06, + "loss": 0.4596, + "step": 3253 + }, + { + "epoch": 0.31669099756691, + "grad_norm": 1.0930243128300028, + "learning_rate": 7.9957574066985e-06, + "loss": 0.259, + "step": 3254 + }, + { + "epoch": 0.3167883211678832, + "grad_norm": 1.3445912322829077, + "learning_rate": 7.994495331730014e-06, + "loss": 0.438, + "step": 3255 + }, + { + "epoch": 0.31688564476885644, + "grad_norm": 1.3651374487790005, + "learning_rate": 7.993232959196687e-06, + "loss": 0.4589, + "step": 3256 + }, + { + "epoch": 0.3169829683698297, + "grad_norm": 1.3424961538987858, + "learning_rate": 7.99197028922396e-06, + "loss": 0.4367, + "step": 3257 + }, + { + "epoch": 0.3170802919708029, + "grad_norm": 1.5932344837064665, + "learning_rate": 7.990707321937308e-06, + "loss": 0.6921, + "step": 3258 + }, + { + "epoch": 0.31717761557177615, + "grad_norm": 1.475521709829228, + "learning_rate": 7.989444057462228e-06, + "loss": 0.4759, + "step": 3259 + }, + { + "epoch": 0.3172749391727494, + "grad_norm": 1.2971398875872913, + "learning_rate": 7.988180495924256e-06, + "loss": 0.4588, + "step": 3260 + }, + { + "epoch": 0.3173722627737226, + "grad_norm": 1.4756687426354647, + "learning_rate": 7.986916637448953e-06, + "loss": 0.4776, + "step": 3261 + }, + { + "epoch": 0.31746958637469586, + "grad_norm": 1.6271934377592354, + "learning_rate": 7.985652482161907e-06, + "loss": 0.4979, + "step": 3262 + }, + { + "epoch": 0.3175669099756691, + "grad_norm": 1.6597644298654206, + "learning_rate": 7.984388030188739e-06, + "loss": 0.6091, + "step": 3263 + }, + { + "epoch": 0.31766423357664236, + "grad_norm": 1.3383482658500123, + "learning_rate": 7.983123281655097e-06, + "loss": 0.4371, + "step": 3264 + }, + { + "epoch": 0.31776155717761556, + "grad_norm": 1.4591756386356707, + "learning_rate": 7.981858236686661e-06, + "loss": 0.4888, + "step": 3265 + }, + { + "epoch": 0.3178588807785888, + "grad_norm": 1.1677482677553854, + "learning_rate": 7.98059289540914e-06, + "loss": 0.348, + "step": 3266 + }, + { + "epoch": 0.31795620437956207, + "grad_norm": 1.280054884121347, + "learning_rate": 7.97932725794827e-06, + "loss": 0.3909, + "step": 3267 + }, + { + "epoch": 0.31805352798053527, + "grad_norm": 1.2818245878680554, + "learning_rate": 7.97806132442982e-06, + "loss": 0.3432, + "step": 3268 + }, + { + "epoch": 0.3181508515815085, + "grad_norm": 1.2511980990717368, + "learning_rate": 7.976795094979586e-06, + "loss": 0.398, + "step": 3269 + }, + { + "epoch": 0.3182481751824818, + "grad_norm": 1.1398243641659185, + "learning_rate": 7.975528569723391e-06, + "loss": 0.3561, + "step": 3270 + }, + { + "epoch": 0.31834549878345497, + "grad_norm": 1.4375913010503336, + "learning_rate": 7.974261748787096e-06, + "loss": 0.4341, + "step": 3271 + }, + { + "epoch": 0.3184428223844282, + "grad_norm": 1.5232808350435216, + "learning_rate": 7.972994632296583e-06, + "loss": 0.443, + "step": 3272 + }, + { + "epoch": 0.3185401459854015, + "grad_norm": 1.1586214953526035, + "learning_rate": 7.971727220377765e-06, + "loss": 0.3709, + "step": 3273 + }, + { + "epoch": 0.3186374695863747, + "grad_norm": 1.5274740880588324, + "learning_rate": 7.970459513156587e-06, + "loss": 0.3699, + "step": 3274 + }, + { + "epoch": 0.31873479318734793, + "grad_norm": 1.3981699767571285, + "learning_rate": 7.969191510759021e-06, + "loss": 0.3678, + "step": 3275 + }, + { + "epoch": 0.3188321167883212, + "grad_norm": 1.934723248663663, + "learning_rate": 7.96792321331107e-06, + "loss": 0.5528, + "step": 3276 + }, + { + "epoch": 0.3189294403892944, + "grad_norm": 1.3063523868424662, + "learning_rate": 7.966654620938765e-06, + "loss": 0.381, + "step": 3277 + }, + { + "epoch": 0.31902676399026764, + "grad_norm": 1.4316009890701873, + "learning_rate": 7.965385733768166e-06, + "loss": 0.3462, + "step": 3278 + }, + { + "epoch": 0.3191240875912409, + "grad_norm": 1.3925079285209303, + "learning_rate": 7.964116551925365e-06, + "loss": 0.3468, + "step": 3279 + }, + { + "epoch": 0.3192214111922141, + "grad_norm": 1.5929020888752208, + "learning_rate": 7.96284707553648e-06, + "loss": 0.5416, + "step": 3280 + }, + { + "epoch": 0.31931873479318734, + "grad_norm": 1.3866900373898063, + "learning_rate": 7.961577304727659e-06, + "loss": 0.3982, + "step": 3281 + }, + { + "epoch": 0.3194160583941606, + "grad_norm": 1.4962340605356037, + "learning_rate": 7.960307239625082e-06, + "loss": 0.4023, + "step": 3282 + }, + { + "epoch": 0.3195133819951338, + "grad_norm": 1.4057559523045156, + "learning_rate": 7.959036880354955e-06, + "loss": 0.495, + "step": 3283 + }, + { + "epoch": 0.31961070559610705, + "grad_norm": 1.2622065590243314, + "learning_rate": 7.957766227043514e-06, + "loss": 0.3581, + "step": 3284 + }, + { + "epoch": 0.3197080291970803, + "grad_norm": 1.5090453488845967, + "learning_rate": 7.956495279817026e-06, + "loss": 0.455, + "step": 3285 + }, + { + "epoch": 0.31980535279805355, + "grad_norm": 1.3133879337401893, + "learning_rate": 7.955224038801785e-06, + "loss": 0.4625, + "step": 3286 + }, + { + "epoch": 0.31990267639902675, + "grad_norm": 2.521541296377749, + "learning_rate": 7.953952504124114e-06, + "loss": 0.4415, + "step": 3287 + }, + { + "epoch": 0.32, + "grad_norm": 1.3567937262050411, + "learning_rate": 7.952680675910365e-06, + "loss": 0.3309, + "step": 3288 + }, + { + "epoch": 0.32009732360097326, + "grad_norm": 1.5421944908903493, + "learning_rate": 7.951408554286926e-06, + "loss": 0.4589, + "step": 3289 + }, + { + "epoch": 0.32019464720194646, + "grad_norm": 1.5998274173642424, + "learning_rate": 7.950136139380204e-06, + "loss": 0.5359, + "step": 3290 + }, + { + "epoch": 0.3202919708029197, + "grad_norm": 1.2725707304657317, + "learning_rate": 7.948863431316639e-06, + "loss": 0.3625, + "step": 3291 + }, + { + "epoch": 0.32038929440389297, + "grad_norm": 1.4290851095226622, + "learning_rate": 7.947590430222702e-06, + "loss": 0.4872, + "step": 3292 + }, + { + "epoch": 0.32048661800486616, + "grad_norm": 1.3420498316619087, + "learning_rate": 7.946317136224894e-06, + "loss": 0.2389, + "step": 3293 + }, + { + "epoch": 0.3205839416058394, + "grad_norm": 1.5002712163507215, + "learning_rate": 7.94504354944974e-06, + "loss": 0.5157, + "step": 3294 + }, + { + "epoch": 0.32068126520681267, + "grad_norm": 1.660535414536618, + "learning_rate": 7.9437696700238e-06, + "loss": 0.4267, + "step": 3295 + }, + { + "epoch": 0.32077858880778587, + "grad_norm": 1.2370297819791147, + "learning_rate": 7.942495498073657e-06, + "loss": 0.3355, + "step": 3296 + }, + { + "epoch": 0.3208759124087591, + "grad_norm": 1.1415276287275913, + "learning_rate": 7.941221033725928e-06, + "loss": 0.2944, + "step": 3297 + }, + { + "epoch": 0.3209732360097324, + "grad_norm": 1.5554788387015477, + "learning_rate": 7.939946277107258e-06, + "loss": 0.4871, + "step": 3298 + }, + { + "epoch": 0.3210705596107056, + "grad_norm": 1.2985241068062738, + "learning_rate": 7.938671228344319e-06, + "loss": 0.3143, + "step": 3299 + }, + { + "epoch": 0.32116788321167883, + "grad_norm": 1.301901578297674, + "learning_rate": 7.937395887563812e-06, + "loss": 0.3965, + "step": 3300 + }, + { + "epoch": 0.3212652068126521, + "grad_norm": 0.9529902878913864, + "learning_rate": 7.936120254892471e-06, + "loss": 0.3083, + "step": 3301 + }, + { + "epoch": 0.3213625304136253, + "grad_norm": 1.442015067028423, + "learning_rate": 7.934844330457056e-06, + "loss": 0.4318, + "step": 3302 + }, + { + "epoch": 0.32145985401459853, + "grad_norm": 0.8997835110854661, + "learning_rate": 7.933568114384358e-06, + "loss": 0.2885, + "step": 3303 + }, + { + "epoch": 0.3215571776155718, + "grad_norm": 1.5293826180265608, + "learning_rate": 7.932291606801192e-06, + "loss": 0.5437, + "step": 3304 + }, + { + "epoch": 0.321654501216545, + "grad_norm": 1.2264156375116992, + "learning_rate": 7.931014807834405e-06, + "loss": 0.4001, + "step": 3305 + }, + { + "epoch": 0.32175182481751824, + "grad_norm": 1.260350527748902, + "learning_rate": 7.929737717610878e-06, + "loss": 0.3847, + "step": 3306 + }, + { + "epoch": 0.3218491484184915, + "grad_norm": 1.1346472253273232, + "learning_rate": 7.92846033625751e-06, + "loss": 0.3766, + "step": 3307 + }, + { + "epoch": 0.32194647201946475, + "grad_norm": 1.1035142206503785, + "learning_rate": 7.927182663901241e-06, + "loss": 0.369, + "step": 3308 + }, + { + "epoch": 0.32204379562043794, + "grad_norm": 1.2980599562576733, + "learning_rate": 7.92590470066903e-06, + "loss": 0.3982, + "step": 3309 + }, + { + "epoch": 0.3221411192214112, + "grad_norm": 1.1742301163722888, + "learning_rate": 7.924626446687871e-06, + "loss": 0.3423, + "step": 3310 + }, + { + "epoch": 0.32223844282238445, + "grad_norm": 1.8345279558348917, + "learning_rate": 7.923347902084784e-06, + "loss": 0.3145, + "step": 3311 + }, + { + "epoch": 0.32233576642335765, + "grad_norm": 1.562283157560816, + "learning_rate": 7.92206906698682e-06, + "loss": 0.4308, + "step": 3312 + }, + { + "epoch": 0.3224330900243309, + "grad_norm": 1.568673716358362, + "learning_rate": 7.920789941521053e-06, + "loss": 0.7025, + "step": 3313 + }, + { + "epoch": 0.32253041362530416, + "grad_norm": 1.409203500738135, + "learning_rate": 7.9195105258146e-06, + "loss": 0.459, + "step": 3314 + }, + { + "epoch": 0.32262773722627736, + "grad_norm": 0.880538645206563, + "learning_rate": 7.918230819994589e-06, + "loss": 0.2786, + "step": 3315 + }, + { + "epoch": 0.3227250608272506, + "grad_norm": 1.1906838149715093, + "learning_rate": 7.916950824188188e-06, + "loss": 0.2686, + "step": 3316 + }, + { + "epoch": 0.32282238442822386, + "grad_norm": 1.5111358859487132, + "learning_rate": 7.91567053852259e-06, + "loss": 0.5147, + "step": 3317 + }, + { + "epoch": 0.32291970802919706, + "grad_norm": 1.1464259378074475, + "learning_rate": 7.914389963125018e-06, + "loss": 0.2685, + "step": 3318 + }, + { + "epoch": 0.3230170316301703, + "grad_norm": 1.581506486679798, + "learning_rate": 7.913109098122726e-06, + "loss": 0.5854, + "step": 3319 + }, + { + "epoch": 0.32311435523114357, + "grad_norm": 1.3242645255760028, + "learning_rate": 7.91182794364299e-06, + "loss": 0.2315, + "step": 3320 + }, + { + "epoch": 0.32321167883211677, + "grad_norm": 1.6094300344001513, + "learning_rate": 7.910546499813125e-06, + "loss": 0.4739, + "step": 3321 + }, + { + "epoch": 0.32330900243309, + "grad_norm": 1.3446882210600364, + "learning_rate": 7.909264766760462e-06, + "loss": 0.4145, + "step": 3322 + }, + { + "epoch": 0.3234063260340633, + "grad_norm": 1.282785844235514, + "learning_rate": 7.907982744612373e-06, + "loss": 0.4324, + "step": 3323 + }, + { + "epoch": 0.32350364963503647, + "grad_norm": 1.8199182173411141, + "learning_rate": 7.90670043349625e-06, + "loss": 0.2738, + "step": 3324 + }, + { + "epoch": 0.3236009732360097, + "grad_norm": 1.3779170997525898, + "learning_rate": 7.90541783353952e-06, + "loss": 0.4386, + "step": 3325 + }, + { + "epoch": 0.323698296836983, + "grad_norm": 1.4651475108226641, + "learning_rate": 7.904134944869631e-06, + "loss": 0.2272, + "step": 3326 + }, + { + "epoch": 0.3237956204379562, + "grad_norm": 1.4444649350514327, + "learning_rate": 7.902851767614069e-06, + "loss": 0.3631, + "step": 3327 + }, + { + "epoch": 0.32389294403892943, + "grad_norm": 1.6625856822026597, + "learning_rate": 7.901568301900343e-06, + "loss": 0.3649, + "step": 3328 + }, + { + "epoch": 0.3239902676399027, + "grad_norm": 1.4506728288423798, + "learning_rate": 7.900284547855992e-06, + "loss": 0.3231, + "step": 3329 + }, + { + "epoch": 0.32408759124087594, + "grad_norm": 1.5932450335999997, + "learning_rate": 7.899000505608583e-06, + "loss": 0.6145, + "step": 3330 + }, + { + "epoch": 0.32418491484184914, + "grad_norm": 1.466657731000094, + "learning_rate": 7.89771617528571e-06, + "loss": 0.4498, + "step": 3331 + }, + { + "epoch": 0.3242822384428224, + "grad_norm": 1.1444340687397683, + "learning_rate": 7.896431557015001e-06, + "loss": 0.3953, + "step": 3332 + }, + { + "epoch": 0.32437956204379564, + "grad_norm": 1.2696327521021862, + "learning_rate": 7.895146650924106e-06, + "loss": 0.3974, + "step": 3333 + }, + { + "epoch": 0.32447688564476884, + "grad_norm": 1.0672959190242737, + "learning_rate": 7.893861457140711e-06, + "loss": 0.3147, + "step": 3334 + }, + { + "epoch": 0.3245742092457421, + "grad_norm": 1.5878275615936528, + "learning_rate": 7.892575975792524e-06, + "loss": 0.5637, + "step": 3335 + }, + { + "epoch": 0.32467153284671535, + "grad_norm": 1.3553067860868797, + "learning_rate": 7.891290207007284e-06, + "loss": 0.3979, + "step": 3336 + }, + { + "epoch": 0.32476885644768855, + "grad_norm": 1.3585205023281657, + "learning_rate": 7.890004150912758e-06, + "loss": 0.5408, + "step": 3337 + }, + { + "epoch": 0.3248661800486618, + "grad_norm": 1.3335333713837063, + "learning_rate": 7.888717807636745e-06, + "loss": 0.5097, + "step": 3338 + }, + { + "epoch": 0.32496350364963505, + "grad_norm": 1.6084982696846433, + "learning_rate": 7.887431177307067e-06, + "loss": 0.6652, + "step": 3339 + }, + { + "epoch": 0.32506082725060825, + "grad_norm": 1.5254713518221517, + "learning_rate": 7.886144260051577e-06, + "loss": 0.5413, + "step": 3340 + }, + { + "epoch": 0.3251581508515815, + "grad_norm": 1.2319691547427678, + "learning_rate": 7.88485705599816e-06, + "loss": 0.3669, + "step": 3341 + }, + { + "epoch": 0.32525547445255476, + "grad_norm": 1.8060436355287317, + "learning_rate": 7.883569565274722e-06, + "loss": 0.332, + "step": 3342 + }, + { + "epoch": 0.32535279805352796, + "grad_norm": 1.2724687132520958, + "learning_rate": 7.882281788009207e-06, + "loss": 0.4156, + "step": 3343 + }, + { + "epoch": 0.3254501216545012, + "grad_norm": 1.2678653056689784, + "learning_rate": 7.880993724329578e-06, + "loss": 0.34, + "step": 3344 + }, + { + "epoch": 0.32554744525547447, + "grad_norm": 1.6551598614012555, + "learning_rate": 7.879705374363831e-06, + "loss": 0.4642, + "step": 3345 + }, + { + "epoch": 0.32564476885644766, + "grad_norm": 1.3206920191183078, + "learning_rate": 7.878416738239991e-06, + "loss": 0.3755, + "step": 3346 + }, + { + "epoch": 0.3257420924574209, + "grad_norm": 1.3858157374277495, + "learning_rate": 7.877127816086109e-06, + "loss": 0.3394, + "step": 3347 + }, + { + "epoch": 0.32583941605839417, + "grad_norm": 2.0774474222482286, + "learning_rate": 7.87583860803027e-06, + "loss": 0.4237, + "step": 3348 + }, + { + "epoch": 0.32593673965936737, + "grad_norm": 2.373250216339497, + "learning_rate": 7.87454911420058e-06, + "loss": 0.4854, + "step": 3349 + }, + { + "epoch": 0.3260340632603406, + "grad_norm": 1.2888779016735306, + "learning_rate": 7.873259334725177e-06, + "loss": 0.2953, + "step": 3350 + }, + { + "epoch": 0.3261313868613139, + "grad_norm": 1.4655511800886896, + "learning_rate": 7.87196926973223e-06, + "loss": 0.5252, + "step": 3351 + }, + { + "epoch": 0.32622871046228713, + "grad_norm": 1.3179348636560488, + "learning_rate": 7.870678919349929e-06, + "loss": 0.3587, + "step": 3352 + }, + { + "epoch": 0.32632603406326033, + "grad_norm": 1.5442264887171864, + "learning_rate": 7.869388283706501e-06, + "loss": 0.3808, + "step": 3353 + }, + { + "epoch": 0.3264233576642336, + "grad_norm": 0.909176934030135, + "learning_rate": 7.868097362930194e-06, + "loss": 0.1721, + "step": 3354 + }, + { + "epoch": 0.32652068126520684, + "grad_norm": 1.5955557971117078, + "learning_rate": 7.866806157149291e-06, + "loss": 0.5127, + "step": 3355 + }, + { + "epoch": 0.32661800486618003, + "grad_norm": 1.260336198155953, + "learning_rate": 7.865514666492096e-06, + "loss": 0.2699, + "step": 3356 + }, + { + "epoch": 0.3267153284671533, + "grad_norm": 1.5004384671553455, + "learning_rate": 7.864222891086948e-06, + "loss": 0.3168, + "step": 3357 + }, + { + "epoch": 0.32681265206812654, + "grad_norm": 1.261076205541224, + "learning_rate": 7.862930831062211e-06, + "loss": 0.3678, + "step": 3358 + }, + { + "epoch": 0.32690997566909974, + "grad_norm": 1.637405579152474, + "learning_rate": 7.861638486546279e-06, + "loss": 0.4613, + "step": 3359 + }, + { + "epoch": 0.327007299270073, + "grad_norm": 1.3584995739472485, + "learning_rate": 7.860345857667571e-06, + "loss": 0.3229, + "step": 3360 + }, + { + "epoch": 0.32710462287104625, + "grad_norm": 1.5428951197572305, + "learning_rate": 7.859052944554537e-06, + "loss": 0.4904, + "step": 3361 + }, + { + "epoch": 0.32720194647201944, + "grad_norm": 1.2674356859582525, + "learning_rate": 7.857759747335652e-06, + "loss": 0.2942, + "step": 3362 + }, + { + "epoch": 0.3272992700729927, + "grad_norm": 1.5802075836374327, + "learning_rate": 7.856466266139426e-06, + "loss": 0.2949, + "step": 3363 + }, + { + "epoch": 0.32739659367396595, + "grad_norm": 1.8209546966028776, + "learning_rate": 7.855172501094394e-06, + "loss": 0.5036, + "step": 3364 + }, + { + "epoch": 0.32749391727493915, + "grad_norm": 1.5829487718400805, + "learning_rate": 7.853878452329113e-06, + "loss": 0.3638, + "step": 3365 + }, + { + "epoch": 0.3275912408759124, + "grad_norm": 1.439883288183698, + "learning_rate": 7.852584119972178e-06, + "loss": 0.4529, + "step": 3366 + }, + { + "epoch": 0.32768856447688566, + "grad_norm": 1.188441154560626, + "learning_rate": 7.851289504152201e-06, + "loss": 0.1984, + "step": 3367 + }, + { + "epoch": 0.32778588807785886, + "grad_norm": 1.3841398947965158, + "learning_rate": 7.84999460499784e-06, + "loss": 0.3274, + "step": 3368 + }, + { + "epoch": 0.3278832116788321, + "grad_norm": 1.3784180119913427, + "learning_rate": 7.848699422637757e-06, + "loss": 0.5186, + "step": 3369 + }, + { + "epoch": 0.32798053527980536, + "grad_norm": 1.6167829071938324, + "learning_rate": 7.847403957200667e-06, + "loss": 0.5905, + "step": 3370 + }, + { + "epoch": 0.32807785888077856, + "grad_norm": 1.5035664756701093, + "learning_rate": 7.846108208815292e-06, + "loss": 0.3502, + "step": 3371 + }, + { + "epoch": 0.3281751824817518, + "grad_norm": 1.3836098651173667, + "learning_rate": 7.844812177610398e-06, + "loss": 0.426, + "step": 3372 + }, + { + "epoch": 0.32827250608272507, + "grad_norm": 1.33208856990685, + "learning_rate": 7.843515863714766e-06, + "loss": 0.38, + "step": 3373 + }, + { + "epoch": 0.3283698296836983, + "grad_norm": 1.3222208170433662, + "learning_rate": 7.842219267257216e-06, + "loss": 0.33, + "step": 3374 + }, + { + "epoch": 0.3284671532846715, + "grad_norm": 1.26421550866771, + "learning_rate": 7.84092238836659e-06, + "loss": 0.3682, + "step": 3375 + }, + { + "epoch": 0.3285644768856448, + "grad_norm": 1.3594451535417975, + "learning_rate": 7.839625227171762e-06, + "loss": 0.4504, + "step": 3376 + }, + { + "epoch": 0.328661800486618, + "grad_norm": 1.2038996790979526, + "learning_rate": 7.838327783801627e-06, + "loss": 0.3675, + "step": 3377 + }, + { + "epoch": 0.3287591240875912, + "grad_norm": 1.2523761100181583, + "learning_rate": 7.837030058385117e-06, + "loss": 0.2582, + "step": 3378 + }, + { + "epoch": 0.3288564476885645, + "grad_norm": 1.38623949822592, + "learning_rate": 7.835732051051188e-06, + "loss": 0.426, + "step": 3379 + }, + { + "epoch": 0.32895377128953773, + "grad_norm": 1.6752196039887233, + "learning_rate": 7.834433761928819e-06, + "loss": 0.5995, + "step": 3380 + }, + { + "epoch": 0.32905109489051093, + "grad_norm": 1.4769893614554368, + "learning_rate": 7.833135191147027e-06, + "loss": 0.4434, + "step": 3381 + }, + { + "epoch": 0.3291484184914842, + "grad_norm": 1.3473793961690093, + "learning_rate": 7.831836338834851e-06, + "loss": 0.4064, + "step": 3382 + }, + { + "epoch": 0.32924574209245744, + "grad_norm": 1.4564612861566626, + "learning_rate": 7.830537205121354e-06, + "loss": 0.5275, + "step": 3383 + }, + { + "epoch": 0.32934306569343064, + "grad_norm": 1.4662477809046923, + "learning_rate": 7.829237790135638e-06, + "loss": 0.3745, + "step": 3384 + }, + { + "epoch": 0.3294403892944039, + "grad_norm": 1.3883338656353856, + "learning_rate": 7.827938094006822e-06, + "loss": 0.4361, + "step": 3385 + }, + { + "epoch": 0.32953771289537714, + "grad_norm": 1.2360062745222065, + "learning_rate": 7.826638116864061e-06, + "loss": 0.2936, + "step": 3386 + }, + { + "epoch": 0.32963503649635034, + "grad_norm": 1.2636393287865908, + "learning_rate": 7.82533785883653e-06, + "loss": 0.3816, + "step": 3387 + }, + { + "epoch": 0.3297323600973236, + "grad_norm": 1.5704874728825693, + "learning_rate": 7.824037320053442e-06, + "loss": 0.4946, + "step": 3388 + }, + { + "epoch": 0.32982968369829685, + "grad_norm": 1.5450366878165769, + "learning_rate": 7.822736500644028e-06, + "loss": 0.5973, + "step": 3389 + }, + { + "epoch": 0.32992700729927005, + "grad_norm": 1.570140468606553, + "learning_rate": 7.821435400737555e-06, + "loss": 0.6187, + "step": 3390 + }, + { + "epoch": 0.3300243309002433, + "grad_norm": 1.404973531589098, + "learning_rate": 7.820134020463311e-06, + "loss": 0.4404, + "step": 3391 + }, + { + "epoch": 0.33012165450121655, + "grad_norm": 1.4221866811013593, + "learning_rate": 7.818832359950615e-06, + "loss": 0.4375, + "step": 3392 + }, + { + "epoch": 0.3302189781021898, + "grad_norm": 1.3514761483715907, + "learning_rate": 7.817530419328815e-06, + "loss": 0.4633, + "step": 3393 + }, + { + "epoch": 0.330316301703163, + "grad_norm": 1.4122938191319212, + "learning_rate": 7.816228198727287e-06, + "loss": 0.4735, + "step": 3394 + }, + { + "epoch": 0.33041362530413626, + "grad_norm": 1.2807472917541904, + "learning_rate": 7.814925698275432e-06, + "loss": 0.2993, + "step": 3395 + }, + { + "epoch": 0.3305109489051095, + "grad_norm": 1.2374164437493267, + "learning_rate": 7.813622918102679e-06, + "loss": 0.4486, + "step": 3396 + }, + { + "epoch": 0.3306082725060827, + "grad_norm": 1.4923726710921128, + "learning_rate": 7.812319858338486e-06, + "loss": 0.3976, + "step": 3397 + }, + { + "epoch": 0.33070559610705597, + "grad_norm": 1.4652422146853137, + "learning_rate": 7.811016519112342e-06, + "loss": 0.509, + "step": 3398 + }, + { + "epoch": 0.3308029197080292, + "grad_norm": 1.3523346564010856, + "learning_rate": 7.80971290055376e-06, + "loss": 0.4045, + "step": 3399 + }, + { + "epoch": 0.3309002433090024, + "grad_norm": 1.4034359644709637, + "learning_rate": 7.808409002792277e-06, + "loss": 0.5252, + "step": 3400 + }, + { + "epoch": 0.33099756690997567, + "grad_norm": 1.5977042267924388, + "learning_rate": 7.807104825957466e-06, + "loss": 0.5708, + "step": 3401 + }, + { + "epoch": 0.3310948905109489, + "grad_norm": 1.194169039851455, + "learning_rate": 7.805800370178925e-06, + "loss": 0.2592, + "step": 3402 + }, + { + "epoch": 0.3311922141119221, + "grad_norm": 1.3572077520529662, + "learning_rate": 7.804495635586274e-06, + "loss": 0.3838, + "step": 3403 + }, + { + "epoch": 0.3312895377128954, + "grad_norm": 1.6103699950857218, + "learning_rate": 7.80319062230917e-06, + "loss": 0.2847, + "step": 3404 + }, + { + "epoch": 0.33138686131386863, + "grad_norm": 1.1427751941761943, + "learning_rate": 7.80188533047729e-06, + "loss": 0.3235, + "step": 3405 + }, + { + "epoch": 0.33148418491484183, + "grad_norm": 1.4205910051862616, + "learning_rate": 7.800579760220343e-06, + "loss": 0.4415, + "step": 3406 + }, + { + "epoch": 0.3315815085158151, + "grad_norm": 1.239833112068907, + "learning_rate": 7.799273911668062e-06, + "loss": 0.296, + "step": 3407 + }, + { + "epoch": 0.33167883211678834, + "grad_norm": 1.382006652854662, + "learning_rate": 7.797967784950215e-06, + "loss": 0.5129, + "step": 3408 + }, + { + "epoch": 0.33177615571776153, + "grad_norm": 1.3910482812533478, + "learning_rate": 7.796661380196587e-06, + "loss": 0.4355, + "step": 3409 + }, + { + "epoch": 0.3318734793187348, + "grad_norm": 1.3166393673557537, + "learning_rate": 7.795354697537e-06, + "loss": 0.3357, + "step": 3410 + }, + { + "epoch": 0.33197080291970804, + "grad_norm": 1.3540344564992455, + "learning_rate": 7.794047737101298e-06, + "loss": 0.2772, + "step": 3411 + }, + { + "epoch": 0.33206812652068124, + "grad_norm": 1.5732997739305445, + "learning_rate": 7.792740499019354e-06, + "loss": 0.368, + "step": 3412 + }, + { + "epoch": 0.3321654501216545, + "grad_norm": 1.00398349093736, + "learning_rate": 7.791432983421071e-06, + "loss": 0.2794, + "step": 3413 + }, + { + "epoch": 0.33226277372262775, + "grad_norm": 1.5831231764140208, + "learning_rate": 7.790125190436378e-06, + "loss": 0.399, + "step": 3414 + }, + { + "epoch": 0.332360097323601, + "grad_norm": 1.301291609070449, + "learning_rate": 7.788817120195228e-06, + "loss": 0.4975, + "step": 3415 + }, + { + "epoch": 0.3324574209245742, + "grad_norm": 3.04667201221868, + "learning_rate": 7.787508772827606e-06, + "loss": 0.3034, + "step": 3416 + }, + { + "epoch": 0.33255474452554745, + "grad_norm": 1.3711038355442808, + "learning_rate": 7.786200148463525e-06, + "loss": 0.4023, + "step": 3417 + }, + { + "epoch": 0.3326520681265207, + "grad_norm": 1.4460757328108422, + "learning_rate": 7.784891247233025e-06, + "loss": 0.5218, + "step": 3418 + }, + { + "epoch": 0.3327493917274939, + "grad_norm": 1.5174415949182483, + "learning_rate": 7.783582069266167e-06, + "loss": 0.5401, + "step": 3419 + }, + { + "epoch": 0.33284671532846716, + "grad_norm": 1.170900270080405, + "learning_rate": 7.78227261469305e-06, + "loss": 0.3093, + "step": 3420 + }, + { + "epoch": 0.3329440389294404, + "grad_norm": 1.1117784496155982, + "learning_rate": 7.78096288364379e-06, + "loss": 0.2159, + "step": 3421 + }, + { + "epoch": 0.3330413625304136, + "grad_norm": 1.385907002729564, + "learning_rate": 7.779652876248541e-06, + "loss": 0.4513, + "step": 3422 + }, + { + "epoch": 0.33313868613138686, + "grad_norm": 1.091845134289533, + "learning_rate": 7.778342592637477e-06, + "loss": 0.249, + "step": 3423 + }, + { + "epoch": 0.3332360097323601, + "grad_norm": 1.1987125682853903, + "learning_rate": 7.7770320329408e-06, + "loss": 0.4583, + "step": 3424 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.150260309114711, + "learning_rate": 7.775721197288746e-06, + "loss": 0.4145, + "step": 3425 + }, + { + "epoch": 0.33343065693430657, + "grad_norm": 1.1244746146994131, + "learning_rate": 7.77441008581157e-06, + "loss": 0.2334, + "step": 3426 + }, + { + "epoch": 0.3335279805352798, + "grad_norm": 2.372002969337908, + "learning_rate": 7.773098698639558e-06, + "loss": 0.3346, + "step": 3427 + }, + { + "epoch": 0.333625304136253, + "grad_norm": 1.460305633169593, + "learning_rate": 7.771787035903023e-06, + "loss": 0.5202, + "step": 3428 + }, + { + "epoch": 0.3337226277372263, + "grad_norm": 1.4552706392676258, + "learning_rate": 7.77047509773231e-06, + "loss": 0.3249, + "step": 3429 + }, + { + "epoch": 0.3338199513381995, + "grad_norm": 26.423500051667432, + "learning_rate": 7.769162884257778e-06, + "loss": 0.3919, + "step": 3430 + }, + { + "epoch": 0.3339172749391727, + "grad_norm": 1.3364724766772538, + "learning_rate": 7.767850395609832e-06, + "loss": 0.4882, + "step": 3431 + }, + { + "epoch": 0.334014598540146, + "grad_norm": 1.162494089255124, + "learning_rate": 7.766537631918888e-06, + "loss": 0.4172, + "step": 3432 + }, + { + "epoch": 0.33411192214111923, + "grad_norm": 1.4584273535075323, + "learning_rate": 7.765224593315402e-06, + "loss": 0.5721, + "step": 3433 + }, + { + "epoch": 0.33420924574209243, + "grad_norm": 1.39936657097592, + "learning_rate": 7.763911279929848e-06, + "loss": 0.4454, + "step": 3434 + }, + { + "epoch": 0.3343065693430657, + "grad_norm": 1.5774498430067907, + "learning_rate": 7.76259769189273e-06, + "loss": 0.6756, + "step": 3435 + }, + { + "epoch": 0.33440389294403894, + "grad_norm": 1.4346477573335101, + "learning_rate": 7.761283829334583e-06, + "loss": 0.4939, + "step": 3436 + }, + { + "epoch": 0.3345012165450122, + "grad_norm": 1.2329772568374064, + "learning_rate": 7.759969692385963e-06, + "loss": 0.3576, + "step": 3437 + }, + { + "epoch": 0.3345985401459854, + "grad_norm": 86.06815068595351, + "learning_rate": 7.75865528117746e-06, + "loss": 0.7983, + "step": 3438 + }, + { + "epoch": 0.33469586374695864, + "grad_norm": 1.4916748444459116, + "learning_rate": 7.757340595839686e-06, + "loss": 0.3408, + "step": 3439 + }, + { + "epoch": 0.3347931873479319, + "grad_norm": 1.204864631425379, + "learning_rate": 7.756025636503281e-06, + "loss": 0.2893, + "step": 3440 + }, + { + "epoch": 0.3348905109489051, + "grad_norm": 1.1483309949418294, + "learning_rate": 7.754710403298915e-06, + "loss": 0.307, + "step": 3441 + }, + { + "epoch": 0.33498783454987835, + "grad_norm": 1.3801437746700074, + "learning_rate": 7.753394896357283e-06, + "loss": 0.5086, + "step": 3442 + }, + { + "epoch": 0.3350851581508516, + "grad_norm": 1.4670528589774587, + "learning_rate": 7.752079115809105e-06, + "loss": 0.5494, + "step": 3443 + }, + { + "epoch": 0.3351824817518248, + "grad_norm": 1.2268331435647832, + "learning_rate": 7.750763061785139e-06, + "loss": 0.3421, + "step": 3444 + }, + { + "epoch": 0.33527980535279805, + "grad_norm": 1.117498287907938, + "learning_rate": 7.749446734416153e-06, + "loss": 0.3583, + "step": 3445 + }, + { + "epoch": 0.3353771289537713, + "grad_norm": 1.6628933950216975, + "learning_rate": 7.748130133832956e-06, + "loss": 0.4265, + "step": 3446 + }, + { + "epoch": 0.3354744525547445, + "grad_norm": 1.4371941282513903, + "learning_rate": 7.746813260166379e-06, + "loss": 0.5499, + "step": 3447 + }, + { + "epoch": 0.33557177615571776, + "grad_norm": 1.4139636094726638, + "learning_rate": 7.74549611354728e-06, + "loss": 0.5113, + "step": 3448 + }, + { + "epoch": 0.335669099756691, + "grad_norm": 1.3607040572953095, + "learning_rate": 7.744178694106545e-06, + "loss": 0.3662, + "step": 3449 + }, + { + "epoch": 0.3357664233576642, + "grad_norm": 1.3497420382405303, + "learning_rate": 7.742861001975086e-06, + "loss": 0.37, + "step": 3450 + }, + { + "epoch": 0.33586374695863747, + "grad_norm": 1.1583812763163044, + "learning_rate": 7.741543037283844e-06, + "loss": 0.2328, + "step": 3451 + }, + { + "epoch": 0.3359610705596107, + "grad_norm": 1.233691835278808, + "learning_rate": 7.740224800163783e-06, + "loss": 0.3023, + "step": 3452 + }, + { + "epoch": 0.3360583941605839, + "grad_norm": 1.770041486932794, + "learning_rate": 7.738906290745902e-06, + "loss": 0.4559, + "step": 3453 + }, + { + "epoch": 0.33615571776155717, + "grad_norm": 1.6249038227669963, + "learning_rate": 7.737587509161218e-06, + "loss": 0.3305, + "step": 3454 + }, + { + "epoch": 0.3362530413625304, + "grad_norm": 1.7123859840919058, + "learning_rate": 7.73626845554078e-06, + "loss": 0.8223, + "step": 3455 + }, + { + "epoch": 0.3363503649635036, + "grad_norm": 1.465565821382168, + "learning_rate": 7.734949130015665e-06, + "loss": 0.3951, + "step": 3456 + }, + { + "epoch": 0.3364476885644769, + "grad_norm": 1.3358312642650005, + "learning_rate": 7.733629532716974e-06, + "loss": 0.3988, + "step": 3457 + }, + { + "epoch": 0.33654501216545013, + "grad_norm": 1.4614411980006665, + "learning_rate": 7.732309663775834e-06, + "loss": 0.4447, + "step": 3458 + }, + { + "epoch": 0.3366423357664234, + "grad_norm": 1.5007446348141111, + "learning_rate": 7.730989523323405e-06, + "loss": 0.5075, + "step": 3459 + }, + { + "epoch": 0.3367396593673966, + "grad_norm": 1.378374467272079, + "learning_rate": 7.72966911149087e-06, + "loss": 0.3713, + "step": 3460 + }, + { + "epoch": 0.33683698296836984, + "grad_norm": 1.264554548276351, + "learning_rate": 7.728348428409434e-06, + "loss": 0.4239, + "step": 3461 + }, + { + "epoch": 0.3369343065693431, + "grad_norm": 1.3654939025524866, + "learning_rate": 7.72702747421034e-06, + "loss": 0.3861, + "step": 3462 + }, + { + "epoch": 0.3370316301703163, + "grad_norm": 1.456076628879786, + "learning_rate": 7.72570624902485e-06, + "loss": 0.4548, + "step": 3463 + }, + { + "epoch": 0.33712895377128954, + "grad_norm": 1.5979907891076075, + "learning_rate": 7.724384752984253e-06, + "loss": 0.5395, + "step": 3464 + }, + { + "epoch": 0.3372262773722628, + "grad_norm": 1.479690145713753, + "learning_rate": 7.723062986219871e-06, + "loss": 0.4676, + "step": 3465 + }, + { + "epoch": 0.337323600973236, + "grad_norm": 1.5794398807050158, + "learning_rate": 7.721740948863044e-06, + "loss": 0.6383, + "step": 3466 + }, + { + "epoch": 0.33742092457420925, + "grad_norm": 1.5077566027780562, + "learning_rate": 7.720418641045147e-06, + "loss": 0.449, + "step": 3467 + }, + { + "epoch": 0.3375182481751825, + "grad_norm": 1.536687422230877, + "learning_rate": 7.719096062897578e-06, + "loss": 0.3885, + "step": 3468 + }, + { + "epoch": 0.3376155717761557, + "grad_norm": 1.4136867972495795, + "learning_rate": 7.717773214551762e-06, + "loss": 0.4262, + "step": 3469 + }, + { + "epoch": 0.33771289537712895, + "grad_norm": 1.7521028146405362, + "learning_rate": 7.71645009613915e-06, + "loss": 0.3553, + "step": 3470 + }, + { + "epoch": 0.3378102189781022, + "grad_norm": 1.28472614917785, + "learning_rate": 7.715126707791223e-06, + "loss": 0.5044, + "step": 3471 + }, + { + "epoch": 0.3379075425790754, + "grad_norm": 1.5285157685020219, + "learning_rate": 7.713803049639485e-06, + "loss": 0.3067, + "step": 3472 + }, + { + "epoch": 0.33800486618004866, + "grad_norm": 1.301311213128161, + "learning_rate": 7.712479121815473e-06, + "loss": 0.4036, + "step": 3473 + }, + { + "epoch": 0.3381021897810219, + "grad_norm": 1.4164505494518185, + "learning_rate": 7.711154924450741e-06, + "loss": 0.3674, + "step": 3474 + }, + { + "epoch": 0.3381995133819951, + "grad_norm": 1.2252498333021546, + "learning_rate": 7.709830457676876e-06, + "loss": 0.273, + "step": 3475 + }, + { + "epoch": 0.33829683698296836, + "grad_norm": 1.4131843411362779, + "learning_rate": 7.708505721625497e-06, + "loss": 0.565, + "step": 3476 + }, + { + "epoch": 0.3383941605839416, + "grad_norm": 1.6947767404353455, + "learning_rate": 7.707180716428237e-06, + "loss": 0.4248, + "step": 3477 + }, + { + "epoch": 0.3384914841849148, + "grad_norm": 1.2913551057356365, + "learning_rate": 7.705855442216766e-06, + "loss": 0.3537, + "step": 3478 + }, + { + "epoch": 0.33858880778588807, + "grad_norm": 1.150600446597589, + "learning_rate": 7.704529899122776e-06, + "loss": 0.3311, + "step": 3479 + }, + { + "epoch": 0.3386861313868613, + "grad_norm": 1.2021311829209522, + "learning_rate": 7.703204087277989e-06, + "loss": 0.4163, + "step": 3480 + }, + { + "epoch": 0.3387834549878346, + "grad_norm": 1.256321657329123, + "learning_rate": 7.70187800681415e-06, + "loss": 0.3609, + "step": 3481 + }, + { + "epoch": 0.3388807785888078, + "grad_norm": 1.3378074898611492, + "learning_rate": 7.70055165786303e-06, + "loss": 0.3365, + "step": 3482 + }, + { + "epoch": 0.338978102189781, + "grad_norm": 1.170985634605712, + "learning_rate": 7.699225040556435e-06, + "loss": 0.2524, + "step": 3483 + }, + { + "epoch": 0.3390754257907543, + "grad_norm": 1.3653491057947706, + "learning_rate": 7.697898155026188e-06, + "loss": 0.282, + "step": 3484 + }, + { + "epoch": 0.3391727493917275, + "grad_norm": 1.3228454670807173, + "learning_rate": 7.696571001404143e-06, + "loss": 0.4489, + "step": 3485 + }, + { + "epoch": 0.33927007299270073, + "grad_norm": 1.568456967406488, + "learning_rate": 7.695243579822179e-06, + "loss": 0.672, + "step": 3486 + }, + { + "epoch": 0.339367396593674, + "grad_norm": 1.5380107558510523, + "learning_rate": 7.693915890412205e-06, + "loss": 0.4099, + "step": 3487 + }, + { + "epoch": 0.3394647201946472, + "grad_norm": 1.522434941547624, + "learning_rate": 7.692587933306152e-06, + "loss": 0.3895, + "step": 3488 + }, + { + "epoch": 0.33956204379562044, + "grad_norm": 1.6424350744273293, + "learning_rate": 7.691259708635983e-06, + "loss": 0.4547, + "step": 3489 + }, + { + "epoch": 0.3396593673965937, + "grad_norm": 1.1285195925950828, + "learning_rate": 7.689931216533682e-06, + "loss": 0.3205, + "step": 3490 + }, + { + "epoch": 0.3397566909975669, + "grad_norm": 1.2211335178765037, + "learning_rate": 7.68860245713126e-06, + "loss": 0.2849, + "step": 3491 + }, + { + "epoch": 0.33985401459854014, + "grad_norm": 1.3440619705365895, + "learning_rate": 7.687273430560763e-06, + "loss": 0.3493, + "step": 3492 + }, + { + "epoch": 0.3399513381995134, + "grad_norm": 1.9270777796759784, + "learning_rate": 7.685944136954252e-06, + "loss": 0.3207, + "step": 3493 + }, + { + "epoch": 0.3400486618004866, + "grad_norm": 1.3294429746967642, + "learning_rate": 7.684614576443821e-06, + "loss": 0.3215, + "step": 3494 + }, + { + "epoch": 0.34014598540145985, + "grad_norm": 1.299183824061712, + "learning_rate": 7.68328474916159e-06, + "loss": 0.3565, + "step": 3495 + }, + { + "epoch": 0.3402433090024331, + "grad_norm": 1.5283573621457682, + "learning_rate": 7.681954655239703e-06, + "loss": 0.4789, + "step": 3496 + }, + { + "epoch": 0.3403406326034063, + "grad_norm": 1.4764556038728422, + "learning_rate": 7.680624294810335e-06, + "loss": 0.4079, + "step": 3497 + }, + { + "epoch": 0.34043795620437955, + "grad_norm": 1.5139159959394148, + "learning_rate": 7.679293668005683e-06, + "loss": 0.5341, + "step": 3498 + }, + { + "epoch": 0.3405352798053528, + "grad_norm": 1.5537302722123498, + "learning_rate": 7.677962774957971e-06, + "loss": 0.478, + "step": 3499 + }, + { + "epoch": 0.340632603406326, + "grad_norm": 1.5827415849591213, + "learning_rate": 7.676631615799453e-06, + "loss": 0.359, + "step": 3500 + }, + { + "epoch": 0.34072992700729926, + "grad_norm": 1.8389434879657838, + "learning_rate": 7.675300190662406e-06, + "loss": 0.3688, + "step": 3501 + }, + { + "epoch": 0.3408272506082725, + "grad_norm": 1.219949474382852, + "learning_rate": 7.673968499679134e-06, + "loss": 0.3099, + "step": 3502 + }, + { + "epoch": 0.34092457420924577, + "grad_norm": 1.5256977236182008, + "learning_rate": 7.67263654298197e-06, + "loss": 0.3838, + "step": 3503 + }, + { + "epoch": 0.34102189781021897, + "grad_norm": 1.4659409139833723, + "learning_rate": 7.671304320703269e-06, + "loss": 0.4845, + "step": 3504 + }, + { + "epoch": 0.3411192214111922, + "grad_norm": 1.667843387368496, + "learning_rate": 7.669971832975417e-06, + "loss": 0.5876, + "step": 3505 + }, + { + "epoch": 0.3412165450121655, + "grad_norm": 1.346414364244355, + "learning_rate": 7.668639079930821e-06, + "loss": 0.4337, + "step": 3506 + }, + { + "epoch": 0.34131386861313867, + "grad_norm": 2.022273962642171, + "learning_rate": 7.66730606170192e-06, + "loss": 0.472, + "step": 3507 + }, + { + "epoch": 0.3414111922141119, + "grad_norm": 1.4021904736753643, + "learning_rate": 7.665972778421175e-06, + "loss": 0.3331, + "step": 3508 + }, + { + "epoch": 0.3415085158150852, + "grad_norm": 1.3964469851310124, + "learning_rate": 7.664639230221081e-06, + "loss": 0.4151, + "step": 3509 + }, + { + "epoch": 0.3416058394160584, + "grad_norm": 1.437167087649688, + "learning_rate": 7.663305417234146e-06, + "loss": 0.3751, + "step": 3510 + }, + { + "epoch": 0.34170316301703163, + "grad_norm": 1.3813486918408102, + "learning_rate": 7.661971339592913e-06, + "loss": 0.2492, + "step": 3511 + }, + { + "epoch": 0.3418004866180049, + "grad_norm": 2.213050506553265, + "learning_rate": 7.660636997429953e-06, + "loss": 0.2442, + "step": 3512 + }, + { + "epoch": 0.3418978102189781, + "grad_norm": 1.4125007730667565, + "learning_rate": 7.659302390877858e-06, + "loss": 0.3901, + "step": 3513 + }, + { + "epoch": 0.34199513381995134, + "grad_norm": 1.5262547193449572, + "learning_rate": 7.657967520069253e-06, + "loss": 0.5142, + "step": 3514 + }, + { + "epoch": 0.3420924574209246, + "grad_norm": 1.419026298338398, + "learning_rate": 7.65663238513678e-06, + "loss": 0.4604, + "step": 3515 + }, + { + "epoch": 0.3421897810218978, + "grad_norm": 1.354197849012477, + "learning_rate": 7.655296986213114e-06, + "loss": 0.3741, + "step": 3516 + }, + { + "epoch": 0.34228710462287104, + "grad_norm": 1.3675444218814583, + "learning_rate": 7.653961323430954e-06, + "loss": 0.4636, + "step": 3517 + }, + { + "epoch": 0.3423844282238443, + "grad_norm": 1.6737094690203054, + "learning_rate": 7.652625396923027e-06, + "loss": 0.5368, + "step": 3518 + }, + { + "epoch": 0.3424817518248175, + "grad_norm": 1.5346015529843957, + "learning_rate": 7.651289206822084e-06, + "loss": 0.5808, + "step": 3519 + }, + { + "epoch": 0.34257907542579075, + "grad_norm": 1.7677803050375525, + "learning_rate": 7.649952753260901e-06, + "loss": 0.4331, + "step": 3520 + }, + { + "epoch": 0.342676399026764, + "grad_norm": 1.5130877149006923, + "learning_rate": 7.648616036372288e-06, + "loss": 0.4343, + "step": 3521 + }, + { + "epoch": 0.34277372262773725, + "grad_norm": 1.6505485143132894, + "learning_rate": 7.647279056289068e-06, + "loss": 0.4256, + "step": 3522 + }, + { + "epoch": 0.34287104622871045, + "grad_norm": 1.1213144756308453, + "learning_rate": 7.6459418131441e-06, + "loss": 0.248, + "step": 3523 + }, + { + "epoch": 0.3429683698296837, + "grad_norm": 1.4978011226878554, + "learning_rate": 7.64460430707027e-06, + "loss": 0.4457, + "step": 3524 + }, + { + "epoch": 0.34306569343065696, + "grad_norm": 1.724927585574558, + "learning_rate": 7.643266538200485e-06, + "loss": 0.5746, + "step": 3525 + }, + { + "epoch": 0.34316301703163016, + "grad_norm": 1.4218186587395187, + "learning_rate": 7.641928506667677e-06, + "loss": 0.4628, + "step": 3526 + }, + { + "epoch": 0.3432603406326034, + "grad_norm": 1.1979163238967183, + "learning_rate": 7.640590212604813e-06, + "loss": 0.3276, + "step": 3527 + }, + { + "epoch": 0.34335766423357666, + "grad_norm": 1.5459257353579257, + "learning_rate": 7.639251656144873e-06, + "loss": 0.5543, + "step": 3528 + }, + { + "epoch": 0.34345498783454986, + "grad_norm": 2.168708523840157, + "learning_rate": 7.637912837420876e-06, + "loss": 0.4451, + "step": 3529 + }, + { + "epoch": 0.3435523114355231, + "grad_norm": 1.3295572972455665, + "learning_rate": 7.63657375656586e-06, + "loss": 0.3659, + "step": 3530 + }, + { + "epoch": 0.34364963503649637, + "grad_norm": 1.3940981237720802, + "learning_rate": 7.635234413712886e-06, + "loss": 0.3305, + "step": 3531 + }, + { + "epoch": 0.34374695863746957, + "grad_norm": 1.668696796269181, + "learning_rate": 7.63389480899505e-06, + "loss": 0.262, + "step": 3532 + }, + { + "epoch": 0.3438442822384428, + "grad_norm": 1.3235762846506705, + "learning_rate": 7.632554942545468e-06, + "loss": 0.354, + "step": 3533 + }, + { + "epoch": 0.3439416058394161, + "grad_norm": 1.774595647074867, + "learning_rate": 7.631214814497283e-06, + "loss": 0.4181, + "step": 3534 + }, + { + "epoch": 0.3440389294403893, + "grad_norm": 1.3699960689070612, + "learning_rate": 7.629874424983664e-06, + "loss": 0.4893, + "step": 3535 + }, + { + "epoch": 0.3441362530413625, + "grad_norm": 1.0054517790481798, + "learning_rate": 7.628533774137809e-06, + "loss": 0.2678, + "step": 3536 + }, + { + "epoch": 0.3442335766423358, + "grad_norm": 1.2585804176689974, + "learning_rate": 7.627192862092936e-06, + "loss": 0.3145, + "step": 3537 + }, + { + "epoch": 0.344330900243309, + "grad_norm": 1.4486458769434574, + "learning_rate": 7.625851688982293e-06, + "loss": 0.5018, + "step": 3538 + }, + { + "epoch": 0.34442822384428223, + "grad_norm": 1.6761010493950546, + "learning_rate": 7.624510254939155e-06, + "loss": 0.5786, + "step": 3539 + }, + { + "epoch": 0.3445255474452555, + "grad_norm": 1.1326870552848924, + "learning_rate": 7.623168560096819e-06, + "loss": 0.2714, + "step": 3540 + }, + { + "epoch": 0.3446228710462287, + "grad_norm": 1.380514271735255, + "learning_rate": 7.62182660458861e-06, + "loss": 0.4435, + "step": 3541 + }, + { + "epoch": 0.34472019464720194, + "grad_norm": 1.3847938610044406, + "learning_rate": 7.620484388547881e-06, + "loss": 0.456, + "step": 3542 + }, + { + "epoch": 0.3448175182481752, + "grad_norm": 1.4774572497281164, + "learning_rate": 7.619141912108008e-06, + "loss": 0.4016, + "step": 3543 + }, + { + "epoch": 0.34491484184914845, + "grad_norm": 1.4155087320104913, + "learning_rate": 7.617799175402392e-06, + "loss": 0.4672, + "step": 3544 + }, + { + "epoch": 0.34501216545012164, + "grad_norm": 1.2347049030843364, + "learning_rate": 7.616456178564463e-06, + "loss": 0.4701, + "step": 3545 + }, + { + "epoch": 0.3451094890510949, + "grad_norm": 1.3617965352115597, + "learning_rate": 7.615112921727677e-06, + "loss": 0.4411, + "step": 3546 + }, + { + "epoch": 0.34520681265206815, + "grad_norm": 1.5080403368283026, + "learning_rate": 7.613769405025511e-06, + "loss": 0.446, + "step": 3547 + }, + { + "epoch": 0.34530413625304135, + "grad_norm": 1.3587460871748047, + "learning_rate": 7.612425628591473e-06, + "loss": 0.4618, + "step": 3548 + }, + { + "epoch": 0.3454014598540146, + "grad_norm": 1.5594894397765757, + "learning_rate": 7.611081592559095e-06, + "loss": 0.6454, + "step": 3549 + }, + { + "epoch": 0.34549878345498786, + "grad_norm": 1.5472776871393785, + "learning_rate": 7.609737297061934e-06, + "loss": 0.4209, + "step": 3550 + }, + { + "epoch": 0.34559610705596105, + "grad_norm": 1.1387925779252865, + "learning_rate": 7.608392742233573e-06, + "loss": 0.2542, + "step": 3551 + }, + { + "epoch": 0.3456934306569343, + "grad_norm": 2.1308610533262753, + "learning_rate": 7.6070479282076226e-06, + "loss": 0.4232, + "step": 3552 + }, + { + "epoch": 0.34579075425790756, + "grad_norm": 1.655772230226882, + "learning_rate": 7.605702855117717e-06, + "loss": 0.316, + "step": 3553 + }, + { + "epoch": 0.34588807785888076, + "grad_norm": 1.2773133253854652, + "learning_rate": 7.604357523097518e-06, + "loss": 0.3933, + "step": 3554 + }, + { + "epoch": 0.345985401459854, + "grad_norm": 1.5157476945746606, + "learning_rate": 7.6030119322807105e-06, + "loss": 0.4895, + "step": 3555 + }, + { + "epoch": 0.34608272506082727, + "grad_norm": 1.5384836415390328, + "learning_rate": 7.601666082801007e-06, + "loss": 0.4571, + "step": 3556 + }, + { + "epoch": 0.34618004866180047, + "grad_norm": 1.3494696317056387, + "learning_rate": 7.600319974792145e-06, + "loss": 0.3585, + "step": 3557 + }, + { + "epoch": 0.3462773722627737, + "grad_norm": 1.5639692646479821, + "learning_rate": 7.59897360838789e-06, + "loss": 0.4913, + "step": 3558 + }, + { + "epoch": 0.346374695863747, + "grad_norm": 1.5771201683321747, + "learning_rate": 7.59762698372203e-06, + "loss": 0.644, + "step": 3559 + }, + { + "epoch": 0.34647201946472017, + "grad_norm": 1.445755844165086, + "learning_rate": 7.596280100928379e-06, + "loss": 0.4662, + "step": 3560 + }, + { + "epoch": 0.3465693430656934, + "grad_norm": 1.0169653044814775, + "learning_rate": 7.59493296014078e-06, + "loss": 0.1873, + "step": 3561 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 1.3573861316393436, + "learning_rate": 7.593585561493098e-06, + "loss": 0.3621, + "step": 3562 + }, + { + "epoch": 0.3467639902676399, + "grad_norm": 1.3679634725925294, + "learning_rate": 7.592237905119224e-06, + "loss": 0.3714, + "step": 3563 + }, + { + "epoch": 0.34686131386861313, + "grad_norm": 1.6153262572349345, + "learning_rate": 7.590889991153076e-06, + "loss": 0.1934, + "step": 3564 + }, + { + "epoch": 0.3469586374695864, + "grad_norm": 1.3501706757209104, + "learning_rate": 7.589541819728597e-06, + "loss": 0.3771, + "step": 3565 + }, + { + "epoch": 0.34705596107055964, + "grad_norm": 1.4359834383485004, + "learning_rate": 7.588193390979756e-06, + "loss": 0.4021, + "step": 3566 + }, + { + "epoch": 0.34715328467153284, + "grad_norm": 1.6178918309073458, + "learning_rate": 7.5868447050405456e-06, + "loss": 0.5326, + "step": 3567 + }, + { + "epoch": 0.3472506082725061, + "grad_norm": 1.2161304982080974, + "learning_rate": 7.585495762044989e-06, + "loss": 0.3215, + "step": 3568 + }, + { + "epoch": 0.34734793187347934, + "grad_norm": 1.2514780842366786, + "learning_rate": 7.584146562127128e-06, + "loss": 0.2619, + "step": 3569 + }, + { + "epoch": 0.34744525547445254, + "grad_norm": 1.292347877079837, + "learning_rate": 7.5827971054210334e-06, + "loss": 0.3722, + "step": 3570 + }, + { + "epoch": 0.3475425790754258, + "grad_norm": 1.428500991174703, + "learning_rate": 7.581447392060806e-06, + "loss": 0.4681, + "step": 3571 + }, + { + "epoch": 0.34763990267639905, + "grad_norm": 1.6187169310973553, + "learning_rate": 7.5800974221805635e-06, + "loss": 0.3123, + "step": 3572 + }, + { + "epoch": 0.34773722627737225, + "grad_norm": 1.2828962403261588, + "learning_rate": 7.5787471959144535e-06, + "loss": 0.3426, + "step": 3573 + }, + { + "epoch": 0.3478345498783455, + "grad_norm": 1.3424794600068044, + "learning_rate": 7.577396713396649e-06, + "loss": 0.2749, + "step": 3574 + }, + { + "epoch": 0.34793187347931875, + "grad_norm": 1.311429995391481, + "learning_rate": 7.576045974761352e-06, + "loss": 0.1912, + "step": 3575 + }, + { + "epoch": 0.34802919708029195, + "grad_norm": 1.9475934853598964, + "learning_rate": 7.57469498014278e-06, + "loss": 0.3568, + "step": 3576 + }, + { + "epoch": 0.3481265206812652, + "grad_norm": 1.5712590946732445, + "learning_rate": 7.573343729675187e-06, + "loss": 0.4361, + "step": 3577 + }, + { + "epoch": 0.34822384428223846, + "grad_norm": 1.3460143771601136, + "learning_rate": 7.5719922234928435e-06, + "loss": 0.3258, + "step": 3578 + }, + { + "epoch": 0.34832116788321166, + "grad_norm": 1.3457032135851295, + "learning_rate": 7.5706404617300544e-06, + "loss": 0.2679, + "step": 3579 + }, + { + "epoch": 0.3484184914841849, + "grad_norm": 1.602637400100207, + "learning_rate": 7.569288444521141e-06, + "loss": 0.4349, + "step": 3580 + }, + { + "epoch": 0.34851581508515816, + "grad_norm": 1.193898879748489, + "learning_rate": 7.567936172000456e-06, + "loss": 0.3823, + "step": 3581 + }, + { + "epoch": 0.34861313868613136, + "grad_norm": 1.7213934943419156, + "learning_rate": 7.5665836443023764e-06, + "loss": 0.4477, + "step": 3582 + }, + { + "epoch": 0.3487104622871046, + "grad_norm": 2.0896889317289262, + "learning_rate": 7.5652308615613025e-06, + "loss": 0.548, + "step": 3583 + }, + { + "epoch": 0.34880778588807787, + "grad_norm": 1.2133584709672542, + "learning_rate": 7.563877823911661e-06, + "loss": 0.3342, + "step": 3584 + }, + { + "epoch": 0.34890510948905107, + "grad_norm": 1.3518803834565474, + "learning_rate": 7.562524531487902e-06, + "loss": 0.4428, + "step": 3585 + }, + { + "epoch": 0.3490024330900243, + "grad_norm": 1.5646028482749865, + "learning_rate": 7.561170984424509e-06, + "loss": 0.4805, + "step": 3586 + }, + { + "epoch": 0.3490997566909976, + "grad_norm": 1.725109749219042, + "learning_rate": 7.5598171828559775e-06, + "loss": 0.3953, + "step": 3587 + }, + { + "epoch": 0.34919708029197083, + "grad_norm": 1.6751385503374157, + "learning_rate": 7.558463126916842e-06, + "loss": 0.3466, + "step": 3588 + }, + { + "epoch": 0.349294403892944, + "grad_norm": 1.577501457080682, + "learning_rate": 7.557108816741651e-06, + "loss": 0.387, + "step": 3589 + }, + { + "epoch": 0.3493917274939173, + "grad_norm": 1.5319961753926956, + "learning_rate": 7.5557542524649866e-06, + "loss": 0.2916, + "step": 3590 + }, + { + "epoch": 0.34948905109489053, + "grad_norm": 2.5209551905243845, + "learning_rate": 7.554399434221449e-06, + "loss": 0.2941, + "step": 3591 + }, + { + "epoch": 0.34958637469586373, + "grad_norm": 1.1980216546975295, + "learning_rate": 7.553044362145672e-06, + "loss": 0.3867, + "step": 3592 + }, + { + "epoch": 0.349683698296837, + "grad_norm": 1.425370917650529, + "learning_rate": 7.551689036372306e-06, + "loss": 0.4788, + "step": 3593 + }, + { + "epoch": 0.34978102189781024, + "grad_norm": 1.5504040775883203, + "learning_rate": 7.550333457036032e-06, + "loss": 0.5355, + "step": 3594 + }, + { + "epoch": 0.34987834549878344, + "grad_norm": 1.253372887044746, + "learning_rate": 7.5489776242715564e-06, + "loss": 0.4783, + "step": 3595 + }, + { + "epoch": 0.3499756690997567, + "grad_norm": 1.4985868632126829, + "learning_rate": 7.547621538213607e-06, + "loss": 0.4225, + "step": 3596 + }, + { + "epoch": 0.35007299270072995, + "grad_norm": 1.4435970539146459, + "learning_rate": 7.5462651989969385e-06, + "loss": 0.3745, + "step": 3597 + }, + { + "epoch": 0.35017031630170314, + "grad_norm": 1.3981266208958172, + "learning_rate": 7.5449086067563314e-06, + "loss": 0.3456, + "step": 3598 + }, + { + "epoch": 0.3502676399026764, + "grad_norm": 1.8315759023798575, + "learning_rate": 7.543551761626594e-06, + "loss": 0.5542, + "step": 3599 + }, + { + "epoch": 0.35036496350364965, + "grad_norm": 1.0610077420382762, + "learning_rate": 7.542194663742553e-06, + "loss": 0.3009, + "step": 3600 + }, + { + "epoch": 0.35046228710462285, + "grad_norm": 1.4599599576710758, + "learning_rate": 7.5408373132390674e-06, + "loss": 0.3322, + "step": 3601 + }, + { + "epoch": 0.3505596107055961, + "grad_norm": 1.357290752856556, + "learning_rate": 7.539479710251014e-06, + "loss": 0.4219, + "step": 3602 + }, + { + "epoch": 0.35065693430656936, + "grad_norm": 1.3466674715612543, + "learning_rate": 7.538121854913303e-06, + "loss": 0.4688, + "step": 3603 + }, + { + "epoch": 0.35075425790754255, + "grad_norm": 1.9207221814789595, + "learning_rate": 7.536763747360863e-06, + "loss": 0.5269, + "step": 3604 + }, + { + "epoch": 0.3508515815085158, + "grad_norm": 1.4616085693980927, + "learning_rate": 7.535405387728649e-06, + "loss": 0.5216, + "step": 3605 + } + ], + "logging_steps": 1.0, + "max_steps": 10275, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 515, + "total_flos": 461160819605504.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}