diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,74844 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 10686, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00028074115665356543, + "grad_norm": 6.073578357696533, + "learning_rate": 9.354536950420954e-09, + "loss": 0.8028, + "step": 1 + }, + { + "epoch": 0.0005614823133071309, + "grad_norm": 5.649418354034424, + "learning_rate": 1.870907390084191e-08, + "loss": 0.7859, + "step": 2 + }, + { + "epoch": 0.0008422234699606962, + "grad_norm": 5.829460620880127, + "learning_rate": 2.806361085126286e-08, + "loss": 0.8459, + "step": 3 + }, + { + "epoch": 0.0011229646266142617, + "grad_norm": 5.927548408508301, + "learning_rate": 3.741814780168382e-08, + "loss": 0.8519, + "step": 4 + }, + { + "epoch": 0.001403705783267827, + "grad_norm": 6.04041862487793, + "learning_rate": 4.677268475210477e-08, + "loss": 0.8199, + "step": 5 + }, + { + "epoch": 0.0016844469399213925, + "grad_norm": 5.618837833404541, + "learning_rate": 5.612722170252572e-08, + "loss": 0.834, + "step": 6 + }, + { + "epoch": 0.0019651880965749578, + "grad_norm": 5.914753437042236, + "learning_rate": 6.548175865294669e-08, + "loss": 0.8431, + "step": 7 + }, + { + "epoch": 0.0022459292532285235, + "grad_norm": 6.155481815338135, + "learning_rate": 7.483629560336764e-08, + "loss": 0.8905, + "step": 8 + }, + { + "epoch": 0.0025266704098820887, + "grad_norm": 6.104146957397461, + "learning_rate": 8.419083255378861e-08, + "loss": 0.8435, + "step": 9 + }, + { + "epoch": 0.002807411566535654, + "grad_norm": 6.155229091644287, + "learning_rate": 9.354536950420954e-08, + "loss": 0.8149, + "step": 10 + }, + { + "epoch": 0.0030881527231892197, + "grad_norm": 5.91041374206543, + "learning_rate": 1.0289990645463051e-07, + "loss": 0.8558, + "step": 11 + }, + { + "epoch": 0.003368893879842785, + "grad_norm": 5.734512805938721, + "learning_rate": 1.1225444340505145e-07, + "loss": 0.853, + "step": 12 + }, + { + "epoch": 0.0036496350364963502, + "grad_norm": 5.7818145751953125, + "learning_rate": 1.216089803554724e-07, + "loss": 0.8389, + "step": 13 + }, + { + "epoch": 0.0039303761931499155, + "grad_norm": 6.0573272705078125, + "learning_rate": 1.3096351730589338e-07, + "loss": 0.8861, + "step": 14 + }, + { + "epoch": 0.004211117349803481, + "grad_norm": 6.075229644775391, + "learning_rate": 1.4031805425631432e-07, + "loss": 0.8324, + "step": 15 + }, + { + "epoch": 0.004491858506457047, + "grad_norm": 6.1600661277771, + "learning_rate": 1.4967259120673527e-07, + "loss": 0.8764, + "step": 16 + }, + { + "epoch": 0.004772599663110612, + "grad_norm": 6.097347736358643, + "learning_rate": 1.5902712815715624e-07, + "loss": 0.9159, + "step": 17 + }, + { + "epoch": 0.0050533408197641775, + "grad_norm": 6.373533725738525, + "learning_rate": 1.6838166510757722e-07, + "loss": 0.9214, + "step": 18 + }, + { + "epoch": 0.005334081976417743, + "grad_norm": 6.170802116394043, + "learning_rate": 1.7773620205799813e-07, + "loss": 0.9023, + "step": 19 + }, + { + "epoch": 0.005614823133071308, + "grad_norm": 5.920821189880371, + "learning_rate": 1.8709073900841908e-07, + "loss": 0.9209, + "step": 20 + }, + { + "epoch": 0.005895564289724873, + "grad_norm": 6.032707214355469, + "learning_rate": 1.9644527595884005e-07, + "loss": 0.871, + "step": 21 + }, + { + "epoch": 0.006176305446378439, + "grad_norm": 5.691333293914795, + "learning_rate": 2.0579981290926103e-07, + "loss": 0.8536, + "step": 22 + }, + { + "epoch": 0.006457046603032005, + "grad_norm": 5.853001594543457, + "learning_rate": 2.1515434985968197e-07, + "loss": 0.8784, + "step": 23 + }, + { + "epoch": 0.00673778775968557, + "grad_norm": 5.614751815795898, + "learning_rate": 2.245088868101029e-07, + "loss": 0.8405, + "step": 24 + }, + { + "epoch": 0.007018528916339135, + "grad_norm": 5.751898288726807, + "learning_rate": 2.3386342376052386e-07, + "loss": 0.8712, + "step": 25 + }, + { + "epoch": 0.0072992700729927005, + "grad_norm": 5.45972204208374, + "learning_rate": 2.432179607109448e-07, + "loss": 0.8191, + "step": 26 + }, + { + "epoch": 0.007580011229646266, + "grad_norm": 5.904362201690674, + "learning_rate": 2.525724976613658e-07, + "loss": 0.8455, + "step": 27 + }, + { + "epoch": 0.007860752386299831, + "grad_norm": 5.9625749588012695, + "learning_rate": 2.6192703461178676e-07, + "loss": 0.9043, + "step": 28 + }, + { + "epoch": 0.008141493542953397, + "grad_norm": 5.246673107147217, + "learning_rate": 2.712815715622077e-07, + "loss": 0.8219, + "step": 29 + }, + { + "epoch": 0.008422234699606962, + "grad_norm": 5.433787822723389, + "learning_rate": 2.8063610851262865e-07, + "loss": 0.8194, + "step": 30 + }, + { + "epoch": 0.008702975856260528, + "grad_norm": 4.702797889709473, + "learning_rate": 2.899906454630496e-07, + "loss": 0.7633, + "step": 31 + }, + { + "epoch": 0.008983717012914094, + "grad_norm": 4.799088001251221, + "learning_rate": 2.9934518241347054e-07, + "loss": 0.8496, + "step": 32 + }, + { + "epoch": 0.009264458169567658, + "grad_norm": 4.480788230895996, + "learning_rate": 3.0869971936389154e-07, + "loss": 0.8413, + "step": 33 + }, + { + "epoch": 0.009545199326221224, + "grad_norm": 4.927966117858887, + "learning_rate": 3.180542563143125e-07, + "loss": 0.787, + "step": 34 + }, + { + "epoch": 0.009825940482874789, + "grad_norm": 4.581888198852539, + "learning_rate": 3.2740879326473343e-07, + "loss": 0.7689, + "step": 35 + }, + { + "epoch": 0.010106681639528355, + "grad_norm": 4.776089668273926, + "learning_rate": 3.3676333021515443e-07, + "loss": 0.8478, + "step": 36 + }, + { + "epoch": 0.010387422796181921, + "grad_norm": 4.4883832931518555, + "learning_rate": 3.461178671655753e-07, + "loss": 0.8311, + "step": 37 + }, + { + "epoch": 0.010668163952835485, + "grad_norm": 4.310268402099609, + "learning_rate": 3.5547240411599627e-07, + "loss": 0.8243, + "step": 38 + }, + { + "epoch": 0.010948905109489052, + "grad_norm": 4.470303535461426, + "learning_rate": 3.648269410664172e-07, + "loss": 0.8333, + "step": 39 + }, + { + "epoch": 0.011229646266142616, + "grad_norm": 4.391443252563477, + "learning_rate": 3.7418147801683816e-07, + "loss": 0.8377, + "step": 40 + }, + { + "epoch": 0.011510387422796182, + "grad_norm": 4.44397497177124, + "learning_rate": 3.8353601496725916e-07, + "loss": 0.8357, + "step": 41 + }, + { + "epoch": 0.011791128579449747, + "grad_norm": 3.539785861968994, + "learning_rate": 3.928905519176801e-07, + "loss": 0.7979, + "step": 42 + }, + { + "epoch": 0.012071869736103313, + "grad_norm": 2.8846137523651123, + "learning_rate": 4.0224508886810105e-07, + "loss": 0.6935, + "step": 43 + }, + { + "epoch": 0.012352610892756879, + "grad_norm": 3.18035626411438, + "learning_rate": 4.1159962581852205e-07, + "loss": 0.8754, + "step": 44 + }, + { + "epoch": 0.012633352049410443, + "grad_norm": 2.7209296226501465, + "learning_rate": 4.20954162768943e-07, + "loss": 0.8195, + "step": 45 + }, + { + "epoch": 0.01291409320606401, + "grad_norm": 2.650001287460327, + "learning_rate": 4.3030869971936394e-07, + "loss": 0.749, + "step": 46 + }, + { + "epoch": 0.013194834362717574, + "grad_norm": 2.4477689266204834, + "learning_rate": 4.396632366697849e-07, + "loss": 0.681, + "step": 47 + }, + { + "epoch": 0.01347557551937114, + "grad_norm": 2.5114424228668213, + "learning_rate": 4.490177736202058e-07, + "loss": 0.7193, + "step": 48 + }, + { + "epoch": 0.013756316676024706, + "grad_norm": 2.5794506072998047, + "learning_rate": 4.583723105706268e-07, + "loss": 0.7745, + "step": 49 + }, + { + "epoch": 0.01403705783267827, + "grad_norm": 2.4139857292175293, + "learning_rate": 4.6772684752104773e-07, + "loss": 0.8092, + "step": 50 + }, + { + "epoch": 0.014317798989331837, + "grad_norm": 2.4023053646087646, + "learning_rate": 4.770813844714687e-07, + "loss": 0.7918, + "step": 51 + }, + { + "epoch": 0.014598540145985401, + "grad_norm": 2.1910533905029297, + "learning_rate": 4.864359214218896e-07, + "loss": 0.7414, + "step": 52 + }, + { + "epoch": 0.014879281302638967, + "grad_norm": 2.130261182785034, + "learning_rate": 4.957904583723106e-07, + "loss": 0.7386, + "step": 53 + }, + { + "epoch": 0.015160022459292532, + "grad_norm": 2.0518133640289307, + "learning_rate": 5.051449953227316e-07, + "loss": 0.745, + "step": 54 + }, + { + "epoch": 0.015440763615946098, + "grad_norm": 1.911180019378662, + "learning_rate": 5.144995322731526e-07, + "loss": 0.7744, + "step": 55 + }, + { + "epoch": 0.015721504772599662, + "grad_norm": 2.1166069507598877, + "learning_rate": 5.238540692235735e-07, + "loss": 0.8067, + "step": 56 + }, + { + "epoch": 0.016002245929253228, + "grad_norm": 1.7378008365631104, + "learning_rate": 5.332086061739945e-07, + "loss": 0.7059, + "step": 57 + }, + { + "epoch": 0.016282987085906794, + "grad_norm": 1.721903920173645, + "learning_rate": 5.425631431244154e-07, + "loss": 0.6848, + "step": 58 + }, + { + "epoch": 0.01656372824256036, + "grad_norm": 1.6297296285629272, + "learning_rate": 5.519176800748364e-07, + "loss": 0.7134, + "step": 59 + }, + { + "epoch": 0.016844469399213923, + "grad_norm": 1.826710820198059, + "learning_rate": 5.612722170252573e-07, + "loss": 0.7292, + "step": 60 + }, + { + "epoch": 0.01712521055586749, + "grad_norm": 1.675954818725586, + "learning_rate": 5.706267539756782e-07, + "loss": 0.6998, + "step": 61 + }, + { + "epoch": 0.017405951712521055, + "grad_norm": 1.860581874847412, + "learning_rate": 5.799812909260992e-07, + "loss": 0.7113, + "step": 62 + }, + { + "epoch": 0.01768669286917462, + "grad_norm": 1.838109016418457, + "learning_rate": 5.893358278765201e-07, + "loss": 0.7165, + "step": 63 + }, + { + "epoch": 0.017967434025828188, + "grad_norm": 1.7727512121200562, + "learning_rate": 5.986903648269411e-07, + "loss": 0.6837, + "step": 64 + }, + { + "epoch": 0.01824817518248175, + "grad_norm": 1.5861293077468872, + "learning_rate": 6.080449017773621e-07, + "loss": 0.6792, + "step": 65 + }, + { + "epoch": 0.018528916339135316, + "grad_norm": 1.6274561882019043, + "learning_rate": 6.173994387277831e-07, + "loss": 0.6929, + "step": 66 + }, + { + "epoch": 0.018809657495788883, + "grad_norm": 1.4121606349945068, + "learning_rate": 6.26753975678204e-07, + "loss": 0.6376, + "step": 67 + }, + { + "epoch": 0.01909039865244245, + "grad_norm": 1.5709538459777832, + "learning_rate": 6.36108512628625e-07, + "loss": 0.7105, + "step": 68 + }, + { + "epoch": 0.019371139809096015, + "grad_norm": 1.6315401792526245, + "learning_rate": 6.454630495790459e-07, + "loss": 0.7196, + "step": 69 + }, + { + "epoch": 0.019651880965749578, + "grad_norm": 1.3795241117477417, + "learning_rate": 6.548175865294669e-07, + "loss": 0.7252, + "step": 70 + }, + { + "epoch": 0.019932622122403144, + "grad_norm": 1.3909331560134888, + "learning_rate": 6.641721234798878e-07, + "loss": 0.7319, + "step": 71 + }, + { + "epoch": 0.02021336327905671, + "grad_norm": 1.493241786956787, + "learning_rate": 6.735266604303089e-07, + "loss": 0.7072, + "step": 72 + }, + { + "epoch": 0.020494104435710276, + "grad_norm": 1.3212029933929443, + "learning_rate": 6.828811973807298e-07, + "loss": 0.7089, + "step": 73 + }, + { + "epoch": 0.020774845592363842, + "grad_norm": 1.329487681388855, + "learning_rate": 6.922357343311506e-07, + "loss": 0.7599, + "step": 74 + }, + { + "epoch": 0.021055586749017405, + "grad_norm": 1.239382028579712, + "learning_rate": 7.015902712815716e-07, + "loss": 0.6636, + "step": 75 + }, + { + "epoch": 0.02133632790567097, + "grad_norm": 1.2681066989898682, + "learning_rate": 7.109448082319925e-07, + "loss": 0.7067, + "step": 76 + }, + { + "epoch": 0.021617069062324537, + "grad_norm": 1.1106700897216797, + "learning_rate": 7.202993451824135e-07, + "loss": 0.6762, + "step": 77 + }, + { + "epoch": 0.021897810218978103, + "grad_norm": 1.2456177473068237, + "learning_rate": 7.296538821328344e-07, + "loss": 0.75, + "step": 78 + }, + { + "epoch": 0.022178551375631666, + "grad_norm": 1.062915563583374, + "learning_rate": 7.390084190832554e-07, + "loss": 0.6481, + "step": 79 + }, + { + "epoch": 0.022459292532285232, + "grad_norm": 1.066072940826416, + "learning_rate": 7.483629560336763e-07, + "loss": 0.7333, + "step": 80 + }, + { + "epoch": 0.022740033688938798, + "grad_norm": 0.975945770740509, + "learning_rate": 7.577174929840974e-07, + "loss": 0.6478, + "step": 81 + }, + { + "epoch": 0.023020774845592364, + "grad_norm": 1.0468389987945557, + "learning_rate": 7.670720299345183e-07, + "loss": 0.6577, + "step": 82 + }, + { + "epoch": 0.02330151600224593, + "grad_norm": 0.9923246502876282, + "learning_rate": 7.764265668849393e-07, + "loss": 0.6219, + "step": 83 + }, + { + "epoch": 0.023582257158899493, + "grad_norm": 1.1474268436431885, + "learning_rate": 7.857811038353602e-07, + "loss": 0.6846, + "step": 84 + }, + { + "epoch": 0.02386299831555306, + "grad_norm": 0.9405164122581482, + "learning_rate": 7.951356407857812e-07, + "loss": 0.6289, + "step": 85 + }, + { + "epoch": 0.024143739472206625, + "grad_norm": 0.9991530179977417, + "learning_rate": 8.044901777362021e-07, + "loss": 0.6363, + "step": 86 + }, + { + "epoch": 0.02442448062886019, + "grad_norm": 1.0833004713058472, + "learning_rate": 8.138447146866231e-07, + "loss": 0.6228, + "step": 87 + }, + { + "epoch": 0.024705221785513758, + "grad_norm": 0.9896429181098938, + "learning_rate": 8.231992516370441e-07, + "loss": 0.6961, + "step": 88 + }, + { + "epoch": 0.02498596294216732, + "grad_norm": 0.9045128226280212, + "learning_rate": 8.32553788587465e-07, + "loss": 0.6423, + "step": 89 + }, + { + "epoch": 0.025266704098820886, + "grad_norm": 1.0337687730789185, + "learning_rate": 8.41908325537886e-07, + "loss": 0.6517, + "step": 90 + }, + { + "epoch": 0.025547445255474453, + "grad_norm": 1.0262173414230347, + "learning_rate": 8.512628624883069e-07, + "loss": 0.6149, + "step": 91 + }, + { + "epoch": 0.02582818641212802, + "grad_norm": 0.8971832990646362, + "learning_rate": 8.606173994387279e-07, + "loss": 0.6895, + "step": 92 + }, + { + "epoch": 0.026108927568781585, + "grad_norm": 1.041611671447754, + "learning_rate": 8.699719363891488e-07, + "loss": 0.6284, + "step": 93 + }, + { + "epoch": 0.026389668725435148, + "grad_norm": 0.9133061170578003, + "learning_rate": 8.793264733395698e-07, + "loss": 0.6183, + "step": 94 + }, + { + "epoch": 0.026670409882088714, + "grad_norm": 0.8937678337097168, + "learning_rate": 8.886810102899906e-07, + "loss": 0.6211, + "step": 95 + }, + { + "epoch": 0.02695115103874228, + "grad_norm": 0.9467660188674927, + "learning_rate": 8.980355472404116e-07, + "loss": 0.6697, + "step": 96 + }, + { + "epoch": 0.027231892195395846, + "grad_norm": 0.9453322887420654, + "learning_rate": 9.073900841908326e-07, + "loss": 0.6369, + "step": 97 + }, + { + "epoch": 0.027512633352049412, + "grad_norm": 0.7970829606056213, + "learning_rate": 9.167446211412536e-07, + "loss": 0.6426, + "step": 98 + }, + { + "epoch": 0.027793374508702975, + "grad_norm": 0.9353342056274414, + "learning_rate": 9.260991580916745e-07, + "loss": 0.6234, + "step": 99 + }, + { + "epoch": 0.02807411566535654, + "grad_norm": 0.782704770565033, + "learning_rate": 9.354536950420955e-07, + "loss": 0.607, + "step": 100 + }, + { + "epoch": 0.028354856822010107, + "grad_norm": 0.8209283351898193, + "learning_rate": 9.448082319925164e-07, + "loss": 0.6219, + "step": 101 + }, + { + "epoch": 0.028635597978663673, + "grad_norm": 0.8043917417526245, + "learning_rate": 9.541627689429374e-07, + "loss": 0.5847, + "step": 102 + }, + { + "epoch": 0.028916339135317236, + "grad_norm": 0.882258951663971, + "learning_rate": 9.635173058933584e-07, + "loss": 0.6425, + "step": 103 + }, + { + "epoch": 0.029197080291970802, + "grad_norm": 0.8027119040489197, + "learning_rate": 9.728718428437792e-07, + "loss": 0.5817, + "step": 104 + }, + { + "epoch": 0.029477821448624368, + "grad_norm": 0.8394242525100708, + "learning_rate": 9.822263797942003e-07, + "loss": 0.6646, + "step": 105 + }, + { + "epoch": 0.029758562605277934, + "grad_norm": 0.7680022120475769, + "learning_rate": 9.915809167446211e-07, + "loss": 0.5845, + "step": 106 + }, + { + "epoch": 0.0300393037619315, + "grad_norm": 0.8337945938110352, + "learning_rate": 1.0009354536950422e-06, + "loss": 0.5888, + "step": 107 + }, + { + "epoch": 0.030320044918585063, + "grad_norm": 0.8383430242538452, + "learning_rate": 1.0102899906454632e-06, + "loss": 0.6502, + "step": 108 + }, + { + "epoch": 0.03060078607523863, + "grad_norm": 0.7696283459663391, + "learning_rate": 1.019644527595884e-06, + "loss": 0.5764, + "step": 109 + }, + { + "epoch": 0.030881527231892195, + "grad_norm": 0.820706307888031, + "learning_rate": 1.0289990645463051e-06, + "loss": 0.5963, + "step": 110 + }, + { + "epoch": 0.03116226838854576, + "grad_norm": 0.8323437571525574, + "learning_rate": 1.038353601496726e-06, + "loss": 0.6005, + "step": 111 + }, + { + "epoch": 0.031443009545199324, + "grad_norm": 0.7819299697875977, + "learning_rate": 1.047708138447147e-06, + "loss": 0.6413, + "step": 112 + }, + { + "epoch": 0.03172375070185289, + "grad_norm": 0.8218362331390381, + "learning_rate": 1.0570626753975679e-06, + "loss": 0.5802, + "step": 113 + }, + { + "epoch": 0.032004491858506456, + "grad_norm": 1.021608829498291, + "learning_rate": 1.066417212347989e-06, + "loss": 0.656, + "step": 114 + }, + { + "epoch": 0.03228523301516002, + "grad_norm": 0.7813796997070312, + "learning_rate": 1.07577174929841e-06, + "loss": 0.6464, + "step": 115 + }, + { + "epoch": 0.03256597417181359, + "grad_norm": 0.7661817669868469, + "learning_rate": 1.0851262862488308e-06, + "loss": 0.6281, + "step": 116 + }, + { + "epoch": 0.032846715328467155, + "grad_norm": 0.7750112414360046, + "learning_rate": 1.0944808231992516e-06, + "loss": 0.5962, + "step": 117 + }, + { + "epoch": 0.03312745648512072, + "grad_norm": 0.8634867072105408, + "learning_rate": 1.1038353601496727e-06, + "loss": 0.5911, + "step": 118 + }, + { + "epoch": 0.03340819764177429, + "grad_norm": 0.7029651999473572, + "learning_rate": 1.1131898971000935e-06, + "loss": 0.5254, + "step": 119 + }, + { + "epoch": 0.033688938798427846, + "grad_norm": 0.7431548833847046, + "learning_rate": 1.1225444340505146e-06, + "loss": 0.5994, + "step": 120 + }, + { + "epoch": 0.03396967995508141, + "grad_norm": 0.7729125618934631, + "learning_rate": 1.1318989710009354e-06, + "loss": 0.591, + "step": 121 + }, + { + "epoch": 0.03425042111173498, + "grad_norm": 0.7862691879272461, + "learning_rate": 1.1412535079513565e-06, + "loss": 0.5886, + "step": 122 + }, + { + "epoch": 0.034531162268388545, + "grad_norm": 0.8104708790779114, + "learning_rate": 1.1506080449017775e-06, + "loss": 0.585, + "step": 123 + }, + { + "epoch": 0.03481190342504211, + "grad_norm": 0.7641741037368774, + "learning_rate": 1.1599625818521984e-06, + "loss": 0.6107, + "step": 124 + }, + { + "epoch": 0.03509264458169568, + "grad_norm": 0.8046885132789612, + "learning_rate": 1.1693171188026194e-06, + "loss": 0.5298, + "step": 125 + }, + { + "epoch": 0.03537338573834924, + "grad_norm": 0.7960103750228882, + "learning_rate": 1.1786716557530403e-06, + "loss": 0.573, + "step": 126 + }, + { + "epoch": 0.03565412689500281, + "grad_norm": 0.8036853671073914, + "learning_rate": 1.1880261927034613e-06, + "loss": 0.6013, + "step": 127 + }, + { + "epoch": 0.035934868051656375, + "grad_norm": 0.877988874912262, + "learning_rate": 1.1973807296538822e-06, + "loss": 0.5879, + "step": 128 + }, + { + "epoch": 0.03621560920830994, + "grad_norm": 0.8558998107910156, + "learning_rate": 1.2067352666043032e-06, + "loss": 0.6272, + "step": 129 + }, + { + "epoch": 0.0364963503649635, + "grad_norm": 0.8760024905204773, + "learning_rate": 1.2160898035547243e-06, + "loss": 0.5597, + "step": 130 + }, + { + "epoch": 0.03677709152161707, + "grad_norm": 0.6800727844238281, + "learning_rate": 1.225444340505145e-06, + "loss": 0.5552, + "step": 131 + }, + { + "epoch": 0.03705783267827063, + "grad_norm": 0.7151376008987427, + "learning_rate": 1.2347988774555662e-06, + "loss": 0.5566, + "step": 132 + }, + { + "epoch": 0.0373385738349242, + "grad_norm": 0.7448155283927917, + "learning_rate": 1.244153414405987e-06, + "loss": 0.5772, + "step": 133 + }, + { + "epoch": 0.037619314991577765, + "grad_norm": 0.7712084650993347, + "learning_rate": 1.253507951356408e-06, + "loss": 0.5934, + "step": 134 + }, + { + "epoch": 0.03790005614823133, + "grad_norm": 0.8580107688903809, + "learning_rate": 1.2628624883068289e-06, + "loss": 0.6066, + "step": 135 + }, + { + "epoch": 0.0381807973048849, + "grad_norm": 0.800658106803894, + "learning_rate": 1.27221702525725e-06, + "loss": 0.5592, + "step": 136 + }, + { + "epoch": 0.038461538461538464, + "grad_norm": 0.7491589784622192, + "learning_rate": 1.2815715622076708e-06, + "loss": 0.5398, + "step": 137 + }, + { + "epoch": 0.03874227961819203, + "grad_norm": 0.7043203711509705, + "learning_rate": 1.2909260991580918e-06, + "loss": 0.5834, + "step": 138 + }, + { + "epoch": 0.03902302077484559, + "grad_norm": 0.9559151530265808, + "learning_rate": 1.3002806361085127e-06, + "loss": 0.6397, + "step": 139 + }, + { + "epoch": 0.039303761931499155, + "grad_norm": 0.8577326536178589, + "learning_rate": 1.3096351730589337e-06, + "loss": 0.5862, + "step": 140 + }, + { + "epoch": 0.03958450308815272, + "grad_norm": 0.9272957444190979, + "learning_rate": 1.3189897100093546e-06, + "loss": 0.6603, + "step": 141 + }, + { + "epoch": 0.03986524424480629, + "grad_norm": 0.7770213484764099, + "learning_rate": 1.3283442469597756e-06, + "loss": 0.5693, + "step": 142 + }, + { + "epoch": 0.040145985401459854, + "grad_norm": 0.7320612072944641, + "learning_rate": 1.3376987839101965e-06, + "loss": 0.537, + "step": 143 + }, + { + "epoch": 0.04042672655811342, + "grad_norm": 0.8152422904968262, + "learning_rate": 1.3470533208606177e-06, + "loss": 0.5794, + "step": 144 + }, + { + "epoch": 0.040707467714766986, + "grad_norm": 0.743893563747406, + "learning_rate": 1.3564078578110384e-06, + "loss": 0.5789, + "step": 145 + }, + { + "epoch": 0.04098820887142055, + "grad_norm": 0.7537864446640015, + "learning_rate": 1.3657623947614596e-06, + "loss": 0.602, + "step": 146 + }, + { + "epoch": 0.04126895002807412, + "grad_norm": 0.7398315072059631, + "learning_rate": 1.3751169317118805e-06, + "loss": 0.5623, + "step": 147 + }, + { + "epoch": 0.041549691184727684, + "grad_norm": 0.866960346698761, + "learning_rate": 1.3844714686623013e-06, + "loss": 0.5189, + "step": 148 + }, + { + "epoch": 0.04183043234138124, + "grad_norm": 0.7426940202713013, + "learning_rate": 1.3938260056127223e-06, + "loss": 0.5586, + "step": 149 + }, + { + "epoch": 0.04211117349803481, + "grad_norm": 0.8084462285041809, + "learning_rate": 1.4031805425631432e-06, + "loss": 0.5317, + "step": 150 + }, + { + "epoch": 0.042391914654688376, + "grad_norm": 0.7611818313598633, + "learning_rate": 1.4125350795135642e-06, + "loss": 0.5646, + "step": 151 + }, + { + "epoch": 0.04267265581134194, + "grad_norm": 0.74317467212677, + "learning_rate": 1.421889616463985e-06, + "loss": 0.5743, + "step": 152 + }, + { + "epoch": 0.04295339696799551, + "grad_norm": 0.6734655499458313, + "learning_rate": 1.4312441534144061e-06, + "loss": 0.5806, + "step": 153 + }, + { + "epoch": 0.043234138124649074, + "grad_norm": 0.688694953918457, + "learning_rate": 1.440598690364827e-06, + "loss": 0.5818, + "step": 154 + }, + { + "epoch": 0.04351487928130264, + "grad_norm": 0.7829697728157043, + "learning_rate": 1.449953227315248e-06, + "loss": 0.5194, + "step": 155 + }, + { + "epoch": 0.043795620437956206, + "grad_norm": 0.9685561060905457, + "learning_rate": 1.4593077642656689e-06, + "loss": 0.5782, + "step": 156 + }, + { + "epoch": 0.04407636159460977, + "grad_norm": 0.8144839406013489, + "learning_rate": 1.46866230121609e-06, + "loss": 0.5486, + "step": 157 + }, + { + "epoch": 0.04435710275126333, + "grad_norm": 0.835292398929596, + "learning_rate": 1.4780168381665108e-06, + "loss": 0.6424, + "step": 158 + }, + { + "epoch": 0.0446378439079169, + "grad_norm": 0.8725515007972717, + "learning_rate": 1.487371375116932e-06, + "loss": 0.5862, + "step": 159 + }, + { + "epoch": 0.044918585064570464, + "grad_norm": 0.7808248400688171, + "learning_rate": 1.4967259120673526e-06, + "loss": 0.5901, + "step": 160 + }, + { + "epoch": 0.04519932622122403, + "grad_norm": 0.7102718949317932, + "learning_rate": 1.506080449017774e-06, + "loss": 0.5738, + "step": 161 + }, + { + "epoch": 0.045480067377877596, + "grad_norm": 0.7492978572845459, + "learning_rate": 1.5154349859681948e-06, + "loss": 0.5963, + "step": 162 + }, + { + "epoch": 0.04576080853453116, + "grad_norm": 0.7842692732810974, + "learning_rate": 1.5247895229186158e-06, + "loss": 0.6135, + "step": 163 + }, + { + "epoch": 0.04604154969118473, + "grad_norm": 0.731016218662262, + "learning_rate": 1.5341440598690366e-06, + "loss": 0.5738, + "step": 164 + }, + { + "epoch": 0.046322290847838295, + "grad_norm": 0.7418588995933533, + "learning_rate": 1.5434985968194577e-06, + "loss": 0.5433, + "step": 165 + }, + { + "epoch": 0.04660303200449186, + "grad_norm": 0.7768568396568298, + "learning_rate": 1.5528531337698785e-06, + "loss": 0.5264, + "step": 166 + }, + { + "epoch": 0.04688377316114543, + "grad_norm": 0.7436287999153137, + "learning_rate": 1.5622076707202996e-06, + "loss": 0.5662, + "step": 167 + }, + { + "epoch": 0.047164514317798986, + "grad_norm": 0.7058064341545105, + "learning_rate": 1.5715622076707204e-06, + "loss": 0.5687, + "step": 168 + }, + { + "epoch": 0.04744525547445255, + "grad_norm": 0.7297882437705994, + "learning_rate": 1.5809167446211413e-06, + "loss": 0.5808, + "step": 169 + }, + { + "epoch": 0.04772599663110612, + "grad_norm": 0.6743118166923523, + "learning_rate": 1.5902712815715623e-06, + "loss": 0.5539, + "step": 170 + }, + { + "epoch": 0.048006737787759685, + "grad_norm": 0.7409380078315735, + "learning_rate": 1.5996258185219832e-06, + "loss": 0.6305, + "step": 171 + }, + { + "epoch": 0.04828747894441325, + "grad_norm": 0.9666582942008972, + "learning_rate": 1.6089803554724042e-06, + "loss": 0.6139, + "step": 172 + }, + { + "epoch": 0.04856822010106682, + "grad_norm": 0.7424812912940979, + "learning_rate": 1.618334892422825e-06, + "loss": 0.537, + "step": 173 + }, + { + "epoch": 0.04884896125772038, + "grad_norm": 0.7016400098800659, + "learning_rate": 1.6276894293732461e-06, + "loss": 0.5316, + "step": 174 + }, + { + "epoch": 0.04912970241437395, + "grad_norm": 0.7044817805290222, + "learning_rate": 1.637043966323667e-06, + "loss": 0.5476, + "step": 175 + }, + { + "epoch": 0.049410443571027515, + "grad_norm": 0.8542489409446716, + "learning_rate": 1.6463985032740882e-06, + "loss": 0.5406, + "step": 176 + }, + { + "epoch": 0.04969118472768108, + "grad_norm": 0.7255383729934692, + "learning_rate": 1.655753040224509e-06, + "loss": 0.4932, + "step": 177 + }, + { + "epoch": 0.04997192588433464, + "grad_norm": 0.8024505376815796, + "learning_rate": 1.66510757717493e-06, + "loss": 0.6012, + "step": 178 + }, + { + "epoch": 0.05025266704098821, + "grad_norm": 0.7083516716957092, + "learning_rate": 1.674462114125351e-06, + "loss": 0.5535, + "step": 179 + }, + { + "epoch": 0.05053340819764177, + "grad_norm": 0.8453170657157898, + "learning_rate": 1.683816651075772e-06, + "loss": 0.5506, + "step": 180 + }, + { + "epoch": 0.05081414935429534, + "grad_norm": 0.742327094078064, + "learning_rate": 1.6931711880261928e-06, + "loss": 0.5545, + "step": 181 + }, + { + "epoch": 0.051094890510948905, + "grad_norm": 0.7339834570884705, + "learning_rate": 1.7025257249766139e-06, + "loss": 0.5283, + "step": 182 + }, + { + "epoch": 0.05137563166760247, + "grad_norm": 0.8246902823448181, + "learning_rate": 1.7118802619270347e-06, + "loss": 0.593, + "step": 183 + }, + { + "epoch": 0.05165637282425604, + "grad_norm": 0.7709143161773682, + "learning_rate": 1.7212347988774558e-06, + "loss": 0.507, + "step": 184 + }, + { + "epoch": 0.051937113980909604, + "grad_norm": 0.8270139694213867, + "learning_rate": 1.7305893358278766e-06, + "loss": 0.5726, + "step": 185 + }, + { + "epoch": 0.05221785513756317, + "grad_norm": 0.982842206954956, + "learning_rate": 1.7399438727782977e-06, + "loss": 0.5737, + "step": 186 + }, + { + "epoch": 0.05249859629421673, + "grad_norm": 0.6899937987327576, + "learning_rate": 1.7492984097287185e-06, + "loss": 0.5721, + "step": 187 + }, + { + "epoch": 0.052779337450870295, + "grad_norm": 0.8084338307380676, + "learning_rate": 1.7586529466791396e-06, + "loss": 0.5704, + "step": 188 + }, + { + "epoch": 0.05306007860752386, + "grad_norm": 0.8003899455070496, + "learning_rate": 1.7680074836295604e-06, + "loss": 0.5308, + "step": 189 + }, + { + "epoch": 0.05334081976417743, + "grad_norm": 0.9019588232040405, + "learning_rate": 1.7773620205799812e-06, + "loss": 0.5492, + "step": 190 + }, + { + "epoch": 0.05362156092083099, + "grad_norm": 0.865822434425354, + "learning_rate": 1.7867165575304025e-06, + "loss": 0.5138, + "step": 191 + }, + { + "epoch": 0.05390230207748456, + "grad_norm": 0.8150571584701538, + "learning_rate": 1.7960710944808231e-06, + "loss": 0.5333, + "step": 192 + }, + { + "epoch": 0.054183043234138126, + "grad_norm": 0.8458138704299927, + "learning_rate": 1.8054256314312444e-06, + "loss": 0.549, + "step": 193 + }, + { + "epoch": 0.05446378439079169, + "grad_norm": 0.7980986833572388, + "learning_rate": 1.8147801683816652e-06, + "loss": 0.5312, + "step": 194 + }, + { + "epoch": 0.05474452554744526, + "grad_norm": 0.8078594207763672, + "learning_rate": 1.8241347053320863e-06, + "loss": 0.5506, + "step": 195 + }, + { + "epoch": 0.055025266704098824, + "grad_norm": 0.8306414484977722, + "learning_rate": 1.8334892422825071e-06, + "loss": 0.5478, + "step": 196 + }, + { + "epoch": 0.05530600786075238, + "grad_norm": 0.748626708984375, + "learning_rate": 1.8428437792329282e-06, + "loss": 0.5562, + "step": 197 + }, + { + "epoch": 0.05558674901740595, + "grad_norm": 0.774010181427002, + "learning_rate": 1.852198316183349e-06, + "loss": 0.5638, + "step": 198 + }, + { + "epoch": 0.055867490174059516, + "grad_norm": 0.7905158996582031, + "learning_rate": 1.86155285313377e-06, + "loss": 0.5299, + "step": 199 + }, + { + "epoch": 0.05614823133071308, + "grad_norm": 0.7886589169502258, + "learning_rate": 1.870907390084191e-06, + "loss": 0.571, + "step": 200 + }, + { + "epoch": 0.05642897248736665, + "grad_norm": 0.805158257484436, + "learning_rate": 1.880261927034612e-06, + "loss": 0.525, + "step": 201 + }, + { + "epoch": 0.056709713644020214, + "grad_norm": 0.6530075073242188, + "learning_rate": 1.8896164639850328e-06, + "loss": 0.5196, + "step": 202 + }, + { + "epoch": 0.05699045480067378, + "grad_norm": 0.7828499674797058, + "learning_rate": 1.8989710009354539e-06, + "loss": 0.5465, + "step": 203 + }, + { + "epoch": 0.057271195957327346, + "grad_norm": 1.0039658546447754, + "learning_rate": 1.9083255378858747e-06, + "loss": 0.5339, + "step": 204 + }, + { + "epoch": 0.05755193711398091, + "grad_norm": 1.0169117450714111, + "learning_rate": 1.9176800748362958e-06, + "loss": 0.572, + "step": 205 + }, + { + "epoch": 0.05783267827063447, + "grad_norm": 0.8177130222320557, + "learning_rate": 1.927034611786717e-06, + "loss": 0.5335, + "step": 206 + }, + { + "epoch": 0.05811341942728804, + "grad_norm": 0.6351256966590881, + "learning_rate": 1.936389148737138e-06, + "loss": 0.5451, + "step": 207 + }, + { + "epoch": 0.058394160583941604, + "grad_norm": 0.7631227374076843, + "learning_rate": 1.9457436856875585e-06, + "loss": 0.5436, + "step": 208 + }, + { + "epoch": 0.05867490174059517, + "grad_norm": 0.7937856912612915, + "learning_rate": 1.9550982226379795e-06, + "loss": 0.5426, + "step": 209 + }, + { + "epoch": 0.058955642897248736, + "grad_norm": 0.768334150314331, + "learning_rate": 1.9644527595884006e-06, + "loss": 0.5088, + "step": 210 + }, + { + "epoch": 0.0592363840539023, + "grad_norm": 0.8432625532150269, + "learning_rate": 1.9738072965388212e-06, + "loss": 0.5384, + "step": 211 + }, + { + "epoch": 0.05951712521055587, + "grad_norm": 0.798975944519043, + "learning_rate": 1.9831618334892423e-06, + "loss": 0.5788, + "step": 212 + }, + { + "epoch": 0.059797866367209435, + "grad_norm": 0.7429670691490173, + "learning_rate": 1.9925163704396633e-06, + "loss": 0.5478, + "step": 213 + }, + { + "epoch": 0.060078607523863, + "grad_norm": 0.8425019979476929, + "learning_rate": 2.0018709073900844e-06, + "loss": 0.5571, + "step": 214 + }, + { + "epoch": 0.06035934868051657, + "grad_norm": 0.777231752872467, + "learning_rate": 2.011225444340505e-06, + "loss": 0.5368, + "step": 215 + }, + { + "epoch": 0.060640089837170126, + "grad_norm": 0.8038666844367981, + "learning_rate": 2.0205799812909265e-06, + "loss": 0.5585, + "step": 216 + }, + { + "epoch": 0.06092083099382369, + "grad_norm": 0.6847572922706604, + "learning_rate": 2.029934518241347e-06, + "loss": 0.5692, + "step": 217 + }, + { + "epoch": 0.06120157215047726, + "grad_norm": 0.6985204815864563, + "learning_rate": 2.039289055191768e-06, + "loss": 0.5254, + "step": 218 + }, + { + "epoch": 0.061482313307130824, + "grad_norm": 0.7749029397964478, + "learning_rate": 2.048643592142189e-06, + "loss": 0.5529, + "step": 219 + }, + { + "epoch": 0.06176305446378439, + "grad_norm": 0.7894192337989807, + "learning_rate": 2.0579981290926103e-06, + "loss": 0.5753, + "step": 220 + }, + { + "epoch": 0.06204379562043796, + "grad_norm": 0.9177107214927673, + "learning_rate": 2.067352666043031e-06, + "loss": 0.5163, + "step": 221 + }, + { + "epoch": 0.06232453677709152, + "grad_norm": 0.8762850761413574, + "learning_rate": 2.076707202993452e-06, + "loss": 0.5601, + "step": 222 + }, + { + "epoch": 0.06260527793374508, + "grad_norm": 0.9435070753097534, + "learning_rate": 2.086061739943873e-06, + "loss": 0.5493, + "step": 223 + }, + { + "epoch": 0.06288601909039865, + "grad_norm": 0.8169218301773071, + "learning_rate": 2.095416276894294e-06, + "loss": 0.5298, + "step": 224 + }, + { + "epoch": 0.06316676024705221, + "grad_norm": 0.7907678484916687, + "learning_rate": 2.1047708138447147e-06, + "loss": 0.5219, + "step": 225 + }, + { + "epoch": 0.06344750140370578, + "grad_norm": 0.676476001739502, + "learning_rate": 2.1141253507951357e-06, + "loss": 0.4889, + "step": 226 + }, + { + "epoch": 0.06372824256035935, + "grad_norm": 0.7545673251152039, + "learning_rate": 2.1234798877455568e-06, + "loss": 0.5141, + "step": 227 + }, + { + "epoch": 0.06400898371701291, + "grad_norm": 0.6979267001152039, + "learning_rate": 2.132834424695978e-06, + "loss": 0.5098, + "step": 228 + }, + { + "epoch": 0.06428972487366648, + "grad_norm": 0.783541738986969, + "learning_rate": 2.1421889616463985e-06, + "loss": 0.5096, + "step": 229 + }, + { + "epoch": 0.06457046603032005, + "grad_norm": 0.7176050543785095, + "learning_rate": 2.15154349859682e-06, + "loss": 0.5264, + "step": 230 + }, + { + "epoch": 0.06485120718697361, + "grad_norm": 0.8634214401245117, + "learning_rate": 2.1608980355472406e-06, + "loss": 0.548, + "step": 231 + }, + { + "epoch": 0.06513194834362718, + "grad_norm": 0.8606464266777039, + "learning_rate": 2.1702525724976616e-06, + "loss": 0.5787, + "step": 232 + }, + { + "epoch": 0.06541268950028074, + "grad_norm": 0.7644342184066772, + "learning_rate": 2.1796071094480827e-06, + "loss": 0.5503, + "step": 233 + }, + { + "epoch": 0.06569343065693431, + "grad_norm": 0.7599000930786133, + "learning_rate": 2.1889616463985033e-06, + "loss": 0.5946, + "step": 234 + }, + { + "epoch": 0.06597417181358788, + "grad_norm": 0.9466094374656677, + "learning_rate": 2.1983161833489243e-06, + "loss": 0.5723, + "step": 235 + }, + { + "epoch": 0.06625491297024144, + "grad_norm": 0.6076518893241882, + "learning_rate": 2.2076707202993454e-06, + "loss": 0.502, + "step": 236 + }, + { + "epoch": 0.06653565412689501, + "grad_norm": 0.6740292906761169, + "learning_rate": 2.2170252572497665e-06, + "loss": 0.488, + "step": 237 + }, + { + "epoch": 0.06681639528354857, + "grad_norm": 0.6609193086624146, + "learning_rate": 2.226379794200187e-06, + "loss": 0.5479, + "step": 238 + }, + { + "epoch": 0.06709713644020214, + "grad_norm": 0.7286792397499084, + "learning_rate": 2.235734331150608e-06, + "loss": 0.5128, + "step": 239 + }, + { + "epoch": 0.06737787759685569, + "grad_norm": 0.7384173274040222, + "learning_rate": 2.245088868101029e-06, + "loss": 0.5679, + "step": 240 + }, + { + "epoch": 0.06765861875350926, + "grad_norm": 0.7667889595031738, + "learning_rate": 2.2544434050514502e-06, + "loss": 0.5323, + "step": 241 + }, + { + "epoch": 0.06793935991016282, + "grad_norm": 0.7510852217674255, + "learning_rate": 2.263797942001871e-06, + "loss": 0.5013, + "step": 242 + }, + { + "epoch": 0.06822010106681639, + "grad_norm": 0.9144022464752197, + "learning_rate": 2.2731524789522923e-06, + "loss": 0.5599, + "step": 243 + }, + { + "epoch": 0.06850084222346996, + "grad_norm": 0.8114398717880249, + "learning_rate": 2.282507015902713e-06, + "loss": 0.5345, + "step": 244 + }, + { + "epoch": 0.06878158338012352, + "grad_norm": 0.8570141792297363, + "learning_rate": 2.291861552853134e-06, + "loss": 0.5804, + "step": 245 + }, + { + "epoch": 0.06906232453677709, + "grad_norm": 0.8347529768943787, + "learning_rate": 2.301216089803555e-06, + "loss": 0.5321, + "step": 246 + }, + { + "epoch": 0.06934306569343066, + "grad_norm": 0.6538413166999817, + "learning_rate": 2.310570626753976e-06, + "loss": 0.4891, + "step": 247 + }, + { + "epoch": 0.06962380685008422, + "grad_norm": 0.7439711689949036, + "learning_rate": 2.3199251637043968e-06, + "loss": 0.5169, + "step": 248 + }, + { + "epoch": 0.06990454800673779, + "grad_norm": 0.7807654142379761, + "learning_rate": 2.329279700654818e-06, + "loss": 0.5353, + "step": 249 + }, + { + "epoch": 0.07018528916339135, + "grad_norm": 0.7158603072166443, + "learning_rate": 2.338634237605239e-06, + "loss": 0.5381, + "step": 250 + }, + { + "epoch": 0.07046603032004492, + "grad_norm": 0.6879047751426697, + "learning_rate": 2.34798877455566e-06, + "loss": 0.4972, + "step": 251 + }, + { + "epoch": 0.07074677147669849, + "grad_norm": 0.7796744108200073, + "learning_rate": 2.3573433115060805e-06, + "loss": 0.5541, + "step": 252 + }, + { + "epoch": 0.07102751263335205, + "grad_norm": 0.7775213718414307, + "learning_rate": 2.3666978484565016e-06, + "loss": 0.5263, + "step": 253 + }, + { + "epoch": 0.07130825379000562, + "grad_norm": 0.7353174090385437, + "learning_rate": 2.3760523854069226e-06, + "loss": 0.4889, + "step": 254 + }, + { + "epoch": 0.07158899494665918, + "grad_norm": 0.8232805728912354, + "learning_rate": 2.3854069223573433e-06, + "loss": 0.5443, + "step": 255 + }, + { + "epoch": 0.07186973610331275, + "grad_norm": 0.7795940637588501, + "learning_rate": 2.3947614593077643e-06, + "loss": 0.5768, + "step": 256 + }, + { + "epoch": 0.07215047725996632, + "grad_norm": 0.7369815111160278, + "learning_rate": 2.4041159962581854e-06, + "loss": 0.5662, + "step": 257 + }, + { + "epoch": 0.07243121841661988, + "grad_norm": 0.723147451877594, + "learning_rate": 2.4134705332086064e-06, + "loss": 0.5206, + "step": 258 + }, + { + "epoch": 0.07271195957327344, + "grad_norm": 0.8946998119354248, + "learning_rate": 2.422825070159027e-06, + "loss": 0.5619, + "step": 259 + }, + { + "epoch": 0.072992700729927, + "grad_norm": 0.837316632270813, + "learning_rate": 2.4321796071094485e-06, + "loss": 0.5184, + "step": 260 + }, + { + "epoch": 0.07327344188658057, + "grad_norm": 0.7697188854217529, + "learning_rate": 2.441534144059869e-06, + "loss": 0.5332, + "step": 261 + }, + { + "epoch": 0.07355418304323413, + "grad_norm": 0.9006131887435913, + "learning_rate": 2.45088868101029e-06, + "loss": 0.5867, + "step": 262 + }, + { + "epoch": 0.0738349241998877, + "grad_norm": 0.7689294815063477, + "learning_rate": 2.4602432179607113e-06, + "loss": 0.5182, + "step": 263 + }, + { + "epoch": 0.07411566535654127, + "grad_norm": 0.7972959280014038, + "learning_rate": 2.4695977549111323e-06, + "loss": 0.5812, + "step": 264 + }, + { + "epoch": 0.07439640651319483, + "grad_norm": 0.7685443162918091, + "learning_rate": 2.478952291861553e-06, + "loss": 0.5638, + "step": 265 + }, + { + "epoch": 0.0746771476698484, + "grad_norm": 0.6880684494972229, + "learning_rate": 2.488306828811974e-06, + "loss": 0.5003, + "step": 266 + }, + { + "epoch": 0.07495788882650196, + "grad_norm": 0.7760937213897705, + "learning_rate": 2.497661365762395e-06, + "loss": 0.5633, + "step": 267 + }, + { + "epoch": 0.07523862998315553, + "grad_norm": 0.7446058988571167, + "learning_rate": 2.507015902712816e-06, + "loss": 0.5474, + "step": 268 + }, + { + "epoch": 0.0755193711398091, + "grad_norm": 0.8981422781944275, + "learning_rate": 2.516370439663237e-06, + "loss": 0.5511, + "step": 269 + }, + { + "epoch": 0.07580011229646266, + "grad_norm": 0.7858171463012695, + "learning_rate": 2.5257249766136578e-06, + "loss": 0.5815, + "step": 270 + }, + { + "epoch": 0.07608085345311623, + "grad_norm": 0.7271562218666077, + "learning_rate": 2.535079513564079e-06, + "loss": 0.5551, + "step": 271 + }, + { + "epoch": 0.0763615946097698, + "grad_norm": 0.7712613344192505, + "learning_rate": 2.5444340505145e-06, + "loss": 0.5458, + "step": 272 + }, + { + "epoch": 0.07664233576642336, + "grad_norm": 0.7486150860786438, + "learning_rate": 2.5537885874649205e-06, + "loss": 0.4753, + "step": 273 + }, + { + "epoch": 0.07692307692307693, + "grad_norm": 0.8282490968704224, + "learning_rate": 2.5631431244153416e-06, + "loss": 0.5463, + "step": 274 + }, + { + "epoch": 0.0772038180797305, + "grad_norm": 0.8079036474227905, + "learning_rate": 2.5724976613657626e-06, + "loss": 0.5006, + "step": 275 + }, + { + "epoch": 0.07748455923638406, + "grad_norm": 0.7917653918266296, + "learning_rate": 2.5818521983161837e-06, + "loss": 0.5391, + "step": 276 + }, + { + "epoch": 0.07776530039303763, + "grad_norm": 0.7724319696426392, + "learning_rate": 2.5912067352666043e-06, + "loss": 0.5012, + "step": 277 + }, + { + "epoch": 0.07804604154969118, + "grad_norm": 0.7696590423583984, + "learning_rate": 2.6005612722170253e-06, + "loss": 0.5501, + "step": 278 + }, + { + "epoch": 0.07832678270634474, + "grad_norm": 0.7920299768447876, + "learning_rate": 2.6099158091674464e-06, + "loss": 0.5605, + "step": 279 + }, + { + "epoch": 0.07860752386299831, + "grad_norm": 0.812937319278717, + "learning_rate": 2.6192703461178675e-06, + "loss": 0.5564, + "step": 280 + }, + { + "epoch": 0.07888826501965188, + "grad_norm": 0.8756580352783203, + "learning_rate": 2.628624883068288e-06, + "loss": 0.5502, + "step": 281 + }, + { + "epoch": 0.07916900617630544, + "grad_norm": 0.7585451602935791, + "learning_rate": 2.637979420018709e-06, + "loss": 0.5378, + "step": 282 + }, + { + "epoch": 0.07944974733295901, + "grad_norm": 0.9798131585121155, + "learning_rate": 2.64733395696913e-06, + "loss": 0.5406, + "step": 283 + }, + { + "epoch": 0.07973048848961257, + "grad_norm": 0.6927319169044495, + "learning_rate": 2.6566884939195512e-06, + "loss": 0.5404, + "step": 284 + }, + { + "epoch": 0.08001122964626614, + "grad_norm": 0.795004665851593, + "learning_rate": 2.666043030869972e-06, + "loss": 0.5058, + "step": 285 + }, + { + "epoch": 0.08029197080291971, + "grad_norm": 0.8104981780052185, + "learning_rate": 2.675397567820393e-06, + "loss": 0.5122, + "step": 286 + }, + { + "epoch": 0.08057271195957327, + "grad_norm": 0.910351574420929, + "learning_rate": 2.684752104770814e-06, + "loss": 0.5457, + "step": 287 + }, + { + "epoch": 0.08085345311622684, + "grad_norm": 0.752348780632019, + "learning_rate": 2.6941066417212354e-06, + "loss": 0.5467, + "step": 288 + }, + { + "epoch": 0.0811341942728804, + "grad_norm": 0.7559342980384827, + "learning_rate": 2.7034611786716557e-06, + "loss": 0.553, + "step": 289 + }, + { + "epoch": 0.08141493542953397, + "grad_norm": 0.896124541759491, + "learning_rate": 2.7128157156220767e-06, + "loss": 0.5408, + "step": 290 + }, + { + "epoch": 0.08169567658618754, + "grad_norm": 0.7341452836990356, + "learning_rate": 2.722170252572498e-06, + "loss": 0.5549, + "step": 291 + }, + { + "epoch": 0.0819764177428411, + "grad_norm": 0.7190515398979187, + "learning_rate": 2.7315247895229192e-06, + "loss": 0.4733, + "step": 292 + }, + { + "epoch": 0.08225715889949467, + "grad_norm": 0.8105028867721558, + "learning_rate": 2.74087932647334e-06, + "loss": 0.5223, + "step": 293 + }, + { + "epoch": 0.08253790005614824, + "grad_norm": 0.7164011001586914, + "learning_rate": 2.750233863423761e-06, + "loss": 0.5237, + "step": 294 + }, + { + "epoch": 0.0828186412128018, + "grad_norm": 0.7514110207557678, + "learning_rate": 2.759588400374182e-06, + "loss": 0.488, + "step": 295 + }, + { + "epoch": 0.08309938236945537, + "grad_norm": 0.7827313542366028, + "learning_rate": 2.7689429373246026e-06, + "loss": 0.5395, + "step": 296 + }, + { + "epoch": 0.08338012352610892, + "grad_norm": 0.7245772480964661, + "learning_rate": 2.7782974742750236e-06, + "loss": 0.5206, + "step": 297 + }, + { + "epoch": 0.08366086468276249, + "grad_norm": 0.8863216638565063, + "learning_rate": 2.7876520112254447e-06, + "loss": 0.5715, + "step": 298 + }, + { + "epoch": 0.08394160583941605, + "grad_norm": 0.7554477453231812, + "learning_rate": 2.7970065481758657e-06, + "loss": 0.5577, + "step": 299 + }, + { + "epoch": 0.08422234699606962, + "grad_norm": 0.7475144267082214, + "learning_rate": 2.8063610851262864e-06, + "loss": 0.5233, + "step": 300 + }, + { + "epoch": 0.08450308815272319, + "grad_norm": 0.7585038542747498, + "learning_rate": 2.8157156220767074e-06, + "loss": 0.5567, + "step": 301 + }, + { + "epoch": 0.08478382930937675, + "grad_norm": 0.7732409834861755, + "learning_rate": 2.8250701590271285e-06, + "loss": 0.541, + "step": 302 + }, + { + "epoch": 0.08506457046603032, + "grad_norm": 0.7325016260147095, + "learning_rate": 2.8344246959775495e-06, + "loss": 0.5193, + "step": 303 + }, + { + "epoch": 0.08534531162268388, + "grad_norm": 0.6840338110923767, + "learning_rate": 2.84377923292797e-06, + "loss": 0.5083, + "step": 304 + }, + { + "epoch": 0.08562605277933745, + "grad_norm": 0.6712573766708374, + "learning_rate": 2.853133769878391e-06, + "loss": 0.5338, + "step": 305 + }, + { + "epoch": 0.08590679393599102, + "grad_norm": 0.8282577991485596, + "learning_rate": 2.8624883068288123e-06, + "loss": 0.6124, + "step": 306 + }, + { + "epoch": 0.08618753509264458, + "grad_norm": 0.6931818723678589, + "learning_rate": 2.8718428437792333e-06, + "loss": 0.5231, + "step": 307 + }, + { + "epoch": 0.08646827624929815, + "grad_norm": 0.7463021874427795, + "learning_rate": 2.881197380729654e-06, + "loss": 0.4949, + "step": 308 + }, + { + "epoch": 0.08674901740595171, + "grad_norm": 0.7152007222175598, + "learning_rate": 2.890551917680075e-06, + "loss": 0.539, + "step": 309 + }, + { + "epoch": 0.08702975856260528, + "grad_norm": 0.7241409420967102, + "learning_rate": 2.899906454630496e-06, + "loss": 0.5563, + "step": 310 + }, + { + "epoch": 0.08731049971925885, + "grad_norm": 0.7746958136558533, + "learning_rate": 2.909260991580917e-06, + "loss": 0.5484, + "step": 311 + }, + { + "epoch": 0.08759124087591241, + "grad_norm": 0.7572389245033264, + "learning_rate": 2.9186155285313377e-06, + "loss": 0.5298, + "step": 312 + }, + { + "epoch": 0.08787198203256598, + "grad_norm": 0.7889935374259949, + "learning_rate": 2.9279700654817588e-06, + "loss": 0.543, + "step": 313 + }, + { + "epoch": 0.08815272318921955, + "grad_norm": 0.7501704096794128, + "learning_rate": 2.93732460243218e-06, + "loss": 0.5314, + "step": 314 + }, + { + "epoch": 0.08843346434587311, + "grad_norm": 0.8016058206558228, + "learning_rate": 2.9466791393826005e-06, + "loss": 0.5535, + "step": 315 + }, + { + "epoch": 0.08871420550252666, + "grad_norm": 0.7872318029403687, + "learning_rate": 2.9560336763330215e-06, + "loss": 0.4671, + "step": 316 + }, + { + "epoch": 0.08899494665918023, + "grad_norm": 0.7549393177032471, + "learning_rate": 2.9653882132834426e-06, + "loss": 0.5145, + "step": 317 + }, + { + "epoch": 0.0892756878158338, + "grad_norm": 0.8216416835784912, + "learning_rate": 2.974742750233864e-06, + "loss": 0.5564, + "step": 318 + }, + { + "epoch": 0.08955642897248736, + "grad_norm": 0.8333849906921387, + "learning_rate": 2.9840972871842842e-06, + "loss": 0.5188, + "step": 319 + }, + { + "epoch": 0.08983717012914093, + "grad_norm": 0.8397712111473083, + "learning_rate": 2.9934518241347053e-06, + "loss": 0.5891, + "step": 320 + }, + { + "epoch": 0.0901179112857945, + "grad_norm": 0.8094610571861267, + "learning_rate": 3.0028063610851268e-06, + "loss": 0.4991, + "step": 321 + }, + { + "epoch": 0.09039865244244806, + "grad_norm": 0.6761815547943115, + "learning_rate": 3.012160898035548e-06, + "loss": 0.5209, + "step": 322 + }, + { + "epoch": 0.09067939359910163, + "grad_norm": 0.7161853909492493, + "learning_rate": 3.021515434985968e-06, + "loss": 0.5243, + "step": 323 + }, + { + "epoch": 0.09096013475575519, + "grad_norm": 0.7544697523117065, + "learning_rate": 3.0308699719363895e-06, + "loss": 0.5265, + "step": 324 + }, + { + "epoch": 0.09124087591240876, + "grad_norm": 0.823232889175415, + "learning_rate": 3.0402245088868106e-06, + "loss": 0.5217, + "step": 325 + }, + { + "epoch": 0.09152161706906232, + "grad_norm": 0.9152257442474365, + "learning_rate": 3.0495790458372316e-06, + "loss": 0.5331, + "step": 326 + }, + { + "epoch": 0.09180235822571589, + "grad_norm": 0.832825779914856, + "learning_rate": 3.0589335827876522e-06, + "loss": 0.5325, + "step": 327 + }, + { + "epoch": 0.09208309938236946, + "grad_norm": 0.8086512684822083, + "learning_rate": 3.0682881197380733e-06, + "loss": 0.5288, + "step": 328 + }, + { + "epoch": 0.09236384053902302, + "grad_norm": 0.7992445230484009, + "learning_rate": 3.0776426566884943e-06, + "loss": 0.5526, + "step": 329 + }, + { + "epoch": 0.09264458169567659, + "grad_norm": 0.726046621799469, + "learning_rate": 3.0869971936389154e-06, + "loss": 0.4995, + "step": 330 + }, + { + "epoch": 0.09292532285233016, + "grad_norm": 0.8194549083709717, + "learning_rate": 3.096351730589336e-06, + "loss": 0.5416, + "step": 331 + }, + { + "epoch": 0.09320606400898372, + "grad_norm": 0.7737677097320557, + "learning_rate": 3.105706267539757e-06, + "loss": 0.5339, + "step": 332 + }, + { + "epoch": 0.09348680516563729, + "grad_norm": 0.8151153922080994, + "learning_rate": 3.115060804490178e-06, + "loss": 0.5544, + "step": 333 + }, + { + "epoch": 0.09376754632229085, + "grad_norm": 0.7140392661094666, + "learning_rate": 3.124415341440599e-06, + "loss": 0.528, + "step": 334 + }, + { + "epoch": 0.09404828747894442, + "grad_norm": 0.8134286999702454, + "learning_rate": 3.13376987839102e-06, + "loss": 0.4922, + "step": 335 + }, + { + "epoch": 0.09432902863559797, + "grad_norm": 0.7863975763320923, + "learning_rate": 3.143124415341441e-06, + "loss": 0.4945, + "step": 336 + }, + { + "epoch": 0.09460976979225154, + "grad_norm": 0.8091383576393127, + "learning_rate": 3.152478952291862e-06, + "loss": 0.5016, + "step": 337 + }, + { + "epoch": 0.0948905109489051, + "grad_norm": 0.7497513890266418, + "learning_rate": 3.1618334892422825e-06, + "loss": 0.5357, + "step": 338 + }, + { + "epoch": 0.09517125210555867, + "grad_norm": 0.7403958439826965, + "learning_rate": 3.1711880261927036e-06, + "loss": 0.5273, + "step": 339 + }, + { + "epoch": 0.09545199326221224, + "grad_norm": 0.7487135529518127, + "learning_rate": 3.1805425631431246e-06, + "loss": 0.4795, + "step": 340 + }, + { + "epoch": 0.0957327344188658, + "grad_norm": 0.7567718029022217, + "learning_rate": 3.1898971000935457e-06, + "loss": 0.5271, + "step": 341 + }, + { + "epoch": 0.09601347557551937, + "grad_norm": 0.7643928527832031, + "learning_rate": 3.1992516370439663e-06, + "loss": 0.5559, + "step": 342 + }, + { + "epoch": 0.09629421673217294, + "grad_norm": 0.7757812738418579, + "learning_rate": 3.2086061739943874e-06, + "loss": 0.5476, + "step": 343 + }, + { + "epoch": 0.0965749578888265, + "grad_norm": 0.6897493600845337, + "learning_rate": 3.2179607109448084e-06, + "loss": 0.4861, + "step": 344 + }, + { + "epoch": 0.09685569904548007, + "grad_norm": 0.8549035787582397, + "learning_rate": 3.2273152478952295e-06, + "loss": 0.5076, + "step": 345 + }, + { + "epoch": 0.09713644020213363, + "grad_norm": 0.8369840979576111, + "learning_rate": 3.23666978484565e-06, + "loss": 0.5164, + "step": 346 + }, + { + "epoch": 0.0974171813587872, + "grad_norm": 0.7894216775894165, + "learning_rate": 3.246024321796071e-06, + "loss": 0.5282, + "step": 347 + }, + { + "epoch": 0.09769792251544077, + "grad_norm": 0.7484057545661926, + "learning_rate": 3.2553788587464922e-06, + "loss": 0.4986, + "step": 348 + }, + { + "epoch": 0.09797866367209433, + "grad_norm": 0.8106874823570251, + "learning_rate": 3.2647333956969137e-06, + "loss": 0.5534, + "step": 349 + }, + { + "epoch": 0.0982594048287479, + "grad_norm": 0.6894437670707703, + "learning_rate": 3.274087932647334e-06, + "loss": 0.5158, + "step": 350 + }, + { + "epoch": 0.09854014598540146, + "grad_norm": 0.7431429028511047, + "learning_rate": 3.283442469597755e-06, + "loss": 0.51, + "step": 351 + }, + { + "epoch": 0.09882088714205503, + "grad_norm": 0.7787994742393494, + "learning_rate": 3.2927970065481764e-06, + "loss": 0.4997, + "step": 352 + }, + { + "epoch": 0.0991016282987086, + "grad_norm": 0.7309817671775818, + "learning_rate": 3.3021515434985975e-06, + "loss": 0.544, + "step": 353 + }, + { + "epoch": 0.09938236945536216, + "grad_norm": 0.7808282971382141, + "learning_rate": 3.311506080449018e-06, + "loss": 0.5241, + "step": 354 + }, + { + "epoch": 0.09966311061201572, + "grad_norm": 0.7501707077026367, + "learning_rate": 3.320860617399439e-06, + "loss": 0.5323, + "step": 355 + }, + { + "epoch": 0.09994385176866928, + "grad_norm": 0.7073619365692139, + "learning_rate": 3.33021515434986e-06, + "loss": 0.4831, + "step": 356 + }, + { + "epoch": 0.10022459292532285, + "grad_norm": 0.8075127601623535, + "learning_rate": 3.339569691300281e-06, + "loss": 0.5469, + "step": 357 + }, + { + "epoch": 0.10050533408197641, + "grad_norm": 0.7519919872283936, + "learning_rate": 3.348924228250702e-06, + "loss": 0.4982, + "step": 358 + }, + { + "epoch": 0.10078607523862998, + "grad_norm": 0.827935516834259, + "learning_rate": 3.358278765201123e-06, + "loss": 0.5476, + "step": 359 + }, + { + "epoch": 0.10106681639528355, + "grad_norm": 0.7817774415016174, + "learning_rate": 3.367633302151544e-06, + "loss": 0.5202, + "step": 360 + }, + { + "epoch": 0.10134755755193711, + "grad_norm": 0.8603132963180542, + "learning_rate": 3.3769878391019646e-06, + "loss": 0.5865, + "step": 361 + }, + { + "epoch": 0.10162829870859068, + "grad_norm": 0.7862941026687622, + "learning_rate": 3.3863423760523857e-06, + "loss": 0.5135, + "step": 362 + }, + { + "epoch": 0.10190903986524424, + "grad_norm": 0.8191604614257812, + "learning_rate": 3.3956969130028067e-06, + "loss": 0.5339, + "step": 363 + }, + { + "epoch": 0.10218978102189781, + "grad_norm": 0.7199459671974182, + "learning_rate": 3.4050514499532278e-06, + "loss": 0.5013, + "step": 364 + }, + { + "epoch": 0.10247052217855138, + "grad_norm": 0.8049266934394836, + "learning_rate": 3.4144059869036484e-06, + "loss": 0.494, + "step": 365 + }, + { + "epoch": 0.10275126333520494, + "grad_norm": 0.8080098628997803, + "learning_rate": 3.4237605238540695e-06, + "loss": 0.5302, + "step": 366 + }, + { + "epoch": 0.10303200449185851, + "grad_norm": 0.8382777571678162, + "learning_rate": 3.4331150608044905e-06, + "loss": 0.5034, + "step": 367 + }, + { + "epoch": 0.10331274564851207, + "grad_norm": 0.6838870644569397, + "learning_rate": 3.4424695977549116e-06, + "loss": 0.5109, + "step": 368 + }, + { + "epoch": 0.10359348680516564, + "grad_norm": 0.7601078152656555, + "learning_rate": 3.451824134705332e-06, + "loss": 0.4835, + "step": 369 + }, + { + "epoch": 0.10387422796181921, + "grad_norm": 0.7310308218002319, + "learning_rate": 3.4611786716557532e-06, + "loss": 0.5082, + "step": 370 + }, + { + "epoch": 0.10415496911847277, + "grad_norm": 0.813713788986206, + "learning_rate": 3.4705332086061743e-06, + "loss": 0.5292, + "step": 371 + }, + { + "epoch": 0.10443571027512634, + "grad_norm": 0.7797373533248901, + "learning_rate": 3.4798877455565953e-06, + "loss": 0.4605, + "step": 372 + }, + { + "epoch": 0.1047164514317799, + "grad_norm": 0.7014128565788269, + "learning_rate": 3.489242282507016e-06, + "loss": 0.4903, + "step": 373 + }, + { + "epoch": 0.10499719258843346, + "grad_norm": 0.7359755635261536, + "learning_rate": 3.498596819457437e-06, + "loss": 0.4911, + "step": 374 + }, + { + "epoch": 0.10527793374508702, + "grad_norm": 0.7577720880508423, + "learning_rate": 3.507951356407858e-06, + "loss": 0.503, + "step": 375 + }, + { + "epoch": 0.10555867490174059, + "grad_norm": 0.8713043928146362, + "learning_rate": 3.517305893358279e-06, + "loss": 0.5975, + "step": 376 + }, + { + "epoch": 0.10583941605839416, + "grad_norm": 0.7561119794845581, + "learning_rate": 3.5266604303086998e-06, + "loss": 0.5536, + "step": 377 + }, + { + "epoch": 0.10612015721504772, + "grad_norm": 0.8345180749893188, + "learning_rate": 3.536014967259121e-06, + "loss": 0.5266, + "step": 378 + }, + { + "epoch": 0.10640089837170129, + "grad_norm": 0.7421478629112244, + "learning_rate": 3.545369504209542e-06, + "loss": 0.4975, + "step": 379 + }, + { + "epoch": 0.10668163952835485, + "grad_norm": 0.8593834638595581, + "learning_rate": 3.5547240411599625e-06, + "loss": 0.5739, + "step": 380 + }, + { + "epoch": 0.10696238068500842, + "grad_norm": 0.6971132755279541, + "learning_rate": 3.5640785781103835e-06, + "loss": 0.4987, + "step": 381 + }, + { + "epoch": 0.10724312184166199, + "grad_norm": 0.7154144048690796, + "learning_rate": 3.573433115060805e-06, + "loss": 0.5, + "step": 382 + }, + { + "epoch": 0.10752386299831555, + "grad_norm": 0.6905863285064697, + "learning_rate": 3.582787652011226e-06, + "loss": 0.5299, + "step": 383 + }, + { + "epoch": 0.10780460415496912, + "grad_norm": 0.7130893468856812, + "learning_rate": 3.5921421889616463e-06, + "loss": 0.4624, + "step": 384 + }, + { + "epoch": 0.10808534531162269, + "grad_norm": 0.7693150043487549, + "learning_rate": 3.6014967259120677e-06, + "loss": 0.5376, + "step": 385 + }, + { + "epoch": 0.10836608646827625, + "grad_norm": 0.8047574758529663, + "learning_rate": 3.610851262862489e-06, + "loss": 0.4938, + "step": 386 + }, + { + "epoch": 0.10864682762492982, + "grad_norm": 0.7278048396110535, + "learning_rate": 3.62020579981291e-06, + "loss": 0.5318, + "step": 387 + }, + { + "epoch": 0.10892756878158338, + "grad_norm": 0.783423900604248, + "learning_rate": 3.6295603367633305e-06, + "loss": 0.5297, + "step": 388 + }, + { + "epoch": 0.10920830993823695, + "grad_norm": 0.7358670234680176, + "learning_rate": 3.6389148737137515e-06, + "loss": 0.5068, + "step": 389 + }, + { + "epoch": 0.10948905109489052, + "grad_norm": 0.7775084972381592, + "learning_rate": 3.6482694106641726e-06, + "loss": 0.5555, + "step": 390 + }, + { + "epoch": 0.10976979225154408, + "grad_norm": 0.7384195923805237, + "learning_rate": 3.6576239476145936e-06, + "loss": 0.5267, + "step": 391 + }, + { + "epoch": 0.11005053340819765, + "grad_norm": 0.8676061034202576, + "learning_rate": 3.6669784845650143e-06, + "loss": 0.5341, + "step": 392 + }, + { + "epoch": 0.1103312745648512, + "grad_norm": 0.7602493762969971, + "learning_rate": 3.6763330215154353e-06, + "loss": 0.4882, + "step": 393 + }, + { + "epoch": 0.11061201572150477, + "grad_norm": 0.7962198853492737, + "learning_rate": 3.6856875584658564e-06, + "loss": 0.5062, + "step": 394 + }, + { + "epoch": 0.11089275687815833, + "grad_norm": 0.8236138224601746, + "learning_rate": 3.6950420954162774e-06, + "loss": 0.4992, + "step": 395 + }, + { + "epoch": 0.1111734980348119, + "grad_norm": 0.8599743843078613, + "learning_rate": 3.704396632366698e-06, + "loss": 0.5382, + "step": 396 + }, + { + "epoch": 0.11145423919146547, + "grad_norm": 0.8462352156639099, + "learning_rate": 3.713751169317119e-06, + "loss": 0.5305, + "step": 397 + }, + { + "epoch": 0.11173498034811903, + "grad_norm": 0.7529622912406921, + "learning_rate": 3.72310570626754e-06, + "loss": 0.4833, + "step": 398 + }, + { + "epoch": 0.1120157215047726, + "grad_norm": 0.9346505999565125, + "learning_rate": 3.7324602432179608e-06, + "loss": 0.5364, + "step": 399 + }, + { + "epoch": 0.11229646266142616, + "grad_norm": 0.9087169170379639, + "learning_rate": 3.741814780168382e-06, + "loss": 0.5304, + "step": 400 + }, + { + "epoch": 0.11257720381807973, + "grad_norm": 0.7055246829986572, + "learning_rate": 3.751169317118803e-06, + "loss": 0.4843, + "step": 401 + }, + { + "epoch": 0.1128579449747333, + "grad_norm": 0.8225287795066833, + "learning_rate": 3.760523854069224e-06, + "loss": 0.5396, + "step": 402 + }, + { + "epoch": 0.11313868613138686, + "grad_norm": 0.8399577140808105, + "learning_rate": 3.7698783910196446e-06, + "loss": 0.4898, + "step": 403 + }, + { + "epoch": 0.11341942728804043, + "grad_norm": 0.8058261275291443, + "learning_rate": 3.7792329279700656e-06, + "loss": 0.5178, + "step": 404 + }, + { + "epoch": 0.113700168444694, + "grad_norm": 0.7236977219581604, + "learning_rate": 3.7885874649204867e-06, + "loss": 0.4987, + "step": 405 + }, + { + "epoch": 0.11398090960134756, + "grad_norm": 0.8208003640174866, + "learning_rate": 3.7979420018709077e-06, + "loss": 0.5161, + "step": 406 + }, + { + "epoch": 0.11426165075800113, + "grad_norm": 0.8044561147689819, + "learning_rate": 3.8072965388213284e-06, + "loss": 0.5231, + "step": 407 + }, + { + "epoch": 0.11454239191465469, + "grad_norm": 0.9247372150421143, + "learning_rate": 3.816651075771749e-06, + "loss": 0.5591, + "step": 408 + }, + { + "epoch": 0.11482313307130826, + "grad_norm": 0.7564449906349182, + "learning_rate": 3.826005612722171e-06, + "loss": 0.5211, + "step": 409 + }, + { + "epoch": 0.11510387422796182, + "grad_norm": 0.7962033152580261, + "learning_rate": 3.8353601496725915e-06, + "loss": 0.4561, + "step": 410 + }, + { + "epoch": 0.11538461538461539, + "grad_norm": 0.8253943920135498, + "learning_rate": 3.844714686623012e-06, + "loss": 0.4698, + "step": 411 + }, + { + "epoch": 0.11566535654126894, + "grad_norm": 0.7705097198486328, + "learning_rate": 3.854069223573434e-06, + "loss": 0.5227, + "step": 412 + }, + { + "epoch": 0.11594609769792251, + "grad_norm": 0.7166139483451843, + "learning_rate": 3.863423760523854e-06, + "loss": 0.5114, + "step": 413 + }, + { + "epoch": 0.11622683885457608, + "grad_norm": 0.8675190210342407, + "learning_rate": 3.872778297474276e-06, + "loss": 0.5241, + "step": 414 + }, + { + "epoch": 0.11650758001122964, + "grad_norm": 0.7419388294219971, + "learning_rate": 3.882132834424696e-06, + "loss": 0.474, + "step": 415 + }, + { + "epoch": 0.11678832116788321, + "grad_norm": 0.7719217538833618, + "learning_rate": 3.891487371375117e-06, + "loss": 0.5058, + "step": 416 + }, + { + "epoch": 0.11706906232453677, + "grad_norm": 0.9033870697021484, + "learning_rate": 3.9008419083255384e-06, + "loss": 0.4945, + "step": 417 + }, + { + "epoch": 0.11734980348119034, + "grad_norm": 0.754767119884491, + "learning_rate": 3.910196445275959e-06, + "loss": 0.4682, + "step": 418 + }, + { + "epoch": 0.1176305446378439, + "grad_norm": 0.7341263294219971, + "learning_rate": 3.91955098222638e-06, + "loss": 0.5304, + "step": 419 + }, + { + "epoch": 0.11791128579449747, + "grad_norm": 0.7991729378700256, + "learning_rate": 3.928905519176801e-06, + "loss": 0.5031, + "step": 420 + }, + { + "epoch": 0.11819202695115104, + "grad_norm": 0.9032796025276184, + "learning_rate": 3.938260056127222e-06, + "loss": 0.5169, + "step": 421 + }, + { + "epoch": 0.1184727681078046, + "grad_norm": 0.7505165934562683, + "learning_rate": 3.9476145930776424e-06, + "loss": 0.4967, + "step": 422 + }, + { + "epoch": 0.11875350926445817, + "grad_norm": 0.7698094844818115, + "learning_rate": 3.956969130028064e-06, + "loss": 0.5515, + "step": 423 + }, + { + "epoch": 0.11903425042111174, + "grad_norm": 0.7911322116851807, + "learning_rate": 3.9663236669784845e-06, + "loss": 0.4701, + "step": 424 + }, + { + "epoch": 0.1193149915777653, + "grad_norm": 0.7555925846099854, + "learning_rate": 3.975678203928906e-06, + "loss": 0.5028, + "step": 425 + }, + { + "epoch": 0.11959573273441887, + "grad_norm": 0.8901088833808899, + "learning_rate": 3.985032740879327e-06, + "loss": 0.5457, + "step": 426 + }, + { + "epoch": 0.11987647389107244, + "grad_norm": 0.7833061218261719, + "learning_rate": 3.994387277829747e-06, + "loss": 0.5151, + "step": 427 + }, + { + "epoch": 0.120157215047726, + "grad_norm": 0.7623012065887451, + "learning_rate": 4.003741814780169e-06, + "loss": 0.4722, + "step": 428 + }, + { + "epoch": 0.12043795620437957, + "grad_norm": 0.8048515915870667, + "learning_rate": 4.01309635173059e-06, + "loss": 0.4987, + "step": 429 + }, + { + "epoch": 0.12071869736103313, + "grad_norm": 0.9146219491958618, + "learning_rate": 4.02245088868101e-06, + "loss": 0.5422, + "step": 430 + }, + { + "epoch": 0.12099943851768669, + "grad_norm": 0.8742251992225647, + "learning_rate": 4.0318054256314315e-06, + "loss": 0.5352, + "step": 431 + }, + { + "epoch": 0.12128017967434025, + "grad_norm": 0.8957265615463257, + "learning_rate": 4.041159962581853e-06, + "loss": 0.5251, + "step": 432 + }, + { + "epoch": 0.12156092083099382, + "grad_norm": 0.7855954766273499, + "learning_rate": 4.050514499532274e-06, + "loss": 0.5136, + "step": 433 + }, + { + "epoch": 0.12184166198764738, + "grad_norm": 0.8036879897117615, + "learning_rate": 4.059869036482694e-06, + "loss": 0.5542, + "step": 434 + }, + { + "epoch": 0.12212240314430095, + "grad_norm": 0.727243185043335, + "learning_rate": 4.069223573433116e-06, + "loss": 0.5156, + "step": 435 + }, + { + "epoch": 0.12240314430095452, + "grad_norm": 0.8112772703170776, + "learning_rate": 4.078578110383536e-06, + "loss": 0.5292, + "step": 436 + }, + { + "epoch": 0.12268388545760808, + "grad_norm": 0.7895253896713257, + "learning_rate": 4.087932647333958e-06, + "loss": 0.4958, + "step": 437 + }, + { + "epoch": 0.12296462661426165, + "grad_norm": 0.7211445569992065, + "learning_rate": 4.097287184284378e-06, + "loss": 0.5347, + "step": 438 + }, + { + "epoch": 0.12324536777091522, + "grad_norm": 0.8120772242546082, + "learning_rate": 4.106641721234799e-06, + "loss": 0.5109, + "step": 439 + }, + { + "epoch": 0.12352610892756878, + "grad_norm": 0.9701956510543823, + "learning_rate": 4.1159962581852205e-06, + "loss": 0.5731, + "step": 440 + }, + { + "epoch": 0.12380685008422235, + "grad_norm": 0.7655613422393799, + "learning_rate": 4.125350795135641e-06, + "loss": 0.5056, + "step": 441 + }, + { + "epoch": 0.12408759124087591, + "grad_norm": 0.8565111756324768, + "learning_rate": 4.134705332086062e-06, + "loss": 0.5134, + "step": 442 + }, + { + "epoch": 0.12436833239752948, + "grad_norm": 0.8162031769752502, + "learning_rate": 4.144059869036483e-06, + "loss": 0.5414, + "step": 443 + }, + { + "epoch": 0.12464907355418305, + "grad_norm": 0.8924745917320251, + "learning_rate": 4.153414405986904e-06, + "loss": 0.529, + "step": 444 + }, + { + "epoch": 0.12492981471083661, + "grad_norm": 0.8853678703308105, + "learning_rate": 4.1627689429373245e-06, + "loss": 0.5486, + "step": 445 + }, + { + "epoch": 0.12521055586749016, + "grad_norm": 0.7069456577301025, + "learning_rate": 4.172123479887746e-06, + "loss": 0.4879, + "step": 446 + }, + { + "epoch": 0.12549129702414374, + "grad_norm": 0.7341776490211487, + "learning_rate": 4.181478016838167e-06, + "loss": 0.4936, + "step": 447 + }, + { + "epoch": 0.1257720381807973, + "grad_norm": 0.7338918447494507, + "learning_rate": 4.190832553788588e-06, + "loss": 0.5155, + "step": 448 + }, + { + "epoch": 0.12605277933745088, + "grad_norm": 0.868898868560791, + "learning_rate": 4.200187090739009e-06, + "loss": 0.5179, + "step": 449 + }, + { + "epoch": 0.12633352049410443, + "grad_norm": 0.7320483326911926, + "learning_rate": 4.209541627689429e-06, + "loss": 0.5242, + "step": 450 + }, + { + "epoch": 0.126614261650758, + "grad_norm": 0.7627094388008118, + "learning_rate": 4.218896164639851e-06, + "loss": 0.5039, + "step": 451 + }, + { + "epoch": 0.12689500280741156, + "grad_norm": 0.8147349953651428, + "learning_rate": 4.2282507015902715e-06, + "loss": 0.4611, + "step": 452 + }, + { + "epoch": 0.12717574396406514, + "grad_norm": 0.8715364933013916, + "learning_rate": 4.237605238540692e-06, + "loss": 0.4628, + "step": 453 + }, + { + "epoch": 0.1274564851207187, + "grad_norm": 0.8515135645866394, + "learning_rate": 4.2469597754911136e-06, + "loss": 0.4578, + "step": 454 + }, + { + "epoch": 0.12773722627737227, + "grad_norm": 0.8269696831703186, + "learning_rate": 4.256314312441534e-06, + "loss": 0.5201, + "step": 455 + }, + { + "epoch": 0.12801796743402583, + "grad_norm": 0.7364990711212158, + "learning_rate": 4.265668849391956e-06, + "loss": 0.5088, + "step": 456 + }, + { + "epoch": 0.1282987085906794, + "grad_norm": 0.8844892382621765, + "learning_rate": 4.275023386342376e-06, + "loss": 0.5124, + "step": 457 + }, + { + "epoch": 0.12857944974733296, + "grad_norm": 1.0102354288101196, + "learning_rate": 4.284377923292797e-06, + "loss": 0.5183, + "step": 458 + }, + { + "epoch": 0.12886019090398654, + "grad_norm": 0.7829555869102478, + "learning_rate": 4.293732460243218e-06, + "loss": 0.5009, + "step": 459 + }, + { + "epoch": 0.1291409320606401, + "grad_norm": 0.7153803706169128, + "learning_rate": 4.30308699719364e-06, + "loss": 0.5156, + "step": 460 + }, + { + "epoch": 0.12942167321729364, + "grad_norm": 0.8135461807250977, + "learning_rate": 4.3124415341440605e-06, + "loss": 0.4938, + "step": 461 + }, + { + "epoch": 0.12970241437394722, + "grad_norm": 0.8829972743988037, + "learning_rate": 4.321796071094481e-06, + "loss": 0.506, + "step": 462 + }, + { + "epoch": 0.12998315553060077, + "grad_norm": 0.8309484124183655, + "learning_rate": 4.331150608044903e-06, + "loss": 0.5434, + "step": 463 + }, + { + "epoch": 0.13026389668725435, + "grad_norm": 0.7044673562049866, + "learning_rate": 4.340505144995323e-06, + "loss": 0.4991, + "step": 464 + }, + { + "epoch": 0.1305446378439079, + "grad_norm": 0.9659757018089294, + "learning_rate": 4.349859681945744e-06, + "loss": 0.551, + "step": 465 + }, + { + "epoch": 0.1308253790005615, + "grad_norm": 0.8155401349067688, + "learning_rate": 4.359214218896165e-06, + "loss": 0.5389, + "step": 466 + }, + { + "epoch": 0.13110612015721504, + "grad_norm": 0.6680974364280701, + "learning_rate": 4.368568755846586e-06, + "loss": 0.4989, + "step": 467 + }, + { + "epoch": 0.13138686131386862, + "grad_norm": 0.7897123098373413, + "learning_rate": 4.377923292797007e-06, + "loss": 0.4981, + "step": 468 + }, + { + "epoch": 0.13166760247052217, + "grad_norm": 0.844268262386322, + "learning_rate": 4.387277829747428e-06, + "loss": 0.5175, + "step": 469 + }, + { + "epoch": 0.13194834362717575, + "grad_norm": 0.7481633424758911, + "learning_rate": 4.396632366697849e-06, + "loss": 0.5522, + "step": 470 + }, + { + "epoch": 0.1322290847838293, + "grad_norm": 0.8489326238632202, + "learning_rate": 4.40598690364827e-06, + "loss": 0.5321, + "step": 471 + }, + { + "epoch": 0.13250982594048288, + "grad_norm": 0.8275486826896667, + "learning_rate": 4.415341440598691e-06, + "loss": 0.4853, + "step": 472 + }, + { + "epoch": 0.13279056709713644, + "grad_norm": 0.7152136564254761, + "learning_rate": 4.4246959775491114e-06, + "loss": 0.5229, + "step": 473 + }, + { + "epoch": 0.13307130825379002, + "grad_norm": 0.7578805088996887, + "learning_rate": 4.434050514499533e-06, + "loss": 0.4997, + "step": 474 + }, + { + "epoch": 0.13335204941044357, + "grad_norm": 0.7343823313713074, + "learning_rate": 4.4434050514499535e-06, + "loss": 0.5085, + "step": 475 + }, + { + "epoch": 0.13363279056709715, + "grad_norm": 0.8387718796730042, + "learning_rate": 4.452759588400374e-06, + "loss": 0.5004, + "step": 476 + }, + { + "epoch": 0.1339135317237507, + "grad_norm": 0.7833738327026367, + "learning_rate": 4.462114125350796e-06, + "loss": 0.5447, + "step": 477 + }, + { + "epoch": 0.13419427288040428, + "grad_norm": 0.9023407697677612, + "learning_rate": 4.471468662301216e-06, + "loss": 0.5389, + "step": 478 + }, + { + "epoch": 0.13447501403705783, + "grad_norm": 0.7810462117195129, + "learning_rate": 4.480823199251638e-06, + "loss": 0.4702, + "step": 479 + }, + { + "epoch": 0.13475575519371139, + "grad_norm": 0.8092748522758484, + "learning_rate": 4.490177736202058e-06, + "loss": 0.5314, + "step": 480 + }, + { + "epoch": 0.13503649635036497, + "grad_norm": 0.7407997250556946, + "learning_rate": 4.499532273152479e-06, + "loss": 0.5026, + "step": 481 + }, + { + "epoch": 0.13531723750701852, + "grad_norm": 0.8457898497581482, + "learning_rate": 4.5088868101029005e-06, + "loss": 0.5533, + "step": 482 + }, + { + "epoch": 0.1355979786636721, + "grad_norm": 0.765034019947052, + "learning_rate": 4.518241347053321e-06, + "loss": 0.5362, + "step": 483 + }, + { + "epoch": 0.13587871982032565, + "grad_norm": 0.6809737682342529, + "learning_rate": 4.527595884003742e-06, + "loss": 0.4814, + "step": 484 + }, + { + "epoch": 0.13615946097697923, + "grad_norm": 0.8527835011482239, + "learning_rate": 4.536950420954163e-06, + "loss": 0.5112, + "step": 485 + }, + { + "epoch": 0.13644020213363278, + "grad_norm": 0.7911974191665649, + "learning_rate": 4.546304957904585e-06, + "loss": 0.516, + "step": 486 + }, + { + "epoch": 0.13672094329028636, + "grad_norm": 0.8519447445869446, + "learning_rate": 4.5556594948550045e-06, + "loss": 0.5101, + "step": 487 + }, + { + "epoch": 0.13700168444693991, + "grad_norm": 0.6963992714881897, + "learning_rate": 4.565014031805426e-06, + "loss": 0.4595, + "step": 488 + }, + { + "epoch": 0.1372824256035935, + "grad_norm": 0.8214945793151855, + "learning_rate": 4.574368568755847e-06, + "loss": 0.5205, + "step": 489 + }, + { + "epoch": 0.13756316676024705, + "grad_norm": 0.7888759970664978, + "learning_rate": 4.583723105706268e-06, + "loss": 0.5406, + "step": 490 + }, + { + "epoch": 0.13784390791690063, + "grad_norm": 0.7536157369613647, + "learning_rate": 4.593077642656689e-06, + "loss": 0.5322, + "step": 491 + }, + { + "epoch": 0.13812464907355418, + "grad_norm": 0.8491219282150269, + "learning_rate": 4.60243217960711e-06, + "loss": 0.5074, + "step": 492 + }, + { + "epoch": 0.13840539023020776, + "grad_norm": 0.7836640477180481, + "learning_rate": 4.611786716557531e-06, + "loss": 0.4925, + "step": 493 + }, + { + "epoch": 0.1386861313868613, + "grad_norm": 0.8089984059333801, + "learning_rate": 4.621141253507952e-06, + "loss": 0.4947, + "step": 494 + }, + { + "epoch": 0.1389668725435149, + "grad_norm": 0.9444265365600586, + "learning_rate": 4.630495790458373e-06, + "loss": 0.4985, + "step": 495 + }, + { + "epoch": 0.13924761370016844, + "grad_norm": 0.8405911326408386, + "learning_rate": 4.6398503274087935e-06, + "loss": 0.5427, + "step": 496 + }, + { + "epoch": 0.13952835485682202, + "grad_norm": 0.7384021282196045, + "learning_rate": 4.649204864359215e-06, + "loss": 0.5331, + "step": 497 + }, + { + "epoch": 0.13980909601347558, + "grad_norm": 0.8780004978179932, + "learning_rate": 4.658559401309636e-06, + "loss": 0.5129, + "step": 498 + }, + { + "epoch": 0.14008983717012913, + "grad_norm": 0.9964781403541565, + "learning_rate": 4.667913938260056e-06, + "loss": 0.5178, + "step": 499 + }, + { + "epoch": 0.1403705783267827, + "grad_norm": 0.7410679459571838, + "learning_rate": 4.677268475210478e-06, + "loss": 0.495, + "step": 500 + }, + { + "epoch": 0.14065131948343626, + "grad_norm": 0.7884122729301453, + "learning_rate": 4.686623012160898e-06, + "loss": 0.4963, + "step": 501 + }, + { + "epoch": 0.14093206064008984, + "grad_norm": 0.9369380474090576, + "learning_rate": 4.69597754911132e-06, + "loss": 0.4986, + "step": 502 + }, + { + "epoch": 0.1412128017967434, + "grad_norm": 1.0544527769088745, + "learning_rate": 4.7053320860617404e-06, + "loss": 0.5399, + "step": 503 + }, + { + "epoch": 0.14149354295339697, + "grad_norm": 0.7701802849769592, + "learning_rate": 4.714686623012161e-06, + "loss": 0.5134, + "step": 504 + }, + { + "epoch": 0.14177428411005052, + "grad_norm": 0.8956480622291565, + "learning_rate": 4.7240411599625826e-06, + "loss": 0.5135, + "step": 505 + }, + { + "epoch": 0.1420550252667041, + "grad_norm": 1.0529237985610962, + "learning_rate": 4.733395696913003e-06, + "loss": 0.523, + "step": 506 + }, + { + "epoch": 0.14233576642335766, + "grad_norm": 0.8469558358192444, + "learning_rate": 4.742750233863424e-06, + "loss": 0.5178, + "step": 507 + }, + { + "epoch": 0.14261650758001124, + "grad_norm": 0.8177878856658936, + "learning_rate": 4.752104770813845e-06, + "loss": 0.5613, + "step": 508 + }, + { + "epoch": 0.1428972487366648, + "grad_norm": 0.8858785033226013, + "learning_rate": 4.761459307764266e-06, + "loss": 0.491, + "step": 509 + }, + { + "epoch": 0.14317798989331837, + "grad_norm": 0.8501063585281372, + "learning_rate": 4.7708138447146865e-06, + "loss": 0.4929, + "step": 510 + }, + { + "epoch": 0.14345873104997192, + "grad_norm": 0.9841606616973877, + "learning_rate": 4.780168381665108e-06, + "loss": 0.51, + "step": 511 + }, + { + "epoch": 0.1437394722066255, + "grad_norm": 0.7775784730911255, + "learning_rate": 4.789522918615529e-06, + "loss": 0.481, + "step": 512 + }, + { + "epoch": 0.14402021336327905, + "grad_norm": 0.7898653149604797, + "learning_rate": 4.79887745556595e-06, + "loss": 0.4996, + "step": 513 + }, + { + "epoch": 0.14430095451993263, + "grad_norm": 0.7949855923652649, + "learning_rate": 4.808231992516371e-06, + "loss": 0.5754, + "step": 514 + }, + { + "epoch": 0.14458169567658619, + "grad_norm": 0.9908952713012695, + "learning_rate": 4.817586529466791e-06, + "loss": 0.5185, + "step": 515 + }, + { + "epoch": 0.14486243683323977, + "grad_norm": 0.9723168015480042, + "learning_rate": 4.826941066417213e-06, + "loss": 0.4764, + "step": 516 + }, + { + "epoch": 0.14514317798989332, + "grad_norm": 0.8574391603469849, + "learning_rate": 4.836295603367634e-06, + "loss": 0.5662, + "step": 517 + }, + { + "epoch": 0.14542391914654687, + "grad_norm": 0.8870858550071716, + "learning_rate": 4.845650140318054e-06, + "loss": 0.4795, + "step": 518 + }, + { + "epoch": 0.14570466030320045, + "grad_norm": 0.8039255738258362, + "learning_rate": 4.855004677268476e-06, + "loss": 0.4968, + "step": 519 + }, + { + "epoch": 0.145985401459854, + "grad_norm": 0.6876815557479858, + "learning_rate": 4.864359214218897e-06, + "loss": 0.4784, + "step": 520 + }, + { + "epoch": 0.14626614261650758, + "grad_norm": 0.7060603499412537, + "learning_rate": 4.873713751169318e-06, + "loss": 0.4669, + "step": 521 + }, + { + "epoch": 0.14654688377316114, + "grad_norm": 0.7625956535339355, + "learning_rate": 4.883068288119738e-06, + "loss": 0.4737, + "step": 522 + }, + { + "epoch": 0.14682762492981472, + "grad_norm": 0.9577537775039673, + "learning_rate": 4.89242282507016e-06, + "loss": 0.5497, + "step": 523 + }, + { + "epoch": 0.14710836608646827, + "grad_norm": 0.7729389071464539, + "learning_rate": 4.90177736202058e-06, + "loss": 0.5415, + "step": 524 + }, + { + "epoch": 0.14738910724312185, + "grad_norm": 0.8255321383476257, + "learning_rate": 4.911131898971001e-06, + "loss": 0.496, + "step": 525 + }, + { + "epoch": 0.1476698483997754, + "grad_norm": 0.823871374130249, + "learning_rate": 4.9204864359214225e-06, + "loss": 0.5014, + "step": 526 + }, + { + "epoch": 0.14795058955642898, + "grad_norm": 0.7970291376113892, + "learning_rate": 4.929840972871843e-06, + "loss": 0.5102, + "step": 527 + }, + { + "epoch": 0.14823133071308253, + "grad_norm": 0.7161914706230164, + "learning_rate": 4.939195509822265e-06, + "loss": 0.4838, + "step": 528 + }, + { + "epoch": 0.1485120718697361, + "grad_norm": 0.8029255270957947, + "learning_rate": 4.948550046772685e-06, + "loss": 0.5437, + "step": 529 + }, + { + "epoch": 0.14879281302638966, + "grad_norm": 0.7042182087898254, + "learning_rate": 4.957904583723106e-06, + "loss": 0.4834, + "step": 530 + }, + { + "epoch": 0.14907355418304324, + "grad_norm": 0.8169024586677551, + "learning_rate": 4.967259120673527e-06, + "loss": 0.4586, + "step": 531 + }, + { + "epoch": 0.1493542953396968, + "grad_norm": 0.8217435479164124, + "learning_rate": 4.976613657623948e-06, + "loss": 0.5296, + "step": 532 + }, + { + "epoch": 0.14963503649635038, + "grad_norm": 0.7397123575210571, + "learning_rate": 4.985968194574369e-06, + "loss": 0.5598, + "step": 533 + }, + { + "epoch": 0.14991577765300393, + "grad_norm": 0.8947880864143372, + "learning_rate": 4.99532273152479e-06, + "loss": 0.5462, + "step": 534 + }, + { + "epoch": 0.1501965188096575, + "grad_norm": 0.7352718710899353, + "learning_rate": 5.004677268475211e-06, + "loss": 0.4843, + "step": 535 + }, + { + "epoch": 0.15047725996631106, + "grad_norm": 0.7786549925804138, + "learning_rate": 5.014031805425632e-06, + "loss": 0.4706, + "step": 536 + }, + { + "epoch": 0.1507580011229646, + "grad_norm": 0.7344022989273071, + "learning_rate": 5.023386342376053e-06, + "loss": 0.5204, + "step": 537 + }, + { + "epoch": 0.1510387422796182, + "grad_norm": 0.8131575584411621, + "learning_rate": 5.032740879326474e-06, + "loss": 0.5751, + "step": 538 + }, + { + "epoch": 0.15131948343627175, + "grad_norm": 0.7318029999732971, + "learning_rate": 5.042095416276894e-06, + "loss": 0.4698, + "step": 539 + }, + { + "epoch": 0.15160022459292533, + "grad_norm": 0.8344904780387878, + "learning_rate": 5.0514499532273156e-06, + "loss": 0.4934, + "step": 540 + }, + { + "epoch": 0.15188096574957888, + "grad_norm": 0.6257030963897705, + "learning_rate": 5.060804490177736e-06, + "loss": 0.4479, + "step": 541 + }, + { + "epoch": 0.15216170690623246, + "grad_norm": 0.8552340865135193, + "learning_rate": 5.070159027128158e-06, + "loss": 0.4709, + "step": 542 + }, + { + "epoch": 0.152442448062886, + "grad_norm": 0.7414129376411438, + "learning_rate": 5.079513564078578e-06, + "loss": 0.5039, + "step": 543 + }, + { + "epoch": 0.1527231892195396, + "grad_norm": 0.7615332007408142, + "learning_rate": 5.088868101029e-06, + "loss": 0.5441, + "step": 544 + }, + { + "epoch": 0.15300393037619314, + "grad_norm": 0.690945565700531, + "learning_rate": 5.098222637979421e-06, + "loss": 0.5114, + "step": 545 + }, + { + "epoch": 0.15328467153284672, + "grad_norm": 0.6874892115592957, + "learning_rate": 5.107577174929841e-06, + "loss": 0.4722, + "step": 546 + }, + { + "epoch": 0.15356541268950027, + "grad_norm": 0.7465002536773682, + "learning_rate": 5.116931711880262e-06, + "loss": 0.4996, + "step": 547 + }, + { + "epoch": 0.15384615384615385, + "grad_norm": 0.7759626507759094, + "learning_rate": 5.126286248830683e-06, + "loss": 0.5134, + "step": 548 + }, + { + "epoch": 0.1541268950028074, + "grad_norm": 0.8571668863296509, + "learning_rate": 5.135640785781104e-06, + "loss": 0.5298, + "step": 549 + }, + { + "epoch": 0.154407636159461, + "grad_norm": 0.7905661463737488, + "learning_rate": 5.144995322731525e-06, + "loss": 0.5319, + "step": 550 + }, + { + "epoch": 0.15468837731611454, + "grad_norm": 0.7992151975631714, + "learning_rate": 5.154349859681947e-06, + "loss": 0.534, + "step": 551 + }, + { + "epoch": 0.15496911847276812, + "grad_norm": 0.8805502653121948, + "learning_rate": 5.163704396632367e-06, + "loss": 0.5318, + "step": 552 + }, + { + "epoch": 0.15524985962942167, + "grad_norm": 0.894903302192688, + "learning_rate": 5.173058933582789e-06, + "loss": 0.565, + "step": 553 + }, + { + "epoch": 0.15553060078607525, + "grad_norm": 0.8050834536552429, + "learning_rate": 5.182413470533209e-06, + "loss": 0.4873, + "step": 554 + }, + { + "epoch": 0.1558113419427288, + "grad_norm": 0.9257166385650635, + "learning_rate": 5.191768007483629e-06, + "loss": 0.5192, + "step": 555 + }, + { + "epoch": 0.15609208309938236, + "grad_norm": 0.7694621682167053, + "learning_rate": 5.201122544434051e-06, + "loss": 0.5001, + "step": 556 + }, + { + "epoch": 0.15637282425603594, + "grad_norm": 0.8481784462928772, + "learning_rate": 5.210477081384472e-06, + "loss": 0.4991, + "step": 557 + }, + { + "epoch": 0.1566535654126895, + "grad_norm": 0.8397626876831055, + "learning_rate": 5.219831618334893e-06, + "loss": 0.4966, + "step": 558 + }, + { + "epoch": 0.15693430656934307, + "grad_norm": 0.7801057696342468, + "learning_rate": 5.229186155285314e-06, + "loss": 0.4751, + "step": 559 + }, + { + "epoch": 0.15721504772599662, + "grad_norm": 0.7261902689933777, + "learning_rate": 5.238540692235735e-06, + "loss": 0.4794, + "step": 560 + }, + { + "epoch": 0.1574957888826502, + "grad_norm": 0.8400854468345642, + "learning_rate": 5.247895229186156e-06, + "loss": 0.465, + "step": 561 + }, + { + "epoch": 0.15777653003930375, + "grad_norm": 0.8911288976669312, + "learning_rate": 5.257249766136576e-06, + "loss": 0.4677, + "step": 562 + }, + { + "epoch": 0.15805727119595733, + "grad_norm": 0.8608343601226807, + "learning_rate": 5.266604303086998e-06, + "loss": 0.501, + "step": 563 + }, + { + "epoch": 0.15833801235261089, + "grad_norm": 0.8770518898963928, + "learning_rate": 5.275958840037418e-06, + "loss": 0.5028, + "step": 564 + }, + { + "epoch": 0.15861875350926447, + "grad_norm": 0.8536120057106018, + "learning_rate": 5.28531337698784e-06, + "loss": 0.5544, + "step": 565 + }, + { + "epoch": 0.15889949466591802, + "grad_norm": 0.9643396735191345, + "learning_rate": 5.29466791393826e-06, + "loss": 0.5322, + "step": 566 + }, + { + "epoch": 0.1591802358225716, + "grad_norm": 0.793518602848053, + "learning_rate": 5.304022450888682e-06, + "loss": 0.5367, + "step": 567 + }, + { + "epoch": 0.15946097697922515, + "grad_norm": 0.7644721865653992, + "learning_rate": 5.3133769878391025e-06, + "loss": 0.4954, + "step": 568 + }, + { + "epoch": 0.15974171813587873, + "grad_norm": 0.7848473191261292, + "learning_rate": 5.322731524789523e-06, + "loss": 0.5036, + "step": 569 + }, + { + "epoch": 0.16002245929253228, + "grad_norm": 0.7806656956672668, + "learning_rate": 5.332086061739944e-06, + "loss": 0.4985, + "step": 570 + }, + { + "epoch": 0.16030320044918586, + "grad_norm": 0.8278757333755493, + "learning_rate": 5.341440598690365e-06, + "loss": 0.4639, + "step": 571 + }, + { + "epoch": 0.16058394160583941, + "grad_norm": 0.7348732948303223, + "learning_rate": 5.350795135640786e-06, + "loss": 0.4956, + "step": 572 + }, + { + "epoch": 0.160864682762493, + "grad_norm": 0.8381409645080566, + "learning_rate": 5.360149672591207e-06, + "loss": 0.4553, + "step": 573 + }, + { + "epoch": 0.16114542391914655, + "grad_norm": 0.7404391765594482, + "learning_rate": 5.369504209541628e-06, + "loss": 0.5078, + "step": 574 + }, + { + "epoch": 0.1614261650758001, + "grad_norm": 0.8644616007804871, + "learning_rate": 5.378858746492049e-06, + "loss": 0.4824, + "step": 575 + }, + { + "epoch": 0.16170690623245368, + "grad_norm": 0.8510028719902039, + "learning_rate": 5.388213283442471e-06, + "loss": 0.4946, + "step": 576 + }, + { + "epoch": 0.16198764738910723, + "grad_norm": 0.7837504744529724, + "learning_rate": 5.397567820392891e-06, + "loss": 0.5025, + "step": 577 + }, + { + "epoch": 0.1622683885457608, + "grad_norm": 0.8370034098625183, + "learning_rate": 5.406922357343311e-06, + "loss": 0.4808, + "step": 578 + }, + { + "epoch": 0.16254912970241436, + "grad_norm": 0.8849330544471741, + "learning_rate": 5.416276894293733e-06, + "loss": 0.4866, + "step": 579 + }, + { + "epoch": 0.16282987085906794, + "grad_norm": 0.775743305683136, + "learning_rate": 5.425631431244153e-06, + "loss": 0.4642, + "step": 580 + }, + { + "epoch": 0.1631106120157215, + "grad_norm": 0.8211396336555481, + "learning_rate": 5.434985968194575e-06, + "loss": 0.4993, + "step": 581 + }, + { + "epoch": 0.16339135317237508, + "grad_norm": 0.8382355570793152, + "learning_rate": 5.444340505144996e-06, + "loss": 0.4747, + "step": 582 + }, + { + "epoch": 0.16367209432902863, + "grad_norm": 0.8368803858757019, + "learning_rate": 5.453695042095417e-06, + "loss": 0.461, + "step": 583 + }, + { + "epoch": 0.1639528354856822, + "grad_norm": 0.7276535034179688, + "learning_rate": 5.4630495790458385e-06, + "loss": 0.4788, + "step": 584 + }, + { + "epoch": 0.16423357664233576, + "grad_norm": 0.8614891171455383, + "learning_rate": 5.472404115996258e-06, + "loss": 0.5592, + "step": 585 + }, + { + "epoch": 0.16451431779898934, + "grad_norm": 0.7511591911315918, + "learning_rate": 5.48175865294668e-06, + "loss": 0.4818, + "step": 586 + }, + { + "epoch": 0.1647950589556429, + "grad_norm": 0.7985215187072754, + "learning_rate": 5.4911131898971e-06, + "loss": 0.4707, + "step": 587 + }, + { + "epoch": 0.16507580011229647, + "grad_norm": 0.9363226890563965, + "learning_rate": 5.500467726847522e-06, + "loss": 0.4896, + "step": 588 + }, + { + "epoch": 0.16535654126895002, + "grad_norm": 0.7941949367523193, + "learning_rate": 5.5098222637979424e-06, + "loss": 0.5211, + "step": 589 + }, + { + "epoch": 0.1656372824256036, + "grad_norm": 0.8037974834442139, + "learning_rate": 5.519176800748364e-06, + "loss": 0.5263, + "step": 590 + }, + { + "epoch": 0.16591802358225716, + "grad_norm": 0.7701349854469299, + "learning_rate": 5.5285313376987846e-06, + "loss": 0.5092, + "step": 591 + }, + { + "epoch": 0.16619876473891074, + "grad_norm": 0.7301192879676819, + "learning_rate": 5.537885874649205e-06, + "loss": 0.4826, + "step": 592 + }, + { + "epoch": 0.1664795058955643, + "grad_norm": 0.8852382898330688, + "learning_rate": 5.547240411599626e-06, + "loss": 0.5481, + "step": 593 + }, + { + "epoch": 0.16676024705221784, + "grad_norm": 0.8560103178024292, + "learning_rate": 5.556594948550047e-06, + "loss": 0.5154, + "step": 594 + }, + { + "epoch": 0.16704098820887142, + "grad_norm": 0.7502108812332153, + "learning_rate": 5.565949485500468e-06, + "loss": 0.5251, + "step": 595 + }, + { + "epoch": 0.16732172936552497, + "grad_norm": 0.7865407466888428, + "learning_rate": 5.575304022450889e-06, + "loss": 0.5201, + "step": 596 + }, + { + "epoch": 0.16760247052217855, + "grad_norm": 0.7632266283035278, + "learning_rate": 5.58465855940131e-06, + "loss": 0.4567, + "step": 597 + }, + { + "epoch": 0.1678832116788321, + "grad_norm": 0.83583664894104, + "learning_rate": 5.5940130963517315e-06, + "loss": 0.5082, + "step": 598 + }, + { + "epoch": 0.16816395283548569, + "grad_norm": 0.8887547254562378, + "learning_rate": 5.603367633302152e-06, + "loss": 0.5472, + "step": 599 + }, + { + "epoch": 0.16844469399213924, + "grad_norm": 0.712386965751648, + "learning_rate": 5.612722170252573e-06, + "loss": 0.4863, + "step": 600 + }, + { + "epoch": 0.16872543514879282, + "grad_norm": 0.7156517505645752, + "learning_rate": 5.622076707202993e-06, + "loss": 0.4614, + "step": 601 + }, + { + "epoch": 0.16900617630544637, + "grad_norm": 0.7918566465377808, + "learning_rate": 5.631431244153415e-06, + "loss": 0.5613, + "step": 602 + }, + { + "epoch": 0.16928691746209995, + "grad_norm": 0.7155835628509521, + "learning_rate": 5.6407857811038355e-06, + "loss": 0.4798, + "step": 603 + }, + { + "epoch": 0.1695676586187535, + "grad_norm": 0.7834767699241638, + "learning_rate": 5.650140318054257e-06, + "loss": 0.4335, + "step": 604 + }, + { + "epoch": 0.16984839977540708, + "grad_norm": 0.6875774264335632, + "learning_rate": 5.659494855004678e-06, + "loss": 0.4752, + "step": 605 + }, + { + "epoch": 0.17012914093206064, + "grad_norm": 0.8880415558815002, + "learning_rate": 5.668849391955099e-06, + "loss": 0.5201, + "step": 606 + }, + { + "epoch": 0.17040988208871422, + "grad_norm": 0.8246658444404602, + "learning_rate": 5.6782039289055205e-06, + "loss": 0.5114, + "step": 607 + }, + { + "epoch": 0.17069062324536777, + "grad_norm": 0.8733250498771667, + "learning_rate": 5.68755846585594e-06, + "loss": 0.541, + "step": 608 + }, + { + "epoch": 0.17097136440202135, + "grad_norm": 0.7933526635169983, + "learning_rate": 5.696913002806361e-06, + "loss": 0.4487, + "step": 609 + }, + { + "epoch": 0.1712521055586749, + "grad_norm": 0.842877984046936, + "learning_rate": 5.706267539756782e-06, + "loss": 0.51, + "step": 610 + }, + { + "epoch": 0.17153284671532848, + "grad_norm": 0.8397455811500549, + "learning_rate": 5.715622076707204e-06, + "loss": 0.5534, + "step": 611 + }, + { + "epoch": 0.17181358787198203, + "grad_norm": 0.7597702741622925, + "learning_rate": 5.7249766136576245e-06, + "loss": 0.5146, + "step": 612 + }, + { + "epoch": 0.17209432902863558, + "grad_norm": 0.849120557308197, + "learning_rate": 5.734331150608046e-06, + "loss": 0.4783, + "step": 613 + }, + { + "epoch": 0.17237507018528916, + "grad_norm": 0.7869701981544495, + "learning_rate": 5.743685687558467e-06, + "loss": 0.5101, + "step": 614 + }, + { + "epoch": 0.17265581134194272, + "grad_norm": 0.7406596541404724, + "learning_rate": 5.753040224508886e-06, + "loss": 0.4854, + "step": 615 + }, + { + "epoch": 0.1729365524985963, + "grad_norm": 0.6839237213134766, + "learning_rate": 5.762394761459308e-06, + "loss": 0.4381, + "step": 616 + }, + { + "epoch": 0.17321729365524985, + "grad_norm": 0.8371248245239258, + "learning_rate": 5.771749298409729e-06, + "loss": 0.5202, + "step": 617 + }, + { + "epoch": 0.17349803481190343, + "grad_norm": 0.6959989666938782, + "learning_rate": 5.78110383536015e-06, + "loss": 0.4705, + "step": 618 + }, + { + "epoch": 0.17377877596855698, + "grad_norm": 0.7967045903205872, + "learning_rate": 5.7904583723105715e-06, + "loss": 0.5166, + "step": 619 + }, + { + "epoch": 0.17405951712521056, + "grad_norm": 0.8553176522254944, + "learning_rate": 5.799812909260992e-06, + "loss": 0.5272, + "step": 620 + }, + { + "epoch": 0.1743402582818641, + "grad_norm": 0.8010025024414062, + "learning_rate": 5.8091674462114136e-06, + "loss": 0.475, + "step": 621 + }, + { + "epoch": 0.1746209994385177, + "grad_norm": 0.7816404104232788, + "learning_rate": 5.818521983161834e-06, + "loss": 0.5193, + "step": 622 + }, + { + "epoch": 0.17490174059517125, + "grad_norm": 0.7686100602149963, + "learning_rate": 5.827876520112255e-06, + "loss": 0.5058, + "step": 623 + }, + { + "epoch": 0.17518248175182483, + "grad_norm": 0.8996552228927612, + "learning_rate": 5.8372310570626755e-06, + "loss": 0.5281, + "step": 624 + }, + { + "epoch": 0.17546322290847838, + "grad_norm": 0.9018763899803162, + "learning_rate": 5.846585594013097e-06, + "loss": 0.4906, + "step": 625 + }, + { + "epoch": 0.17574396406513196, + "grad_norm": 0.8745623826980591, + "learning_rate": 5.8559401309635176e-06, + "loss": 0.5317, + "step": 626 + }, + { + "epoch": 0.1760247052217855, + "grad_norm": 0.8026597499847412, + "learning_rate": 5.865294667913939e-06, + "loss": 0.4755, + "step": 627 + }, + { + "epoch": 0.1763054463784391, + "grad_norm": 0.8113706111907959, + "learning_rate": 5.87464920486436e-06, + "loss": 0.4656, + "step": 628 + }, + { + "epoch": 0.17658618753509264, + "grad_norm": 0.7952340245246887, + "learning_rate": 5.884003741814781e-06, + "loss": 0.5326, + "step": 629 + }, + { + "epoch": 0.17686692869174622, + "grad_norm": 0.7512227892875671, + "learning_rate": 5.893358278765201e-06, + "loss": 0.4843, + "step": 630 + }, + { + "epoch": 0.17714766984839977, + "grad_norm": 0.7184258103370667, + "learning_rate": 5.902712815715622e-06, + "loss": 0.547, + "step": 631 + }, + { + "epoch": 0.17742841100505333, + "grad_norm": 0.7102201581001282, + "learning_rate": 5.912067352666043e-06, + "loss": 0.5041, + "step": 632 + }, + { + "epoch": 0.1777091521617069, + "grad_norm": 0.7374149560928345, + "learning_rate": 5.9214218896164645e-06, + "loss": 0.4294, + "step": 633 + }, + { + "epoch": 0.17798989331836046, + "grad_norm": 0.7687684893608093, + "learning_rate": 5.930776426566885e-06, + "loss": 0.5131, + "step": 634 + }, + { + "epoch": 0.17827063447501404, + "grad_norm": 0.6948454976081848, + "learning_rate": 5.940130963517307e-06, + "loss": 0.475, + "step": 635 + }, + { + "epoch": 0.1785513756316676, + "grad_norm": 0.7867432832717896, + "learning_rate": 5.949485500467728e-06, + "loss": 0.4894, + "step": 636 + }, + { + "epoch": 0.17883211678832117, + "grad_norm": 0.8605855107307434, + "learning_rate": 5.958840037418149e-06, + "loss": 0.4981, + "step": 637 + }, + { + "epoch": 0.17911285794497472, + "grad_norm": 0.838919460773468, + "learning_rate": 5.9681945743685685e-06, + "loss": 0.4816, + "step": 638 + }, + { + "epoch": 0.1793935991016283, + "grad_norm": 0.7709580063819885, + "learning_rate": 5.97754911131899e-06, + "loss": 0.5236, + "step": 639 + }, + { + "epoch": 0.17967434025828186, + "grad_norm": 0.8052390217781067, + "learning_rate": 5.986903648269411e-06, + "loss": 0.5261, + "step": 640 + }, + { + "epoch": 0.17995508141493544, + "grad_norm": 0.8202112317085266, + "learning_rate": 5.996258185219832e-06, + "loss": 0.5035, + "step": 641 + }, + { + "epoch": 0.180235822571589, + "grad_norm": 0.8466426134109497, + "learning_rate": 6.0056127221702535e-06, + "loss": 0.5071, + "step": 642 + }, + { + "epoch": 0.18051656372824257, + "grad_norm": 0.8519512414932251, + "learning_rate": 6.014967259120674e-06, + "loss": 0.4966, + "step": 643 + }, + { + "epoch": 0.18079730488489612, + "grad_norm": 0.8277428150177002, + "learning_rate": 6.024321796071096e-06, + "loss": 0.4786, + "step": 644 + }, + { + "epoch": 0.1810780460415497, + "grad_norm": 0.7670875787734985, + "learning_rate": 6.033676333021516e-06, + "loss": 0.5597, + "step": 645 + }, + { + "epoch": 0.18135878719820325, + "grad_norm": 0.7683556079864502, + "learning_rate": 6.043030869971936e-06, + "loss": 0.4834, + "step": 646 + }, + { + "epoch": 0.18163952835485683, + "grad_norm": 0.9911069273948669, + "learning_rate": 6.0523854069223575e-06, + "loss": 0.5175, + "step": 647 + }, + { + "epoch": 0.18192026951151039, + "grad_norm": 1.0118271112442017, + "learning_rate": 6.061739943872779e-06, + "loss": 0.5203, + "step": 648 + }, + { + "epoch": 0.18220101066816397, + "grad_norm": 0.7321862578392029, + "learning_rate": 6.0710944808232e-06, + "loss": 0.4718, + "step": 649 + }, + { + "epoch": 0.18248175182481752, + "grad_norm": 0.8685011863708496, + "learning_rate": 6.080449017773621e-06, + "loss": 0.5076, + "step": 650 + }, + { + "epoch": 0.1827624929814711, + "grad_norm": 0.8869417309761047, + "learning_rate": 6.089803554724042e-06, + "loss": 0.4809, + "step": 651 + }, + { + "epoch": 0.18304323413812465, + "grad_norm": 0.8386750817298889, + "learning_rate": 6.099158091674463e-06, + "loss": 0.4401, + "step": 652 + }, + { + "epoch": 0.1833239752947782, + "grad_norm": 0.9820486307144165, + "learning_rate": 6.108512628624883e-06, + "loss": 0.5499, + "step": 653 + }, + { + "epoch": 0.18360471645143178, + "grad_norm": 0.8415496349334717, + "learning_rate": 6.1178671655753045e-06, + "loss": 0.4853, + "step": 654 + }, + { + "epoch": 0.18388545760808533, + "grad_norm": 0.9291137456893921, + "learning_rate": 6.127221702525725e-06, + "loss": 0.5085, + "step": 655 + }, + { + "epoch": 0.18416619876473891, + "grad_norm": 0.8530355095863342, + "learning_rate": 6.136576239476147e-06, + "loss": 0.5417, + "step": 656 + }, + { + "epoch": 0.18444693992139247, + "grad_norm": 0.8005567193031311, + "learning_rate": 6.145930776426567e-06, + "loss": 0.4913, + "step": 657 + }, + { + "epoch": 0.18472768107804605, + "grad_norm": 0.6735771298408508, + "learning_rate": 6.155285313376989e-06, + "loss": 0.4819, + "step": 658 + }, + { + "epoch": 0.1850084222346996, + "grad_norm": 0.8707764148712158, + "learning_rate": 6.164639850327409e-06, + "loss": 0.5603, + "step": 659 + }, + { + "epoch": 0.18528916339135318, + "grad_norm": 0.799589216709137, + "learning_rate": 6.173994387277831e-06, + "loss": 0.4453, + "step": 660 + }, + { + "epoch": 0.18556990454800673, + "grad_norm": 0.8446917533874512, + "learning_rate": 6.1833489242282506e-06, + "loss": 0.4905, + "step": 661 + }, + { + "epoch": 0.1858506457046603, + "grad_norm": 0.781111478805542, + "learning_rate": 6.192703461178672e-06, + "loss": 0.4821, + "step": 662 + }, + { + "epoch": 0.18613138686131386, + "grad_norm": 0.8124823570251465, + "learning_rate": 6.202057998129093e-06, + "loss": 0.5284, + "step": 663 + }, + { + "epoch": 0.18641212801796744, + "grad_norm": 0.8359686136245728, + "learning_rate": 6.211412535079514e-06, + "loss": 0.4538, + "step": 664 + }, + { + "epoch": 0.186692869174621, + "grad_norm": 0.6956129670143127, + "learning_rate": 6.220767072029935e-06, + "loss": 0.5337, + "step": 665 + }, + { + "epoch": 0.18697361033127458, + "grad_norm": 0.7677740454673767, + "learning_rate": 6.230121608980356e-06, + "loss": 0.4858, + "step": 666 + }, + { + "epoch": 0.18725435148792813, + "grad_norm": 0.796335756778717, + "learning_rate": 6.239476145930778e-06, + "loss": 0.5479, + "step": 667 + }, + { + "epoch": 0.1875350926445817, + "grad_norm": 0.6870970129966736, + "learning_rate": 6.248830682881198e-06, + "loss": 0.4741, + "step": 668 + }, + { + "epoch": 0.18781583380123526, + "grad_norm": 0.674141526222229, + "learning_rate": 6.258185219831618e-06, + "loss": 0.4668, + "step": 669 + }, + { + "epoch": 0.18809657495788884, + "grad_norm": 0.7898195385932922, + "learning_rate": 6.26753975678204e-06, + "loss": 0.486, + "step": 670 + }, + { + "epoch": 0.1883773161145424, + "grad_norm": 0.8173038363456726, + "learning_rate": 6.27689429373246e-06, + "loss": 0.4851, + "step": 671 + }, + { + "epoch": 0.18865805727119594, + "grad_norm": 0.6723657846450806, + "learning_rate": 6.286248830682882e-06, + "loss": 0.4558, + "step": 672 + }, + { + "epoch": 0.18893879842784952, + "grad_norm": 0.7023763060569763, + "learning_rate": 6.295603367633303e-06, + "loss": 0.4698, + "step": 673 + }, + { + "epoch": 0.18921953958450308, + "grad_norm": 0.6620508432388306, + "learning_rate": 6.304957904583724e-06, + "loss": 0.4724, + "step": 674 + }, + { + "epoch": 0.18950028074115666, + "grad_norm": 0.7251138091087341, + "learning_rate": 6.314312441534145e-06, + "loss": 0.4584, + "step": 675 + }, + { + "epoch": 0.1897810218978102, + "grad_norm": 0.8622398972511292, + "learning_rate": 6.323666978484565e-06, + "loss": 0.5493, + "step": 676 + }, + { + "epoch": 0.1900617630544638, + "grad_norm": 0.8090211153030396, + "learning_rate": 6.333021515434986e-06, + "loss": 0.4458, + "step": 677 + }, + { + "epoch": 0.19034250421111734, + "grad_norm": 0.80698162317276, + "learning_rate": 6.342376052385407e-06, + "loss": 0.5148, + "step": 678 + }, + { + "epoch": 0.19062324536777092, + "grad_norm": 0.8406893014907837, + "learning_rate": 6.351730589335829e-06, + "loss": 0.4809, + "step": 679 + }, + { + "epoch": 0.19090398652442447, + "grad_norm": 0.8074467182159424, + "learning_rate": 6.361085126286249e-06, + "loss": 0.513, + "step": 680 + }, + { + "epoch": 0.19118472768107805, + "grad_norm": 0.7587609887123108, + "learning_rate": 6.370439663236671e-06, + "loss": 0.5189, + "step": 681 + }, + { + "epoch": 0.1914654688377316, + "grad_norm": 0.7114958763122559, + "learning_rate": 6.379794200187091e-06, + "loss": 0.4952, + "step": 682 + }, + { + "epoch": 0.19174620999438519, + "grad_norm": 0.8224939107894897, + "learning_rate": 6.389148737137513e-06, + "loss": 0.4878, + "step": 683 + }, + { + "epoch": 0.19202695115103874, + "grad_norm": 0.749329686164856, + "learning_rate": 6.398503274087933e-06, + "loss": 0.478, + "step": 684 + }, + { + "epoch": 0.19230769230769232, + "grad_norm": 0.7548496127128601, + "learning_rate": 6.407857811038354e-06, + "loss": 0.428, + "step": 685 + }, + { + "epoch": 0.19258843346434587, + "grad_norm": 0.8775320053100586, + "learning_rate": 6.417212347988775e-06, + "loss": 0.5282, + "step": 686 + }, + { + "epoch": 0.19286917462099945, + "grad_norm": 0.8071384429931641, + "learning_rate": 6.426566884939196e-06, + "loss": 0.4887, + "step": 687 + }, + { + "epoch": 0.193149915777653, + "grad_norm": 0.7695320844650269, + "learning_rate": 6.435921421889617e-06, + "loss": 0.5031, + "step": 688 + }, + { + "epoch": 0.19343065693430658, + "grad_norm": 0.8379629254341125, + "learning_rate": 6.445275958840038e-06, + "loss": 0.5068, + "step": 689 + }, + { + "epoch": 0.19371139809096014, + "grad_norm": 0.8163147568702698, + "learning_rate": 6.454630495790459e-06, + "loss": 0.5065, + "step": 690 + }, + { + "epoch": 0.1939921392476137, + "grad_norm": 0.8712754845619202, + "learning_rate": 6.4639850327408804e-06, + "loss": 0.5463, + "step": 691 + }, + { + "epoch": 0.19427288040426727, + "grad_norm": 0.7699676156044006, + "learning_rate": 6.4733395696913e-06, + "loss": 0.5188, + "step": 692 + }, + { + "epoch": 0.19455362156092082, + "grad_norm": 0.8428667187690735, + "learning_rate": 6.482694106641722e-06, + "loss": 0.4543, + "step": 693 + }, + { + "epoch": 0.1948343627175744, + "grad_norm": 0.8588039875030518, + "learning_rate": 6.492048643592142e-06, + "loss": 0.4977, + "step": 694 + }, + { + "epoch": 0.19511510387422795, + "grad_norm": 0.8210405707359314, + "learning_rate": 6.501403180542564e-06, + "loss": 0.4532, + "step": 695 + }, + { + "epoch": 0.19539584503088153, + "grad_norm": 0.6708239912986755, + "learning_rate": 6.5107577174929844e-06, + "loss": 0.4776, + "step": 696 + }, + { + "epoch": 0.19567658618753508, + "grad_norm": 0.6946752667427063, + "learning_rate": 6.520112254443406e-06, + "loss": 0.4976, + "step": 697 + }, + { + "epoch": 0.19595732734418866, + "grad_norm": 0.7242135405540466, + "learning_rate": 6.529466791393827e-06, + "loss": 0.5051, + "step": 698 + }, + { + "epoch": 0.19623806850084222, + "grad_norm": 0.8506885766983032, + "learning_rate": 6.538821328344247e-06, + "loss": 0.5328, + "step": 699 + }, + { + "epoch": 0.1965188096574958, + "grad_norm": 0.7978003025054932, + "learning_rate": 6.548175865294668e-06, + "loss": 0.5217, + "step": 700 + }, + { + "epoch": 0.19679955081414935, + "grad_norm": 0.771725058555603, + "learning_rate": 6.557530402245089e-06, + "loss": 0.4604, + "step": 701 + }, + { + "epoch": 0.19708029197080293, + "grad_norm": 0.8751739859580994, + "learning_rate": 6.56688493919551e-06, + "loss": 0.5024, + "step": 702 + }, + { + "epoch": 0.19736103312745648, + "grad_norm": 0.744221031665802, + "learning_rate": 6.576239476145931e-06, + "loss": 0.4438, + "step": 703 + }, + { + "epoch": 0.19764177428411006, + "grad_norm": 0.7264887690544128, + "learning_rate": 6.585594013096353e-06, + "loss": 0.4674, + "step": 704 + }, + { + "epoch": 0.1979225154407636, + "grad_norm": 0.7788257002830505, + "learning_rate": 6.5949485500467735e-06, + "loss": 0.532, + "step": 705 + }, + { + "epoch": 0.1982032565974172, + "grad_norm": 0.8833885192871094, + "learning_rate": 6.604303086997195e-06, + "loss": 0.4563, + "step": 706 + }, + { + "epoch": 0.19848399775407075, + "grad_norm": 0.7958397269248962, + "learning_rate": 6.613657623947615e-06, + "loss": 0.476, + "step": 707 + }, + { + "epoch": 0.19876473891072433, + "grad_norm": 0.8469579219818115, + "learning_rate": 6.623012160898036e-06, + "loss": 0.5257, + "step": 708 + }, + { + "epoch": 0.19904548006737788, + "grad_norm": 0.7643548250198364, + "learning_rate": 6.632366697848457e-06, + "loss": 0.4322, + "step": 709 + }, + { + "epoch": 0.19932622122403143, + "grad_norm": 0.7553983926773071, + "learning_rate": 6.641721234798878e-06, + "loss": 0.5362, + "step": 710 + }, + { + "epoch": 0.199606962380685, + "grad_norm": 0.8119388818740845, + "learning_rate": 6.651075771749299e-06, + "loss": 0.5136, + "step": 711 + }, + { + "epoch": 0.19988770353733856, + "grad_norm": 0.7699570059776306, + "learning_rate": 6.66043030869972e-06, + "loss": 0.4586, + "step": 712 + }, + { + "epoch": 0.20016844469399214, + "grad_norm": 0.8307033181190491, + "learning_rate": 6.669784845650141e-06, + "loss": 0.4792, + "step": 713 + }, + { + "epoch": 0.2004491858506457, + "grad_norm": 0.7847017645835876, + "learning_rate": 6.679139382600562e-06, + "loss": 0.4868, + "step": 714 + }, + { + "epoch": 0.20072992700729927, + "grad_norm": 0.7745780348777771, + "learning_rate": 6.688493919550982e-06, + "loss": 0.472, + "step": 715 + }, + { + "epoch": 0.20101066816395283, + "grad_norm": 1.0744963884353638, + "learning_rate": 6.697848456501404e-06, + "loss": 0.4936, + "step": 716 + }, + { + "epoch": 0.2012914093206064, + "grad_norm": 0.814922571182251, + "learning_rate": 6.707202993451824e-06, + "loss": 0.4654, + "step": 717 + }, + { + "epoch": 0.20157215047725996, + "grad_norm": 0.7702688574790955, + "learning_rate": 6.716557530402246e-06, + "loss": 0.477, + "step": 718 + }, + { + "epoch": 0.20185289163391354, + "grad_norm": 0.9415417313575745, + "learning_rate": 6.7259120673526665e-06, + "loss": 0.5186, + "step": 719 + }, + { + "epoch": 0.2021336327905671, + "grad_norm": 0.8672822117805481, + "learning_rate": 6.735266604303088e-06, + "loss": 0.4938, + "step": 720 + }, + { + "epoch": 0.20241437394722067, + "grad_norm": 0.8722926378250122, + "learning_rate": 6.744621141253509e-06, + "loss": 0.5423, + "step": 721 + }, + { + "epoch": 0.20269511510387422, + "grad_norm": 0.7042394876480103, + "learning_rate": 6.753975678203929e-06, + "loss": 0.4621, + "step": 722 + }, + { + "epoch": 0.2029758562605278, + "grad_norm": 0.7742260694503784, + "learning_rate": 6.76333021515435e-06, + "loss": 0.4619, + "step": 723 + }, + { + "epoch": 0.20325659741718136, + "grad_norm": 0.7998137474060059, + "learning_rate": 6.772684752104771e-06, + "loss": 0.475, + "step": 724 + }, + { + "epoch": 0.20353733857383494, + "grad_norm": 0.7976365089416504, + "learning_rate": 6.782039289055192e-06, + "loss": 0.4698, + "step": 725 + }, + { + "epoch": 0.2038180797304885, + "grad_norm": 0.8283064365386963, + "learning_rate": 6.7913938260056134e-06, + "loss": 0.4838, + "step": 726 + }, + { + "epoch": 0.20409882088714207, + "grad_norm": 0.7168331146240234, + "learning_rate": 6.800748362956034e-06, + "loss": 0.537, + "step": 727 + }, + { + "epoch": 0.20437956204379562, + "grad_norm": 0.9389481544494629, + "learning_rate": 6.8101028999064555e-06, + "loss": 0.578, + "step": 728 + }, + { + "epoch": 0.20466030320044917, + "grad_norm": 0.8230177760124207, + "learning_rate": 6.819457436856877e-06, + "loss": 0.5107, + "step": 729 + }, + { + "epoch": 0.20494104435710275, + "grad_norm": 0.8442697525024414, + "learning_rate": 6.828811973807297e-06, + "loss": 0.4883, + "step": 730 + }, + { + "epoch": 0.2052217855137563, + "grad_norm": 0.8169631958007812, + "learning_rate": 6.8381665107577174e-06, + "loss": 0.4912, + "step": 731 + }, + { + "epoch": 0.20550252667040989, + "grad_norm": 0.7780888676643372, + "learning_rate": 6.847521047708139e-06, + "loss": 0.5034, + "step": 732 + }, + { + "epoch": 0.20578326782706344, + "grad_norm": 0.9030914306640625, + "learning_rate": 6.8568755846585595e-06, + "loss": 0.5266, + "step": 733 + }, + { + "epoch": 0.20606400898371702, + "grad_norm": 0.759200394153595, + "learning_rate": 6.866230121608981e-06, + "loss": 0.5234, + "step": 734 + }, + { + "epoch": 0.20634475014037057, + "grad_norm": 0.7931351661682129, + "learning_rate": 6.8755846585594025e-06, + "loss": 0.456, + "step": 735 + }, + { + "epoch": 0.20662549129702415, + "grad_norm": 0.8052343130111694, + "learning_rate": 6.884939195509823e-06, + "loss": 0.4853, + "step": 736 + }, + { + "epoch": 0.2069062324536777, + "grad_norm": 0.7927553653717041, + "learning_rate": 6.894293732460243e-06, + "loss": 0.4991, + "step": 737 + }, + { + "epoch": 0.20718697361033128, + "grad_norm": 0.9182446599006653, + "learning_rate": 6.903648269410664e-06, + "loss": 0.479, + "step": 738 + }, + { + "epoch": 0.20746771476698483, + "grad_norm": 0.7966193556785583, + "learning_rate": 6.913002806361086e-06, + "loss": 0.5107, + "step": 739 + }, + { + "epoch": 0.20774845592363841, + "grad_norm": 0.796349048614502, + "learning_rate": 6.9223573433115065e-06, + "loss": 0.5064, + "step": 740 + }, + { + "epoch": 0.20802919708029197, + "grad_norm": 0.8751774430274963, + "learning_rate": 6.931711880261928e-06, + "loss": 0.4919, + "step": 741 + }, + { + "epoch": 0.20830993823694555, + "grad_norm": 0.8394516706466675, + "learning_rate": 6.941066417212349e-06, + "loss": 0.4924, + "step": 742 + }, + { + "epoch": 0.2085906793935991, + "grad_norm": 0.8675036430358887, + "learning_rate": 6.95042095416277e-06, + "loss": 0.4964, + "step": 743 + }, + { + "epoch": 0.20887142055025268, + "grad_norm": 0.7031323909759521, + "learning_rate": 6.959775491113191e-06, + "loss": 0.4696, + "step": 744 + }, + { + "epoch": 0.20915216170690623, + "grad_norm": 0.9879941940307617, + "learning_rate": 6.969130028063611e-06, + "loss": 0.4884, + "step": 745 + }, + { + "epoch": 0.2094329028635598, + "grad_norm": 0.8013218641281128, + "learning_rate": 6.978484565014032e-06, + "loss": 0.46, + "step": 746 + }, + { + "epoch": 0.20971364402021336, + "grad_norm": 0.7790707945823669, + "learning_rate": 6.987839101964453e-06, + "loss": 0.5173, + "step": 747 + }, + { + "epoch": 0.20999438517686692, + "grad_norm": 0.8945547342300415, + "learning_rate": 6.997193638914874e-06, + "loss": 0.4542, + "step": 748 + }, + { + "epoch": 0.2102751263335205, + "grad_norm": 0.8125542402267456, + "learning_rate": 7.0065481758652955e-06, + "loss": 0.5067, + "step": 749 + }, + { + "epoch": 0.21055586749017405, + "grad_norm": 0.7749972939491272, + "learning_rate": 7.015902712815716e-06, + "loss": 0.4425, + "step": 750 + }, + { + "epoch": 0.21083660864682763, + "grad_norm": 0.9507055282592773, + "learning_rate": 7.025257249766138e-06, + "loss": 0.5095, + "step": 751 + }, + { + "epoch": 0.21111734980348118, + "grad_norm": 0.8765051364898682, + "learning_rate": 7.034611786716558e-06, + "loss": 0.5172, + "step": 752 + }, + { + "epoch": 0.21139809096013476, + "grad_norm": 0.8438290953636169, + "learning_rate": 7.043966323666979e-06, + "loss": 0.4626, + "step": 753 + }, + { + "epoch": 0.2116788321167883, + "grad_norm": 0.791168212890625, + "learning_rate": 7.0533208606173995e-06, + "loss": 0.5015, + "step": 754 + }, + { + "epoch": 0.2119595732734419, + "grad_norm": 0.7929601073265076, + "learning_rate": 7.062675397567821e-06, + "loss": 0.5085, + "step": 755 + }, + { + "epoch": 0.21224031443009544, + "grad_norm": 0.7710103392601013, + "learning_rate": 7.072029934518242e-06, + "loss": 0.4502, + "step": 756 + }, + { + "epoch": 0.21252105558674902, + "grad_norm": 0.8204118013381958, + "learning_rate": 7.081384471468663e-06, + "loss": 0.4549, + "step": 757 + }, + { + "epoch": 0.21280179674340258, + "grad_norm": 0.8469942808151245, + "learning_rate": 7.090739008419084e-06, + "loss": 0.4856, + "step": 758 + }, + { + "epoch": 0.21308253790005616, + "grad_norm": 0.8816447854042053, + "learning_rate": 7.100093545369505e-06, + "loss": 0.5089, + "step": 759 + }, + { + "epoch": 0.2133632790567097, + "grad_norm": 0.7904311418533325, + "learning_rate": 7.109448082319925e-06, + "loss": 0.4753, + "step": 760 + }, + { + "epoch": 0.2136440202133633, + "grad_norm": 0.8346189260482788, + "learning_rate": 7.1188026192703465e-06, + "loss": 0.5039, + "step": 761 + }, + { + "epoch": 0.21392476137001684, + "grad_norm": 0.94911789894104, + "learning_rate": 7.128157156220767e-06, + "loss": 0.5588, + "step": 762 + }, + { + "epoch": 0.21420550252667042, + "grad_norm": 0.837386429309845, + "learning_rate": 7.1375116931711886e-06, + "loss": 0.4809, + "step": 763 + }, + { + "epoch": 0.21448624368332397, + "grad_norm": 0.8271566033363342, + "learning_rate": 7.14686623012161e-06, + "loss": 0.4947, + "step": 764 + }, + { + "epoch": 0.21476698483997755, + "grad_norm": 0.8610383868217468, + "learning_rate": 7.156220767072031e-06, + "loss": 0.4717, + "step": 765 + }, + { + "epoch": 0.2150477259966311, + "grad_norm": 0.9502079486846924, + "learning_rate": 7.165575304022452e-06, + "loss": 0.4758, + "step": 766 + }, + { + "epoch": 0.21532846715328466, + "grad_norm": 0.9139910936355591, + "learning_rate": 7.174929840972873e-06, + "loss": 0.5117, + "step": 767 + }, + { + "epoch": 0.21560920830993824, + "grad_norm": 0.809079647064209, + "learning_rate": 7.1842843779232925e-06, + "loss": 0.4904, + "step": 768 + }, + { + "epoch": 0.2158899494665918, + "grad_norm": 0.8286628723144531, + "learning_rate": 7.193638914873714e-06, + "loss": 0.4818, + "step": 769 + }, + { + "epoch": 0.21617069062324537, + "grad_norm": 0.8030611872673035, + "learning_rate": 7.2029934518241355e-06, + "loss": 0.4776, + "step": 770 + }, + { + "epoch": 0.21645143177989892, + "grad_norm": 0.7689762115478516, + "learning_rate": 7.212347988774556e-06, + "loss": 0.4948, + "step": 771 + }, + { + "epoch": 0.2167321729365525, + "grad_norm": 0.9789795875549316, + "learning_rate": 7.221702525724978e-06, + "loss": 0.4869, + "step": 772 + }, + { + "epoch": 0.21701291409320606, + "grad_norm": 0.8280055522918701, + "learning_rate": 7.231057062675398e-06, + "loss": 0.5029, + "step": 773 + }, + { + "epoch": 0.21729365524985964, + "grad_norm": 0.8097561001777649, + "learning_rate": 7.24041159962582e-06, + "loss": 0.4568, + "step": 774 + }, + { + "epoch": 0.2175743964065132, + "grad_norm": 0.8653045892715454, + "learning_rate": 7.24976613657624e-06, + "loss": 0.4876, + "step": 775 + }, + { + "epoch": 0.21785513756316677, + "grad_norm": 0.8296322822570801, + "learning_rate": 7.259120673526661e-06, + "loss": 0.558, + "step": 776 + }, + { + "epoch": 0.21813587871982032, + "grad_norm": 0.7704964280128479, + "learning_rate": 7.268475210477082e-06, + "loss": 0.4396, + "step": 777 + }, + { + "epoch": 0.2184166198764739, + "grad_norm": 0.8957526683807373, + "learning_rate": 7.277829747427503e-06, + "loss": 0.5055, + "step": 778 + }, + { + "epoch": 0.21869736103312745, + "grad_norm": 0.7658538222312927, + "learning_rate": 7.287184284377924e-06, + "loss": 0.4509, + "step": 779 + }, + { + "epoch": 0.21897810218978103, + "grad_norm": 0.7775703072547913, + "learning_rate": 7.296538821328345e-06, + "loss": 0.4828, + "step": 780 + }, + { + "epoch": 0.21925884334643458, + "grad_norm": 0.751159131526947, + "learning_rate": 7.305893358278766e-06, + "loss": 0.486, + "step": 781 + }, + { + "epoch": 0.21953958450308816, + "grad_norm": 0.8056408762931824, + "learning_rate": 7.315247895229187e-06, + "loss": 0.4552, + "step": 782 + }, + { + "epoch": 0.21982032565974172, + "grad_norm": 0.703741729259491, + "learning_rate": 7.324602432179607e-06, + "loss": 0.4853, + "step": 783 + }, + { + "epoch": 0.2201010668163953, + "grad_norm": 0.8325309753417969, + "learning_rate": 7.3339569691300285e-06, + "loss": 0.484, + "step": 784 + }, + { + "epoch": 0.22038180797304885, + "grad_norm": 0.7223569750785828, + "learning_rate": 7.343311506080449e-06, + "loss": 0.5043, + "step": 785 + }, + { + "epoch": 0.2206625491297024, + "grad_norm": 0.7619519233703613, + "learning_rate": 7.352666043030871e-06, + "loss": 0.4649, + "step": 786 + }, + { + "epoch": 0.22094329028635598, + "grad_norm": 0.9175875782966614, + "learning_rate": 7.362020579981291e-06, + "loss": 0.5978, + "step": 787 + }, + { + "epoch": 0.22122403144300953, + "grad_norm": 0.7340946197509766, + "learning_rate": 7.371375116931713e-06, + "loss": 0.4801, + "step": 788 + }, + { + "epoch": 0.2215047725996631, + "grad_norm": 0.7784373164176941, + "learning_rate": 7.380729653882134e-06, + "loss": 0.5125, + "step": 789 + }, + { + "epoch": 0.22178551375631667, + "grad_norm": 0.8114541172981262, + "learning_rate": 7.390084190832555e-06, + "loss": 0.4761, + "step": 790 + }, + { + "epoch": 0.22206625491297025, + "grad_norm": 0.9015792608261108, + "learning_rate": 7.399438727782975e-06, + "loss": 0.5065, + "step": 791 + }, + { + "epoch": 0.2223469960696238, + "grad_norm": 0.7953155040740967, + "learning_rate": 7.408793264733396e-06, + "loss": 0.5121, + "step": 792 + }, + { + "epoch": 0.22262773722627738, + "grad_norm": 0.7775832414627075, + "learning_rate": 7.418147801683817e-06, + "loss": 0.4871, + "step": 793 + }, + { + "epoch": 0.22290847838293093, + "grad_norm": 0.8492860794067383, + "learning_rate": 7.427502338634238e-06, + "loss": 0.5279, + "step": 794 + }, + { + "epoch": 0.2231892195395845, + "grad_norm": 0.8550410866737366, + "learning_rate": 7.43685687558466e-06, + "loss": 0.4912, + "step": 795 + }, + { + "epoch": 0.22346996069623806, + "grad_norm": 0.8710159063339233, + "learning_rate": 7.44621141253508e-06, + "loss": 0.5149, + "step": 796 + }, + { + "epoch": 0.22375070185289164, + "grad_norm": 0.7900233268737793, + "learning_rate": 7.455565949485502e-06, + "loss": 0.4885, + "step": 797 + }, + { + "epoch": 0.2240314430095452, + "grad_norm": 0.7633907198905945, + "learning_rate": 7.4649204864359216e-06, + "loss": 0.4923, + "step": 798 + }, + { + "epoch": 0.22431218416619877, + "grad_norm": 0.9308202266693115, + "learning_rate": 7.474275023386342e-06, + "loss": 0.4673, + "step": 799 + }, + { + "epoch": 0.22459292532285233, + "grad_norm": 0.8563190698623657, + "learning_rate": 7.483629560336764e-06, + "loss": 0.501, + "step": 800 + }, + { + "epoch": 0.2248736664795059, + "grad_norm": 0.8566862344741821, + "learning_rate": 7.492984097287185e-06, + "loss": 0.527, + "step": 801 + }, + { + "epoch": 0.22515440763615946, + "grad_norm": 0.7848474979400635, + "learning_rate": 7.502338634237606e-06, + "loss": 0.4928, + "step": 802 + }, + { + "epoch": 0.22543514879281304, + "grad_norm": 0.7709822654724121, + "learning_rate": 7.511693171188027e-06, + "loss": 0.4791, + "step": 803 + }, + { + "epoch": 0.2257158899494666, + "grad_norm": 0.8138049840927124, + "learning_rate": 7.521047708138448e-06, + "loss": 0.4524, + "step": 804 + }, + { + "epoch": 0.22599663110612014, + "grad_norm": 0.7268965840339661, + "learning_rate": 7.530402245088869e-06, + "loss": 0.478, + "step": 805 + }, + { + "epoch": 0.22627737226277372, + "grad_norm": 0.7944256663322449, + "learning_rate": 7.539756782039289e-06, + "loss": 0.4657, + "step": 806 + }, + { + "epoch": 0.22655811341942728, + "grad_norm": 0.7516258955001831, + "learning_rate": 7.549111318989711e-06, + "loss": 0.4577, + "step": 807 + }, + { + "epoch": 0.22683885457608086, + "grad_norm": 0.8455751538276672, + "learning_rate": 7.558465855940131e-06, + "loss": 0.4604, + "step": 808 + }, + { + "epoch": 0.2271195957327344, + "grad_norm": 0.9007487893104553, + "learning_rate": 7.567820392890553e-06, + "loss": 0.5352, + "step": 809 + }, + { + "epoch": 0.227400336889388, + "grad_norm": 0.8126270771026611, + "learning_rate": 7.577174929840973e-06, + "loss": 0.471, + "step": 810 + }, + { + "epoch": 0.22768107804604154, + "grad_norm": 0.8163447380065918, + "learning_rate": 7.586529466791395e-06, + "loss": 0.4879, + "step": 811 + }, + { + "epoch": 0.22796181920269512, + "grad_norm": 0.783903956413269, + "learning_rate": 7.5958840037418154e-06, + "loss": 0.5218, + "step": 812 + }, + { + "epoch": 0.22824256035934867, + "grad_norm": 0.7607461214065552, + "learning_rate": 7.605238540692237e-06, + "loss": 0.4878, + "step": 813 + }, + { + "epoch": 0.22852330151600225, + "grad_norm": 0.8193497657775879, + "learning_rate": 7.614593077642657e-06, + "loss": 0.4666, + "step": 814 + }, + { + "epoch": 0.2288040426726558, + "grad_norm": 0.8364272713661194, + "learning_rate": 7.623947614593078e-06, + "loss": 0.4932, + "step": 815 + }, + { + "epoch": 0.22908478382930939, + "grad_norm": 0.6981427073478699, + "learning_rate": 7.633302151543499e-06, + "loss": 0.4862, + "step": 816 + }, + { + "epoch": 0.22936552498596294, + "grad_norm": 0.8305123448371887, + "learning_rate": 7.64265668849392e-06, + "loss": 0.5026, + "step": 817 + }, + { + "epoch": 0.22964626614261652, + "grad_norm": 0.9503836631774902, + "learning_rate": 7.652011225444342e-06, + "loss": 0.4959, + "step": 818 + }, + { + "epoch": 0.22992700729927007, + "grad_norm": 0.8699612021446228, + "learning_rate": 7.661365762394762e-06, + "loss": 0.4976, + "step": 819 + }, + { + "epoch": 0.23020774845592365, + "grad_norm": 0.6767401695251465, + "learning_rate": 7.670720299345183e-06, + "loss": 0.4245, + "step": 820 + }, + { + "epoch": 0.2304884896125772, + "grad_norm": 0.8819581866264343, + "learning_rate": 7.680074836295604e-06, + "loss": 0.5003, + "step": 821 + }, + { + "epoch": 0.23076923076923078, + "grad_norm": 0.9521064758300781, + "learning_rate": 7.689429373246024e-06, + "loss": 0.4196, + "step": 822 + }, + { + "epoch": 0.23104997192588433, + "grad_norm": 0.8548203706741333, + "learning_rate": 7.698783910196445e-06, + "loss": 0.5253, + "step": 823 + }, + { + "epoch": 0.2313307130825379, + "grad_norm": 0.784529447555542, + "learning_rate": 7.708138447146867e-06, + "loss": 0.4748, + "step": 824 + }, + { + "epoch": 0.23161145423919147, + "grad_norm": 0.8814212083816528, + "learning_rate": 7.717492984097288e-06, + "loss": 0.4517, + "step": 825 + }, + { + "epoch": 0.23189219539584502, + "grad_norm": 0.8412429690361023, + "learning_rate": 7.726847521047708e-06, + "loss": 0.4789, + "step": 826 + }, + { + "epoch": 0.2321729365524986, + "grad_norm": 0.7028629779815674, + "learning_rate": 7.73620205799813e-06, + "loss": 0.5404, + "step": 827 + }, + { + "epoch": 0.23245367770915215, + "grad_norm": 0.8391116261482239, + "learning_rate": 7.745556594948551e-06, + "loss": 0.4269, + "step": 828 + }, + { + "epoch": 0.23273441886580573, + "grad_norm": 0.7596981525421143, + "learning_rate": 7.75491113189897e-06, + "loss": 0.4879, + "step": 829 + }, + { + "epoch": 0.23301516002245928, + "grad_norm": 0.8645800948143005, + "learning_rate": 7.764265668849393e-06, + "loss": 0.4648, + "step": 830 + }, + { + "epoch": 0.23329590117911286, + "grad_norm": 1.0553699731826782, + "learning_rate": 7.773620205799813e-06, + "loss": 0.5485, + "step": 831 + }, + { + "epoch": 0.23357664233576642, + "grad_norm": 0.9821456670761108, + "learning_rate": 7.782974742750234e-06, + "loss": 0.557, + "step": 832 + }, + { + "epoch": 0.23385738349242, + "grad_norm": 0.8769895434379578, + "learning_rate": 7.792329279700656e-06, + "loss": 0.509, + "step": 833 + }, + { + "epoch": 0.23413812464907355, + "grad_norm": 0.9701917767524719, + "learning_rate": 7.801683816651077e-06, + "loss": 0.5332, + "step": 834 + }, + { + "epoch": 0.23441886580572713, + "grad_norm": 0.9042077660560608, + "learning_rate": 7.811038353601498e-06, + "loss": 0.4989, + "step": 835 + }, + { + "epoch": 0.23469960696238068, + "grad_norm": 0.863466739654541, + "learning_rate": 7.820392890551918e-06, + "loss": 0.4811, + "step": 836 + }, + { + "epoch": 0.23498034811903426, + "grad_norm": 0.7645045518875122, + "learning_rate": 7.829747427502339e-06, + "loss": 0.4729, + "step": 837 + }, + { + "epoch": 0.2352610892756878, + "grad_norm": 0.8438298106193542, + "learning_rate": 7.83910196445276e-06, + "loss": 0.46, + "step": 838 + }, + { + "epoch": 0.2355418304323414, + "grad_norm": 0.849493145942688, + "learning_rate": 7.848456501403182e-06, + "loss": 0.4868, + "step": 839 + }, + { + "epoch": 0.23582257158899494, + "grad_norm": 0.9705619812011719, + "learning_rate": 7.857811038353602e-06, + "loss": 0.4465, + "step": 840 + }, + { + "epoch": 0.23610331274564852, + "grad_norm": 0.8479672074317932, + "learning_rate": 7.867165575304023e-06, + "loss": 0.5, + "step": 841 + }, + { + "epoch": 0.23638405390230208, + "grad_norm": 0.7416703701019287, + "learning_rate": 7.876520112254444e-06, + "loss": 0.5244, + "step": 842 + }, + { + "epoch": 0.23666479505895563, + "grad_norm": 0.7811509370803833, + "learning_rate": 7.885874649204866e-06, + "loss": 0.5247, + "step": 843 + }, + { + "epoch": 0.2369455362156092, + "grad_norm": 0.7777257561683655, + "learning_rate": 7.895229186155285e-06, + "loss": 0.4863, + "step": 844 + }, + { + "epoch": 0.23722627737226276, + "grad_norm": 0.7654300928115845, + "learning_rate": 7.904583723105707e-06, + "loss": 0.4927, + "step": 845 + }, + { + "epoch": 0.23750701852891634, + "grad_norm": 0.7922725081443787, + "learning_rate": 7.913938260056128e-06, + "loss": 0.4634, + "step": 846 + }, + { + "epoch": 0.2377877596855699, + "grad_norm": 0.7853131294250488, + "learning_rate": 7.923292797006548e-06, + "loss": 0.4677, + "step": 847 + }, + { + "epoch": 0.23806850084222347, + "grad_norm": 0.9648452997207642, + "learning_rate": 7.932647333956969e-06, + "loss": 0.4835, + "step": 848 + }, + { + "epoch": 0.23834924199887703, + "grad_norm": 0.7550539374351501, + "learning_rate": 7.942001870907391e-06, + "loss": 0.4879, + "step": 849 + }, + { + "epoch": 0.2386299831555306, + "grad_norm": 0.8540953397750854, + "learning_rate": 7.951356407857812e-06, + "loss": 0.49, + "step": 850 + }, + { + "epoch": 0.23891072431218416, + "grad_norm": 0.9588361978530884, + "learning_rate": 7.960710944808233e-06, + "loss": 0.4817, + "step": 851 + }, + { + "epoch": 0.23919146546883774, + "grad_norm": 0.7889480590820312, + "learning_rate": 7.970065481758653e-06, + "loss": 0.4509, + "step": 852 + }, + { + "epoch": 0.2394722066254913, + "grad_norm": 0.9453020095825195, + "learning_rate": 7.979420018709074e-06, + "loss": 0.5955, + "step": 853 + }, + { + "epoch": 0.23975294778214487, + "grad_norm": 0.9234386682510376, + "learning_rate": 7.988774555659495e-06, + "loss": 0.4713, + "step": 854 + }, + { + "epoch": 0.24003368893879842, + "grad_norm": 0.8285678625106812, + "learning_rate": 7.998129092609917e-06, + "loss": 0.4857, + "step": 855 + }, + { + "epoch": 0.240314430095452, + "grad_norm": 0.7617980241775513, + "learning_rate": 8.007483629560337e-06, + "loss": 0.451, + "step": 856 + }, + { + "epoch": 0.24059517125210556, + "grad_norm": 0.9261062145233154, + "learning_rate": 8.016838166510758e-06, + "loss": 0.5317, + "step": 857 + }, + { + "epoch": 0.24087591240875914, + "grad_norm": 0.9025489091873169, + "learning_rate": 8.02619270346118e-06, + "loss": 0.4408, + "step": 858 + }, + { + "epoch": 0.2411566535654127, + "grad_norm": 0.8039227724075317, + "learning_rate": 8.035547240411601e-06, + "loss": 0.4828, + "step": 859 + }, + { + "epoch": 0.24143739472206627, + "grad_norm": 0.8967242240905762, + "learning_rate": 8.04490177736202e-06, + "loss": 0.5024, + "step": 860 + }, + { + "epoch": 0.24171813587871982, + "grad_norm": 0.8802300095558167, + "learning_rate": 8.054256314312442e-06, + "loss": 0.5036, + "step": 861 + }, + { + "epoch": 0.24199887703537337, + "grad_norm": 0.8861883878707886, + "learning_rate": 8.063610851262863e-06, + "loss": 0.5273, + "step": 862 + }, + { + "epoch": 0.24227961819202695, + "grad_norm": 0.7743239998817444, + "learning_rate": 8.072965388213284e-06, + "loss": 0.4993, + "step": 863 + }, + { + "epoch": 0.2425603593486805, + "grad_norm": 0.7340776920318604, + "learning_rate": 8.082319925163706e-06, + "loss": 0.4608, + "step": 864 + }, + { + "epoch": 0.24284110050533408, + "grad_norm": 0.7196219563484192, + "learning_rate": 8.091674462114127e-06, + "loss": 0.4381, + "step": 865 + }, + { + "epoch": 0.24312184166198764, + "grad_norm": 0.7719977498054504, + "learning_rate": 8.101028999064547e-06, + "loss": 0.4581, + "step": 866 + }, + { + "epoch": 0.24340258281864122, + "grad_norm": 0.9021661877632141, + "learning_rate": 8.110383536014968e-06, + "loss": 0.4692, + "step": 867 + }, + { + "epoch": 0.24368332397529477, + "grad_norm": 0.7454041242599487, + "learning_rate": 8.119738072965388e-06, + "loss": 0.4559, + "step": 868 + }, + { + "epoch": 0.24396406513194835, + "grad_norm": 0.8102445602416992, + "learning_rate": 8.129092609915809e-06, + "loss": 0.5006, + "step": 869 + }, + { + "epoch": 0.2442448062886019, + "grad_norm": 0.7215291261672974, + "learning_rate": 8.138447146866231e-06, + "loss": 0.4426, + "step": 870 + }, + { + "epoch": 0.24452554744525548, + "grad_norm": 0.7336854934692383, + "learning_rate": 8.147801683816652e-06, + "loss": 0.4685, + "step": 871 + }, + { + "epoch": 0.24480628860190903, + "grad_norm": 0.8260940909385681, + "learning_rate": 8.157156220767073e-06, + "loss": 0.5054, + "step": 872 + }, + { + "epoch": 0.2450870297585626, + "grad_norm": 0.8100261092185974, + "learning_rate": 8.166510757717493e-06, + "loss": 0.5025, + "step": 873 + }, + { + "epoch": 0.24536777091521617, + "grad_norm": 0.6854899525642395, + "learning_rate": 8.175865294667916e-06, + "loss": 0.5013, + "step": 874 + }, + { + "epoch": 0.24564851207186975, + "grad_norm": 0.7612625360488892, + "learning_rate": 8.185219831618335e-06, + "loss": 0.4197, + "step": 875 + }, + { + "epoch": 0.2459292532285233, + "grad_norm": 0.775981068611145, + "learning_rate": 8.194574368568757e-06, + "loss": 0.4416, + "step": 876 + }, + { + "epoch": 0.24620999438517688, + "grad_norm": 0.8787760734558105, + "learning_rate": 8.203928905519177e-06, + "loss": 0.496, + "step": 877 + }, + { + "epoch": 0.24649073554183043, + "grad_norm": 0.8954545259475708, + "learning_rate": 8.213283442469598e-06, + "loss": 0.5222, + "step": 878 + }, + { + "epoch": 0.246771476698484, + "grad_norm": 0.7083318829536438, + "learning_rate": 8.222637979420019e-06, + "loss": 0.4849, + "step": 879 + }, + { + "epoch": 0.24705221785513756, + "grad_norm": 0.7826118469238281, + "learning_rate": 8.231992516370441e-06, + "loss": 0.4799, + "step": 880 + }, + { + "epoch": 0.24733295901179111, + "grad_norm": 0.7675488591194153, + "learning_rate": 8.241347053320862e-06, + "loss": 0.5113, + "step": 881 + }, + { + "epoch": 0.2476137001684447, + "grad_norm": 0.8048291206359863, + "learning_rate": 8.250701590271282e-06, + "loss": 0.5162, + "step": 882 + }, + { + "epoch": 0.24789444132509825, + "grad_norm": 0.8667749166488647, + "learning_rate": 8.260056127221703e-06, + "loss": 0.4802, + "step": 883 + }, + { + "epoch": 0.24817518248175183, + "grad_norm": 0.8236709237098694, + "learning_rate": 8.269410664172124e-06, + "loss": 0.4491, + "step": 884 + }, + { + "epoch": 0.24845592363840538, + "grad_norm": 0.7885945439338684, + "learning_rate": 8.278765201122544e-06, + "loss": 0.472, + "step": 885 + }, + { + "epoch": 0.24873666479505896, + "grad_norm": 0.8309069275856018, + "learning_rate": 8.288119738072967e-06, + "loss": 0.4879, + "step": 886 + }, + { + "epoch": 0.2490174059517125, + "grad_norm": 0.9352370500564575, + "learning_rate": 8.297474275023387e-06, + "loss": 0.5002, + "step": 887 + }, + { + "epoch": 0.2492981471083661, + "grad_norm": 0.9705796837806702, + "learning_rate": 8.306828811973808e-06, + "loss": 0.5102, + "step": 888 + }, + { + "epoch": 0.24957888826501964, + "grad_norm": 0.7391785383224487, + "learning_rate": 8.31618334892423e-06, + "loss": 0.4565, + "step": 889 + }, + { + "epoch": 0.24985962942167322, + "grad_norm": 0.859291136264801, + "learning_rate": 8.325537885874649e-06, + "loss": 0.4779, + "step": 890 + }, + { + "epoch": 0.2501403705783268, + "grad_norm": 0.7881708145141602, + "learning_rate": 8.33489242282507e-06, + "loss": 0.4949, + "step": 891 + }, + { + "epoch": 0.25042111173498033, + "grad_norm": 0.8595026731491089, + "learning_rate": 8.344246959775492e-06, + "loss": 0.478, + "step": 892 + }, + { + "epoch": 0.25070185289163394, + "grad_norm": 0.7372920513153076, + "learning_rate": 8.353601496725913e-06, + "loss": 0.5033, + "step": 893 + }, + { + "epoch": 0.2509825940482875, + "grad_norm": 0.7447819113731384, + "learning_rate": 8.362956033676333e-06, + "loss": 0.4458, + "step": 894 + }, + { + "epoch": 0.25126333520494104, + "grad_norm": 0.9821788668632507, + "learning_rate": 8.372310570626756e-06, + "loss": 0.5061, + "step": 895 + }, + { + "epoch": 0.2515440763615946, + "grad_norm": 0.7504124045372009, + "learning_rate": 8.381665107577176e-06, + "loss": 0.4628, + "step": 896 + }, + { + "epoch": 0.2518248175182482, + "grad_norm": 0.7859714031219482, + "learning_rate": 8.391019644527597e-06, + "loss": 0.4936, + "step": 897 + }, + { + "epoch": 0.25210555867490175, + "grad_norm": 0.8116353154182434, + "learning_rate": 8.400374181478017e-06, + "loss": 0.4905, + "step": 898 + }, + { + "epoch": 0.2523862998315553, + "grad_norm": 0.7412129044532776, + "learning_rate": 8.409728718428438e-06, + "loss": 0.4779, + "step": 899 + }, + { + "epoch": 0.25266704098820886, + "grad_norm": 0.77248615026474, + "learning_rate": 8.419083255378859e-06, + "loss": 0.4896, + "step": 900 + }, + { + "epoch": 0.2529477821448624, + "grad_norm": 0.7527933716773987, + "learning_rate": 8.428437792329281e-06, + "loss": 0.4932, + "step": 901 + }, + { + "epoch": 0.253228523301516, + "grad_norm": 0.7819905281066895, + "learning_rate": 8.437792329279702e-06, + "loss": 0.4309, + "step": 902 + }, + { + "epoch": 0.25350926445816957, + "grad_norm": 0.7580601572990417, + "learning_rate": 8.447146866230122e-06, + "loss": 0.5103, + "step": 903 + }, + { + "epoch": 0.2537900056148231, + "grad_norm": 0.7849022150039673, + "learning_rate": 8.456501403180543e-06, + "loss": 0.4792, + "step": 904 + }, + { + "epoch": 0.2540707467714767, + "grad_norm": 0.746990442276001, + "learning_rate": 8.465855940130964e-06, + "loss": 0.473, + "step": 905 + }, + { + "epoch": 0.2543514879281303, + "grad_norm": 0.7876943945884705, + "learning_rate": 8.475210477081384e-06, + "loss": 0.4204, + "step": 906 + }, + { + "epoch": 0.25463222908478383, + "grad_norm": 0.8317561149597168, + "learning_rate": 8.484565014031806e-06, + "loss": 0.5359, + "step": 907 + }, + { + "epoch": 0.2549129702414374, + "grad_norm": 0.7951239943504333, + "learning_rate": 8.493919550982227e-06, + "loss": 0.4973, + "step": 908 + }, + { + "epoch": 0.25519371139809094, + "grad_norm": 0.7526887655258179, + "learning_rate": 8.503274087932648e-06, + "loss": 0.489, + "step": 909 + }, + { + "epoch": 0.25547445255474455, + "grad_norm": 0.8517107963562012, + "learning_rate": 8.512628624883068e-06, + "loss": 0.4819, + "step": 910 + }, + { + "epoch": 0.2557551937113981, + "grad_norm": 0.7782130241394043, + "learning_rate": 8.52198316183349e-06, + "loss": 0.5018, + "step": 911 + }, + { + "epoch": 0.25603593486805165, + "grad_norm": 0.8412166833877563, + "learning_rate": 8.531337698783911e-06, + "loss": 0.5138, + "step": 912 + }, + { + "epoch": 0.2563166760247052, + "grad_norm": 0.7934753894805908, + "learning_rate": 8.540692235734332e-06, + "loss": 0.4733, + "step": 913 + }, + { + "epoch": 0.2565974171813588, + "grad_norm": 0.7621031403541565, + "learning_rate": 8.550046772684753e-06, + "loss": 0.4957, + "step": 914 + }, + { + "epoch": 0.25687815833801236, + "grad_norm": 0.7486236691474915, + "learning_rate": 8.559401309635173e-06, + "loss": 0.5315, + "step": 915 + }, + { + "epoch": 0.2571588994946659, + "grad_norm": 0.7432385683059692, + "learning_rate": 8.568755846585594e-06, + "loss": 0.4396, + "step": 916 + }, + { + "epoch": 0.25743964065131947, + "grad_norm": 0.9275770783424377, + "learning_rate": 8.578110383536016e-06, + "loss": 0.4905, + "step": 917 + }, + { + "epoch": 0.2577203818079731, + "grad_norm": 0.7362551689147949, + "learning_rate": 8.587464920486437e-06, + "loss": 0.425, + "step": 918 + }, + { + "epoch": 0.25800112296462663, + "grad_norm": 0.8446869254112244, + "learning_rate": 8.596819457436857e-06, + "loss": 0.5109, + "step": 919 + }, + { + "epoch": 0.2582818641212802, + "grad_norm": 1.0435951948165894, + "learning_rate": 8.60617399438728e-06, + "loss": 0.5141, + "step": 920 + }, + { + "epoch": 0.25856260527793373, + "grad_norm": 0.8806620240211487, + "learning_rate": 8.615528531337699e-06, + "loss": 0.4957, + "step": 921 + }, + { + "epoch": 0.2588433464345873, + "grad_norm": 0.8784282207489014, + "learning_rate": 8.624883068288121e-06, + "loss": 0.4836, + "step": 922 + }, + { + "epoch": 0.2591240875912409, + "grad_norm": 0.9344772100448608, + "learning_rate": 8.634237605238542e-06, + "loss": 0.5401, + "step": 923 + }, + { + "epoch": 0.25940482874789444, + "grad_norm": 0.885509192943573, + "learning_rate": 8.643592142188962e-06, + "loss": 0.4499, + "step": 924 + }, + { + "epoch": 0.259685569904548, + "grad_norm": 0.7840343713760376, + "learning_rate": 8.652946679139383e-06, + "loss": 0.4851, + "step": 925 + }, + { + "epoch": 0.25996631106120155, + "grad_norm": 0.8780069351196289, + "learning_rate": 8.662301216089805e-06, + "loss": 0.4678, + "step": 926 + }, + { + "epoch": 0.26024705221785516, + "grad_norm": 0.7959757447242737, + "learning_rate": 8.671655753040226e-06, + "loss": 0.4463, + "step": 927 + }, + { + "epoch": 0.2605277933745087, + "grad_norm": 0.7591774463653564, + "learning_rate": 8.681010289990646e-06, + "loss": 0.4804, + "step": 928 + }, + { + "epoch": 0.26080853453116226, + "grad_norm": 0.9297831058502197, + "learning_rate": 8.690364826941067e-06, + "loss": 0.5108, + "step": 929 + }, + { + "epoch": 0.2610892756878158, + "grad_norm": 0.7070633769035339, + "learning_rate": 8.699719363891488e-06, + "loss": 0.434, + "step": 930 + }, + { + "epoch": 0.2613700168444694, + "grad_norm": 0.903701901435852, + "learning_rate": 8.709073900841908e-06, + "loss": 0.525, + "step": 931 + }, + { + "epoch": 0.261650758001123, + "grad_norm": 0.7866299152374268, + "learning_rate": 8.71842843779233e-06, + "loss": 0.4782, + "step": 932 + }, + { + "epoch": 0.2619314991577765, + "grad_norm": 0.6943819522857666, + "learning_rate": 8.727782974742751e-06, + "loss": 0.4678, + "step": 933 + }, + { + "epoch": 0.2622122403144301, + "grad_norm": 0.845110297203064, + "learning_rate": 8.737137511693172e-06, + "loss": 0.5255, + "step": 934 + }, + { + "epoch": 0.2624929814710837, + "grad_norm": 0.7377384901046753, + "learning_rate": 8.746492048643593e-06, + "loss": 0.5018, + "step": 935 + }, + { + "epoch": 0.26277372262773724, + "grad_norm": 0.8059272766113281, + "learning_rate": 8.755846585594013e-06, + "loss": 0.4423, + "step": 936 + }, + { + "epoch": 0.2630544637843908, + "grad_norm": 0.7214736342430115, + "learning_rate": 8.765201122544434e-06, + "loss": 0.4988, + "step": 937 + }, + { + "epoch": 0.26333520494104434, + "grad_norm": 0.7720814347267151, + "learning_rate": 8.774555659494856e-06, + "loss": 0.5019, + "step": 938 + }, + { + "epoch": 0.2636159460976979, + "grad_norm": 0.776086151599884, + "learning_rate": 8.783910196445277e-06, + "loss": 0.4787, + "step": 939 + }, + { + "epoch": 0.2638966872543515, + "grad_norm": 0.7871274948120117, + "learning_rate": 8.793264733395697e-06, + "loss": 0.5098, + "step": 940 + }, + { + "epoch": 0.26417742841100506, + "grad_norm": 0.8371344208717346, + "learning_rate": 8.802619270346118e-06, + "loss": 0.4995, + "step": 941 + }, + { + "epoch": 0.2644581695676586, + "grad_norm": 1.0325953960418701, + "learning_rate": 8.81197380729654e-06, + "loss": 0.5568, + "step": 942 + }, + { + "epoch": 0.26473891072431216, + "grad_norm": 0.8071467876434326, + "learning_rate": 8.821328344246961e-06, + "loss": 0.4658, + "step": 943 + }, + { + "epoch": 0.26501965188096577, + "grad_norm": 0.7698601484298706, + "learning_rate": 8.830682881197382e-06, + "loss": 0.4532, + "step": 944 + }, + { + "epoch": 0.2653003930376193, + "grad_norm": 0.9465731382369995, + "learning_rate": 8.840037418147802e-06, + "loss": 0.4382, + "step": 945 + }, + { + "epoch": 0.26558113419427287, + "grad_norm": 1.0514284372329712, + "learning_rate": 8.849391955098223e-06, + "loss": 0.4715, + "step": 946 + }, + { + "epoch": 0.2658618753509264, + "grad_norm": 0.8537533283233643, + "learning_rate": 8.858746492048645e-06, + "loss": 0.497, + "step": 947 + }, + { + "epoch": 0.26614261650758003, + "grad_norm": 0.9736228585243225, + "learning_rate": 8.868101028999066e-06, + "loss": 0.4909, + "step": 948 + }, + { + "epoch": 0.2664233576642336, + "grad_norm": 0.8221850991249084, + "learning_rate": 8.877455565949486e-06, + "loss": 0.4597, + "step": 949 + }, + { + "epoch": 0.26670409882088714, + "grad_norm": 1.005744218826294, + "learning_rate": 8.886810102899907e-06, + "loss": 0.4813, + "step": 950 + }, + { + "epoch": 0.2669848399775407, + "grad_norm": 0.8730907440185547, + "learning_rate": 8.896164639850328e-06, + "loss": 0.4546, + "step": 951 + }, + { + "epoch": 0.2672655811341943, + "grad_norm": 0.8560677766799927, + "learning_rate": 8.905519176800748e-06, + "loss": 0.4836, + "step": 952 + }, + { + "epoch": 0.26754632229084785, + "grad_norm": 0.8395571112632751, + "learning_rate": 8.91487371375117e-06, + "loss": 0.4865, + "step": 953 + }, + { + "epoch": 0.2678270634475014, + "grad_norm": 0.8776417374610901, + "learning_rate": 8.924228250701591e-06, + "loss": 0.4915, + "step": 954 + }, + { + "epoch": 0.26810780460415495, + "grad_norm": 0.9050156474113464, + "learning_rate": 8.933582787652012e-06, + "loss": 0.4671, + "step": 955 + }, + { + "epoch": 0.26838854576080856, + "grad_norm": 0.7902247309684753, + "learning_rate": 8.942937324602433e-06, + "loss": 0.4976, + "step": 956 + }, + { + "epoch": 0.2686692869174621, + "grad_norm": 0.7983662486076355, + "learning_rate": 8.952291861552855e-06, + "loss": 0.512, + "step": 957 + }, + { + "epoch": 0.26895002807411567, + "grad_norm": 0.813231348991394, + "learning_rate": 8.961646398503275e-06, + "loss": 0.45, + "step": 958 + }, + { + "epoch": 0.2692307692307692, + "grad_norm": 0.8848547339439392, + "learning_rate": 8.971000935453696e-06, + "loss": 0.4848, + "step": 959 + }, + { + "epoch": 0.26951151038742277, + "grad_norm": 0.8239362835884094, + "learning_rate": 8.980355472404117e-06, + "loss": 0.4623, + "step": 960 + }, + { + "epoch": 0.2697922515440764, + "grad_norm": 0.9324841499328613, + "learning_rate": 8.989710009354537e-06, + "loss": 0.5013, + "step": 961 + }, + { + "epoch": 0.27007299270072993, + "grad_norm": 0.871340811252594, + "learning_rate": 8.999064546304958e-06, + "loss": 0.4507, + "step": 962 + }, + { + "epoch": 0.2703537338573835, + "grad_norm": 0.9839334487915039, + "learning_rate": 9.00841908325538e-06, + "loss": 0.5147, + "step": 963 + }, + { + "epoch": 0.27063447501403703, + "grad_norm": 0.9232575297355652, + "learning_rate": 9.017773620205801e-06, + "loss": 0.4329, + "step": 964 + }, + { + "epoch": 0.27091521617069064, + "grad_norm": 1.1634669303894043, + "learning_rate": 9.027128157156222e-06, + "loss": 0.5702, + "step": 965 + }, + { + "epoch": 0.2711959573273442, + "grad_norm": 0.8469628691673279, + "learning_rate": 9.036482694106642e-06, + "loss": 0.4695, + "step": 966 + }, + { + "epoch": 0.27147669848399775, + "grad_norm": 0.8922038078308105, + "learning_rate": 9.045837231057063e-06, + "loss": 0.4382, + "step": 967 + }, + { + "epoch": 0.2717574396406513, + "grad_norm": 0.8791254162788391, + "learning_rate": 9.055191768007483e-06, + "loss": 0.4659, + "step": 968 + }, + { + "epoch": 0.2720381807973049, + "grad_norm": 0.839027464389801, + "learning_rate": 9.064546304957906e-06, + "loss": 0.4501, + "step": 969 + }, + { + "epoch": 0.27231892195395846, + "grad_norm": 0.9237929582595825, + "learning_rate": 9.073900841908326e-06, + "loss": 0.4699, + "step": 970 + }, + { + "epoch": 0.272599663110612, + "grad_norm": 0.9299217462539673, + "learning_rate": 9.083255378858747e-06, + "loss": 0.4637, + "step": 971 + }, + { + "epoch": 0.27288040426726556, + "grad_norm": 0.7219536900520325, + "learning_rate": 9.09260991580917e-06, + "loss": 0.4869, + "step": 972 + }, + { + "epoch": 0.27316114542391917, + "grad_norm": 0.9024264812469482, + "learning_rate": 9.10196445275959e-06, + "loss": 0.487, + "step": 973 + }, + { + "epoch": 0.2734418865805727, + "grad_norm": 0.8910660147666931, + "learning_rate": 9.111318989710009e-06, + "loss": 0.5116, + "step": 974 + }, + { + "epoch": 0.2737226277372263, + "grad_norm": 0.8204740285873413, + "learning_rate": 9.120673526660431e-06, + "loss": 0.4493, + "step": 975 + }, + { + "epoch": 0.27400336889387983, + "grad_norm": 0.9932129383087158, + "learning_rate": 9.130028063610852e-06, + "loss": 0.4483, + "step": 976 + }, + { + "epoch": 0.2742841100505334, + "grad_norm": 0.8457167148590088, + "learning_rate": 9.139382600561273e-06, + "loss": 0.4244, + "step": 977 + }, + { + "epoch": 0.274564851207187, + "grad_norm": 1.0124696493148804, + "learning_rate": 9.148737137511695e-06, + "loss": 0.5319, + "step": 978 + }, + { + "epoch": 0.27484559236384054, + "grad_norm": 0.8043119311332703, + "learning_rate": 9.158091674462115e-06, + "loss": 0.4687, + "step": 979 + }, + { + "epoch": 0.2751263335204941, + "grad_norm": 0.8999631404876709, + "learning_rate": 9.167446211412536e-06, + "loss": 0.4663, + "step": 980 + }, + { + "epoch": 0.27540707467714765, + "grad_norm": 0.7950866222381592, + "learning_rate": 9.176800748362957e-06, + "loss": 0.4648, + "step": 981 + }, + { + "epoch": 0.27568781583380125, + "grad_norm": 0.8977778553962708, + "learning_rate": 9.186155285313377e-06, + "loss": 0.4644, + "step": 982 + }, + { + "epoch": 0.2759685569904548, + "grad_norm": 0.8251086473464966, + "learning_rate": 9.195509822263798e-06, + "loss": 0.4705, + "step": 983 + }, + { + "epoch": 0.27624929814710836, + "grad_norm": 0.8819026947021484, + "learning_rate": 9.20486435921422e-06, + "loss": 0.474, + "step": 984 + }, + { + "epoch": 0.2765300393037619, + "grad_norm": 0.9170657396316528, + "learning_rate": 9.214218896164641e-06, + "loss": 0.5067, + "step": 985 + }, + { + "epoch": 0.2768107804604155, + "grad_norm": 0.9154438972473145, + "learning_rate": 9.223573433115062e-06, + "loss": 0.433, + "step": 986 + }, + { + "epoch": 0.27709152161706907, + "grad_norm": 0.8111538887023926, + "learning_rate": 9.232927970065482e-06, + "loss": 0.484, + "step": 987 + }, + { + "epoch": 0.2773722627737226, + "grad_norm": 0.808013916015625, + "learning_rate": 9.242282507015905e-06, + "loss": 0.4879, + "step": 988 + }, + { + "epoch": 0.2776530039303762, + "grad_norm": 0.9945582747459412, + "learning_rate": 9.251637043966323e-06, + "loss": 0.4738, + "step": 989 + }, + { + "epoch": 0.2779337450870298, + "grad_norm": 0.8452256917953491, + "learning_rate": 9.260991580916746e-06, + "loss": 0.5337, + "step": 990 + }, + { + "epoch": 0.27821448624368333, + "grad_norm": 0.7302437424659729, + "learning_rate": 9.270346117867166e-06, + "loss": 0.4836, + "step": 991 + }, + { + "epoch": 0.2784952274003369, + "grad_norm": 0.8313810229301453, + "learning_rate": 9.279700654817587e-06, + "loss": 0.5328, + "step": 992 + }, + { + "epoch": 0.27877596855699044, + "grad_norm": 0.9550543427467346, + "learning_rate": 9.289055191768008e-06, + "loss": 0.4587, + "step": 993 + }, + { + "epoch": 0.27905670971364405, + "grad_norm": 0.8457921147346497, + "learning_rate": 9.29840972871843e-06, + "loss": 0.4718, + "step": 994 + }, + { + "epoch": 0.2793374508702976, + "grad_norm": 0.8598791360855103, + "learning_rate": 9.30776426566885e-06, + "loss": 0.4654, + "step": 995 + }, + { + "epoch": 0.27961819202695115, + "grad_norm": 0.9834221601486206, + "learning_rate": 9.317118802619271e-06, + "loss": 0.4971, + "step": 996 + }, + { + "epoch": 0.2798989331836047, + "grad_norm": 0.8372482061386108, + "learning_rate": 9.326473339569692e-06, + "loss": 0.4551, + "step": 997 + }, + { + "epoch": 0.28017967434025826, + "grad_norm": 0.92179274559021, + "learning_rate": 9.335827876520112e-06, + "loss": 0.5081, + "step": 998 + }, + { + "epoch": 0.28046041549691186, + "grad_norm": 0.8274542689323425, + "learning_rate": 9.345182413470533e-06, + "loss": 0.5604, + "step": 999 + }, + { + "epoch": 0.2807411566535654, + "grad_norm": 0.8723477721214294, + "learning_rate": 9.354536950420955e-06, + "loss": 0.5213, + "step": 1000 + }, + { + "epoch": 0.28102189781021897, + "grad_norm": 0.975670576095581, + "learning_rate": 9.363891487371376e-06, + "loss": 0.5363, + "step": 1001 + }, + { + "epoch": 0.2813026389668725, + "grad_norm": 0.8007491827011108, + "learning_rate": 9.373246024321797e-06, + "loss": 0.4587, + "step": 1002 + }, + { + "epoch": 0.28158338012352613, + "grad_norm": 0.9563086032867432, + "learning_rate": 9.382600561272219e-06, + "loss": 0.5113, + "step": 1003 + }, + { + "epoch": 0.2818641212801797, + "grad_norm": 0.9842604994773865, + "learning_rate": 9.39195509822264e-06, + "loss": 0.5133, + "step": 1004 + }, + { + "epoch": 0.28214486243683323, + "grad_norm": 0.7433784008026123, + "learning_rate": 9.401309635173059e-06, + "loss": 0.4674, + "step": 1005 + }, + { + "epoch": 0.2824256035934868, + "grad_norm": 0.8338155746459961, + "learning_rate": 9.410664172123481e-06, + "loss": 0.51, + "step": 1006 + }, + { + "epoch": 0.2827063447501404, + "grad_norm": 0.8775511384010315, + "learning_rate": 9.420018709073902e-06, + "loss": 0.4848, + "step": 1007 + }, + { + "epoch": 0.28298708590679394, + "grad_norm": 0.8014006614685059, + "learning_rate": 9.429373246024322e-06, + "loss": 0.5229, + "step": 1008 + }, + { + "epoch": 0.2832678270634475, + "grad_norm": 0.8549612760543823, + "learning_rate": 9.438727782974744e-06, + "loss": 0.523, + "step": 1009 + }, + { + "epoch": 0.28354856822010105, + "grad_norm": 0.9535396099090576, + "learning_rate": 9.448082319925165e-06, + "loss": 0.5106, + "step": 1010 + }, + { + "epoch": 0.28382930937675466, + "grad_norm": 0.7287841439247131, + "learning_rate": 9.457436856875586e-06, + "loss": 0.4796, + "step": 1011 + }, + { + "epoch": 0.2841100505334082, + "grad_norm": 0.7829171419143677, + "learning_rate": 9.466791393826006e-06, + "loss": 0.4585, + "step": 1012 + }, + { + "epoch": 0.28439079169006176, + "grad_norm": 0.8058018684387207, + "learning_rate": 9.476145930776427e-06, + "loss": 0.435, + "step": 1013 + }, + { + "epoch": 0.2846715328467153, + "grad_norm": 0.7625716924667358, + "learning_rate": 9.485500467726848e-06, + "loss": 0.4703, + "step": 1014 + }, + { + "epoch": 0.28495227400336887, + "grad_norm": 0.7859020829200745, + "learning_rate": 9.49485500467727e-06, + "loss": 0.4794, + "step": 1015 + }, + { + "epoch": 0.2852330151600225, + "grad_norm": 0.8216314315795898, + "learning_rate": 9.50420954162769e-06, + "loss": 0.4536, + "step": 1016 + }, + { + "epoch": 0.285513756316676, + "grad_norm": 0.709894061088562, + "learning_rate": 9.513564078578111e-06, + "loss": 0.4608, + "step": 1017 + }, + { + "epoch": 0.2857944974733296, + "grad_norm": 0.7770177125930786, + "learning_rate": 9.522918615528532e-06, + "loss": 0.4991, + "step": 1018 + }, + { + "epoch": 0.28607523862998313, + "grad_norm": 0.8208000063896179, + "learning_rate": 9.532273152478954e-06, + "loss": 0.5088, + "step": 1019 + }, + { + "epoch": 0.28635597978663674, + "grad_norm": 0.7777297496795654, + "learning_rate": 9.541627689429373e-06, + "loss": 0.4546, + "step": 1020 + }, + { + "epoch": 0.2866367209432903, + "grad_norm": 0.6712039113044739, + "learning_rate": 9.550982226379795e-06, + "loss": 0.4791, + "step": 1021 + }, + { + "epoch": 0.28691746209994384, + "grad_norm": 0.7849695682525635, + "learning_rate": 9.560336763330216e-06, + "loss": 0.4831, + "step": 1022 + }, + { + "epoch": 0.2871982032565974, + "grad_norm": 0.7336786389350891, + "learning_rate": 9.569691300280637e-06, + "loss": 0.4849, + "step": 1023 + }, + { + "epoch": 0.287478944413251, + "grad_norm": 0.7290138602256775, + "learning_rate": 9.579045837231057e-06, + "loss": 0.4704, + "step": 1024 + }, + { + "epoch": 0.28775968556990456, + "grad_norm": 0.9773189425468445, + "learning_rate": 9.58840037418148e-06, + "loss": 0.4793, + "step": 1025 + }, + { + "epoch": 0.2880404267265581, + "grad_norm": 0.8708929419517517, + "learning_rate": 9.5977549111319e-06, + "loss": 0.4886, + "step": 1026 + }, + { + "epoch": 0.28832116788321166, + "grad_norm": 0.7952057719230652, + "learning_rate": 9.607109448082321e-06, + "loss": 0.4942, + "step": 1027 + }, + { + "epoch": 0.28860190903986527, + "grad_norm": 0.8154239058494568, + "learning_rate": 9.616463985032741e-06, + "loss": 0.5241, + "step": 1028 + }, + { + "epoch": 0.2888826501965188, + "grad_norm": 0.7335970997810364, + "learning_rate": 9.625818521983162e-06, + "loss": 0.5128, + "step": 1029 + }, + { + "epoch": 0.28916339135317237, + "grad_norm": 0.701531171798706, + "learning_rate": 9.635173058933583e-06, + "loss": 0.4949, + "step": 1030 + }, + { + "epoch": 0.2894441325098259, + "grad_norm": 0.7591395378112793, + "learning_rate": 9.644527595884005e-06, + "loss": 0.4846, + "step": 1031 + }, + { + "epoch": 0.28972487366647953, + "grad_norm": 0.8313401341438293, + "learning_rate": 9.653882132834426e-06, + "loss": 0.5033, + "step": 1032 + }, + { + "epoch": 0.2900056148231331, + "grad_norm": 0.8509560823440552, + "learning_rate": 9.663236669784846e-06, + "loss": 0.4974, + "step": 1033 + }, + { + "epoch": 0.29028635597978664, + "grad_norm": 0.7456276416778564, + "learning_rate": 9.672591206735269e-06, + "loss": 0.5008, + "step": 1034 + }, + { + "epoch": 0.2905670971364402, + "grad_norm": 0.8189723491668701, + "learning_rate": 9.681945743685688e-06, + "loss": 0.4596, + "step": 1035 + }, + { + "epoch": 0.29084783829309374, + "grad_norm": 0.791438102722168, + "learning_rate": 9.691300280636108e-06, + "loss": 0.4789, + "step": 1036 + }, + { + "epoch": 0.29112857944974735, + "grad_norm": 0.69429612159729, + "learning_rate": 9.70065481758653e-06, + "loss": 0.406, + "step": 1037 + }, + { + "epoch": 0.2914093206064009, + "grad_norm": 0.7351028919219971, + "learning_rate": 9.710009354536951e-06, + "loss": 0.4522, + "step": 1038 + }, + { + "epoch": 0.29169006176305445, + "grad_norm": 0.80379319190979, + "learning_rate": 9.719363891487372e-06, + "loss": 0.5104, + "step": 1039 + }, + { + "epoch": 0.291970802919708, + "grad_norm": 0.7888717651367188, + "learning_rate": 9.728718428437794e-06, + "loss": 0.4552, + "step": 1040 + }, + { + "epoch": 0.2922515440763616, + "grad_norm": 0.7304790616035461, + "learning_rate": 9.738072965388215e-06, + "loss": 0.4246, + "step": 1041 + }, + { + "epoch": 0.29253228523301517, + "grad_norm": 0.875408411026001, + "learning_rate": 9.747427502338635e-06, + "loss": 0.43, + "step": 1042 + }, + { + "epoch": 0.2928130263896687, + "grad_norm": 0.8990151286125183, + "learning_rate": 9.756782039289056e-06, + "loss": 0.4987, + "step": 1043 + }, + { + "epoch": 0.29309376754632227, + "grad_norm": 0.722625732421875, + "learning_rate": 9.766136576239477e-06, + "loss": 0.474, + "step": 1044 + }, + { + "epoch": 0.2933745087029759, + "grad_norm": 0.9640076756477356, + "learning_rate": 9.775491113189897e-06, + "loss": 0.4753, + "step": 1045 + }, + { + "epoch": 0.29365524985962943, + "grad_norm": 0.8131601810455322, + "learning_rate": 9.78484565014032e-06, + "loss": 0.5016, + "step": 1046 + }, + { + "epoch": 0.293935991016283, + "grad_norm": 0.7900562286376953, + "learning_rate": 9.79420018709074e-06, + "loss": 0.491, + "step": 1047 + }, + { + "epoch": 0.29421673217293653, + "grad_norm": 0.8465869426727295, + "learning_rate": 9.80355472404116e-06, + "loss": 0.4802, + "step": 1048 + }, + { + "epoch": 0.29449747332959014, + "grad_norm": 0.9690655469894409, + "learning_rate": 9.812909260991581e-06, + "loss": 0.5153, + "step": 1049 + }, + { + "epoch": 0.2947782144862437, + "grad_norm": 0.8377925753593445, + "learning_rate": 9.822263797942002e-06, + "loss": 0.4918, + "step": 1050 + }, + { + "epoch": 0.29505895564289725, + "grad_norm": 0.772736668586731, + "learning_rate": 9.831618334892423e-06, + "loss": 0.4505, + "step": 1051 + }, + { + "epoch": 0.2953396967995508, + "grad_norm": 0.9311164617538452, + "learning_rate": 9.840972871842845e-06, + "loss": 0.4547, + "step": 1052 + }, + { + "epoch": 0.2956204379562044, + "grad_norm": 0.9110825061798096, + "learning_rate": 9.850327408793266e-06, + "loss": 0.4864, + "step": 1053 + }, + { + "epoch": 0.29590117911285796, + "grad_norm": 0.8068156242370605, + "learning_rate": 9.859681945743686e-06, + "loss": 0.4701, + "step": 1054 + }, + { + "epoch": 0.2961819202695115, + "grad_norm": 0.833731472492218, + "learning_rate": 9.869036482694107e-06, + "loss": 0.455, + "step": 1055 + }, + { + "epoch": 0.29646266142616506, + "grad_norm": 0.9055677056312561, + "learning_rate": 9.87839101964453e-06, + "loss": 0.4432, + "step": 1056 + }, + { + "epoch": 0.2967434025828186, + "grad_norm": 0.9414280652999878, + "learning_rate": 9.88774555659495e-06, + "loss": 0.5132, + "step": 1057 + }, + { + "epoch": 0.2970241437394722, + "grad_norm": 0.877994179725647, + "learning_rate": 9.89710009354537e-06, + "loss": 0.5158, + "step": 1058 + }, + { + "epoch": 0.2973048848961258, + "grad_norm": 0.8954114317893982, + "learning_rate": 9.906454630495791e-06, + "loss": 0.5154, + "step": 1059 + }, + { + "epoch": 0.29758562605277933, + "grad_norm": 0.9333704113960266, + "learning_rate": 9.915809167446212e-06, + "loss": 0.5306, + "step": 1060 + }, + { + "epoch": 0.2978663672094329, + "grad_norm": 0.9117119908332825, + "learning_rate": 9.925163704396632e-06, + "loss": 0.5018, + "step": 1061 + }, + { + "epoch": 0.2981471083660865, + "grad_norm": 0.8949404358863831, + "learning_rate": 9.934518241347055e-06, + "loss": 0.4871, + "step": 1062 + }, + { + "epoch": 0.29842784952274004, + "grad_norm": 0.8439606428146362, + "learning_rate": 9.943872778297475e-06, + "loss": 0.4553, + "step": 1063 + }, + { + "epoch": 0.2987085906793936, + "grad_norm": 0.9122105836868286, + "learning_rate": 9.953227315247896e-06, + "loss": 0.4793, + "step": 1064 + }, + { + "epoch": 0.29898933183604715, + "grad_norm": 0.7889140844345093, + "learning_rate": 9.962581852198318e-06, + "loss": 0.4867, + "step": 1065 + }, + { + "epoch": 0.29927007299270075, + "grad_norm": 0.6639648079872131, + "learning_rate": 9.971936389148737e-06, + "loss": 0.476, + "step": 1066 + }, + { + "epoch": 0.2995508141493543, + "grad_norm": 0.77021324634552, + "learning_rate": 9.981290926099158e-06, + "loss": 0.4622, + "step": 1067 + }, + { + "epoch": 0.29983155530600786, + "grad_norm": 0.8649744391441345, + "learning_rate": 9.99064546304958e-06, + "loss": 0.4847, + "step": 1068 + }, + { + "epoch": 0.3001122964626614, + "grad_norm": 0.7904282808303833, + "learning_rate": 1e-05, + "loss": 0.4737, + "step": 1069 + }, + { + "epoch": 0.300393037619315, + "grad_norm": 0.7468514442443848, + "learning_rate": 9.999999733215548e-06, + "loss": 0.4923, + "step": 1070 + }, + { + "epoch": 0.30067377877596857, + "grad_norm": 0.9646374583244324, + "learning_rate": 9.999998932862217e-06, + "loss": 0.5091, + "step": 1071 + }, + { + "epoch": 0.3009545199326221, + "grad_norm": 0.8719750642776489, + "learning_rate": 9.99999759894009e-06, + "loss": 0.4564, + "step": 1072 + }, + { + "epoch": 0.3012352610892757, + "grad_norm": 0.7746025919914246, + "learning_rate": 9.999995731449315e-06, + "loss": 0.4724, + "step": 1073 + }, + { + "epoch": 0.3015160022459292, + "grad_norm": 0.7543782591819763, + "learning_rate": 9.999993330390085e-06, + "loss": 0.5004, + "step": 1074 + }, + { + "epoch": 0.30179674340258283, + "grad_norm": 0.7941534519195557, + "learning_rate": 9.999990395762663e-06, + "loss": 0.4597, + "step": 1075 + }, + { + "epoch": 0.3020774845592364, + "grad_norm": 0.8257895708084106, + "learning_rate": 9.999986927567358e-06, + "loss": 0.4847, + "step": 1076 + }, + { + "epoch": 0.30235822571588994, + "grad_norm": 0.9384962320327759, + "learning_rate": 9.999982925804541e-06, + "loss": 0.5341, + "step": 1077 + }, + { + "epoch": 0.3026389668725435, + "grad_norm": 0.9379695653915405, + "learning_rate": 9.999978390474639e-06, + "loss": 0.4905, + "step": 1078 + }, + { + "epoch": 0.3029197080291971, + "grad_norm": 0.863917350769043, + "learning_rate": 9.999973321578136e-06, + "loss": 0.4363, + "step": 1079 + }, + { + "epoch": 0.30320044918585065, + "grad_norm": 0.9006621241569519, + "learning_rate": 9.999967719115574e-06, + "loss": 0.4703, + "step": 1080 + }, + { + "epoch": 0.3034811903425042, + "grad_norm": 0.8605801463127136, + "learning_rate": 9.999961583087548e-06, + "loss": 0.4343, + "step": 1081 + }, + { + "epoch": 0.30376193149915776, + "grad_norm": 0.8910146355628967, + "learning_rate": 9.999954913494713e-06, + "loss": 0.5191, + "step": 1082 + }, + { + "epoch": 0.30404267265581136, + "grad_norm": 0.6636241674423218, + "learning_rate": 9.999947710337785e-06, + "loss": 0.4793, + "step": 1083 + }, + { + "epoch": 0.3043234138124649, + "grad_norm": 0.8792728781700134, + "learning_rate": 9.99993997361753e-06, + "loss": 0.445, + "step": 1084 + }, + { + "epoch": 0.30460415496911847, + "grad_norm": 0.7850635051727295, + "learning_rate": 9.999931703334774e-06, + "loss": 0.4996, + "step": 1085 + }, + { + "epoch": 0.304884896125772, + "grad_norm": 0.8990797996520996, + "learning_rate": 9.999922899490396e-06, + "loss": 0.502, + "step": 1086 + }, + { + "epoch": 0.30516563728242563, + "grad_norm": 0.946212649345398, + "learning_rate": 9.999913562085342e-06, + "loss": 0.4797, + "step": 1087 + }, + { + "epoch": 0.3054463784390792, + "grad_norm": 0.8670285940170288, + "learning_rate": 9.999903691120603e-06, + "loss": 0.4525, + "step": 1088 + }, + { + "epoch": 0.30572711959573273, + "grad_norm": 0.8761336207389832, + "learning_rate": 9.999893286597235e-06, + "loss": 0.5337, + "step": 1089 + }, + { + "epoch": 0.3060078607523863, + "grad_norm": 0.8910177946090698, + "learning_rate": 9.999882348516348e-06, + "loss": 0.4737, + "step": 1090 + }, + { + "epoch": 0.3062886019090399, + "grad_norm": 1.042477011680603, + "learning_rate": 9.99987087687911e-06, + "loss": 0.523, + "step": 1091 + }, + { + "epoch": 0.30656934306569344, + "grad_norm": 0.8660892844200134, + "learning_rate": 9.999858871686743e-06, + "loss": 0.4171, + "step": 1092 + }, + { + "epoch": 0.306850084222347, + "grad_norm": 0.8873071074485779, + "learning_rate": 9.999846332940528e-06, + "loss": 0.463, + "step": 1093 + }, + { + "epoch": 0.30713082537900055, + "grad_norm": 0.8817505240440369, + "learning_rate": 9.999833260641807e-06, + "loss": 0.4774, + "step": 1094 + }, + { + "epoch": 0.3074115665356541, + "grad_norm": 0.840127170085907, + "learning_rate": 9.99981965479197e-06, + "loss": 0.4881, + "step": 1095 + }, + { + "epoch": 0.3076923076923077, + "grad_norm": 0.8102737665176392, + "learning_rate": 9.999805515392473e-06, + "loss": 0.459, + "step": 1096 + }, + { + "epoch": 0.30797304884896126, + "grad_norm": 0.7922653555870056, + "learning_rate": 9.999790842444822e-06, + "loss": 0.4728, + "step": 1097 + }, + { + "epoch": 0.3082537900056148, + "grad_norm": 0.805242121219635, + "learning_rate": 9.999775635950584e-06, + "loss": 0.503, + "step": 1098 + }, + { + "epoch": 0.30853453116226837, + "grad_norm": 0.7929728031158447, + "learning_rate": 9.999759895911383e-06, + "loss": 0.4475, + "step": 1099 + }, + { + "epoch": 0.308815272318922, + "grad_norm": 0.8671475052833557, + "learning_rate": 9.999743622328895e-06, + "loss": 0.509, + "step": 1100 + }, + { + "epoch": 0.3090960134755755, + "grad_norm": 0.9393376111984253, + "learning_rate": 9.999726815204862e-06, + "loss": 0.4587, + "step": 1101 + }, + { + "epoch": 0.3093767546322291, + "grad_norm": 0.8894326686859131, + "learning_rate": 9.999709474541072e-06, + "loss": 0.513, + "step": 1102 + }, + { + "epoch": 0.30965749578888263, + "grad_norm": 0.7650179266929626, + "learning_rate": 9.99969160033938e-06, + "loss": 0.4715, + "step": 1103 + }, + { + "epoch": 0.30993823694553624, + "grad_norm": 0.7481208443641663, + "learning_rate": 9.99967319260169e-06, + "loss": 0.4853, + "step": 1104 + }, + { + "epoch": 0.3102189781021898, + "grad_norm": 0.8617684841156006, + "learning_rate": 9.999654251329967e-06, + "loss": 0.4982, + "step": 1105 + }, + { + "epoch": 0.31049971925884334, + "grad_norm": 0.6560022830963135, + "learning_rate": 9.999634776526234e-06, + "loss": 0.4079, + "step": 1106 + }, + { + "epoch": 0.3107804604154969, + "grad_norm": 0.6817672252655029, + "learning_rate": 9.999614768192569e-06, + "loss": 0.4814, + "step": 1107 + }, + { + "epoch": 0.3110612015721505, + "grad_norm": 0.8304376006126404, + "learning_rate": 9.999594226331107e-06, + "loss": 0.4919, + "step": 1108 + }, + { + "epoch": 0.31134194272880406, + "grad_norm": 0.7779669165611267, + "learning_rate": 9.999573150944039e-06, + "loss": 0.4816, + "step": 1109 + }, + { + "epoch": 0.3116226838854576, + "grad_norm": 0.7018353343009949, + "learning_rate": 9.999551542033614e-06, + "loss": 0.4522, + "step": 1110 + }, + { + "epoch": 0.31190342504211116, + "grad_norm": 0.7014132142066956, + "learning_rate": 9.999529399602139e-06, + "loss": 0.4363, + "step": 1111 + }, + { + "epoch": 0.3121841661987647, + "grad_norm": 0.7255052328109741, + "learning_rate": 9.999506723651976e-06, + "loss": 0.452, + "step": 1112 + }, + { + "epoch": 0.3124649073554183, + "grad_norm": 0.8355602025985718, + "learning_rate": 9.999483514185547e-06, + "loss": 0.4634, + "step": 1113 + }, + { + "epoch": 0.31274564851207187, + "grad_norm": 0.9734678864479065, + "learning_rate": 9.999459771205324e-06, + "loss": 0.5471, + "step": 1114 + }, + { + "epoch": 0.3130263896687254, + "grad_norm": 0.7540462613105774, + "learning_rate": 9.999435494713847e-06, + "loss": 0.4628, + "step": 1115 + }, + { + "epoch": 0.313307130825379, + "grad_norm": 0.7741772532463074, + "learning_rate": 9.999410684713701e-06, + "loss": 0.4551, + "step": 1116 + }, + { + "epoch": 0.3135878719820326, + "grad_norm": 0.9902048707008362, + "learning_rate": 9.999385341207536e-06, + "loss": 0.508, + "step": 1117 + }, + { + "epoch": 0.31386861313868614, + "grad_norm": 0.8954352736473083, + "learning_rate": 9.999359464198059e-06, + "loss": 0.4685, + "step": 1118 + }, + { + "epoch": 0.3141493542953397, + "grad_norm": 0.9386639595031738, + "learning_rate": 9.999333053688028e-06, + "loss": 0.4448, + "step": 1119 + }, + { + "epoch": 0.31443009545199324, + "grad_norm": 0.9378950595855713, + "learning_rate": 9.999306109680262e-06, + "loss": 0.4827, + "step": 1120 + }, + { + "epoch": 0.31471083660864685, + "grad_norm": 0.8391536474227905, + "learning_rate": 9.999278632177635e-06, + "loss": 0.4895, + "step": 1121 + }, + { + "epoch": 0.3149915777653004, + "grad_norm": 0.8170937895774841, + "learning_rate": 9.999250621183083e-06, + "loss": 0.4735, + "step": 1122 + }, + { + "epoch": 0.31527231892195395, + "grad_norm": 0.8841730356216431, + "learning_rate": 9.999222076699593e-06, + "loss": 0.4549, + "step": 1123 + }, + { + "epoch": 0.3155530600786075, + "grad_norm": 0.9412198066711426, + "learning_rate": 9.999192998730211e-06, + "loss": 0.4064, + "step": 1124 + }, + { + "epoch": 0.3158338012352611, + "grad_norm": 0.835308313369751, + "learning_rate": 9.999163387278039e-06, + "loss": 0.4729, + "step": 1125 + }, + { + "epoch": 0.31611454239191467, + "grad_norm": 0.7908468842506409, + "learning_rate": 9.999133242346239e-06, + "loss": 0.4466, + "step": 1126 + }, + { + "epoch": 0.3163952835485682, + "grad_norm": 0.881558358669281, + "learning_rate": 9.999102563938025e-06, + "loss": 0.5225, + "step": 1127 + }, + { + "epoch": 0.31667602470522177, + "grad_norm": 0.85551917552948, + "learning_rate": 9.999071352056676e-06, + "loss": 0.5211, + "step": 1128 + }, + { + "epoch": 0.3169567658618754, + "grad_norm": 0.7841977477073669, + "learning_rate": 9.999039606705516e-06, + "loss": 0.4472, + "step": 1129 + }, + { + "epoch": 0.31723750701852893, + "grad_norm": 0.9335364699363708, + "learning_rate": 9.999007327887939e-06, + "loss": 0.5199, + "step": 1130 + }, + { + "epoch": 0.3175182481751825, + "grad_norm": 0.9114176034927368, + "learning_rate": 9.998974515607384e-06, + "loss": 0.4696, + "step": 1131 + }, + { + "epoch": 0.31779898933183603, + "grad_norm": 0.9091213941574097, + "learning_rate": 9.998941169867357e-06, + "loss": 0.4742, + "step": 1132 + }, + { + "epoch": 0.3180797304884896, + "grad_norm": 1.1805241107940674, + "learning_rate": 9.998907290671415e-06, + "loss": 0.5277, + "step": 1133 + }, + { + "epoch": 0.3183604716451432, + "grad_norm": 1.036816120147705, + "learning_rate": 9.998872878023174e-06, + "loss": 0.4831, + "step": 1134 + }, + { + "epoch": 0.31864121280179675, + "grad_norm": 0.8304394483566284, + "learning_rate": 9.998837931926304e-06, + "loss": 0.4965, + "step": 1135 + }, + { + "epoch": 0.3189219539584503, + "grad_norm": 0.782892644405365, + "learning_rate": 9.998802452384536e-06, + "loss": 0.4742, + "step": 1136 + }, + { + "epoch": 0.31920269511510385, + "grad_norm": 0.9833454489707947, + "learning_rate": 9.998766439401655e-06, + "loss": 0.429, + "step": 1137 + }, + { + "epoch": 0.31948343627175746, + "grad_norm": 0.996681809425354, + "learning_rate": 9.998729892981505e-06, + "loss": 0.4632, + "step": 1138 + }, + { + "epoch": 0.319764177428411, + "grad_norm": 0.7468957901000977, + "learning_rate": 9.998692813127986e-06, + "loss": 0.4746, + "step": 1139 + }, + { + "epoch": 0.32004491858506456, + "grad_norm": 0.9790645837783813, + "learning_rate": 9.998655199845055e-06, + "loss": 0.5058, + "step": 1140 + }, + { + "epoch": 0.3203256597417181, + "grad_norm": 0.8107130527496338, + "learning_rate": 9.998617053136726e-06, + "loss": 0.4848, + "step": 1141 + }, + { + "epoch": 0.3206064008983717, + "grad_norm": 0.8455003499984741, + "learning_rate": 9.998578373007068e-06, + "loss": 0.494, + "step": 1142 + }, + { + "epoch": 0.3208871420550253, + "grad_norm": 0.9056618809700012, + "learning_rate": 9.998539159460213e-06, + "loss": 0.5063, + "step": 1143 + }, + { + "epoch": 0.32116788321167883, + "grad_norm": 0.7718350887298584, + "learning_rate": 9.998499412500339e-06, + "loss": 0.4429, + "step": 1144 + }, + { + "epoch": 0.3214486243683324, + "grad_norm": 0.8149054646492004, + "learning_rate": 9.998459132131695e-06, + "loss": 0.4876, + "step": 1145 + }, + { + "epoch": 0.321729365524986, + "grad_norm": 0.9570802450180054, + "learning_rate": 9.998418318358573e-06, + "loss": 0.5506, + "step": 1146 + }, + { + "epoch": 0.32201010668163954, + "grad_norm": 0.7342217564582825, + "learning_rate": 9.998376971185333e-06, + "loss": 0.5012, + "step": 1147 + }, + { + "epoch": 0.3222908478382931, + "grad_norm": 0.7465083003044128, + "learning_rate": 9.998335090616384e-06, + "loss": 0.4983, + "step": 1148 + }, + { + "epoch": 0.32257158899494665, + "grad_norm": 0.8385224342346191, + "learning_rate": 9.998292676656199e-06, + "loss": 0.4948, + "step": 1149 + }, + { + "epoch": 0.3228523301516002, + "grad_norm": 0.7445923686027527, + "learning_rate": 9.998249729309299e-06, + "loss": 0.5014, + "step": 1150 + }, + { + "epoch": 0.3231330713082538, + "grad_norm": 0.8759795427322388, + "learning_rate": 9.998206248580272e-06, + "loss": 0.5285, + "step": 1151 + }, + { + "epoch": 0.32341381246490736, + "grad_norm": 0.9513266682624817, + "learning_rate": 9.998162234473756e-06, + "loss": 0.5295, + "step": 1152 + }, + { + "epoch": 0.3236945536215609, + "grad_norm": 0.763493537902832, + "learning_rate": 9.998117686994446e-06, + "loss": 0.4679, + "step": 1153 + }, + { + "epoch": 0.32397529477821446, + "grad_norm": 0.8014282584190369, + "learning_rate": 9.9980726061471e-06, + "loss": 0.4575, + "step": 1154 + }, + { + "epoch": 0.32425603593486807, + "grad_norm": 0.8929340243339539, + "learning_rate": 9.998026991936525e-06, + "loss": 0.4627, + "step": 1155 + }, + { + "epoch": 0.3245367770915216, + "grad_norm": 0.9553277492523193, + "learning_rate": 9.99798084436759e-06, + "loss": 0.4817, + "step": 1156 + }, + { + "epoch": 0.3248175182481752, + "grad_norm": 0.7893467545509338, + "learning_rate": 9.99793416344522e-06, + "loss": 0.4743, + "step": 1157 + }, + { + "epoch": 0.3250982594048287, + "grad_norm": 0.8612390756607056, + "learning_rate": 9.997886949174397e-06, + "loss": 0.5213, + "step": 1158 + }, + { + "epoch": 0.32537900056148233, + "grad_norm": 0.77967369556427, + "learning_rate": 9.997839201560158e-06, + "loss": 0.4631, + "step": 1159 + }, + { + "epoch": 0.3256597417181359, + "grad_norm": 0.7877880930900574, + "learning_rate": 9.997790920607597e-06, + "loss": 0.5018, + "step": 1160 + }, + { + "epoch": 0.32594048287478944, + "grad_norm": 0.7800800204277039, + "learning_rate": 9.99774210632187e-06, + "loss": 0.4838, + "step": 1161 + }, + { + "epoch": 0.326221224031443, + "grad_norm": 0.8173455595970154, + "learning_rate": 9.997692758708186e-06, + "loss": 0.453, + "step": 1162 + }, + { + "epoch": 0.3265019651880966, + "grad_norm": 0.7861931324005127, + "learning_rate": 9.997642877771807e-06, + "loss": 0.5204, + "step": 1163 + }, + { + "epoch": 0.32678270634475015, + "grad_norm": 0.7018530368804932, + "learning_rate": 9.997592463518059e-06, + "loss": 0.4835, + "step": 1164 + }, + { + "epoch": 0.3270634475014037, + "grad_norm": 0.8089879751205444, + "learning_rate": 9.997541515952321e-06, + "loss": 0.4532, + "step": 1165 + }, + { + "epoch": 0.32734418865805726, + "grad_norm": 0.7837215662002563, + "learning_rate": 9.99749003508003e-06, + "loss": 0.4317, + "step": 1166 + }, + { + "epoch": 0.32762492981471086, + "grad_norm": 0.8741607069969177, + "learning_rate": 9.99743802090668e-06, + "loss": 0.5138, + "step": 1167 + }, + { + "epoch": 0.3279056709713644, + "grad_norm": 0.7507151365280151, + "learning_rate": 9.997385473437822e-06, + "loss": 0.4047, + "step": 1168 + }, + { + "epoch": 0.32818641212801797, + "grad_norm": 0.8245847821235657, + "learning_rate": 9.997332392679063e-06, + "loss": 0.5032, + "step": 1169 + }, + { + "epoch": 0.3284671532846715, + "grad_norm": 0.905717134475708, + "learning_rate": 9.997278778636067e-06, + "loss": 0.4344, + "step": 1170 + }, + { + "epoch": 0.3287478944413251, + "grad_norm": 0.7550010085105896, + "learning_rate": 9.997224631314556e-06, + "loss": 0.4952, + "step": 1171 + }, + { + "epoch": 0.3290286355979787, + "grad_norm": 0.8762251734733582, + "learning_rate": 9.997169950720307e-06, + "loss": 0.4596, + "step": 1172 + }, + { + "epoch": 0.32930937675463223, + "grad_norm": 0.7687079310417175, + "learning_rate": 9.997114736859158e-06, + "loss": 0.4804, + "step": 1173 + }, + { + "epoch": 0.3295901179112858, + "grad_norm": 1.1469238996505737, + "learning_rate": 9.997058989736997e-06, + "loss": 0.5505, + "step": 1174 + }, + { + "epoch": 0.32987085906793934, + "grad_norm": 0.9569076299667358, + "learning_rate": 9.997002709359776e-06, + "loss": 0.4758, + "step": 1175 + }, + { + "epoch": 0.33015160022459294, + "grad_norm": 0.8654143214225769, + "learning_rate": 9.9969458957335e-06, + "loss": 0.5084, + "step": 1176 + }, + { + "epoch": 0.3304323413812465, + "grad_norm": 0.84196537733078, + "learning_rate": 9.996888548864234e-06, + "loss": 0.4747, + "step": 1177 + }, + { + "epoch": 0.33071308253790005, + "grad_norm": 0.7889792323112488, + "learning_rate": 9.996830668758095e-06, + "loss": 0.4473, + "step": 1178 + }, + { + "epoch": 0.3309938236945536, + "grad_norm": 0.7202876806259155, + "learning_rate": 9.99677225542126e-06, + "loss": 0.5425, + "step": 1179 + }, + { + "epoch": 0.3312745648512072, + "grad_norm": 0.6800077557563782, + "learning_rate": 9.99671330885996e-06, + "loss": 0.4971, + "step": 1180 + }, + { + "epoch": 0.33155530600786076, + "grad_norm": 0.8296113014221191, + "learning_rate": 9.996653829080492e-06, + "loss": 0.4586, + "step": 1181 + }, + { + "epoch": 0.3318360471645143, + "grad_norm": 0.7130815386772156, + "learning_rate": 9.996593816089197e-06, + "loss": 0.478, + "step": 1182 + }, + { + "epoch": 0.33211678832116787, + "grad_norm": 0.8269796967506409, + "learning_rate": 9.996533269892483e-06, + "loss": 0.4493, + "step": 1183 + }, + { + "epoch": 0.3323975294778215, + "grad_norm": 0.8497306704521179, + "learning_rate": 9.99647219049681e-06, + "loss": 0.4777, + "step": 1184 + }, + { + "epoch": 0.332678270634475, + "grad_norm": 0.7915348410606384, + "learning_rate": 9.996410577908695e-06, + "loss": 0.4862, + "step": 1185 + }, + { + "epoch": 0.3329590117911286, + "grad_norm": 0.8377079367637634, + "learning_rate": 9.996348432134714e-06, + "loss": 0.4927, + "step": 1186 + }, + { + "epoch": 0.33323975294778213, + "grad_norm": 0.8476775884628296, + "learning_rate": 9.996285753181499e-06, + "loss": 0.5068, + "step": 1187 + }, + { + "epoch": 0.3335204941044357, + "grad_norm": 0.8414422869682312, + "learning_rate": 9.996222541055739e-06, + "loss": 0.448, + "step": 1188 + }, + { + "epoch": 0.3338012352610893, + "grad_norm": 0.8037646412849426, + "learning_rate": 9.996158795764177e-06, + "loss": 0.4817, + "step": 1189 + }, + { + "epoch": 0.33408197641774284, + "grad_norm": 0.8846210241317749, + "learning_rate": 9.996094517313618e-06, + "loss": 0.4698, + "step": 1190 + }, + { + "epoch": 0.3343627175743964, + "grad_norm": 0.8361542224884033, + "learning_rate": 9.996029705710921e-06, + "loss": 0.4929, + "step": 1191 + }, + { + "epoch": 0.33464345873104995, + "grad_norm": 0.8588730096817017, + "learning_rate": 9.995964360963003e-06, + "loss": 0.48, + "step": 1192 + }, + { + "epoch": 0.33492419988770356, + "grad_norm": 0.8661556839942932, + "learning_rate": 9.995898483076835e-06, + "loss": 0.5238, + "step": 1193 + }, + { + "epoch": 0.3352049410443571, + "grad_norm": 0.9000097513198853, + "learning_rate": 9.995832072059449e-06, + "loss": 0.5252, + "step": 1194 + }, + { + "epoch": 0.33548568220101066, + "grad_norm": 0.8409597873687744, + "learning_rate": 9.99576512791793e-06, + "loss": 0.5082, + "step": 1195 + }, + { + "epoch": 0.3357664233576642, + "grad_norm": 0.7277405261993408, + "learning_rate": 9.995697650659426e-06, + "loss": 0.5317, + "step": 1196 + }, + { + "epoch": 0.3360471645143178, + "grad_norm": 0.7315642833709717, + "learning_rate": 9.995629640291132e-06, + "loss": 0.4866, + "step": 1197 + }, + { + "epoch": 0.33632790567097137, + "grad_norm": 0.7630830407142639, + "learning_rate": 9.995561096820309e-06, + "loss": 0.497, + "step": 1198 + }, + { + "epoch": 0.3366086468276249, + "grad_norm": 0.7109522819519043, + "learning_rate": 9.995492020254271e-06, + "loss": 0.5072, + "step": 1199 + }, + { + "epoch": 0.3368893879842785, + "grad_norm": 0.7166508436203003, + "learning_rate": 9.995422410600391e-06, + "loss": 0.4433, + "step": 1200 + }, + { + "epoch": 0.3371701291409321, + "grad_norm": 0.9349340796470642, + "learning_rate": 9.995352267866095e-06, + "loss": 0.4625, + "step": 1201 + }, + { + "epoch": 0.33745087029758564, + "grad_norm": 0.702035665512085, + "learning_rate": 9.99528159205887e-06, + "loss": 0.4314, + "step": 1202 + }, + { + "epoch": 0.3377316114542392, + "grad_norm": 0.7309128046035767, + "learning_rate": 9.995210383186256e-06, + "loss": 0.4176, + "step": 1203 + }, + { + "epoch": 0.33801235261089274, + "grad_norm": 0.8737656474113464, + "learning_rate": 9.995138641255853e-06, + "loss": 0.5018, + "step": 1204 + }, + { + "epoch": 0.33829309376754635, + "grad_norm": 0.888888955116272, + "learning_rate": 9.995066366275317e-06, + "loss": 0.5339, + "step": 1205 + }, + { + "epoch": 0.3385738349241999, + "grad_norm": 0.8548439741134644, + "learning_rate": 9.99499355825236e-06, + "loss": 0.4878, + "step": 1206 + }, + { + "epoch": 0.33885457608085345, + "grad_norm": 0.9180452823638916, + "learning_rate": 9.994920217194755e-06, + "loss": 0.5312, + "step": 1207 + }, + { + "epoch": 0.339135317237507, + "grad_norm": 0.8303162455558777, + "learning_rate": 9.994846343110323e-06, + "loss": 0.4159, + "step": 1208 + }, + { + "epoch": 0.33941605839416056, + "grad_norm": 0.9149907231330872, + "learning_rate": 9.99477193600695e-06, + "loss": 0.4983, + "step": 1209 + }, + { + "epoch": 0.33969679955081417, + "grad_norm": 0.9785766005516052, + "learning_rate": 9.99469699589258e-06, + "loss": 0.4575, + "step": 1210 + }, + { + "epoch": 0.3399775407074677, + "grad_norm": 0.8752284049987793, + "learning_rate": 9.994621522775201e-06, + "loss": 0.4245, + "step": 1211 + }, + { + "epoch": 0.34025828186412127, + "grad_norm": 0.8960926532745361, + "learning_rate": 9.994545516662876e-06, + "loss": 0.4675, + "step": 1212 + }, + { + "epoch": 0.3405390230207748, + "grad_norm": 0.7819463014602661, + "learning_rate": 9.994468977563712e-06, + "loss": 0.5165, + "step": 1213 + }, + { + "epoch": 0.34081976417742843, + "grad_norm": 0.8910586833953857, + "learning_rate": 9.994391905485879e-06, + "loss": 0.4834, + "step": 1214 + }, + { + "epoch": 0.341100505334082, + "grad_norm": 0.7877460718154907, + "learning_rate": 9.994314300437598e-06, + "loss": 0.4403, + "step": 1215 + }, + { + "epoch": 0.34138124649073553, + "grad_norm": 0.7668460011482239, + "learning_rate": 9.994236162427152e-06, + "loss": 0.518, + "step": 1216 + }, + { + "epoch": 0.3416619876473891, + "grad_norm": 0.9465773105621338, + "learning_rate": 9.99415749146288e-06, + "loss": 0.4881, + "step": 1217 + }, + { + "epoch": 0.3419427288040427, + "grad_norm": 0.9357730746269226, + "learning_rate": 9.994078287553179e-06, + "loss": 0.4611, + "step": 1218 + }, + { + "epoch": 0.34222346996069625, + "grad_norm": 0.7182336449623108, + "learning_rate": 9.993998550706498e-06, + "loss": 0.4418, + "step": 1219 + }, + { + "epoch": 0.3425042111173498, + "grad_norm": 1.0413166284561157, + "learning_rate": 9.993918280931347e-06, + "loss": 0.4809, + "step": 1220 + }, + { + "epoch": 0.34278495227400335, + "grad_norm": 0.9022256731987, + "learning_rate": 9.993837478236293e-06, + "loss": 0.5131, + "step": 1221 + }, + { + "epoch": 0.34306569343065696, + "grad_norm": 0.8306872248649597, + "learning_rate": 9.993756142629957e-06, + "loss": 0.4447, + "step": 1222 + }, + { + "epoch": 0.3433464345873105, + "grad_norm": 0.8597789406776428, + "learning_rate": 9.993674274121018e-06, + "loss": 0.4933, + "step": 1223 + }, + { + "epoch": 0.34362717574396406, + "grad_norm": 0.8198334574699402, + "learning_rate": 9.993591872718218e-06, + "loss": 0.4617, + "step": 1224 + }, + { + "epoch": 0.3439079169006176, + "grad_norm": 0.8512240052223206, + "learning_rate": 9.993508938430344e-06, + "loss": 0.5089, + "step": 1225 + }, + { + "epoch": 0.34418865805727117, + "grad_norm": 0.8421564102172852, + "learning_rate": 9.99342547126625e-06, + "loss": 0.4637, + "step": 1226 + }, + { + "epoch": 0.3444693992139248, + "grad_norm": 0.7788011431694031, + "learning_rate": 9.99334147123484e-06, + "loss": 0.511, + "step": 1227 + }, + { + "epoch": 0.34475014037057833, + "grad_norm": 0.7721831202507019, + "learning_rate": 9.993256938345082e-06, + "loss": 0.4678, + "step": 1228 + }, + { + "epoch": 0.3450308815272319, + "grad_norm": 0.946810781955719, + "learning_rate": 9.993171872605992e-06, + "loss": 0.4807, + "step": 1229 + }, + { + "epoch": 0.34531162268388543, + "grad_norm": 0.818935215473175, + "learning_rate": 9.99308627402665e-06, + "loss": 0.4654, + "step": 1230 + }, + { + "epoch": 0.34559236384053904, + "grad_norm": 0.7893509864807129, + "learning_rate": 9.993000142616193e-06, + "loss": 0.489, + "step": 1231 + }, + { + "epoch": 0.3458731049971926, + "grad_norm": 0.8334790468215942, + "learning_rate": 9.99291347838381e-06, + "loss": 0.518, + "step": 1232 + }, + { + "epoch": 0.34615384615384615, + "grad_norm": 0.8282754421234131, + "learning_rate": 9.99282628133875e-06, + "loss": 0.4254, + "step": 1233 + }, + { + "epoch": 0.3464345873104997, + "grad_norm": 0.8387258052825928, + "learning_rate": 9.992738551490315e-06, + "loss": 0.477, + "step": 1234 + }, + { + "epoch": 0.3467153284671533, + "grad_norm": 0.8990135788917542, + "learning_rate": 9.99265028884787e-06, + "loss": 0.4903, + "step": 1235 + }, + { + "epoch": 0.34699606962380686, + "grad_norm": 0.751071572303772, + "learning_rate": 9.992561493420835e-06, + "loss": 0.4891, + "step": 1236 + }, + { + "epoch": 0.3472768107804604, + "grad_norm": 0.8329818248748779, + "learning_rate": 9.992472165218685e-06, + "loss": 0.4593, + "step": 1237 + }, + { + "epoch": 0.34755755193711396, + "grad_norm": 0.8205108642578125, + "learning_rate": 9.99238230425095e-06, + "loss": 0.4817, + "step": 1238 + }, + { + "epoch": 0.34783829309376757, + "grad_norm": 1.082516074180603, + "learning_rate": 9.992291910527223e-06, + "loss": 0.5169, + "step": 1239 + }, + { + "epoch": 0.3481190342504211, + "grad_norm": 0.7027360200881958, + "learning_rate": 9.992200984057146e-06, + "loss": 0.4638, + "step": 1240 + }, + { + "epoch": 0.3483997754070747, + "grad_norm": 0.7534340023994446, + "learning_rate": 9.992109524850423e-06, + "loss": 0.4388, + "step": 1241 + }, + { + "epoch": 0.3486805165637282, + "grad_norm": 0.8990961909294128, + "learning_rate": 9.99201753291682e-06, + "loss": 0.5136, + "step": 1242 + }, + { + "epoch": 0.34896125772038183, + "grad_norm": 0.8145226240158081, + "learning_rate": 9.991925008266145e-06, + "loss": 0.4261, + "step": 1243 + }, + { + "epoch": 0.3492419988770354, + "grad_norm": 0.8352614641189575, + "learning_rate": 9.991831950908278e-06, + "loss": 0.4867, + "step": 1244 + }, + { + "epoch": 0.34952274003368894, + "grad_norm": 0.7495833039283752, + "learning_rate": 9.991738360853147e-06, + "loss": 0.4323, + "step": 1245 + }, + { + "epoch": 0.3498034811903425, + "grad_norm": 0.8435266613960266, + "learning_rate": 9.991644238110741e-06, + "loss": 0.4659, + "step": 1246 + }, + { + "epoch": 0.35008422234699604, + "grad_norm": 0.8464834094047546, + "learning_rate": 9.9915495826911e-06, + "loss": 0.5104, + "step": 1247 + }, + { + "epoch": 0.35036496350364965, + "grad_norm": 0.9711482524871826, + "learning_rate": 9.99145439460433e-06, + "loss": 0.5038, + "step": 1248 + }, + { + "epoch": 0.3506457046603032, + "grad_norm": 1.0533941984176636, + "learning_rate": 9.991358673860586e-06, + "loss": 0.4762, + "step": 1249 + }, + { + "epoch": 0.35092644581695676, + "grad_norm": 0.774887204170227, + "learning_rate": 9.991262420470086e-06, + "loss": 0.4886, + "step": 1250 + }, + { + "epoch": 0.3512071869736103, + "grad_norm": 0.9104116559028625, + "learning_rate": 9.991165634443095e-06, + "loss": 0.4336, + "step": 1251 + }, + { + "epoch": 0.3514879281302639, + "grad_norm": 0.8911571502685547, + "learning_rate": 9.991068315789947e-06, + "loss": 0.4751, + "step": 1252 + }, + { + "epoch": 0.35176866928691747, + "grad_norm": 0.7602444887161255, + "learning_rate": 9.990970464521026e-06, + "loss": 0.4601, + "step": 1253 + }, + { + "epoch": 0.352049410443571, + "grad_norm": 0.9924885034561157, + "learning_rate": 9.990872080646774e-06, + "loss": 0.5201, + "step": 1254 + }, + { + "epoch": 0.3523301516002246, + "grad_norm": 0.748501181602478, + "learning_rate": 9.99077316417769e-06, + "loss": 0.4751, + "step": 1255 + }, + { + "epoch": 0.3526108927568782, + "grad_norm": 0.8485338687896729, + "learning_rate": 9.990673715124329e-06, + "loss": 0.5258, + "step": 1256 + }, + { + "epoch": 0.35289163391353173, + "grad_norm": 0.9282192587852478, + "learning_rate": 9.990573733497305e-06, + "loss": 0.4783, + "step": 1257 + }, + { + "epoch": 0.3531723750701853, + "grad_norm": 0.7793827652931213, + "learning_rate": 9.990473219307286e-06, + "loss": 0.5003, + "step": 1258 + }, + { + "epoch": 0.35345311622683884, + "grad_norm": 0.8053379058837891, + "learning_rate": 9.990372172564998e-06, + "loss": 0.4637, + "step": 1259 + }, + { + "epoch": 0.35373385738349244, + "grad_norm": 0.8316372632980347, + "learning_rate": 9.990270593281225e-06, + "loss": 0.4892, + "step": 1260 + }, + { + "epoch": 0.354014598540146, + "grad_norm": 0.8421614766120911, + "learning_rate": 9.990168481466806e-06, + "loss": 0.5076, + "step": 1261 + }, + { + "epoch": 0.35429533969679955, + "grad_norm": 0.7738340497016907, + "learning_rate": 9.99006583713264e-06, + "loss": 0.501, + "step": 1262 + }, + { + "epoch": 0.3545760808534531, + "grad_norm": 0.7720453143119812, + "learning_rate": 9.989962660289679e-06, + "loss": 0.4427, + "step": 1263 + }, + { + "epoch": 0.35485682201010665, + "grad_norm": 0.6586256623268127, + "learning_rate": 9.989858950948934e-06, + "loss": 0.4455, + "step": 1264 + }, + { + "epoch": 0.35513756316676026, + "grad_norm": 0.8287173509597778, + "learning_rate": 9.98975470912147e-06, + "loss": 0.4571, + "step": 1265 + }, + { + "epoch": 0.3554183043234138, + "grad_norm": 0.7584671378135681, + "learning_rate": 9.989649934818413e-06, + "loss": 0.4747, + "step": 1266 + }, + { + "epoch": 0.35569904548006737, + "grad_norm": 0.872772216796875, + "learning_rate": 9.989544628050944e-06, + "loss": 0.518, + "step": 1267 + }, + { + "epoch": 0.3559797866367209, + "grad_norm": 0.789007842540741, + "learning_rate": 9.9894387888303e-06, + "loss": 0.4963, + "step": 1268 + }, + { + "epoch": 0.3562605277933745, + "grad_norm": 0.9237364530563354, + "learning_rate": 9.989332417167776e-06, + "loss": 0.5423, + "step": 1269 + }, + { + "epoch": 0.3565412689500281, + "grad_norm": 0.793707013130188, + "learning_rate": 9.989225513074723e-06, + "loss": 0.5002, + "step": 1270 + }, + { + "epoch": 0.35682201010668163, + "grad_norm": 0.8023290634155273, + "learning_rate": 9.989118076562549e-06, + "loss": 0.5043, + "step": 1271 + }, + { + "epoch": 0.3571027512633352, + "grad_norm": 0.6765355467796326, + "learning_rate": 9.989010107642718e-06, + "loss": 0.4761, + "step": 1272 + }, + { + "epoch": 0.3573834924199888, + "grad_norm": 0.8727385997772217, + "learning_rate": 9.988901606326756e-06, + "loss": 0.4965, + "step": 1273 + }, + { + "epoch": 0.35766423357664234, + "grad_norm": 0.835047721862793, + "learning_rate": 9.988792572626236e-06, + "loss": 0.4792, + "step": 1274 + }, + { + "epoch": 0.3579449747332959, + "grad_norm": 0.7939520478248596, + "learning_rate": 9.988683006552796e-06, + "loss": 0.4364, + "step": 1275 + }, + { + "epoch": 0.35822571588994945, + "grad_norm": 0.876715898513794, + "learning_rate": 9.988572908118129e-06, + "loss": 0.4847, + "step": 1276 + }, + { + "epoch": 0.35850645704660306, + "grad_norm": 0.8438725471496582, + "learning_rate": 9.988462277333983e-06, + "loss": 0.483, + "step": 1277 + }, + { + "epoch": 0.3587871982032566, + "grad_norm": 0.804371178150177, + "learning_rate": 9.988351114212163e-06, + "loss": 0.4601, + "step": 1278 + }, + { + "epoch": 0.35906793935991016, + "grad_norm": 0.7393758296966553, + "learning_rate": 9.988239418764534e-06, + "loss": 0.4858, + "step": 1279 + }, + { + "epoch": 0.3593486805165637, + "grad_norm": 0.7303123474121094, + "learning_rate": 9.988127191003011e-06, + "loss": 0.4427, + "step": 1280 + }, + { + "epoch": 0.3596294216732173, + "grad_norm": 0.7270464897155762, + "learning_rate": 9.988014430939577e-06, + "loss": 0.4911, + "step": 1281 + }, + { + "epoch": 0.35991016282987087, + "grad_norm": 0.7835894227027893, + "learning_rate": 9.98790113858626e-06, + "loss": 0.4954, + "step": 1282 + }, + { + "epoch": 0.3601909039865244, + "grad_norm": 0.8116075396537781, + "learning_rate": 9.987787313955151e-06, + "loss": 0.4852, + "step": 1283 + }, + { + "epoch": 0.360471645143178, + "grad_norm": 0.8052205443382263, + "learning_rate": 9.987672957058398e-06, + "loss": 0.4868, + "step": 1284 + }, + { + "epoch": 0.36075238629983153, + "grad_norm": 0.6934790015220642, + "learning_rate": 9.987558067908203e-06, + "loss": 0.4788, + "step": 1285 + }, + { + "epoch": 0.36103312745648514, + "grad_norm": 0.876417875289917, + "learning_rate": 9.987442646516825e-06, + "loss": 0.4554, + "step": 1286 + }, + { + "epoch": 0.3613138686131387, + "grad_norm": 0.7848859429359436, + "learning_rate": 9.987326692896584e-06, + "loss": 0.492, + "step": 1287 + }, + { + "epoch": 0.36159460976979224, + "grad_norm": 0.7674251794815063, + "learning_rate": 9.987210207059852e-06, + "loss": 0.4928, + "step": 1288 + }, + { + "epoch": 0.3618753509264458, + "grad_norm": 0.7805658578872681, + "learning_rate": 9.987093189019058e-06, + "loss": 0.4764, + "step": 1289 + }, + { + "epoch": 0.3621560920830994, + "grad_norm": 0.7619417309761047, + "learning_rate": 9.986975638786696e-06, + "loss": 0.4562, + "step": 1290 + }, + { + "epoch": 0.36243683323975295, + "grad_norm": 0.7314834594726562, + "learning_rate": 9.986857556375302e-06, + "loss": 0.4478, + "step": 1291 + }, + { + "epoch": 0.3627175743964065, + "grad_norm": 0.7276840209960938, + "learning_rate": 9.986738941797482e-06, + "loss": 0.4287, + "step": 1292 + }, + { + "epoch": 0.36299831555306006, + "grad_norm": 0.8957933783531189, + "learning_rate": 9.986619795065894e-06, + "loss": 0.519, + "step": 1293 + }, + { + "epoch": 0.36327905670971367, + "grad_norm": 0.7751873731613159, + "learning_rate": 9.986500116193249e-06, + "loss": 0.4899, + "step": 1294 + }, + { + "epoch": 0.3635597978663672, + "grad_norm": 0.9643718600273132, + "learning_rate": 9.986379905192322e-06, + "loss": 0.5239, + "step": 1295 + }, + { + "epoch": 0.36384053902302077, + "grad_norm": 0.8136892318725586, + "learning_rate": 9.98625916207594e-06, + "loss": 0.4314, + "step": 1296 + }, + { + "epoch": 0.3641212801796743, + "grad_norm": 1.0789484977722168, + "learning_rate": 9.986137886856988e-06, + "loss": 0.5421, + "step": 1297 + }, + { + "epoch": 0.36440202133632793, + "grad_norm": 0.8058463335037231, + "learning_rate": 9.986016079548406e-06, + "loss": 0.4646, + "step": 1298 + }, + { + "epoch": 0.3646827624929815, + "grad_norm": 0.8146299719810486, + "learning_rate": 9.985893740163195e-06, + "loss": 0.4648, + "step": 1299 + }, + { + "epoch": 0.36496350364963503, + "grad_norm": 0.8253329992294312, + "learning_rate": 9.985770868714409e-06, + "loss": 0.5193, + "step": 1300 + }, + { + "epoch": 0.3652442448062886, + "grad_norm": 0.7895228266716003, + "learning_rate": 9.98564746521516e-06, + "loss": 0.4948, + "step": 1301 + }, + { + "epoch": 0.3655249859629422, + "grad_norm": 0.8136559128761292, + "learning_rate": 9.985523529678617e-06, + "loss": 0.4576, + "step": 1302 + }, + { + "epoch": 0.36580572711959575, + "grad_norm": 0.7098286747932434, + "learning_rate": 9.985399062118006e-06, + "loss": 0.4374, + "step": 1303 + }, + { + "epoch": 0.3660864682762493, + "grad_norm": 0.7789715528488159, + "learning_rate": 9.98527406254661e-06, + "loss": 0.4676, + "step": 1304 + }, + { + "epoch": 0.36636720943290285, + "grad_norm": 0.766231894493103, + "learning_rate": 9.985148530977767e-06, + "loss": 0.4915, + "step": 1305 + }, + { + "epoch": 0.3666479505895564, + "grad_norm": 0.8277270197868347, + "learning_rate": 9.985022467424873e-06, + "loss": 0.508, + "step": 1306 + }, + { + "epoch": 0.36692869174621, + "grad_norm": 0.810585081577301, + "learning_rate": 9.984895871901382e-06, + "loss": 0.4925, + "step": 1307 + }, + { + "epoch": 0.36720943290286356, + "grad_norm": 0.7740399837493896, + "learning_rate": 9.984768744420802e-06, + "loss": 0.502, + "step": 1308 + }, + { + "epoch": 0.3674901740595171, + "grad_norm": 0.820045530796051, + "learning_rate": 9.9846410849967e-06, + "loss": 0.517, + "step": 1309 + }, + { + "epoch": 0.36777091521617067, + "grad_norm": 0.873641848564148, + "learning_rate": 9.984512893642699e-06, + "loss": 0.5007, + "step": 1310 + }, + { + "epoch": 0.3680516563728243, + "grad_norm": 1.0452135801315308, + "learning_rate": 9.984384170372478e-06, + "loss": 0.5194, + "step": 1311 + }, + { + "epoch": 0.36833239752947783, + "grad_norm": 0.7366434335708618, + "learning_rate": 9.984254915199773e-06, + "loss": 0.4454, + "step": 1312 + }, + { + "epoch": 0.3686131386861314, + "grad_norm": 0.7293533086776733, + "learning_rate": 9.98412512813838e-06, + "loss": 0.4487, + "step": 1313 + }, + { + "epoch": 0.36889387984278493, + "grad_norm": 0.9081177115440369, + "learning_rate": 9.983994809202148e-06, + "loss": 0.4958, + "step": 1314 + }, + { + "epoch": 0.36917462099943854, + "grad_norm": 0.9251126646995544, + "learning_rate": 9.983863958404983e-06, + "loss": 0.4455, + "step": 1315 + }, + { + "epoch": 0.3694553621560921, + "grad_norm": 1.0090391635894775, + "learning_rate": 9.983732575760849e-06, + "loss": 0.426, + "step": 1316 + }, + { + "epoch": 0.36973610331274565, + "grad_norm": 0.7077220678329468, + "learning_rate": 9.983600661283766e-06, + "loss": 0.4615, + "step": 1317 + }, + { + "epoch": 0.3700168444693992, + "grad_norm": 0.8807445168495178, + "learning_rate": 9.983468214987812e-06, + "loss": 0.5233, + "step": 1318 + }, + { + "epoch": 0.3702975856260528, + "grad_norm": 0.7944371700286865, + "learning_rate": 9.98333523688712e-06, + "loss": 0.483, + "step": 1319 + }, + { + "epoch": 0.37057832678270636, + "grad_norm": 0.7848116159439087, + "learning_rate": 9.98320172699588e-06, + "loss": 0.4714, + "step": 1320 + }, + { + "epoch": 0.3708590679393599, + "grad_norm": 0.7266755104064941, + "learning_rate": 9.983067685328341e-06, + "loss": 0.4354, + "step": 1321 + }, + { + "epoch": 0.37113980909601346, + "grad_norm": 0.8377553224563599, + "learning_rate": 9.982933111898806e-06, + "loss": 0.5186, + "step": 1322 + }, + { + "epoch": 0.371420550252667, + "grad_norm": 0.7535090446472168, + "learning_rate": 9.982798006721637e-06, + "loss": 0.4874, + "step": 1323 + }, + { + "epoch": 0.3717012914093206, + "grad_norm": 0.807345986366272, + "learning_rate": 9.982662369811249e-06, + "loss": 0.5177, + "step": 1324 + }, + { + "epoch": 0.3719820325659742, + "grad_norm": 0.8962818384170532, + "learning_rate": 9.982526201182118e-06, + "loss": 0.4823, + "step": 1325 + }, + { + "epoch": 0.3722627737226277, + "grad_norm": 0.8004371523857117, + "learning_rate": 9.982389500848777e-06, + "loss": 0.4973, + "step": 1326 + }, + { + "epoch": 0.3725435148792813, + "grad_norm": 0.8684409856796265, + "learning_rate": 9.98225226882581e-06, + "loss": 0.5043, + "step": 1327 + }, + { + "epoch": 0.3728242560359349, + "grad_norm": 0.7550148367881775, + "learning_rate": 9.982114505127865e-06, + "loss": 0.4859, + "step": 1328 + }, + { + "epoch": 0.37310499719258844, + "grad_norm": 0.7923074960708618, + "learning_rate": 9.981976209769642e-06, + "loss": 0.4842, + "step": 1329 + }, + { + "epoch": 0.373385738349242, + "grad_norm": 0.7989718317985535, + "learning_rate": 9.981837382765898e-06, + "loss": 0.4885, + "step": 1330 + }, + { + "epoch": 0.37366647950589554, + "grad_norm": 0.8559819459915161, + "learning_rate": 9.981698024131448e-06, + "loss": 0.484, + "step": 1331 + }, + { + "epoch": 0.37394722066254915, + "grad_norm": 0.690994143486023, + "learning_rate": 9.981558133881163e-06, + "loss": 0.4413, + "step": 1332 + }, + { + "epoch": 0.3742279618192027, + "grad_norm": 0.811465859413147, + "learning_rate": 9.981417712029975e-06, + "loss": 0.5188, + "step": 1333 + }, + { + "epoch": 0.37450870297585626, + "grad_norm": 0.7612215280532837, + "learning_rate": 9.981276758592863e-06, + "loss": 0.4742, + "step": 1334 + }, + { + "epoch": 0.3747894441325098, + "grad_norm": 0.7821604013442993, + "learning_rate": 9.981135273584875e-06, + "loss": 0.4604, + "step": 1335 + }, + { + "epoch": 0.3750701852891634, + "grad_norm": 0.780653715133667, + "learning_rate": 9.980993257021105e-06, + "loss": 0.4694, + "step": 1336 + }, + { + "epoch": 0.37535092644581697, + "grad_norm": 0.7403636574745178, + "learning_rate": 9.98085070891671e-06, + "loss": 0.5041, + "step": 1337 + }, + { + "epoch": 0.3756316676024705, + "grad_norm": 0.7289802432060242, + "learning_rate": 9.980707629286899e-06, + "loss": 0.4881, + "step": 1338 + }, + { + "epoch": 0.3759124087591241, + "grad_norm": 0.744711697101593, + "learning_rate": 9.980564018146944e-06, + "loss": 0.4724, + "step": 1339 + }, + { + "epoch": 0.3761931499157777, + "grad_norm": 0.7679106593132019, + "learning_rate": 9.980419875512169e-06, + "loss": 0.4773, + "step": 1340 + }, + { + "epoch": 0.37647389107243123, + "grad_norm": 0.7225204110145569, + "learning_rate": 9.980275201397958e-06, + "loss": 0.4373, + "step": 1341 + }, + { + "epoch": 0.3767546322290848, + "grad_norm": 0.8641905784606934, + "learning_rate": 9.980129995819745e-06, + "loss": 0.5268, + "step": 1342 + }, + { + "epoch": 0.37703537338573834, + "grad_norm": 0.8561159372329712, + "learning_rate": 9.97998425879303e-06, + "loss": 0.5046, + "step": 1343 + }, + { + "epoch": 0.3773161145423919, + "grad_norm": 0.6561315059661865, + "learning_rate": 9.979837990333361e-06, + "loss": 0.4526, + "step": 1344 + }, + { + "epoch": 0.3775968556990455, + "grad_norm": 0.8008257746696472, + "learning_rate": 9.979691190456352e-06, + "loss": 0.5145, + "step": 1345 + }, + { + "epoch": 0.37787759685569905, + "grad_norm": 0.7799999713897705, + "learning_rate": 9.979543859177664e-06, + "loss": 0.4722, + "step": 1346 + }, + { + "epoch": 0.3781583380123526, + "grad_norm": 0.7862280011177063, + "learning_rate": 9.979395996513023e-06, + "loss": 0.4596, + "step": 1347 + }, + { + "epoch": 0.37843907916900615, + "grad_norm": 0.7400721311569214, + "learning_rate": 9.979247602478204e-06, + "loss": 0.4961, + "step": 1348 + }, + { + "epoch": 0.37871982032565976, + "grad_norm": 0.7986732125282288, + "learning_rate": 9.979098677089046e-06, + "loss": 0.4756, + "step": 1349 + }, + { + "epoch": 0.3790005614823133, + "grad_norm": 0.8387327194213867, + "learning_rate": 9.97894922036144e-06, + "loss": 0.5021, + "step": 1350 + }, + { + "epoch": 0.37928130263896687, + "grad_norm": 0.8232660889625549, + "learning_rate": 9.978799232311336e-06, + "loss": 0.4918, + "step": 1351 + }, + { + "epoch": 0.3795620437956204, + "grad_norm": 0.7406255602836609, + "learning_rate": 9.978648712954738e-06, + "loss": 0.4329, + "step": 1352 + }, + { + "epoch": 0.379842784952274, + "grad_norm": 0.87729811668396, + "learning_rate": 9.978497662307709e-06, + "loss": 0.4685, + "step": 1353 + }, + { + "epoch": 0.3801235261089276, + "grad_norm": 0.8358006477355957, + "learning_rate": 9.978346080386369e-06, + "loss": 0.4578, + "step": 1354 + }, + { + "epoch": 0.38040426726558113, + "grad_norm": 0.8884652853012085, + "learning_rate": 9.978193967206895e-06, + "loss": 0.4757, + "step": 1355 + }, + { + "epoch": 0.3806850084222347, + "grad_norm": 0.8674014806747437, + "learning_rate": 9.978041322785517e-06, + "loss": 0.4737, + "step": 1356 + }, + { + "epoch": 0.3809657495788883, + "grad_norm": 0.7499932646751404, + "learning_rate": 9.977888147138526e-06, + "loss": 0.4703, + "step": 1357 + }, + { + "epoch": 0.38124649073554184, + "grad_norm": 0.8546051979064941, + "learning_rate": 9.977734440282267e-06, + "loss": 0.4375, + "step": 1358 + }, + { + "epoch": 0.3815272318921954, + "grad_norm": 0.7380093336105347, + "learning_rate": 9.97758020223314e-06, + "loss": 0.4808, + "step": 1359 + }, + { + "epoch": 0.38180797304884895, + "grad_norm": 0.8195316195487976, + "learning_rate": 9.977425433007612e-06, + "loss": 0.5102, + "step": 1360 + }, + { + "epoch": 0.3820887142055025, + "grad_norm": 0.8140376806259155, + "learning_rate": 9.977270132622193e-06, + "loss": 0.4562, + "step": 1361 + }, + { + "epoch": 0.3823694553621561, + "grad_norm": 0.7844399213790894, + "learning_rate": 9.977114301093456e-06, + "loss": 0.4587, + "step": 1362 + }, + { + "epoch": 0.38265019651880966, + "grad_norm": 0.921621561050415, + "learning_rate": 9.976957938438033e-06, + "loss": 0.5032, + "step": 1363 + }, + { + "epoch": 0.3829309376754632, + "grad_norm": 0.8237884640693665, + "learning_rate": 9.976801044672608e-06, + "loss": 0.4569, + "step": 1364 + }, + { + "epoch": 0.38321167883211676, + "grad_norm": 0.7739622592926025, + "learning_rate": 9.976643619813924e-06, + "loss": 0.452, + "step": 1365 + }, + { + "epoch": 0.38349241998877037, + "grad_norm": 0.8974728584289551, + "learning_rate": 9.97648566387878e-06, + "loss": 0.4814, + "step": 1366 + }, + { + "epoch": 0.3837731611454239, + "grad_norm": 0.838535726070404, + "learning_rate": 9.976327176884034e-06, + "loss": 0.4335, + "step": 1367 + }, + { + "epoch": 0.3840539023020775, + "grad_norm": 0.8377835750579834, + "learning_rate": 9.976168158846596e-06, + "loss": 0.4787, + "step": 1368 + }, + { + "epoch": 0.38433464345873103, + "grad_norm": 0.9407914280891418, + "learning_rate": 9.976008609783436e-06, + "loss": 0.4428, + "step": 1369 + }, + { + "epoch": 0.38461538461538464, + "grad_norm": 0.9758520722389221, + "learning_rate": 9.975848529711583e-06, + "loss": 0.4676, + "step": 1370 + }, + { + "epoch": 0.3848961257720382, + "grad_norm": 0.8121294379234314, + "learning_rate": 9.975687918648115e-06, + "loss": 0.4446, + "step": 1371 + }, + { + "epoch": 0.38517686692869174, + "grad_norm": 0.861008882522583, + "learning_rate": 9.975526776610178e-06, + "loss": 0.5343, + "step": 1372 + }, + { + "epoch": 0.3854576080853453, + "grad_norm": 1.0093066692352295, + "learning_rate": 9.975365103614962e-06, + "loss": 0.4832, + "step": 1373 + }, + { + "epoch": 0.3857383492419989, + "grad_norm": 0.8748912811279297, + "learning_rate": 9.97520289967972e-06, + "loss": 0.4914, + "step": 1374 + }, + { + "epoch": 0.38601909039865245, + "grad_norm": 0.8520193099975586, + "learning_rate": 9.975040164821767e-06, + "loss": 0.4343, + "step": 1375 + }, + { + "epoch": 0.386299831555306, + "grad_norm": 0.9347444176673889, + "learning_rate": 9.974876899058464e-06, + "loss": 0.5065, + "step": 1376 + }, + { + "epoch": 0.38658057271195956, + "grad_norm": 0.7243751287460327, + "learning_rate": 9.974713102407234e-06, + "loss": 0.4528, + "step": 1377 + }, + { + "epoch": 0.38686131386861317, + "grad_norm": 0.9694708585739136, + "learning_rate": 9.974548774885558e-06, + "loss": 0.5738, + "step": 1378 + }, + { + "epoch": 0.3871420550252667, + "grad_norm": 0.9710278511047363, + "learning_rate": 9.974383916510973e-06, + "loss": 0.5006, + "step": 1379 + }, + { + "epoch": 0.38742279618192027, + "grad_norm": 0.8056460022926331, + "learning_rate": 9.974218527301067e-06, + "loss": 0.4436, + "step": 1380 + }, + { + "epoch": 0.3877035373385738, + "grad_norm": 0.83726966381073, + "learning_rate": 9.974052607273494e-06, + "loss": 0.451, + "step": 1381 + }, + { + "epoch": 0.3879842784952274, + "grad_norm": 0.9945221543312073, + "learning_rate": 9.97388615644596e-06, + "loss": 0.4864, + "step": 1382 + }, + { + "epoch": 0.388265019651881, + "grad_norm": 0.8952162861824036, + "learning_rate": 9.973719174836224e-06, + "loss": 0.483, + "step": 1383 + }, + { + "epoch": 0.38854576080853453, + "grad_norm": 0.9157962799072266, + "learning_rate": 9.973551662462106e-06, + "loss": 0.4456, + "step": 1384 + }, + { + "epoch": 0.3888265019651881, + "grad_norm": 0.9072278738021851, + "learning_rate": 9.973383619341486e-06, + "loss": 0.4322, + "step": 1385 + }, + { + "epoch": 0.38910724312184164, + "grad_norm": 0.8230103254318237, + "learning_rate": 9.973215045492292e-06, + "loss": 0.4803, + "step": 1386 + }, + { + "epoch": 0.38938798427849525, + "grad_norm": 0.9844091534614563, + "learning_rate": 9.973045940932515e-06, + "loss": 0.4944, + "step": 1387 + }, + { + "epoch": 0.3896687254351488, + "grad_norm": 0.733823835849762, + "learning_rate": 9.972876305680201e-06, + "loss": 0.4699, + "step": 1388 + }, + { + "epoch": 0.38994946659180235, + "grad_norm": 0.7970441579818726, + "learning_rate": 9.97270613975345e-06, + "loss": 0.454, + "step": 1389 + }, + { + "epoch": 0.3902302077484559, + "grad_norm": 0.7929754257202148, + "learning_rate": 9.972535443170425e-06, + "loss": 0.4799, + "step": 1390 + }, + { + "epoch": 0.3905109489051095, + "grad_norm": 0.7899355888366699, + "learning_rate": 9.972364215949338e-06, + "loss": 0.431, + "step": 1391 + }, + { + "epoch": 0.39079169006176306, + "grad_norm": 0.8784908056259155, + "learning_rate": 9.972192458108465e-06, + "loss": 0.4708, + "step": 1392 + }, + { + "epoch": 0.3910724312184166, + "grad_norm": 0.841095507144928, + "learning_rate": 9.97202016966613e-06, + "loss": 0.4702, + "step": 1393 + }, + { + "epoch": 0.39135317237507017, + "grad_norm": 0.780816376209259, + "learning_rate": 9.971847350640724e-06, + "loss": 0.463, + "step": 1394 + }, + { + "epoch": 0.3916339135317238, + "grad_norm": 0.7796720862388611, + "learning_rate": 9.971674001050687e-06, + "loss": 0.4648, + "step": 1395 + }, + { + "epoch": 0.39191465468837733, + "grad_norm": 0.8291729092597961, + "learning_rate": 9.971500120914515e-06, + "loss": 0.502, + "step": 1396 + }, + { + "epoch": 0.3921953958450309, + "grad_norm": 0.7778762578964233, + "learning_rate": 9.971325710250768e-06, + "loss": 0.4338, + "step": 1397 + }, + { + "epoch": 0.39247613700168443, + "grad_norm": 0.8537712693214417, + "learning_rate": 9.971150769078056e-06, + "loss": 0.4454, + "step": 1398 + }, + { + "epoch": 0.392756878158338, + "grad_norm": 0.9105302691459656, + "learning_rate": 9.970975297415045e-06, + "loss": 0.4964, + "step": 1399 + }, + { + "epoch": 0.3930376193149916, + "grad_norm": 0.7636066675186157, + "learning_rate": 9.970799295280464e-06, + "loss": 0.4532, + "step": 1400 + }, + { + "epoch": 0.39331836047164515, + "grad_norm": 0.8331524729728699, + "learning_rate": 9.970622762693093e-06, + "loss": 0.4574, + "step": 1401 + }, + { + "epoch": 0.3935991016282987, + "grad_norm": 0.8816007375717163, + "learning_rate": 9.970445699671773e-06, + "loss": 0.4654, + "step": 1402 + }, + { + "epoch": 0.39387984278495225, + "grad_norm": 0.9071862697601318, + "learning_rate": 9.970268106235395e-06, + "loss": 0.4585, + "step": 1403 + }, + { + "epoch": 0.39416058394160586, + "grad_norm": 0.7492117881774902, + "learning_rate": 9.970089982402915e-06, + "loss": 0.4568, + "step": 1404 + }, + { + "epoch": 0.3944413250982594, + "grad_norm": 1.2280957698822021, + "learning_rate": 9.969911328193337e-06, + "loss": 0.4999, + "step": 1405 + }, + { + "epoch": 0.39472206625491296, + "grad_norm": 0.9285614490509033, + "learning_rate": 9.969732143625728e-06, + "loss": 0.4757, + "step": 1406 + }, + { + "epoch": 0.3950028074115665, + "grad_norm": 0.823926568031311, + "learning_rate": 9.96955242871921e-06, + "loss": 0.4654, + "step": 1407 + }, + { + "epoch": 0.3952835485682201, + "grad_norm": 0.7346882224082947, + "learning_rate": 9.96937218349296e-06, + "loss": 0.419, + "step": 1408 + }, + { + "epoch": 0.3955642897248737, + "grad_norm": 0.9443112015724182, + "learning_rate": 9.969191407966214e-06, + "loss": 0.4837, + "step": 1409 + }, + { + "epoch": 0.3958450308815272, + "grad_norm": 0.7753540873527527, + "learning_rate": 9.969010102158262e-06, + "loss": 0.4637, + "step": 1410 + }, + { + "epoch": 0.3961257720381808, + "grad_norm": 0.8248151540756226, + "learning_rate": 9.96882826608845e-06, + "loss": 0.4988, + "step": 1411 + }, + { + "epoch": 0.3964065131948344, + "grad_norm": 0.85450279712677, + "learning_rate": 9.968645899776187e-06, + "loss": 0.4896, + "step": 1412 + }, + { + "epoch": 0.39668725435148794, + "grad_norm": 0.9409697651863098, + "learning_rate": 9.968463003240931e-06, + "loss": 0.4807, + "step": 1413 + }, + { + "epoch": 0.3969679955081415, + "grad_norm": 0.8878933191299438, + "learning_rate": 9.9682795765022e-06, + "loss": 0.4162, + "step": 1414 + }, + { + "epoch": 0.39724873666479504, + "grad_norm": 0.8156781196594238, + "learning_rate": 9.96809561957957e-06, + "loss": 0.535, + "step": 1415 + }, + { + "epoch": 0.39752947782144865, + "grad_norm": 0.9529138207435608, + "learning_rate": 9.967911132492667e-06, + "loss": 0.502, + "step": 1416 + }, + { + "epoch": 0.3978102189781022, + "grad_norm": 0.9243906736373901, + "learning_rate": 9.967726115261183e-06, + "loss": 0.4879, + "step": 1417 + }, + { + "epoch": 0.39809096013475576, + "grad_norm": 0.7522918581962585, + "learning_rate": 9.96754056790486e-06, + "loss": 0.4685, + "step": 1418 + }, + { + "epoch": 0.3983717012914093, + "grad_norm": 0.8215938806533813, + "learning_rate": 9.967354490443497e-06, + "loss": 0.4936, + "step": 1419 + }, + { + "epoch": 0.39865244244806286, + "grad_norm": 0.9339632987976074, + "learning_rate": 9.967167882896956e-06, + "loss": 0.4927, + "step": 1420 + }, + { + "epoch": 0.39893318360471647, + "grad_norm": 0.7769731283187866, + "learning_rate": 9.966980745285144e-06, + "loss": 0.4361, + "step": 1421 + }, + { + "epoch": 0.39921392476137, + "grad_norm": 0.8242982029914856, + "learning_rate": 9.966793077628037e-06, + "loss": 0.4621, + "step": 1422 + }, + { + "epoch": 0.3994946659180236, + "grad_norm": 0.7599133849143982, + "learning_rate": 9.966604879945659e-06, + "loss": 0.4679, + "step": 1423 + }, + { + "epoch": 0.3997754070746771, + "grad_norm": 0.8209311366081238, + "learning_rate": 9.966416152258091e-06, + "loss": 0.489, + "step": 1424 + }, + { + "epoch": 0.40005614823133073, + "grad_norm": 0.8286378979682922, + "learning_rate": 9.966226894585478e-06, + "loss": 0.5006, + "step": 1425 + }, + { + "epoch": 0.4003368893879843, + "grad_norm": 0.7345553040504456, + "learning_rate": 9.966037106948012e-06, + "loss": 0.5022, + "step": 1426 + }, + { + "epoch": 0.40061763054463784, + "grad_norm": 0.8585434556007385, + "learning_rate": 9.96584678936595e-06, + "loss": 0.5224, + "step": 1427 + }, + { + "epoch": 0.4008983717012914, + "grad_norm": 0.69450843334198, + "learning_rate": 9.965655941859597e-06, + "loss": 0.5126, + "step": 1428 + }, + { + "epoch": 0.401179112857945, + "grad_norm": 0.7183843851089478, + "learning_rate": 9.965464564449322e-06, + "loss": 0.4258, + "step": 1429 + }, + { + "epoch": 0.40145985401459855, + "grad_norm": 0.9425116777420044, + "learning_rate": 9.965272657155546e-06, + "loss": 0.495, + "step": 1430 + }, + { + "epoch": 0.4017405951712521, + "grad_norm": 0.6672155857086182, + "learning_rate": 9.96508021999875e-06, + "loss": 0.4726, + "step": 1431 + }, + { + "epoch": 0.40202133632790565, + "grad_norm": 0.6973167061805725, + "learning_rate": 9.96488725299947e-06, + "loss": 0.4776, + "step": 1432 + }, + { + "epoch": 0.40230207748455926, + "grad_norm": 0.8882748484611511, + "learning_rate": 9.964693756178295e-06, + "loss": 0.4543, + "step": 1433 + }, + { + "epoch": 0.4025828186412128, + "grad_norm": 0.7760128974914551, + "learning_rate": 9.964499729555876e-06, + "loss": 0.441, + "step": 1434 + }, + { + "epoch": 0.40286355979786637, + "grad_norm": 0.7548131942749023, + "learning_rate": 9.964305173152919e-06, + "loss": 0.4888, + "step": 1435 + }, + { + "epoch": 0.4031443009545199, + "grad_norm": 0.7253437638282776, + "learning_rate": 9.964110086990184e-06, + "loss": 0.4973, + "step": 1436 + }, + { + "epoch": 0.40342504211117347, + "grad_norm": 0.8014014363288879, + "learning_rate": 9.96391447108849e-06, + "loss": 0.4264, + "step": 1437 + }, + { + "epoch": 0.4037057832678271, + "grad_norm": 0.8057489395141602, + "learning_rate": 9.963718325468712e-06, + "loss": 0.4906, + "step": 1438 + }, + { + "epoch": 0.40398652442448063, + "grad_norm": 0.7576271891593933, + "learning_rate": 9.963521650151783e-06, + "loss": 0.4648, + "step": 1439 + }, + { + "epoch": 0.4042672655811342, + "grad_norm": 0.866041898727417, + "learning_rate": 9.963324445158688e-06, + "loss": 0.4921, + "step": 1440 + }, + { + "epoch": 0.40454800673778774, + "grad_norm": 0.8300007581710815, + "learning_rate": 9.963126710510476e-06, + "loss": 0.454, + "step": 1441 + }, + { + "epoch": 0.40482874789444134, + "grad_norm": 0.7814754247665405, + "learning_rate": 9.962928446228241e-06, + "loss": 0.4376, + "step": 1442 + }, + { + "epoch": 0.4051094890510949, + "grad_norm": 0.6946207880973816, + "learning_rate": 9.962729652333147e-06, + "loss": 0.4209, + "step": 1443 + }, + { + "epoch": 0.40539023020774845, + "grad_norm": 0.7424529194831848, + "learning_rate": 9.962530328846407e-06, + "loss": 0.4108, + "step": 1444 + }, + { + "epoch": 0.405670971364402, + "grad_norm": 0.8224674463272095, + "learning_rate": 9.962330475789287e-06, + "loss": 0.53, + "step": 1445 + }, + { + "epoch": 0.4059517125210556, + "grad_norm": 0.714143693447113, + "learning_rate": 9.96213009318312e-06, + "loss": 0.4943, + "step": 1446 + }, + { + "epoch": 0.40623245367770916, + "grad_norm": 0.8385562896728516, + "learning_rate": 9.961929181049286e-06, + "loss": 0.4736, + "step": 1447 + }, + { + "epoch": 0.4065131948343627, + "grad_norm": 0.6077457070350647, + "learning_rate": 9.961727739409226e-06, + "loss": 0.4747, + "step": 1448 + }, + { + "epoch": 0.40679393599101626, + "grad_norm": 0.7630495429039001, + "learning_rate": 9.961525768284438e-06, + "loss": 0.48, + "step": 1449 + }, + { + "epoch": 0.40707467714766987, + "grad_norm": 0.7672988772392273, + "learning_rate": 9.96132326769647e-06, + "loss": 0.4887, + "step": 1450 + }, + { + "epoch": 0.4073554183043234, + "grad_norm": 0.7725625038146973, + "learning_rate": 9.961120237666938e-06, + "loss": 0.4883, + "step": 1451 + }, + { + "epoch": 0.407636159460977, + "grad_norm": 0.749697208404541, + "learning_rate": 9.960916678217504e-06, + "loss": 0.4646, + "step": 1452 + }, + { + "epoch": 0.40791690061763053, + "grad_norm": 0.8230544328689575, + "learning_rate": 9.960712589369894e-06, + "loss": 0.4293, + "step": 1453 + }, + { + "epoch": 0.40819764177428414, + "grad_norm": 0.7162449359893799, + "learning_rate": 9.960507971145884e-06, + "loss": 0.534, + "step": 1454 + }, + { + "epoch": 0.4084783829309377, + "grad_norm": 0.7527154684066772, + "learning_rate": 9.960302823567311e-06, + "loss": 0.4873, + "step": 1455 + }, + { + "epoch": 0.40875912408759124, + "grad_norm": 0.7369965314865112, + "learning_rate": 9.960097146656065e-06, + "loss": 0.482, + "step": 1456 + }, + { + "epoch": 0.4090398652442448, + "grad_norm": 0.7361452579498291, + "learning_rate": 9.959890940434098e-06, + "loss": 0.4207, + "step": 1457 + }, + { + "epoch": 0.40932060640089835, + "grad_norm": 0.8282372951507568, + "learning_rate": 9.959684204923415e-06, + "loss": 0.506, + "step": 1458 + }, + { + "epoch": 0.40960134755755195, + "grad_norm": 0.7200673222541809, + "learning_rate": 9.959476940146074e-06, + "loss": 0.4781, + "step": 1459 + }, + { + "epoch": 0.4098820887142055, + "grad_norm": 0.7844985723495483, + "learning_rate": 9.959269146124195e-06, + "loss": 0.4968, + "step": 1460 + }, + { + "epoch": 0.41016282987085906, + "grad_norm": 0.83040452003479, + "learning_rate": 9.959060822879952e-06, + "loss": 0.4742, + "step": 1461 + }, + { + "epoch": 0.4104435710275126, + "grad_norm": 0.8366664052009583, + "learning_rate": 9.958851970435576e-06, + "loss": 0.4932, + "step": 1462 + }, + { + "epoch": 0.4107243121841662, + "grad_norm": 0.8647298812866211, + "learning_rate": 9.958642588813355e-06, + "loss": 0.443, + "step": 1463 + }, + { + "epoch": 0.41100505334081977, + "grad_norm": 0.929803192615509, + "learning_rate": 9.958432678035633e-06, + "loss": 0.477, + "step": 1464 + }, + { + "epoch": 0.4112857944974733, + "grad_norm": 0.76978600025177, + "learning_rate": 9.958222238124811e-06, + "loss": 0.4367, + "step": 1465 + }, + { + "epoch": 0.4115665356541269, + "grad_norm": 0.7822108268737793, + "learning_rate": 9.958011269103343e-06, + "loss": 0.4694, + "step": 1466 + }, + { + "epoch": 0.4118472768107805, + "grad_norm": 0.7481580376625061, + "learning_rate": 9.957799770993746e-06, + "loss": 0.472, + "step": 1467 + }, + { + "epoch": 0.41212801796743403, + "grad_norm": 0.7773211002349854, + "learning_rate": 9.957587743818586e-06, + "loss": 0.4853, + "step": 1468 + }, + { + "epoch": 0.4124087591240876, + "grad_norm": 0.7309362292289734, + "learning_rate": 9.957375187600493e-06, + "loss": 0.4408, + "step": 1469 + }, + { + "epoch": 0.41268950028074114, + "grad_norm": 0.8492318391799927, + "learning_rate": 9.957162102362147e-06, + "loss": 0.4844, + "step": 1470 + }, + { + "epoch": 0.41297024143739475, + "grad_norm": 0.7797303199768066, + "learning_rate": 9.95694848812629e-06, + "loss": 0.4609, + "step": 1471 + }, + { + "epoch": 0.4132509825940483, + "grad_norm": 0.8322198987007141, + "learning_rate": 9.956734344915713e-06, + "loss": 0.5176, + "step": 1472 + }, + { + "epoch": 0.41353172375070185, + "grad_norm": 0.8106900453567505, + "learning_rate": 9.956519672753271e-06, + "loss": 0.4792, + "step": 1473 + }, + { + "epoch": 0.4138124649073554, + "grad_norm": 0.8217254877090454, + "learning_rate": 9.956304471661873e-06, + "loss": 0.4491, + "step": 1474 + }, + { + "epoch": 0.41409320606400896, + "grad_norm": 0.8479796648025513, + "learning_rate": 9.956088741664483e-06, + "loss": 0.4203, + "step": 1475 + }, + { + "epoch": 0.41437394722066256, + "grad_norm": 0.7619498372077942, + "learning_rate": 9.955872482784122e-06, + "loss": 0.4971, + "step": 1476 + }, + { + "epoch": 0.4146546883773161, + "grad_norm": 0.7956091165542603, + "learning_rate": 9.955655695043868e-06, + "loss": 0.4715, + "step": 1477 + }, + { + "epoch": 0.41493542953396967, + "grad_norm": 0.7198827266693115, + "learning_rate": 9.955438378466855e-06, + "loss": 0.4713, + "step": 1478 + }, + { + "epoch": 0.4152161706906232, + "grad_norm": 0.7029407024383545, + "learning_rate": 9.955220533076276e-06, + "loss": 0.447, + "step": 1479 + }, + { + "epoch": 0.41549691184727683, + "grad_norm": 0.7933072447776794, + "learning_rate": 9.955002158895374e-06, + "loss": 0.4698, + "step": 1480 + }, + { + "epoch": 0.4157776530039304, + "grad_norm": 0.732537031173706, + "learning_rate": 9.954783255947456e-06, + "loss": 0.398, + "step": 1481 + }, + { + "epoch": 0.41605839416058393, + "grad_norm": 0.8646494746208191, + "learning_rate": 9.954563824255879e-06, + "loss": 0.5254, + "step": 1482 + }, + { + "epoch": 0.4163391353172375, + "grad_norm": 0.7793468236923218, + "learning_rate": 9.954343863844063e-06, + "loss": 0.4341, + "step": 1483 + }, + { + "epoch": 0.4166198764738911, + "grad_norm": 0.705496072769165, + "learning_rate": 9.954123374735478e-06, + "loss": 0.4845, + "step": 1484 + }, + { + "epoch": 0.41690061763054465, + "grad_norm": 0.7429677248001099, + "learning_rate": 9.953902356953653e-06, + "loss": 0.4863, + "step": 1485 + }, + { + "epoch": 0.4171813587871982, + "grad_norm": 0.7521372437477112, + "learning_rate": 9.953680810522178e-06, + "loss": 0.504, + "step": 1486 + }, + { + "epoch": 0.41746209994385175, + "grad_norm": 0.8362607359886169, + "learning_rate": 9.953458735464689e-06, + "loss": 0.5103, + "step": 1487 + }, + { + "epoch": 0.41774284110050536, + "grad_norm": 0.979227602481842, + "learning_rate": 9.95323613180489e-06, + "loss": 0.4921, + "step": 1488 + }, + { + "epoch": 0.4180235822571589, + "grad_norm": 0.8398364186286926, + "learning_rate": 9.95301299956653e-06, + "loss": 0.4748, + "step": 1489 + }, + { + "epoch": 0.41830432341381246, + "grad_norm": 0.8194150328636169, + "learning_rate": 9.952789338773423e-06, + "loss": 0.4934, + "step": 1490 + }, + { + "epoch": 0.418585064570466, + "grad_norm": 0.9343910813331604, + "learning_rate": 9.95256514944944e-06, + "loss": 0.5287, + "step": 1491 + }, + { + "epoch": 0.4188658057271196, + "grad_norm": 1.1770141124725342, + "learning_rate": 9.952340431618502e-06, + "loss": 0.4856, + "step": 1492 + }, + { + "epoch": 0.4191465468837732, + "grad_norm": 0.8844005465507507, + "learning_rate": 9.952115185304587e-06, + "loss": 0.4633, + "step": 1493 + }, + { + "epoch": 0.4194272880404267, + "grad_norm": 0.8328038454055786, + "learning_rate": 9.951889410531737e-06, + "loss": 0.4553, + "step": 1494 + }, + { + "epoch": 0.4197080291970803, + "grad_norm": 0.8194181323051453, + "learning_rate": 9.951663107324042e-06, + "loss": 0.4636, + "step": 1495 + }, + { + "epoch": 0.41998877035373383, + "grad_norm": 0.8475966453552246, + "learning_rate": 9.951436275705653e-06, + "loss": 0.5128, + "step": 1496 + }, + { + "epoch": 0.42026951151038744, + "grad_norm": 0.770342230796814, + "learning_rate": 9.951208915700776e-06, + "loss": 0.4889, + "step": 1497 + }, + { + "epoch": 0.420550252667041, + "grad_norm": 0.9271401762962341, + "learning_rate": 9.950981027333672e-06, + "loss": 0.4543, + "step": 1498 + }, + { + "epoch": 0.42083099382369454, + "grad_norm": 0.7796668410301208, + "learning_rate": 9.95075261062866e-06, + "loss": 0.4798, + "step": 1499 + }, + { + "epoch": 0.4211117349803481, + "grad_norm": 0.8748780488967896, + "learning_rate": 9.950523665610118e-06, + "loss": 0.4667, + "step": 1500 + }, + { + "epoch": 0.4213924761370017, + "grad_norm": 0.8888767957687378, + "learning_rate": 9.950294192302475e-06, + "loss": 0.4436, + "step": 1501 + }, + { + "epoch": 0.42167321729365526, + "grad_norm": 0.8639453649520874, + "learning_rate": 9.95006419073022e-06, + "loss": 0.4863, + "step": 1502 + }, + { + "epoch": 0.4219539584503088, + "grad_norm": 0.7428751587867737, + "learning_rate": 9.949833660917897e-06, + "loss": 0.4854, + "step": 1503 + }, + { + "epoch": 0.42223469960696236, + "grad_norm": 0.856082022190094, + "learning_rate": 9.949602602890107e-06, + "loss": 0.4637, + "step": 1504 + }, + { + "epoch": 0.42251544076361597, + "grad_norm": 0.9224918484687805, + "learning_rate": 9.949371016671505e-06, + "loss": 0.4895, + "step": 1505 + }, + { + "epoch": 0.4227961819202695, + "grad_norm": 0.7828034162521362, + "learning_rate": 9.949138902286807e-06, + "loss": 0.4702, + "step": 1506 + }, + { + "epoch": 0.4230769230769231, + "grad_norm": 0.9434019327163696, + "learning_rate": 9.948906259760785e-06, + "loss": 0.5116, + "step": 1507 + }, + { + "epoch": 0.4233576642335766, + "grad_norm": 0.9723303318023682, + "learning_rate": 9.948673089118259e-06, + "loss": 0.4614, + "step": 1508 + }, + { + "epoch": 0.42363840539023023, + "grad_norm": 0.8419533967971802, + "learning_rate": 9.948439390384115e-06, + "loss": 0.4921, + "step": 1509 + }, + { + "epoch": 0.4239191465468838, + "grad_norm": 0.8092457056045532, + "learning_rate": 9.948205163583292e-06, + "loss": 0.4875, + "step": 1510 + }, + { + "epoch": 0.42419988770353734, + "grad_norm": 0.9529274702072144, + "learning_rate": 9.947970408740783e-06, + "loss": 0.4838, + "step": 1511 + }, + { + "epoch": 0.4244806288601909, + "grad_norm": 0.7889905571937561, + "learning_rate": 9.947735125881644e-06, + "loss": 0.4495, + "step": 1512 + }, + { + "epoch": 0.42476137001684444, + "grad_norm": 0.7975249290466309, + "learning_rate": 9.947499315030979e-06, + "loss": 0.4847, + "step": 1513 + }, + { + "epoch": 0.42504211117349805, + "grad_norm": 0.7952381372451782, + "learning_rate": 9.947262976213954e-06, + "loss": 0.448, + "step": 1514 + }, + { + "epoch": 0.4253228523301516, + "grad_norm": 0.8523067235946655, + "learning_rate": 9.947026109455789e-06, + "loss": 0.5072, + "step": 1515 + }, + { + "epoch": 0.42560359348680515, + "grad_norm": 0.8707007169723511, + "learning_rate": 9.946788714781761e-06, + "loss": 0.4931, + "step": 1516 + }, + { + "epoch": 0.4258843346434587, + "grad_norm": 0.830998420715332, + "learning_rate": 9.946550792217204e-06, + "loss": 0.4841, + "step": 1517 + }, + { + "epoch": 0.4261650758001123, + "grad_norm": 0.741875171661377, + "learning_rate": 9.946312341787507e-06, + "loss": 0.4465, + "step": 1518 + }, + { + "epoch": 0.42644581695676587, + "grad_norm": 0.9552786350250244, + "learning_rate": 9.946073363518115e-06, + "loss": 0.468, + "step": 1519 + }, + { + "epoch": 0.4267265581134194, + "grad_norm": 0.8409485816955566, + "learning_rate": 9.945833857434533e-06, + "loss": 0.5204, + "step": 1520 + }, + { + "epoch": 0.42700729927007297, + "grad_norm": 0.6598392724990845, + "learning_rate": 9.945593823562316e-06, + "loss": 0.4635, + "step": 1521 + }, + { + "epoch": 0.4272880404267266, + "grad_norm": 0.7886444926261902, + "learning_rate": 9.945353261927081e-06, + "loss": 0.4528, + "step": 1522 + }, + { + "epoch": 0.42756878158338013, + "grad_norm": 0.8217816352844238, + "learning_rate": 9.9451121725545e-06, + "loss": 0.4471, + "step": 1523 + }, + { + "epoch": 0.4278495227400337, + "grad_norm": 0.8745150566101074, + "learning_rate": 9.944870555470298e-06, + "loss": 0.4729, + "step": 1524 + }, + { + "epoch": 0.42813026389668724, + "grad_norm": 0.7911088466644287, + "learning_rate": 9.944628410700262e-06, + "loss": 0.4849, + "step": 1525 + }, + { + "epoch": 0.42841100505334084, + "grad_norm": 0.7825942635536194, + "learning_rate": 9.94438573827023e-06, + "loss": 0.4911, + "step": 1526 + }, + { + "epoch": 0.4286917462099944, + "grad_norm": 0.8885286450386047, + "learning_rate": 9.9441425382061e-06, + "loss": 0.4249, + "step": 1527 + }, + { + "epoch": 0.42897248736664795, + "grad_norm": 0.7094171047210693, + "learning_rate": 9.943898810533823e-06, + "loss": 0.4342, + "step": 1528 + }, + { + "epoch": 0.4292532285233015, + "grad_norm": 0.772149920463562, + "learning_rate": 9.94365455527941e-06, + "loss": 0.4713, + "step": 1529 + }, + { + "epoch": 0.4295339696799551, + "grad_norm": 0.8422811031341553, + "learning_rate": 9.943409772468923e-06, + "loss": 0.4715, + "step": 1530 + }, + { + "epoch": 0.42981471083660866, + "grad_norm": 0.7839064002037048, + "learning_rate": 9.943164462128487e-06, + "loss": 0.4097, + "step": 1531 + }, + { + "epoch": 0.4300954519932622, + "grad_norm": 0.7710862159729004, + "learning_rate": 9.942918624284282e-06, + "loss": 0.4303, + "step": 1532 + }, + { + "epoch": 0.43037619314991576, + "grad_norm": 0.8447971343994141, + "learning_rate": 9.942672258962537e-06, + "loss": 0.5089, + "step": 1533 + }, + { + "epoch": 0.4306569343065693, + "grad_norm": 0.7621978521347046, + "learning_rate": 9.942425366189545e-06, + "loss": 0.4849, + "step": 1534 + }, + { + "epoch": 0.4309376754632229, + "grad_norm": 0.6854081153869629, + "learning_rate": 9.942177945991652e-06, + "loss": 0.4185, + "step": 1535 + }, + { + "epoch": 0.4312184166198765, + "grad_norm": 0.8562795519828796, + "learning_rate": 9.941929998395263e-06, + "loss": 0.4907, + "step": 1536 + }, + { + "epoch": 0.43149915777653003, + "grad_norm": 0.8944031596183777, + "learning_rate": 9.941681523426835e-06, + "loss": 0.4517, + "step": 1537 + }, + { + "epoch": 0.4317798989331836, + "grad_norm": 0.7585890293121338, + "learning_rate": 9.941432521112887e-06, + "loss": 0.446, + "step": 1538 + }, + { + "epoch": 0.4320606400898372, + "grad_norm": 0.7893147468566895, + "learning_rate": 9.94118299147999e-06, + "loss": 0.4825, + "step": 1539 + }, + { + "epoch": 0.43234138124649074, + "grad_norm": 0.8792277574539185, + "learning_rate": 9.94093293455477e-06, + "loss": 0.4426, + "step": 1540 + }, + { + "epoch": 0.4326221224031443, + "grad_norm": 0.7946180701255798, + "learning_rate": 9.940682350363913e-06, + "loss": 0.5026, + "step": 1541 + }, + { + "epoch": 0.43290286355979785, + "grad_norm": 0.7558562159538269, + "learning_rate": 9.940431238934158e-06, + "loss": 0.4722, + "step": 1542 + }, + { + "epoch": 0.43318360471645145, + "grad_norm": 0.7382895946502686, + "learning_rate": 9.940179600292305e-06, + "loss": 0.4698, + "step": 1543 + }, + { + "epoch": 0.433464345873105, + "grad_norm": 0.8346493244171143, + "learning_rate": 9.939927434465206e-06, + "loss": 0.5273, + "step": 1544 + }, + { + "epoch": 0.43374508702975856, + "grad_norm": 0.7740429043769836, + "learning_rate": 9.939674741479772e-06, + "loss": 0.4671, + "step": 1545 + }, + { + "epoch": 0.4340258281864121, + "grad_norm": 0.7033578753471375, + "learning_rate": 9.939421521362966e-06, + "loss": 0.4802, + "step": 1546 + }, + { + "epoch": 0.4343065693430657, + "grad_norm": 0.792039692401886, + "learning_rate": 9.939167774141811e-06, + "loss": 0.4912, + "step": 1547 + }, + { + "epoch": 0.43458731049971927, + "grad_norm": 0.7619795799255371, + "learning_rate": 9.938913499843386e-06, + "loss": 0.4564, + "step": 1548 + }, + { + "epoch": 0.4348680516563728, + "grad_norm": 0.7204170227050781, + "learning_rate": 9.938658698494826e-06, + "loss": 0.4554, + "step": 1549 + }, + { + "epoch": 0.4351487928130264, + "grad_norm": 0.7560875415802002, + "learning_rate": 9.938403370123321e-06, + "loss": 0.4724, + "step": 1550 + }, + { + "epoch": 0.43542953396968, + "grad_norm": 0.6911844611167908, + "learning_rate": 9.938147514756117e-06, + "loss": 0.5127, + "step": 1551 + }, + { + "epoch": 0.43571027512633353, + "grad_norm": 0.8913024663925171, + "learning_rate": 9.93789113242052e-06, + "loss": 0.4806, + "step": 1552 + }, + { + "epoch": 0.4359910162829871, + "grad_norm": 0.7620805501937866, + "learning_rate": 9.937634223143889e-06, + "loss": 0.4561, + "step": 1553 + }, + { + "epoch": 0.43627175743964064, + "grad_norm": 0.9313696622848511, + "learning_rate": 9.937376786953637e-06, + "loss": 0.4356, + "step": 1554 + }, + { + "epoch": 0.4365524985962942, + "grad_norm": 0.6566257476806641, + "learning_rate": 9.93711882387724e-06, + "loss": 0.4554, + "step": 1555 + }, + { + "epoch": 0.4368332397529478, + "grad_norm": 0.6699604392051697, + "learning_rate": 9.936860333942221e-06, + "loss": 0.4352, + "step": 1556 + }, + { + "epoch": 0.43711398090960135, + "grad_norm": 0.8826926350593567, + "learning_rate": 9.93660131717617e-06, + "loss": 0.5356, + "step": 1557 + }, + { + "epoch": 0.4373947220662549, + "grad_norm": 0.8786247372627258, + "learning_rate": 9.936341773606723e-06, + "loss": 0.4661, + "step": 1558 + }, + { + "epoch": 0.43767546322290846, + "grad_norm": 0.6881845593452454, + "learning_rate": 9.93608170326158e-06, + "loss": 0.4566, + "step": 1559 + }, + { + "epoch": 0.43795620437956206, + "grad_norm": 0.7192972898483276, + "learning_rate": 9.935821106168493e-06, + "loss": 0.4161, + "step": 1560 + }, + { + "epoch": 0.4382369455362156, + "grad_norm": 0.8811171054840088, + "learning_rate": 9.935559982355271e-06, + "loss": 0.4917, + "step": 1561 + }, + { + "epoch": 0.43851768669286917, + "grad_norm": 0.788429319858551, + "learning_rate": 9.935298331849783e-06, + "loss": 0.4911, + "step": 1562 + }, + { + "epoch": 0.4387984278495227, + "grad_norm": 0.8126656413078308, + "learning_rate": 9.935036154679945e-06, + "loss": 0.4891, + "step": 1563 + }, + { + "epoch": 0.43907916900617633, + "grad_norm": 0.7749803066253662, + "learning_rate": 9.934773450873737e-06, + "loss": 0.473, + "step": 1564 + }, + { + "epoch": 0.4393599101628299, + "grad_norm": 0.8331796526908875, + "learning_rate": 9.934510220459193e-06, + "loss": 0.4875, + "step": 1565 + }, + { + "epoch": 0.43964065131948343, + "grad_norm": 0.7581439018249512, + "learning_rate": 9.934246463464405e-06, + "loss": 0.4686, + "step": 1566 + }, + { + "epoch": 0.439921392476137, + "grad_norm": 0.7353121042251587, + "learning_rate": 9.933982179917519e-06, + "loss": 0.4451, + "step": 1567 + }, + { + "epoch": 0.4402021336327906, + "grad_norm": 0.8095428347587585, + "learning_rate": 9.933717369846737e-06, + "loss": 0.4955, + "step": 1568 + }, + { + "epoch": 0.44048287478944415, + "grad_norm": 0.8571195602416992, + "learning_rate": 9.933452033280319e-06, + "loss": 0.4613, + "step": 1569 + }, + { + "epoch": 0.4407636159460977, + "grad_norm": 0.6963633894920349, + "learning_rate": 9.933186170246579e-06, + "loss": 0.4291, + "step": 1570 + }, + { + "epoch": 0.44104435710275125, + "grad_norm": 0.7224349975585938, + "learning_rate": 9.932919780773886e-06, + "loss": 0.4578, + "step": 1571 + }, + { + "epoch": 0.4413250982594048, + "grad_norm": 0.9017399549484253, + "learning_rate": 9.932652864890671e-06, + "loss": 0.4873, + "step": 1572 + }, + { + "epoch": 0.4416058394160584, + "grad_norm": 0.7524573802947998, + "learning_rate": 9.932385422625418e-06, + "loss": 0.4774, + "step": 1573 + }, + { + "epoch": 0.44188658057271196, + "grad_norm": 0.8817484974861145, + "learning_rate": 9.932117454006662e-06, + "loss": 0.4847, + "step": 1574 + }, + { + "epoch": 0.4421673217293655, + "grad_norm": 0.7601636052131653, + "learning_rate": 9.931848959063004e-06, + "loss": 0.4163, + "step": 1575 + }, + { + "epoch": 0.44244806288601907, + "grad_norm": 0.8592706918716431, + "learning_rate": 9.931579937823094e-06, + "loss": 0.4385, + "step": 1576 + }, + { + "epoch": 0.4427288040426727, + "grad_norm": 0.7996790409088135, + "learning_rate": 9.931310390315642e-06, + "loss": 0.4685, + "step": 1577 + }, + { + "epoch": 0.4430095451993262, + "grad_norm": 0.8284817934036255, + "learning_rate": 9.931040316569409e-06, + "loss": 0.4946, + "step": 1578 + }, + { + "epoch": 0.4432902863559798, + "grad_norm": 0.8335215449333191, + "learning_rate": 9.930769716613218e-06, + "loss": 0.4647, + "step": 1579 + }, + { + "epoch": 0.44357102751263333, + "grad_norm": 0.8053838610649109, + "learning_rate": 9.930498590475945e-06, + "loss": 0.4935, + "step": 1580 + }, + { + "epoch": 0.44385176866928694, + "grad_norm": 0.8241704106330872, + "learning_rate": 9.930226938186525e-06, + "loss": 0.4465, + "step": 1581 + }, + { + "epoch": 0.4441325098259405, + "grad_norm": 0.7485541701316833, + "learning_rate": 9.929954759773943e-06, + "loss": 0.4882, + "step": 1582 + }, + { + "epoch": 0.44441325098259404, + "grad_norm": 0.7574432492256165, + "learning_rate": 9.929682055267246e-06, + "loss": 0.4345, + "step": 1583 + }, + { + "epoch": 0.4446939921392476, + "grad_norm": 0.8734549880027771, + "learning_rate": 9.929408824695539e-06, + "loss": 0.4552, + "step": 1584 + }, + { + "epoch": 0.4449747332959012, + "grad_norm": 0.769991934299469, + "learning_rate": 9.929135068087975e-06, + "loss": 0.4565, + "step": 1585 + }, + { + "epoch": 0.44525547445255476, + "grad_norm": 0.7188234925270081, + "learning_rate": 9.928860785473767e-06, + "loss": 0.4535, + "step": 1586 + }, + { + "epoch": 0.4455362156092083, + "grad_norm": 0.8100794553756714, + "learning_rate": 9.928585976882187e-06, + "loss": 0.4705, + "step": 1587 + }, + { + "epoch": 0.44581695676586186, + "grad_norm": 0.7685830593109131, + "learning_rate": 9.92831064234256e-06, + "loss": 0.4515, + "step": 1588 + }, + { + "epoch": 0.44609769792251547, + "grad_norm": 0.6984787583351135, + "learning_rate": 9.92803478188427e-06, + "loss": 0.4332, + "step": 1589 + }, + { + "epoch": 0.446378439079169, + "grad_norm": 0.7266493439674377, + "learning_rate": 9.927758395536753e-06, + "loss": 0.4594, + "step": 1590 + }, + { + "epoch": 0.4466591802358226, + "grad_norm": 0.783992350101471, + "learning_rate": 9.927481483329503e-06, + "loss": 0.4426, + "step": 1591 + }, + { + "epoch": 0.4469399213924761, + "grad_norm": 0.7979679703712463, + "learning_rate": 9.927204045292071e-06, + "loss": 0.4821, + "step": 1592 + }, + { + "epoch": 0.4472206625491297, + "grad_norm": 0.7410790324211121, + "learning_rate": 9.926926081454065e-06, + "loss": 0.4638, + "step": 1593 + }, + { + "epoch": 0.4475014037057833, + "grad_norm": 0.7430979013442993, + "learning_rate": 9.926647591845144e-06, + "loss": 0.4643, + "step": 1594 + }, + { + "epoch": 0.44778214486243684, + "grad_norm": 0.7869971990585327, + "learning_rate": 9.926368576495031e-06, + "loss": 0.4221, + "step": 1595 + }, + { + "epoch": 0.4480628860190904, + "grad_norm": 0.7983207106590271, + "learning_rate": 9.926089035433497e-06, + "loss": 0.4693, + "step": 1596 + }, + { + "epoch": 0.44834362717574394, + "grad_norm": 0.8259583115577698, + "learning_rate": 9.925808968690376e-06, + "loss": 0.5099, + "step": 1597 + }, + { + "epoch": 0.44862436833239755, + "grad_norm": 0.7925142049789429, + "learning_rate": 9.925528376295552e-06, + "loss": 0.4541, + "step": 1598 + }, + { + "epoch": 0.4489051094890511, + "grad_norm": 0.6849550604820251, + "learning_rate": 9.92524725827897e-06, + "loss": 0.4452, + "step": 1599 + }, + { + "epoch": 0.44918585064570465, + "grad_norm": 0.9882868528366089, + "learning_rate": 9.924965614670629e-06, + "loss": 0.4764, + "step": 1600 + }, + { + "epoch": 0.4494665918023582, + "grad_norm": 0.8814857602119446, + "learning_rate": 9.924683445500584e-06, + "loss": 0.4699, + "step": 1601 + }, + { + "epoch": 0.4497473329590118, + "grad_norm": 0.7847899794578552, + "learning_rate": 9.924400750798947e-06, + "loss": 0.4377, + "step": 1602 + }, + { + "epoch": 0.45002807411566537, + "grad_norm": 0.7930905222892761, + "learning_rate": 9.924117530595882e-06, + "loss": 0.4912, + "step": 1603 + }, + { + "epoch": 0.4503088152723189, + "grad_norm": 0.810623288154602, + "learning_rate": 9.923833784921617e-06, + "loss": 0.4587, + "step": 1604 + }, + { + "epoch": 0.45058955642897247, + "grad_norm": 0.9229034185409546, + "learning_rate": 9.92354951380643e-06, + "loss": 0.4293, + "step": 1605 + }, + { + "epoch": 0.4508702975856261, + "grad_norm": 0.9106329679489136, + "learning_rate": 9.923264717280656e-06, + "loss": 0.5011, + "step": 1606 + }, + { + "epoch": 0.45115103874227963, + "grad_norm": 0.8749147057533264, + "learning_rate": 9.922979395374687e-06, + "loss": 0.4763, + "step": 1607 + }, + { + "epoch": 0.4514317798989332, + "grad_norm": 0.8200501799583435, + "learning_rate": 9.92269354811897e-06, + "loss": 0.4594, + "step": 1608 + }, + { + "epoch": 0.45171252105558674, + "grad_norm": 0.9100151658058167, + "learning_rate": 9.922407175544012e-06, + "loss": 0.4603, + "step": 1609 + }, + { + "epoch": 0.4519932622122403, + "grad_norm": 0.911439836025238, + "learning_rate": 9.922120277680369e-06, + "loss": 0.4665, + "step": 1610 + }, + { + "epoch": 0.4522740033688939, + "grad_norm": 0.7262864112854004, + "learning_rate": 9.92183285455866e-06, + "loss": 0.5072, + "step": 1611 + }, + { + "epoch": 0.45255474452554745, + "grad_norm": 0.8657403588294983, + "learning_rate": 9.921544906209554e-06, + "loss": 0.5121, + "step": 1612 + }, + { + "epoch": 0.452835485682201, + "grad_norm": 0.8669281005859375, + "learning_rate": 9.921256432663781e-06, + "loss": 0.4388, + "step": 1613 + }, + { + "epoch": 0.45311622683885455, + "grad_norm": 0.9119303822517395, + "learning_rate": 9.920967433952126e-06, + "loss": 0.5122, + "step": 1614 + }, + { + "epoch": 0.45339696799550816, + "grad_norm": 0.7300405502319336, + "learning_rate": 9.920677910105428e-06, + "loss": 0.5011, + "step": 1615 + }, + { + "epoch": 0.4536777091521617, + "grad_norm": 0.7571413516998291, + "learning_rate": 9.92038786115458e-06, + "loss": 0.4831, + "step": 1616 + }, + { + "epoch": 0.45395845030881526, + "grad_norm": 0.9800949692726135, + "learning_rate": 9.92009728713054e-06, + "loss": 0.4989, + "step": 1617 + }, + { + "epoch": 0.4542391914654688, + "grad_norm": 0.8586767911911011, + "learning_rate": 9.919806188064314e-06, + "loss": 0.5118, + "step": 1618 + }, + { + "epoch": 0.4545199326221224, + "grad_norm": 0.7775992155075073, + "learning_rate": 9.919514563986965e-06, + "loss": 0.4514, + "step": 1619 + }, + { + "epoch": 0.454800673778776, + "grad_norm": 0.8338137865066528, + "learning_rate": 9.919222414929614e-06, + "loss": 0.4799, + "step": 1620 + }, + { + "epoch": 0.45508141493542953, + "grad_norm": 0.7976445555686951, + "learning_rate": 9.91892974092344e-06, + "loss": 0.4953, + "step": 1621 + }, + { + "epoch": 0.4553621560920831, + "grad_norm": 0.7487460970878601, + "learning_rate": 9.91863654199967e-06, + "loss": 0.4395, + "step": 1622 + }, + { + "epoch": 0.4556428972487367, + "grad_norm": 0.7499296069145203, + "learning_rate": 9.918342818189594e-06, + "loss": 0.4953, + "step": 1623 + }, + { + "epoch": 0.45592363840539024, + "grad_norm": 0.7772714495658875, + "learning_rate": 9.91804856952456e-06, + "loss": 0.4385, + "step": 1624 + }, + { + "epoch": 0.4562043795620438, + "grad_norm": 0.7606932520866394, + "learning_rate": 9.917753796035965e-06, + "loss": 0.4545, + "step": 1625 + }, + { + "epoch": 0.45648512071869735, + "grad_norm": 0.7545024752616882, + "learning_rate": 9.917458497755267e-06, + "loss": 0.4663, + "step": 1626 + }, + { + "epoch": 0.45676586187535095, + "grad_norm": 0.8371989130973816, + "learning_rate": 9.917162674713979e-06, + "loss": 0.4154, + "step": 1627 + }, + { + "epoch": 0.4570466030320045, + "grad_norm": 0.7079052925109863, + "learning_rate": 9.916866326943666e-06, + "loss": 0.4643, + "step": 1628 + }, + { + "epoch": 0.45732734418865806, + "grad_norm": 0.8473893404006958, + "learning_rate": 9.916569454475954e-06, + "loss": 0.4883, + "step": 1629 + }, + { + "epoch": 0.4576080853453116, + "grad_norm": 0.9153130650520325, + "learning_rate": 9.916272057342527e-06, + "loss": 0.4841, + "step": 1630 + }, + { + "epoch": 0.45788882650196516, + "grad_norm": 0.8404461741447449, + "learning_rate": 9.915974135575115e-06, + "loss": 0.4754, + "step": 1631 + }, + { + "epoch": 0.45816956765861877, + "grad_norm": 0.7715436220169067, + "learning_rate": 9.915675689205516e-06, + "loss": 0.469, + "step": 1632 + }, + { + "epoch": 0.4584503088152723, + "grad_norm": 0.8396292924880981, + "learning_rate": 9.915376718265575e-06, + "loss": 0.45, + "step": 1633 + }, + { + "epoch": 0.4587310499719259, + "grad_norm": 0.7500838041305542, + "learning_rate": 9.915077222787197e-06, + "loss": 0.4445, + "step": 1634 + }, + { + "epoch": 0.4590117911285794, + "grad_norm": 0.8071966171264648, + "learning_rate": 9.914777202802343e-06, + "loss": 0.4591, + "step": 1635 + }, + { + "epoch": 0.45929253228523303, + "grad_norm": 0.758062481880188, + "learning_rate": 9.91447665834303e-06, + "loss": 0.4708, + "step": 1636 + }, + { + "epoch": 0.4595732734418866, + "grad_norm": 0.7524632811546326, + "learning_rate": 9.914175589441328e-06, + "loss": 0.4156, + "step": 1637 + }, + { + "epoch": 0.45985401459854014, + "grad_norm": 0.8921982049942017, + "learning_rate": 9.913873996129367e-06, + "loss": 0.4928, + "step": 1638 + }, + { + "epoch": 0.4601347557551937, + "grad_norm": 0.788564920425415, + "learning_rate": 9.91357187843933e-06, + "loss": 0.4954, + "step": 1639 + }, + { + "epoch": 0.4604154969118473, + "grad_norm": 0.8892732262611389, + "learning_rate": 9.913269236403457e-06, + "loss": 0.5074, + "step": 1640 + }, + { + "epoch": 0.46069623806850085, + "grad_norm": 0.9354593753814697, + "learning_rate": 9.912966070054045e-06, + "loss": 0.4672, + "step": 1641 + }, + { + "epoch": 0.4609769792251544, + "grad_norm": 0.7764392495155334, + "learning_rate": 9.912662379423447e-06, + "loss": 0.5, + "step": 1642 + }, + { + "epoch": 0.46125772038180796, + "grad_norm": 0.8332095742225647, + "learning_rate": 9.912358164544068e-06, + "loss": 0.4983, + "step": 1643 + }, + { + "epoch": 0.46153846153846156, + "grad_norm": 0.8388085961341858, + "learning_rate": 9.912053425448375e-06, + "loss": 0.486, + "step": 1644 + }, + { + "epoch": 0.4618192026951151, + "grad_norm": 0.7964088916778564, + "learning_rate": 9.911748162168886e-06, + "loss": 0.4782, + "step": 1645 + }, + { + "epoch": 0.46209994385176867, + "grad_norm": 0.684843122959137, + "learning_rate": 9.911442374738177e-06, + "loss": 0.4474, + "step": 1646 + }, + { + "epoch": 0.4623806850084222, + "grad_norm": 0.8008718490600586, + "learning_rate": 9.911136063188879e-06, + "loss": 0.4766, + "step": 1647 + }, + { + "epoch": 0.4626614261650758, + "grad_norm": 0.7822255492210388, + "learning_rate": 9.910829227553683e-06, + "loss": 0.4693, + "step": 1648 + }, + { + "epoch": 0.4629421673217294, + "grad_norm": 0.7523805499076843, + "learning_rate": 9.91052186786533e-06, + "loss": 0.4622, + "step": 1649 + }, + { + "epoch": 0.46322290847838293, + "grad_norm": 0.8271676301956177, + "learning_rate": 9.910213984156618e-06, + "loss": 0.467, + "step": 1650 + }, + { + "epoch": 0.4635036496350365, + "grad_norm": 0.714205801486969, + "learning_rate": 9.909905576460405e-06, + "loss": 0.4825, + "step": 1651 + }, + { + "epoch": 0.46378439079169004, + "grad_norm": 0.6896019577980042, + "learning_rate": 9.909596644809601e-06, + "loss": 0.4256, + "step": 1652 + }, + { + "epoch": 0.46406513194834365, + "grad_norm": 0.7542632222175598, + "learning_rate": 9.909287189237175e-06, + "loss": 0.434, + "step": 1653 + }, + { + "epoch": 0.4643458731049972, + "grad_norm": 0.8367012143135071, + "learning_rate": 9.908977209776149e-06, + "loss": 0.4808, + "step": 1654 + }, + { + "epoch": 0.46462661426165075, + "grad_norm": 0.790479302406311, + "learning_rate": 9.908666706459602e-06, + "loss": 0.464, + "step": 1655 + }, + { + "epoch": 0.4649073554183043, + "grad_norm": 0.6635505557060242, + "learning_rate": 9.908355679320668e-06, + "loss": 0.4576, + "step": 1656 + }, + { + "epoch": 0.4651880965749579, + "grad_norm": 0.8331106305122375, + "learning_rate": 9.90804412839254e-06, + "loss": 0.4516, + "step": 1657 + }, + { + "epoch": 0.46546883773161146, + "grad_norm": 0.7288654446601868, + "learning_rate": 9.907732053708463e-06, + "loss": 0.4304, + "step": 1658 + }, + { + "epoch": 0.465749578888265, + "grad_norm": 0.8938093781471252, + "learning_rate": 9.90741945530174e-06, + "loss": 0.4573, + "step": 1659 + }, + { + "epoch": 0.46603032004491857, + "grad_norm": 0.9637454152107239, + "learning_rate": 9.907106333205731e-06, + "loss": 0.4768, + "step": 1660 + }, + { + "epoch": 0.4663110612015722, + "grad_norm": 0.7337474822998047, + "learning_rate": 9.90679268745385e-06, + "loss": 0.455, + "step": 1661 + }, + { + "epoch": 0.4665918023582257, + "grad_norm": 0.7972109317779541, + "learning_rate": 9.906478518079564e-06, + "loss": 0.4839, + "step": 1662 + }, + { + "epoch": 0.4668725435148793, + "grad_norm": 0.7869675159454346, + "learning_rate": 9.906163825116406e-06, + "loss": 0.4418, + "step": 1663 + }, + { + "epoch": 0.46715328467153283, + "grad_norm": 0.7835783958435059, + "learning_rate": 9.90584860859795e-06, + "loss": 0.473, + "step": 1664 + }, + { + "epoch": 0.46743402582818644, + "grad_norm": 0.8191174268722534, + "learning_rate": 9.905532868557841e-06, + "loss": 0.4685, + "step": 1665 + }, + { + "epoch": 0.46771476698484, + "grad_norm": 0.8050325512886047, + "learning_rate": 9.905216605029768e-06, + "loss": 0.4777, + "step": 1666 + }, + { + "epoch": 0.46799550814149354, + "grad_norm": 0.8423100709915161, + "learning_rate": 9.904899818047483e-06, + "loss": 0.4874, + "step": 1667 + }, + { + "epoch": 0.4682762492981471, + "grad_norm": 0.8271785378456116, + "learning_rate": 9.904582507644791e-06, + "loss": 0.4673, + "step": 1668 + }, + { + "epoch": 0.46855699045480065, + "grad_norm": 0.8468924164772034, + "learning_rate": 9.904264673855553e-06, + "loss": 0.4361, + "step": 1669 + }, + { + "epoch": 0.46883773161145426, + "grad_norm": 0.8870489597320557, + "learning_rate": 9.903946316713687e-06, + "loss": 0.4564, + "step": 1670 + }, + { + "epoch": 0.4691184727681078, + "grad_norm": 0.8100656867027283, + "learning_rate": 9.903627436253166e-06, + "loss": 0.4289, + "step": 1671 + }, + { + "epoch": 0.46939921392476136, + "grad_norm": 0.7862449288368225, + "learning_rate": 9.903308032508019e-06, + "loss": 0.4387, + "step": 1672 + }, + { + "epoch": 0.4696799550814149, + "grad_norm": 0.8773373365402222, + "learning_rate": 9.902988105512328e-06, + "loss": 0.4436, + "step": 1673 + }, + { + "epoch": 0.4699606962380685, + "grad_norm": 0.8158847093582153, + "learning_rate": 9.902667655300237e-06, + "loss": 0.4848, + "step": 1674 + }, + { + "epoch": 0.4702414373947221, + "grad_norm": 0.7537901997566223, + "learning_rate": 9.902346681905944e-06, + "loss": 0.4255, + "step": 1675 + }, + { + "epoch": 0.4705221785513756, + "grad_norm": 0.8028502464294434, + "learning_rate": 9.902025185363697e-06, + "loss": 0.4752, + "step": 1676 + }, + { + "epoch": 0.4708029197080292, + "grad_norm": 0.7397227883338928, + "learning_rate": 9.901703165707805e-06, + "loss": 0.4505, + "step": 1677 + }, + { + "epoch": 0.4710836608646828, + "grad_norm": 0.7525709271430969, + "learning_rate": 9.901380622972634e-06, + "loss": 0.439, + "step": 1678 + }, + { + "epoch": 0.47136440202133634, + "grad_norm": 0.9410326480865479, + "learning_rate": 9.901057557192603e-06, + "loss": 0.4574, + "step": 1679 + }, + { + "epoch": 0.4716451431779899, + "grad_norm": 0.6887338757514954, + "learning_rate": 9.900733968402186e-06, + "loss": 0.4515, + "step": 1680 + }, + { + "epoch": 0.47192588433464344, + "grad_norm": 0.7193292379379272, + "learning_rate": 9.900409856635916e-06, + "loss": 0.4393, + "step": 1681 + }, + { + "epoch": 0.47220662549129705, + "grad_norm": 0.7619711756706238, + "learning_rate": 9.90008522192838e-06, + "loss": 0.4356, + "step": 1682 + }, + { + "epoch": 0.4724873666479506, + "grad_norm": 0.8705488443374634, + "learning_rate": 9.899760064314221e-06, + "loss": 0.5041, + "step": 1683 + }, + { + "epoch": 0.47276810780460415, + "grad_norm": 0.733539342880249, + "learning_rate": 9.899434383828137e-06, + "loss": 0.4908, + "step": 1684 + }, + { + "epoch": 0.4730488489612577, + "grad_norm": 0.8176860809326172, + "learning_rate": 9.899108180504883e-06, + "loss": 0.4769, + "step": 1685 + }, + { + "epoch": 0.47332959011791126, + "grad_norm": 0.8628261685371399, + "learning_rate": 9.89878145437927e-06, + "loss": 0.467, + "step": 1686 + }, + { + "epoch": 0.47361033127456487, + "grad_norm": 0.7747090458869934, + "learning_rate": 9.898454205486164e-06, + "loss": 0.5571, + "step": 1687 + }, + { + "epoch": 0.4738910724312184, + "grad_norm": 0.7447236776351929, + "learning_rate": 9.898126433860484e-06, + "loss": 0.4442, + "step": 1688 + }, + { + "epoch": 0.47417181358787197, + "grad_norm": 0.7353898882865906, + "learning_rate": 9.897798139537214e-06, + "loss": 0.3964, + "step": 1689 + }, + { + "epoch": 0.4744525547445255, + "grad_norm": 0.7461681365966797, + "learning_rate": 9.897469322551381e-06, + "loss": 0.4388, + "step": 1690 + }, + { + "epoch": 0.47473329590117913, + "grad_norm": 0.8284276127815247, + "learning_rate": 9.89713998293808e-06, + "loss": 0.5372, + "step": 1691 + }, + { + "epoch": 0.4750140370578327, + "grad_norm": 0.7964382171630859, + "learning_rate": 9.896810120732452e-06, + "loss": 0.4263, + "step": 1692 + }, + { + "epoch": 0.47529477821448624, + "grad_norm": 0.8379276394844055, + "learning_rate": 9.8964797359697e-06, + "loss": 0.4609, + "step": 1693 + }, + { + "epoch": 0.4755755193711398, + "grad_norm": 0.7556933164596558, + "learning_rate": 9.896148828685077e-06, + "loss": 0.4225, + "step": 1694 + }, + { + "epoch": 0.4758562605277934, + "grad_norm": 0.8176212310791016, + "learning_rate": 9.895817398913901e-06, + "loss": 0.4548, + "step": 1695 + }, + { + "epoch": 0.47613700168444695, + "grad_norm": 0.8952212929725647, + "learning_rate": 9.895485446691537e-06, + "loss": 0.4612, + "step": 1696 + }, + { + "epoch": 0.4764177428411005, + "grad_norm": 0.7643344402313232, + "learning_rate": 9.895152972053408e-06, + "loss": 0.4882, + "step": 1697 + }, + { + "epoch": 0.47669848399775405, + "grad_norm": 0.7792640328407288, + "learning_rate": 9.894819975034995e-06, + "loss": 0.4505, + "step": 1698 + }, + { + "epoch": 0.47697922515440766, + "grad_norm": 0.7249532341957092, + "learning_rate": 9.894486455671833e-06, + "loss": 0.4439, + "step": 1699 + }, + { + "epoch": 0.4772599663110612, + "grad_norm": 0.8037902116775513, + "learning_rate": 9.894152413999514e-06, + "loss": 0.4635, + "step": 1700 + }, + { + "epoch": 0.47754070746771476, + "grad_norm": 0.8761308789253235, + "learning_rate": 9.893817850053683e-06, + "loss": 0.4588, + "step": 1701 + }, + { + "epoch": 0.4778214486243683, + "grad_norm": 0.7966299057006836, + "learning_rate": 9.893482763870046e-06, + "loss": 0.4748, + "step": 1702 + }, + { + "epoch": 0.4781021897810219, + "grad_norm": 0.6806185841560364, + "learning_rate": 9.893147155484357e-06, + "loss": 0.4698, + "step": 1703 + }, + { + "epoch": 0.4783829309376755, + "grad_norm": 0.8648146986961365, + "learning_rate": 9.892811024932433e-06, + "loss": 0.4815, + "step": 1704 + }, + { + "epoch": 0.47866367209432903, + "grad_norm": 0.8874828815460205, + "learning_rate": 9.892474372250141e-06, + "loss": 0.4552, + "step": 1705 + }, + { + "epoch": 0.4789444132509826, + "grad_norm": 0.7632767558097839, + "learning_rate": 9.89213719747341e-06, + "loss": 0.4308, + "step": 1706 + }, + { + "epoch": 0.47922515440763613, + "grad_norm": 0.7456077337265015, + "learning_rate": 9.89179950063822e-06, + "loss": 0.4683, + "step": 1707 + }, + { + "epoch": 0.47950589556428974, + "grad_norm": 0.8706619739532471, + "learning_rate": 9.891461281780606e-06, + "loss": 0.4552, + "step": 1708 + }, + { + "epoch": 0.4797866367209433, + "grad_norm": 0.724503219127655, + "learning_rate": 9.891122540936661e-06, + "loss": 0.4558, + "step": 1709 + }, + { + "epoch": 0.48006737787759685, + "grad_norm": 0.9172972440719604, + "learning_rate": 9.890783278142537e-06, + "loss": 0.4365, + "step": 1710 + }, + { + "epoch": 0.4803481190342504, + "grad_norm": 0.796964168548584, + "learning_rate": 9.890443493434434e-06, + "loss": 0.4908, + "step": 1711 + }, + { + "epoch": 0.480628860190904, + "grad_norm": 0.7678824067115784, + "learning_rate": 9.890103186848614e-06, + "loss": 0.4613, + "step": 1712 + }, + { + "epoch": 0.48090960134755756, + "grad_norm": 0.9513916969299316, + "learning_rate": 9.88976235842139e-06, + "loss": 0.4717, + "step": 1713 + }, + { + "epoch": 0.4811903425042111, + "grad_norm": 0.9423227906227112, + "learning_rate": 9.889421008189135e-06, + "loss": 0.4948, + "step": 1714 + }, + { + "epoch": 0.48147108366086466, + "grad_norm": 0.7791553735733032, + "learning_rate": 9.889079136188276e-06, + "loss": 0.4246, + "step": 1715 + }, + { + "epoch": 0.48175182481751827, + "grad_norm": 0.8257462382316589, + "learning_rate": 9.888736742455294e-06, + "loss": 0.4384, + "step": 1716 + }, + { + "epoch": 0.4820325659741718, + "grad_norm": 0.8112425804138184, + "learning_rate": 9.888393827026728e-06, + "loss": 0.4939, + "step": 1717 + }, + { + "epoch": 0.4823133071308254, + "grad_norm": 0.7351109385490417, + "learning_rate": 9.888050389939172e-06, + "loss": 0.4958, + "step": 1718 + }, + { + "epoch": 0.4825940482874789, + "grad_norm": 0.7306787371635437, + "learning_rate": 9.887706431229274e-06, + "loss": 0.4605, + "step": 1719 + }, + { + "epoch": 0.48287478944413254, + "grad_norm": 0.7030362486839294, + "learning_rate": 9.887361950933742e-06, + "loss": 0.4529, + "step": 1720 + }, + { + "epoch": 0.4831555306007861, + "grad_norm": 0.6981697082519531, + "learning_rate": 9.887016949089334e-06, + "loss": 0.4915, + "step": 1721 + }, + { + "epoch": 0.48343627175743964, + "grad_norm": 0.7343105673789978, + "learning_rate": 9.886671425732868e-06, + "loss": 0.5037, + "step": 1722 + }, + { + "epoch": 0.4837170129140932, + "grad_norm": 0.6748523116111755, + "learning_rate": 9.886325380901217e-06, + "loss": 0.4758, + "step": 1723 + }, + { + "epoch": 0.48399775407074674, + "grad_norm": 0.7729491591453552, + "learning_rate": 9.885978814631306e-06, + "loss": 0.4684, + "step": 1724 + }, + { + "epoch": 0.48427849522740035, + "grad_norm": 0.8255395889282227, + "learning_rate": 9.885631726960119e-06, + "loss": 0.4684, + "step": 1725 + }, + { + "epoch": 0.4845592363840539, + "grad_norm": 0.7924033403396606, + "learning_rate": 9.885284117924696e-06, + "loss": 0.4562, + "step": 1726 + }, + { + "epoch": 0.48483997754070746, + "grad_norm": 0.6522700786590576, + "learning_rate": 9.884935987562132e-06, + "loss": 0.418, + "step": 1727 + }, + { + "epoch": 0.485120718697361, + "grad_norm": 0.7519421577453613, + "learning_rate": 9.884587335909577e-06, + "loss": 0.4698, + "step": 1728 + }, + { + "epoch": 0.4854014598540146, + "grad_norm": 0.7885614633560181, + "learning_rate": 9.884238163004237e-06, + "loss": 0.5158, + "step": 1729 + }, + { + "epoch": 0.48568220101066817, + "grad_norm": 0.790217399597168, + "learning_rate": 9.883888468883373e-06, + "loss": 0.5119, + "step": 1730 + }, + { + "epoch": 0.4859629421673217, + "grad_norm": 0.8577813506126404, + "learning_rate": 9.883538253584302e-06, + "loss": 0.4703, + "step": 1731 + }, + { + "epoch": 0.4862436833239753, + "grad_norm": 0.7423000931739807, + "learning_rate": 9.883187517144397e-06, + "loss": 0.4507, + "step": 1732 + }, + { + "epoch": 0.4865244244806289, + "grad_norm": 0.6888643503189087, + "learning_rate": 9.882836259601087e-06, + "loss": 0.4063, + "step": 1733 + }, + { + "epoch": 0.48680516563728243, + "grad_norm": 0.82935631275177, + "learning_rate": 9.882484480991857e-06, + "loss": 0.4997, + "step": 1734 + }, + { + "epoch": 0.487085906793936, + "grad_norm": 0.9646106958389282, + "learning_rate": 9.882132181354244e-06, + "loss": 0.489, + "step": 1735 + }, + { + "epoch": 0.48736664795058954, + "grad_norm": 0.7377066612243652, + "learning_rate": 9.881779360725847e-06, + "loss": 0.4142, + "step": 1736 + }, + { + "epoch": 0.48764738910724315, + "grad_norm": 0.9286917448043823, + "learning_rate": 9.881426019144311e-06, + "loss": 0.4301, + "step": 1737 + }, + { + "epoch": 0.4879281302638967, + "grad_norm": 0.8288553953170776, + "learning_rate": 9.881072156647347e-06, + "loss": 0.4988, + "step": 1738 + }, + { + "epoch": 0.48820887142055025, + "grad_norm": 0.7531358599662781, + "learning_rate": 9.880717773272717e-06, + "loss": 0.4544, + "step": 1739 + }, + { + "epoch": 0.4884896125772038, + "grad_norm": 0.9587254524230957, + "learning_rate": 9.880362869058237e-06, + "loss": 0.4704, + "step": 1740 + }, + { + "epoch": 0.4887703537338574, + "grad_norm": 0.8731741905212402, + "learning_rate": 9.880007444041779e-06, + "loss": 0.4812, + "step": 1741 + }, + { + "epoch": 0.48905109489051096, + "grad_norm": 0.760473370552063, + "learning_rate": 9.879651498261275e-06, + "loss": 0.4975, + "step": 1742 + }, + { + "epoch": 0.4893318360471645, + "grad_norm": 0.8611308932304382, + "learning_rate": 9.879295031754708e-06, + "loss": 0.4463, + "step": 1743 + }, + { + "epoch": 0.48961257720381807, + "grad_norm": 0.7820237874984741, + "learning_rate": 9.878938044560117e-06, + "loss": 0.445, + "step": 1744 + }, + { + "epoch": 0.4898933183604716, + "grad_norm": 0.7753906846046448, + "learning_rate": 9.878580536715597e-06, + "loss": 0.464, + "step": 1745 + }, + { + "epoch": 0.4901740595171252, + "grad_norm": 0.7010873556137085, + "learning_rate": 9.878222508259301e-06, + "loss": 0.4784, + "step": 1746 + }, + { + "epoch": 0.4904548006737788, + "grad_norm": 0.8321229219436646, + "learning_rate": 9.877863959229435e-06, + "loss": 0.4658, + "step": 1747 + }, + { + "epoch": 0.49073554183043233, + "grad_norm": 0.7034891843795776, + "learning_rate": 9.87750488966426e-06, + "loss": 0.4785, + "step": 1748 + }, + { + "epoch": 0.4910162829870859, + "grad_norm": 0.7781282067298889, + "learning_rate": 9.877145299602093e-06, + "loss": 0.4487, + "step": 1749 + }, + { + "epoch": 0.4912970241437395, + "grad_norm": 0.7160866856575012, + "learning_rate": 9.87678518908131e-06, + "loss": 0.4861, + "step": 1750 + }, + { + "epoch": 0.49157776530039304, + "grad_norm": 0.8705825805664062, + "learning_rate": 9.876424558140338e-06, + "loss": 0.4492, + "step": 1751 + }, + { + "epoch": 0.4918585064570466, + "grad_norm": 0.8751598000526428, + "learning_rate": 9.876063406817664e-06, + "loss": 0.4853, + "step": 1752 + }, + { + "epoch": 0.49213924761370015, + "grad_norm": 0.8282373547554016, + "learning_rate": 9.875701735151823e-06, + "loss": 0.4529, + "step": 1753 + }, + { + "epoch": 0.49241998877035376, + "grad_norm": 0.8029273152351379, + "learning_rate": 9.875339543181413e-06, + "loss": 0.4559, + "step": 1754 + }, + { + "epoch": 0.4927007299270073, + "grad_norm": 0.8402436971664429, + "learning_rate": 9.874976830945085e-06, + "loss": 0.4225, + "step": 1755 + }, + { + "epoch": 0.49298147108366086, + "grad_norm": 0.8531173467636108, + "learning_rate": 9.874613598481545e-06, + "loss": 0.4886, + "step": 1756 + }, + { + "epoch": 0.4932622122403144, + "grad_norm": 0.8536921143531799, + "learning_rate": 9.874249845829556e-06, + "loss": 0.4655, + "step": 1757 + }, + { + "epoch": 0.493542953396968, + "grad_norm": 0.7912077307701111, + "learning_rate": 9.873885573027932e-06, + "loss": 0.4587, + "step": 1758 + }, + { + "epoch": 0.4938236945536216, + "grad_norm": 0.7647911906242371, + "learning_rate": 9.87352078011555e-06, + "loss": 0.4748, + "step": 1759 + }, + { + "epoch": 0.4941044357102751, + "grad_norm": 0.8365086317062378, + "learning_rate": 9.873155467131336e-06, + "loss": 0.4741, + "step": 1760 + }, + { + "epoch": 0.4943851768669287, + "grad_norm": 0.7844578623771667, + "learning_rate": 9.872789634114277e-06, + "loss": 0.441, + "step": 1761 + }, + { + "epoch": 0.49466591802358223, + "grad_norm": 0.9620003700256348, + "learning_rate": 9.872423281103406e-06, + "loss": 0.4754, + "step": 1762 + }, + { + "epoch": 0.49494665918023584, + "grad_norm": 0.7750184535980225, + "learning_rate": 9.872056408137826e-06, + "loss": 0.4352, + "step": 1763 + }, + { + "epoch": 0.4952274003368894, + "grad_norm": 0.7962160110473633, + "learning_rate": 9.871689015256682e-06, + "loss": 0.4589, + "step": 1764 + }, + { + "epoch": 0.49550814149354294, + "grad_norm": 0.9381359219551086, + "learning_rate": 9.871321102499181e-06, + "loss": 0.5046, + "step": 1765 + }, + { + "epoch": 0.4957888826501965, + "grad_norm": 0.8677588701248169, + "learning_rate": 9.870952669904585e-06, + "loss": 0.4957, + "step": 1766 + }, + { + "epoch": 0.4960696238068501, + "grad_norm": 0.7756445407867432, + "learning_rate": 9.87058371751221e-06, + "loss": 0.4448, + "step": 1767 + }, + { + "epoch": 0.49635036496350365, + "grad_norm": 0.8854823708534241, + "learning_rate": 9.870214245361429e-06, + "loss": 0.4712, + "step": 1768 + }, + { + "epoch": 0.4966311061201572, + "grad_norm": 0.8065089583396912, + "learning_rate": 9.869844253491669e-06, + "loss": 0.4698, + "step": 1769 + }, + { + "epoch": 0.49691184727681076, + "grad_norm": 0.8647491335868835, + "learning_rate": 9.869473741942415e-06, + "loss": 0.499, + "step": 1770 + }, + { + "epoch": 0.49719258843346437, + "grad_norm": 0.7648187875747681, + "learning_rate": 9.869102710753204e-06, + "loss": 0.4492, + "step": 1771 + }, + { + "epoch": 0.4974733295901179, + "grad_norm": 0.7908899784088135, + "learning_rate": 9.868731159963631e-06, + "loss": 0.4415, + "step": 1772 + }, + { + "epoch": 0.49775407074677147, + "grad_norm": 0.7534183859825134, + "learning_rate": 9.868359089613346e-06, + "loss": 0.4815, + "step": 1773 + }, + { + "epoch": 0.498034811903425, + "grad_norm": 0.7666630148887634, + "learning_rate": 9.867986499742052e-06, + "loss": 0.4592, + "step": 1774 + }, + { + "epoch": 0.49831555306007863, + "grad_norm": 0.7794538736343384, + "learning_rate": 9.86761339038951e-06, + "loss": 0.4911, + "step": 1775 + }, + { + "epoch": 0.4985962942167322, + "grad_norm": 0.7326319217681885, + "learning_rate": 9.86723976159554e-06, + "loss": 0.4299, + "step": 1776 + }, + { + "epoch": 0.49887703537338574, + "grad_norm": 0.8433898687362671, + "learning_rate": 9.866865613400008e-06, + "loss": 0.455, + "step": 1777 + }, + { + "epoch": 0.4991577765300393, + "grad_norm": 0.8451767563819885, + "learning_rate": 9.866490945842841e-06, + "loss": 0.5088, + "step": 1778 + }, + { + "epoch": 0.4994385176866929, + "grad_norm": 0.8145613074302673, + "learning_rate": 9.866115758964026e-06, + "loss": 0.4789, + "step": 1779 + }, + { + "epoch": 0.49971925884334645, + "grad_norm": 0.6962898969650269, + "learning_rate": 9.865740052803596e-06, + "loss": 0.3938, + "step": 1780 + }, + { + "epoch": 0.5, + "grad_norm": 0.8139388561248779, + "learning_rate": 9.865363827401643e-06, + "loss": 0.4132, + "step": 1781 + }, + { + "epoch": 0.5002807411566536, + "grad_norm": 0.7577467560768127, + "learning_rate": 9.864987082798323e-06, + "loss": 0.4542, + "step": 1782 + }, + { + "epoch": 0.5005614823133071, + "grad_norm": 0.7806551456451416, + "learning_rate": 9.864609819033833e-06, + "loss": 0.5121, + "step": 1783 + }, + { + "epoch": 0.5008422234699607, + "grad_norm": 0.852153480052948, + "learning_rate": 9.864232036148434e-06, + "loss": 0.4866, + "step": 1784 + }, + { + "epoch": 0.5011229646266142, + "grad_norm": 0.7819837927818298, + "learning_rate": 9.86385373418244e-06, + "loss": 0.45, + "step": 1785 + }, + { + "epoch": 0.5014037057832679, + "grad_norm": 0.6908717751502991, + "learning_rate": 9.863474913176222e-06, + "loss": 0.5197, + "step": 1786 + }, + { + "epoch": 0.5016844469399214, + "grad_norm": 0.7145721912384033, + "learning_rate": 9.863095573170206e-06, + "loss": 0.4477, + "step": 1787 + }, + { + "epoch": 0.501965188096575, + "grad_norm": 0.8932965397834778, + "learning_rate": 9.862715714204872e-06, + "loss": 0.4559, + "step": 1788 + }, + { + "epoch": 0.5022459292532285, + "grad_norm": 0.8007551431655884, + "learning_rate": 9.862335336320753e-06, + "loss": 0.4498, + "step": 1789 + }, + { + "epoch": 0.5025266704098821, + "grad_norm": 0.76729416847229, + "learning_rate": 9.861954439558448e-06, + "loss": 0.4409, + "step": 1790 + }, + { + "epoch": 0.5028074115665356, + "grad_norm": 0.8064449429512024, + "learning_rate": 9.8615730239586e-06, + "loss": 0.4683, + "step": 1791 + }, + { + "epoch": 0.5030881527231892, + "grad_norm": 0.7141784429550171, + "learning_rate": 9.861191089561908e-06, + "loss": 0.4319, + "step": 1792 + }, + { + "epoch": 0.5033688938798427, + "grad_norm": 0.8147256970405579, + "learning_rate": 9.860808636409134e-06, + "loss": 0.4517, + "step": 1793 + }, + { + "epoch": 0.5036496350364964, + "grad_norm": 0.7154982089996338, + "learning_rate": 9.86042566454109e-06, + "loss": 0.4578, + "step": 1794 + }, + { + "epoch": 0.50393037619315, + "grad_norm": 0.8301019072532654, + "learning_rate": 9.860042173998643e-06, + "loss": 0.4388, + "step": 1795 + }, + { + "epoch": 0.5042111173498035, + "grad_norm": 0.8481314778327942, + "learning_rate": 9.859658164822718e-06, + "loss": 0.489, + "step": 1796 + }, + { + "epoch": 0.5044918585064571, + "grad_norm": 0.8445191383361816, + "learning_rate": 9.859273637054295e-06, + "loss": 0.52, + "step": 1797 + }, + { + "epoch": 0.5047725996631106, + "grad_norm": 0.8490387201309204, + "learning_rate": 9.858888590734406e-06, + "loss": 0.4881, + "step": 1798 + }, + { + "epoch": 0.5050533408197642, + "grad_norm": 0.812320351600647, + "learning_rate": 9.858503025904143e-06, + "loss": 0.4528, + "step": 1799 + }, + { + "epoch": 0.5053340819764177, + "grad_norm": 0.8201896548271179, + "learning_rate": 9.858116942604649e-06, + "loss": 0.4681, + "step": 1800 + }, + { + "epoch": 0.5056148231330713, + "grad_norm": 0.9138181805610657, + "learning_rate": 9.857730340877128e-06, + "loss": 0.4774, + "step": 1801 + }, + { + "epoch": 0.5058955642897248, + "grad_norm": 0.6980558633804321, + "learning_rate": 9.857343220762831e-06, + "loss": 0.4542, + "step": 1802 + }, + { + "epoch": 0.5061763054463785, + "grad_norm": 0.7874338626861572, + "learning_rate": 9.856955582303072e-06, + "loss": 0.5028, + "step": 1803 + }, + { + "epoch": 0.506457046603032, + "grad_norm": 0.7695236802101135, + "learning_rate": 9.856567425539217e-06, + "loss": 0.4738, + "step": 1804 + }, + { + "epoch": 0.5067377877596856, + "grad_norm": 0.8767713904380798, + "learning_rate": 9.856178750512688e-06, + "loss": 0.4734, + "step": 1805 + }, + { + "epoch": 0.5070185289163391, + "grad_norm": 0.6862586736679077, + "learning_rate": 9.85578955726496e-06, + "loss": 0.512, + "step": 1806 + }, + { + "epoch": 0.5072992700729927, + "grad_norm": 0.9676357507705688, + "learning_rate": 9.855399845837565e-06, + "loss": 0.4657, + "step": 1807 + }, + { + "epoch": 0.5075800112296462, + "grad_norm": 0.8110551834106445, + "learning_rate": 9.855009616272095e-06, + "loss": 0.4411, + "step": 1808 + }, + { + "epoch": 0.5078607523862998, + "grad_norm": 0.8683038353919983, + "learning_rate": 9.854618868610188e-06, + "loss": 0.5059, + "step": 1809 + }, + { + "epoch": 0.5081414935429533, + "grad_norm": 0.9837791323661804, + "learning_rate": 9.854227602893547e-06, + "loss": 0.458, + "step": 1810 + }, + { + "epoch": 0.508422234699607, + "grad_norm": 0.8323397040367126, + "learning_rate": 9.853835819163921e-06, + "loss": 0.4763, + "step": 1811 + }, + { + "epoch": 0.5087029758562606, + "grad_norm": 0.7720654010772705, + "learning_rate": 9.85344351746312e-06, + "loss": 0.5114, + "step": 1812 + }, + { + "epoch": 0.5089837170129141, + "grad_norm": 0.7435499429702759, + "learning_rate": 9.853050697833009e-06, + "loss": 0.4489, + "step": 1813 + }, + { + "epoch": 0.5092644581695677, + "grad_norm": 0.7995089292526245, + "learning_rate": 9.852657360315505e-06, + "loss": 0.4712, + "step": 1814 + }, + { + "epoch": 0.5095451993262212, + "grad_norm": 0.7547017931938171, + "learning_rate": 9.852263504952585e-06, + "loss": 0.4818, + "step": 1815 + }, + { + "epoch": 0.5098259404828748, + "grad_norm": 0.7278504967689514, + "learning_rate": 9.85186913178628e-06, + "loss": 0.5011, + "step": 1816 + }, + { + "epoch": 0.5101066816395283, + "grad_norm": 0.8858345150947571, + "learning_rate": 9.851474240858671e-06, + "loss": 0.4514, + "step": 1817 + }, + { + "epoch": 0.5103874227961819, + "grad_norm": 0.8252277970314026, + "learning_rate": 9.851078832211903e-06, + "loss": 0.4764, + "step": 1818 + }, + { + "epoch": 0.5106681639528355, + "grad_norm": 0.836026668548584, + "learning_rate": 9.850682905888165e-06, + "loss": 0.5051, + "step": 1819 + }, + { + "epoch": 0.5109489051094891, + "grad_norm": 0.6830624938011169, + "learning_rate": 9.850286461929714e-06, + "loss": 0.4316, + "step": 1820 + }, + { + "epoch": 0.5112296462661426, + "grad_norm": 0.6491891741752625, + "learning_rate": 9.849889500378854e-06, + "loss": 0.4308, + "step": 1821 + }, + { + "epoch": 0.5115103874227962, + "grad_norm": 0.7383559942245483, + "learning_rate": 9.849492021277946e-06, + "loss": 0.4704, + "step": 1822 + }, + { + "epoch": 0.5117911285794498, + "grad_norm": 0.7036583423614502, + "learning_rate": 9.849094024669405e-06, + "loss": 0.4578, + "step": 1823 + }, + { + "epoch": 0.5120718697361033, + "grad_norm": 0.7398111820220947, + "learning_rate": 9.848695510595705e-06, + "loss": 0.4467, + "step": 1824 + }, + { + "epoch": 0.5123526108927569, + "grad_norm": 0.6627861857414246, + "learning_rate": 9.848296479099373e-06, + "loss": 0.4478, + "step": 1825 + }, + { + "epoch": 0.5126333520494104, + "grad_norm": 0.7219253182411194, + "learning_rate": 9.847896930222989e-06, + "loss": 0.4185, + "step": 1826 + }, + { + "epoch": 0.512914093206064, + "grad_norm": 0.84356689453125, + "learning_rate": 9.847496864009191e-06, + "loss": 0.5129, + "step": 1827 + }, + { + "epoch": 0.5131948343627176, + "grad_norm": 0.765963077545166, + "learning_rate": 9.847096280500675e-06, + "loss": 0.4656, + "step": 1828 + }, + { + "epoch": 0.5134755755193712, + "grad_norm": 0.7722097039222717, + "learning_rate": 9.846695179740184e-06, + "loss": 0.4865, + "step": 1829 + }, + { + "epoch": 0.5137563166760247, + "grad_norm": 0.7085173726081848, + "learning_rate": 9.846293561770523e-06, + "loss": 0.4209, + "step": 1830 + }, + { + "epoch": 0.5140370578326783, + "grad_norm": 0.6874461770057678, + "learning_rate": 9.845891426634551e-06, + "loss": 0.4673, + "step": 1831 + }, + { + "epoch": 0.5143177989893318, + "grad_norm": 0.8429763317108154, + "learning_rate": 9.84548877437518e-06, + "loss": 0.4672, + "step": 1832 + }, + { + "epoch": 0.5145985401459854, + "grad_norm": 0.6847505569458008, + "learning_rate": 9.84508560503538e-06, + "loss": 0.4489, + "step": 1833 + }, + { + "epoch": 0.5148792813026389, + "grad_norm": 0.6943210959434509, + "learning_rate": 9.844681918658172e-06, + "loss": 0.4422, + "step": 1834 + }, + { + "epoch": 0.5151600224592925, + "grad_norm": 0.8705496191978455, + "learning_rate": 9.844277715286639e-06, + "loss": 0.4773, + "step": 1835 + }, + { + "epoch": 0.5154407636159462, + "grad_norm": 0.6923941373825073, + "learning_rate": 9.843872994963912e-06, + "loss": 0.4355, + "step": 1836 + }, + { + "epoch": 0.5157215047725997, + "grad_norm": 0.7502871155738831, + "learning_rate": 9.84346775773318e-06, + "loss": 0.4859, + "step": 1837 + }, + { + "epoch": 0.5160022459292533, + "grad_norm": 0.736991822719574, + "learning_rate": 9.84306200363769e-06, + "loss": 0.4332, + "step": 1838 + }, + { + "epoch": 0.5162829870859068, + "grad_norm": 0.7772122621536255, + "learning_rate": 9.842655732720738e-06, + "loss": 0.4416, + "step": 1839 + }, + { + "epoch": 0.5165637282425604, + "grad_norm": 0.6768174171447754, + "learning_rate": 9.842248945025682e-06, + "loss": 0.4348, + "step": 1840 + }, + { + "epoch": 0.5168444693992139, + "grad_norm": 0.7615413069725037, + "learning_rate": 9.84184164059593e-06, + "loss": 0.5154, + "step": 1841 + }, + { + "epoch": 0.5171252105558675, + "grad_norm": 0.8892810940742493, + "learning_rate": 9.84143381947495e-06, + "loss": 0.5142, + "step": 1842 + }, + { + "epoch": 0.517405951712521, + "grad_norm": 0.8347628712654114, + "learning_rate": 9.841025481706256e-06, + "loss": 0.4566, + "step": 1843 + }, + { + "epoch": 0.5176866928691746, + "grad_norm": 0.7237997055053711, + "learning_rate": 9.84061662733343e-06, + "loss": 0.4733, + "step": 1844 + }, + { + "epoch": 0.5179674340258282, + "grad_norm": 0.7402610778808594, + "learning_rate": 9.840207256400097e-06, + "loss": 0.4682, + "step": 1845 + }, + { + "epoch": 0.5182481751824818, + "grad_norm": 0.7389965653419495, + "learning_rate": 9.839797368949946e-06, + "loss": 0.421, + "step": 1846 + }, + { + "epoch": 0.5185289163391353, + "grad_norm": 0.7180278897285461, + "learning_rate": 9.839386965026716e-06, + "loss": 0.4503, + "step": 1847 + }, + { + "epoch": 0.5188096574957889, + "grad_norm": 0.7654619812965393, + "learning_rate": 9.838976044674204e-06, + "loss": 0.4503, + "step": 1848 + }, + { + "epoch": 0.5190903986524424, + "grad_norm": 0.8214221000671387, + "learning_rate": 9.838564607936259e-06, + "loss": 0.4682, + "step": 1849 + }, + { + "epoch": 0.519371139809096, + "grad_norm": 0.7136037349700928, + "learning_rate": 9.83815265485679e-06, + "loss": 0.4364, + "step": 1850 + }, + { + "epoch": 0.5196518809657495, + "grad_norm": 0.7680732607841492, + "learning_rate": 9.837740185479755e-06, + "loss": 0.5183, + "step": 1851 + }, + { + "epoch": 0.5199326221224031, + "grad_norm": 0.7541682124137878, + "learning_rate": 9.83732719984917e-06, + "loss": 0.4879, + "step": 1852 + }, + { + "epoch": 0.5202133632790568, + "grad_norm": 0.7221009731292725, + "learning_rate": 9.836913698009109e-06, + "loss": 0.4726, + "step": 1853 + }, + { + "epoch": 0.5204941044357103, + "grad_norm": 0.6487541794776917, + "learning_rate": 9.836499680003697e-06, + "loss": 0.4558, + "step": 1854 + }, + { + "epoch": 0.5207748455923639, + "grad_norm": 0.7128645181655884, + "learning_rate": 9.836085145877115e-06, + "loss": 0.4665, + "step": 1855 + }, + { + "epoch": 0.5210555867490174, + "grad_norm": 0.8577792644500732, + "learning_rate": 9.835670095673599e-06, + "loss": 0.5188, + "step": 1856 + }, + { + "epoch": 0.521336327905671, + "grad_norm": 0.8320954442024231, + "learning_rate": 9.835254529437444e-06, + "loss": 0.458, + "step": 1857 + }, + { + "epoch": 0.5216170690623245, + "grad_norm": 0.7644145488739014, + "learning_rate": 9.834838447212991e-06, + "loss": 0.4492, + "step": 1858 + }, + { + "epoch": 0.5218978102189781, + "grad_norm": 0.7536954283714294, + "learning_rate": 9.834421849044646e-06, + "loss": 0.4481, + "step": 1859 + }, + { + "epoch": 0.5221785513756316, + "grad_norm": 0.7009553909301758, + "learning_rate": 9.834004734976865e-06, + "loss": 0.4779, + "step": 1860 + }, + { + "epoch": 0.5224592925322852, + "grad_norm": 0.7501538991928101, + "learning_rate": 9.83358710505416e-06, + "loss": 0.4499, + "step": 1861 + }, + { + "epoch": 0.5227400336889388, + "grad_norm": 0.7410726547241211, + "learning_rate": 9.833168959321097e-06, + "loss": 0.4466, + "step": 1862 + }, + { + "epoch": 0.5230207748455924, + "grad_norm": 0.7580085396766663, + "learning_rate": 9.832750297822298e-06, + "loss": 0.454, + "step": 1863 + }, + { + "epoch": 0.523301516002246, + "grad_norm": 0.810674250125885, + "learning_rate": 9.832331120602439e-06, + "loss": 0.4561, + "step": 1864 + }, + { + "epoch": 0.5235822571588995, + "grad_norm": 0.7958224415779114, + "learning_rate": 9.831911427706253e-06, + "loss": 0.4859, + "step": 1865 + }, + { + "epoch": 0.523862998315553, + "grad_norm": 0.7352412939071655, + "learning_rate": 9.831491219178528e-06, + "loss": 0.4653, + "step": 1866 + }, + { + "epoch": 0.5241437394722066, + "grad_norm": 0.7566716074943542, + "learning_rate": 9.831070495064106e-06, + "loss": 0.4384, + "step": 1867 + }, + { + "epoch": 0.5244244806288602, + "grad_norm": 0.7479974031448364, + "learning_rate": 9.830649255407882e-06, + "loss": 0.4641, + "step": 1868 + }, + { + "epoch": 0.5247052217855137, + "grad_norm": 1.0138598680496216, + "learning_rate": 9.83022750025481e-06, + "loss": 0.4781, + "step": 1869 + }, + { + "epoch": 0.5249859629421674, + "grad_norm": 0.7030500769615173, + "learning_rate": 9.829805229649896e-06, + "loss": 0.4739, + "step": 1870 + }, + { + "epoch": 0.5252667040988209, + "grad_norm": 0.7876224517822266, + "learning_rate": 9.829382443638202e-06, + "loss": 0.4826, + "step": 1871 + }, + { + "epoch": 0.5255474452554745, + "grad_norm": 0.8127958178520203, + "learning_rate": 9.828959142264845e-06, + "loss": 0.4795, + "step": 1872 + }, + { + "epoch": 0.525828186412128, + "grad_norm": 0.8334431648254395, + "learning_rate": 9.828535325574999e-06, + "loss": 0.4765, + "step": 1873 + }, + { + "epoch": 0.5261089275687816, + "grad_norm": 0.8717887997627258, + "learning_rate": 9.82811099361389e-06, + "loss": 0.4354, + "step": 1874 + }, + { + "epoch": 0.5263896687254351, + "grad_norm": 0.7463579773902893, + "learning_rate": 9.827686146426798e-06, + "loss": 0.4283, + "step": 1875 + }, + { + "epoch": 0.5266704098820887, + "grad_norm": 0.8334963917732239, + "learning_rate": 9.827260784059064e-06, + "loss": 0.4554, + "step": 1876 + }, + { + "epoch": 0.5269511510387422, + "grad_norm": 0.6488940715789795, + "learning_rate": 9.826834906556077e-06, + "loss": 0.4422, + "step": 1877 + }, + { + "epoch": 0.5272318921953958, + "grad_norm": 0.6755518913269043, + "learning_rate": 9.826408513963283e-06, + "loss": 0.4323, + "step": 1878 + }, + { + "epoch": 0.5275126333520495, + "grad_norm": 0.7829365730285645, + "learning_rate": 9.825981606326189e-06, + "loss": 0.5051, + "step": 1879 + }, + { + "epoch": 0.527793374508703, + "grad_norm": 0.7612887620925903, + "learning_rate": 9.825554183690347e-06, + "loss": 0.5031, + "step": 1880 + }, + { + "epoch": 0.5280741156653566, + "grad_norm": 0.700315535068512, + "learning_rate": 9.82512624610137e-06, + "loss": 0.4804, + "step": 1881 + }, + { + "epoch": 0.5283548568220101, + "grad_norm": 0.7135051488876343, + "learning_rate": 9.824697793604929e-06, + "loss": 0.474, + "step": 1882 + }, + { + "epoch": 0.5286355979786637, + "grad_norm": 0.911785900592804, + "learning_rate": 9.82426882624674e-06, + "loss": 0.5015, + "step": 1883 + }, + { + "epoch": 0.5289163391353172, + "grad_norm": 0.8162867426872253, + "learning_rate": 9.823839344072582e-06, + "loss": 0.4659, + "step": 1884 + }, + { + "epoch": 0.5291970802919708, + "grad_norm": 0.7398881316184998, + "learning_rate": 9.823409347128286e-06, + "loss": 0.4696, + "step": 1885 + }, + { + "epoch": 0.5294778214486243, + "grad_norm": 0.8000018000602722, + "learning_rate": 9.822978835459738e-06, + "loss": 0.4745, + "step": 1886 + }, + { + "epoch": 0.529758562605278, + "grad_norm": 0.7813568711280823, + "learning_rate": 9.822547809112883e-06, + "loss": 0.4689, + "step": 1887 + }, + { + "epoch": 0.5300393037619315, + "grad_norm": 0.7030794620513916, + "learning_rate": 9.822116268133715e-06, + "loss": 0.4661, + "step": 1888 + }, + { + "epoch": 0.5303200449185851, + "grad_norm": 0.7161295413970947, + "learning_rate": 9.821684212568286e-06, + "loss": 0.4725, + "step": 1889 + }, + { + "epoch": 0.5306007860752386, + "grad_norm": 0.7135481834411621, + "learning_rate": 9.821251642462701e-06, + "loss": 0.4403, + "step": 1890 + }, + { + "epoch": 0.5308815272318922, + "grad_norm": 0.7595284581184387, + "learning_rate": 9.820818557863123e-06, + "loss": 0.4175, + "step": 1891 + }, + { + "epoch": 0.5311622683885457, + "grad_norm": 0.7693842053413391, + "learning_rate": 9.820384958815766e-06, + "loss": 0.4789, + "step": 1892 + }, + { + "epoch": 0.5314430095451993, + "grad_norm": 0.7659890651702881, + "learning_rate": 9.819950845366904e-06, + "loss": 0.4893, + "step": 1893 + }, + { + "epoch": 0.5317237507018528, + "grad_norm": 0.8019644021987915, + "learning_rate": 9.819516217562859e-06, + "loss": 0.4266, + "step": 1894 + }, + { + "epoch": 0.5320044918585065, + "grad_norm": 0.7651217579841614, + "learning_rate": 9.819081075450014e-06, + "loss": 0.5205, + "step": 1895 + }, + { + "epoch": 0.5322852330151601, + "grad_norm": 0.7626285552978516, + "learning_rate": 9.818645419074807e-06, + "loss": 0.4727, + "step": 1896 + }, + { + "epoch": 0.5325659741718136, + "grad_norm": 0.8264485001564026, + "learning_rate": 9.818209248483724e-06, + "loss": 0.416, + "step": 1897 + }, + { + "epoch": 0.5328467153284672, + "grad_norm": 0.7829334139823914, + "learning_rate": 9.817772563723313e-06, + "loss": 0.4942, + "step": 1898 + }, + { + "epoch": 0.5331274564851207, + "grad_norm": 0.7674833536148071, + "learning_rate": 9.817335364840173e-06, + "loss": 0.3953, + "step": 1899 + }, + { + "epoch": 0.5334081976417743, + "grad_norm": 0.7134654521942139, + "learning_rate": 9.816897651880962e-06, + "loss": 0.408, + "step": 1900 + }, + { + "epoch": 0.5336889387984278, + "grad_norm": 0.7987240552902222, + "learning_rate": 9.816459424892385e-06, + "loss": 0.4509, + "step": 1901 + }, + { + "epoch": 0.5339696799550814, + "grad_norm": 0.8815852403640747, + "learning_rate": 9.81602068392121e-06, + "loss": 0.4644, + "step": 1902 + }, + { + "epoch": 0.5342504211117349, + "grad_norm": 0.889531135559082, + "learning_rate": 9.815581429014259e-06, + "loss": 0.4997, + "step": 1903 + }, + { + "epoch": 0.5345311622683886, + "grad_norm": 0.896373450756073, + "learning_rate": 9.815141660218402e-06, + "loss": 0.4973, + "step": 1904 + }, + { + "epoch": 0.5348119034250421, + "grad_norm": 0.7559667825698853, + "learning_rate": 9.814701377580571e-06, + "loss": 0.457, + "step": 1905 + }, + { + "epoch": 0.5350926445816957, + "grad_norm": 0.9544247984886169, + "learning_rate": 9.814260581147749e-06, + "loss": 0.4698, + "step": 1906 + }, + { + "epoch": 0.5353733857383493, + "grad_norm": 0.8227559328079224, + "learning_rate": 9.813819270966978e-06, + "loss": 0.4788, + "step": 1907 + }, + { + "epoch": 0.5356541268950028, + "grad_norm": 0.6959999203681946, + "learning_rate": 9.813377447085347e-06, + "loss": 0.4957, + "step": 1908 + }, + { + "epoch": 0.5359348680516564, + "grad_norm": 0.8869747519493103, + "learning_rate": 9.812935109550008e-06, + "loss": 0.4636, + "step": 1909 + }, + { + "epoch": 0.5362156092083099, + "grad_norm": 0.8593977689743042, + "learning_rate": 9.812492258408164e-06, + "loss": 0.4438, + "step": 1910 + }, + { + "epoch": 0.5364963503649635, + "grad_norm": 0.8060788512229919, + "learning_rate": 9.812048893707073e-06, + "loss": 0.465, + "step": 1911 + }, + { + "epoch": 0.5367770915216171, + "grad_norm": 0.7863687872886658, + "learning_rate": 9.811605015494048e-06, + "loss": 0.4348, + "step": 1912 + }, + { + "epoch": 0.5370578326782707, + "grad_norm": 0.8226597905158997, + "learning_rate": 9.811160623816458e-06, + "loss": 0.4424, + "step": 1913 + }, + { + "epoch": 0.5373385738349242, + "grad_norm": 0.9052649140357971, + "learning_rate": 9.810715718721723e-06, + "loss": 0.4204, + "step": 1914 + }, + { + "epoch": 0.5376193149915778, + "grad_norm": 1.0391721725463867, + "learning_rate": 9.810270300257325e-06, + "loss": 0.4733, + "step": 1915 + }, + { + "epoch": 0.5379000561482313, + "grad_norm": 0.8370290994644165, + "learning_rate": 9.809824368470794e-06, + "loss": 0.4671, + "step": 1916 + }, + { + "epoch": 0.5381807973048849, + "grad_norm": 0.9258934855461121, + "learning_rate": 9.809377923409713e-06, + "loss": 0.4404, + "step": 1917 + }, + { + "epoch": 0.5384615384615384, + "grad_norm": 0.9329887628555298, + "learning_rate": 9.80893096512173e-06, + "loss": 0.4897, + "step": 1918 + }, + { + "epoch": 0.538742279618192, + "grad_norm": 0.9769116640090942, + "learning_rate": 9.80848349365454e-06, + "loss": 0.4841, + "step": 1919 + }, + { + "epoch": 0.5390230207748455, + "grad_norm": 0.6780902743339539, + "learning_rate": 9.80803550905589e-06, + "loss": 0.4816, + "step": 1920 + }, + { + "epoch": 0.5393037619314992, + "grad_norm": 0.9711014628410339, + "learning_rate": 9.807587011373594e-06, + "loss": 0.4728, + "step": 1921 + }, + { + "epoch": 0.5395845030881528, + "grad_norm": 1.0224378108978271, + "learning_rate": 9.807138000655506e-06, + "loss": 0.4554, + "step": 1922 + }, + { + "epoch": 0.5398652442448063, + "grad_norm": 0.7928524613380432, + "learning_rate": 9.806688476949544e-06, + "loss": 0.4499, + "step": 1923 + }, + { + "epoch": 0.5401459854014599, + "grad_norm": 0.7543311715126038, + "learning_rate": 9.806238440303679e-06, + "loss": 0.4302, + "step": 1924 + }, + { + "epoch": 0.5404267265581134, + "grad_norm": 1.0170397758483887, + "learning_rate": 9.805787890765937e-06, + "loss": 0.4592, + "step": 1925 + }, + { + "epoch": 0.540707467714767, + "grad_norm": 0.8489850163459778, + "learning_rate": 9.805336828384395e-06, + "loss": 0.4695, + "step": 1926 + }, + { + "epoch": 0.5409882088714205, + "grad_norm": 0.797697126865387, + "learning_rate": 9.80488525320719e-06, + "loss": 0.4918, + "step": 1927 + }, + { + "epoch": 0.5412689500280741, + "grad_norm": 0.8735690712928772, + "learning_rate": 9.80443316528251e-06, + "loss": 0.4745, + "step": 1928 + }, + { + "epoch": 0.5415496911847277, + "grad_norm": 0.8410695195198059, + "learning_rate": 9.8039805646586e-06, + "loss": 0.4697, + "step": 1929 + }, + { + "epoch": 0.5418304323413813, + "grad_norm": 0.817071259021759, + "learning_rate": 9.803527451383757e-06, + "loss": 0.4678, + "step": 1930 + }, + { + "epoch": 0.5421111734980348, + "grad_norm": 0.7217406630516052, + "learning_rate": 9.803073825506336e-06, + "loss": 0.4205, + "step": 1931 + }, + { + "epoch": 0.5423919146546884, + "grad_norm": 0.8115424513816833, + "learning_rate": 9.802619687074743e-06, + "loss": 0.4728, + "step": 1932 + }, + { + "epoch": 0.5426726558113419, + "grad_norm": 0.8847445249557495, + "learning_rate": 9.802165036137446e-06, + "loss": 0.5106, + "step": 1933 + }, + { + "epoch": 0.5429533969679955, + "grad_norm": 0.892156183719635, + "learning_rate": 9.801709872742958e-06, + "loss": 0.4993, + "step": 1934 + }, + { + "epoch": 0.543234138124649, + "grad_norm": 0.784192681312561, + "learning_rate": 9.80125419693985e-06, + "loss": 0.4765, + "step": 1935 + }, + { + "epoch": 0.5435148792813026, + "grad_norm": 0.8306336402893066, + "learning_rate": 9.800798008776753e-06, + "loss": 0.5158, + "step": 1936 + }, + { + "epoch": 0.5437956204379562, + "grad_norm": 0.6939802169799805, + "learning_rate": 9.800341308302346e-06, + "loss": 0.4275, + "step": 1937 + }, + { + "epoch": 0.5440763615946098, + "grad_norm": 0.7448406219482422, + "learning_rate": 9.799884095565366e-06, + "loss": 0.441, + "step": 1938 + }, + { + "epoch": 0.5443571027512634, + "grad_norm": 0.7934439778327942, + "learning_rate": 9.799426370614605e-06, + "loss": 0.4582, + "step": 1939 + }, + { + "epoch": 0.5446378439079169, + "grad_norm": 0.764974057674408, + "learning_rate": 9.798968133498906e-06, + "loss": 0.4403, + "step": 1940 + }, + { + "epoch": 0.5449185850645705, + "grad_norm": 0.7812139987945557, + "learning_rate": 9.798509384267172e-06, + "loss": 0.4652, + "step": 1941 + }, + { + "epoch": 0.545199326221224, + "grad_norm": 0.7582302689552307, + "learning_rate": 9.798050122968354e-06, + "loss": 0.4388, + "step": 1942 + }, + { + "epoch": 0.5454800673778776, + "grad_norm": 1.008747935295105, + "learning_rate": 9.797590349651467e-06, + "loss": 0.4973, + "step": 1943 + }, + { + "epoch": 0.5457608085345311, + "grad_norm": 0.8279508948326111, + "learning_rate": 9.79713006436557e-06, + "loss": 0.4885, + "step": 1944 + }, + { + "epoch": 0.5460415496911847, + "grad_norm": 0.689279317855835, + "learning_rate": 9.796669267159784e-06, + "loss": 0.4725, + "step": 1945 + }, + { + "epoch": 0.5463222908478383, + "grad_norm": 0.6917114853858948, + "learning_rate": 9.796207958083283e-06, + "loss": 0.4561, + "step": 1946 + }, + { + "epoch": 0.5466030320044919, + "grad_norm": 0.8557814359664917, + "learning_rate": 9.795746137185296e-06, + "loss": 0.4853, + "step": 1947 + }, + { + "epoch": 0.5468837731611454, + "grad_norm": 0.8069921135902405, + "learning_rate": 9.795283804515101e-06, + "loss": 0.4525, + "step": 1948 + }, + { + "epoch": 0.547164514317799, + "grad_norm": 0.8353968262672424, + "learning_rate": 9.79482096012204e-06, + "loss": 0.4396, + "step": 1949 + }, + { + "epoch": 0.5474452554744526, + "grad_norm": 0.7278546690940857, + "learning_rate": 9.794357604055502e-06, + "loss": 0.4545, + "step": 1950 + }, + { + "epoch": 0.5477259966311061, + "grad_norm": 0.8496597409248352, + "learning_rate": 9.793893736364937e-06, + "loss": 0.4843, + "step": 1951 + }, + { + "epoch": 0.5480067377877597, + "grad_norm": 0.859420895576477, + "learning_rate": 9.793429357099842e-06, + "loss": 0.4501, + "step": 1952 + }, + { + "epoch": 0.5482874789444132, + "grad_norm": 0.7816826701164246, + "learning_rate": 9.792964466309773e-06, + "loss": 0.4634, + "step": 1953 + }, + { + "epoch": 0.5485682201010668, + "grad_norm": 0.8437268733978271, + "learning_rate": 9.792499064044343e-06, + "loss": 0.4397, + "step": 1954 + }, + { + "epoch": 0.5488489612577204, + "grad_norm": 0.7572882175445557, + "learning_rate": 9.792033150353216e-06, + "loss": 0.4853, + "step": 1955 + }, + { + "epoch": 0.549129702414374, + "grad_norm": 0.6619787216186523, + "learning_rate": 9.79156672528611e-06, + "loss": 0.445, + "step": 1956 + }, + { + "epoch": 0.5494104435710275, + "grad_norm": 0.8600680232048035, + "learning_rate": 9.791099788892801e-06, + "loss": 0.4568, + "step": 1957 + }, + { + "epoch": 0.5496911847276811, + "grad_norm": 0.6911255717277527, + "learning_rate": 9.790632341223116e-06, + "loss": 0.4255, + "step": 1958 + }, + { + "epoch": 0.5499719258843346, + "grad_norm": 0.6408498883247375, + "learning_rate": 9.790164382326938e-06, + "loss": 0.4571, + "step": 1959 + }, + { + "epoch": 0.5502526670409882, + "grad_norm": 0.6756793260574341, + "learning_rate": 9.789695912254206e-06, + "loss": 0.4194, + "step": 1960 + }, + { + "epoch": 0.5505334081976417, + "grad_norm": 0.6890113353729248, + "learning_rate": 9.789226931054911e-06, + "loss": 0.4173, + "step": 1961 + }, + { + "epoch": 0.5508141493542953, + "grad_norm": 0.6933071613311768, + "learning_rate": 9.7887574387791e-06, + "loss": 0.4548, + "step": 1962 + }, + { + "epoch": 0.551094890510949, + "grad_norm": 0.8239140510559082, + "learning_rate": 9.788287435476874e-06, + "loss": 0.4757, + "step": 1963 + }, + { + "epoch": 0.5513756316676025, + "grad_norm": 0.7228875160217285, + "learning_rate": 9.78781692119839e-06, + "loss": 0.4604, + "step": 1964 + }, + { + "epoch": 0.5516563728242561, + "grad_norm": 0.8217010498046875, + "learning_rate": 9.787345895993857e-06, + "loss": 0.4938, + "step": 1965 + }, + { + "epoch": 0.5519371139809096, + "grad_norm": 0.8807656168937683, + "learning_rate": 9.78687435991354e-06, + "loss": 0.4698, + "step": 1966 + }, + { + "epoch": 0.5522178551375632, + "grad_norm": 0.8089063167572021, + "learning_rate": 9.786402313007762e-06, + "loss": 0.4458, + "step": 1967 + }, + { + "epoch": 0.5524985962942167, + "grad_norm": 0.7698459625244141, + "learning_rate": 9.78592975532689e-06, + "loss": 0.4661, + "step": 1968 + }, + { + "epoch": 0.5527793374508703, + "grad_norm": 0.7129600048065186, + "learning_rate": 9.785456686921358e-06, + "loss": 0.4336, + "step": 1969 + }, + { + "epoch": 0.5530600786075238, + "grad_norm": 0.8214575052261353, + "learning_rate": 9.784983107841649e-06, + "loss": 0.4605, + "step": 1970 + }, + { + "epoch": 0.5533408197641775, + "grad_norm": 0.8851922750473022, + "learning_rate": 9.784509018138295e-06, + "loss": 0.4713, + "step": 1971 + }, + { + "epoch": 0.553621560920831, + "grad_norm": 0.9047035574913025, + "learning_rate": 9.784034417861893e-06, + "loss": 0.4655, + "step": 1972 + }, + { + "epoch": 0.5539023020774846, + "grad_norm": 0.8648396730422974, + "learning_rate": 9.78355930706309e-06, + "loss": 0.483, + "step": 1973 + }, + { + "epoch": 0.5541830432341381, + "grad_norm": 0.8606545925140381, + "learning_rate": 9.78308368579258e-06, + "loss": 0.4571, + "step": 1974 + }, + { + "epoch": 0.5544637843907917, + "grad_norm": 0.8742926716804504, + "learning_rate": 9.782607554101127e-06, + "loss": 0.4574, + "step": 1975 + }, + { + "epoch": 0.5547445255474452, + "grad_norm": 0.8614062070846558, + "learning_rate": 9.782130912039535e-06, + "loss": 0.4481, + "step": 1976 + }, + { + "epoch": 0.5550252667040988, + "grad_norm": 0.8186282515525818, + "learning_rate": 9.781653759658671e-06, + "loss": 0.4827, + "step": 1977 + }, + { + "epoch": 0.5553060078607523, + "grad_norm": 0.8074194192886353, + "learning_rate": 9.781176097009453e-06, + "loss": 0.4917, + "step": 1978 + }, + { + "epoch": 0.5555867490174059, + "grad_norm": 0.7881940603256226, + "learning_rate": 9.780697924142854e-06, + "loss": 0.4195, + "step": 1979 + }, + { + "epoch": 0.5558674901740596, + "grad_norm": 0.8685644865036011, + "learning_rate": 9.7802192411099e-06, + "loss": 0.4868, + "step": 1980 + }, + { + "epoch": 0.5561482313307131, + "grad_norm": 0.8442627191543579, + "learning_rate": 9.779740047961677e-06, + "loss": 0.453, + "step": 1981 + }, + { + "epoch": 0.5564289724873667, + "grad_norm": 0.8148524761199951, + "learning_rate": 9.77926034474932e-06, + "loss": 0.5291, + "step": 1982 + }, + { + "epoch": 0.5567097136440202, + "grad_norm": 0.8087888360023499, + "learning_rate": 9.778780131524017e-06, + "loss": 0.4746, + "step": 1983 + }, + { + "epoch": 0.5569904548006738, + "grad_norm": 0.8510912656784058, + "learning_rate": 9.778299408337018e-06, + "loss": 0.4666, + "step": 1984 + }, + { + "epoch": 0.5572711959573273, + "grad_norm": 0.9262344837188721, + "learning_rate": 9.777818175239618e-06, + "loss": 0.4475, + "step": 1985 + }, + { + "epoch": 0.5575519371139809, + "grad_norm": 0.7814247012138367, + "learning_rate": 9.777336432283175e-06, + "loss": 0.4555, + "step": 1986 + }, + { + "epoch": 0.5578326782706344, + "grad_norm": 0.8396809101104736, + "learning_rate": 9.776854179519096e-06, + "loss": 0.4374, + "step": 1987 + }, + { + "epoch": 0.5581134194272881, + "grad_norm": 0.7959112524986267, + "learning_rate": 9.776371416998844e-06, + "loss": 0.4514, + "step": 1988 + }, + { + "epoch": 0.5583941605839416, + "grad_norm": 0.760784387588501, + "learning_rate": 9.775888144773937e-06, + "loss": 0.4003, + "step": 1989 + }, + { + "epoch": 0.5586749017405952, + "grad_norm": 0.7153075933456421, + "learning_rate": 9.775404362895946e-06, + "loss": 0.4582, + "step": 1990 + }, + { + "epoch": 0.5589556428972488, + "grad_norm": 0.7210965752601624, + "learning_rate": 9.774920071416499e-06, + "loss": 0.4741, + "step": 1991 + }, + { + "epoch": 0.5592363840539023, + "grad_norm": 0.7483850121498108, + "learning_rate": 9.774435270387274e-06, + "loss": 0.4659, + "step": 1992 + }, + { + "epoch": 0.5595171252105559, + "grad_norm": 0.6827176809310913, + "learning_rate": 9.773949959860008e-06, + "loss": 0.4772, + "step": 1993 + }, + { + "epoch": 0.5597978663672094, + "grad_norm": 0.851970911026001, + "learning_rate": 9.773464139886489e-06, + "loss": 0.5254, + "step": 1994 + }, + { + "epoch": 0.560078607523863, + "grad_norm": 0.7237651944160461, + "learning_rate": 9.77297781051856e-06, + "loss": 0.517, + "step": 1995 + }, + { + "epoch": 0.5603593486805165, + "grad_norm": 0.7680930495262146, + "learning_rate": 9.772490971808122e-06, + "loss": 0.4433, + "step": 1996 + }, + { + "epoch": 0.5606400898371702, + "grad_norm": 0.817929744720459, + "learning_rate": 9.772003623807125e-06, + "loss": 0.4211, + "step": 1997 + }, + { + "epoch": 0.5609208309938237, + "grad_norm": 0.7939580082893372, + "learning_rate": 9.771515766567576e-06, + "loss": 0.4668, + "step": 1998 + }, + { + "epoch": 0.5612015721504773, + "grad_norm": 0.7887587547302246, + "learning_rate": 9.771027400141538e-06, + "loss": 0.4788, + "step": 1999 + }, + { + "epoch": 0.5614823133071308, + "grad_norm": 0.809152364730835, + "learning_rate": 9.770538524581124e-06, + "loss": 0.4576, + "step": 2000 + }, + { + "epoch": 0.5617630544637844, + "grad_norm": 0.8647423982620239, + "learning_rate": 9.770049139938505e-06, + "loss": 0.4745, + "step": 2001 + }, + { + "epoch": 0.5620437956204379, + "grad_norm": 0.7462626695632935, + "learning_rate": 9.769559246265903e-06, + "loss": 0.4547, + "step": 2002 + }, + { + "epoch": 0.5623245367770915, + "grad_norm": 0.7057831287384033, + "learning_rate": 9.7690688436156e-06, + "loss": 0.4267, + "step": 2003 + }, + { + "epoch": 0.562605277933745, + "grad_norm": 0.7603277564048767, + "learning_rate": 9.768577932039927e-06, + "loss": 0.4768, + "step": 2004 + }, + { + "epoch": 0.5628860190903987, + "grad_norm": 0.8108817338943481, + "learning_rate": 9.76808651159127e-06, + "loss": 0.4681, + "step": 2005 + }, + { + "epoch": 0.5631667602470523, + "grad_norm": 0.7390596866607666, + "learning_rate": 9.767594582322071e-06, + "loss": 0.4408, + "step": 2006 + }, + { + "epoch": 0.5634475014037058, + "grad_norm": 0.7935249209403992, + "learning_rate": 9.767102144284826e-06, + "loss": 0.4857, + "step": 2007 + }, + { + "epoch": 0.5637282425603594, + "grad_norm": 0.7647663950920105, + "learning_rate": 9.766609197532087e-06, + "loss": 0.4658, + "step": 2008 + }, + { + "epoch": 0.5640089837170129, + "grad_norm": 0.780458927154541, + "learning_rate": 9.766115742116454e-06, + "loss": 0.4893, + "step": 2009 + }, + { + "epoch": 0.5642897248736665, + "grad_norm": 0.842208981513977, + "learning_rate": 9.765621778090587e-06, + "loss": 0.5028, + "step": 2010 + }, + { + "epoch": 0.56457046603032, + "grad_norm": 0.7980358600616455, + "learning_rate": 9.765127305507201e-06, + "loss": 0.4295, + "step": 2011 + }, + { + "epoch": 0.5648512071869736, + "grad_norm": 0.7977677583694458, + "learning_rate": 9.76463232441906e-06, + "loss": 0.4384, + "step": 2012 + }, + { + "epoch": 0.5651319483436271, + "grad_norm": 0.7563501596450806, + "learning_rate": 9.764136834878987e-06, + "loss": 0.4187, + "step": 2013 + }, + { + "epoch": 0.5654126895002808, + "grad_norm": 0.7552735805511475, + "learning_rate": 9.763640836939857e-06, + "loss": 0.4276, + "step": 2014 + }, + { + "epoch": 0.5656934306569343, + "grad_norm": 0.7085328698158264, + "learning_rate": 9.7631443306546e-06, + "loss": 0.48, + "step": 2015 + }, + { + "epoch": 0.5659741718135879, + "grad_norm": 0.7801918387413025, + "learning_rate": 9.762647316076201e-06, + "loss": 0.432, + "step": 2016 + }, + { + "epoch": 0.5662549129702414, + "grad_norm": 0.7970189452171326, + "learning_rate": 9.762149793257695e-06, + "loss": 0.4282, + "step": 2017 + }, + { + "epoch": 0.566535654126895, + "grad_norm": 0.7315121293067932, + "learning_rate": 9.76165176225218e-06, + "loss": 0.4477, + "step": 2018 + }, + { + "epoch": 0.5668163952835485, + "grad_norm": 0.7222184538841248, + "learning_rate": 9.761153223112799e-06, + "loss": 0.4448, + "step": 2019 + }, + { + "epoch": 0.5670971364402021, + "grad_norm": 0.8473417162895203, + "learning_rate": 9.760654175892751e-06, + "loss": 0.4348, + "step": 2020 + }, + { + "epoch": 0.5673778775968557, + "grad_norm": 0.8478591442108154, + "learning_rate": 9.760154620645297e-06, + "loss": 0.4659, + "step": 2021 + }, + { + "epoch": 0.5676586187535093, + "grad_norm": 0.8104862570762634, + "learning_rate": 9.759654557423743e-06, + "loss": 0.454, + "step": 2022 + }, + { + "epoch": 0.5679393599101629, + "grad_norm": 0.6937376260757446, + "learning_rate": 9.759153986281452e-06, + "loss": 0.4603, + "step": 2023 + }, + { + "epoch": 0.5682201010668164, + "grad_norm": 0.8246472477912903, + "learning_rate": 9.758652907271842e-06, + "loss": 0.4474, + "step": 2024 + }, + { + "epoch": 0.56850084222347, + "grad_norm": 0.899146318435669, + "learning_rate": 9.758151320448388e-06, + "loss": 0.4948, + "step": 2025 + }, + { + "epoch": 0.5687815833801235, + "grad_norm": 0.8296086192131042, + "learning_rate": 9.757649225864612e-06, + "loss": 0.4731, + "step": 2026 + }, + { + "epoch": 0.5690623245367771, + "grad_norm": 0.7202047109603882, + "learning_rate": 9.757146623574098e-06, + "loss": 0.436, + "step": 2027 + }, + { + "epoch": 0.5693430656934306, + "grad_norm": 0.7963749766349792, + "learning_rate": 9.75664351363048e-06, + "loss": 0.4491, + "step": 2028 + }, + { + "epoch": 0.5696238068500842, + "grad_norm": 0.709155261516571, + "learning_rate": 9.756139896087444e-06, + "loss": 0.4421, + "step": 2029 + }, + { + "epoch": 0.5699045480067377, + "grad_norm": 0.8129062056541443, + "learning_rate": 9.755635770998734e-06, + "loss": 0.449, + "step": 2030 + }, + { + "epoch": 0.5701852891633914, + "grad_norm": 0.7801936268806458, + "learning_rate": 9.755131138418149e-06, + "loss": 0.4348, + "step": 2031 + }, + { + "epoch": 0.570466030320045, + "grad_norm": 0.8874030709266663, + "learning_rate": 9.754625998399539e-06, + "loss": 0.5274, + "step": 2032 + }, + { + "epoch": 0.5707467714766985, + "grad_norm": 0.7172839641571045, + "learning_rate": 9.75412035099681e-06, + "loss": 0.4733, + "step": 2033 + }, + { + "epoch": 0.571027512633352, + "grad_norm": 0.7903905510902405, + "learning_rate": 9.753614196263921e-06, + "loss": 0.4621, + "step": 2034 + }, + { + "epoch": 0.5713082537900056, + "grad_norm": 0.8660666346549988, + "learning_rate": 9.753107534254885e-06, + "loss": 0.4448, + "step": 2035 + }, + { + "epoch": 0.5715889949466592, + "grad_norm": 0.6763301491737366, + "learning_rate": 9.75260036502377e-06, + "loss": 0.4458, + "step": 2036 + }, + { + "epoch": 0.5718697361033127, + "grad_norm": 0.7507290840148926, + "learning_rate": 9.752092688624702e-06, + "loss": 0.4788, + "step": 2037 + }, + { + "epoch": 0.5721504772599663, + "grad_norm": 0.7282779216766357, + "learning_rate": 9.75158450511185e-06, + "loss": 0.4729, + "step": 2038 + }, + { + "epoch": 0.5724312184166199, + "grad_norm": 0.8436610698699951, + "learning_rate": 9.751075814539448e-06, + "loss": 0.4363, + "step": 2039 + }, + { + "epoch": 0.5727119595732735, + "grad_norm": 0.850831151008606, + "learning_rate": 9.750566616961782e-06, + "loss": 0.4677, + "step": 2040 + }, + { + "epoch": 0.572992700729927, + "grad_norm": 0.7979633808135986, + "learning_rate": 9.750056912433187e-06, + "loss": 0.5169, + "step": 2041 + }, + { + "epoch": 0.5732734418865806, + "grad_norm": 0.7856271266937256, + "learning_rate": 9.749546701008056e-06, + "loss": 0.4459, + "step": 2042 + }, + { + "epoch": 0.5735541830432341, + "grad_norm": 0.8156360983848572, + "learning_rate": 9.74903598274084e-06, + "loss": 0.4496, + "step": 2043 + }, + { + "epoch": 0.5738349241998877, + "grad_norm": 0.8404834866523743, + "learning_rate": 9.748524757686034e-06, + "loss": 0.5005, + "step": 2044 + }, + { + "epoch": 0.5741156653565412, + "grad_norm": 0.7475012540817261, + "learning_rate": 9.748013025898196e-06, + "loss": 0.4495, + "step": 2045 + }, + { + "epoch": 0.5743964065131948, + "grad_norm": 0.822589635848999, + "learning_rate": 9.747500787431932e-06, + "loss": 0.4905, + "step": 2046 + }, + { + "epoch": 0.5746771476698485, + "grad_norm": 0.7559536099433899, + "learning_rate": 9.746988042341907e-06, + "loss": 0.439, + "step": 2047 + }, + { + "epoch": 0.574957888826502, + "grad_norm": 0.7868804335594177, + "learning_rate": 9.746474790682838e-06, + "loss": 0.4429, + "step": 2048 + }, + { + "epoch": 0.5752386299831556, + "grad_norm": 0.8868182301521301, + "learning_rate": 9.745961032509497e-06, + "loss": 0.4609, + "step": 2049 + }, + { + "epoch": 0.5755193711398091, + "grad_norm": 0.8592252135276794, + "learning_rate": 9.745446767876708e-06, + "loss": 0.5002, + "step": 2050 + }, + { + "epoch": 0.5758001122964627, + "grad_norm": 0.8985242247581482, + "learning_rate": 9.744931996839347e-06, + "loss": 0.4661, + "step": 2051 + }, + { + "epoch": 0.5760808534531162, + "grad_norm": 0.8594610691070557, + "learning_rate": 9.744416719452352e-06, + "loss": 0.4634, + "step": 2052 + }, + { + "epoch": 0.5763615946097698, + "grad_norm": 0.827816903591156, + "learning_rate": 9.743900935770709e-06, + "loss": 0.478, + "step": 2053 + }, + { + "epoch": 0.5766423357664233, + "grad_norm": 0.6995730400085449, + "learning_rate": 9.743384645849456e-06, + "loss": 0.4301, + "step": 2054 + }, + { + "epoch": 0.5769230769230769, + "grad_norm": 0.6672750115394592, + "learning_rate": 9.742867849743694e-06, + "loss": 0.485, + "step": 2055 + }, + { + "epoch": 0.5772038180797305, + "grad_norm": 0.8091338276863098, + "learning_rate": 9.742350547508568e-06, + "loss": 0.4236, + "step": 2056 + }, + { + "epoch": 0.5774845592363841, + "grad_norm": 0.889529824256897, + "learning_rate": 9.741832739199281e-06, + "loss": 0.4448, + "step": 2057 + }, + { + "epoch": 0.5777653003930376, + "grad_norm": 0.8087449669837952, + "learning_rate": 9.741314424871092e-06, + "loss": 0.4384, + "step": 2058 + }, + { + "epoch": 0.5780460415496912, + "grad_norm": 0.7608034014701843, + "learning_rate": 9.740795604579312e-06, + "loss": 0.4592, + "step": 2059 + }, + { + "epoch": 0.5783267827063447, + "grad_norm": 0.9347120523452759, + "learning_rate": 9.740276278379306e-06, + "loss": 0.4603, + "step": 2060 + }, + { + "epoch": 0.5786075238629983, + "grad_norm": 0.8066372871398926, + "learning_rate": 9.739756446326494e-06, + "loss": 0.4778, + "step": 2061 + }, + { + "epoch": 0.5788882650196518, + "grad_norm": 0.8516121506690979, + "learning_rate": 9.739236108476348e-06, + "loss": 0.4046, + "step": 2062 + }, + { + "epoch": 0.5791690061763054, + "grad_norm": 0.8246614336967468, + "learning_rate": 9.738715264884397e-06, + "loss": 0.4899, + "step": 2063 + }, + { + "epoch": 0.5794497473329591, + "grad_norm": 0.7649110555648804, + "learning_rate": 9.73819391560622e-06, + "loss": 0.4569, + "step": 2064 + }, + { + "epoch": 0.5797304884896126, + "grad_norm": 0.7903653979301453, + "learning_rate": 9.737672060697454e-06, + "loss": 0.4789, + "step": 2065 + }, + { + "epoch": 0.5800112296462662, + "grad_norm": 0.9227196574211121, + "learning_rate": 9.737149700213787e-06, + "loss": 0.4672, + "step": 2066 + }, + { + "epoch": 0.5802919708029197, + "grad_norm": 0.7712948322296143, + "learning_rate": 9.736626834210963e-06, + "loss": 0.4367, + "step": 2067 + }, + { + "epoch": 0.5805727119595733, + "grad_norm": 0.8210005164146423, + "learning_rate": 9.736103462744776e-06, + "loss": 0.4837, + "step": 2068 + }, + { + "epoch": 0.5808534531162268, + "grad_norm": 0.8951388001441956, + "learning_rate": 9.735579585871081e-06, + "loss": 0.5036, + "step": 2069 + }, + { + "epoch": 0.5811341942728804, + "grad_norm": 0.9867268800735474, + "learning_rate": 9.735055203645782e-06, + "loss": 0.4101, + "step": 2070 + }, + { + "epoch": 0.5814149354295339, + "grad_norm": 0.7119899392127991, + "learning_rate": 9.734530316124836e-06, + "loss": 0.416, + "step": 2071 + }, + { + "epoch": 0.5816956765861875, + "grad_norm": 0.8316327929496765, + "learning_rate": 9.734004923364258e-06, + "loss": 0.4133, + "step": 2072 + }, + { + "epoch": 0.5819764177428411, + "grad_norm": 1.04079008102417, + "learning_rate": 9.733479025420111e-06, + "loss": 0.4902, + "step": 2073 + }, + { + "epoch": 0.5822571588994947, + "grad_norm": 0.7793975472450256, + "learning_rate": 9.732952622348519e-06, + "loss": 0.4251, + "step": 2074 + }, + { + "epoch": 0.5825379000561483, + "grad_norm": 0.6976045966148376, + "learning_rate": 9.732425714205657e-06, + "loss": 0.4479, + "step": 2075 + }, + { + "epoch": 0.5828186412128018, + "grad_norm": 0.873542845249176, + "learning_rate": 9.731898301047751e-06, + "loss": 0.4219, + "step": 2076 + }, + { + "epoch": 0.5830993823694554, + "grad_norm": 0.8051063418388367, + "learning_rate": 9.731370382931082e-06, + "loss": 0.4599, + "step": 2077 + }, + { + "epoch": 0.5833801235261089, + "grad_norm": 0.9318996071815491, + "learning_rate": 9.73084195991199e-06, + "loss": 0.448, + "step": 2078 + }, + { + "epoch": 0.5836608646827625, + "grad_norm": 0.7169986367225647, + "learning_rate": 9.730313032046863e-06, + "loss": 0.4181, + "step": 2079 + }, + { + "epoch": 0.583941605839416, + "grad_norm": 0.7468517422676086, + "learning_rate": 9.729783599392147e-06, + "loss": 0.4413, + "step": 2080 + }, + { + "epoch": 0.5842223469960697, + "grad_norm": 0.810914933681488, + "learning_rate": 9.729253662004334e-06, + "loss": 0.4867, + "step": 2081 + }, + { + "epoch": 0.5845030881527232, + "grad_norm": 0.8537750244140625, + "learning_rate": 9.728723219939982e-06, + "loss": 0.4792, + "step": 2082 + }, + { + "epoch": 0.5847838293093768, + "grad_norm": 0.8743634223937988, + "learning_rate": 9.728192273255693e-06, + "loss": 0.521, + "step": 2083 + }, + { + "epoch": 0.5850645704660303, + "grad_norm": 0.7106368541717529, + "learning_rate": 9.727660822008129e-06, + "loss": 0.4415, + "step": 2084 + }, + { + "epoch": 0.5853453116226839, + "grad_norm": 0.8619228601455688, + "learning_rate": 9.727128866253999e-06, + "loss": 0.5018, + "step": 2085 + }, + { + "epoch": 0.5856260527793374, + "grad_norm": 0.8768312931060791, + "learning_rate": 9.726596406050073e-06, + "loss": 0.5083, + "step": 2086 + }, + { + "epoch": 0.585906793935991, + "grad_norm": 0.6641687154769897, + "learning_rate": 9.726063441453173e-06, + "loss": 0.4249, + "step": 2087 + }, + { + "epoch": 0.5861875350926445, + "grad_norm": 0.7184276580810547, + "learning_rate": 9.725529972520172e-06, + "loss": 0.4645, + "step": 2088 + }, + { + "epoch": 0.5864682762492981, + "grad_norm": 0.7163186073303223, + "learning_rate": 9.724995999307996e-06, + "loss": 0.4566, + "step": 2089 + }, + { + "epoch": 0.5867490174059518, + "grad_norm": 0.8219218254089355, + "learning_rate": 9.724461521873631e-06, + "loss": 0.4296, + "step": 2090 + }, + { + "epoch": 0.5870297585626053, + "grad_norm": 0.8303726315498352, + "learning_rate": 9.723926540274112e-06, + "loss": 0.4496, + "step": 2091 + }, + { + "epoch": 0.5873104997192589, + "grad_norm": 0.8900569081306458, + "learning_rate": 9.72339105456653e-06, + "loss": 0.5211, + "step": 2092 + }, + { + "epoch": 0.5875912408759124, + "grad_norm": 0.6788725256919861, + "learning_rate": 9.722855064808026e-06, + "loss": 0.4641, + "step": 2093 + }, + { + "epoch": 0.587871982032566, + "grad_norm": 0.7321316599845886, + "learning_rate": 9.722318571055799e-06, + "loss": 0.4799, + "step": 2094 + }, + { + "epoch": 0.5881527231892195, + "grad_norm": 0.7232583165168762, + "learning_rate": 9.721781573367099e-06, + "loss": 0.4372, + "step": 2095 + }, + { + "epoch": 0.5884334643458731, + "grad_norm": 0.6985263228416443, + "learning_rate": 9.721244071799235e-06, + "loss": 0.4282, + "step": 2096 + }, + { + "epoch": 0.5887142055025266, + "grad_norm": 0.8235869407653809, + "learning_rate": 9.720706066409561e-06, + "loss": 0.4714, + "step": 2097 + }, + { + "epoch": 0.5889949466591803, + "grad_norm": 0.7926880121231079, + "learning_rate": 9.720167557255494e-06, + "loss": 0.4511, + "step": 2098 + }, + { + "epoch": 0.5892756878158338, + "grad_norm": 0.7409059405326843, + "learning_rate": 9.719628544394497e-06, + "loss": 0.41, + "step": 2099 + }, + { + "epoch": 0.5895564289724874, + "grad_norm": 0.8219630122184753, + "learning_rate": 9.71908902788409e-06, + "loss": 0.4475, + "step": 2100 + }, + { + "epoch": 0.5898371701291409, + "grad_norm": 0.853156328201294, + "learning_rate": 9.71854900778185e-06, + "loss": 0.4624, + "step": 2101 + }, + { + "epoch": 0.5901179112857945, + "grad_norm": 0.7232211232185364, + "learning_rate": 9.7180084841454e-06, + "loss": 0.4223, + "step": 2102 + }, + { + "epoch": 0.590398652442448, + "grad_norm": 0.824863612651825, + "learning_rate": 9.717467457032425e-06, + "loss": 0.4398, + "step": 2103 + }, + { + "epoch": 0.5906793935991016, + "grad_norm": 0.7342715263366699, + "learning_rate": 9.71692592650066e-06, + "loss": 0.4608, + "step": 2104 + }, + { + "epoch": 0.5909601347557552, + "grad_norm": 0.7489803433418274, + "learning_rate": 9.716383892607893e-06, + "loss": 0.4639, + "step": 2105 + }, + { + "epoch": 0.5912408759124088, + "grad_norm": 0.7693859934806824, + "learning_rate": 9.715841355411965e-06, + "loss": 0.4514, + "step": 2106 + }, + { + "epoch": 0.5915216170690624, + "grad_norm": 0.7569899559020996, + "learning_rate": 9.715298314970775e-06, + "loss": 0.4303, + "step": 2107 + }, + { + "epoch": 0.5918023582257159, + "grad_norm": 0.8443773984909058, + "learning_rate": 9.71475477134227e-06, + "loss": 0.4189, + "step": 2108 + }, + { + "epoch": 0.5920830993823695, + "grad_norm": 0.8607834577560425, + "learning_rate": 9.714210724584455e-06, + "loss": 0.445, + "step": 2109 + }, + { + "epoch": 0.592363840539023, + "grad_norm": 0.8464186191558838, + "learning_rate": 9.713666174755388e-06, + "loss": 0.4633, + "step": 2110 + }, + { + "epoch": 0.5926445816956766, + "grad_norm": 0.8905773758888245, + "learning_rate": 9.713121121913179e-06, + "loss": 0.4576, + "step": 2111 + }, + { + "epoch": 0.5929253228523301, + "grad_norm": 0.9899441599845886, + "learning_rate": 9.712575566115992e-06, + "loss": 0.4186, + "step": 2112 + }, + { + "epoch": 0.5932060640089837, + "grad_norm": 0.7846997976303101, + "learning_rate": 9.712029507422045e-06, + "loss": 0.4573, + "step": 2113 + }, + { + "epoch": 0.5934868051656372, + "grad_norm": 0.7968783378601074, + "learning_rate": 9.711482945889613e-06, + "loss": 0.4326, + "step": 2114 + }, + { + "epoch": 0.5937675463222909, + "grad_norm": 0.9178119897842407, + "learning_rate": 9.710935881577019e-06, + "loss": 0.5156, + "step": 2115 + }, + { + "epoch": 0.5940482874789444, + "grad_norm": 0.8109418749809265, + "learning_rate": 9.710388314542644e-06, + "loss": 0.4439, + "step": 2116 + }, + { + "epoch": 0.594329028635598, + "grad_norm": 0.8619762659072876, + "learning_rate": 9.70984024484492e-06, + "loss": 0.4981, + "step": 2117 + }, + { + "epoch": 0.5946097697922516, + "grad_norm": 0.8317869901657104, + "learning_rate": 9.709291672542333e-06, + "loss": 0.4395, + "step": 2118 + }, + { + "epoch": 0.5948905109489051, + "grad_norm": 0.9120360612869263, + "learning_rate": 9.708742597693425e-06, + "loss": 0.5175, + "step": 2119 + }, + { + "epoch": 0.5951712521055587, + "grad_norm": 0.7006910443305969, + "learning_rate": 9.708193020356787e-06, + "loss": 0.4377, + "step": 2120 + }, + { + "epoch": 0.5954519932622122, + "grad_norm": 0.8110869526863098, + "learning_rate": 9.707642940591068e-06, + "loss": 0.4409, + "step": 2121 + }, + { + "epoch": 0.5957327344188658, + "grad_norm": 0.7683268785476685, + "learning_rate": 9.707092358454972e-06, + "loss": 0.4287, + "step": 2122 + }, + { + "epoch": 0.5960134755755194, + "grad_norm": 0.8027356863021851, + "learning_rate": 9.706541274007249e-06, + "loss": 0.5123, + "step": 2123 + }, + { + "epoch": 0.596294216732173, + "grad_norm": 0.6715095043182373, + "learning_rate": 9.70598968730671e-06, + "loss": 0.3924, + "step": 2124 + }, + { + "epoch": 0.5965749578888265, + "grad_norm": 0.7607294321060181, + "learning_rate": 9.705437598412216e-06, + "loss": 0.4335, + "step": 2125 + }, + { + "epoch": 0.5968556990454801, + "grad_norm": 0.7370964288711548, + "learning_rate": 9.704885007382681e-06, + "loss": 0.4457, + "step": 2126 + }, + { + "epoch": 0.5971364402021336, + "grad_norm": 0.7293221950531006, + "learning_rate": 9.704331914277078e-06, + "loss": 0.4502, + "step": 2127 + }, + { + "epoch": 0.5974171813587872, + "grad_norm": 0.7145909070968628, + "learning_rate": 9.703778319154427e-06, + "loss": 0.4449, + "step": 2128 + }, + { + "epoch": 0.5976979225154407, + "grad_norm": 0.8036126494407654, + "learning_rate": 9.703224222073803e-06, + "loss": 0.5145, + "step": 2129 + }, + { + "epoch": 0.5979786636720943, + "grad_norm": 0.8295819759368896, + "learning_rate": 9.702669623094339e-06, + "loss": 0.4813, + "step": 2130 + }, + { + "epoch": 0.5982594048287478, + "grad_norm": 0.9092859625816345, + "learning_rate": 9.702114522275216e-06, + "loss": 0.4847, + "step": 2131 + }, + { + "epoch": 0.5985401459854015, + "grad_norm": 0.7231720685958862, + "learning_rate": 9.701558919675672e-06, + "loss": 0.4356, + "step": 2132 + }, + { + "epoch": 0.5988208871420551, + "grad_norm": 0.719670295715332, + "learning_rate": 9.701002815354999e-06, + "loss": 0.5005, + "step": 2133 + }, + { + "epoch": 0.5991016282987086, + "grad_norm": 0.750986635684967, + "learning_rate": 9.700446209372537e-06, + "loss": 0.4747, + "step": 2134 + }, + { + "epoch": 0.5993823694553622, + "grad_norm": 1.015622854232788, + "learning_rate": 9.699889101787687e-06, + "loss": 0.5376, + "step": 2135 + }, + { + "epoch": 0.5996631106120157, + "grad_norm": 0.7454087734222412, + "learning_rate": 9.699331492659897e-06, + "loss": 0.4398, + "step": 2136 + }, + { + "epoch": 0.5999438517686693, + "grad_norm": 0.7963955998420715, + "learning_rate": 9.698773382048673e-06, + "loss": 0.4509, + "step": 2137 + }, + { + "epoch": 0.6002245929253228, + "grad_norm": 0.7308932542800903, + "learning_rate": 9.698214770013576e-06, + "loss": 0.4476, + "step": 2138 + }, + { + "epoch": 0.6005053340819764, + "grad_norm": 0.7614893913269043, + "learning_rate": 9.697655656614214e-06, + "loss": 0.414, + "step": 2139 + }, + { + "epoch": 0.60078607523863, + "grad_norm": 0.6617187261581421, + "learning_rate": 9.697096041910251e-06, + "loss": 0.4323, + "step": 2140 + }, + { + "epoch": 0.6010668163952836, + "grad_norm": 0.7306409478187561, + "learning_rate": 9.69653592596141e-06, + "loss": 0.4525, + "step": 2141 + }, + { + "epoch": 0.6013475575519371, + "grad_norm": 0.7639381289482117, + "learning_rate": 9.69597530882746e-06, + "loss": 0.4925, + "step": 2142 + }, + { + "epoch": 0.6016282987085907, + "grad_norm": 0.7443253993988037, + "learning_rate": 9.695414190568229e-06, + "loss": 0.4536, + "step": 2143 + }, + { + "epoch": 0.6019090398652442, + "grad_norm": 0.7563796639442444, + "learning_rate": 9.694852571243593e-06, + "loss": 0.4673, + "step": 2144 + }, + { + "epoch": 0.6021897810218978, + "grad_norm": 0.8298394083976746, + "learning_rate": 9.694290450913486e-06, + "loss": 0.4802, + "step": 2145 + }, + { + "epoch": 0.6024705221785513, + "grad_norm": 0.8806003928184509, + "learning_rate": 9.693727829637895e-06, + "loss": 0.5414, + "step": 2146 + }, + { + "epoch": 0.6027512633352049, + "grad_norm": 0.635846734046936, + "learning_rate": 9.693164707476856e-06, + "loss": 0.4635, + "step": 2147 + }, + { + "epoch": 0.6030320044918585, + "grad_norm": 0.6990854740142822, + "learning_rate": 9.692601084490468e-06, + "loss": 0.4309, + "step": 2148 + }, + { + "epoch": 0.6033127456485121, + "grad_norm": 0.9040673971176147, + "learning_rate": 9.69203696073887e-06, + "loss": 0.4853, + "step": 2149 + }, + { + "epoch": 0.6035934868051657, + "grad_norm": 0.6646368503570557, + "learning_rate": 9.691472336282267e-06, + "loss": 0.4132, + "step": 2150 + }, + { + "epoch": 0.6038742279618192, + "grad_norm": 0.8943970799446106, + "learning_rate": 9.690907211180909e-06, + "loss": 0.4666, + "step": 2151 + }, + { + "epoch": 0.6041549691184728, + "grad_norm": 0.9120403528213501, + "learning_rate": 9.690341585495107e-06, + "loss": 0.4754, + "step": 2152 + }, + { + "epoch": 0.6044357102751263, + "grad_norm": 0.845994234085083, + "learning_rate": 9.689775459285216e-06, + "loss": 0.4911, + "step": 2153 + }, + { + "epoch": 0.6047164514317799, + "grad_norm": 0.7136091589927673, + "learning_rate": 9.689208832611653e-06, + "loss": 0.4524, + "step": 2154 + }, + { + "epoch": 0.6049971925884334, + "grad_norm": 0.8505575060844421, + "learning_rate": 9.688641705534883e-06, + "loss": 0.4301, + "step": 2155 + }, + { + "epoch": 0.605277933745087, + "grad_norm": 0.8498470783233643, + "learning_rate": 9.688074078115428e-06, + "loss": 0.4616, + "step": 2156 + }, + { + "epoch": 0.6055586749017406, + "grad_norm": 0.9020117521286011, + "learning_rate": 9.687505950413861e-06, + "loss": 0.4896, + "step": 2157 + }, + { + "epoch": 0.6058394160583942, + "grad_norm": 0.8544296622276306, + "learning_rate": 9.686937322490806e-06, + "loss": 0.4291, + "step": 2158 + }, + { + "epoch": 0.6061201572150478, + "grad_norm": 0.7850180864334106, + "learning_rate": 9.686368194406948e-06, + "loss": 0.3939, + "step": 2159 + }, + { + "epoch": 0.6064008983717013, + "grad_norm": 1.0482336282730103, + "learning_rate": 9.685798566223018e-06, + "loss": 0.4647, + "step": 2160 + }, + { + "epoch": 0.6066816395283549, + "grad_norm": 0.8706359267234802, + "learning_rate": 9.685228437999805e-06, + "loss": 0.4345, + "step": 2161 + }, + { + "epoch": 0.6069623806850084, + "grad_norm": 0.7320350408554077, + "learning_rate": 9.684657809798148e-06, + "loss": 0.4501, + "step": 2162 + }, + { + "epoch": 0.607243121841662, + "grad_norm": 0.9550179243087769, + "learning_rate": 9.68408668167894e-06, + "loss": 0.5091, + "step": 2163 + }, + { + "epoch": 0.6075238629983155, + "grad_norm": 1.0302115678787231, + "learning_rate": 9.683515053703133e-06, + "loss": 0.4659, + "step": 2164 + }, + { + "epoch": 0.6078046041549691, + "grad_norm": 0.7839834690093994, + "learning_rate": 9.682942925931722e-06, + "loss": 0.5299, + "step": 2165 + }, + { + "epoch": 0.6080853453116227, + "grad_norm": 0.8000436425209045, + "learning_rate": 9.682370298425766e-06, + "loss": 0.4884, + "step": 2166 + }, + { + "epoch": 0.6083660864682763, + "grad_norm": 0.8380921483039856, + "learning_rate": 9.681797171246365e-06, + "loss": 0.4929, + "step": 2167 + }, + { + "epoch": 0.6086468276249298, + "grad_norm": 0.8474700450897217, + "learning_rate": 9.681223544454687e-06, + "loss": 0.4347, + "step": 2168 + }, + { + "epoch": 0.6089275687815834, + "grad_norm": 0.8120713829994202, + "learning_rate": 9.680649418111942e-06, + "loss": 0.4552, + "step": 2169 + }, + { + "epoch": 0.6092083099382369, + "grad_norm": 0.7763919234275818, + "learning_rate": 9.680074792279399e-06, + "loss": 0.4434, + "step": 2170 + }, + { + "epoch": 0.6094890510948905, + "grad_norm": 0.8411002159118652, + "learning_rate": 9.679499667018376e-06, + "loss": 0.4451, + "step": 2171 + }, + { + "epoch": 0.609769792251544, + "grad_norm": 0.8674899339675903, + "learning_rate": 9.678924042390252e-06, + "loss": 0.4619, + "step": 2172 + }, + { + "epoch": 0.6100505334081976, + "grad_norm": 0.7417405247688293, + "learning_rate": 9.678347918456448e-06, + "loss": 0.4515, + "step": 2173 + }, + { + "epoch": 0.6103312745648513, + "grad_norm": 0.7583059668540955, + "learning_rate": 9.677771295278446e-06, + "loss": 0.4528, + "step": 2174 + }, + { + "epoch": 0.6106120157215048, + "grad_norm": 0.8104037046432495, + "learning_rate": 9.677194172917781e-06, + "loss": 0.5128, + "step": 2175 + }, + { + "epoch": 0.6108927568781584, + "grad_norm": 0.792428731918335, + "learning_rate": 9.676616551436042e-06, + "loss": 0.4191, + "step": 2176 + }, + { + "epoch": 0.6111734980348119, + "grad_norm": 0.72653728723526, + "learning_rate": 9.676038430894863e-06, + "loss": 0.4401, + "step": 2177 + }, + { + "epoch": 0.6114542391914655, + "grad_norm": 0.6194968819618225, + "learning_rate": 9.675459811355944e-06, + "loss": 0.4254, + "step": 2178 + }, + { + "epoch": 0.611734980348119, + "grad_norm": 0.7762696743011475, + "learning_rate": 9.674880692881026e-06, + "loss": 0.4652, + "step": 2179 + }, + { + "epoch": 0.6120157215047726, + "grad_norm": 0.6617141366004944, + "learning_rate": 9.674301075531913e-06, + "loss": 0.4376, + "step": 2180 + }, + { + "epoch": 0.6122964626614261, + "grad_norm": 0.6666786670684814, + "learning_rate": 9.673720959370458e-06, + "loss": 0.4386, + "step": 2181 + }, + { + "epoch": 0.6125772038180798, + "grad_norm": 0.8252989649772644, + "learning_rate": 9.673140344458565e-06, + "loss": 0.4881, + "step": 2182 + }, + { + "epoch": 0.6128579449747333, + "grad_norm": 0.7981967329978943, + "learning_rate": 9.672559230858194e-06, + "loss": 0.4417, + "step": 2183 + }, + { + "epoch": 0.6131386861313869, + "grad_norm": 0.6259914040565491, + "learning_rate": 9.671977618631359e-06, + "loss": 0.4676, + "step": 2184 + }, + { + "epoch": 0.6134194272880404, + "grad_norm": 0.7408134937286377, + "learning_rate": 9.671395507840126e-06, + "loss": 0.4635, + "step": 2185 + }, + { + "epoch": 0.613700168444694, + "grad_norm": 0.818143367767334, + "learning_rate": 9.670812898546613e-06, + "loss": 0.4538, + "step": 2186 + }, + { + "epoch": 0.6139809096013475, + "grad_norm": 0.7853017449378967, + "learning_rate": 9.670229790812994e-06, + "loss": 0.4679, + "step": 2187 + }, + { + "epoch": 0.6142616507580011, + "grad_norm": 0.6382354497909546, + "learning_rate": 9.669646184701494e-06, + "loss": 0.4307, + "step": 2188 + }, + { + "epoch": 0.6145423919146547, + "grad_norm": 0.6988775730133057, + "learning_rate": 9.669062080274391e-06, + "loss": 0.4371, + "step": 2189 + }, + { + "epoch": 0.6148231330713082, + "grad_norm": 0.794001579284668, + "learning_rate": 9.668477477594021e-06, + "loss": 0.4761, + "step": 2190 + }, + { + "epoch": 0.6151038742279619, + "grad_norm": 0.8090159296989441, + "learning_rate": 9.667892376722763e-06, + "loss": 0.4631, + "step": 2191 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 0.7847395539283752, + "learning_rate": 9.667306777723058e-06, + "loss": 0.4903, + "step": 2192 + }, + { + "epoch": 0.615665356541269, + "grad_norm": 0.8224895000457764, + "learning_rate": 9.666720680657399e-06, + "loss": 0.4868, + "step": 2193 + }, + { + "epoch": 0.6159460976979225, + "grad_norm": 0.8005455732345581, + "learning_rate": 9.666134085588329e-06, + "loss": 0.441, + "step": 2194 + }, + { + "epoch": 0.6162268388545761, + "grad_norm": 0.8278933167457581, + "learning_rate": 9.665546992578446e-06, + "loss": 0.4662, + "step": 2195 + }, + { + "epoch": 0.6165075800112296, + "grad_norm": 0.8037208318710327, + "learning_rate": 9.6649594016904e-06, + "loss": 0.4814, + "step": 2196 + }, + { + "epoch": 0.6167883211678832, + "grad_norm": 0.8241268396377563, + "learning_rate": 9.6643713129869e-06, + "loss": 0.4458, + "step": 2197 + }, + { + "epoch": 0.6170690623245367, + "grad_norm": 0.7292011380195618, + "learning_rate": 9.663782726530696e-06, + "loss": 0.4591, + "step": 2198 + }, + { + "epoch": 0.6173498034811904, + "grad_norm": 0.6113244891166687, + "learning_rate": 9.6631936423846e-06, + "loss": 0.4147, + "step": 2199 + }, + { + "epoch": 0.617630544637844, + "grad_norm": 0.6649948358535767, + "learning_rate": 9.66260406061148e-06, + "loss": 0.4117, + "step": 2200 + }, + { + "epoch": 0.6179112857944975, + "grad_norm": 0.8504791855812073, + "learning_rate": 9.66201398127425e-06, + "loss": 0.5089, + "step": 2201 + }, + { + "epoch": 0.618192026951151, + "grad_norm": 0.7351915240287781, + "learning_rate": 9.661423404435877e-06, + "loss": 0.4125, + "step": 2202 + }, + { + "epoch": 0.6184727681078046, + "grad_norm": 0.7622799277305603, + "learning_rate": 9.660832330159387e-06, + "loss": 0.4494, + "step": 2203 + }, + { + "epoch": 0.6187535092644582, + "grad_norm": 0.7854540944099426, + "learning_rate": 9.660240758507852e-06, + "loss": 0.4603, + "step": 2204 + }, + { + "epoch": 0.6190342504211117, + "grad_norm": 0.861838161945343, + "learning_rate": 9.659648689544406e-06, + "loss": 0.4706, + "step": 2205 + }, + { + "epoch": 0.6193149915777653, + "grad_norm": 0.7758306860923767, + "learning_rate": 9.659056123332229e-06, + "loss": 0.4796, + "step": 2206 + }, + { + "epoch": 0.6195957327344188, + "grad_norm": 0.7804763317108154, + "learning_rate": 9.658463059934553e-06, + "loss": 0.3905, + "step": 2207 + }, + { + "epoch": 0.6198764738910725, + "grad_norm": 0.7302863001823425, + "learning_rate": 9.657869499414669e-06, + "loss": 0.4803, + "step": 2208 + }, + { + "epoch": 0.620157215047726, + "grad_norm": 0.7502059936523438, + "learning_rate": 9.657275441835919e-06, + "loss": 0.4721, + "step": 2209 + }, + { + "epoch": 0.6204379562043796, + "grad_norm": 0.7972210645675659, + "learning_rate": 9.656680887261693e-06, + "loss": 0.4521, + "step": 2210 + }, + { + "epoch": 0.6207186973610331, + "grad_norm": 0.690517008304596, + "learning_rate": 9.656085835755442e-06, + "loss": 0.4554, + "step": 2211 + }, + { + "epoch": 0.6209994385176867, + "grad_norm": 0.6880694031715393, + "learning_rate": 9.655490287380664e-06, + "loss": 0.4203, + "step": 2212 + }, + { + "epoch": 0.6212801796743402, + "grad_norm": 0.8624227643013, + "learning_rate": 9.654894242200914e-06, + "loss": 0.4467, + "step": 2213 + }, + { + "epoch": 0.6215609208309938, + "grad_norm": 0.7833415865898132, + "learning_rate": 9.654297700279798e-06, + "loss": 0.468, + "step": 2214 + }, + { + "epoch": 0.6218416619876473, + "grad_norm": 0.7033445835113525, + "learning_rate": 9.653700661680973e-06, + "loss": 0.493, + "step": 2215 + }, + { + "epoch": 0.622122403144301, + "grad_norm": 0.9222918748855591, + "learning_rate": 9.653103126468154e-06, + "loss": 0.4662, + "step": 2216 + }, + { + "epoch": 0.6224031443009546, + "grad_norm": 0.7879701852798462, + "learning_rate": 9.652505094705105e-06, + "loss": 0.5095, + "step": 2217 + }, + { + "epoch": 0.6226838854576081, + "grad_norm": 0.6616983413696289, + "learning_rate": 9.651906566455645e-06, + "loss": 0.4761, + "step": 2218 + }, + { + "epoch": 0.6229646266142617, + "grad_norm": 0.8261833190917969, + "learning_rate": 9.651307541783643e-06, + "loss": 0.4322, + "step": 2219 + }, + { + "epoch": 0.6232453677709152, + "grad_norm": 0.8467888236045837, + "learning_rate": 9.650708020753025e-06, + "loss": 0.4686, + "step": 2220 + }, + { + "epoch": 0.6235261089275688, + "grad_norm": 0.7467922568321228, + "learning_rate": 9.650108003427767e-06, + "loss": 0.431, + "step": 2221 + }, + { + "epoch": 0.6238068500842223, + "grad_norm": 0.9102432131767273, + "learning_rate": 9.649507489871902e-06, + "loss": 0.4263, + "step": 2222 + }, + { + "epoch": 0.6240875912408759, + "grad_norm": 0.9594277143478394, + "learning_rate": 9.64890648014951e-06, + "loss": 0.4684, + "step": 2223 + }, + { + "epoch": 0.6243683323975294, + "grad_norm": 0.7878795266151428, + "learning_rate": 9.64830497432473e-06, + "loss": 0.4443, + "step": 2224 + }, + { + "epoch": 0.6246490735541831, + "grad_norm": 0.8136244416236877, + "learning_rate": 9.647702972461745e-06, + "loss": 0.4573, + "step": 2225 + }, + { + "epoch": 0.6249298147108366, + "grad_norm": 0.8782902956008911, + "learning_rate": 9.647100474624805e-06, + "loss": 0.4236, + "step": 2226 + }, + { + "epoch": 0.6252105558674902, + "grad_norm": 0.7650416493415833, + "learning_rate": 9.646497480878199e-06, + "loss": 0.4489, + "step": 2227 + }, + { + "epoch": 0.6254912970241437, + "grad_norm": 0.8226631879806519, + "learning_rate": 9.645893991286276e-06, + "loss": 0.4495, + "step": 2228 + }, + { + "epoch": 0.6257720381807973, + "grad_norm": 0.7743797302246094, + "learning_rate": 9.645290005913437e-06, + "loss": 0.4519, + "step": 2229 + }, + { + "epoch": 0.6260527793374508, + "grad_norm": 0.6662293672561646, + "learning_rate": 9.644685524824137e-06, + "loss": 0.4652, + "step": 2230 + }, + { + "epoch": 0.6263335204941044, + "grad_norm": 0.6626827716827393, + "learning_rate": 9.64408054808288e-06, + "loss": 0.4685, + "step": 2231 + }, + { + "epoch": 0.626614261650758, + "grad_norm": 0.7970080375671387, + "learning_rate": 9.643475075754227e-06, + "loss": 0.476, + "step": 2232 + }, + { + "epoch": 0.6268950028074116, + "grad_norm": 0.7575878500938416, + "learning_rate": 9.642869107902791e-06, + "loss": 0.4162, + "step": 2233 + }, + { + "epoch": 0.6271757439640652, + "grad_norm": 0.7158808708190918, + "learning_rate": 9.642262644593235e-06, + "loss": 0.4474, + "step": 2234 + }, + { + "epoch": 0.6274564851207187, + "grad_norm": 0.7752962112426758, + "learning_rate": 9.641655685890277e-06, + "loss": 0.4456, + "step": 2235 + }, + { + "epoch": 0.6277372262773723, + "grad_norm": 0.7573445439338684, + "learning_rate": 9.641048231858689e-06, + "loss": 0.4133, + "step": 2236 + }, + { + "epoch": 0.6280179674340258, + "grad_norm": 0.733131468296051, + "learning_rate": 9.640440282563294e-06, + "loss": 0.4745, + "step": 2237 + }, + { + "epoch": 0.6282987085906794, + "grad_norm": 0.8936368227005005, + "learning_rate": 9.639831838068972e-06, + "loss": 0.4777, + "step": 2238 + }, + { + "epoch": 0.6285794497473329, + "grad_norm": 0.737271249294281, + "learning_rate": 9.639222898440647e-06, + "loss": 0.4595, + "step": 2239 + }, + { + "epoch": 0.6288601909039865, + "grad_norm": 0.7406973838806152, + "learning_rate": 9.638613463743303e-06, + "loss": 0.4238, + "step": 2240 + }, + { + "epoch": 0.62914093206064, + "grad_norm": 0.7626855969429016, + "learning_rate": 9.638003534041977e-06, + "loss": 0.45, + "step": 2241 + }, + { + "epoch": 0.6294216732172937, + "grad_norm": 0.7607765197753906, + "learning_rate": 9.637393109401755e-06, + "loss": 0.4457, + "step": 2242 + }, + { + "epoch": 0.6297024143739473, + "grad_norm": 0.7057299613952637, + "learning_rate": 9.63678218988778e-06, + "loss": 0.4707, + "step": 2243 + }, + { + "epoch": 0.6299831555306008, + "grad_norm": 0.8158141374588013, + "learning_rate": 9.636170775565243e-06, + "loss": 0.4902, + "step": 2244 + }, + { + "epoch": 0.6302638966872544, + "grad_norm": 0.8315247893333435, + "learning_rate": 9.63555886649939e-06, + "loss": 0.5091, + "step": 2245 + }, + { + "epoch": 0.6305446378439079, + "grad_norm": 0.7286932468414307, + "learning_rate": 9.634946462755523e-06, + "loss": 0.5043, + "step": 2246 + }, + { + "epoch": 0.6308253790005615, + "grad_norm": 0.6606234312057495, + "learning_rate": 9.634333564398992e-06, + "loss": 0.4394, + "step": 2247 + }, + { + "epoch": 0.631106120157215, + "grad_norm": 0.8718246817588806, + "learning_rate": 9.633720171495202e-06, + "loss": 0.4927, + "step": 2248 + }, + { + "epoch": 0.6313868613138686, + "grad_norm": 0.753736138343811, + "learning_rate": 9.633106284109612e-06, + "loss": 0.4545, + "step": 2249 + }, + { + "epoch": 0.6316676024705222, + "grad_norm": 0.7505636215209961, + "learning_rate": 9.632491902307727e-06, + "loss": 0.4711, + "step": 2250 + }, + { + "epoch": 0.6319483436271758, + "grad_norm": 0.7484976649284363, + "learning_rate": 9.631877026155118e-06, + "loss": 0.4726, + "step": 2251 + }, + { + "epoch": 0.6322290847838293, + "grad_norm": 0.9118866920471191, + "learning_rate": 9.631261655717394e-06, + "loss": 0.5145, + "step": 2252 + }, + { + "epoch": 0.6325098259404829, + "grad_norm": 0.6723572015762329, + "learning_rate": 9.630645791060226e-06, + "loss": 0.4593, + "step": 2253 + }, + { + "epoch": 0.6327905670971364, + "grad_norm": 0.6758120059967041, + "learning_rate": 9.630029432249336e-06, + "loss": 0.3962, + "step": 2254 + }, + { + "epoch": 0.63307130825379, + "grad_norm": 0.7874472141265869, + "learning_rate": 9.629412579350496e-06, + "loss": 0.4346, + "step": 2255 + }, + { + "epoch": 0.6333520494104435, + "grad_norm": 0.7685884833335876, + "learning_rate": 9.628795232429535e-06, + "loss": 0.4687, + "step": 2256 + }, + { + "epoch": 0.6336327905670971, + "grad_norm": 0.7021225094795227, + "learning_rate": 9.628177391552333e-06, + "loss": 0.4444, + "step": 2257 + }, + { + "epoch": 0.6339135317237508, + "grad_norm": 0.7794039249420166, + "learning_rate": 9.627559056784818e-06, + "loss": 0.4489, + "step": 2258 + }, + { + "epoch": 0.6341942728804043, + "grad_norm": 0.9281190633773804, + "learning_rate": 9.626940228192979e-06, + "loss": 0.4624, + "step": 2259 + }, + { + "epoch": 0.6344750140370579, + "grad_norm": 0.643397331237793, + "learning_rate": 9.626320905842849e-06, + "loss": 0.4257, + "step": 2260 + }, + { + "epoch": 0.6347557551937114, + "grad_norm": 0.7969174981117249, + "learning_rate": 9.625701089800525e-06, + "loss": 0.4429, + "step": 2261 + }, + { + "epoch": 0.635036496350365, + "grad_norm": 0.9377960562705994, + "learning_rate": 9.625080780132143e-06, + "loss": 0.4882, + "step": 2262 + }, + { + "epoch": 0.6353172375070185, + "grad_norm": 0.728842556476593, + "learning_rate": 9.624459976903903e-06, + "loss": 0.4482, + "step": 2263 + }, + { + "epoch": 0.6355979786636721, + "grad_norm": 0.683595597743988, + "learning_rate": 9.623838680182051e-06, + "loss": 0.451, + "step": 2264 + }, + { + "epoch": 0.6358787198203256, + "grad_norm": 0.8157421350479126, + "learning_rate": 9.623216890032892e-06, + "loss": 0.4768, + "step": 2265 + }, + { + "epoch": 0.6361594609769792, + "grad_norm": 0.6925378441810608, + "learning_rate": 9.622594606522772e-06, + "loss": 0.4301, + "step": 2266 + }, + { + "epoch": 0.6364402021336328, + "grad_norm": 0.8240598440170288, + "learning_rate": 9.621971829718104e-06, + "loss": 0.4379, + "step": 2267 + }, + { + "epoch": 0.6367209432902864, + "grad_norm": 0.737712562084198, + "learning_rate": 9.621348559685345e-06, + "loss": 0.4519, + "step": 2268 + }, + { + "epoch": 0.6370016844469399, + "grad_norm": 0.845904529094696, + "learning_rate": 9.620724796491004e-06, + "loss": 0.4489, + "step": 2269 + }, + { + "epoch": 0.6372824256035935, + "grad_norm": 0.7575750350952148, + "learning_rate": 9.620100540201648e-06, + "loss": 0.4642, + "step": 2270 + }, + { + "epoch": 0.637563166760247, + "grad_norm": 0.7919850945472717, + "learning_rate": 9.619475790883894e-06, + "loss": 0.4513, + "step": 2271 + }, + { + "epoch": 0.6378439079169006, + "grad_norm": 0.856126606464386, + "learning_rate": 9.618850548604409e-06, + "loss": 0.4749, + "step": 2272 + }, + { + "epoch": 0.6381246490735542, + "grad_norm": 0.7882614135742188, + "learning_rate": 9.618224813429916e-06, + "loss": 0.4168, + "step": 2273 + }, + { + "epoch": 0.6384053902302077, + "grad_norm": 0.7316656112670898, + "learning_rate": 9.61759858542719e-06, + "loss": 0.4741, + "step": 2274 + }, + { + "epoch": 0.6386861313868614, + "grad_norm": 0.7819839119911194, + "learning_rate": 9.616971864663059e-06, + "loss": 0.4935, + "step": 2275 + }, + { + "epoch": 0.6389668725435149, + "grad_norm": 0.8177666664123535, + "learning_rate": 9.616344651204398e-06, + "loss": 0.4671, + "step": 2276 + }, + { + "epoch": 0.6392476137001685, + "grad_norm": 0.6885898113250732, + "learning_rate": 9.615716945118147e-06, + "loss": 0.4029, + "step": 2277 + }, + { + "epoch": 0.639528354856822, + "grad_norm": 0.8757466673851013, + "learning_rate": 9.615088746471286e-06, + "loss": 0.4449, + "step": 2278 + }, + { + "epoch": 0.6398090960134756, + "grad_norm": 0.7296600937843323, + "learning_rate": 9.614460055330852e-06, + "loss": 0.4331, + "step": 2279 + }, + { + "epoch": 0.6400898371701291, + "grad_norm": 0.8537064790725708, + "learning_rate": 9.613830871763939e-06, + "loss": 0.4844, + "step": 2280 + }, + { + "epoch": 0.6403705783267827, + "grad_norm": 0.7637564539909363, + "learning_rate": 9.613201195837684e-06, + "loss": 0.4167, + "step": 2281 + }, + { + "epoch": 0.6406513194834362, + "grad_norm": 0.8047333359718323, + "learning_rate": 9.612571027619287e-06, + "loss": 0.4301, + "step": 2282 + }, + { + "epoch": 0.6409320606400898, + "grad_norm": 0.7688471674919128, + "learning_rate": 9.611940367175992e-06, + "loss": 0.4373, + "step": 2283 + }, + { + "epoch": 0.6412128017967434, + "grad_norm": 0.7930221557617188, + "learning_rate": 9.611309214575103e-06, + "loss": 0.4407, + "step": 2284 + }, + { + "epoch": 0.641493542953397, + "grad_norm": 0.7173995971679688, + "learning_rate": 9.610677569883967e-06, + "loss": 0.4651, + "step": 2285 + }, + { + "epoch": 0.6417742841100506, + "grad_norm": 0.6970399022102356, + "learning_rate": 9.610045433169994e-06, + "loss": 0.4458, + "step": 2286 + }, + { + "epoch": 0.6420550252667041, + "grad_norm": 0.7596144676208496, + "learning_rate": 9.609412804500642e-06, + "loss": 0.4504, + "step": 2287 + }, + { + "epoch": 0.6423357664233577, + "grad_norm": 0.8646596074104309, + "learning_rate": 9.608779683943417e-06, + "loss": 0.4748, + "step": 2288 + }, + { + "epoch": 0.6426165075800112, + "grad_norm": 0.7043774127960205, + "learning_rate": 9.608146071565888e-06, + "loss": 0.4651, + "step": 2289 + }, + { + "epoch": 0.6428972487366648, + "grad_norm": 0.6364893913269043, + "learning_rate": 9.607511967435663e-06, + "loss": 0.4205, + "step": 2290 + }, + { + "epoch": 0.6431779898933183, + "grad_norm": 0.7925765514373779, + "learning_rate": 9.606877371620413e-06, + "loss": 0.4129, + "step": 2291 + }, + { + "epoch": 0.643458731049972, + "grad_norm": 0.7484250068664551, + "learning_rate": 9.606242284187861e-06, + "loss": 0.4414, + "step": 2292 + }, + { + "epoch": 0.6437394722066255, + "grad_norm": 0.7754836082458496, + "learning_rate": 9.605606705205774e-06, + "loss": 0.442, + "step": 2293 + }, + { + "epoch": 0.6440202133632791, + "grad_norm": 0.6823911070823669, + "learning_rate": 9.604970634741981e-06, + "loss": 0.4544, + "step": 2294 + }, + { + "epoch": 0.6443009545199326, + "grad_norm": 0.6883746981620789, + "learning_rate": 9.604334072864358e-06, + "loss": 0.427, + "step": 2295 + }, + { + "epoch": 0.6445816956765862, + "grad_norm": 0.7669408917427063, + "learning_rate": 9.603697019640837e-06, + "loss": 0.4852, + "step": 2296 + }, + { + "epoch": 0.6448624368332397, + "grad_norm": 0.7926437258720398, + "learning_rate": 9.603059475139395e-06, + "loss": 0.4602, + "step": 2297 + }, + { + "epoch": 0.6451431779898933, + "grad_norm": 0.6788368225097656, + "learning_rate": 9.602421439428073e-06, + "loss": 0.4353, + "step": 2298 + }, + { + "epoch": 0.6454239191465468, + "grad_norm": 0.791560709476471, + "learning_rate": 9.601782912574955e-06, + "loss": 0.5169, + "step": 2299 + }, + { + "epoch": 0.6457046603032004, + "grad_norm": 0.7834374904632568, + "learning_rate": 9.601143894648182e-06, + "loss": 0.4565, + "step": 2300 + }, + { + "epoch": 0.6459854014598541, + "grad_norm": 0.8093973994255066, + "learning_rate": 9.600504385715943e-06, + "loss": 0.4327, + "step": 2301 + }, + { + "epoch": 0.6462661426165076, + "grad_norm": 0.7137249112129211, + "learning_rate": 9.599864385846487e-06, + "loss": 0.4228, + "step": 2302 + }, + { + "epoch": 0.6465468837731612, + "grad_norm": 0.7782304286956787, + "learning_rate": 9.599223895108107e-06, + "loss": 0.4197, + "step": 2303 + }, + { + "epoch": 0.6468276249298147, + "grad_norm": 0.8250675797462463, + "learning_rate": 9.598582913569153e-06, + "loss": 0.5315, + "step": 2304 + }, + { + "epoch": 0.6471083660864683, + "grad_norm": 0.8861746788024902, + "learning_rate": 9.597941441298028e-06, + "loss": 0.5203, + "step": 2305 + }, + { + "epoch": 0.6473891072431218, + "grad_norm": 0.7449389100074768, + "learning_rate": 9.597299478363186e-06, + "loss": 0.4792, + "step": 2306 + }, + { + "epoch": 0.6476698483997754, + "grad_norm": 0.6926552057266235, + "learning_rate": 9.596657024833132e-06, + "loss": 0.4126, + "step": 2307 + }, + { + "epoch": 0.6479505895564289, + "grad_norm": 0.8158414959907532, + "learning_rate": 9.596014080776424e-06, + "loss": 0.4573, + "step": 2308 + }, + { + "epoch": 0.6482313307130826, + "grad_norm": 0.7573313117027283, + "learning_rate": 9.595370646261674e-06, + "loss": 0.4671, + "step": 2309 + }, + { + "epoch": 0.6485120718697361, + "grad_norm": 0.7863513827323914, + "learning_rate": 9.594726721357545e-06, + "loss": 0.4695, + "step": 2310 + }, + { + "epoch": 0.6487928130263897, + "grad_norm": 0.8010240793228149, + "learning_rate": 9.594082306132755e-06, + "loss": 0.5111, + "step": 2311 + }, + { + "epoch": 0.6490735541830432, + "grad_norm": 0.8068008422851562, + "learning_rate": 9.593437400656069e-06, + "loss": 0.4253, + "step": 2312 + }, + { + "epoch": 0.6493542953396968, + "grad_norm": 0.7970138788223267, + "learning_rate": 9.592792004996307e-06, + "loss": 0.4752, + "step": 2313 + }, + { + "epoch": 0.6496350364963503, + "grad_norm": 0.7540660500526428, + "learning_rate": 9.592146119222345e-06, + "loss": 0.4656, + "step": 2314 + }, + { + "epoch": 0.6499157776530039, + "grad_norm": 0.7930955290794373, + "learning_rate": 9.591499743403105e-06, + "loss": 0.4496, + "step": 2315 + }, + { + "epoch": 0.6501965188096575, + "grad_norm": 0.9169591069221497, + "learning_rate": 9.590852877607566e-06, + "loss": 0.4957, + "step": 2316 + }, + { + "epoch": 0.6504772599663111, + "grad_norm": 0.815403938293457, + "learning_rate": 9.590205521904753e-06, + "loss": 0.4478, + "step": 2317 + }, + { + "epoch": 0.6507580011229647, + "grad_norm": 0.7266177535057068, + "learning_rate": 9.589557676363755e-06, + "loss": 0.4439, + "step": 2318 + }, + { + "epoch": 0.6510387422796182, + "grad_norm": 0.7989054918289185, + "learning_rate": 9.588909341053702e-06, + "loss": 0.4838, + "step": 2319 + }, + { + "epoch": 0.6513194834362718, + "grad_norm": 0.8077318072319031, + "learning_rate": 9.58826051604378e-06, + "loss": 0.4566, + "step": 2320 + }, + { + "epoch": 0.6516002245929253, + "grad_norm": 0.7144681215286255, + "learning_rate": 9.587611201403228e-06, + "loss": 0.4249, + "step": 2321 + }, + { + "epoch": 0.6518809657495789, + "grad_norm": 0.7660030126571655, + "learning_rate": 9.586961397201338e-06, + "loss": 0.4477, + "step": 2322 + }, + { + "epoch": 0.6521617069062324, + "grad_norm": 0.7299714088439941, + "learning_rate": 9.58631110350745e-06, + "loss": 0.4455, + "step": 2323 + }, + { + "epoch": 0.652442448062886, + "grad_norm": 0.8112562298774719, + "learning_rate": 9.585660320390964e-06, + "loss": 0.4506, + "step": 2324 + }, + { + "epoch": 0.6527231892195395, + "grad_norm": 0.8056146502494812, + "learning_rate": 9.585009047921323e-06, + "loss": 0.425, + "step": 2325 + }, + { + "epoch": 0.6530039303761932, + "grad_norm": 0.7698877453804016, + "learning_rate": 9.58435728616803e-06, + "loss": 0.4735, + "step": 2326 + }, + { + "epoch": 0.6532846715328468, + "grad_norm": 0.6862674355506897, + "learning_rate": 9.583705035200634e-06, + "loss": 0.4949, + "step": 2327 + }, + { + "epoch": 0.6535654126895003, + "grad_norm": 0.84004145860672, + "learning_rate": 9.583052295088742e-06, + "loss": 0.5078, + "step": 2328 + }, + { + "epoch": 0.6538461538461539, + "grad_norm": 0.7269312739372253, + "learning_rate": 9.582399065902008e-06, + "loss": 0.4443, + "step": 2329 + }, + { + "epoch": 0.6541268950028074, + "grad_norm": 0.8159030079841614, + "learning_rate": 9.581745347710143e-06, + "loss": 0.4615, + "step": 2330 + }, + { + "epoch": 0.654407636159461, + "grad_norm": 0.7063059210777283, + "learning_rate": 9.581091140582906e-06, + "loss": 0.4521, + "step": 2331 + }, + { + "epoch": 0.6546883773161145, + "grad_norm": 0.6708444952964783, + "learning_rate": 9.58043644459011e-06, + "loss": 0.4062, + "step": 2332 + }, + { + "epoch": 0.6549691184727681, + "grad_norm": 0.7174159288406372, + "learning_rate": 9.579781259801623e-06, + "loss": 0.4654, + "step": 2333 + }, + { + "epoch": 0.6552498596294217, + "grad_norm": 0.6760228276252747, + "learning_rate": 9.579125586287357e-06, + "loss": 0.4308, + "step": 2334 + }, + { + "epoch": 0.6555306007860753, + "grad_norm": 0.8395696878433228, + "learning_rate": 9.578469424117284e-06, + "loss": 0.4963, + "step": 2335 + }, + { + "epoch": 0.6558113419427288, + "grad_norm": 0.6900395750999451, + "learning_rate": 9.577812773361428e-06, + "loss": 0.4627, + "step": 2336 + }, + { + "epoch": 0.6560920830993824, + "grad_norm": 0.6162226796150208, + "learning_rate": 9.57715563408986e-06, + "loss": 0.4273, + "step": 2337 + }, + { + "epoch": 0.6563728242560359, + "grad_norm": 0.6915835738182068, + "learning_rate": 9.576498006372705e-06, + "loss": 0.4195, + "step": 2338 + }, + { + "epoch": 0.6566535654126895, + "grad_norm": 0.7447922825813293, + "learning_rate": 9.575839890280141e-06, + "loss": 0.4395, + "step": 2339 + }, + { + "epoch": 0.656934306569343, + "grad_norm": 0.6708216071128845, + "learning_rate": 9.5751812858824e-06, + "loss": 0.4265, + "step": 2340 + }, + { + "epoch": 0.6572150477259966, + "grad_norm": 0.7260141968727112, + "learning_rate": 9.574522193249764e-06, + "loss": 0.4462, + "step": 2341 + }, + { + "epoch": 0.6574957888826501, + "grad_norm": 0.8416393995285034, + "learning_rate": 9.573862612452567e-06, + "loss": 0.436, + "step": 2342 + }, + { + "epoch": 0.6577765300393038, + "grad_norm": 0.8120285272598267, + "learning_rate": 9.573202543561195e-06, + "loss": 0.4726, + "step": 2343 + }, + { + "epoch": 0.6580572711959574, + "grad_norm": 0.7599145770072937, + "learning_rate": 9.572541986646087e-06, + "loss": 0.4428, + "step": 2344 + }, + { + "epoch": 0.6583380123526109, + "grad_norm": 0.7690832018852234, + "learning_rate": 9.571880941777732e-06, + "loss": 0.438, + "step": 2345 + }, + { + "epoch": 0.6586187535092645, + "grad_norm": 0.8145992755889893, + "learning_rate": 9.571219409026672e-06, + "loss": 0.4551, + "step": 2346 + }, + { + "epoch": 0.658899494665918, + "grad_norm": 0.7256839871406555, + "learning_rate": 9.570557388463504e-06, + "loss": 0.406, + "step": 2347 + }, + { + "epoch": 0.6591802358225716, + "grad_norm": 0.7386422157287598, + "learning_rate": 9.569894880158876e-06, + "loss": 0.4488, + "step": 2348 + }, + { + "epoch": 0.6594609769792251, + "grad_norm": 0.8676342964172363, + "learning_rate": 9.569231884183483e-06, + "loss": 0.5044, + "step": 2349 + }, + { + "epoch": 0.6597417181358787, + "grad_norm": 0.7732373476028442, + "learning_rate": 9.568568400608079e-06, + "loss": 0.4671, + "step": 2350 + }, + { + "epoch": 0.6600224592925323, + "grad_norm": 0.8180080652236938, + "learning_rate": 9.567904429503463e-06, + "loss": 0.4935, + "step": 2351 + }, + { + "epoch": 0.6603032004491859, + "grad_norm": 0.7789162397384644, + "learning_rate": 9.567239970940492e-06, + "loss": 0.4285, + "step": 2352 + }, + { + "epoch": 0.6605839416058394, + "grad_norm": 0.7517969608306885, + "learning_rate": 9.566575024990075e-06, + "loss": 0.4401, + "step": 2353 + }, + { + "epoch": 0.660864682762493, + "grad_norm": 0.8047889471054077, + "learning_rate": 9.565909591723169e-06, + "loss": 0.4281, + "step": 2354 + }, + { + "epoch": 0.6611454239191465, + "grad_norm": 0.6843998432159424, + "learning_rate": 9.565243671210783e-06, + "loss": 0.4996, + "step": 2355 + }, + { + "epoch": 0.6614261650758001, + "grad_norm": 0.7578331232070923, + "learning_rate": 9.564577263523985e-06, + "loss": 0.4404, + "step": 2356 + }, + { + "epoch": 0.6617069062324537, + "grad_norm": 0.7489643096923828, + "learning_rate": 9.563910368733883e-06, + "loss": 0.4583, + "step": 2357 + }, + { + "epoch": 0.6619876473891072, + "grad_norm": 0.8898212313652039, + "learning_rate": 9.56324298691165e-06, + "loss": 0.4822, + "step": 2358 + }, + { + "epoch": 0.6622683885457608, + "grad_norm": 0.7135800123214722, + "learning_rate": 9.562575118128501e-06, + "loss": 0.4378, + "step": 2359 + }, + { + "epoch": 0.6625491297024144, + "grad_norm": 0.7930676937103271, + "learning_rate": 9.561906762455708e-06, + "loss": 0.4626, + "step": 2360 + }, + { + "epoch": 0.662829870859068, + "grad_norm": 0.6866939663887024, + "learning_rate": 9.561237919964595e-06, + "loss": 0.3879, + "step": 2361 + }, + { + "epoch": 0.6631106120157215, + "grad_norm": 0.8476772904396057, + "learning_rate": 9.560568590726536e-06, + "loss": 0.4268, + "step": 2362 + }, + { + "epoch": 0.6633913531723751, + "grad_norm": 0.7971569895744324, + "learning_rate": 9.559898774812957e-06, + "loss": 0.4879, + "step": 2363 + }, + { + "epoch": 0.6636720943290286, + "grad_norm": 0.8160806894302368, + "learning_rate": 9.559228472295336e-06, + "loss": 0.5194, + "step": 2364 + }, + { + "epoch": 0.6639528354856822, + "grad_norm": 0.7013716697692871, + "learning_rate": 9.558557683245204e-06, + "loss": 0.4323, + "step": 2365 + }, + { + "epoch": 0.6642335766423357, + "grad_norm": 0.7933659553527832, + "learning_rate": 9.557886407734145e-06, + "loss": 0.4903, + "step": 2366 + }, + { + "epoch": 0.6645143177989893, + "grad_norm": 0.7558932900428772, + "learning_rate": 9.557214645833792e-06, + "loss": 0.4799, + "step": 2367 + }, + { + "epoch": 0.664795058955643, + "grad_norm": 0.743954062461853, + "learning_rate": 9.556542397615831e-06, + "loss": 0.4406, + "step": 2368 + }, + { + "epoch": 0.6650758001122965, + "grad_norm": 0.7228794097900391, + "learning_rate": 9.555869663152003e-06, + "loss": 0.4065, + "step": 2369 + }, + { + "epoch": 0.66535654126895, + "grad_norm": 0.7404186129570007, + "learning_rate": 9.555196442514091e-06, + "loss": 0.4371, + "step": 2370 + }, + { + "epoch": 0.6656372824256036, + "grad_norm": 0.7843636274337769, + "learning_rate": 9.554522735773946e-06, + "loss": 0.4449, + "step": 2371 + }, + { + "epoch": 0.6659180235822572, + "grad_norm": 0.7291850447654724, + "learning_rate": 9.553848543003454e-06, + "loss": 0.4812, + "step": 2372 + }, + { + "epoch": 0.6661987647389107, + "grad_norm": 0.7260723114013672, + "learning_rate": 9.553173864274567e-06, + "loss": 0.5139, + "step": 2373 + }, + { + "epoch": 0.6664795058955643, + "grad_norm": 0.7064835429191589, + "learning_rate": 9.552498699659279e-06, + "loss": 0.468, + "step": 2374 + }, + { + "epoch": 0.6667602470522178, + "grad_norm": 0.7111903429031372, + "learning_rate": 9.551823049229638e-06, + "loss": 0.475, + "step": 2375 + }, + { + "epoch": 0.6670409882088714, + "grad_norm": 0.8541937470436096, + "learning_rate": 9.551146913057747e-06, + "loss": 0.4862, + "step": 2376 + }, + { + "epoch": 0.667321729365525, + "grad_norm": 0.8285909295082092, + "learning_rate": 9.55047029121576e-06, + "loss": 0.4276, + "step": 2377 + }, + { + "epoch": 0.6676024705221786, + "grad_norm": 0.6595309972763062, + "learning_rate": 9.549793183775882e-06, + "loss": 0.4413, + "step": 2378 + }, + { + "epoch": 0.6678832116788321, + "grad_norm": 0.7900859713554382, + "learning_rate": 9.549115590810369e-06, + "loss": 0.4404, + "step": 2379 + }, + { + "epoch": 0.6681639528354857, + "grad_norm": 0.8686185479164124, + "learning_rate": 9.548437512391527e-06, + "loss": 0.4782, + "step": 2380 + }, + { + "epoch": 0.6684446939921392, + "grad_norm": 0.8771662712097168, + "learning_rate": 9.54775894859172e-06, + "loss": 0.4371, + "step": 2381 + }, + { + "epoch": 0.6687254351487928, + "grad_norm": 0.7942057251930237, + "learning_rate": 9.547079899483358e-06, + "loss": 0.464, + "step": 2382 + }, + { + "epoch": 0.6690061763054463, + "grad_norm": 0.7294292449951172, + "learning_rate": 9.546400365138906e-06, + "loss": 0.44, + "step": 2383 + }, + { + "epoch": 0.6692869174620999, + "grad_norm": 0.7692420482635498, + "learning_rate": 9.54572034563088e-06, + "loss": 0.4585, + "step": 2384 + }, + { + "epoch": 0.6695676586187536, + "grad_norm": 0.7228407859802246, + "learning_rate": 9.545039841031845e-06, + "loss": 0.4761, + "step": 2385 + }, + { + "epoch": 0.6698483997754071, + "grad_norm": 0.7915753126144409, + "learning_rate": 9.544358851414423e-06, + "loss": 0.3975, + "step": 2386 + }, + { + "epoch": 0.6701291409320607, + "grad_norm": 0.81806480884552, + "learning_rate": 9.543677376851284e-06, + "loss": 0.4716, + "step": 2387 + }, + { + "epoch": 0.6704098820887142, + "grad_norm": 0.702312707901001, + "learning_rate": 9.542995417415151e-06, + "loss": 0.4594, + "step": 2388 + }, + { + "epoch": 0.6706906232453678, + "grad_norm": 0.7722580432891846, + "learning_rate": 9.542312973178797e-06, + "loss": 0.4851, + "step": 2389 + }, + { + "epoch": 0.6709713644020213, + "grad_norm": 0.7912092208862305, + "learning_rate": 9.54163004421505e-06, + "loss": 0.4423, + "step": 2390 + }, + { + "epoch": 0.6712521055586749, + "grad_norm": 0.690192461013794, + "learning_rate": 9.540946630596786e-06, + "loss": 0.4178, + "step": 2391 + }, + { + "epoch": 0.6715328467153284, + "grad_norm": 0.7059841752052307, + "learning_rate": 9.540262732396936e-06, + "loss": 0.4542, + "step": 2392 + }, + { + "epoch": 0.6718135878719821, + "grad_norm": 0.6695817112922668, + "learning_rate": 9.539578349688483e-06, + "loss": 0.4384, + "step": 2393 + }, + { + "epoch": 0.6720943290286356, + "grad_norm": 0.8164772987365723, + "learning_rate": 9.538893482544457e-06, + "loss": 0.4891, + "step": 2394 + }, + { + "epoch": 0.6723750701852892, + "grad_norm": 0.8997693657875061, + "learning_rate": 9.538208131037945e-06, + "loss": 0.4789, + "step": 2395 + }, + { + "epoch": 0.6726558113419427, + "grad_norm": 0.7119197249412537, + "learning_rate": 9.53752229524208e-06, + "loss": 0.429, + "step": 2396 + }, + { + "epoch": 0.6729365524985963, + "grad_norm": 0.7313930988311768, + "learning_rate": 9.536835975230055e-06, + "loss": 0.468, + "step": 2397 + }, + { + "epoch": 0.6732172936552498, + "grad_norm": 0.8206318616867065, + "learning_rate": 9.536149171075106e-06, + "loss": 0.462, + "step": 2398 + }, + { + "epoch": 0.6734980348119034, + "grad_norm": 0.671920120716095, + "learning_rate": 9.535461882850527e-06, + "loss": 0.4192, + "step": 2399 + }, + { + "epoch": 0.673778775968557, + "grad_norm": 0.6229581832885742, + "learning_rate": 9.534774110629661e-06, + "loss": 0.4273, + "step": 2400 + }, + { + "epoch": 0.6740595171252105, + "grad_norm": 0.7329521179199219, + "learning_rate": 9.5340858544859e-06, + "loss": 0.4996, + "step": 2401 + }, + { + "epoch": 0.6743402582818642, + "grad_norm": 0.7659728527069092, + "learning_rate": 9.533397114492692e-06, + "loss": 0.4524, + "step": 2402 + }, + { + "epoch": 0.6746209994385177, + "grad_norm": 0.7722841501235962, + "learning_rate": 9.532707890723537e-06, + "loss": 0.4457, + "step": 2403 + }, + { + "epoch": 0.6749017405951713, + "grad_norm": 0.791482150554657, + "learning_rate": 9.532018183251984e-06, + "loss": 0.4838, + "step": 2404 + }, + { + "epoch": 0.6751824817518248, + "grad_norm": 0.8250871300697327, + "learning_rate": 9.53132799215163e-06, + "loss": 0.4715, + "step": 2405 + }, + { + "epoch": 0.6754632229084784, + "grad_norm": 0.9840292930603027, + "learning_rate": 9.530637317496132e-06, + "loss": 0.5335, + "step": 2406 + }, + { + "epoch": 0.6757439640651319, + "grad_norm": 0.7690650820732117, + "learning_rate": 9.529946159359194e-06, + "loss": 0.4639, + "step": 2407 + }, + { + "epoch": 0.6760247052217855, + "grad_norm": 0.8268058896064758, + "learning_rate": 9.529254517814573e-06, + "loss": 0.4876, + "step": 2408 + }, + { + "epoch": 0.676305446378439, + "grad_norm": 0.7992638349533081, + "learning_rate": 9.528562392936074e-06, + "loss": 0.4807, + "step": 2409 + }, + { + "epoch": 0.6765861875350927, + "grad_norm": 0.9305452108383179, + "learning_rate": 9.527869784797558e-06, + "loss": 0.4933, + "step": 2410 + }, + { + "epoch": 0.6768669286917463, + "grad_norm": 0.8309610486030579, + "learning_rate": 9.527176693472935e-06, + "loss": 0.4476, + "step": 2411 + }, + { + "epoch": 0.6771476698483998, + "grad_norm": 0.7115912437438965, + "learning_rate": 9.526483119036169e-06, + "loss": 0.4714, + "step": 2412 + }, + { + "epoch": 0.6774284110050534, + "grad_norm": 0.7805620431900024, + "learning_rate": 9.525789061561273e-06, + "loss": 0.4416, + "step": 2413 + }, + { + "epoch": 0.6777091521617069, + "grad_norm": 0.7787017226219177, + "learning_rate": 9.525094521122311e-06, + "loss": 0.4346, + "step": 2414 + }, + { + "epoch": 0.6779898933183605, + "grad_norm": 0.7078654170036316, + "learning_rate": 9.524399497793401e-06, + "loss": 0.4637, + "step": 2415 + }, + { + "epoch": 0.678270634475014, + "grad_norm": 0.7167945504188538, + "learning_rate": 9.523703991648713e-06, + "loss": 0.4513, + "step": 2416 + }, + { + "epoch": 0.6785513756316676, + "grad_norm": 0.7387191653251648, + "learning_rate": 9.523008002762468e-06, + "loss": 0.4733, + "step": 2417 + }, + { + "epoch": 0.6788321167883211, + "grad_norm": 0.7310633659362793, + "learning_rate": 9.522311531208932e-06, + "loss": 0.495, + "step": 2418 + }, + { + "epoch": 0.6791128579449748, + "grad_norm": 0.6858799457550049, + "learning_rate": 9.521614577062434e-06, + "loss": 0.4783, + "step": 2419 + }, + { + "epoch": 0.6793935991016283, + "grad_norm": 0.6893883347511292, + "learning_rate": 9.520917140397346e-06, + "loss": 0.4738, + "step": 2420 + }, + { + "epoch": 0.6796743402582819, + "grad_norm": 0.770002007484436, + "learning_rate": 9.520219221288095e-06, + "loss": 0.4545, + "step": 2421 + }, + { + "epoch": 0.6799550814149354, + "grad_norm": 0.7189306616783142, + "learning_rate": 9.519520819809158e-06, + "loss": 0.4528, + "step": 2422 + }, + { + "epoch": 0.680235822571589, + "grad_norm": 0.9201745390892029, + "learning_rate": 9.518821936035063e-06, + "loss": 0.5134, + "step": 2423 + }, + { + "epoch": 0.6805165637282425, + "grad_norm": 0.739543616771698, + "learning_rate": 9.518122570040393e-06, + "loss": 0.4746, + "step": 2424 + }, + { + "epoch": 0.6807973048848961, + "grad_norm": 0.8252633810043335, + "learning_rate": 9.517422721899779e-06, + "loss": 0.4553, + "step": 2425 + }, + { + "epoch": 0.6810780460415496, + "grad_norm": 0.8291049003601074, + "learning_rate": 9.516722391687903e-06, + "loss": 0.4629, + "step": 2426 + }, + { + "epoch": 0.6813587871982033, + "grad_norm": 0.7182469964027405, + "learning_rate": 9.5160215794795e-06, + "loss": 0.4784, + "step": 2427 + }, + { + "epoch": 0.6816395283548569, + "grad_norm": 0.6084442138671875, + "learning_rate": 9.515320285349359e-06, + "loss": 0.4337, + "step": 2428 + }, + { + "epoch": 0.6819202695115104, + "grad_norm": 0.8051382303237915, + "learning_rate": 9.514618509372315e-06, + "loss": 0.4674, + "step": 2429 + }, + { + "epoch": 0.682201010668164, + "grad_norm": 0.9197446703910828, + "learning_rate": 9.513916251623259e-06, + "loss": 0.5006, + "step": 2430 + }, + { + "epoch": 0.6824817518248175, + "grad_norm": 0.809824526309967, + "learning_rate": 9.513213512177131e-06, + "loss": 0.4441, + "step": 2431 + }, + { + "epoch": 0.6827624929814711, + "grad_norm": 0.7757403254508972, + "learning_rate": 9.512510291108924e-06, + "loss": 0.4671, + "step": 2432 + }, + { + "epoch": 0.6830432341381246, + "grad_norm": 0.9004009962081909, + "learning_rate": 9.511806588493678e-06, + "loss": 0.4431, + "step": 2433 + }, + { + "epoch": 0.6833239752947782, + "grad_norm": 0.8466460108757019, + "learning_rate": 9.51110240440649e-06, + "loss": 0.4636, + "step": 2434 + }, + { + "epoch": 0.6836047164514317, + "grad_norm": 0.825817883014679, + "learning_rate": 9.510397738922508e-06, + "loss": 0.4396, + "step": 2435 + }, + { + "epoch": 0.6838854576080854, + "grad_norm": 0.6038423180580139, + "learning_rate": 9.509692592116926e-06, + "loss": 0.4766, + "step": 2436 + }, + { + "epoch": 0.6841661987647389, + "grad_norm": 0.6919887661933899, + "learning_rate": 9.508986964064994e-06, + "loss": 0.4299, + "step": 2437 + }, + { + "epoch": 0.6844469399213925, + "grad_norm": 0.8460870385169983, + "learning_rate": 9.508280854842014e-06, + "loss": 0.4623, + "step": 2438 + }, + { + "epoch": 0.684727681078046, + "grad_norm": 0.9004149436950684, + "learning_rate": 9.507574264523337e-06, + "loss": 0.4913, + "step": 2439 + }, + { + "epoch": 0.6850084222346996, + "grad_norm": 0.6217430233955383, + "learning_rate": 9.506867193184363e-06, + "loss": 0.4478, + "step": 2440 + }, + { + "epoch": 0.6852891633913532, + "grad_norm": 0.8997867703437805, + "learning_rate": 9.50615964090055e-06, + "loss": 0.4657, + "step": 2441 + }, + { + "epoch": 0.6855699045480067, + "grad_norm": 0.9446470737457275, + "learning_rate": 9.505451607747402e-06, + "loss": 0.4673, + "step": 2442 + }, + { + "epoch": 0.6858506457046603, + "grad_norm": 0.6886206269264221, + "learning_rate": 9.504743093800474e-06, + "loss": 0.4502, + "step": 2443 + }, + { + "epoch": 0.6861313868613139, + "grad_norm": 0.764959990978241, + "learning_rate": 9.50403409913538e-06, + "loss": 0.4802, + "step": 2444 + }, + { + "epoch": 0.6864121280179675, + "grad_norm": 0.8891680240631104, + "learning_rate": 9.503324623827773e-06, + "loss": 0.5012, + "step": 2445 + }, + { + "epoch": 0.686692869174621, + "grad_norm": 0.930833101272583, + "learning_rate": 9.502614667953366e-06, + "loss": 0.4822, + "step": 2446 + }, + { + "epoch": 0.6869736103312746, + "grad_norm": 0.7132047414779663, + "learning_rate": 9.501904231587924e-06, + "loss": 0.4728, + "step": 2447 + }, + { + "epoch": 0.6872543514879281, + "grad_norm": 0.9185259342193604, + "learning_rate": 9.501193314807256e-06, + "loss": 0.4525, + "step": 2448 + }, + { + "epoch": 0.6875350926445817, + "grad_norm": 0.7687292695045471, + "learning_rate": 9.50048191768723e-06, + "loss": 0.4711, + "step": 2449 + }, + { + "epoch": 0.6878158338012352, + "grad_norm": 0.6602697372436523, + "learning_rate": 9.499770040303759e-06, + "loss": 0.4476, + "step": 2450 + }, + { + "epoch": 0.6880965749578888, + "grad_norm": 0.7810957431793213, + "learning_rate": 9.499057682732812e-06, + "loss": 0.4447, + "step": 2451 + }, + { + "epoch": 0.6883773161145423, + "grad_norm": 0.7534429430961609, + "learning_rate": 9.498344845050406e-06, + "loss": 0.479, + "step": 2452 + }, + { + "epoch": 0.688658057271196, + "grad_norm": 0.6799314618110657, + "learning_rate": 9.497631527332613e-06, + "loss": 0.4915, + "step": 2453 + }, + { + "epoch": 0.6889387984278496, + "grad_norm": 0.7317137122154236, + "learning_rate": 9.496917729655552e-06, + "loss": 0.4033, + "step": 2454 + }, + { + "epoch": 0.6892195395845031, + "grad_norm": 0.7076913118362427, + "learning_rate": 9.496203452095395e-06, + "loss": 0.4123, + "step": 2455 + }, + { + "epoch": 0.6895002807411567, + "grad_norm": 0.7963237166404724, + "learning_rate": 9.495488694728366e-06, + "loss": 0.4317, + "step": 2456 + }, + { + "epoch": 0.6897810218978102, + "grad_norm": 0.7819330096244812, + "learning_rate": 9.494773457630738e-06, + "loss": 0.4786, + "step": 2457 + }, + { + "epoch": 0.6900617630544638, + "grad_norm": 0.9064457416534424, + "learning_rate": 9.494057740878838e-06, + "loss": 0.471, + "step": 2458 + }, + { + "epoch": 0.6903425042111173, + "grad_norm": 0.7059385180473328, + "learning_rate": 9.493341544549044e-06, + "loss": 0.5004, + "step": 2459 + }, + { + "epoch": 0.6906232453677709, + "grad_norm": 0.7412230968475342, + "learning_rate": 9.492624868717782e-06, + "loss": 0.4656, + "step": 2460 + }, + { + "epoch": 0.6909039865244245, + "grad_norm": 0.8034632205963135, + "learning_rate": 9.49190771346153e-06, + "loss": 0.4688, + "step": 2461 + }, + { + "epoch": 0.6911847276810781, + "grad_norm": 0.8217840790748596, + "learning_rate": 9.491190078856822e-06, + "loss": 0.4753, + "step": 2462 + }, + { + "epoch": 0.6914654688377316, + "grad_norm": 0.7424009442329407, + "learning_rate": 9.490471964980236e-06, + "loss": 0.4754, + "step": 2463 + }, + { + "epoch": 0.6917462099943852, + "grad_norm": 0.7725020051002502, + "learning_rate": 9.48975337190841e-06, + "loss": 0.452, + "step": 2464 + }, + { + "epoch": 0.6920269511510387, + "grad_norm": 0.8121838569641113, + "learning_rate": 9.48903429971802e-06, + "loss": 0.5129, + "step": 2465 + }, + { + "epoch": 0.6923076923076923, + "grad_norm": 0.8661502003669739, + "learning_rate": 9.488314748485807e-06, + "loss": 0.4799, + "step": 2466 + }, + { + "epoch": 0.6925884334643458, + "grad_norm": 1.0192327499389648, + "learning_rate": 9.487594718288555e-06, + "loss": 0.4925, + "step": 2467 + }, + { + "epoch": 0.6928691746209994, + "grad_norm": 0.6838007569313049, + "learning_rate": 9.4868742092031e-06, + "loss": 0.4564, + "step": 2468 + }, + { + "epoch": 0.6931499157776531, + "grad_norm": 0.9444794058799744, + "learning_rate": 9.486153221306333e-06, + "loss": 0.4609, + "step": 2469 + }, + { + "epoch": 0.6934306569343066, + "grad_norm": 0.7631643414497375, + "learning_rate": 9.485431754675192e-06, + "loss": 0.4527, + "step": 2470 + }, + { + "epoch": 0.6937113980909602, + "grad_norm": 0.8071423172950745, + "learning_rate": 9.484709809386667e-06, + "loss": 0.4956, + "step": 2471 + }, + { + "epoch": 0.6939921392476137, + "grad_norm": 0.7490708827972412, + "learning_rate": 9.483987385517798e-06, + "loss": 0.4774, + "step": 2472 + }, + { + "epoch": 0.6942728804042673, + "grad_norm": 0.7812394499778748, + "learning_rate": 9.48326448314568e-06, + "loss": 0.4388, + "step": 2473 + }, + { + "epoch": 0.6945536215609208, + "grad_norm": 0.6985755562782288, + "learning_rate": 9.482541102347455e-06, + "loss": 0.4439, + "step": 2474 + }, + { + "epoch": 0.6948343627175744, + "grad_norm": 0.6900184750556946, + "learning_rate": 9.481817243200321e-06, + "loss": 0.4162, + "step": 2475 + }, + { + "epoch": 0.6951151038742279, + "grad_norm": 0.6658081412315369, + "learning_rate": 9.481092905781522e-06, + "loss": 0.4384, + "step": 2476 + }, + { + "epoch": 0.6953958450308815, + "grad_norm": 0.6750561594963074, + "learning_rate": 9.48036809016835e-06, + "loss": 0.4016, + "step": 2477 + }, + { + "epoch": 0.6956765861875351, + "grad_norm": 0.7680978178977966, + "learning_rate": 9.47964279643816e-06, + "loss": 0.4465, + "step": 2478 + }, + { + "epoch": 0.6959573273441887, + "grad_norm": 0.7434172630310059, + "learning_rate": 9.478917024668349e-06, + "loss": 0.4862, + "step": 2479 + }, + { + "epoch": 0.6962380685008422, + "grad_norm": 0.7565758228302002, + "learning_rate": 9.478190774936362e-06, + "loss": 0.4587, + "step": 2480 + }, + { + "epoch": 0.6965188096574958, + "grad_norm": 0.6876004338264465, + "learning_rate": 9.477464047319706e-06, + "loss": 0.4514, + "step": 2481 + }, + { + "epoch": 0.6967995508141493, + "grad_norm": 0.7988496422767639, + "learning_rate": 9.47673684189593e-06, + "loss": 0.498, + "step": 2482 + }, + { + "epoch": 0.6970802919708029, + "grad_norm": 0.6999518871307373, + "learning_rate": 9.476009158742638e-06, + "loss": 0.4252, + "step": 2483 + }, + { + "epoch": 0.6973610331274565, + "grad_norm": 0.6802830696105957, + "learning_rate": 9.475280997937482e-06, + "loss": 0.4479, + "step": 2484 + }, + { + "epoch": 0.69764177428411, + "grad_norm": 0.7858578562736511, + "learning_rate": 9.474552359558167e-06, + "loss": 0.4592, + "step": 2485 + }, + { + "epoch": 0.6979225154407637, + "grad_norm": 0.7689603567123413, + "learning_rate": 9.473823243682451e-06, + "loss": 0.4533, + "step": 2486 + }, + { + "epoch": 0.6982032565974172, + "grad_norm": 0.7005722522735596, + "learning_rate": 9.473093650388138e-06, + "loss": 0.458, + "step": 2487 + }, + { + "epoch": 0.6984839977540708, + "grad_norm": 0.7821038961410522, + "learning_rate": 9.472363579753088e-06, + "loss": 0.4634, + "step": 2488 + }, + { + "epoch": 0.6987647389107243, + "grad_norm": 0.8071959018707275, + "learning_rate": 9.471633031855208e-06, + "loss": 0.5033, + "step": 2489 + }, + { + "epoch": 0.6990454800673779, + "grad_norm": 0.7244101762771606, + "learning_rate": 9.47090200677246e-06, + "loss": 0.416, + "step": 2490 + }, + { + "epoch": 0.6993262212240314, + "grad_norm": 0.7001928091049194, + "learning_rate": 9.47017050458285e-06, + "loss": 0.4284, + "step": 2491 + }, + { + "epoch": 0.699606962380685, + "grad_norm": 0.6796762943267822, + "learning_rate": 9.469438525364442e-06, + "loss": 0.4183, + "step": 2492 + }, + { + "epoch": 0.6998877035373385, + "grad_norm": 0.6669578552246094, + "learning_rate": 9.46870606919535e-06, + "loss": 0.4371, + "step": 2493 + }, + { + "epoch": 0.7001684446939921, + "grad_norm": 0.7107380032539368, + "learning_rate": 9.467973136153734e-06, + "loss": 0.4376, + "step": 2494 + }, + { + "epoch": 0.7004491858506458, + "grad_norm": 0.6801512241363525, + "learning_rate": 9.467239726317811e-06, + "loss": 0.4525, + "step": 2495 + }, + { + "epoch": 0.7007299270072993, + "grad_norm": 0.7749803066253662, + "learning_rate": 9.466505839765842e-06, + "loss": 0.4208, + "step": 2496 + }, + { + "epoch": 0.7010106681639529, + "grad_norm": 0.7285341620445251, + "learning_rate": 9.465771476576146e-06, + "loss": 0.4433, + "step": 2497 + }, + { + "epoch": 0.7012914093206064, + "grad_norm": 0.8107213377952576, + "learning_rate": 9.46503663682709e-06, + "loss": 0.5056, + "step": 2498 + }, + { + "epoch": 0.70157215047726, + "grad_norm": 0.7753007411956787, + "learning_rate": 9.46430132059709e-06, + "loss": 0.4399, + "step": 2499 + }, + { + "epoch": 0.7018528916339135, + "grad_norm": 0.8164193630218506, + "learning_rate": 9.463565527964612e-06, + "loss": 0.4448, + "step": 2500 + }, + { + "epoch": 0.7021336327905671, + "grad_norm": 0.6882041096687317, + "learning_rate": 9.462829259008182e-06, + "loss": 0.4339, + "step": 2501 + }, + { + "epoch": 0.7024143739472206, + "grad_norm": 0.8022362589836121, + "learning_rate": 9.462092513806364e-06, + "loss": 0.4391, + "step": 2502 + }, + { + "epoch": 0.7026951151038743, + "grad_norm": 0.6818189024925232, + "learning_rate": 9.461355292437782e-06, + "loss": 0.4583, + "step": 2503 + }, + { + "epoch": 0.7029758562605278, + "grad_norm": 0.8273773193359375, + "learning_rate": 9.460617594981104e-06, + "loss": 0.5037, + "step": 2504 + }, + { + "epoch": 0.7032565974171814, + "grad_norm": 0.7536837458610535, + "learning_rate": 9.459879421515057e-06, + "loss": 0.498, + "step": 2505 + }, + { + "epoch": 0.7035373385738349, + "grad_norm": 0.6416440606117249, + "learning_rate": 9.45914077211841e-06, + "loss": 0.44, + "step": 2506 + }, + { + "epoch": 0.7038180797304885, + "grad_norm": 0.7490890622138977, + "learning_rate": 9.458401646869992e-06, + "loss": 0.4497, + "step": 2507 + }, + { + "epoch": 0.704098820887142, + "grad_norm": 0.9517717361450195, + "learning_rate": 9.457662045848674e-06, + "loss": 0.4104, + "step": 2508 + }, + { + "epoch": 0.7043795620437956, + "grad_norm": 0.7885116338729858, + "learning_rate": 9.456921969133385e-06, + "loss": 0.4445, + "step": 2509 + }, + { + "epoch": 0.7046603032004491, + "grad_norm": 0.8300957083702087, + "learning_rate": 9.456181416803096e-06, + "loss": 0.4213, + "step": 2510 + }, + { + "epoch": 0.7049410443571027, + "grad_norm": 0.8503570556640625, + "learning_rate": 9.45544038893684e-06, + "loss": 0.4574, + "step": 2511 + }, + { + "epoch": 0.7052217855137564, + "grad_norm": 0.6946449875831604, + "learning_rate": 9.454698885613691e-06, + "loss": 0.4484, + "step": 2512 + }, + { + "epoch": 0.7055025266704099, + "grad_norm": 0.7207089066505432, + "learning_rate": 9.453956906912779e-06, + "loss": 0.4673, + "step": 2513 + }, + { + "epoch": 0.7057832678270635, + "grad_norm": 0.8768999576568604, + "learning_rate": 9.453214452913284e-06, + "loss": 0.44, + "step": 2514 + }, + { + "epoch": 0.706064008983717, + "grad_norm": 0.8195427656173706, + "learning_rate": 9.452471523694434e-06, + "loss": 0.4721, + "step": 2515 + }, + { + "epoch": 0.7063447501403706, + "grad_norm": 0.7296256422996521, + "learning_rate": 9.45172811933551e-06, + "loss": 0.4288, + "step": 2516 + }, + { + "epoch": 0.7066254912970241, + "grad_norm": 0.7585498094558716, + "learning_rate": 9.450984239915848e-06, + "loss": 0.4691, + "step": 2517 + }, + { + "epoch": 0.7069062324536777, + "grad_norm": 0.7398808598518372, + "learning_rate": 9.450239885514824e-06, + "loss": 0.4337, + "step": 2518 + }, + { + "epoch": 0.7071869736103312, + "grad_norm": 0.8726441264152527, + "learning_rate": 9.449495056211874e-06, + "loss": 0.466, + "step": 2519 + }, + { + "epoch": 0.7074677147669849, + "grad_norm": 0.7603203654289246, + "learning_rate": 9.448749752086482e-06, + "loss": 0.4295, + "step": 2520 + }, + { + "epoch": 0.7077484559236384, + "grad_norm": 0.8504489660263062, + "learning_rate": 9.448003973218181e-06, + "loss": 0.4655, + "step": 2521 + }, + { + "epoch": 0.708029197080292, + "grad_norm": 0.9464203715324402, + "learning_rate": 9.447257719686557e-06, + "loss": 0.4404, + "step": 2522 + }, + { + "epoch": 0.7083099382369455, + "grad_norm": 0.8075183033943176, + "learning_rate": 9.446510991571243e-06, + "loss": 0.4548, + "step": 2523 + }, + { + "epoch": 0.7085906793935991, + "grad_norm": 0.7006256580352783, + "learning_rate": 9.44576378895193e-06, + "loss": 0.4606, + "step": 2524 + }, + { + "epoch": 0.7088714205502527, + "grad_norm": 0.6948923468589783, + "learning_rate": 9.445016111908349e-06, + "loss": 0.4215, + "step": 2525 + }, + { + "epoch": 0.7091521617069062, + "grad_norm": 0.8745613694190979, + "learning_rate": 9.444267960520292e-06, + "loss": 0.4809, + "step": 2526 + }, + { + "epoch": 0.7094329028635598, + "grad_norm": 0.8244311809539795, + "learning_rate": 9.443519334867595e-06, + "loss": 0.5293, + "step": 2527 + }, + { + "epoch": 0.7097136440202133, + "grad_norm": 0.750098705291748, + "learning_rate": 9.442770235030145e-06, + "loss": 0.4484, + "step": 2528 + }, + { + "epoch": 0.709994385176867, + "grad_norm": 0.8122600317001343, + "learning_rate": 9.442020661087885e-06, + "loss": 0.4568, + "step": 2529 + }, + { + "epoch": 0.7102751263335205, + "grad_norm": 0.7557501792907715, + "learning_rate": 9.441270613120803e-06, + "loss": 0.4357, + "step": 2530 + }, + { + "epoch": 0.7105558674901741, + "grad_norm": 0.820770263671875, + "learning_rate": 9.44052009120894e-06, + "loss": 0.4499, + "step": 2531 + }, + { + "epoch": 0.7108366086468276, + "grad_norm": 0.664503276348114, + "learning_rate": 9.439769095432387e-06, + "loss": 0.4077, + "step": 2532 + }, + { + "epoch": 0.7111173498034812, + "grad_norm": 0.7776325345039368, + "learning_rate": 9.439017625871285e-06, + "loss": 0.4862, + "step": 2533 + }, + { + "epoch": 0.7113980909601347, + "grad_norm": 0.7743126153945923, + "learning_rate": 9.438265682605826e-06, + "loss": 0.4426, + "step": 2534 + }, + { + "epoch": 0.7116788321167883, + "grad_norm": 0.8332822918891907, + "learning_rate": 9.437513265716253e-06, + "loss": 0.4954, + "step": 2535 + }, + { + "epoch": 0.7119595732734418, + "grad_norm": 0.7532265186309814, + "learning_rate": 9.436760375282858e-06, + "loss": 0.4424, + "step": 2536 + }, + { + "epoch": 0.7122403144300955, + "grad_norm": 0.6855920553207397, + "learning_rate": 9.436007011385988e-06, + "loss": 0.4272, + "step": 2537 + }, + { + "epoch": 0.712521055586749, + "grad_norm": 0.6882874965667725, + "learning_rate": 9.435253174106036e-06, + "loss": 0.4638, + "step": 2538 + }, + { + "epoch": 0.7128017967434026, + "grad_norm": 0.8163425326347351, + "learning_rate": 9.434498863523444e-06, + "loss": 0.4793, + "step": 2539 + }, + { + "epoch": 0.7130825379000562, + "grad_norm": 0.762488842010498, + "learning_rate": 9.433744079718712e-06, + "loss": 0.4574, + "step": 2540 + }, + { + "epoch": 0.7133632790567097, + "grad_norm": 0.6940890550613403, + "learning_rate": 9.432988822772382e-06, + "loss": 0.4628, + "step": 2541 + }, + { + "epoch": 0.7136440202133633, + "grad_norm": 0.6820698976516724, + "learning_rate": 9.432233092765052e-06, + "loss": 0.4578, + "step": 2542 + }, + { + "epoch": 0.7139247613700168, + "grad_norm": 0.8692309856414795, + "learning_rate": 9.43147688977737e-06, + "loss": 0.5115, + "step": 2543 + }, + { + "epoch": 0.7142055025266704, + "grad_norm": 0.654083251953125, + "learning_rate": 9.43072021389003e-06, + "loss": 0.4836, + "step": 2544 + }, + { + "epoch": 0.714486243683324, + "grad_norm": 0.6994615793228149, + "learning_rate": 9.429963065183781e-06, + "loss": 0.4982, + "step": 2545 + }, + { + "epoch": 0.7147669848399776, + "grad_norm": 0.8428835272789001, + "learning_rate": 9.429205443739424e-06, + "loss": 0.4899, + "step": 2546 + }, + { + "epoch": 0.7150477259966311, + "grad_norm": 0.7471094727516174, + "learning_rate": 9.428447349637804e-06, + "loss": 0.4135, + "step": 2547 + }, + { + "epoch": 0.7153284671532847, + "grad_norm": 0.7090533375740051, + "learning_rate": 9.427688782959821e-06, + "loss": 0.4321, + "step": 2548 + }, + { + "epoch": 0.7156092083099382, + "grad_norm": 0.8219285011291504, + "learning_rate": 9.426929743786426e-06, + "loss": 0.4222, + "step": 2549 + }, + { + "epoch": 0.7158899494665918, + "grad_norm": 0.7486048936843872, + "learning_rate": 9.42617023219862e-06, + "loss": 0.4577, + "step": 2550 + }, + { + "epoch": 0.7161706906232453, + "grad_norm": 0.751075029373169, + "learning_rate": 9.42541024827745e-06, + "loss": 0.5137, + "step": 2551 + }, + { + "epoch": 0.7164514317798989, + "grad_norm": 0.7988788485527039, + "learning_rate": 9.424649792104016e-06, + "loss": 0.4987, + "step": 2552 + }, + { + "epoch": 0.7167321729365524, + "grad_norm": 0.6931146383285522, + "learning_rate": 9.423888863759473e-06, + "loss": 0.4494, + "step": 2553 + }, + { + "epoch": 0.7170129140932061, + "grad_norm": 0.7630744576454163, + "learning_rate": 9.42312746332502e-06, + "loss": 0.4668, + "step": 2554 + }, + { + "epoch": 0.7172936552498597, + "grad_norm": 0.928024411201477, + "learning_rate": 9.42236559088191e-06, + "loss": 0.4995, + "step": 2555 + }, + { + "epoch": 0.7175743964065132, + "grad_norm": 0.7794921398162842, + "learning_rate": 9.421603246511446e-06, + "loss": 0.4671, + "step": 2556 + }, + { + "epoch": 0.7178551375631668, + "grad_norm": 0.7536506056785583, + "learning_rate": 9.42084043029498e-06, + "loss": 0.4747, + "step": 2557 + }, + { + "epoch": 0.7181358787198203, + "grad_norm": 0.7516429424285889, + "learning_rate": 9.420077142313912e-06, + "loss": 0.4614, + "step": 2558 + }, + { + "epoch": 0.7184166198764739, + "grad_norm": 0.8014383912086487, + "learning_rate": 9.419313382649699e-06, + "loss": 0.4381, + "step": 2559 + }, + { + "epoch": 0.7186973610331274, + "grad_norm": 0.8374093770980835, + "learning_rate": 9.418549151383846e-06, + "loss": 0.4249, + "step": 2560 + }, + { + "epoch": 0.718978102189781, + "grad_norm": 0.8145158886909485, + "learning_rate": 9.417784448597901e-06, + "loss": 0.4368, + "step": 2561 + }, + { + "epoch": 0.7192588433464346, + "grad_norm": 0.7724982500076294, + "learning_rate": 9.417019274373476e-06, + "loss": 0.4103, + "step": 2562 + }, + { + "epoch": 0.7195395845030882, + "grad_norm": 0.7369022965431213, + "learning_rate": 9.41625362879222e-06, + "loss": 0.4599, + "step": 2563 + }, + { + "epoch": 0.7198203256597417, + "grad_norm": 0.9183072447776794, + "learning_rate": 9.415487511935838e-06, + "loss": 0.416, + "step": 2564 + }, + { + "epoch": 0.7201010668163953, + "grad_norm": 0.8330563902854919, + "learning_rate": 9.41472092388609e-06, + "loss": 0.4918, + "step": 2565 + }, + { + "epoch": 0.7203818079730488, + "grad_norm": 0.7774697542190552, + "learning_rate": 9.413953864724777e-06, + "loss": 0.4765, + "step": 2566 + }, + { + "epoch": 0.7206625491297024, + "grad_norm": 0.8402326703071594, + "learning_rate": 9.413186334533755e-06, + "loss": 0.446, + "step": 2567 + }, + { + "epoch": 0.720943290286356, + "grad_norm": 0.8870805501937866, + "learning_rate": 9.41241833339493e-06, + "loss": 0.4717, + "step": 2568 + }, + { + "epoch": 0.7212240314430095, + "grad_norm": 0.7984758615493774, + "learning_rate": 9.411649861390261e-06, + "loss": 0.4089, + "step": 2569 + }, + { + "epoch": 0.7215047725996631, + "grad_norm": 0.8126974105834961, + "learning_rate": 9.410880918601755e-06, + "loss": 0.4458, + "step": 2570 + }, + { + "epoch": 0.7217855137563167, + "grad_norm": 0.6914010643959045, + "learning_rate": 9.410111505111466e-06, + "loss": 0.4407, + "step": 2571 + }, + { + "epoch": 0.7220662549129703, + "grad_norm": 0.7459649443626404, + "learning_rate": 9.4093416210015e-06, + "loss": 0.4435, + "step": 2572 + }, + { + "epoch": 0.7223469960696238, + "grad_norm": 0.7722095847129822, + "learning_rate": 9.408571266354017e-06, + "loss": 0.4375, + "step": 2573 + }, + { + "epoch": 0.7226277372262774, + "grad_norm": 0.8616966009140015, + "learning_rate": 9.407800441251224e-06, + "loss": 0.4755, + "step": 2574 + }, + { + "epoch": 0.7229084783829309, + "grad_norm": 0.7333300709724426, + "learning_rate": 9.407029145775377e-06, + "loss": 0.4368, + "step": 2575 + }, + { + "epoch": 0.7231892195395845, + "grad_norm": 0.805834174156189, + "learning_rate": 9.406257380008788e-06, + "loss": 0.4841, + "step": 2576 + }, + { + "epoch": 0.723469960696238, + "grad_norm": 0.8962896466255188, + "learning_rate": 9.40548514403381e-06, + "loss": 0.4578, + "step": 2577 + }, + { + "epoch": 0.7237507018528916, + "grad_norm": 0.8885279893875122, + "learning_rate": 9.404712437932852e-06, + "loss": 0.476, + "step": 2578 + }, + { + "epoch": 0.7240314430095453, + "grad_norm": 0.8999950289726257, + "learning_rate": 9.403939261788375e-06, + "loss": 0.4839, + "step": 2579 + }, + { + "epoch": 0.7243121841661988, + "grad_norm": 0.7031996250152588, + "learning_rate": 9.403165615682888e-06, + "loss": 0.4066, + "step": 2580 + }, + { + "epoch": 0.7245929253228524, + "grad_norm": 0.8968575596809387, + "learning_rate": 9.402391499698946e-06, + "loss": 0.4359, + "step": 2581 + }, + { + "epoch": 0.7248736664795059, + "grad_norm": 0.8842137455940247, + "learning_rate": 9.40161691391916e-06, + "loss": 0.5041, + "step": 2582 + }, + { + "epoch": 0.7251544076361595, + "grad_norm": 0.7458582520484924, + "learning_rate": 9.400841858426188e-06, + "loss": 0.3874, + "step": 2583 + }, + { + "epoch": 0.725435148792813, + "grad_norm": 0.912441611289978, + "learning_rate": 9.400066333302742e-06, + "loss": 0.5015, + "step": 2584 + }, + { + "epoch": 0.7257158899494666, + "grad_norm": 0.8796662092208862, + "learning_rate": 9.399290338631578e-06, + "loss": 0.4423, + "step": 2585 + }, + { + "epoch": 0.7259966311061201, + "grad_norm": 0.8011311292648315, + "learning_rate": 9.398513874495506e-06, + "loss": 0.4452, + "step": 2586 + }, + { + "epoch": 0.7262773722627737, + "grad_norm": 0.8862401843070984, + "learning_rate": 9.397736940977387e-06, + "loss": 0.4832, + "step": 2587 + }, + { + "epoch": 0.7265581134194273, + "grad_norm": 0.7400742173194885, + "learning_rate": 9.39695953816013e-06, + "loss": 0.4102, + "step": 2588 + }, + { + "epoch": 0.7268388545760809, + "grad_norm": 0.6960920691490173, + "learning_rate": 9.39618166612669e-06, + "loss": 0.4858, + "step": 2589 + }, + { + "epoch": 0.7271195957327344, + "grad_norm": 0.7522947788238525, + "learning_rate": 9.395403324960084e-06, + "loss": 0.4098, + "step": 2590 + }, + { + "epoch": 0.727400336889388, + "grad_norm": 0.8645053505897522, + "learning_rate": 9.394624514743368e-06, + "loss": 0.459, + "step": 2591 + }, + { + "epoch": 0.7276810780460415, + "grad_norm": 0.787739634513855, + "learning_rate": 9.39384523555965e-06, + "loss": 0.4518, + "step": 2592 + }, + { + "epoch": 0.7279618192026951, + "grad_norm": 0.6793380379676819, + "learning_rate": 9.393065487492095e-06, + "loss": 0.4389, + "step": 2593 + }, + { + "epoch": 0.7282425603593486, + "grad_norm": 0.9285565614700317, + "learning_rate": 9.392285270623908e-06, + "loss": 0.4026, + "step": 2594 + }, + { + "epoch": 0.7285233015160022, + "grad_norm": 0.7372851371765137, + "learning_rate": 9.391504585038353e-06, + "loss": 0.4188, + "step": 2595 + }, + { + "epoch": 0.7288040426726559, + "grad_norm": 0.763027012348175, + "learning_rate": 9.390723430818735e-06, + "loss": 0.4319, + "step": 2596 + }, + { + "epoch": 0.7290847838293094, + "grad_norm": 0.7152758836746216, + "learning_rate": 9.389941808048417e-06, + "loss": 0.4743, + "step": 2597 + }, + { + "epoch": 0.729365524985963, + "grad_norm": 0.7588361501693726, + "learning_rate": 9.38915971681081e-06, + "loss": 0.4491, + "step": 2598 + }, + { + "epoch": 0.7296462661426165, + "grad_norm": 0.7377399206161499, + "learning_rate": 9.388377157189373e-06, + "loss": 0.432, + "step": 2599 + }, + { + "epoch": 0.7299270072992701, + "grad_norm": 0.7724609375, + "learning_rate": 9.387594129267612e-06, + "loss": 0.4394, + "step": 2600 + }, + { + "epoch": 0.7302077484559236, + "grad_norm": 0.7818425297737122, + "learning_rate": 9.386810633129093e-06, + "loss": 0.4068, + "step": 2601 + }, + { + "epoch": 0.7304884896125772, + "grad_norm": 0.814519464969635, + "learning_rate": 9.386026668857421e-06, + "loss": 0.4305, + "step": 2602 + }, + { + "epoch": 0.7307692307692307, + "grad_norm": 0.7100637555122375, + "learning_rate": 9.385242236536259e-06, + "loss": 0.4704, + "step": 2603 + }, + { + "epoch": 0.7310499719258844, + "grad_norm": 0.8754744529724121, + "learning_rate": 9.384457336249316e-06, + "loss": 0.4577, + "step": 2604 + }, + { + "epoch": 0.7313307130825379, + "grad_norm": 0.6805042624473572, + "learning_rate": 9.38367196808035e-06, + "loss": 0.4422, + "step": 2605 + }, + { + "epoch": 0.7316114542391915, + "grad_norm": 0.9314867258071899, + "learning_rate": 9.382886132113172e-06, + "loss": 0.4575, + "step": 2606 + }, + { + "epoch": 0.731892195395845, + "grad_norm": 0.7781674265861511, + "learning_rate": 9.382099828431643e-06, + "loss": 0.4258, + "step": 2607 + }, + { + "epoch": 0.7321729365524986, + "grad_norm": 0.7923882603645325, + "learning_rate": 9.381313057119669e-06, + "loss": 0.497, + "step": 2608 + }, + { + "epoch": 0.7324536777091522, + "grad_norm": 0.7565465569496155, + "learning_rate": 9.380525818261211e-06, + "loss": 0.466, + "step": 2609 + }, + { + "epoch": 0.7327344188658057, + "grad_norm": 0.7454366087913513, + "learning_rate": 9.37973811194028e-06, + "loss": 0.4508, + "step": 2610 + }, + { + "epoch": 0.7330151600224593, + "grad_norm": 0.8961068987846375, + "learning_rate": 9.378949938240932e-06, + "loss": 0.4454, + "step": 2611 + }, + { + "epoch": 0.7332959011791128, + "grad_norm": 0.7383410930633545, + "learning_rate": 9.378161297247278e-06, + "loss": 0.4532, + "step": 2612 + }, + { + "epoch": 0.7335766423357665, + "grad_norm": 0.6697604060173035, + "learning_rate": 9.377372189043477e-06, + "loss": 0.4546, + "step": 2613 + }, + { + "epoch": 0.73385738349242, + "grad_norm": 0.7382734417915344, + "learning_rate": 9.376582613713736e-06, + "loss": 0.4346, + "step": 2614 + }, + { + "epoch": 0.7341381246490736, + "grad_norm": 0.7783324718475342, + "learning_rate": 9.375792571342314e-06, + "loss": 0.4422, + "step": 2615 + }, + { + "epoch": 0.7344188658057271, + "grad_norm": 0.6934186816215515, + "learning_rate": 9.375002062013521e-06, + "loss": 0.4458, + "step": 2616 + }, + { + "epoch": 0.7346996069623807, + "grad_norm": 0.7689214944839478, + "learning_rate": 9.374211085811714e-06, + "loss": 0.4635, + "step": 2617 + }, + { + "epoch": 0.7349803481190342, + "grad_norm": 0.7955026030540466, + "learning_rate": 9.373419642821302e-06, + "loss": 0.4586, + "step": 2618 + }, + { + "epoch": 0.7352610892756878, + "grad_norm": 0.7794889211654663, + "learning_rate": 9.372627733126743e-06, + "loss": 0.4527, + "step": 2619 + }, + { + "epoch": 0.7355418304323413, + "grad_norm": 0.7205256223678589, + "learning_rate": 9.371835356812542e-06, + "loss": 0.4143, + "step": 2620 + }, + { + "epoch": 0.735822571588995, + "grad_norm": 0.8978113532066345, + "learning_rate": 9.37104251396326e-06, + "loss": 0.4683, + "step": 2621 + }, + { + "epoch": 0.7361033127456486, + "grad_norm": 0.7564862966537476, + "learning_rate": 9.370249204663502e-06, + "loss": 0.4187, + "step": 2622 + }, + { + "epoch": 0.7363840539023021, + "grad_norm": 0.7632102966308594, + "learning_rate": 9.369455428997925e-06, + "loss": 0.4575, + "step": 2623 + }, + { + "epoch": 0.7366647950589557, + "grad_norm": 0.8099504113197327, + "learning_rate": 9.368661187051238e-06, + "loss": 0.4418, + "step": 2624 + }, + { + "epoch": 0.7369455362156092, + "grad_norm": 0.8187891244888306, + "learning_rate": 9.367866478908194e-06, + "loss": 0.4875, + "step": 2625 + }, + { + "epoch": 0.7372262773722628, + "grad_norm": 0.6581400632858276, + "learning_rate": 9.367071304653603e-06, + "loss": 0.399, + "step": 2626 + }, + { + "epoch": 0.7375070185289163, + "grad_norm": 0.8173044323921204, + "learning_rate": 9.366275664372317e-06, + "loss": 0.4899, + "step": 2627 + }, + { + "epoch": 0.7377877596855699, + "grad_norm": 0.8202473521232605, + "learning_rate": 9.365479558149246e-06, + "loss": 0.4487, + "step": 2628 + }, + { + "epoch": 0.7380685008422234, + "grad_norm": 0.7705366015434265, + "learning_rate": 9.364682986069344e-06, + "loss": 0.5156, + "step": 2629 + }, + { + "epoch": 0.7383492419988771, + "grad_norm": 0.6788835525512695, + "learning_rate": 9.363885948217615e-06, + "loss": 0.4232, + "step": 2630 + }, + { + "epoch": 0.7386299831555306, + "grad_norm": 0.8718878030776978, + "learning_rate": 9.363088444679116e-06, + "loss": 0.4578, + "step": 2631 + }, + { + "epoch": 0.7389107243121842, + "grad_norm": 0.7273135781288147, + "learning_rate": 9.36229047553895e-06, + "loss": 0.4215, + "step": 2632 + }, + { + "epoch": 0.7391914654688377, + "grad_norm": 0.789441704750061, + "learning_rate": 9.361492040882272e-06, + "loss": 0.4474, + "step": 2633 + }, + { + "epoch": 0.7394722066254913, + "grad_norm": 0.8108557462692261, + "learning_rate": 9.360693140794285e-06, + "loss": 0.4684, + "step": 2634 + }, + { + "epoch": 0.7397529477821448, + "grad_norm": 0.6832651495933533, + "learning_rate": 9.359893775360244e-06, + "loss": 0.4142, + "step": 2635 + }, + { + "epoch": 0.7400336889387984, + "grad_norm": 0.8333257436752319, + "learning_rate": 9.35909394466545e-06, + "loss": 0.4751, + "step": 2636 + }, + { + "epoch": 0.740314430095452, + "grad_norm": 0.8688099980354309, + "learning_rate": 9.35829364879526e-06, + "loss": 0.5005, + "step": 2637 + }, + { + "epoch": 0.7405951712521056, + "grad_norm": 0.7191597819328308, + "learning_rate": 9.357492887835073e-06, + "loss": 0.426, + "step": 2638 + }, + { + "epoch": 0.7408759124087592, + "grad_norm": 0.8580526113510132, + "learning_rate": 9.356691661870342e-06, + "loss": 0.4623, + "step": 2639 + }, + { + "epoch": 0.7411566535654127, + "grad_norm": 0.80132657289505, + "learning_rate": 9.355889970986571e-06, + "loss": 0.4536, + "step": 2640 + }, + { + "epoch": 0.7414373947220663, + "grad_norm": 0.7642393708229065, + "learning_rate": 9.355087815269307e-06, + "loss": 0.4646, + "step": 2641 + }, + { + "epoch": 0.7417181358787198, + "grad_norm": 0.7316069006919861, + "learning_rate": 9.354285194804156e-06, + "loss": 0.4471, + "step": 2642 + }, + { + "epoch": 0.7419988770353734, + "grad_norm": 0.7916228175163269, + "learning_rate": 9.353482109676767e-06, + "loss": 0.4837, + "step": 2643 + }, + { + "epoch": 0.7422796181920269, + "grad_norm": 0.7023115158081055, + "learning_rate": 9.352678559972839e-06, + "loss": 0.451, + "step": 2644 + }, + { + "epoch": 0.7425603593486805, + "grad_norm": 0.7417224049568176, + "learning_rate": 9.35187454577812e-06, + "loss": 0.437, + "step": 2645 + }, + { + "epoch": 0.742841100505334, + "grad_norm": 0.8513741493225098, + "learning_rate": 9.351070067178416e-06, + "loss": 0.4638, + "step": 2646 + }, + { + "epoch": 0.7431218416619877, + "grad_norm": 0.758176326751709, + "learning_rate": 9.350265124259571e-06, + "loss": 0.4392, + "step": 2647 + }, + { + "epoch": 0.7434025828186412, + "grad_norm": 0.7236112952232361, + "learning_rate": 9.349459717107484e-06, + "loss": 0.4508, + "step": 2648 + }, + { + "epoch": 0.7436833239752948, + "grad_norm": 0.7783223390579224, + "learning_rate": 9.348653845808103e-06, + "loss": 0.4326, + "step": 2649 + }, + { + "epoch": 0.7439640651319483, + "grad_norm": 0.8385047316551208, + "learning_rate": 9.347847510447427e-06, + "loss": 0.4438, + "step": 2650 + }, + { + "epoch": 0.7442448062886019, + "grad_norm": 0.7798228859901428, + "learning_rate": 9.347040711111501e-06, + "loss": 0.4449, + "step": 2651 + }, + { + "epoch": 0.7445255474452555, + "grad_norm": 0.7581650018692017, + "learning_rate": 9.346233447886424e-06, + "loss": 0.4328, + "step": 2652 + }, + { + "epoch": 0.744806288601909, + "grad_norm": 0.8718856573104858, + "learning_rate": 9.34542572085834e-06, + "loss": 0.5039, + "step": 2653 + }, + { + "epoch": 0.7450870297585626, + "grad_norm": 0.9271716475486755, + "learning_rate": 9.344617530113446e-06, + "loss": 0.4096, + "step": 2654 + }, + { + "epoch": 0.7453677709152162, + "grad_norm": 0.8231848478317261, + "learning_rate": 9.343808875737985e-06, + "loss": 0.5526, + "step": 2655 + }, + { + "epoch": 0.7456485120718698, + "grad_norm": 0.8132502436637878, + "learning_rate": 9.342999757818256e-06, + "loss": 0.4904, + "step": 2656 + }, + { + "epoch": 0.7459292532285233, + "grad_norm": 0.7610129714012146, + "learning_rate": 9.342190176440598e-06, + "loss": 0.4681, + "step": 2657 + }, + { + "epoch": 0.7462099943851769, + "grad_norm": 0.6845329999923706, + "learning_rate": 9.341380131691406e-06, + "loss": 0.4222, + "step": 2658 + }, + { + "epoch": 0.7464907355418304, + "grad_norm": 0.7476111650466919, + "learning_rate": 9.340569623657127e-06, + "loss": 0.4427, + "step": 2659 + }, + { + "epoch": 0.746771476698484, + "grad_norm": 0.9219915866851807, + "learning_rate": 9.339758652424246e-06, + "loss": 0.4787, + "step": 2660 + }, + { + "epoch": 0.7470522178551375, + "grad_norm": 0.8040258884429932, + "learning_rate": 9.338947218079312e-06, + "loss": 0.4732, + "step": 2661 + }, + { + "epoch": 0.7473329590117911, + "grad_norm": 0.7521624565124512, + "learning_rate": 9.338135320708912e-06, + "loss": 0.5128, + "step": 2662 + }, + { + "epoch": 0.7476137001684446, + "grad_norm": 0.810127854347229, + "learning_rate": 9.337322960399686e-06, + "loss": 0.3961, + "step": 2663 + }, + { + "epoch": 0.7478944413250983, + "grad_norm": 0.8981438279151917, + "learning_rate": 9.336510137238328e-06, + "loss": 0.4779, + "step": 2664 + }, + { + "epoch": 0.7481751824817519, + "grad_norm": 0.8440060615539551, + "learning_rate": 9.335696851311573e-06, + "loss": 0.4866, + "step": 2665 + }, + { + "epoch": 0.7484559236384054, + "grad_norm": 1.0122367143630981, + "learning_rate": 9.334883102706214e-06, + "loss": 0.491, + "step": 2666 + }, + { + "epoch": 0.748736664795059, + "grad_norm": 0.8927567601203918, + "learning_rate": 9.334068891509087e-06, + "loss": 0.51, + "step": 2667 + }, + { + "epoch": 0.7490174059517125, + "grad_norm": 0.7860920429229736, + "learning_rate": 9.333254217807079e-06, + "loss": 0.4393, + "step": 2668 + }, + { + "epoch": 0.7492981471083661, + "grad_norm": 0.8904474973678589, + "learning_rate": 9.332439081687128e-06, + "loss": 0.4185, + "step": 2669 + }, + { + "epoch": 0.7495788882650196, + "grad_norm": 0.839434802532196, + "learning_rate": 9.331623483236218e-06, + "loss": 0.4436, + "step": 2670 + }, + { + "epoch": 0.7498596294216732, + "grad_norm": 0.8348051309585571, + "learning_rate": 9.330807422541388e-06, + "loss": 0.3844, + "step": 2671 + }, + { + "epoch": 0.7501403705783268, + "grad_norm": 0.8297469615936279, + "learning_rate": 9.329990899689723e-06, + "loss": 0.4686, + "step": 2672 + }, + { + "epoch": 0.7504211117349804, + "grad_norm": 0.7008973956108093, + "learning_rate": 9.329173914768352e-06, + "loss": 0.4336, + "step": 2673 + }, + { + "epoch": 0.7507018528916339, + "grad_norm": 0.7615330219268799, + "learning_rate": 9.328356467864466e-06, + "loss": 0.4789, + "step": 2674 + }, + { + "epoch": 0.7509825940482875, + "grad_norm": 0.850792407989502, + "learning_rate": 9.327538559065292e-06, + "loss": 0.4828, + "step": 2675 + }, + { + "epoch": 0.751263335204941, + "grad_norm": 0.7741012573242188, + "learning_rate": 9.326720188458113e-06, + "loss": 0.4609, + "step": 2676 + }, + { + "epoch": 0.7515440763615946, + "grad_norm": 0.6990548372268677, + "learning_rate": 9.325901356130262e-06, + "loss": 0.4384, + "step": 2677 + }, + { + "epoch": 0.7518248175182481, + "grad_norm": 0.7923767566680908, + "learning_rate": 9.325082062169122e-06, + "loss": 0.4596, + "step": 2678 + }, + { + "epoch": 0.7521055586749017, + "grad_norm": 0.7546195387840271, + "learning_rate": 9.324262306662118e-06, + "loss": 0.4544, + "step": 2679 + }, + { + "epoch": 0.7523862998315554, + "grad_norm": 0.7449183464050293, + "learning_rate": 9.323442089696731e-06, + "loss": 0.4462, + "step": 2680 + }, + { + "epoch": 0.7526670409882089, + "grad_norm": 0.7192502021789551, + "learning_rate": 9.322621411360492e-06, + "loss": 0.4377, + "step": 2681 + }, + { + "epoch": 0.7529477821448625, + "grad_norm": 0.822742223739624, + "learning_rate": 9.321800271740974e-06, + "loss": 0.4444, + "step": 2682 + }, + { + "epoch": 0.753228523301516, + "grad_norm": 0.7490763664245605, + "learning_rate": 9.320978670925808e-06, + "loss": 0.4186, + "step": 2683 + }, + { + "epoch": 0.7535092644581696, + "grad_norm": 0.8998096585273743, + "learning_rate": 9.320156609002668e-06, + "loss": 0.4851, + "step": 2684 + }, + { + "epoch": 0.7537900056148231, + "grad_norm": 0.7903598546981812, + "learning_rate": 9.319334086059281e-06, + "loss": 0.4381, + "step": 2685 + }, + { + "epoch": 0.7540707467714767, + "grad_norm": 0.7860065698623657, + "learning_rate": 9.318511102183421e-06, + "loss": 0.3971, + "step": 2686 + }, + { + "epoch": 0.7543514879281302, + "grad_norm": 0.7966722846031189, + "learning_rate": 9.31768765746291e-06, + "loss": 0.4809, + "step": 2687 + }, + { + "epoch": 0.7546322290847838, + "grad_norm": 0.7666751146316528, + "learning_rate": 9.316863751985621e-06, + "loss": 0.4592, + "step": 2688 + }, + { + "epoch": 0.7549129702414374, + "grad_norm": 0.8527171015739441, + "learning_rate": 9.31603938583948e-06, + "loss": 0.4475, + "step": 2689 + }, + { + "epoch": 0.755193711398091, + "grad_norm": 0.705956757068634, + "learning_rate": 9.315214559112454e-06, + "loss": 0.4585, + "step": 2690 + }, + { + "epoch": 0.7554744525547445, + "grad_norm": 0.7720552086830139, + "learning_rate": 9.314389271892563e-06, + "loss": 0.4396, + "step": 2691 + }, + { + "epoch": 0.7557551937113981, + "grad_norm": 0.8717186450958252, + "learning_rate": 9.31356352426788e-06, + "loss": 0.4567, + "step": 2692 + }, + { + "epoch": 0.7560359348680517, + "grad_norm": 0.8001463413238525, + "learning_rate": 9.312737316326524e-06, + "loss": 0.436, + "step": 2693 + }, + { + "epoch": 0.7563166760247052, + "grad_norm": 0.6348888278007507, + "learning_rate": 9.311910648156657e-06, + "loss": 0.4088, + "step": 2694 + }, + { + "epoch": 0.7565974171813588, + "grad_norm": 0.6831598281860352, + "learning_rate": 9.311083519846502e-06, + "loss": 0.4595, + "step": 2695 + }, + { + "epoch": 0.7568781583380123, + "grad_norm": 0.8067934513092041, + "learning_rate": 9.310255931484322e-06, + "loss": 0.4339, + "step": 2696 + }, + { + "epoch": 0.757158899494666, + "grad_norm": 0.6594023108482361, + "learning_rate": 9.309427883158433e-06, + "loss": 0.4293, + "step": 2697 + }, + { + "epoch": 0.7574396406513195, + "grad_norm": 0.7691739201545715, + "learning_rate": 9.308599374957198e-06, + "loss": 0.4536, + "step": 2698 + }, + { + "epoch": 0.7577203818079731, + "grad_norm": 0.7178975939750671, + "learning_rate": 9.307770406969032e-06, + "loss": 0.4255, + "step": 2699 + }, + { + "epoch": 0.7580011229646266, + "grad_norm": 0.698609471321106, + "learning_rate": 9.306940979282395e-06, + "loss": 0.429, + "step": 2700 + }, + { + "epoch": 0.7582818641212802, + "grad_norm": 0.7874751091003418, + "learning_rate": 9.306111091985802e-06, + "loss": 0.4856, + "step": 2701 + }, + { + "epoch": 0.7585626052779337, + "grad_norm": 0.8140864968299866, + "learning_rate": 9.305280745167809e-06, + "loss": 0.4576, + "step": 2702 + }, + { + "epoch": 0.7588433464345873, + "grad_norm": 0.872873067855835, + "learning_rate": 9.304449938917029e-06, + "loss": 0.4807, + "step": 2703 + }, + { + "epoch": 0.7591240875912408, + "grad_norm": 0.7322248816490173, + "learning_rate": 9.303618673322119e-06, + "loss": 0.463, + "step": 2704 + }, + { + "epoch": 0.7594048287478944, + "grad_norm": 0.6696175932884216, + "learning_rate": 9.302786948471787e-06, + "loss": 0.4232, + "step": 2705 + }, + { + "epoch": 0.759685569904548, + "grad_norm": 0.7476364970207214, + "learning_rate": 9.301954764454788e-06, + "loss": 0.4252, + "step": 2706 + }, + { + "epoch": 0.7599663110612016, + "grad_norm": 0.8410789370536804, + "learning_rate": 9.30112212135993e-06, + "loss": 0.4681, + "step": 2707 + }, + { + "epoch": 0.7602470522178552, + "grad_norm": 0.8708897233009338, + "learning_rate": 9.300289019276066e-06, + "loss": 0.457, + "step": 2708 + }, + { + "epoch": 0.7605277933745087, + "grad_norm": 0.7599568367004395, + "learning_rate": 9.299455458292097e-06, + "loss": 0.4662, + "step": 2709 + }, + { + "epoch": 0.7608085345311623, + "grad_norm": 0.7744013667106628, + "learning_rate": 9.29862143849698e-06, + "loss": 0.4816, + "step": 2710 + }, + { + "epoch": 0.7610892756878158, + "grad_norm": 0.8597412109375, + "learning_rate": 9.297786959979715e-06, + "loss": 0.4601, + "step": 2711 + }, + { + "epoch": 0.7613700168444694, + "grad_norm": 0.8001948595046997, + "learning_rate": 9.29695202282935e-06, + "loss": 0.461, + "step": 2712 + }, + { + "epoch": 0.7616507580011229, + "grad_norm": 0.7523707747459412, + "learning_rate": 9.296116627134988e-06, + "loss": 0.4746, + "step": 2713 + }, + { + "epoch": 0.7619314991577766, + "grad_norm": 0.8415935039520264, + "learning_rate": 9.295280772985775e-06, + "loss": 0.4287, + "step": 2714 + }, + { + "epoch": 0.7622122403144301, + "grad_norm": 0.7757467031478882, + "learning_rate": 9.294444460470909e-06, + "loss": 0.4568, + "step": 2715 + }, + { + "epoch": 0.7624929814710837, + "grad_norm": 0.8341929912567139, + "learning_rate": 9.293607689679633e-06, + "loss": 0.4796, + "step": 2716 + }, + { + "epoch": 0.7627737226277372, + "grad_norm": 0.6295900940895081, + "learning_rate": 9.292770460701247e-06, + "loss": 0.3763, + "step": 2717 + }, + { + "epoch": 0.7630544637843908, + "grad_norm": 0.6859361529350281, + "learning_rate": 9.291932773625092e-06, + "loss": 0.4721, + "step": 2718 + }, + { + "epoch": 0.7633352049410443, + "grad_norm": 0.7676633596420288, + "learning_rate": 9.29109462854056e-06, + "loss": 0.4323, + "step": 2719 + }, + { + "epoch": 0.7636159460976979, + "grad_norm": 0.7976534366607666, + "learning_rate": 9.290256025537096e-06, + "loss": 0.4654, + "step": 2720 + }, + { + "epoch": 0.7638966872543514, + "grad_norm": 0.7481630444526672, + "learning_rate": 9.289416964704186e-06, + "loss": 0.4683, + "step": 2721 + }, + { + "epoch": 0.764177428411005, + "grad_norm": 0.6892101168632507, + "learning_rate": 9.288577446131372e-06, + "loss": 0.4738, + "step": 2722 + }, + { + "epoch": 0.7644581695676587, + "grad_norm": 0.6679108738899231, + "learning_rate": 9.287737469908243e-06, + "loss": 0.4585, + "step": 2723 + }, + { + "epoch": 0.7647389107243122, + "grad_norm": 0.7333400249481201, + "learning_rate": 9.286897036124435e-06, + "loss": 0.4694, + "step": 2724 + }, + { + "epoch": 0.7650196518809658, + "grad_norm": 0.866708517074585, + "learning_rate": 9.286056144869633e-06, + "loss": 0.4638, + "step": 2725 + }, + { + "epoch": 0.7653003930376193, + "grad_norm": 0.7348212003707886, + "learning_rate": 9.285214796233573e-06, + "loss": 0.46, + "step": 2726 + }, + { + "epoch": 0.7655811341942729, + "grad_norm": 0.9280748963356018, + "learning_rate": 9.284372990306036e-06, + "loss": 0.4356, + "step": 2727 + }, + { + "epoch": 0.7658618753509264, + "grad_norm": 0.6627675890922546, + "learning_rate": 9.283530727176857e-06, + "loss": 0.4305, + "step": 2728 + }, + { + "epoch": 0.76614261650758, + "grad_norm": 0.7102477550506592, + "learning_rate": 9.282688006935918e-06, + "loss": 0.4492, + "step": 2729 + }, + { + "epoch": 0.7664233576642335, + "grad_norm": 0.7039529085159302, + "learning_rate": 9.281844829673146e-06, + "loss": 0.3993, + "step": 2730 + }, + { + "epoch": 0.7667040988208872, + "grad_norm": 0.8434668779373169, + "learning_rate": 9.281001195478522e-06, + "loss": 0.4584, + "step": 2731 + }, + { + "epoch": 0.7669848399775407, + "grad_norm": 0.6976370811462402, + "learning_rate": 9.280157104442072e-06, + "loss": 0.4196, + "step": 2732 + }, + { + "epoch": 0.7672655811341943, + "grad_norm": 0.764058530330658, + "learning_rate": 9.27931255665387e-06, + "loss": 0.4895, + "step": 2733 + }, + { + "epoch": 0.7675463222908478, + "grad_norm": 0.8501991629600525, + "learning_rate": 9.278467552204045e-06, + "loss": 0.5196, + "step": 2734 + }, + { + "epoch": 0.7678270634475014, + "grad_norm": 0.6693174242973328, + "learning_rate": 9.277622091182769e-06, + "loss": 0.4278, + "step": 2735 + }, + { + "epoch": 0.768107804604155, + "grad_norm": 0.8203902244567871, + "learning_rate": 9.276776173680264e-06, + "loss": 0.427, + "step": 2736 + }, + { + "epoch": 0.7683885457608085, + "grad_norm": 0.864119291305542, + "learning_rate": 9.275929799786801e-06, + "loss": 0.441, + "step": 2737 + }, + { + "epoch": 0.7686692869174621, + "grad_norm": 0.7313922047615051, + "learning_rate": 9.2750829695927e-06, + "loss": 0.4288, + "step": 2738 + }, + { + "epoch": 0.7689500280741156, + "grad_norm": 0.7067921161651611, + "learning_rate": 9.27423568318833e-06, + "loss": 0.4531, + "step": 2739 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 0.858953595161438, + "learning_rate": 9.273387940664108e-06, + "loss": 0.4526, + "step": 2740 + }, + { + "epoch": 0.7695115103874228, + "grad_norm": 0.822174608707428, + "learning_rate": 9.272539742110498e-06, + "loss": 0.4631, + "step": 2741 + }, + { + "epoch": 0.7697922515440764, + "grad_norm": 0.7382689118385315, + "learning_rate": 9.271691087618015e-06, + "loss": 0.4529, + "step": 2742 + }, + { + "epoch": 0.7700729927007299, + "grad_norm": 0.9063779711723328, + "learning_rate": 9.270841977277226e-06, + "loss": 0.4479, + "step": 2743 + }, + { + "epoch": 0.7703537338573835, + "grad_norm": 0.8486456274986267, + "learning_rate": 9.269992411178738e-06, + "loss": 0.4504, + "step": 2744 + }, + { + "epoch": 0.770634475014037, + "grad_norm": 0.8725316524505615, + "learning_rate": 9.269142389413213e-06, + "loss": 0.474, + "step": 2745 + }, + { + "epoch": 0.7709152161706906, + "grad_norm": 0.8478258848190308, + "learning_rate": 9.268291912071362e-06, + "loss": 0.4731, + "step": 2746 + }, + { + "epoch": 0.7711959573273441, + "grad_norm": 1.1193541288375854, + "learning_rate": 9.26744097924394e-06, + "loss": 0.4823, + "step": 2747 + }, + { + "epoch": 0.7714766984839978, + "grad_norm": 0.8588424921035767, + "learning_rate": 9.266589591021752e-06, + "loss": 0.4613, + "step": 2748 + }, + { + "epoch": 0.7717574396406514, + "grad_norm": 0.7786567211151123, + "learning_rate": 9.265737747495657e-06, + "loss": 0.4995, + "step": 2749 + }, + { + "epoch": 0.7720381807973049, + "grad_norm": 1.015882134437561, + "learning_rate": 9.264885448756557e-06, + "loss": 0.4784, + "step": 2750 + }, + { + "epoch": 0.7723189219539585, + "grad_norm": 0.9489424824714661, + "learning_rate": 9.264032694895404e-06, + "loss": 0.4435, + "step": 2751 + }, + { + "epoch": 0.772599663110612, + "grad_norm": 0.7556193470954895, + "learning_rate": 9.263179486003196e-06, + "loss": 0.4302, + "step": 2752 + }, + { + "epoch": 0.7728804042672656, + "grad_norm": 0.8387235999107361, + "learning_rate": 9.262325822170985e-06, + "loss": 0.4245, + "step": 2753 + }, + { + "epoch": 0.7731611454239191, + "grad_norm": 1.0417672395706177, + "learning_rate": 9.261471703489869e-06, + "loss": 0.4428, + "step": 2754 + }, + { + "epoch": 0.7734418865805727, + "grad_norm": 0.7097517848014832, + "learning_rate": 9.260617130050991e-06, + "loss": 0.4779, + "step": 2755 + }, + { + "epoch": 0.7737226277372263, + "grad_norm": 0.7087475657463074, + "learning_rate": 9.25976210194555e-06, + "loss": 0.3939, + "step": 2756 + }, + { + "epoch": 0.7740033688938799, + "grad_norm": 0.7660328149795532, + "learning_rate": 9.258906619264785e-06, + "loss": 0.4167, + "step": 2757 + }, + { + "epoch": 0.7742841100505334, + "grad_norm": 0.696075439453125, + "learning_rate": 9.258050682099993e-06, + "loss": 0.3943, + "step": 2758 + }, + { + "epoch": 0.774564851207187, + "grad_norm": 0.7289900183677673, + "learning_rate": 9.257194290542508e-06, + "loss": 0.4311, + "step": 2759 + }, + { + "epoch": 0.7748455923638405, + "grad_norm": 0.7284126877784729, + "learning_rate": 9.256337444683725e-06, + "loss": 0.4583, + "step": 2760 + }, + { + "epoch": 0.7751263335204941, + "grad_norm": 0.7524017691612244, + "learning_rate": 9.255480144615077e-06, + "loss": 0.4505, + "step": 2761 + }, + { + "epoch": 0.7754070746771476, + "grad_norm": 0.7010413408279419, + "learning_rate": 9.25462239042805e-06, + "loss": 0.4214, + "step": 2762 + }, + { + "epoch": 0.7756878158338012, + "grad_norm": 0.8451077938079834, + "learning_rate": 9.25376418221418e-06, + "loss": 0.4309, + "step": 2763 + }, + { + "epoch": 0.7759685569904547, + "grad_norm": 0.8602747917175293, + "learning_rate": 9.25290552006505e-06, + "loss": 0.4438, + "step": 2764 + }, + { + "epoch": 0.7762492981471084, + "grad_norm": 0.7057498693466187, + "learning_rate": 9.252046404072288e-06, + "loss": 0.4528, + "step": 2765 + }, + { + "epoch": 0.776530039303762, + "grad_norm": 0.9576298594474792, + "learning_rate": 9.251186834327577e-06, + "loss": 0.4671, + "step": 2766 + }, + { + "epoch": 0.7768107804604155, + "grad_norm": 0.8898288011550903, + "learning_rate": 9.250326810922643e-06, + "loss": 0.4412, + "step": 2767 + }, + { + "epoch": 0.7770915216170691, + "grad_norm": 0.6958866715431213, + "learning_rate": 9.249466333949264e-06, + "loss": 0.4981, + "step": 2768 + }, + { + "epoch": 0.7773722627737226, + "grad_norm": 0.9550066590309143, + "learning_rate": 9.248605403499262e-06, + "loss": 0.4692, + "step": 2769 + }, + { + "epoch": 0.7776530039303762, + "grad_norm": 1.0029186010360718, + "learning_rate": 9.24774401966451e-06, + "loss": 0.5155, + "step": 2770 + }, + { + "epoch": 0.7779337450870297, + "grad_norm": 0.8125027418136597, + "learning_rate": 9.246882182536935e-06, + "loss": 0.4989, + "step": 2771 + }, + { + "epoch": 0.7782144862436833, + "grad_norm": 0.7664905190467834, + "learning_rate": 9.2460198922085e-06, + "loss": 0.4757, + "step": 2772 + }, + { + "epoch": 0.7784952274003369, + "grad_norm": 0.7412874102592468, + "learning_rate": 9.245157148771229e-06, + "loss": 0.403, + "step": 2773 + }, + { + "epoch": 0.7787759685569905, + "grad_norm": 0.8610168695449829, + "learning_rate": 9.244293952317184e-06, + "loss": 0.4119, + "step": 2774 + }, + { + "epoch": 0.779056709713644, + "grad_norm": 0.830284059047699, + "learning_rate": 9.243430302938483e-06, + "loss": 0.4354, + "step": 2775 + }, + { + "epoch": 0.7793374508702976, + "grad_norm": 0.7288861870765686, + "learning_rate": 9.242566200727288e-06, + "loss": 0.4577, + "step": 2776 + }, + { + "epoch": 0.7796181920269512, + "grad_norm": 0.8208156824111938, + "learning_rate": 9.24170164577581e-06, + "loss": 0.4942, + "step": 2777 + }, + { + "epoch": 0.7798989331836047, + "grad_norm": 0.8003494143486023, + "learning_rate": 9.24083663817631e-06, + "loss": 0.4731, + "step": 2778 + }, + { + "epoch": 0.7801796743402583, + "grad_norm": 0.8178521990776062, + "learning_rate": 9.239971178021096e-06, + "loss": 0.4764, + "step": 2779 + }, + { + "epoch": 0.7804604154969118, + "grad_norm": 0.7557169198989868, + "learning_rate": 9.239105265402525e-06, + "loss": 0.4152, + "step": 2780 + }, + { + "epoch": 0.7807411566535654, + "grad_norm": 0.7637736797332764, + "learning_rate": 9.238238900413e-06, + "loss": 0.4447, + "step": 2781 + }, + { + "epoch": 0.781021897810219, + "grad_norm": 0.7243945598602295, + "learning_rate": 9.237372083144977e-06, + "loss": 0.4095, + "step": 2782 + }, + { + "epoch": 0.7813026389668726, + "grad_norm": 0.7808176279067993, + "learning_rate": 9.236504813690957e-06, + "loss": 0.4295, + "step": 2783 + }, + { + "epoch": 0.7815833801235261, + "grad_norm": 0.874909520149231, + "learning_rate": 9.235637092143486e-06, + "loss": 0.4434, + "step": 2784 + }, + { + "epoch": 0.7818641212801797, + "grad_norm": 0.7889077663421631, + "learning_rate": 9.234768918595165e-06, + "loss": 0.4465, + "step": 2785 + }, + { + "epoch": 0.7821448624368332, + "grad_norm": 0.8345769047737122, + "learning_rate": 9.23390029313864e-06, + "loss": 0.4444, + "step": 2786 + }, + { + "epoch": 0.7824256035934868, + "grad_norm": 0.8471890687942505, + "learning_rate": 9.233031215866603e-06, + "loss": 0.4704, + "step": 2787 + }, + { + "epoch": 0.7827063447501403, + "grad_norm": 0.8265674710273743, + "learning_rate": 9.232161686871798e-06, + "loss": 0.4182, + "step": 2788 + }, + { + "epoch": 0.7829870859067939, + "grad_norm": 0.8616026639938354, + "learning_rate": 9.231291706247018e-06, + "loss": 0.4536, + "step": 2789 + }, + { + "epoch": 0.7832678270634476, + "grad_norm": 0.8215060830116272, + "learning_rate": 9.2304212740851e-06, + "loss": 0.456, + "step": 2790 + }, + { + "epoch": 0.7835485682201011, + "grad_norm": 0.7380470633506775, + "learning_rate": 9.229550390478928e-06, + "loss": 0.4337, + "step": 2791 + }, + { + "epoch": 0.7838293093767547, + "grad_norm": 0.709854245185852, + "learning_rate": 9.228679055521442e-06, + "loss": 0.4849, + "step": 2792 + }, + { + "epoch": 0.7841100505334082, + "grad_norm": 0.9454172253608704, + "learning_rate": 9.227807269305624e-06, + "loss": 0.5318, + "step": 2793 + }, + { + "epoch": 0.7843907916900618, + "grad_norm": 0.7940680980682373, + "learning_rate": 9.226935031924505e-06, + "loss": 0.4719, + "step": 2794 + }, + { + "epoch": 0.7846715328467153, + "grad_norm": 0.6572303771972656, + "learning_rate": 9.226062343471165e-06, + "loss": 0.4582, + "step": 2795 + }, + { + "epoch": 0.7849522740033689, + "grad_norm": 0.7868558168411255, + "learning_rate": 9.225189204038728e-06, + "loss": 0.4732, + "step": 2796 + }, + { + "epoch": 0.7852330151600224, + "grad_norm": 0.7322407364845276, + "learning_rate": 9.224315613720378e-06, + "loss": 0.4402, + "step": 2797 + }, + { + "epoch": 0.785513756316676, + "grad_norm": 0.7923116087913513, + "learning_rate": 9.223441572609335e-06, + "loss": 0.4362, + "step": 2798 + }, + { + "epoch": 0.7857944974733296, + "grad_norm": 0.6431361436843872, + "learning_rate": 9.222567080798871e-06, + "loss": 0.452, + "step": 2799 + }, + { + "epoch": 0.7860752386299832, + "grad_norm": 0.7228332161903381, + "learning_rate": 9.221692138382305e-06, + "loss": 0.4618, + "step": 2800 + }, + { + "epoch": 0.7863559797866367, + "grad_norm": 0.7577100396156311, + "learning_rate": 9.22081674545301e-06, + "loss": 0.4192, + "step": 2801 + }, + { + "epoch": 0.7866367209432903, + "grad_norm": 0.7033113837242126, + "learning_rate": 9.219940902104396e-06, + "loss": 0.4457, + "step": 2802 + }, + { + "epoch": 0.7869174620999438, + "grad_norm": 0.7653865814208984, + "learning_rate": 9.219064608429932e-06, + "loss": 0.4921, + "step": 2803 + }, + { + "epoch": 0.7871982032565974, + "grad_norm": 0.7377403974533081, + "learning_rate": 9.218187864523131e-06, + "loss": 0.4363, + "step": 2804 + }, + { + "epoch": 0.787478944413251, + "grad_norm": 0.7712346911430359, + "learning_rate": 9.21731067047755e-06, + "loss": 0.5021, + "step": 2805 + }, + { + "epoch": 0.7877596855699045, + "grad_norm": 0.7734142541885376, + "learning_rate": 9.216433026386802e-06, + "loss": 0.4691, + "step": 2806 + }, + { + "epoch": 0.7880404267265582, + "grad_norm": 0.7538473010063171, + "learning_rate": 9.215554932344542e-06, + "loss": 0.4895, + "step": 2807 + }, + { + "epoch": 0.7883211678832117, + "grad_norm": 0.7117080092430115, + "learning_rate": 9.214676388444472e-06, + "loss": 0.4257, + "step": 2808 + }, + { + "epoch": 0.7886019090398653, + "grad_norm": 0.7666523456573486, + "learning_rate": 9.213797394780349e-06, + "loss": 0.4409, + "step": 2809 + }, + { + "epoch": 0.7888826501965188, + "grad_norm": 0.6807367205619812, + "learning_rate": 9.212917951445971e-06, + "loss": 0.4444, + "step": 2810 + }, + { + "epoch": 0.7891633913531724, + "grad_norm": 0.7090035676956177, + "learning_rate": 9.212038058535189e-06, + "loss": 0.4526, + "step": 2811 + }, + { + "epoch": 0.7894441325098259, + "grad_norm": 0.854701817035675, + "learning_rate": 9.211157716141896e-06, + "loss": 0.4461, + "step": 2812 + }, + { + "epoch": 0.7897248736664795, + "grad_norm": 0.7089012265205383, + "learning_rate": 9.21027692436004e-06, + "loss": 0.4842, + "step": 2813 + }, + { + "epoch": 0.790005614823133, + "grad_norm": 0.8200495839118958, + "learning_rate": 9.209395683283615e-06, + "loss": 0.4178, + "step": 2814 + }, + { + "epoch": 0.7902863559797867, + "grad_norm": 0.6886423230171204, + "learning_rate": 9.208513993006655e-06, + "loss": 0.4509, + "step": 2815 + }, + { + "epoch": 0.7905670971364402, + "grad_norm": 0.8361442685127258, + "learning_rate": 9.207631853623256e-06, + "loss": 0.481, + "step": 2816 + }, + { + "epoch": 0.7908478382930938, + "grad_norm": 0.6882129907608032, + "learning_rate": 9.206749265227551e-06, + "loss": 0.4046, + "step": 2817 + }, + { + "epoch": 0.7911285794497473, + "grad_norm": 0.7281439900398254, + "learning_rate": 9.205866227913723e-06, + "loss": 0.4241, + "step": 2818 + }, + { + "epoch": 0.7914093206064009, + "grad_norm": 0.7652041912078857, + "learning_rate": 9.204982741776005e-06, + "loss": 0.4264, + "step": 2819 + }, + { + "epoch": 0.7916900617630545, + "grad_norm": 0.8479816913604736, + "learning_rate": 9.20409880690868e-06, + "loss": 0.4743, + "step": 2820 + }, + { + "epoch": 0.791970802919708, + "grad_norm": 0.741902768611908, + "learning_rate": 9.203214423406073e-06, + "loss": 0.4311, + "step": 2821 + }, + { + "epoch": 0.7922515440763616, + "grad_norm": 0.6678258776664734, + "learning_rate": 9.20232959136256e-06, + "loss": 0.4046, + "step": 2822 + }, + { + "epoch": 0.7925322852330151, + "grad_norm": 0.7125440835952759, + "learning_rate": 9.201444310872566e-06, + "loss": 0.4241, + "step": 2823 + }, + { + "epoch": 0.7928130263896688, + "grad_norm": 0.7699744701385498, + "learning_rate": 9.200558582030563e-06, + "loss": 0.4255, + "step": 2824 + }, + { + "epoch": 0.7930937675463223, + "grad_norm": 0.6562171578407288, + "learning_rate": 9.199672404931068e-06, + "loss": 0.4814, + "step": 2825 + }, + { + "epoch": 0.7933745087029759, + "grad_norm": 0.7204014658927917, + "learning_rate": 9.198785779668652e-06, + "loss": 0.4513, + "step": 2826 + }, + { + "epoch": 0.7936552498596294, + "grad_norm": 0.7226943373680115, + "learning_rate": 9.197898706337927e-06, + "loss": 0.4542, + "step": 2827 + }, + { + "epoch": 0.793935991016283, + "grad_norm": 0.7600888013839722, + "learning_rate": 9.197011185033558e-06, + "loss": 0.4457, + "step": 2828 + }, + { + "epoch": 0.7942167321729365, + "grad_norm": 0.7142552733421326, + "learning_rate": 9.196123215850254e-06, + "loss": 0.4777, + "step": 2829 + }, + { + "epoch": 0.7944974733295901, + "grad_norm": 0.713534414768219, + "learning_rate": 9.195234798882774e-06, + "loss": 0.4491, + "step": 2830 + }, + { + "epoch": 0.7947782144862436, + "grad_norm": 0.6138408780097961, + "learning_rate": 9.194345934225925e-06, + "loss": 0.4308, + "step": 2831 + }, + { + "epoch": 0.7950589556428973, + "grad_norm": 0.6582531929016113, + "learning_rate": 9.19345662197456e-06, + "loss": 0.4151, + "step": 2832 + }, + { + "epoch": 0.7953396967995509, + "grad_norm": 0.7898809313774109, + "learning_rate": 9.192566862223585e-06, + "loss": 0.4682, + "step": 2833 + }, + { + "epoch": 0.7956204379562044, + "grad_norm": 0.6748396158218384, + "learning_rate": 9.191676655067944e-06, + "loss": 0.3966, + "step": 2834 + }, + { + "epoch": 0.795901179112858, + "grad_norm": 0.7965052127838135, + "learning_rate": 9.190786000602635e-06, + "loss": 0.5002, + "step": 2835 + }, + { + "epoch": 0.7961819202695115, + "grad_norm": 0.7563027739524841, + "learning_rate": 9.189894898922708e-06, + "loss": 0.4898, + "step": 2836 + }, + { + "epoch": 0.7964626614261651, + "grad_norm": 0.7985860705375671, + "learning_rate": 9.189003350123252e-06, + "loss": 0.4847, + "step": 2837 + }, + { + "epoch": 0.7967434025828186, + "grad_norm": 0.7177184820175171, + "learning_rate": 9.188111354299407e-06, + "loss": 0.4486, + "step": 2838 + }, + { + "epoch": 0.7970241437394722, + "grad_norm": 0.679217517375946, + "learning_rate": 9.187218911546363e-06, + "loss": 0.4346, + "step": 2839 + }, + { + "epoch": 0.7973048848961257, + "grad_norm": 0.6719887256622314, + "learning_rate": 9.186326021959354e-06, + "loss": 0.4408, + "step": 2840 + }, + { + "epoch": 0.7975856260527794, + "grad_norm": 0.6565520763397217, + "learning_rate": 9.185432685633666e-06, + "loss": 0.4355, + "step": 2841 + }, + { + "epoch": 0.7978663672094329, + "grad_norm": 0.69264817237854, + "learning_rate": 9.184538902664628e-06, + "loss": 0.4126, + "step": 2842 + }, + { + "epoch": 0.7981471083660865, + "grad_norm": 0.7278082966804504, + "learning_rate": 9.183644673147622e-06, + "loss": 0.448, + "step": 2843 + }, + { + "epoch": 0.79842784952274, + "grad_norm": 0.7132008075714111, + "learning_rate": 9.182749997178074e-06, + "loss": 0.46, + "step": 2844 + }, + { + "epoch": 0.7987085906793936, + "grad_norm": 0.6451550126075745, + "learning_rate": 9.181854874851454e-06, + "loss": 0.4441, + "step": 2845 + }, + { + "epoch": 0.7989893318360471, + "grad_norm": 0.7662902474403381, + "learning_rate": 9.18095930626329e-06, + "loss": 0.4358, + "step": 2846 + }, + { + "epoch": 0.7992700729927007, + "grad_norm": 0.6235150098800659, + "learning_rate": 9.180063291509148e-06, + "loss": 0.4111, + "step": 2847 + }, + { + "epoch": 0.7995508141493542, + "grad_norm": 0.7264047861099243, + "learning_rate": 9.179166830684643e-06, + "loss": 0.4382, + "step": 2848 + }, + { + "epoch": 0.7998315553060079, + "grad_norm": 0.6406546831130981, + "learning_rate": 9.178269923885444e-06, + "loss": 0.432, + "step": 2849 + }, + { + "epoch": 0.8001122964626615, + "grad_norm": 0.7896562218666077, + "learning_rate": 9.17737257120726e-06, + "loss": 0.4674, + "step": 2850 + }, + { + "epoch": 0.800393037619315, + "grad_norm": 0.698302686214447, + "learning_rate": 9.176474772745855e-06, + "loss": 0.419, + "step": 2851 + }, + { + "epoch": 0.8006737787759686, + "grad_norm": 0.7514543533325195, + "learning_rate": 9.175576528597035e-06, + "loss": 0.4471, + "step": 2852 + }, + { + "epoch": 0.8009545199326221, + "grad_norm": 0.8828864097595215, + "learning_rate": 9.174677838856651e-06, + "loss": 0.4819, + "step": 2853 + }, + { + "epoch": 0.8012352610892757, + "grad_norm": 0.6965618133544922, + "learning_rate": 9.17377870362061e-06, + "loss": 0.4305, + "step": 2854 + }, + { + "epoch": 0.8015160022459292, + "grad_norm": 0.8631529211997986, + "learning_rate": 9.172879122984861e-06, + "loss": 0.4782, + "step": 2855 + }, + { + "epoch": 0.8017967434025828, + "grad_norm": 0.9167166948318481, + "learning_rate": 9.171979097045402e-06, + "loss": 0.4496, + "step": 2856 + }, + { + "epoch": 0.8020774845592363, + "grad_norm": 0.803749144077301, + "learning_rate": 9.171078625898277e-06, + "loss": 0.3924, + "step": 2857 + }, + { + "epoch": 0.80235822571589, + "grad_norm": 0.7713818550109863, + "learning_rate": 9.170177709639578e-06, + "loss": 0.4555, + "step": 2858 + }, + { + "epoch": 0.8026389668725435, + "grad_norm": 0.7084963321685791, + "learning_rate": 9.169276348365447e-06, + "loss": 0.4507, + "step": 2859 + }, + { + "epoch": 0.8029197080291971, + "grad_norm": 0.8877184391021729, + "learning_rate": 9.168374542172073e-06, + "loss": 0.445, + "step": 2860 + }, + { + "epoch": 0.8032004491858507, + "grad_norm": 0.9707546830177307, + "learning_rate": 9.167472291155688e-06, + "loss": 0.4303, + "step": 2861 + }, + { + "epoch": 0.8034811903425042, + "grad_norm": 0.7353629469871521, + "learning_rate": 9.166569595412576e-06, + "loss": 0.4691, + "step": 2862 + }, + { + "epoch": 0.8037619314991578, + "grad_norm": 0.725790798664093, + "learning_rate": 9.165666455039065e-06, + "loss": 0.4713, + "step": 2863 + }, + { + "epoch": 0.8040426726558113, + "grad_norm": 0.7861348986625671, + "learning_rate": 9.164762870131538e-06, + "loss": 0.458, + "step": 2864 + }, + { + "epoch": 0.8043234138124649, + "grad_norm": 0.8006153702735901, + "learning_rate": 9.163858840786415e-06, + "loss": 0.4518, + "step": 2865 + }, + { + "epoch": 0.8046041549691185, + "grad_norm": 0.7562771439552307, + "learning_rate": 9.162954367100169e-06, + "loss": 0.4325, + "step": 2866 + }, + { + "epoch": 0.8048848961257721, + "grad_norm": 0.6728021502494812, + "learning_rate": 9.162049449169321e-06, + "loss": 0.4948, + "step": 2867 + }, + { + "epoch": 0.8051656372824256, + "grad_norm": 0.8100345134735107, + "learning_rate": 9.161144087090438e-06, + "loss": 0.4912, + "step": 2868 + }, + { + "epoch": 0.8054463784390792, + "grad_norm": 0.6807064414024353, + "learning_rate": 9.160238280960134e-06, + "loss": 0.4895, + "step": 2869 + }, + { + "epoch": 0.8057271195957327, + "grad_norm": 0.7116173505783081, + "learning_rate": 9.159332030875072e-06, + "loss": 0.4396, + "step": 2870 + }, + { + "epoch": 0.8060078607523863, + "grad_norm": 0.7416521310806274, + "learning_rate": 9.158425336931961e-06, + "loss": 0.4543, + "step": 2871 + }, + { + "epoch": 0.8062886019090398, + "grad_norm": 0.6447952389717102, + "learning_rate": 9.157518199227558e-06, + "loss": 0.4139, + "step": 2872 + }, + { + "epoch": 0.8065693430656934, + "grad_norm": 0.7108721733093262, + "learning_rate": 9.156610617858665e-06, + "loss": 0.4872, + "step": 2873 + }, + { + "epoch": 0.8068500842223469, + "grad_norm": 0.7568307518959045, + "learning_rate": 9.155702592922138e-06, + "loss": 0.4733, + "step": 2874 + }, + { + "epoch": 0.8071308253790006, + "grad_norm": 0.7475255131721497, + "learning_rate": 9.15479412451487e-06, + "loss": 0.4626, + "step": 2875 + }, + { + "epoch": 0.8074115665356542, + "grad_norm": 0.7216377854347229, + "learning_rate": 9.15388521273381e-06, + "loss": 0.4544, + "step": 2876 + }, + { + "epoch": 0.8076923076923077, + "grad_norm": 0.7445193529129028, + "learning_rate": 9.152975857675954e-06, + "loss": 0.4728, + "step": 2877 + }, + { + "epoch": 0.8079730488489613, + "grad_norm": 0.7817115783691406, + "learning_rate": 9.152066059438339e-06, + "loss": 0.4119, + "step": 2878 + }, + { + "epoch": 0.8082537900056148, + "grad_norm": 0.5849059820175171, + "learning_rate": 9.151155818118055e-06, + "loss": 0.4306, + "step": 2879 + }, + { + "epoch": 0.8085345311622684, + "grad_norm": 0.7628145813941956, + "learning_rate": 9.150245133812233e-06, + "loss": 0.4423, + "step": 2880 + }, + { + "epoch": 0.8088152723189219, + "grad_norm": 0.8405887484550476, + "learning_rate": 9.149334006618062e-06, + "loss": 0.4941, + "step": 2881 + }, + { + "epoch": 0.8090960134755755, + "grad_norm": 0.6933674216270447, + "learning_rate": 9.148422436632768e-06, + "loss": 0.4865, + "step": 2882 + }, + { + "epoch": 0.8093767546322291, + "grad_norm": 0.6658686995506287, + "learning_rate": 9.147510423953628e-06, + "loss": 0.4648, + "step": 2883 + }, + { + "epoch": 0.8096574957888827, + "grad_norm": 0.7244076728820801, + "learning_rate": 9.146597968677968e-06, + "loss": 0.4654, + "step": 2884 + }, + { + "epoch": 0.8099382369455362, + "grad_norm": 0.6563810110092163, + "learning_rate": 9.145685070903158e-06, + "loss": 0.4436, + "step": 2885 + }, + { + "epoch": 0.8102189781021898, + "grad_norm": 0.7178528904914856, + "learning_rate": 9.14477173072662e-06, + "loss": 0.4505, + "step": 2886 + }, + { + "epoch": 0.8104997192588433, + "grad_norm": 0.7493821978569031, + "learning_rate": 9.143857948245815e-06, + "loss": 0.4372, + "step": 2887 + }, + { + "epoch": 0.8107804604154969, + "grad_norm": 0.7121507525444031, + "learning_rate": 9.14294372355826e-06, + "loss": 0.4815, + "step": 2888 + }, + { + "epoch": 0.8110612015721504, + "grad_norm": 0.7137668132781982, + "learning_rate": 9.142029056761515e-06, + "loss": 0.4552, + "step": 2889 + }, + { + "epoch": 0.811341942728804, + "grad_norm": 0.7351555824279785, + "learning_rate": 9.141113947953184e-06, + "loss": 0.5112, + "step": 2890 + }, + { + "epoch": 0.8116226838854577, + "grad_norm": 0.7549033164978027, + "learning_rate": 9.140198397230926e-06, + "loss": 0.4575, + "step": 2891 + }, + { + "epoch": 0.8119034250421112, + "grad_norm": 0.7129078507423401, + "learning_rate": 9.139282404692442e-06, + "loss": 0.5095, + "step": 2892 + }, + { + "epoch": 0.8121841661987648, + "grad_norm": 0.667210042476654, + "learning_rate": 9.13836597043548e-06, + "loss": 0.469, + "step": 2893 + }, + { + "epoch": 0.8124649073554183, + "grad_norm": 0.8358027338981628, + "learning_rate": 9.137449094557834e-06, + "loss": 0.431, + "step": 2894 + }, + { + "epoch": 0.8127456485120719, + "grad_norm": 0.8525927662849426, + "learning_rate": 9.136531777157352e-06, + "loss": 0.3989, + "step": 2895 + }, + { + "epoch": 0.8130263896687254, + "grad_norm": 0.6969891786575317, + "learning_rate": 9.135614018331922e-06, + "loss": 0.4646, + "step": 2896 + }, + { + "epoch": 0.813307130825379, + "grad_norm": 0.6864852905273438, + "learning_rate": 9.13469581817948e-06, + "loss": 0.389, + "step": 2897 + }, + { + "epoch": 0.8135878719820325, + "grad_norm": 0.8523910045623779, + "learning_rate": 9.133777176798013e-06, + "loss": 0.4372, + "step": 2898 + }, + { + "epoch": 0.8138686131386861, + "grad_norm": 0.7501009106636047, + "learning_rate": 9.132858094285554e-06, + "loss": 0.4855, + "step": 2899 + }, + { + "epoch": 0.8141493542953397, + "grad_norm": 0.7910122871398926, + "learning_rate": 9.131938570740177e-06, + "loss": 0.4781, + "step": 2900 + }, + { + "epoch": 0.8144300954519933, + "grad_norm": 0.8193109631538391, + "learning_rate": 9.131018606260012e-06, + "loss": 0.4149, + "step": 2901 + }, + { + "epoch": 0.8147108366086468, + "grad_norm": 0.6814030408859253, + "learning_rate": 9.13009820094323e-06, + "loss": 0.4362, + "step": 2902 + }, + { + "epoch": 0.8149915777653004, + "grad_norm": 0.7575172781944275, + "learning_rate": 9.129177354888053e-06, + "loss": 0.4787, + "step": 2903 + }, + { + "epoch": 0.815272318921954, + "grad_norm": 0.6326907277107239, + "learning_rate": 9.128256068192744e-06, + "loss": 0.4439, + "step": 2904 + }, + { + "epoch": 0.8155530600786075, + "grad_norm": 0.9037715792655945, + "learning_rate": 9.12733434095562e-06, + "loss": 0.465, + "step": 2905 + }, + { + "epoch": 0.8158338012352611, + "grad_norm": 0.8178196549415588, + "learning_rate": 9.12641217327504e-06, + "loss": 0.4422, + "step": 2906 + }, + { + "epoch": 0.8161145423919146, + "grad_norm": 0.7674875259399414, + "learning_rate": 9.125489565249417e-06, + "loss": 0.4536, + "step": 2907 + }, + { + "epoch": 0.8163952835485683, + "grad_norm": 0.6643429398536682, + "learning_rate": 9.1245665169772e-06, + "loss": 0.4268, + "step": 2908 + }, + { + "epoch": 0.8166760247052218, + "grad_norm": 0.683995246887207, + "learning_rate": 9.123643028556894e-06, + "loss": 0.4596, + "step": 2909 + }, + { + "epoch": 0.8169567658618754, + "grad_norm": 0.8194730281829834, + "learning_rate": 9.122719100087045e-06, + "loss": 0.475, + "step": 2910 + }, + { + "epoch": 0.8172375070185289, + "grad_norm": 0.7791884541511536, + "learning_rate": 9.121794731666253e-06, + "loss": 0.4867, + "step": 2911 + }, + { + "epoch": 0.8175182481751825, + "grad_norm": 0.6854553818702698, + "learning_rate": 9.12086992339316e-06, + "loss": 0.4707, + "step": 2912 + }, + { + "epoch": 0.817798989331836, + "grad_norm": 0.8855614066123962, + "learning_rate": 9.119944675366453e-06, + "loss": 0.4906, + "step": 2913 + }, + { + "epoch": 0.8180797304884896, + "grad_norm": 0.7338504791259766, + "learning_rate": 9.119018987684872e-06, + "loss": 0.4181, + "step": 2914 + }, + { + "epoch": 0.8183604716451431, + "grad_norm": 0.7446621656417847, + "learning_rate": 9.118092860447198e-06, + "loss": 0.4713, + "step": 2915 + }, + { + "epoch": 0.8186412128017967, + "grad_norm": 0.7724880576133728, + "learning_rate": 9.117166293752263e-06, + "loss": 0.4471, + "step": 2916 + }, + { + "epoch": 0.8189219539584504, + "grad_norm": 0.7833882570266724, + "learning_rate": 9.116239287698944e-06, + "loss": 0.4616, + "step": 2917 + }, + { + "epoch": 0.8192026951151039, + "grad_norm": 0.7517616152763367, + "learning_rate": 9.115311842386166e-06, + "loss": 0.4527, + "step": 2918 + }, + { + "epoch": 0.8194834362717575, + "grad_norm": 0.6753953099250793, + "learning_rate": 9.114383957912898e-06, + "loss": 0.4699, + "step": 2919 + }, + { + "epoch": 0.819764177428411, + "grad_norm": 0.7758336067199707, + "learning_rate": 9.11345563437816e-06, + "loss": 0.473, + "step": 2920 + }, + { + "epoch": 0.8200449185850646, + "grad_norm": 0.6592584252357483, + "learning_rate": 9.112526871881019e-06, + "loss": 0.4522, + "step": 2921 + }, + { + "epoch": 0.8203256597417181, + "grad_norm": 0.6775717735290527, + "learning_rate": 9.111597670520583e-06, + "loss": 0.4808, + "step": 2922 + }, + { + "epoch": 0.8206064008983717, + "grad_norm": 0.6942693591117859, + "learning_rate": 9.110668030396011e-06, + "loss": 0.4266, + "step": 2923 + }, + { + "epoch": 0.8208871420550252, + "grad_norm": 0.6692604422569275, + "learning_rate": 9.10973795160651e-06, + "loss": 0.44, + "step": 2924 + }, + { + "epoch": 0.8211678832116789, + "grad_norm": 0.6583316326141357, + "learning_rate": 9.108807434251331e-06, + "loss": 0.4197, + "step": 2925 + }, + { + "epoch": 0.8214486243683324, + "grad_norm": 0.7477753162384033, + "learning_rate": 9.107876478429773e-06, + "loss": 0.4533, + "step": 2926 + }, + { + "epoch": 0.821729365524986, + "grad_norm": 0.6819732189178467, + "learning_rate": 9.106945084241185e-06, + "loss": 0.4749, + "step": 2927 + }, + { + "epoch": 0.8220101066816395, + "grad_norm": 0.6898617744445801, + "learning_rate": 9.106013251784956e-06, + "loss": 0.4463, + "step": 2928 + }, + { + "epoch": 0.8222908478382931, + "grad_norm": 0.7254061698913574, + "learning_rate": 9.105080981160525e-06, + "loss": 0.4259, + "step": 2929 + }, + { + "epoch": 0.8225715889949466, + "grad_norm": 0.8229217529296875, + "learning_rate": 9.104148272467381e-06, + "loss": 0.4616, + "step": 2930 + }, + { + "epoch": 0.8228523301516002, + "grad_norm": 0.7421444654464722, + "learning_rate": 9.103215125805054e-06, + "loss": 0.4527, + "step": 2931 + }, + { + "epoch": 0.8231330713082537, + "grad_norm": 0.6365233063697815, + "learning_rate": 9.102281541273126e-06, + "loss": 0.4251, + "step": 2932 + }, + { + "epoch": 0.8234138124649073, + "grad_norm": 0.6618478298187256, + "learning_rate": 9.101347518971223e-06, + "loss": 0.4721, + "step": 2933 + }, + { + "epoch": 0.823694553621561, + "grad_norm": 0.6431057453155518, + "learning_rate": 9.100413058999015e-06, + "loss": 0.4725, + "step": 2934 + }, + { + "epoch": 0.8239752947782145, + "grad_norm": 0.6888259053230286, + "learning_rate": 9.099478161456226e-06, + "loss": 0.4443, + "step": 2935 + }, + { + "epoch": 0.8242560359348681, + "grad_norm": 0.693355143070221, + "learning_rate": 9.09854282644262e-06, + "loss": 0.4585, + "step": 2936 + }, + { + "epoch": 0.8245367770915216, + "grad_norm": 0.6703828573226929, + "learning_rate": 9.09760705405801e-06, + "loss": 0.4661, + "step": 2937 + }, + { + "epoch": 0.8248175182481752, + "grad_norm": 0.6152010560035706, + "learning_rate": 9.09667084440226e-06, + "loss": 0.4251, + "step": 2938 + }, + { + "epoch": 0.8250982594048287, + "grad_norm": 0.6722482442855835, + "learning_rate": 9.09573419757527e-06, + "loss": 0.4764, + "step": 2939 + }, + { + "epoch": 0.8253790005614823, + "grad_norm": 0.8198792934417725, + "learning_rate": 9.094797113676997e-06, + "loss": 0.4774, + "step": 2940 + }, + { + "epoch": 0.8256597417181358, + "grad_norm": 0.7244009971618652, + "learning_rate": 9.093859592807439e-06, + "loss": 0.4225, + "step": 2941 + }, + { + "epoch": 0.8259404828747895, + "grad_norm": 0.66555255651474, + "learning_rate": 9.092921635066643e-06, + "loss": 0.4377, + "step": 2942 + }, + { + "epoch": 0.826221224031443, + "grad_norm": 0.6388446092605591, + "learning_rate": 9.091983240554703e-06, + "loss": 0.4317, + "step": 2943 + }, + { + "epoch": 0.8265019651880966, + "grad_norm": 0.6635749340057373, + "learning_rate": 9.091044409371759e-06, + "loss": 0.4841, + "step": 2944 + }, + { + "epoch": 0.8267827063447502, + "grad_norm": 0.6262797117233276, + "learning_rate": 9.090105141617995e-06, + "loss": 0.4157, + "step": 2945 + }, + { + "epoch": 0.8270634475014037, + "grad_norm": 0.7906641960144043, + "learning_rate": 9.089165437393645e-06, + "loss": 0.4703, + "step": 2946 + }, + { + "epoch": 0.8273441886580573, + "grad_norm": 0.7401872277259827, + "learning_rate": 9.08822529679899e-06, + "loss": 0.444, + "step": 2947 + }, + { + "epoch": 0.8276249298147108, + "grad_norm": 0.6760455965995789, + "learning_rate": 9.08728471993435e-06, + "loss": 0.4674, + "step": 2948 + }, + { + "epoch": 0.8279056709713644, + "grad_norm": 0.6818065047264099, + "learning_rate": 9.086343706900105e-06, + "loss": 0.4739, + "step": 2949 + }, + { + "epoch": 0.8281864121280179, + "grad_norm": 0.7495522499084473, + "learning_rate": 9.085402257796671e-06, + "loss": 0.4641, + "step": 2950 + }, + { + "epoch": 0.8284671532846716, + "grad_norm": 0.6967856884002686, + "learning_rate": 9.084460372724514e-06, + "loss": 0.4173, + "step": 2951 + }, + { + "epoch": 0.8287478944413251, + "grad_norm": 0.7353311777114868, + "learning_rate": 9.083518051784143e-06, + "loss": 0.4236, + "step": 2952 + }, + { + "epoch": 0.8290286355979787, + "grad_norm": 0.7496944069862366, + "learning_rate": 9.082575295076121e-06, + "loss": 0.4191, + "step": 2953 + }, + { + "epoch": 0.8293093767546322, + "grad_norm": 0.7949837446212769, + "learning_rate": 9.081632102701053e-06, + "loss": 0.4701, + "step": 2954 + }, + { + "epoch": 0.8295901179112858, + "grad_norm": 0.7971872687339783, + "learning_rate": 9.080688474759587e-06, + "loss": 0.4395, + "step": 2955 + }, + { + "epoch": 0.8298708590679393, + "grad_norm": 0.7470804452896118, + "learning_rate": 9.079744411352422e-06, + "loss": 0.4189, + "step": 2956 + }, + { + "epoch": 0.8301516002245929, + "grad_norm": 0.8063899874687195, + "learning_rate": 9.078799912580305e-06, + "loss": 0.541, + "step": 2957 + }, + { + "epoch": 0.8304323413812464, + "grad_norm": 0.6667802333831787, + "learning_rate": 9.077854978544027e-06, + "loss": 0.4142, + "step": 2958 + }, + { + "epoch": 0.8307130825379001, + "grad_norm": 0.8121907114982605, + "learning_rate": 9.076909609344422e-06, + "loss": 0.4515, + "step": 2959 + }, + { + "epoch": 0.8309938236945537, + "grad_norm": 0.8178734183311462, + "learning_rate": 9.075963805082378e-06, + "loss": 0.4538, + "step": 2960 + }, + { + "epoch": 0.8312745648512072, + "grad_norm": 0.7481734752655029, + "learning_rate": 9.075017565858822e-06, + "loss": 0.4633, + "step": 2961 + }, + { + "epoch": 0.8315553060078608, + "grad_norm": 0.6679221391677856, + "learning_rate": 9.074070891774733e-06, + "loss": 0.4215, + "step": 2962 + }, + { + "epoch": 0.8318360471645143, + "grad_norm": 0.9229344725608826, + "learning_rate": 9.073123782931133e-06, + "loss": 0.4561, + "step": 2963 + }, + { + "epoch": 0.8321167883211679, + "grad_norm": 0.7118955254554749, + "learning_rate": 9.07217623942909e-06, + "loss": 0.4186, + "step": 2964 + }, + { + "epoch": 0.8323975294778214, + "grad_norm": 0.6958334445953369, + "learning_rate": 9.071228261369726e-06, + "loss": 0.3944, + "step": 2965 + }, + { + "epoch": 0.832678270634475, + "grad_norm": 0.718093752861023, + "learning_rate": 9.070279848854198e-06, + "loss": 0.4287, + "step": 2966 + }, + { + "epoch": 0.8329590117911286, + "grad_norm": 0.91909259557724, + "learning_rate": 9.069331001983715e-06, + "loss": 0.4863, + "step": 2967 + }, + { + "epoch": 0.8332397529477822, + "grad_norm": 0.8083088397979736, + "learning_rate": 9.068381720859532e-06, + "loss": 0.4631, + "step": 2968 + }, + { + "epoch": 0.8335204941044357, + "grad_norm": 0.710812509059906, + "learning_rate": 9.067432005582953e-06, + "loss": 0.4502, + "step": 2969 + }, + { + "epoch": 0.8338012352610893, + "grad_norm": 0.8467698693275452, + "learning_rate": 9.066481856255323e-06, + "loss": 0.505, + "step": 2970 + }, + { + "epoch": 0.8340819764177428, + "grad_norm": 0.7723353505134583, + "learning_rate": 9.065531272978039e-06, + "loss": 0.4744, + "step": 2971 + }, + { + "epoch": 0.8343627175743964, + "grad_norm": 0.9657412171363831, + "learning_rate": 9.064580255852537e-06, + "loss": 0.5081, + "step": 2972 + }, + { + "epoch": 0.83464345873105, + "grad_norm": 0.7016030550003052, + "learning_rate": 9.063628804980308e-06, + "loss": 0.445, + "step": 2973 + }, + { + "epoch": 0.8349241998877035, + "grad_norm": 0.7023826241493225, + "learning_rate": 9.062676920462882e-06, + "loss": 0.4317, + "step": 2974 + }, + { + "epoch": 0.835204941044357, + "grad_norm": 0.7801896929740906, + "learning_rate": 9.061724602401838e-06, + "loss": 0.461, + "step": 2975 + }, + { + "epoch": 0.8354856822010107, + "grad_norm": 0.7139474153518677, + "learning_rate": 9.060771850898806e-06, + "loss": 0.4247, + "step": 2976 + }, + { + "epoch": 0.8357664233576643, + "grad_norm": 0.7815770506858826, + "learning_rate": 9.05981866605545e-06, + "loss": 0.4165, + "step": 2977 + }, + { + "epoch": 0.8360471645143178, + "grad_norm": 0.7460588216781616, + "learning_rate": 9.058865047973495e-06, + "loss": 0.4716, + "step": 2978 + }, + { + "epoch": 0.8363279056709714, + "grad_norm": 0.6795597076416016, + "learning_rate": 9.057910996754704e-06, + "loss": 0.4401, + "step": 2979 + }, + { + "epoch": 0.8366086468276249, + "grad_norm": 0.6955469846725464, + "learning_rate": 9.056956512500882e-06, + "loss": 0.4591, + "step": 2980 + }, + { + "epoch": 0.8368893879842785, + "grad_norm": 0.6433377265930176, + "learning_rate": 9.056001595313892e-06, + "loss": 0.476, + "step": 2981 + }, + { + "epoch": 0.837170129140932, + "grad_norm": 0.7584505677223206, + "learning_rate": 9.055046245295634e-06, + "loss": 0.4811, + "step": 2982 + }, + { + "epoch": 0.8374508702975856, + "grad_norm": 0.6828482747077942, + "learning_rate": 9.054090462548058e-06, + "loss": 0.4782, + "step": 2983 + }, + { + "epoch": 0.8377316114542392, + "grad_norm": 0.7746155858039856, + "learning_rate": 9.053134247173158e-06, + "loss": 0.4278, + "step": 2984 + }, + { + "epoch": 0.8380123526108928, + "grad_norm": 0.6910436749458313, + "learning_rate": 9.052177599272976e-06, + "loss": 0.4889, + "step": 2985 + }, + { + "epoch": 0.8382930937675463, + "grad_norm": 0.742337167263031, + "learning_rate": 9.051220518949598e-06, + "loss": 0.4337, + "step": 2986 + }, + { + "epoch": 0.8385738349241999, + "grad_norm": 0.5568944215774536, + "learning_rate": 9.05026300630516e-06, + "loss": 0.4366, + "step": 2987 + }, + { + "epoch": 0.8388545760808535, + "grad_norm": 0.7992352247238159, + "learning_rate": 9.049305061441842e-06, + "loss": 0.4724, + "step": 2988 + }, + { + "epoch": 0.839135317237507, + "grad_norm": 0.800186038017273, + "learning_rate": 9.048346684461867e-06, + "loss": 0.4655, + "step": 2989 + }, + { + "epoch": 0.8394160583941606, + "grad_norm": 0.7031164169311523, + "learning_rate": 9.04738787546751e-06, + "loss": 0.4117, + "step": 2990 + }, + { + "epoch": 0.8396967995508141, + "grad_norm": 0.6729116439819336, + "learning_rate": 9.046428634561089e-06, + "loss": 0.4126, + "step": 2991 + }, + { + "epoch": 0.8399775407074677, + "grad_norm": 0.7558688521385193, + "learning_rate": 9.045468961844966e-06, + "loss": 0.4166, + "step": 2992 + }, + { + "epoch": 0.8402582818641213, + "grad_norm": 0.7242925763130188, + "learning_rate": 9.044508857421552e-06, + "loss": 0.4336, + "step": 2993 + }, + { + "epoch": 0.8405390230207749, + "grad_norm": 0.7024223804473877, + "learning_rate": 9.043548321393305e-06, + "loss": 0.4303, + "step": 2994 + }, + { + "epoch": 0.8408197641774284, + "grad_norm": 0.7583803534507751, + "learning_rate": 9.042587353862723e-06, + "loss": 0.4541, + "step": 2995 + }, + { + "epoch": 0.841100505334082, + "grad_norm": 0.7492231726646423, + "learning_rate": 9.041625954932363e-06, + "loss": 0.4946, + "step": 2996 + }, + { + "epoch": 0.8413812464907355, + "grad_norm": 0.8081197738647461, + "learning_rate": 9.04066412470481e-06, + "loss": 0.4623, + "step": 2997 + }, + { + "epoch": 0.8416619876473891, + "grad_norm": 0.7065810561180115, + "learning_rate": 9.03970186328271e-06, + "loss": 0.4022, + "step": 2998 + }, + { + "epoch": 0.8419427288040426, + "grad_norm": 0.8383647799491882, + "learning_rate": 9.038739170768751e-06, + "loss": 0.4854, + "step": 2999 + }, + { + "epoch": 0.8422234699606962, + "grad_norm": 0.6942509412765503, + "learning_rate": 9.037776047265661e-06, + "loss": 0.4587, + "step": 3000 + }, + { + "epoch": 0.8425042111173499, + "grad_norm": 0.8069538474082947, + "learning_rate": 9.03681249287622e-06, + "loss": 0.4329, + "step": 3001 + }, + { + "epoch": 0.8427849522740034, + "grad_norm": 0.8150032162666321, + "learning_rate": 9.035848507703253e-06, + "loss": 0.4601, + "step": 3002 + }, + { + "epoch": 0.843065693430657, + "grad_norm": 0.7180718779563904, + "learning_rate": 9.034884091849632e-06, + "loss": 0.4661, + "step": 3003 + }, + { + "epoch": 0.8433464345873105, + "grad_norm": 0.7106649875640869, + "learning_rate": 9.03391924541827e-06, + "loss": 0.4342, + "step": 3004 + }, + { + "epoch": 0.8436271757439641, + "grad_norm": 0.778570294380188, + "learning_rate": 9.032953968512132e-06, + "loss": 0.4594, + "step": 3005 + }, + { + "epoch": 0.8439079169006176, + "grad_norm": 0.7676113843917847, + "learning_rate": 9.031988261234226e-06, + "loss": 0.4634, + "step": 3006 + }, + { + "epoch": 0.8441886580572712, + "grad_norm": 0.6679142117500305, + "learning_rate": 9.031022123687607e-06, + "loss": 0.4497, + "step": 3007 + }, + { + "epoch": 0.8444693992139247, + "grad_norm": 0.8467196822166443, + "learning_rate": 9.030055555975373e-06, + "loss": 0.4472, + "step": 3008 + }, + { + "epoch": 0.8447501403705783, + "grad_norm": 0.8035460710525513, + "learning_rate": 9.029088558200672e-06, + "loss": 0.432, + "step": 3009 + }, + { + "epoch": 0.8450308815272319, + "grad_norm": 0.6552834510803223, + "learning_rate": 9.028121130466696e-06, + "loss": 0.4093, + "step": 3010 + }, + { + "epoch": 0.8453116226838855, + "grad_norm": 0.7501563429832458, + "learning_rate": 9.02715327287668e-06, + "loss": 0.4549, + "step": 3011 + }, + { + "epoch": 0.845592363840539, + "grad_norm": 0.6984012126922607, + "learning_rate": 9.026184985533913e-06, + "loss": 0.4796, + "step": 3012 + }, + { + "epoch": 0.8458731049971926, + "grad_norm": 0.6520709991455078, + "learning_rate": 9.02521626854172e-06, + "loss": 0.4339, + "step": 3013 + }, + { + "epoch": 0.8461538461538461, + "grad_norm": 0.742888331413269, + "learning_rate": 9.024247122003477e-06, + "loss": 0.4433, + "step": 3014 + }, + { + "epoch": 0.8464345873104997, + "grad_norm": 0.8158312439918518, + "learning_rate": 9.023277546022606e-06, + "loss": 0.4798, + "step": 3015 + }, + { + "epoch": 0.8467153284671532, + "grad_norm": 0.8237171769142151, + "learning_rate": 9.022307540702576e-06, + "loss": 0.4696, + "step": 3016 + }, + { + "epoch": 0.8469960696238068, + "grad_norm": 0.7952943444252014, + "learning_rate": 9.021337106146898e-06, + "loss": 0.5073, + "step": 3017 + }, + { + "epoch": 0.8472768107804605, + "grad_norm": 0.7563936710357666, + "learning_rate": 9.02036624245913e-06, + "loss": 0.4126, + "step": 3018 + }, + { + "epoch": 0.847557551937114, + "grad_norm": 0.675305187702179, + "learning_rate": 9.019394949742879e-06, + "loss": 0.4364, + "step": 3019 + }, + { + "epoch": 0.8478382930937676, + "grad_norm": 0.8370843529701233, + "learning_rate": 9.018423228101793e-06, + "loss": 0.4709, + "step": 3020 + }, + { + "epoch": 0.8481190342504211, + "grad_norm": 0.749310314655304, + "learning_rate": 9.017451077639569e-06, + "loss": 0.4018, + "step": 3021 + }, + { + "epoch": 0.8483997754070747, + "grad_norm": 0.6608916521072388, + "learning_rate": 9.01647849845995e-06, + "loss": 0.4688, + "step": 3022 + }, + { + "epoch": 0.8486805165637282, + "grad_norm": 0.7035723924636841, + "learning_rate": 9.015505490666722e-06, + "loss": 0.457, + "step": 3023 + }, + { + "epoch": 0.8489612577203818, + "grad_norm": 0.771713137626648, + "learning_rate": 9.014532054363719e-06, + "loss": 0.4467, + "step": 3024 + }, + { + "epoch": 0.8492419988770353, + "grad_norm": 0.7923526167869568, + "learning_rate": 9.013558189654819e-06, + "loss": 0.5159, + "step": 3025 + }, + { + "epoch": 0.8495227400336889, + "grad_norm": 0.6970590949058533, + "learning_rate": 9.012583896643949e-06, + "loss": 0.4514, + "step": 3026 + }, + { + "epoch": 0.8498034811903425, + "grad_norm": 0.6298572421073914, + "learning_rate": 9.011609175435077e-06, + "loss": 0.4585, + "step": 3027 + }, + { + "epoch": 0.8500842223469961, + "grad_norm": 0.7198712825775146, + "learning_rate": 9.01063402613222e-06, + "loss": 0.516, + "step": 3028 + }, + { + "epoch": 0.8503649635036497, + "grad_norm": 0.8174310922622681, + "learning_rate": 9.009658448839441e-06, + "loss": 0.4829, + "step": 3029 + }, + { + "epoch": 0.8506457046603032, + "grad_norm": 0.660001277923584, + "learning_rate": 9.008682443660848e-06, + "loss": 0.4184, + "step": 3030 + }, + { + "epoch": 0.8509264458169568, + "grad_norm": 0.8710374236106873, + "learning_rate": 9.00770601070059e-06, + "loss": 0.4829, + "step": 3031 + }, + { + "epoch": 0.8512071869736103, + "grad_norm": 0.7065457105636597, + "learning_rate": 9.00672915006287e-06, + "loss": 0.4596, + "step": 3032 + }, + { + "epoch": 0.8514879281302639, + "grad_norm": 0.8196362257003784, + "learning_rate": 9.005751861851933e-06, + "loss": 0.5039, + "step": 3033 + }, + { + "epoch": 0.8517686692869174, + "grad_norm": 0.6606584191322327, + "learning_rate": 9.004774146172067e-06, + "loss": 0.4225, + "step": 3034 + }, + { + "epoch": 0.8520494104435711, + "grad_norm": 0.7076795697212219, + "learning_rate": 9.003796003127606e-06, + "loss": 0.4868, + "step": 3035 + }, + { + "epoch": 0.8523301516002246, + "grad_norm": 0.6806330680847168, + "learning_rate": 9.002817432822934e-06, + "loss": 0.4269, + "step": 3036 + }, + { + "epoch": 0.8526108927568782, + "grad_norm": 0.6327815651893616, + "learning_rate": 9.00183843536248e-06, + "loss": 0.4746, + "step": 3037 + }, + { + "epoch": 0.8528916339135317, + "grad_norm": 0.7250660061836243, + "learning_rate": 9.00085901085071e-06, + "loss": 0.4664, + "step": 3038 + }, + { + "epoch": 0.8531723750701853, + "grad_norm": 0.6600928902626038, + "learning_rate": 8.99987915939215e-06, + "loss": 0.3859, + "step": 3039 + }, + { + "epoch": 0.8534531162268388, + "grad_norm": 0.7053912281990051, + "learning_rate": 8.998898881091358e-06, + "loss": 0.4645, + "step": 3040 + }, + { + "epoch": 0.8537338573834924, + "grad_norm": 0.688148558139801, + "learning_rate": 8.997918176052945e-06, + "loss": 0.4288, + "step": 3041 + }, + { + "epoch": 0.8540145985401459, + "grad_norm": 0.7710590362548828, + "learning_rate": 8.996937044381565e-06, + "loss": 0.4707, + "step": 3042 + }, + { + "epoch": 0.8542953396967996, + "grad_norm": 0.7533964514732361, + "learning_rate": 8.99595548618192e-06, + "loss": 0.4476, + "step": 3043 + }, + { + "epoch": 0.8545760808534532, + "grad_norm": 0.6575049161911011, + "learning_rate": 8.994973501558754e-06, + "loss": 0.4097, + "step": 3044 + }, + { + "epoch": 0.8548568220101067, + "grad_norm": 0.7209513187408447, + "learning_rate": 8.99399109061686e-06, + "loss": 0.4277, + "step": 3045 + }, + { + "epoch": 0.8551375631667603, + "grad_norm": 0.744247317314148, + "learning_rate": 8.993008253461073e-06, + "loss": 0.4202, + "step": 3046 + }, + { + "epoch": 0.8554183043234138, + "grad_norm": 0.7632492780685425, + "learning_rate": 8.992024990196276e-06, + "loss": 0.4748, + "step": 3047 + }, + { + "epoch": 0.8556990454800674, + "grad_norm": 0.7649577856063843, + "learning_rate": 8.991041300927397e-06, + "loss": 0.4327, + "step": 3048 + }, + { + "epoch": 0.8559797866367209, + "grad_norm": 0.7533644437789917, + "learning_rate": 8.990057185759409e-06, + "loss": 0.468, + "step": 3049 + }, + { + "epoch": 0.8562605277933745, + "grad_norm": 0.7657700777053833, + "learning_rate": 8.98907264479733e-06, + "loss": 0.4726, + "step": 3050 + }, + { + "epoch": 0.856541268950028, + "grad_norm": 0.74150550365448, + "learning_rate": 8.988087678146225e-06, + "loss": 0.4649, + "step": 3051 + }, + { + "epoch": 0.8568220101066817, + "grad_norm": 0.7767965793609619, + "learning_rate": 8.987102285911204e-06, + "loss": 0.4911, + "step": 3052 + }, + { + "epoch": 0.8571027512633352, + "grad_norm": 0.8834661841392517, + "learning_rate": 8.986116468197422e-06, + "loss": 0.5185, + "step": 3053 + }, + { + "epoch": 0.8573834924199888, + "grad_norm": 0.8438416719436646, + "learning_rate": 8.985130225110077e-06, + "loss": 0.473, + "step": 3054 + }, + { + "epoch": 0.8576642335766423, + "grad_norm": 0.6546692848205566, + "learning_rate": 8.984143556754416e-06, + "loss": 0.4466, + "step": 3055 + }, + { + "epoch": 0.8579449747332959, + "grad_norm": 0.8710799217224121, + "learning_rate": 8.983156463235731e-06, + "loss": 0.4892, + "step": 3056 + }, + { + "epoch": 0.8582257158899494, + "grad_norm": 0.6808534264564514, + "learning_rate": 8.982168944659359e-06, + "loss": 0.4673, + "step": 3057 + }, + { + "epoch": 0.858506457046603, + "grad_norm": 0.6909962892532349, + "learning_rate": 8.981181001130678e-06, + "loss": 0.4454, + "step": 3058 + }, + { + "epoch": 0.8587871982032566, + "grad_norm": 0.7451123595237732, + "learning_rate": 8.98019263275512e-06, + "loss": 0.4541, + "step": 3059 + }, + { + "epoch": 0.8590679393599102, + "grad_norm": 0.7801520824432373, + "learning_rate": 8.979203839638155e-06, + "loss": 0.3956, + "step": 3060 + }, + { + "epoch": 0.8593486805165638, + "grad_norm": 0.781225323677063, + "learning_rate": 8.978214621885301e-06, + "loss": 0.4904, + "step": 3061 + }, + { + "epoch": 0.8596294216732173, + "grad_norm": 0.833699107170105, + "learning_rate": 8.977224979602123e-06, + "loss": 0.4668, + "step": 3062 + }, + { + "epoch": 0.8599101628298709, + "grad_norm": 0.8141199350357056, + "learning_rate": 8.976234912894226e-06, + "loss": 0.5174, + "step": 3063 + }, + { + "epoch": 0.8601909039865244, + "grad_norm": 0.7246940732002258, + "learning_rate": 8.975244421867267e-06, + "loss": 0.4553, + "step": 3064 + }, + { + "epoch": 0.860471645143178, + "grad_norm": 0.6886260509490967, + "learning_rate": 8.974253506626943e-06, + "loss": 0.4545, + "step": 3065 + }, + { + "epoch": 0.8607523862998315, + "grad_norm": 0.7634584307670593, + "learning_rate": 8.973262167278997e-06, + "loss": 0.4305, + "step": 3066 + }, + { + "epoch": 0.8610331274564851, + "grad_norm": 0.6877836585044861, + "learning_rate": 8.972270403929223e-06, + "loss": 0.4367, + "step": 3067 + }, + { + "epoch": 0.8613138686131386, + "grad_norm": 0.7027422785758972, + "learning_rate": 8.971278216683454e-06, + "loss": 0.4301, + "step": 3068 + }, + { + "epoch": 0.8615946097697923, + "grad_norm": 0.6292520761489868, + "learning_rate": 8.970285605647568e-06, + "loss": 0.4273, + "step": 3069 + }, + { + "epoch": 0.8618753509264458, + "grad_norm": 0.6710954904556274, + "learning_rate": 8.969292570927493e-06, + "loss": 0.4557, + "step": 3070 + }, + { + "epoch": 0.8621560920830994, + "grad_norm": 0.7669581770896912, + "learning_rate": 8.968299112629196e-06, + "loss": 0.4891, + "step": 3071 + }, + { + "epoch": 0.862436833239753, + "grad_norm": 0.7548537254333496, + "learning_rate": 8.967305230858696e-06, + "loss": 0.4537, + "step": 3072 + }, + { + "epoch": 0.8627175743964065, + "grad_norm": 0.6805199384689331, + "learning_rate": 8.966310925722054e-06, + "loss": 0.4135, + "step": 3073 + }, + { + "epoch": 0.8629983155530601, + "grad_norm": 0.714707612991333, + "learning_rate": 8.965316197325374e-06, + "loss": 0.4524, + "step": 3074 + }, + { + "epoch": 0.8632790567097136, + "grad_norm": 0.6804215908050537, + "learning_rate": 8.964321045774808e-06, + "loss": 0.4432, + "step": 3075 + }, + { + "epoch": 0.8635597978663672, + "grad_norm": 0.789669394493103, + "learning_rate": 8.963325471176552e-06, + "loss": 0.4624, + "step": 3076 + }, + { + "epoch": 0.8638405390230208, + "grad_norm": 0.9625343084335327, + "learning_rate": 8.962329473636848e-06, + "loss": 0.4467, + "step": 3077 + }, + { + "epoch": 0.8641212801796744, + "grad_norm": 0.8646829128265381, + "learning_rate": 8.961333053261984e-06, + "loss": 0.4651, + "step": 3078 + }, + { + "epoch": 0.8644020213363279, + "grad_norm": 0.7739148736000061, + "learning_rate": 8.96033621015829e-06, + "loss": 0.4417, + "step": 3079 + }, + { + "epoch": 0.8646827624929815, + "grad_norm": 0.9266568422317505, + "learning_rate": 8.959338944432144e-06, + "loss": 0.4283, + "step": 3080 + }, + { + "epoch": 0.864963503649635, + "grad_norm": 0.8844408392906189, + "learning_rate": 8.958341256189966e-06, + "loss": 0.5005, + "step": 3081 + }, + { + "epoch": 0.8652442448062886, + "grad_norm": 0.7739181518554688, + "learning_rate": 8.957343145538225e-06, + "loss": 0.4523, + "step": 3082 + }, + { + "epoch": 0.8655249859629421, + "grad_norm": 0.712052047252655, + "learning_rate": 8.956344612583433e-06, + "loss": 0.4531, + "step": 3083 + }, + { + "epoch": 0.8658057271195957, + "grad_norm": 0.7869881987571716, + "learning_rate": 8.955345657432144e-06, + "loss": 0.4482, + "step": 3084 + }, + { + "epoch": 0.8660864682762492, + "grad_norm": 0.8225265741348267, + "learning_rate": 8.954346280190966e-06, + "loss": 0.4773, + "step": 3085 + }, + { + "epoch": 0.8663672094329029, + "grad_norm": 0.7096180319786072, + "learning_rate": 8.953346480966543e-06, + "loss": 0.4322, + "step": 3086 + }, + { + "epoch": 0.8666479505895565, + "grad_norm": 0.7668026089668274, + "learning_rate": 8.952346259865568e-06, + "loss": 0.4447, + "step": 3087 + }, + { + "epoch": 0.86692869174621, + "grad_norm": 0.7819424271583557, + "learning_rate": 8.951345616994777e-06, + "loss": 0.4906, + "step": 3088 + }, + { + "epoch": 0.8672094329028636, + "grad_norm": 0.7087759971618652, + "learning_rate": 8.950344552460955e-06, + "loss": 0.4261, + "step": 3089 + }, + { + "epoch": 0.8674901740595171, + "grad_norm": 0.6854142546653748, + "learning_rate": 8.949343066370927e-06, + "loss": 0.4386, + "step": 3090 + }, + { + "epoch": 0.8677709152161707, + "grad_norm": 0.8375086784362793, + "learning_rate": 8.948341158831565e-06, + "loss": 0.4754, + "step": 3091 + }, + { + "epoch": 0.8680516563728242, + "grad_norm": 0.7688049077987671, + "learning_rate": 8.947338829949789e-06, + "loss": 0.4668, + "step": 3092 + }, + { + "epoch": 0.8683323975294778, + "grad_norm": 0.6722404956817627, + "learning_rate": 8.946336079832562e-06, + "loss": 0.4403, + "step": 3093 + }, + { + "epoch": 0.8686131386861314, + "grad_norm": 0.7951295375823975, + "learning_rate": 8.945332908586887e-06, + "loss": 0.409, + "step": 3094 + }, + { + "epoch": 0.868893879842785, + "grad_norm": 0.8671542406082153, + "learning_rate": 8.944329316319819e-06, + "loss": 0.4759, + "step": 3095 + }, + { + "epoch": 0.8691746209994385, + "grad_norm": 0.6443971991539001, + "learning_rate": 8.943325303138455e-06, + "loss": 0.4458, + "step": 3096 + }, + { + "epoch": 0.8694553621560921, + "grad_norm": 0.7402511835098267, + "learning_rate": 8.942320869149933e-06, + "loss": 0.4943, + "step": 3097 + }, + { + "epoch": 0.8697361033127456, + "grad_norm": 0.7752165198326111, + "learning_rate": 8.941316014461448e-06, + "loss": 0.474, + "step": 3098 + }, + { + "epoch": 0.8700168444693992, + "grad_norm": 0.6883082389831543, + "learning_rate": 8.940310739180227e-06, + "loss": 0.4641, + "step": 3099 + }, + { + "epoch": 0.8702975856260527, + "grad_norm": 0.7413114309310913, + "learning_rate": 8.939305043413543e-06, + "loss": 0.4447, + "step": 3100 + }, + { + "epoch": 0.8705783267827063, + "grad_norm": 0.7216244339942932, + "learning_rate": 8.938298927268728e-06, + "loss": 0.4962, + "step": 3101 + }, + { + "epoch": 0.87085906793936, + "grad_norm": 0.6698883771896362, + "learning_rate": 8.937292390853136e-06, + "loss": 0.4559, + "step": 3102 + }, + { + "epoch": 0.8711398090960135, + "grad_norm": 0.6504644155502319, + "learning_rate": 8.936285434274189e-06, + "loss": 0.4342, + "step": 3103 + }, + { + "epoch": 0.8714205502526671, + "grad_norm": 0.7135512232780457, + "learning_rate": 8.935278057639336e-06, + "loss": 0.4597, + "step": 3104 + }, + { + "epoch": 0.8717012914093206, + "grad_norm": 0.6523886322975159, + "learning_rate": 8.934270261056081e-06, + "loss": 0.4278, + "step": 3105 + }, + { + "epoch": 0.8719820325659742, + "grad_norm": 0.6546775698661804, + "learning_rate": 8.93326204463197e-06, + "loss": 0.4594, + "step": 3106 + }, + { + "epoch": 0.8722627737226277, + "grad_norm": 0.6894417405128479, + "learning_rate": 8.932253408474592e-06, + "loss": 0.4453, + "step": 3107 + }, + { + "epoch": 0.8725435148792813, + "grad_norm": 0.6604836583137512, + "learning_rate": 8.931244352691584e-06, + "loss": 0.4097, + "step": 3108 + }, + { + "epoch": 0.8728242560359348, + "grad_norm": 0.731212854385376, + "learning_rate": 8.930234877390626e-06, + "loss": 0.4184, + "step": 3109 + }, + { + "epoch": 0.8731049971925884, + "grad_norm": 0.6962615847587585, + "learning_rate": 8.929224982679441e-06, + "loss": 0.459, + "step": 3110 + }, + { + "epoch": 0.873385738349242, + "grad_norm": 0.6900382041931152, + "learning_rate": 8.928214668665802e-06, + "loss": 0.4406, + "step": 3111 + }, + { + "epoch": 0.8736664795058956, + "grad_norm": 0.6988445520401001, + "learning_rate": 8.92720393545752e-06, + "loss": 0.4368, + "step": 3112 + }, + { + "epoch": 0.8739472206625492, + "grad_norm": 0.6465603113174438, + "learning_rate": 8.926192783162456e-06, + "loss": 0.4174, + "step": 3113 + }, + { + "epoch": 0.8742279618192027, + "grad_norm": 0.6586212515830994, + "learning_rate": 8.925181211888514e-06, + "loss": 0.4466, + "step": 3114 + }, + { + "epoch": 0.8745087029758563, + "grad_norm": 0.6625884771347046, + "learning_rate": 8.92416922174364e-06, + "loss": 0.4329, + "step": 3115 + }, + { + "epoch": 0.8747894441325098, + "grad_norm": 0.7261552214622498, + "learning_rate": 8.923156812835831e-06, + "loss": 0.4247, + "step": 3116 + }, + { + "epoch": 0.8750701852891634, + "grad_norm": 0.614919900894165, + "learning_rate": 8.922143985273125e-06, + "loss": 0.4021, + "step": 3117 + }, + { + "epoch": 0.8753509264458169, + "grad_norm": 0.7092061042785645, + "learning_rate": 8.921130739163602e-06, + "loss": 0.4417, + "step": 3118 + }, + { + "epoch": 0.8756316676024706, + "grad_norm": 0.6687312126159668, + "learning_rate": 8.92011707461539e-06, + "loss": 0.4381, + "step": 3119 + }, + { + "epoch": 0.8759124087591241, + "grad_norm": 0.8217565417289734, + "learning_rate": 8.91910299173666e-06, + "loss": 0.4336, + "step": 3120 + }, + { + "epoch": 0.8761931499157777, + "grad_norm": 0.6277878284454346, + "learning_rate": 8.918088490635633e-06, + "loss": 0.4055, + "step": 3121 + }, + { + "epoch": 0.8764738910724312, + "grad_norm": 0.6974098086357117, + "learning_rate": 8.917073571420565e-06, + "loss": 0.4409, + "step": 3122 + }, + { + "epoch": 0.8767546322290848, + "grad_norm": 0.757596492767334, + "learning_rate": 8.916058234199766e-06, + "loss": 0.4763, + "step": 3123 + }, + { + "epoch": 0.8770353733857383, + "grad_norm": 0.7230308651924133, + "learning_rate": 8.915042479081584e-06, + "loss": 0.4771, + "step": 3124 + }, + { + "epoch": 0.8773161145423919, + "grad_norm": 0.667575478553772, + "learning_rate": 8.914026306174413e-06, + "loss": 0.4139, + "step": 3125 + }, + { + "epoch": 0.8775968556990454, + "grad_norm": 0.6815164089202881, + "learning_rate": 8.913009715586695e-06, + "loss": 0.4136, + "step": 3126 + }, + { + "epoch": 0.877877596855699, + "grad_norm": 0.7295694351196289, + "learning_rate": 8.911992707426915e-06, + "loss": 0.4365, + "step": 3127 + }, + { + "epoch": 0.8781583380123527, + "grad_norm": 0.6554813385009766, + "learning_rate": 8.910975281803599e-06, + "loss": 0.4426, + "step": 3128 + }, + { + "epoch": 0.8784390791690062, + "grad_norm": 0.6871294379234314, + "learning_rate": 8.909957438825324e-06, + "loss": 0.4543, + "step": 3129 + }, + { + "epoch": 0.8787198203256598, + "grad_norm": 0.7401013374328613, + "learning_rate": 8.908939178600702e-06, + "loss": 0.4397, + "step": 3130 + }, + { + "epoch": 0.8790005614823133, + "grad_norm": 0.7382338047027588, + "learning_rate": 8.907920501238402e-06, + "loss": 0.447, + "step": 3131 + }, + { + "epoch": 0.8792813026389669, + "grad_norm": 0.8176373839378357, + "learning_rate": 8.906901406847127e-06, + "loss": 0.4459, + "step": 3132 + }, + { + "epoch": 0.8795620437956204, + "grad_norm": 0.7015848159790039, + "learning_rate": 8.905881895535628e-06, + "loss": 0.4162, + "step": 3133 + }, + { + "epoch": 0.879842784952274, + "grad_norm": 0.6791051030158997, + "learning_rate": 8.904861967412702e-06, + "loss": 0.4037, + "step": 3134 + }, + { + "epoch": 0.8801235261089275, + "grad_norm": 0.7253435254096985, + "learning_rate": 8.90384162258719e-06, + "loss": 0.4687, + "step": 3135 + }, + { + "epoch": 0.8804042672655812, + "grad_norm": 0.732147753238678, + "learning_rate": 8.902820861167978e-06, + "loss": 0.4808, + "step": 3136 + }, + { + "epoch": 0.8806850084222347, + "grad_norm": 0.8130166530609131, + "learning_rate": 8.901799683263993e-06, + "loss": 0.4357, + "step": 3137 + }, + { + "epoch": 0.8809657495788883, + "grad_norm": 0.8349103927612305, + "learning_rate": 8.90077808898421e-06, + "loss": 0.4633, + "step": 3138 + }, + { + "epoch": 0.8812464907355418, + "grad_norm": 0.7550891637802124, + "learning_rate": 8.899756078437645e-06, + "loss": 0.436, + "step": 3139 + }, + { + "epoch": 0.8815272318921954, + "grad_norm": 0.833274245262146, + "learning_rate": 8.898733651733362e-06, + "loss": 0.4437, + "step": 3140 + }, + { + "epoch": 0.881807973048849, + "grad_norm": 0.8202539682388306, + "learning_rate": 8.897710808980472e-06, + "loss": 0.4464, + "step": 3141 + }, + { + "epoch": 0.8820887142055025, + "grad_norm": 0.7455984354019165, + "learning_rate": 8.896687550288119e-06, + "loss": 0.3987, + "step": 3142 + }, + { + "epoch": 0.882369455362156, + "grad_norm": 0.9817519783973694, + "learning_rate": 8.895663875765503e-06, + "loss": 0.456, + "step": 3143 + }, + { + "epoch": 0.8826501965188096, + "grad_norm": 0.7784855961799622, + "learning_rate": 8.894639785521866e-06, + "loss": 0.4575, + "step": 3144 + }, + { + "epoch": 0.8829309376754633, + "grad_norm": 0.7350506782531738, + "learning_rate": 8.893615279666488e-06, + "loss": 0.4711, + "step": 3145 + }, + { + "epoch": 0.8832116788321168, + "grad_norm": 0.8772125840187073, + "learning_rate": 8.8925903583087e-06, + "loss": 0.467, + "step": 3146 + }, + { + "epoch": 0.8834924199887704, + "grad_norm": 0.8276565670967102, + "learning_rate": 8.891565021557877e-06, + "loss": 0.4205, + "step": 3147 + }, + { + "epoch": 0.8837731611454239, + "grad_norm": 0.7918325066566467, + "learning_rate": 8.890539269523435e-06, + "loss": 0.4498, + "step": 3148 + }, + { + "epoch": 0.8840539023020775, + "grad_norm": 0.8533987998962402, + "learning_rate": 8.889513102314833e-06, + "loss": 0.4483, + "step": 3149 + }, + { + "epoch": 0.884334643458731, + "grad_norm": 0.7781782746315002, + "learning_rate": 8.888486520041583e-06, + "loss": 0.4347, + "step": 3150 + }, + { + "epoch": 0.8846153846153846, + "grad_norm": 0.7314184904098511, + "learning_rate": 8.887459522813232e-06, + "loss": 0.4787, + "step": 3151 + }, + { + "epoch": 0.8848961257720381, + "grad_norm": 0.6887636780738831, + "learning_rate": 8.886432110739374e-06, + "loss": 0.4403, + "step": 3152 + }, + { + "epoch": 0.8851768669286918, + "grad_norm": 0.7512714862823486, + "learning_rate": 8.885404283929651e-06, + "loss": 0.432, + "step": 3153 + }, + { + "epoch": 0.8854576080853453, + "grad_norm": 0.8041152954101562, + "learning_rate": 8.884376042493742e-06, + "loss": 0.4581, + "step": 3154 + }, + { + "epoch": 0.8857383492419989, + "grad_norm": 0.7227070927619934, + "learning_rate": 8.883347386541378e-06, + "loss": 0.4137, + "step": 3155 + }, + { + "epoch": 0.8860190903986525, + "grad_norm": 0.6934637427330017, + "learning_rate": 8.882318316182333e-06, + "loss": 0.449, + "step": 3156 + }, + { + "epoch": 0.886299831555306, + "grad_norm": 0.7745835781097412, + "learning_rate": 8.881288831526416e-06, + "loss": 0.4566, + "step": 3157 + }, + { + "epoch": 0.8865805727119596, + "grad_norm": 0.6885500550270081, + "learning_rate": 8.880258932683493e-06, + "loss": 0.4156, + "step": 3158 + }, + { + "epoch": 0.8868613138686131, + "grad_norm": 0.7272955179214478, + "learning_rate": 8.879228619763467e-06, + "loss": 0.4832, + "step": 3159 + }, + { + "epoch": 0.8871420550252667, + "grad_norm": 0.6531155109405518, + "learning_rate": 8.878197892876284e-06, + "loss": 0.4589, + "step": 3160 + }, + { + "epoch": 0.8874227961819202, + "grad_norm": 0.7912184000015259, + "learning_rate": 8.877166752131939e-06, + "loss": 0.5203, + "step": 3161 + }, + { + "epoch": 0.8877035373385739, + "grad_norm": 0.7932146191596985, + "learning_rate": 8.87613519764047e-06, + "loss": 0.4992, + "step": 3162 + }, + { + "epoch": 0.8879842784952274, + "grad_norm": 0.8007762432098389, + "learning_rate": 8.875103229511957e-06, + "loss": 0.4631, + "step": 3163 + }, + { + "epoch": 0.888265019651881, + "grad_norm": 0.7280421257019043, + "learning_rate": 8.874070847856524e-06, + "loss": 0.4738, + "step": 3164 + }, + { + "epoch": 0.8885457608085345, + "grad_norm": 0.6544811725616455, + "learning_rate": 8.87303805278434e-06, + "loss": 0.4413, + "step": 3165 + }, + { + "epoch": 0.8888265019651881, + "grad_norm": 0.67585688829422, + "learning_rate": 8.872004844405622e-06, + "loss": 0.4353, + "step": 3166 + }, + { + "epoch": 0.8891072431218416, + "grad_norm": 0.9749636054039001, + "learning_rate": 8.870971222830624e-06, + "loss": 0.5339, + "step": 3167 + }, + { + "epoch": 0.8893879842784952, + "grad_norm": 0.8017314076423645, + "learning_rate": 8.86993718816965e-06, + "loss": 0.4825, + "step": 3168 + }, + { + "epoch": 0.8896687254351487, + "grad_norm": 0.6650331020355225, + "learning_rate": 8.868902740533045e-06, + "loss": 0.4365, + "step": 3169 + }, + { + "epoch": 0.8899494665918024, + "grad_norm": 0.8409380316734314, + "learning_rate": 8.8678678800312e-06, + "loss": 0.3997, + "step": 3170 + }, + { + "epoch": 0.890230207748456, + "grad_norm": 0.8529126644134521, + "learning_rate": 8.866832606774544e-06, + "loss": 0.4469, + "step": 3171 + }, + { + "epoch": 0.8905109489051095, + "grad_norm": 0.6713091731071472, + "learning_rate": 8.865796920873561e-06, + "loss": 0.4477, + "step": 3172 + }, + { + "epoch": 0.8907916900617631, + "grad_norm": 0.7690662741661072, + "learning_rate": 8.864760822438769e-06, + "loss": 0.4449, + "step": 3173 + }, + { + "epoch": 0.8910724312184166, + "grad_norm": 0.8488250374794006, + "learning_rate": 8.863724311580738e-06, + "loss": 0.4371, + "step": 3174 + }, + { + "epoch": 0.8913531723750702, + "grad_norm": 0.7574658989906311, + "learning_rate": 8.862687388410073e-06, + "loss": 0.4491, + "step": 3175 + }, + { + "epoch": 0.8916339135317237, + "grad_norm": 0.8288156986236572, + "learning_rate": 8.86165005303743e-06, + "loss": 0.4468, + "step": 3176 + }, + { + "epoch": 0.8919146546883773, + "grad_norm": 0.8906120657920837, + "learning_rate": 8.860612305573508e-06, + "loss": 0.479, + "step": 3177 + }, + { + "epoch": 0.8921953958450309, + "grad_norm": 0.7341766953468323, + "learning_rate": 8.85957414612905e-06, + "loss": 0.4704, + "step": 3178 + }, + { + "epoch": 0.8924761370016845, + "grad_norm": 0.7411095499992371, + "learning_rate": 8.858535574814838e-06, + "loss": 0.4399, + "step": 3179 + }, + { + "epoch": 0.892756878158338, + "grad_norm": 0.7041527628898621, + "learning_rate": 8.857496591741705e-06, + "loss": 0.4901, + "step": 3180 + }, + { + "epoch": 0.8930376193149916, + "grad_norm": 0.7300823330879211, + "learning_rate": 8.856457197020526e-06, + "loss": 0.4674, + "step": 3181 + }, + { + "epoch": 0.8933183604716451, + "grad_norm": 0.6751479506492615, + "learning_rate": 8.855417390762212e-06, + "loss": 0.4461, + "step": 3182 + }, + { + "epoch": 0.8935991016282987, + "grad_norm": 0.7834952473640442, + "learning_rate": 8.854377173077733e-06, + "loss": 0.4578, + "step": 3183 + }, + { + "epoch": 0.8938798427849522, + "grad_norm": 0.5938035249710083, + "learning_rate": 8.853336544078089e-06, + "loss": 0.4061, + "step": 3184 + }, + { + "epoch": 0.8941605839416058, + "grad_norm": 0.8587839007377625, + "learning_rate": 8.852295503874331e-06, + "loss": 0.4948, + "step": 3185 + }, + { + "epoch": 0.8944413250982594, + "grad_norm": 0.7783791422843933, + "learning_rate": 8.851254052577555e-06, + "loss": 0.4443, + "step": 3186 + }, + { + "epoch": 0.894722066254913, + "grad_norm": 0.6832489967346191, + "learning_rate": 8.850212190298894e-06, + "loss": 0.4167, + "step": 3187 + }, + { + "epoch": 0.8950028074115666, + "grad_norm": 0.6670812368392944, + "learning_rate": 8.849169917149532e-06, + "loss": 0.427, + "step": 3188 + }, + { + "epoch": 0.8952835485682201, + "grad_norm": 0.8351960778236389, + "learning_rate": 8.848127233240693e-06, + "loss": 0.4807, + "step": 3189 + }, + { + "epoch": 0.8955642897248737, + "grad_norm": 0.7806240320205688, + "learning_rate": 8.847084138683644e-06, + "loss": 0.4548, + "step": 3190 + }, + { + "epoch": 0.8958450308815272, + "grad_norm": 0.6731281280517578, + "learning_rate": 8.8460406335897e-06, + "loss": 0.4007, + "step": 3191 + }, + { + "epoch": 0.8961257720381808, + "grad_norm": 0.6744766235351562, + "learning_rate": 8.844996718070218e-06, + "loss": 0.4357, + "step": 3192 + }, + { + "epoch": 0.8964065131948343, + "grad_norm": 0.8585912585258484, + "learning_rate": 8.843952392236595e-06, + "loss": 0.4865, + "step": 3193 + }, + { + "epoch": 0.8966872543514879, + "grad_norm": 0.8856034278869629, + "learning_rate": 8.842907656200277e-06, + "loss": 0.45, + "step": 3194 + }, + { + "epoch": 0.8969679955081415, + "grad_norm": 0.698112428188324, + "learning_rate": 8.841862510072751e-06, + "loss": 0.427, + "step": 3195 + }, + { + "epoch": 0.8972487366647951, + "grad_norm": 0.750064492225647, + "learning_rate": 8.84081695396555e-06, + "loss": 0.4826, + "step": 3196 + }, + { + "epoch": 0.8975294778214487, + "grad_norm": 0.6911453008651733, + "learning_rate": 8.839770987990245e-06, + "loss": 0.4264, + "step": 3197 + }, + { + "epoch": 0.8978102189781022, + "grad_norm": 0.744818925857544, + "learning_rate": 8.838724612258462e-06, + "loss": 0.4018, + "step": 3198 + }, + { + "epoch": 0.8980909601347558, + "grad_norm": 0.8290770053863525, + "learning_rate": 8.837677826881858e-06, + "loss": 0.4159, + "step": 3199 + }, + { + "epoch": 0.8983717012914093, + "grad_norm": 0.6693921089172363, + "learning_rate": 8.836630631972142e-06, + "loss": 0.4308, + "step": 3200 + }, + { + "epoch": 0.8986524424480629, + "grad_norm": 0.7737688422203064, + "learning_rate": 8.835583027641064e-06, + "loss": 0.4225, + "step": 3201 + }, + { + "epoch": 0.8989331836047164, + "grad_norm": 0.9004019498825073, + "learning_rate": 8.834535014000417e-06, + "loss": 0.4812, + "step": 3202 + }, + { + "epoch": 0.89921392476137, + "grad_norm": 0.6781492233276367, + "learning_rate": 8.833486591162037e-06, + "loss": 0.4397, + "step": 3203 + }, + { + "epoch": 0.8994946659180236, + "grad_norm": 0.6477671265602112, + "learning_rate": 8.832437759237808e-06, + "loss": 0.3967, + "step": 3204 + }, + { + "epoch": 0.8997754070746772, + "grad_norm": 0.672127902507782, + "learning_rate": 8.831388518339652e-06, + "loss": 0.4118, + "step": 3205 + }, + { + "epoch": 0.9000561482313307, + "grad_norm": 0.7684900760650635, + "learning_rate": 8.830338868579542e-06, + "loss": 0.475, + "step": 3206 + }, + { + "epoch": 0.9003368893879843, + "grad_norm": 0.7863741517066956, + "learning_rate": 8.829288810069486e-06, + "loss": 0.4807, + "step": 3207 + }, + { + "epoch": 0.9006176305446378, + "grad_norm": 0.7152606844902039, + "learning_rate": 8.82823834292154e-06, + "loss": 0.4412, + "step": 3208 + }, + { + "epoch": 0.9008983717012914, + "grad_norm": 0.7831460237503052, + "learning_rate": 8.827187467247806e-06, + "loss": 0.4698, + "step": 3209 + }, + { + "epoch": 0.9011791128579449, + "grad_norm": 0.7695897817611694, + "learning_rate": 8.826136183160424e-06, + "loss": 0.4737, + "step": 3210 + }, + { + "epoch": 0.9014598540145985, + "grad_norm": 0.7289673089981079, + "learning_rate": 8.825084490771583e-06, + "loss": 0.4536, + "step": 3211 + }, + { + "epoch": 0.9017405951712522, + "grad_norm": 0.7347771525382996, + "learning_rate": 8.82403239019351e-06, + "loss": 0.4325, + "step": 3212 + }, + { + "epoch": 0.9020213363279057, + "grad_norm": 0.7328828573226929, + "learning_rate": 8.822979881538482e-06, + "loss": 0.4169, + "step": 3213 + }, + { + "epoch": 0.9023020774845593, + "grad_norm": 0.8392121195793152, + "learning_rate": 8.821926964918814e-06, + "loss": 0.4613, + "step": 3214 + }, + { + "epoch": 0.9025828186412128, + "grad_norm": 0.7489368915557861, + "learning_rate": 8.820873640446866e-06, + "loss": 0.461, + "step": 3215 + }, + { + "epoch": 0.9028635597978664, + "grad_norm": 0.6864010691642761, + "learning_rate": 8.819819908235045e-06, + "loss": 0.4441, + "step": 3216 + }, + { + "epoch": 0.9031443009545199, + "grad_norm": 0.8161920309066772, + "learning_rate": 8.818765768395796e-06, + "loss": 0.5073, + "step": 3217 + }, + { + "epoch": 0.9034250421111735, + "grad_norm": 0.7584202289581299, + "learning_rate": 8.817711221041613e-06, + "loss": 0.4337, + "step": 3218 + }, + { + "epoch": 0.903705783267827, + "grad_norm": 0.7495251297950745, + "learning_rate": 8.816656266285028e-06, + "loss": 0.4185, + "step": 3219 + }, + { + "epoch": 0.9039865244244806, + "grad_norm": 0.6710088849067688, + "learning_rate": 8.815600904238623e-06, + "loss": 0.4733, + "step": 3220 + }, + { + "epoch": 0.9042672655811342, + "grad_norm": 0.8690589666366577, + "learning_rate": 8.814545135015015e-06, + "loss": 0.4668, + "step": 3221 + }, + { + "epoch": 0.9045480067377878, + "grad_norm": 0.8617925643920898, + "learning_rate": 8.813488958726872e-06, + "loss": 0.5133, + "step": 3222 + }, + { + "epoch": 0.9048287478944413, + "grad_norm": 0.8535025119781494, + "learning_rate": 8.812432375486902e-06, + "loss": 0.4717, + "step": 3223 + }, + { + "epoch": 0.9051094890510949, + "grad_norm": 0.6097338199615479, + "learning_rate": 8.811375385407855e-06, + "loss": 0.4367, + "step": 3224 + }, + { + "epoch": 0.9053902302077484, + "grad_norm": 0.764920711517334, + "learning_rate": 8.81031798860253e-06, + "loss": 0.4476, + "step": 3225 + }, + { + "epoch": 0.905670971364402, + "grad_norm": 0.9842613935470581, + "learning_rate": 8.809260185183763e-06, + "loss": 0.4683, + "step": 3226 + }, + { + "epoch": 0.9059517125210556, + "grad_norm": 0.7564629912376404, + "learning_rate": 8.80820197526444e-06, + "loss": 0.4318, + "step": 3227 + }, + { + "epoch": 0.9062324536777091, + "grad_norm": 0.6793269515037537, + "learning_rate": 8.80714335895748e-06, + "loss": 0.4297, + "step": 3228 + }, + { + "epoch": 0.9065131948343628, + "grad_norm": 0.8259584307670593, + "learning_rate": 8.806084336375857e-06, + "loss": 0.4628, + "step": 3229 + }, + { + "epoch": 0.9067939359910163, + "grad_norm": 0.8333523273468018, + "learning_rate": 8.805024907632585e-06, + "loss": 0.4539, + "step": 3230 + }, + { + "epoch": 0.9070746771476699, + "grad_norm": 0.6882826089859009, + "learning_rate": 8.803965072840713e-06, + "loss": 0.42, + "step": 3231 + }, + { + "epoch": 0.9073554183043234, + "grad_norm": 0.6492483019828796, + "learning_rate": 8.802904832113345e-06, + "loss": 0.4766, + "step": 3232 + }, + { + "epoch": 0.907636159460977, + "grad_norm": 0.7343829274177551, + "learning_rate": 8.801844185563622e-06, + "loss": 0.4461, + "step": 3233 + }, + { + "epoch": 0.9079169006176305, + "grad_norm": 0.7494798898696899, + "learning_rate": 8.800783133304731e-06, + "loss": 0.4169, + "step": 3234 + }, + { + "epoch": 0.9081976417742841, + "grad_norm": 0.7179716229438782, + "learning_rate": 8.799721675449897e-06, + "loss": 0.4718, + "step": 3235 + }, + { + "epoch": 0.9084783829309376, + "grad_norm": 0.6898223757743835, + "learning_rate": 8.798659812112397e-06, + "loss": 0.4273, + "step": 3236 + }, + { + "epoch": 0.9087591240875912, + "grad_norm": 0.6938149929046631, + "learning_rate": 8.797597543405543e-06, + "loss": 0.423, + "step": 3237 + }, + { + "epoch": 0.9090398652442448, + "grad_norm": 0.8423193097114563, + "learning_rate": 8.796534869442694e-06, + "loss": 0.4652, + "step": 3238 + }, + { + "epoch": 0.9093206064008984, + "grad_norm": 0.8559891581535339, + "learning_rate": 8.795471790337256e-06, + "loss": 0.4867, + "step": 3239 + }, + { + "epoch": 0.909601347557552, + "grad_norm": 0.6462901830673218, + "learning_rate": 8.794408306202668e-06, + "loss": 0.4446, + "step": 3240 + }, + { + "epoch": 0.9098820887142055, + "grad_norm": 0.7510899305343628, + "learning_rate": 8.793344417152423e-06, + "loss": 0.4457, + "step": 3241 + }, + { + "epoch": 0.9101628298708591, + "grad_norm": 0.7380796074867249, + "learning_rate": 8.79228012330005e-06, + "loss": 0.4372, + "step": 3242 + }, + { + "epoch": 0.9104435710275126, + "grad_norm": 0.7024365067481995, + "learning_rate": 8.791215424759126e-06, + "loss": 0.4301, + "step": 3243 + }, + { + "epoch": 0.9107243121841662, + "grad_norm": 0.6903531551361084, + "learning_rate": 8.790150321643266e-06, + "loss": 0.4035, + "step": 3244 + }, + { + "epoch": 0.9110050533408197, + "grad_norm": 0.7847535610198975, + "learning_rate": 8.789084814066133e-06, + "loss": 0.4378, + "step": 3245 + }, + { + "epoch": 0.9112857944974734, + "grad_norm": 0.7383411526679993, + "learning_rate": 8.788018902141435e-06, + "loss": 0.3852, + "step": 3246 + }, + { + "epoch": 0.9115665356541269, + "grad_norm": 0.7784723043441772, + "learning_rate": 8.786952585982913e-06, + "loss": 0.4729, + "step": 3247 + }, + { + "epoch": 0.9118472768107805, + "grad_norm": 0.7858310341835022, + "learning_rate": 8.78588586570436e-06, + "loss": 0.4539, + "step": 3248 + }, + { + "epoch": 0.912128017967434, + "grad_norm": 0.7449018955230713, + "learning_rate": 8.784818741419611e-06, + "loss": 0.4474, + "step": 3249 + }, + { + "epoch": 0.9124087591240876, + "grad_norm": 0.847886323928833, + "learning_rate": 8.783751213242543e-06, + "loss": 0.498, + "step": 3250 + }, + { + "epoch": 0.9126895002807411, + "grad_norm": 0.8486092686653137, + "learning_rate": 8.782683281287075e-06, + "loss": 0.4109, + "step": 3251 + }, + { + "epoch": 0.9129702414373947, + "grad_norm": 0.8215112090110779, + "learning_rate": 8.78161494566717e-06, + "loss": 0.4399, + "step": 3252 + }, + { + "epoch": 0.9132509825940482, + "grad_norm": 0.8668664693832397, + "learning_rate": 8.780546206496833e-06, + "loss": 0.4448, + "step": 3253 + }, + { + "epoch": 0.9135317237507019, + "grad_norm": 0.8881429433822632, + "learning_rate": 8.779477063890116e-06, + "loss": 0.5114, + "step": 3254 + }, + { + "epoch": 0.9138124649073555, + "grad_norm": 0.6719093918800354, + "learning_rate": 8.77840751796111e-06, + "loss": 0.4325, + "step": 3255 + }, + { + "epoch": 0.914093206064009, + "grad_norm": 0.7087459564208984, + "learning_rate": 8.777337568823948e-06, + "loss": 0.3946, + "step": 3256 + }, + { + "epoch": 0.9143739472206626, + "grad_norm": 0.8789963126182556, + "learning_rate": 8.776267216592814e-06, + "loss": 0.4345, + "step": 3257 + }, + { + "epoch": 0.9146546883773161, + "grad_norm": 0.7974157929420471, + "learning_rate": 8.775196461381922e-06, + "loss": 0.49, + "step": 3258 + }, + { + "epoch": 0.9149354295339697, + "grad_norm": 0.7480311393737793, + "learning_rate": 8.774125303305542e-06, + "loss": 0.4538, + "step": 3259 + }, + { + "epoch": 0.9152161706906232, + "grad_norm": 0.7599238157272339, + "learning_rate": 8.773053742477979e-06, + "loss": 0.4249, + "step": 3260 + }, + { + "epoch": 0.9154969118472768, + "grad_norm": 0.790607213973999, + "learning_rate": 8.771981779013582e-06, + "loss": 0.4063, + "step": 3261 + }, + { + "epoch": 0.9157776530039303, + "grad_norm": 0.6766489148139954, + "learning_rate": 8.770909413026749e-06, + "loss": 0.4336, + "step": 3262 + }, + { + "epoch": 0.916058394160584, + "grad_norm": 0.8385114669799805, + "learning_rate": 8.769836644631911e-06, + "loss": 0.4367, + "step": 3263 + }, + { + "epoch": 0.9163391353172375, + "grad_norm": 0.7480401396751404, + "learning_rate": 8.76876347394355e-06, + "loss": 0.4692, + "step": 3264 + }, + { + "epoch": 0.9166198764738911, + "grad_norm": 0.8067586421966553, + "learning_rate": 8.767689901076188e-06, + "loss": 0.4456, + "step": 3265 + }, + { + "epoch": 0.9169006176305446, + "grad_norm": 0.7194135189056396, + "learning_rate": 8.766615926144389e-06, + "loss": 0.4543, + "step": 3266 + }, + { + "epoch": 0.9171813587871982, + "grad_norm": 0.8041672110557556, + "learning_rate": 8.765541549262762e-06, + "loss": 0.4115, + "step": 3267 + }, + { + "epoch": 0.9174620999438517, + "grad_norm": 0.8199875354766846, + "learning_rate": 8.764466770545956e-06, + "loss": 0.4521, + "step": 3268 + }, + { + "epoch": 0.9177428411005053, + "grad_norm": 0.7689574360847473, + "learning_rate": 8.763391590108666e-06, + "loss": 0.4925, + "step": 3269 + }, + { + "epoch": 0.9180235822571589, + "grad_norm": 0.6917686462402344, + "learning_rate": 8.762316008065629e-06, + "loss": 0.4152, + "step": 3270 + }, + { + "epoch": 0.9183043234138125, + "grad_norm": 0.747933566570282, + "learning_rate": 8.761240024531624e-06, + "loss": 0.4525, + "step": 3271 + }, + { + "epoch": 0.9185850645704661, + "grad_norm": 0.7590529918670654, + "learning_rate": 8.760163639621473e-06, + "loss": 0.432, + "step": 3272 + }, + { + "epoch": 0.9188658057271196, + "grad_norm": 0.6946105360984802, + "learning_rate": 8.759086853450042e-06, + "loss": 0.4743, + "step": 3273 + }, + { + "epoch": 0.9191465468837732, + "grad_norm": 0.8134940266609192, + "learning_rate": 8.758009666132237e-06, + "loss": 0.4833, + "step": 3274 + }, + { + "epoch": 0.9194272880404267, + "grad_norm": 0.7616158723831177, + "learning_rate": 8.75693207778301e-06, + "loss": 0.47, + "step": 3275 + }, + { + "epoch": 0.9197080291970803, + "grad_norm": 0.762461245059967, + "learning_rate": 8.755854088517356e-06, + "loss": 0.4368, + "step": 3276 + }, + { + "epoch": 0.9199887703537338, + "grad_norm": 0.7236607074737549, + "learning_rate": 8.754775698450308e-06, + "loss": 0.4555, + "step": 3277 + }, + { + "epoch": 0.9202695115103874, + "grad_norm": 0.7588319778442383, + "learning_rate": 8.753696907696948e-06, + "loss": 0.5008, + "step": 3278 + }, + { + "epoch": 0.9205502526670409, + "grad_norm": 0.923145592212677, + "learning_rate": 8.752617716372397e-06, + "loss": 0.4782, + "step": 3279 + }, + { + "epoch": 0.9208309938236946, + "grad_norm": 0.7173494100570679, + "learning_rate": 8.75153812459182e-06, + "loss": 0.4647, + "step": 3280 + }, + { + "epoch": 0.9211117349803482, + "grad_norm": 0.6904281973838806, + "learning_rate": 8.75045813247042e-06, + "loss": 0.4444, + "step": 3281 + }, + { + "epoch": 0.9213924761370017, + "grad_norm": 0.7474949359893799, + "learning_rate": 8.749377740123454e-06, + "loss": 0.4691, + "step": 3282 + }, + { + "epoch": 0.9216732172936553, + "grad_norm": 0.761879563331604, + "learning_rate": 8.74829694766621e-06, + "loss": 0.4991, + "step": 3283 + }, + { + "epoch": 0.9219539584503088, + "grad_norm": 0.7057278752326965, + "learning_rate": 8.747215755214024e-06, + "loss": 0.4316, + "step": 3284 + }, + { + "epoch": 0.9222346996069624, + "grad_norm": 0.7089288830757141, + "learning_rate": 8.746134162882278e-06, + "loss": 0.4484, + "step": 3285 + }, + { + "epoch": 0.9225154407636159, + "grad_norm": 0.6737016439437866, + "learning_rate": 8.745052170786388e-06, + "loss": 0.44, + "step": 3286 + }, + { + "epoch": 0.9227961819202695, + "grad_norm": 0.772479772567749, + "learning_rate": 8.743969779041819e-06, + "loss": 0.4764, + "step": 3287 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 0.7977652549743652, + "learning_rate": 8.742886987764077e-06, + "loss": 0.4845, + "step": 3288 + }, + { + "epoch": 0.9233576642335767, + "grad_norm": 0.6931220293045044, + "learning_rate": 8.741803797068713e-06, + "loss": 0.4234, + "step": 3289 + }, + { + "epoch": 0.9236384053902302, + "grad_norm": 0.8048646450042725, + "learning_rate": 8.740720207071316e-06, + "loss": 0.4754, + "step": 3290 + }, + { + "epoch": 0.9239191465468838, + "grad_norm": 0.6938891410827637, + "learning_rate": 8.73963621788752e-06, + "loss": 0.4526, + "step": 3291 + }, + { + "epoch": 0.9241998877035373, + "grad_norm": 0.7346242070198059, + "learning_rate": 8.738551829633e-06, + "loss": 0.4533, + "step": 3292 + }, + { + "epoch": 0.9244806288601909, + "grad_norm": 0.7148845791816711, + "learning_rate": 8.73746704242348e-06, + "loss": 0.4376, + "step": 3293 + }, + { + "epoch": 0.9247613700168444, + "grad_norm": 0.7251663208007812, + "learning_rate": 8.736381856374719e-06, + "loss": 0.4273, + "step": 3294 + }, + { + "epoch": 0.925042111173498, + "grad_norm": 0.6632020473480225, + "learning_rate": 8.73529627160252e-06, + "loss": 0.4658, + "step": 3295 + }, + { + "epoch": 0.9253228523301515, + "grad_norm": 0.7892797589302063, + "learning_rate": 8.734210288222733e-06, + "loss": 0.4717, + "step": 3296 + }, + { + "epoch": 0.9256035934868052, + "grad_norm": 0.7627503871917725, + "learning_rate": 8.733123906351243e-06, + "loss": 0.4288, + "step": 3297 + }, + { + "epoch": 0.9258843346434588, + "grad_norm": 0.775695264339447, + "learning_rate": 8.732037126103987e-06, + "loss": 0.4468, + "step": 3298 + }, + { + "epoch": 0.9261650758001123, + "grad_norm": 0.78183913230896, + "learning_rate": 8.730949947596934e-06, + "loss": 0.4671, + "step": 3299 + }, + { + "epoch": 0.9264458169567659, + "grad_norm": 0.7109136581420898, + "learning_rate": 8.729862370946106e-06, + "loss": 0.4143, + "step": 3300 + }, + { + "epoch": 0.9267265581134194, + "grad_norm": 0.7258944511413574, + "learning_rate": 8.72877439626756e-06, + "loss": 0.4268, + "step": 3301 + }, + { + "epoch": 0.927007299270073, + "grad_norm": 0.667835533618927, + "learning_rate": 8.727686023677398e-06, + "loss": 0.4377, + "step": 3302 + }, + { + "epoch": 0.9272880404267265, + "grad_norm": 0.7879599928855896, + "learning_rate": 8.726597253291764e-06, + "loss": 0.4493, + "step": 3303 + }, + { + "epoch": 0.9275687815833801, + "grad_norm": 0.7433174252510071, + "learning_rate": 8.725508085226846e-06, + "loss": 0.4511, + "step": 3304 + }, + { + "epoch": 0.9278495227400337, + "grad_norm": 0.8071883916854858, + "learning_rate": 8.724418519598872e-06, + "loss": 0.4901, + "step": 3305 + }, + { + "epoch": 0.9281302638966873, + "grad_norm": 0.6438072919845581, + "learning_rate": 8.723328556524116e-06, + "loss": 0.4447, + "step": 3306 + }, + { + "epoch": 0.9284110050533408, + "grad_norm": 0.7367581725120544, + "learning_rate": 8.722238196118888e-06, + "loss": 0.4168, + "step": 3307 + }, + { + "epoch": 0.9286917462099944, + "grad_norm": 0.6820290684700012, + "learning_rate": 8.721147438499547e-06, + "loss": 0.4152, + "step": 3308 + }, + { + "epoch": 0.928972487366648, + "grad_norm": 0.6569641828536987, + "learning_rate": 8.720056283782491e-06, + "loss": 0.4201, + "step": 3309 + }, + { + "epoch": 0.9292532285233015, + "grad_norm": 0.6963849067687988, + "learning_rate": 8.718964732084165e-06, + "loss": 0.4673, + "step": 3310 + }, + { + "epoch": 0.929533969679955, + "grad_norm": 0.6471827626228333, + "learning_rate": 8.717872783521048e-06, + "loss": 0.4461, + "step": 3311 + }, + { + "epoch": 0.9298147108366086, + "grad_norm": 0.8178622722625732, + "learning_rate": 8.716780438209666e-06, + "loss": 0.4265, + "step": 3312 + }, + { + "epoch": 0.9300954519932623, + "grad_norm": 0.7103288769721985, + "learning_rate": 8.71568769626659e-06, + "loss": 0.4216, + "step": 3313 + }, + { + "epoch": 0.9303761931499158, + "grad_norm": 0.6218788623809814, + "learning_rate": 8.71459455780843e-06, + "loss": 0.4347, + "step": 3314 + }, + { + "epoch": 0.9306569343065694, + "grad_norm": 0.8654505014419556, + "learning_rate": 8.713501022951838e-06, + "loss": 0.4317, + "step": 3315 + }, + { + "epoch": 0.9309376754632229, + "grad_norm": 0.7331998348236084, + "learning_rate": 8.712407091813508e-06, + "loss": 0.4676, + "step": 3316 + }, + { + "epoch": 0.9312184166198765, + "grad_norm": 0.7468166351318359, + "learning_rate": 8.71131276451018e-06, + "loss": 0.4631, + "step": 3317 + }, + { + "epoch": 0.93149915777653, + "grad_norm": 0.6815680265426636, + "learning_rate": 8.710218041158633e-06, + "loss": 0.4527, + "step": 3318 + }, + { + "epoch": 0.9317798989331836, + "grad_norm": 0.8265864849090576, + "learning_rate": 8.70912292187569e-06, + "loss": 0.4583, + "step": 3319 + }, + { + "epoch": 0.9320606400898371, + "grad_norm": 0.853699803352356, + "learning_rate": 8.708027406778214e-06, + "loss": 0.4325, + "step": 3320 + }, + { + "epoch": 0.9323413812464907, + "grad_norm": 0.8384313583374023, + "learning_rate": 8.706931495983111e-06, + "loss": 0.4675, + "step": 3321 + }, + { + "epoch": 0.9326221224031443, + "grad_norm": 0.8724325895309448, + "learning_rate": 8.70583518960733e-06, + "loss": 0.4428, + "step": 3322 + }, + { + "epoch": 0.9329028635597979, + "grad_norm": 0.9371477365493774, + "learning_rate": 8.704738487767864e-06, + "loss": 0.4765, + "step": 3323 + }, + { + "epoch": 0.9331836047164515, + "grad_norm": 0.7754281163215637, + "learning_rate": 8.703641390581745e-06, + "loss": 0.4344, + "step": 3324 + }, + { + "epoch": 0.933464345873105, + "grad_norm": 0.754790723323822, + "learning_rate": 8.702543898166047e-06, + "loss": 0.4562, + "step": 3325 + }, + { + "epoch": 0.9337450870297586, + "grad_norm": 0.7967555522918701, + "learning_rate": 8.701446010637889e-06, + "loss": 0.4094, + "step": 3326 + }, + { + "epoch": 0.9340258281864121, + "grad_norm": 0.7168818712234497, + "learning_rate": 8.700347728114431e-06, + "loss": 0.4081, + "step": 3327 + }, + { + "epoch": 0.9343065693430657, + "grad_norm": 0.7467126250267029, + "learning_rate": 8.699249050712874e-06, + "loss": 0.4396, + "step": 3328 + }, + { + "epoch": 0.9345873104997192, + "grad_norm": 0.7604356408119202, + "learning_rate": 8.698149978550463e-06, + "loss": 0.5038, + "step": 3329 + }, + { + "epoch": 0.9348680516563729, + "grad_norm": 0.6994295120239258, + "learning_rate": 8.697050511744484e-06, + "loss": 0.4616, + "step": 3330 + }, + { + "epoch": 0.9351487928130264, + "grad_norm": 0.8779392838478088, + "learning_rate": 8.695950650412264e-06, + "loss": 0.5152, + "step": 3331 + }, + { + "epoch": 0.93542953396968, + "grad_norm": 0.7325848340988159, + "learning_rate": 8.694850394671175e-06, + "loss": 0.4782, + "step": 3332 + }, + { + "epoch": 0.9357102751263335, + "grad_norm": 0.8201211094856262, + "learning_rate": 8.693749744638626e-06, + "loss": 0.4464, + "step": 3333 + }, + { + "epoch": 0.9359910162829871, + "grad_norm": 0.8083143830299377, + "learning_rate": 8.692648700432078e-06, + "loss": 0.4772, + "step": 3334 + }, + { + "epoch": 0.9362717574396406, + "grad_norm": 0.7418912649154663, + "learning_rate": 8.691547262169021e-06, + "loss": 0.4426, + "step": 3335 + }, + { + "epoch": 0.9365524985962942, + "grad_norm": 0.6536455750465393, + "learning_rate": 8.690445429966998e-06, + "loss": 0.4068, + "step": 3336 + }, + { + "epoch": 0.9368332397529477, + "grad_norm": 0.7285296320915222, + "learning_rate": 8.689343203943588e-06, + "loss": 0.4507, + "step": 3337 + }, + { + "epoch": 0.9371139809096013, + "grad_norm": 0.8919716477394104, + "learning_rate": 8.688240584216412e-06, + "loss": 0.4526, + "step": 3338 + }, + { + "epoch": 0.937394722066255, + "grad_norm": 0.6783115863800049, + "learning_rate": 8.687137570903139e-06, + "loss": 0.4229, + "step": 3339 + }, + { + "epoch": 0.9376754632229085, + "grad_norm": 0.7820303440093994, + "learning_rate": 8.68603416412147e-06, + "loss": 0.4332, + "step": 3340 + }, + { + "epoch": 0.9379562043795621, + "grad_norm": 0.7348349690437317, + "learning_rate": 8.684930363989159e-06, + "loss": 0.4271, + "step": 3341 + }, + { + "epoch": 0.9382369455362156, + "grad_norm": 0.6245999336242676, + "learning_rate": 8.683826170623995e-06, + "loss": 0.4261, + "step": 3342 + }, + { + "epoch": 0.9385176866928692, + "grad_norm": 0.7995222210884094, + "learning_rate": 8.682721584143809e-06, + "loss": 0.4252, + "step": 3343 + }, + { + "epoch": 0.9387984278495227, + "grad_norm": 0.6974126100540161, + "learning_rate": 8.681616604666479e-06, + "loss": 0.4402, + "step": 3344 + }, + { + "epoch": 0.9390791690061763, + "grad_norm": 0.8546789288520813, + "learning_rate": 8.680511232309917e-06, + "loss": 0.4912, + "step": 3345 + }, + { + "epoch": 0.9393599101628298, + "grad_norm": 0.688572883605957, + "learning_rate": 8.679405467192085e-06, + "loss": 0.4905, + "step": 3346 + }, + { + "epoch": 0.9396406513194835, + "grad_norm": 0.7426177859306335, + "learning_rate": 8.678299309430982e-06, + "loss": 0.4602, + "step": 3347 + }, + { + "epoch": 0.939921392476137, + "grad_norm": 0.8280917406082153, + "learning_rate": 8.67719275914465e-06, + "loss": 0.4562, + "step": 3348 + }, + { + "epoch": 0.9402021336327906, + "grad_norm": 0.8141161203384399, + "learning_rate": 8.676085816451176e-06, + "loss": 0.4544, + "step": 3349 + }, + { + "epoch": 0.9404828747894441, + "grad_norm": 0.7345113754272461, + "learning_rate": 8.674978481468681e-06, + "loss": 0.4848, + "step": 3350 + }, + { + "epoch": 0.9407636159460977, + "grad_norm": 0.702564537525177, + "learning_rate": 8.673870754315336e-06, + "loss": 0.4833, + "step": 3351 + }, + { + "epoch": 0.9410443571027512, + "grad_norm": 0.8589380383491516, + "learning_rate": 8.672762635109351e-06, + "loss": 0.4173, + "step": 3352 + }, + { + "epoch": 0.9413250982594048, + "grad_norm": 0.8307361602783203, + "learning_rate": 8.671654123968977e-06, + "loss": 0.4069, + "step": 3353 + }, + { + "epoch": 0.9416058394160584, + "grad_norm": 0.8155995011329651, + "learning_rate": 8.67054522101251e-06, + "loss": 0.4382, + "step": 3354 + }, + { + "epoch": 0.9418865805727119, + "grad_norm": 0.9018946290016174, + "learning_rate": 8.669435926358278e-06, + "loss": 0.4632, + "step": 3355 + }, + { + "epoch": 0.9421673217293656, + "grad_norm": 0.7798970341682434, + "learning_rate": 8.668326240124666e-06, + "loss": 0.4419, + "step": 3356 + }, + { + "epoch": 0.9424480628860191, + "grad_norm": 0.6875411868095398, + "learning_rate": 8.667216162430088e-06, + "loss": 0.4231, + "step": 3357 + }, + { + "epoch": 0.9427288040426727, + "grad_norm": 0.6356247067451477, + "learning_rate": 8.666105693393007e-06, + "loss": 0.4412, + "step": 3358 + }, + { + "epoch": 0.9430095451993262, + "grad_norm": 0.7255062460899353, + "learning_rate": 8.664994833131923e-06, + "loss": 0.4225, + "step": 3359 + }, + { + "epoch": 0.9432902863559798, + "grad_norm": 0.7604843974113464, + "learning_rate": 8.663883581765381e-06, + "loss": 0.4879, + "step": 3360 + }, + { + "epoch": 0.9435710275126333, + "grad_norm": 0.6840789318084717, + "learning_rate": 8.66277193941197e-06, + "loss": 0.4289, + "step": 3361 + }, + { + "epoch": 0.9438517686692869, + "grad_norm": 0.7160725593566895, + "learning_rate": 8.661659906190314e-06, + "loss": 0.4596, + "step": 3362 + }, + { + "epoch": 0.9441325098259404, + "grad_norm": 0.832273006439209, + "learning_rate": 8.660547482219082e-06, + "loss": 0.4473, + "step": 3363 + }, + { + "epoch": 0.9444132509825941, + "grad_norm": 0.744295060634613, + "learning_rate": 8.659434667616987e-06, + "loss": 0.4439, + "step": 3364 + }, + { + "epoch": 0.9446939921392477, + "grad_norm": 0.7506633400917053, + "learning_rate": 8.658321462502782e-06, + "loss": 0.4898, + "step": 3365 + }, + { + "epoch": 0.9449747332959012, + "grad_norm": 0.6658370494842529, + "learning_rate": 8.657207866995257e-06, + "loss": 0.4086, + "step": 3366 + }, + { + "epoch": 0.9452554744525548, + "grad_norm": 0.7962737679481506, + "learning_rate": 8.656093881213253e-06, + "loss": 0.4416, + "step": 3367 + }, + { + "epoch": 0.9455362156092083, + "grad_norm": 0.76934814453125, + "learning_rate": 8.654979505275646e-06, + "loss": 0.4574, + "step": 3368 + }, + { + "epoch": 0.9458169567658619, + "grad_norm": 0.7706449627876282, + "learning_rate": 8.653864739301354e-06, + "loss": 0.4906, + "step": 3369 + }, + { + "epoch": 0.9460976979225154, + "grad_norm": 0.7856554388999939, + "learning_rate": 8.65274958340934e-06, + "loss": 0.4279, + "step": 3370 + }, + { + "epoch": 0.946378439079169, + "grad_norm": 0.7563718557357788, + "learning_rate": 8.651634037718604e-06, + "loss": 0.4619, + "step": 3371 + }, + { + "epoch": 0.9466591802358225, + "grad_norm": 0.7916227579116821, + "learning_rate": 8.650518102348193e-06, + "loss": 0.4059, + "step": 3372 + }, + { + "epoch": 0.9469399213924762, + "grad_norm": 0.7160757780075073, + "learning_rate": 8.64940177741719e-06, + "loss": 0.4192, + "step": 3373 + }, + { + "epoch": 0.9472206625491297, + "grad_norm": 0.7741641998291016, + "learning_rate": 8.648285063044724e-06, + "loss": 0.464, + "step": 3374 + }, + { + "epoch": 0.9475014037057833, + "grad_norm": 0.6588005423545837, + "learning_rate": 8.647167959349964e-06, + "loss": 0.4651, + "step": 3375 + }, + { + "epoch": 0.9477821448624368, + "grad_norm": 0.7646698355674744, + "learning_rate": 8.646050466452118e-06, + "loss": 0.4457, + "step": 3376 + }, + { + "epoch": 0.9480628860190904, + "grad_norm": 0.9929888844490051, + "learning_rate": 8.644932584470442e-06, + "loss": 0.4691, + "step": 3377 + }, + { + "epoch": 0.9483436271757439, + "grad_norm": 0.8656958937644958, + "learning_rate": 8.643814313524224e-06, + "loss": 0.465, + "step": 3378 + }, + { + "epoch": 0.9486243683323975, + "grad_norm": 0.7341300845146179, + "learning_rate": 8.642695653732804e-06, + "loss": 0.4715, + "step": 3379 + }, + { + "epoch": 0.948905109489051, + "grad_norm": 0.6429067850112915, + "learning_rate": 8.641576605215556e-06, + "loss": 0.4124, + "step": 3380 + }, + { + "epoch": 0.9491858506457047, + "grad_norm": 0.9043833613395691, + "learning_rate": 8.640457168091898e-06, + "loss": 0.4586, + "step": 3381 + }, + { + "epoch": 0.9494665918023583, + "grad_norm": 0.7792132496833801, + "learning_rate": 8.639337342481289e-06, + "loss": 0.4596, + "step": 3382 + }, + { + "epoch": 0.9497473329590118, + "grad_norm": 0.7375581860542297, + "learning_rate": 8.63821712850323e-06, + "loss": 0.4429, + "step": 3383 + }, + { + "epoch": 0.9500280741156654, + "grad_norm": 0.6678343415260315, + "learning_rate": 8.637096526277264e-06, + "loss": 0.4431, + "step": 3384 + }, + { + "epoch": 0.9503088152723189, + "grad_norm": 0.7538302540779114, + "learning_rate": 8.635975535922974e-06, + "loss": 0.4527, + "step": 3385 + }, + { + "epoch": 0.9505895564289725, + "grad_norm": 0.9098194241523743, + "learning_rate": 8.634854157559987e-06, + "loss": 0.496, + "step": 3386 + }, + { + "epoch": 0.950870297585626, + "grad_norm": 0.6951583027839661, + "learning_rate": 8.633732391307967e-06, + "loss": 0.4006, + "step": 3387 + }, + { + "epoch": 0.9511510387422796, + "grad_norm": 0.708798348903656, + "learning_rate": 8.632610237286622e-06, + "loss": 0.4065, + "step": 3388 + }, + { + "epoch": 0.9514317798989332, + "grad_norm": 0.6822143793106079, + "learning_rate": 8.631487695615704e-06, + "loss": 0.3906, + "step": 3389 + }, + { + "epoch": 0.9517125210555868, + "grad_norm": 0.8026174306869507, + "learning_rate": 8.630364766415e-06, + "loss": 0.451, + "step": 3390 + }, + { + "epoch": 0.9519932622122403, + "grad_norm": 0.7409147620201111, + "learning_rate": 8.629241449804344e-06, + "loss": 0.4672, + "step": 3391 + }, + { + "epoch": 0.9522740033688939, + "grad_norm": 0.6837946772575378, + "learning_rate": 8.62811774590361e-06, + "loss": 0.4578, + "step": 3392 + }, + { + "epoch": 0.9525547445255474, + "grad_norm": 0.6938409805297852, + "learning_rate": 8.626993654832711e-06, + "loss": 0.4442, + "step": 3393 + }, + { + "epoch": 0.952835485682201, + "grad_norm": 0.8176584839820862, + "learning_rate": 8.625869176711605e-06, + "loss": 0.4755, + "step": 3394 + }, + { + "epoch": 0.9531162268388546, + "grad_norm": 0.8156872987747192, + "learning_rate": 8.624744311660289e-06, + "loss": 0.4752, + "step": 3395 + }, + { + "epoch": 0.9533969679955081, + "grad_norm": 0.7634759545326233, + "learning_rate": 8.6236190597988e-06, + "loss": 0.4407, + "step": 3396 + }, + { + "epoch": 0.9536777091521617, + "grad_norm": 0.7569522857666016, + "learning_rate": 8.622493421247218e-06, + "loss": 0.3901, + "step": 3397 + }, + { + "epoch": 0.9539584503088153, + "grad_norm": 0.7346652746200562, + "learning_rate": 8.621367396125666e-06, + "loss": 0.4364, + "step": 3398 + }, + { + "epoch": 0.9542391914654689, + "grad_norm": 0.7377001047134399, + "learning_rate": 8.620240984554305e-06, + "loss": 0.4102, + "step": 3399 + }, + { + "epoch": 0.9545199326221224, + "grad_norm": 0.7047996520996094, + "learning_rate": 8.61911418665334e-06, + "loss": 0.433, + "step": 3400 + }, + { + "epoch": 0.954800673778776, + "grad_norm": 0.6999577879905701, + "learning_rate": 8.617987002543012e-06, + "loss": 0.4404, + "step": 3401 + }, + { + "epoch": 0.9550814149354295, + "grad_norm": 0.6502342820167542, + "learning_rate": 8.616859432343612e-06, + "loss": 0.4072, + "step": 3402 + }, + { + "epoch": 0.9553621560920831, + "grad_norm": 0.9056606888771057, + "learning_rate": 8.615731476175464e-06, + "loss": 0.4228, + "step": 3403 + }, + { + "epoch": 0.9556428972487366, + "grad_norm": 0.7005042433738708, + "learning_rate": 8.614603134158938e-06, + "loss": 0.4122, + "step": 3404 + }, + { + "epoch": 0.9559236384053902, + "grad_norm": 0.6874243021011353, + "learning_rate": 8.613474406414443e-06, + "loss": 0.4559, + "step": 3405 + }, + { + "epoch": 0.9562043795620438, + "grad_norm": 0.7888487577438354, + "learning_rate": 8.612345293062433e-06, + "loss": 0.4843, + "step": 3406 + }, + { + "epoch": 0.9564851207186974, + "grad_norm": 0.9484554529190063, + "learning_rate": 8.611215794223393e-06, + "loss": 0.4455, + "step": 3407 + }, + { + "epoch": 0.956765861875351, + "grad_norm": 0.7570213675498962, + "learning_rate": 8.610085910017861e-06, + "loss": 0.4466, + "step": 3408 + }, + { + "epoch": 0.9570466030320045, + "grad_norm": 0.7415023446083069, + "learning_rate": 8.60895564056641e-06, + "loss": 0.4598, + "step": 3409 + }, + { + "epoch": 0.9573273441886581, + "grad_norm": 0.7095988392829895, + "learning_rate": 8.607824985989658e-06, + "loss": 0.4183, + "step": 3410 + }, + { + "epoch": 0.9576080853453116, + "grad_norm": 0.7952398657798767, + "learning_rate": 8.606693946408258e-06, + "loss": 0.498, + "step": 3411 + }, + { + "epoch": 0.9578888265019652, + "grad_norm": 0.718460202217102, + "learning_rate": 8.605562521942907e-06, + "loss": 0.4356, + "step": 3412 + }, + { + "epoch": 0.9581695676586187, + "grad_norm": 0.8813295364379883, + "learning_rate": 8.604430712714348e-06, + "loss": 0.4674, + "step": 3413 + }, + { + "epoch": 0.9584503088152723, + "grad_norm": 0.6620175242424011, + "learning_rate": 8.603298518843354e-06, + "loss": 0.435, + "step": 3414 + }, + { + "epoch": 0.9587310499719259, + "grad_norm": 0.7960380911827087, + "learning_rate": 8.602165940450754e-06, + "loss": 0.4917, + "step": 3415 + }, + { + "epoch": 0.9590117911285795, + "grad_norm": 0.8146663308143616, + "learning_rate": 8.601032977657402e-06, + "loss": 0.3953, + "step": 3416 + }, + { + "epoch": 0.959292532285233, + "grad_norm": 0.8786693215370178, + "learning_rate": 8.599899630584206e-06, + "loss": 0.4852, + "step": 3417 + }, + { + "epoch": 0.9595732734418866, + "grad_norm": 0.7795003056526184, + "learning_rate": 8.598765899352106e-06, + "loss": 0.4919, + "step": 3418 + }, + { + "epoch": 0.9598540145985401, + "grad_norm": 0.7619941234588623, + "learning_rate": 8.597631784082089e-06, + "loss": 0.4788, + "step": 3419 + }, + { + "epoch": 0.9601347557551937, + "grad_norm": 0.90803462266922, + "learning_rate": 8.59649728489518e-06, + "loss": 0.4726, + "step": 3420 + }, + { + "epoch": 0.9604154969118472, + "grad_norm": 0.6915274262428284, + "learning_rate": 8.595362401912446e-06, + "loss": 0.3921, + "step": 3421 + }, + { + "epoch": 0.9606962380685008, + "grad_norm": 0.6664067506790161, + "learning_rate": 8.594227135254996e-06, + "loss": 0.3992, + "step": 3422 + }, + { + "epoch": 0.9609769792251545, + "grad_norm": 0.8315762877464294, + "learning_rate": 8.593091485043976e-06, + "loss": 0.4604, + "step": 3423 + }, + { + "epoch": 0.961257720381808, + "grad_norm": 0.8344126343727112, + "learning_rate": 8.591955451400575e-06, + "loss": 0.4371, + "step": 3424 + }, + { + "epoch": 0.9615384615384616, + "grad_norm": 0.8421126008033752, + "learning_rate": 8.590819034446027e-06, + "loss": 0.4177, + "step": 3425 + }, + { + "epoch": 0.9618192026951151, + "grad_norm": 0.7112802863121033, + "learning_rate": 8.589682234301601e-06, + "loss": 0.433, + "step": 3426 + }, + { + "epoch": 0.9620999438517687, + "grad_norm": 0.7226215600967407, + "learning_rate": 8.58854505108861e-06, + "loss": 0.4044, + "step": 3427 + }, + { + "epoch": 0.9623806850084222, + "grad_norm": 0.8644697666168213, + "learning_rate": 8.587407484928408e-06, + "loss": 0.4163, + "step": 3428 + }, + { + "epoch": 0.9626614261650758, + "grad_norm": 0.7910282611846924, + "learning_rate": 8.586269535942386e-06, + "loss": 0.4567, + "step": 3429 + }, + { + "epoch": 0.9629421673217293, + "grad_norm": 0.8188333511352539, + "learning_rate": 8.585131204251982e-06, + "loss": 0.4933, + "step": 3430 + }, + { + "epoch": 0.9632229084783829, + "grad_norm": 0.6796020865440369, + "learning_rate": 8.583992489978669e-06, + "loss": 0.4275, + "step": 3431 + }, + { + "epoch": 0.9635036496350365, + "grad_norm": 0.8388454914093018, + "learning_rate": 8.582853393243965e-06, + "loss": 0.444, + "step": 3432 + }, + { + "epoch": 0.9637843907916901, + "grad_norm": 0.6707009077072144, + "learning_rate": 8.581713914169428e-06, + "loss": 0.4368, + "step": 3433 + }, + { + "epoch": 0.9640651319483436, + "grad_norm": 0.8923273682594299, + "learning_rate": 8.580574052876653e-06, + "loss": 0.4547, + "step": 3434 + }, + { + "epoch": 0.9643458731049972, + "grad_norm": 0.6617672443389893, + "learning_rate": 8.579433809487285e-06, + "loss": 0.4365, + "step": 3435 + }, + { + "epoch": 0.9646266142616507, + "grad_norm": 0.677503764629364, + "learning_rate": 8.578293184122997e-06, + "loss": 0.4347, + "step": 3436 + }, + { + "epoch": 0.9649073554183043, + "grad_norm": 0.6488812565803528, + "learning_rate": 8.577152176905515e-06, + "loss": 0.3938, + "step": 3437 + }, + { + "epoch": 0.9651880965749579, + "grad_norm": 0.8241518139839172, + "learning_rate": 8.576010787956595e-06, + "loss": 0.4447, + "step": 3438 + }, + { + "epoch": 0.9654688377316114, + "grad_norm": 0.7706436514854431, + "learning_rate": 8.574869017398042e-06, + "loss": 0.421, + "step": 3439 + }, + { + "epoch": 0.9657495788882651, + "grad_norm": 0.6720361709594727, + "learning_rate": 8.573726865351698e-06, + "loss": 0.4072, + "step": 3440 + }, + { + "epoch": 0.9660303200449186, + "grad_norm": 0.7769226431846619, + "learning_rate": 8.572584331939447e-06, + "loss": 0.4641, + "step": 3441 + }, + { + "epoch": 0.9663110612015722, + "grad_norm": 0.6810237169265747, + "learning_rate": 8.571441417283214e-06, + "loss": 0.4512, + "step": 3442 + }, + { + "epoch": 0.9665918023582257, + "grad_norm": 0.7702351808547974, + "learning_rate": 8.570298121504958e-06, + "loss": 0.4223, + "step": 3443 + }, + { + "epoch": 0.9668725435148793, + "grad_norm": 0.7678765058517456, + "learning_rate": 8.569154444726692e-06, + "loss": 0.446, + "step": 3444 + }, + { + "epoch": 0.9671532846715328, + "grad_norm": 0.8248004913330078, + "learning_rate": 8.568010387070458e-06, + "loss": 0.512, + "step": 3445 + }, + { + "epoch": 0.9674340258281864, + "grad_norm": 0.633748471736908, + "learning_rate": 8.566865948658344e-06, + "loss": 0.4211, + "step": 3446 + }, + { + "epoch": 0.9677147669848399, + "grad_norm": 0.6683886647224426, + "learning_rate": 8.565721129612476e-06, + "loss": 0.4307, + "step": 3447 + }, + { + "epoch": 0.9679955081414935, + "grad_norm": 0.6899271607398987, + "learning_rate": 8.564575930055023e-06, + "loss": 0.4438, + "step": 3448 + }, + { + "epoch": 0.9682762492981472, + "grad_norm": 0.7076660394668579, + "learning_rate": 8.563430350108194e-06, + "loss": 0.4136, + "step": 3449 + }, + { + "epoch": 0.9685569904548007, + "grad_norm": 0.8247249722480774, + "learning_rate": 8.562284389894238e-06, + "loss": 0.4718, + "step": 3450 + }, + { + "epoch": 0.9688377316114543, + "grad_norm": 0.8279039263725281, + "learning_rate": 8.561138049535443e-06, + "loss": 0.454, + "step": 3451 + }, + { + "epoch": 0.9691184727681078, + "grad_norm": 0.730094313621521, + "learning_rate": 8.55999132915414e-06, + "loss": 0.4339, + "step": 3452 + }, + { + "epoch": 0.9693992139247614, + "grad_norm": 0.7048096656799316, + "learning_rate": 8.558844228872702e-06, + "loss": 0.4793, + "step": 3453 + }, + { + "epoch": 0.9696799550814149, + "grad_norm": 0.6670785546302795, + "learning_rate": 8.55769674881354e-06, + "loss": 0.3888, + "step": 3454 + }, + { + "epoch": 0.9699606962380685, + "grad_norm": 0.6821185350418091, + "learning_rate": 8.556548889099102e-06, + "loss": 0.4079, + "step": 3455 + }, + { + "epoch": 0.970241437394722, + "grad_norm": 0.7230536937713623, + "learning_rate": 8.555400649851884e-06, + "loss": 0.4325, + "step": 3456 + }, + { + "epoch": 0.9705221785513757, + "grad_norm": 0.7355298399925232, + "learning_rate": 8.554252031194418e-06, + "loss": 0.4413, + "step": 3457 + }, + { + "epoch": 0.9708029197080292, + "grad_norm": 0.7354752421379089, + "learning_rate": 8.55310303324928e-06, + "loss": 0.4112, + "step": 3458 + }, + { + "epoch": 0.9710836608646828, + "grad_norm": 0.6842572093009949, + "learning_rate": 8.551953656139079e-06, + "loss": 0.4457, + "step": 3459 + }, + { + "epoch": 0.9713644020213363, + "grad_norm": 0.6387802362442017, + "learning_rate": 8.550803899986473e-06, + "loss": 0.4223, + "step": 3460 + }, + { + "epoch": 0.9716451431779899, + "grad_norm": 0.6626890301704407, + "learning_rate": 8.549653764914157e-06, + "loss": 0.4146, + "step": 3461 + }, + { + "epoch": 0.9719258843346434, + "grad_norm": 0.8330475687980652, + "learning_rate": 8.548503251044863e-06, + "loss": 0.4799, + "step": 3462 + }, + { + "epoch": 0.972206625491297, + "grad_norm": 0.7225596308708191, + "learning_rate": 8.54735235850137e-06, + "loss": 0.4397, + "step": 3463 + }, + { + "epoch": 0.9724873666479505, + "grad_norm": 0.6990557909011841, + "learning_rate": 8.546201087406491e-06, + "loss": 0.3915, + "step": 3464 + }, + { + "epoch": 0.9727681078046042, + "grad_norm": 0.8358392715454102, + "learning_rate": 8.545049437883087e-06, + "loss": 0.4082, + "step": 3465 + }, + { + "epoch": 0.9730488489612578, + "grad_norm": 0.810123860836029, + "learning_rate": 8.54389741005405e-06, + "loss": 0.4455, + "step": 3466 + }, + { + "epoch": 0.9733295901179113, + "grad_norm": 0.8371599912643433, + "learning_rate": 8.542745004042321e-06, + "loss": 0.4897, + "step": 3467 + }, + { + "epoch": 0.9736103312745649, + "grad_norm": 0.8034262657165527, + "learning_rate": 8.541592219970876e-06, + "loss": 0.4369, + "step": 3468 + }, + { + "epoch": 0.9738910724312184, + "grad_norm": 0.8848324418067932, + "learning_rate": 8.540439057962731e-06, + "loss": 0.4607, + "step": 3469 + }, + { + "epoch": 0.974171813587872, + "grad_norm": 0.691663920879364, + "learning_rate": 8.539285518140947e-06, + "loss": 0.4495, + "step": 3470 + }, + { + "epoch": 0.9744525547445255, + "grad_norm": 0.6771031022071838, + "learning_rate": 8.538131600628624e-06, + "loss": 0.4768, + "step": 3471 + }, + { + "epoch": 0.9747332959011791, + "grad_norm": 0.819758415222168, + "learning_rate": 8.536977305548898e-06, + "loss": 0.4754, + "step": 3472 + }, + { + "epoch": 0.9750140370578326, + "grad_norm": 0.7135564684867859, + "learning_rate": 8.535822633024946e-06, + "loss": 0.4282, + "step": 3473 + }, + { + "epoch": 0.9752947782144863, + "grad_norm": 0.7830508351325989, + "learning_rate": 8.534667583179993e-06, + "loss": 0.4637, + "step": 3474 + }, + { + "epoch": 0.9755755193711398, + "grad_norm": 0.6742008328437805, + "learning_rate": 8.533512156137297e-06, + "loss": 0.4641, + "step": 3475 + }, + { + "epoch": 0.9758562605277934, + "grad_norm": 0.8535793423652649, + "learning_rate": 8.532356352020155e-06, + "loss": 0.468, + "step": 3476 + }, + { + "epoch": 0.976137001684447, + "grad_norm": 0.7796273231506348, + "learning_rate": 8.53120017095191e-06, + "loss": 0.4336, + "step": 3477 + }, + { + "epoch": 0.9764177428411005, + "grad_norm": 0.8205191493034363, + "learning_rate": 8.530043613055942e-06, + "loss": 0.4699, + "step": 3478 + }, + { + "epoch": 0.976698483997754, + "grad_norm": 0.7456270456314087, + "learning_rate": 8.528886678455671e-06, + "loss": 0.4745, + "step": 3479 + }, + { + "epoch": 0.9769792251544076, + "grad_norm": 0.8388075232505798, + "learning_rate": 8.527729367274559e-06, + "loss": 0.4143, + "step": 3480 + }, + { + "epoch": 0.9772599663110612, + "grad_norm": 0.6366397142410278, + "learning_rate": 8.526571679636107e-06, + "loss": 0.4288, + "step": 3481 + }, + { + "epoch": 0.9775407074677148, + "grad_norm": 0.6700262427330017, + "learning_rate": 8.525413615663855e-06, + "loss": 0.4778, + "step": 3482 + }, + { + "epoch": 0.9778214486243684, + "grad_norm": 0.9022010564804077, + "learning_rate": 8.524255175481387e-06, + "loss": 0.4453, + "step": 3483 + }, + { + "epoch": 0.9781021897810219, + "grad_norm": 0.7886511087417603, + "learning_rate": 8.52309635921232e-06, + "loss": 0.4261, + "step": 3484 + }, + { + "epoch": 0.9783829309376755, + "grad_norm": 0.9262488484382629, + "learning_rate": 8.521937166980318e-06, + "loss": 0.4301, + "step": 3485 + }, + { + "epoch": 0.978663672094329, + "grad_norm": 0.9617725014686584, + "learning_rate": 8.520777598909084e-06, + "loss": 0.4843, + "step": 3486 + }, + { + "epoch": 0.9789444132509826, + "grad_norm": 0.7633271813392639, + "learning_rate": 8.51961765512236e-06, + "loss": 0.4659, + "step": 3487 + }, + { + "epoch": 0.9792251544076361, + "grad_norm": 0.7121231555938721, + "learning_rate": 8.518457335743927e-06, + "loss": 0.4424, + "step": 3488 + }, + { + "epoch": 0.9795058955642897, + "grad_norm": 0.7436328530311584, + "learning_rate": 8.517296640897606e-06, + "loss": 0.5038, + "step": 3489 + }, + { + "epoch": 0.9797866367209432, + "grad_norm": 0.7411491274833679, + "learning_rate": 8.516135570707258e-06, + "loss": 0.4454, + "step": 3490 + }, + { + "epoch": 0.9800673778775969, + "grad_norm": 0.783284068107605, + "learning_rate": 8.51497412529679e-06, + "loss": 0.433, + "step": 3491 + }, + { + "epoch": 0.9803481190342505, + "grad_norm": 0.6556893587112427, + "learning_rate": 8.513812304790141e-06, + "loss": 0.4297, + "step": 3492 + }, + { + "epoch": 0.980628860190904, + "grad_norm": 0.706886887550354, + "learning_rate": 8.512650109311293e-06, + "loss": 0.4145, + "step": 3493 + }, + { + "epoch": 0.9809096013475576, + "grad_norm": 0.7100974917411804, + "learning_rate": 8.511487538984268e-06, + "loss": 0.4335, + "step": 3494 + }, + { + "epoch": 0.9811903425042111, + "grad_norm": 0.791081964969635, + "learning_rate": 8.510324593933132e-06, + "loss": 0.458, + "step": 3495 + }, + { + "epoch": 0.9814710836608647, + "grad_norm": 0.7544689178466797, + "learning_rate": 8.509161274281984e-06, + "loss": 0.4499, + "step": 3496 + }, + { + "epoch": 0.9817518248175182, + "grad_norm": 0.6856833100318909, + "learning_rate": 8.507997580154967e-06, + "loss": 0.4043, + "step": 3497 + }, + { + "epoch": 0.9820325659741718, + "grad_norm": 0.7366673946380615, + "learning_rate": 8.506833511676262e-06, + "loss": 0.47, + "step": 3498 + }, + { + "epoch": 0.9823133071308254, + "grad_norm": 0.7325769066810608, + "learning_rate": 8.505669068970092e-06, + "loss": 0.4502, + "step": 3499 + }, + { + "epoch": 0.982594048287479, + "grad_norm": 0.7868366241455078, + "learning_rate": 8.50450425216072e-06, + "loss": 0.4358, + "step": 3500 + }, + { + "epoch": 0.9828747894441325, + "grad_norm": 0.7951871156692505, + "learning_rate": 8.503339061372449e-06, + "loss": 0.4496, + "step": 3501 + }, + { + "epoch": 0.9831555306007861, + "grad_norm": 0.7538761496543884, + "learning_rate": 8.502173496729615e-06, + "loss": 0.4516, + "step": 3502 + }, + { + "epoch": 0.9834362717574396, + "grad_norm": 0.6773754358291626, + "learning_rate": 8.501007558356607e-06, + "loss": 0.397, + "step": 3503 + }, + { + "epoch": 0.9837170129140932, + "grad_norm": 0.8450381755828857, + "learning_rate": 8.499841246377844e-06, + "loss": 0.4612, + "step": 3504 + }, + { + "epoch": 0.9839977540707467, + "grad_norm": 0.8215748071670532, + "learning_rate": 8.498674560917785e-06, + "loss": 0.4717, + "step": 3505 + }, + { + "epoch": 0.9842784952274003, + "grad_norm": 0.7071011662483215, + "learning_rate": 8.497507502100935e-06, + "loss": 0.3929, + "step": 3506 + }, + { + "epoch": 0.9845592363840538, + "grad_norm": 0.7242488861083984, + "learning_rate": 8.496340070051834e-06, + "loss": 0.4434, + "step": 3507 + }, + { + "epoch": 0.9848399775407075, + "grad_norm": 0.8573377728462219, + "learning_rate": 8.495172264895065e-06, + "loss": 0.434, + "step": 3508 + }, + { + "epoch": 0.9851207186973611, + "grad_norm": 0.8474485278129578, + "learning_rate": 8.494004086755243e-06, + "loss": 0.4508, + "step": 3509 + }, + { + "epoch": 0.9854014598540146, + "grad_norm": 0.9176875948905945, + "learning_rate": 8.492835535757037e-06, + "loss": 0.463, + "step": 3510 + }, + { + "epoch": 0.9856822010106682, + "grad_norm": 0.8488715291023254, + "learning_rate": 8.491666612025139e-06, + "loss": 0.4434, + "step": 3511 + }, + { + "epoch": 0.9859629421673217, + "grad_norm": 0.9118342995643616, + "learning_rate": 8.490497315684295e-06, + "loss": 0.4793, + "step": 3512 + }, + { + "epoch": 0.9862436833239753, + "grad_norm": 0.7651602625846863, + "learning_rate": 8.489327646859284e-06, + "loss": 0.4614, + "step": 3513 + }, + { + "epoch": 0.9865244244806288, + "grad_norm": 0.8609891533851624, + "learning_rate": 8.488157605674924e-06, + "loss": 0.4728, + "step": 3514 + }, + { + "epoch": 0.9868051656372824, + "grad_norm": 0.8994102478027344, + "learning_rate": 8.486987192256077e-06, + "loss": 0.4449, + "step": 3515 + }, + { + "epoch": 0.987085906793936, + "grad_norm": 0.8351258635520935, + "learning_rate": 8.48581640672764e-06, + "loss": 0.4221, + "step": 3516 + }, + { + "epoch": 0.9873666479505896, + "grad_norm": 0.7111465334892273, + "learning_rate": 8.484645249214554e-06, + "loss": 0.4358, + "step": 3517 + }, + { + "epoch": 0.9876473891072431, + "grad_norm": 0.7977031469345093, + "learning_rate": 8.483473719841794e-06, + "loss": 0.4559, + "step": 3518 + }, + { + "epoch": 0.9879281302638967, + "grad_norm": 0.8247277140617371, + "learning_rate": 8.482301818734384e-06, + "loss": 0.4703, + "step": 3519 + }, + { + "epoch": 0.9882088714205502, + "grad_norm": 0.7320131063461304, + "learning_rate": 8.481129546017379e-06, + "loss": 0.4569, + "step": 3520 + }, + { + "epoch": 0.9884896125772038, + "grad_norm": 0.6837695837020874, + "learning_rate": 8.479956901815875e-06, + "loss": 0.4665, + "step": 3521 + }, + { + "epoch": 0.9887703537338574, + "grad_norm": 0.7357917428016663, + "learning_rate": 8.47878388625501e-06, + "loss": 0.4456, + "step": 3522 + }, + { + "epoch": 0.9890510948905109, + "grad_norm": 0.8145801424980164, + "learning_rate": 8.477610499459964e-06, + "loss": 0.4541, + "step": 3523 + }, + { + "epoch": 0.9893318360471645, + "grad_norm": 0.6550049185752869, + "learning_rate": 8.476436741555952e-06, + "loss": 0.4096, + "step": 3524 + }, + { + "epoch": 0.9896125772038181, + "grad_norm": 0.718496561050415, + "learning_rate": 8.475262612668227e-06, + "loss": 0.4405, + "step": 3525 + }, + { + "epoch": 0.9898933183604717, + "grad_norm": 0.8126087784767151, + "learning_rate": 8.474088112922087e-06, + "loss": 0.4461, + "step": 3526 + }, + { + "epoch": 0.9901740595171252, + "grad_norm": 0.7250719666481018, + "learning_rate": 8.47291324244287e-06, + "loss": 0.4229, + "step": 3527 + }, + { + "epoch": 0.9904548006737788, + "grad_norm": 0.8736149668693542, + "learning_rate": 8.471738001355947e-06, + "loss": 0.4433, + "step": 3528 + }, + { + "epoch": 0.9907355418304323, + "grad_norm": 0.6776615381240845, + "learning_rate": 8.470562389786733e-06, + "loss": 0.4244, + "step": 3529 + }, + { + "epoch": 0.9910162829870859, + "grad_norm": 0.7001596689224243, + "learning_rate": 8.469386407860683e-06, + "loss": 0.4742, + "step": 3530 + }, + { + "epoch": 0.9912970241437394, + "grad_norm": 0.697855532169342, + "learning_rate": 8.468210055703291e-06, + "loss": 0.4188, + "step": 3531 + }, + { + "epoch": 0.991577765300393, + "grad_norm": 0.7609755992889404, + "learning_rate": 8.467033333440089e-06, + "loss": 0.4595, + "step": 3532 + }, + { + "epoch": 0.9918585064570467, + "grad_norm": 0.7395150065422058, + "learning_rate": 8.46585624119665e-06, + "loss": 0.4547, + "step": 3533 + }, + { + "epoch": 0.9921392476137002, + "grad_norm": 0.7125252485275269, + "learning_rate": 8.464678779098586e-06, + "loss": 0.433, + "step": 3534 + }, + { + "epoch": 0.9924199887703538, + "grad_norm": 0.7173366546630859, + "learning_rate": 8.463500947271547e-06, + "loss": 0.4771, + "step": 3535 + }, + { + "epoch": 0.9927007299270073, + "grad_norm": 0.6565790176391602, + "learning_rate": 8.462322745841225e-06, + "loss": 0.4403, + "step": 3536 + }, + { + "epoch": 0.9929814710836609, + "grad_norm": 0.7130043506622314, + "learning_rate": 8.46114417493335e-06, + "loss": 0.4347, + "step": 3537 + }, + { + "epoch": 0.9932622122403144, + "grad_norm": 0.7157816886901855, + "learning_rate": 8.459965234673695e-06, + "loss": 0.4277, + "step": 3538 + }, + { + "epoch": 0.993542953396968, + "grad_norm": 0.8052875399589539, + "learning_rate": 8.458785925188064e-06, + "loss": 0.4847, + "step": 3539 + }, + { + "epoch": 0.9938236945536215, + "grad_norm": 0.6643232703208923, + "learning_rate": 8.457606246602307e-06, + "loss": 0.4288, + "step": 3540 + }, + { + "epoch": 0.9941044357102752, + "grad_norm": 0.6808356046676636, + "learning_rate": 8.456426199042314e-06, + "loss": 0.4316, + "step": 3541 + }, + { + "epoch": 0.9943851768669287, + "grad_norm": 0.6699117422103882, + "learning_rate": 8.455245782634011e-06, + "loss": 0.4125, + "step": 3542 + }, + { + "epoch": 0.9946659180235823, + "grad_norm": 0.7270050048828125, + "learning_rate": 8.454064997503365e-06, + "loss": 0.4592, + "step": 3543 + }, + { + "epoch": 0.9949466591802358, + "grad_norm": 0.8155306577682495, + "learning_rate": 8.45288384377638e-06, + "loss": 0.4148, + "step": 3544 + }, + { + "epoch": 0.9952274003368894, + "grad_norm": 0.6916264295578003, + "learning_rate": 8.451702321579106e-06, + "loss": 0.4164, + "step": 3545 + }, + { + "epoch": 0.9955081414935429, + "grad_norm": 0.682720422744751, + "learning_rate": 8.450520431037624e-06, + "loss": 0.4084, + "step": 3546 + }, + { + "epoch": 0.9957888826501965, + "grad_norm": 0.777711033821106, + "learning_rate": 8.44933817227806e-06, + "loss": 0.4258, + "step": 3547 + }, + { + "epoch": 0.99606962380685, + "grad_norm": 0.6997545957565308, + "learning_rate": 8.448155545426573e-06, + "loss": 0.4614, + "step": 3548 + }, + { + "epoch": 0.9963503649635036, + "grad_norm": 0.7430494427680969, + "learning_rate": 8.446972550609372e-06, + "loss": 0.4869, + "step": 3549 + }, + { + "epoch": 0.9966311061201573, + "grad_norm": 0.7802561521530151, + "learning_rate": 8.445789187952696e-06, + "loss": 0.4555, + "step": 3550 + }, + { + "epoch": 0.9969118472768108, + "grad_norm": 0.7442938089370728, + "learning_rate": 8.444605457582823e-06, + "loss": 0.4606, + "step": 3551 + }, + { + "epoch": 0.9971925884334644, + "grad_norm": 0.6580045223236084, + "learning_rate": 8.443421359626078e-06, + "loss": 0.4634, + "step": 3552 + }, + { + "epoch": 0.9974733295901179, + "grad_norm": 0.6235171556472778, + "learning_rate": 8.442236894208819e-06, + "loss": 0.4203, + "step": 3553 + }, + { + "epoch": 0.9977540707467715, + "grad_norm": 0.6908484697341919, + "learning_rate": 8.441052061457444e-06, + "loss": 0.4687, + "step": 3554 + }, + { + "epoch": 0.998034811903425, + "grad_norm": 0.7158430218696594, + "learning_rate": 8.439866861498392e-06, + "loss": 0.4334, + "step": 3555 + }, + { + "epoch": 0.9983155530600786, + "grad_norm": 0.6933430433273315, + "learning_rate": 8.438681294458137e-06, + "loss": 0.4424, + "step": 3556 + }, + { + "epoch": 0.9985962942167321, + "grad_norm": 0.8389679193496704, + "learning_rate": 8.4374953604632e-06, + "loss": 0.4369, + "step": 3557 + }, + { + "epoch": 0.9988770353733858, + "grad_norm": 0.6856235861778259, + "learning_rate": 8.436309059640136e-06, + "loss": 0.4675, + "step": 3558 + }, + { + "epoch": 0.9991577765300393, + "grad_norm": 0.7605506181716919, + "learning_rate": 8.435122392115536e-06, + "loss": 0.4366, + "step": 3559 + }, + { + "epoch": 0.9994385176866929, + "grad_norm": 0.6978572607040405, + "learning_rate": 8.433935358016037e-06, + "loss": 0.4646, + "step": 3560 + }, + { + "epoch": 0.9997192588433464, + "grad_norm": 0.6760448217391968, + "learning_rate": 8.43274795746831e-06, + "loss": 0.4125, + "step": 3561 + }, + { + "epoch": 1.0, + "grad_norm": 0.6113824248313904, + "learning_rate": 8.431560190599069e-06, + "loss": 0.3944, + "step": 3562 + }, + { + "epoch": 1.0002807411566537, + "grad_norm": 0.8328418135643005, + "learning_rate": 8.430372057535063e-06, + "loss": 0.4023, + "step": 3563 + }, + { + "epoch": 1.000561482313307, + "grad_norm": 0.7610487341880798, + "learning_rate": 8.429183558403083e-06, + "loss": 0.4073, + "step": 3564 + }, + { + "epoch": 1.0008422234699608, + "grad_norm": 0.8734410405158997, + "learning_rate": 8.427994693329959e-06, + "loss": 0.4443, + "step": 3565 + }, + { + "epoch": 1.0011229646266142, + "grad_norm": 0.7603929042816162, + "learning_rate": 8.426805462442558e-06, + "loss": 0.3791, + "step": 3566 + }, + { + "epoch": 1.0014037057832679, + "grad_norm": 0.7288072109222412, + "learning_rate": 8.42561586586779e-06, + "loss": 0.3872, + "step": 3567 + }, + { + "epoch": 1.0016844469399213, + "grad_norm": 0.7560978531837463, + "learning_rate": 8.424425903732596e-06, + "loss": 0.3625, + "step": 3568 + }, + { + "epoch": 1.001965188096575, + "grad_norm": 0.6720370054244995, + "learning_rate": 8.423235576163966e-06, + "loss": 0.4031, + "step": 3569 + }, + { + "epoch": 1.0022459292532284, + "grad_norm": 0.6554220914840698, + "learning_rate": 8.422044883288922e-06, + "loss": 0.3758, + "step": 3570 + }, + { + "epoch": 1.002526670409882, + "grad_norm": 0.7182668447494507, + "learning_rate": 8.42085382523453e-06, + "loss": 0.3685, + "step": 3571 + }, + { + "epoch": 1.0028074115665357, + "grad_norm": 0.6917582154273987, + "learning_rate": 8.41966240212789e-06, + "loss": 0.3874, + "step": 3572 + }, + { + "epoch": 1.0030881527231892, + "grad_norm": 0.6168217062950134, + "learning_rate": 8.418470614096144e-06, + "loss": 0.419, + "step": 3573 + }, + { + "epoch": 1.0033688938798428, + "grad_norm": 0.6358725428581238, + "learning_rate": 8.417278461266472e-06, + "loss": 0.3979, + "step": 3574 + }, + { + "epoch": 1.0036496350364963, + "grad_norm": 0.6770613789558411, + "learning_rate": 8.416085943766095e-06, + "loss": 0.4123, + "step": 3575 + }, + { + "epoch": 1.00393037619315, + "grad_norm": 0.5870041847229004, + "learning_rate": 8.414893061722267e-06, + "loss": 0.4045, + "step": 3576 + }, + { + "epoch": 1.0042111173498034, + "grad_norm": 0.6959606409072876, + "learning_rate": 8.413699815262289e-06, + "loss": 0.3818, + "step": 3577 + }, + { + "epoch": 1.004491858506457, + "grad_norm": 0.669116199016571, + "learning_rate": 8.412506204513494e-06, + "loss": 0.3662, + "step": 3578 + }, + { + "epoch": 1.0047725996631107, + "grad_norm": 0.5783640742301941, + "learning_rate": 8.411312229603257e-06, + "loss": 0.3563, + "step": 3579 + }, + { + "epoch": 1.0050533408197642, + "grad_norm": 0.7830600142478943, + "learning_rate": 8.410117890658994e-06, + "loss": 0.427, + "step": 3580 + }, + { + "epoch": 1.0053340819764178, + "grad_norm": 0.7403177618980408, + "learning_rate": 8.408923187808156e-06, + "loss": 0.4321, + "step": 3581 + }, + { + "epoch": 1.0056148231330713, + "grad_norm": 0.8039252161979675, + "learning_rate": 8.407728121178232e-06, + "loss": 0.3812, + "step": 3582 + }, + { + "epoch": 1.005895564289725, + "grad_norm": 0.6774498224258423, + "learning_rate": 8.406532690896756e-06, + "loss": 0.4349, + "step": 3583 + }, + { + "epoch": 1.0061763054463784, + "grad_norm": 0.7238128185272217, + "learning_rate": 8.405336897091294e-06, + "loss": 0.3924, + "step": 3584 + }, + { + "epoch": 1.006457046603032, + "grad_norm": 0.7300281524658203, + "learning_rate": 8.404140739889455e-06, + "loss": 0.3747, + "step": 3585 + }, + { + "epoch": 1.0067377877596855, + "grad_norm": 0.6838752627372742, + "learning_rate": 8.402944219418887e-06, + "loss": 0.3846, + "step": 3586 + }, + { + "epoch": 1.0070185289163391, + "grad_norm": 0.6838813424110413, + "learning_rate": 8.40174733580727e-06, + "loss": 0.3811, + "step": 3587 + }, + { + "epoch": 1.0072992700729928, + "grad_norm": 0.7132014632225037, + "learning_rate": 8.400550089182334e-06, + "loss": 0.402, + "step": 3588 + }, + { + "epoch": 1.0075800112296462, + "grad_norm": 0.594083845615387, + "learning_rate": 8.399352479671839e-06, + "loss": 0.3855, + "step": 3589 + }, + { + "epoch": 1.0078607523863, + "grad_norm": 0.638372540473938, + "learning_rate": 8.398154507403587e-06, + "loss": 0.3787, + "step": 3590 + }, + { + "epoch": 1.0081414935429533, + "grad_norm": 0.626323401927948, + "learning_rate": 8.396956172505414e-06, + "loss": 0.3891, + "step": 3591 + }, + { + "epoch": 1.008422234699607, + "grad_norm": 0.7636907696723938, + "learning_rate": 8.395757475105206e-06, + "loss": 0.404, + "step": 3592 + }, + { + "epoch": 1.0087029758562605, + "grad_norm": 0.6529296636581421, + "learning_rate": 8.394558415330879e-06, + "loss": 0.3461, + "step": 3593 + }, + { + "epoch": 1.0089837170129141, + "grad_norm": 0.692482054233551, + "learning_rate": 8.393358993310384e-06, + "loss": 0.3659, + "step": 3594 + }, + { + "epoch": 1.0092644581695676, + "grad_norm": 0.6872764229774475, + "learning_rate": 8.392159209171717e-06, + "loss": 0.3768, + "step": 3595 + }, + { + "epoch": 1.0095451993262212, + "grad_norm": 0.7262110710144043, + "learning_rate": 8.390959063042917e-06, + "loss": 0.3879, + "step": 3596 + }, + { + "epoch": 1.0098259404828749, + "grad_norm": 0.6634448766708374, + "learning_rate": 8.389758555052053e-06, + "loss": 0.3694, + "step": 3597 + }, + { + "epoch": 1.0101066816395283, + "grad_norm": 0.6361666321754456, + "learning_rate": 8.388557685327234e-06, + "loss": 0.3851, + "step": 3598 + }, + { + "epoch": 1.010387422796182, + "grad_norm": 0.6331276297569275, + "learning_rate": 8.387356453996612e-06, + "loss": 0.3729, + "step": 3599 + }, + { + "epoch": 1.0106681639528354, + "grad_norm": 0.66754150390625, + "learning_rate": 8.386154861188374e-06, + "loss": 0.3871, + "step": 3600 + }, + { + "epoch": 1.010948905109489, + "grad_norm": 0.822075605392456, + "learning_rate": 8.384952907030744e-06, + "loss": 0.4391, + "step": 3601 + }, + { + "epoch": 1.0112296462661425, + "grad_norm": 0.8297271132469177, + "learning_rate": 8.383750591651991e-06, + "loss": 0.4098, + "step": 3602 + }, + { + "epoch": 1.0115103874227962, + "grad_norm": 0.6594547629356384, + "learning_rate": 8.382547915180417e-06, + "loss": 0.4068, + "step": 3603 + }, + { + "epoch": 1.0117911285794496, + "grad_norm": 0.7082101702690125, + "learning_rate": 8.381344877744366e-06, + "loss": 0.3876, + "step": 3604 + }, + { + "epoch": 1.0120718697361033, + "grad_norm": 0.7290804386138916, + "learning_rate": 8.380141479472214e-06, + "loss": 0.4073, + "step": 3605 + }, + { + "epoch": 1.012352610892757, + "grad_norm": 0.6972177028656006, + "learning_rate": 8.378937720492384e-06, + "loss": 0.3706, + "step": 3606 + }, + { + "epoch": 1.0126333520494104, + "grad_norm": 0.6854816675186157, + "learning_rate": 8.377733600933333e-06, + "loss": 0.4191, + "step": 3607 + }, + { + "epoch": 1.012914093206064, + "grad_norm": 0.6364105939865112, + "learning_rate": 8.376529120923556e-06, + "loss": 0.4325, + "step": 3608 + }, + { + "epoch": 1.0131948343627175, + "grad_norm": 0.7041125893592834, + "learning_rate": 8.37532428059159e-06, + "loss": 0.3666, + "step": 3609 + }, + { + "epoch": 1.0134755755193712, + "grad_norm": 0.6840842962265015, + "learning_rate": 8.374119080066005e-06, + "loss": 0.392, + "step": 3610 + }, + { + "epoch": 1.0137563166760246, + "grad_norm": 0.7031016945838928, + "learning_rate": 8.372913519475415e-06, + "loss": 0.3791, + "step": 3611 + }, + { + "epoch": 1.0140370578326783, + "grad_norm": 0.7323938012123108, + "learning_rate": 8.371707598948468e-06, + "loss": 0.3947, + "step": 3612 + }, + { + "epoch": 1.014317798989332, + "grad_norm": 0.6816534996032715, + "learning_rate": 8.370501318613855e-06, + "loss": 0.3445, + "step": 3613 + }, + { + "epoch": 1.0145985401459854, + "grad_norm": 0.7405982613563538, + "learning_rate": 8.3692946786003e-06, + "loss": 0.4005, + "step": 3614 + }, + { + "epoch": 1.014879281302639, + "grad_norm": 0.5886854529380798, + "learning_rate": 8.36808767903657e-06, + "loss": 0.3746, + "step": 3615 + }, + { + "epoch": 1.0151600224592925, + "grad_norm": 0.7090999484062195, + "learning_rate": 8.366880320051465e-06, + "loss": 0.3456, + "step": 3616 + }, + { + "epoch": 1.0154407636159462, + "grad_norm": 0.6102776527404785, + "learning_rate": 8.365672601773833e-06, + "loss": 0.3948, + "step": 3617 + }, + { + "epoch": 1.0157215047725996, + "grad_norm": 0.7303576469421387, + "learning_rate": 8.364464524332547e-06, + "loss": 0.3747, + "step": 3618 + }, + { + "epoch": 1.0160022459292533, + "grad_norm": 0.6708539724349976, + "learning_rate": 8.363256087856532e-06, + "loss": 0.3588, + "step": 3619 + }, + { + "epoch": 1.0162829870859067, + "grad_norm": 0.6503663659095764, + "learning_rate": 8.362047292474741e-06, + "loss": 0.3458, + "step": 3620 + }, + { + "epoch": 1.0165637282425604, + "grad_norm": 0.6906838417053223, + "learning_rate": 8.36083813831617e-06, + "loss": 0.3953, + "step": 3621 + }, + { + "epoch": 1.016844469399214, + "grad_norm": 0.6733444333076477, + "learning_rate": 8.359628625509852e-06, + "loss": 0.3914, + "step": 3622 + }, + { + "epoch": 1.0171252105558675, + "grad_norm": 0.7370371222496033, + "learning_rate": 8.35841875418486e-06, + "loss": 0.4315, + "step": 3623 + }, + { + "epoch": 1.0174059517125211, + "grad_norm": 0.7200049757957458, + "learning_rate": 8.357208524470304e-06, + "loss": 0.3529, + "step": 3624 + }, + { + "epoch": 1.0176866928691746, + "grad_norm": 0.6907103657722473, + "learning_rate": 8.355997936495332e-06, + "loss": 0.3867, + "step": 3625 + }, + { + "epoch": 1.0179674340258282, + "grad_norm": 0.6707966327667236, + "learning_rate": 8.354786990389128e-06, + "loss": 0.3675, + "step": 3626 + }, + { + "epoch": 1.0182481751824817, + "grad_norm": 0.710354208946228, + "learning_rate": 8.35357568628092e-06, + "loss": 0.3761, + "step": 3627 + }, + { + "epoch": 1.0185289163391353, + "grad_norm": 0.629179060459137, + "learning_rate": 8.352364024299966e-06, + "loss": 0.3714, + "step": 3628 + }, + { + "epoch": 1.0188096574957888, + "grad_norm": 0.677626371383667, + "learning_rate": 8.351152004575573e-06, + "loss": 0.3879, + "step": 3629 + }, + { + "epoch": 1.0190903986524424, + "grad_norm": 0.7375317811965942, + "learning_rate": 8.349939627237079e-06, + "loss": 0.3909, + "step": 3630 + }, + { + "epoch": 1.019371139809096, + "grad_norm": 0.6857993006706238, + "learning_rate": 8.348726892413857e-06, + "loss": 0.3787, + "step": 3631 + }, + { + "epoch": 1.0196518809657495, + "grad_norm": 0.6406326293945312, + "learning_rate": 8.347513800235325e-06, + "loss": 0.4423, + "step": 3632 + }, + { + "epoch": 1.0199326221224032, + "grad_norm": 0.6543883085250854, + "learning_rate": 8.346300350830938e-06, + "loss": 0.3737, + "step": 3633 + }, + { + "epoch": 1.0202133632790567, + "grad_norm": 0.6704428195953369, + "learning_rate": 8.345086544330188e-06, + "loss": 0.3783, + "step": 3634 + }, + { + "epoch": 1.0204941044357103, + "grad_norm": 0.7376469969749451, + "learning_rate": 8.343872380862601e-06, + "loss": 0.4056, + "step": 3635 + }, + { + "epoch": 1.0207748455923638, + "grad_norm": 0.6677947044372559, + "learning_rate": 8.34265786055775e-06, + "loss": 0.3739, + "step": 3636 + }, + { + "epoch": 1.0210555867490174, + "grad_norm": 0.7404154539108276, + "learning_rate": 8.341442983545239e-06, + "loss": 0.383, + "step": 3637 + }, + { + "epoch": 1.0213363279056709, + "grad_norm": 0.7266256213188171, + "learning_rate": 8.340227749954712e-06, + "loss": 0.4031, + "step": 3638 + }, + { + "epoch": 1.0216170690623245, + "grad_norm": 0.6936408281326294, + "learning_rate": 8.339012159915848e-06, + "loss": 0.3589, + "step": 3639 + }, + { + "epoch": 1.0218978102189782, + "grad_norm": 0.7971001267433167, + "learning_rate": 8.337796213558374e-06, + "loss": 0.412, + "step": 3640 + }, + { + "epoch": 1.0221785513756316, + "grad_norm": 0.7692808508872986, + "learning_rate": 8.336579911012043e-06, + "loss": 0.3846, + "step": 3641 + }, + { + "epoch": 1.0224592925322853, + "grad_norm": 0.6137217283248901, + "learning_rate": 8.335363252406652e-06, + "loss": 0.3909, + "step": 3642 + }, + { + "epoch": 1.0227400336889387, + "grad_norm": 0.8090940117835999, + "learning_rate": 8.334146237872037e-06, + "loss": 0.442, + "step": 3643 + }, + { + "epoch": 1.0230207748455924, + "grad_norm": 0.716333270072937, + "learning_rate": 8.332928867538068e-06, + "loss": 0.4048, + "step": 3644 + }, + { + "epoch": 1.0233015160022458, + "grad_norm": 0.6243662238121033, + "learning_rate": 8.331711141534657e-06, + "loss": 0.3809, + "step": 3645 + }, + { + "epoch": 1.0235822571588995, + "grad_norm": 0.6206414699554443, + "learning_rate": 8.33049305999175e-06, + "loss": 0.3563, + "step": 3646 + }, + { + "epoch": 1.0238629983155532, + "grad_norm": 0.6339302062988281, + "learning_rate": 8.329274623039339e-06, + "loss": 0.3873, + "step": 3647 + }, + { + "epoch": 1.0241437394722066, + "grad_norm": 0.6579312086105347, + "learning_rate": 8.32805583080744e-06, + "loss": 0.4038, + "step": 3648 + }, + { + "epoch": 1.0244244806288603, + "grad_norm": 0.687745213508606, + "learning_rate": 8.326836683426118e-06, + "loss": 0.4212, + "step": 3649 + }, + { + "epoch": 1.0247052217855137, + "grad_norm": 0.642025351524353, + "learning_rate": 8.325617181025476e-06, + "loss": 0.4016, + "step": 3650 + }, + { + "epoch": 1.0249859629421674, + "grad_norm": 0.5885717272758484, + "learning_rate": 8.324397323735646e-06, + "loss": 0.3893, + "step": 3651 + }, + { + "epoch": 1.0252667040988208, + "grad_norm": 0.6964549422264099, + "learning_rate": 8.32317711168681e-06, + "loss": 0.3858, + "step": 3652 + }, + { + "epoch": 1.0255474452554745, + "grad_norm": 0.757373034954071, + "learning_rate": 8.321956545009176e-06, + "loss": 0.3824, + "step": 3653 + }, + { + "epoch": 1.025828186412128, + "grad_norm": 0.656338632106781, + "learning_rate": 8.320735623832998e-06, + "loss": 0.3795, + "step": 3654 + }, + { + "epoch": 1.0261089275687816, + "grad_norm": 0.6454774737358093, + "learning_rate": 8.319514348288566e-06, + "loss": 0.388, + "step": 3655 + }, + { + "epoch": 1.0263896687254352, + "grad_norm": 0.8298314213752747, + "learning_rate": 8.318292718506204e-06, + "loss": 0.4123, + "step": 3656 + }, + { + "epoch": 1.0266704098820887, + "grad_norm": 0.7114923000335693, + "learning_rate": 8.317070734616278e-06, + "loss": 0.3873, + "step": 3657 + }, + { + "epoch": 1.0269511510387423, + "grad_norm": 0.7187616229057312, + "learning_rate": 8.31584839674919e-06, + "loss": 0.3787, + "step": 3658 + }, + { + "epoch": 1.0272318921953958, + "grad_norm": 0.680168628692627, + "learning_rate": 8.314625705035382e-06, + "loss": 0.4077, + "step": 3659 + }, + { + "epoch": 1.0275126333520495, + "grad_norm": 0.6636437773704529, + "learning_rate": 8.313402659605332e-06, + "loss": 0.3907, + "step": 3660 + }, + { + "epoch": 1.027793374508703, + "grad_norm": 0.719252347946167, + "learning_rate": 8.312179260589553e-06, + "loss": 0.3776, + "step": 3661 + }, + { + "epoch": 1.0280741156653566, + "grad_norm": 0.9567844271659851, + "learning_rate": 8.310955508118601e-06, + "loss": 0.3921, + "step": 3662 + }, + { + "epoch": 1.02835485682201, + "grad_norm": 0.7299419045448303, + "learning_rate": 8.309731402323066e-06, + "loss": 0.4071, + "step": 3663 + }, + { + "epoch": 1.0286355979786637, + "grad_norm": 0.7468762397766113, + "learning_rate": 8.308506943333578e-06, + "loss": 0.3831, + "step": 3664 + }, + { + "epoch": 1.0289163391353173, + "grad_norm": 0.8965431451797485, + "learning_rate": 8.307282131280805e-06, + "loss": 0.3694, + "step": 3665 + }, + { + "epoch": 1.0291970802919708, + "grad_norm": 0.8244647979736328, + "learning_rate": 8.306056966295448e-06, + "loss": 0.3921, + "step": 3666 + }, + { + "epoch": 1.0294778214486244, + "grad_norm": 0.6560270190238953, + "learning_rate": 8.30483144850825e-06, + "loss": 0.4107, + "step": 3667 + }, + { + "epoch": 1.0297585626052779, + "grad_norm": 0.8128765225410461, + "learning_rate": 8.303605578049993e-06, + "loss": 0.4124, + "step": 3668 + }, + { + "epoch": 1.0300393037619315, + "grad_norm": 0.7475265264511108, + "learning_rate": 8.302379355051491e-06, + "loss": 0.3652, + "step": 3669 + }, + { + "epoch": 1.030320044918585, + "grad_norm": 0.713414192199707, + "learning_rate": 8.301152779643602e-06, + "loss": 0.4044, + "step": 3670 + }, + { + "epoch": 1.0306007860752386, + "grad_norm": 0.6739659905433655, + "learning_rate": 8.299925851957216e-06, + "loss": 0.3963, + "step": 3671 + }, + { + "epoch": 1.0308815272318923, + "grad_norm": 0.729546844959259, + "learning_rate": 8.298698572123263e-06, + "loss": 0.3907, + "step": 3672 + }, + { + "epoch": 1.0311622683885457, + "grad_norm": 0.7554054260253906, + "learning_rate": 8.297470940272712e-06, + "loss": 0.366, + "step": 3673 + }, + { + "epoch": 1.0314430095451994, + "grad_norm": 0.6990939378738403, + "learning_rate": 8.296242956536569e-06, + "loss": 0.4435, + "step": 3674 + }, + { + "epoch": 1.0317237507018528, + "grad_norm": 0.6825934648513794, + "learning_rate": 8.295014621045874e-06, + "loss": 0.3902, + "step": 3675 + }, + { + "epoch": 1.0320044918585065, + "grad_norm": 0.7339544296264648, + "learning_rate": 8.29378593393171e-06, + "loss": 0.4016, + "step": 3676 + }, + { + "epoch": 1.03228523301516, + "grad_norm": 0.5781506299972534, + "learning_rate": 8.292556895325195e-06, + "loss": 0.4108, + "step": 3677 + }, + { + "epoch": 1.0325659741718136, + "grad_norm": 0.6548964381217957, + "learning_rate": 8.29132750535748e-06, + "loss": 0.3509, + "step": 3678 + }, + { + "epoch": 1.032846715328467, + "grad_norm": 0.6997511386871338, + "learning_rate": 8.290097764159764e-06, + "loss": 0.392, + "step": 3679 + }, + { + "epoch": 1.0331274564851207, + "grad_norm": 0.776130199432373, + "learning_rate": 8.288867671863274e-06, + "loss": 0.3587, + "step": 3680 + }, + { + "epoch": 1.0334081976417744, + "grad_norm": 0.6548985242843628, + "learning_rate": 8.287637228599278e-06, + "loss": 0.4048, + "step": 3681 + }, + { + "epoch": 1.0336889387984278, + "grad_norm": 0.6806820034980774, + "learning_rate": 8.28640643449908e-06, + "loss": 0.4257, + "step": 3682 + }, + { + "epoch": 1.0339696799550815, + "grad_norm": 0.7635098099708557, + "learning_rate": 8.285175289694024e-06, + "loss": 0.3998, + "step": 3683 + }, + { + "epoch": 1.034250421111735, + "grad_norm": 0.7298235297203064, + "learning_rate": 8.283943794315492e-06, + "loss": 0.3871, + "step": 3684 + }, + { + "epoch": 1.0345311622683886, + "grad_norm": 0.6716488003730774, + "learning_rate": 8.282711948494899e-06, + "loss": 0.3432, + "step": 3685 + }, + { + "epoch": 1.034811903425042, + "grad_norm": 0.6999754905700684, + "learning_rate": 8.2814797523637e-06, + "loss": 0.4539, + "step": 3686 + }, + { + "epoch": 1.0350926445816957, + "grad_norm": 0.7473323345184326, + "learning_rate": 8.28024720605339e-06, + "loss": 0.3962, + "step": 3687 + }, + { + "epoch": 1.0353733857383491, + "grad_norm": 0.6757204532623291, + "learning_rate": 8.279014309695494e-06, + "loss": 0.3905, + "step": 3688 + }, + { + "epoch": 1.0356541268950028, + "grad_norm": 0.804010808467865, + "learning_rate": 8.277781063421584e-06, + "loss": 0.4076, + "step": 3689 + }, + { + "epoch": 1.0359348680516565, + "grad_norm": 0.686629593372345, + "learning_rate": 8.276547467363263e-06, + "loss": 0.3904, + "step": 3690 + }, + { + "epoch": 1.03621560920831, + "grad_norm": 0.6530104875564575, + "learning_rate": 8.275313521652168e-06, + "loss": 0.3445, + "step": 3691 + }, + { + "epoch": 1.0364963503649636, + "grad_norm": 0.6870188117027283, + "learning_rate": 8.274079226419984e-06, + "loss": 0.4052, + "step": 3692 + }, + { + "epoch": 1.036777091521617, + "grad_norm": 0.8461865782737732, + "learning_rate": 8.272844581798426e-06, + "loss": 0.3839, + "step": 3693 + }, + { + "epoch": 1.0370578326782707, + "grad_norm": 0.783974826335907, + "learning_rate": 8.271609587919245e-06, + "loss": 0.3916, + "step": 3694 + }, + { + "epoch": 1.0373385738349241, + "grad_norm": 0.627433180809021, + "learning_rate": 8.270374244914234e-06, + "loss": 0.3824, + "step": 3695 + }, + { + "epoch": 1.0376193149915778, + "grad_norm": 0.6883553266525269, + "learning_rate": 8.269138552915221e-06, + "loss": 0.3889, + "step": 3696 + }, + { + "epoch": 1.0379000561482314, + "grad_norm": 0.7421615123748779, + "learning_rate": 8.267902512054071e-06, + "loss": 0.4252, + "step": 3697 + }, + { + "epoch": 1.0381807973048849, + "grad_norm": 0.801162600517273, + "learning_rate": 8.266666122462687e-06, + "loss": 0.4435, + "step": 3698 + }, + { + "epoch": 1.0384615384615385, + "grad_norm": 0.7508783936500549, + "learning_rate": 8.265429384273007e-06, + "loss": 0.4253, + "step": 3699 + }, + { + "epoch": 1.038742279618192, + "grad_norm": 0.6713810563087463, + "learning_rate": 8.264192297617011e-06, + "loss": 0.3849, + "step": 3700 + }, + { + "epoch": 1.0390230207748457, + "grad_norm": 0.7695631384849548, + "learning_rate": 8.262954862626711e-06, + "loss": 0.3659, + "step": 3701 + }, + { + "epoch": 1.039303761931499, + "grad_norm": 0.7910667657852173, + "learning_rate": 8.26171707943416e-06, + "loss": 0.3943, + "step": 3702 + }, + { + "epoch": 1.0395845030881528, + "grad_norm": 0.860735297203064, + "learning_rate": 8.260478948171444e-06, + "loss": 0.4034, + "step": 3703 + }, + { + "epoch": 1.0398652442448062, + "grad_norm": 0.7097742557525635, + "learning_rate": 8.259240468970692e-06, + "loss": 0.3796, + "step": 3704 + }, + { + "epoch": 1.0401459854014599, + "grad_norm": 0.6722464561462402, + "learning_rate": 8.258001641964064e-06, + "loss": 0.4049, + "step": 3705 + }, + { + "epoch": 1.0404267265581135, + "grad_norm": 0.6745389699935913, + "learning_rate": 8.256762467283762e-06, + "loss": 0.3896, + "step": 3706 + }, + { + "epoch": 1.040707467714767, + "grad_norm": 0.7041593194007874, + "learning_rate": 8.25552294506202e-06, + "loss": 0.388, + "step": 3707 + }, + { + "epoch": 1.0409882088714206, + "grad_norm": 0.7543497085571289, + "learning_rate": 8.254283075431115e-06, + "loss": 0.3784, + "step": 3708 + }, + { + "epoch": 1.041268950028074, + "grad_norm": 0.6669436097145081, + "learning_rate": 8.253042858523356e-06, + "loss": 0.4165, + "step": 3709 + }, + { + "epoch": 1.0415496911847277, + "grad_norm": 0.664459764957428, + "learning_rate": 8.251802294471094e-06, + "loss": 0.3455, + "step": 3710 + }, + { + "epoch": 1.0418304323413812, + "grad_norm": 0.7049517035484314, + "learning_rate": 8.250561383406713e-06, + "loss": 0.3815, + "step": 3711 + }, + { + "epoch": 1.0421111734980348, + "grad_norm": 0.6471162438392639, + "learning_rate": 8.249320125462636e-06, + "loss": 0.4003, + "step": 3712 + }, + { + "epoch": 1.0423919146546883, + "grad_norm": 0.6745674014091492, + "learning_rate": 8.248078520771319e-06, + "loss": 0.3649, + "step": 3713 + }, + { + "epoch": 1.042672655811342, + "grad_norm": 0.799483060836792, + "learning_rate": 8.246836569465262e-06, + "loss": 0.3882, + "step": 3714 + }, + { + "epoch": 1.0429533969679956, + "grad_norm": 0.6792709231376648, + "learning_rate": 8.245594271676998e-06, + "loss": 0.4062, + "step": 3715 + }, + { + "epoch": 1.043234138124649, + "grad_norm": 0.633111298084259, + "learning_rate": 8.244351627539093e-06, + "loss": 0.4272, + "step": 3716 + }, + { + "epoch": 1.0435148792813027, + "grad_norm": 0.7625438570976257, + "learning_rate": 8.24310863718416e-06, + "loss": 0.4234, + "step": 3717 + }, + { + "epoch": 1.0437956204379562, + "grad_norm": 0.6400211453437805, + "learning_rate": 8.24186530074484e-06, + "loss": 0.3414, + "step": 3718 + }, + { + "epoch": 1.0440763615946098, + "grad_norm": 0.6569346189498901, + "learning_rate": 8.240621618353817e-06, + "loss": 0.3917, + "step": 3719 + }, + { + "epoch": 1.0443571027512633, + "grad_norm": 0.7927652597427368, + "learning_rate": 8.239377590143804e-06, + "loss": 0.389, + "step": 3720 + }, + { + "epoch": 1.044637843907917, + "grad_norm": 0.6913268566131592, + "learning_rate": 8.23813321624756e-06, + "loss": 0.414, + "step": 3721 + }, + { + "epoch": 1.0449185850645704, + "grad_norm": 0.6259279847145081, + "learning_rate": 8.236888496797878e-06, + "loss": 0.3848, + "step": 3722 + }, + { + "epoch": 1.045199326221224, + "grad_norm": 0.6240760684013367, + "learning_rate": 8.235643431927582e-06, + "loss": 0.3473, + "step": 3723 + }, + { + "epoch": 1.0454800673778777, + "grad_norm": 0.6741257309913635, + "learning_rate": 8.234398021769541e-06, + "loss": 0.3839, + "step": 3724 + }, + { + "epoch": 1.0457608085345311, + "grad_norm": 0.634797215461731, + "learning_rate": 8.233152266456656e-06, + "loss": 0.4012, + "step": 3725 + }, + { + "epoch": 1.0460415496911848, + "grad_norm": 0.7278827428817749, + "learning_rate": 8.231906166121868e-06, + "loss": 0.3915, + "step": 3726 + }, + { + "epoch": 1.0463222908478382, + "grad_norm": 0.6704578399658203, + "learning_rate": 8.23065972089815e-06, + "loss": 0.3334, + "step": 3727 + }, + { + "epoch": 1.046603032004492, + "grad_norm": 0.678528904914856, + "learning_rate": 8.229412930918517e-06, + "loss": 0.3794, + "step": 3728 + }, + { + "epoch": 1.0468837731611453, + "grad_norm": 0.7016115188598633, + "learning_rate": 8.22816579631602e-06, + "loss": 0.3424, + "step": 3729 + }, + { + "epoch": 1.047164514317799, + "grad_norm": 0.7756428122520447, + "learning_rate": 8.226918317223744e-06, + "loss": 0.4266, + "step": 3730 + }, + { + "epoch": 1.0474452554744524, + "grad_norm": 0.8074459433555603, + "learning_rate": 8.22567049377481e-06, + "loss": 0.3867, + "step": 3731 + }, + { + "epoch": 1.047725996631106, + "grad_norm": 0.7199363708496094, + "learning_rate": 8.224422326102381e-06, + "loss": 0.4191, + "step": 3732 + }, + { + "epoch": 1.0480067377877598, + "grad_norm": 0.7219660878181458, + "learning_rate": 8.223173814339653e-06, + "loss": 0.4009, + "step": 3733 + }, + { + "epoch": 1.0482874789444132, + "grad_norm": 0.838511049747467, + "learning_rate": 8.221924958619857e-06, + "loss": 0.3618, + "step": 3734 + }, + { + "epoch": 1.0485682201010669, + "grad_norm": 0.8532513380050659, + "learning_rate": 8.220675759076269e-06, + "loss": 0.4255, + "step": 3735 + }, + { + "epoch": 1.0488489612577203, + "grad_norm": 0.7771586775779724, + "learning_rate": 8.21942621584219e-06, + "loss": 0.3698, + "step": 3736 + }, + { + "epoch": 1.049129702414374, + "grad_norm": 0.7285014390945435, + "learning_rate": 8.218176329050965e-06, + "loss": 0.3944, + "step": 3737 + }, + { + "epoch": 1.0494104435710274, + "grad_norm": 0.6804565787315369, + "learning_rate": 8.216926098835975e-06, + "loss": 0.412, + "step": 3738 + }, + { + "epoch": 1.049691184727681, + "grad_norm": 0.7419346570968628, + "learning_rate": 8.215675525330637e-06, + "loss": 0.4103, + "step": 3739 + }, + { + "epoch": 1.0499719258843347, + "grad_norm": 0.744171142578125, + "learning_rate": 8.214424608668404e-06, + "loss": 0.3855, + "step": 3740 + }, + { + "epoch": 1.0502526670409882, + "grad_norm": 0.7131140232086182, + "learning_rate": 8.213173348982766e-06, + "loss": 0.4163, + "step": 3741 + }, + { + "epoch": 1.0505334081976418, + "grad_norm": 0.643436849117279, + "learning_rate": 8.21192174640725e-06, + "loss": 0.4241, + "step": 3742 + }, + { + "epoch": 1.0508141493542953, + "grad_norm": 0.6585485935211182, + "learning_rate": 8.210669801075417e-06, + "loss": 0.4082, + "step": 3743 + }, + { + "epoch": 1.051094890510949, + "grad_norm": 0.7669865489006042, + "learning_rate": 8.20941751312087e-06, + "loss": 0.4145, + "step": 3744 + }, + { + "epoch": 1.0513756316676024, + "grad_norm": 0.6367470622062683, + "learning_rate": 8.208164882677244e-06, + "loss": 0.3739, + "step": 3745 + }, + { + "epoch": 1.051656372824256, + "grad_norm": 0.6548689007759094, + "learning_rate": 8.206911909878212e-06, + "loss": 0.3783, + "step": 3746 + }, + { + "epoch": 1.0519371139809095, + "grad_norm": 0.6977734565734863, + "learning_rate": 8.205658594857483e-06, + "loss": 0.4005, + "step": 3747 + }, + { + "epoch": 1.0522178551375632, + "grad_norm": 0.6368660926818848, + "learning_rate": 8.204404937748805e-06, + "loss": 0.3603, + "step": 3748 + }, + { + "epoch": 1.0524985962942168, + "grad_norm": 0.7640892267227173, + "learning_rate": 8.203150938685957e-06, + "loss": 0.3853, + "step": 3749 + }, + { + "epoch": 1.0527793374508703, + "grad_norm": 0.6732112169265747, + "learning_rate": 8.20189659780276e-06, + "loss": 0.4105, + "step": 3750 + }, + { + "epoch": 1.053060078607524, + "grad_norm": 0.6741898655891418, + "learning_rate": 8.20064191523307e-06, + "loss": 0.395, + "step": 3751 + }, + { + "epoch": 1.0533408197641774, + "grad_norm": 0.7830673456192017, + "learning_rate": 8.199386891110778e-06, + "loss": 0.3988, + "step": 3752 + }, + { + "epoch": 1.053621560920831, + "grad_norm": 0.6740986704826355, + "learning_rate": 8.198131525569812e-06, + "loss": 0.3725, + "step": 3753 + }, + { + "epoch": 1.0539023020774845, + "grad_norm": 0.7528538703918457, + "learning_rate": 8.196875818744138e-06, + "loss": 0.4328, + "step": 3754 + }, + { + "epoch": 1.0541830432341381, + "grad_norm": 0.6501534581184387, + "learning_rate": 8.195619770767758e-06, + "loss": 0.3695, + "step": 3755 + }, + { + "epoch": 1.0544637843907916, + "grad_norm": 0.6537755131721497, + "learning_rate": 8.194363381774708e-06, + "loss": 0.3796, + "step": 3756 + }, + { + "epoch": 1.0547445255474452, + "grad_norm": 0.6635921001434326, + "learning_rate": 8.193106651899061e-06, + "loss": 0.3731, + "step": 3757 + }, + { + "epoch": 1.055025266704099, + "grad_norm": 0.6463021636009216, + "learning_rate": 8.19184958127493e-06, + "loss": 0.3818, + "step": 3758 + }, + { + "epoch": 1.0553060078607523, + "grad_norm": 0.7013039588928223, + "learning_rate": 8.19059217003646e-06, + "loss": 0.3829, + "step": 3759 + }, + { + "epoch": 1.055586749017406, + "grad_norm": 0.690478503704071, + "learning_rate": 8.189334418317834e-06, + "loss": 0.3758, + "step": 3760 + }, + { + "epoch": 1.0558674901740595, + "grad_norm": 0.726870596408844, + "learning_rate": 8.188076326253272e-06, + "loss": 0.3802, + "step": 3761 + }, + { + "epoch": 1.0561482313307131, + "grad_norm": 0.7228400111198425, + "learning_rate": 8.18681789397703e-06, + "loss": 0.4015, + "step": 3762 + }, + { + "epoch": 1.0564289724873666, + "grad_norm": 0.6611549258232117, + "learning_rate": 8.1855591216234e-06, + "loss": 0.3876, + "step": 3763 + }, + { + "epoch": 1.0567097136440202, + "grad_norm": 0.6233832836151123, + "learning_rate": 8.18430000932671e-06, + "loss": 0.3835, + "step": 3764 + }, + { + "epoch": 1.0569904548006739, + "grad_norm": 0.7990942597389221, + "learning_rate": 8.183040557221326e-06, + "loss": 0.4109, + "step": 3765 + }, + { + "epoch": 1.0572711959573273, + "grad_norm": 0.6729419827461243, + "learning_rate": 8.181780765441647e-06, + "loss": 0.3981, + "step": 3766 + }, + { + "epoch": 1.057551937113981, + "grad_norm": 0.7742902636528015, + "learning_rate": 8.18052063412211e-06, + "loss": 0.4007, + "step": 3767 + }, + { + "epoch": 1.0578326782706344, + "grad_norm": 0.8096804618835449, + "learning_rate": 8.179260163397191e-06, + "loss": 0.4007, + "step": 3768 + }, + { + "epoch": 1.058113419427288, + "grad_norm": 0.652288556098938, + "learning_rate": 8.177999353401398e-06, + "loss": 0.3933, + "step": 3769 + }, + { + "epoch": 1.0583941605839415, + "grad_norm": 0.8095072507858276, + "learning_rate": 8.176738204269276e-06, + "loss": 0.4036, + "step": 3770 + }, + { + "epoch": 1.0586749017405952, + "grad_norm": 0.7715938687324524, + "learning_rate": 8.175476716135407e-06, + "loss": 0.3928, + "step": 3771 + }, + { + "epoch": 1.0589556428972486, + "grad_norm": 0.6101293563842773, + "learning_rate": 8.17421488913441e-06, + "loss": 0.4006, + "step": 3772 + }, + { + "epoch": 1.0592363840539023, + "grad_norm": 0.6438723206520081, + "learning_rate": 8.172952723400938e-06, + "loss": 0.4201, + "step": 3773 + }, + { + "epoch": 1.059517125210556, + "grad_norm": 0.7053725123405457, + "learning_rate": 8.171690219069686e-06, + "loss": 0.3934, + "step": 3774 + }, + { + "epoch": 1.0597978663672094, + "grad_norm": 0.7070673704147339, + "learning_rate": 8.170427376275374e-06, + "loss": 0.3921, + "step": 3775 + }, + { + "epoch": 1.060078607523863, + "grad_norm": 0.771934449672699, + "learning_rate": 8.16916419515277e-06, + "loss": 0.4117, + "step": 3776 + }, + { + "epoch": 1.0603593486805165, + "grad_norm": 0.6540143489837646, + "learning_rate": 8.167900675836669e-06, + "loss": 0.4106, + "step": 3777 + }, + { + "epoch": 1.0606400898371702, + "grad_norm": 0.6747614741325378, + "learning_rate": 8.16663681846191e-06, + "loss": 0.3754, + "step": 3778 + }, + { + "epoch": 1.0609208309938236, + "grad_norm": 0.6334322094917297, + "learning_rate": 8.16537262316336e-06, + "loss": 0.3775, + "step": 3779 + }, + { + "epoch": 1.0612015721504773, + "grad_norm": 0.7444552779197693, + "learning_rate": 8.164108090075929e-06, + "loss": 0.3851, + "step": 3780 + }, + { + "epoch": 1.0614823133071307, + "grad_norm": 0.7173585295677185, + "learning_rate": 8.162843219334559e-06, + "loss": 0.3771, + "step": 3781 + }, + { + "epoch": 1.0617630544637844, + "grad_norm": 0.6801967024803162, + "learning_rate": 8.161578011074229e-06, + "loss": 0.4322, + "step": 3782 + }, + { + "epoch": 1.062043795620438, + "grad_norm": 0.748656153678894, + "learning_rate": 8.160312465429952e-06, + "loss": 0.37, + "step": 3783 + }, + { + "epoch": 1.0623245367770915, + "grad_norm": 0.8064485788345337, + "learning_rate": 8.159046582536784e-06, + "loss": 0.3818, + "step": 3784 + }, + { + "epoch": 1.0626052779337452, + "grad_norm": 0.7356282472610474, + "learning_rate": 8.157780362529809e-06, + "loss": 0.3927, + "step": 3785 + }, + { + "epoch": 1.0628860190903986, + "grad_norm": 0.6002188920974731, + "learning_rate": 8.156513805544148e-06, + "loss": 0.3642, + "step": 3786 + }, + { + "epoch": 1.0631667602470523, + "grad_norm": 0.691355288028717, + "learning_rate": 8.155246911714965e-06, + "loss": 0.3859, + "step": 3787 + }, + { + "epoch": 1.0634475014037057, + "grad_norm": 0.7035458087921143, + "learning_rate": 8.153979681177453e-06, + "loss": 0.397, + "step": 3788 + }, + { + "epoch": 1.0637282425603594, + "grad_norm": 0.6652584671974182, + "learning_rate": 8.152712114066842e-06, + "loss": 0.4021, + "step": 3789 + }, + { + "epoch": 1.064008983717013, + "grad_norm": 0.5629920959472656, + "learning_rate": 8.1514442105184e-06, + "loss": 0.3324, + "step": 3790 + }, + { + "epoch": 1.0642897248736665, + "grad_norm": 0.6746225357055664, + "learning_rate": 8.150175970667432e-06, + "loss": 0.4047, + "step": 3791 + }, + { + "epoch": 1.0645704660303201, + "grad_norm": 0.6971006989479065, + "learning_rate": 8.14890739464927e-06, + "loss": 0.3861, + "step": 3792 + }, + { + "epoch": 1.0648512071869736, + "grad_norm": 0.6421023011207581, + "learning_rate": 8.147638482599294e-06, + "loss": 0.366, + "step": 3793 + }, + { + "epoch": 1.0651319483436272, + "grad_norm": 0.5770668983459473, + "learning_rate": 8.146369234652913e-06, + "loss": 0.3866, + "step": 3794 + }, + { + "epoch": 1.0654126895002807, + "grad_norm": 0.6772415637969971, + "learning_rate": 8.145099650945574e-06, + "loss": 0.3968, + "step": 3795 + }, + { + "epoch": 1.0656934306569343, + "grad_norm": 0.6348497271537781, + "learning_rate": 8.143829731612757e-06, + "loss": 0.3709, + "step": 3796 + }, + { + "epoch": 1.0659741718135878, + "grad_norm": 0.7543554306030273, + "learning_rate": 8.142559476789982e-06, + "loss": 0.3851, + "step": 3797 + }, + { + "epoch": 1.0662549129702414, + "grad_norm": 0.7075092792510986, + "learning_rate": 8.141288886612801e-06, + "loss": 0.394, + "step": 3798 + }, + { + "epoch": 1.066535654126895, + "grad_norm": 0.6890093088150024, + "learning_rate": 8.140017961216807e-06, + "loss": 0.3853, + "step": 3799 + }, + { + "epoch": 1.0668163952835485, + "grad_norm": 0.6252447366714478, + "learning_rate": 8.13874670073762e-06, + "loss": 0.4105, + "step": 3800 + }, + { + "epoch": 1.0670971364402022, + "grad_norm": 0.6732771396636963, + "learning_rate": 8.137475105310903e-06, + "loss": 0.3713, + "step": 3801 + }, + { + "epoch": 1.0673778775968557, + "grad_norm": 0.7609824538230896, + "learning_rate": 8.136203175072357e-06, + "loss": 0.4072, + "step": 3802 + }, + { + "epoch": 1.0676586187535093, + "grad_norm": 0.6842123866081238, + "learning_rate": 8.134930910157708e-06, + "loss": 0.3409, + "step": 3803 + }, + { + "epoch": 1.0679393599101628, + "grad_norm": 0.7474604249000549, + "learning_rate": 8.133658310702729e-06, + "loss": 0.4253, + "step": 3804 + }, + { + "epoch": 1.0682201010668164, + "grad_norm": 0.7342727184295654, + "learning_rate": 8.132385376843221e-06, + "loss": 0.4085, + "step": 3805 + }, + { + "epoch": 1.0685008422234699, + "grad_norm": 0.6349467039108276, + "learning_rate": 8.131112108715024e-06, + "loss": 0.3756, + "step": 3806 + }, + { + "epoch": 1.0687815833801235, + "grad_norm": 0.7352294921875, + "learning_rate": 8.129838506454015e-06, + "loss": 0.4099, + "step": 3807 + }, + { + "epoch": 1.0690623245367772, + "grad_norm": 0.6199530959129333, + "learning_rate": 8.128564570196102e-06, + "loss": 0.3777, + "step": 3808 + }, + { + "epoch": 1.0693430656934306, + "grad_norm": 0.7040279507637024, + "learning_rate": 8.127290300077236e-06, + "loss": 0.4022, + "step": 3809 + }, + { + "epoch": 1.0696238068500843, + "grad_norm": 0.6669299602508545, + "learning_rate": 8.126015696233396e-06, + "loss": 0.3905, + "step": 3810 + }, + { + "epoch": 1.0699045480067377, + "grad_norm": 0.7671824097633362, + "learning_rate": 8.124740758800598e-06, + "loss": 0.3816, + "step": 3811 + }, + { + "epoch": 1.0701852891633914, + "grad_norm": 0.6888076663017273, + "learning_rate": 8.123465487914898e-06, + "loss": 0.3785, + "step": 3812 + }, + { + "epoch": 1.0704660303200448, + "grad_norm": 0.6825580596923828, + "learning_rate": 8.122189883712387e-06, + "loss": 0.3698, + "step": 3813 + }, + { + "epoch": 1.0707467714766985, + "grad_norm": 0.721526563167572, + "learning_rate": 8.120913946329186e-06, + "loss": 0.3713, + "step": 3814 + }, + { + "epoch": 1.0710275126333522, + "grad_norm": 0.7052328586578369, + "learning_rate": 8.119637675901457e-06, + "loss": 0.3825, + "step": 3815 + }, + { + "epoch": 1.0713082537900056, + "grad_norm": 0.8516050577163696, + "learning_rate": 8.118361072565394e-06, + "loss": 0.4036, + "step": 3816 + }, + { + "epoch": 1.0715889949466593, + "grad_norm": 0.7296687960624695, + "learning_rate": 8.11708413645723e-06, + "loss": 0.3922, + "step": 3817 + }, + { + "epoch": 1.0718697361033127, + "grad_norm": 0.6892791986465454, + "learning_rate": 8.115806867713229e-06, + "loss": 0.41, + "step": 3818 + }, + { + "epoch": 1.0721504772599664, + "grad_norm": 0.8589861989021301, + "learning_rate": 8.114529266469697e-06, + "loss": 0.4311, + "step": 3819 + }, + { + "epoch": 1.0724312184166198, + "grad_norm": 0.7154029607772827, + "learning_rate": 8.113251332862969e-06, + "loss": 0.3736, + "step": 3820 + }, + { + "epoch": 1.0727119595732735, + "grad_norm": 0.7286500930786133, + "learning_rate": 8.11197306702942e-06, + "loss": 0.4383, + "step": 3821 + }, + { + "epoch": 1.072992700729927, + "grad_norm": 0.6642131805419922, + "learning_rate": 8.110694469105456e-06, + "loss": 0.3609, + "step": 3822 + }, + { + "epoch": 1.0732734418865806, + "grad_norm": 0.6660007834434509, + "learning_rate": 8.109415539227522e-06, + "loss": 0.4267, + "step": 3823 + }, + { + "epoch": 1.073554183043234, + "grad_norm": 0.735863208770752, + "learning_rate": 8.108136277532096e-06, + "loss": 0.3674, + "step": 3824 + }, + { + "epoch": 1.0738349241998877, + "grad_norm": 0.7106322646141052, + "learning_rate": 8.1068566841557e-06, + "loss": 0.36, + "step": 3825 + }, + { + "epoch": 1.0741156653565413, + "grad_norm": 0.723576545715332, + "learning_rate": 8.105576759234874e-06, + "loss": 0.3763, + "step": 3826 + }, + { + "epoch": 1.0743964065131948, + "grad_norm": 0.6807546019554138, + "learning_rate": 8.10429650290621e-06, + "loss": 0.429, + "step": 3827 + }, + { + "epoch": 1.0746771476698485, + "grad_norm": 0.7537394165992737, + "learning_rate": 8.10301591530633e-06, + "loss": 0.3705, + "step": 3828 + }, + { + "epoch": 1.074957888826502, + "grad_norm": 0.7843241095542908, + "learning_rate": 8.101734996571886e-06, + "loss": 0.3479, + "step": 3829 + }, + { + "epoch": 1.0752386299831556, + "grad_norm": 0.7866134643554688, + "learning_rate": 8.100453746839571e-06, + "loss": 0.44, + "step": 3830 + }, + { + "epoch": 1.075519371139809, + "grad_norm": 0.7115223407745361, + "learning_rate": 8.099172166246113e-06, + "loss": 0.3798, + "step": 3831 + }, + { + "epoch": 1.0758001122964627, + "grad_norm": 0.7375951409339905, + "learning_rate": 8.097890254928274e-06, + "loss": 0.4256, + "step": 3832 + }, + { + "epoch": 1.0760808534531163, + "grad_norm": 0.7635318040847778, + "learning_rate": 8.096608013022854e-06, + "loss": 0.3746, + "step": 3833 + }, + { + "epoch": 1.0763615946097698, + "grad_norm": 0.7682521343231201, + "learning_rate": 8.095325440666679e-06, + "loss": 0.3834, + "step": 3834 + }, + { + "epoch": 1.0766423357664234, + "grad_norm": 0.7489264011383057, + "learning_rate": 8.094042537996624e-06, + "loss": 0.3951, + "step": 3835 + }, + { + "epoch": 1.0769230769230769, + "grad_norm": 0.682855486869812, + "learning_rate": 8.092759305149588e-06, + "loss": 0.4004, + "step": 3836 + }, + { + "epoch": 1.0772038180797305, + "grad_norm": 0.7808113098144531, + "learning_rate": 8.091475742262514e-06, + "loss": 0.3939, + "step": 3837 + }, + { + "epoch": 1.077484559236384, + "grad_norm": 0.7783780694007874, + "learning_rate": 8.090191849472371e-06, + "loss": 0.3593, + "step": 3838 + }, + { + "epoch": 1.0777653003930376, + "grad_norm": 0.7571975588798523, + "learning_rate": 8.088907626916173e-06, + "loss": 0.3672, + "step": 3839 + }, + { + "epoch": 1.078046041549691, + "grad_norm": 0.7357426285743713, + "learning_rate": 8.08762307473096e-06, + "loss": 0.4213, + "step": 3840 + }, + { + "epoch": 1.0783267827063447, + "grad_norm": 0.8001245260238647, + "learning_rate": 8.086338193053814e-06, + "loss": 0.382, + "step": 3841 + }, + { + "epoch": 1.0786075238629984, + "grad_norm": 0.6524637341499329, + "learning_rate": 8.085052982021849e-06, + "loss": 0.384, + "step": 3842 + }, + { + "epoch": 1.0788882650196518, + "grad_norm": 0.7771375179290771, + "learning_rate": 8.083767441772212e-06, + "loss": 0.3788, + "step": 3843 + }, + { + "epoch": 1.0791690061763055, + "grad_norm": 0.7094160914421082, + "learning_rate": 8.082481572442093e-06, + "loss": 0.3765, + "step": 3844 + }, + { + "epoch": 1.079449747332959, + "grad_norm": 0.7189434170722961, + "learning_rate": 8.081195374168708e-06, + "loss": 0.4138, + "step": 3845 + }, + { + "epoch": 1.0797304884896126, + "grad_norm": 0.6085180640220642, + "learning_rate": 8.079908847089314e-06, + "loss": 0.378, + "step": 3846 + }, + { + "epoch": 1.080011229646266, + "grad_norm": 0.5959956645965576, + "learning_rate": 8.078621991341202e-06, + "loss": 0.3599, + "step": 3847 + }, + { + "epoch": 1.0802919708029197, + "grad_norm": 0.7881020307540894, + "learning_rate": 8.077334807061692e-06, + "loss": 0.4257, + "step": 3848 + }, + { + "epoch": 1.0805727119595732, + "grad_norm": 0.6509604454040527, + "learning_rate": 8.076047294388151e-06, + "loss": 0.381, + "step": 3849 + }, + { + "epoch": 1.0808534531162268, + "grad_norm": 0.6959467530250549, + "learning_rate": 8.074759453457969e-06, + "loss": 0.4064, + "step": 3850 + }, + { + "epoch": 1.0811341942728805, + "grad_norm": 0.5917371511459351, + "learning_rate": 8.07347128440858e-06, + "loss": 0.3978, + "step": 3851 + }, + { + "epoch": 1.081414935429534, + "grad_norm": 0.632373571395874, + "learning_rate": 8.07218278737745e-06, + "loss": 0.3909, + "step": 3852 + }, + { + "epoch": 1.0816956765861876, + "grad_norm": 0.6964919567108154, + "learning_rate": 8.070893962502076e-06, + "loss": 0.4364, + "step": 3853 + }, + { + "epoch": 1.081976417742841, + "grad_norm": 0.7179551720619202, + "learning_rate": 8.069604809919996e-06, + "loss": 0.3781, + "step": 3854 + }, + { + "epoch": 1.0822571588994947, + "grad_norm": 0.6049095988273621, + "learning_rate": 8.068315329768778e-06, + "loss": 0.3661, + "step": 3855 + }, + { + "epoch": 1.0825379000561481, + "grad_norm": 0.6889928579330444, + "learning_rate": 8.067025522186028e-06, + "loss": 0.3932, + "step": 3856 + }, + { + "epoch": 1.0828186412128018, + "grad_norm": 0.6682705283164978, + "learning_rate": 8.065735387309388e-06, + "loss": 0.3719, + "step": 3857 + }, + { + "epoch": 1.0830993823694555, + "grad_norm": 0.7578855156898499, + "learning_rate": 8.064444925276532e-06, + "loss": 0.3824, + "step": 3858 + }, + { + "epoch": 1.083380123526109, + "grad_norm": 0.7042250633239746, + "learning_rate": 8.063154136225172e-06, + "loss": 0.371, + "step": 3859 + }, + { + "epoch": 1.0836608646827626, + "grad_norm": 0.714329719543457, + "learning_rate": 8.061863020293046e-06, + "loss": 0.4127, + "step": 3860 + }, + { + "epoch": 1.083941605839416, + "grad_norm": 0.7102716565132141, + "learning_rate": 8.060571577617944e-06, + "loss": 0.4013, + "step": 3861 + }, + { + "epoch": 1.0842223469960697, + "grad_norm": 0.8229711651802063, + "learning_rate": 8.059279808337672e-06, + "loss": 0.4461, + "step": 3862 + }, + { + "epoch": 1.0845030881527231, + "grad_norm": 0.6872700452804565, + "learning_rate": 8.057987712590085e-06, + "loss": 0.3826, + "step": 3863 + }, + { + "epoch": 1.0847838293093768, + "grad_norm": 0.7488064169883728, + "learning_rate": 8.056695290513066e-06, + "loss": 0.4195, + "step": 3864 + }, + { + "epoch": 1.0850645704660302, + "grad_norm": 0.6129146218299866, + "learning_rate": 8.055402542244533e-06, + "loss": 0.3511, + "step": 3865 + }, + { + "epoch": 1.0853453116226839, + "grad_norm": 0.6663370728492737, + "learning_rate": 8.054109467922441e-06, + "loss": 0.3954, + "step": 3866 + }, + { + "epoch": 1.0856260527793375, + "grad_norm": 0.7446199655532837, + "learning_rate": 8.05281606768478e-06, + "loss": 0.4107, + "step": 3867 + }, + { + "epoch": 1.085906793935991, + "grad_norm": 0.7511297464370728, + "learning_rate": 8.051522341669571e-06, + "loss": 0.4026, + "step": 3868 + }, + { + "epoch": 1.0861875350926447, + "grad_norm": 0.6736665964126587, + "learning_rate": 8.050228290014875e-06, + "loss": 0.3767, + "step": 3869 + }, + { + "epoch": 1.086468276249298, + "grad_norm": 0.6423181295394897, + "learning_rate": 8.048933912858783e-06, + "loss": 0.4045, + "step": 3870 + }, + { + "epoch": 1.0867490174059518, + "grad_norm": 0.7433770895004272, + "learning_rate": 8.047639210339425e-06, + "loss": 0.4022, + "step": 3871 + }, + { + "epoch": 1.0870297585626052, + "grad_norm": 0.7664890289306641, + "learning_rate": 8.046344182594963e-06, + "loss": 0.4393, + "step": 3872 + }, + { + "epoch": 1.0873104997192589, + "grad_norm": 0.7127290964126587, + "learning_rate": 8.045048829763593e-06, + "loss": 0.3717, + "step": 3873 + }, + { + "epoch": 1.0875912408759123, + "grad_norm": 0.6565137505531311, + "learning_rate": 8.043753151983546e-06, + "loss": 0.3741, + "step": 3874 + }, + { + "epoch": 1.087871982032566, + "grad_norm": 0.6661744713783264, + "learning_rate": 8.042457149393092e-06, + "loss": 0.4163, + "step": 3875 + }, + { + "epoch": 1.0881527231892196, + "grad_norm": 0.6414688229560852, + "learning_rate": 8.041160822130532e-06, + "loss": 0.3815, + "step": 3876 + }, + { + "epoch": 1.088433464345873, + "grad_norm": 0.770722508430481, + "learning_rate": 8.0398641703342e-06, + "loss": 0.4112, + "step": 3877 + }, + { + "epoch": 1.0887142055025267, + "grad_norm": 0.6567904949188232, + "learning_rate": 8.038567194142466e-06, + "loss": 0.3748, + "step": 3878 + }, + { + "epoch": 1.0889949466591802, + "grad_norm": 0.738982081413269, + "learning_rate": 8.037269893693738e-06, + "loss": 0.3973, + "step": 3879 + }, + { + "epoch": 1.0892756878158338, + "grad_norm": 0.7579796314239502, + "learning_rate": 8.035972269126456e-06, + "loss": 0.3833, + "step": 3880 + }, + { + "epoch": 1.0895564289724873, + "grad_norm": 0.6975324749946594, + "learning_rate": 8.03467432057909e-06, + "loss": 0.3701, + "step": 3881 + }, + { + "epoch": 1.089837170129141, + "grad_norm": 0.6883982419967651, + "learning_rate": 8.033376048190152e-06, + "loss": 0.3601, + "step": 3882 + }, + { + "epoch": 1.0901179112857946, + "grad_norm": 0.79848712682724, + "learning_rate": 8.032077452098187e-06, + "loss": 0.3952, + "step": 3883 + }, + { + "epoch": 1.090398652442448, + "grad_norm": 0.7479804158210754, + "learning_rate": 8.03077853244177e-06, + "loss": 0.3815, + "step": 3884 + }, + { + "epoch": 1.0906793935991017, + "grad_norm": 0.6892729997634888, + "learning_rate": 8.029479289359517e-06, + "loss": 0.3907, + "step": 3885 + }, + { + "epoch": 1.0909601347557552, + "grad_norm": 0.7771220803260803, + "learning_rate": 8.028179722990073e-06, + "loss": 0.3693, + "step": 3886 + }, + { + "epoch": 1.0912408759124088, + "grad_norm": 0.686834990978241, + "learning_rate": 8.02687983347212e-06, + "loss": 0.3778, + "step": 3887 + }, + { + "epoch": 1.0915216170690623, + "grad_norm": 0.6629605293273926, + "learning_rate": 8.025579620944372e-06, + "loss": 0.3989, + "step": 3888 + }, + { + "epoch": 1.091802358225716, + "grad_norm": 0.6506296396255493, + "learning_rate": 8.024279085545584e-06, + "loss": 0.3804, + "step": 3889 + }, + { + "epoch": 1.0920830993823694, + "grad_norm": 0.6472775936126709, + "learning_rate": 8.022978227414537e-06, + "loss": 0.4016, + "step": 3890 + }, + { + "epoch": 1.092363840539023, + "grad_norm": 0.8023062944412231, + "learning_rate": 8.021677046690055e-06, + "loss": 0.3864, + "step": 3891 + }, + { + "epoch": 1.0926445816956767, + "grad_norm": 0.8447490930557251, + "learning_rate": 8.020375543510986e-06, + "loss": 0.3828, + "step": 3892 + }, + { + "epoch": 1.0929253228523301, + "grad_norm": 0.7679696083068848, + "learning_rate": 8.019073718016223e-06, + "loss": 0.4142, + "step": 3893 + }, + { + "epoch": 1.0932060640089838, + "grad_norm": 0.7488394975662231, + "learning_rate": 8.017771570344687e-06, + "loss": 0.453, + "step": 3894 + }, + { + "epoch": 1.0934868051656372, + "grad_norm": 0.7049294114112854, + "learning_rate": 8.016469100635336e-06, + "loss": 0.341, + "step": 3895 + }, + { + "epoch": 1.093767546322291, + "grad_norm": 0.7560601830482483, + "learning_rate": 8.01516630902716e-06, + "loss": 0.416, + "step": 3896 + }, + { + "epoch": 1.0940482874789443, + "grad_norm": 0.7238718271255493, + "learning_rate": 8.013863195659187e-06, + "loss": 0.3637, + "step": 3897 + }, + { + "epoch": 1.094329028635598, + "grad_norm": 0.7373055219650269, + "learning_rate": 8.012559760670472e-06, + "loss": 0.3995, + "step": 3898 + }, + { + "epoch": 1.0946097697922514, + "grad_norm": 0.66459721326828, + "learning_rate": 8.011256004200118e-06, + "loss": 0.4071, + "step": 3899 + }, + { + "epoch": 1.094890510948905, + "grad_norm": 0.6978521943092346, + "learning_rate": 8.009951926387245e-06, + "loss": 0.3953, + "step": 3900 + }, + { + "epoch": 1.0951712521055588, + "grad_norm": 0.6564455032348633, + "learning_rate": 8.008647527371022e-06, + "loss": 0.4034, + "step": 3901 + }, + { + "epoch": 1.0954519932622122, + "grad_norm": 0.6997036933898926, + "learning_rate": 8.007342807290644e-06, + "loss": 0.3742, + "step": 3902 + }, + { + "epoch": 1.0957327344188659, + "grad_norm": 0.7327374219894409, + "learning_rate": 8.006037766285344e-06, + "loss": 0.3838, + "step": 3903 + }, + { + "epoch": 1.0960134755755193, + "grad_norm": 0.6831018924713135, + "learning_rate": 8.004732404494386e-06, + "loss": 0.3632, + "step": 3904 + }, + { + "epoch": 1.096294216732173, + "grad_norm": 0.7136168479919434, + "learning_rate": 8.003426722057071e-06, + "loss": 0.3916, + "step": 3905 + }, + { + "epoch": 1.0965749578888264, + "grad_norm": 0.6591780185699463, + "learning_rate": 8.002120719112734e-06, + "loss": 0.3646, + "step": 3906 + }, + { + "epoch": 1.09685569904548, + "grad_norm": 0.6764297485351562, + "learning_rate": 8.000814395800742e-06, + "loss": 0.4084, + "step": 3907 + }, + { + "epoch": 1.0971364402021337, + "grad_norm": 0.7040059566497803, + "learning_rate": 7.999507752260499e-06, + "loss": 0.404, + "step": 3908 + }, + { + "epoch": 1.0974171813587872, + "grad_norm": 0.6634208559989929, + "learning_rate": 7.998200788631441e-06, + "loss": 0.3485, + "step": 3909 + }, + { + "epoch": 1.0976979225154408, + "grad_norm": 0.729281485080719, + "learning_rate": 7.99689350505304e-06, + "loss": 0.3859, + "step": 3910 + }, + { + "epoch": 1.0979786636720943, + "grad_norm": 0.7145592570304871, + "learning_rate": 7.9955859016648e-06, + "loss": 0.3836, + "step": 3911 + }, + { + "epoch": 1.098259404828748, + "grad_norm": 0.644367516040802, + "learning_rate": 7.994277978606259e-06, + "loss": 0.3872, + "step": 3912 + }, + { + "epoch": 1.0985401459854014, + "grad_norm": 0.5941146612167358, + "learning_rate": 7.992969736016996e-06, + "loss": 0.4076, + "step": 3913 + }, + { + "epoch": 1.098820887142055, + "grad_norm": 0.6180407404899597, + "learning_rate": 7.99166117403661e-06, + "loss": 0.3769, + "step": 3914 + }, + { + "epoch": 1.0991016282987085, + "grad_norm": 0.7049715518951416, + "learning_rate": 7.990352292804752e-06, + "loss": 0.3769, + "step": 3915 + }, + { + "epoch": 1.0993823694553622, + "grad_norm": 0.7062937617301941, + "learning_rate": 7.989043092461094e-06, + "loss": 0.3958, + "step": 3916 + }, + { + "epoch": 1.0996631106120156, + "grad_norm": 0.6752821803092957, + "learning_rate": 7.98773357314534e-06, + "loss": 0.3994, + "step": 3917 + }, + { + "epoch": 1.0999438517686693, + "grad_norm": 0.6410258412361145, + "learning_rate": 7.986423734997243e-06, + "loss": 0.4132, + "step": 3918 + }, + { + "epoch": 1.100224592925323, + "grad_norm": 0.7151635885238647, + "learning_rate": 7.985113578156573e-06, + "loss": 0.3976, + "step": 3919 + }, + { + "epoch": 1.1005053340819764, + "grad_norm": 0.6709374189376831, + "learning_rate": 7.983803102763146e-06, + "loss": 0.4026, + "step": 3920 + }, + { + "epoch": 1.10078607523863, + "grad_norm": 0.720008909702301, + "learning_rate": 7.982492308956809e-06, + "loss": 0.3594, + "step": 3921 + }, + { + "epoch": 1.1010668163952835, + "grad_norm": 0.6780417561531067, + "learning_rate": 7.981181196877437e-06, + "loss": 0.3823, + "step": 3922 + }, + { + "epoch": 1.1013475575519371, + "grad_norm": 0.74256432056427, + "learning_rate": 7.979869766664947e-06, + "loss": 0.4017, + "step": 3923 + }, + { + "epoch": 1.1016282987085906, + "grad_norm": 0.6556293964385986, + "learning_rate": 7.978558018459288e-06, + "loss": 0.3896, + "step": 3924 + }, + { + "epoch": 1.1019090398652442, + "grad_norm": 0.6396933197975159, + "learning_rate": 7.977245952400436e-06, + "loss": 0.414, + "step": 3925 + }, + { + "epoch": 1.102189781021898, + "grad_norm": 0.702535092830658, + "learning_rate": 7.975933568628413e-06, + "loss": 0.4357, + "step": 3926 + }, + { + "epoch": 1.1024705221785513, + "grad_norm": 0.6938668489456177, + "learning_rate": 7.974620867283267e-06, + "loss": 0.4172, + "step": 3927 + }, + { + "epoch": 1.102751263335205, + "grad_norm": 0.701370358467102, + "learning_rate": 7.973307848505076e-06, + "loss": 0.4292, + "step": 3928 + }, + { + "epoch": 1.1030320044918585, + "grad_norm": 0.665600597858429, + "learning_rate": 7.971994512433965e-06, + "loss": 0.3658, + "step": 3929 + }, + { + "epoch": 1.1033127456485121, + "grad_norm": 0.6574090719223022, + "learning_rate": 7.97068085921008e-06, + "loss": 0.3844, + "step": 3930 + }, + { + "epoch": 1.1035934868051656, + "grad_norm": 0.7160803079605103, + "learning_rate": 7.969366888973606e-06, + "loss": 0.4122, + "step": 3931 + }, + { + "epoch": 1.1038742279618192, + "grad_norm": 0.7250133752822876, + "learning_rate": 7.968052601864764e-06, + "loss": 0.3684, + "step": 3932 + }, + { + "epoch": 1.1041549691184729, + "grad_norm": 0.6573944687843323, + "learning_rate": 7.966737998023804e-06, + "loss": 0.3786, + "step": 3933 + }, + { + "epoch": 1.1044357102751263, + "grad_norm": 0.7087840437889099, + "learning_rate": 7.965423077591016e-06, + "loss": 0.3926, + "step": 3934 + }, + { + "epoch": 1.10471645143178, + "grad_norm": 0.6879131197929382, + "learning_rate": 7.964107840706717e-06, + "loss": 0.4313, + "step": 3935 + }, + { + "epoch": 1.1049971925884334, + "grad_norm": 0.7839788198471069, + "learning_rate": 7.96279228751126e-06, + "loss": 0.3994, + "step": 3936 + }, + { + "epoch": 1.105277933745087, + "grad_norm": 0.7168227434158325, + "learning_rate": 7.961476418145037e-06, + "loss": 0.4207, + "step": 3937 + }, + { + "epoch": 1.1055586749017405, + "grad_norm": 0.885214626789093, + "learning_rate": 7.960160232748466e-06, + "loss": 0.3973, + "step": 3938 + }, + { + "epoch": 1.1058394160583942, + "grad_norm": 0.7359375357627869, + "learning_rate": 7.958843731462003e-06, + "loss": 0.4176, + "step": 3939 + }, + { + "epoch": 1.1061201572150476, + "grad_norm": 0.7108869552612305, + "learning_rate": 7.957526914426137e-06, + "loss": 0.4332, + "step": 3940 + }, + { + "epoch": 1.1064008983717013, + "grad_norm": 0.8117178678512573, + "learning_rate": 7.95620978178139e-06, + "loss": 0.3826, + "step": 3941 + }, + { + "epoch": 1.1066816395283547, + "grad_norm": 0.7513662576675415, + "learning_rate": 7.954892333668318e-06, + "loss": 0.4262, + "step": 3942 + }, + { + "epoch": 1.1069623806850084, + "grad_norm": 0.6816915273666382, + "learning_rate": 7.953574570227512e-06, + "loss": 0.3902, + "step": 3943 + }, + { + "epoch": 1.107243121841662, + "grad_norm": 0.7227581739425659, + "learning_rate": 7.952256491599594e-06, + "loss": 0.3654, + "step": 3944 + }, + { + "epoch": 1.1075238629983155, + "grad_norm": 0.6222494840621948, + "learning_rate": 7.950938097925224e-06, + "loss": 0.341, + "step": 3945 + }, + { + "epoch": 1.1078046041549692, + "grad_norm": 0.742754340171814, + "learning_rate": 7.94961938934509e-06, + "loss": 0.3895, + "step": 3946 + }, + { + "epoch": 1.1080853453116226, + "grad_norm": 0.7121419906616211, + "learning_rate": 7.948300365999917e-06, + "loss": 0.3845, + "step": 3947 + }, + { + "epoch": 1.1083660864682763, + "grad_norm": 0.5612559914588928, + "learning_rate": 7.946981028030463e-06, + "loss": 0.3646, + "step": 3948 + }, + { + "epoch": 1.1086468276249297, + "grad_norm": 0.6668398976325989, + "learning_rate": 7.94566137557752e-06, + "loss": 0.3666, + "step": 3949 + }, + { + "epoch": 1.1089275687815834, + "grad_norm": 0.6178943514823914, + "learning_rate": 7.944341408781914e-06, + "loss": 0.3649, + "step": 3950 + }, + { + "epoch": 1.109208309938237, + "grad_norm": 0.7678118944168091, + "learning_rate": 7.9430211277845e-06, + "loss": 0.3747, + "step": 3951 + }, + { + "epoch": 1.1094890510948905, + "grad_norm": 0.6977105736732483, + "learning_rate": 7.941700532726174e-06, + "loss": 0.3772, + "step": 3952 + }, + { + "epoch": 1.1097697922515442, + "grad_norm": 0.5625650882720947, + "learning_rate": 7.940379623747861e-06, + "loss": 0.3677, + "step": 3953 + }, + { + "epoch": 1.1100505334081976, + "grad_norm": 0.7186122536659241, + "learning_rate": 7.939058400990518e-06, + "loss": 0.393, + "step": 3954 + }, + { + "epoch": 1.1103312745648513, + "grad_norm": 0.6502189040184021, + "learning_rate": 7.93773686459514e-06, + "loss": 0.3892, + "step": 3955 + }, + { + "epoch": 1.1106120157215047, + "grad_norm": 0.7585682272911072, + "learning_rate": 7.936415014702754e-06, + "loss": 0.3548, + "step": 3956 + }, + { + "epoch": 1.1108927568781584, + "grad_norm": 0.7043623924255371, + "learning_rate": 7.935092851454416e-06, + "loss": 0.4114, + "step": 3957 + }, + { + "epoch": 1.1111734980348118, + "grad_norm": 0.6830571889877319, + "learning_rate": 7.933770374991223e-06, + "loss": 0.3904, + "step": 3958 + }, + { + "epoch": 1.1114542391914655, + "grad_norm": 0.6246893405914307, + "learning_rate": 7.932447585454298e-06, + "loss": 0.3611, + "step": 3959 + }, + { + "epoch": 1.1117349803481191, + "grad_norm": 0.6417291164398193, + "learning_rate": 7.931124482984802e-06, + "loss": 0.3709, + "step": 3960 + }, + { + "epoch": 1.1120157215047726, + "grad_norm": 0.7732900381088257, + "learning_rate": 7.92980106772393e-06, + "loss": 0.4169, + "step": 3961 + }, + { + "epoch": 1.1122964626614262, + "grad_norm": 0.6309844851493835, + "learning_rate": 7.928477339812906e-06, + "loss": 0.3871, + "step": 3962 + }, + { + "epoch": 1.1125772038180797, + "grad_norm": 0.7527498602867126, + "learning_rate": 7.927153299392993e-06, + "loss": 0.3913, + "step": 3963 + }, + { + "epoch": 1.1128579449747333, + "grad_norm": 0.7211992740631104, + "learning_rate": 7.925828946605481e-06, + "loss": 0.405, + "step": 3964 + }, + { + "epoch": 1.1131386861313868, + "grad_norm": 0.6866170167922974, + "learning_rate": 7.924504281591698e-06, + "loss": 0.3781, + "step": 3965 + }, + { + "epoch": 1.1134194272880404, + "grad_norm": 0.6364414095878601, + "learning_rate": 7.923179304493005e-06, + "loss": 0.3695, + "step": 3966 + }, + { + "epoch": 1.1137001684446939, + "grad_norm": 0.6579988598823547, + "learning_rate": 7.921854015450794e-06, + "loss": 0.4142, + "step": 3967 + }, + { + "epoch": 1.1139809096013475, + "grad_norm": 0.6545124053955078, + "learning_rate": 7.920528414606495e-06, + "loss": 0.3819, + "step": 3968 + }, + { + "epoch": 1.1142616507580012, + "grad_norm": 0.7061275243759155, + "learning_rate": 7.919202502101562e-06, + "loss": 0.4118, + "step": 3969 + }, + { + "epoch": 1.1145423919146547, + "grad_norm": 0.7414146065711975, + "learning_rate": 7.917876278077493e-06, + "loss": 0.4475, + "step": 3970 + }, + { + "epoch": 1.1148231330713083, + "grad_norm": 0.5749108195304871, + "learning_rate": 7.916549742675812e-06, + "loss": 0.3836, + "step": 3971 + }, + { + "epoch": 1.1151038742279618, + "grad_norm": 0.6774276494979858, + "learning_rate": 7.91522289603808e-06, + "loss": 0.4011, + "step": 3972 + }, + { + "epoch": 1.1153846153846154, + "grad_norm": 0.7002820372581482, + "learning_rate": 7.91389573830589e-06, + "loss": 0.4116, + "step": 3973 + }, + { + "epoch": 1.1156653565412689, + "grad_norm": 0.6322146058082581, + "learning_rate": 7.912568269620864e-06, + "loss": 0.3826, + "step": 3974 + }, + { + "epoch": 1.1159460976979225, + "grad_norm": 0.6247429251670837, + "learning_rate": 7.911240490124667e-06, + "loss": 0.3888, + "step": 3975 + }, + { + "epoch": 1.1162268388545762, + "grad_norm": 0.6144675016403198, + "learning_rate": 7.909912399958986e-06, + "loss": 0.4358, + "step": 3976 + }, + { + "epoch": 1.1165075800112296, + "grad_norm": 0.6474701166152954, + "learning_rate": 7.908583999265552e-06, + "loss": 0.4135, + "step": 3977 + }, + { + "epoch": 1.1167883211678833, + "grad_norm": 0.6898013353347778, + "learning_rate": 7.90725528818612e-06, + "loss": 0.3373, + "step": 3978 + }, + { + "epoch": 1.1170690623245367, + "grad_norm": 0.6261772513389587, + "learning_rate": 7.905926266862483e-06, + "loss": 0.3731, + "step": 3979 + }, + { + "epoch": 1.1173498034811904, + "grad_norm": 0.669147253036499, + "learning_rate": 7.904596935436464e-06, + "loss": 0.3998, + "step": 3980 + }, + { + "epoch": 1.1176305446378438, + "grad_norm": 0.6951127052307129, + "learning_rate": 7.903267294049926e-06, + "loss": 0.4025, + "step": 3981 + }, + { + "epoch": 1.1179112857944975, + "grad_norm": 0.6515661478042603, + "learning_rate": 7.901937342844753e-06, + "loss": 0.4261, + "step": 3982 + }, + { + "epoch": 1.118192026951151, + "grad_norm": 0.644037663936615, + "learning_rate": 7.900607081962875e-06, + "loss": 0.3985, + "step": 3983 + }, + { + "epoch": 1.1184727681078046, + "grad_norm": 0.734929084777832, + "learning_rate": 7.899276511546245e-06, + "loss": 0.3896, + "step": 3984 + }, + { + "epoch": 1.1187535092644583, + "grad_norm": 0.7229781150817871, + "learning_rate": 7.897945631736856e-06, + "loss": 0.3814, + "step": 3985 + }, + { + "epoch": 1.1190342504211117, + "grad_norm": 0.5938378572463989, + "learning_rate": 7.896614442676731e-06, + "loss": 0.4119, + "step": 3986 + }, + { + "epoch": 1.1193149915777654, + "grad_norm": 0.6958502531051636, + "learning_rate": 7.895282944507925e-06, + "loss": 0.4011, + "step": 3987 + }, + { + "epoch": 1.1195957327344188, + "grad_norm": 0.7006041407585144, + "learning_rate": 7.893951137372527e-06, + "loss": 0.3505, + "step": 3988 + }, + { + "epoch": 1.1198764738910725, + "grad_norm": 0.674226701259613, + "learning_rate": 7.892619021412659e-06, + "loss": 0.3948, + "step": 3989 + }, + { + "epoch": 1.120157215047726, + "grad_norm": 0.7255149483680725, + "learning_rate": 7.89128659677048e-06, + "loss": 0.3947, + "step": 3990 + }, + { + "epoch": 1.1204379562043796, + "grad_norm": 0.7130322456359863, + "learning_rate": 7.889953863588173e-06, + "loss": 0.3889, + "step": 3991 + }, + { + "epoch": 1.120718697361033, + "grad_norm": 0.6574380993843079, + "learning_rate": 7.888620822007963e-06, + "loss": 0.4091, + "step": 3992 + }, + { + "epoch": 1.1209994385176867, + "grad_norm": 0.695537269115448, + "learning_rate": 7.8872874721721e-06, + "loss": 0.3811, + "step": 3993 + }, + { + "epoch": 1.1212801796743403, + "grad_norm": 0.6779100894927979, + "learning_rate": 7.885953814222874e-06, + "loss": 0.3854, + "step": 3994 + }, + { + "epoch": 1.1215609208309938, + "grad_norm": 0.7288315892219543, + "learning_rate": 7.884619848302603e-06, + "loss": 0.3784, + "step": 3995 + }, + { + "epoch": 1.1218416619876475, + "grad_norm": 0.6675150990486145, + "learning_rate": 7.883285574553641e-06, + "loss": 0.3877, + "step": 3996 + }, + { + "epoch": 1.122122403144301, + "grad_norm": 0.6860742568969727, + "learning_rate": 7.881950993118372e-06, + "loss": 0.4033, + "step": 3997 + }, + { + "epoch": 1.1224031443009546, + "grad_norm": 0.6912956833839417, + "learning_rate": 7.880616104139214e-06, + "loss": 0.4343, + "step": 3998 + }, + { + "epoch": 1.122683885457608, + "grad_norm": 0.6946637034416199, + "learning_rate": 7.87928090775862e-06, + "loss": 0.4107, + "step": 3999 + }, + { + "epoch": 1.1229646266142617, + "grad_norm": 0.6523827910423279, + "learning_rate": 7.877945404119071e-06, + "loss": 0.4002, + "step": 4000 + }, + { + "epoch": 1.1232453677709153, + "grad_norm": 0.6998991370201111, + "learning_rate": 7.876609593363086e-06, + "loss": 0.431, + "step": 4001 + }, + { + "epoch": 1.1235261089275688, + "grad_norm": 0.6889566779136658, + "learning_rate": 7.875273475633212e-06, + "loss": 0.3927, + "step": 4002 + }, + { + "epoch": 1.1238068500842224, + "grad_norm": 0.6717015504837036, + "learning_rate": 7.873937051072037e-06, + "loss": 0.3751, + "step": 4003 + }, + { + "epoch": 1.1240875912408759, + "grad_norm": 0.6775625348091125, + "learning_rate": 7.872600319822168e-06, + "loss": 0.3783, + "step": 4004 + }, + { + "epoch": 1.1243683323975295, + "grad_norm": 0.7042632102966309, + "learning_rate": 7.871263282026256e-06, + "loss": 0.3949, + "step": 4005 + }, + { + "epoch": 1.124649073554183, + "grad_norm": 0.8770792484283447, + "learning_rate": 7.869925937826984e-06, + "loss": 0.3743, + "step": 4006 + }, + { + "epoch": 1.1249298147108366, + "grad_norm": 0.7265608310699463, + "learning_rate": 7.868588287367062e-06, + "loss": 0.4012, + "step": 4007 + }, + { + "epoch": 1.12521055586749, + "grad_norm": 0.8121118545532227, + "learning_rate": 7.867250330789237e-06, + "loss": 0.4143, + "step": 4008 + }, + { + "epoch": 1.1254912970241437, + "grad_norm": 0.6335089206695557, + "learning_rate": 7.865912068236286e-06, + "loss": 0.3642, + "step": 4009 + }, + { + "epoch": 1.1257720381807972, + "grad_norm": 0.8560290336608887, + "learning_rate": 7.864573499851022e-06, + "loss": 0.37, + "step": 4010 + }, + { + "epoch": 1.1260527793374508, + "grad_norm": 0.6950612664222717, + "learning_rate": 7.863234625776289e-06, + "loss": 0.4, + "step": 4011 + }, + { + "epoch": 1.1263335204941045, + "grad_norm": 0.6369311213493347, + "learning_rate": 7.861895446154959e-06, + "loss": 0.4145, + "step": 4012 + }, + { + "epoch": 1.126614261650758, + "grad_norm": 0.6790870428085327, + "learning_rate": 7.860555961129945e-06, + "loss": 0.3876, + "step": 4013 + }, + { + "epoch": 1.1268950028074116, + "grad_norm": 0.795529842376709, + "learning_rate": 7.859216170844187e-06, + "loss": 0.4086, + "step": 4014 + }, + { + "epoch": 1.127175743964065, + "grad_norm": 0.7263389229774475, + "learning_rate": 7.85787607544066e-06, + "loss": 0.3635, + "step": 4015 + }, + { + "epoch": 1.1274564851207187, + "grad_norm": 0.7352930903434753, + "learning_rate": 7.856535675062371e-06, + "loss": 0.4146, + "step": 4016 + }, + { + "epoch": 1.1277372262773722, + "grad_norm": 0.7245302200317383, + "learning_rate": 7.855194969852358e-06, + "loss": 0.3749, + "step": 4017 + }, + { + "epoch": 1.1280179674340258, + "grad_norm": 0.666125476360321, + "learning_rate": 7.853853959953692e-06, + "loss": 0.3911, + "step": 4018 + }, + { + "epoch": 1.1282987085906795, + "grad_norm": 0.6330158114433289, + "learning_rate": 7.85251264550948e-06, + "loss": 0.4036, + "step": 4019 + }, + { + "epoch": 1.128579449747333, + "grad_norm": 0.7882659435272217, + "learning_rate": 7.851171026662857e-06, + "loss": 0.4063, + "step": 4020 + }, + { + "epoch": 1.1288601909039866, + "grad_norm": 0.7144256234169006, + "learning_rate": 7.849829103556991e-06, + "loss": 0.4033, + "step": 4021 + }, + { + "epoch": 1.12914093206064, + "grad_norm": 0.7497358918190002, + "learning_rate": 7.848486876335086e-06, + "loss": 0.4031, + "step": 4022 + }, + { + "epoch": 1.1294216732172937, + "grad_norm": 0.7111562490463257, + "learning_rate": 7.847144345140375e-06, + "loss": 0.3828, + "step": 4023 + }, + { + "epoch": 1.1297024143739471, + "grad_norm": 0.7431685328483582, + "learning_rate": 7.845801510116124e-06, + "loss": 0.4383, + "step": 4024 + }, + { + "epoch": 1.1299831555306008, + "grad_norm": 0.7841300368309021, + "learning_rate": 7.844458371405634e-06, + "loss": 0.3627, + "step": 4025 + }, + { + "epoch": 1.1302638966872545, + "grad_norm": 0.7278240919113159, + "learning_rate": 7.843114929152234e-06, + "loss": 0.3705, + "step": 4026 + }, + { + "epoch": 1.130544637843908, + "grad_norm": 0.7185124754905701, + "learning_rate": 7.841771183499289e-06, + "loss": 0.4071, + "step": 4027 + }, + { + "epoch": 1.1308253790005616, + "grad_norm": 0.6828190684318542, + "learning_rate": 7.840427134590196e-06, + "loss": 0.3812, + "step": 4028 + }, + { + "epoch": 1.131106120157215, + "grad_norm": 0.7162874937057495, + "learning_rate": 7.839082782568382e-06, + "loss": 0.3628, + "step": 4029 + }, + { + "epoch": 1.1313868613138687, + "grad_norm": 0.7726592421531677, + "learning_rate": 7.837738127577307e-06, + "loss": 0.3761, + "step": 4030 + }, + { + "epoch": 1.1316676024705221, + "grad_norm": 0.7891879081726074, + "learning_rate": 7.836393169760467e-06, + "loss": 0.3373, + "step": 4031 + }, + { + "epoch": 1.1319483436271758, + "grad_norm": 0.6850951910018921, + "learning_rate": 7.835047909261387e-06, + "loss": 0.3865, + "step": 4032 + }, + { + "epoch": 1.1322290847838292, + "grad_norm": 0.7357378005981445, + "learning_rate": 7.833702346223624e-06, + "loss": 0.3946, + "step": 4033 + }, + { + "epoch": 1.1325098259404829, + "grad_norm": 0.761630654335022, + "learning_rate": 7.832356480790767e-06, + "loss": 0.4249, + "step": 4034 + }, + { + "epoch": 1.1327905670971363, + "grad_norm": 0.816126823425293, + "learning_rate": 7.831010313106441e-06, + "loss": 0.4132, + "step": 4035 + }, + { + "epoch": 1.13307130825379, + "grad_norm": 0.7078574895858765, + "learning_rate": 7.829663843314301e-06, + "loss": 0.4246, + "step": 4036 + }, + { + "epoch": 1.1333520494104437, + "grad_norm": 0.6342118978500366, + "learning_rate": 7.828317071558029e-06, + "loss": 0.3885, + "step": 4037 + }, + { + "epoch": 1.133632790567097, + "grad_norm": 0.7254751920700073, + "learning_rate": 7.826969997981349e-06, + "loss": 0.4345, + "step": 4038 + }, + { + "epoch": 1.1339135317237508, + "grad_norm": 0.7696471810340881, + "learning_rate": 7.825622622728008e-06, + "loss": 0.3946, + "step": 4039 + }, + { + "epoch": 1.1341942728804042, + "grad_norm": 0.7007644772529602, + "learning_rate": 7.824274945941794e-06, + "loss": 0.414, + "step": 4040 + }, + { + "epoch": 1.1344750140370579, + "grad_norm": 0.6638544797897339, + "learning_rate": 7.82292696776652e-06, + "loss": 0.4122, + "step": 4041 + }, + { + "epoch": 1.1347557551937113, + "grad_norm": 0.7450946569442749, + "learning_rate": 7.821578688346037e-06, + "loss": 0.3923, + "step": 4042 + }, + { + "epoch": 1.135036496350365, + "grad_norm": 0.7676528096199036, + "learning_rate": 7.82023010782422e-06, + "loss": 0.3575, + "step": 4043 + }, + { + "epoch": 1.1353172375070186, + "grad_norm": 0.6937967538833618, + "learning_rate": 7.818881226344985e-06, + "loss": 0.4193, + "step": 4044 + }, + { + "epoch": 1.135597978663672, + "grad_norm": 0.7304214835166931, + "learning_rate": 7.817532044052275e-06, + "loss": 0.4366, + "step": 4045 + }, + { + "epoch": 1.1358787198203257, + "grad_norm": 0.591670036315918, + "learning_rate": 7.816182561090066e-06, + "loss": 0.3783, + "step": 4046 + }, + { + "epoch": 1.1361594609769792, + "grad_norm": 0.796985924243927, + "learning_rate": 7.814832777602367e-06, + "loss": 0.3986, + "step": 4047 + }, + { + "epoch": 1.1364402021336328, + "grad_norm": 0.7883581519126892, + "learning_rate": 7.81348269373322e-06, + "loss": 0.4015, + "step": 4048 + }, + { + "epoch": 1.1367209432902863, + "grad_norm": 0.7149470448493958, + "learning_rate": 7.812132309626692e-06, + "loss": 0.4174, + "step": 4049 + }, + { + "epoch": 1.13700168444694, + "grad_norm": 0.8815571665763855, + "learning_rate": 7.810781625426893e-06, + "loss": 0.4509, + "step": 4050 + }, + { + "epoch": 1.1372824256035936, + "grad_norm": 0.6563960313796997, + "learning_rate": 7.809430641277959e-06, + "loss": 0.3817, + "step": 4051 + }, + { + "epoch": 1.137563166760247, + "grad_norm": 0.8129245042800903, + "learning_rate": 7.808079357324057e-06, + "loss": 0.4071, + "step": 4052 + }, + { + "epoch": 1.1378439079169007, + "grad_norm": 0.6671165823936462, + "learning_rate": 7.806727773709388e-06, + "loss": 0.403, + "step": 4053 + }, + { + "epoch": 1.1381246490735542, + "grad_norm": 0.5800145864486694, + "learning_rate": 7.805375890578184e-06, + "loss": 0.3579, + "step": 4054 + }, + { + "epoch": 1.1384053902302078, + "grad_norm": 0.7509213089942932, + "learning_rate": 7.804023708074714e-06, + "loss": 0.3987, + "step": 4055 + }, + { + "epoch": 1.1386861313868613, + "grad_norm": 0.6746266484260559, + "learning_rate": 7.802671226343266e-06, + "loss": 0.4248, + "step": 4056 + }, + { + "epoch": 1.138966872543515, + "grad_norm": 0.7355132102966309, + "learning_rate": 7.801318445528177e-06, + "loss": 0.43, + "step": 4057 + }, + { + "epoch": 1.1392476137001684, + "grad_norm": 0.6688141822814941, + "learning_rate": 7.799965365773803e-06, + "loss": 0.3977, + "step": 4058 + }, + { + "epoch": 1.139528354856822, + "grad_norm": 0.7025774717330933, + "learning_rate": 7.798611987224535e-06, + "loss": 0.4038, + "step": 4059 + }, + { + "epoch": 1.1398090960134755, + "grad_norm": 0.6520720720291138, + "learning_rate": 7.797258310024802e-06, + "loss": 0.3624, + "step": 4060 + }, + { + "epoch": 1.1400898371701291, + "grad_norm": 0.6350058913230896, + "learning_rate": 7.795904334319056e-06, + "loss": 0.3818, + "step": 4061 + }, + { + "epoch": 1.1403705783267828, + "grad_norm": 0.6513971090316772, + "learning_rate": 7.794550060251786e-06, + "loss": 0.4089, + "step": 4062 + }, + { + "epoch": 1.1406513194834362, + "grad_norm": 0.7588690519332886, + "learning_rate": 7.79319548796751e-06, + "loss": 0.3827, + "step": 4063 + }, + { + "epoch": 1.14093206064009, + "grad_norm": 0.763512134552002, + "learning_rate": 7.791840617610784e-06, + "loss": 0.4002, + "step": 4064 + }, + { + "epoch": 1.1412128017967433, + "grad_norm": 0.7148080468177795, + "learning_rate": 7.790485449326188e-06, + "loss": 0.3573, + "step": 4065 + }, + { + "epoch": 1.141493542953397, + "grad_norm": 0.7314262986183167, + "learning_rate": 7.789129983258336e-06, + "loss": 0.3869, + "step": 4066 + }, + { + "epoch": 1.1417742841100504, + "grad_norm": 0.6132856011390686, + "learning_rate": 7.787774219551878e-06, + "loss": 0.3484, + "step": 4067 + }, + { + "epoch": 1.142055025266704, + "grad_norm": 0.7298488616943359, + "learning_rate": 7.786418158351491e-06, + "loss": 0.3768, + "step": 4068 + }, + { + "epoch": 1.1423357664233578, + "grad_norm": 0.6746385097503662, + "learning_rate": 7.785061799801888e-06, + "loss": 0.3538, + "step": 4069 + }, + { + "epoch": 1.1426165075800112, + "grad_norm": 0.6759898066520691, + "learning_rate": 7.783705144047805e-06, + "loss": 0.3556, + "step": 4070 + }, + { + "epoch": 1.1428972487366649, + "grad_norm": 0.6515828371047974, + "learning_rate": 7.782348191234022e-06, + "loss": 0.4138, + "step": 4071 + }, + { + "epoch": 1.1431779898933183, + "grad_norm": 0.6762593984603882, + "learning_rate": 7.780990941505342e-06, + "loss": 0.3785, + "step": 4072 + }, + { + "epoch": 1.143458731049972, + "grad_norm": 0.6707088351249695, + "learning_rate": 7.779633395006603e-06, + "loss": 0.4018, + "step": 4073 + }, + { + "epoch": 1.1437394722066254, + "grad_norm": 0.7524548172950745, + "learning_rate": 7.778275551882673e-06, + "loss": 0.3949, + "step": 4074 + }, + { + "epoch": 1.144020213363279, + "grad_norm": 0.7122347950935364, + "learning_rate": 7.776917412278454e-06, + "loss": 0.3791, + "step": 4075 + }, + { + "epoch": 1.1443009545199327, + "grad_norm": 0.762192964553833, + "learning_rate": 7.775558976338875e-06, + "loss": 0.4066, + "step": 4076 + }, + { + "epoch": 1.1445816956765862, + "grad_norm": 0.6764295101165771, + "learning_rate": 7.774200244208903e-06, + "loss": 0.3711, + "step": 4077 + }, + { + "epoch": 1.1448624368332398, + "grad_norm": 0.7432030439376831, + "learning_rate": 7.772841216033534e-06, + "loss": 0.3548, + "step": 4078 + }, + { + "epoch": 1.1451431779898933, + "grad_norm": 0.6911014318466187, + "learning_rate": 7.771481891957792e-06, + "loss": 0.4129, + "step": 4079 + }, + { + "epoch": 1.145423919146547, + "grad_norm": 0.74835205078125, + "learning_rate": 7.770122272126738e-06, + "loss": 0.3853, + "step": 4080 + }, + { + "epoch": 1.1457046603032004, + "grad_norm": 0.7849177122116089, + "learning_rate": 7.76876235668546e-06, + "loss": 0.3847, + "step": 4081 + }, + { + "epoch": 1.145985401459854, + "grad_norm": 0.7110073566436768, + "learning_rate": 7.767402145779083e-06, + "loss": 0.3946, + "step": 4082 + }, + { + "epoch": 1.1462661426165075, + "grad_norm": 0.7056792378425598, + "learning_rate": 7.766041639552757e-06, + "loss": 0.4053, + "step": 4083 + }, + { + "epoch": 1.1465468837731612, + "grad_norm": 0.7177689075469971, + "learning_rate": 7.764680838151669e-06, + "loss": 0.4309, + "step": 4084 + }, + { + "epoch": 1.1468276249298146, + "grad_norm": 0.6438093185424805, + "learning_rate": 7.763319741721034e-06, + "loss": 0.4055, + "step": 4085 + }, + { + "epoch": 1.1471083660864683, + "grad_norm": 0.731724739074707, + "learning_rate": 7.7619583504061e-06, + "loss": 0.4371, + "step": 4086 + }, + { + "epoch": 1.147389107243122, + "grad_norm": 0.6364150047302246, + "learning_rate": 7.760596664352148e-06, + "loss": 0.403, + "step": 4087 + }, + { + "epoch": 1.1476698483997754, + "grad_norm": 0.6307037472724915, + "learning_rate": 7.759234683704485e-06, + "loss": 0.3921, + "step": 4088 + }, + { + "epoch": 1.147950589556429, + "grad_norm": 0.7023909091949463, + "learning_rate": 7.757872408608456e-06, + "loss": 0.3825, + "step": 4089 + }, + { + "epoch": 1.1482313307130825, + "grad_norm": 0.6668989658355713, + "learning_rate": 7.756509839209431e-06, + "loss": 0.3466, + "step": 4090 + }, + { + "epoch": 1.1485120718697361, + "grad_norm": 0.610808789730072, + "learning_rate": 7.75514697565282e-06, + "loss": 0.3692, + "step": 4091 + }, + { + "epoch": 1.1487928130263896, + "grad_norm": 0.64811772108078, + "learning_rate": 7.753783818084057e-06, + "loss": 0.3854, + "step": 4092 + }, + { + "epoch": 1.1490735541830432, + "grad_norm": 0.6047309041023254, + "learning_rate": 7.75242036664861e-06, + "loss": 0.387, + "step": 4093 + }, + { + "epoch": 1.149354295339697, + "grad_norm": 0.6245483756065369, + "learning_rate": 7.751056621491977e-06, + "loss": 0.3935, + "step": 4094 + }, + { + "epoch": 1.1496350364963503, + "grad_norm": 0.6940885782241821, + "learning_rate": 7.749692582759689e-06, + "loss": 0.3924, + "step": 4095 + }, + { + "epoch": 1.149915777653004, + "grad_norm": 0.7416813969612122, + "learning_rate": 7.748328250597308e-06, + "loss": 0.4011, + "step": 4096 + }, + { + "epoch": 1.1501965188096575, + "grad_norm": 0.7112900018692017, + "learning_rate": 7.746963625150425e-06, + "loss": 0.3809, + "step": 4097 + }, + { + "epoch": 1.1504772599663111, + "grad_norm": 0.6424203515052795, + "learning_rate": 7.745598706564668e-06, + "loss": 0.3746, + "step": 4098 + }, + { + "epoch": 1.1507580011229646, + "grad_norm": 0.7422674894332886, + "learning_rate": 7.744233494985691e-06, + "loss": 0.4076, + "step": 4099 + }, + { + "epoch": 1.1510387422796182, + "grad_norm": 0.6848623156547546, + "learning_rate": 7.74286799055918e-06, + "loss": 0.4129, + "step": 4100 + }, + { + "epoch": 1.1513194834362717, + "grad_norm": 0.7590399980545044, + "learning_rate": 7.741502193430854e-06, + "loss": 0.4041, + "step": 4101 + }, + { + "epoch": 1.1516002245929253, + "grad_norm": 0.6886131763458252, + "learning_rate": 7.740136103746463e-06, + "loss": 0.417, + "step": 4102 + }, + { + "epoch": 1.1518809657495788, + "grad_norm": 0.7533678412437439, + "learning_rate": 7.738769721651784e-06, + "loss": 0.4097, + "step": 4103 + }, + { + "epoch": 1.1521617069062324, + "grad_norm": 0.7325572967529297, + "learning_rate": 7.737403047292634e-06, + "loss": 0.4018, + "step": 4104 + }, + { + "epoch": 1.152442448062886, + "grad_norm": 0.665576696395874, + "learning_rate": 7.736036080814853e-06, + "loss": 0.371, + "step": 4105 + }, + { + "epoch": 1.1527231892195395, + "grad_norm": 0.6930864453315735, + "learning_rate": 7.734668822364315e-06, + "loss": 0.3908, + "step": 4106 + }, + { + "epoch": 1.1530039303761932, + "grad_norm": 0.6323667764663696, + "learning_rate": 7.733301272086929e-06, + "loss": 0.3888, + "step": 4107 + }, + { + "epoch": 1.1532846715328466, + "grad_norm": 0.6767651438713074, + "learning_rate": 7.731933430128624e-06, + "loss": 0.3917, + "step": 4108 + }, + { + "epoch": 1.1535654126895003, + "grad_norm": 0.8144638538360596, + "learning_rate": 7.730565296635376e-06, + "loss": 0.3984, + "step": 4109 + }, + { + "epoch": 1.1538461538461537, + "grad_norm": 0.6930384635925293, + "learning_rate": 7.729196871753178e-06, + "loss": 0.3868, + "step": 4110 + }, + { + "epoch": 1.1541268950028074, + "grad_norm": 0.6677954792976379, + "learning_rate": 7.727828155628063e-06, + "loss": 0.3692, + "step": 4111 + }, + { + "epoch": 1.154407636159461, + "grad_norm": 0.6747219562530518, + "learning_rate": 7.726459148406089e-06, + "loss": 0.4066, + "step": 4112 + }, + { + "epoch": 1.1546883773161145, + "grad_norm": 0.7844739556312561, + "learning_rate": 7.72508985023335e-06, + "loss": 0.3898, + "step": 4113 + }, + { + "epoch": 1.1549691184727682, + "grad_norm": 0.7765569686889648, + "learning_rate": 7.723720261255967e-06, + "loss": 0.4141, + "step": 4114 + }, + { + "epoch": 1.1552498596294216, + "grad_norm": 0.6895026564598083, + "learning_rate": 7.722350381620099e-06, + "loss": 0.4056, + "step": 4115 + }, + { + "epoch": 1.1555306007860753, + "grad_norm": 0.7399360537528992, + "learning_rate": 7.720980211471922e-06, + "loss": 0.4075, + "step": 4116 + }, + { + "epoch": 1.1558113419427287, + "grad_norm": 0.7794618010520935, + "learning_rate": 7.719609750957662e-06, + "loss": 0.4062, + "step": 4117 + }, + { + "epoch": 1.1560920830993824, + "grad_norm": 0.7110264301300049, + "learning_rate": 7.71823900022356e-06, + "loss": 0.3875, + "step": 4118 + }, + { + "epoch": 1.156372824256036, + "grad_norm": 0.7261342406272888, + "learning_rate": 7.716867959415895e-06, + "loss": 0.4133, + "step": 4119 + }, + { + "epoch": 1.1566535654126895, + "grad_norm": 0.711584210395813, + "learning_rate": 7.715496628680977e-06, + "loss": 0.3735, + "step": 4120 + }, + { + "epoch": 1.1569343065693432, + "grad_norm": 0.6330934166908264, + "learning_rate": 7.714125008165146e-06, + "loss": 0.3581, + "step": 4121 + }, + { + "epoch": 1.1572150477259966, + "grad_norm": 0.6390913724899292, + "learning_rate": 7.712753098014771e-06, + "loss": 0.377, + "step": 4122 + }, + { + "epoch": 1.1574957888826503, + "grad_norm": 0.6860611438751221, + "learning_rate": 7.711380898376257e-06, + "loss": 0.3909, + "step": 4123 + }, + { + "epoch": 1.1577765300393037, + "grad_norm": 0.7579001784324646, + "learning_rate": 7.710008409396032e-06, + "loss": 0.4041, + "step": 4124 + }, + { + "epoch": 1.1580572711959574, + "grad_norm": 0.6396341919898987, + "learning_rate": 7.708635631220564e-06, + "loss": 0.3445, + "step": 4125 + }, + { + "epoch": 1.1583380123526108, + "grad_norm": 0.6361292600631714, + "learning_rate": 7.707262563996343e-06, + "loss": 0.3805, + "step": 4126 + }, + { + "epoch": 1.1586187535092645, + "grad_norm": 0.821531355381012, + "learning_rate": 7.705889207869898e-06, + "loss": 0.4316, + "step": 4127 + }, + { + "epoch": 1.158899494665918, + "grad_norm": 0.685020923614502, + "learning_rate": 7.704515562987784e-06, + "loss": 0.4029, + "step": 4128 + }, + { + "epoch": 1.1591802358225716, + "grad_norm": 0.601153552532196, + "learning_rate": 7.703141629496587e-06, + "loss": 0.3705, + "step": 4129 + }, + { + "epoch": 1.1594609769792252, + "grad_norm": 0.8576675653457642, + "learning_rate": 7.701767407542924e-06, + "loss": 0.441, + "step": 4130 + }, + { + "epoch": 1.1597417181358787, + "grad_norm": 0.7202056050300598, + "learning_rate": 7.700392897273446e-06, + "loss": 0.3824, + "step": 4131 + }, + { + "epoch": 1.1600224592925323, + "grad_norm": 0.8050165176391602, + "learning_rate": 7.699018098834828e-06, + "loss": 0.3553, + "step": 4132 + }, + { + "epoch": 1.1603032004491858, + "grad_norm": 0.6949837803840637, + "learning_rate": 7.697643012373786e-06, + "loss": 0.4272, + "step": 4133 + }, + { + "epoch": 1.1605839416058394, + "grad_norm": 0.6781432628631592, + "learning_rate": 7.696267638037055e-06, + "loss": 0.4283, + "step": 4134 + }, + { + "epoch": 1.1608646827624929, + "grad_norm": 0.8500373363494873, + "learning_rate": 7.69489197597141e-06, + "loss": 0.4156, + "step": 4135 + }, + { + "epoch": 1.1611454239191465, + "grad_norm": 0.7929197549819946, + "learning_rate": 7.69351602632365e-06, + "loss": 0.4049, + "step": 4136 + }, + { + "epoch": 1.1614261650758002, + "grad_norm": 0.6842719912528992, + "learning_rate": 7.692139789240611e-06, + "loss": 0.4341, + "step": 4137 + }, + { + "epoch": 1.1617069062324537, + "grad_norm": 0.8185324668884277, + "learning_rate": 7.690763264869154e-06, + "loss": 0.403, + "step": 4138 + }, + { + "epoch": 1.1619876473891073, + "grad_norm": 0.6700324416160583, + "learning_rate": 7.689386453356175e-06, + "loss": 0.3571, + "step": 4139 + }, + { + "epoch": 1.1622683885457608, + "grad_norm": 0.6507755517959595, + "learning_rate": 7.6880093548486e-06, + "loss": 0.3547, + "step": 4140 + }, + { + "epoch": 1.1625491297024144, + "grad_norm": 0.8207817673683167, + "learning_rate": 7.68663196949338e-06, + "loss": 0.3834, + "step": 4141 + }, + { + "epoch": 1.1628298708590679, + "grad_norm": 0.7874570488929749, + "learning_rate": 7.685254297437501e-06, + "loss": 0.4405, + "step": 4142 + }, + { + "epoch": 1.1631106120157215, + "grad_norm": 0.7565908432006836, + "learning_rate": 7.683876338827984e-06, + "loss": 0.3889, + "step": 4143 + }, + { + "epoch": 1.1633913531723752, + "grad_norm": 0.7770771384239197, + "learning_rate": 7.682498093811875e-06, + "loss": 0.4114, + "step": 4144 + }, + { + "epoch": 1.1636720943290286, + "grad_norm": 0.743602991104126, + "learning_rate": 7.68111956253625e-06, + "loss": 0.4182, + "step": 4145 + }, + { + "epoch": 1.1639528354856823, + "grad_norm": 0.7257493734359741, + "learning_rate": 7.679740745148216e-06, + "loss": 0.3871, + "step": 4146 + }, + { + "epoch": 1.1642335766423357, + "grad_norm": 0.7658679485321045, + "learning_rate": 7.678361641794917e-06, + "loss": 0.4128, + "step": 4147 + }, + { + "epoch": 1.1645143177989894, + "grad_norm": 0.7146362662315369, + "learning_rate": 7.676982252623518e-06, + "loss": 0.3783, + "step": 4148 + }, + { + "epoch": 1.1647950589556428, + "grad_norm": 0.7754902243614197, + "learning_rate": 7.675602577781221e-06, + "loss": 0.4098, + "step": 4149 + }, + { + "epoch": 1.1650758001122965, + "grad_norm": 0.6947715878486633, + "learning_rate": 7.674222617415255e-06, + "loss": 0.3951, + "step": 4150 + }, + { + "epoch": 1.16535654126895, + "grad_norm": 0.6923962235450745, + "learning_rate": 7.672842371672879e-06, + "loss": 0.3917, + "step": 4151 + }, + { + "epoch": 1.1656372824256036, + "grad_norm": 0.724932074546814, + "learning_rate": 7.671461840701389e-06, + "loss": 0.411, + "step": 4152 + }, + { + "epoch": 1.165918023582257, + "grad_norm": 0.7778397798538208, + "learning_rate": 7.670081024648102e-06, + "loss": 0.3858, + "step": 4153 + }, + { + "epoch": 1.1661987647389107, + "grad_norm": 0.6858928799629211, + "learning_rate": 7.668699923660373e-06, + "loss": 0.3671, + "step": 4154 + }, + { + "epoch": 1.1664795058955644, + "grad_norm": 0.730816662311554, + "learning_rate": 7.667318537885586e-06, + "loss": 0.4002, + "step": 4155 + }, + { + "epoch": 1.1667602470522178, + "grad_norm": 0.6910610795021057, + "learning_rate": 7.665936867471148e-06, + "loss": 0.3881, + "step": 4156 + }, + { + "epoch": 1.1670409882088715, + "grad_norm": 0.6639518737792969, + "learning_rate": 7.664554912564509e-06, + "loss": 0.3988, + "step": 4157 + }, + { + "epoch": 1.167321729365525, + "grad_norm": 0.7089428305625916, + "learning_rate": 7.663172673313137e-06, + "loss": 0.4119, + "step": 4158 + }, + { + "epoch": 1.1676024705221786, + "grad_norm": 0.6947149634361267, + "learning_rate": 7.66179014986454e-06, + "loss": 0.4202, + "step": 4159 + }, + { + "epoch": 1.167883211678832, + "grad_norm": 0.7748801112174988, + "learning_rate": 7.66040734236625e-06, + "loss": 0.3732, + "step": 4160 + }, + { + "epoch": 1.1681639528354857, + "grad_norm": 0.7255113124847412, + "learning_rate": 7.659024250965833e-06, + "loss": 0.3776, + "step": 4161 + }, + { + "epoch": 1.1684446939921393, + "grad_norm": 0.5578462481498718, + "learning_rate": 7.657640875810884e-06, + "loss": 0.362, + "step": 4162 + }, + { + "epoch": 1.1687254351487928, + "grad_norm": 0.7239642143249512, + "learning_rate": 7.656257217049025e-06, + "loss": 0.3805, + "step": 4163 + }, + { + "epoch": 1.1690061763054465, + "grad_norm": 0.6771806478500366, + "learning_rate": 7.654873274827915e-06, + "loss": 0.393, + "step": 4164 + }, + { + "epoch": 1.1692869174621, + "grad_norm": 0.7037496566772461, + "learning_rate": 7.65348904929524e-06, + "loss": 0.3499, + "step": 4165 + }, + { + "epoch": 1.1695676586187536, + "grad_norm": 0.6214662790298462, + "learning_rate": 7.652104540598712e-06, + "loss": 0.3838, + "step": 4166 + }, + { + "epoch": 1.169848399775407, + "grad_norm": 0.7172297239303589, + "learning_rate": 7.650719748886082e-06, + "loss": 0.3517, + "step": 4167 + }, + { + "epoch": 1.1701291409320607, + "grad_norm": 0.6152862906455994, + "learning_rate": 7.649334674305124e-06, + "loss": 0.3734, + "step": 4168 + }, + { + "epoch": 1.1704098820887143, + "grad_norm": 0.7730669379234314, + "learning_rate": 7.647949317003645e-06, + "loss": 0.3872, + "step": 4169 + }, + { + "epoch": 1.1706906232453678, + "grad_norm": 0.6491602659225464, + "learning_rate": 7.64656367712948e-06, + "loss": 0.3701, + "step": 4170 + }, + { + "epoch": 1.1709713644020214, + "grad_norm": 0.7138637900352478, + "learning_rate": 7.645177754830497e-06, + "loss": 0.3791, + "step": 4171 + }, + { + "epoch": 1.1712521055586749, + "grad_norm": 0.701231062412262, + "learning_rate": 7.643791550254595e-06, + "loss": 0.3742, + "step": 4172 + }, + { + "epoch": 1.1715328467153285, + "grad_norm": 0.714582622051239, + "learning_rate": 7.6424050635497e-06, + "loss": 0.409, + "step": 4173 + }, + { + "epoch": 1.171813587871982, + "grad_norm": 0.746593177318573, + "learning_rate": 7.641018294863768e-06, + "loss": 0.4367, + "step": 4174 + }, + { + "epoch": 1.1720943290286356, + "grad_norm": 0.7003906965255737, + "learning_rate": 7.639631244344786e-06, + "loss": 0.3661, + "step": 4175 + }, + { + "epoch": 1.172375070185289, + "grad_norm": 0.7029473185539246, + "learning_rate": 7.638243912140772e-06, + "loss": 0.3724, + "step": 4176 + }, + { + "epoch": 1.1726558113419427, + "grad_norm": 0.6909000873565674, + "learning_rate": 7.636856298399774e-06, + "loss": 0.3918, + "step": 4177 + }, + { + "epoch": 1.1729365524985962, + "grad_norm": 0.6882992386817932, + "learning_rate": 7.635468403269871e-06, + "loss": 0.381, + "step": 4178 + }, + { + "epoch": 1.1732172936552498, + "grad_norm": 0.7175175547599792, + "learning_rate": 7.634080226899168e-06, + "loss": 0.3519, + "step": 4179 + }, + { + "epoch": 1.1734980348119035, + "grad_norm": 0.6818076372146606, + "learning_rate": 7.632691769435803e-06, + "loss": 0.4044, + "step": 4180 + }, + { + "epoch": 1.173778775968557, + "grad_norm": 0.6545034050941467, + "learning_rate": 7.631303031027944e-06, + "loss": 0.4161, + "step": 4181 + }, + { + "epoch": 1.1740595171252106, + "grad_norm": 0.6610690951347351, + "learning_rate": 7.629914011823788e-06, + "loss": 0.4019, + "step": 4182 + }, + { + "epoch": 1.174340258281864, + "grad_norm": 0.6658121347427368, + "learning_rate": 7.628524711971566e-06, + "loss": 0.3804, + "step": 4183 + }, + { + "epoch": 1.1746209994385177, + "grad_norm": 0.7791524529457092, + "learning_rate": 7.62713513161953e-06, + "loss": 0.4162, + "step": 4184 + }, + { + "epoch": 1.1749017405951712, + "grad_norm": 1.1453559398651123, + "learning_rate": 7.625745270915969e-06, + "loss": 0.4044, + "step": 4185 + }, + { + "epoch": 1.1751824817518248, + "grad_norm": 0.6825242638587952, + "learning_rate": 7.624355130009202e-06, + "loss": 0.4257, + "step": 4186 + }, + { + "epoch": 1.1754632229084785, + "grad_norm": 0.7512766122817993, + "learning_rate": 7.622964709047576e-06, + "loss": 0.3708, + "step": 4187 + }, + { + "epoch": 1.175743964065132, + "grad_norm": 0.7023360729217529, + "learning_rate": 7.6215740081794665e-06, + "loss": 0.4249, + "step": 4188 + }, + { + "epoch": 1.1760247052217856, + "grad_norm": 0.753903865814209, + "learning_rate": 7.620183027553283e-06, + "loss": 0.3943, + "step": 4189 + }, + { + "epoch": 1.176305446378439, + "grad_norm": 0.5684053301811218, + "learning_rate": 7.61879176731746e-06, + "loss": 0.4126, + "step": 4190 + }, + { + "epoch": 1.1765861875350927, + "grad_norm": 0.7779684662818909, + "learning_rate": 7.617400227620463e-06, + "loss": 0.4077, + "step": 4191 + }, + { + "epoch": 1.1768669286917461, + "grad_norm": 0.6655716896057129, + "learning_rate": 7.616008408610791e-06, + "loss": 0.3712, + "step": 4192 + }, + { + "epoch": 1.1771476698483998, + "grad_norm": 0.7407196164131165, + "learning_rate": 7.614616310436971e-06, + "loss": 0.4351, + "step": 4193 + }, + { + "epoch": 1.1774284110050532, + "grad_norm": 0.7647140026092529, + "learning_rate": 7.613223933247555e-06, + "loss": 0.4277, + "step": 4194 + }, + { + "epoch": 1.177709152161707, + "grad_norm": 0.6379993557929993, + "learning_rate": 7.6118312771911325e-06, + "loss": 0.4016, + "step": 4195 + }, + { + "epoch": 1.1779898933183603, + "grad_norm": 0.6983136534690857, + "learning_rate": 7.61043834241632e-06, + "loss": 0.3982, + "step": 4196 + }, + { + "epoch": 1.178270634475014, + "grad_norm": 0.7108660936355591, + "learning_rate": 7.609045129071759e-06, + "loss": 0.3951, + "step": 4197 + }, + { + "epoch": 1.1785513756316677, + "grad_norm": 0.6539588570594788, + "learning_rate": 7.607651637306126e-06, + "loss": 0.3821, + "step": 4198 + }, + { + "epoch": 1.1788321167883211, + "grad_norm": 0.7670096755027771, + "learning_rate": 7.6062578672681275e-06, + "loss": 0.3963, + "step": 4199 + }, + { + "epoch": 1.1791128579449748, + "grad_norm": 0.666587233543396, + "learning_rate": 7.604863819106496e-06, + "loss": 0.4053, + "step": 4200 + }, + { + "epoch": 1.1793935991016282, + "grad_norm": 0.7293751239776611, + "learning_rate": 7.603469492969997e-06, + "loss": 0.442, + "step": 4201 + }, + { + "epoch": 1.1796743402582819, + "grad_norm": 0.8012384176254272, + "learning_rate": 7.602074889007423e-06, + "loss": 0.4257, + "step": 4202 + }, + { + "epoch": 1.1799550814149353, + "grad_norm": 0.6272772550582886, + "learning_rate": 7.600680007367598e-06, + "loss": 0.3746, + "step": 4203 + }, + { + "epoch": 1.180235822571589, + "grad_norm": 0.6824804544448853, + "learning_rate": 7.599284848199375e-06, + "loss": 0.374, + "step": 4204 + }, + { + "epoch": 1.1805165637282427, + "grad_norm": 0.6905078291893005, + "learning_rate": 7.597889411651636e-06, + "loss": 0.3836, + "step": 4205 + }, + { + "epoch": 1.180797304884896, + "grad_norm": 0.6570492386817932, + "learning_rate": 7.596493697873295e-06, + "loss": 0.3559, + "step": 4206 + }, + { + "epoch": 1.1810780460415498, + "grad_norm": 0.783523440361023, + "learning_rate": 7.595097707013295e-06, + "loss": 0.447, + "step": 4207 + }, + { + "epoch": 1.1813587871982032, + "grad_norm": 0.6097093820571899, + "learning_rate": 7.593701439220602e-06, + "loss": 0.3731, + "step": 4208 + }, + { + "epoch": 1.1816395283548569, + "grad_norm": 0.7884918451309204, + "learning_rate": 7.592304894644223e-06, + "loss": 0.4275, + "step": 4209 + }, + { + "epoch": 1.1819202695115103, + "grad_norm": 0.6653420329093933, + "learning_rate": 7.5909080734331875e-06, + "loss": 0.4094, + "step": 4210 + }, + { + "epoch": 1.182201010668164, + "grad_norm": 0.698269248008728, + "learning_rate": 7.5895109757365515e-06, + "loss": 0.4093, + "step": 4211 + }, + { + "epoch": 1.1824817518248176, + "grad_norm": 0.6784840822219849, + "learning_rate": 7.588113601703408e-06, + "loss": 0.4136, + "step": 4212 + }, + { + "epoch": 1.182762492981471, + "grad_norm": 0.7869148254394531, + "learning_rate": 7.5867159514828745e-06, + "loss": 0.397, + "step": 4213 + }, + { + "epoch": 1.1830432341381247, + "grad_norm": 0.7723643183708191, + "learning_rate": 7.585318025224102e-06, + "loss": 0.3898, + "step": 4214 + }, + { + "epoch": 1.1833239752947782, + "grad_norm": 0.6724240183830261, + "learning_rate": 7.583919823076267e-06, + "loss": 0.4115, + "step": 4215 + }, + { + "epoch": 1.1836047164514318, + "grad_norm": 0.5880551338195801, + "learning_rate": 7.582521345188576e-06, + "loss": 0.3874, + "step": 4216 + }, + { + "epoch": 1.1838854576080853, + "grad_norm": 0.6574677228927612, + "learning_rate": 7.581122591710266e-06, + "loss": 0.4159, + "step": 4217 + }, + { + "epoch": 1.184166198764739, + "grad_norm": 0.6764013171195984, + "learning_rate": 7.579723562790604e-06, + "loss": 0.4091, + "step": 4218 + }, + { + "epoch": 1.1844469399213924, + "grad_norm": 0.7931365966796875, + "learning_rate": 7.5783242585788865e-06, + "loss": 0.3928, + "step": 4219 + }, + { + "epoch": 1.184727681078046, + "grad_norm": 0.6253363490104675, + "learning_rate": 7.576924679224438e-06, + "loss": 0.3876, + "step": 4220 + }, + { + "epoch": 1.1850084222346995, + "grad_norm": 0.6436002254486084, + "learning_rate": 7.575524824876612e-06, + "loss": 0.3787, + "step": 4221 + }, + { + "epoch": 1.1852891633913532, + "grad_norm": 0.6307581067085266, + "learning_rate": 7.574124695684793e-06, + "loss": 0.3809, + "step": 4222 + }, + { + "epoch": 1.1855699045480068, + "grad_norm": 0.6719784140586853, + "learning_rate": 7.572724291798394e-06, + "loss": 0.3722, + "step": 4223 + }, + { + "epoch": 1.1858506457046603, + "grad_norm": 0.6689779758453369, + "learning_rate": 7.5713236133668566e-06, + "loss": 0.3644, + "step": 4224 + }, + { + "epoch": 1.186131386861314, + "grad_norm": 0.6395378708839417, + "learning_rate": 7.569922660539654e-06, + "loss": 0.3984, + "step": 4225 + }, + { + "epoch": 1.1864121280179674, + "grad_norm": 0.6845116019248962, + "learning_rate": 7.568521433466285e-06, + "loss": 0.3787, + "step": 4226 + }, + { + "epoch": 1.186692869174621, + "grad_norm": 0.8106564283370972, + "learning_rate": 7.567119932296283e-06, + "loss": 0.426, + "step": 4227 + }, + { + "epoch": 1.1869736103312745, + "grad_norm": 0.6468884944915771, + "learning_rate": 7.565718157179205e-06, + "loss": 0.4067, + "step": 4228 + }, + { + "epoch": 1.1872543514879281, + "grad_norm": 0.6525301337242126, + "learning_rate": 7.56431610826464e-06, + "loss": 0.3647, + "step": 4229 + }, + { + "epoch": 1.1875350926445818, + "grad_norm": 0.7215001583099365, + "learning_rate": 7.562913785702208e-06, + "loss": 0.4292, + "step": 4230 + }, + { + "epoch": 1.1878158338012352, + "grad_norm": 0.7083243131637573, + "learning_rate": 7.5615111896415506e-06, + "loss": 0.3921, + "step": 4231 + }, + { + "epoch": 1.188096574957889, + "grad_norm": 0.6091982126235962, + "learning_rate": 7.5601083202323525e-06, + "loss": 0.3805, + "step": 4232 + }, + { + "epoch": 1.1883773161145423, + "grad_norm": 0.7021400928497314, + "learning_rate": 7.558705177624312e-06, + "loss": 0.4257, + "step": 4233 + }, + { + "epoch": 1.188658057271196, + "grad_norm": 0.548466682434082, + "learning_rate": 7.557301761967167e-06, + "loss": 0.3744, + "step": 4234 + }, + { + "epoch": 1.1889387984278494, + "grad_norm": 0.6605693697929382, + "learning_rate": 7.5558980734106814e-06, + "loss": 0.3982, + "step": 4235 + }, + { + "epoch": 1.189219539584503, + "grad_norm": 0.7793446183204651, + "learning_rate": 7.554494112104647e-06, + "loss": 0.4009, + "step": 4236 + }, + { + "epoch": 1.1895002807411568, + "grad_norm": 0.6424089074134827, + "learning_rate": 7.553089878198887e-06, + "loss": 0.3536, + "step": 4237 + }, + { + "epoch": 1.1897810218978102, + "grad_norm": 0.7062333822250366, + "learning_rate": 7.551685371843251e-06, + "loss": 0.3962, + "step": 4238 + }, + { + "epoch": 1.1900617630544639, + "grad_norm": 0.6652928590774536, + "learning_rate": 7.550280593187621e-06, + "loss": 0.4093, + "step": 4239 + }, + { + "epoch": 1.1903425042111173, + "grad_norm": 0.6986287832260132, + "learning_rate": 7.548875542381904e-06, + "loss": 0.4097, + "step": 4240 + }, + { + "epoch": 1.190623245367771, + "grad_norm": 0.6457189917564392, + "learning_rate": 7.547470219576041e-06, + "loss": 0.4157, + "step": 4241 + }, + { + "epoch": 1.1909039865244244, + "grad_norm": 0.7715068459510803, + "learning_rate": 7.5460646249199956e-06, + "loss": 0.4076, + "step": 4242 + }, + { + "epoch": 1.191184727681078, + "grad_norm": 0.6989235877990723, + "learning_rate": 7.544658758563768e-06, + "loss": 0.4136, + "step": 4243 + }, + { + "epoch": 1.1914654688377315, + "grad_norm": 0.7448019981384277, + "learning_rate": 7.543252620657382e-06, + "loss": 0.3926, + "step": 4244 + }, + { + "epoch": 1.1917462099943852, + "grad_norm": 0.7202630639076233, + "learning_rate": 7.5418462113508906e-06, + "loss": 0.3287, + "step": 4245 + }, + { + "epoch": 1.1920269511510386, + "grad_norm": 0.7235342264175415, + "learning_rate": 7.540439530794379e-06, + "loss": 0.4007, + "step": 4246 + }, + { + "epoch": 1.1923076923076923, + "grad_norm": 0.6588276624679565, + "learning_rate": 7.539032579137958e-06, + "loss": 0.3819, + "step": 4247 + }, + { + "epoch": 1.192588433464346, + "grad_norm": 0.7202818393707275, + "learning_rate": 7.53762535653177e-06, + "loss": 0.4136, + "step": 4248 + }, + { + "epoch": 1.1928691746209994, + "grad_norm": 0.8266968727111816, + "learning_rate": 7.536217863125985e-06, + "loss": 0.4443, + "step": 4249 + }, + { + "epoch": 1.193149915777653, + "grad_norm": 0.7003817558288574, + "learning_rate": 7.534810099070801e-06, + "loss": 0.4214, + "step": 4250 + }, + { + "epoch": 1.1934306569343065, + "grad_norm": 0.7337723970413208, + "learning_rate": 7.533402064516445e-06, + "loss": 0.3992, + "step": 4251 + }, + { + "epoch": 1.1937113980909602, + "grad_norm": 0.6616443395614624, + "learning_rate": 7.5319937596131764e-06, + "loss": 0.357, + "step": 4252 + }, + { + "epoch": 1.1939921392476136, + "grad_norm": 0.6619863510131836, + "learning_rate": 7.530585184511278e-06, + "loss": 0.3843, + "step": 4253 + }, + { + "epoch": 1.1942728804042673, + "grad_norm": 0.6348530054092407, + "learning_rate": 7.529176339361066e-06, + "loss": 0.3468, + "step": 4254 + }, + { + "epoch": 1.194553621560921, + "grad_norm": 0.6510490775108337, + "learning_rate": 7.527767224312883e-06, + "loss": 0.4198, + "step": 4255 + }, + { + "epoch": 1.1948343627175744, + "grad_norm": 0.7726225256919861, + "learning_rate": 7.5263578395171e-06, + "loss": 0.3689, + "step": 4256 + }, + { + "epoch": 1.195115103874228, + "grad_norm": 0.7218582630157471, + "learning_rate": 7.5249481851241195e-06, + "loss": 0.3984, + "step": 4257 + }, + { + "epoch": 1.1953958450308815, + "grad_norm": 0.6921402812004089, + "learning_rate": 7.523538261284371e-06, + "loss": 0.4278, + "step": 4258 + }, + { + "epoch": 1.1956765861875351, + "grad_norm": 0.7084171175956726, + "learning_rate": 7.522128068148311e-06, + "loss": 0.4064, + "step": 4259 + }, + { + "epoch": 1.1959573273441886, + "grad_norm": 0.6785997152328491, + "learning_rate": 7.520717605866429e-06, + "loss": 0.4098, + "step": 4260 + }, + { + "epoch": 1.1962380685008422, + "grad_norm": 0.6730110049247742, + "learning_rate": 7.519306874589238e-06, + "loss": 0.3765, + "step": 4261 + }, + { + "epoch": 1.196518809657496, + "grad_norm": 0.6426097750663757, + "learning_rate": 7.517895874467285e-06, + "loss": 0.3874, + "step": 4262 + }, + { + "epoch": 1.1967995508141493, + "grad_norm": 0.6854656338691711, + "learning_rate": 7.516484605651141e-06, + "loss": 0.4128, + "step": 4263 + }, + { + "epoch": 1.197080291970803, + "grad_norm": 0.7365685701370239, + "learning_rate": 7.5150730682914085e-06, + "loss": 0.3806, + "step": 4264 + }, + { + "epoch": 1.1973610331274565, + "grad_norm": 0.6268132328987122, + "learning_rate": 7.513661262538721e-06, + "loss": 0.3833, + "step": 4265 + }, + { + "epoch": 1.1976417742841101, + "grad_norm": 0.7252869606018066, + "learning_rate": 7.5122491885437324e-06, + "loss": 0.4033, + "step": 4266 + }, + { + "epoch": 1.1979225154407636, + "grad_norm": 0.7028063535690308, + "learning_rate": 7.510836846457134e-06, + "loss": 0.3861, + "step": 4267 + }, + { + "epoch": 1.1982032565974172, + "grad_norm": 0.6498178839683533, + "learning_rate": 7.509424236429641e-06, + "loss": 0.3918, + "step": 4268 + }, + { + "epoch": 1.1984839977540707, + "grad_norm": 0.7453802824020386, + "learning_rate": 7.508011358611997e-06, + "loss": 0.3619, + "step": 4269 + }, + { + "epoch": 1.1987647389107243, + "grad_norm": 0.7189194560050964, + "learning_rate": 7.5065982131549795e-06, + "loss": 0.3859, + "step": 4270 + }, + { + "epoch": 1.1990454800673778, + "grad_norm": 0.6720621585845947, + "learning_rate": 7.505184800209387e-06, + "loss": 0.3784, + "step": 4271 + }, + { + "epoch": 1.1993262212240314, + "grad_norm": 0.6525327563285828, + "learning_rate": 7.503771119926052e-06, + "loss": 0.3805, + "step": 4272 + }, + { + "epoch": 1.199606962380685, + "grad_norm": 0.6999576687812805, + "learning_rate": 7.502357172455832e-06, + "loss": 0.3985, + "step": 4273 + }, + { + "epoch": 1.1998877035373385, + "grad_norm": 0.7010440826416016, + "learning_rate": 7.5009429579496174e-06, + "loss": 0.4044, + "step": 4274 + }, + { + "epoch": 1.2001684446939922, + "grad_norm": 0.707637369632721, + "learning_rate": 7.499528476558321e-06, + "loss": 0.3679, + "step": 4275 + }, + { + "epoch": 1.2004491858506456, + "grad_norm": 0.6848056316375732, + "learning_rate": 7.498113728432891e-06, + "loss": 0.3782, + "step": 4276 + }, + { + "epoch": 1.2007299270072993, + "grad_norm": 0.6553831696510315, + "learning_rate": 7.4966987137242975e-06, + "loss": 0.3867, + "step": 4277 + }, + { + "epoch": 1.2010106681639527, + "grad_norm": 0.7628927826881409, + "learning_rate": 7.495283432583542e-06, + "loss": 0.3743, + "step": 4278 + }, + { + "epoch": 1.2012914093206064, + "grad_norm": 0.791347086429596, + "learning_rate": 7.493867885161658e-06, + "loss": 0.4186, + "step": 4279 + }, + { + "epoch": 1.20157215047726, + "grad_norm": 0.6676722168922424, + "learning_rate": 7.4924520716096995e-06, + "loss": 0.4047, + "step": 4280 + }, + { + "epoch": 1.2018528916339135, + "grad_norm": 0.6947941184043884, + "learning_rate": 7.491035992078757e-06, + "loss": 0.3969, + "step": 4281 + }, + { + "epoch": 1.2021336327905672, + "grad_norm": 0.8484598994255066, + "learning_rate": 7.489619646719943e-06, + "loss": 0.434, + "step": 4282 + }, + { + "epoch": 1.2024143739472206, + "grad_norm": 0.6089879274368286, + "learning_rate": 7.4882030356844025e-06, + "loss": 0.3917, + "step": 4283 + }, + { + "epoch": 1.2026951151038743, + "grad_norm": 0.7126756906509399, + "learning_rate": 7.486786159123307e-06, + "loss": 0.4022, + "step": 4284 + }, + { + "epoch": 1.2029758562605277, + "grad_norm": 0.8028550744056702, + "learning_rate": 7.485369017187858e-06, + "loss": 0.4079, + "step": 4285 + }, + { + "epoch": 1.2032565974171814, + "grad_norm": 0.6776315569877625, + "learning_rate": 7.483951610029282e-06, + "loss": 0.3326, + "step": 4286 + }, + { + "epoch": 1.203537338573835, + "grad_norm": 0.7978324890136719, + "learning_rate": 7.482533937798838e-06, + "loss": 0.3583, + "step": 4287 + }, + { + "epoch": 1.2038180797304885, + "grad_norm": 0.7162542939186096, + "learning_rate": 7.48111600064781e-06, + "loss": 0.385, + "step": 4288 + }, + { + "epoch": 1.2040988208871422, + "grad_norm": 0.6677225232124329, + "learning_rate": 7.47969779872751e-06, + "loss": 0.3427, + "step": 4289 + }, + { + "epoch": 1.2043795620437956, + "grad_norm": 0.7364921569824219, + "learning_rate": 7.478279332189282e-06, + "loss": 0.3826, + "step": 4290 + }, + { + "epoch": 1.2046603032004493, + "grad_norm": 0.6630323529243469, + "learning_rate": 7.476860601184495e-06, + "loss": 0.4213, + "step": 4291 + }, + { + "epoch": 1.2049410443571027, + "grad_norm": 0.6920050382614136, + "learning_rate": 7.475441605864546e-06, + "loss": 0.3748, + "step": 4292 + }, + { + "epoch": 1.2052217855137564, + "grad_norm": 0.6308068037033081, + "learning_rate": 7.4740223463808644e-06, + "loss": 0.4059, + "step": 4293 + }, + { + "epoch": 1.2055025266704098, + "grad_norm": 0.7217302918434143, + "learning_rate": 7.472602822884903e-06, + "loss": 0.4223, + "step": 4294 + }, + { + "epoch": 1.2057832678270635, + "grad_norm": 0.6837269067764282, + "learning_rate": 7.471183035528142e-06, + "loss": 0.3681, + "step": 4295 + }, + { + "epoch": 1.206064008983717, + "grad_norm": 0.5798900723457336, + "learning_rate": 7.469762984462097e-06, + "loss": 0.4099, + "step": 4296 + }, + { + "epoch": 1.2063447501403706, + "grad_norm": 0.7235340476036072, + "learning_rate": 7.468342669838304e-06, + "loss": 0.4243, + "step": 4297 + }, + { + "epoch": 1.2066254912970242, + "grad_norm": 0.6759546399116516, + "learning_rate": 7.466922091808332e-06, + "loss": 0.3993, + "step": 4298 + }, + { + "epoch": 1.2069062324536777, + "grad_norm": 0.7389718890190125, + "learning_rate": 7.465501250523773e-06, + "loss": 0.4034, + "step": 4299 + }, + { + "epoch": 1.2071869736103313, + "grad_norm": 0.5856978893280029, + "learning_rate": 7.464080146136255e-06, + "loss": 0.3981, + "step": 4300 + }, + { + "epoch": 1.2074677147669848, + "grad_norm": 0.6909798979759216, + "learning_rate": 7.462658778797425e-06, + "loss": 0.3528, + "step": 4301 + }, + { + "epoch": 1.2077484559236384, + "grad_norm": 0.6939312219619751, + "learning_rate": 7.461237148658964e-06, + "loss": 0.4294, + "step": 4302 + }, + { + "epoch": 1.2080291970802919, + "grad_norm": 0.6418517827987671, + "learning_rate": 7.459815255872581e-06, + "loss": 0.3889, + "step": 4303 + }, + { + "epoch": 1.2083099382369455, + "grad_norm": 0.6352454423904419, + "learning_rate": 7.458393100590011e-06, + "loss": 0.4098, + "step": 4304 + }, + { + "epoch": 1.2085906793935992, + "grad_norm": 0.7874202132225037, + "learning_rate": 7.456970682963016e-06, + "loss": 0.4049, + "step": 4305 + }, + { + "epoch": 1.2088714205502527, + "grad_norm": 0.6554611921310425, + "learning_rate": 7.455548003143389e-06, + "loss": 0.3827, + "step": 4306 + }, + { + "epoch": 1.2091521617069063, + "grad_norm": 0.6753202080726624, + "learning_rate": 7.4541250612829485e-06, + "loss": 0.3924, + "step": 4307 + }, + { + "epoch": 1.2094329028635598, + "grad_norm": 0.6789506077766418, + "learning_rate": 7.452701857533543e-06, + "loss": 0.3799, + "step": 4308 + }, + { + "epoch": 1.2097136440202134, + "grad_norm": 0.7028040289878845, + "learning_rate": 7.451278392047049e-06, + "loss": 0.3756, + "step": 4309 + }, + { + "epoch": 1.2099943851768669, + "grad_norm": 0.7353365421295166, + "learning_rate": 7.449854664975366e-06, + "loss": 0.4344, + "step": 4310 + }, + { + "epoch": 1.2102751263335205, + "grad_norm": 0.7207481265068054, + "learning_rate": 7.448430676470431e-06, + "loss": 0.3834, + "step": 4311 + }, + { + "epoch": 1.210555867490174, + "grad_norm": 0.6377664804458618, + "learning_rate": 7.447006426684198e-06, + "loss": 0.3741, + "step": 4312 + }, + { + "epoch": 1.2108366086468276, + "grad_norm": 0.7059157490730286, + "learning_rate": 7.445581915768656e-06, + "loss": 0.3996, + "step": 4313 + }, + { + "epoch": 1.211117349803481, + "grad_norm": 0.6970432996749878, + "learning_rate": 7.44415714387582e-06, + "loss": 0.3722, + "step": 4314 + }, + { + "epoch": 1.2113980909601347, + "grad_norm": 0.7008577585220337, + "learning_rate": 7.442732111157734e-06, + "loss": 0.403, + "step": 4315 + }, + { + "epoch": 1.2116788321167884, + "grad_norm": 0.7184580564498901, + "learning_rate": 7.4413068177664664e-06, + "loss": 0.3773, + "step": 4316 + }, + { + "epoch": 1.2119595732734418, + "grad_norm": 0.7665238976478577, + "learning_rate": 7.439881263854116e-06, + "loss": 0.3613, + "step": 4317 + }, + { + "epoch": 1.2122403144300955, + "grad_norm": 0.6726934909820557, + "learning_rate": 7.438455449572811e-06, + "loss": 0.4, + "step": 4318 + }, + { + "epoch": 1.212521055586749, + "grad_norm": 0.7335178852081299, + "learning_rate": 7.437029375074704e-06, + "loss": 0.4192, + "step": 4319 + }, + { + "epoch": 1.2128017967434026, + "grad_norm": 0.6724697351455688, + "learning_rate": 7.435603040511976e-06, + "loss": 0.3784, + "step": 4320 + }, + { + "epoch": 1.213082537900056, + "grad_norm": 0.7500842809677124, + "learning_rate": 7.4341764460368385e-06, + "loss": 0.4021, + "step": 4321 + }, + { + "epoch": 1.2133632790567097, + "grad_norm": 0.6556463837623596, + "learning_rate": 7.432749591801527e-06, + "loss": 0.3962, + "step": 4322 + }, + { + "epoch": 1.2136440202133634, + "grad_norm": 0.6699510812759399, + "learning_rate": 7.431322477958308e-06, + "loss": 0.3723, + "step": 4323 + }, + { + "epoch": 1.2139247613700168, + "grad_norm": 0.7302048206329346, + "learning_rate": 7.429895104659473e-06, + "loss": 0.3964, + "step": 4324 + }, + { + "epoch": 1.2142055025266705, + "grad_norm": 0.7734420299530029, + "learning_rate": 7.428467472057345e-06, + "loss": 0.3965, + "step": 4325 + }, + { + "epoch": 1.214486243683324, + "grad_norm": 0.5939376950263977, + "learning_rate": 7.427039580304268e-06, + "loss": 0.3571, + "step": 4326 + }, + { + "epoch": 1.2147669848399776, + "grad_norm": 0.6674864292144775, + "learning_rate": 7.425611429552621e-06, + "loss": 0.3839, + "step": 4327 + }, + { + "epoch": 1.215047725996631, + "grad_norm": 0.6552734375, + "learning_rate": 7.424183019954805e-06, + "loss": 0.3999, + "step": 4328 + }, + { + "epoch": 1.2153284671532847, + "grad_norm": 0.7163453102111816, + "learning_rate": 7.422754351663252e-06, + "loss": 0.3809, + "step": 4329 + }, + { + "epoch": 1.2156092083099383, + "grad_norm": 0.6272565722465515, + "learning_rate": 7.421325424830421e-06, + "loss": 0.3996, + "step": 4330 + }, + { + "epoch": 1.2158899494665918, + "grad_norm": 0.7307857871055603, + "learning_rate": 7.419896239608799e-06, + "loss": 0.3932, + "step": 4331 + }, + { + "epoch": 1.2161706906232455, + "grad_norm": 0.6637787222862244, + "learning_rate": 7.418466796150896e-06, + "loss": 0.368, + "step": 4332 + }, + { + "epoch": 1.216451431779899, + "grad_norm": 0.7471299171447754, + "learning_rate": 7.417037094609258e-06, + "loss": 0.4043, + "step": 4333 + }, + { + "epoch": 1.2167321729365526, + "grad_norm": 0.7452800869941711, + "learning_rate": 7.415607135136451e-06, + "loss": 0.42, + "step": 4334 + }, + { + "epoch": 1.217012914093206, + "grad_norm": 0.6763039231300354, + "learning_rate": 7.414176917885072e-06, + "loss": 0.3994, + "step": 4335 + }, + { + "epoch": 1.2172936552498597, + "grad_norm": 0.6500682830810547, + "learning_rate": 7.412746443007748e-06, + "loss": 0.4056, + "step": 4336 + }, + { + "epoch": 1.217574396406513, + "grad_norm": 0.7071866393089294, + "learning_rate": 7.411315710657124e-06, + "loss": 0.3858, + "step": 4337 + }, + { + "epoch": 1.2178551375631668, + "grad_norm": 0.7375584244728088, + "learning_rate": 7.409884720985884e-06, + "loss": 0.41, + "step": 4338 + }, + { + "epoch": 1.2181358787198202, + "grad_norm": 0.6340219378471375, + "learning_rate": 7.408453474146731e-06, + "loss": 0.3664, + "step": 4339 + }, + { + "epoch": 1.2184166198764739, + "grad_norm": 0.6191103458404541, + "learning_rate": 7.407021970292403e-06, + "loss": 0.399, + "step": 4340 + }, + { + "epoch": 1.2186973610331275, + "grad_norm": 0.5875264406204224, + "learning_rate": 7.405590209575657e-06, + "loss": 0.357, + "step": 4341 + }, + { + "epoch": 1.218978102189781, + "grad_norm": 0.6360987424850464, + "learning_rate": 7.404158192149285e-06, + "loss": 0.3812, + "step": 4342 + }, + { + "epoch": 1.2192588433464346, + "grad_norm": 0.7508474588394165, + "learning_rate": 7.402725918166099e-06, + "loss": 0.3645, + "step": 4343 + }, + { + "epoch": 1.219539584503088, + "grad_norm": 0.796063244342804, + "learning_rate": 7.401293387778945e-06, + "loss": 0.4028, + "step": 4344 + }, + { + "epoch": 1.2198203256597417, + "grad_norm": 0.6457430720329285, + "learning_rate": 7.399860601140693e-06, + "loss": 0.3877, + "step": 4345 + }, + { + "epoch": 1.2201010668163952, + "grad_norm": 0.6755557060241699, + "learning_rate": 7.398427558404241e-06, + "loss": 0.3582, + "step": 4346 + }, + { + "epoch": 1.2203818079730488, + "grad_norm": 0.752692699432373, + "learning_rate": 7.396994259722517e-06, + "loss": 0.4437, + "step": 4347 + }, + { + "epoch": 1.2206625491297025, + "grad_norm": 0.7466068863868713, + "learning_rate": 7.39556070524847e-06, + "loss": 0.4005, + "step": 4348 + }, + { + "epoch": 1.220943290286356, + "grad_norm": 0.7716429233551025, + "learning_rate": 7.394126895135082e-06, + "loss": 0.4037, + "step": 4349 + }, + { + "epoch": 1.2212240314430096, + "grad_norm": 0.6424754858016968, + "learning_rate": 7.392692829535359e-06, + "loss": 0.3833, + "step": 4350 + }, + { + "epoch": 1.221504772599663, + "grad_norm": 0.6294004917144775, + "learning_rate": 7.391258508602337e-06, + "loss": 0.3644, + "step": 4351 + }, + { + "epoch": 1.2217855137563167, + "grad_norm": 0.6806685924530029, + "learning_rate": 7.389823932489078e-06, + "loss": 0.377, + "step": 4352 + }, + { + "epoch": 1.2220662549129702, + "grad_norm": 0.7771987318992615, + "learning_rate": 7.38838910134867e-06, + "loss": 0.4393, + "step": 4353 + }, + { + "epoch": 1.2223469960696238, + "grad_norm": 0.6604443788528442, + "learning_rate": 7.386954015334229e-06, + "loss": 0.3772, + "step": 4354 + }, + { + "epoch": 1.2226277372262775, + "grad_norm": 0.7321616411209106, + "learning_rate": 7.385518674598899e-06, + "loss": 0.3993, + "step": 4355 + }, + { + "epoch": 1.222908478382931, + "grad_norm": 0.661529004573822, + "learning_rate": 7.38408307929585e-06, + "loss": 0.381, + "step": 4356 + }, + { + "epoch": 1.2231892195395846, + "grad_norm": 0.6855313777923584, + "learning_rate": 7.382647229578282e-06, + "loss": 0.3318, + "step": 4357 + }, + { + "epoch": 1.223469960696238, + "grad_norm": 0.7962076663970947, + "learning_rate": 7.381211125599417e-06, + "loss": 0.4035, + "step": 4358 + }, + { + "epoch": 1.2237507018528917, + "grad_norm": 0.6684330701828003, + "learning_rate": 7.379774767512509e-06, + "loss": 0.3802, + "step": 4359 + }, + { + "epoch": 1.2240314430095451, + "grad_norm": 0.7023851275444031, + "learning_rate": 7.3783381554708366e-06, + "loss": 0.3929, + "step": 4360 + }, + { + "epoch": 1.2243121841661988, + "grad_norm": 0.6956204175949097, + "learning_rate": 7.376901289627706e-06, + "loss": 0.3774, + "step": 4361 + }, + { + "epoch": 1.2245929253228522, + "grad_norm": 0.8025999069213867, + "learning_rate": 7.3754641701364504e-06, + "loss": 0.4143, + "step": 4362 + }, + { + "epoch": 1.224873666479506, + "grad_norm": 0.6624419689178467, + "learning_rate": 7.374026797150431e-06, + "loss": 0.3838, + "step": 4363 + }, + { + "epoch": 1.2251544076361593, + "grad_norm": 0.6223154067993164, + "learning_rate": 7.3725891708230355e-06, + "loss": 0.3765, + "step": 4364 + }, + { + "epoch": 1.225435148792813, + "grad_norm": 0.6757875084877014, + "learning_rate": 7.371151291307677e-06, + "loss": 0.376, + "step": 4365 + }, + { + "epoch": 1.2257158899494667, + "grad_norm": 0.7055069804191589, + "learning_rate": 7.3697131587577985e-06, + "loss": 0.4029, + "step": 4366 + }, + { + "epoch": 1.2259966311061201, + "grad_norm": 0.7640204429626465, + "learning_rate": 7.368274773326868e-06, + "loss": 0.3598, + "step": 4367 + }, + { + "epoch": 1.2262773722627738, + "grad_norm": 0.6684814095497131, + "learning_rate": 7.366836135168381e-06, + "loss": 0.3686, + "step": 4368 + }, + { + "epoch": 1.2265581134194272, + "grad_norm": 0.7763293981552124, + "learning_rate": 7.365397244435859e-06, + "loss": 0.4126, + "step": 4369 + }, + { + "epoch": 1.2268388545760809, + "grad_norm": 0.6652472615242004, + "learning_rate": 7.3639581012828545e-06, + "loss": 0.3997, + "step": 4370 + }, + { + "epoch": 1.2271195957327343, + "grad_norm": 0.7757605910301208, + "learning_rate": 7.362518705862939e-06, + "loss": 0.3872, + "step": 4371 + }, + { + "epoch": 1.227400336889388, + "grad_norm": 0.8113671541213989, + "learning_rate": 7.361079058329721e-06, + "loss": 0.3981, + "step": 4372 + }, + { + "epoch": 1.2276810780460417, + "grad_norm": 0.6892746090888977, + "learning_rate": 7.359639158836828e-06, + "loss": 0.408, + "step": 4373 + }, + { + "epoch": 1.227961819202695, + "grad_norm": 0.6588771343231201, + "learning_rate": 7.358199007537916e-06, + "loss": 0.4066, + "step": 4374 + }, + { + "epoch": 1.2282425603593488, + "grad_norm": 0.7860007286071777, + "learning_rate": 7.3567586045866734e-06, + "loss": 0.3845, + "step": 4375 + }, + { + "epoch": 1.2285233015160022, + "grad_norm": 0.6605427265167236, + "learning_rate": 7.355317950136807e-06, + "loss": 0.3813, + "step": 4376 + }, + { + "epoch": 1.2288040426726559, + "grad_norm": 0.7304157614707947, + "learning_rate": 7.353877044342056e-06, + "loss": 0.4147, + "step": 4377 + }, + { + "epoch": 1.2290847838293093, + "grad_norm": 0.6476666331291199, + "learning_rate": 7.352435887356184e-06, + "loss": 0.3857, + "step": 4378 + }, + { + "epoch": 1.229365524985963, + "grad_norm": 0.7970354557037354, + "learning_rate": 7.350994479332983e-06, + "loss": 0.3632, + "step": 4379 + }, + { + "epoch": 1.2296462661426166, + "grad_norm": 0.8846747875213623, + "learning_rate": 7.349552820426271e-06, + "loss": 0.358, + "step": 4380 + }, + { + "epoch": 1.22992700729927, + "grad_norm": 0.7176412343978882, + "learning_rate": 7.348110910789894e-06, + "loss": 0.4284, + "step": 4381 + }, + { + "epoch": 1.2302077484559237, + "grad_norm": 0.7293211221694946, + "learning_rate": 7.346668750577721e-06, + "loss": 0.4196, + "step": 4382 + }, + { + "epoch": 1.2304884896125772, + "grad_norm": 0.8100600838661194, + "learning_rate": 7.345226339943652e-06, + "loss": 0.4024, + "step": 4383 + }, + { + "epoch": 1.2307692307692308, + "grad_norm": 0.7535078525543213, + "learning_rate": 7.343783679041613e-06, + "loss": 0.4172, + "step": 4384 + }, + { + "epoch": 1.2310499719258843, + "grad_norm": 0.6468392610549927, + "learning_rate": 7.342340768025555e-06, + "loss": 0.4109, + "step": 4385 + }, + { + "epoch": 1.231330713082538, + "grad_norm": 0.7342042326927185, + "learning_rate": 7.3408976070494555e-06, + "loss": 0.3921, + "step": 4386 + }, + { + "epoch": 1.2316114542391914, + "grad_norm": 0.7490071058273315, + "learning_rate": 7.33945419626732e-06, + "loss": 0.3891, + "step": 4387 + }, + { + "epoch": 1.231892195395845, + "grad_norm": 0.7169302701950073, + "learning_rate": 7.338010535833182e-06, + "loss": 0.3515, + "step": 4388 + }, + { + "epoch": 1.2321729365524985, + "grad_norm": 0.7124811410903931, + "learning_rate": 7.336566625901098e-06, + "loss": 0.372, + "step": 4389 + }, + { + "epoch": 1.2324536777091522, + "grad_norm": 0.7964193820953369, + "learning_rate": 7.335122466625153e-06, + "loss": 0.4042, + "step": 4390 + }, + { + "epoch": 1.2327344188658058, + "grad_norm": 0.6440420150756836, + "learning_rate": 7.333678058159461e-06, + "loss": 0.3808, + "step": 4391 + }, + { + "epoch": 1.2330151600224593, + "grad_norm": 0.590033233165741, + "learning_rate": 7.33223340065816e-06, + "loss": 0.3866, + "step": 4392 + }, + { + "epoch": 1.233295901179113, + "grad_norm": 0.730883777141571, + "learning_rate": 7.330788494275411e-06, + "loss": 0.3758, + "step": 4393 + }, + { + "epoch": 1.2335766423357664, + "grad_norm": 0.7454805970191956, + "learning_rate": 7.329343339165409e-06, + "loss": 0.3747, + "step": 4394 + }, + { + "epoch": 1.23385738349242, + "grad_norm": 0.6938319802284241, + "learning_rate": 7.327897935482371e-06, + "loss": 0.4059, + "step": 4395 + }, + { + "epoch": 1.2341381246490735, + "grad_norm": 0.6656152009963989, + "learning_rate": 7.326452283380542e-06, + "loss": 0.3665, + "step": 4396 + }, + { + "epoch": 1.2344188658057271, + "grad_norm": 0.5903286337852478, + "learning_rate": 7.325006383014193e-06, + "loss": 0.4085, + "step": 4397 + }, + { + "epoch": 1.2346996069623808, + "grad_norm": 0.6602579951286316, + "learning_rate": 7.323560234537619e-06, + "loss": 0.3764, + "step": 4398 + }, + { + "epoch": 1.2349803481190342, + "grad_norm": 0.6968615651130676, + "learning_rate": 7.3221138381051475e-06, + "loss": 0.4454, + "step": 4399 + }, + { + "epoch": 1.235261089275688, + "grad_norm": 0.6112107038497925, + "learning_rate": 7.320667193871127e-06, + "loss": 0.3626, + "step": 4400 + }, + { + "epoch": 1.2355418304323413, + "grad_norm": 0.7879973649978638, + "learning_rate": 7.319220301989936e-06, + "loss": 0.4109, + "step": 4401 + }, + { + "epoch": 1.235822571588995, + "grad_norm": 0.7149852514266968, + "learning_rate": 7.317773162615976e-06, + "loss": 0.4146, + "step": 4402 + }, + { + "epoch": 1.2361033127456484, + "grad_norm": 0.699227511882782, + "learning_rate": 7.316325775903678e-06, + "loss": 0.4224, + "step": 4403 + }, + { + "epoch": 1.236384053902302, + "grad_norm": 0.6559727787971497, + "learning_rate": 7.314878142007497e-06, + "loss": 0.3852, + "step": 4404 + }, + { + "epoch": 1.2366647950589555, + "grad_norm": 0.6238139867782593, + "learning_rate": 7.313430261081915e-06, + "loss": 0.3946, + "step": 4405 + }, + { + "epoch": 1.2369455362156092, + "grad_norm": 0.6538670659065247, + "learning_rate": 7.311982133281442e-06, + "loss": 0.4028, + "step": 4406 + }, + { + "epoch": 1.2372262773722627, + "grad_norm": 0.632474422454834, + "learning_rate": 7.310533758760614e-06, + "loss": 0.42, + "step": 4407 + }, + { + "epoch": 1.2375070185289163, + "grad_norm": 0.6869728565216064, + "learning_rate": 7.3090851376739915e-06, + "loss": 0.4116, + "step": 4408 + }, + { + "epoch": 1.23778775968557, + "grad_norm": 0.6850327849388123, + "learning_rate": 7.3076362701761615e-06, + "loss": 0.3722, + "step": 4409 + }, + { + "epoch": 1.2380685008422234, + "grad_norm": 0.7792261838912964, + "learning_rate": 7.306187156421739e-06, + "loss": 0.4284, + "step": 4410 + }, + { + "epoch": 1.238349241998877, + "grad_norm": 0.6640065312385559, + "learning_rate": 7.304737796565364e-06, + "loss": 0.3776, + "step": 4411 + }, + { + "epoch": 1.2386299831555305, + "grad_norm": 0.6064174771308899, + "learning_rate": 7.303288190761705e-06, + "loss": 0.3913, + "step": 4412 + }, + { + "epoch": 1.2389107243121842, + "grad_norm": 0.7190853357315063, + "learning_rate": 7.301838339165454e-06, + "loss": 0.3657, + "step": 4413 + }, + { + "epoch": 1.2391914654688376, + "grad_norm": 0.743168294429779, + "learning_rate": 7.300388241931328e-06, + "loss": 0.4124, + "step": 4414 + }, + { + "epoch": 1.2394722066254913, + "grad_norm": 0.5954951643943787, + "learning_rate": 7.298937899214073e-06, + "loss": 0.385, + "step": 4415 + }, + { + "epoch": 1.239752947782145, + "grad_norm": 0.6010693311691284, + "learning_rate": 7.297487311168464e-06, + "loss": 0.3631, + "step": 4416 + }, + { + "epoch": 1.2400336889387984, + "grad_norm": 0.6950507164001465, + "learning_rate": 7.296036477949295e-06, + "loss": 0.3785, + "step": 4417 + }, + { + "epoch": 1.240314430095452, + "grad_norm": 0.6173332333564758, + "learning_rate": 7.294585399711391e-06, + "loss": 0.409, + "step": 4418 + }, + { + "epoch": 1.2405951712521055, + "grad_norm": 0.6691762804985046, + "learning_rate": 7.293134076609605e-06, + "loss": 0.3891, + "step": 4419 + }, + { + "epoch": 1.2408759124087592, + "grad_norm": 0.6664168238639832, + "learning_rate": 7.291682508798808e-06, + "loss": 0.3618, + "step": 4420 + }, + { + "epoch": 1.2411566535654126, + "grad_norm": 0.7321335077285767, + "learning_rate": 7.290230696433903e-06, + "loss": 0.3792, + "step": 4421 + }, + { + "epoch": 1.2414373947220663, + "grad_norm": 0.6839069128036499, + "learning_rate": 7.288778639669822e-06, + "loss": 0.4251, + "step": 4422 + }, + { + "epoch": 1.24171813587872, + "grad_norm": 0.5564275979995728, + "learning_rate": 7.287326338661518e-06, + "loss": 0.3888, + "step": 4423 + }, + { + "epoch": 1.2419988770353734, + "grad_norm": 0.6266826391220093, + "learning_rate": 7.285873793563972e-06, + "loss": 0.379, + "step": 4424 + }, + { + "epoch": 1.242279618192027, + "grad_norm": 0.661496102809906, + "learning_rate": 7.284421004532187e-06, + "loss": 0.3889, + "step": 4425 + }, + { + "epoch": 1.2425603593486805, + "grad_norm": 0.5980477333068848, + "learning_rate": 7.282967971721199e-06, + "loss": 0.3778, + "step": 4426 + }, + { + "epoch": 1.2428411005053341, + "grad_norm": 0.7250111699104309, + "learning_rate": 7.281514695286066e-06, + "loss": 0.3935, + "step": 4427 + }, + { + "epoch": 1.2431218416619876, + "grad_norm": 0.6160321235656738, + "learning_rate": 7.280061175381873e-06, + "loss": 0.3681, + "step": 4428 + }, + { + "epoch": 1.2434025828186412, + "grad_norm": 0.7048326134681702, + "learning_rate": 7.278607412163729e-06, + "loss": 0.3821, + "step": 4429 + }, + { + "epoch": 1.2436833239752947, + "grad_norm": 0.7044711709022522, + "learning_rate": 7.277153405786774e-06, + "loss": 0.4152, + "step": 4430 + }, + { + "epoch": 1.2439640651319483, + "grad_norm": 0.6765097975730896, + "learning_rate": 7.275699156406167e-06, + "loss": 0.3917, + "step": 4431 + }, + { + "epoch": 1.2442448062886018, + "grad_norm": 0.6712751984596252, + "learning_rate": 7.2742446641770985e-06, + "loss": 0.395, + "step": 4432 + }, + { + "epoch": 1.2445255474452555, + "grad_norm": 0.647374153137207, + "learning_rate": 7.27278992925478e-06, + "loss": 0.339, + "step": 4433 + }, + { + "epoch": 1.2448062886019091, + "grad_norm": 0.5925500988960266, + "learning_rate": 7.271334951794455e-06, + "loss": 0.3714, + "step": 4434 + }, + { + "epoch": 1.2450870297585626, + "grad_norm": 0.7028294801712036, + "learning_rate": 7.269879731951388e-06, + "loss": 0.4011, + "step": 4435 + }, + { + "epoch": 1.2453677709152162, + "grad_norm": 0.6963602304458618, + "learning_rate": 7.268424269880872e-06, + "loss": 0.3703, + "step": 4436 + }, + { + "epoch": 1.2456485120718697, + "grad_norm": 0.6790087223052979, + "learning_rate": 7.266968565738224e-06, + "loss": 0.3891, + "step": 4437 + }, + { + "epoch": 1.2459292532285233, + "grad_norm": 0.6815446615219116, + "learning_rate": 7.265512619678789e-06, + "loss": 0.4022, + "step": 4438 + }, + { + "epoch": 1.2462099943851768, + "grad_norm": 0.6858093738555908, + "learning_rate": 7.264056431857934e-06, + "loss": 0.3755, + "step": 4439 + }, + { + "epoch": 1.2464907355418304, + "grad_norm": 0.7268288135528564, + "learning_rate": 7.262600002431056e-06, + "loss": 0.425, + "step": 4440 + }, + { + "epoch": 1.246771476698484, + "grad_norm": 0.758263349533081, + "learning_rate": 7.261143331553577e-06, + "loss": 0.4372, + "step": 4441 + }, + { + "epoch": 1.2470522178551375, + "grad_norm": 0.6037503480911255, + "learning_rate": 7.259686419380942e-06, + "loss": 0.4161, + "step": 4442 + }, + { + "epoch": 1.2473329590117912, + "grad_norm": 0.643770694732666, + "learning_rate": 7.258229266068625e-06, + "loss": 0.4194, + "step": 4443 + }, + { + "epoch": 1.2476137001684446, + "grad_norm": 0.8054676651954651, + "learning_rate": 7.256771871772124e-06, + "loss": 0.3104, + "step": 4444 + }, + { + "epoch": 1.2478944413250983, + "grad_norm": 0.6536027789115906, + "learning_rate": 7.255314236646962e-06, + "loss": 0.3777, + "step": 4445 + }, + { + "epoch": 1.2481751824817517, + "grad_norm": 0.7296000123023987, + "learning_rate": 7.253856360848689e-06, + "loss": 0.4218, + "step": 4446 + }, + { + "epoch": 1.2484559236384054, + "grad_norm": 0.6687675714492798, + "learning_rate": 7.252398244532881e-06, + "loss": 0.3678, + "step": 4447 + }, + { + "epoch": 1.248736664795059, + "grad_norm": 0.757503092288971, + "learning_rate": 7.25093988785514e-06, + "loss": 0.4333, + "step": 4448 + }, + { + "epoch": 1.2490174059517125, + "grad_norm": 0.6565677523612976, + "learning_rate": 7.249481290971092e-06, + "loss": 0.3587, + "step": 4449 + }, + { + "epoch": 1.2492981471083662, + "grad_norm": 0.7528462409973145, + "learning_rate": 7.248022454036389e-06, + "loss": 0.3811, + "step": 4450 + }, + { + "epoch": 1.2495788882650196, + "grad_norm": 0.7380896210670471, + "learning_rate": 7.246563377206709e-06, + "loss": 0.3768, + "step": 4451 + }, + { + "epoch": 1.2498596294216733, + "grad_norm": 0.6426535844802856, + "learning_rate": 7.245104060637755e-06, + "loss": 0.3918, + "step": 4452 + }, + { + "epoch": 1.2501403705783267, + "grad_norm": 0.7681300640106201, + "learning_rate": 7.2436445044852585e-06, + "loss": 0.3986, + "step": 4453 + }, + { + "epoch": 1.2504211117349804, + "grad_norm": 0.6747211813926697, + "learning_rate": 7.2421847089049724e-06, + "loss": 0.3821, + "step": 4454 + }, + { + "epoch": 1.250701852891634, + "grad_norm": 0.6966944336891174, + "learning_rate": 7.240724674052677e-06, + "loss": 0.4028, + "step": 4455 + }, + { + "epoch": 1.2509825940482875, + "grad_norm": 0.6423379778862, + "learning_rate": 7.239264400084178e-06, + "loss": 0.4268, + "step": 4456 + }, + { + "epoch": 1.251263335204941, + "grad_norm": 0.7469407916069031, + "learning_rate": 7.237803887155308e-06, + "loss": 0.3656, + "step": 4457 + }, + { + "epoch": 1.2515440763615946, + "grad_norm": 0.7970960140228271, + "learning_rate": 7.236343135421924e-06, + "loss": 0.4237, + "step": 4458 + }, + { + "epoch": 1.2518248175182483, + "grad_norm": 0.6215502619743347, + "learning_rate": 7.234882145039906e-06, + "loss": 0.405, + "step": 4459 + }, + { + "epoch": 1.2521055586749017, + "grad_norm": 0.725236713886261, + "learning_rate": 7.233420916165164e-06, + "loss": 0.3527, + "step": 4460 + }, + { + "epoch": 1.2523862998315554, + "grad_norm": 0.7854107618331909, + "learning_rate": 7.231959448953629e-06, + "loss": 0.3903, + "step": 4461 + }, + { + "epoch": 1.2526670409882088, + "grad_norm": 0.6946197152137756, + "learning_rate": 7.230497743561265e-06, + "loss": 0.3815, + "step": 4462 + }, + { + "epoch": 1.2529477821448625, + "grad_norm": 0.641368567943573, + "learning_rate": 7.22903580014405e-06, + "loss": 0.3651, + "step": 4463 + }, + { + "epoch": 1.253228523301516, + "grad_norm": 0.6621046662330627, + "learning_rate": 7.227573618857995e-06, + "loss": 0.3845, + "step": 4464 + }, + { + "epoch": 1.2535092644581696, + "grad_norm": 0.5420482754707336, + "learning_rate": 7.226111199859137e-06, + "loss": 0.3902, + "step": 4465 + }, + { + "epoch": 1.2537900056148232, + "grad_norm": 0.6948087811470032, + "learning_rate": 7.224648543303534e-06, + "loss": 0.4198, + "step": 4466 + }, + { + "epoch": 1.2540707467714767, + "grad_norm": 0.8189418315887451, + "learning_rate": 7.223185649347274e-06, + "loss": 0.437, + "step": 4467 + }, + { + "epoch": 1.2543514879281303, + "grad_norm": 0.7353101372718811, + "learning_rate": 7.221722518146467e-06, + "loss": 0.3968, + "step": 4468 + }, + { + "epoch": 1.2546322290847838, + "grad_norm": 0.6004009246826172, + "learning_rate": 7.220259149857247e-06, + "loss": 0.367, + "step": 4469 + }, + { + "epoch": 1.2549129702414374, + "grad_norm": 0.6492067575454712, + "learning_rate": 7.218795544635778e-06, + "loss": 0.3861, + "step": 4470 + }, + { + "epoch": 1.2551937113980909, + "grad_norm": 0.7004754543304443, + "learning_rate": 7.2173317026382465e-06, + "loss": 0.3975, + "step": 4471 + }, + { + "epoch": 1.2554744525547445, + "grad_norm": 0.6812534928321838, + "learning_rate": 7.215867624020863e-06, + "loss": 0.3993, + "step": 4472 + }, + { + "epoch": 1.2557551937113982, + "grad_norm": 0.6776750683784485, + "learning_rate": 7.214403308939869e-06, + "loss": 0.4008, + "step": 4473 + }, + { + "epoch": 1.2560359348680517, + "grad_norm": 0.5960357785224915, + "learning_rate": 7.212938757551522e-06, + "loss": 0.3607, + "step": 4474 + }, + { + "epoch": 1.256316676024705, + "grad_norm": 0.5871161222457886, + "learning_rate": 7.211473970012113e-06, + "loss": 0.3623, + "step": 4475 + }, + { + "epoch": 1.2565974171813588, + "grad_norm": 0.605216920375824, + "learning_rate": 7.210008946477954e-06, + "loss": 0.4004, + "step": 4476 + }, + { + "epoch": 1.2568781583380124, + "grad_norm": 0.5579923391342163, + "learning_rate": 7.208543687105384e-06, + "loss": 0.3792, + "step": 4477 + }, + { + "epoch": 1.2571588994946659, + "grad_norm": 0.6565082669258118, + "learning_rate": 7.207078192050765e-06, + "loss": 0.3888, + "step": 4478 + }, + { + "epoch": 1.2574396406513195, + "grad_norm": 0.6623365879058838, + "learning_rate": 7.205612461470488e-06, + "loss": 0.3972, + "step": 4479 + }, + { + "epoch": 1.2577203818079732, + "grad_norm": 0.7521390318870544, + "learning_rate": 7.2041464955209625e-06, + "loss": 0.3865, + "step": 4480 + }, + { + "epoch": 1.2580011229646266, + "grad_norm": 0.6704253554344177, + "learning_rate": 7.202680294358631e-06, + "loss": 0.4221, + "step": 4481 + }, + { + "epoch": 1.25828186412128, + "grad_norm": 0.562984049320221, + "learning_rate": 7.201213858139956e-06, + "loss": 0.3913, + "step": 4482 + }, + { + "epoch": 1.2585626052779337, + "grad_norm": 0.6283015608787537, + "learning_rate": 7.199747187021427e-06, + "loss": 0.37, + "step": 4483 + }, + { + "epoch": 1.2588433464345874, + "grad_norm": 0.5919360518455505, + "learning_rate": 7.198280281159556e-06, + "loss": 0.3826, + "step": 4484 + }, + { + "epoch": 1.2591240875912408, + "grad_norm": 0.6767114400863647, + "learning_rate": 7.1968131407108835e-06, + "loss": 0.3658, + "step": 4485 + }, + { + "epoch": 1.2594048287478945, + "grad_norm": 0.6459712982177734, + "learning_rate": 7.195345765831975e-06, + "loss": 0.4175, + "step": 4486 + }, + { + "epoch": 1.259685569904548, + "grad_norm": 0.6185809969902039, + "learning_rate": 7.193878156679417e-06, + "loss": 0.3764, + "step": 4487 + }, + { + "epoch": 1.2599663110612016, + "grad_norm": 0.6733853816986084, + "learning_rate": 7.1924103134098254e-06, + "loss": 0.3657, + "step": 4488 + }, + { + "epoch": 1.260247052217855, + "grad_norm": 0.7166831493377686, + "learning_rate": 7.19094223617984e-06, + "loss": 0.3635, + "step": 4489 + }, + { + "epoch": 1.2605277933745087, + "grad_norm": 0.6229596734046936, + "learning_rate": 7.189473925146124e-06, + "loss": 0.4057, + "step": 4490 + }, + { + "epoch": 1.2608085345311624, + "grad_norm": 0.6371515989303589, + "learning_rate": 7.188005380465365e-06, + "loss": 0.4026, + "step": 4491 + }, + { + "epoch": 1.2610892756878158, + "grad_norm": 0.7031722068786621, + "learning_rate": 7.186536602294278e-06, + "loss": 0.3936, + "step": 4492 + }, + { + "epoch": 1.2613700168444695, + "grad_norm": 0.6174464821815491, + "learning_rate": 7.185067590789602e-06, + "loss": 0.3815, + "step": 4493 + }, + { + "epoch": 1.261650758001123, + "grad_norm": 0.7459012866020203, + "learning_rate": 7.183598346108101e-06, + "loss": 0.4227, + "step": 4494 + }, + { + "epoch": 1.2619314991577766, + "grad_norm": 0.632767379283905, + "learning_rate": 7.1821288684065635e-06, + "loss": 0.3733, + "step": 4495 + }, + { + "epoch": 1.26221224031443, + "grad_norm": 0.7163012027740479, + "learning_rate": 7.180659157841803e-06, + "loss": 0.3764, + "step": 4496 + }, + { + "epoch": 1.2624929814710837, + "grad_norm": 0.7456682324409485, + "learning_rate": 7.179189214570658e-06, + "loss": 0.4029, + "step": 4497 + }, + { + "epoch": 1.2627737226277373, + "grad_norm": 0.737336277961731, + "learning_rate": 7.17771903874999e-06, + "loss": 0.4172, + "step": 4498 + }, + { + "epoch": 1.2630544637843908, + "grad_norm": 0.6734129786491394, + "learning_rate": 7.17624863053669e-06, + "loss": 0.4146, + "step": 4499 + }, + { + "epoch": 1.2633352049410442, + "grad_norm": 0.653513491153717, + "learning_rate": 7.174777990087668e-06, + "loss": 0.3847, + "step": 4500 + }, + { + "epoch": 1.263615946097698, + "grad_norm": 0.6573703289031982, + "learning_rate": 7.173307117559865e-06, + "loss": 0.3884, + "step": 4501 + }, + { + "epoch": 1.2638966872543516, + "grad_norm": 0.7086719274520874, + "learning_rate": 7.17183601311024e-06, + "loss": 0.3638, + "step": 4502 + }, + { + "epoch": 1.264177428411005, + "grad_norm": 0.6080599427223206, + "learning_rate": 7.170364676895782e-06, + "loss": 0.3999, + "step": 4503 + }, + { + "epoch": 1.2644581695676587, + "grad_norm": 0.6222747564315796, + "learning_rate": 7.168893109073502e-06, + "loss": 0.359, + "step": 4504 + }, + { + "epoch": 1.264738910724312, + "grad_norm": 0.6191179752349854, + "learning_rate": 7.167421309800436e-06, + "loss": 0.3997, + "step": 4505 + }, + { + "epoch": 1.2650196518809658, + "grad_norm": 0.7251825332641602, + "learning_rate": 7.165949279233647e-06, + "loss": 0.3808, + "step": 4506 + }, + { + "epoch": 1.2653003930376192, + "grad_norm": 0.6500434279441833, + "learning_rate": 7.164477017530221e-06, + "loss": 0.4165, + "step": 4507 + }, + { + "epoch": 1.2655811341942729, + "grad_norm": 0.6976383328437805, + "learning_rate": 7.1630045248472665e-06, + "loss": 0.436, + "step": 4508 + }, + { + "epoch": 1.2658618753509265, + "grad_norm": 0.6470715403556824, + "learning_rate": 7.16153180134192e-06, + "loss": 0.4134, + "step": 4509 + }, + { + "epoch": 1.26614261650758, + "grad_norm": 0.6182039976119995, + "learning_rate": 7.160058847171342e-06, + "loss": 0.3937, + "step": 4510 + }, + { + "epoch": 1.2664233576642336, + "grad_norm": 0.6108161807060242, + "learning_rate": 7.158585662492715e-06, + "loss": 0.3997, + "step": 4511 + }, + { + "epoch": 1.266704098820887, + "grad_norm": 0.6060779094696045, + "learning_rate": 7.1571122474632505e-06, + "loss": 0.3347, + "step": 4512 + }, + { + "epoch": 1.2669848399775407, + "grad_norm": 0.6959679126739502, + "learning_rate": 7.15563860224018e-06, + "loss": 0.4157, + "step": 4513 + }, + { + "epoch": 1.2672655811341942, + "grad_norm": 0.7535659074783325, + "learning_rate": 7.154164726980764e-06, + "loss": 0.4114, + "step": 4514 + }, + { + "epoch": 1.2675463222908478, + "grad_norm": 0.7022605538368225, + "learning_rate": 7.152690621842284e-06, + "loss": 0.3758, + "step": 4515 + }, + { + "epoch": 1.2678270634475015, + "grad_norm": 0.7148244380950928, + "learning_rate": 7.151216286982048e-06, + "loss": 0.3723, + "step": 4516 + }, + { + "epoch": 1.268107804604155, + "grad_norm": 0.6735973954200745, + "learning_rate": 7.1497417225573865e-06, + "loss": 0.3861, + "step": 4517 + }, + { + "epoch": 1.2683885457608086, + "grad_norm": 0.641895055770874, + "learning_rate": 7.1482669287256575e-06, + "loss": 0.387, + "step": 4518 + }, + { + "epoch": 1.268669286917462, + "grad_norm": 0.7048580646514893, + "learning_rate": 7.146791905644241e-06, + "loss": 0.427, + "step": 4519 + }, + { + "epoch": 1.2689500280741157, + "grad_norm": 0.6548430919647217, + "learning_rate": 7.145316653470542e-06, + "loss": 0.352, + "step": 4520 + }, + { + "epoch": 1.2692307692307692, + "grad_norm": 0.6917059421539307, + "learning_rate": 7.143841172361991e-06, + "loss": 0.3406, + "step": 4521 + }, + { + "epoch": 1.2695115103874228, + "grad_norm": 0.6860353946685791, + "learning_rate": 7.142365462476042e-06, + "loss": 0.4143, + "step": 4522 + }, + { + "epoch": 1.2697922515440765, + "grad_norm": 0.6748304963111877, + "learning_rate": 7.140889523970173e-06, + "loss": 0.3962, + "step": 4523 + }, + { + "epoch": 1.27007299270073, + "grad_norm": 0.6810723543167114, + "learning_rate": 7.139413357001886e-06, + "loss": 0.3934, + "step": 4524 + }, + { + "epoch": 1.2703537338573834, + "grad_norm": 0.6174544095993042, + "learning_rate": 7.1379369617287105e-06, + "loss": 0.4198, + "step": 4525 + }, + { + "epoch": 1.270634475014037, + "grad_norm": 0.680924117565155, + "learning_rate": 7.136460338308197e-06, + "loss": 0.4235, + "step": 4526 + }, + { + "epoch": 1.2709152161706907, + "grad_norm": 0.6375547647476196, + "learning_rate": 7.134983486897924e-06, + "loss": 0.3882, + "step": 4527 + }, + { + "epoch": 1.2711959573273441, + "grad_norm": 0.6849893927574158, + "learning_rate": 7.133506407655488e-06, + "loss": 0.3744, + "step": 4528 + }, + { + "epoch": 1.2714766984839978, + "grad_norm": 0.6663550138473511, + "learning_rate": 7.132029100738517e-06, + "loss": 0.3773, + "step": 4529 + }, + { + "epoch": 1.2717574396406512, + "grad_norm": 0.6391407251358032, + "learning_rate": 7.130551566304657e-06, + "loss": 0.4126, + "step": 4530 + }, + { + "epoch": 1.272038180797305, + "grad_norm": 0.6585769057273865, + "learning_rate": 7.129073804511584e-06, + "loss": 0.4077, + "step": 4531 + }, + { + "epoch": 1.2723189219539583, + "grad_norm": 0.7295487523078918, + "learning_rate": 7.127595815516993e-06, + "loss": 0.3754, + "step": 4532 + }, + { + "epoch": 1.272599663110612, + "grad_norm": 0.631105363368988, + "learning_rate": 7.126117599478608e-06, + "loss": 0.3674, + "step": 4533 + }, + { + "epoch": 1.2728804042672657, + "grad_norm": 0.7147228121757507, + "learning_rate": 7.124639156554176e-06, + "loss": 0.4603, + "step": 4534 + }, + { + "epoch": 1.2731611454239191, + "grad_norm": 0.6172426342964172, + "learning_rate": 7.123160486901464e-06, + "loss": 0.3626, + "step": 4535 + }, + { + "epoch": 1.2734418865805728, + "grad_norm": 0.7407633066177368, + "learning_rate": 7.121681590678267e-06, + "loss": 0.3988, + "step": 4536 + }, + { + "epoch": 1.2737226277372262, + "grad_norm": 0.6925725936889648, + "learning_rate": 7.120202468042404e-06, + "loss": 0.3724, + "step": 4537 + }, + { + "epoch": 1.2740033688938799, + "grad_norm": 0.6817641854286194, + "learning_rate": 7.11872311915172e-06, + "loss": 0.4323, + "step": 4538 + }, + { + "epoch": 1.2742841100505333, + "grad_norm": 0.7241194844245911, + "learning_rate": 7.117243544164081e-06, + "loss": 0.3942, + "step": 4539 + }, + { + "epoch": 1.274564851207187, + "grad_norm": 0.6990113258361816, + "learning_rate": 7.115763743237375e-06, + "loss": 0.4256, + "step": 4540 + }, + { + "epoch": 1.2748455923638407, + "grad_norm": 0.763110876083374, + "learning_rate": 7.1142837165295206e-06, + "loss": 0.4015, + "step": 4541 + }, + { + "epoch": 1.275126333520494, + "grad_norm": 0.6815094351768494, + "learning_rate": 7.112803464198455e-06, + "loss": 0.3997, + "step": 4542 + }, + { + "epoch": 1.2754070746771475, + "grad_norm": 0.7032138705253601, + "learning_rate": 7.111322986402143e-06, + "loss": 0.4123, + "step": 4543 + }, + { + "epoch": 1.2756878158338012, + "grad_norm": 0.7433674931526184, + "learning_rate": 7.109842283298572e-06, + "loss": 0.3829, + "step": 4544 + }, + { + "epoch": 1.2759685569904549, + "grad_norm": 0.6728748083114624, + "learning_rate": 7.108361355045752e-06, + "loss": 0.3915, + "step": 4545 + }, + { + "epoch": 1.2762492981471083, + "grad_norm": 0.7070676684379578, + "learning_rate": 7.10688020180172e-06, + "loss": 0.386, + "step": 4546 + }, + { + "epoch": 1.276530039303762, + "grad_norm": 0.6893311142921448, + "learning_rate": 7.1053988237245345e-06, + "loss": 0.3893, + "step": 4547 + }, + { + "epoch": 1.2768107804604156, + "grad_norm": 0.6288242340087891, + "learning_rate": 7.103917220972277e-06, + "loss": 0.3819, + "step": 4548 + }, + { + "epoch": 1.277091521617069, + "grad_norm": 0.7185351252555847, + "learning_rate": 7.102435393703058e-06, + "loss": 0.3633, + "step": 4549 + }, + { + "epoch": 1.2773722627737225, + "grad_norm": 0.6069153547286987, + "learning_rate": 7.10095334207501e-06, + "loss": 0.3942, + "step": 4550 + }, + { + "epoch": 1.2776530039303762, + "grad_norm": 0.6017578840255737, + "learning_rate": 7.099471066246284e-06, + "loss": 0.4023, + "step": 4551 + }, + { + "epoch": 1.2779337450870298, + "grad_norm": 0.6145133376121521, + "learning_rate": 7.097988566375063e-06, + "loss": 0.3857, + "step": 4552 + }, + { + "epoch": 1.2782144862436833, + "grad_norm": 0.6975274682044983, + "learning_rate": 7.096505842619547e-06, + "loss": 0.3487, + "step": 4553 + }, + { + "epoch": 1.278495227400337, + "grad_norm": 0.7050109505653381, + "learning_rate": 7.095022895137968e-06, + "loss": 0.3856, + "step": 4554 + }, + { + "epoch": 1.2787759685569904, + "grad_norm": 0.6894435286521912, + "learning_rate": 7.0935397240885705e-06, + "loss": 0.3619, + "step": 4555 + }, + { + "epoch": 1.279056709713644, + "grad_norm": 0.7413249015808105, + "learning_rate": 7.092056329629635e-06, + "loss": 0.3906, + "step": 4556 + }, + { + "epoch": 1.2793374508702975, + "grad_norm": 0.6645633578300476, + "learning_rate": 7.090572711919457e-06, + "loss": 0.4181, + "step": 4557 + }, + { + "epoch": 1.2796181920269512, + "grad_norm": 0.659733772277832, + "learning_rate": 7.089088871116358e-06, + "loss": 0.4172, + "step": 4558 + }, + { + "epoch": 1.2798989331836048, + "grad_norm": 0.695060133934021, + "learning_rate": 7.087604807378687e-06, + "loss": 0.3894, + "step": 4559 + }, + { + "epoch": 1.2801796743402583, + "grad_norm": 0.6080955862998962, + "learning_rate": 7.086120520864812e-06, + "loss": 0.3982, + "step": 4560 + }, + { + "epoch": 1.280460415496912, + "grad_norm": 0.7476343512535095, + "learning_rate": 7.084636011733129e-06, + "loss": 0.3969, + "step": 4561 + }, + { + "epoch": 1.2807411566535654, + "grad_norm": 0.6847297549247742, + "learning_rate": 7.083151280142053e-06, + "loss": 0.3908, + "step": 4562 + }, + { + "epoch": 1.281021897810219, + "grad_norm": 0.6581273078918457, + "learning_rate": 7.0816663262500275e-06, + "loss": 0.3605, + "step": 4563 + }, + { + "epoch": 1.2813026389668725, + "grad_norm": 0.6434887051582336, + "learning_rate": 7.080181150215517e-06, + "loss": 0.4163, + "step": 4564 + }, + { + "epoch": 1.2815833801235261, + "grad_norm": 0.786142110824585, + "learning_rate": 7.078695752197009e-06, + "loss": 0.4492, + "step": 4565 + }, + { + "epoch": 1.2818641212801798, + "grad_norm": 0.7068789005279541, + "learning_rate": 7.0772101323530184e-06, + "loss": 0.3972, + "step": 4566 + }, + { + "epoch": 1.2821448624368332, + "grad_norm": 0.6115596890449524, + "learning_rate": 7.07572429084208e-06, + "loss": 0.4151, + "step": 4567 + }, + { + "epoch": 1.2824256035934867, + "grad_norm": 0.7637412548065186, + "learning_rate": 7.074238227822752e-06, + "loss": 0.3977, + "step": 4568 + }, + { + "epoch": 1.2827063447501403, + "grad_norm": 0.6415371298789978, + "learning_rate": 7.0727519434536185e-06, + "loss": 0.3745, + "step": 4569 + }, + { + "epoch": 1.282987085906794, + "grad_norm": 0.6550493836402893, + "learning_rate": 7.071265437893289e-06, + "loss": 0.4112, + "step": 4570 + }, + { + "epoch": 1.2832678270634474, + "grad_norm": 0.7450150847434998, + "learning_rate": 7.06977871130039e-06, + "loss": 0.4056, + "step": 4571 + }, + { + "epoch": 1.283548568220101, + "grad_norm": 0.6011745929718018, + "learning_rate": 7.06829176383358e-06, + "loss": 0.3866, + "step": 4572 + }, + { + "epoch": 1.2838293093767548, + "grad_norm": 0.6650264263153076, + "learning_rate": 7.066804595651535e-06, + "loss": 0.3731, + "step": 4573 + }, + { + "epoch": 1.2841100505334082, + "grad_norm": 0.6137609481811523, + "learning_rate": 7.065317206912954e-06, + "loss": 0.3993, + "step": 4574 + }, + { + "epoch": 1.2843907916900617, + "grad_norm": 0.668127179145813, + "learning_rate": 7.0638295977765654e-06, + "loss": 0.3718, + "step": 4575 + }, + { + "epoch": 1.2846715328467153, + "grad_norm": 0.7284173965454102, + "learning_rate": 7.062341768401117e-06, + "loss": 0.4086, + "step": 4576 + }, + { + "epoch": 1.284952274003369, + "grad_norm": 0.6427273750305176, + "learning_rate": 7.060853718945378e-06, + "loss": 0.3908, + "step": 4577 + }, + { + "epoch": 1.2852330151600224, + "grad_norm": 0.668682336807251, + "learning_rate": 7.059365449568148e-06, + "loss": 0.4184, + "step": 4578 + }, + { + "epoch": 1.285513756316676, + "grad_norm": 0.6647806763648987, + "learning_rate": 7.057876960428243e-06, + "loss": 0.4035, + "step": 4579 + }, + { + "epoch": 1.2857944974733295, + "grad_norm": 0.63369220495224, + "learning_rate": 7.056388251684505e-06, + "loss": 0.4016, + "step": 4580 + }, + { + "epoch": 1.2860752386299832, + "grad_norm": 0.6674875020980835, + "learning_rate": 7.054899323495801e-06, + "loss": 0.3932, + "step": 4581 + }, + { + "epoch": 1.2863559797866366, + "grad_norm": 0.6439608335494995, + "learning_rate": 7.0534101760210206e-06, + "loss": 0.3866, + "step": 4582 + }, + { + "epoch": 1.2866367209432903, + "grad_norm": 0.7269125580787659, + "learning_rate": 7.0519208094190735e-06, + "loss": 0.3778, + "step": 4583 + }, + { + "epoch": 1.286917462099944, + "grad_norm": 0.6372610926628113, + "learning_rate": 7.0504312238489e-06, + "loss": 0.3753, + "step": 4584 + }, + { + "epoch": 1.2871982032565974, + "grad_norm": 0.6168985366821289, + "learning_rate": 7.048941419469456e-06, + "loss": 0.3966, + "step": 4585 + }, + { + "epoch": 1.287478944413251, + "grad_norm": 0.6626611948013306, + "learning_rate": 7.0474513964397255e-06, + "loss": 0.4384, + "step": 4586 + }, + { + "epoch": 1.2877596855699045, + "grad_norm": 0.6824188828468323, + "learning_rate": 7.0459611549187126e-06, + "loss": 0.3753, + "step": 4587 + }, + { + "epoch": 1.2880404267265582, + "grad_norm": 0.6214696168899536, + "learning_rate": 7.04447069506545e-06, + "loss": 0.3798, + "step": 4588 + }, + { + "epoch": 1.2883211678832116, + "grad_norm": 0.6406710743904114, + "learning_rate": 7.042980017038988e-06, + "loss": 0.3652, + "step": 4589 + }, + { + "epoch": 1.2886019090398653, + "grad_norm": 0.6062746644020081, + "learning_rate": 7.041489120998403e-06, + "loss": 0.3859, + "step": 4590 + }, + { + "epoch": 1.288882650196519, + "grad_norm": 0.757871150970459, + "learning_rate": 7.0399980071027955e-06, + "loss": 0.3947, + "step": 4591 + }, + { + "epoch": 1.2891633913531724, + "grad_norm": 0.5964840054512024, + "learning_rate": 7.038506675511285e-06, + "loss": 0.3819, + "step": 4592 + }, + { + "epoch": 1.2894441325098258, + "grad_norm": 0.6043334603309631, + "learning_rate": 7.037015126383019e-06, + "loss": 0.3766, + "step": 4593 + }, + { + "epoch": 1.2897248736664795, + "grad_norm": 0.747077465057373, + "learning_rate": 7.035523359877167e-06, + "loss": 0.4247, + "step": 4594 + }, + { + "epoch": 1.2900056148231331, + "grad_norm": 0.7753455638885498, + "learning_rate": 7.0340313761529185e-06, + "loss": 0.3674, + "step": 4595 + }, + { + "epoch": 1.2902863559797866, + "grad_norm": 0.8529857993125916, + "learning_rate": 7.032539175369491e-06, + "loss": 0.4015, + "step": 4596 + }, + { + "epoch": 1.2905670971364402, + "grad_norm": 0.8258609771728516, + "learning_rate": 7.031046757686123e-06, + "loss": 0.4435, + "step": 4597 + }, + { + "epoch": 1.2908478382930937, + "grad_norm": 0.7256587743759155, + "learning_rate": 7.029554123262075e-06, + "loss": 0.3806, + "step": 4598 + }, + { + "epoch": 1.2911285794497473, + "grad_norm": 0.7312936186790466, + "learning_rate": 7.028061272256631e-06, + "loss": 0.3874, + "step": 4599 + }, + { + "epoch": 1.2914093206064008, + "grad_norm": 0.7907974123954773, + "learning_rate": 7.0265682048291005e-06, + "loss": 0.384, + "step": 4600 + }, + { + "epoch": 1.2916900617630545, + "grad_norm": 0.8494774103164673, + "learning_rate": 7.025074921138813e-06, + "loss": 0.4253, + "step": 4601 + }, + { + "epoch": 1.2919708029197081, + "grad_norm": 0.6940559148788452, + "learning_rate": 7.023581421345124e-06, + "loss": 0.3402, + "step": 4602 + }, + { + "epoch": 1.2922515440763616, + "grad_norm": 0.6202247142791748, + "learning_rate": 7.022087705607409e-06, + "loss": 0.4177, + "step": 4603 + }, + { + "epoch": 1.2925322852330152, + "grad_norm": 0.6664928793907166, + "learning_rate": 7.020593774085068e-06, + "loss": 0.3901, + "step": 4604 + }, + { + "epoch": 1.2928130263896687, + "grad_norm": 0.6981474757194519, + "learning_rate": 7.019099626937527e-06, + "loss": 0.379, + "step": 4605 + }, + { + "epoch": 1.2930937675463223, + "grad_norm": 0.8299378752708435, + "learning_rate": 7.017605264324227e-06, + "loss": 0.4382, + "step": 4606 + }, + { + "epoch": 1.2933745087029758, + "grad_norm": 0.6710399389266968, + "learning_rate": 7.016110686404642e-06, + "loss": 0.3913, + "step": 4607 + }, + { + "epoch": 1.2936552498596294, + "grad_norm": 0.6304298043251038, + "learning_rate": 7.014615893338259e-06, + "loss": 0.3665, + "step": 4608 + }, + { + "epoch": 1.293935991016283, + "grad_norm": 0.6952391266822815, + "learning_rate": 7.013120885284599e-06, + "loss": 0.3765, + "step": 4609 + }, + { + "epoch": 1.2942167321729365, + "grad_norm": 0.7461031079292297, + "learning_rate": 7.0116256624031945e-06, + "loss": 0.3934, + "step": 4610 + }, + { + "epoch": 1.2944974733295902, + "grad_norm": 0.7327235341072083, + "learning_rate": 7.0101302248536105e-06, + "loss": 0.3925, + "step": 4611 + }, + { + "epoch": 1.2947782144862436, + "grad_norm": 0.6534166932106018, + "learning_rate": 7.008634572795427e-06, + "loss": 0.3925, + "step": 4612 + }, + { + "epoch": 1.2950589556428973, + "grad_norm": 0.6636688113212585, + "learning_rate": 7.007138706388254e-06, + "loss": 0.4072, + "step": 4613 + }, + { + "epoch": 1.2953396967995507, + "grad_norm": 0.629555881023407, + "learning_rate": 7.005642625791721e-06, + "loss": 0.3744, + "step": 4614 + }, + { + "epoch": 1.2956204379562044, + "grad_norm": 0.7262469530105591, + "learning_rate": 7.004146331165478e-06, + "loss": 0.365, + "step": 4615 + }, + { + "epoch": 1.295901179112858, + "grad_norm": 0.67493736743927, + "learning_rate": 7.002649822669203e-06, + "loss": 0.364, + "step": 4616 + }, + { + "epoch": 1.2961819202695115, + "grad_norm": 0.729296863079071, + "learning_rate": 7.001153100462591e-06, + "loss": 0.384, + "step": 4617 + }, + { + "epoch": 1.296462661426165, + "grad_norm": 0.7144120931625366, + "learning_rate": 6.999656164705365e-06, + "loss": 0.3775, + "step": 4618 + }, + { + "epoch": 1.2967434025828186, + "grad_norm": 0.6951630711555481, + "learning_rate": 6.9981590155572675e-06, + "loss": 0.3757, + "step": 4619 + }, + { + "epoch": 1.2970241437394723, + "grad_norm": 0.7642596960067749, + "learning_rate": 6.996661653178067e-06, + "loss": 0.3543, + "step": 4620 + }, + { + "epoch": 1.2973048848961257, + "grad_norm": 0.7871041297912598, + "learning_rate": 6.99516407772755e-06, + "loss": 0.4006, + "step": 4621 + }, + { + "epoch": 1.2975856260527794, + "grad_norm": 0.693824291229248, + "learning_rate": 6.993666289365531e-06, + "loss": 0.4093, + "step": 4622 + }, + { + "epoch": 1.2978663672094328, + "grad_norm": 0.742197573184967, + "learning_rate": 6.9921682882518414e-06, + "loss": 0.4218, + "step": 4623 + }, + { + "epoch": 1.2981471083660865, + "grad_norm": 0.8252508044242859, + "learning_rate": 6.990670074546342e-06, + "loss": 0.3815, + "step": 4624 + }, + { + "epoch": 1.29842784952274, + "grad_norm": 0.7689230442047119, + "learning_rate": 6.98917164840891e-06, + "loss": 0.404, + "step": 4625 + }, + { + "epoch": 1.2987085906793936, + "grad_norm": 0.616387665271759, + "learning_rate": 6.9876730099994504e-06, + "loss": 0.3916, + "step": 4626 + }, + { + "epoch": 1.2989893318360473, + "grad_norm": 0.5897514820098877, + "learning_rate": 6.9861741594778885e-06, + "loss": 0.3425, + "step": 4627 + }, + { + "epoch": 1.2992700729927007, + "grad_norm": 0.7335630059242249, + "learning_rate": 6.984675097004171e-06, + "loss": 0.3576, + "step": 4628 + }, + { + "epoch": 1.2995508141493544, + "grad_norm": 0.7389088869094849, + "learning_rate": 6.983175822738268e-06, + "loss": 0.4253, + "step": 4629 + }, + { + "epoch": 1.2998315553060078, + "grad_norm": 0.7147209048271179, + "learning_rate": 6.9816763368401755e-06, + "loss": 0.3943, + "step": 4630 + }, + { + "epoch": 1.3001122964626615, + "grad_norm": 0.7022382020950317, + "learning_rate": 6.980176639469907e-06, + "loss": 0.3814, + "step": 4631 + }, + { + "epoch": 1.300393037619315, + "grad_norm": 0.8424223065376282, + "learning_rate": 6.978676730787502e-06, + "loss": 0.4243, + "step": 4632 + }, + { + "epoch": 1.3006737787759686, + "grad_norm": 0.7687547206878662, + "learning_rate": 6.97717661095302e-06, + "loss": 0.3854, + "step": 4633 + }, + { + "epoch": 1.3009545199326222, + "grad_norm": 0.7166351675987244, + "learning_rate": 6.975676280126545e-06, + "loss": 0.3763, + "step": 4634 + }, + { + "epoch": 1.3012352610892757, + "grad_norm": 0.8682746291160583, + "learning_rate": 6.974175738468183e-06, + "loss": 0.3786, + "step": 4635 + }, + { + "epoch": 1.3015160022459291, + "grad_norm": 0.6913161277770996, + "learning_rate": 6.972674986138064e-06, + "loss": 0.3932, + "step": 4636 + }, + { + "epoch": 1.3017967434025828, + "grad_norm": 0.7797034978866577, + "learning_rate": 6.971174023296337e-06, + "loss": 0.409, + "step": 4637 + }, + { + "epoch": 1.3020774845592364, + "grad_norm": 0.8386295437812805, + "learning_rate": 6.969672850103176e-06, + "loss": 0.3876, + "step": 4638 + }, + { + "epoch": 1.3023582257158899, + "grad_norm": 0.6327503323554993, + "learning_rate": 6.968171466718777e-06, + "loss": 0.3772, + "step": 4639 + }, + { + "epoch": 1.3026389668725435, + "grad_norm": 0.671227216720581, + "learning_rate": 6.966669873303359e-06, + "loss": 0.3812, + "step": 4640 + }, + { + "epoch": 1.3029197080291972, + "grad_norm": 0.8056958913803101, + "learning_rate": 6.965168070017162e-06, + "loss": 0.4121, + "step": 4641 + }, + { + "epoch": 1.3032004491858507, + "grad_norm": 0.7812299132347107, + "learning_rate": 6.963666057020448e-06, + "loss": 0.462, + "step": 4642 + }, + { + "epoch": 1.303481190342504, + "grad_norm": 0.6905235052108765, + "learning_rate": 6.962163834473506e-06, + "loss": 0.3778, + "step": 4643 + }, + { + "epoch": 1.3037619314991578, + "grad_norm": 0.8202456831932068, + "learning_rate": 6.960661402536639e-06, + "loss": 0.4063, + "step": 4644 + }, + { + "epoch": 1.3040426726558114, + "grad_norm": 0.6483975648880005, + "learning_rate": 6.959158761370181e-06, + "loss": 0.3859, + "step": 4645 + }, + { + "epoch": 1.3043234138124649, + "grad_norm": 0.6956228017807007, + "learning_rate": 6.957655911134484e-06, + "loss": 0.3739, + "step": 4646 + }, + { + "epoch": 1.3046041549691185, + "grad_norm": 0.779199481010437, + "learning_rate": 6.956152851989919e-06, + "loss": 0.444, + "step": 4647 + }, + { + "epoch": 1.304884896125772, + "grad_norm": 0.8077702522277832, + "learning_rate": 6.9546495840968885e-06, + "loss": 0.3742, + "step": 4648 + }, + { + "epoch": 1.3051656372824256, + "grad_norm": 0.6182279586791992, + "learning_rate": 6.953146107615809e-06, + "loss": 0.381, + "step": 4649 + }, + { + "epoch": 1.305446378439079, + "grad_norm": 0.6720951795578003, + "learning_rate": 6.951642422707123e-06, + "loss": 0.3864, + "step": 4650 + }, + { + "epoch": 1.3057271195957327, + "grad_norm": 0.8674094676971436, + "learning_rate": 6.950138529531294e-06, + "loss": 0.427, + "step": 4651 + }, + { + "epoch": 1.3060078607523864, + "grad_norm": 0.8232053518295288, + "learning_rate": 6.948634428248807e-06, + "loss": 0.3995, + "step": 4652 + }, + { + "epoch": 1.3062886019090398, + "grad_norm": 0.7324113845825195, + "learning_rate": 6.947130119020173e-06, + "loss": 0.3881, + "step": 4653 + }, + { + "epoch": 1.3065693430656935, + "grad_norm": 0.6415976881980896, + "learning_rate": 6.945625602005922e-06, + "loss": 0.3767, + "step": 4654 + }, + { + "epoch": 1.306850084222347, + "grad_norm": 0.6434155106544495, + "learning_rate": 6.944120877366605e-06, + "loss": 0.3903, + "step": 4655 + }, + { + "epoch": 1.3071308253790006, + "grad_norm": 0.7344347834587097, + "learning_rate": 6.942615945262796e-06, + "loss": 0.3712, + "step": 4656 + }, + { + "epoch": 1.307411566535654, + "grad_norm": 0.6861408948898315, + "learning_rate": 6.9411108058550955e-06, + "loss": 0.3868, + "step": 4657 + }, + { + "epoch": 1.3076923076923077, + "grad_norm": 0.7437962293624878, + "learning_rate": 6.9396054593041196e-06, + "loss": 0.3829, + "step": 4658 + }, + { + "epoch": 1.3079730488489614, + "grad_norm": 0.6208816766738892, + "learning_rate": 6.938099905770511e-06, + "loss": 0.3673, + "step": 4659 + }, + { + "epoch": 1.3082537900056148, + "grad_norm": 0.7573358416557312, + "learning_rate": 6.936594145414934e-06, + "loss": 0.3972, + "step": 4660 + }, + { + "epoch": 1.3085345311622683, + "grad_norm": 0.7199413180351257, + "learning_rate": 6.935088178398072e-06, + "loss": 0.4038, + "step": 4661 + }, + { + "epoch": 1.308815272318922, + "grad_norm": 0.6689475178718567, + "learning_rate": 6.9335820048806326e-06, + "loss": 0.3737, + "step": 4662 + }, + { + "epoch": 1.3090960134755756, + "grad_norm": 0.6980438232421875, + "learning_rate": 6.932075625023344e-06, + "loss": 0.3853, + "step": 4663 + }, + { + "epoch": 1.309376754632229, + "grad_norm": 0.747400164604187, + "learning_rate": 6.930569038986962e-06, + "loss": 0.3957, + "step": 4664 + }, + { + "epoch": 1.3096574957888827, + "grad_norm": 0.8000480532646179, + "learning_rate": 6.929062246932258e-06, + "loss": 0.4203, + "step": 4665 + }, + { + "epoch": 1.3099382369455363, + "grad_norm": 0.7655662894248962, + "learning_rate": 6.927555249020026e-06, + "loss": 0.4033, + "step": 4666 + }, + { + "epoch": 1.3102189781021898, + "grad_norm": 0.7041757702827454, + "learning_rate": 6.9260480454110845e-06, + "loss": 0.3708, + "step": 4667 + }, + { + "epoch": 1.3104997192588432, + "grad_norm": 0.6548405289649963, + "learning_rate": 6.924540636266272e-06, + "loss": 0.3569, + "step": 4668 + }, + { + "epoch": 1.310780460415497, + "grad_norm": 0.6934897303581238, + "learning_rate": 6.923033021746453e-06, + "loss": 0.4432, + "step": 4669 + }, + { + "epoch": 1.3110612015721506, + "grad_norm": 0.7211781144142151, + "learning_rate": 6.921525202012507e-06, + "loss": 0.4008, + "step": 4670 + }, + { + "epoch": 1.311341942728804, + "grad_norm": 0.6320825815200806, + "learning_rate": 6.920017177225341e-06, + "loss": 0.3678, + "step": 4671 + }, + { + "epoch": 1.3116226838854577, + "grad_norm": 0.6099188327789307, + "learning_rate": 6.918508947545881e-06, + "loss": 0.3341, + "step": 4672 + }, + { + "epoch": 1.311903425042111, + "grad_norm": 0.6788693070411682, + "learning_rate": 6.917000513135077e-06, + "loss": 0.3569, + "step": 4673 + }, + { + "epoch": 1.3121841661987648, + "grad_norm": 0.7344499230384827, + "learning_rate": 6.915491874153899e-06, + "loss": 0.4154, + "step": 4674 + }, + { + "epoch": 1.3124649073554182, + "grad_norm": 0.6813397407531738, + "learning_rate": 6.91398303076334e-06, + "loss": 0.3867, + "step": 4675 + }, + { + "epoch": 1.3127456485120719, + "grad_norm": 0.7256249785423279, + "learning_rate": 6.912473983124414e-06, + "loss": 0.3861, + "step": 4676 + }, + { + "epoch": 1.3130263896687255, + "grad_norm": 0.6895406246185303, + "learning_rate": 6.910964731398158e-06, + "loss": 0.3696, + "step": 4677 + }, + { + "epoch": 1.313307130825379, + "grad_norm": 0.6773607134819031, + "learning_rate": 6.909455275745629e-06, + "loss": 0.3931, + "step": 4678 + }, + { + "epoch": 1.3135878719820326, + "grad_norm": 0.6725152730941772, + "learning_rate": 6.907945616327907e-06, + "loss": 0.4195, + "step": 4679 + }, + { + "epoch": 1.313868613138686, + "grad_norm": 0.6737792491912842, + "learning_rate": 6.906435753306094e-06, + "loss": 0.3797, + "step": 4680 + }, + { + "epoch": 1.3141493542953397, + "grad_norm": 0.6173524856567383, + "learning_rate": 6.904925686841313e-06, + "loss": 0.4058, + "step": 4681 + }, + { + "epoch": 1.3144300954519932, + "grad_norm": 0.7905880808830261, + "learning_rate": 6.903415417094709e-06, + "loss": 0.3481, + "step": 4682 + }, + { + "epoch": 1.3147108366086468, + "grad_norm": 0.7190317511558533, + "learning_rate": 6.901904944227446e-06, + "loss": 0.3783, + "step": 4683 + }, + { + "epoch": 1.3149915777653005, + "grad_norm": 0.649093747138977, + "learning_rate": 6.900394268400716e-06, + "loss": 0.3654, + "step": 4684 + }, + { + "epoch": 1.315272318921954, + "grad_norm": 0.6908594965934753, + "learning_rate": 6.898883389775728e-06, + "loss": 0.4037, + "step": 4685 + }, + { + "epoch": 1.3155530600786074, + "grad_norm": 0.7254620790481567, + "learning_rate": 6.897372308513712e-06, + "loss": 0.4032, + "step": 4686 + }, + { + "epoch": 1.315833801235261, + "grad_norm": 0.678902804851532, + "learning_rate": 6.895861024775922e-06, + "loss": 0.4111, + "step": 4687 + }, + { + "epoch": 1.3161145423919147, + "grad_norm": 0.7710278034210205, + "learning_rate": 6.894349538723635e-06, + "loss": 0.3768, + "step": 4688 + }, + { + "epoch": 1.3163952835485682, + "grad_norm": 0.7409265637397766, + "learning_rate": 6.892837850518144e-06, + "loss": 0.3601, + "step": 4689 + }, + { + "epoch": 1.3166760247052218, + "grad_norm": 0.7537786960601807, + "learning_rate": 6.891325960320767e-06, + "loss": 0.3888, + "step": 4690 + }, + { + "epoch": 1.3169567658618755, + "grad_norm": 0.7065321207046509, + "learning_rate": 6.889813868292846e-06, + "loss": 0.3763, + "step": 4691 + }, + { + "epoch": 1.317237507018529, + "grad_norm": 0.750227153301239, + "learning_rate": 6.888301574595742e-06, + "loss": 0.3961, + "step": 4692 + }, + { + "epoch": 1.3175182481751824, + "grad_norm": 0.604944109916687, + "learning_rate": 6.886789079390837e-06, + "loss": 0.4111, + "step": 4693 + }, + { + "epoch": 1.317798989331836, + "grad_norm": 0.8548795580863953, + "learning_rate": 6.885276382839533e-06, + "loss": 0.3593, + "step": 4694 + }, + { + "epoch": 1.3180797304884897, + "grad_norm": 0.7772150635719299, + "learning_rate": 6.883763485103257e-06, + "loss": 0.4223, + "step": 4695 + }, + { + "epoch": 1.3183604716451431, + "grad_norm": 0.7162172794342041, + "learning_rate": 6.882250386343456e-06, + "loss": 0.4122, + "step": 4696 + }, + { + "epoch": 1.3186412128017968, + "grad_norm": 0.6120654940605164, + "learning_rate": 6.8807370867216e-06, + "loss": 0.3827, + "step": 4697 + }, + { + "epoch": 1.3189219539584502, + "grad_norm": 0.7186130881309509, + "learning_rate": 6.879223586399178e-06, + "loss": 0.3805, + "step": 4698 + }, + { + "epoch": 1.319202695115104, + "grad_norm": 0.6152911186218262, + "learning_rate": 6.8777098855377e-06, + "loss": 0.3824, + "step": 4699 + }, + { + "epoch": 1.3194834362717573, + "grad_norm": 0.7523589730262756, + "learning_rate": 6.8761959842987e-06, + "loss": 0.3878, + "step": 4700 + }, + { + "epoch": 1.319764177428411, + "grad_norm": 0.6040613651275635, + "learning_rate": 6.87468188284373e-06, + "loss": 0.3592, + "step": 4701 + }, + { + "epoch": 1.3200449185850647, + "grad_norm": 0.6443442106246948, + "learning_rate": 6.87316758133437e-06, + "loss": 0.3611, + "step": 4702 + }, + { + "epoch": 1.3203256597417181, + "grad_norm": 0.7300065755844116, + "learning_rate": 6.871653079932213e-06, + "loss": 0.3827, + "step": 4703 + }, + { + "epoch": 1.3206064008983718, + "grad_norm": 0.7388545274734497, + "learning_rate": 6.87013837879888e-06, + "loss": 0.3998, + "step": 4704 + }, + { + "epoch": 1.3208871420550252, + "grad_norm": 0.7331262826919556, + "learning_rate": 6.868623478096006e-06, + "loss": 0.3922, + "step": 4705 + }, + { + "epoch": 1.3211678832116789, + "grad_norm": 0.6619069576263428, + "learning_rate": 6.867108377985257e-06, + "loss": 0.4034, + "step": 4706 + }, + { + "epoch": 1.3214486243683323, + "grad_norm": 0.6988587975502014, + "learning_rate": 6.865593078628311e-06, + "loss": 0.4308, + "step": 4707 + }, + { + "epoch": 1.321729365524986, + "grad_norm": 0.7060587406158447, + "learning_rate": 6.8640775801868746e-06, + "loss": 0.4065, + "step": 4708 + }, + { + "epoch": 1.3220101066816397, + "grad_norm": 0.6800733804702759, + "learning_rate": 6.8625618828226695e-06, + "loss": 0.47, + "step": 4709 + }, + { + "epoch": 1.322290847838293, + "grad_norm": 0.7192081212997437, + "learning_rate": 6.861045986697443e-06, + "loss": 0.4265, + "step": 4710 + }, + { + "epoch": 1.3225715889949465, + "grad_norm": 0.703007698059082, + "learning_rate": 6.8595298919729624e-06, + "loss": 0.3654, + "step": 4711 + }, + { + "epoch": 1.3228523301516002, + "grad_norm": 0.7066115736961365, + "learning_rate": 6.858013598811015e-06, + "loss": 0.3571, + "step": 4712 + }, + { + "epoch": 1.3231330713082539, + "grad_norm": 0.6508426666259766, + "learning_rate": 6.8564971073734095e-06, + "loss": 0.3763, + "step": 4713 + }, + { + "epoch": 1.3234138124649073, + "grad_norm": 0.6828879714012146, + "learning_rate": 6.8549804178219794e-06, + "loss": 0.3662, + "step": 4714 + }, + { + "epoch": 1.323694553621561, + "grad_norm": 0.6254562139511108, + "learning_rate": 6.8534635303185735e-06, + "loss": 0.3912, + "step": 4715 + }, + { + "epoch": 1.3239752947782144, + "grad_norm": 0.7081087231636047, + "learning_rate": 6.8519464450250665e-06, + "loss": 0.422, + "step": 4716 + }, + { + "epoch": 1.324256035934868, + "grad_norm": 0.6534614562988281, + "learning_rate": 6.850429162103349e-06, + "loss": 0.3728, + "step": 4717 + }, + { + "epoch": 1.3245367770915215, + "grad_norm": 0.7958827614784241, + "learning_rate": 6.84891168171534e-06, + "loss": 0.4647, + "step": 4718 + }, + { + "epoch": 1.3248175182481752, + "grad_norm": 0.7704675197601318, + "learning_rate": 6.847394004022975e-06, + "loss": 0.4027, + "step": 4719 + }, + { + "epoch": 1.3250982594048288, + "grad_norm": 0.5801774263381958, + "learning_rate": 6.845876129188209e-06, + "loss": 0.3593, + "step": 4720 + }, + { + "epoch": 1.3253790005614823, + "grad_norm": 0.6821433305740356, + "learning_rate": 6.844358057373021e-06, + "loss": 0.3843, + "step": 4721 + }, + { + "epoch": 1.325659741718136, + "grad_norm": 0.7506273984909058, + "learning_rate": 6.8428397887394105e-06, + "loss": 0.4549, + "step": 4722 + }, + { + "epoch": 1.3259404828747894, + "grad_norm": 0.7236044406890869, + "learning_rate": 6.841321323449398e-06, + "loss": 0.3661, + "step": 4723 + }, + { + "epoch": 1.326221224031443, + "grad_norm": 0.6838476061820984, + "learning_rate": 6.8398026616650246e-06, + "loss": 0.3912, + "step": 4724 + }, + { + "epoch": 1.3265019651880965, + "grad_norm": 0.7017772793769836, + "learning_rate": 6.838283803548351e-06, + "loss": 0.4085, + "step": 4725 + }, + { + "epoch": 1.3267827063447502, + "grad_norm": 0.7446981072425842, + "learning_rate": 6.836764749261463e-06, + "loss": 0.4056, + "step": 4726 + }, + { + "epoch": 1.3270634475014038, + "grad_norm": 0.7386606335639954, + "learning_rate": 6.835245498966461e-06, + "loss": 0.4392, + "step": 4727 + }, + { + "epoch": 1.3273441886580573, + "grad_norm": 0.6297206878662109, + "learning_rate": 6.833726052825474e-06, + "loss": 0.3706, + "step": 4728 + }, + { + "epoch": 1.327624929814711, + "grad_norm": 0.6871871948242188, + "learning_rate": 6.832206411000644e-06, + "loss": 0.4382, + "step": 4729 + }, + { + "epoch": 1.3279056709713644, + "grad_norm": 0.6700953841209412, + "learning_rate": 6.83068657365414e-06, + "loss": 0.3578, + "step": 4730 + }, + { + "epoch": 1.328186412128018, + "grad_norm": 0.6798109412193298, + "learning_rate": 6.829166540948151e-06, + "loss": 0.3947, + "step": 4731 + }, + { + "epoch": 1.3284671532846715, + "grad_norm": 0.7043603658676147, + "learning_rate": 6.827646313044882e-06, + "loss": 0.3808, + "step": 4732 + }, + { + "epoch": 1.3287478944413251, + "grad_norm": 0.7176543474197388, + "learning_rate": 6.826125890106563e-06, + "loss": 0.3915, + "step": 4733 + }, + { + "epoch": 1.3290286355979788, + "grad_norm": 0.5913230776786804, + "learning_rate": 6.824605272295446e-06, + "loss": 0.3551, + "step": 4734 + }, + { + "epoch": 1.3293093767546322, + "grad_norm": 0.5929445028305054, + "learning_rate": 6.8230844597738014e-06, + "loss": 0.4258, + "step": 4735 + }, + { + "epoch": 1.3295901179112857, + "grad_norm": 0.6223021745681763, + "learning_rate": 6.821563452703919e-06, + "loss": 0.4027, + "step": 4736 + }, + { + "epoch": 1.3298708590679393, + "grad_norm": 0.7025001049041748, + "learning_rate": 6.8200422512481146e-06, + "loss": 0.3603, + "step": 4737 + }, + { + "epoch": 1.330151600224593, + "grad_norm": 0.73182213306427, + "learning_rate": 6.818520855568717e-06, + "loss": 0.4313, + "step": 4738 + }, + { + "epoch": 1.3304323413812464, + "grad_norm": 0.6106387972831726, + "learning_rate": 6.816999265828083e-06, + "loss": 0.3771, + "step": 4739 + }, + { + "epoch": 1.3307130825379, + "grad_norm": 0.7461013793945312, + "learning_rate": 6.8154774821885864e-06, + "loss": 0.4075, + "step": 4740 + }, + { + "epoch": 1.3309938236945535, + "grad_norm": 0.6924073696136475, + "learning_rate": 6.813955504812621e-06, + "loss": 0.3849, + "step": 4741 + }, + { + "epoch": 1.3312745648512072, + "grad_norm": 0.6440805792808533, + "learning_rate": 6.812433333862608e-06, + "loss": 0.3861, + "step": 4742 + }, + { + "epoch": 1.3315553060078607, + "grad_norm": 0.716837465763092, + "learning_rate": 6.810910969500979e-06, + "loss": 0.391, + "step": 4743 + }, + { + "epoch": 1.3318360471645143, + "grad_norm": 0.6487555503845215, + "learning_rate": 6.809388411890192e-06, + "loss": 0.352, + "step": 4744 + }, + { + "epoch": 1.332116788321168, + "grad_norm": 0.6747896671295166, + "learning_rate": 6.807865661192725e-06, + "loss": 0.3625, + "step": 4745 + }, + { + "epoch": 1.3323975294778214, + "grad_norm": 0.7779172658920288, + "learning_rate": 6.806342717571078e-06, + "loss": 0.3995, + "step": 4746 + }, + { + "epoch": 1.332678270634475, + "grad_norm": 0.671245813369751, + "learning_rate": 6.80481958118777e-06, + "loss": 0.4116, + "step": 4747 + }, + { + "epoch": 1.3329590117911285, + "grad_norm": 0.5985339879989624, + "learning_rate": 6.803296252205338e-06, + "loss": 0.3933, + "step": 4748 + }, + { + "epoch": 1.3332397529477822, + "grad_norm": 0.6273095607757568, + "learning_rate": 6.8017727307863445e-06, + "loss": 0.3972, + "step": 4749 + }, + { + "epoch": 1.3335204941044356, + "grad_norm": 0.6496458649635315, + "learning_rate": 6.80024901709337e-06, + "loss": 0.4196, + "step": 4750 + }, + { + "epoch": 1.3338012352610893, + "grad_norm": 0.5970061421394348, + "learning_rate": 6.798725111289015e-06, + "loss": 0.3995, + "step": 4751 + }, + { + "epoch": 1.334081976417743, + "grad_norm": 0.638443112373352, + "learning_rate": 6.797201013535901e-06, + "loss": 0.4375, + "step": 4752 + }, + { + "epoch": 1.3343627175743964, + "grad_norm": 0.6787781119346619, + "learning_rate": 6.795676723996672e-06, + "loss": 0.3667, + "step": 4753 + }, + { + "epoch": 1.3346434587310498, + "grad_norm": 0.6766923666000366, + "learning_rate": 6.794152242833989e-06, + "loss": 0.3838, + "step": 4754 + }, + { + "epoch": 1.3349241998877035, + "grad_norm": 0.6247906684875488, + "learning_rate": 6.792627570210536e-06, + "loss": 0.3787, + "step": 4755 + }, + { + "epoch": 1.3352049410443572, + "grad_norm": 0.6960628032684326, + "learning_rate": 6.7911027062890165e-06, + "loss": 0.3338, + "step": 4756 + }, + { + "epoch": 1.3354856822010106, + "grad_norm": 0.6159615516662598, + "learning_rate": 6.789577651232153e-06, + "loss": 0.3983, + "step": 4757 + }, + { + "epoch": 1.3357664233576643, + "grad_norm": 0.572173535823822, + "learning_rate": 6.788052405202693e-06, + "loss": 0.4016, + "step": 4758 + }, + { + "epoch": 1.336047164514318, + "grad_norm": 0.6744593381881714, + "learning_rate": 6.7865269683634e-06, + "loss": 0.3672, + "step": 4759 + }, + { + "epoch": 1.3363279056709714, + "grad_norm": 0.6162299513816833, + "learning_rate": 6.7850013408770565e-06, + "loss": 0.3888, + "step": 4760 + }, + { + "epoch": 1.3366086468276248, + "grad_norm": 0.6695587635040283, + "learning_rate": 6.783475522906471e-06, + "loss": 0.4109, + "step": 4761 + }, + { + "epoch": 1.3368893879842785, + "grad_norm": 0.6276293396949768, + "learning_rate": 6.781949514614469e-06, + "loss": 0.3826, + "step": 4762 + }, + { + "epoch": 1.3371701291409321, + "grad_norm": 0.7240124344825745, + "learning_rate": 6.780423316163896e-06, + "loss": 0.393, + "step": 4763 + }, + { + "epoch": 1.3374508702975856, + "grad_norm": 0.6815361380577087, + "learning_rate": 6.7788969277176165e-06, + "loss": 0.3703, + "step": 4764 + }, + { + "epoch": 1.3377316114542392, + "grad_norm": 0.6647552251815796, + "learning_rate": 6.77737034943852e-06, + "loss": 0.3896, + "step": 4765 + }, + { + "epoch": 1.3380123526108927, + "grad_norm": 0.6063252687454224, + "learning_rate": 6.775843581489513e-06, + "loss": 0.3891, + "step": 4766 + }, + { + "epoch": 1.3382930937675463, + "grad_norm": 0.6022602319717407, + "learning_rate": 6.774316624033522e-06, + "loss": 0.3921, + "step": 4767 + }, + { + "epoch": 1.3385738349241998, + "grad_norm": 0.6594793796539307, + "learning_rate": 6.772789477233494e-06, + "loss": 0.4026, + "step": 4768 + }, + { + "epoch": 1.3388545760808535, + "grad_norm": 0.7128239274024963, + "learning_rate": 6.771262141252399e-06, + "loss": 0.3933, + "step": 4769 + }, + { + "epoch": 1.3391353172375071, + "grad_norm": 0.5674261450767517, + "learning_rate": 6.769734616253223e-06, + "loss": 0.3872, + "step": 4770 + }, + { + "epoch": 1.3394160583941606, + "grad_norm": 0.7471799850463867, + "learning_rate": 6.7682069023989725e-06, + "loss": 0.379, + "step": 4771 + }, + { + "epoch": 1.3396967995508142, + "grad_norm": 0.7530657649040222, + "learning_rate": 6.766678999852678e-06, + "loss": 0.36, + "step": 4772 + }, + { + "epoch": 1.3399775407074677, + "grad_norm": 0.6597678065299988, + "learning_rate": 6.765150908777387e-06, + "loss": 0.3906, + "step": 4773 + }, + { + "epoch": 1.3402582818641213, + "grad_norm": 0.6415244936943054, + "learning_rate": 6.763622629336168e-06, + "loss": 0.3793, + "step": 4774 + }, + { + "epoch": 1.3405390230207748, + "grad_norm": 0.6748092174530029, + "learning_rate": 6.7620941616921076e-06, + "loss": 0.3995, + "step": 4775 + }, + { + "epoch": 1.3408197641774284, + "grad_norm": 0.7586249709129333, + "learning_rate": 6.760565506008319e-06, + "loss": 0.3841, + "step": 4776 + }, + { + "epoch": 1.341100505334082, + "grad_norm": 0.6067720055580139, + "learning_rate": 6.759036662447924e-06, + "loss": 0.4036, + "step": 4777 + }, + { + "epoch": 1.3413812464907355, + "grad_norm": 0.6660099029541016, + "learning_rate": 6.757507631174076e-06, + "loss": 0.3918, + "step": 4778 + }, + { + "epoch": 1.341661987647389, + "grad_norm": 0.6044845581054688, + "learning_rate": 6.755978412349944e-06, + "loss": 0.3747, + "step": 4779 + }, + { + "epoch": 1.3419427288040426, + "grad_norm": 0.6778552532196045, + "learning_rate": 6.754449006138717e-06, + "loss": 0.3929, + "step": 4780 + }, + { + "epoch": 1.3422234699606963, + "grad_norm": 0.7300291061401367, + "learning_rate": 6.7529194127036005e-06, + "loss": 0.3853, + "step": 4781 + }, + { + "epoch": 1.3425042111173497, + "grad_norm": 0.7000691294670105, + "learning_rate": 6.7513896322078246e-06, + "loss": 0.3733, + "step": 4782 + }, + { + "epoch": 1.3427849522740034, + "grad_norm": 0.7240469455718994, + "learning_rate": 6.749859664814639e-06, + "loss": 0.4096, + "step": 4783 + }, + { + "epoch": 1.343065693430657, + "grad_norm": 0.6703082919120789, + "learning_rate": 6.7483295106873104e-06, + "loss": 0.3749, + "step": 4784 + }, + { + "epoch": 1.3433464345873105, + "grad_norm": 0.6959947943687439, + "learning_rate": 6.74679916998913e-06, + "loss": 0.4297, + "step": 4785 + }, + { + "epoch": 1.343627175743964, + "grad_norm": 0.6914154887199402, + "learning_rate": 6.7452686428834045e-06, + "loss": 0.3624, + "step": 4786 + }, + { + "epoch": 1.3439079169006176, + "grad_norm": 0.8058205842971802, + "learning_rate": 6.743737929533462e-06, + "loss": 0.3956, + "step": 4787 + }, + { + "epoch": 1.3441886580572713, + "grad_norm": 0.6181507706642151, + "learning_rate": 6.742207030102652e-06, + "loss": 0.4118, + "step": 4788 + }, + { + "epoch": 1.3444693992139247, + "grad_norm": 0.7075345516204834, + "learning_rate": 6.740675944754343e-06, + "loss": 0.3525, + "step": 4789 + }, + { + "epoch": 1.3447501403705784, + "grad_norm": 0.7400590181350708, + "learning_rate": 6.739144673651918e-06, + "loss": 0.3979, + "step": 4790 + }, + { + "epoch": 1.3450308815272318, + "grad_norm": 0.9430344700813293, + "learning_rate": 6.7376132169587915e-06, + "loss": 0.4275, + "step": 4791 + }, + { + "epoch": 1.3453116226838855, + "grad_norm": 0.6225433945655823, + "learning_rate": 6.7360815748383865e-06, + "loss": 0.4083, + "step": 4792 + }, + { + "epoch": 1.345592363840539, + "grad_norm": 0.6359659433364868, + "learning_rate": 6.7345497474541534e-06, + "loss": 0.3592, + "step": 4793 + }, + { + "epoch": 1.3458731049971926, + "grad_norm": 0.619032084941864, + "learning_rate": 6.733017734969557e-06, + "loss": 0.3815, + "step": 4794 + }, + { + "epoch": 1.3461538461538463, + "grad_norm": 0.6978216767311096, + "learning_rate": 6.731485537548084e-06, + "loss": 0.3659, + "step": 4795 + }, + { + "epoch": 1.3464345873104997, + "grad_norm": 0.7958037257194519, + "learning_rate": 6.729953155353243e-06, + "loss": 0.3671, + "step": 4796 + }, + { + "epoch": 1.3467153284671534, + "grad_norm": 0.8144029378890991, + "learning_rate": 6.728420588548558e-06, + "loss": 0.3808, + "step": 4797 + }, + { + "epoch": 1.3469960696238068, + "grad_norm": 0.7494853734970093, + "learning_rate": 6.726887837297578e-06, + "loss": 0.4052, + "step": 4798 + }, + { + "epoch": 1.3472768107804605, + "grad_norm": 0.678192675113678, + "learning_rate": 6.725354901763865e-06, + "loss": 0.3847, + "step": 4799 + }, + { + "epoch": 1.347557551937114, + "grad_norm": 0.7762433290481567, + "learning_rate": 6.7238217821110066e-06, + "loss": 0.3866, + "step": 4800 + }, + { + "epoch": 1.3478382930937676, + "grad_norm": 0.7528349757194519, + "learning_rate": 6.722288478502608e-06, + "loss": 0.3616, + "step": 4801 + }, + { + "epoch": 1.3481190342504212, + "grad_norm": 0.6515021920204163, + "learning_rate": 6.720754991102292e-06, + "loss": 0.379, + "step": 4802 + }, + { + "epoch": 1.3483997754070747, + "grad_norm": 0.6461201906204224, + "learning_rate": 6.719221320073705e-06, + "loss": 0.4429, + "step": 4803 + }, + { + "epoch": 1.3486805165637281, + "grad_norm": 0.8549379706382751, + "learning_rate": 6.717687465580509e-06, + "loss": 0.3687, + "step": 4804 + }, + { + "epoch": 1.3489612577203818, + "grad_norm": 0.6953334212303162, + "learning_rate": 6.716153427786388e-06, + "loss": 0.3656, + "step": 4805 + }, + { + "epoch": 1.3492419988770354, + "grad_norm": 0.7070066928863525, + "learning_rate": 6.714619206855046e-06, + "loss": 0.3889, + "step": 4806 + }, + { + "epoch": 1.3495227400336889, + "grad_norm": 0.7246028184890747, + "learning_rate": 6.713084802950205e-06, + "loss": 0.3919, + "step": 4807 + }, + { + "epoch": 1.3498034811903425, + "grad_norm": 0.7758237719535828, + "learning_rate": 6.711550216235607e-06, + "loss": 0.4151, + "step": 4808 + }, + { + "epoch": 1.350084222346996, + "grad_norm": 0.7004920840263367, + "learning_rate": 6.7100154468750135e-06, + "loss": 0.3666, + "step": 4809 + }, + { + "epoch": 1.3503649635036497, + "grad_norm": 0.7784600853919983, + "learning_rate": 6.7084804950322045e-06, + "loss": 0.4238, + "step": 4810 + }, + { + "epoch": 1.350645704660303, + "grad_norm": 0.6851533055305481, + "learning_rate": 6.706945360870982e-06, + "loss": 0.4086, + "step": 4811 + }, + { + "epoch": 1.3509264458169568, + "grad_norm": 0.7173120379447937, + "learning_rate": 6.705410044555165e-06, + "loss": 0.4195, + "step": 4812 + }, + { + "epoch": 1.3512071869736104, + "grad_norm": 0.846400260925293, + "learning_rate": 6.703874546248593e-06, + "loss": 0.4133, + "step": 4813 + }, + { + "epoch": 1.3514879281302639, + "grad_norm": 0.7642650008201599, + "learning_rate": 6.7023388661151265e-06, + "loss": 0.4062, + "step": 4814 + }, + { + "epoch": 1.3517686692869175, + "grad_norm": 0.6799836754798889, + "learning_rate": 6.700803004318641e-06, + "loss": 0.4051, + "step": 4815 + }, + { + "epoch": 1.352049410443571, + "grad_norm": 0.7603617906570435, + "learning_rate": 6.6992669610230345e-06, + "loss": 0.4453, + "step": 4816 + }, + { + "epoch": 1.3523301516002246, + "grad_norm": 0.6981773376464844, + "learning_rate": 6.697730736392226e-06, + "loss": 0.3966, + "step": 4817 + }, + { + "epoch": 1.352610892756878, + "grad_norm": 0.6745737195014954, + "learning_rate": 6.6961943305901515e-06, + "loss": 0.3902, + "step": 4818 + }, + { + "epoch": 1.3528916339135317, + "grad_norm": 0.7249606251716614, + "learning_rate": 6.694657743780767e-06, + "loss": 0.3549, + "step": 4819 + }, + { + "epoch": 1.3531723750701854, + "grad_norm": 0.6501696109771729, + "learning_rate": 6.6931209761280445e-06, + "loss": 0.3608, + "step": 4820 + }, + { + "epoch": 1.3534531162268388, + "grad_norm": 0.663993775844574, + "learning_rate": 6.691584027795981e-06, + "loss": 0.3795, + "step": 4821 + }, + { + "epoch": 1.3537338573834925, + "grad_norm": 0.6785128712654114, + "learning_rate": 6.690046898948589e-06, + "loss": 0.3785, + "step": 4822 + }, + { + "epoch": 1.354014598540146, + "grad_norm": 0.700816810131073, + "learning_rate": 6.688509589749901e-06, + "loss": 0.404, + "step": 4823 + }, + { + "epoch": 1.3542953396967996, + "grad_norm": 0.6838839650154114, + "learning_rate": 6.686972100363971e-06, + "loss": 0.3666, + "step": 4824 + }, + { + "epoch": 1.354576080853453, + "grad_norm": 0.6519018411636353, + "learning_rate": 6.685434430954869e-06, + "loss": 0.4002, + "step": 4825 + }, + { + "epoch": 1.3548568220101067, + "grad_norm": 0.6772266626358032, + "learning_rate": 6.683896581686685e-06, + "loss": 0.3679, + "step": 4826 + }, + { + "epoch": 1.3551375631667604, + "grad_norm": 0.7054872512817383, + "learning_rate": 6.682358552723529e-06, + "loss": 0.3874, + "step": 4827 + }, + { + "epoch": 1.3554183043234138, + "grad_norm": 0.6136159300804138, + "learning_rate": 6.68082034422953e-06, + "loss": 0.3956, + "step": 4828 + }, + { + "epoch": 1.3556990454800673, + "grad_norm": 0.6421452164649963, + "learning_rate": 6.679281956368836e-06, + "loss": 0.355, + "step": 4829 + }, + { + "epoch": 1.355979786636721, + "grad_norm": 0.6787823438644409, + "learning_rate": 6.6777433893056165e-06, + "loss": 0.3823, + "step": 4830 + }, + { + "epoch": 1.3562605277933746, + "grad_norm": 0.8672777414321899, + "learning_rate": 6.676204643204054e-06, + "loss": 0.3905, + "step": 4831 + }, + { + "epoch": 1.356541268950028, + "grad_norm": 0.7196842432022095, + "learning_rate": 6.674665718228356e-06, + "loss": 0.3964, + "step": 4832 + }, + { + "epoch": 1.3568220101066817, + "grad_norm": 0.7156896591186523, + "learning_rate": 6.673126614542746e-06, + "loss": 0.3911, + "step": 4833 + }, + { + "epoch": 1.3571027512633351, + "grad_norm": 0.672362744808197, + "learning_rate": 6.671587332311468e-06, + "loss": 0.4078, + "step": 4834 + }, + { + "epoch": 1.3573834924199888, + "grad_norm": 0.718880295753479, + "learning_rate": 6.670047871698786e-06, + "loss": 0.3649, + "step": 4835 + }, + { + "epoch": 1.3576642335766422, + "grad_norm": 0.643750786781311, + "learning_rate": 6.668508232868981e-06, + "loss": 0.3827, + "step": 4836 + }, + { + "epoch": 1.357944974733296, + "grad_norm": 0.6865444183349609, + "learning_rate": 6.666968415986352e-06, + "loss": 0.433, + "step": 4837 + }, + { + "epoch": 1.3582257158899496, + "grad_norm": 0.6739438772201538, + "learning_rate": 6.6654284212152195e-06, + "loss": 0.3741, + "step": 4838 + }, + { + "epoch": 1.358506457046603, + "grad_norm": 0.597885012626648, + "learning_rate": 6.663888248719923e-06, + "loss": 0.3479, + "step": 4839 + }, + { + "epoch": 1.3587871982032567, + "grad_norm": 0.6266430616378784, + "learning_rate": 6.6623478986648205e-06, + "loss": 0.3961, + "step": 4840 + }, + { + "epoch": 1.35906793935991, + "grad_norm": 0.5871906876564026, + "learning_rate": 6.660807371214286e-06, + "loss": 0.3815, + "step": 4841 + }, + { + "epoch": 1.3593486805165638, + "grad_norm": 0.6124931573867798, + "learning_rate": 6.6592666665327176e-06, + "loss": 0.3567, + "step": 4842 + }, + { + "epoch": 1.3596294216732172, + "grad_norm": 0.6495899558067322, + "learning_rate": 6.657725784784529e-06, + "loss": 0.3442, + "step": 4843 + }, + { + "epoch": 1.3599101628298709, + "grad_norm": 0.5793033838272095, + "learning_rate": 6.656184726134153e-06, + "loss": 0.3927, + "step": 4844 + }, + { + "epoch": 1.3601909039865245, + "grad_norm": 0.637464165687561, + "learning_rate": 6.654643490746042e-06, + "loss": 0.3471, + "step": 4845 + }, + { + "epoch": 1.360471645143178, + "grad_norm": 0.6115562915802002, + "learning_rate": 6.653102078784667e-06, + "loss": 0.3981, + "step": 4846 + }, + { + "epoch": 1.3607523862998314, + "grad_norm": 0.6376574039459229, + "learning_rate": 6.651560490414519e-06, + "loss": 0.3637, + "step": 4847 + }, + { + "epoch": 1.361033127456485, + "grad_norm": 0.6842833757400513, + "learning_rate": 6.6500187258001055e-06, + "loss": 0.3926, + "step": 4848 + }, + { + "epoch": 1.3613138686131387, + "grad_norm": 0.7011694312095642, + "learning_rate": 6.648476785105955e-06, + "loss": 0.3943, + "step": 4849 + }, + { + "epoch": 1.3615946097697922, + "grad_norm": 0.6348097324371338, + "learning_rate": 6.646934668496612e-06, + "loss": 0.4112, + "step": 4850 + }, + { + "epoch": 1.3618753509264458, + "grad_norm": 0.6216190457344055, + "learning_rate": 6.645392376136644e-06, + "loss": 0.3563, + "step": 4851 + }, + { + "epoch": 1.3621560920830995, + "grad_norm": 0.6453918814659119, + "learning_rate": 6.643849908190632e-06, + "loss": 0.3932, + "step": 4852 + }, + { + "epoch": 1.362436833239753, + "grad_norm": 0.6672836542129517, + "learning_rate": 6.642307264823182e-06, + "loss": 0.4234, + "step": 4853 + }, + { + "epoch": 1.3627175743964064, + "grad_norm": 0.6598169207572937, + "learning_rate": 6.640764446198912e-06, + "loss": 0.3894, + "step": 4854 + }, + { + "epoch": 1.36299831555306, + "grad_norm": 0.7138548493385315, + "learning_rate": 6.639221452482464e-06, + "loss": 0.3739, + "step": 4855 + }, + { + "epoch": 1.3632790567097137, + "grad_norm": 0.6872824430465698, + "learning_rate": 6.637678283838497e-06, + "loss": 0.4325, + "step": 4856 + }, + { + "epoch": 1.3635597978663672, + "grad_norm": 0.6734063625335693, + "learning_rate": 6.636134940431688e-06, + "loss": 0.3829, + "step": 4857 + }, + { + "epoch": 1.3638405390230208, + "grad_norm": 0.7197776436805725, + "learning_rate": 6.634591422426731e-06, + "loss": 0.397, + "step": 4858 + }, + { + "epoch": 1.3641212801796743, + "grad_norm": 0.6476694941520691, + "learning_rate": 6.633047729988343e-06, + "loss": 0.4005, + "step": 4859 + }, + { + "epoch": 1.364402021336328, + "grad_norm": 0.6788256764411926, + "learning_rate": 6.6315038632812565e-06, + "loss": 0.4071, + "step": 4860 + }, + { + "epoch": 1.3646827624929814, + "grad_norm": 0.6949490308761597, + "learning_rate": 6.629959822470223e-06, + "loss": 0.3624, + "step": 4861 + }, + { + "epoch": 1.364963503649635, + "grad_norm": 0.5946491956710815, + "learning_rate": 6.628415607720013e-06, + "loss": 0.3583, + "step": 4862 + }, + { + "epoch": 1.3652442448062887, + "grad_norm": 0.7144595980644226, + "learning_rate": 6.626871219195418e-06, + "loss": 0.3718, + "step": 4863 + }, + { + "epoch": 1.3655249859629421, + "grad_norm": 0.6326375007629395, + "learning_rate": 6.625326657061242e-06, + "loss": 0.4067, + "step": 4864 + }, + { + "epoch": 1.3658057271195958, + "grad_norm": 0.7528015375137329, + "learning_rate": 6.6237819214823115e-06, + "loss": 0.3922, + "step": 4865 + }, + { + "epoch": 1.3660864682762492, + "grad_norm": 0.7716882228851318, + "learning_rate": 6.622237012623473e-06, + "loss": 0.3999, + "step": 4866 + }, + { + "epoch": 1.366367209432903, + "grad_norm": 0.6801896095275879, + "learning_rate": 6.620691930649586e-06, + "loss": 0.3482, + "step": 4867 + }, + { + "epoch": 1.3666479505895563, + "grad_norm": 0.6368662118911743, + "learning_rate": 6.619146675725539e-06, + "loss": 0.3849, + "step": 4868 + }, + { + "epoch": 1.36692869174621, + "grad_norm": 0.6772675514221191, + "learning_rate": 6.617601248016224e-06, + "loss": 0.3671, + "step": 4869 + }, + { + "epoch": 1.3672094329028637, + "grad_norm": 0.6529778242111206, + "learning_rate": 6.616055647686565e-06, + "loss": 0.3821, + "step": 4870 + }, + { + "epoch": 1.3674901740595171, + "grad_norm": 0.6284794807434082, + "learning_rate": 6.6145098749014954e-06, + "loss": 0.3529, + "step": 4871 + }, + { + "epoch": 1.3677709152161706, + "grad_norm": 0.6432361006736755, + "learning_rate": 6.612963929825973e-06, + "loss": 0.3974, + "step": 4872 + }, + { + "epoch": 1.3680516563728242, + "grad_norm": 0.6178905367851257, + "learning_rate": 6.6114178126249694e-06, + "loss": 0.3484, + "step": 4873 + }, + { + "epoch": 1.3683323975294779, + "grad_norm": 0.6260415315628052, + "learning_rate": 6.6098715234634805e-06, + "loss": 0.3705, + "step": 4874 + }, + { + "epoch": 1.3686131386861313, + "grad_norm": 0.7609507441520691, + "learning_rate": 6.608325062506511e-06, + "loss": 0.4133, + "step": 4875 + }, + { + "epoch": 1.368893879842785, + "grad_norm": 0.6837617754936218, + "learning_rate": 6.606778429919093e-06, + "loss": 0.3862, + "step": 4876 + }, + { + "epoch": 1.3691746209994387, + "grad_norm": 0.6646842360496521, + "learning_rate": 6.605231625866272e-06, + "loss": 0.3645, + "step": 4877 + }, + { + "epoch": 1.369455362156092, + "grad_norm": 0.6701329946517944, + "learning_rate": 6.603684650513115e-06, + "loss": 0.4086, + "step": 4878 + }, + { + "epoch": 1.3697361033127455, + "grad_norm": 0.6591203212738037, + "learning_rate": 6.602137504024705e-06, + "loss": 0.3404, + "step": 4879 + }, + { + "epoch": 1.3700168444693992, + "grad_norm": 0.6211579442024231, + "learning_rate": 6.600590186566143e-06, + "loss": 0.3951, + "step": 4880 + }, + { + "epoch": 1.3702975856260529, + "grad_norm": 0.6577760577201843, + "learning_rate": 6.599042698302549e-06, + "loss": 0.3723, + "step": 4881 + }, + { + "epoch": 1.3705783267827063, + "grad_norm": 0.6943831443786621, + "learning_rate": 6.597495039399064e-06, + "loss": 0.3806, + "step": 4882 + }, + { + "epoch": 1.37085906793936, + "grad_norm": 0.6222783923149109, + "learning_rate": 6.595947210020841e-06, + "loss": 0.3814, + "step": 4883 + }, + { + "epoch": 1.3711398090960134, + "grad_norm": 0.6409193277359009, + "learning_rate": 6.594399210333057e-06, + "loss": 0.3773, + "step": 4884 + }, + { + "epoch": 1.371420550252667, + "grad_norm": 0.6976940631866455, + "learning_rate": 6.592851040500905e-06, + "loss": 0.4153, + "step": 4885 + }, + { + "epoch": 1.3717012914093205, + "grad_norm": 0.5962629318237305, + "learning_rate": 6.591302700689593e-06, + "loss": 0.3999, + "step": 4886 + }, + { + "epoch": 1.3719820325659742, + "grad_norm": 0.6599648594856262, + "learning_rate": 6.5897541910643545e-06, + "loss": 0.3937, + "step": 4887 + }, + { + "epoch": 1.3722627737226278, + "grad_norm": 0.6184437870979309, + "learning_rate": 6.5882055117904334e-06, + "loss": 0.3736, + "step": 4888 + }, + { + "epoch": 1.3725435148792813, + "grad_norm": 0.5633988380432129, + "learning_rate": 6.586656663033098e-06, + "loss": 0.3721, + "step": 4889 + }, + { + "epoch": 1.372824256035935, + "grad_norm": 0.6845596432685852, + "learning_rate": 6.5851076449576276e-06, + "loss": 0.3708, + "step": 4890 + }, + { + "epoch": 1.3731049971925884, + "grad_norm": 0.6809899210929871, + "learning_rate": 6.58355845772933e-06, + "loss": 0.3743, + "step": 4891 + }, + { + "epoch": 1.373385738349242, + "grad_norm": 0.7047613263130188, + "learning_rate": 6.582009101513518e-06, + "loss": 0.3859, + "step": 4892 + }, + { + "epoch": 1.3736664795058955, + "grad_norm": 0.6444752216339111, + "learning_rate": 6.580459576475534e-06, + "loss": 0.3774, + "step": 4893 + }, + { + "epoch": 1.3739472206625492, + "grad_norm": 0.6714679002761841, + "learning_rate": 6.578909882780732e-06, + "loss": 0.3867, + "step": 4894 + }, + { + "epoch": 1.3742279618192028, + "grad_norm": 0.7270873785018921, + "learning_rate": 6.577360020594487e-06, + "loss": 0.3859, + "step": 4895 + }, + { + "epoch": 1.3745087029758563, + "grad_norm": 0.6965042948722839, + "learning_rate": 6.575809990082189e-06, + "loss": 0.3975, + "step": 4896 + }, + { + "epoch": 1.3747894441325097, + "grad_norm": 0.6982359290122986, + "learning_rate": 6.574259791409248e-06, + "loss": 0.4052, + "step": 4897 + }, + { + "epoch": 1.3750701852891634, + "grad_norm": 0.667641282081604, + "learning_rate": 6.57270942474109e-06, + "loss": 0.3276, + "step": 4898 + }, + { + "epoch": 1.375350926445817, + "grad_norm": 0.7631139159202576, + "learning_rate": 6.571158890243166e-06, + "loss": 0.4356, + "step": 4899 + }, + { + "epoch": 1.3756316676024705, + "grad_norm": 0.6603594422340393, + "learning_rate": 6.5696081880809325e-06, + "loss": 0.3702, + "step": 4900 + }, + { + "epoch": 1.3759124087591241, + "grad_norm": 0.5941827893257141, + "learning_rate": 6.5680573184198745e-06, + "loss": 0.3826, + "step": 4901 + }, + { + "epoch": 1.3761931499157778, + "grad_norm": 0.6959962248802185, + "learning_rate": 6.566506281425492e-06, + "loss": 0.3845, + "step": 4902 + }, + { + "epoch": 1.3764738910724312, + "grad_norm": 0.7347832322120667, + "learning_rate": 6.5649550772633e-06, + "loss": 0.3814, + "step": 4903 + }, + { + "epoch": 1.3767546322290847, + "grad_norm": 0.6277152895927429, + "learning_rate": 6.563403706098833e-06, + "loss": 0.3303, + "step": 4904 + }, + { + "epoch": 1.3770353733857383, + "grad_norm": 0.677270770072937, + "learning_rate": 6.561852168097644e-06, + "loss": 0.3823, + "step": 4905 + }, + { + "epoch": 1.377316114542392, + "grad_norm": 0.7346622943878174, + "learning_rate": 6.560300463425306e-06, + "loss": 0.39, + "step": 4906 + }, + { + "epoch": 1.3775968556990454, + "grad_norm": 0.6248444318771362, + "learning_rate": 6.558748592247404e-06, + "loss": 0.3838, + "step": 4907 + }, + { + "epoch": 1.377877596855699, + "grad_norm": 0.6768785119056702, + "learning_rate": 6.557196554729547e-06, + "loss": 0.3925, + "step": 4908 + }, + { + "epoch": 1.3781583380123525, + "grad_norm": 0.703629195690155, + "learning_rate": 6.555644351037356e-06, + "loss": 0.3763, + "step": 4909 + }, + { + "epoch": 1.3784390791690062, + "grad_norm": 0.7016772031784058, + "learning_rate": 6.554091981336475e-06, + "loss": 0.4042, + "step": 4910 + }, + { + "epoch": 1.3787198203256597, + "grad_norm": 0.6389033794403076, + "learning_rate": 6.5525394457925605e-06, + "loss": 0.3759, + "step": 4911 + }, + { + "epoch": 1.3790005614823133, + "grad_norm": 0.6705660223960876, + "learning_rate": 6.550986744571291e-06, + "loss": 0.414, + "step": 4912 + }, + { + "epoch": 1.379281302638967, + "grad_norm": 0.6910821795463562, + "learning_rate": 6.549433877838362e-06, + "loss": 0.3772, + "step": 4913 + }, + { + "epoch": 1.3795620437956204, + "grad_norm": 0.6090564727783203, + "learning_rate": 6.547880845759486e-06, + "loss": 0.3552, + "step": 4914 + }, + { + "epoch": 1.379842784952274, + "grad_norm": 0.5981763601303101, + "learning_rate": 6.5463276485003905e-06, + "loss": 0.3699, + "step": 4915 + }, + { + "epoch": 1.3801235261089275, + "grad_norm": 0.6429130434989929, + "learning_rate": 6.544774286226824e-06, + "loss": 0.3839, + "step": 4916 + }, + { + "epoch": 1.3804042672655812, + "grad_norm": 0.7170768976211548, + "learning_rate": 6.543220759104552e-06, + "loss": 0.3879, + "step": 4917 + }, + { + "epoch": 1.3806850084222346, + "grad_norm": 0.6578571796417236, + "learning_rate": 6.541667067299358e-06, + "loss": 0.3756, + "step": 4918 + }, + { + "epoch": 1.3809657495788883, + "grad_norm": 0.6086967587471008, + "learning_rate": 6.5401132109770395e-06, + "loss": 0.4147, + "step": 4919 + }, + { + "epoch": 1.381246490735542, + "grad_norm": 0.6940132975578308, + "learning_rate": 6.538559190303418e-06, + "loss": 0.4086, + "step": 4920 + }, + { + "epoch": 1.3815272318921954, + "grad_norm": 0.5924859642982483, + "learning_rate": 6.537005005444328e-06, + "loss": 0.3927, + "step": 4921 + }, + { + "epoch": 1.3818079730488488, + "grad_norm": 0.6510424613952637, + "learning_rate": 6.535450656565621e-06, + "loss": 0.4249, + "step": 4922 + }, + { + "epoch": 1.3820887142055025, + "grad_norm": 0.647047221660614, + "learning_rate": 6.533896143833169e-06, + "loss": 0.3646, + "step": 4923 + }, + { + "epoch": 1.3823694553621562, + "grad_norm": 0.6695600152015686, + "learning_rate": 6.532341467412858e-06, + "loss": 0.3888, + "step": 4924 + }, + { + "epoch": 1.3826501965188096, + "grad_norm": 0.6573482751846313, + "learning_rate": 6.5307866274705955e-06, + "loss": 0.4147, + "step": 4925 + }, + { + "epoch": 1.3829309376754633, + "grad_norm": 0.6649966239929199, + "learning_rate": 6.529231624172303e-06, + "loss": 0.3755, + "step": 4926 + }, + { + "epoch": 1.3832116788321167, + "grad_norm": 0.6034950017929077, + "learning_rate": 6.527676457683921e-06, + "loss": 0.3802, + "step": 4927 + }, + { + "epoch": 1.3834924199887704, + "grad_norm": 0.6298536658287048, + "learning_rate": 6.526121128171408e-06, + "loss": 0.3943, + "step": 4928 + }, + { + "epoch": 1.3837731611454238, + "grad_norm": 0.6359330415725708, + "learning_rate": 6.524565635800739e-06, + "loss": 0.3633, + "step": 4929 + }, + { + "epoch": 1.3840539023020775, + "grad_norm": 0.6504920721054077, + "learning_rate": 6.523009980737904e-06, + "loss": 0.3355, + "step": 4930 + }, + { + "epoch": 1.3843346434587311, + "grad_norm": 0.6498956680297852, + "learning_rate": 6.521454163148917e-06, + "loss": 0.3937, + "step": 4931 + }, + { + "epoch": 1.3846153846153846, + "grad_norm": 0.7074066996574402, + "learning_rate": 6.519898183199802e-06, + "loss": 0.4384, + "step": 4932 + }, + { + "epoch": 1.3848961257720382, + "grad_norm": 0.6834805011749268, + "learning_rate": 6.518342041056604e-06, + "loss": 0.3631, + "step": 4933 + }, + { + "epoch": 1.3851768669286917, + "grad_norm": 0.6755250096321106, + "learning_rate": 6.516785736885387e-06, + "loss": 0.4312, + "step": 4934 + }, + { + "epoch": 1.3854576080853453, + "grad_norm": 0.7766903042793274, + "learning_rate": 6.515229270852228e-06, + "loss": 0.4174, + "step": 4935 + }, + { + "epoch": 1.3857383492419988, + "grad_norm": 0.7174411416053772, + "learning_rate": 6.513672643123223e-06, + "loss": 0.4022, + "step": 4936 + }, + { + "epoch": 1.3860190903986525, + "grad_norm": 0.8040292263031006, + "learning_rate": 6.512115853864487e-06, + "loss": 0.4064, + "step": 4937 + }, + { + "epoch": 1.3862998315553061, + "grad_norm": 0.6910301446914673, + "learning_rate": 6.51055890324215e-06, + "loss": 0.4323, + "step": 4938 + }, + { + "epoch": 1.3865805727119596, + "grad_norm": 0.6850510835647583, + "learning_rate": 6.50900179142236e-06, + "loss": 0.3709, + "step": 4939 + }, + { + "epoch": 1.3868613138686132, + "grad_norm": 0.7027866840362549, + "learning_rate": 6.507444518571284e-06, + "loss": 0.3695, + "step": 4940 + }, + { + "epoch": 1.3871420550252667, + "grad_norm": 0.6455894708633423, + "learning_rate": 6.505887084855103e-06, + "loss": 0.3557, + "step": 4941 + }, + { + "epoch": 1.3874227961819203, + "grad_norm": 0.6405916213989258, + "learning_rate": 6.504329490440016e-06, + "loss": 0.4098, + "step": 4942 + }, + { + "epoch": 1.3877035373385738, + "grad_norm": 0.6554495096206665, + "learning_rate": 6.502771735492238e-06, + "loss": 0.3433, + "step": 4943 + }, + { + "epoch": 1.3879842784952274, + "grad_norm": 0.6610128283500671, + "learning_rate": 6.5012138201780085e-06, + "loss": 0.4254, + "step": 4944 + }, + { + "epoch": 1.388265019651881, + "grad_norm": 0.6976993083953857, + "learning_rate": 6.499655744663577e-06, + "loss": 0.371, + "step": 4945 + }, + { + "epoch": 1.3885457608085345, + "grad_norm": 0.7344253063201904, + "learning_rate": 6.498097509115207e-06, + "loss": 0.3852, + "step": 4946 + }, + { + "epoch": 1.388826501965188, + "grad_norm": 0.6160487532615662, + "learning_rate": 6.496539113699189e-06, + "loss": 0.3353, + "step": 4947 + }, + { + "epoch": 1.3891072431218416, + "grad_norm": 0.6966599225997925, + "learning_rate": 6.4949805585818215e-06, + "loss": 0.4081, + "step": 4948 + }, + { + "epoch": 1.3893879842784953, + "grad_norm": 0.7261844873428345, + "learning_rate": 6.4934218439294265e-06, + "loss": 0.3848, + "step": 4949 + }, + { + "epoch": 1.3896687254351487, + "grad_norm": 0.6698335409164429, + "learning_rate": 6.491862969908339e-06, + "loss": 0.3474, + "step": 4950 + }, + { + "epoch": 1.3899494665918024, + "grad_norm": 0.6629172563552856, + "learning_rate": 6.4903039366849145e-06, + "loss": 0.3298, + "step": 4951 + }, + { + "epoch": 1.3902302077484558, + "grad_norm": 0.568679928779602, + "learning_rate": 6.488744744425519e-06, + "loss": 0.3688, + "step": 4952 + }, + { + "epoch": 1.3905109489051095, + "grad_norm": 0.7038002014160156, + "learning_rate": 6.487185393296542e-06, + "loss": 0.4113, + "step": 4953 + }, + { + "epoch": 1.390791690061763, + "grad_norm": 0.6793254017829895, + "learning_rate": 6.48562588346439e-06, + "loss": 0.3948, + "step": 4954 + }, + { + "epoch": 1.3910724312184166, + "grad_norm": 0.6914476752281189, + "learning_rate": 6.484066215095481e-06, + "loss": 0.3825, + "step": 4955 + }, + { + "epoch": 1.3913531723750703, + "grad_norm": 0.6814020872116089, + "learning_rate": 6.4825063883562545e-06, + "loss": 0.4026, + "step": 4956 + }, + { + "epoch": 1.3916339135317237, + "grad_norm": 0.7289256453514099, + "learning_rate": 6.480946403413166e-06, + "loss": 0.3818, + "step": 4957 + }, + { + "epoch": 1.3919146546883774, + "grad_norm": 0.7625842690467834, + "learning_rate": 6.4793862604326855e-06, + "loss": 0.3881, + "step": 4958 + }, + { + "epoch": 1.3921953958450308, + "grad_norm": 0.6781641244888306, + "learning_rate": 6.477825959581303e-06, + "loss": 0.4003, + "step": 4959 + }, + { + "epoch": 1.3924761370016845, + "grad_norm": 0.7230656147003174, + "learning_rate": 6.476265501025525e-06, + "loss": 0.378, + "step": 4960 + }, + { + "epoch": 1.392756878158338, + "grad_norm": 0.6661189198493958, + "learning_rate": 6.474704884931873e-06, + "loss": 0.3761, + "step": 4961 + }, + { + "epoch": 1.3930376193149916, + "grad_norm": 0.6882614493370056, + "learning_rate": 6.473144111466887e-06, + "loss": 0.3803, + "step": 4962 + }, + { + "epoch": 1.3933183604716453, + "grad_norm": 0.6907784938812256, + "learning_rate": 6.471583180797121e-06, + "loss": 0.3543, + "step": 4963 + }, + { + "epoch": 1.3935991016282987, + "grad_norm": 0.7187560796737671, + "learning_rate": 6.470022093089149e-06, + "loss": 0.388, + "step": 4964 + }, + { + "epoch": 1.3938798427849521, + "grad_norm": 0.6923490166664124, + "learning_rate": 6.468460848509561e-06, + "loss": 0.3994, + "step": 4965 + }, + { + "epoch": 1.3941605839416058, + "grad_norm": 0.7261494398117065, + "learning_rate": 6.466899447224963e-06, + "loss": 0.4299, + "step": 4966 + }, + { + "epoch": 1.3944413250982595, + "grad_norm": 0.6663200855255127, + "learning_rate": 6.465337889401978e-06, + "loss": 0.3598, + "step": 4967 + }, + { + "epoch": 1.394722066254913, + "grad_norm": 0.8251053690910339, + "learning_rate": 6.463776175207246e-06, + "loss": 0.382, + "step": 4968 + }, + { + "epoch": 1.3950028074115666, + "grad_norm": 0.730740487575531, + "learning_rate": 6.462214304807422e-06, + "loss": 0.3935, + "step": 4969 + }, + { + "epoch": 1.3952835485682202, + "grad_norm": 0.7686986327171326, + "learning_rate": 6.4606522783691816e-06, + "loss": 0.4, + "step": 4970 + }, + { + "epoch": 1.3955642897248737, + "grad_norm": 0.6586964130401611, + "learning_rate": 6.459090096059213e-06, + "loss": 0.4015, + "step": 4971 + }, + { + "epoch": 1.3958450308815271, + "grad_norm": 0.7674559354782104, + "learning_rate": 6.457527758044223e-06, + "loss": 0.4047, + "step": 4972 + }, + { + "epoch": 1.3961257720381808, + "grad_norm": 0.6154000759124756, + "learning_rate": 6.455965264490935e-06, + "loss": 0.3486, + "step": 4973 + }, + { + "epoch": 1.3964065131948344, + "grad_norm": 0.7266693711280823, + "learning_rate": 6.454402615566088e-06, + "loss": 0.3267, + "step": 4974 + }, + { + "epoch": 1.3966872543514879, + "grad_norm": 0.7308728098869324, + "learning_rate": 6.452839811436436e-06, + "loss": 0.4139, + "step": 4975 + }, + { + "epoch": 1.3969679955081415, + "grad_norm": 0.6754895448684692, + "learning_rate": 6.451276852268757e-06, + "loss": 0.3611, + "step": 4976 + }, + { + "epoch": 1.397248736664795, + "grad_norm": 0.5917999744415283, + "learning_rate": 6.449713738229835e-06, + "loss": 0.355, + "step": 4977 + }, + { + "epoch": 1.3975294778214487, + "grad_norm": 0.7222596406936646, + "learning_rate": 6.448150469486478e-06, + "loss": 0.4194, + "step": 4978 + }, + { + "epoch": 1.397810218978102, + "grad_norm": 0.6465803384780884, + "learning_rate": 6.446587046205511e-06, + "loss": 0.383, + "step": 4979 + }, + { + "epoch": 1.3980909601347558, + "grad_norm": 0.7141954302787781, + "learning_rate": 6.445023468553768e-06, + "loss": 0.3849, + "step": 4980 + }, + { + "epoch": 1.3983717012914094, + "grad_norm": 0.7008252739906311, + "learning_rate": 6.443459736698106e-06, + "loss": 0.4069, + "step": 4981 + }, + { + "epoch": 1.3986524424480629, + "grad_norm": 0.6069467067718506, + "learning_rate": 6.441895850805397e-06, + "loss": 0.4174, + "step": 4982 + }, + { + "epoch": 1.3989331836047165, + "grad_norm": 0.6798213720321655, + "learning_rate": 6.440331811042531e-06, + "loss": 0.3845, + "step": 4983 + }, + { + "epoch": 1.39921392476137, + "grad_norm": 0.6617259383201599, + "learning_rate": 6.43876761757641e-06, + "loss": 0.3566, + "step": 4984 + }, + { + "epoch": 1.3994946659180236, + "grad_norm": 0.6779190301895142, + "learning_rate": 6.4372032705739565e-06, + "loss": 0.3751, + "step": 4985 + }, + { + "epoch": 1.399775407074677, + "grad_norm": 0.6222606897354126, + "learning_rate": 6.435638770202106e-06, + "loss": 0.3957, + "step": 4986 + }, + { + "epoch": 1.4000561482313307, + "grad_norm": 0.710183322429657, + "learning_rate": 6.434074116627815e-06, + "loss": 0.4077, + "step": 4987 + }, + { + "epoch": 1.4003368893879844, + "grad_norm": 0.6390467286109924, + "learning_rate": 6.432509310018051e-06, + "loss": 0.368, + "step": 4988 + }, + { + "epoch": 1.4006176305446378, + "grad_norm": 0.6966363191604614, + "learning_rate": 6.430944350539804e-06, + "loss": 0.3659, + "step": 4989 + }, + { + "epoch": 1.4008983717012913, + "grad_norm": 0.6830536723136902, + "learning_rate": 6.429379238360073e-06, + "loss": 0.4024, + "step": 4990 + }, + { + "epoch": 1.401179112857945, + "grad_norm": 0.6816349029541016, + "learning_rate": 6.427813973645878e-06, + "loss": 0.3578, + "step": 4991 + }, + { + "epoch": 1.4014598540145986, + "grad_norm": 0.6343541145324707, + "learning_rate": 6.426248556564254e-06, + "loss": 0.3661, + "step": 4992 + }, + { + "epoch": 1.401740595171252, + "grad_norm": 0.7885885834693909, + "learning_rate": 6.424682987282255e-06, + "loss": 0.4378, + "step": 4993 + }, + { + "epoch": 1.4020213363279057, + "grad_norm": 0.6680176258087158, + "learning_rate": 6.423117265966946e-06, + "loss": 0.3843, + "step": 4994 + }, + { + "epoch": 1.4023020774845594, + "grad_norm": 0.6686276793479919, + "learning_rate": 6.4215513927854125e-06, + "loss": 0.3693, + "step": 4995 + }, + { + "epoch": 1.4025828186412128, + "grad_norm": 0.6055652499198914, + "learning_rate": 6.419985367904754e-06, + "loss": 0.3945, + "step": 4996 + }, + { + "epoch": 1.4028635597978663, + "grad_norm": 0.7359544038772583, + "learning_rate": 6.418419191492088e-06, + "loss": 0.4042, + "step": 4997 + }, + { + "epoch": 1.40314430095452, + "grad_norm": 0.7134687900543213, + "learning_rate": 6.416852863714545e-06, + "loss": 0.4205, + "step": 4998 + }, + { + "epoch": 1.4034250421111736, + "grad_norm": 0.6834651231765747, + "learning_rate": 6.415286384739277e-06, + "loss": 0.3803, + "step": 4999 + }, + { + "epoch": 1.403705783267827, + "grad_norm": 0.6677467823028564, + "learning_rate": 6.413719754733447e-06, + "loss": 0.4085, + "step": 5000 + }, + { + "epoch": 1.4039865244244807, + "grad_norm": 0.5563065409660339, + "learning_rate": 6.412152973864236e-06, + "loss": 0.3835, + "step": 5001 + }, + { + "epoch": 1.4042672655811341, + "grad_norm": 0.6281640529632568, + "learning_rate": 6.410586042298841e-06, + "loss": 0.3813, + "step": 5002 + }, + { + "epoch": 1.4045480067377878, + "grad_norm": 0.6774550676345825, + "learning_rate": 6.409018960204475e-06, + "loss": 0.3584, + "step": 5003 + }, + { + "epoch": 1.4048287478944412, + "grad_norm": 0.6842412948608398, + "learning_rate": 6.407451727748367e-06, + "loss": 0.3957, + "step": 5004 + }, + { + "epoch": 1.405109489051095, + "grad_norm": 0.7258167266845703, + "learning_rate": 6.405884345097764e-06, + "loss": 0.3367, + "step": 5005 + }, + { + "epoch": 1.4053902302077486, + "grad_norm": 0.710361897945404, + "learning_rate": 6.404316812419927e-06, + "loss": 0.4234, + "step": 5006 + }, + { + "epoch": 1.405670971364402, + "grad_norm": 0.6374581456184387, + "learning_rate": 6.402749129882131e-06, + "loss": 0.3832, + "step": 5007 + }, + { + "epoch": 1.4059517125210557, + "grad_norm": 0.6684624552726746, + "learning_rate": 6.401181297651672e-06, + "loss": 0.3808, + "step": 5008 + }, + { + "epoch": 1.406232453677709, + "grad_norm": 0.6398100256919861, + "learning_rate": 6.399613315895858e-06, + "loss": 0.3983, + "step": 5009 + }, + { + "epoch": 1.4065131948343628, + "grad_norm": 0.8064338564872742, + "learning_rate": 6.398045184782015e-06, + "loss": 0.3898, + "step": 5010 + }, + { + "epoch": 1.4067939359910162, + "grad_norm": 0.6622109413146973, + "learning_rate": 6.396476904477484e-06, + "loss": 0.3617, + "step": 5011 + }, + { + "epoch": 1.4070746771476699, + "grad_norm": 0.6772120594978333, + "learning_rate": 6.3949084751496215e-06, + "loss": 0.3579, + "step": 5012 + }, + { + "epoch": 1.4073554183043235, + "grad_norm": 0.781470537185669, + "learning_rate": 6.3933398969658e-06, + "loss": 0.3879, + "step": 5013 + }, + { + "epoch": 1.407636159460977, + "grad_norm": 0.6676806807518005, + "learning_rate": 6.3917711700934106e-06, + "loss": 0.4139, + "step": 5014 + }, + { + "epoch": 1.4079169006176304, + "grad_norm": 0.5935172438621521, + "learning_rate": 6.390202294699855e-06, + "loss": 0.4123, + "step": 5015 + }, + { + "epoch": 1.408197641774284, + "grad_norm": 0.6429613828659058, + "learning_rate": 6.388633270952558e-06, + "loss": 0.3915, + "step": 5016 + }, + { + "epoch": 1.4084783829309377, + "grad_norm": 0.6627713441848755, + "learning_rate": 6.387064099018953e-06, + "loss": 0.374, + "step": 5017 + }, + { + "epoch": 1.4087591240875912, + "grad_norm": 0.671201765537262, + "learning_rate": 6.385494779066492e-06, + "loss": 0.4084, + "step": 5018 + }, + { + "epoch": 1.4090398652442448, + "grad_norm": 0.7396451830863953, + "learning_rate": 6.383925311262643e-06, + "loss": 0.4551, + "step": 5019 + }, + { + "epoch": 1.4093206064008983, + "grad_norm": 0.7062227129936218, + "learning_rate": 6.382355695774892e-06, + "loss": 0.3951, + "step": 5020 + }, + { + "epoch": 1.409601347557552, + "grad_norm": 0.6947963237762451, + "learning_rate": 6.3807859327707375e-06, + "loss": 0.3887, + "step": 5021 + }, + { + "epoch": 1.4098820887142054, + "grad_norm": 0.6252344846725464, + "learning_rate": 6.379216022417695e-06, + "loss": 0.3988, + "step": 5022 + }, + { + "epoch": 1.410162829870859, + "grad_norm": 0.6357190608978271, + "learning_rate": 6.377645964883297e-06, + "loss": 0.3774, + "step": 5023 + }, + { + "epoch": 1.4104435710275127, + "grad_norm": 0.6681320667266846, + "learning_rate": 6.376075760335086e-06, + "loss": 0.403, + "step": 5024 + }, + { + "epoch": 1.4107243121841662, + "grad_norm": 0.715579628944397, + "learning_rate": 6.374505408940627e-06, + "loss": 0.4181, + "step": 5025 + }, + { + "epoch": 1.4110050533408198, + "grad_norm": 0.6407375931739807, + "learning_rate": 6.372934910867501e-06, + "loss": 0.3899, + "step": 5026 + }, + { + "epoch": 1.4112857944974733, + "grad_norm": 0.6472136974334717, + "learning_rate": 6.371364266283296e-06, + "loss": 0.3909, + "step": 5027 + }, + { + "epoch": 1.411566535654127, + "grad_norm": 0.681711733341217, + "learning_rate": 6.369793475355628e-06, + "loss": 0.4094, + "step": 5028 + }, + { + "epoch": 1.4118472768107804, + "grad_norm": 0.6502349376678467, + "learning_rate": 6.368222538252116e-06, + "loss": 0.406, + "step": 5029 + }, + { + "epoch": 1.412128017967434, + "grad_norm": 0.6270555257797241, + "learning_rate": 6.366651455140403e-06, + "loss": 0.3585, + "step": 5030 + }, + { + "epoch": 1.4124087591240877, + "grad_norm": 0.7469307780265808, + "learning_rate": 6.365080226188145e-06, + "loss": 0.3481, + "step": 5031 + }, + { + "epoch": 1.4126895002807411, + "grad_norm": 0.6458436250686646, + "learning_rate": 6.363508851563014e-06, + "loss": 0.4041, + "step": 5032 + }, + { + "epoch": 1.4129702414373948, + "grad_norm": 0.6175792813301086, + "learning_rate": 6.361937331432699e-06, + "loss": 0.375, + "step": 5033 + }, + { + "epoch": 1.4132509825940482, + "grad_norm": 0.7301008701324463, + "learning_rate": 6.360365665964901e-06, + "loss": 0.4322, + "step": 5034 + }, + { + "epoch": 1.413531723750702, + "grad_norm": 0.6470361351966858, + "learning_rate": 6.358793855327339e-06, + "loss": 0.3916, + "step": 5035 + }, + { + "epoch": 1.4138124649073553, + "grad_norm": 0.6229981780052185, + "learning_rate": 6.357221899687746e-06, + "loss": 0.3563, + "step": 5036 + }, + { + "epoch": 1.414093206064009, + "grad_norm": 0.65587317943573, + "learning_rate": 6.355649799213871e-06, + "loss": 0.3632, + "step": 5037 + }, + { + "epoch": 1.4143739472206627, + "grad_norm": 0.6841045022010803, + "learning_rate": 6.354077554073481e-06, + "loss": 0.3733, + "step": 5038 + }, + { + "epoch": 1.4146546883773161, + "grad_norm": 0.6161452531814575, + "learning_rate": 6.3525051644343545e-06, + "loss": 0.3751, + "step": 5039 + }, + { + "epoch": 1.4149354295339696, + "grad_norm": 0.70626300573349, + "learning_rate": 6.350932630464288e-06, + "loss": 0.3851, + "step": 5040 + }, + { + "epoch": 1.4152161706906232, + "grad_norm": 0.7271189093589783, + "learning_rate": 6.349359952331091e-06, + "loss": 0.3759, + "step": 5041 + }, + { + "epoch": 1.4154969118472769, + "grad_norm": 0.7051572203636169, + "learning_rate": 6.347787130202592e-06, + "loss": 0.3771, + "step": 5042 + }, + { + "epoch": 1.4157776530039303, + "grad_norm": 0.7936788201332092, + "learning_rate": 6.34621416424663e-06, + "loss": 0.4111, + "step": 5043 + }, + { + "epoch": 1.416058394160584, + "grad_norm": 0.6933422088623047, + "learning_rate": 6.344641054631065e-06, + "loss": 0.3619, + "step": 5044 + }, + { + "epoch": 1.4163391353172374, + "grad_norm": 0.6461235284805298, + "learning_rate": 6.343067801523769e-06, + "loss": 0.3907, + "step": 5045 + }, + { + "epoch": 1.416619876473891, + "grad_norm": 0.6267691254615784, + "learning_rate": 6.341494405092628e-06, + "loss": 0.3845, + "step": 5046 + }, + { + "epoch": 1.4169006176305445, + "grad_norm": 0.7046382427215576, + "learning_rate": 6.339920865505548e-06, + "loss": 0.3845, + "step": 5047 + }, + { + "epoch": 1.4171813587871982, + "grad_norm": 0.6728993058204651, + "learning_rate": 6.338347182930445e-06, + "loss": 0.3442, + "step": 5048 + }, + { + "epoch": 1.4174620999438519, + "grad_norm": 0.6944672465324402, + "learning_rate": 6.336773357535253e-06, + "loss": 0.3654, + "step": 5049 + }, + { + "epoch": 1.4177428411005053, + "grad_norm": 0.6600845456123352, + "learning_rate": 6.335199389487922e-06, + "loss": 0.385, + "step": 5050 + }, + { + "epoch": 1.418023582257159, + "grad_norm": 0.6061417460441589, + "learning_rate": 6.333625278956413e-06, + "loss": 0.4183, + "step": 5051 + }, + { + "epoch": 1.4183043234138124, + "grad_norm": 0.6814526915550232, + "learning_rate": 6.33205102610871e-06, + "loss": 0.4054, + "step": 5052 + }, + { + "epoch": 1.418585064570466, + "grad_norm": 0.6553138494491577, + "learning_rate": 6.330476631112803e-06, + "loss": 0.3896, + "step": 5053 + }, + { + "epoch": 1.4188658057271195, + "grad_norm": 0.6747038960456848, + "learning_rate": 6.328902094136704e-06, + "loss": 0.3696, + "step": 5054 + }, + { + "epoch": 1.4191465468837732, + "grad_norm": 0.580277681350708, + "learning_rate": 6.3273274153484384e-06, + "loss": 0.3924, + "step": 5055 + }, + { + "epoch": 1.4194272880404268, + "grad_norm": 0.6985778212547302, + "learning_rate": 6.325752594916046e-06, + "loss": 0.409, + "step": 5056 + }, + { + "epoch": 1.4197080291970803, + "grad_norm": 0.696919858455658, + "learning_rate": 6.324177633007578e-06, + "loss": 0.3494, + "step": 5057 + }, + { + "epoch": 1.4199887703537337, + "grad_norm": 0.6058250069618225, + "learning_rate": 6.322602529791109e-06, + "loss": 0.364, + "step": 5058 + }, + { + "epoch": 1.4202695115103874, + "grad_norm": 0.7440758347511292, + "learning_rate": 6.321027285434722e-06, + "loss": 0.3736, + "step": 5059 + }, + { + "epoch": 1.420550252667041, + "grad_norm": 0.6947153210639954, + "learning_rate": 6.31945190010652e-06, + "loss": 0.3567, + "step": 5060 + }, + { + "epoch": 1.4208309938236945, + "grad_norm": 0.6216912865638733, + "learning_rate": 6.317876373974616e-06, + "loss": 0.3485, + "step": 5061 + }, + { + "epoch": 1.4211117349803482, + "grad_norm": 0.7084161043167114, + "learning_rate": 6.3163007072071395e-06, + "loss": 0.4088, + "step": 5062 + }, + { + "epoch": 1.4213924761370018, + "grad_norm": 0.6685324907302856, + "learning_rate": 6.314724899972238e-06, + "loss": 0.4537, + "step": 5063 + }, + { + "epoch": 1.4216732172936553, + "grad_norm": 0.6328791975975037, + "learning_rate": 6.31314895243807e-06, + "loss": 0.3822, + "step": 5064 + }, + { + "epoch": 1.4219539584503087, + "grad_norm": 0.7225044965744019, + "learning_rate": 6.311572864772811e-06, + "loss": 0.3776, + "step": 5065 + }, + { + "epoch": 1.4222346996069624, + "grad_norm": 0.8034189343452454, + "learning_rate": 6.3099966371446556e-06, + "loss": 0.4106, + "step": 5066 + }, + { + "epoch": 1.422515440763616, + "grad_norm": 0.6231784224510193, + "learning_rate": 6.308420269721802e-06, + "loss": 0.3653, + "step": 5067 + }, + { + "epoch": 1.4227961819202695, + "grad_norm": 0.770412802696228, + "learning_rate": 6.306843762672474e-06, + "loss": 0.4098, + "step": 5068 + }, + { + "epoch": 1.4230769230769231, + "grad_norm": 0.7057575583457947, + "learning_rate": 6.305267116164908e-06, + "loss": 0.389, + "step": 5069 + }, + { + "epoch": 1.4233576642335766, + "grad_norm": 0.6264330744743347, + "learning_rate": 6.3036903303673495e-06, + "loss": 0.4156, + "step": 5070 + }, + { + "epoch": 1.4236384053902302, + "grad_norm": 0.6959660053253174, + "learning_rate": 6.302113405448069e-06, + "loss": 0.4026, + "step": 5071 + }, + { + "epoch": 1.4239191465468837, + "grad_norm": 0.6862159371376038, + "learning_rate": 6.300536341575342e-06, + "loss": 0.3975, + "step": 5072 + }, + { + "epoch": 1.4241998877035373, + "grad_norm": 0.7660701274871826, + "learning_rate": 6.2989591389174645e-06, + "loss": 0.3946, + "step": 5073 + }, + { + "epoch": 1.424480628860191, + "grad_norm": 0.6575465202331543, + "learning_rate": 6.2973817976427455e-06, + "loss": 0.3739, + "step": 5074 + }, + { + "epoch": 1.4247613700168444, + "grad_norm": 0.6916999220848083, + "learning_rate": 6.295804317919507e-06, + "loss": 0.4288, + "step": 5075 + }, + { + "epoch": 1.425042111173498, + "grad_norm": 0.6507255434989929, + "learning_rate": 6.294226699916092e-06, + "loss": 0.3685, + "step": 5076 + }, + { + "epoch": 1.4253228523301515, + "grad_norm": 0.6341503858566284, + "learning_rate": 6.292648943800852e-06, + "loss": 0.3885, + "step": 5077 + }, + { + "epoch": 1.4256035934868052, + "grad_norm": 0.623775064945221, + "learning_rate": 6.291071049742154e-06, + "loss": 0.4102, + "step": 5078 + }, + { + "epoch": 1.4258843346434587, + "grad_norm": 0.6329973340034485, + "learning_rate": 6.289493017908383e-06, + "loss": 0.4072, + "step": 5079 + }, + { + "epoch": 1.4261650758001123, + "grad_norm": 0.7985526323318481, + "learning_rate": 6.287914848467935e-06, + "loss": 0.4063, + "step": 5080 + }, + { + "epoch": 1.426445816956766, + "grad_norm": 0.6926229596138, + "learning_rate": 6.286336541589224e-06, + "loss": 0.3551, + "step": 5081 + }, + { + "epoch": 1.4267265581134194, + "grad_norm": 0.6243866682052612, + "learning_rate": 6.284758097440676e-06, + "loss": 0.3587, + "step": 5082 + }, + { + "epoch": 1.4270072992700729, + "grad_norm": 0.7212977409362793, + "learning_rate": 6.283179516190734e-06, + "loss": 0.3762, + "step": 5083 + }, + { + "epoch": 1.4272880404267265, + "grad_norm": 0.6903027892112732, + "learning_rate": 6.281600798007853e-06, + "loss": 0.4086, + "step": 5084 + }, + { + "epoch": 1.4275687815833802, + "grad_norm": 0.7182713150978088, + "learning_rate": 6.280021943060505e-06, + "loss": 0.3884, + "step": 5085 + }, + { + "epoch": 1.4278495227400336, + "grad_norm": 0.7404643297195435, + "learning_rate": 6.278442951517174e-06, + "loss": 0.3623, + "step": 5086 + }, + { + "epoch": 1.4281302638966873, + "grad_norm": 0.6994548439979553, + "learning_rate": 6.276863823546362e-06, + "loss": 0.3857, + "step": 5087 + }, + { + "epoch": 1.428411005053341, + "grad_norm": 0.7237161993980408, + "learning_rate": 6.275284559316585e-06, + "loss": 0.4047, + "step": 5088 + }, + { + "epoch": 1.4286917462099944, + "grad_norm": 0.7051738500595093, + "learning_rate": 6.273705158996368e-06, + "loss": 0.4071, + "step": 5089 + }, + { + "epoch": 1.4289724873666478, + "grad_norm": 0.7517426013946533, + "learning_rate": 6.272125622754257e-06, + "loss": 0.3897, + "step": 5090 + }, + { + "epoch": 1.4292532285233015, + "grad_norm": 0.8073099851608276, + "learning_rate": 6.2705459507588115e-06, + "loss": 0.4211, + "step": 5091 + }, + { + "epoch": 1.4295339696799552, + "grad_norm": 0.5920656323432922, + "learning_rate": 6.268966143178603e-06, + "loss": 0.3422, + "step": 5092 + }, + { + "epoch": 1.4298147108366086, + "grad_norm": 0.6506666541099548, + "learning_rate": 6.267386200182218e-06, + "loss": 0.3767, + "step": 5093 + }, + { + "epoch": 1.4300954519932623, + "grad_norm": 0.7356073260307312, + "learning_rate": 6.265806121938261e-06, + "loss": 0.4074, + "step": 5094 + }, + { + "epoch": 1.4303761931499157, + "grad_norm": 0.7603099346160889, + "learning_rate": 6.2642259086153445e-06, + "loss": 0.3942, + "step": 5095 + }, + { + "epoch": 1.4306569343065694, + "grad_norm": 0.6621203422546387, + "learning_rate": 6.2626455603821e-06, + "loss": 0.3211, + "step": 5096 + }, + { + "epoch": 1.4309376754632228, + "grad_norm": 0.657948911190033, + "learning_rate": 6.261065077407173e-06, + "loss": 0.3523, + "step": 5097 + }, + { + "epoch": 1.4312184166198765, + "grad_norm": 0.7687973380088806, + "learning_rate": 6.2594844598592256e-06, + "loss": 0.388, + "step": 5098 + }, + { + "epoch": 1.4314991577765301, + "grad_norm": 0.6468183398246765, + "learning_rate": 6.257903707906928e-06, + "loss": 0.3633, + "step": 5099 + }, + { + "epoch": 1.4317798989331836, + "grad_norm": 0.6889108419418335, + "learning_rate": 6.2563228217189686e-06, + "loss": 0.3722, + "step": 5100 + }, + { + "epoch": 1.4320606400898372, + "grad_norm": 0.6281739473342896, + "learning_rate": 6.25474180146405e-06, + "loss": 0.4251, + "step": 5101 + }, + { + "epoch": 1.4323413812464907, + "grad_norm": 0.6302910447120667, + "learning_rate": 6.25316064731089e-06, + "loss": 0.4036, + "step": 5102 + }, + { + "epoch": 1.4326221224031443, + "grad_norm": 0.7975131273269653, + "learning_rate": 6.251579359428217e-06, + "loss": 0.4034, + "step": 5103 + }, + { + "epoch": 1.4329028635597978, + "grad_norm": 0.6228559613227844, + "learning_rate": 6.249997937984778e-06, + "loss": 0.3894, + "step": 5104 + }, + { + "epoch": 1.4331836047164515, + "grad_norm": 0.7324758172035217, + "learning_rate": 6.248416383149335e-06, + "loss": 0.4363, + "step": 5105 + }, + { + "epoch": 1.4334643458731051, + "grad_norm": 0.729753315448761, + "learning_rate": 6.2468346950906565e-06, + "loss": 0.3883, + "step": 5106 + }, + { + "epoch": 1.4337450870297586, + "grad_norm": 0.6479659080505371, + "learning_rate": 6.245252873977533e-06, + "loss": 0.3705, + "step": 5107 + }, + { + "epoch": 1.434025828186412, + "grad_norm": 0.6021603941917419, + "learning_rate": 6.243670919978766e-06, + "loss": 0.4169, + "step": 5108 + }, + { + "epoch": 1.4343065693430657, + "grad_norm": 0.7919953465461731, + "learning_rate": 6.2420888332631735e-06, + "loss": 0.3934, + "step": 5109 + }, + { + "epoch": 1.4345873104997193, + "grad_norm": 0.7754859924316406, + "learning_rate": 6.240506613999585e-06, + "loss": 0.4505, + "step": 5110 + }, + { + "epoch": 1.4348680516563728, + "grad_norm": 0.6871572136878967, + "learning_rate": 6.238924262356845e-06, + "loss": 0.42, + "step": 5111 + }, + { + "epoch": 1.4351487928130264, + "grad_norm": 0.6975321173667908, + "learning_rate": 6.23734177850381e-06, + "loss": 0.3978, + "step": 5112 + }, + { + "epoch": 1.43542953396968, + "grad_norm": 0.6754171252250671, + "learning_rate": 6.235759162609356e-06, + "loss": 0.3773, + "step": 5113 + }, + { + "epoch": 1.4357102751263335, + "grad_norm": 0.6637935638427734, + "learning_rate": 6.23417641484237e-06, + "loss": 0.3749, + "step": 5114 + }, + { + "epoch": 1.435991016282987, + "grad_norm": 0.7114065289497375, + "learning_rate": 6.23259353537175e-06, + "loss": 0.4158, + "step": 5115 + }, + { + "epoch": 1.4362717574396406, + "grad_norm": 0.6718721389770508, + "learning_rate": 6.231010524366415e-06, + "loss": 0.4135, + "step": 5116 + }, + { + "epoch": 1.4365524985962943, + "grad_norm": 0.6565790176391602, + "learning_rate": 6.229427381995291e-06, + "loss": 0.383, + "step": 5117 + }, + { + "epoch": 1.4368332397529477, + "grad_norm": 0.6728942394256592, + "learning_rate": 6.2278441084273224e-06, + "loss": 0.3851, + "step": 5118 + }, + { + "epoch": 1.4371139809096014, + "grad_norm": 0.6315937042236328, + "learning_rate": 6.226260703831465e-06, + "loss": 0.3628, + "step": 5119 + }, + { + "epoch": 1.4373947220662548, + "grad_norm": 0.7243862748146057, + "learning_rate": 6.224677168376692e-06, + "loss": 0.4102, + "step": 5120 + }, + { + "epoch": 1.4376754632229085, + "grad_norm": 0.6211796998977661, + "learning_rate": 6.223093502231986e-06, + "loss": 0.4051, + "step": 5121 + }, + { + "epoch": 1.437956204379562, + "grad_norm": 0.7139093279838562, + "learning_rate": 6.221509705566348e-06, + "loss": 0.3756, + "step": 5122 + }, + { + "epoch": 1.4382369455362156, + "grad_norm": 0.6886172890663147, + "learning_rate": 6.21992577854879e-06, + "loss": 0.3514, + "step": 5123 + }, + { + "epoch": 1.4385176866928693, + "grad_norm": 0.6658490300178528, + "learning_rate": 6.21834172134834e-06, + "loss": 0.3719, + "step": 5124 + }, + { + "epoch": 1.4387984278495227, + "grad_norm": 0.6449331641197205, + "learning_rate": 6.216757534134037e-06, + "loss": 0.3596, + "step": 5125 + }, + { + "epoch": 1.4390791690061764, + "grad_norm": 0.689836323261261, + "learning_rate": 6.215173217074938e-06, + "loss": 0.417, + "step": 5126 + }, + { + "epoch": 1.4393599101628298, + "grad_norm": 0.6353428959846497, + "learning_rate": 6.213588770340109e-06, + "loss": 0.4016, + "step": 5127 + }, + { + "epoch": 1.4396406513194835, + "grad_norm": 0.6529582738876343, + "learning_rate": 6.212004194098633e-06, + "loss": 0.3354, + "step": 5128 + }, + { + "epoch": 1.439921392476137, + "grad_norm": 0.72930508852005, + "learning_rate": 6.210419488519606e-06, + "loss": 0.3965, + "step": 5129 + }, + { + "epoch": 1.4402021336327906, + "grad_norm": 0.6347339749336243, + "learning_rate": 6.208834653772139e-06, + "loss": 0.3941, + "step": 5130 + }, + { + "epoch": 1.4404828747894443, + "grad_norm": 0.6254330277442932, + "learning_rate": 6.207249690025354e-06, + "loss": 0.3676, + "step": 5131 + }, + { + "epoch": 1.4407636159460977, + "grad_norm": 0.6846107244491577, + "learning_rate": 6.205664597448392e-06, + "loss": 0.4145, + "step": 5132 + }, + { + "epoch": 1.4410443571027511, + "grad_norm": 0.6662903428077698, + "learning_rate": 6.204079376210399e-06, + "loss": 0.3774, + "step": 5133 + }, + { + "epoch": 1.4413250982594048, + "grad_norm": 0.7583791613578796, + "learning_rate": 6.2024940264805434e-06, + "loss": 0.387, + "step": 5134 + }, + { + "epoch": 1.4416058394160585, + "grad_norm": 0.6241505146026611, + "learning_rate": 6.200908548428003e-06, + "loss": 0.3911, + "step": 5135 + }, + { + "epoch": 1.441886580572712, + "grad_norm": 0.7127392888069153, + "learning_rate": 6.199322942221971e-06, + "loss": 0.3658, + "step": 5136 + }, + { + "epoch": 1.4421673217293656, + "grad_norm": 0.6791383624076843, + "learning_rate": 6.197737208031652e-06, + "loss": 0.4141, + "step": 5137 + }, + { + "epoch": 1.442448062886019, + "grad_norm": 0.7384278178215027, + "learning_rate": 6.196151346026267e-06, + "loss": 0.3746, + "step": 5138 + }, + { + "epoch": 1.4427288040426727, + "grad_norm": 0.6535114645957947, + "learning_rate": 6.1945653563750485e-06, + "loss": 0.3674, + "step": 5139 + }, + { + "epoch": 1.4430095451993261, + "grad_norm": 0.7013922333717346, + "learning_rate": 6.192979239247243e-06, + "loss": 0.4047, + "step": 5140 + }, + { + "epoch": 1.4432902863559798, + "grad_norm": 0.6561954617500305, + "learning_rate": 6.191392994812112e-06, + "loss": 0.4055, + "step": 5141 + }, + { + "epoch": 1.4435710275126334, + "grad_norm": 0.639331579208374, + "learning_rate": 6.18980662323893e-06, + "loss": 0.3724, + "step": 5142 + }, + { + "epoch": 1.4438517686692869, + "grad_norm": 0.7580430507659912, + "learning_rate": 6.1882201246969845e-06, + "loss": 0.4205, + "step": 5143 + }, + { + "epoch": 1.4441325098259405, + "grad_norm": 0.7018490433692932, + "learning_rate": 6.186633499355576e-06, + "loss": 0.398, + "step": 5144 + }, + { + "epoch": 1.444413250982594, + "grad_norm": 0.6287434101104736, + "learning_rate": 6.185046747384018e-06, + "loss": 0.3624, + "step": 5145 + }, + { + "epoch": 1.4446939921392477, + "grad_norm": 0.684122622013092, + "learning_rate": 6.183459868951642e-06, + "loss": 0.3724, + "step": 5146 + }, + { + "epoch": 1.444974733295901, + "grad_norm": 0.6380981802940369, + "learning_rate": 6.181872864227787e-06, + "loss": 0.4284, + "step": 5147 + }, + { + "epoch": 1.4452554744525548, + "grad_norm": 0.6782366633415222, + "learning_rate": 6.180285733381811e-06, + "loss": 0.3887, + "step": 5148 + }, + { + "epoch": 1.4455362156092084, + "grad_norm": 0.6177129149436951, + "learning_rate": 6.17869847658308e-06, + "loss": 0.4087, + "step": 5149 + }, + { + "epoch": 1.4458169567658619, + "grad_norm": 0.649387001991272, + "learning_rate": 6.177111094000978e-06, + "loss": 0.3467, + "step": 5150 + }, + { + "epoch": 1.4460976979225155, + "grad_norm": 0.7056505680084229, + "learning_rate": 6.175523585804901e-06, + "loss": 0.3866, + "step": 5151 + }, + { + "epoch": 1.446378439079169, + "grad_norm": 0.6559800505638123, + "learning_rate": 6.173935952164256e-06, + "loss": 0.3893, + "step": 5152 + }, + { + "epoch": 1.4466591802358226, + "grad_norm": 0.7441030740737915, + "learning_rate": 6.172348193248466e-06, + "loss": 0.3607, + "step": 5153 + }, + { + "epoch": 1.446939921392476, + "grad_norm": 0.6608142852783203, + "learning_rate": 6.170760309226969e-06, + "loss": 0.4059, + "step": 5154 + }, + { + "epoch": 1.4472206625491297, + "grad_norm": 0.6568796038627625, + "learning_rate": 6.169172300269211e-06, + "loss": 0.3678, + "step": 5155 + }, + { + "epoch": 1.4475014037057834, + "grad_norm": 0.676991879940033, + "learning_rate": 6.167584166544655e-06, + "loss": 0.3704, + "step": 5156 + }, + { + "epoch": 1.4477821448624368, + "grad_norm": 0.7802074551582336, + "learning_rate": 6.165995908222778e-06, + "loss": 0.4331, + "step": 5157 + }, + { + "epoch": 1.4480628860190903, + "grad_norm": 0.7982289791107178, + "learning_rate": 6.164407525473069e-06, + "loss": 0.4263, + "step": 5158 + }, + { + "epoch": 1.448343627175744, + "grad_norm": 0.5990244150161743, + "learning_rate": 6.162819018465029e-06, + "loss": 0.3738, + "step": 5159 + }, + { + "epoch": 1.4486243683323976, + "grad_norm": 0.6717208027839661, + "learning_rate": 6.161230387368175e-06, + "loss": 0.3954, + "step": 5160 + }, + { + "epoch": 1.448905109489051, + "grad_norm": 0.8350700736045837, + "learning_rate": 6.159641632352036e-06, + "loss": 0.4125, + "step": 5161 + }, + { + "epoch": 1.4491858506457047, + "grad_norm": 0.684823215007782, + "learning_rate": 6.158052753586152e-06, + "loss": 0.4194, + "step": 5162 + }, + { + "epoch": 1.4494665918023582, + "grad_norm": 0.6564561724662781, + "learning_rate": 6.15646375124008e-06, + "loss": 0.4372, + "step": 5163 + }, + { + "epoch": 1.4497473329590118, + "grad_norm": 0.6363829970359802, + "learning_rate": 6.154874625483388e-06, + "loss": 0.4184, + "step": 5164 + }, + { + "epoch": 1.4500280741156653, + "grad_norm": 0.670998752117157, + "learning_rate": 6.153285376485659e-06, + "loss": 0.4047, + "step": 5165 + }, + { + "epoch": 1.450308815272319, + "grad_norm": 0.706078052520752, + "learning_rate": 6.1516960044164855e-06, + "loss": 0.3762, + "step": 5166 + }, + { + "epoch": 1.4505895564289726, + "grad_norm": 0.6266189813613892, + "learning_rate": 6.150106509445476e-06, + "loss": 0.4298, + "step": 5167 + }, + { + "epoch": 1.450870297585626, + "grad_norm": 0.5693655014038086, + "learning_rate": 6.148516891742251e-06, + "loss": 0.4207, + "step": 5168 + }, + { + "epoch": 1.4511510387422797, + "grad_norm": 0.7362409234046936, + "learning_rate": 6.146927151476447e-06, + "loss": 0.3714, + "step": 5169 + }, + { + "epoch": 1.4514317798989331, + "grad_norm": 0.5743008255958557, + "learning_rate": 6.145337288817709e-06, + "loss": 0.358, + "step": 5170 + }, + { + "epoch": 1.4517125210555868, + "grad_norm": 0.6924434900283813, + "learning_rate": 6.143747303935699e-06, + "loss": 0.4015, + "step": 5171 + }, + { + "epoch": 1.4519932622122402, + "grad_norm": 0.6799778342247009, + "learning_rate": 6.142157197000087e-06, + "loss": 0.389, + "step": 5172 + }, + { + "epoch": 1.452274003368894, + "grad_norm": 0.7883371710777283, + "learning_rate": 6.1405669681805634e-06, + "loss": 0.3762, + "step": 5173 + }, + { + "epoch": 1.4525547445255476, + "grad_norm": 0.6141828894615173, + "learning_rate": 6.138976617646824e-06, + "loss": 0.38, + "step": 5174 + }, + { + "epoch": 1.452835485682201, + "grad_norm": 0.6267702579498291, + "learning_rate": 6.137386145568584e-06, + "loss": 0.4254, + "step": 5175 + }, + { + "epoch": 1.4531162268388544, + "grad_norm": 0.6863150596618652, + "learning_rate": 6.135795552115569e-06, + "loss": 0.4153, + "step": 5176 + }, + { + "epoch": 1.453396967995508, + "grad_norm": 0.614777147769928, + "learning_rate": 6.134204837457514e-06, + "loss": 0.3838, + "step": 5177 + }, + { + "epoch": 1.4536777091521618, + "grad_norm": 0.6711261868476868, + "learning_rate": 6.132614001764171e-06, + "loss": 0.3868, + "step": 5178 + }, + { + "epoch": 1.4539584503088152, + "grad_norm": 0.6433554291725159, + "learning_rate": 6.131023045205306e-06, + "loss": 0.4148, + "step": 5179 + }, + { + "epoch": 1.4542391914654689, + "grad_norm": 0.6728886365890503, + "learning_rate": 6.129431967950695e-06, + "loss": 0.3798, + "step": 5180 + }, + { + "epoch": 1.4545199326221225, + "grad_norm": 0.6076109409332275, + "learning_rate": 6.127840770170128e-06, + "loss": 0.4035, + "step": 5181 + }, + { + "epoch": 1.454800673778776, + "grad_norm": 0.6427233815193176, + "learning_rate": 6.126249452033408e-06, + "loss": 0.3793, + "step": 5182 + }, + { + "epoch": 1.4550814149354294, + "grad_norm": 0.7428321838378906, + "learning_rate": 6.12465801371035e-06, + "loss": 0.3593, + "step": 5183 + }, + { + "epoch": 1.455362156092083, + "grad_norm": 0.6285116076469421, + "learning_rate": 6.123066455370782e-06, + "loss": 0.4224, + "step": 5184 + }, + { + "epoch": 1.4556428972487367, + "grad_norm": 0.652482271194458, + "learning_rate": 6.121474777184544e-06, + "loss": 0.3746, + "step": 5185 + }, + { + "epoch": 1.4559236384053902, + "grad_norm": 0.6782539486885071, + "learning_rate": 6.119882979321495e-06, + "loss": 0.4115, + "step": 5186 + }, + { + "epoch": 1.4562043795620438, + "grad_norm": 0.7005321383476257, + "learning_rate": 6.1182910619514975e-06, + "loss": 0.4285, + "step": 5187 + }, + { + "epoch": 1.4564851207186973, + "grad_norm": 0.7454401850700378, + "learning_rate": 6.116699025244431e-06, + "loss": 0.3917, + "step": 5188 + }, + { + "epoch": 1.456765861875351, + "grad_norm": 0.570483922958374, + "learning_rate": 6.11510686937019e-06, + "loss": 0.4052, + "step": 5189 + }, + { + "epoch": 1.4570466030320044, + "grad_norm": 0.5567541122436523, + "learning_rate": 6.113514594498677e-06, + "loss": 0.3812, + "step": 5190 + }, + { + "epoch": 1.457327344188658, + "grad_norm": 0.6308498382568359, + "learning_rate": 6.1119222007998125e-06, + "loss": 0.3748, + "step": 5191 + }, + { + "epoch": 1.4576080853453117, + "grad_norm": 0.6073638200759888, + "learning_rate": 6.110329688443526e-06, + "loss": 0.3943, + "step": 5192 + }, + { + "epoch": 1.4578888265019652, + "grad_norm": 0.6466220021247864, + "learning_rate": 6.108737057599758e-06, + "loss": 0.3835, + "step": 5193 + }, + { + "epoch": 1.4581695676586188, + "grad_norm": 0.6801443696022034, + "learning_rate": 6.107144308438466e-06, + "loss": 0.3877, + "step": 5194 + }, + { + "epoch": 1.4584503088152723, + "grad_norm": 0.6578395366668701, + "learning_rate": 6.105551441129619e-06, + "loss": 0.4343, + "step": 5195 + }, + { + "epoch": 1.458731049971926, + "grad_norm": 0.6512399315834045, + "learning_rate": 6.103958455843198e-06, + "loss": 0.396, + "step": 5196 + }, + { + "epoch": 1.4590117911285794, + "grad_norm": 0.6606807112693787, + "learning_rate": 6.102365352749193e-06, + "loss": 0.3757, + "step": 5197 + }, + { + "epoch": 1.459292532285233, + "grad_norm": 0.6820982098579407, + "learning_rate": 6.100772132017615e-06, + "loss": 0.3817, + "step": 5198 + }, + { + "epoch": 1.4595732734418867, + "grad_norm": 0.681573748588562, + "learning_rate": 6.099178793818479e-06, + "loss": 0.3855, + "step": 5199 + }, + { + "epoch": 1.4598540145985401, + "grad_norm": 0.5980532169342041, + "learning_rate": 6.097585338321819e-06, + "loss": 0.3946, + "step": 5200 + }, + { + "epoch": 1.4601347557551936, + "grad_norm": 0.6430398225784302, + "learning_rate": 6.095991765697675e-06, + "loss": 0.3408, + "step": 5201 + }, + { + "epoch": 1.4604154969118472, + "grad_norm": 0.694545328617096, + "learning_rate": 6.094398076116107e-06, + "loss": 0.3828, + "step": 5202 + }, + { + "epoch": 1.460696238068501, + "grad_norm": 0.6895719766616821, + "learning_rate": 6.092804269747183e-06, + "loss": 0.4021, + "step": 5203 + }, + { + "epoch": 1.4609769792251543, + "grad_norm": 0.6689839959144592, + "learning_rate": 6.091210346760981e-06, + "loss": 0.3952, + "step": 5204 + }, + { + "epoch": 1.461257720381808, + "grad_norm": 0.6083852648735046, + "learning_rate": 6.089616307327597e-06, + "loss": 0.4226, + "step": 5205 + }, + { + "epoch": 1.4615384615384617, + "grad_norm": 0.6489900350570679, + "learning_rate": 6.088022151617137e-06, + "loss": 0.3543, + "step": 5206 + }, + { + "epoch": 1.4618192026951151, + "grad_norm": 0.6742801070213318, + "learning_rate": 6.0864278797997176e-06, + "loss": 0.404, + "step": 5207 + }, + { + "epoch": 1.4620999438517686, + "grad_norm": 0.655458390712738, + "learning_rate": 6.084833492045472e-06, + "loss": 0.364, + "step": 5208 + }, + { + "epoch": 1.4623806850084222, + "grad_norm": 0.6500082612037659, + "learning_rate": 6.083238988524542e-06, + "loss": 0.3886, + "step": 5209 + }, + { + "epoch": 1.4626614261650759, + "grad_norm": 0.6608209609985352, + "learning_rate": 6.081644369407084e-06, + "loss": 0.3694, + "step": 5210 + }, + { + "epoch": 1.4629421673217293, + "grad_norm": 0.7076460719108582, + "learning_rate": 6.080049634863264e-06, + "loss": 0.4411, + "step": 5211 + }, + { + "epoch": 1.463222908478383, + "grad_norm": 0.6633089184761047, + "learning_rate": 6.078454785063263e-06, + "loss": 0.4391, + "step": 5212 + }, + { + "epoch": 1.4635036496350364, + "grad_norm": 0.696105420589447, + "learning_rate": 6.076859820177275e-06, + "loss": 0.3536, + "step": 5213 + }, + { + "epoch": 1.46378439079169, + "grad_norm": 0.6126047372817993, + "learning_rate": 6.075264740375505e-06, + "loss": 0.4349, + "step": 5214 + }, + { + "epoch": 1.4640651319483435, + "grad_norm": 0.6296899914741516, + "learning_rate": 6.073669545828167e-06, + "loss": 0.366, + "step": 5215 + }, + { + "epoch": 1.4643458731049972, + "grad_norm": 0.5691863894462585, + "learning_rate": 6.072074236705492e-06, + "loss": 0.3717, + "step": 5216 + }, + { + "epoch": 1.4646266142616509, + "grad_norm": 0.6138046383857727, + "learning_rate": 6.07047881317772e-06, + "loss": 0.3671, + "step": 5217 + }, + { + "epoch": 1.4649073554183043, + "grad_norm": 0.5991971492767334, + "learning_rate": 6.068883275415107e-06, + "loss": 0.4068, + "step": 5218 + }, + { + "epoch": 1.465188096574958, + "grad_norm": 0.6231683492660522, + "learning_rate": 6.067287623587917e-06, + "loss": 0.3644, + "step": 5219 + }, + { + "epoch": 1.4654688377316114, + "grad_norm": 0.6731587052345276, + "learning_rate": 6.0656918578664315e-06, + "loss": 0.4324, + "step": 5220 + }, + { + "epoch": 1.465749578888265, + "grad_norm": 0.6973851919174194, + "learning_rate": 6.064095978420936e-06, + "loss": 0.3871, + "step": 5221 + }, + { + "epoch": 1.4660303200449185, + "grad_norm": 0.6489410996437073, + "learning_rate": 6.0624999854217346e-06, + "loss": 0.3636, + "step": 5222 + }, + { + "epoch": 1.4663110612015722, + "grad_norm": 0.5894689559936523, + "learning_rate": 6.0609038790391415e-06, + "loss": 0.3416, + "step": 5223 + }, + { + "epoch": 1.4665918023582258, + "grad_norm": 0.6763150095939636, + "learning_rate": 6.059307659443484e-06, + "loss": 0.3928, + "step": 5224 + }, + { + "epoch": 1.4668725435148793, + "grad_norm": 0.7071971893310547, + "learning_rate": 6.0577113268051025e-06, + "loss": 0.397, + "step": 5225 + }, + { + "epoch": 1.4671532846715327, + "grad_norm": 0.7107048630714417, + "learning_rate": 6.0561148812943435e-06, + "loss": 0.3993, + "step": 5226 + }, + { + "epoch": 1.4674340258281864, + "grad_norm": 0.6285058856010437, + "learning_rate": 6.0545183230815725e-06, + "loss": 0.4107, + "step": 5227 + }, + { + "epoch": 1.46771476698484, + "grad_norm": 0.7170884013175964, + "learning_rate": 6.0529216523371635e-06, + "loss": 0.4125, + "step": 5228 + }, + { + "epoch": 1.4679955081414935, + "grad_norm": 0.5967025756835938, + "learning_rate": 6.051324869231504e-06, + "loss": 0.3696, + "step": 5229 + }, + { + "epoch": 1.4682762492981472, + "grad_norm": 0.673880934715271, + "learning_rate": 6.049727973934993e-06, + "loss": 0.3435, + "step": 5230 + }, + { + "epoch": 1.4685569904548006, + "grad_norm": 0.6483648419380188, + "learning_rate": 6.04813096661804e-06, + "loss": 0.388, + "step": 5231 + }, + { + "epoch": 1.4688377316114543, + "grad_norm": 0.7351468205451965, + "learning_rate": 6.046533847451067e-06, + "loss": 0.3824, + "step": 5232 + }, + { + "epoch": 1.4691184727681077, + "grad_norm": 0.7011606097221375, + "learning_rate": 6.04493661660451e-06, + "loss": 0.3654, + "step": 5233 + }, + { + "epoch": 1.4693992139247614, + "grad_norm": 0.6505141854286194, + "learning_rate": 6.043339274248816e-06, + "loss": 0.361, + "step": 5234 + }, + { + "epoch": 1.469679955081415, + "grad_norm": 0.6735453605651855, + "learning_rate": 6.041741820554442e-06, + "loss": 0.4113, + "step": 5235 + }, + { + "epoch": 1.4699606962380685, + "grad_norm": 0.596533477306366, + "learning_rate": 6.04014425569186e-06, + "loss": 0.3739, + "step": 5236 + }, + { + "epoch": 1.4702414373947221, + "grad_norm": 0.7310095429420471, + "learning_rate": 6.03854657983155e-06, + "loss": 0.379, + "step": 5237 + }, + { + "epoch": 1.4705221785513756, + "grad_norm": 0.7709599733352661, + "learning_rate": 6.036948793144008e-06, + "loss": 0.3856, + "step": 5238 + }, + { + "epoch": 1.4708029197080292, + "grad_norm": 0.7274706959724426, + "learning_rate": 6.035350895799739e-06, + "loss": 0.3892, + "step": 5239 + }, + { + "epoch": 1.4710836608646827, + "grad_norm": 0.632496178150177, + "learning_rate": 6.033752887969261e-06, + "loss": 0.3933, + "step": 5240 + }, + { + "epoch": 1.4713644020213363, + "grad_norm": 0.7112442255020142, + "learning_rate": 6.032154769823103e-06, + "loss": 0.4182, + "step": 5241 + }, + { + "epoch": 1.47164514317799, + "grad_norm": 0.6606371402740479, + "learning_rate": 6.030556541531808e-06, + "loss": 0.3856, + "step": 5242 + }, + { + "epoch": 1.4719258843346434, + "grad_norm": 0.6617858409881592, + "learning_rate": 6.028958203265926e-06, + "loss": 0.3883, + "step": 5243 + }, + { + "epoch": 1.472206625491297, + "grad_norm": 0.6967059969902039, + "learning_rate": 6.027359755196024e-06, + "loss": 0.427, + "step": 5244 + }, + { + "epoch": 1.4724873666479505, + "grad_norm": 0.5987151265144348, + "learning_rate": 6.0257611974926764e-06, + "loss": 0.4207, + "step": 5245 + }, + { + "epoch": 1.4727681078046042, + "grad_norm": 0.6150304675102234, + "learning_rate": 6.024162530326474e-06, + "loss": 0.4101, + "step": 5246 + }, + { + "epoch": 1.4730488489612577, + "grad_norm": 0.674147367477417, + "learning_rate": 6.022563753868014e-06, + "loss": 0.4238, + "step": 5247 + }, + { + "epoch": 1.4733295901179113, + "grad_norm": 0.6694990992546082, + "learning_rate": 6.0209648682879095e-06, + "loss": 0.4326, + "step": 5248 + }, + { + "epoch": 1.473610331274565, + "grad_norm": 0.664541482925415, + "learning_rate": 6.019365873756784e-06, + "loss": 0.4074, + "step": 5249 + }, + { + "epoch": 1.4738910724312184, + "grad_norm": 0.6293095350265503, + "learning_rate": 6.0177667704452706e-06, + "loss": 0.4024, + "step": 5250 + }, + { + "epoch": 1.4741718135878719, + "grad_norm": 0.6363199949264526, + "learning_rate": 6.0161675585240165e-06, + "loss": 0.3974, + "step": 5251 + }, + { + "epoch": 1.4744525547445255, + "grad_norm": 0.6345648169517517, + "learning_rate": 6.014568238163681e-06, + "loss": 0.3867, + "step": 5252 + }, + { + "epoch": 1.4747332959011792, + "grad_norm": 0.7065677046775818, + "learning_rate": 6.0129688095349315e-06, + "loss": 0.4126, + "step": 5253 + }, + { + "epoch": 1.4750140370578326, + "grad_norm": 0.6076880097389221, + "learning_rate": 6.011369272808449e-06, + "loss": 0.4191, + "step": 5254 + }, + { + "epoch": 1.4752947782144863, + "grad_norm": 0.6089729070663452, + "learning_rate": 6.009769628154928e-06, + "loss": 0.37, + "step": 5255 + }, + { + "epoch": 1.4755755193711397, + "grad_norm": 0.6556140184402466, + "learning_rate": 6.008169875745071e-06, + "loss": 0.4119, + "step": 5256 + }, + { + "epoch": 1.4758562605277934, + "grad_norm": 0.7148545980453491, + "learning_rate": 6.006570015749594e-06, + "loss": 0.3902, + "step": 5257 + }, + { + "epoch": 1.4761370016844468, + "grad_norm": 0.6153672933578491, + "learning_rate": 6.0049700483392256e-06, + "loss": 0.3719, + "step": 5258 + }, + { + "epoch": 1.4764177428411005, + "grad_norm": 0.5607266426086426, + "learning_rate": 6.003369973684703e-06, + "loss": 0.3671, + "step": 5259 + }, + { + "epoch": 1.4766984839977542, + "grad_norm": 0.6392752528190613, + "learning_rate": 6.0017697919567755e-06, + "loss": 0.3762, + "step": 5260 + }, + { + "epoch": 1.4769792251544076, + "grad_norm": 0.7464618682861328, + "learning_rate": 6.000169503326204e-06, + "loss": 0.3669, + "step": 5261 + }, + { + "epoch": 1.4772599663110613, + "grad_norm": 0.6039334535598755, + "learning_rate": 5.998569107963765e-06, + "loss": 0.3605, + "step": 5262 + }, + { + "epoch": 1.4775407074677147, + "grad_norm": 0.5802311897277832, + "learning_rate": 5.996968606040241e-06, + "loss": 0.3658, + "step": 5263 + }, + { + "epoch": 1.4778214486243684, + "grad_norm": 0.5855432152748108, + "learning_rate": 5.995367997726426e-06, + "loss": 0.4361, + "step": 5264 + }, + { + "epoch": 1.4781021897810218, + "grad_norm": 0.7310748100280762, + "learning_rate": 5.993767283193128e-06, + "loss": 0.4102, + "step": 5265 + }, + { + "epoch": 1.4783829309376755, + "grad_norm": 0.7403544187545776, + "learning_rate": 5.992166462611165e-06, + "loss": 0.3685, + "step": 5266 + }, + { + "epoch": 1.4786636720943291, + "grad_norm": 0.5496102571487427, + "learning_rate": 5.990565536151367e-06, + "loss": 0.4098, + "step": 5267 + }, + { + "epoch": 1.4789444132509826, + "grad_norm": 0.6038678288459778, + "learning_rate": 5.988964503984575e-06, + "loss": 0.3511, + "step": 5268 + }, + { + "epoch": 1.479225154407636, + "grad_norm": 0.6533830165863037, + "learning_rate": 5.9873633662816435e-06, + "loss": 0.3553, + "step": 5269 + }, + { + "epoch": 1.4795058955642897, + "grad_norm": 0.6522130966186523, + "learning_rate": 5.985762123213431e-06, + "loss": 0.3863, + "step": 5270 + }, + { + "epoch": 1.4797866367209433, + "grad_norm": 0.7871561050415039, + "learning_rate": 5.984160774950816e-06, + "loss": 0.4152, + "step": 5271 + }, + { + "epoch": 1.4800673778775968, + "grad_norm": 0.6167128086090088, + "learning_rate": 5.982559321664681e-06, + "loss": 0.3992, + "step": 5272 + }, + { + "epoch": 1.4803481190342505, + "grad_norm": 0.7990025281906128, + "learning_rate": 5.980957763525927e-06, + "loss": 0.3848, + "step": 5273 + }, + { + "epoch": 1.4806288601909041, + "grad_norm": 0.6659459471702576, + "learning_rate": 5.97935610070546e-06, + "loss": 0.4005, + "step": 5274 + }, + { + "epoch": 1.4809096013475576, + "grad_norm": 0.6580727696418762, + "learning_rate": 5.977754333374201e-06, + "loss": 0.3583, + "step": 5275 + }, + { + "epoch": 1.481190342504211, + "grad_norm": 0.7282503843307495, + "learning_rate": 5.9761524617030796e-06, + "loss": 0.3631, + "step": 5276 + }, + { + "epoch": 1.4814710836608647, + "grad_norm": 0.7209925055503845, + "learning_rate": 5.974550485863038e-06, + "loss": 0.3747, + "step": 5277 + }, + { + "epoch": 1.4817518248175183, + "grad_norm": 0.7016963362693787, + "learning_rate": 5.972948406025028e-06, + "loss": 0.3659, + "step": 5278 + }, + { + "epoch": 1.4820325659741718, + "grad_norm": 0.6650273203849792, + "learning_rate": 5.971346222360015e-06, + "loss": 0.3735, + "step": 5279 + }, + { + "epoch": 1.4823133071308254, + "grad_norm": 0.620310366153717, + "learning_rate": 5.969743935038974e-06, + "loss": 0.3954, + "step": 5280 + }, + { + "epoch": 1.4825940482874789, + "grad_norm": 0.7159186601638794, + "learning_rate": 5.96814154423289e-06, + "loss": 0.416, + "step": 5281 + }, + { + "epoch": 1.4828747894441325, + "grad_norm": 0.729147732257843, + "learning_rate": 5.966539050112761e-06, + "loss": 0.3969, + "step": 5282 + }, + { + "epoch": 1.483155530600786, + "grad_norm": 0.7633885145187378, + "learning_rate": 5.964936452849594e-06, + "loss": 0.4151, + "step": 5283 + }, + { + "epoch": 1.4834362717574396, + "grad_norm": 0.6439753174781799, + "learning_rate": 5.963333752614411e-06, + "loss": 0.4113, + "step": 5284 + }, + { + "epoch": 1.4837170129140933, + "grad_norm": 0.6582778692245483, + "learning_rate": 5.961730949578239e-06, + "loss": 0.3844, + "step": 5285 + }, + { + "epoch": 1.4839977540707467, + "grad_norm": 0.7423039674758911, + "learning_rate": 5.9601280439121224e-06, + "loss": 0.4004, + "step": 5286 + }, + { + "epoch": 1.4842784952274004, + "grad_norm": 0.6271695494651794, + "learning_rate": 5.9585250357871105e-06, + "loss": 0.3826, + "step": 5287 + }, + { + "epoch": 1.4845592363840538, + "grad_norm": 0.6961673498153687, + "learning_rate": 5.956921925374269e-06, + "loss": 0.3965, + "step": 5288 + }, + { + "epoch": 1.4848399775407075, + "grad_norm": 0.7871799468994141, + "learning_rate": 5.955318712844668e-06, + "loss": 0.4116, + "step": 5289 + }, + { + "epoch": 1.485120718697361, + "grad_norm": 0.548133134841919, + "learning_rate": 5.953715398369395e-06, + "loss": 0.3959, + "step": 5290 + }, + { + "epoch": 1.4854014598540146, + "grad_norm": 0.7311726212501526, + "learning_rate": 5.9521119821195475e-06, + "loss": 0.3948, + "step": 5291 + }, + { + "epoch": 1.4856822010106683, + "grad_norm": 0.5908982157707214, + "learning_rate": 5.9505084642662295e-06, + "loss": 0.3693, + "step": 5292 + }, + { + "epoch": 1.4859629421673217, + "grad_norm": 0.7001266479492188, + "learning_rate": 5.948904844980558e-06, + "loss": 0.4142, + "step": 5293 + }, + { + "epoch": 1.4862436833239752, + "grad_norm": 0.7001531720161438, + "learning_rate": 5.947301124433662e-06, + "loss": 0.3822, + "step": 5294 + }, + { + "epoch": 1.4865244244806288, + "grad_norm": 0.6736369132995605, + "learning_rate": 5.945697302796681e-06, + "loss": 0.3953, + "step": 5295 + }, + { + "epoch": 1.4868051656372825, + "grad_norm": 0.6351548433303833, + "learning_rate": 5.944093380240765e-06, + "loss": 0.3622, + "step": 5296 + }, + { + "epoch": 1.487085906793936, + "grad_norm": 0.664350152015686, + "learning_rate": 5.942489356937075e-06, + "loss": 0.4058, + "step": 5297 + }, + { + "epoch": 1.4873666479505896, + "grad_norm": 0.7064056396484375, + "learning_rate": 5.940885233056782e-06, + "loss": 0.387, + "step": 5298 + }, + { + "epoch": 1.4876473891072433, + "grad_norm": 0.6060472726821899, + "learning_rate": 5.9392810087710666e-06, + "loss": 0.348, + "step": 5299 + }, + { + "epoch": 1.4879281302638967, + "grad_norm": 0.5615829825401306, + "learning_rate": 5.937676684251124e-06, + "loss": 0.402, + "step": 5300 + }, + { + "epoch": 1.4882088714205501, + "grad_norm": 0.6724618673324585, + "learning_rate": 5.936072259668155e-06, + "loss": 0.3923, + "step": 5301 + }, + { + "epoch": 1.4884896125772038, + "grad_norm": 0.8215413689613342, + "learning_rate": 5.9344677351933785e-06, + "loss": 0.3659, + "step": 5302 + }, + { + "epoch": 1.4887703537338575, + "grad_norm": 0.6384742259979248, + "learning_rate": 5.932863110998014e-06, + "loss": 0.3858, + "step": 5303 + }, + { + "epoch": 1.489051094890511, + "grad_norm": 0.6819884777069092, + "learning_rate": 5.9312583872533e-06, + "loss": 0.4005, + "step": 5304 + }, + { + "epoch": 1.4893318360471646, + "grad_norm": 0.6694992184638977, + "learning_rate": 5.929653564130482e-06, + "loss": 0.4105, + "step": 5305 + }, + { + "epoch": 1.489612577203818, + "grad_norm": 0.6424428224563599, + "learning_rate": 5.928048641800817e-06, + "loss": 0.3477, + "step": 5306 + }, + { + "epoch": 1.4898933183604717, + "grad_norm": 0.651040256023407, + "learning_rate": 5.926443620435572e-06, + "loss": 0.389, + "step": 5307 + }, + { + "epoch": 1.4901740595171251, + "grad_norm": 0.6359730958938599, + "learning_rate": 5.924838500206026e-06, + "loss": 0.3662, + "step": 5308 + }, + { + "epoch": 1.4904548006737788, + "grad_norm": 0.707542896270752, + "learning_rate": 5.923233281283465e-06, + "loss": 0.439, + "step": 5309 + }, + { + "epoch": 1.4907355418304324, + "grad_norm": 0.6115970611572266, + "learning_rate": 5.92162796383919e-06, + "loss": 0.328, + "step": 5310 + }, + { + "epoch": 1.4910162829870859, + "grad_norm": 0.6650423407554626, + "learning_rate": 5.920022548044509e-06, + "loss": 0.3628, + "step": 5311 + }, + { + "epoch": 1.4912970241437395, + "grad_norm": 0.6958677172660828, + "learning_rate": 5.918417034070745e-06, + "loss": 0.3635, + "step": 5312 + }, + { + "epoch": 1.491577765300393, + "grad_norm": 0.678145170211792, + "learning_rate": 5.916811422089224e-06, + "loss": 0.3891, + "step": 5313 + }, + { + "epoch": 1.4918585064570467, + "grad_norm": 0.7073054909706116, + "learning_rate": 5.91520571227129e-06, + "loss": 0.4188, + "step": 5314 + }, + { + "epoch": 1.4921392476137, + "grad_norm": 0.6573716402053833, + "learning_rate": 5.913599904788294e-06, + "loss": 0.3435, + "step": 5315 + }, + { + "epoch": 1.4924199887703538, + "grad_norm": 0.5908942222595215, + "learning_rate": 5.9119939998115984e-06, + "loss": 0.3456, + "step": 5316 + }, + { + "epoch": 1.4927007299270074, + "grad_norm": 0.6898457407951355, + "learning_rate": 5.910387997512573e-06, + "loss": 0.4218, + "step": 5317 + }, + { + "epoch": 1.4929814710836609, + "grad_norm": 0.7363622188568115, + "learning_rate": 5.908781898062604e-06, + "loss": 0.3607, + "step": 5318 + }, + { + "epoch": 1.4932622122403143, + "grad_norm": 0.695950448513031, + "learning_rate": 5.90717570163308e-06, + "loss": 0.3921, + "step": 5319 + }, + { + "epoch": 1.493542953396968, + "grad_norm": 0.7554194927215576, + "learning_rate": 5.905569408395407e-06, + "loss": 0.4059, + "step": 5320 + }, + { + "epoch": 1.4938236945536216, + "grad_norm": 0.6438939571380615, + "learning_rate": 5.903963018520997e-06, + "loss": 0.3712, + "step": 5321 + }, + { + "epoch": 1.494104435710275, + "grad_norm": 0.6880878806114197, + "learning_rate": 5.902356532181277e-06, + "loss": 0.3861, + "step": 5322 + }, + { + "epoch": 1.4943851768669287, + "grad_norm": 0.684183657169342, + "learning_rate": 5.900749949547679e-06, + "loss": 0.4044, + "step": 5323 + }, + { + "epoch": 1.4946659180235822, + "grad_norm": 0.658062756061554, + "learning_rate": 5.899143270791648e-06, + "loss": 0.34, + "step": 5324 + }, + { + "epoch": 1.4949466591802358, + "grad_norm": 0.7454277276992798, + "learning_rate": 5.897536496084636e-06, + "loss": 0.4106, + "step": 5325 + }, + { + "epoch": 1.4952274003368893, + "grad_norm": 0.6257035732269287, + "learning_rate": 5.895929625598113e-06, + "loss": 0.3756, + "step": 5326 + }, + { + "epoch": 1.495508141493543, + "grad_norm": 0.6086931824684143, + "learning_rate": 5.894322659503551e-06, + "loss": 0.3817, + "step": 5327 + }, + { + "epoch": 1.4957888826501966, + "grad_norm": 0.593729555606842, + "learning_rate": 5.892715597972436e-06, + "loss": 0.408, + "step": 5328 + }, + { + "epoch": 1.49606962380685, + "grad_norm": 0.6202966570854187, + "learning_rate": 5.891108441176266e-06, + "loss": 0.3942, + "step": 5329 + }, + { + "epoch": 1.4963503649635037, + "grad_norm": 0.6379451751708984, + "learning_rate": 5.889501189286542e-06, + "loss": 0.397, + "step": 5330 + }, + { + "epoch": 1.4966311061201572, + "grad_norm": 0.5945736765861511, + "learning_rate": 5.887893842474783e-06, + "loss": 0.3994, + "step": 5331 + }, + { + "epoch": 1.4969118472768108, + "grad_norm": 0.721962034702301, + "learning_rate": 5.886286400912514e-06, + "loss": 0.4003, + "step": 5332 + }, + { + "epoch": 1.4971925884334643, + "grad_norm": 0.6053306460380554, + "learning_rate": 5.884678864771273e-06, + "loss": 0.3712, + "step": 5333 + }, + { + "epoch": 1.497473329590118, + "grad_norm": 0.670535147190094, + "learning_rate": 5.883071234222604e-06, + "loss": 0.4128, + "step": 5334 + }, + { + "epoch": 1.4977540707467716, + "grad_norm": 0.6948467493057251, + "learning_rate": 5.881463509438066e-06, + "loss": 0.3906, + "step": 5335 + }, + { + "epoch": 1.498034811903425, + "grad_norm": 0.6200743913650513, + "learning_rate": 5.879855690589223e-06, + "loss": 0.4406, + "step": 5336 + }, + { + "epoch": 1.4983155530600787, + "grad_norm": 0.6302000284194946, + "learning_rate": 5.8782477778476495e-06, + "loss": 0.3975, + "step": 5337 + }, + { + "epoch": 1.4985962942167321, + "grad_norm": 0.532776951789856, + "learning_rate": 5.876639771384938e-06, + "loss": 0.3407, + "step": 5338 + }, + { + "epoch": 1.4988770353733858, + "grad_norm": 0.607579231262207, + "learning_rate": 5.87503167137268e-06, + "loss": 0.4003, + "step": 5339 + }, + { + "epoch": 1.4991577765300392, + "grad_norm": 0.6802986860275269, + "learning_rate": 5.873423477982485e-06, + "loss": 0.4227, + "step": 5340 + }, + { + "epoch": 1.499438517686693, + "grad_norm": 0.6502795219421387, + "learning_rate": 5.871815191385967e-06, + "loss": 0.4116, + "step": 5341 + }, + { + "epoch": 1.4997192588433466, + "grad_norm": 0.6263981461524963, + "learning_rate": 5.8702068117547525e-06, + "loss": 0.3711, + "step": 5342 + }, + { + "epoch": 1.5, + "grad_norm": 0.6796420812606812, + "learning_rate": 5.86859833926048e-06, + "loss": 0.3804, + "step": 5343 + }, + { + "epoch": 1.5002807411566534, + "grad_norm": 0.6035329103469849, + "learning_rate": 5.8669897740747924e-06, + "loss": 0.4039, + "step": 5344 + }, + { + "epoch": 1.500561482313307, + "grad_norm": 0.7662019729614258, + "learning_rate": 5.865381116369348e-06, + "loss": 0.3979, + "step": 5345 + }, + { + "epoch": 1.5008422234699608, + "grad_norm": 0.6464301943778992, + "learning_rate": 5.863772366315814e-06, + "loss": 0.3918, + "step": 5346 + }, + { + "epoch": 1.5011229646266142, + "grad_norm": 0.6173228025436401, + "learning_rate": 5.8621635240858635e-06, + "loss": 0.4121, + "step": 5347 + }, + { + "epoch": 1.5014037057832679, + "grad_norm": 0.6510396003723145, + "learning_rate": 5.860554589851183e-06, + "loss": 0.3676, + "step": 5348 + }, + { + "epoch": 1.5016844469399215, + "grad_norm": 0.6980282068252563, + "learning_rate": 5.858945563783468e-06, + "loss": 0.3932, + "step": 5349 + }, + { + "epoch": 1.501965188096575, + "grad_norm": 0.7186689972877502, + "learning_rate": 5.857336446054423e-06, + "loss": 0.4263, + "step": 5350 + }, + { + "epoch": 1.5022459292532284, + "grad_norm": 0.7035834789276123, + "learning_rate": 5.8557272368357655e-06, + "loss": 0.3772, + "step": 5351 + }, + { + "epoch": 1.502526670409882, + "grad_norm": 0.644368588924408, + "learning_rate": 5.854117936299217e-06, + "loss": 0.3852, + "step": 5352 + }, + { + "epoch": 1.5028074115665357, + "grad_norm": 0.6306403279304504, + "learning_rate": 5.852508544616515e-06, + "loss": 0.4048, + "step": 5353 + }, + { + "epoch": 1.5030881527231892, + "grad_norm": 0.6548639535903931, + "learning_rate": 5.850899061959403e-06, + "loss": 0.3425, + "step": 5354 + }, + { + "epoch": 1.5033688938798426, + "grad_norm": 0.6650660037994385, + "learning_rate": 5.8492894884996334e-06, + "loss": 0.373, + "step": 5355 + }, + { + "epoch": 1.5036496350364965, + "grad_norm": 0.6372920274734497, + "learning_rate": 5.847679824408972e-06, + "loss": 0.3832, + "step": 5356 + }, + { + "epoch": 1.50393037619315, + "grad_norm": 0.6380687355995178, + "learning_rate": 5.846070069859191e-06, + "loss": 0.3824, + "step": 5357 + }, + { + "epoch": 1.5042111173498034, + "grad_norm": 0.5915088057518005, + "learning_rate": 5.8444602250220726e-06, + "loss": 0.3712, + "step": 5358 + }, + { + "epoch": 1.504491858506457, + "grad_norm": 0.661666989326477, + "learning_rate": 5.84285029006941e-06, + "loss": 0.3929, + "step": 5359 + }, + { + "epoch": 1.5047725996631107, + "grad_norm": 0.6691256761550903, + "learning_rate": 5.841240265173007e-06, + "loss": 0.3963, + "step": 5360 + }, + { + "epoch": 1.5050533408197642, + "grad_norm": 0.6810079216957092, + "learning_rate": 5.8396301505046735e-06, + "loss": 0.3857, + "step": 5361 + }, + { + "epoch": 1.5053340819764176, + "grad_norm": 0.7217987775802612, + "learning_rate": 5.8380199462362315e-06, + "loss": 0.3524, + "step": 5362 + }, + { + "epoch": 1.5056148231330713, + "grad_norm": 0.7055343985557556, + "learning_rate": 5.836409652539513e-06, + "loss": 0.4055, + "step": 5363 + }, + { + "epoch": 1.505895564289725, + "grad_norm": 0.681816577911377, + "learning_rate": 5.834799269586358e-06, + "loss": 0.4012, + "step": 5364 + }, + { + "epoch": 1.5061763054463784, + "grad_norm": 0.6335623264312744, + "learning_rate": 5.833188797548614e-06, + "loss": 0.3969, + "step": 5365 + }, + { + "epoch": 1.506457046603032, + "grad_norm": 0.7278549671173096, + "learning_rate": 5.831578236598145e-06, + "loss": 0.4264, + "step": 5366 + }, + { + "epoch": 1.5067377877596857, + "grad_norm": 0.8221374750137329, + "learning_rate": 5.8299675869068166e-06, + "loss": 0.3996, + "step": 5367 + }, + { + "epoch": 1.5070185289163391, + "grad_norm": 0.7820380330085754, + "learning_rate": 5.82835684864651e-06, + "loss": 0.3678, + "step": 5368 + }, + { + "epoch": 1.5072992700729926, + "grad_norm": 0.5716264843940735, + "learning_rate": 5.8267460219891105e-06, + "loss": 0.3813, + "step": 5369 + }, + { + "epoch": 1.5075800112296462, + "grad_norm": 0.7560267448425293, + "learning_rate": 5.825135107106517e-06, + "loss": 0.4229, + "step": 5370 + }, + { + "epoch": 1.5078607523863, + "grad_norm": 0.5916890501976013, + "learning_rate": 5.823524104170636e-06, + "loss": 0.3894, + "step": 5371 + }, + { + "epoch": 1.5081414935429533, + "grad_norm": 0.6364879012107849, + "learning_rate": 5.821913013353383e-06, + "loss": 0.3902, + "step": 5372 + }, + { + "epoch": 1.508422234699607, + "grad_norm": 0.7223371863365173, + "learning_rate": 5.820301834826685e-06, + "loss": 0.4066, + "step": 5373 + }, + { + "epoch": 1.5087029758562607, + "grad_norm": 0.7059696912765503, + "learning_rate": 5.818690568762477e-06, + "loss": 0.3313, + "step": 5374 + }, + { + "epoch": 1.5089837170129141, + "grad_norm": 0.7071421146392822, + "learning_rate": 5.817079215332703e-06, + "loss": 0.4008, + "step": 5375 + }, + { + "epoch": 1.5092644581695676, + "grad_norm": 0.7023800015449524, + "learning_rate": 5.815467774709314e-06, + "loss": 0.4313, + "step": 5376 + }, + { + "epoch": 1.5095451993262212, + "grad_norm": 0.5554521679878235, + "learning_rate": 5.813856247064276e-06, + "loss": 0.3913, + "step": 5377 + }, + { + "epoch": 1.5098259404828749, + "grad_norm": 0.6641628742218018, + "learning_rate": 5.812244632569561e-06, + "loss": 0.4017, + "step": 5378 + }, + { + "epoch": 1.5101066816395283, + "grad_norm": 0.6552072763442993, + "learning_rate": 5.81063293139715e-06, + "loss": 0.3742, + "step": 5379 + }, + { + "epoch": 1.5103874227961818, + "grad_norm": 0.7391794919967651, + "learning_rate": 5.8090211437190335e-06, + "loss": 0.4016, + "step": 5380 + }, + { + "epoch": 1.5106681639528357, + "grad_norm": 0.608917772769928, + "learning_rate": 5.807409269707211e-06, + "loss": 0.376, + "step": 5381 + }, + { + "epoch": 1.510948905109489, + "grad_norm": 0.6967856884002686, + "learning_rate": 5.805797309533692e-06, + "loss": 0.3931, + "step": 5382 + }, + { + "epoch": 1.5112296462661425, + "grad_norm": 0.6006859540939331, + "learning_rate": 5.8041852633704955e-06, + "loss": 0.3481, + "step": 5383 + }, + { + "epoch": 1.5115103874227962, + "grad_norm": 0.6341225504875183, + "learning_rate": 5.80257313138965e-06, + "loss": 0.4189, + "step": 5384 + }, + { + "epoch": 1.5117911285794499, + "grad_norm": 0.6314254999160767, + "learning_rate": 5.8009609137631886e-06, + "loss": 0.3615, + "step": 5385 + }, + { + "epoch": 1.5120718697361033, + "grad_norm": 0.7232957482337952, + "learning_rate": 5.7993486106631595e-06, + "loss": 0.3636, + "step": 5386 + }, + { + "epoch": 1.5123526108927567, + "grad_norm": 0.6637201309204102, + "learning_rate": 5.797736222261617e-06, + "loss": 0.3657, + "step": 5387 + }, + { + "epoch": 1.5126333520494104, + "grad_norm": 0.6195018887519836, + "learning_rate": 5.7961237487306265e-06, + "loss": 0.3679, + "step": 5388 + }, + { + "epoch": 1.512914093206064, + "grad_norm": 0.5997506976127625, + "learning_rate": 5.794511190242261e-06, + "loss": 0.4312, + "step": 5389 + }, + { + "epoch": 1.5131948343627175, + "grad_norm": 0.6237205266952515, + "learning_rate": 5.792898546968601e-06, + "loss": 0.3597, + "step": 5390 + }, + { + "epoch": 1.5134755755193712, + "grad_norm": 0.6758390665054321, + "learning_rate": 5.79128581908174e-06, + "loss": 0.414, + "step": 5391 + }, + { + "epoch": 1.5137563166760248, + "grad_norm": 0.6171358823776245, + "learning_rate": 5.789673006753776e-06, + "loss": 0.3444, + "step": 5392 + }, + { + "epoch": 1.5140370578326783, + "grad_norm": 0.7520563006401062, + "learning_rate": 5.788060110156819e-06, + "loss": 0.4251, + "step": 5393 + }, + { + "epoch": 1.5143177989893317, + "grad_norm": 0.7279042601585388, + "learning_rate": 5.786447129462989e-06, + "loss": 0.4328, + "step": 5394 + }, + { + "epoch": 1.5145985401459854, + "grad_norm": 0.5959756374359131, + "learning_rate": 5.784834064844411e-06, + "loss": 0.3886, + "step": 5395 + }, + { + "epoch": 1.514879281302639, + "grad_norm": 0.7278431057929993, + "learning_rate": 5.783220916473224e-06, + "loss": 0.408, + "step": 5396 + }, + { + "epoch": 1.5151600224592925, + "grad_norm": 0.7128639221191406, + "learning_rate": 5.781607684521568e-06, + "loss": 0.3918, + "step": 5397 + }, + { + "epoch": 1.5154407636159462, + "grad_norm": 0.7180032730102539, + "learning_rate": 5.779994369161602e-06, + "loss": 0.3705, + "step": 5398 + }, + { + "epoch": 1.5157215047725998, + "grad_norm": 0.6366936564445496, + "learning_rate": 5.778380970565488e-06, + "loss": 0.3333, + "step": 5399 + }, + { + "epoch": 1.5160022459292533, + "grad_norm": 0.6799874305725098, + "learning_rate": 5.776767488905397e-06, + "loss": 0.3705, + "step": 5400 + }, + { + "epoch": 1.5162829870859067, + "grad_norm": 0.6618354916572571, + "learning_rate": 5.7751539243535096e-06, + "loss": 0.3751, + "step": 5401 + }, + { + "epoch": 1.5165637282425604, + "grad_norm": 0.6983927488327026, + "learning_rate": 5.773540277082016e-06, + "loss": 0.3722, + "step": 5402 + }, + { + "epoch": 1.516844469399214, + "grad_norm": 0.7446848750114441, + "learning_rate": 5.7719265472631134e-06, + "loss": 0.3902, + "step": 5403 + }, + { + "epoch": 1.5171252105558675, + "grad_norm": 0.6266611218452454, + "learning_rate": 5.770312735069012e-06, + "loss": 0.409, + "step": 5404 + }, + { + "epoch": 1.517405951712521, + "grad_norm": 0.6330932974815369, + "learning_rate": 5.768698840671924e-06, + "loss": 0.3883, + "step": 5405 + }, + { + "epoch": 1.5176866928691746, + "grad_norm": 0.6011622548103333, + "learning_rate": 5.767084864244077e-06, + "loss": 0.3445, + "step": 5406 + }, + { + "epoch": 1.5179674340258282, + "grad_norm": 0.6243938207626343, + "learning_rate": 5.765470805957704e-06, + "loss": 0.3844, + "step": 5407 + }, + { + "epoch": 1.5182481751824817, + "grad_norm": 0.6916384100914001, + "learning_rate": 5.763856665985045e-06, + "loss": 0.4055, + "step": 5408 + }, + { + "epoch": 1.5185289163391353, + "grad_norm": 0.6826633810997009, + "learning_rate": 5.762242444498353e-06, + "loss": 0.4252, + "step": 5409 + }, + { + "epoch": 1.518809657495789, + "grad_norm": 0.7094565629959106, + "learning_rate": 5.7606281416698886e-06, + "loss": 0.3954, + "step": 5410 + }, + { + "epoch": 1.5190903986524424, + "grad_norm": 0.648792028427124, + "learning_rate": 5.7590137576719174e-06, + "loss": 0.3884, + "step": 5411 + }, + { + "epoch": 1.5193711398090959, + "grad_norm": 0.5709918141365051, + "learning_rate": 5.75739929267672e-06, + "loss": 0.4086, + "step": 5412 + }, + { + "epoch": 1.5196518809657495, + "grad_norm": 0.6704549789428711, + "learning_rate": 5.7557847468565785e-06, + "loss": 0.3918, + "step": 5413 + }, + { + "epoch": 1.5199326221224032, + "grad_norm": 0.6673620939254761, + "learning_rate": 5.754170120383789e-06, + "loss": 0.3996, + "step": 5414 + }, + { + "epoch": 1.5202133632790567, + "grad_norm": 0.6866858601570129, + "learning_rate": 5.752555413430654e-06, + "loss": 0.3987, + "step": 5415 + }, + { + "epoch": 1.5204941044357103, + "grad_norm": 0.6254552602767944, + "learning_rate": 5.7509406261694846e-06, + "loss": 0.3571, + "step": 5416 + }, + { + "epoch": 1.520774845592364, + "grad_norm": 0.6172268986701965, + "learning_rate": 5.749325758772604e-06, + "loss": 0.3999, + "step": 5417 + }, + { + "epoch": 1.5210555867490174, + "grad_norm": 0.686348021030426, + "learning_rate": 5.747710811412335e-06, + "loss": 0.3975, + "step": 5418 + }, + { + "epoch": 1.5213363279056709, + "grad_norm": 0.6760064959526062, + "learning_rate": 5.74609578426102e-06, + "loss": 0.386, + "step": 5419 + }, + { + "epoch": 1.5216170690623245, + "grad_norm": 0.6784769296646118, + "learning_rate": 5.744480677491001e-06, + "loss": 0.4086, + "step": 5420 + }, + { + "epoch": 1.5218978102189782, + "grad_norm": 0.5766478180885315, + "learning_rate": 5.742865491274634e-06, + "loss": 0.3686, + "step": 5421 + }, + { + "epoch": 1.5221785513756316, + "grad_norm": 0.614371657371521, + "learning_rate": 5.741250225784282e-06, + "loss": 0.3734, + "step": 5422 + }, + { + "epoch": 1.522459292532285, + "grad_norm": 0.6568720936775208, + "learning_rate": 5.739634881192316e-06, + "loss": 0.3863, + "step": 5423 + }, + { + "epoch": 1.522740033688939, + "grad_norm": 0.7889798879623413, + "learning_rate": 5.738019457671115e-06, + "loss": 0.4398, + "step": 5424 + }, + { + "epoch": 1.5230207748455924, + "grad_norm": 0.685434103012085, + "learning_rate": 5.736403955393066e-06, + "loss": 0.4039, + "step": 5425 + }, + { + "epoch": 1.5233015160022458, + "grad_norm": 0.5949421525001526, + "learning_rate": 5.734788374530565e-06, + "loss": 0.3699, + "step": 5426 + }, + { + "epoch": 1.5235822571588995, + "grad_norm": 0.6006492972373962, + "learning_rate": 5.733172715256019e-06, + "loss": 0.3843, + "step": 5427 + }, + { + "epoch": 1.5238629983155532, + "grad_norm": 0.6904444098472595, + "learning_rate": 5.731556977741841e-06, + "loss": 0.3813, + "step": 5428 + }, + { + "epoch": 1.5241437394722066, + "grad_norm": 0.7011377811431885, + "learning_rate": 5.729941162160452e-06, + "loss": 0.3952, + "step": 5429 + }, + { + "epoch": 1.52442448062886, + "grad_norm": 0.6623652577400208, + "learning_rate": 5.72832526868428e-06, + "loss": 0.3673, + "step": 5430 + }, + { + "epoch": 1.5247052217855137, + "grad_norm": 0.7013071775436401, + "learning_rate": 5.726709297485765e-06, + "loss": 0.3739, + "step": 5431 + }, + { + "epoch": 1.5249859629421674, + "grad_norm": 0.6543093919754028, + "learning_rate": 5.725093248737352e-06, + "loss": 0.4227, + "step": 5432 + }, + { + "epoch": 1.5252667040988208, + "grad_norm": 0.6951916217803955, + "learning_rate": 5.723477122611499e-06, + "loss": 0.3842, + "step": 5433 + }, + { + "epoch": 1.5255474452554745, + "grad_norm": 0.6171422600746155, + "learning_rate": 5.721860919280665e-06, + "loss": 0.4137, + "step": 5434 + }, + { + "epoch": 1.5258281864121281, + "grad_norm": 0.7122002243995667, + "learning_rate": 5.7202446389173225e-06, + "loss": 0.3993, + "step": 5435 + }, + { + "epoch": 1.5261089275687816, + "grad_norm": 0.6579063534736633, + "learning_rate": 5.7186282816939506e-06, + "loss": 0.3922, + "step": 5436 + }, + { + "epoch": 1.526389668725435, + "grad_norm": 0.6793777346611023, + "learning_rate": 5.717011847783039e-06, + "loss": 0.3699, + "step": 5437 + }, + { + "epoch": 1.5266704098820887, + "grad_norm": 0.7135227918624878, + "learning_rate": 5.715395337357079e-06, + "loss": 0.3952, + "step": 5438 + }, + { + "epoch": 1.5269511510387423, + "grad_norm": 0.7062365412712097, + "learning_rate": 5.71377875058858e-06, + "loss": 0.373, + "step": 5439 + }, + { + "epoch": 1.5272318921953958, + "grad_norm": 0.6488957405090332, + "learning_rate": 5.712162087650051e-06, + "loss": 0.3951, + "step": 5440 + }, + { + "epoch": 1.5275126333520495, + "grad_norm": 0.6518882513046265, + "learning_rate": 5.71054534871401e-06, + "loss": 0.3937, + "step": 5441 + }, + { + "epoch": 1.5277933745087031, + "grad_norm": 0.6719104051589966, + "learning_rate": 5.7089285339529906e-06, + "loss": 0.3696, + "step": 5442 + }, + { + "epoch": 1.5280741156653566, + "grad_norm": 0.6756449341773987, + "learning_rate": 5.707311643539526e-06, + "loss": 0.3965, + "step": 5443 + }, + { + "epoch": 1.52835485682201, + "grad_norm": 0.6698692440986633, + "learning_rate": 5.705694677646162e-06, + "loss": 0.3698, + "step": 5444 + }, + { + "epoch": 1.5286355979786637, + "grad_norm": 0.7081566452980042, + "learning_rate": 5.704077636445451e-06, + "loss": 0.362, + "step": 5445 + }, + { + "epoch": 1.5289163391353173, + "grad_norm": 0.5820019245147705, + "learning_rate": 5.702460520109952e-06, + "loss": 0.4134, + "step": 5446 + }, + { + "epoch": 1.5291970802919708, + "grad_norm": 0.7080488204956055, + "learning_rate": 5.700843328812234e-06, + "loss": 0.4127, + "step": 5447 + }, + { + "epoch": 1.5294778214486242, + "grad_norm": 0.6721872687339783, + "learning_rate": 5.699226062724874e-06, + "loss": 0.3849, + "step": 5448 + }, + { + "epoch": 1.529758562605278, + "grad_norm": 0.6288347840309143, + "learning_rate": 5.697608722020457e-06, + "loss": 0.3897, + "step": 5449 + }, + { + "epoch": 1.5300393037619315, + "grad_norm": 0.6673933863639832, + "learning_rate": 5.6959913068715755e-06, + "loss": 0.4067, + "step": 5450 + }, + { + "epoch": 1.530320044918585, + "grad_norm": 0.7181184887886047, + "learning_rate": 5.694373817450831e-06, + "loss": 0.4224, + "step": 5451 + }, + { + "epoch": 1.5306007860752386, + "grad_norm": 0.6134187579154968, + "learning_rate": 5.692756253930829e-06, + "loss": 0.4048, + "step": 5452 + }, + { + "epoch": 1.5308815272318923, + "grad_norm": 0.7308587431907654, + "learning_rate": 5.691138616484188e-06, + "loss": 0.4185, + "step": 5453 + }, + { + "epoch": 1.5311622683885457, + "grad_norm": 0.6228800415992737, + "learning_rate": 5.689520905283532e-06, + "loss": 0.3899, + "step": 5454 + }, + { + "epoch": 1.5314430095451992, + "grad_norm": 0.5505013465881348, + "learning_rate": 5.687903120501493e-06, + "loss": 0.3814, + "step": 5455 + }, + { + "epoch": 1.5317237507018528, + "grad_norm": 0.5905176401138306, + "learning_rate": 5.686285262310711e-06, + "loss": 0.4157, + "step": 5456 + }, + { + "epoch": 1.5320044918585065, + "grad_norm": 0.5852351188659668, + "learning_rate": 5.684667330883833e-06, + "loss": 0.3955, + "step": 5457 + }, + { + "epoch": 1.53228523301516, + "grad_norm": 0.7054872512817383, + "learning_rate": 5.683049326393515e-06, + "loss": 0.4057, + "step": 5458 + }, + { + "epoch": 1.5325659741718136, + "grad_norm": 0.6706883311271667, + "learning_rate": 5.681431249012421e-06, + "loss": 0.3792, + "step": 5459 + }, + { + "epoch": 1.5328467153284673, + "grad_norm": 0.6272156238555908, + "learning_rate": 5.679813098913222e-06, + "loss": 0.4217, + "step": 5460 + }, + { + "epoch": 1.5331274564851207, + "grad_norm": 0.6789434552192688, + "learning_rate": 5.6781948762685964e-06, + "loss": 0.3303, + "step": 5461 + }, + { + "epoch": 1.5334081976417742, + "grad_norm": 0.5765001773834229, + "learning_rate": 5.6765765812512305e-06, + "loss": 0.4084, + "step": 5462 + }, + { + "epoch": 1.5336889387984278, + "grad_norm": 0.6353691816329956, + "learning_rate": 5.674958214033819e-06, + "loss": 0.3779, + "step": 5463 + }, + { + "epoch": 1.5339696799550815, + "grad_norm": 0.6867348551750183, + "learning_rate": 5.6733397747890654e-06, + "loss": 0.3928, + "step": 5464 + }, + { + "epoch": 1.534250421111735, + "grad_norm": 0.6241796612739563, + "learning_rate": 5.671721263689675e-06, + "loss": 0.4177, + "step": 5465 + }, + { + "epoch": 1.5345311622683886, + "grad_norm": 0.6127474308013916, + "learning_rate": 5.670102680908372e-06, + "loss": 0.3968, + "step": 5466 + }, + { + "epoch": 1.5348119034250423, + "grad_norm": 0.7174510955810547, + "learning_rate": 5.668484026617878e-06, + "loss": 0.3708, + "step": 5467 + }, + { + "epoch": 1.5350926445816957, + "grad_norm": 0.6681004166603088, + "learning_rate": 5.666865300990923e-06, + "loss": 0.4002, + "step": 5468 + }, + { + "epoch": 1.5353733857383491, + "grad_norm": 0.6794464588165283, + "learning_rate": 5.665246504200253e-06, + "loss": 0.4073, + "step": 5469 + }, + { + "epoch": 1.5356541268950028, + "grad_norm": 0.6528903841972351, + "learning_rate": 5.663627636418611e-06, + "loss": 0.3979, + "step": 5470 + }, + { + "epoch": 1.5359348680516565, + "grad_norm": 0.5895650386810303, + "learning_rate": 5.662008697818754e-06, + "loss": 0.4001, + "step": 5471 + }, + { + "epoch": 1.53621560920831, + "grad_norm": 0.6318337917327881, + "learning_rate": 5.660389688573448e-06, + "loss": 0.3612, + "step": 5472 + }, + { + "epoch": 1.5364963503649633, + "grad_norm": 0.7096831798553467, + "learning_rate": 5.658770608855459e-06, + "loss": 0.3752, + "step": 5473 + }, + { + "epoch": 1.5367770915216172, + "grad_norm": 0.5959429740905762, + "learning_rate": 5.657151458837569e-06, + "loss": 0.3754, + "step": 5474 + }, + { + "epoch": 1.5370578326782707, + "grad_norm": 0.6204558610916138, + "learning_rate": 5.65553223869256e-06, + "loss": 0.3697, + "step": 5475 + }, + { + "epoch": 1.5373385738349241, + "grad_norm": 0.6069909930229187, + "learning_rate": 5.653912948593227e-06, + "loss": 0.3898, + "step": 5476 + }, + { + "epoch": 1.5376193149915778, + "grad_norm": 0.6009364128112793, + "learning_rate": 5.652293588712372e-06, + "loss": 0.3678, + "step": 5477 + }, + { + "epoch": 1.5379000561482314, + "grad_norm": 0.6900011301040649, + "learning_rate": 5.650674159222801e-06, + "loss": 0.4194, + "step": 5478 + }, + { + "epoch": 1.5381807973048849, + "grad_norm": 0.6191831231117249, + "learning_rate": 5.64905466029733e-06, + "loss": 0.361, + "step": 5479 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 0.6132239699363708, + "learning_rate": 5.64743509210878e-06, + "loss": 0.3786, + "step": 5480 + }, + { + "epoch": 1.538742279618192, + "grad_norm": 0.6818129420280457, + "learning_rate": 5.645815454829986e-06, + "loss": 0.3878, + "step": 5481 + }, + { + "epoch": 1.5390230207748457, + "grad_norm": 0.6904420256614685, + "learning_rate": 5.644195748633781e-06, + "loss": 0.3872, + "step": 5482 + }, + { + "epoch": 1.539303761931499, + "grad_norm": 0.5789188742637634, + "learning_rate": 5.642575973693013e-06, + "loss": 0.4358, + "step": 5483 + }, + { + "epoch": 1.5395845030881528, + "grad_norm": 0.6155490875244141, + "learning_rate": 5.640956130180533e-06, + "loss": 0.3667, + "step": 5484 + }, + { + "epoch": 1.5398652442448064, + "grad_norm": 0.7449417114257812, + "learning_rate": 5.639336218269199e-06, + "loss": 0.3935, + "step": 5485 + }, + { + "epoch": 1.5401459854014599, + "grad_norm": 0.6869012713432312, + "learning_rate": 5.6377162381318806e-06, + "loss": 0.3808, + "step": 5486 + }, + { + "epoch": 1.5404267265581133, + "grad_norm": 0.5921604037284851, + "learning_rate": 5.6360961899414515e-06, + "loss": 0.4041, + "step": 5487 + }, + { + "epoch": 1.540707467714767, + "grad_norm": 0.7154459357261658, + "learning_rate": 5.634476073870791e-06, + "loss": 0.3615, + "step": 5488 + }, + { + "epoch": 1.5409882088714206, + "grad_norm": 0.6011552214622498, + "learning_rate": 5.632855890092791e-06, + "loss": 0.4014, + "step": 5489 + }, + { + "epoch": 1.541268950028074, + "grad_norm": 0.8709184527397156, + "learning_rate": 5.631235638780345e-06, + "loss": 0.3773, + "step": 5490 + }, + { + "epoch": 1.5415496911847277, + "grad_norm": 0.710959255695343, + "learning_rate": 5.629615320106356e-06, + "loss": 0.3899, + "step": 5491 + }, + { + "epoch": 1.5418304323413814, + "grad_norm": 0.6088156700134277, + "learning_rate": 5.627994934243737e-06, + "loss": 0.4004, + "step": 5492 + }, + { + "epoch": 1.5421111734980348, + "grad_norm": 0.7583803534507751, + "learning_rate": 5.626374481365404e-06, + "loss": 0.4415, + "step": 5493 + }, + { + "epoch": 1.5423919146546883, + "grad_norm": 0.7273849248886108, + "learning_rate": 5.624753961644281e-06, + "loss": 0.4285, + "step": 5494 + }, + { + "epoch": 1.542672655811342, + "grad_norm": 0.7240543365478516, + "learning_rate": 5.623133375253301e-06, + "loss": 0.3664, + "step": 5495 + }, + { + "epoch": 1.5429533969679956, + "grad_norm": 0.6836703419685364, + "learning_rate": 5.621512722365401e-06, + "loss": 0.3607, + "step": 5496 + }, + { + "epoch": 1.543234138124649, + "grad_norm": 0.6441492438316345, + "learning_rate": 5.619892003153529e-06, + "loss": 0.4024, + "step": 5497 + }, + { + "epoch": 1.5435148792813025, + "grad_norm": 0.6888523101806641, + "learning_rate": 5.618271217790636e-06, + "loss": 0.3961, + "step": 5498 + }, + { + "epoch": 1.5437956204379562, + "grad_norm": 0.6339767575263977, + "learning_rate": 5.616650366449685e-06, + "loss": 0.3833, + "step": 5499 + }, + { + "epoch": 1.5440763615946098, + "grad_norm": 0.6974245309829712, + "learning_rate": 5.615029449303642e-06, + "loss": 0.3928, + "step": 5500 + }, + { + "epoch": 1.5443571027512633, + "grad_norm": 0.6889846920967102, + "learning_rate": 5.613408466525479e-06, + "loss": 0.3889, + "step": 5501 + }, + { + "epoch": 1.544637843907917, + "grad_norm": 0.6527835726737976, + "learning_rate": 5.61178741828818e-06, + "loss": 0.4131, + "step": 5502 + }, + { + "epoch": 1.5449185850645706, + "grad_norm": 0.6665393114089966, + "learning_rate": 5.610166304764732e-06, + "loss": 0.4026, + "step": 5503 + }, + { + "epoch": 1.545199326221224, + "grad_norm": 0.638247013092041, + "learning_rate": 5.60854512612813e-06, + "loss": 0.3957, + "step": 5504 + }, + { + "epoch": 1.5454800673778775, + "grad_norm": 0.7534692883491516, + "learning_rate": 5.6069238825513774e-06, + "loss": 0.3884, + "step": 5505 + }, + { + "epoch": 1.5457608085345311, + "grad_norm": 0.6691770553588867, + "learning_rate": 5.6053025742074805e-06, + "loss": 0.3791, + "step": 5506 + }, + { + "epoch": 1.5460415496911848, + "grad_norm": 0.5810383558273315, + "learning_rate": 5.603681201269458e-06, + "loss": 0.4009, + "step": 5507 + }, + { + "epoch": 1.5463222908478382, + "grad_norm": 0.6450157165527344, + "learning_rate": 5.6020597639103325e-06, + "loss": 0.4004, + "step": 5508 + }, + { + "epoch": 1.546603032004492, + "grad_norm": 0.7024112939834595, + "learning_rate": 5.600438262303132e-06, + "loss": 0.4178, + "step": 5509 + }, + { + "epoch": 1.5468837731611456, + "grad_norm": 0.6183632612228394, + "learning_rate": 5.598816696620895e-06, + "loss": 0.4037, + "step": 5510 + }, + { + "epoch": 1.547164514317799, + "grad_norm": 0.6242706775665283, + "learning_rate": 5.597195067036663e-06, + "loss": 0.3941, + "step": 5511 + }, + { + "epoch": 1.5474452554744524, + "grad_norm": 0.9261346459388733, + "learning_rate": 5.595573373723487e-06, + "loss": 0.4497, + "step": 5512 + }, + { + "epoch": 1.547725996631106, + "grad_norm": 0.7260050177574158, + "learning_rate": 5.593951616854425e-06, + "loss": 0.3641, + "step": 5513 + }, + { + "epoch": 1.5480067377877598, + "grad_norm": 0.5826572179794312, + "learning_rate": 5.59232979660254e-06, + "loss": 0.4066, + "step": 5514 + }, + { + "epoch": 1.5482874789444132, + "grad_norm": 0.7458318471908569, + "learning_rate": 5.590707913140901e-06, + "loss": 0.3966, + "step": 5515 + }, + { + "epoch": 1.5485682201010667, + "grad_norm": 0.61967533826828, + "learning_rate": 5.589085966642589e-06, + "loss": 0.3988, + "step": 5516 + }, + { + "epoch": 1.5488489612577205, + "grad_norm": 0.6171215772628784, + "learning_rate": 5.587463957280685e-06, + "loss": 0.3914, + "step": 5517 + }, + { + "epoch": 1.549129702414374, + "grad_norm": 0.6467472910881042, + "learning_rate": 5.585841885228281e-06, + "loss": 0.3976, + "step": 5518 + }, + { + "epoch": 1.5494104435710274, + "grad_norm": 0.7427176833152771, + "learning_rate": 5.584219750658473e-06, + "loss": 0.4056, + "step": 5519 + }, + { + "epoch": 1.549691184727681, + "grad_norm": 0.7593867182731628, + "learning_rate": 5.582597553744366e-06, + "loss": 0.398, + "step": 5520 + }, + { + "epoch": 1.5499719258843347, + "grad_norm": 0.6774862408638, + "learning_rate": 5.580975294659074e-06, + "loss": 0.355, + "step": 5521 + }, + { + "epoch": 1.5502526670409882, + "grad_norm": 0.6058395504951477, + "learning_rate": 5.579352973575709e-06, + "loss": 0.377, + "step": 5522 + }, + { + "epoch": 1.5505334081976416, + "grad_norm": 0.7323144674301147, + "learning_rate": 5.577730590667397e-06, + "loss": 0.3708, + "step": 5523 + }, + { + "epoch": 1.5508141493542953, + "grad_norm": 0.729806661605835, + "learning_rate": 5.5761081461072695e-06, + "loss": 0.3996, + "step": 5524 + }, + { + "epoch": 1.551094890510949, + "grad_norm": 0.6938438415527344, + "learning_rate": 5.574485640068464e-06, + "loss": 0.3463, + "step": 5525 + }, + { + "epoch": 1.5513756316676024, + "grad_norm": 0.6121125221252441, + "learning_rate": 5.572863072724123e-06, + "loss": 0.3984, + "step": 5526 + }, + { + "epoch": 1.551656372824256, + "grad_norm": 0.6394217610359192, + "learning_rate": 5.571240444247399e-06, + "loss": 0.3701, + "step": 5527 + }, + { + "epoch": 1.5519371139809097, + "grad_norm": 0.7209957242012024, + "learning_rate": 5.569617754811444e-06, + "loss": 0.404, + "step": 5528 + }, + { + "epoch": 1.5522178551375632, + "grad_norm": 0.6942537426948547, + "learning_rate": 5.567995004589425e-06, + "loss": 0.3906, + "step": 5529 + }, + { + "epoch": 1.5524985962942166, + "grad_norm": 0.6493688225746155, + "learning_rate": 5.566372193754512e-06, + "loss": 0.4436, + "step": 5530 + }, + { + "epoch": 1.5527793374508703, + "grad_norm": 0.5768901109695435, + "learning_rate": 5.564749322479881e-06, + "loss": 0.4155, + "step": 5531 + }, + { + "epoch": 1.553060078607524, + "grad_norm": 0.675581157207489, + "learning_rate": 5.5631263909387145e-06, + "loss": 0.3932, + "step": 5532 + }, + { + "epoch": 1.5533408197641774, + "grad_norm": 0.6773494482040405, + "learning_rate": 5.561503399304201e-06, + "loss": 0.4234, + "step": 5533 + }, + { + "epoch": 1.553621560920831, + "grad_norm": 0.6229548454284668, + "learning_rate": 5.559880347749536e-06, + "loss": 0.3775, + "step": 5534 + }, + { + "epoch": 1.5539023020774847, + "grad_norm": 0.6098642349243164, + "learning_rate": 5.558257236447921e-06, + "loss": 0.3462, + "step": 5535 + }, + { + "epoch": 1.5541830432341381, + "grad_norm": 0.6481459736824036, + "learning_rate": 5.556634065572567e-06, + "loss": 0.3707, + "step": 5536 + }, + { + "epoch": 1.5544637843907916, + "grad_norm": 0.6870434880256653, + "learning_rate": 5.555010835296687e-06, + "loss": 0.3573, + "step": 5537 + }, + { + "epoch": 1.5547445255474452, + "grad_norm": 0.6114113330841064, + "learning_rate": 5.553387545793503e-06, + "loss": 0.3716, + "step": 5538 + }, + { + "epoch": 1.555025266704099, + "grad_norm": 0.6437608599662781, + "learning_rate": 5.551764197236239e-06, + "loss": 0.4068, + "step": 5539 + }, + { + "epoch": 1.5553060078607523, + "grad_norm": 0.6629282236099243, + "learning_rate": 5.550140789798132e-06, + "loss": 0.3855, + "step": 5540 + }, + { + "epoch": 1.5555867490174058, + "grad_norm": 0.6390807032585144, + "learning_rate": 5.54851732365242e-06, + "loss": 0.3635, + "step": 5541 + }, + { + "epoch": 1.5558674901740597, + "grad_norm": 0.6676781177520752, + "learning_rate": 5.5468937989723506e-06, + "loss": 0.4189, + "step": 5542 + }, + { + "epoch": 1.5561482313307131, + "grad_norm": 0.6239802837371826, + "learning_rate": 5.545270215931177e-06, + "loss": 0.3526, + "step": 5543 + }, + { + "epoch": 1.5564289724873666, + "grad_norm": 0.6265968680381775, + "learning_rate": 5.543646574702158e-06, + "loss": 0.4239, + "step": 5544 + }, + { + "epoch": 1.5567097136440202, + "grad_norm": 0.6826972365379333, + "learning_rate": 5.5420228754585545e-06, + "loss": 0.4096, + "step": 5545 + }, + { + "epoch": 1.5569904548006739, + "grad_norm": 0.6733637452125549, + "learning_rate": 5.540399118373641e-06, + "loss": 0.3595, + "step": 5546 + }, + { + "epoch": 1.5572711959573273, + "grad_norm": 0.6451590061187744, + "learning_rate": 5.538775303620695e-06, + "loss": 0.3829, + "step": 5547 + }, + { + "epoch": 1.5575519371139808, + "grad_norm": 0.675022304058075, + "learning_rate": 5.5371514313729975e-06, + "loss": 0.3709, + "step": 5548 + }, + { + "epoch": 1.5578326782706344, + "grad_norm": 0.6459131836891174, + "learning_rate": 5.535527501803842e-06, + "loss": 0.4349, + "step": 5549 + }, + { + "epoch": 1.558113419427288, + "grad_norm": 0.6688864231109619, + "learning_rate": 5.533903515086521e-06, + "loss": 0.3794, + "step": 5550 + }, + { + "epoch": 1.5583941605839415, + "grad_norm": 0.6998948454856873, + "learning_rate": 5.5322794713943355e-06, + "loss": 0.3694, + "step": 5551 + }, + { + "epoch": 1.5586749017405952, + "grad_norm": 0.632996678352356, + "learning_rate": 5.530655370900596e-06, + "loss": 0.3921, + "step": 5552 + }, + { + "epoch": 1.5589556428972489, + "grad_norm": 0.6345375180244446, + "learning_rate": 5.529031213778615e-06, + "loss": 0.3684, + "step": 5553 + }, + { + "epoch": 1.5592363840539023, + "grad_norm": 0.7264622449874878, + "learning_rate": 5.527407000201712e-06, + "loss": 0.3744, + "step": 5554 + }, + { + "epoch": 1.5595171252105557, + "grad_norm": 0.6772736310958862, + "learning_rate": 5.525782730343215e-06, + "loss": 0.3893, + "step": 5555 + }, + { + "epoch": 1.5597978663672094, + "grad_norm": 0.649009644985199, + "learning_rate": 5.524158404376453e-06, + "loss": 0.3654, + "step": 5556 + }, + { + "epoch": 1.560078607523863, + "grad_norm": 0.741586446762085, + "learning_rate": 5.522534022474766e-06, + "loss": 0.409, + "step": 5557 + }, + { + "epoch": 1.5603593486805165, + "grad_norm": 0.6765753030776978, + "learning_rate": 5.520909584811498e-06, + "loss": 0.4212, + "step": 5558 + }, + { + "epoch": 1.5606400898371702, + "grad_norm": 0.6014144420623779, + "learning_rate": 5.519285091559998e-06, + "loss": 0.3859, + "step": 5559 + }, + { + "epoch": 1.5609208309938238, + "grad_norm": 0.6442937850952148, + "learning_rate": 5.517660542893625e-06, + "loss": 0.3625, + "step": 5560 + }, + { + "epoch": 1.5612015721504773, + "grad_norm": 0.6993957757949829, + "learning_rate": 5.516035938985735e-06, + "loss": 0.3748, + "step": 5561 + }, + { + "epoch": 1.5614823133071307, + "grad_norm": 0.7138069272041321, + "learning_rate": 5.5144112800097e-06, + "loss": 0.4089, + "step": 5562 + }, + { + "epoch": 1.5617630544637844, + "grad_norm": 0.6776037216186523, + "learning_rate": 5.51278656613889e-06, + "loss": 0.3774, + "step": 5563 + }, + { + "epoch": 1.562043795620438, + "grad_norm": 0.5727848410606384, + "learning_rate": 5.5111617975466895e-06, + "loss": 0.3572, + "step": 5564 + }, + { + "epoch": 1.5623245367770915, + "grad_norm": 0.6910765767097473, + "learning_rate": 5.5095369744064776e-06, + "loss": 0.3996, + "step": 5565 + }, + { + "epoch": 1.562605277933745, + "grad_norm": 0.7952026128768921, + "learning_rate": 5.50791209689165e-06, + "loss": 0.3761, + "step": 5566 + }, + { + "epoch": 1.5628860190903988, + "grad_norm": 0.6733259558677673, + "learning_rate": 5.506287165175602e-06, + "loss": 0.3828, + "step": 5567 + }, + { + "epoch": 1.5631667602470523, + "grad_norm": 0.814070999622345, + "learning_rate": 5.504662179431735e-06, + "loss": 0.3774, + "step": 5568 + }, + { + "epoch": 1.5634475014037057, + "grad_norm": 0.6128706336021423, + "learning_rate": 5.503037139833461e-06, + "loss": 0.3603, + "step": 5569 + }, + { + "epoch": 1.5637282425603594, + "grad_norm": 0.6916417479515076, + "learning_rate": 5.5014120465541885e-06, + "loss": 0.3528, + "step": 5570 + }, + { + "epoch": 1.564008983717013, + "grad_norm": 0.7357199788093567, + "learning_rate": 5.4997868997673435e-06, + "loss": 0.4089, + "step": 5571 + }, + { + "epoch": 1.5642897248736665, + "grad_norm": 0.7291268110275269, + "learning_rate": 5.498161699646347e-06, + "loss": 0.3958, + "step": 5572 + }, + { + "epoch": 1.56457046603032, + "grad_norm": 0.6182436347007751, + "learning_rate": 5.496536446364632e-06, + "loss": 0.3955, + "step": 5573 + }, + { + "epoch": 1.5648512071869736, + "grad_norm": 0.6410479545593262, + "learning_rate": 5.494911140095634e-06, + "loss": 0.346, + "step": 5574 + }, + { + "epoch": 1.5651319483436272, + "grad_norm": 0.6170321106910706, + "learning_rate": 5.493285781012798e-06, + "loss": 0.3817, + "step": 5575 + }, + { + "epoch": 1.5654126895002807, + "grad_norm": 0.6629190444946289, + "learning_rate": 5.491660369289571e-06, + "loss": 0.4293, + "step": 5576 + }, + { + "epoch": 1.5656934306569343, + "grad_norm": 0.7044650912284851, + "learning_rate": 5.490034905099408e-06, + "loss": 0.3708, + "step": 5577 + }, + { + "epoch": 1.565974171813588, + "grad_norm": 0.6208305358886719, + "learning_rate": 5.4884093886157654e-06, + "loss": 0.4206, + "step": 5578 + }, + { + "epoch": 1.5662549129702414, + "grad_norm": 0.8037087321281433, + "learning_rate": 5.486783820012109e-06, + "loss": 0.3823, + "step": 5579 + }, + { + "epoch": 1.5665356541268949, + "grad_norm": 0.6731278896331787, + "learning_rate": 5.485158199461912e-06, + "loss": 0.4122, + "step": 5580 + }, + { + "epoch": 1.5668163952835485, + "grad_norm": 0.704522967338562, + "learning_rate": 5.48353252713865e-06, + "loss": 0.4276, + "step": 5581 + }, + { + "epoch": 1.5670971364402022, + "grad_norm": 0.6292304396629333, + "learning_rate": 5.481906803215803e-06, + "loss": 0.3593, + "step": 5582 + }, + { + "epoch": 1.5673778775968557, + "grad_norm": 0.6704569458961487, + "learning_rate": 5.4802810278668575e-06, + "loss": 0.3968, + "step": 5583 + }, + { + "epoch": 1.5676586187535093, + "grad_norm": 0.6457850933074951, + "learning_rate": 5.478655201265308e-06, + "loss": 0.3673, + "step": 5584 + }, + { + "epoch": 1.567939359910163, + "grad_norm": 0.6567925810813904, + "learning_rate": 5.477029323584652e-06, + "loss": 0.4081, + "step": 5585 + }, + { + "epoch": 1.5682201010668164, + "grad_norm": 0.6074472069740295, + "learning_rate": 5.475403394998393e-06, + "loss": 0.3884, + "step": 5586 + }, + { + "epoch": 1.5685008422234699, + "grad_norm": 0.699834942817688, + "learning_rate": 5.473777415680042e-06, + "loss": 0.4117, + "step": 5587 + }, + { + "epoch": 1.5687815833801235, + "grad_norm": 0.6256060004234314, + "learning_rate": 5.472151385803108e-06, + "loss": 0.3448, + "step": 5588 + }, + { + "epoch": 1.5690623245367772, + "grad_norm": 0.6225060224533081, + "learning_rate": 5.470525305541117e-06, + "loss": 0.3771, + "step": 5589 + }, + { + "epoch": 1.5693430656934306, + "grad_norm": 0.5951137542724609, + "learning_rate": 5.46889917506759e-06, + "loss": 0.3699, + "step": 5590 + }, + { + "epoch": 1.569623806850084, + "grad_norm": 0.7257815003395081, + "learning_rate": 5.467272994556059e-06, + "loss": 0.3987, + "step": 5591 + }, + { + "epoch": 1.5699045480067377, + "grad_norm": 0.5655238032341003, + "learning_rate": 5.465646764180059e-06, + "loss": 0.4205, + "step": 5592 + }, + { + "epoch": 1.5701852891633914, + "grad_norm": 0.7138909697532654, + "learning_rate": 5.464020484113134e-06, + "loss": 0.4162, + "step": 5593 + }, + { + "epoch": 1.5704660303200448, + "grad_norm": 0.6702297329902649, + "learning_rate": 5.462394154528827e-06, + "loss": 0.4072, + "step": 5594 + }, + { + "epoch": 1.5707467714766985, + "grad_norm": 0.5846837162971497, + "learning_rate": 5.460767775600691e-06, + "loss": 0.3813, + "step": 5595 + }, + { + "epoch": 1.5710275126333522, + "grad_norm": 0.6576070189476013, + "learning_rate": 5.459141347502284e-06, + "loss": 0.3778, + "step": 5596 + }, + { + "epoch": 1.5713082537900056, + "grad_norm": 0.6405521035194397, + "learning_rate": 5.457514870407168e-06, + "loss": 0.3421, + "step": 5597 + }, + { + "epoch": 1.571588994946659, + "grad_norm": 0.6876504421234131, + "learning_rate": 5.4558883444889114e-06, + "loss": 0.363, + "step": 5598 + }, + { + "epoch": 1.5718697361033127, + "grad_norm": 0.7009930610656738, + "learning_rate": 5.454261769921083e-06, + "loss": 0.3508, + "step": 5599 + }, + { + "epoch": 1.5721504772599664, + "grad_norm": 0.7065655589103699, + "learning_rate": 5.452635146877264e-06, + "loss": 0.4081, + "step": 5600 + }, + { + "epoch": 1.5724312184166198, + "grad_norm": 0.6671035289764404, + "learning_rate": 5.4510084755310375e-06, + "loss": 0.4366, + "step": 5601 + }, + { + "epoch": 1.5727119595732735, + "grad_norm": 0.6496288180351257, + "learning_rate": 5.44938175605599e-06, + "loss": 0.4247, + "step": 5602 + }, + { + "epoch": 1.5729927007299271, + "grad_norm": 0.8295820951461792, + "learning_rate": 5.447754988625717e-06, + "loss": 0.4129, + "step": 5603 + }, + { + "epoch": 1.5732734418865806, + "grad_norm": 0.6814701557159424, + "learning_rate": 5.446128173413817e-06, + "loss": 0.4212, + "step": 5604 + }, + { + "epoch": 1.573554183043234, + "grad_norm": 0.6117609739303589, + "learning_rate": 5.44450131059389e-06, + "loss": 0.3632, + "step": 5605 + }, + { + "epoch": 1.5738349241998877, + "grad_norm": 0.7425245046615601, + "learning_rate": 5.4428744003395496e-06, + "loss": 0.4001, + "step": 5606 + }, + { + "epoch": 1.5741156653565413, + "grad_norm": 0.6963576078414917, + "learning_rate": 5.441247442824407e-06, + "loss": 0.3764, + "step": 5607 + }, + { + "epoch": 1.5743964065131948, + "grad_norm": 0.7188160419464111, + "learning_rate": 5.4396204382220795e-06, + "loss": 0.3745, + "step": 5608 + }, + { + "epoch": 1.5746771476698485, + "grad_norm": 0.6298017501831055, + "learning_rate": 5.437993386706195e-06, + "loss": 0.3872, + "step": 5609 + }, + { + "epoch": 1.5749578888265021, + "grad_norm": 0.660544216632843, + "learning_rate": 5.436366288450379e-06, + "loss": 0.3676, + "step": 5610 + }, + { + "epoch": 1.5752386299831556, + "grad_norm": 0.7274445295333862, + "learning_rate": 5.4347391436282656e-06, + "loss": 0.4052, + "step": 5611 + }, + { + "epoch": 1.575519371139809, + "grad_norm": 0.6884285807609558, + "learning_rate": 5.433111952413496e-06, + "loss": 0.3688, + "step": 5612 + }, + { + "epoch": 1.5758001122964627, + "grad_norm": 0.6660681366920471, + "learning_rate": 5.43148471497971e-06, + "loss": 0.4059, + "step": 5613 + }, + { + "epoch": 1.5760808534531163, + "grad_norm": 0.6194701790809631, + "learning_rate": 5.429857431500559e-06, + "loss": 0.3832, + "step": 5614 + }, + { + "epoch": 1.5763615946097698, + "grad_norm": 0.6025277972221375, + "learning_rate": 5.428230102149697e-06, + "loss": 0.3782, + "step": 5615 + }, + { + "epoch": 1.5766423357664232, + "grad_norm": 0.5469680428504944, + "learning_rate": 5.426602727100782e-06, + "loss": 0.3582, + "step": 5616 + }, + { + "epoch": 1.5769230769230769, + "grad_norm": 0.6515589952468872, + "learning_rate": 5.424975306527474e-06, + "loss": 0.4364, + "step": 5617 + }, + { + "epoch": 1.5772038180797305, + "grad_norm": 0.6261181831359863, + "learning_rate": 5.423347840603446e-06, + "loss": 0.393, + "step": 5618 + }, + { + "epoch": 1.577484559236384, + "grad_norm": 0.6409482359886169, + "learning_rate": 5.421720329502369e-06, + "loss": 0.3687, + "step": 5619 + }, + { + "epoch": 1.5777653003930376, + "grad_norm": 0.6708099246025085, + "learning_rate": 5.420092773397922e-06, + "loss": 0.4339, + "step": 5620 + }, + { + "epoch": 1.5780460415496913, + "grad_norm": 0.6219494938850403, + "learning_rate": 5.418465172463785e-06, + "loss": 0.3659, + "step": 5621 + }, + { + "epoch": 1.5783267827063447, + "grad_norm": 0.7946568131446838, + "learning_rate": 5.416837526873647e-06, + "loss": 0.4062, + "step": 5622 + }, + { + "epoch": 1.5786075238629982, + "grad_norm": 0.6432189345359802, + "learning_rate": 5.415209836801201e-06, + "loss": 0.3883, + "step": 5623 + }, + { + "epoch": 1.5788882650196518, + "grad_norm": 0.6144047975540161, + "learning_rate": 5.4135821024201425e-06, + "loss": 0.3949, + "step": 5624 + }, + { + "epoch": 1.5791690061763055, + "grad_norm": 0.6671247482299805, + "learning_rate": 5.411954323904175e-06, + "loss": 0.4313, + "step": 5625 + }, + { + "epoch": 1.579449747332959, + "grad_norm": 0.5982580780982971, + "learning_rate": 5.410326501427004e-06, + "loss": 0.3392, + "step": 5626 + }, + { + "epoch": 1.5797304884896126, + "grad_norm": 0.6136658787727356, + "learning_rate": 5.40869863516234e-06, + "loss": 0.3647, + "step": 5627 + }, + { + "epoch": 1.5800112296462663, + "grad_norm": 0.6476240158081055, + "learning_rate": 5.407070725283898e-06, + "loss": 0.3885, + "step": 5628 + }, + { + "epoch": 1.5802919708029197, + "grad_norm": 0.7178700566291809, + "learning_rate": 5.4054427719654e-06, + "loss": 0.3636, + "step": 5629 + }, + { + "epoch": 1.5805727119595732, + "grad_norm": 0.6783363223075867, + "learning_rate": 5.40381477538057e-06, + "loss": 0.3823, + "step": 5630 + }, + { + "epoch": 1.5808534531162268, + "grad_norm": 0.623234212398529, + "learning_rate": 5.402186735703141e-06, + "loss": 0.3826, + "step": 5631 + }, + { + "epoch": 1.5811341942728805, + "grad_norm": 0.6377182006835938, + "learning_rate": 5.4005586531068425e-06, + "loss": 0.4293, + "step": 5632 + }, + { + "epoch": 1.581414935429534, + "grad_norm": 0.6228262782096863, + "learning_rate": 5.398930527765416e-06, + "loss": 0.3748, + "step": 5633 + }, + { + "epoch": 1.5816956765861874, + "grad_norm": 0.6211974024772644, + "learning_rate": 5.3973023598526045e-06, + "loss": 0.3807, + "step": 5634 + }, + { + "epoch": 1.5819764177428413, + "grad_norm": 0.6278573274612427, + "learning_rate": 5.395674149542155e-06, + "loss": 0.3784, + "step": 5635 + }, + { + "epoch": 1.5822571588994947, + "grad_norm": 0.6505194902420044, + "learning_rate": 5.394045897007821e-06, + "loss": 0.4335, + "step": 5636 + }, + { + "epoch": 1.5825379000561481, + "grad_norm": 0.6234167814254761, + "learning_rate": 5.392417602423361e-06, + "loss": 0.4005, + "step": 5637 + }, + { + "epoch": 1.5828186412128018, + "grad_norm": 0.6796559691429138, + "learning_rate": 5.390789265962534e-06, + "loss": 0.4065, + "step": 5638 + }, + { + "epoch": 1.5830993823694555, + "grad_norm": 0.7159935235977173, + "learning_rate": 5.389160887799105e-06, + "loss": 0.3921, + "step": 5639 + }, + { + "epoch": 1.583380123526109, + "grad_norm": 0.6569457054138184, + "learning_rate": 5.387532468106848e-06, + "loss": 0.3437, + "step": 5640 + }, + { + "epoch": 1.5836608646827623, + "grad_norm": 0.6812229156494141, + "learning_rate": 5.385904007059535e-06, + "loss": 0.3895, + "step": 5641 + }, + { + "epoch": 1.583941605839416, + "grad_norm": 0.571449875831604, + "learning_rate": 5.384275504830946e-06, + "loss": 0.3738, + "step": 5642 + }, + { + "epoch": 1.5842223469960697, + "grad_norm": 0.5636094212532043, + "learning_rate": 5.382646961594865e-06, + "loss": 0.4131, + "step": 5643 + }, + { + "epoch": 1.5845030881527231, + "grad_norm": 0.6847487092018127, + "learning_rate": 5.38101837752508e-06, + "loss": 0.3666, + "step": 5644 + }, + { + "epoch": 1.5847838293093768, + "grad_norm": 0.6250169277191162, + "learning_rate": 5.379389752795383e-06, + "loss": 0.4011, + "step": 5645 + }, + { + "epoch": 1.5850645704660304, + "grad_norm": 0.6602939367294312, + "learning_rate": 5.377761087579571e-06, + "loss": 0.3728, + "step": 5646 + }, + { + "epoch": 1.5853453116226839, + "grad_norm": 0.6117948293685913, + "learning_rate": 5.376132382051445e-06, + "loss": 0.367, + "step": 5647 + }, + { + "epoch": 1.5856260527793373, + "grad_norm": 0.5795977711677551, + "learning_rate": 5.3745036363848105e-06, + "loss": 0.3457, + "step": 5648 + }, + { + "epoch": 1.585906793935991, + "grad_norm": 0.6143127679824829, + "learning_rate": 5.3728748507534755e-06, + "loss": 0.3461, + "step": 5649 + }, + { + "epoch": 1.5861875350926447, + "grad_norm": 0.6950699090957642, + "learning_rate": 5.371246025331256e-06, + "loss": 0.3638, + "step": 5650 + }, + { + "epoch": 1.586468276249298, + "grad_norm": 0.6955627202987671, + "learning_rate": 5.36961716029197e-06, + "loss": 0.4029, + "step": 5651 + }, + { + "epoch": 1.5867490174059518, + "grad_norm": 0.6370987296104431, + "learning_rate": 5.367988255809438e-06, + "loss": 0.3897, + "step": 5652 + }, + { + "epoch": 1.5870297585626054, + "grad_norm": 0.6839419007301331, + "learning_rate": 5.366359312057489e-06, + "loss": 0.4192, + "step": 5653 + }, + { + "epoch": 1.5873104997192589, + "grad_norm": 0.5921178460121155, + "learning_rate": 5.364730329209951e-06, + "loss": 0.3852, + "step": 5654 + }, + { + "epoch": 1.5875912408759123, + "grad_norm": 0.6688896417617798, + "learning_rate": 5.3631013074406606e-06, + "loss": 0.3945, + "step": 5655 + }, + { + "epoch": 1.587871982032566, + "grad_norm": 0.637823224067688, + "learning_rate": 5.361472246923457e-06, + "loss": 0.3999, + "step": 5656 + }, + { + "epoch": 1.5881527231892196, + "grad_norm": 0.5959929823875427, + "learning_rate": 5.359843147832183e-06, + "loss": 0.3443, + "step": 5657 + }, + { + "epoch": 1.588433464345873, + "grad_norm": 0.6044209003448486, + "learning_rate": 5.358214010340686e-06, + "loss": 0.4039, + "step": 5658 + }, + { + "epoch": 1.5887142055025265, + "grad_norm": 0.6545689105987549, + "learning_rate": 5.356584834622818e-06, + "loss": 0.4111, + "step": 5659 + }, + { + "epoch": 1.5889949466591804, + "grad_norm": 0.612241804599762, + "learning_rate": 5.3549556208524336e-06, + "loss": 0.4077, + "step": 5660 + }, + { + "epoch": 1.5892756878158338, + "grad_norm": 0.5983982086181641, + "learning_rate": 5.353326369203392e-06, + "loss": 0.4107, + "step": 5661 + }, + { + "epoch": 1.5895564289724873, + "grad_norm": 0.6796982884407043, + "learning_rate": 5.351697079849557e-06, + "loss": 0.3761, + "step": 5662 + }, + { + "epoch": 1.589837170129141, + "grad_norm": 0.61302250623703, + "learning_rate": 5.350067752964798e-06, + "loss": 0.3949, + "step": 5663 + }, + { + "epoch": 1.5901179112857946, + "grad_norm": 0.6317418813705444, + "learning_rate": 5.348438388722986e-06, + "loss": 0.3863, + "step": 5664 + }, + { + "epoch": 1.590398652442448, + "grad_norm": 0.5928323864936829, + "learning_rate": 5.3468089872979945e-06, + "loss": 0.36, + "step": 5665 + }, + { + "epoch": 1.5906793935991015, + "grad_norm": 0.6907023191452026, + "learning_rate": 5.345179548863705e-06, + "loss": 0.3854, + "step": 5666 + }, + { + "epoch": 1.5909601347557552, + "grad_norm": 0.6863617300987244, + "learning_rate": 5.343550073594e-06, + "loss": 0.3868, + "step": 5667 + }, + { + "epoch": 1.5912408759124088, + "grad_norm": 0.6140517592430115, + "learning_rate": 5.341920561662767e-06, + "loss": 0.3997, + "step": 5668 + }, + { + "epoch": 1.5915216170690623, + "grad_norm": 0.6900655031204224, + "learning_rate": 5.3402910132439004e-06, + "loss": 0.4194, + "step": 5669 + }, + { + "epoch": 1.591802358225716, + "grad_norm": 0.6943616271018982, + "learning_rate": 5.338661428511292e-06, + "loss": 0.4013, + "step": 5670 + }, + { + "epoch": 1.5920830993823696, + "grad_norm": 0.585360050201416, + "learning_rate": 5.3370318076388405e-06, + "loss": 0.3818, + "step": 5671 + }, + { + "epoch": 1.592363840539023, + "grad_norm": 0.6311351656913757, + "learning_rate": 5.335402150800451e-06, + "loss": 0.3663, + "step": 5672 + }, + { + "epoch": 1.5926445816956765, + "grad_norm": 0.6898561120033264, + "learning_rate": 5.33377245817003e-06, + "loss": 0.3546, + "step": 5673 + }, + { + "epoch": 1.5929253228523301, + "grad_norm": 0.661670982837677, + "learning_rate": 5.332142729921488e-06, + "loss": 0.3921, + "step": 5674 + }, + { + "epoch": 1.5932060640089838, + "grad_norm": 0.6570422053337097, + "learning_rate": 5.33051296622874e-06, + "loss": 0.4157, + "step": 5675 + }, + { + "epoch": 1.5934868051656372, + "grad_norm": 0.5399648547172546, + "learning_rate": 5.328883167265703e-06, + "loss": 0.375, + "step": 5676 + }, + { + "epoch": 1.593767546322291, + "grad_norm": 0.7241529822349548, + "learning_rate": 5.327253333206299e-06, + "loss": 0.3709, + "step": 5677 + }, + { + "epoch": 1.5940482874789446, + "grad_norm": 0.6967384815216064, + "learning_rate": 5.325623464224454e-06, + "loss": 0.3932, + "step": 5678 + }, + { + "epoch": 1.594329028635598, + "grad_norm": 0.7840999364852905, + "learning_rate": 5.323993560494099e-06, + "loss": 0.4236, + "step": 5679 + }, + { + "epoch": 1.5946097697922514, + "grad_norm": 0.6832603812217712, + "learning_rate": 5.322363622189165e-06, + "loss": 0.3965, + "step": 5680 + }, + { + "epoch": 1.594890510948905, + "grad_norm": 0.6805334687232971, + "learning_rate": 5.320733649483591e-06, + "loss": 0.3309, + "step": 5681 + }, + { + "epoch": 1.5951712521055588, + "grad_norm": 0.6894299983978271, + "learning_rate": 5.319103642551315e-06, + "loss": 0.3613, + "step": 5682 + }, + { + "epoch": 1.5954519932622122, + "grad_norm": 0.7123468518257141, + "learning_rate": 5.3174736015662845e-06, + "loss": 0.3768, + "step": 5683 + }, + { + "epoch": 1.5957327344188657, + "grad_norm": 0.7026888728141785, + "learning_rate": 5.315843526702443e-06, + "loss": 0.3622, + "step": 5684 + }, + { + "epoch": 1.5960134755755195, + "grad_norm": 0.6979182958602905, + "learning_rate": 5.3142134181337466e-06, + "loss": 0.4203, + "step": 5685 + }, + { + "epoch": 1.596294216732173, + "grad_norm": 0.6742028594017029, + "learning_rate": 5.312583276034148e-06, + "loss": 0.3579, + "step": 5686 + }, + { + "epoch": 1.5965749578888264, + "grad_norm": 0.7389938831329346, + "learning_rate": 5.310953100577606e-06, + "loss": 0.3686, + "step": 5687 + }, + { + "epoch": 1.59685569904548, + "grad_norm": 0.7732906937599182, + "learning_rate": 5.309322891938082e-06, + "loss": 0.4171, + "step": 5688 + }, + { + "epoch": 1.5971364402021337, + "grad_norm": 0.6303626894950867, + "learning_rate": 5.307692650289542e-06, + "loss": 0.3753, + "step": 5689 + }, + { + "epoch": 1.5974171813587872, + "grad_norm": 0.7214590907096863, + "learning_rate": 5.306062375805957e-06, + "loss": 0.3934, + "step": 5690 + }, + { + "epoch": 1.5976979225154406, + "grad_norm": 0.6761311888694763, + "learning_rate": 5.304432068661298e-06, + "loss": 0.3229, + "step": 5691 + }, + { + "epoch": 1.5979786636720943, + "grad_norm": 0.736258864402771, + "learning_rate": 5.302801729029543e-06, + "loss": 0.3751, + "step": 5692 + }, + { + "epoch": 1.598259404828748, + "grad_norm": 0.7438814043998718, + "learning_rate": 5.301171357084669e-06, + "loss": 0.3805, + "step": 5693 + }, + { + "epoch": 1.5985401459854014, + "grad_norm": 0.695715606212616, + "learning_rate": 5.29954095300066e-06, + "loss": 0.3869, + "step": 5694 + }, + { + "epoch": 1.598820887142055, + "grad_norm": 0.6188850998878479, + "learning_rate": 5.2979105169515045e-06, + "loss": 0.4087, + "step": 5695 + }, + { + "epoch": 1.5991016282987087, + "grad_norm": 0.6104736924171448, + "learning_rate": 5.2962800491111895e-06, + "loss": 0.3956, + "step": 5696 + }, + { + "epoch": 1.5993823694553622, + "grad_norm": 0.665642261505127, + "learning_rate": 5.294649549653713e-06, + "loss": 0.3867, + "step": 5697 + }, + { + "epoch": 1.5996631106120156, + "grad_norm": 0.593701183795929, + "learning_rate": 5.2930190187530675e-06, + "loss": 0.3557, + "step": 5698 + }, + { + "epoch": 1.5999438517686693, + "grad_norm": 0.6774048209190369, + "learning_rate": 5.291388456583254e-06, + "loss": 0.3939, + "step": 5699 + }, + { + "epoch": 1.600224592925323, + "grad_norm": 0.5818973779678345, + "learning_rate": 5.289757863318277e-06, + "loss": 0.3877, + "step": 5700 + }, + { + "epoch": 1.6005053340819764, + "grad_norm": 0.6529117822647095, + "learning_rate": 5.288127239132143e-06, + "loss": 0.3597, + "step": 5701 + }, + { + "epoch": 1.60078607523863, + "grad_norm": 0.652026891708374, + "learning_rate": 5.28649658419886e-06, + "loss": 0.3814, + "step": 5702 + }, + { + "epoch": 1.6010668163952837, + "grad_norm": 0.5884618163108826, + "learning_rate": 5.284865898692446e-06, + "loss": 0.3917, + "step": 5703 + }, + { + "epoch": 1.6013475575519371, + "grad_norm": 0.6404399871826172, + "learning_rate": 5.2832351827869135e-06, + "loss": 0.3547, + "step": 5704 + }, + { + "epoch": 1.6016282987085906, + "grad_norm": 0.7353585958480835, + "learning_rate": 5.281604436656283e-06, + "loss": 0.3876, + "step": 5705 + }, + { + "epoch": 1.6019090398652442, + "grad_norm": 0.59206223487854, + "learning_rate": 5.2799736604745765e-06, + "loss": 0.3729, + "step": 5706 + }, + { + "epoch": 1.602189781021898, + "grad_norm": 0.6682827472686768, + "learning_rate": 5.278342854415825e-06, + "loss": 0.394, + "step": 5707 + }, + { + "epoch": 1.6024705221785513, + "grad_norm": 0.6857581734657288, + "learning_rate": 5.276712018654054e-06, + "loss": 0.3888, + "step": 5708 + }, + { + "epoch": 1.6027512633352048, + "grad_norm": 0.6567980051040649, + "learning_rate": 5.275081153363297e-06, + "loss": 0.3857, + "step": 5709 + }, + { + "epoch": 1.6030320044918585, + "grad_norm": 0.7068313360214233, + "learning_rate": 5.27345025871759e-06, + "loss": 0.4007, + "step": 5710 + }, + { + "epoch": 1.6033127456485121, + "grad_norm": 0.6105488538742065, + "learning_rate": 5.271819334890972e-06, + "loss": 0.3754, + "step": 5711 + }, + { + "epoch": 1.6035934868051656, + "grad_norm": 0.5581768155097961, + "learning_rate": 5.270188382057485e-06, + "loss": 0.3347, + "step": 5712 + }, + { + "epoch": 1.6038742279618192, + "grad_norm": 0.6744940876960754, + "learning_rate": 5.2685574003911745e-06, + "loss": 0.4348, + "step": 5713 + }, + { + "epoch": 1.6041549691184729, + "grad_norm": 0.7591729164123535, + "learning_rate": 5.266926390066089e-06, + "loss": 0.3978, + "step": 5714 + }, + { + "epoch": 1.6044357102751263, + "grad_norm": 0.7207170724868774, + "learning_rate": 5.265295351256277e-06, + "loss": 0.3934, + "step": 5715 + }, + { + "epoch": 1.6047164514317798, + "grad_norm": 0.6559902429580688, + "learning_rate": 5.263664284135795e-06, + "loss": 0.4285, + "step": 5716 + }, + { + "epoch": 1.6049971925884334, + "grad_norm": 0.6645886301994324, + "learning_rate": 5.2620331888787e-06, + "loss": 0.4305, + "step": 5717 + }, + { + "epoch": 1.605277933745087, + "grad_norm": 0.6138094663619995, + "learning_rate": 5.260402065659054e-06, + "loss": 0.3626, + "step": 5718 + }, + { + "epoch": 1.6055586749017405, + "grad_norm": 0.604344367980957, + "learning_rate": 5.258770914650918e-06, + "loss": 0.3848, + "step": 5719 + }, + { + "epoch": 1.6058394160583942, + "grad_norm": 0.5603964924812317, + "learning_rate": 5.25713973602836e-06, + "loss": 0.3518, + "step": 5720 + }, + { + "epoch": 1.6061201572150479, + "grad_norm": 0.6279407143592834, + "learning_rate": 5.255508529965447e-06, + "loss": 0.3836, + "step": 5721 + }, + { + "epoch": 1.6064008983717013, + "grad_norm": 0.627970278263092, + "learning_rate": 5.253877296636254e-06, + "loss": 0.3928, + "step": 5722 + }, + { + "epoch": 1.6066816395283547, + "grad_norm": 0.6636152267456055, + "learning_rate": 5.252246036214853e-06, + "loss": 0.3824, + "step": 5723 + }, + { + "epoch": 1.6069623806850084, + "grad_norm": 0.6894207000732422, + "learning_rate": 5.250614748875327e-06, + "loss": 0.3978, + "step": 5724 + }, + { + "epoch": 1.607243121841662, + "grad_norm": 0.6381357312202454, + "learning_rate": 5.24898343479175e-06, + "loss": 0.3801, + "step": 5725 + }, + { + "epoch": 1.6075238629983155, + "grad_norm": 0.6617277264595032, + "learning_rate": 5.24735209413821e-06, + "loss": 0.4457, + "step": 5726 + }, + { + "epoch": 1.607804604154969, + "grad_norm": 0.6681424975395203, + "learning_rate": 5.2457207270887935e-06, + "loss": 0.3761, + "step": 5727 + }, + { + "epoch": 1.6080853453116228, + "grad_norm": 0.6610637903213501, + "learning_rate": 5.244089333817588e-06, + "loss": 0.4367, + "step": 5728 + }, + { + "epoch": 1.6083660864682763, + "grad_norm": 0.7265160083770752, + "learning_rate": 5.242457914498688e-06, + "loss": 0.4346, + "step": 5729 + }, + { + "epoch": 1.6086468276249297, + "grad_norm": 0.6373753547668457, + "learning_rate": 5.240826469306187e-06, + "loss": 0.3769, + "step": 5730 + }, + { + "epoch": 1.6089275687815834, + "grad_norm": 0.6125161051750183, + "learning_rate": 5.239194998414182e-06, + "loss": 0.3775, + "step": 5731 + }, + { + "epoch": 1.609208309938237, + "grad_norm": 0.700334370136261, + "learning_rate": 5.237563501996773e-06, + "loss": 0.3834, + "step": 5732 + }, + { + "epoch": 1.6094890510948905, + "grad_norm": 0.6575661301612854, + "learning_rate": 5.235931980228066e-06, + "loss": 0.3508, + "step": 5733 + }, + { + "epoch": 1.609769792251544, + "grad_norm": 0.7324906587600708, + "learning_rate": 5.234300433282165e-06, + "loss": 0.4197, + "step": 5734 + }, + { + "epoch": 1.6100505334081976, + "grad_norm": 0.7340754270553589, + "learning_rate": 5.23266886133318e-06, + "loss": 0.4245, + "step": 5735 + }, + { + "epoch": 1.6103312745648513, + "grad_norm": 0.7993359565734863, + "learning_rate": 5.23103726455522e-06, + "loss": 0.3587, + "step": 5736 + }, + { + "epoch": 1.6106120157215047, + "grad_norm": 0.627616286277771, + "learning_rate": 5.229405643122399e-06, + "loss": 0.3923, + "step": 5737 + }, + { + "epoch": 1.6108927568781584, + "grad_norm": 0.7044618129730225, + "learning_rate": 5.227773997208835e-06, + "loss": 0.3955, + "step": 5738 + }, + { + "epoch": 1.611173498034812, + "grad_norm": 0.7278639674186707, + "learning_rate": 5.226142326988646e-06, + "loss": 0.3911, + "step": 5739 + }, + { + "epoch": 1.6114542391914655, + "grad_norm": 0.7551366090774536, + "learning_rate": 5.224510632635955e-06, + "loss": 0.4101, + "step": 5740 + }, + { + "epoch": 1.611734980348119, + "grad_norm": 0.7136589884757996, + "learning_rate": 5.222878914324886e-06, + "loss": 0.389, + "step": 5741 + }, + { + "epoch": 1.6120157215047726, + "grad_norm": 0.630165159702301, + "learning_rate": 5.221247172229564e-06, + "loss": 0.3553, + "step": 5742 + }, + { + "epoch": 1.6122964626614262, + "grad_norm": 0.7382895946502686, + "learning_rate": 5.2196154065241204e-06, + "loss": 0.3893, + "step": 5743 + }, + { + "epoch": 1.6125772038180797, + "grad_norm": 0.594893217086792, + "learning_rate": 5.217983617382684e-06, + "loss": 0.3962, + "step": 5744 + }, + { + "epoch": 1.6128579449747333, + "grad_norm": 0.7302297949790955, + "learning_rate": 5.2163518049793935e-06, + "loss": 0.4039, + "step": 5745 + }, + { + "epoch": 1.613138686131387, + "grad_norm": 0.6644755601882935, + "learning_rate": 5.214719969488384e-06, + "loss": 0.375, + "step": 5746 + }, + { + "epoch": 1.6134194272880404, + "grad_norm": 0.6315532326698303, + "learning_rate": 5.213088111083795e-06, + "loss": 0.3851, + "step": 5747 + }, + { + "epoch": 1.6137001684446939, + "grad_norm": 0.5566315650939941, + "learning_rate": 5.2114562299397665e-06, + "loss": 0.3664, + "step": 5748 + }, + { + "epoch": 1.6139809096013475, + "grad_norm": 0.7148703336715698, + "learning_rate": 5.209824326230445e-06, + "loss": 0.401, + "step": 5749 + }, + { + "epoch": 1.6142616507580012, + "grad_norm": 0.6244839429855347, + "learning_rate": 5.2081924001299754e-06, + "loss": 0.3599, + "step": 5750 + }, + { + "epoch": 1.6145423919146547, + "grad_norm": 0.6982523798942566, + "learning_rate": 5.206560451812508e-06, + "loss": 0.3601, + "step": 5751 + }, + { + "epoch": 1.614823133071308, + "grad_norm": 0.5379160046577454, + "learning_rate": 5.204928481452195e-06, + "loss": 0.356, + "step": 5752 + }, + { + "epoch": 1.615103874227962, + "grad_norm": 0.6875414252281189, + "learning_rate": 5.203296489223187e-06, + "loss": 0.4464, + "step": 5753 + }, + { + "epoch": 1.6153846153846154, + "grad_norm": 0.6543856263160706, + "learning_rate": 5.201664475299643e-06, + "loss": 0.3838, + "step": 5754 + }, + { + "epoch": 1.6156653565412689, + "grad_norm": 0.5894539952278137, + "learning_rate": 5.200032439855719e-06, + "loss": 0.3794, + "step": 5755 + }, + { + "epoch": 1.6159460976979225, + "grad_norm": 0.6440432667732239, + "learning_rate": 5.198400383065577e-06, + "loss": 0.372, + "step": 5756 + }, + { + "epoch": 1.6162268388545762, + "grad_norm": 0.5637151002883911, + "learning_rate": 5.196768305103381e-06, + "loss": 0.3473, + "step": 5757 + }, + { + "epoch": 1.6165075800112296, + "grad_norm": 0.6018733978271484, + "learning_rate": 5.195136206143294e-06, + "loss": 0.3749, + "step": 5758 + }, + { + "epoch": 1.616788321167883, + "grad_norm": 0.6205705404281616, + "learning_rate": 5.193504086359485e-06, + "loss": 0.397, + "step": 5759 + }, + { + "epoch": 1.6170690623245367, + "grad_norm": 0.6449079513549805, + "learning_rate": 5.191871945926123e-06, + "loss": 0.3903, + "step": 5760 + }, + { + "epoch": 1.6173498034811904, + "grad_norm": 0.5725222229957581, + "learning_rate": 5.190239785017382e-06, + "loss": 0.3929, + "step": 5761 + }, + { + "epoch": 1.6176305446378438, + "grad_norm": 0.7197320461273193, + "learning_rate": 5.188607603807432e-06, + "loss": 0.3867, + "step": 5762 + }, + { + "epoch": 1.6179112857944975, + "grad_norm": 0.5944594144821167, + "learning_rate": 5.186975402470453e-06, + "loss": 0.378, + "step": 5763 + }, + { + "epoch": 1.6181920269511512, + "grad_norm": 0.6972327828407288, + "learning_rate": 5.185343181180621e-06, + "loss": 0.4369, + "step": 5764 + }, + { + "epoch": 1.6184727681078046, + "grad_norm": 0.6686606407165527, + "learning_rate": 5.183710940112117e-06, + "loss": 0.3793, + "step": 5765 + }, + { + "epoch": 1.618753509264458, + "grad_norm": 0.6122421622276306, + "learning_rate": 5.182078679439124e-06, + "loss": 0.4051, + "step": 5766 + }, + { + "epoch": 1.6190342504211117, + "grad_norm": 0.5697387456893921, + "learning_rate": 5.180446399335826e-06, + "loss": 0.3671, + "step": 5767 + }, + { + "epoch": 1.6193149915777654, + "grad_norm": 0.6529817581176758, + "learning_rate": 5.178814099976411e-06, + "loss": 0.3993, + "step": 5768 + }, + { + "epoch": 1.6195957327344188, + "grad_norm": 0.5563371777534485, + "learning_rate": 5.177181781535069e-06, + "loss": 0.38, + "step": 5769 + }, + { + "epoch": 1.6198764738910725, + "grad_norm": 0.6792994141578674, + "learning_rate": 5.175549444185986e-06, + "loss": 0.3902, + "step": 5770 + }, + { + "epoch": 1.6201572150477261, + "grad_norm": 0.6415961384773254, + "learning_rate": 5.173917088103358e-06, + "loss": 0.3641, + "step": 5771 + }, + { + "epoch": 1.6204379562043796, + "grad_norm": 0.6560315489768982, + "learning_rate": 5.172284713461382e-06, + "loss": 0.3802, + "step": 5772 + }, + { + "epoch": 1.620718697361033, + "grad_norm": 0.6932475566864014, + "learning_rate": 5.170652320434252e-06, + "loss": 0.3777, + "step": 5773 + }, + { + "epoch": 1.6209994385176867, + "grad_norm": 0.6296159625053406, + "learning_rate": 5.169019909196168e-06, + "loss": 0.3853, + "step": 5774 + }, + { + "epoch": 1.6212801796743403, + "grad_norm": 0.5964828133583069, + "learning_rate": 5.1673874799213295e-06, + "loss": 0.3803, + "step": 5775 + }, + { + "epoch": 1.6215609208309938, + "grad_norm": 0.6716960668563843, + "learning_rate": 5.165755032783941e-06, + "loss": 0.3629, + "step": 5776 + }, + { + "epoch": 1.6218416619876472, + "grad_norm": 0.5418384075164795, + "learning_rate": 5.164122567958205e-06, + "loss": 0.41, + "step": 5777 + }, + { + "epoch": 1.6221224031443011, + "grad_norm": 0.7283366322517395, + "learning_rate": 5.16249008561833e-06, + "loss": 0.4203, + "step": 5778 + }, + { + "epoch": 1.6224031443009546, + "grad_norm": 0.6761435866355896, + "learning_rate": 5.160857585938523e-06, + "loss": 0.3773, + "step": 5779 + }, + { + "epoch": 1.622683885457608, + "grad_norm": 0.6270684599876404, + "learning_rate": 5.159225069092996e-06, + "loss": 0.3775, + "step": 5780 + }, + { + "epoch": 1.6229646266142617, + "grad_norm": 0.7111120820045471, + "learning_rate": 5.157592535255958e-06, + "loss": 0.3792, + "step": 5781 + }, + { + "epoch": 1.6232453677709153, + "grad_norm": 0.6534535884857178, + "learning_rate": 5.155959984601626e-06, + "loss": 0.3697, + "step": 5782 + }, + { + "epoch": 1.6235261089275688, + "grad_norm": 0.6421679258346558, + "learning_rate": 5.1543274173042125e-06, + "loss": 0.369, + "step": 5783 + }, + { + "epoch": 1.6238068500842222, + "grad_norm": 0.6642189621925354, + "learning_rate": 5.152694833537939e-06, + "loss": 0.3826, + "step": 5784 + }, + { + "epoch": 1.6240875912408759, + "grad_norm": 0.7172495126724243, + "learning_rate": 5.151062233477021e-06, + "loss": 0.3565, + "step": 5785 + }, + { + "epoch": 1.6243683323975295, + "grad_norm": 0.6423928737640381, + "learning_rate": 5.149429617295682e-06, + "loss": 0.3785, + "step": 5786 + }, + { + "epoch": 1.624649073554183, + "grad_norm": 0.6350986361503601, + "learning_rate": 5.147796985168142e-06, + "loss": 0.3899, + "step": 5787 + }, + { + "epoch": 1.6249298147108366, + "grad_norm": 0.6232904195785522, + "learning_rate": 5.146164337268628e-06, + "loss": 0.4013, + "step": 5788 + }, + { + "epoch": 1.6252105558674903, + "grad_norm": 0.5838711857795715, + "learning_rate": 5.144531673771364e-06, + "loss": 0.3989, + "step": 5789 + }, + { + "epoch": 1.6254912970241437, + "grad_norm": 0.625808835029602, + "learning_rate": 5.14289899485058e-06, + "loss": 0.4052, + "step": 5790 + }, + { + "epoch": 1.6257720381807972, + "grad_norm": 0.6351568698883057, + "learning_rate": 5.141266300680503e-06, + "loss": 0.4101, + "step": 5791 + }, + { + "epoch": 1.6260527793374508, + "grad_norm": 0.5987807512283325, + "learning_rate": 5.139633591435364e-06, + "loss": 0.3526, + "step": 5792 + }, + { + "epoch": 1.6263335204941045, + "grad_norm": 0.6660354733467102, + "learning_rate": 5.138000867289397e-06, + "loss": 0.3572, + "step": 5793 + }, + { + "epoch": 1.626614261650758, + "grad_norm": 0.7153281569480896, + "learning_rate": 5.136368128416835e-06, + "loss": 0.3883, + "step": 5794 + }, + { + "epoch": 1.6268950028074116, + "grad_norm": 0.7444567680358887, + "learning_rate": 5.134735374991916e-06, + "loss": 0.3765, + "step": 5795 + }, + { + "epoch": 1.6271757439640653, + "grad_norm": 0.6267321705818176, + "learning_rate": 5.133102607188875e-06, + "loss": 0.4184, + "step": 5796 + }, + { + "epoch": 1.6274564851207187, + "grad_norm": 0.6239635944366455, + "learning_rate": 5.13146982518195e-06, + "loss": 0.442, + "step": 5797 + }, + { + "epoch": 1.6277372262773722, + "grad_norm": 0.6209805011749268, + "learning_rate": 5.129837029145385e-06, + "loss": 0.3629, + "step": 5798 + }, + { + "epoch": 1.6280179674340258, + "grad_norm": 0.6415537595748901, + "learning_rate": 5.128204219253418e-06, + "loss": 0.4024, + "step": 5799 + }, + { + "epoch": 1.6282987085906795, + "grad_norm": 0.6636104583740234, + "learning_rate": 5.126571395680294e-06, + "loss": 0.3867, + "step": 5800 + }, + { + "epoch": 1.628579449747333, + "grad_norm": 0.7155022621154785, + "learning_rate": 5.124938558600259e-06, + "loss": 0.4119, + "step": 5801 + }, + { + "epoch": 1.6288601909039864, + "grad_norm": 0.6171215772628784, + "learning_rate": 5.123305708187558e-06, + "loss": 0.3423, + "step": 5802 + }, + { + "epoch": 1.62914093206064, + "grad_norm": 0.6507341265678406, + "learning_rate": 5.121672844616439e-06, + "loss": 0.3409, + "step": 5803 + }, + { + "epoch": 1.6294216732172937, + "grad_norm": 0.7989486455917358, + "learning_rate": 5.120039968061149e-06, + "loss": 0.4406, + "step": 5804 + }, + { + "epoch": 1.6297024143739471, + "grad_norm": 0.6134814620018005, + "learning_rate": 5.1184070786959405e-06, + "loss": 0.3551, + "step": 5805 + }, + { + "epoch": 1.6299831555306008, + "grad_norm": 0.5829088687896729, + "learning_rate": 5.116774176695065e-06, + "loss": 0.3649, + "step": 5806 + }, + { + "epoch": 1.6302638966872545, + "grad_norm": 0.6511884927749634, + "learning_rate": 5.115141262232777e-06, + "loss": 0.3583, + "step": 5807 + }, + { + "epoch": 1.630544637843908, + "grad_norm": 0.7171990871429443, + "learning_rate": 5.113508335483327e-06, + "loss": 0.3634, + "step": 5808 + }, + { + "epoch": 1.6308253790005613, + "grad_norm": 0.5896447896957397, + "learning_rate": 5.1118753966209745e-06, + "loss": 0.3963, + "step": 5809 + }, + { + "epoch": 1.631106120157215, + "grad_norm": 0.6296886801719666, + "learning_rate": 5.110242445819975e-06, + "loss": 0.3508, + "step": 5810 + }, + { + "epoch": 1.6313868613138687, + "grad_norm": 0.6544142961502075, + "learning_rate": 5.108609483254587e-06, + "loss": 0.3571, + "step": 5811 + }, + { + "epoch": 1.6316676024705221, + "grad_norm": 0.6321722269058228, + "learning_rate": 5.106976509099072e-06, + "loss": 0.3681, + "step": 5812 + }, + { + "epoch": 1.6319483436271758, + "grad_norm": 0.6789606213569641, + "learning_rate": 5.1053435235276885e-06, + "loss": 0.3842, + "step": 5813 + }, + { + "epoch": 1.6322290847838294, + "grad_norm": 0.580205500125885, + "learning_rate": 5.103710526714698e-06, + "loss": 0.3726, + "step": 5814 + }, + { + "epoch": 1.6325098259404829, + "grad_norm": 0.6430047750473022, + "learning_rate": 5.102077518834366e-06, + "loss": 0.3868, + "step": 5815 + }, + { + "epoch": 1.6327905670971363, + "grad_norm": 0.6262224316596985, + "learning_rate": 5.100444500060956e-06, + "loss": 0.3895, + "step": 5816 + }, + { + "epoch": 1.63307130825379, + "grad_norm": 0.6608339548110962, + "learning_rate": 5.098811470568733e-06, + "loss": 0.4123, + "step": 5817 + }, + { + "epoch": 1.6333520494104437, + "grad_norm": 0.5748572945594788, + "learning_rate": 5.097178430531966e-06, + "loss": 0.3776, + "step": 5818 + }, + { + "epoch": 1.633632790567097, + "grad_norm": 0.6591218709945679, + "learning_rate": 5.09554538012492e-06, + "loss": 0.3291, + "step": 5819 + }, + { + "epoch": 1.6339135317237508, + "grad_norm": 0.7473452687263489, + "learning_rate": 5.093912319521865e-06, + "loss": 0.4303, + "step": 5820 + }, + { + "epoch": 1.6341942728804044, + "grad_norm": 0.608173668384552, + "learning_rate": 5.092279248897071e-06, + "loss": 0.3635, + "step": 5821 + }, + { + "epoch": 1.6344750140370579, + "grad_norm": 0.6118491888046265, + "learning_rate": 5.090646168424809e-06, + "loss": 0.4119, + "step": 5822 + }, + { + "epoch": 1.6347557551937113, + "grad_norm": 0.7723951935768127, + "learning_rate": 5.089013078279353e-06, + "loss": 0.3836, + "step": 5823 + }, + { + "epoch": 1.635036496350365, + "grad_norm": 0.7044379115104675, + "learning_rate": 5.0873799786349755e-06, + "loss": 0.3862, + "step": 5824 + }, + { + "epoch": 1.6353172375070186, + "grad_norm": 0.6145278811454773, + "learning_rate": 5.085746869665948e-06, + "loss": 0.3978, + "step": 5825 + }, + { + "epoch": 1.635597978663672, + "grad_norm": 0.7669884562492371, + "learning_rate": 5.084113751546549e-06, + "loss": 0.3903, + "step": 5826 + }, + { + "epoch": 1.6358787198203255, + "grad_norm": 0.6864380240440369, + "learning_rate": 5.082480624451053e-06, + "loss": 0.4236, + "step": 5827 + }, + { + "epoch": 1.6361594609769792, + "grad_norm": 0.6729112863540649, + "learning_rate": 5.080847488553738e-06, + "loss": 0.3793, + "step": 5828 + }, + { + "epoch": 1.6364402021336328, + "grad_norm": 0.6278758645057678, + "learning_rate": 5.079214344028882e-06, + "loss": 0.3701, + "step": 5829 + }, + { + "epoch": 1.6367209432902863, + "grad_norm": 0.7034110426902771, + "learning_rate": 5.077581191050763e-06, + "loss": 0.3567, + "step": 5830 + }, + { + "epoch": 1.63700168444694, + "grad_norm": 0.7015382647514343, + "learning_rate": 5.075948029793663e-06, + "loss": 0.372, + "step": 5831 + }, + { + "epoch": 1.6372824256035936, + "grad_norm": 0.7056487798690796, + "learning_rate": 5.074314860431859e-06, + "loss": 0.3678, + "step": 5832 + }, + { + "epoch": 1.637563166760247, + "grad_norm": 0.6272594332695007, + "learning_rate": 5.072681683139636e-06, + "loss": 0.3777, + "step": 5833 + }, + { + "epoch": 1.6378439079169005, + "grad_norm": 0.6215318441390991, + "learning_rate": 5.071048498091278e-06, + "loss": 0.3847, + "step": 5834 + }, + { + "epoch": 1.6381246490735542, + "grad_norm": 0.6224110126495361, + "learning_rate": 5.0694153054610655e-06, + "loss": 0.4487, + "step": 5835 + }, + { + "epoch": 1.6384053902302078, + "grad_norm": 0.5982664823532104, + "learning_rate": 5.067782105423281e-06, + "loss": 0.3638, + "step": 5836 + }, + { + "epoch": 1.6386861313868613, + "grad_norm": 0.6761977672576904, + "learning_rate": 5.066148898152213e-06, + "loss": 0.357, + "step": 5837 + }, + { + "epoch": 1.638966872543515, + "grad_norm": 0.6112979650497437, + "learning_rate": 5.064515683822147e-06, + "loss": 0.4099, + "step": 5838 + }, + { + "epoch": 1.6392476137001686, + "grad_norm": 0.6694360375404358, + "learning_rate": 5.062882462607367e-06, + "loss": 0.3891, + "step": 5839 + }, + { + "epoch": 1.639528354856822, + "grad_norm": 0.6972580552101135, + "learning_rate": 5.061249234682164e-06, + "loss": 0.3683, + "step": 5840 + }, + { + "epoch": 1.6398090960134755, + "grad_norm": 0.6658865213394165, + "learning_rate": 5.059616000220822e-06, + "loss": 0.3701, + "step": 5841 + }, + { + "epoch": 1.6400898371701291, + "grad_norm": 0.645904541015625, + "learning_rate": 5.057982759397631e-06, + "loss": 0.3815, + "step": 5842 + }, + { + "epoch": 1.6403705783267828, + "grad_norm": 0.665382981300354, + "learning_rate": 5.056349512386879e-06, + "loss": 0.3217, + "step": 5843 + }, + { + "epoch": 1.6406513194834362, + "grad_norm": 0.5599098205566406, + "learning_rate": 5.0547162593628595e-06, + "loss": 0.3558, + "step": 5844 + }, + { + "epoch": 1.6409320606400897, + "grad_norm": 0.6961774230003357, + "learning_rate": 5.05308300049986e-06, + "loss": 0.4203, + "step": 5845 + }, + { + "epoch": 1.6412128017967436, + "grad_norm": 0.6967402696609497, + "learning_rate": 5.051449735972174e-06, + "loss": 0.3944, + "step": 5846 + }, + { + "epoch": 1.641493542953397, + "grad_norm": 0.6849242448806763, + "learning_rate": 5.0498164659540905e-06, + "loss": 0.3852, + "step": 5847 + }, + { + "epoch": 1.6417742841100504, + "grad_norm": 0.6335440874099731, + "learning_rate": 5.048183190619904e-06, + "loss": 0.3931, + "step": 5848 + }, + { + "epoch": 1.642055025266704, + "grad_norm": 0.6914854645729065, + "learning_rate": 5.046549910143907e-06, + "loss": 0.3897, + "step": 5849 + }, + { + "epoch": 1.6423357664233578, + "grad_norm": 0.62864750623703, + "learning_rate": 5.044916624700395e-06, + "loss": 0.4085, + "step": 5850 + }, + { + "epoch": 1.6426165075800112, + "grad_norm": 0.6330033540725708, + "learning_rate": 5.043283334463659e-06, + "loss": 0.3884, + "step": 5851 + }, + { + "epoch": 1.6428972487366647, + "grad_norm": 0.5764800906181335, + "learning_rate": 5.0416500396079936e-06, + "loss": 0.3579, + "step": 5852 + }, + { + "epoch": 1.6431779898933183, + "grad_norm": 0.8093725442886353, + "learning_rate": 5.040016740307696e-06, + "loss": 0.4166, + "step": 5853 + }, + { + "epoch": 1.643458731049972, + "grad_norm": 0.6535652279853821, + "learning_rate": 5.03838343673706e-06, + "loss": 0.3765, + "step": 5854 + }, + { + "epoch": 1.6437394722066254, + "grad_norm": 0.6010372042655945, + "learning_rate": 5.036750129070384e-06, + "loss": 0.3969, + "step": 5855 + }, + { + "epoch": 1.644020213363279, + "grad_norm": 0.6564108729362488, + "learning_rate": 5.035116817481962e-06, + "loss": 0.3771, + "step": 5856 + }, + { + "epoch": 1.6443009545199327, + "grad_norm": 0.6179079413414001, + "learning_rate": 5.03348350214609e-06, + "loss": 0.3837, + "step": 5857 + }, + { + "epoch": 1.6445816956765862, + "grad_norm": 0.6608940362930298, + "learning_rate": 5.031850183237068e-06, + "loss": 0.3372, + "step": 5858 + }, + { + "epoch": 1.6448624368332396, + "grad_norm": 0.6407791376113892, + "learning_rate": 5.030216860929192e-06, + "loss": 0.3997, + "step": 5859 + }, + { + "epoch": 1.6451431779898933, + "grad_norm": 0.7145275473594666, + "learning_rate": 5.02858353539676e-06, + "loss": 0.3847, + "step": 5860 + }, + { + "epoch": 1.645423919146547, + "grad_norm": 0.6945395469665527, + "learning_rate": 5.026950206814074e-06, + "loss": 0.3604, + "step": 5861 + }, + { + "epoch": 1.6457046603032004, + "grad_norm": 0.5824368596076965, + "learning_rate": 5.025316875355427e-06, + "loss": 0.3551, + "step": 5862 + }, + { + "epoch": 1.645985401459854, + "grad_norm": 0.6859902739524841, + "learning_rate": 5.023683541195121e-06, + "loss": 0.3463, + "step": 5863 + }, + { + "epoch": 1.6462661426165077, + "grad_norm": 0.6709204316139221, + "learning_rate": 5.022050204507455e-06, + "loss": 0.3613, + "step": 5864 + }, + { + "epoch": 1.6465468837731612, + "grad_norm": 0.6162554621696472, + "learning_rate": 5.020416865466728e-06, + "loss": 0.3416, + "step": 5865 + }, + { + "epoch": 1.6468276249298146, + "grad_norm": 0.6482139229774475, + "learning_rate": 5.01878352424724e-06, + "loss": 0.3716, + "step": 5866 + }, + { + "epoch": 1.6471083660864683, + "grad_norm": 0.6603090763092041, + "learning_rate": 5.017150181023291e-06, + "loss": 0.3813, + "step": 5867 + }, + { + "epoch": 1.647389107243122, + "grad_norm": 0.5874245166778564, + "learning_rate": 5.015516835969182e-06, + "loss": 0.412, + "step": 5868 + }, + { + "epoch": 1.6476698483997754, + "grad_norm": 0.699480414390564, + "learning_rate": 5.013883489259212e-06, + "loss": 0.3763, + "step": 5869 + }, + { + "epoch": 1.6479505895564288, + "grad_norm": 0.7026716470718384, + "learning_rate": 5.012250141067683e-06, + "loss": 0.3454, + "step": 5870 + }, + { + "epoch": 1.6482313307130827, + "grad_norm": 0.8506357669830322, + "learning_rate": 5.010616791568894e-06, + "loss": 0.4086, + "step": 5871 + }, + { + "epoch": 1.6485120718697361, + "grad_norm": 0.6266496777534485, + "learning_rate": 5.008983440937147e-06, + "loss": 0.3934, + "step": 5872 + }, + { + "epoch": 1.6487928130263896, + "grad_norm": 0.6439083218574524, + "learning_rate": 5.007350089346744e-06, + "loss": 0.3812, + "step": 5873 + }, + { + "epoch": 1.6490735541830432, + "grad_norm": 0.6669334173202515, + "learning_rate": 5.005716736971985e-06, + "loss": 0.3783, + "step": 5874 + }, + { + "epoch": 1.649354295339697, + "grad_norm": 0.7153447866439819, + "learning_rate": 5.004083383987172e-06, + "loss": 0.4105, + "step": 5875 + }, + { + "epoch": 1.6496350364963503, + "grad_norm": 0.6811407804489136, + "learning_rate": 5.002450030566604e-06, + "loss": 0.4035, + "step": 5876 + }, + { + "epoch": 1.6499157776530038, + "grad_norm": 0.6322611570358276, + "learning_rate": 5.000816676884586e-06, + "loss": 0.4038, + "step": 5877 + }, + { + "epoch": 1.6501965188096575, + "grad_norm": 0.7674782872200012, + "learning_rate": 4.999183323115416e-06, + "loss": 0.4112, + "step": 5878 + }, + { + "epoch": 1.6504772599663111, + "grad_norm": 0.6602808833122253, + "learning_rate": 4.9975499694333974e-06, + "loss": 0.4213, + "step": 5879 + }, + { + "epoch": 1.6507580011229646, + "grad_norm": 0.6431399583816528, + "learning_rate": 4.99591661601283e-06, + "loss": 0.3523, + "step": 5880 + }, + { + "epoch": 1.6510387422796182, + "grad_norm": 0.7292598485946655, + "learning_rate": 4.994283263028016e-06, + "loss": 0.3859, + "step": 5881 + }, + { + "epoch": 1.6513194834362719, + "grad_norm": 0.6640633940696716, + "learning_rate": 4.9926499106532575e-06, + "loss": 0.3742, + "step": 5882 + }, + { + "epoch": 1.6516002245929253, + "grad_norm": 0.583101749420166, + "learning_rate": 4.991016559062854e-06, + "loss": 0.3665, + "step": 5883 + }, + { + "epoch": 1.6518809657495788, + "grad_norm": 0.6108763217926025, + "learning_rate": 4.9893832084311085e-06, + "loss": 0.3857, + "step": 5884 + }, + { + "epoch": 1.6521617069062324, + "grad_norm": 0.6275140047073364, + "learning_rate": 4.987749858932321e-06, + "loss": 0.3725, + "step": 5885 + }, + { + "epoch": 1.652442448062886, + "grad_norm": 0.8515222668647766, + "learning_rate": 4.986116510740791e-06, + "loss": 0.4015, + "step": 5886 + }, + { + "epoch": 1.6527231892195395, + "grad_norm": 0.6848206520080566, + "learning_rate": 4.984483164030821e-06, + "loss": 0.4262, + "step": 5887 + }, + { + "epoch": 1.6530039303761932, + "grad_norm": 0.635560929775238, + "learning_rate": 4.982849818976711e-06, + "loss": 0.3727, + "step": 5888 + }, + { + "epoch": 1.6532846715328469, + "grad_norm": 0.671344518661499, + "learning_rate": 4.981216475752763e-06, + "loss": 0.3595, + "step": 5889 + }, + { + "epoch": 1.6535654126895003, + "grad_norm": 0.6194545030593872, + "learning_rate": 4.979583134533275e-06, + "loss": 0.334, + "step": 5890 + }, + { + "epoch": 1.6538461538461537, + "grad_norm": 0.6954805254936218, + "learning_rate": 4.977949795492546e-06, + "loss": 0.3659, + "step": 5891 + }, + { + "epoch": 1.6541268950028074, + "grad_norm": 0.6220340132713318, + "learning_rate": 4.97631645880488e-06, + "loss": 0.3656, + "step": 5892 + }, + { + "epoch": 1.654407636159461, + "grad_norm": 0.6406189799308777, + "learning_rate": 4.974683124644573e-06, + "loss": 0.3449, + "step": 5893 + }, + { + "epoch": 1.6546883773161145, + "grad_norm": 0.7023875713348389, + "learning_rate": 4.973049793185928e-06, + "loss": 0.3711, + "step": 5894 + }, + { + "epoch": 1.654969118472768, + "grad_norm": 0.730359673500061, + "learning_rate": 4.97141646460324e-06, + "loss": 0.4041, + "step": 5895 + }, + { + "epoch": 1.6552498596294218, + "grad_norm": 0.6749394536018372, + "learning_rate": 4.969783139070809e-06, + "loss": 0.4101, + "step": 5896 + }, + { + "epoch": 1.6555306007860753, + "grad_norm": 0.5995941758155823, + "learning_rate": 4.968149816762933e-06, + "loss": 0.3984, + "step": 5897 + }, + { + "epoch": 1.6558113419427287, + "grad_norm": 0.6909562945365906, + "learning_rate": 4.966516497853911e-06, + "loss": 0.3919, + "step": 5898 + }, + { + "epoch": 1.6560920830993824, + "grad_norm": 0.7594336271286011, + "learning_rate": 4.96488318251804e-06, + "loss": 0.3773, + "step": 5899 + }, + { + "epoch": 1.656372824256036, + "grad_norm": 0.7784666419029236, + "learning_rate": 4.963249870929619e-06, + "loss": 0.4054, + "step": 5900 + }, + { + "epoch": 1.6566535654126895, + "grad_norm": 0.5823115110397339, + "learning_rate": 4.961616563262941e-06, + "loss": 0.3981, + "step": 5901 + }, + { + "epoch": 1.656934306569343, + "grad_norm": 0.7006237506866455, + "learning_rate": 4.959983259692305e-06, + "loss": 0.3813, + "step": 5902 + }, + { + "epoch": 1.6572150477259966, + "grad_norm": 0.6254827380180359, + "learning_rate": 4.958349960392007e-06, + "loss": 0.3993, + "step": 5903 + }, + { + "epoch": 1.6574957888826503, + "grad_norm": 0.6537759900093079, + "learning_rate": 4.956716665536342e-06, + "loss": 0.3271, + "step": 5904 + }, + { + "epoch": 1.6577765300393037, + "grad_norm": 0.6395942568778992, + "learning_rate": 4.955083375299606e-06, + "loss": 0.4178, + "step": 5905 + }, + { + "epoch": 1.6580572711959574, + "grad_norm": 0.6403214931488037, + "learning_rate": 4.953450089856094e-06, + "loss": 0.4017, + "step": 5906 + }, + { + "epoch": 1.658338012352611, + "grad_norm": 0.6025586724281311, + "learning_rate": 4.951816809380098e-06, + "loss": 0.3854, + "step": 5907 + }, + { + "epoch": 1.6586187535092645, + "grad_norm": 0.6520612835884094, + "learning_rate": 4.950183534045911e-06, + "loss": 0.3765, + "step": 5908 + }, + { + "epoch": 1.658899494665918, + "grad_norm": 0.6875594258308411, + "learning_rate": 4.948550264027828e-06, + "loss": 0.3818, + "step": 5909 + }, + { + "epoch": 1.6591802358225716, + "grad_norm": 0.5716404914855957, + "learning_rate": 4.946916999500141e-06, + "loss": 0.4154, + "step": 5910 + }, + { + "epoch": 1.6594609769792252, + "grad_norm": 0.6810140013694763, + "learning_rate": 4.945283740637142e-06, + "loss": 0.3967, + "step": 5911 + }, + { + "epoch": 1.6597417181358787, + "grad_norm": 0.7087152600288391, + "learning_rate": 4.943650487613123e-06, + "loss": 0.3401, + "step": 5912 + }, + { + "epoch": 1.6600224592925323, + "grad_norm": 0.5920955538749695, + "learning_rate": 4.942017240602373e-06, + "loss": 0.3554, + "step": 5913 + }, + { + "epoch": 1.660303200449186, + "grad_norm": 0.7348477244377136, + "learning_rate": 4.940383999779182e-06, + "loss": 0.3658, + "step": 5914 + }, + { + "epoch": 1.6605839416058394, + "grad_norm": 0.6357693076133728, + "learning_rate": 4.938750765317839e-06, + "loss": 0.3862, + "step": 5915 + }, + { + "epoch": 1.6608646827624929, + "grad_norm": 0.6174179911613464, + "learning_rate": 4.937117537392633e-06, + "loss": 0.3638, + "step": 5916 + }, + { + "epoch": 1.6611454239191465, + "grad_norm": 0.6077427268028259, + "learning_rate": 4.935484316177854e-06, + "loss": 0.3908, + "step": 5917 + }, + { + "epoch": 1.6614261650758002, + "grad_norm": 0.7190478444099426, + "learning_rate": 4.933851101847787e-06, + "loss": 0.3929, + "step": 5918 + }, + { + "epoch": 1.6617069062324537, + "grad_norm": 0.6962661743164062, + "learning_rate": 4.932217894576718e-06, + "loss": 0.3821, + "step": 5919 + }, + { + "epoch": 1.661987647389107, + "grad_norm": 0.714051365852356, + "learning_rate": 4.930584694538935e-06, + "loss": 0.3751, + "step": 5920 + }, + { + "epoch": 1.6622683885457608, + "grad_norm": 0.669011116027832, + "learning_rate": 4.928951501908724e-06, + "loss": 0.3792, + "step": 5921 + }, + { + "epoch": 1.6625491297024144, + "grad_norm": 0.5968210101127625, + "learning_rate": 4.927318316860364e-06, + "loss": 0.3719, + "step": 5922 + }, + { + "epoch": 1.6628298708590679, + "grad_norm": 0.6652485728263855, + "learning_rate": 4.925685139568142e-06, + "loss": 0.378, + "step": 5923 + }, + { + "epoch": 1.6631106120157215, + "grad_norm": 0.6216427683830261, + "learning_rate": 4.92405197020634e-06, + "loss": 0.367, + "step": 5924 + }, + { + "epoch": 1.6633913531723752, + "grad_norm": 0.6448199152946472, + "learning_rate": 4.922418808949238e-06, + "loss": 0.3701, + "step": 5925 + }, + { + "epoch": 1.6636720943290286, + "grad_norm": 0.684476912021637, + "learning_rate": 4.92078565597112e-06, + "loss": 0.3452, + "step": 5926 + }, + { + "epoch": 1.663952835485682, + "grad_norm": 0.5752292275428772, + "learning_rate": 4.919152511446264e-06, + "loss": 0.3826, + "step": 5927 + }, + { + "epoch": 1.6642335766423357, + "grad_norm": 0.6469265222549438, + "learning_rate": 4.917519375548949e-06, + "loss": 0.401, + "step": 5928 + }, + { + "epoch": 1.6645143177989894, + "grad_norm": 0.65733402967453, + "learning_rate": 4.915886248453453e-06, + "loss": 0.3856, + "step": 5929 + }, + { + "epoch": 1.6647950589556428, + "grad_norm": 0.6798132658004761, + "learning_rate": 4.914253130334053e-06, + "loss": 0.4177, + "step": 5930 + }, + { + "epoch": 1.6650758001122965, + "grad_norm": 0.7080673575401306, + "learning_rate": 4.912620021365026e-06, + "loss": 0.3782, + "step": 5931 + }, + { + "epoch": 1.6653565412689502, + "grad_norm": 0.7040212750434875, + "learning_rate": 4.9109869217206475e-06, + "loss": 0.3815, + "step": 5932 + }, + { + "epoch": 1.6656372824256036, + "grad_norm": 0.6813741326332092, + "learning_rate": 4.909353831575192e-06, + "loss": 0.3652, + "step": 5933 + }, + { + "epoch": 1.665918023582257, + "grad_norm": 0.683027446269989, + "learning_rate": 4.9077207511029315e-06, + "loss": 0.3928, + "step": 5934 + }, + { + "epoch": 1.6661987647389107, + "grad_norm": 0.6870583891868591, + "learning_rate": 4.906087680478137e-06, + "loss": 0.3485, + "step": 5935 + }, + { + "epoch": 1.6664795058955644, + "grad_norm": 0.5466635227203369, + "learning_rate": 4.9044546198750825e-06, + "loss": 0.3595, + "step": 5936 + }, + { + "epoch": 1.6667602470522178, + "grad_norm": 0.6759864687919617, + "learning_rate": 4.902821569468036e-06, + "loss": 0.3517, + "step": 5937 + }, + { + "epoch": 1.6670409882088713, + "grad_norm": 0.6421211957931519, + "learning_rate": 4.901188529431268e-06, + "loss": 0.3987, + "step": 5938 + }, + { + "epoch": 1.6673217293655251, + "grad_norm": 0.7061609029769897, + "learning_rate": 4.899555499939046e-06, + "loss": 0.389, + "step": 5939 + }, + { + "epoch": 1.6676024705221786, + "grad_norm": 0.6985163688659668, + "learning_rate": 4.897922481165636e-06, + "loss": 0.4113, + "step": 5940 + }, + { + "epoch": 1.667883211678832, + "grad_norm": 0.5786982178688049, + "learning_rate": 4.896289473285304e-06, + "loss": 0.3908, + "step": 5941 + }, + { + "epoch": 1.6681639528354857, + "grad_norm": 0.6494597792625427, + "learning_rate": 4.894656476472312e-06, + "loss": 0.3873, + "step": 5942 + }, + { + "epoch": 1.6684446939921393, + "grad_norm": 0.6812187433242798, + "learning_rate": 4.893023490900929e-06, + "loss": 0.3664, + "step": 5943 + }, + { + "epoch": 1.6687254351487928, + "grad_norm": 0.7137150168418884, + "learning_rate": 4.891390516745413e-06, + "loss": 0.4004, + "step": 5944 + }, + { + "epoch": 1.6690061763054462, + "grad_norm": 0.6889640688896179, + "learning_rate": 4.889757554180026e-06, + "loss": 0.3971, + "step": 5945 + }, + { + "epoch": 1.6692869174621, + "grad_norm": 0.6488991379737854, + "learning_rate": 4.888124603379026e-06, + "loss": 0.3733, + "step": 5946 + }, + { + "epoch": 1.6695676586187536, + "grad_norm": 0.6044003367424011, + "learning_rate": 4.886491664516674e-06, + "loss": 0.3866, + "step": 5947 + }, + { + "epoch": 1.669848399775407, + "grad_norm": 0.6599321961402893, + "learning_rate": 4.884858737767225e-06, + "loss": 0.3477, + "step": 5948 + }, + { + "epoch": 1.6701291409320607, + "grad_norm": 0.7093262672424316, + "learning_rate": 4.883225823304936e-06, + "loss": 0.3957, + "step": 5949 + }, + { + "epoch": 1.6704098820887143, + "grad_norm": 0.7099141478538513, + "learning_rate": 4.881592921304061e-06, + "loss": 0.3659, + "step": 5950 + }, + { + "epoch": 1.6706906232453678, + "grad_norm": 0.5497747659683228, + "learning_rate": 4.879960031938852e-06, + "loss": 0.4154, + "step": 5951 + }, + { + "epoch": 1.6709713644020212, + "grad_norm": 0.6621913313865662, + "learning_rate": 4.8783271553835635e-06, + "loss": 0.4363, + "step": 5952 + }, + { + "epoch": 1.6712521055586749, + "grad_norm": 0.5906604528427124, + "learning_rate": 4.876694291812443e-06, + "loss": 0.3754, + "step": 5953 + }, + { + "epoch": 1.6715328467153285, + "grad_norm": 0.6203334331512451, + "learning_rate": 4.8750614413997414e-06, + "loss": 0.3653, + "step": 5954 + }, + { + "epoch": 1.671813587871982, + "grad_norm": 0.5811553001403809, + "learning_rate": 4.8734286043197064e-06, + "loss": 0.3847, + "step": 5955 + }, + { + "epoch": 1.6720943290286356, + "grad_norm": 0.6775395274162292, + "learning_rate": 4.871795780746583e-06, + "loss": 0.4004, + "step": 5956 + }, + { + "epoch": 1.6723750701852893, + "grad_norm": 0.6454271674156189, + "learning_rate": 4.870162970854617e-06, + "loss": 0.3712, + "step": 5957 + }, + { + "epoch": 1.6726558113419427, + "grad_norm": 0.6467093229293823, + "learning_rate": 4.8685301748180505e-06, + "loss": 0.4088, + "step": 5958 + }, + { + "epoch": 1.6729365524985962, + "grad_norm": 0.6088839173316956, + "learning_rate": 4.866897392811127e-06, + "loss": 0.3681, + "step": 5959 + }, + { + "epoch": 1.6732172936552498, + "grad_norm": 0.7110549211502075, + "learning_rate": 4.8652646250080855e-06, + "loss": 0.4544, + "step": 5960 + }, + { + "epoch": 1.6734980348119035, + "grad_norm": 0.6865145564079285, + "learning_rate": 4.8636318715831665e-06, + "loss": 0.4096, + "step": 5961 + }, + { + "epoch": 1.673778775968557, + "grad_norm": 0.6610509157180786, + "learning_rate": 4.861999132710606e-06, + "loss": 0.3895, + "step": 5962 + }, + { + "epoch": 1.6740595171252104, + "grad_norm": 0.6774160861968994, + "learning_rate": 4.860366408564638e-06, + "loss": 0.4073, + "step": 5963 + }, + { + "epoch": 1.6743402582818643, + "grad_norm": 0.6872767806053162, + "learning_rate": 4.8587336993195e-06, + "loss": 0.3659, + "step": 5964 + }, + { + "epoch": 1.6746209994385177, + "grad_norm": 0.6550089120864868, + "learning_rate": 4.857101005149422e-06, + "loss": 0.3616, + "step": 5965 + }, + { + "epoch": 1.6749017405951712, + "grad_norm": 0.6324899792671204, + "learning_rate": 4.855468326228638e-06, + "loss": 0.4, + "step": 5966 + }, + { + "epoch": 1.6751824817518248, + "grad_norm": 0.6363287568092346, + "learning_rate": 4.853835662731372e-06, + "loss": 0.4195, + "step": 5967 + }, + { + "epoch": 1.6754632229084785, + "grad_norm": 0.6213005185127258, + "learning_rate": 4.852203014831858e-06, + "loss": 0.3967, + "step": 5968 + }, + { + "epoch": 1.675743964065132, + "grad_norm": 0.6869015097618103, + "learning_rate": 4.850570382704319e-06, + "loss": 0.3737, + "step": 5969 + }, + { + "epoch": 1.6760247052217854, + "grad_norm": 0.5972256660461426, + "learning_rate": 4.84893776652298e-06, + "loss": 0.3792, + "step": 5970 + }, + { + "epoch": 1.676305446378439, + "grad_norm": 0.6419148445129395, + "learning_rate": 4.847305166462062e-06, + "loss": 0.4108, + "step": 5971 + }, + { + "epoch": 1.6765861875350927, + "grad_norm": 0.6109099388122559, + "learning_rate": 4.8456725826957875e-06, + "loss": 0.3746, + "step": 5972 + }, + { + "epoch": 1.6768669286917461, + "grad_norm": 0.6137343049049377, + "learning_rate": 4.844040015398375e-06, + "loss": 0.3884, + "step": 5973 + }, + { + "epoch": 1.6771476698483998, + "grad_norm": 0.6361182332038879, + "learning_rate": 4.842407464744043e-06, + "loss": 0.3591, + "step": 5974 + }, + { + "epoch": 1.6774284110050535, + "grad_norm": 0.5942631363868713, + "learning_rate": 4.840774930907005e-06, + "loss": 0.3525, + "step": 5975 + }, + { + "epoch": 1.677709152161707, + "grad_norm": 0.6512374877929688, + "learning_rate": 4.839142414061478e-06, + "loss": 0.3744, + "step": 5976 + }, + { + "epoch": 1.6779898933183603, + "grad_norm": 0.6963050961494446, + "learning_rate": 4.837509914381671e-06, + "loss": 0.3882, + "step": 5977 + }, + { + "epoch": 1.678270634475014, + "grad_norm": 0.6489622592926025, + "learning_rate": 4.835877432041796e-06, + "loss": 0.3513, + "step": 5978 + }, + { + "epoch": 1.6785513756316677, + "grad_norm": 0.7303745746612549, + "learning_rate": 4.834244967216061e-06, + "loss": 0.353, + "step": 5979 + }, + { + "epoch": 1.6788321167883211, + "grad_norm": 0.7013206481933594, + "learning_rate": 4.832612520078671e-06, + "loss": 0.3722, + "step": 5980 + }, + { + "epoch": 1.6791128579449748, + "grad_norm": 0.738486647605896, + "learning_rate": 4.830980090803834e-06, + "loss": 0.4062, + "step": 5981 + }, + { + "epoch": 1.6793935991016284, + "grad_norm": 0.755413830280304, + "learning_rate": 4.82934767956575e-06, + "loss": 0.3573, + "step": 5982 + }, + { + "epoch": 1.6796743402582819, + "grad_norm": 0.6180960536003113, + "learning_rate": 4.82771528653862e-06, + "loss": 0.3952, + "step": 5983 + }, + { + "epoch": 1.6799550814149353, + "grad_norm": 0.6720296740531921, + "learning_rate": 4.826082911896643e-06, + "loss": 0.3548, + "step": 5984 + }, + { + "epoch": 1.680235822571589, + "grad_norm": 0.6452472805976868, + "learning_rate": 4.824450555814016e-06, + "loss": 0.3591, + "step": 5985 + }, + { + "epoch": 1.6805165637282427, + "grad_norm": 0.6786510348320007, + "learning_rate": 4.822818218464934e-06, + "loss": 0.3983, + "step": 5986 + }, + { + "epoch": 1.680797304884896, + "grad_norm": 0.725750207901001, + "learning_rate": 4.8211859000235905e-06, + "loss": 0.3933, + "step": 5987 + }, + { + "epoch": 1.6810780460415495, + "grad_norm": 0.641161322593689, + "learning_rate": 4.819553600664175e-06, + "loss": 0.4031, + "step": 5988 + }, + { + "epoch": 1.6813587871982034, + "grad_norm": 0.6939529180526733, + "learning_rate": 4.8179213205608784e-06, + "loss": 0.3522, + "step": 5989 + }, + { + "epoch": 1.6816395283548569, + "grad_norm": 0.593222439289093, + "learning_rate": 4.8162890598878855e-06, + "loss": 0.3768, + "step": 5990 + }, + { + "epoch": 1.6819202695115103, + "grad_norm": 0.6571763157844543, + "learning_rate": 4.814656818819381e-06, + "loss": 0.3648, + "step": 5991 + }, + { + "epoch": 1.682201010668164, + "grad_norm": 0.6553687453269958, + "learning_rate": 4.8130245975295486e-06, + "loss": 0.377, + "step": 5992 + }, + { + "epoch": 1.6824817518248176, + "grad_norm": 0.6839633584022522, + "learning_rate": 4.811392396192569e-06, + "loss": 0.3978, + "step": 5993 + }, + { + "epoch": 1.682762492981471, + "grad_norm": 0.6256217360496521, + "learning_rate": 4.809760214982619e-06, + "loss": 0.3806, + "step": 5994 + }, + { + "epoch": 1.6830432341381245, + "grad_norm": 0.6105203628540039, + "learning_rate": 4.808128054073876e-06, + "loss": 0.3426, + "step": 5995 + }, + { + "epoch": 1.6833239752947782, + "grad_norm": 0.5967744588851929, + "learning_rate": 4.806495913640515e-06, + "loss": 0.3889, + "step": 5996 + }, + { + "epoch": 1.6836047164514318, + "grad_norm": 0.6517393589019775, + "learning_rate": 4.804863793856706e-06, + "loss": 0.3642, + "step": 5997 + }, + { + "epoch": 1.6838854576080853, + "grad_norm": 0.6016674041748047, + "learning_rate": 4.803231694896621e-06, + "loss": 0.3625, + "step": 5998 + }, + { + "epoch": 1.684166198764739, + "grad_norm": 0.6955810189247131, + "learning_rate": 4.801599616934424e-06, + "loss": 0.4115, + "step": 5999 + }, + { + "epoch": 1.6844469399213926, + "grad_norm": 0.6546865105628967, + "learning_rate": 4.799967560144283e-06, + "loss": 0.3695, + "step": 6000 + }, + { + "epoch": 1.684727681078046, + "grad_norm": 0.6968745589256287, + "learning_rate": 4.798335524700359e-06, + "loss": 0.3643, + "step": 6001 + }, + { + "epoch": 1.6850084222346995, + "grad_norm": 0.6732944250106812, + "learning_rate": 4.796703510776814e-06, + "loss": 0.3941, + "step": 6002 + }, + { + "epoch": 1.6852891633913532, + "grad_norm": 0.5669814944267273, + "learning_rate": 4.795071518547807e-06, + "loss": 0.3801, + "step": 6003 + }, + { + "epoch": 1.6855699045480068, + "grad_norm": 0.6808241605758667, + "learning_rate": 4.793439548187494e-06, + "loss": 0.3587, + "step": 6004 + }, + { + "epoch": 1.6858506457046603, + "grad_norm": 0.7255261540412903, + "learning_rate": 4.791807599870026e-06, + "loss": 0.3839, + "step": 6005 + }, + { + "epoch": 1.686131386861314, + "grad_norm": 0.607886016368866, + "learning_rate": 4.790175673769557e-06, + "loss": 0.396, + "step": 6006 + }, + { + "epoch": 1.6864121280179676, + "grad_norm": 0.6321319937705994, + "learning_rate": 4.788543770060235e-06, + "loss": 0.3798, + "step": 6007 + }, + { + "epoch": 1.686692869174621, + "grad_norm": 0.6118468046188354, + "learning_rate": 4.786911888916207e-06, + "loss": 0.3922, + "step": 6008 + }, + { + "epoch": 1.6869736103312745, + "grad_norm": 0.6063696146011353, + "learning_rate": 4.785280030511617e-06, + "loss": 0.3926, + "step": 6009 + }, + { + "epoch": 1.6872543514879281, + "grad_norm": 0.6252252459526062, + "learning_rate": 4.783648195020608e-06, + "loss": 0.3608, + "step": 6010 + }, + { + "epoch": 1.6875350926445818, + "grad_norm": 0.6016137599945068, + "learning_rate": 4.782016382617317e-06, + "loss": 0.3927, + "step": 6011 + }, + { + "epoch": 1.6878158338012352, + "grad_norm": 0.8158223628997803, + "learning_rate": 4.780384593475882e-06, + "loss": 0.4098, + "step": 6012 + }, + { + "epoch": 1.6880965749578887, + "grad_norm": 0.7017393112182617, + "learning_rate": 4.778752827770439e-06, + "loss": 0.4322, + "step": 6013 + }, + { + "epoch": 1.6883773161145423, + "grad_norm": 0.6651670932769775, + "learning_rate": 4.777121085675116e-06, + "loss": 0.3957, + "step": 6014 + }, + { + "epoch": 1.688658057271196, + "grad_norm": 0.6182573437690735, + "learning_rate": 4.775489367364047e-06, + "loss": 0.3486, + "step": 6015 + }, + { + "epoch": 1.6889387984278494, + "grad_norm": 0.7296952605247498, + "learning_rate": 4.773857673011356e-06, + "loss": 0.3876, + "step": 6016 + }, + { + "epoch": 1.689219539584503, + "grad_norm": 0.6980814933776855, + "learning_rate": 4.772226002791168e-06, + "loss": 0.3805, + "step": 6017 + }, + { + "epoch": 1.6895002807411568, + "grad_norm": 0.6811695098876953, + "learning_rate": 4.7705943568776015e-06, + "loss": 0.3997, + "step": 6018 + }, + { + "epoch": 1.6897810218978102, + "grad_norm": 0.5936205387115479, + "learning_rate": 4.768962735444781e-06, + "loss": 0.4092, + "step": 6019 + }, + { + "epoch": 1.6900617630544637, + "grad_norm": 0.6474246382713318, + "learning_rate": 4.767331138666822e-06, + "loss": 0.4018, + "step": 6020 + }, + { + "epoch": 1.6903425042111173, + "grad_norm": 0.6197667717933655, + "learning_rate": 4.765699566717835e-06, + "loss": 0.3318, + "step": 6021 + }, + { + "epoch": 1.690623245367771, + "grad_norm": 0.6729341745376587, + "learning_rate": 4.764068019771934e-06, + "loss": 0.3744, + "step": 6022 + }, + { + "epoch": 1.6909039865244244, + "grad_norm": 0.685239851474762, + "learning_rate": 4.762436498003227e-06, + "loss": 0.3818, + "step": 6023 + }, + { + "epoch": 1.691184727681078, + "grad_norm": 0.6004669070243835, + "learning_rate": 4.760805001585819e-06, + "loss": 0.3839, + "step": 6024 + }, + { + "epoch": 1.6914654688377317, + "grad_norm": 0.528843104839325, + "learning_rate": 4.7591735306938144e-06, + "loss": 0.3775, + "step": 6025 + }, + { + "epoch": 1.6917462099943852, + "grad_norm": 0.6972212791442871, + "learning_rate": 4.757542085501314e-06, + "loss": 0.3702, + "step": 6026 + }, + { + "epoch": 1.6920269511510386, + "grad_norm": 0.6752337217330933, + "learning_rate": 4.755910666182413e-06, + "loss": 0.3602, + "step": 6027 + }, + { + "epoch": 1.6923076923076923, + "grad_norm": 0.6402177810668945, + "learning_rate": 4.754279272911208e-06, + "loss": 0.3978, + "step": 6028 + }, + { + "epoch": 1.692588433464346, + "grad_norm": 0.6916014552116394, + "learning_rate": 4.7526479058617904e-06, + "loss": 0.3876, + "step": 6029 + }, + { + "epoch": 1.6928691746209994, + "grad_norm": 0.6594701409339905, + "learning_rate": 4.751016565208251e-06, + "loss": 0.411, + "step": 6030 + }, + { + "epoch": 1.693149915777653, + "grad_norm": 0.678952693939209, + "learning_rate": 4.749385251124675e-06, + "loss": 0.3663, + "step": 6031 + }, + { + "epoch": 1.6934306569343067, + "grad_norm": 0.6876115202903748, + "learning_rate": 4.7477539637851475e-06, + "loss": 0.4004, + "step": 6032 + }, + { + "epoch": 1.6937113980909602, + "grad_norm": 0.7282420992851257, + "learning_rate": 4.746122703363748e-06, + "loss": 0.3702, + "step": 6033 + }, + { + "epoch": 1.6939921392476136, + "grad_norm": 0.7316907644271851, + "learning_rate": 4.744491470034554e-06, + "loss": 0.4198, + "step": 6034 + }, + { + "epoch": 1.6942728804042673, + "grad_norm": 0.7218764424324036, + "learning_rate": 4.742860263971642e-06, + "loss": 0.4007, + "step": 6035 + }, + { + "epoch": 1.694553621560921, + "grad_norm": 0.6547330021858215, + "learning_rate": 4.741229085349083e-06, + "loss": 0.3817, + "step": 6036 + }, + { + "epoch": 1.6948343627175744, + "grad_norm": 0.7427129149436951, + "learning_rate": 4.7395979343409475e-06, + "loss": 0.3925, + "step": 6037 + }, + { + "epoch": 1.6951151038742278, + "grad_norm": 0.6655198931694031, + "learning_rate": 4.737966811121302e-06, + "loss": 0.3621, + "step": 6038 + }, + { + "epoch": 1.6953958450308815, + "grad_norm": 0.6993547081947327, + "learning_rate": 4.736335715864207e-06, + "loss": 0.4127, + "step": 6039 + }, + { + "epoch": 1.6956765861875351, + "grad_norm": 0.7520627975463867, + "learning_rate": 4.734704648743726e-06, + "loss": 0.4052, + "step": 6040 + }, + { + "epoch": 1.6959573273441886, + "grad_norm": 0.6116990447044373, + "learning_rate": 4.733073609933915e-06, + "loss": 0.3693, + "step": 6041 + }, + { + "epoch": 1.6962380685008422, + "grad_norm": 0.7016486525535583, + "learning_rate": 4.731442599608828e-06, + "loss": 0.3685, + "step": 6042 + }, + { + "epoch": 1.696518809657496, + "grad_norm": 0.7572161555290222, + "learning_rate": 4.729811617942515e-06, + "loss": 0.4026, + "step": 6043 + }, + { + "epoch": 1.6967995508141493, + "grad_norm": 0.7121775150299072, + "learning_rate": 4.728180665109028e-06, + "loss": 0.3877, + "step": 6044 + }, + { + "epoch": 1.6970802919708028, + "grad_norm": 0.6513147354125977, + "learning_rate": 4.7265497412824096e-06, + "loss": 0.4139, + "step": 6045 + }, + { + "epoch": 1.6973610331274565, + "grad_norm": 0.6890811920166016, + "learning_rate": 4.724918846636703e-06, + "loss": 0.3418, + "step": 6046 + }, + { + "epoch": 1.6976417742841101, + "grad_norm": 0.5832764506340027, + "learning_rate": 4.723287981345947e-06, + "loss": 0.3561, + "step": 6047 + }, + { + "epoch": 1.6979225154407636, + "grad_norm": 0.658863365650177, + "learning_rate": 4.721657145584176e-06, + "loss": 0.4022, + "step": 6048 + }, + { + "epoch": 1.6982032565974172, + "grad_norm": 0.7016426920890808, + "learning_rate": 4.7200263395254235e-06, + "loss": 0.3924, + "step": 6049 + }, + { + "epoch": 1.6984839977540709, + "grad_norm": 0.5901565551757812, + "learning_rate": 4.718395563343718e-06, + "loss": 0.3872, + "step": 6050 + }, + { + "epoch": 1.6987647389107243, + "grad_norm": 0.6345401406288147, + "learning_rate": 4.716764817213088e-06, + "loss": 0.3997, + "step": 6051 + }, + { + "epoch": 1.6990454800673778, + "grad_norm": 0.6664313077926636, + "learning_rate": 4.715134101307555e-06, + "loss": 0.3864, + "step": 6052 + }, + { + "epoch": 1.6993262212240314, + "grad_norm": 0.6883613467216492, + "learning_rate": 4.713503415801141e-06, + "loss": 0.3684, + "step": 6053 + }, + { + "epoch": 1.699606962380685, + "grad_norm": 0.65483558177948, + "learning_rate": 4.711872760867859e-06, + "loss": 0.3698, + "step": 6054 + }, + { + "epoch": 1.6998877035373385, + "grad_norm": 0.6401641964912415, + "learning_rate": 4.710242136681725e-06, + "loss": 0.3714, + "step": 6055 + }, + { + "epoch": 1.700168444693992, + "grad_norm": 0.7197986841201782, + "learning_rate": 4.708611543416747e-06, + "loss": 0.4473, + "step": 6056 + }, + { + "epoch": 1.7004491858506459, + "grad_norm": 0.7501349449157715, + "learning_rate": 4.706980981246934e-06, + "loss": 0.3615, + "step": 6057 + }, + { + "epoch": 1.7007299270072993, + "grad_norm": 0.7164209485054016, + "learning_rate": 4.705350450346289e-06, + "loss": 0.3721, + "step": 6058 + }, + { + "epoch": 1.7010106681639527, + "grad_norm": 0.62754887342453, + "learning_rate": 4.703719950888811e-06, + "loss": 0.4012, + "step": 6059 + }, + { + "epoch": 1.7012914093206064, + "grad_norm": 0.6176878809928894, + "learning_rate": 4.702089483048497e-06, + "loss": 0.3734, + "step": 6060 + }, + { + "epoch": 1.70157215047726, + "grad_norm": 0.6346988677978516, + "learning_rate": 4.700459046999341e-06, + "loss": 0.3895, + "step": 6061 + }, + { + "epoch": 1.7018528916339135, + "grad_norm": 0.6312441229820251, + "learning_rate": 4.698828642915334e-06, + "loss": 0.403, + "step": 6062 + }, + { + "epoch": 1.702133632790567, + "grad_norm": 0.648977518081665, + "learning_rate": 4.69719827097046e-06, + "loss": 0.3532, + "step": 6063 + }, + { + "epoch": 1.7024143739472206, + "grad_norm": 0.6212430596351624, + "learning_rate": 4.695567931338703e-06, + "loss": 0.3686, + "step": 6064 + }, + { + "epoch": 1.7026951151038743, + "grad_norm": 0.6183661222457886, + "learning_rate": 4.693937624194045e-06, + "loss": 0.3468, + "step": 6065 + }, + { + "epoch": 1.7029758562605277, + "grad_norm": 0.5506702065467834, + "learning_rate": 4.69230734971046e-06, + "loss": 0.3297, + "step": 6066 + }, + { + "epoch": 1.7032565974171814, + "grad_norm": 0.6448323130607605, + "learning_rate": 4.690677108061921e-06, + "loss": 0.397, + "step": 6067 + }, + { + "epoch": 1.703537338573835, + "grad_norm": 0.6663951277732849, + "learning_rate": 4.689046899422397e-06, + "loss": 0.4069, + "step": 6068 + }, + { + "epoch": 1.7038180797304885, + "grad_norm": 0.7060367465019226, + "learning_rate": 4.687416723965853e-06, + "loss": 0.3985, + "step": 6069 + }, + { + "epoch": 1.704098820887142, + "grad_norm": 0.7022783756256104, + "learning_rate": 4.685786581866254e-06, + "loss": 0.4073, + "step": 6070 + }, + { + "epoch": 1.7043795620437956, + "grad_norm": 0.620241641998291, + "learning_rate": 4.684156473297557e-06, + "loss": 0.4315, + "step": 6071 + }, + { + "epoch": 1.7046603032004493, + "grad_norm": 0.6034104228019714, + "learning_rate": 4.682526398433716e-06, + "loss": 0.3792, + "step": 6072 + }, + { + "epoch": 1.7049410443571027, + "grad_norm": 0.6438170075416565, + "learning_rate": 4.680896357448685e-06, + "loss": 0.3946, + "step": 6073 + }, + { + "epoch": 1.7052217855137564, + "grad_norm": 0.6571474075317383, + "learning_rate": 4.67926635051641e-06, + "loss": 0.3572, + "step": 6074 + }, + { + "epoch": 1.70550252667041, + "grad_norm": 0.6134425401687622, + "learning_rate": 4.677636377810836e-06, + "loss": 0.3888, + "step": 6075 + }, + { + "epoch": 1.7057832678270635, + "grad_norm": 0.6544412970542908, + "learning_rate": 4.676006439505902e-06, + "loss": 0.3851, + "step": 6076 + }, + { + "epoch": 1.706064008983717, + "grad_norm": 0.5887376666069031, + "learning_rate": 4.6743765357755465e-06, + "loss": 0.3585, + "step": 6077 + }, + { + "epoch": 1.7063447501403706, + "grad_norm": 0.5665528178215027, + "learning_rate": 4.672746666793703e-06, + "loss": 0.3877, + "step": 6078 + }, + { + "epoch": 1.7066254912970242, + "grad_norm": 0.6077118515968323, + "learning_rate": 4.671116832734299e-06, + "loss": 0.4167, + "step": 6079 + }, + { + "epoch": 1.7069062324536777, + "grad_norm": 0.649574339389801, + "learning_rate": 4.669487033771261e-06, + "loss": 0.3892, + "step": 6080 + }, + { + "epoch": 1.7071869736103311, + "grad_norm": 0.6175322532653809, + "learning_rate": 4.667857270078513e-06, + "loss": 0.3688, + "step": 6081 + }, + { + "epoch": 1.707467714766985, + "grad_norm": 0.6835350394248962, + "learning_rate": 4.666227541829971e-06, + "loss": 0.3492, + "step": 6082 + }, + { + "epoch": 1.7077484559236384, + "grad_norm": 0.6302521228790283, + "learning_rate": 4.66459784919955e-06, + "loss": 0.3685, + "step": 6083 + }, + { + "epoch": 1.7080291970802919, + "grad_norm": 0.5762865543365479, + "learning_rate": 4.662968192361161e-06, + "loss": 0.3705, + "step": 6084 + }, + { + "epoch": 1.7083099382369455, + "grad_norm": 0.6340917944908142, + "learning_rate": 4.661338571488711e-06, + "loss": 0.3417, + "step": 6085 + }, + { + "epoch": 1.7085906793935992, + "grad_norm": 0.6836395263671875, + "learning_rate": 4.659708986756101e-06, + "loss": 0.3718, + "step": 6086 + }, + { + "epoch": 1.7088714205502527, + "grad_norm": 0.7231102585792542, + "learning_rate": 4.658079438337234e-06, + "loss": 0.376, + "step": 6087 + }, + { + "epoch": 1.709152161706906, + "grad_norm": 0.7105762958526611, + "learning_rate": 4.6564499264060024e-06, + "loss": 0.3924, + "step": 6088 + }, + { + "epoch": 1.7094329028635598, + "grad_norm": 0.7268754243850708, + "learning_rate": 4.654820451136297e-06, + "loss": 0.4305, + "step": 6089 + }, + { + "epoch": 1.7097136440202134, + "grad_norm": 0.5852659940719604, + "learning_rate": 4.653191012702008e-06, + "loss": 0.3614, + "step": 6090 + }, + { + "epoch": 1.7099943851768669, + "grad_norm": 0.644648551940918, + "learning_rate": 4.6515616112770165e-06, + "loss": 0.4235, + "step": 6091 + }, + { + "epoch": 1.7102751263335205, + "grad_norm": 0.6437558531761169, + "learning_rate": 4.6499322470352035e-06, + "loss": 0.3926, + "step": 6092 + }, + { + "epoch": 1.7105558674901742, + "grad_norm": 0.6182278990745544, + "learning_rate": 4.6483029201504445e-06, + "loss": 0.366, + "step": 6093 + }, + { + "epoch": 1.7108366086468276, + "grad_norm": 0.6242963075637817, + "learning_rate": 4.646673630796608e-06, + "loss": 0.3518, + "step": 6094 + }, + { + "epoch": 1.711117349803481, + "grad_norm": 0.6547690629959106, + "learning_rate": 4.645044379147567e-06, + "loss": 0.4349, + "step": 6095 + }, + { + "epoch": 1.7113980909601347, + "grad_norm": 0.6208428740501404, + "learning_rate": 4.643415165377184e-06, + "loss": 0.3873, + "step": 6096 + }, + { + "epoch": 1.7116788321167884, + "grad_norm": 0.6276224851608276, + "learning_rate": 4.641785989659314e-06, + "loss": 0.3593, + "step": 6097 + }, + { + "epoch": 1.7119595732734418, + "grad_norm": 0.654462993144989, + "learning_rate": 4.640156852167818e-06, + "loss": 0.386, + "step": 6098 + }, + { + "epoch": 1.7122403144300955, + "grad_norm": 0.6660346388816833, + "learning_rate": 4.638527753076544e-06, + "loss": 0.3912, + "step": 6099 + }, + { + "epoch": 1.7125210555867492, + "grad_norm": 0.7017571330070496, + "learning_rate": 4.63689869255934e-06, + "loss": 0.4131, + "step": 6100 + }, + { + "epoch": 1.7128017967434026, + "grad_norm": 0.6872237324714661, + "learning_rate": 4.63526967079005e-06, + "loss": 0.4045, + "step": 6101 + }, + { + "epoch": 1.713082537900056, + "grad_norm": 0.6601234078407288, + "learning_rate": 4.633640687942512e-06, + "loss": 0.3775, + "step": 6102 + }, + { + "epoch": 1.7133632790567097, + "grad_norm": 0.6169291138648987, + "learning_rate": 4.632011744190563e-06, + "loss": 0.4002, + "step": 6103 + }, + { + "epoch": 1.7136440202133634, + "grad_norm": 0.6733725666999817, + "learning_rate": 4.630382839708032e-06, + "loss": 0.4231, + "step": 6104 + }, + { + "epoch": 1.7139247613700168, + "grad_norm": 0.6713374257087708, + "learning_rate": 4.628753974668745e-06, + "loss": 0.3771, + "step": 6105 + }, + { + "epoch": 1.7142055025266703, + "grad_norm": 0.6887063384056091, + "learning_rate": 4.627125149246525e-06, + "loss": 0.3817, + "step": 6106 + }, + { + "epoch": 1.7144862436833241, + "grad_norm": 0.5848514437675476, + "learning_rate": 4.625496363615191e-06, + "loss": 0.3665, + "step": 6107 + }, + { + "epoch": 1.7147669848399776, + "grad_norm": 0.6316863894462585, + "learning_rate": 4.623867617948556e-06, + "loss": 0.3588, + "step": 6108 + }, + { + "epoch": 1.715047725996631, + "grad_norm": 0.5803934335708618, + "learning_rate": 4.62223891242043e-06, + "loss": 0.3709, + "step": 6109 + }, + { + "epoch": 1.7153284671532847, + "grad_norm": 0.6939713358879089, + "learning_rate": 4.6206102472046185e-06, + "loss": 0.3842, + "step": 6110 + }, + { + "epoch": 1.7156092083099383, + "grad_norm": 0.607570469379425, + "learning_rate": 4.618981622474921e-06, + "loss": 0.3622, + "step": 6111 + }, + { + "epoch": 1.7158899494665918, + "grad_norm": 0.5985682606697083, + "learning_rate": 4.617353038405136e-06, + "loss": 0.3695, + "step": 6112 + }, + { + "epoch": 1.7161706906232452, + "grad_norm": 0.5189514756202698, + "learning_rate": 4.615724495169055e-06, + "loss": 0.364, + "step": 6113 + }, + { + "epoch": 1.716451431779899, + "grad_norm": 0.6978327035903931, + "learning_rate": 4.614095992940466e-06, + "loss": 0.4608, + "step": 6114 + }, + { + "epoch": 1.7167321729365526, + "grad_norm": 0.6184882521629333, + "learning_rate": 4.612467531893154e-06, + "loss": 0.3752, + "step": 6115 + }, + { + "epoch": 1.717012914093206, + "grad_norm": 0.7372332811355591, + "learning_rate": 4.610839112200896e-06, + "loss": 0.3983, + "step": 6116 + }, + { + "epoch": 1.7172936552498597, + "grad_norm": 0.7011968493461609, + "learning_rate": 4.6092107340374685e-06, + "loss": 0.3694, + "step": 6117 + }, + { + "epoch": 1.7175743964065133, + "grad_norm": 0.6919954419136047, + "learning_rate": 4.607582397576641e-06, + "loss": 0.4044, + "step": 6118 + }, + { + "epoch": 1.7178551375631668, + "grad_norm": 0.6805636882781982, + "learning_rate": 4.60595410299218e-06, + "loss": 0.3698, + "step": 6119 + }, + { + "epoch": 1.7181358787198202, + "grad_norm": 0.5570012331008911, + "learning_rate": 4.604325850457845e-06, + "loss": 0.3824, + "step": 6120 + }, + { + "epoch": 1.7184166198764739, + "grad_norm": 0.6912919282913208, + "learning_rate": 4.602697640147396e-06, + "loss": 0.42, + "step": 6121 + }, + { + "epoch": 1.7186973610331275, + "grad_norm": 0.6265649795532227, + "learning_rate": 4.601069472234584e-06, + "loss": 0.4127, + "step": 6122 + }, + { + "epoch": 1.718978102189781, + "grad_norm": 0.6834555864334106, + "learning_rate": 4.5994413468931575e-06, + "loss": 0.4102, + "step": 6123 + }, + { + "epoch": 1.7192588433464346, + "grad_norm": 0.6186806559562683, + "learning_rate": 4.597813264296861e-06, + "loss": 0.428, + "step": 6124 + }, + { + "epoch": 1.7195395845030883, + "grad_norm": 0.5749118328094482, + "learning_rate": 4.59618522461943e-06, + "loss": 0.4105, + "step": 6125 + }, + { + "epoch": 1.7198203256597417, + "grad_norm": 0.6539474725723267, + "learning_rate": 4.594557228034602e-06, + "loss": 0.3942, + "step": 6126 + }, + { + "epoch": 1.7201010668163952, + "grad_norm": 0.5971326231956482, + "learning_rate": 4.5929292747161035e-06, + "loss": 0.3799, + "step": 6127 + }, + { + "epoch": 1.7203818079730488, + "grad_norm": 0.6099491715431213, + "learning_rate": 4.591301364837662e-06, + "loss": 0.4484, + "step": 6128 + }, + { + "epoch": 1.7206625491297025, + "grad_norm": 0.6051498651504517, + "learning_rate": 4.589673498572998e-06, + "loss": 0.3661, + "step": 6129 + }, + { + "epoch": 1.720943290286356, + "grad_norm": 0.7305816411972046, + "learning_rate": 4.5880456760958266e-06, + "loss": 0.3665, + "step": 6130 + }, + { + "epoch": 1.7212240314430094, + "grad_norm": 0.6616935729980469, + "learning_rate": 4.586417897579859e-06, + "loss": 0.4187, + "step": 6131 + }, + { + "epoch": 1.721504772599663, + "grad_norm": 0.70457923412323, + "learning_rate": 4.584790163198801e-06, + "loss": 0.4152, + "step": 6132 + }, + { + "epoch": 1.7217855137563167, + "grad_norm": 0.6524569392204285, + "learning_rate": 4.583162473126354e-06, + "loss": 0.3795, + "step": 6133 + }, + { + "epoch": 1.7220662549129702, + "grad_norm": 0.645473301410675, + "learning_rate": 4.581534827536216e-06, + "loss": 0.396, + "step": 6134 + }, + { + "epoch": 1.7223469960696238, + "grad_norm": 0.6396334171295166, + "learning_rate": 4.57990722660208e-06, + "loss": 0.3689, + "step": 6135 + }, + { + "epoch": 1.7226277372262775, + "grad_norm": 0.6571674942970276, + "learning_rate": 4.578279670497633e-06, + "loss": 0.3966, + "step": 6136 + }, + { + "epoch": 1.722908478382931, + "grad_norm": 0.6665908098220825, + "learning_rate": 4.576652159396556e-06, + "loss": 0.3754, + "step": 6137 + }, + { + "epoch": 1.7231892195395844, + "grad_norm": 0.6403006315231323, + "learning_rate": 4.575024693472527e-06, + "loss": 0.4184, + "step": 6138 + }, + { + "epoch": 1.723469960696238, + "grad_norm": 0.6201865077018738, + "learning_rate": 4.573397272899221e-06, + "loss": 0.3925, + "step": 6139 + }, + { + "epoch": 1.7237507018528917, + "grad_norm": 0.5629845261573792, + "learning_rate": 4.571769897850305e-06, + "loss": 0.4269, + "step": 6140 + }, + { + "epoch": 1.7240314430095451, + "grad_norm": 0.5768033862113953, + "learning_rate": 4.570142568499442e-06, + "loss": 0.379, + "step": 6141 + }, + { + "epoch": 1.7243121841661988, + "grad_norm": 0.6511726975440979, + "learning_rate": 4.568515285020292e-06, + "loss": 0.383, + "step": 6142 + }, + { + "epoch": 1.7245929253228525, + "grad_norm": 0.7166250348091125, + "learning_rate": 4.5668880475865074e-06, + "loss": 0.3599, + "step": 6143 + }, + { + "epoch": 1.724873666479506, + "grad_norm": 0.6251681447029114, + "learning_rate": 4.565260856371737e-06, + "loss": 0.3556, + "step": 6144 + }, + { + "epoch": 1.7251544076361593, + "grad_norm": 0.6248157024383545, + "learning_rate": 4.563633711549621e-06, + "loss": 0.3787, + "step": 6145 + }, + { + "epoch": 1.725435148792813, + "grad_norm": 0.6260890960693359, + "learning_rate": 4.562006613293806e-06, + "loss": 0.4035, + "step": 6146 + }, + { + "epoch": 1.7257158899494667, + "grad_norm": 0.6268472075462341, + "learning_rate": 4.5603795617779204e-06, + "loss": 0.3866, + "step": 6147 + }, + { + "epoch": 1.7259966311061201, + "grad_norm": 0.7070551514625549, + "learning_rate": 4.558752557175594e-06, + "loss": 0.3616, + "step": 6148 + }, + { + "epoch": 1.7262773722627736, + "grad_norm": 0.6147276163101196, + "learning_rate": 4.55712559966045e-06, + "loss": 0.37, + "step": 6149 + }, + { + "epoch": 1.7265581134194274, + "grad_norm": 0.611661970615387, + "learning_rate": 4.55549868940611e-06, + "loss": 0.3809, + "step": 6150 + }, + { + "epoch": 1.7268388545760809, + "grad_norm": 0.6389936208724976, + "learning_rate": 4.553871826586184e-06, + "loss": 0.3691, + "step": 6151 + }, + { + "epoch": 1.7271195957327343, + "grad_norm": 0.6252506971359253, + "learning_rate": 4.552245011374284e-06, + "loss": 0.4309, + "step": 6152 + }, + { + "epoch": 1.727400336889388, + "grad_norm": 0.5754144787788391, + "learning_rate": 4.550618243944011e-06, + "loss": 0.3742, + "step": 6153 + }, + { + "epoch": 1.7276810780460417, + "grad_norm": 0.62964928150177, + "learning_rate": 4.548991524468964e-06, + "loss": 0.3729, + "step": 6154 + }, + { + "epoch": 1.727961819202695, + "grad_norm": 0.6977580785751343, + "learning_rate": 4.547364853122737e-06, + "loss": 0.3624, + "step": 6155 + }, + { + "epoch": 1.7282425603593485, + "grad_norm": 0.6553834080696106, + "learning_rate": 4.545738230078918e-06, + "loss": 0.3484, + "step": 6156 + }, + { + "epoch": 1.7285233015160022, + "grad_norm": 0.7282357811927795, + "learning_rate": 4.544111655511091e-06, + "loss": 0.3407, + "step": 6157 + }, + { + "epoch": 1.7288040426726559, + "grad_norm": 0.5911629796028137, + "learning_rate": 4.542485129592833e-06, + "loss": 0.4099, + "step": 6158 + }, + { + "epoch": 1.7290847838293093, + "grad_norm": 0.6260474324226379, + "learning_rate": 4.540858652497717e-06, + "loss": 0.3887, + "step": 6159 + }, + { + "epoch": 1.729365524985963, + "grad_norm": 0.5876392126083374, + "learning_rate": 4.5392322243993105e-06, + "loss": 0.3965, + "step": 6160 + }, + { + "epoch": 1.7296462661426166, + "grad_norm": 0.6654799580574036, + "learning_rate": 4.537605845471174e-06, + "loss": 0.399, + "step": 6161 + }, + { + "epoch": 1.72992700729927, + "grad_norm": 0.5956904888153076, + "learning_rate": 4.535979515886868e-06, + "loss": 0.3853, + "step": 6162 + }, + { + "epoch": 1.7302077484559235, + "grad_norm": 0.7051137089729309, + "learning_rate": 4.5343532358199415e-06, + "loss": 0.3822, + "step": 6163 + }, + { + "epoch": 1.7304884896125772, + "grad_norm": 0.6873592138290405, + "learning_rate": 4.532727005443944e-06, + "loss": 0.4121, + "step": 6164 + }, + { + "epoch": 1.7307692307692308, + "grad_norm": 0.6353142261505127, + "learning_rate": 4.531100824932413e-06, + "loss": 0.3624, + "step": 6165 + }, + { + "epoch": 1.7310499719258843, + "grad_norm": 0.652218222618103, + "learning_rate": 4.5294746944588855e-06, + "loss": 0.409, + "step": 6166 + }, + { + "epoch": 1.731330713082538, + "grad_norm": 0.7085467576980591, + "learning_rate": 4.527848614196893e-06, + "loss": 0.3851, + "step": 6167 + }, + { + "epoch": 1.7316114542391916, + "grad_norm": 0.580599844455719, + "learning_rate": 4.526222584319961e-06, + "loss": 0.3905, + "step": 6168 + }, + { + "epoch": 1.731892195395845, + "grad_norm": 0.6966289281845093, + "learning_rate": 4.524596605001609e-06, + "loss": 0.3993, + "step": 6169 + }, + { + "epoch": 1.7321729365524985, + "grad_norm": 0.6031883358955383, + "learning_rate": 4.52297067641535e-06, + "loss": 0.3994, + "step": 6170 + }, + { + "epoch": 1.7324536777091522, + "grad_norm": 0.5960671901702881, + "learning_rate": 4.521344798734692e-06, + "loss": 0.4145, + "step": 6171 + }, + { + "epoch": 1.7327344188658058, + "grad_norm": 0.7738549709320068, + "learning_rate": 4.5197189721331425e-06, + "loss": 0.3737, + "step": 6172 + }, + { + "epoch": 1.7330151600224593, + "grad_norm": 0.6027511358261108, + "learning_rate": 4.518093196784199e-06, + "loss": 0.382, + "step": 6173 + }, + { + "epoch": 1.7332959011791127, + "grad_norm": 0.6172485947608948, + "learning_rate": 4.516467472861351e-06, + "loss": 0.3741, + "step": 6174 + }, + { + "epoch": 1.7335766423357666, + "grad_norm": 0.6590242981910706, + "learning_rate": 4.514841800538088e-06, + "loss": 0.3876, + "step": 6175 + }, + { + "epoch": 1.73385738349242, + "grad_norm": 0.6483175158500671, + "learning_rate": 4.513216179987891e-06, + "loss": 0.4139, + "step": 6176 + }, + { + "epoch": 1.7341381246490735, + "grad_norm": 0.6050055623054504, + "learning_rate": 4.511590611384236e-06, + "loss": 0.4409, + "step": 6177 + }, + { + "epoch": 1.7344188658057271, + "grad_norm": 0.6095812320709229, + "learning_rate": 4.509965094900593e-06, + "loss": 0.3975, + "step": 6178 + }, + { + "epoch": 1.7346996069623808, + "grad_norm": 0.6329188346862793, + "learning_rate": 4.508339630710431e-06, + "loss": 0.4147, + "step": 6179 + }, + { + "epoch": 1.7349803481190342, + "grad_norm": 0.6765221357345581, + "learning_rate": 4.5067142189872034e-06, + "loss": 0.3702, + "step": 6180 + }, + { + "epoch": 1.7352610892756877, + "grad_norm": 0.7201967239379883, + "learning_rate": 4.505088859904367e-06, + "loss": 0.4053, + "step": 6181 + }, + { + "epoch": 1.7355418304323413, + "grad_norm": 0.6717000603675842, + "learning_rate": 4.50346355363537e-06, + "loss": 0.399, + "step": 6182 + }, + { + "epoch": 1.735822571588995, + "grad_norm": 0.622174859046936, + "learning_rate": 4.501838300353654e-06, + "loss": 0.4101, + "step": 6183 + }, + { + "epoch": 1.7361033127456484, + "grad_norm": 0.6078985929489136, + "learning_rate": 4.500213100232657e-06, + "loss": 0.3702, + "step": 6184 + }, + { + "epoch": 1.736384053902302, + "grad_norm": 0.622093141078949, + "learning_rate": 4.498587953445812e-06, + "loss": 0.3629, + "step": 6185 + }, + { + "epoch": 1.7366647950589558, + "grad_norm": 0.6237002015113831, + "learning_rate": 4.496962860166542e-06, + "loss": 0.3836, + "step": 6186 + }, + { + "epoch": 1.7369455362156092, + "grad_norm": 0.6947446465492249, + "learning_rate": 4.4953378205682655e-06, + "loss": 0.3795, + "step": 6187 + }, + { + "epoch": 1.7372262773722627, + "grad_norm": 0.7730202674865723, + "learning_rate": 4.4937128348244e-06, + "loss": 0.37, + "step": 6188 + }, + { + "epoch": 1.7375070185289163, + "grad_norm": 0.6687898635864258, + "learning_rate": 4.492087903108351e-06, + "loss": 0.3759, + "step": 6189 + }, + { + "epoch": 1.73778775968557, + "grad_norm": 0.7137091755867004, + "learning_rate": 4.490463025593523e-06, + "loss": 0.3424, + "step": 6190 + }, + { + "epoch": 1.7380685008422234, + "grad_norm": 0.6977535486221313, + "learning_rate": 4.488838202453314e-06, + "loss": 0.3984, + "step": 6191 + }, + { + "epoch": 1.738349241998877, + "grad_norm": 0.6577920317649841, + "learning_rate": 4.487213433861111e-06, + "loss": 0.3529, + "step": 6192 + }, + { + "epoch": 1.7386299831555307, + "grad_norm": 0.6382343173027039, + "learning_rate": 4.485588719990303e-06, + "loss": 0.3587, + "step": 6193 + }, + { + "epoch": 1.7389107243121842, + "grad_norm": 0.6272525787353516, + "learning_rate": 4.483964061014268e-06, + "loss": 0.4062, + "step": 6194 + }, + { + "epoch": 1.7391914654688376, + "grad_norm": 0.7789995074272156, + "learning_rate": 4.482339457106378e-06, + "loss": 0.3862, + "step": 6195 + }, + { + "epoch": 1.7394722066254913, + "grad_norm": 0.729128897190094, + "learning_rate": 4.480714908440002e-06, + "loss": 0.3939, + "step": 6196 + }, + { + "epoch": 1.739752947782145, + "grad_norm": 0.6368979215621948, + "learning_rate": 4.479090415188502e-06, + "loss": 0.3974, + "step": 6197 + }, + { + "epoch": 1.7400336889387984, + "grad_norm": 0.6093249320983887, + "learning_rate": 4.477465977525234e-06, + "loss": 0.3604, + "step": 6198 + }, + { + "epoch": 1.7403144300954518, + "grad_norm": 0.7181936502456665, + "learning_rate": 4.475841595623547e-06, + "loss": 0.3768, + "step": 6199 + }, + { + "epoch": 1.7405951712521057, + "grad_norm": 0.7454440593719482, + "learning_rate": 4.474217269656786e-06, + "loss": 0.3705, + "step": 6200 + }, + { + "epoch": 1.7408759124087592, + "grad_norm": 0.6315555572509766, + "learning_rate": 4.4725929997982895e-06, + "loss": 0.3806, + "step": 6201 + }, + { + "epoch": 1.7411566535654126, + "grad_norm": 0.6051346659660339, + "learning_rate": 4.4709687862213866e-06, + "loss": 0.3965, + "step": 6202 + }, + { + "epoch": 1.7414373947220663, + "grad_norm": 0.6589784026145935, + "learning_rate": 4.469344629099406e-06, + "loss": 0.3929, + "step": 6203 + }, + { + "epoch": 1.74171813587872, + "grad_norm": 0.692035973072052, + "learning_rate": 4.467720528605665e-06, + "loss": 0.392, + "step": 6204 + }, + { + "epoch": 1.7419988770353734, + "grad_norm": 0.6870393753051758, + "learning_rate": 4.466096484913481e-06, + "loss": 0.4543, + "step": 6205 + }, + { + "epoch": 1.7422796181920268, + "grad_norm": 0.6039896607398987, + "learning_rate": 4.464472498196159e-06, + "loss": 0.3611, + "step": 6206 + }, + { + "epoch": 1.7425603593486805, + "grad_norm": 0.6538866758346558, + "learning_rate": 4.462848568627003e-06, + "loss": 0.3562, + "step": 6207 + }, + { + "epoch": 1.7428411005053341, + "grad_norm": 0.703467845916748, + "learning_rate": 4.461224696379307e-06, + "loss": 0.3841, + "step": 6208 + }, + { + "epoch": 1.7431218416619876, + "grad_norm": 0.6572993993759155, + "learning_rate": 4.45960088162636e-06, + "loss": 0.3609, + "step": 6209 + }, + { + "epoch": 1.7434025828186412, + "grad_norm": 0.5625954270362854, + "learning_rate": 4.457977124541447e-06, + "loss": 0.417, + "step": 6210 + }, + { + "epoch": 1.743683323975295, + "grad_norm": 0.6067602038383484, + "learning_rate": 4.456353425297845e-06, + "loss": 0.3588, + "step": 6211 + }, + { + "epoch": 1.7439640651319483, + "grad_norm": 0.7094693779945374, + "learning_rate": 4.4547297840688235e-06, + "loss": 0.3708, + "step": 6212 + }, + { + "epoch": 1.7442448062886018, + "grad_norm": 0.6259050965309143, + "learning_rate": 4.45310620102765e-06, + "loss": 0.3712, + "step": 6213 + }, + { + "epoch": 1.7445255474452555, + "grad_norm": 0.5816469788551331, + "learning_rate": 4.4514826763475816e-06, + "loss": 0.3658, + "step": 6214 + }, + { + "epoch": 1.7448062886019091, + "grad_norm": 0.573767364025116, + "learning_rate": 4.44985921020187e-06, + "loss": 0.4047, + "step": 6215 + }, + { + "epoch": 1.7450870297585626, + "grad_norm": 0.5543454885482788, + "learning_rate": 4.448235802763764e-06, + "loss": 0.3805, + "step": 6216 + }, + { + "epoch": 1.7453677709152162, + "grad_norm": 0.5985108017921448, + "learning_rate": 4.4466124542065e-06, + "loss": 0.4046, + "step": 6217 + }, + { + "epoch": 1.7456485120718699, + "grad_norm": 0.6127582788467407, + "learning_rate": 4.444989164703315e-06, + "loss": 0.4083, + "step": 6218 + }, + { + "epoch": 1.7459292532285233, + "grad_norm": 0.6588864922523499, + "learning_rate": 4.443365934427435e-06, + "loss": 0.3801, + "step": 6219 + }, + { + "epoch": 1.7462099943851768, + "grad_norm": 0.5574650168418884, + "learning_rate": 4.44174276355208e-06, + "loss": 0.3787, + "step": 6220 + }, + { + "epoch": 1.7464907355418304, + "grad_norm": 0.661906898021698, + "learning_rate": 4.440119652250465e-06, + "loss": 0.4387, + "step": 6221 + }, + { + "epoch": 1.746771476698484, + "grad_norm": 0.6496044993400574, + "learning_rate": 4.4384966006958e-06, + "loss": 0.3752, + "step": 6222 + }, + { + "epoch": 1.7470522178551375, + "grad_norm": 0.7547746896743774, + "learning_rate": 4.436873609061287e-06, + "loss": 0.4123, + "step": 6223 + }, + { + "epoch": 1.747332959011791, + "grad_norm": 0.6067736744880676, + "learning_rate": 4.43525067752012e-06, + "loss": 0.4282, + "step": 6224 + }, + { + "epoch": 1.7476137001684446, + "grad_norm": 0.6351024508476257, + "learning_rate": 4.433627806245488e-06, + "loss": 0.3896, + "step": 6225 + }, + { + "epoch": 1.7478944413250983, + "grad_norm": 0.7073475122451782, + "learning_rate": 4.432004995410575e-06, + "loss": 0.3881, + "step": 6226 + }, + { + "epoch": 1.7481751824817517, + "grad_norm": 0.4947356879711151, + "learning_rate": 4.430382245188557e-06, + "loss": 0.3488, + "step": 6227 + }, + { + "epoch": 1.7484559236384054, + "grad_norm": 0.719068706035614, + "learning_rate": 4.428759555752603e-06, + "loss": 0.4192, + "step": 6228 + }, + { + "epoch": 1.748736664795059, + "grad_norm": 0.6581749320030212, + "learning_rate": 4.427136927275879e-06, + "loss": 0.3948, + "step": 6229 + }, + { + "epoch": 1.7490174059517125, + "grad_norm": 0.5638178586959839, + "learning_rate": 4.4255143599315375e-06, + "loss": 0.3817, + "step": 6230 + }, + { + "epoch": 1.749298147108366, + "grad_norm": 0.6202285289764404, + "learning_rate": 4.423891853892731e-06, + "loss": 0.36, + "step": 6231 + }, + { + "epoch": 1.7495788882650196, + "grad_norm": 0.5796914100646973, + "learning_rate": 4.422269409332604e-06, + "loss": 0.368, + "step": 6232 + }, + { + "epoch": 1.7498596294216733, + "grad_norm": 0.609451413154602, + "learning_rate": 4.420647026424293e-06, + "loss": 0.3887, + "step": 6233 + }, + { + "epoch": 1.7501403705783267, + "grad_norm": 0.5809699296951294, + "learning_rate": 4.419024705340928e-06, + "loss": 0.3584, + "step": 6234 + }, + { + "epoch": 1.7504211117349804, + "grad_norm": 0.6108555793762207, + "learning_rate": 4.4174024462556346e-06, + "loss": 0.3753, + "step": 6235 + }, + { + "epoch": 1.750701852891634, + "grad_norm": 0.640383243560791, + "learning_rate": 4.415780249341529e-06, + "loss": 0.4057, + "step": 6236 + }, + { + "epoch": 1.7509825940482875, + "grad_norm": 0.6090337038040161, + "learning_rate": 4.414158114771722e-06, + "loss": 0.3845, + "step": 6237 + }, + { + "epoch": 1.751263335204941, + "grad_norm": 0.7175537347793579, + "learning_rate": 4.412536042719317e-06, + "loss": 0.3618, + "step": 6238 + }, + { + "epoch": 1.7515440763615946, + "grad_norm": 0.6068211793899536, + "learning_rate": 4.410914033357413e-06, + "loss": 0.3628, + "step": 6239 + }, + { + "epoch": 1.7518248175182483, + "grad_norm": 0.65510493516922, + "learning_rate": 4.4092920868590995e-06, + "loss": 0.3327, + "step": 6240 + }, + { + "epoch": 1.7521055586749017, + "grad_norm": 0.612531840801239, + "learning_rate": 4.407670203397462e-06, + "loss": 0.3947, + "step": 6241 + }, + { + "epoch": 1.7523862998315554, + "grad_norm": 0.6644596457481384, + "learning_rate": 4.4060483831455775e-06, + "loss": 0.3759, + "step": 6242 + }, + { + "epoch": 1.752667040988209, + "grad_norm": 0.642458975315094, + "learning_rate": 4.404426626276514e-06, + "loss": 0.4211, + "step": 6243 + }, + { + "epoch": 1.7529477821448625, + "grad_norm": 0.772847592830658, + "learning_rate": 4.402804932963339e-06, + "loss": 0.4511, + "step": 6244 + }, + { + "epoch": 1.753228523301516, + "grad_norm": 0.6655374765396118, + "learning_rate": 4.401183303379107e-06, + "loss": 0.3554, + "step": 6245 + }, + { + "epoch": 1.7535092644581696, + "grad_norm": 0.6555061936378479, + "learning_rate": 4.399561737696869e-06, + "loss": 0.3631, + "step": 6246 + }, + { + "epoch": 1.7537900056148232, + "grad_norm": 0.6439672112464905, + "learning_rate": 4.3979402360896675e-06, + "loss": 0.378, + "step": 6247 + }, + { + "epoch": 1.7540707467714767, + "grad_norm": 0.5472911596298218, + "learning_rate": 4.396318798730542e-06, + "loss": 0.378, + "step": 6248 + }, + { + "epoch": 1.7543514879281301, + "grad_norm": 0.6403830051422119, + "learning_rate": 4.394697425792519e-06, + "loss": 0.366, + "step": 6249 + }, + { + "epoch": 1.7546322290847838, + "grad_norm": 0.6984131336212158, + "learning_rate": 4.393076117448624e-06, + "loss": 0.4171, + "step": 6250 + }, + { + "epoch": 1.7549129702414374, + "grad_norm": 0.7900029420852661, + "learning_rate": 4.391454873871871e-06, + "loss": 0.4234, + "step": 6251 + }, + { + "epoch": 1.7551937113980909, + "grad_norm": 0.6361263394355774, + "learning_rate": 4.389833695235269e-06, + "loss": 0.4082, + "step": 6252 + }, + { + "epoch": 1.7554744525547445, + "grad_norm": 0.5593291521072388, + "learning_rate": 4.388212581711821e-06, + "loss": 0.3987, + "step": 6253 + }, + { + "epoch": 1.7557551937113982, + "grad_norm": 0.6575533151626587, + "learning_rate": 4.3865915334745216e-06, + "loss": 0.4187, + "step": 6254 + }, + { + "epoch": 1.7560359348680517, + "grad_norm": 0.6631331443786621, + "learning_rate": 4.384970550696359e-06, + "loss": 0.3923, + "step": 6255 + }, + { + "epoch": 1.756316676024705, + "grad_norm": 0.6196184158325195, + "learning_rate": 4.3833496335503164e-06, + "loss": 0.415, + "step": 6256 + }, + { + "epoch": 1.7565974171813588, + "grad_norm": 0.6458535194396973, + "learning_rate": 4.381728782209365e-06, + "loss": 0.414, + "step": 6257 + }, + { + "epoch": 1.7568781583380124, + "grad_norm": 0.7704488635063171, + "learning_rate": 4.380107996846473e-06, + "loss": 0.4066, + "step": 6258 + }, + { + "epoch": 1.7571588994946659, + "grad_norm": 0.6514649391174316, + "learning_rate": 4.3784872776346e-06, + "loss": 0.3607, + "step": 6259 + }, + { + "epoch": 1.7574396406513195, + "grad_norm": 0.6268259286880493, + "learning_rate": 4.376866624746701e-06, + "loss": 0.4031, + "step": 6260 + }, + { + "epoch": 1.7577203818079732, + "grad_norm": 0.6264336109161377, + "learning_rate": 4.3752460383557195e-06, + "loss": 0.3885, + "step": 6261 + }, + { + "epoch": 1.7580011229646266, + "grad_norm": 0.5954325795173645, + "learning_rate": 4.3736255186345975e-06, + "loss": 0.3617, + "step": 6262 + }, + { + "epoch": 1.75828186412128, + "grad_norm": 0.6842427253723145, + "learning_rate": 4.372005065756264e-06, + "loss": 0.3937, + "step": 6263 + }, + { + "epoch": 1.7585626052779337, + "grad_norm": 0.6825528740882874, + "learning_rate": 4.370384679893645e-06, + "loss": 0.3974, + "step": 6264 + }, + { + "epoch": 1.7588433464345874, + "grad_norm": 0.5167978405952454, + "learning_rate": 4.368764361219657e-06, + "loss": 0.369, + "step": 6265 + }, + { + "epoch": 1.7591240875912408, + "grad_norm": 0.6316250562667847, + "learning_rate": 4.367144109907211e-06, + "loss": 0.3385, + "step": 6266 + }, + { + "epoch": 1.7594048287478943, + "grad_norm": 0.6032918691635132, + "learning_rate": 4.36552392612921e-06, + "loss": 0.3801, + "step": 6267 + }, + { + "epoch": 1.7596855699045482, + "grad_norm": 0.6955772042274475, + "learning_rate": 4.363903810058552e-06, + "loss": 0.3715, + "step": 6268 + }, + { + "epoch": 1.7599663110612016, + "grad_norm": 0.666412889957428, + "learning_rate": 4.362283761868122e-06, + "loss": 0.3742, + "step": 6269 + }, + { + "epoch": 1.760247052217855, + "grad_norm": 0.6349466443061829, + "learning_rate": 4.360663781730803e-06, + "loss": 0.3439, + "step": 6270 + }, + { + "epoch": 1.7605277933745087, + "grad_norm": 0.6407955288887024, + "learning_rate": 4.3590438698194695e-06, + "loss": 0.4182, + "step": 6271 + }, + { + "epoch": 1.7608085345311624, + "grad_norm": 0.7088667750358582, + "learning_rate": 4.357424026306988e-06, + "loss": 0.3657, + "step": 6272 + }, + { + "epoch": 1.7610892756878158, + "grad_norm": 0.6948704123497009, + "learning_rate": 4.355804251366219e-06, + "loss": 0.4408, + "step": 6273 + }, + { + "epoch": 1.7613700168444693, + "grad_norm": 0.6439430117607117, + "learning_rate": 4.354184545170015e-06, + "loss": 0.3945, + "step": 6274 + }, + { + "epoch": 1.761650758001123, + "grad_norm": 0.7262217402458191, + "learning_rate": 4.352564907891219e-06, + "loss": 0.4043, + "step": 6275 + }, + { + "epoch": 1.7619314991577766, + "grad_norm": 0.7246805429458618, + "learning_rate": 4.350945339702671e-06, + "loss": 0.4272, + "step": 6276 + }, + { + "epoch": 1.76221224031443, + "grad_norm": 0.5724625587463379, + "learning_rate": 4.3493258407772e-06, + "loss": 0.3539, + "step": 6277 + }, + { + "epoch": 1.7624929814710837, + "grad_norm": 0.7266539931297302, + "learning_rate": 4.34770641128763e-06, + "loss": 0.4067, + "step": 6278 + }, + { + "epoch": 1.7627737226277373, + "grad_norm": 0.5940781235694885, + "learning_rate": 4.3460870514067735e-06, + "loss": 0.3424, + "step": 6279 + }, + { + "epoch": 1.7630544637843908, + "grad_norm": 0.6305220723152161, + "learning_rate": 4.3444677613074415e-06, + "loss": 0.3624, + "step": 6280 + }, + { + "epoch": 1.7633352049410442, + "grad_norm": 0.565099835395813, + "learning_rate": 4.342848541162433e-06, + "loss": 0.3855, + "step": 6281 + }, + { + "epoch": 1.763615946097698, + "grad_norm": 0.6301993727684021, + "learning_rate": 4.3412293911445416e-06, + "loss": 0.3894, + "step": 6282 + }, + { + "epoch": 1.7638966872543516, + "grad_norm": 0.6357986927032471, + "learning_rate": 4.339610311426554e-06, + "loss": 0.3925, + "step": 6283 + }, + { + "epoch": 1.764177428411005, + "grad_norm": 0.6513936519622803, + "learning_rate": 4.337991302181247e-06, + "loss": 0.3836, + "step": 6284 + }, + { + "epoch": 1.7644581695676587, + "grad_norm": 0.6839202642440796, + "learning_rate": 4.336372363581391e-06, + "loss": 0.4126, + "step": 6285 + }, + { + "epoch": 1.7647389107243123, + "grad_norm": 0.663027286529541, + "learning_rate": 4.33475349579975e-06, + "loss": 0.4074, + "step": 6286 + }, + { + "epoch": 1.7650196518809658, + "grad_norm": 0.7076448202133179, + "learning_rate": 4.333134699009078e-06, + "loss": 0.4145, + "step": 6287 + }, + { + "epoch": 1.7653003930376192, + "grad_norm": 0.5735254883766174, + "learning_rate": 4.331515973382125e-06, + "loss": 0.3597, + "step": 6288 + }, + { + "epoch": 1.7655811341942729, + "grad_norm": 0.6507977843284607, + "learning_rate": 4.3298973190916294e-06, + "loss": 0.4041, + "step": 6289 + }, + { + "epoch": 1.7658618753509265, + "grad_norm": 0.6680722236633301, + "learning_rate": 4.328278736310326e-06, + "loss": 0.387, + "step": 6290 + }, + { + "epoch": 1.76614261650758, + "grad_norm": 0.6661301851272583, + "learning_rate": 4.326660225210938e-06, + "loss": 0.3795, + "step": 6291 + }, + { + "epoch": 1.7664233576642334, + "grad_norm": 0.6460405588150024, + "learning_rate": 4.325041785966183e-06, + "loss": 0.4159, + "step": 6292 + }, + { + "epoch": 1.7667040988208873, + "grad_norm": 0.6425129175186157, + "learning_rate": 4.323423418748772e-06, + "loss": 0.3691, + "step": 6293 + }, + { + "epoch": 1.7669848399775407, + "grad_norm": 0.6333646178245544, + "learning_rate": 4.321805123731406e-06, + "loss": 0.4006, + "step": 6294 + }, + { + "epoch": 1.7672655811341942, + "grad_norm": 0.6111431121826172, + "learning_rate": 4.320186901086781e-06, + "loss": 0.3578, + "step": 6295 + }, + { + "epoch": 1.7675463222908478, + "grad_norm": 0.5983756184577942, + "learning_rate": 4.318568750987582e-06, + "loss": 0.3586, + "step": 6296 + }, + { + "epoch": 1.7678270634475015, + "grad_norm": 0.6899738907814026, + "learning_rate": 4.316950673606487e-06, + "loss": 0.3956, + "step": 6297 + }, + { + "epoch": 1.768107804604155, + "grad_norm": 0.6273054480552673, + "learning_rate": 4.315332669116167e-06, + "loss": 0.3558, + "step": 6298 + }, + { + "epoch": 1.7683885457608084, + "grad_norm": 0.6247169971466064, + "learning_rate": 4.31371473768929e-06, + "loss": 0.3611, + "step": 6299 + }, + { + "epoch": 1.768669286917462, + "grad_norm": 0.6382704973220825, + "learning_rate": 4.312096879498508e-06, + "loss": 0.3633, + "step": 6300 + }, + { + "epoch": 1.7689500280741157, + "grad_norm": 0.6193411946296692, + "learning_rate": 4.310479094716469e-06, + "loss": 0.3428, + "step": 6301 + }, + { + "epoch": 1.7692307692307692, + "grad_norm": 0.6488775610923767, + "learning_rate": 4.308861383515813e-06, + "loss": 0.3553, + "step": 6302 + }, + { + "epoch": 1.7695115103874228, + "grad_norm": 0.5911656022071838, + "learning_rate": 4.307243746069172e-06, + "loss": 0.3721, + "step": 6303 + }, + { + "epoch": 1.7697922515440765, + "grad_norm": 0.7117661237716675, + "learning_rate": 4.30562618254917e-06, + "loss": 0.3546, + "step": 6304 + }, + { + "epoch": 1.77007299270073, + "grad_norm": 0.7182649374008179, + "learning_rate": 4.304008693128426e-06, + "loss": 0.3865, + "step": 6305 + }, + { + "epoch": 1.7703537338573834, + "grad_norm": 0.6753956079483032, + "learning_rate": 4.302391277979545e-06, + "loss": 0.399, + "step": 6306 + }, + { + "epoch": 1.770634475014037, + "grad_norm": 0.7102121710777283, + "learning_rate": 4.3007739372751275e-06, + "loss": 0.3477, + "step": 6307 + }, + { + "epoch": 1.7709152161706907, + "grad_norm": 0.6047407984733582, + "learning_rate": 4.299156671187768e-06, + "loss": 0.3956, + "step": 6308 + }, + { + "epoch": 1.7711959573273441, + "grad_norm": 0.6499601602554321, + "learning_rate": 4.297539479890051e-06, + "loss": 0.3764, + "step": 6309 + }, + { + "epoch": 1.7714766984839978, + "grad_norm": 0.658145010471344, + "learning_rate": 4.295922363554551e-06, + "loss": 0.3467, + "step": 6310 + }, + { + "epoch": 1.7717574396406515, + "grad_norm": 0.6435934901237488, + "learning_rate": 4.29430532235384e-06, + "loss": 0.3763, + "step": 6311 + }, + { + "epoch": 1.772038180797305, + "grad_norm": 0.7077054977416992, + "learning_rate": 4.292688356460475e-06, + "loss": 0.3486, + "step": 6312 + }, + { + "epoch": 1.7723189219539583, + "grad_norm": 0.6282106041908264, + "learning_rate": 4.29107146604701e-06, + "loss": 0.362, + "step": 6313 + }, + { + "epoch": 1.772599663110612, + "grad_norm": 0.6815602779388428, + "learning_rate": 4.289454651285991e-06, + "loss": 0.3688, + "step": 6314 + }, + { + "epoch": 1.7728804042672657, + "grad_norm": 0.5672601461410522, + "learning_rate": 4.287837912349952e-06, + "loss": 0.3805, + "step": 6315 + }, + { + "epoch": 1.7731611454239191, + "grad_norm": 0.6486336588859558, + "learning_rate": 4.286221249411422e-06, + "loss": 0.3973, + "step": 6316 + }, + { + "epoch": 1.7734418865805726, + "grad_norm": 0.5920996069908142, + "learning_rate": 4.2846046626429215e-06, + "loss": 0.3811, + "step": 6317 + }, + { + "epoch": 1.7737226277372264, + "grad_norm": 0.6090525388717651, + "learning_rate": 4.282988152216964e-06, + "loss": 0.3893, + "step": 6318 + }, + { + "epoch": 1.7740033688938799, + "grad_norm": 0.7044205665588379, + "learning_rate": 4.281371718306052e-06, + "loss": 0.3792, + "step": 6319 + }, + { + "epoch": 1.7742841100505333, + "grad_norm": 0.6818379759788513, + "learning_rate": 4.27975536108268e-06, + "loss": 0.3813, + "step": 6320 + }, + { + "epoch": 1.774564851207187, + "grad_norm": 0.6091445088386536, + "learning_rate": 4.278139080719338e-06, + "loss": 0.3495, + "step": 6321 + }, + { + "epoch": 1.7748455923638407, + "grad_norm": 0.6549208760261536, + "learning_rate": 4.276522877388503e-06, + "loss": 0.3786, + "step": 6322 + }, + { + "epoch": 1.775126333520494, + "grad_norm": 0.6359061598777771, + "learning_rate": 4.274906751262647e-06, + "loss": 0.4105, + "step": 6323 + }, + { + "epoch": 1.7754070746771475, + "grad_norm": 0.5990251898765564, + "learning_rate": 4.273290702514236e-06, + "loss": 0.401, + "step": 6324 + }, + { + "epoch": 1.7756878158338012, + "grad_norm": 0.6826092004776001, + "learning_rate": 4.2716747313157206e-06, + "loss": 0.3592, + "step": 6325 + }, + { + "epoch": 1.7759685569904549, + "grad_norm": 0.6266413927078247, + "learning_rate": 4.270058837839548e-06, + "loss": 0.3585, + "step": 6326 + }, + { + "epoch": 1.7762492981471083, + "grad_norm": 0.600288450717926, + "learning_rate": 4.26844302225816e-06, + "loss": 0.3811, + "step": 6327 + }, + { + "epoch": 1.776530039303762, + "grad_norm": 0.6904387474060059, + "learning_rate": 4.266827284743981e-06, + "loss": 0.3741, + "step": 6328 + }, + { + "epoch": 1.7768107804604156, + "grad_norm": 0.7091556191444397, + "learning_rate": 4.265211625469435e-06, + "loss": 0.3433, + "step": 6329 + }, + { + "epoch": 1.777091521617069, + "grad_norm": 0.66339510679245, + "learning_rate": 4.263596044606936e-06, + "loss": 0.4235, + "step": 6330 + }, + { + "epoch": 1.7773722627737225, + "grad_norm": 0.626258134841919, + "learning_rate": 4.261980542328887e-06, + "loss": 0.3755, + "step": 6331 + }, + { + "epoch": 1.7776530039303762, + "grad_norm": 0.6411709785461426, + "learning_rate": 4.260365118807685e-06, + "loss": 0.3838, + "step": 6332 + }, + { + "epoch": 1.7779337450870298, + "grad_norm": 0.6367449760437012, + "learning_rate": 4.258749774215719e-06, + "loss": 0.3755, + "step": 6333 + }, + { + "epoch": 1.7782144862436833, + "grad_norm": 0.619971513748169, + "learning_rate": 4.2571345087253665e-06, + "loss": 0.3917, + "step": 6334 + }, + { + "epoch": 1.778495227400337, + "grad_norm": 0.6844624280929565, + "learning_rate": 4.2555193225090005e-06, + "loss": 0.3674, + "step": 6335 + }, + { + "epoch": 1.7787759685569906, + "grad_norm": 0.6690794229507446, + "learning_rate": 4.253904215738982e-06, + "loss": 0.3742, + "step": 6336 + }, + { + "epoch": 1.779056709713644, + "grad_norm": 0.6709421873092651, + "learning_rate": 4.252289188587666e-06, + "loss": 0.38, + "step": 6337 + }, + { + "epoch": 1.7793374508702975, + "grad_norm": 0.668391227722168, + "learning_rate": 4.2506742412273986e-06, + "loss": 0.335, + "step": 6338 + }, + { + "epoch": 1.7796181920269512, + "grad_norm": 0.5918411612510681, + "learning_rate": 4.249059373830517e-06, + "loss": 0.4, + "step": 6339 + }, + { + "epoch": 1.7798989331836048, + "grad_norm": 0.6350464820861816, + "learning_rate": 4.247444586569348e-06, + "loss": 0.3877, + "step": 6340 + }, + { + "epoch": 1.7801796743402583, + "grad_norm": 0.5575278401374817, + "learning_rate": 4.245829879616214e-06, + "loss": 0.3834, + "step": 6341 + }, + { + "epoch": 1.7804604154969117, + "grad_norm": 0.5742670893669128, + "learning_rate": 4.244215253143423e-06, + "loss": 0.3852, + "step": 6342 + }, + { + "epoch": 1.7807411566535654, + "grad_norm": 0.6611730456352234, + "learning_rate": 4.242600707323282e-06, + "loss": 0.3849, + "step": 6343 + }, + { + "epoch": 1.781021897810219, + "grad_norm": 0.5678191184997559, + "learning_rate": 4.240986242328083e-06, + "loss": 0.3535, + "step": 6344 + }, + { + "epoch": 1.7813026389668725, + "grad_norm": 0.582705020904541, + "learning_rate": 4.239371858330115e-06, + "loss": 0.3634, + "step": 6345 + }, + { + "epoch": 1.7815833801235261, + "grad_norm": 0.6255151033401489, + "learning_rate": 4.237757555501649e-06, + "loss": 0.3499, + "step": 6346 + }, + { + "epoch": 1.7818641212801798, + "grad_norm": 0.6268560886383057, + "learning_rate": 4.236143334014958e-06, + "loss": 0.3739, + "step": 6347 + }, + { + "epoch": 1.7821448624368332, + "grad_norm": 0.5669793486595154, + "learning_rate": 4.2345291940423e-06, + "loss": 0.4106, + "step": 6348 + }, + { + "epoch": 1.7824256035934867, + "grad_norm": 0.5445272326469421, + "learning_rate": 4.232915135755924e-06, + "loss": 0.3622, + "step": 6349 + }, + { + "epoch": 1.7827063447501403, + "grad_norm": 0.7186790108680725, + "learning_rate": 4.231301159328076e-06, + "loss": 0.4353, + "step": 6350 + }, + { + "epoch": 1.782987085906794, + "grad_norm": 0.5925012826919556, + "learning_rate": 4.229687264930989e-06, + "loss": 0.3828, + "step": 6351 + }, + { + "epoch": 1.7832678270634474, + "grad_norm": 0.6374977827072144, + "learning_rate": 4.2280734527368865e-06, + "loss": 0.3847, + "step": 6352 + }, + { + "epoch": 1.783548568220101, + "grad_norm": 0.5620571970939636, + "learning_rate": 4.226459722917985e-06, + "loss": 0.3703, + "step": 6353 + }, + { + "epoch": 1.7838293093767548, + "grad_norm": 0.5990384221076965, + "learning_rate": 4.224846075646491e-06, + "loss": 0.3905, + "step": 6354 + }, + { + "epoch": 1.7841100505334082, + "grad_norm": 0.6036943793296814, + "learning_rate": 4.223232511094605e-06, + "loss": 0.3564, + "step": 6355 + }, + { + "epoch": 1.7843907916900617, + "grad_norm": 0.7412349581718445, + "learning_rate": 4.221619029434513e-06, + "loss": 0.4274, + "step": 6356 + }, + { + "epoch": 1.7846715328467153, + "grad_norm": 0.577129602432251, + "learning_rate": 4.220005630838399e-06, + "loss": 0.376, + "step": 6357 + }, + { + "epoch": 1.784952274003369, + "grad_norm": 0.5761657357215881, + "learning_rate": 4.2183923154784325e-06, + "loss": 0.3975, + "step": 6358 + }, + { + "epoch": 1.7852330151600224, + "grad_norm": 0.5842590928077698, + "learning_rate": 4.216779083526779e-06, + "loss": 0.3785, + "step": 6359 + }, + { + "epoch": 1.7855137563166759, + "grad_norm": 0.6966604590415955, + "learning_rate": 4.2151659351555895e-06, + "loss": 0.3885, + "step": 6360 + }, + { + "epoch": 1.7857944974733297, + "grad_norm": 0.551936149597168, + "learning_rate": 4.213552870537013e-06, + "loss": 0.4025, + "step": 6361 + }, + { + "epoch": 1.7860752386299832, + "grad_norm": 0.6577399969100952, + "learning_rate": 4.211939889843182e-06, + "loss": 0.3438, + "step": 6362 + }, + { + "epoch": 1.7863559797866366, + "grad_norm": 0.6121562719345093, + "learning_rate": 4.210326993246225e-06, + "loss": 0.3654, + "step": 6363 + }, + { + "epoch": 1.7866367209432903, + "grad_norm": 0.6832751035690308, + "learning_rate": 4.208714180918262e-06, + "loss": 0.3733, + "step": 6364 + }, + { + "epoch": 1.786917462099944, + "grad_norm": 0.6492119431495667, + "learning_rate": 4.2071014530314e-06, + "loss": 0.423, + "step": 6365 + }, + { + "epoch": 1.7871982032565974, + "grad_norm": 0.6280763745307922, + "learning_rate": 4.205488809757741e-06, + "loss": 0.3487, + "step": 6366 + }, + { + "epoch": 1.7874789444132508, + "grad_norm": 0.5772631764411926, + "learning_rate": 4.203876251269375e-06, + "loss": 0.398, + "step": 6367 + }, + { + "epoch": 1.7877596855699045, + "grad_norm": 0.6471928954124451, + "learning_rate": 4.202263777738385e-06, + "loss": 0.409, + "step": 6368 + }, + { + "epoch": 1.7880404267265582, + "grad_norm": 0.7307161688804626, + "learning_rate": 4.200651389336843e-06, + "loss": 0.3565, + "step": 6369 + }, + { + "epoch": 1.7883211678832116, + "grad_norm": 0.7405007481575012, + "learning_rate": 4.199039086236815e-06, + "loss": 0.3534, + "step": 6370 + }, + { + "epoch": 1.7886019090398653, + "grad_norm": 0.6418318152427673, + "learning_rate": 4.197426868610354e-06, + "loss": 0.3561, + "step": 6371 + }, + { + "epoch": 1.788882650196519, + "grad_norm": 0.6842749714851379, + "learning_rate": 4.195814736629506e-06, + "loss": 0.4021, + "step": 6372 + }, + { + "epoch": 1.7891633913531724, + "grad_norm": 0.5714514851570129, + "learning_rate": 4.194202690466311e-06, + "loss": 0.3819, + "step": 6373 + }, + { + "epoch": 1.7894441325098258, + "grad_norm": 0.7125022411346436, + "learning_rate": 4.19259073029279e-06, + "loss": 0.4236, + "step": 6374 + }, + { + "epoch": 1.7897248736664795, + "grad_norm": 0.6383434534072876, + "learning_rate": 4.190978856280967e-06, + "loss": 0.4211, + "step": 6375 + }, + { + "epoch": 1.7900056148231331, + "grad_norm": 0.6919798254966736, + "learning_rate": 4.189367068602852e-06, + "loss": 0.3707, + "step": 6376 + }, + { + "epoch": 1.7902863559797866, + "grad_norm": 0.6926742196083069, + "learning_rate": 4.18775536743044e-06, + "loss": 0.3941, + "step": 6377 + }, + { + "epoch": 1.7905670971364402, + "grad_norm": 0.6804854273796082, + "learning_rate": 4.186143752935725e-06, + "loss": 0.405, + "step": 6378 + }, + { + "epoch": 1.790847838293094, + "grad_norm": 0.6851863861083984, + "learning_rate": 4.184532225290687e-06, + "loss": 0.3632, + "step": 6379 + }, + { + "epoch": 1.7911285794497473, + "grad_norm": 0.6039943695068359, + "learning_rate": 4.182920784667299e-06, + "loss": 0.3582, + "step": 6380 + }, + { + "epoch": 1.7914093206064008, + "grad_norm": 0.8127399682998657, + "learning_rate": 4.181309431237523e-06, + "loss": 0.3652, + "step": 6381 + }, + { + "epoch": 1.7916900617630545, + "grad_norm": 0.7083007097244263, + "learning_rate": 4.179698165173316e-06, + "loss": 0.3763, + "step": 6382 + }, + { + "epoch": 1.7919708029197081, + "grad_norm": 0.5860080718994141, + "learning_rate": 4.178086986646618e-06, + "loss": 0.3549, + "step": 6383 + }, + { + "epoch": 1.7922515440763616, + "grad_norm": 0.6356669664382935, + "learning_rate": 4.1764758958293665e-06, + "loss": 0.3987, + "step": 6384 + }, + { + "epoch": 1.792532285233015, + "grad_norm": 0.6829647421836853, + "learning_rate": 4.174864892893485e-06, + "loss": 0.3733, + "step": 6385 + }, + { + "epoch": 1.7928130263896689, + "grad_norm": 0.6945798993110657, + "learning_rate": 4.173253978010891e-06, + "loss": 0.3611, + "step": 6386 + }, + { + "epoch": 1.7930937675463223, + "grad_norm": 0.7382370829582214, + "learning_rate": 4.171643151353492e-06, + "loss": 0.4309, + "step": 6387 + }, + { + "epoch": 1.7933745087029758, + "grad_norm": 0.6458000540733337, + "learning_rate": 4.170032413093185e-06, + "loss": 0.3478, + "step": 6388 + }, + { + "epoch": 1.7936552498596294, + "grad_norm": 0.7223645448684692, + "learning_rate": 4.168421763401857e-06, + "loss": 0.3803, + "step": 6389 + }, + { + "epoch": 1.793935991016283, + "grad_norm": 0.6951597332954407, + "learning_rate": 4.1668112024513875e-06, + "loss": 0.4198, + "step": 6390 + }, + { + "epoch": 1.7942167321729365, + "grad_norm": 0.6678778529167175, + "learning_rate": 4.1652007304136446e-06, + "loss": 0.3857, + "step": 6391 + }, + { + "epoch": 1.79449747332959, + "grad_norm": 0.745650053024292, + "learning_rate": 4.163590347460489e-06, + "loss": 0.3989, + "step": 6392 + }, + { + "epoch": 1.7947782144862436, + "grad_norm": 0.6009384989738464, + "learning_rate": 4.161980053763769e-06, + "loss": 0.3711, + "step": 6393 + }, + { + "epoch": 1.7950589556428973, + "grad_norm": 0.6509097814559937, + "learning_rate": 4.160369849495329e-06, + "loss": 0.4101, + "step": 6394 + }, + { + "epoch": 1.7953396967995507, + "grad_norm": 0.6226939558982849, + "learning_rate": 4.158759734826995e-06, + "loss": 0.3934, + "step": 6395 + }, + { + "epoch": 1.7956204379562044, + "grad_norm": 0.6726289391517639, + "learning_rate": 4.157149709930592e-06, + "loss": 0.3966, + "step": 6396 + }, + { + "epoch": 1.795901179112858, + "grad_norm": 0.641761302947998, + "learning_rate": 4.15553977497793e-06, + "loss": 0.3934, + "step": 6397 + }, + { + "epoch": 1.7961819202695115, + "grad_norm": 0.5958701372146606, + "learning_rate": 4.153929930140812e-06, + "loss": 0.3615, + "step": 6398 + }, + { + "epoch": 1.796462661426165, + "grad_norm": 0.681663990020752, + "learning_rate": 4.15232017559103e-06, + "loss": 0.357, + "step": 6399 + }, + { + "epoch": 1.7967434025828186, + "grad_norm": 0.6625244617462158, + "learning_rate": 4.1507105115003665e-06, + "loss": 0.3852, + "step": 6400 + }, + { + "epoch": 1.7970241437394723, + "grad_norm": 0.6397461295127869, + "learning_rate": 4.149100938040598e-06, + "loss": 0.3958, + "step": 6401 + }, + { + "epoch": 1.7973048848961257, + "grad_norm": 0.6216028332710266, + "learning_rate": 4.1474914553834846e-06, + "loss": 0.368, + "step": 6402 + }, + { + "epoch": 1.7975856260527794, + "grad_norm": 0.5505136251449585, + "learning_rate": 4.145882063700783e-06, + "loss": 0.385, + "step": 6403 + }, + { + "epoch": 1.797866367209433, + "grad_norm": 0.6373128890991211, + "learning_rate": 4.144272763164236e-06, + "loss": 0.441, + "step": 6404 + }, + { + "epoch": 1.7981471083660865, + "grad_norm": 0.6197081208229065, + "learning_rate": 4.142663553945578e-06, + "loss": 0.3666, + "step": 6405 + }, + { + "epoch": 1.79842784952274, + "grad_norm": 0.5677542090415955, + "learning_rate": 4.141054436216533e-06, + "loss": 0.3614, + "step": 6406 + }, + { + "epoch": 1.7987085906793936, + "grad_norm": 0.6772831678390503, + "learning_rate": 4.1394454101488185e-06, + "loss": 0.42, + "step": 6407 + }, + { + "epoch": 1.7989893318360473, + "grad_norm": 0.590928316116333, + "learning_rate": 4.137836475914137e-06, + "loss": 0.3672, + "step": 6408 + }, + { + "epoch": 1.7992700729927007, + "grad_norm": 0.6583422422409058, + "learning_rate": 4.136227633684187e-06, + "loss": 0.4089, + "step": 6409 + }, + { + "epoch": 1.7995508141493541, + "grad_norm": 0.6012908816337585, + "learning_rate": 4.134618883630653e-06, + "loss": 0.3712, + "step": 6410 + }, + { + "epoch": 1.799831555306008, + "grad_norm": 0.6087749600410461, + "learning_rate": 4.133010225925208e-06, + "loss": 0.3761, + "step": 6411 + }, + { + "epoch": 1.8001122964626615, + "grad_norm": 0.6169080138206482, + "learning_rate": 4.131401660739522e-06, + "loss": 0.3768, + "step": 6412 + }, + { + "epoch": 1.800393037619315, + "grad_norm": 0.5806935429573059, + "learning_rate": 4.129793188245248e-06, + "loss": 0.3887, + "step": 6413 + }, + { + "epoch": 1.8006737787759686, + "grad_norm": 0.6660763621330261, + "learning_rate": 4.128184808614035e-06, + "loss": 0.4043, + "step": 6414 + }, + { + "epoch": 1.8009545199326222, + "grad_norm": 0.6095353960990906, + "learning_rate": 4.126576522017516e-06, + "loss": 0.3823, + "step": 6415 + }, + { + "epoch": 1.8012352610892757, + "grad_norm": 0.6177636981010437, + "learning_rate": 4.124968328627321e-06, + "loss": 0.4277, + "step": 6416 + }, + { + "epoch": 1.8015160022459291, + "grad_norm": 0.6765494346618652, + "learning_rate": 4.123360228615064e-06, + "loss": 0.3682, + "step": 6417 + }, + { + "epoch": 1.8017967434025828, + "grad_norm": 0.5754491686820984, + "learning_rate": 4.121752222152351e-06, + "loss": 0.3381, + "step": 6418 + }, + { + "epoch": 1.8020774845592364, + "grad_norm": 0.6277416944503784, + "learning_rate": 4.12014430941078e-06, + "loss": 0.4018, + "step": 6419 + }, + { + "epoch": 1.8023582257158899, + "grad_norm": 0.6139590740203857, + "learning_rate": 4.1185364905619365e-06, + "loss": 0.3685, + "step": 6420 + }, + { + "epoch": 1.8026389668725435, + "grad_norm": 0.6387202739715576, + "learning_rate": 4.116928765777397e-06, + "loss": 0.3638, + "step": 6421 + }, + { + "epoch": 1.8029197080291972, + "grad_norm": 0.5808118581771851, + "learning_rate": 4.11532113522873e-06, + "loss": 0.4067, + "step": 6422 + }, + { + "epoch": 1.8032004491858507, + "grad_norm": 0.6156762838363647, + "learning_rate": 4.113713599087488e-06, + "loss": 0.3581, + "step": 6423 + }, + { + "epoch": 1.803481190342504, + "grad_norm": 0.6466835141181946, + "learning_rate": 4.11210615752522e-06, + "loss": 0.4458, + "step": 6424 + }, + { + "epoch": 1.8037619314991578, + "grad_norm": 0.5484521389007568, + "learning_rate": 4.1104988107134605e-06, + "loss": 0.408, + "step": 6425 + }, + { + "epoch": 1.8040426726558114, + "grad_norm": 0.656670868396759, + "learning_rate": 4.108891558823737e-06, + "loss": 0.3845, + "step": 6426 + }, + { + "epoch": 1.8043234138124649, + "grad_norm": 0.6025376319885254, + "learning_rate": 4.1072844020275646e-06, + "loss": 0.3551, + "step": 6427 + }, + { + "epoch": 1.8046041549691185, + "grad_norm": 0.6003783345222473, + "learning_rate": 4.10567734049645e-06, + "loss": 0.3834, + "step": 6428 + }, + { + "epoch": 1.8048848961257722, + "grad_norm": 0.5879949927330017, + "learning_rate": 4.104070374401888e-06, + "loss": 0.3454, + "step": 6429 + }, + { + "epoch": 1.8051656372824256, + "grad_norm": 0.6855700016021729, + "learning_rate": 4.102463503915364e-06, + "loss": 0.3708, + "step": 6430 + }, + { + "epoch": 1.805446378439079, + "grad_norm": 0.6719605326652527, + "learning_rate": 4.100856729208354e-06, + "loss": 0.413, + "step": 6431 + }, + { + "epoch": 1.8057271195957327, + "grad_norm": 0.621315062046051, + "learning_rate": 4.099250050452323e-06, + "loss": 0.3742, + "step": 6432 + }, + { + "epoch": 1.8060078607523864, + "grad_norm": 0.6734421849250793, + "learning_rate": 4.097643467818724e-06, + "loss": 0.4077, + "step": 6433 + }, + { + "epoch": 1.8062886019090398, + "grad_norm": 0.6684174537658691, + "learning_rate": 4.0960369814790035e-06, + "loss": 0.3671, + "step": 6434 + }, + { + "epoch": 1.8065693430656933, + "grad_norm": 0.6640192270278931, + "learning_rate": 4.094430591604594e-06, + "loss": 0.4058, + "step": 6435 + }, + { + "epoch": 1.806850084222347, + "grad_norm": 0.6472187638282776, + "learning_rate": 4.092824298366922e-06, + "loss": 0.3693, + "step": 6436 + }, + { + "epoch": 1.8071308253790006, + "grad_norm": 0.6261100172996521, + "learning_rate": 4.091218101937398e-06, + "loss": 0.3705, + "step": 6437 + }, + { + "epoch": 1.807411566535654, + "grad_norm": 0.6097570061683655, + "learning_rate": 4.089612002487428e-06, + "loss": 0.3594, + "step": 6438 + }, + { + "epoch": 1.8076923076923077, + "grad_norm": 0.6226575374603271, + "learning_rate": 4.088006000188403e-06, + "loss": 0.38, + "step": 6439 + }, + { + "epoch": 1.8079730488489614, + "grad_norm": 0.6200900673866272, + "learning_rate": 4.086400095211707e-06, + "loss": 0.3751, + "step": 6440 + }, + { + "epoch": 1.8082537900056148, + "grad_norm": 0.6498468518257141, + "learning_rate": 4.0847942877287105e-06, + "loss": 0.4159, + "step": 6441 + }, + { + "epoch": 1.8085345311622683, + "grad_norm": 0.6783927083015442, + "learning_rate": 4.083188577910777e-06, + "loss": 0.3936, + "step": 6442 + }, + { + "epoch": 1.808815272318922, + "grad_norm": 0.6968936324119568, + "learning_rate": 4.081582965929257e-06, + "loss": 0.3893, + "step": 6443 + }, + { + "epoch": 1.8090960134755756, + "grad_norm": 0.6645230650901794, + "learning_rate": 4.079977451955493e-06, + "loss": 0.3772, + "step": 6444 + }, + { + "epoch": 1.809376754632229, + "grad_norm": 0.5472118854522705, + "learning_rate": 4.078372036160812e-06, + "loss": 0.3699, + "step": 6445 + }, + { + "epoch": 1.8096574957888827, + "grad_norm": 0.6444216370582581, + "learning_rate": 4.076766718716537e-06, + "loss": 0.3696, + "step": 6446 + }, + { + "epoch": 1.8099382369455363, + "grad_norm": 0.5640847682952881, + "learning_rate": 4.075161499793976e-06, + "loss": 0.3738, + "step": 6447 + }, + { + "epoch": 1.8102189781021898, + "grad_norm": 0.6268128752708435, + "learning_rate": 4.073556379564429e-06, + "loss": 0.375, + "step": 6448 + }, + { + "epoch": 1.8104997192588432, + "grad_norm": 0.6185505390167236, + "learning_rate": 4.071951358199184e-06, + "loss": 0.3408, + "step": 6449 + }, + { + "epoch": 1.810780460415497, + "grad_norm": 0.7345503568649292, + "learning_rate": 4.070346435869518e-06, + "loss": 0.3982, + "step": 6450 + }, + { + "epoch": 1.8110612015721506, + "grad_norm": 0.6497979164123535, + "learning_rate": 4.0687416127467e-06, + "loss": 0.405, + "step": 6451 + }, + { + "epoch": 1.811341942728804, + "grad_norm": 0.6260815262794495, + "learning_rate": 4.067136889001986e-06, + "loss": 0.4078, + "step": 6452 + }, + { + "epoch": 1.8116226838854577, + "grad_norm": 0.6811968684196472, + "learning_rate": 4.065532264806623e-06, + "loss": 0.4309, + "step": 6453 + }, + { + "epoch": 1.8119034250421113, + "grad_norm": 0.6197689771652222, + "learning_rate": 4.063927740331845e-06, + "loss": 0.3978, + "step": 6454 + }, + { + "epoch": 1.8121841661987648, + "grad_norm": 0.7707491517066956, + "learning_rate": 4.062323315748877e-06, + "loss": 0.423, + "step": 6455 + }, + { + "epoch": 1.8124649073554182, + "grad_norm": 0.7443587183952332, + "learning_rate": 4.060718991228934e-06, + "loss": 0.4071, + "step": 6456 + }, + { + "epoch": 1.8127456485120719, + "grad_norm": 0.7394616603851318, + "learning_rate": 4.059114766943219e-06, + "loss": 0.3712, + "step": 6457 + }, + { + "epoch": 1.8130263896687255, + "grad_norm": 0.7203782200813293, + "learning_rate": 4.0575106430629255e-06, + "loss": 0.3824, + "step": 6458 + }, + { + "epoch": 1.813307130825379, + "grad_norm": 0.6342647075653076, + "learning_rate": 4.055906619759236e-06, + "loss": 0.4031, + "step": 6459 + }, + { + "epoch": 1.8135878719820324, + "grad_norm": 0.5927562713623047, + "learning_rate": 4.05430269720332e-06, + "loss": 0.3899, + "step": 6460 + }, + { + "epoch": 1.813868613138686, + "grad_norm": 0.61030113697052, + "learning_rate": 4.052698875566339e-06, + "loss": 0.4144, + "step": 6461 + }, + { + "epoch": 1.8141493542953397, + "grad_norm": 0.6469379663467407, + "learning_rate": 4.051095155019444e-06, + "loss": 0.3706, + "step": 6462 + }, + { + "epoch": 1.8144300954519932, + "grad_norm": 0.77492755651474, + "learning_rate": 4.049491535733773e-06, + "loss": 0.3409, + "step": 6463 + }, + { + "epoch": 1.8147108366086468, + "grad_norm": 0.6135831475257874, + "learning_rate": 4.047888017880453e-06, + "loss": 0.369, + "step": 6464 + }, + { + "epoch": 1.8149915777653005, + "grad_norm": 0.772193968296051, + "learning_rate": 4.046284601630606e-06, + "loss": 0.4013, + "step": 6465 + }, + { + "epoch": 1.815272318921954, + "grad_norm": 0.6679842472076416, + "learning_rate": 4.044681287155334e-06, + "loss": 0.3721, + "step": 6466 + }, + { + "epoch": 1.8155530600786074, + "grad_norm": 0.7335694432258606, + "learning_rate": 4.043078074625734e-06, + "loss": 0.3764, + "step": 6467 + }, + { + "epoch": 1.815833801235261, + "grad_norm": 0.6807160973548889, + "learning_rate": 4.041474964212891e-06, + "loss": 0.3692, + "step": 6468 + }, + { + "epoch": 1.8161145423919147, + "grad_norm": 0.6571961641311646, + "learning_rate": 4.039871956087879e-06, + "loss": 0.3923, + "step": 6469 + }, + { + "epoch": 1.8163952835485682, + "grad_norm": 0.6566359400749207, + "learning_rate": 4.0382690504217615e-06, + "loss": 0.3867, + "step": 6470 + }, + { + "epoch": 1.8166760247052218, + "grad_norm": 0.6149438619613647, + "learning_rate": 4.036666247385591e-06, + "loss": 0.3503, + "step": 6471 + }, + { + "epoch": 1.8169567658618755, + "grad_norm": 0.6013513207435608, + "learning_rate": 4.035063547150408e-06, + "loss": 0.3967, + "step": 6472 + }, + { + "epoch": 1.817237507018529, + "grad_norm": 0.5649804472923279, + "learning_rate": 4.033460949887242e-06, + "loss": 0.3718, + "step": 6473 + }, + { + "epoch": 1.8175182481751824, + "grad_norm": 0.6157485246658325, + "learning_rate": 4.031858455767113e-06, + "loss": 0.3748, + "step": 6474 + }, + { + "epoch": 1.817798989331836, + "grad_norm": 0.6589345335960388, + "learning_rate": 4.030256064961029e-06, + "loss": 0.3966, + "step": 6475 + }, + { + "epoch": 1.8180797304884897, + "grad_norm": 0.6747671961784363, + "learning_rate": 4.0286537776399855e-06, + "loss": 0.3776, + "step": 6476 + }, + { + "epoch": 1.8183604716451431, + "grad_norm": 0.7009803056716919, + "learning_rate": 4.027051593974973e-06, + "loss": 0.4255, + "step": 6477 + }, + { + "epoch": 1.8186412128017966, + "grad_norm": 0.6249062418937683, + "learning_rate": 4.025449514136963e-06, + "loss": 0.3657, + "step": 6478 + }, + { + "epoch": 1.8189219539584505, + "grad_norm": 0.6347492337226868, + "learning_rate": 4.023847538296921e-06, + "loss": 0.4021, + "step": 6479 + }, + { + "epoch": 1.819202695115104, + "grad_norm": 0.6176151633262634, + "learning_rate": 4.0222456666257994e-06, + "loss": 0.4387, + "step": 6480 + }, + { + "epoch": 1.8194834362717573, + "grad_norm": 0.5777949094772339, + "learning_rate": 4.020643899294541e-06, + "loss": 0.3304, + "step": 6481 + }, + { + "epoch": 1.819764177428411, + "grad_norm": 0.6171284914016724, + "learning_rate": 4.0190422364740745e-06, + "loss": 0.3835, + "step": 6482 + }, + { + "epoch": 1.8200449185850647, + "grad_norm": 0.6122297048568726, + "learning_rate": 4.017440678335319e-06, + "loss": 0.3948, + "step": 6483 + }, + { + "epoch": 1.8203256597417181, + "grad_norm": 0.5959627628326416, + "learning_rate": 4.015839225049186e-06, + "loss": 0.3404, + "step": 6484 + }, + { + "epoch": 1.8206064008983716, + "grad_norm": 0.5845166444778442, + "learning_rate": 4.01423787678657e-06, + "loss": 0.3491, + "step": 6485 + }, + { + "epoch": 1.8208871420550252, + "grad_norm": 0.6463956832885742, + "learning_rate": 4.012636633718359e-06, + "loss": 0.3933, + "step": 6486 + }, + { + "epoch": 1.8211678832116789, + "grad_norm": 0.6332153081893921, + "learning_rate": 4.0110354960154256e-06, + "loss": 0.3838, + "step": 6487 + }, + { + "epoch": 1.8214486243683323, + "grad_norm": 0.6473224759101868, + "learning_rate": 4.009434463848634e-06, + "loss": 0.3393, + "step": 6488 + }, + { + "epoch": 1.821729365524986, + "grad_norm": 0.6339811682701111, + "learning_rate": 4.007833537388836e-06, + "loss": 0.3964, + "step": 6489 + }, + { + "epoch": 1.8220101066816397, + "grad_norm": 0.6294807195663452, + "learning_rate": 4.006232716806874e-06, + "loss": 0.3986, + "step": 6490 + }, + { + "epoch": 1.822290847838293, + "grad_norm": 0.6058825850486755, + "learning_rate": 4.004632002273576e-06, + "loss": 0.3617, + "step": 6491 + }, + { + "epoch": 1.8225715889949465, + "grad_norm": 0.6486465930938721, + "learning_rate": 4.003031393959761e-06, + "loss": 0.3812, + "step": 6492 + }, + { + "epoch": 1.8228523301516002, + "grad_norm": 0.6626247763633728, + "learning_rate": 4.001430892036236e-06, + "loss": 0.4084, + "step": 6493 + }, + { + "epoch": 1.8231330713082539, + "grad_norm": 0.7068142294883728, + "learning_rate": 3.999830496673797e-06, + "loss": 0.4151, + "step": 6494 + }, + { + "epoch": 1.8234138124649073, + "grad_norm": 0.6361899971961975, + "learning_rate": 3.998230208043227e-06, + "loss": 0.3832, + "step": 6495 + }, + { + "epoch": 1.823694553621561, + "grad_norm": 0.6086771488189697, + "learning_rate": 3.9966300263153e-06, + "loss": 0.3535, + "step": 6496 + }, + { + "epoch": 1.8239752947782146, + "grad_norm": 0.6495249271392822, + "learning_rate": 3.995029951660777e-06, + "loss": 0.3953, + "step": 6497 + }, + { + "epoch": 1.824256035934868, + "grad_norm": 0.6098946332931519, + "learning_rate": 3.993429984250408e-06, + "loss": 0.3616, + "step": 6498 + }, + { + "epoch": 1.8245367770915215, + "grad_norm": 0.680414617061615, + "learning_rate": 3.9918301242549316e-06, + "loss": 0.3586, + "step": 6499 + }, + { + "epoch": 1.8248175182481752, + "grad_norm": 0.6312323808670044, + "learning_rate": 3.990230371845075e-06, + "loss": 0.3427, + "step": 6500 + }, + { + "epoch": 1.8250982594048288, + "grad_norm": 0.6725577116012573, + "learning_rate": 3.988630727191552e-06, + "loss": 0.3775, + "step": 6501 + }, + { + "epoch": 1.8253790005614823, + "grad_norm": 0.6414360404014587, + "learning_rate": 3.98703119046507e-06, + "loss": 0.3586, + "step": 6502 + }, + { + "epoch": 1.8256597417181357, + "grad_norm": 0.6267603635787964, + "learning_rate": 3.985431761836321e-06, + "loss": 0.3808, + "step": 6503 + }, + { + "epoch": 1.8259404828747896, + "grad_norm": 0.6660203337669373, + "learning_rate": 3.983832441475984e-06, + "loss": 0.3718, + "step": 6504 + }, + { + "epoch": 1.826221224031443, + "grad_norm": 0.5696361064910889, + "learning_rate": 3.98223322955473e-06, + "loss": 0.3814, + "step": 6505 + }, + { + "epoch": 1.8265019651880965, + "grad_norm": 0.6294177174568176, + "learning_rate": 3.980634126243217e-06, + "loss": 0.3941, + "step": 6506 + }, + { + "epoch": 1.8267827063447502, + "grad_norm": 0.6637787222862244, + "learning_rate": 3.9790351317120904e-06, + "loss": 0.3888, + "step": 6507 + }, + { + "epoch": 1.8270634475014038, + "grad_norm": 0.5955951809883118, + "learning_rate": 3.977436246131987e-06, + "loss": 0.385, + "step": 6508 + }, + { + "epoch": 1.8273441886580573, + "grad_norm": 0.6251006126403809, + "learning_rate": 3.975837469673528e-06, + "loss": 0.4361, + "step": 6509 + }, + { + "epoch": 1.8276249298147107, + "grad_norm": 0.6367130875587463, + "learning_rate": 3.974238802507324e-06, + "loss": 0.376, + "step": 6510 + }, + { + "epoch": 1.8279056709713644, + "grad_norm": 0.6377788782119751, + "learning_rate": 3.972640244803978e-06, + "loss": 0.3701, + "step": 6511 + }, + { + "epoch": 1.828186412128018, + "grad_norm": 0.6418980956077576, + "learning_rate": 3.971041796734075e-06, + "loss": 0.3804, + "step": 6512 + }, + { + "epoch": 1.8284671532846715, + "grad_norm": 0.6587963104248047, + "learning_rate": 3.969443458468194e-06, + "loss": 0.3728, + "step": 6513 + }, + { + "epoch": 1.8287478944413251, + "grad_norm": 0.6418294906616211, + "learning_rate": 3.967845230176898e-06, + "loss": 0.3442, + "step": 6514 + }, + { + "epoch": 1.8290286355979788, + "grad_norm": 0.600695788860321, + "learning_rate": 3.9662471120307406e-06, + "loss": 0.3981, + "step": 6515 + }, + { + "epoch": 1.8293093767546322, + "grad_norm": 0.7527453899383545, + "learning_rate": 3.964649104200262e-06, + "loss": 0.3924, + "step": 6516 + }, + { + "epoch": 1.8295901179112857, + "grad_norm": 0.6880242824554443, + "learning_rate": 3.963051206855993e-06, + "loss": 0.3854, + "step": 6517 + }, + { + "epoch": 1.8298708590679393, + "grad_norm": 0.5922836065292358, + "learning_rate": 3.961453420168451e-06, + "loss": 0.4188, + "step": 6518 + }, + { + "epoch": 1.830151600224593, + "grad_norm": 0.6433671116828918, + "learning_rate": 3.959855744308142e-06, + "loss": 0.3837, + "step": 6519 + }, + { + "epoch": 1.8304323413812464, + "grad_norm": 0.6398699879646301, + "learning_rate": 3.95825817944556e-06, + "loss": 0.377, + "step": 6520 + }, + { + "epoch": 1.8307130825379, + "grad_norm": 0.6227861046791077, + "learning_rate": 3.956660725751187e-06, + "loss": 0.3595, + "step": 6521 + }, + { + "epoch": 1.8309938236945538, + "grad_norm": 0.6193345189094543, + "learning_rate": 3.955063383395492e-06, + "loss": 0.3496, + "step": 6522 + }, + { + "epoch": 1.8312745648512072, + "grad_norm": 0.6425995230674744, + "learning_rate": 3.953466152548935e-06, + "loss": 0.3517, + "step": 6523 + }, + { + "epoch": 1.8315553060078607, + "grad_norm": 0.733466386795044, + "learning_rate": 3.951869033381963e-06, + "loss": 0.3263, + "step": 6524 + }, + { + "epoch": 1.8318360471645143, + "grad_norm": 0.6365712285041809, + "learning_rate": 3.95027202606501e-06, + "loss": 0.3844, + "step": 6525 + }, + { + "epoch": 1.832116788321168, + "grad_norm": 0.6588440537452698, + "learning_rate": 3.948675130768497e-06, + "loss": 0.3684, + "step": 6526 + }, + { + "epoch": 1.8323975294778214, + "grad_norm": 0.6574601531028748, + "learning_rate": 3.947078347662836e-06, + "loss": 0.3709, + "step": 6527 + }, + { + "epoch": 1.8326782706344749, + "grad_norm": 0.6721467971801758, + "learning_rate": 3.945481676918428e-06, + "loss": 0.3809, + "step": 6528 + }, + { + "epoch": 1.8329590117911287, + "grad_norm": 0.6558837890625, + "learning_rate": 3.9438851187056564e-06, + "loss": 0.3893, + "step": 6529 + }, + { + "epoch": 1.8332397529477822, + "grad_norm": 0.5771139860153198, + "learning_rate": 3.942288673194899e-06, + "loss": 0.3655, + "step": 6530 + }, + { + "epoch": 1.8335204941044356, + "grad_norm": 0.6688631176948547, + "learning_rate": 3.940692340556516e-06, + "loss": 0.379, + "step": 6531 + }, + { + "epoch": 1.8338012352610893, + "grad_norm": 0.7210232615470886, + "learning_rate": 3.939096120960859e-06, + "loss": 0.3791, + "step": 6532 + }, + { + "epoch": 1.834081976417743, + "grad_norm": 0.7092217206954956, + "learning_rate": 3.937500014578267e-06, + "loss": 0.3901, + "step": 6533 + }, + { + "epoch": 1.8343627175743964, + "grad_norm": 0.7015949487686157, + "learning_rate": 3.9359040215790656e-06, + "loss": 0.383, + "step": 6534 + }, + { + "epoch": 1.8346434587310498, + "grad_norm": 0.6470916867256165, + "learning_rate": 3.93430814213357e-06, + "loss": 0.3674, + "step": 6535 + }, + { + "epoch": 1.8349241998877035, + "grad_norm": 0.6438083648681641, + "learning_rate": 3.932712376412084e-06, + "loss": 0.3823, + "step": 6536 + }, + { + "epoch": 1.8352049410443572, + "grad_norm": 0.6496115326881409, + "learning_rate": 3.9311167245848945e-06, + "loss": 0.3173, + "step": 6537 + }, + { + "epoch": 1.8354856822010106, + "grad_norm": 0.7356016039848328, + "learning_rate": 3.929521186822281e-06, + "loss": 0.4127, + "step": 6538 + }, + { + "epoch": 1.8357664233576643, + "grad_norm": 0.6692792773246765, + "learning_rate": 3.92792576329451e-06, + "loss": 0.4103, + "step": 6539 + }, + { + "epoch": 1.836047164514318, + "grad_norm": 0.6190904378890991, + "learning_rate": 3.926330454171835e-06, + "loss": 0.371, + "step": 6540 + }, + { + "epoch": 1.8363279056709714, + "grad_norm": 0.6613806486129761, + "learning_rate": 3.924735259624496e-06, + "loss": 0.3771, + "step": 6541 + }, + { + "epoch": 1.8366086468276248, + "grad_norm": 0.5960564613342285, + "learning_rate": 3.9231401798227256e-06, + "loss": 0.4025, + "step": 6542 + }, + { + "epoch": 1.8368893879842785, + "grad_norm": 0.6149198412895203, + "learning_rate": 3.9215452149367375e-06, + "loss": 0.3946, + "step": 6543 + }, + { + "epoch": 1.8371701291409321, + "grad_norm": 0.6016678214073181, + "learning_rate": 3.919950365136737e-06, + "loss": 0.3798, + "step": 6544 + }, + { + "epoch": 1.8374508702975856, + "grad_norm": 0.6572104692459106, + "learning_rate": 3.918355630592919e-06, + "loss": 0.3869, + "step": 6545 + }, + { + "epoch": 1.8377316114542392, + "grad_norm": 0.5891427993774414, + "learning_rate": 3.9167610114754595e-06, + "loss": 0.3549, + "step": 6546 + }, + { + "epoch": 1.838012352610893, + "grad_norm": 0.6644848585128784, + "learning_rate": 3.91516650795453e-06, + "loss": 0.3458, + "step": 6547 + }, + { + "epoch": 1.8382930937675463, + "grad_norm": 0.6368195414543152, + "learning_rate": 3.913572120200285e-06, + "loss": 0.4116, + "step": 6548 + }, + { + "epoch": 1.8385738349241998, + "grad_norm": 0.6424891352653503, + "learning_rate": 3.911977848382867e-06, + "loss": 0.3442, + "step": 6549 + }, + { + "epoch": 1.8388545760808535, + "grad_norm": 0.6905144453048706, + "learning_rate": 3.910383692672406e-06, + "loss": 0.3817, + "step": 6550 + }, + { + "epoch": 1.8391353172375071, + "grad_norm": 0.6814104318618774, + "learning_rate": 3.908789653239022e-06, + "loss": 0.4331, + "step": 6551 + }, + { + "epoch": 1.8394160583941606, + "grad_norm": 0.5546183586120605, + "learning_rate": 3.907195730252819e-06, + "loss": 0.4039, + "step": 6552 + }, + { + "epoch": 1.839696799550814, + "grad_norm": 0.6172484159469604, + "learning_rate": 3.905601923883894e-06, + "loss": 0.4318, + "step": 6553 + }, + { + "epoch": 1.8399775407074677, + "grad_norm": 0.609162449836731, + "learning_rate": 3.904008234302325e-06, + "loss": 0.3627, + "step": 6554 + }, + { + "epoch": 1.8402582818641213, + "grad_norm": 0.5798518061637878, + "learning_rate": 3.902414661678182e-06, + "loss": 0.3811, + "step": 6555 + }, + { + "epoch": 1.8405390230207748, + "grad_norm": 0.6442756652832031, + "learning_rate": 3.900821206181521e-06, + "loss": 0.3718, + "step": 6556 + }, + { + "epoch": 1.8408197641774284, + "grad_norm": 0.6947007775306702, + "learning_rate": 3.899227867982386e-06, + "loss": 0.4414, + "step": 6557 + }, + { + "epoch": 1.841100505334082, + "grad_norm": 0.6912604570388794, + "learning_rate": 3.897634647250808e-06, + "loss": 0.4063, + "step": 6558 + }, + { + "epoch": 1.8413812464907355, + "grad_norm": 0.6262841820716858, + "learning_rate": 3.896041544156805e-06, + "loss": 0.3996, + "step": 6559 + }, + { + "epoch": 1.841661987647389, + "grad_norm": 0.573444128036499, + "learning_rate": 3.894448558870382e-06, + "loss": 0.3816, + "step": 6560 + }, + { + "epoch": 1.8419427288040426, + "grad_norm": 0.5473210215568542, + "learning_rate": 3.892855691561535e-06, + "loss": 0.3919, + "step": 6561 + }, + { + "epoch": 1.8422234699606963, + "grad_norm": 0.6592857837677002, + "learning_rate": 3.891262942400243e-06, + "loss": 0.4205, + "step": 6562 + }, + { + "epoch": 1.8425042111173497, + "grad_norm": 0.6161134243011475, + "learning_rate": 3.889670311556476e-06, + "loss": 0.3769, + "step": 6563 + }, + { + "epoch": 1.8427849522740034, + "grad_norm": 0.6447630524635315, + "learning_rate": 3.888077799200189e-06, + "loss": 0.3758, + "step": 6564 + }, + { + "epoch": 1.843065693430657, + "grad_norm": 0.6067754030227661, + "learning_rate": 3.8864854055013235e-06, + "loss": 0.3791, + "step": 6565 + }, + { + "epoch": 1.8433464345873105, + "grad_norm": 0.7233385443687439, + "learning_rate": 3.8848931306298115e-06, + "loss": 0.3878, + "step": 6566 + }, + { + "epoch": 1.843627175743964, + "grad_norm": 0.6731755137443542, + "learning_rate": 3.88330097475557e-06, + "loss": 0.3664, + "step": 6567 + }, + { + "epoch": 1.8439079169006176, + "grad_norm": 0.6081092357635498, + "learning_rate": 3.881708938048504e-06, + "loss": 0.4029, + "step": 6568 + }, + { + "epoch": 1.8441886580572713, + "grad_norm": 0.6320384740829468, + "learning_rate": 3.880117020678506e-06, + "loss": 0.3976, + "step": 6569 + }, + { + "epoch": 1.8444693992139247, + "grad_norm": 0.7215650081634521, + "learning_rate": 3.878525222815457e-06, + "loss": 0.3794, + "step": 6570 + }, + { + "epoch": 1.8447501403705782, + "grad_norm": 0.6690097451210022, + "learning_rate": 3.876933544629221e-06, + "loss": 0.3505, + "step": 6571 + }, + { + "epoch": 1.845030881527232, + "grad_norm": 0.6703141331672668, + "learning_rate": 3.875341986289653e-06, + "loss": 0.3982, + "step": 6572 + }, + { + "epoch": 1.8453116226838855, + "grad_norm": 0.6017905473709106, + "learning_rate": 3.8737505479665946e-06, + "loss": 0.3662, + "step": 6573 + }, + { + "epoch": 1.845592363840539, + "grad_norm": 0.575630247592926, + "learning_rate": 3.872159229829873e-06, + "loss": 0.3891, + "step": 6574 + }, + { + "epoch": 1.8458731049971926, + "grad_norm": 0.6483700275421143, + "learning_rate": 3.870568032049306e-06, + "loss": 0.4058, + "step": 6575 + }, + { + "epoch": 1.8461538461538463, + "grad_norm": 0.6435593366622925, + "learning_rate": 3.868976954794696e-06, + "loss": 0.4025, + "step": 6576 + }, + { + "epoch": 1.8464345873104997, + "grad_norm": 0.671330988407135, + "learning_rate": 3.867385998235831e-06, + "loss": 0.4031, + "step": 6577 + }, + { + "epoch": 1.8467153284671531, + "grad_norm": 0.6058936715126038, + "learning_rate": 3.865795162542487e-06, + "loss": 0.3666, + "step": 6578 + }, + { + "epoch": 1.8469960696238068, + "grad_norm": 0.5750553011894226, + "learning_rate": 3.864204447884433e-06, + "loss": 0.339, + "step": 6579 + }, + { + "epoch": 1.8472768107804605, + "grad_norm": 0.6059145331382751, + "learning_rate": 3.8626138544314165e-06, + "loss": 0.3304, + "step": 6580 + }, + { + "epoch": 1.847557551937114, + "grad_norm": 0.6579108238220215, + "learning_rate": 3.861023382353176e-06, + "loss": 0.3919, + "step": 6581 + }, + { + "epoch": 1.8478382930937676, + "grad_norm": 0.6709635853767395, + "learning_rate": 3.859433031819437e-06, + "loss": 0.392, + "step": 6582 + }, + { + "epoch": 1.8481190342504212, + "grad_norm": 0.6408994197845459, + "learning_rate": 3.857842802999913e-06, + "loss": 0.3889, + "step": 6583 + }, + { + "epoch": 1.8483997754070747, + "grad_norm": 0.6497156023979187, + "learning_rate": 3.856252696064302e-06, + "loss": 0.406, + "step": 6584 + }, + { + "epoch": 1.8486805165637281, + "grad_norm": 0.5648993849754333, + "learning_rate": 3.854662711182292e-06, + "loss": 0.3931, + "step": 6585 + }, + { + "epoch": 1.8489612577203818, + "grad_norm": 0.5973063111305237, + "learning_rate": 3.853072848523555e-06, + "loss": 0.3692, + "step": 6586 + }, + { + "epoch": 1.8492419988770354, + "grad_norm": 0.6557505130767822, + "learning_rate": 3.85148310825775e-06, + "loss": 0.3887, + "step": 6587 + }, + { + "epoch": 1.8495227400336889, + "grad_norm": 0.5865505933761597, + "learning_rate": 3.8498934905545254e-06, + "loss": 0.387, + "step": 6588 + }, + { + "epoch": 1.8498034811903425, + "grad_norm": 0.6405991911888123, + "learning_rate": 3.848303995583516e-06, + "loss": 0.3851, + "step": 6589 + }, + { + "epoch": 1.8500842223469962, + "grad_norm": 0.6908926963806152, + "learning_rate": 3.846714623514342e-06, + "loss": 0.3862, + "step": 6590 + }, + { + "epoch": 1.8503649635036497, + "grad_norm": 0.6192941665649414, + "learning_rate": 3.845125374516614e-06, + "loss": 0.3625, + "step": 6591 + }, + { + "epoch": 1.850645704660303, + "grad_norm": 0.6998831629753113, + "learning_rate": 3.8435362487599214e-06, + "loss": 0.3871, + "step": 6592 + }, + { + "epoch": 1.8509264458169568, + "grad_norm": 0.5711450576782227, + "learning_rate": 3.841947246413849e-06, + "loss": 0.3667, + "step": 6593 + }, + { + "epoch": 1.8512071869736104, + "grad_norm": 0.602340579032898, + "learning_rate": 3.840358367647966e-06, + "loss": 0.3868, + "step": 6594 + }, + { + "epoch": 1.8514879281302639, + "grad_norm": 0.6366960406303406, + "learning_rate": 3.838769612631826e-06, + "loss": 0.3544, + "step": 6595 + }, + { + "epoch": 1.8517686692869173, + "grad_norm": 0.6178829073905945, + "learning_rate": 3.837180981534972e-06, + "loss": 0.3735, + "step": 6596 + }, + { + "epoch": 1.8520494104435712, + "grad_norm": 0.6999847292900085, + "learning_rate": 3.835592474526934e-06, + "loss": 0.351, + "step": 6597 + }, + { + "epoch": 1.8523301516002246, + "grad_norm": 0.6134530305862427, + "learning_rate": 3.8340040917772245e-06, + "loss": 0.3829, + "step": 6598 + }, + { + "epoch": 1.852610892756878, + "grad_norm": 0.5505448579788208, + "learning_rate": 3.832415833455347e-06, + "loss": 0.3568, + "step": 6599 + }, + { + "epoch": 1.8528916339135317, + "grad_norm": 0.6162842512130737, + "learning_rate": 3.830827699730792e-06, + "loss": 0.367, + "step": 6600 + }, + { + "epoch": 1.8531723750701854, + "grad_norm": 0.6293978691101074, + "learning_rate": 3.829239690773033e-06, + "loss": 0.3829, + "step": 6601 + }, + { + "epoch": 1.8534531162268388, + "grad_norm": 0.545942485332489, + "learning_rate": 3.827651806751535e-06, + "loss": 0.3475, + "step": 6602 + }, + { + "epoch": 1.8537338573834923, + "grad_norm": 0.5861099362373352, + "learning_rate": 3.826064047835745e-06, + "loss": 0.4153, + "step": 6603 + }, + { + "epoch": 1.854014598540146, + "grad_norm": 0.585117518901825, + "learning_rate": 3.824476414195099e-06, + "loss": 0.388, + "step": 6604 + }, + { + "epoch": 1.8542953396967996, + "grad_norm": 0.7277740240097046, + "learning_rate": 3.822888905999021e-06, + "loss": 0.4142, + "step": 6605 + }, + { + "epoch": 1.854576080853453, + "grad_norm": 0.7299200296401978, + "learning_rate": 3.82130152341692e-06, + "loss": 0.3728, + "step": 6606 + }, + { + "epoch": 1.8548568220101067, + "grad_norm": 0.5398320555686951, + "learning_rate": 3.81971426661819e-06, + "loss": 0.3618, + "step": 6607 + }, + { + "epoch": 1.8551375631667604, + "grad_norm": 0.6225571632385254, + "learning_rate": 3.8181271357722135e-06, + "loss": 0.4139, + "step": 6608 + }, + { + "epoch": 1.8554183043234138, + "grad_norm": 0.5465289354324341, + "learning_rate": 3.8165401310483594e-06, + "loss": 0.3345, + "step": 6609 + }, + { + "epoch": 1.8556990454800673, + "grad_norm": 0.48412251472473145, + "learning_rate": 3.814953252615983e-06, + "loss": 0.3724, + "step": 6610 + }, + { + "epoch": 1.855979786636721, + "grad_norm": 0.6475601196289062, + "learning_rate": 3.813366500644426e-06, + "loss": 0.3863, + "step": 6611 + }, + { + "epoch": 1.8562605277933746, + "grad_norm": 0.5891608595848083, + "learning_rate": 3.8117798753030167e-06, + "loss": 0.3448, + "step": 6612 + }, + { + "epoch": 1.856541268950028, + "grad_norm": 0.6342847347259521, + "learning_rate": 3.8101933767610715e-06, + "loss": 0.4103, + "step": 6613 + }, + { + "epoch": 1.8568220101066817, + "grad_norm": 0.6834299564361572, + "learning_rate": 3.8086070051878894e-06, + "loss": 0.3786, + "step": 6614 + }, + { + "epoch": 1.8571027512633353, + "grad_norm": 0.6929450035095215, + "learning_rate": 3.8070207607527587e-06, + "loss": 0.3871, + "step": 6615 + }, + { + "epoch": 1.8573834924199888, + "grad_norm": 0.6023022532463074, + "learning_rate": 3.805434643624953e-06, + "loss": 0.3518, + "step": 6616 + }, + { + "epoch": 1.8576642335766422, + "grad_norm": 0.6579223871231079, + "learning_rate": 3.8038486539737348e-06, + "loss": 0.4233, + "step": 6617 + }, + { + "epoch": 1.857944974733296, + "grad_norm": 0.747478723526001, + "learning_rate": 3.802262791968349e-06, + "loss": 0.3782, + "step": 6618 + }, + { + "epoch": 1.8582257158899496, + "grad_norm": 0.6331729292869568, + "learning_rate": 3.8006770577780315e-06, + "loss": 0.3791, + "step": 6619 + }, + { + "epoch": 1.858506457046603, + "grad_norm": 0.720064640045166, + "learning_rate": 3.799091451571999e-06, + "loss": 0.3602, + "step": 6620 + }, + { + "epoch": 1.8587871982032564, + "grad_norm": 0.6993894577026367, + "learning_rate": 3.7975059735194587e-06, + "loss": 0.3247, + "step": 6621 + }, + { + "epoch": 1.8590679393599103, + "grad_norm": 0.6288126111030579, + "learning_rate": 3.7959206237896027e-06, + "loss": 0.4001, + "step": 6622 + }, + { + "epoch": 1.8593486805165638, + "grad_norm": 0.6296070218086243, + "learning_rate": 3.794335402551611e-06, + "loss": 0.4012, + "step": 6623 + }, + { + "epoch": 1.8596294216732172, + "grad_norm": 0.5715782642364502, + "learning_rate": 3.792750309974647e-06, + "loss": 0.3634, + "step": 6624 + }, + { + "epoch": 1.8599101628298709, + "grad_norm": 0.5731468796730042, + "learning_rate": 3.7911653462278634e-06, + "loss": 0.39, + "step": 6625 + }, + { + "epoch": 1.8601909039865245, + "grad_norm": 0.7277904748916626, + "learning_rate": 3.7895805114803962e-06, + "loss": 0.353, + "step": 6626 + }, + { + "epoch": 1.860471645143178, + "grad_norm": 0.6582594513893127, + "learning_rate": 3.78799580590137e-06, + "loss": 0.4148, + "step": 6627 + }, + { + "epoch": 1.8607523862998314, + "grad_norm": 0.6079365015029907, + "learning_rate": 3.7864112296598936e-06, + "loss": 0.4209, + "step": 6628 + }, + { + "epoch": 1.861033127456485, + "grad_norm": 0.6307871341705322, + "learning_rate": 3.7848267829250634e-06, + "loss": 0.3617, + "step": 6629 + }, + { + "epoch": 1.8613138686131387, + "grad_norm": 0.7056549787521362, + "learning_rate": 3.7832424658659635e-06, + "loss": 0.4091, + "step": 6630 + }, + { + "epoch": 1.8615946097697922, + "grad_norm": 0.5833138823509216, + "learning_rate": 3.7816582786516603e-06, + "loss": 0.3934, + "step": 6631 + }, + { + "epoch": 1.8618753509264458, + "grad_norm": 0.553715705871582, + "learning_rate": 3.78007422145121e-06, + "loss": 0.396, + "step": 6632 + }, + { + "epoch": 1.8621560920830995, + "grad_norm": 0.7350575923919678, + "learning_rate": 3.778490294433652e-06, + "loss": 0.3464, + "step": 6633 + }, + { + "epoch": 1.862436833239753, + "grad_norm": 0.673321008682251, + "learning_rate": 3.7769064977680153e-06, + "loss": 0.4206, + "step": 6634 + }, + { + "epoch": 1.8627175743964064, + "grad_norm": 0.6504115462303162, + "learning_rate": 3.7753228316233104e-06, + "loss": 0.3649, + "step": 6635 + }, + { + "epoch": 1.86299831555306, + "grad_norm": 0.7077226042747498, + "learning_rate": 3.7737392961685365e-06, + "loss": 0.3924, + "step": 6636 + }, + { + "epoch": 1.8632790567097137, + "grad_norm": 0.5814265608787537, + "learning_rate": 3.7721558915726797e-06, + "loss": 0.3773, + "step": 6637 + }, + { + "epoch": 1.8635597978663672, + "grad_norm": 0.6288542747497559, + "learning_rate": 3.7705726180047103e-06, + "loss": 0.3887, + "step": 6638 + }, + { + "epoch": 1.8638405390230208, + "grad_norm": 0.6334558129310608, + "learning_rate": 3.7689894756335864e-06, + "loss": 0.372, + "step": 6639 + }, + { + "epoch": 1.8641212801796745, + "grad_norm": 0.6678167581558228, + "learning_rate": 3.767406464628251e-06, + "loss": 0.3395, + "step": 6640 + }, + { + "epoch": 1.864402021336328, + "grad_norm": 0.6088495850563049, + "learning_rate": 3.765823585157632e-06, + "loss": 0.3579, + "step": 6641 + }, + { + "epoch": 1.8646827624929814, + "grad_norm": 0.6270818114280701, + "learning_rate": 3.7642408373906445e-06, + "loss": 0.37, + "step": 6642 + }, + { + "epoch": 1.864963503649635, + "grad_norm": 0.6393176317214966, + "learning_rate": 3.762658221496191e-06, + "loss": 0.3967, + "step": 6643 + }, + { + "epoch": 1.8652442448062887, + "grad_norm": 0.6403645277023315, + "learning_rate": 3.7610757376431575e-06, + "loss": 0.3932, + "step": 6644 + }, + { + "epoch": 1.8655249859629421, + "grad_norm": 0.6309423446655273, + "learning_rate": 3.759493386000417e-06, + "loss": 0.4042, + "step": 6645 + }, + { + "epoch": 1.8658057271195956, + "grad_norm": 0.6744917631149292, + "learning_rate": 3.7579111667368273e-06, + "loss": 0.4014, + "step": 6646 + }, + { + "epoch": 1.8660864682762492, + "grad_norm": 0.6249033808708191, + "learning_rate": 3.7563290800212355e-06, + "loss": 0.4366, + "step": 6647 + }, + { + "epoch": 1.866367209432903, + "grad_norm": 0.6633877754211426, + "learning_rate": 3.7547471260224695e-06, + "loss": 0.3755, + "step": 6648 + }, + { + "epoch": 1.8666479505895563, + "grad_norm": 0.6555549502372742, + "learning_rate": 3.753165304909346e-06, + "loss": 0.3902, + "step": 6649 + }, + { + "epoch": 1.86692869174621, + "grad_norm": 0.6909569501876831, + "learning_rate": 3.751583616850668e-06, + "loss": 0.381, + "step": 6650 + }, + { + "epoch": 1.8672094329028637, + "grad_norm": 0.5843143463134766, + "learning_rate": 3.7500020620152226e-06, + "loss": 0.3694, + "step": 6651 + }, + { + "epoch": 1.8674901740595171, + "grad_norm": 0.5654796957969666, + "learning_rate": 3.7484206405717844e-06, + "loss": 0.3714, + "step": 6652 + }, + { + "epoch": 1.8677709152161706, + "grad_norm": 0.6510779857635498, + "learning_rate": 3.7468393526891133e-06, + "loss": 0.4177, + "step": 6653 + }, + { + "epoch": 1.8680516563728242, + "grad_norm": 0.6091763973236084, + "learning_rate": 3.7452581985359505e-06, + "loss": 0.4191, + "step": 6654 + }, + { + "epoch": 1.8683323975294779, + "grad_norm": 0.6130009889602661, + "learning_rate": 3.7436771782810314e-06, + "loss": 0.3491, + "step": 6655 + }, + { + "epoch": 1.8686131386861313, + "grad_norm": 0.6302722692489624, + "learning_rate": 3.742096292093073e-06, + "loss": 0.36, + "step": 6656 + }, + { + "epoch": 1.868893879842785, + "grad_norm": 0.692513644695282, + "learning_rate": 3.740515540140775e-06, + "loss": 0.3739, + "step": 6657 + }, + { + "epoch": 1.8691746209994387, + "grad_norm": 0.6807888746261597, + "learning_rate": 3.7389349225928262e-06, + "loss": 0.4135, + "step": 6658 + }, + { + "epoch": 1.869455362156092, + "grad_norm": 0.6932981610298157, + "learning_rate": 3.737354439617901e-06, + "loss": 0.3831, + "step": 6659 + }, + { + "epoch": 1.8697361033127455, + "grad_norm": 0.6126266717910767, + "learning_rate": 3.7357740913846567e-06, + "loss": 0.376, + "step": 6660 + }, + { + "epoch": 1.8700168444693992, + "grad_norm": 0.6369067430496216, + "learning_rate": 3.7341938780617404e-06, + "loss": 0.4159, + "step": 6661 + }, + { + "epoch": 1.8702975856260529, + "grad_norm": 0.587691605091095, + "learning_rate": 3.732613799817783e-06, + "loss": 0.3612, + "step": 6662 + }, + { + "epoch": 1.8705783267827063, + "grad_norm": 0.6611619591712952, + "learning_rate": 3.7310338568213987e-06, + "loss": 0.382, + "step": 6663 + }, + { + "epoch": 1.87085906793936, + "grad_norm": 0.6849161982536316, + "learning_rate": 3.7294540492411898e-06, + "loss": 0.3478, + "step": 6664 + }, + { + "epoch": 1.8711398090960136, + "grad_norm": 0.6290566921234131, + "learning_rate": 3.7278743772457438e-06, + "loss": 0.4332, + "step": 6665 + }, + { + "epoch": 1.871420550252667, + "grad_norm": 0.6598408222198486, + "learning_rate": 3.726294841003633e-06, + "loss": 0.4205, + "step": 6666 + }, + { + "epoch": 1.8717012914093205, + "grad_norm": 0.6161982417106628, + "learning_rate": 3.724715440683417e-06, + "loss": 0.4043, + "step": 6667 + }, + { + "epoch": 1.8719820325659742, + "grad_norm": 0.6266809701919556, + "learning_rate": 3.723136176453639e-06, + "loss": 0.4025, + "step": 6668 + }, + { + "epoch": 1.8722627737226278, + "grad_norm": 0.6120468974113464, + "learning_rate": 3.721557048482827e-06, + "loss": 0.3845, + "step": 6669 + }, + { + "epoch": 1.8725435148792813, + "grad_norm": 0.6067700982093811, + "learning_rate": 3.719978056939497e-06, + "loss": 0.3945, + "step": 6670 + }, + { + "epoch": 1.8728242560359347, + "grad_norm": 0.6045942902565002, + "learning_rate": 3.718399201992149e-06, + "loss": 0.3828, + "step": 6671 + }, + { + "epoch": 1.8731049971925884, + "grad_norm": 0.6053402423858643, + "learning_rate": 3.716820483809268e-06, + "loss": 0.3514, + "step": 6672 + }, + { + "epoch": 1.873385738349242, + "grad_norm": 0.5942814350128174, + "learning_rate": 3.7152419025593257e-06, + "loss": 0.4029, + "step": 6673 + }, + { + "epoch": 1.8736664795058955, + "grad_norm": 0.5933666229248047, + "learning_rate": 3.7136634584107787e-06, + "loss": 0.3755, + "step": 6674 + }, + { + "epoch": 1.8739472206625492, + "grad_norm": 0.6381656527519226, + "learning_rate": 3.7120851515320676e-06, + "loss": 0.3525, + "step": 6675 + }, + { + "epoch": 1.8742279618192028, + "grad_norm": 0.7048853635787964, + "learning_rate": 3.7105069820916193e-06, + "loss": 0.3558, + "step": 6676 + }, + { + "epoch": 1.8745087029758563, + "grad_norm": 0.7625410556793213, + "learning_rate": 3.7089289502578486e-06, + "loss": 0.3953, + "step": 6677 + }, + { + "epoch": 1.8747894441325097, + "grad_norm": 0.686286985874176, + "learning_rate": 3.7073510561991503e-06, + "loss": 0.3895, + "step": 6678 + }, + { + "epoch": 1.8750701852891634, + "grad_norm": 0.6566884517669678, + "learning_rate": 3.7057733000839086e-06, + "loss": 0.4032, + "step": 6679 + }, + { + "epoch": 1.875350926445817, + "grad_norm": 0.5950343012809753, + "learning_rate": 3.7041956820804925e-06, + "loss": 0.3832, + "step": 6680 + }, + { + "epoch": 1.8756316676024705, + "grad_norm": 0.49978721141815186, + "learning_rate": 3.7026182023572553e-06, + "loss": 0.3465, + "step": 6681 + }, + { + "epoch": 1.8759124087591241, + "grad_norm": 0.6579840183258057, + "learning_rate": 3.701040861082536e-06, + "loss": 0.3377, + "step": 6682 + }, + { + "epoch": 1.8761931499157778, + "grad_norm": 0.5922228097915649, + "learning_rate": 3.699463658424658e-06, + "loss": 0.415, + "step": 6683 + }, + { + "epoch": 1.8764738910724312, + "grad_norm": 0.6216049194335938, + "learning_rate": 3.6978865945519327e-06, + "loss": 0.3986, + "step": 6684 + }, + { + "epoch": 1.8767546322290847, + "grad_norm": 0.6333596110343933, + "learning_rate": 3.6963096696326505e-06, + "loss": 0.3729, + "step": 6685 + }, + { + "epoch": 1.8770353733857383, + "grad_norm": 0.7083112597465515, + "learning_rate": 3.694732883835094e-06, + "loss": 0.3858, + "step": 6686 + }, + { + "epoch": 1.877316114542392, + "grad_norm": 0.6354517936706543, + "learning_rate": 3.6931562373275265e-06, + "loss": 0.3647, + "step": 6687 + }, + { + "epoch": 1.8775968556990454, + "grad_norm": 0.6213771104812622, + "learning_rate": 3.6915797302782e-06, + "loss": 0.3857, + "step": 6688 + }, + { + "epoch": 1.8778775968556989, + "grad_norm": 0.6547612547874451, + "learning_rate": 3.6900033628553465e-06, + "loss": 0.3593, + "step": 6689 + }, + { + "epoch": 1.8781583380123528, + "grad_norm": 0.5587164163589478, + "learning_rate": 3.6884271352271895e-06, + "loss": 0.3583, + "step": 6690 + }, + { + "epoch": 1.8784390791690062, + "grad_norm": 0.6224437952041626, + "learning_rate": 3.6868510475619316e-06, + "loss": 0.3996, + "step": 6691 + }, + { + "epoch": 1.8787198203256597, + "grad_norm": 0.6226052045822144, + "learning_rate": 3.685275100027764e-06, + "loss": 0.3561, + "step": 6692 + }, + { + "epoch": 1.8790005614823133, + "grad_norm": 0.5926626920700073, + "learning_rate": 3.6836992927928618e-06, + "loss": 0.3934, + "step": 6693 + }, + { + "epoch": 1.879281302638967, + "grad_norm": 0.7276115417480469, + "learning_rate": 3.682123626025386e-06, + "loss": 0.3674, + "step": 6694 + }, + { + "epoch": 1.8795620437956204, + "grad_norm": 0.5634215474128723, + "learning_rate": 3.6805480998934807e-06, + "loss": 0.4247, + "step": 6695 + }, + { + "epoch": 1.8798427849522739, + "grad_norm": 0.5653231143951416, + "learning_rate": 3.6789727145652786e-06, + "loss": 0.3882, + "step": 6696 + }, + { + "epoch": 1.8801235261089275, + "grad_norm": 0.5527107119560242, + "learning_rate": 3.677397470208892e-06, + "loss": 0.3763, + "step": 6697 + }, + { + "epoch": 1.8804042672655812, + "grad_norm": 0.5930175185203552, + "learning_rate": 3.6758223669924232e-06, + "loss": 0.3605, + "step": 6698 + }, + { + "epoch": 1.8806850084222346, + "grad_norm": 0.6014449596405029, + "learning_rate": 3.6742474050839573e-06, + "loss": 0.3915, + "step": 6699 + }, + { + "epoch": 1.8809657495788883, + "grad_norm": 0.642204999923706, + "learning_rate": 3.6726725846515632e-06, + "loss": 0.4136, + "step": 6700 + }, + { + "epoch": 1.881246490735542, + "grad_norm": 0.5835151672363281, + "learning_rate": 3.6710979058632966e-06, + "loss": 0.3806, + "step": 6701 + }, + { + "epoch": 1.8815272318921954, + "grad_norm": 0.6496613025665283, + "learning_rate": 3.669523368887199e-06, + "loss": 0.3884, + "step": 6702 + }, + { + "epoch": 1.8818079730488488, + "grad_norm": 0.6161884665489197, + "learning_rate": 3.667948973891293e-06, + "loss": 0.3931, + "step": 6703 + }, + { + "epoch": 1.8820887142055025, + "grad_norm": 0.5649263858795166, + "learning_rate": 3.6663747210435886e-06, + "loss": 0.3605, + "step": 6704 + }, + { + "epoch": 1.8823694553621562, + "grad_norm": 0.6075252890586853, + "learning_rate": 3.6648006105120796e-06, + "loss": 0.4164, + "step": 6705 + }, + { + "epoch": 1.8826501965188096, + "grad_norm": 0.6512606143951416, + "learning_rate": 3.6632266424647477e-06, + "loss": 0.3717, + "step": 6706 + }, + { + "epoch": 1.8829309376754633, + "grad_norm": 0.6432591080665588, + "learning_rate": 3.661652817069556e-06, + "loss": 0.3738, + "step": 6707 + }, + { + "epoch": 1.883211678832117, + "grad_norm": 0.5726814270019531, + "learning_rate": 3.6600791344944523e-06, + "loss": 0.3637, + "step": 6708 + }, + { + "epoch": 1.8834924199887704, + "grad_norm": 0.6158642172813416, + "learning_rate": 3.6585055949073717e-06, + "loss": 0.3511, + "step": 6709 + }, + { + "epoch": 1.8837731611454238, + "grad_norm": 0.6513944864273071, + "learning_rate": 3.6569321984762314e-06, + "loss": 0.3551, + "step": 6710 + }, + { + "epoch": 1.8840539023020775, + "grad_norm": 0.64405757188797, + "learning_rate": 3.655358945368936e-06, + "loss": 0.3637, + "step": 6711 + }, + { + "epoch": 1.8843346434587311, + "grad_norm": 0.6633872389793396, + "learning_rate": 3.6537858357533706e-06, + "loss": 0.3761, + "step": 6712 + }, + { + "epoch": 1.8846153846153846, + "grad_norm": 0.6905936002731323, + "learning_rate": 3.6522128697974103e-06, + "loss": 0.3708, + "step": 6713 + }, + { + "epoch": 1.884896125772038, + "grad_norm": 0.6328822374343872, + "learning_rate": 3.6506400476689107e-06, + "loss": 0.3908, + "step": 6714 + }, + { + "epoch": 1.885176866928692, + "grad_norm": 0.654472291469574, + "learning_rate": 3.6490673695357136e-06, + "loss": 0.374, + "step": 6715 + }, + { + "epoch": 1.8854576080853453, + "grad_norm": 0.6299577355384827, + "learning_rate": 3.6474948355656463e-06, + "loss": 0.3689, + "step": 6716 + }, + { + "epoch": 1.8857383492419988, + "grad_norm": 0.7236530780792236, + "learning_rate": 3.6459224459265207e-06, + "loss": 0.3584, + "step": 6717 + }, + { + "epoch": 1.8860190903986525, + "grad_norm": 0.6885652542114258, + "learning_rate": 3.64435020078613e-06, + "loss": 0.4165, + "step": 6718 + }, + { + "epoch": 1.8862998315553061, + "grad_norm": 0.5869594812393188, + "learning_rate": 3.642778100312256e-06, + "loss": 0.3821, + "step": 6719 + }, + { + "epoch": 1.8865805727119596, + "grad_norm": 0.6333834528923035, + "learning_rate": 3.6412061446726626e-06, + "loss": 0.4168, + "step": 6720 + }, + { + "epoch": 1.886861313868613, + "grad_norm": 0.6479399800300598, + "learning_rate": 3.6396343340351003e-06, + "loss": 0.3861, + "step": 6721 + }, + { + "epoch": 1.8871420550252667, + "grad_norm": 0.7683178782463074, + "learning_rate": 3.6380626685673016e-06, + "loss": 0.4155, + "step": 6722 + }, + { + "epoch": 1.8874227961819203, + "grad_norm": 0.6431917548179626, + "learning_rate": 3.6364911484369867e-06, + "loss": 0.3798, + "step": 6723 + }, + { + "epoch": 1.8877035373385738, + "grad_norm": 0.6315298080444336, + "learning_rate": 3.6349197738118567e-06, + "loss": 0.3678, + "step": 6724 + }, + { + "epoch": 1.8879842784952274, + "grad_norm": 0.6385400891304016, + "learning_rate": 3.6333485448595994e-06, + "loss": 0.3522, + "step": 6725 + }, + { + "epoch": 1.888265019651881, + "grad_norm": 0.6229247450828552, + "learning_rate": 3.631777461747887e-06, + "loss": 0.3636, + "step": 6726 + }, + { + "epoch": 1.8885457608085345, + "grad_norm": 0.615041196346283, + "learning_rate": 3.630206524644375e-06, + "loss": 0.3971, + "step": 6727 + }, + { + "epoch": 1.888826501965188, + "grad_norm": 0.6098858714103699, + "learning_rate": 3.6286357337167044e-06, + "loss": 0.3832, + "step": 6728 + }, + { + "epoch": 1.8891072431218416, + "grad_norm": 0.6216406226158142, + "learning_rate": 3.627065089132502e-06, + "loss": 0.3711, + "step": 6729 + }, + { + "epoch": 1.8893879842784953, + "grad_norm": 0.5764333009719849, + "learning_rate": 3.625494591059372e-06, + "loss": 0.3994, + "step": 6730 + }, + { + "epoch": 1.8896687254351487, + "grad_norm": 0.6698734164237976, + "learning_rate": 3.623924239664914e-06, + "loss": 0.379, + "step": 6731 + }, + { + "epoch": 1.8899494665918024, + "grad_norm": 0.6468973755836487, + "learning_rate": 3.6223540351167043e-06, + "loss": 0.3849, + "step": 6732 + }, + { + "epoch": 1.890230207748456, + "grad_norm": 0.578467071056366, + "learning_rate": 3.620783977582305e-06, + "loss": 0.3756, + "step": 6733 + }, + { + "epoch": 1.8905109489051095, + "grad_norm": 0.6839062571525574, + "learning_rate": 3.6192140672292625e-06, + "loss": 0.4014, + "step": 6734 + }, + { + "epoch": 1.890791690061763, + "grad_norm": 0.664242148399353, + "learning_rate": 3.6176443042251084e-06, + "loss": 0.4039, + "step": 6735 + }, + { + "epoch": 1.8910724312184166, + "grad_norm": 0.5890271663665771, + "learning_rate": 3.6160746887373575e-06, + "loss": 0.366, + "step": 6736 + }, + { + "epoch": 1.8913531723750703, + "grad_norm": 0.6562938094139099, + "learning_rate": 3.6145052209335097e-06, + "loss": 0.3471, + "step": 6737 + }, + { + "epoch": 1.8916339135317237, + "grad_norm": 0.5657323598861694, + "learning_rate": 3.6129359009810488e-06, + "loss": 0.3807, + "step": 6738 + }, + { + "epoch": 1.8919146546883772, + "grad_norm": 0.706623375415802, + "learning_rate": 3.611366729047444e-06, + "loss": 0.4152, + "step": 6739 + }, + { + "epoch": 1.892195395845031, + "grad_norm": 0.6669846773147583, + "learning_rate": 3.609797705300146e-06, + "loss": 0.375, + "step": 6740 + }, + { + "epoch": 1.8924761370016845, + "grad_norm": 0.6773914694786072, + "learning_rate": 3.6082288299065915e-06, + "loss": 0.3538, + "step": 6741 + }, + { + "epoch": 1.892756878158338, + "grad_norm": 0.6867550015449524, + "learning_rate": 3.6066601030342014e-06, + "loss": 0.405, + "step": 6742 + }, + { + "epoch": 1.8930376193149916, + "grad_norm": 0.5890536904335022, + "learning_rate": 3.60509152485038e-06, + "loss": 0.3521, + "step": 6743 + }, + { + "epoch": 1.8933183604716453, + "grad_norm": 0.6439083814620972, + "learning_rate": 3.6035230955225176e-06, + "loss": 0.4042, + "step": 6744 + }, + { + "epoch": 1.8935991016282987, + "grad_norm": 0.649586021900177, + "learning_rate": 3.6019548152179874e-06, + "loss": 0.4085, + "step": 6745 + }, + { + "epoch": 1.8938798427849521, + "grad_norm": 0.7091128826141357, + "learning_rate": 3.6003866841041434e-06, + "loss": 0.384, + "step": 6746 + }, + { + "epoch": 1.8941605839416058, + "grad_norm": 0.6923140287399292, + "learning_rate": 3.5988187023483296e-06, + "loss": 0.3471, + "step": 6747 + }, + { + "epoch": 1.8944413250982595, + "grad_norm": 0.6134341359138489, + "learning_rate": 3.59725087011787e-06, + "loss": 0.3534, + "step": 6748 + }, + { + "epoch": 1.894722066254913, + "grad_norm": 0.6707556247711182, + "learning_rate": 3.5956831875800747e-06, + "loss": 0.4049, + "step": 6749 + }, + { + "epoch": 1.8950028074115666, + "grad_norm": 0.5964996218681335, + "learning_rate": 3.5941156549022373e-06, + "loss": 0.4019, + "step": 6750 + }, + { + "epoch": 1.8952835485682202, + "grad_norm": 0.6699771881103516, + "learning_rate": 3.5925482722516345e-06, + "loss": 0.3981, + "step": 6751 + }, + { + "epoch": 1.8955642897248737, + "grad_norm": 0.5670035481452942, + "learning_rate": 3.590981039795528e-06, + "loss": 0.3561, + "step": 6752 + }, + { + "epoch": 1.8958450308815271, + "grad_norm": 0.6419764161109924, + "learning_rate": 3.589413957701162e-06, + "loss": 0.402, + "step": 6753 + }, + { + "epoch": 1.8961257720381808, + "grad_norm": 0.6489019393920898, + "learning_rate": 3.5878470261357666e-06, + "loss": 0.3832, + "step": 6754 + }, + { + "epoch": 1.8964065131948344, + "grad_norm": 0.7206984162330627, + "learning_rate": 3.586280245266555e-06, + "loss": 0.3905, + "step": 6755 + }, + { + "epoch": 1.8966872543514879, + "grad_norm": 0.5839571356773376, + "learning_rate": 3.584713615260723e-06, + "loss": 0.3853, + "step": 6756 + }, + { + "epoch": 1.8969679955081415, + "grad_norm": 0.6096206903457642, + "learning_rate": 3.5831471362854547e-06, + "loss": 0.3766, + "step": 6757 + }, + { + "epoch": 1.8972487366647952, + "grad_norm": 0.6558371186256409, + "learning_rate": 3.5815808085079127e-06, + "loss": 0.4206, + "step": 6758 + }, + { + "epoch": 1.8975294778214487, + "grad_norm": 0.5894262790679932, + "learning_rate": 3.5800146320952465e-06, + "loss": 0.3434, + "step": 6759 + }, + { + "epoch": 1.897810218978102, + "grad_norm": 0.5367259383201599, + "learning_rate": 3.578448607214588e-06, + "loss": 0.388, + "step": 6760 + }, + { + "epoch": 1.8980909601347558, + "grad_norm": 0.6186708807945251, + "learning_rate": 3.5768827340330557e-06, + "loss": 0.3534, + "step": 6761 + }, + { + "epoch": 1.8983717012914094, + "grad_norm": 0.5753026604652405, + "learning_rate": 3.5753170127177467e-06, + "loss": 0.3969, + "step": 6762 + }, + { + "epoch": 1.8986524424480629, + "grad_norm": 0.6549876928329468, + "learning_rate": 3.573751443435747e-06, + "loss": 0.4152, + "step": 6763 + }, + { + "epoch": 1.8989331836047163, + "grad_norm": 0.5275825262069702, + "learning_rate": 3.5721860263541235e-06, + "loss": 0.4088, + "step": 6764 + }, + { + "epoch": 1.89921392476137, + "grad_norm": 0.5949759483337402, + "learning_rate": 3.5706207616399287e-06, + "loss": 0.3474, + "step": 6765 + }, + { + "epoch": 1.8994946659180236, + "grad_norm": 0.5990729928016663, + "learning_rate": 3.569055649460197e-06, + "loss": 0.3525, + "step": 6766 + }, + { + "epoch": 1.899775407074677, + "grad_norm": 0.6396803855895996, + "learning_rate": 3.5674906899819494e-06, + "loss": 0.4261, + "step": 6767 + }, + { + "epoch": 1.9000561482313307, + "grad_norm": 0.5973544120788574, + "learning_rate": 3.5659258833721867e-06, + "loss": 0.3901, + "step": 6768 + }, + { + "epoch": 1.9003368893879844, + "grad_norm": 0.6160799264907837, + "learning_rate": 3.564361229797895e-06, + "loss": 0.3694, + "step": 6769 + }, + { + "epoch": 1.9006176305446378, + "grad_norm": 0.6115102171897888, + "learning_rate": 3.562796729426045e-06, + "loss": 0.3848, + "step": 6770 + }, + { + "epoch": 1.9008983717012913, + "grad_norm": 0.6193190813064575, + "learning_rate": 3.5612323824235913e-06, + "loss": 0.3426, + "step": 6771 + }, + { + "epoch": 1.901179112857945, + "grad_norm": 0.6612417697906494, + "learning_rate": 3.55966818895747e-06, + "loss": 0.3894, + "step": 6772 + }, + { + "epoch": 1.9014598540145986, + "grad_norm": 0.6099318861961365, + "learning_rate": 3.5581041491946045e-06, + "loss": 0.3654, + "step": 6773 + }, + { + "epoch": 1.901740595171252, + "grad_norm": 0.6059949994087219, + "learning_rate": 3.5565402633018963e-06, + "loss": 0.3885, + "step": 6774 + }, + { + "epoch": 1.9020213363279057, + "grad_norm": 0.6448893547058105, + "learning_rate": 3.5549765314462347e-06, + "loss": 0.3997, + "step": 6775 + }, + { + "epoch": 1.9023020774845594, + "grad_norm": 0.5469254851341248, + "learning_rate": 3.5534129537944915e-06, + "loss": 0.3648, + "step": 6776 + }, + { + "epoch": 1.9025828186412128, + "grad_norm": 0.6060861349105835, + "learning_rate": 3.5518495305135225e-06, + "loss": 0.3806, + "step": 6777 + }, + { + "epoch": 1.9028635597978663, + "grad_norm": 0.5298658013343811, + "learning_rate": 3.550286261770166e-06, + "loss": 0.3592, + "step": 6778 + }, + { + "epoch": 1.90314430095452, + "grad_norm": 0.5830897092819214, + "learning_rate": 3.5487231477312463e-06, + "loss": 0.3884, + "step": 6779 + }, + { + "epoch": 1.9034250421111736, + "grad_norm": 0.591823160648346, + "learning_rate": 3.5471601885635654e-06, + "loss": 0.3819, + "step": 6780 + }, + { + "epoch": 1.903705783267827, + "grad_norm": 0.6853220462799072, + "learning_rate": 3.545597384433913e-06, + "loss": 0.368, + "step": 6781 + }, + { + "epoch": 1.9039865244244805, + "grad_norm": 0.6024075746536255, + "learning_rate": 3.5440347355090666e-06, + "loss": 0.3904, + "step": 6782 + }, + { + "epoch": 1.9042672655811343, + "grad_norm": 0.6517391204833984, + "learning_rate": 3.542472241955778e-06, + "loss": 0.3402, + "step": 6783 + }, + { + "epoch": 1.9045480067377878, + "grad_norm": 0.6168352365493774, + "learning_rate": 3.5409099039407867e-06, + "loss": 0.3997, + "step": 6784 + }, + { + "epoch": 1.9048287478944412, + "grad_norm": 0.6700246930122375, + "learning_rate": 3.539347721630818e-06, + "loss": 0.3822, + "step": 6785 + }, + { + "epoch": 1.905109489051095, + "grad_norm": 0.6758705973625183, + "learning_rate": 3.537785695192578e-06, + "loss": 0.4051, + "step": 6786 + }, + { + "epoch": 1.9053902302077486, + "grad_norm": 0.5712435245513916, + "learning_rate": 3.536223824792755e-06, + "loss": 0.405, + "step": 6787 + }, + { + "epoch": 1.905670971364402, + "grad_norm": 0.6648273468017578, + "learning_rate": 3.5346621105980237e-06, + "loss": 0.3793, + "step": 6788 + }, + { + "epoch": 1.9059517125210554, + "grad_norm": 0.6850698590278625, + "learning_rate": 3.5331005527750385e-06, + "loss": 0.3817, + "step": 6789 + }, + { + "epoch": 1.906232453677709, + "grad_norm": 0.6969839930534363, + "learning_rate": 3.5315391514904408e-06, + "loss": 0.3413, + "step": 6790 + }, + { + "epoch": 1.9065131948343628, + "grad_norm": 0.601905107498169, + "learning_rate": 3.5299779069108524e-06, + "loss": 0.3445, + "step": 6791 + }, + { + "epoch": 1.9067939359910162, + "grad_norm": 0.6207190155982971, + "learning_rate": 3.528416819202881e-06, + "loss": 0.3744, + "step": 6792 + }, + { + "epoch": 1.9070746771476699, + "grad_norm": 0.6887391805648804, + "learning_rate": 3.526855888533115e-06, + "loss": 0.3885, + "step": 6793 + }, + { + "epoch": 1.9073554183043235, + "grad_norm": 0.7935128808021545, + "learning_rate": 3.525295115068129e-06, + "loss": 0.3879, + "step": 6794 + }, + { + "epoch": 1.907636159460977, + "grad_norm": 0.5594736933708191, + "learning_rate": 3.5237344989744765e-06, + "loss": 0.3921, + "step": 6795 + }, + { + "epoch": 1.9079169006176304, + "grad_norm": 0.5839893817901611, + "learning_rate": 3.5221740404186983e-06, + "loss": 0.3995, + "step": 6796 + }, + { + "epoch": 1.908197641774284, + "grad_norm": 0.6778796315193176, + "learning_rate": 3.520613739567316e-06, + "loss": 0.366, + "step": 6797 + }, + { + "epoch": 1.9084783829309377, + "grad_norm": 0.673738420009613, + "learning_rate": 3.519053596586836e-06, + "loss": 0.4061, + "step": 6798 + }, + { + "epoch": 1.9087591240875912, + "grad_norm": 0.6413092613220215, + "learning_rate": 3.5174936116437467e-06, + "loss": 0.3779, + "step": 6799 + }, + { + "epoch": 1.9090398652442448, + "grad_norm": 0.6005680561065674, + "learning_rate": 3.5159337849045217e-06, + "loss": 0.3613, + "step": 6800 + }, + { + "epoch": 1.9093206064008985, + "grad_norm": 0.6098957061767578, + "learning_rate": 3.5143741165356127e-06, + "loss": 0.3758, + "step": 6801 + }, + { + "epoch": 1.909601347557552, + "grad_norm": 0.7080655097961426, + "learning_rate": 3.5128146067034595e-06, + "loss": 0.3731, + "step": 6802 + }, + { + "epoch": 1.9098820887142054, + "grad_norm": 0.5995592474937439, + "learning_rate": 3.5112552555744837e-06, + "loss": 0.3948, + "step": 6803 + }, + { + "epoch": 1.910162829870859, + "grad_norm": 0.5612696409225464, + "learning_rate": 3.509696063315089e-06, + "loss": 0.3722, + "step": 6804 + }, + { + "epoch": 1.9104435710275127, + "grad_norm": 0.6568280458450317, + "learning_rate": 3.5081370300916623e-06, + "loss": 0.3355, + "step": 6805 + }, + { + "epoch": 1.9107243121841662, + "grad_norm": 0.6519230008125305, + "learning_rate": 3.506578156070576e-06, + "loss": 0.3889, + "step": 6806 + }, + { + "epoch": 1.9110050533408196, + "grad_norm": 0.6857989430427551, + "learning_rate": 3.505019441418178e-06, + "loss": 0.4229, + "step": 6807 + }, + { + "epoch": 1.9112857944974735, + "grad_norm": 0.646470844745636, + "learning_rate": 3.5034608863008117e-06, + "loss": 0.3649, + "step": 6808 + }, + { + "epoch": 1.911566535654127, + "grad_norm": 0.570564329624176, + "learning_rate": 3.501902490884793e-06, + "loss": 0.352, + "step": 6809 + }, + { + "epoch": 1.9118472768107804, + "grad_norm": 0.5701771378517151, + "learning_rate": 3.5003442553364253e-06, + "loss": 0.3734, + "step": 6810 + }, + { + "epoch": 1.912128017967434, + "grad_norm": 0.6223947405815125, + "learning_rate": 3.498786179821992e-06, + "loss": 0.4247, + "step": 6811 + }, + { + "epoch": 1.9124087591240877, + "grad_norm": 0.6876004338264465, + "learning_rate": 3.4972282645077617e-06, + "loss": 0.3813, + "step": 6812 + }, + { + "epoch": 1.9126895002807411, + "grad_norm": 0.6655701398849487, + "learning_rate": 3.4956705095599865e-06, + "loss": 0.369, + "step": 6813 + }, + { + "epoch": 1.9129702414373946, + "grad_norm": 0.6083945035934448, + "learning_rate": 3.4941129151448995e-06, + "loss": 0.3866, + "step": 6814 + }, + { + "epoch": 1.9132509825940482, + "grad_norm": 0.6104804277420044, + "learning_rate": 3.4925554814287177e-06, + "loss": 0.3397, + "step": 6815 + }, + { + "epoch": 1.913531723750702, + "grad_norm": 0.698229968547821, + "learning_rate": 3.4909982085776417e-06, + "loss": 0.365, + "step": 6816 + }, + { + "epoch": 1.9138124649073553, + "grad_norm": 0.6584321856498718, + "learning_rate": 3.489441096757852e-06, + "loss": 0.3846, + "step": 6817 + }, + { + "epoch": 1.914093206064009, + "grad_norm": 0.6015378832817078, + "learning_rate": 3.4878841461355147e-06, + "loss": 0.3417, + "step": 6818 + }, + { + "epoch": 1.9143739472206627, + "grad_norm": 0.6599941253662109, + "learning_rate": 3.4863273568767787e-06, + "loss": 0.3802, + "step": 6819 + }, + { + "epoch": 1.9146546883773161, + "grad_norm": 0.688912034034729, + "learning_rate": 3.4847707291477735e-06, + "loss": 0.3931, + "step": 6820 + }, + { + "epoch": 1.9149354295339696, + "grad_norm": 0.6507009863853455, + "learning_rate": 3.483214263114614e-06, + "loss": 0.3508, + "step": 6821 + }, + { + "epoch": 1.9152161706906232, + "grad_norm": 0.5559006929397583, + "learning_rate": 3.4816579589433967e-06, + "loss": 0.4068, + "step": 6822 + }, + { + "epoch": 1.9154969118472769, + "grad_norm": 0.6691533327102661, + "learning_rate": 3.4801018168001994e-06, + "loss": 0.4489, + "step": 6823 + }, + { + "epoch": 1.9157776530039303, + "grad_norm": 0.6477255821228027, + "learning_rate": 3.4785458368510844e-06, + "loss": 0.3773, + "step": 6824 + }, + { + "epoch": 1.916058394160584, + "grad_norm": 0.6950054168701172, + "learning_rate": 3.4769900192620964e-06, + "loss": 0.3832, + "step": 6825 + }, + { + "epoch": 1.9163391353172377, + "grad_norm": 0.7205079197883606, + "learning_rate": 3.4754343641992627e-06, + "loss": 0.3661, + "step": 6826 + }, + { + "epoch": 1.916619876473891, + "grad_norm": 0.6157258152961731, + "learning_rate": 3.473878871828593e-06, + "loss": 0.3646, + "step": 6827 + }, + { + "epoch": 1.9169006176305445, + "grad_norm": 0.586892306804657, + "learning_rate": 3.4723235423160808e-06, + "loss": 0.3707, + "step": 6828 + }, + { + "epoch": 1.9171813587871982, + "grad_norm": 0.7581172585487366, + "learning_rate": 3.470768375827699e-06, + "loss": 0.4054, + "step": 6829 + }, + { + "epoch": 1.9174620999438519, + "grad_norm": 0.61969393491745, + "learning_rate": 3.4692133725294066e-06, + "loss": 0.4026, + "step": 6830 + }, + { + "epoch": 1.9177428411005053, + "grad_norm": 0.7075392007827759, + "learning_rate": 3.4676585325871435e-06, + "loss": 0.4167, + "step": 6831 + }, + { + "epoch": 1.9180235822571587, + "grad_norm": 0.655890703201294, + "learning_rate": 3.4661038561668326e-06, + "loss": 0.3419, + "step": 6832 + }, + { + "epoch": 1.9183043234138126, + "grad_norm": 0.6394795775413513, + "learning_rate": 3.4645493434343797e-06, + "loss": 0.3659, + "step": 6833 + }, + { + "epoch": 1.918585064570466, + "grad_norm": 0.663291871547699, + "learning_rate": 3.462994994555673e-06, + "loss": 0.3793, + "step": 6834 + }, + { + "epoch": 1.9188658057271195, + "grad_norm": 0.736150324344635, + "learning_rate": 3.4614408096965822e-06, + "loss": 0.3919, + "step": 6835 + }, + { + "epoch": 1.9191465468837732, + "grad_norm": 0.6177143454551697, + "learning_rate": 3.4598867890229605e-06, + "loss": 0.3707, + "step": 6836 + }, + { + "epoch": 1.9194272880404268, + "grad_norm": 0.5929267406463623, + "learning_rate": 3.4583329327006445e-06, + "loss": 0.4282, + "step": 6837 + }, + { + "epoch": 1.9197080291970803, + "grad_norm": 0.6003937721252441, + "learning_rate": 3.45677924089545e-06, + "loss": 0.3754, + "step": 6838 + }, + { + "epoch": 1.9199887703537337, + "grad_norm": 0.6654490232467651, + "learning_rate": 3.455225713773178e-06, + "loss": 0.377, + "step": 6839 + }, + { + "epoch": 1.9202695115103874, + "grad_norm": 0.7084203958511353, + "learning_rate": 3.453672351499611e-06, + "loss": 0.4031, + "step": 6840 + }, + { + "epoch": 1.920550252667041, + "grad_norm": 0.6988268494606018, + "learning_rate": 3.452119154240515e-06, + "loss": 0.374, + "step": 6841 + }, + { + "epoch": 1.9208309938236945, + "grad_norm": 0.6115239262580872, + "learning_rate": 3.4505661221616382e-06, + "loss": 0.3694, + "step": 6842 + }, + { + "epoch": 1.9211117349803482, + "grad_norm": 0.6088030934333801, + "learning_rate": 3.44901325542871e-06, + "loss": 0.3848, + "step": 6843 + }, + { + "epoch": 1.9213924761370018, + "grad_norm": 0.7091503739356995, + "learning_rate": 3.447460554207441e-06, + "loss": 0.3365, + "step": 6844 + }, + { + "epoch": 1.9216732172936553, + "grad_norm": 0.6756137609481812, + "learning_rate": 3.4459080186635275e-06, + "loss": 0.4009, + "step": 6845 + }, + { + "epoch": 1.9219539584503087, + "grad_norm": 0.6201615333557129, + "learning_rate": 3.444355648962645e-06, + "loss": 0.3667, + "step": 6846 + }, + { + "epoch": 1.9222346996069624, + "grad_norm": 0.6235066056251526, + "learning_rate": 3.4428034452704546e-06, + "loss": 0.4193, + "step": 6847 + }, + { + "epoch": 1.922515440763616, + "grad_norm": 0.6128908395767212, + "learning_rate": 3.4412514077525964e-06, + "loss": 0.397, + "step": 6848 + }, + { + "epoch": 1.9227961819202695, + "grad_norm": 0.6845629811286926, + "learning_rate": 3.4396995365746965e-06, + "loss": 0.3652, + "step": 6849 + }, + { + "epoch": 1.9230769230769231, + "grad_norm": 0.5951507091522217, + "learning_rate": 3.4381478319023575e-06, + "loss": 0.3886, + "step": 6850 + }, + { + "epoch": 1.9233576642335768, + "grad_norm": 0.7217908501625061, + "learning_rate": 3.43659629390117e-06, + "loss": 0.4387, + "step": 6851 + }, + { + "epoch": 1.9236384053902302, + "grad_norm": 0.662869393825531, + "learning_rate": 3.4350449227367034e-06, + "loss": 0.3805, + "step": 6852 + }, + { + "epoch": 1.9239191465468837, + "grad_norm": 0.6102414727210999, + "learning_rate": 3.4334937185745104e-06, + "loss": 0.3737, + "step": 6853 + }, + { + "epoch": 1.9241998877035373, + "grad_norm": 0.6550286412239075, + "learning_rate": 3.431942681580127e-06, + "loss": 0.3728, + "step": 6854 + }, + { + "epoch": 1.924480628860191, + "grad_norm": 0.5773801803588867, + "learning_rate": 3.430391811919069e-06, + "loss": 0.3717, + "step": 6855 + }, + { + "epoch": 1.9247613700168444, + "grad_norm": 0.5805873870849609, + "learning_rate": 3.4288411097568375e-06, + "loss": 0.3662, + "step": 6856 + }, + { + "epoch": 1.9250421111734979, + "grad_norm": 0.7163267731666565, + "learning_rate": 3.4272905752589113e-06, + "loss": 0.3705, + "step": 6857 + }, + { + "epoch": 1.9253228523301515, + "grad_norm": 0.6486401557922363, + "learning_rate": 3.425740208590753e-06, + "loss": 0.3736, + "step": 6858 + }, + { + "epoch": 1.9256035934868052, + "grad_norm": 0.6509899497032166, + "learning_rate": 3.4241900099178125e-06, + "loss": 0.3933, + "step": 6859 + }, + { + "epoch": 1.9258843346434587, + "grad_norm": 0.6873028874397278, + "learning_rate": 3.4226399794055144e-06, + "loss": 0.3539, + "step": 6860 + }, + { + "epoch": 1.9261650758001123, + "grad_norm": 0.6148975491523743, + "learning_rate": 3.421090117219268e-06, + "loss": 0.3756, + "step": 6861 + }, + { + "epoch": 1.926445816956766, + "grad_norm": 0.600109338760376, + "learning_rate": 3.4195404235244665e-06, + "loss": 0.4183, + "step": 6862 + }, + { + "epoch": 1.9267265581134194, + "grad_norm": 0.567651629447937, + "learning_rate": 3.4179908984864823e-06, + "loss": 0.3429, + "step": 6863 + }, + { + "epoch": 1.9270072992700729, + "grad_norm": 0.6116517186164856, + "learning_rate": 3.4164415422706716e-06, + "loss": 0.4281, + "step": 6864 + }, + { + "epoch": 1.9272880404267265, + "grad_norm": 0.594724178314209, + "learning_rate": 3.414892355042373e-06, + "loss": 0.391, + "step": 6865 + }, + { + "epoch": 1.9275687815833802, + "grad_norm": 0.6017888188362122, + "learning_rate": 3.4133433369669043e-06, + "loss": 0.4176, + "step": 6866 + }, + { + "epoch": 1.9278495227400336, + "grad_norm": 0.5930371284484863, + "learning_rate": 3.411794488209568e-06, + "loss": 0.4013, + "step": 6867 + }, + { + "epoch": 1.9281302638966873, + "grad_norm": 0.634795606136322, + "learning_rate": 3.410245808935647e-06, + "loss": 0.3922, + "step": 6868 + }, + { + "epoch": 1.928411005053341, + "grad_norm": 0.5888826251029968, + "learning_rate": 3.4086972993104076e-06, + "loss": 0.3541, + "step": 6869 + }, + { + "epoch": 1.9286917462099944, + "grad_norm": 0.592771053314209, + "learning_rate": 3.407148959499097e-06, + "loss": 0.373, + "step": 6870 + }, + { + "epoch": 1.9289724873666478, + "grad_norm": 0.6941747069358826, + "learning_rate": 3.405600789666945e-06, + "loss": 0.41, + "step": 6871 + }, + { + "epoch": 1.9292532285233015, + "grad_norm": 0.5978795886039734, + "learning_rate": 3.404052789979161e-06, + "loss": 0.3842, + "step": 6872 + }, + { + "epoch": 1.9295339696799552, + "grad_norm": 0.5973474383354187, + "learning_rate": 3.402504960600938e-06, + "loss": 0.3843, + "step": 6873 + }, + { + "epoch": 1.9298147108366086, + "grad_norm": 0.5809810161590576, + "learning_rate": 3.4009573016974517e-06, + "loss": 0.3621, + "step": 6874 + }, + { + "epoch": 1.9300954519932623, + "grad_norm": 0.6391447186470032, + "learning_rate": 3.3994098134338587e-06, + "loss": 0.3933, + "step": 6875 + }, + { + "epoch": 1.930376193149916, + "grad_norm": 0.5237780213356018, + "learning_rate": 3.397862495975297e-06, + "loss": 0.3833, + "step": 6876 + }, + { + "epoch": 1.9306569343065694, + "grad_norm": 0.6448827981948853, + "learning_rate": 3.3963153494868873e-06, + "loss": 0.3456, + "step": 6877 + }, + { + "epoch": 1.9309376754632228, + "grad_norm": 0.6682508587837219, + "learning_rate": 3.39476837413373e-06, + "loss": 0.4004, + "step": 6878 + }, + { + "epoch": 1.9312184166198765, + "grad_norm": 0.6115602254867554, + "learning_rate": 3.39322157008091e-06, + "loss": 0.3822, + "step": 6879 + }, + { + "epoch": 1.9314991577765301, + "grad_norm": 0.6135326027870178, + "learning_rate": 3.3916749374934917e-06, + "loss": 0.3374, + "step": 6880 + }, + { + "epoch": 1.9317798989331836, + "grad_norm": 0.6781626343727112, + "learning_rate": 3.390128476536523e-06, + "loss": 0.3421, + "step": 6881 + }, + { + "epoch": 1.932060640089837, + "grad_norm": 0.6635868549346924, + "learning_rate": 3.3885821873750314e-06, + "loss": 0.3802, + "step": 6882 + }, + { + "epoch": 1.9323413812464907, + "grad_norm": 0.5993078947067261, + "learning_rate": 3.387036070174027e-06, + "loss": 0.3896, + "step": 6883 + }, + { + "epoch": 1.9326221224031443, + "grad_norm": 0.6735298037528992, + "learning_rate": 3.3854901250985045e-06, + "loss": 0.4156, + "step": 6884 + }, + { + "epoch": 1.9329028635597978, + "grad_norm": 0.5911180973052979, + "learning_rate": 3.383944352313435e-06, + "loss": 0.397, + "step": 6885 + }, + { + "epoch": 1.9331836047164515, + "grad_norm": 0.6654474139213562, + "learning_rate": 3.3823987519837752e-06, + "loss": 0.3541, + "step": 6886 + }, + { + "epoch": 1.9334643458731051, + "grad_norm": 0.6673989295959473, + "learning_rate": 3.380853324274463e-06, + "loss": 0.3677, + "step": 6887 + }, + { + "epoch": 1.9337450870297586, + "grad_norm": 0.7239603400230408, + "learning_rate": 3.3793080693504132e-06, + "loss": 0.3958, + "step": 6888 + }, + { + "epoch": 1.934025828186412, + "grad_norm": 0.6146863102912903, + "learning_rate": 3.3777629873765283e-06, + "loss": 0.3899, + "step": 6889 + }, + { + "epoch": 1.9343065693430657, + "grad_norm": 0.573029637336731, + "learning_rate": 3.3762180785176897e-06, + "loss": 0.3721, + "step": 6890 + }, + { + "epoch": 1.9345873104997193, + "grad_norm": 0.6924031376838684, + "learning_rate": 3.3746733429387596e-06, + "loss": 0.4149, + "step": 6891 + }, + { + "epoch": 1.9348680516563728, + "grad_norm": 0.6505733132362366, + "learning_rate": 3.373128780804583e-06, + "loss": 0.3501, + "step": 6892 + }, + { + "epoch": 1.9351487928130264, + "grad_norm": 0.6565077304840088, + "learning_rate": 3.3715843922799873e-06, + "loss": 0.4297, + "step": 6893 + }, + { + "epoch": 1.93542953396968, + "grad_norm": 0.681267261505127, + "learning_rate": 3.3700401775297787e-06, + "loss": 0.3784, + "step": 6894 + }, + { + "epoch": 1.9357102751263335, + "grad_norm": 0.7059513926506042, + "learning_rate": 3.368496136718745e-06, + "loss": 0.3856, + "step": 6895 + }, + { + "epoch": 1.935991016282987, + "grad_norm": 0.6833927035331726, + "learning_rate": 3.3669522700116585e-06, + "loss": 0.3679, + "step": 6896 + }, + { + "epoch": 1.9362717574396406, + "grad_norm": 0.6739688515663147, + "learning_rate": 3.3654085775732703e-06, + "loss": 0.3588, + "step": 6897 + }, + { + "epoch": 1.9365524985962943, + "grad_norm": 0.6699977517127991, + "learning_rate": 3.3638650595683135e-06, + "loss": 0.413, + "step": 6898 + }, + { + "epoch": 1.9368332397529477, + "grad_norm": 0.6002979278564453, + "learning_rate": 3.362321716161505e-06, + "loss": 0.3458, + "step": 6899 + }, + { + "epoch": 1.9371139809096012, + "grad_norm": 0.6465573906898499, + "learning_rate": 3.360778547517537e-06, + "loss": 0.373, + "step": 6900 + }, + { + "epoch": 1.937394722066255, + "grad_norm": 0.6303117871284485, + "learning_rate": 3.359235553801089e-06, + "loss": 0.3694, + "step": 6901 + }, + { + "epoch": 1.9376754632229085, + "grad_norm": 0.655850350856781, + "learning_rate": 3.3576927351768195e-06, + "loss": 0.3831, + "step": 6902 + }, + { + "epoch": 1.937956204379562, + "grad_norm": 0.5944316983222961, + "learning_rate": 3.3561500918093693e-06, + "loss": 0.3822, + "step": 6903 + }, + { + "epoch": 1.9382369455362156, + "grad_norm": 0.6386849880218506, + "learning_rate": 3.354607623863358e-06, + "loss": 0.4114, + "step": 6904 + }, + { + "epoch": 1.9385176866928693, + "grad_norm": 0.6671199202537537, + "learning_rate": 3.3530653315033902e-06, + "loss": 0.399, + "step": 6905 + }, + { + "epoch": 1.9387984278495227, + "grad_norm": 0.6202429533004761, + "learning_rate": 3.351523214894048e-06, + "loss": 0.3604, + "step": 6906 + }, + { + "epoch": 1.9390791690061762, + "grad_norm": 0.5951457023620605, + "learning_rate": 3.349981274199896e-06, + "loss": 0.3769, + "step": 6907 + }, + { + "epoch": 1.9393599101628298, + "grad_norm": 0.6183099746704102, + "learning_rate": 3.3484395095854815e-06, + "loss": 0.388, + "step": 6908 + }, + { + "epoch": 1.9396406513194835, + "grad_norm": 0.703260064125061, + "learning_rate": 3.3468979212153328e-06, + "loss": 0.3831, + "step": 6909 + }, + { + "epoch": 1.939921392476137, + "grad_norm": 0.6947627663612366, + "learning_rate": 3.3453565092539586e-06, + "loss": 0.4032, + "step": 6910 + }, + { + "epoch": 1.9402021336327906, + "grad_norm": 0.6286624670028687, + "learning_rate": 3.343815273865848e-06, + "loss": 0.3652, + "step": 6911 + }, + { + "epoch": 1.9404828747894443, + "grad_norm": 0.5934192538261414, + "learning_rate": 3.342274215215472e-06, + "loss": 0.3909, + "step": 6912 + }, + { + "epoch": 1.9407636159460977, + "grad_norm": 0.7036631107330322, + "learning_rate": 3.3407333334672832e-06, + "loss": 0.4325, + "step": 6913 + }, + { + "epoch": 1.9410443571027511, + "grad_norm": 0.5805580019950867, + "learning_rate": 3.339192628785716e-06, + "loss": 0.4018, + "step": 6914 + }, + { + "epoch": 1.9413250982594048, + "grad_norm": 0.6935589909553528, + "learning_rate": 3.3376521013351816e-06, + "loss": 0.3545, + "step": 6915 + }, + { + "epoch": 1.9416058394160585, + "grad_norm": 0.5639068484306335, + "learning_rate": 3.336111751280078e-06, + "loss": 0.3604, + "step": 6916 + }, + { + "epoch": 1.941886580572712, + "grad_norm": 0.6157374382019043, + "learning_rate": 3.3345715787847814e-06, + "loss": 0.3801, + "step": 6917 + }, + { + "epoch": 1.9421673217293656, + "grad_norm": 0.5812360048294067, + "learning_rate": 3.3330315840136494e-06, + "loss": 0.3986, + "step": 6918 + }, + { + "epoch": 1.9424480628860192, + "grad_norm": 0.6319735646247864, + "learning_rate": 3.3314917671310204e-06, + "loss": 0.4064, + "step": 6919 + }, + { + "epoch": 1.9427288040426727, + "grad_norm": 0.5736815929412842, + "learning_rate": 3.329952128301215e-06, + "loss": 0.3741, + "step": 6920 + }, + { + "epoch": 1.9430095451993261, + "grad_norm": 0.577810525894165, + "learning_rate": 3.3284126676885324e-06, + "loss": 0.3515, + "step": 6921 + }, + { + "epoch": 1.9432902863559798, + "grad_norm": 0.6701428294181824, + "learning_rate": 3.3268733854572553e-06, + "loss": 0.4244, + "step": 6922 + }, + { + "epoch": 1.9435710275126334, + "grad_norm": 0.6287543773651123, + "learning_rate": 3.3253342817716456e-06, + "loss": 0.3776, + "step": 6923 + }, + { + "epoch": 1.9438517686692869, + "grad_norm": 0.618564784526825, + "learning_rate": 3.323795356795947e-06, + "loss": 0.3389, + "step": 6924 + }, + { + "epoch": 1.9441325098259403, + "grad_norm": 0.6012948751449585, + "learning_rate": 3.3222566106943848e-06, + "loss": 0.369, + "step": 6925 + }, + { + "epoch": 1.9444132509825942, + "grad_norm": 0.6347872614860535, + "learning_rate": 3.3207180436311646e-06, + "loss": 0.3737, + "step": 6926 + }, + { + "epoch": 1.9446939921392477, + "grad_norm": 0.5722830295562744, + "learning_rate": 3.3191796557704712e-06, + "loss": 0.3621, + "step": 6927 + }, + { + "epoch": 1.944974733295901, + "grad_norm": 0.6756933927536011, + "learning_rate": 3.3176414472764727e-06, + "loss": 0.3688, + "step": 6928 + }, + { + "epoch": 1.9452554744525548, + "grad_norm": 0.627498209476471, + "learning_rate": 3.3161034183133173e-06, + "loss": 0.3649, + "step": 6929 + }, + { + "epoch": 1.9455362156092084, + "grad_norm": 0.6117562055587769, + "learning_rate": 3.314565569045133e-06, + "loss": 0.3488, + "step": 6930 + }, + { + "epoch": 1.9458169567658619, + "grad_norm": 0.5619035363197327, + "learning_rate": 3.313027899636031e-06, + "loss": 0.3735, + "step": 6931 + }, + { + "epoch": 1.9460976979225153, + "grad_norm": 0.704190731048584, + "learning_rate": 3.311490410250101e-06, + "loss": 0.3853, + "step": 6932 + }, + { + "epoch": 1.946378439079169, + "grad_norm": 0.6487050652503967, + "learning_rate": 3.309953101051414e-06, + "loss": 0.3795, + "step": 6933 + }, + { + "epoch": 1.9466591802358226, + "grad_norm": 0.6865701675415039, + "learning_rate": 3.30841597220402e-06, + "loss": 0.3703, + "step": 6934 + }, + { + "epoch": 1.946939921392476, + "grad_norm": 0.5946937799453735, + "learning_rate": 3.3068790238719563e-06, + "loss": 0.3701, + "step": 6935 + }, + { + "epoch": 1.9472206625491297, + "grad_norm": 0.6469739675521851, + "learning_rate": 3.305342256219235e-06, + "loss": 0.4324, + "step": 6936 + }, + { + "epoch": 1.9475014037057834, + "grad_norm": 0.7273101806640625, + "learning_rate": 3.3038056694098485e-06, + "loss": 0.4094, + "step": 6937 + }, + { + "epoch": 1.9477821448624368, + "grad_norm": 0.6214479804039001, + "learning_rate": 3.3022692636077734e-06, + "loss": 0.3569, + "step": 6938 + }, + { + "epoch": 1.9480628860190903, + "grad_norm": 0.6211503148078918, + "learning_rate": 3.3007330389769655e-06, + "loss": 0.3713, + "step": 6939 + }, + { + "epoch": 1.948343627175744, + "grad_norm": 0.6319563388824463, + "learning_rate": 3.2991969956813604e-06, + "loss": 0.4089, + "step": 6940 + }, + { + "epoch": 1.9486243683323976, + "grad_norm": 0.6137008666992188, + "learning_rate": 3.297661133884875e-06, + "loss": 0.379, + "step": 6941 + }, + { + "epoch": 1.948905109489051, + "grad_norm": 0.7523574829101562, + "learning_rate": 3.296125453751409e-06, + "loss": 0.4119, + "step": 6942 + }, + { + "epoch": 1.9491858506457047, + "grad_norm": 0.6027659773826599, + "learning_rate": 3.294589955444837e-06, + "loss": 0.3915, + "step": 6943 + }, + { + "epoch": 1.9494665918023584, + "grad_norm": 0.515748143196106, + "learning_rate": 3.29305463912902e-06, + "loss": 0.4017, + "step": 6944 + }, + { + "epoch": 1.9497473329590118, + "grad_norm": 0.6369849443435669, + "learning_rate": 3.291519504967797e-06, + "loss": 0.3587, + "step": 6945 + }, + { + "epoch": 1.9500280741156653, + "grad_norm": 0.6632637977600098, + "learning_rate": 3.2899845531249878e-06, + "loss": 0.3521, + "step": 6946 + }, + { + "epoch": 1.950308815272319, + "grad_norm": 0.6441149711608887, + "learning_rate": 3.2884497837643934e-06, + "loss": 0.3886, + "step": 6947 + }, + { + "epoch": 1.9505895564289726, + "grad_norm": 0.6686705350875854, + "learning_rate": 3.2869151970497964e-06, + "loss": 0.3789, + "step": 6948 + }, + { + "epoch": 1.950870297585626, + "grad_norm": 0.6360732913017273, + "learning_rate": 3.285380793144955e-06, + "loss": 0.3802, + "step": 6949 + }, + { + "epoch": 1.9511510387422795, + "grad_norm": 0.6022663712501526, + "learning_rate": 3.2838465722136126e-06, + "loss": 0.3563, + "step": 6950 + }, + { + "epoch": 1.9514317798989333, + "grad_norm": 0.54377681016922, + "learning_rate": 3.282312534419493e-06, + "loss": 0.4103, + "step": 6951 + }, + { + "epoch": 1.9517125210555868, + "grad_norm": 0.5532845854759216, + "learning_rate": 3.280778679926297e-06, + "loss": 0.3644, + "step": 6952 + }, + { + "epoch": 1.9519932622122402, + "grad_norm": 0.6353257894515991, + "learning_rate": 3.2792450088977097e-06, + "loss": 0.3898, + "step": 6953 + }, + { + "epoch": 1.952274003368894, + "grad_norm": 0.6258017420768738, + "learning_rate": 3.2777115214973953e-06, + "loss": 0.371, + "step": 6954 + }, + { + "epoch": 1.9525547445255476, + "grad_norm": 0.6752704381942749, + "learning_rate": 3.2761782178889955e-06, + "loss": 0.3796, + "step": 6955 + }, + { + "epoch": 1.952835485682201, + "grad_norm": 0.6499730944633484, + "learning_rate": 3.274645098236138e-06, + "loss": 0.3976, + "step": 6956 + }, + { + "epoch": 1.9531162268388544, + "grad_norm": 0.7651991248130798, + "learning_rate": 3.273112162702425e-06, + "loss": 0.3648, + "step": 6957 + }, + { + "epoch": 1.953396967995508, + "grad_norm": 0.6649768948554993, + "learning_rate": 3.2715794114514433e-06, + "loss": 0.3806, + "step": 6958 + }, + { + "epoch": 1.9536777091521618, + "grad_norm": 0.6103923320770264, + "learning_rate": 3.270046844646758e-06, + "loss": 0.368, + "step": 6959 + }, + { + "epoch": 1.9539584503088152, + "grad_norm": 0.7249850630760193, + "learning_rate": 3.268514462451916e-06, + "loss": 0.3778, + "step": 6960 + }, + { + "epoch": 1.9542391914654689, + "grad_norm": 0.6887420415878296, + "learning_rate": 3.266982265030444e-06, + "loss": 0.3693, + "step": 6961 + }, + { + "epoch": 1.9545199326221225, + "grad_norm": 0.593892514705658, + "learning_rate": 3.265450252545847e-06, + "loss": 0.4007, + "step": 6962 + }, + { + "epoch": 1.954800673778776, + "grad_norm": 0.6767063736915588, + "learning_rate": 3.263918425161614e-06, + "loss": 0.3579, + "step": 6963 + }, + { + "epoch": 1.9550814149354294, + "grad_norm": 0.6518818736076355, + "learning_rate": 3.2623867830412093e-06, + "loss": 0.3879, + "step": 6964 + }, + { + "epoch": 1.955362156092083, + "grad_norm": 0.6803855299949646, + "learning_rate": 3.2608553263480826e-06, + "loss": 0.3831, + "step": 6965 + }, + { + "epoch": 1.9556428972487367, + "grad_norm": 0.5994442701339722, + "learning_rate": 3.2593240552456594e-06, + "loss": 0.3853, + "step": 6966 + }, + { + "epoch": 1.9559236384053902, + "grad_norm": 0.6772409081459045, + "learning_rate": 3.2577929698973486e-06, + "loss": 0.3637, + "step": 6967 + }, + { + "epoch": 1.9562043795620438, + "grad_norm": 0.6775597333908081, + "learning_rate": 3.256262070466538e-06, + "loss": 0.3719, + "step": 6968 + }, + { + "epoch": 1.9564851207186975, + "grad_norm": 0.6733676195144653, + "learning_rate": 3.254731357116597e-06, + "loss": 0.4176, + "step": 6969 + }, + { + "epoch": 1.956765861875351, + "grad_norm": 0.6897537112236023, + "learning_rate": 3.2532008300108715e-06, + "loss": 0.3605, + "step": 6970 + }, + { + "epoch": 1.9570466030320044, + "grad_norm": 0.5646283030509949, + "learning_rate": 3.2516704893126904e-06, + "loss": 0.3731, + "step": 6971 + }, + { + "epoch": 1.957327344188658, + "grad_norm": 0.5377715229988098, + "learning_rate": 3.250140335185363e-06, + "loss": 0.3694, + "step": 6972 + }, + { + "epoch": 1.9576080853453117, + "grad_norm": 0.6581278443336487, + "learning_rate": 3.2486103677921767e-06, + "loss": 0.3723, + "step": 6973 + }, + { + "epoch": 1.9578888265019652, + "grad_norm": 0.6968143582344055, + "learning_rate": 3.2470805872964016e-06, + "loss": 0.4159, + "step": 6974 + }, + { + "epoch": 1.9581695676586186, + "grad_norm": 0.6114397048950195, + "learning_rate": 3.245550993861285e-06, + "loss": 0.3793, + "step": 6975 + }, + { + "epoch": 1.9584503088152723, + "grad_norm": 0.6772613525390625, + "learning_rate": 3.244021587650057e-06, + "loss": 0.4359, + "step": 6976 + }, + { + "epoch": 1.958731049971926, + "grad_norm": 0.6400285959243774, + "learning_rate": 3.2424923688259246e-06, + "loss": 0.3723, + "step": 6977 + }, + { + "epoch": 1.9590117911285794, + "grad_norm": 0.6172521114349365, + "learning_rate": 3.2409633375520777e-06, + "loss": 0.3647, + "step": 6978 + }, + { + "epoch": 1.959292532285233, + "grad_norm": 0.5661091208457947, + "learning_rate": 3.2394344939916845e-06, + "loss": 0.3964, + "step": 6979 + }, + { + "epoch": 1.9595732734418867, + "grad_norm": 0.5954875349998474, + "learning_rate": 3.2379058383078937e-06, + "loss": 0.3698, + "step": 6980 + }, + { + "epoch": 1.9598540145985401, + "grad_norm": 0.5770564675331116, + "learning_rate": 3.2363773706638345e-06, + "loss": 0.4045, + "step": 6981 + }, + { + "epoch": 1.9601347557551936, + "grad_norm": 0.7111275792121887, + "learning_rate": 3.234849091222616e-06, + "loss": 0.3546, + "step": 6982 + }, + { + "epoch": 1.9604154969118472, + "grad_norm": 0.5821360945701599, + "learning_rate": 3.233321000147324e-06, + "loss": 0.3822, + "step": 6983 + }, + { + "epoch": 1.960696238068501, + "grad_norm": 0.6313339471817017, + "learning_rate": 3.231793097601029e-06, + "loss": 0.3621, + "step": 6984 + }, + { + "epoch": 1.9609769792251543, + "grad_norm": 0.5764349102973938, + "learning_rate": 3.230265383746778e-06, + "loss": 0.3608, + "step": 6985 + }, + { + "epoch": 1.961257720381808, + "grad_norm": 0.6047051548957825, + "learning_rate": 3.2287378587476014e-06, + "loss": 0.3884, + "step": 6986 + }, + { + "epoch": 1.9615384615384617, + "grad_norm": 0.6179006099700928, + "learning_rate": 3.227210522766505e-06, + "loss": 0.3826, + "step": 6987 + }, + { + "epoch": 1.9618192026951151, + "grad_norm": 0.6662248969078064, + "learning_rate": 3.225683375966478e-06, + "loss": 0.3407, + "step": 6988 + }, + { + "epoch": 1.9620999438517686, + "grad_norm": 0.6197724342346191, + "learning_rate": 3.224156418510487e-06, + "loss": 0.38, + "step": 6989 + }, + { + "epoch": 1.9623806850084222, + "grad_norm": 0.6409143805503845, + "learning_rate": 3.2226296505614796e-06, + "loss": 0.4148, + "step": 6990 + }, + { + "epoch": 1.9626614261650759, + "grad_norm": 0.6775321364402771, + "learning_rate": 3.2211030722823844e-06, + "loss": 0.3673, + "step": 6991 + }, + { + "epoch": 1.9629421673217293, + "grad_norm": 0.5687212944030762, + "learning_rate": 3.2195766838361065e-06, + "loss": 0.3936, + "step": 6992 + }, + { + "epoch": 1.9632229084783828, + "grad_norm": 0.5648772120475769, + "learning_rate": 3.2180504853855334e-06, + "loss": 0.3863, + "step": 6993 + }, + { + "epoch": 1.9635036496350367, + "grad_norm": 0.5899771451950073, + "learning_rate": 3.21652447709353e-06, + "loss": 0.3845, + "step": 6994 + }, + { + "epoch": 1.96378439079169, + "grad_norm": 0.602634072303772, + "learning_rate": 3.2149986591229443e-06, + "loss": 0.4047, + "step": 6995 + }, + { + "epoch": 1.9640651319483435, + "grad_norm": 0.6572073101997375, + "learning_rate": 3.2134730316366015e-06, + "loss": 0.4277, + "step": 6996 + }, + { + "epoch": 1.9643458731049972, + "grad_norm": 0.526705265045166, + "learning_rate": 3.211947594797309e-06, + "loss": 0.3753, + "step": 6997 + }, + { + "epoch": 1.9646266142616509, + "grad_norm": 0.6437777876853943, + "learning_rate": 3.2104223487678476e-06, + "loss": 0.3445, + "step": 6998 + }, + { + "epoch": 1.9649073554183043, + "grad_norm": 0.6343139410018921, + "learning_rate": 3.208897293710985e-06, + "loss": 0.4043, + "step": 6999 + }, + { + "epoch": 1.9651880965749577, + "grad_norm": 0.621269941329956, + "learning_rate": 3.2073724297894654e-06, + "loss": 0.3572, + "step": 7000 + }, + { + "epoch": 1.9654688377316114, + "grad_norm": 0.6208169460296631, + "learning_rate": 3.205847757166012e-06, + "loss": 0.3759, + "step": 7001 + }, + { + "epoch": 1.965749578888265, + "grad_norm": 0.7012404799461365, + "learning_rate": 3.2043232760033294e-06, + "loss": 0.3962, + "step": 7002 + }, + { + "epoch": 1.9660303200449185, + "grad_norm": 0.6112543940544128, + "learning_rate": 3.2027989864641008e-06, + "loss": 0.356, + "step": 7003 + }, + { + "epoch": 1.9663110612015722, + "grad_norm": 0.603643536567688, + "learning_rate": 3.2012748887109873e-06, + "loss": 0.3771, + "step": 7004 + }, + { + "epoch": 1.9665918023582258, + "grad_norm": 0.7005816102027893, + "learning_rate": 3.1997509829066324e-06, + "loss": 0.4341, + "step": 7005 + }, + { + "epoch": 1.9668725435148793, + "grad_norm": 0.6568206548690796, + "learning_rate": 3.198227269213657e-06, + "loss": 0.3561, + "step": 7006 + }, + { + "epoch": 1.9671532846715327, + "grad_norm": 0.6353272795677185, + "learning_rate": 3.196703747794664e-06, + "loss": 0.439, + "step": 7007 + }, + { + "epoch": 1.9674340258281864, + "grad_norm": 0.6421124339103699, + "learning_rate": 3.1951804188122324e-06, + "loss": 0.3894, + "step": 7008 + }, + { + "epoch": 1.96771476698484, + "grad_norm": 0.6794632077217102, + "learning_rate": 3.193657282428924e-06, + "loss": 0.3903, + "step": 7009 + }, + { + "epoch": 1.9679955081414935, + "grad_norm": 0.6266258358955383, + "learning_rate": 3.1921343388072746e-06, + "loss": 0.3201, + "step": 7010 + }, + { + "epoch": 1.9682762492981472, + "grad_norm": 0.6301803588867188, + "learning_rate": 3.1906115881098086e-06, + "loss": 0.3973, + "step": 7011 + }, + { + "epoch": 1.9685569904548008, + "grad_norm": 0.6951670050621033, + "learning_rate": 3.1890890304990217e-06, + "loss": 0.3904, + "step": 7012 + }, + { + "epoch": 1.9688377316114543, + "grad_norm": 0.6230060458183289, + "learning_rate": 3.1875666661373927e-06, + "loss": 0.373, + "step": 7013 + }, + { + "epoch": 1.9691184727681077, + "grad_norm": 0.6953141093254089, + "learning_rate": 3.1860444951873783e-06, + "loss": 0.3742, + "step": 7014 + }, + { + "epoch": 1.9693992139247614, + "grad_norm": 0.6503199338912964, + "learning_rate": 3.184522517811415e-06, + "loss": 0.3592, + "step": 7015 + }, + { + "epoch": 1.969679955081415, + "grad_norm": 0.6654722094535828, + "learning_rate": 3.1830007341719182e-06, + "loss": 0.3723, + "step": 7016 + }, + { + "epoch": 1.9699606962380685, + "grad_norm": 0.6210854053497314, + "learning_rate": 3.1814791444312843e-06, + "loss": 0.3341, + "step": 7017 + }, + { + "epoch": 1.970241437394722, + "grad_norm": 0.6457767486572266, + "learning_rate": 3.1799577487518875e-06, + "loss": 0.3821, + "step": 7018 + }, + { + "epoch": 1.9705221785513758, + "grad_norm": 0.6870471239089966, + "learning_rate": 3.178436547296082e-06, + "loss": 0.3652, + "step": 7019 + }, + { + "epoch": 1.9708029197080292, + "grad_norm": 0.6089987754821777, + "learning_rate": 3.1769155402262002e-06, + "loss": 0.3559, + "step": 7020 + }, + { + "epoch": 1.9710836608646827, + "grad_norm": 0.6179919242858887, + "learning_rate": 3.175394727704555e-06, + "loss": 0.3526, + "step": 7021 + }, + { + "epoch": 1.9713644020213363, + "grad_norm": 0.5888646841049194, + "learning_rate": 3.173874109893438e-06, + "loss": 0.3808, + "step": 7022 + }, + { + "epoch": 1.97164514317799, + "grad_norm": 0.5951924920082092, + "learning_rate": 3.1723536869551197e-06, + "loss": 0.3519, + "step": 7023 + }, + { + "epoch": 1.9719258843346434, + "grad_norm": 0.6952611207962036, + "learning_rate": 3.170833459051851e-06, + "loss": 0.3557, + "step": 7024 + }, + { + "epoch": 1.9722066254912969, + "grad_norm": 0.6679577231407166, + "learning_rate": 3.1693134263458614e-06, + "loss": 0.3927, + "step": 7025 + }, + { + "epoch": 1.9724873666479505, + "grad_norm": 0.5919626951217651, + "learning_rate": 3.167793588999358e-06, + "loss": 0.402, + "step": 7026 + }, + { + "epoch": 1.9727681078046042, + "grad_norm": 0.5656420588493347, + "learning_rate": 3.1662739471745287e-06, + "loss": 0.3738, + "step": 7027 + }, + { + "epoch": 1.9730488489612577, + "grad_norm": 0.6829491853713989, + "learning_rate": 3.16475450103354e-06, + "loss": 0.4032, + "step": 7028 + }, + { + "epoch": 1.9733295901179113, + "grad_norm": 0.6164791584014893, + "learning_rate": 3.1632352507385393e-06, + "loss": 0.3654, + "step": 7029 + }, + { + "epoch": 1.973610331274565, + "grad_norm": 0.7377853989601135, + "learning_rate": 3.1617161964516503e-06, + "loss": 0.375, + "step": 7030 + }, + { + "epoch": 1.9738910724312184, + "grad_norm": 0.635339081287384, + "learning_rate": 3.1601973383349784e-06, + "loss": 0.4071, + "step": 7031 + }, + { + "epoch": 1.9741718135878719, + "grad_norm": 0.6464971899986267, + "learning_rate": 3.158678676550604e-06, + "loss": 0.3729, + "step": 7032 + }, + { + "epoch": 1.9744525547445255, + "grad_norm": 0.59506756067276, + "learning_rate": 3.1571602112605916e-06, + "loss": 0.3461, + "step": 7033 + }, + { + "epoch": 1.9747332959011792, + "grad_norm": 0.7854287624359131, + "learning_rate": 3.1556419426269808e-06, + "loss": 0.3778, + "step": 7034 + }, + { + "epoch": 1.9750140370578326, + "grad_norm": 0.6562832593917847, + "learning_rate": 3.1541238708117926e-06, + "loss": 0.3554, + "step": 7035 + }, + { + "epoch": 1.9752947782144863, + "grad_norm": 0.6035671234130859, + "learning_rate": 3.152605995977026e-06, + "loss": 0.3804, + "step": 7036 + }, + { + "epoch": 1.97557551937114, + "grad_norm": 0.5490044355392456, + "learning_rate": 3.15108831828466e-06, + "loss": 0.3672, + "step": 7037 + }, + { + "epoch": 1.9758562605277934, + "grad_norm": 0.6840856075286865, + "learning_rate": 3.1495708378966507e-06, + "loss": 0.4226, + "step": 7038 + }, + { + "epoch": 1.9761370016844468, + "grad_norm": 0.6101973056793213, + "learning_rate": 3.1480535549749348e-06, + "loss": 0.3825, + "step": 7039 + }, + { + "epoch": 1.9764177428411005, + "grad_norm": 0.6528775095939636, + "learning_rate": 3.1465364696814277e-06, + "loss": 0.3835, + "step": 7040 + }, + { + "epoch": 1.9766984839977542, + "grad_norm": 0.6299150586128235, + "learning_rate": 3.145019582178022e-06, + "loss": 0.3847, + "step": 7041 + }, + { + "epoch": 1.9769792251544076, + "grad_norm": 0.5839090943336487, + "learning_rate": 3.143502892626591e-06, + "loss": 0.3966, + "step": 7042 + }, + { + "epoch": 1.977259966311061, + "grad_norm": 0.6374778747558594, + "learning_rate": 3.141986401188987e-06, + "loss": 0.3617, + "step": 7043 + }, + { + "epoch": 1.977540707467715, + "grad_norm": 0.5736613869667053, + "learning_rate": 3.140470108027039e-06, + "loss": 0.3905, + "step": 7044 + }, + { + "epoch": 1.9778214486243684, + "grad_norm": 0.6634061336517334, + "learning_rate": 3.1389540133025575e-06, + "loss": 0.4334, + "step": 7045 + }, + { + "epoch": 1.9781021897810218, + "grad_norm": 0.7333037257194519, + "learning_rate": 3.1374381171773326e-06, + "loss": 0.3997, + "step": 7046 + }, + { + "epoch": 1.9783829309376755, + "grad_norm": 0.7173023819923401, + "learning_rate": 3.135922419813128e-06, + "loss": 0.4126, + "step": 7047 + }, + { + "epoch": 1.9786636720943291, + "grad_norm": 0.6173637509346008, + "learning_rate": 3.13440692137169e-06, + "loss": 0.36, + "step": 7048 + }, + { + "epoch": 1.9789444132509826, + "grad_norm": 0.618675172328949, + "learning_rate": 3.1328916220147447e-06, + "loss": 0.3511, + "step": 7049 + }, + { + "epoch": 1.979225154407636, + "grad_norm": 0.6620262861251831, + "learning_rate": 3.1313765219039947e-06, + "loss": 0.3853, + "step": 7050 + }, + { + "epoch": 1.9795058955642897, + "grad_norm": 0.7026667594909668, + "learning_rate": 3.1298616212011224e-06, + "loss": 0.3585, + "step": 7051 + }, + { + "epoch": 1.9797866367209433, + "grad_norm": 0.6468698978424072, + "learning_rate": 3.1283469200677886e-06, + "loss": 0.3959, + "step": 7052 + }, + { + "epoch": 1.9800673778775968, + "grad_norm": 0.6284026503562927, + "learning_rate": 3.126832418665632e-06, + "loss": 0.3833, + "step": 7053 + }, + { + "epoch": 1.9803481190342505, + "grad_norm": 0.7425382733345032, + "learning_rate": 3.1253181171562707e-06, + "loss": 0.4048, + "step": 7054 + }, + { + "epoch": 1.9806288601909041, + "grad_norm": 0.6566505432128906, + "learning_rate": 3.123804015701302e-06, + "loss": 0.3514, + "step": 7055 + }, + { + "epoch": 1.9809096013475576, + "grad_norm": 0.5765173435211182, + "learning_rate": 3.1222901144623018e-06, + "loss": 0.3483, + "step": 7056 + }, + { + "epoch": 1.981190342504211, + "grad_norm": 0.6245824098587036, + "learning_rate": 3.120776413600824e-06, + "loss": 0.4012, + "step": 7057 + }, + { + "epoch": 1.9814710836608647, + "grad_norm": 0.533491849899292, + "learning_rate": 3.1192629132784023e-06, + "loss": 0.3971, + "step": 7058 + }, + { + "epoch": 1.9817518248175183, + "grad_norm": 0.7285841703414917, + "learning_rate": 3.1177496136565455e-06, + "loss": 0.3703, + "step": 7059 + }, + { + "epoch": 1.9820325659741718, + "grad_norm": 0.6480452418327332, + "learning_rate": 3.1162365148967453e-06, + "loss": 0.3495, + "step": 7060 + }, + { + "epoch": 1.9823133071308254, + "grad_norm": 0.6141634583473206, + "learning_rate": 3.114723617160468e-06, + "loss": 0.3925, + "step": 7061 + }, + { + "epoch": 1.982594048287479, + "grad_norm": 0.6793792247772217, + "learning_rate": 3.113210920609165e-06, + "loss": 0.3921, + "step": 7062 + }, + { + "epoch": 1.9828747894441325, + "grad_norm": 0.6222654581069946, + "learning_rate": 3.1116984254042587e-06, + "loss": 0.4115, + "step": 7063 + }, + { + "epoch": 1.983155530600786, + "grad_norm": 0.6494109034538269, + "learning_rate": 3.1101861317071536e-06, + "loss": 0.4008, + "step": 7064 + }, + { + "epoch": 1.9834362717574396, + "grad_norm": 0.6529889106750488, + "learning_rate": 3.108674039679233e-06, + "loss": 0.3543, + "step": 7065 + }, + { + "epoch": 1.9837170129140933, + "grad_norm": 0.6445561051368713, + "learning_rate": 3.107162149481857e-06, + "loss": 0.389, + "step": 7066 + }, + { + "epoch": 1.9839977540707467, + "grad_norm": 0.6440285444259644, + "learning_rate": 3.105650461276366e-06, + "loss": 0.4036, + "step": 7067 + }, + { + "epoch": 1.9842784952274002, + "grad_norm": 0.5650212168693542, + "learning_rate": 3.1041389752240793e-06, + "loss": 0.382, + "step": 7068 + }, + { + "epoch": 1.9845592363840538, + "grad_norm": 0.7039580345153809, + "learning_rate": 3.1026276914862896e-06, + "loss": 0.3736, + "step": 7069 + }, + { + "epoch": 1.9848399775407075, + "grad_norm": 0.7007707953453064, + "learning_rate": 3.1011166102242733e-06, + "loss": 0.3706, + "step": 7070 + }, + { + "epoch": 1.985120718697361, + "grad_norm": 0.5565095543861389, + "learning_rate": 3.0996057315992844e-06, + "loss": 0.3656, + "step": 7071 + }, + { + "epoch": 1.9854014598540146, + "grad_norm": 0.6265332698822021, + "learning_rate": 3.0980950557725546e-06, + "loss": 0.3984, + "step": 7072 + }, + { + "epoch": 1.9856822010106683, + "grad_norm": 0.6178296804428101, + "learning_rate": 3.096584582905293e-06, + "loss": 0.3621, + "step": 7073 + }, + { + "epoch": 1.9859629421673217, + "grad_norm": 0.7236082553863525, + "learning_rate": 3.095074313158689e-06, + "loss": 0.3549, + "step": 7074 + }, + { + "epoch": 1.9862436833239752, + "grad_norm": 0.6384782195091248, + "learning_rate": 3.0935642466939076e-06, + "loss": 0.4202, + "step": 7075 + }, + { + "epoch": 1.9865244244806288, + "grad_norm": 0.7140527963638306, + "learning_rate": 3.092054383672094e-06, + "loss": 0.386, + "step": 7076 + }, + { + "epoch": 1.9868051656372825, + "grad_norm": 0.5895060896873474, + "learning_rate": 3.090544724254372e-06, + "loss": 0.3422, + "step": 7077 + }, + { + "epoch": 1.987085906793936, + "grad_norm": 0.544243574142456, + "learning_rate": 3.089035268601843e-06, + "loss": 0.4101, + "step": 7078 + }, + { + "epoch": 1.9873666479505896, + "grad_norm": 0.6739355325698853, + "learning_rate": 3.0875260168755873e-06, + "loss": 0.3777, + "step": 7079 + }, + { + "epoch": 1.9876473891072433, + "grad_norm": 0.5470808744430542, + "learning_rate": 3.086016969236662e-06, + "loss": 0.3704, + "step": 7080 + }, + { + "epoch": 1.9879281302638967, + "grad_norm": 0.629051148891449, + "learning_rate": 3.084508125846103e-06, + "loss": 0.3662, + "step": 7081 + }, + { + "epoch": 1.9882088714205501, + "grad_norm": 0.6512294411659241, + "learning_rate": 3.082999486864925e-06, + "loss": 0.354, + "step": 7082 + }, + { + "epoch": 1.9884896125772038, + "grad_norm": 0.6564597487449646, + "learning_rate": 3.081491052454121e-06, + "loss": 0.3861, + "step": 7083 + }, + { + "epoch": 1.9887703537338575, + "grad_norm": 0.5574722290039062, + "learning_rate": 3.0799828227746615e-06, + "loss": 0.3711, + "step": 7084 + }, + { + "epoch": 1.989051094890511, + "grad_norm": 0.6071930527687073, + "learning_rate": 3.0784747979874954e-06, + "loss": 0.3486, + "step": 7085 + }, + { + "epoch": 1.9893318360471643, + "grad_norm": 0.6464855074882507, + "learning_rate": 3.076966978253548e-06, + "loss": 0.3358, + "step": 7086 + }, + { + "epoch": 1.9896125772038182, + "grad_norm": 0.5567142367362976, + "learning_rate": 3.0754593637337276e-06, + "loss": 0.419, + "step": 7087 + }, + { + "epoch": 1.9898933183604717, + "grad_norm": 0.6773258447647095, + "learning_rate": 3.0739519545889163e-06, + "loss": 0.4196, + "step": 7088 + }, + { + "epoch": 1.9901740595171251, + "grad_norm": 0.7013642191886902, + "learning_rate": 3.0724447509799747e-06, + "loss": 0.4267, + "step": 7089 + }, + { + "epoch": 1.9904548006737788, + "grad_norm": 0.6178555488586426, + "learning_rate": 3.0709377530677433e-06, + "loss": 0.398, + "step": 7090 + }, + { + "epoch": 1.9907355418304324, + "grad_norm": 0.6290339231491089, + "learning_rate": 3.0694309610130386e-06, + "loss": 0.3896, + "step": 7091 + }, + { + "epoch": 1.9910162829870859, + "grad_norm": 0.681408166885376, + "learning_rate": 3.0679243749766557e-06, + "loss": 0.3943, + "step": 7092 + }, + { + "epoch": 1.9912970241437393, + "grad_norm": 0.6687710285186768, + "learning_rate": 3.066417995119369e-06, + "loss": 0.3724, + "step": 7093 + }, + { + "epoch": 1.991577765300393, + "grad_norm": 0.6002901792526245, + "learning_rate": 3.0649118216019296e-06, + "loss": 0.3951, + "step": 7094 + }, + { + "epoch": 1.9918585064570467, + "grad_norm": 0.6790820360183716, + "learning_rate": 3.0634058545850677e-06, + "loss": 0.403, + "step": 7095 + }, + { + "epoch": 1.9921392476137, + "grad_norm": 0.7577724456787109, + "learning_rate": 3.06190009422949e-06, + "loss": 0.3775, + "step": 7096 + }, + { + "epoch": 1.9924199887703538, + "grad_norm": 0.6288694739341736, + "learning_rate": 3.0603945406958812e-06, + "loss": 0.4084, + "step": 7097 + }, + { + "epoch": 1.9927007299270074, + "grad_norm": 0.6337240934371948, + "learning_rate": 3.058889194144906e-06, + "loss": 0.3466, + "step": 7098 + }, + { + "epoch": 1.9929814710836609, + "grad_norm": 0.58216392993927, + "learning_rate": 3.0573840547372047e-06, + "loss": 0.3607, + "step": 7099 + }, + { + "epoch": 1.9932622122403143, + "grad_norm": 0.6355688571929932, + "learning_rate": 3.0558791226333974e-06, + "loss": 0.3573, + "step": 7100 + }, + { + "epoch": 1.993542953396968, + "grad_norm": 0.7551351189613342, + "learning_rate": 3.0543743979940797e-06, + "loss": 0.3727, + "step": 7101 + }, + { + "epoch": 1.9938236945536216, + "grad_norm": 0.6733753681182861, + "learning_rate": 3.0528698809798287e-06, + "loss": 0.3987, + "step": 7102 + }, + { + "epoch": 1.994104435710275, + "grad_norm": 0.6546279788017273, + "learning_rate": 3.0513655717511936e-06, + "loss": 0.3818, + "step": 7103 + }, + { + "epoch": 1.9943851768669287, + "grad_norm": 0.7093046307563782, + "learning_rate": 3.049861470468708e-06, + "loss": 0.3783, + "step": 7104 + }, + { + "epoch": 1.9946659180235824, + "grad_norm": 0.5993322134017944, + "learning_rate": 3.0483575772928786e-06, + "loss": 0.4103, + "step": 7105 + }, + { + "epoch": 1.9949466591802358, + "grad_norm": 0.6651349067687988, + "learning_rate": 3.046853892384192e-06, + "loss": 0.3419, + "step": 7106 + }, + { + "epoch": 1.9952274003368893, + "grad_norm": 0.5755440592765808, + "learning_rate": 3.0453504159031128e-06, + "loss": 0.3556, + "step": 7107 + }, + { + "epoch": 1.995508141493543, + "grad_norm": 0.6396806836128235, + "learning_rate": 3.043847148010083e-06, + "loss": 0.3942, + "step": 7108 + }, + { + "epoch": 1.9957888826501966, + "grad_norm": 0.6420109868049622, + "learning_rate": 3.0423440888655198e-06, + "loss": 0.4171, + "step": 7109 + }, + { + "epoch": 1.99606962380685, + "grad_norm": 0.6826302409172058, + "learning_rate": 3.0408412386298216e-06, + "loss": 0.3735, + "step": 7110 + }, + { + "epoch": 1.9963503649635035, + "grad_norm": 0.6247993111610413, + "learning_rate": 3.0393385974633626e-06, + "loss": 0.3673, + "step": 7111 + }, + { + "epoch": 1.9966311061201574, + "grad_norm": 0.7043548822402954, + "learning_rate": 3.0378361655264955e-06, + "loss": 0.4281, + "step": 7112 + }, + { + "epoch": 1.9969118472768108, + "grad_norm": 0.5801706910133362, + "learning_rate": 3.036333942979552e-06, + "loss": 0.3961, + "step": 7113 + }, + { + "epoch": 1.9971925884334643, + "grad_norm": 0.5859178304672241, + "learning_rate": 3.034831929982839e-06, + "loss": 0.3809, + "step": 7114 + }, + { + "epoch": 1.997473329590118, + "grad_norm": 0.7079702615737915, + "learning_rate": 3.0333301266966415e-06, + "loss": 0.3398, + "step": 7115 + }, + { + "epoch": 1.9977540707467716, + "grad_norm": 0.5475205779075623, + "learning_rate": 3.0318285332812225e-06, + "loss": 0.3939, + "step": 7116 + }, + { + "epoch": 1.998034811903425, + "grad_norm": 0.720130443572998, + "learning_rate": 3.030327149896825e-06, + "loss": 0.3752, + "step": 7117 + }, + { + "epoch": 1.9983155530600785, + "grad_norm": 0.6578987836837769, + "learning_rate": 3.0288259767036645e-06, + "loss": 0.3815, + "step": 7118 + }, + { + "epoch": 1.9985962942167321, + "grad_norm": 0.6133025884628296, + "learning_rate": 3.0273250138619376e-06, + "loss": 0.3697, + "step": 7119 + }, + { + "epoch": 1.9988770353733858, + "grad_norm": 0.6277243494987488, + "learning_rate": 3.025824261531818e-06, + "loss": 0.3579, + "step": 7120 + }, + { + "epoch": 1.9991577765300392, + "grad_norm": 0.6791841387748718, + "learning_rate": 3.0243237198734567e-06, + "loss": 0.4439, + "step": 7121 + }, + { + "epoch": 1.999438517686693, + "grad_norm": 0.6244490146636963, + "learning_rate": 3.0228233890469817e-06, + "loss": 0.3806, + "step": 7122 + }, + { + "epoch": 1.9997192588433466, + "grad_norm": 0.647245466709137, + "learning_rate": 3.0213232692125005e-06, + "loss": 0.3806, + "step": 7123 + }, + { + "epoch": 2.0, + "grad_norm": 0.683252215385437, + "learning_rate": 3.0198233605300947e-06, + "loss": 0.3444, + "step": 7124 + }, + { + "epoch": 2.0002807411566534, + "grad_norm": 0.5534584522247314, + "learning_rate": 3.0183236631598257e-06, + "loss": 0.3508, + "step": 7125 + }, + { + "epoch": 2.0005614823133073, + "grad_norm": 0.5867440700531006, + "learning_rate": 3.0168241772617323e-06, + "loss": 0.2919, + "step": 7126 + }, + { + "epoch": 2.0008422234699608, + "grad_norm": 0.5523701906204224, + "learning_rate": 3.0153249029958296e-06, + "loss": 0.3218, + "step": 7127 + }, + { + "epoch": 2.001122964626614, + "grad_norm": 0.5687662363052368, + "learning_rate": 3.013825840522112e-06, + "loss": 0.3426, + "step": 7128 + }, + { + "epoch": 2.0014037057832677, + "grad_norm": 0.5287777781486511, + "learning_rate": 3.012326990000551e-06, + "loss": 0.3301, + "step": 7129 + }, + { + "epoch": 2.0016844469399215, + "grad_norm": 0.5678732991218567, + "learning_rate": 3.0108283515910914e-06, + "loss": 0.3635, + "step": 7130 + }, + { + "epoch": 2.001965188096575, + "grad_norm": 0.5132319331169128, + "learning_rate": 3.00932992545366e-06, + "loss": 0.3438, + "step": 7131 + }, + { + "epoch": 2.0022459292532284, + "grad_norm": 0.5629129409790039, + "learning_rate": 3.0078317117481602e-06, + "loss": 0.3202, + "step": 7132 + }, + { + "epoch": 2.0025266704098823, + "grad_norm": 0.584525465965271, + "learning_rate": 3.0063337106344713e-06, + "loss": 0.3132, + "step": 7133 + }, + { + "epoch": 2.0028074115665357, + "grad_norm": 0.5917782783508301, + "learning_rate": 3.004835922272452e-06, + "loss": 0.313, + "step": 7134 + }, + { + "epoch": 2.003088152723189, + "grad_norm": 0.5950379967689514, + "learning_rate": 3.003338346821936e-06, + "loss": 0.3386, + "step": 7135 + }, + { + "epoch": 2.0033688938798426, + "grad_norm": 0.6072313189506531, + "learning_rate": 3.001840984442734e-06, + "loss": 0.3626, + "step": 7136 + }, + { + "epoch": 2.0036496350364965, + "grad_norm": 0.5548645257949829, + "learning_rate": 3.0003438352946355e-06, + "loss": 0.3279, + "step": 7137 + }, + { + "epoch": 2.00393037619315, + "grad_norm": 0.6289433836936951, + "learning_rate": 2.9988468995374093e-06, + "loss": 0.3405, + "step": 7138 + }, + { + "epoch": 2.0042111173498034, + "grad_norm": 0.5315924286842346, + "learning_rate": 2.9973501773307984e-06, + "loss": 0.3065, + "step": 7139 + }, + { + "epoch": 2.004491858506457, + "grad_norm": 0.5503185987472534, + "learning_rate": 2.995853668834522e-06, + "loss": 0.3563, + "step": 7140 + }, + { + "epoch": 2.0047725996631107, + "grad_norm": 0.5823440551757812, + "learning_rate": 2.9943573742082793e-06, + "loss": 0.3198, + "step": 7141 + }, + { + "epoch": 2.005053340819764, + "grad_norm": 0.5085039734840393, + "learning_rate": 2.9928612936117453e-06, + "loss": 0.2783, + "step": 7142 + }, + { + "epoch": 2.0053340819764176, + "grad_norm": 0.607925534248352, + "learning_rate": 2.9913654272045723e-06, + "loss": 0.3569, + "step": 7143 + }, + { + "epoch": 2.0056148231330715, + "grad_norm": 0.5201929211616516, + "learning_rate": 2.9898697751463903e-06, + "loss": 0.348, + "step": 7144 + }, + { + "epoch": 2.005895564289725, + "grad_norm": 0.6306862831115723, + "learning_rate": 2.9883743375968067e-06, + "loss": 0.3693, + "step": 7145 + }, + { + "epoch": 2.0061763054463784, + "grad_norm": 0.5337955951690674, + "learning_rate": 2.986879114715403e-06, + "loss": 0.31, + "step": 7146 + }, + { + "epoch": 2.006457046603032, + "grad_norm": 0.571149468421936, + "learning_rate": 2.985384106661742e-06, + "loss": 0.353, + "step": 7147 + }, + { + "epoch": 2.0067377877596857, + "grad_norm": 0.6062653064727783, + "learning_rate": 2.9838893135953604e-06, + "loss": 0.3339, + "step": 7148 + }, + { + "epoch": 2.007018528916339, + "grad_norm": 0.563480794429779, + "learning_rate": 2.9823947356757744e-06, + "loss": 0.3189, + "step": 7149 + }, + { + "epoch": 2.0072992700729926, + "grad_norm": 0.5251970291137695, + "learning_rate": 2.980900373062475e-06, + "loss": 0.3331, + "step": 7150 + }, + { + "epoch": 2.0075800112296465, + "grad_norm": 0.6197962164878845, + "learning_rate": 2.979406225914933e-06, + "loss": 0.3201, + "step": 7151 + }, + { + "epoch": 2.0078607523863, + "grad_norm": 0.6260367631912231, + "learning_rate": 2.9779122943925924e-06, + "loss": 0.3471, + "step": 7152 + }, + { + "epoch": 2.0081414935429533, + "grad_norm": 0.5870199799537659, + "learning_rate": 2.976418578654877e-06, + "loss": 0.324, + "step": 7153 + }, + { + "epoch": 2.008422234699607, + "grad_norm": 0.6012134552001953, + "learning_rate": 2.9749250788611884e-06, + "loss": 0.3401, + "step": 7154 + }, + { + "epoch": 2.0087029758562607, + "grad_norm": 0.5963485836982727, + "learning_rate": 2.9734317951709008e-06, + "loss": 0.3312, + "step": 7155 + }, + { + "epoch": 2.008983717012914, + "grad_norm": 0.5649819374084473, + "learning_rate": 2.97193872774337e-06, + "loss": 0.2976, + "step": 7156 + }, + { + "epoch": 2.0092644581695676, + "grad_norm": 0.5973899960517883, + "learning_rate": 2.9704458767379274e-06, + "loss": 0.3381, + "step": 7157 + }, + { + "epoch": 2.0095451993262214, + "grad_norm": 0.6624057292938232, + "learning_rate": 2.968953242313879e-06, + "loss": 0.3136, + "step": 7158 + }, + { + "epoch": 2.009825940482875, + "grad_norm": 0.5636825561523438, + "learning_rate": 2.9674608246305103e-06, + "loss": 0.3478, + "step": 7159 + }, + { + "epoch": 2.0101066816395283, + "grad_norm": 0.6221297383308411, + "learning_rate": 2.965968623847083e-06, + "loss": 0.3079, + "step": 7160 + }, + { + "epoch": 2.0103874227961818, + "grad_norm": 0.6307937502861023, + "learning_rate": 2.964476640122835e-06, + "loss": 0.287, + "step": 7161 + }, + { + "epoch": 2.0106681639528357, + "grad_norm": 0.5652309656143188, + "learning_rate": 2.9629848736169825e-06, + "loss": 0.3627, + "step": 7162 + }, + { + "epoch": 2.010948905109489, + "grad_norm": 0.6897119283676147, + "learning_rate": 2.9614933244887154e-06, + "loss": 0.3268, + "step": 7163 + }, + { + "epoch": 2.0112296462661425, + "grad_norm": 0.6074008345603943, + "learning_rate": 2.9600019928972057e-06, + "loss": 0.3863, + "step": 7164 + }, + { + "epoch": 2.011510387422796, + "grad_norm": 0.6044426560401917, + "learning_rate": 2.958510879001597e-06, + "loss": 0.3171, + "step": 7165 + }, + { + "epoch": 2.01179112857945, + "grad_norm": 0.5722121596336365, + "learning_rate": 2.9570199829610123e-06, + "loss": 0.3659, + "step": 7166 + }, + { + "epoch": 2.0120718697361033, + "grad_norm": 0.6430336236953735, + "learning_rate": 2.955529304934551e-06, + "loss": 0.2792, + "step": 7167 + }, + { + "epoch": 2.0123526108927567, + "grad_norm": 0.5799283385276794, + "learning_rate": 2.9540388450812874e-06, + "loss": 0.3151, + "step": 7168 + }, + { + "epoch": 2.0126333520494106, + "grad_norm": 0.6603776812553406, + "learning_rate": 2.9525486035602758e-06, + "loss": 0.3062, + "step": 7169 + }, + { + "epoch": 2.012914093206064, + "grad_norm": 0.6019997596740723, + "learning_rate": 2.9510585805305447e-06, + "loss": 0.3136, + "step": 7170 + }, + { + "epoch": 2.0131948343627175, + "grad_norm": 0.6055475473403931, + "learning_rate": 2.949568776151101e-06, + "loss": 0.3324, + "step": 7171 + }, + { + "epoch": 2.013475575519371, + "grad_norm": 0.5041202902793884, + "learning_rate": 2.948079190580927e-06, + "loss": 0.2982, + "step": 7172 + }, + { + "epoch": 2.013756316676025, + "grad_norm": 0.5663584470748901, + "learning_rate": 2.9465898239789815e-06, + "loss": 0.365, + "step": 7173 + }, + { + "epoch": 2.0140370578326783, + "grad_norm": 0.6115890145301819, + "learning_rate": 2.9451006765042e-06, + "loss": 0.3233, + "step": 7174 + }, + { + "epoch": 2.0143177989893317, + "grad_norm": 0.6371398568153381, + "learning_rate": 2.943611748315496e-06, + "loss": 0.3415, + "step": 7175 + }, + { + "epoch": 2.0145985401459856, + "grad_norm": 0.5430166721343994, + "learning_rate": 2.9421230395717582e-06, + "loss": 0.3388, + "step": 7176 + }, + { + "epoch": 2.014879281302639, + "grad_norm": 0.5949511528015137, + "learning_rate": 2.940634550431852e-06, + "loss": 0.3041, + "step": 7177 + }, + { + "epoch": 2.0151600224592925, + "grad_norm": 0.5818724632263184, + "learning_rate": 2.939146281054622e-06, + "loss": 0.3297, + "step": 7178 + }, + { + "epoch": 2.015440763615946, + "grad_norm": 0.549331784248352, + "learning_rate": 2.9376582315988845e-06, + "loss": 0.3182, + "step": 7179 + }, + { + "epoch": 2.0157215047726, + "grad_norm": 0.5347083210945129, + "learning_rate": 2.9361704022234354e-06, + "loss": 0.3378, + "step": 7180 + }, + { + "epoch": 2.0160022459292533, + "grad_norm": 0.5704168081283569, + "learning_rate": 2.934682793087047e-06, + "loss": 0.3576, + "step": 7181 + }, + { + "epoch": 2.0162829870859067, + "grad_norm": 0.5149984359741211, + "learning_rate": 2.9331954043484672e-06, + "loss": 0.319, + "step": 7182 + }, + { + "epoch": 2.01656372824256, + "grad_norm": 0.568211019039154, + "learning_rate": 2.9317082361664213e-06, + "loss": 0.3445, + "step": 7183 + }, + { + "epoch": 2.016844469399214, + "grad_norm": 0.5676230788230896, + "learning_rate": 2.9302212886996107e-06, + "loss": 0.3568, + "step": 7184 + }, + { + "epoch": 2.0171252105558675, + "grad_norm": 0.5790365934371948, + "learning_rate": 2.928734562106714e-06, + "loss": 0.3476, + "step": 7185 + }, + { + "epoch": 2.017405951712521, + "grad_norm": 0.6276818513870239, + "learning_rate": 2.9272480565463836e-06, + "loss": 0.3584, + "step": 7186 + }, + { + "epoch": 2.017686692869175, + "grad_norm": 0.6061694622039795, + "learning_rate": 2.9257617721772508e-06, + "loss": 0.2938, + "step": 7187 + }, + { + "epoch": 2.0179674340258282, + "grad_norm": 0.5798271298408508, + "learning_rate": 2.924275709157922e-06, + "loss": 0.3567, + "step": 7188 + }, + { + "epoch": 2.0182481751824817, + "grad_norm": 0.5810397863388062, + "learning_rate": 2.9227898676469824e-06, + "loss": 0.3068, + "step": 7189 + }, + { + "epoch": 2.018528916339135, + "grad_norm": 0.6163998246192932, + "learning_rate": 2.9213042478029908e-06, + "loss": 0.2707, + "step": 7190 + }, + { + "epoch": 2.018809657495789, + "grad_norm": 0.5872731804847717, + "learning_rate": 2.919818849784483e-06, + "loss": 0.3263, + "step": 7191 + }, + { + "epoch": 2.0190903986524424, + "grad_norm": 0.5344861149787903, + "learning_rate": 2.9183336737499733e-06, + "loss": 0.3386, + "step": 7192 + }, + { + "epoch": 2.019371139809096, + "grad_norm": 0.6087139844894409, + "learning_rate": 2.9168487198579465e-06, + "loss": 0.321, + "step": 7193 + }, + { + "epoch": 2.0196518809657498, + "grad_norm": 0.6097393035888672, + "learning_rate": 2.915363988266873e-06, + "loss": 0.3357, + "step": 7194 + }, + { + "epoch": 2.019932622122403, + "grad_norm": 0.5947750210762024, + "learning_rate": 2.9138794791351877e-06, + "loss": 0.2989, + "step": 7195 + }, + { + "epoch": 2.0202133632790567, + "grad_norm": 0.576885998249054, + "learning_rate": 2.9123951926213145e-06, + "loss": 0.3854, + "step": 7196 + }, + { + "epoch": 2.02049410443571, + "grad_norm": 0.5861470699310303, + "learning_rate": 2.9109111288836443e-06, + "loss": 0.3753, + "step": 7197 + }, + { + "epoch": 2.020774845592364, + "grad_norm": 0.5560078024864197, + "learning_rate": 2.909427288080545e-06, + "loss": 0.3268, + "step": 7198 + }, + { + "epoch": 2.0210555867490174, + "grad_norm": 0.596595048904419, + "learning_rate": 2.9079436703703676e-06, + "loss": 0.3612, + "step": 7199 + }, + { + "epoch": 2.021336327905671, + "grad_norm": 0.5585770606994629, + "learning_rate": 2.9064602759114295e-06, + "loss": 0.3158, + "step": 7200 + }, + { + "epoch": 2.0216170690623247, + "grad_norm": 0.5293141007423401, + "learning_rate": 2.9049771048620344e-06, + "loss": 0.3535, + "step": 7201 + }, + { + "epoch": 2.021897810218978, + "grad_norm": 0.6145786046981812, + "learning_rate": 2.903494157380452e-06, + "loss": 0.3221, + "step": 7202 + }, + { + "epoch": 2.0221785513756316, + "grad_norm": 0.5820688605308533, + "learning_rate": 2.902011433624938e-06, + "loss": 0.3577, + "step": 7203 + }, + { + "epoch": 2.022459292532285, + "grad_norm": 0.5819162130355835, + "learning_rate": 2.900528933753718e-06, + "loss": 0.2765, + "step": 7204 + }, + { + "epoch": 2.022740033688939, + "grad_norm": 0.567573606967926, + "learning_rate": 2.899046657924992e-06, + "loss": 0.2921, + "step": 7205 + }, + { + "epoch": 2.0230207748455924, + "grad_norm": 0.5665693283081055, + "learning_rate": 2.8975646062969432e-06, + "loss": 0.3204, + "step": 7206 + }, + { + "epoch": 2.023301516002246, + "grad_norm": 0.5993481874465942, + "learning_rate": 2.8960827790277234e-06, + "loss": 0.3456, + "step": 7207 + }, + { + "epoch": 2.0235822571588993, + "grad_norm": 0.5737941861152649, + "learning_rate": 2.894601176275469e-06, + "loss": 0.3182, + "step": 7208 + }, + { + "epoch": 2.023862998315553, + "grad_norm": 0.531103253364563, + "learning_rate": 2.893119798198284e-06, + "loss": 0.3492, + "step": 7209 + }, + { + "epoch": 2.0241437394722066, + "grad_norm": 0.5729021430015564, + "learning_rate": 2.89163864495425e-06, + "loss": 0.3027, + "step": 7210 + }, + { + "epoch": 2.02442448062886, + "grad_norm": 0.5614590048789978, + "learning_rate": 2.8901577167014303e-06, + "loss": 0.3185, + "step": 7211 + }, + { + "epoch": 2.024705221785514, + "grad_norm": 0.5724241137504578, + "learning_rate": 2.8886770135978582e-06, + "loss": 0.3081, + "step": 7212 + }, + { + "epoch": 2.0249859629421674, + "grad_norm": 0.489589661359787, + "learning_rate": 2.8871965358015467e-06, + "loss": 0.3074, + "step": 7213 + }, + { + "epoch": 2.025266704098821, + "grad_norm": 0.499676913022995, + "learning_rate": 2.885716283470481e-06, + "loss": 0.298, + "step": 7214 + }, + { + "epoch": 2.0255474452554743, + "grad_norm": 0.6236593723297119, + "learning_rate": 2.884236256762625e-06, + "loss": 0.3566, + "step": 7215 + }, + { + "epoch": 2.025828186412128, + "grad_norm": 0.57680743932724, + "learning_rate": 2.882756455835921e-06, + "loss": 0.3436, + "step": 7216 + }, + { + "epoch": 2.0261089275687816, + "grad_norm": 0.5888566374778748, + "learning_rate": 2.881276880848279e-06, + "loss": 0.3102, + "step": 7217 + }, + { + "epoch": 2.026389668725435, + "grad_norm": 0.6607045531272888, + "learning_rate": 2.879797531957596e-06, + "loss": 0.2672, + "step": 7218 + }, + { + "epoch": 2.026670409882089, + "grad_norm": 0.5833709836006165, + "learning_rate": 2.8783184093217355e-06, + "loss": 0.3353, + "step": 7219 + }, + { + "epoch": 2.0269511510387423, + "grad_norm": 0.5592872500419617, + "learning_rate": 2.8768395130985377e-06, + "loss": 0.3496, + "step": 7220 + }, + { + "epoch": 2.027231892195396, + "grad_norm": 0.5389991402626038, + "learning_rate": 2.875360843445827e-06, + "loss": 0.3629, + "step": 7221 + }, + { + "epoch": 2.0275126333520492, + "grad_norm": 0.5927066206932068, + "learning_rate": 2.873882400521392e-06, + "loss": 0.3587, + "step": 7222 + }, + { + "epoch": 2.027793374508703, + "grad_norm": 0.5337790250778198, + "learning_rate": 2.8724041844830076e-06, + "loss": 0.3152, + "step": 7223 + }, + { + "epoch": 2.0280741156653566, + "grad_norm": 0.5419126152992249, + "learning_rate": 2.870926195488417e-06, + "loss": 0.4049, + "step": 7224 + }, + { + "epoch": 2.02835485682201, + "grad_norm": 0.5905657410621643, + "learning_rate": 2.8694484336953444e-06, + "loss": 0.3667, + "step": 7225 + }, + { + "epoch": 2.028635597978664, + "grad_norm": 0.5812539458274841, + "learning_rate": 2.8679708992614857e-06, + "loss": 0.329, + "step": 7226 + }, + { + "epoch": 2.0289163391353173, + "grad_norm": 0.558904767036438, + "learning_rate": 2.8664935923445125e-06, + "loss": 0.3185, + "step": 7227 + }, + { + "epoch": 2.0291970802919708, + "grad_norm": 0.5703031420707703, + "learning_rate": 2.865016513102078e-06, + "loss": 0.3579, + "step": 7228 + }, + { + "epoch": 2.029477821448624, + "grad_norm": 0.5999102592468262, + "learning_rate": 2.8635396616918027e-06, + "loss": 0.3208, + "step": 7229 + }, + { + "epoch": 2.029758562605278, + "grad_norm": 0.6106962561607361, + "learning_rate": 2.8620630382712903e-06, + "loss": 0.3185, + "step": 7230 + }, + { + "epoch": 2.0300393037619315, + "grad_norm": 0.6072873473167419, + "learning_rate": 2.8605866429981167e-06, + "loss": 0.3138, + "step": 7231 + }, + { + "epoch": 2.030320044918585, + "grad_norm": 0.5712512731552124, + "learning_rate": 2.8591104760298293e-06, + "loss": 0.3497, + "step": 7232 + }, + { + "epoch": 2.0306007860752384, + "grad_norm": 0.5625197887420654, + "learning_rate": 2.8576345375239612e-06, + "loss": 0.3324, + "step": 7233 + }, + { + "epoch": 2.0308815272318923, + "grad_norm": 0.5673905611038208, + "learning_rate": 2.8561588276380103e-06, + "loss": 0.3135, + "step": 7234 + }, + { + "epoch": 2.0311622683885457, + "grad_norm": 0.5926187038421631, + "learning_rate": 2.85468334652946e-06, + "loss": 0.3175, + "step": 7235 + }, + { + "epoch": 2.031443009545199, + "grad_norm": 0.638444721698761, + "learning_rate": 2.8532080943557593e-06, + "loss": 0.3031, + "step": 7236 + }, + { + "epoch": 2.031723750701853, + "grad_norm": 0.607799232006073, + "learning_rate": 2.851733071274344e-06, + "loss": 0.3747, + "step": 7237 + }, + { + "epoch": 2.0320044918585065, + "grad_norm": 0.6229817867279053, + "learning_rate": 2.8502582774426156e-06, + "loss": 0.3466, + "step": 7238 + }, + { + "epoch": 2.03228523301516, + "grad_norm": 0.5465052723884583, + "learning_rate": 2.8487837130179514e-06, + "loss": 0.3511, + "step": 7239 + }, + { + "epoch": 2.0325659741718134, + "grad_norm": 0.519872784614563, + "learning_rate": 2.8473093781577156e-06, + "loss": 0.3977, + "step": 7240 + }, + { + "epoch": 2.0328467153284673, + "grad_norm": 0.5261691212654114, + "learning_rate": 2.845835273019237e-06, + "loss": 0.3656, + "step": 7241 + }, + { + "epoch": 2.0331274564851207, + "grad_norm": 0.5705521702766418, + "learning_rate": 2.8443613977598193e-06, + "loss": 0.3554, + "step": 7242 + }, + { + "epoch": 2.033408197641774, + "grad_norm": 0.6061201095581055, + "learning_rate": 2.842887752536751e-06, + "loss": 0.2866, + "step": 7243 + }, + { + "epoch": 2.033688938798428, + "grad_norm": 0.5863782167434692, + "learning_rate": 2.841414337507285e-06, + "loss": 0.3359, + "step": 7244 + }, + { + "epoch": 2.0339696799550815, + "grad_norm": 0.5974136590957642, + "learning_rate": 2.83994115282866e-06, + "loss": 0.3359, + "step": 7245 + }, + { + "epoch": 2.034250421111735, + "grad_norm": 0.6383056640625, + "learning_rate": 2.838468198658082e-06, + "loss": 0.3143, + "step": 7246 + }, + { + "epoch": 2.0345311622683884, + "grad_norm": 0.6040796637535095, + "learning_rate": 2.836995475152735e-06, + "loss": 0.3502, + "step": 7247 + }, + { + "epoch": 2.0348119034250423, + "grad_norm": 0.570811927318573, + "learning_rate": 2.8355229824697818e-06, + "loss": 0.3429, + "step": 7248 + }, + { + "epoch": 2.0350926445816957, + "grad_norm": 0.5666947364807129, + "learning_rate": 2.834050720766353e-06, + "loss": 0.3187, + "step": 7249 + }, + { + "epoch": 2.035373385738349, + "grad_norm": 0.5729508996009827, + "learning_rate": 2.832578690199565e-06, + "loss": 0.3523, + "step": 7250 + }, + { + "epoch": 2.035654126895003, + "grad_norm": 0.5670958161354065, + "learning_rate": 2.8311068909264987e-06, + "loss": 0.321, + "step": 7251 + }, + { + "epoch": 2.0359348680516565, + "grad_norm": 1.3062509298324585, + "learning_rate": 2.8296353231042197e-06, + "loss": 0.3171, + "step": 7252 + }, + { + "epoch": 2.03621560920831, + "grad_norm": 0.6545298099517822, + "learning_rate": 2.8281639868897627e-06, + "loss": 0.3205, + "step": 7253 + }, + { + "epoch": 2.0364963503649633, + "grad_norm": 0.6367155909538269, + "learning_rate": 2.8266928824401363e-06, + "loss": 0.358, + "step": 7254 + }, + { + "epoch": 2.0367770915216172, + "grad_norm": 0.5915209054946899, + "learning_rate": 2.825222009912333e-06, + "loss": 0.3099, + "step": 7255 + }, + { + "epoch": 2.0370578326782707, + "grad_norm": 0.6656043529510498, + "learning_rate": 2.823751369463311e-06, + "loss": 0.2923, + "step": 7256 + }, + { + "epoch": 2.037338573834924, + "grad_norm": 0.5338889956474304, + "learning_rate": 2.8222809612500114e-06, + "loss": 0.3487, + "step": 7257 + }, + { + "epoch": 2.0376193149915776, + "grad_norm": 0.5747395157814026, + "learning_rate": 2.8208107854293455e-06, + "loss": 0.3254, + "step": 7258 + }, + { + "epoch": 2.0379000561482314, + "grad_norm": 0.556736409664154, + "learning_rate": 2.819340842158199e-06, + "loss": 0.3191, + "step": 7259 + }, + { + "epoch": 2.038180797304885, + "grad_norm": 0.6049169301986694, + "learning_rate": 2.8178711315934395e-06, + "loss": 0.3305, + "step": 7260 + }, + { + "epoch": 2.0384615384615383, + "grad_norm": 0.6023212671279907, + "learning_rate": 2.8164016538919005e-06, + "loss": 0.3385, + "step": 7261 + }, + { + "epoch": 2.038742279618192, + "grad_norm": 0.6171841025352478, + "learning_rate": 2.8149324092104e-06, + "loss": 0.3173, + "step": 7262 + }, + { + "epoch": 2.0390230207748457, + "grad_norm": 0.6021910905838013, + "learning_rate": 2.8134633977057236e-06, + "loss": 0.3476, + "step": 7263 + }, + { + "epoch": 2.039303761931499, + "grad_norm": 0.5984469056129456, + "learning_rate": 2.8119946195346375e-06, + "loss": 0.3288, + "step": 7264 + }, + { + "epoch": 2.0395845030881525, + "grad_norm": 0.525161862373352, + "learning_rate": 2.8105260748538778e-06, + "loss": 0.3625, + "step": 7265 + }, + { + "epoch": 2.0398652442448064, + "grad_norm": 0.613210916519165, + "learning_rate": 2.80905776382016e-06, + "loss": 0.3574, + "step": 7266 + }, + { + "epoch": 2.04014598540146, + "grad_norm": 0.5665947794914246, + "learning_rate": 2.807589686590174e-06, + "loss": 0.3543, + "step": 7267 + }, + { + "epoch": 2.0404267265581133, + "grad_norm": 0.5441635847091675, + "learning_rate": 2.806121843320584e-06, + "loss": 0.3469, + "step": 7268 + }, + { + "epoch": 2.040707467714767, + "grad_norm": 0.6160149574279785, + "learning_rate": 2.804654234168026e-06, + "loss": 0.3073, + "step": 7269 + }, + { + "epoch": 2.0409882088714206, + "grad_norm": 0.5180505514144897, + "learning_rate": 2.8031868592891177e-06, + "loss": 0.3089, + "step": 7270 + }, + { + "epoch": 2.041268950028074, + "grad_norm": 0.5670261979103088, + "learning_rate": 2.801719718840445e-06, + "loss": 0.3112, + "step": 7271 + }, + { + "epoch": 2.0415496911847275, + "grad_norm": 0.5812737941741943, + "learning_rate": 2.8002528129785755e-06, + "loss": 0.3412, + "step": 7272 + }, + { + "epoch": 2.0418304323413814, + "grad_norm": 0.6227341890335083, + "learning_rate": 2.798786141860045e-06, + "loss": 0.2957, + "step": 7273 + }, + { + "epoch": 2.042111173498035, + "grad_norm": 0.6089826226234436, + "learning_rate": 2.7973197056413705e-06, + "loss": 0.3218, + "step": 7274 + }, + { + "epoch": 2.0423919146546883, + "grad_norm": 0.5416876077651978, + "learning_rate": 2.795853504479039e-06, + "loss": 0.3312, + "step": 7275 + }, + { + "epoch": 2.0426726558113417, + "grad_norm": 0.5602169036865234, + "learning_rate": 2.794387538529514e-06, + "loss": 0.311, + "step": 7276 + }, + { + "epoch": 2.0429533969679956, + "grad_norm": 0.5964089632034302, + "learning_rate": 2.792921807949236e-06, + "loss": 0.2815, + "step": 7277 + }, + { + "epoch": 2.043234138124649, + "grad_norm": 0.5287725925445557, + "learning_rate": 2.7914563128946165e-06, + "loss": 0.3178, + "step": 7278 + }, + { + "epoch": 2.0435148792813025, + "grad_norm": 0.5968223214149475, + "learning_rate": 2.7899910535220463e-06, + "loss": 0.3444, + "step": 7279 + }, + { + "epoch": 2.0437956204379564, + "grad_norm": 0.5191061496734619, + "learning_rate": 2.788526029987889e-06, + "loss": 0.3306, + "step": 7280 + }, + { + "epoch": 2.04407636159461, + "grad_norm": 0.6092125773429871, + "learning_rate": 2.7870612424484787e-06, + "loss": 0.3279, + "step": 7281 + }, + { + "epoch": 2.0443571027512633, + "grad_norm": 0.6415168642997742, + "learning_rate": 2.785596691060134e-06, + "loss": 0.312, + "step": 7282 + }, + { + "epoch": 2.0446378439079167, + "grad_norm": 0.5579442381858826, + "learning_rate": 2.784132375979137e-06, + "loss": 0.3075, + "step": 7283 + }, + { + "epoch": 2.0449185850645706, + "grad_norm": 0.5816577076911926, + "learning_rate": 2.7826682973617556e-06, + "loss": 0.3102, + "step": 7284 + }, + { + "epoch": 2.045199326221224, + "grad_norm": 0.502070963382721, + "learning_rate": 2.7812044553642232e-06, + "loss": 0.3539, + "step": 7285 + }, + { + "epoch": 2.0454800673778775, + "grad_norm": 0.5898591876029968, + "learning_rate": 2.7797408501427547e-06, + "loss": 0.3461, + "step": 7286 + }, + { + "epoch": 2.0457608085345313, + "grad_norm": 0.600614070892334, + "learning_rate": 2.778277481853537e-06, + "loss": 0.3603, + "step": 7287 + }, + { + "epoch": 2.046041549691185, + "grad_norm": 0.5811378955841064, + "learning_rate": 2.776814350652728e-06, + "loss": 0.3374, + "step": 7288 + }, + { + "epoch": 2.0463222908478382, + "grad_norm": 0.5325272679328918, + "learning_rate": 2.775351456696468e-06, + "loss": 0.3436, + "step": 7289 + }, + { + "epoch": 2.0466030320044917, + "grad_norm": 0.5729573369026184, + "learning_rate": 2.773888800140865e-06, + "loss": 0.3748, + "step": 7290 + }, + { + "epoch": 2.0468837731611456, + "grad_norm": 0.5439331531524658, + "learning_rate": 2.772426381142005e-06, + "loss": 0.3477, + "step": 7291 + }, + { + "epoch": 2.047164514317799, + "grad_norm": 0.5973528623580933, + "learning_rate": 2.7709641998559523e-06, + "loss": 0.3341, + "step": 7292 + }, + { + "epoch": 2.0474452554744524, + "grad_norm": 0.5766603350639343, + "learning_rate": 2.769502256438736e-06, + "loss": 0.349, + "step": 7293 + }, + { + "epoch": 2.0477259966311063, + "grad_norm": 0.5928993821144104, + "learning_rate": 2.7680405510463702e-06, + "loss": 0.3063, + "step": 7294 + }, + { + "epoch": 2.0480067377877598, + "grad_norm": 0.5455955266952515, + "learning_rate": 2.7665790838348377e-06, + "loss": 0.3539, + "step": 7295 + }, + { + "epoch": 2.048287478944413, + "grad_norm": 0.5615442991256714, + "learning_rate": 2.7651178549600942e-06, + "loss": 0.3393, + "step": 7296 + }, + { + "epoch": 2.0485682201010667, + "grad_norm": 0.5350291132926941, + "learning_rate": 2.763656864578078e-06, + "loss": 0.2635, + "step": 7297 + }, + { + "epoch": 2.0488489612577205, + "grad_norm": 0.6523261070251465, + "learning_rate": 2.762196112844692e-06, + "loss": 0.3197, + "step": 7298 + }, + { + "epoch": 2.049129702414374, + "grad_norm": 0.612379252910614, + "learning_rate": 2.760735599915823e-06, + "loss": 0.356, + "step": 7299 + }, + { + "epoch": 2.0494104435710274, + "grad_norm": 0.5997228622436523, + "learning_rate": 2.759275325947324e-06, + "loss": 0.3303, + "step": 7300 + }, + { + "epoch": 2.0496911847276813, + "grad_norm": 0.5357117652893066, + "learning_rate": 2.7578152910950297e-06, + "loss": 0.3508, + "step": 7301 + }, + { + "epoch": 2.0499719258843347, + "grad_norm": 0.5867998600006104, + "learning_rate": 2.7563554955147436e-06, + "loss": 0.3299, + "step": 7302 + }, + { + "epoch": 2.050252667040988, + "grad_norm": 0.6556015610694885, + "learning_rate": 2.7548959393622454e-06, + "loss": 0.3312, + "step": 7303 + }, + { + "epoch": 2.0505334081976416, + "grad_norm": 0.5935055613517761, + "learning_rate": 2.7534366227932928e-06, + "loss": 0.3327, + "step": 7304 + }, + { + "epoch": 2.0508141493542955, + "grad_norm": 0.5554129481315613, + "learning_rate": 2.751977545963612e-06, + "loss": 0.3316, + "step": 7305 + }, + { + "epoch": 2.051094890510949, + "grad_norm": 0.5492838621139526, + "learning_rate": 2.75051870902891e-06, + "loss": 0.3548, + "step": 7306 + }, + { + "epoch": 2.0513756316676024, + "grad_norm": 0.58622807264328, + "learning_rate": 2.7490601121448602e-06, + "loss": 0.3336, + "step": 7307 + }, + { + "epoch": 2.051656372824256, + "grad_norm": 0.5542654991149902, + "learning_rate": 2.74760175546712e-06, + "loss": 0.3161, + "step": 7308 + }, + { + "epoch": 2.0519371139809097, + "grad_norm": 0.5774078369140625, + "learning_rate": 2.746143639151313e-06, + "loss": 0.3421, + "step": 7309 + }, + { + "epoch": 2.052217855137563, + "grad_norm": 0.6042695045471191, + "learning_rate": 2.74468576335304e-06, + "loss": 0.3662, + "step": 7310 + }, + { + "epoch": 2.0524985962942166, + "grad_norm": 0.5622040629386902, + "learning_rate": 2.7432281282278788e-06, + "loss": 0.3463, + "step": 7311 + }, + { + "epoch": 2.0527793374508705, + "grad_norm": 0.5341228246688843, + "learning_rate": 2.741770733931376e-06, + "loss": 0.3207, + "step": 7312 + }, + { + "epoch": 2.053060078607524, + "grad_norm": 0.5676590204238892, + "learning_rate": 2.7403135806190595e-06, + "loss": 0.3088, + "step": 7313 + }, + { + "epoch": 2.0533408197641774, + "grad_norm": 0.4851858913898468, + "learning_rate": 2.738856668446426e-06, + "loss": 0.3173, + "step": 7314 + }, + { + "epoch": 2.053621560920831, + "grad_norm": 0.5754046440124512, + "learning_rate": 2.737399997568943e-06, + "loss": 0.3401, + "step": 7315 + }, + { + "epoch": 2.0539023020774847, + "grad_norm": 0.5998167991638184, + "learning_rate": 2.7359435681420665e-06, + "loss": 0.3409, + "step": 7316 + }, + { + "epoch": 2.054183043234138, + "grad_norm": 0.6578349471092224, + "learning_rate": 2.734487380321213e-06, + "loss": 0.3205, + "step": 7317 + }, + { + "epoch": 2.0544637843907916, + "grad_norm": 0.5938233137130737, + "learning_rate": 2.7330314342617758e-06, + "loss": 0.3401, + "step": 7318 + }, + { + "epoch": 2.0547445255474455, + "grad_norm": 0.5376281142234802, + "learning_rate": 2.7315757301191293e-06, + "loss": 0.3411, + "step": 7319 + }, + { + "epoch": 2.055025266704099, + "grad_norm": 0.5516668558120728, + "learning_rate": 2.730120268048612e-06, + "loss": 0.2793, + "step": 7320 + }, + { + "epoch": 2.0553060078607523, + "grad_norm": 0.568037748336792, + "learning_rate": 2.728665048205546e-06, + "loss": 0.3125, + "step": 7321 + }, + { + "epoch": 2.055586749017406, + "grad_norm": 0.592534601688385, + "learning_rate": 2.72721007074522e-06, + "loss": 0.3567, + "step": 7322 + }, + { + "epoch": 2.0558674901740597, + "grad_norm": 0.5778653621673584, + "learning_rate": 2.725755335822903e-06, + "loss": 0.341, + "step": 7323 + }, + { + "epoch": 2.056148231330713, + "grad_norm": 0.6206321716308594, + "learning_rate": 2.7243008435938346e-06, + "loss": 0.333, + "step": 7324 + }, + { + "epoch": 2.0564289724873666, + "grad_norm": 0.584697425365448, + "learning_rate": 2.7228465942132264e-06, + "loss": 0.3366, + "step": 7325 + }, + { + "epoch": 2.05670971364402, + "grad_norm": 0.527962327003479, + "learning_rate": 2.721392587836271e-06, + "loss": 0.3553, + "step": 7326 + }, + { + "epoch": 2.056990454800674, + "grad_norm": 0.5727418065071106, + "learning_rate": 2.7199388246181268e-06, + "loss": 0.3608, + "step": 7327 + }, + { + "epoch": 2.0572711959573273, + "grad_norm": 0.604831874370575, + "learning_rate": 2.718485304713935e-06, + "loss": 0.3145, + "step": 7328 + }, + { + "epoch": 2.0575519371139808, + "grad_norm": 0.5907540917396545, + "learning_rate": 2.717032028278803e-06, + "loss": 0.351, + "step": 7329 + }, + { + "epoch": 2.0578326782706347, + "grad_norm": 0.6221470236778259, + "learning_rate": 2.7155789954678146e-06, + "loss": 0.3245, + "step": 7330 + }, + { + "epoch": 2.058113419427288, + "grad_norm": 0.5572207570075989, + "learning_rate": 2.7141262064360318e-06, + "loss": 0.3396, + "step": 7331 + }, + { + "epoch": 2.0583941605839415, + "grad_norm": 0.6000232100486755, + "learning_rate": 2.7126736613384828e-06, + "loss": 0.3458, + "step": 7332 + }, + { + "epoch": 2.058674901740595, + "grad_norm": 0.5176807641983032, + "learning_rate": 2.7112213603301798e-06, + "loss": 0.3384, + "step": 7333 + }, + { + "epoch": 2.058955642897249, + "grad_norm": 0.5626146793365479, + "learning_rate": 2.709769303566097e-06, + "loss": 0.3004, + "step": 7334 + }, + { + "epoch": 2.0592363840539023, + "grad_norm": 0.589306652545929, + "learning_rate": 2.708317491201195e-06, + "loss": 0.3194, + "step": 7335 + }, + { + "epoch": 2.0595171252105557, + "grad_norm": 0.5636805295944214, + "learning_rate": 2.706865923390399e-06, + "loss": 0.3217, + "step": 7336 + }, + { + "epoch": 2.0597978663672096, + "grad_norm": 0.6115208864212036, + "learning_rate": 2.7054146002886095e-06, + "loss": 0.3137, + "step": 7337 + }, + { + "epoch": 2.060078607523863, + "grad_norm": 0.6049689650535583, + "learning_rate": 2.7039635220507067e-06, + "loss": 0.3365, + "step": 7338 + }, + { + "epoch": 2.0603593486805165, + "grad_norm": 0.6705333590507507, + "learning_rate": 2.702512688831537e-06, + "loss": 0.3068, + "step": 7339 + }, + { + "epoch": 2.06064008983717, + "grad_norm": 0.5652129650115967, + "learning_rate": 2.7010621007859284e-06, + "loss": 0.2963, + "step": 7340 + }, + { + "epoch": 2.060920830993824, + "grad_norm": 0.5260769128799438, + "learning_rate": 2.6996117580686732e-06, + "loss": 0.3383, + "step": 7341 + }, + { + "epoch": 2.0612015721504773, + "grad_norm": 0.6173593997955322, + "learning_rate": 2.698161660834547e-06, + "loss": 0.3166, + "step": 7342 + }, + { + "epoch": 2.0614823133071307, + "grad_norm": 0.5808701515197754, + "learning_rate": 2.696711809238296e-06, + "loss": 0.3529, + "step": 7343 + }, + { + "epoch": 2.0617630544637846, + "grad_norm": 0.6364161968231201, + "learning_rate": 2.6952622034346347e-06, + "loss": 0.3702, + "step": 7344 + }, + { + "epoch": 2.062043795620438, + "grad_norm": 0.6393119692802429, + "learning_rate": 2.693812843578262e-06, + "loss": 0.3322, + "step": 7345 + }, + { + "epoch": 2.0623245367770915, + "grad_norm": 0.5216532945632935, + "learning_rate": 2.69236372982384e-06, + "loss": 0.3337, + "step": 7346 + }, + { + "epoch": 2.062605277933745, + "grad_norm": 0.6000442504882812, + "learning_rate": 2.6909148623260097e-06, + "loss": 0.3581, + "step": 7347 + }, + { + "epoch": 2.062886019090399, + "grad_norm": 0.6018159985542297, + "learning_rate": 2.689466241239388e-06, + "loss": 0.3084, + "step": 7348 + }, + { + "epoch": 2.0631667602470523, + "grad_norm": 0.6169343590736389, + "learning_rate": 2.688017866718558e-06, + "loss": 0.3617, + "step": 7349 + }, + { + "epoch": 2.0634475014037057, + "grad_norm": 0.5640069842338562, + "learning_rate": 2.6865697389180867e-06, + "loss": 0.3288, + "step": 7350 + }, + { + "epoch": 2.063728242560359, + "grad_norm": 0.553663432598114, + "learning_rate": 2.6851218579925065e-06, + "loss": 0.3262, + "step": 7351 + }, + { + "epoch": 2.064008983717013, + "grad_norm": 0.5483970642089844, + "learning_rate": 2.6836742240963237e-06, + "loss": 0.334, + "step": 7352 + }, + { + "epoch": 2.0642897248736665, + "grad_norm": 0.5885378122329712, + "learning_rate": 2.682226837384026e-06, + "loss": 0.3375, + "step": 7353 + }, + { + "epoch": 2.06457046603032, + "grad_norm": 0.5467673540115356, + "learning_rate": 2.680779698010065e-06, + "loss": 0.3582, + "step": 7354 + }, + { + "epoch": 2.064851207186974, + "grad_norm": 0.5636199712753296, + "learning_rate": 2.6793328061288736e-06, + "loss": 0.3143, + "step": 7355 + }, + { + "epoch": 2.0651319483436272, + "grad_norm": 0.6640552282333374, + "learning_rate": 2.6778861618948525e-06, + "loss": 0.32, + "step": 7356 + }, + { + "epoch": 2.0654126895002807, + "grad_norm": 0.5755638480186462, + "learning_rate": 2.6764397654623818e-06, + "loss": 0.2938, + "step": 7357 + }, + { + "epoch": 2.065693430656934, + "grad_norm": 0.5817362070083618, + "learning_rate": 2.6749936169858103e-06, + "loss": 0.3094, + "step": 7358 + }, + { + "epoch": 2.065974171813588, + "grad_norm": 0.5755941867828369, + "learning_rate": 2.6735477166194595e-06, + "loss": 0.3351, + "step": 7359 + }, + { + "epoch": 2.0662549129702414, + "grad_norm": 0.5803687572479248, + "learning_rate": 2.672102064517631e-06, + "loss": 0.3208, + "step": 7360 + }, + { + "epoch": 2.066535654126895, + "grad_norm": 0.5887718796730042, + "learning_rate": 2.6706566608345917e-06, + "loss": 0.3207, + "step": 7361 + }, + { + "epoch": 2.0668163952835488, + "grad_norm": 0.5406284332275391, + "learning_rate": 2.6692115057245917e-06, + "loss": 0.3071, + "step": 7362 + }, + { + "epoch": 2.067097136440202, + "grad_norm": 0.5603659152984619, + "learning_rate": 2.6677665993418445e-06, + "loss": 0.3554, + "step": 7363 + }, + { + "epoch": 2.0673778775968557, + "grad_norm": 0.588245689868927, + "learning_rate": 2.6663219418405405e-06, + "loss": 0.3466, + "step": 7364 + }, + { + "epoch": 2.067658618753509, + "grad_norm": 0.5675176978111267, + "learning_rate": 2.6648775333748487e-06, + "loss": 0.3281, + "step": 7365 + }, + { + "epoch": 2.067939359910163, + "grad_norm": 0.5302689075469971, + "learning_rate": 2.6634333740989037e-06, + "loss": 0.3154, + "step": 7366 + }, + { + "epoch": 2.0682201010668164, + "grad_norm": 0.6093424558639526, + "learning_rate": 2.661989464166819e-06, + "loss": 0.2794, + "step": 7367 + }, + { + "epoch": 2.06850084222347, + "grad_norm": 0.5445008277893066, + "learning_rate": 2.6605458037326814e-06, + "loss": 0.3224, + "step": 7368 + }, + { + "epoch": 2.0687815833801233, + "grad_norm": 0.62278151512146, + "learning_rate": 2.6591023929505453e-06, + "loss": 0.3364, + "step": 7369 + }, + { + "epoch": 2.069062324536777, + "grad_norm": 0.5738529562950134, + "learning_rate": 2.6576592319744466e-06, + "loss": 0.3199, + "step": 7370 + }, + { + "epoch": 2.0693430656934306, + "grad_norm": 0.518867015838623, + "learning_rate": 2.656216320958387e-06, + "loss": 0.2954, + "step": 7371 + }, + { + "epoch": 2.069623806850084, + "grad_norm": 0.5508706569671631, + "learning_rate": 2.6547736600563486e-06, + "loss": 0.3511, + "step": 7372 + }, + { + "epoch": 2.069904548006738, + "grad_norm": 0.5410498380661011, + "learning_rate": 2.653331249422281e-06, + "loss": 0.3068, + "step": 7373 + }, + { + "epoch": 2.0701852891633914, + "grad_norm": 0.6112616658210754, + "learning_rate": 2.6518890892101075e-06, + "loss": 0.3378, + "step": 7374 + }, + { + "epoch": 2.070466030320045, + "grad_norm": 0.5861853957176208, + "learning_rate": 2.6504471795737308e-06, + "loss": 0.2802, + "step": 7375 + }, + { + "epoch": 2.0707467714766983, + "grad_norm": 0.5634087324142456, + "learning_rate": 2.6490055206670174e-06, + "loss": 0.3042, + "step": 7376 + }, + { + "epoch": 2.071027512633352, + "grad_norm": 0.5576903223991394, + "learning_rate": 2.647564112643818e-06, + "loss": 0.3534, + "step": 7377 + }, + { + "epoch": 2.0713082537900056, + "grad_norm": 0.6198608875274658, + "learning_rate": 2.646122955657947e-06, + "loss": 0.292, + "step": 7378 + }, + { + "epoch": 2.071588994946659, + "grad_norm": 0.521613359451294, + "learning_rate": 2.644682049863194e-06, + "loss": 0.3613, + "step": 7379 + }, + { + "epoch": 2.071869736103313, + "grad_norm": 0.5087357759475708, + "learning_rate": 2.6432413954133287e-06, + "loss": 0.3411, + "step": 7380 + }, + { + "epoch": 2.0721504772599664, + "grad_norm": 0.5704274773597717, + "learning_rate": 2.6418009924620836e-06, + "loss": 0.3732, + "step": 7381 + }, + { + "epoch": 2.07243121841662, + "grad_norm": 0.5092198848724365, + "learning_rate": 2.6403608411631744e-06, + "loss": 0.3532, + "step": 7382 + }, + { + "epoch": 2.0727119595732733, + "grad_norm": 0.5866182446479797, + "learning_rate": 2.63892094167028e-06, + "loss": 0.315, + "step": 7383 + }, + { + "epoch": 2.072992700729927, + "grad_norm": 0.518237829208374, + "learning_rate": 2.637481294137062e-06, + "loss": 0.309, + "step": 7384 + }, + { + "epoch": 2.0732734418865806, + "grad_norm": 0.56381756067276, + "learning_rate": 2.6360418987171493e-06, + "loss": 0.3084, + "step": 7385 + }, + { + "epoch": 2.073554183043234, + "grad_norm": 0.608199954032898, + "learning_rate": 2.6346027555641422e-06, + "loss": 0.321, + "step": 7386 + }, + { + "epoch": 2.073834924199888, + "grad_norm": 0.5592595338821411, + "learning_rate": 2.6331638648316223e-06, + "loss": 0.3583, + "step": 7387 + }, + { + "epoch": 2.0741156653565413, + "grad_norm": 0.6111178994178772, + "learning_rate": 2.6317252266731337e-06, + "loss": 0.3382, + "step": 7388 + }, + { + "epoch": 2.074396406513195, + "grad_norm": 0.5903767347335815, + "learning_rate": 2.630286841242203e-06, + "loss": 0.3195, + "step": 7389 + }, + { + "epoch": 2.0746771476698482, + "grad_norm": 0.5476042628288269, + "learning_rate": 2.628848708692326e-06, + "loss": 0.3326, + "step": 7390 + }, + { + "epoch": 2.074957888826502, + "grad_norm": 0.6097612977027893, + "learning_rate": 2.627410829176966e-06, + "loss": 0.3061, + "step": 7391 + }, + { + "epoch": 2.0752386299831556, + "grad_norm": 0.5774500370025635, + "learning_rate": 2.6259732028495693e-06, + "loss": 0.3361, + "step": 7392 + }, + { + "epoch": 2.075519371139809, + "grad_norm": 0.5278791189193726, + "learning_rate": 2.624535829863549e-06, + "loss": 0.3266, + "step": 7393 + }, + { + "epoch": 2.075800112296463, + "grad_norm": 0.6039384007453918, + "learning_rate": 2.623098710372295e-06, + "loss": 0.3182, + "step": 7394 + }, + { + "epoch": 2.0760808534531163, + "grad_norm": 0.6215166449546814, + "learning_rate": 2.621661844529165e-06, + "loss": 0.2946, + "step": 7395 + }, + { + "epoch": 2.0763615946097698, + "grad_norm": 0.5878775119781494, + "learning_rate": 2.6202252324874916e-06, + "loss": 0.3404, + "step": 7396 + }, + { + "epoch": 2.076642335766423, + "grad_norm": 0.5666494369506836, + "learning_rate": 2.6187888744005842e-06, + "loss": 0.2673, + "step": 7397 + }, + { + "epoch": 2.076923076923077, + "grad_norm": 0.5629453659057617, + "learning_rate": 2.6173527704217188e-06, + "loss": 0.2981, + "step": 7398 + }, + { + "epoch": 2.0772038180797305, + "grad_norm": 0.5452430248260498, + "learning_rate": 2.6159169207041505e-06, + "loss": 0.3016, + "step": 7399 + }, + { + "epoch": 2.077484559236384, + "grad_norm": 0.6120348572731018, + "learning_rate": 2.6144813254011036e-06, + "loss": 0.2976, + "step": 7400 + }, + { + "epoch": 2.0777653003930374, + "grad_norm": 0.5789759159088135, + "learning_rate": 2.6130459846657723e-06, + "loss": 0.3126, + "step": 7401 + }, + { + "epoch": 2.0780460415496913, + "grad_norm": 0.5597143769264221, + "learning_rate": 2.6116108986513324e-06, + "loss": 0.3379, + "step": 7402 + }, + { + "epoch": 2.0783267827063447, + "grad_norm": 0.6099171042442322, + "learning_rate": 2.6101760675109228e-06, + "loss": 0.3187, + "step": 7403 + }, + { + "epoch": 2.078607523862998, + "grad_norm": 0.6088306903839111, + "learning_rate": 2.6087414913976637e-06, + "loss": 0.297, + "step": 7404 + }, + { + "epoch": 2.078888265019652, + "grad_norm": 0.5763127207756042, + "learning_rate": 2.607307170464641e-06, + "loss": 0.3083, + "step": 7405 + }, + { + "epoch": 2.0791690061763055, + "grad_norm": 0.48565754294395447, + "learning_rate": 2.60587310486492e-06, + "loss": 0.3519, + "step": 7406 + }, + { + "epoch": 2.079449747332959, + "grad_norm": 0.5613904595375061, + "learning_rate": 2.6044392947515326e-06, + "loss": 0.3454, + "step": 7407 + }, + { + "epoch": 2.0797304884896124, + "grad_norm": 0.5467095971107483, + "learning_rate": 2.6030057402774846e-06, + "loss": 0.3246, + "step": 7408 + }, + { + "epoch": 2.0800112296462663, + "grad_norm": 0.6917138695716858, + "learning_rate": 2.60157244159576e-06, + "loss": 0.3632, + "step": 7409 + }, + { + "epoch": 2.0802919708029197, + "grad_norm": 0.5972304344177246, + "learning_rate": 2.600139398859308e-06, + "loss": 0.3007, + "step": 7410 + }, + { + "epoch": 2.080572711959573, + "grad_norm": 0.591606080532074, + "learning_rate": 2.5987066122210574e-06, + "loss": 0.3418, + "step": 7411 + }, + { + "epoch": 2.080853453116227, + "grad_norm": 0.5599426627159119, + "learning_rate": 2.5972740818339048e-06, + "loss": 0.3257, + "step": 7412 + }, + { + "epoch": 2.0811341942728805, + "grad_norm": 0.51194167137146, + "learning_rate": 2.5958418078507187e-06, + "loss": 0.3386, + "step": 7413 + }, + { + "epoch": 2.081414935429534, + "grad_norm": 0.5968789458274841, + "learning_rate": 2.594409790424346e-06, + "loss": 0.331, + "step": 7414 + }, + { + "epoch": 2.0816956765861874, + "grad_norm": 0.5914863348007202, + "learning_rate": 2.592978029707599e-06, + "loss": 0.3506, + "step": 7415 + }, + { + "epoch": 2.0819764177428413, + "grad_norm": 0.5246887803077698, + "learning_rate": 2.5915465258532703e-06, + "loss": 0.3365, + "step": 7416 + }, + { + "epoch": 2.0822571588994947, + "grad_norm": 0.5770841836929321, + "learning_rate": 2.5901152790141175e-06, + "loss": 0.3165, + "step": 7417 + }, + { + "epoch": 2.082537900056148, + "grad_norm": 0.5936367511749268, + "learning_rate": 2.588684289342876e-06, + "loss": 0.2772, + "step": 7418 + }, + { + "epoch": 2.0828186412128016, + "grad_norm": 0.6054359674453735, + "learning_rate": 2.587253556992254e-06, + "loss": 0.3147, + "step": 7419 + }, + { + "epoch": 2.0830993823694555, + "grad_norm": 0.6312696933746338, + "learning_rate": 2.5858230821149267e-06, + "loss": 0.3152, + "step": 7420 + }, + { + "epoch": 2.083380123526109, + "grad_norm": 0.5774433612823486, + "learning_rate": 2.58439286486355e-06, + "loss": 0.3323, + "step": 7421 + }, + { + "epoch": 2.0836608646827623, + "grad_norm": 0.5920672416687012, + "learning_rate": 2.5829629053907436e-06, + "loss": 0.3329, + "step": 7422 + }, + { + "epoch": 2.0839416058394162, + "grad_norm": 0.7043784856796265, + "learning_rate": 2.5815332038491044e-06, + "loss": 0.3182, + "step": 7423 + }, + { + "epoch": 2.0842223469960697, + "grad_norm": 0.6078377366065979, + "learning_rate": 2.5801037603912036e-06, + "loss": 0.3414, + "step": 7424 + }, + { + "epoch": 2.084503088152723, + "grad_norm": 0.5726706385612488, + "learning_rate": 2.5786745751695796e-06, + "loss": 0.2863, + "step": 7425 + }, + { + "epoch": 2.0847838293093766, + "grad_norm": 0.5997820496559143, + "learning_rate": 2.57724564833675e-06, + "loss": 0.298, + "step": 7426 + }, + { + "epoch": 2.0850645704660304, + "grad_norm": 0.6254377365112305, + "learning_rate": 2.575816980045196e-06, + "loss": 0.3404, + "step": 7427 + }, + { + "epoch": 2.085345311622684, + "grad_norm": 0.5725039839744568, + "learning_rate": 2.5743885704473813e-06, + "loss": 0.3222, + "step": 7428 + }, + { + "epoch": 2.0856260527793373, + "grad_norm": 0.608169674873352, + "learning_rate": 2.572960419695734e-06, + "loss": 0.3372, + "step": 7429 + }, + { + "epoch": 2.085906793935991, + "grad_norm": 0.6945512294769287, + "learning_rate": 2.571532527942657e-06, + "loss": 0.2935, + "step": 7430 + }, + { + "epoch": 2.0861875350926447, + "grad_norm": 0.5469156503677368, + "learning_rate": 2.570104895340528e-06, + "loss": 0.3506, + "step": 7431 + }, + { + "epoch": 2.086468276249298, + "grad_norm": 0.5566493272781372, + "learning_rate": 2.5686775220416927e-06, + "loss": 0.3173, + "step": 7432 + }, + { + "epoch": 2.0867490174059515, + "grad_norm": 0.6051676273345947, + "learning_rate": 2.567250408198474e-06, + "loss": 0.3332, + "step": 7433 + }, + { + "epoch": 2.0870297585626054, + "grad_norm": 0.5985397696495056, + "learning_rate": 2.5658235539631636e-06, + "loss": 0.3162, + "step": 7434 + }, + { + "epoch": 2.087310499719259, + "grad_norm": 0.6635539531707764, + "learning_rate": 2.5643969594880253e-06, + "loss": 0.3055, + "step": 7435 + }, + { + "epoch": 2.0875912408759123, + "grad_norm": 0.6240601539611816, + "learning_rate": 2.5629706249252984e-06, + "loss": 0.3411, + "step": 7436 + }, + { + "epoch": 2.087871982032566, + "grad_norm": 0.599671483039856, + "learning_rate": 2.561544550427191e-06, + "loss": 0.3049, + "step": 7437 + }, + { + "epoch": 2.0881527231892196, + "grad_norm": 0.7075299620628357, + "learning_rate": 2.560118736145886e-06, + "loss": 0.3189, + "step": 7438 + }, + { + "epoch": 2.088433464345873, + "grad_norm": 0.5455586314201355, + "learning_rate": 2.558693182233535e-06, + "loss": 0.3386, + "step": 7439 + }, + { + "epoch": 2.0887142055025265, + "grad_norm": 0.5406078696250916, + "learning_rate": 2.5572678888422684e-06, + "loss": 0.3512, + "step": 7440 + }, + { + "epoch": 2.0889949466591804, + "grad_norm": 0.6455414295196533, + "learning_rate": 2.555842856124182e-06, + "loss": 0.2934, + "step": 7441 + }, + { + "epoch": 2.089275687815834, + "grad_norm": 0.663253903388977, + "learning_rate": 2.554418084231346e-06, + "loss": 0.3041, + "step": 7442 + }, + { + "epoch": 2.0895564289724873, + "grad_norm": 0.5740071535110474, + "learning_rate": 2.552993573315803e-06, + "loss": 0.3295, + "step": 7443 + }, + { + "epoch": 2.0898371701291407, + "grad_norm": 0.6303340792655945, + "learning_rate": 2.5515693235295714e-06, + "loss": 0.31, + "step": 7444 + }, + { + "epoch": 2.0901179112857946, + "grad_norm": 0.6651839017868042, + "learning_rate": 2.550145335024633e-06, + "loss": 0.33, + "step": 7445 + }, + { + "epoch": 2.090398652442448, + "grad_norm": 0.5303608179092407, + "learning_rate": 2.548721607952952e-06, + "loss": 0.3673, + "step": 7446 + }, + { + "epoch": 2.0906793935991015, + "grad_norm": 0.5375730991363525, + "learning_rate": 2.547298142466456e-06, + "loss": 0.3085, + "step": 7447 + }, + { + "epoch": 2.0909601347557554, + "grad_norm": 0.5289609432220459, + "learning_rate": 2.545874938717052e-06, + "loss": 0.3284, + "step": 7448 + }, + { + "epoch": 2.091240875912409, + "grad_norm": 0.5582297444343567, + "learning_rate": 2.5444519968566128e-06, + "loss": 0.3177, + "step": 7449 + }, + { + "epoch": 2.0915216170690623, + "grad_norm": 0.5148752331733704, + "learning_rate": 2.543029317036985e-06, + "loss": 0.3124, + "step": 7450 + }, + { + "epoch": 2.0918023582257157, + "grad_norm": 0.5447492003440857, + "learning_rate": 2.5416068994099907e-06, + "loss": 0.2981, + "step": 7451 + }, + { + "epoch": 2.0920830993823696, + "grad_norm": 0.5968486070632935, + "learning_rate": 2.540184744127419e-06, + "loss": 0.3387, + "step": 7452 + }, + { + "epoch": 2.092363840539023, + "grad_norm": 0.6052152514457703, + "learning_rate": 2.538762851341037e-06, + "loss": 0.3227, + "step": 7453 + }, + { + "epoch": 2.0926445816956765, + "grad_norm": 0.6142473220825195, + "learning_rate": 2.537341221202576e-06, + "loss": 0.2883, + "step": 7454 + }, + { + "epoch": 2.0929253228523303, + "grad_norm": 0.6422644853591919, + "learning_rate": 2.5359198538637475e-06, + "loss": 0.2955, + "step": 7455 + }, + { + "epoch": 2.093206064008984, + "grad_norm": 0.6409845948219299, + "learning_rate": 2.5344987494762287e-06, + "loss": 0.3308, + "step": 7456 + }, + { + "epoch": 2.0934868051656372, + "grad_norm": 0.5762452483177185, + "learning_rate": 2.5330779081916703e-06, + "loss": 0.3282, + "step": 7457 + }, + { + "epoch": 2.0937675463222907, + "grad_norm": 0.5471944212913513, + "learning_rate": 2.5316573301616976e-06, + "loss": 0.308, + "step": 7458 + }, + { + "epoch": 2.0940482874789446, + "grad_norm": 0.5242460370063782, + "learning_rate": 2.5302370155379037e-06, + "loss": 0.3485, + "step": 7459 + }, + { + "epoch": 2.094329028635598, + "grad_norm": 0.5595307350158691, + "learning_rate": 2.5288169644718587e-06, + "loss": 0.3595, + "step": 7460 + }, + { + "epoch": 2.0946097697922514, + "grad_norm": 0.5410830974578857, + "learning_rate": 2.5273971771151007e-06, + "loss": 0.3302, + "step": 7461 + }, + { + "epoch": 2.094890510948905, + "grad_norm": 0.543195903301239, + "learning_rate": 2.5259776536191372e-06, + "loss": 0.3293, + "step": 7462 + }, + { + "epoch": 2.0951712521055588, + "grad_norm": 0.5585414171218872, + "learning_rate": 2.524558394135456e-06, + "loss": 0.3021, + "step": 7463 + }, + { + "epoch": 2.095451993262212, + "grad_norm": 0.5444145798683167, + "learning_rate": 2.523139398815507e-06, + "loss": 0.308, + "step": 7464 + }, + { + "epoch": 2.0957327344188657, + "grad_norm": 0.5823849439620972, + "learning_rate": 2.5217206678107207e-06, + "loss": 0.3196, + "step": 7465 + }, + { + "epoch": 2.0960134755755195, + "grad_norm": 0.5624425411224365, + "learning_rate": 2.520302201272491e-06, + "loss": 0.3146, + "step": 7466 + }, + { + "epoch": 2.096294216732173, + "grad_norm": 0.5407665967941284, + "learning_rate": 2.518883999352193e-06, + "loss": 0.3056, + "step": 7467 + }, + { + "epoch": 2.0965749578888264, + "grad_norm": 0.5789952874183655, + "learning_rate": 2.5174660622011627e-06, + "loss": 0.3187, + "step": 7468 + }, + { + "epoch": 2.09685569904548, + "grad_norm": 0.5371475219726562, + "learning_rate": 2.5160483899707173e-06, + "loss": 0.3329, + "step": 7469 + }, + { + "epoch": 2.0971364402021337, + "grad_norm": 0.5785125494003296, + "learning_rate": 2.5146309828121424e-06, + "loss": 0.3119, + "step": 7470 + }, + { + "epoch": 2.097417181358787, + "grad_norm": 0.5566592216491699, + "learning_rate": 2.5132138408766937e-06, + "loss": 0.3496, + "step": 7471 + }, + { + "epoch": 2.0976979225154406, + "grad_norm": 0.5942724943161011, + "learning_rate": 2.5117969643155975e-06, + "loss": 0.3738, + "step": 7472 + }, + { + "epoch": 2.0979786636720945, + "grad_norm": 0.5612032413482666, + "learning_rate": 2.5103803532800587e-06, + "loss": 0.3987, + "step": 7473 + }, + { + "epoch": 2.098259404828748, + "grad_norm": 0.5707175731658936, + "learning_rate": 2.508964007921244e-06, + "loss": 0.3084, + "step": 7474 + }, + { + "epoch": 2.0985401459854014, + "grad_norm": 0.6076996922492981, + "learning_rate": 2.5075479283903013e-06, + "loss": 0.3216, + "step": 7475 + }, + { + "epoch": 2.098820887142055, + "grad_norm": 0.5889235138893127, + "learning_rate": 2.506132114838343e-06, + "loss": 0.3122, + "step": 7476 + }, + { + "epoch": 2.0991016282987087, + "grad_norm": 0.5599289536476135, + "learning_rate": 2.5047165674164586e-06, + "loss": 0.3516, + "step": 7477 + }, + { + "epoch": 2.099382369455362, + "grad_norm": 0.5801505446434021, + "learning_rate": 2.5033012862757054e-06, + "loss": 0.3031, + "step": 7478 + }, + { + "epoch": 2.0996631106120156, + "grad_norm": 0.5513166785240173, + "learning_rate": 2.501886271567111e-06, + "loss": 0.2811, + "step": 7479 + }, + { + "epoch": 2.0999438517686695, + "grad_norm": 0.6112514138221741, + "learning_rate": 2.5004715234416804e-06, + "loss": 0.376, + "step": 7480 + }, + { + "epoch": 2.100224592925323, + "grad_norm": 0.5071297287940979, + "learning_rate": 2.4990570420503834e-06, + "loss": 0.3055, + "step": 7481 + }, + { + "epoch": 2.1005053340819764, + "grad_norm": 0.516471266746521, + "learning_rate": 2.4976428275441687e-06, + "loss": 0.3257, + "step": 7482 + }, + { + "epoch": 2.10078607523863, + "grad_norm": 0.5417808294296265, + "learning_rate": 2.4962288800739503e-06, + "loss": 0.3404, + "step": 7483 + }, + { + "epoch": 2.1010668163952837, + "grad_norm": 0.6079418659210205, + "learning_rate": 2.4948151997906138e-06, + "loss": 0.3456, + "step": 7484 + }, + { + "epoch": 2.101347557551937, + "grad_norm": 0.5489481091499329, + "learning_rate": 2.4934017868450226e-06, + "loss": 0.3265, + "step": 7485 + }, + { + "epoch": 2.1016282987085906, + "grad_norm": 0.5220032930374146, + "learning_rate": 2.4919886413880036e-06, + "loss": 0.3148, + "step": 7486 + }, + { + "epoch": 2.1019090398652445, + "grad_norm": 0.6036257147789001, + "learning_rate": 2.4905757635703613e-06, + "loss": 0.3158, + "step": 7487 + }, + { + "epoch": 2.102189781021898, + "grad_norm": 0.5709389448165894, + "learning_rate": 2.489163153542868e-06, + "loss": 0.2949, + "step": 7488 + }, + { + "epoch": 2.1024705221785513, + "grad_norm": 0.5465784668922424, + "learning_rate": 2.4877508114562697e-06, + "loss": 0.3371, + "step": 7489 + }, + { + "epoch": 2.102751263335205, + "grad_norm": 0.5625720024108887, + "learning_rate": 2.4863387374612827e-06, + "loss": 0.319, + "step": 7490 + }, + { + "epoch": 2.1030320044918587, + "grad_norm": 0.5638197660446167, + "learning_rate": 2.4849269317085927e-06, + "loss": 0.3438, + "step": 7491 + }, + { + "epoch": 2.103312745648512, + "grad_norm": 0.591765284538269, + "learning_rate": 2.4835153943488617e-06, + "loss": 0.3305, + "step": 7492 + }, + { + "epoch": 2.1035934868051656, + "grad_norm": 0.5602503418922424, + "learning_rate": 2.482104125532717e-06, + "loss": 0.3237, + "step": 7493 + }, + { + "epoch": 2.103874227961819, + "grad_norm": 0.6113702058792114, + "learning_rate": 2.4806931254107625e-06, + "loss": 0.3125, + "step": 7494 + }, + { + "epoch": 2.104154969118473, + "grad_norm": 0.5282644033432007, + "learning_rate": 2.4792823941335724e-06, + "loss": 0.3647, + "step": 7495 + }, + { + "epoch": 2.1044357102751263, + "grad_norm": 0.5988295674324036, + "learning_rate": 2.4778719318516886e-06, + "loss": 0.3629, + "step": 7496 + }, + { + "epoch": 2.1047164514317798, + "grad_norm": 0.6805895566940308, + "learning_rate": 2.4764617387156304e-06, + "loss": 0.36, + "step": 7497 + }, + { + "epoch": 2.1049971925884337, + "grad_norm": 0.6177749633789062, + "learning_rate": 2.4750518148758818e-06, + "loss": 0.3218, + "step": 7498 + }, + { + "epoch": 2.105277933745087, + "grad_norm": 0.5861527323722839, + "learning_rate": 2.4736421604829002e-06, + "loss": 0.3399, + "step": 7499 + }, + { + "epoch": 2.1055586749017405, + "grad_norm": 0.6087446808815002, + "learning_rate": 2.472232775687119e-06, + "loss": 0.3161, + "step": 7500 + }, + { + "epoch": 2.105839416058394, + "grad_norm": 0.49675971269607544, + "learning_rate": 2.4708236606389347e-06, + "loss": 0.3214, + "step": 7501 + }, + { + "epoch": 2.106120157215048, + "grad_norm": 0.6294077038764954, + "learning_rate": 2.4694148154887233e-06, + "loss": 0.3223, + "step": 7502 + }, + { + "epoch": 2.1064008983717013, + "grad_norm": 0.5638206601142883, + "learning_rate": 2.4680062403868244e-06, + "loss": 0.3203, + "step": 7503 + }, + { + "epoch": 2.1066816395283547, + "grad_norm": 0.5751687288284302, + "learning_rate": 2.4665979354835563e-06, + "loss": 0.325, + "step": 7504 + }, + { + "epoch": 2.1069623806850086, + "grad_norm": 0.6632601022720337, + "learning_rate": 2.465189900929202e-06, + "loss": 0.3106, + "step": 7505 + }, + { + "epoch": 2.107243121841662, + "grad_norm": 0.5973731279373169, + "learning_rate": 2.463782136874016e-06, + "loss": 0.3438, + "step": 7506 + }, + { + "epoch": 2.1075238629983155, + "grad_norm": 0.600148618221283, + "learning_rate": 2.4623746434682317e-06, + "loss": 0.321, + "step": 7507 + }, + { + "epoch": 2.107804604154969, + "grad_norm": 0.5248749852180481, + "learning_rate": 2.460967420862042e-06, + "loss": 0.3083, + "step": 7508 + }, + { + "epoch": 2.108085345311623, + "grad_norm": 0.5900746583938599, + "learning_rate": 2.4595604692056225e-06, + "loss": 0.3359, + "step": 7509 + }, + { + "epoch": 2.1083660864682763, + "grad_norm": 0.5891392827033997, + "learning_rate": 2.458153788649112e-06, + "loss": 0.2958, + "step": 7510 + }, + { + "epoch": 2.1086468276249297, + "grad_norm": 0.5616078972816467, + "learning_rate": 2.45674737934262e-06, + "loss": 0.3548, + "step": 7511 + }, + { + "epoch": 2.108927568781583, + "grad_norm": 0.5850181579589844, + "learning_rate": 2.4553412414362343e-06, + "loss": 0.3199, + "step": 7512 + }, + { + "epoch": 2.109208309938237, + "grad_norm": 0.557204008102417, + "learning_rate": 2.4539353750800052e-06, + "loss": 0.3649, + "step": 7513 + }, + { + "epoch": 2.1094890510948905, + "grad_norm": 0.6122573614120483, + "learning_rate": 2.4525297804239623e-06, + "loss": 0.3062, + "step": 7514 + }, + { + "epoch": 2.109769792251544, + "grad_norm": 0.5749813318252563, + "learning_rate": 2.451124457618097e-06, + "loss": 0.3345, + "step": 7515 + }, + { + "epoch": 2.110050533408198, + "grad_norm": 0.6210273504257202, + "learning_rate": 2.4497194068123816e-06, + "loss": 0.333, + "step": 7516 + }, + { + "epoch": 2.1103312745648513, + "grad_norm": 0.5353874564170837, + "learning_rate": 2.4483146281567515e-06, + "loss": 0.3077, + "step": 7517 + }, + { + "epoch": 2.1106120157215047, + "grad_norm": 0.6579467058181763, + "learning_rate": 2.446910121801115e-06, + "loss": 0.3334, + "step": 7518 + }, + { + "epoch": 2.110892756878158, + "grad_norm": 0.5924035906791687, + "learning_rate": 2.445505887895353e-06, + "loss": 0.3439, + "step": 7519 + }, + { + "epoch": 2.111173498034812, + "grad_norm": 0.5882341861724854, + "learning_rate": 2.4441019265893202e-06, + "loss": 0.326, + "step": 7520 + }, + { + "epoch": 2.1114542391914655, + "grad_norm": 0.5669301152229309, + "learning_rate": 2.4426982380328328e-06, + "loss": 0.3481, + "step": 7521 + }, + { + "epoch": 2.111734980348119, + "grad_norm": 0.5781010985374451, + "learning_rate": 2.4412948223756886e-06, + "loss": 0.3442, + "step": 7522 + }, + { + "epoch": 2.112015721504773, + "grad_norm": 0.5489780306816101, + "learning_rate": 2.439891679767648e-06, + "loss": 0.3243, + "step": 7523 + }, + { + "epoch": 2.1122964626614262, + "grad_norm": 0.6508286595344543, + "learning_rate": 2.4384888103584494e-06, + "loss": 0.3317, + "step": 7524 + }, + { + "epoch": 2.1125772038180797, + "grad_norm": 0.5775461196899414, + "learning_rate": 2.437086214297793e-06, + "loss": 0.3416, + "step": 7525 + }, + { + "epoch": 2.112857944974733, + "grad_norm": 0.5868874192237854, + "learning_rate": 2.435683891735361e-06, + "loss": 0.3343, + "step": 7526 + }, + { + "epoch": 2.113138686131387, + "grad_norm": 0.5194208025932312, + "learning_rate": 2.434281842820797e-06, + "loss": 0.3611, + "step": 7527 + }, + { + "epoch": 2.1134194272880404, + "grad_norm": 0.5542311072349548, + "learning_rate": 2.4328800677037178e-06, + "loss": 0.3476, + "step": 7528 + }, + { + "epoch": 2.113700168444694, + "grad_norm": 0.6189546585083008, + "learning_rate": 2.4314785665337158e-06, + "loss": 0.3554, + "step": 7529 + }, + { + "epoch": 2.1139809096013478, + "grad_norm": 0.5882139801979065, + "learning_rate": 2.4300773394603466e-06, + "loss": 0.2938, + "step": 7530 + }, + { + "epoch": 2.114261650758001, + "grad_norm": 0.5537412166595459, + "learning_rate": 2.4286763866331447e-06, + "loss": 0.2909, + "step": 7531 + }, + { + "epoch": 2.1145423919146547, + "grad_norm": 0.5822217464447021, + "learning_rate": 2.4272757082016087e-06, + "loss": 0.3164, + "step": 7532 + }, + { + "epoch": 2.114823133071308, + "grad_norm": 0.5897016525268555, + "learning_rate": 2.425875304315208e-06, + "loss": 0.3467, + "step": 7533 + }, + { + "epoch": 2.115103874227962, + "grad_norm": 0.6019304394721985, + "learning_rate": 2.4244751751233895e-06, + "loss": 0.3264, + "step": 7534 + }, + { + "epoch": 2.1153846153846154, + "grad_norm": 0.5713221430778503, + "learning_rate": 2.4230753207755633e-06, + "loss": 0.3045, + "step": 7535 + }, + { + "epoch": 2.115665356541269, + "grad_norm": 0.6040393114089966, + "learning_rate": 2.4216757414211143e-06, + "loss": 0.3213, + "step": 7536 + }, + { + "epoch": 2.1159460976979223, + "grad_norm": 0.5339690446853638, + "learning_rate": 2.420276437209396e-06, + "loss": 0.3378, + "step": 7537 + }, + { + "epoch": 2.116226838854576, + "grad_norm": 0.6134693622589111, + "learning_rate": 2.4188774082897353e-06, + "loss": 0.3856, + "step": 7538 + }, + { + "epoch": 2.1165075800112296, + "grad_norm": 0.5634220838546753, + "learning_rate": 2.417478654811427e-06, + "loss": 0.2969, + "step": 7539 + }, + { + "epoch": 2.116788321167883, + "grad_norm": 0.6140146851539612, + "learning_rate": 2.4160801769237353e-06, + "loss": 0.3411, + "step": 7540 + }, + { + "epoch": 2.117069062324537, + "grad_norm": 0.5918681621551514, + "learning_rate": 2.4146819747759004e-06, + "loss": 0.3637, + "step": 7541 + }, + { + "epoch": 2.1173498034811904, + "grad_norm": 0.6469187140464783, + "learning_rate": 2.413284048517126e-06, + "loss": 0.3388, + "step": 7542 + }, + { + "epoch": 2.117630544637844, + "grad_norm": 0.5671555995941162, + "learning_rate": 2.411886398296594e-06, + "loss": 0.3216, + "step": 7543 + }, + { + "epoch": 2.1179112857944973, + "grad_norm": 0.5650293827056885, + "learning_rate": 2.4104890242634497e-06, + "loss": 0.3427, + "step": 7544 + }, + { + "epoch": 2.118192026951151, + "grad_norm": 0.607598602771759, + "learning_rate": 2.4090919265668134e-06, + "loss": 0.3587, + "step": 7545 + }, + { + "epoch": 2.1184727681078046, + "grad_norm": 0.6334040760993958, + "learning_rate": 2.4076951053557773e-06, + "loss": 0.3126, + "step": 7546 + }, + { + "epoch": 2.118753509264458, + "grad_norm": 0.5849760174751282, + "learning_rate": 2.4062985607793965e-06, + "loss": 0.3234, + "step": 7547 + }, + { + "epoch": 2.119034250421112, + "grad_norm": 0.5422614812850952, + "learning_rate": 2.404902292986706e-06, + "loss": 0.3108, + "step": 7548 + }, + { + "epoch": 2.1193149915777654, + "grad_norm": 0.5604290962219238, + "learning_rate": 2.4035063021267057e-06, + "loss": 0.3361, + "step": 7549 + }, + { + "epoch": 2.119595732734419, + "grad_norm": 0.5354562401771545, + "learning_rate": 2.4021105883483636e-06, + "loss": 0.3279, + "step": 7550 + }, + { + "epoch": 2.1198764738910723, + "grad_norm": 0.5493098497390747, + "learning_rate": 2.4007151518006267e-06, + "loss": 0.3355, + "step": 7551 + }, + { + "epoch": 2.120157215047726, + "grad_norm": 0.5883382558822632, + "learning_rate": 2.399319992632403e-06, + "loss": 0.3451, + "step": 7552 + }, + { + "epoch": 2.1204379562043796, + "grad_norm": 0.6600798964500427, + "learning_rate": 2.397925110992579e-06, + "loss": 0.327, + "step": 7553 + }, + { + "epoch": 2.120718697361033, + "grad_norm": 0.5692732930183411, + "learning_rate": 2.3965305070300054e-06, + "loss": 0.3234, + "step": 7554 + }, + { + "epoch": 2.1209994385176865, + "grad_norm": 0.5529320240020752, + "learning_rate": 2.3951361808935047e-06, + "loss": 0.2964, + "step": 7555 + }, + { + "epoch": 2.1212801796743403, + "grad_norm": 0.5611093044281006, + "learning_rate": 2.3937421327318746e-06, + "loss": 0.3681, + "step": 7556 + }, + { + "epoch": 2.121560920830994, + "grad_norm": 0.5585898160934448, + "learning_rate": 2.3923483626938744e-06, + "loss": 0.3147, + "step": 7557 + }, + { + "epoch": 2.1218416619876472, + "grad_norm": 0.6208656430244446, + "learning_rate": 2.3909548709282427e-06, + "loss": 0.3159, + "step": 7558 + }, + { + "epoch": 2.122122403144301, + "grad_norm": 0.6122855544090271, + "learning_rate": 2.389561657583681e-06, + "loss": 0.3376, + "step": 7559 + }, + { + "epoch": 2.1224031443009546, + "grad_norm": 0.5505663752555847, + "learning_rate": 2.388168722808868e-06, + "loss": 0.3573, + "step": 7560 + }, + { + "epoch": 2.122683885457608, + "grad_norm": 0.5612902045249939, + "learning_rate": 2.3867760667524464e-06, + "loss": 0.3412, + "step": 7561 + }, + { + "epoch": 2.1229646266142614, + "grad_norm": 0.6577821373939514, + "learning_rate": 2.385383689563031e-06, + "loss": 0.3318, + "step": 7562 + }, + { + "epoch": 2.1232453677709153, + "grad_norm": 0.6066629886627197, + "learning_rate": 2.383991591389211e-06, + "loss": 0.3409, + "step": 7563 + }, + { + "epoch": 2.1235261089275688, + "grad_norm": 0.6030011773109436, + "learning_rate": 2.3825997723795384e-06, + "loss": 0.3185, + "step": 7564 + }, + { + "epoch": 2.123806850084222, + "grad_norm": 0.6716628074645996, + "learning_rate": 2.381208232682543e-06, + "loss": 0.3383, + "step": 7565 + }, + { + "epoch": 2.124087591240876, + "grad_norm": 0.5747437477111816, + "learning_rate": 2.3798169724467207e-06, + "loss": 0.3173, + "step": 7566 + }, + { + "epoch": 2.1243683323975295, + "grad_norm": 0.5512399077415466, + "learning_rate": 2.3784259918205347e-06, + "loss": 0.321, + "step": 7567 + }, + { + "epoch": 2.124649073554183, + "grad_norm": 0.6270153522491455, + "learning_rate": 2.377035290952427e-06, + "loss": 0.3497, + "step": 7568 + }, + { + "epoch": 2.1249298147108364, + "grad_norm": 0.6106194257736206, + "learning_rate": 2.375644869990799e-06, + "loss": 0.3372, + "step": 7569 + }, + { + "epoch": 2.1252105558674903, + "grad_norm": 0.5862271785736084, + "learning_rate": 2.374254729084031e-06, + "loss": 0.3359, + "step": 7570 + }, + { + "epoch": 2.1254912970241437, + "grad_norm": 0.5978150367736816, + "learning_rate": 2.372864868380472e-06, + "loss": 0.3816, + "step": 7571 + }, + { + "epoch": 2.125772038180797, + "grad_norm": 0.6333993077278137, + "learning_rate": 2.371475288028435e-06, + "loss": 0.3105, + "step": 7572 + }, + { + "epoch": 2.126052779337451, + "grad_norm": 0.6072990894317627, + "learning_rate": 2.3700859881762125e-06, + "loss": 0.3615, + "step": 7573 + }, + { + "epoch": 2.1263335204941045, + "grad_norm": 0.5752395987510681, + "learning_rate": 2.368696968972056e-06, + "loss": 0.3371, + "step": 7574 + }, + { + "epoch": 2.126614261650758, + "grad_norm": 0.5667293071746826, + "learning_rate": 2.3673082305641982e-06, + "loss": 0.3204, + "step": 7575 + }, + { + "epoch": 2.1268950028074114, + "grad_norm": 0.550583004951477, + "learning_rate": 2.3659197731008343e-06, + "loss": 0.2922, + "step": 7576 + }, + { + "epoch": 2.1271757439640653, + "grad_norm": 0.6199833154678345, + "learning_rate": 2.3645315967301304e-06, + "loss": 0.2839, + "step": 7577 + }, + { + "epoch": 2.1274564851207187, + "grad_norm": 0.5537998080253601, + "learning_rate": 2.363143701600227e-06, + "loss": 0.3375, + "step": 7578 + }, + { + "epoch": 2.127737226277372, + "grad_norm": 0.5808775424957275, + "learning_rate": 2.361756087859228e-06, + "loss": 0.3396, + "step": 7579 + }, + { + "epoch": 2.128017967434026, + "grad_norm": 0.5723810791969299, + "learning_rate": 2.360368755655216e-06, + "loss": 0.2775, + "step": 7580 + }, + { + "epoch": 2.1282987085906795, + "grad_norm": 0.48975247144699097, + "learning_rate": 2.3589817051362353e-06, + "loss": 0.363, + "step": 7581 + }, + { + "epoch": 2.128579449747333, + "grad_norm": 0.5985215902328491, + "learning_rate": 2.3575949364503013e-06, + "loss": 0.2941, + "step": 7582 + }, + { + "epoch": 2.1288601909039864, + "grad_norm": 0.5500089526176453, + "learning_rate": 2.3562084497454064e-06, + "loss": 0.324, + "step": 7583 + }, + { + "epoch": 2.1291409320606403, + "grad_norm": 0.5465249419212341, + "learning_rate": 2.3548222451695023e-06, + "loss": 0.3049, + "step": 7584 + }, + { + "epoch": 2.1294216732172937, + "grad_norm": 0.5955963730812073, + "learning_rate": 2.3534363228705216e-06, + "loss": 0.3097, + "step": 7585 + }, + { + "epoch": 2.129702414373947, + "grad_norm": 0.5353860259056091, + "learning_rate": 2.352050682996356e-06, + "loss": 0.326, + "step": 7586 + }, + { + "epoch": 2.1299831555306006, + "grad_norm": 0.5219101905822754, + "learning_rate": 2.350665325694877e-06, + "loss": 0.3722, + "step": 7587 + }, + { + "epoch": 2.1302638966872545, + "grad_norm": 0.5881059169769287, + "learning_rate": 2.3492802511139197e-06, + "loss": 0.3312, + "step": 7588 + }, + { + "epoch": 2.130544637843908, + "grad_norm": 0.5381979942321777, + "learning_rate": 2.3478954594012884e-06, + "loss": 0.3322, + "step": 7589 + }, + { + "epoch": 2.1308253790005613, + "grad_norm": 0.6292857527732849, + "learning_rate": 2.3465109507047628e-06, + "loss": 0.3815, + "step": 7590 + }, + { + "epoch": 2.1311061201572152, + "grad_norm": 0.5772551894187927, + "learning_rate": 2.3451267251720862e-06, + "loss": 0.3351, + "step": 7591 + }, + { + "epoch": 2.1313868613138687, + "grad_norm": 0.5882990956306458, + "learning_rate": 2.3437427829509775e-06, + "loss": 0.3334, + "step": 7592 + }, + { + "epoch": 2.131667602470522, + "grad_norm": 0.6055522561073303, + "learning_rate": 2.34235912418912e-06, + "loss": 0.3051, + "step": 7593 + }, + { + "epoch": 2.1319483436271756, + "grad_norm": 0.5196595191955566, + "learning_rate": 2.3409757490341694e-06, + "loss": 0.3534, + "step": 7594 + }, + { + "epoch": 2.1322290847838294, + "grad_norm": 0.5962284207344055, + "learning_rate": 2.3395926576337513e-06, + "loss": 0.3111, + "step": 7595 + }, + { + "epoch": 2.132509825940483, + "grad_norm": 0.5383732318878174, + "learning_rate": 2.3382098501354603e-06, + "loss": 0.3222, + "step": 7596 + }, + { + "epoch": 2.1327905670971363, + "grad_norm": 0.5777949690818787, + "learning_rate": 2.3368273266868637e-06, + "loss": 0.3553, + "step": 7597 + }, + { + "epoch": 2.13307130825379, + "grad_norm": 0.5984180569648743, + "learning_rate": 2.3354450874354934e-06, + "loss": 0.2832, + "step": 7598 + }, + { + "epoch": 2.1333520494104437, + "grad_norm": 0.5609362125396729, + "learning_rate": 2.334063132528852e-06, + "loss": 0.3384, + "step": 7599 + }, + { + "epoch": 2.133632790567097, + "grad_norm": 0.5911528468132019, + "learning_rate": 2.332681462114416e-06, + "loss": 0.3451, + "step": 7600 + }, + { + "epoch": 2.1339135317237505, + "grad_norm": 0.6141667366027832, + "learning_rate": 2.3313000763396266e-06, + "loss": 0.3033, + "step": 7601 + }, + { + "epoch": 2.1341942728804044, + "grad_norm": 0.6282621622085571, + "learning_rate": 2.3299189753518984e-06, + "loss": 0.2947, + "step": 7602 + }, + { + "epoch": 2.134475014037058, + "grad_norm": 0.5885448455810547, + "learning_rate": 2.3285381592986133e-06, + "loss": 0.3634, + "step": 7603 + }, + { + "epoch": 2.1347557551937113, + "grad_norm": 0.5369791388511658, + "learning_rate": 2.3271576283271215e-06, + "loss": 0.3067, + "step": 7604 + }, + { + "epoch": 2.1350364963503647, + "grad_norm": 0.5751357078552246, + "learning_rate": 2.325777382584748e-06, + "loss": 0.3165, + "step": 7605 + }, + { + "epoch": 2.1353172375070186, + "grad_norm": 0.5804493427276611, + "learning_rate": 2.3243974222187805e-06, + "loss": 0.3145, + "step": 7606 + }, + { + "epoch": 2.135597978663672, + "grad_norm": 0.6409282088279724, + "learning_rate": 2.323017747376484e-06, + "loss": 0.322, + "step": 7607 + }, + { + "epoch": 2.1358787198203255, + "grad_norm": 0.7023793458938599, + "learning_rate": 2.321638358205083e-06, + "loss": 0.3812, + "step": 7608 + }, + { + "epoch": 2.1361594609769794, + "grad_norm": 0.5948903560638428, + "learning_rate": 2.320259254851784e-06, + "loss": 0.3465, + "step": 7609 + }, + { + "epoch": 2.136440202133633, + "grad_norm": 0.5517528057098389, + "learning_rate": 2.318880437463753e-06, + "loss": 0.2651, + "step": 7610 + }, + { + "epoch": 2.1367209432902863, + "grad_norm": 0.5835468769073486, + "learning_rate": 2.3175019061881266e-06, + "loss": 0.3452, + "step": 7611 + }, + { + "epoch": 2.1370016844469397, + "grad_norm": 0.5559266805648804, + "learning_rate": 2.316123661172018e-06, + "loss": 0.3072, + "step": 7612 + }, + { + "epoch": 2.1372824256035936, + "grad_norm": 0.621350109577179, + "learning_rate": 2.3147457025624998e-06, + "loss": 0.2976, + "step": 7613 + }, + { + "epoch": 2.137563166760247, + "grad_norm": 0.5921510457992554, + "learning_rate": 2.3133680305066236e-06, + "loss": 0.3149, + "step": 7614 + }, + { + "epoch": 2.1378439079169005, + "grad_norm": 0.5949103236198425, + "learning_rate": 2.3119906451514045e-06, + "loss": 0.3443, + "step": 7615 + }, + { + "epoch": 2.1381246490735544, + "grad_norm": 0.5970814228057861, + "learning_rate": 2.310613546643826e-06, + "loss": 0.2916, + "step": 7616 + }, + { + "epoch": 2.138405390230208, + "grad_norm": 0.611373245716095, + "learning_rate": 2.3092367351308478e-06, + "loss": 0.3379, + "step": 7617 + }, + { + "epoch": 2.1386861313868613, + "grad_norm": 0.5446313619613647, + "learning_rate": 2.30786021075939e-06, + "loss": 0.3182, + "step": 7618 + }, + { + "epoch": 2.1389668725435147, + "grad_norm": 0.568806529045105, + "learning_rate": 2.3064839736763516e-06, + "loss": 0.3217, + "step": 7619 + }, + { + "epoch": 2.1392476137001686, + "grad_norm": 0.5940142869949341, + "learning_rate": 2.3051080240285917e-06, + "loss": 0.2976, + "step": 7620 + }, + { + "epoch": 2.139528354856822, + "grad_norm": 0.5360431671142578, + "learning_rate": 2.3037323619629448e-06, + "loss": 0.3038, + "step": 7621 + }, + { + "epoch": 2.1398090960134755, + "grad_norm": 0.5377373695373535, + "learning_rate": 2.3023569876262154e-06, + "loss": 0.3278, + "step": 7622 + }, + { + "epoch": 2.1400898371701293, + "grad_norm": 0.5923365950584412, + "learning_rate": 2.3009819011651705e-06, + "loss": 0.3461, + "step": 7623 + }, + { + "epoch": 2.140370578326783, + "grad_norm": 0.5814552307128906, + "learning_rate": 2.2996071027265553e-06, + "loss": 0.3381, + "step": 7624 + }, + { + "epoch": 2.1406513194834362, + "grad_norm": 0.5893500447273254, + "learning_rate": 2.2982325924570775e-06, + "loss": 0.3203, + "step": 7625 + }, + { + "epoch": 2.1409320606400897, + "grad_norm": 0.522693932056427, + "learning_rate": 2.296858370503414e-06, + "loss": 0.368, + "step": 7626 + }, + { + "epoch": 2.1412128017967436, + "grad_norm": 0.5240692496299744, + "learning_rate": 2.295484437012218e-06, + "loss": 0.3346, + "step": 7627 + }, + { + "epoch": 2.141493542953397, + "grad_norm": 0.526337742805481, + "learning_rate": 2.294110792130102e-06, + "loss": 0.3137, + "step": 7628 + }, + { + "epoch": 2.1417742841100504, + "grad_norm": 0.5728418827056885, + "learning_rate": 2.292737436003658e-06, + "loss": 0.3152, + "step": 7629 + }, + { + "epoch": 2.1420550252667043, + "grad_norm": 0.5894528031349182, + "learning_rate": 2.291364368779437e-06, + "loss": 0.3136, + "step": 7630 + }, + { + "epoch": 2.1423357664233578, + "grad_norm": 0.5678023099899292, + "learning_rate": 2.2899915906039687e-06, + "loss": 0.3616, + "step": 7631 + }, + { + "epoch": 2.142616507580011, + "grad_norm": 0.5598422884941101, + "learning_rate": 2.288619101623746e-06, + "loss": 0.3357, + "step": 7632 + }, + { + "epoch": 2.1428972487366647, + "grad_norm": 0.5799552202224731, + "learning_rate": 2.287246901985229e-06, + "loss": 0.2989, + "step": 7633 + }, + { + "epoch": 2.1431779898933185, + "grad_norm": 0.5640353560447693, + "learning_rate": 2.2858749918348556e-06, + "loss": 0.3367, + "step": 7634 + }, + { + "epoch": 2.143458731049972, + "grad_norm": 0.5647066831588745, + "learning_rate": 2.284503371319023e-06, + "loss": 0.3342, + "step": 7635 + }, + { + "epoch": 2.1437394722066254, + "grad_norm": 0.6022063493728638, + "learning_rate": 2.283132040584106e-06, + "loss": 0.327, + "step": 7636 + }, + { + "epoch": 2.144020213363279, + "grad_norm": 0.5313782691955566, + "learning_rate": 2.2817609997764425e-06, + "loss": 0.2896, + "step": 7637 + }, + { + "epoch": 2.1443009545199327, + "grad_norm": 0.546099066734314, + "learning_rate": 2.2803902490423393e-06, + "loss": 0.3727, + "step": 7638 + }, + { + "epoch": 2.144581695676586, + "grad_norm": 0.5981913805007935, + "learning_rate": 2.2790197885280784e-06, + "loss": 0.3227, + "step": 7639 + }, + { + "epoch": 2.1448624368332396, + "grad_norm": 0.6189888715744019, + "learning_rate": 2.2776496183799034e-06, + "loss": 0.3163, + "step": 7640 + }, + { + "epoch": 2.1451431779898935, + "grad_norm": 0.6030513644218445, + "learning_rate": 2.2762797387440343e-06, + "loss": 0.306, + "step": 7641 + }, + { + "epoch": 2.145423919146547, + "grad_norm": 0.6845206022262573, + "learning_rate": 2.2749101497666515e-06, + "loss": 0.3137, + "step": 7642 + }, + { + "epoch": 2.1457046603032004, + "grad_norm": 0.6505918502807617, + "learning_rate": 2.273540851593913e-06, + "loss": 0.3337, + "step": 7643 + }, + { + "epoch": 2.145985401459854, + "grad_norm": 0.6210272312164307, + "learning_rate": 2.27217184437194e-06, + "loss": 0.3893, + "step": 7644 + }, + { + "epoch": 2.1462661426165077, + "grad_norm": 0.5908429026603699, + "learning_rate": 2.2708031282468235e-06, + "loss": 0.3355, + "step": 7645 + }, + { + "epoch": 2.146546883773161, + "grad_norm": 0.6091362833976746, + "learning_rate": 2.269434703364625e-06, + "loss": 0.3513, + "step": 7646 + }, + { + "epoch": 2.1468276249298146, + "grad_norm": 0.6162760257720947, + "learning_rate": 2.268066569871376e-06, + "loss": 0.3102, + "step": 7647 + }, + { + "epoch": 2.147108366086468, + "grad_norm": 0.5612039566040039, + "learning_rate": 2.2666987279130727e-06, + "loss": 0.3015, + "step": 7648 + }, + { + "epoch": 2.147389107243122, + "grad_norm": 0.5317087769508362, + "learning_rate": 2.265331177635685e-06, + "loss": 0.3512, + "step": 7649 + }, + { + "epoch": 2.1476698483997754, + "grad_norm": 0.5793330669403076, + "learning_rate": 2.263963919185147e-06, + "loss": 0.3264, + "step": 7650 + }, + { + "epoch": 2.147950589556429, + "grad_norm": 0.6438215970993042, + "learning_rate": 2.2625969527073666e-06, + "loss": 0.328, + "step": 7651 + }, + { + "epoch": 2.1482313307130827, + "grad_norm": 0.6481834650039673, + "learning_rate": 2.261230278348217e-06, + "loss": 0.3165, + "step": 7652 + }, + { + "epoch": 2.148512071869736, + "grad_norm": 0.6245670318603516, + "learning_rate": 2.259863896253539e-06, + "loss": 0.3273, + "step": 7653 + }, + { + "epoch": 2.1487928130263896, + "grad_norm": 0.5959929823875427, + "learning_rate": 2.258497806569148e-06, + "loss": 0.3575, + "step": 7654 + }, + { + "epoch": 2.149073554183043, + "grad_norm": 0.545421838760376, + "learning_rate": 2.2571320094408213e-06, + "loss": 0.2979, + "step": 7655 + }, + { + "epoch": 2.149354295339697, + "grad_norm": 0.5832664370536804, + "learning_rate": 2.2557665050143106e-06, + "loss": 0.3247, + "step": 7656 + }, + { + "epoch": 2.1496350364963503, + "grad_norm": 0.6086153984069824, + "learning_rate": 2.254401293435332e-06, + "loss": 0.2812, + "step": 7657 + }, + { + "epoch": 2.149915777653004, + "grad_norm": 0.5361592173576355, + "learning_rate": 2.253036374849576e-06, + "loss": 0.327, + "step": 7658 + }, + { + "epoch": 2.1501965188096577, + "grad_norm": 0.5319271087646484, + "learning_rate": 2.251671749402695e-06, + "loss": 0.3496, + "step": 7659 + }, + { + "epoch": 2.150477259966311, + "grad_norm": 0.6418182849884033, + "learning_rate": 2.250307417240313e-06, + "loss": 0.2949, + "step": 7660 + }, + { + "epoch": 2.1507580011229646, + "grad_norm": 0.539433479309082, + "learning_rate": 2.2489433785080256e-06, + "loss": 0.3208, + "step": 7661 + }, + { + "epoch": 2.151038742279618, + "grad_norm": 0.6144167184829712, + "learning_rate": 2.2475796333513916e-06, + "loss": 0.3068, + "step": 7662 + }, + { + "epoch": 2.151319483436272, + "grad_norm": 0.5509148836135864, + "learning_rate": 2.2462161819159445e-06, + "loss": 0.3553, + "step": 7663 + }, + { + "epoch": 2.1516002245929253, + "grad_norm": 0.5421391129493713, + "learning_rate": 2.2448530243471816e-06, + "loss": 0.301, + "step": 7664 + }, + { + "epoch": 2.1518809657495788, + "grad_norm": 0.616186797618866, + "learning_rate": 2.2434901607905694e-06, + "loss": 0.3476, + "step": 7665 + }, + { + "epoch": 2.1521617069062327, + "grad_norm": 0.5403421521186829, + "learning_rate": 2.2421275913915473e-06, + "loss": 0.3487, + "step": 7666 + }, + { + "epoch": 2.152442448062886, + "grad_norm": 0.5637606382369995, + "learning_rate": 2.2407653162955173e-06, + "loss": 0.3133, + "step": 7667 + }, + { + "epoch": 2.1527231892195395, + "grad_norm": 0.576444685459137, + "learning_rate": 2.2394033356478557e-06, + "loss": 0.3567, + "step": 7668 + }, + { + "epoch": 2.153003930376193, + "grad_norm": 0.6034927368164062, + "learning_rate": 2.238041649593901e-06, + "loss": 0.3318, + "step": 7669 + }, + { + "epoch": 2.153284671532847, + "grad_norm": 0.543558657169342, + "learning_rate": 2.2366802582789676e-06, + "loss": 0.3411, + "step": 7670 + }, + { + "epoch": 2.1535654126895003, + "grad_norm": 0.6106688976287842, + "learning_rate": 2.235319161848334e-06, + "loss": 0.3128, + "step": 7671 + }, + { + "epoch": 2.1538461538461537, + "grad_norm": 0.5550286769866943, + "learning_rate": 2.2339583604472426e-06, + "loss": 0.3032, + "step": 7672 + }, + { + "epoch": 2.1541268950028076, + "grad_norm": 0.5444990992546082, + "learning_rate": 2.2325978542209177e-06, + "loss": 0.3112, + "step": 7673 + }, + { + "epoch": 2.154407636159461, + "grad_norm": 0.5721750259399414, + "learning_rate": 2.2312376433145412e-06, + "loss": 0.3417, + "step": 7674 + }, + { + "epoch": 2.1546883773161145, + "grad_norm": 0.5697322487831116, + "learning_rate": 2.2298777278732632e-06, + "loss": 0.3478, + "step": 7675 + }, + { + "epoch": 2.154969118472768, + "grad_norm": 0.5581787824630737, + "learning_rate": 2.2285181080422097e-06, + "loss": 0.3135, + "step": 7676 + }, + { + "epoch": 2.155249859629422, + "grad_norm": 0.6136451959609985, + "learning_rate": 2.2271587839664673e-06, + "loss": 0.3338, + "step": 7677 + }, + { + "epoch": 2.1555306007860753, + "grad_norm": 0.5717584490776062, + "learning_rate": 2.2257997557910974e-06, + "loss": 0.3557, + "step": 7678 + }, + { + "epoch": 2.1558113419427287, + "grad_norm": 0.6362666487693787, + "learning_rate": 2.2244410236611254e-06, + "loss": 0.2819, + "step": 7679 + }, + { + "epoch": 2.156092083099382, + "grad_norm": 0.4996248185634613, + "learning_rate": 2.2230825877215484e-06, + "loss": 0.3315, + "step": 7680 + }, + { + "epoch": 2.156372824256036, + "grad_norm": 0.5268717408180237, + "learning_rate": 2.221724448117329e-06, + "loss": 0.3021, + "step": 7681 + }, + { + "epoch": 2.1566535654126895, + "grad_norm": 0.5348608493804932, + "learning_rate": 2.220366604993398e-06, + "loss": 0.3008, + "step": 7682 + }, + { + "epoch": 2.156934306569343, + "grad_norm": 0.6342946290969849, + "learning_rate": 2.2190090584946595e-06, + "loss": 0.3182, + "step": 7683 + }, + { + "epoch": 2.157215047725997, + "grad_norm": 0.5667331218719482, + "learning_rate": 2.2176518087659783e-06, + "loss": 0.3808, + "step": 7684 + }, + { + "epoch": 2.1574957888826503, + "grad_norm": 0.5849010944366455, + "learning_rate": 2.216294855952196e-06, + "loss": 0.3348, + "step": 7685 + }, + { + "epoch": 2.1577765300393037, + "grad_norm": 0.6405203342437744, + "learning_rate": 2.214938200198115e-06, + "loss": 0.3108, + "step": 7686 + }, + { + "epoch": 2.158057271195957, + "grad_norm": 0.5259215831756592, + "learning_rate": 2.2135818416485096e-06, + "loss": 0.3193, + "step": 7687 + }, + { + "epoch": 2.158338012352611, + "grad_norm": 0.5738062858581543, + "learning_rate": 2.2122257804481233e-06, + "loss": 0.3287, + "step": 7688 + }, + { + "epoch": 2.1586187535092645, + "grad_norm": 0.5805240869522095, + "learning_rate": 2.210870016741664e-06, + "loss": 0.3451, + "step": 7689 + }, + { + "epoch": 2.158899494665918, + "grad_norm": 0.6308855414390564, + "learning_rate": 2.209514550673814e-06, + "loss": 0.2936, + "step": 7690 + }, + { + "epoch": 2.159180235822572, + "grad_norm": 0.5688693523406982, + "learning_rate": 2.208159382389217e-06, + "loss": 0.3584, + "step": 7691 + }, + { + "epoch": 2.1594609769792252, + "grad_norm": 0.5478384494781494, + "learning_rate": 2.2068045120324905e-06, + "loss": 0.3125, + "step": 7692 + }, + { + "epoch": 2.1597417181358787, + "grad_norm": 0.5713398456573486, + "learning_rate": 2.205449939748217e-06, + "loss": 0.3161, + "step": 7693 + }, + { + "epoch": 2.160022459292532, + "grad_norm": 0.5147822499275208, + "learning_rate": 2.204095665680946e-06, + "loss": 0.3512, + "step": 7694 + }, + { + "epoch": 2.160303200449186, + "grad_norm": 0.5163027048110962, + "learning_rate": 2.202741689975201e-06, + "loss": 0.3718, + "step": 7695 + }, + { + "epoch": 2.1605839416058394, + "grad_norm": 0.5625680088996887, + "learning_rate": 2.2013880127754654e-06, + "loss": 0.3129, + "step": 7696 + }, + { + "epoch": 2.160864682762493, + "grad_norm": 0.5634370446205139, + "learning_rate": 2.200034634226198e-06, + "loss": 0.3085, + "step": 7697 + }, + { + "epoch": 2.1611454239191463, + "grad_norm": 0.5660167932510376, + "learning_rate": 2.198681554471825e-06, + "loss": 0.3381, + "step": 7698 + }, + { + "epoch": 2.1614261650758, + "grad_norm": 0.5348774194717407, + "learning_rate": 2.1973287736567332e-06, + "loss": 0.3531, + "step": 7699 + }, + { + "epoch": 2.1617069062324537, + "grad_norm": 0.5696753263473511, + "learning_rate": 2.1959762919252885e-06, + "loss": 0.3314, + "step": 7700 + }, + { + "epoch": 2.161987647389107, + "grad_norm": 0.5382516384124756, + "learning_rate": 2.1946241094218176e-06, + "loss": 0.3257, + "step": 7701 + }, + { + "epoch": 2.162268388545761, + "grad_norm": 0.567698061466217, + "learning_rate": 2.1932722262906133e-06, + "loss": 0.3453, + "step": 7702 + }, + { + "epoch": 2.1625491297024144, + "grad_norm": 0.5876367688179016, + "learning_rate": 2.1919206426759453e-06, + "loss": 0.2999, + "step": 7703 + }, + { + "epoch": 2.162829870859068, + "grad_norm": 0.580852210521698, + "learning_rate": 2.190569358722042e-06, + "loss": 0.3096, + "step": 7704 + }, + { + "epoch": 2.1631106120157213, + "grad_norm": 0.598149836063385, + "learning_rate": 2.189218374573108e-06, + "loss": 0.3172, + "step": 7705 + }, + { + "epoch": 2.163391353172375, + "grad_norm": 0.6089771389961243, + "learning_rate": 2.1878676903733088e-06, + "loss": 0.3641, + "step": 7706 + }, + { + "epoch": 2.1636720943290286, + "grad_norm": 0.6105449199676514, + "learning_rate": 2.1865173062667832e-06, + "loss": 0.3414, + "step": 7707 + }, + { + "epoch": 2.163952835485682, + "grad_norm": 0.5699039697647095, + "learning_rate": 2.1851672223976355e-06, + "loss": 0.2988, + "step": 7708 + }, + { + "epoch": 2.164233576642336, + "grad_norm": 0.5573256015777588, + "learning_rate": 2.183817438909935e-06, + "loss": 0.351, + "step": 7709 + }, + { + "epoch": 2.1645143177989894, + "grad_norm": 0.6350598931312561, + "learning_rate": 2.1824679559477267e-06, + "loss": 0.3197, + "step": 7710 + }, + { + "epoch": 2.164795058955643, + "grad_norm": 0.6099666953086853, + "learning_rate": 2.1811187736550156e-06, + "loss": 0.3115, + "step": 7711 + }, + { + "epoch": 2.1650758001122963, + "grad_norm": 0.5796921849250793, + "learning_rate": 2.179769892175781e-06, + "loss": 0.3097, + "step": 7712 + }, + { + "epoch": 2.16535654126895, + "grad_norm": 0.6009588241577148, + "learning_rate": 2.178421311653966e-06, + "loss": 0.273, + "step": 7713 + }, + { + "epoch": 2.1656372824256036, + "grad_norm": 0.5405430793762207, + "learning_rate": 2.17707303223348e-06, + "loss": 0.3355, + "step": 7714 + }, + { + "epoch": 2.165918023582257, + "grad_norm": 0.6128930449485779, + "learning_rate": 2.1757250540582075e-06, + "loss": 0.3118, + "step": 7715 + }, + { + "epoch": 2.166198764738911, + "grad_norm": 0.5836557745933533, + "learning_rate": 2.1743773772719927e-06, + "loss": 0.3437, + "step": 7716 + }, + { + "epoch": 2.1664795058955644, + "grad_norm": 0.6024447679519653, + "learning_rate": 2.173030002018654e-06, + "loss": 0.2688, + "step": 7717 + }, + { + "epoch": 2.166760247052218, + "grad_norm": 0.6296533942222595, + "learning_rate": 2.1716829284419727e-06, + "loss": 0.3088, + "step": 7718 + }, + { + "epoch": 2.1670409882088713, + "grad_norm": 0.6000844240188599, + "learning_rate": 2.1703361566857025e-06, + "loss": 0.3062, + "step": 7719 + }, + { + "epoch": 2.167321729365525, + "grad_norm": 0.5442104339599609, + "learning_rate": 2.1689896868935613e-06, + "loss": 0.3364, + "step": 7720 + }, + { + "epoch": 2.1676024705221786, + "grad_norm": 0.592156171798706, + "learning_rate": 2.167643519209233e-06, + "loss": 0.3387, + "step": 7721 + }, + { + "epoch": 2.167883211678832, + "grad_norm": 0.6309889554977417, + "learning_rate": 2.166297653776378e-06, + "loss": 0.2709, + "step": 7722 + }, + { + "epoch": 2.168163952835486, + "grad_norm": 0.599522590637207, + "learning_rate": 2.164952090738614e-06, + "loss": 0.3284, + "step": 7723 + }, + { + "epoch": 2.1684446939921393, + "grad_norm": 0.5739444494247437, + "learning_rate": 2.1636068302395324e-06, + "loss": 0.3244, + "step": 7724 + }, + { + "epoch": 2.168725435148793, + "grad_norm": 0.6328409910202026, + "learning_rate": 2.1622618724226934e-06, + "loss": 0.3425, + "step": 7725 + }, + { + "epoch": 2.1690061763054462, + "grad_norm": 0.5877583026885986, + "learning_rate": 2.1609172174316188e-06, + "loss": 0.3225, + "step": 7726 + }, + { + "epoch": 2.1692869174621, + "grad_norm": 0.5428234934806824, + "learning_rate": 2.1595728654098057e-06, + "loss": 0.3717, + "step": 7727 + }, + { + "epoch": 2.1695676586187536, + "grad_norm": 0.5548840761184692, + "learning_rate": 2.158228816500711e-06, + "loss": 0.3677, + "step": 7728 + }, + { + "epoch": 2.169848399775407, + "grad_norm": 0.5127007961273193, + "learning_rate": 2.1568850708477672e-06, + "loss": 0.3156, + "step": 7729 + }, + { + "epoch": 2.1701291409320604, + "grad_norm": 0.5930838584899902, + "learning_rate": 2.1555416285943683e-06, + "loss": 0.3291, + "step": 7730 + }, + { + "epoch": 2.1704098820887143, + "grad_norm": 0.5415441989898682, + "learning_rate": 2.1541984898838763e-06, + "loss": 0.3561, + "step": 7731 + }, + { + "epoch": 2.1706906232453678, + "grad_norm": 0.5847387909889221, + "learning_rate": 2.152855654859627e-06, + "loss": 0.3848, + "step": 7732 + }, + { + "epoch": 2.170971364402021, + "grad_norm": 0.5580772757530212, + "learning_rate": 2.1515131236649144e-06, + "loss": 0.3037, + "step": 7733 + }, + { + "epoch": 2.171252105558675, + "grad_norm": 0.5752868056297302, + "learning_rate": 2.15017089644301e-06, + "loss": 0.3629, + "step": 7734 + }, + { + "epoch": 2.1715328467153285, + "grad_norm": 0.6275718808174133, + "learning_rate": 2.1488289733371454e-06, + "loss": 0.3029, + "step": 7735 + }, + { + "epoch": 2.171813587871982, + "grad_norm": 0.5496490597724915, + "learning_rate": 2.1474873544905204e-06, + "loss": 0.3288, + "step": 7736 + }, + { + "epoch": 2.1720943290286354, + "grad_norm": 0.5766544342041016, + "learning_rate": 2.1461460400463084e-06, + "loss": 0.336, + "step": 7737 + }, + { + "epoch": 2.1723750701852893, + "grad_norm": 0.596799373626709, + "learning_rate": 2.1448050301476425e-06, + "loss": 0.3326, + "step": 7738 + }, + { + "epoch": 2.1726558113419427, + "grad_norm": 0.5938395857810974, + "learning_rate": 2.1434643249376304e-06, + "loss": 0.3374, + "step": 7739 + }, + { + "epoch": 2.172936552498596, + "grad_norm": 0.5244588851928711, + "learning_rate": 2.14212392455934e-06, + "loss": 0.3107, + "step": 7740 + }, + { + "epoch": 2.1732172936552496, + "grad_norm": 0.5750537514686584, + "learning_rate": 2.140783829155814e-06, + "loss": 0.3297, + "step": 7741 + }, + { + "epoch": 2.1734980348119035, + "grad_norm": 0.5554062724113464, + "learning_rate": 2.139444038870057e-06, + "loss": 0.336, + "step": 7742 + }, + { + "epoch": 2.173778775968557, + "grad_norm": 0.5660613775253296, + "learning_rate": 2.138104553845043e-06, + "loss": 0.323, + "step": 7743 + }, + { + "epoch": 2.1740595171252104, + "grad_norm": 0.6504146456718445, + "learning_rate": 2.136765374223715e-06, + "loss": 0.3416, + "step": 7744 + }, + { + "epoch": 2.1743402582818643, + "grad_norm": 0.532696008682251, + "learning_rate": 2.1354265001489795e-06, + "loss": 0.3495, + "step": 7745 + }, + { + "epoch": 2.1746209994385177, + "grad_norm": 0.6625186800956726, + "learning_rate": 2.1340879317637154e-06, + "loss": 0.3098, + "step": 7746 + }, + { + "epoch": 2.174901740595171, + "grad_norm": 0.620116651058197, + "learning_rate": 2.132749669210766e-06, + "loss": 0.3671, + "step": 7747 + }, + { + "epoch": 2.1751824817518246, + "grad_norm": 0.6086158156394958, + "learning_rate": 2.1314117126329375e-06, + "loss": 0.3115, + "step": 7748 + }, + { + "epoch": 2.1754632229084785, + "grad_norm": 0.6067386269569397, + "learning_rate": 2.1300740621730165e-06, + "loss": 0.3228, + "step": 7749 + }, + { + "epoch": 2.175743964065132, + "grad_norm": 0.6013914346694946, + "learning_rate": 2.128736717973743e-06, + "loss": 0.3161, + "step": 7750 + }, + { + "epoch": 2.1760247052217854, + "grad_norm": 0.5442312359809875, + "learning_rate": 2.1273996801778336e-06, + "loss": 0.3615, + "step": 7751 + }, + { + "epoch": 2.1763054463784393, + "grad_norm": 0.6353532671928406, + "learning_rate": 2.1260629489279662e-06, + "loss": 0.3188, + "step": 7752 + }, + { + "epoch": 2.1765861875350927, + "grad_norm": 0.6379748582839966, + "learning_rate": 2.1247265243667875e-06, + "loss": 0.3386, + "step": 7753 + }, + { + "epoch": 2.176866928691746, + "grad_norm": 0.6232293844223022, + "learning_rate": 2.123390406636916e-06, + "loss": 0.2794, + "step": 7754 + }, + { + "epoch": 2.1771476698483996, + "grad_norm": 0.557019829750061, + "learning_rate": 2.12205459588093e-06, + "loss": 0.3874, + "step": 7755 + }, + { + "epoch": 2.1774284110050535, + "grad_norm": 0.583795428276062, + "learning_rate": 2.1207190922413825e-06, + "loss": 0.3471, + "step": 7756 + }, + { + "epoch": 2.177709152161707, + "grad_norm": 0.5697951912879944, + "learning_rate": 2.119383895860788e-06, + "loss": 0.3513, + "step": 7757 + }, + { + "epoch": 2.1779898933183603, + "grad_norm": 0.5671022534370422, + "learning_rate": 2.1180490068816296e-06, + "loss": 0.3582, + "step": 7758 + }, + { + "epoch": 2.1782706344750142, + "grad_norm": 0.5471994280815125, + "learning_rate": 2.116714425446361e-06, + "loss": 0.3537, + "step": 7759 + }, + { + "epoch": 2.1785513756316677, + "grad_norm": 0.5638909339904785, + "learning_rate": 2.1153801516973976e-06, + "loss": 0.3138, + "step": 7760 + }, + { + "epoch": 2.178832116788321, + "grad_norm": 0.602271318435669, + "learning_rate": 2.114046185777128e-06, + "loss": 0.3198, + "step": 7761 + }, + { + "epoch": 2.1791128579449746, + "grad_norm": 0.5836250185966492, + "learning_rate": 2.1127125278279005e-06, + "loss": 0.3459, + "step": 7762 + }, + { + "epoch": 2.1793935991016284, + "grad_norm": 0.5795777440071106, + "learning_rate": 2.111379177992039e-06, + "loss": 0.3149, + "step": 7763 + }, + { + "epoch": 2.179674340258282, + "grad_norm": 0.590065598487854, + "learning_rate": 2.1100461364118285e-06, + "loss": 0.3654, + "step": 7764 + }, + { + "epoch": 2.1799550814149353, + "grad_norm": 0.6122351288795471, + "learning_rate": 2.1087134032295208e-06, + "loss": 0.3374, + "step": 7765 + }, + { + "epoch": 2.180235822571589, + "grad_norm": 0.5787692666053772, + "learning_rate": 2.1073809785873417e-06, + "loss": 0.3714, + "step": 7766 + }, + { + "epoch": 2.1805165637282427, + "grad_norm": 0.540298342704773, + "learning_rate": 2.106048862627474e-06, + "loss": 0.3275, + "step": 7767 + }, + { + "epoch": 2.180797304884896, + "grad_norm": 0.5755876302719116, + "learning_rate": 2.1047170554920775e-06, + "loss": 0.3117, + "step": 7768 + }, + { + "epoch": 2.1810780460415495, + "grad_norm": 0.5442964434623718, + "learning_rate": 2.103385557323272e-06, + "loss": 0.3087, + "step": 7769 + }, + { + "epoch": 2.1813587871982034, + "grad_norm": 0.6338362097740173, + "learning_rate": 2.1020543682631454e-06, + "loss": 0.3017, + "step": 7770 + }, + { + "epoch": 2.181639528354857, + "grad_norm": 0.6212226748466492, + "learning_rate": 2.1007234884537574e-06, + "loss": 0.3292, + "step": 7771 + }, + { + "epoch": 2.1819202695115103, + "grad_norm": 0.5792097449302673, + "learning_rate": 2.0993929180371277e-06, + "loss": 0.3172, + "step": 7772 + }, + { + "epoch": 2.182201010668164, + "grad_norm": 0.59674072265625, + "learning_rate": 2.0980626571552474e-06, + "loss": 0.331, + "step": 7773 + }, + { + "epoch": 2.1824817518248176, + "grad_norm": 0.5870463252067566, + "learning_rate": 2.0967327059500763e-06, + "loss": 0.31, + "step": 7774 + }, + { + "epoch": 2.182762492981471, + "grad_norm": 0.5281668305397034, + "learning_rate": 2.0954030645635352e-06, + "loss": 0.3677, + "step": 7775 + }, + { + "epoch": 2.1830432341381245, + "grad_norm": 0.5943126082420349, + "learning_rate": 2.0940737331375182e-06, + "loss": 0.3244, + "step": 7776 + }, + { + "epoch": 2.1833239752947784, + "grad_norm": 0.5884861946105957, + "learning_rate": 2.09274471181388e-06, + "loss": 0.2964, + "step": 7777 + }, + { + "epoch": 2.183604716451432, + "grad_norm": 0.5693631768226624, + "learning_rate": 2.0914160007344487e-06, + "loss": 0.3297, + "step": 7778 + }, + { + "epoch": 2.1838854576080853, + "grad_norm": 0.6288324594497681, + "learning_rate": 2.090087600041014e-06, + "loss": 0.3094, + "step": 7779 + }, + { + "epoch": 2.1841661987647387, + "grad_norm": 0.5757323503494263, + "learning_rate": 2.0887595098753342e-06, + "loss": 0.3147, + "step": 7780 + }, + { + "epoch": 2.1844469399213926, + "grad_norm": 0.5840219855308533, + "learning_rate": 2.087431730379137e-06, + "loss": 0.3291, + "step": 7781 + }, + { + "epoch": 2.184727681078046, + "grad_norm": 0.5445331335067749, + "learning_rate": 2.0861042616941117e-06, + "loss": 0.3535, + "step": 7782 + }, + { + "epoch": 2.1850084222346995, + "grad_norm": 0.5671808123588562, + "learning_rate": 2.0847771039619215e-06, + "loss": 0.3084, + "step": 7783 + }, + { + "epoch": 2.1852891633913534, + "grad_norm": 0.6456224918365479, + "learning_rate": 2.08345025732419e-06, + "loss": 0.3262, + "step": 7784 + }, + { + "epoch": 2.185569904548007, + "grad_norm": 0.6174296140670776, + "learning_rate": 2.082123721922508e-06, + "loss": 0.3243, + "step": 7785 + }, + { + "epoch": 2.1858506457046603, + "grad_norm": 0.5586106777191162, + "learning_rate": 2.080797497898439e-06, + "loss": 0.32, + "step": 7786 + }, + { + "epoch": 2.1861313868613137, + "grad_norm": 0.5374150276184082, + "learning_rate": 2.0794715853935064e-06, + "loss": 0.3055, + "step": 7787 + }, + { + "epoch": 2.1864121280179676, + "grad_norm": 0.5498889684677124, + "learning_rate": 2.0781459845492064e-06, + "loss": 0.3391, + "step": 7788 + }, + { + "epoch": 2.186692869174621, + "grad_norm": 0.5667555928230286, + "learning_rate": 2.0768206955069953e-06, + "loss": 0.3288, + "step": 7789 + }, + { + "epoch": 2.1869736103312745, + "grad_norm": 0.5854453444480896, + "learning_rate": 2.0754957184083036e-06, + "loss": 0.308, + "step": 7790 + }, + { + "epoch": 2.187254351487928, + "grad_norm": 0.5819854736328125, + "learning_rate": 2.074171053394522e-06, + "loss": 0.2941, + "step": 7791 + }, + { + "epoch": 2.187535092644582, + "grad_norm": 0.6175342798233032, + "learning_rate": 2.0728467006070095e-06, + "loss": 0.3443, + "step": 7792 + }, + { + "epoch": 2.1878158338012352, + "grad_norm": 0.5961562395095825, + "learning_rate": 2.0715226601870956e-06, + "loss": 0.3465, + "step": 7793 + }, + { + "epoch": 2.1880965749578887, + "grad_norm": 0.5794124007225037, + "learning_rate": 2.0701989322760714e-06, + "loss": 0.2745, + "step": 7794 + }, + { + "epoch": 2.1883773161145426, + "grad_norm": 0.6212928891181946, + "learning_rate": 2.0688755170152e-06, + "loss": 0.3152, + "step": 7795 + }, + { + "epoch": 2.188658057271196, + "grad_norm": 0.6221616864204407, + "learning_rate": 2.067552414545705e-06, + "loss": 0.32, + "step": 7796 + }, + { + "epoch": 2.1889387984278494, + "grad_norm": 0.5288687348365784, + "learning_rate": 2.0662296250087795e-06, + "loss": 0.3123, + "step": 7797 + }, + { + "epoch": 2.189219539584503, + "grad_norm": 0.5770567059516907, + "learning_rate": 2.064907148545586e-06, + "loss": 0.326, + "step": 7798 + }, + { + "epoch": 2.1895002807411568, + "grad_norm": 0.6000266075134277, + "learning_rate": 2.0635849852972456e-06, + "loss": 0.3185, + "step": 7799 + }, + { + "epoch": 2.18978102189781, + "grad_norm": 0.5692927241325378, + "learning_rate": 2.062263135404859e-06, + "loss": 0.3621, + "step": 7800 + }, + { + "epoch": 2.1900617630544637, + "grad_norm": 0.5376588702201843, + "learning_rate": 2.0609415990094824e-06, + "loss": 0.3309, + "step": 7801 + }, + { + "epoch": 2.1903425042111175, + "grad_norm": 0.65265291929245, + "learning_rate": 2.0596203762521392e-06, + "loss": 0.3343, + "step": 7802 + }, + { + "epoch": 2.190623245367771, + "grad_norm": 0.585486888885498, + "learning_rate": 2.0582994672738264e-06, + "loss": 0.294, + "step": 7803 + }, + { + "epoch": 2.1909039865244244, + "grad_norm": 0.5621030926704407, + "learning_rate": 2.0569788722155e-06, + "loss": 0.3274, + "step": 7804 + }, + { + "epoch": 2.191184727681078, + "grad_norm": 0.6017136573791504, + "learning_rate": 2.0556585912180876e-06, + "loss": 0.3217, + "step": 7805 + }, + { + "epoch": 2.1914654688377317, + "grad_norm": 0.5663872957229614, + "learning_rate": 2.0543386244224817e-06, + "loss": 0.3111, + "step": 7806 + }, + { + "epoch": 2.191746209994385, + "grad_norm": 0.5912230610847473, + "learning_rate": 2.0530189719695376e-06, + "loss": 0.3277, + "step": 7807 + }, + { + "epoch": 2.1920269511510386, + "grad_norm": 0.5351399183273315, + "learning_rate": 2.051699634000085e-06, + "loss": 0.3316, + "step": 7808 + }, + { + "epoch": 2.1923076923076925, + "grad_norm": 0.6377553939819336, + "learning_rate": 2.0503806106549107e-06, + "loss": 0.2905, + "step": 7809 + }, + { + "epoch": 2.192588433464346, + "grad_norm": 0.6106664538383484, + "learning_rate": 2.0490619020747774e-06, + "loss": 0.362, + "step": 7810 + }, + { + "epoch": 2.1928691746209994, + "grad_norm": 0.6014419794082642, + "learning_rate": 2.0477435084004053e-06, + "loss": 0.3071, + "step": 7811 + }, + { + "epoch": 2.193149915777653, + "grad_norm": 0.5704319477081299, + "learning_rate": 2.0464254297724896e-06, + "loss": 0.3366, + "step": 7812 + }, + { + "epoch": 2.1934306569343067, + "grad_norm": 0.568524956703186, + "learning_rate": 2.0451076663316843e-06, + "loss": 0.3377, + "step": 7813 + }, + { + "epoch": 2.19371139809096, + "grad_norm": 0.6283813714981079, + "learning_rate": 2.0437902182186113e-06, + "loss": 0.3569, + "step": 7814 + }, + { + "epoch": 2.1939921392476136, + "grad_norm": 0.5840946435928345, + "learning_rate": 2.0424730855738657e-06, + "loss": 0.3769, + "step": 7815 + }, + { + "epoch": 2.1942728804042675, + "grad_norm": 0.5546283721923828, + "learning_rate": 2.0411562685379983e-06, + "loss": 0.3305, + "step": 7816 + }, + { + "epoch": 2.194553621560921, + "grad_norm": 0.5784397721290588, + "learning_rate": 2.0398397672515356e-06, + "loss": 0.3101, + "step": 7817 + }, + { + "epoch": 2.1948343627175744, + "grad_norm": 0.5585423111915588, + "learning_rate": 2.0385235818549653e-06, + "loss": 0.3996, + "step": 7818 + }, + { + "epoch": 2.195115103874228, + "grad_norm": 0.5485456585884094, + "learning_rate": 2.0372077124887407e-06, + "loss": 0.3324, + "step": 7819 + }, + { + "epoch": 2.1953958450308817, + "grad_norm": 0.5490700602531433, + "learning_rate": 2.0358921592932855e-06, + "loss": 0.3688, + "step": 7820 + }, + { + "epoch": 2.195676586187535, + "grad_norm": 0.5394091010093689, + "learning_rate": 2.0345769224089855e-06, + "loss": 0.3234, + "step": 7821 + }, + { + "epoch": 2.1959573273441886, + "grad_norm": 0.5372628569602966, + "learning_rate": 2.033262001976197e-06, + "loss": 0.3462, + "step": 7822 + }, + { + "epoch": 2.196238068500842, + "grad_norm": 0.6373228430747986, + "learning_rate": 2.031947398135237e-06, + "loss": 0.3556, + "step": 7823 + }, + { + "epoch": 2.196518809657496, + "grad_norm": 0.5175253748893738, + "learning_rate": 2.030633111026394e-06, + "loss": 0.3201, + "step": 7824 + }, + { + "epoch": 2.1967995508141493, + "grad_norm": 0.6014914512634277, + "learning_rate": 2.0293191407899215e-06, + "loss": 0.327, + "step": 7825 + }, + { + "epoch": 2.197080291970803, + "grad_norm": 0.546654462814331, + "learning_rate": 2.0280054875660353e-06, + "loss": 0.3284, + "step": 7826 + }, + { + "epoch": 2.1973610331274567, + "grad_norm": 0.5931504368782043, + "learning_rate": 2.0266921514949235e-06, + "loss": 0.3323, + "step": 7827 + }, + { + "epoch": 2.19764177428411, + "grad_norm": 0.6058072447776794, + "learning_rate": 2.0253791327167354e-06, + "loss": 0.3245, + "step": 7828 + }, + { + "epoch": 2.1979225154407636, + "grad_norm": 0.6033126711845398, + "learning_rate": 2.0240664313715863e-06, + "loss": 0.3263, + "step": 7829 + }, + { + "epoch": 2.198203256597417, + "grad_norm": 0.5586051344871521, + "learning_rate": 2.022754047599564e-06, + "loss": 0.2928, + "step": 7830 + }, + { + "epoch": 2.198483997754071, + "grad_norm": 0.5582164525985718, + "learning_rate": 2.021441981540713e-06, + "loss": 0.3319, + "step": 7831 + }, + { + "epoch": 2.1987647389107243, + "grad_norm": 0.5959237217903137, + "learning_rate": 2.020130233335054e-06, + "loss": 0.319, + "step": 7832 + }, + { + "epoch": 2.1990454800673778, + "grad_norm": 0.5489107370376587, + "learning_rate": 2.0188188031225646e-06, + "loss": 0.3161, + "step": 7833 + }, + { + "epoch": 2.199326221224031, + "grad_norm": 0.6317627429962158, + "learning_rate": 2.017507691043193e-06, + "loss": 0.3051, + "step": 7834 + }, + { + "epoch": 2.199606962380685, + "grad_norm": 0.5487210154533386, + "learning_rate": 2.016196897236855e-06, + "loss": 0.3183, + "step": 7835 + }, + { + "epoch": 2.1998877035373385, + "grad_norm": 0.5515868067741394, + "learning_rate": 2.0148864218434277e-06, + "loss": 0.3095, + "step": 7836 + }, + { + "epoch": 2.200168444693992, + "grad_norm": 0.5675533413887024, + "learning_rate": 2.0135762650027595e-06, + "loss": 0.3662, + "step": 7837 + }, + { + "epoch": 2.200449185850646, + "grad_norm": 0.5844115018844604, + "learning_rate": 2.01226642685466e-06, + "loss": 0.3302, + "step": 7838 + }, + { + "epoch": 2.2007299270072993, + "grad_norm": 0.6632570624351501, + "learning_rate": 2.010956907538909e-06, + "loss": 0.3138, + "step": 7839 + }, + { + "epoch": 2.2010106681639527, + "grad_norm": 0.6090406179428101, + "learning_rate": 2.0096477071952493e-06, + "loss": 0.3114, + "step": 7840 + }, + { + "epoch": 2.201291409320606, + "grad_norm": 0.7017105221748352, + "learning_rate": 2.008338825963389e-06, + "loss": 0.342, + "step": 7841 + }, + { + "epoch": 2.20157215047726, + "grad_norm": 0.6197972893714905, + "learning_rate": 2.007030263983007e-06, + "loss": 0.2941, + "step": 7842 + }, + { + "epoch": 2.2018528916339135, + "grad_norm": 0.5869395732879639, + "learning_rate": 2.005722021393741e-06, + "loss": 0.3029, + "step": 7843 + }, + { + "epoch": 2.202133632790567, + "grad_norm": 0.5609723925590515, + "learning_rate": 2.004414098335203e-06, + "loss": 0.3338, + "step": 7844 + }, + { + "epoch": 2.202414373947221, + "grad_norm": 0.6342980861663818, + "learning_rate": 2.003106494946962e-06, + "loss": 0.3352, + "step": 7845 + }, + { + "epoch": 2.2026951151038743, + "grad_norm": 0.6291910409927368, + "learning_rate": 2.0017992113685612e-06, + "loss": 0.3748, + "step": 7846 + }, + { + "epoch": 2.2029758562605277, + "grad_norm": 0.5410831570625305, + "learning_rate": 2.0004922477395043e-06, + "loss": 0.321, + "step": 7847 + }, + { + "epoch": 2.203256597417181, + "grad_norm": 0.5877004265785217, + "learning_rate": 1.9991856041992604e-06, + "loss": 0.3111, + "step": 7848 + }, + { + "epoch": 2.203537338573835, + "grad_norm": 0.6012861132621765, + "learning_rate": 1.997879280887269e-06, + "loss": 0.2995, + "step": 7849 + }, + { + "epoch": 2.2038180797304885, + "grad_norm": 0.5536375045776367, + "learning_rate": 1.9965732779429302e-06, + "loss": 0.3011, + "step": 7850 + }, + { + "epoch": 2.204098820887142, + "grad_norm": 0.5814211964607239, + "learning_rate": 1.9952675955056144e-06, + "loss": 0.3227, + "step": 7851 + }, + { + "epoch": 2.204379562043796, + "grad_norm": 0.5739724040031433, + "learning_rate": 1.9939622337146574e-06, + "loss": 0.324, + "step": 7852 + }, + { + "epoch": 2.2046603032004493, + "grad_norm": 0.5835128426551819, + "learning_rate": 1.992657192709356e-06, + "loss": 0.3203, + "step": 7853 + }, + { + "epoch": 2.2049410443571027, + "grad_norm": 0.5871363878250122, + "learning_rate": 1.9913524726289784e-06, + "loss": 0.3051, + "step": 7854 + }, + { + "epoch": 2.205221785513756, + "grad_norm": 0.5754022598266602, + "learning_rate": 1.9900480736127557e-06, + "loss": 0.3118, + "step": 7855 + }, + { + "epoch": 2.20550252667041, + "grad_norm": 0.5595225095748901, + "learning_rate": 1.988743995799884e-06, + "loss": 0.3405, + "step": 7856 + }, + { + "epoch": 2.2057832678270635, + "grad_norm": 0.6285916566848755, + "learning_rate": 1.987440239329528e-06, + "loss": 0.3072, + "step": 7857 + }, + { + "epoch": 2.206064008983717, + "grad_norm": 0.575323760509491, + "learning_rate": 1.9861368043408147e-06, + "loss": 0.3595, + "step": 7858 + }, + { + "epoch": 2.206344750140371, + "grad_norm": 0.6051434874534607, + "learning_rate": 1.984833690972841e-06, + "loss": 0.2976, + "step": 7859 + }, + { + "epoch": 2.2066254912970242, + "grad_norm": 0.6305602192878723, + "learning_rate": 1.9835308993646647e-06, + "loss": 0.3218, + "step": 7860 + }, + { + "epoch": 2.2069062324536777, + "grad_norm": 0.5705970525741577, + "learning_rate": 1.9822284296553133e-06, + "loss": 0.3523, + "step": 7861 + }, + { + "epoch": 2.207186973610331, + "grad_norm": 0.6255747079849243, + "learning_rate": 1.9809262819837783e-06, + "loss": 0.2904, + "step": 7862 + }, + { + "epoch": 2.207467714766985, + "grad_norm": 0.5619363188743591, + "learning_rate": 1.9796244564890144e-06, + "loss": 0.3448, + "step": 7863 + }, + { + "epoch": 2.2077484559236384, + "grad_norm": 0.5439637899398804, + "learning_rate": 1.9783229533099475e-06, + "loss": 0.3033, + "step": 7864 + }, + { + "epoch": 2.208029197080292, + "grad_norm": 0.50906902551651, + "learning_rate": 1.977021772585463e-06, + "loss": 0.2966, + "step": 7865 + }, + { + "epoch": 2.2083099382369458, + "grad_norm": 0.561398446559906, + "learning_rate": 1.9757209144544178e-06, + "loss": 0.3458, + "step": 7866 + }, + { + "epoch": 2.208590679393599, + "grad_norm": 0.5879591107368469, + "learning_rate": 1.9744203790556298e-06, + "loss": 0.3372, + "step": 7867 + }, + { + "epoch": 2.2088714205502527, + "grad_norm": 0.6072366833686829, + "learning_rate": 1.9731201665278823e-06, + "loss": 0.3513, + "step": 7868 + }, + { + "epoch": 2.209152161706906, + "grad_norm": 0.5826284289360046, + "learning_rate": 1.97182027700993e-06, + "loss": 0.2988, + "step": 7869 + }, + { + "epoch": 2.20943290286356, + "grad_norm": 0.5746475458145142, + "learning_rate": 1.970520710640484e-06, + "loss": 0.2696, + "step": 7870 + }, + { + "epoch": 2.2097136440202134, + "grad_norm": 0.566821277141571, + "learning_rate": 1.969221467558231e-06, + "loss": 0.33, + "step": 7871 + }, + { + "epoch": 2.209994385176867, + "grad_norm": 0.6455053091049194, + "learning_rate": 1.9679225479018144e-06, + "loss": 0.3265, + "step": 7872 + }, + { + "epoch": 2.2102751263335203, + "grad_norm": 0.6207641959190369, + "learning_rate": 1.9666239518098496e-06, + "loss": 0.3492, + "step": 7873 + }, + { + "epoch": 2.210555867490174, + "grad_norm": 0.5575459599494934, + "learning_rate": 1.965325679420913e-06, + "loss": 0.3419, + "step": 7874 + }, + { + "epoch": 2.2108366086468276, + "grad_norm": 0.6213704943656921, + "learning_rate": 1.9640277308735457e-06, + "loss": 0.3427, + "step": 7875 + }, + { + "epoch": 2.211117349803481, + "grad_norm": 0.6364071369171143, + "learning_rate": 1.9627301063062626e-06, + "loss": 0.2843, + "step": 7876 + }, + { + "epoch": 2.211398090960135, + "grad_norm": 0.6064237356185913, + "learning_rate": 1.9614328058575347e-06, + "loss": 0.3346, + "step": 7877 + }, + { + "epoch": 2.2116788321167884, + "grad_norm": 0.5435418486595154, + "learning_rate": 1.9601358296658013e-06, + "loss": 0.3399, + "step": 7878 + }, + { + "epoch": 2.211959573273442, + "grad_norm": 0.6605810523033142, + "learning_rate": 1.95883917786947e-06, + "loss": 0.3099, + "step": 7879 + }, + { + "epoch": 2.2122403144300953, + "grad_norm": 0.5825643539428711, + "learning_rate": 1.9575428506069077e-06, + "loss": 0.3189, + "step": 7880 + }, + { + "epoch": 2.212521055586749, + "grad_norm": 0.611756443977356, + "learning_rate": 1.956246848016454e-06, + "loss": 0.3382, + "step": 7881 + }, + { + "epoch": 2.2128017967434026, + "grad_norm": 0.600749135017395, + "learning_rate": 1.954951170236408e-06, + "loss": 0.2971, + "step": 7882 + }, + { + "epoch": 2.213082537900056, + "grad_norm": 0.6075959801673889, + "learning_rate": 1.9536558174050386e-06, + "loss": 0.3113, + "step": 7883 + }, + { + "epoch": 2.2133632790567095, + "grad_norm": 0.5713773369789124, + "learning_rate": 1.952360789660576e-06, + "loss": 0.3589, + "step": 7884 + }, + { + "epoch": 2.2136440202133634, + "grad_norm": 0.542594850063324, + "learning_rate": 1.9510660871412167e-06, + "loss": 0.3796, + "step": 7885 + }, + { + "epoch": 2.213924761370017, + "grad_norm": 0.5956571698188782, + "learning_rate": 1.9497717099851266e-06, + "loss": 0.3047, + "step": 7886 + }, + { + "epoch": 2.2142055025266703, + "grad_norm": 0.5386093258857727, + "learning_rate": 1.948477658330429e-06, + "loss": 0.3322, + "step": 7887 + }, + { + "epoch": 2.214486243683324, + "grad_norm": 0.5793008208274841, + "learning_rate": 1.9471839323152215e-06, + "loss": 0.3265, + "step": 7888 + }, + { + "epoch": 2.2147669848399776, + "grad_norm": 0.5399921536445618, + "learning_rate": 1.9458905320775605e-06, + "loss": 0.3034, + "step": 7889 + }, + { + "epoch": 2.215047725996631, + "grad_norm": 0.5770264863967896, + "learning_rate": 1.944597457755468e-06, + "loss": 0.2891, + "step": 7890 + }, + { + "epoch": 2.2153284671532845, + "grad_norm": 0.6316906809806824, + "learning_rate": 1.943304709486936e-06, + "loss": 0.3453, + "step": 7891 + }, + { + "epoch": 2.2156092083099383, + "grad_norm": 0.5945931673049927, + "learning_rate": 1.9420122874099157e-06, + "loss": 0.3582, + "step": 7892 + }, + { + "epoch": 2.215889949466592, + "grad_norm": 0.5706280469894409, + "learning_rate": 1.9407201916623293e-06, + "loss": 0.3589, + "step": 7893 + }, + { + "epoch": 2.2161706906232452, + "grad_norm": 0.6127965450286865, + "learning_rate": 1.939428422382058e-06, + "loss": 0.3373, + "step": 7894 + }, + { + "epoch": 2.216451431779899, + "grad_norm": 0.5266435146331787, + "learning_rate": 1.938136979706955e-06, + "loss": 0.333, + "step": 7895 + }, + { + "epoch": 2.2167321729365526, + "grad_norm": 0.5792006850242615, + "learning_rate": 1.9368458637748326e-06, + "loss": 0.3474, + "step": 7896 + }, + { + "epoch": 2.217012914093206, + "grad_norm": 0.6186229586601257, + "learning_rate": 1.935555074723469e-06, + "loss": 0.318, + "step": 7897 + }, + { + "epoch": 2.2172936552498594, + "grad_norm": 0.5567249059677124, + "learning_rate": 1.934264612690614e-06, + "loss": 0.3207, + "step": 7898 + }, + { + "epoch": 2.2175743964065133, + "grad_norm": 0.5644236207008362, + "learning_rate": 1.9329744778139724e-06, + "loss": 0.3198, + "step": 7899 + }, + { + "epoch": 2.2178551375631668, + "grad_norm": 0.5268715620040894, + "learning_rate": 1.931684670231225e-06, + "loss": 0.3278, + "step": 7900 + }, + { + "epoch": 2.21813587871982, + "grad_norm": 0.5538960695266724, + "learning_rate": 1.930395190080006e-06, + "loss": 0.3042, + "step": 7901 + }, + { + "epoch": 2.218416619876474, + "grad_norm": 0.6099576950073242, + "learning_rate": 1.9291060374979237e-06, + "loss": 0.3613, + "step": 7902 + }, + { + "epoch": 2.2186973610331275, + "grad_norm": 0.5823920369148254, + "learning_rate": 1.9278172126225508e-06, + "loss": 0.3116, + "step": 7903 + }, + { + "epoch": 2.218978102189781, + "grad_norm": 0.5380644202232361, + "learning_rate": 1.92652871559142e-06, + "loss": 0.3243, + "step": 7904 + }, + { + "epoch": 2.2192588433464344, + "grad_norm": 0.61611008644104, + "learning_rate": 1.9252405465420305e-06, + "loss": 0.3205, + "step": 7905 + }, + { + "epoch": 2.2195395845030883, + "grad_norm": 0.585796058177948, + "learning_rate": 1.923952705611851e-06, + "loss": 0.2994, + "step": 7906 + }, + { + "epoch": 2.2198203256597417, + "grad_norm": 0.5764893889427185, + "learning_rate": 1.9226651929383077e-06, + "loss": 0.3397, + "step": 7907 + }, + { + "epoch": 2.220101066816395, + "grad_norm": 0.5012286305427551, + "learning_rate": 1.921378008658801e-06, + "loss": 0.3412, + "step": 7908 + }, + { + "epoch": 2.220381807973049, + "grad_norm": 0.6187290549278259, + "learning_rate": 1.920091152910686e-06, + "loss": 0.3058, + "step": 7909 + }, + { + "epoch": 2.2206625491297025, + "grad_norm": 0.628390908241272, + "learning_rate": 1.9188046258312924e-06, + "loss": 0.3045, + "step": 7910 + }, + { + "epoch": 2.220943290286356, + "grad_norm": 0.5092245936393738, + "learning_rate": 1.9175184275579083e-06, + "loss": 0.3768, + "step": 7911 + }, + { + "epoch": 2.2212240314430094, + "grad_norm": 0.5756733417510986, + "learning_rate": 1.916232558227788e-06, + "loss": 0.3559, + "step": 7912 + }, + { + "epoch": 2.2215047725996633, + "grad_norm": 0.5927130579948425, + "learning_rate": 1.914947017978153e-06, + "loss": 0.337, + "step": 7913 + }, + { + "epoch": 2.2217855137563167, + "grad_norm": 0.5313396453857422, + "learning_rate": 1.9136618069461863e-06, + "loss": 0.3246, + "step": 7914 + }, + { + "epoch": 2.22206625491297, + "grad_norm": 0.5105767250061035, + "learning_rate": 1.912376925269041e-06, + "loss": 0.3031, + "step": 7915 + }, + { + "epoch": 2.2223469960696236, + "grad_norm": 0.5950306057929993, + "learning_rate": 1.911092373083829e-06, + "loss": 0.3169, + "step": 7916 + }, + { + "epoch": 2.2226277372262775, + "grad_norm": 0.529450535774231, + "learning_rate": 1.9098081505276296e-06, + "loss": 0.3076, + "step": 7917 + }, + { + "epoch": 2.222908478382931, + "grad_norm": 0.6398170590400696, + "learning_rate": 1.9085242577374885e-06, + "loss": 0.3157, + "step": 7918 + }, + { + "epoch": 2.2231892195395844, + "grad_norm": 0.5717803239822388, + "learning_rate": 1.9072406948504123e-06, + "loss": 0.3126, + "step": 7919 + }, + { + "epoch": 2.2234699606962383, + "grad_norm": 0.6152672171592712, + "learning_rate": 1.905957462003379e-06, + "loss": 0.2989, + "step": 7920 + }, + { + "epoch": 2.2237507018528917, + "grad_norm": 0.5398199558258057, + "learning_rate": 1.9046745593333222e-06, + "loss": 0.3386, + "step": 7921 + }, + { + "epoch": 2.224031443009545, + "grad_norm": 0.568665087223053, + "learning_rate": 1.90339198697715e-06, + "loss": 0.3224, + "step": 7922 + }, + { + "epoch": 2.2243121841661986, + "grad_norm": 0.533877968788147, + "learning_rate": 1.9021097450717285e-06, + "loss": 0.3403, + "step": 7923 + }, + { + "epoch": 2.2245929253228525, + "grad_norm": 0.5058253407478333, + "learning_rate": 1.9008278337538883e-06, + "loss": 0.3631, + "step": 7924 + }, + { + "epoch": 2.224873666479506, + "grad_norm": 0.6110826730728149, + "learning_rate": 1.8995462531604314e-06, + "loss": 0.3373, + "step": 7925 + }, + { + "epoch": 2.2251544076361593, + "grad_norm": 0.5676496028900146, + "learning_rate": 1.8982650034281158e-06, + "loss": 0.3354, + "step": 7926 + }, + { + "epoch": 2.2254351487928132, + "grad_norm": 0.6552402377128601, + "learning_rate": 1.896984084693671e-06, + "loss": 0.2991, + "step": 7927 + }, + { + "epoch": 2.2257158899494667, + "grad_norm": 0.5908652544021606, + "learning_rate": 1.8957034970937898e-06, + "loss": 0.3494, + "step": 7928 + }, + { + "epoch": 2.22599663110612, + "grad_norm": 0.6053508520126343, + "learning_rate": 1.8944232407651253e-06, + "loss": 0.3342, + "step": 7929 + }, + { + "epoch": 2.2262773722627736, + "grad_norm": 0.5698174238204956, + "learning_rate": 1.8931433158443024e-06, + "loss": 0.3078, + "step": 7930 + }, + { + "epoch": 2.2265581134194274, + "grad_norm": 0.5074124336242676, + "learning_rate": 1.891863722467902e-06, + "loss": 0.3071, + "step": 7931 + }, + { + "epoch": 2.226838854576081, + "grad_norm": 0.5592202544212341, + "learning_rate": 1.8905844607724793e-06, + "loss": 0.2941, + "step": 7932 + }, + { + "epoch": 2.2271195957327343, + "grad_norm": 0.5499841570854187, + "learning_rate": 1.8893055308945468e-06, + "loss": 0.3218, + "step": 7933 + }, + { + "epoch": 2.2274003368893878, + "grad_norm": 0.5575269460678101, + "learning_rate": 1.8880269329705818e-06, + "loss": 0.3066, + "step": 7934 + }, + { + "epoch": 2.2276810780460417, + "grad_norm": 0.5841457843780518, + "learning_rate": 1.8867486671370323e-06, + "loss": 0.3384, + "step": 7935 + }, + { + "epoch": 2.227961819202695, + "grad_norm": 0.5694056153297424, + "learning_rate": 1.8854707335303034e-06, + "loss": 0.3356, + "step": 7936 + }, + { + "epoch": 2.2282425603593485, + "grad_norm": 0.5506075024604797, + "learning_rate": 1.8841931322867719e-06, + "loss": 0.3122, + "step": 7937 + }, + { + "epoch": 2.2285233015160024, + "grad_norm": 0.6021720767021179, + "learning_rate": 1.8829158635427724e-06, + "loss": 0.3548, + "step": 7938 + }, + { + "epoch": 2.228804042672656, + "grad_norm": 0.5166854858398438, + "learning_rate": 1.8816389274346069e-06, + "loss": 0.3509, + "step": 7939 + }, + { + "epoch": 2.2290847838293093, + "grad_norm": 0.6093659996986389, + "learning_rate": 1.8803623240985452e-06, + "loss": 0.3003, + "step": 7940 + }, + { + "epoch": 2.2293655249859627, + "grad_norm": 0.5474016070365906, + "learning_rate": 1.8790860536708144e-06, + "loss": 0.3351, + "step": 7941 + }, + { + "epoch": 2.2296462661426166, + "grad_norm": 0.5782341957092285, + "learning_rate": 1.8778101162876144e-06, + "loss": 0.3449, + "step": 7942 + }, + { + "epoch": 2.22992700729927, + "grad_norm": 0.5373322367668152, + "learning_rate": 1.8765345120851014e-06, + "loss": 0.3125, + "step": 7943 + }, + { + "epoch": 2.2302077484559235, + "grad_norm": 0.5648261308670044, + "learning_rate": 1.8752592411994031e-06, + "loss": 0.3033, + "step": 7944 + }, + { + "epoch": 2.2304884896125774, + "grad_norm": 0.5863624811172485, + "learning_rate": 1.8739843037666073e-06, + "loss": 0.3072, + "step": 7945 + }, + { + "epoch": 2.230769230769231, + "grad_norm": 0.5565276145935059, + "learning_rate": 1.8727096999227655e-06, + "loss": 0.3114, + "step": 7946 + }, + { + "epoch": 2.2310499719258843, + "grad_norm": 0.5573920011520386, + "learning_rate": 1.871435429803899e-06, + "loss": 0.3064, + "step": 7947 + }, + { + "epoch": 2.2313307130825377, + "grad_norm": 0.591001570224762, + "learning_rate": 1.8701614935459867e-06, + "loss": 0.3437, + "step": 7948 + }, + { + "epoch": 2.2316114542391916, + "grad_norm": 0.5355536341667175, + "learning_rate": 1.868887891284978e-06, + "loss": 0.3483, + "step": 7949 + }, + { + "epoch": 2.231892195395845, + "grad_norm": 0.5544248223304749, + "learning_rate": 1.8676146231567821e-06, + "loss": 0.2743, + "step": 7950 + }, + { + "epoch": 2.2321729365524985, + "grad_norm": 0.5608052611351013, + "learning_rate": 1.866341689297273e-06, + "loss": 0.3117, + "step": 7951 + }, + { + "epoch": 2.2324536777091524, + "grad_norm": 0.5530779957771301, + "learning_rate": 1.8650690898422925e-06, + "loss": 0.3316, + "step": 7952 + }, + { + "epoch": 2.232734418865806, + "grad_norm": 0.6023195385932922, + "learning_rate": 1.863796824927645e-06, + "loss": 0.3395, + "step": 7953 + }, + { + "epoch": 2.2330151600224593, + "grad_norm": 0.6207793354988098, + "learning_rate": 1.862524894689096e-06, + "loss": 0.3158, + "step": 7954 + }, + { + "epoch": 2.2332959011791127, + "grad_norm": 0.5584052801132202, + "learning_rate": 1.8612532992623816e-06, + "loss": 0.2987, + "step": 7955 + }, + { + "epoch": 2.2335766423357666, + "grad_norm": 0.5351366400718689, + "learning_rate": 1.8599820387831947e-06, + "loss": 0.3403, + "step": 7956 + }, + { + "epoch": 2.23385738349242, + "grad_norm": 0.5963825583457947, + "learning_rate": 1.8587111133871994e-06, + "loss": 0.3314, + "step": 7957 + }, + { + "epoch": 2.2341381246490735, + "grad_norm": 0.6010246872901917, + "learning_rate": 1.8574405232100184e-06, + "loss": 0.3044, + "step": 7958 + }, + { + "epoch": 2.2344188658057273, + "grad_norm": 0.4795440137386322, + "learning_rate": 1.856170268387244e-06, + "loss": 0.38, + "step": 7959 + }, + { + "epoch": 2.234699606962381, + "grad_norm": 0.5915919542312622, + "learning_rate": 1.8549003490544281e-06, + "loss": 0.3262, + "step": 7960 + }, + { + "epoch": 2.2349803481190342, + "grad_norm": 0.6033159494400024, + "learning_rate": 1.8536307653470875e-06, + "loss": 0.3678, + "step": 7961 + }, + { + "epoch": 2.2352610892756877, + "grad_norm": 0.5163617730140686, + "learning_rate": 1.8523615174007077e-06, + "loss": 0.2971, + "step": 7962 + }, + { + "epoch": 2.2355418304323416, + "grad_norm": 0.5907436609268188, + "learning_rate": 1.8510926053507306e-06, + "loss": 0.3579, + "step": 7963 + }, + { + "epoch": 2.235822571588995, + "grad_norm": 0.5777222514152527, + "learning_rate": 1.8498240293325708e-06, + "loss": 0.342, + "step": 7964 + }, + { + "epoch": 2.2361033127456484, + "grad_norm": 0.6026085615158081, + "learning_rate": 1.8485557894815992e-06, + "loss": 0.3419, + "step": 7965 + }, + { + "epoch": 2.236384053902302, + "grad_norm": 0.6003116965293884, + "learning_rate": 1.8472878859331583e-06, + "loss": 0.2992, + "step": 7966 + }, + { + "epoch": 2.2366647950589558, + "grad_norm": 0.632685661315918, + "learning_rate": 1.8460203188225484e-06, + "loss": 0.361, + "step": 7967 + }, + { + "epoch": 2.236945536215609, + "grad_norm": 0.5606963038444519, + "learning_rate": 1.844753088285035e-06, + "loss": 0.3607, + "step": 7968 + }, + { + "epoch": 2.2372262773722627, + "grad_norm": 0.5325944423675537, + "learning_rate": 1.843486194455853e-06, + "loss": 0.2764, + "step": 7969 + }, + { + "epoch": 2.2375070185289165, + "grad_norm": 0.4976692795753479, + "learning_rate": 1.8422196374701927e-06, + "loss": 0.3496, + "step": 7970 + }, + { + "epoch": 2.23778775968557, + "grad_norm": 0.5979899764060974, + "learning_rate": 1.8409534174632181e-06, + "loss": 0.3005, + "step": 7971 + }, + { + "epoch": 2.2380685008422234, + "grad_norm": 0.5300469398498535, + "learning_rate": 1.8396875345700498e-06, + "loss": 0.3254, + "step": 7972 + }, + { + "epoch": 2.238349241998877, + "grad_norm": 0.6309905052185059, + "learning_rate": 1.8384219889257737e-06, + "loss": 0.3307, + "step": 7973 + }, + { + "epoch": 2.2386299831555307, + "grad_norm": 0.5937932133674622, + "learning_rate": 1.837156780665444e-06, + "loss": 0.3324, + "step": 7974 + }, + { + "epoch": 2.238910724312184, + "grad_norm": 0.5859931707382202, + "learning_rate": 1.8358919099240723e-06, + "loss": 0.3034, + "step": 7975 + }, + { + "epoch": 2.2391914654688376, + "grad_norm": 0.515424370765686, + "learning_rate": 1.8346273768366417e-06, + "loss": 0.3365, + "step": 7976 + }, + { + "epoch": 2.239472206625491, + "grad_norm": 0.6204776167869568, + "learning_rate": 1.8333631815380915e-06, + "loss": 0.3587, + "step": 7977 + }, + { + "epoch": 2.239752947782145, + "grad_norm": 0.621213972568512, + "learning_rate": 1.83209932416333e-06, + "loss": 0.3138, + "step": 7978 + }, + { + "epoch": 2.2400336889387984, + "grad_norm": 0.6118841767311096, + "learning_rate": 1.8308358048472313e-06, + "loss": 0.3238, + "step": 7979 + }, + { + "epoch": 2.240314430095452, + "grad_norm": 0.5114016532897949, + "learning_rate": 1.8295726237246254e-06, + "loss": 0.3297, + "step": 7980 + }, + { + "epoch": 2.2405951712521057, + "grad_norm": 0.5543270707130432, + "learning_rate": 1.8283097809303158e-06, + "loss": 0.3276, + "step": 7981 + }, + { + "epoch": 2.240875912408759, + "grad_norm": 0.5982322096824646, + "learning_rate": 1.8270472765990622e-06, + "loss": 0.3296, + "step": 7982 + }, + { + "epoch": 2.2411566535654126, + "grad_norm": 0.5736710429191589, + "learning_rate": 1.8257851108655905e-06, + "loss": 0.3739, + "step": 7983 + }, + { + "epoch": 2.241437394722066, + "grad_norm": 0.5847101211547852, + "learning_rate": 1.8245232838645948e-06, + "loss": 0.2932, + "step": 7984 + }, + { + "epoch": 2.24171813587872, + "grad_norm": 0.6166174411773682, + "learning_rate": 1.8232617957307254e-06, + "loss": 0.366, + "step": 7985 + }, + { + "epoch": 2.2419988770353734, + "grad_norm": 0.5478823184967041, + "learning_rate": 1.822000646598604e-06, + "loss": 0.3398, + "step": 7986 + }, + { + "epoch": 2.242279618192027, + "grad_norm": 0.6085838675498962, + "learning_rate": 1.8207398366028106e-06, + "loss": 0.3733, + "step": 7987 + }, + { + "epoch": 2.2425603593486807, + "grad_norm": 0.548533022403717, + "learning_rate": 1.8194793658778898e-06, + "loss": 0.3508, + "step": 7988 + }, + { + "epoch": 2.242841100505334, + "grad_norm": 0.5656881928443909, + "learning_rate": 1.8182192345583543e-06, + "loss": 0.3089, + "step": 7989 + }, + { + "epoch": 2.2431218416619876, + "grad_norm": 0.5688865780830383, + "learning_rate": 1.8169594427786746e-06, + "loss": 0.292, + "step": 7990 + }, + { + "epoch": 2.243402582818641, + "grad_norm": 0.5680079460144043, + "learning_rate": 1.8156999906732902e-06, + "loss": 0.3077, + "step": 7991 + }, + { + "epoch": 2.243683323975295, + "grad_norm": 0.5976494550704956, + "learning_rate": 1.8144408783766e-06, + "loss": 0.2918, + "step": 7992 + }, + { + "epoch": 2.2439640651319483, + "grad_norm": 0.564488410949707, + "learning_rate": 1.813182106022971e-06, + "loss": 0.3047, + "step": 7993 + }, + { + "epoch": 2.244244806288602, + "grad_norm": 0.4989648461341858, + "learning_rate": 1.81192367374673e-06, + "loss": 0.2928, + "step": 7994 + }, + { + "epoch": 2.2445255474452557, + "grad_norm": 0.5590886473655701, + "learning_rate": 1.8106655816821672e-06, + "loss": 0.359, + "step": 7995 + }, + { + "epoch": 2.244806288601909, + "grad_norm": 0.5778689384460449, + "learning_rate": 1.8094078299635427e-06, + "loss": 0.3032, + "step": 7996 + }, + { + "epoch": 2.2450870297585626, + "grad_norm": 0.5303788185119629, + "learning_rate": 1.8081504187250715e-06, + "loss": 0.338, + "step": 7997 + }, + { + "epoch": 2.245367770915216, + "grad_norm": 0.6084389090538025, + "learning_rate": 1.8068933481009405e-06, + "loss": 0.3278, + "step": 7998 + }, + { + "epoch": 2.24564851207187, + "grad_norm": 0.5453808307647705, + "learning_rate": 1.8056366182252949e-06, + "loss": 0.3586, + "step": 7999 + }, + { + "epoch": 2.2459292532285233, + "grad_norm": 0.5822224020957947, + "learning_rate": 1.8043802292322433e-06, + "loss": 0.3137, + "step": 8000 + }, + { + "epoch": 2.2462099943851768, + "grad_norm": 0.5362311005592346, + "learning_rate": 1.8031241812558631e-06, + "loss": 0.3386, + "step": 8001 + }, + { + "epoch": 2.2464907355418307, + "grad_norm": 0.5358651876449585, + "learning_rate": 1.8018684744301867e-06, + "loss": 0.3103, + "step": 8002 + }, + { + "epoch": 2.246771476698484, + "grad_norm": 0.5495103001594543, + "learning_rate": 1.8006131088892227e-06, + "loss": 0.3178, + "step": 8003 + }, + { + "epoch": 2.2470522178551375, + "grad_norm": 0.6086322069168091, + "learning_rate": 1.7993580847669312e-06, + "loss": 0.3393, + "step": 8004 + }, + { + "epoch": 2.247332959011791, + "grad_norm": 0.5692078471183777, + "learning_rate": 1.7981034021972404e-06, + "loss": 0.3328, + "step": 8005 + }, + { + "epoch": 2.247613700168445, + "grad_norm": 0.5917820334434509, + "learning_rate": 1.7968490613140443e-06, + "loss": 0.3158, + "step": 8006 + }, + { + "epoch": 2.2478944413250983, + "grad_norm": 0.5158905982971191, + "learning_rate": 1.7955950622511964e-06, + "loss": 0.317, + "step": 8007 + }, + { + "epoch": 2.2481751824817517, + "grad_norm": 0.5884225368499756, + "learning_rate": 1.7943414051425178e-06, + "loss": 0.3606, + "step": 8008 + }, + { + "epoch": 2.248455923638405, + "grad_norm": 0.5456855893135071, + "learning_rate": 1.7930880901217896e-06, + "loss": 0.3239, + "step": 8009 + }, + { + "epoch": 2.248736664795059, + "grad_norm": 0.5448585152626038, + "learning_rate": 1.7918351173227566e-06, + "loss": 0.3199, + "step": 8010 + }, + { + "epoch": 2.2490174059517125, + "grad_norm": 0.6192921996116638, + "learning_rate": 1.790582486879131e-06, + "loss": 0.3379, + "step": 8011 + }, + { + "epoch": 2.249298147108366, + "grad_norm": 0.5639538168907166, + "learning_rate": 1.789330198924583e-06, + "loss": 0.3365, + "step": 8012 + }, + { + "epoch": 2.24957888826502, + "grad_norm": 0.5992386937141418, + "learning_rate": 1.788078253592752e-06, + "loss": 0.3108, + "step": 8013 + }, + { + "epoch": 2.2498596294216733, + "grad_norm": 0.5644425749778748, + "learning_rate": 1.7868266510172344e-06, + "loss": 0.2993, + "step": 8014 + }, + { + "epoch": 2.2501403705783267, + "grad_norm": 0.5378849506378174, + "learning_rate": 1.785575391331597e-06, + "loss": 0.3072, + "step": 8015 + }, + { + "epoch": 2.25042111173498, + "grad_norm": 0.5695605874061584, + "learning_rate": 1.7843244746693645e-06, + "loss": 0.3759, + "step": 8016 + }, + { + "epoch": 2.250701852891634, + "grad_norm": 0.5689758062362671, + "learning_rate": 1.7830739011640252e-06, + "loss": 0.3287, + "step": 8017 + }, + { + "epoch": 2.2509825940482875, + "grad_norm": 0.6179699301719666, + "learning_rate": 1.7818236709490362e-06, + "loss": 0.3415, + "step": 8018 + }, + { + "epoch": 2.251263335204941, + "grad_norm": 0.6113905906677246, + "learning_rate": 1.7805737841578108e-06, + "loss": 0.3135, + "step": 8019 + }, + { + "epoch": 2.2515440763615944, + "grad_norm": 0.5384484529495239, + "learning_rate": 1.7793242409237327e-06, + "loss": 0.3439, + "step": 8020 + }, + { + "epoch": 2.2518248175182483, + "grad_norm": 0.552721381187439, + "learning_rate": 1.778075041380144e-06, + "loss": 0.3488, + "step": 8021 + }, + { + "epoch": 2.2521055586749017, + "grad_norm": 0.5636094808578491, + "learning_rate": 1.7768261856603487e-06, + "loss": 0.3086, + "step": 8022 + }, + { + "epoch": 2.252386299831555, + "grad_norm": 0.6538780331611633, + "learning_rate": 1.775577673897621e-06, + "loss": 0.3233, + "step": 8023 + }, + { + "epoch": 2.252667040988209, + "grad_norm": 0.5709100365638733, + "learning_rate": 1.7743295062251913e-06, + "loss": 0.339, + "step": 8024 + }, + { + "epoch": 2.2529477821448625, + "grad_norm": 0.5108280181884766, + "learning_rate": 1.7730816827762587e-06, + "loss": 0.3322, + "step": 8025 + }, + { + "epoch": 2.253228523301516, + "grad_norm": 0.5429946184158325, + "learning_rate": 1.7718342036839808e-06, + "loss": 0.3199, + "step": 8026 + }, + { + "epoch": 2.2535092644581693, + "grad_norm": 0.5417314171791077, + "learning_rate": 1.7705870690814835e-06, + "loss": 0.3159, + "step": 8027 + }, + { + "epoch": 2.2537900056148232, + "grad_norm": 0.5941969156265259, + "learning_rate": 1.7693402791018505e-06, + "loss": 0.3811, + "step": 8028 + }, + { + "epoch": 2.2540707467714767, + "grad_norm": 0.5561227798461914, + "learning_rate": 1.7680938338781322e-06, + "loss": 0.2943, + "step": 8029 + }, + { + "epoch": 2.25435148792813, + "grad_norm": 0.540425717830658, + "learning_rate": 1.7668477335433443e-06, + "loss": 0.3421, + "step": 8030 + }, + { + "epoch": 2.254632229084784, + "grad_norm": 0.5694829821586609, + "learning_rate": 1.7656019782304602e-06, + "loss": 0.3186, + "step": 8031 + }, + { + "epoch": 2.2549129702414374, + "grad_norm": 0.5782746076583862, + "learning_rate": 1.7643565680724183e-06, + "loss": 0.2984, + "step": 8032 + }, + { + "epoch": 2.255193711398091, + "grad_norm": 0.6405271887779236, + "learning_rate": 1.7631115032021235e-06, + "loss": 0.3296, + "step": 8033 + }, + { + "epoch": 2.2554744525547443, + "grad_norm": 0.5816053152084351, + "learning_rate": 1.761866783752439e-06, + "loss": 0.3575, + "step": 8034 + }, + { + "epoch": 2.255755193711398, + "grad_norm": 0.5303795337677002, + "learning_rate": 1.7606224098561957e-06, + "loss": 0.3766, + "step": 8035 + }, + { + "epoch": 2.2560359348680517, + "grad_norm": 0.5758079290390015, + "learning_rate": 1.7593783816461852e-06, + "loss": 0.2891, + "step": 8036 + }, + { + "epoch": 2.256316676024705, + "grad_norm": 0.5049720406532288, + "learning_rate": 1.7581346992551595e-06, + "loss": 0.3721, + "step": 8037 + }, + { + "epoch": 2.256597417181359, + "grad_norm": 0.5692103505134583, + "learning_rate": 1.7568913628158412e-06, + "loss": 0.3573, + "step": 8038 + }, + { + "epoch": 2.2568781583380124, + "grad_norm": 0.5312740802764893, + "learning_rate": 1.755648372460907e-06, + "loss": 0.3375, + "step": 8039 + }, + { + "epoch": 2.257158899494666, + "grad_norm": 0.5886383056640625, + "learning_rate": 1.754405728323005e-06, + "loss": 0.3297, + "step": 8040 + }, + { + "epoch": 2.2574396406513193, + "grad_norm": 0.5831328630447388, + "learning_rate": 1.753163430534739e-06, + "loss": 0.3483, + "step": 8041 + }, + { + "epoch": 2.257720381807973, + "grad_norm": 0.5989186763763428, + "learning_rate": 1.751921479228682e-06, + "loss": 0.4014, + "step": 8042 + }, + { + "epoch": 2.2580011229646266, + "grad_norm": 0.6026795506477356, + "learning_rate": 1.750679874537367e-06, + "loss": 0.3295, + "step": 8043 + }, + { + "epoch": 2.25828186412128, + "grad_norm": 0.5234981179237366, + "learning_rate": 1.7494386165932875e-06, + "loss": 0.291, + "step": 8044 + }, + { + "epoch": 2.258562605277934, + "grad_norm": 0.5749183297157288, + "learning_rate": 1.7481977055289067e-06, + "loss": 0.3237, + "step": 8045 + }, + { + "epoch": 2.2588433464345874, + "grad_norm": 0.5641282796859741, + "learning_rate": 1.7469571414766439e-06, + "loss": 0.3293, + "step": 8046 + }, + { + "epoch": 2.259124087591241, + "grad_norm": 0.5993145108222961, + "learning_rate": 1.745716924568887e-06, + "loss": 0.3187, + "step": 8047 + }, + { + "epoch": 2.2594048287478943, + "grad_norm": 0.6631541848182678, + "learning_rate": 1.744477054937983e-06, + "loss": 0.3293, + "step": 8048 + }, + { + "epoch": 2.259685569904548, + "grad_norm": 0.563859760761261, + "learning_rate": 1.743237532716241e-06, + "loss": 0.3392, + "step": 8049 + }, + { + "epoch": 2.2599663110612016, + "grad_norm": 0.6036688685417175, + "learning_rate": 1.7419983580359383e-06, + "loss": 0.3225, + "step": 8050 + }, + { + "epoch": 2.260247052217855, + "grad_norm": 0.5790561437606812, + "learning_rate": 1.7407595310293096e-06, + "loss": 0.3134, + "step": 8051 + }, + { + "epoch": 2.260527793374509, + "grad_norm": 0.527915894985199, + "learning_rate": 1.7395210518285577e-06, + "loss": 0.3712, + "step": 8052 + }, + { + "epoch": 2.2608085345311624, + "grad_norm": 0.5608624815940857, + "learning_rate": 1.7382829205658413e-06, + "loss": 0.324, + "step": 8053 + }, + { + "epoch": 2.261089275687816, + "grad_norm": 0.6710591912269592, + "learning_rate": 1.737045137373289e-06, + "loss": 0.3105, + "step": 8054 + }, + { + "epoch": 2.2613700168444693, + "grad_norm": 0.6487722396850586, + "learning_rate": 1.73580770238299e-06, + "loss": 0.359, + "step": 8055 + }, + { + "epoch": 2.261650758001123, + "grad_norm": 0.5920708775520325, + "learning_rate": 1.7345706157269926e-06, + "loss": 0.3082, + "step": 8056 + }, + { + "epoch": 2.2619314991577766, + "grad_norm": 0.615537703037262, + "learning_rate": 1.7333338775373142e-06, + "loss": 0.3109, + "step": 8057 + }, + { + "epoch": 2.26221224031443, + "grad_norm": 0.5814971327781677, + "learning_rate": 1.7320974879459306e-06, + "loss": 0.3012, + "step": 8058 + }, + { + "epoch": 2.262492981471084, + "grad_norm": 0.5310434699058533, + "learning_rate": 1.7308614470847802e-06, + "loss": 0.329, + "step": 8059 + }, + { + "epoch": 2.2627737226277373, + "grad_norm": 0.592069685459137, + "learning_rate": 1.7296257550857677e-06, + "loss": 0.351, + "step": 8060 + }, + { + "epoch": 2.263054463784391, + "grad_norm": 0.6197977066040039, + "learning_rate": 1.728390412080756e-06, + "loss": 0.3425, + "step": 8061 + }, + { + "epoch": 2.2633352049410442, + "grad_norm": 0.5771230459213257, + "learning_rate": 1.7271554182015765e-06, + "loss": 0.3037, + "step": 8062 + }, + { + "epoch": 2.2636159460976977, + "grad_norm": 0.5480746030807495, + "learning_rate": 1.725920773580016e-06, + "loss": 0.3071, + "step": 8063 + }, + { + "epoch": 2.2638966872543516, + "grad_norm": 0.5535979866981506, + "learning_rate": 1.7246864783478329e-06, + "loss": 0.3282, + "step": 8064 + }, + { + "epoch": 2.264177428411005, + "grad_norm": 0.550595760345459, + "learning_rate": 1.7234525326367408e-06, + "loss": 0.2868, + "step": 8065 + }, + { + "epoch": 2.2644581695676584, + "grad_norm": 0.6128308176994324, + "learning_rate": 1.722218936578417e-06, + "loss": 0.3421, + "step": 8066 + }, + { + "epoch": 2.2647389107243123, + "grad_norm": 0.5360646843910217, + "learning_rate": 1.720985690304507e-06, + "loss": 0.371, + "step": 8067 + }, + { + "epoch": 2.2650196518809658, + "grad_norm": 0.6131049990653992, + "learning_rate": 1.7197527939466114e-06, + "loss": 0.3451, + "step": 8068 + }, + { + "epoch": 2.265300393037619, + "grad_norm": 0.5713410973548889, + "learning_rate": 1.7185202476363005e-06, + "loss": 0.3193, + "step": 8069 + }, + { + "epoch": 2.2655811341942727, + "grad_norm": 0.5460421442985535, + "learning_rate": 1.7172880515051033e-06, + "loss": 0.3092, + "step": 8070 + }, + { + "epoch": 2.2658618753509265, + "grad_norm": 0.5895035862922668, + "learning_rate": 1.7160562056845092e-06, + "loss": 0.3461, + "step": 8071 + }, + { + "epoch": 2.26614261650758, + "grad_norm": 0.5344123244285583, + "learning_rate": 1.7148247103059772e-06, + "loss": 0.3431, + "step": 8072 + }, + { + "epoch": 2.2664233576642334, + "grad_norm": 0.6651570796966553, + "learning_rate": 1.7135935655009218e-06, + "loss": 0.3151, + "step": 8073 + }, + { + "epoch": 2.2667040988208873, + "grad_norm": 0.560825526714325, + "learning_rate": 1.7123627714007252e-06, + "loss": 0.3769, + "step": 8074 + }, + { + "epoch": 2.2669848399775407, + "grad_norm": 0.597614049911499, + "learning_rate": 1.7111323281367276e-06, + "loss": 0.3434, + "step": 8075 + }, + { + "epoch": 2.267265581134194, + "grad_norm": 0.581476628780365, + "learning_rate": 1.709902235840238e-06, + "loss": 0.3367, + "step": 8076 + }, + { + "epoch": 2.2675463222908476, + "grad_norm": 0.5085705518722534, + "learning_rate": 1.7086724946425215e-06, + "loss": 0.2906, + "step": 8077 + }, + { + "epoch": 2.2678270634475015, + "grad_norm": 0.4936918616294861, + "learning_rate": 1.7074431046748075e-06, + "loss": 0.3195, + "step": 8078 + }, + { + "epoch": 2.268107804604155, + "grad_norm": 0.5442838668823242, + "learning_rate": 1.7062140660682902e-06, + "loss": 0.3419, + "step": 8079 + }, + { + "epoch": 2.2683885457608084, + "grad_norm": 0.5474158525466919, + "learning_rate": 1.704985378954127e-06, + "loss": 0.3031, + "step": 8080 + }, + { + "epoch": 2.2686692869174623, + "grad_norm": 0.5990607142448425, + "learning_rate": 1.703757043463432e-06, + "loss": 0.3129, + "step": 8081 + }, + { + "epoch": 2.2689500280741157, + "grad_norm": 0.5595345497131348, + "learning_rate": 1.7025290597272886e-06, + "loss": 0.3561, + "step": 8082 + }, + { + "epoch": 2.269230769230769, + "grad_norm": 0.610658586025238, + "learning_rate": 1.7013014278767377e-06, + "loss": 0.2928, + "step": 8083 + }, + { + "epoch": 2.2695115103874226, + "grad_norm": 0.549555778503418, + "learning_rate": 1.7000741480427856e-06, + "loss": 0.3529, + "step": 8084 + }, + { + "epoch": 2.2697922515440765, + "grad_norm": 0.4983183443546295, + "learning_rate": 1.6988472203563988e-06, + "loss": 0.3351, + "step": 8085 + }, + { + "epoch": 2.27007299270073, + "grad_norm": 0.6268941760063171, + "learning_rate": 1.6976206449485094e-06, + "loss": 0.3294, + "step": 8086 + }, + { + "epoch": 2.2703537338573834, + "grad_norm": 0.5816642642021179, + "learning_rate": 1.6963944219500084e-06, + "loss": 0.3192, + "step": 8087 + }, + { + "epoch": 2.2706344750140373, + "grad_norm": 0.6187565922737122, + "learning_rate": 1.6951685514917499e-06, + "loss": 0.2926, + "step": 8088 + }, + { + "epoch": 2.2709152161706907, + "grad_norm": 0.6386123299598694, + "learning_rate": 1.6939430337045532e-06, + "loss": 0.3618, + "step": 8089 + }, + { + "epoch": 2.271195957327344, + "grad_norm": 0.5533019304275513, + "learning_rate": 1.6927178687191953e-06, + "loss": 0.3203, + "step": 8090 + }, + { + "epoch": 2.2714766984839976, + "grad_norm": 0.5853825211524963, + "learning_rate": 1.6914930566664216e-06, + "loss": 0.3381, + "step": 8091 + }, + { + "epoch": 2.2717574396406515, + "grad_norm": 0.5264586210250854, + "learning_rate": 1.690268597676935e-06, + "loss": 0.3196, + "step": 8092 + }, + { + "epoch": 2.272038180797305, + "grad_norm": 0.6071246266365051, + "learning_rate": 1.6890444918814002e-06, + "loss": 0.3026, + "step": 8093 + }, + { + "epoch": 2.2723189219539583, + "grad_norm": 0.6103940010070801, + "learning_rate": 1.687820739410449e-06, + "loss": 0.3541, + "step": 8094 + }, + { + "epoch": 2.2725996631106122, + "grad_norm": 0.5096889734268188, + "learning_rate": 1.68659734039467e-06, + "loss": 0.3352, + "step": 8095 + }, + { + "epoch": 2.2728804042672657, + "grad_norm": 0.5179657340049744, + "learning_rate": 1.6853742949646197e-06, + "loss": 0.3501, + "step": 8096 + }, + { + "epoch": 2.273161145423919, + "grad_norm": 0.5765989422798157, + "learning_rate": 1.6841516032508105e-06, + "loss": 0.283, + "step": 8097 + }, + { + "epoch": 2.2734418865805726, + "grad_norm": 0.7021377682685852, + "learning_rate": 1.6829292653837243e-06, + "loss": 0.2916, + "step": 8098 + }, + { + "epoch": 2.2737226277372264, + "grad_norm": 0.5201624035835266, + "learning_rate": 1.681707281493799e-06, + "loss": 0.3449, + "step": 8099 + }, + { + "epoch": 2.27400336889388, + "grad_norm": 0.6332237124443054, + "learning_rate": 1.6804856517114359e-06, + "loss": 0.3409, + "step": 8100 + }, + { + "epoch": 2.2742841100505333, + "grad_norm": 0.5995384454727173, + "learning_rate": 1.6792643761670035e-06, + "loss": 0.2915, + "step": 8101 + }, + { + "epoch": 2.274564851207187, + "grad_norm": 0.5878689289093018, + "learning_rate": 1.6780434549908241e-06, + "loss": 0.3581, + "step": 8102 + }, + { + "epoch": 2.2748455923638407, + "grad_norm": 0.580745279788971, + "learning_rate": 1.6768228883131921e-06, + "loss": 0.3258, + "step": 8103 + }, + { + "epoch": 2.275126333520494, + "grad_norm": 0.5451475381851196, + "learning_rate": 1.6756026762643535e-06, + "loss": 0.314, + "step": 8104 + }, + { + "epoch": 2.2754070746771475, + "grad_norm": 0.5909281373023987, + "learning_rate": 1.6743828189745243e-06, + "loss": 0.3359, + "step": 8105 + }, + { + "epoch": 2.2756878158338014, + "grad_norm": 0.5588069558143616, + "learning_rate": 1.673163316573882e-06, + "loss": 0.3037, + "step": 8106 + }, + { + "epoch": 2.275968556990455, + "grad_norm": 0.5816650986671448, + "learning_rate": 1.671944169192562e-06, + "loss": 0.3286, + "step": 8107 + }, + { + "epoch": 2.2762492981471083, + "grad_norm": 0.5617421269416809, + "learning_rate": 1.670725376960663e-06, + "loss": 0.316, + "step": 8108 + }, + { + "epoch": 2.2765300393037617, + "grad_norm": 0.5178927779197693, + "learning_rate": 1.6695069400082497e-06, + "loss": 0.2995, + "step": 8109 + }, + { + "epoch": 2.2768107804604156, + "grad_norm": 0.5963665246963501, + "learning_rate": 1.6682888584653434e-06, + "loss": 0.3311, + "step": 8110 + }, + { + "epoch": 2.277091521617069, + "grad_norm": 0.5320101976394653, + "learning_rate": 1.6670711324619332e-06, + "loss": 0.3383, + "step": 8111 + }, + { + "epoch": 2.2773722627737225, + "grad_norm": 0.5567086338996887, + "learning_rate": 1.6658537621279642e-06, + "loss": 0.3347, + "step": 8112 + }, + { + "epoch": 2.277653003930376, + "grad_norm": 0.5196081399917603, + "learning_rate": 1.6646367475933495e-06, + "loss": 0.3445, + "step": 8113 + }, + { + "epoch": 2.27793374508703, + "grad_norm": 0.6353957056999207, + "learning_rate": 1.6634200889879592e-06, + "loss": 0.3133, + "step": 8114 + }, + { + "epoch": 2.2782144862436833, + "grad_norm": 0.5574871897697449, + "learning_rate": 1.6622037864416274e-06, + "loss": 0.3003, + "step": 8115 + }, + { + "epoch": 2.2784952274003367, + "grad_norm": 0.5481862425804138, + "learning_rate": 1.6609878400841527e-06, + "loss": 0.382, + "step": 8116 + }, + { + "epoch": 2.2787759685569906, + "grad_norm": 0.5305172801017761, + "learning_rate": 1.6597722500452895e-06, + "loss": 0.3853, + "step": 8117 + }, + { + "epoch": 2.279056709713644, + "grad_norm": 0.5914360284805298, + "learning_rate": 1.6585570164547627e-06, + "loss": 0.3063, + "step": 8118 + }, + { + "epoch": 2.2793374508702975, + "grad_norm": 0.5952426195144653, + "learning_rate": 1.6573421394422519e-06, + "loss": 0.3658, + "step": 8119 + }, + { + "epoch": 2.279618192026951, + "grad_norm": 0.5759475827217102, + "learning_rate": 1.6561276191373994e-06, + "loss": 0.2958, + "step": 8120 + }, + { + "epoch": 2.279898933183605, + "grad_norm": 0.5460476875305176, + "learning_rate": 1.6549134556698148e-06, + "loss": 0.2847, + "step": 8121 + }, + { + "epoch": 2.2801796743402583, + "grad_norm": 0.5319930911064148, + "learning_rate": 1.6536996491690627e-06, + "loss": 0.3438, + "step": 8122 + }, + { + "epoch": 2.2804604154969117, + "grad_norm": 0.5628823637962341, + "learning_rate": 1.6524861997646769e-06, + "loss": 0.3398, + "step": 8123 + }, + { + "epoch": 2.2807411566535656, + "grad_norm": 0.5416834354400635, + "learning_rate": 1.6512731075861455e-06, + "loss": 0.333, + "step": 8124 + }, + { + "epoch": 2.281021897810219, + "grad_norm": 0.5499876737594604, + "learning_rate": 1.650060372762925e-06, + "loss": 0.351, + "step": 8125 + }, + { + "epoch": 2.2813026389668725, + "grad_norm": 0.5261113047599792, + "learning_rate": 1.6488479954244297e-06, + "loss": 0.3234, + "step": 8126 + }, + { + "epoch": 2.281583380123526, + "grad_norm": 0.5825338959693909, + "learning_rate": 1.6476359757000349e-06, + "loss": 0.3616, + "step": 8127 + }, + { + "epoch": 2.28186412128018, + "grad_norm": 0.6240234375, + "learning_rate": 1.6464243137190838e-06, + "loss": 0.3149, + "step": 8128 + }, + { + "epoch": 2.2821448624368332, + "grad_norm": 0.5497562289237976, + "learning_rate": 1.6452130096108738e-06, + "loss": 0.3578, + "step": 8129 + }, + { + "epoch": 2.2824256035934867, + "grad_norm": 0.5215038061141968, + "learning_rate": 1.6440020635046695e-06, + "loss": 0.3712, + "step": 8130 + }, + { + "epoch": 2.2827063447501406, + "grad_norm": 0.5622836351394653, + "learning_rate": 1.6427914755296964e-06, + "loss": 0.3812, + "step": 8131 + }, + { + "epoch": 2.282987085906794, + "grad_norm": 0.5552845001220703, + "learning_rate": 1.6415812458151393e-06, + "loss": 0.3419, + "step": 8132 + }, + { + "epoch": 2.2832678270634474, + "grad_norm": 0.5544072985649109, + "learning_rate": 1.6403713744901478e-06, + "loss": 0.3489, + "step": 8133 + }, + { + "epoch": 2.283548568220101, + "grad_norm": 0.5986542701721191, + "learning_rate": 1.63916186168383e-06, + "loss": 0.3148, + "step": 8134 + }, + { + "epoch": 2.2838293093767548, + "grad_norm": 0.5655476450920105, + "learning_rate": 1.6379527075252598e-06, + "loss": 0.335, + "step": 8135 + }, + { + "epoch": 2.284110050533408, + "grad_norm": 0.5394382476806641, + "learning_rate": 1.6367439121434698e-06, + "loss": 0.3325, + "step": 8136 + }, + { + "epoch": 2.2843907916900617, + "grad_norm": 0.5876544713973999, + "learning_rate": 1.635535475667453e-06, + "loss": 0.3508, + "step": 8137 + }, + { + "epoch": 2.2846715328467155, + "grad_norm": 0.5868120193481445, + "learning_rate": 1.63432739822617e-06, + "loss": 0.3155, + "step": 8138 + }, + { + "epoch": 2.284952274003369, + "grad_norm": 0.5698676705360413, + "learning_rate": 1.633119679948535e-06, + "loss": 0.3509, + "step": 8139 + }, + { + "epoch": 2.2852330151600224, + "grad_norm": 0.5355852246284485, + "learning_rate": 1.6319123209634324e-06, + "loss": 0.3949, + "step": 8140 + }, + { + "epoch": 2.285513756316676, + "grad_norm": 0.5934165716171265, + "learning_rate": 1.6307053213997026e-06, + "loss": 0.3681, + "step": 8141 + }, + { + "epoch": 2.2857944974733297, + "grad_norm": 0.5372290015220642, + "learning_rate": 1.6294986813861462e-06, + "loss": 0.2869, + "step": 8142 + }, + { + "epoch": 2.286075238629983, + "grad_norm": 0.6086657643318176, + "learning_rate": 1.6282924010515333e-06, + "loss": 0.3051, + "step": 8143 + }, + { + "epoch": 2.2863559797866366, + "grad_norm": 0.5466243624687195, + "learning_rate": 1.6270864805245856e-06, + "loss": 0.3235, + "step": 8144 + }, + { + "epoch": 2.2866367209432905, + "grad_norm": 0.5670375823974609, + "learning_rate": 1.6258809199339964e-06, + "loss": 0.3234, + "step": 8145 + }, + { + "epoch": 2.286917462099944, + "grad_norm": 0.5249202251434326, + "learning_rate": 1.6246757194084111e-06, + "loss": 0.3425, + "step": 8146 + }, + { + "epoch": 2.2871982032565974, + "grad_norm": 0.5967442393302917, + "learning_rate": 1.6234708790764446e-06, + "loss": 0.3449, + "step": 8147 + }, + { + "epoch": 2.287478944413251, + "grad_norm": 0.606689453125, + "learning_rate": 1.6222663990666692e-06, + "loss": 0.3257, + "step": 8148 + }, + { + "epoch": 2.2877596855699047, + "grad_norm": 0.5632715821266174, + "learning_rate": 1.621062279507617e-06, + "loss": 0.3176, + "step": 8149 + }, + { + "epoch": 2.288040426726558, + "grad_norm": 0.5461794137954712, + "learning_rate": 1.6198585205277877e-06, + "loss": 0.3526, + "step": 8150 + }, + { + "epoch": 2.2883211678832116, + "grad_norm": 0.6035500764846802, + "learning_rate": 1.6186551222556363e-06, + "loss": 0.3312, + "step": 8151 + }, + { + "epoch": 2.2886019090398655, + "grad_norm": 0.5271415114402771, + "learning_rate": 1.617452084819584e-06, + "loss": 0.3147, + "step": 8152 + }, + { + "epoch": 2.288882650196519, + "grad_norm": 0.6394876837730408, + "learning_rate": 1.6162494083480106e-06, + "loss": 0.3276, + "step": 8153 + }, + { + "epoch": 2.2891633913531724, + "grad_norm": 0.5247066617012024, + "learning_rate": 1.615047092969257e-06, + "loss": 0.3377, + "step": 8154 + }, + { + "epoch": 2.289444132509826, + "grad_norm": 0.5702440142631531, + "learning_rate": 1.6138451388116278e-06, + "loss": 0.3458, + "step": 8155 + }, + { + "epoch": 2.2897248736664797, + "grad_norm": 0.5576598644256592, + "learning_rate": 1.6126435460033896e-06, + "loss": 0.3359, + "step": 8156 + }, + { + "epoch": 2.290005614823133, + "grad_norm": 0.5746631026268005, + "learning_rate": 1.6114423146727664e-06, + "loss": 0.3263, + "step": 8157 + }, + { + "epoch": 2.2902863559797866, + "grad_norm": 0.5279629230499268, + "learning_rate": 1.6102414449479487e-06, + "loss": 0.339, + "step": 8158 + }, + { + "epoch": 2.29056709713644, + "grad_norm": 0.5480526089668274, + "learning_rate": 1.609040936957083e-06, + "loss": 0.3069, + "step": 8159 + }, + { + "epoch": 2.290847838293094, + "grad_norm": 0.5356866717338562, + "learning_rate": 1.6078407908282829e-06, + "loss": 0.3243, + "step": 8160 + }, + { + "epoch": 2.2911285794497473, + "grad_norm": 0.4751942753791809, + "learning_rate": 1.6066410066896177e-06, + "loss": 0.3796, + "step": 8161 + }, + { + "epoch": 2.291409320606401, + "grad_norm": 0.5966656804084778, + "learning_rate": 1.605441584669124e-06, + "loss": 0.3573, + "step": 8162 + }, + { + "epoch": 2.2916900617630542, + "grad_norm": 0.6304299831390381, + "learning_rate": 1.6042425248947951e-06, + "loss": 0.3411, + "step": 8163 + }, + { + "epoch": 2.291970802919708, + "grad_norm": 0.5414368510246277, + "learning_rate": 1.6030438274945853e-06, + "loss": 0.3656, + "step": 8164 + }, + { + "epoch": 2.2922515440763616, + "grad_norm": 0.5494125485420227, + "learning_rate": 1.6018454925964161e-06, + "loss": 0.2972, + "step": 8165 + }, + { + "epoch": 2.292532285233015, + "grad_norm": 0.5852463245391846, + "learning_rate": 1.600647520328162e-06, + "loss": 0.3375, + "step": 8166 + }, + { + "epoch": 2.292813026389669, + "grad_norm": 0.5897251963615417, + "learning_rate": 1.599449910817667e-06, + "loss": 0.3411, + "step": 8167 + }, + { + "epoch": 2.2930937675463223, + "grad_norm": 0.6259673237800598, + "learning_rate": 1.598252664192731e-06, + "loss": 0.3235, + "step": 8168 + }, + { + "epoch": 2.2933745087029758, + "grad_norm": 0.5944607853889465, + "learning_rate": 1.5970557805811148e-06, + "loss": 0.3467, + "step": 8169 + }, + { + "epoch": 2.293655249859629, + "grad_norm": 0.5877252817153931, + "learning_rate": 1.5958592601105461e-06, + "loss": 0.3139, + "step": 8170 + }, + { + "epoch": 2.293935991016283, + "grad_norm": 0.6014631986618042, + "learning_rate": 1.5946631029087068e-06, + "loss": 0.3081, + "step": 8171 + }, + { + "epoch": 2.2942167321729365, + "grad_norm": 0.6377054452896118, + "learning_rate": 1.593467309103246e-06, + "loss": 0.3714, + "step": 8172 + }, + { + "epoch": 2.29449747332959, + "grad_norm": 0.5477756857872009, + "learning_rate": 1.5922718788217683e-06, + "loss": 0.3356, + "step": 8173 + }, + { + "epoch": 2.294778214486244, + "grad_norm": 0.6328738331794739, + "learning_rate": 1.5910768121918469e-06, + "loss": 0.3859, + "step": 8174 + }, + { + "epoch": 2.2950589556428973, + "grad_norm": 0.5691003799438477, + "learning_rate": 1.589882109341009e-06, + "loss": 0.3464, + "step": 8175 + }, + { + "epoch": 2.2953396967995507, + "grad_norm": 0.5517013669013977, + "learning_rate": 1.5886877703967441e-06, + "loss": 0.3233, + "step": 8176 + }, + { + "epoch": 2.295620437956204, + "grad_norm": 0.5734623670578003, + "learning_rate": 1.587493795486509e-06, + "loss": 0.3074, + "step": 8177 + }, + { + "epoch": 2.295901179112858, + "grad_norm": 0.5163578987121582, + "learning_rate": 1.586300184737713e-06, + "loss": 0.3368, + "step": 8178 + }, + { + "epoch": 2.2961819202695115, + "grad_norm": 0.5542207360267639, + "learning_rate": 1.5851069382777352e-06, + "loss": 0.3153, + "step": 8179 + }, + { + "epoch": 2.296462661426165, + "grad_norm": 0.5119909644126892, + "learning_rate": 1.5839140562339066e-06, + "loss": 0.3607, + "step": 8180 + }, + { + "epoch": 2.296743402582819, + "grad_norm": 0.4962103068828583, + "learning_rate": 1.5827215387335277e-06, + "loss": 0.3562, + "step": 8181 + }, + { + "epoch": 2.2970241437394723, + "grad_norm": 0.5574684739112854, + "learning_rate": 1.5815293859038566e-06, + "loss": 0.2697, + "step": 8182 + }, + { + "epoch": 2.2973048848961257, + "grad_norm": 0.6066998243331909, + "learning_rate": 1.5803375978721096e-06, + "loss": 0.3226, + "step": 8183 + }, + { + "epoch": 2.297585626052779, + "grad_norm": 0.5233213901519775, + "learning_rate": 1.5791461747654707e-06, + "loss": 0.3167, + "step": 8184 + }, + { + "epoch": 2.297866367209433, + "grad_norm": 0.5955953001976013, + "learning_rate": 1.5779551167110784e-06, + "loss": 0.3381, + "step": 8185 + }, + { + "epoch": 2.2981471083660865, + "grad_norm": 0.5840866565704346, + "learning_rate": 1.5767644238360352e-06, + "loss": 0.358, + "step": 8186 + }, + { + "epoch": 2.29842784952274, + "grad_norm": 0.5760275721549988, + "learning_rate": 1.575574096267406e-06, + "loss": 0.3016, + "step": 8187 + }, + { + "epoch": 2.298708590679394, + "grad_norm": 0.5533139705657959, + "learning_rate": 1.5743841341322125e-06, + "loss": 0.314, + "step": 8188 + }, + { + "epoch": 2.2989893318360473, + "grad_norm": 0.5782603025436401, + "learning_rate": 1.5731945375574432e-06, + "loss": 0.3718, + "step": 8189 + }, + { + "epoch": 2.2992700729927007, + "grad_norm": 0.5638977885246277, + "learning_rate": 1.5720053066700436e-06, + "loss": 0.3342, + "step": 8190 + }, + { + "epoch": 2.299550814149354, + "grad_norm": 0.54283607006073, + "learning_rate": 1.570816441596918e-06, + "loss": 0.3243, + "step": 8191 + }, + { + "epoch": 2.299831555306008, + "grad_norm": 0.5435816645622253, + "learning_rate": 1.569627942464939e-06, + "loss": 0.3682, + "step": 8192 + }, + { + "epoch": 2.3001122964626615, + "grad_norm": 0.5701005458831787, + "learning_rate": 1.5684398094009322e-06, + "loss": 0.3395, + "step": 8193 + }, + { + "epoch": 2.300393037619315, + "grad_norm": 0.5799752473831177, + "learning_rate": 1.5672520425316912e-06, + "loss": 0.3671, + "step": 8194 + }, + { + "epoch": 2.300673778775969, + "grad_norm": 0.5573103427886963, + "learning_rate": 1.5660646419839642e-06, + "loss": 0.3248, + "step": 8195 + }, + { + "epoch": 2.3009545199326222, + "grad_norm": 0.5010234713554382, + "learning_rate": 1.5648776078844653e-06, + "loss": 0.3262, + "step": 8196 + }, + { + "epoch": 2.3012352610892757, + "grad_norm": 0.5645644664764404, + "learning_rate": 1.5636909403598665e-06, + "loss": 0.2944, + "step": 8197 + }, + { + "epoch": 2.301516002245929, + "grad_norm": 0.5264140367507935, + "learning_rate": 1.5625046395368004e-06, + "loss": 0.3516, + "step": 8198 + }, + { + "epoch": 2.301796743402583, + "grad_norm": 0.5427291393280029, + "learning_rate": 1.5613187055418643e-06, + "loss": 0.3304, + "step": 8199 + }, + { + "epoch": 2.3020774845592364, + "grad_norm": 0.585015058517456, + "learning_rate": 1.5601331385016106e-06, + "loss": 0.2747, + "step": 8200 + }, + { + "epoch": 2.30235822571589, + "grad_norm": 0.5409886837005615, + "learning_rate": 1.5589479385425581e-06, + "loss": 0.4031, + "step": 8201 + }, + { + "epoch": 2.3026389668725433, + "grad_norm": 0.6217318177223206, + "learning_rate": 1.557763105791184e-06, + "loss": 0.3104, + "step": 8202 + }, + { + "epoch": 2.302919708029197, + "grad_norm": 0.5729625225067139, + "learning_rate": 1.5565786403739236e-06, + "loss": 0.3413, + "step": 8203 + }, + { + "epoch": 2.3032004491858507, + "grad_norm": 0.6085329651832581, + "learning_rate": 1.555394542417179e-06, + "loss": 0.3203, + "step": 8204 + }, + { + "epoch": 2.303481190342504, + "grad_norm": 0.5232425332069397, + "learning_rate": 1.5542108120473065e-06, + "loss": 0.3966, + "step": 8205 + }, + { + "epoch": 2.3037619314991575, + "grad_norm": 0.6044387221336365, + "learning_rate": 1.5530274493906282e-06, + "loss": 0.3144, + "step": 8206 + }, + { + "epoch": 2.3040426726558114, + "grad_norm": 0.5787413716316223, + "learning_rate": 1.5518444545734267e-06, + "loss": 0.2744, + "step": 8207 + }, + { + "epoch": 2.304323413812465, + "grad_norm": 0.6461145281791687, + "learning_rate": 1.550661827721941e-06, + "loss": 0.3532, + "step": 8208 + }, + { + "epoch": 2.3046041549691183, + "grad_norm": 0.5377730131149292, + "learning_rate": 1.549479568962377e-06, + "loss": 0.338, + "step": 8209 + }, + { + "epoch": 2.304884896125772, + "grad_norm": 0.6564037203788757, + "learning_rate": 1.5482976784208935e-06, + "loss": 0.3503, + "step": 8210 + }, + { + "epoch": 2.3051656372824256, + "grad_norm": 0.5975328087806702, + "learning_rate": 1.5471161562236192e-06, + "loss": 0.3131, + "step": 8211 + }, + { + "epoch": 2.305446378439079, + "grad_norm": 0.5177893042564392, + "learning_rate": 1.5459350024966368e-06, + "loss": 0.3158, + "step": 8212 + }, + { + "epoch": 2.3057271195957325, + "grad_norm": 0.5879108309745789, + "learning_rate": 1.5447542173659897e-06, + "loss": 0.3304, + "step": 8213 + }, + { + "epoch": 2.3060078607523864, + "grad_norm": 0.5290200114250183, + "learning_rate": 1.5435738009576872e-06, + "loss": 0.3021, + "step": 8214 + }, + { + "epoch": 2.30628860190904, + "grad_norm": 0.5285158753395081, + "learning_rate": 1.5423937533976936e-06, + "loss": 0.3485, + "step": 8215 + }, + { + "epoch": 2.3065693430656933, + "grad_norm": 0.5717185735702515, + "learning_rate": 1.541214074811938e-06, + "loss": 0.3601, + "step": 8216 + }, + { + "epoch": 2.306850084222347, + "grad_norm": 0.5635417699813843, + "learning_rate": 1.5400347653263064e-06, + "loss": 0.361, + "step": 8217 + }, + { + "epoch": 2.3071308253790006, + "grad_norm": 0.5670722723007202, + "learning_rate": 1.5388558250666502e-06, + "loss": 0.3308, + "step": 8218 + }, + { + "epoch": 2.307411566535654, + "grad_norm": 0.5696948170661926, + "learning_rate": 1.5376772541587765e-06, + "loss": 0.323, + "step": 8219 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 0.6074270009994507, + "learning_rate": 1.536499052728454e-06, + "loss": 0.3151, + "step": 8220 + }, + { + "epoch": 2.3079730488489614, + "grad_norm": 0.5729271173477173, + "learning_rate": 1.5353212209014163e-06, + "loss": 0.3709, + "step": 8221 + }, + { + "epoch": 2.308253790005615, + "grad_norm": 0.5935203433036804, + "learning_rate": 1.5341437588033508e-06, + "loss": 0.3307, + "step": 8222 + }, + { + "epoch": 2.3085345311622683, + "grad_norm": 0.5793057084083557, + "learning_rate": 1.5329666665599125e-06, + "loss": 0.338, + "step": 8223 + }, + { + "epoch": 2.308815272318922, + "grad_norm": 0.5447109341621399, + "learning_rate": 1.531789944296711e-06, + "loss": 0.3057, + "step": 8224 + }, + { + "epoch": 2.3090960134755756, + "grad_norm": 0.5514894723892212, + "learning_rate": 1.5306135921393178e-06, + "loss": 0.2681, + "step": 8225 + }, + { + "epoch": 2.309376754632229, + "grad_norm": 0.6545277237892151, + "learning_rate": 1.5294376102132691e-06, + "loss": 0.3523, + "step": 8226 + }, + { + "epoch": 2.3096574957888825, + "grad_norm": 0.5393643975257874, + "learning_rate": 1.528261998644055e-06, + "loss": 0.3486, + "step": 8227 + }, + { + "epoch": 2.3099382369455363, + "grad_norm": 0.5711279511451721, + "learning_rate": 1.527086757557133e-06, + "loss": 0.3129, + "step": 8228 + }, + { + "epoch": 2.31021897810219, + "grad_norm": 0.5342314839363098, + "learning_rate": 1.5259118870779134e-06, + "loss": 0.3083, + "step": 8229 + }, + { + "epoch": 2.3104997192588432, + "grad_norm": 0.6208454966545105, + "learning_rate": 1.524737387331775e-06, + "loss": 0.3157, + "step": 8230 + }, + { + "epoch": 2.310780460415497, + "grad_norm": 0.5687270164489746, + "learning_rate": 1.5235632584440503e-06, + "loss": 0.3459, + "step": 8231 + }, + { + "epoch": 2.3110612015721506, + "grad_norm": 0.5974138379096985, + "learning_rate": 1.5223895005400359e-06, + "loss": 0.3014, + "step": 8232 + }, + { + "epoch": 2.311341942728804, + "grad_norm": 0.5893130302429199, + "learning_rate": 1.5212161137449892e-06, + "loss": 0.3337, + "step": 8233 + }, + { + "epoch": 2.3116226838854574, + "grad_norm": 0.5888399481773376, + "learning_rate": 1.5200430981841262e-06, + "loss": 0.3174, + "step": 8234 + }, + { + "epoch": 2.3119034250421113, + "grad_norm": 0.5141733288764954, + "learning_rate": 1.5188704539826222e-06, + "loss": 0.3262, + "step": 8235 + }, + { + "epoch": 2.3121841661987648, + "grad_norm": 0.5367722511291504, + "learning_rate": 1.5176981812656166e-06, + "loss": 0.3919, + "step": 8236 + }, + { + "epoch": 2.312464907355418, + "grad_norm": 0.5129967331886292, + "learning_rate": 1.5165262801582048e-06, + "loss": 0.3121, + "step": 8237 + }, + { + "epoch": 2.312745648512072, + "grad_norm": 0.6207329034805298, + "learning_rate": 1.5153547507854472e-06, + "loss": 0.3898, + "step": 8238 + }, + { + "epoch": 2.3130263896687255, + "grad_norm": 0.5402520298957825, + "learning_rate": 1.5141835932723615e-06, + "loss": 0.3251, + "step": 8239 + }, + { + "epoch": 2.313307130825379, + "grad_norm": 0.5945013761520386, + "learning_rate": 1.5130128077439244e-06, + "loss": 0.3428, + "step": 8240 + }, + { + "epoch": 2.3135878719820324, + "grad_norm": 0.5576349496841431, + "learning_rate": 1.511842394325077e-06, + "loss": 0.3475, + "step": 8241 + }, + { + "epoch": 2.3138686131386863, + "grad_norm": 0.5237705111503601, + "learning_rate": 1.510672353140717e-06, + "loss": 0.3041, + "step": 8242 + }, + { + "epoch": 2.3141493542953397, + "grad_norm": 0.5582318902015686, + "learning_rate": 1.5095026843157062e-06, + "loss": 0.2963, + "step": 8243 + }, + { + "epoch": 2.314430095451993, + "grad_norm": 0.5891752243041992, + "learning_rate": 1.5083333879748618e-06, + "loss": 0.2987, + "step": 8244 + }, + { + "epoch": 2.314710836608647, + "grad_norm": 0.581871747970581, + "learning_rate": 1.507164464242966e-06, + "loss": 0.338, + "step": 8245 + }, + { + "epoch": 2.3149915777653005, + "grad_norm": 0.5098099112510681, + "learning_rate": 1.5059959132447582e-06, + "loss": 0.2718, + "step": 8246 + }, + { + "epoch": 2.315272318921954, + "grad_norm": 0.5287011861801147, + "learning_rate": 1.5048277351049373e-06, + "loss": 0.3499, + "step": 8247 + }, + { + "epoch": 2.3155530600786074, + "grad_norm": 0.6619024872779846, + "learning_rate": 1.503659929948167e-06, + "loss": 0.3568, + "step": 8248 + }, + { + "epoch": 2.3158338012352613, + "grad_norm": 0.5380387902259827, + "learning_rate": 1.5024924978990651e-06, + "loss": 0.3302, + "step": 8249 + }, + { + "epoch": 2.3161145423919147, + "grad_norm": 0.5728728175163269, + "learning_rate": 1.5013254390822158e-06, + "loss": 0.3213, + "step": 8250 + }, + { + "epoch": 2.316395283548568, + "grad_norm": 0.5615373849868774, + "learning_rate": 1.500158753622159e-06, + "loss": 0.3108, + "step": 8251 + }, + { + "epoch": 2.3166760247052216, + "grad_norm": 0.6084820032119751, + "learning_rate": 1.4989924416433943e-06, + "loss": 0.2813, + "step": 8252 + }, + { + "epoch": 2.3169567658618755, + "grad_norm": 0.5133932828903198, + "learning_rate": 1.4978265032703864e-06, + "loss": 0.3426, + "step": 8253 + }, + { + "epoch": 2.317237507018529, + "grad_norm": 0.567959189414978, + "learning_rate": 1.4966609386275538e-06, + "loss": 0.3044, + "step": 8254 + }, + { + "epoch": 2.3175182481751824, + "grad_norm": 0.5489754676818848, + "learning_rate": 1.4954957478392818e-06, + "loss": 0.3123, + "step": 8255 + }, + { + "epoch": 2.317798989331836, + "grad_norm": 0.568737268447876, + "learning_rate": 1.4943309310299086e-06, + "loss": 0.3351, + "step": 8256 + }, + { + "epoch": 2.3180797304884897, + "grad_norm": 0.5404559373855591, + "learning_rate": 1.4931664883237385e-06, + "loss": 0.3401, + "step": 8257 + }, + { + "epoch": 2.318360471645143, + "grad_norm": 0.4898296296596527, + "learning_rate": 1.4920024198450344e-06, + "loss": 0.3225, + "step": 8258 + }, + { + "epoch": 2.3186412128017966, + "grad_norm": 0.5398330688476562, + "learning_rate": 1.4908387257180162e-06, + "loss": 0.3108, + "step": 8259 + }, + { + "epoch": 2.3189219539584505, + "grad_norm": 0.580632746219635, + "learning_rate": 1.4896754060668684e-06, + "loss": 0.2972, + "step": 8260 + }, + { + "epoch": 2.319202695115104, + "grad_norm": 0.6417765021324158, + "learning_rate": 1.488512461015732e-06, + "loss": 0.2738, + "step": 8261 + }, + { + "epoch": 2.3194834362717573, + "grad_norm": 0.580383837223053, + "learning_rate": 1.4873498906887073e-06, + "loss": 0.3589, + "step": 8262 + }, + { + "epoch": 2.319764177428411, + "grad_norm": 0.5803306102752686, + "learning_rate": 1.4861876952098609e-06, + "loss": 0.3285, + "step": 8263 + }, + { + "epoch": 2.3200449185850647, + "grad_norm": 0.5498411655426025, + "learning_rate": 1.485025874703211e-06, + "loss": 0.3219, + "step": 8264 + }, + { + "epoch": 2.320325659741718, + "grad_norm": 0.6075876355171204, + "learning_rate": 1.483864429292743e-06, + "loss": 0.3251, + "step": 8265 + }, + { + "epoch": 2.3206064008983716, + "grad_norm": 0.5450096726417542, + "learning_rate": 1.4827033591023953e-06, + "loss": 0.314, + "step": 8266 + }, + { + "epoch": 2.3208871420550254, + "grad_norm": 0.5331777334213257, + "learning_rate": 1.4815426642560753e-06, + "loss": 0.2894, + "step": 8267 + }, + { + "epoch": 2.321167883211679, + "grad_norm": 0.4995564818382263, + "learning_rate": 1.4803823448776417e-06, + "loss": 0.3716, + "step": 8268 + }, + { + "epoch": 2.3214486243683323, + "grad_norm": 0.5682269930839539, + "learning_rate": 1.4792224010909157e-06, + "loss": 0.3462, + "step": 8269 + }, + { + "epoch": 2.3217293655249858, + "grad_norm": 0.6323981881141663, + "learning_rate": 1.4780628330196821e-06, + "loss": 0.2919, + "step": 8270 + }, + { + "epoch": 2.3220101066816397, + "grad_norm": 0.6075713634490967, + "learning_rate": 1.4769036407876808e-06, + "loss": 0.3343, + "step": 8271 + }, + { + "epoch": 2.322290847838293, + "grad_norm": 0.607307493686676, + "learning_rate": 1.475744824518615e-06, + "loss": 0.3415, + "step": 8272 + }, + { + "epoch": 2.3225715889949465, + "grad_norm": 0.5312021374702454, + "learning_rate": 1.4745863843361459e-06, + "loss": 0.3408, + "step": 8273 + }, + { + "epoch": 2.3228523301516004, + "grad_norm": 0.5947307348251343, + "learning_rate": 1.4734283203638934e-06, + "loss": 0.3654, + "step": 8274 + }, + { + "epoch": 2.323133071308254, + "grad_norm": 0.5386355519294739, + "learning_rate": 1.4722706327254416e-06, + "loss": 0.3531, + "step": 8275 + }, + { + "epoch": 2.3234138124649073, + "grad_norm": 0.5545230507850647, + "learning_rate": 1.4711133215443285e-06, + "loss": 0.321, + "step": 8276 + }, + { + "epoch": 2.3236945536215607, + "grad_norm": 0.5298532843589783, + "learning_rate": 1.4699563869440592e-06, + "loss": 0.3268, + "step": 8277 + }, + { + "epoch": 2.3239752947782146, + "grad_norm": 0.5816928148269653, + "learning_rate": 1.4687998290480904e-06, + "loss": 0.3152, + "step": 8278 + }, + { + "epoch": 2.324256035934868, + "grad_norm": 0.6305320262908936, + "learning_rate": 1.4676436479798462e-06, + "loss": 0.3181, + "step": 8279 + }, + { + "epoch": 2.3245367770915215, + "grad_norm": 0.6032200455665588, + "learning_rate": 1.4664878438627061e-06, + "loss": 0.3855, + "step": 8280 + }, + { + "epoch": 2.3248175182481754, + "grad_norm": 0.5271633267402649, + "learning_rate": 1.4653324168200078e-06, + "loss": 0.3717, + "step": 8281 + }, + { + "epoch": 2.325098259404829, + "grad_norm": 0.5593137741088867, + "learning_rate": 1.4641773669750537e-06, + "loss": 0.3202, + "step": 8282 + }, + { + "epoch": 2.3253790005614823, + "grad_norm": 0.524844765663147, + "learning_rate": 1.4630226944511045e-06, + "loss": 0.3913, + "step": 8283 + }, + { + "epoch": 2.3256597417181357, + "grad_norm": 0.5058882236480713, + "learning_rate": 1.4618683993713773e-06, + "loss": 0.347, + "step": 8284 + }, + { + "epoch": 2.3259404828747896, + "grad_norm": 0.5980703830718994, + "learning_rate": 1.4607144818590536e-06, + "loss": 0.3311, + "step": 8285 + }, + { + "epoch": 2.326221224031443, + "grad_norm": 0.5376325249671936, + "learning_rate": 1.4595609420372692e-06, + "loss": 0.3088, + "step": 8286 + }, + { + "epoch": 2.3265019651880965, + "grad_norm": 0.5730352401733398, + "learning_rate": 1.4584077800291262e-06, + "loss": 0.3595, + "step": 8287 + }, + { + "epoch": 2.3267827063447504, + "grad_norm": 0.5882030725479126, + "learning_rate": 1.4572549959576793e-06, + "loss": 0.3208, + "step": 8288 + }, + { + "epoch": 2.327063447501404, + "grad_norm": 0.545253336429596, + "learning_rate": 1.4561025899459508e-06, + "loss": 0.334, + "step": 8289 + }, + { + "epoch": 2.3273441886580573, + "grad_norm": 0.5363314151763916, + "learning_rate": 1.4549505621169152e-06, + "loss": 0.3279, + "step": 8290 + }, + { + "epoch": 2.3276249298147107, + "grad_norm": 0.5934737920761108, + "learning_rate": 1.4537989125935086e-06, + "loss": 0.3027, + "step": 8291 + }, + { + "epoch": 2.3279056709713646, + "grad_norm": 0.5294120907783508, + "learning_rate": 1.4526476414986318e-06, + "loss": 0.3074, + "step": 8292 + }, + { + "epoch": 2.328186412128018, + "grad_norm": 0.5200093984603882, + "learning_rate": 1.4514967489551373e-06, + "loss": 0.3323, + "step": 8293 + }, + { + "epoch": 2.3284671532846715, + "grad_norm": 0.5722740888595581, + "learning_rate": 1.450346235085845e-06, + "loss": 0.3623, + "step": 8294 + }, + { + "epoch": 2.328747894441325, + "grad_norm": 0.574876606464386, + "learning_rate": 1.4491961000135285e-06, + "loss": 0.3011, + "step": 8295 + }, + { + "epoch": 2.329028635597979, + "grad_norm": 0.5320606231689453, + "learning_rate": 1.4480463438609216e-06, + "loss": 0.3321, + "step": 8296 + }, + { + "epoch": 2.3293093767546322, + "grad_norm": 0.5752878785133362, + "learning_rate": 1.4468969667507222e-06, + "loss": 0.352, + "step": 8297 + }, + { + "epoch": 2.3295901179112857, + "grad_norm": 0.5299527049064636, + "learning_rate": 1.445747968805582e-06, + "loss": 0.3436, + "step": 8298 + }, + { + "epoch": 2.329870859067939, + "grad_norm": 0.5439935922622681, + "learning_rate": 1.4445993501481171e-06, + "loss": 0.3133, + "step": 8299 + }, + { + "epoch": 2.330151600224593, + "grad_norm": 0.5759299993515015, + "learning_rate": 1.4434511109008987e-06, + "loss": 0.2979, + "step": 8300 + }, + { + "epoch": 2.3304323413812464, + "grad_norm": 0.5390260219573975, + "learning_rate": 1.4423032511864626e-06, + "loss": 0.3311, + "step": 8301 + }, + { + "epoch": 2.3307130825379, + "grad_norm": 0.5760179162025452, + "learning_rate": 1.4411557711273e-06, + "loss": 0.3311, + "step": 8302 + }, + { + "epoch": 2.3309938236945538, + "grad_norm": 0.499437540769577, + "learning_rate": 1.4400086708458605e-06, + "loss": 0.315, + "step": 8303 + }, + { + "epoch": 2.331274564851207, + "grad_norm": 0.6799649596214294, + "learning_rate": 1.438861950464559e-06, + "loss": 0.2776, + "step": 8304 + }, + { + "epoch": 2.3315553060078607, + "grad_norm": 0.5366402268409729, + "learning_rate": 1.4377156101057638e-06, + "loss": 0.3265, + "step": 8305 + }, + { + "epoch": 2.331836047164514, + "grad_norm": 0.5740495920181274, + "learning_rate": 1.4365696498918074e-06, + "loss": 0.3323, + "step": 8306 + }, + { + "epoch": 2.332116788321168, + "grad_norm": 0.5453783273696899, + "learning_rate": 1.435424069944979e-06, + "loss": 0.3448, + "step": 8307 + }, + { + "epoch": 2.3323975294778214, + "grad_norm": 0.5498390793800354, + "learning_rate": 1.4342788703875237e-06, + "loss": 0.2801, + "step": 8308 + }, + { + "epoch": 2.332678270634475, + "grad_norm": 0.5661704540252686, + "learning_rate": 1.433134051341657e-06, + "loss": 0.3173, + "step": 8309 + }, + { + "epoch": 2.3329590117911287, + "grad_norm": 0.5494989156723022, + "learning_rate": 1.4319896129295429e-06, + "loss": 0.3518, + "step": 8310 + }, + { + "epoch": 2.333239752947782, + "grad_norm": 0.5851483345031738, + "learning_rate": 1.4308455552733086e-06, + "loss": 0.3302, + "step": 8311 + }, + { + "epoch": 2.3335204941044356, + "grad_norm": 0.5645585060119629, + "learning_rate": 1.4297018784950424e-06, + "loss": 0.3258, + "step": 8312 + }, + { + "epoch": 2.333801235261089, + "grad_norm": 0.5929242968559265, + "learning_rate": 1.428558582716788e-06, + "loss": 0.3174, + "step": 8313 + }, + { + "epoch": 2.334081976417743, + "grad_norm": 0.5702686309814453, + "learning_rate": 1.4274156680605543e-06, + "loss": 0.278, + "step": 8314 + }, + { + "epoch": 2.3343627175743964, + "grad_norm": 0.5488886833190918, + "learning_rate": 1.4262731346483022e-06, + "loss": 0.3334, + "step": 8315 + }, + { + "epoch": 2.33464345873105, + "grad_norm": 0.5933936238288879, + "learning_rate": 1.4251309826019593e-06, + "loss": 0.3424, + "step": 8316 + }, + { + "epoch": 2.3349241998877037, + "grad_norm": 0.5949031710624695, + "learning_rate": 1.4239892120434073e-06, + "loss": 0.3333, + "step": 8317 + }, + { + "epoch": 2.335204941044357, + "grad_norm": 0.5480186939239502, + "learning_rate": 1.422847823094487e-06, + "loss": 0.327, + "step": 8318 + }, + { + "epoch": 2.3354856822010106, + "grad_norm": 0.5760910511016846, + "learning_rate": 1.4217068158770043e-06, + "loss": 0.2974, + "step": 8319 + }, + { + "epoch": 2.335766423357664, + "grad_norm": 0.5917199850082397, + "learning_rate": 1.420566190512716e-06, + "loss": 0.3202, + "step": 8320 + }, + { + "epoch": 2.336047164514318, + "grad_norm": 0.5515584349632263, + "learning_rate": 1.4194259471233468e-06, + "loss": 0.3348, + "step": 8321 + }, + { + "epoch": 2.3363279056709714, + "grad_norm": 0.5509028434753418, + "learning_rate": 1.4182860858305747e-06, + "loss": 0.3378, + "step": 8322 + }, + { + "epoch": 2.336608646827625, + "grad_norm": 0.6656421422958374, + "learning_rate": 1.4171466067560362e-06, + "loss": 0.3618, + "step": 8323 + }, + { + "epoch": 2.3368893879842787, + "grad_norm": 0.5880535244941711, + "learning_rate": 1.4160075100213334e-06, + "loss": 0.3502, + "step": 8324 + }, + { + "epoch": 2.337170129140932, + "grad_norm": 0.5356268286705017, + "learning_rate": 1.41486879574802e-06, + "loss": 0.357, + "step": 8325 + }, + { + "epoch": 2.3374508702975856, + "grad_norm": 0.5595723390579224, + "learning_rate": 1.4137304640576161e-06, + "loss": 0.3307, + "step": 8326 + }, + { + "epoch": 2.337731611454239, + "grad_norm": 0.6235252618789673, + "learning_rate": 1.4125925150715936e-06, + "loss": 0.3056, + "step": 8327 + }, + { + "epoch": 2.338012352610893, + "grad_norm": 0.56630539894104, + "learning_rate": 1.411454948911391e-06, + "loss": 0.3141, + "step": 8328 + }, + { + "epoch": 2.3382930937675463, + "grad_norm": 0.5708787441253662, + "learning_rate": 1.4103177656984007e-06, + "loss": 0.2683, + "step": 8329 + }, + { + "epoch": 2.3385738349242, + "grad_norm": 0.6209465861320496, + "learning_rate": 1.409180965553974e-06, + "loss": 0.2943, + "step": 8330 + }, + { + "epoch": 2.3388545760808537, + "grad_norm": 0.535517156124115, + "learning_rate": 1.4080445485994265e-06, + "loss": 0.3374, + "step": 8331 + }, + { + "epoch": 2.339135317237507, + "grad_norm": 0.5888431072235107, + "learning_rate": 1.4069085149560264e-06, + "loss": 0.3154, + "step": 8332 + }, + { + "epoch": 2.3394160583941606, + "grad_norm": 0.5710810422897339, + "learning_rate": 1.4057728647450053e-06, + "loss": 0.3306, + "step": 8333 + }, + { + "epoch": 2.339696799550814, + "grad_norm": 0.6118029952049255, + "learning_rate": 1.404637598087555e-06, + "loss": 0.3057, + "step": 8334 + }, + { + "epoch": 2.339977540707468, + "grad_norm": 0.5413199663162231, + "learning_rate": 1.4035027151048203e-06, + "loss": 0.3222, + "step": 8335 + }, + { + "epoch": 2.3402582818641213, + "grad_norm": 0.5166553854942322, + "learning_rate": 1.402368215917912e-06, + "loss": 0.3518, + "step": 8336 + }, + { + "epoch": 2.3405390230207748, + "grad_norm": 0.5677252411842346, + "learning_rate": 1.4012341006478947e-06, + "loss": 0.3681, + "step": 8337 + }, + { + "epoch": 2.3408197641774287, + "grad_norm": 0.6056877374649048, + "learning_rate": 1.4001003694157955e-06, + "loss": 0.3207, + "step": 8338 + }, + { + "epoch": 2.341100505334082, + "grad_norm": 0.5278530716896057, + "learning_rate": 1.3989670223425995e-06, + "loss": 0.3372, + "step": 8339 + }, + { + "epoch": 2.3413812464907355, + "grad_norm": 0.6560772061347961, + "learning_rate": 1.3978340595492473e-06, + "loss": 0.3587, + "step": 8340 + }, + { + "epoch": 2.341661987647389, + "grad_norm": 0.49482297897338867, + "learning_rate": 1.396701481156646e-06, + "loss": 0.3258, + "step": 8341 + }, + { + "epoch": 2.341942728804043, + "grad_norm": 0.6362783908843994, + "learning_rate": 1.3955692872856535e-06, + "loss": 0.3311, + "step": 8342 + }, + { + "epoch": 2.3422234699606963, + "grad_norm": 0.6038870215415955, + "learning_rate": 1.3944374780570936e-06, + "loss": 0.3241, + "step": 8343 + }, + { + "epoch": 2.3425042111173497, + "grad_norm": 0.5476205348968506, + "learning_rate": 1.393306053591744e-06, + "loss": 0.3351, + "step": 8344 + }, + { + "epoch": 2.342784952274003, + "grad_norm": 0.5627130270004272, + "learning_rate": 1.3921750140103429e-06, + "loss": 0.3579, + "step": 8345 + }, + { + "epoch": 2.343065693430657, + "grad_norm": 0.5221319794654846, + "learning_rate": 1.3910443594335904e-06, + "loss": 0.386, + "step": 8346 + }, + { + "epoch": 2.3433464345873105, + "grad_norm": 0.6206731796264648, + "learning_rate": 1.3899140899821396e-06, + "loss": 0.323, + "step": 8347 + }, + { + "epoch": 2.343627175743964, + "grad_norm": 0.6443973779678345, + "learning_rate": 1.3887842057766089e-06, + "loss": 0.353, + "step": 8348 + }, + { + "epoch": 2.3439079169006174, + "grad_norm": 0.6118109226226807, + "learning_rate": 1.3876547069375696e-06, + "loss": 0.3177, + "step": 8349 + }, + { + "epoch": 2.3441886580572713, + "grad_norm": 0.6209521293640137, + "learning_rate": 1.3865255935855577e-06, + "loss": 0.2785, + "step": 8350 + }, + { + "epoch": 2.3444693992139247, + "grad_norm": 0.5720152854919434, + "learning_rate": 1.3853968658410638e-06, + "loss": 0.3032, + "step": 8351 + }, + { + "epoch": 2.344750140370578, + "grad_norm": 0.5183261036872864, + "learning_rate": 1.384268523824537e-06, + "loss": 0.3791, + "step": 8352 + }, + { + "epoch": 2.345030881527232, + "grad_norm": 0.5950300097465515, + "learning_rate": 1.3831405676563902e-06, + "loss": 0.2945, + "step": 8353 + }, + { + "epoch": 2.3453116226838855, + "grad_norm": 0.5488893985748291, + "learning_rate": 1.382012997456989e-06, + "loss": 0.3289, + "step": 8354 + }, + { + "epoch": 2.345592363840539, + "grad_norm": 0.666434109210968, + "learning_rate": 1.380885813346663e-06, + "loss": 0.2815, + "step": 8355 + }, + { + "epoch": 2.3458731049971924, + "grad_norm": 0.5665557384490967, + "learning_rate": 1.3797590154456975e-06, + "loss": 0.3674, + "step": 8356 + }, + { + "epoch": 2.3461538461538463, + "grad_norm": 0.6095634698867798, + "learning_rate": 1.3786326038743354e-06, + "loss": 0.3822, + "step": 8357 + }, + { + "epoch": 2.3464345873104997, + "grad_norm": 0.5665314793586731, + "learning_rate": 1.3775065787527837e-06, + "loss": 0.3284, + "step": 8358 + }, + { + "epoch": 2.346715328467153, + "grad_norm": 0.5768237709999084, + "learning_rate": 1.3763809402012012e-06, + "loss": 0.2781, + "step": 8359 + }, + { + "epoch": 2.346996069623807, + "grad_norm": 0.5959749221801758, + "learning_rate": 1.3752556883397116e-06, + "loss": 0.289, + "step": 8360 + }, + { + "epoch": 2.3472768107804605, + "grad_norm": 0.5341787338256836, + "learning_rate": 1.3741308232883955e-06, + "loss": 0.3232, + "step": 8361 + }, + { + "epoch": 2.347557551937114, + "grad_norm": 0.4947063624858856, + "learning_rate": 1.3730063451672882e-06, + "loss": 0.3399, + "step": 8362 + }, + { + "epoch": 2.3478382930937673, + "grad_norm": 0.6144747734069824, + "learning_rate": 1.3718822540963906e-06, + "loss": 0.3154, + "step": 8363 + }, + { + "epoch": 2.3481190342504212, + "grad_norm": 0.5683850646018982, + "learning_rate": 1.370758550195656e-06, + "loss": 0.3686, + "step": 8364 + }, + { + "epoch": 2.3483997754070747, + "grad_norm": 0.5459191799163818, + "learning_rate": 1.369635233585001e-06, + "loss": 0.3309, + "step": 8365 + }, + { + "epoch": 2.348680516563728, + "grad_norm": 0.5615030527114868, + "learning_rate": 1.3685123043842985e-06, + "loss": 0.3489, + "step": 8366 + }, + { + "epoch": 2.348961257720382, + "grad_norm": 0.546295702457428, + "learning_rate": 1.3673897627133787e-06, + "loss": 0.3599, + "step": 8367 + }, + { + "epoch": 2.3492419988770354, + "grad_norm": 0.5746467709541321, + "learning_rate": 1.3662676086920352e-06, + "loss": 0.3449, + "step": 8368 + }, + { + "epoch": 2.349522740033689, + "grad_norm": 0.6438571810722351, + "learning_rate": 1.3651458424400143e-06, + "loss": 0.3255, + "step": 8369 + }, + { + "epoch": 2.3498034811903423, + "grad_norm": 0.5440747141838074, + "learning_rate": 1.3640244640770266e-06, + "loss": 0.3148, + "step": 8370 + }, + { + "epoch": 2.350084222346996, + "grad_norm": 0.575641930103302, + "learning_rate": 1.3629034737227382e-06, + "loss": 0.3093, + "step": 8371 + }, + { + "epoch": 2.3503649635036497, + "grad_norm": 0.5817500948905945, + "learning_rate": 1.3617828714967713e-06, + "loss": 0.3389, + "step": 8372 + }, + { + "epoch": 2.350645704660303, + "grad_norm": 0.5214036107063293, + "learning_rate": 1.3606626575187138e-06, + "loss": 0.3328, + "step": 8373 + }, + { + "epoch": 2.350926445816957, + "grad_norm": 0.5743655562400818, + "learning_rate": 1.359542831908104e-06, + "loss": 0.3295, + "step": 8374 + }, + { + "epoch": 2.3512071869736104, + "grad_norm": 0.5960953235626221, + "learning_rate": 1.358423394784446e-06, + "loss": 0.3077, + "step": 8375 + }, + { + "epoch": 2.351487928130264, + "grad_norm": 0.5571728348731995, + "learning_rate": 1.357304346267197e-06, + "loss": 0.3136, + "step": 8376 + }, + { + "epoch": 2.3517686692869173, + "grad_norm": 0.5779780745506287, + "learning_rate": 1.3561856864757767e-06, + "loss": 0.3465, + "step": 8377 + }, + { + "epoch": 2.352049410443571, + "grad_norm": 0.5787519216537476, + "learning_rate": 1.3550674155295606e-06, + "loss": 0.3557, + "step": 8378 + }, + { + "epoch": 2.3523301516002246, + "grad_norm": 0.5545042157173157, + "learning_rate": 1.3539495335478826e-06, + "loss": 0.3076, + "step": 8379 + }, + { + "epoch": 2.352610892756878, + "grad_norm": 0.564544141292572, + "learning_rate": 1.3528320406500378e-06, + "loss": 0.3521, + "step": 8380 + }, + { + "epoch": 2.352891633913532, + "grad_norm": 0.6182708144187927, + "learning_rate": 1.3517149369552762e-06, + "loss": 0.3474, + "step": 8381 + }, + { + "epoch": 2.3531723750701854, + "grad_norm": 0.587466299533844, + "learning_rate": 1.3505982225828113e-06, + "loss": 0.3348, + "step": 8382 + }, + { + "epoch": 2.353453116226839, + "grad_norm": 0.5764768123626709, + "learning_rate": 1.3494818976518093e-06, + "loss": 0.3901, + "step": 8383 + }, + { + "epoch": 2.3537338573834923, + "grad_norm": 0.5216163396835327, + "learning_rate": 1.3483659622813954e-06, + "loss": 0.3857, + "step": 8384 + }, + { + "epoch": 2.354014598540146, + "grad_norm": 0.5445965528488159, + "learning_rate": 1.3472504165906614e-06, + "loss": 0.2882, + "step": 8385 + }, + { + "epoch": 2.3542953396967996, + "grad_norm": 0.5460180044174194, + "learning_rate": 1.3461352606986456e-06, + "loss": 0.3148, + "step": 8386 + }, + { + "epoch": 2.354576080853453, + "grad_norm": 0.6001867651939392, + "learning_rate": 1.345020494724355e-06, + "loss": 0.329, + "step": 8387 + }, + { + "epoch": 2.3548568220101065, + "grad_norm": 0.5801257491111755, + "learning_rate": 1.343906118786748e-06, + "loss": 0.3104, + "step": 8388 + }, + { + "epoch": 2.3551375631667604, + "grad_norm": 0.6232779026031494, + "learning_rate": 1.3427921330047434e-06, + "loss": 0.3066, + "step": 8389 + }, + { + "epoch": 2.355418304323414, + "grad_norm": 0.6081386208534241, + "learning_rate": 1.3416785374972208e-06, + "loss": 0.2967, + "step": 8390 + }, + { + "epoch": 2.3556990454800673, + "grad_norm": 0.6113626956939697, + "learning_rate": 1.3405653323830136e-06, + "loss": 0.2924, + "step": 8391 + }, + { + "epoch": 2.3559797866367207, + "grad_norm": 0.5782427787780762, + "learning_rate": 1.3394525177809187e-06, + "loss": 0.3128, + "step": 8392 + }, + { + "epoch": 2.3562605277933746, + "grad_norm": 0.5502467751502991, + "learning_rate": 1.338340093809688e-06, + "loss": 0.3099, + "step": 8393 + }, + { + "epoch": 2.356541268950028, + "grad_norm": 0.5638527274131775, + "learning_rate": 1.337228060588031e-06, + "loss": 0.3496, + "step": 8394 + }, + { + "epoch": 2.3568220101066815, + "grad_norm": 0.5481851100921631, + "learning_rate": 1.3361164182346193e-06, + "loss": 0.3298, + "step": 8395 + }, + { + "epoch": 2.3571027512633353, + "grad_norm": 0.5715014934539795, + "learning_rate": 1.3350051668680775e-06, + "loss": 0.331, + "step": 8396 + }, + { + "epoch": 2.357383492419989, + "grad_norm": 0.5994958877563477, + "learning_rate": 1.3338943066069948e-06, + "loss": 0.3384, + "step": 8397 + }, + { + "epoch": 2.3576642335766422, + "grad_norm": 0.5965902805328369, + "learning_rate": 1.3327838375699127e-06, + "loss": 0.3061, + "step": 8398 + }, + { + "epoch": 2.3579449747332957, + "grad_norm": 0.5985276103019714, + "learning_rate": 1.3316737598753354e-06, + "loss": 0.2946, + "step": 8399 + }, + { + "epoch": 2.3582257158899496, + "grad_norm": 0.610192596912384, + "learning_rate": 1.330564073641723e-06, + "loss": 0.3425, + "step": 8400 + }, + { + "epoch": 2.358506457046603, + "grad_norm": 0.5971916913986206, + "learning_rate": 1.3294547789874924e-06, + "loss": 0.3113, + "step": 8401 + }, + { + "epoch": 2.3587871982032564, + "grad_norm": 0.5463565587997437, + "learning_rate": 1.3283458760310237e-06, + "loss": 0.3726, + "step": 8402 + }, + { + "epoch": 2.3590679393599103, + "grad_norm": 0.5724221467971802, + "learning_rate": 1.327237364890649e-06, + "loss": 0.3495, + "step": 8403 + }, + { + "epoch": 2.3593486805165638, + "grad_norm": 0.5457465648651123, + "learning_rate": 1.3261292456846648e-06, + "loss": 0.3181, + "step": 8404 + }, + { + "epoch": 2.359629421673217, + "grad_norm": 0.5801199674606323, + "learning_rate": 1.3250215185313208e-06, + "loss": 0.3261, + "step": 8405 + }, + { + "epoch": 2.3599101628298707, + "grad_norm": 0.5199431777000427, + "learning_rate": 1.3239141835488261e-06, + "loss": 0.3523, + "step": 8406 + }, + { + "epoch": 2.3601909039865245, + "grad_norm": 0.5299031734466553, + "learning_rate": 1.322807240855351e-06, + "loss": 0.3292, + "step": 8407 + }, + { + "epoch": 2.360471645143178, + "grad_norm": 0.5299385190010071, + "learning_rate": 1.3217006905690189e-06, + "loss": 0.3163, + "step": 8408 + }, + { + "epoch": 2.3607523862998314, + "grad_norm": 0.5740717649459839, + "learning_rate": 1.3205945328079157e-06, + "loss": 0.3241, + "step": 8409 + }, + { + "epoch": 2.3610331274564853, + "grad_norm": 0.5934680700302124, + "learning_rate": 1.3194887676900841e-06, + "loss": 0.3565, + "step": 8410 + }, + { + "epoch": 2.3613138686131387, + "grad_norm": 0.5834106206893921, + "learning_rate": 1.3183833953335224e-06, + "loss": 0.3472, + "step": 8411 + }, + { + "epoch": 2.361594609769792, + "grad_norm": 0.5752813220024109, + "learning_rate": 1.3172784158561913e-06, + "loss": 0.3529, + "step": 8412 + }, + { + "epoch": 2.3618753509264456, + "grad_norm": 0.5484989285469055, + "learning_rate": 1.3161738293760052e-06, + "loss": 0.304, + "step": 8413 + }, + { + "epoch": 2.3621560920830995, + "grad_norm": 0.5734193921089172, + "learning_rate": 1.315069636010841e-06, + "loss": 0.3073, + "step": 8414 + }, + { + "epoch": 2.362436833239753, + "grad_norm": 0.5143022537231445, + "learning_rate": 1.3139658358785306e-06, + "loss": 0.3348, + "step": 8415 + }, + { + "epoch": 2.3627175743964064, + "grad_norm": 0.5107776522636414, + "learning_rate": 1.3128624290968628e-06, + "loss": 0.3013, + "step": 8416 + }, + { + "epoch": 2.3629983155530603, + "grad_norm": 0.5321611762046814, + "learning_rate": 1.3117594157835895e-06, + "loss": 0.3234, + "step": 8417 + }, + { + "epoch": 2.3632790567097137, + "grad_norm": 0.5909276008605957, + "learning_rate": 1.3106567960564136e-06, + "loss": 0.3481, + "step": 8418 + }, + { + "epoch": 2.363559797866367, + "grad_norm": 0.513523519039154, + "learning_rate": 1.3095545700330037e-06, + "loss": 0.303, + "step": 8419 + }, + { + "epoch": 2.3638405390230206, + "grad_norm": 0.5815671682357788, + "learning_rate": 1.3084527378309792e-06, + "loss": 0.325, + "step": 8420 + }, + { + "epoch": 2.3641212801796745, + "grad_norm": 0.5334650874137878, + "learning_rate": 1.3073512995679238e-06, + "loss": 0.3154, + "step": 8421 + }, + { + "epoch": 2.364402021336328, + "grad_norm": 0.5904586911201477, + "learning_rate": 1.3062502553613743e-06, + "loss": 0.3228, + "step": 8422 + }, + { + "epoch": 2.3646827624929814, + "grad_norm": 0.5955625176429749, + "learning_rate": 1.3051496053288265e-06, + "loss": 0.3448, + "step": 8423 + }, + { + "epoch": 2.3649635036496353, + "grad_norm": 0.5929975509643555, + "learning_rate": 1.3040493495877376e-06, + "loss": 0.3479, + "step": 8424 + }, + { + "epoch": 2.3652442448062887, + "grad_norm": 0.5295895934104919, + "learning_rate": 1.3029494882555166e-06, + "loss": 0.339, + "step": 8425 + }, + { + "epoch": 2.365524985962942, + "grad_norm": 0.5702146887779236, + "learning_rate": 1.3018500214495378e-06, + "loss": 0.3571, + "step": 8426 + }, + { + "epoch": 2.3658057271195956, + "grad_norm": 0.5665997862815857, + "learning_rate": 1.3007509492871274e-06, + "loss": 0.3202, + "step": 8427 + }, + { + "epoch": 2.3660864682762495, + "grad_norm": 0.5769081711769104, + "learning_rate": 1.2996522718855698e-06, + "loss": 0.3324, + "step": 8428 + }, + { + "epoch": 2.366367209432903, + "grad_norm": 0.6712439060211182, + "learning_rate": 1.2985539893621123e-06, + "loss": 0.3498, + "step": 8429 + }, + { + "epoch": 2.3666479505895563, + "grad_norm": 0.5461412668228149, + "learning_rate": 1.2974561018339537e-06, + "loss": 0.3305, + "step": 8430 + }, + { + "epoch": 2.3669286917462102, + "grad_norm": 0.5450289845466614, + "learning_rate": 1.2963586094182573e-06, + "loss": 0.3477, + "step": 8431 + }, + { + "epoch": 2.3672094329028637, + "grad_norm": 0.5062228441238403, + "learning_rate": 1.2952615122321366e-06, + "loss": 0.3232, + "step": 8432 + }, + { + "epoch": 2.367490174059517, + "grad_norm": 0.5167391300201416, + "learning_rate": 1.2941648103926712e-06, + "loss": 0.3583, + "step": 8433 + }, + { + "epoch": 2.3677709152161706, + "grad_norm": 0.5224451422691345, + "learning_rate": 1.2930685040168916e-06, + "loss": 0.3486, + "step": 8434 + }, + { + "epoch": 2.3680516563728244, + "grad_norm": 0.5944300293922424, + "learning_rate": 1.2919725932217863e-06, + "loss": 0.2567, + "step": 8435 + }, + { + "epoch": 2.368332397529478, + "grad_norm": 0.5734946131706238, + "learning_rate": 1.2908770781243108e-06, + "loss": 0.3202, + "step": 8436 + }, + { + "epoch": 2.3686131386861313, + "grad_norm": 0.5120568871498108, + "learning_rate": 1.2897819588413675e-06, + "loss": 0.3213, + "step": 8437 + }, + { + "epoch": 2.3688938798427848, + "grad_norm": 0.5835923552513123, + "learning_rate": 1.28868723548982e-06, + "loss": 0.2747, + "step": 8438 + }, + { + "epoch": 2.3691746209994387, + "grad_norm": 0.5205466151237488, + "learning_rate": 1.287592908186493e-06, + "loss": 0.3618, + "step": 8439 + }, + { + "epoch": 2.369455362156092, + "grad_norm": 0.6142522692680359, + "learning_rate": 1.2864989770481634e-06, + "loss": 0.3048, + "step": 8440 + }, + { + "epoch": 2.3697361033127455, + "grad_norm": 0.5458880662918091, + "learning_rate": 1.2854054421915712e-06, + "loss": 0.3434, + "step": 8441 + }, + { + "epoch": 2.370016844469399, + "grad_norm": 0.5864067077636719, + "learning_rate": 1.2843123037334115e-06, + "loss": 0.2684, + "step": 8442 + }, + { + "epoch": 2.370297585626053, + "grad_norm": 0.5875257253646851, + "learning_rate": 1.2832195617903342e-06, + "loss": 0.3211, + "step": 8443 + }, + { + "epoch": 2.3705783267827063, + "grad_norm": 0.5107319951057434, + "learning_rate": 1.2821272164789544e-06, + "loss": 0.3581, + "step": 8444 + }, + { + "epoch": 2.3708590679393597, + "grad_norm": 0.630216121673584, + "learning_rate": 1.2810352679158362e-06, + "loss": 0.3138, + "step": 8445 + }, + { + "epoch": 2.3711398090960136, + "grad_norm": 0.5387038588523865, + "learning_rate": 1.2799437162175087e-06, + "loss": 0.3519, + "step": 8446 + }, + { + "epoch": 2.371420550252667, + "grad_norm": 0.5554496049880981, + "learning_rate": 1.2788525615004532e-06, + "loss": 0.3683, + "step": 8447 + }, + { + "epoch": 2.3717012914093205, + "grad_norm": 0.547758936882019, + "learning_rate": 1.2777618038811134e-06, + "loss": 0.3423, + "step": 8448 + }, + { + "epoch": 2.371982032565974, + "grad_norm": 0.5472351312637329, + "learning_rate": 1.276671443475887e-06, + "loss": 0.3288, + "step": 8449 + }, + { + "epoch": 2.372262773722628, + "grad_norm": 0.6356356143951416, + "learning_rate": 1.275581480401129e-06, + "loss": 0.3491, + "step": 8450 + }, + { + "epoch": 2.3725435148792813, + "grad_norm": 0.5399888157844543, + "learning_rate": 1.274491914773156e-06, + "loss": 0.3149, + "step": 8451 + }, + { + "epoch": 2.3728242560359347, + "grad_norm": 0.5299639105796814, + "learning_rate": 1.2734027467082366e-06, + "loss": 0.3121, + "step": 8452 + }, + { + "epoch": 2.3731049971925886, + "grad_norm": 0.4775037467479706, + "learning_rate": 1.2723139763226039e-06, + "loss": 0.3176, + "step": 8453 + }, + { + "epoch": 2.373385738349242, + "grad_norm": 0.534806489944458, + "learning_rate": 1.2712256037324421e-06, + "loss": 0.326, + "step": 8454 + }, + { + "epoch": 2.3736664795058955, + "grad_norm": 0.6329243779182434, + "learning_rate": 1.2701376290538952e-06, + "loss": 0.3308, + "step": 8455 + }, + { + "epoch": 2.373947220662549, + "grad_norm": 0.5237137079238892, + "learning_rate": 1.2690500524030675e-06, + "loss": 0.2707, + "step": 8456 + }, + { + "epoch": 2.374227961819203, + "grad_norm": 0.5480474829673767, + "learning_rate": 1.2679628738960155e-06, + "loss": 0.3525, + "step": 8457 + }, + { + "epoch": 2.3745087029758563, + "grad_norm": 0.5983497500419617, + "learning_rate": 1.266876093648759e-06, + "loss": 0.325, + "step": 8458 + }, + { + "epoch": 2.3747894441325097, + "grad_norm": 0.6835537552833557, + "learning_rate": 1.2657897117772695e-06, + "loss": 0.3166, + "step": 8459 + }, + { + "epoch": 2.3750701852891636, + "grad_norm": 0.5635731220245361, + "learning_rate": 1.2647037283974805e-06, + "loss": 0.3609, + "step": 8460 + }, + { + "epoch": 2.375350926445817, + "grad_norm": 0.5700622797012329, + "learning_rate": 1.2636181436252826e-06, + "loss": 0.326, + "step": 8461 + }, + { + "epoch": 2.3756316676024705, + "grad_norm": 0.5278427600860596, + "learning_rate": 1.2625329575765198e-06, + "loss": 0.3247, + "step": 8462 + }, + { + "epoch": 2.375912408759124, + "grad_norm": 0.5857083797454834, + "learning_rate": 1.261448170367e-06, + "loss": 0.3155, + "step": 8463 + }, + { + "epoch": 2.376193149915778, + "grad_norm": 0.5428717136383057, + "learning_rate": 1.2603637821124825e-06, + "loss": 0.3218, + "step": 8464 + }, + { + "epoch": 2.3764738910724312, + "grad_norm": 0.5713076591491699, + "learning_rate": 1.259279792928686e-06, + "loss": 0.2934, + "step": 8465 + }, + { + "epoch": 2.3767546322290847, + "grad_norm": 0.5731424689292908, + "learning_rate": 1.2581962029312889e-06, + "loss": 0.3246, + "step": 8466 + }, + { + "epoch": 2.3770353733857386, + "grad_norm": 0.5826208591461182, + "learning_rate": 1.2571130122359226e-06, + "loss": 0.3603, + "step": 8467 + }, + { + "epoch": 2.377316114542392, + "grad_norm": 0.5617663860321045, + "learning_rate": 1.2560302209581822e-06, + "loss": 0.3218, + "step": 8468 + }, + { + "epoch": 2.3775968556990454, + "grad_norm": 0.5224429368972778, + "learning_rate": 1.254947829213613e-06, + "loss": 0.3335, + "step": 8469 + }, + { + "epoch": 2.377877596855699, + "grad_norm": 0.5597840547561646, + "learning_rate": 1.2538658371177236e-06, + "loss": 0.3352, + "step": 8470 + }, + { + "epoch": 2.3781583380123528, + "grad_norm": 0.5383083820343018, + "learning_rate": 1.2527842447859762e-06, + "loss": 0.3496, + "step": 8471 + }, + { + "epoch": 2.378439079169006, + "grad_norm": 0.6109507083892822, + "learning_rate": 1.2517030523337908e-06, + "loss": 0.2995, + "step": 8472 + }, + { + "epoch": 2.3787198203256597, + "grad_norm": 0.6926080584526062, + "learning_rate": 1.2506222598765477e-06, + "loss": 0.2796, + "step": 8473 + }, + { + "epoch": 2.3790005614823135, + "grad_norm": 0.5966923236846924, + "learning_rate": 1.24954186752958e-06, + "loss": 0.3514, + "step": 8474 + }, + { + "epoch": 2.379281302638967, + "grad_norm": 0.5545368194580078, + "learning_rate": 1.248461875408183e-06, + "loss": 0.3558, + "step": 8475 + }, + { + "epoch": 2.3795620437956204, + "grad_norm": 0.5807587504386902, + "learning_rate": 1.2473822836276056e-06, + "loss": 0.307, + "step": 8476 + }, + { + "epoch": 2.379842784952274, + "grad_norm": 0.5329170823097229, + "learning_rate": 1.2463030923030527e-06, + "loss": 0.351, + "step": 8477 + }, + { + "epoch": 2.3801235261089277, + "grad_norm": 0.525782585144043, + "learning_rate": 1.2452243015496934e-06, + "loss": 0.3402, + "step": 8478 + }, + { + "epoch": 2.380404267265581, + "grad_norm": 0.583706259727478, + "learning_rate": 1.2441459114826454e-06, + "loss": 0.3038, + "step": 8479 + }, + { + "epoch": 2.3806850084222346, + "grad_norm": 0.5716387629508972, + "learning_rate": 1.2430679222169911e-06, + "loss": 0.3398, + "step": 8480 + }, + { + "epoch": 2.3809657495788885, + "grad_norm": 0.5780821442604065, + "learning_rate": 1.2419903338677636e-06, + "loss": 0.319, + "step": 8481 + }, + { + "epoch": 2.381246490735542, + "grad_norm": 0.6285926699638367, + "learning_rate": 1.2409131465499602e-06, + "loss": 0.3012, + "step": 8482 + }, + { + "epoch": 2.3815272318921954, + "grad_norm": 0.5814008712768555, + "learning_rate": 1.239836360378529e-06, + "loss": 0.3092, + "step": 8483 + }, + { + "epoch": 2.381807973048849, + "grad_norm": 0.5618540644645691, + "learning_rate": 1.2387599754683777e-06, + "loss": 0.3377, + "step": 8484 + }, + { + "epoch": 2.3820887142055023, + "grad_norm": 0.5493171811103821, + "learning_rate": 1.2376839919343731e-06, + "loss": 0.3365, + "step": 8485 + }, + { + "epoch": 2.382369455362156, + "grad_norm": 0.8610407114028931, + "learning_rate": 1.236608409891335e-06, + "loss": 0.3283, + "step": 8486 + }, + { + "epoch": 2.3826501965188096, + "grad_norm": 0.6050732731819153, + "learning_rate": 1.235533229454045e-06, + "loss": 0.3433, + "step": 8487 + }, + { + "epoch": 2.382930937675463, + "grad_norm": 0.5976834893226624, + "learning_rate": 1.2344584507372404e-06, + "loss": 0.3333, + "step": 8488 + }, + { + "epoch": 2.383211678832117, + "grad_norm": 0.5333192944526672, + "learning_rate": 1.233384073855612e-06, + "loss": 0.3488, + "step": 8489 + }, + { + "epoch": 2.3834924199887704, + "grad_norm": 0.5537493824958801, + "learning_rate": 1.2323100989238136e-06, + "loss": 0.3466, + "step": 8490 + }, + { + "epoch": 2.383773161145424, + "grad_norm": 0.5674349665641785, + "learning_rate": 1.2312365260564513e-06, + "loss": 0.337, + "step": 8491 + }, + { + "epoch": 2.3840539023020773, + "grad_norm": 0.546229362487793, + "learning_rate": 1.2301633553680896e-06, + "loss": 0.2806, + "step": 8492 + }, + { + "epoch": 2.384334643458731, + "grad_norm": 0.5464627742767334, + "learning_rate": 1.2290905869732529e-06, + "loss": 0.3387, + "step": 8493 + }, + { + "epoch": 2.3846153846153846, + "grad_norm": 0.5359675884246826, + "learning_rate": 1.2280182209864177e-06, + "loss": 0.3151, + "step": 8494 + }, + { + "epoch": 2.384896125772038, + "grad_norm": 0.6023908853530884, + "learning_rate": 1.2269462575220226e-06, + "loss": 0.3012, + "step": 8495 + }, + { + "epoch": 2.385176866928692, + "grad_norm": 0.4947608411312103, + "learning_rate": 1.2258746966944591e-06, + "loss": 0.339, + "step": 8496 + }, + { + "epoch": 2.3854576080853453, + "grad_norm": 0.631801187992096, + "learning_rate": 1.224803538618079e-06, + "loss": 0.3245, + "step": 8497 + }, + { + "epoch": 2.385738349241999, + "grad_norm": 0.5674340724945068, + "learning_rate": 1.2237327834071893e-06, + "loss": 0.2874, + "step": 8498 + }, + { + "epoch": 2.3860190903986522, + "grad_norm": 0.5879778265953064, + "learning_rate": 1.2226624311760521e-06, + "loss": 0.3076, + "step": 8499 + }, + { + "epoch": 2.386299831555306, + "grad_norm": 0.5510382652282715, + "learning_rate": 1.2215924820388919e-06, + "loss": 0.3161, + "step": 8500 + }, + { + "epoch": 2.3865805727119596, + "grad_norm": 0.5921781063079834, + "learning_rate": 1.2205229361098847e-06, + "loss": 0.2966, + "step": 8501 + }, + { + "epoch": 2.386861313868613, + "grad_norm": 0.6243314146995544, + "learning_rate": 1.219453793503168e-06, + "loss": 0.3108, + "step": 8502 + }, + { + "epoch": 2.387142055025267, + "grad_norm": 0.5734645128250122, + "learning_rate": 1.2183850543328313e-06, + "loss": 0.331, + "step": 8503 + }, + { + "epoch": 2.3874227961819203, + "grad_norm": 0.5308526158332825, + "learning_rate": 1.2173167187129265e-06, + "loss": 0.3404, + "step": 8504 + }, + { + "epoch": 2.3877035373385738, + "grad_norm": 0.5437654852867126, + "learning_rate": 1.216248786757459e-06, + "loss": 0.3325, + "step": 8505 + }, + { + "epoch": 2.387984278495227, + "grad_norm": 0.5825144648551941, + "learning_rate": 1.2151812585803895e-06, + "loss": 0.3626, + "step": 8506 + }, + { + "epoch": 2.388265019651881, + "grad_norm": 0.5162522792816162, + "learning_rate": 1.2141141342956414e-06, + "loss": 0.3759, + "step": 8507 + }, + { + "epoch": 2.3885457608085345, + "grad_norm": 0.5401203632354736, + "learning_rate": 1.213047414017089e-06, + "loss": 0.3429, + "step": 8508 + }, + { + "epoch": 2.388826501965188, + "grad_norm": 0.5947383642196655, + "learning_rate": 1.2119810978585678e-06, + "loss": 0.3491, + "step": 8509 + }, + { + "epoch": 2.389107243121842, + "grad_norm": 0.5353080630302429, + "learning_rate": 1.210915185933868e-06, + "loss": 0.3527, + "step": 8510 + }, + { + "epoch": 2.3893879842784953, + "grad_norm": 0.5459492206573486, + "learning_rate": 1.2098496783567343e-06, + "loss": 0.349, + "step": 8511 + }, + { + "epoch": 2.3896687254351487, + "grad_norm": 0.5531647801399231, + "learning_rate": 1.208784575240876e-06, + "loss": 0.3292, + "step": 8512 + }, + { + "epoch": 2.389949466591802, + "grad_norm": 0.5723190903663635, + "learning_rate": 1.207719876699952e-06, + "loss": 0.3563, + "step": 8513 + }, + { + "epoch": 2.390230207748456, + "grad_norm": 0.5542603135108948, + "learning_rate": 1.2066555828475785e-06, + "loss": 0.3116, + "step": 8514 + }, + { + "epoch": 2.3905109489051095, + "grad_norm": 0.5417312979698181, + "learning_rate": 1.2055916937973333e-06, + "loss": 0.3537, + "step": 8515 + }, + { + "epoch": 2.390791690061763, + "grad_norm": 0.5427179336547852, + "learning_rate": 1.2045282096627453e-06, + "loss": 0.38, + "step": 8516 + }, + { + "epoch": 2.391072431218417, + "grad_norm": 0.5366474986076355, + "learning_rate": 1.2034651305573059e-06, + "loss": 0.2698, + "step": 8517 + }, + { + "epoch": 2.3913531723750703, + "grad_norm": 0.5877828001976013, + "learning_rate": 1.2024024565944576e-06, + "loss": 0.3196, + "step": 8518 + }, + { + "epoch": 2.3916339135317237, + "grad_norm": 0.5598554611206055, + "learning_rate": 1.2013401878876042e-06, + "loss": 0.3598, + "step": 8519 + }, + { + "epoch": 2.391914654688377, + "grad_norm": 0.4990025460720062, + "learning_rate": 1.2002783245501038e-06, + "loss": 0.3537, + "step": 8520 + }, + { + "epoch": 2.392195395845031, + "grad_norm": 0.5653421878814697, + "learning_rate": 1.1992168666952703e-06, + "loss": 0.3424, + "step": 8521 + }, + { + "epoch": 2.3924761370016845, + "grad_norm": 0.543632447719574, + "learning_rate": 1.1981558144363787e-06, + "loss": 0.3377, + "step": 8522 + }, + { + "epoch": 2.392756878158338, + "grad_norm": 0.5864419937133789, + "learning_rate": 1.1970951678866555e-06, + "loss": 0.3312, + "step": 8523 + }, + { + "epoch": 2.393037619314992, + "grad_norm": 0.5528798699378967, + "learning_rate": 1.196034927159288e-06, + "loss": 0.3036, + "step": 8524 + }, + { + "epoch": 2.3933183604716453, + "grad_norm": 0.5613815784454346, + "learning_rate": 1.194975092367418e-06, + "loss": 0.3248, + "step": 8525 + }, + { + "epoch": 2.3935991016282987, + "grad_norm": 0.6046698689460754, + "learning_rate": 1.1939156636241429e-06, + "loss": 0.3092, + "step": 8526 + }, + { + "epoch": 2.393879842784952, + "grad_norm": 0.5368297696113586, + "learning_rate": 1.1928566410425213e-06, + "loss": 0.3377, + "step": 8527 + }, + { + "epoch": 2.394160583941606, + "grad_norm": 0.5937784314155579, + "learning_rate": 1.1917980247355621e-06, + "loss": 0.3526, + "step": 8528 + }, + { + "epoch": 2.3944413250982595, + "grad_norm": 0.5347118377685547, + "learning_rate": 1.190739814816238e-06, + "loss": 0.2996, + "step": 8529 + }, + { + "epoch": 2.394722066254913, + "grad_norm": 0.584606409072876, + "learning_rate": 1.1896820113974705e-06, + "loss": 0.3166, + "step": 8530 + }, + { + "epoch": 2.3950028074115663, + "grad_norm": 0.6021196842193604, + "learning_rate": 1.188624614592146e-06, + "loss": 0.3179, + "step": 8531 + }, + { + "epoch": 2.3952835485682202, + "grad_norm": 0.5946304202079773, + "learning_rate": 1.1875676245131012e-06, + "loss": 0.3623, + "step": 8532 + }, + { + "epoch": 2.3955642897248737, + "grad_norm": 0.5681818723678589, + "learning_rate": 1.1865110412731295e-06, + "loss": 0.3432, + "step": 8533 + }, + { + "epoch": 2.395845030881527, + "grad_norm": 0.5449780225753784, + "learning_rate": 1.1854548649849874e-06, + "loss": 0.3562, + "step": 8534 + }, + { + "epoch": 2.3961257720381806, + "grad_norm": 0.6077841520309448, + "learning_rate": 1.1843990957613787e-06, + "loss": 0.3174, + "step": 8535 + }, + { + "epoch": 2.3964065131948344, + "grad_norm": 0.7257047891616821, + "learning_rate": 1.1833437337149728e-06, + "loss": 0.2879, + "step": 8536 + }, + { + "epoch": 2.396687254351488, + "grad_norm": 0.5597652196884155, + "learning_rate": 1.1822887789583875e-06, + "loss": 0.3213, + "step": 8537 + }, + { + "epoch": 2.3969679955081413, + "grad_norm": 0.5362836718559265, + "learning_rate": 1.1812342316042036e-06, + "loss": 0.3392, + "step": 8538 + }, + { + "epoch": 2.397248736664795, + "grad_norm": 0.5410301089286804, + "learning_rate": 1.180180091764956e-06, + "loss": 0.3385, + "step": 8539 + }, + { + "epoch": 2.3975294778214487, + "grad_norm": 0.608157753944397, + "learning_rate": 1.1791263595531338e-06, + "loss": 0.3491, + "step": 8540 + }, + { + "epoch": 2.397810218978102, + "grad_norm": 0.5700143575668335, + "learning_rate": 1.1780730350811876e-06, + "loss": 0.3542, + "step": 8541 + }, + { + "epoch": 2.3980909601347555, + "grad_norm": 0.6143352389335632, + "learning_rate": 1.1770201184615203e-06, + "loss": 0.2471, + "step": 8542 + }, + { + "epoch": 2.3983717012914094, + "grad_norm": 0.5531323552131653, + "learning_rate": 1.1759676098064903e-06, + "loss": 0.3531, + "step": 8543 + }, + { + "epoch": 2.398652442448063, + "grad_norm": 0.5703702569007874, + "learning_rate": 1.1749155092284192e-06, + "loss": 0.2932, + "step": 8544 + }, + { + "epoch": 2.3989331836047163, + "grad_norm": 0.5654466152191162, + "learning_rate": 1.1738638168395767e-06, + "loss": 0.3307, + "step": 8545 + }, + { + "epoch": 2.39921392476137, + "grad_norm": 0.6061022281646729, + "learning_rate": 1.1728125327521955e-06, + "loss": 0.3081, + "step": 8546 + }, + { + "epoch": 2.3994946659180236, + "grad_norm": 0.6017066836357117, + "learning_rate": 1.1717616570784612e-06, + "loss": 0.3175, + "step": 8547 + }, + { + "epoch": 2.399775407074677, + "grad_norm": 0.5499140620231628, + "learning_rate": 1.1707111899305151e-06, + "loss": 0.3067, + "step": 8548 + }, + { + "epoch": 2.4000561482313305, + "grad_norm": 0.6325089335441589, + "learning_rate": 1.1696611314204599e-06, + "loss": 0.3309, + "step": 8549 + }, + { + "epoch": 2.4003368893879844, + "grad_norm": 0.5376364588737488, + "learning_rate": 1.1686114816603477e-06, + "loss": 0.3229, + "step": 8550 + }, + { + "epoch": 2.400617630544638, + "grad_norm": 0.5829223394393921, + "learning_rate": 1.167562240762194e-06, + "loss": 0.3612, + "step": 8551 + }, + { + "epoch": 2.4008983717012913, + "grad_norm": 0.6152477264404297, + "learning_rate": 1.1665134088379643e-06, + "loss": 0.2987, + "step": 8552 + }, + { + "epoch": 2.401179112857945, + "grad_norm": 0.5731292366981506, + "learning_rate": 1.165464985999586e-06, + "loss": 0.3559, + "step": 8553 + }, + { + "epoch": 2.4014598540145986, + "grad_norm": 0.5887482762336731, + "learning_rate": 1.1644169723589389e-06, + "loss": 0.3053, + "step": 8554 + }, + { + "epoch": 2.401740595171252, + "grad_norm": 0.5587543845176697, + "learning_rate": 1.1633693680278591e-06, + "loss": 0.3231, + "step": 8555 + }, + { + "epoch": 2.4020213363279055, + "grad_norm": 0.5400908589363098, + "learning_rate": 1.1623221731181432e-06, + "loss": 0.3114, + "step": 8556 + }, + { + "epoch": 2.4023020774845594, + "grad_norm": 0.568158745765686, + "learning_rate": 1.161275387741539e-06, + "loss": 0.337, + "step": 8557 + }, + { + "epoch": 2.402582818641213, + "grad_norm": 0.539108395576477, + "learning_rate": 1.1602290120097548e-06, + "loss": 0.3102, + "step": 8558 + }, + { + "epoch": 2.4028635597978663, + "grad_norm": 0.639522135257721, + "learning_rate": 1.1591830460344528e-06, + "loss": 0.3415, + "step": 8559 + }, + { + "epoch": 2.40314430095452, + "grad_norm": 0.55396968126297, + "learning_rate": 1.1581374899272507e-06, + "loss": 0.3387, + "step": 8560 + }, + { + "epoch": 2.4034250421111736, + "grad_norm": 0.5616620182991028, + "learning_rate": 1.1570923437997255e-06, + "loss": 0.3225, + "step": 8561 + }, + { + "epoch": 2.403705783267827, + "grad_norm": 0.5308200120925903, + "learning_rate": 1.156047607763407e-06, + "loss": 0.3412, + "step": 8562 + }, + { + "epoch": 2.4039865244244805, + "grad_norm": 0.5579840540885925, + "learning_rate": 1.1550032819297835e-06, + "loss": 0.357, + "step": 8563 + }, + { + "epoch": 2.4042672655811343, + "grad_norm": 0.5448474884033203, + "learning_rate": 1.1539593664103005e-06, + "loss": 0.3172, + "step": 8564 + }, + { + "epoch": 2.404548006737788, + "grad_norm": 0.6003230810165405, + "learning_rate": 1.152915861316356e-06, + "loss": 0.3518, + "step": 8565 + }, + { + "epoch": 2.4048287478944412, + "grad_norm": 0.5988787412643433, + "learning_rate": 1.1518727667593087e-06, + "loss": 0.3165, + "step": 8566 + }, + { + "epoch": 2.405109489051095, + "grad_norm": 0.5354225635528564, + "learning_rate": 1.1508300828504682e-06, + "loss": 0.3445, + "step": 8567 + }, + { + "epoch": 2.4053902302077486, + "grad_norm": 0.5439662337303162, + "learning_rate": 1.1497878097011062e-06, + "loss": 0.324, + "step": 8568 + }, + { + "epoch": 2.405670971364402, + "grad_norm": 0.5637765526771545, + "learning_rate": 1.1487459474224467e-06, + "loss": 0.3504, + "step": 8569 + }, + { + "epoch": 2.4059517125210554, + "grad_norm": 0.5570805072784424, + "learning_rate": 1.1477044961256684e-06, + "loss": 0.3299, + "step": 8570 + }, + { + "epoch": 2.4062324536777093, + "grad_norm": 0.5669572949409485, + "learning_rate": 1.146663455921912e-06, + "loss": 0.3214, + "step": 8571 + }, + { + "epoch": 2.4065131948343628, + "grad_norm": 0.5762567520141602, + "learning_rate": 1.145622826922268e-06, + "loss": 0.3, + "step": 8572 + }, + { + "epoch": 2.406793935991016, + "grad_norm": 0.6352512836456299, + "learning_rate": 1.1445826092377888e-06, + "loss": 0.2981, + "step": 8573 + }, + { + "epoch": 2.40707467714767, + "grad_norm": 0.5779180526733398, + "learning_rate": 1.1435428029794775e-06, + "loss": 0.346, + "step": 8574 + }, + { + "epoch": 2.4073554183043235, + "grad_norm": 0.5574477910995483, + "learning_rate": 1.1425034082582959e-06, + "loss": 0.312, + "step": 8575 + }, + { + "epoch": 2.407636159460977, + "grad_norm": 0.5137155055999756, + "learning_rate": 1.1414644251851637e-06, + "loss": 0.332, + "step": 8576 + }, + { + "epoch": 2.4079169006176304, + "grad_norm": 0.5635234117507935, + "learning_rate": 1.1404258538709512e-06, + "loss": 0.3753, + "step": 8577 + }, + { + "epoch": 2.4081976417742843, + "grad_norm": 0.5221818089485168, + "learning_rate": 1.1393876944264926e-06, + "loss": 0.3065, + "step": 8578 + }, + { + "epoch": 2.4084783829309377, + "grad_norm": 0.5356088876724243, + "learning_rate": 1.1383499469625702e-06, + "loss": 0.3014, + "step": 8579 + }, + { + "epoch": 2.408759124087591, + "grad_norm": 0.5905362963676453, + "learning_rate": 1.1373126115899286e-06, + "loss": 0.3379, + "step": 8580 + }, + { + "epoch": 2.4090398652442446, + "grad_norm": 0.6346529126167297, + "learning_rate": 1.136275688419265e-06, + "loss": 0.296, + "step": 8581 + }, + { + "epoch": 2.4093206064008985, + "grad_norm": 0.5467550754547119, + "learning_rate": 1.1352391775612314e-06, + "loss": 0.3535, + "step": 8582 + }, + { + "epoch": 2.409601347557552, + "grad_norm": 0.4837294816970825, + "learning_rate": 1.1342030791264408e-06, + "loss": 0.315, + "step": 8583 + }, + { + "epoch": 2.4098820887142054, + "grad_norm": 0.6097621321678162, + "learning_rate": 1.133167393225456e-06, + "loss": 0.3558, + "step": 8584 + }, + { + "epoch": 2.410162829870859, + "grad_norm": 0.6209611296653748, + "learning_rate": 1.132132119968803e-06, + "loss": 0.3096, + "step": 8585 + }, + { + "epoch": 2.4104435710275127, + "grad_norm": 0.5919151306152344, + "learning_rate": 1.1310972594669567e-06, + "loss": 0.3734, + "step": 8586 + }, + { + "epoch": 2.410724312184166, + "grad_norm": 0.6211033463478088, + "learning_rate": 1.130062811830351e-06, + "loss": 0.3623, + "step": 8587 + }, + { + "epoch": 2.4110050533408196, + "grad_norm": 0.5057561993598938, + "learning_rate": 1.1290287771693759e-06, + "loss": 0.3433, + "step": 8588 + }, + { + "epoch": 2.4112857944974735, + "grad_norm": 0.5803185105323792, + "learning_rate": 1.127995155594378e-06, + "loss": 0.3041, + "step": 8589 + }, + { + "epoch": 2.411566535654127, + "grad_norm": 0.4840635657310486, + "learning_rate": 1.1269619472156602e-06, + "loss": 0.351, + "step": 8590 + }, + { + "epoch": 2.4118472768107804, + "grad_norm": 0.5588074922561646, + "learning_rate": 1.1259291521434785e-06, + "loss": 0.3051, + "step": 8591 + }, + { + "epoch": 2.412128017967434, + "grad_norm": 0.5140504240989685, + "learning_rate": 1.1248967704880449e-06, + "loss": 0.3652, + "step": 8592 + }, + { + "epoch": 2.4124087591240877, + "grad_norm": 0.5610706210136414, + "learning_rate": 1.1238648023595316e-06, + "loss": 0.3171, + "step": 8593 + }, + { + "epoch": 2.412689500280741, + "grad_norm": 0.5871493816375732, + "learning_rate": 1.1228332478680608e-06, + "loss": 0.3214, + "step": 8594 + }, + { + "epoch": 2.4129702414373946, + "grad_norm": 0.533639669418335, + "learning_rate": 1.1218021071237173e-06, + "loss": 0.29, + "step": 8595 + }, + { + "epoch": 2.4132509825940485, + "grad_norm": 0.5451346635818481, + "learning_rate": 1.1207713802365361e-06, + "loss": 0.3251, + "step": 8596 + }, + { + "epoch": 2.413531723750702, + "grad_norm": 0.5177164077758789, + "learning_rate": 1.1197410673165077e-06, + "loss": 0.3011, + "step": 8597 + }, + { + "epoch": 2.4138124649073553, + "grad_norm": 0.5294882655143738, + "learning_rate": 1.1187111684735853e-06, + "loss": 0.3416, + "step": 8598 + }, + { + "epoch": 2.414093206064009, + "grad_norm": 0.5082541704177856, + "learning_rate": 1.1176816838176685e-06, + "loss": 0.3313, + "step": 8599 + }, + { + "epoch": 2.4143739472206627, + "grad_norm": 0.5761443972587585, + "learning_rate": 1.1166526134586213e-06, + "loss": 0.3376, + "step": 8600 + }, + { + "epoch": 2.414654688377316, + "grad_norm": 0.5105448365211487, + "learning_rate": 1.1156239575062578e-06, + "loss": 0.3248, + "step": 8601 + }, + { + "epoch": 2.4149354295339696, + "grad_norm": 0.5691644549369812, + "learning_rate": 1.1145957160703508e-06, + "loss": 0.3394, + "step": 8602 + }, + { + "epoch": 2.4152161706906234, + "grad_norm": 0.5590173006057739, + "learning_rate": 1.1135678892606273e-06, + "loss": 0.3605, + "step": 8603 + }, + { + "epoch": 2.415496911847277, + "grad_norm": 0.5871078372001648, + "learning_rate": 1.1125404771867692e-06, + "loss": 0.2855, + "step": 8604 + }, + { + "epoch": 2.4157776530039303, + "grad_norm": 0.6195324659347534, + "learning_rate": 1.1115134799584188e-06, + "loss": 0.3279, + "step": 8605 + }, + { + "epoch": 2.4160583941605838, + "grad_norm": 0.5766201615333557, + "learning_rate": 1.1104868976851669e-06, + "loss": 0.3279, + "step": 8606 + }, + { + "epoch": 2.4163391353172377, + "grad_norm": 0.5775571465492249, + "learning_rate": 1.1094607304765676e-06, + "loss": 0.3226, + "step": 8607 + }, + { + "epoch": 2.416619876473891, + "grad_norm": 0.5180070996284485, + "learning_rate": 1.108434978442125e-06, + "loss": 0.3393, + "step": 8608 + }, + { + "epoch": 2.4169006176305445, + "grad_norm": 0.6077528595924377, + "learning_rate": 1.1074096416913005e-06, + "loss": 0.3564, + "step": 8609 + }, + { + "epoch": 2.4171813587871984, + "grad_norm": 0.5559960603713989, + "learning_rate": 1.1063847203335143e-06, + "loss": 0.3235, + "step": 8610 + }, + { + "epoch": 2.417462099943852, + "grad_norm": 0.5508143305778503, + "learning_rate": 1.105360214478136e-06, + "loss": 0.328, + "step": 8611 + }, + { + "epoch": 2.4177428411005053, + "grad_norm": 0.5017294883728027, + "learning_rate": 1.104336124234498e-06, + "loss": 0.3276, + "step": 8612 + }, + { + "epoch": 2.4180235822571587, + "grad_norm": 0.5135388374328613, + "learning_rate": 1.1033124497118825e-06, + "loss": 0.362, + "step": 8613 + }, + { + "epoch": 2.4183043234138126, + "grad_norm": 0.5576337575912476, + "learning_rate": 1.10228919101953e-06, + "loss": 0.3307, + "step": 8614 + }, + { + "epoch": 2.418585064570466, + "grad_norm": 0.6229658722877502, + "learning_rate": 1.101266348266638e-06, + "loss": 0.3165, + "step": 8615 + }, + { + "epoch": 2.4188658057271195, + "grad_norm": 0.5190553665161133, + "learning_rate": 1.100243921562355e-06, + "loss": 0.3807, + "step": 8616 + }, + { + "epoch": 2.4191465468837734, + "grad_norm": 0.5850105285644531, + "learning_rate": 1.0992219110157914e-06, + "loss": 0.3312, + "step": 8617 + }, + { + "epoch": 2.419427288040427, + "grad_norm": 0.5060490369796753, + "learning_rate": 1.0982003167360083e-06, + "loss": 0.307, + "step": 8618 + }, + { + "epoch": 2.4197080291970803, + "grad_norm": 0.5556020140647888, + "learning_rate": 1.0971791388320224e-06, + "loss": 0.3356, + "step": 8619 + }, + { + "epoch": 2.4199887703537337, + "grad_norm": 0.5170776844024658, + "learning_rate": 1.0961583774128099e-06, + "loss": 0.3296, + "step": 8620 + }, + { + "epoch": 2.4202695115103876, + "grad_norm": 0.5360212922096252, + "learning_rate": 1.095138032587298e-06, + "loss": 0.3244, + "step": 8621 + }, + { + "epoch": 2.420550252667041, + "grad_norm": 0.5871847867965698, + "learning_rate": 1.0941181044643734e-06, + "loss": 0.3181, + "step": 8622 + }, + { + "epoch": 2.4208309938236945, + "grad_norm": 0.6124431490898132, + "learning_rate": 1.0930985931528742e-06, + "loss": 0.2591, + "step": 8623 + }, + { + "epoch": 2.421111734980348, + "grad_norm": 0.5897138118743896, + "learning_rate": 1.0920794987615996e-06, + "loss": 0.3427, + "step": 8624 + }, + { + "epoch": 2.421392476137002, + "grad_norm": 0.5578790307044983, + "learning_rate": 1.091060821399299e-06, + "loss": 0.3305, + "step": 8625 + }, + { + "epoch": 2.4216732172936553, + "grad_norm": 0.5649060010910034, + "learning_rate": 1.090042561174678e-06, + "loss": 0.3414, + "step": 8626 + }, + { + "epoch": 2.4219539584503087, + "grad_norm": 0.6106213927268982, + "learning_rate": 1.0890247181964015e-06, + "loss": 0.3337, + "step": 8627 + }, + { + "epoch": 2.422234699606962, + "grad_norm": 0.6144611835479736, + "learning_rate": 1.0880072925730855e-06, + "loss": 0.2506, + "step": 8628 + }, + { + "epoch": 2.422515440763616, + "grad_norm": 0.5643687844276428, + "learning_rate": 1.0869902844133051e-06, + "loss": 0.3061, + "step": 8629 + }, + { + "epoch": 2.4227961819202695, + "grad_norm": 0.5640206336975098, + "learning_rate": 1.0859736938255882e-06, + "loss": 0.3359, + "step": 8630 + }, + { + "epoch": 2.423076923076923, + "grad_norm": 0.6235983967781067, + "learning_rate": 1.0849575209184178e-06, + "loss": 0.3169, + "step": 8631 + }, + { + "epoch": 2.423357664233577, + "grad_norm": 0.5621570348739624, + "learning_rate": 1.083941765800236e-06, + "loss": 0.3162, + "step": 8632 + }, + { + "epoch": 2.4236384053902302, + "grad_norm": 0.585750162601471, + "learning_rate": 1.0829264285794349e-06, + "loss": 0.2987, + "step": 8633 + }, + { + "epoch": 2.4239191465468837, + "grad_norm": 0.5673215389251709, + "learning_rate": 1.081911509364369e-06, + "loss": 0.3337, + "step": 8634 + }, + { + "epoch": 2.424199887703537, + "grad_norm": 0.5640286803245544, + "learning_rate": 1.0808970082633396e-06, + "loss": 0.3602, + "step": 8635 + }, + { + "epoch": 2.424480628860191, + "grad_norm": 0.5741168856620789, + "learning_rate": 1.0798829253846116e-06, + "loss": 0.3386, + "step": 8636 + }, + { + "epoch": 2.4247613700168444, + "grad_norm": 0.5366054177284241, + "learning_rate": 1.0788692608364004e-06, + "loss": 0.294, + "step": 8637 + }, + { + "epoch": 2.425042111173498, + "grad_norm": 0.5743140578269958, + "learning_rate": 1.0778560147268752e-06, + "loss": 0.3018, + "step": 8638 + }, + { + "epoch": 2.4253228523301518, + "grad_norm": 0.5586783289909363, + "learning_rate": 1.0768431871641682e-06, + "loss": 0.3436, + "step": 8639 + }, + { + "epoch": 2.425603593486805, + "grad_norm": 0.6264934539794922, + "learning_rate": 1.0758307782563604e-06, + "loss": 0.3589, + "step": 8640 + }, + { + "epoch": 2.4258843346434587, + "grad_norm": 0.5324342250823975, + "learning_rate": 1.0748187881114874e-06, + "loss": 0.3215, + "step": 8641 + }, + { + "epoch": 2.426165075800112, + "grad_norm": 0.5290731191635132, + "learning_rate": 1.0738072168375452e-06, + "loss": 0.3394, + "step": 8642 + }, + { + "epoch": 2.426445816956766, + "grad_norm": 0.5561985969543457, + "learning_rate": 1.0727960645424806e-06, + "loss": 0.315, + "step": 8643 + }, + { + "epoch": 2.4267265581134194, + "grad_norm": 0.5355144143104553, + "learning_rate": 1.0717853313341997e-06, + "loss": 0.3503, + "step": 8644 + }, + { + "epoch": 2.427007299270073, + "grad_norm": 0.5117299556732178, + "learning_rate": 1.07077501732056e-06, + "loss": 0.3726, + "step": 8645 + }, + { + "epoch": 2.4272880404267267, + "grad_norm": 0.5098596215248108, + "learning_rate": 1.0697651226093752e-06, + "loss": 0.3065, + "step": 8646 + }, + { + "epoch": 2.42756878158338, + "grad_norm": 0.5706070065498352, + "learning_rate": 1.0687556473084172e-06, + "loss": 0.2954, + "step": 8647 + }, + { + "epoch": 2.4278495227400336, + "grad_norm": 0.5685199499130249, + "learning_rate": 1.067746591525408e-06, + "loss": 0.339, + "step": 8648 + }, + { + "epoch": 2.428130263896687, + "grad_norm": 0.5293835997581482, + "learning_rate": 1.0667379553680312e-06, + "loss": 0.3147, + "step": 8649 + }, + { + "epoch": 2.428411005053341, + "grad_norm": 0.565785825252533, + "learning_rate": 1.0657297389439192e-06, + "loss": 0.3235, + "step": 8650 + }, + { + "epoch": 2.4286917462099944, + "grad_norm": 0.5743482112884521, + "learning_rate": 1.0647219423606653e-06, + "loss": 0.339, + "step": 8651 + }, + { + "epoch": 2.428972487366648, + "grad_norm": 0.5861659646034241, + "learning_rate": 1.0637145657258135e-06, + "loss": 0.313, + "step": 8652 + }, + { + "epoch": 2.4292532285233017, + "grad_norm": 0.6089012622833252, + "learning_rate": 1.062707609146864e-06, + "loss": 0.3356, + "step": 8653 + }, + { + "epoch": 2.429533969679955, + "grad_norm": 0.5933595895767212, + "learning_rate": 1.0617010727312755e-06, + "loss": 0.3475, + "step": 8654 + }, + { + "epoch": 2.4298147108366086, + "grad_norm": 0.4917030930519104, + "learning_rate": 1.060694956586456e-06, + "loss": 0.3547, + "step": 8655 + }, + { + "epoch": 2.430095451993262, + "grad_norm": 0.5100833177566528, + "learning_rate": 1.0596892608197756e-06, + "loss": 0.3571, + "step": 8656 + }, + { + "epoch": 2.430376193149916, + "grad_norm": 0.6266801357269287, + "learning_rate": 1.0586839855385539e-06, + "loss": 0.3453, + "step": 8657 + }, + { + "epoch": 2.4306569343065694, + "grad_norm": 0.5679176449775696, + "learning_rate": 1.0576791308500661e-06, + "loss": 0.3349, + "step": 8658 + }, + { + "epoch": 2.430937675463223, + "grad_norm": 0.5580161809921265, + "learning_rate": 1.0566746968615476e-06, + "loss": 0.3264, + "step": 8659 + }, + { + "epoch": 2.4312184166198767, + "grad_norm": 0.5244404673576355, + "learning_rate": 1.0556706836801822e-06, + "loss": 0.3378, + "step": 8660 + }, + { + "epoch": 2.43149915777653, + "grad_norm": 0.582090437412262, + "learning_rate": 1.0546670914131147e-06, + "loss": 0.2973, + "step": 8661 + }, + { + "epoch": 2.4317798989331836, + "grad_norm": 0.5382874608039856, + "learning_rate": 1.0536639201674393e-06, + "loss": 0.3411, + "step": 8662 + }, + { + "epoch": 2.432060640089837, + "grad_norm": 0.5697934031486511, + "learning_rate": 1.052661170050211e-06, + "loss": 0.3323, + "step": 8663 + }, + { + "epoch": 2.432341381246491, + "grad_norm": 0.5195464491844177, + "learning_rate": 1.0516588411684347e-06, + "loss": 0.2925, + "step": 8664 + }, + { + "epoch": 2.4326221224031443, + "grad_norm": 0.5455653667449951, + "learning_rate": 1.0506569336290735e-06, + "loss": 0.3294, + "step": 8665 + }, + { + "epoch": 2.432902863559798, + "grad_norm": 0.48982128500938416, + "learning_rate": 1.0496554475390464e-06, + "loss": 0.3288, + "step": 8666 + }, + { + "epoch": 2.4331836047164517, + "grad_norm": 0.5944242477416992, + "learning_rate": 1.0486543830052243e-06, + "loss": 0.3009, + "step": 8667 + }, + { + "epoch": 2.433464345873105, + "grad_norm": 0.5586980581283569, + "learning_rate": 1.0476537401344333e-06, + "loss": 0.2843, + "step": 8668 + }, + { + "epoch": 2.4337450870297586, + "grad_norm": 0.5792757272720337, + "learning_rate": 1.0466535190334588e-06, + "loss": 0.3174, + "step": 8669 + }, + { + "epoch": 2.434025828186412, + "grad_norm": 0.5918002128601074, + "learning_rate": 1.0456537198090343e-06, + "loss": 0.3126, + "step": 8670 + }, + { + "epoch": 2.434306569343066, + "grad_norm": 0.5627588629722595, + "learning_rate": 1.0446543425678563e-06, + "loss": 0.3387, + "step": 8671 + }, + { + "epoch": 2.4345873104997193, + "grad_norm": 0.5633062720298767, + "learning_rate": 1.043655387416569e-06, + "loss": 0.3255, + "step": 8672 + }, + { + "epoch": 2.4348680516563728, + "grad_norm": 0.6193362474441528, + "learning_rate": 1.0426568544617771e-06, + "loss": 0.3331, + "step": 8673 + }, + { + "epoch": 2.435148792813026, + "grad_norm": 0.6241822242736816, + "learning_rate": 1.0416587438100367e-06, + "loss": 0.3104, + "step": 8674 + }, + { + "epoch": 2.43542953396968, + "grad_norm": 0.5509600639343262, + "learning_rate": 1.0406610555678582e-06, + "loss": 0.3432, + "step": 8675 + }, + { + "epoch": 2.4357102751263335, + "grad_norm": 0.5371003746986389, + "learning_rate": 1.0396637898417117e-06, + "loss": 0.3349, + "step": 8676 + }, + { + "epoch": 2.435991016282987, + "grad_norm": 0.5923648476600647, + "learning_rate": 1.0386669467380167e-06, + "loss": 0.3058, + "step": 8677 + }, + { + "epoch": 2.4362717574396404, + "grad_norm": 0.5829152464866638, + "learning_rate": 1.0376705263631525e-06, + "loss": 0.3529, + "step": 8678 + }, + { + "epoch": 2.4365524985962943, + "grad_norm": 0.5810061693191528, + "learning_rate": 1.0366745288234497e-06, + "loss": 0.2951, + "step": 8679 + }, + { + "epoch": 2.4368332397529477, + "grad_norm": 0.5497332811355591, + "learning_rate": 1.0356789542251939e-06, + "loss": 0.3297, + "step": 8680 + }, + { + "epoch": 2.437113980909601, + "grad_norm": 0.5494989156723022, + "learning_rate": 1.034683802674628e-06, + "loss": 0.3442, + "step": 8681 + }, + { + "epoch": 2.437394722066255, + "grad_norm": 0.5286225080490112, + "learning_rate": 1.0336890742779476e-06, + "loss": 0.3357, + "step": 8682 + }, + { + "epoch": 2.4376754632229085, + "grad_norm": 0.52973473072052, + "learning_rate": 1.0326947691413053e-06, + "loss": 0.3748, + "step": 8683 + }, + { + "epoch": 2.437956204379562, + "grad_norm": 0.5699828863143921, + "learning_rate": 1.0317008873708045e-06, + "loss": 0.3565, + "step": 8684 + }, + { + "epoch": 2.4382369455362154, + "grad_norm": 0.5908522009849548, + "learning_rate": 1.03070742907251e-06, + "loss": 0.3268, + "step": 8685 + }, + { + "epoch": 2.4385176866928693, + "grad_norm": 0.5856167674064636, + "learning_rate": 1.0297143943524345e-06, + "loss": 0.3308, + "step": 8686 + }, + { + "epoch": 2.4387984278495227, + "grad_norm": 0.5940700173377991, + "learning_rate": 1.028721783316548e-06, + "loss": 0.2994, + "step": 8687 + }, + { + "epoch": 2.439079169006176, + "grad_norm": 0.5850594639778137, + "learning_rate": 1.0277295960707788e-06, + "loss": 0.329, + "step": 8688 + }, + { + "epoch": 2.43935991016283, + "grad_norm": 0.5705891847610474, + "learning_rate": 1.0267378327210036e-06, + "loss": 0.3335, + "step": 8689 + }, + { + "epoch": 2.4396406513194835, + "grad_norm": 0.5475537180900574, + "learning_rate": 1.0257464933730587e-06, + "loss": 0.3237, + "step": 8690 + }, + { + "epoch": 2.439921392476137, + "grad_norm": 0.5784679651260376, + "learning_rate": 1.024755578132735e-06, + "loss": 0.3418, + "step": 8691 + }, + { + "epoch": 2.4402021336327904, + "grad_norm": 0.518255889415741, + "learning_rate": 1.0237650871057746e-06, + "loss": 0.3549, + "step": 8692 + }, + { + "epoch": 2.4404828747894443, + "grad_norm": 0.5716167688369751, + "learning_rate": 1.0227750203978787e-06, + "loss": 0.3188, + "step": 8693 + }, + { + "epoch": 2.4407636159460977, + "grad_norm": 0.5474283695220947, + "learning_rate": 1.0217853781147003e-06, + "loss": 0.307, + "step": 8694 + }, + { + "epoch": 2.441044357102751, + "grad_norm": 0.5570739507675171, + "learning_rate": 1.0207961603618456e-06, + "loss": 0.3073, + "step": 8695 + }, + { + "epoch": 2.441325098259405, + "grad_norm": 0.5603153109550476, + "learning_rate": 1.0198073672448811e-06, + "loss": 0.29, + "step": 8696 + }, + { + "epoch": 2.4416058394160585, + "grad_norm": 0.5592935681343079, + "learning_rate": 1.0188189988693215e-06, + "loss": 0.3273, + "step": 8697 + }, + { + "epoch": 2.441886580572712, + "grad_norm": 0.6249144077301025, + "learning_rate": 1.0178310553406429e-06, + "loss": 0.3274, + "step": 8698 + }, + { + "epoch": 2.4421673217293653, + "grad_norm": 0.5414302349090576, + "learning_rate": 1.0168435367642693e-06, + "loss": 0.3427, + "step": 8699 + }, + { + "epoch": 2.4424480628860192, + "grad_norm": 0.512814998626709, + "learning_rate": 1.0158564432455847e-06, + "loss": 0.3502, + "step": 8700 + }, + { + "epoch": 2.4427288040426727, + "grad_norm": 0.5602161288261414, + "learning_rate": 1.014869774889925e-06, + "loss": 0.2995, + "step": 8701 + }, + { + "epoch": 2.443009545199326, + "grad_norm": 0.5614134073257446, + "learning_rate": 1.0138835318025796e-06, + "loss": 0.3221, + "step": 8702 + }, + { + "epoch": 2.44329028635598, + "grad_norm": 0.5190013647079468, + "learning_rate": 1.0128977140887968e-06, + "loss": 0.3099, + "step": 8703 + }, + { + "epoch": 2.4435710275126334, + "grad_norm": 0.5597667694091797, + "learning_rate": 1.0119123218537752e-06, + "loss": 0.3066, + "step": 8704 + }, + { + "epoch": 2.443851768669287, + "grad_norm": 0.5079295039176941, + "learning_rate": 1.010927355202671e-06, + "loss": 0.3045, + "step": 8705 + }, + { + "epoch": 2.4441325098259403, + "grad_norm": 0.597145140171051, + "learning_rate": 1.009942814240593e-06, + "loss": 0.3087, + "step": 8706 + }, + { + "epoch": 2.444413250982594, + "grad_norm": 0.5442847609519958, + "learning_rate": 1.0089586990726047e-06, + "loss": 0.3124, + "step": 8707 + }, + { + "epoch": 2.4446939921392477, + "grad_norm": 0.5378947854042053, + "learning_rate": 1.007975009803726e-06, + "loss": 0.2801, + "step": 8708 + }, + { + "epoch": 2.444974733295901, + "grad_norm": 0.609731137752533, + "learning_rate": 1.0069917465389285e-06, + "loss": 0.3478, + "step": 8709 + }, + { + "epoch": 2.445255474452555, + "grad_norm": 0.5179837346076965, + "learning_rate": 1.006008909383142e-06, + "loss": 0.3424, + "step": 8710 + }, + { + "epoch": 2.4455362156092084, + "grad_norm": 0.5271161794662476, + "learning_rate": 1.0050264984412467e-06, + "loss": 0.3298, + "step": 8711 + }, + { + "epoch": 2.445816956765862, + "grad_norm": 0.555071234703064, + "learning_rate": 1.0040445138180816e-06, + "loss": 0.3291, + "step": 8712 + }, + { + "epoch": 2.4460976979225153, + "grad_norm": 0.5495772957801819, + "learning_rate": 1.0030629556184367e-06, + "loss": 0.3501, + "step": 8713 + }, + { + "epoch": 2.446378439079169, + "grad_norm": 0.5728296637535095, + "learning_rate": 1.002081823947057e-06, + "loss": 0.2925, + "step": 8714 + }, + { + "epoch": 2.4466591802358226, + "grad_norm": 0.572092592716217, + "learning_rate": 1.0011011189086428e-06, + "loss": 0.2899, + "step": 8715 + }, + { + "epoch": 2.446939921392476, + "grad_norm": 0.6001694202423096, + "learning_rate": 1.0001208406078516e-06, + "loss": 0.2697, + "step": 8716 + }, + { + "epoch": 2.4472206625491295, + "grad_norm": 0.5329112410545349, + "learning_rate": 9.991409891492892e-07, + "loss": 0.3375, + "step": 8717 + }, + { + "epoch": 2.4475014037057834, + "grad_norm": 0.5405532121658325, + "learning_rate": 9.98161564637522e-07, + "loss": 0.3606, + "step": 8718 + }, + { + "epoch": 2.447782144862437, + "grad_norm": 0.630767285823822, + "learning_rate": 9.971825671770658e-07, + "loss": 0.3045, + "step": 8719 + }, + { + "epoch": 2.4480628860190903, + "grad_norm": 0.6453242897987366, + "learning_rate": 9.962039968723952e-07, + "loss": 0.2963, + "step": 8720 + }, + { + "epoch": 2.4483436271757437, + "grad_norm": 0.5536431074142456, + "learning_rate": 9.952258538279348e-07, + "loss": 0.3714, + "step": 8721 + }, + { + "epoch": 2.4486243683323976, + "grad_norm": 0.5230050086975098, + "learning_rate": 9.942481381480684e-07, + "loss": 0.341, + "step": 8722 + }, + { + "epoch": 2.448905109489051, + "grad_norm": 0.5292918086051941, + "learning_rate": 9.932708499371307e-07, + "loss": 0.3405, + "step": 8723 + }, + { + "epoch": 2.4491858506457045, + "grad_norm": 0.5841137766838074, + "learning_rate": 9.922939892994104e-07, + "loss": 0.2919, + "step": 8724 + }, + { + "epoch": 2.4494665918023584, + "grad_norm": 0.5978755354881287, + "learning_rate": 9.913175563391546e-07, + "loss": 0.3034, + "step": 8725 + }, + { + "epoch": 2.449747332959012, + "grad_norm": 0.5780662894248962, + "learning_rate": 9.903415511605597e-07, + "loss": 0.3214, + "step": 8726 + }, + { + "epoch": 2.4500280741156653, + "grad_norm": 0.5812617540359497, + "learning_rate": 9.893659738677807e-07, + "loss": 0.3084, + "step": 8727 + }, + { + "epoch": 2.4503088152723187, + "grad_norm": 0.557697594165802, + "learning_rate": 9.883908245649253e-07, + "loss": 0.3399, + "step": 8728 + }, + { + "epoch": 2.4505895564289726, + "grad_norm": 0.543389081954956, + "learning_rate": 9.874161033560532e-07, + "loss": 0.3696, + "step": 8729 + }, + { + "epoch": 2.450870297585626, + "grad_norm": 0.517784595489502, + "learning_rate": 9.86441810345183e-07, + "loss": 0.359, + "step": 8730 + }, + { + "epoch": 2.4511510387422795, + "grad_norm": 0.6222788691520691, + "learning_rate": 9.854679456362825e-07, + "loss": 0.2722, + "step": 8731 + }, + { + "epoch": 2.4514317798989333, + "grad_norm": 0.5975311994552612, + "learning_rate": 9.844945093332797e-07, + "loss": 0.3275, + "step": 8732 + }, + { + "epoch": 2.451712521055587, + "grad_norm": 0.5764299035072327, + "learning_rate": 9.835215015400506e-07, + "loss": 0.2994, + "step": 8733 + }, + { + "epoch": 2.4519932622122402, + "grad_norm": 0.5655746459960938, + "learning_rate": 9.825489223604313e-07, + "loss": 0.382, + "step": 8734 + }, + { + "epoch": 2.4522740033688937, + "grad_norm": 0.5662592649459839, + "learning_rate": 9.815767718982083e-07, + "loss": 0.3239, + "step": 8735 + }, + { + "epoch": 2.4525547445255476, + "grad_norm": 0.5751045942306519, + "learning_rate": 9.806050502571224e-07, + "loss": 0.3109, + "step": 8736 + }, + { + "epoch": 2.452835485682201, + "grad_norm": 0.5780230760574341, + "learning_rate": 9.79633757540871e-07, + "loss": 0.373, + "step": 8737 + }, + { + "epoch": 2.4531162268388544, + "grad_norm": 0.6236237287521362, + "learning_rate": 9.786628938531034e-07, + "loss": 0.3232, + "step": 8738 + }, + { + "epoch": 2.4533969679955083, + "grad_norm": 0.6261851191520691, + "learning_rate": 9.776924592974257e-07, + "loss": 0.2989, + "step": 8739 + }, + { + "epoch": 2.4536777091521618, + "grad_norm": 0.5853842496871948, + "learning_rate": 9.767224539773944e-07, + "loss": 0.3541, + "step": 8740 + }, + { + "epoch": 2.453958450308815, + "grad_norm": 0.5706650614738464, + "learning_rate": 9.757528779965236e-07, + "loss": 0.3027, + "step": 8741 + }, + { + "epoch": 2.4542391914654687, + "grad_norm": 0.5657103061676025, + "learning_rate": 9.74783731458282e-07, + "loss": 0.3169, + "step": 8742 + }, + { + "epoch": 2.4545199326221225, + "grad_norm": 0.5295966267585754, + "learning_rate": 9.738150144660879e-07, + "loss": 0.3372, + "step": 8743 + }, + { + "epoch": 2.454800673778776, + "grad_norm": 0.5562377572059631, + "learning_rate": 9.7284672712332e-07, + "loss": 0.3145, + "step": 8744 + }, + { + "epoch": 2.4550814149354294, + "grad_norm": 0.6606711745262146, + "learning_rate": 9.71878869533306e-07, + "loss": 0.321, + "step": 8745 + }, + { + "epoch": 2.4553621560920833, + "grad_norm": 0.5965161919593811, + "learning_rate": 9.709114417993283e-07, + "loss": 0.3008, + "step": 8746 + }, + { + "epoch": 2.4556428972487367, + "grad_norm": 0.5543991923332214, + "learning_rate": 9.699444440246276e-07, + "loss": 0.3368, + "step": 8747 + }, + { + "epoch": 2.45592363840539, + "grad_norm": 0.5702335834503174, + "learning_rate": 9.689778763123935e-07, + "loss": 0.3354, + "step": 8748 + }, + { + "epoch": 2.4562043795620436, + "grad_norm": 0.5386536121368408, + "learning_rate": 9.680117387657744e-07, + "loss": 0.3613, + "step": 8749 + }, + { + "epoch": 2.4564851207186975, + "grad_norm": 0.5534363985061646, + "learning_rate": 9.670460314878694e-07, + "loss": 0.3161, + "step": 8750 + }, + { + "epoch": 2.456765861875351, + "grad_norm": 0.5873655080795288, + "learning_rate": 9.660807545817308e-07, + "loss": 0.2932, + "step": 8751 + }, + { + "epoch": 2.4570466030320044, + "grad_norm": 0.5368070602416992, + "learning_rate": 9.651159081503704e-07, + "loss": 0.3087, + "step": 8752 + }, + { + "epoch": 2.4573273441886583, + "grad_norm": 0.5483406186103821, + "learning_rate": 9.641514922967476e-07, + "loss": 0.3239, + "step": 8753 + }, + { + "epoch": 2.4576080853453117, + "grad_norm": 0.5662078261375427, + "learning_rate": 9.631875071237816e-07, + "loss": 0.3549, + "step": 8754 + }, + { + "epoch": 2.457888826501965, + "grad_norm": 0.539288341999054, + "learning_rate": 9.622239527343409e-07, + "loss": 0.3218, + "step": 8755 + }, + { + "epoch": 2.4581695676586186, + "grad_norm": 0.5616505742073059, + "learning_rate": 9.61260829231251e-07, + "loss": 0.3092, + "step": 8756 + }, + { + "epoch": 2.4584503088152725, + "grad_norm": 0.5700971484184265, + "learning_rate": 9.60298136717291e-07, + "loss": 0.3107, + "step": 8757 + }, + { + "epoch": 2.458731049971926, + "grad_norm": 0.6075683832168579, + "learning_rate": 9.593358752951904e-07, + "loss": 0.2727, + "step": 8758 + }, + { + "epoch": 2.4590117911285794, + "grad_norm": 0.5623923540115356, + "learning_rate": 9.583740450676398e-07, + "loss": 0.3617, + "step": 8759 + }, + { + "epoch": 2.4592925322852333, + "grad_norm": 0.5814056992530823, + "learning_rate": 9.574126461372762e-07, + "loss": 0.3344, + "step": 8760 + }, + { + "epoch": 2.4595732734418867, + "grad_norm": 0.5781075358390808, + "learning_rate": 9.56451678606698e-07, + "loss": 0.3265, + "step": 8761 + }, + { + "epoch": 2.45985401459854, + "grad_norm": 0.553530752658844, + "learning_rate": 9.554911425784507e-07, + "loss": 0.3337, + "step": 8762 + }, + { + "epoch": 2.4601347557551936, + "grad_norm": 0.5906181931495667, + "learning_rate": 9.545310381550365e-07, + "loss": 0.3293, + "step": 8763 + }, + { + "epoch": 2.4604154969118475, + "grad_norm": 0.5802428126335144, + "learning_rate": 9.535713654389139e-07, + "loss": 0.3848, + "step": 8764 + }, + { + "epoch": 2.460696238068501, + "grad_norm": 0.5183466672897339, + "learning_rate": 9.52612124532491e-07, + "loss": 0.313, + "step": 8765 + }, + { + "epoch": 2.4609769792251543, + "grad_norm": 0.5322921276092529, + "learning_rate": 9.51653315538133e-07, + "loss": 0.3588, + "step": 8766 + }, + { + "epoch": 2.461257720381808, + "grad_norm": 0.48973557353019714, + "learning_rate": 9.506949385581593e-07, + "loss": 0.3562, + "step": 8767 + }, + { + "epoch": 2.4615384615384617, + "grad_norm": 0.6054965853691101, + "learning_rate": 9.497369936948397e-07, + "loss": 0.3743, + "step": 8768 + }, + { + "epoch": 2.461819202695115, + "grad_norm": 0.5307208299636841, + "learning_rate": 9.487794810504025e-07, + "loss": 0.2946, + "step": 8769 + }, + { + "epoch": 2.4620999438517686, + "grad_norm": 0.5321015119552612, + "learning_rate": 9.478224007270253e-07, + "loss": 0.3368, + "step": 8770 + }, + { + "epoch": 2.462380685008422, + "grad_norm": 0.5543622970581055, + "learning_rate": 9.468657528268432e-07, + "loss": 0.3846, + "step": 8771 + }, + { + "epoch": 2.462661426165076, + "grad_norm": 0.5620099306106567, + "learning_rate": 9.45909537451944e-07, + "loss": 0.3575, + "step": 8772 + }, + { + "epoch": 2.4629421673217293, + "grad_norm": 0.5877784490585327, + "learning_rate": 9.449537547043664e-07, + "loss": 0.3329, + "step": 8773 + }, + { + "epoch": 2.4632229084783828, + "grad_norm": 0.5643718242645264, + "learning_rate": 9.439984046861089e-07, + "loss": 0.3624, + "step": 8774 + }, + { + "epoch": 2.4635036496350367, + "grad_norm": 0.5258351564407349, + "learning_rate": 9.430434874991173e-07, + "loss": 0.3522, + "step": 8775 + }, + { + "epoch": 2.46378439079169, + "grad_norm": 0.5531173348426819, + "learning_rate": 9.420890032452984e-07, + "loss": 0.2925, + "step": 8776 + }, + { + "epoch": 2.4640651319483435, + "grad_norm": 0.5484004616737366, + "learning_rate": 9.411349520265056e-07, + "loss": 0.3327, + "step": 8777 + }, + { + "epoch": 2.464345873104997, + "grad_norm": 0.5880656242370605, + "learning_rate": 9.401813339445498e-07, + "loss": 0.2846, + "step": 8778 + }, + { + "epoch": 2.464626614261651, + "grad_norm": 0.5654290914535522, + "learning_rate": 9.392281491011967e-07, + "loss": 0.3181, + "step": 8779 + }, + { + "epoch": 2.4649073554183043, + "grad_norm": 0.5417488813400269, + "learning_rate": 9.382753975981617e-07, + "loss": 0.3467, + "step": 8780 + }, + { + "epoch": 2.4651880965749577, + "grad_norm": 0.5313130021095276, + "learning_rate": 9.373230795371197e-07, + "loss": 0.3315, + "step": 8781 + }, + { + "epoch": 2.4654688377316116, + "grad_norm": 0.5896884202957153, + "learning_rate": 9.363711950196929e-07, + "loss": 0.2942, + "step": 8782 + }, + { + "epoch": 2.465749578888265, + "grad_norm": 0.5709277391433716, + "learning_rate": 9.354197441474638e-07, + "loss": 0.2942, + "step": 8783 + }, + { + "epoch": 2.4660303200449185, + "grad_norm": 0.5979763269424438, + "learning_rate": 9.344687270219632e-07, + "loss": 0.2903, + "step": 8784 + }, + { + "epoch": 2.466311061201572, + "grad_norm": 0.5077071189880371, + "learning_rate": 9.335181437446772e-07, + "loss": 0.3625, + "step": 8785 + }, + { + "epoch": 2.466591802358226, + "grad_norm": 0.6212291121482849, + "learning_rate": 9.325679944170484e-07, + "loss": 0.3066, + "step": 8786 + }, + { + "epoch": 2.4668725435148793, + "grad_norm": 0.5817729234695435, + "learning_rate": 9.316182791404682e-07, + "loss": 0.3081, + "step": 8787 + }, + { + "epoch": 2.4671532846715327, + "grad_norm": 0.5042723417282104, + "learning_rate": 9.30668998016287e-07, + "loss": 0.3098, + "step": 8788 + }, + { + "epoch": 2.4674340258281866, + "grad_norm": 0.6605865359306335, + "learning_rate": 9.297201511458049e-07, + "loss": 0.363, + "step": 8789 + }, + { + "epoch": 2.46771476698484, + "grad_norm": 0.5550678968429565, + "learning_rate": 9.287717386302753e-07, + "loss": 0.3285, + "step": 8790 + }, + { + "epoch": 2.4679955081414935, + "grad_norm": 0.5506582260131836, + "learning_rate": 9.278237605709089e-07, + "loss": 0.3299, + "step": 8791 + }, + { + "epoch": 2.468276249298147, + "grad_norm": 0.5243185758590698, + "learning_rate": 9.268762170688673e-07, + "loss": 0.3295, + "step": 8792 + }, + { + "epoch": 2.468556990454801, + "grad_norm": 0.5579249858856201, + "learning_rate": 9.25929108225268e-07, + "loss": 0.3221, + "step": 8793 + }, + { + "epoch": 2.4688377316114543, + "grad_norm": 0.5396980047225952, + "learning_rate": 9.249824341411795e-07, + "loss": 0.3491, + "step": 8794 + }, + { + "epoch": 2.4691184727681077, + "grad_norm": 0.5205463767051697, + "learning_rate": 9.240361949176229e-07, + "loss": 0.3462, + "step": 8795 + }, + { + "epoch": 2.4693992139247616, + "grad_norm": 0.5658536553382874, + "learning_rate": 9.230903906555788e-07, + "loss": 0.3009, + "step": 8796 + }, + { + "epoch": 2.469679955081415, + "grad_norm": 0.5527428388595581, + "learning_rate": 9.221450214559741e-07, + "loss": 0.3384, + "step": 8797 + }, + { + "epoch": 2.4699606962380685, + "grad_norm": 0.5233757495880127, + "learning_rate": 9.212000874196953e-07, + "loss": 0.4131, + "step": 8798 + }, + { + "epoch": 2.470241437394722, + "grad_norm": 0.5014605522155762, + "learning_rate": 9.202555886475789e-07, + "loss": 0.3034, + "step": 8799 + }, + { + "epoch": 2.470522178551376, + "grad_norm": 0.5935212969779968, + "learning_rate": 9.193115252404144e-07, + "loss": 0.3101, + "step": 8800 + }, + { + "epoch": 2.4708029197080292, + "grad_norm": 0.547580897808075, + "learning_rate": 9.18367897298949e-07, + "loss": 0.3134, + "step": 8801 + }, + { + "epoch": 2.4710836608646827, + "grad_norm": 0.5496137142181396, + "learning_rate": 9.174247049238788e-07, + "loss": 0.3206, + "step": 8802 + }, + { + "epoch": 2.4713644020213366, + "grad_norm": 0.5276779532432556, + "learning_rate": 9.164819482158571e-07, + "loss": 0.3357, + "step": 8803 + }, + { + "epoch": 2.47164514317799, + "grad_norm": 0.6099643111228943, + "learning_rate": 9.155396272754874e-07, + "loss": 0.3313, + "step": 8804 + }, + { + "epoch": 2.4719258843346434, + "grad_norm": 0.6051143407821655, + "learning_rate": 9.145977422033298e-07, + "loss": 0.3285, + "step": 8805 + }, + { + "epoch": 2.472206625491297, + "grad_norm": 0.5772519111633301, + "learning_rate": 9.13656293099896e-07, + "loss": 0.3106, + "step": 8806 + }, + { + "epoch": 2.4724873666479508, + "grad_norm": 0.5344874262809753, + "learning_rate": 9.127152800656497e-07, + "loss": 0.369, + "step": 8807 + }, + { + "epoch": 2.472768107804604, + "grad_norm": 0.5589173436164856, + "learning_rate": 9.117747032010132e-07, + "loss": 0.3101, + "step": 8808 + }, + { + "epoch": 2.4730488489612577, + "grad_norm": 0.5959139466285706, + "learning_rate": 9.108345626063559e-07, + "loss": 0.3601, + "step": 8809 + }, + { + "epoch": 2.473329590117911, + "grad_norm": 0.5017551183700562, + "learning_rate": 9.098948583820066e-07, + "loss": 0.3705, + "step": 8810 + }, + { + "epoch": 2.473610331274565, + "grad_norm": 0.526187002658844, + "learning_rate": 9.089555906282438e-07, + "loss": 0.3363, + "step": 8811 + }, + { + "epoch": 2.4738910724312184, + "grad_norm": 0.6104418635368347, + "learning_rate": 9.080167594452977e-07, + "loss": 0.3358, + "step": 8812 + }, + { + "epoch": 2.474171813587872, + "grad_norm": 0.5797775387763977, + "learning_rate": 9.070783649333587e-07, + "loss": 0.332, + "step": 8813 + }, + { + "epoch": 2.4744525547445253, + "grad_norm": 0.5935958027839661, + "learning_rate": 9.061404071925628e-07, + "loss": 0.3541, + "step": 8814 + }, + { + "epoch": 2.474733295901179, + "grad_norm": 0.5519028306007385, + "learning_rate": 9.052028863230056e-07, + "loss": 0.3449, + "step": 8815 + }, + { + "epoch": 2.4750140370578326, + "grad_norm": 0.5282933712005615, + "learning_rate": 9.042658024247314e-07, + "loss": 0.3393, + "step": 8816 + }, + { + "epoch": 2.475294778214486, + "grad_norm": 0.5654563903808594, + "learning_rate": 9.033291555977414e-07, + "loss": 0.2902, + "step": 8817 + }, + { + "epoch": 2.47557551937114, + "grad_norm": 0.556206464767456, + "learning_rate": 9.023929459419894e-07, + "loss": 0.3482, + "step": 8818 + }, + { + "epoch": 2.4758562605277934, + "grad_norm": 0.6563774347305298, + "learning_rate": 9.014571735573796e-07, + "loss": 0.2735, + "step": 8819 + }, + { + "epoch": 2.476137001684447, + "grad_norm": 0.5940157175064087, + "learning_rate": 9.005218385437742e-07, + "loss": 0.3285, + "step": 8820 + }, + { + "epoch": 2.4764177428411003, + "grad_norm": 0.557159423828125, + "learning_rate": 8.995869410009855e-07, + "loss": 0.3247, + "step": 8821 + }, + { + "epoch": 2.476698483997754, + "grad_norm": 0.4959602355957031, + "learning_rate": 8.986524810287783e-07, + "loss": 0.3567, + "step": 8822 + }, + { + "epoch": 2.4769792251544076, + "grad_norm": 0.5449239611625671, + "learning_rate": 8.97718458726875e-07, + "loss": 0.3645, + "step": 8823 + }, + { + "epoch": 2.477259966311061, + "grad_norm": 0.5151669383049011, + "learning_rate": 8.967848741949464e-07, + "loss": 0.3351, + "step": 8824 + }, + { + "epoch": 2.477540707467715, + "grad_norm": 0.5070462822914124, + "learning_rate": 8.958517275326207e-07, + "loss": 0.3934, + "step": 8825 + }, + { + "epoch": 2.4778214486243684, + "grad_norm": 0.5659955143928528, + "learning_rate": 8.949190188394768e-07, + "loss": 0.3105, + "step": 8826 + }, + { + "epoch": 2.478102189781022, + "grad_norm": 0.6299929022789001, + "learning_rate": 8.93986748215046e-07, + "loss": 0.3294, + "step": 8827 + }, + { + "epoch": 2.4783829309376753, + "grad_norm": 0.5061848759651184, + "learning_rate": 8.93054915758817e-07, + "loss": 0.3352, + "step": 8828 + }, + { + "epoch": 2.478663672094329, + "grad_norm": 0.5764517784118652, + "learning_rate": 8.921235215702268e-07, + "loss": 0.3123, + "step": 8829 + }, + { + "epoch": 2.4789444132509826, + "grad_norm": 0.5483461618423462, + "learning_rate": 8.911925657486709e-07, + "loss": 0.3129, + "step": 8830 + }, + { + "epoch": 2.479225154407636, + "grad_norm": 0.5682786703109741, + "learning_rate": 8.902620483934915e-07, + "loss": 0.3263, + "step": 8831 + }, + { + "epoch": 2.47950589556429, + "grad_norm": 0.5124207735061646, + "learning_rate": 8.89331969603991e-07, + "loss": 0.3681, + "step": 8832 + }, + { + "epoch": 2.4797866367209433, + "grad_norm": 0.5345808267593384, + "learning_rate": 8.884023294794202e-07, + "loss": 0.3145, + "step": 8833 + }, + { + "epoch": 2.480067377877597, + "grad_norm": 0.5861395597457886, + "learning_rate": 8.87473128118983e-07, + "loss": 0.3744, + "step": 8834 + }, + { + "epoch": 2.4803481190342502, + "grad_norm": 0.533804178237915, + "learning_rate": 8.865443656218409e-07, + "loss": 0.3531, + "step": 8835 + }, + { + "epoch": 2.480628860190904, + "grad_norm": 0.5505049824714661, + "learning_rate": 8.856160420871029e-07, + "loss": 0.3218, + "step": 8836 + }, + { + "epoch": 2.4809096013475576, + "grad_norm": 0.572982132434845, + "learning_rate": 8.846881576138366e-07, + "loss": 0.3578, + "step": 8837 + }, + { + "epoch": 2.481190342504211, + "grad_norm": 0.6158345341682434, + "learning_rate": 8.837607123010572e-07, + "loss": 0.3306, + "step": 8838 + }, + { + "epoch": 2.481471083660865, + "grad_norm": 0.5887256860733032, + "learning_rate": 8.828337062477387e-07, + "loss": 0.3081, + "step": 8839 + }, + { + "epoch": 2.4817518248175183, + "grad_norm": 0.5889377593994141, + "learning_rate": 8.819071395528044e-07, + "loss": 0.2857, + "step": 8840 + }, + { + "epoch": 2.4820325659741718, + "grad_norm": 0.5727645754814148, + "learning_rate": 8.809810123151302e-07, + "loss": 0.3135, + "step": 8841 + }, + { + "epoch": 2.482313307130825, + "grad_norm": 0.535312294960022, + "learning_rate": 8.800553246335475e-07, + "loss": 0.3102, + "step": 8842 + }, + { + "epoch": 2.482594048287479, + "grad_norm": 0.5249013900756836, + "learning_rate": 8.791300766068416e-07, + "loss": 0.3297, + "step": 8843 + }, + { + "epoch": 2.4828747894441325, + "grad_norm": 0.5815885663032532, + "learning_rate": 8.782052683337466e-07, + "loss": 0.352, + "step": 8844 + }, + { + "epoch": 2.483155530600786, + "grad_norm": 0.64006108045578, + "learning_rate": 8.772808999129551e-07, + "loss": 0.2846, + "step": 8845 + }, + { + "epoch": 2.48343627175744, + "grad_norm": 0.5870609879493713, + "learning_rate": 8.763569714431075e-07, + "loss": 0.3247, + "step": 8846 + }, + { + "epoch": 2.4837170129140933, + "grad_norm": 0.6090686321258545, + "learning_rate": 8.754334830228012e-07, + "loss": 0.3026, + "step": 8847 + }, + { + "epoch": 2.4839977540707467, + "grad_norm": 0.6023396253585815, + "learning_rate": 8.74510434750585e-07, + "loss": 0.3225, + "step": 8848 + }, + { + "epoch": 2.4842784952274, + "grad_norm": 0.5896828770637512, + "learning_rate": 8.735878267249593e-07, + "loss": 0.3483, + "step": 8849 + }, + { + "epoch": 2.484559236384054, + "grad_norm": 0.5239513516426086, + "learning_rate": 8.726656590443816e-07, + "loss": 0.3149, + "step": 8850 + }, + { + "epoch": 2.4848399775407075, + "grad_norm": 0.5483566522598267, + "learning_rate": 8.717439318072568e-07, + "loss": 0.3037, + "step": 8851 + }, + { + "epoch": 2.485120718697361, + "grad_norm": 0.5274941921234131, + "learning_rate": 8.708226451119495e-07, + "loss": 0.3586, + "step": 8852 + }, + { + "epoch": 2.485401459854015, + "grad_norm": 0.6059607267379761, + "learning_rate": 8.699017990567704e-07, + "loss": 0.3171, + "step": 8853 + }, + { + "epoch": 2.4856822010106683, + "grad_norm": 0.5647132992744446, + "learning_rate": 8.689813937399887e-07, + "loss": 0.2982, + "step": 8854 + }, + { + "epoch": 2.4859629421673217, + "grad_norm": 0.5836697816848755, + "learning_rate": 8.680614292598244e-07, + "loss": 0.3333, + "step": 8855 + }, + { + "epoch": 2.486243683323975, + "grad_norm": 0.5829404592514038, + "learning_rate": 8.671419057144475e-07, + "loss": 0.3308, + "step": 8856 + }, + { + "epoch": 2.486524424480629, + "grad_norm": 0.5423857569694519, + "learning_rate": 8.662228232019876e-07, + "loss": 0.3304, + "step": 8857 + }, + { + "epoch": 2.4868051656372825, + "grad_norm": 0.6207121014595032, + "learning_rate": 8.653041818205204e-07, + "loss": 0.3718, + "step": 8858 + }, + { + "epoch": 2.487085906793936, + "grad_norm": 0.590457022190094, + "learning_rate": 8.6438598166808e-07, + "loss": 0.312, + "step": 8859 + }, + { + "epoch": 2.4873666479505894, + "grad_norm": 0.5615829825401306, + "learning_rate": 8.634682228426505e-07, + "loss": 0.2747, + "step": 8860 + }, + { + "epoch": 2.4876473891072433, + "grad_norm": 0.539747953414917, + "learning_rate": 8.625509054421671e-07, + "loss": 0.3144, + "step": 8861 + }, + { + "epoch": 2.4879281302638967, + "grad_norm": 0.5582963228225708, + "learning_rate": 8.616340295645231e-07, + "loss": 0.3043, + "step": 8862 + }, + { + "epoch": 2.48820887142055, + "grad_norm": 0.5768175721168518, + "learning_rate": 8.607175953075597e-07, + "loss": 0.2978, + "step": 8863 + }, + { + "epoch": 2.4884896125772036, + "grad_norm": 0.561724841594696, + "learning_rate": 8.598016027690753e-07, + "loss": 0.3553, + "step": 8864 + }, + { + "epoch": 2.4887703537338575, + "grad_norm": 0.517392098903656, + "learning_rate": 8.588860520468167e-07, + "loss": 0.332, + "step": 8865 + }, + { + "epoch": 2.489051094890511, + "grad_norm": 0.5608408451080322, + "learning_rate": 8.579709432384876e-07, + "loss": 0.2792, + "step": 8866 + }, + { + "epoch": 2.4893318360471643, + "grad_norm": 0.6274533271789551, + "learning_rate": 8.570562764417406e-07, + "loss": 0.3592, + "step": 8867 + }, + { + "epoch": 2.4896125772038182, + "grad_norm": 0.5518200397491455, + "learning_rate": 8.561420517541846e-07, + "loss": 0.3241, + "step": 8868 + }, + { + "epoch": 2.4898933183604717, + "grad_norm": 0.5387300848960876, + "learning_rate": 8.55228269273381e-07, + "loss": 0.2999, + "step": 8869 + }, + { + "epoch": 2.490174059517125, + "grad_norm": 0.5818502902984619, + "learning_rate": 8.543149290968422e-07, + "loss": 0.3076, + "step": 8870 + }, + { + "epoch": 2.4904548006737786, + "grad_norm": 0.4942573308944702, + "learning_rate": 8.53402031322032e-07, + "loss": 0.3066, + "step": 8871 + }, + { + "epoch": 2.4907355418304324, + "grad_norm": 0.5426687002182007, + "learning_rate": 8.52489576046373e-07, + "loss": 0.3359, + "step": 8872 + }, + { + "epoch": 2.491016282987086, + "grad_norm": 0.6341243982315063, + "learning_rate": 8.51577563367233e-07, + "loss": 0.364, + "step": 8873 + }, + { + "epoch": 2.4912970241437393, + "grad_norm": 0.5375432372093201, + "learning_rate": 8.506659933819395e-07, + "loss": 0.3095, + "step": 8874 + }, + { + "epoch": 2.491577765300393, + "grad_norm": 0.5210657119750977, + "learning_rate": 8.497548661877669e-07, + "loss": 0.3357, + "step": 8875 + }, + { + "epoch": 2.4918585064570467, + "grad_norm": 0.5737097263336182, + "learning_rate": 8.488441818819476e-07, + "loss": 0.3275, + "step": 8876 + }, + { + "epoch": 2.4921392476137, + "grad_norm": 0.5249507427215576, + "learning_rate": 8.479339405616633e-07, + "loss": 0.3435, + "step": 8877 + }, + { + "epoch": 2.4924199887703535, + "grad_norm": 0.5626519322395325, + "learning_rate": 8.470241423240472e-07, + "loss": 0.3107, + "step": 8878 + }, + { + "epoch": 2.4927007299270074, + "grad_norm": 0.5387868881225586, + "learning_rate": 8.461147872661902e-07, + "loss": 0.317, + "step": 8879 + }, + { + "epoch": 2.492981471083661, + "grad_norm": 0.5167239308357239, + "learning_rate": 8.452058754851306e-07, + "loss": 0.3349, + "step": 8880 + }, + { + "epoch": 2.4932622122403143, + "grad_norm": 0.5811373591423035, + "learning_rate": 8.442974070778643e-07, + "loss": 0.3044, + "step": 8881 + }, + { + "epoch": 2.493542953396968, + "grad_norm": 0.59946209192276, + "learning_rate": 8.433893821413358e-07, + "loss": 0.3655, + "step": 8882 + }, + { + "epoch": 2.4938236945536216, + "grad_norm": 0.49584391713142395, + "learning_rate": 8.424818007724434e-07, + "loss": 0.3858, + "step": 8883 + }, + { + "epoch": 2.494104435710275, + "grad_norm": 0.5428626537322998, + "learning_rate": 8.415746630680405e-07, + "loss": 0.3397, + "step": 8884 + }, + { + "epoch": 2.4943851768669285, + "grad_norm": 0.6168662905693054, + "learning_rate": 8.406679691249281e-07, + "loss": 0.3104, + "step": 8885 + }, + { + "epoch": 2.4946659180235824, + "grad_norm": 0.6244155764579773, + "learning_rate": 8.397617190398671e-07, + "loss": 0.269, + "step": 8886 + }, + { + "epoch": 2.494946659180236, + "grad_norm": 0.5118817090988159, + "learning_rate": 8.388559129095625e-07, + "loss": 0.3109, + "step": 8887 + }, + { + "epoch": 2.4952274003368893, + "grad_norm": 0.5412524938583374, + "learning_rate": 8.379505508306801e-07, + "loss": 0.342, + "step": 8888 + }, + { + "epoch": 2.495508141493543, + "grad_norm": 0.5388503670692444, + "learning_rate": 8.370456328998333e-07, + "loss": 0.3085, + "step": 8889 + }, + { + "epoch": 2.4957888826501966, + "grad_norm": 0.5854045748710632, + "learning_rate": 8.36141159213587e-07, + "loss": 0.3123, + "step": 8890 + }, + { + "epoch": 2.49606962380685, + "grad_norm": 0.6063563227653503, + "learning_rate": 8.352371298684641e-07, + "loss": 0.3666, + "step": 8891 + }, + { + "epoch": 2.4963503649635035, + "grad_norm": 0.60853511095047, + "learning_rate": 8.343335449609347e-07, + "loss": 0.3091, + "step": 8892 + }, + { + "epoch": 2.4966311061201574, + "grad_norm": 0.5416209697723389, + "learning_rate": 8.334304045874248e-07, + "loss": 0.3229, + "step": 8893 + }, + { + "epoch": 2.496911847276811, + "grad_norm": 0.5660837292671204, + "learning_rate": 8.325277088443129e-07, + "loss": 0.2786, + "step": 8894 + }, + { + "epoch": 2.4971925884334643, + "grad_norm": 0.5210088491439819, + "learning_rate": 8.316254578279276e-07, + "loss": 0.3188, + "step": 8895 + }, + { + "epoch": 2.497473329590118, + "grad_norm": 0.6038588285446167, + "learning_rate": 8.307236516345524e-07, + "loss": 0.3151, + "step": 8896 + }, + { + "epoch": 2.4977540707467716, + "grad_norm": 0.552193284034729, + "learning_rate": 8.298222903604225e-07, + "loss": 0.3382, + "step": 8897 + }, + { + "epoch": 2.498034811903425, + "grad_norm": 0.5803182721138, + "learning_rate": 8.289213741017238e-07, + "loss": 0.3278, + "step": 8898 + }, + { + "epoch": 2.4983155530600785, + "grad_norm": 0.6113681793212891, + "learning_rate": 8.280209029545993e-07, + "loss": 0.3152, + "step": 8899 + }, + { + "epoch": 2.4985962942167323, + "grad_norm": 0.5139860510826111, + "learning_rate": 8.27120877015139e-07, + "loss": 0.3099, + "step": 8900 + }, + { + "epoch": 2.498877035373386, + "grad_norm": 0.6146509051322937, + "learning_rate": 8.262212963793903e-07, + "loss": 0.2872, + "step": 8901 + }, + { + "epoch": 2.4991577765300392, + "grad_norm": 0.6226741075515747, + "learning_rate": 8.253221611433481e-07, + "loss": 0.3635, + "step": 8902 + }, + { + "epoch": 2.499438517686693, + "grad_norm": 0.5823273658752441, + "learning_rate": 8.244234714029664e-07, + "loss": 0.2683, + "step": 8903 + }, + { + "epoch": 2.4997192588433466, + "grad_norm": 0.5751005411148071, + "learning_rate": 8.23525227254145e-07, + "loss": 0.3014, + "step": 8904 + }, + { + "epoch": 2.5, + "grad_norm": 0.5652390718460083, + "learning_rate": 8.226274287927388e-07, + "loss": 0.3097, + "step": 8905 + }, + { + "epoch": 2.5002807411566534, + "grad_norm": 0.5503231287002563, + "learning_rate": 8.217300761145569e-07, + "loss": 0.3563, + "step": 8906 + }, + { + "epoch": 2.500561482313307, + "grad_norm": 0.5338963866233826, + "learning_rate": 8.208331693153577e-07, + "loss": 0.3417, + "step": 8907 + }, + { + "epoch": 2.5008422234699608, + "grad_norm": 0.5708197951316833, + "learning_rate": 8.199367084908544e-07, + "loss": 0.3123, + "step": 8908 + }, + { + "epoch": 2.501122964626614, + "grad_norm": 0.5358381271362305, + "learning_rate": 8.190406937367123e-07, + "loss": 0.3572, + "step": 8909 + }, + { + "epoch": 2.501403705783268, + "grad_norm": 0.5544228553771973, + "learning_rate": 8.181451251485461e-07, + "loss": 0.3483, + "step": 8910 + }, + { + "epoch": 2.5016844469399215, + "grad_norm": 0.49938660860061646, + "learning_rate": 8.172500028219283e-07, + "loss": 0.3211, + "step": 8911 + }, + { + "epoch": 2.501965188096575, + "grad_norm": 0.5111168026924133, + "learning_rate": 8.163553268523777e-07, + "loss": 0.3327, + "step": 8912 + }, + { + "epoch": 2.5022459292532284, + "grad_norm": 0.5480323433876038, + "learning_rate": 8.154610973353722e-07, + "loss": 0.3645, + "step": 8913 + }, + { + "epoch": 2.502526670409882, + "grad_norm": 0.5683262348175049, + "learning_rate": 8.145673143663347e-07, + "loss": 0.3696, + "step": 8914 + }, + { + "epoch": 2.5028074115665357, + "grad_norm": 0.5247148275375366, + "learning_rate": 8.136739780406472e-07, + "loss": 0.3248, + "step": 8915 + }, + { + "epoch": 2.503088152723189, + "grad_norm": 0.5809712409973145, + "learning_rate": 8.127810884536402e-07, + "loss": 0.3315, + "step": 8916 + }, + { + "epoch": 2.5033688938798426, + "grad_norm": 0.5839601755142212, + "learning_rate": 8.118886457005954e-07, + "loss": 0.2789, + "step": 8917 + }, + { + "epoch": 2.5036496350364965, + "grad_norm": 0.5308005213737488, + "learning_rate": 8.109966498767497e-07, + "loss": 0.2718, + "step": 8918 + }, + { + "epoch": 2.50393037619315, + "grad_norm": 0.5514929294586182, + "learning_rate": 8.101051010772937e-07, + "loss": 0.3064, + "step": 8919 + }, + { + "epoch": 2.5042111173498034, + "grad_norm": 0.5309748649597168, + "learning_rate": 8.092139993973641e-07, + "loss": 0.3732, + "step": 8920 + }, + { + "epoch": 2.504491858506457, + "grad_norm": 0.553950846195221, + "learning_rate": 8.083233449320576e-07, + "loss": 0.3305, + "step": 8921 + }, + { + "epoch": 2.5047725996631107, + "grad_norm": 0.5077446103096008, + "learning_rate": 8.07433137776416e-07, + "loss": 0.3318, + "step": 8922 + }, + { + "epoch": 2.505053340819764, + "grad_norm": 0.5609468817710876, + "learning_rate": 8.065433780254395e-07, + "loss": 0.3329, + "step": 8923 + }, + { + "epoch": 2.5053340819764176, + "grad_norm": 0.5669708847999573, + "learning_rate": 8.056540657740747e-07, + "loss": 0.2633, + "step": 8924 + }, + { + "epoch": 2.5056148231330715, + "grad_norm": 0.6168059706687927, + "learning_rate": 8.047652011172264e-07, + "loss": 0.3286, + "step": 8925 + }, + { + "epoch": 2.505895564289725, + "grad_norm": 0.5560932755470276, + "learning_rate": 8.038767841497475e-07, + "loss": 0.3118, + "step": 8926 + }, + { + "epoch": 2.5061763054463784, + "grad_norm": 0.5844700336456299, + "learning_rate": 8.029888149664434e-07, + "loss": 0.2957, + "step": 8927 + }, + { + "epoch": 2.506457046603032, + "grad_norm": 0.5862390995025635, + "learning_rate": 8.021012936620737e-07, + "loss": 0.3175, + "step": 8928 + }, + { + "epoch": 2.5067377877596857, + "grad_norm": 0.5474711656570435, + "learning_rate": 8.012142203313484e-07, + "loss": 0.331, + "step": 8929 + }, + { + "epoch": 2.507018528916339, + "grad_norm": 0.5384641885757446, + "learning_rate": 8.003275950689321e-07, + "loss": 0.3374, + "step": 8930 + }, + { + "epoch": 2.5072992700729926, + "grad_norm": 0.5157870650291443, + "learning_rate": 7.994414179694388e-07, + "loss": 0.278, + "step": 8931 + }, + { + "epoch": 2.5075800112296465, + "grad_norm": 0.5530809760093689, + "learning_rate": 7.985556891274349e-07, + "loss": 0.3128, + "step": 8932 + }, + { + "epoch": 2.5078607523863, + "grad_norm": 0.5196710228919983, + "learning_rate": 7.976704086374415e-07, + "loss": 0.3215, + "step": 8933 + }, + { + "epoch": 2.5081414935429533, + "grad_norm": 0.5491336584091187, + "learning_rate": 7.967855765939286e-07, + "loss": 0.3174, + "step": 8934 + }, + { + "epoch": 2.508422234699607, + "grad_norm": 0.5538433790206909, + "learning_rate": 7.959011930913219e-07, + "loss": 0.329, + "step": 8935 + }, + { + "epoch": 2.5087029758562607, + "grad_norm": 0.5808168053627014, + "learning_rate": 7.950172582239957e-07, + "loss": 0.2845, + "step": 8936 + }, + { + "epoch": 2.508983717012914, + "grad_norm": 0.491447389125824, + "learning_rate": 7.94133772086279e-07, + "loss": 0.3353, + "step": 8937 + }, + { + "epoch": 2.5092644581695676, + "grad_norm": 0.5617589950561523, + "learning_rate": 7.932507347724522e-07, + "loss": 0.314, + "step": 8938 + }, + { + "epoch": 2.5095451993262214, + "grad_norm": 0.48420363664627075, + "learning_rate": 7.923681463767452e-07, + "loss": 0.3696, + "step": 8939 + }, + { + "epoch": 2.509825940482875, + "grad_norm": 0.5282022953033447, + "learning_rate": 7.914860069933455e-07, + "loss": 0.2646, + "step": 8940 + }, + { + "epoch": 2.5101066816395283, + "grad_norm": 0.5548469424247742, + "learning_rate": 7.906043167163874e-07, + "loss": 0.3555, + "step": 8941 + }, + { + "epoch": 2.5103874227961818, + "grad_norm": 0.482534259557724, + "learning_rate": 7.89723075639961e-07, + "loss": 0.3473, + "step": 8942 + }, + { + "epoch": 2.5106681639528357, + "grad_norm": 0.5801131129264832, + "learning_rate": 7.888422838581056e-07, + "loss": 0.3502, + "step": 8943 + }, + { + "epoch": 2.510948905109489, + "grad_norm": 0.5397019386291504, + "learning_rate": 7.879619414648121e-07, + "loss": 0.268, + "step": 8944 + }, + { + "epoch": 2.5112296462661425, + "grad_norm": 0.46736371517181396, + "learning_rate": 7.870820485540298e-07, + "loss": 0.3665, + "step": 8945 + }, + { + "epoch": 2.5115103874227964, + "grad_norm": 0.5036104321479797, + "learning_rate": 7.862026052196514e-07, + "loss": 0.3464, + "step": 8946 + }, + { + "epoch": 2.51179112857945, + "grad_norm": 0.6011244654655457, + "learning_rate": 7.853236115555285e-07, + "loss": 0.3021, + "step": 8947 + }, + { + "epoch": 2.5120718697361033, + "grad_norm": 0.5795788168907166, + "learning_rate": 7.844450676554604e-07, + "loss": 0.326, + "step": 8948 + }, + { + "epoch": 2.5123526108927567, + "grad_norm": 0.5732229948043823, + "learning_rate": 7.83566973613199e-07, + "loss": 0.3181, + "step": 8949 + }, + { + "epoch": 2.51263335204941, + "grad_norm": 0.5182971954345703, + "learning_rate": 7.826893295224509e-07, + "loss": 0.3418, + "step": 8950 + }, + { + "epoch": 2.512914093206064, + "grad_norm": 0.6016291975975037, + "learning_rate": 7.818121354768704e-07, + "loss": 0.3706, + "step": 8951 + }, + { + "epoch": 2.5131948343627175, + "grad_norm": 0.6650599837303162, + "learning_rate": 7.80935391570069e-07, + "loss": 0.2721, + "step": 8952 + }, + { + "epoch": 2.5134755755193714, + "grad_norm": 0.49425604939460754, + "learning_rate": 7.800590978956058e-07, + "loss": 0.3059, + "step": 8953 + }, + { + "epoch": 2.513756316676025, + "grad_norm": 0.5239137411117554, + "learning_rate": 7.791832545469924e-07, + "loss": 0.3131, + "step": 8954 + }, + { + "epoch": 2.5140370578326783, + "grad_norm": 0.5117790699005127, + "learning_rate": 7.783078616176959e-07, + "loss": 0.3273, + "step": 8955 + }, + { + "epoch": 2.5143177989893317, + "grad_norm": 0.5523049831390381, + "learning_rate": 7.774329192011304e-07, + "loss": 0.3449, + "step": 8956 + }, + { + "epoch": 2.514598540145985, + "grad_norm": 0.5536803603172302, + "learning_rate": 7.765584273906662e-07, + "loss": 0.3483, + "step": 8957 + }, + { + "epoch": 2.514879281302639, + "grad_norm": 0.5914592146873474, + "learning_rate": 7.756843862796215e-07, + "loss": 0.3217, + "step": 8958 + }, + { + "epoch": 2.5151600224592925, + "grad_norm": 0.5371770858764648, + "learning_rate": 7.748107959612711e-07, + "loss": 0.3223, + "step": 8959 + }, + { + "epoch": 2.5154407636159464, + "grad_norm": 0.5213318467140198, + "learning_rate": 7.739376565288382e-07, + "loss": 0.3054, + "step": 8960 + }, + { + "epoch": 2.5157215047726, + "grad_norm": 0.4887320399284363, + "learning_rate": 7.73064968075497e-07, + "loss": 0.3586, + "step": 8961 + }, + { + "epoch": 2.5160022459292533, + "grad_norm": 0.5178138017654419, + "learning_rate": 7.721927306943783e-07, + "loss": 0.3231, + "step": 8962 + }, + { + "epoch": 2.5162829870859067, + "grad_norm": 0.5341764688491821, + "learning_rate": 7.713209444785586e-07, + "loss": 0.3341, + "step": 8963 + }, + { + "epoch": 2.51656372824256, + "grad_norm": 0.5665843486785889, + "learning_rate": 7.704496095210729e-07, + "loss": 0.3415, + "step": 8964 + }, + { + "epoch": 2.516844469399214, + "grad_norm": 0.5507887601852417, + "learning_rate": 7.695787259149029e-07, + "loss": 0.3028, + "step": 8965 + }, + { + "epoch": 2.5171252105558675, + "grad_norm": 0.5506218671798706, + "learning_rate": 7.687082937529827e-07, + "loss": 0.3306, + "step": 8966 + }, + { + "epoch": 2.517405951712521, + "grad_norm": 0.5599642395973206, + "learning_rate": 7.678383131282024e-07, + "loss": 0.3473, + "step": 8967 + }, + { + "epoch": 2.517686692869175, + "grad_norm": 0.548728346824646, + "learning_rate": 7.669687841333978e-07, + "loss": 0.3222, + "step": 8968 + }, + { + "epoch": 2.5179674340258282, + "grad_norm": 0.44374653697013855, + "learning_rate": 7.660997068613607e-07, + "loss": 0.3252, + "step": 8969 + }, + { + "epoch": 2.5182481751824817, + "grad_norm": 0.5944491624832153, + "learning_rate": 7.652310814048358e-07, + "loss": 0.2953, + "step": 8970 + }, + { + "epoch": 2.518528916339135, + "grad_norm": 0.5412534475326538, + "learning_rate": 7.643629078565141e-07, + "loss": 0.3049, + "step": 8971 + }, + { + "epoch": 2.518809657495789, + "grad_norm": 0.5190032720565796, + "learning_rate": 7.634951863090445e-07, + "loss": 0.3413, + "step": 8972 + }, + { + "epoch": 2.5190903986524424, + "grad_norm": 0.5591129660606384, + "learning_rate": 7.626279168550221e-07, + "loss": 0.3227, + "step": 8973 + }, + { + "epoch": 2.519371139809096, + "grad_norm": 0.524470865726471, + "learning_rate": 7.61761099586999e-07, + "loss": 0.3242, + "step": 8974 + }, + { + "epoch": 2.5196518809657498, + "grad_norm": 0.5753305554389954, + "learning_rate": 7.60894734597476e-07, + "loss": 0.2893, + "step": 8975 + }, + { + "epoch": 2.519932622122403, + "grad_norm": 0.5870516896247864, + "learning_rate": 7.600288219789043e-07, + "loss": 0.3517, + "step": 8976 + }, + { + "epoch": 2.5202133632790567, + "grad_norm": 0.5744442343711853, + "learning_rate": 7.59163361823691e-07, + "loss": 0.3036, + "step": 8977 + }, + { + "epoch": 2.52049410443571, + "grad_norm": 0.6032195091247559, + "learning_rate": 7.582983542241906e-07, + "loss": 0.3084, + "step": 8978 + }, + { + "epoch": 2.520774845592364, + "grad_norm": 0.4968772530555725, + "learning_rate": 7.574337992727137e-07, + "loss": 0.3374, + "step": 8979 + }, + { + "epoch": 2.5210555867490174, + "grad_norm": 0.5651487112045288, + "learning_rate": 7.565696970615188e-07, + "loss": 0.3112, + "step": 8980 + }, + { + "epoch": 2.521336327905671, + "grad_norm": 0.5269807577133179, + "learning_rate": 7.557060476828171e-07, + "loss": 0.3436, + "step": 8981 + }, + { + "epoch": 2.5216170690623247, + "grad_norm": 0.5763477683067322, + "learning_rate": 7.548428512287731e-07, + "loss": 0.2804, + "step": 8982 + }, + { + "epoch": 2.521897810218978, + "grad_norm": 0.5456498861312866, + "learning_rate": 7.539801077914999e-07, + "loss": 0.3214, + "step": 8983 + }, + { + "epoch": 2.5221785513756316, + "grad_norm": 0.5414750576019287, + "learning_rate": 7.531178174630671e-07, + "loss": 0.3298, + "step": 8984 + }, + { + "epoch": 2.522459292532285, + "grad_norm": 0.5692690014839172, + "learning_rate": 7.522559803354895e-07, + "loss": 0.3135, + "step": 8985 + }, + { + "epoch": 2.522740033688939, + "grad_norm": 0.5702782869338989, + "learning_rate": 7.513945965007396e-07, + "loss": 0.3163, + "step": 8986 + }, + { + "epoch": 2.5230207748455924, + "grad_norm": 0.6065912246704102, + "learning_rate": 7.50533666050739e-07, + "loss": 0.3094, + "step": 8987 + }, + { + "epoch": 2.523301516002246, + "grad_norm": 0.5257893800735474, + "learning_rate": 7.496731890773579e-07, + "loss": 0.3321, + "step": 8988 + }, + { + "epoch": 2.5235822571588997, + "grad_norm": 0.5629504919052124, + "learning_rate": 7.488131656724246e-07, + "loss": 0.3232, + "step": 8989 + }, + { + "epoch": 2.523862998315553, + "grad_norm": 0.49127185344696045, + "learning_rate": 7.479535959277123e-07, + "loss": 0.3365, + "step": 8990 + }, + { + "epoch": 2.5241437394722066, + "grad_norm": 0.535672664642334, + "learning_rate": 7.470944799349517e-07, + "loss": 0.3256, + "step": 8991 + }, + { + "epoch": 2.52442448062886, + "grad_norm": 0.5545815825462341, + "learning_rate": 7.462358177858214e-07, + "loss": 0.2886, + "step": 8992 + }, + { + "epoch": 2.5247052217855135, + "grad_norm": 0.5472292900085449, + "learning_rate": 7.453776095719511e-07, + "loss": 0.3466, + "step": 8993 + }, + { + "epoch": 2.5249859629421674, + "grad_norm": 0.610140323638916, + "learning_rate": 7.445198553849259e-07, + "loss": 0.2881, + "step": 8994 + }, + { + "epoch": 2.525266704098821, + "grad_norm": 0.5382416844367981, + "learning_rate": 7.436625553162757e-07, + "loss": 0.2802, + "step": 8995 + }, + { + "epoch": 2.5255474452554747, + "grad_norm": 0.4972606599330902, + "learning_rate": 7.42805709457492e-07, + "loss": 0.3334, + "step": 8996 + }, + { + "epoch": 2.525828186412128, + "grad_norm": 0.5735927224159241, + "learning_rate": 7.419493179000087e-07, + "loss": 0.2911, + "step": 8997 + }, + { + "epoch": 2.5261089275687816, + "grad_norm": 0.5767256617546082, + "learning_rate": 7.410933807352144e-07, + "loss": 0.3525, + "step": 8998 + }, + { + "epoch": 2.526389668725435, + "grad_norm": 0.6331986784934998, + "learning_rate": 7.402378980544517e-07, + "loss": 0.341, + "step": 8999 + }, + { + "epoch": 2.5266704098820885, + "grad_norm": 0.5588213801383972, + "learning_rate": 7.393828699490091e-07, + "loss": 0.3509, + "step": 9000 + }, + { + "epoch": 2.5269511510387423, + "grad_norm": 0.6067502498626709, + "learning_rate": 7.385282965101326e-07, + "loss": 0.3248, + "step": 9001 + }, + { + "epoch": 2.527231892195396, + "grad_norm": 0.5410355925559998, + "learning_rate": 7.376741778290164e-07, + "loss": 0.3635, + "step": 9002 + }, + { + "epoch": 2.5275126333520497, + "grad_norm": 0.6640228033065796, + "learning_rate": 7.368205139968048e-07, + "loss": 0.318, + "step": 9003 + }, + { + "epoch": 2.527793374508703, + "grad_norm": 0.6127304434776306, + "learning_rate": 7.359673051045979e-07, + "loss": 0.29, + "step": 9004 + }, + { + "epoch": 2.5280741156653566, + "grad_norm": 0.5498339533805847, + "learning_rate": 7.351145512434432e-07, + "loss": 0.3223, + "step": 9005 + }, + { + "epoch": 2.52835485682201, + "grad_norm": 0.5274477005004883, + "learning_rate": 7.34262252504343e-07, + "loss": 0.3181, + "step": 9006 + }, + { + "epoch": 2.5286355979786634, + "grad_norm": 0.5333122611045837, + "learning_rate": 7.334104089782473e-07, + "loss": 0.3366, + "step": 9007 + }, + { + "epoch": 2.5289163391353173, + "grad_norm": 0.513627827167511, + "learning_rate": 7.325590207560618e-07, + "loss": 0.3505, + "step": 9008 + }, + { + "epoch": 2.5291970802919708, + "grad_norm": 0.5493861436843872, + "learning_rate": 7.317080879286398e-07, + "loss": 0.3318, + "step": 9009 + }, + { + "epoch": 2.529477821448624, + "grad_norm": 0.5757958292961121, + "learning_rate": 7.308576105867871e-07, + "loss": 0.3204, + "step": 9010 + }, + { + "epoch": 2.529758562605278, + "grad_norm": 0.5586037039756775, + "learning_rate": 7.300075888212632e-07, + "loss": 0.3082, + "step": 9011 + }, + { + "epoch": 2.5300393037619315, + "grad_norm": 0.5369079113006592, + "learning_rate": 7.291580227227751e-07, + "loss": 0.328, + "step": 9012 + }, + { + "epoch": 2.530320044918585, + "grad_norm": 0.6014928221702576, + "learning_rate": 7.283089123819853e-07, + "loss": 0.3023, + "step": 9013 + }, + { + "epoch": 2.5306007860752384, + "grad_norm": 0.5827710032463074, + "learning_rate": 7.27460257889504e-07, + "loss": 0.3045, + "step": 9014 + }, + { + "epoch": 2.5308815272318923, + "grad_norm": 0.5509430766105652, + "learning_rate": 7.266120593358944e-07, + "loss": 0.3146, + "step": 9015 + }, + { + "epoch": 2.5311622683885457, + "grad_norm": 0.5305706262588501, + "learning_rate": 7.257643168116718e-07, + "loss": 0.3146, + "step": 9016 + }, + { + "epoch": 2.531443009545199, + "grad_norm": 0.5430615544319153, + "learning_rate": 7.249170304073011e-07, + "loss": 0.3394, + "step": 9017 + }, + { + "epoch": 2.531723750701853, + "grad_norm": 0.581591010093689, + "learning_rate": 7.240702002132005e-07, + "loss": 0.289, + "step": 9018 + }, + { + "epoch": 2.5320044918585065, + "grad_norm": 0.5528303384780884, + "learning_rate": 7.232238263197372e-07, + "loss": 0.3649, + "step": 9019 + }, + { + "epoch": 2.53228523301516, + "grad_norm": 0.5842733979225159, + "learning_rate": 7.223779088172317e-07, + "loss": 0.3548, + "step": 9020 + }, + { + "epoch": 2.5325659741718134, + "grad_norm": 0.5184828042984009, + "learning_rate": 7.215324477959557e-07, + "loss": 0.2878, + "step": 9021 + }, + { + "epoch": 2.5328467153284673, + "grad_norm": 0.6485675573348999, + "learning_rate": 7.206874433461298e-07, + "loss": 0.3439, + "step": 9022 + }, + { + "epoch": 2.5331274564851207, + "grad_norm": 0.6202267408370972, + "learning_rate": 7.1984289555793e-07, + "loss": 0.3471, + "step": 9023 + }, + { + "epoch": 2.533408197641774, + "grad_norm": 0.5531259179115295, + "learning_rate": 7.189988045214797e-07, + "loss": 0.3124, + "step": 9024 + }, + { + "epoch": 2.533688938798428, + "grad_norm": 0.6165199279785156, + "learning_rate": 7.181551703268541e-07, + "loss": 0.3464, + "step": 9025 + }, + { + "epoch": 2.5339696799550815, + "grad_norm": 0.6090344190597534, + "learning_rate": 7.173119930640826e-07, + "loss": 0.3429, + "step": 9026 + }, + { + "epoch": 2.534250421111735, + "grad_norm": 0.5615666508674622, + "learning_rate": 7.16469272823142e-07, + "loss": 0.3225, + "step": 9027 + }, + { + "epoch": 2.5345311622683884, + "grad_norm": 0.5526992678642273, + "learning_rate": 7.156270096939644e-07, + "loss": 0.3639, + "step": 9028 + }, + { + "epoch": 2.5348119034250423, + "grad_norm": 0.5863558053970337, + "learning_rate": 7.147852037664293e-07, + "loss": 0.32, + "step": 9029 + }, + { + "epoch": 2.5350926445816957, + "grad_norm": 0.5933303833007812, + "learning_rate": 7.139438551303684e-07, + "loss": 0.3524, + "step": 9030 + }, + { + "epoch": 2.535373385738349, + "grad_norm": 0.5584559440612793, + "learning_rate": 7.131029638755666e-07, + "loss": 0.2797, + "step": 9031 + }, + { + "epoch": 2.535654126895003, + "grad_norm": 0.5528318881988525, + "learning_rate": 7.122625300917574e-07, + "loss": 0.3657, + "step": 9032 + }, + { + "epoch": 2.5359348680516565, + "grad_norm": 0.5077992081642151, + "learning_rate": 7.114225538686287e-07, + "loss": 0.3182, + "step": 9033 + }, + { + "epoch": 2.53621560920831, + "grad_norm": 0.5783612132072449, + "learning_rate": 7.105830352958143e-07, + "loss": 0.351, + "step": 9034 + }, + { + "epoch": 2.5364963503649633, + "grad_norm": 0.6035152077674866, + "learning_rate": 7.097439744629059e-07, + "loss": 0.331, + "step": 9035 + }, + { + "epoch": 2.5367770915216172, + "grad_norm": 0.5512037873268127, + "learning_rate": 7.089053714594413e-07, + "loss": 0.3505, + "step": 9036 + }, + { + "epoch": 2.5370578326782707, + "grad_norm": 0.5658860802650452, + "learning_rate": 7.080672263749094e-07, + "loss": 0.3831, + "step": 9037 + }, + { + "epoch": 2.537338573834924, + "grad_norm": 0.5604510307312012, + "learning_rate": 7.072295392987543e-07, + "loss": 0.3089, + "step": 9038 + }, + { + "epoch": 2.537619314991578, + "grad_norm": 0.5483195185661316, + "learning_rate": 7.063923103203668e-07, + "loss": 0.3593, + "step": 9039 + }, + { + "epoch": 2.5379000561482314, + "grad_norm": 0.5885195136070251, + "learning_rate": 7.05555539529093e-07, + "loss": 0.3606, + "step": 9040 + }, + { + "epoch": 2.538180797304885, + "grad_norm": 0.6049413084983826, + "learning_rate": 7.047192270142273e-07, + "loss": 0.3124, + "step": 9041 + }, + { + "epoch": 2.5384615384615383, + "grad_norm": 0.5033851861953735, + "learning_rate": 7.03883372865013e-07, + "loss": 0.3535, + "step": 9042 + }, + { + "epoch": 2.5387422796181918, + "grad_norm": 0.5461069941520691, + "learning_rate": 7.03047977170651e-07, + "loss": 0.3108, + "step": 9043 + }, + { + "epoch": 2.5390230207748457, + "grad_norm": 0.48883023858070374, + "learning_rate": 7.022130400202864e-07, + "loss": 0.3801, + "step": 9044 + }, + { + "epoch": 2.539303761931499, + "grad_norm": 0.5367558002471924, + "learning_rate": 7.013785615030217e-07, + "loss": 0.3327, + "step": 9045 + }, + { + "epoch": 2.539584503088153, + "grad_norm": 0.5318974256515503, + "learning_rate": 7.005445417079038e-07, + "loss": 0.3152, + "step": 9046 + }, + { + "epoch": 2.5398652442448064, + "grad_norm": 0.5410260558128357, + "learning_rate": 6.99710980723936e-07, + "loss": 0.3635, + "step": 9047 + }, + { + "epoch": 2.54014598540146, + "grad_norm": 0.5868778824806213, + "learning_rate": 6.988778786400719e-07, + "loss": 0.3084, + "step": 9048 + }, + { + "epoch": 2.5404267265581133, + "grad_norm": 0.562307596206665, + "learning_rate": 6.980452355452122e-07, + "loss": 0.3624, + "step": 9049 + }, + { + "epoch": 2.5407074677147667, + "grad_norm": 0.5488806962966919, + "learning_rate": 6.972130515282144e-07, + "loss": 0.3285, + "step": 9050 + }, + { + "epoch": 2.5409882088714206, + "grad_norm": 0.6462432742118835, + "learning_rate": 6.963813266778824e-07, + "loss": 0.3234, + "step": 9051 + }, + { + "epoch": 2.541268950028074, + "grad_norm": 0.5582271218299866, + "learning_rate": 6.955500610829718e-07, + "loss": 0.3333, + "step": 9052 + }, + { + "epoch": 2.541549691184728, + "grad_norm": 0.6388147473335266, + "learning_rate": 6.947192548321918e-07, + "loss": 0.3011, + "step": 9053 + }, + { + "epoch": 2.5418304323413814, + "grad_norm": 0.5151721835136414, + "learning_rate": 6.938889080141992e-07, + "loss": 0.3262, + "step": 9054 + }, + { + "epoch": 2.542111173498035, + "grad_norm": 0.594851553440094, + "learning_rate": 6.930590207176058e-07, + "loss": 0.3543, + "step": 9055 + }, + { + "epoch": 2.5423919146546883, + "grad_norm": 0.5733625292778015, + "learning_rate": 6.922295930309691e-07, + "loss": 0.3097, + "step": 9056 + }, + { + "epoch": 2.5426726558113417, + "grad_norm": 0.556485652923584, + "learning_rate": 6.914006250428034e-07, + "loss": 0.3398, + "step": 9057 + }, + { + "epoch": 2.5429533969679956, + "grad_norm": 0.5053539276123047, + "learning_rate": 6.905721168415691e-07, + "loss": 0.315, + "step": 9058 + }, + { + "epoch": 2.543234138124649, + "grad_norm": 0.5656735301017761, + "learning_rate": 6.897440685156792e-07, + "loss": 0.3504, + "step": 9059 + }, + { + "epoch": 2.5435148792813025, + "grad_norm": 0.644794225692749, + "learning_rate": 6.889164801534998e-07, + "loss": 0.3133, + "step": 9060 + }, + { + "epoch": 2.5437956204379564, + "grad_norm": 0.5380450487136841, + "learning_rate": 6.880893518433434e-07, + "loss": 0.3205, + "step": 9061 + }, + { + "epoch": 2.54407636159461, + "grad_norm": 0.5853850245475769, + "learning_rate": 6.872626836734786e-07, + "loss": 0.3657, + "step": 9062 + }, + { + "epoch": 2.5443571027512633, + "grad_norm": 0.5349850058555603, + "learning_rate": 6.864364757321213e-07, + "loss": 0.3277, + "step": 9063 + }, + { + "epoch": 2.5446378439079167, + "grad_norm": 0.584952175617218, + "learning_rate": 6.856107281074376e-07, + "loss": 0.3395, + "step": 9064 + }, + { + "epoch": 2.5449185850645706, + "grad_norm": 0.5998843312263489, + "learning_rate": 6.847854408875488e-07, + "loss": 0.338, + "step": 9065 + }, + { + "epoch": 2.545199326221224, + "grad_norm": 0.5139232277870178, + "learning_rate": 6.839606141605221e-07, + "loss": 0.3064, + "step": 9066 + }, + { + "epoch": 2.5454800673778775, + "grad_norm": 0.559762716293335, + "learning_rate": 6.831362480143799e-07, + "loss": 0.32, + "step": 9067 + }, + { + "epoch": 2.5457608085345313, + "grad_norm": 0.5240930914878845, + "learning_rate": 6.823123425370914e-07, + "loss": 0.3414, + "step": 9068 + }, + { + "epoch": 2.546041549691185, + "grad_norm": 0.5897007584571838, + "learning_rate": 6.814888978165812e-07, + "loss": 0.2849, + "step": 9069 + }, + { + "epoch": 2.5463222908478382, + "grad_norm": 0.4596761465072632, + "learning_rate": 6.806659139407206e-07, + "loss": 0.3747, + "step": 9070 + }, + { + "epoch": 2.5466030320044917, + "grad_norm": 0.5650991797447205, + "learning_rate": 6.798433909973312e-07, + "loss": 0.3151, + "step": 9071 + }, + { + "epoch": 2.5468837731611456, + "grad_norm": 0.5619720220565796, + "learning_rate": 6.790213290741921e-07, + "loss": 0.3338, + "step": 9072 + }, + { + "epoch": 2.547164514317799, + "grad_norm": 0.5655146837234497, + "learning_rate": 6.781997282590264e-07, + "loss": 0.2953, + "step": 9073 + }, + { + "epoch": 2.5474452554744524, + "grad_norm": 0.5359267592430115, + "learning_rate": 6.773785886395096e-07, + "loss": 0.3197, + "step": 9074 + }, + { + "epoch": 2.5477259966311063, + "grad_norm": 0.5062718987464905, + "learning_rate": 6.765579103032694e-07, + "loss": 0.3348, + "step": 9075 + }, + { + "epoch": 2.5480067377877598, + "grad_norm": 0.5986114740371704, + "learning_rate": 6.757376933378829e-07, + "loss": 0.2997, + "step": 9076 + }, + { + "epoch": 2.548287478944413, + "grad_norm": 0.5603893995285034, + "learning_rate": 6.749179378308795e-07, + "loss": 0.2791, + "step": 9077 + }, + { + "epoch": 2.5485682201010667, + "grad_norm": 0.5378441214561462, + "learning_rate": 6.740986438697372e-07, + "loss": 0.3339, + "step": 9078 + }, + { + "epoch": 2.5488489612577205, + "grad_norm": 0.6141390800476074, + "learning_rate": 6.732798115418876e-07, + "loss": 0.3197, + "step": 9079 + }, + { + "epoch": 2.549129702414374, + "grad_norm": 0.5450079441070557, + "learning_rate": 6.724614409347102e-07, + "loss": 0.3639, + "step": 9080 + }, + { + "epoch": 2.5494104435710274, + "grad_norm": 0.5260674953460693, + "learning_rate": 6.716435321355358e-07, + "loss": 0.3182, + "step": 9081 + }, + { + "epoch": 2.5496911847276813, + "grad_norm": 0.5757387280464172, + "learning_rate": 6.708260852316483e-07, + "loss": 0.3647, + "step": 9082 + }, + { + "epoch": 2.5499719258843347, + "grad_norm": 0.5008442401885986, + "learning_rate": 6.700091003102788e-07, + "loss": 0.3448, + "step": 9083 + }, + { + "epoch": 2.550252667040988, + "grad_norm": 0.5486941337585449, + "learning_rate": 6.691925774586122e-07, + "loss": 0.332, + "step": 9084 + }, + { + "epoch": 2.5505334081976416, + "grad_norm": 0.5755200982093811, + "learning_rate": 6.683765167637829e-07, + "loss": 0.2807, + "step": 9085 + }, + { + "epoch": 2.550814149354295, + "grad_norm": 0.5597274303436279, + "learning_rate": 6.675609183128739e-07, + "loss": 0.3065, + "step": 9086 + }, + { + "epoch": 2.551094890510949, + "grad_norm": 0.5731322765350342, + "learning_rate": 6.667457821929229e-07, + "loss": 0.2875, + "step": 9087 + }, + { + "epoch": 2.5513756316676024, + "grad_norm": 0.5955571532249451, + "learning_rate": 6.659311084909143e-07, + "loss": 0.3247, + "step": 9088 + }, + { + "epoch": 2.5516563728242563, + "grad_norm": 0.5537461638450623, + "learning_rate": 6.651168972937871e-07, + "loss": 0.337, + "step": 9089 + }, + { + "epoch": 2.5519371139809097, + "grad_norm": 0.5642498135566711, + "learning_rate": 6.643031486884266e-07, + "loss": 0.3071, + "step": 9090 + }, + { + "epoch": 2.552217855137563, + "grad_norm": 0.6300172805786133, + "learning_rate": 6.634898627616731e-07, + "loss": 0.2783, + "step": 9091 + }, + { + "epoch": 2.5524985962942166, + "grad_norm": 0.5638207793235779, + "learning_rate": 6.626770396003145e-07, + "loss": 0.3316, + "step": 9092 + }, + { + "epoch": 2.55277933745087, + "grad_norm": 0.6052414774894714, + "learning_rate": 6.618646792910893e-07, + "loss": 0.3487, + "step": 9093 + }, + { + "epoch": 2.553060078607524, + "grad_norm": 0.5608871579170227, + "learning_rate": 6.610527819206897e-07, + "loss": 0.3197, + "step": 9094 + }, + { + "epoch": 2.5533408197641774, + "grad_norm": 0.5285017490386963, + "learning_rate": 6.602413475757541e-07, + "loss": 0.3102, + "step": 9095 + }, + { + "epoch": 2.5536215609208313, + "grad_norm": 0.6314661502838135, + "learning_rate": 6.594303763428744e-07, + "loss": 0.2841, + "step": 9096 + }, + { + "epoch": 2.5539023020774847, + "grad_norm": 0.5488671660423279, + "learning_rate": 6.586198683085937e-07, + "loss": 0.3093, + "step": 9097 + }, + { + "epoch": 2.554183043234138, + "grad_norm": 0.5688106417655945, + "learning_rate": 6.578098235594022e-07, + "loss": 0.3421, + "step": 9098 + }, + { + "epoch": 2.5544637843907916, + "grad_norm": 0.601155161857605, + "learning_rate": 6.570002421817456e-07, + "loss": 0.3196, + "step": 9099 + }, + { + "epoch": 2.554744525547445, + "grad_norm": 0.5598655343055725, + "learning_rate": 6.561911242620156e-07, + "loss": 0.354, + "step": 9100 + }, + { + "epoch": 2.555025266704099, + "grad_norm": 0.49863100051879883, + "learning_rate": 6.553824698865552e-07, + "loss": 0.3619, + "step": 9101 + }, + { + "epoch": 2.5553060078607523, + "grad_norm": 0.56257164478302, + "learning_rate": 6.545742791416615e-07, + "loss": 0.3314, + "step": 9102 + }, + { + "epoch": 2.555586749017406, + "grad_norm": 0.536172091960907, + "learning_rate": 6.537665521135772e-07, + "loss": 0.3177, + "step": 9103 + }, + { + "epoch": 2.5558674901740597, + "grad_norm": 0.5639461874961853, + "learning_rate": 6.529592888885e-07, + "loss": 0.3537, + "step": 9104 + }, + { + "epoch": 2.556148231330713, + "grad_norm": 0.6627373099327087, + "learning_rate": 6.521524895525738e-07, + "loss": 0.297, + "step": 9105 + }, + { + "epoch": 2.5564289724873666, + "grad_norm": 0.5472077131271362, + "learning_rate": 6.513461541918981e-07, + "loss": 0.3651, + "step": 9106 + }, + { + "epoch": 2.55670971364402, + "grad_norm": 0.6022294759750366, + "learning_rate": 6.505402828925178e-07, + "loss": 0.3175, + "step": 9107 + }, + { + "epoch": 2.556990454800674, + "grad_norm": 0.580473780632019, + "learning_rate": 6.497348757404298e-07, + "loss": 0.3987, + "step": 9108 + }, + { + "epoch": 2.5572711959573273, + "grad_norm": 0.570435643196106, + "learning_rate": 6.489299328215848e-07, + "loss": 0.331, + "step": 9109 + }, + { + "epoch": 2.5575519371139808, + "grad_norm": 0.5406184792518616, + "learning_rate": 6.48125454221879e-07, + "loss": 0.353, + "step": 9110 + }, + { + "epoch": 2.5578326782706347, + "grad_norm": 0.517946183681488, + "learning_rate": 6.473214400271626e-07, + "loss": 0.3527, + "step": 9111 + }, + { + "epoch": 2.558113419427288, + "grad_norm": 0.6016600131988525, + "learning_rate": 6.465178903232349e-07, + "loss": 0.3488, + "step": 9112 + }, + { + "epoch": 2.5583941605839415, + "grad_norm": 0.5373508930206299, + "learning_rate": 6.457148051958445e-07, + "loss": 0.3628, + "step": 9113 + }, + { + "epoch": 2.558674901740595, + "grad_norm": 0.5815570950508118, + "learning_rate": 6.449121847306938e-07, + "loss": 0.3125, + "step": 9114 + }, + { + "epoch": 2.558955642897249, + "grad_norm": 0.5918525457382202, + "learning_rate": 6.441100290134312e-07, + "loss": 0.2971, + "step": 9115 + }, + { + "epoch": 2.5592363840539023, + "grad_norm": 0.561310350894928, + "learning_rate": 6.433083381296596e-07, + "loss": 0.3199, + "step": 9116 + }, + { + "epoch": 2.5595171252105557, + "grad_norm": 0.5653798580169678, + "learning_rate": 6.425071121649285e-07, + "loss": 0.3219, + "step": 9117 + }, + { + "epoch": 2.5597978663672096, + "grad_norm": 0.5669142007827759, + "learning_rate": 6.417063512047417e-07, + "loss": 0.3513, + "step": 9118 + }, + { + "epoch": 2.560078607523863, + "grad_norm": 0.5990543961524963, + "learning_rate": 6.409060553345515e-07, + "loss": 0.3014, + "step": 9119 + }, + { + "epoch": 2.5603593486805165, + "grad_norm": 0.586254894733429, + "learning_rate": 6.401062246397582e-07, + "loss": 0.339, + "step": 9120 + }, + { + "epoch": 2.56064008983717, + "grad_norm": 0.5165368914604187, + "learning_rate": 6.393068592057173e-07, + "loss": 0.3488, + "step": 9121 + }, + { + "epoch": 2.560920830993824, + "grad_norm": 0.5481555461883545, + "learning_rate": 6.385079591177296e-07, + "loss": 0.3288, + "step": 9122 + }, + { + "epoch": 2.5612015721504773, + "grad_norm": 0.6165788173675537, + "learning_rate": 6.377095244610504e-07, + "loss": 0.3473, + "step": 9123 + }, + { + "epoch": 2.5614823133071307, + "grad_norm": 0.5769089460372925, + "learning_rate": 6.369115553208849e-07, + "loss": 0.3266, + "step": 9124 + }, + { + "epoch": 2.5617630544637846, + "grad_norm": 0.5593810081481934, + "learning_rate": 6.361140517823844e-07, + "loss": 0.3161, + "step": 9125 + }, + { + "epoch": 2.562043795620438, + "grad_norm": 0.5495381951332092, + "learning_rate": 6.353170139306563e-07, + "loss": 0.3237, + "step": 9126 + }, + { + "epoch": 2.5623245367770915, + "grad_norm": 0.5270056128501892, + "learning_rate": 6.345204418507534e-07, + "loss": 0.3618, + "step": 9127 + }, + { + "epoch": 2.562605277933745, + "grad_norm": 0.5322282910346985, + "learning_rate": 6.337243356276829e-07, + "loss": 0.3555, + "step": 9128 + }, + { + "epoch": 2.562886019090399, + "grad_norm": 0.5177984833717346, + "learning_rate": 6.329286953463992e-07, + "loss": 0.3413, + "step": 9129 + }, + { + "epoch": 2.5631667602470523, + "grad_norm": 0.5783156156539917, + "learning_rate": 6.321335210918067e-07, + "loss": 0.2853, + "step": 9130 + }, + { + "epoch": 2.5634475014037057, + "grad_norm": 0.5159136652946472, + "learning_rate": 6.313388129487641e-07, + "loss": 0.3684, + "step": 9131 + }, + { + "epoch": 2.5637282425603596, + "grad_norm": 0.5714773535728455, + "learning_rate": 6.305445710020758e-07, + "loss": 0.3101, + "step": 9132 + }, + { + "epoch": 2.564008983717013, + "grad_norm": 0.5122051239013672, + "learning_rate": 6.297507953364995e-07, + "loss": 0.3199, + "step": 9133 + }, + { + "epoch": 2.5642897248736665, + "grad_norm": 0.4964951276779175, + "learning_rate": 6.28957486036742e-07, + "loss": 0.3599, + "step": 9134 + }, + { + "epoch": 2.56457046603032, + "grad_norm": 0.5868041515350342, + "learning_rate": 6.281646431874583e-07, + "loss": 0.3152, + "step": 9135 + }, + { + "epoch": 2.5648512071869733, + "grad_norm": 0.5245321393013, + "learning_rate": 6.273722668732585e-07, + "loss": 0.3497, + "step": 9136 + }, + { + "epoch": 2.5651319483436272, + "grad_norm": 0.5406701564788818, + "learning_rate": 6.265803571786983e-07, + "loss": 0.2849, + "step": 9137 + }, + { + "epoch": 2.5654126895002807, + "grad_norm": 0.6307325959205627, + "learning_rate": 6.257889141882861e-07, + "loss": 0.3115, + "step": 9138 + }, + { + "epoch": 2.5656934306569346, + "grad_norm": 0.6060090065002441, + "learning_rate": 6.24997937986479e-07, + "loss": 0.3362, + "step": 9139 + }, + { + "epoch": 2.565974171813588, + "grad_norm": 0.5877820253372192, + "learning_rate": 6.242074286576866e-07, + "loss": 0.3368, + "step": 9140 + }, + { + "epoch": 2.5662549129702414, + "grad_norm": 0.5987187623977661, + "learning_rate": 6.234173862862663e-07, + "loss": 0.3253, + "step": 9141 + }, + { + "epoch": 2.566535654126895, + "grad_norm": 0.5555630922317505, + "learning_rate": 6.226278109565248e-07, + "loss": 0.35, + "step": 9142 + }, + { + "epoch": 2.5668163952835483, + "grad_norm": 0.5410875678062439, + "learning_rate": 6.21838702752724e-07, + "loss": 0.321, + "step": 9143 + }, + { + "epoch": 2.567097136440202, + "grad_norm": 0.567570686340332, + "learning_rate": 6.210500617590692e-07, + "loss": 0.274, + "step": 9144 + }, + { + "epoch": 2.5673778775968557, + "grad_norm": 0.5772649049758911, + "learning_rate": 6.202618880597222e-07, + "loss": 0.3114, + "step": 9145 + }, + { + "epoch": 2.5676586187535095, + "grad_norm": 0.5099751949310303, + "learning_rate": 6.194741817387906e-07, + "loss": 0.3284, + "step": 9146 + }, + { + "epoch": 2.567939359910163, + "grad_norm": 0.5567918419837952, + "learning_rate": 6.186869428803316e-07, + "loss": 0.3011, + "step": 9147 + }, + { + "epoch": 2.5682201010668164, + "grad_norm": 0.6177340745925903, + "learning_rate": 6.179001715683586e-07, + "loss": 0.2984, + "step": 9148 + }, + { + "epoch": 2.56850084222347, + "grad_norm": 0.6140207648277283, + "learning_rate": 6.171138678868288e-07, + "loss": 0.3346, + "step": 9149 + }, + { + "epoch": 2.5687815833801233, + "grad_norm": 0.5163900852203369, + "learning_rate": 6.163280319196507e-07, + "loss": 0.3642, + "step": 9150 + }, + { + "epoch": 2.569062324536777, + "grad_norm": 0.5513126254081726, + "learning_rate": 6.155426637506856e-07, + "loss": 0.3159, + "step": 9151 + }, + { + "epoch": 2.5693430656934306, + "grad_norm": 0.516830563545227, + "learning_rate": 6.147577634637413e-07, + "loss": 0.3515, + "step": 9152 + }, + { + "epoch": 2.569623806850084, + "grad_norm": 0.4917302429676056, + "learning_rate": 6.139733311425794e-07, + "loss": 0.3459, + "step": 9153 + }, + { + "epoch": 2.569904548006738, + "grad_norm": 0.5159890055656433, + "learning_rate": 6.131893668709082e-07, + "loss": 0.3261, + "step": 9154 + }, + { + "epoch": 2.5701852891633914, + "grad_norm": 0.581808865070343, + "learning_rate": 6.124058707323888e-07, + "loss": 0.2964, + "step": 9155 + }, + { + "epoch": 2.570466030320045, + "grad_norm": 0.5300115346908569, + "learning_rate": 6.116228428106296e-07, + "loss": 0.3052, + "step": 9156 + }, + { + "epoch": 2.5707467714766983, + "grad_norm": 0.5398166179656982, + "learning_rate": 6.108402831891908e-07, + "loss": 0.3167, + "step": 9157 + }, + { + "epoch": 2.571027512633352, + "grad_norm": 0.6200629472732544, + "learning_rate": 6.100581919515835e-07, + "loss": 0.3212, + "step": 9158 + }, + { + "epoch": 2.5713082537900056, + "grad_norm": 0.565707266330719, + "learning_rate": 6.092765691812652e-07, + "loss": 0.3105, + "step": 9159 + }, + { + "epoch": 2.571588994946659, + "grad_norm": 0.49850425124168396, + "learning_rate": 6.08495414961649e-07, + "loss": 0.3017, + "step": 9160 + }, + { + "epoch": 2.571869736103313, + "grad_norm": 0.5485131740570068, + "learning_rate": 6.077147293760915e-07, + "loss": 0.3204, + "step": 9161 + }, + { + "epoch": 2.5721504772599664, + "grad_norm": 0.5504187345504761, + "learning_rate": 6.069345125079057e-07, + "loss": 0.327, + "step": 9162 + }, + { + "epoch": 2.57243121841662, + "grad_norm": 0.6180916428565979, + "learning_rate": 6.061547644403503e-07, + "loss": 0.3257, + "step": 9163 + }, + { + "epoch": 2.5727119595732733, + "grad_norm": 0.586897075176239, + "learning_rate": 6.053754852566335e-07, + "loss": 0.3314, + "step": 9164 + }, + { + "epoch": 2.572992700729927, + "grad_norm": 0.568545401096344, + "learning_rate": 6.045966750399174e-07, + "loss": 0.3122, + "step": 9165 + }, + { + "epoch": 2.5732734418865806, + "grad_norm": 0.5774040818214417, + "learning_rate": 6.038183338733106e-07, + "loss": 0.3405, + "step": 9166 + }, + { + "epoch": 2.573554183043234, + "grad_norm": 0.5538953542709351, + "learning_rate": 6.030404618398733e-07, + "loss": 0.3245, + "step": 9167 + }, + { + "epoch": 2.573834924199888, + "grad_norm": 0.5269328951835632, + "learning_rate": 6.022630590226159e-07, + "loss": 0.2977, + "step": 9168 + }, + { + "epoch": 2.5741156653565413, + "grad_norm": 0.5575495362281799, + "learning_rate": 6.014861255044951e-07, + "loss": 0.3178, + "step": 9169 + }, + { + "epoch": 2.574396406513195, + "grad_norm": 0.5304580926895142, + "learning_rate": 6.007096613684243e-07, + "loss": 0.3362, + "step": 9170 + }, + { + "epoch": 2.5746771476698482, + "grad_norm": 0.5896769762039185, + "learning_rate": 5.9993366669726e-07, + "loss": 0.3436, + "step": 9171 + }, + { + "epoch": 2.574957888826502, + "grad_norm": 0.5595166683197021, + "learning_rate": 5.991581415738129e-07, + "loss": 0.3608, + "step": 9172 + }, + { + "epoch": 2.5752386299831556, + "grad_norm": 0.5666127800941467, + "learning_rate": 5.983830860808415e-07, + "loss": 0.3379, + "step": 9173 + }, + { + "epoch": 2.575519371139809, + "grad_norm": 0.5501779317855835, + "learning_rate": 5.976085003010551e-07, + "loss": 0.2954, + "step": 9174 + }, + { + "epoch": 2.575800112296463, + "grad_norm": 0.604441225528717, + "learning_rate": 5.968343843171143e-07, + "loss": 0.3082, + "step": 9175 + }, + { + "epoch": 2.5760808534531163, + "grad_norm": 0.55019211769104, + "learning_rate": 5.960607382116246e-07, + "loss": 0.3036, + "step": 9176 + }, + { + "epoch": 2.5763615946097698, + "grad_norm": 0.5779778957366943, + "learning_rate": 5.952875620671484e-07, + "loss": 0.3058, + "step": 9177 + }, + { + "epoch": 2.576642335766423, + "grad_norm": 0.5864604115486145, + "learning_rate": 5.945148559661922e-07, + "loss": 0.3242, + "step": 9178 + }, + { + "epoch": 2.5769230769230766, + "grad_norm": 0.5155860185623169, + "learning_rate": 5.937426199912139e-07, + "loss": 0.3247, + "step": 9179 + }, + { + "epoch": 2.5772038180797305, + "grad_norm": 0.558754026889801, + "learning_rate": 5.929708542246232e-07, + "loss": 0.2928, + "step": 9180 + }, + { + "epoch": 2.577484559236384, + "grad_norm": 0.4811382293701172, + "learning_rate": 5.921995587487767e-07, + "loss": 0.313, + "step": 9181 + }, + { + "epoch": 2.577765300393038, + "grad_norm": 0.591773509979248, + "learning_rate": 5.91428733645984e-07, + "loss": 0.3276, + "step": 9182 + }, + { + "epoch": 2.5780460415496913, + "grad_norm": 0.5202547311782837, + "learning_rate": 5.906583789985015e-07, + "loss": 0.2937, + "step": 9183 + }, + { + "epoch": 2.5783267827063447, + "grad_norm": 0.5595214366912842, + "learning_rate": 5.898884948885358e-07, + "loss": 0.3421, + "step": 9184 + }, + { + "epoch": 2.578607523862998, + "grad_norm": 0.5193902254104614, + "learning_rate": 5.891190813982467e-07, + "loss": 0.3356, + "step": 9185 + }, + { + "epoch": 2.5788882650196516, + "grad_norm": 0.5778075456619263, + "learning_rate": 5.883501386097385e-07, + "loss": 0.378, + "step": 9186 + }, + { + "epoch": 2.5791690061763055, + "grad_norm": 0.5058221817016602, + "learning_rate": 5.875816666050699e-07, + "loss": 0.3479, + "step": 9187 + }, + { + "epoch": 2.579449747332959, + "grad_norm": 0.5434385538101196, + "learning_rate": 5.868136654662465e-07, + "loss": 0.362, + "step": 9188 + }, + { + "epoch": 2.579730488489613, + "grad_norm": 0.5488899350166321, + "learning_rate": 5.860461352752256e-07, + "loss": 0.3278, + "step": 9189 + }, + { + "epoch": 2.5800112296462663, + "grad_norm": 0.5754150152206421, + "learning_rate": 5.852790761139126e-07, + "loss": 0.393, + "step": 9190 + }, + { + "epoch": 2.5802919708029197, + "grad_norm": 0.6625264286994934, + "learning_rate": 5.845124880641623e-07, + "loss": 0.3082, + "step": 9191 + }, + { + "epoch": 2.580572711959573, + "grad_norm": 0.5571896433830261, + "learning_rate": 5.837463712077824e-07, + "loss": 0.3572, + "step": 9192 + }, + { + "epoch": 2.5808534531162266, + "grad_norm": 0.5079872012138367, + "learning_rate": 5.829807256265252e-07, + "loss": 0.3753, + "step": 9193 + }, + { + "epoch": 2.5811341942728805, + "grad_norm": 0.5559747815132141, + "learning_rate": 5.822155514020988e-07, + "loss": 0.3315, + "step": 9194 + }, + { + "epoch": 2.581414935429534, + "grad_norm": 0.575465977191925, + "learning_rate": 5.814508486161563e-07, + "loss": 0.2907, + "step": 9195 + }, + { + "epoch": 2.5816956765861874, + "grad_norm": 0.5370486378669739, + "learning_rate": 5.806866173503012e-07, + "loss": 0.3183, + "step": 9196 + }, + { + "epoch": 2.5819764177428413, + "grad_norm": 0.5360647439956665, + "learning_rate": 5.799228576860893e-07, + "loss": 0.3152, + "step": 9197 + }, + { + "epoch": 2.5822571588994947, + "grad_norm": 0.6080288290977478, + "learning_rate": 5.791595697050217e-07, + "loss": 0.347, + "step": 9198 + }, + { + "epoch": 2.582537900056148, + "grad_norm": 0.5147066116333008, + "learning_rate": 5.783967534885549e-07, + "loss": 0.2961, + "step": 9199 + }, + { + "epoch": 2.5828186412128016, + "grad_norm": 0.539100706577301, + "learning_rate": 5.776344091180908e-07, + "loss": 0.3404, + "step": 9200 + }, + { + "epoch": 2.5830993823694555, + "grad_norm": 0.491840660572052, + "learning_rate": 5.768725366749806e-07, + "loss": 0.316, + "step": 9201 + }, + { + "epoch": 2.583380123526109, + "grad_norm": 0.5694340467453003, + "learning_rate": 5.761111362405286e-07, + "loss": 0.3055, + "step": 9202 + }, + { + "epoch": 2.5836608646827623, + "grad_norm": 0.5699926018714905, + "learning_rate": 5.753502078959849e-07, + "loss": 0.2624, + "step": 9203 + }, + { + "epoch": 2.5839416058394162, + "grad_norm": 0.5050646662712097, + "learning_rate": 5.745897517225529e-07, + "loss": 0.3213, + "step": 9204 + }, + { + "epoch": 2.5842223469960697, + "grad_norm": 0.5447824001312256, + "learning_rate": 5.738297678013826e-07, + "loss": 0.3377, + "step": 9205 + }, + { + "epoch": 2.584503088152723, + "grad_norm": 0.5743123888969421, + "learning_rate": 5.730702562135742e-07, + "loss": 0.3165, + "step": 9206 + }, + { + "epoch": 2.5847838293093766, + "grad_norm": 0.5601766109466553, + "learning_rate": 5.723112170401796e-07, + "loss": 0.3119, + "step": 9207 + }, + { + "epoch": 2.5850645704660304, + "grad_norm": 0.5751959681510925, + "learning_rate": 5.71552650362197e-07, + "loss": 0.3503, + "step": 9208 + }, + { + "epoch": 2.585345311622684, + "grad_norm": 0.5345215797424316, + "learning_rate": 5.707945562605777e-07, + "loss": 0.2974, + "step": 9209 + }, + { + "epoch": 2.5856260527793373, + "grad_norm": 0.5181090235710144, + "learning_rate": 5.700369348162194e-07, + "loss": 0.3499, + "step": 9210 + }, + { + "epoch": 2.585906793935991, + "grad_norm": 0.5359534025192261, + "learning_rate": 5.692797861099719e-07, + "loss": 0.3076, + "step": 9211 + }, + { + "epoch": 2.5861875350926447, + "grad_norm": 0.5513017177581787, + "learning_rate": 5.68523110222633e-07, + "loss": 0.3664, + "step": 9212 + }, + { + "epoch": 2.586468276249298, + "grad_norm": 0.5880671739578247, + "learning_rate": 5.67766907234949e-07, + "loss": 0.316, + "step": 9213 + }, + { + "epoch": 2.5867490174059515, + "grad_norm": 0.5786539912223816, + "learning_rate": 5.670111772276194e-07, + "loss": 0.2962, + "step": 9214 + }, + { + "epoch": 2.5870297585626054, + "grad_norm": 0.5368750691413879, + "learning_rate": 5.662559202812895e-07, + "loss": 0.2904, + "step": 9215 + }, + { + "epoch": 2.587310499719259, + "grad_norm": 0.5923262238502502, + "learning_rate": 5.655011364765566e-07, + "loss": 0.3456, + "step": 9216 + }, + { + "epoch": 2.5875912408759123, + "grad_norm": 0.6336987614631653, + "learning_rate": 5.647468258939664e-07, + "loss": 0.3157, + "step": 9217 + }, + { + "epoch": 2.587871982032566, + "grad_norm": 0.5571448802947998, + "learning_rate": 5.639929886140127e-07, + "loss": 0.3128, + "step": 9218 + }, + { + "epoch": 2.5881527231892196, + "grad_norm": 0.4952406585216522, + "learning_rate": 5.632396247171429e-07, + "loss": 0.3326, + "step": 9219 + }, + { + "epoch": 2.588433464345873, + "grad_norm": 0.5353228449821472, + "learning_rate": 5.624867342837487e-07, + "loss": 0.3089, + "step": 9220 + }, + { + "epoch": 2.5887142055025265, + "grad_norm": 0.5043750405311584, + "learning_rate": 5.617343173941763e-07, + "loss": 0.3354, + "step": 9221 + }, + { + "epoch": 2.5889949466591804, + "grad_norm": 0.546557605266571, + "learning_rate": 5.609823741287168e-07, + "loss": 0.2781, + "step": 9222 + }, + { + "epoch": 2.589275687815834, + "grad_norm": 0.593588650226593, + "learning_rate": 5.602309045676146e-07, + "loss": 0.3152, + "step": 9223 + }, + { + "epoch": 2.5895564289724873, + "grad_norm": 0.6697573661804199, + "learning_rate": 5.594799087910608e-07, + "loss": 0.3085, + "step": 9224 + }, + { + "epoch": 2.589837170129141, + "grad_norm": 0.5571876764297485, + "learning_rate": 5.587293868791965e-07, + "loss": 0.3765, + "step": 9225 + }, + { + "epoch": 2.5901179112857946, + "grad_norm": 0.47442951798439026, + "learning_rate": 5.579793389121152e-07, + "loss": 0.3371, + "step": 9226 + }, + { + "epoch": 2.590398652442448, + "grad_norm": 0.5200581550598145, + "learning_rate": 5.572297649698555e-07, + "loss": 0.3205, + "step": 9227 + }, + { + "epoch": 2.5906793935991015, + "grad_norm": 0.5396103262901306, + "learning_rate": 5.564806651324068e-07, + "loss": 0.3413, + "step": 9228 + }, + { + "epoch": 2.590960134755755, + "grad_norm": 0.6101888418197632, + "learning_rate": 5.5573203947971e-07, + "loss": 0.3241, + "step": 9229 + }, + { + "epoch": 2.591240875912409, + "grad_norm": 0.561924159526825, + "learning_rate": 5.549838880916514e-07, + "loss": 0.297, + "step": 9230 + }, + { + "epoch": 2.5915216170690623, + "grad_norm": 0.5239946246147156, + "learning_rate": 5.54236211048072e-07, + "loss": 0.3656, + "step": 9231 + }, + { + "epoch": 2.591802358225716, + "grad_norm": 0.5240875482559204, + "learning_rate": 5.534890084287575e-07, + "loss": 0.3076, + "step": 9232 + }, + { + "epoch": 2.5920830993823696, + "grad_norm": 0.5559367537498474, + "learning_rate": 5.527422803134441e-07, + "loss": 0.3052, + "step": 9233 + }, + { + "epoch": 2.592363840539023, + "grad_norm": 0.5739917159080505, + "learning_rate": 5.519960267818203e-07, + "loss": 0.3132, + "step": 9234 + }, + { + "epoch": 2.5926445816956765, + "grad_norm": 0.626494824886322, + "learning_rate": 5.512502479135184e-07, + "loss": 0.2805, + "step": 9235 + }, + { + "epoch": 2.59292532285233, + "grad_norm": 0.5362425446510315, + "learning_rate": 5.505049437881266e-07, + "loss": 0.3514, + "step": 9236 + }, + { + "epoch": 2.593206064008984, + "grad_norm": 0.5248806476593018, + "learning_rate": 5.497601144851766e-07, + "loss": 0.2949, + "step": 9237 + }, + { + "epoch": 2.5934868051656372, + "grad_norm": 0.5375782251358032, + "learning_rate": 5.490157600841539e-07, + "loss": 0.3784, + "step": 9238 + }, + { + "epoch": 2.593767546322291, + "grad_norm": 0.5520331859588623, + "learning_rate": 5.482718806644904e-07, + "loss": 0.3275, + "step": 9239 + }, + { + "epoch": 2.5940482874789446, + "grad_norm": 0.5443271398544312, + "learning_rate": 5.475284763055677e-07, + "loss": 0.3245, + "step": 9240 + }, + { + "epoch": 2.594329028635598, + "grad_norm": 0.583730161190033, + "learning_rate": 5.467855470867184e-07, + "loss": 0.3458, + "step": 9241 + }, + { + "epoch": 2.5946097697922514, + "grad_norm": 0.5758969783782959, + "learning_rate": 5.460430930872224e-07, + "loss": 0.3141, + "step": 9242 + }, + { + "epoch": 2.594890510948905, + "grad_norm": 0.5554253458976746, + "learning_rate": 5.453011143863108e-07, + "loss": 0.2876, + "step": 9243 + }, + { + "epoch": 2.5951712521055588, + "grad_norm": 0.582573413848877, + "learning_rate": 5.445596110631618e-07, + "loss": 0.3385, + "step": 9244 + }, + { + "epoch": 2.595451993262212, + "grad_norm": 0.5768382549285889, + "learning_rate": 5.438185831969045e-07, + "loss": 0.3346, + "step": 9245 + }, + { + "epoch": 2.5957327344188657, + "grad_norm": 0.5702758431434631, + "learning_rate": 5.430780308666173e-07, + "loss": 0.3251, + "step": 9246 + }, + { + "epoch": 2.5960134755755195, + "grad_norm": 0.5713242292404175, + "learning_rate": 5.423379541513257e-07, + "loss": 0.2683, + "step": 9247 + }, + { + "epoch": 2.596294216732173, + "grad_norm": 0.5656282901763916, + "learning_rate": 5.41598353130009e-07, + "loss": 0.3176, + "step": 9248 + }, + { + "epoch": 2.5965749578888264, + "grad_norm": 0.5462035536766052, + "learning_rate": 5.408592278815894e-07, + "loss": 0.2929, + "step": 9249 + }, + { + "epoch": 2.59685569904548, + "grad_norm": 0.6136484146118164, + "learning_rate": 5.401205784849433e-07, + "loss": 0.3025, + "step": 9250 + }, + { + "epoch": 2.5971364402021337, + "grad_norm": 0.6153603792190552, + "learning_rate": 5.393824050188968e-07, + "loss": 0.3261, + "step": 9251 + }, + { + "epoch": 2.597417181358787, + "grad_norm": 0.6858318448066711, + "learning_rate": 5.386447075622198e-07, + "loss": 0.3102, + "step": 9252 + }, + { + "epoch": 2.5976979225154406, + "grad_norm": 0.5621753931045532, + "learning_rate": 5.379074861936367e-07, + "loss": 0.3255, + "step": 9253 + }, + { + "epoch": 2.5979786636720945, + "grad_norm": 0.5572348833084106, + "learning_rate": 5.371707409918198e-07, + "loss": 0.3697, + "step": 9254 + }, + { + "epoch": 2.598259404828748, + "grad_norm": 0.5891318321228027, + "learning_rate": 5.36434472035387e-07, + "loss": 0.2778, + "step": 9255 + }, + { + "epoch": 2.5985401459854014, + "grad_norm": 0.5778660774230957, + "learning_rate": 5.356986794029117e-07, + "loss": 0.3383, + "step": 9256 + }, + { + "epoch": 2.598820887142055, + "grad_norm": 0.5736680030822754, + "learning_rate": 5.349633631729106e-07, + "loss": 0.3693, + "step": 9257 + }, + { + "epoch": 2.5991016282987087, + "grad_norm": 0.6269557476043701, + "learning_rate": 5.342285234238543e-07, + "loss": 0.3204, + "step": 9258 + }, + { + "epoch": 2.599382369455362, + "grad_norm": 0.5626348257064819, + "learning_rate": 5.334941602341581e-07, + "loss": 0.3115, + "step": 9259 + }, + { + "epoch": 2.5996631106120156, + "grad_norm": 0.5542964339256287, + "learning_rate": 5.327602736821907e-07, + "loss": 0.3284, + "step": 9260 + }, + { + "epoch": 2.5999438517686695, + "grad_norm": 0.5452356338500977, + "learning_rate": 5.320268638462667e-07, + "loss": 0.3377, + "step": 9261 + }, + { + "epoch": 2.600224592925323, + "grad_norm": 0.5376703143119812, + "learning_rate": 5.312939308046505e-07, + "loss": 0.3275, + "step": 9262 + }, + { + "epoch": 2.6005053340819764, + "grad_norm": 0.5631587505340576, + "learning_rate": 5.305614746355581e-07, + "loss": 0.3034, + "step": 9263 + }, + { + "epoch": 2.60078607523863, + "grad_norm": 0.5472807884216309, + "learning_rate": 5.298294954171506e-07, + "loss": 0.3011, + "step": 9264 + }, + { + "epoch": 2.6010668163952837, + "grad_norm": 0.5509679317474365, + "learning_rate": 5.290979932275419e-07, + "loss": 0.3329, + "step": 9265 + }, + { + "epoch": 2.601347557551937, + "grad_norm": 0.585071325302124, + "learning_rate": 5.283669681447928e-07, + "loss": 0.3164, + "step": 9266 + }, + { + "epoch": 2.6016282987085906, + "grad_norm": 0.540922999382019, + "learning_rate": 5.276364202469131e-07, + "loss": 0.3554, + "step": 9267 + }, + { + "epoch": 2.6019090398652445, + "grad_norm": 0.557780385017395, + "learning_rate": 5.269063496118632e-07, + "loss": 0.3168, + "step": 9268 + }, + { + "epoch": 2.602189781021898, + "grad_norm": 0.522637128829956, + "learning_rate": 5.261767563175501e-07, + "loss": 0.3183, + "step": 9269 + }, + { + "epoch": 2.6024705221785513, + "grad_norm": 0.5406872630119324, + "learning_rate": 5.254476404418341e-07, + "loss": 0.3343, + "step": 9270 + }, + { + "epoch": 2.602751263335205, + "grad_norm": 0.5162304043769836, + "learning_rate": 5.247190020625197e-07, + "loss": 0.352, + "step": 9271 + }, + { + "epoch": 2.6030320044918582, + "grad_norm": 0.5424394011497498, + "learning_rate": 5.239908412573641e-07, + "loss": 0.3061, + "step": 9272 + }, + { + "epoch": 2.603312745648512, + "grad_norm": 0.5532450675964355, + "learning_rate": 5.23263158104072e-07, + "loss": 0.3388, + "step": 9273 + }, + { + "epoch": 2.6035934868051656, + "grad_norm": 0.5204339623451233, + "learning_rate": 5.225359526802942e-07, + "loss": 0.3182, + "step": 9274 + }, + { + "epoch": 2.6038742279618194, + "grad_norm": 0.5085756182670593, + "learning_rate": 5.21809225063638e-07, + "loss": 0.3007, + "step": 9275 + }, + { + "epoch": 2.604154969118473, + "grad_norm": 0.5846678614616394, + "learning_rate": 5.210829753316532e-07, + "loss": 0.3605, + "step": 9276 + }, + { + "epoch": 2.6044357102751263, + "grad_norm": 0.5946673154830933, + "learning_rate": 5.203572035618398e-07, + "loss": 0.3474, + "step": 9277 + }, + { + "epoch": 2.6047164514317798, + "grad_norm": 0.5541054010391235, + "learning_rate": 5.196319098316499e-07, + "loss": 0.3412, + "step": 9278 + }, + { + "epoch": 2.604997192588433, + "grad_norm": 0.5580927729606628, + "learning_rate": 5.189070942184799e-07, + "loss": 0.3187, + "step": 9279 + }, + { + "epoch": 2.605277933745087, + "grad_norm": 0.535959780216217, + "learning_rate": 5.181827567996795e-07, + "loss": 0.2686, + "step": 9280 + }, + { + "epoch": 2.6055586749017405, + "grad_norm": 0.5632882714271545, + "learning_rate": 5.174588976525441e-07, + "loss": 0.3003, + "step": 9281 + }, + { + "epoch": 2.6058394160583944, + "grad_norm": 0.5435212850570679, + "learning_rate": 5.167355168543203e-07, + "loss": 0.2932, + "step": 9282 + }, + { + "epoch": 2.606120157215048, + "grad_norm": 0.7202975153923035, + "learning_rate": 5.160126144822031e-07, + "loss": 0.3529, + "step": 9283 + }, + { + "epoch": 2.6064008983717013, + "grad_norm": 0.6050068736076355, + "learning_rate": 5.15290190613335e-07, + "loss": 0.3536, + "step": 9284 + }, + { + "epoch": 2.6066816395283547, + "grad_norm": 0.5328752994537354, + "learning_rate": 5.145682453248096e-07, + "loss": 0.3385, + "step": 9285 + }, + { + "epoch": 2.606962380685008, + "grad_norm": 0.6006103157997131, + "learning_rate": 5.138467786936669e-07, + "loss": 0.3407, + "step": 9286 + }, + { + "epoch": 2.607243121841662, + "grad_norm": 0.5150094628334045, + "learning_rate": 5.131257907969001e-07, + "loss": 0.3551, + "step": 9287 + }, + { + "epoch": 2.6075238629983155, + "grad_norm": 0.5361217260360718, + "learning_rate": 5.124052817114461e-07, + "loss": 0.3093, + "step": 9288 + }, + { + "epoch": 2.607804604154969, + "grad_norm": 0.5722095966339111, + "learning_rate": 5.116852515141934e-07, + "loss": 0.3301, + "step": 9289 + }, + { + "epoch": 2.608085345311623, + "grad_norm": 0.6124836802482605, + "learning_rate": 5.109657002819807e-07, + "loss": 0.3366, + "step": 9290 + }, + { + "epoch": 2.6083660864682763, + "grad_norm": 0.5385775566101074, + "learning_rate": 5.102466280915918e-07, + "loss": 0.3525, + "step": 9291 + }, + { + "epoch": 2.6086468276249297, + "grad_norm": 0.5474715232849121, + "learning_rate": 5.09528035019764e-07, + "loss": 0.3547, + "step": 9292 + }, + { + "epoch": 2.608927568781583, + "grad_norm": 0.5858557820320129, + "learning_rate": 5.088099211431785e-07, + "loss": 0.2946, + "step": 9293 + }, + { + "epoch": 2.609208309938237, + "grad_norm": 0.5125181078910828, + "learning_rate": 5.080922865384707e-07, + "loss": 0.3123, + "step": 9294 + }, + { + "epoch": 2.6094890510948905, + "grad_norm": 0.5887751579284668, + "learning_rate": 5.073751312822206e-07, + "loss": 0.3226, + "step": 9295 + }, + { + "epoch": 2.609769792251544, + "grad_norm": 0.5322958827018738, + "learning_rate": 5.066584554509579e-07, + "loss": 0.3807, + "step": 9296 + }, + { + "epoch": 2.610050533408198, + "grad_norm": 0.5683243274688721, + "learning_rate": 5.059422591211633e-07, + "loss": 0.363, + "step": 9297 + }, + { + "epoch": 2.6103312745648513, + "grad_norm": 0.5509128570556641, + "learning_rate": 5.052265423692631e-07, + "loss": 0.3405, + "step": 9298 + }, + { + "epoch": 2.6106120157215047, + "grad_norm": 0.5450987815856934, + "learning_rate": 5.045113052716366e-07, + "loss": 0.299, + "step": 9299 + }, + { + "epoch": 2.610892756878158, + "grad_norm": 0.4994717240333557, + "learning_rate": 5.037965479046064e-07, + "loss": 0.3371, + "step": 9300 + }, + { + "epoch": 2.611173498034812, + "grad_norm": 0.7137396335601807, + "learning_rate": 5.030822703444494e-07, + "loss": 0.3466, + "step": 9301 + }, + { + "epoch": 2.6114542391914655, + "grad_norm": 0.4880484640598297, + "learning_rate": 5.023684726673883e-07, + "loss": 0.3518, + "step": 9302 + }, + { + "epoch": 2.611734980348119, + "grad_norm": 0.5722585916519165, + "learning_rate": 5.016551549495951e-07, + "loss": 0.2926, + "step": 9303 + }, + { + "epoch": 2.612015721504773, + "grad_norm": 0.6080985069274902, + "learning_rate": 5.009423172671896e-07, + "loss": 0.3066, + "step": 9304 + }, + { + "epoch": 2.6122964626614262, + "grad_norm": 0.6243522763252258, + "learning_rate": 5.002299596962429e-07, + "loss": 0.359, + "step": 9305 + }, + { + "epoch": 2.6125772038180797, + "grad_norm": 0.5547509789466858, + "learning_rate": 4.995180823127716e-07, + "loss": 0.3057, + "step": 9306 + }, + { + "epoch": 2.612857944974733, + "grad_norm": 0.5454524755477905, + "learning_rate": 4.988066851927448e-07, + "loss": 0.308, + "step": 9307 + }, + { + "epoch": 2.613138686131387, + "grad_norm": 0.4972596764564514, + "learning_rate": 4.980957684120768e-07, + "loss": 0.3304, + "step": 9308 + }, + { + "epoch": 2.6134194272880404, + "grad_norm": 0.5153542160987854, + "learning_rate": 4.973853320466338e-07, + "loss": 0.2921, + "step": 9309 + }, + { + "epoch": 2.613700168444694, + "grad_norm": 0.5744361877441406, + "learning_rate": 4.966753761722287e-07, + "loss": 0.291, + "step": 9310 + }, + { + "epoch": 2.6139809096013478, + "grad_norm": 0.5168066024780273, + "learning_rate": 4.959659008646217e-07, + "loss": 0.3278, + "step": 9311 + }, + { + "epoch": 2.614261650758001, + "grad_norm": 0.5387728810310364, + "learning_rate": 4.95256906199526e-07, + "loss": 0.3342, + "step": 9312 + }, + { + "epoch": 2.6145423919146547, + "grad_norm": 0.6054450869560242, + "learning_rate": 4.94548392252599e-07, + "loss": 0.3109, + "step": 9313 + }, + { + "epoch": 2.614823133071308, + "grad_norm": 0.5160964727401733, + "learning_rate": 4.938403590994512e-07, + "loss": 0.3252, + "step": 9314 + }, + { + "epoch": 2.615103874227962, + "grad_norm": 0.6011884212493896, + "learning_rate": 4.931328068156383e-07, + "loss": 0.3263, + "step": 9315 + }, + { + "epoch": 2.6153846153846154, + "grad_norm": 0.5339115858078003, + "learning_rate": 4.924257354766648e-07, + "loss": 0.322, + "step": 9316 + }, + { + "epoch": 2.615665356541269, + "grad_norm": 0.5518389344215393, + "learning_rate": 4.917191451579873e-07, + "loss": 0.3496, + "step": 9317 + }, + { + "epoch": 2.6159460976979227, + "grad_norm": 0.479958176612854, + "learning_rate": 4.910130359350063e-07, + "loss": 0.3357, + "step": 9318 + }, + { + "epoch": 2.616226838854576, + "grad_norm": 0.5352818965911865, + "learning_rate": 4.903074078830755e-07, + "loss": 0.3491, + "step": 9319 + }, + { + "epoch": 2.6165075800112296, + "grad_norm": 0.5416983366012573, + "learning_rate": 4.896022610774937e-07, + "loss": 0.3617, + "step": 9320 + }, + { + "epoch": 2.616788321167883, + "grad_norm": 0.5425775647163391, + "learning_rate": 4.888975955935104e-07, + "loss": 0.3296, + "step": 9321 + }, + { + "epoch": 2.6170690623245365, + "grad_norm": 0.5622072219848633, + "learning_rate": 4.881934115063237e-07, + "loss": 0.3529, + "step": 9322 + }, + { + "epoch": 2.6173498034811904, + "grad_norm": 0.5656238198280334, + "learning_rate": 4.87489708891078e-07, + "loss": 0.3612, + "step": 9323 + }, + { + "epoch": 2.617630544637844, + "grad_norm": 0.600321888923645, + "learning_rate": 4.867864878228701e-07, + "loss": 0.3256, + "step": 9324 + }, + { + "epoch": 2.6179112857944977, + "grad_norm": 0.5435383319854736, + "learning_rate": 4.860837483767411e-07, + "loss": 0.286, + "step": 9325 + }, + { + "epoch": 2.618192026951151, + "grad_norm": 0.561693012714386, + "learning_rate": 4.853814906276844e-07, + "loss": 0.358, + "step": 9326 + }, + { + "epoch": 2.6184727681078046, + "grad_norm": 0.4986342787742615, + "learning_rate": 4.846797146506416e-07, + "loss": 0.3737, + "step": 9327 + }, + { + "epoch": 2.618753509264458, + "grad_norm": 0.5988184809684753, + "learning_rate": 4.839784205204995e-07, + "loss": 0.3074, + "step": 9328 + }, + { + "epoch": 2.6190342504211115, + "grad_norm": 0.5872687697410583, + "learning_rate": 4.832776083120983e-07, + "loss": 0.2894, + "step": 9329 + }, + { + "epoch": 2.6193149915777654, + "grad_norm": 0.601529061794281, + "learning_rate": 4.825772781002219e-07, + "loss": 0.3043, + "step": 9330 + }, + { + "epoch": 2.619595732734419, + "grad_norm": 0.5467483401298523, + "learning_rate": 4.818774299596079e-07, + "loss": 0.3301, + "step": 9331 + }, + { + "epoch": 2.6198764738910727, + "grad_norm": 0.5491088032722473, + "learning_rate": 4.811780639649377e-07, + "loss": 0.3151, + "step": 9332 + }, + { + "epoch": 2.620157215047726, + "grad_norm": 0.5768588185310364, + "learning_rate": 4.804791801908432e-07, + "loss": 0.2798, + "step": 9333 + }, + { + "epoch": 2.6204379562043796, + "grad_norm": 0.5211851596832275, + "learning_rate": 4.797807787119058e-07, + "loss": 0.2931, + "step": 9334 + }, + { + "epoch": 2.620718697361033, + "grad_norm": 0.5591154098510742, + "learning_rate": 4.790828596026542e-07, + "loss": 0.2922, + "step": 9335 + }, + { + "epoch": 2.6209994385176865, + "grad_norm": 0.5015286803245544, + "learning_rate": 4.783854229375667e-07, + "loss": 0.3445, + "step": 9336 + }, + { + "epoch": 2.6212801796743403, + "grad_norm": 0.5570569634437561, + "learning_rate": 4.776884687910688e-07, + "loss": 0.3092, + "step": 9337 + }, + { + "epoch": 2.621560920830994, + "grad_norm": 0.5584204196929932, + "learning_rate": 4.769919972375337e-07, + "loss": 0.3034, + "step": 9338 + }, + { + "epoch": 2.6218416619876472, + "grad_norm": 0.543012797832489, + "learning_rate": 4.762960083512874e-07, + "loss": 0.3559, + "step": 9339 + }, + { + "epoch": 2.622122403144301, + "grad_norm": 0.540846049785614, + "learning_rate": 4.7560050220659903e-07, + "loss": 0.3432, + "step": 9340 + }, + { + "epoch": 2.6224031443009546, + "grad_norm": 0.6288012862205505, + "learning_rate": 4.749054788776908e-07, + "loss": 0.3164, + "step": 9341 + }, + { + "epoch": 2.622683885457608, + "grad_norm": 0.5445126891136169, + "learning_rate": 4.742109384387289e-07, + "loss": 0.3356, + "step": 9342 + }, + { + "epoch": 2.6229646266142614, + "grad_norm": 0.6313918828964233, + "learning_rate": 4.735168809638324e-07, + "loss": 0.3443, + "step": 9343 + }, + { + "epoch": 2.6232453677709153, + "grad_norm": 0.5379695892333984, + "learning_rate": 4.728233065270665e-07, + "loss": 0.3011, + "step": 9344 + }, + { + "epoch": 2.6235261089275688, + "grad_norm": 0.5338721871376038, + "learning_rate": 4.7213021520244375e-07, + "loss": 0.3584, + "step": 9345 + }, + { + "epoch": 2.623806850084222, + "grad_norm": 0.5549765825271606, + "learning_rate": 4.7143760706392816e-07, + "loss": 0.3672, + "step": 9346 + }, + { + "epoch": 2.624087591240876, + "grad_norm": 0.5060943961143494, + "learning_rate": 4.707454821854285e-07, + "loss": 0.2816, + "step": 9347 + }, + { + "epoch": 2.6243683323975295, + "grad_norm": 0.5631430149078369, + "learning_rate": 4.700538406408073e-07, + "loss": 0.3635, + "step": 9348 + }, + { + "epoch": 2.624649073554183, + "grad_norm": 0.5169394016265869, + "learning_rate": 4.693626825038694e-07, + "loss": 0.333, + "step": 9349 + }, + { + "epoch": 2.6249298147108364, + "grad_norm": 0.5193637609481812, + "learning_rate": 4.686720078483714e-07, + "loss": 0.3496, + "step": 9350 + }, + { + "epoch": 2.6252105558674903, + "grad_norm": 0.556094229221344, + "learning_rate": 4.6798181674801825e-07, + "loss": 0.333, + "step": 9351 + }, + { + "epoch": 2.6254912970241437, + "grad_norm": 0.5475212931632996, + "learning_rate": 4.6729210927646374e-07, + "loss": 0.3557, + "step": 9352 + }, + { + "epoch": 2.625772038180797, + "grad_norm": 0.5579310059547424, + "learning_rate": 4.6660288550730735e-07, + "loss": 0.3264, + "step": 9353 + }, + { + "epoch": 2.626052779337451, + "grad_norm": 0.5508068203926086, + "learning_rate": 4.6591414551410085e-07, + "loss": 0.3392, + "step": 9354 + }, + { + "epoch": 2.6263335204941045, + "grad_norm": 0.5637610554695129, + "learning_rate": 4.6522588937033986e-07, + "loss": 0.3241, + "step": 9355 + }, + { + "epoch": 2.626614261650758, + "grad_norm": 0.5632300972938538, + "learning_rate": 4.6453811714947283e-07, + "loss": 0.3393, + "step": 9356 + }, + { + "epoch": 2.6268950028074114, + "grad_norm": 0.525488018989563, + "learning_rate": 4.6385082892489333e-07, + "loss": 0.3684, + "step": 9357 + }, + { + "epoch": 2.6271757439640653, + "grad_norm": 0.5839850306510925, + "learning_rate": 4.631640247699459e-07, + "loss": 0.323, + "step": 9358 + }, + { + "epoch": 2.6274564851207187, + "grad_norm": 0.5301842093467712, + "learning_rate": 4.624777047579204e-07, + "loss": 0.303, + "step": 9359 + }, + { + "epoch": 2.627737226277372, + "grad_norm": 0.5381922721862793, + "learning_rate": 4.6179186896205654e-07, + "loss": 0.3423, + "step": 9360 + }, + { + "epoch": 2.628017967434026, + "grad_norm": 0.5731533765792847, + "learning_rate": 4.611065174555446e-07, + "loss": 0.3234, + "step": 9361 + }, + { + "epoch": 2.6282987085906795, + "grad_norm": 0.5349149107933044, + "learning_rate": 4.6042165031151833e-07, + "loss": 0.3192, + "step": 9362 + }, + { + "epoch": 2.628579449747333, + "grad_norm": 0.5049703121185303, + "learning_rate": 4.5973726760306427e-07, + "loss": 0.3534, + "step": 9363 + }, + { + "epoch": 2.6288601909039864, + "grad_norm": 0.5449021458625793, + "learning_rate": 4.5905336940321565e-07, + "loss": 0.3401, + "step": 9364 + }, + { + "epoch": 2.62914093206064, + "grad_norm": 0.5794591903686523, + "learning_rate": 4.5836995578495193e-07, + "loss": 0.3083, + "step": 9365 + }, + { + "epoch": 2.6294216732172937, + "grad_norm": 0.5394207835197449, + "learning_rate": 4.576870268212047e-07, + "loss": 0.3048, + "step": 9366 + }, + { + "epoch": 2.629702414373947, + "grad_norm": 0.542289674282074, + "learning_rate": 4.570045825848507e-07, + "loss": 0.309, + "step": 9367 + }, + { + "epoch": 2.629983155530601, + "grad_norm": 0.5799707770347595, + "learning_rate": 4.563226231487172e-07, + "loss": 0.3015, + "step": 9368 + }, + { + "epoch": 2.6302638966872545, + "grad_norm": 0.5405160188674927, + "learning_rate": 4.5564114858557776e-07, + "loss": 0.3421, + "step": 9369 + }, + { + "epoch": 2.630544637843908, + "grad_norm": 0.6025461554527283, + "learning_rate": 4.5496015896815573e-07, + "loss": 0.3223, + "step": 9370 + }, + { + "epoch": 2.6308253790005613, + "grad_norm": 0.5442836284637451, + "learning_rate": 4.542796543691219e-07, + "loss": 0.316, + "step": 9371 + }, + { + "epoch": 2.631106120157215, + "grad_norm": 0.5988059639930725, + "learning_rate": 4.5359963486109495e-07, + "loss": 0.2992, + "step": 9372 + }, + { + "epoch": 2.6313868613138687, + "grad_norm": 0.5295849442481995, + "learning_rate": 4.529201005166428e-07, + "loss": 0.3463, + "step": 9373 + }, + { + "epoch": 2.631667602470522, + "grad_norm": 0.5595494508743286, + "learning_rate": 4.5224105140828087e-07, + "loss": 0.3246, + "step": 9374 + }, + { + "epoch": 2.631948343627176, + "grad_norm": 0.5467063188552856, + "learning_rate": 4.51562487608474e-07, + "loss": 0.3578, + "step": 9375 + }, + { + "epoch": 2.6322290847838294, + "grad_norm": 0.5460591912269592, + "learning_rate": 4.508844091896325e-07, + "loss": 0.303, + "step": 9376 + }, + { + "epoch": 2.632509825940483, + "grad_norm": 0.6249129772186279, + "learning_rate": 4.50206816224118e-07, + "loss": 0.3391, + "step": 9377 + }, + { + "epoch": 2.6327905670971363, + "grad_norm": 0.5546107292175293, + "learning_rate": 4.4952970878423983e-07, + "loss": 0.3268, + "step": 9378 + }, + { + "epoch": 2.6330713082537898, + "grad_norm": 0.5169398784637451, + "learning_rate": 4.488530869422525e-07, + "loss": 0.3755, + "step": 9379 + }, + { + "epoch": 2.6333520494104437, + "grad_norm": 0.6199008822441101, + "learning_rate": 4.4817695077036316e-07, + "loss": 0.3415, + "step": 9380 + }, + { + "epoch": 2.633632790567097, + "grad_norm": 0.5584743618965149, + "learning_rate": 4.4750130034072356e-07, + "loss": 0.3203, + "step": 9381 + }, + { + "epoch": 2.633913531723751, + "grad_norm": 0.546150267124176, + "learning_rate": 4.468261357254339e-07, + "loss": 0.3239, + "step": 9382 + }, + { + "epoch": 2.6341942728804044, + "grad_norm": 0.4746125638484955, + "learning_rate": 4.4615145699654585e-07, + "loss": 0.3121, + "step": 9383 + }, + { + "epoch": 2.634475014037058, + "grad_norm": 0.5220149755477905, + "learning_rate": 4.454772642260552e-07, + "loss": 0.3466, + "step": 9384 + }, + { + "epoch": 2.6347557551937113, + "grad_norm": 0.5786832571029663, + "learning_rate": 4.4480355748590834e-07, + "loss": 0.2597, + "step": 9385 + }, + { + "epoch": 2.6350364963503647, + "grad_norm": 0.5669474601745605, + "learning_rate": 4.4413033684799935e-07, + "loss": 0.3449, + "step": 9386 + }, + { + "epoch": 2.6353172375070186, + "grad_norm": 0.622798502445221, + "learning_rate": 4.434576023841691e-07, + "loss": 0.3406, + "step": 9387 + }, + { + "epoch": 2.635597978663672, + "grad_norm": 0.5079540610313416, + "learning_rate": 4.4278535416620914e-07, + "loss": 0.296, + "step": 9388 + }, + { + "epoch": 2.6358787198203255, + "grad_norm": 0.5355873703956604, + "learning_rate": 4.4211359226585536e-07, + "loss": 0.3236, + "step": 9389 + }, + { + "epoch": 2.6361594609769794, + "grad_norm": 0.5529024600982666, + "learning_rate": 4.4144231675479656e-07, + "loss": 0.3035, + "step": 9390 + }, + { + "epoch": 2.636440202133633, + "grad_norm": 0.5050484538078308, + "learning_rate": 4.407715277046648e-07, + "loss": 0.3403, + "step": 9391 + }, + { + "epoch": 2.6367209432902863, + "grad_norm": 0.49440085887908936, + "learning_rate": 4.401012251870451e-07, + "loss": 0.3166, + "step": 9392 + }, + { + "epoch": 2.6370016844469397, + "grad_norm": 0.5921834111213684, + "learning_rate": 4.3943140927346584e-07, + "loss": 0.3188, + "step": 9393 + }, + { + "epoch": 2.6372824256035936, + "grad_norm": 0.5881319046020508, + "learning_rate": 4.387620800354059e-07, + "loss": 0.3478, + "step": 9394 + }, + { + "epoch": 2.637563166760247, + "grad_norm": 0.6179415583610535, + "learning_rate": 4.380932375442931e-07, + "loss": 0.2947, + "step": 9395 + }, + { + "epoch": 2.6378439079169005, + "grad_norm": 0.5373092889785767, + "learning_rate": 4.374248818714999e-07, + "loss": 0.3413, + "step": 9396 + }, + { + "epoch": 2.6381246490735544, + "grad_norm": 0.5521765351295471, + "learning_rate": 4.3675701308835196e-07, + "loss": 0.344, + "step": 9397 + }, + { + "epoch": 2.638405390230208, + "grad_norm": 0.5564184784889221, + "learning_rate": 4.360896312661189e-07, + "loss": 0.3405, + "step": 9398 + }, + { + "epoch": 2.6386861313868613, + "grad_norm": 0.5610907077789307, + "learning_rate": 4.3542273647601774e-07, + "loss": 0.3451, + "step": 9399 + }, + { + "epoch": 2.6389668725435147, + "grad_norm": 0.593033492565155, + "learning_rate": 4.3475632878921816e-07, + "loss": 0.3315, + "step": 9400 + }, + { + "epoch": 2.6392476137001686, + "grad_norm": 0.5673864483833313, + "learning_rate": 4.340904082768332e-07, + "loss": 0.3257, + "step": 9401 + }, + { + "epoch": 2.639528354856822, + "grad_norm": 0.5801091194152832, + "learning_rate": 4.3342497500992566e-07, + "loss": 0.3212, + "step": 9402 + }, + { + "epoch": 2.6398090960134755, + "grad_norm": 0.5522968769073486, + "learning_rate": 4.3276002905950853e-07, + "loss": 0.3043, + "step": 9403 + }, + { + "epoch": 2.6400898371701293, + "grad_norm": 0.6049322485923767, + "learning_rate": 4.320955704965385e-07, + "loss": 0.3057, + "step": 9404 + }, + { + "epoch": 2.640370578326783, + "grad_norm": 0.5833464860916138, + "learning_rate": 4.314315993919238e-07, + "loss": 0.3147, + "step": 9405 + }, + { + "epoch": 2.6406513194834362, + "grad_norm": 0.5223578810691833, + "learning_rate": 4.3076811581651777e-07, + "loss": 0.3306, + "step": 9406 + }, + { + "epoch": 2.6409320606400897, + "grad_norm": 0.5660110116004944, + "learning_rate": 4.301051198411255e-07, + "loss": 0.3624, + "step": 9407 + }, + { + "epoch": 2.6412128017967436, + "grad_norm": 0.4879720211029053, + "learning_rate": 4.294426115364964e-07, + "loss": 0.3068, + "step": 9408 + }, + { + "epoch": 2.641493542953397, + "grad_norm": 0.5538328289985657, + "learning_rate": 4.2878059097332834e-07, + "loss": 0.3644, + "step": 9409 + }, + { + "epoch": 2.6417742841100504, + "grad_norm": 0.5390783548355103, + "learning_rate": 4.281190582222705e-07, + "loss": 0.3033, + "step": 9410 + }, + { + "epoch": 2.6420550252667043, + "grad_norm": 0.5444378852844238, + "learning_rate": 4.274580133539147e-07, + "loss": 0.3538, + "step": 9411 + }, + { + "epoch": 2.6423357664233578, + "grad_norm": 0.5433520674705505, + "learning_rate": 4.267974564388061e-07, + "loss": 0.3333, + "step": 9412 + }, + { + "epoch": 2.642616507580011, + "grad_norm": 0.5778615474700928, + "learning_rate": 4.261373875474328e-07, + "loss": 0.3487, + "step": 9413 + }, + { + "epoch": 2.6428972487366647, + "grad_norm": 0.5516786575317383, + "learning_rate": 4.2547780675023577e-07, + "loss": 0.3227, + "step": 9414 + }, + { + "epoch": 2.643177989893318, + "grad_norm": 0.5535200238227844, + "learning_rate": 4.248187141176002e-07, + "loss": 0.3145, + "step": 9415 + }, + { + "epoch": 2.643458731049972, + "grad_norm": 0.6026257872581482, + "learning_rate": 4.2416010971985945e-07, + "loss": 0.2817, + "step": 9416 + }, + { + "epoch": 2.6437394722066254, + "grad_norm": 0.512552797794342, + "learning_rate": 4.2350199362729717e-07, + "loss": 0.324, + "step": 9417 + }, + { + "epoch": 2.6440202133632793, + "grad_norm": 0.5500130653381348, + "learning_rate": 4.2284436591014166e-07, + "loss": 0.3638, + "step": 9418 + }, + { + "epoch": 2.6443009545199327, + "grad_norm": 0.6028361320495605, + "learning_rate": 4.2218722663857294e-07, + "loss": 0.3141, + "step": 9419 + }, + { + "epoch": 2.644581695676586, + "grad_norm": 0.5345044136047363, + "learning_rate": 4.2153057588271597e-07, + "loss": 0.3172, + "step": 9420 + }, + { + "epoch": 2.6448624368332396, + "grad_norm": 0.5637393593788147, + "learning_rate": 4.208744137126436e-07, + "loss": 0.3486, + "step": 9421 + }, + { + "epoch": 2.645143177989893, + "grad_norm": 0.5475842356681824, + "learning_rate": 4.2021874019837874e-07, + "loss": 0.3525, + "step": 9422 + }, + { + "epoch": 2.645423919146547, + "grad_norm": 0.5665484666824341, + "learning_rate": 4.195635554098898e-07, + "loss": 0.346, + "step": 9423 + }, + { + "epoch": 2.6457046603032004, + "grad_norm": 0.5510707497596741, + "learning_rate": 4.189088594170948e-07, + "loss": 0.3382, + "step": 9424 + }, + { + "epoch": 2.6459854014598543, + "grad_norm": 0.5881587862968445, + "learning_rate": 4.182546522898573e-07, + "loss": 0.3405, + "step": 9425 + }, + { + "epoch": 2.6462661426165077, + "grad_norm": 0.6297908425331116, + "learning_rate": 4.1760093409799253e-07, + "loss": 0.2861, + "step": 9426 + }, + { + "epoch": 2.646546883773161, + "grad_norm": 0.5179921388626099, + "learning_rate": 4.1694770491125914e-07, + "loss": 0.3369, + "step": 9427 + }, + { + "epoch": 2.6468276249298146, + "grad_norm": 0.522391140460968, + "learning_rate": 4.1629496479936636e-07, + "loss": 0.2874, + "step": 9428 + }, + { + "epoch": 2.647108366086468, + "grad_norm": 0.5518567562103271, + "learning_rate": 4.1564271383197183e-07, + "loss": 0.3344, + "step": 9429 + }, + { + "epoch": 2.647389107243122, + "grad_norm": 0.5623995661735535, + "learning_rate": 4.1499095207867877e-07, + "loss": 0.3644, + "step": 9430 + }, + { + "epoch": 2.6476698483997754, + "grad_norm": 0.5590502023696899, + "learning_rate": 4.1433967960903764e-07, + "loss": 0.335, + "step": 9431 + }, + { + "epoch": 2.647950589556429, + "grad_norm": 0.5279700756072998, + "learning_rate": 4.136888964925506e-07, + "loss": 0.2864, + "step": 9432 + }, + { + "epoch": 2.6482313307130827, + "grad_norm": 0.5270088315010071, + "learning_rate": 4.1303860279866383e-07, + "loss": 0.325, + "step": 9433 + }, + { + "epoch": 2.648512071869736, + "grad_norm": 0.5713874697685242, + "learning_rate": 4.123887985967734e-07, + "loss": 0.3212, + "step": 9434 + }, + { + "epoch": 2.6487928130263896, + "grad_norm": 0.5349494218826294, + "learning_rate": 4.1173948395622167e-07, + "loss": 0.3325, + "step": 9435 + }, + { + "epoch": 2.649073554183043, + "grad_norm": 0.5535030364990234, + "learning_rate": 4.110906589462993e-07, + "loss": 0.3663, + "step": 9436 + }, + { + "epoch": 2.649354295339697, + "grad_norm": 0.4877673089504242, + "learning_rate": 4.104423236362459e-07, + "loss": 0.3166, + "step": 9437 + }, + { + "epoch": 2.6496350364963503, + "grad_norm": 0.5749571919441223, + "learning_rate": 4.097944780952462e-07, + "loss": 0.3235, + "step": 9438 + }, + { + "epoch": 2.649915777653004, + "grad_norm": 0.5387641191482544, + "learning_rate": 4.0914712239243595e-07, + "loss": 0.3377, + "step": 9439 + }, + { + "epoch": 2.6501965188096577, + "grad_norm": 0.563923716545105, + "learning_rate": 4.085002565968954e-07, + "loss": 0.3065, + "step": 9440 + }, + { + "epoch": 2.650477259966311, + "grad_norm": 0.5082716345787048, + "learning_rate": 4.0785388077765606e-07, + "loss": 0.3413, + "step": 9441 + }, + { + "epoch": 2.6507580011229646, + "grad_norm": 0.5434393286705017, + "learning_rate": 4.0720799500369337e-07, + "loss": 0.3174, + "step": 9442 + }, + { + "epoch": 2.651038742279618, + "grad_norm": 0.5248544812202454, + "learning_rate": 4.065625993439321e-07, + "loss": 0.3546, + "step": 9443 + }, + { + "epoch": 2.651319483436272, + "grad_norm": 0.5421655774116516, + "learning_rate": 4.0591769386724656e-07, + "loss": 0.3244, + "step": 9444 + }, + { + "epoch": 2.6516002245929253, + "grad_norm": 0.5521301627159119, + "learning_rate": 4.052732786424551e-07, + "loss": 0.3213, + "step": 9445 + }, + { + "epoch": 2.6518809657495788, + "grad_norm": 0.5440464615821838, + "learning_rate": 4.0462935373832725e-07, + "loss": 0.322, + "step": 9446 + }, + { + "epoch": 2.6521617069062327, + "grad_norm": 0.5045026540756226, + "learning_rate": 4.0398591922357787e-07, + "loss": 0.3576, + "step": 9447 + }, + { + "epoch": 2.652442448062886, + "grad_norm": 0.5185729265213013, + "learning_rate": 4.0334297516686994e-07, + "loss": 0.2952, + "step": 9448 + }, + { + "epoch": 2.6527231892195395, + "grad_norm": 0.5884493589401245, + "learning_rate": 4.0270052163681627e-07, + "loss": 0.3419, + "step": 9449 + }, + { + "epoch": 2.653003930376193, + "grad_norm": 0.5225335955619812, + "learning_rate": 4.020585587019726e-07, + "loss": 0.3643, + "step": 9450 + }, + { + "epoch": 2.653284671532847, + "grad_norm": 0.5333012938499451, + "learning_rate": 4.014170864308481e-07, + "loss": 0.3384, + "step": 9451 + }, + { + "epoch": 2.6535654126895003, + "grad_norm": 0.5922673344612122, + "learning_rate": 4.0077610489189453e-07, + "loss": 0.3454, + "step": 9452 + }, + { + "epoch": 2.6538461538461537, + "grad_norm": 0.5295975208282471, + "learning_rate": 4.00135614153514e-07, + "loss": 0.3451, + "step": 9453 + }, + { + "epoch": 2.6541268950028076, + "grad_norm": 0.5840491652488708, + "learning_rate": 3.9949561428405723e-07, + "loss": 0.3554, + "step": 9454 + }, + { + "epoch": 2.654407636159461, + "grad_norm": 0.4888450801372528, + "learning_rate": 3.988561053518192e-07, + "loss": 0.3776, + "step": 9455 + }, + { + "epoch": 2.6546883773161145, + "grad_norm": 0.5314975380897522, + "learning_rate": 3.9821708742504573e-07, + "loss": 0.352, + "step": 9456 + }, + { + "epoch": 2.654969118472768, + "grad_norm": 0.5290461778640747, + "learning_rate": 3.975785605719279e-07, + "loss": 0.351, + "step": 9457 + }, + { + "epoch": 2.655249859629422, + "grad_norm": 0.534783661365509, + "learning_rate": 3.9694052486060453e-07, + "loss": 0.3387, + "step": 9458 + }, + { + "epoch": 2.6555306007860753, + "grad_norm": 0.5146978497505188, + "learning_rate": 3.9630298035916503e-07, + "loss": 0.3371, + "step": 9459 + }, + { + "epoch": 2.6558113419427287, + "grad_norm": 0.5695974826812744, + "learning_rate": 3.956659271356422e-07, + "loss": 0.327, + "step": 9460 + }, + { + "epoch": 2.6560920830993826, + "grad_norm": 0.5320312976837158, + "learning_rate": 3.950293652580195e-07, + "loss": 0.3497, + "step": 9461 + }, + { + "epoch": 2.656372824256036, + "grad_norm": 0.5372816324234009, + "learning_rate": 3.9439329479422585e-07, + "loss": 0.3192, + "step": 9462 + }, + { + "epoch": 2.6566535654126895, + "grad_norm": 0.6621048450469971, + "learning_rate": 3.937577158121408e-07, + "loss": 0.3481, + "step": 9463 + }, + { + "epoch": 2.656934306569343, + "grad_norm": 0.5602954030036926, + "learning_rate": 3.9312262837958746e-07, + "loss": 0.3059, + "step": 9464 + }, + { + "epoch": 2.6572150477259964, + "grad_norm": 0.5516539812088013, + "learning_rate": 3.924880325643382e-07, + "loss": 0.2998, + "step": 9465 + }, + { + "epoch": 2.6574957888826503, + "grad_norm": 0.5282114148139954, + "learning_rate": 3.918539284341144e-07, + "loss": 0.3176, + "step": 9466 + }, + { + "epoch": 2.6577765300393037, + "grad_norm": 0.5805308222770691, + "learning_rate": 3.912203160565825e-07, + "loss": 0.2986, + "step": 9467 + }, + { + "epoch": 2.6580572711959576, + "grad_norm": 0.5293039083480835, + "learning_rate": 3.9058719549935953e-07, + "loss": 0.3304, + "step": 9468 + }, + { + "epoch": 2.658338012352611, + "grad_norm": 0.5936756134033203, + "learning_rate": 3.8995456683000696e-07, + "loss": 0.3178, + "step": 9469 + }, + { + "epoch": 2.6586187535092645, + "grad_norm": 0.502144455909729, + "learning_rate": 3.893224301160342e-07, + "loss": 0.3548, + "step": 9470 + }, + { + "epoch": 2.658899494665918, + "grad_norm": 0.5930432081222534, + "learning_rate": 3.886907854249e-07, + "loss": 0.3562, + "step": 9471 + }, + { + "epoch": 2.6591802358225713, + "grad_norm": 0.5416750311851501, + "learning_rate": 3.8805963282400936e-07, + "loss": 0.3252, + "step": 9472 + }, + { + "epoch": 2.6594609769792252, + "grad_norm": 0.5488835573196411, + "learning_rate": 3.874289723807151e-07, + "loss": 0.3299, + "step": 9473 + }, + { + "epoch": 2.6597417181358787, + "grad_norm": 0.5911818146705627, + "learning_rate": 3.8679880416231666e-07, + "loss": 0.3079, + "step": 9474 + }, + { + "epoch": 2.6600224592925326, + "grad_norm": 0.6673837304115295, + "learning_rate": 3.8616912823606357e-07, + "loss": 0.3159, + "step": 9475 + }, + { + "epoch": 2.660303200449186, + "grad_norm": 0.5358628034591675, + "learning_rate": 3.855399446691488e-07, + "loss": 0.3095, + "step": 9476 + }, + { + "epoch": 2.6605839416058394, + "grad_norm": 0.5243021845817566, + "learning_rate": 3.849112535287153e-07, + "loss": 0.33, + "step": 9477 + }, + { + "epoch": 2.660864682762493, + "grad_norm": 0.5456988215446472, + "learning_rate": 3.8428305488185327e-07, + "loss": 0.323, + "step": 9478 + }, + { + "epoch": 2.6611454239191463, + "grad_norm": 0.57991623878479, + "learning_rate": 3.836553487956013e-07, + "loss": 0.3371, + "step": 9479 + }, + { + "epoch": 2.6614261650758, + "grad_norm": 0.492904931306839, + "learning_rate": 3.830281353369425e-07, + "loss": 0.3397, + "step": 9480 + }, + { + "epoch": 2.6617069062324537, + "grad_norm": 0.5380684733390808, + "learning_rate": 3.824014145728111e-07, + "loss": 0.3147, + "step": 9481 + }, + { + "epoch": 2.661987647389107, + "grad_norm": 0.5540799498558044, + "learning_rate": 3.817751865700847e-07, + "loss": 0.3165, + "step": 9482 + }, + { + "epoch": 2.662268388545761, + "grad_norm": 0.5207505226135254, + "learning_rate": 3.8114945139559214e-07, + "loss": 0.3543, + "step": 9483 + }, + { + "epoch": 2.6625491297024144, + "grad_norm": 0.587072491645813, + "learning_rate": 3.8052420911610767e-07, + "loss": 0.3555, + "step": 9484 + }, + { + "epoch": 2.662829870859068, + "grad_norm": 0.47210097312927246, + "learning_rate": 3.7989945979835243e-07, + "loss": 0.337, + "step": 9485 + }, + { + "epoch": 2.6631106120157213, + "grad_norm": 0.5797653794288635, + "learning_rate": 3.7927520350899693e-07, + "loss": 0.3285, + "step": 9486 + }, + { + "epoch": 2.663391353172375, + "grad_norm": 0.5478522777557373, + "learning_rate": 3.786514403146563e-07, + "loss": 0.2741, + "step": 9487 + }, + { + "epoch": 2.6636720943290286, + "grad_norm": 0.5957590341567993, + "learning_rate": 3.780281702818966e-07, + "loss": 0.3463, + "step": 9488 + }, + { + "epoch": 2.663952835485682, + "grad_norm": 0.5520449280738831, + "learning_rate": 3.774053934772276e-07, + "loss": 0.3376, + "step": 9489 + }, + { + "epoch": 2.664233576642336, + "grad_norm": 0.5618157982826233, + "learning_rate": 3.767831099671099e-07, + "loss": 0.3067, + "step": 9490 + }, + { + "epoch": 2.6645143177989894, + "grad_norm": 0.5358468294143677, + "learning_rate": 3.7616131981794925e-07, + "loss": 0.3111, + "step": 9491 + }, + { + "epoch": 2.664795058955643, + "grad_norm": 0.5917156934738159, + "learning_rate": 3.7554002309609707e-07, + "loss": 0.332, + "step": 9492 + }, + { + "epoch": 2.6650758001122963, + "grad_norm": 0.5606355667114258, + "learning_rate": 3.749192198678575e-07, + "loss": 0.2889, + "step": 9493 + }, + { + "epoch": 2.66535654126895, + "grad_norm": 0.5559099316596985, + "learning_rate": 3.742989101994765e-07, + "loss": 0.3282, + "step": 9494 + }, + { + "epoch": 2.6656372824256036, + "grad_norm": 0.5656825304031372, + "learning_rate": 3.7367909415715054e-07, + "loss": 0.3306, + "step": 9495 + }, + { + "epoch": 2.665918023582257, + "grad_norm": 0.5568356513977051, + "learning_rate": 3.7305977180702223e-07, + "loss": 0.328, + "step": 9496 + }, + { + "epoch": 2.666198764738911, + "grad_norm": 0.5428913235664368, + "learning_rate": 3.724409432151832e-07, + "loss": 0.3465, + "step": 9497 + }, + { + "epoch": 2.6664795058955644, + "grad_norm": 0.6108860373497009, + "learning_rate": 3.718226084476689e-07, + "loss": 0.3329, + "step": 9498 + }, + { + "epoch": 2.666760247052218, + "grad_norm": 0.5239104628562927, + "learning_rate": 3.7120476757046496e-07, + "loss": 0.3625, + "step": 9499 + }, + { + "epoch": 2.6670409882088713, + "grad_norm": 0.6110329627990723, + "learning_rate": 3.7058742064950417e-07, + "loss": 0.3056, + "step": 9500 + }, + { + "epoch": 2.667321729365525, + "grad_norm": 0.5739771127700806, + "learning_rate": 3.6997056775066486e-07, + "loss": 0.3379, + "step": 9501 + }, + { + "epoch": 2.6676024705221786, + "grad_norm": 0.5730254650115967, + "learning_rate": 3.6935420893977503e-07, + "loss": 0.3685, + "step": 9502 + }, + { + "epoch": 2.667883211678832, + "grad_norm": 0.568720281124115, + "learning_rate": 3.68738344282607e-07, + "loss": 0.33, + "step": 9503 + }, + { + "epoch": 2.668163952835486, + "grad_norm": 0.5455431342124939, + "learning_rate": 3.6812297384488326e-07, + "loss": 0.3252, + "step": 9504 + }, + { + "epoch": 2.6684446939921393, + "grad_norm": 0.5412228107452393, + "learning_rate": 3.6750809769227237e-07, + "loss": 0.3587, + "step": 9505 + }, + { + "epoch": 2.668725435148793, + "grad_norm": 0.5856388211250305, + "learning_rate": 3.6689371589039013e-07, + "loss": 0.359, + "step": 9506 + }, + { + "epoch": 2.6690061763054462, + "grad_norm": 0.5271070003509521, + "learning_rate": 3.6627982850479805e-07, + "loss": 0.3157, + "step": 9507 + }, + { + "epoch": 2.6692869174620997, + "grad_norm": 0.551708459854126, + "learning_rate": 3.656664356010081e-07, + "loss": 0.301, + "step": 9508 + }, + { + "epoch": 2.6695676586187536, + "grad_norm": 0.5328904986381531, + "learning_rate": 3.650535372444769e-07, + "loss": 0.3416, + "step": 9509 + }, + { + "epoch": 2.669848399775407, + "grad_norm": 0.5363731384277344, + "learning_rate": 3.6444113350060985e-07, + "loss": 0.3554, + "step": 9510 + }, + { + "epoch": 2.670129140932061, + "grad_norm": 0.5621405839920044, + "learning_rate": 3.6382922443475743e-07, + "loss": 0.3185, + "step": 9511 + }, + { + "epoch": 2.6704098820887143, + "grad_norm": 0.640352189540863, + "learning_rate": 3.632178101122208e-07, + "loss": 0.2812, + "step": 9512 + }, + { + "epoch": 2.6706906232453678, + "grad_norm": 0.5635489821434021, + "learning_rate": 3.6260689059824495e-07, + "loss": 0.3339, + "step": 9513 + }, + { + "epoch": 2.670971364402021, + "grad_norm": 0.5003281831741333, + "learning_rate": 3.619964659580233e-07, + "loss": 0.3388, + "step": 9514 + }, + { + "epoch": 2.6712521055586746, + "grad_norm": 0.5414652228355408, + "learning_rate": 3.6138653625669764e-07, + "loss": 0.3217, + "step": 9515 + }, + { + "epoch": 2.6715328467153285, + "grad_norm": 0.5872387886047363, + "learning_rate": 3.607771015593542e-07, + "loss": 0.3054, + "step": 9516 + }, + { + "epoch": 2.671813587871982, + "grad_norm": 0.5588954091072083, + "learning_rate": 3.601681619310299e-07, + "loss": 0.34, + "step": 9517 + }, + { + "epoch": 2.672094329028636, + "grad_norm": 0.5404657125473022, + "learning_rate": 3.5955971743670605e-07, + "loss": 0.3437, + "step": 9518 + }, + { + "epoch": 2.6723750701852893, + "grad_norm": 0.5519921183586121, + "learning_rate": 3.589517681413118e-07, + "loss": 0.3459, + "step": 9519 + }, + { + "epoch": 2.6726558113419427, + "grad_norm": 0.5478466153144836, + "learning_rate": 3.583443141097248e-07, + "loss": 0.3472, + "step": 9520 + }, + { + "epoch": 2.672936552498596, + "grad_norm": 0.557296872138977, + "learning_rate": 3.57737355406767e-07, + "loss": 0.298, + "step": 9521 + }, + { + "epoch": 2.6732172936552496, + "grad_norm": 0.5240044593811035, + "learning_rate": 3.571308920972111e-07, + "loss": 0.3584, + "step": 9522 + }, + { + "epoch": 2.6734980348119035, + "grad_norm": 0.5655061602592468, + "learning_rate": 3.565249242457736e-07, + "loss": 0.3218, + "step": 9523 + }, + { + "epoch": 2.673778775968557, + "grad_norm": 0.5311112403869629, + "learning_rate": 3.5591945191712105e-07, + "loss": 0.3307, + "step": 9524 + }, + { + "epoch": 2.6740595171252104, + "grad_norm": 0.547884464263916, + "learning_rate": 3.553144751758647e-07, + "loss": 0.3308, + "step": 9525 + }, + { + "epoch": 2.6743402582818643, + "grad_norm": 0.5580441355705261, + "learning_rate": 3.547099940865639e-07, + "loss": 0.353, + "step": 9526 + }, + { + "epoch": 2.6746209994385177, + "grad_norm": 0.5507350564002991, + "learning_rate": 3.54106008713726e-07, + "loss": 0.3312, + "step": 9527 + }, + { + "epoch": 2.674901740595171, + "grad_norm": 0.5310389399528503, + "learning_rate": 3.5350251912180277e-07, + "loss": 0.3565, + "step": 9528 + }, + { + "epoch": 2.6751824817518246, + "grad_norm": 0.570912778377533, + "learning_rate": 3.5289952537519654e-07, + "loss": 0.3128, + "step": 9529 + }, + { + "epoch": 2.6754632229084785, + "grad_norm": 0.5651665329933167, + "learning_rate": 3.5229702753825536e-07, + "loss": 0.3056, + "step": 9530 + }, + { + "epoch": 2.675743964065132, + "grad_norm": 0.5954676270484924, + "learning_rate": 3.5169502567527215e-07, + "loss": 0.3314, + "step": 9531 + }, + { + "epoch": 2.6760247052217854, + "grad_norm": 0.5380265116691589, + "learning_rate": 3.5109351985049054e-07, + "loss": 0.3208, + "step": 9532 + }, + { + "epoch": 2.6763054463784393, + "grad_norm": 0.6027633547782898, + "learning_rate": 3.504925101280981e-07, + "loss": 0.3069, + "step": 9533 + }, + { + "epoch": 2.6765861875350927, + "grad_norm": 0.5807909965515137, + "learning_rate": 3.4989199657223307e-07, + "loss": 0.3048, + "step": 9534 + }, + { + "epoch": 2.676866928691746, + "grad_norm": 0.4826158881187439, + "learning_rate": 3.4929197924697623e-07, + "loss": 0.3025, + "step": 9535 + }, + { + "epoch": 2.6771476698483996, + "grad_norm": 0.6137301921844482, + "learning_rate": 3.486924582163581e-07, + "loss": 0.3067, + "step": 9536 + }, + { + "epoch": 2.6774284110050535, + "grad_norm": 0.5462205410003662, + "learning_rate": 3.48093433544357e-07, + "loss": 0.291, + "step": 9537 + }, + { + "epoch": 2.677709152161707, + "grad_norm": 0.5179881453514099, + "learning_rate": 3.474949052948956e-07, + "loss": 0.2973, + "step": 9538 + }, + { + "epoch": 2.6779898933183603, + "grad_norm": 0.514231264591217, + "learning_rate": 3.4689687353184675e-07, + "loss": 0.3531, + "step": 9539 + }, + { + "epoch": 2.6782706344750142, + "grad_norm": 0.5977672934532166, + "learning_rate": 3.462993383190277e-07, + "loss": 0.3408, + "step": 9540 + }, + { + "epoch": 2.6785513756316677, + "grad_norm": 0.5542689561843872, + "learning_rate": 3.4570229972020306e-07, + "loss": 0.3148, + "step": 9541 + }, + { + "epoch": 2.678832116788321, + "grad_norm": 0.5289836525917053, + "learning_rate": 3.451057577990868e-07, + "loss": 0.3332, + "step": 9542 + }, + { + "epoch": 2.6791128579449746, + "grad_norm": 0.611068844795227, + "learning_rate": 3.4450971261933643e-07, + "loss": 0.3282, + "step": 9543 + }, + { + "epoch": 2.6793935991016284, + "grad_norm": 0.5822266936302185, + "learning_rate": 3.4391416424455925e-07, + "loss": 0.3309, + "step": 9544 + }, + { + "epoch": 2.679674340258282, + "grad_norm": 0.6132161617279053, + "learning_rate": 3.433191127383079e-07, + "loss": 0.3556, + "step": 9545 + }, + { + "epoch": 2.6799550814149353, + "grad_norm": 0.5779435038566589, + "learning_rate": 3.427245581640831e-07, + "loss": 0.3137, + "step": 9546 + }, + { + "epoch": 2.680235822571589, + "grad_norm": 0.5796428322792053, + "learning_rate": 3.4213050058533203e-07, + "loss": 0.3158, + "step": 9547 + }, + { + "epoch": 2.6805165637282427, + "grad_norm": 0.6340121626853943, + "learning_rate": 3.415369400654478e-07, + "loss": 0.3287, + "step": 9548 + }, + { + "epoch": 2.680797304884896, + "grad_norm": 0.560840368270874, + "learning_rate": 3.4094387666777305e-07, + "loss": 0.3184, + "step": 9549 + }, + { + "epoch": 2.6810780460415495, + "grad_norm": 0.5851090550422668, + "learning_rate": 3.4035131045559445e-07, + "loss": 0.3167, + "step": 9550 + }, + { + "epoch": 2.6813587871982034, + "grad_norm": 0.5190525650978088, + "learning_rate": 3.39759241492148e-07, + "loss": 0.2957, + "step": 9551 + }, + { + "epoch": 2.681639528354857, + "grad_norm": 0.5649513006210327, + "learning_rate": 3.3916766984061546e-07, + "loss": 0.3569, + "step": 9552 + }, + { + "epoch": 2.6819202695115103, + "grad_norm": 0.5288940072059631, + "learning_rate": 3.3857659556412457e-07, + "loss": 0.3483, + "step": 9553 + }, + { + "epoch": 2.682201010668164, + "grad_norm": 0.5213239192962646, + "learning_rate": 3.379860187257517e-07, + "loss": 0.2801, + "step": 9554 + }, + { + "epoch": 2.6824817518248176, + "grad_norm": 0.5677089095115662, + "learning_rate": 3.3739593938852065e-07, + "loss": 0.3039, + "step": 9555 + }, + { + "epoch": 2.682762492981471, + "grad_norm": 0.5153822302818298, + "learning_rate": 3.368063576153996e-07, + "loss": 0.3373, + "step": 9556 + }, + { + "epoch": 2.6830432341381245, + "grad_norm": 0.5741904377937317, + "learning_rate": 3.362172734693059e-07, + "loss": 0.2974, + "step": 9557 + }, + { + "epoch": 2.683323975294778, + "grad_norm": 0.6181908845901489, + "learning_rate": 3.35628687013102e-07, + "loss": 0.3169, + "step": 9558 + }, + { + "epoch": 2.683604716451432, + "grad_norm": 0.5768583416938782, + "learning_rate": 3.3504059830960003e-07, + "loss": 0.3216, + "step": 9559 + }, + { + "epoch": 2.6838854576080853, + "grad_norm": 0.5114100575447083, + "learning_rate": 3.344530074215546e-07, + "loss": 0.323, + "step": 9560 + }, + { + "epoch": 2.684166198764739, + "grad_norm": 0.5469650626182556, + "learning_rate": 3.3386591441167184e-07, + "loss": 0.3455, + "step": 9561 + }, + { + "epoch": 2.6844469399213926, + "grad_norm": 0.6090538501739502, + "learning_rate": 3.3327931934260206e-07, + "loss": 0.3312, + "step": 9562 + }, + { + "epoch": 2.684727681078046, + "grad_norm": 0.5837964415550232, + "learning_rate": 3.3269322227694244e-07, + "loss": 0.3085, + "step": 9563 + }, + { + "epoch": 2.6850084222346995, + "grad_norm": 0.5404969453811646, + "learning_rate": 3.321076232772386e-07, + "loss": 0.3169, + "step": 9564 + }, + { + "epoch": 2.685289163391353, + "grad_norm": 0.5449265241622925, + "learning_rate": 3.315225224059809e-07, + "loss": 0.3241, + "step": 9565 + }, + { + "epoch": 2.685569904548007, + "grad_norm": 0.5977044701576233, + "learning_rate": 3.309379197256085e-07, + "loss": 0.3257, + "step": 9566 + }, + { + "epoch": 2.6858506457046603, + "grad_norm": 0.5311598181724548, + "learning_rate": 3.3035381529850697e-07, + "loss": 0.3325, + "step": 9567 + }, + { + "epoch": 2.686131386861314, + "grad_norm": 0.542149007320404, + "learning_rate": 3.2977020918700644e-07, + "loss": 0.3159, + "step": 9568 + }, + { + "epoch": 2.6864121280179676, + "grad_norm": 0.5439414978027344, + "learning_rate": 3.2918710145338817e-07, + "loss": 0.2904, + "step": 9569 + }, + { + "epoch": 2.686692869174621, + "grad_norm": 0.5617645978927612, + "learning_rate": 3.286044921598752e-07, + "loss": 0.3225, + "step": 9570 + }, + { + "epoch": 2.6869736103312745, + "grad_norm": 0.5254599452018738, + "learning_rate": 3.280223813686423e-07, + "loss": 0.2832, + "step": 9571 + }, + { + "epoch": 2.687254351487928, + "grad_norm": 0.5410608053207397, + "learning_rate": 3.2744076914180746e-07, + "loss": 0.3792, + "step": 9572 + }, + { + "epoch": 2.687535092644582, + "grad_norm": 0.6197337508201599, + "learning_rate": 3.268596555414372e-07, + "loss": 0.3085, + "step": 9573 + }, + { + "epoch": 2.6878158338012352, + "grad_norm": 0.5445768237113953, + "learning_rate": 3.2627904062954463e-07, + "loss": 0.311, + "step": 9574 + }, + { + "epoch": 2.6880965749578887, + "grad_norm": 0.5660192966461182, + "learning_rate": 3.256989244680875e-07, + "loss": 0.3236, + "step": 9575 + }, + { + "epoch": 2.6883773161145426, + "grad_norm": 0.5556873679161072, + "learning_rate": 3.251193071189751e-07, + "loss": 0.3166, + "step": 9576 + }, + { + "epoch": 2.688658057271196, + "grad_norm": 0.6064473986625671, + "learning_rate": 3.2454018864405745e-07, + "loss": 0.2923, + "step": 9577 + }, + { + "epoch": 2.6889387984278494, + "grad_norm": 0.5116872191429138, + "learning_rate": 3.239615691051379e-07, + "loss": 0.329, + "step": 9578 + }, + { + "epoch": 2.689219539584503, + "grad_norm": 0.5713472962379456, + "learning_rate": 3.233834485639603e-07, + "loss": 0.3312, + "step": 9579 + }, + { + "epoch": 2.6895002807411568, + "grad_norm": 0.5208572149276733, + "learning_rate": 3.2280582708221817e-07, + "loss": 0.3446, + "step": 9580 + }, + { + "epoch": 2.68978102189781, + "grad_norm": 0.5725234150886536, + "learning_rate": 3.2222870472155386e-07, + "loss": 0.3107, + "step": 9581 + }, + { + "epoch": 2.6900617630544637, + "grad_norm": 0.564706027507782, + "learning_rate": 3.2165208154355253e-07, + "loss": 0.3257, + "step": 9582 + }, + { + "epoch": 2.6903425042111175, + "grad_norm": 0.6131207346916199, + "learning_rate": 3.2107595760974944e-07, + "loss": 0.3005, + "step": 9583 + }, + { + "epoch": 2.690623245367771, + "grad_norm": 0.5989841222763062, + "learning_rate": 3.2050033298162307e-07, + "loss": 0.3195, + "step": 9584 + }, + { + "epoch": 2.6909039865244244, + "grad_norm": 0.52616947889328, + "learning_rate": 3.199252077206011e-07, + "loss": 0.3425, + "step": 9585 + }, + { + "epoch": 2.691184727681078, + "grad_norm": 0.5609388947486877, + "learning_rate": 3.1935058188805825e-07, + "loss": 0.3325, + "step": 9586 + }, + { + "epoch": 2.6914654688377317, + "grad_norm": 0.46792733669281006, + "learning_rate": 3.187764555453132e-07, + "loss": 0.3408, + "step": 9587 + }, + { + "epoch": 2.691746209994385, + "grad_norm": 0.51032555103302, + "learning_rate": 3.182028287536348e-07, + "loss": 0.311, + "step": 9588 + }, + { + "epoch": 2.6920269511510386, + "grad_norm": 0.5743250846862793, + "learning_rate": 3.176297015742369e-07, + "loss": 0.3284, + "step": 9589 + }, + { + "epoch": 2.6923076923076925, + "grad_norm": 0.5259221196174622, + "learning_rate": 3.1705707406827825e-07, + "loss": 0.3167, + "step": 9590 + }, + { + "epoch": 2.692588433464346, + "grad_norm": 0.49799326062202454, + "learning_rate": 3.164849462968683e-07, + "loss": 0.3735, + "step": 9591 + }, + { + "epoch": 2.6928691746209994, + "grad_norm": 0.5619614720344543, + "learning_rate": 3.159133183210594e-07, + "loss": 0.386, + "step": 9592 + }, + { + "epoch": 2.693149915777653, + "grad_norm": 0.5929741859436035, + "learning_rate": 3.153421902018533e-07, + "loss": 0.3129, + "step": 9593 + }, + { + "epoch": 2.6934306569343067, + "grad_norm": 0.5148611664772034, + "learning_rate": 3.147715620001962e-07, + "loss": 0.2732, + "step": 9594 + }, + { + "epoch": 2.69371139809096, + "grad_norm": 0.511158287525177, + "learning_rate": 3.1420143377698274e-07, + "loss": 0.3327, + "step": 9595 + }, + { + "epoch": 2.6939921392476136, + "grad_norm": 0.5217373967170715, + "learning_rate": 3.1363180559305374e-07, + "loss": 0.308, + "step": 9596 + }, + { + "epoch": 2.6942728804042675, + "grad_norm": 0.5783630013465881, + "learning_rate": 3.1306267750919496e-07, + "loss": 0.3468, + "step": 9597 + }, + { + "epoch": 2.694553621560921, + "grad_norm": 0.5551561713218689, + "learning_rate": 3.124940495861417e-07, + "loss": 0.303, + "step": 9598 + }, + { + "epoch": 2.6948343627175744, + "grad_norm": 0.5402063727378845, + "learning_rate": 3.119259218845733e-07, + "loss": 0.3598, + "step": 9599 + }, + { + "epoch": 2.695115103874228, + "grad_norm": 0.5575801134109497, + "learning_rate": 3.1135829446511776e-07, + "loss": 0.2999, + "step": 9600 + }, + { + "epoch": 2.6953958450308813, + "grad_norm": 0.5121349096298218, + "learning_rate": 3.1079116738834845e-07, + "loss": 0.2835, + "step": 9601 + }, + { + "epoch": 2.695676586187535, + "grad_norm": 0.5719684362411499, + "learning_rate": 3.1022454071478467e-07, + "loss": 0.3326, + "step": 9602 + }, + { + "epoch": 2.6959573273441886, + "grad_norm": 0.5816136002540588, + "learning_rate": 3.0965841450489474e-07, + "loss": 0.3486, + "step": 9603 + }, + { + "epoch": 2.6962380685008425, + "grad_norm": 0.5508344173431396, + "learning_rate": 3.090927888190909e-07, + "loss": 0.3371, + "step": 9604 + }, + { + "epoch": 2.696518809657496, + "grad_norm": 0.5628803372383118, + "learning_rate": 3.0852766371773423e-07, + "loss": 0.3606, + "step": 9605 + }, + { + "epoch": 2.6967995508141493, + "grad_norm": 0.5306931138038635, + "learning_rate": 3.07963039261131e-07, + "loss": 0.3052, + "step": 9606 + }, + { + "epoch": 2.697080291970803, + "grad_norm": 0.5864467024803162, + "learning_rate": 3.0739891550953415e-07, + "loss": 0.2937, + "step": 9607 + }, + { + "epoch": 2.6973610331274562, + "grad_norm": 0.5703273415565491, + "learning_rate": 3.0683529252314436e-07, + "loss": 0.321, + "step": 9608 + }, + { + "epoch": 2.69764177428411, + "grad_norm": 0.5476502776145935, + "learning_rate": 3.062721703621063e-07, + "loss": 0.3399, + "step": 9609 + }, + { + "epoch": 2.6979225154407636, + "grad_norm": 0.5061193108558655, + "learning_rate": 3.0570954908651474e-07, + "loss": 0.2823, + "step": 9610 + }, + { + "epoch": 2.6982032565974174, + "grad_norm": 0.6391820311546326, + "learning_rate": 3.0514742875640825e-07, + "loss": 0.3066, + "step": 9611 + }, + { + "epoch": 2.698483997754071, + "grad_norm": 0.5198980569839478, + "learning_rate": 3.0458580943177165e-07, + "loss": 0.3138, + "step": 9612 + }, + { + "epoch": 2.6987647389107243, + "grad_norm": 0.5558304190635681, + "learning_rate": 3.040246911725397e-07, + "loss": 0.3262, + "step": 9613 + }, + { + "epoch": 2.6990454800673778, + "grad_norm": 0.5105278491973877, + "learning_rate": 3.034640740385897e-07, + "loss": 0.3743, + "step": 9614 + }, + { + "epoch": 2.699326221224031, + "grad_norm": 0.5367564558982849, + "learning_rate": 3.029039580897486e-07, + "loss": 0.3189, + "step": 9615 + }, + { + "epoch": 2.699606962380685, + "grad_norm": 0.5472331643104553, + "learning_rate": 3.02344343385787e-07, + "loss": 0.3645, + "step": 9616 + }, + { + "epoch": 2.6998877035373385, + "grad_norm": 0.5079192519187927, + "learning_rate": 3.017852299864249e-07, + "loss": 0.3778, + "step": 9617 + }, + { + "epoch": 2.700168444693992, + "grad_norm": 0.4927424490451813, + "learning_rate": 3.012266179513268e-07, + "loss": 0.295, + "step": 9618 + }, + { + "epoch": 2.700449185850646, + "grad_norm": 0.550288736820221, + "learning_rate": 3.006685073401039e-07, + "loss": 0.3289, + "step": 9619 + }, + { + "epoch": 2.7007299270072993, + "grad_norm": 0.467318594455719, + "learning_rate": 3.001108982123152e-07, + "loss": 0.308, + "step": 9620 + }, + { + "epoch": 2.7010106681639527, + "grad_norm": 0.5019981265068054, + "learning_rate": 2.995537906274637e-07, + "loss": 0.3235, + "step": 9621 + }, + { + "epoch": 2.701291409320606, + "grad_norm": 0.563401460647583, + "learning_rate": 2.989971846450024e-07, + "loss": 0.3368, + "step": 9622 + }, + { + "epoch": 2.70157215047726, + "grad_norm": 0.4814218282699585, + "learning_rate": 2.984410803243282e-07, + "loss": 0.2999, + "step": 9623 + }, + { + "epoch": 2.7018528916339135, + "grad_norm": 0.6057947874069214, + "learning_rate": 2.9788547772478416e-07, + "loss": 0.3172, + "step": 9624 + }, + { + "epoch": 2.702133632790567, + "grad_norm": 0.6055577993392944, + "learning_rate": 2.973303769056618e-07, + "loss": 0.35, + "step": 9625 + }, + { + "epoch": 2.702414373947221, + "grad_norm": 0.5092797875404358, + "learning_rate": 2.9677577792619704e-07, + "loss": 0.3372, + "step": 9626 + }, + { + "epoch": 2.7026951151038743, + "grad_norm": 0.5349711179733276, + "learning_rate": 2.962216808455748e-07, + "loss": 0.3499, + "step": 9627 + }, + { + "epoch": 2.7029758562605277, + "grad_norm": 0.5831418037414551, + "learning_rate": 2.956680857229233e-07, + "loss": 0.2988, + "step": 9628 + }, + { + "epoch": 2.703256597417181, + "grad_norm": 0.4941293001174927, + "learning_rate": 2.951149926173197e-07, + "loss": 0.3223, + "step": 9629 + }, + { + "epoch": 2.703537338573835, + "grad_norm": 0.5175655484199524, + "learning_rate": 2.9456240158778627e-07, + "loss": 0.3444, + "step": 9630 + }, + { + "epoch": 2.7038180797304885, + "grad_norm": 0.5821202397346497, + "learning_rate": 2.9401031269329085e-07, + "loss": 0.3082, + "step": 9631 + }, + { + "epoch": 2.704098820887142, + "grad_norm": 0.5866091847419739, + "learning_rate": 2.934587259927518e-07, + "loss": 0.3112, + "step": 9632 + }, + { + "epoch": 2.704379562043796, + "grad_norm": 0.536228597164154, + "learning_rate": 2.929076415450294e-07, + "loss": 0.326, + "step": 9633 + }, + { + "epoch": 2.7046603032004493, + "grad_norm": 0.6105281710624695, + "learning_rate": 2.9235705940893144e-07, + "loss": 0.2728, + "step": 9634 + }, + { + "epoch": 2.7049410443571027, + "grad_norm": 0.5519971251487732, + "learning_rate": 2.918069796432138e-07, + "loss": 0.3387, + "step": 9635 + }, + { + "epoch": 2.705221785513756, + "grad_norm": 0.5336389541625977, + "learning_rate": 2.912574023065762e-07, + "loss": 0.3229, + "step": 9636 + }, + { + "epoch": 2.70550252667041, + "grad_norm": 0.5339955687522888, + "learning_rate": 2.9070832745766774e-07, + "loss": 0.3273, + "step": 9637 + }, + { + "epoch": 2.7057832678270635, + "grad_norm": 0.6147384643554688, + "learning_rate": 2.901597551550811e-07, + "loss": 0.2974, + "step": 9638 + }, + { + "epoch": 2.706064008983717, + "grad_norm": 0.6201896071434021, + "learning_rate": 2.8961168545735606e-07, + "loss": 0.3193, + "step": 9639 + }, + { + "epoch": 2.706344750140371, + "grad_norm": 0.5300193428993225, + "learning_rate": 2.8906411842298144e-07, + "loss": 0.3012, + "step": 9640 + }, + { + "epoch": 2.7066254912970242, + "grad_norm": 0.52329021692276, + "learning_rate": 2.8851705411038713e-07, + "loss": 0.3662, + "step": 9641 + }, + { + "epoch": 2.7069062324536777, + "grad_norm": 0.5149202942848206, + "learning_rate": 2.879704925779553e-07, + "loss": 0.3317, + "step": 9642 + }, + { + "epoch": 2.707186973610331, + "grad_norm": 0.5491557121276855, + "learning_rate": 2.874244338840093e-07, + "loss": 0.3249, + "step": 9643 + }, + { + "epoch": 2.707467714766985, + "grad_norm": 0.6020238399505615, + "learning_rate": 2.8687887808682315e-07, + "loss": 0.2963, + "step": 9644 + }, + { + "epoch": 2.7077484559236384, + "grad_norm": 0.510621964931488, + "learning_rate": 2.8633382524461406e-07, + "loss": 0.309, + "step": 9645 + }, + { + "epoch": 2.708029197080292, + "grad_norm": 0.5275996923446655, + "learning_rate": 2.8578927541554614e-07, + "loss": 0.3554, + "step": 9646 + }, + { + "epoch": 2.7083099382369458, + "grad_norm": 0.5425759553909302, + "learning_rate": 2.8524522865773177e-07, + "loss": 0.3255, + "step": 9647 + }, + { + "epoch": 2.708590679393599, + "grad_norm": 0.5639587640762329, + "learning_rate": 2.847016850292261e-07, + "loss": 0.3371, + "step": 9648 + }, + { + "epoch": 2.7088714205502527, + "grad_norm": 0.5653936266899109, + "learning_rate": 2.8415864458803566e-07, + "loss": 0.3142, + "step": 9649 + }, + { + "epoch": 2.709152161706906, + "grad_norm": 0.5165392160415649, + "learning_rate": 2.8361610739210845e-07, + "loss": 0.3942, + "step": 9650 + }, + { + "epoch": 2.7094329028635595, + "grad_norm": 0.5510459542274475, + "learning_rate": 2.8307407349934033e-07, + "loss": 0.2903, + "step": 9651 + }, + { + "epoch": 2.7097136440202134, + "grad_norm": 0.49656468629837036, + "learning_rate": 2.8253254296757514e-07, + "loss": 0.3084, + "step": 9652 + }, + { + "epoch": 2.709994385176867, + "grad_norm": 0.5687635540962219, + "learning_rate": 2.8199151585459996e-07, + "loss": 0.3359, + "step": 9653 + }, + { + "epoch": 2.7102751263335207, + "grad_norm": 0.5923601984977722, + "learning_rate": 2.814509922181519e-07, + "loss": 0.3585, + "step": 9654 + }, + { + "epoch": 2.710555867490174, + "grad_norm": 0.5608054399490356, + "learning_rate": 2.8091097211591034e-07, + "loss": 0.3368, + "step": 9655 + }, + { + "epoch": 2.7108366086468276, + "grad_norm": 0.5620177388191223, + "learning_rate": 2.8037145560550425e-07, + "loss": 0.3204, + "step": 9656 + }, + { + "epoch": 2.711117349803481, + "grad_norm": 0.5754275918006897, + "learning_rate": 2.79832442744507e-07, + "loss": 0.3021, + "step": 9657 + }, + { + "epoch": 2.7113980909601345, + "grad_norm": 0.551679790019989, + "learning_rate": 2.7929393359043875e-07, + "loss": 0.3189, + "step": 9658 + }, + { + "epoch": 2.7116788321167884, + "grad_norm": 0.5252971649169922, + "learning_rate": 2.787559282007657e-07, + "loss": 0.3555, + "step": 9659 + }, + { + "epoch": 2.711959573273442, + "grad_norm": 0.5090240836143494, + "learning_rate": 2.7821842663290076e-07, + "loss": 0.3369, + "step": 9660 + }, + { + "epoch": 2.7122403144300957, + "grad_norm": 0.6105270981788635, + "learning_rate": 2.77681428944202e-07, + "loss": 0.4077, + "step": 9661 + }, + { + "epoch": 2.712521055586749, + "grad_norm": 0.597703754901886, + "learning_rate": 2.7714493519197585e-07, + "loss": 0.2904, + "step": 9662 + }, + { + "epoch": 2.7128017967434026, + "grad_norm": 0.5896347165107727, + "learning_rate": 2.766089454334714e-07, + "loss": 0.3206, + "step": 9663 + }, + { + "epoch": 2.713082537900056, + "grad_norm": 0.5729497075080872, + "learning_rate": 2.760734597258885e-07, + "loss": 0.3567, + "step": 9664 + }, + { + "epoch": 2.7133632790567095, + "grad_norm": 0.48528119921684265, + "learning_rate": 2.7553847812636924e-07, + "loss": 0.3175, + "step": 9665 + }, + { + "epoch": 2.7136440202133634, + "grad_norm": 0.49910029768943787, + "learning_rate": 2.750040006920046e-07, + "loss": 0.3582, + "step": 9666 + }, + { + "epoch": 2.713924761370017, + "grad_norm": 0.5457202792167664, + "learning_rate": 2.7447002747983065e-07, + "loss": 0.3206, + "step": 9667 + }, + { + "epoch": 2.7142055025266703, + "grad_norm": 0.5576246976852417, + "learning_rate": 2.739365585468279e-07, + "loss": 0.3442, + "step": 9668 + }, + { + "epoch": 2.714486243683324, + "grad_norm": 0.6320303678512573, + "learning_rate": 2.7340359394992687e-07, + "loss": 0.3267, + "step": 9669 + }, + { + "epoch": 2.7147669848399776, + "grad_norm": 0.5264926552772522, + "learning_rate": 2.728711337460016e-07, + "loss": 0.3346, + "step": 9670 + }, + { + "epoch": 2.715047725996631, + "grad_norm": 0.5229409337043762, + "learning_rate": 2.723391779918727e-07, + "loss": 0.3021, + "step": 9671 + }, + { + "epoch": 2.7153284671532845, + "grad_norm": 0.6011293530464172, + "learning_rate": 2.7180772674430813e-07, + "loss": 0.3007, + "step": 9672 + }, + { + "epoch": 2.7156092083099383, + "grad_norm": 0.5639219880104065, + "learning_rate": 2.712767800600191e-07, + "loss": 0.3188, + "step": 9673 + }, + { + "epoch": 2.715889949466592, + "grad_norm": 0.5480575561523438, + "learning_rate": 2.7074633799566654e-07, + "loss": 0.3108, + "step": 9674 + }, + { + "epoch": 2.7161706906232452, + "grad_norm": 0.5085447430610657, + "learning_rate": 2.7021640060785505e-07, + "loss": 0.3419, + "step": 9675 + }, + { + "epoch": 2.716451431779899, + "grad_norm": 0.5802251696586609, + "learning_rate": 2.6968696795313773e-07, + "loss": 0.3111, + "step": 9676 + }, + { + "epoch": 2.7167321729365526, + "grad_norm": 0.5498368740081787, + "learning_rate": 2.6915804008801005e-07, + "loss": 0.3295, + "step": 9677 + }, + { + "epoch": 2.717012914093206, + "grad_norm": 0.5393154621124268, + "learning_rate": 2.6862961706891844e-07, + "loss": 0.3269, + "step": 9678 + }, + { + "epoch": 2.7172936552498594, + "grad_norm": 0.5487037897109985, + "learning_rate": 2.6810169895225116e-07, + "loss": 0.317, + "step": 9679 + }, + { + "epoch": 2.7175743964065133, + "grad_norm": 0.564078152179718, + "learning_rate": 2.6757428579434473e-07, + "loss": 0.3155, + "step": 9680 + }, + { + "epoch": 2.7178551375631668, + "grad_norm": 0.6106180548667908, + "learning_rate": 2.670473776514815e-07, + "loss": 0.2887, + "step": 9681 + }, + { + "epoch": 2.71813587871982, + "grad_norm": 0.5764226317405701, + "learning_rate": 2.665209745798897e-07, + "loss": 0.3344, + "step": 9682 + }, + { + "epoch": 2.718416619876474, + "grad_norm": 0.5557388067245483, + "learning_rate": 2.6599507663574387e-07, + "loss": 0.3504, + "step": 9683 + }, + { + "epoch": 2.7186973610331275, + "grad_norm": 0.5256271362304688, + "learning_rate": 2.654696838751647e-07, + "loss": 0.3304, + "step": 9684 + }, + { + "epoch": 2.718978102189781, + "grad_norm": 0.5853570699691772, + "learning_rate": 2.64944796354219e-07, + "loss": 0.3509, + "step": 9685 + }, + { + "epoch": 2.7192588433464344, + "grad_norm": 0.5093727707862854, + "learning_rate": 2.6442041412891916e-07, + "loss": 0.3266, + "step": 9686 + }, + { + "epoch": 2.7195395845030883, + "grad_norm": 0.54444819688797, + "learning_rate": 2.6389653725522433e-07, + "loss": 0.3373, + "step": 9687 + }, + { + "epoch": 2.7198203256597417, + "grad_norm": 0.5237269401550293, + "learning_rate": 2.633731657890387e-07, + "loss": 0.3299, + "step": 9688 + }, + { + "epoch": 2.720101066816395, + "grad_norm": 0.5486893653869629, + "learning_rate": 2.6285029978621426e-07, + "loss": 0.3253, + "step": 9689 + }, + { + "epoch": 2.720381807973049, + "grad_norm": 0.5041185617446899, + "learning_rate": 2.6232793930254686e-07, + "loss": 0.3452, + "step": 9690 + }, + { + "epoch": 2.7206625491297025, + "grad_norm": 0.538627564907074, + "learning_rate": 2.618060843937809e-07, + "loss": 0.3322, + "step": 9691 + }, + { + "epoch": 2.720943290286356, + "grad_norm": 0.5140835642814636, + "learning_rate": 2.6128473511560337e-07, + "loss": 0.3225, + "step": 9692 + }, + { + "epoch": 2.7212240314430094, + "grad_norm": 0.513480007648468, + "learning_rate": 2.607638915236521e-07, + "loss": 0.3451, + "step": 9693 + }, + { + "epoch": 2.721504772599663, + "grad_norm": 0.612460196018219, + "learning_rate": 2.6024355367350706e-07, + "loss": 0.31, + "step": 9694 + }, + { + "epoch": 2.7217855137563167, + "grad_norm": 0.5481112599372864, + "learning_rate": 2.5972372162069436e-07, + "loss": 0.3359, + "step": 9695 + }, + { + "epoch": 2.72206625491297, + "grad_norm": 0.527559220790863, + "learning_rate": 2.592043954206891e-07, + "loss": 0.3419, + "step": 9696 + }, + { + "epoch": 2.722346996069624, + "grad_norm": 0.5547905564308167, + "learning_rate": 2.5868557512890855e-07, + "loss": 0.3135, + "step": 9697 + }, + { + "epoch": 2.7226277372262775, + "grad_norm": 0.5954114198684692, + "learning_rate": 2.581672608007202e-07, + "loss": 0.3347, + "step": 9698 + }, + { + "epoch": 2.722908478382931, + "grad_norm": 0.5858810544013977, + "learning_rate": 2.576494524914347e-07, + "loss": 0.306, + "step": 9699 + }, + { + "epoch": 2.7231892195395844, + "grad_norm": 0.5046973824501038, + "learning_rate": 2.5713215025630734e-07, + "loss": 0.2988, + "step": 9700 + }, + { + "epoch": 2.723469960696238, + "grad_norm": 0.6144183278083801, + "learning_rate": 2.566153541505445e-07, + "loss": 0.3357, + "step": 9701 + }, + { + "epoch": 2.7237507018528917, + "grad_norm": 0.5328025817871094, + "learning_rate": 2.5609906422929263e-07, + "loss": 0.2969, + "step": 9702 + }, + { + "epoch": 2.724031443009545, + "grad_norm": 0.5346313714981079, + "learning_rate": 2.555832805476488e-07, + "loss": 0.3241, + "step": 9703 + }, + { + "epoch": 2.724312184166199, + "grad_norm": 0.5095492601394653, + "learning_rate": 2.550680031606534e-07, + "loss": 0.3276, + "step": 9704 + }, + { + "epoch": 2.7245929253228525, + "grad_norm": 0.5440551042556763, + "learning_rate": 2.5455323212329463e-07, + "loss": 0.2901, + "step": 9705 + }, + { + "epoch": 2.724873666479506, + "grad_norm": 0.5555832982063293, + "learning_rate": 2.5403896749050474e-07, + "loss": 0.2796, + "step": 9706 + }, + { + "epoch": 2.7251544076361593, + "grad_norm": 0.5191340446472168, + "learning_rate": 2.5352520931716196e-07, + "loss": 0.3383, + "step": 9707 + }, + { + "epoch": 2.725435148792813, + "grad_norm": 0.5697038173675537, + "learning_rate": 2.530119576580936e-07, + "loss": 0.3102, + "step": 9708 + }, + { + "epoch": 2.7257158899494667, + "grad_norm": 0.5125386714935303, + "learning_rate": 2.5249921256806976e-07, + "loss": 0.3257, + "step": 9709 + }, + { + "epoch": 2.72599663110612, + "grad_norm": 0.546018660068512, + "learning_rate": 2.5198697410180606e-07, + "loss": 0.3398, + "step": 9710 + }, + { + "epoch": 2.7262773722627736, + "grad_norm": 0.5035435557365417, + "learning_rate": 2.514752423139677e-07, + "loss": 0.3365, + "step": 9711 + }, + { + "epoch": 2.7265581134194274, + "grad_norm": 0.5132114887237549, + "learning_rate": 2.509640172591615e-07, + "loss": 0.3278, + "step": 9712 + }, + { + "epoch": 2.726838854576081, + "grad_norm": 0.5680822134017944, + "learning_rate": 2.5045329899194337e-07, + "loss": 0.3014, + "step": 9713 + }, + { + "epoch": 2.7271195957327343, + "grad_norm": 0.49828410148620605, + "learning_rate": 2.4994308756681354e-07, + "loss": 0.2894, + "step": 9714 + }, + { + "epoch": 2.7274003368893878, + "grad_norm": 0.6366356611251831, + "learning_rate": 2.4943338303821894e-07, + "loss": 0.3601, + "step": 9715 + }, + { + "epoch": 2.7276810780460417, + "grad_norm": 0.5409765243530273, + "learning_rate": 2.489241854605523e-07, + "loss": 0.3342, + "step": 9716 + }, + { + "epoch": 2.727961819202695, + "grad_norm": 0.5092395544052124, + "learning_rate": 2.484154948881512e-07, + "loss": 0.3359, + "step": 9717 + }, + { + "epoch": 2.7282425603593485, + "grad_norm": 0.5486072301864624, + "learning_rate": 2.4790731137530054e-07, + "loss": 0.323, + "step": 9718 + }, + { + "epoch": 2.7285233015160024, + "grad_norm": 0.5532756447792053, + "learning_rate": 2.4739963497622977e-07, + "loss": 0.3258, + "step": 9719 + }, + { + "epoch": 2.728804042672656, + "grad_norm": 0.6096033453941345, + "learning_rate": 2.4689246574511604e-07, + "loss": 0.2796, + "step": 9720 + }, + { + "epoch": 2.7290847838293093, + "grad_norm": 0.5525265336036682, + "learning_rate": 2.4638580373608057e-07, + "loss": 0.323, + "step": 9721 + }, + { + "epoch": 2.7293655249859627, + "grad_norm": 0.5786679983139038, + "learning_rate": 2.4587964900319117e-07, + "loss": 0.3448, + "step": 9722 + }, + { + "epoch": 2.7296462661426166, + "grad_norm": 0.5298022031784058, + "learning_rate": 2.4537400160046186e-07, + "loss": 0.2942, + "step": 9723 + }, + { + "epoch": 2.72992700729927, + "grad_norm": 0.5679615139961243, + "learning_rate": 2.4486886158185175e-07, + "loss": 0.3366, + "step": 9724 + }, + { + "epoch": 2.7302077484559235, + "grad_norm": 0.5167908668518066, + "learning_rate": 2.443642290012671e-07, + "loss": 0.3446, + "step": 9725 + }, + { + "epoch": 2.7304884896125774, + "grad_norm": 0.4989408850669861, + "learning_rate": 2.4386010391255763e-07, + "loss": 0.3423, + "step": 9726 + }, + { + "epoch": 2.730769230769231, + "grad_norm": 0.46948695182800293, + "learning_rate": 2.4335648636952256e-07, + "loss": 0.3425, + "step": 9727 + }, + { + "epoch": 2.7310499719258843, + "grad_norm": 0.5441704988479614, + "learning_rate": 2.4285337642590336e-07, + "loss": 0.3221, + "step": 9728 + }, + { + "epoch": 2.7313307130825377, + "grad_norm": 0.4902036786079407, + "learning_rate": 2.423507741353881e-07, + "loss": 0.3498, + "step": 9729 + }, + { + "epoch": 2.7316114542391916, + "grad_norm": 0.5878719687461853, + "learning_rate": 2.418486795516134e-07, + "loss": 0.3362, + "step": 9730 + }, + { + "epoch": 2.731892195395845, + "grad_norm": 0.5594978928565979, + "learning_rate": 2.4134709272815803e-07, + "loss": 0.304, + "step": 9731 + }, + { + "epoch": 2.7321729365524985, + "grad_norm": 0.5778236389160156, + "learning_rate": 2.4084601371854867e-07, + "loss": 0.3144, + "step": 9732 + }, + { + "epoch": 2.7324536777091524, + "grad_norm": 0.5218297243118286, + "learning_rate": 2.4034544257625805e-07, + "loss": 0.3276, + "step": 9733 + }, + { + "epoch": 2.732734418865806, + "grad_norm": 0.5348289012908936, + "learning_rate": 2.398453793547034e-07, + "loss": 0.3064, + "step": 9734 + }, + { + "epoch": 2.7330151600224593, + "grad_norm": 0.530484676361084, + "learning_rate": 2.3934582410724827e-07, + "loss": 0.3049, + "step": 9735 + }, + { + "epoch": 2.7332959011791127, + "grad_norm": 0.4909449517726898, + "learning_rate": 2.3884677688720216e-07, + "loss": 0.3272, + "step": 9736 + }, + { + "epoch": 2.7335766423357666, + "grad_norm": 0.4785850942134857, + "learning_rate": 2.383482377478208e-07, + "loss": 0.3365, + "step": 9737 + }, + { + "epoch": 2.73385738349242, + "grad_norm": 0.5998890995979309, + "learning_rate": 2.3785020674230443e-07, + "loss": 0.3, + "step": 9738 + }, + { + "epoch": 2.7341381246490735, + "grad_norm": 0.5561297535896301, + "learning_rate": 2.3735268392379995e-07, + "loss": 0.3644, + "step": 9739 + }, + { + "epoch": 2.7344188658057273, + "grad_norm": 0.5706725716590881, + "learning_rate": 2.368556693454005e-07, + "loss": 0.335, + "step": 9740 + }, + { + "epoch": 2.734699606962381, + "grad_norm": 0.5220150351524353, + "learning_rate": 2.3635916306014362e-07, + "loss": 0.3158, + "step": 9741 + }, + { + "epoch": 2.7349803481190342, + "grad_norm": 0.5619484186172485, + "learning_rate": 2.3586316512101416e-07, + "loss": 0.3397, + "step": 9742 + }, + { + "epoch": 2.7352610892756877, + "grad_norm": 0.5048370361328125, + "learning_rate": 2.3536767558094142e-07, + "loss": 0.3369, + "step": 9743 + }, + { + "epoch": 2.735541830432341, + "grad_norm": 0.5645469427108765, + "learning_rate": 2.3487269449280037e-07, + "loss": 0.3028, + "step": 9744 + }, + { + "epoch": 2.735822571588995, + "grad_norm": 0.5247581005096436, + "learning_rate": 2.343782219094143e-07, + "loss": 0.3171, + "step": 9745 + }, + { + "epoch": 2.7361033127456484, + "grad_norm": 0.571412980556488, + "learning_rate": 2.3388425788354762e-07, + "loss": 0.317, + "step": 9746 + }, + { + "epoch": 2.7363840539023023, + "grad_norm": 0.5135009288787842, + "learning_rate": 2.333908024679149e-07, + "loss": 0.2874, + "step": 9747 + }, + { + "epoch": 2.7366647950589558, + "grad_norm": 0.49205532670021057, + "learning_rate": 2.3289785571517398e-07, + "loss": 0.3548, + "step": 9748 + }, + { + "epoch": 2.736945536215609, + "grad_norm": 0.5593763589859009, + "learning_rate": 2.3240541767793002e-07, + "loss": 0.3716, + "step": 9749 + }, + { + "epoch": 2.7372262773722627, + "grad_norm": 0.5323165059089661, + "learning_rate": 2.319134884087315e-07, + "loss": 0.2951, + "step": 9750 + }, + { + "epoch": 2.737507018528916, + "grad_norm": 0.5595327615737915, + "learning_rate": 2.3142206796007484e-07, + "loss": 0.3389, + "step": 9751 + }, + { + "epoch": 2.73778775968557, + "grad_norm": 0.5712656378746033, + "learning_rate": 2.3093115638440134e-07, + "loss": 0.336, + "step": 9752 + }, + { + "epoch": 2.7380685008422234, + "grad_norm": 0.5395691394805908, + "learning_rate": 2.3044075373409746e-07, + "loss": 0.3375, + "step": 9753 + }, + { + "epoch": 2.7383492419988773, + "grad_norm": 0.6070802211761475, + "learning_rate": 2.2995086006149746e-07, + "loss": 0.3286, + "step": 9754 + }, + { + "epoch": 2.7386299831555307, + "grad_norm": 0.6212032437324524, + "learning_rate": 2.2946147541887788e-07, + "loss": 0.2668, + "step": 9755 + }, + { + "epoch": 2.738910724312184, + "grad_norm": 0.5324410200119019, + "learning_rate": 2.2897259985846355e-07, + "loss": 0.3187, + "step": 9756 + }, + { + "epoch": 2.7391914654688376, + "grad_norm": 0.5174353122711182, + "learning_rate": 2.2848423343242498e-07, + "loss": 0.3398, + "step": 9757 + }, + { + "epoch": 2.739472206625491, + "grad_norm": 0.5650909543037415, + "learning_rate": 2.2799637619287606e-07, + "loss": 0.3077, + "step": 9758 + }, + { + "epoch": 2.739752947782145, + "grad_norm": 0.5459592938423157, + "learning_rate": 2.2750902819187903e-07, + "loss": 0.3049, + "step": 9759 + }, + { + "epoch": 2.7400336889387984, + "grad_norm": 0.5758468508720398, + "learning_rate": 2.2702218948144061e-07, + "loss": 0.3751, + "step": 9760 + }, + { + "epoch": 2.740314430095452, + "grad_norm": 0.5374598503112793, + "learning_rate": 2.2653586011351259e-07, + "loss": 0.3028, + "step": 9761 + }, + { + "epoch": 2.7405951712521057, + "grad_norm": 0.5994312167167664, + "learning_rate": 2.2605004013999343e-07, + "loss": 0.3111, + "step": 9762 + }, + { + "epoch": 2.740875912408759, + "grad_norm": 0.6063372492790222, + "learning_rate": 2.2556472961272668e-07, + "loss": 0.3272, + "step": 9763 + }, + { + "epoch": 2.7411566535654126, + "grad_norm": 0.5221341252326965, + "learning_rate": 2.2507992858350258e-07, + "loss": 0.3493, + "step": 9764 + }, + { + "epoch": 2.741437394722066, + "grad_norm": 0.5699635148048401, + "learning_rate": 2.2459563710405473e-07, + "loss": 0.3177, + "step": 9765 + }, + { + "epoch": 2.74171813587872, + "grad_norm": 0.6553797721862793, + "learning_rate": 2.241118552260635e-07, + "loss": 0.3737, + "step": 9766 + }, + { + "epoch": 2.7419988770353734, + "grad_norm": 0.5036535859107971, + "learning_rate": 2.2362858300115696e-07, + "loss": 0.323, + "step": 9767 + }, + { + "epoch": 2.742279618192027, + "grad_norm": 0.5880960822105408, + "learning_rate": 2.231458204809045e-07, + "loss": 0.3428, + "step": 9768 + }, + { + "epoch": 2.7425603593486807, + "grad_norm": 0.6233805418014526, + "learning_rate": 2.2266356771682596e-07, + "loss": 0.3174, + "step": 9769 + }, + { + "epoch": 2.742841100505334, + "grad_norm": 0.5725235939025879, + "learning_rate": 2.2218182476038298e-07, + "loss": 0.3382, + "step": 9770 + }, + { + "epoch": 2.7431218416619876, + "grad_norm": 0.5659641027450562, + "learning_rate": 2.2170059166298386e-07, + "loss": 0.3004, + "step": 9771 + }, + { + "epoch": 2.743402582818641, + "grad_norm": 0.5155832767486572, + "learning_rate": 2.2121986847598364e-07, + "loss": 0.329, + "step": 9772 + }, + { + "epoch": 2.743683323975295, + "grad_norm": 0.48419222235679626, + "learning_rate": 2.207396552506813e-07, + "loss": 0.3647, + "step": 9773 + }, + { + "epoch": 2.7439640651319483, + "grad_norm": 0.5316529870033264, + "learning_rate": 2.202599520383236e-07, + "loss": 0.3126, + "step": 9774 + }, + { + "epoch": 2.744244806288602, + "grad_norm": 0.5338003039360046, + "learning_rate": 2.1978075889009965e-07, + "loss": 0.3122, + "step": 9775 + }, + { + "epoch": 2.7445255474452557, + "grad_norm": 0.6163437962532043, + "learning_rate": 2.1930207585714736e-07, + "loss": 0.3474, + "step": 9776 + }, + { + "epoch": 2.744806288601909, + "grad_norm": 0.56536865234375, + "learning_rate": 2.1882390299054867e-07, + "loss": 0.2856, + "step": 9777 + }, + { + "epoch": 2.7450870297585626, + "grad_norm": 0.563823938369751, + "learning_rate": 2.1834624034133002e-07, + "loss": 0.2931, + "step": 9778 + }, + { + "epoch": 2.745367770915216, + "grad_norm": 0.548951268196106, + "learning_rate": 2.1786908796046612e-07, + "loss": 0.3462, + "step": 9779 + }, + { + "epoch": 2.74564851207187, + "grad_norm": 0.5613840818405151, + "learning_rate": 2.1739244589887464e-07, + "loss": 0.2883, + "step": 9780 + }, + { + "epoch": 2.7459292532285233, + "grad_norm": 0.5726778507232666, + "learning_rate": 2.1691631420742043e-07, + "loss": 0.2636, + "step": 9781 + }, + { + "epoch": 2.7462099943851768, + "grad_norm": 0.5685061812400818, + "learning_rate": 2.1644069293691338e-07, + "loss": 0.3302, + "step": 9782 + }, + { + "epoch": 2.7464907355418307, + "grad_norm": 0.5521758794784546, + "learning_rate": 2.1596558213810735e-07, + "loss": 0.3433, + "step": 9783 + }, + { + "epoch": 2.746771476698484, + "grad_norm": 0.5658281445503235, + "learning_rate": 2.154909818617057e-07, + "loss": 0.3486, + "step": 9784 + }, + { + "epoch": 2.7470522178551375, + "grad_norm": 0.6052652597427368, + "learning_rate": 2.1501689215835342e-07, + "loss": 0.2884, + "step": 9785 + }, + { + "epoch": 2.747332959011791, + "grad_norm": 0.5344831347465515, + "learning_rate": 2.1454331307864229e-07, + "loss": 0.3386, + "step": 9786 + }, + { + "epoch": 2.7476137001684444, + "grad_norm": 0.553074836730957, + "learning_rate": 2.1407024467311077e-07, + "loss": 0.3124, + "step": 9787 + }, + { + "epoch": 2.7478944413250983, + "grad_norm": 0.539045512676239, + "learning_rate": 2.1359768699224015e-07, + "loss": 0.3238, + "step": 9788 + }, + { + "epoch": 2.7481751824817517, + "grad_norm": 0.5671885013580322, + "learning_rate": 2.1312564008646008e-07, + "loss": 0.3491, + "step": 9789 + }, + { + "epoch": 2.7484559236384056, + "grad_norm": 0.5347363352775574, + "learning_rate": 2.126541040061436e-07, + "loss": 0.3759, + "step": 9790 + }, + { + "epoch": 2.748736664795059, + "grad_norm": 0.5233283638954163, + "learning_rate": 2.1218307880161105e-07, + "loss": 0.3282, + "step": 9791 + }, + { + "epoch": 2.7490174059517125, + "grad_norm": 0.5441765785217285, + "learning_rate": 2.1171256452312717e-07, + "loss": 0.3173, + "step": 9792 + }, + { + "epoch": 2.749298147108366, + "grad_norm": 0.4917449951171875, + "learning_rate": 2.1124256122090125e-07, + "loss": 0.3337, + "step": 9793 + }, + { + "epoch": 2.7495788882650194, + "grad_norm": 0.5290191769599915, + "learning_rate": 2.1077306894509042e-07, + "loss": 0.338, + "step": 9794 + }, + { + "epoch": 2.7498596294216733, + "grad_norm": 0.5983152985572815, + "learning_rate": 2.1030408774579515e-07, + "loss": 0.2917, + "step": 9795 + }, + { + "epoch": 2.7501403705783267, + "grad_norm": 0.46865564584732056, + "learning_rate": 2.0983561767306314e-07, + "loss": 0.3288, + "step": 9796 + }, + { + "epoch": 2.7504211117349806, + "grad_norm": 0.5375168919563293, + "learning_rate": 2.0936765877688504e-07, + "loss": 0.3532, + "step": 9797 + }, + { + "epoch": 2.750701852891634, + "grad_norm": 0.5457127690315247, + "learning_rate": 2.0890021110719972e-07, + "loss": 0.3202, + "step": 9798 + }, + { + "epoch": 2.7509825940482875, + "grad_norm": 0.51783287525177, + "learning_rate": 2.0843327471389063e-07, + "loss": 0.3116, + "step": 9799 + }, + { + "epoch": 2.751263335204941, + "grad_norm": 0.6021425127983093, + "learning_rate": 2.079668496467846e-07, + "loss": 0.3255, + "step": 9800 + }, + { + "epoch": 2.7515440763615944, + "grad_norm": 0.5214225649833679, + "learning_rate": 2.0750093595565735e-07, + "loss": 0.3233, + "step": 9801 + }, + { + "epoch": 2.7518248175182483, + "grad_norm": 0.5541335940361023, + "learning_rate": 2.0703553369022743e-07, + "loss": 0.3285, + "step": 9802 + }, + { + "epoch": 2.7521055586749017, + "grad_norm": 0.49332740902900696, + "learning_rate": 2.0657064290016015e-07, + "loss": 0.3281, + "step": 9803 + }, + { + "epoch": 2.7523862998315556, + "grad_norm": 0.5509899854660034, + "learning_rate": 2.0610626363506526e-07, + "loss": 0.3498, + "step": 9804 + }, + { + "epoch": 2.752667040988209, + "grad_norm": 0.5894120931625366, + "learning_rate": 2.0564239594449863e-07, + "loss": 0.2912, + "step": 9805 + }, + { + "epoch": 2.7529477821448625, + "grad_norm": 0.5687708258628845, + "learning_rate": 2.051790398779613e-07, + "loss": 0.3523, + "step": 9806 + }, + { + "epoch": 2.753228523301516, + "grad_norm": 0.4712930917739868, + "learning_rate": 2.0471619548489974e-07, + "loss": 0.3269, + "step": 9807 + }, + { + "epoch": 2.7535092644581693, + "grad_norm": 0.46517688035964966, + "learning_rate": 2.0425386281470617e-07, + "loss": 0.315, + "step": 9808 + }, + { + "epoch": 2.7537900056148232, + "grad_norm": 0.6015969514846802, + "learning_rate": 2.037920419167172e-07, + "loss": 0.3251, + "step": 9809 + }, + { + "epoch": 2.7540707467714767, + "grad_norm": 0.5469324588775635, + "learning_rate": 2.0333073284021565e-07, + "loss": 0.3286, + "step": 9810 + }, + { + "epoch": 2.75435148792813, + "grad_norm": 0.5498746037483215, + "learning_rate": 2.0286993563443048e-07, + "loss": 0.3157, + "step": 9811 + }, + { + "epoch": 2.754632229084784, + "grad_norm": 0.5221884846687317, + "learning_rate": 2.02409650348534e-07, + "loss": 0.3367, + "step": 9812 + }, + { + "epoch": 2.7549129702414374, + "grad_norm": 0.49796751141548157, + "learning_rate": 2.0194987703164582e-07, + "loss": 0.2977, + "step": 9813 + }, + { + "epoch": 2.755193711398091, + "grad_norm": 0.5357893109321594, + "learning_rate": 2.0149061573283003e-07, + "loss": 0.3371, + "step": 9814 + }, + { + "epoch": 2.7554744525547443, + "grad_norm": 0.5246144533157349, + "learning_rate": 2.0103186650109462e-07, + "loss": 0.2993, + "step": 9815 + }, + { + "epoch": 2.755755193711398, + "grad_norm": 0.5692760348320007, + "learning_rate": 2.005736293853966e-07, + "loss": 0.3415, + "step": 9816 + }, + { + "epoch": 2.7560359348680517, + "grad_norm": 0.48902472853660583, + "learning_rate": 2.0011590443463403e-07, + "loss": 0.3486, + "step": 9817 + }, + { + "epoch": 2.756316676024705, + "grad_norm": 0.5468211770057678, + "learning_rate": 1.996586916976545e-07, + "loss": 0.2888, + "step": 9818 + }, + { + "epoch": 2.756597417181359, + "grad_norm": 0.5358859300613403, + "learning_rate": 1.992019912232479e-07, + "loss": 0.3376, + "step": 9819 + }, + { + "epoch": 2.7568781583380124, + "grad_norm": 0.50605309009552, + "learning_rate": 1.987458030601508e-07, + "loss": 0.342, + "step": 9820 + }, + { + "epoch": 2.757158899494666, + "grad_norm": 0.5235331058502197, + "learning_rate": 1.982901272570442e-07, + "loss": 0.3184, + "step": 9821 + }, + { + "epoch": 2.7574396406513193, + "grad_norm": 0.5272335410118103, + "learning_rate": 1.978349638625554e-07, + "loss": 0.3314, + "step": 9822 + }, + { + "epoch": 2.757720381807973, + "grad_norm": 0.5786486268043518, + "learning_rate": 1.973803129252566e-07, + "loss": 0.32, + "step": 9823 + }, + { + "epoch": 2.7580011229646266, + "grad_norm": 0.5336734056472778, + "learning_rate": 1.9692617449366514e-07, + "loss": 0.3584, + "step": 9824 + }, + { + "epoch": 2.75828186412128, + "grad_norm": 0.5181218981742859, + "learning_rate": 1.9647254861624444e-07, + "loss": 0.3429, + "step": 9825 + }, + { + "epoch": 2.758562605277934, + "grad_norm": 0.5494160652160645, + "learning_rate": 1.9601943534140245e-07, + "loss": 0.3301, + "step": 9826 + }, + { + "epoch": 2.7588433464345874, + "grad_norm": 0.5143555998802185, + "learning_rate": 1.955668347174916e-07, + "loss": 0.297, + "step": 9827 + }, + { + "epoch": 2.759124087591241, + "grad_norm": 0.5042095184326172, + "learning_rate": 1.951147467928116e-07, + "loss": 0.3044, + "step": 9828 + }, + { + "epoch": 2.7594048287478943, + "grad_norm": 0.5449483394622803, + "learning_rate": 1.9466317161560556e-07, + "loss": 0.2934, + "step": 9829 + }, + { + "epoch": 2.759685569904548, + "grad_norm": 0.5260158777236938, + "learning_rate": 1.942121092340643e-07, + "loss": 0.3321, + "step": 9830 + }, + { + "epoch": 2.7599663110612016, + "grad_norm": 0.5834680199623108, + "learning_rate": 1.9376155969632104e-07, + "loss": 0.3181, + "step": 9831 + }, + { + "epoch": 2.760247052217855, + "grad_norm": 0.6096001863479614, + "learning_rate": 1.9331152305045674e-07, + "loss": 0.3585, + "step": 9832 + }, + { + "epoch": 2.760527793374509, + "grad_norm": 0.5205704569816589, + "learning_rate": 1.9286199934449578e-07, + "loss": 0.3308, + "step": 9833 + }, + { + "epoch": 2.7608085345311624, + "grad_norm": 0.6398050785064697, + "learning_rate": 1.9241298862640757e-07, + "loss": 0.3051, + "step": 9834 + }, + { + "epoch": 2.761089275687816, + "grad_norm": 0.6154297590255737, + "learning_rate": 1.9196449094410985e-07, + "loss": 0.2986, + "step": 9835 + }, + { + "epoch": 2.7613700168444693, + "grad_norm": 0.5259007215499878, + "learning_rate": 1.9151650634546214e-07, + "loss": 0.3218, + "step": 9836 + }, + { + "epoch": 2.7616507580011227, + "grad_norm": 0.571379542350769, + "learning_rate": 1.9106903487827067e-07, + "loss": 0.328, + "step": 9837 + }, + { + "epoch": 2.7619314991577766, + "grad_norm": 0.5834273099899292, + "learning_rate": 1.9062207659028774e-07, + "loss": 0.3255, + "step": 9838 + }, + { + "epoch": 2.76221224031443, + "grad_norm": 0.5143601894378662, + "learning_rate": 1.9017563152920803e-07, + "loss": 0.3265, + "step": 9839 + }, + { + "epoch": 2.762492981471084, + "grad_norm": 0.5301283001899719, + "learning_rate": 1.8972969974267564e-07, + "loss": 0.3052, + "step": 9840 + }, + { + "epoch": 2.7627737226277373, + "grad_norm": 0.6125536561012268, + "learning_rate": 1.8928428127827693e-07, + "loss": 0.2915, + "step": 9841 + }, + { + "epoch": 2.763054463784391, + "grad_norm": 0.5684979557991028, + "learning_rate": 1.8883937618354275e-07, + "loss": 0.3369, + "step": 9842 + }, + { + "epoch": 2.7633352049410442, + "grad_norm": 0.5631262063980103, + "learning_rate": 1.883949845059524e-07, + "loss": 0.3139, + "step": 9843 + }, + { + "epoch": 2.7636159460976977, + "grad_norm": 0.620158851146698, + "learning_rate": 1.8795110629292734e-07, + "loss": 0.3624, + "step": 9844 + }, + { + "epoch": 2.7638966872543516, + "grad_norm": 0.5072607398033142, + "learning_rate": 1.8750774159183693e-07, + "loss": 0.3445, + "step": 9845 + }, + { + "epoch": 2.764177428411005, + "grad_norm": 0.5696941614151001, + "learning_rate": 1.8706489044999222e-07, + "loss": 0.3293, + "step": 9846 + }, + { + "epoch": 2.764458169567659, + "grad_norm": 0.591693103313446, + "learning_rate": 1.866225529146537e-07, + "loss": 0.3516, + "step": 9847 + }, + { + "epoch": 2.7647389107243123, + "grad_norm": 0.5637592077255249, + "learning_rate": 1.861807290330242e-07, + "loss": 0.2612, + "step": 9848 + }, + { + "epoch": 2.7650196518809658, + "grad_norm": 0.5399445295333862, + "learning_rate": 1.85739418852251e-07, + "loss": 0.3225, + "step": 9849 + }, + { + "epoch": 2.765300393037619, + "grad_norm": 0.5351146459579468, + "learning_rate": 1.8529862241942975e-07, + "loss": 0.3085, + "step": 9850 + }, + { + "epoch": 2.7655811341942727, + "grad_norm": 0.528683066368103, + "learning_rate": 1.8485833978159895e-07, + "loss": 0.3321, + "step": 9851 + }, + { + "epoch": 2.7658618753509265, + "grad_norm": 0.5455690026283264, + "learning_rate": 1.8441857098574267e-07, + "loss": 0.3029, + "step": 9852 + }, + { + "epoch": 2.76614261650758, + "grad_norm": 0.5639652013778687, + "learning_rate": 1.839793160787906e-07, + "loss": 0.3232, + "step": 9853 + }, + { + "epoch": 2.7664233576642334, + "grad_norm": 0.5558152794837952, + "learning_rate": 1.8354057510761637e-07, + "loss": 0.327, + "step": 9854 + }, + { + "epoch": 2.7667040988208873, + "grad_norm": 0.546768307685852, + "learning_rate": 1.831023481190408e-07, + "loss": 0.3422, + "step": 9855 + }, + { + "epoch": 2.7669848399775407, + "grad_norm": 0.5269338488578796, + "learning_rate": 1.8266463515982824e-07, + "loss": 0.3338, + "step": 9856 + }, + { + "epoch": 2.767265581134194, + "grad_norm": 0.575153648853302, + "learning_rate": 1.8222743627668903e-07, + "loss": 0.3071, + "step": 9857 + }, + { + "epoch": 2.7675463222908476, + "grad_norm": 0.593867838382721, + "learning_rate": 1.8179075151627755e-07, + "loss": 0.3008, + "step": 9858 + }, + { + "epoch": 2.7678270634475015, + "grad_norm": 0.49975305795669556, + "learning_rate": 1.8135458092519485e-07, + "loss": 0.3043, + "step": 9859 + }, + { + "epoch": 2.768107804604155, + "grad_norm": 0.6041407585144043, + "learning_rate": 1.8091892454998595e-07, + "loss": 0.3608, + "step": 9860 + }, + { + "epoch": 2.7683885457608084, + "grad_norm": 0.5721468925476074, + "learning_rate": 1.8048378243714136e-07, + "loss": 0.3171, + "step": 9861 + }, + { + "epoch": 2.7686692869174623, + "grad_norm": 0.5866236090660095, + "learning_rate": 1.8004915463309792e-07, + "loss": 0.2997, + "step": 9862 + }, + { + "epoch": 2.7689500280741157, + "grad_norm": 0.5634253025054932, + "learning_rate": 1.7961504118423512e-07, + "loss": 0.3062, + "step": 9863 + }, + { + "epoch": 2.769230769230769, + "grad_norm": 0.6289180517196655, + "learning_rate": 1.7918144213687815e-07, + "loss": 0.3098, + "step": 9864 + }, + { + "epoch": 2.7695115103874226, + "grad_norm": 0.5639164447784424, + "learning_rate": 1.7874835753730003e-07, + "loss": 0.3309, + "step": 9865 + }, + { + "epoch": 2.7697922515440765, + "grad_norm": 0.5672398209571838, + "learning_rate": 1.7831578743171484e-07, + "loss": 0.2969, + "step": 9866 + }, + { + "epoch": 2.77007299270073, + "grad_norm": 0.5080732703208923, + "learning_rate": 1.7788373186628573e-07, + "loss": 0.2991, + "step": 9867 + }, + { + "epoch": 2.7703537338573834, + "grad_norm": 0.54532390832901, + "learning_rate": 1.774521908871174e-07, + "loss": 0.336, + "step": 9868 + }, + { + "epoch": 2.7706344750140373, + "grad_norm": 0.5791751146316528, + "learning_rate": 1.7702116454026196e-07, + "loss": 0.3353, + "step": 9869 + }, + { + "epoch": 2.7709152161706907, + "grad_norm": 0.4919031858444214, + "learning_rate": 1.7659065287171594e-07, + "loss": 0.3385, + "step": 9870 + }, + { + "epoch": 2.771195957327344, + "grad_norm": 0.5607175827026367, + "learning_rate": 1.7616065592742038e-07, + "loss": 0.3511, + "step": 9871 + }, + { + "epoch": 2.7714766984839976, + "grad_norm": 0.5476047396659851, + "learning_rate": 1.7573117375326242e-07, + "loss": 0.3253, + "step": 9872 + }, + { + "epoch": 2.7717574396406515, + "grad_norm": 0.6203597187995911, + "learning_rate": 1.7530220639507268e-07, + "loss": 0.3501, + "step": 9873 + }, + { + "epoch": 2.772038180797305, + "grad_norm": 0.5313006043434143, + "learning_rate": 1.748737538986295e-07, + "loss": 0.3248, + "step": 9874 + }, + { + "epoch": 2.7723189219539583, + "grad_norm": 0.5611987709999084, + "learning_rate": 1.7444581630965406e-07, + "loss": 0.3257, + "step": 9875 + }, + { + "epoch": 2.7725996631106122, + "grad_norm": 0.4884902536869049, + "learning_rate": 1.7401839367381213e-07, + "loss": 0.3458, + "step": 9876 + }, + { + "epoch": 2.7728804042672657, + "grad_norm": 0.5368293523788452, + "learning_rate": 1.735914860367166e-07, + "loss": 0.3392, + "step": 9877 + }, + { + "epoch": 2.773161145423919, + "grad_norm": 0.521256685256958, + "learning_rate": 1.731650934439244e-07, + "loss": 0.3374, + "step": 9878 + }, + { + "epoch": 2.7734418865805726, + "grad_norm": 0.49649372696876526, + "learning_rate": 1.7273921594093746e-07, + "loss": 0.3483, + "step": 9879 + }, + { + "epoch": 2.7737226277372264, + "grad_norm": 0.5162923336029053, + "learning_rate": 1.723138535732022e-07, + "loss": 0.3163, + "step": 9880 + }, + { + "epoch": 2.77400336889388, + "grad_norm": 0.5270166397094727, + "learning_rate": 1.7188900638611172e-07, + "loss": 0.3463, + "step": 9881 + }, + { + "epoch": 2.7742841100505333, + "grad_norm": 0.5457121133804321, + "learning_rate": 1.7146467442500203e-07, + "loss": 0.3815, + "step": 9882 + }, + { + "epoch": 2.774564851207187, + "grad_norm": 0.5218682289123535, + "learning_rate": 1.710408577351552e-07, + "loss": 0.317, + "step": 9883 + }, + { + "epoch": 2.7748455923638407, + "grad_norm": 0.5575658679008484, + "learning_rate": 1.7061755636179954e-07, + "loss": 0.3254, + "step": 9884 + }, + { + "epoch": 2.775126333520494, + "grad_norm": 0.5529207587242126, + "learning_rate": 1.7019477035010557e-07, + "loss": 0.369, + "step": 9885 + }, + { + "epoch": 2.7754070746771475, + "grad_norm": 0.5120636224746704, + "learning_rate": 1.6977249974519106e-07, + "loss": 0.362, + "step": 9886 + }, + { + "epoch": 2.775687815833801, + "grad_norm": 0.537596583366394, + "learning_rate": 1.6935074459211887e-07, + "loss": 0.3167, + "step": 9887 + }, + { + "epoch": 2.775968556990455, + "grad_norm": 0.4775805175304413, + "learning_rate": 1.6892950493589522e-07, + "loss": 0.3413, + "step": 9888 + }, + { + "epoch": 2.7762492981471083, + "grad_norm": 0.5533557534217834, + "learning_rate": 1.685087808214725e-07, + "loss": 0.3336, + "step": 9889 + }, + { + "epoch": 2.776530039303762, + "grad_norm": 0.5454351305961609, + "learning_rate": 1.6808857229374753e-07, + "loss": 0.3243, + "step": 9890 + }, + { + "epoch": 2.7768107804604156, + "grad_norm": 0.4986509382724762, + "learning_rate": 1.6766887939756227e-07, + "loss": 0.3571, + "step": 9891 + }, + { + "epoch": 2.777091521617069, + "grad_norm": 0.5733097791671753, + "learning_rate": 1.672497021777042e-07, + "loss": 0.3009, + "step": 9892 + }, + { + "epoch": 2.7773722627737225, + "grad_norm": 0.5357571244239807, + "learning_rate": 1.6683104067890422e-07, + "loss": 0.3498, + "step": 9893 + }, + { + "epoch": 2.777653003930376, + "grad_norm": 0.5690387487411499, + "learning_rate": 1.6641289494584102e-07, + "loss": 0.2982, + "step": 9894 + }, + { + "epoch": 2.77793374508703, + "grad_norm": 0.5264323353767395, + "learning_rate": 1.6599526502313502e-07, + "loss": 0.3467, + "step": 9895 + }, + { + "epoch": 2.7782144862436833, + "grad_norm": 0.4939902126789093, + "learning_rate": 1.655781509553539e-07, + "loss": 0.3609, + "step": 9896 + }, + { + "epoch": 2.778495227400337, + "grad_norm": 0.6071324944496155, + "learning_rate": 1.651615527870093e-07, + "loss": 0.3193, + "step": 9897 + }, + { + "epoch": 2.7787759685569906, + "grad_norm": 0.5301510095596313, + "learning_rate": 1.6474547056255785e-07, + "loss": 0.2884, + "step": 9898 + }, + { + "epoch": 2.779056709713644, + "grad_norm": 0.563073992729187, + "learning_rate": 1.6432990432640127e-07, + "loss": 0.3197, + "step": 9899 + }, + { + "epoch": 2.7793374508702975, + "grad_norm": 0.5243821740150452, + "learning_rate": 1.6391485412288576e-07, + "loss": 0.3203, + "step": 9900 + }, + { + "epoch": 2.779618192026951, + "grad_norm": 0.4988243579864502, + "learning_rate": 1.6350031999630423e-07, + "loss": 0.3656, + "step": 9901 + }, + { + "epoch": 2.779898933183605, + "grad_norm": 0.5599173903465271, + "learning_rate": 1.6308630199089238e-07, + "loss": 0.3034, + "step": 9902 + }, + { + "epoch": 2.7801796743402583, + "grad_norm": 0.5435174107551575, + "learning_rate": 1.6267280015083098e-07, + "loss": 0.298, + "step": 9903 + }, + { + "epoch": 2.7804604154969117, + "grad_norm": 0.6139850616455078, + "learning_rate": 1.6225981452024752e-07, + "loss": 0.3715, + "step": 9904 + }, + { + "epoch": 2.7807411566535656, + "grad_norm": 0.546872615814209, + "learning_rate": 1.6184734514321177e-07, + "loss": 0.3276, + "step": 9905 + }, + { + "epoch": 2.781021897810219, + "grad_norm": 0.5780566334724426, + "learning_rate": 1.614353920637418e-07, + "loss": 0.295, + "step": 9906 + }, + { + "epoch": 2.7813026389668725, + "grad_norm": 0.539276659488678, + "learning_rate": 1.6102395532579695e-07, + "loss": 0.3031, + "step": 9907 + }, + { + "epoch": 2.781583380123526, + "grad_norm": 0.550692617893219, + "learning_rate": 1.6061303497328485e-07, + "loss": 0.311, + "step": 9908 + }, + { + "epoch": 2.78186412128018, + "grad_norm": 0.601198673248291, + "learning_rate": 1.602026310500554e-07, + "loss": 0.3091, + "step": 9909 + }, + { + "epoch": 2.7821448624368332, + "grad_norm": 0.5254162549972534, + "learning_rate": 1.5979274359990415e-07, + "loss": 0.3718, + "step": 9910 + }, + { + "epoch": 2.7824256035934867, + "grad_norm": 0.5281869769096375, + "learning_rate": 1.5938337266657167e-07, + "loss": 0.3488, + "step": 9911 + }, + { + "epoch": 2.7827063447501406, + "grad_norm": 0.5763053894042969, + "learning_rate": 1.5897451829374465e-07, + "loss": 0.3154, + "step": 9912 + }, + { + "epoch": 2.782987085906794, + "grad_norm": 0.5088951587677002, + "learning_rate": 1.5856618052505157e-07, + "loss": 0.3292, + "step": 9913 + }, + { + "epoch": 2.7832678270634474, + "grad_norm": 0.5531498193740845, + "learning_rate": 1.5815835940406977e-07, + "loss": 0.3289, + "step": 9914 + }, + { + "epoch": 2.783548568220101, + "grad_norm": 0.5799394249916077, + "learning_rate": 1.5775105497431831e-07, + "loss": 0.3133, + "step": 9915 + }, + { + "epoch": 2.7838293093767548, + "grad_norm": 0.5850483775138855, + "learning_rate": 1.5734426727926245e-07, + "loss": 0.2798, + "step": 9916 + }, + { + "epoch": 2.784110050533408, + "grad_norm": 0.594656765460968, + "learning_rate": 1.5693799636231134e-07, + "loss": 0.3612, + "step": 9917 + }, + { + "epoch": 2.7843907916900617, + "grad_norm": 0.5652356147766113, + "learning_rate": 1.5653224226682085e-07, + "loss": 0.3407, + "step": 9918 + }, + { + "epoch": 2.7846715328467155, + "grad_norm": 0.5384714007377625, + "learning_rate": 1.561270050360897e-07, + "loss": 0.3086, + "step": 9919 + }, + { + "epoch": 2.784952274003369, + "grad_norm": 0.5557889342308044, + "learning_rate": 1.5572228471336214e-07, + "loss": 0.3134, + "step": 9920 + }, + { + "epoch": 2.7852330151600224, + "grad_norm": 0.5542265772819519, + "learning_rate": 1.5531808134182813e-07, + "loss": 0.3073, + "step": 9921 + }, + { + "epoch": 2.785513756316676, + "grad_norm": 0.5558714866638184, + "learning_rate": 1.549143949646209e-07, + "loss": 0.2906, + "step": 9922 + }, + { + "epoch": 2.7857944974733297, + "grad_norm": 0.5353521704673767, + "learning_rate": 1.5451122562482047e-07, + "loss": 0.3463, + "step": 9923 + }, + { + "epoch": 2.786075238629983, + "grad_norm": 0.5866934061050415, + "learning_rate": 1.5410857336545015e-07, + "loss": 0.3059, + "step": 9924 + }, + { + "epoch": 2.7863559797866366, + "grad_norm": 0.5464940667152405, + "learning_rate": 1.5370643822947784e-07, + "loss": 0.3245, + "step": 9925 + }, + { + "epoch": 2.7866367209432905, + "grad_norm": 0.5502282381057739, + "learning_rate": 1.5330482025981753e-07, + "loss": 0.3118, + "step": 9926 + }, + { + "epoch": 2.786917462099944, + "grad_norm": 0.5689045786857605, + "learning_rate": 1.5290371949932657e-07, + "loss": 0.3105, + "step": 9927 + }, + { + "epoch": 2.7871982032565974, + "grad_norm": 0.5704795718193054, + "learning_rate": 1.5250313599080913e-07, + "loss": 0.2706, + "step": 9928 + }, + { + "epoch": 2.787478944413251, + "grad_norm": 0.5281431674957275, + "learning_rate": 1.5210306977701205e-07, + "loss": 0.3107, + "step": 9929 + }, + { + "epoch": 2.7877596855699043, + "grad_norm": 0.551979124546051, + "learning_rate": 1.51703520900629e-07, + "loss": 0.2933, + "step": 9930 + }, + { + "epoch": 2.788040426726558, + "grad_norm": 0.6004064083099365, + "learning_rate": 1.5130448940429644e-07, + "loss": 0.3244, + "step": 9931 + }, + { + "epoch": 2.7883211678832116, + "grad_norm": 0.5763236284255981, + "learning_rate": 1.509059753305958e-07, + "loss": 0.3288, + "step": 9932 + }, + { + "epoch": 2.7886019090398655, + "grad_norm": 0.5568984746932983, + "learning_rate": 1.5050797872205592e-07, + "loss": 0.3074, + "step": 9933 + }, + { + "epoch": 2.788882650196519, + "grad_norm": 0.5835351347923279, + "learning_rate": 1.501104996211472e-07, + "loss": 0.2962, + "step": 9934 + }, + { + "epoch": 2.7891633913531724, + "grad_norm": 0.5246619582176208, + "learning_rate": 1.4971353807028688e-07, + "loss": 0.2898, + "step": 9935 + }, + { + "epoch": 2.789444132509826, + "grad_norm": 0.5604928731918335, + "learning_rate": 1.4931709411183493e-07, + "loss": 0.3153, + "step": 9936 + }, + { + "epoch": 2.7897248736664793, + "grad_norm": 0.48893657326698303, + "learning_rate": 1.4892116778809863e-07, + "loss": 0.3509, + "step": 9937 + }, + { + "epoch": 2.790005614823133, + "grad_norm": 0.5521307587623596, + "learning_rate": 1.485257591413286e-07, + "loss": 0.3234, + "step": 9938 + }, + { + "epoch": 2.7902863559797866, + "grad_norm": 0.5598009824752808, + "learning_rate": 1.4813086821372003e-07, + "loss": 0.3583, + "step": 9939 + }, + { + "epoch": 2.7905670971364405, + "grad_norm": 0.5937865376472473, + "learning_rate": 1.4773649504741417e-07, + "loss": 0.3402, + "step": 9940 + }, + { + "epoch": 2.790847838293094, + "grad_norm": 0.6012153029441833, + "learning_rate": 1.4734263968449515e-07, + "loss": 0.3428, + "step": 9941 + }, + { + "epoch": 2.7911285794497473, + "grad_norm": 0.5403911471366882, + "learning_rate": 1.469493021669921e-07, + "loss": 0.3261, + "step": 9942 + }, + { + "epoch": 2.791409320606401, + "grad_norm": 0.505204439163208, + "learning_rate": 1.4655648253688094e-07, + "loss": 0.2905, + "step": 9943 + }, + { + "epoch": 2.7916900617630542, + "grad_norm": 0.5365025401115417, + "learning_rate": 1.461641808360803e-07, + "loss": 0.3105, + "step": 9944 + }, + { + "epoch": 2.791970802919708, + "grad_norm": 0.5731824636459351, + "learning_rate": 1.4577239710645452e-07, + "loss": 0.3012, + "step": 9945 + }, + { + "epoch": 2.7922515440763616, + "grad_norm": 0.5721343159675598, + "learning_rate": 1.453811313898118e-07, + "loss": 0.3647, + "step": 9946 + }, + { + "epoch": 2.792532285233015, + "grad_norm": 0.5556032657623291, + "learning_rate": 1.4499038372790596e-07, + "loss": 0.3482, + "step": 9947 + }, + { + "epoch": 2.792813026389669, + "grad_norm": 0.5744494795799255, + "learning_rate": 1.4460015416243534e-07, + "loss": 0.3327, + "step": 9948 + }, + { + "epoch": 2.7930937675463223, + "grad_norm": 0.564821183681488, + "learning_rate": 1.442104427350416e-07, + "loss": 0.3467, + "step": 9949 + }, + { + "epoch": 2.7933745087029758, + "grad_norm": 0.5284962058067322, + "learning_rate": 1.4382124948731423e-07, + "loss": 0.3323, + "step": 9950 + }, + { + "epoch": 2.793655249859629, + "grad_norm": 0.5517900586128235, + "learning_rate": 1.4343257446078397e-07, + "loss": 0.3577, + "step": 9951 + }, + { + "epoch": 2.793935991016283, + "grad_norm": 0.5161297917366028, + "learning_rate": 1.4304441769692867e-07, + "loss": 0.3377, + "step": 9952 + }, + { + "epoch": 2.7942167321729365, + "grad_norm": 0.5578047633171082, + "learning_rate": 1.426567792371697e-07, + "loss": 0.2868, + "step": 9953 + }, + { + "epoch": 2.79449747332959, + "grad_norm": 0.5168033838272095, + "learning_rate": 1.4226965912287282e-07, + "loss": 0.3105, + "step": 9954 + }, + { + "epoch": 2.794778214486244, + "grad_norm": 0.5163038969039917, + "learning_rate": 1.4188305739535059e-07, + "loss": 0.3419, + "step": 9955 + }, + { + "epoch": 2.7950589556428973, + "grad_norm": 0.5863468050956726, + "learning_rate": 1.4149697409585715e-07, + "loss": 0.3107, + "step": 9956 + }, + { + "epoch": 2.7953396967995507, + "grad_norm": 0.5861634612083435, + "learning_rate": 1.411114092655941e-07, + "loss": 0.279, + "step": 9957 + }, + { + "epoch": 2.795620437956204, + "grad_norm": 0.617355465888977, + "learning_rate": 1.4072636294570617e-07, + "loss": 0.3253, + "step": 9958 + }, + { + "epoch": 2.795901179112858, + "grad_norm": 0.5366426706314087, + "learning_rate": 1.4034183517728229e-07, + "loss": 0.3507, + "step": 9959 + }, + { + "epoch": 2.7961819202695115, + "grad_norm": 0.556643009185791, + "learning_rate": 1.3995782600135787e-07, + "loss": 0.3296, + "step": 9960 + }, + { + "epoch": 2.796462661426165, + "grad_norm": 0.5481691956520081, + "learning_rate": 1.395743354589113e-07, + "loss": 0.3096, + "step": 9961 + }, + { + "epoch": 2.796743402582819, + "grad_norm": 0.5889233350753784, + "learning_rate": 1.3919136359086703e-07, + "loss": 0.3071, + "step": 9962 + }, + { + "epoch": 2.7970241437394723, + "grad_norm": 0.465064138174057, + "learning_rate": 1.3880891043809296e-07, + "loss": 0.2922, + "step": 9963 + }, + { + "epoch": 2.7973048848961257, + "grad_norm": 0.5425115823745728, + "learning_rate": 1.3842697604140198e-07, + "loss": 0.3441, + "step": 9964 + }, + { + "epoch": 2.797585626052779, + "grad_norm": 0.5827770233154297, + "learning_rate": 1.380455604415526e-07, + "loss": 0.309, + "step": 9965 + }, + { + "epoch": 2.797866367209433, + "grad_norm": 0.6112386584281921, + "learning_rate": 1.3766466367924557e-07, + "loss": 0.3407, + "step": 9966 + }, + { + "epoch": 2.7981471083660865, + "grad_norm": 0.5571935176849365, + "learning_rate": 1.3728428579513008e-07, + "loss": 0.3261, + "step": 9967 + }, + { + "epoch": 2.79842784952274, + "grad_norm": 0.5710159540176392, + "learning_rate": 1.3690442682979588e-07, + "loss": 0.3388, + "step": 9968 + }, + { + "epoch": 2.798708590679394, + "grad_norm": 0.5367956161499023, + "learning_rate": 1.3652508682377886e-07, + "loss": 0.3387, + "step": 9969 + }, + { + "epoch": 2.7989893318360473, + "grad_norm": 0.5501680970191956, + "learning_rate": 1.3614626581756164e-07, + "loss": 0.2838, + "step": 9970 + }, + { + "epoch": 2.7992700729927007, + "grad_norm": 0.5452075600624084, + "learning_rate": 1.35767963851568e-07, + "loss": 0.3137, + "step": 9971 + }, + { + "epoch": 2.799550814149354, + "grad_norm": 0.5247623920440674, + "learning_rate": 1.3539018096616897e-07, + "loss": 0.3196, + "step": 9972 + }, + { + "epoch": 2.799831555306008, + "grad_norm": 0.5177823901176453, + "learning_rate": 1.3501291720167898e-07, + "loss": 0.3287, + "step": 9973 + }, + { + "epoch": 2.8001122964626615, + "grad_norm": 0.5358867645263672, + "learning_rate": 1.3463617259835639e-07, + "loss": 0.3494, + "step": 9974 + }, + { + "epoch": 2.800393037619315, + "grad_norm": 0.5144773721694946, + "learning_rate": 1.3425994719640622e-07, + "loss": 0.3313, + "step": 9975 + }, + { + "epoch": 2.800673778775969, + "grad_norm": 0.5704407691955566, + "learning_rate": 1.3388424103597586e-07, + "loss": 0.3092, + "step": 9976 + }, + { + "epoch": 2.8009545199326222, + "grad_norm": 0.5049776434898376, + "learning_rate": 1.3350905415715986e-07, + "loss": 0.3522, + "step": 9977 + }, + { + "epoch": 2.8012352610892757, + "grad_norm": 0.5009602308273315, + "learning_rate": 1.33134386599994e-07, + "loss": 0.361, + "step": 9978 + }, + { + "epoch": 2.801516002245929, + "grad_norm": 0.5368480086326599, + "learning_rate": 1.3276023840446183e-07, + "loss": 0.3301, + "step": 9979 + }, + { + "epoch": 2.8017967434025826, + "grad_norm": 0.5960638523101807, + "learning_rate": 1.3238660961049033e-07, + "loss": 0.3122, + "step": 9980 + }, + { + "epoch": 2.8020774845592364, + "grad_norm": 0.4748612344264984, + "learning_rate": 1.3201350025794924e-07, + "loss": 0.3104, + "step": 9981 + }, + { + "epoch": 2.80235822571589, + "grad_norm": 0.5358529686927795, + "learning_rate": 1.316409103866556e-07, + "loss": 0.324, + "step": 9982 + }, + { + "epoch": 2.8026389668725438, + "grad_norm": 0.6039599776268005, + "learning_rate": 1.312688400363693e-07, + "loss": 0.3326, + "step": 9983 + }, + { + "epoch": 2.802919708029197, + "grad_norm": 0.5407767295837402, + "learning_rate": 1.3089728924679634e-07, + "loss": 0.3457, + "step": 9984 + }, + { + "epoch": 2.8032004491858507, + "grad_norm": 0.5642867088317871, + "learning_rate": 1.3052625805758556e-07, + "loss": 0.3, + "step": 9985 + }, + { + "epoch": 2.803481190342504, + "grad_norm": 0.533231258392334, + "learning_rate": 1.3015574650833141e-07, + "loss": 0.3457, + "step": 9986 + }, + { + "epoch": 2.8037619314991575, + "grad_norm": 0.5869655609130859, + "learning_rate": 1.2978575463857168e-07, + "loss": 0.328, + "step": 9987 + }, + { + "epoch": 2.8040426726558114, + "grad_norm": 0.5910897254943848, + "learning_rate": 1.294162824877909e-07, + "loss": 0.3339, + "step": 9988 + }, + { + "epoch": 2.804323413812465, + "grad_norm": 0.573411226272583, + "learning_rate": 1.2904733009541647e-07, + "loss": 0.2916, + "step": 9989 + }, + { + "epoch": 2.8046041549691187, + "grad_norm": 0.5155426263809204, + "learning_rate": 1.2867889750082018e-07, + "loss": 0.357, + "step": 9990 + }, + { + "epoch": 2.804884896125772, + "grad_norm": 0.5490040183067322, + "learning_rate": 1.283109847433195e-07, + "loss": 0.3393, + "step": 9991 + }, + { + "epoch": 2.8051656372824256, + "grad_norm": 0.5404068827629089, + "learning_rate": 1.279435918621752e-07, + "loss": 0.3893, + "step": 9992 + }, + { + "epoch": 2.805446378439079, + "grad_norm": 0.5030194520950317, + "learning_rate": 1.2757671889659373e-07, + "loss": 0.3112, + "step": 9993 + }, + { + "epoch": 2.8057271195957325, + "grad_norm": 0.5610947608947754, + "learning_rate": 1.272103658857249e-07, + "loss": 0.3198, + "step": 9994 + }, + { + "epoch": 2.8060078607523864, + "grad_norm": 0.5328671336174011, + "learning_rate": 1.268445328686646e-07, + "loss": 0.3142, + "step": 9995 + }, + { + "epoch": 2.80628860190904, + "grad_norm": 0.5556824207305908, + "learning_rate": 1.264792198844511e-07, + "loss": 0.3467, + "step": 9996 + }, + { + "epoch": 2.8065693430656933, + "grad_norm": 0.5286175608634949, + "learning_rate": 1.2611442697206877e-07, + "loss": 0.3342, + "step": 9997 + }, + { + "epoch": 2.806850084222347, + "grad_norm": 0.5325221419334412, + "learning_rate": 1.257501541704459e-07, + "loss": 0.3327, + "step": 9998 + }, + { + "epoch": 2.8071308253790006, + "grad_norm": 0.5068479776382446, + "learning_rate": 1.2538640151845638e-07, + "loss": 0.3201, + "step": 9999 + }, + { + "epoch": 2.807411566535654, + "grad_norm": 0.5140798687934875, + "learning_rate": 1.2502316905491584e-07, + "loss": 0.3031, + "step": 10000 + }, + { + "epoch": 2.8076923076923075, + "grad_norm": 0.4957268238067627, + "learning_rate": 1.246604568185883e-07, + "loss": 0.2899, + "step": 10001 + }, + { + "epoch": 2.8079730488489614, + "grad_norm": 0.5943296551704407, + "learning_rate": 1.2429826484817887e-07, + "loss": 0.3311, + "step": 10002 + }, + { + "epoch": 2.808253790005615, + "grad_norm": 0.5997329354286194, + "learning_rate": 1.2393659318233775e-07, + "loss": 0.3807, + "step": 10003 + }, + { + "epoch": 2.8085345311622683, + "grad_norm": 0.553098738193512, + "learning_rate": 1.2357544185966187e-07, + "loss": 0.3146, + "step": 10004 + }, + { + "epoch": 2.808815272318922, + "grad_norm": 0.5032588839530945, + "learning_rate": 1.2321481091869035e-07, + "loss": 0.3575, + "step": 10005 + }, + { + "epoch": 2.8090960134755756, + "grad_norm": 0.5524049401283264, + "learning_rate": 1.2285470039790749e-07, + "loss": 0.3072, + "step": 10006 + }, + { + "epoch": 2.809376754632229, + "grad_norm": 0.5532103180885315, + "learning_rate": 1.224951103357419e-07, + "loss": 0.3417, + "step": 10007 + }, + { + "epoch": 2.8096574957888825, + "grad_norm": 0.504477322101593, + "learning_rate": 1.2213604077056685e-07, + "loss": 0.3161, + "step": 10008 + }, + { + "epoch": 2.8099382369455363, + "grad_norm": 0.5959299802780151, + "learning_rate": 1.2177749174070053e-07, + "loss": 0.2957, + "step": 10009 + }, + { + "epoch": 2.81021897810219, + "grad_norm": 0.565331757068634, + "learning_rate": 1.21419463284404e-07, + "loss": 0.3706, + "step": 10010 + }, + { + "epoch": 2.8104997192588432, + "grad_norm": 0.5657942295074463, + "learning_rate": 1.2106195543988454e-07, + "loss": 0.3318, + "step": 10011 + }, + { + "epoch": 2.810780460415497, + "grad_norm": 0.5270640254020691, + "learning_rate": 1.207049682452932e-07, + "loss": 0.34, + "step": 10012 + }, + { + "epoch": 2.8110612015721506, + "grad_norm": 0.5368057489395142, + "learning_rate": 1.2034850173872515e-07, + "loss": 0.3541, + "step": 10013 + }, + { + "epoch": 2.811341942728804, + "grad_norm": 0.5489338636398315, + "learning_rate": 1.19992555958221e-07, + "loss": 0.3014, + "step": 10014 + }, + { + "epoch": 2.8116226838854574, + "grad_norm": 0.5464556217193604, + "learning_rate": 1.1963713094176376e-07, + "loss": 0.3026, + "step": 10015 + }, + { + "epoch": 2.8119034250421113, + "grad_norm": 0.5182561278343201, + "learning_rate": 1.1928222672728363e-07, + "loss": 0.3132, + "step": 10016 + }, + { + "epoch": 2.8121841661987648, + "grad_norm": 0.5293217897415161, + "learning_rate": 1.1892784335265307e-07, + "loss": 0.3176, + "step": 10017 + }, + { + "epoch": 2.812464907355418, + "grad_norm": 0.5431877970695496, + "learning_rate": 1.1857398085568905e-07, + "loss": 0.3152, + "step": 10018 + }, + { + "epoch": 2.812745648512072, + "grad_norm": 0.5799314379692078, + "learning_rate": 1.1822063927415527e-07, + "loss": 0.2787, + "step": 10019 + }, + { + "epoch": 2.8130263896687255, + "grad_norm": 0.500160276889801, + "learning_rate": 1.1786781864575602e-07, + "loss": 0.2854, + "step": 10020 + }, + { + "epoch": 2.813307130825379, + "grad_norm": 0.5474427938461304, + "learning_rate": 1.1751551900814395e-07, + "loss": 0.2829, + "step": 10021 + }, + { + "epoch": 2.8135878719820324, + "grad_norm": 0.5582706332206726, + "learning_rate": 1.1716374039891288e-07, + "loss": 0.3442, + "step": 10022 + }, + { + "epoch": 2.813868613138686, + "grad_norm": 0.5029194951057434, + "learning_rate": 1.1681248285560332e-07, + "loss": 0.3481, + "step": 10023 + }, + { + "epoch": 2.8141493542953397, + "grad_norm": 0.5261446237564087, + "learning_rate": 1.1646174641569919e-07, + "loss": 0.3322, + "step": 10024 + }, + { + "epoch": 2.814430095451993, + "grad_norm": 0.5505539774894714, + "learning_rate": 1.1611153111662832e-07, + "loss": 0.3508, + "step": 10025 + }, + { + "epoch": 2.814710836608647, + "grad_norm": 0.5470914244651794, + "learning_rate": 1.1576183699576471e-07, + "loss": 0.3486, + "step": 10026 + }, + { + "epoch": 2.8149915777653005, + "grad_norm": 0.5789533853530884, + "learning_rate": 1.1541266409042406e-07, + "loss": 0.3233, + "step": 10027 + }, + { + "epoch": 2.815272318921954, + "grad_norm": 0.5578635334968567, + "learning_rate": 1.1506401243786935e-07, + "loss": 0.3194, + "step": 10028 + }, + { + "epoch": 2.8155530600786074, + "grad_norm": 0.522255003452301, + "learning_rate": 1.1471588207530527e-07, + "loss": 0.3624, + "step": 10029 + }, + { + "epoch": 2.815833801235261, + "grad_norm": 0.5245084166526794, + "learning_rate": 1.1436827303988263e-07, + "loss": 0.3684, + "step": 10030 + }, + { + "epoch": 2.8161145423919147, + "grad_norm": 0.5508833527565002, + "learning_rate": 1.1402118536869677e-07, + "loss": 0.3311, + "step": 10031 + }, + { + "epoch": 2.816395283548568, + "grad_norm": 0.6131346225738525, + "learning_rate": 1.136746190987853e-07, + "loss": 0.3566, + "step": 10032 + }, + { + "epoch": 2.816676024705222, + "grad_norm": 0.581041693687439, + "learning_rate": 1.1332857426713307e-07, + "loss": 0.3211, + "step": 10033 + }, + { + "epoch": 2.8169567658618755, + "grad_norm": 0.616783082485199, + "learning_rate": 1.1298305091066664e-07, + "loss": 0.2943, + "step": 10034 + }, + { + "epoch": 2.817237507018529, + "grad_norm": 0.5316537618637085, + "learning_rate": 1.1263804906625931e-07, + "loss": 0.3107, + "step": 10035 + }, + { + "epoch": 2.8175182481751824, + "grad_norm": 0.5256446599960327, + "learning_rate": 1.1229356877072662e-07, + "loss": 0.2897, + "step": 10036 + }, + { + "epoch": 2.817798989331836, + "grad_norm": 0.6676996350288391, + "learning_rate": 1.1194961006082972e-07, + "loss": 0.314, + "step": 10037 + }, + { + "epoch": 2.8180797304884897, + "grad_norm": 0.5410981178283691, + "learning_rate": 1.1160617297327314e-07, + "loss": 0.3493, + "step": 10038 + }, + { + "epoch": 2.818360471645143, + "grad_norm": 0.5171387195587158, + "learning_rate": 1.1126325754470701e-07, + "loss": 0.3107, + "step": 10039 + }, + { + "epoch": 2.8186412128017966, + "grad_norm": 0.5379522442817688, + "learning_rate": 1.109208638117254e-07, + "loss": 0.2837, + "step": 10040 + }, + { + "epoch": 2.8189219539584505, + "grad_norm": 0.5606611371040344, + "learning_rate": 1.1057899181086573e-07, + "loss": 0.298, + "step": 10041 + }, + { + "epoch": 2.819202695115104, + "grad_norm": 0.5114135146141052, + "learning_rate": 1.102376415786105e-07, + "loss": 0.3479, + "step": 10042 + }, + { + "epoch": 2.8194834362717573, + "grad_norm": 0.5463590025901794, + "learning_rate": 1.0989681315138778e-07, + "loss": 0.3068, + "step": 10043 + }, + { + "epoch": 2.819764177428411, + "grad_norm": 0.549858570098877, + "learning_rate": 1.0955650656556682e-07, + "loss": 0.3423, + "step": 10044 + }, + { + "epoch": 2.8200449185850647, + "grad_norm": 0.5742153525352478, + "learning_rate": 1.0921672185746357e-07, + "loss": 0.3147, + "step": 10045 + }, + { + "epoch": 2.820325659741718, + "grad_norm": 0.5294710397720337, + "learning_rate": 1.0887745906333846e-07, + "loss": 0.363, + "step": 10046 + }, + { + "epoch": 2.8206064008983716, + "grad_norm": 0.5256567001342773, + "learning_rate": 1.0853871821939477e-07, + "loss": 0.3615, + "step": 10047 + }, + { + "epoch": 2.8208871420550254, + "grad_norm": 0.49465903639793396, + "learning_rate": 1.0820049936178134e-07, + "loss": 0.3489, + "step": 10048 + }, + { + "epoch": 2.821167883211679, + "grad_norm": 0.5517958998680115, + "learning_rate": 1.0786280252659043e-07, + "loss": 0.3118, + "step": 10049 + }, + { + "epoch": 2.8214486243683323, + "grad_norm": 0.6105058193206787, + "learning_rate": 1.075256277498593e-07, + "loss": 0.3198, + "step": 10050 + }, + { + "epoch": 2.8217293655249858, + "grad_norm": 0.5564125776290894, + "learning_rate": 1.0718897506756865e-07, + "loss": 0.3476, + "step": 10051 + }, + { + "epoch": 2.8220101066816397, + "grad_norm": 0.5239488482475281, + "learning_rate": 1.0685284451564415e-07, + "loss": 0.3679, + "step": 10052 + }, + { + "epoch": 2.822290847838293, + "grad_norm": 0.52907395362854, + "learning_rate": 1.0651723612995546e-07, + "loss": 0.3131, + "step": 10053 + }, + { + "epoch": 2.8225715889949465, + "grad_norm": 0.5235008597373962, + "learning_rate": 1.061821499463167e-07, + "loss": 0.2992, + "step": 10054 + }, + { + "epoch": 2.8228523301516004, + "grad_norm": 0.6019278764724731, + "learning_rate": 1.058475860004865e-07, + "loss": 0.3487, + "step": 10055 + }, + { + "epoch": 2.823133071308254, + "grad_norm": 0.5207324624061584, + "learning_rate": 1.055135443281674e-07, + "loss": 0.3389, + "step": 10056 + }, + { + "epoch": 2.8234138124649073, + "grad_norm": 0.6004019379615784, + "learning_rate": 1.051800249650059e-07, + "loss": 0.342, + "step": 10057 + }, + { + "epoch": 2.8236945536215607, + "grad_norm": 0.5514563322067261, + "learning_rate": 1.0484702794659352e-07, + "loss": 0.3262, + "step": 10058 + }, + { + "epoch": 2.8239752947782146, + "grad_norm": 0.5341305732727051, + "learning_rate": 1.0451455330846461e-07, + "loss": 0.3119, + "step": 10059 + }, + { + "epoch": 2.824256035934868, + "grad_norm": 0.5418851375579834, + "learning_rate": 1.0418260108610023e-07, + "loss": 0.3104, + "step": 10060 + }, + { + "epoch": 2.8245367770915215, + "grad_norm": 0.5163307785987854, + "learning_rate": 1.0385117131492318e-07, + "loss": 0.2921, + "step": 10061 + }, + { + "epoch": 2.8248175182481754, + "grad_norm": 0.6473627090454102, + "learning_rate": 1.0352026403030235e-07, + "loss": 0.3064, + "step": 10062 + }, + { + "epoch": 2.825098259404829, + "grad_norm": 0.47515952587127686, + "learning_rate": 1.0318987926754897e-07, + "loss": 0.333, + "step": 10063 + }, + { + "epoch": 2.8253790005614823, + "grad_norm": 0.5817738175392151, + "learning_rate": 1.0286001706192095e-07, + "loss": 0.2753, + "step": 10064 + }, + { + "epoch": 2.8256597417181357, + "grad_norm": 0.5453835725784302, + "learning_rate": 1.02530677448619e-07, + "loss": 0.3077, + "step": 10065 + }, + { + "epoch": 2.8259404828747896, + "grad_norm": 0.5364516973495483, + "learning_rate": 1.0220186046278724e-07, + "loss": 0.3615, + "step": 10066 + }, + { + "epoch": 2.826221224031443, + "grad_norm": 0.5471426248550415, + "learning_rate": 1.0187356613951538e-07, + "loss": 0.3171, + "step": 10067 + }, + { + "epoch": 2.8265019651880965, + "grad_norm": 0.5084347724914551, + "learning_rate": 1.0154579451383817e-07, + "loss": 0.2998, + "step": 10068 + }, + { + "epoch": 2.8267827063447504, + "grad_norm": 0.4818313717842102, + "learning_rate": 1.0121854562073097e-07, + "loss": 0.311, + "step": 10069 + }, + { + "epoch": 2.827063447501404, + "grad_norm": 0.537496030330658, + "learning_rate": 1.0089181949511805e-07, + "loss": 0.3544, + "step": 10070 + }, + { + "epoch": 2.8273441886580573, + "grad_norm": 0.5626868009567261, + "learning_rate": 1.0056561617186378e-07, + "loss": 0.3284, + "step": 10071 + }, + { + "epoch": 2.8276249298147107, + "grad_norm": 0.5381502509117126, + "learning_rate": 1.0023993568578027e-07, + "loss": 0.3354, + "step": 10072 + }, + { + "epoch": 2.827905670971364, + "grad_norm": 0.5141575932502747, + "learning_rate": 9.991477807162086e-08, + "loss": 0.3409, + "step": 10073 + }, + { + "epoch": 2.828186412128018, + "grad_norm": 0.5125014185905457, + "learning_rate": 9.959014336408446e-08, + "loss": 0.3468, + "step": 10074 + }, + { + "epoch": 2.8284671532846715, + "grad_norm": 0.5192347168922424, + "learning_rate": 9.926603159781444e-08, + "loss": 0.3509, + "step": 10075 + }, + { + "epoch": 2.8287478944413254, + "grad_norm": 0.5120210647583008, + "learning_rate": 9.894244280739817e-08, + "loss": 0.3216, + "step": 10076 + }, + { + "epoch": 2.829028635597979, + "grad_norm": 0.5238515138626099, + "learning_rate": 9.861937702736635e-08, + "loss": 0.2914, + "step": 10077 + }, + { + "epoch": 2.8293093767546322, + "grad_norm": 0.5473514199256897, + "learning_rate": 9.829683429219528e-08, + "loss": 0.3048, + "step": 10078 + }, + { + "epoch": 2.8295901179112857, + "grad_norm": 0.5520949363708496, + "learning_rate": 9.79748146363041e-08, + "loss": 0.3543, + "step": 10079 + }, + { + "epoch": 2.829870859067939, + "grad_norm": 0.5224297642707825, + "learning_rate": 9.765331809405754e-08, + "loss": 0.2951, + "step": 10080 + }, + { + "epoch": 2.830151600224593, + "grad_norm": 0.5464485883712769, + "learning_rate": 9.73323446997626e-08, + "loss": 0.3872, + "step": 10081 + }, + { + "epoch": 2.8304323413812464, + "grad_norm": 0.5790408849716187, + "learning_rate": 9.701189448767245e-08, + "loss": 0.2835, + "step": 10082 + }, + { + "epoch": 2.8307130825379003, + "grad_norm": 0.5932109951972961, + "learning_rate": 9.669196749198251e-08, + "loss": 0.2984, + "step": 10083 + }, + { + "epoch": 2.8309938236945538, + "grad_norm": 0.5314530730247498, + "learning_rate": 9.637256374683546e-08, + "loss": 0.3159, + "step": 10084 + }, + { + "epoch": 2.831274564851207, + "grad_norm": 0.5765442252159119, + "learning_rate": 9.605368328631403e-08, + "loss": 0.3292, + "step": 10085 + }, + { + "epoch": 2.8315553060078607, + "grad_norm": 0.5692158937454224, + "learning_rate": 9.57353261444477e-08, + "loss": 0.3028, + "step": 10086 + }, + { + "epoch": 2.831836047164514, + "grad_norm": 0.5491392612457275, + "learning_rate": 9.541749235521036e-08, + "loss": 0.3378, + "step": 10087 + }, + { + "epoch": 2.832116788321168, + "grad_norm": 0.5618224740028381, + "learning_rate": 9.510018195251769e-08, + "loss": 0.2745, + "step": 10088 + }, + { + "epoch": 2.8323975294778214, + "grad_norm": 0.5175615549087524, + "learning_rate": 9.478339497023259e-08, + "loss": 0.3758, + "step": 10089 + }, + { + "epoch": 2.832678270634475, + "grad_norm": 0.5344045758247375, + "learning_rate": 9.446713144216024e-08, + "loss": 0.3304, + "step": 10090 + }, + { + "epoch": 2.8329590117911287, + "grad_norm": 0.5420215725898743, + "learning_rate": 9.415139140204977e-08, + "loss": 0.3353, + "step": 10091 + }, + { + "epoch": 2.833239752947782, + "grad_norm": 0.5115945339202881, + "learning_rate": 9.383617488359587e-08, + "loss": 0.3446, + "step": 10092 + }, + { + "epoch": 2.8335204941044356, + "grad_norm": 0.5203591585159302, + "learning_rate": 9.352148192043553e-08, + "loss": 0.3056, + "step": 10093 + }, + { + "epoch": 2.833801235261089, + "grad_norm": 0.48789289593696594, + "learning_rate": 9.320731254615134e-08, + "loss": 0.3755, + "step": 10094 + }, + { + "epoch": 2.834081976417743, + "grad_norm": 0.6028507351875305, + "learning_rate": 9.289366679426926e-08, + "loss": 0.3039, + "step": 10095 + }, + { + "epoch": 2.8343627175743964, + "grad_norm": 0.5670675039291382, + "learning_rate": 9.258054469825972e-08, + "loss": 0.343, + "step": 10096 + }, + { + "epoch": 2.83464345873105, + "grad_norm": 0.566779375076294, + "learning_rate": 9.226794629153768e-08, + "loss": 0.3404, + "step": 10097 + }, + { + "epoch": 2.8349241998877037, + "grad_norm": 0.5995916128158569, + "learning_rate": 9.195587160746089e-08, + "loss": 0.3581, + "step": 10098 + }, + { + "epoch": 2.835204941044357, + "grad_norm": 0.5468376278877258, + "learning_rate": 9.164432067933271e-08, + "loss": 0.3174, + "step": 10099 + }, + { + "epoch": 2.8354856822010106, + "grad_norm": 0.5315907597541809, + "learning_rate": 9.133329354039988e-08, + "loss": 0.3299, + "step": 10100 + }, + { + "epoch": 2.835766423357664, + "grad_norm": 0.5144544243812561, + "learning_rate": 9.102279022385196e-08, + "loss": 0.3711, + "step": 10101 + }, + { + "epoch": 2.836047164514318, + "grad_norm": 0.5138227343559265, + "learning_rate": 9.071281076282579e-08, + "loss": 0.307, + "step": 10102 + }, + { + "epoch": 2.8363279056709714, + "grad_norm": 0.5745804309844971, + "learning_rate": 9.040335519039933e-08, + "loss": 0.3517, + "step": 10103 + }, + { + "epoch": 2.836608646827625, + "grad_norm": 0.6126704216003418, + "learning_rate": 9.009442353959618e-08, + "loss": 0.332, + "step": 10104 + }, + { + "epoch": 2.8368893879842787, + "grad_norm": 0.6069706678390503, + "learning_rate": 8.978601584338332e-08, + "loss": 0.332, + "step": 10105 + }, + { + "epoch": 2.837170129140932, + "grad_norm": 0.5251445174217224, + "learning_rate": 8.947813213467216e-08, + "loss": 0.3303, + "step": 10106 + }, + { + "epoch": 2.8374508702975856, + "grad_norm": 0.5183876752853394, + "learning_rate": 8.917077244631812e-08, + "loss": 0.3543, + "step": 10107 + }, + { + "epoch": 2.837731611454239, + "grad_norm": 0.5323402285575867, + "learning_rate": 8.886393681112104e-08, + "loss": 0.2954, + "step": 10108 + }, + { + "epoch": 2.838012352610893, + "grad_norm": 0.5713376402854919, + "learning_rate": 8.85576252618242e-08, + "loss": 0.3086, + "step": 10109 + }, + { + "epoch": 2.8382930937675463, + "grad_norm": 0.5591835975646973, + "learning_rate": 8.825183783111535e-08, + "loss": 0.3298, + "step": 10110 + }, + { + "epoch": 2.8385738349242, + "grad_norm": 0.5651745796203613, + "learning_rate": 8.794657455162615e-08, + "loss": 0.3382, + "step": 10111 + }, + { + "epoch": 2.8388545760808537, + "grad_norm": 0.5637382864952087, + "learning_rate": 8.764183545593275e-08, + "loss": 0.32, + "step": 10112 + }, + { + "epoch": 2.839135317237507, + "grad_norm": 0.5656781792640686, + "learning_rate": 8.73376205765547e-08, + "loss": 0.3162, + "step": 10113 + }, + { + "epoch": 2.8394160583941606, + "grad_norm": 0.48276302218437195, + "learning_rate": 8.70339299459555e-08, + "loss": 0.3243, + "step": 10114 + }, + { + "epoch": 2.839696799550814, + "grad_norm": 0.6217085719108582, + "learning_rate": 8.673076359654364e-08, + "loss": 0.3169, + "step": 10115 + }, + { + "epoch": 2.8399775407074674, + "grad_norm": 0.5485671162605286, + "learning_rate": 8.642812156067104e-08, + "loss": 0.3507, + "step": 10116 + }, + { + "epoch": 2.8402582818641213, + "grad_norm": 0.6354616284370422, + "learning_rate": 8.612600387063463e-08, + "loss": 0.3233, + "step": 10117 + }, + { + "epoch": 2.8405390230207748, + "grad_norm": 0.5699709057807922, + "learning_rate": 8.582441055867308e-08, + "loss": 0.3316, + "step": 10118 + }, + { + "epoch": 2.8408197641774287, + "grad_norm": 0.5277790427207947, + "learning_rate": 8.552334165697118e-08, + "loss": 0.3452, + "step": 10119 + }, + { + "epoch": 2.841100505334082, + "grad_norm": 0.5202212333679199, + "learning_rate": 8.522279719765714e-08, + "loss": 0.3006, + "step": 10120 + }, + { + "epoch": 2.8413812464907355, + "grad_norm": 0.5469048619270325, + "learning_rate": 8.492277721280362e-08, + "loss": 0.373, + "step": 10121 + }, + { + "epoch": 2.841661987647389, + "grad_norm": 0.5882004499435425, + "learning_rate": 8.462328173442613e-08, + "loss": 0.2982, + "step": 10122 + }, + { + "epoch": 2.8419427288040424, + "grad_norm": 0.4816424250602722, + "learning_rate": 8.432431079448521e-08, + "loss": 0.3833, + "step": 10123 + }, + { + "epoch": 2.8422234699606963, + "grad_norm": 0.5815948247909546, + "learning_rate": 8.402586442488536e-08, + "loss": 0.3541, + "step": 10124 + }, + { + "epoch": 2.8425042111173497, + "grad_norm": 0.5320097208023071, + "learning_rate": 8.372794265747498e-08, + "loss": 0.3211, + "step": 10125 + }, + { + "epoch": 2.8427849522740036, + "grad_norm": 0.5010483860969543, + "learning_rate": 8.343054552404639e-08, + "loss": 0.2919, + "step": 10126 + }, + { + "epoch": 2.843065693430657, + "grad_norm": 0.5685129165649414, + "learning_rate": 8.313367305633591e-08, + "loss": 0.3144, + "step": 10127 + }, + { + "epoch": 2.8433464345873105, + "grad_norm": 0.5361132621765137, + "learning_rate": 8.283732528602318e-08, + "loss": 0.3237, + "step": 10128 + }, + { + "epoch": 2.843627175743964, + "grad_norm": 0.5101833343505859, + "learning_rate": 8.2541502244734e-08, + "loss": 0.3141, + "step": 10129 + }, + { + "epoch": 2.8439079169006174, + "grad_norm": 0.5523453950881958, + "learning_rate": 8.224620396403537e-08, + "loss": 0.3032, + "step": 10130 + }, + { + "epoch": 2.8441886580572713, + "grad_norm": 0.6426838040351868, + "learning_rate": 8.195143047544096e-08, + "loss": 0.2995, + "step": 10131 + }, + { + "epoch": 2.8444693992139247, + "grad_norm": 0.514690637588501, + "learning_rate": 8.165718181040617e-08, + "loss": 0.3156, + "step": 10132 + }, + { + "epoch": 2.844750140370578, + "grad_norm": 0.532554566860199, + "learning_rate": 8.136345800033196e-08, + "loss": 0.2773, + "step": 10133 + }, + { + "epoch": 2.845030881527232, + "grad_norm": 0.4946853518486023, + "learning_rate": 8.107025907656274e-08, + "loss": 0.3494, + "step": 10134 + }, + { + "epoch": 2.8453116226838855, + "grad_norm": 0.55220627784729, + "learning_rate": 8.077758507038624e-08, + "loss": 0.3246, + "step": 10135 + }, + { + "epoch": 2.845592363840539, + "grad_norm": 0.5223219394683838, + "learning_rate": 8.048543601303583e-08, + "loss": 0.3113, + "step": 10136 + }, + { + "epoch": 2.8458731049971924, + "grad_norm": 0.5246886610984802, + "learning_rate": 8.019381193568654e-08, + "loss": 0.3525, + "step": 10137 + }, + { + "epoch": 2.8461538461538463, + "grad_norm": 0.5363090634346008, + "learning_rate": 7.990271286945961e-08, + "loss": 0.3618, + "step": 10138 + }, + { + "epoch": 2.8464345873104997, + "grad_norm": 0.5171090960502625, + "learning_rate": 7.961213884541962e-08, + "loss": 0.3434, + "step": 10139 + }, + { + "epoch": 2.846715328467153, + "grad_norm": 0.5945302248001099, + "learning_rate": 7.93220898945729e-08, + "loss": 0.3501, + "step": 10140 + }, + { + "epoch": 2.846996069623807, + "grad_norm": 0.4916573762893677, + "learning_rate": 7.903256604787468e-08, + "loss": 0.3742, + "step": 10141 + }, + { + "epoch": 2.8472768107804605, + "grad_norm": 0.5513736009597778, + "learning_rate": 7.874356733621913e-08, + "loss": 0.3375, + "step": 10142 + }, + { + "epoch": 2.847557551937114, + "grad_norm": 0.545196533203125, + "learning_rate": 7.845509379044603e-08, + "loss": 0.3572, + "step": 10143 + }, + { + "epoch": 2.8478382930937673, + "grad_norm": 0.5878639817237854, + "learning_rate": 7.816714544134074e-08, + "loss": 0.3035, + "step": 10144 + }, + { + "epoch": 2.8481190342504212, + "grad_norm": 0.5589436888694763, + "learning_rate": 7.787972231963092e-08, + "loss": 0.3272, + "step": 10145 + }, + { + "epoch": 2.8483997754070747, + "grad_norm": 0.5432109832763672, + "learning_rate": 7.759282445598871e-08, + "loss": 0.3198, + "step": 10146 + }, + { + "epoch": 2.848680516563728, + "grad_norm": 0.6179593205451965, + "learning_rate": 7.730645188102904e-08, + "loss": 0.2991, + "step": 10147 + }, + { + "epoch": 2.848961257720382, + "grad_norm": 0.5356544852256775, + "learning_rate": 7.702060462531358e-08, + "loss": 0.3185, + "step": 10148 + }, + { + "epoch": 2.8492419988770354, + "grad_norm": 0.5758293271064758, + "learning_rate": 7.673528271934516e-08, + "loss": 0.307, + "step": 10149 + }, + { + "epoch": 2.849522740033689, + "grad_norm": 0.5547780990600586, + "learning_rate": 7.645048619357054e-08, + "loss": 0.3399, + "step": 10150 + }, + { + "epoch": 2.8498034811903423, + "grad_norm": 0.5531817674636841, + "learning_rate": 7.616621507838374e-08, + "loss": 0.341, + "step": 10151 + }, + { + "epoch": 2.850084222346996, + "grad_norm": 0.5724533200263977, + "learning_rate": 7.588246940411825e-08, + "loss": 0.3, + "step": 10152 + }, + { + "epoch": 2.8503649635036497, + "grad_norm": 0.5944581031799316, + "learning_rate": 7.559924920105488e-08, + "loss": 0.3358, + "step": 10153 + }, + { + "epoch": 2.850645704660303, + "grad_norm": 0.5550400614738464, + "learning_rate": 7.531655449941666e-08, + "loss": 0.337, + "step": 10154 + }, + { + "epoch": 2.850926445816957, + "grad_norm": 0.5747030377388, + "learning_rate": 7.503438532937169e-08, + "loss": 0.344, + "step": 10155 + }, + { + "epoch": 2.8512071869736104, + "grad_norm": 0.5691621899604797, + "learning_rate": 7.475274172103086e-08, + "loss": 0.3255, + "step": 10156 + }, + { + "epoch": 2.851487928130264, + "grad_norm": 0.5421184301376343, + "learning_rate": 7.447162370444849e-08, + "loss": 0.3737, + "step": 10157 + }, + { + "epoch": 2.8517686692869173, + "grad_norm": 0.5573785901069641, + "learning_rate": 7.419103130962502e-08, + "loss": 0.3367, + "step": 10158 + }, + { + "epoch": 2.852049410443571, + "grad_norm": 0.5164793133735657, + "learning_rate": 7.391096456650315e-08, + "loss": 0.3171, + "step": 10159 + }, + { + "epoch": 2.8523301516002246, + "grad_norm": 0.5594348311424255, + "learning_rate": 7.36314235049701e-08, + "loss": 0.337, + "step": 10160 + }, + { + "epoch": 2.852610892756878, + "grad_norm": 0.5490220785140991, + "learning_rate": 7.335240815485589e-08, + "loss": 0.3511, + "step": 10161 + }, + { + "epoch": 2.852891633913532, + "grad_norm": 0.5237599015235901, + "learning_rate": 7.307391854593615e-08, + "loss": 0.3314, + "step": 10162 + }, + { + "epoch": 2.8531723750701854, + "grad_norm": 0.5433495044708252, + "learning_rate": 7.279595470792932e-08, + "loss": 0.2959, + "step": 10163 + }, + { + "epoch": 2.853453116226839, + "grad_norm": 0.5113106966018677, + "learning_rate": 7.25185166704978e-08, + "loss": 0.3614, + "step": 10164 + }, + { + "epoch": 2.8537338573834923, + "grad_norm": 0.5449342727661133, + "learning_rate": 7.22416044632479e-08, + "loss": 0.2881, + "step": 10165 + }, + { + "epoch": 2.8540145985401457, + "grad_norm": 0.5747097134590149, + "learning_rate": 7.196521811573098e-08, + "loss": 0.3041, + "step": 10166 + }, + { + "epoch": 2.8542953396967996, + "grad_norm": 0.5181044340133667, + "learning_rate": 7.168935765744012e-08, + "loss": 0.329, + "step": 10167 + }, + { + "epoch": 2.854576080853453, + "grad_norm": 0.5626211166381836, + "learning_rate": 7.141402311781398e-08, + "loss": 0.3659, + "step": 10168 + }, + { + "epoch": 2.854856822010107, + "grad_norm": 0.5382546782493591, + "learning_rate": 7.113921452623462e-08, + "loss": 0.3173, + "step": 10169 + }, + { + "epoch": 2.8551375631667604, + "grad_norm": 0.5320886373519897, + "learning_rate": 7.086493191202747e-08, + "loss": 0.3004, + "step": 10170 + }, + { + "epoch": 2.855418304323414, + "grad_norm": 0.6260315179824829, + "learning_rate": 7.059117530446303e-08, + "loss": 0.3578, + "step": 10171 + }, + { + "epoch": 2.8556990454800673, + "grad_norm": 0.602780282497406, + "learning_rate": 7.031794473275344e-08, + "loss": 0.3261, + "step": 10172 + }, + { + "epoch": 2.8559797866367207, + "grad_norm": 0.6174184083938599, + "learning_rate": 7.004524022605764e-08, + "loss": 0.3228, + "step": 10173 + }, + { + "epoch": 2.8562605277933746, + "grad_norm": 0.6240619421005249, + "learning_rate": 6.977306181347677e-08, + "loss": 0.3005, + "step": 10174 + }, + { + "epoch": 2.856541268950028, + "grad_norm": 0.5377023816108704, + "learning_rate": 6.950140952405538e-08, + "loss": 0.3116, + "step": 10175 + }, + { + "epoch": 2.856822010106682, + "grad_norm": 0.579302966594696, + "learning_rate": 6.923028338678306e-08, + "loss": 0.3184, + "step": 10176 + }, + { + "epoch": 2.8571027512633353, + "grad_norm": 0.5167502760887146, + "learning_rate": 6.895968343059168e-08, + "loss": 0.2948, + "step": 10177 + }, + { + "epoch": 2.857383492419989, + "grad_norm": 0.625468373298645, + "learning_rate": 6.868960968435978e-08, + "loss": 0.3118, + "step": 10178 + }, + { + "epoch": 2.8576642335766422, + "grad_norm": 0.5315325260162354, + "learning_rate": 6.8420062176906e-08, + "loss": 0.3405, + "step": 10179 + }, + { + "epoch": 2.8579449747332957, + "grad_norm": 0.4759010970592499, + "learning_rate": 6.815104093699621e-08, + "loss": 0.3338, + "step": 10180 + }, + { + "epoch": 2.8582257158899496, + "grad_norm": 0.5294516682624817, + "learning_rate": 6.788254599333799e-08, + "loss": 0.3237, + "step": 10181 + }, + { + "epoch": 2.858506457046603, + "grad_norm": 0.540122926235199, + "learning_rate": 6.761457737458399e-08, + "loss": 0.3345, + "step": 10182 + }, + { + "epoch": 2.8587871982032564, + "grad_norm": 0.5500849485397339, + "learning_rate": 6.734713510932967e-08, + "loss": 0.3381, + "step": 10183 + }, + { + "epoch": 2.8590679393599103, + "grad_norm": 0.5662973523139954, + "learning_rate": 6.708021922611496e-08, + "loss": 0.3283, + "step": 10184 + }, + { + "epoch": 2.8593486805165638, + "grad_norm": 0.5160301923751831, + "learning_rate": 6.681382975342321e-08, + "loss": 0.3076, + "step": 10185 + }, + { + "epoch": 2.859629421673217, + "grad_norm": 0.5323373675346375, + "learning_rate": 6.654796671968222e-08, + "loss": 0.3389, + "step": 10186 + }, + { + "epoch": 2.8599101628298707, + "grad_norm": 0.577246904373169, + "learning_rate": 6.62826301532632e-08, + "loss": 0.3038, + "step": 10187 + }, + { + "epoch": 2.8601909039865245, + "grad_norm": 0.5509032011032104, + "learning_rate": 6.601782008248126e-08, + "loss": 0.3164, + "step": 10188 + }, + { + "epoch": 2.860471645143178, + "grad_norm": 0.606834352016449, + "learning_rate": 6.575353653559491e-08, + "loss": 0.2821, + "step": 10189 + }, + { + "epoch": 2.8607523862998314, + "grad_norm": 0.5302348136901855, + "learning_rate": 6.548977954080716e-08, + "loss": 0.2887, + "step": 10190 + }, + { + "epoch": 2.8610331274564853, + "grad_norm": 0.4936233162879944, + "learning_rate": 6.522654912626381e-08, + "loss": 0.3539, + "step": 10191 + }, + { + "epoch": 2.8613138686131387, + "grad_norm": 0.5368813872337341, + "learning_rate": 6.496384532005684e-08, + "loss": 0.3217, + "step": 10192 + }, + { + "epoch": 2.861594609769792, + "grad_norm": 0.5451339483261108, + "learning_rate": 6.470166815021884e-08, + "loss": 0.3348, + "step": 10193 + }, + { + "epoch": 2.8618753509264456, + "grad_norm": 0.49208346009254456, + "learning_rate": 6.444001764472852e-08, + "loss": 0.3632, + "step": 10194 + }, + { + "epoch": 2.8621560920830995, + "grad_norm": 0.5209183692932129, + "learning_rate": 6.417889383150688e-08, + "loss": 0.3281, + "step": 10195 + }, + { + "epoch": 2.862436833239753, + "grad_norm": 0.5655257105827332, + "learning_rate": 6.391829673841998e-08, + "loss": 0.3641, + "step": 10196 + }, + { + "epoch": 2.8627175743964064, + "grad_norm": 0.5721009969711304, + "learning_rate": 6.365822639327724e-08, + "loss": 0.3308, + "step": 10197 + }, + { + "epoch": 2.8629983155530603, + "grad_norm": 0.5337321758270264, + "learning_rate": 6.339868282383144e-08, + "loss": 0.3526, + "step": 10198 + }, + { + "epoch": 2.8632790567097137, + "grad_norm": 0.5727617144584656, + "learning_rate": 6.313966605777932e-08, + "loss": 0.3172, + "step": 10199 + }, + { + "epoch": 2.863559797866367, + "grad_norm": 0.5758257508277893, + "learning_rate": 6.288117612276157e-08, + "loss": 0.3114, + "step": 10200 + }, + { + "epoch": 2.8638405390230206, + "grad_norm": 0.5676164627075195, + "learning_rate": 6.262321304636277e-08, + "loss": 0.3292, + "step": 10201 + }, + { + "epoch": 2.8641212801796745, + "grad_norm": 0.5245554447174072, + "learning_rate": 6.2365776856112e-08, + "loss": 0.3388, + "step": 10202 + }, + { + "epoch": 2.864402021336328, + "grad_norm": 0.5958057045936584, + "learning_rate": 6.210886757947954e-08, + "loss": 0.2981, + "step": 10203 + }, + { + "epoch": 2.8646827624929814, + "grad_norm": 0.6062753796577454, + "learning_rate": 6.185248524388232e-08, + "loss": 0.3074, + "step": 10204 + }, + { + "epoch": 2.8649635036496353, + "grad_norm": 0.49634841084480286, + "learning_rate": 6.159662987667959e-08, + "loss": 0.3604, + "step": 10205 + }, + { + "epoch": 2.8652442448062887, + "grad_norm": 0.5320731401443481, + "learning_rate": 6.134130150517447e-08, + "loss": 0.3814, + "step": 10206 + }, + { + "epoch": 2.865524985962942, + "grad_norm": 0.5452108383178711, + "learning_rate": 6.10865001566141e-08, + "loss": 0.3404, + "step": 10207 + }, + { + "epoch": 2.8658057271195956, + "grad_norm": 0.5631430149078369, + "learning_rate": 6.083222585818949e-08, + "loss": 0.327, + "step": 10208 + }, + { + "epoch": 2.866086468276249, + "grad_norm": 0.5746709108352661, + "learning_rate": 6.057847863703503e-08, + "loss": 0.304, + "step": 10209 + }, + { + "epoch": 2.866367209432903, + "grad_norm": 0.5286504030227661, + "learning_rate": 6.032525852022964e-08, + "loss": 0.3492, + "step": 10210 + }, + { + "epoch": 2.8666479505895563, + "grad_norm": 0.5584084987640381, + "learning_rate": 6.00725655347939e-08, + "loss": 0.3283, + "step": 10211 + }, + { + "epoch": 2.8669286917462102, + "grad_norm": 0.566763162612915, + "learning_rate": 5.982039970769515e-08, + "loss": 0.3203, + "step": 10212 + }, + { + "epoch": 2.8672094329028637, + "grad_norm": 0.5794817209243774, + "learning_rate": 5.956876106584242e-08, + "loss": 0.285, + "step": 10213 + }, + { + "epoch": 2.867490174059517, + "grad_norm": 0.5237037539482117, + "learning_rate": 5.9317649636088656e-08, + "loss": 0.3399, + "step": 10214 + }, + { + "epoch": 2.8677709152161706, + "grad_norm": 0.5269162654876709, + "learning_rate": 5.906706544523133e-08, + "loss": 0.3061, + "step": 10215 + }, + { + "epoch": 2.868051656372824, + "grad_norm": 0.6088763475418091, + "learning_rate": 5.881700852001127e-08, + "loss": 0.3223, + "step": 10216 + }, + { + "epoch": 2.868332397529478, + "grad_norm": 0.5291255712509155, + "learning_rate": 5.856747888711267e-08, + "loss": 0.3254, + "step": 10217 + }, + { + "epoch": 2.8686131386861313, + "grad_norm": 0.5879034996032715, + "learning_rate": 5.831847657316425e-08, + "loss": 0.3235, + "step": 10218 + }, + { + "epoch": 2.868893879842785, + "grad_norm": 0.5519848465919495, + "learning_rate": 5.8070001604737525e-08, + "loss": 0.329, + "step": 10219 + }, + { + "epoch": 2.8691746209994387, + "grad_norm": 0.5223598480224609, + "learning_rate": 5.782205400834906e-08, + "loss": 0.319, + "step": 10220 + }, + { + "epoch": 2.869455362156092, + "grad_norm": 0.5825893878936768, + "learning_rate": 5.757463381045658e-08, + "loss": 0.2851, + "step": 10221 + }, + { + "epoch": 2.8697361033127455, + "grad_norm": 0.48623377084732056, + "learning_rate": 5.732774103746508e-08, + "loss": 0.3602, + "step": 10222 + }, + { + "epoch": 2.870016844469399, + "grad_norm": 0.5634300112724304, + "learning_rate": 5.7081375715720146e-08, + "loss": 0.2964, + "step": 10223 + }, + { + "epoch": 2.870297585626053, + "grad_norm": 0.5560899972915649, + "learning_rate": 5.683553787151297e-08, + "loss": 0.3085, + "step": 10224 + }, + { + "epoch": 2.8705783267827063, + "grad_norm": 0.578284502029419, + "learning_rate": 5.659022753107757e-08, + "loss": 0.3338, + "step": 10225 + }, + { + "epoch": 2.87085906793936, + "grad_norm": 0.5929065942764282, + "learning_rate": 5.6345444720591894e-08, + "loss": 0.3214, + "step": 10226 + }, + { + "epoch": 2.8711398090960136, + "grad_norm": 0.5859715938568115, + "learning_rate": 5.610118946617837e-08, + "loss": 0.3054, + "step": 10227 + }, + { + "epoch": 2.871420550252667, + "grad_norm": 0.5601416230201721, + "learning_rate": 5.585746179390117e-08, + "loss": 0.3447, + "step": 10228 + }, + { + "epoch": 2.8717012914093205, + "grad_norm": 0.5210142135620117, + "learning_rate": 5.561426172977058e-08, + "loss": 0.3095, + "step": 10229 + }, + { + "epoch": 2.871982032565974, + "grad_norm": 0.542915940284729, + "learning_rate": 5.537158929973863e-08, + "loss": 0.3809, + "step": 10230 + }, + { + "epoch": 2.872262773722628, + "grad_norm": 0.517369270324707, + "learning_rate": 5.512944452970237e-08, + "loss": 0.3331, + "step": 10231 + }, + { + "epoch": 2.8725435148792813, + "grad_norm": 0.5070340037345886, + "learning_rate": 5.4887827445501144e-08, + "loss": 0.328, + "step": 10232 + }, + { + "epoch": 2.8728242560359347, + "grad_norm": 0.49687615036964417, + "learning_rate": 5.464673807291987e-08, + "loss": 0.3369, + "step": 10233 + }, + { + "epoch": 2.8731049971925886, + "grad_norm": 0.5236063003540039, + "learning_rate": 5.44061764376852e-08, + "loss": 0.3544, + "step": 10234 + }, + { + "epoch": 2.873385738349242, + "grad_norm": 0.5609530806541443, + "learning_rate": 5.4166142565468815e-08, + "loss": 0.2939, + "step": 10235 + }, + { + "epoch": 2.8736664795058955, + "grad_norm": 0.5667310953140259, + "learning_rate": 5.3926636481885786e-08, + "loss": 0.3501, + "step": 10236 + }, + { + "epoch": 2.873947220662549, + "grad_norm": 0.5604002475738525, + "learning_rate": 5.3687658212494554e-08, + "loss": 0.3014, + "step": 10237 + }, + { + "epoch": 2.874227961819203, + "grad_norm": 0.5266476273536682, + "learning_rate": 5.344920778279694e-08, + "loss": 0.2882, + "step": 10238 + }, + { + "epoch": 2.8745087029758563, + "grad_norm": 0.5349099636077881, + "learning_rate": 5.321128521823982e-08, + "loss": 0.3287, + "step": 10239 + }, + { + "epoch": 2.8747894441325097, + "grad_norm": 0.5059753656387329, + "learning_rate": 5.297389054421176e-08, + "loss": 0.3048, + "step": 10240 + }, + { + "epoch": 2.8750701852891636, + "grad_norm": 0.5411710739135742, + "learning_rate": 5.273702378604639e-08, + "loss": 0.3586, + "step": 10241 + }, + { + "epoch": 2.875350926445817, + "grad_norm": 0.5979899764060974, + "learning_rate": 5.250068496902183e-08, + "loss": 0.2859, + "step": 10242 + }, + { + "epoch": 2.8756316676024705, + "grad_norm": 0.5573270916938782, + "learning_rate": 5.226487411835679e-08, + "loss": 0.3023, + "step": 10243 + }, + { + "epoch": 2.875912408759124, + "grad_norm": 0.5321760177612305, + "learning_rate": 5.202959125921725e-08, + "loss": 0.3501, + "step": 10244 + }, + { + "epoch": 2.876193149915778, + "grad_norm": 0.5406895875930786, + "learning_rate": 5.1794836416709236e-08, + "loss": 0.2834, + "step": 10245 + }, + { + "epoch": 2.8764738910724312, + "grad_norm": 0.5286221504211426, + "learning_rate": 5.15606096158866e-08, + "loss": 0.347, + "step": 10246 + }, + { + "epoch": 2.8767546322290847, + "grad_norm": 0.5209830403327942, + "learning_rate": 5.132691088174269e-08, + "loss": 0.3234, + "step": 10247 + }, + { + "epoch": 2.8770353733857386, + "grad_norm": 0.5012113451957703, + "learning_rate": 5.109374023921754e-08, + "loss": 0.3785, + "step": 10248 + }, + { + "epoch": 2.877316114542392, + "grad_norm": 0.5220000743865967, + "learning_rate": 5.0861097713192916e-08, + "loss": 0.3395, + "step": 10249 + }, + { + "epoch": 2.8775968556990454, + "grad_norm": 0.539867103099823, + "learning_rate": 5.062898332849509e-08, + "loss": 0.3119, + "step": 10250 + }, + { + "epoch": 2.877877596855699, + "grad_norm": 0.5832822918891907, + "learning_rate": 5.039739710989422e-08, + "loss": 0.3235, + "step": 10251 + }, + { + "epoch": 2.8781583380123528, + "grad_norm": 0.5702241659164429, + "learning_rate": 5.016633908210389e-08, + "loss": 0.304, + "step": 10252 + }, + { + "epoch": 2.878439079169006, + "grad_norm": 0.5818907022476196, + "learning_rate": 4.993580926978048e-08, + "loss": 0.3146, + "step": 10253 + }, + { + "epoch": 2.8787198203256597, + "grad_norm": 0.5696885585784912, + "learning_rate": 4.970580769752542e-08, + "loss": 0.3038, + "step": 10254 + }, + { + "epoch": 2.8790005614823135, + "grad_norm": 0.5527303814888, + "learning_rate": 4.9476334389882416e-08, + "loss": 0.3359, + "step": 10255 + }, + { + "epoch": 2.879281302638967, + "grad_norm": 0.5346391797065735, + "learning_rate": 4.924738937133966e-08, + "loss": 0.3166, + "step": 10256 + }, + { + "epoch": 2.8795620437956204, + "grad_norm": 0.5558182597160339, + "learning_rate": 4.901897266632927e-08, + "loss": 0.342, + "step": 10257 + }, + { + "epoch": 2.879842784952274, + "grad_norm": 0.537028968334198, + "learning_rate": 4.8791084299225635e-08, + "loss": 0.3139, + "step": 10258 + }, + { + "epoch": 2.8801235261089273, + "grad_norm": 0.5365895628929138, + "learning_rate": 4.856372429434819e-08, + "loss": 0.3217, + "step": 10259 + }, + { + "epoch": 2.880404267265581, + "grad_norm": 0.5496370196342468, + "learning_rate": 4.8336892675958646e-08, + "loss": 0.3511, + "step": 10260 + }, + { + "epoch": 2.8806850084222346, + "grad_norm": 0.5810548067092896, + "learning_rate": 4.81105894682643e-08, + "loss": 0.2826, + "step": 10261 + }, + { + "epoch": 2.8809657495788885, + "grad_norm": 0.60880446434021, + "learning_rate": 4.788481469541306e-08, + "loss": 0.2954, + "step": 10262 + }, + { + "epoch": 2.881246490735542, + "grad_norm": 0.600534200668335, + "learning_rate": 4.76595683815001e-08, + "loss": 0.2726, + "step": 10263 + }, + { + "epoch": 2.8815272318921954, + "grad_norm": 0.5583833456039429, + "learning_rate": 4.7434850550561185e-08, + "loss": 0.3538, + "step": 10264 + }, + { + "epoch": 2.881807973048849, + "grad_norm": 0.547808825969696, + "learning_rate": 4.721066122657714e-08, + "loss": 0.3073, + "step": 10265 + }, + { + "epoch": 2.8820887142055023, + "grad_norm": 0.5400701761245728, + "learning_rate": 4.698700043347215e-08, + "loss": 0.3383, + "step": 10266 + }, + { + "epoch": 2.882369455362156, + "grad_norm": 0.6265240907669067, + "learning_rate": 4.6763868195112695e-08, + "loss": 0.3008, + "step": 10267 + }, + { + "epoch": 2.8826501965188096, + "grad_norm": 0.5000604391098022, + "learning_rate": 4.6541264535311936e-08, + "loss": 0.3396, + "step": 10268 + }, + { + "epoch": 2.8829309376754635, + "grad_norm": 0.5225700736045837, + "learning_rate": 4.631918947782421e-08, + "loss": 0.338, + "step": 10269 + }, + { + "epoch": 2.883211678832117, + "grad_norm": 0.505322277545929, + "learning_rate": 4.6097643046346674e-08, + "loss": 0.3291, + "step": 10270 + }, + { + "epoch": 2.8834924199887704, + "grad_norm": 0.4945646822452545, + "learning_rate": 4.587662526452319e-08, + "loss": 0.3683, + "step": 10271 + }, + { + "epoch": 2.883773161145424, + "grad_norm": 0.5467312335968018, + "learning_rate": 4.565613615593822e-08, + "loss": 0.2948, + "step": 10272 + }, + { + "epoch": 2.8840539023020773, + "grad_norm": 0.5793159604072571, + "learning_rate": 4.543617574412185e-08, + "loss": 0.3087, + "step": 10273 + }, + { + "epoch": 2.884334643458731, + "grad_norm": 0.5565694570541382, + "learning_rate": 4.521674405254583e-08, + "loss": 0.3424, + "step": 10274 + }, + { + "epoch": 2.8846153846153846, + "grad_norm": 0.565380334854126, + "learning_rate": 4.499784110462757e-08, + "loss": 0.3108, + "step": 10275 + }, + { + "epoch": 2.884896125772038, + "grad_norm": 0.49810847640037537, + "learning_rate": 4.4779466923726146e-08, + "loss": 0.3562, + "step": 10276 + }, + { + "epoch": 2.885176866928692, + "grad_norm": 0.5254920125007629, + "learning_rate": 4.456162153314569e-08, + "loss": 0.3223, + "step": 10277 + }, + { + "epoch": 2.8854576080853453, + "grad_norm": 0.5809634327888489, + "learning_rate": 4.434430495613318e-08, + "loss": 0.33, + "step": 10278 + }, + { + "epoch": 2.885738349241999, + "grad_norm": 0.5467097163200378, + "learning_rate": 4.412751721587949e-08, + "loss": 0.3482, + "step": 10279 + }, + { + "epoch": 2.8860190903986522, + "grad_norm": 0.5433565378189087, + "learning_rate": 4.3911258335518345e-08, + "loss": 0.3757, + "step": 10280 + }, + { + "epoch": 2.886299831555306, + "grad_norm": 0.5442503094673157, + "learning_rate": 4.369552833812796e-08, + "loss": 0.3516, + "step": 10281 + }, + { + "epoch": 2.8865805727119596, + "grad_norm": 0.5326297879219055, + "learning_rate": 4.3480327246729347e-08, + "loss": 0.352, + "step": 10282 + }, + { + "epoch": 2.886861313868613, + "grad_norm": 0.5138195753097534, + "learning_rate": 4.326565508428804e-08, + "loss": 0.3385, + "step": 10283 + }, + { + "epoch": 2.887142055025267, + "grad_norm": 0.5652940273284912, + "learning_rate": 4.305151187371182e-08, + "loss": 0.3256, + "step": 10284 + }, + { + "epoch": 2.8874227961819203, + "grad_norm": 0.6104983687400818, + "learning_rate": 4.283789763785295e-08, + "loss": 0.2974, + "step": 10285 + }, + { + "epoch": 2.8877035373385738, + "grad_norm": 0.5927769541740417, + "learning_rate": 4.262481239950711e-08, + "loss": 0.3275, + "step": 10286 + }, + { + "epoch": 2.887984278495227, + "grad_norm": 0.5427842140197754, + "learning_rate": 4.241225618141387e-08, + "loss": 0.3128, + "step": 10287 + }, + { + "epoch": 2.888265019651881, + "grad_norm": 0.5188747048377991, + "learning_rate": 4.220022900625509e-08, + "loss": 0.3239, + "step": 10288 + }, + { + "epoch": 2.8885457608085345, + "grad_norm": 0.5585922002792358, + "learning_rate": 4.198873089665711e-08, + "loss": 0.2919, + "step": 10289 + }, + { + "epoch": 2.888826501965188, + "grad_norm": 0.507307231426239, + "learning_rate": 4.177776187519023e-08, + "loss": 0.3559, + "step": 10290 + }, + { + "epoch": 2.889107243121842, + "grad_norm": 0.5435081720352173, + "learning_rate": 4.156732196436752e-08, + "loss": 0.3373, + "step": 10291 + }, + { + "epoch": 2.8893879842784953, + "grad_norm": 0.5974767804145813, + "learning_rate": 4.135741118664549e-08, + "loss": 0.3264, + "step": 10292 + }, + { + "epoch": 2.8896687254351487, + "grad_norm": 0.5404211282730103, + "learning_rate": 4.1148029564425094e-08, + "loss": 0.3582, + "step": 10293 + }, + { + "epoch": 2.889949466591802, + "grad_norm": 0.5471585392951965, + "learning_rate": 4.0939177120049576e-08, + "loss": 0.3166, + "step": 10294 + }, + { + "epoch": 2.890230207748456, + "grad_norm": 0.5118330717086792, + "learning_rate": 4.073085387580722e-08, + "loss": 0.3692, + "step": 10295 + }, + { + "epoch": 2.8905109489051095, + "grad_norm": 0.5458824634552002, + "learning_rate": 4.0523059853928014e-08, + "loss": 0.3055, + "step": 10296 + }, + { + "epoch": 2.890791690061763, + "grad_norm": 0.5640842914581299, + "learning_rate": 4.0315795076587005e-08, + "loss": 0.3125, + "step": 10297 + }, + { + "epoch": 2.891072431218417, + "grad_norm": 0.48543086647987366, + "learning_rate": 4.010905956590205e-08, + "loss": 0.3545, + "step": 10298 + }, + { + "epoch": 2.8913531723750703, + "grad_norm": 0.5426521301269531, + "learning_rate": 3.9902853343934955e-08, + "loss": 0.3301, + "step": 10299 + }, + { + "epoch": 2.8916339135317237, + "grad_norm": 0.5406230092048645, + "learning_rate": 3.9697176432690335e-08, + "loss": 0.3159, + "step": 10300 + }, + { + "epoch": 2.891914654688377, + "grad_norm": 0.5597209334373474, + "learning_rate": 3.949202885411674e-08, + "loss": 0.2952, + "step": 10301 + }, + { + "epoch": 2.892195395845031, + "grad_norm": 0.6307613253593445, + "learning_rate": 3.928741063010721e-08, + "loss": 0.3169, + "step": 10302 + }, + { + "epoch": 2.8924761370016845, + "grad_norm": 0.5918838977813721, + "learning_rate": 3.9083321782495965e-08, + "loss": 0.3306, + "step": 10303 + }, + { + "epoch": 2.892756878158338, + "grad_norm": 0.5424016714096069, + "learning_rate": 3.887976233306279e-08, + "loss": 0.3429, + "step": 10304 + }, + { + "epoch": 2.893037619314992, + "grad_norm": 0.536504328250885, + "learning_rate": 3.867673230353031e-08, + "loss": 0.328, + "step": 10305 + }, + { + "epoch": 2.8933183604716453, + "grad_norm": 0.6102229952812195, + "learning_rate": 3.847423171556452e-08, + "loss": 0.2996, + "step": 10306 + }, + { + "epoch": 2.8935991016282987, + "grad_norm": 0.5004302263259888, + "learning_rate": 3.827226059077538e-08, + "loss": 0.3217, + "step": 10307 + }, + { + "epoch": 2.893879842784952, + "grad_norm": 0.5153526663780212, + "learning_rate": 3.807081895071507e-08, + "loss": 0.3303, + "step": 10308 + }, + { + "epoch": 2.8941605839416056, + "grad_norm": 0.5676251649856567, + "learning_rate": 3.7869906816880855e-08, + "loss": 0.3022, + "step": 10309 + }, + { + "epoch": 2.8944413250982595, + "grad_norm": 0.5440424680709839, + "learning_rate": 3.766952421071335e-08, + "loss": 0.3073, + "step": 10310 + }, + { + "epoch": 2.894722066254913, + "grad_norm": 0.5407348275184631, + "learning_rate": 3.7469671153594346e-08, + "loss": 0.3455, + "step": 10311 + }, + { + "epoch": 2.895002807411567, + "grad_norm": 0.4998387396335602, + "learning_rate": 3.7270347666853446e-08, + "loss": 0.3108, + "step": 10312 + }, + { + "epoch": 2.8952835485682202, + "grad_norm": 0.5646181702613831, + "learning_rate": 3.707155377175864e-08, + "loss": 0.2744, + "step": 10313 + }, + { + "epoch": 2.8955642897248737, + "grad_norm": 0.5334526300430298, + "learning_rate": 3.6873289489526285e-08, + "loss": 0.418, + "step": 10314 + }, + { + "epoch": 2.895845030881527, + "grad_norm": 0.5950364470481873, + "learning_rate": 3.6675554841312246e-08, + "loss": 0.3107, + "step": 10315 + }, + { + "epoch": 2.8961257720381806, + "grad_norm": 0.5439245700836182, + "learning_rate": 3.6478349848217966e-08, + "loss": 0.2894, + "step": 10316 + }, + { + "epoch": 2.8964065131948344, + "grad_norm": 0.5425600409507751, + "learning_rate": 3.628167453128828e-08, + "loss": 0.3334, + "step": 10317 + }, + { + "epoch": 2.896687254351488, + "grad_norm": 0.5517589449882507, + "learning_rate": 3.608552891151085e-08, + "loss": 0.3591, + "step": 10318 + }, + { + "epoch": 2.8969679955081418, + "grad_norm": 0.5716979503631592, + "learning_rate": 3.588991300981726e-08, + "loss": 0.329, + "step": 10319 + }, + { + "epoch": 2.897248736664795, + "grad_norm": 0.5769723057746887, + "learning_rate": 3.569482684708247e-08, + "loss": 0.3438, + "step": 10320 + }, + { + "epoch": 2.8975294778214487, + "grad_norm": 0.5434415936470032, + "learning_rate": 3.550027044412485e-08, + "loss": 0.3272, + "step": 10321 + }, + { + "epoch": 2.897810218978102, + "grad_norm": 0.517772912979126, + "learning_rate": 3.530624382170611e-08, + "loss": 0.3453, + "step": 10322 + }, + { + "epoch": 2.8980909601347555, + "grad_norm": 0.5583859086036682, + "learning_rate": 3.5112747000531355e-08, + "loss": 0.3527, + "step": 10323 + }, + { + "epoch": 2.8983717012914094, + "grad_norm": 0.5648369193077087, + "learning_rate": 3.491978000125018e-08, + "loss": 0.3211, + "step": 10324 + }, + { + "epoch": 2.898652442448063, + "grad_norm": 0.5798605680465698, + "learning_rate": 3.472734284445445e-08, + "loss": 0.2961, + "step": 10325 + }, + { + "epoch": 2.8989331836047163, + "grad_norm": 0.5161697864532471, + "learning_rate": 3.4535435550678844e-08, + "loss": 0.3224, + "step": 10326 + }, + { + "epoch": 2.89921392476137, + "grad_norm": 0.5376103520393372, + "learning_rate": 3.43440581404042e-08, + "loss": 0.2998, + "step": 10327 + }, + { + "epoch": 2.8994946659180236, + "grad_norm": 0.5266596078872681, + "learning_rate": 3.415321063405141e-08, + "loss": 0.3251, + "step": 10328 + }, + { + "epoch": 2.899775407074677, + "grad_norm": 0.5706929564476013, + "learning_rate": 3.3962893051988077e-08, + "loss": 0.3823, + "step": 10329 + }, + { + "epoch": 2.9000561482313305, + "grad_norm": 0.5149383544921875, + "learning_rate": 3.3773105414523496e-08, + "loss": 0.3331, + "step": 10330 + }, + { + "epoch": 2.9003368893879844, + "grad_norm": 0.512887179851532, + "learning_rate": 3.358384774190926e-08, + "loss": 0.3314, + "step": 10331 + }, + { + "epoch": 2.900617630544638, + "grad_norm": 0.5874810814857483, + "learning_rate": 3.339512005434309e-08, + "loss": 0.2891, + "step": 10332 + }, + { + "epoch": 2.9008983717012913, + "grad_norm": 0.522426962852478, + "learning_rate": 3.3206922371964436e-08, + "loss": 0.3386, + "step": 10333 + }, + { + "epoch": 2.901179112857945, + "grad_norm": 0.5868736505508423, + "learning_rate": 3.301925471485612e-08, + "loss": 0.3159, + "step": 10334 + }, + { + "epoch": 2.9014598540145986, + "grad_norm": 0.5494204163551331, + "learning_rate": 3.283211710304601e-08, + "loss": 0.333, + "step": 10335 + }, + { + "epoch": 2.901740595171252, + "grad_norm": 0.5404460430145264, + "learning_rate": 3.264550955650314e-08, + "loss": 0.3383, + "step": 10336 + }, + { + "epoch": 2.9020213363279055, + "grad_norm": 0.526391863822937, + "learning_rate": 3.245943209514213e-08, + "loss": 0.3108, + "step": 10337 + }, + { + "epoch": 2.9023020774845594, + "grad_norm": 0.5620700716972351, + "learning_rate": 3.227388473881876e-08, + "loss": 0.2952, + "step": 10338 + }, + { + "epoch": 2.902582818641213, + "grad_norm": 0.49800369143486023, + "learning_rate": 3.208886750733442e-08, + "loss": 0.3496, + "step": 10339 + }, + { + "epoch": 2.9028635597978663, + "grad_norm": 0.5326073169708252, + "learning_rate": 3.190438042043276e-08, + "loss": 0.342, + "step": 10340 + }, + { + "epoch": 2.90314430095452, + "grad_norm": 0.6173860430717468, + "learning_rate": 3.17204234978008e-08, + "loss": 0.3089, + "step": 10341 + }, + { + "epoch": 2.9034250421111736, + "grad_norm": 0.5504783987998962, + "learning_rate": 3.153699675907007e-08, + "loss": 0.3424, + "step": 10342 + }, + { + "epoch": 2.903705783267827, + "grad_norm": 0.5313858389854431, + "learning_rate": 3.1354100223813246e-08, + "loss": 0.3203, + "step": 10343 + }, + { + "epoch": 2.9039865244244805, + "grad_norm": 0.4968644678592682, + "learning_rate": 3.117173391154971e-08, + "loss": 0.3211, + "step": 10344 + }, + { + "epoch": 2.9042672655811343, + "grad_norm": 0.5251296758651733, + "learning_rate": 3.0989897841739446e-08, + "loss": 0.3061, + "step": 10345 + }, + { + "epoch": 2.904548006737788, + "grad_norm": 0.5955836772918701, + "learning_rate": 3.0808592033786944e-08, + "loss": 0.3351, + "step": 10346 + }, + { + "epoch": 2.9048287478944412, + "grad_norm": 0.5214530825614929, + "learning_rate": 3.062781650704061e-08, + "loss": 0.3383, + "step": 10347 + }, + { + "epoch": 2.905109489051095, + "grad_norm": 0.561315655708313, + "learning_rate": 3.044757128079057e-08, + "loss": 0.2763, + "step": 10348 + }, + { + "epoch": 2.9053902302077486, + "grad_norm": 0.545353353023529, + "learning_rate": 3.026785637427254e-08, + "loss": 0.3542, + "step": 10349 + }, + { + "epoch": 2.905670971364402, + "grad_norm": 0.5503994822502136, + "learning_rate": 3.008867180666397e-08, + "loss": 0.356, + "step": 10350 + }, + { + "epoch": 2.9059517125210554, + "grad_norm": 0.5180359482765198, + "learning_rate": 2.991001759708678e-08, + "loss": 0.3488, + "step": 10351 + }, + { + "epoch": 2.906232453677709, + "grad_norm": 0.48081329464912415, + "learning_rate": 2.973189376460517e-08, + "loss": 0.323, + "step": 10352 + }, + { + "epoch": 2.9065131948343628, + "grad_norm": 0.5101129412651062, + "learning_rate": 2.9554300328228368e-08, + "loss": 0.3278, + "step": 10353 + }, + { + "epoch": 2.906793935991016, + "grad_norm": 0.5722399353981018, + "learning_rate": 2.937723730690678e-08, + "loss": 0.2704, + "step": 10354 + }, + { + "epoch": 2.90707467714767, + "grad_norm": 0.5677437782287598, + "learning_rate": 2.92007047195364e-08, + "loss": 0.2834, + "step": 10355 + }, + { + "epoch": 2.9073554183043235, + "grad_norm": 0.5312627553939819, + "learning_rate": 2.902470258495549e-08, + "loss": 0.3123, + "step": 10356 + }, + { + "epoch": 2.907636159460977, + "grad_norm": 0.6231878399848938, + "learning_rate": 2.8849230921946248e-08, + "loss": 0.3309, + "step": 10357 + }, + { + "epoch": 2.9079169006176304, + "grad_norm": 0.5991750359535217, + "learning_rate": 2.8674289749233142e-08, + "loss": 0.3519, + "step": 10358 + }, + { + "epoch": 2.908197641774284, + "grad_norm": 0.5414668321609497, + "learning_rate": 2.8499879085485128e-08, + "loss": 0.3843, + "step": 10359 + }, + { + "epoch": 2.9084783829309377, + "grad_norm": 0.5410469174385071, + "learning_rate": 2.8325998949314536e-08, + "loss": 0.3001, + "step": 10360 + }, + { + "epoch": 2.908759124087591, + "grad_norm": 0.4624980390071869, + "learning_rate": 2.815264935927653e-08, + "loss": 0.3158, + "step": 10361 + }, + { + "epoch": 2.909039865244245, + "grad_norm": 0.5556575655937195, + "learning_rate": 2.7979830333869638e-08, + "loss": 0.29, + "step": 10362 + }, + { + "epoch": 2.9093206064008985, + "grad_norm": 0.506561279296875, + "learning_rate": 2.780754189153634e-08, + "loss": 0.3363, + "step": 10363 + }, + { + "epoch": 2.909601347557552, + "grad_norm": 0.5680440664291382, + "learning_rate": 2.7635784050662474e-08, + "loss": 0.315, + "step": 10364 + }, + { + "epoch": 2.9098820887142054, + "grad_norm": 0.47068169713020325, + "learning_rate": 2.746455682957616e-08, + "loss": 0.3098, + "step": 10365 + }, + { + "epoch": 2.910162829870859, + "grad_norm": 0.48363471031188965, + "learning_rate": 2.7293860246550563e-08, + "loss": 0.3483, + "step": 10366 + }, + { + "epoch": 2.9104435710275127, + "grad_norm": 0.5012058019638062, + "learning_rate": 2.7123694319800552e-08, + "loss": 0.3316, + "step": 10367 + }, + { + "epoch": 2.910724312184166, + "grad_norm": 0.5747319459915161, + "learning_rate": 2.695405906748605e-08, + "loss": 0.3357, + "step": 10368 + }, + { + "epoch": 2.9110050533408196, + "grad_norm": 0.5871933698654175, + "learning_rate": 2.678495450770924e-08, + "loss": 0.3277, + "step": 10369 + }, + { + "epoch": 2.9112857944974735, + "grad_norm": 0.5979270935058594, + "learning_rate": 2.6616380658515128e-08, + "loss": 0.3435, + "step": 10370 + }, + { + "epoch": 2.911566535654127, + "grad_norm": 0.5709941387176514, + "learning_rate": 2.6448337537893776e-08, + "loss": 0.3324, + "step": 10371 + }, + { + "epoch": 2.9118472768107804, + "grad_norm": 0.5479961633682251, + "learning_rate": 2.6280825163776946e-08, + "loss": 0.3391, + "step": 10372 + }, + { + "epoch": 2.912128017967434, + "grad_norm": 0.560149073600769, + "learning_rate": 2.6113843554041453e-08, + "loss": 0.3575, + "step": 10373 + }, + { + "epoch": 2.9124087591240877, + "grad_norm": 0.5506590604782104, + "learning_rate": 2.5947392726505817e-08, + "loss": 0.3279, + "step": 10374 + }, + { + "epoch": 2.912689500280741, + "grad_norm": 0.4642602801322937, + "learning_rate": 2.578147269893305e-08, + "loss": 0.3367, + "step": 10375 + }, + { + "epoch": 2.9129702414373946, + "grad_norm": 0.5406062602996826, + "learning_rate": 2.5616083489028443e-08, + "loss": 0.3271, + "step": 10376 + }, + { + "epoch": 2.9132509825940485, + "grad_norm": 0.5081760883331299, + "learning_rate": 2.5451225114441758e-08, + "loss": 0.3172, + "step": 10377 + }, + { + "epoch": 2.913531723750702, + "grad_norm": 0.5476431846618652, + "learning_rate": 2.5286897592766147e-08, + "loss": 0.3141, + "step": 10378 + }, + { + "epoch": 2.9138124649073553, + "grad_norm": 0.569713294506073, + "learning_rate": 2.5123100941537027e-08, + "loss": 0.337, + "step": 10379 + }, + { + "epoch": 2.914093206064009, + "grad_norm": 0.5425527691841125, + "learning_rate": 2.4959835178233748e-08, + "loss": 0.3072, + "step": 10380 + }, + { + "epoch": 2.9143739472206627, + "grad_norm": 0.5655215978622437, + "learning_rate": 2.4797100320279045e-08, + "loss": 0.3058, + "step": 10381 + }, + { + "epoch": 2.914654688377316, + "grad_norm": 0.504270613193512, + "learning_rate": 2.463489638503902e-08, + "loss": 0.3181, + "step": 10382 + }, + { + "epoch": 2.9149354295339696, + "grad_norm": 0.5506932139396667, + "learning_rate": 2.4473223389823166e-08, + "loss": 0.3079, + "step": 10383 + }, + { + "epoch": 2.9152161706906234, + "grad_norm": 0.5552608370780945, + "learning_rate": 2.4312081351883786e-08, + "loss": 0.3365, + "step": 10384 + }, + { + "epoch": 2.915496911847277, + "grad_norm": 0.5960777997970581, + "learning_rate": 2.4151470288418246e-08, + "loss": 0.3114, + "step": 10385 + }, + { + "epoch": 2.9157776530039303, + "grad_norm": 0.5495597720146179, + "learning_rate": 2.3991390216564492e-08, + "loss": 0.3443, + "step": 10386 + }, + { + "epoch": 2.9160583941605838, + "grad_norm": 0.5533434152603149, + "learning_rate": 2.3831841153405532e-08, + "loss": 0.2783, + "step": 10387 + }, + { + "epoch": 2.9163391353172377, + "grad_norm": 0.4872790575027466, + "learning_rate": 2.3672823115968303e-08, + "loss": 0.3496, + "step": 10388 + }, + { + "epoch": 2.916619876473891, + "grad_norm": 0.49798262119293213, + "learning_rate": 2.3514336121220893e-08, + "loss": 0.3551, + "step": 10389 + }, + { + "epoch": 2.9169006176305445, + "grad_norm": 0.5557563900947571, + "learning_rate": 2.3356380186077554e-08, + "loss": 0.3619, + "step": 10390 + }, + { + "epoch": 2.9171813587871984, + "grad_norm": 0.5699899792671204, + "learning_rate": 2.319895532739369e-08, + "loss": 0.3509, + "step": 10391 + }, + { + "epoch": 2.917462099943852, + "grad_norm": 0.632973849773407, + "learning_rate": 2.3042061561968087e-08, + "loss": 0.3064, + "step": 10392 + }, + { + "epoch": 2.9177428411005053, + "grad_norm": 0.4675182104110718, + "learning_rate": 2.2885698906544017e-08, + "loss": 0.3706, + "step": 10393 + }, + { + "epoch": 2.9180235822571587, + "grad_norm": 0.534770131111145, + "learning_rate": 2.272986737780758e-08, + "loss": 0.3383, + "step": 10394 + }, + { + "epoch": 2.9183043234138126, + "grad_norm": 0.5844979286193848, + "learning_rate": 2.2574566992388247e-08, + "loss": 0.3407, + "step": 10395 + }, + { + "epoch": 2.918585064570466, + "grad_norm": 0.584099292755127, + "learning_rate": 2.2419797766858876e-08, + "loss": 0.3281, + "step": 10396 + }, + { + "epoch": 2.9188658057271195, + "grad_norm": 0.5600420236587524, + "learning_rate": 2.2265559717734586e-08, + "loss": 0.3283, + "step": 10397 + }, + { + "epoch": 2.9191465468837734, + "grad_norm": 0.5826917886734009, + "learning_rate": 2.2111852861475546e-08, + "loss": 0.3408, + "step": 10398 + }, + { + "epoch": 2.919427288040427, + "grad_norm": 0.5664740204811096, + "learning_rate": 2.1958677214484192e-08, + "loss": 0.3267, + "step": 10399 + }, + { + "epoch": 2.9197080291970803, + "grad_norm": 0.5925627946853638, + "learning_rate": 2.1806032793106334e-08, + "loss": 0.3083, + "step": 10400 + }, + { + "epoch": 2.9199887703537337, + "grad_norm": 0.525027871131897, + "learning_rate": 2.165391961363117e-08, + "loss": 0.3482, + "step": 10401 + }, + { + "epoch": 2.920269511510387, + "grad_norm": 0.48973405361175537, + "learning_rate": 2.1502337692291818e-08, + "loss": 0.3958, + "step": 10402 + }, + { + "epoch": 2.920550252667041, + "grad_norm": 0.6105417609214783, + "learning_rate": 2.1351287045263124e-08, + "loss": 0.3459, + "step": 10403 + }, + { + "epoch": 2.9208309938236945, + "grad_norm": 0.5968075394630432, + "learning_rate": 2.1200767688665524e-08, + "loss": 0.3289, + "step": 10404 + }, + { + "epoch": 2.9211117349803484, + "grad_norm": 0.5124432444572449, + "learning_rate": 2.1050779638560616e-08, + "loss": 0.3492, + "step": 10405 + }, + { + "epoch": 2.921392476137002, + "grad_norm": 0.5225157141685486, + "learning_rate": 2.090132291095448e-08, + "loss": 0.323, + "step": 10406 + }, + { + "epoch": 2.9216732172936553, + "grad_norm": 0.5560155510902405, + "learning_rate": 2.075239752179603e-08, + "loss": 0.3574, + "step": 10407 + }, + { + "epoch": 2.9219539584503087, + "grad_norm": 0.5677082538604736, + "learning_rate": 2.060400348697811e-08, + "loss": 0.322, + "step": 10408 + }, + { + "epoch": 2.922234699606962, + "grad_norm": 0.5468083620071411, + "learning_rate": 2.0456140822335825e-08, + "loss": 0.2917, + "step": 10409 + }, + { + "epoch": 2.922515440763616, + "grad_norm": 0.5514059662818909, + "learning_rate": 2.0308809543648776e-08, + "loss": 0.29, + "step": 10410 + }, + { + "epoch": 2.9227961819202695, + "grad_norm": 0.5410345196723938, + "learning_rate": 2.0162009666638837e-08, + "loss": 0.335, + "step": 10411 + }, + { + "epoch": 2.9230769230769234, + "grad_norm": 0.5883322358131409, + "learning_rate": 2.0015741206971252e-08, + "loss": 0.3183, + "step": 10412 + }, + { + "epoch": 2.923357664233577, + "grad_norm": 0.5663148164749146, + "learning_rate": 1.9870004180255755e-08, + "loss": 0.3271, + "step": 10413 + }, + { + "epoch": 2.9236384053902302, + "grad_norm": 0.5233526229858398, + "learning_rate": 1.9724798602043793e-08, + "loss": 0.3407, + "step": 10414 + }, + { + "epoch": 2.9239191465468837, + "grad_norm": 0.6190729737281799, + "learning_rate": 1.958012448783131e-08, + "loss": 0.3041, + "step": 10415 + }, + { + "epoch": 2.924199887703537, + "grad_norm": 0.5732908248901367, + "learning_rate": 1.9435981853056506e-08, + "loss": 0.3322, + "step": 10416 + }, + { + "epoch": 2.924480628860191, + "grad_norm": 0.5757434964179993, + "learning_rate": 1.9292370713101527e-08, + "loss": 0.3411, + "step": 10417 + }, + { + "epoch": 2.9247613700168444, + "grad_norm": 0.5287437438964844, + "learning_rate": 1.9149291083291888e-08, + "loss": 0.3659, + "step": 10418 + }, + { + "epoch": 2.925042111173498, + "grad_norm": 0.5786102414131165, + "learning_rate": 1.9006742978895933e-08, + "loss": 0.3072, + "step": 10419 + }, + { + "epoch": 2.9253228523301518, + "grad_norm": 0.5517265200614929, + "learning_rate": 1.8864726415125933e-08, + "loss": 0.2873, + "step": 10420 + }, + { + "epoch": 2.925603593486805, + "grad_norm": 0.5496549606323242, + "learning_rate": 1.872324140713644e-08, + "loss": 0.3576, + "step": 10421 + }, + { + "epoch": 2.9258843346434587, + "grad_norm": 0.5972222685813904, + "learning_rate": 1.858228797002648e-08, + "loss": 0.3111, + "step": 10422 + }, + { + "epoch": 2.926165075800112, + "grad_norm": 0.5477586388587952, + "learning_rate": 1.8441866118836804e-08, + "loss": 0.2791, + "step": 10423 + }, + { + "epoch": 2.926445816956766, + "grad_norm": 0.5522471070289612, + "learning_rate": 1.8301975868553202e-08, + "loss": 0.3107, + "step": 10424 + }, + { + "epoch": 2.9267265581134194, + "grad_norm": 0.5395142436027527, + "learning_rate": 1.816261723410373e-08, + "loss": 0.3256, + "step": 10425 + }, + { + "epoch": 2.927007299270073, + "grad_norm": 0.515011191368103, + "learning_rate": 1.802379023035927e-08, + "loss": 0.3307, + "step": 10426 + }, + { + "epoch": 2.9272880404267267, + "grad_norm": 0.5615693926811218, + "learning_rate": 1.7885494872135754e-08, + "loss": 0.3205, + "step": 10427 + }, + { + "epoch": 2.92756878158338, + "grad_norm": 0.5720718502998352, + "learning_rate": 1.7747731174190262e-08, + "loss": 0.3105, + "step": 10428 + }, + { + "epoch": 2.9278495227400336, + "grad_norm": 0.5090105533599854, + "learning_rate": 1.7610499151223813e-08, + "loss": 0.3065, + "step": 10429 + }, + { + "epoch": 2.928130263896687, + "grad_norm": 0.5333269834518433, + "learning_rate": 1.7473798817881915e-08, + "loss": 0.3635, + "step": 10430 + }, + { + "epoch": 2.928411005053341, + "grad_norm": 0.5919604897499084, + "learning_rate": 1.7337630188751787e-08, + "loss": 0.2897, + "step": 10431 + }, + { + "epoch": 2.9286917462099944, + "grad_norm": 0.5328156352043152, + "learning_rate": 1.7201993278364582e-08, + "loss": 0.3172, + "step": 10432 + }, + { + "epoch": 2.928972487366648, + "grad_norm": 0.6113194227218628, + "learning_rate": 1.7066888101194835e-08, + "loss": 0.29, + "step": 10433 + }, + { + "epoch": 2.9292532285233017, + "grad_norm": 0.5573399066925049, + "learning_rate": 1.69323146716599e-08, + "loss": 0.3267, + "step": 10434 + }, + { + "epoch": 2.929533969679955, + "grad_norm": 0.4959973990917206, + "learning_rate": 1.6798273004121067e-08, + "loss": 0.356, + "step": 10435 + }, + { + "epoch": 2.9298147108366086, + "grad_norm": 0.5739654302597046, + "learning_rate": 1.6664763112881342e-08, + "loss": 0.3563, + "step": 10436 + }, + { + "epoch": 2.930095451993262, + "grad_norm": 0.5687186121940613, + "learning_rate": 1.6531785012189327e-08, + "loss": 0.2725, + "step": 10437 + }, + { + "epoch": 2.930376193149916, + "grad_norm": 0.5686548352241516, + "learning_rate": 1.639933871623478e-08, + "loss": 0.3233, + "step": 10438 + }, + { + "epoch": 2.9306569343065694, + "grad_norm": 0.5329210758209229, + "learning_rate": 1.6267424239151953e-08, + "loss": 0.2906, + "step": 10439 + }, + { + "epoch": 2.930937675463223, + "grad_norm": 0.5315520167350769, + "learning_rate": 1.6136041595017914e-08, + "loss": 0.3553, + "step": 10440 + }, + { + "epoch": 2.9312184166198767, + "grad_norm": 0.5640670657157898, + "learning_rate": 1.6005190797852564e-08, + "loss": 0.2898, + "step": 10441 + }, + { + "epoch": 2.93149915777653, + "grad_norm": 0.539916455745697, + "learning_rate": 1.5874871861620287e-08, + "loss": 0.3293, + "step": 10442 + }, + { + "epoch": 2.9317798989331836, + "grad_norm": 0.5563673377037048, + "learning_rate": 1.5745084800227184e-08, + "loss": 0.3042, + "step": 10443 + }, + { + "epoch": 2.932060640089837, + "grad_norm": 0.5389270782470703, + "learning_rate": 1.5615829627523837e-08, + "loss": 0.2954, + "step": 10444 + }, + { + "epoch": 2.9323413812464905, + "grad_norm": 0.5811926126480103, + "learning_rate": 1.5487106357303106e-08, + "loss": 0.3445, + "step": 10445 + }, + { + "epoch": 2.9326221224031443, + "grad_norm": 0.566461980342865, + "learning_rate": 1.5358915003301223e-08, + "loss": 0.2767, + "step": 10446 + }, + { + "epoch": 2.932902863559798, + "grad_norm": 0.49240151047706604, + "learning_rate": 1.5231255579199466e-08, + "loss": 0.3875, + "step": 10447 + }, + { + "epoch": 2.9331836047164517, + "grad_norm": 0.5289565920829773, + "learning_rate": 1.5104128098619164e-08, + "loss": 0.3177, + "step": 10448 + }, + { + "epoch": 2.933464345873105, + "grad_norm": 0.5947088599205017, + "learning_rate": 1.4977532575127794e-08, + "loss": 0.3637, + "step": 10449 + }, + { + "epoch": 2.9337450870297586, + "grad_norm": 0.5179443955421448, + "learning_rate": 1.4851469022234e-08, + "loss": 0.3051, + "step": 10450 + }, + { + "epoch": 2.934025828186412, + "grad_norm": 0.5526657104492188, + "learning_rate": 1.4725937453390904e-08, + "loss": 0.3195, + "step": 10451 + }, + { + "epoch": 2.9343065693430654, + "grad_norm": 0.5307288765907288, + "learning_rate": 1.460093788199446e-08, + "loss": 0.3406, + "step": 10452 + }, + { + "epoch": 2.9345873104997193, + "grad_norm": 0.6137859225273132, + "learning_rate": 1.4476470321383995e-08, + "loss": 0.3571, + "step": 10453 + }, + { + "epoch": 2.9348680516563728, + "grad_norm": 0.5237303376197815, + "learning_rate": 1.435253478484111e-08, + "loss": 0.3229, + "step": 10454 + }, + { + "epoch": 2.9351487928130267, + "grad_norm": 0.5243408679962158, + "learning_rate": 1.4229131285592446e-08, + "loss": 0.2663, + "step": 10455 + }, + { + "epoch": 2.93542953396968, + "grad_norm": 0.5741708278656006, + "learning_rate": 1.4106259836806357e-08, + "loss": 0.3031, + "step": 10456 + }, + { + "epoch": 2.9357102751263335, + "grad_norm": 0.5652405023574829, + "learning_rate": 1.3983920451595135e-08, + "loss": 0.3058, + "step": 10457 + }, + { + "epoch": 2.935991016282987, + "grad_norm": 0.5818977355957031, + "learning_rate": 1.3862113143013888e-08, + "loss": 0.281, + "step": 10458 + }, + { + "epoch": 2.9362717574396404, + "grad_norm": 0.5077506899833679, + "learning_rate": 1.3740837924061112e-08, + "loss": 0.3462, + "step": 10459 + }, + { + "epoch": 2.9365524985962943, + "grad_norm": 0.5771107077598572, + "learning_rate": 1.362009480767812e-08, + "loss": 0.3328, + "step": 10460 + }, + { + "epoch": 2.9368332397529477, + "grad_norm": 0.5271692276000977, + "learning_rate": 1.3499883806751269e-08, + "loss": 0.2982, + "step": 10461 + }, + { + "epoch": 2.937113980909601, + "grad_norm": 0.6047289967536926, + "learning_rate": 1.3380204934106967e-08, + "loss": 0.3102, + "step": 10462 + }, + { + "epoch": 2.937394722066255, + "grad_norm": 0.47763630747795105, + "learning_rate": 1.3261058202517773e-08, + "loss": 0.3464, + "step": 10463 + }, + { + "epoch": 2.9376754632229085, + "grad_norm": 0.5546920895576477, + "learning_rate": 1.314244362469852e-08, + "loss": 0.3326, + "step": 10464 + }, + { + "epoch": 2.937956204379562, + "grad_norm": 0.5359768867492676, + "learning_rate": 1.3024361213305747e-08, + "loss": 0.2888, + "step": 10465 + }, + { + "epoch": 2.9382369455362154, + "grad_norm": 0.5531989336013794, + "learning_rate": 1.2906810980941597e-08, + "loss": 0.3443, + "step": 10466 + }, + { + "epoch": 2.9385176866928693, + "grad_norm": 0.5018088221549988, + "learning_rate": 1.2789792940149371e-08, + "loss": 0.3229, + "step": 10467 + }, + { + "epoch": 2.9387984278495227, + "grad_norm": 0.5590019822120667, + "learning_rate": 1.267330710341741e-08, + "loss": 0.3599, + "step": 10468 + }, + { + "epoch": 2.939079169006176, + "grad_norm": 0.5875823497772217, + "learning_rate": 1.2557353483176327e-08, + "loss": 0.3281, + "step": 10469 + }, + { + "epoch": 2.93935991016283, + "grad_norm": 0.5337052941322327, + "learning_rate": 1.2441932091799004e-08, + "loss": 0.2883, + "step": 10470 + }, + { + "epoch": 2.9396406513194835, + "grad_norm": 0.5880594253540039, + "learning_rate": 1.2327042941603362e-08, + "loss": 0.318, + "step": 10471 + }, + { + "epoch": 2.939921392476137, + "grad_norm": 0.6057711839675903, + "learning_rate": 1.2212686044849598e-08, + "loss": 0.3212, + "step": 10472 + }, + { + "epoch": 2.9402021336327904, + "grad_norm": 0.5037532448768616, + "learning_rate": 1.2098861413740726e-08, + "loss": 0.3392, + "step": 10473 + }, + { + "epoch": 2.9404828747894443, + "grad_norm": 0.5388426184654236, + "learning_rate": 1.1985569060423696e-08, + "loss": 0.3077, + "step": 10474 + }, + { + "epoch": 2.9407636159460977, + "grad_norm": 0.5098231434822083, + "learning_rate": 1.1872808996988284e-08, + "loss": 0.3176, + "step": 10475 + }, + { + "epoch": 2.941044357102751, + "grad_norm": 0.5242395401000977, + "learning_rate": 1.176058123546764e-08, + "loss": 0.3001, + "step": 10476 + }, + { + "epoch": 2.941325098259405, + "grad_norm": 0.5007601380348206, + "learning_rate": 1.1648885787837737e-08, + "loss": 0.3105, + "step": 10477 + }, + { + "epoch": 2.9416058394160585, + "grad_norm": 0.494960218667984, + "learning_rate": 1.1537722666018492e-08, + "loss": 0.3354, + "step": 10478 + }, + { + "epoch": 2.941886580572712, + "grad_norm": 0.4698236584663391, + "learning_rate": 1.1427091881872077e-08, + "loss": 0.3497, + "step": 10479 + }, + { + "epoch": 2.9421673217293653, + "grad_norm": 0.532472550868988, + "learning_rate": 1.1316993447204604e-08, + "loss": 0.3354, + "step": 10480 + }, + { + "epoch": 2.9424480628860192, + "grad_norm": 0.565513014793396, + "learning_rate": 1.1207427373765568e-08, + "loss": 0.3438, + "step": 10481 + }, + { + "epoch": 2.9427288040426727, + "grad_norm": 0.5432757139205933, + "learning_rate": 1.1098393673246166e-08, + "loss": 0.3479, + "step": 10482 + }, + { + "epoch": 2.943009545199326, + "grad_norm": 0.6216778755187988, + "learning_rate": 1.0989892357282095e-08, + "loss": 0.3328, + "step": 10483 + }, + { + "epoch": 2.94329028635598, + "grad_norm": 0.48868751525878906, + "learning_rate": 1.0881923437452424e-08, + "loss": 0.3117, + "step": 10484 + }, + { + "epoch": 2.9435710275126334, + "grad_norm": 0.5677542090415955, + "learning_rate": 1.0774486925278493e-08, + "loss": 0.2997, + "step": 10485 + }, + { + "epoch": 2.943851768669287, + "grad_norm": 0.5757440328598022, + "learning_rate": 1.066758283222502e-08, + "loss": 0.2953, + "step": 10486 + }, + { + "epoch": 2.9441325098259403, + "grad_norm": 0.5152274966239929, + "learning_rate": 1.0561211169700658e-08, + "loss": 0.3133, + "step": 10487 + }, + { + "epoch": 2.944413250982594, + "grad_norm": 0.5793607234954834, + "learning_rate": 1.045537194905688e-08, + "loss": 0.3091, + "step": 10488 + }, + { + "epoch": 2.9446939921392477, + "grad_norm": 0.5389032363891602, + "learning_rate": 1.0350065181587432e-08, + "loss": 0.2924, + "step": 10489 + }, + { + "epoch": 2.944974733295901, + "grad_norm": 0.6232114434242249, + "learning_rate": 1.02452908785311e-08, + "loss": 0.3007, + "step": 10490 + }, + { + "epoch": 2.945255474452555, + "grad_norm": 0.5648753046989441, + "learning_rate": 1.014104905106783e-08, + "loss": 0.3289, + "step": 10491 + }, + { + "epoch": 2.9455362156092084, + "grad_norm": 0.5341949462890625, + "learning_rate": 1.0037339710321503e-08, + "loss": 0.3247, + "step": 10492 + }, + { + "epoch": 2.945816956765862, + "grad_norm": 0.5344690084457397, + "learning_rate": 9.934162867359932e-09, + "loss": 0.3034, + "step": 10493 + }, + { + "epoch": 2.9460976979225153, + "grad_norm": 0.5142850279808044, + "learning_rate": 9.83151853319375e-09, + "loss": 0.3155, + "step": 10494 + }, + { + "epoch": 2.9463784390791687, + "grad_norm": 0.5972554087638855, + "learning_rate": 9.72940671877587e-09, + "loss": 0.359, + "step": 10495 + }, + { + "epoch": 2.9466591802358226, + "grad_norm": 0.5689883232116699, + "learning_rate": 9.627827435003124e-09, + "loss": 0.3496, + "step": 10496 + }, + { + "epoch": 2.946939921392476, + "grad_norm": 0.539682149887085, + "learning_rate": 9.526780692715177e-09, + "loss": 0.3097, + "step": 10497 + }, + { + "epoch": 2.94722066254913, + "grad_norm": 0.5398123264312744, + "learning_rate": 9.42626650269618e-09, + "loss": 0.331, + "step": 10498 + }, + { + "epoch": 2.9475014037057834, + "grad_norm": 0.5512509346008301, + "learning_rate": 9.326284875671444e-09, + "loss": 0.3332, + "step": 10499 + }, + { + "epoch": 2.947782144862437, + "grad_norm": 0.5428307056427002, + "learning_rate": 9.226835822310765e-09, + "loss": 0.3715, + "step": 10500 + }, + { + "epoch": 2.9480628860190903, + "grad_norm": 0.5412346124649048, + "learning_rate": 9.127919353226212e-09, + "loss": 0.3278, + "step": 10501 + }, + { + "epoch": 2.9483436271757437, + "grad_norm": 0.5379180908203125, + "learning_rate": 9.029535478974339e-09, + "loss": 0.3038, + "step": 10502 + }, + { + "epoch": 2.9486243683323976, + "grad_norm": 0.5824816226959229, + "learning_rate": 8.931684210053415e-09, + "loss": 0.3405, + "step": 10503 + }, + { + "epoch": 2.948905109489051, + "grad_norm": 0.5193791389465332, + "learning_rate": 8.834365556905644e-09, + "loss": 0.3748, + "step": 10504 + }, + { + "epoch": 2.949185850645705, + "grad_norm": 0.5310022234916687, + "learning_rate": 8.737579529916607e-09, + "loss": 0.3754, + "step": 10505 + }, + { + "epoch": 2.9494665918023584, + "grad_norm": 0.510654866695404, + "learning_rate": 8.641326139414707e-09, + "loss": 0.2935, + "step": 10506 + }, + { + "epoch": 2.949747332959012, + "grad_norm": 0.5400134921073914, + "learning_rate": 8.54560539567062e-09, + "loss": 0.3278, + "step": 10507 + }, + { + "epoch": 2.9500280741156653, + "grad_norm": 0.5767861008644104, + "learning_rate": 8.45041730890006e-09, + "loss": 0.3373, + "step": 10508 + }, + { + "epoch": 2.9503088152723187, + "grad_norm": 0.51285320520401, + "learning_rate": 8.35576188926046e-09, + "loss": 0.3096, + "step": 10509 + }, + { + "epoch": 2.9505895564289726, + "grad_norm": 0.5136780738830566, + "learning_rate": 8.261639146853185e-09, + "loss": 0.3167, + "step": 10510 + }, + { + "epoch": 2.950870297585626, + "grad_norm": 0.5744150280952454, + "learning_rate": 8.168049091722418e-09, + "loss": 0.347, + "step": 10511 + }, + { + "epoch": 2.9511510387422795, + "grad_norm": 0.5855783224105835, + "learning_rate": 8.07499173385462e-09, + "loss": 0.3265, + "step": 10512 + }, + { + "epoch": 2.9514317798989333, + "grad_norm": 0.5832756757736206, + "learning_rate": 7.982467083181845e-09, + "loss": 0.3291, + "step": 10513 + }, + { + "epoch": 2.951712521055587, + "grad_norm": 0.548391580581665, + "learning_rate": 7.890475149576194e-09, + "loss": 0.3197, + "step": 10514 + }, + { + "epoch": 2.9519932622122402, + "grad_norm": 0.5297378301620483, + "learning_rate": 7.79901594285537e-09, + "loss": 0.3109, + "step": 10515 + }, + { + "epoch": 2.9522740033688937, + "grad_norm": 0.48342037200927734, + "learning_rate": 7.70808947277879e-09, + "loss": 0.3334, + "step": 10516 + }, + { + "epoch": 2.9525547445255476, + "grad_norm": 0.5580374002456665, + "learning_rate": 7.617695749050358e-09, + "loss": 0.3156, + "step": 10517 + }, + { + "epoch": 2.952835485682201, + "grad_norm": 0.5306932330131531, + "learning_rate": 7.52783478131569e-09, + "loss": 0.3414, + "step": 10518 + }, + { + "epoch": 2.9531162268388544, + "grad_norm": 0.5230298042297363, + "learning_rate": 7.43850657916434e-09, + "loss": 0.3603, + "step": 10519 + }, + { + "epoch": 2.9533969679955083, + "grad_norm": 0.5968591570854187, + "learning_rate": 7.34971115212868e-09, + "loss": 0.3154, + "step": 10520 + }, + { + "epoch": 2.9536777091521618, + "grad_norm": 0.633278489112854, + "learning_rate": 7.2614485096850205e-09, + "loss": 0.3056, + "step": 10521 + }, + { + "epoch": 2.953958450308815, + "grad_norm": 0.5836353302001953, + "learning_rate": 7.173718661251383e-09, + "loss": 0.3193, + "step": 10522 + }, + { + "epoch": 2.9542391914654687, + "grad_norm": 0.5063651204109192, + "learning_rate": 7.0865216161902785e-09, + "loss": 0.2898, + "step": 10523 + }, + { + "epoch": 2.9545199326221225, + "grad_norm": 0.6093495488166809, + "learning_rate": 6.999857383806485e-09, + "loss": 0.3116, + "step": 10524 + }, + { + "epoch": 2.954800673778776, + "grad_norm": 0.5958014726638794, + "learning_rate": 6.913725973349272e-09, + "loss": 0.2975, + "step": 10525 + }, + { + "epoch": 2.9550814149354294, + "grad_norm": 0.5749132633209229, + "learning_rate": 6.82812739400851e-09, + "loss": 0.3268, + "step": 10526 + }, + { + "epoch": 2.9553621560920833, + "grad_norm": 0.517746090888977, + "learning_rate": 6.743061654919669e-09, + "loss": 0.3395, + "step": 10527 + }, + { + "epoch": 2.9556428972487367, + "grad_norm": 0.6040874123573303, + "learning_rate": 6.658528765160488e-09, + "loss": 0.3363, + "step": 10528 + }, + { + "epoch": 2.95592363840539, + "grad_norm": 0.5603330135345459, + "learning_rate": 6.574528733751529e-09, + "loss": 0.3631, + "step": 10529 + }, + { + "epoch": 2.9562043795620436, + "grad_norm": 0.5852132439613342, + "learning_rate": 6.491061569656731e-09, + "loss": 0.3742, + "step": 10530 + }, + { + "epoch": 2.9564851207186975, + "grad_norm": 0.515753984451294, + "learning_rate": 6.408127281782861e-09, + "loss": 0.3291, + "step": 10531 + }, + { + "epoch": 2.956765861875351, + "grad_norm": 0.5279064774513245, + "learning_rate": 6.32572587898117e-09, + "loss": 0.3308, + "step": 10532 + }, + { + "epoch": 2.9570466030320044, + "grad_norm": 0.5763012766838074, + "learning_rate": 6.2438573700440706e-09, + "loss": 0.3341, + "step": 10533 + }, + { + "epoch": 2.9573273441886583, + "grad_norm": 0.5592288374900818, + "learning_rate": 6.162521763708462e-09, + "loss": 0.2718, + "step": 10534 + }, + { + "epoch": 2.9576080853453117, + "grad_norm": 0.5332738757133484, + "learning_rate": 6.0817190686540685e-09, + "loss": 0.2813, + "step": 10535 + }, + { + "epoch": 2.957888826501965, + "grad_norm": 0.5409986972808838, + "learning_rate": 6.001449293503436e-09, + "loss": 0.3257, + "step": 10536 + }, + { + "epoch": 2.9581695676586186, + "grad_norm": 0.5411288142204285, + "learning_rate": 5.921712446822492e-09, + "loss": 0.3446, + "step": 10537 + }, + { + "epoch": 2.958450308815272, + "grad_norm": 0.5346503853797913, + "learning_rate": 5.842508537119984e-09, + "loss": 0.3407, + "step": 10538 + }, + { + "epoch": 2.958731049971926, + "grad_norm": 0.528346598148346, + "learning_rate": 5.763837572848596e-09, + "loss": 0.3092, + "step": 10539 + }, + { + "epoch": 2.9590117911285794, + "grad_norm": 0.5470328330993652, + "learning_rate": 5.685699562403279e-09, + "loss": 0.3298, + "step": 10540 + }, + { + "epoch": 2.9592925322852333, + "grad_norm": 0.5878907442092896, + "learning_rate": 5.6080945141223644e-09, + "loss": 0.3496, + "step": 10541 + }, + { + "epoch": 2.9595732734418867, + "grad_norm": 0.4949032664299011, + "learning_rate": 5.531022436288114e-09, + "loss": 0.3576, + "step": 10542 + }, + { + "epoch": 2.95985401459854, + "grad_norm": 0.6270462870597839, + "learning_rate": 5.45448333712395e-09, + "loss": 0.3467, + "step": 10543 + }, + { + "epoch": 2.9601347557551936, + "grad_norm": 0.5856950879096985, + "learning_rate": 5.378477224798895e-09, + "loss": 0.2873, + "step": 10544 + }, + { + "epoch": 2.960415496911847, + "grad_norm": 0.5382655262947083, + "learning_rate": 5.303004107422571e-09, + "loss": 0.3211, + "step": 10545 + }, + { + "epoch": 2.960696238068501, + "grad_norm": 0.699613094329834, + "learning_rate": 5.228063993050203e-09, + "loss": 0.2839, + "step": 10546 + }, + { + "epoch": 2.9609769792251543, + "grad_norm": 0.5352074503898621, + "learning_rate": 5.1536568896781715e-09, + "loss": 0.3989, + "step": 10547 + }, + { + "epoch": 2.9612577203818082, + "grad_norm": 0.49492260813713074, + "learning_rate": 5.079782805246791e-09, + "loss": 0.3326, + "step": 10548 + }, + { + "epoch": 2.9615384615384617, + "grad_norm": 0.49701163172721863, + "learning_rate": 5.0064417476403115e-09, + "loss": 0.3489, + "step": 10549 + }, + { + "epoch": 2.961819202695115, + "grad_norm": 0.489573210477829, + "learning_rate": 4.9336337246841394e-09, + "loss": 0.3358, + "step": 10550 + }, + { + "epoch": 2.9620999438517686, + "grad_norm": 0.5300890207290649, + "learning_rate": 4.86135874414817e-09, + "loss": 0.3099, + "step": 10551 + }, + { + "epoch": 2.962380685008422, + "grad_norm": 0.6492616534233093, + "learning_rate": 4.789616813745123e-09, + "loss": 0.3033, + "step": 10552 + }, + { + "epoch": 2.962661426165076, + "grad_norm": 0.5822144150733948, + "learning_rate": 4.7184079411310975e-09, + "loss": 0.28, + "step": 10553 + }, + { + "epoch": 2.9629421673217293, + "grad_norm": 0.5453940629959106, + "learning_rate": 4.6477321339055695e-09, + "loss": 0.3671, + "step": 10554 + }, + { + "epoch": 2.9632229084783828, + "grad_norm": 0.610227644443512, + "learning_rate": 4.5775893996097274e-09, + "loss": 0.3249, + "step": 10555 + }, + { + "epoch": 2.9635036496350367, + "grad_norm": 0.5688523054122925, + "learning_rate": 4.5079797457286965e-09, + "loss": 0.3318, + "step": 10556 + }, + { + "epoch": 2.96378439079169, + "grad_norm": 0.5592189431190491, + "learning_rate": 4.438903179691534e-09, + "loss": 0.3837, + "step": 10557 + }, + { + "epoch": 2.9640651319483435, + "grad_norm": 0.6163564324378967, + "learning_rate": 4.37035970886901e-09, + "loss": 0.2799, + "step": 10558 + }, + { + "epoch": 2.964345873104997, + "grad_norm": 0.5381830930709839, + "learning_rate": 4.302349340575829e-09, + "loss": 0.3028, + "step": 10559 + }, + { + "epoch": 2.964626614261651, + "grad_norm": 0.5643508434295654, + "learning_rate": 4.234872082070074e-09, + "loss": 0.3514, + "step": 10560 + }, + { + "epoch": 2.9649073554183043, + "grad_norm": 0.5094709992408752, + "learning_rate": 4.167927940552097e-09, + "loss": 0.354, + "step": 10561 + }, + { + "epoch": 2.9651880965749577, + "grad_norm": 0.5500958561897278, + "learning_rate": 4.101516923165627e-09, + "loss": 0.3048, + "step": 10562 + }, + { + "epoch": 2.9654688377316116, + "grad_norm": 0.5777549743652344, + "learning_rate": 4.035639036998329e-09, + "loss": 0.3187, + "step": 10563 + }, + { + "epoch": 2.965749578888265, + "grad_norm": 0.5151559114456177, + "learning_rate": 3.970294289079024e-09, + "loss": 0.3021, + "step": 10564 + }, + { + "epoch": 2.9660303200449185, + "grad_norm": 0.5703250765800476, + "learning_rate": 3.905482686382134e-09, + "loss": 0.3246, + "step": 10565 + }, + { + "epoch": 2.966311061201572, + "grad_norm": 0.5516538023948669, + "learning_rate": 3.841204235823792e-09, + "loss": 0.3308, + "step": 10566 + }, + { + "epoch": 2.966591802358226, + "grad_norm": 0.5556989908218384, + "learning_rate": 3.777458944262402e-09, + "loss": 0.3584, + "step": 10567 + }, + { + "epoch": 2.9668725435148793, + "grad_norm": 0.5369669795036316, + "learning_rate": 3.71424681850141e-09, + "loss": 0.2965, + "step": 10568 + }, + { + "epoch": 2.9671532846715327, + "grad_norm": 0.5599948167800903, + "learning_rate": 3.6515678652859765e-09, + "loss": 0.3284, + "step": 10569 + }, + { + "epoch": 2.9674340258281866, + "grad_norm": 0.5642825365066528, + "learning_rate": 3.589422091305195e-09, + "loss": 0.285, + "step": 10570 + }, + { + "epoch": 2.96771476698484, + "grad_norm": 0.5726443529129028, + "learning_rate": 3.527809503190982e-09, + "loss": 0.3092, + "step": 10571 + }, + { + "epoch": 2.9679955081414935, + "grad_norm": 0.4911079704761505, + "learning_rate": 3.4667301075175242e-09, + "loss": 0.3249, + "step": 10572 + }, + { + "epoch": 2.968276249298147, + "grad_norm": 0.5511907935142517, + "learning_rate": 3.4061839108029403e-09, + "loss": 0.326, + "step": 10573 + }, + { + "epoch": 2.968556990454801, + "grad_norm": 0.5117613673210144, + "learning_rate": 3.346170919509284e-09, + "loss": 0.3266, + "step": 10574 + }, + { + "epoch": 2.9688377316114543, + "grad_norm": 0.5381171703338623, + "learning_rate": 3.286691140039766e-09, + "loss": 0.307, + "step": 10575 + }, + { + "epoch": 2.9691184727681077, + "grad_norm": 0.5498996376991272, + "learning_rate": 3.227744578742087e-09, + "loss": 0.3189, + "step": 10576 + }, + { + "epoch": 2.9693992139247616, + "grad_norm": 0.5805063843727112, + "learning_rate": 3.16933124190677e-09, + "loss": 0.3325, + "step": 10577 + }, + { + "epoch": 2.969679955081415, + "grad_norm": 0.5250602960586548, + "learning_rate": 3.1114511357666077e-09, + "loss": 0.3277, + "step": 10578 + }, + { + "epoch": 2.9699606962380685, + "grad_norm": 0.5555799603462219, + "learning_rate": 3.054104266499436e-09, + "loss": 0.3127, + "step": 10579 + }, + { + "epoch": 2.970241437394722, + "grad_norm": 0.5048649311065674, + "learning_rate": 2.9972906402242487e-09, + "loss": 0.3484, + "step": 10580 + }, + { + "epoch": 2.970522178551376, + "grad_norm": 0.5382528901100159, + "learning_rate": 2.9410102630034186e-09, + "loss": 0.3327, + "step": 10581 + }, + { + "epoch": 2.9708029197080292, + "grad_norm": 0.5924113988876343, + "learning_rate": 2.885263140843808e-09, + "loss": 0.3206, + "step": 10582 + }, + { + "epoch": 2.9710836608646827, + "grad_norm": 0.519474446773529, + "learning_rate": 2.8300492796939914e-09, + "loss": 0.3459, + "step": 10583 + }, + { + "epoch": 2.9713644020213366, + "grad_norm": 0.5960259437561035, + "learning_rate": 2.7753686854453676e-09, + "loss": 0.3431, + "step": 10584 + }, + { + "epoch": 2.97164514317799, + "grad_norm": 0.5507221817970276, + "learning_rate": 2.721221363934379e-09, + "loss": 0.3204, + "step": 10585 + }, + { + "epoch": 2.9719258843346434, + "grad_norm": 0.5993731617927551, + "learning_rate": 2.6676073209380705e-09, + "loss": 0.2847, + "step": 10586 + }, + { + "epoch": 2.972206625491297, + "grad_norm": 0.5828808546066284, + "learning_rate": 2.614526562178532e-09, + "loss": 0.3446, + "step": 10587 + }, + { + "epoch": 2.9724873666479503, + "grad_norm": 0.6000937223434448, + "learning_rate": 2.5619790933201216e-09, + "loss": 0.3496, + "step": 10588 + }, + { + "epoch": 2.972768107804604, + "grad_norm": 0.484211266040802, + "learning_rate": 2.5099649199705754e-09, + "loss": 0.3505, + "step": 10589 + }, + { + "epoch": 2.9730488489612577, + "grad_norm": 0.58460932970047, + "learning_rate": 2.4584840476798986e-09, + "loss": 0.322, + "step": 10590 + }, + { + "epoch": 2.9733295901179115, + "grad_norm": 0.5212740898132324, + "learning_rate": 2.40753648194203e-09, + "loss": 0.316, + "step": 10591 + }, + { + "epoch": 2.973610331274565, + "grad_norm": 0.5156008005142212, + "learning_rate": 2.3571222281937312e-09, + "loss": 0.3286, + "step": 10592 + }, + { + "epoch": 2.9738910724312184, + "grad_norm": 0.5211127996444702, + "learning_rate": 2.3072412918156984e-09, + "loss": 0.3035, + "step": 10593 + }, + { + "epoch": 2.974171813587872, + "grad_norm": 0.4743029475212097, + "learning_rate": 2.2578936781297854e-09, + "loss": 0.3533, + "step": 10594 + }, + { + "epoch": 2.9744525547445253, + "grad_norm": 0.6124950647354126, + "learning_rate": 2.209079392402891e-09, + "loss": 0.2876, + "step": 10595 + }, + { + "epoch": 2.974733295901179, + "grad_norm": 0.4953879415988922, + "learning_rate": 2.1607984398436255e-09, + "loss": 0.3059, + "step": 10596 + }, + { + "epoch": 2.9750140370578326, + "grad_norm": 0.543163001537323, + "learning_rate": 2.1130508256039793e-09, + "loss": 0.3122, + "step": 10597 + }, + { + "epoch": 2.9752947782144865, + "grad_norm": 0.5467824339866638, + "learning_rate": 2.0658365547804316e-09, + "loss": 0.3232, + "step": 10598 + }, + { + "epoch": 2.97557551937114, + "grad_norm": 0.5721521377563477, + "learning_rate": 2.0191556324106186e-09, + "loss": 0.3285, + "step": 10599 + }, + { + "epoch": 2.9758562605277934, + "grad_norm": 0.5498932600021362, + "learning_rate": 1.9730080634761116e-09, + "loss": 0.306, + "step": 10600 + }, + { + "epoch": 2.976137001684447, + "grad_norm": 0.5505990386009216, + "learning_rate": 1.927393852901305e-09, + "loss": 0.3281, + "step": 10601 + }, + { + "epoch": 2.9764177428411003, + "grad_norm": 0.5039509534835815, + "learning_rate": 1.8823130055539706e-09, + "loss": 0.3456, + "step": 10602 + }, + { + "epoch": 2.976698483997754, + "grad_norm": 0.5800801515579224, + "learning_rate": 1.837765526245261e-09, + "loss": 0.2784, + "step": 10603 + }, + { + "epoch": 2.9769792251544076, + "grad_norm": 0.5249902009963989, + "learning_rate": 1.7937514197285955e-09, + "loss": 0.315, + "step": 10604 + }, + { + "epoch": 2.977259966311061, + "grad_norm": 0.5883903503417969, + "learning_rate": 1.7502706907007727e-09, + "loss": 0.3186, + "step": 10605 + }, + { + "epoch": 2.977540707467715, + "grad_norm": 0.5466181039810181, + "learning_rate": 1.7073233438019699e-09, + "loss": 0.3521, + "step": 10606 + }, + { + "epoch": 2.9778214486243684, + "grad_norm": 0.5664665699005127, + "learning_rate": 1.6649093836157427e-09, + "loss": 0.3201, + "step": 10607 + }, + { + "epoch": 2.978102189781022, + "grad_norm": 0.5520034432411194, + "learning_rate": 1.62302881466736e-09, + "loss": 0.3534, + "step": 10608 + }, + { + "epoch": 2.9783829309376753, + "grad_norm": 0.54378741979599, + "learning_rate": 1.58168164142658e-09, + "loss": 0.3412, + "step": 10609 + }, + { + "epoch": 2.978663672094329, + "grad_norm": 0.5794954895973206, + "learning_rate": 1.540867868305984e-09, + "loss": 0.3537, + "step": 10610 + }, + { + "epoch": 2.9789444132509826, + "grad_norm": 0.5040106773376465, + "learning_rate": 1.500587499660422e-09, + "loss": 0.3113, + "step": 10611 + }, + { + "epoch": 2.979225154407636, + "grad_norm": 0.5427335500717163, + "learning_rate": 1.460840539788122e-09, + "loss": 0.4027, + "step": 10612 + }, + { + "epoch": 2.97950589556429, + "grad_norm": 0.49653446674346924, + "learning_rate": 1.4216269929318017e-09, + "loss": 0.3556, + "step": 10613 + }, + { + "epoch": 2.9797866367209433, + "grad_norm": 0.5319122672080994, + "learning_rate": 1.382946863274781e-09, + "loss": 0.2885, + "step": 10614 + }, + { + "epoch": 2.980067377877597, + "grad_norm": 0.5083991289138794, + "learning_rate": 1.3448001549454248e-09, + "loss": 0.3235, + "step": 10615 + }, + { + "epoch": 2.9803481190342502, + "grad_norm": 0.5581716895103455, + "learning_rate": 1.3071868720143654e-09, + "loss": 0.2853, + "step": 10616 + }, + { + "epoch": 2.980628860190904, + "grad_norm": 0.47545793652534485, + "learning_rate": 1.2701070184956143e-09, + "loss": 0.3366, + "step": 10617 + }, + { + "epoch": 2.9809096013475576, + "grad_norm": 0.5832540988922119, + "learning_rate": 1.2335605983460065e-09, + "loss": 0.3102, + "step": 10618 + }, + { + "epoch": 2.981190342504211, + "grad_norm": 0.5522453784942627, + "learning_rate": 1.1975476154651999e-09, + "loss": 0.2653, + "step": 10619 + }, + { + "epoch": 2.981471083660865, + "grad_norm": 0.5310642123222351, + "learning_rate": 1.1620680736973422e-09, + "loss": 0.3552, + "step": 10620 + }, + { + "epoch": 2.9817518248175183, + "grad_norm": 0.565291702747345, + "learning_rate": 1.1271219768271836e-09, + "loss": 0.2752, + "step": 10621 + }, + { + "epoch": 2.9820325659741718, + "grad_norm": 0.5362760424613953, + "learning_rate": 1.0927093285850732e-09, + "loss": 0.3523, + "step": 10622 + }, + { + "epoch": 2.982313307130825, + "grad_norm": 0.6323783993721008, + "learning_rate": 1.0588301326425187e-09, + "loss": 0.2652, + "step": 10623 + }, + { + "epoch": 2.982594048287479, + "grad_norm": 0.5766808986663818, + "learning_rate": 1.0254843926155167e-09, + "loss": 0.3853, + "step": 10624 + }, + { + "epoch": 2.9828747894441325, + "grad_norm": 0.5770233273506165, + "learning_rate": 9.926721120617766e-10, + "loss": 0.3102, + "step": 10625 + }, + { + "epoch": 2.983155530600786, + "grad_norm": 0.5527682304382324, + "learning_rate": 9.603932944840522e-10, + "loss": 0.3594, + "step": 10626 + }, + { + "epoch": 2.98343627175744, + "grad_norm": 0.5960839986801147, + "learning_rate": 9.286479433257e-10, + "loss": 0.3154, + "step": 10627 + }, + { + "epoch": 2.9837170129140933, + "grad_norm": 0.49269023537635803, + "learning_rate": 8.97436061975121e-10, + "loss": 0.3657, + "step": 10628 + }, + { + "epoch": 2.9839977540707467, + "grad_norm": 0.5490446090698242, + "learning_rate": 8.667576537624289e-10, + "loss": 0.3287, + "step": 10629 + }, + { + "epoch": 2.9842784952274, + "grad_norm": 0.48584094643592834, + "learning_rate": 8.366127219616715e-10, + "loss": 0.3202, + "step": 10630 + }, + { + "epoch": 2.9845592363840536, + "grad_norm": 0.5821020007133484, + "learning_rate": 8.070012697902752e-10, + "loss": 0.354, + "step": 10631 + }, + { + "epoch": 2.9848399775407075, + "grad_norm": 0.5731359124183655, + "learning_rate": 7.779233004079345e-10, + "loss": 0.3505, + "step": 10632 + }, + { + "epoch": 2.985120718697361, + "grad_norm": 0.5254268050193787, + "learning_rate": 7.493788169171678e-10, + "loss": 0.2915, + "step": 10633 + }, + { + "epoch": 2.985401459854015, + "grad_norm": 0.5600931644439697, + "learning_rate": 7.213678223644272e-10, + "loss": 0.2926, + "step": 10634 + }, + { + "epoch": 2.9856822010106683, + "grad_norm": 0.4954153597354889, + "learning_rate": 6.938903197389879e-10, + "loss": 0.3117, + "step": 10635 + }, + { + "epoch": 2.9859629421673217, + "grad_norm": 0.578407347202301, + "learning_rate": 6.669463119729491e-10, + "loss": 0.3263, + "step": 10636 + }, + { + "epoch": 2.986243683323975, + "grad_norm": 0.6071785092353821, + "learning_rate": 6.405358019412333e-10, + "loss": 0.3116, + "step": 10637 + }, + { + "epoch": 2.9865244244806286, + "grad_norm": 0.5326305627822876, + "learning_rate": 6.146587924632519e-10, + "loss": 0.3328, + "step": 10638 + }, + { + "epoch": 2.9868051656372825, + "grad_norm": 0.5341914296150208, + "learning_rate": 5.893152862990192e-10, + "loss": 0.3398, + "step": 10639 + }, + { + "epoch": 2.987085906793936, + "grad_norm": 0.6067649722099304, + "learning_rate": 5.645052861541489e-10, + "loss": 0.3017, + "step": 10640 + }, + { + "epoch": 2.98736664795059, + "grad_norm": 0.567701518535614, + "learning_rate": 5.402287946759676e-10, + "loss": 0.2839, + "step": 10641 + }, + { + "epoch": 2.9876473891072433, + "grad_norm": 0.5179830193519592, + "learning_rate": 5.164858144546259e-10, + "loss": 0.3089, + "step": 10642 + }, + { + "epoch": 2.9879281302638967, + "grad_norm": 0.5601654648780823, + "learning_rate": 4.932763480247626e-10, + "loss": 0.3635, + "step": 10643 + }, + { + "epoch": 2.98820887142055, + "grad_norm": 0.5821573138237, + "learning_rate": 4.706003978621754e-10, + "loss": 0.3229, + "step": 10644 + }, + { + "epoch": 2.9884896125772036, + "grad_norm": 0.6123842000961304, + "learning_rate": 4.484579663871502e-10, + "loss": 0.2975, + "step": 10645 + }, + { + "epoch": 2.9887703537338575, + "grad_norm": 0.5240119099617004, + "learning_rate": 4.268490559622418e-10, + "loss": 0.361, + "step": 10646 + }, + { + "epoch": 2.989051094890511, + "grad_norm": 0.5306237936019897, + "learning_rate": 4.0577366889393844e-10, + "loss": 0.339, + "step": 10647 + }, + { + "epoch": 2.9893318360471643, + "grad_norm": 0.5565869212150574, + "learning_rate": 3.8523180743099686e-10, + "loss": 0.3589, + "step": 10648 + }, + { + "epoch": 2.9896125772038182, + "grad_norm": 0.5355308651924133, + "learning_rate": 3.6522347376555244e-10, + "loss": 0.3194, + "step": 10649 + }, + { + "epoch": 2.9898933183604717, + "grad_norm": 0.5698487758636475, + "learning_rate": 3.4574867003311916e-10, + "loss": 0.3167, + "step": 10650 + }, + { + "epoch": 2.990174059517125, + "grad_norm": 0.5874865055084229, + "learning_rate": 3.2680739831092436e-10, + "loss": 0.3066, + "step": 10651 + }, + { + "epoch": 2.9904548006737786, + "grad_norm": 0.5256145000457764, + "learning_rate": 3.083996606217943e-10, + "loss": 0.3004, + "step": 10652 + }, + { + "epoch": 2.9907355418304324, + "grad_norm": 0.584301769733429, + "learning_rate": 2.905254589286033e-10, + "loss": 0.3049, + "step": 10653 + }, + { + "epoch": 2.991016282987086, + "grad_norm": 0.5106728672981262, + "learning_rate": 2.7318479513926965e-10, + "loss": 0.3625, + "step": 10654 + }, + { + "epoch": 2.9912970241437393, + "grad_norm": 0.607122004032135, + "learning_rate": 2.5637767110509025e-10, + "loss": 0.3115, + "step": 10655 + }, + { + "epoch": 2.991577765300393, + "grad_norm": 0.5652472376823425, + "learning_rate": 2.401040886185202e-10, + "loss": 0.3221, + "step": 10656 + }, + { + "epoch": 2.9918585064570467, + "grad_norm": 0.5534455180168152, + "learning_rate": 2.2436404941650337e-10, + "loss": 0.2889, + "step": 10657 + }, + { + "epoch": 2.9921392476137, + "grad_norm": 0.47856977581977844, + "learning_rate": 2.091575551788072e-10, + "loss": 0.3409, + "step": 10658 + }, + { + "epoch": 2.9924199887703535, + "grad_norm": 0.561577320098877, + "learning_rate": 1.9448460752802267e-10, + "loss": 0.355, + "step": 10659 + }, + { + "epoch": 2.9927007299270074, + "grad_norm": 0.5409138202667236, + "learning_rate": 1.8034520803067444e-10, + "loss": 0.3475, + "step": 10660 + }, + { + "epoch": 2.992981471083661, + "grad_norm": 0.5205484628677368, + "learning_rate": 1.6673935819444541e-10, + "loss": 0.3434, + "step": 10661 + }, + { + "epoch": 2.9932622122403143, + "grad_norm": 0.5401335954666138, + "learning_rate": 1.536670594720624e-10, + "loss": 0.2729, + "step": 10662 + }, + { + "epoch": 2.993542953396968, + "grad_norm": 0.5215033888816833, + "learning_rate": 1.411283132585206e-10, + "loss": 0.3518, + "step": 10663 + }, + { + "epoch": 2.9938236945536216, + "grad_norm": 0.49377742409706116, + "learning_rate": 1.291231208916388e-10, + "loss": 0.2978, + "step": 10664 + }, + { + "epoch": 2.994104435710275, + "grad_norm": 0.5893886685371399, + "learning_rate": 1.1765148365261437e-10, + "loss": 0.2775, + "step": 10665 + }, + { + "epoch": 2.9943851768669285, + "grad_norm": 0.5212074518203735, + "learning_rate": 1.0671340276546816e-10, + "loss": 0.3573, + "step": 10666 + }, + { + "epoch": 2.9946659180235824, + "grad_norm": 0.5782735347747803, + "learning_rate": 9.63088793975997e-11, + "loss": 0.2998, + "step": 10667 + }, + { + "epoch": 2.994946659180236, + "grad_norm": 0.47576746344566345, + "learning_rate": 8.643791465978712e-11, + "loss": 0.3351, + "step": 10668 + }, + { + "epoch": 2.9952274003368893, + "grad_norm": 0.6338368654251099, + "learning_rate": 7.710050960452187e-11, + "loss": 0.3259, + "step": 10669 + }, + { + "epoch": 2.995508141493543, + "grad_norm": 0.6172406673431396, + "learning_rate": 6.829666522822909e-11, + "loss": 0.274, + "step": 10670 + }, + { + "epoch": 2.9957888826501966, + "grad_norm": 0.513821005821228, + "learning_rate": 6.002638247126769e-11, + "loss": 0.3604, + "step": 10671 + }, + { + "epoch": 2.99606962380685, + "grad_norm": 0.5321049094200134, + "learning_rate": 5.228966221570986e-11, + "loss": 0.3484, + "step": 10672 + }, + { + "epoch": 2.9963503649635035, + "grad_norm": 0.5471032857894897, + "learning_rate": 4.508650528700642e-11, + "loss": 0.3083, + "step": 10673 + }, + { + "epoch": 2.9966311061201574, + "grad_norm": 0.5512019395828247, + "learning_rate": 3.841691245398682e-11, + "loss": 0.3014, + "step": 10674 + }, + { + "epoch": 2.996911847276811, + "grad_norm": 0.5173888206481934, + "learning_rate": 3.228088442830402e-11, + "loss": 0.3355, + "step": 10675 + }, + { + "epoch": 2.9971925884334643, + "grad_norm": 0.5431347489356995, + "learning_rate": 2.667842186498959e-11, + "loss": 0.3199, + "step": 10676 + }, + { + "epoch": 2.997473329590118, + "grad_norm": 0.5168270468711853, + "learning_rate": 2.1609525361898643e-11, + "loss": 0.3701, + "step": 10677 + }, + { + "epoch": 2.9977540707467716, + "grad_norm": 0.5044336318969727, + "learning_rate": 1.7074195459709784e-11, + "loss": 0.321, + "step": 10678 + }, + { + "epoch": 2.998034811903425, + "grad_norm": 0.5126956105232239, + "learning_rate": 1.307243264248026e-11, + "loss": 0.3203, + "step": 10679 + }, + { + "epoch": 2.9983155530600785, + "grad_norm": 0.5498796105384827, + "learning_rate": 9.604237337645928e-12, + "loss": 0.3084, + "step": 10680 + }, + { + "epoch": 2.998596294216732, + "grad_norm": 0.5440199971199036, + "learning_rate": 6.669609914911057e-12, + "loss": 0.3357, + "step": 10681 + }, + { + "epoch": 2.998877035373386, + "grad_norm": 0.49261048436164856, + "learning_rate": 4.26855068680343e-12, + "loss": 0.3433, + "step": 10682 + }, + { + "epoch": 2.9991577765300392, + "grad_norm": 0.5796825289726257, + "learning_rate": 2.4010599108947876e-12, + "loss": 0.3425, + "step": 10683 + }, + { + "epoch": 2.999438517686693, + "grad_norm": 0.5342565178871155, + "learning_rate": 1.0671377853599395e-12, + "loss": 0.319, + "step": 10684 + }, + { + "epoch": 2.9997192588433466, + "grad_norm": 0.6024678349494934, + "learning_rate": 2.667844534176567e-13, + "loss": 0.3298, + "step": 10685 + }, + { + "epoch": 3.0, + "grad_norm": 0.4760485291481018, + "learning_rate": 0.0, + "loss": 0.3339, + "step": 10686 + }, + { + "epoch": 3.0, + "step": 10686, + "total_flos": 4620716816613376.0, + "train_loss": 0.39878227835304614, + "train_runtime": 132014.4933, + "train_samples_per_second": 2.59, + "train_steps_per_second": 0.081 + } + ], + "logging_steps": 1.0, + "max_steps": 10686, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4620716816613376.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}