diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,56138 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.3503212552996197, + "eval_steps": 500, + "global_step": 8015, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 4.3708204029896414e-05, + "grad_norm": 23.75, + "learning_rate": 1e-05, + "loss": 5.1721, + "step": 1 + }, + { + "epoch": 8.741640805979283e-05, + "grad_norm": 25.625, + "learning_rate": 2e-05, + "loss": 5.2813, + "step": 2 + }, + { + "epoch": 0.00013112461208968922, + "grad_norm": 26.75, + "learning_rate": 3e-05, + "loss": 5.1543, + "step": 3 + }, + { + "epoch": 0.00017483281611958566, + "grad_norm": 20.0, + "learning_rate": 4e-05, + "loss": 5.2777, + "step": 4 + }, + { + "epoch": 0.00021854102014948207, + "grad_norm": 15.3125, + "learning_rate": 5e-05, + "loss": 4.738, + "step": 5 + }, + { + "epoch": 0.00026224922417937845, + "grad_norm": 15.875, + "learning_rate": 6e-05, + "loss": 4.0695, + "step": 6 + }, + { + "epoch": 0.0003059574282092749, + "grad_norm": 13.625, + "learning_rate": 7e-05, + "loss": 4.3845, + "step": 7 + }, + { + "epoch": 0.0003496656322391713, + "grad_norm": 12.3125, + "learning_rate": 8e-05, + "loss": 4.2192, + "step": 8 + }, + { + "epoch": 0.0003933738362690677, + "grad_norm": 8.5625, + "learning_rate": 9e-05, + "loss": 3.635, + "step": 9 + }, + { + "epoch": 0.00043708204029896413, + "grad_norm": 8.875, + "learning_rate": 0.0001, + "loss": 3.798, + "step": 10 + }, + { + "epoch": 0.0004807902443288605, + "grad_norm": 8.25, + "learning_rate": 9.999999952821362e-05, + "loss": 3.4472, + "step": 11 + }, + { + "epoch": 0.0005244984483587569, + "grad_norm": 9.4375, + "learning_rate": 9.999999811285448e-05, + "loss": 3.7404, + "step": 12 + }, + { + "epoch": 0.0005682066523886533, + "grad_norm": 8.1875, + "learning_rate": 9.999999575392258e-05, + "loss": 3.5688, + "step": 13 + }, + { + "epoch": 0.0006119148564185498, + "grad_norm": 6.46875, + "learning_rate": 9.9999992451418e-05, + "loss": 3.6849, + "step": 14 + }, + { + "epoch": 0.0006556230604484462, + "grad_norm": 7.0, + "learning_rate": 9.99999882053408e-05, + "loss": 3.6462, + "step": 15 + }, + { + "epoch": 0.0006993312644783426, + "grad_norm": 5.40625, + "learning_rate": 9.999998301569104e-05, + "loss": 3.1855, + "step": 16 + }, + { + "epoch": 0.000743039468508239, + "grad_norm": 5.75, + "learning_rate": 9.999997688246885e-05, + "loss": 3.388, + "step": 17 + }, + { + "epoch": 0.0007867476725381354, + "grad_norm": 5.5625, + "learning_rate": 9.999996980567431e-05, + "loss": 3.2684, + "step": 18 + }, + { + "epoch": 0.0008304558765680318, + "grad_norm": 5.6875, + "learning_rate": 9.999996178530757e-05, + "loss": 2.8991, + "step": 19 + }, + { + "epoch": 0.0008741640805979283, + "grad_norm": 6.3125, + "learning_rate": 9.999995282136878e-05, + "loss": 3.6678, + "step": 20 + }, + { + "epoch": 0.0009178722846278247, + "grad_norm": 6.4375, + "learning_rate": 9.99999429138581e-05, + "loss": 3.3619, + "step": 21 + }, + { + "epoch": 0.000961580488657721, + "grad_norm": 7.34375, + "learning_rate": 9.999993206277573e-05, + "loss": 3.2241, + "step": 22 + }, + { + "epoch": 0.0010052886926876176, + "grad_norm": 38.0, + "learning_rate": 9.999992026812187e-05, + "loss": 3.4659, + "step": 23 + }, + { + "epoch": 0.0010489968967175138, + "grad_norm": 7.03125, + "learning_rate": 9.999990752989675e-05, + "loss": 3.2605, + "step": 24 + }, + { + "epoch": 0.0010927051007474102, + "grad_norm": 14.625, + "learning_rate": 9.99998938481006e-05, + "loss": 3.3462, + "step": 25 + }, + { + "epoch": 0.0011364133047773067, + "grad_norm": 6.03125, + "learning_rate": 9.999987922273368e-05, + "loss": 3.2599, + "step": 26 + }, + { + "epoch": 0.001180121508807203, + "grad_norm": 6.4375, + "learning_rate": 9.999986365379628e-05, + "loss": 3.0645, + "step": 27 + }, + { + "epoch": 0.0012238297128370995, + "grad_norm": 5.75, + "learning_rate": 9.999984714128867e-05, + "loss": 3.1231, + "step": 28 + }, + { + "epoch": 0.001267537916866996, + "grad_norm": 5.4375, + "learning_rate": 9.999982968521116e-05, + "loss": 2.9634, + "step": 29 + }, + { + "epoch": 0.0013112461208968924, + "grad_norm": 4.8125, + "learning_rate": 9.999981128556411e-05, + "loss": 2.902, + "step": 30 + }, + { + "epoch": 0.0013549543249267888, + "grad_norm": 21.0, + "learning_rate": 9.999979194234786e-05, + "loss": 3.1391, + "step": 31 + }, + { + "epoch": 0.0013986625289566853, + "grad_norm": 5.40625, + "learning_rate": 9.999977165556273e-05, + "loss": 2.9417, + "step": 32 + }, + { + "epoch": 0.0014423707329865815, + "grad_norm": 6.5, + "learning_rate": 9.999975042520917e-05, + "loss": 2.8276, + "step": 33 + }, + { + "epoch": 0.001486078937016478, + "grad_norm": 6.4375, + "learning_rate": 9.999972825128754e-05, + "loss": 3.7324, + "step": 34 + }, + { + "epoch": 0.0015297871410463744, + "grad_norm": 6.28125, + "learning_rate": 9.999970513379826e-05, + "loss": 3.2154, + "step": 35 + }, + { + "epoch": 0.0015734953450762708, + "grad_norm": 5.8125, + "learning_rate": 9.999968107274177e-05, + "loss": 3.2734, + "step": 36 + }, + { + "epoch": 0.0016172035491061672, + "grad_norm": 4.65625, + "learning_rate": 9.999965606811854e-05, + "loss": 2.8065, + "step": 37 + }, + { + "epoch": 0.0016609117531360637, + "grad_norm": 5.59375, + "learning_rate": 9.999963011992902e-05, + "loss": 2.8479, + "step": 38 + }, + { + "epoch": 0.00170461995716596, + "grad_norm": 12.875, + "learning_rate": 9.99996032281737e-05, + "loss": 3.0187, + "step": 39 + }, + { + "epoch": 0.0017483281611958565, + "grad_norm": 5.40625, + "learning_rate": 9.999957539285312e-05, + "loss": 3.0339, + "step": 40 + }, + { + "epoch": 0.001792036365225753, + "grad_norm": 8.4375, + "learning_rate": 9.999954661396777e-05, + "loss": 3.1146, + "step": 41 + }, + { + "epoch": 0.0018357445692556494, + "grad_norm": 8.875, + "learning_rate": 9.99995168915182e-05, + "loss": 2.7607, + "step": 42 + }, + { + "epoch": 0.0018794527732855456, + "grad_norm": 5.1875, + "learning_rate": 9.999948622550497e-05, + "loss": 3.1231, + "step": 43 + }, + { + "epoch": 0.001923160977315442, + "grad_norm": 4.65625, + "learning_rate": 9.999945461592867e-05, + "loss": 3.0816, + "step": 44 + }, + { + "epoch": 0.0019668691813453385, + "grad_norm": 5.125, + "learning_rate": 9.99994220627899e-05, + "loss": 3.6489, + "step": 45 + }, + { + "epoch": 0.002010577385375235, + "grad_norm": 4.53125, + "learning_rate": 9.999938856608926e-05, + "loss": 3.0423, + "step": 46 + }, + { + "epoch": 0.0020542855894051314, + "grad_norm": 5.28125, + "learning_rate": 9.999935412582738e-05, + "loss": 3.1826, + "step": 47 + }, + { + "epoch": 0.0020979937934350276, + "grad_norm": 5.0, + "learning_rate": 9.999931874200492e-05, + "loss": 3.0762, + "step": 48 + }, + { + "epoch": 0.0021417019974649242, + "grad_norm": 4.46875, + "learning_rate": 9.999928241462255e-05, + "loss": 3.4191, + "step": 49 + }, + { + "epoch": 0.0021854102014948204, + "grad_norm": 6.90625, + "learning_rate": 9.999924514368095e-05, + "loss": 3.2524, + "step": 50 + }, + { + "epoch": 0.002229118405524717, + "grad_norm": 4.0625, + "learning_rate": 9.999920692918082e-05, + "loss": 2.9654, + "step": 51 + }, + { + "epoch": 0.0022728266095546133, + "grad_norm": 5.75, + "learning_rate": 9.999916777112288e-05, + "loss": 3.1085, + "step": 52 + }, + { + "epoch": 0.00231653481358451, + "grad_norm": 4.75, + "learning_rate": 9.999912766950789e-05, + "loss": 3.0036, + "step": 53 + }, + { + "epoch": 0.002360243017614406, + "grad_norm": 4.96875, + "learning_rate": 9.999908662433657e-05, + "loss": 3.0041, + "step": 54 + }, + { + "epoch": 0.002403951221644303, + "grad_norm": 5.53125, + "learning_rate": 9.999904463560975e-05, + "loss": 2.6111, + "step": 55 + }, + { + "epoch": 0.002447659425674199, + "grad_norm": 4.28125, + "learning_rate": 9.999900170332814e-05, + "loss": 2.7588, + "step": 56 + }, + { + "epoch": 0.0024913676297040953, + "grad_norm": 4.53125, + "learning_rate": 9.999895782749263e-05, + "loss": 3.0982, + "step": 57 + }, + { + "epoch": 0.002535075833733992, + "grad_norm": 7.5625, + "learning_rate": 9.9998913008104e-05, + "loss": 3.2029, + "step": 58 + }, + { + "epoch": 0.002578784037763888, + "grad_norm": 6.125, + "learning_rate": 9.999886724516312e-05, + "loss": 2.601, + "step": 59 + }, + { + "epoch": 0.002622492241793785, + "grad_norm": 5.03125, + "learning_rate": 9.999882053867085e-05, + "loss": 2.5674, + "step": 60 + }, + { + "epoch": 0.002666200445823681, + "grad_norm": 5.15625, + "learning_rate": 9.999877288862806e-05, + "loss": 2.8234, + "step": 61 + }, + { + "epoch": 0.0027099086498535777, + "grad_norm": 5.34375, + "learning_rate": 9.999872429503565e-05, + "loss": 2.9025, + "step": 62 + }, + { + "epoch": 0.002753616853883474, + "grad_norm": 4.625, + "learning_rate": 9.999867475789455e-05, + "loss": 2.7504, + "step": 63 + }, + { + "epoch": 0.0027973250579133705, + "grad_norm": 10.125, + "learning_rate": 9.999862427720568e-05, + "loss": 3.6081, + "step": 64 + }, + { + "epoch": 0.0028410332619432667, + "grad_norm": 4.9375, + "learning_rate": 9.999857285297e-05, + "loss": 2.7429, + "step": 65 + }, + { + "epoch": 0.002884741465973163, + "grad_norm": 8.3125, + "learning_rate": 9.999852048518849e-05, + "loss": 2.8513, + "step": 66 + }, + { + "epoch": 0.0029284496700030596, + "grad_norm": 4.84375, + "learning_rate": 9.999846717386214e-05, + "loss": 2.7622, + "step": 67 + }, + { + "epoch": 0.002972157874032956, + "grad_norm": 6.90625, + "learning_rate": 9.999841291899193e-05, + "loss": 3.102, + "step": 68 + }, + { + "epoch": 0.0030158660780628525, + "grad_norm": 4.3125, + "learning_rate": 9.999835772057891e-05, + "loss": 2.6154, + "step": 69 + }, + { + "epoch": 0.0030595742820927487, + "grad_norm": 5.9375, + "learning_rate": 9.99983015786241e-05, + "loss": 2.9899, + "step": 70 + }, + { + "epoch": 0.0031032824861226454, + "grad_norm": 4.59375, + "learning_rate": 9.999824449312856e-05, + "loss": 2.8435, + "step": 71 + }, + { + "epoch": 0.0031469906901525416, + "grad_norm": 13.375, + "learning_rate": 9.999818646409339e-05, + "loss": 4.1482, + "step": 72 + }, + { + "epoch": 0.0031906988941824382, + "grad_norm": 4.40625, + "learning_rate": 9.999812749151966e-05, + "loss": 2.5349, + "step": 73 + }, + { + "epoch": 0.0032344070982123344, + "grad_norm": 4.15625, + "learning_rate": 9.999806757540851e-05, + "loss": 2.5921, + "step": 74 + }, + { + "epoch": 0.0032781153022422307, + "grad_norm": 4.46875, + "learning_rate": 9.999800671576106e-05, + "loss": 2.6727, + "step": 75 + }, + { + "epoch": 0.0033218235062721273, + "grad_norm": 4.0625, + "learning_rate": 9.999794491257845e-05, + "loss": 2.7172, + "step": 76 + }, + { + "epoch": 0.0033655317103020235, + "grad_norm": 5.34375, + "learning_rate": 9.999788216586186e-05, + "loss": 3.1049, + "step": 77 + }, + { + "epoch": 0.00340923991433192, + "grad_norm": 5.1875, + "learning_rate": 9.999781847561245e-05, + "loss": 2.8463, + "step": 78 + }, + { + "epoch": 0.0034529481183618164, + "grad_norm": 5.34375, + "learning_rate": 9.999775384183143e-05, + "loss": 2.6217, + "step": 79 + }, + { + "epoch": 0.003496656322391713, + "grad_norm": 4.34375, + "learning_rate": 9.999768826452004e-05, + "loss": 2.9236, + "step": 80 + }, + { + "epoch": 0.0035403645264216093, + "grad_norm": 3.875, + "learning_rate": 9.99976217436795e-05, + "loss": 2.4763, + "step": 81 + }, + { + "epoch": 0.003584072730451506, + "grad_norm": 6.46875, + "learning_rate": 9.999755427931107e-05, + "loss": 2.5611, + "step": 82 + }, + { + "epoch": 0.003627780934481402, + "grad_norm": 6.84375, + "learning_rate": 9.999748587141602e-05, + "loss": 2.4349, + "step": 83 + }, + { + "epoch": 0.003671489138511299, + "grad_norm": 4.78125, + "learning_rate": 9.999741651999566e-05, + "loss": 2.5408, + "step": 84 + }, + { + "epoch": 0.003715197342541195, + "grad_norm": 4.875, + "learning_rate": 9.999734622505126e-05, + "loss": 2.9666, + "step": 85 + }, + { + "epoch": 0.0037589055465710912, + "grad_norm": 5.03125, + "learning_rate": 9.999727498658417e-05, + "loss": 2.679, + "step": 86 + }, + { + "epoch": 0.003802613750600988, + "grad_norm": 3.796875, + "learning_rate": 9.999720280459576e-05, + "loss": 2.6728, + "step": 87 + }, + { + "epoch": 0.003846321954630884, + "grad_norm": 4.71875, + "learning_rate": 9.999712967908735e-05, + "loss": 3.1564, + "step": 88 + }, + { + "epoch": 0.0038900301586607807, + "grad_norm": 6.40625, + "learning_rate": 9.999705561006034e-05, + "loss": 2.4467, + "step": 89 + }, + { + "epoch": 0.003933738362690677, + "grad_norm": 5.28125, + "learning_rate": 9.999698059751609e-05, + "loss": 2.8369, + "step": 90 + }, + { + "epoch": 0.003977446566720573, + "grad_norm": 4.59375, + "learning_rate": 9.999690464145609e-05, + "loss": 2.6195, + "step": 91 + }, + { + "epoch": 0.00402115477075047, + "grad_norm": 5.34375, + "learning_rate": 9.999682774188173e-05, + "loss": 2.7844, + "step": 92 + }, + { + "epoch": 0.0040648629747803665, + "grad_norm": 4.375, + "learning_rate": 9.999674989879444e-05, + "loss": 2.6323, + "step": 93 + }, + { + "epoch": 0.004108571178810263, + "grad_norm": 4.125, + "learning_rate": 9.999667111219573e-05, + "loss": 2.692, + "step": 94 + }, + { + "epoch": 0.004152279382840159, + "grad_norm": 4.96875, + "learning_rate": 9.999659138208705e-05, + "loss": 2.4742, + "step": 95 + }, + { + "epoch": 0.004195987586870055, + "grad_norm": 4.53125, + "learning_rate": 9.999651070846995e-05, + "loss": 2.5278, + "step": 96 + }, + { + "epoch": 0.004239695790899952, + "grad_norm": 4.3125, + "learning_rate": 9.999642909134592e-05, + "loss": 2.8639, + "step": 97 + }, + { + "epoch": 0.0042834039949298484, + "grad_norm": 5.75, + "learning_rate": 9.99963465307165e-05, + "loss": 2.6517, + "step": 98 + }, + { + "epoch": 0.004327112198959745, + "grad_norm": 5.6875, + "learning_rate": 9.999626302658324e-05, + "loss": 2.7603, + "step": 99 + }, + { + "epoch": 0.004370820402989641, + "grad_norm": 4.375, + "learning_rate": 9.999617857894777e-05, + "loss": 2.3816, + "step": 100 + }, + { + "epoch": 0.004414528607019538, + "grad_norm": 3.796875, + "learning_rate": 9.99960931878116e-05, + "loss": 2.6044, + "step": 101 + }, + { + "epoch": 0.004458236811049434, + "grad_norm": 3.703125, + "learning_rate": 9.999600685317642e-05, + "loss": 2.3602, + "step": 102 + }, + { + "epoch": 0.00450194501507933, + "grad_norm": 9.0, + "learning_rate": 9.99959195750438e-05, + "loss": 2.4109, + "step": 103 + }, + { + "epoch": 0.004545653219109227, + "grad_norm": 6.0625, + "learning_rate": 9.999583135341544e-05, + "loss": 2.9449, + "step": 104 + }, + { + "epoch": 0.004589361423139123, + "grad_norm": 5.96875, + "learning_rate": 9.999574218829295e-05, + "loss": 2.4216, + "step": 105 + }, + { + "epoch": 0.00463306962716902, + "grad_norm": 5.46875, + "learning_rate": 9.999565207967805e-05, + "loss": 2.8841, + "step": 106 + }, + { + "epoch": 0.004676777831198916, + "grad_norm": 3.875, + "learning_rate": 9.999556102757244e-05, + "loss": 2.6692, + "step": 107 + }, + { + "epoch": 0.004720486035228812, + "grad_norm": 5.90625, + "learning_rate": 9.99954690319778e-05, + "loss": 2.6152, + "step": 108 + }, + { + "epoch": 0.004764194239258709, + "grad_norm": 4.46875, + "learning_rate": 9.999537609289592e-05, + "loss": 2.6917, + "step": 109 + }, + { + "epoch": 0.004807902443288606, + "grad_norm": 4.03125, + "learning_rate": 9.999528221032852e-05, + "loss": 2.2566, + "step": 110 + }, + { + "epoch": 0.004851610647318502, + "grad_norm": 4.84375, + "learning_rate": 9.999518738427737e-05, + "loss": 2.2956, + "step": 111 + }, + { + "epoch": 0.004895318851348398, + "grad_norm": 6.625, + "learning_rate": 9.99950916147443e-05, + "loss": 2.4404, + "step": 112 + }, + { + "epoch": 0.004939027055378294, + "grad_norm": 5.0, + "learning_rate": 9.999499490173106e-05, + "loss": 2.5939, + "step": 113 + }, + { + "epoch": 0.0049827352594081905, + "grad_norm": 5.0625, + "learning_rate": 9.999489724523951e-05, + "loss": 3.225, + "step": 114 + }, + { + "epoch": 0.005026443463438088, + "grad_norm": 4.59375, + "learning_rate": 9.999479864527148e-05, + "loss": 2.8029, + "step": 115 + }, + { + "epoch": 0.005070151667467984, + "grad_norm": 3.828125, + "learning_rate": 9.999469910182885e-05, + "loss": 2.5389, + "step": 116 + }, + { + "epoch": 0.00511385987149788, + "grad_norm": 4.40625, + "learning_rate": 9.999459861491348e-05, + "loss": 2.9194, + "step": 117 + }, + { + "epoch": 0.005157568075527776, + "grad_norm": 4.65625, + "learning_rate": 9.999449718452725e-05, + "loss": 2.623, + "step": 118 + }, + { + "epoch": 0.005201276279557673, + "grad_norm": 6.46875, + "learning_rate": 9.999439481067212e-05, + "loss": 3.8513, + "step": 119 + }, + { + "epoch": 0.00524498448358757, + "grad_norm": 3.90625, + "learning_rate": 9.999429149334998e-05, + "loss": 2.4835, + "step": 120 + }, + { + "epoch": 0.005288692687617466, + "grad_norm": 4.71875, + "learning_rate": 9.999418723256279e-05, + "loss": 2.4446, + "step": 121 + }, + { + "epoch": 0.005332400891647362, + "grad_norm": 3.921875, + "learning_rate": 9.999408202831255e-05, + "loss": 3.1651, + "step": 122 + }, + { + "epoch": 0.005376109095677258, + "grad_norm": 3.625, + "learning_rate": 9.99939758806012e-05, + "loss": 2.908, + "step": 123 + }, + { + "epoch": 0.005419817299707155, + "grad_norm": 3.953125, + "learning_rate": 9.999386878943077e-05, + "loss": 2.483, + "step": 124 + }, + { + "epoch": 0.0054635255037370515, + "grad_norm": 4.09375, + "learning_rate": 9.999376075480327e-05, + "loss": 2.8223, + "step": 125 + }, + { + "epoch": 0.005507233707766948, + "grad_norm": 4.28125, + "learning_rate": 9.999365177672075e-05, + "loss": 2.6433, + "step": 126 + }, + { + "epoch": 0.005550941911796844, + "grad_norm": 4.53125, + "learning_rate": 9.999354185518525e-05, + "loss": 3.3299, + "step": 127 + }, + { + "epoch": 0.005594650115826741, + "grad_norm": 4.3125, + "learning_rate": 9.999343099019884e-05, + "loss": 2.6354, + "step": 128 + }, + { + "epoch": 0.005638358319856637, + "grad_norm": 4.0625, + "learning_rate": 9.999331918176365e-05, + "loss": 2.099, + "step": 129 + }, + { + "epoch": 0.0056820665238865335, + "grad_norm": 4.15625, + "learning_rate": 9.999320642988175e-05, + "loss": 2.7426, + "step": 130 + }, + { + "epoch": 0.00572577472791643, + "grad_norm": 4.84375, + "learning_rate": 9.999309273455528e-05, + "loss": 3.2764, + "step": 131 + }, + { + "epoch": 0.005769482931946326, + "grad_norm": 3.78125, + "learning_rate": 9.99929780957864e-05, + "loss": 2.2569, + "step": 132 + }, + { + "epoch": 0.005813191135976223, + "grad_norm": 4.28125, + "learning_rate": 9.999286251357727e-05, + "loss": 2.6164, + "step": 133 + }, + { + "epoch": 0.005856899340006119, + "grad_norm": 4.21875, + "learning_rate": 9.999274598793005e-05, + "loss": 2.2387, + "step": 134 + }, + { + "epoch": 0.0059006075440360154, + "grad_norm": 14.125, + "learning_rate": 9.999262851884695e-05, + "loss": 2.7666, + "step": 135 + }, + { + "epoch": 0.005944315748065912, + "grad_norm": 4.09375, + "learning_rate": 9.99925101063302e-05, + "loss": 2.6582, + "step": 136 + }, + { + "epoch": 0.005988023952095809, + "grad_norm": 4.53125, + "learning_rate": 9.9992390750382e-05, + "loss": 2.0556, + "step": 137 + }, + { + "epoch": 0.006031732156125705, + "grad_norm": 4.4375, + "learning_rate": 9.999227045100465e-05, + "loss": 2.7892, + "step": 138 + }, + { + "epoch": 0.006075440360155601, + "grad_norm": 4.8125, + "learning_rate": 9.999214920820039e-05, + "loss": 2.4652, + "step": 139 + }, + { + "epoch": 0.006119148564185497, + "grad_norm": 5.25, + "learning_rate": 9.999202702197151e-05, + "loss": 3.0553, + "step": 140 + }, + { + "epoch": 0.006162856768215394, + "grad_norm": 4.53125, + "learning_rate": 9.999190389232032e-05, + "loss": 2.8345, + "step": 141 + }, + { + "epoch": 0.006206564972245291, + "grad_norm": 9.8125, + "learning_rate": 9.999177981924916e-05, + "loss": 2.97, + "step": 142 + }, + { + "epoch": 0.006250273176275187, + "grad_norm": 6.0625, + "learning_rate": 9.999165480276034e-05, + "loss": 2.718, + "step": 143 + }, + { + "epoch": 0.006293981380305083, + "grad_norm": 5.28125, + "learning_rate": 9.999152884285622e-05, + "loss": 2.0698, + "step": 144 + }, + { + "epoch": 0.006337689584334979, + "grad_norm": 4.875, + "learning_rate": 9.999140193953921e-05, + "loss": 2.4787, + "step": 145 + }, + { + "epoch": 0.0063813977883648764, + "grad_norm": 4.40625, + "learning_rate": 9.99912740928117e-05, + "loss": 2.3143, + "step": 146 + }, + { + "epoch": 0.006425105992394773, + "grad_norm": 10.4375, + "learning_rate": 9.999114530267607e-05, + "loss": 2.3755, + "step": 147 + }, + { + "epoch": 0.006468814196424669, + "grad_norm": 4.46875, + "learning_rate": 9.999101556913477e-05, + "loss": 2.5425, + "step": 148 + }, + { + "epoch": 0.006512522400454565, + "grad_norm": 5.3125, + "learning_rate": 9.999088489219027e-05, + "loss": 2.6765, + "step": 149 + }, + { + "epoch": 0.006556230604484461, + "grad_norm": 4.28125, + "learning_rate": 9.999075327184499e-05, + "loss": 2.6608, + "step": 150 + }, + { + "epoch": 0.006599938808514358, + "grad_norm": 4.28125, + "learning_rate": 9.999062070810144e-05, + "loss": 2.8477, + "step": 151 + }, + { + "epoch": 0.006643647012544255, + "grad_norm": 5.375, + "learning_rate": 9.999048720096212e-05, + "loss": 2.8137, + "step": 152 + }, + { + "epoch": 0.006687355216574151, + "grad_norm": 4.375, + "learning_rate": 9.999035275042954e-05, + "loss": 2.6685, + "step": 153 + }, + { + "epoch": 0.006731063420604047, + "grad_norm": 7.09375, + "learning_rate": 9.999021735650627e-05, + "loss": 3.1262, + "step": 154 + }, + { + "epoch": 0.006774771624633944, + "grad_norm": 4.53125, + "learning_rate": 9.999008101919482e-05, + "loss": 2.5198, + "step": 155 + }, + { + "epoch": 0.00681847982866384, + "grad_norm": 6.625, + "learning_rate": 9.99899437384978e-05, + "loss": 3.0499, + "step": 156 + }, + { + "epoch": 0.006862188032693737, + "grad_norm": 4.90625, + "learning_rate": 9.998980551441776e-05, + "loss": 2.6218, + "step": 157 + }, + { + "epoch": 0.006905896236723633, + "grad_norm": 4.15625, + "learning_rate": 9.998966634695737e-05, + "loss": 2.47, + "step": 158 + }, + { + "epoch": 0.006949604440753529, + "grad_norm": 4.59375, + "learning_rate": 9.99895262361192e-05, + "loss": 2.1887, + "step": 159 + }, + { + "epoch": 0.006993312644783426, + "grad_norm": 4.25, + "learning_rate": 9.998938518190591e-05, + "loss": 3.6236, + "step": 160 + }, + { + "epoch": 0.007037020848813322, + "grad_norm": 3.546875, + "learning_rate": 9.998924318432016e-05, + "loss": 2.6418, + "step": 161 + }, + { + "epoch": 0.0070807290528432185, + "grad_norm": 4.0, + "learning_rate": 9.998910024336466e-05, + "loss": 2.4386, + "step": 162 + }, + { + "epoch": 0.007124437256873115, + "grad_norm": 5.5625, + "learning_rate": 9.998895635904205e-05, + "loss": 2.7331, + "step": 163 + }, + { + "epoch": 0.007168145460903012, + "grad_norm": 4.46875, + "learning_rate": 9.99888115313551e-05, + "loss": 2.7163, + "step": 164 + }, + { + "epoch": 0.007211853664932908, + "grad_norm": 3.484375, + "learning_rate": 9.998866576030651e-05, + "loss": 2.123, + "step": 165 + }, + { + "epoch": 0.007255561868962804, + "grad_norm": 5.375, + "learning_rate": 9.998851904589905e-05, + "loss": 2.7204, + "step": 166 + }, + { + "epoch": 0.0072992700729927005, + "grad_norm": 4.21875, + "learning_rate": 9.998837138813549e-05, + "loss": 2.6734, + "step": 167 + }, + { + "epoch": 0.007342978277022598, + "grad_norm": 4.65625, + "learning_rate": 9.998822278701858e-05, + "loss": 2.7616, + "step": 168 + }, + { + "epoch": 0.007386686481052494, + "grad_norm": 4.34375, + "learning_rate": 9.998807324255118e-05, + "loss": 2.556, + "step": 169 + }, + { + "epoch": 0.00743039468508239, + "grad_norm": 5.34375, + "learning_rate": 9.998792275473607e-05, + "loss": 2.5006, + "step": 170 + }, + { + "epoch": 0.007474102889112286, + "grad_norm": 4.09375, + "learning_rate": 9.99877713235761e-05, + "loss": 2.8722, + "step": 171 + }, + { + "epoch": 0.0075178110931421824, + "grad_norm": 5.03125, + "learning_rate": 9.998761894907414e-05, + "loss": 3.0658, + "step": 172 + }, + { + "epoch": 0.0075615192971720795, + "grad_norm": 4.4375, + "learning_rate": 9.998746563123305e-05, + "loss": 2.8287, + "step": 173 + }, + { + "epoch": 0.007605227501201976, + "grad_norm": 4.9375, + "learning_rate": 9.998731137005572e-05, + "loss": 2.5613, + "step": 174 + }, + { + "epoch": 0.007648935705231872, + "grad_norm": 4.09375, + "learning_rate": 9.99871561655451e-05, + "loss": 2.4192, + "step": 175 + }, + { + "epoch": 0.007692643909261768, + "grad_norm": 9.875, + "learning_rate": 9.998700001770406e-05, + "loss": 3.3212, + "step": 176 + }, + { + "epoch": 0.007736352113291665, + "grad_norm": 5.3125, + "learning_rate": 9.998684292653559e-05, + "loss": 2.8366, + "step": 177 + }, + { + "epoch": 0.0077800603173215615, + "grad_norm": 4.75, + "learning_rate": 9.998668489204266e-05, + "loss": 2.4589, + "step": 178 + }, + { + "epoch": 0.007823768521351457, + "grad_norm": 3.859375, + "learning_rate": 9.998652591422821e-05, + "loss": 2.6667, + "step": 179 + }, + { + "epoch": 0.007867476725381354, + "grad_norm": 4.53125, + "learning_rate": 9.998636599309527e-05, + "loss": 2.7617, + "step": 180 + }, + { + "epoch": 0.007911184929411251, + "grad_norm": 4.1875, + "learning_rate": 9.998620512864686e-05, + "loss": 2.5969, + "step": 181 + }, + { + "epoch": 0.007954893133441146, + "grad_norm": 4.59375, + "learning_rate": 9.9986043320886e-05, + "loss": 2.214, + "step": 182 + }, + { + "epoch": 0.007998601337471043, + "grad_norm": 4.28125, + "learning_rate": 9.998588056981575e-05, + "loss": 2.6208, + "step": 183 + }, + { + "epoch": 0.00804230954150094, + "grad_norm": 4.375, + "learning_rate": 9.998571687543918e-05, + "loss": 2.8798, + "step": 184 + }, + { + "epoch": 0.008086017745530836, + "grad_norm": 4.3125, + "learning_rate": 9.99855522377594e-05, + "loss": 2.4923, + "step": 185 + }, + { + "epoch": 0.008129725949560733, + "grad_norm": 3.703125, + "learning_rate": 9.998538665677948e-05, + "loss": 2.055, + "step": 186 + }, + { + "epoch": 0.008173434153590628, + "grad_norm": 4.78125, + "learning_rate": 9.998522013250257e-05, + "loss": 2.6016, + "step": 187 + }, + { + "epoch": 0.008217142357620525, + "grad_norm": 4.09375, + "learning_rate": 9.998505266493181e-05, + "loss": 2.4205, + "step": 188 + }, + { + "epoch": 0.008260850561650422, + "grad_norm": 5.25, + "learning_rate": 9.998488425407035e-05, + "loss": 3.2423, + "step": 189 + }, + { + "epoch": 0.008304558765680318, + "grad_norm": 3.703125, + "learning_rate": 9.998471489992138e-05, + "loss": 2.3305, + "step": 190 + }, + { + "epoch": 0.008348266969710215, + "grad_norm": 3.40625, + "learning_rate": 9.998454460248809e-05, + "loss": 2.736, + "step": 191 + }, + { + "epoch": 0.00839197517374011, + "grad_norm": 4.46875, + "learning_rate": 9.998437336177369e-05, + "loss": 2.8134, + "step": 192 + }, + { + "epoch": 0.008435683377770007, + "grad_norm": 5.03125, + "learning_rate": 9.998420117778141e-05, + "loss": 2.3994, + "step": 193 + }, + { + "epoch": 0.008479391581799904, + "grad_norm": 5.6875, + "learning_rate": 9.998402805051452e-05, + "loss": 2.5559, + "step": 194 + }, + { + "epoch": 0.0085230997858298, + "grad_norm": 3.4375, + "learning_rate": 9.998385397997625e-05, + "loss": 2.3871, + "step": 195 + }, + { + "epoch": 0.008566807989859697, + "grad_norm": 4.28125, + "learning_rate": 9.998367896616993e-05, + "loss": 2.1715, + "step": 196 + }, + { + "epoch": 0.008610516193889592, + "grad_norm": 6.15625, + "learning_rate": 9.998350300909883e-05, + "loss": 2.7056, + "step": 197 + }, + { + "epoch": 0.00865422439791949, + "grad_norm": 10.3125, + "learning_rate": 9.998332610876629e-05, + "loss": 2.2428, + "step": 198 + }, + { + "epoch": 0.008697932601949386, + "grad_norm": 6.40625, + "learning_rate": 9.998314826517563e-05, + "loss": 2.7718, + "step": 199 + }, + { + "epoch": 0.008741640805979282, + "grad_norm": 7.84375, + "learning_rate": 9.998296947833021e-05, + "loss": 3.1103, + "step": 200 + }, + { + "epoch": 0.008785349010009179, + "grad_norm": 4.96875, + "learning_rate": 9.998278974823342e-05, + "loss": 2.9857, + "step": 201 + }, + { + "epoch": 0.008829057214039076, + "grad_norm": 4.3125, + "learning_rate": 9.998260907488863e-05, + "loss": 2.2535, + "step": 202 + }, + { + "epoch": 0.008872765418068971, + "grad_norm": 3.625, + "learning_rate": 9.998242745829927e-05, + "loss": 2.3132, + "step": 203 + }, + { + "epoch": 0.008916473622098868, + "grad_norm": 5.46875, + "learning_rate": 9.998224489846877e-05, + "loss": 2.4809, + "step": 204 + }, + { + "epoch": 0.008960181826128764, + "grad_norm": 5.125, + "learning_rate": 9.998206139540054e-05, + "loss": 3.1461, + "step": 205 + }, + { + "epoch": 0.00900389003015866, + "grad_norm": 4.15625, + "learning_rate": 9.998187694909807e-05, + "loss": 2.5521, + "step": 206 + }, + { + "epoch": 0.009047598234188558, + "grad_norm": 5.6875, + "learning_rate": 9.998169155956485e-05, + "loss": 2.6275, + "step": 207 + }, + { + "epoch": 0.009091306438218453, + "grad_norm": 5.09375, + "learning_rate": 9.998150522680437e-05, + "loss": 2.3942, + "step": 208 + }, + { + "epoch": 0.00913501464224835, + "grad_norm": 19.0, + "learning_rate": 9.998131795082011e-05, + "loss": 2.7506, + "step": 209 + }, + { + "epoch": 0.009178722846278246, + "grad_norm": 4.125, + "learning_rate": 9.998112973161566e-05, + "loss": 2.8633, + "step": 210 + }, + { + "epoch": 0.009222431050308143, + "grad_norm": 4.59375, + "learning_rate": 9.998094056919454e-05, + "loss": 2.8713, + "step": 211 + }, + { + "epoch": 0.00926613925433804, + "grad_norm": 3.9375, + "learning_rate": 9.998075046356033e-05, + "loss": 2.5957, + "step": 212 + }, + { + "epoch": 0.009309847458367935, + "grad_norm": 3.359375, + "learning_rate": 9.998055941471662e-05, + "loss": 2.4896, + "step": 213 + }, + { + "epoch": 0.009353555662397832, + "grad_norm": 4.5, + "learning_rate": 9.998036742266701e-05, + "loss": 3.1921, + "step": 214 + }, + { + "epoch": 0.009397263866427728, + "grad_norm": 4.375, + "learning_rate": 9.998017448741512e-05, + "loss": 2.8811, + "step": 215 + }, + { + "epoch": 0.009440972070457625, + "grad_norm": 3.671875, + "learning_rate": 9.997998060896458e-05, + "loss": 2.6873, + "step": 216 + }, + { + "epoch": 0.009484680274487522, + "grad_norm": 4.0625, + "learning_rate": 9.997978578731908e-05, + "loss": 2.5835, + "step": 217 + }, + { + "epoch": 0.009528388478517417, + "grad_norm": 7.0625, + "learning_rate": 9.997959002248229e-05, + "loss": 2.7929, + "step": 218 + }, + { + "epoch": 0.009572096682547314, + "grad_norm": 3.890625, + "learning_rate": 9.997939331445787e-05, + "loss": 2.1056, + "step": 219 + }, + { + "epoch": 0.009615804886577211, + "grad_norm": 4.28125, + "learning_rate": 9.997919566324959e-05, + "loss": 3.3739, + "step": 220 + }, + { + "epoch": 0.009659513090607107, + "grad_norm": 3.78125, + "learning_rate": 9.99789970688611e-05, + "loss": 2.6079, + "step": 221 + }, + { + "epoch": 0.009703221294637004, + "grad_norm": 3.796875, + "learning_rate": 9.997879753129624e-05, + "loss": 2.4539, + "step": 222 + }, + { + "epoch": 0.009746929498666899, + "grad_norm": 4.21875, + "learning_rate": 9.99785970505587e-05, + "loss": 2.2522, + "step": 223 + }, + { + "epoch": 0.009790637702696796, + "grad_norm": 3.84375, + "learning_rate": 9.99783956266523e-05, + "loss": 2.6285, + "step": 224 + }, + { + "epoch": 0.009834345906726693, + "grad_norm": 4.71875, + "learning_rate": 9.997819325958085e-05, + "loss": 2.7673, + "step": 225 + }, + { + "epoch": 0.009878054110756589, + "grad_norm": 7.5, + "learning_rate": 9.997798994934813e-05, + "loss": 2.7084, + "step": 226 + }, + { + "epoch": 0.009921762314786486, + "grad_norm": 4.03125, + "learning_rate": 9.997778569595801e-05, + "loss": 2.9379, + "step": 227 + }, + { + "epoch": 0.009965470518816381, + "grad_norm": 4.46875, + "learning_rate": 9.997758049941435e-05, + "loss": 2.7648, + "step": 228 + }, + { + "epoch": 0.010009178722846278, + "grad_norm": 3.71875, + "learning_rate": 9.997737435972099e-05, + "loss": 2.381, + "step": 229 + }, + { + "epoch": 0.010052886926876175, + "grad_norm": 3.40625, + "learning_rate": 9.997716727688183e-05, + "loss": 2.2979, + "step": 230 + }, + { + "epoch": 0.01009659513090607, + "grad_norm": 4.15625, + "learning_rate": 9.99769592509008e-05, + "loss": 2.7071, + "step": 231 + }, + { + "epoch": 0.010140303334935968, + "grad_norm": 4.875, + "learning_rate": 9.99767502817818e-05, + "loss": 3.5704, + "step": 232 + }, + { + "epoch": 0.010184011538965865, + "grad_norm": 3.421875, + "learning_rate": 9.997654036952879e-05, + "loss": 1.8784, + "step": 233 + }, + { + "epoch": 0.01022771974299576, + "grad_norm": 4.5, + "learning_rate": 9.997632951414573e-05, + "loss": 2.5868, + "step": 234 + }, + { + "epoch": 0.010271427947025657, + "grad_norm": 4.21875, + "learning_rate": 9.99761177156366e-05, + "loss": 2.1905, + "step": 235 + }, + { + "epoch": 0.010315136151055553, + "grad_norm": 4.25, + "learning_rate": 9.997590497400538e-05, + "loss": 3.011, + "step": 236 + }, + { + "epoch": 0.01035884435508545, + "grad_norm": 3.765625, + "learning_rate": 9.997569128925611e-05, + "loss": 2.5319, + "step": 237 + }, + { + "epoch": 0.010402552559115347, + "grad_norm": 4.09375, + "learning_rate": 9.997547666139279e-05, + "loss": 2.0043, + "step": 238 + }, + { + "epoch": 0.010446260763145242, + "grad_norm": 3.640625, + "learning_rate": 9.99752610904195e-05, + "loss": 2.708, + "step": 239 + }, + { + "epoch": 0.01048996896717514, + "grad_norm": 3.28125, + "learning_rate": 9.997504457634029e-05, + "loss": 2.214, + "step": 240 + }, + { + "epoch": 0.010533677171205034, + "grad_norm": 3.140625, + "learning_rate": 9.997482711915927e-05, + "loss": 2.275, + "step": 241 + }, + { + "epoch": 0.010577385375234932, + "grad_norm": 4.6875, + "learning_rate": 9.997460871888052e-05, + "loss": 2.7156, + "step": 242 + }, + { + "epoch": 0.010621093579264829, + "grad_norm": 5.0625, + "learning_rate": 9.997438937550816e-05, + "loss": 3.3969, + "step": 243 + }, + { + "epoch": 0.010664801783294724, + "grad_norm": 4.59375, + "learning_rate": 9.997416908904633e-05, + "loss": 2.7685, + "step": 244 + }, + { + "epoch": 0.010708509987324621, + "grad_norm": 3.453125, + "learning_rate": 9.997394785949922e-05, + "loss": 2.6075, + "step": 245 + }, + { + "epoch": 0.010752218191354516, + "grad_norm": 3.625, + "learning_rate": 9.997372568687097e-05, + "loss": 2.2984, + "step": 246 + }, + { + "epoch": 0.010795926395384414, + "grad_norm": 4.375, + "learning_rate": 9.997350257116578e-05, + "loss": 2.589, + "step": 247 + }, + { + "epoch": 0.01083963459941431, + "grad_norm": 3.75, + "learning_rate": 9.997327851238788e-05, + "loss": 2.0634, + "step": 248 + }, + { + "epoch": 0.010883342803444206, + "grad_norm": 9.1875, + "learning_rate": 9.997305351054146e-05, + "loss": 2.4742, + "step": 249 + }, + { + "epoch": 0.010927051007474103, + "grad_norm": 4.0, + "learning_rate": 9.99728275656308e-05, + "loss": 2.9405, + "step": 250 + }, + { + "epoch": 0.010970759211504, + "grad_norm": 3.953125, + "learning_rate": 9.997260067766014e-05, + "loss": 2.304, + "step": 251 + }, + { + "epoch": 0.011014467415533895, + "grad_norm": 4.4375, + "learning_rate": 9.997237284663379e-05, + "loss": 2.9572, + "step": 252 + }, + { + "epoch": 0.011058175619563793, + "grad_norm": 3.65625, + "learning_rate": 9.997214407255602e-05, + "loss": 2.3134, + "step": 253 + }, + { + "epoch": 0.011101883823593688, + "grad_norm": 5.125, + "learning_rate": 9.997191435543117e-05, + "loss": 3.2421, + "step": 254 + }, + { + "epoch": 0.011145592027623585, + "grad_norm": 3.453125, + "learning_rate": 9.997168369526355e-05, + "loss": 2.6112, + "step": 255 + }, + { + "epoch": 0.011189300231653482, + "grad_norm": 5.96875, + "learning_rate": 9.997145209205754e-05, + "loss": 2.4515, + "step": 256 + }, + { + "epoch": 0.011233008435683377, + "grad_norm": 5.28125, + "learning_rate": 9.99712195458175e-05, + "loss": 2.8503, + "step": 257 + }, + { + "epoch": 0.011276716639713275, + "grad_norm": 3.9375, + "learning_rate": 9.997098605654782e-05, + "loss": 2.4916, + "step": 258 + }, + { + "epoch": 0.01132042484374317, + "grad_norm": 3.59375, + "learning_rate": 9.99707516242529e-05, + "loss": 2.2976, + "step": 259 + }, + { + "epoch": 0.011364133047773067, + "grad_norm": 4.40625, + "learning_rate": 9.997051624893716e-05, + "loss": 2.5726, + "step": 260 + }, + { + "epoch": 0.011407841251802964, + "grad_norm": 4.0625, + "learning_rate": 9.997027993060506e-05, + "loss": 2.0185, + "step": 261 + }, + { + "epoch": 0.01145154945583286, + "grad_norm": 3.96875, + "learning_rate": 9.997004266926105e-05, + "loss": 3.0021, + "step": 262 + }, + { + "epoch": 0.011495257659862756, + "grad_norm": 4.0625, + "learning_rate": 9.99698044649096e-05, + "loss": 2.7337, + "step": 263 + }, + { + "epoch": 0.011538965863892652, + "grad_norm": 3.359375, + "learning_rate": 9.996956531755521e-05, + "loss": 2.3048, + "step": 264 + }, + { + "epoch": 0.011582674067922549, + "grad_norm": 5.34375, + "learning_rate": 9.996932522720242e-05, + "loss": 3.3072, + "step": 265 + }, + { + "epoch": 0.011626382271952446, + "grad_norm": 8.625, + "learning_rate": 9.996908419385571e-05, + "loss": 3.0222, + "step": 266 + }, + { + "epoch": 0.011670090475982341, + "grad_norm": 4.5625, + "learning_rate": 9.996884221751966e-05, + "loss": 2.5027, + "step": 267 + }, + { + "epoch": 0.011713798680012238, + "grad_norm": 4.15625, + "learning_rate": 9.996859929819882e-05, + "loss": 2.2139, + "step": 268 + }, + { + "epoch": 0.011757506884042136, + "grad_norm": 62.75, + "learning_rate": 9.996835543589781e-05, + "loss": 3.957, + "step": 269 + }, + { + "epoch": 0.011801215088072031, + "grad_norm": 4.5, + "learning_rate": 9.996811063062119e-05, + "loss": 2.7251, + "step": 270 + }, + { + "epoch": 0.011844923292101928, + "grad_norm": 4.75, + "learning_rate": 9.99678648823736e-05, + "loss": 2.8835, + "step": 271 + }, + { + "epoch": 0.011888631496131823, + "grad_norm": 4.5, + "learning_rate": 9.996761819115968e-05, + "loss": 2.8296, + "step": 272 + }, + { + "epoch": 0.01193233970016172, + "grad_norm": 4.9375, + "learning_rate": 9.996737055698409e-05, + "loss": 2.8128, + "step": 273 + }, + { + "epoch": 0.011976047904191617, + "grad_norm": 4.125, + "learning_rate": 9.996712197985147e-05, + "loss": 2.365, + "step": 274 + }, + { + "epoch": 0.012019756108221513, + "grad_norm": 6.3125, + "learning_rate": 9.996687245976655e-05, + "loss": 2.4859, + "step": 275 + }, + { + "epoch": 0.01206346431225141, + "grad_norm": 3.15625, + "learning_rate": 9.996662199673401e-05, + "loss": 2.6919, + "step": 276 + }, + { + "epoch": 0.012107172516281305, + "grad_norm": 4.5, + "learning_rate": 9.996637059075861e-05, + "loss": 2.649, + "step": 277 + }, + { + "epoch": 0.012150880720311202, + "grad_norm": 7.6875, + "learning_rate": 9.996611824184505e-05, + "loss": 3.3843, + "step": 278 + }, + { + "epoch": 0.0121945889243411, + "grad_norm": 3.703125, + "learning_rate": 9.996586494999814e-05, + "loss": 2.5776, + "step": 279 + }, + { + "epoch": 0.012238297128370995, + "grad_norm": 3.734375, + "learning_rate": 9.996561071522264e-05, + "loss": 2.43, + "step": 280 + }, + { + "epoch": 0.012282005332400892, + "grad_norm": 5.3125, + "learning_rate": 9.996535553752331e-05, + "loss": 2.3819, + "step": 281 + }, + { + "epoch": 0.012325713536430787, + "grad_norm": 3.359375, + "learning_rate": 9.996509941690503e-05, + "loss": 2.1984, + "step": 282 + }, + { + "epoch": 0.012369421740460684, + "grad_norm": 3.6875, + "learning_rate": 9.99648423533726e-05, + "loss": 2.7669, + "step": 283 + }, + { + "epoch": 0.012413129944490581, + "grad_norm": 3.3125, + "learning_rate": 9.996458434693086e-05, + "loss": 2.799, + "step": 284 + }, + { + "epoch": 0.012456838148520477, + "grad_norm": 4.0625, + "learning_rate": 9.99643253975847e-05, + "loss": 2.7216, + "step": 285 + }, + { + "epoch": 0.012500546352550374, + "grad_norm": 3.390625, + "learning_rate": 9.996406550533901e-05, + "loss": 2.8937, + "step": 286 + }, + { + "epoch": 0.012544254556580271, + "grad_norm": 6.0625, + "learning_rate": 9.996380467019868e-05, + "loss": 2.3943, + "step": 287 + }, + { + "epoch": 0.012587962760610166, + "grad_norm": 4.15625, + "learning_rate": 9.996354289216863e-05, + "loss": 2.9379, + "step": 288 + }, + { + "epoch": 0.012631670964640063, + "grad_norm": 3.71875, + "learning_rate": 9.996328017125381e-05, + "loss": 2.5333, + "step": 289 + }, + { + "epoch": 0.012675379168669959, + "grad_norm": 5.09375, + "learning_rate": 9.996301650745917e-05, + "loss": 2.6082, + "step": 290 + }, + { + "epoch": 0.012719087372699856, + "grad_norm": 4.28125, + "learning_rate": 9.99627519007897e-05, + "loss": 3.1054, + "step": 291 + }, + { + "epoch": 0.012762795576729753, + "grad_norm": 4.5, + "learning_rate": 9.996248635125039e-05, + "loss": 2.423, + "step": 292 + }, + { + "epoch": 0.012806503780759648, + "grad_norm": 7.8125, + "learning_rate": 9.996221985884623e-05, + "loss": 2.3539, + "step": 293 + }, + { + "epoch": 0.012850211984789545, + "grad_norm": 4.6875, + "learning_rate": 9.996195242358226e-05, + "loss": 2.8162, + "step": 294 + }, + { + "epoch": 0.01289392018881944, + "grad_norm": 4.0625, + "learning_rate": 9.996168404546356e-05, + "loss": 2.4511, + "step": 295 + }, + { + "epoch": 0.012937628392849338, + "grad_norm": 3.46875, + "learning_rate": 9.996141472449514e-05, + "loss": 2.3848, + "step": 296 + }, + { + "epoch": 0.012981336596879235, + "grad_norm": 3.515625, + "learning_rate": 9.996114446068212e-05, + "loss": 2.4308, + "step": 297 + }, + { + "epoch": 0.01302504480090913, + "grad_norm": 5.0, + "learning_rate": 9.996087325402959e-05, + "loss": 2.6241, + "step": 298 + }, + { + "epoch": 0.013068753004939027, + "grad_norm": 3.453125, + "learning_rate": 9.996060110454266e-05, + "loss": 2.1833, + "step": 299 + }, + { + "epoch": 0.013112461208968923, + "grad_norm": 3.625, + "learning_rate": 9.996032801222648e-05, + "loss": 2.782, + "step": 300 + }, + { + "epoch": 0.01315616941299882, + "grad_norm": 3.046875, + "learning_rate": 9.996005397708619e-05, + "loss": 2.0714, + "step": 301 + }, + { + "epoch": 0.013199877617028717, + "grad_norm": 4.59375, + "learning_rate": 9.995977899912697e-05, + "loss": 2.7152, + "step": 302 + }, + { + "epoch": 0.013243585821058612, + "grad_norm": 3.765625, + "learning_rate": 9.995950307835401e-05, + "loss": 2.8984, + "step": 303 + }, + { + "epoch": 0.01328729402508851, + "grad_norm": 3.828125, + "learning_rate": 9.995922621477252e-05, + "loss": 2.3674, + "step": 304 + }, + { + "epoch": 0.013331002229118406, + "grad_norm": 6.0625, + "learning_rate": 9.995894840838771e-05, + "loss": 2.7993, + "step": 305 + }, + { + "epoch": 0.013374710433148302, + "grad_norm": 5.78125, + "learning_rate": 9.995866965920485e-05, + "loss": 2.4223, + "step": 306 + }, + { + "epoch": 0.013418418637178199, + "grad_norm": 4.6875, + "learning_rate": 9.995838996722914e-05, + "loss": 2.8455, + "step": 307 + }, + { + "epoch": 0.013462126841208094, + "grad_norm": 3.71875, + "learning_rate": 9.995810933246594e-05, + "loss": 2.0036, + "step": 308 + }, + { + "epoch": 0.013505835045237991, + "grad_norm": 4.03125, + "learning_rate": 9.995782775492048e-05, + "loss": 2.5858, + "step": 309 + }, + { + "epoch": 0.013549543249267888, + "grad_norm": 11.75, + "learning_rate": 9.995754523459813e-05, + "loss": 2.921, + "step": 310 + }, + { + "epoch": 0.013593251453297784, + "grad_norm": 27.875, + "learning_rate": 9.995726177150418e-05, + "loss": 2.7016, + "step": 311 + }, + { + "epoch": 0.01363695965732768, + "grad_norm": 5.09375, + "learning_rate": 9.995697736564397e-05, + "loss": 2.4968, + "step": 312 + }, + { + "epoch": 0.013680667861357576, + "grad_norm": 3.9375, + "learning_rate": 9.995669201702291e-05, + "loss": 3.1727, + "step": 313 + }, + { + "epoch": 0.013724376065387473, + "grad_norm": 4.34375, + "learning_rate": 9.995640572564635e-05, + "loss": 2.812, + "step": 314 + }, + { + "epoch": 0.01376808426941737, + "grad_norm": 3.296875, + "learning_rate": 9.995611849151971e-05, + "loss": 2.3059, + "step": 315 + }, + { + "epoch": 0.013811792473447266, + "grad_norm": 5.40625, + "learning_rate": 9.995583031464842e-05, + "loss": 3.082, + "step": 316 + }, + { + "epoch": 0.013855500677477163, + "grad_norm": 4.0, + "learning_rate": 9.99555411950379e-05, + "loss": 2.0426, + "step": 317 + }, + { + "epoch": 0.013899208881507058, + "grad_norm": 4.65625, + "learning_rate": 9.99552511326936e-05, + "loss": 2.8893, + "step": 318 + }, + { + "epoch": 0.013942917085536955, + "grad_norm": 3.734375, + "learning_rate": 9.9954960127621e-05, + "loss": 2.4564, + "step": 319 + }, + { + "epoch": 0.013986625289566852, + "grad_norm": 3.984375, + "learning_rate": 9.995466817982562e-05, + "loss": 2.5178, + "step": 320 + }, + { + "epoch": 0.014030333493596748, + "grad_norm": 15.625, + "learning_rate": 9.995437528931293e-05, + "loss": 3.7793, + "step": 321 + }, + { + "epoch": 0.014074041697626645, + "grad_norm": 3.40625, + "learning_rate": 9.995408145608847e-05, + "loss": 2.3784, + "step": 322 + }, + { + "epoch": 0.014117749901656542, + "grad_norm": 3.890625, + "learning_rate": 9.99537866801578e-05, + "loss": 2.2287, + "step": 323 + }, + { + "epoch": 0.014161458105686437, + "grad_norm": 3.984375, + "learning_rate": 9.995349096152645e-05, + "loss": 2.4412, + "step": 324 + }, + { + "epoch": 0.014205166309716334, + "grad_norm": 4.09375, + "learning_rate": 9.995319430020003e-05, + "loss": 2.3395, + "step": 325 + }, + { + "epoch": 0.01424887451374623, + "grad_norm": 4.5, + "learning_rate": 9.995289669618415e-05, + "loss": 2.7861, + "step": 326 + }, + { + "epoch": 0.014292582717776127, + "grad_norm": 4.34375, + "learning_rate": 9.995259814948439e-05, + "loss": 2.4553, + "step": 327 + }, + { + "epoch": 0.014336290921806024, + "grad_norm": 4.78125, + "learning_rate": 9.99522986601064e-05, + "loss": 2.9104, + "step": 328 + }, + { + "epoch": 0.014379999125835919, + "grad_norm": 4.03125, + "learning_rate": 9.995199822805583e-05, + "loss": 2.8891, + "step": 329 + }, + { + "epoch": 0.014423707329865816, + "grad_norm": 4.34375, + "learning_rate": 9.995169685333836e-05, + "loss": 2.8756, + "step": 330 + }, + { + "epoch": 0.014467415533895711, + "grad_norm": 3.140625, + "learning_rate": 9.995139453595968e-05, + "loss": 2.123, + "step": 331 + }, + { + "epoch": 0.014511123737925609, + "grad_norm": 4.1875, + "learning_rate": 9.995109127592546e-05, + "loss": 2.7999, + "step": 332 + }, + { + "epoch": 0.014554831941955506, + "grad_norm": 3.359375, + "learning_rate": 9.995078707324146e-05, + "loss": 2.3765, + "step": 333 + }, + { + "epoch": 0.014598540145985401, + "grad_norm": 3.515625, + "learning_rate": 9.99504819279134e-05, + "loss": 2.0792, + "step": 334 + }, + { + "epoch": 0.014642248350015298, + "grad_norm": 4.125, + "learning_rate": 9.995017583994706e-05, + "loss": 2.5463, + "step": 335 + }, + { + "epoch": 0.014685956554045195, + "grad_norm": 4.71875, + "learning_rate": 9.99498688093482e-05, + "loss": 3.4456, + "step": 336 + }, + { + "epoch": 0.01472966475807509, + "grad_norm": 3.765625, + "learning_rate": 9.994956083612261e-05, + "loss": 2.3138, + "step": 337 + }, + { + "epoch": 0.014773372962104988, + "grad_norm": 3.359375, + "learning_rate": 9.99492519202761e-05, + "loss": 2.4555, + "step": 338 + }, + { + "epoch": 0.014817081166134883, + "grad_norm": 3.453125, + "learning_rate": 9.994894206181452e-05, + "loss": 2.4877, + "step": 339 + }, + { + "epoch": 0.01486078937016478, + "grad_norm": 4.875, + "learning_rate": 9.994863126074371e-05, + "loss": 2.9761, + "step": 340 + }, + { + "epoch": 0.014904497574194677, + "grad_norm": 4.09375, + "learning_rate": 9.994831951706953e-05, + "loss": 2.5688, + "step": 341 + }, + { + "epoch": 0.014948205778224572, + "grad_norm": 3.171875, + "learning_rate": 9.994800683079786e-05, + "loss": 2.6589, + "step": 342 + }, + { + "epoch": 0.01499191398225447, + "grad_norm": 4.65625, + "learning_rate": 9.99476932019346e-05, + "loss": 2.4292, + "step": 343 + }, + { + "epoch": 0.015035622186284365, + "grad_norm": 3.703125, + "learning_rate": 9.994737863048567e-05, + "loss": 2.5704, + "step": 344 + }, + { + "epoch": 0.015079330390314262, + "grad_norm": 4.15625, + "learning_rate": 9.994706311645703e-05, + "loss": 2.5487, + "step": 345 + }, + { + "epoch": 0.015123038594344159, + "grad_norm": 3.65625, + "learning_rate": 9.994674665985461e-05, + "loss": 2.1273, + "step": 346 + }, + { + "epoch": 0.015166746798374054, + "grad_norm": 4.59375, + "learning_rate": 9.994642926068438e-05, + "loss": 2.0096, + "step": 347 + }, + { + "epoch": 0.015210455002403951, + "grad_norm": 4.8125, + "learning_rate": 9.994611091895234e-05, + "loss": 2.841, + "step": 348 + }, + { + "epoch": 0.015254163206433847, + "grad_norm": 3.375, + "learning_rate": 9.994579163466448e-05, + "loss": 2.3199, + "step": 349 + }, + { + "epoch": 0.015297871410463744, + "grad_norm": 3.171875, + "learning_rate": 9.994547140782686e-05, + "loss": 2.1122, + "step": 350 + }, + { + "epoch": 0.015341579614493641, + "grad_norm": 3.6875, + "learning_rate": 9.994515023844548e-05, + "loss": 2.705, + "step": 351 + }, + { + "epoch": 0.015385287818523536, + "grad_norm": 3.828125, + "learning_rate": 9.994482812652645e-05, + "loss": 1.9516, + "step": 352 + }, + { + "epoch": 0.015428996022553433, + "grad_norm": 3.453125, + "learning_rate": 9.99445050720758e-05, + "loss": 2.2709, + "step": 353 + }, + { + "epoch": 0.01547270422658333, + "grad_norm": 3.84375, + "learning_rate": 9.994418107509966e-05, + "loss": 2.2912, + "step": 354 + }, + { + "epoch": 0.015516412430613226, + "grad_norm": 4.4375, + "learning_rate": 9.994385613560413e-05, + "loss": 2.6541, + "step": 355 + }, + { + "epoch": 0.015560120634643123, + "grad_norm": 3.953125, + "learning_rate": 9.994353025359535e-05, + "loss": 2.1626, + "step": 356 + }, + { + "epoch": 0.015603828838673018, + "grad_norm": 3.890625, + "learning_rate": 9.994320342907945e-05, + "loss": 2.693, + "step": 357 + }, + { + "epoch": 0.015647537042702914, + "grad_norm": 3.5625, + "learning_rate": 9.994287566206264e-05, + "loss": 2.8311, + "step": 358 + }, + { + "epoch": 0.01569124524673281, + "grad_norm": 3.59375, + "learning_rate": 9.994254695255105e-05, + "loss": 2.4746, + "step": 359 + }, + { + "epoch": 0.015734953450762708, + "grad_norm": 3.375, + "learning_rate": 9.994221730055091e-05, + "loss": 2.3147, + "step": 360 + }, + { + "epoch": 0.015778661654792605, + "grad_norm": 4.09375, + "learning_rate": 9.994188670606846e-05, + "loss": 2.7883, + "step": 361 + }, + { + "epoch": 0.015822369858822502, + "grad_norm": 3.34375, + "learning_rate": 9.994155516910991e-05, + "loss": 2.2414, + "step": 362 + }, + { + "epoch": 0.0158660780628524, + "grad_norm": 3.1875, + "learning_rate": 9.994122268968154e-05, + "loss": 2.3983, + "step": 363 + }, + { + "epoch": 0.015909786266882293, + "grad_norm": 3.859375, + "learning_rate": 9.99408892677896e-05, + "loss": 2.6232, + "step": 364 + }, + { + "epoch": 0.01595349447091219, + "grad_norm": 6.09375, + "learning_rate": 9.99405549034404e-05, + "loss": 2.3536, + "step": 365 + }, + { + "epoch": 0.015997202674942087, + "grad_norm": 4.9375, + "learning_rate": 9.994021959664024e-05, + "loss": 2.8419, + "step": 366 + }, + { + "epoch": 0.016040910878971984, + "grad_norm": 3.703125, + "learning_rate": 9.993988334739544e-05, + "loss": 2.9319, + "step": 367 + }, + { + "epoch": 0.01608461908300188, + "grad_norm": 4.53125, + "learning_rate": 9.993954615571238e-05, + "loss": 2.4233, + "step": 368 + }, + { + "epoch": 0.016128327287031775, + "grad_norm": 5.84375, + "learning_rate": 9.993920802159739e-05, + "loss": 3.2623, + "step": 369 + }, + { + "epoch": 0.016172035491061672, + "grad_norm": 3.28125, + "learning_rate": 9.993886894505686e-05, + "loss": 2.6416, + "step": 370 + }, + { + "epoch": 0.01621574369509157, + "grad_norm": 4.34375, + "learning_rate": 9.993852892609718e-05, + "loss": 2.7184, + "step": 371 + }, + { + "epoch": 0.016259451899121466, + "grad_norm": 4.0625, + "learning_rate": 9.99381879647248e-05, + "loss": 2.9044, + "step": 372 + }, + { + "epoch": 0.016303160103151363, + "grad_norm": 4.5, + "learning_rate": 9.993784606094612e-05, + "loss": 2.1336, + "step": 373 + }, + { + "epoch": 0.016346868307181257, + "grad_norm": 6.34375, + "learning_rate": 9.99375032147676e-05, + "loss": 2.2281, + "step": 374 + }, + { + "epoch": 0.016390576511211154, + "grad_norm": 3.21875, + "learning_rate": 9.993715942619573e-05, + "loss": 2.3788, + "step": 375 + }, + { + "epoch": 0.01643428471524105, + "grad_norm": 3.21875, + "learning_rate": 9.993681469523697e-05, + "loss": 1.9586, + "step": 376 + }, + { + "epoch": 0.016477992919270948, + "grad_norm": 4.3125, + "learning_rate": 9.993646902189784e-05, + "loss": 3.2296, + "step": 377 + }, + { + "epoch": 0.016521701123300845, + "grad_norm": 3.984375, + "learning_rate": 9.993612240618485e-05, + "loss": 2.4422, + "step": 378 + }, + { + "epoch": 0.01656540932733074, + "grad_norm": 5.28125, + "learning_rate": 9.993577484810455e-05, + "loss": 2.8639, + "step": 379 + }, + { + "epoch": 0.016609117531360636, + "grad_norm": 5.34375, + "learning_rate": 9.993542634766352e-05, + "loss": 2.9119, + "step": 380 + }, + { + "epoch": 0.016652825735390533, + "grad_norm": 4.25, + "learning_rate": 9.993507690486831e-05, + "loss": 2.5102, + "step": 381 + }, + { + "epoch": 0.01669653393942043, + "grad_norm": 3.53125, + "learning_rate": 9.99347265197255e-05, + "loss": 2.5379, + "step": 382 + }, + { + "epoch": 0.016740242143450327, + "grad_norm": 3.53125, + "learning_rate": 9.993437519224176e-05, + "loss": 2.2643, + "step": 383 + }, + { + "epoch": 0.01678395034748022, + "grad_norm": 3.625, + "learning_rate": 9.993402292242367e-05, + "loss": 2.3072, + "step": 384 + }, + { + "epoch": 0.016827658551510118, + "grad_norm": 3.90625, + "learning_rate": 9.993366971027788e-05, + "loss": 2.592, + "step": 385 + }, + { + "epoch": 0.016871366755540015, + "grad_norm": 4.0625, + "learning_rate": 9.993331555581108e-05, + "loss": 2.6759, + "step": 386 + }, + { + "epoch": 0.016915074959569912, + "grad_norm": 3.296875, + "learning_rate": 9.993296045902994e-05, + "loss": 2.2537, + "step": 387 + }, + { + "epoch": 0.01695878316359981, + "grad_norm": 4.625, + "learning_rate": 9.993260441994116e-05, + "loss": 2.7636, + "step": 388 + }, + { + "epoch": 0.017002491367629703, + "grad_norm": 3.671875, + "learning_rate": 9.993224743855145e-05, + "loss": 2.5618, + "step": 389 + }, + { + "epoch": 0.0170461995716596, + "grad_norm": 3.09375, + "learning_rate": 9.993188951486758e-05, + "loss": 2.3234, + "step": 390 + }, + { + "epoch": 0.017089907775689497, + "grad_norm": 3.453125, + "learning_rate": 9.993153064889626e-05, + "loss": 3.1043, + "step": 391 + }, + { + "epoch": 0.017133615979719394, + "grad_norm": 4.28125, + "learning_rate": 9.99311708406443e-05, + "loss": 2.9755, + "step": 392 + }, + { + "epoch": 0.01717732418374929, + "grad_norm": 3.890625, + "learning_rate": 9.993081009011847e-05, + "loss": 2.5243, + "step": 393 + }, + { + "epoch": 0.017221032387779184, + "grad_norm": 3.6875, + "learning_rate": 9.993044839732559e-05, + "loss": 2.388, + "step": 394 + }, + { + "epoch": 0.01726474059180908, + "grad_norm": 3.09375, + "learning_rate": 9.993008576227247e-05, + "loss": 2.1978, + "step": 395 + }, + { + "epoch": 0.01730844879583898, + "grad_norm": 3.359375, + "learning_rate": 9.992972218496597e-05, + "loss": 2.2283, + "step": 396 + }, + { + "epoch": 0.017352156999868876, + "grad_norm": 4.09375, + "learning_rate": 9.992935766541294e-05, + "loss": 2.2622, + "step": 397 + }, + { + "epoch": 0.017395865203898773, + "grad_norm": 3.625, + "learning_rate": 9.992899220362025e-05, + "loss": 2.4432, + "step": 398 + }, + { + "epoch": 0.01743957340792867, + "grad_norm": 4.0625, + "learning_rate": 9.992862579959481e-05, + "loss": 2.408, + "step": 399 + }, + { + "epoch": 0.017483281611958564, + "grad_norm": 3.625, + "learning_rate": 9.992825845334355e-05, + "loss": 1.867, + "step": 400 + }, + { + "epoch": 0.01752698981598846, + "grad_norm": 4.59375, + "learning_rate": 9.992789016487337e-05, + "loss": 2.1599, + "step": 401 + }, + { + "epoch": 0.017570698020018358, + "grad_norm": 7.4375, + "learning_rate": 9.992752093419124e-05, + "loss": 2.4313, + "step": 402 + }, + { + "epoch": 0.017614406224048255, + "grad_norm": 6.5, + "learning_rate": 9.992715076130414e-05, + "loss": 2.2794, + "step": 403 + }, + { + "epoch": 0.017658114428078152, + "grad_norm": 4.25, + "learning_rate": 9.992677964621901e-05, + "loss": 2.8487, + "step": 404 + }, + { + "epoch": 0.017701822632108045, + "grad_norm": 3.203125, + "learning_rate": 9.992640758894292e-05, + "loss": 2.3595, + "step": 405 + }, + { + "epoch": 0.017745530836137943, + "grad_norm": 3.34375, + "learning_rate": 9.992603458948281e-05, + "loss": 2.5555, + "step": 406 + }, + { + "epoch": 0.01778923904016784, + "grad_norm": 3.3125, + "learning_rate": 9.992566064784581e-05, + "loss": 2.0888, + "step": 407 + }, + { + "epoch": 0.017832947244197737, + "grad_norm": 4.5, + "learning_rate": 9.99252857640389e-05, + "loss": 2.4566, + "step": 408 + }, + { + "epoch": 0.017876655448227634, + "grad_norm": 3.125, + "learning_rate": 9.99249099380692e-05, + "loss": 2.1311, + "step": 409 + }, + { + "epoch": 0.017920363652257527, + "grad_norm": 3.875, + "learning_rate": 9.992453316994377e-05, + "loss": 2.4535, + "step": 410 + }, + { + "epoch": 0.017964071856287425, + "grad_norm": 3.9375, + "learning_rate": 9.992415545966976e-05, + "loss": 2.1546, + "step": 411 + }, + { + "epoch": 0.01800778006031732, + "grad_norm": 3.484375, + "learning_rate": 9.992377680725425e-05, + "loss": 2.4082, + "step": 412 + }, + { + "epoch": 0.01805148826434722, + "grad_norm": 3.40625, + "learning_rate": 9.992339721270443e-05, + "loss": 2.7536, + "step": 413 + }, + { + "epoch": 0.018095196468377116, + "grad_norm": 3.703125, + "learning_rate": 9.992301667602743e-05, + "loss": 2.5332, + "step": 414 + }, + { + "epoch": 0.01813890467240701, + "grad_norm": 3.109375, + "learning_rate": 9.992263519723046e-05, + "loss": 2.2629, + "step": 415 + }, + { + "epoch": 0.018182612876436906, + "grad_norm": 3.671875, + "learning_rate": 9.99222527763207e-05, + "loss": 2.08, + "step": 416 + }, + { + "epoch": 0.018226321080466804, + "grad_norm": 3.515625, + "learning_rate": 9.992186941330537e-05, + "loss": 2.2829, + "step": 417 + }, + { + "epoch": 0.0182700292844967, + "grad_norm": 3.0, + "learning_rate": 9.99214851081917e-05, + "loss": 2.5898, + "step": 418 + }, + { + "epoch": 0.018313737488526598, + "grad_norm": 3.171875, + "learning_rate": 9.992109986098696e-05, + "loss": 2.425, + "step": 419 + }, + { + "epoch": 0.01835744569255649, + "grad_norm": 3.59375, + "learning_rate": 9.992071367169839e-05, + "loss": 2.2076, + "step": 420 + }, + { + "epoch": 0.01840115389658639, + "grad_norm": 3.203125, + "learning_rate": 9.992032654033333e-05, + "loss": 2.2598, + "step": 421 + }, + { + "epoch": 0.018444862100616286, + "grad_norm": 2.96875, + "learning_rate": 9.991993846689902e-05, + "loss": 2.3534, + "step": 422 + }, + { + "epoch": 0.018488570304646183, + "grad_norm": 2.84375, + "learning_rate": 9.991954945140284e-05, + "loss": 2.023, + "step": 423 + }, + { + "epoch": 0.01853227850867608, + "grad_norm": 3.578125, + "learning_rate": 9.99191594938521e-05, + "loss": 2.5662, + "step": 424 + }, + { + "epoch": 0.018575986712705973, + "grad_norm": 3.5625, + "learning_rate": 9.991876859425415e-05, + "loss": 2.2721, + "step": 425 + }, + { + "epoch": 0.01861969491673587, + "grad_norm": 3.6875, + "learning_rate": 9.991837675261641e-05, + "loss": 2.6638, + "step": 426 + }, + { + "epoch": 0.018663403120765767, + "grad_norm": 3.359375, + "learning_rate": 9.991798396894622e-05, + "loss": 2.6768, + "step": 427 + }, + { + "epoch": 0.018707111324795665, + "grad_norm": 3.890625, + "learning_rate": 9.991759024325104e-05, + "loss": 3.2616, + "step": 428 + }, + { + "epoch": 0.01875081952882556, + "grad_norm": 3.109375, + "learning_rate": 9.99171955755383e-05, + "loss": 2.2708, + "step": 429 + }, + { + "epoch": 0.018794527732855455, + "grad_norm": 4.03125, + "learning_rate": 9.991679996581539e-05, + "loss": 2.6224, + "step": 430 + }, + { + "epoch": 0.018838235936885352, + "grad_norm": 5.28125, + "learning_rate": 9.991640341408984e-05, + "loss": 1.7391, + "step": 431 + }, + { + "epoch": 0.01888194414091525, + "grad_norm": 4.09375, + "learning_rate": 9.991600592036908e-05, + "loss": 2.494, + "step": 432 + }, + { + "epoch": 0.018925652344945147, + "grad_norm": 3.59375, + "learning_rate": 9.991560748466067e-05, + "loss": 2.703, + "step": 433 + }, + { + "epoch": 0.018969360548975044, + "grad_norm": 8.625, + "learning_rate": 9.991520810697208e-05, + "loss": 2.121, + "step": 434 + }, + { + "epoch": 0.01901306875300494, + "grad_norm": 7.78125, + "learning_rate": 9.991480778731086e-05, + "loss": 1.7441, + "step": 435 + }, + { + "epoch": 0.019056776957034834, + "grad_norm": 3.546875, + "learning_rate": 9.991440652568458e-05, + "loss": 1.911, + "step": 436 + }, + { + "epoch": 0.01910048516106473, + "grad_norm": 3.0, + "learning_rate": 9.99140043221008e-05, + "loss": 2.441, + "step": 437 + }, + { + "epoch": 0.01914419336509463, + "grad_norm": 4.53125, + "learning_rate": 9.991360117656712e-05, + "loss": 2.7907, + "step": 438 + }, + { + "epoch": 0.019187901569124526, + "grad_norm": 4.125, + "learning_rate": 9.991319708909113e-05, + "loss": 2.7946, + "step": 439 + }, + { + "epoch": 0.019231609773154423, + "grad_norm": 4.0625, + "learning_rate": 9.991279205968046e-05, + "loss": 2.985, + "step": 440 + }, + { + "epoch": 0.019275317977184316, + "grad_norm": 3.15625, + "learning_rate": 9.991238608834276e-05, + "loss": 2.6155, + "step": 441 + }, + { + "epoch": 0.019319026181214213, + "grad_norm": 3.40625, + "learning_rate": 9.99119791750857e-05, + "loss": 2.3012, + "step": 442 + }, + { + "epoch": 0.01936273438524411, + "grad_norm": 6.09375, + "learning_rate": 9.991157131991695e-05, + "loss": 3.8634, + "step": 443 + }, + { + "epoch": 0.019406442589274008, + "grad_norm": 3.09375, + "learning_rate": 9.991116252284421e-05, + "loss": 2.4776, + "step": 444 + }, + { + "epoch": 0.019450150793303905, + "grad_norm": 6.625, + "learning_rate": 9.991075278387518e-05, + "loss": 2.2461, + "step": 445 + }, + { + "epoch": 0.019493858997333798, + "grad_norm": 3.21875, + "learning_rate": 9.99103421030176e-05, + "loss": 2.2593, + "step": 446 + }, + { + "epoch": 0.019537567201363695, + "grad_norm": 10.375, + "learning_rate": 9.990993048027923e-05, + "loss": 2.8026, + "step": 447 + }, + { + "epoch": 0.019581275405393592, + "grad_norm": 3.8125, + "learning_rate": 9.990951791566784e-05, + "loss": 2.234, + "step": 448 + }, + { + "epoch": 0.01962498360942349, + "grad_norm": 4.53125, + "learning_rate": 9.99091044091912e-05, + "loss": 2.7442, + "step": 449 + }, + { + "epoch": 0.019668691813453387, + "grad_norm": 4.125, + "learning_rate": 9.990868996085712e-05, + "loss": 2.5416, + "step": 450 + }, + { + "epoch": 0.01971240001748328, + "grad_norm": 3.28125, + "learning_rate": 9.990827457067343e-05, + "loss": 2.4102, + "step": 451 + }, + { + "epoch": 0.019756108221513177, + "grad_norm": 4.34375, + "learning_rate": 9.990785823864795e-05, + "loss": 2.8615, + "step": 452 + }, + { + "epoch": 0.019799816425543074, + "grad_norm": 3.125, + "learning_rate": 9.990744096478855e-05, + "loss": 2.4941, + "step": 453 + }, + { + "epoch": 0.01984352462957297, + "grad_norm": 5.0, + "learning_rate": 9.990702274910309e-05, + "loss": 3.4405, + "step": 454 + }, + { + "epoch": 0.01988723283360287, + "grad_norm": 4.53125, + "learning_rate": 9.990660359159949e-05, + "loss": 2.8826, + "step": 455 + }, + { + "epoch": 0.019930941037632762, + "grad_norm": 4.03125, + "learning_rate": 9.990618349228564e-05, + "loss": 2.3629, + "step": 456 + }, + { + "epoch": 0.01997464924166266, + "grad_norm": 5.96875, + "learning_rate": 9.990576245116947e-05, + "loss": 3.5398, + "step": 457 + }, + { + "epoch": 0.020018357445692556, + "grad_norm": 4.4375, + "learning_rate": 9.990534046825893e-05, + "loss": 2.4665, + "step": 458 + }, + { + "epoch": 0.020062065649722453, + "grad_norm": 3.046875, + "learning_rate": 9.990491754356199e-05, + "loss": 2.6201, + "step": 459 + }, + { + "epoch": 0.02010577385375235, + "grad_norm": 4.0, + "learning_rate": 9.990449367708661e-05, + "loss": 2.3883, + "step": 460 + }, + { + "epoch": 0.020149482057782244, + "grad_norm": 3.296875, + "learning_rate": 9.99040688688408e-05, + "loss": 2.013, + "step": 461 + }, + { + "epoch": 0.02019319026181214, + "grad_norm": 3.15625, + "learning_rate": 9.99036431188326e-05, + "loss": 2.3463, + "step": 462 + }, + { + "epoch": 0.020236898465842038, + "grad_norm": 3.984375, + "learning_rate": 9.990321642707001e-05, + "loss": 1.838, + "step": 463 + }, + { + "epoch": 0.020280606669871935, + "grad_norm": 3.90625, + "learning_rate": 9.99027887935611e-05, + "loss": 2.5029, + "step": 464 + }, + { + "epoch": 0.020324314873901832, + "grad_norm": 3.609375, + "learning_rate": 9.990236021831391e-05, + "loss": 2.2701, + "step": 465 + }, + { + "epoch": 0.02036802307793173, + "grad_norm": 4.125, + "learning_rate": 9.990193070133659e-05, + "loss": 2.177, + "step": 466 + }, + { + "epoch": 0.020411731281961623, + "grad_norm": 3.90625, + "learning_rate": 9.99015002426372e-05, + "loss": 2.4276, + "step": 467 + }, + { + "epoch": 0.02045543948599152, + "grad_norm": 3.578125, + "learning_rate": 9.990106884222385e-05, + "loss": 2.2515, + "step": 468 + }, + { + "epoch": 0.020499147690021417, + "grad_norm": 3.390625, + "learning_rate": 9.990063650010473e-05, + "loss": 2.238, + "step": 469 + }, + { + "epoch": 0.020542855894051314, + "grad_norm": 3.859375, + "learning_rate": 9.990020321628794e-05, + "loss": 2.5933, + "step": 470 + }, + { + "epoch": 0.02058656409808121, + "grad_norm": 10.1875, + "learning_rate": 9.989976899078172e-05, + "loss": 3.4384, + "step": 471 + }, + { + "epoch": 0.020630272302111105, + "grad_norm": 3.3125, + "learning_rate": 9.989933382359422e-05, + "loss": 2.2785, + "step": 472 + }, + { + "epoch": 0.020673980506141002, + "grad_norm": 4.125, + "learning_rate": 9.989889771473367e-05, + "loss": 2.3461, + "step": 473 + }, + { + "epoch": 0.0207176887101709, + "grad_norm": 3.984375, + "learning_rate": 9.989846066420829e-05, + "loss": 2.3127, + "step": 474 + }, + { + "epoch": 0.020761396914200796, + "grad_norm": 3.421875, + "learning_rate": 9.989802267202635e-05, + "loss": 3.1751, + "step": 475 + }, + { + "epoch": 0.020805105118230693, + "grad_norm": 4.96875, + "learning_rate": 9.989758373819608e-05, + "loss": 2.5285, + "step": 476 + }, + { + "epoch": 0.020848813322260587, + "grad_norm": 4.4375, + "learning_rate": 9.989714386272579e-05, + "loss": 2.064, + "step": 477 + }, + { + "epoch": 0.020892521526290484, + "grad_norm": 3.40625, + "learning_rate": 9.989670304562377e-05, + "loss": 2.2734, + "step": 478 + }, + { + "epoch": 0.02093622973032038, + "grad_norm": 3.515625, + "learning_rate": 9.989626128689835e-05, + "loss": 2.8476, + "step": 479 + }, + { + "epoch": 0.02097993793435028, + "grad_norm": 4.71875, + "learning_rate": 9.989581858655785e-05, + "loss": 2.5702, + "step": 480 + }, + { + "epoch": 0.021023646138380175, + "grad_norm": 3.3125, + "learning_rate": 9.989537494461064e-05, + "loss": 2.4645, + "step": 481 + }, + { + "epoch": 0.02106735434241007, + "grad_norm": 7.40625, + "learning_rate": 9.989493036106507e-05, + "loss": 3.0028, + "step": 482 + }, + { + "epoch": 0.021111062546439966, + "grad_norm": 3.515625, + "learning_rate": 9.989448483592957e-05, + "loss": 2.9433, + "step": 483 + }, + { + "epoch": 0.021154770750469863, + "grad_norm": 3.71875, + "learning_rate": 9.989403836921251e-05, + "loss": 2.474, + "step": 484 + }, + { + "epoch": 0.02119847895449976, + "grad_norm": 3.5625, + "learning_rate": 9.989359096092233e-05, + "loss": 2.9873, + "step": 485 + }, + { + "epoch": 0.021242187158529657, + "grad_norm": 3.796875, + "learning_rate": 9.989314261106749e-05, + "loss": 2.3552, + "step": 486 + }, + { + "epoch": 0.02128589536255955, + "grad_norm": 3.09375, + "learning_rate": 9.98926933196564e-05, + "loss": 2.4021, + "step": 487 + }, + { + "epoch": 0.021329603566589448, + "grad_norm": 3.078125, + "learning_rate": 9.989224308669758e-05, + "loss": 2.3827, + "step": 488 + }, + { + "epoch": 0.021373311770619345, + "grad_norm": 2.828125, + "learning_rate": 9.989179191219952e-05, + "loss": 2.1144, + "step": 489 + }, + { + "epoch": 0.021417019974649242, + "grad_norm": 3.765625, + "learning_rate": 9.989133979617074e-05, + "loss": 2.1834, + "step": 490 + }, + { + "epoch": 0.02146072817867914, + "grad_norm": 3.390625, + "learning_rate": 9.989088673861977e-05, + "loss": 2.3981, + "step": 491 + }, + { + "epoch": 0.021504436382709033, + "grad_norm": 3.234375, + "learning_rate": 9.989043273955513e-05, + "loss": 2.8218, + "step": 492 + }, + { + "epoch": 0.02154814458673893, + "grad_norm": 3.65625, + "learning_rate": 9.988997779898545e-05, + "loss": 2.0585, + "step": 493 + }, + { + "epoch": 0.021591852790768827, + "grad_norm": 3.859375, + "learning_rate": 9.988952191691925e-05, + "loss": 2.5417, + "step": 494 + }, + { + "epoch": 0.021635560994798724, + "grad_norm": 3.390625, + "learning_rate": 9.988906509336518e-05, + "loss": 1.9448, + "step": 495 + }, + { + "epoch": 0.02167926919882862, + "grad_norm": 3.28125, + "learning_rate": 9.988860732833182e-05, + "loss": 2.2873, + "step": 496 + }, + { + "epoch": 0.021722977402858515, + "grad_norm": 3.71875, + "learning_rate": 9.988814862182783e-05, + "loss": 2.8405, + "step": 497 + }, + { + "epoch": 0.021766685606888412, + "grad_norm": 3.359375, + "learning_rate": 9.988768897386188e-05, + "loss": 2.4951, + "step": 498 + }, + { + "epoch": 0.02181039381091831, + "grad_norm": 3.03125, + "learning_rate": 9.988722838444262e-05, + "loss": 2.1855, + "step": 499 + }, + { + "epoch": 0.021854102014948206, + "grad_norm": 3.4375, + "learning_rate": 9.988676685357876e-05, + "loss": 1.9974, + "step": 500 + }, + { + "epoch": 0.021897810218978103, + "grad_norm": 3.21875, + "learning_rate": 9.988630438127901e-05, + "loss": 1.9175, + "step": 501 + }, + { + "epoch": 0.021941518423008, + "grad_norm": 3.453125, + "learning_rate": 9.988584096755208e-05, + "loss": 2.3125, + "step": 502 + }, + { + "epoch": 0.021985226627037894, + "grad_norm": 4.53125, + "learning_rate": 9.988537661240673e-05, + "loss": 2.1997, + "step": 503 + }, + { + "epoch": 0.02202893483106779, + "grad_norm": 3.4375, + "learning_rate": 9.988491131585171e-05, + "loss": 2.3069, + "step": 504 + }, + { + "epoch": 0.022072643035097688, + "grad_norm": 4.0625, + "learning_rate": 9.988444507789582e-05, + "loss": 2.2246, + "step": 505 + }, + { + "epoch": 0.022116351239127585, + "grad_norm": 3.203125, + "learning_rate": 9.988397789854784e-05, + "loss": 2.7865, + "step": 506 + }, + { + "epoch": 0.022160059443157482, + "grad_norm": 3.34375, + "learning_rate": 9.98835097778166e-05, + "loss": 2.2723, + "step": 507 + }, + { + "epoch": 0.022203767647187376, + "grad_norm": 4.3125, + "learning_rate": 9.988304071571093e-05, + "loss": 2.3632, + "step": 508 + }, + { + "epoch": 0.022247475851217273, + "grad_norm": 4.71875, + "learning_rate": 9.988257071223968e-05, + "loss": 2.6076, + "step": 509 + }, + { + "epoch": 0.02229118405524717, + "grad_norm": 3.265625, + "learning_rate": 9.988209976741172e-05, + "loss": 2.2625, + "step": 510 + }, + { + "epoch": 0.022334892259277067, + "grad_norm": 3.34375, + "learning_rate": 9.988162788123594e-05, + "loss": 2.5188, + "step": 511 + }, + { + "epoch": 0.022378600463306964, + "grad_norm": 4.53125, + "learning_rate": 9.988115505372123e-05, + "loss": 2.7811, + "step": 512 + }, + { + "epoch": 0.022422308667336858, + "grad_norm": 3.28125, + "learning_rate": 9.988068128487654e-05, + "loss": 2.3825, + "step": 513 + }, + { + "epoch": 0.022466016871366755, + "grad_norm": 3.84375, + "learning_rate": 9.988020657471077e-05, + "loss": 2.4349, + "step": 514 + }, + { + "epoch": 0.022509725075396652, + "grad_norm": 3.875, + "learning_rate": 9.987973092323293e-05, + "loss": 2.6618, + "step": 515 + }, + { + "epoch": 0.02255343327942655, + "grad_norm": 3.671875, + "learning_rate": 9.987925433045197e-05, + "loss": 1.955, + "step": 516 + }, + { + "epoch": 0.022597141483456446, + "grad_norm": 7.5625, + "learning_rate": 9.987877679637688e-05, + "loss": 2.9896, + "step": 517 + }, + { + "epoch": 0.02264084968748634, + "grad_norm": 3.671875, + "learning_rate": 9.987829832101667e-05, + "loss": 2.0962, + "step": 518 + }, + { + "epoch": 0.022684557891516237, + "grad_norm": 4.25, + "learning_rate": 9.987781890438039e-05, + "loss": 2.0316, + "step": 519 + }, + { + "epoch": 0.022728266095546134, + "grad_norm": 3.359375, + "learning_rate": 9.987733854647707e-05, + "loss": 2.8837, + "step": 520 + }, + { + "epoch": 0.02277197429957603, + "grad_norm": 3.421875, + "learning_rate": 9.987685724731577e-05, + "loss": 2.0682, + "step": 521 + }, + { + "epoch": 0.022815682503605928, + "grad_norm": 4.125, + "learning_rate": 9.987637500690559e-05, + "loss": 2.4832, + "step": 522 + }, + { + "epoch": 0.02285939070763582, + "grad_norm": 4.21875, + "learning_rate": 9.987589182525561e-05, + "loss": 3.3895, + "step": 523 + }, + { + "epoch": 0.02290309891166572, + "grad_norm": 2.921875, + "learning_rate": 9.987540770237498e-05, + "loss": 2.2143, + "step": 524 + }, + { + "epoch": 0.022946807115695616, + "grad_norm": 3.671875, + "learning_rate": 9.98749226382728e-05, + "loss": 2.1937, + "step": 525 + }, + { + "epoch": 0.022990515319725513, + "grad_norm": 4.0625, + "learning_rate": 9.987443663295825e-05, + "loss": 3.2734, + "step": 526 + }, + { + "epoch": 0.02303422352375541, + "grad_norm": 3.640625, + "learning_rate": 9.987394968644049e-05, + "loss": 2.7082, + "step": 527 + }, + { + "epoch": 0.023077931727785304, + "grad_norm": 3.25, + "learning_rate": 9.987346179872869e-05, + "loss": 2.4288, + "step": 528 + }, + { + "epoch": 0.0231216399318152, + "grad_norm": 4.5625, + "learning_rate": 9.987297296983211e-05, + "loss": 2.2455, + "step": 529 + }, + { + "epoch": 0.023165348135845098, + "grad_norm": 3.40625, + "learning_rate": 9.987248319975993e-05, + "loss": 2.1247, + "step": 530 + }, + { + "epoch": 0.023209056339874995, + "grad_norm": 4.90625, + "learning_rate": 9.987199248852141e-05, + "loss": 2.9593, + "step": 531 + }, + { + "epoch": 0.023252764543904892, + "grad_norm": 4.21875, + "learning_rate": 9.987150083612579e-05, + "loss": 2.4991, + "step": 532 + }, + { + "epoch": 0.023296472747934786, + "grad_norm": 3.59375, + "learning_rate": 9.987100824258239e-05, + "loss": 2.68, + "step": 533 + }, + { + "epoch": 0.023340180951964683, + "grad_norm": 3.296875, + "learning_rate": 9.987051470790048e-05, + "loss": 2.7721, + "step": 534 + }, + { + "epoch": 0.02338388915599458, + "grad_norm": 3.5, + "learning_rate": 9.987002023208935e-05, + "loss": 2.1934, + "step": 535 + }, + { + "epoch": 0.023427597360024477, + "grad_norm": 3.5, + "learning_rate": 9.986952481515836e-05, + "loss": 2.1404, + "step": 536 + }, + { + "epoch": 0.023471305564054374, + "grad_norm": 2.953125, + "learning_rate": 9.986902845711687e-05, + "loss": 2.2057, + "step": 537 + }, + { + "epoch": 0.02351501376808427, + "grad_norm": 3.4375, + "learning_rate": 9.986853115797423e-05, + "loss": 2.0918, + "step": 538 + }, + { + "epoch": 0.023558721972114165, + "grad_norm": 3.25, + "learning_rate": 9.986803291773982e-05, + "loss": 2.3491, + "step": 539 + }, + { + "epoch": 0.023602430176144062, + "grad_norm": 3.25, + "learning_rate": 9.986753373642306e-05, + "loss": 3.0529, + "step": 540 + }, + { + "epoch": 0.02364613838017396, + "grad_norm": 2.765625, + "learning_rate": 9.986703361403335e-05, + "loss": 2.1784, + "step": 541 + }, + { + "epoch": 0.023689846584203856, + "grad_norm": 3.265625, + "learning_rate": 9.986653255058014e-05, + "loss": 2.5431, + "step": 542 + }, + { + "epoch": 0.023733554788233753, + "grad_norm": 3.515625, + "learning_rate": 9.986603054607288e-05, + "loss": 2.2682, + "step": 543 + }, + { + "epoch": 0.023777262992263647, + "grad_norm": 3.234375, + "learning_rate": 9.986552760052105e-05, + "loss": 2.5113, + "step": 544 + }, + { + "epoch": 0.023820971196293544, + "grad_norm": 4.28125, + "learning_rate": 9.986502371393413e-05, + "loss": 2.7172, + "step": 545 + }, + { + "epoch": 0.02386467940032344, + "grad_norm": 4.1875, + "learning_rate": 9.986451888632165e-05, + "loss": 2.6705, + "step": 546 + }, + { + "epoch": 0.023908387604353338, + "grad_norm": 3.4375, + "learning_rate": 9.986401311769312e-05, + "loss": 2.2219, + "step": 547 + }, + { + "epoch": 0.023952095808383235, + "grad_norm": 3.875, + "learning_rate": 9.98635064080581e-05, + "loss": 2.3801, + "step": 548 + }, + { + "epoch": 0.02399580401241313, + "grad_norm": 3.46875, + "learning_rate": 9.986299875742613e-05, + "loss": 2.2103, + "step": 549 + }, + { + "epoch": 0.024039512216443026, + "grad_norm": 4.75, + "learning_rate": 9.98624901658068e-05, + "loss": 2.581, + "step": 550 + }, + { + "epoch": 0.024083220420472923, + "grad_norm": 3.109375, + "learning_rate": 9.986198063320971e-05, + "loss": 2.2305, + "step": 551 + }, + { + "epoch": 0.02412692862450282, + "grad_norm": 3.515625, + "learning_rate": 9.986147015964446e-05, + "loss": 2.5085, + "step": 552 + }, + { + "epoch": 0.024170636828532717, + "grad_norm": 3.640625, + "learning_rate": 9.986095874512072e-05, + "loss": 2.3899, + "step": 553 + }, + { + "epoch": 0.02421434503256261, + "grad_norm": 3.21875, + "learning_rate": 9.986044638964811e-05, + "loss": 2.2853, + "step": 554 + }, + { + "epoch": 0.024258053236592508, + "grad_norm": 4.65625, + "learning_rate": 9.985993309323631e-05, + "loss": 2.8576, + "step": 555 + }, + { + "epoch": 0.024301761440622405, + "grad_norm": 4.09375, + "learning_rate": 9.985941885589502e-05, + "loss": 2.8423, + "step": 556 + }, + { + "epoch": 0.024345469644652302, + "grad_norm": 6.40625, + "learning_rate": 9.985890367763391e-05, + "loss": 2.9808, + "step": 557 + }, + { + "epoch": 0.0243891778486822, + "grad_norm": 2.9375, + "learning_rate": 9.985838755846273e-05, + "loss": 2.2293, + "step": 558 + }, + { + "epoch": 0.024432886052712093, + "grad_norm": 4.40625, + "learning_rate": 9.98578704983912e-05, + "loss": 3.302, + "step": 559 + }, + { + "epoch": 0.02447659425674199, + "grad_norm": 13.1875, + "learning_rate": 9.98573524974291e-05, + "loss": 3.9046, + "step": 560 + }, + { + "epoch": 0.024520302460771887, + "grad_norm": 3.0625, + "learning_rate": 9.98568335555862e-05, + "loss": 2.1789, + "step": 561 + }, + { + "epoch": 0.024564010664801784, + "grad_norm": 3.640625, + "learning_rate": 9.985631367287226e-05, + "loss": 2.3703, + "step": 562 + }, + { + "epoch": 0.02460771886883168, + "grad_norm": 4.78125, + "learning_rate": 9.985579284929715e-05, + "loss": 2.5617, + "step": 563 + }, + { + "epoch": 0.024651427072861574, + "grad_norm": 2.8125, + "learning_rate": 9.985527108487065e-05, + "loss": 2.1361, + "step": 564 + }, + { + "epoch": 0.02469513527689147, + "grad_norm": 4.125, + "learning_rate": 9.985474837960263e-05, + "loss": 2.3338, + "step": 565 + }, + { + "epoch": 0.02473884348092137, + "grad_norm": 2.984375, + "learning_rate": 9.985422473350295e-05, + "loss": 2.0618, + "step": 566 + }, + { + "epoch": 0.024782551684951266, + "grad_norm": 3.0625, + "learning_rate": 9.985370014658148e-05, + "loss": 2.133, + "step": 567 + }, + { + "epoch": 0.024826259888981163, + "grad_norm": 3.359375, + "learning_rate": 9.985317461884814e-05, + "loss": 2.9443, + "step": 568 + }, + { + "epoch": 0.02486996809301106, + "grad_norm": 3.25, + "learning_rate": 9.985264815031283e-05, + "loss": 2.6085, + "step": 569 + }, + { + "epoch": 0.024913676297040954, + "grad_norm": 4.625, + "learning_rate": 9.98521207409855e-05, + "loss": 2.6604, + "step": 570 + }, + { + "epoch": 0.02495738450107085, + "grad_norm": 3.078125, + "learning_rate": 9.985159239087609e-05, + "loss": 2.368, + "step": 571 + }, + { + "epoch": 0.025001092705100748, + "grad_norm": 3.609375, + "learning_rate": 9.985106309999458e-05, + "loss": 2.2466, + "step": 572 + }, + { + "epoch": 0.025044800909130645, + "grad_norm": 3.15625, + "learning_rate": 9.985053286835095e-05, + "loss": 2.3146, + "step": 573 + }, + { + "epoch": 0.025088509113160542, + "grad_norm": 2.953125, + "learning_rate": 9.985000169595521e-05, + "loss": 2.3523, + "step": 574 + }, + { + "epoch": 0.025132217317190435, + "grad_norm": 3.328125, + "learning_rate": 9.984946958281739e-05, + "loss": 2.6499, + "step": 575 + }, + { + "epoch": 0.025175925521220333, + "grad_norm": 3.421875, + "learning_rate": 9.984893652894753e-05, + "loss": 1.8698, + "step": 576 + }, + { + "epoch": 0.02521963372525023, + "grad_norm": 3.453125, + "learning_rate": 9.984840253435568e-05, + "loss": 2.1902, + "step": 577 + }, + { + "epoch": 0.025263341929280127, + "grad_norm": 3.6875, + "learning_rate": 9.984786759905191e-05, + "loss": 2.1288, + "step": 578 + }, + { + "epoch": 0.025307050133310024, + "grad_norm": 3.25, + "learning_rate": 9.984733172304634e-05, + "loss": 2.2317, + "step": 579 + }, + { + "epoch": 0.025350758337339917, + "grad_norm": 10.75, + "learning_rate": 9.984679490634907e-05, + "loss": 2.4892, + "step": 580 + }, + { + "epoch": 0.025394466541369815, + "grad_norm": 4.75, + "learning_rate": 9.984625714897024e-05, + "loss": 2.6579, + "step": 581 + }, + { + "epoch": 0.02543817474539971, + "grad_norm": 3.375, + "learning_rate": 9.984571845091999e-05, + "loss": 2.2262, + "step": 582 + }, + { + "epoch": 0.02548188294942961, + "grad_norm": 3.296875, + "learning_rate": 9.984517881220848e-05, + "loss": 2.4475, + "step": 583 + }, + { + "epoch": 0.025525591153459506, + "grad_norm": 3.046875, + "learning_rate": 9.984463823284589e-05, + "loss": 2.2006, + "step": 584 + }, + { + "epoch": 0.0255692993574894, + "grad_norm": 2.984375, + "learning_rate": 9.984409671284243e-05, + "loss": 2.0782, + "step": 585 + }, + { + "epoch": 0.025613007561519296, + "grad_norm": 3.328125, + "learning_rate": 9.984355425220835e-05, + "loss": 2.7302, + "step": 586 + }, + { + "epoch": 0.025656715765549194, + "grad_norm": 3.328125, + "learning_rate": 9.984301085095382e-05, + "loss": 2.5203, + "step": 587 + }, + { + "epoch": 0.02570042396957909, + "grad_norm": 3.234375, + "learning_rate": 9.984246650908915e-05, + "loss": 2.5343, + "step": 588 + }, + { + "epoch": 0.025744132173608988, + "grad_norm": 5.5625, + "learning_rate": 9.98419212266246e-05, + "loss": 2.0457, + "step": 589 + }, + { + "epoch": 0.02578784037763888, + "grad_norm": 3.59375, + "learning_rate": 9.984137500357044e-05, + "loss": 2.2221, + "step": 590 + }, + { + "epoch": 0.02583154858166878, + "grad_norm": 3.71875, + "learning_rate": 9.984082783993703e-05, + "loss": 2.0484, + "step": 591 + }, + { + "epoch": 0.025875256785698676, + "grad_norm": 3.140625, + "learning_rate": 9.984027973573462e-05, + "loss": 2.1899, + "step": 592 + }, + { + "epoch": 0.025918964989728573, + "grad_norm": 4.5625, + "learning_rate": 9.983973069097359e-05, + "loss": 2.5412, + "step": 593 + }, + { + "epoch": 0.02596267319375847, + "grad_norm": 3.8125, + "learning_rate": 9.983918070566433e-05, + "loss": 2.7712, + "step": 594 + }, + { + "epoch": 0.026006381397788363, + "grad_norm": 3.203125, + "learning_rate": 9.983862977981718e-05, + "loss": 2.521, + "step": 595 + }, + { + "epoch": 0.02605008960181826, + "grad_norm": 3.5, + "learning_rate": 9.983807791344255e-05, + "loss": 2.3225, + "step": 596 + }, + { + "epoch": 0.026093797805848157, + "grad_norm": 3.90625, + "learning_rate": 9.983752510655084e-05, + "loss": 2.5047, + "step": 597 + }, + { + "epoch": 0.026137506009878055, + "grad_norm": 3.390625, + "learning_rate": 9.983697135915252e-05, + "loss": 2.0369, + "step": 598 + }, + { + "epoch": 0.02618121421390795, + "grad_norm": 2.859375, + "learning_rate": 9.9836416671258e-05, + "loss": 2.3224, + "step": 599 + }, + { + "epoch": 0.026224922417937845, + "grad_norm": 3.203125, + "learning_rate": 9.983586104287778e-05, + "loss": 1.9921, + "step": 600 + }, + { + "epoch": 0.026268630621967742, + "grad_norm": 3.484375, + "learning_rate": 9.983530447402231e-05, + "loss": 1.903, + "step": 601 + }, + { + "epoch": 0.02631233882599764, + "grad_norm": 3.265625, + "learning_rate": 9.983474696470212e-05, + "loss": 2.197, + "step": 602 + }, + { + "epoch": 0.026356047030027537, + "grad_norm": 5.21875, + "learning_rate": 9.983418851492773e-05, + "loss": 2.5317, + "step": 603 + }, + { + "epoch": 0.026399755234057434, + "grad_norm": 4.375, + "learning_rate": 9.983362912470966e-05, + "loss": 2.7071, + "step": 604 + }, + { + "epoch": 0.02644346343808733, + "grad_norm": 6.6875, + "learning_rate": 9.98330687940585e-05, + "loss": 2.3881, + "step": 605 + }, + { + "epoch": 0.026487171642117224, + "grad_norm": 4.21875, + "learning_rate": 9.983250752298478e-05, + "loss": 2.201, + "step": 606 + }, + { + "epoch": 0.02653087984614712, + "grad_norm": 4.3125, + "learning_rate": 9.983194531149914e-05, + "loss": 2.5151, + "step": 607 + }, + { + "epoch": 0.02657458805017702, + "grad_norm": 3.9375, + "learning_rate": 9.983138215961214e-05, + "loss": 2.4005, + "step": 608 + }, + { + "epoch": 0.026618296254206916, + "grad_norm": 3.359375, + "learning_rate": 9.983081806733444e-05, + "loss": 2.1401, + "step": 609 + }, + { + "epoch": 0.026662004458236813, + "grad_norm": 3.25, + "learning_rate": 9.983025303467668e-05, + "loss": 2.2363, + "step": 610 + }, + { + "epoch": 0.026705712662266706, + "grad_norm": 2.84375, + "learning_rate": 9.982968706164953e-05, + "loss": 1.9678, + "step": 611 + }, + { + "epoch": 0.026749420866296603, + "grad_norm": 3.90625, + "learning_rate": 9.982912014826365e-05, + "loss": 2.2302, + "step": 612 + }, + { + "epoch": 0.0267931290703265, + "grad_norm": 3.796875, + "learning_rate": 9.982855229452975e-05, + "loss": 2.2146, + "step": 613 + }, + { + "epoch": 0.026836837274356398, + "grad_norm": 3.265625, + "learning_rate": 9.982798350045854e-05, + "loss": 2.1537, + "step": 614 + }, + { + "epoch": 0.026880545478386295, + "grad_norm": 4.0625, + "learning_rate": 9.982741376606078e-05, + "loss": 2.3415, + "step": 615 + }, + { + "epoch": 0.026924253682416188, + "grad_norm": 3.1875, + "learning_rate": 9.982684309134719e-05, + "loss": 1.9102, + "step": 616 + }, + { + "epoch": 0.026967961886446085, + "grad_norm": 3.671875, + "learning_rate": 9.982627147632855e-05, + "loss": 2.2455, + "step": 617 + }, + { + "epoch": 0.027011670090475982, + "grad_norm": 3.265625, + "learning_rate": 9.982569892101565e-05, + "loss": 2.1046, + "step": 618 + }, + { + "epoch": 0.02705537829450588, + "grad_norm": 4.40625, + "learning_rate": 9.982512542541929e-05, + "loss": 2.8151, + "step": 619 + }, + { + "epoch": 0.027099086498535777, + "grad_norm": 3.34375, + "learning_rate": 9.98245509895503e-05, + "loss": 2.7252, + "step": 620 + }, + { + "epoch": 0.02714279470256567, + "grad_norm": 2.875, + "learning_rate": 9.982397561341952e-05, + "loss": 1.9278, + "step": 621 + }, + { + "epoch": 0.027186502906595567, + "grad_norm": 2.921875, + "learning_rate": 9.982339929703781e-05, + "loss": 2.1459, + "step": 622 + }, + { + "epoch": 0.027230211110625464, + "grad_norm": 3.46875, + "learning_rate": 9.982282204041604e-05, + "loss": 2.5963, + "step": 623 + }, + { + "epoch": 0.02727391931465536, + "grad_norm": 7.90625, + "learning_rate": 9.982224384356508e-05, + "loss": 2.058, + "step": 624 + }, + { + "epoch": 0.02731762751868526, + "grad_norm": 2.859375, + "learning_rate": 9.98216647064959e-05, + "loss": 2.4754, + "step": 625 + }, + { + "epoch": 0.027361335722715152, + "grad_norm": 3.640625, + "learning_rate": 9.982108462921937e-05, + "loss": 2.964, + "step": 626 + }, + { + "epoch": 0.02740504392674505, + "grad_norm": 3.4375, + "learning_rate": 9.982050361174647e-05, + "loss": 2.1477, + "step": 627 + }, + { + "epoch": 0.027448752130774946, + "grad_norm": 2.75, + "learning_rate": 9.981992165408816e-05, + "loss": 2.2316, + "step": 628 + }, + { + "epoch": 0.027492460334804843, + "grad_norm": 2.984375, + "learning_rate": 9.981933875625542e-05, + "loss": 2.4692, + "step": 629 + }, + { + "epoch": 0.02753616853883474, + "grad_norm": 3.328125, + "learning_rate": 9.981875491825924e-05, + "loss": 2.175, + "step": 630 + }, + { + "epoch": 0.027579876742864634, + "grad_norm": 5.15625, + "learning_rate": 9.981817014011066e-05, + "loss": 2.2929, + "step": 631 + }, + { + "epoch": 0.02762358494689453, + "grad_norm": 3.59375, + "learning_rate": 9.981758442182068e-05, + "loss": 2.5712, + "step": 632 + }, + { + "epoch": 0.027667293150924428, + "grad_norm": 3.265625, + "learning_rate": 9.981699776340039e-05, + "loss": 1.9267, + "step": 633 + }, + { + "epoch": 0.027711001354954325, + "grad_norm": 3.390625, + "learning_rate": 9.981641016486085e-05, + "loss": 2.7868, + "step": 634 + }, + { + "epoch": 0.027754709558984222, + "grad_norm": 3.84375, + "learning_rate": 9.981582162621314e-05, + "loss": 2.5112, + "step": 635 + }, + { + "epoch": 0.027798417763014116, + "grad_norm": 3.46875, + "learning_rate": 9.981523214746837e-05, + "loss": 2.2265, + "step": 636 + }, + { + "epoch": 0.027842125967044013, + "grad_norm": 4.125, + "learning_rate": 9.981464172863768e-05, + "loss": 2.7183, + "step": 637 + }, + { + "epoch": 0.02788583417107391, + "grad_norm": 5.0, + "learning_rate": 9.981405036973219e-05, + "loss": 2.5362, + "step": 638 + }, + { + "epoch": 0.027929542375103807, + "grad_norm": 3.8125, + "learning_rate": 9.981345807076307e-05, + "loss": 3.1845, + "step": 639 + }, + { + "epoch": 0.027973250579133704, + "grad_norm": 3.25, + "learning_rate": 9.98128648317415e-05, + "loss": 2.0682, + "step": 640 + }, + { + "epoch": 0.0280169587831636, + "grad_norm": 3.265625, + "learning_rate": 9.981227065267867e-05, + "loss": 2.0779, + "step": 641 + }, + { + "epoch": 0.028060666987193495, + "grad_norm": 3.1875, + "learning_rate": 9.981167553358579e-05, + "loss": 2.5607, + "step": 642 + }, + { + "epoch": 0.028104375191223392, + "grad_norm": 3.21875, + "learning_rate": 9.981107947447409e-05, + "loss": 2.2148, + "step": 643 + }, + { + "epoch": 0.02814808339525329, + "grad_norm": 3.265625, + "learning_rate": 9.981048247535483e-05, + "loss": 2.6086, + "step": 644 + }, + { + "epoch": 0.028191791599283186, + "grad_norm": 3.453125, + "learning_rate": 9.980988453623928e-05, + "loss": 2.6902, + "step": 645 + }, + { + "epoch": 0.028235499803313083, + "grad_norm": 4.9375, + "learning_rate": 9.98092856571387e-05, + "loss": 2.5818, + "step": 646 + }, + { + "epoch": 0.028279208007342977, + "grad_norm": 3.5, + "learning_rate": 9.98086858380644e-05, + "loss": 2.1797, + "step": 647 + }, + { + "epoch": 0.028322916211372874, + "grad_norm": 3.578125, + "learning_rate": 9.980808507902773e-05, + "loss": 2.1088, + "step": 648 + }, + { + "epoch": 0.02836662441540277, + "grad_norm": 3.296875, + "learning_rate": 9.980748338003998e-05, + "loss": 2.5253, + "step": 649 + }, + { + "epoch": 0.02841033261943267, + "grad_norm": 4.75, + "learning_rate": 9.980688074111253e-05, + "loss": 2.7148, + "step": 650 + }, + { + "epoch": 0.028454040823462565, + "grad_norm": 3.21875, + "learning_rate": 9.980627716225675e-05, + "loss": 2.6032, + "step": 651 + }, + { + "epoch": 0.02849774902749246, + "grad_norm": 15.375, + "learning_rate": 9.980567264348404e-05, + "loss": 2.927, + "step": 652 + }, + { + "epoch": 0.028541457231522356, + "grad_norm": 3.421875, + "learning_rate": 9.98050671848058e-05, + "loss": 2.7096, + "step": 653 + }, + { + "epoch": 0.028585165435552253, + "grad_norm": 6.0625, + "learning_rate": 9.980446078623345e-05, + "loss": 1.9591, + "step": 654 + }, + { + "epoch": 0.02862887363958215, + "grad_norm": 3.53125, + "learning_rate": 9.980385344777842e-05, + "loss": 2.5283, + "step": 655 + }, + { + "epoch": 0.028672581843612047, + "grad_norm": 3.125, + "learning_rate": 9.980324516945221e-05, + "loss": 1.8712, + "step": 656 + }, + { + "epoch": 0.02871629004764194, + "grad_norm": 2.765625, + "learning_rate": 9.980263595126629e-05, + "loss": 2.1822, + "step": 657 + }, + { + "epoch": 0.028759998251671838, + "grad_norm": 4.03125, + "learning_rate": 9.980202579323212e-05, + "loss": 2.3666, + "step": 658 + }, + { + "epoch": 0.028803706455701735, + "grad_norm": 3.171875, + "learning_rate": 9.980141469536125e-05, + "loss": 2.4666, + "step": 659 + }, + { + "epoch": 0.028847414659731632, + "grad_norm": 3.46875, + "learning_rate": 9.98008026576652e-05, + "loss": 2.176, + "step": 660 + }, + { + "epoch": 0.02889112286376153, + "grad_norm": 3.25, + "learning_rate": 9.980018968015552e-05, + "loss": 1.8907, + "step": 661 + }, + { + "epoch": 0.028934831067791423, + "grad_norm": 3.890625, + "learning_rate": 9.979957576284379e-05, + "loss": 1.7983, + "step": 662 + }, + { + "epoch": 0.02897853927182132, + "grad_norm": 3.6875, + "learning_rate": 9.979896090574157e-05, + "loss": 3.0609, + "step": 663 + }, + { + "epoch": 0.029022247475851217, + "grad_norm": 3.515625, + "learning_rate": 9.97983451088605e-05, + "loss": 2.1672, + "step": 664 + }, + { + "epoch": 0.029065955679881114, + "grad_norm": 10.0625, + "learning_rate": 9.979772837221216e-05, + "loss": 2.3753, + "step": 665 + }, + { + "epoch": 0.02910966388391101, + "grad_norm": 2.875, + "learning_rate": 9.979711069580821e-05, + "loss": 2.6742, + "step": 666 + }, + { + "epoch": 0.029153372087940905, + "grad_norm": 3.890625, + "learning_rate": 9.979649207966031e-05, + "loss": 2.415, + "step": 667 + }, + { + "epoch": 0.029197080291970802, + "grad_norm": 5.78125, + "learning_rate": 9.979587252378013e-05, + "loss": 2.6886, + "step": 668 + }, + { + "epoch": 0.0292407884960007, + "grad_norm": 3.953125, + "learning_rate": 9.979525202817936e-05, + "loss": 2.5185, + "step": 669 + }, + { + "epoch": 0.029284496700030596, + "grad_norm": 3.703125, + "learning_rate": 9.979463059286972e-05, + "loss": 2.3531, + "step": 670 + }, + { + "epoch": 0.029328204904060493, + "grad_norm": 3.3125, + "learning_rate": 9.979400821786291e-05, + "loss": 2.2063, + "step": 671 + }, + { + "epoch": 0.02937191310809039, + "grad_norm": 3.46875, + "learning_rate": 9.979338490317072e-05, + "loss": 2.5651, + "step": 672 + }, + { + "epoch": 0.029415621312120284, + "grad_norm": 2.75, + "learning_rate": 9.979276064880486e-05, + "loss": 2.0795, + "step": 673 + }, + { + "epoch": 0.02945932951615018, + "grad_norm": 3.03125, + "learning_rate": 9.979213545477715e-05, + "loss": 2.2151, + "step": 674 + }, + { + "epoch": 0.029503037720180078, + "grad_norm": 3.109375, + "learning_rate": 9.979150932109937e-05, + "loss": 2.5187, + "step": 675 + }, + { + "epoch": 0.029546745924209975, + "grad_norm": 2.875, + "learning_rate": 9.979088224778335e-05, + "loss": 2.2578, + "step": 676 + }, + { + "epoch": 0.029590454128239872, + "grad_norm": 3.421875, + "learning_rate": 9.97902542348409e-05, + "loss": 2.3404, + "step": 677 + }, + { + "epoch": 0.029634162332269766, + "grad_norm": 3.140625, + "learning_rate": 9.978962528228388e-05, + "loss": 2.2639, + "step": 678 + }, + { + "epoch": 0.029677870536299663, + "grad_norm": 3.296875, + "learning_rate": 9.978899539012418e-05, + "loss": 1.9094, + "step": 679 + }, + { + "epoch": 0.02972157874032956, + "grad_norm": 3.09375, + "learning_rate": 9.978836455837368e-05, + "loss": 2.413, + "step": 680 + }, + { + "epoch": 0.029765286944359457, + "grad_norm": 4.5625, + "learning_rate": 9.978773278704426e-05, + "loss": 2.5859, + "step": 681 + }, + { + "epoch": 0.029808995148389354, + "grad_norm": 3.203125, + "learning_rate": 9.978710007614786e-05, + "loss": 2.1896, + "step": 682 + }, + { + "epoch": 0.029852703352419248, + "grad_norm": 3.59375, + "learning_rate": 9.978646642569644e-05, + "loss": 2.4116, + "step": 683 + }, + { + "epoch": 0.029896411556449145, + "grad_norm": 3.453125, + "learning_rate": 9.978583183570193e-05, + "loss": 2.1242, + "step": 684 + }, + { + "epoch": 0.029940119760479042, + "grad_norm": 2.8125, + "learning_rate": 9.97851963061763e-05, + "loss": 2.1152, + "step": 685 + }, + { + "epoch": 0.02998382796450894, + "grad_norm": 3.046875, + "learning_rate": 9.978455983713157e-05, + "loss": 2.6089, + "step": 686 + }, + { + "epoch": 0.030027536168538836, + "grad_norm": 3.40625, + "learning_rate": 9.978392242857973e-05, + "loss": 2.4736, + "step": 687 + }, + { + "epoch": 0.03007124437256873, + "grad_norm": 3.546875, + "learning_rate": 9.978328408053282e-05, + "loss": 2.5537, + "step": 688 + }, + { + "epoch": 0.030114952576598627, + "grad_norm": 2.609375, + "learning_rate": 9.978264479300289e-05, + "loss": 1.7285, + "step": 689 + }, + { + "epoch": 0.030158660780628524, + "grad_norm": 4.84375, + "learning_rate": 9.978200456600198e-05, + "loss": 2.3336, + "step": 690 + }, + { + "epoch": 0.03020236898465842, + "grad_norm": 2.859375, + "learning_rate": 9.97813633995422e-05, + "loss": 1.8666, + "step": 691 + }, + { + "epoch": 0.030246077188688318, + "grad_norm": 3.4375, + "learning_rate": 9.978072129363564e-05, + "loss": 2.2779, + "step": 692 + }, + { + "epoch": 0.030289785392718212, + "grad_norm": 3.21875, + "learning_rate": 9.978007824829442e-05, + "loss": 2.6795, + "step": 693 + }, + { + "epoch": 0.03033349359674811, + "grad_norm": 3.3125, + "learning_rate": 9.977943426353067e-05, + "loss": 2.7468, + "step": 694 + }, + { + "epoch": 0.030377201800778006, + "grad_norm": 4.96875, + "learning_rate": 9.977878933935655e-05, + "loss": 2.453, + "step": 695 + }, + { + "epoch": 0.030420910004807903, + "grad_norm": 3.546875, + "learning_rate": 9.977814347578421e-05, + "loss": 2.216, + "step": 696 + }, + { + "epoch": 0.0304646182088378, + "grad_norm": 3.09375, + "learning_rate": 9.977749667282588e-05, + "loss": 2.5925, + "step": 697 + }, + { + "epoch": 0.030508326412867694, + "grad_norm": 3.21875, + "learning_rate": 9.977684893049371e-05, + "loss": 2.7342, + "step": 698 + }, + { + "epoch": 0.03055203461689759, + "grad_norm": 3.3125, + "learning_rate": 9.977620024879997e-05, + "loss": 2.2558, + "step": 699 + }, + { + "epoch": 0.030595742820927488, + "grad_norm": 3.53125, + "learning_rate": 9.977555062775688e-05, + "loss": 2.322, + "step": 700 + }, + { + "epoch": 0.030639451024957385, + "grad_norm": 3.1875, + "learning_rate": 9.97749000673767e-05, + "loss": 2.3314, + "step": 701 + }, + { + "epoch": 0.030683159228987282, + "grad_norm": 3.390625, + "learning_rate": 9.977424856767172e-05, + "loss": 2.0297, + "step": 702 + }, + { + "epoch": 0.030726867433017176, + "grad_norm": 3.203125, + "learning_rate": 9.977359612865423e-05, + "loss": 2.2477, + "step": 703 + }, + { + "epoch": 0.030770575637047073, + "grad_norm": 3.4375, + "learning_rate": 9.977294275033654e-05, + "loss": 2.28, + "step": 704 + }, + { + "epoch": 0.03081428384107697, + "grad_norm": 11.4375, + "learning_rate": 9.977228843273098e-05, + "loss": 6.5621, + "step": 705 + }, + { + "epoch": 0.030857992045106867, + "grad_norm": 4.125, + "learning_rate": 9.977163317584988e-05, + "loss": 2.6921, + "step": 706 + }, + { + "epoch": 0.030901700249136764, + "grad_norm": 6.71875, + "learning_rate": 9.977097697970563e-05, + "loss": 2.1848, + "step": 707 + }, + { + "epoch": 0.03094540845316666, + "grad_norm": 3.34375, + "learning_rate": 9.977031984431063e-05, + "loss": 2.3677, + "step": 708 + }, + { + "epoch": 0.030989116657196555, + "grad_norm": 4.0, + "learning_rate": 9.976966176967722e-05, + "loss": 2.5439, + "step": 709 + }, + { + "epoch": 0.031032824861226452, + "grad_norm": 2.9375, + "learning_rate": 9.976900275581789e-05, + "loss": 2.305, + "step": 710 + }, + { + "epoch": 0.03107653306525635, + "grad_norm": 3.375, + "learning_rate": 9.976834280274502e-05, + "loss": 2.5969, + "step": 711 + }, + { + "epoch": 0.031120241269286246, + "grad_norm": 4.75, + "learning_rate": 9.976768191047109e-05, + "loss": 2.6965, + "step": 712 + }, + { + "epoch": 0.031163949473316143, + "grad_norm": 3.046875, + "learning_rate": 9.976702007900857e-05, + "loss": 1.9579, + "step": 713 + }, + { + "epoch": 0.031207657677346037, + "grad_norm": 3.8125, + "learning_rate": 9.976635730836995e-05, + "loss": 1.9571, + "step": 714 + }, + { + "epoch": 0.031251365881375934, + "grad_norm": 3.234375, + "learning_rate": 9.976569359856773e-05, + "loss": 2.0944, + "step": 715 + }, + { + "epoch": 0.03129507408540583, + "grad_norm": 3.140625, + "learning_rate": 9.976502894961445e-05, + "loss": 2.0834, + "step": 716 + }, + { + "epoch": 0.03133878228943573, + "grad_norm": 3.890625, + "learning_rate": 9.976436336152265e-05, + "loss": 2.4847, + "step": 717 + }, + { + "epoch": 0.03138249049346562, + "grad_norm": 2.953125, + "learning_rate": 9.976369683430487e-05, + "loss": 2.2834, + "step": 718 + }, + { + "epoch": 0.03142619869749552, + "grad_norm": 3.4375, + "learning_rate": 9.976302936797371e-05, + "loss": 2.1861, + "step": 719 + }, + { + "epoch": 0.031469906901525416, + "grad_norm": 4.71875, + "learning_rate": 9.976236096254177e-05, + "loss": 3.4581, + "step": 720 + }, + { + "epoch": 0.03151361510555531, + "grad_norm": 5.09375, + "learning_rate": 9.976169161802164e-05, + "loss": 2.4288, + "step": 721 + }, + { + "epoch": 0.03155732330958521, + "grad_norm": 3.140625, + "learning_rate": 9.976102133442596e-05, + "loss": 2.3548, + "step": 722 + }, + { + "epoch": 0.031601031513615103, + "grad_norm": 3.390625, + "learning_rate": 9.976035011176738e-05, + "loss": 2.1473, + "step": 723 + }, + { + "epoch": 0.031644739717645004, + "grad_norm": 4.71875, + "learning_rate": 9.975967795005859e-05, + "loss": 2.6153, + "step": 724 + }, + { + "epoch": 0.0316884479216749, + "grad_norm": 3.6875, + "learning_rate": 9.975900484931225e-05, + "loss": 2.9447, + "step": 725 + }, + { + "epoch": 0.0317321561257048, + "grad_norm": 2.984375, + "learning_rate": 9.975833080954107e-05, + "loss": 2.2077, + "step": 726 + }, + { + "epoch": 0.03177586432973469, + "grad_norm": 2.921875, + "learning_rate": 9.975765583075776e-05, + "loss": 2.2345, + "step": 727 + }, + { + "epoch": 0.031819572533764585, + "grad_norm": 3.15625, + "learning_rate": 9.975697991297506e-05, + "loss": 2.1691, + "step": 728 + }, + { + "epoch": 0.031863280737794486, + "grad_norm": 3.25, + "learning_rate": 9.975630305620574e-05, + "loss": 2.5123, + "step": 729 + }, + { + "epoch": 0.03190698894182438, + "grad_norm": 3.234375, + "learning_rate": 9.975562526046256e-05, + "loss": 2.519, + "step": 730 + }, + { + "epoch": 0.03195069714585428, + "grad_norm": 2.796875, + "learning_rate": 9.975494652575832e-05, + "loss": 2.0585, + "step": 731 + }, + { + "epoch": 0.031994405349884174, + "grad_norm": 2.65625, + "learning_rate": 9.975426685210582e-05, + "loss": 2.1076, + "step": 732 + }, + { + "epoch": 0.03203811355391407, + "grad_norm": 3.4375, + "learning_rate": 9.975358623951789e-05, + "loss": 2.4889, + "step": 733 + }, + { + "epoch": 0.03208182175794397, + "grad_norm": 3.265625, + "learning_rate": 9.975290468800739e-05, + "loss": 2.576, + "step": 734 + }, + { + "epoch": 0.03212552996197386, + "grad_norm": 2.828125, + "learning_rate": 9.975222219758716e-05, + "loss": 2.4601, + "step": 735 + }, + { + "epoch": 0.03216923816600376, + "grad_norm": 3.15625, + "learning_rate": 9.975153876827008e-05, + "loss": 2.4984, + "step": 736 + }, + { + "epoch": 0.032212946370033656, + "grad_norm": 3.328125, + "learning_rate": 9.975085440006904e-05, + "loss": 2.5157, + "step": 737 + }, + { + "epoch": 0.03225665457406355, + "grad_norm": 4.28125, + "learning_rate": 9.9750169092997e-05, + "loss": 1.9031, + "step": 738 + }, + { + "epoch": 0.03230036277809345, + "grad_norm": 3.09375, + "learning_rate": 9.974948284706685e-05, + "loss": 2.3383, + "step": 739 + }, + { + "epoch": 0.032344070982123344, + "grad_norm": 2.8125, + "learning_rate": 9.974879566229154e-05, + "loss": 2.1407, + "step": 740 + }, + { + "epoch": 0.032387779186153244, + "grad_norm": 3.9375, + "learning_rate": 9.974810753868405e-05, + "loss": 2.9169, + "step": 741 + }, + { + "epoch": 0.03243148739018314, + "grad_norm": 3.1875, + "learning_rate": 9.974741847625737e-05, + "loss": 2.4408, + "step": 742 + }, + { + "epoch": 0.03247519559421303, + "grad_norm": 4.0, + "learning_rate": 9.974672847502451e-05, + "loss": 2.501, + "step": 743 + }, + { + "epoch": 0.03251890379824293, + "grad_norm": 4.65625, + "learning_rate": 9.974603753499847e-05, + "loss": 2.9058, + "step": 744 + }, + { + "epoch": 0.032562612002272825, + "grad_norm": 3.203125, + "learning_rate": 9.97453456561923e-05, + "loss": 2.0979, + "step": 745 + }, + { + "epoch": 0.032606320206302726, + "grad_norm": 3.0625, + "learning_rate": 9.974465283861905e-05, + "loss": 2.2657, + "step": 746 + }, + { + "epoch": 0.03265002841033262, + "grad_norm": 3.125, + "learning_rate": 9.974395908229181e-05, + "loss": 2.202, + "step": 747 + }, + { + "epoch": 0.03269373661436251, + "grad_norm": 3.28125, + "learning_rate": 9.974326438722367e-05, + "loss": 2.4637, + "step": 748 + }, + { + "epoch": 0.032737444818392414, + "grad_norm": 3.109375, + "learning_rate": 9.974256875342772e-05, + "loss": 2.0341, + "step": 749 + }, + { + "epoch": 0.03278115302242231, + "grad_norm": 3.453125, + "learning_rate": 9.974187218091711e-05, + "loss": 2.1654, + "step": 750 + }, + { + "epoch": 0.03282486122645221, + "grad_norm": 5.125, + "learning_rate": 9.974117466970496e-05, + "loss": 3.255, + "step": 751 + }, + { + "epoch": 0.0328685694304821, + "grad_norm": 3.4375, + "learning_rate": 9.974047621980447e-05, + "loss": 2.4065, + "step": 752 + }, + { + "epoch": 0.032912277634511995, + "grad_norm": 3.4375, + "learning_rate": 9.97397768312288e-05, + "loss": 2.6438, + "step": 753 + }, + { + "epoch": 0.032955985838541896, + "grad_norm": 3.53125, + "learning_rate": 9.973907650399113e-05, + "loss": 2.4801, + "step": 754 + }, + { + "epoch": 0.03299969404257179, + "grad_norm": 2.96875, + "learning_rate": 9.973837523810471e-05, + "loss": 2.4131, + "step": 755 + }, + { + "epoch": 0.03304340224660169, + "grad_norm": 4.1875, + "learning_rate": 9.973767303358275e-05, + "loss": 2.4873, + "step": 756 + }, + { + "epoch": 0.033087110450631584, + "grad_norm": 3.125, + "learning_rate": 9.973696989043852e-05, + "loss": 2.5531, + "step": 757 + }, + { + "epoch": 0.03313081865466148, + "grad_norm": 3.5, + "learning_rate": 9.973626580868527e-05, + "loss": 2.2931, + "step": 758 + }, + { + "epoch": 0.03317452685869138, + "grad_norm": 4.65625, + "learning_rate": 9.97355607883363e-05, + "loss": 1.8566, + "step": 759 + }, + { + "epoch": 0.03321823506272127, + "grad_norm": 3.71875, + "learning_rate": 9.97348548294049e-05, + "loss": 3.2503, + "step": 760 + }, + { + "epoch": 0.03326194326675117, + "grad_norm": 4.21875, + "learning_rate": 9.973414793190441e-05, + "loss": 2.9089, + "step": 761 + }, + { + "epoch": 0.033305651470781066, + "grad_norm": 3.5, + "learning_rate": 9.973344009584818e-05, + "loss": 2.7956, + "step": 762 + }, + { + "epoch": 0.03334935967481096, + "grad_norm": 4.0625, + "learning_rate": 9.973273132124954e-05, + "loss": 1.8721, + "step": 763 + }, + { + "epoch": 0.03339306787884086, + "grad_norm": 3.515625, + "learning_rate": 9.973202160812187e-05, + "loss": 2.4582, + "step": 764 + }, + { + "epoch": 0.03343677608287075, + "grad_norm": 3.109375, + "learning_rate": 9.973131095647858e-05, + "loss": 1.904, + "step": 765 + }, + { + "epoch": 0.033480484286900654, + "grad_norm": 3.296875, + "learning_rate": 9.973059936633306e-05, + "loss": 2.0774, + "step": 766 + }, + { + "epoch": 0.03352419249093055, + "grad_norm": 3.015625, + "learning_rate": 9.972988683769877e-05, + "loss": 2.5509, + "step": 767 + }, + { + "epoch": 0.03356790069496044, + "grad_norm": 4.15625, + "learning_rate": 9.972917337058913e-05, + "loss": 2.0877, + "step": 768 + }, + { + "epoch": 0.03361160889899034, + "grad_norm": 4.3125, + "learning_rate": 9.972845896501761e-05, + "loss": 2.6669, + "step": 769 + }, + { + "epoch": 0.033655317103020235, + "grad_norm": 3.15625, + "learning_rate": 9.972774362099768e-05, + "loss": 2.1274, + "step": 770 + }, + { + "epoch": 0.033699025307050136, + "grad_norm": 6.4375, + "learning_rate": 9.972702733854286e-05, + "loss": 2.6224, + "step": 771 + }, + { + "epoch": 0.03374273351108003, + "grad_norm": 3.296875, + "learning_rate": 9.972631011766668e-05, + "loss": 2.4328, + "step": 772 + }, + { + "epoch": 0.03378644171510992, + "grad_norm": 2.703125, + "learning_rate": 9.972559195838263e-05, + "loss": 2.3105, + "step": 773 + }, + { + "epoch": 0.033830149919139824, + "grad_norm": 3.1875, + "learning_rate": 9.97248728607043e-05, + "loss": 1.9875, + "step": 774 + }, + { + "epoch": 0.03387385812316972, + "grad_norm": 6.21875, + "learning_rate": 9.972415282464524e-05, + "loss": 3.3821, + "step": 775 + }, + { + "epoch": 0.03391756632719962, + "grad_norm": 3.234375, + "learning_rate": 9.972343185021906e-05, + "loss": 2.2215, + "step": 776 + }, + { + "epoch": 0.03396127453122951, + "grad_norm": 2.71875, + "learning_rate": 9.972270993743934e-05, + "loss": 1.8742, + "step": 777 + }, + { + "epoch": 0.034004982735259405, + "grad_norm": 2.828125, + "learning_rate": 9.972198708631972e-05, + "loss": 2.1591, + "step": 778 + }, + { + "epoch": 0.034048690939289306, + "grad_norm": 3.984375, + "learning_rate": 9.972126329687384e-05, + "loss": 3.2081, + "step": 779 + }, + { + "epoch": 0.0340923991433192, + "grad_norm": 4.53125, + "learning_rate": 9.972053856911534e-05, + "loss": 1.9423, + "step": 780 + }, + { + "epoch": 0.0341361073473491, + "grad_norm": 9.4375, + "learning_rate": 9.971981290305792e-05, + "loss": 1.6246, + "step": 781 + }, + { + "epoch": 0.03417981555137899, + "grad_norm": 3.453125, + "learning_rate": 9.971908629871527e-05, + "loss": 2.1293, + "step": 782 + }, + { + "epoch": 0.03422352375540889, + "grad_norm": 3.171875, + "learning_rate": 9.971835875610109e-05, + "loss": 2.006, + "step": 783 + }, + { + "epoch": 0.03426723195943879, + "grad_norm": 2.859375, + "learning_rate": 9.971763027522914e-05, + "loss": 2.1156, + "step": 784 + }, + { + "epoch": 0.03431094016346868, + "grad_norm": 4.375, + "learning_rate": 9.971690085611314e-05, + "loss": 3.3214, + "step": 785 + }, + { + "epoch": 0.03435464836749858, + "grad_norm": 3.453125, + "learning_rate": 9.971617049876684e-05, + "loss": 2.8245, + "step": 786 + }, + { + "epoch": 0.034398356571528475, + "grad_norm": 3.125, + "learning_rate": 9.971543920320407e-05, + "loss": 2.7465, + "step": 787 + }, + { + "epoch": 0.03444206477555837, + "grad_norm": 3.046875, + "learning_rate": 9.971470696943859e-05, + "loss": 2.3266, + "step": 788 + }, + { + "epoch": 0.03448577297958827, + "grad_norm": 3.203125, + "learning_rate": 9.971397379748424e-05, + "loss": 1.9464, + "step": 789 + }, + { + "epoch": 0.03452948118361816, + "grad_norm": 3.09375, + "learning_rate": 9.971323968735484e-05, + "loss": 2.7519, + "step": 790 + }, + { + "epoch": 0.034573189387648064, + "grad_norm": 4.4375, + "learning_rate": 9.971250463906426e-05, + "loss": 2.2776, + "step": 791 + }, + { + "epoch": 0.03461689759167796, + "grad_norm": 3.734375, + "learning_rate": 9.971176865262635e-05, + "loss": 1.9055, + "step": 792 + }, + { + "epoch": 0.03466060579570786, + "grad_norm": 4.65625, + "learning_rate": 9.971103172805503e-05, + "loss": 2.7596, + "step": 793 + }, + { + "epoch": 0.03470431399973775, + "grad_norm": 3.234375, + "learning_rate": 9.971029386536419e-05, + "loss": 2.2475, + "step": 794 + }, + { + "epoch": 0.034748022203767645, + "grad_norm": 3.265625, + "learning_rate": 9.970955506456776e-05, + "loss": 2.4918, + "step": 795 + }, + { + "epoch": 0.034791730407797546, + "grad_norm": 5.15625, + "learning_rate": 9.970881532567967e-05, + "loss": 3.1343, + "step": 796 + }, + { + "epoch": 0.03483543861182744, + "grad_norm": 2.875, + "learning_rate": 9.970807464871387e-05, + "loss": 1.9314, + "step": 797 + }, + { + "epoch": 0.03487914681585734, + "grad_norm": 3.734375, + "learning_rate": 9.970733303368438e-05, + "loss": 2.3322, + "step": 798 + }, + { + "epoch": 0.03492285501988723, + "grad_norm": 3.734375, + "learning_rate": 9.970659048060515e-05, + "loss": 2.1469, + "step": 799 + }, + { + "epoch": 0.03496656322391713, + "grad_norm": 3.109375, + "learning_rate": 9.970584698949023e-05, + "loss": 2.778, + "step": 800 + }, + { + "epoch": 0.03501027142794703, + "grad_norm": 5.0625, + "learning_rate": 9.970510256035364e-05, + "loss": 2.5041, + "step": 801 + }, + { + "epoch": 0.03505397963197692, + "grad_norm": 3.203125, + "learning_rate": 9.970435719320941e-05, + "loss": 2.6473, + "step": 802 + }, + { + "epoch": 0.03509768783600682, + "grad_norm": 5.0, + "learning_rate": 9.97036108880716e-05, + "loss": 2.6472, + "step": 803 + }, + { + "epoch": 0.035141396040036715, + "grad_norm": 3.5, + "learning_rate": 9.970286364495434e-05, + "loss": 2.0429, + "step": 804 + }, + { + "epoch": 0.03518510424406661, + "grad_norm": 3.6875, + "learning_rate": 9.970211546387169e-05, + "loss": 2.8479, + "step": 805 + }, + { + "epoch": 0.03522881244809651, + "grad_norm": 3.515625, + "learning_rate": 9.970136634483779e-05, + "loss": 2.3924, + "step": 806 + }, + { + "epoch": 0.0352725206521264, + "grad_norm": 3.546875, + "learning_rate": 9.970061628786678e-05, + "loss": 2.8506, + "step": 807 + }, + { + "epoch": 0.035316228856156304, + "grad_norm": 2.6875, + "learning_rate": 9.96998652929728e-05, + "loss": 1.8322, + "step": 808 + }, + { + "epoch": 0.0353599370601862, + "grad_norm": 3.53125, + "learning_rate": 9.969911336017e-05, + "loss": 2.0667, + "step": 809 + }, + { + "epoch": 0.03540364526421609, + "grad_norm": 3.4375, + "learning_rate": 9.969836048947263e-05, + "loss": 2.4905, + "step": 810 + }, + { + "epoch": 0.03544735346824599, + "grad_norm": 3.40625, + "learning_rate": 9.969760668089486e-05, + "loss": 2.3651, + "step": 811 + }, + { + "epoch": 0.035491061672275885, + "grad_norm": 3.375, + "learning_rate": 9.969685193445091e-05, + "loss": 2.1256, + "step": 812 + }, + { + "epoch": 0.035534769876305786, + "grad_norm": 4.34375, + "learning_rate": 9.969609625015502e-05, + "loss": 2.6686, + "step": 813 + }, + { + "epoch": 0.03557847808033568, + "grad_norm": 5.28125, + "learning_rate": 9.96953396280215e-05, + "loss": 2.7127, + "step": 814 + }, + { + "epoch": 0.03562218628436557, + "grad_norm": 2.75, + "learning_rate": 9.969458206806456e-05, + "loss": 1.9587, + "step": 815 + }, + { + "epoch": 0.03566589448839547, + "grad_norm": 2.8125, + "learning_rate": 9.969382357029856e-05, + "loss": 2.139, + "step": 816 + }, + { + "epoch": 0.03570960269242537, + "grad_norm": 5.84375, + "learning_rate": 9.969306413473776e-05, + "loss": 2.1272, + "step": 817 + }, + { + "epoch": 0.03575331089645527, + "grad_norm": 3.390625, + "learning_rate": 9.969230376139651e-05, + "loss": 2.8055, + "step": 818 + }, + { + "epoch": 0.03579701910048516, + "grad_norm": 3.984375, + "learning_rate": 9.969154245028917e-05, + "loss": 1.7156, + "step": 819 + }, + { + "epoch": 0.035840727304515055, + "grad_norm": 6.0, + "learning_rate": 9.969078020143012e-05, + "loss": 2.3962, + "step": 820 + }, + { + "epoch": 0.035884435508544955, + "grad_norm": 2.921875, + "learning_rate": 9.969001701483372e-05, + "loss": 1.9315, + "step": 821 + }, + { + "epoch": 0.03592814371257485, + "grad_norm": 2.8125, + "learning_rate": 9.968925289051436e-05, + "loss": 2.0727, + "step": 822 + }, + { + "epoch": 0.03597185191660475, + "grad_norm": 2.859375, + "learning_rate": 9.96884878284865e-05, + "loss": 1.8745, + "step": 823 + }, + { + "epoch": 0.03601556012063464, + "grad_norm": 3.21875, + "learning_rate": 9.968772182876454e-05, + "loss": 2.4207, + "step": 824 + }, + { + "epoch": 0.03605926832466454, + "grad_norm": 2.875, + "learning_rate": 9.968695489136296e-05, + "loss": 1.7213, + "step": 825 + }, + { + "epoch": 0.03610297652869444, + "grad_norm": 3.359375, + "learning_rate": 9.968618701629623e-05, + "loss": 2.442, + "step": 826 + }, + { + "epoch": 0.03614668473272433, + "grad_norm": 3.515625, + "learning_rate": 9.968541820357883e-05, + "loss": 2.593, + "step": 827 + }, + { + "epoch": 0.03619039293675423, + "grad_norm": 3.796875, + "learning_rate": 9.968464845322527e-05, + "loss": 2.513, + "step": 828 + }, + { + "epoch": 0.036234101140784125, + "grad_norm": 4.34375, + "learning_rate": 9.96838777652501e-05, + "loss": 2.9353, + "step": 829 + }, + { + "epoch": 0.03627780934481402, + "grad_norm": 3.34375, + "learning_rate": 9.968310613966783e-05, + "loss": 2.7721, + "step": 830 + }, + { + "epoch": 0.03632151754884392, + "grad_norm": 3.8125, + "learning_rate": 9.968233357649303e-05, + "loss": 1.9949, + "step": 831 + }, + { + "epoch": 0.03636522575287381, + "grad_norm": 2.84375, + "learning_rate": 9.96815600757403e-05, + "loss": 1.9257, + "step": 832 + }, + { + "epoch": 0.036408933956903713, + "grad_norm": 3.390625, + "learning_rate": 9.968078563742421e-05, + "loss": 2.6905, + "step": 833 + }, + { + "epoch": 0.03645264216093361, + "grad_norm": 3.859375, + "learning_rate": 9.968001026155939e-05, + "loss": 1.9883, + "step": 834 + }, + { + "epoch": 0.0364963503649635, + "grad_norm": 3.609375, + "learning_rate": 9.967923394816049e-05, + "loss": 3.019, + "step": 835 + }, + { + "epoch": 0.0365400585689934, + "grad_norm": 3.296875, + "learning_rate": 9.967845669724212e-05, + "loss": 2.1879, + "step": 836 + }, + { + "epoch": 0.036583766773023295, + "grad_norm": 3.859375, + "learning_rate": 9.967767850881898e-05, + "loss": 2.161, + "step": 837 + }, + { + "epoch": 0.036627474977053195, + "grad_norm": 3.421875, + "learning_rate": 9.967689938290574e-05, + "loss": 2.8632, + "step": 838 + }, + { + "epoch": 0.03667118318108309, + "grad_norm": 5.1875, + "learning_rate": 9.96761193195171e-05, + "loss": 2.971, + "step": 839 + }, + { + "epoch": 0.03671489138511298, + "grad_norm": 2.984375, + "learning_rate": 9.96753383186678e-05, + "loss": 2.5535, + "step": 840 + }, + { + "epoch": 0.03675859958914288, + "grad_norm": 2.890625, + "learning_rate": 9.967455638037257e-05, + "loss": 2.0311, + "step": 841 + }, + { + "epoch": 0.03680230779317278, + "grad_norm": 2.734375, + "learning_rate": 9.967377350464615e-05, + "loss": 1.9492, + "step": 842 + }, + { + "epoch": 0.03684601599720268, + "grad_norm": 3.078125, + "learning_rate": 9.967298969150334e-05, + "loss": 2.2931, + "step": 843 + }, + { + "epoch": 0.03688972420123257, + "grad_norm": 2.96875, + "learning_rate": 9.96722049409589e-05, + "loss": 1.7512, + "step": 844 + }, + { + "epoch": 0.036933432405262465, + "grad_norm": 2.96875, + "learning_rate": 9.967141925302768e-05, + "loss": 2.3825, + "step": 845 + }, + { + "epoch": 0.036977140609292365, + "grad_norm": 3.84375, + "learning_rate": 9.967063262772447e-05, + "loss": 2.2133, + "step": 846 + }, + { + "epoch": 0.03702084881332226, + "grad_norm": 3.046875, + "learning_rate": 9.966984506506413e-05, + "loss": 2.1161, + "step": 847 + }, + { + "epoch": 0.03706455701735216, + "grad_norm": 3.28125, + "learning_rate": 9.966905656506154e-05, + "loss": 2.2078, + "step": 848 + }, + { + "epoch": 0.03710826522138205, + "grad_norm": 3.625, + "learning_rate": 9.966826712773155e-05, + "loss": 2.5632, + "step": 849 + }, + { + "epoch": 0.03715197342541195, + "grad_norm": 3.421875, + "learning_rate": 9.966747675308907e-05, + "loss": 2.3147, + "step": 850 + }, + { + "epoch": 0.03719568162944185, + "grad_norm": 3.15625, + "learning_rate": 9.966668544114902e-05, + "loss": 2.1258, + "step": 851 + }, + { + "epoch": 0.03723938983347174, + "grad_norm": 3.40625, + "learning_rate": 9.966589319192633e-05, + "loss": 2.3756, + "step": 852 + }, + { + "epoch": 0.03728309803750164, + "grad_norm": 4.125, + "learning_rate": 9.966510000543594e-05, + "loss": 2.3031, + "step": 853 + }, + { + "epoch": 0.037326806241531535, + "grad_norm": 2.8125, + "learning_rate": 9.966430588169285e-05, + "loss": 2.0828, + "step": 854 + }, + { + "epoch": 0.03737051444556143, + "grad_norm": 3.03125, + "learning_rate": 9.966351082071201e-05, + "loss": 2.4494, + "step": 855 + }, + { + "epoch": 0.03741422264959133, + "grad_norm": 3.265625, + "learning_rate": 9.966271482250845e-05, + "loss": 2.3533, + "step": 856 + }, + { + "epoch": 0.03745793085362122, + "grad_norm": 3.265625, + "learning_rate": 9.966191788709716e-05, + "loss": 3.1356, + "step": 857 + }, + { + "epoch": 0.03750163905765112, + "grad_norm": 2.796875, + "learning_rate": 9.966112001449321e-05, + "loss": 2.1622, + "step": 858 + }, + { + "epoch": 0.03754534726168102, + "grad_norm": 4.28125, + "learning_rate": 9.966032120471165e-05, + "loss": 2.6896, + "step": 859 + }, + { + "epoch": 0.03758905546571091, + "grad_norm": 4.03125, + "learning_rate": 9.965952145776754e-05, + "loss": 2.0189, + "step": 860 + }, + { + "epoch": 0.03763276366974081, + "grad_norm": 3.40625, + "learning_rate": 9.9658720773676e-05, + "loss": 2.3372, + "step": 861 + }, + { + "epoch": 0.037676471873770705, + "grad_norm": 3.28125, + "learning_rate": 9.96579191524521e-05, + "loss": 2.0823, + "step": 862 + }, + { + "epoch": 0.037720180077800605, + "grad_norm": 3.296875, + "learning_rate": 9.965711659411102e-05, + "loss": 2.822, + "step": 863 + }, + { + "epoch": 0.0377638882818305, + "grad_norm": 3.015625, + "learning_rate": 9.965631309866788e-05, + "loss": 2.695, + "step": 864 + }, + { + "epoch": 0.0378075964858604, + "grad_norm": 3.234375, + "learning_rate": 9.965550866613782e-05, + "loss": 2.7255, + "step": 865 + }, + { + "epoch": 0.03785130468989029, + "grad_norm": 3.25, + "learning_rate": 9.965470329653604e-05, + "loss": 2.4268, + "step": 866 + }, + { + "epoch": 0.03789501289392019, + "grad_norm": 4.78125, + "learning_rate": 9.965389698987775e-05, + "loss": 2.2879, + "step": 867 + }, + { + "epoch": 0.03793872109795009, + "grad_norm": 3.34375, + "learning_rate": 9.965308974617816e-05, + "loss": 2.4958, + "step": 868 + }, + { + "epoch": 0.03798242930197998, + "grad_norm": 3.34375, + "learning_rate": 9.96522815654525e-05, + "loss": 2.387, + "step": 869 + }, + { + "epoch": 0.03802613750600988, + "grad_norm": 2.703125, + "learning_rate": 9.965147244771601e-05, + "loss": 1.918, + "step": 870 + }, + { + "epoch": 0.038069845710039775, + "grad_norm": 3.25, + "learning_rate": 9.965066239298398e-05, + "loss": 2.7065, + "step": 871 + }, + { + "epoch": 0.03811355391406967, + "grad_norm": 3.484375, + "learning_rate": 9.964985140127168e-05, + "loss": 2.3908, + "step": 872 + }, + { + "epoch": 0.03815726211809957, + "grad_norm": 2.71875, + "learning_rate": 9.964903947259443e-05, + "loss": 1.9557, + "step": 873 + }, + { + "epoch": 0.03820097032212946, + "grad_norm": 2.4375, + "learning_rate": 9.964822660696753e-05, + "loss": 2.1748, + "step": 874 + }, + { + "epoch": 0.03824467852615936, + "grad_norm": 3.96875, + "learning_rate": 9.964741280440634e-05, + "loss": 1.9483, + "step": 875 + }, + { + "epoch": 0.03828838673018926, + "grad_norm": 8.25, + "learning_rate": 9.964659806492624e-05, + "loss": 4.1064, + "step": 876 + }, + { + "epoch": 0.03833209493421915, + "grad_norm": 4.625, + "learning_rate": 9.964578238854254e-05, + "loss": 2.6809, + "step": 877 + }, + { + "epoch": 0.03837580313824905, + "grad_norm": 3.859375, + "learning_rate": 9.964496577527069e-05, + "loss": 1.873, + "step": 878 + }, + { + "epoch": 0.038419511342278945, + "grad_norm": 4.875, + "learning_rate": 9.964414822512607e-05, + "loss": 2.5123, + "step": 879 + }, + { + "epoch": 0.038463219546308845, + "grad_norm": 3.203125, + "learning_rate": 9.964332973812414e-05, + "loss": 2.3094, + "step": 880 + }, + { + "epoch": 0.03850692775033874, + "grad_norm": 5.0625, + "learning_rate": 9.96425103142803e-05, + "loss": 1.6306, + "step": 881 + }, + { + "epoch": 0.03855063595436863, + "grad_norm": 2.984375, + "learning_rate": 9.964168995361005e-05, + "loss": 2.1723, + "step": 882 + }, + { + "epoch": 0.03859434415839853, + "grad_norm": 3.046875, + "learning_rate": 9.964086865612887e-05, + "loss": 2.1385, + "step": 883 + }, + { + "epoch": 0.03863805236242843, + "grad_norm": 3.1875, + "learning_rate": 9.964004642185223e-05, + "loss": 2.2899, + "step": 884 + }, + { + "epoch": 0.03868176056645833, + "grad_norm": 3.578125, + "learning_rate": 9.963922325079567e-05, + "loss": 2.6071, + "step": 885 + }, + { + "epoch": 0.03872546877048822, + "grad_norm": 2.953125, + "learning_rate": 9.963839914297473e-05, + "loss": 2.1812, + "step": 886 + }, + { + "epoch": 0.038769176974518114, + "grad_norm": 3.34375, + "learning_rate": 9.963757409840495e-05, + "loss": 2.4299, + "step": 887 + }, + { + "epoch": 0.038812885178548015, + "grad_norm": 3.421875, + "learning_rate": 9.963674811710191e-05, + "loss": 2.3144, + "step": 888 + }, + { + "epoch": 0.03885659338257791, + "grad_norm": 2.953125, + "learning_rate": 9.963592119908119e-05, + "loss": 2.3799, + "step": 889 + }, + { + "epoch": 0.03890030158660781, + "grad_norm": 3.078125, + "learning_rate": 9.963509334435838e-05, + "loss": 2.4469, + "step": 890 + }, + { + "epoch": 0.0389440097906377, + "grad_norm": 3.875, + "learning_rate": 9.963426455294913e-05, + "loss": 2.425, + "step": 891 + }, + { + "epoch": 0.038987717994667596, + "grad_norm": 3.484375, + "learning_rate": 9.963343482486906e-05, + "loss": 2.833, + "step": 892 + }, + { + "epoch": 0.0390314261986975, + "grad_norm": 4.71875, + "learning_rate": 9.963260416013385e-05, + "loss": 2.5376, + "step": 893 + }, + { + "epoch": 0.03907513440272739, + "grad_norm": 3.96875, + "learning_rate": 9.963177255875915e-05, + "loss": 2.5258, + "step": 894 + }, + { + "epoch": 0.03911884260675729, + "grad_norm": 7.4375, + "learning_rate": 9.963094002076066e-05, + "loss": 3.2763, + "step": 895 + }, + { + "epoch": 0.039162550810787185, + "grad_norm": 4.96875, + "learning_rate": 9.963010654615411e-05, + "loss": 3.1825, + "step": 896 + }, + { + "epoch": 0.03920625901481708, + "grad_norm": 3.21875, + "learning_rate": 9.962927213495521e-05, + "loss": 2.2391, + "step": 897 + }, + { + "epoch": 0.03924996721884698, + "grad_norm": 3.125, + "learning_rate": 9.962843678717972e-05, + "loss": 2.2213, + "step": 898 + }, + { + "epoch": 0.03929367542287687, + "grad_norm": 3.6875, + "learning_rate": 9.96276005028434e-05, + "loss": 2.3569, + "step": 899 + }, + { + "epoch": 0.03933738362690677, + "grad_norm": 3.015625, + "learning_rate": 9.962676328196202e-05, + "loss": 1.9611, + "step": 900 + }, + { + "epoch": 0.03938109183093667, + "grad_norm": 2.53125, + "learning_rate": 9.962592512455138e-05, + "loss": 1.869, + "step": 901 + }, + { + "epoch": 0.03942480003496656, + "grad_norm": 3.640625, + "learning_rate": 9.962508603062732e-05, + "loss": 2.3317, + "step": 902 + }, + { + "epoch": 0.03946850823899646, + "grad_norm": 3.03125, + "learning_rate": 9.962424600020567e-05, + "loss": 1.9824, + "step": 903 + }, + { + "epoch": 0.039512216443026354, + "grad_norm": 3.015625, + "learning_rate": 9.962340503330226e-05, + "loss": 2.6484, + "step": 904 + }, + { + "epoch": 0.039555924647056255, + "grad_norm": 2.96875, + "learning_rate": 9.962256312993297e-05, + "loss": 2.3466, + "step": 905 + }, + { + "epoch": 0.03959963285108615, + "grad_norm": 2.890625, + "learning_rate": 9.96217202901137e-05, + "loss": 2.0264, + "step": 906 + }, + { + "epoch": 0.03964334105511604, + "grad_norm": 3.328125, + "learning_rate": 9.962087651386034e-05, + "loss": 2.5601, + "step": 907 + }, + { + "epoch": 0.03968704925914594, + "grad_norm": 3.265625, + "learning_rate": 9.962003180118882e-05, + "loss": 2.2833, + "step": 908 + }, + { + "epoch": 0.039730757463175836, + "grad_norm": 3.15625, + "learning_rate": 9.96191861521151e-05, + "loss": 2.6963, + "step": 909 + }, + { + "epoch": 0.03977446566720574, + "grad_norm": 2.71875, + "learning_rate": 9.961833956665509e-05, + "loss": 2.1049, + "step": 910 + }, + { + "epoch": 0.03981817387123563, + "grad_norm": 3.171875, + "learning_rate": 9.961749204482483e-05, + "loss": 3.2084, + "step": 911 + }, + { + "epoch": 0.039861882075265524, + "grad_norm": 2.75, + "learning_rate": 9.961664358664024e-05, + "loss": 2.6011, + "step": 912 + }, + { + "epoch": 0.039905590279295425, + "grad_norm": 2.796875, + "learning_rate": 9.961579419211741e-05, + "loss": 2.6555, + "step": 913 + }, + { + "epoch": 0.03994929848332532, + "grad_norm": 3.09375, + "learning_rate": 9.961494386127231e-05, + "loss": 1.9832, + "step": 914 + }, + { + "epoch": 0.03999300668735522, + "grad_norm": 4.5, + "learning_rate": 9.961409259412102e-05, + "loss": 2.9627, + "step": 915 + }, + { + "epoch": 0.04003671489138511, + "grad_norm": 5.40625, + "learning_rate": 9.961324039067958e-05, + "loss": 2.9962, + "step": 916 + }, + { + "epoch": 0.040080423095415006, + "grad_norm": 2.984375, + "learning_rate": 9.961238725096409e-05, + "loss": 2.2526, + "step": 917 + }, + { + "epoch": 0.04012413129944491, + "grad_norm": 3.828125, + "learning_rate": 9.961153317499065e-05, + "loss": 2.5092, + "step": 918 + }, + { + "epoch": 0.0401678395034748, + "grad_norm": 2.828125, + "learning_rate": 9.961067816277536e-05, + "loss": 1.9513, + "step": 919 + }, + { + "epoch": 0.0402115477075047, + "grad_norm": 3.734375, + "learning_rate": 9.960982221433439e-05, + "loss": 2.3512, + "step": 920 + }, + { + "epoch": 0.040255255911534595, + "grad_norm": 3.15625, + "learning_rate": 9.960896532968385e-05, + "loss": 2.5303, + "step": 921 + }, + { + "epoch": 0.04029896411556449, + "grad_norm": 2.84375, + "learning_rate": 9.960810750883995e-05, + "loss": 1.8781, + "step": 922 + }, + { + "epoch": 0.04034267231959439, + "grad_norm": 2.984375, + "learning_rate": 9.960724875181885e-05, + "loss": 2.3986, + "step": 923 + }, + { + "epoch": 0.04038638052362428, + "grad_norm": 2.65625, + "learning_rate": 9.960638905863676e-05, + "loss": 2.0327, + "step": 924 + }, + { + "epoch": 0.04043008872765418, + "grad_norm": 6.15625, + "learning_rate": 9.960552842930992e-05, + "loss": 1.6378, + "step": 925 + }, + { + "epoch": 0.040473796931684076, + "grad_norm": 9.0, + "learning_rate": 9.960466686385456e-05, + "loss": 2.1269, + "step": 926 + }, + { + "epoch": 0.04051750513571397, + "grad_norm": 2.90625, + "learning_rate": 9.960380436228693e-05, + "loss": 2.3509, + "step": 927 + }, + { + "epoch": 0.04056121333974387, + "grad_norm": 3.109375, + "learning_rate": 9.960294092462332e-05, + "loss": 2.1872, + "step": 928 + }, + { + "epoch": 0.040604921543773764, + "grad_norm": 3.828125, + "learning_rate": 9.960207655088003e-05, + "loss": 2.3827, + "step": 929 + }, + { + "epoch": 0.040648629747803665, + "grad_norm": 3.328125, + "learning_rate": 9.960121124107336e-05, + "loss": 1.9355, + "step": 930 + }, + { + "epoch": 0.04069233795183356, + "grad_norm": 3.3125, + "learning_rate": 9.960034499521964e-05, + "loss": 1.8716, + "step": 931 + }, + { + "epoch": 0.04073604615586346, + "grad_norm": 3.1875, + "learning_rate": 9.959947781333522e-05, + "loss": 1.9923, + "step": 932 + }, + { + "epoch": 0.04077975435989335, + "grad_norm": 3.0, + "learning_rate": 9.959860969543648e-05, + "loss": 2.2197, + "step": 933 + }, + { + "epoch": 0.040823462563923246, + "grad_norm": 5.15625, + "learning_rate": 9.959774064153977e-05, + "loss": 2.2632, + "step": 934 + }, + { + "epoch": 0.04086717076795315, + "grad_norm": 2.796875, + "learning_rate": 9.959687065166151e-05, + "loss": 2.3649, + "step": 935 + }, + { + "epoch": 0.04091087897198304, + "grad_norm": 3.890625, + "learning_rate": 9.959599972581812e-05, + "loss": 2.6295, + "step": 936 + }, + { + "epoch": 0.04095458717601294, + "grad_norm": 3.6875, + "learning_rate": 9.959512786402603e-05, + "loss": 2.1709, + "step": 937 + }, + { + "epoch": 0.040998295380042835, + "grad_norm": 3.9375, + "learning_rate": 9.95942550663017e-05, + "loss": 2.1453, + "step": 938 + }, + { + "epoch": 0.04104200358407273, + "grad_norm": 3.578125, + "learning_rate": 9.959338133266158e-05, + "loss": 2.5455, + "step": 939 + }, + { + "epoch": 0.04108571178810263, + "grad_norm": 2.78125, + "learning_rate": 9.959250666312219e-05, + "loss": 2.0837, + "step": 940 + }, + { + "epoch": 0.04112941999213252, + "grad_norm": 2.984375, + "learning_rate": 9.959163105770002e-05, + "loss": 2.2131, + "step": 941 + }, + { + "epoch": 0.04117312819616242, + "grad_norm": 2.65625, + "learning_rate": 9.959075451641159e-05, + "loss": 2.1649, + "step": 942 + }, + { + "epoch": 0.04121683640019232, + "grad_norm": 3.5, + "learning_rate": 9.958987703927345e-05, + "loss": 2.245, + "step": 943 + }, + { + "epoch": 0.04126054460422221, + "grad_norm": 3.0, + "learning_rate": 9.958899862630216e-05, + "loss": 2.1328, + "step": 944 + }, + { + "epoch": 0.04130425280825211, + "grad_norm": 3.25, + "learning_rate": 9.958811927751428e-05, + "loss": 2.9265, + "step": 945 + }, + { + "epoch": 0.041347961012282004, + "grad_norm": 2.890625, + "learning_rate": 9.958723899292641e-05, + "loss": 2.4016, + "step": 946 + }, + { + "epoch": 0.041391669216311905, + "grad_norm": 3.015625, + "learning_rate": 9.958635777255518e-05, + "loss": 2.2132, + "step": 947 + }, + { + "epoch": 0.0414353774203418, + "grad_norm": 2.484375, + "learning_rate": 9.958547561641722e-05, + "loss": 2.1233, + "step": 948 + }, + { + "epoch": 0.04147908562437169, + "grad_norm": 7.8125, + "learning_rate": 9.958459252452916e-05, + "loss": 2.8399, + "step": 949 + }, + { + "epoch": 0.04152279382840159, + "grad_norm": 3.1875, + "learning_rate": 9.958370849690767e-05, + "loss": 2.7092, + "step": 950 + }, + { + "epoch": 0.041566502032431486, + "grad_norm": 3.296875, + "learning_rate": 9.958282353356943e-05, + "loss": 2.0668, + "step": 951 + }, + { + "epoch": 0.04161021023646139, + "grad_norm": 2.734375, + "learning_rate": 9.958193763453116e-05, + "loss": 2.2635, + "step": 952 + }, + { + "epoch": 0.04165391844049128, + "grad_norm": 3.046875, + "learning_rate": 9.958105079980954e-05, + "loss": 2.0128, + "step": 953 + }, + { + "epoch": 0.041697626644521174, + "grad_norm": 2.6875, + "learning_rate": 9.958016302942135e-05, + "loss": 2.1925, + "step": 954 + }, + { + "epoch": 0.041741334848551075, + "grad_norm": 3.25, + "learning_rate": 9.957927432338332e-05, + "loss": 2.0548, + "step": 955 + }, + { + "epoch": 0.04178504305258097, + "grad_norm": 3.421875, + "learning_rate": 9.957838468171223e-05, + "loss": 2.3297, + "step": 956 + }, + { + "epoch": 0.04182875125661087, + "grad_norm": 2.921875, + "learning_rate": 9.957749410442485e-05, + "loss": 2.3113, + "step": 957 + }, + { + "epoch": 0.04187245946064076, + "grad_norm": 2.734375, + "learning_rate": 9.9576602591538e-05, + "loss": 2.4494, + "step": 958 + }, + { + "epoch": 0.041916167664670656, + "grad_norm": 2.890625, + "learning_rate": 9.957571014306852e-05, + "loss": 2.2834, + "step": 959 + }, + { + "epoch": 0.04195987586870056, + "grad_norm": 3.359375, + "learning_rate": 9.957481675903321e-05, + "loss": 1.9757, + "step": 960 + }, + { + "epoch": 0.04200358407273045, + "grad_norm": 3.296875, + "learning_rate": 9.957392243944898e-05, + "loss": 1.9881, + "step": 961 + }, + { + "epoch": 0.04204729227676035, + "grad_norm": 2.890625, + "learning_rate": 9.957302718433266e-05, + "loss": 2.0498, + "step": 962 + }, + { + "epoch": 0.042091000480790244, + "grad_norm": 2.546875, + "learning_rate": 9.957213099370117e-05, + "loss": 2.2652, + "step": 963 + }, + { + "epoch": 0.04213470868482014, + "grad_norm": 3.234375, + "learning_rate": 9.957123386757144e-05, + "loss": 2.2596, + "step": 964 + }, + { + "epoch": 0.04217841688885004, + "grad_norm": 3.234375, + "learning_rate": 9.957033580596036e-05, + "loss": 2.7782, + "step": 965 + }, + { + "epoch": 0.04222212509287993, + "grad_norm": 2.796875, + "learning_rate": 9.95694368088849e-05, + "loss": 2.0644, + "step": 966 + }, + { + "epoch": 0.04226583329690983, + "grad_norm": 2.5625, + "learning_rate": 9.956853687636203e-05, + "loss": 2.149, + "step": 967 + }, + { + "epoch": 0.042309541500939726, + "grad_norm": 3.3125, + "learning_rate": 9.956763600840873e-05, + "loss": 2.8036, + "step": 968 + }, + { + "epoch": 0.04235324970496962, + "grad_norm": 3.125, + "learning_rate": 9.9566734205042e-05, + "loss": 1.844, + "step": 969 + }, + { + "epoch": 0.04239695790899952, + "grad_norm": 3.4375, + "learning_rate": 9.956583146627883e-05, + "loss": 3.511, + "step": 970 + }, + { + "epoch": 0.042440666113029414, + "grad_norm": 3.296875, + "learning_rate": 9.956492779213629e-05, + "loss": 2.1949, + "step": 971 + }, + { + "epoch": 0.042484374317059315, + "grad_norm": 3.828125, + "learning_rate": 9.956402318263143e-05, + "loss": 2.4102, + "step": 972 + }, + { + "epoch": 0.04252808252108921, + "grad_norm": 3.296875, + "learning_rate": 9.95631176377813e-05, + "loss": 2.4406, + "step": 973 + }, + { + "epoch": 0.0425717907251191, + "grad_norm": 3.234375, + "learning_rate": 9.956221115760302e-05, + "loss": 2.7419, + "step": 974 + }, + { + "epoch": 0.042615498929149, + "grad_norm": 3.328125, + "learning_rate": 9.956130374211369e-05, + "loss": 2.8455, + "step": 975 + }, + { + "epoch": 0.042659207133178896, + "grad_norm": 3.171875, + "learning_rate": 9.956039539133042e-05, + "loss": 2.433, + "step": 976 + }, + { + "epoch": 0.0427029153372088, + "grad_norm": 3.078125, + "learning_rate": 9.955948610527035e-05, + "loss": 2.2266, + "step": 977 + }, + { + "epoch": 0.04274662354123869, + "grad_norm": 2.84375, + "learning_rate": 9.955857588395065e-05, + "loss": 2.2863, + "step": 978 + }, + { + "epoch": 0.042790331745268584, + "grad_norm": 3.21875, + "learning_rate": 9.955766472738847e-05, + "loss": 1.9107, + "step": 979 + }, + { + "epoch": 0.042834039949298484, + "grad_norm": 4.53125, + "learning_rate": 9.955675263560107e-05, + "loss": 1.9685, + "step": 980 + }, + { + "epoch": 0.04287774815332838, + "grad_norm": 2.71875, + "learning_rate": 9.95558396086056e-05, + "loss": 1.9, + "step": 981 + }, + { + "epoch": 0.04292145635735828, + "grad_norm": 2.65625, + "learning_rate": 9.955492564641931e-05, + "loss": 1.9955, + "step": 982 + }, + { + "epoch": 0.04296516456138817, + "grad_norm": 3.203125, + "learning_rate": 9.955401074905945e-05, + "loss": 2.3646, + "step": 983 + }, + { + "epoch": 0.043008872765418066, + "grad_norm": 2.84375, + "learning_rate": 9.95530949165433e-05, + "loss": 1.7669, + "step": 984 + }, + { + "epoch": 0.043052580969447966, + "grad_norm": 3.125, + "learning_rate": 9.955217814888811e-05, + "loss": 2.2699, + "step": 985 + }, + { + "epoch": 0.04309628917347786, + "grad_norm": 6.0625, + "learning_rate": 9.955126044611121e-05, + "loss": 2.6444, + "step": 986 + }, + { + "epoch": 0.04313999737750776, + "grad_norm": 3.015625, + "learning_rate": 9.955034180822988e-05, + "loss": 2.1632, + "step": 987 + }, + { + "epoch": 0.043183705581537654, + "grad_norm": 2.96875, + "learning_rate": 9.954942223526152e-05, + "loss": 1.9997, + "step": 988 + }, + { + "epoch": 0.04322741378556755, + "grad_norm": 2.859375, + "learning_rate": 9.954850172722344e-05, + "loss": 2.6184, + "step": 989 + }, + { + "epoch": 0.04327112198959745, + "grad_norm": 3.515625, + "learning_rate": 9.9547580284133e-05, + "loss": 2.7852, + "step": 990 + }, + { + "epoch": 0.04331483019362734, + "grad_norm": 3.609375, + "learning_rate": 9.954665790600761e-05, + "loss": 2.9001, + "step": 991 + }, + { + "epoch": 0.04335853839765724, + "grad_norm": 5.5625, + "learning_rate": 9.954573459286468e-05, + "loss": 2.6454, + "step": 992 + }, + { + "epoch": 0.043402246601687136, + "grad_norm": 3.96875, + "learning_rate": 9.954481034472163e-05, + "loss": 2.3826, + "step": 993 + }, + { + "epoch": 0.04344595480571703, + "grad_norm": 2.9375, + "learning_rate": 9.95438851615959e-05, + "loss": 2.5778, + "step": 994 + }, + { + "epoch": 0.04348966300974693, + "grad_norm": 5.34375, + "learning_rate": 9.954295904350495e-05, + "loss": 2.5694, + "step": 995 + }, + { + "epoch": 0.043533371213776824, + "grad_norm": 3.703125, + "learning_rate": 9.954203199046624e-05, + "loss": 2.0787, + "step": 996 + }, + { + "epoch": 0.043577079417806724, + "grad_norm": 3.203125, + "learning_rate": 9.95411040024973e-05, + "loss": 2.1673, + "step": 997 + }, + { + "epoch": 0.04362078762183662, + "grad_norm": 2.9375, + "learning_rate": 9.954017507961561e-05, + "loss": 2.8185, + "step": 998 + }, + { + "epoch": 0.04366449582586651, + "grad_norm": 3.828125, + "learning_rate": 9.953924522183872e-05, + "loss": 2.495, + "step": 999 + }, + { + "epoch": 0.04370820402989641, + "grad_norm": 3.140625, + "learning_rate": 9.953831442918418e-05, + "loss": 1.975, + "step": 1000 + }, + { + "epoch": 0.043751912233926306, + "grad_norm": 3.375, + "learning_rate": 9.953738270166954e-05, + "loss": 2.072, + "step": 1001 + }, + { + "epoch": 0.043795620437956206, + "grad_norm": 2.609375, + "learning_rate": 9.953645003931239e-05, + "loss": 2.2948, + "step": 1002 + }, + { + "epoch": 0.0438393286419861, + "grad_norm": 2.796875, + "learning_rate": 9.953551644213033e-05, + "loss": 2.4571, + "step": 1003 + }, + { + "epoch": 0.043883036846016, + "grad_norm": 2.96875, + "learning_rate": 9.953458191014098e-05, + "loss": 2.5195, + "step": 1004 + }, + { + "epoch": 0.043926745050045894, + "grad_norm": 2.796875, + "learning_rate": 9.953364644336199e-05, + "loss": 2.5952, + "step": 1005 + }, + { + "epoch": 0.04397045325407579, + "grad_norm": 2.734375, + "learning_rate": 9.953271004181097e-05, + "loss": 2.1683, + "step": 1006 + }, + { + "epoch": 0.04401416145810569, + "grad_norm": 2.765625, + "learning_rate": 9.953177270550564e-05, + "loss": 1.818, + "step": 1007 + }, + { + "epoch": 0.04405786966213558, + "grad_norm": 3.375, + "learning_rate": 9.953083443446366e-05, + "loss": 2.5001, + "step": 1008 + }, + { + "epoch": 0.04410157786616548, + "grad_norm": 2.640625, + "learning_rate": 9.952989522870275e-05, + "loss": 2.2333, + "step": 1009 + }, + { + "epoch": 0.044145286070195376, + "grad_norm": 4.1875, + "learning_rate": 9.952895508824063e-05, + "loss": 2.1663, + "step": 1010 + }, + { + "epoch": 0.04418899427422527, + "grad_norm": 16.25, + "learning_rate": 9.952801401309503e-05, + "loss": 2.8942, + "step": 1011 + }, + { + "epoch": 0.04423270247825517, + "grad_norm": 3.109375, + "learning_rate": 9.952707200328374e-05, + "loss": 1.8845, + "step": 1012 + }, + { + "epoch": 0.044276410682285064, + "grad_norm": 2.84375, + "learning_rate": 9.952612905882451e-05, + "loss": 2.5532, + "step": 1013 + }, + { + "epoch": 0.044320118886314964, + "grad_norm": 2.84375, + "learning_rate": 9.952518517973515e-05, + "loss": 2.4217, + "step": 1014 + }, + { + "epoch": 0.04436382709034486, + "grad_norm": 3.140625, + "learning_rate": 9.952424036603345e-05, + "loss": 1.7717, + "step": 1015 + }, + { + "epoch": 0.04440753529437475, + "grad_norm": 2.5625, + "learning_rate": 9.952329461773726e-05, + "loss": 2.0673, + "step": 1016 + }, + { + "epoch": 0.04445124349840465, + "grad_norm": 2.90625, + "learning_rate": 9.952234793486443e-05, + "loss": 1.9221, + "step": 1017 + }, + { + "epoch": 0.044494951702434546, + "grad_norm": 2.953125, + "learning_rate": 9.95214003174328e-05, + "loss": 2.2079, + "step": 1018 + }, + { + "epoch": 0.044538659906464446, + "grad_norm": 3.703125, + "learning_rate": 9.95204517654603e-05, + "loss": 2.5225, + "step": 1019 + }, + { + "epoch": 0.04458236811049434, + "grad_norm": 3.109375, + "learning_rate": 9.951950227896478e-05, + "loss": 2.343, + "step": 1020 + }, + { + "epoch": 0.044626076314524234, + "grad_norm": 7.15625, + "learning_rate": 9.95185518579642e-05, + "loss": 2.4092, + "step": 1021 + }, + { + "epoch": 0.044669784518554134, + "grad_norm": 3.75, + "learning_rate": 9.951760050247646e-05, + "loss": 2.3665, + "step": 1022 + }, + { + "epoch": 0.04471349272258403, + "grad_norm": 2.9375, + "learning_rate": 9.951664821251952e-05, + "loss": 2.2941, + "step": 1023 + }, + { + "epoch": 0.04475720092661393, + "grad_norm": 3.078125, + "learning_rate": 9.951569498811137e-05, + "loss": 2.8114, + "step": 1024 + }, + { + "epoch": 0.04480090913064382, + "grad_norm": 3.15625, + "learning_rate": 9.951474082927e-05, + "loss": 2.7321, + "step": 1025 + }, + { + "epoch": 0.044844617334673716, + "grad_norm": 3.0, + "learning_rate": 9.951378573601338e-05, + "loss": 2.2982, + "step": 1026 + }, + { + "epoch": 0.044888325538703616, + "grad_norm": 3.25, + "learning_rate": 9.951282970835957e-05, + "loss": 2.0281, + "step": 1027 + }, + { + "epoch": 0.04493203374273351, + "grad_norm": 3.015625, + "learning_rate": 9.951187274632661e-05, + "loss": 2.1427, + "step": 1028 + }, + { + "epoch": 0.04497574194676341, + "grad_norm": 2.78125, + "learning_rate": 9.951091484993256e-05, + "loss": 2.1392, + "step": 1029 + }, + { + "epoch": 0.045019450150793304, + "grad_norm": 2.78125, + "learning_rate": 9.950995601919546e-05, + "loss": 2.1285, + "step": 1030 + }, + { + "epoch": 0.0450631583548232, + "grad_norm": 3.15625, + "learning_rate": 9.950899625413345e-05, + "loss": 1.7744, + "step": 1031 + }, + { + "epoch": 0.0451068665588531, + "grad_norm": 4.875, + "learning_rate": 9.950803555476463e-05, + "loss": 3.7282, + "step": 1032 + }, + { + "epoch": 0.04515057476288299, + "grad_norm": 3.234375, + "learning_rate": 9.950707392110709e-05, + "loss": 3.1688, + "step": 1033 + }, + { + "epoch": 0.04519428296691289, + "grad_norm": 3.09375, + "learning_rate": 9.950611135317904e-05, + "loss": 1.9093, + "step": 1034 + }, + { + "epoch": 0.045237991170942786, + "grad_norm": 3.1875, + "learning_rate": 9.95051478509986e-05, + "loss": 2.6153, + "step": 1035 + }, + { + "epoch": 0.04528169937497268, + "grad_norm": 2.90625, + "learning_rate": 9.950418341458398e-05, + "loss": 2.0785, + "step": 1036 + }, + { + "epoch": 0.04532540757900258, + "grad_norm": 4.96875, + "learning_rate": 9.950321804395338e-05, + "loss": 2.2215, + "step": 1037 + }, + { + "epoch": 0.045369115783032474, + "grad_norm": 3.625, + "learning_rate": 9.950225173912499e-05, + "loss": 3.471, + "step": 1038 + }, + { + "epoch": 0.045412823987062374, + "grad_norm": 4.3125, + "learning_rate": 9.950128450011706e-05, + "loss": 2.7834, + "step": 1039 + }, + { + "epoch": 0.04545653219109227, + "grad_norm": 2.953125, + "learning_rate": 9.950031632694785e-05, + "loss": 2.182, + "step": 1040 + }, + { + "epoch": 0.04550024039512216, + "grad_norm": 2.90625, + "learning_rate": 9.949934721963563e-05, + "loss": 2.1175, + "step": 1041 + }, + { + "epoch": 0.04554394859915206, + "grad_norm": 3.28125, + "learning_rate": 9.949837717819868e-05, + "loss": 2.6425, + "step": 1042 + }, + { + "epoch": 0.045587656803181956, + "grad_norm": 2.828125, + "learning_rate": 9.949740620265532e-05, + "loss": 2.0681, + "step": 1043 + }, + { + "epoch": 0.045631365007211856, + "grad_norm": 3.96875, + "learning_rate": 9.949643429302386e-05, + "loss": 3.4625, + "step": 1044 + }, + { + "epoch": 0.04567507321124175, + "grad_norm": 2.828125, + "learning_rate": 9.949546144932265e-05, + "loss": 1.965, + "step": 1045 + }, + { + "epoch": 0.04571878141527164, + "grad_norm": 3.484375, + "learning_rate": 9.949448767157003e-05, + "loss": 2.3129, + "step": 1046 + }, + { + "epoch": 0.045762489619301544, + "grad_norm": 3.34375, + "learning_rate": 9.949351295978441e-05, + "loss": 1.81, + "step": 1047 + }, + { + "epoch": 0.04580619782333144, + "grad_norm": 3.78125, + "learning_rate": 9.949253731398416e-05, + "loss": 2.5846, + "step": 1048 + }, + { + "epoch": 0.04584990602736134, + "grad_norm": 4.25, + "learning_rate": 9.949156073418769e-05, + "loss": 2.8508, + "step": 1049 + }, + { + "epoch": 0.04589361423139123, + "grad_norm": 2.75, + "learning_rate": 9.949058322041345e-05, + "loss": 2.2982, + "step": 1050 + }, + { + "epoch": 0.045937322435421125, + "grad_norm": 3.65625, + "learning_rate": 9.948960477267986e-05, + "loss": 3.0965, + "step": 1051 + }, + { + "epoch": 0.045981030639451026, + "grad_norm": 2.578125, + "learning_rate": 9.948862539100541e-05, + "loss": 2.16, + "step": 1052 + }, + { + "epoch": 0.04602473884348092, + "grad_norm": 3.421875, + "learning_rate": 9.948764507540858e-05, + "loss": 2.1271, + "step": 1053 + }, + { + "epoch": 0.04606844704751082, + "grad_norm": 2.875, + "learning_rate": 9.948666382590785e-05, + "loss": 2.0429, + "step": 1054 + }, + { + "epoch": 0.046112155251540714, + "grad_norm": 2.6875, + "learning_rate": 9.948568164252174e-05, + "loss": 2.3033, + "step": 1055 + }, + { + "epoch": 0.04615586345557061, + "grad_norm": 7.3125, + "learning_rate": 9.948469852526881e-05, + "loss": 2.4171, + "step": 1056 + }, + { + "epoch": 0.04619957165960051, + "grad_norm": 3.5, + "learning_rate": 9.948371447416758e-05, + "loss": 2.5804, + "step": 1057 + }, + { + "epoch": 0.0462432798636304, + "grad_norm": 2.78125, + "learning_rate": 9.948272948923666e-05, + "loss": 2.2676, + "step": 1058 + }, + { + "epoch": 0.0462869880676603, + "grad_norm": 2.984375, + "learning_rate": 9.94817435704946e-05, + "loss": 2.327, + "step": 1059 + }, + { + "epoch": 0.046330696271690196, + "grad_norm": 3.359375, + "learning_rate": 9.948075671796004e-05, + "loss": 1.9911, + "step": 1060 + }, + { + "epoch": 0.04637440447572009, + "grad_norm": 3.71875, + "learning_rate": 9.947976893165156e-05, + "loss": 2.0437, + "step": 1061 + }, + { + "epoch": 0.04641811267974999, + "grad_norm": 3.0, + "learning_rate": 9.947878021158784e-05, + "loss": 2.4868, + "step": 1062 + }, + { + "epoch": 0.046461820883779884, + "grad_norm": 3.5625, + "learning_rate": 9.947779055778752e-05, + "loss": 2.313, + "step": 1063 + }, + { + "epoch": 0.046505529087809784, + "grad_norm": 2.671875, + "learning_rate": 9.947679997026929e-05, + "loss": 2.0247, + "step": 1064 + }, + { + "epoch": 0.04654923729183968, + "grad_norm": 12.3125, + "learning_rate": 9.947580844905182e-05, + "loss": 3.1176, + "step": 1065 + }, + { + "epoch": 0.04659294549586957, + "grad_norm": 3.015625, + "learning_rate": 9.947481599415384e-05, + "loss": 2.522, + "step": 1066 + }, + { + "epoch": 0.04663665369989947, + "grad_norm": 3.5, + "learning_rate": 9.947382260559408e-05, + "loss": 2.9242, + "step": 1067 + }, + { + "epoch": 0.046680361903929365, + "grad_norm": 2.84375, + "learning_rate": 9.947282828339128e-05, + "loss": 1.9872, + "step": 1068 + }, + { + "epoch": 0.046724070107959266, + "grad_norm": 2.828125, + "learning_rate": 9.947183302756422e-05, + "loss": 2.146, + "step": 1069 + }, + { + "epoch": 0.04676777831198916, + "grad_norm": 2.921875, + "learning_rate": 9.947083683813165e-05, + "loss": 2.3677, + "step": 1070 + }, + { + "epoch": 0.04681148651601906, + "grad_norm": 3.09375, + "learning_rate": 9.946983971511239e-05, + "loss": 2.5506, + "step": 1071 + }, + { + "epoch": 0.046855194720048954, + "grad_norm": 2.703125, + "learning_rate": 9.946884165852525e-05, + "loss": 2.0168, + "step": 1072 + }, + { + "epoch": 0.04689890292407885, + "grad_norm": 2.625, + "learning_rate": 9.946784266838909e-05, + "loss": 1.9957, + "step": 1073 + }, + { + "epoch": 0.04694261112810875, + "grad_norm": 2.6875, + "learning_rate": 9.94668427447227e-05, + "loss": 2.315, + "step": 1074 + }, + { + "epoch": 0.04698631933213864, + "grad_norm": 3.109375, + "learning_rate": 9.946584188754504e-05, + "loss": 2.1152, + "step": 1075 + }, + { + "epoch": 0.04703002753616854, + "grad_norm": 2.703125, + "learning_rate": 9.946484009687493e-05, + "loss": 2.0469, + "step": 1076 + }, + { + "epoch": 0.047073735740198436, + "grad_norm": 4.21875, + "learning_rate": 9.946383737273129e-05, + "loss": 2.7682, + "step": 1077 + }, + { + "epoch": 0.04711744394422833, + "grad_norm": 3.90625, + "learning_rate": 9.946283371513305e-05, + "loss": 3.337, + "step": 1078 + }, + { + "epoch": 0.04716115214825823, + "grad_norm": 3.140625, + "learning_rate": 9.946182912409915e-05, + "loss": 2.2733, + "step": 1079 + }, + { + "epoch": 0.047204860352288124, + "grad_norm": 3.578125, + "learning_rate": 9.946082359964855e-05, + "loss": 2.0077, + "step": 1080 + }, + { + "epoch": 0.047248568556318024, + "grad_norm": 3.046875, + "learning_rate": 9.945981714180021e-05, + "loss": 2.0198, + "step": 1081 + }, + { + "epoch": 0.04729227676034792, + "grad_norm": 2.4375, + "learning_rate": 9.945880975057315e-05, + "loss": 1.9136, + "step": 1082 + }, + { + "epoch": 0.04733598496437781, + "grad_norm": 3.8125, + "learning_rate": 9.945780142598636e-05, + "loss": 2.7276, + "step": 1083 + }, + { + "epoch": 0.04737969316840771, + "grad_norm": 3.875, + "learning_rate": 9.945679216805887e-05, + "loss": 2.2242, + "step": 1084 + }, + { + "epoch": 0.047423401372437606, + "grad_norm": 3.0, + "learning_rate": 9.945578197680974e-05, + "loss": 2.5395, + "step": 1085 + }, + { + "epoch": 0.047467109576467506, + "grad_norm": 2.75, + "learning_rate": 9.9454770852258e-05, + "loss": 2.159, + "step": 1086 + }, + { + "epoch": 0.0475108177804974, + "grad_norm": 3.25, + "learning_rate": 9.945375879442277e-05, + "loss": 1.941, + "step": 1087 + }, + { + "epoch": 0.04755452598452729, + "grad_norm": 4.84375, + "learning_rate": 9.945274580332316e-05, + "loss": 2.7804, + "step": 1088 + }, + { + "epoch": 0.047598234188557194, + "grad_norm": 2.96875, + "learning_rate": 9.945173187897823e-05, + "loss": 2.6826, + "step": 1089 + }, + { + "epoch": 0.04764194239258709, + "grad_norm": 3.734375, + "learning_rate": 9.945071702140716e-05, + "loss": 2.7705, + "step": 1090 + }, + { + "epoch": 0.04768565059661699, + "grad_norm": 3.375, + "learning_rate": 9.94497012306291e-05, + "loss": 2.5417, + "step": 1091 + }, + { + "epoch": 0.04772935880064688, + "grad_norm": 4.09375, + "learning_rate": 9.944868450666318e-05, + "loss": 3.0292, + "step": 1092 + }, + { + "epoch": 0.047773067004676775, + "grad_norm": 3.09375, + "learning_rate": 9.944766684952863e-05, + "loss": 2.1641, + "step": 1093 + }, + { + "epoch": 0.047816775208706676, + "grad_norm": 3.53125, + "learning_rate": 9.944664825924463e-05, + "loss": 2.5355, + "step": 1094 + }, + { + "epoch": 0.04786048341273657, + "grad_norm": 3.09375, + "learning_rate": 9.944562873583042e-05, + "loss": 2.6627, + "step": 1095 + }, + { + "epoch": 0.04790419161676647, + "grad_norm": 2.65625, + "learning_rate": 9.944460827930521e-05, + "loss": 2.0701, + "step": 1096 + }, + { + "epoch": 0.047947899820796364, + "grad_norm": 3.671875, + "learning_rate": 9.944358688968831e-05, + "loss": 2.3189, + "step": 1097 + }, + { + "epoch": 0.04799160802482626, + "grad_norm": 3.265625, + "learning_rate": 9.944256456699895e-05, + "loss": 2.0858, + "step": 1098 + }, + { + "epoch": 0.04803531622885616, + "grad_norm": 2.640625, + "learning_rate": 9.944154131125642e-05, + "loss": 2.1744, + "step": 1099 + }, + { + "epoch": 0.04807902443288605, + "grad_norm": 2.9375, + "learning_rate": 9.944051712248009e-05, + "loss": 2.2945, + "step": 1100 + }, + { + "epoch": 0.04812273263691595, + "grad_norm": 4.0, + "learning_rate": 9.943949200068921e-05, + "loss": 3.715, + "step": 1101 + }, + { + "epoch": 0.048166440840945846, + "grad_norm": 2.84375, + "learning_rate": 9.943846594590316e-05, + "loss": 2.1754, + "step": 1102 + }, + { + "epoch": 0.04821014904497574, + "grad_norm": 3.15625, + "learning_rate": 9.943743895814131e-05, + "loss": 2.2889, + "step": 1103 + }, + { + "epoch": 0.04825385724900564, + "grad_norm": 4.1875, + "learning_rate": 9.943641103742303e-05, + "loss": 2.7616, + "step": 1104 + }, + { + "epoch": 0.04829756545303553, + "grad_norm": 3.34375, + "learning_rate": 9.943538218376773e-05, + "loss": 2.4096, + "step": 1105 + }, + { + "epoch": 0.048341273657065434, + "grad_norm": 2.75, + "learning_rate": 9.943435239719482e-05, + "loss": 2.522, + "step": 1106 + }, + { + "epoch": 0.04838498186109533, + "grad_norm": 3.421875, + "learning_rate": 9.943332167772372e-05, + "loss": 2.0895, + "step": 1107 + }, + { + "epoch": 0.04842869006512522, + "grad_norm": 3.640625, + "learning_rate": 9.943229002537391e-05, + "loss": 2.8614, + "step": 1108 + }, + { + "epoch": 0.04847239826915512, + "grad_norm": 3.125, + "learning_rate": 9.943125744016483e-05, + "loss": 2.4255, + "step": 1109 + }, + { + "epoch": 0.048516106473185015, + "grad_norm": 2.90625, + "learning_rate": 9.943022392211599e-05, + "loss": 2.4272, + "step": 1110 + }, + { + "epoch": 0.048559814677214916, + "grad_norm": 3.625, + "learning_rate": 9.942918947124686e-05, + "loss": 3.2011, + "step": 1111 + }, + { + "epoch": 0.04860352288124481, + "grad_norm": 3.46875, + "learning_rate": 9.942815408757699e-05, + "loss": 1.8, + "step": 1112 + }, + { + "epoch": 0.0486472310852747, + "grad_norm": 2.734375, + "learning_rate": 9.942711777112594e-05, + "loss": 2.1547, + "step": 1113 + }, + { + "epoch": 0.048690939289304604, + "grad_norm": 3.0625, + "learning_rate": 9.942608052191321e-05, + "loss": 2.4241, + "step": 1114 + }, + { + "epoch": 0.0487346474933345, + "grad_norm": 3.359375, + "learning_rate": 9.942504233995842e-05, + "loss": 1.774, + "step": 1115 + }, + { + "epoch": 0.0487783556973644, + "grad_norm": 7.46875, + "learning_rate": 9.942400322528114e-05, + "loss": 2.6383, + "step": 1116 + }, + { + "epoch": 0.04882206390139429, + "grad_norm": 2.9375, + "learning_rate": 9.942296317790099e-05, + "loss": 1.941, + "step": 1117 + }, + { + "epoch": 0.048865772105424185, + "grad_norm": 2.796875, + "learning_rate": 9.942192219783758e-05, + "loss": 1.9, + "step": 1118 + }, + { + "epoch": 0.048909480309454086, + "grad_norm": 2.796875, + "learning_rate": 9.942088028511059e-05, + "loss": 2.382, + "step": 1119 + }, + { + "epoch": 0.04895318851348398, + "grad_norm": 2.8125, + "learning_rate": 9.941983743973964e-05, + "loss": 2.1542, + "step": 1120 + }, + { + "epoch": 0.04899689671751388, + "grad_norm": 3.640625, + "learning_rate": 9.941879366174444e-05, + "loss": 2.5014, + "step": 1121 + }, + { + "epoch": 0.04904060492154377, + "grad_norm": 3.0625, + "learning_rate": 9.941774895114467e-05, + "loss": 2.2071, + "step": 1122 + }, + { + "epoch": 0.04908431312557367, + "grad_norm": 3.921875, + "learning_rate": 9.941670330796007e-05, + "loss": 2.6761, + "step": 1123 + }, + { + "epoch": 0.04912802132960357, + "grad_norm": 6.0625, + "learning_rate": 9.941565673221034e-05, + "loss": 2.7923, + "step": 1124 + }, + { + "epoch": 0.04917172953363346, + "grad_norm": 2.953125, + "learning_rate": 9.941460922391526e-05, + "loss": 2.3441, + "step": 1125 + }, + { + "epoch": 0.04921543773766336, + "grad_norm": 3.15625, + "learning_rate": 9.941356078309457e-05, + "loss": 2.1081, + "step": 1126 + }, + { + "epoch": 0.049259145941693255, + "grad_norm": 2.546875, + "learning_rate": 9.941251140976807e-05, + "loss": 1.9443, + "step": 1127 + }, + { + "epoch": 0.04930285414572315, + "grad_norm": 2.671875, + "learning_rate": 9.941146110395557e-05, + "loss": 2.0367, + "step": 1128 + }, + { + "epoch": 0.04934656234975305, + "grad_norm": 3.265625, + "learning_rate": 9.941040986567689e-05, + "loss": 2.5013, + "step": 1129 + }, + { + "epoch": 0.04939027055378294, + "grad_norm": 7.71875, + "learning_rate": 9.940935769495186e-05, + "loss": 1.9259, + "step": 1130 + }, + { + "epoch": 0.049433978757812844, + "grad_norm": 4.6875, + "learning_rate": 9.940830459180033e-05, + "loss": 1.9416, + "step": 1131 + }, + { + "epoch": 0.04947768696184274, + "grad_norm": 5.28125, + "learning_rate": 9.940725055624218e-05, + "loss": 2.3391, + "step": 1132 + }, + { + "epoch": 0.04952139516587263, + "grad_norm": 3.78125, + "learning_rate": 9.940619558829731e-05, + "loss": 2.6132, + "step": 1133 + }, + { + "epoch": 0.04956510336990253, + "grad_norm": 3.015625, + "learning_rate": 9.940513968798563e-05, + "loss": 2.4921, + "step": 1134 + }, + { + "epoch": 0.049608811573932425, + "grad_norm": 2.96875, + "learning_rate": 9.940408285532705e-05, + "loss": 2.0902, + "step": 1135 + }, + { + "epoch": 0.049652519777962326, + "grad_norm": 2.828125, + "learning_rate": 9.940302509034152e-05, + "loss": 2.2618, + "step": 1136 + }, + { + "epoch": 0.04969622798199222, + "grad_norm": 3.578125, + "learning_rate": 9.9401966393049e-05, + "loss": 2.1943, + "step": 1137 + }, + { + "epoch": 0.04973993618602212, + "grad_norm": 2.8125, + "learning_rate": 9.940090676346948e-05, + "loss": 2.2639, + "step": 1138 + }, + { + "epoch": 0.04978364439005201, + "grad_norm": 3.0, + "learning_rate": 9.939984620162295e-05, + "loss": 2.3306, + "step": 1139 + }, + { + "epoch": 0.04982735259408191, + "grad_norm": 5.375, + "learning_rate": 9.939878470752942e-05, + "loss": 1.9926, + "step": 1140 + }, + { + "epoch": 0.04987106079811181, + "grad_norm": 2.96875, + "learning_rate": 9.939772228120893e-05, + "loss": 2.2294, + "step": 1141 + }, + { + "epoch": 0.0499147690021417, + "grad_norm": 2.75, + "learning_rate": 9.939665892268152e-05, + "loss": 1.99, + "step": 1142 + }, + { + "epoch": 0.0499584772061716, + "grad_norm": 3.34375, + "learning_rate": 9.939559463196727e-05, + "loss": 2.5241, + "step": 1143 + }, + { + "epoch": 0.050002185410201495, + "grad_norm": 3.3125, + "learning_rate": 9.939452940908626e-05, + "loss": 2.1868, + "step": 1144 + }, + { + "epoch": 0.05004589361423139, + "grad_norm": 3.890625, + "learning_rate": 9.939346325405858e-05, + "loss": 3.2205, + "step": 1145 + }, + { + "epoch": 0.05008960181826129, + "grad_norm": 5.03125, + "learning_rate": 9.939239616690436e-05, + "loss": 3.0481, + "step": 1146 + }, + { + "epoch": 0.05013331002229118, + "grad_norm": 3.09375, + "learning_rate": 9.939132814764375e-05, + "loss": 2.1522, + "step": 1147 + }, + { + "epoch": 0.050177018226321084, + "grad_norm": 3.171875, + "learning_rate": 9.939025919629688e-05, + "loss": 2.2517, + "step": 1148 + }, + { + "epoch": 0.05022072643035098, + "grad_norm": 3.046875, + "learning_rate": 9.938918931288395e-05, + "loss": 1.72, + "step": 1149 + }, + { + "epoch": 0.05026443463438087, + "grad_norm": 3.125, + "learning_rate": 9.93881184974251e-05, + "loss": 2.559, + "step": 1150 + }, + { + "epoch": 0.05030814283841077, + "grad_norm": 2.59375, + "learning_rate": 9.938704674994062e-05, + "loss": 2.2228, + "step": 1151 + }, + { + "epoch": 0.050351851042440665, + "grad_norm": 3.1875, + "learning_rate": 9.938597407045065e-05, + "loss": 2.6145, + "step": 1152 + }, + { + "epoch": 0.050395559246470566, + "grad_norm": 2.984375, + "learning_rate": 9.93849004589755e-05, + "loss": 2.0967, + "step": 1153 + }, + { + "epoch": 0.05043926745050046, + "grad_norm": 4.125, + "learning_rate": 9.938382591553538e-05, + "loss": 2.0268, + "step": 1154 + }, + { + "epoch": 0.05048297565453035, + "grad_norm": 3.640625, + "learning_rate": 9.938275044015059e-05, + "loss": 2.994, + "step": 1155 + }, + { + "epoch": 0.05052668385856025, + "grad_norm": 3.90625, + "learning_rate": 9.938167403284144e-05, + "loss": 3.0414, + "step": 1156 + }, + { + "epoch": 0.05057039206259015, + "grad_norm": 3.265625, + "learning_rate": 9.938059669362822e-05, + "loss": 2.5288, + "step": 1157 + }, + { + "epoch": 0.05061410026662005, + "grad_norm": 2.640625, + "learning_rate": 9.937951842253127e-05, + "loss": 1.7965, + "step": 1158 + }, + { + "epoch": 0.05065780847064994, + "grad_norm": 3.1875, + "learning_rate": 9.937843921957095e-05, + "loss": 2.2143, + "step": 1159 + }, + { + "epoch": 0.050701516674679835, + "grad_norm": 3.3125, + "learning_rate": 9.937735908476762e-05, + "loss": 2.3242, + "step": 1160 + }, + { + "epoch": 0.050745224878709735, + "grad_norm": 5.21875, + "learning_rate": 9.937627801814165e-05, + "loss": 1.9423, + "step": 1161 + }, + { + "epoch": 0.05078893308273963, + "grad_norm": 3.59375, + "learning_rate": 9.937519601971343e-05, + "loss": 2.6635, + "step": 1162 + }, + { + "epoch": 0.05083264128676953, + "grad_norm": 6.1875, + "learning_rate": 9.937411308950342e-05, + "loss": 2.6874, + "step": 1163 + }, + { + "epoch": 0.05087634949079942, + "grad_norm": 2.6875, + "learning_rate": 9.937302922753203e-05, + "loss": 2.2795, + "step": 1164 + }, + { + "epoch": 0.05092005769482932, + "grad_norm": 3.328125, + "learning_rate": 9.937194443381972e-05, + "loss": 1.9133, + "step": 1165 + }, + { + "epoch": 0.05096376589885922, + "grad_norm": 2.9375, + "learning_rate": 9.937085870838695e-05, + "loss": 1.9703, + "step": 1166 + }, + { + "epoch": 0.05100747410288911, + "grad_norm": 2.53125, + "learning_rate": 9.936977205125424e-05, + "loss": 1.8399, + "step": 1167 + }, + { + "epoch": 0.05105118230691901, + "grad_norm": 3.703125, + "learning_rate": 9.936868446244208e-05, + "loss": 2.7845, + "step": 1168 + }, + { + "epoch": 0.051094890510948905, + "grad_norm": 3.359375, + "learning_rate": 9.936759594197098e-05, + "loss": 2.5411, + "step": 1169 + }, + { + "epoch": 0.0511385987149788, + "grad_norm": 4.1875, + "learning_rate": 9.936650648986148e-05, + "loss": 2.3818, + "step": 1170 + }, + { + "epoch": 0.0511823069190087, + "grad_norm": 3.484375, + "learning_rate": 9.936541610613416e-05, + "loss": 2.4754, + "step": 1171 + }, + { + "epoch": 0.05122601512303859, + "grad_norm": 3.140625, + "learning_rate": 9.936432479080961e-05, + "loss": 1.9356, + "step": 1172 + }, + { + "epoch": 0.051269723327068494, + "grad_norm": 3.65625, + "learning_rate": 9.93632325439084e-05, + "loss": 2.1584, + "step": 1173 + }, + { + "epoch": 0.05131343153109839, + "grad_norm": 2.5, + "learning_rate": 9.936213936545113e-05, + "loss": 2.1896, + "step": 1174 + }, + { + "epoch": 0.05135713973512828, + "grad_norm": 2.671875, + "learning_rate": 9.936104525545846e-05, + "loss": 2.0162, + "step": 1175 + }, + { + "epoch": 0.05140084793915818, + "grad_norm": 2.625, + "learning_rate": 9.935995021395102e-05, + "loss": 2.0853, + "step": 1176 + }, + { + "epoch": 0.051444556143188075, + "grad_norm": 3.078125, + "learning_rate": 9.935885424094948e-05, + "loss": 1.9196, + "step": 1177 + }, + { + "epoch": 0.051488264347217975, + "grad_norm": 2.734375, + "learning_rate": 9.935775733647452e-05, + "loss": 2.0876, + "step": 1178 + }, + { + "epoch": 0.05153197255124787, + "grad_norm": 3.765625, + "learning_rate": 9.935665950054684e-05, + "loss": 2.6171, + "step": 1179 + }, + { + "epoch": 0.05157568075527776, + "grad_norm": 2.875, + "learning_rate": 9.935556073318716e-05, + "loss": 2.4073, + "step": 1180 + }, + { + "epoch": 0.05161938895930766, + "grad_norm": 3.546875, + "learning_rate": 9.935446103441623e-05, + "loss": 2.6037, + "step": 1181 + }, + { + "epoch": 0.05166309716333756, + "grad_norm": 2.796875, + "learning_rate": 9.935336040425478e-05, + "loss": 2.2674, + "step": 1182 + }, + { + "epoch": 0.05170680536736746, + "grad_norm": 8.0625, + "learning_rate": 9.935225884272359e-05, + "loss": 3.6372, + "step": 1183 + }, + { + "epoch": 0.05175051357139735, + "grad_norm": 3.03125, + "learning_rate": 9.935115634984345e-05, + "loss": 2.1184, + "step": 1184 + }, + { + "epoch": 0.051794221775427245, + "grad_norm": 6.875, + "learning_rate": 9.935005292563515e-05, + "loss": 2.7781, + "step": 1185 + }, + { + "epoch": 0.051837929979457145, + "grad_norm": 2.765625, + "learning_rate": 9.934894857011953e-05, + "loss": 2.1325, + "step": 1186 + }, + { + "epoch": 0.05188163818348704, + "grad_norm": 2.640625, + "learning_rate": 9.934784328331743e-05, + "loss": 2.4299, + "step": 1187 + }, + { + "epoch": 0.05192534638751694, + "grad_norm": 3.21875, + "learning_rate": 9.934673706524969e-05, + "loss": 2.3853, + "step": 1188 + }, + { + "epoch": 0.05196905459154683, + "grad_norm": 2.84375, + "learning_rate": 9.934562991593722e-05, + "loss": 1.9624, + "step": 1189 + }, + { + "epoch": 0.05201276279557673, + "grad_norm": 8.625, + "learning_rate": 9.934452183540089e-05, + "loss": 2.052, + "step": 1190 + }, + { + "epoch": 0.05205647099960663, + "grad_norm": 2.96875, + "learning_rate": 9.934341282366162e-05, + "loss": 2.0976, + "step": 1191 + }, + { + "epoch": 0.05210017920363652, + "grad_norm": 2.734375, + "learning_rate": 9.934230288074032e-05, + "loss": 2.397, + "step": 1192 + }, + { + "epoch": 0.05214388740766642, + "grad_norm": 2.828125, + "learning_rate": 9.934119200665795e-05, + "loss": 2.1238, + "step": 1193 + }, + { + "epoch": 0.052187595611696315, + "grad_norm": 3.4375, + "learning_rate": 9.934008020143548e-05, + "loss": 2.1589, + "step": 1194 + }, + { + "epoch": 0.05223130381572621, + "grad_norm": 2.796875, + "learning_rate": 9.933896746509391e-05, + "loss": 2.0372, + "step": 1195 + }, + { + "epoch": 0.05227501201975611, + "grad_norm": 2.8125, + "learning_rate": 9.933785379765417e-05, + "loss": 2.1988, + "step": 1196 + }, + { + "epoch": 0.052318720223786, + "grad_norm": 3.25, + "learning_rate": 9.933673919913735e-05, + "loss": 2.2642, + "step": 1197 + }, + { + "epoch": 0.0523624284278159, + "grad_norm": 2.5625, + "learning_rate": 9.933562366956445e-05, + "loss": 1.705, + "step": 1198 + }, + { + "epoch": 0.0524061366318458, + "grad_norm": 2.8125, + "learning_rate": 9.933450720895651e-05, + "loss": 2.5751, + "step": 1199 + }, + { + "epoch": 0.05244984483587569, + "grad_norm": 2.71875, + "learning_rate": 9.933338981733464e-05, + "loss": 2.3266, + "step": 1200 + }, + { + "epoch": 0.05249355303990559, + "grad_norm": 3.015625, + "learning_rate": 9.933227149471991e-05, + "loss": 2.8926, + "step": 1201 + }, + { + "epoch": 0.052537261243935485, + "grad_norm": 2.6875, + "learning_rate": 9.933115224113338e-05, + "loss": 2.0426, + "step": 1202 + }, + { + "epoch": 0.052580969447965385, + "grad_norm": 2.578125, + "learning_rate": 9.933003205659623e-05, + "loss": 2.0112, + "step": 1203 + }, + { + "epoch": 0.05262467765199528, + "grad_norm": 5.15625, + "learning_rate": 9.93289109411296e-05, + "loss": 1.9949, + "step": 1204 + }, + { + "epoch": 0.05266838585602517, + "grad_norm": 3.09375, + "learning_rate": 9.93277888947546e-05, + "loss": 2.1734, + "step": 1205 + }, + { + "epoch": 0.05271209406005507, + "grad_norm": 3.25, + "learning_rate": 9.932666591749242e-05, + "loss": 3.1198, + "step": 1206 + }, + { + "epoch": 0.05275580226408497, + "grad_norm": 2.84375, + "learning_rate": 9.932554200936429e-05, + "loss": 2.1966, + "step": 1207 + }, + { + "epoch": 0.05279951046811487, + "grad_norm": 3.546875, + "learning_rate": 9.932441717039138e-05, + "loss": 2.2443, + "step": 1208 + }, + { + "epoch": 0.05284321867214476, + "grad_norm": 9.1875, + "learning_rate": 9.932329140059494e-05, + "loss": 2.2764, + "step": 1209 + }, + { + "epoch": 0.05288692687617466, + "grad_norm": 3.375, + "learning_rate": 9.932216469999618e-05, + "loss": 2.5796, + "step": 1210 + }, + { + "epoch": 0.052930635080204555, + "grad_norm": 3.109375, + "learning_rate": 9.93210370686164e-05, + "loss": 2.1995, + "step": 1211 + }, + { + "epoch": 0.05297434328423445, + "grad_norm": 2.984375, + "learning_rate": 9.931990850647688e-05, + "loss": 2.3949, + "step": 1212 + }, + { + "epoch": 0.05301805148826435, + "grad_norm": 2.71875, + "learning_rate": 9.931877901359888e-05, + "loss": 1.9585, + "step": 1213 + }, + { + "epoch": 0.05306175969229424, + "grad_norm": 3.140625, + "learning_rate": 9.931764859000375e-05, + "loss": 2.6056, + "step": 1214 + }, + { + "epoch": 0.05310546789632414, + "grad_norm": 3.015625, + "learning_rate": 9.931651723571282e-05, + "loss": 2.7266, + "step": 1215 + }, + { + "epoch": 0.05314917610035404, + "grad_norm": 3.609375, + "learning_rate": 9.931538495074743e-05, + "loss": 2.2353, + "step": 1216 + }, + { + "epoch": 0.05319288430438393, + "grad_norm": 3.078125, + "learning_rate": 9.931425173512895e-05, + "loss": 2.2246, + "step": 1217 + }, + { + "epoch": 0.05323659250841383, + "grad_norm": 2.5625, + "learning_rate": 9.931311758887877e-05, + "loss": 1.8312, + "step": 1218 + }, + { + "epoch": 0.053280300712443725, + "grad_norm": 2.625, + "learning_rate": 9.931198251201828e-05, + "loss": 1.859, + "step": 1219 + }, + { + "epoch": 0.053324008916473625, + "grad_norm": 3.265625, + "learning_rate": 9.931084650456892e-05, + "loss": 2.4562, + "step": 1220 + }, + { + "epoch": 0.05336771712050352, + "grad_norm": 5.03125, + "learning_rate": 9.930970956655212e-05, + "loss": 3.3246, + "step": 1221 + }, + { + "epoch": 0.05341142532453341, + "grad_norm": 2.828125, + "learning_rate": 9.930857169798931e-05, + "loss": 2.1604, + "step": 1222 + }, + { + "epoch": 0.05345513352856331, + "grad_norm": 3.65625, + "learning_rate": 9.9307432898902e-05, + "loss": 2.7602, + "step": 1223 + }, + { + "epoch": 0.05349884173259321, + "grad_norm": 3.015625, + "learning_rate": 9.930629316931168e-05, + "loss": 2.5866, + "step": 1224 + }, + { + "epoch": 0.05354254993662311, + "grad_norm": 2.609375, + "learning_rate": 9.930515250923984e-05, + "loss": 1.9332, + "step": 1225 + }, + { + "epoch": 0.053586258140653, + "grad_norm": 2.609375, + "learning_rate": 9.9304010918708e-05, + "loss": 1.9111, + "step": 1226 + }, + { + "epoch": 0.053629966344682894, + "grad_norm": 2.859375, + "learning_rate": 9.930286839773773e-05, + "loss": 2.2201, + "step": 1227 + }, + { + "epoch": 0.053673674548712795, + "grad_norm": 2.953125, + "learning_rate": 9.930172494635057e-05, + "loss": 2.6325, + "step": 1228 + }, + { + "epoch": 0.05371738275274269, + "grad_norm": 3.953125, + "learning_rate": 9.93005805645681e-05, + "loss": 2.6803, + "step": 1229 + }, + { + "epoch": 0.05376109095677259, + "grad_norm": 4.34375, + "learning_rate": 9.929943525241194e-05, + "loss": 3.2208, + "step": 1230 + }, + { + "epoch": 0.05380479916080248, + "grad_norm": 2.90625, + "learning_rate": 9.929828900990367e-05, + "loss": 2.3413, + "step": 1231 + }, + { + "epoch": 0.053848507364832376, + "grad_norm": 3.296875, + "learning_rate": 9.929714183706493e-05, + "loss": 2.2428, + "step": 1232 + }, + { + "epoch": 0.05389221556886228, + "grad_norm": 3.984375, + "learning_rate": 9.929599373391738e-05, + "loss": 2.9937, + "step": 1233 + }, + { + "epoch": 0.05393592377289217, + "grad_norm": 2.953125, + "learning_rate": 9.92948447004827e-05, + "loss": 2.2362, + "step": 1234 + }, + { + "epoch": 0.05397963197692207, + "grad_norm": 2.890625, + "learning_rate": 9.929369473678253e-05, + "loss": 3.1684, + "step": 1235 + }, + { + "epoch": 0.054023340180951965, + "grad_norm": 2.671875, + "learning_rate": 9.92925438428386e-05, + "loss": 2.0654, + "step": 1236 + }, + { + "epoch": 0.05406704838498186, + "grad_norm": 2.796875, + "learning_rate": 9.929139201867263e-05, + "loss": 2.3591, + "step": 1237 + }, + { + "epoch": 0.05411075658901176, + "grad_norm": 2.421875, + "learning_rate": 9.929023926430636e-05, + "loss": 2.0845, + "step": 1238 + }, + { + "epoch": 0.05415446479304165, + "grad_norm": 2.75, + "learning_rate": 9.928908557976153e-05, + "loss": 2.1563, + "step": 1239 + }, + { + "epoch": 0.05419817299707155, + "grad_norm": 2.8125, + "learning_rate": 9.928793096505992e-05, + "loss": 1.9936, + "step": 1240 + }, + { + "epoch": 0.05424188120110145, + "grad_norm": 4.21875, + "learning_rate": 9.928677542022331e-05, + "loss": 2.7263, + "step": 1241 + }, + { + "epoch": 0.05428558940513134, + "grad_norm": 2.796875, + "learning_rate": 9.928561894527353e-05, + "loss": 1.5295, + "step": 1242 + }, + { + "epoch": 0.05432929760916124, + "grad_norm": 2.84375, + "learning_rate": 9.928446154023238e-05, + "loss": 2.5526, + "step": 1243 + }, + { + "epoch": 0.054373005813191135, + "grad_norm": 3.0, + "learning_rate": 9.928330320512171e-05, + "loss": 2.418, + "step": 1244 + }, + { + "epoch": 0.054416714017221035, + "grad_norm": 2.71875, + "learning_rate": 9.928214393996339e-05, + "loss": 2.2359, + "step": 1245 + }, + { + "epoch": 0.05446042222125093, + "grad_norm": 2.578125, + "learning_rate": 9.928098374477927e-05, + "loss": 2.3314, + "step": 1246 + }, + { + "epoch": 0.05450413042528082, + "grad_norm": 2.40625, + "learning_rate": 9.927982261959127e-05, + "loss": 1.9215, + "step": 1247 + }, + { + "epoch": 0.05454783862931072, + "grad_norm": 3.40625, + "learning_rate": 9.927866056442128e-05, + "loss": 1.9057, + "step": 1248 + }, + { + "epoch": 0.054591546833340616, + "grad_norm": 3.125, + "learning_rate": 9.927749757929125e-05, + "loss": 2.2604, + "step": 1249 + }, + { + "epoch": 0.05463525503737052, + "grad_norm": 3.71875, + "learning_rate": 9.927633366422314e-05, + "loss": 2.3643, + "step": 1250 + }, + { + "epoch": 0.05467896324140041, + "grad_norm": 4.03125, + "learning_rate": 9.927516881923889e-05, + "loss": 2.77, + "step": 1251 + }, + { + "epoch": 0.054722671445430304, + "grad_norm": 6.625, + "learning_rate": 9.927400304436047e-05, + "loss": 1.8398, + "step": 1252 + }, + { + "epoch": 0.054766379649460205, + "grad_norm": 2.765625, + "learning_rate": 9.92728363396099e-05, + "loss": 2.0013, + "step": 1253 + }, + { + "epoch": 0.0548100878534901, + "grad_norm": 3.125, + "learning_rate": 9.927166870500922e-05, + "loss": 2.3079, + "step": 1254 + }, + { + "epoch": 0.05485379605752, + "grad_norm": 2.875, + "learning_rate": 9.927050014058042e-05, + "loss": 2.1234, + "step": 1255 + }, + { + "epoch": 0.05489750426154989, + "grad_norm": 2.953125, + "learning_rate": 9.926933064634558e-05, + "loss": 2.1321, + "step": 1256 + }, + { + "epoch": 0.054941212465579786, + "grad_norm": 2.984375, + "learning_rate": 9.926816022232675e-05, + "loss": 2.8561, + "step": 1257 + }, + { + "epoch": 0.05498492066960969, + "grad_norm": 3.234375, + "learning_rate": 9.926698886854604e-05, + "loss": 2.4861, + "step": 1258 + }, + { + "epoch": 0.05502862887363958, + "grad_norm": 3.3125, + "learning_rate": 9.926581658502554e-05, + "loss": 2.2771, + "step": 1259 + }, + { + "epoch": 0.05507233707766948, + "grad_norm": 2.734375, + "learning_rate": 9.926464337178738e-05, + "loss": 2.2922, + "step": 1260 + }, + { + "epoch": 0.055116045281699375, + "grad_norm": 3.5, + "learning_rate": 9.92634692288537e-05, + "loss": 2.449, + "step": 1261 + }, + { + "epoch": 0.05515975348572927, + "grad_norm": 3.296875, + "learning_rate": 9.926229415624666e-05, + "loss": 2.2916, + "step": 1262 + }, + { + "epoch": 0.05520346168975917, + "grad_norm": 2.765625, + "learning_rate": 9.926111815398843e-05, + "loss": 2.6731, + "step": 1263 + }, + { + "epoch": 0.05524716989378906, + "grad_norm": 32.25, + "learning_rate": 9.92599412221012e-05, + "loss": 2.2843, + "step": 1264 + }, + { + "epoch": 0.05529087809781896, + "grad_norm": 2.640625, + "learning_rate": 9.925876336060719e-05, + "loss": 1.9131, + "step": 1265 + }, + { + "epoch": 0.055334586301848857, + "grad_norm": 3.421875, + "learning_rate": 9.925758456952862e-05, + "loss": 1.8637, + "step": 1266 + }, + { + "epoch": 0.05537829450587875, + "grad_norm": 3.109375, + "learning_rate": 9.925640484888774e-05, + "loss": 2.5708, + "step": 1267 + }, + { + "epoch": 0.05542200270990865, + "grad_norm": 2.515625, + "learning_rate": 9.925522419870681e-05, + "loss": 2.1795, + "step": 1268 + }, + { + "epoch": 0.055465710913938544, + "grad_norm": 3.1875, + "learning_rate": 9.92540426190081e-05, + "loss": 2.3094, + "step": 1269 + }, + { + "epoch": 0.055509419117968445, + "grad_norm": 2.9375, + "learning_rate": 9.925286010981394e-05, + "loss": 2.4622, + "step": 1270 + }, + { + "epoch": 0.05555312732199834, + "grad_norm": 2.5625, + "learning_rate": 9.925167667114661e-05, + "loss": 1.9271, + "step": 1271 + }, + { + "epoch": 0.05559683552602823, + "grad_norm": 2.765625, + "learning_rate": 9.925049230302846e-05, + "loss": 2.2545, + "step": 1272 + }, + { + "epoch": 0.05564054373005813, + "grad_norm": 3.0625, + "learning_rate": 9.924930700548185e-05, + "loss": 2.7818, + "step": 1273 + }, + { + "epoch": 0.055684251934088026, + "grad_norm": 2.875, + "learning_rate": 9.924812077852913e-05, + "loss": 1.8465, + "step": 1274 + }, + { + "epoch": 0.05572796013811793, + "grad_norm": 3.234375, + "learning_rate": 9.924693362219269e-05, + "loss": 2.0316, + "step": 1275 + }, + { + "epoch": 0.05577166834214782, + "grad_norm": 2.953125, + "learning_rate": 9.924574553649496e-05, + "loss": 1.9583, + "step": 1276 + }, + { + "epoch": 0.05581537654617772, + "grad_norm": 3.546875, + "learning_rate": 9.924455652145831e-05, + "loss": 2.0286, + "step": 1277 + }, + { + "epoch": 0.055859084750207615, + "grad_norm": 3.328125, + "learning_rate": 9.924336657710522e-05, + "loss": 1.9079, + "step": 1278 + }, + { + "epoch": 0.05590279295423751, + "grad_norm": 2.859375, + "learning_rate": 9.924217570345813e-05, + "loss": 2.3216, + "step": 1279 + }, + { + "epoch": 0.05594650115826741, + "grad_norm": 2.4375, + "learning_rate": 9.924098390053951e-05, + "loss": 2.2232, + "step": 1280 + }, + { + "epoch": 0.0559902093622973, + "grad_norm": 2.53125, + "learning_rate": 9.923979116837185e-05, + "loss": 2.5149, + "step": 1281 + }, + { + "epoch": 0.0560339175663272, + "grad_norm": 5.0, + "learning_rate": 9.923859750697768e-05, + "loss": 2.9284, + "step": 1282 + }, + { + "epoch": 0.0560776257703571, + "grad_norm": 24.25, + "learning_rate": 9.923740291637951e-05, + "loss": 0.6024, + "step": 1283 + }, + { + "epoch": 0.05612133397438699, + "grad_norm": 3.421875, + "learning_rate": 9.923620739659989e-05, + "loss": 3.0348, + "step": 1284 + }, + { + "epoch": 0.05616504217841689, + "grad_norm": 2.71875, + "learning_rate": 9.923501094766136e-05, + "loss": 2.269, + "step": 1285 + }, + { + "epoch": 0.056208750382446784, + "grad_norm": 2.890625, + "learning_rate": 9.923381356958654e-05, + "loss": 2.0336, + "step": 1286 + }, + { + "epoch": 0.056252458586476685, + "grad_norm": 4.6875, + "learning_rate": 9.923261526239798e-05, + "loss": 2.3973, + "step": 1287 + }, + { + "epoch": 0.05629616679050658, + "grad_norm": 2.734375, + "learning_rate": 9.923141602611834e-05, + "loss": 2.2979, + "step": 1288 + }, + { + "epoch": 0.05633987499453647, + "grad_norm": 2.515625, + "learning_rate": 9.92302158607702e-05, + "loss": 2.3606, + "step": 1289 + }, + { + "epoch": 0.05638358319856637, + "grad_norm": 4.6875, + "learning_rate": 9.922901476637625e-05, + "loss": 2.5858, + "step": 1290 + }, + { + "epoch": 0.056427291402596266, + "grad_norm": 2.921875, + "learning_rate": 9.922781274295913e-05, + "loss": 2.6504, + "step": 1291 + }, + { + "epoch": 0.05647099960662617, + "grad_norm": 2.78125, + "learning_rate": 9.922660979054155e-05, + "loss": 2.3354, + "step": 1292 + }, + { + "epoch": 0.05651470781065606, + "grad_norm": 2.203125, + "learning_rate": 9.922540590914619e-05, + "loss": 1.7236, + "step": 1293 + }, + { + "epoch": 0.056558416014685954, + "grad_norm": 2.71875, + "learning_rate": 9.922420109879578e-05, + "loss": 1.9761, + "step": 1294 + }, + { + "epoch": 0.056602124218715855, + "grad_norm": 3.0625, + "learning_rate": 9.922299535951305e-05, + "loss": 2.3798, + "step": 1295 + }, + { + "epoch": 0.05664583242274575, + "grad_norm": 2.609375, + "learning_rate": 9.922178869132075e-05, + "loss": 2.1449, + "step": 1296 + }, + { + "epoch": 0.05668954062677565, + "grad_norm": 3.171875, + "learning_rate": 9.922058109424167e-05, + "loss": 2.036, + "step": 1297 + }, + { + "epoch": 0.05673324883080554, + "grad_norm": 3.0, + "learning_rate": 9.921937256829859e-05, + "loss": 2.653, + "step": 1298 + }, + { + "epoch": 0.056776957034835436, + "grad_norm": 3.0625, + "learning_rate": 9.921816311351431e-05, + "loss": 2.4909, + "step": 1299 + }, + { + "epoch": 0.05682066523886534, + "grad_norm": 3.046875, + "learning_rate": 9.921695272991165e-05, + "loss": 2.1007, + "step": 1300 + }, + { + "epoch": 0.05686437344289523, + "grad_norm": 2.40625, + "learning_rate": 9.921574141751346e-05, + "loss": 1.91, + "step": 1301 + }, + { + "epoch": 0.05690808164692513, + "grad_norm": 3.34375, + "learning_rate": 9.921452917634261e-05, + "loss": 2.7349, + "step": 1302 + }, + { + "epoch": 0.056951789850955024, + "grad_norm": 2.84375, + "learning_rate": 9.921331600642196e-05, + "loss": 1.8035, + "step": 1303 + }, + { + "epoch": 0.05699549805498492, + "grad_norm": 3.390625, + "learning_rate": 9.921210190777441e-05, + "loss": 2.4165, + "step": 1304 + }, + { + "epoch": 0.05703920625901482, + "grad_norm": 3.171875, + "learning_rate": 9.921088688042287e-05, + "loss": 2.0328, + "step": 1305 + }, + { + "epoch": 0.05708291446304471, + "grad_norm": 3.171875, + "learning_rate": 9.920967092439027e-05, + "loss": 2.5423, + "step": 1306 + }, + { + "epoch": 0.05712662266707461, + "grad_norm": 6.59375, + "learning_rate": 9.920845403969957e-05, + "loss": 2.742, + "step": 1307 + }, + { + "epoch": 0.057170330871104506, + "grad_norm": 3.046875, + "learning_rate": 9.920723622637371e-05, + "loss": 2.1735, + "step": 1308 + }, + { + "epoch": 0.0572140390751344, + "grad_norm": 2.984375, + "learning_rate": 9.92060174844357e-05, + "loss": 2.5959, + "step": 1309 + }, + { + "epoch": 0.0572577472791643, + "grad_norm": 2.71875, + "learning_rate": 9.920479781390852e-05, + "loss": 2.1141, + "step": 1310 + }, + { + "epoch": 0.057301455483194194, + "grad_norm": 2.421875, + "learning_rate": 9.920357721481518e-05, + "loss": 1.6989, + "step": 1311 + }, + { + "epoch": 0.057345163687224095, + "grad_norm": 2.546875, + "learning_rate": 9.920235568717873e-05, + "loss": 1.9488, + "step": 1312 + }, + { + "epoch": 0.05738887189125399, + "grad_norm": 2.859375, + "learning_rate": 9.920113323102223e-05, + "loss": 2.0413, + "step": 1313 + }, + { + "epoch": 0.05743258009528388, + "grad_norm": 3.03125, + "learning_rate": 9.919990984636871e-05, + "loss": 2.6842, + "step": 1314 + }, + { + "epoch": 0.05747628829931378, + "grad_norm": 4.0625, + "learning_rate": 9.91986855332413e-05, + "loss": 2.3465, + "step": 1315 + }, + { + "epoch": 0.057519996503343676, + "grad_norm": 3.21875, + "learning_rate": 9.919746029166311e-05, + "loss": 2.2191, + "step": 1316 + }, + { + "epoch": 0.05756370470737358, + "grad_norm": 3.46875, + "learning_rate": 9.91962341216572e-05, + "loss": 2.654, + "step": 1317 + }, + { + "epoch": 0.05760741291140347, + "grad_norm": 2.625, + "learning_rate": 9.919500702324677e-05, + "loss": 1.9608, + "step": 1318 + }, + { + "epoch": 0.057651121115433364, + "grad_norm": 2.734375, + "learning_rate": 9.919377899645497e-05, + "loss": 1.9015, + "step": 1319 + }, + { + "epoch": 0.057694829319463264, + "grad_norm": 3.3125, + "learning_rate": 9.919255004130494e-05, + "loss": 2.0449, + "step": 1320 + }, + { + "epoch": 0.05773853752349316, + "grad_norm": 3.03125, + "learning_rate": 9.919132015781991e-05, + "loss": 2.9302, + "step": 1321 + }, + { + "epoch": 0.05778224572752306, + "grad_norm": 2.65625, + "learning_rate": 9.919008934602307e-05, + "loss": 1.5977, + "step": 1322 + }, + { + "epoch": 0.05782595393155295, + "grad_norm": 3.34375, + "learning_rate": 9.918885760593764e-05, + "loss": 2.4274, + "step": 1323 + }, + { + "epoch": 0.057869662135582846, + "grad_norm": 3.265625, + "learning_rate": 9.918762493758689e-05, + "loss": 1.7857, + "step": 1324 + }, + { + "epoch": 0.057913370339612746, + "grad_norm": 2.734375, + "learning_rate": 9.918639134099407e-05, + "loss": 2.3214, + "step": 1325 + }, + { + "epoch": 0.05795707854364264, + "grad_norm": 2.703125, + "learning_rate": 9.918515681618246e-05, + "loss": 2.0117, + "step": 1326 + }, + { + "epoch": 0.05800078674767254, + "grad_norm": 2.890625, + "learning_rate": 9.918392136317533e-05, + "loss": 2.0574, + "step": 1327 + }, + { + "epoch": 0.058044494951702434, + "grad_norm": 3.84375, + "learning_rate": 9.918268498199604e-05, + "loss": 1.8831, + "step": 1328 + }, + { + "epoch": 0.05808820315573233, + "grad_norm": 2.78125, + "learning_rate": 9.918144767266791e-05, + "loss": 1.9005, + "step": 1329 + }, + { + "epoch": 0.05813191135976223, + "grad_norm": 2.53125, + "learning_rate": 9.918020943521427e-05, + "loss": 1.7483, + "step": 1330 + }, + { + "epoch": 0.05817561956379212, + "grad_norm": 2.796875, + "learning_rate": 9.91789702696585e-05, + "loss": 1.9886, + "step": 1331 + }, + { + "epoch": 0.05821932776782202, + "grad_norm": 2.984375, + "learning_rate": 9.917773017602399e-05, + "loss": 2.307, + "step": 1332 + }, + { + "epoch": 0.058263035971851916, + "grad_norm": 2.578125, + "learning_rate": 9.917648915433413e-05, + "loss": 2.0734, + "step": 1333 + }, + { + "epoch": 0.05830674417588181, + "grad_norm": 2.4375, + "learning_rate": 9.917524720461234e-05, + "loss": 2.1241, + "step": 1334 + }, + { + "epoch": 0.05835045237991171, + "grad_norm": 2.640625, + "learning_rate": 9.917400432688208e-05, + "loss": 1.9862, + "step": 1335 + }, + { + "epoch": 0.058394160583941604, + "grad_norm": 2.484375, + "learning_rate": 9.917276052116677e-05, + "loss": 2.1588, + "step": 1336 + }, + { + "epoch": 0.058437868787971504, + "grad_norm": 2.734375, + "learning_rate": 9.917151578748994e-05, + "loss": 2.2892, + "step": 1337 + }, + { + "epoch": 0.0584815769920014, + "grad_norm": 2.40625, + "learning_rate": 9.9170270125875e-05, + "loss": 2.1376, + "step": 1338 + }, + { + "epoch": 0.05852528519603129, + "grad_norm": 3.03125, + "learning_rate": 9.916902353634552e-05, + "loss": 2.2808, + "step": 1339 + }, + { + "epoch": 0.05856899340006119, + "grad_norm": 4.0625, + "learning_rate": 9.916777601892499e-05, + "loss": 1.8809, + "step": 1340 + }, + { + "epoch": 0.058612701604091086, + "grad_norm": 3.953125, + "learning_rate": 9.916652757363698e-05, + "loss": 3.0737, + "step": 1341 + }, + { + "epoch": 0.058656409808120986, + "grad_norm": 2.453125, + "learning_rate": 9.916527820050504e-05, + "loss": 2.0382, + "step": 1342 + }, + { + "epoch": 0.05870011801215088, + "grad_norm": 2.515625, + "learning_rate": 9.916402789955272e-05, + "loss": 2.1752, + "step": 1343 + }, + { + "epoch": 0.05874382621618078, + "grad_norm": 2.765625, + "learning_rate": 9.916277667080365e-05, + "loss": 2.1868, + "step": 1344 + }, + { + "epoch": 0.058787534420210674, + "grad_norm": 2.625, + "learning_rate": 9.916152451428144e-05, + "loss": 1.9418, + "step": 1345 + }, + { + "epoch": 0.05883124262424057, + "grad_norm": 2.4375, + "learning_rate": 9.91602714300097e-05, + "loss": 1.4852, + "step": 1346 + }, + { + "epoch": 0.05887495082827047, + "grad_norm": 2.734375, + "learning_rate": 9.91590174180121e-05, + "loss": 1.6127, + "step": 1347 + }, + { + "epoch": 0.05891865903230036, + "grad_norm": 3.21875, + "learning_rate": 9.915776247831227e-05, + "loss": 2.0565, + "step": 1348 + }, + { + "epoch": 0.05896236723633026, + "grad_norm": 2.890625, + "learning_rate": 9.915650661093395e-05, + "loss": 2.064, + "step": 1349 + }, + { + "epoch": 0.059006075440360156, + "grad_norm": 2.71875, + "learning_rate": 9.915524981590079e-05, + "loss": 1.8135, + "step": 1350 + }, + { + "epoch": 0.05904978364439005, + "grad_norm": 2.59375, + "learning_rate": 9.915399209323652e-05, + "loss": 2.0698, + "step": 1351 + }, + { + "epoch": 0.05909349184841995, + "grad_norm": 2.46875, + "learning_rate": 9.915273344296488e-05, + "loss": 2.1185, + "step": 1352 + }, + { + "epoch": 0.059137200052449844, + "grad_norm": 3.90625, + "learning_rate": 9.915147386510964e-05, + "loss": 2.5517, + "step": 1353 + }, + { + "epoch": 0.059180908256479745, + "grad_norm": 3.15625, + "learning_rate": 9.915021335969452e-05, + "loss": 2.5902, + "step": 1354 + }, + { + "epoch": 0.05922461646050964, + "grad_norm": 3.78125, + "learning_rate": 9.914895192674336e-05, + "loss": 2.0517, + "step": 1355 + }, + { + "epoch": 0.05926832466453953, + "grad_norm": 2.859375, + "learning_rate": 9.914768956627994e-05, + "loss": 2.1344, + "step": 1356 + }, + { + "epoch": 0.05931203286856943, + "grad_norm": 2.546875, + "learning_rate": 9.914642627832808e-05, + "loss": 2.0166, + "step": 1357 + }, + { + "epoch": 0.059355741072599326, + "grad_norm": 3.0, + "learning_rate": 9.914516206291165e-05, + "loss": 1.9574, + "step": 1358 + }, + { + "epoch": 0.059399449276629226, + "grad_norm": 3.5, + "learning_rate": 9.914389692005446e-05, + "loss": 2.3096, + "step": 1359 + }, + { + "epoch": 0.05944315748065912, + "grad_norm": 3.234375, + "learning_rate": 9.914263084978042e-05, + "loss": 1.9114, + "step": 1360 + }, + { + "epoch": 0.059486865684689014, + "grad_norm": 2.453125, + "learning_rate": 9.914136385211341e-05, + "loss": 1.9104, + "step": 1361 + }, + { + "epoch": 0.059530573888718914, + "grad_norm": 3.515625, + "learning_rate": 9.914009592707733e-05, + "loss": 2.0492, + "step": 1362 + }, + { + "epoch": 0.05957428209274881, + "grad_norm": 3.421875, + "learning_rate": 9.913882707469614e-05, + "loss": 2.3943, + "step": 1363 + }, + { + "epoch": 0.05961799029677871, + "grad_norm": 2.78125, + "learning_rate": 9.913755729499376e-05, + "loss": 2.0374, + "step": 1364 + }, + { + "epoch": 0.0596616985008086, + "grad_norm": 2.375, + "learning_rate": 9.913628658799415e-05, + "loss": 2.1342, + "step": 1365 + }, + { + "epoch": 0.059705406704838496, + "grad_norm": 3.6875, + "learning_rate": 9.91350149537213e-05, + "loss": 2.3227, + "step": 1366 + }, + { + "epoch": 0.059749114908868396, + "grad_norm": 3.03125, + "learning_rate": 9.913374239219922e-05, + "loss": 2.3847, + "step": 1367 + }, + { + "epoch": 0.05979282311289829, + "grad_norm": 2.453125, + "learning_rate": 9.913246890345189e-05, + "loss": 1.9194, + "step": 1368 + }, + { + "epoch": 0.05983653131692819, + "grad_norm": 3.125, + "learning_rate": 9.913119448750337e-05, + "loss": 2.4296, + "step": 1369 + }, + { + "epoch": 0.059880239520958084, + "grad_norm": 2.609375, + "learning_rate": 9.912991914437771e-05, + "loss": 1.7472, + "step": 1370 + }, + { + "epoch": 0.05992394772498798, + "grad_norm": 3.21875, + "learning_rate": 9.912864287409896e-05, + "loss": 1.9631, + "step": 1371 + }, + { + "epoch": 0.05996765592901788, + "grad_norm": 3.921875, + "learning_rate": 9.912736567669121e-05, + "loss": 1.94, + "step": 1372 + }, + { + "epoch": 0.06001136413304777, + "grad_norm": 3.859375, + "learning_rate": 9.912608755217859e-05, + "loss": 2.1358, + "step": 1373 + }, + { + "epoch": 0.06005507233707767, + "grad_norm": 2.546875, + "learning_rate": 9.912480850058516e-05, + "loss": 1.6588, + "step": 1374 + }, + { + "epoch": 0.060098780541107566, + "grad_norm": 2.421875, + "learning_rate": 9.912352852193514e-05, + "loss": 2.0139, + "step": 1375 + }, + { + "epoch": 0.06014248874513746, + "grad_norm": 4.125, + "learning_rate": 9.912224761625262e-05, + "loss": 2.7426, + "step": 1376 + }, + { + "epoch": 0.06018619694916736, + "grad_norm": 3.46875, + "learning_rate": 9.912096578356179e-05, + "loss": 2.7015, + "step": 1377 + }, + { + "epoch": 0.060229905153197254, + "grad_norm": 3.09375, + "learning_rate": 9.911968302388685e-05, + "loss": 2.041, + "step": 1378 + }, + { + "epoch": 0.060273613357227154, + "grad_norm": 2.96875, + "learning_rate": 9.9118399337252e-05, + "loss": 2.2014, + "step": 1379 + }, + { + "epoch": 0.06031732156125705, + "grad_norm": 3.21875, + "learning_rate": 9.911711472368148e-05, + "loss": 1.9344, + "step": 1380 + }, + { + "epoch": 0.06036102976528694, + "grad_norm": 2.640625, + "learning_rate": 9.91158291831995e-05, + "loss": 1.7271, + "step": 1381 + }, + { + "epoch": 0.06040473796931684, + "grad_norm": 7.03125, + "learning_rate": 9.911454271583034e-05, + "loss": 2.765, + "step": 1382 + }, + { + "epoch": 0.060448446173346736, + "grad_norm": 3.109375, + "learning_rate": 9.911325532159828e-05, + "loss": 2.8126, + "step": 1383 + }, + { + "epoch": 0.060492154377376636, + "grad_norm": 2.625, + "learning_rate": 9.91119670005276e-05, + "loss": 2.4261, + "step": 1384 + }, + { + "epoch": 0.06053586258140653, + "grad_norm": 2.40625, + "learning_rate": 9.911067775264264e-05, + "loss": 1.7767, + "step": 1385 + }, + { + "epoch": 0.060579570785436423, + "grad_norm": 3.3125, + "learning_rate": 9.91093875779677e-05, + "loss": 2.9243, + "step": 1386 + }, + { + "epoch": 0.060623278989466324, + "grad_norm": 4.0, + "learning_rate": 9.910809647652715e-05, + "loss": 3.1425, + "step": 1387 + }, + { + "epoch": 0.06066698719349622, + "grad_norm": 3.6875, + "learning_rate": 9.910680444834535e-05, + "loss": 1.9814, + "step": 1388 + }, + { + "epoch": 0.06071069539752612, + "grad_norm": 3.734375, + "learning_rate": 9.910551149344669e-05, + "loss": 2.4808, + "step": 1389 + }, + { + "epoch": 0.06075440360155601, + "grad_norm": 2.71875, + "learning_rate": 9.910421761185553e-05, + "loss": 1.9118, + "step": 1390 + }, + { + "epoch": 0.060798111805585905, + "grad_norm": 2.65625, + "learning_rate": 9.910292280359631e-05, + "loss": 2.3266, + "step": 1391 + }, + { + "epoch": 0.060841820009615806, + "grad_norm": 3.03125, + "learning_rate": 9.91016270686935e-05, + "loss": 2.2481, + "step": 1392 + }, + { + "epoch": 0.0608855282136457, + "grad_norm": 2.515625, + "learning_rate": 9.910033040717152e-05, + "loss": 1.938, + "step": 1393 + }, + { + "epoch": 0.0609292364176756, + "grad_norm": 3.25, + "learning_rate": 9.909903281905484e-05, + "loss": 2.3012, + "step": 1394 + }, + { + "epoch": 0.060972944621705494, + "grad_norm": 3.234375, + "learning_rate": 9.909773430436794e-05, + "loss": 2.2996, + "step": 1395 + }, + { + "epoch": 0.06101665282573539, + "grad_norm": 2.53125, + "learning_rate": 9.909643486313533e-05, + "loss": 2.4493, + "step": 1396 + }, + { + "epoch": 0.06106036102976529, + "grad_norm": 2.859375, + "learning_rate": 9.909513449538156e-05, + "loss": 2.1811, + "step": 1397 + }, + { + "epoch": 0.06110406923379518, + "grad_norm": 2.421875, + "learning_rate": 9.909383320113113e-05, + "loss": 1.8874, + "step": 1398 + }, + { + "epoch": 0.06114777743782508, + "grad_norm": 2.828125, + "learning_rate": 9.909253098040863e-05, + "loss": 2.762, + "step": 1399 + }, + { + "epoch": 0.061191485641854976, + "grad_norm": 2.609375, + "learning_rate": 9.909122783323861e-05, + "loss": 2.0106, + "step": 1400 + }, + { + "epoch": 0.06123519384588487, + "grad_norm": 3.5, + "learning_rate": 9.908992375964568e-05, + "loss": 1.8271, + "step": 1401 + }, + { + "epoch": 0.06127890204991477, + "grad_norm": 3.8125, + "learning_rate": 9.908861875965443e-05, + "loss": 2.2767, + "step": 1402 + }, + { + "epoch": 0.061322610253944664, + "grad_norm": 4.4375, + "learning_rate": 9.908731283328949e-05, + "loss": 2.3544, + "step": 1403 + }, + { + "epoch": 0.061366318457974564, + "grad_norm": 2.296875, + "learning_rate": 9.908600598057554e-05, + "loss": 1.7036, + "step": 1404 + }, + { + "epoch": 0.06141002666200446, + "grad_norm": 2.65625, + "learning_rate": 9.90846982015372e-05, + "loss": 2.2406, + "step": 1405 + }, + { + "epoch": 0.06145373486603435, + "grad_norm": 3.78125, + "learning_rate": 9.908338949619917e-05, + "loss": 2.2038, + "step": 1406 + }, + { + "epoch": 0.06149744307006425, + "grad_norm": 2.578125, + "learning_rate": 9.908207986458613e-05, + "loss": 2.0234, + "step": 1407 + }, + { + "epoch": 0.061541151274094145, + "grad_norm": 3.046875, + "learning_rate": 9.908076930672282e-05, + "loss": 2.0903, + "step": 1408 + }, + { + "epoch": 0.061584859478124046, + "grad_norm": 2.796875, + "learning_rate": 9.907945782263396e-05, + "loss": 2.2869, + "step": 1409 + }, + { + "epoch": 0.06162856768215394, + "grad_norm": 2.90625, + "learning_rate": 9.907814541234429e-05, + "loss": 2.3249, + "step": 1410 + }, + { + "epoch": 0.06167227588618383, + "grad_norm": 2.578125, + "learning_rate": 9.907683207587859e-05, + "loss": 2.0903, + "step": 1411 + }, + { + "epoch": 0.061715984090213734, + "grad_norm": 2.828125, + "learning_rate": 9.907551781326165e-05, + "loss": 1.7549, + "step": 1412 + }, + { + "epoch": 0.06175969229424363, + "grad_norm": 2.65625, + "learning_rate": 9.907420262451826e-05, + "loss": 2.3239, + "step": 1413 + }, + { + "epoch": 0.06180340049827353, + "grad_norm": 2.734375, + "learning_rate": 9.907288650967324e-05, + "loss": 2.0301, + "step": 1414 + }, + { + "epoch": 0.06184710870230342, + "grad_norm": 2.28125, + "learning_rate": 9.907156946875142e-05, + "loss": 1.8346, + "step": 1415 + }, + { + "epoch": 0.06189081690633332, + "grad_norm": 3.09375, + "learning_rate": 9.907025150177768e-05, + "loss": 2.1143, + "step": 1416 + }, + { + "epoch": 0.061934525110363216, + "grad_norm": 3.015625, + "learning_rate": 9.906893260877686e-05, + "loss": 2.0177, + "step": 1417 + }, + { + "epoch": 0.06197823331439311, + "grad_norm": 3.140625, + "learning_rate": 9.906761278977387e-05, + "loss": 3.0162, + "step": 1418 + }, + { + "epoch": 0.06202194151842301, + "grad_norm": 3.046875, + "learning_rate": 9.906629204479362e-05, + "loss": 2.8375, + "step": 1419 + }, + { + "epoch": 0.062065649722452904, + "grad_norm": 4.1875, + "learning_rate": 9.906497037386102e-05, + "loss": 1.6367, + "step": 1420 + }, + { + "epoch": 0.062109357926482804, + "grad_norm": 3.421875, + "learning_rate": 9.906364777700104e-05, + "loss": 2.1294, + "step": 1421 + }, + { + "epoch": 0.0621530661305127, + "grad_norm": 4.0625, + "learning_rate": 9.906232425423858e-05, + "loss": 2.181, + "step": 1422 + }, + { + "epoch": 0.06219677433454259, + "grad_norm": 2.78125, + "learning_rate": 9.906099980559868e-05, + "loss": 1.8974, + "step": 1423 + }, + { + "epoch": 0.06224048253857249, + "grad_norm": 3.828125, + "learning_rate": 9.90596744311063e-05, + "loss": 2.1287, + "step": 1424 + }, + { + "epoch": 0.062284190742602386, + "grad_norm": 2.421875, + "learning_rate": 9.905834813078646e-05, + "loss": 2.0806, + "step": 1425 + }, + { + "epoch": 0.062327898946632286, + "grad_norm": 3.0625, + "learning_rate": 9.905702090466419e-05, + "loss": 2.4359, + "step": 1426 + }, + { + "epoch": 0.06237160715066218, + "grad_norm": 3.5, + "learning_rate": 9.905569275276454e-05, + "loss": 2.2533, + "step": 1427 + }, + { + "epoch": 0.06241531535469207, + "grad_norm": 3.390625, + "learning_rate": 9.905436367511256e-05, + "loss": 1.8447, + "step": 1428 + }, + { + "epoch": 0.062459023558721974, + "grad_norm": 2.671875, + "learning_rate": 9.905303367173336e-05, + "loss": 2.7585, + "step": 1429 + }, + { + "epoch": 0.06250273176275187, + "grad_norm": 2.671875, + "learning_rate": 9.9051702742652e-05, + "loss": 1.7898, + "step": 1430 + }, + { + "epoch": 0.06254643996678176, + "grad_norm": 3.59375, + "learning_rate": 9.905037088789363e-05, + "loss": 2.5768, + "step": 1431 + }, + { + "epoch": 0.06259014817081165, + "grad_norm": 3.34375, + "learning_rate": 9.904903810748339e-05, + "loss": 2.9688, + "step": 1432 + }, + { + "epoch": 0.06263385637484156, + "grad_norm": 2.53125, + "learning_rate": 9.904770440144638e-05, + "loss": 1.9799, + "step": 1433 + }, + { + "epoch": 0.06267756457887146, + "grad_norm": 3.125, + "learning_rate": 9.904636976980782e-05, + "loss": 2.2924, + "step": 1434 + }, + { + "epoch": 0.06272127278290135, + "grad_norm": 2.71875, + "learning_rate": 9.904503421259288e-05, + "loss": 2.0099, + "step": 1435 + }, + { + "epoch": 0.06276498098693124, + "grad_norm": 2.671875, + "learning_rate": 9.904369772982676e-05, + "loss": 2.7724, + "step": 1436 + }, + { + "epoch": 0.06280868919096114, + "grad_norm": 5.03125, + "learning_rate": 9.904236032153469e-05, + "loss": 2.749, + "step": 1437 + }, + { + "epoch": 0.06285239739499104, + "grad_norm": 2.65625, + "learning_rate": 9.904102198774188e-05, + "loss": 1.9447, + "step": 1438 + }, + { + "epoch": 0.06289610559902094, + "grad_norm": 3.8125, + "learning_rate": 9.903968272847363e-05, + "loss": 2.5455, + "step": 1439 + }, + { + "epoch": 0.06293981380305083, + "grad_norm": 3.609375, + "learning_rate": 9.90383425437552e-05, + "loss": 2.74, + "step": 1440 + }, + { + "epoch": 0.06298352200708073, + "grad_norm": 2.5, + "learning_rate": 9.903700143361185e-05, + "loss": 1.8687, + "step": 1441 + }, + { + "epoch": 0.06302723021111062, + "grad_norm": 2.421875, + "learning_rate": 9.903565939806893e-05, + "loss": 1.8802, + "step": 1442 + }, + { + "epoch": 0.06307093841514053, + "grad_norm": 2.5, + "learning_rate": 9.903431643715175e-05, + "loss": 1.7479, + "step": 1443 + }, + { + "epoch": 0.06311464661917042, + "grad_norm": 2.734375, + "learning_rate": 9.903297255088563e-05, + "loss": 2.1058, + "step": 1444 + }, + { + "epoch": 0.06315835482320031, + "grad_norm": 3.515625, + "learning_rate": 9.903162773929599e-05, + "loss": 2.3713, + "step": 1445 + }, + { + "epoch": 0.06320206302723021, + "grad_norm": 3.015625, + "learning_rate": 9.903028200240815e-05, + "loss": 2.5098, + "step": 1446 + }, + { + "epoch": 0.0632457712312601, + "grad_norm": 2.84375, + "learning_rate": 9.902893534024753e-05, + "loss": 2.1725, + "step": 1447 + }, + { + "epoch": 0.06328947943529001, + "grad_norm": 2.828125, + "learning_rate": 9.902758775283955e-05, + "loss": 2.1446, + "step": 1448 + }, + { + "epoch": 0.0633331876393199, + "grad_norm": 2.640625, + "learning_rate": 9.902623924020962e-05, + "loss": 1.8664, + "step": 1449 + }, + { + "epoch": 0.0633768958433498, + "grad_norm": 2.890625, + "learning_rate": 9.902488980238322e-05, + "loss": 2.2319, + "step": 1450 + }, + { + "epoch": 0.06342060404737969, + "grad_norm": 2.5, + "learning_rate": 9.902353943938578e-05, + "loss": 1.9501, + "step": 1451 + }, + { + "epoch": 0.0634643122514096, + "grad_norm": 2.671875, + "learning_rate": 9.90221881512428e-05, + "loss": 1.8332, + "step": 1452 + }, + { + "epoch": 0.06350802045543949, + "grad_norm": 3.078125, + "learning_rate": 9.902083593797979e-05, + "loss": 2.0205, + "step": 1453 + }, + { + "epoch": 0.06355172865946938, + "grad_norm": 3.109375, + "learning_rate": 9.901948279962226e-05, + "loss": 2.2535, + "step": 1454 + }, + { + "epoch": 0.06359543686349928, + "grad_norm": 3.625, + "learning_rate": 9.901812873619574e-05, + "loss": 2.2449, + "step": 1455 + }, + { + "epoch": 0.06363914506752917, + "grad_norm": 4.03125, + "learning_rate": 9.901677374772579e-05, + "loss": 2.5408, + "step": 1456 + }, + { + "epoch": 0.06368285327155908, + "grad_norm": 3.453125, + "learning_rate": 9.901541783423798e-05, + "loss": 2.0815, + "step": 1457 + }, + { + "epoch": 0.06372656147558897, + "grad_norm": 2.53125, + "learning_rate": 9.90140609957579e-05, + "loss": 2.0225, + "step": 1458 + }, + { + "epoch": 0.06377026967961887, + "grad_norm": 2.390625, + "learning_rate": 9.901270323231115e-05, + "loss": 1.7129, + "step": 1459 + }, + { + "epoch": 0.06381397788364876, + "grad_norm": 3.734375, + "learning_rate": 9.901134454392334e-05, + "loss": 2.7482, + "step": 1460 + }, + { + "epoch": 0.06385768608767865, + "grad_norm": 2.796875, + "learning_rate": 9.900998493062015e-05, + "loss": 2.3469, + "step": 1461 + }, + { + "epoch": 0.06390139429170856, + "grad_norm": 2.96875, + "learning_rate": 9.900862439242719e-05, + "loss": 1.9815, + "step": 1462 + }, + { + "epoch": 0.06394510249573845, + "grad_norm": 3.03125, + "learning_rate": 9.900726292937018e-05, + "loss": 2.0003, + "step": 1463 + }, + { + "epoch": 0.06398881069976835, + "grad_norm": 3.21875, + "learning_rate": 9.900590054147478e-05, + "loss": 2.2793, + "step": 1464 + }, + { + "epoch": 0.06403251890379824, + "grad_norm": 2.734375, + "learning_rate": 9.900453722876672e-05, + "loss": 1.8329, + "step": 1465 + }, + { + "epoch": 0.06407622710782813, + "grad_norm": 2.75, + "learning_rate": 9.900317299127171e-05, + "loss": 1.8097, + "step": 1466 + }, + { + "epoch": 0.06411993531185804, + "grad_norm": 3.5, + "learning_rate": 9.900180782901551e-05, + "loss": 2.5161, + "step": 1467 + }, + { + "epoch": 0.06416364351588794, + "grad_norm": 2.8125, + "learning_rate": 9.900044174202388e-05, + "loss": 1.8867, + "step": 1468 + }, + { + "epoch": 0.06420735171991783, + "grad_norm": 3.328125, + "learning_rate": 9.899907473032259e-05, + "loss": 2.3157, + "step": 1469 + }, + { + "epoch": 0.06425105992394772, + "grad_norm": 2.28125, + "learning_rate": 9.899770679393747e-05, + "loss": 1.8642, + "step": 1470 + }, + { + "epoch": 0.06429476812797762, + "grad_norm": 2.46875, + "learning_rate": 9.899633793289427e-05, + "loss": 1.941, + "step": 1471 + }, + { + "epoch": 0.06433847633200752, + "grad_norm": 4.28125, + "learning_rate": 9.89949681472189e-05, + "loss": 2.7477, + "step": 1472 + }, + { + "epoch": 0.06438218453603742, + "grad_norm": 2.796875, + "learning_rate": 9.899359743693714e-05, + "loss": 2.2268, + "step": 1473 + }, + { + "epoch": 0.06442589274006731, + "grad_norm": 2.921875, + "learning_rate": 9.899222580207492e-05, + "loss": 2.1666, + "step": 1474 + }, + { + "epoch": 0.0644696009440972, + "grad_norm": 3.4375, + "learning_rate": 9.899085324265807e-05, + "loss": 2.0582, + "step": 1475 + }, + { + "epoch": 0.0645133091481271, + "grad_norm": 2.515625, + "learning_rate": 9.898947975871253e-05, + "loss": 1.966, + "step": 1476 + }, + { + "epoch": 0.064557017352157, + "grad_norm": 3.015625, + "learning_rate": 9.89881053502642e-05, + "loss": 2.102, + "step": 1477 + }, + { + "epoch": 0.0646007255561869, + "grad_norm": 2.484375, + "learning_rate": 9.898673001733902e-05, + "loss": 2.1326, + "step": 1478 + }, + { + "epoch": 0.0646444337602168, + "grad_norm": 3.203125, + "learning_rate": 9.898535375996296e-05, + "loss": 1.7869, + "step": 1479 + }, + { + "epoch": 0.06468814196424669, + "grad_norm": 2.578125, + "learning_rate": 9.898397657816198e-05, + "loss": 2.1237, + "step": 1480 + }, + { + "epoch": 0.06473185016827658, + "grad_norm": 2.953125, + "learning_rate": 9.898259847196205e-05, + "loss": 2.1493, + "step": 1481 + }, + { + "epoch": 0.06477555837230649, + "grad_norm": 3.65625, + "learning_rate": 9.89812194413892e-05, + "loss": 2.0955, + "step": 1482 + }, + { + "epoch": 0.06481926657633638, + "grad_norm": 2.875, + "learning_rate": 9.897983948646948e-05, + "loss": 1.7992, + "step": 1483 + }, + { + "epoch": 0.06486297478036628, + "grad_norm": 2.71875, + "learning_rate": 9.897845860722888e-05, + "loss": 2.1167, + "step": 1484 + }, + { + "epoch": 0.06490668298439617, + "grad_norm": 3.03125, + "learning_rate": 9.897707680369348e-05, + "loss": 2.0633, + "step": 1485 + }, + { + "epoch": 0.06495039118842606, + "grad_norm": 5.28125, + "learning_rate": 9.897569407588935e-05, + "loss": 3.077, + "step": 1486 + }, + { + "epoch": 0.06499409939245597, + "grad_norm": 3.25, + "learning_rate": 9.897431042384261e-05, + "loss": 2.4853, + "step": 1487 + }, + { + "epoch": 0.06503780759648586, + "grad_norm": 3.03125, + "learning_rate": 9.897292584757934e-05, + "loss": 2.4062, + "step": 1488 + }, + { + "epoch": 0.06508151580051576, + "grad_norm": 2.484375, + "learning_rate": 9.897154034712568e-05, + "loss": 1.9607, + "step": 1489 + }, + { + "epoch": 0.06512522400454565, + "grad_norm": 2.984375, + "learning_rate": 9.897015392250779e-05, + "loss": 2.6984, + "step": 1490 + }, + { + "epoch": 0.06516893220857554, + "grad_norm": 3.0, + "learning_rate": 9.896876657375183e-05, + "loss": 2.8004, + "step": 1491 + }, + { + "epoch": 0.06521264041260545, + "grad_norm": 3.75, + "learning_rate": 9.896737830088396e-05, + "loss": 1.97, + "step": 1492 + }, + { + "epoch": 0.06525634861663535, + "grad_norm": 4.625, + "learning_rate": 9.89659891039304e-05, + "loss": 2.434, + "step": 1493 + }, + { + "epoch": 0.06530005682066524, + "grad_norm": 2.421875, + "learning_rate": 9.896459898291734e-05, + "loss": 1.8298, + "step": 1494 + }, + { + "epoch": 0.06534376502469513, + "grad_norm": 2.671875, + "learning_rate": 9.896320793787106e-05, + "loss": 2.0294, + "step": 1495 + }, + { + "epoch": 0.06538747322872503, + "grad_norm": 4.875, + "learning_rate": 9.896181596881777e-05, + "loss": 1.8582, + "step": 1496 + }, + { + "epoch": 0.06543118143275493, + "grad_norm": 3.0, + "learning_rate": 9.896042307578376e-05, + "loss": 2.4271, + "step": 1497 + }, + { + "epoch": 0.06547488963678483, + "grad_norm": 3.109375, + "learning_rate": 9.89590292587953e-05, + "loss": 2.879, + "step": 1498 + }, + { + "epoch": 0.06551859784081472, + "grad_norm": 2.875, + "learning_rate": 9.895763451787869e-05, + "loss": 2.1706, + "step": 1499 + }, + { + "epoch": 0.06556230604484461, + "grad_norm": 2.90625, + "learning_rate": 9.895623885306029e-05, + "loss": 1.8151, + "step": 1500 + }, + { + "epoch": 0.06560601424887451, + "grad_norm": 3.203125, + "learning_rate": 9.89548422643664e-05, + "loss": 2.4741, + "step": 1501 + }, + { + "epoch": 0.06564972245290442, + "grad_norm": 2.671875, + "learning_rate": 9.895344475182338e-05, + "loss": 1.8052, + "step": 1502 + }, + { + "epoch": 0.06569343065693431, + "grad_norm": 3.625, + "learning_rate": 9.89520463154576e-05, + "loss": 2.1158, + "step": 1503 + }, + { + "epoch": 0.0657371388609642, + "grad_norm": 3.515625, + "learning_rate": 9.895064695529548e-05, + "loss": 1.9689, + "step": 1504 + }, + { + "epoch": 0.0657808470649941, + "grad_norm": 4.0, + "learning_rate": 9.89492466713634e-05, + "loss": 2.3131, + "step": 1505 + }, + { + "epoch": 0.06582455526902399, + "grad_norm": 2.984375, + "learning_rate": 9.894784546368779e-05, + "loss": 2.4752, + "step": 1506 + }, + { + "epoch": 0.0658682634730539, + "grad_norm": 3.125, + "learning_rate": 9.894644333229511e-05, + "loss": 2.025, + "step": 1507 + }, + { + "epoch": 0.06591197167708379, + "grad_norm": 2.65625, + "learning_rate": 9.894504027721179e-05, + "loss": 2.1173, + "step": 1508 + }, + { + "epoch": 0.06595567988111369, + "grad_norm": 2.984375, + "learning_rate": 9.894363629846432e-05, + "loss": 2.4434, + "step": 1509 + }, + { + "epoch": 0.06599938808514358, + "grad_norm": 2.640625, + "learning_rate": 9.894223139607921e-05, + "loss": 1.9026, + "step": 1510 + }, + { + "epoch": 0.06604309628917347, + "grad_norm": 2.953125, + "learning_rate": 9.894082557008296e-05, + "loss": 1.9825, + "step": 1511 + }, + { + "epoch": 0.06608680449320338, + "grad_norm": 2.890625, + "learning_rate": 9.893941882050209e-05, + "loss": 2.8117, + "step": 1512 + }, + { + "epoch": 0.06613051269723327, + "grad_norm": 2.90625, + "learning_rate": 9.893801114736318e-05, + "loss": 2.2047, + "step": 1513 + }, + { + "epoch": 0.06617422090126317, + "grad_norm": 5.40625, + "learning_rate": 9.893660255069275e-05, + "loss": 2.2197, + "step": 1514 + }, + { + "epoch": 0.06621792910529306, + "grad_norm": 3.1875, + "learning_rate": 9.893519303051742e-05, + "loss": 2.2841, + "step": 1515 + }, + { + "epoch": 0.06626163730932295, + "grad_norm": 6.5625, + "learning_rate": 9.893378258686377e-05, + "loss": 2.3869, + "step": 1516 + }, + { + "epoch": 0.06630534551335286, + "grad_norm": 2.859375, + "learning_rate": 9.893237121975843e-05, + "loss": 2.418, + "step": 1517 + }, + { + "epoch": 0.06634905371738276, + "grad_norm": 3.21875, + "learning_rate": 9.893095892922803e-05, + "loss": 2.641, + "step": 1518 + }, + { + "epoch": 0.06639276192141265, + "grad_norm": 7.0, + "learning_rate": 9.89295457152992e-05, + "loss": 2.23, + "step": 1519 + }, + { + "epoch": 0.06643647012544254, + "grad_norm": 2.609375, + "learning_rate": 9.892813157799864e-05, + "loss": 1.9, + "step": 1520 + }, + { + "epoch": 0.06648017832947244, + "grad_norm": 3.71875, + "learning_rate": 9.892671651735304e-05, + "loss": 2.3359, + "step": 1521 + }, + { + "epoch": 0.06652388653350234, + "grad_norm": 3.09375, + "learning_rate": 9.892530053338909e-05, + "loss": 2.4135, + "step": 1522 + }, + { + "epoch": 0.06656759473753224, + "grad_norm": 2.921875, + "learning_rate": 9.89238836261335e-05, + "loss": 2.7742, + "step": 1523 + }, + { + "epoch": 0.06661130294156213, + "grad_norm": 2.53125, + "learning_rate": 9.892246579561302e-05, + "loss": 1.8957, + "step": 1524 + }, + { + "epoch": 0.06665501114559202, + "grad_norm": 2.859375, + "learning_rate": 9.89210470418544e-05, + "loss": 2.5296, + "step": 1525 + }, + { + "epoch": 0.06669871934962192, + "grad_norm": 2.6875, + "learning_rate": 9.891962736488443e-05, + "loss": 2.2513, + "step": 1526 + }, + { + "epoch": 0.06674242755365183, + "grad_norm": 2.734375, + "learning_rate": 9.89182067647299e-05, + "loss": 2.5814, + "step": 1527 + }, + { + "epoch": 0.06678613575768172, + "grad_norm": 2.234375, + "learning_rate": 9.891678524141758e-05, + "loss": 1.811, + "step": 1528 + }, + { + "epoch": 0.06682984396171161, + "grad_norm": 2.921875, + "learning_rate": 9.891536279497436e-05, + "loss": 2.0597, + "step": 1529 + }, + { + "epoch": 0.0668735521657415, + "grad_norm": 3.765625, + "learning_rate": 9.891393942542704e-05, + "loss": 2.3774, + "step": 1530 + }, + { + "epoch": 0.0669172603697714, + "grad_norm": 3.671875, + "learning_rate": 9.891251513280248e-05, + "loss": 2.5181, + "step": 1531 + }, + { + "epoch": 0.06696096857380131, + "grad_norm": 2.375, + "learning_rate": 9.891108991712759e-05, + "loss": 2.0023, + "step": 1532 + }, + { + "epoch": 0.0670046767778312, + "grad_norm": 2.65625, + "learning_rate": 9.890966377842925e-05, + "loss": 1.9709, + "step": 1533 + }, + { + "epoch": 0.0670483849818611, + "grad_norm": 2.53125, + "learning_rate": 9.890823671673436e-05, + "loss": 2.2626, + "step": 1534 + }, + { + "epoch": 0.06709209318589099, + "grad_norm": 2.453125, + "learning_rate": 9.890680873206986e-05, + "loss": 2.0042, + "step": 1535 + }, + { + "epoch": 0.06713580138992088, + "grad_norm": 2.65625, + "learning_rate": 9.89053798244627e-05, + "loss": 2.0635, + "step": 1536 + }, + { + "epoch": 0.06717950959395079, + "grad_norm": 4.0625, + "learning_rate": 9.890394999393984e-05, + "loss": 2.7742, + "step": 1537 + }, + { + "epoch": 0.06722321779798068, + "grad_norm": 2.671875, + "learning_rate": 9.890251924052827e-05, + "loss": 2.0707, + "step": 1538 + }, + { + "epoch": 0.06726692600201058, + "grad_norm": 2.796875, + "learning_rate": 9.8901087564255e-05, + "loss": 2.774, + "step": 1539 + }, + { + "epoch": 0.06731063420604047, + "grad_norm": 3.546875, + "learning_rate": 9.889965496514702e-05, + "loss": 3.196, + "step": 1540 + }, + { + "epoch": 0.06735434241007036, + "grad_norm": 3.0625, + "learning_rate": 9.889822144323137e-05, + "loss": 2.3068, + "step": 1541 + }, + { + "epoch": 0.06739805061410027, + "grad_norm": 2.65625, + "learning_rate": 9.889678699853514e-05, + "loss": 2.4428, + "step": 1542 + }, + { + "epoch": 0.06744175881813017, + "grad_norm": 2.5, + "learning_rate": 9.889535163108537e-05, + "loss": 2.0466, + "step": 1543 + }, + { + "epoch": 0.06748546702216006, + "grad_norm": 2.359375, + "learning_rate": 9.889391534090912e-05, + "loss": 1.907, + "step": 1544 + }, + { + "epoch": 0.06752917522618995, + "grad_norm": 12.0625, + "learning_rate": 9.889247812803356e-05, + "loss": 6.6027, + "step": 1545 + }, + { + "epoch": 0.06757288343021985, + "grad_norm": 3.125, + "learning_rate": 9.889103999248576e-05, + "loss": 2.2651, + "step": 1546 + }, + { + "epoch": 0.06761659163424975, + "grad_norm": 2.828125, + "learning_rate": 9.88896009342929e-05, + "loss": 1.9441, + "step": 1547 + }, + { + "epoch": 0.06766029983827965, + "grad_norm": 3.09375, + "learning_rate": 9.888816095348209e-05, + "loss": 2.6584, + "step": 1548 + }, + { + "epoch": 0.06770400804230954, + "grad_norm": 3.015625, + "learning_rate": 9.888672005008054e-05, + "loss": 2.5327, + "step": 1549 + }, + { + "epoch": 0.06774771624633943, + "grad_norm": 2.703125, + "learning_rate": 9.888527822411543e-05, + "loss": 2.1271, + "step": 1550 + }, + { + "epoch": 0.06779142445036933, + "grad_norm": 2.953125, + "learning_rate": 9.888383547561398e-05, + "loss": 2.3199, + "step": 1551 + }, + { + "epoch": 0.06783513265439924, + "grad_norm": 4.5625, + "learning_rate": 9.888239180460339e-05, + "loss": 2.153, + "step": 1552 + }, + { + "epoch": 0.06787884085842913, + "grad_norm": 2.625, + "learning_rate": 9.888094721111093e-05, + "loss": 2.5418, + "step": 1553 + }, + { + "epoch": 0.06792254906245902, + "grad_norm": 3.296875, + "learning_rate": 9.887950169516386e-05, + "loss": 2.1519, + "step": 1554 + }, + { + "epoch": 0.06796625726648892, + "grad_norm": 3.25, + "learning_rate": 9.887805525678943e-05, + "loss": 2.2746, + "step": 1555 + }, + { + "epoch": 0.06800996547051881, + "grad_norm": 2.53125, + "learning_rate": 9.887660789601499e-05, + "loss": 1.8309, + "step": 1556 + }, + { + "epoch": 0.06805367367454872, + "grad_norm": 2.84375, + "learning_rate": 9.88751596128678e-05, + "loss": 2.4382, + "step": 1557 + }, + { + "epoch": 0.06809738187857861, + "grad_norm": 3.0625, + "learning_rate": 9.887371040737523e-05, + "loss": 2.1003, + "step": 1558 + }, + { + "epoch": 0.0681410900826085, + "grad_norm": 3.140625, + "learning_rate": 9.88722602795646e-05, + "loss": 2.383, + "step": 1559 + }, + { + "epoch": 0.0681847982866384, + "grad_norm": 2.78125, + "learning_rate": 9.887080922946329e-05, + "loss": 2.1193, + "step": 1560 + }, + { + "epoch": 0.06822850649066829, + "grad_norm": 3.171875, + "learning_rate": 9.886935725709868e-05, + "loss": 2.1573, + "step": 1561 + }, + { + "epoch": 0.0682722146946982, + "grad_norm": 2.921875, + "learning_rate": 9.886790436249818e-05, + "loss": 2.4051, + "step": 1562 + }, + { + "epoch": 0.0683159228987281, + "grad_norm": 2.6875, + "learning_rate": 9.886645054568919e-05, + "loss": 1.9849, + "step": 1563 + }, + { + "epoch": 0.06835963110275799, + "grad_norm": 3.0625, + "learning_rate": 9.886499580669917e-05, + "loss": 1.915, + "step": 1564 + }, + { + "epoch": 0.06840333930678788, + "grad_norm": 2.75, + "learning_rate": 9.886354014555554e-05, + "loss": 2.0232, + "step": 1565 + }, + { + "epoch": 0.06844704751081777, + "grad_norm": 2.4375, + "learning_rate": 9.886208356228581e-05, + "loss": 2.0822, + "step": 1566 + }, + { + "epoch": 0.06849075571484768, + "grad_norm": 2.75, + "learning_rate": 9.886062605691743e-05, + "loss": 1.6554, + "step": 1567 + }, + { + "epoch": 0.06853446391887758, + "grad_norm": 2.734375, + "learning_rate": 9.885916762947795e-05, + "loss": 2.097, + "step": 1568 + }, + { + "epoch": 0.06857817212290747, + "grad_norm": 2.78125, + "learning_rate": 9.885770827999484e-05, + "loss": 1.9294, + "step": 1569 + }, + { + "epoch": 0.06862188032693736, + "grad_norm": 3.21875, + "learning_rate": 9.885624800849567e-05, + "loss": 2.3477, + "step": 1570 + }, + { + "epoch": 0.06866558853096726, + "grad_norm": 2.46875, + "learning_rate": 9.8854786815008e-05, + "loss": 2.2106, + "step": 1571 + }, + { + "epoch": 0.06870929673499716, + "grad_norm": 2.65625, + "learning_rate": 9.88533246995594e-05, + "loss": 1.9569, + "step": 1572 + }, + { + "epoch": 0.06875300493902706, + "grad_norm": 2.671875, + "learning_rate": 9.885186166217746e-05, + "loss": 1.7803, + "step": 1573 + }, + { + "epoch": 0.06879671314305695, + "grad_norm": 2.75, + "learning_rate": 9.885039770288979e-05, + "loss": 2.2029, + "step": 1574 + }, + { + "epoch": 0.06884042134708684, + "grad_norm": 2.71875, + "learning_rate": 9.884893282172401e-05, + "loss": 2.0134, + "step": 1575 + }, + { + "epoch": 0.06888412955111674, + "grad_norm": 2.765625, + "learning_rate": 9.884746701870777e-05, + "loss": 2.7702, + "step": 1576 + }, + { + "epoch": 0.06892783775514665, + "grad_norm": 2.546875, + "learning_rate": 9.884600029386875e-05, + "loss": 2.3747, + "step": 1577 + }, + { + "epoch": 0.06897154595917654, + "grad_norm": 2.78125, + "learning_rate": 9.884453264723459e-05, + "loss": 2.3949, + "step": 1578 + }, + { + "epoch": 0.06901525416320643, + "grad_norm": 5.78125, + "learning_rate": 9.884306407883301e-05, + "loss": 2.6275, + "step": 1579 + }, + { + "epoch": 0.06905896236723633, + "grad_norm": 2.71875, + "learning_rate": 9.884159458869173e-05, + "loss": 2.6411, + "step": 1580 + }, + { + "epoch": 0.06910267057126622, + "grad_norm": 3.859375, + "learning_rate": 9.884012417683849e-05, + "loss": 2.56, + "step": 1581 + }, + { + "epoch": 0.06914637877529613, + "grad_norm": 2.65625, + "learning_rate": 9.8838652843301e-05, + "loss": 1.8407, + "step": 1582 + }, + { + "epoch": 0.06919008697932602, + "grad_norm": 2.84375, + "learning_rate": 9.883718058810707e-05, + "loss": 2.2617, + "step": 1583 + }, + { + "epoch": 0.06923379518335591, + "grad_norm": 2.828125, + "learning_rate": 9.883570741128446e-05, + "loss": 2.1872, + "step": 1584 + }, + { + "epoch": 0.06927750338738581, + "grad_norm": 3.21875, + "learning_rate": 9.883423331286096e-05, + "loss": 3.165, + "step": 1585 + }, + { + "epoch": 0.06932121159141572, + "grad_norm": 2.265625, + "learning_rate": 9.88327582928644e-05, + "loss": 2.0316, + "step": 1586 + }, + { + "epoch": 0.06936491979544561, + "grad_norm": 4.0625, + "learning_rate": 9.883128235132264e-05, + "loss": 2.3265, + "step": 1587 + }, + { + "epoch": 0.0694086279994755, + "grad_norm": 2.75, + "learning_rate": 9.882980548826349e-05, + "loss": 2.2151, + "step": 1588 + }, + { + "epoch": 0.0694523362035054, + "grad_norm": 3.65625, + "learning_rate": 9.882832770371487e-05, + "loss": 2.3292, + "step": 1589 + }, + { + "epoch": 0.06949604440753529, + "grad_norm": 2.78125, + "learning_rate": 9.88268489977046e-05, + "loss": 2.4083, + "step": 1590 + }, + { + "epoch": 0.0695397526115652, + "grad_norm": 4.90625, + "learning_rate": 9.882536937026066e-05, + "loss": 2.4895, + "step": 1591 + }, + { + "epoch": 0.06958346081559509, + "grad_norm": 4.53125, + "learning_rate": 9.882388882141092e-05, + "loss": 2.478, + "step": 1592 + }, + { + "epoch": 0.06962716901962498, + "grad_norm": 5.28125, + "learning_rate": 9.882240735118334e-05, + "loss": 1.8026, + "step": 1593 + }, + { + "epoch": 0.06967087722365488, + "grad_norm": 2.375, + "learning_rate": 9.882092495960589e-05, + "loss": 1.8172, + "step": 1594 + }, + { + "epoch": 0.06971458542768477, + "grad_norm": 3.90625, + "learning_rate": 9.881944164670651e-05, + "loss": 2.2879, + "step": 1595 + }, + { + "epoch": 0.06975829363171468, + "grad_norm": 2.828125, + "learning_rate": 9.881795741251323e-05, + "loss": 2.2479, + "step": 1596 + }, + { + "epoch": 0.06980200183574457, + "grad_norm": 2.796875, + "learning_rate": 9.881647225705403e-05, + "loss": 2.1813, + "step": 1597 + }, + { + "epoch": 0.06984571003977447, + "grad_norm": 2.984375, + "learning_rate": 9.881498618035695e-05, + "loss": 2.2546, + "step": 1598 + }, + { + "epoch": 0.06988941824380436, + "grad_norm": 3.59375, + "learning_rate": 9.881349918245005e-05, + "loss": 2.346, + "step": 1599 + }, + { + "epoch": 0.06993312644783425, + "grad_norm": 2.984375, + "learning_rate": 9.881201126336135e-05, + "loss": 2.2346, + "step": 1600 + }, + { + "epoch": 0.06997683465186416, + "grad_norm": 2.375, + "learning_rate": 9.881052242311896e-05, + "loss": 1.7978, + "step": 1601 + }, + { + "epoch": 0.07002054285589406, + "grad_norm": 2.8125, + "learning_rate": 9.880903266175098e-05, + "loss": 1.906, + "step": 1602 + }, + { + "epoch": 0.07006425105992395, + "grad_norm": 2.96875, + "learning_rate": 9.880754197928553e-05, + "loss": 2.1055, + "step": 1603 + }, + { + "epoch": 0.07010795926395384, + "grad_norm": 2.6875, + "learning_rate": 9.88060503757507e-05, + "loss": 2.0959, + "step": 1604 + }, + { + "epoch": 0.07015166746798374, + "grad_norm": 2.65625, + "learning_rate": 9.880455785117469e-05, + "loss": 2.2334, + "step": 1605 + }, + { + "epoch": 0.07019537567201364, + "grad_norm": 2.453125, + "learning_rate": 9.880306440558562e-05, + "loss": 2.0596, + "step": 1606 + }, + { + "epoch": 0.07023908387604354, + "grad_norm": 2.453125, + "learning_rate": 9.880157003901171e-05, + "loss": 2.3092, + "step": 1607 + }, + { + "epoch": 0.07028279208007343, + "grad_norm": 2.75, + "learning_rate": 9.880007475148114e-05, + "loss": 1.8754, + "step": 1608 + }, + { + "epoch": 0.07032650028410332, + "grad_norm": 2.53125, + "learning_rate": 9.879857854302214e-05, + "loss": 1.8996, + "step": 1609 + }, + { + "epoch": 0.07037020848813322, + "grad_norm": 3.609375, + "learning_rate": 9.879708141366293e-05, + "loss": 2.2173, + "step": 1610 + }, + { + "epoch": 0.07041391669216313, + "grad_norm": 2.765625, + "learning_rate": 9.879558336343177e-05, + "loss": 2.2584, + "step": 1611 + }, + { + "epoch": 0.07045762489619302, + "grad_norm": 2.59375, + "learning_rate": 9.879408439235696e-05, + "loss": 1.9219, + "step": 1612 + }, + { + "epoch": 0.07050133310022291, + "grad_norm": 2.25, + "learning_rate": 9.879258450046673e-05, + "loss": 1.9287, + "step": 1613 + }, + { + "epoch": 0.0705450413042528, + "grad_norm": 2.453125, + "learning_rate": 9.879108368778943e-05, + "loss": 2.1835, + "step": 1614 + }, + { + "epoch": 0.0705887495082827, + "grad_norm": 3.703125, + "learning_rate": 9.878958195435338e-05, + "loss": 2.613, + "step": 1615 + }, + { + "epoch": 0.07063245771231261, + "grad_norm": 2.921875, + "learning_rate": 9.878807930018689e-05, + "loss": 2.775, + "step": 1616 + }, + { + "epoch": 0.0706761659163425, + "grad_norm": 2.3125, + "learning_rate": 9.878657572531833e-05, + "loss": 2.0104, + "step": 1617 + }, + { + "epoch": 0.0707198741203724, + "grad_norm": 3.90625, + "learning_rate": 9.878507122977609e-05, + "loss": 2.1273, + "step": 1618 + }, + { + "epoch": 0.07076358232440229, + "grad_norm": 3.109375, + "learning_rate": 9.878356581358856e-05, + "loss": 2.3979, + "step": 1619 + }, + { + "epoch": 0.07080729052843218, + "grad_norm": 2.65625, + "learning_rate": 9.878205947678414e-05, + "loss": 2.65, + "step": 1620 + }, + { + "epoch": 0.07085099873246209, + "grad_norm": 3.046875, + "learning_rate": 9.878055221939127e-05, + "loss": 2.9008, + "step": 1621 + }, + { + "epoch": 0.07089470693649198, + "grad_norm": 2.28125, + "learning_rate": 9.877904404143837e-05, + "loss": 1.728, + "step": 1622 + }, + { + "epoch": 0.07093841514052188, + "grad_norm": 2.34375, + "learning_rate": 9.87775349429539e-05, + "loss": 2.2248, + "step": 1623 + }, + { + "epoch": 0.07098212334455177, + "grad_norm": 2.9375, + "learning_rate": 9.877602492396636e-05, + "loss": 2.0064, + "step": 1624 + }, + { + "epoch": 0.07102583154858166, + "grad_norm": 3.25, + "learning_rate": 9.877451398450426e-05, + "loss": 2.1671, + "step": 1625 + }, + { + "epoch": 0.07106953975261157, + "grad_norm": 2.515625, + "learning_rate": 9.877300212459608e-05, + "loss": 2.1163, + "step": 1626 + }, + { + "epoch": 0.07111324795664146, + "grad_norm": 2.5625, + "learning_rate": 9.877148934427037e-05, + "loss": 2.1899, + "step": 1627 + }, + { + "epoch": 0.07115695616067136, + "grad_norm": 4.96875, + "learning_rate": 9.876997564355565e-05, + "loss": 2.778, + "step": 1628 + }, + { + "epoch": 0.07120066436470125, + "grad_norm": 3.015625, + "learning_rate": 9.876846102248053e-05, + "loss": 2.4448, + "step": 1629 + }, + { + "epoch": 0.07124437256873115, + "grad_norm": 2.984375, + "learning_rate": 9.876694548107357e-05, + "loss": 2.8018, + "step": 1630 + }, + { + "epoch": 0.07128808077276105, + "grad_norm": 3.234375, + "learning_rate": 9.876542901936336e-05, + "loss": 2.128, + "step": 1631 + }, + { + "epoch": 0.07133178897679095, + "grad_norm": 2.6875, + "learning_rate": 9.876391163737853e-05, + "loss": 2.2933, + "step": 1632 + }, + { + "epoch": 0.07137549718082084, + "grad_norm": 2.90625, + "learning_rate": 9.876239333514772e-05, + "loss": 1.9584, + "step": 1633 + }, + { + "epoch": 0.07141920538485073, + "grad_norm": 3.75, + "learning_rate": 9.876087411269959e-05, + "loss": 2.2731, + "step": 1634 + }, + { + "epoch": 0.07146291358888063, + "grad_norm": 3.46875, + "learning_rate": 9.875935397006278e-05, + "loss": 2.5422, + "step": 1635 + }, + { + "epoch": 0.07150662179291054, + "grad_norm": 3.15625, + "learning_rate": 9.875783290726601e-05, + "loss": 2.1556, + "step": 1636 + }, + { + "epoch": 0.07155032999694043, + "grad_norm": 3.0625, + "learning_rate": 9.875631092433795e-05, + "loss": 2.452, + "step": 1637 + }, + { + "epoch": 0.07159403820097032, + "grad_norm": 2.34375, + "learning_rate": 9.875478802130736e-05, + "loss": 1.9157, + "step": 1638 + }, + { + "epoch": 0.07163774640500022, + "grad_norm": 2.890625, + "learning_rate": 9.875326419820296e-05, + "loss": 2.4768, + "step": 1639 + }, + { + "epoch": 0.07168145460903011, + "grad_norm": 3.5625, + "learning_rate": 9.87517394550535e-05, + "loss": 2.0934, + "step": 1640 + }, + { + "epoch": 0.07172516281306002, + "grad_norm": 3.765625, + "learning_rate": 9.875021379188776e-05, + "loss": 2.2172, + "step": 1641 + }, + { + "epoch": 0.07176887101708991, + "grad_norm": 2.390625, + "learning_rate": 9.874868720873454e-05, + "loss": 2.0909, + "step": 1642 + }, + { + "epoch": 0.0718125792211198, + "grad_norm": 3.40625, + "learning_rate": 9.874715970562262e-05, + "loss": 2.1855, + "step": 1643 + }, + { + "epoch": 0.0718562874251497, + "grad_norm": 2.59375, + "learning_rate": 9.874563128258087e-05, + "loss": 2.1557, + "step": 1644 + }, + { + "epoch": 0.07189999562917959, + "grad_norm": 2.953125, + "learning_rate": 9.874410193963813e-05, + "loss": 2.1594, + "step": 1645 + }, + { + "epoch": 0.0719437038332095, + "grad_norm": 3.09375, + "learning_rate": 9.874257167682321e-05, + "loss": 2.2683, + "step": 1646 + }, + { + "epoch": 0.07198741203723939, + "grad_norm": 4.09375, + "learning_rate": 9.874104049416502e-05, + "loss": 2.4891, + "step": 1647 + }, + { + "epoch": 0.07203112024126929, + "grad_norm": 3.0625, + "learning_rate": 9.873950839169248e-05, + "loss": 2.486, + "step": 1648 + }, + { + "epoch": 0.07207482844529918, + "grad_norm": 2.53125, + "learning_rate": 9.873797536943447e-05, + "loss": 2.0965, + "step": 1649 + }, + { + "epoch": 0.07211853664932907, + "grad_norm": 2.796875, + "learning_rate": 9.873644142741992e-05, + "loss": 2.0246, + "step": 1650 + }, + { + "epoch": 0.07216224485335898, + "grad_norm": 3.953125, + "learning_rate": 9.87349065656778e-05, + "loss": 2.8423, + "step": 1651 + }, + { + "epoch": 0.07220595305738887, + "grad_norm": 2.765625, + "learning_rate": 9.873337078423706e-05, + "loss": 2.1673, + "step": 1652 + }, + { + "epoch": 0.07224966126141877, + "grad_norm": 2.890625, + "learning_rate": 9.873183408312668e-05, + "loss": 2.2967, + "step": 1653 + }, + { + "epoch": 0.07229336946544866, + "grad_norm": 2.546875, + "learning_rate": 9.873029646237567e-05, + "loss": 1.8027, + "step": 1654 + }, + { + "epoch": 0.07233707766947856, + "grad_norm": 2.625, + "learning_rate": 9.872875792201304e-05, + "loss": 2.0913, + "step": 1655 + }, + { + "epoch": 0.07238078587350846, + "grad_norm": 3.234375, + "learning_rate": 9.872721846206783e-05, + "loss": 2.0012, + "step": 1656 + }, + { + "epoch": 0.07242449407753836, + "grad_norm": 2.953125, + "learning_rate": 9.872567808256909e-05, + "loss": 2.3273, + "step": 1657 + }, + { + "epoch": 0.07246820228156825, + "grad_norm": 3.046875, + "learning_rate": 9.872413678354589e-05, + "loss": 2.4328, + "step": 1658 + }, + { + "epoch": 0.07251191048559814, + "grad_norm": 4.6875, + "learning_rate": 9.87225945650273e-05, + "loss": 2.1735, + "step": 1659 + }, + { + "epoch": 0.07255561868962804, + "grad_norm": 2.421875, + "learning_rate": 9.872105142704244e-05, + "loss": 2.0066, + "step": 1660 + }, + { + "epoch": 0.07259932689365795, + "grad_norm": 2.71875, + "learning_rate": 9.871950736962044e-05, + "loss": 2.249, + "step": 1661 + }, + { + "epoch": 0.07264303509768784, + "grad_norm": 2.71875, + "learning_rate": 9.871796239279043e-05, + "loss": 2.347, + "step": 1662 + }, + { + "epoch": 0.07268674330171773, + "grad_norm": 4.375, + "learning_rate": 9.871641649658155e-05, + "loss": 2.0257, + "step": 1663 + }, + { + "epoch": 0.07273045150574763, + "grad_norm": 2.859375, + "learning_rate": 9.871486968102299e-05, + "loss": 2.1046, + "step": 1664 + }, + { + "epoch": 0.07277415970977752, + "grad_norm": 3.640625, + "learning_rate": 9.871332194614395e-05, + "loss": 2.882, + "step": 1665 + }, + { + "epoch": 0.07281786791380743, + "grad_norm": 3.03125, + "learning_rate": 9.871177329197362e-05, + "loss": 1.7936, + "step": 1666 + }, + { + "epoch": 0.07286157611783732, + "grad_norm": 2.484375, + "learning_rate": 9.871022371854123e-05, + "loss": 1.7715, + "step": 1667 + }, + { + "epoch": 0.07290528432186721, + "grad_norm": 3.546875, + "learning_rate": 9.870867322587602e-05, + "loss": 2.7403, + "step": 1668 + }, + { + "epoch": 0.07294899252589711, + "grad_norm": 3.0, + "learning_rate": 9.870712181400726e-05, + "loss": 2.8774, + "step": 1669 + }, + { + "epoch": 0.072992700729927, + "grad_norm": 3.375, + "learning_rate": 9.870556948296423e-05, + "loss": 2.4485, + "step": 1670 + }, + { + "epoch": 0.07303640893395691, + "grad_norm": 3.046875, + "learning_rate": 9.87040162327762e-05, + "loss": 1.856, + "step": 1671 + }, + { + "epoch": 0.0730801171379868, + "grad_norm": 3.0, + "learning_rate": 9.870246206347252e-05, + "loss": 2.0979, + "step": 1672 + }, + { + "epoch": 0.0731238253420167, + "grad_norm": 2.703125, + "learning_rate": 9.870090697508248e-05, + "loss": 1.9364, + "step": 1673 + }, + { + "epoch": 0.07316753354604659, + "grad_norm": 4.96875, + "learning_rate": 9.869935096763543e-05, + "loss": 2.922, + "step": 1674 + }, + { + "epoch": 0.07321124175007648, + "grad_norm": 2.625, + "learning_rate": 9.869779404116078e-05, + "loss": 2.2812, + "step": 1675 + }, + { + "epoch": 0.07325494995410639, + "grad_norm": 3.0, + "learning_rate": 9.869623619568786e-05, + "loss": 2.1977, + "step": 1676 + }, + { + "epoch": 0.07329865815813628, + "grad_norm": 3.03125, + "learning_rate": 9.86946774312461e-05, + "loss": 1.9256, + "step": 1677 + }, + { + "epoch": 0.07334236636216618, + "grad_norm": 3.578125, + "learning_rate": 9.86931177478649e-05, + "loss": 2.031, + "step": 1678 + }, + { + "epoch": 0.07338607456619607, + "grad_norm": 2.703125, + "learning_rate": 9.86915571455737e-05, + "loss": 2.2388, + "step": 1679 + }, + { + "epoch": 0.07342978277022597, + "grad_norm": 2.828125, + "learning_rate": 9.868999562440194e-05, + "loss": 2.7069, + "step": 1680 + }, + { + "epoch": 0.07347349097425587, + "grad_norm": 2.875, + "learning_rate": 9.86884331843791e-05, + "loss": 2.6101, + "step": 1681 + }, + { + "epoch": 0.07351719917828577, + "grad_norm": 3.34375, + "learning_rate": 9.868686982553468e-05, + "loss": 2.801, + "step": 1682 + }, + { + "epoch": 0.07356090738231566, + "grad_norm": 2.859375, + "learning_rate": 9.868530554789815e-05, + "loss": 2.0419, + "step": 1683 + }, + { + "epoch": 0.07360461558634555, + "grad_norm": 2.453125, + "learning_rate": 9.868374035149905e-05, + "loss": 2.0969, + "step": 1684 + }, + { + "epoch": 0.07364832379037545, + "grad_norm": 2.5625, + "learning_rate": 9.868217423636693e-05, + "loss": 2.2815, + "step": 1685 + }, + { + "epoch": 0.07369203199440535, + "grad_norm": 3.671875, + "learning_rate": 9.86806072025313e-05, + "loss": 2.4339, + "step": 1686 + }, + { + "epoch": 0.07373574019843525, + "grad_norm": 11.3125, + "learning_rate": 9.867903925002178e-05, + "loss": 2.8798, + "step": 1687 + }, + { + "epoch": 0.07377944840246514, + "grad_norm": 2.828125, + "learning_rate": 9.867747037886793e-05, + "loss": 2.0418, + "step": 1688 + }, + { + "epoch": 0.07382315660649504, + "grad_norm": 2.359375, + "learning_rate": 9.867590058909936e-05, + "loss": 2.1559, + "step": 1689 + }, + { + "epoch": 0.07386686481052493, + "grad_norm": 3.53125, + "learning_rate": 9.867432988074572e-05, + "loss": 2.198, + "step": 1690 + }, + { + "epoch": 0.07391057301455484, + "grad_norm": 2.46875, + "learning_rate": 9.867275825383664e-05, + "loss": 1.8589, + "step": 1691 + }, + { + "epoch": 0.07395428121858473, + "grad_norm": 2.875, + "learning_rate": 9.867118570840175e-05, + "loss": 2.4594, + "step": 1692 + }, + { + "epoch": 0.07399798942261462, + "grad_norm": 2.625, + "learning_rate": 9.866961224447075e-05, + "loss": 2.5562, + "step": 1693 + }, + { + "epoch": 0.07404169762664452, + "grad_norm": 2.96875, + "learning_rate": 9.866803786207335e-05, + "loss": 2.3999, + "step": 1694 + }, + { + "epoch": 0.07408540583067441, + "grad_norm": 2.9375, + "learning_rate": 9.866646256123922e-05, + "loss": 1.985, + "step": 1695 + }, + { + "epoch": 0.07412911403470432, + "grad_norm": 3.03125, + "learning_rate": 9.866488634199813e-05, + "loss": 2.6449, + "step": 1696 + }, + { + "epoch": 0.07417282223873421, + "grad_norm": 2.75, + "learning_rate": 9.866330920437979e-05, + "loss": 2.1322, + "step": 1697 + }, + { + "epoch": 0.0742165304427641, + "grad_norm": 2.25, + "learning_rate": 9.8661731148414e-05, + "loss": 1.5896, + "step": 1698 + }, + { + "epoch": 0.074260238646794, + "grad_norm": 2.59375, + "learning_rate": 9.86601521741305e-05, + "loss": 2.0734, + "step": 1699 + }, + { + "epoch": 0.0743039468508239, + "grad_norm": 2.859375, + "learning_rate": 9.865857228155911e-05, + "loss": 2.4698, + "step": 1700 + }, + { + "epoch": 0.0743476550548538, + "grad_norm": 2.796875, + "learning_rate": 9.865699147072964e-05, + "loss": 2.0935, + "step": 1701 + }, + { + "epoch": 0.0743913632588837, + "grad_norm": 3.0, + "learning_rate": 9.865540974167193e-05, + "loss": 2.4411, + "step": 1702 + }, + { + "epoch": 0.07443507146291359, + "grad_norm": 2.875, + "learning_rate": 9.865382709441584e-05, + "loss": 2.3764, + "step": 1703 + }, + { + "epoch": 0.07447877966694348, + "grad_norm": 2.75, + "learning_rate": 9.865224352899119e-05, + "loss": 2.1394, + "step": 1704 + }, + { + "epoch": 0.07452248787097338, + "grad_norm": 3.125, + "learning_rate": 9.865065904542792e-05, + "loss": 2.3577, + "step": 1705 + }, + { + "epoch": 0.07456619607500328, + "grad_norm": 3.359375, + "learning_rate": 9.864907364375589e-05, + "loss": 2.5026, + "step": 1706 + }, + { + "epoch": 0.07460990427903318, + "grad_norm": 3.234375, + "learning_rate": 9.864748732400504e-05, + "loss": 2.5778, + "step": 1707 + }, + { + "epoch": 0.07465361248306307, + "grad_norm": 2.9375, + "learning_rate": 9.86459000862053e-05, + "loss": 2.1946, + "step": 1708 + }, + { + "epoch": 0.07469732068709296, + "grad_norm": 3.390625, + "learning_rate": 9.864431193038662e-05, + "loss": 2.1463, + "step": 1709 + }, + { + "epoch": 0.07474102889112286, + "grad_norm": 3.171875, + "learning_rate": 9.864272285657898e-05, + "loss": 2.2934, + "step": 1710 + }, + { + "epoch": 0.07478473709515276, + "grad_norm": 2.6875, + "learning_rate": 9.864113286481237e-05, + "loss": 2.0675, + "step": 1711 + }, + { + "epoch": 0.07482844529918266, + "grad_norm": 2.5, + "learning_rate": 9.863954195511677e-05, + "loss": 2.022, + "step": 1712 + }, + { + "epoch": 0.07487215350321255, + "grad_norm": 2.953125, + "learning_rate": 9.863795012752224e-05, + "loss": 2.0623, + "step": 1713 + }, + { + "epoch": 0.07491586170724245, + "grad_norm": 2.640625, + "learning_rate": 9.863635738205881e-05, + "loss": 2.5347, + "step": 1714 + }, + { + "epoch": 0.07495956991127234, + "grad_norm": 6.125, + "learning_rate": 9.863476371875651e-05, + "loss": 2.5752, + "step": 1715 + }, + { + "epoch": 0.07500327811530225, + "grad_norm": 3.765625, + "learning_rate": 9.863316913764545e-05, + "loss": 2.5674, + "step": 1716 + }, + { + "epoch": 0.07504698631933214, + "grad_norm": 3.5, + "learning_rate": 9.863157363875568e-05, + "loss": 2.4708, + "step": 1717 + }, + { + "epoch": 0.07509069452336203, + "grad_norm": 21.0, + "learning_rate": 9.862997722211735e-05, + "loss": 7.6248, + "step": 1718 + }, + { + "epoch": 0.07513440272739193, + "grad_norm": 2.296875, + "learning_rate": 9.862837988776059e-05, + "loss": 1.906, + "step": 1719 + }, + { + "epoch": 0.07517811093142182, + "grad_norm": 3.046875, + "learning_rate": 9.86267816357155e-05, + "loss": 2.6795, + "step": 1720 + }, + { + "epoch": 0.07522181913545173, + "grad_norm": 2.671875, + "learning_rate": 9.86251824660123e-05, + "loss": 3.0829, + "step": 1721 + }, + { + "epoch": 0.07526552733948162, + "grad_norm": 2.6875, + "learning_rate": 9.86235823786811e-05, + "loss": 2.1595, + "step": 1722 + }, + { + "epoch": 0.07530923554351152, + "grad_norm": 3.171875, + "learning_rate": 9.862198137375215e-05, + "loss": 2.2193, + "step": 1723 + }, + { + "epoch": 0.07535294374754141, + "grad_norm": 4.46875, + "learning_rate": 9.862037945125564e-05, + "loss": 2.0731, + "step": 1724 + }, + { + "epoch": 0.07539665195157132, + "grad_norm": 2.875, + "learning_rate": 9.86187766112218e-05, + "loss": 1.9634, + "step": 1725 + }, + { + "epoch": 0.07544036015560121, + "grad_norm": 2.96875, + "learning_rate": 9.861717285368091e-05, + "loss": 2.0719, + "step": 1726 + }, + { + "epoch": 0.0754840683596311, + "grad_norm": 3.40625, + "learning_rate": 9.861556817866318e-05, + "loss": 2.5894, + "step": 1727 + }, + { + "epoch": 0.075527776563661, + "grad_norm": 3.703125, + "learning_rate": 9.861396258619894e-05, + "loss": 2.2967, + "step": 1728 + }, + { + "epoch": 0.07557148476769089, + "grad_norm": 2.65625, + "learning_rate": 9.861235607631847e-05, + "loss": 1.8993, + "step": 1729 + }, + { + "epoch": 0.0756151929717208, + "grad_norm": 3.140625, + "learning_rate": 9.861074864905207e-05, + "loss": 1.9114, + "step": 1730 + }, + { + "epoch": 0.07565890117575069, + "grad_norm": 3.203125, + "learning_rate": 9.860914030443012e-05, + "loss": 1.9079, + "step": 1731 + }, + { + "epoch": 0.07570260937978059, + "grad_norm": 3.59375, + "learning_rate": 9.860753104248292e-05, + "loss": 2.5769, + "step": 1732 + }, + { + "epoch": 0.07574631758381048, + "grad_norm": 2.890625, + "learning_rate": 9.860592086324088e-05, + "loss": 2.4568, + "step": 1733 + }, + { + "epoch": 0.07579002578784037, + "grad_norm": 2.703125, + "learning_rate": 9.860430976673436e-05, + "loss": 2.1262, + "step": 1734 + }, + { + "epoch": 0.07583373399187028, + "grad_norm": 2.625, + "learning_rate": 9.86026977529938e-05, + "loss": 1.8477, + "step": 1735 + }, + { + "epoch": 0.07587744219590017, + "grad_norm": 2.3125, + "learning_rate": 9.860108482204957e-05, + "loss": 2.0501, + "step": 1736 + }, + { + "epoch": 0.07592115039993007, + "grad_norm": 2.796875, + "learning_rate": 9.859947097393215e-05, + "loss": 2.7278, + "step": 1737 + }, + { + "epoch": 0.07596485860395996, + "grad_norm": 2.453125, + "learning_rate": 9.859785620867197e-05, + "loss": 2.0078, + "step": 1738 + }, + { + "epoch": 0.07600856680798986, + "grad_norm": 2.65625, + "learning_rate": 9.859624052629951e-05, + "loss": 2.1657, + "step": 1739 + }, + { + "epoch": 0.07605227501201976, + "grad_norm": 2.40625, + "learning_rate": 9.859462392684526e-05, + "loss": 2.1615, + "step": 1740 + }, + { + "epoch": 0.07609598321604966, + "grad_norm": 2.546875, + "learning_rate": 9.859300641033974e-05, + "loss": 2.0125, + "step": 1741 + }, + { + "epoch": 0.07613969142007955, + "grad_norm": 2.78125, + "learning_rate": 9.859138797681347e-05, + "loss": 2.6558, + "step": 1742 + }, + { + "epoch": 0.07618339962410944, + "grad_norm": 5.34375, + "learning_rate": 9.858976862629698e-05, + "loss": 2.9906, + "step": 1743 + }, + { + "epoch": 0.07622710782813934, + "grad_norm": 2.390625, + "learning_rate": 9.858814835882085e-05, + "loss": 2.4754, + "step": 1744 + }, + { + "epoch": 0.07627081603216924, + "grad_norm": 3.0, + "learning_rate": 9.85865271744156e-05, + "loss": 2.1856, + "step": 1745 + }, + { + "epoch": 0.07631452423619914, + "grad_norm": 3.203125, + "learning_rate": 9.85849050731119e-05, + "loss": 2.4571, + "step": 1746 + }, + { + "epoch": 0.07635823244022903, + "grad_norm": 2.578125, + "learning_rate": 9.858328205494035e-05, + "loss": 1.9271, + "step": 1747 + }, + { + "epoch": 0.07640194064425893, + "grad_norm": 2.75, + "learning_rate": 9.858165811993153e-05, + "loss": 2.4178, + "step": 1748 + }, + { + "epoch": 0.07644564884828882, + "grad_norm": 2.71875, + "learning_rate": 9.858003326811611e-05, + "loss": 2.3161, + "step": 1749 + }, + { + "epoch": 0.07648935705231873, + "grad_norm": 5.15625, + "learning_rate": 9.857840749952478e-05, + "loss": 1.7724, + "step": 1750 + }, + { + "epoch": 0.07653306525634862, + "grad_norm": 2.8125, + "learning_rate": 9.857678081418818e-05, + "loss": 2.2475, + "step": 1751 + }, + { + "epoch": 0.07657677346037851, + "grad_norm": 2.953125, + "learning_rate": 9.857515321213704e-05, + "loss": 1.9093, + "step": 1752 + }, + { + "epoch": 0.07662048166440841, + "grad_norm": 2.921875, + "learning_rate": 9.857352469340204e-05, + "loss": 2.1066, + "step": 1753 + }, + { + "epoch": 0.0766641898684383, + "grad_norm": 2.671875, + "learning_rate": 9.857189525801396e-05, + "loss": 1.8703, + "step": 1754 + }, + { + "epoch": 0.07670789807246821, + "grad_norm": 2.5625, + "learning_rate": 9.857026490600349e-05, + "loss": 2.1322, + "step": 1755 + }, + { + "epoch": 0.0767516062764981, + "grad_norm": 3.21875, + "learning_rate": 9.856863363740145e-05, + "loss": 2.6255, + "step": 1756 + }, + { + "epoch": 0.076795314480528, + "grad_norm": 2.484375, + "learning_rate": 9.856700145223862e-05, + "loss": 1.8698, + "step": 1757 + }, + { + "epoch": 0.07683902268455789, + "grad_norm": 3.03125, + "learning_rate": 9.856536835054577e-05, + "loss": 3.1263, + "step": 1758 + }, + { + "epoch": 0.07688273088858778, + "grad_norm": 3.859375, + "learning_rate": 9.856373433235373e-05, + "loss": 1.8888, + "step": 1759 + }, + { + "epoch": 0.07692643909261769, + "grad_norm": 2.5, + "learning_rate": 9.856209939769335e-05, + "loss": 2.0877, + "step": 1760 + }, + { + "epoch": 0.07697014729664758, + "grad_norm": 2.59375, + "learning_rate": 9.856046354659547e-05, + "loss": 2.0977, + "step": 1761 + }, + { + "epoch": 0.07701385550067748, + "grad_norm": 2.625, + "learning_rate": 9.855882677909099e-05, + "loss": 2.3225, + "step": 1762 + }, + { + "epoch": 0.07705756370470737, + "grad_norm": 2.375, + "learning_rate": 9.855718909521075e-05, + "loss": 1.7009, + "step": 1763 + }, + { + "epoch": 0.07710127190873726, + "grad_norm": 2.71875, + "learning_rate": 9.855555049498568e-05, + "loss": 2.2034, + "step": 1764 + }, + { + "epoch": 0.07714498011276717, + "grad_norm": 2.84375, + "learning_rate": 9.85539109784467e-05, + "loss": 1.5595, + "step": 1765 + }, + { + "epoch": 0.07718868831679707, + "grad_norm": 3.15625, + "learning_rate": 9.855227054562476e-05, + "loss": 2.9596, + "step": 1766 + }, + { + "epoch": 0.07723239652082696, + "grad_norm": 2.84375, + "learning_rate": 9.855062919655083e-05, + "loss": 1.9394, + "step": 1767 + }, + { + "epoch": 0.07727610472485685, + "grad_norm": 2.765625, + "learning_rate": 9.854898693125586e-05, + "loss": 2.1817, + "step": 1768 + }, + { + "epoch": 0.07731981292888675, + "grad_norm": 3.8125, + "learning_rate": 9.854734374977081e-05, + "loss": 2.0313, + "step": 1769 + }, + { + "epoch": 0.07736352113291665, + "grad_norm": 2.390625, + "learning_rate": 9.854569965212676e-05, + "loss": 1.8892, + "step": 1770 + }, + { + "epoch": 0.07740722933694655, + "grad_norm": 2.578125, + "learning_rate": 9.854405463835468e-05, + "loss": 2.2463, + "step": 1771 + }, + { + "epoch": 0.07745093754097644, + "grad_norm": 2.765625, + "learning_rate": 9.854240870848565e-05, + "loss": 1.8787, + "step": 1772 + }, + { + "epoch": 0.07749464574500634, + "grad_norm": 3.046875, + "learning_rate": 9.854076186255072e-05, + "loss": 2.0774, + "step": 1773 + }, + { + "epoch": 0.07753835394903623, + "grad_norm": 2.421875, + "learning_rate": 9.853911410058097e-05, + "loss": 2.1188, + "step": 1774 + }, + { + "epoch": 0.07758206215306614, + "grad_norm": 5.28125, + "learning_rate": 9.853746542260749e-05, + "loss": 2.0926, + "step": 1775 + }, + { + "epoch": 0.07762577035709603, + "grad_norm": 3.390625, + "learning_rate": 9.853581582866139e-05, + "loss": 2.7162, + "step": 1776 + }, + { + "epoch": 0.07766947856112592, + "grad_norm": 3.09375, + "learning_rate": 9.85341653187738e-05, + "loss": 2.7719, + "step": 1777 + }, + { + "epoch": 0.07771318676515582, + "grad_norm": 2.828125, + "learning_rate": 9.853251389297587e-05, + "loss": 2.4834, + "step": 1778 + }, + { + "epoch": 0.07775689496918571, + "grad_norm": 3.59375, + "learning_rate": 9.853086155129878e-05, + "loss": 2.0547, + "step": 1779 + }, + { + "epoch": 0.07780060317321562, + "grad_norm": 2.5, + "learning_rate": 9.852920829377369e-05, + "loss": 2.4811, + "step": 1780 + }, + { + "epoch": 0.07784431137724551, + "grad_norm": 3.25, + "learning_rate": 9.85275541204318e-05, + "loss": 2.6087, + "step": 1781 + }, + { + "epoch": 0.0778880195812754, + "grad_norm": 3.078125, + "learning_rate": 9.852589903130435e-05, + "loss": 2.869, + "step": 1782 + }, + { + "epoch": 0.0779317277853053, + "grad_norm": 2.4375, + "learning_rate": 9.852424302642256e-05, + "loss": 1.683, + "step": 1783 + }, + { + "epoch": 0.07797543598933519, + "grad_norm": 2.75, + "learning_rate": 9.852258610581768e-05, + "loss": 2.5439, + "step": 1784 + }, + { + "epoch": 0.0780191441933651, + "grad_norm": 3.265625, + "learning_rate": 9.852092826952097e-05, + "loss": 1.7083, + "step": 1785 + }, + { + "epoch": 0.078062852397395, + "grad_norm": 3.703125, + "learning_rate": 9.851926951756374e-05, + "loss": 2.2147, + "step": 1786 + }, + { + "epoch": 0.07810656060142489, + "grad_norm": 2.53125, + "learning_rate": 9.851760984997727e-05, + "loss": 1.9388, + "step": 1787 + }, + { + "epoch": 0.07815026880545478, + "grad_norm": 4.125, + "learning_rate": 9.851594926679287e-05, + "loss": 2.1677, + "step": 1788 + }, + { + "epoch": 0.07819397700948467, + "grad_norm": 4.0625, + "learning_rate": 9.851428776804191e-05, + "loss": 3.2133, + "step": 1789 + }, + { + "epoch": 0.07823768521351458, + "grad_norm": 2.5625, + "learning_rate": 9.851262535375574e-05, + "loss": 2.3131, + "step": 1790 + }, + { + "epoch": 0.07828139341754448, + "grad_norm": 3.359375, + "learning_rate": 9.851096202396572e-05, + "loss": 2.0723, + "step": 1791 + }, + { + "epoch": 0.07832510162157437, + "grad_norm": 2.546875, + "learning_rate": 9.850929777870324e-05, + "loss": 2.3914, + "step": 1792 + }, + { + "epoch": 0.07836880982560426, + "grad_norm": 2.84375, + "learning_rate": 9.850763261799969e-05, + "loss": 1.8671, + "step": 1793 + }, + { + "epoch": 0.07841251802963416, + "grad_norm": 3.1875, + "learning_rate": 9.850596654188653e-05, + "loss": 2.0209, + "step": 1794 + }, + { + "epoch": 0.07845622623366406, + "grad_norm": 2.765625, + "learning_rate": 9.850429955039518e-05, + "loss": 2.0369, + "step": 1795 + }, + { + "epoch": 0.07849993443769396, + "grad_norm": 2.453125, + "learning_rate": 9.85026316435571e-05, + "loss": 2.1397, + "step": 1796 + }, + { + "epoch": 0.07854364264172385, + "grad_norm": 2.53125, + "learning_rate": 9.850096282140379e-05, + "loss": 1.6577, + "step": 1797 + }, + { + "epoch": 0.07858735084575375, + "grad_norm": 3.296875, + "learning_rate": 9.84992930839667e-05, + "loss": 2.19, + "step": 1798 + }, + { + "epoch": 0.07863105904978364, + "grad_norm": 3.375, + "learning_rate": 9.849762243127737e-05, + "loss": 1.9441, + "step": 1799 + }, + { + "epoch": 0.07867476725381355, + "grad_norm": 2.6875, + "learning_rate": 9.849595086336732e-05, + "loss": 2.5172, + "step": 1800 + }, + { + "epoch": 0.07871847545784344, + "grad_norm": 2.703125, + "learning_rate": 9.84942783802681e-05, + "loss": 1.7464, + "step": 1801 + }, + { + "epoch": 0.07876218366187333, + "grad_norm": 2.546875, + "learning_rate": 9.849260498201126e-05, + "loss": 2.0358, + "step": 1802 + }, + { + "epoch": 0.07880589186590323, + "grad_norm": 2.75, + "learning_rate": 9.849093066862837e-05, + "loss": 2.3709, + "step": 1803 + }, + { + "epoch": 0.07884960006993312, + "grad_norm": 2.6875, + "learning_rate": 9.848925544015106e-05, + "loss": 2.3652, + "step": 1804 + }, + { + "epoch": 0.07889330827396303, + "grad_norm": 2.296875, + "learning_rate": 9.848757929661095e-05, + "loss": 1.8576, + "step": 1805 + }, + { + "epoch": 0.07893701647799292, + "grad_norm": 3.0625, + "learning_rate": 9.848590223803961e-05, + "loss": 1.9136, + "step": 1806 + }, + { + "epoch": 0.07898072468202282, + "grad_norm": 4.46875, + "learning_rate": 9.848422426446875e-05, + "loss": 2.979, + "step": 1807 + }, + { + "epoch": 0.07902443288605271, + "grad_norm": 2.828125, + "learning_rate": 9.848254537593e-05, + "loss": 2.4617, + "step": 1808 + }, + { + "epoch": 0.0790681410900826, + "grad_norm": 2.265625, + "learning_rate": 9.848086557245507e-05, + "loss": 1.9116, + "step": 1809 + }, + { + "epoch": 0.07911184929411251, + "grad_norm": 2.890625, + "learning_rate": 9.847918485407563e-05, + "loss": 2.1013, + "step": 1810 + }, + { + "epoch": 0.0791555574981424, + "grad_norm": 2.75, + "learning_rate": 9.847750322082341e-05, + "loss": 2.1538, + "step": 1811 + }, + { + "epoch": 0.0791992657021723, + "grad_norm": 2.53125, + "learning_rate": 9.847582067273015e-05, + "loss": 2.1033, + "step": 1812 + }, + { + "epoch": 0.07924297390620219, + "grad_norm": 2.453125, + "learning_rate": 9.847413720982763e-05, + "loss": 2.1661, + "step": 1813 + }, + { + "epoch": 0.07928668211023208, + "grad_norm": 2.375, + "learning_rate": 9.847245283214757e-05, + "loss": 1.8955, + "step": 1814 + }, + { + "epoch": 0.07933039031426199, + "grad_norm": 2.484375, + "learning_rate": 9.847076753972176e-05, + "loss": 1.9043, + "step": 1815 + }, + { + "epoch": 0.07937409851829189, + "grad_norm": 3.28125, + "learning_rate": 9.846908133258204e-05, + "loss": 1.8656, + "step": 1816 + }, + { + "epoch": 0.07941780672232178, + "grad_norm": 2.984375, + "learning_rate": 9.846739421076022e-05, + "loss": 2.4502, + "step": 1817 + }, + { + "epoch": 0.07946151492635167, + "grad_norm": 2.4375, + "learning_rate": 9.846570617428811e-05, + "loss": 2.2399, + "step": 1818 + }, + { + "epoch": 0.07950522313038157, + "grad_norm": 2.53125, + "learning_rate": 9.84640172231976e-05, + "loss": 2.2902, + "step": 1819 + }, + { + "epoch": 0.07954893133441147, + "grad_norm": 5.0625, + "learning_rate": 9.846232735752055e-05, + "loss": 1.7842, + "step": 1820 + }, + { + "epoch": 0.07959263953844137, + "grad_norm": 2.953125, + "learning_rate": 9.846063657728884e-05, + "loss": 1.8978, + "step": 1821 + }, + { + "epoch": 0.07963634774247126, + "grad_norm": 3.078125, + "learning_rate": 9.845894488253438e-05, + "loss": 2.3265, + "step": 1822 + }, + { + "epoch": 0.07968005594650115, + "grad_norm": 3.921875, + "learning_rate": 9.84572522732891e-05, + "loss": 1.7457, + "step": 1823 + }, + { + "epoch": 0.07972376415053105, + "grad_norm": 3.46875, + "learning_rate": 9.845555874958496e-05, + "loss": 2.2468, + "step": 1824 + }, + { + "epoch": 0.07976747235456096, + "grad_norm": 2.9375, + "learning_rate": 9.84538643114539e-05, + "loss": 2.579, + "step": 1825 + }, + { + "epoch": 0.07981118055859085, + "grad_norm": 12.9375, + "learning_rate": 9.84521689589279e-05, + "loss": 1.3026, + "step": 1826 + }, + { + "epoch": 0.07985488876262074, + "grad_norm": 3.625, + "learning_rate": 9.845047269203895e-05, + "loss": 2.2743, + "step": 1827 + }, + { + "epoch": 0.07989859696665064, + "grad_norm": 2.296875, + "learning_rate": 9.844877551081906e-05, + "loss": 1.8684, + "step": 1828 + }, + { + "epoch": 0.07994230517068053, + "grad_norm": 3.265625, + "learning_rate": 9.844707741530026e-05, + "loss": 2.2146, + "step": 1829 + }, + { + "epoch": 0.07998601337471044, + "grad_norm": 3.203125, + "learning_rate": 9.844537840551462e-05, + "loss": 2.2717, + "step": 1830 + }, + { + "epoch": 0.08002972157874033, + "grad_norm": 2.578125, + "learning_rate": 9.844367848149417e-05, + "loss": 1.9394, + "step": 1831 + }, + { + "epoch": 0.08007342978277023, + "grad_norm": 2.40625, + "learning_rate": 9.844197764327097e-05, + "loss": 2.0863, + "step": 1832 + }, + { + "epoch": 0.08011713798680012, + "grad_norm": 2.75, + "learning_rate": 9.844027589087719e-05, + "loss": 2.6755, + "step": 1833 + }, + { + "epoch": 0.08016084619083001, + "grad_norm": 2.671875, + "learning_rate": 9.843857322434487e-05, + "loss": 2.1548, + "step": 1834 + }, + { + "epoch": 0.08020455439485992, + "grad_norm": 2.859375, + "learning_rate": 9.84368696437062e-05, + "loss": 2.247, + "step": 1835 + }, + { + "epoch": 0.08024826259888981, + "grad_norm": 2.96875, + "learning_rate": 9.843516514899329e-05, + "loss": 1.8082, + "step": 1836 + }, + { + "epoch": 0.08029197080291971, + "grad_norm": 2.734375, + "learning_rate": 9.843345974023832e-05, + "loss": 2.3728, + "step": 1837 + }, + { + "epoch": 0.0803356790069496, + "grad_norm": 3.546875, + "learning_rate": 9.843175341747348e-05, + "loss": 2.3455, + "step": 1838 + }, + { + "epoch": 0.0803793872109795, + "grad_norm": 3.34375, + "learning_rate": 9.843004618073096e-05, + "loss": 2.3092, + "step": 1839 + }, + { + "epoch": 0.0804230954150094, + "grad_norm": 2.90625, + "learning_rate": 9.842833803004298e-05, + "loss": 1.8681, + "step": 1840 + }, + { + "epoch": 0.0804668036190393, + "grad_norm": 2.578125, + "learning_rate": 9.842662896544176e-05, + "loss": 1.9617, + "step": 1841 + }, + { + "epoch": 0.08051051182306919, + "grad_norm": 3.421875, + "learning_rate": 9.84249189869596e-05, + "loss": 2.3176, + "step": 1842 + }, + { + "epoch": 0.08055422002709908, + "grad_norm": 2.3125, + "learning_rate": 9.842320809462873e-05, + "loss": 2.0864, + "step": 1843 + }, + { + "epoch": 0.08059792823112898, + "grad_norm": 3.125, + "learning_rate": 9.842149628848145e-05, + "loss": 1.9941, + "step": 1844 + }, + { + "epoch": 0.08064163643515888, + "grad_norm": 4.25, + "learning_rate": 9.841978356855005e-05, + "loss": 2.5347, + "step": 1845 + }, + { + "epoch": 0.08068534463918878, + "grad_norm": 2.65625, + "learning_rate": 9.841806993486686e-05, + "loss": 2.5548, + "step": 1846 + }, + { + "epoch": 0.08072905284321867, + "grad_norm": 2.3125, + "learning_rate": 9.841635538746424e-05, + "loss": 2.0899, + "step": 1847 + }, + { + "epoch": 0.08077276104724856, + "grad_norm": 2.625, + "learning_rate": 9.841463992637451e-05, + "loss": 2.5336, + "step": 1848 + }, + { + "epoch": 0.08081646925127846, + "grad_norm": 2.703125, + "learning_rate": 9.841292355163008e-05, + "loss": 1.8719, + "step": 1849 + }, + { + "epoch": 0.08086017745530837, + "grad_norm": 3.21875, + "learning_rate": 9.84112062632633e-05, + "loss": 2.7897, + "step": 1850 + }, + { + "epoch": 0.08090388565933826, + "grad_norm": 4.03125, + "learning_rate": 9.84094880613066e-05, + "loss": 3.1195, + "step": 1851 + }, + { + "epoch": 0.08094759386336815, + "grad_norm": 3.03125, + "learning_rate": 9.840776894579241e-05, + "loss": 1.8838, + "step": 1852 + }, + { + "epoch": 0.08099130206739805, + "grad_norm": 2.609375, + "learning_rate": 9.840604891675317e-05, + "loss": 1.8534, + "step": 1853 + }, + { + "epoch": 0.08103501027142794, + "grad_norm": 3.609375, + "learning_rate": 9.840432797422132e-05, + "loss": 1.9414, + "step": 1854 + }, + { + "epoch": 0.08107871847545785, + "grad_norm": 3.1875, + "learning_rate": 9.840260611822938e-05, + "loss": 2.4429, + "step": 1855 + }, + { + "epoch": 0.08112242667948774, + "grad_norm": 2.953125, + "learning_rate": 9.84008833488098e-05, + "loss": 2.4672, + "step": 1856 + }, + { + "epoch": 0.08116613488351763, + "grad_norm": 2.84375, + "learning_rate": 9.83991596659951e-05, + "loss": 2.3259, + "step": 1857 + }, + { + "epoch": 0.08120984308754753, + "grad_norm": 5.65625, + "learning_rate": 9.839743506981782e-05, + "loss": 1.7876, + "step": 1858 + }, + { + "epoch": 0.08125355129157742, + "grad_norm": 3.046875, + "learning_rate": 9.83957095603105e-05, + "loss": 2.2488, + "step": 1859 + }, + { + "epoch": 0.08129725949560733, + "grad_norm": 3.984375, + "learning_rate": 9.839398313750571e-05, + "loss": 2.2605, + "step": 1860 + }, + { + "epoch": 0.08134096769963722, + "grad_norm": 2.453125, + "learning_rate": 9.839225580143601e-05, + "loss": 1.7144, + "step": 1861 + }, + { + "epoch": 0.08138467590366712, + "grad_norm": 2.515625, + "learning_rate": 9.839052755213403e-05, + "loss": 2.2738, + "step": 1862 + }, + { + "epoch": 0.08142838410769701, + "grad_norm": 2.828125, + "learning_rate": 9.838879838963235e-05, + "loss": 1.904, + "step": 1863 + }, + { + "epoch": 0.08147209231172692, + "grad_norm": 3.078125, + "learning_rate": 9.838706831396362e-05, + "loss": 2.7066, + "step": 1864 + }, + { + "epoch": 0.08151580051575681, + "grad_norm": 2.78125, + "learning_rate": 9.838533732516051e-05, + "loss": 1.9853, + "step": 1865 + }, + { + "epoch": 0.0815595087197867, + "grad_norm": 2.765625, + "learning_rate": 9.838360542325565e-05, + "loss": 2.0883, + "step": 1866 + }, + { + "epoch": 0.0816032169238166, + "grad_norm": 3.046875, + "learning_rate": 9.838187260828173e-05, + "loss": 2.198, + "step": 1867 + }, + { + "epoch": 0.08164692512784649, + "grad_norm": 2.46875, + "learning_rate": 9.838013888027145e-05, + "loss": 2.0494, + "step": 1868 + }, + { + "epoch": 0.0816906333318764, + "grad_norm": 2.484375, + "learning_rate": 9.837840423925755e-05, + "loss": 1.6796, + "step": 1869 + }, + { + "epoch": 0.0817343415359063, + "grad_norm": 2.84375, + "learning_rate": 9.837666868527274e-05, + "loss": 2.2668, + "step": 1870 + }, + { + "epoch": 0.08177804973993619, + "grad_norm": 2.375, + "learning_rate": 9.837493221834979e-05, + "loss": 2.0788, + "step": 1871 + }, + { + "epoch": 0.08182175794396608, + "grad_norm": 2.59375, + "learning_rate": 9.837319483852147e-05, + "loss": 1.9986, + "step": 1872 + }, + { + "epoch": 0.08186546614799597, + "grad_norm": 2.578125, + "learning_rate": 9.837145654582054e-05, + "loss": 2.0753, + "step": 1873 + }, + { + "epoch": 0.08190917435202588, + "grad_norm": 2.53125, + "learning_rate": 9.836971734027985e-05, + "loss": 2.1466, + "step": 1874 + }, + { + "epoch": 0.08195288255605578, + "grad_norm": 2.703125, + "learning_rate": 9.836797722193217e-05, + "loss": 2.1172, + "step": 1875 + }, + { + "epoch": 0.08199659076008567, + "grad_norm": 2.84375, + "learning_rate": 9.836623619081039e-05, + "loss": 2.0186, + "step": 1876 + }, + { + "epoch": 0.08204029896411556, + "grad_norm": 2.5625, + "learning_rate": 9.836449424694732e-05, + "loss": 1.7065, + "step": 1877 + }, + { + "epoch": 0.08208400716814546, + "grad_norm": 3.609375, + "learning_rate": 9.836275139037585e-05, + "loss": 2.7346, + "step": 1878 + }, + { + "epoch": 0.08212771537217536, + "grad_norm": 2.546875, + "learning_rate": 9.836100762112888e-05, + "loss": 2.1246, + "step": 1879 + }, + { + "epoch": 0.08217142357620526, + "grad_norm": 2.671875, + "learning_rate": 9.83592629392393e-05, + "loss": 2.2013, + "step": 1880 + }, + { + "epoch": 0.08221513178023515, + "grad_norm": 3.453125, + "learning_rate": 9.835751734474005e-05, + "loss": 2.2353, + "step": 1881 + }, + { + "epoch": 0.08225883998426504, + "grad_norm": 2.90625, + "learning_rate": 9.835577083766407e-05, + "loss": 2.8191, + "step": 1882 + }, + { + "epoch": 0.08230254818829494, + "grad_norm": 3.25, + "learning_rate": 9.835402341804432e-05, + "loss": 2.6574, + "step": 1883 + }, + { + "epoch": 0.08234625639232485, + "grad_norm": 2.59375, + "learning_rate": 9.835227508591376e-05, + "loss": 2.3453, + "step": 1884 + }, + { + "epoch": 0.08238996459635474, + "grad_norm": 2.65625, + "learning_rate": 9.835052584130541e-05, + "loss": 1.847, + "step": 1885 + }, + { + "epoch": 0.08243367280038463, + "grad_norm": 3.140625, + "learning_rate": 9.834877568425225e-05, + "loss": 1.968, + "step": 1886 + }, + { + "epoch": 0.08247738100441453, + "grad_norm": 2.46875, + "learning_rate": 9.834702461478733e-05, + "loss": 1.6705, + "step": 1887 + }, + { + "epoch": 0.08252108920844442, + "grad_norm": 3.125, + "learning_rate": 9.834527263294371e-05, + "loss": 2.1591, + "step": 1888 + }, + { + "epoch": 0.08256479741247433, + "grad_norm": 2.359375, + "learning_rate": 9.834351973875441e-05, + "loss": 1.9738, + "step": 1889 + }, + { + "epoch": 0.08260850561650422, + "grad_norm": 4.21875, + "learning_rate": 9.834176593225254e-05, + "loss": 2.5599, + "step": 1890 + }, + { + "epoch": 0.08265221382053412, + "grad_norm": 3.28125, + "learning_rate": 9.834001121347119e-05, + "loss": 1.9454, + "step": 1891 + }, + { + "epoch": 0.08269592202456401, + "grad_norm": 2.515625, + "learning_rate": 9.833825558244347e-05, + "loss": 1.8525, + "step": 1892 + }, + { + "epoch": 0.0827396302285939, + "grad_norm": 3.140625, + "learning_rate": 9.83364990392025e-05, + "loss": 2.792, + "step": 1893 + }, + { + "epoch": 0.08278333843262381, + "grad_norm": 2.609375, + "learning_rate": 9.833474158378147e-05, + "loss": 2.171, + "step": 1894 + }, + { + "epoch": 0.0828270466366537, + "grad_norm": 3.53125, + "learning_rate": 9.833298321621349e-05, + "loss": 2.5549, + "step": 1895 + }, + { + "epoch": 0.0828707548406836, + "grad_norm": 2.625, + "learning_rate": 9.833122393653178e-05, + "loss": 2.6365, + "step": 1896 + }, + { + "epoch": 0.08291446304471349, + "grad_norm": 3.234375, + "learning_rate": 9.832946374476954e-05, + "loss": 2.2036, + "step": 1897 + }, + { + "epoch": 0.08295817124874338, + "grad_norm": 2.6875, + "learning_rate": 9.832770264095998e-05, + "loss": 2.0949, + "step": 1898 + }, + { + "epoch": 0.08300187945277329, + "grad_norm": 2.484375, + "learning_rate": 9.832594062513634e-05, + "loss": 1.9104, + "step": 1899 + }, + { + "epoch": 0.08304558765680319, + "grad_norm": 3.25, + "learning_rate": 9.832417769733185e-05, + "loss": 2.0378, + "step": 1900 + }, + { + "epoch": 0.08308929586083308, + "grad_norm": 2.625, + "learning_rate": 9.832241385757981e-05, + "loss": 2.1519, + "step": 1901 + }, + { + "epoch": 0.08313300406486297, + "grad_norm": 2.9375, + "learning_rate": 9.832064910591348e-05, + "loss": 2.804, + "step": 1902 + }, + { + "epoch": 0.08317671226889287, + "grad_norm": 2.40625, + "learning_rate": 9.831888344236617e-05, + "loss": 1.7018, + "step": 1903 + }, + { + "epoch": 0.08322042047292277, + "grad_norm": 2.46875, + "learning_rate": 9.83171168669712e-05, + "loss": 2.1303, + "step": 1904 + }, + { + "epoch": 0.08326412867695267, + "grad_norm": 3.109375, + "learning_rate": 9.831534937976192e-05, + "loss": 2.5991, + "step": 1905 + }, + { + "epoch": 0.08330783688098256, + "grad_norm": 2.453125, + "learning_rate": 9.831358098077168e-05, + "loss": 2.0354, + "step": 1906 + }, + { + "epoch": 0.08335154508501245, + "grad_norm": 2.1875, + "learning_rate": 9.831181167003385e-05, + "loss": 1.8323, + "step": 1907 + }, + { + "epoch": 0.08339525328904235, + "grad_norm": 2.421875, + "learning_rate": 9.831004144758183e-05, + "loss": 1.5327, + "step": 1908 + }, + { + "epoch": 0.08343896149307226, + "grad_norm": 2.265625, + "learning_rate": 9.8308270313449e-05, + "loss": 1.8952, + "step": 1909 + }, + { + "epoch": 0.08348266969710215, + "grad_norm": 3.140625, + "learning_rate": 9.83064982676688e-05, + "loss": 2.3476, + "step": 1910 + }, + { + "epoch": 0.08352637790113204, + "grad_norm": 2.328125, + "learning_rate": 9.830472531027468e-05, + "loss": 2.3885, + "step": 1911 + }, + { + "epoch": 0.08357008610516194, + "grad_norm": 2.78125, + "learning_rate": 9.830295144130009e-05, + "loss": 2.0377, + "step": 1912 + }, + { + "epoch": 0.08361379430919183, + "grad_norm": 2.90625, + "learning_rate": 9.830117666077849e-05, + "loss": 3.0738, + "step": 1913 + }, + { + "epoch": 0.08365750251322174, + "grad_norm": 3.578125, + "learning_rate": 9.82994009687434e-05, + "loss": 2.0192, + "step": 1914 + }, + { + "epoch": 0.08370121071725163, + "grad_norm": 2.484375, + "learning_rate": 9.829762436522831e-05, + "loss": 2.0367, + "step": 1915 + }, + { + "epoch": 0.08374491892128152, + "grad_norm": 2.46875, + "learning_rate": 9.829584685026676e-05, + "loss": 2.0879, + "step": 1916 + }, + { + "epoch": 0.08378862712531142, + "grad_norm": 3.765625, + "learning_rate": 9.829406842389229e-05, + "loss": 3.2253, + "step": 1917 + }, + { + "epoch": 0.08383233532934131, + "grad_norm": 2.9375, + "learning_rate": 9.829228908613845e-05, + "loss": 1.9372, + "step": 1918 + }, + { + "epoch": 0.08387604353337122, + "grad_norm": 3.109375, + "learning_rate": 9.829050883703882e-05, + "loss": 1.9538, + "step": 1919 + }, + { + "epoch": 0.08391975173740111, + "grad_norm": 2.75, + "learning_rate": 9.828872767662705e-05, + "loss": 2.3358, + "step": 1920 + }, + { + "epoch": 0.083963459941431, + "grad_norm": 2.828125, + "learning_rate": 9.828694560493667e-05, + "loss": 2.8138, + "step": 1921 + }, + { + "epoch": 0.0840071681454609, + "grad_norm": 8.0, + "learning_rate": 9.828516262200135e-05, + "loss": 2.7766, + "step": 1922 + }, + { + "epoch": 0.0840508763494908, + "grad_norm": 2.671875, + "learning_rate": 9.828337872785475e-05, + "loss": 2.4589, + "step": 1923 + }, + { + "epoch": 0.0840945845535207, + "grad_norm": 2.765625, + "learning_rate": 9.828159392253052e-05, + "loss": 2.168, + "step": 1924 + }, + { + "epoch": 0.0841382927575506, + "grad_norm": 3.171875, + "learning_rate": 9.827980820606232e-05, + "loss": 2.1371, + "step": 1925 + }, + { + "epoch": 0.08418200096158049, + "grad_norm": 3.328125, + "learning_rate": 9.82780215784839e-05, + "loss": 2.5149, + "step": 1926 + }, + { + "epoch": 0.08422570916561038, + "grad_norm": 2.765625, + "learning_rate": 9.827623403982892e-05, + "loss": 2.0682, + "step": 1927 + }, + { + "epoch": 0.08426941736964028, + "grad_norm": 2.953125, + "learning_rate": 9.827444559013115e-05, + "loss": 2.2317, + "step": 1928 + }, + { + "epoch": 0.08431312557367018, + "grad_norm": 2.140625, + "learning_rate": 9.827265622942434e-05, + "loss": 1.63, + "step": 1929 + }, + { + "epoch": 0.08435683377770008, + "grad_norm": 2.75, + "learning_rate": 9.827086595774225e-05, + "loss": 1.8615, + "step": 1930 + }, + { + "epoch": 0.08440054198172997, + "grad_norm": 2.6875, + "learning_rate": 9.826907477511865e-05, + "loss": 2.2187, + "step": 1931 + }, + { + "epoch": 0.08444425018575986, + "grad_norm": 2.796875, + "learning_rate": 9.826728268158737e-05, + "loss": 2.1593, + "step": 1932 + }, + { + "epoch": 0.08448795838978976, + "grad_norm": 2.734375, + "learning_rate": 9.826548967718221e-05, + "loss": 2.1498, + "step": 1933 + }, + { + "epoch": 0.08453166659381967, + "grad_norm": 2.40625, + "learning_rate": 9.826369576193701e-05, + "loss": 2.1045, + "step": 1934 + }, + { + "epoch": 0.08457537479784956, + "grad_norm": 2.765625, + "learning_rate": 9.826190093588563e-05, + "loss": 2.108, + "step": 1935 + }, + { + "epoch": 0.08461908300187945, + "grad_norm": 2.65625, + "learning_rate": 9.826010519906194e-05, + "loss": 2.1962, + "step": 1936 + }, + { + "epoch": 0.08466279120590935, + "grad_norm": 2.828125, + "learning_rate": 9.825830855149983e-05, + "loss": 2.0458, + "step": 1937 + }, + { + "epoch": 0.08470649940993924, + "grad_norm": 2.71875, + "learning_rate": 9.82565109932332e-05, + "loss": 2.0363, + "step": 1938 + }, + { + "epoch": 0.08475020761396915, + "grad_norm": 4.34375, + "learning_rate": 9.825471252429596e-05, + "loss": 1.9519, + "step": 1939 + }, + { + "epoch": 0.08479391581799904, + "grad_norm": 2.734375, + "learning_rate": 9.825291314472208e-05, + "loss": 2.0237, + "step": 1940 + }, + { + "epoch": 0.08483762402202893, + "grad_norm": 3.578125, + "learning_rate": 9.82511128545455e-05, + "loss": 2.245, + "step": 1941 + }, + { + "epoch": 0.08488133222605883, + "grad_norm": 3.640625, + "learning_rate": 9.824931165380018e-05, + "loss": 3.7645, + "step": 1942 + }, + { + "epoch": 0.08492504043008872, + "grad_norm": 2.65625, + "learning_rate": 9.824750954252014e-05, + "loss": 2.355, + "step": 1943 + }, + { + "epoch": 0.08496874863411863, + "grad_norm": 3.28125, + "learning_rate": 9.824570652073935e-05, + "loss": 2.4083, + "step": 1944 + }, + { + "epoch": 0.08501245683814852, + "grad_norm": 3.1875, + "learning_rate": 9.824390258849188e-05, + "loss": 2.2051, + "step": 1945 + }, + { + "epoch": 0.08505616504217842, + "grad_norm": 3.0625, + "learning_rate": 9.824209774581174e-05, + "loss": 2.7646, + "step": 1946 + }, + { + "epoch": 0.08509987324620831, + "grad_norm": 3.6875, + "learning_rate": 9.824029199273302e-05, + "loss": 3.1808, + "step": 1947 + }, + { + "epoch": 0.0851435814502382, + "grad_norm": 2.84375, + "learning_rate": 9.823848532928978e-05, + "loss": 2.0116, + "step": 1948 + }, + { + "epoch": 0.08518728965426811, + "grad_norm": 2.609375, + "learning_rate": 9.823667775551611e-05, + "loss": 2.1517, + "step": 1949 + }, + { + "epoch": 0.085230997858298, + "grad_norm": 2.6875, + "learning_rate": 9.823486927144612e-05, + "loss": 2.1716, + "step": 1950 + }, + { + "epoch": 0.0852747060623279, + "grad_norm": 3.515625, + "learning_rate": 9.823305987711393e-05, + "loss": 1.7987, + "step": 1951 + }, + { + "epoch": 0.08531841426635779, + "grad_norm": 8.6875, + "learning_rate": 9.823124957255372e-05, + "loss": 1.7863, + "step": 1952 + }, + { + "epoch": 0.08536212247038769, + "grad_norm": 2.3125, + "learning_rate": 9.822943835779963e-05, + "loss": 1.958, + "step": 1953 + }, + { + "epoch": 0.0854058306744176, + "grad_norm": 2.734375, + "learning_rate": 9.822762623288584e-05, + "loss": 2.1187, + "step": 1954 + }, + { + "epoch": 0.08544953887844749, + "grad_norm": 2.40625, + "learning_rate": 9.822581319784656e-05, + "loss": 1.9922, + "step": 1955 + }, + { + "epoch": 0.08549324708247738, + "grad_norm": 3.015625, + "learning_rate": 9.822399925271598e-05, + "loss": 2.26, + "step": 1956 + }, + { + "epoch": 0.08553695528650727, + "grad_norm": 2.453125, + "learning_rate": 9.822218439752834e-05, + "loss": 1.9014, + "step": 1957 + }, + { + "epoch": 0.08558066349053717, + "grad_norm": 3.3125, + "learning_rate": 9.822036863231793e-05, + "loss": 1.8039, + "step": 1958 + }, + { + "epoch": 0.08562437169456708, + "grad_norm": 2.90625, + "learning_rate": 9.821855195711897e-05, + "loss": 2.4106, + "step": 1959 + }, + { + "epoch": 0.08566807989859697, + "grad_norm": 2.828125, + "learning_rate": 9.821673437196574e-05, + "loss": 1.9691, + "step": 1960 + }, + { + "epoch": 0.08571178810262686, + "grad_norm": 3.703125, + "learning_rate": 9.821491587689257e-05, + "loss": 1.9763, + "step": 1961 + }, + { + "epoch": 0.08575549630665676, + "grad_norm": 3.09375, + "learning_rate": 9.821309647193376e-05, + "loss": 2.3648, + "step": 1962 + }, + { + "epoch": 0.08579920451068665, + "grad_norm": 2.953125, + "learning_rate": 9.821127615712364e-05, + "loss": 2.5316, + "step": 1963 + }, + { + "epoch": 0.08584291271471656, + "grad_norm": 2.453125, + "learning_rate": 9.820945493249659e-05, + "loss": 1.8494, + "step": 1964 + }, + { + "epoch": 0.08588662091874645, + "grad_norm": 2.546875, + "learning_rate": 9.820763279808695e-05, + "loss": 2.0713, + "step": 1965 + }, + { + "epoch": 0.08593032912277634, + "grad_norm": 2.453125, + "learning_rate": 9.820580975392912e-05, + "loss": 1.7838, + "step": 1966 + }, + { + "epoch": 0.08597403732680624, + "grad_norm": 3.515625, + "learning_rate": 9.820398580005749e-05, + "loss": 3.0606, + "step": 1967 + }, + { + "epoch": 0.08601774553083613, + "grad_norm": 2.359375, + "learning_rate": 9.820216093650649e-05, + "loss": 1.7928, + "step": 1968 + }, + { + "epoch": 0.08606145373486604, + "grad_norm": 2.328125, + "learning_rate": 9.820033516331057e-05, + "loss": 1.9501, + "step": 1969 + }, + { + "epoch": 0.08610516193889593, + "grad_norm": 2.40625, + "learning_rate": 9.819850848050419e-05, + "loss": 2.061, + "step": 1970 + }, + { + "epoch": 0.08614887014292583, + "grad_norm": 3.15625, + "learning_rate": 9.819668088812177e-05, + "loss": 1.9728, + "step": 1971 + }, + { + "epoch": 0.08619257834695572, + "grad_norm": 3.375, + "learning_rate": 9.819485238619784e-05, + "loss": 2.0728, + "step": 1972 + }, + { + "epoch": 0.08623628655098561, + "grad_norm": 2.71875, + "learning_rate": 9.819302297476692e-05, + "loss": 1.947, + "step": 1973 + }, + { + "epoch": 0.08627999475501552, + "grad_norm": 3.0625, + "learning_rate": 9.819119265386349e-05, + "loss": 2.1996, + "step": 1974 + }, + { + "epoch": 0.08632370295904541, + "grad_norm": 3.15625, + "learning_rate": 9.818936142352212e-05, + "loss": 2.351, + "step": 1975 + }, + { + "epoch": 0.08636741116307531, + "grad_norm": 2.578125, + "learning_rate": 9.81875292837774e-05, + "loss": 1.9119, + "step": 1976 + }, + { + "epoch": 0.0864111193671052, + "grad_norm": 2.609375, + "learning_rate": 9.818569623466383e-05, + "loss": 2.0526, + "step": 1977 + }, + { + "epoch": 0.0864548275711351, + "grad_norm": 3.03125, + "learning_rate": 9.818386227621605e-05, + "loss": 2.6655, + "step": 1978 + }, + { + "epoch": 0.086498535775165, + "grad_norm": 3.40625, + "learning_rate": 9.818202740846867e-05, + "loss": 2.1808, + "step": 1979 + }, + { + "epoch": 0.0865422439791949, + "grad_norm": 4.21875, + "learning_rate": 9.818019163145631e-05, + "loss": 2.8019, + "step": 1980 + }, + { + "epoch": 0.08658595218322479, + "grad_norm": 3.546875, + "learning_rate": 9.81783549452136e-05, + "loss": 2.1154, + "step": 1981 + }, + { + "epoch": 0.08662966038725468, + "grad_norm": 2.5, + "learning_rate": 9.81765173497752e-05, + "loss": 2.0534, + "step": 1982 + }, + { + "epoch": 0.08667336859128458, + "grad_norm": 2.75, + "learning_rate": 9.817467884517582e-05, + "loss": 2.0711, + "step": 1983 + }, + { + "epoch": 0.08671707679531448, + "grad_norm": 3.5625, + "learning_rate": 9.817283943145013e-05, + "loss": 2.0547, + "step": 1984 + }, + { + "epoch": 0.08676078499934438, + "grad_norm": 2.71875, + "learning_rate": 9.817099910863285e-05, + "loss": 2.4254, + "step": 1985 + }, + { + "epoch": 0.08680449320337427, + "grad_norm": 2.6875, + "learning_rate": 9.81691578767587e-05, + "loss": 2.117, + "step": 1986 + }, + { + "epoch": 0.08684820140740417, + "grad_norm": 2.671875, + "learning_rate": 9.816731573586245e-05, + "loss": 2.1401, + "step": 1987 + }, + { + "epoch": 0.08689190961143406, + "grad_norm": 2.75, + "learning_rate": 9.816547268597882e-05, + "loss": 2.3913, + "step": 1988 + }, + { + "epoch": 0.08693561781546397, + "grad_norm": 2.796875, + "learning_rate": 9.816362872714264e-05, + "loss": 2.3797, + "step": 1989 + }, + { + "epoch": 0.08697932601949386, + "grad_norm": 2.65625, + "learning_rate": 9.816178385938868e-05, + "loss": 2.1, + "step": 1990 + }, + { + "epoch": 0.08702303422352375, + "grad_norm": 2.421875, + "learning_rate": 9.815993808275177e-05, + "loss": 2.0036, + "step": 1991 + }, + { + "epoch": 0.08706674242755365, + "grad_norm": 3.78125, + "learning_rate": 9.815809139726673e-05, + "loss": 1.5592, + "step": 1992 + }, + { + "epoch": 0.08711045063158354, + "grad_norm": 3.09375, + "learning_rate": 9.815624380296841e-05, + "loss": 2.726, + "step": 1993 + }, + { + "epoch": 0.08715415883561345, + "grad_norm": 3.03125, + "learning_rate": 9.815439529989168e-05, + "loss": 2.5015, + "step": 1994 + }, + { + "epoch": 0.08719786703964334, + "grad_norm": 3.125, + "learning_rate": 9.815254588807143e-05, + "loss": 1.9638, + "step": 1995 + }, + { + "epoch": 0.08724157524367324, + "grad_norm": 3.203125, + "learning_rate": 9.815069556754256e-05, + "loss": 2.2552, + "step": 1996 + }, + { + "epoch": 0.08728528344770313, + "grad_norm": 2.625, + "learning_rate": 9.814884433833999e-05, + "loss": 2.0396, + "step": 1997 + }, + { + "epoch": 0.08732899165173302, + "grad_norm": 9.1875, + "learning_rate": 9.814699220049863e-05, + "loss": 1.9973, + "step": 1998 + }, + { + "epoch": 0.08737269985576293, + "grad_norm": 2.5625, + "learning_rate": 9.814513915405346e-05, + "loss": 2.2719, + "step": 1999 + }, + { + "epoch": 0.08741640805979282, + "grad_norm": 3.625, + "learning_rate": 9.814328519903946e-05, + "loss": 2.6787, + "step": 2000 + }, + { + "epoch": 0.08746011626382272, + "grad_norm": 2.84375, + "learning_rate": 9.814143033549157e-05, + "loss": 2.5491, + "step": 2001 + }, + { + "epoch": 0.08750382446785261, + "grad_norm": 3.125, + "learning_rate": 9.813957456344485e-05, + "loss": 3.0051, + "step": 2002 + }, + { + "epoch": 0.08754753267188252, + "grad_norm": 3.703125, + "learning_rate": 9.813771788293429e-05, + "loss": 1.8316, + "step": 2003 + }, + { + "epoch": 0.08759124087591241, + "grad_norm": 2.390625, + "learning_rate": 9.813586029399492e-05, + "loss": 1.6633, + "step": 2004 + }, + { + "epoch": 0.0876349490799423, + "grad_norm": 4.375, + "learning_rate": 9.813400179666181e-05, + "loss": 2.5481, + "step": 2005 + }, + { + "epoch": 0.0876786572839722, + "grad_norm": 2.984375, + "learning_rate": 9.813214239097004e-05, + "loss": 2.0448, + "step": 2006 + }, + { + "epoch": 0.0877223654880021, + "grad_norm": 3.25, + "learning_rate": 9.813028207695467e-05, + "loss": 1.9342, + "step": 2007 + }, + { + "epoch": 0.087766073692032, + "grad_norm": 3.265625, + "learning_rate": 9.812842085465085e-05, + "loss": 2.5093, + "step": 2008 + }, + { + "epoch": 0.0878097818960619, + "grad_norm": 2.53125, + "learning_rate": 9.812655872409366e-05, + "loss": 2.2058, + "step": 2009 + }, + { + "epoch": 0.08785349010009179, + "grad_norm": 3.453125, + "learning_rate": 9.812469568531828e-05, + "loss": 2.2205, + "step": 2010 + }, + { + "epoch": 0.08789719830412168, + "grad_norm": 4.46875, + "learning_rate": 9.812283173835985e-05, + "loss": 2.2833, + "step": 2011 + }, + { + "epoch": 0.08794090650815158, + "grad_norm": 2.4375, + "learning_rate": 9.812096688325354e-05, + "loss": 2.1205, + "step": 2012 + }, + { + "epoch": 0.08798461471218148, + "grad_norm": 3.1875, + "learning_rate": 9.811910112003455e-05, + "loss": 1.9461, + "step": 2013 + }, + { + "epoch": 0.08802832291621138, + "grad_norm": 42.5, + "learning_rate": 9.811723444873809e-05, + "loss": 2.3175, + "step": 2014 + }, + { + "epoch": 0.08807203112024127, + "grad_norm": 2.40625, + "learning_rate": 9.811536686939937e-05, + "loss": 1.8189, + "step": 2015 + }, + { + "epoch": 0.08811573932427116, + "grad_norm": 2.953125, + "learning_rate": 9.811349838205366e-05, + "loss": 2.5017, + "step": 2016 + }, + { + "epoch": 0.08815944752830106, + "grad_norm": 2.59375, + "learning_rate": 9.81116289867362e-05, + "loss": 1.8558, + "step": 2017 + }, + { + "epoch": 0.08820315573233097, + "grad_norm": 2.375, + "learning_rate": 9.81097586834823e-05, + "loss": 2.0018, + "step": 2018 + }, + { + "epoch": 0.08824686393636086, + "grad_norm": 3.15625, + "learning_rate": 9.810788747232721e-05, + "loss": 2.0362, + "step": 2019 + }, + { + "epoch": 0.08829057214039075, + "grad_norm": 3.28125, + "learning_rate": 9.810601535330627e-05, + "loss": 2.3536, + "step": 2020 + }, + { + "epoch": 0.08833428034442065, + "grad_norm": 2.5625, + "learning_rate": 9.810414232645482e-05, + "loss": 2.0757, + "step": 2021 + }, + { + "epoch": 0.08837798854845054, + "grad_norm": 2.4375, + "learning_rate": 9.810226839180817e-05, + "loss": 2.1589, + "step": 2022 + }, + { + "epoch": 0.08842169675248045, + "grad_norm": 3.5, + "learning_rate": 9.810039354940173e-05, + "loss": 2.8814, + "step": 2023 + }, + { + "epoch": 0.08846540495651034, + "grad_norm": 3.1875, + "learning_rate": 9.809851779927084e-05, + "loss": 2.5291, + "step": 2024 + }, + { + "epoch": 0.08850911316054023, + "grad_norm": 2.8125, + "learning_rate": 9.809664114145091e-05, + "loss": 2.4861, + "step": 2025 + }, + { + "epoch": 0.08855282136457013, + "grad_norm": 2.6875, + "learning_rate": 9.809476357597738e-05, + "loss": 2.0149, + "step": 2026 + }, + { + "epoch": 0.08859652956860002, + "grad_norm": 4.34375, + "learning_rate": 9.809288510288566e-05, + "loss": 3.2423, + "step": 2027 + }, + { + "epoch": 0.08864023777262993, + "grad_norm": 2.984375, + "learning_rate": 9.809100572221118e-05, + "loss": 1.4191, + "step": 2028 + }, + { + "epoch": 0.08868394597665982, + "grad_norm": 7.0625, + "learning_rate": 9.808912543398945e-05, + "loss": 2.0518, + "step": 2029 + }, + { + "epoch": 0.08872765418068972, + "grad_norm": 3.609375, + "learning_rate": 9.808724423825592e-05, + "loss": 1.9862, + "step": 2030 + }, + { + "epoch": 0.08877136238471961, + "grad_norm": 2.234375, + "learning_rate": 9.80853621350461e-05, + "loss": 1.978, + "step": 2031 + }, + { + "epoch": 0.0888150705887495, + "grad_norm": 2.28125, + "learning_rate": 9.808347912439554e-05, + "loss": 2.0013, + "step": 2032 + }, + { + "epoch": 0.08885877879277941, + "grad_norm": 2.671875, + "learning_rate": 9.808159520633973e-05, + "loss": 2.1639, + "step": 2033 + }, + { + "epoch": 0.0889024869968093, + "grad_norm": 2.96875, + "learning_rate": 9.807971038091423e-05, + "loss": 2.9512, + "step": 2034 + }, + { + "epoch": 0.0889461952008392, + "grad_norm": 2.796875, + "learning_rate": 9.807782464815463e-05, + "loss": 2.0199, + "step": 2035 + }, + { + "epoch": 0.08898990340486909, + "grad_norm": 2.5, + "learning_rate": 9.80759380080965e-05, + "loss": 2.0716, + "step": 2036 + }, + { + "epoch": 0.08903361160889899, + "grad_norm": 2.5, + "learning_rate": 9.807405046077545e-05, + "loss": 2.1774, + "step": 2037 + }, + { + "epoch": 0.08907731981292889, + "grad_norm": 4.8125, + "learning_rate": 9.80721620062271e-05, + "loss": 2.5221, + "step": 2038 + }, + { + "epoch": 0.08912102801695879, + "grad_norm": 3.359375, + "learning_rate": 9.807027264448708e-05, + "loss": 2.0483, + "step": 2039 + }, + { + "epoch": 0.08916473622098868, + "grad_norm": 2.9375, + "learning_rate": 9.806838237559107e-05, + "loss": 1.8315, + "step": 2040 + }, + { + "epoch": 0.08920844442501857, + "grad_norm": 3.21875, + "learning_rate": 9.80664911995747e-05, + "loss": 2.7907, + "step": 2041 + }, + { + "epoch": 0.08925215262904847, + "grad_norm": 2.890625, + "learning_rate": 9.80645991164737e-05, + "loss": 2.3077, + "step": 2042 + }, + { + "epoch": 0.08929586083307837, + "grad_norm": 2.640625, + "learning_rate": 9.806270612632375e-05, + "loss": 2.032, + "step": 2043 + }, + { + "epoch": 0.08933956903710827, + "grad_norm": 2.78125, + "learning_rate": 9.806081222916059e-05, + "loss": 2.1644, + "step": 2044 + }, + { + "epoch": 0.08938327724113816, + "grad_norm": 5.5625, + "learning_rate": 9.805891742501996e-05, + "loss": 2.7271, + "step": 2045 + }, + { + "epoch": 0.08942698544516806, + "grad_norm": 2.484375, + "learning_rate": 9.80570217139376e-05, + "loss": 1.9969, + "step": 2046 + }, + { + "epoch": 0.08947069364919795, + "grad_norm": 3.046875, + "learning_rate": 9.80551250959493e-05, + "loss": 2.655, + "step": 2047 + }, + { + "epoch": 0.08951440185322786, + "grad_norm": 2.640625, + "learning_rate": 9.805322757109086e-05, + "loss": 2.6689, + "step": 2048 + }, + { + "epoch": 0.08955811005725775, + "grad_norm": 2.421875, + "learning_rate": 9.805132913939807e-05, + "loss": 2.1874, + "step": 2049 + }, + { + "epoch": 0.08960181826128764, + "grad_norm": 2.578125, + "learning_rate": 9.804942980090676e-05, + "loss": 2.3881, + "step": 2050 + }, + { + "epoch": 0.08964552646531754, + "grad_norm": 2.28125, + "learning_rate": 9.804752955565278e-05, + "loss": 2.0989, + "step": 2051 + }, + { + "epoch": 0.08968923466934743, + "grad_norm": 3.125, + "learning_rate": 9.804562840367198e-05, + "loss": 2.2578, + "step": 2052 + }, + { + "epoch": 0.08973294287337734, + "grad_norm": 2.625, + "learning_rate": 9.804372634500026e-05, + "loss": 1.8339, + "step": 2053 + }, + { + "epoch": 0.08977665107740723, + "grad_norm": 2.96875, + "learning_rate": 9.804182337967349e-05, + "loss": 2.4403, + "step": 2054 + }, + { + "epoch": 0.08982035928143713, + "grad_norm": 3.03125, + "learning_rate": 9.80399195077276e-05, + "loss": 2.6908, + "step": 2055 + }, + { + "epoch": 0.08986406748546702, + "grad_norm": 2.203125, + "learning_rate": 9.803801472919849e-05, + "loss": 2.0668, + "step": 2056 + }, + { + "epoch": 0.08990777568949691, + "grad_norm": 2.421875, + "learning_rate": 9.803610904412214e-05, + "loss": 2.2085, + "step": 2057 + }, + { + "epoch": 0.08995148389352682, + "grad_norm": 2.703125, + "learning_rate": 9.803420245253449e-05, + "loss": 2.0336, + "step": 2058 + }, + { + "epoch": 0.08999519209755671, + "grad_norm": 3.125, + "learning_rate": 9.803229495447154e-05, + "loss": 2.9318, + "step": 2059 + }, + { + "epoch": 0.09003890030158661, + "grad_norm": 2.484375, + "learning_rate": 9.803038654996927e-05, + "loss": 1.6366, + "step": 2060 + }, + { + "epoch": 0.0900826085056165, + "grad_norm": 2.375, + "learning_rate": 9.802847723906371e-05, + "loss": 1.9645, + "step": 2061 + }, + { + "epoch": 0.0901263167096464, + "grad_norm": 3.9375, + "learning_rate": 9.802656702179088e-05, + "loss": 2.3127, + "step": 2062 + }, + { + "epoch": 0.0901700249136763, + "grad_norm": 3.109375, + "learning_rate": 9.802465589818683e-05, + "loss": 1.8978, + "step": 2063 + }, + { + "epoch": 0.0902137331177062, + "grad_norm": 2.28125, + "learning_rate": 9.802274386828762e-05, + "loss": 1.9738, + "step": 2064 + }, + { + "epoch": 0.09025744132173609, + "grad_norm": 3.34375, + "learning_rate": 9.802083093212935e-05, + "loss": 2.8582, + "step": 2065 + }, + { + "epoch": 0.09030114952576598, + "grad_norm": 3.125, + "learning_rate": 9.801891708974809e-05, + "loss": 2.021, + "step": 2066 + }, + { + "epoch": 0.09034485772979588, + "grad_norm": 3.71875, + "learning_rate": 9.801700234117999e-05, + "loss": 2.0541, + "step": 2067 + }, + { + "epoch": 0.09038856593382578, + "grad_norm": 4.03125, + "learning_rate": 9.801508668646118e-05, + "loss": 1.8612, + "step": 2068 + }, + { + "epoch": 0.09043227413785568, + "grad_norm": 2.921875, + "learning_rate": 9.801317012562779e-05, + "loss": 2.3819, + "step": 2069 + }, + { + "epoch": 0.09047598234188557, + "grad_norm": 2.546875, + "learning_rate": 9.8011252658716e-05, + "loss": 2.1188, + "step": 2070 + }, + { + "epoch": 0.09051969054591547, + "grad_norm": 2.359375, + "learning_rate": 9.8009334285762e-05, + "loss": 2.0454, + "step": 2071 + }, + { + "epoch": 0.09056339874994536, + "grad_norm": 2.65625, + "learning_rate": 9.800741500680197e-05, + "loss": 2.1398, + "step": 2072 + }, + { + "epoch": 0.09060710695397527, + "grad_norm": 4.8125, + "learning_rate": 9.800549482187216e-05, + "loss": 1.7601, + "step": 2073 + }, + { + "epoch": 0.09065081515800516, + "grad_norm": 6.40625, + "learning_rate": 9.800357373100882e-05, + "loss": 2.1453, + "step": 2074 + }, + { + "epoch": 0.09069452336203505, + "grad_norm": 2.8125, + "learning_rate": 9.800165173424814e-05, + "loss": 2.3561, + "step": 2075 + }, + { + "epoch": 0.09073823156606495, + "grad_norm": 2.8125, + "learning_rate": 9.799972883162645e-05, + "loss": 1.8398, + "step": 2076 + }, + { + "epoch": 0.09078193977009484, + "grad_norm": 2.765625, + "learning_rate": 9.799780502318e-05, + "loss": 2.2264, + "step": 2077 + }, + { + "epoch": 0.09082564797412475, + "grad_norm": 2.703125, + "learning_rate": 9.799588030894512e-05, + "loss": 2.0077, + "step": 2078 + }, + { + "epoch": 0.09086935617815464, + "grad_norm": 2.53125, + "learning_rate": 9.799395468895812e-05, + "loss": 2.0867, + "step": 2079 + }, + { + "epoch": 0.09091306438218454, + "grad_norm": 2.578125, + "learning_rate": 9.799202816325534e-05, + "loss": 2.1293, + "step": 2080 + }, + { + "epoch": 0.09095677258621443, + "grad_norm": 2.40625, + "learning_rate": 9.799010073187316e-05, + "loss": 2.2106, + "step": 2081 + }, + { + "epoch": 0.09100048079024432, + "grad_norm": 2.453125, + "learning_rate": 9.798817239484792e-05, + "loss": 1.7821, + "step": 2082 + }, + { + "epoch": 0.09104418899427423, + "grad_norm": 3.65625, + "learning_rate": 9.798624315221603e-05, + "loss": 2.1652, + "step": 2083 + }, + { + "epoch": 0.09108789719830412, + "grad_norm": 2.3125, + "learning_rate": 9.798431300401388e-05, + "loss": 1.9395, + "step": 2084 + }, + { + "epoch": 0.09113160540233402, + "grad_norm": 2.84375, + "learning_rate": 9.79823819502779e-05, + "loss": 2.8928, + "step": 2085 + }, + { + "epoch": 0.09117531360636391, + "grad_norm": 2.375, + "learning_rate": 9.798044999104456e-05, + "loss": 2.069, + "step": 2086 + }, + { + "epoch": 0.0912190218103938, + "grad_norm": 3.15625, + "learning_rate": 9.797851712635028e-05, + "loss": 2.0174, + "step": 2087 + }, + { + "epoch": 0.09126273001442371, + "grad_norm": 2.34375, + "learning_rate": 9.797658335623155e-05, + "loss": 2.1882, + "step": 2088 + }, + { + "epoch": 0.0913064382184536, + "grad_norm": 2.75, + "learning_rate": 9.797464868072488e-05, + "loss": 2.2737, + "step": 2089 + }, + { + "epoch": 0.0913501464224835, + "grad_norm": 2.40625, + "learning_rate": 9.797271309986676e-05, + "loss": 2.0482, + "step": 2090 + }, + { + "epoch": 0.0913938546265134, + "grad_norm": 3.515625, + "learning_rate": 9.797077661369372e-05, + "loss": 2.1115, + "step": 2091 + }, + { + "epoch": 0.09143756283054329, + "grad_norm": 2.9375, + "learning_rate": 9.79688392222423e-05, + "loss": 2.2927, + "step": 2092 + }, + { + "epoch": 0.0914812710345732, + "grad_norm": 2.546875, + "learning_rate": 9.796690092554908e-05, + "loss": 1.8428, + "step": 2093 + }, + { + "epoch": 0.09152497923860309, + "grad_norm": 2.421875, + "learning_rate": 9.796496172365061e-05, + "loss": 1.911, + "step": 2094 + }, + { + "epoch": 0.09156868744263298, + "grad_norm": 3.75, + "learning_rate": 9.796302161658353e-05, + "loss": 2.5428, + "step": 2095 + }, + { + "epoch": 0.09161239564666288, + "grad_norm": 2.875, + "learning_rate": 9.79610806043844e-05, + "loss": 1.9935, + "step": 2096 + }, + { + "epoch": 0.09165610385069277, + "grad_norm": 2.46875, + "learning_rate": 9.795913868708989e-05, + "loss": 2.322, + "step": 2097 + }, + { + "epoch": 0.09169981205472268, + "grad_norm": 3.9375, + "learning_rate": 9.795719586473663e-05, + "loss": 2.0394, + "step": 2098 + }, + { + "epoch": 0.09174352025875257, + "grad_norm": 5.03125, + "learning_rate": 9.795525213736128e-05, + "loss": 2.4226, + "step": 2099 + }, + { + "epoch": 0.09178722846278246, + "grad_norm": 12.25, + "learning_rate": 9.795330750500054e-05, + "loss": 1.4187, + "step": 2100 + }, + { + "epoch": 0.09183093666681236, + "grad_norm": 2.5, + "learning_rate": 9.795136196769107e-05, + "loss": 1.8698, + "step": 2101 + }, + { + "epoch": 0.09187464487084225, + "grad_norm": 3.0, + "learning_rate": 9.794941552546963e-05, + "loss": 2.5359, + "step": 2102 + }, + { + "epoch": 0.09191835307487216, + "grad_norm": 3.171875, + "learning_rate": 9.794746817837293e-05, + "loss": 2.3038, + "step": 2103 + }, + { + "epoch": 0.09196206127890205, + "grad_norm": 2.796875, + "learning_rate": 9.794551992643772e-05, + "loss": 2.0825, + "step": 2104 + }, + { + "epoch": 0.09200576948293195, + "grad_norm": 3.546875, + "learning_rate": 9.794357076970076e-05, + "loss": 2.0303, + "step": 2105 + }, + { + "epoch": 0.09204947768696184, + "grad_norm": 2.328125, + "learning_rate": 9.794162070819885e-05, + "loss": 1.7951, + "step": 2106 + }, + { + "epoch": 0.09209318589099173, + "grad_norm": 2.3125, + "learning_rate": 9.793966974196878e-05, + "loss": 2.0509, + "step": 2107 + }, + { + "epoch": 0.09213689409502164, + "grad_norm": 2.65625, + "learning_rate": 9.793771787104735e-05, + "loss": 2.2638, + "step": 2108 + }, + { + "epoch": 0.09218060229905153, + "grad_norm": 2.875, + "learning_rate": 9.793576509547144e-05, + "loss": 2.2946, + "step": 2109 + }, + { + "epoch": 0.09222431050308143, + "grad_norm": 3.171875, + "learning_rate": 9.793381141527786e-05, + "loss": 2.5537, + "step": 2110 + }, + { + "epoch": 0.09226801870711132, + "grad_norm": 2.515625, + "learning_rate": 9.79318568305035e-05, + "loss": 2.0841, + "step": 2111 + }, + { + "epoch": 0.09231172691114121, + "grad_norm": 2.578125, + "learning_rate": 9.792990134118525e-05, + "loss": 2.1249, + "step": 2112 + }, + { + "epoch": 0.09235543511517112, + "grad_norm": 4.125, + "learning_rate": 9.792794494736e-05, + "loss": 2.4162, + "step": 2113 + }, + { + "epoch": 0.09239914331920102, + "grad_norm": 2.625, + "learning_rate": 9.792598764906466e-05, + "loss": 2.1053, + "step": 2114 + }, + { + "epoch": 0.09244285152323091, + "grad_norm": 2.71875, + "learning_rate": 9.792402944633619e-05, + "loss": 2.5305, + "step": 2115 + }, + { + "epoch": 0.0924865597272608, + "grad_norm": 2.390625, + "learning_rate": 9.792207033921152e-05, + "loss": 2.1751, + "step": 2116 + }, + { + "epoch": 0.0925302679312907, + "grad_norm": 2.484375, + "learning_rate": 9.792011032772765e-05, + "loss": 1.9627, + "step": 2117 + }, + { + "epoch": 0.0925739761353206, + "grad_norm": 2.75, + "learning_rate": 9.791814941192155e-05, + "loss": 2.2857, + "step": 2118 + }, + { + "epoch": 0.0926176843393505, + "grad_norm": 2.703125, + "learning_rate": 9.791618759183023e-05, + "loss": 2.2022, + "step": 2119 + }, + { + "epoch": 0.09266139254338039, + "grad_norm": 3.9375, + "learning_rate": 9.791422486749071e-05, + "loss": 2.8579, + "step": 2120 + }, + { + "epoch": 0.09270510074741029, + "grad_norm": 3.34375, + "learning_rate": 9.791226123894003e-05, + "loss": 2.7253, + "step": 2121 + }, + { + "epoch": 0.09274880895144018, + "grad_norm": 3.0, + "learning_rate": 9.791029670621525e-05, + "loss": 1.9731, + "step": 2122 + }, + { + "epoch": 0.09279251715547009, + "grad_norm": 2.609375, + "learning_rate": 9.790833126935343e-05, + "loss": 1.9945, + "step": 2123 + }, + { + "epoch": 0.09283622535949998, + "grad_norm": 4.84375, + "learning_rate": 9.79063649283917e-05, + "loss": 1.9241, + "step": 2124 + }, + { + "epoch": 0.09287993356352987, + "grad_norm": 3.015625, + "learning_rate": 9.79043976833671e-05, + "loss": 2.9551, + "step": 2125 + }, + { + "epoch": 0.09292364176755977, + "grad_norm": 2.859375, + "learning_rate": 9.790242953431682e-05, + "loss": 1.9754, + "step": 2126 + }, + { + "epoch": 0.09296734997158966, + "grad_norm": 2.953125, + "learning_rate": 9.790046048127797e-05, + "loss": 1.9089, + "step": 2127 + }, + { + "epoch": 0.09301105817561957, + "grad_norm": 3.078125, + "learning_rate": 9.789849052428772e-05, + "loss": 2.0825, + "step": 2128 + }, + { + "epoch": 0.09305476637964946, + "grad_norm": 2.390625, + "learning_rate": 9.789651966338323e-05, + "loss": 2.5024, + "step": 2129 + }, + { + "epoch": 0.09309847458367936, + "grad_norm": 2.84375, + "learning_rate": 9.789454789860171e-05, + "loss": 1.8926, + "step": 2130 + }, + { + "epoch": 0.09314218278770925, + "grad_norm": 2.90625, + "learning_rate": 9.789257522998037e-05, + "loss": 2.1313, + "step": 2131 + }, + { + "epoch": 0.09318589099173914, + "grad_norm": 2.625, + "learning_rate": 9.789060165755643e-05, + "loss": 2.2453, + "step": 2132 + }, + { + "epoch": 0.09322959919576905, + "grad_norm": 2.6875, + "learning_rate": 9.788862718136713e-05, + "loss": 2.0841, + "step": 2133 + }, + { + "epoch": 0.09327330739979894, + "grad_norm": 3.71875, + "learning_rate": 9.788665180144973e-05, + "loss": 2.1431, + "step": 2134 + }, + { + "epoch": 0.09331701560382884, + "grad_norm": 2.265625, + "learning_rate": 9.788467551784153e-05, + "loss": 2.3447, + "step": 2135 + }, + { + "epoch": 0.09336072380785873, + "grad_norm": 2.28125, + "learning_rate": 9.788269833057979e-05, + "loss": 2.0081, + "step": 2136 + }, + { + "epoch": 0.09340443201188864, + "grad_norm": 2.765625, + "learning_rate": 9.788072023970187e-05, + "loss": 2.434, + "step": 2137 + }, + { + "epoch": 0.09344814021591853, + "grad_norm": 2.90625, + "learning_rate": 9.787874124524505e-05, + "loss": 1.61, + "step": 2138 + }, + { + "epoch": 0.09349184841994843, + "grad_norm": 2.25, + "learning_rate": 9.78767613472467e-05, + "loss": 1.7895, + "step": 2139 + }, + { + "epoch": 0.09353555662397832, + "grad_norm": 2.984375, + "learning_rate": 9.787478054574419e-05, + "loss": 1.8193, + "step": 2140 + }, + { + "epoch": 0.09357926482800821, + "grad_norm": 3.484375, + "learning_rate": 9.787279884077489e-05, + "loss": 2.2462, + "step": 2141 + }, + { + "epoch": 0.09362297303203812, + "grad_norm": 2.359375, + "learning_rate": 9.78708162323762e-05, + "loss": 1.8915, + "step": 2142 + }, + { + "epoch": 0.09366668123606801, + "grad_norm": 2.375, + "learning_rate": 9.786883272058554e-05, + "loss": 2.1082, + "step": 2143 + }, + { + "epoch": 0.09371038944009791, + "grad_norm": 2.40625, + "learning_rate": 9.786684830544033e-05, + "loss": 1.9747, + "step": 2144 + }, + { + "epoch": 0.0937540976441278, + "grad_norm": 2.21875, + "learning_rate": 9.786486298697803e-05, + "loss": 1.9774, + "step": 2145 + }, + { + "epoch": 0.0937978058481577, + "grad_norm": 3.484375, + "learning_rate": 9.786287676523609e-05, + "loss": 2.7382, + "step": 2146 + }, + { + "epoch": 0.0938415140521876, + "grad_norm": 2.515625, + "learning_rate": 9.786088964025201e-05, + "loss": 1.9554, + "step": 2147 + }, + { + "epoch": 0.0938852222562175, + "grad_norm": 2.875, + "learning_rate": 9.78589016120633e-05, + "loss": 1.9873, + "step": 2148 + }, + { + "epoch": 0.09392893046024739, + "grad_norm": 3.359375, + "learning_rate": 9.785691268070745e-05, + "loss": 2.1718, + "step": 2149 + }, + { + "epoch": 0.09397263866427728, + "grad_norm": 2.453125, + "learning_rate": 9.7854922846222e-05, + "loss": 2.5165, + "step": 2150 + }, + { + "epoch": 0.09401634686830718, + "grad_norm": 2.3125, + "learning_rate": 9.785293210864451e-05, + "loss": 1.8265, + "step": 2151 + }, + { + "epoch": 0.09406005507233708, + "grad_norm": 2.359375, + "learning_rate": 9.785094046801256e-05, + "loss": 2.2153, + "step": 2152 + }, + { + "epoch": 0.09410376327636698, + "grad_norm": 2.296875, + "learning_rate": 9.78489479243637e-05, + "loss": 1.8579, + "step": 2153 + }, + { + "epoch": 0.09414747148039687, + "grad_norm": 2.875, + "learning_rate": 9.784695447773555e-05, + "loss": 3.1056, + "step": 2154 + }, + { + "epoch": 0.09419117968442677, + "grad_norm": 3.59375, + "learning_rate": 9.784496012816573e-05, + "loss": 1.8621, + "step": 2155 + }, + { + "epoch": 0.09423488788845666, + "grad_norm": 2.359375, + "learning_rate": 9.78429648756919e-05, + "loss": 1.3941, + "step": 2156 + }, + { + "epoch": 0.09427859609248657, + "grad_norm": 3.34375, + "learning_rate": 9.784096872035167e-05, + "loss": 2.677, + "step": 2157 + }, + { + "epoch": 0.09432230429651646, + "grad_norm": 2.71875, + "learning_rate": 9.783897166218273e-05, + "loss": 2.3034, + "step": 2158 + }, + { + "epoch": 0.09436601250054635, + "grad_norm": 3.5625, + "learning_rate": 9.783697370122278e-05, + "loss": 2.7365, + "step": 2159 + }, + { + "epoch": 0.09440972070457625, + "grad_norm": 2.609375, + "learning_rate": 9.78349748375095e-05, + "loss": 2.675, + "step": 2160 + }, + { + "epoch": 0.09445342890860614, + "grad_norm": 2.453125, + "learning_rate": 9.783297507108065e-05, + "loss": 2.2508, + "step": 2161 + }, + { + "epoch": 0.09449713711263605, + "grad_norm": 2.4375, + "learning_rate": 9.78309744019739e-05, + "loss": 2.5425, + "step": 2162 + }, + { + "epoch": 0.09454084531666594, + "grad_norm": 2.890625, + "learning_rate": 9.78289728302271e-05, + "loss": 1.9132, + "step": 2163 + }, + { + "epoch": 0.09458455352069584, + "grad_norm": 2.53125, + "learning_rate": 9.782697035587793e-05, + "loss": 1.9358, + "step": 2164 + }, + { + "epoch": 0.09462826172472573, + "grad_norm": 3.40625, + "learning_rate": 9.782496697896424e-05, + "loss": 2.3742, + "step": 2165 + }, + { + "epoch": 0.09467196992875562, + "grad_norm": 2.46875, + "learning_rate": 9.78229626995238e-05, + "loss": 2.5638, + "step": 2166 + }, + { + "epoch": 0.09471567813278553, + "grad_norm": 2.328125, + "learning_rate": 9.782095751759448e-05, + "loss": 2.0113, + "step": 2167 + }, + { + "epoch": 0.09475938633681542, + "grad_norm": 2.640625, + "learning_rate": 9.781895143321406e-05, + "loss": 2.1018, + "step": 2168 + }, + { + "epoch": 0.09480309454084532, + "grad_norm": 2.203125, + "learning_rate": 9.781694444642044e-05, + "loss": 1.7566, + "step": 2169 + }, + { + "epoch": 0.09484680274487521, + "grad_norm": 3.15625, + "learning_rate": 9.781493655725149e-05, + "loss": 2.4728, + "step": 2170 + }, + { + "epoch": 0.0948905109489051, + "grad_norm": 3.84375, + "learning_rate": 9.781292776574509e-05, + "loss": 1.4731, + "step": 2171 + }, + { + "epoch": 0.09493421915293501, + "grad_norm": 2.3125, + "learning_rate": 9.781091807193916e-05, + "loss": 2.251, + "step": 2172 + }, + { + "epoch": 0.0949779273569649, + "grad_norm": 2.4375, + "learning_rate": 9.780890747587164e-05, + "loss": 1.8029, + "step": 2173 + }, + { + "epoch": 0.0950216355609948, + "grad_norm": 3.1875, + "learning_rate": 9.780689597758041e-05, + "loss": 1.749, + "step": 2174 + }, + { + "epoch": 0.09506534376502469, + "grad_norm": 3.171875, + "learning_rate": 9.78048835771035e-05, + "loss": 2.6273, + "step": 2175 + }, + { + "epoch": 0.09510905196905459, + "grad_norm": 3.1875, + "learning_rate": 9.780287027447886e-05, + "loss": 2.1231, + "step": 2176 + }, + { + "epoch": 0.0951527601730845, + "grad_norm": 3.390625, + "learning_rate": 9.780085606974448e-05, + "loss": 2.3221, + "step": 2177 + }, + { + "epoch": 0.09519646837711439, + "grad_norm": 2.6875, + "learning_rate": 9.779884096293838e-05, + "loss": 2.2185, + "step": 2178 + }, + { + "epoch": 0.09524017658114428, + "grad_norm": 3.265625, + "learning_rate": 9.779682495409857e-05, + "loss": 2.3901, + "step": 2179 + }, + { + "epoch": 0.09528388478517417, + "grad_norm": 2.40625, + "learning_rate": 9.779480804326313e-05, + "loss": 2.7555, + "step": 2180 + }, + { + "epoch": 0.09532759298920407, + "grad_norm": 2.828125, + "learning_rate": 9.779279023047008e-05, + "loss": 2.0593, + "step": 2181 + }, + { + "epoch": 0.09537130119323398, + "grad_norm": 3.078125, + "learning_rate": 9.779077151575753e-05, + "loss": 2.1978, + "step": 2182 + }, + { + "epoch": 0.09541500939726387, + "grad_norm": 3.9375, + "learning_rate": 9.778875189916356e-05, + "loss": 2.2172, + "step": 2183 + }, + { + "epoch": 0.09545871760129376, + "grad_norm": 2.84375, + "learning_rate": 9.778673138072629e-05, + "loss": 1.9492, + "step": 2184 + }, + { + "epoch": 0.09550242580532366, + "grad_norm": 3.234375, + "learning_rate": 9.778470996048383e-05, + "loss": 2.308, + "step": 2185 + }, + { + "epoch": 0.09554613400935355, + "grad_norm": 2.515625, + "learning_rate": 9.778268763847438e-05, + "loss": 2.4126, + "step": 2186 + }, + { + "epoch": 0.09558984221338346, + "grad_norm": 3.390625, + "learning_rate": 9.778066441473604e-05, + "loss": 1.7932, + "step": 2187 + }, + { + "epoch": 0.09563355041741335, + "grad_norm": 3.109375, + "learning_rate": 9.777864028930705e-05, + "loss": 2.7622, + "step": 2188 + }, + { + "epoch": 0.09567725862144325, + "grad_norm": 2.59375, + "learning_rate": 9.777661526222556e-05, + "loss": 2.0042, + "step": 2189 + }, + { + "epoch": 0.09572096682547314, + "grad_norm": 3.015625, + "learning_rate": 9.777458933352981e-05, + "loss": 2.0112, + "step": 2190 + }, + { + "epoch": 0.09576467502950303, + "grad_norm": 2.578125, + "learning_rate": 9.777256250325801e-05, + "loss": 2.1095, + "step": 2191 + }, + { + "epoch": 0.09580838323353294, + "grad_norm": 2.796875, + "learning_rate": 9.777053477144844e-05, + "loss": 2.1284, + "step": 2192 + }, + { + "epoch": 0.09585209143756283, + "grad_norm": 3.109375, + "learning_rate": 9.776850613813936e-05, + "loss": 2.2209, + "step": 2193 + }, + { + "epoch": 0.09589579964159273, + "grad_norm": 2.921875, + "learning_rate": 9.776647660336903e-05, + "loss": 2.518, + "step": 2194 + }, + { + "epoch": 0.09593950784562262, + "grad_norm": 2.84375, + "learning_rate": 9.776444616717578e-05, + "loss": 2.4591, + "step": 2195 + }, + { + "epoch": 0.09598321604965251, + "grad_norm": 2.1875, + "learning_rate": 9.776241482959791e-05, + "loss": 2.046, + "step": 2196 + }, + { + "epoch": 0.09602692425368242, + "grad_norm": 2.40625, + "learning_rate": 9.776038259067375e-05, + "loss": 2.0352, + "step": 2197 + }, + { + "epoch": 0.09607063245771232, + "grad_norm": 3.1875, + "learning_rate": 9.775834945044167e-05, + "loss": 2.0531, + "step": 2198 + }, + { + "epoch": 0.09611434066174221, + "grad_norm": 3.046875, + "learning_rate": 9.775631540894002e-05, + "loss": 2.5601, + "step": 2199 + }, + { + "epoch": 0.0961580488657721, + "grad_norm": 3.5625, + "learning_rate": 9.775428046620718e-05, + "loss": 2.8106, + "step": 2200 + }, + { + "epoch": 0.096201757069802, + "grad_norm": 2.75, + "learning_rate": 9.775224462228159e-05, + "loss": 2.3545, + "step": 2201 + }, + { + "epoch": 0.0962454652738319, + "grad_norm": 6.71875, + "learning_rate": 9.775020787720162e-05, + "loss": 1.8256, + "step": 2202 + }, + { + "epoch": 0.0962891734778618, + "grad_norm": 2.71875, + "learning_rate": 9.774817023100575e-05, + "loss": 2.4899, + "step": 2203 + }, + { + "epoch": 0.09633288168189169, + "grad_norm": 2.546875, + "learning_rate": 9.77461316837324e-05, + "loss": 2.3327, + "step": 2204 + }, + { + "epoch": 0.09637658988592158, + "grad_norm": 3.296875, + "learning_rate": 9.774409223542006e-05, + "loss": 2.6428, + "step": 2205 + }, + { + "epoch": 0.09642029808995148, + "grad_norm": 2.421875, + "learning_rate": 9.774205188610721e-05, + "loss": 1.8124, + "step": 2206 + }, + { + "epoch": 0.09646400629398139, + "grad_norm": 2.546875, + "learning_rate": 9.774001063583235e-05, + "loss": 2.2997, + "step": 2207 + }, + { + "epoch": 0.09650771449801128, + "grad_norm": 2.4375, + "learning_rate": 9.773796848463402e-05, + "loss": 2.1743, + "step": 2208 + }, + { + "epoch": 0.09655142270204117, + "grad_norm": 2.546875, + "learning_rate": 9.773592543255074e-05, + "loss": 2.0742, + "step": 2209 + }, + { + "epoch": 0.09659513090607107, + "grad_norm": 2.421875, + "learning_rate": 9.773388147962106e-05, + "loss": 1.9118, + "step": 2210 + }, + { + "epoch": 0.09663883911010096, + "grad_norm": 2.34375, + "learning_rate": 9.773183662588356e-05, + "loss": 1.9707, + "step": 2211 + }, + { + "epoch": 0.09668254731413087, + "grad_norm": 2.9375, + "learning_rate": 9.772979087137686e-05, + "loss": 2.2365, + "step": 2212 + }, + { + "epoch": 0.09672625551816076, + "grad_norm": 2.203125, + "learning_rate": 9.772774421613951e-05, + "loss": 1.9471, + "step": 2213 + }, + { + "epoch": 0.09676996372219066, + "grad_norm": 2.453125, + "learning_rate": 9.772569666021018e-05, + "loss": 2.2223, + "step": 2214 + }, + { + "epoch": 0.09681367192622055, + "grad_norm": 2.609375, + "learning_rate": 9.772364820362749e-05, + "loss": 2.2746, + "step": 2215 + }, + { + "epoch": 0.09685738013025044, + "grad_norm": 2.375, + "learning_rate": 9.77215988464301e-05, + "loss": 1.7533, + "step": 2216 + }, + { + "epoch": 0.09690108833428035, + "grad_norm": 3.3125, + "learning_rate": 9.771954858865668e-05, + "loss": 2.0964, + "step": 2217 + }, + { + "epoch": 0.09694479653831024, + "grad_norm": 3.65625, + "learning_rate": 9.771749743034592e-05, + "loss": 2.1936, + "step": 2218 + }, + { + "epoch": 0.09698850474234014, + "grad_norm": 4.34375, + "learning_rate": 9.771544537153653e-05, + "loss": 2.7054, + "step": 2219 + }, + { + "epoch": 0.09703221294637003, + "grad_norm": 2.484375, + "learning_rate": 9.771339241226726e-05, + "loss": 2.2081, + "step": 2220 + }, + { + "epoch": 0.09707592115039992, + "grad_norm": 2.703125, + "learning_rate": 9.771133855257684e-05, + "loss": 2.1181, + "step": 2221 + }, + { + "epoch": 0.09711962935442983, + "grad_norm": 13.4375, + "learning_rate": 9.770928379250399e-05, + "loss": 2.4265, + "step": 2222 + }, + { + "epoch": 0.09716333755845973, + "grad_norm": 10.25, + "learning_rate": 9.770722813208754e-05, + "loss": 6.5293, + "step": 2223 + }, + { + "epoch": 0.09720704576248962, + "grad_norm": 3.453125, + "learning_rate": 9.770517157136625e-05, + "loss": 2.4392, + "step": 2224 + }, + { + "epoch": 0.09725075396651951, + "grad_norm": 3.0625, + "learning_rate": 9.770311411037894e-05, + "loss": 2.3015, + "step": 2225 + }, + { + "epoch": 0.0972944621705494, + "grad_norm": 2.8125, + "learning_rate": 9.770105574916443e-05, + "loss": 2.1963, + "step": 2226 + }, + { + "epoch": 0.09733817037457931, + "grad_norm": 3.421875, + "learning_rate": 9.76989964877616e-05, + "loss": 1.7115, + "step": 2227 + }, + { + "epoch": 0.09738187857860921, + "grad_norm": 4.0, + "learning_rate": 9.769693632620926e-05, + "loss": 1.7496, + "step": 2228 + }, + { + "epoch": 0.0974255867826391, + "grad_norm": 2.65625, + "learning_rate": 9.769487526454631e-05, + "loss": 2.2873, + "step": 2229 + }, + { + "epoch": 0.097469294986669, + "grad_norm": 2.84375, + "learning_rate": 9.769281330281165e-05, + "loss": 2.1804, + "step": 2230 + }, + { + "epoch": 0.09751300319069889, + "grad_norm": 2.484375, + "learning_rate": 9.76907504410442e-05, + "loss": 2.0584, + "step": 2231 + }, + { + "epoch": 0.0975567113947288, + "grad_norm": 3.390625, + "learning_rate": 9.768868667928288e-05, + "loss": 1.7148, + "step": 2232 + }, + { + "epoch": 0.09760041959875869, + "grad_norm": 2.6875, + "learning_rate": 9.768662201756662e-05, + "loss": 1.9899, + "step": 2233 + }, + { + "epoch": 0.09764412780278858, + "grad_norm": 2.78125, + "learning_rate": 9.768455645593441e-05, + "loss": 2.4444, + "step": 2234 + }, + { + "epoch": 0.09768783600681848, + "grad_norm": 2.703125, + "learning_rate": 9.768248999442522e-05, + "loss": 1.8971, + "step": 2235 + }, + { + "epoch": 0.09773154421084837, + "grad_norm": 2.890625, + "learning_rate": 9.768042263307804e-05, + "loss": 2.4553, + "step": 2236 + }, + { + "epoch": 0.09777525241487828, + "grad_norm": 2.65625, + "learning_rate": 9.767835437193187e-05, + "loss": 1.7158, + "step": 2237 + }, + { + "epoch": 0.09781896061890817, + "grad_norm": 2.859375, + "learning_rate": 9.767628521102578e-05, + "loss": 2.3434, + "step": 2238 + }, + { + "epoch": 0.09786266882293806, + "grad_norm": 2.953125, + "learning_rate": 9.767421515039881e-05, + "loss": 1.9139, + "step": 2239 + }, + { + "epoch": 0.09790637702696796, + "grad_norm": 2.328125, + "learning_rate": 9.767214419009e-05, + "loss": 2.0438, + "step": 2240 + }, + { + "epoch": 0.09795008523099785, + "grad_norm": 2.375, + "learning_rate": 9.767007233013845e-05, + "loss": 1.9096, + "step": 2241 + }, + { + "epoch": 0.09799379343502776, + "grad_norm": 2.609375, + "learning_rate": 9.766799957058324e-05, + "loss": 2.0401, + "step": 2242 + }, + { + "epoch": 0.09803750163905765, + "grad_norm": 3.796875, + "learning_rate": 9.766592591146352e-05, + "loss": 3.3079, + "step": 2243 + }, + { + "epoch": 0.09808120984308755, + "grad_norm": 2.84375, + "learning_rate": 9.766385135281839e-05, + "loss": 2.1826, + "step": 2244 + }, + { + "epoch": 0.09812491804711744, + "grad_norm": 2.40625, + "learning_rate": 9.766177589468701e-05, + "loss": 1.8902, + "step": 2245 + }, + { + "epoch": 0.09816862625114733, + "grad_norm": 2.515625, + "learning_rate": 9.765969953710857e-05, + "loss": 1.9289, + "step": 2246 + }, + { + "epoch": 0.09821233445517724, + "grad_norm": 2.484375, + "learning_rate": 9.765762228012222e-05, + "loss": 1.9364, + "step": 2247 + }, + { + "epoch": 0.09825604265920714, + "grad_norm": 18.75, + "learning_rate": 9.765554412376719e-05, + "loss": 6.6763, + "step": 2248 + }, + { + "epoch": 0.09829975086323703, + "grad_norm": 2.828125, + "learning_rate": 9.765346506808266e-05, + "loss": 1.8456, + "step": 2249 + }, + { + "epoch": 0.09834345906726692, + "grad_norm": 2.515625, + "learning_rate": 9.765138511310791e-05, + "loss": 2.7566, + "step": 2250 + }, + { + "epoch": 0.09838716727129682, + "grad_norm": 2.546875, + "learning_rate": 9.764930425888215e-05, + "loss": 1.8674, + "step": 2251 + }, + { + "epoch": 0.09843087547532672, + "grad_norm": 2.8125, + "learning_rate": 9.764722250544469e-05, + "loss": 2.1064, + "step": 2252 + }, + { + "epoch": 0.09847458367935662, + "grad_norm": 2.53125, + "learning_rate": 9.764513985283478e-05, + "loss": 2.2086, + "step": 2253 + }, + { + "epoch": 0.09851829188338651, + "grad_norm": 2.625, + "learning_rate": 9.764305630109175e-05, + "loss": 2.2642, + "step": 2254 + }, + { + "epoch": 0.0985620000874164, + "grad_norm": 3.078125, + "learning_rate": 9.764097185025489e-05, + "loss": 2.1661, + "step": 2255 + }, + { + "epoch": 0.0986057082914463, + "grad_norm": 2.375, + "learning_rate": 9.763888650036354e-05, + "loss": 1.985, + "step": 2256 + }, + { + "epoch": 0.0986494164954762, + "grad_norm": 3.15625, + "learning_rate": 9.76368002514571e-05, + "loss": 2.8063, + "step": 2257 + }, + { + "epoch": 0.0986931246995061, + "grad_norm": 3.0625, + "learning_rate": 9.763471310357488e-05, + "loss": 1.8268, + "step": 2258 + }, + { + "epoch": 0.09873683290353599, + "grad_norm": 3.265625, + "learning_rate": 9.763262505675632e-05, + "loss": 2.2688, + "step": 2259 + }, + { + "epoch": 0.09878054110756589, + "grad_norm": 6.71875, + "learning_rate": 9.763053611104079e-05, + "loss": 2.1481, + "step": 2260 + }, + { + "epoch": 0.09882424931159578, + "grad_norm": 3.3125, + "learning_rate": 9.76284462664677e-05, + "loss": 1.79, + "step": 2261 + }, + { + "epoch": 0.09886795751562569, + "grad_norm": 3.265625, + "learning_rate": 9.762635552307653e-05, + "loss": 2.7382, + "step": 2262 + }, + { + "epoch": 0.09891166571965558, + "grad_norm": 2.5, + "learning_rate": 9.762426388090671e-05, + "loss": 2.0873, + "step": 2263 + }, + { + "epoch": 0.09895537392368547, + "grad_norm": 3.65625, + "learning_rate": 9.762217133999771e-05, + "loss": 2.393, + "step": 2264 + }, + { + "epoch": 0.09899908212771537, + "grad_norm": 2.9375, + "learning_rate": 9.762007790038904e-05, + "loss": 1.5162, + "step": 2265 + }, + { + "epoch": 0.09904279033174526, + "grad_norm": 3.875, + "learning_rate": 9.761798356212019e-05, + "loss": 3.1219, + "step": 2266 + }, + { + "epoch": 0.09908649853577517, + "grad_norm": 2.46875, + "learning_rate": 9.761588832523067e-05, + "loss": 2.3465, + "step": 2267 + }, + { + "epoch": 0.09913020673980506, + "grad_norm": 2.5625, + "learning_rate": 9.761379218976005e-05, + "loss": 1.912, + "step": 2268 + }, + { + "epoch": 0.09917391494383496, + "grad_norm": 2.484375, + "learning_rate": 9.761169515574786e-05, + "loss": 1.8608, + "step": 2269 + }, + { + "epoch": 0.09921762314786485, + "grad_norm": 2.515625, + "learning_rate": 9.760959722323371e-05, + "loss": 1.9141, + "step": 2270 + }, + { + "epoch": 0.09926133135189474, + "grad_norm": 3.421875, + "learning_rate": 9.760749839225714e-05, + "loss": 1.937, + "step": 2271 + }, + { + "epoch": 0.09930503955592465, + "grad_norm": 3.578125, + "learning_rate": 9.760539866285781e-05, + "loss": 2.1976, + "step": 2272 + }, + { + "epoch": 0.09934874775995454, + "grad_norm": 2.234375, + "learning_rate": 9.76032980350753e-05, + "loss": 1.4496, + "step": 2273 + }, + { + "epoch": 0.09939245596398444, + "grad_norm": 3.234375, + "learning_rate": 9.760119650894929e-05, + "loss": 2.3222, + "step": 2274 + }, + { + "epoch": 0.09943616416801433, + "grad_norm": 2.8125, + "learning_rate": 9.759909408451942e-05, + "loss": 2.3326, + "step": 2275 + }, + { + "epoch": 0.09947987237204424, + "grad_norm": 2.4375, + "learning_rate": 9.759699076182536e-05, + "loss": 1.8843, + "step": 2276 + }, + { + "epoch": 0.09952358057607413, + "grad_norm": 6.25, + "learning_rate": 9.759488654090681e-05, + "loss": 2.3823, + "step": 2277 + }, + { + "epoch": 0.09956728878010403, + "grad_norm": 2.671875, + "learning_rate": 9.759278142180348e-05, + "loss": 2.1267, + "step": 2278 + }, + { + "epoch": 0.09961099698413392, + "grad_norm": 2.59375, + "learning_rate": 9.759067540455511e-05, + "loss": 1.9924, + "step": 2279 + }, + { + "epoch": 0.09965470518816381, + "grad_norm": 6.0625, + "learning_rate": 9.758856848920142e-05, + "loss": 2.2601, + "step": 2280 + }, + { + "epoch": 0.09969841339219372, + "grad_norm": 2.390625, + "learning_rate": 9.758646067578216e-05, + "loss": 2.5403, + "step": 2281 + }, + { + "epoch": 0.09974212159622362, + "grad_norm": 3.765625, + "learning_rate": 9.758435196433716e-05, + "loss": 2.2937, + "step": 2282 + }, + { + "epoch": 0.09978582980025351, + "grad_norm": 5.125, + "learning_rate": 9.758224235490618e-05, + "loss": 1.7025, + "step": 2283 + }, + { + "epoch": 0.0998295380042834, + "grad_norm": 2.671875, + "learning_rate": 9.758013184752901e-05, + "loss": 2.2129, + "step": 2284 + }, + { + "epoch": 0.0998732462083133, + "grad_norm": 2.5, + "learning_rate": 9.757802044224553e-05, + "loss": 2.5303, + "step": 2285 + }, + { + "epoch": 0.0999169544123432, + "grad_norm": 2.34375, + "learning_rate": 9.757590813909554e-05, + "loss": 1.7859, + "step": 2286 + }, + { + "epoch": 0.0999606626163731, + "grad_norm": 2.375, + "learning_rate": 9.757379493811892e-05, + "loss": 2.1649, + "step": 2287 + }, + { + "epoch": 0.10000437082040299, + "grad_norm": 2.84375, + "learning_rate": 9.757168083935556e-05, + "loss": 1.764, + "step": 2288 + }, + { + "epoch": 0.10004807902443288, + "grad_norm": 2.296875, + "learning_rate": 9.756956584284533e-05, + "loss": 1.8986, + "step": 2289 + }, + { + "epoch": 0.10009178722846278, + "grad_norm": 2.515625, + "learning_rate": 9.756744994862817e-05, + "loss": 2.2794, + "step": 2290 + }, + { + "epoch": 0.10013549543249269, + "grad_norm": 3.234375, + "learning_rate": 9.7565333156744e-05, + "loss": 1.987, + "step": 2291 + }, + { + "epoch": 0.10017920363652258, + "grad_norm": 2.59375, + "learning_rate": 9.756321546723277e-05, + "loss": 2.2158, + "step": 2292 + }, + { + "epoch": 0.10022291184055247, + "grad_norm": 2.828125, + "learning_rate": 9.756109688013442e-05, + "loss": 1.4978, + "step": 2293 + }, + { + "epoch": 0.10026662004458237, + "grad_norm": 2.671875, + "learning_rate": 9.755897739548896e-05, + "loss": 2.0564, + "step": 2294 + }, + { + "epoch": 0.10031032824861226, + "grad_norm": 2.84375, + "learning_rate": 9.75568570133364e-05, + "loss": 1.9511, + "step": 2295 + }, + { + "epoch": 0.10035403645264217, + "grad_norm": 2.671875, + "learning_rate": 9.75547357337167e-05, + "loss": 2.2555, + "step": 2296 + }, + { + "epoch": 0.10039774465667206, + "grad_norm": 2.375, + "learning_rate": 9.755261355666994e-05, + "loss": 1.7196, + "step": 2297 + }, + { + "epoch": 0.10044145286070195, + "grad_norm": 3.0625, + "learning_rate": 9.755049048223615e-05, + "loss": 1.6631, + "step": 2298 + }, + { + "epoch": 0.10048516106473185, + "grad_norm": 2.625, + "learning_rate": 9.754836651045538e-05, + "loss": 1.9797, + "step": 2299 + }, + { + "epoch": 0.10052886926876174, + "grad_norm": 3.265625, + "learning_rate": 9.754624164136774e-05, + "loss": 2.3664, + "step": 2300 + }, + { + "epoch": 0.10057257747279165, + "grad_norm": 2.09375, + "learning_rate": 9.754411587501333e-05, + "loss": 1.7274, + "step": 2301 + }, + { + "epoch": 0.10061628567682154, + "grad_norm": 2.59375, + "learning_rate": 9.754198921143226e-05, + "loss": 1.8973, + "step": 2302 + }, + { + "epoch": 0.10065999388085144, + "grad_norm": 2.578125, + "learning_rate": 9.753986165066464e-05, + "loss": 2.0499, + "step": 2303 + }, + { + "epoch": 0.10070370208488133, + "grad_norm": 2.578125, + "learning_rate": 9.753773319275065e-05, + "loss": 1.7278, + "step": 2304 + }, + { + "epoch": 0.10074741028891122, + "grad_norm": 2.875, + "learning_rate": 9.753560383773046e-05, + "loss": 2.5053, + "step": 2305 + }, + { + "epoch": 0.10079111849294113, + "grad_norm": 14.0, + "learning_rate": 9.753347358564423e-05, + "loss": 5.9202, + "step": 2306 + }, + { + "epoch": 0.10083482669697102, + "grad_norm": 2.703125, + "learning_rate": 9.753134243653217e-05, + "loss": 1.9736, + "step": 2307 + }, + { + "epoch": 0.10087853490100092, + "grad_norm": 2.8125, + "learning_rate": 9.75292103904345e-05, + "loss": 2.354, + "step": 2308 + }, + { + "epoch": 0.10092224310503081, + "grad_norm": 2.828125, + "learning_rate": 9.752707744739145e-05, + "loss": 3.0874, + "step": 2309 + }, + { + "epoch": 0.1009659513090607, + "grad_norm": 2.796875, + "learning_rate": 9.752494360744329e-05, + "loss": 2.251, + "step": 2310 + }, + { + "epoch": 0.10100965951309061, + "grad_norm": 3.078125, + "learning_rate": 9.752280887063026e-05, + "loss": 2.0361, + "step": 2311 + }, + { + "epoch": 0.1010533677171205, + "grad_norm": 2.734375, + "learning_rate": 9.752067323699267e-05, + "loss": 2.2617, + "step": 2312 + }, + { + "epoch": 0.1010970759211504, + "grad_norm": 3.140625, + "learning_rate": 9.751853670657081e-05, + "loss": 1.9537, + "step": 2313 + }, + { + "epoch": 0.1011407841251803, + "grad_norm": 2.875, + "learning_rate": 9.7516399279405e-05, + "loss": 2.3131, + "step": 2314 + }, + { + "epoch": 0.10118449232921019, + "grad_norm": 2.9375, + "learning_rate": 9.751426095553557e-05, + "loss": 3.22, + "step": 2315 + }, + { + "epoch": 0.1012282005332401, + "grad_norm": 2.5625, + "learning_rate": 9.751212173500291e-05, + "loss": 1.9288, + "step": 2316 + }, + { + "epoch": 0.10127190873726999, + "grad_norm": 2.625, + "learning_rate": 9.750998161784734e-05, + "loss": 2.2181, + "step": 2317 + }, + { + "epoch": 0.10131561694129988, + "grad_norm": 3.578125, + "learning_rate": 9.750784060410927e-05, + "loss": 2.2256, + "step": 2318 + }, + { + "epoch": 0.10135932514532978, + "grad_norm": 3.328125, + "learning_rate": 9.750569869382911e-05, + "loss": 2.3126, + "step": 2319 + }, + { + "epoch": 0.10140303334935967, + "grad_norm": 2.46875, + "learning_rate": 9.750355588704727e-05, + "loss": 2.6601, + "step": 2320 + }, + { + "epoch": 0.10144674155338958, + "grad_norm": 3.375, + "learning_rate": 9.750141218380419e-05, + "loss": 2.0413, + "step": 2321 + }, + { + "epoch": 0.10149044975741947, + "grad_norm": 2.6875, + "learning_rate": 9.749926758414035e-05, + "loss": 2.0241, + "step": 2322 + }, + { + "epoch": 0.10153415796144936, + "grad_norm": 2.5625, + "learning_rate": 9.749712208809618e-05, + "loss": 2.1908, + "step": 2323 + }, + { + "epoch": 0.10157786616547926, + "grad_norm": 3.25, + "learning_rate": 9.749497569571217e-05, + "loss": 1.5666, + "step": 2324 + }, + { + "epoch": 0.10162157436950915, + "grad_norm": 2.359375, + "learning_rate": 9.749282840702887e-05, + "loss": 2.0106, + "step": 2325 + }, + { + "epoch": 0.10166528257353906, + "grad_norm": 5.21875, + "learning_rate": 9.749068022208676e-05, + "loss": 2.8774, + "step": 2326 + }, + { + "epoch": 0.10170899077756895, + "grad_norm": 2.59375, + "learning_rate": 9.748853114092639e-05, + "loss": 1.7896, + "step": 2327 + }, + { + "epoch": 0.10175269898159885, + "grad_norm": 2.84375, + "learning_rate": 9.748638116358834e-05, + "loss": 2.3381, + "step": 2328 + }, + { + "epoch": 0.10179640718562874, + "grad_norm": 2.53125, + "learning_rate": 9.748423029011317e-05, + "loss": 1.9284, + "step": 2329 + }, + { + "epoch": 0.10184011538965863, + "grad_norm": 2.8125, + "learning_rate": 9.748207852054144e-05, + "loss": 1.5755, + "step": 2330 + }, + { + "epoch": 0.10188382359368854, + "grad_norm": 2.703125, + "learning_rate": 9.747992585491379e-05, + "loss": 2.5287, + "step": 2331 + }, + { + "epoch": 0.10192753179771843, + "grad_norm": 2.640625, + "learning_rate": 9.747777229327084e-05, + "loss": 2.5582, + "step": 2332 + }, + { + "epoch": 0.10197124000174833, + "grad_norm": 2.296875, + "learning_rate": 9.747561783565323e-05, + "loss": 1.9647, + "step": 2333 + }, + { + "epoch": 0.10201494820577822, + "grad_norm": 3.046875, + "learning_rate": 9.747346248210161e-05, + "loss": 2.9669, + "step": 2334 + }, + { + "epoch": 0.10205865640980812, + "grad_norm": 3.265625, + "learning_rate": 9.747130623265665e-05, + "loss": 2.9084, + "step": 2335 + }, + { + "epoch": 0.10210236461383802, + "grad_norm": 3.375, + "learning_rate": 9.746914908735906e-05, + "loss": 1.8825, + "step": 2336 + }, + { + "epoch": 0.10214607281786792, + "grad_norm": 2.671875, + "learning_rate": 9.746699104624953e-05, + "loss": 2.0626, + "step": 2337 + }, + { + "epoch": 0.10218978102189781, + "grad_norm": 3.359375, + "learning_rate": 9.746483210936881e-05, + "loss": 2.3356, + "step": 2338 + }, + { + "epoch": 0.1022334892259277, + "grad_norm": 2.765625, + "learning_rate": 9.74626722767576e-05, + "loss": 2.2161, + "step": 2339 + }, + { + "epoch": 0.1022771974299576, + "grad_norm": 2.671875, + "learning_rate": 9.74605115484567e-05, + "loss": 2.2453, + "step": 2340 + }, + { + "epoch": 0.1023209056339875, + "grad_norm": 2.5, + "learning_rate": 9.745834992450689e-05, + "loss": 1.6678, + "step": 2341 + }, + { + "epoch": 0.1023646138380174, + "grad_norm": 2.453125, + "learning_rate": 9.745618740494892e-05, + "loss": 1.775, + "step": 2342 + }, + { + "epoch": 0.10240832204204729, + "grad_norm": 2.390625, + "learning_rate": 9.745402398982363e-05, + "loss": 1.9103, + "step": 2343 + }, + { + "epoch": 0.10245203024607719, + "grad_norm": 2.34375, + "learning_rate": 9.745185967917184e-05, + "loss": 1.9037, + "step": 2344 + }, + { + "epoch": 0.10249573845010708, + "grad_norm": 2.34375, + "learning_rate": 9.74496944730344e-05, + "loss": 2.4936, + "step": 2345 + }, + { + "epoch": 0.10253944665413699, + "grad_norm": 2.6875, + "learning_rate": 9.744752837145217e-05, + "loss": 1.9646, + "step": 2346 + }, + { + "epoch": 0.10258315485816688, + "grad_norm": 2.578125, + "learning_rate": 9.744536137446601e-05, + "loss": 1.7475, + "step": 2347 + }, + { + "epoch": 0.10262686306219677, + "grad_norm": 2.6875, + "learning_rate": 9.744319348211684e-05, + "loss": 2.0058, + "step": 2348 + }, + { + "epoch": 0.10267057126622667, + "grad_norm": 2.515625, + "learning_rate": 9.744102469444555e-05, + "loss": 2.3312, + "step": 2349 + }, + { + "epoch": 0.10271427947025656, + "grad_norm": 2.8125, + "learning_rate": 9.743885501149308e-05, + "loss": 2.1554, + "step": 2350 + }, + { + "epoch": 0.10275798767428647, + "grad_norm": 2.890625, + "learning_rate": 9.743668443330037e-05, + "loss": 2.3322, + "step": 2351 + }, + { + "epoch": 0.10280169587831636, + "grad_norm": 2.375, + "learning_rate": 9.743451295990837e-05, + "loss": 1.9326, + "step": 2352 + }, + { + "epoch": 0.10284540408234626, + "grad_norm": 2.265625, + "learning_rate": 9.743234059135811e-05, + "loss": 2.1341, + "step": 2353 + }, + { + "epoch": 0.10288911228637615, + "grad_norm": 2.421875, + "learning_rate": 9.743016732769053e-05, + "loss": 2.1587, + "step": 2354 + }, + { + "epoch": 0.10293282049040604, + "grad_norm": 2.9375, + "learning_rate": 9.742799316894663e-05, + "loss": 2.2016, + "step": 2355 + }, + { + "epoch": 0.10297652869443595, + "grad_norm": 2.59375, + "learning_rate": 9.742581811516751e-05, + "loss": 1.9144, + "step": 2356 + }, + { + "epoch": 0.10302023689846584, + "grad_norm": 2.984375, + "learning_rate": 9.742364216639416e-05, + "loss": 2.4872, + "step": 2357 + }, + { + "epoch": 0.10306394510249574, + "grad_norm": 2.28125, + "learning_rate": 9.742146532266767e-05, + "loss": 1.7991, + "step": 2358 + }, + { + "epoch": 0.10310765330652563, + "grad_norm": 2.609375, + "learning_rate": 9.741928758402912e-05, + "loss": 2.3372, + "step": 2359 + }, + { + "epoch": 0.10315136151055553, + "grad_norm": 2.578125, + "learning_rate": 9.741710895051958e-05, + "loss": 1.8571, + "step": 2360 + }, + { + "epoch": 0.10319506971458543, + "grad_norm": 2.953125, + "learning_rate": 9.741492942218018e-05, + "loss": 1.9904, + "step": 2361 + }, + { + "epoch": 0.10323877791861533, + "grad_norm": 2.765625, + "learning_rate": 9.741274899905207e-05, + "loss": 2.087, + "step": 2362 + }, + { + "epoch": 0.10328248612264522, + "grad_norm": 2.546875, + "learning_rate": 9.741056768117636e-05, + "loss": 2.236, + "step": 2363 + }, + { + "epoch": 0.10332619432667511, + "grad_norm": 2.90625, + "learning_rate": 9.740838546859426e-05, + "loss": 1.9287, + "step": 2364 + }, + { + "epoch": 0.10336990253070501, + "grad_norm": 2.453125, + "learning_rate": 9.74062023613469e-05, + "loss": 2.3908, + "step": 2365 + }, + { + "epoch": 0.10341361073473491, + "grad_norm": 2.46875, + "learning_rate": 9.740401835947551e-05, + "loss": 1.9419, + "step": 2366 + }, + { + "epoch": 0.10345731893876481, + "grad_norm": 2.578125, + "learning_rate": 9.740183346302131e-05, + "loss": 2.2898, + "step": 2367 + }, + { + "epoch": 0.1035010271427947, + "grad_norm": 2.375, + "learning_rate": 9.739964767202552e-05, + "loss": 2.1998, + "step": 2368 + }, + { + "epoch": 0.1035447353468246, + "grad_norm": 2.390625, + "learning_rate": 9.739746098652939e-05, + "loss": 2.2489, + "step": 2369 + }, + { + "epoch": 0.10358844355085449, + "grad_norm": 3.015625, + "learning_rate": 9.73952734065742e-05, + "loss": 1.9166, + "step": 2370 + }, + { + "epoch": 0.1036321517548844, + "grad_norm": 2.1875, + "learning_rate": 9.73930849322012e-05, + "loss": 1.8404, + "step": 2371 + }, + { + "epoch": 0.10367585995891429, + "grad_norm": 2.4375, + "learning_rate": 9.739089556345171e-05, + "loss": 2.4283, + "step": 2372 + }, + { + "epoch": 0.10371956816294418, + "grad_norm": 2.671875, + "learning_rate": 9.738870530036706e-05, + "loss": 1.9826, + "step": 2373 + }, + { + "epoch": 0.10376327636697408, + "grad_norm": 2.765625, + "learning_rate": 9.738651414298857e-05, + "loss": 2.3027, + "step": 2374 + }, + { + "epoch": 0.10380698457100397, + "grad_norm": 2.625, + "learning_rate": 9.738432209135757e-05, + "loss": 1.8125, + "step": 2375 + }, + { + "epoch": 0.10385069277503388, + "grad_norm": 2.234375, + "learning_rate": 9.738212914551547e-05, + "loss": 1.6889, + "step": 2376 + }, + { + "epoch": 0.10389440097906377, + "grad_norm": 2.84375, + "learning_rate": 9.737993530550362e-05, + "loss": 2.5132, + "step": 2377 + }, + { + "epoch": 0.10393810918309367, + "grad_norm": 4.3125, + "learning_rate": 9.737774057136344e-05, + "loss": 2.1089, + "step": 2378 + }, + { + "epoch": 0.10398181738712356, + "grad_norm": 5.0, + "learning_rate": 9.737554494313635e-05, + "loss": 2.1395, + "step": 2379 + }, + { + "epoch": 0.10402552559115345, + "grad_norm": 2.78125, + "learning_rate": 9.737334842086374e-05, + "loss": 2.4614, + "step": 2380 + }, + { + "epoch": 0.10406923379518336, + "grad_norm": 7.59375, + "learning_rate": 9.737115100458713e-05, + "loss": 1.8017, + "step": 2381 + }, + { + "epoch": 0.10411294199921325, + "grad_norm": 2.65625, + "learning_rate": 9.736895269434794e-05, + "loss": 2.0097, + "step": 2382 + }, + { + "epoch": 0.10415665020324315, + "grad_norm": 2.546875, + "learning_rate": 9.736675349018767e-05, + "loss": 1.8963, + "step": 2383 + }, + { + "epoch": 0.10420035840727304, + "grad_norm": 2.9375, + "learning_rate": 9.736455339214783e-05, + "loss": 2.386, + "step": 2384 + }, + { + "epoch": 0.10424406661130294, + "grad_norm": 2.203125, + "learning_rate": 9.736235240026993e-05, + "loss": 2.0695, + "step": 2385 + }, + { + "epoch": 0.10428777481533284, + "grad_norm": 3.40625, + "learning_rate": 9.73601505145955e-05, + "loss": 2.3432, + "step": 2386 + }, + { + "epoch": 0.10433148301936274, + "grad_norm": 3.25, + "learning_rate": 9.735794773516611e-05, + "loss": 2.3339, + "step": 2387 + }, + { + "epoch": 0.10437519122339263, + "grad_norm": 3.421875, + "learning_rate": 9.735574406202332e-05, + "loss": 1.8667, + "step": 2388 + }, + { + "epoch": 0.10441889942742252, + "grad_norm": 2.78125, + "learning_rate": 9.735353949520871e-05, + "loss": 2.2035, + "step": 2389 + }, + { + "epoch": 0.10446260763145242, + "grad_norm": 4.1875, + "learning_rate": 9.73513340347639e-05, + "loss": 1.8563, + "step": 2390 + }, + { + "epoch": 0.10450631583548232, + "grad_norm": 2.109375, + "learning_rate": 9.73491276807305e-05, + "loss": 1.6536, + "step": 2391 + }, + { + "epoch": 0.10455002403951222, + "grad_norm": 2.796875, + "learning_rate": 9.734692043315012e-05, + "loss": 2.2219, + "step": 2392 + }, + { + "epoch": 0.10459373224354211, + "grad_norm": 2.859375, + "learning_rate": 9.734471229206448e-05, + "loss": 2.2594, + "step": 2393 + }, + { + "epoch": 0.104637440447572, + "grad_norm": 2.640625, + "learning_rate": 9.73425032575152e-05, + "loss": 2.0679, + "step": 2394 + }, + { + "epoch": 0.1046811486516019, + "grad_norm": 2.265625, + "learning_rate": 9.734029332954395e-05, + "loss": 2.0133, + "step": 2395 + }, + { + "epoch": 0.1047248568556318, + "grad_norm": 2.46875, + "learning_rate": 9.73380825081925e-05, + "loss": 1.9773, + "step": 2396 + }, + { + "epoch": 0.1047685650596617, + "grad_norm": 3.109375, + "learning_rate": 9.733587079350252e-05, + "loss": 2.7695, + "step": 2397 + }, + { + "epoch": 0.1048122732636916, + "grad_norm": 2.328125, + "learning_rate": 9.733365818551576e-05, + "loss": 1.9399, + "step": 2398 + }, + { + "epoch": 0.10485598146772149, + "grad_norm": 2.625, + "learning_rate": 9.7331444684274e-05, + "loss": 2.1683, + "step": 2399 + }, + { + "epoch": 0.10489968967175138, + "grad_norm": 3.0, + "learning_rate": 9.732923028981897e-05, + "loss": 2.5887, + "step": 2400 + }, + { + "epoch": 0.10494339787578129, + "grad_norm": 2.75, + "learning_rate": 9.732701500219251e-05, + "loss": 1.808, + "step": 2401 + }, + { + "epoch": 0.10498710607981118, + "grad_norm": 4.8125, + "learning_rate": 9.732479882143636e-05, + "loss": 3.8407, + "step": 2402 + }, + { + "epoch": 0.10503081428384108, + "grad_norm": 7.09375, + "learning_rate": 9.732258174759239e-05, + "loss": 3.2358, + "step": 2403 + }, + { + "epoch": 0.10507452248787097, + "grad_norm": 2.921875, + "learning_rate": 9.732036378070243e-05, + "loss": 1.934, + "step": 2404 + }, + { + "epoch": 0.10511823069190086, + "grad_norm": 2.453125, + "learning_rate": 9.731814492080832e-05, + "loss": 2.7517, + "step": 2405 + }, + { + "epoch": 0.10516193889593077, + "grad_norm": 2.71875, + "learning_rate": 9.731592516795197e-05, + "loss": 2.9847, + "step": 2406 + }, + { + "epoch": 0.10520564709996066, + "grad_norm": 2.203125, + "learning_rate": 9.731370452217524e-05, + "loss": 1.8667, + "step": 2407 + }, + { + "epoch": 0.10524935530399056, + "grad_norm": 2.359375, + "learning_rate": 9.731148298352004e-05, + "loss": 2.0742, + "step": 2408 + }, + { + "epoch": 0.10529306350802045, + "grad_norm": 4.0625, + "learning_rate": 9.73092605520283e-05, + "loss": 1.9423, + "step": 2409 + }, + { + "epoch": 0.10533677171205034, + "grad_norm": 4.28125, + "learning_rate": 9.730703722774196e-05, + "loss": 2.7559, + "step": 2410 + }, + { + "epoch": 0.10538047991608025, + "grad_norm": 2.640625, + "learning_rate": 9.730481301070298e-05, + "loss": 2.0713, + "step": 2411 + }, + { + "epoch": 0.10542418812011015, + "grad_norm": 2.578125, + "learning_rate": 9.730258790095331e-05, + "loss": 2.1342, + "step": 2412 + }, + { + "epoch": 0.10546789632414004, + "grad_norm": 2.875, + "learning_rate": 9.730036189853498e-05, + "loss": 2.3037, + "step": 2413 + }, + { + "epoch": 0.10551160452816993, + "grad_norm": 2.546875, + "learning_rate": 9.729813500348997e-05, + "loss": 2.0718, + "step": 2414 + }, + { + "epoch": 0.10555531273219984, + "grad_norm": 2.921875, + "learning_rate": 9.72959072158603e-05, + "loss": 3.004, + "step": 2415 + }, + { + "epoch": 0.10559902093622973, + "grad_norm": 2.3125, + "learning_rate": 9.729367853568805e-05, + "loss": 1.9165, + "step": 2416 + }, + { + "epoch": 0.10564272914025963, + "grad_norm": 3.046875, + "learning_rate": 9.729144896301524e-05, + "loss": 1.8318, + "step": 2417 + }, + { + "epoch": 0.10568643734428952, + "grad_norm": 3.015625, + "learning_rate": 9.728921849788397e-05, + "loss": 2.5869, + "step": 2418 + }, + { + "epoch": 0.10573014554831942, + "grad_norm": 2.828125, + "learning_rate": 9.72869871403363e-05, + "loss": 2.5118, + "step": 2419 + }, + { + "epoch": 0.10577385375234932, + "grad_norm": 2.890625, + "learning_rate": 9.728475489041438e-05, + "loss": 2.0763, + "step": 2420 + }, + { + "epoch": 0.10581756195637922, + "grad_norm": 2.453125, + "learning_rate": 9.728252174816031e-05, + "loss": 2.224, + "step": 2421 + }, + { + "epoch": 0.10586127016040911, + "grad_norm": 2.46875, + "learning_rate": 9.728028771361624e-05, + "loss": 2.1799, + "step": 2422 + }, + { + "epoch": 0.105904978364439, + "grad_norm": 2.515625, + "learning_rate": 9.727805278682431e-05, + "loss": 1.9667, + "step": 2423 + }, + { + "epoch": 0.1059486865684689, + "grad_norm": 2.421875, + "learning_rate": 9.727581696782673e-05, + "loss": 2.0938, + "step": 2424 + }, + { + "epoch": 0.1059923947724988, + "grad_norm": 2.5625, + "learning_rate": 9.727358025666568e-05, + "loss": 2.1156, + "step": 2425 + }, + { + "epoch": 0.1060361029765287, + "grad_norm": 2.53125, + "learning_rate": 9.727134265338335e-05, + "loss": 1.8436, + "step": 2426 + }, + { + "epoch": 0.10607981118055859, + "grad_norm": 2.40625, + "learning_rate": 9.7269104158022e-05, + "loss": 2.3787, + "step": 2427 + }, + { + "epoch": 0.10612351938458849, + "grad_norm": 2.359375, + "learning_rate": 9.726686477062386e-05, + "loss": 1.9247, + "step": 2428 + }, + { + "epoch": 0.10616722758861838, + "grad_norm": 3.1875, + "learning_rate": 9.726462449123117e-05, + "loss": 2.2084, + "step": 2429 + }, + { + "epoch": 0.10621093579264829, + "grad_norm": 2.375, + "learning_rate": 9.726238331988624e-05, + "loss": 2.3032, + "step": 2430 + }, + { + "epoch": 0.10625464399667818, + "grad_norm": 2.3125, + "learning_rate": 9.726014125663135e-05, + "loss": 1.9042, + "step": 2431 + }, + { + "epoch": 0.10629835220070807, + "grad_norm": 2.6875, + "learning_rate": 9.725789830150882e-05, + "loss": 2.1058, + "step": 2432 + }, + { + "epoch": 0.10634206040473797, + "grad_norm": 2.4375, + "learning_rate": 9.725565445456095e-05, + "loss": 1.7417, + "step": 2433 + }, + { + "epoch": 0.10638576860876786, + "grad_norm": 7.0, + "learning_rate": 9.72534097158301e-05, + "loss": 2.8365, + "step": 2434 + }, + { + "epoch": 0.10642947681279777, + "grad_norm": 2.484375, + "learning_rate": 9.725116408535864e-05, + "loss": 2.0676, + "step": 2435 + }, + { + "epoch": 0.10647318501682766, + "grad_norm": 2.640625, + "learning_rate": 9.724891756318895e-05, + "loss": 2.0782, + "step": 2436 + }, + { + "epoch": 0.10651689322085756, + "grad_norm": 2.21875, + "learning_rate": 9.724667014936342e-05, + "loss": 1.8681, + "step": 2437 + }, + { + "epoch": 0.10656060142488745, + "grad_norm": 2.484375, + "learning_rate": 9.724442184392445e-05, + "loss": 2.1118, + "step": 2438 + }, + { + "epoch": 0.10660430962891734, + "grad_norm": 2.625, + "learning_rate": 9.724217264691448e-05, + "loss": 2.16, + "step": 2439 + }, + { + "epoch": 0.10664801783294725, + "grad_norm": 3.0625, + "learning_rate": 9.723992255837596e-05, + "loss": 1.5494, + "step": 2440 + }, + { + "epoch": 0.10669172603697714, + "grad_norm": 2.859375, + "learning_rate": 9.723767157835135e-05, + "loss": 2.2222, + "step": 2441 + }, + { + "epoch": 0.10673543424100704, + "grad_norm": 3.671875, + "learning_rate": 9.723541970688311e-05, + "loss": 2.3099, + "step": 2442 + }, + { + "epoch": 0.10677914244503693, + "grad_norm": 2.953125, + "learning_rate": 9.723316694401377e-05, + "loss": 2.4099, + "step": 2443 + }, + { + "epoch": 0.10682285064906683, + "grad_norm": 4.59375, + "learning_rate": 9.723091328978581e-05, + "loss": 2.0523, + "step": 2444 + }, + { + "epoch": 0.10686655885309673, + "grad_norm": 2.75, + "learning_rate": 9.722865874424178e-05, + "loss": 2.5832, + "step": 2445 + }, + { + "epoch": 0.10691026705712663, + "grad_norm": 2.5625, + "learning_rate": 9.722640330742423e-05, + "loss": 2.1906, + "step": 2446 + }, + { + "epoch": 0.10695397526115652, + "grad_norm": 2.4375, + "learning_rate": 9.722414697937572e-05, + "loss": 2.4322, + "step": 2447 + }, + { + "epoch": 0.10699768346518641, + "grad_norm": 2.53125, + "learning_rate": 9.72218897601388e-05, + "loss": 1.8567, + "step": 2448 + }, + { + "epoch": 0.10704139166921631, + "grad_norm": 2.515625, + "learning_rate": 9.721963164975612e-05, + "loss": 1.9813, + "step": 2449 + }, + { + "epoch": 0.10708509987324621, + "grad_norm": 2.625, + "learning_rate": 9.721737264827025e-05, + "loss": 2.5686, + "step": 2450 + }, + { + "epoch": 0.10712880807727611, + "grad_norm": 2.4375, + "learning_rate": 9.721511275572384e-05, + "loss": 1.8549, + "step": 2451 + }, + { + "epoch": 0.107172516281306, + "grad_norm": 2.28125, + "learning_rate": 9.721285197215952e-05, + "loss": 2.0145, + "step": 2452 + }, + { + "epoch": 0.1072162244853359, + "grad_norm": 2.09375, + "learning_rate": 9.721059029761999e-05, + "loss": 1.6011, + "step": 2453 + }, + { + "epoch": 0.10725993268936579, + "grad_norm": 2.15625, + "learning_rate": 9.720832773214789e-05, + "loss": 1.7277, + "step": 2454 + }, + { + "epoch": 0.1073036408933957, + "grad_norm": 3.203125, + "learning_rate": 9.720606427578595e-05, + "loss": 2.0673, + "step": 2455 + }, + { + "epoch": 0.10734734909742559, + "grad_norm": 2.75, + "learning_rate": 9.720379992857687e-05, + "loss": 2.2455, + "step": 2456 + }, + { + "epoch": 0.10739105730145548, + "grad_norm": 2.375, + "learning_rate": 9.720153469056338e-05, + "loss": 2.2157, + "step": 2457 + }, + { + "epoch": 0.10743476550548538, + "grad_norm": 2.734375, + "learning_rate": 9.719926856178823e-05, + "loss": 1.8761, + "step": 2458 + }, + { + "epoch": 0.10747847370951527, + "grad_norm": 2.484375, + "learning_rate": 9.71970015422942e-05, + "loss": 2.5707, + "step": 2459 + }, + { + "epoch": 0.10752218191354518, + "grad_norm": 2.65625, + "learning_rate": 9.719473363212405e-05, + "loss": 2.0183, + "step": 2460 + }, + { + "epoch": 0.10756589011757507, + "grad_norm": 2.359375, + "learning_rate": 9.719246483132058e-05, + "loss": 2.1769, + "step": 2461 + }, + { + "epoch": 0.10760959832160497, + "grad_norm": 2.25, + "learning_rate": 9.719019513992662e-05, + "loss": 1.8736, + "step": 2462 + }, + { + "epoch": 0.10765330652563486, + "grad_norm": 2.28125, + "learning_rate": 9.7187924557985e-05, + "loss": 2.0864, + "step": 2463 + }, + { + "epoch": 0.10769701472966475, + "grad_norm": 2.359375, + "learning_rate": 9.718565308553857e-05, + "loss": 1.8813, + "step": 2464 + }, + { + "epoch": 0.10774072293369466, + "grad_norm": 2.3125, + "learning_rate": 9.718338072263017e-05, + "loss": 1.9003, + "step": 2465 + }, + { + "epoch": 0.10778443113772455, + "grad_norm": 3.75, + "learning_rate": 9.718110746930272e-05, + "loss": 1.6998, + "step": 2466 + }, + { + "epoch": 0.10782813934175445, + "grad_norm": 2.40625, + "learning_rate": 9.71788333255991e-05, + "loss": 1.8179, + "step": 2467 + }, + { + "epoch": 0.10787184754578434, + "grad_norm": 2.8125, + "learning_rate": 9.717655829156222e-05, + "loss": 1.872, + "step": 2468 + }, + { + "epoch": 0.10791555574981423, + "grad_norm": 6.28125, + "learning_rate": 9.717428236723505e-05, + "loss": 2.1897, + "step": 2469 + }, + { + "epoch": 0.10795926395384414, + "grad_norm": 2.734375, + "learning_rate": 9.717200555266049e-05, + "loss": 1.6204, + "step": 2470 + }, + { + "epoch": 0.10800297215787404, + "grad_norm": 2.671875, + "learning_rate": 9.716972784788152e-05, + "loss": 2.1799, + "step": 2471 + }, + { + "epoch": 0.10804668036190393, + "grad_norm": 2.71875, + "learning_rate": 9.716744925294116e-05, + "loss": 2.0839, + "step": 2472 + }, + { + "epoch": 0.10809038856593382, + "grad_norm": 2.78125, + "learning_rate": 9.716516976788236e-05, + "loss": 2.656, + "step": 2473 + }, + { + "epoch": 0.10813409676996372, + "grad_norm": 2.625, + "learning_rate": 9.716288939274819e-05, + "loss": 2.4926, + "step": 2474 + }, + { + "epoch": 0.10817780497399362, + "grad_norm": 2.328125, + "learning_rate": 9.716060812758163e-05, + "loss": 2.2936, + "step": 2475 + }, + { + "epoch": 0.10822151317802352, + "grad_norm": 2.53125, + "learning_rate": 9.715832597242576e-05, + "loss": 1.5035, + "step": 2476 + }, + { + "epoch": 0.10826522138205341, + "grad_norm": 2.921875, + "learning_rate": 9.715604292732366e-05, + "loss": 2.7127, + "step": 2477 + }, + { + "epoch": 0.1083089295860833, + "grad_norm": 2.90625, + "learning_rate": 9.715375899231837e-05, + "loss": 2.8202, + "step": 2478 + }, + { + "epoch": 0.1083526377901132, + "grad_norm": 2.765625, + "learning_rate": 9.715147416745303e-05, + "loss": 1.6242, + "step": 2479 + }, + { + "epoch": 0.1083963459941431, + "grad_norm": 2.359375, + "learning_rate": 9.714918845277075e-05, + "loss": 2.0765, + "step": 2480 + }, + { + "epoch": 0.108440054198173, + "grad_norm": 2.671875, + "learning_rate": 9.714690184831465e-05, + "loss": 2.0368, + "step": 2481 + }, + { + "epoch": 0.1084837624022029, + "grad_norm": 3.265625, + "learning_rate": 9.714461435412792e-05, + "loss": 2.2531, + "step": 2482 + }, + { + "epoch": 0.10852747060623279, + "grad_norm": 2.5625, + "learning_rate": 9.714232597025368e-05, + "loss": 1.8642, + "step": 2483 + }, + { + "epoch": 0.10857117881026268, + "grad_norm": 2.46875, + "learning_rate": 9.714003669673515e-05, + "loss": 2.1971, + "step": 2484 + }, + { + "epoch": 0.10861488701429259, + "grad_norm": 2.171875, + "learning_rate": 9.713774653361549e-05, + "loss": 1.6596, + "step": 2485 + }, + { + "epoch": 0.10865859521832248, + "grad_norm": 2.703125, + "learning_rate": 9.713545548093797e-05, + "loss": 2.9122, + "step": 2486 + }, + { + "epoch": 0.10870230342235238, + "grad_norm": 2.4375, + "learning_rate": 9.713316353874581e-05, + "loss": 1.8774, + "step": 2487 + }, + { + "epoch": 0.10874601162638227, + "grad_norm": 2.3125, + "learning_rate": 9.713087070708224e-05, + "loss": 2.1811, + "step": 2488 + }, + { + "epoch": 0.10878971983041216, + "grad_norm": 3.265625, + "learning_rate": 9.712857698599054e-05, + "loss": 1.9505, + "step": 2489 + }, + { + "epoch": 0.10883342803444207, + "grad_norm": 2.3125, + "learning_rate": 9.712628237551402e-05, + "loss": 1.8161, + "step": 2490 + }, + { + "epoch": 0.10887713623847196, + "grad_norm": 2.359375, + "learning_rate": 9.712398687569595e-05, + "loss": 1.9934, + "step": 2491 + }, + { + "epoch": 0.10892084444250186, + "grad_norm": 2.609375, + "learning_rate": 9.712169048657966e-05, + "loss": 2.0326, + "step": 2492 + }, + { + "epoch": 0.10896455264653175, + "grad_norm": 3.140625, + "learning_rate": 9.711939320820848e-05, + "loss": 1.9797, + "step": 2493 + }, + { + "epoch": 0.10900826085056164, + "grad_norm": 2.171875, + "learning_rate": 9.71170950406258e-05, + "loss": 1.9381, + "step": 2494 + }, + { + "epoch": 0.10905196905459155, + "grad_norm": 3.328125, + "learning_rate": 9.711479598387494e-05, + "loss": 2.5, + "step": 2495 + }, + { + "epoch": 0.10909567725862145, + "grad_norm": 2.34375, + "learning_rate": 9.71124960379993e-05, + "loss": 1.9666, + "step": 2496 + }, + { + "epoch": 0.10913938546265134, + "grad_norm": 2.859375, + "learning_rate": 9.711019520304231e-05, + "loss": 2.0122, + "step": 2497 + }, + { + "epoch": 0.10918309366668123, + "grad_norm": 2.421875, + "learning_rate": 9.710789347904736e-05, + "loss": 2.2053, + "step": 2498 + }, + { + "epoch": 0.10922680187071113, + "grad_norm": 2.4375, + "learning_rate": 9.71055908660579e-05, + "loss": 2.1717, + "step": 2499 + }, + { + "epoch": 0.10927051007474103, + "grad_norm": 3.0625, + "learning_rate": 9.710328736411737e-05, + "loss": 2.2256, + "step": 2500 + }, + { + "epoch": 0.10931421827877093, + "grad_norm": 2.1875, + "learning_rate": 9.710098297326928e-05, + "loss": 1.8244, + "step": 2501 + }, + { + "epoch": 0.10935792648280082, + "grad_norm": 2.953125, + "learning_rate": 9.709867769355707e-05, + "loss": 2.2914, + "step": 2502 + }, + { + "epoch": 0.10940163468683071, + "grad_norm": 2.484375, + "learning_rate": 9.709637152502427e-05, + "loss": 2.2509, + "step": 2503 + }, + { + "epoch": 0.10944534289086061, + "grad_norm": 2.546875, + "learning_rate": 9.709406446771439e-05, + "loss": 2.1208, + "step": 2504 + }, + { + "epoch": 0.10948905109489052, + "grad_norm": 2.734375, + "learning_rate": 9.709175652167096e-05, + "loss": 1.6196, + "step": 2505 + }, + { + "epoch": 0.10953275929892041, + "grad_norm": 2.859375, + "learning_rate": 9.708944768693755e-05, + "loss": 2.3912, + "step": 2506 + }, + { + "epoch": 0.1095764675029503, + "grad_norm": 3.5625, + "learning_rate": 9.708713796355773e-05, + "loss": 2.3145, + "step": 2507 + }, + { + "epoch": 0.1096201757069802, + "grad_norm": 3.109375, + "learning_rate": 9.708482735157509e-05, + "loss": 2.4361, + "step": 2508 + }, + { + "epoch": 0.10966388391101009, + "grad_norm": 2.28125, + "learning_rate": 9.708251585103322e-05, + "loss": 2.2467, + "step": 2509 + }, + { + "epoch": 0.10970759211504, + "grad_norm": 4.375, + "learning_rate": 9.708020346197577e-05, + "loss": 2.3246, + "step": 2510 + }, + { + "epoch": 0.10975130031906989, + "grad_norm": 3.03125, + "learning_rate": 9.707789018444636e-05, + "loss": 2.2745, + "step": 2511 + }, + { + "epoch": 0.10979500852309979, + "grad_norm": 3.09375, + "learning_rate": 9.707557601848862e-05, + "loss": 2.3604, + "step": 2512 + }, + { + "epoch": 0.10983871672712968, + "grad_norm": 2.46875, + "learning_rate": 9.707326096414625e-05, + "loss": 1.7883, + "step": 2513 + }, + { + "epoch": 0.10988242493115957, + "grad_norm": 2.328125, + "learning_rate": 9.707094502146294e-05, + "loss": 1.9555, + "step": 2514 + }, + { + "epoch": 0.10992613313518948, + "grad_norm": 3.078125, + "learning_rate": 9.706862819048239e-05, + "loss": 2.4865, + "step": 2515 + }, + { + "epoch": 0.10996984133921937, + "grad_norm": 2.40625, + "learning_rate": 9.706631047124833e-05, + "loss": 2.1589, + "step": 2516 + }, + { + "epoch": 0.11001354954324927, + "grad_norm": 2.53125, + "learning_rate": 9.706399186380446e-05, + "loss": 1.9848, + "step": 2517 + }, + { + "epoch": 0.11005725774727916, + "grad_norm": 2.9375, + "learning_rate": 9.706167236819459e-05, + "loss": 2.1828, + "step": 2518 + }, + { + "epoch": 0.11010096595130905, + "grad_norm": 2.65625, + "learning_rate": 9.705935198446246e-05, + "loss": 1.9501, + "step": 2519 + }, + { + "epoch": 0.11014467415533896, + "grad_norm": 2.671875, + "learning_rate": 9.705703071265187e-05, + "loss": 2.0098, + "step": 2520 + }, + { + "epoch": 0.11018838235936886, + "grad_norm": 2.640625, + "learning_rate": 9.70547085528066e-05, + "loss": 1.8923, + "step": 2521 + }, + { + "epoch": 0.11023209056339875, + "grad_norm": 2.84375, + "learning_rate": 9.705238550497053e-05, + "loss": 2.8409, + "step": 2522 + }, + { + "epoch": 0.11027579876742864, + "grad_norm": 2.90625, + "learning_rate": 9.705006156918744e-05, + "loss": 1.8483, + "step": 2523 + }, + { + "epoch": 0.11031950697145854, + "grad_norm": 2.625, + "learning_rate": 9.704773674550123e-05, + "loss": 2.0112, + "step": 2524 + }, + { + "epoch": 0.11036321517548844, + "grad_norm": 2.8125, + "learning_rate": 9.704541103395574e-05, + "loss": 2.122, + "step": 2525 + }, + { + "epoch": 0.11040692337951834, + "grad_norm": 3.1875, + "learning_rate": 9.704308443459487e-05, + "loss": 2.662, + "step": 2526 + }, + { + "epoch": 0.11045063158354823, + "grad_norm": 2.46875, + "learning_rate": 9.704075694746253e-05, + "loss": 2.0194, + "step": 2527 + }, + { + "epoch": 0.11049433978757812, + "grad_norm": 2.40625, + "learning_rate": 9.703842857260263e-05, + "loss": 1.7063, + "step": 2528 + }, + { + "epoch": 0.11053804799160802, + "grad_norm": 2.421875, + "learning_rate": 9.703609931005914e-05, + "loss": 1.9463, + "step": 2529 + }, + { + "epoch": 0.11058175619563793, + "grad_norm": 2.671875, + "learning_rate": 9.703376915987601e-05, + "loss": 1.9706, + "step": 2530 + }, + { + "epoch": 0.11062546439966782, + "grad_norm": 3.265625, + "learning_rate": 9.703143812209718e-05, + "loss": 2.4519, + "step": 2531 + }, + { + "epoch": 0.11066917260369771, + "grad_norm": 3.15625, + "learning_rate": 9.702910619676667e-05, + "loss": 2.0152, + "step": 2532 + }, + { + "epoch": 0.1107128808077276, + "grad_norm": 3.03125, + "learning_rate": 9.702677338392847e-05, + "loss": 2.2778, + "step": 2533 + }, + { + "epoch": 0.1107565890117575, + "grad_norm": 2.21875, + "learning_rate": 9.702443968362662e-05, + "loss": 1.9025, + "step": 2534 + }, + { + "epoch": 0.11080029721578741, + "grad_norm": 2.84375, + "learning_rate": 9.702210509590514e-05, + "loss": 2.1866, + "step": 2535 + }, + { + "epoch": 0.1108440054198173, + "grad_norm": 2.421875, + "learning_rate": 9.701976962080812e-05, + "loss": 1.7364, + "step": 2536 + }, + { + "epoch": 0.1108877136238472, + "grad_norm": 3.140625, + "learning_rate": 9.70174332583796e-05, + "loss": 2.5829, + "step": 2537 + }, + { + "epoch": 0.11093142182787709, + "grad_norm": 5.65625, + "learning_rate": 9.701509600866368e-05, + "loss": 2.1956, + "step": 2538 + }, + { + "epoch": 0.11097513003190698, + "grad_norm": 4.53125, + "learning_rate": 9.701275787170448e-05, + "loss": 2.4632, + "step": 2539 + }, + { + "epoch": 0.11101883823593689, + "grad_norm": 3.109375, + "learning_rate": 9.701041884754612e-05, + "loss": 2.389, + "step": 2540 + }, + { + "epoch": 0.11106254643996678, + "grad_norm": 2.53125, + "learning_rate": 9.700807893623272e-05, + "loss": 2.3511, + "step": 2541 + }, + { + "epoch": 0.11110625464399668, + "grad_norm": 2.421875, + "learning_rate": 9.700573813780847e-05, + "loss": 2.1516, + "step": 2542 + }, + { + "epoch": 0.11114996284802657, + "grad_norm": 2.671875, + "learning_rate": 9.700339645231751e-05, + "loss": 2.1829, + "step": 2543 + }, + { + "epoch": 0.11119367105205646, + "grad_norm": 2.828125, + "learning_rate": 9.700105387980406e-05, + "loss": 2.7425, + "step": 2544 + }, + { + "epoch": 0.11123737925608637, + "grad_norm": 3.0, + "learning_rate": 9.699871042031232e-05, + "loss": 2.3237, + "step": 2545 + }, + { + "epoch": 0.11128108746011627, + "grad_norm": 2.765625, + "learning_rate": 9.69963660738865e-05, + "loss": 1.8203, + "step": 2546 + }, + { + "epoch": 0.11132479566414616, + "grad_norm": 2.65625, + "learning_rate": 9.699402084057086e-05, + "loss": 1.8776, + "step": 2547 + }, + { + "epoch": 0.11136850386817605, + "grad_norm": 3.1875, + "learning_rate": 9.699167472040964e-05, + "loss": 2.8036, + "step": 2548 + }, + { + "epoch": 0.11141221207220595, + "grad_norm": 3.03125, + "learning_rate": 9.698932771344715e-05, + "loss": 2.2721, + "step": 2549 + }, + { + "epoch": 0.11145592027623585, + "grad_norm": 2.109375, + "learning_rate": 9.698697981972763e-05, + "loss": 1.9911, + "step": 2550 + }, + { + "epoch": 0.11149962848026575, + "grad_norm": 7.59375, + "learning_rate": 9.698463103929542e-05, + "loss": 2.914, + "step": 2551 + }, + { + "epoch": 0.11154333668429564, + "grad_norm": 2.546875, + "learning_rate": 9.698228137219485e-05, + "loss": 2.3899, + "step": 2552 + }, + { + "epoch": 0.11158704488832553, + "grad_norm": 2.78125, + "learning_rate": 9.697993081847024e-05, + "loss": 2.4362, + "step": 2553 + }, + { + "epoch": 0.11163075309235544, + "grad_norm": 2.375, + "learning_rate": 9.697757937816596e-05, + "loss": 2.0962, + "step": 2554 + }, + { + "epoch": 0.11167446129638534, + "grad_norm": 11.1875, + "learning_rate": 9.69752270513264e-05, + "loss": 2.4154, + "step": 2555 + }, + { + "epoch": 0.11171816950041523, + "grad_norm": 3.140625, + "learning_rate": 9.697287383799592e-05, + "loss": 1.3796, + "step": 2556 + }, + { + "epoch": 0.11176187770444512, + "grad_norm": 2.65625, + "learning_rate": 9.697051973821895e-05, + "loss": 1.9024, + "step": 2557 + }, + { + "epoch": 0.11180558590847502, + "grad_norm": 2.53125, + "learning_rate": 9.696816475203992e-05, + "loss": 2.4041, + "step": 2558 + }, + { + "epoch": 0.11184929411250492, + "grad_norm": 2.328125, + "learning_rate": 9.696580887950324e-05, + "loss": 2.0977, + "step": 2559 + }, + { + "epoch": 0.11189300231653482, + "grad_norm": 2.71875, + "learning_rate": 9.69634521206534e-05, + "loss": 2.501, + "step": 2560 + }, + { + "epoch": 0.11193671052056471, + "grad_norm": 2.890625, + "learning_rate": 9.696109447553488e-05, + "loss": 1.9393, + "step": 2561 + }, + { + "epoch": 0.1119804187245946, + "grad_norm": 3.046875, + "learning_rate": 9.695873594419213e-05, + "loss": 1.7693, + "step": 2562 + }, + { + "epoch": 0.1120241269286245, + "grad_norm": 3.078125, + "learning_rate": 9.695637652666972e-05, + "loss": 2.4633, + "step": 2563 + }, + { + "epoch": 0.1120678351326544, + "grad_norm": 2.296875, + "learning_rate": 9.695401622301212e-05, + "loss": 1.7753, + "step": 2564 + }, + { + "epoch": 0.1121115433366843, + "grad_norm": 3.75, + "learning_rate": 9.69516550332639e-05, + "loss": 3.1476, + "step": 2565 + }, + { + "epoch": 0.1121552515407142, + "grad_norm": 3.546875, + "learning_rate": 9.694929295746963e-05, + "loss": 2.3226, + "step": 2566 + }, + { + "epoch": 0.11219895974474409, + "grad_norm": 2.5625, + "learning_rate": 9.694692999567386e-05, + "loss": 2.6497, + "step": 2567 + }, + { + "epoch": 0.11224266794877398, + "grad_norm": 2.703125, + "learning_rate": 9.694456614792119e-05, + "loss": 1.9439, + "step": 2568 + }, + { + "epoch": 0.11228637615280389, + "grad_norm": 2.5, + "learning_rate": 9.694220141425623e-05, + "loss": 2.1219, + "step": 2569 + }, + { + "epoch": 0.11233008435683378, + "grad_norm": 2.296875, + "learning_rate": 9.69398357947236e-05, + "loss": 1.9452, + "step": 2570 + }, + { + "epoch": 0.11237379256086368, + "grad_norm": 2.015625, + "learning_rate": 9.693746928936798e-05, + "loss": 1.7742, + "step": 2571 + }, + { + "epoch": 0.11241750076489357, + "grad_norm": 2.390625, + "learning_rate": 9.693510189823398e-05, + "loss": 2.0195, + "step": 2572 + }, + { + "epoch": 0.11246120896892346, + "grad_norm": 2.71875, + "learning_rate": 9.69327336213663e-05, + "loss": 2.213, + "step": 2573 + }, + { + "epoch": 0.11250491717295337, + "grad_norm": 3.125, + "learning_rate": 9.693036445880963e-05, + "loss": 1.6811, + "step": 2574 + }, + { + "epoch": 0.11254862537698326, + "grad_norm": 3.4375, + "learning_rate": 9.692799441060868e-05, + "loss": 2.2884, + "step": 2575 + }, + { + "epoch": 0.11259233358101316, + "grad_norm": 2.65625, + "learning_rate": 9.692562347680817e-05, + "loss": 2.2061, + "step": 2576 + }, + { + "epoch": 0.11263604178504305, + "grad_norm": 2.359375, + "learning_rate": 9.692325165745285e-05, + "loss": 1.8484, + "step": 2577 + }, + { + "epoch": 0.11267974998907294, + "grad_norm": 3.015625, + "learning_rate": 9.692087895258748e-05, + "loss": 2.6623, + "step": 2578 + }, + { + "epoch": 0.11272345819310285, + "grad_norm": 2.328125, + "learning_rate": 9.691850536225684e-05, + "loss": 2.2441, + "step": 2579 + }, + { + "epoch": 0.11276716639713275, + "grad_norm": 3.359375, + "learning_rate": 9.691613088650571e-05, + "loss": 1.9522, + "step": 2580 + }, + { + "epoch": 0.11281087460116264, + "grad_norm": 2.75, + "learning_rate": 9.69137555253789e-05, + "loss": 2.5621, + "step": 2581 + }, + { + "epoch": 0.11285458280519253, + "grad_norm": 2.9375, + "learning_rate": 9.691137927892125e-05, + "loss": 2.6895, + "step": 2582 + }, + { + "epoch": 0.11289829100922243, + "grad_norm": 2.484375, + "learning_rate": 9.69090021471776e-05, + "loss": 1.9568, + "step": 2583 + }, + { + "epoch": 0.11294199921325233, + "grad_norm": 2.671875, + "learning_rate": 9.69066241301928e-05, + "loss": 1.9114, + "step": 2584 + }, + { + "epoch": 0.11298570741728223, + "grad_norm": 2.90625, + "learning_rate": 9.690424522801173e-05, + "loss": 2.1984, + "step": 2585 + }, + { + "epoch": 0.11302941562131212, + "grad_norm": 2.25, + "learning_rate": 9.69018654406793e-05, + "loss": 2.0646, + "step": 2586 + }, + { + "epoch": 0.11307312382534201, + "grad_norm": 2.828125, + "learning_rate": 9.68994847682404e-05, + "loss": 1.6406, + "step": 2587 + }, + { + "epoch": 0.11311683202937191, + "grad_norm": 3.765625, + "learning_rate": 9.689710321073997e-05, + "loss": 2.687, + "step": 2588 + }, + { + "epoch": 0.11316054023340182, + "grad_norm": 2.40625, + "learning_rate": 9.689472076822295e-05, + "loss": 1.8021, + "step": 2589 + }, + { + "epoch": 0.11320424843743171, + "grad_norm": 2.875, + "learning_rate": 9.689233744073427e-05, + "loss": 2.06, + "step": 2590 + }, + { + "epoch": 0.1132479566414616, + "grad_norm": 2.96875, + "learning_rate": 9.688995322831895e-05, + "loss": 2.283, + "step": 2591 + }, + { + "epoch": 0.1132916648454915, + "grad_norm": 2.46875, + "learning_rate": 9.688756813102197e-05, + "loss": 2.036, + "step": 2592 + }, + { + "epoch": 0.11333537304952139, + "grad_norm": 2.203125, + "learning_rate": 9.688518214888836e-05, + "loss": 2.0899, + "step": 2593 + }, + { + "epoch": 0.1133790812535513, + "grad_norm": 2.5, + "learning_rate": 9.688279528196309e-05, + "loss": 2.0154, + "step": 2594 + }, + { + "epoch": 0.11342278945758119, + "grad_norm": 2.484375, + "learning_rate": 9.688040753029125e-05, + "loss": 1.6289, + "step": 2595 + }, + { + "epoch": 0.11346649766161108, + "grad_norm": 2.765625, + "learning_rate": 9.687801889391789e-05, + "loss": 2.3163, + "step": 2596 + }, + { + "epoch": 0.11351020586564098, + "grad_norm": 3.984375, + "learning_rate": 9.687562937288807e-05, + "loss": 2.118, + "step": 2597 + }, + { + "epoch": 0.11355391406967087, + "grad_norm": 4.09375, + "learning_rate": 9.687323896724693e-05, + "loss": 2.3258, + "step": 2598 + }, + { + "epoch": 0.11359762227370078, + "grad_norm": 2.515625, + "learning_rate": 9.687084767703954e-05, + "loss": 2.6265, + "step": 2599 + }, + { + "epoch": 0.11364133047773067, + "grad_norm": 2.875, + "learning_rate": 9.686845550231102e-05, + "loss": 2.2793, + "step": 2600 + }, + { + "epoch": 0.11368503868176057, + "grad_norm": 2.5, + "learning_rate": 9.686606244310654e-05, + "loss": 1.9895, + "step": 2601 + }, + { + "epoch": 0.11372874688579046, + "grad_norm": 5.4375, + "learning_rate": 9.686366849947126e-05, + "loss": 2.4364, + "step": 2602 + }, + { + "epoch": 0.11377245508982035, + "grad_norm": 2.921875, + "learning_rate": 9.686127367145034e-05, + "loss": 2.4419, + "step": 2603 + }, + { + "epoch": 0.11381616329385026, + "grad_norm": 4.59375, + "learning_rate": 9.685887795908899e-05, + "loss": 1.8057, + "step": 2604 + }, + { + "epoch": 0.11385987149788016, + "grad_norm": 2.6875, + "learning_rate": 9.68564813624324e-05, + "loss": 1.5621, + "step": 2605 + }, + { + "epoch": 0.11390357970191005, + "grad_norm": 2.6875, + "learning_rate": 9.685408388152581e-05, + "loss": 1.7561, + "step": 2606 + }, + { + "epoch": 0.11394728790593994, + "grad_norm": 2.921875, + "learning_rate": 9.685168551641448e-05, + "loss": 2.0413, + "step": 2607 + }, + { + "epoch": 0.11399099610996984, + "grad_norm": 3.546875, + "learning_rate": 9.684928626714365e-05, + "loss": 1.4987, + "step": 2608 + }, + { + "epoch": 0.11403470431399974, + "grad_norm": 2.671875, + "learning_rate": 9.68468861337586e-05, + "loss": 2.4914, + "step": 2609 + }, + { + "epoch": 0.11407841251802964, + "grad_norm": 2.203125, + "learning_rate": 9.684448511630461e-05, + "loss": 2.1296, + "step": 2610 + }, + { + "epoch": 0.11412212072205953, + "grad_norm": 2.390625, + "learning_rate": 9.684208321482704e-05, + "loss": 1.8615, + "step": 2611 + }, + { + "epoch": 0.11416582892608942, + "grad_norm": 2.875, + "learning_rate": 9.683968042937117e-05, + "loss": 2.1629, + "step": 2612 + }, + { + "epoch": 0.11420953713011932, + "grad_norm": 2.515625, + "learning_rate": 9.683727675998236e-05, + "loss": 1.9825, + "step": 2613 + }, + { + "epoch": 0.11425324533414923, + "grad_norm": 2.9375, + "learning_rate": 9.683487220670595e-05, + "loss": 2.6845, + "step": 2614 + }, + { + "epoch": 0.11429695353817912, + "grad_norm": 2.53125, + "learning_rate": 9.683246676958735e-05, + "loss": 2.0385, + "step": 2615 + }, + { + "epoch": 0.11434066174220901, + "grad_norm": 2.734375, + "learning_rate": 9.683006044867194e-05, + "loss": 2.3356, + "step": 2616 + }, + { + "epoch": 0.1143843699462389, + "grad_norm": 2.28125, + "learning_rate": 9.682765324400514e-05, + "loss": 1.81, + "step": 2617 + }, + { + "epoch": 0.1144280781502688, + "grad_norm": 2.515625, + "learning_rate": 9.682524515563236e-05, + "loss": 2.5127, + "step": 2618 + }, + { + "epoch": 0.11447178635429871, + "grad_norm": 3.984375, + "learning_rate": 9.682283618359905e-05, + "loss": 2.0217, + "step": 2619 + }, + { + "epoch": 0.1145154945583286, + "grad_norm": 2.5625, + "learning_rate": 9.682042632795067e-05, + "loss": 1.9385, + "step": 2620 + }, + { + "epoch": 0.1145592027623585, + "grad_norm": 3.28125, + "learning_rate": 9.681801558873272e-05, + "loss": 2.1403, + "step": 2621 + }, + { + "epoch": 0.11460291096638839, + "grad_norm": 3.0, + "learning_rate": 9.681560396599068e-05, + "loss": 1.5651, + "step": 2622 + }, + { + "epoch": 0.11464661917041828, + "grad_norm": 2.65625, + "learning_rate": 9.681319145977003e-05, + "loss": 2.7997, + "step": 2623 + }, + { + "epoch": 0.11469032737444819, + "grad_norm": 2.203125, + "learning_rate": 9.681077807011634e-05, + "loss": 2.129, + "step": 2624 + }, + { + "epoch": 0.11473403557847808, + "grad_norm": 2.703125, + "learning_rate": 9.680836379707513e-05, + "loss": 2.0857, + "step": 2625 + }, + { + "epoch": 0.11477774378250798, + "grad_norm": 3.453125, + "learning_rate": 9.680594864069197e-05, + "loss": 1.9064, + "step": 2626 + }, + { + "epoch": 0.11482145198653787, + "grad_norm": 2.265625, + "learning_rate": 9.680353260101245e-05, + "loss": 1.6006, + "step": 2627 + }, + { + "epoch": 0.11486516019056776, + "grad_norm": 2.390625, + "learning_rate": 9.680111567808213e-05, + "loss": 1.6344, + "step": 2628 + }, + { + "epoch": 0.11490886839459767, + "grad_norm": 2.53125, + "learning_rate": 9.679869787194664e-05, + "loss": 2.1607, + "step": 2629 + }, + { + "epoch": 0.11495257659862756, + "grad_norm": 3.265625, + "learning_rate": 9.679627918265163e-05, + "loss": 2.1579, + "step": 2630 + }, + { + "epoch": 0.11499628480265746, + "grad_norm": 2.34375, + "learning_rate": 9.679385961024271e-05, + "loss": 2.589, + "step": 2631 + }, + { + "epoch": 0.11503999300668735, + "grad_norm": 3.34375, + "learning_rate": 9.679143915476556e-05, + "loss": 3.4489, + "step": 2632 + }, + { + "epoch": 0.11508370121071725, + "grad_norm": 3.609375, + "learning_rate": 9.678901781626584e-05, + "loss": 1.725, + "step": 2633 + }, + { + "epoch": 0.11512740941474715, + "grad_norm": 5.25, + "learning_rate": 9.678659559478926e-05, + "loss": 2.3209, + "step": 2634 + }, + { + "epoch": 0.11517111761877705, + "grad_norm": 2.921875, + "learning_rate": 9.678417249038154e-05, + "loss": 2.0092, + "step": 2635 + }, + { + "epoch": 0.11521482582280694, + "grad_norm": 2.515625, + "learning_rate": 9.678174850308839e-05, + "loss": 2.1935, + "step": 2636 + }, + { + "epoch": 0.11525853402683683, + "grad_norm": 3.25, + "learning_rate": 9.677932363295555e-05, + "loss": 2.3839, + "step": 2637 + }, + { + "epoch": 0.11530224223086673, + "grad_norm": 2.578125, + "learning_rate": 9.677689788002879e-05, + "loss": 2.3866, + "step": 2638 + }, + { + "epoch": 0.11534595043489664, + "grad_norm": 2.28125, + "learning_rate": 9.677447124435389e-05, + "loss": 1.7951, + "step": 2639 + }, + { + "epoch": 0.11538965863892653, + "grad_norm": 2.734375, + "learning_rate": 9.677204372597663e-05, + "loss": 2.4154, + "step": 2640 + }, + { + "epoch": 0.11543336684295642, + "grad_norm": 2.203125, + "learning_rate": 9.676961532494284e-05, + "loss": 2.1352, + "step": 2641 + }, + { + "epoch": 0.11547707504698632, + "grad_norm": 2.703125, + "learning_rate": 9.676718604129832e-05, + "loss": 2.3192, + "step": 2642 + }, + { + "epoch": 0.11552078325101621, + "grad_norm": 2.125, + "learning_rate": 9.676475587508897e-05, + "loss": 2.0761, + "step": 2643 + }, + { + "epoch": 0.11556449145504612, + "grad_norm": 2.375, + "learning_rate": 9.67623248263606e-05, + "loss": 1.6954, + "step": 2644 + }, + { + "epoch": 0.11560819965907601, + "grad_norm": 2.359375, + "learning_rate": 9.675989289515908e-05, + "loss": 2.0882, + "step": 2645 + }, + { + "epoch": 0.1156519078631059, + "grad_norm": 3.25, + "learning_rate": 9.675746008153035e-05, + "loss": 2.324, + "step": 2646 + }, + { + "epoch": 0.1156956160671358, + "grad_norm": 3.078125, + "learning_rate": 9.675502638552029e-05, + "loss": 2.7426, + "step": 2647 + }, + { + "epoch": 0.11573932427116569, + "grad_norm": 2.609375, + "learning_rate": 9.675259180717482e-05, + "loss": 2.0731, + "step": 2648 + }, + { + "epoch": 0.1157830324751956, + "grad_norm": 2.65625, + "learning_rate": 9.675015634653992e-05, + "loss": 2.5737, + "step": 2649 + }, + { + "epoch": 0.11582674067922549, + "grad_norm": 2.671875, + "learning_rate": 9.674772000366151e-05, + "loss": 2.3047, + "step": 2650 + }, + { + "epoch": 0.11587044888325539, + "grad_norm": 2.765625, + "learning_rate": 9.674528277858559e-05, + "loss": 1.3378, + "step": 2651 + }, + { + "epoch": 0.11591415708728528, + "grad_norm": 3.0625, + "learning_rate": 9.674284467135816e-05, + "loss": 2.2169, + "step": 2652 + }, + { + "epoch": 0.11595786529131517, + "grad_norm": 2.40625, + "learning_rate": 9.67404056820252e-05, + "loss": 2.0575, + "step": 2653 + }, + { + "epoch": 0.11600157349534508, + "grad_norm": 2.4375, + "learning_rate": 9.673796581063278e-05, + "loss": 2.0379, + "step": 2654 + }, + { + "epoch": 0.11604528169937497, + "grad_norm": 2.40625, + "learning_rate": 9.67355250572269e-05, + "loss": 1.7116, + "step": 2655 + }, + { + "epoch": 0.11608898990340487, + "grad_norm": 2.296875, + "learning_rate": 9.673308342185365e-05, + "loss": 1.9168, + "step": 2656 + }, + { + "epoch": 0.11613269810743476, + "grad_norm": 2.34375, + "learning_rate": 9.673064090455911e-05, + "loss": 2.8693, + "step": 2657 + }, + { + "epoch": 0.11617640631146466, + "grad_norm": 2.375, + "learning_rate": 9.672819750538935e-05, + "loss": 2.0332, + "step": 2658 + }, + { + "epoch": 0.11622011451549456, + "grad_norm": 2.515625, + "learning_rate": 9.67257532243905e-05, + "loss": 2.4218, + "step": 2659 + }, + { + "epoch": 0.11626382271952446, + "grad_norm": 2.4375, + "learning_rate": 9.672330806160868e-05, + "loss": 1.8975, + "step": 2660 + }, + { + "epoch": 0.11630753092355435, + "grad_norm": 3.0, + "learning_rate": 9.672086201709003e-05, + "loss": 2.4923, + "step": 2661 + }, + { + "epoch": 0.11635123912758424, + "grad_norm": 3.15625, + "learning_rate": 9.671841509088073e-05, + "loss": 2.2724, + "step": 2662 + }, + { + "epoch": 0.11639494733161414, + "grad_norm": 2.859375, + "learning_rate": 9.671596728302692e-05, + "loss": 2.2291, + "step": 2663 + }, + { + "epoch": 0.11643865553564405, + "grad_norm": 2.828125, + "learning_rate": 9.671351859357483e-05, + "loss": 2.263, + "step": 2664 + }, + { + "epoch": 0.11648236373967394, + "grad_norm": 2.6875, + "learning_rate": 9.671106902257065e-05, + "loss": 2.7241, + "step": 2665 + }, + { + "epoch": 0.11652607194370383, + "grad_norm": 2.578125, + "learning_rate": 9.670861857006061e-05, + "loss": 2.1171, + "step": 2666 + }, + { + "epoch": 0.11656978014773373, + "grad_norm": 2.828125, + "learning_rate": 9.670616723609096e-05, + "loss": 2.266, + "step": 2667 + }, + { + "epoch": 0.11661348835176362, + "grad_norm": 2.5625, + "learning_rate": 9.670371502070795e-05, + "loss": 1.9497, + "step": 2668 + }, + { + "epoch": 0.11665719655579353, + "grad_norm": 2.875, + "learning_rate": 9.670126192395787e-05, + "loss": 2.612, + "step": 2669 + }, + { + "epoch": 0.11670090475982342, + "grad_norm": 2.921875, + "learning_rate": 9.669880794588701e-05, + "loss": 2.1332, + "step": 2670 + }, + { + "epoch": 0.11674461296385331, + "grad_norm": 3.046875, + "learning_rate": 9.669635308654166e-05, + "loss": 2.334, + "step": 2671 + }, + { + "epoch": 0.11678832116788321, + "grad_norm": 2.78125, + "learning_rate": 9.669389734596819e-05, + "loss": 2.0459, + "step": 2672 + }, + { + "epoch": 0.1168320293719131, + "grad_norm": 2.859375, + "learning_rate": 9.66914407242129e-05, + "loss": 2.9586, + "step": 2673 + }, + { + "epoch": 0.11687573757594301, + "grad_norm": 2.28125, + "learning_rate": 9.668898322132218e-05, + "loss": 1.893, + "step": 2674 + }, + { + "epoch": 0.1169194457799729, + "grad_norm": 2.40625, + "learning_rate": 9.668652483734237e-05, + "loss": 2.3674, + "step": 2675 + }, + { + "epoch": 0.1169631539840028, + "grad_norm": 2.28125, + "learning_rate": 9.668406557231991e-05, + "loss": 2.0931, + "step": 2676 + }, + { + "epoch": 0.11700686218803269, + "grad_norm": 2.3125, + "learning_rate": 9.668160542630118e-05, + "loss": 1.92, + "step": 2677 + }, + { + "epoch": 0.11705057039206258, + "grad_norm": 2.203125, + "learning_rate": 9.667914439933262e-05, + "loss": 1.6033, + "step": 2678 + }, + { + "epoch": 0.11709427859609249, + "grad_norm": 3.296875, + "learning_rate": 9.667668249146067e-05, + "loss": 2.6922, + "step": 2679 + }, + { + "epoch": 0.11713798680012238, + "grad_norm": 2.40625, + "learning_rate": 9.667421970273177e-05, + "loss": 1.8012, + "step": 2680 + }, + { + "epoch": 0.11718169500415228, + "grad_norm": 2.84375, + "learning_rate": 9.667175603319243e-05, + "loss": 2.2011, + "step": 2681 + }, + { + "epoch": 0.11722540320818217, + "grad_norm": 2.46875, + "learning_rate": 9.66692914828891e-05, + "loss": 2.2007, + "step": 2682 + }, + { + "epoch": 0.11726911141221207, + "grad_norm": 2.90625, + "learning_rate": 9.666682605186835e-05, + "loss": 2.1641, + "step": 2683 + }, + { + "epoch": 0.11731281961624197, + "grad_norm": 2.265625, + "learning_rate": 9.666435974017665e-05, + "loss": 2.4392, + "step": 2684 + }, + { + "epoch": 0.11735652782027187, + "grad_norm": 2.234375, + "learning_rate": 9.666189254786056e-05, + "loss": 1.8101, + "step": 2685 + }, + { + "epoch": 0.11740023602430176, + "grad_norm": 2.84375, + "learning_rate": 9.665942447496666e-05, + "loss": 2.6438, + "step": 2686 + }, + { + "epoch": 0.11744394422833165, + "grad_norm": 8.3125, + "learning_rate": 9.66569555215415e-05, + "loss": 2.9357, + "step": 2687 + }, + { + "epoch": 0.11748765243236156, + "grad_norm": 3.1875, + "learning_rate": 9.665448568763169e-05, + "loss": 2.0782, + "step": 2688 + }, + { + "epoch": 0.11753136063639145, + "grad_norm": 2.328125, + "learning_rate": 9.665201497328384e-05, + "loss": 2.1359, + "step": 2689 + }, + { + "epoch": 0.11757506884042135, + "grad_norm": 2.296875, + "learning_rate": 9.664954337854455e-05, + "loss": 1.7294, + "step": 2690 + }, + { + "epoch": 0.11761877704445124, + "grad_norm": 2.625, + "learning_rate": 9.66470709034605e-05, + "loss": 2.2099, + "step": 2691 + }, + { + "epoch": 0.11766248524848114, + "grad_norm": 2.890625, + "learning_rate": 9.664459754807832e-05, + "loss": 3.201, + "step": 2692 + }, + { + "epoch": 0.11770619345251104, + "grad_norm": 2.734375, + "learning_rate": 9.66421233124447e-05, + "loss": 2.0315, + "step": 2693 + }, + { + "epoch": 0.11774990165654094, + "grad_norm": 3.078125, + "learning_rate": 9.663964819660633e-05, + "loss": 1.792, + "step": 2694 + }, + { + "epoch": 0.11779360986057083, + "grad_norm": 2.75, + "learning_rate": 9.663717220060991e-05, + "loss": 2.191, + "step": 2695 + }, + { + "epoch": 0.11783731806460072, + "grad_norm": 2.5, + "learning_rate": 9.663469532450218e-05, + "loss": 2.0232, + "step": 2696 + }, + { + "epoch": 0.11788102626863062, + "grad_norm": 2.859375, + "learning_rate": 9.663221756832988e-05, + "loss": 1.938, + "step": 2697 + }, + { + "epoch": 0.11792473447266053, + "grad_norm": 2.71875, + "learning_rate": 9.662973893213976e-05, + "loss": 2.7432, + "step": 2698 + }, + { + "epoch": 0.11796844267669042, + "grad_norm": 2.515625, + "learning_rate": 9.662725941597859e-05, + "loss": 1.9639, + "step": 2699 + }, + { + "epoch": 0.11801215088072031, + "grad_norm": 2.078125, + "learning_rate": 9.662477901989318e-05, + "loss": 1.9061, + "step": 2700 + }, + { + "epoch": 0.1180558590847502, + "grad_norm": 2.59375, + "learning_rate": 9.662229774393032e-05, + "loss": 2.5382, + "step": 2701 + }, + { + "epoch": 0.1180995672887801, + "grad_norm": 2.8125, + "learning_rate": 9.661981558813687e-05, + "loss": 2.3339, + "step": 2702 + }, + { + "epoch": 0.11814327549281001, + "grad_norm": 2.1875, + "learning_rate": 9.661733255255963e-05, + "loss": 1.7774, + "step": 2703 + }, + { + "epoch": 0.1181869836968399, + "grad_norm": 4.53125, + "learning_rate": 9.661484863724549e-05, + "loss": 1.8731, + "step": 2704 + }, + { + "epoch": 0.1182306919008698, + "grad_norm": 3.09375, + "learning_rate": 9.661236384224129e-05, + "loss": 2.4805, + "step": 2705 + }, + { + "epoch": 0.11827440010489969, + "grad_norm": 2.40625, + "learning_rate": 9.660987816759396e-05, + "loss": 1.7, + "step": 2706 + }, + { + "epoch": 0.11831810830892958, + "grad_norm": 2.921875, + "learning_rate": 9.66073916133504e-05, + "loss": 2.4671, + "step": 2707 + }, + { + "epoch": 0.11836181651295949, + "grad_norm": 2.890625, + "learning_rate": 9.660490417955749e-05, + "loss": 2.5451, + "step": 2708 + }, + { + "epoch": 0.11840552471698938, + "grad_norm": 2.8125, + "learning_rate": 9.660241586626224e-05, + "loss": 2.3707, + "step": 2709 + }, + { + "epoch": 0.11844923292101928, + "grad_norm": 3.046875, + "learning_rate": 9.659992667351157e-05, + "loss": 2.45, + "step": 2710 + }, + { + "epoch": 0.11849294112504917, + "grad_norm": 2.71875, + "learning_rate": 9.659743660135245e-05, + "loss": 2.5622, + "step": 2711 + }, + { + "epoch": 0.11853664932907906, + "grad_norm": 2.421875, + "learning_rate": 9.659494564983191e-05, + "loss": 2.4197, + "step": 2712 + }, + { + "epoch": 0.11858035753310897, + "grad_norm": 2.953125, + "learning_rate": 9.659245381899691e-05, + "loss": 2.1187, + "step": 2713 + }, + { + "epoch": 0.11862406573713886, + "grad_norm": 2.296875, + "learning_rate": 9.658996110889449e-05, + "loss": 2.2612, + "step": 2714 + }, + { + "epoch": 0.11866777394116876, + "grad_norm": 2.25, + "learning_rate": 9.65874675195717e-05, + "loss": 1.9677, + "step": 2715 + }, + { + "epoch": 0.11871148214519865, + "grad_norm": 2.84375, + "learning_rate": 9.658497305107558e-05, + "loss": 1.9515, + "step": 2716 + }, + { + "epoch": 0.11875519034922855, + "grad_norm": 2.1875, + "learning_rate": 9.658247770345323e-05, + "loss": 1.9262, + "step": 2717 + }, + { + "epoch": 0.11879889855325845, + "grad_norm": 2.875, + "learning_rate": 9.657998147675173e-05, + "loss": 2.469, + "step": 2718 + }, + { + "epoch": 0.11884260675728835, + "grad_norm": 2.390625, + "learning_rate": 9.65774843710182e-05, + "loss": 2.1998, + "step": 2719 + }, + { + "epoch": 0.11888631496131824, + "grad_norm": 2.15625, + "learning_rate": 9.657498638629972e-05, + "loss": 1.9848, + "step": 2720 + }, + { + "epoch": 0.11893002316534813, + "grad_norm": 2.78125, + "learning_rate": 9.657248752264348e-05, + "loss": 2.4081, + "step": 2721 + }, + { + "epoch": 0.11897373136937803, + "grad_norm": 2.1875, + "learning_rate": 9.656998778009661e-05, + "loss": 1.5549, + "step": 2722 + }, + { + "epoch": 0.11901743957340793, + "grad_norm": 2.484375, + "learning_rate": 9.656748715870629e-05, + "loss": 1.8315, + "step": 2723 + }, + { + "epoch": 0.11906114777743783, + "grad_norm": 2.40625, + "learning_rate": 9.656498565851972e-05, + "loss": 1.9119, + "step": 2724 + }, + { + "epoch": 0.11910485598146772, + "grad_norm": 2.625, + "learning_rate": 9.656248327958409e-05, + "loss": 1.7329, + "step": 2725 + }, + { + "epoch": 0.11914856418549762, + "grad_norm": 3.3125, + "learning_rate": 9.655998002194663e-05, + "loss": 2.5823, + "step": 2726 + }, + { + "epoch": 0.11919227238952751, + "grad_norm": 2.140625, + "learning_rate": 9.65574758856546e-05, + "loss": 1.5248, + "step": 2727 + }, + { + "epoch": 0.11923598059355742, + "grad_norm": 2.765625, + "learning_rate": 9.655497087075522e-05, + "loss": 2.3173, + "step": 2728 + }, + { + "epoch": 0.11927968879758731, + "grad_norm": 2.5625, + "learning_rate": 9.655246497729578e-05, + "loss": 1.7577, + "step": 2729 + }, + { + "epoch": 0.1193233970016172, + "grad_norm": 4.5625, + "learning_rate": 9.654995820532359e-05, + "loss": 1.9014, + "step": 2730 + }, + { + "epoch": 0.1193671052056471, + "grad_norm": 2.3125, + "learning_rate": 9.654745055488592e-05, + "loss": 1.8937, + "step": 2731 + }, + { + "epoch": 0.11941081340967699, + "grad_norm": 2.265625, + "learning_rate": 9.654494202603013e-05, + "loss": 1.7926, + "step": 2732 + }, + { + "epoch": 0.1194545216137069, + "grad_norm": 2.65625, + "learning_rate": 9.654243261880353e-05, + "loss": 1.5646, + "step": 2733 + }, + { + "epoch": 0.11949822981773679, + "grad_norm": 2.890625, + "learning_rate": 9.653992233325348e-05, + "loss": 3.3764, + "step": 2734 + }, + { + "epoch": 0.11954193802176669, + "grad_norm": 3.21875, + "learning_rate": 9.653741116942738e-05, + "loss": 2.5024, + "step": 2735 + }, + { + "epoch": 0.11958564622579658, + "grad_norm": 2.859375, + "learning_rate": 9.653489912737258e-05, + "loss": 2.5474, + "step": 2736 + }, + { + "epoch": 0.11962935442982647, + "grad_norm": 2.40625, + "learning_rate": 9.653238620713652e-05, + "loss": 2.585, + "step": 2737 + }, + { + "epoch": 0.11967306263385638, + "grad_norm": 2.59375, + "learning_rate": 9.652987240876659e-05, + "loss": 2.2581, + "step": 2738 + }, + { + "epoch": 0.11971677083788627, + "grad_norm": 2.96875, + "learning_rate": 9.652735773231025e-05, + "loss": 2.341, + "step": 2739 + }, + { + "epoch": 0.11976047904191617, + "grad_norm": 2.578125, + "learning_rate": 9.652484217781497e-05, + "loss": 2.4574, + "step": 2740 + }, + { + "epoch": 0.11980418724594606, + "grad_norm": 3.71875, + "learning_rate": 9.65223257453282e-05, + "loss": 2.1156, + "step": 2741 + }, + { + "epoch": 0.11984789544997596, + "grad_norm": 2.40625, + "learning_rate": 9.651980843489742e-05, + "loss": 1.846, + "step": 2742 + }, + { + "epoch": 0.11989160365400586, + "grad_norm": 3.0625, + "learning_rate": 9.651729024657014e-05, + "loss": 2.0689, + "step": 2743 + }, + { + "epoch": 0.11993531185803576, + "grad_norm": 2.53125, + "learning_rate": 9.651477118039391e-05, + "loss": 1.9795, + "step": 2744 + }, + { + "epoch": 0.11997902006206565, + "grad_norm": 3.171875, + "learning_rate": 9.651225123641625e-05, + "loss": 2.5617, + "step": 2745 + }, + { + "epoch": 0.12002272826609554, + "grad_norm": 2.71875, + "learning_rate": 9.65097304146847e-05, + "loss": 2.7336, + "step": 2746 + }, + { + "epoch": 0.12006643647012544, + "grad_norm": 2.875, + "learning_rate": 9.650720871524686e-05, + "loss": 1.7494, + "step": 2747 + }, + { + "epoch": 0.12011014467415534, + "grad_norm": 2.84375, + "learning_rate": 9.65046861381503e-05, + "loss": 1.8763, + "step": 2748 + }, + { + "epoch": 0.12015385287818524, + "grad_norm": 2.3125, + "learning_rate": 9.650216268344262e-05, + "loss": 1.9197, + "step": 2749 + }, + { + "epoch": 0.12019756108221513, + "grad_norm": 2.28125, + "learning_rate": 9.649963835117147e-05, + "loss": 2.1772, + "step": 2750 + }, + { + "epoch": 0.12024126928624503, + "grad_norm": 2.3125, + "learning_rate": 9.649711314138443e-05, + "loss": 1.8606, + "step": 2751 + }, + { + "epoch": 0.12028497749027492, + "grad_norm": 2.515625, + "learning_rate": 9.649458705412921e-05, + "loss": 1.6259, + "step": 2752 + }, + { + "epoch": 0.12032868569430483, + "grad_norm": 2.046875, + "learning_rate": 9.649206008945348e-05, + "loss": 1.7527, + "step": 2753 + }, + { + "epoch": 0.12037239389833472, + "grad_norm": 7.90625, + "learning_rate": 9.648953224740489e-05, + "loss": 3.3193, + "step": 2754 + }, + { + "epoch": 0.12041610210236461, + "grad_norm": 2.828125, + "learning_rate": 9.648700352803118e-05, + "loss": 2.3352, + "step": 2755 + }, + { + "epoch": 0.12045981030639451, + "grad_norm": 2.875, + "learning_rate": 9.648447393138005e-05, + "loss": 2.2762, + "step": 2756 + }, + { + "epoch": 0.1205035185104244, + "grad_norm": 2.53125, + "learning_rate": 9.648194345749923e-05, + "loss": 2.3559, + "step": 2757 + }, + { + "epoch": 0.12054722671445431, + "grad_norm": 2.625, + "learning_rate": 9.64794121064365e-05, + "loss": 2.6152, + "step": 2758 + }, + { + "epoch": 0.1205909349184842, + "grad_norm": 2.25, + "learning_rate": 9.64768798782396e-05, + "loss": 1.7972, + "step": 2759 + }, + { + "epoch": 0.1206346431225141, + "grad_norm": 3.171875, + "learning_rate": 9.647434677295635e-05, + "loss": 1.8208, + "step": 2760 + }, + { + "epoch": 0.12067835132654399, + "grad_norm": 2.984375, + "learning_rate": 9.647181279063453e-05, + "loss": 2.6227, + "step": 2761 + }, + { + "epoch": 0.12072205953057388, + "grad_norm": 2.3125, + "learning_rate": 9.646927793132195e-05, + "loss": 2.2009, + "step": 2762 + }, + { + "epoch": 0.12076576773460379, + "grad_norm": 2.78125, + "learning_rate": 9.646674219506648e-05, + "loss": 2.5652, + "step": 2763 + }, + { + "epoch": 0.12080947593863368, + "grad_norm": 2.46875, + "learning_rate": 9.646420558191596e-05, + "loss": 2.5864, + "step": 2764 + }, + { + "epoch": 0.12085318414266358, + "grad_norm": 2.328125, + "learning_rate": 9.646166809191824e-05, + "loss": 2.542, + "step": 2765 + }, + { + "epoch": 0.12089689234669347, + "grad_norm": 2.375, + "learning_rate": 9.645912972512124e-05, + "loss": 1.6604, + "step": 2766 + }, + { + "epoch": 0.12094060055072337, + "grad_norm": 2.21875, + "learning_rate": 9.645659048157282e-05, + "loss": 1.5808, + "step": 2767 + }, + { + "epoch": 0.12098430875475327, + "grad_norm": 3.0625, + "learning_rate": 9.645405036132093e-05, + "loss": 1.7339, + "step": 2768 + }, + { + "epoch": 0.12102801695878317, + "grad_norm": 2.65625, + "learning_rate": 9.645150936441351e-05, + "loss": 2.326, + "step": 2769 + }, + { + "epoch": 0.12107172516281306, + "grad_norm": 2.640625, + "learning_rate": 9.64489674908985e-05, + "loss": 1.763, + "step": 2770 + }, + { + "epoch": 0.12111543336684295, + "grad_norm": 2.578125, + "learning_rate": 9.644642474082386e-05, + "loss": 1.7941, + "step": 2771 + }, + { + "epoch": 0.12115914157087285, + "grad_norm": 2.578125, + "learning_rate": 9.644388111423759e-05, + "loss": 1.76, + "step": 2772 + }, + { + "epoch": 0.12120284977490275, + "grad_norm": 2.4375, + "learning_rate": 9.644133661118769e-05, + "loss": 2.1463, + "step": 2773 + }, + { + "epoch": 0.12124655797893265, + "grad_norm": 2.390625, + "learning_rate": 9.643879123172218e-05, + "loss": 1.7819, + "step": 2774 + }, + { + "epoch": 0.12129026618296254, + "grad_norm": 2.609375, + "learning_rate": 9.643624497588908e-05, + "loss": 2.1013, + "step": 2775 + }, + { + "epoch": 0.12133397438699244, + "grad_norm": 2.765625, + "learning_rate": 9.643369784373645e-05, + "loss": 2.0237, + "step": 2776 + }, + { + "epoch": 0.12137768259102233, + "grad_norm": 2.359375, + "learning_rate": 9.643114983531238e-05, + "loss": 1.6875, + "step": 2777 + }, + { + "epoch": 0.12142139079505224, + "grad_norm": 2.59375, + "learning_rate": 9.642860095066493e-05, + "loss": 2.4743, + "step": 2778 + }, + { + "epoch": 0.12146509899908213, + "grad_norm": 2.5625, + "learning_rate": 9.64260511898422e-05, + "loss": 2.577, + "step": 2779 + }, + { + "epoch": 0.12150880720311202, + "grad_norm": 3.828125, + "learning_rate": 9.642350055289232e-05, + "loss": 2.5573, + "step": 2780 + }, + { + "epoch": 0.12155251540714192, + "grad_norm": 2.25, + "learning_rate": 9.642094903986341e-05, + "loss": 1.871, + "step": 2781 + }, + { + "epoch": 0.12159622361117181, + "grad_norm": 3.015625, + "learning_rate": 9.641839665080363e-05, + "loss": 2.1364, + "step": 2782 + }, + { + "epoch": 0.12163993181520172, + "grad_norm": 2.234375, + "learning_rate": 9.641584338576115e-05, + "loss": 1.9107, + "step": 2783 + }, + { + "epoch": 0.12168364001923161, + "grad_norm": 2.515625, + "learning_rate": 9.641328924478416e-05, + "loss": 1.7113, + "step": 2784 + }, + { + "epoch": 0.1217273482232615, + "grad_norm": 2.3125, + "learning_rate": 9.641073422792085e-05, + "loss": 1.885, + "step": 2785 + }, + { + "epoch": 0.1217710564272914, + "grad_norm": 2.46875, + "learning_rate": 9.640817833521941e-05, + "loss": 2.057, + "step": 2786 + }, + { + "epoch": 0.12181476463132129, + "grad_norm": 3.609375, + "learning_rate": 9.640562156672812e-05, + "loss": 2.3443, + "step": 2787 + }, + { + "epoch": 0.1218584728353512, + "grad_norm": 4.1875, + "learning_rate": 9.64030639224952e-05, + "loss": 1.3211, + "step": 2788 + }, + { + "epoch": 0.1219021810393811, + "grad_norm": 2.609375, + "learning_rate": 9.640050540256896e-05, + "loss": 1.985, + "step": 2789 + }, + { + "epoch": 0.12194588924341099, + "grad_norm": 2.53125, + "learning_rate": 9.639794600699761e-05, + "loss": 1.9536, + "step": 2790 + }, + { + "epoch": 0.12198959744744088, + "grad_norm": 2.359375, + "learning_rate": 9.639538573582951e-05, + "loss": 1.9697, + "step": 2791 + }, + { + "epoch": 0.12203330565147077, + "grad_norm": 3.296875, + "learning_rate": 9.639282458911297e-05, + "loss": 1.9432, + "step": 2792 + }, + { + "epoch": 0.12207701385550068, + "grad_norm": 2.265625, + "learning_rate": 9.639026256689628e-05, + "loss": 1.8395, + "step": 2793 + }, + { + "epoch": 0.12212072205953058, + "grad_norm": 3.3125, + "learning_rate": 9.638769966922783e-05, + "loss": 2.3836, + "step": 2794 + }, + { + "epoch": 0.12216443026356047, + "grad_norm": 2.984375, + "learning_rate": 9.638513589615596e-05, + "loss": 2.228, + "step": 2795 + }, + { + "epoch": 0.12220813846759036, + "grad_norm": 2.359375, + "learning_rate": 9.638257124772909e-05, + "loss": 2.281, + "step": 2796 + }, + { + "epoch": 0.12225184667162026, + "grad_norm": 2.296875, + "learning_rate": 9.638000572399559e-05, + "loss": 2.0127, + "step": 2797 + }, + { + "epoch": 0.12229555487565016, + "grad_norm": 2.640625, + "learning_rate": 9.637743932500385e-05, + "loss": 2.5281, + "step": 2798 + }, + { + "epoch": 0.12233926307968006, + "grad_norm": 2.34375, + "learning_rate": 9.637487205080236e-05, + "loss": 2.0413, + "step": 2799 + }, + { + "epoch": 0.12238297128370995, + "grad_norm": 2.234375, + "learning_rate": 9.637230390143953e-05, + "loss": 2.1581, + "step": 2800 + }, + { + "epoch": 0.12242667948773985, + "grad_norm": 2.359375, + "learning_rate": 9.636973487696383e-05, + "loss": 2.5069, + "step": 2801 + }, + { + "epoch": 0.12247038769176974, + "grad_norm": 2.28125, + "learning_rate": 9.636716497742375e-05, + "loss": 1.5927, + "step": 2802 + }, + { + "epoch": 0.12251409589579965, + "grad_norm": 2.09375, + "learning_rate": 9.636459420286779e-05, + "loss": 1.7826, + "step": 2803 + }, + { + "epoch": 0.12255780409982954, + "grad_norm": 3.71875, + "learning_rate": 9.636202255334444e-05, + "loss": 2.4786, + "step": 2804 + }, + { + "epoch": 0.12260151230385943, + "grad_norm": 2.625, + "learning_rate": 9.635945002890225e-05, + "loss": 2.1374, + "step": 2805 + }, + { + "epoch": 0.12264522050788933, + "grad_norm": 2.234375, + "learning_rate": 9.635687662958978e-05, + "loss": 1.6501, + "step": 2806 + }, + { + "epoch": 0.12268892871191922, + "grad_norm": 2.625, + "learning_rate": 9.635430235545557e-05, + "loss": 2.1622, + "step": 2807 + }, + { + "epoch": 0.12273263691594913, + "grad_norm": 2.0625, + "learning_rate": 9.635172720654822e-05, + "loss": 1.9891, + "step": 2808 + }, + { + "epoch": 0.12277634511997902, + "grad_norm": 2.375, + "learning_rate": 9.634915118291629e-05, + "loss": 1.9462, + "step": 2809 + }, + { + "epoch": 0.12282005332400892, + "grad_norm": 2.859375, + "learning_rate": 9.634657428460844e-05, + "loss": 2.5531, + "step": 2810 + }, + { + "epoch": 0.12286376152803881, + "grad_norm": 2.21875, + "learning_rate": 9.634399651167328e-05, + "loss": 1.9049, + "step": 2811 + }, + { + "epoch": 0.1229074697320687, + "grad_norm": 3.078125, + "learning_rate": 9.634141786415944e-05, + "loss": 2.4264, + "step": 2812 + }, + { + "epoch": 0.12295117793609861, + "grad_norm": 3.640625, + "learning_rate": 9.633883834211562e-05, + "loss": 2.2442, + "step": 2813 + }, + { + "epoch": 0.1229948861401285, + "grad_norm": 3.234375, + "learning_rate": 9.633625794559045e-05, + "loss": 2.1397, + "step": 2814 + }, + { + "epoch": 0.1230385943441584, + "grad_norm": 2.71875, + "learning_rate": 9.633367667463267e-05, + "loss": 2.8998, + "step": 2815 + }, + { + "epoch": 0.12308230254818829, + "grad_norm": 2.28125, + "learning_rate": 9.633109452929097e-05, + "loss": 1.9054, + "step": 2816 + }, + { + "epoch": 0.12312601075221818, + "grad_norm": 2.1875, + "learning_rate": 9.632851150961409e-05, + "loss": 1.9339, + "step": 2817 + }, + { + "epoch": 0.12316971895624809, + "grad_norm": 2.34375, + "learning_rate": 9.632592761565077e-05, + "loss": 1.8701, + "step": 2818 + }, + { + "epoch": 0.12321342716027799, + "grad_norm": 2.859375, + "learning_rate": 9.632334284744978e-05, + "loss": 2.0784, + "step": 2819 + }, + { + "epoch": 0.12325713536430788, + "grad_norm": 7.4375, + "learning_rate": 9.632075720505987e-05, + "loss": 1.8226, + "step": 2820 + }, + { + "epoch": 0.12330084356833777, + "grad_norm": 2.28125, + "learning_rate": 9.631817068852986e-05, + "loss": 1.7425, + "step": 2821 + }, + { + "epoch": 0.12334455177236767, + "grad_norm": 2.546875, + "learning_rate": 9.631558329790857e-05, + "loss": 1.9114, + "step": 2822 + }, + { + "epoch": 0.12338825997639757, + "grad_norm": 3.078125, + "learning_rate": 9.63129950332448e-05, + "loss": 2.1165, + "step": 2823 + }, + { + "epoch": 0.12343196818042747, + "grad_norm": 2.234375, + "learning_rate": 9.631040589458741e-05, + "loss": 1.8682, + "step": 2824 + }, + { + "epoch": 0.12347567638445736, + "grad_norm": 2.84375, + "learning_rate": 9.630781588198526e-05, + "loss": 1.7066, + "step": 2825 + }, + { + "epoch": 0.12351938458848725, + "grad_norm": 2.578125, + "learning_rate": 9.630522499548723e-05, + "loss": 2.5323, + "step": 2826 + }, + { + "epoch": 0.12356309279251716, + "grad_norm": 2.640625, + "learning_rate": 9.630263323514222e-05, + "loss": 2.2879, + "step": 2827 + }, + { + "epoch": 0.12360680099654706, + "grad_norm": 2.359375, + "learning_rate": 9.630004060099911e-05, + "loss": 2.0478, + "step": 2828 + }, + { + "epoch": 0.12365050920057695, + "grad_norm": 2.109375, + "learning_rate": 9.629744709310686e-05, + "loss": 1.6938, + "step": 2829 + }, + { + "epoch": 0.12369421740460684, + "grad_norm": 2.53125, + "learning_rate": 9.629485271151439e-05, + "loss": 1.9106, + "step": 2830 + }, + { + "epoch": 0.12373792560863674, + "grad_norm": 3.9375, + "learning_rate": 9.629225745627069e-05, + "loss": 2.8505, + "step": 2831 + }, + { + "epoch": 0.12378163381266664, + "grad_norm": 2.640625, + "learning_rate": 9.628966132742469e-05, + "loss": 2.4035, + "step": 2832 + }, + { + "epoch": 0.12382534201669654, + "grad_norm": 2.75, + "learning_rate": 9.628706432502543e-05, + "loss": 1.9975, + "step": 2833 + }, + { + "epoch": 0.12386905022072643, + "grad_norm": 2.234375, + "learning_rate": 9.62844664491219e-05, + "loss": 1.9244, + "step": 2834 + }, + { + "epoch": 0.12391275842475633, + "grad_norm": 2.25, + "learning_rate": 9.628186769976311e-05, + "loss": 1.9185, + "step": 2835 + }, + { + "epoch": 0.12395646662878622, + "grad_norm": 4.09375, + "learning_rate": 9.627926807699812e-05, + "loss": 1.9349, + "step": 2836 + }, + { + "epoch": 0.12400017483281613, + "grad_norm": 4.28125, + "learning_rate": 9.6276667580876e-05, + "loss": 1.671, + "step": 2837 + }, + { + "epoch": 0.12404388303684602, + "grad_norm": 2.1875, + "learning_rate": 9.627406621144578e-05, + "loss": 1.6789, + "step": 2838 + }, + { + "epoch": 0.12408759124087591, + "grad_norm": 2.53125, + "learning_rate": 9.62714639687566e-05, + "loss": 1.9697, + "step": 2839 + }, + { + "epoch": 0.12413129944490581, + "grad_norm": 2.3125, + "learning_rate": 9.626886085285755e-05, + "loss": 2.0127, + "step": 2840 + }, + { + "epoch": 0.1241750076489357, + "grad_norm": 2.328125, + "learning_rate": 9.626625686379776e-05, + "loss": 1.8073, + "step": 2841 + }, + { + "epoch": 0.12421871585296561, + "grad_norm": 2.140625, + "learning_rate": 9.626365200162636e-05, + "loss": 1.6166, + "step": 2842 + }, + { + "epoch": 0.1242624240569955, + "grad_norm": 3.90625, + "learning_rate": 9.626104626639252e-05, + "loss": 1.9251, + "step": 2843 + }, + { + "epoch": 0.1243061322610254, + "grad_norm": 3.875, + "learning_rate": 9.625843965814539e-05, + "loss": 2.5869, + "step": 2844 + }, + { + "epoch": 0.12434984046505529, + "grad_norm": 2.796875, + "learning_rate": 9.62558321769342e-05, + "loss": 3.0161, + "step": 2845 + }, + { + "epoch": 0.12439354866908518, + "grad_norm": 2.375, + "learning_rate": 9.625322382280811e-05, + "loss": 1.901, + "step": 2846 + }, + { + "epoch": 0.12443725687311509, + "grad_norm": 2.21875, + "learning_rate": 9.625061459581639e-05, + "loss": 1.9044, + "step": 2847 + }, + { + "epoch": 0.12448096507714498, + "grad_norm": 2.453125, + "learning_rate": 9.624800449600825e-05, + "loss": 2.3202, + "step": 2848 + }, + { + "epoch": 0.12452467328117488, + "grad_norm": 2.515625, + "learning_rate": 9.624539352343295e-05, + "loss": 2.5673, + "step": 2849 + }, + { + "epoch": 0.12456838148520477, + "grad_norm": 2.3125, + "learning_rate": 9.624278167813977e-05, + "loss": 2.3917, + "step": 2850 + }, + { + "epoch": 0.12461208968923466, + "grad_norm": 3.53125, + "learning_rate": 9.6240168960178e-05, + "loss": 3.0122, + "step": 2851 + }, + { + "epoch": 0.12465579789326457, + "grad_norm": 2.125, + "learning_rate": 9.623755536959693e-05, + "loss": 1.8228, + "step": 2852 + }, + { + "epoch": 0.12469950609729447, + "grad_norm": 3.21875, + "learning_rate": 9.623494090644591e-05, + "loss": 3.0652, + "step": 2853 + }, + { + "epoch": 0.12474321430132436, + "grad_norm": 2.296875, + "learning_rate": 9.623232557077426e-05, + "loss": 2.1147, + "step": 2854 + }, + { + "epoch": 0.12478692250535425, + "grad_norm": 3.34375, + "learning_rate": 9.622970936263134e-05, + "loss": 2.8506, + "step": 2855 + }, + { + "epoch": 0.12483063070938415, + "grad_norm": 3.15625, + "learning_rate": 9.622709228206651e-05, + "loss": 2.3544, + "step": 2856 + }, + { + "epoch": 0.12487433891341405, + "grad_norm": 2.65625, + "learning_rate": 9.622447432912918e-05, + "loss": 1.9846, + "step": 2857 + }, + { + "epoch": 0.12491804711744395, + "grad_norm": 2.46875, + "learning_rate": 9.622185550386873e-05, + "loss": 1.9084, + "step": 2858 + }, + { + "epoch": 0.12496175532147384, + "grad_norm": 2.25, + "learning_rate": 9.62192358063346e-05, + "loss": 1.8372, + "step": 2859 + }, + { + "epoch": 0.12500546352550373, + "grad_norm": 2.265625, + "learning_rate": 9.621661523657623e-05, + "loss": 2.4741, + "step": 2860 + }, + { + "epoch": 0.12504917172953364, + "grad_norm": 2.65625, + "learning_rate": 9.621399379464306e-05, + "loss": 2.4071, + "step": 2861 + }, + { + "epoch": 0.12509287993356352, + "grad_norm": 2.796875, + "learning_rate": 9.621137148058457e-05, + "loss": 2.3149, + "step": 2862 + }, + { + "epoch": 0.12513658813759343, + "grad_norm": 2.578125, + "learning_rate": 9.620874829445023e-05, + "loss": 2.3921, + "step": 2863 + }, + { + "epoch": 0.1251802963416233, + "grad_norm": 2.40625, + "learning_rate": 9.620612423628956e-05, + "loss": 1.9683, + "step": 2864 + }, + { + "epoch": 0.12522400454565322, + "grad_norm": 3.4375, + "learning_rate": 9.620349930615207e-05, + "loss": 2.6925, + "step": 2865 + }, + { + "epoch": 0.12526771274968312, + "grad_norm": 2.390625, + "learning_rate": 9.620087350408732e-05, + "loss": 2.2013, + "step": 2866 + }, + { + "epoch": 0.125311420953713, + "grad_norm": 2.703125, + "learning_rate": 9.619824683014484e-05, + "loss": 2.3714, + "step": 2867 + }, + { + "epoch": 0.1253551291577429, + "grad_norm": 2.140625, + "learning_rate": 9.61956192843742e-05, + "loss": 1.9963, + "step": 2868 + }, + { + "epoch": 0.1253988373617728, + "grad_norm": 2.4375, + "learning_rate": 9.619299086682498e-05, + "loss": 1.7972, + "step": 2869 + }, + { + "epoch": 0.1254425455658027, + "grad_norm": 2.5625, + "learning_rate": 9.61903615775468e-05, + "loss": 2.12, + "step": 2870 + }, + { + "epoch": 0.1254862537698326, + "grad_norm": 2.203125, + "learning_rate": 9.618773141658927e-05, + "loss": 2.1438, + "step": 2871 + }, + { + "epoch": 0.12552996197386249, + "grad_norm": 2.34375, + "learning_rate": 9.618510038400203e-05, + "loss": 2.0048, + "step": 2872 + }, + { + "epoch": 0.1255736701778924, + "grad_norm": 2.734375, + "learning_rate": 9.618246847983471e-05, + "loss": 2.315, + "step": 2873 + }, + { + "epoch": 0.12561737838192227, + "grad_norm": 2.234375, + "learning_rate": 9.617983570413702e-05, + "loss": 1.9862, + "step": 2874 + }, + { + "epoch": 0.12566108658595218, + "grad_norm": 2.09375, + "learning_rate": 9.617720205695862e-05, + "loss": 1.7937, + "step": 2875 + }, + { + "epoch": 0.1257047947899821, + "grad_norm": 2.75, + "learning_rate": 9.617456753834919e-05, + "loss": 2.5493, + "step": 2876 + }, + { + "epoch": 0.12574850299401197, + "grad_norm": 2.421875, + "learning_rate": 9.617193214835847e-05, + "loss": 2.4217, + "step": 2877 + }, + { + "epoch": 0.12579221119804188, + "grad_norm": 8.375, + "learning_rate": 9.616929588703618e-05, + "loss": 2.2522, + "step": 2878 + }, + { + "epoch": 0.12583591940207176, + "grad_norm": 2.390625, + "learning_rate": 9.61666587544321e-05, + "loss": 1.9273, + "step": 2879 + }, + { + "epoch": 0.12587962760610166, + "grad_norm": 2.640625, + "learning_rate": 9.616402075059597e-05, + "loss": 2.637, + "step": 2880 + }, + { + "epoch": 0.12592333581013157, + "grad_norm": 2.515625, + "learning_rate": 9.616138187557758e-05, + "loss": 2.0555, + "step": 2881 + }, + { + "epoch": 0.12596704401416145, + "grad_norm": 3.953125, + "learning_rate": 9.615874212942673e-05, + "loss": 2.0235, + "step": 2882 + }, + { + "epoch": 0.12601075221819136, + "grad_norm": 2.390625, + "learning_rate": 9.615610151219323e-05, + "loss": 2.136, + "step": 2883 + }, + { + "epoch": 0.12605446042222124, + "grad_norm": 2.453125, + "learning_rate": 9.615346002392692e-05, + "loss": 2.1353, + "step": 2884 + }, + { + "epoch": 0.12609816862625114, + "grad_norm": 2.546875, + "learning_rate": 9.615081766467764e-05, + "loss": 2.4363, + "step": 2885 + }, + { + "epoch": 0.12614187683028105, + "grad_norm": 2.578125, + "learning_rate": 9.614817443449529e-05, + "loss": 1.9039, + "step": 2886 + }, + { + "epoch": 0.12618558503431093, + "grad_norm": 3.046875, + "learning_rate": 9.614553033342969e-05, + "loss": 2.5831, + "step": 2887 + }, + { + "epoch": 0.12622929323834084, + "grad_norm": 2.234375, + "learning_rate": 9.614288536153078e-05, + "loss": 2.0155, + "step": 2888 + }, + { + "epoch": 0.12627300144237072, + "grad_norm": 2.171875, + "learning_rate": 9.614023951884848e-05, + "loss": 1.7467, + "step": 2889 + }, + { + "epoch": 0.12631670964640063, + "grad_norm": 2.6875, + "learning_rate": 9.613759280543269e-05, + "loss": 1.8073, + "step": 2890 + }, + { + "epoch": 0.12636041785043053, + "grad_norm": 4.90625, + "learning_rate": 9.613494522133337e-05, + "loss": 3.2485, + "step": 2891 + }, + { + "epoch": 0.12640412605446041, + "grad_norm": 2.796875, + "learning_rate": 9.613229676660049e-05, + "loss": 2.328, + "step": 2892 + }, + { + "epoch": 0.12644783425849032, + "grad_norm": 2.0, + "learning_rate": 9.612964744128404e-05, + "loss": 1.8794, + "step": 2893 + }, + { + "epoch": 0.1264915424625202, + "grad_norm": 2.1875, + "learning_rate": 9.6126997245434e-05, + "loss": 2.0111, + "step": 2894 + }, + { + "epoch": 0.1265352506665501, + "grad_norm": 2.25, + "learning_rate": 9.612434617910038e-05, + "loss": 2.0805, + "step": 2895 + }, + { + "epoch": 0.12657895887058002, + "grad_norm": 2.375, + "learning_rate": 9.612169424233323e-05, + "loss": 2.1638, + "step": 2896 + }, + { + "epoch": 0.1266226670746099, + "grad_norm": 2.96875, + "learning_rate": 9.611904143518257e-05, + "loss": 2.546, + "step": 2897 + }, + { + "epoch": 0.1266663752786398, + "grad_norm": 3.78125, + "learning_rate": 9.611638775769848e-05, + "loss": 2.0348, + "step": 2898 + }, + { + "epoch": 0.1267100834826697, + "grad_norm": 2.203125, + "learning_rate": 9.611373320993104e-05, + "loss": 1.8323, + "step": 2899 + }, + { + "epoch": 0.1267537916866996, + "grad_norm": 2.328125, + "learning_rate": 9.611107779193033e-05, + "loss": 1.6366, + "step": 2900 + }, + { + "epoch": 0.1267974998907295, + "grad_norm": 2.328125, + "learning_rate": 9.610842150374647e-05, + "loss": 1.679, + "step": 2901 + }, + { + "epoch": 0.12684120809475938, + "grad_norm": 2.09375, + "learning_rate": 9.610576434542959e-05, + "loss": 1.9237, + "step": 2902 + }, + { + "epoch": 0.12688491629878929, + "grad_norm": 2.578125, + "learning_rate": 9.610310631702983e-05, + "loss": 2.8621, + "step": 2903 + }, + { + "epoch": 0.1269286245028192, + "grad_norm": 2.765625, + "learning_rate": 9.610044741859736e-05, + "loss": 2.1277, + "step": 2904 + }, + { + "epoch": 0.12697233270684907, + "grad_norm": 2.40625, + "learning_rate": 9.609778765018235e-05, + "loss": 1.9442, + "step": 2905 + }, + { + "epoch": 0.12701604091087898, + "grad_norm": 2.96875, + "learning_rate": 9.609512701183499e-05, + "loss": 2.3559, + "step": 2906 + }, + { + "epoch": 0.12705974911490886, + "grad_norm": 2.359375, + "learning_rate": 9.609246550360551e-05, + "loss": 2.4651, + "step": 2907 + }, + { + "epoch": 0.12710345731893877, + "grad_norm": 4.90625, + "learning_rate": 9.60898031255441e-05, + "loss": 2.1018, + "step": 2908 + }, + { + "epoch": 0.12714716552296867, + "grad_norm": 2.40625, + "learning_rate": 9.608713987770103e-05, + "loss": 1.988, + "step": 2909 + }, + { + "epoch": 0.12719087372699855, + "grad_norm": 2.375, + "learning_rate": 9.608447576012656e-05, + "loss": 2.4612, + "step": 2910 + }, + { + "epoch": 0.12723458193102846, + "grad_norm": 2.203125, + "learning_rate": 9.608181077287098e-05, + "loss": 2.0709, + "step": 2911 + }, + { + "epoch": 0.12727829013505834, + "grad_norm": 2.484375, + "learning_rate": 9.607914491598453e-05, + "loss": 1.7109, + "step": 2912 + }, + { + "epoch": 0.12732199833908825, + "grad_norm": 2.921875, + "learning_rate": 9.607647818951756e-05, + "loss": 2.0656, + "step": 2913 + }, + { + "epoch": 0.12736570654311816, + "grad_norm": 3.125, + "learning_rate": 9.607381059352038e-05, + "loss": 3.0216, + "step": 2914 + }, + { + "epoch": 0.12740941474714804, + "grad_norm": 2.25, + "learning_rate": 9.607114212804335e-05, + "loss": 1.9264, + "step": 2915 + }, + { + "epoch": 0.12745312295117794, + "grad_norm": 2.640625, + "learning_rate": 9.606847279313681e-05, + "loss": 2.1365, + "step": 2916 + }, + { + "epoch": 0.12749683115520782, + "grad_norm": 2.40625, + "learning_rate": 9.606580258885114e-05, + "loss": 1.7999, + "step": 2917 + }, + { + "epoch": 0.12754053935923773, + "grad_norm": 2.84375, + "learning_rate": 9.606313151523672e-05, + "loss": 1.9917, + "step": 2918 + }, + { + "epoch": 0.12758424756326764, + "grad_norm": 2.453125, + "learning_rate": 9.606045957234398e-05, + "loss": 2.5768, + "step": 2919 + }, + { + "epoch": 0.12762795576729752, + "grad_norm": 2.390625, + "learning_rate": 9.605778676022333e-05, + "loss": 1.913, + "step": 2920 + }, + { + "epoch": 0.12767166397132743, + "grad_norm": 2.46875, + "learning_rate": 9.605511307892519e-05, + "loss": 2.1415, + "step": 2921 + }, + { + "epoch": 0.1277153721753573, + "grad_norm": 2.40625, + "learning_rate": 9.605243852850006e-05, + "loss": 1.9214, + "step": 2922 + }, + { + "epoch": 0.1277590803793872, + "grad_norm": 2.953125, + "learning_rate": 9.604976310899837e-05, + "loss": 2.1233, + "step": 2923 + }, + { + "epoch": 0.12780278858341712, + "grad_norm": 2.1875, + "learning_rate": 9.604708682047064e-05, + "loss": 2.0758, + "step": 2924 + }, + { + "epoch": 0.127846496787447, + "grad_norm": 3.09375, + "learning_rate": 9.604440966296734e-05, + "loss": 2.0466, + "step": 2925 + }, + { + "epoch": 0.1278902049914769, + "grad_norm": 2.8125, + "learning_rate": 9.604173163653904e-05, + "loss": 2.3425, + "step": 2926 + }, + { + "epoch": 0.1279339131955068, + "grad_norm": 2.703125, + "learning_rate": 9.603905274123626e-05, + "loss": 2.4204, + "step": 2927 + }, + { + "epoch": 0.1279776213995367, + "grad_norm": 2.265625, + "learning_rate": 9.603637297710954e-05, + "loss": 2.0726, + "step": 2928 + }, + { + "epoch": 0.1280213296035666, + "grad_norm": 2.109375, + "learning_rate": 9.603369234420945e-05, + "loss": 1.755, + "step": 2929 + }, + { + "epoch": 0.12806503780759648, + "grad_norm": 2.890625, + "learning_rate": 9.603101084258658e-05, + "loss": 2.0132, + "step": 2930 + }, + { + "epoch": 0.1281087460116264, + "grad_norm": 2.5, + "learning_rate": 9.602832847229156e-05, + "loss": 1.9451, + "step": 2931 + }, + { + "epoch": 0.12815245421565627, + "grad_norm": 2.3125, + "learning_rate": 9.602564523337498e-05, + "loss": 1.7434, + "step": 2932 + }, + { + "epoch": 0.12819616241968618, + "grad_norm": 2.515625, + "learning_rate": 9.602296112588749e-05, + "loss": 2.1285, + "step": 2933 + }, + { + "epoch": 0.12823987062371608, + "grad_norm": 3.09375, + "learning_rate": 9.602027614987974e-05, + "loss": 2.3753, + "step": 2934 + }, + { + "epoch": 0.12828357882774596, + "grad_norm": 2.1875, + "learning_rate": 9.60175903054024e-05, + "loss": 1.763, + "step": 2935 + }, + { + "epoch": 0.12832728703177587, + "grad_norm": 2.125, + "learning_rate": 9.601490359250615e-05, + "loss": 1.7454, + "step": 2936 + }, + { + "epoch": 0.12837099523580575, + "grad_norm": 2.5625, + "learning_rate": 9.601221601124172e-05, + "loss": 1.8542, + "step": 2937 + }, + { + "epoch": 0.12841470343983566, + "grad_norm": 2.4375, + "learning_rate": 9.600952756165979e-05, + "loss": 2.1361, + "step": 2938 + }, + { + "epoch": 0.12845841164386557, + "grad_norm": 2.75, + "learning_rate": 9.600683824381112e-05, + "loss": 1.8827, + "step": 2939 + }, + { + "epoch": 0.12850211984789545, + "grad_norm": 4.40625, + "learning_rate": 9.600414805774643e-05, + "loss": 2.8128, + "step": 2940 + }, + { + "epoch": 0.12854582805192535, + "grad_norm": 2.1875, + "learning_rate": 9.600145700351652e-05, + "loss": 1.7991, + "step": 2941 + }, + { + "epoch": 0.12858953625595523, + "grad_norm": 2.34375, + "learning_rate": 9.599876508117219e-05, + "loss": 2.6316, + "step": 2942 + }, + { + "epoch": 0.12863324445998514, + "grad_norm": 2.46875, + "learning_rate": 9.599607229076418e-05, + "loss": 1.8649, + "step": 2943 + }, + { + "epoch": 0.12867695266401505, + "grad_norm": 2.5625, + "learning_rate": 9.599337863234335e-05, + "loss": 2.396, + "step": 2944 + }, + { + "epoch": 0.12872066086804493, + "grad_norm": 2.40625, + "learning_rate": 9.599068410596053e-05, + "loss": 2.2869, + "step": 2945 + }, + { + "epoch": 0.12876436907207484, + "grad_norm": 2.6875, + "learning_rate": 9.598798871166656e-05, + "loss": 2.4572, + "step": 2946 + }, + { + "epoch": 0.12880807727610472, + "grad_norm": 3.09375, + "learning_rate": 9.598529244951232e-05, + "loss": 2.1541, + "step": 2947 + }, + { + "epoch": 0.12885178548013462, + "grad_norm": 2.8125, + "learning_rate": 9.598259531954868e-05, + "loss": 2.0215, + "step": 2948 + }, + { + "epoch": 0.12889549368416453, + "grad_norm": 2.625, + "learning_rate": 9.597989732182654e-05, + "loss": 2.2977, + "step": 2949 + }, + { + "epoch": 0.1289392018881944, + "grad_norm": 2.421875, + "learning_rate": 9.597719845639682e-05, + "loss": 1.8592, + "step": 2950 + }, + { + "epoch": 0.12898291009222432, + "grad_norm": 2.375, + "learning_rate": 9.597449872331045e-05, + "loss": 2.1149, + "step": 2951 + }, + { + "epoch": 0.1290266182962542, + "grad_norm": 2.015625, + "learning_rate": 9.597179812261836e-05, + "loss": 1.9048, + "step": 2952 + }, + { + "epoch": 0.1290703265002841, + "grad_norm": 2.234375, + "learning_rate": 9.596909665437155e-05, + "loss": 2.1005, + "step": 2953 + }, + { + "epoch": 0.129114034704314, + "grad_norm": 2.78125, + "learning_rate": 9.596639431862098e-05, + "loss": 1.4811, + "step": 2954 + }, + { + "epoch": 0.1291577429083439, + "grad_norm": 2.203125, + "learning_rate": 9.596369111541764e-05, + "loss": 2.0181, + "step": 2955 + }, + { + "epoch": 0.1292014511123738, + "grad_norm": 2.53125, + "learning_rate": 9.596098704481255e-05, + "loss": 2.2414, + "step": 2956 + }, + { + "epoch": 0.12924515931640368, + "grad_norm": 3.171875, + "learning_rate": 9.595828210685675e-05, + "loss": 1.9929, + "step": 2957 + }, + { + "epoch": 0.1292888675204336, + "grad_norm": 2.375, + "learning_rate": 9.595557630160127e-05, + "loss": 2.1108, + "step": 2958 + }, + { + "epoch": 0.1293325757244635, + "grad_norm": 2.484375, + "learning_rate": 9.595286962909717e-05, + "loss": 1.8243, + "step": 2959 + }, + { + "epoch": 0.12937628392849337, + "grad_norm": 2.109375, + "learning_rate": 9.595016208939555e-05, + "loss": 2.0417, + "step": 2960 + }, + { + "epoch": 0.12941999213252328, + "grad_norm": 2.546875, + "learning_rate": 9.594745368254751e-05, + "loss": 2.0429, + "step": 2961 + }, + { + "epoch": 0.12946370033655316, + "grad_norm": 4.0, + "learning_rate": 9.594474440860412e-05, + "loss": 2.2023, + "step": 2962 + }, + { + "epoch": 0.12950740854058307, + "grad_norm": 3.453125, + "learning_rate": 9.594203426761656e-05, + "loss": 2.6341, + "step": 2963 + }, + { + "epoch": 0.12955111674461298, + "grad_norm": 2.390625, + "learning_rate": 9.593932325963593e-05, + "loss": 2.112, + "step": 2964 + }, + { + "epoch": 0.12959482494864286, + "grad_norm": 3.03125, + "learning_rate": 9.593661138471342e-05, + "loss": 1.9946, + "step": 2965 + }, + { + "epoch": 0.12963853315267276, + "grad_norm": 2.671875, + "learning_rate": 9.593389864290018e-05, + "loss": 2.502, + "step": 2966 + }, + { + "epoch": 0.12968224135670264, + "grad_norm": 3.4375, + "learning_rate": 9.593118503424743e-05, + "loss": 2.1195, + "step": 2967 + }, + { + "epoch": 0.12972594956073255, + "grad_norm": 2.15625, + "learning_rate": 9.592847055880636e-05, + "loss": 2.0636, + "step": 2968 + }, + { + "epoch": 0.12976965776476246, + "grad_norm": 2.734375, + "learning_rate": 9.592575521662821e-05, + "loss": 2.1656, + "step": 2969 + }, + { + "epoch": 0.12981336596879234, + "grad_norm": 2.546875, + "learning_rate": 9.592303900776422e-05, + "loss": 2.2107, + "step": 2970 + }, + { + "epoch": 0.12985707417282225, + "grad_norm": 3.25, + "learning_rate": 9.592032193226564e-05, + "loss": 2.9833, + "step": 2971 + }, + { + "epoch": 0.12990078237685213, + "grad_norm": 2.796875, + "learning_rate": 9.591760399018375e-05, + "loss": 2.4552, + "step": 2972 + }, + { + "epoch": 0.12994449058088203, + "grad_norm": 2.25, + "learning_rate": 9.591488518156985e-05, + "loss": 2.3015, + "step": 2973 + }, + { + "epoch": 0.12998819878491194, + "grad_norm": 2.5625, + "learning_rate": 9.591216550647524e-05, + "loss": 1.8688, + "step": 2974 + }, + { + "epoch": 0.13003190698894182, + "grad_norm": 2.328125, + "learning_rate": 9.590944496495124e-05, + "loss": 1.8787, + "step": 2975 + }, + { + "epoch": 0.13007561519297173, + "grad_norm": 2.515625, + "learning_rate": 9.59067235570492e-05, + "loss": 2.0342, + "step": 2976 + }, + { + "epoch": 0.1301193233970016, + "grad_norm": 2.703125, + "learning_rate": 9.590400128282047e-05, + "loss": 2.1873, + "step": 2977 + }, + { + "epoch": 0.13016303160103151, + "grad_norm": 5.46875, + "learning_rate": 9.590127814231642e-05, + "loss": 3.4453, + "step": 2978 + }, + { + "epoch": 0.13020673980506142, + "grad_norm": 2.125, + "learning_rate": 9.589855413558846e-05, + "loss": 1.825, + "step": 2979 + }, + { + "epoch": 0.1302504480090913, + "grad_norm": 2.234375, + "learning_rate": 9.589582926268798e-05, + "loss": 1.9386, + "step": 2980 + }, + { + "epoch": 0.1302941562131212, + "grad_norm": 2.484375, + "learning_rate": 9.589310352366639e-05, + "loss": 2.2409, + "step": 2981 + }, + { + "epoch": 0.1303378644171511, + "grad_norm": 2.875, + "learning_rate": 9.589037691857515e-05, + "loss": 2.7937, + "step": 2982 + }, + { + "epoch": 0.130381572621181, + "grad_norm": 2.609375, + "learning_rate": 9.588764944746571e-05, + "loss": 2.0267, + "step": 2983 + }, + { + "epoch": 0.1304252808252109, + "grad_norm": 2.609375, + "learning_rate": 9.588492111038953e-05, + "loss": 2.0755, + "step": 2984 + }, + { + "epoch": 0.13046898902924078, + "grad_norm": 2.875, + "learning_rate": 9.588219190739811e-05, + "loss": 2.3942, + "step": 2985 + }, + { + "epoch": 0.1305126972332707, + "grad_norm": 2.21875, + "learning_rate": 9.587946183854295e-05, + "loss": 1.7443, + "step": 2986 + }, + { + "epoch": 0.13055640543730057, + "grad_norm": 2.421875, + "learning_rate": 9.587673090387558e-05, + "loss": 1.7173, + "step": 2987 + }, + { + "epoch": 0.13060011364133048, + "grad_norm": 3.328125, + "learning_rate": 9.587399910344753e-05, + "loss": 2.3119, + "step": 2988 + }, + { + "epoch": 0.13064382184536039, + "grad_norm": 2.515625, + "learning_rate": 9.587126643731033e-05, + "loss": 1.8181, + "step": 2989 + }, + { + "epoch": 0.13068753004939027, + "grad_norm": 2.46875, + "learning_rate": 9.586853290551558e-05, + "loss": 2.0235, + "step": 2990 + }, + { + "epoch": 0.13073123825342017, + "grad_norm": 2.578125, + "learning_rate": 9.586579850811486e-05, + "loss": 1.7663, + "step": 2991 + }, + { + "epoch": 0.13077494645745005, + "grad_norm": 3.1875, + "learning_rate": 9.586306324515976e-05, + "loss": 2.0949, + "step": 2992 + }, + { + "epoch": 0.13081865466147996, + "grad_norm": 2.34375, + "learning_rate": 9.58603271167019e-05, + "loss": 2.186, + "step": 2993 + }, + { + "epoch": 0.13086236286550987, + "grad_norm": 2.453125, + "learning_rate": 9.585759012279294e-05, + "loss": 1.7462, + "step": 2994 + }, + { + "epoch": 0.13090607106953975, + "grad_norm": 3.984375, + "learning_rate": 9.58548522634845e-05, + "loss": 2.6174, + "step": 2995 + }, + { + "epoch": 0.13094977927356966, + "grad_norm": 2.453125, + "learning_rate": 9.585211353882826e-05, + "loss": 2.5182, + "step": 2996 + }, + { + "epoch": 0.13099348747759954, + "grad_norm": 2.421875, + "learning_rate": 9.58493739488759e-05, + "loss": 1.7231, + "step": 2997 + }, + { + "epoch": 0.13103719568162944, + "grad_norm": 2.71875, + "learning_rate": 9.584663349367912e-05, + "loss": 2.4883, + "step": 2998 + }, + { + "epoch": 0.13108090388565935, + "grad_norm": 2.453125, + "learning_rate": 9.584389217328966e-05, + "loss": 1.7723, + "step": 2999 + }, + { + "epoch": 0.13112461208968923, + "grad_norm": 2.15625, + "learning_rate": 9.584114998775921e-05, + "loss": 1.8153, + "step": 3000 + }, + { + "epoch": 0.13116832029371914, + "grad_norm": 2.578125, + "learning_rate": 9.583840693713954e-05, + "loss": 2.0988, + "step": 3001 + }, + { + "epoch": 0.13121202849774902, + "grad_norm": 3.03125, + "learning_rate": 9.583566302148244e-05, + "loss": 2.2539, + "step": 3002 + }, + { + "epoch": 0.13125573670177892, + "grad_norm": 2.578125, + "learning_rate": 9.583291824083965e-05, + "loss": 1.6684, + "step": 3003 + }, + { + "epoch": 0.13129944490580883, + "grad_norm": 2.96875, + "learning_rate": 9.583017259526299e-05, + "loss": 2.04, + "step": 3004 + }, + { + "epoch": 0.1313431531098387, + "grad_norm": 2.453125, + "learning_rate": 9.582742608480428e-05, + "loss": 2.2517, + "step": 3005 + }, + { + "epoch": 0.13138686131386862, + "grad_norm": 2.84375, + "learning_rate": 9.582467870951533e-05, + "loss": 1.9857, + "step": 3006 + }, + { + "epoch": 0.1314305695178985, + "grad_norm": 2.390625, + "learning_rate": 9.5821930469448e-05, + "loss": 2.4797, + "step": 3007 + }, + { + "epoch": 0.1314742777219284, + "grad_norm": 2.640625, + "learning_rate": 9.581918136465416e-05, + "loss": 2.4298, + "step": 3008 + }, + { + "epoch": 0.13151798592595831, + "grad_norm": 2.375, + "learning_rate": 9.581643139518565e-05, + "loss": 1.7604, + "step": 3009 + }, + { + "epoch": 0.1315616941299882, + "grad_norm": 2.6875, + "learning_rate": 9.581368056109443e-05, + "loss": 2.3003, + "step": 3010 + }, + { + "epoch": 0.1316054023340181, + "grad_norm": 2.609375, + "learning_rate": 9.581092886243237e-05, + "loss": 1.9386, + "step": 3011 + }, + { + "epoch": 0.13164911053804798, + "grad_norm": 4.4375, + "learning_rate": 9.58081762992514e-05, + "loss": 2.2748, + "step": 3012 + }, + { + "epoch": 0.1316928187420779, + "grad_norm": 3.59375, + "learning_rate": 9.580542287160348e-05, + "loss": 2.9969, + "step": 3013 + }, + { + "epoch": 0.1317365269461078, + "grad_norm": 2.625, + "learning_rate": 9.580266857954057e-05, + "loss": 2.2068, + "step": 3014 + }, + { + "epoch": 0.13178023515013768, + "grad_norm": 2.796875, + "learning_rate": 9.579991342311463e-05, + "loss": 1.8651, + "step": 3015 + }, + { + "epoch": 0.13182394335416758, + "grad_norm": 2.4375, + "learning_rate": 9.579715740237766e-05, + "loss": 1.7948, + "step": 3016 + }, + { + "epoch": 0.13186765155819746, + "grad_norm": 2.265625, + "learning_rate": 9.579440051738168e-05, + "loss": 2.0184, + "step": 3017 + }, + { + "epoch": 0.13191135976222737, + "grad_norm": 2.59375, + "learning_rate": 9.579164276817873e-05, + "loss": 2.3655, + "step": 3018 + }, + { + "epoch": 0.13195506796625728, + "grad_norm": 3.5625, + "learning_rate": 9.578888415482082e-05, + "loss": 1.9905, + "step": 3019 + }, + { + "epoch": 0.13199877617028716, + "grad_norm": 3.65625, + "learning_rate": 9.578612467736004e-05, + "loss": 2.846, + "step": 3020 + }, + { + "epoch": 0.13204248437431707, + "grad_norm": 2.609375, + "learning_rate": 9.578336433584842e-05, + "loss": 2.0329, + "step": 3021 + }, + { + "epoch": 0.13208619257834694, + "grad_norm": 2.296875, + "learning_rate": 9.57806031303381e-05, + "loss": 2.0556, + "step": 3022 + }, + { + "epoch": 0.13212990078237685, + "grad_norm": 2.3125, + "learning_rate": 9.577784106088115e-05, + "loss": 2.2177, + "step": 3023 + }, + { + "epoch": 0.13217360898640676, + "grad_norm": 2.078125, + "learning_rate": 9.577507812752972e-05, + "loss": 1.7264, + "step": 3024 + }, + { + "epoch": 0.13221731719043664, + "grad_norm": 2.3125, + "learning_rate": 9.577231433033596e-05, + "loss": 1.7551, + "step": 3025 + }, + { + "epoch": 0.13226102539446655, + "grad_norm": 2.21875, + "learning_rate": 9.576954966935198e-05, + "loss": 1.812, + "step": 3026 + }, + { + "epoch": 0.13230473359849643, + "grad_norm": 3.578125, + "learning_rate": 9.576678414463001e-05, + "loss": 3.2237, + "step": 3027 + }, + { + "epoch": 0.13234844180252633, + "grad_norm": 2.203125, + "learning_rate": 9.576401775622222e-05, + "loss": 1.7513, + "step": 3028 + }, + { + "epoch": 0.13239215000655624, + "grad_norm": 2.984375, + "learning_rate": 9.576125050418077e-05, + "loss": 2.0373, + "step": 3029 + }, + { + "epoch": 0.13243585821058612, + "grad_norm": 2.578125, + "learning_rate": 9.575848238855796e-05, + "loss": 2.1212, + "step": 3030 + }, + { + "epoch": 0.13247956641461603, + "grad_norm": 3.765625, + "learning_rate": 9.575571340940597e-05, + "loss": 1.8877, + "step": 3031 + }, + { + "epoch": 0.1325232746186459, + "grad_norm": 3.09375, + "learning_rate": 9.575294356677707e-05, + "loss": 2.2786, + "step": 3032 + }, + { + "epoch": 0.13256698282267582, + "grad_norm": 2.890625, + "learning_rate": 9.575017286072355e-05, + "loss": 1.922, + "step": 3033 + }, + { + "epoch": 0.13261069102670572, + "grad_norm": 2.609375, + "learning_rate": 9.574740129129767e-05, + "loss": 2.1338, + "step": 3034 + }, + { + "epoch": 0.1326543992307356, + "grad_norm": 2.890625, + "learning_rate": 9.574462885855174e-05, + "loss": 2.4, + "step": 3035 + }, + { + "epoch": 0.1326981074347655, + "grad_norm": 3.859375, + "learning_rate": 9.57418555625381e-05, + "loss": 2.177, + "step": 3036 + }, + { + "epoch": 0.1327418156387954, + "grad_norm": 2.8125, + "learning_rate": 9.573908140330905e-05, + "loss": 2.1712, + "step": 3037 + }, + { + "epoch": 0.1327855238428253, + "grad_norm": 2.703125, + "learning_rate": 9.573630638091698e-05, + "loss": 1.9705, + "step": 3038 + }, + { + "epoch": 0.1328292320468552, + "grad_norm": 2.21875, + "learning_rate": 9.573353049541425e-05, + "loss": 1.9445, + "step": 3039 + }, + { + "epoch": 0.13287294025088509, + "grad_norm": 2.03125, + "learning_rate": 9.573075374685323e-05, + "loss": 2.0247, + "step": 3040 + }, + { + "epoch": 0.132916648454915, + "grad_norm": 4.28125, + "learning_rate": 9.572797613528633e-05, + "loss": 1.9356, + "step": 3041 + }, + { + "epoch": 0.13296035665894487, + "grad_norm": 3.109375, + "learning_rate": 9.572519766076595e-05, + "loss": 2.2053, + "step": 3042 + }, + { + "epoch": 0.13300406486297478, + "grad_norm": 2.921875, + "learning_rate": 9.572241832334457e-05, + "loss": 2.8439, + "step": 3043 + }, + { + "epoch": 0.1330477730670047, + "grad_norm": 2.703125, + "learning_rate": 9.571963812307459e-05, + "loss": 2.3801, + "step": 3044 + }, + { + "epoch": 0.13309148127103457, + "grad_norm": 2.53125, + "learning_rate": 9.57168570600085e-05, + "loss": 2.0442, + "step": 3045 + }, + { + "epoch": 0.13313518947506447, + "grad_norm": 2.1875, + "learning_rate": 9.571407513419877e-05, + "loss": 1.8136, + "step": 3046 + }, + { + "epoch": 0.13317889767909435, + "grad_norm": 2.6875, + "learning_rate": 9.571129234569792e-05, + "loss": 2.2411, + "step": 3047 + }, + { + "epoch": 0.13322260588312426, + "grad_norm": 2.5625, + "learning_rate": 9.570850869455845e-05, + "loss": 2.2694, + "step": 3048 + }, + { + "epoch": 0.13326631408715417, + "grad_norm": 2.484375, + "learning_rate": 9.57057241808329e-05, + "loss": 1.9991, + "step": 3049 + }, + { + "epoch": 0.13331002229118405, + "grad_norm": 3.265625, + "learning_rate": 9.570293880457382e-05, + "loss": 2.7739, + "step": 3050 + }, + { + "epoch": 0.13335373049521396, + "grad_norm": 2.875, + "learning_rate": 9.570015256583375e-05, + "loss": 2.8135, + "step": 3051 + }, + { + "epoch": 0.13339743869924384, + "grad_norm": 2.5, + "learning_rate": 9.56973654646653e-05, + "loss": 1.9088, + "step": 3052 + }, + { + "epoch": 0.13344114690327374, + "grad_norm": 2.53125, + "learning_rate": 9.569457750112106e-05, + "loss": 2.6223, + "step": 3053 + }, + { + "epoch": 0.13348485510730365, + "grad_norm": 2.515625, + "learning_rate": 9.569178867525362e-05, + "loss": 2.0422, + "step": 3054 + }, + { + "epoch": 0.13352856331133353, + "grad_norm": 2.3125, + "learning_rate": 9.568899898711563e-05, + "loss": 2.0285, + "step": 3055 + }, + { + "epoch": 0.13357227151536344, + "grad_norm": 2.265625, + "learning_rate": 9.568620843675975e-05, + "loss": 2.091, + "step": 3056 + }, + { + "epoch": 0.13361597971939332, + "grad_norm": 2.359375, + "learning_rate": 9.56834170242386e-05, + "loss": 2.0307, + "step": 3057 + }, + { + "epoch": 0.13365968792342323, + "grad_norm": 2.453125, + "learning_rate": 9.568062474960489e-05, + "loss": 2.0899, + "step": 3058 + }, + { + "epoch": 0.13370339612745313, + "grad_norm": 2.625, + "learning_rate": 9.56778316129113e-05, + "loss": 1.8432, + "step": 3059 + }, + { + "epoch": 0.133747104331483, + "grad_norm": 3.671875, + "learning_rate": 9.567503761421057e-05, + "loss": 1.4322, + "step": 3060 + }, + { + "epoch": 0.13379081253551292, + "grad_norm": 2.28125, + "learning_rate": 9.567224275355539e-05, + "loss": 2.1823, + "step": 3061 + }, + { + "epoch": 0.1338345207395428, + "grad_norm": 2.328125, + "learning_rate": 9.566944703099852e-05, + "loss": 1.718, + "step": 3062 + }, + { + "epoch": 0.1338782289435727, + "grad_norm": 2.671875, + "learning_rate": 9.56666504465927e-05, + "loss": 2.4914, + "step": 3063 + }, + { + "epoch": 0.13392193714760262, + "grad_norm": 2.640625, + "learning_rate": 9.566385300039074e-05, + "loss": 2.0368, + "step": 3064 + }, + { + "epoch": 0.1339656453516325, + "grad_norm": 2.421875, + "learning_rate": 9.56610546924454e-05, + "loss": 1.9711, + "step": 3065 + }, + { + "epoch": 0.1340093535556624, + "grad_norm": 2.59375, + "learning_rate": 9.56582555228095e-05, + "loss": 2.2761, + "step": 3066 + }, + { + "epoch": 0.13405306175969228, + "grad_norm": 2.3125, + "learning_rate": 9.565545549153588e-05, + "loss": 1.7822, + "step": 3067 + }, + { + "epoch": 0.1340967699637222, + "grad_norm": 2.34375, + "learning_rate": 9.565265459867736e-05, + "loss": 2.0829, + "step": 3068 + }, + { + "epoch": 0.1341404781677521, + "grad_norm": 2.15625, + "learning_rate": 9.564985284428679e-05, + "loss": 1.8644, + "step": 3069 + }, + { + "epoch": 0.13418418637178198, + "grad_norm": 2.1875, + "learning_rate": 9.564705022841706e-05, + "loss": 1.8726, + "step": 3070 + }, + { + "epoch": 0.13422789457581188, + "grad_norm": 2.5625, + "learning_rate": 9.564424675112106e-05, + "loss": 2.0443, + "step": 3071 + }, + { + "epoch": 0.13427160277984176, + "grad_norm": 2.3125, + "learning_rate": 9.564144241245173e-05, + "loss": 2.0608, + "step": 3072 + }, + { + "epoch": 0.13431531098387167, + "grad_norm": 2.921875, + "learning_rate": 9.563863721246191e-05, + "loss": 1.9922, + "step": 3073 + }, + { + "epoch": 0.13435901918790158, + "grad_norm": 2.4375, + "learning_rate": 9.563583115120458e-05, + "loss": 2.3749, + "step": 3074 + }, + { + "epoch": 0.13440272739193146, + "grad_norm": 2.46875, + "learning_rate": 9.563302422873272e-05, + "loss": 1.9774, + "step": 3075 + }, + { + "epoch": 0.13444643559596137, + "grad_norm": 3.96875, + "learning_rate": 9.563021644509926e-05, + "loss": 2.1169, + "step": 3076 + }, + { + "epoch": 0.13449014379999125, + "grad_norm": 2.875, + "learning_rate": 9.562740780035721e-05, + "loss": 2.4444, + "step": 3077 + }, + { + "epoch": 0.13453385200402115, + "grad_norm": 2.453125, + "learning_rate": 9.562459829455957e-05, + "loss": 1.5088, + "step": 3078 + }, + { + "epoch": 0.13457756020805106, + "grad_norm": 2.84375, + "learning_rate": 9.562178792775936e-05, + "loss": 2.3153, + "step": 3079 + }, + { + "epoch": 0.13462126841208094, + "grad_norm": 2.53125, + "learning_rate": 9.561897670000958e-05, + "loss": 2.1347, + "step": 3080 + }, + { + "epoch": 0.13466497661611085, + "grad_norm": 2.46875, + "learning_rate": 9.561616461136336e-05, + "loss": 2.0139, + "step": 3081 + }, + { + "epoch": 0.13470868482014073, + "grad_norm": 2.375, + "learning_rate": 9.56133516618737e-05, + "loss": 1.9502, + "step": 3082 + }, + { + "epoch": 0.13475239302417064, + "grad_norm": 2.03125, + "learning_rate": 9.561053785159371e-05, + "loss": 1.7509, + "step": 3083 + }, + { + "epoch": 0.13479610122820054, + "grad_norm": 2.875, + "learning_rate": 9.56077231805765e-05, + "loss": 2.5232, + "step": 3084 + }, + { + "epoch": 0.13483980943223042, + "grad_norm": 3.359375, + "learning_rate": 9.560490764887516e-05, + "loss": 2.7191, + "step": 3085 + }, + { + "epoch": 0.13488351763626033, + "grad_norm": 2.15625, + "learning_rate": 9.560209125654282e-05, + "loss": 1.7089, + "step": 3086 + }, + { + "epoch": 0.1349272258402902, + "grad_norm": 3.140625, + "learning_rate": 9.559927400363268e-05, + "loss": 1.6867, + "step": 3087 + }, + { + "epoch": 0.13497093404432012, + "grad_norm": 2.875, + "learning_rate": 9.559645589019785e-05, + "loss": 1.9889, + "step": 3088 + }, + { + "epoch": 0.13501464224835003, + "grad_norm": 2.5625, + "learning_rate": 9.559363691629155e-05, + "loss": 1.6799, + "step": 3089 + }, + { + "epoch": 0.1350583504523799, + "grad_norm": 2.515625, + "learning_rate": 9.559081708196696e-05, + "loss": 2.2714, + "step": 3090 + }, + { + "epoch": 0.1351020586564098, + "grad_norm": 5.21875, + "learning_rate": 9.55879963872773e-05, + "loss": 2.3897, + "step": 3091 + }, + { + "epoch": 0.1351457668604397, + "grad_norm": 2.328125, + "learning_rate": 9.558517483227579e-05, + "loss": 2.3887, + "step": 3092 + }, + { + "epoch": 0.1351894750644696, + "grad_norm": 2.15625, + "learning_rate": 9.55823524170157e-05, + "loss": 1.8025, + "step": 3093 + }, + { + "epoch": 0.1352331832684995, + "grad_norm": 3.40625, + "learning_rate": 9.557952914155027e-05, + "loss": 2.3162, + "step": 3094 + }, + { + "epoch": 0.1352768914725294, + "grad_norm": 2.21875, + "learning_rate": 9.557670500593276e-05, + "loss": 1.6999, + "step": 3095 + }, + { + "epoch": 0.1353205996765593, + "grad_norm": 2.59375, + "learning_rate": 9.557388001021653e-05, + "loss": 2.1748, + "step": 3096 + }, + { + "epoch": 0.13536430788058917, + "grad_norm": 2.484375, + "learning_rate": 9.557105415445484e-05, + "loss": 2.0518, + "step": 3097 + }, + { + "epoch": 0.13540801608461908, + "grad_norm": 2.609375, + "learning_rate": 9.556822743870104e-05, + "loss": 2.3651, + "step": 3098 + }, + { + "epoch": 0.135451724288649, + "grad_norm": 2.46875, + "learning_rate": 9.556539986300845e-05, + "loss": 1.8946, + "step": 3099 + }, + { + "epoch": 0.13549543249267887, + "grad_norm": 2.78125, + "learning_rate": 9.556257142743046e-05, + "loss": 2.0344, + "step": 3100 + }, + { + "epoch": 0.13553914069670878, + "grad_norm": 2.78125, + "learning_rate": 9.555974213202044e-05, + "loss": 2.6906, + "step": 3101 + }, + { + "epoch": 0.13558284890073866, + "grad_norm": 2.203125, + "learning_rate": 9.555691197683177e-05, + "loss": 2.0669, + "step": 3102 + }, + { + "epoch": 0.13562655710476856, + "grad_norm": 2.109375, + "learning_rate": 9.555408096191786e-05, + "loss": 1.9788, + "step": 3103 + }, + { + "epoch": 0.13567026530879847, + "grad_norm": 2.203125, + "learning_rate": 9.555124908733215e-05, + "loss": 1.8438, + "step": 3104 + }, + { + "epoch": 0.13571397351282835, + "grad_norm": 2.140625, + "learning_rate": 9.554841635312805e-05, + "loss": 2.1955, + "step": 3105 + }, + { + "epoch": 0.13575768171685826, + "grad_norm": 2.640625, + "learning_rate": 9.554558275935907e-05, + "loss": 1.9396, + "step": 3106 + }, + { + "epoch": 0.13580138992088814, + "grad_norm": 2.1875, + "learning_rate": 9.554274830607866e-05, + "loss": 1.7357, + "step": 3107 + }, + { + "epoch": 0.13584509812491805, + "grad_norm": 2.859375, + "learning_rate": 9.553991299334028e-05, + "loss": 1.8541, + "step": 3108 + }, + { + "epoch": 0.13588880632894795, + "grad_norm": 2.3125, + "learning_rate": 9.553707682119746e-05, + "loss": 1.9001, + "step": 3109 + }, + { + "epoch": 0.13593251453297783, + "grad_norm": 2.65625, + "learning_rate": 9.553423978970376e-05, + "loss": 2.2973, + "step": 3110 + }, + { + "epoch": 0.13597622273700774, + "grad_norm": 2.78125, + "learning_rate": 9.553140189891266e-05, + "loss": 2.4227, + "step": 3111 + }, + { + "epoch": 0.13601993094103762, + "grad_norm": 2.40625, + "learning_rate": 9.552856314887772e-05, + "loss": 1.8651, + "step": 3112 + }, + { + "epoch": 0.13606363914506753, + "grad_norm": 3.140625, + "learning_rate": 9.552572353965254e-05, + "loss": 1.8486, + "step": 3113 + }, + { + "epoch": 0.13610734734909744, + "grad_norm": 2.6875, + "learning_rate": 9.552288307129072e-05, + "loss": 2.4455, + "step": 3114 + }, + { + "epoch": 0.13615105555312731, + "grad_norm": 2.59375, + "learning_rate": 9.552004174384583e-05, + "loss": 2.3039, + "step": 3115 + }, + { + "epoch": 0.13619476375715722, + "grad_norm": 2.5, + "learning_rate": 9.551719955737148e-05, + "loss": 2.0296, + "step": 3116 + }, + { + "epoch": 0.1362384719611871, + "grad_norm": 2.4375, + "learning_rate": 9.551435651192135e-05, + "loss": 1.6363, + "step": 3117 + }, + { + "epoch": 0.136282180165217, + "grad_norm": 3.703125, + "learning_rate": 9.551151260754907e-05, + "loss": 1.8366, + "step": 3118 + }, + { + "epoch": 0.13632588836924692, + "grad_norm": 2.9375, + "learning_rate": 9.550866784430829e-05, + "loss": 1.8424, + "step": 3119 + }, + { + "epoch": 0.1363695965732768, + "grad_norm": 2.921875, + "learning_rate": 9.550582222225273e-05, + "loss": 2.4484, + "step": 3120 + }, + { + "epoch": 0.1364133047773067, + "grad_norm": 2.609375, + "learning_rate": 9.550297574143608e-05, + "loss": 2.1218, + "step": 3121 + }, + { + "epoch": 0.13645701298133658, + "grad_norm": 2.265625, + "learning_rate": 9.550012840191203e-05, + "loss": 1.9442, + "step": 3122 + }, + { + "epoch": 0.1365007211853665, + "grad_norm": 2.40625, + "learning_rate": 9.549728020373434e-05, + "loss": 2.4977, + "step": 3123 + }, + { + "epoch": 0.1365444293893964, + "grad_norm": 2.484375, + "learning_rate": 9.549443114695676e-05, + "loss": 1.6271, + "step": 3124 + }, + { + "epoch": 0.13658813759342628, + "grad_norm": 2.203125, + "learning_rate": 9.549158123163305e-05, + "loss": 1.5913, + "step": 3125 + }, + { + "epoch": 0.1366318457974562, + "grad_norm": 3.078125, + "learning_rate": 9.5488730457817e-05, + "loss": 2.578, + "step": 3126 + }, + { + "epoch": 0.13667555400148607, + "grad_norm": 3.15625, + "learning_rate": 9.54858788255624e-05, + "loss": 3.1502, + "step": 3127 + }, + { + "epoch": 0.13671926220551597, + "grad_norm": 2.703125, + "learning_rate": 9.548302633492306e-05, + "loss": 1.9293, + "step": 3128 + }, + { + "epoch": 0.13676297040954588, + "grad_norm": 2.359375, + "learning_rate": 9.548017298595279e-05, + "loss": 1.9183, + "step": 3129 + }, + { + "epoch": 0.13680667861357576, + "grad_norm": 2.34375, + "learning_rate": 9.54773187787055e-05, + "loss": 2.0055, + "step": 3130 + }, + { + "epoch": 0.13685038681760567, + "grad_norm": 2.71875, + "learning_rate": 9.547446371323501e-05, + "loss": 2.2359, + "step": 3131 + }, + { + "epoch": 0.13689409502163555, + "grad_norm": 2.796875, + "learning_rate": 9.547160778959519e-05, + "loss": 2.2192, + "step": 3132 + }, + { + "epoch": 0.13693780322566546, + "grad_norm": 2.625, + "learning_rate": 9.546875100783996e-05, + "loss": 2.0813, + "step": 3133 + }, + { + "epoch": 0.13698151142969536, + "grad_norm": 3.609375, + "learning_rate": 9.54658933680232e-05, + "loss": 2.0326, + "step": 3134 + }, + { + "epoch": 0.13702521963372524, + "grad_norm": 2.421875, + "learning_rate": 9.546303487019888e-05, + "loss": 2.1513, + "step": 3135 + }, + { + "epoch": 0.13706892783775515, + "grad_norm": 3.421875, + "learning_rate": 9.546017551442092e-05, + "loss": 2.4596, + "step": 3136 + }, + { + "epoch": 0.13711263604178503, + "grad_norm": 3.5625, + "learning_rate": 9.545731530074328e-05, + "loss": 2.2997, + "step": 3137 + }, + { + "epoch": 0.13715634424581494, + "grad_norm": 2.625, + "learning_rate": 9.545445422921996e-05, + "loss": 2.3707, + "step": 3138 + }, + { + "epoch": 0.13720005244984484, + "grad_norm": 3.6875, + "learning_rate": 9.545159229990493e-05, + "loss": 2.7255, + "step": 3139 + }, + { + "epoch": 0.13724376065387472, + "grad_norm": 3.90625, + "learning_rate": 9.544872951285217e-05, + "loss": 3.7178, + "step": 3140 + }, + { + "epoch": 0.13728746885790463, + "grad_norm": 3.21875, + "learning_rate": 9.544586586811576e-05, + "loss": 1.2338, + "step": 3141 + }, + { + "epoch": 0.1373311770619345, + "grad_norm": 2.8125, + "learning_rate": 9.544300136574973e-05, + "loss": 2.3403, + "step": 3142 + }, + { + "epoch": 0.13737488526596442, + "grad_norm": 2.515625, + "learning_rate": 9.54401360058081e-05, + "loss": 2.3907, + "step": 3143 + }, + { + "epoch": 0.13741859346999433, + "grad_norm": 2.234375, + "learning_rate": 9.543726978834497e-05, + "loss": 2.0025, + "step": 3144 + }, + { + "epoch": 0.1374623016740242, + "grad_norm": 2.546875, + "learning_rate": 9.543440271341444e-05, + "loss": 1.6868, + "step": 3145 + }, + { + "epoch": 0.13750600987805411, + "grad_norm": 2.296875, + "learning_rate": 9.543153478107061e-05, + "loss": 1.867, + "step": 3146 + }, + { + "epoch": 0.137549718082084, + "grad_norm": 2.609375, + "learning_rate": 9.542866599136759e-05, + "loss": 2.2322, + "step": 3147 + }, + { + "epoch": 0.1375934262861139, + "grad_norm": 2.203125, + "learning_rate": 9.54257963443595e-05, + "loss": 2.1401, + "step": 3148 + }, + { + "epoch": 0.1376371344901438, + "grad_norm": 2.40625, + "learning_rate": 9.542292584010056e-05, + "loss": 1.5096, + "step": 3149 + }, + { + "epoch": 0.1376808426941737, + "grad_norm": 3.359375, + "learning_rate": 9.542005447864488e-05, + "loss": 2.3567, + "step": 3150 + }, + { + "epoch": 0.1377245508982036, + "grad_norm": 2.484375, + "learning_rate": 9.541718226004665e-05, + "loss": 2.2019, + "step": 3151 + }, + { + "epoch": 0.13776825910223348, + "grad_norm": 2.71875, + "learning_rate": 9.541430918436011e-05, + "loss": 1.9213, + "step": 3152 + }, + { + "epoch": 0.13781196730626338, + "grad_norm": 2.40625, + "learning_rate": 9.541143525163946e-05, + "loss": 1.7277, + "step": 3153 + }, + { + "epoch": 0.1378556755102933, + "grad_norm": 2.390625, + "learning_rate": 9.540856046193894e-05, + "loss": 2.1676, + "step": 3154 + }, + { + "epoch": 0.13789938371432317, + "grad_norm": 2.21875, + "learning_rate": 9.540568481531277e-05, + "loss": 1.7101, + "step": 3155 + }, + { + "epoch": 0.13794309191835308, + "grad_norm": 2.125, + "learning_rate": 9.540280831181525e-05, + "loss": 1.8688, + "step": 3156 + }, + { + "epoch": 0.13798680012238296, + "grad_norm": 2.859375, + "learning_rate": 9.539993095150066e-05, + "loss": 1.7798, + "step": 3157 + }, + { + "epoch": 0.13803050832641287, + "grad_norm": 3.765625, + "learning_rate": 9.53970527344233e-05, + "loss": 2.4279, + "step": 3158 + }, + { + "epoch": 0.13807421653044277, + "grad_norm": 2.203125, + "learning_rate": 9.539417366063748e-05, + "loss": 2.1791, + "step": 3159 + }, + { + "epoch": 0.13811792473447265, + "grad_norm": 2.359375, + "learning_rate": 9.539129373019754e-05, + "loss": 1.9027, + "step": 3160 + }, + { + "epoch": 0.13816163293850256, + "grad_norm": 2.078125, + "learning_rate": 9.53884129431578e-05, + "loss": 1.9052, + "step": 3161 + }, + { + "epoch": 0.13820534114253244, + "grad_norm": 2.296875, + "learning_rate": 9.538553129957268e-05, + "loss": 2.1048, + "step": 3162 + }, + { + "epoch": 0.13824904934656235, + "grad_norm": 2.296875, + "learning_rate": 9.538264879949652e-05, + "loss": 1.6879, + "step": 3163 + }, + { + "epoch": 0.13829275755059225, + "grad_norm": 2.6875, + "learning_rate": 9.537976544298373e-05, + "loss": 2.0213, + "step": 3164 + }, + { + "epoch": 0.13833646575462213, + "grad_norm": 2.328125, + "learning_rate": 9.53768812300887e-05, + "loss": 2.1036, + "step": 3165 + }, + { + "epoch": 0.13838017395865204, + "grad_norm": 2.25, + "learning_rate": 9.537399616086588e-05, + "loss": 2.0177, + "step": 3166 + }, + { + "epoch": 0.13842388216268192, + "grad_norm": 2.265625, + "learning_rate": 9.537111023536973e-05, + "loss": 1.8663, + "step": 3167 + }, + { + "epoch": 0.13846759036671183, + "grad_norm": 2.34375, + "learning_rate": 9.53682234536547e-05, + "loss": 1.6764, + "step": 3168 + }, + { + "epoch": 0.13851129857074174, + "grad_norm": 2.453125, + "learning_rate": 9.536533581577525e-05, + "loss": 2.2184, + "step": 3169 + }, + { + "epoch": 0.13855500677477162, + "grad_norm": 2.625, + "learning_rate": 9.536244732178588e-05, + "loss": 1.9081, + "step": 3170 + }, + { + "epoch": 0.13859871497880152, + "grad_norm": 2.328125, + "learning_rate": 9.535955797174112e-05, + "loss": 2.1123, + "step": 3171 + }, + { + "epoch": 0.13864242318283143, + "grad_norm": 2.546875, + "learning_rate": 9.535666776569547e-05, + "loss": 2.1118, + "step": 3172 + }, + { + "epoch": 0.1386861313868613, + "grad_norm": 2.453125, + "learning_rate": 9.53537767037035e-05, + "loss": 2.053, + "step": 3173 + }, + { + "epoch": 0.13872983959089122, + "grad_norm": 2.78125, + "learning_rate": 9.535088478581975e-05, + "loss": 2.8949, + "step": 3174 + }, + { + "epoch": 0.1387735477949211, + "grad_norm": 2.328125, + "learning_rate": 9.53479920120988e-05, + "loss": 1.5815, + "step": 3175 + }, + { + "epoch": 0.138817255998951, + "grad_norm": 2.296875, + "learning_rate": 9.534509838259523e-05, + "loss": 2.0353, + "step": 3176 + }, + { + "epoch": 0.1388609642029809, + "grad_norm": 2.796875, + "learning_rate": 9.534220389736367e-05, + "loss": 1.7666, + "step": 3177 + }, + { + "epoch": 0.1389046724070108, + "grad_norm": 3.046875, + "learning_rate": 9.533930855645872e-05, + "loss": 3.2732, + "step": 3178 + }, + { + "epoch": 0.1389483806110407, + "grad_norm": 2.265625, + "learning_rate": 9.533641235993504e-05, + "loss": 2.1203, + "step": 3179 + }, + { + "epoch": 0.13899208881507058, + "grad_norm": 3.265625, + "learning_rate": 9.533351530784726e-05, + "loss": 2.1847, + "step": 3180 + }, + { + "epoch": 0.1390357970191005, + "grad_norm": 11.3125, + "learning_rate": 9.533061740025008e-05, + "loss": 3.1752, + "step": 3181 + }, + { + "epoch": 0.1390795052231304, + "grad_norm": 2.34375, + "learning_rate": 9.532771863719816e-05, + "loss": 2.4082, + "step": 3182 + }, + { + "epoch": 0.13912321342716027, + "grad_norm": 2.125, + "learning_rate": 9.532481901874624e-05, + "loss": 2.0876, + "step": 3183 + }, + { + "epoch": 0.13916692163119018, + "grad_norm": 2.203125, + "learning_rate": 9.532191854494901e-05, + "loss": 2.1728, + "step": 3184 + }, + { + "epoch": 0.13921062983522006, + "grad_norm": 3.0, + "learning_rate": 9.531901721586121e-05, + "loss": 2.6058, + "step": 3185 + }, + { + "epoch": 0.13925433803924997, + "grad_norm": 2.28125, + "learning_rate": 9.531611503153759e-05, + "loss": 1.9939, + "step": 3186 + }, + { + "epoch": 0.13929804624327988, + "grad_norm": 2.75, + "learning_rate": 9.531321199203292e-05, + "loss": 2.3874, + "step": 3187 + }, + { + "epoch": 0.13934175444730976, + "grad_norm": 2.234375, + "learning_rate": 9.5310308097402e-05, + "loss": 1.8303, + "step": 3188 + }, + { + "epoch": 0.13938546265133966, + "grad_norm": 2.234375, + "learning_rate": 9.530740334769963e-05, + "loss": 2.1354, + "step": 3189 + }, + { + "epoch": 0.13942917085536954, + "grad_norm": 2.90625, + "learning_rate": 9.53044977429806e-05, + "loss": 2.4122, + "step": 3190 + }, + { + "epoch": 0.13947287905939945, + "grad_norm": 2.484375, + "learning_rate": 9.530159128329976e-05, + "loss": 1.836, + "step": 3191 + }, + { + "epoch": 0.13951658726342936, + "grad_norm": 2.5625, + "learning_rate": 9.529868396871197e-05, + "loss": 1.8899, + "step": 3192 + }, + { + "epoch": 0.13956029546745924, + "grad_norm": 2.734375, + "learning_rate": 9.529577579927209e-05, + "loss": 2.129, + "step": 3193 + }, + { + "epoch": 0.13960400367148915, + "grad_norm": 2.890625, + "learning_rate": 9.529286677503499e-05, + "loss": 2.3418, + "step": 3194 + }, + { + "epoch": 0.13964771187551903, + "grad_norm": 2.40625, + "learning_rate": 9.528995689605556e-05, + "loss": 1.5699, + "step": 3195 + }, + { + "epoch": 0.13969142007954893, + "grad_norm": 2.359375, + "learning_rate": 9.528704616238874e-05, + "loss": 1.5867, + "step": 3196 + }, + { + "epoch": 0.13973512828357884, + "grad_norm": 2.59375, + "learning_rate": 9.528413457408944e-05, + "loss": 1.8923, + "step": 3197 + }, + { + "epoch": 0.13977883648760872, + "grad_norm": 2.6875, + "learning_rate": 9.528122213121262e-05, + "loss": 2.0186, + "step": 3198 + }, + { + "epoch": 0.13982254469163863, + "grad_norm": 2.390625, + "learning_rate": 9.527830883381324e-05, + "loss": 1.7945, + "step": 3199 + }, + { + "epoch": 0.1398662528956685, + "grad_norm": 2.1875, + "learning_rate": 9.527539468194625e-05, + "loss": 1.8891, + "step": 3200 + }, + { + "epoch": 0.13990996109969842, + "grad_norm": 2.296875, + "learning_rate": 9.527247967566668e-05, + "loss": 2.1507, + "step": 3201 + }, + { + "epoch": 0.13995366930372832, + "grad_norm": 2.84375, + "learning_rate": 9.526956381502953e-05, + "loss": 2.117, + "step": 3202 + }, + { + "epoch": 0.1399973775077582, + "grad_norm": 2.609375, + "learning_rate": 9.526664710008983e-05, + "loss": 2.8609, + "step": 3203 + }, + { + "epoch": 0.1400410857117881, + "grad_norm": 2.640625, + "learning_rate": 9.52637295309026e-05, + "loss": 2.5578, + "step": 3204 + }, + { + "epoch": 0.140084793915818, + "grad_norm": 5.8125, + "learning_rate": 9.526081110752292e-05, + "loss": 2.2185, + "step": 3205 + }, + { + "epoch": 0.1401285021198479, + "grad_norm": 2.4375, + "learning_rate": 9.525789183000588e-05, + "loss": 1.6714, + "step": 3206 + }, + { + "epoch": 0.1401722103238778, + "grad_norm": 2.390625, + "learning_rate": 9.525497169840653e-05, + "loss": 2.083, + "step": 3207 + }, + { + "epoch": 0.14021591852790768, + "grad_norm": 2.8125, + "learning_rate": 9.525205071278e-05, + "loss": 2.0151, + "step": 3208 + }, + { + "epoch": 0.1402596267319376, + "grad_norm": 2.1875, + "learning_rate": 9.524912887318142e-05, + "loss": 1.9866, + "step": 3209 + }, + { + "epoch": 0.14030333493596747, + "grad_norm": 2.671875, + "learning_rate": 9.524620617966593e-05, + "loss": 2.3579, + "step": 3210 + }, + { + "epoch": 0.14034704313999738, + "grad_norm": 2.3125, + "learning_rate": 9.524328263228865e-05, + "loss": 1.9456, + "step": 3211 + }, + { + "epoch": 0.1403907513440273, + "grad_norm": 2.328125, + "learning_rate": 9.52403582311048e-05, + "loss": 1.9813, + "step": 3212 + }, + { + "epoch": 0.14043445954805717, + "grad_norm": 2.859375, + "learning_rate": 9.523743297616954e-05, + "loss": 2.962, + "step": 3213 + }, + { + "epoch": 0.14047816775208707, + "grad_norm": 2.328125, + "learning_rate": 9.523450686753807e-05, + "loss": 1.8951, + "step": 3214 + }, + { + "epoch": 0.14052187595611695, + "grad_norm": 2.359375, + "learning_rate": 9.523157990526564e-05, + "loss": 2.1464, + "step": 3215 + }, + { + "epoch": 0.14056558416014686, + "grad_norm": 2.984375, + "learning_rate": 9.522865208940745e-05, + "loss": 2.693, + "step": 3216 + }, + { + "epoch": 0.14060929236417677, + "grad_norm": 3.21875, + "learning_rate": 9.522572342001876e-05, + "loss": 2.5182, + "step": 3217 + }, + { + "epoch": 0.14065300056820665, + "grad_norm": 2.515625, + "learning_rate": 9.522279389715488e-05, + "loss": 2.3654, + "step": 3218 + }, + { + "epoch": 0.14069670877223656, + "grad_norm": 2.703125, + "learning_rate": 9.521986352087102e-05, + "loss": 2.6586, + "step": 3219 + }, + { + "epoch": 0.14074041697626644, + "grad_norm": 2.53125, + "learning_rate": 9.521693229122255e-05, + "loss": 2.3786, + "step": 3220 + }, + { + "epoch": 0.14078412518029634, + "grad_norm": 2.375, + "learning_rate": 9.521400020826475e-05, + "loss": 2.2397, + "step": 3221 + }, + { + "epoch": 0.14082783338432625, + "grad_norm": 2.65625, + "learning_rate": 9.521106727205295e-05, + "loss": 1.7273, + "step": 3222 + }, + { + "epoch": 0.14087154158835613, + "grad_norm": 3.171875, + "learning_rate": 9.520813348264252e-05, + "loss": 1.6895, + "step": 3223 + }, + { + "epoch": 0.14091524979238604, + "grad_norm": 2.265625, + "learning_rate": 9.520519884008881e-05, + "loss": 2.1822, + "step": 3224 + }, + { + "epoch": 0.14095895799641592, + "grad_norm": 2.375, + "learning_rate": 9.52022633444472e-05, + "loss": 2.0736, + "step": 3225 + }, + { + "epoch": 0.14100266620044583, + "grad_norm": 2.90625, + "learning_rate": 9.519932699577309e-05, + "loss": 2.0192, + "step": 3226 + }, + { + "epoch": 0.14104637440447573, + "grad_norm": 4.28125, + "learning_rate": 9.519638979412191e-05, + "loss": 2.1896, + "step": 3227 + }, + { + "epoch": 0.1410900826085056, + "grad_norm": 3.015625, + "learning_rate": 9.519345173954907e-05, + "loss": 2.3782, + "step": 3228 + }, + { + "epoch": 0.14113379081253552, + "grad_norm": 2.328125, + "learning_rate": 9.519051283211002e-05, + "loss": 2.3049, + "step": 3229 + }, + { + "epoch": 0.1411774990165654, + "grad_norm": 2.8125, + "learning_rate": 9.518757307186021e-05, + "loss": 1.8682, + "step": 3230 + }, + { + "epoch": 0.1412212072205953, + "grad_norm": 10.5, + "learning_rate": 9.518463245885513e-05, + "loss": 3.0038, + "step": 3231 + }, + { + "epoch": 0.14126491542462521, + "grad_norm": 2.390625, + "learning_rate": 9.518169099315028e-05, + "loss": 1.859, + "step": 3232 + }, + { + "epoch": 0.1413086236286551, + "grad_norm": 2.390625, + "learning_rate": 9.517874867480117e-05, + "loss": 1.7138, + "step": 3233 + }, + { + "epoch": 0.141352331832685, + "grad_norm": 2.84375, + "learning_rate": 9.517580550386331e-05, + "loss": 3.028, + "step": 3234 + }, + { + "epoch": 0.14139604003671488, + "grad_norm": 2.3125, + "learning_rate": 9.517286148039223e-05, + "loss": 2.257, + "step": 3235 + }, + { + "epoch": 0.1414397482407448, + "grad_norm": 3.015625, + "learning_rate": 9.516991660444355e-05, + "loss": 2.1228, + "step": 3236 + }, + { + "epoch": 0.1414834564447747, + "grad_norm": 2.4375, + "learning_rate": 9.516697087607276e-05, + "loss": 2.25, + "step": 3237 + }, + { + "epoch": 0.14152716464880458, + "grad_norm": 2.59375, + "learning_rate": 9.516402429533552e-05, + "loss": 2.3626, + "step": 3238 + }, + { + "epoch": 0.14157087285283448, + "grad_norm": 2.453125, + "learning_rate": 9.51610768622874e-05, + "loss": 1.8227, + "step": 3239 + }, + { + "epoch": 0.14161458105686436, + "grad_norm": 2.703125, + "learning_rate": 9.515812857698403e-05, + "loss": 2.5844, + "step": 3240 + }, + { + "epoch": 0.14165828926089427, + "grad_norm": 2.140625, + "learning_rate": 9.515517943948105e-05, + "loss": 1.8612, + "step": 3241 + }, + { + "epoch": 0.14170199746492418, + "grad_norm": 3.859375, + "learning_rate": 9.51522294498341e-05, + "loss": 2.173, + "step": 3242 + }, + { + "epoch": 0.14174570566895406, + "grad_norm": 2.734375, + "learning_rate": 9.514927860809888e-05, + "loss": 2.1819, + "step": 3243 + }, + { + "epoch": 0.14178941387298397, + "grad_norm": 2.015625, + "learning_rate": 9.514632691433107e-05, + "loss": 2.0039, + "step": 3244 + }, + { + "epoch": 0.14183312207701385, + "grad_norm": 2.453125, + "learning_rate": 9.514337436858635e-05, + "loss": 2.5136, + "step": 3245 + }, + { + "epoch": 0.14187683028104375, + "grad_norm": 2.21875, + "learning_rate": 9.514042097092045e-05, + "loss": 1.9573, + "step": 3246 + }, + { + "epoch": 0.14192053848507366, + "grad_norm": 2.859375, + "learning_rate": 9.513746672138911e-05, + "loss": 2.0925, + "step": 3247 + }, + { + "epoch": 0.14196424668910354, + "grad_norm": 2.5, + "learning_rate": 9.513451162004809e-05, + "loss": 1.9035, + "step": 3248 + }, + { + "epoch": 0.14200795489313345, + "grad_norm": 2.859375, + "learning_rate": 9.513155566695313e-05, + "loss": 2.0891, + "step": 3249 + }, + { + "epoch": 0.14205166309716333, + "grad_norm": 2.140625, + "learning_rate": 9.512859886216003e-05, + "loss": 2.0599, + "step": 3250 + }, + { + "epoch": 0.14209537130119324, + "grad_norm": 2.625, + "learning_rate": 9.51256412057246e-05, + "loss": 2.3514, + "step": 3251 + }, + { + "epoch": 0.14213907950522314, + "grad_norm": 2.109375, + "learning_rate": 9.512268269770264e-05, + "loss": 2.2689, + "step": 3252 + }, + { + "epoch": 0.14218278770925302, + "grad_norm": 2.625, + "learning_rate": 9.511972333814998e-05, + "loss": 2.5667, + "step": 3253 + }, + { + "epoch": 0.14222649591328293, + "grad_norm": 2.4375, + "learning_rate": 9.511676312712246e-05, + "loss": 1.9769, + "step": 3254 + }, + { + "epoch": 0.1422702041173128, + "grad_norm": 2.375, + "learning_rate": 9.511380206467597e-05, + "loss": 2.144, + "step": 3255 + }, + { + "epoch": 0.14231391232134272, + "grad_norm": 2.59375, + "learning_rate": 9.511084015086637e-05, + "loss": 2.1737, + "step": 3256 + }, + { + "epoch": 0.14235762052537262, + "grad_norm": 2.140625, + "learning_rate": 9.510787738574958e-05, + "loss": 1.7987, + "step": 3257 + }, + { + "epoch": 0.1424013287294025, + "grad_norm": 2.796875, + "learning_rate": 9.510491376938147e-05, + "loss": 2.2213, + "step": 3258 + }, + { + "epoch": 0.1424450369334324, + "grad_norm": 2.328125, + "learning_rate": 9.510194930181799e-05, + "loss": 1.9827, + "step": 3259 + }, + { + "epoch": 0.1424887451374623, + "grad_norm": 2.421875, + "learning_rate": 9.50989839831151e-05, + "loss": 2.4605, + "step": 3260 + }, + { + "epoch": 0.1425324533414922, + "grad_norm": 2.25, + "learning_rate": 9.509601781332873e-05, + "loss": 1.6873, + "step": 3261 + }, + { + "epoch": 0.1425761615455221, + "grad_norm": 2.265625, + "learning_rate": 9.509305079251487e-05, + "loss": 1.9218, + "step": 3262 + }, + { + "epoch": 0.142619869749552, + "grad_norm": 2.5, + "learning_rate": 9.509008292072951e-05, + "loss": 1.5666, + "step": 3263 + }, + { + "epoch": 0.1426635779535819, + "grad_norm": 2.0625, + "learning_rate": 9.508711419802867e-05, + "loss": 1.6043, + "step": 3264 + }, + { + "epoch": 0.14270728615761177, + "grad_norm": 2.15625, + "learning_rate": 9.508414462446835e-05, + "loss": 1.8474, + "step": 3265 + }, + { + "epoch": 0.14275099436164168, + "grad_norm": 2.78125, + "learning_rate": 9.508117420010462e-05, + "loss": 2.0662, + "step": 3266 + }, + { + "epoch": 0.1427947025656716, + "grad_norm": 2.0625, + "learning_rate": 9.507820292499353e-05, + "loss": 1.7683, + "step": 3267 + }, + { + "epoch": 0.14283841076970147, + "grad_norm": 2.390625, + "learning_rate": 9.507523079919111e-05, + "loss": 1.8974, + "step": 3268 + }, + { + "epoch": 0.14288211897373138, + "grad_norm": 2.203125, + "learning_rate": 9.507225782275349e-05, + "loss": 1.9533, + "step": 3269 + }, + { + "epoch": 0.14292582717776126, + "grad_norm": 2.921875, + "learning_rate": 9.506928399573678e-05, + "loss": 2.3724, + "step": 3270 + }, + { + "epoch": 0.14296953538179116, + "grad_norm": 2.578125, + "learning_rate": 9.506630931819707e-05, + "loss": 2.1634, + "step": 3271 + }, + { + "epoch": 0.14301324358582107, + "grad_norm": 2.59375, + "learning_rate": 9.506333379019052e-05, + "loss": 2.2706, + "step": 3272 + }, + { + "epoch": 0.14305695178985095, + "grad_norm": 2.1875, + "learning_rate": 9.506035741177329e-05, + "loss": 1.7177, + "step": 3273 + }, + { + "epoch": 0.14310065999388086, + "grad_norm": 2.328125, + "learning_rate": 9.50573801830015e-05, + "loss": 1.9437, + "step": 3274 + }, + { + "epoch": 0.14314436819791074, + "grad_norm": 2.4375, + "learning_rate": 9.50544021039314e-05, + "loss": 1.9555, + "step": 3275 + }, + { + "epoch": 0.14318807640194064, + "grad_norm": 2.609375, + "learning_rate": 9.505142317461915e-05, + "loss": 2.1584, + "step": 3276 + }, + { + "epoch": 0.14323178460597055, + "grad_norm": 3.125, + "learning_rate": 9.504844339512095e-05, + "loss": 3.2088, + "step": 3277 + }, + { + "epoch": 0.14327549281000043, + "grad_norm": 2.34375, + "learning_rate": 9.504546276549309e-05, + "loss": 1.9269, + "step": 3278 + }, + { + "epoch": 0.14331920101403034, + "grad_norm": 2.71875, + "learning_rate": 9.504248128579177e-05, + "loss": 2.091, + "step": 3279 + }, + { + "epoch": 0.14336290921806022, + "grad_norm": 2.4375, + "learning_rate": 9.503949895607329e-05, + "loss": 2.1584, + "step": 3280 + }, + { + "epoch": 0.14340661742209013, + "grad_norm": 2.234375, + "learning_rate": 9.50365157763939e-05, + "loss": 1.7713, + "step": 3281 + }, + { + "epoch": 0.14345032562612003, + "grad_norm": 2.8125, + "learning_rate": 9.503353174680991e-05, + "loss": 2.0846, + "step": 3282 + }, + { + "epoch": 0.14349403383014991, + "grad_norm": 2.671875, + "learning_rate": 9.503054686737763e-05, + "loss": 2.443, + "step": 3283 + }, + { + "epoch": 0.14353774203417982, + "grad_norm": 2.890625, + "learning_rate": 9.502756113815338e-05, + "loss": 2.3072, + "step": 3284 + }, + { + "epoch": 0.1435814502382097, + "grad_norm": 2.28125, + "learning_rate": 9.502457455919355e-05, + "loss": 1.807, + "step": 3285 + }, + { + "epoch": 0.1436251584422396, + "grad_norm": 2.703125, + "learning_rate": 9.502158713055444e-05, + "loss": 2.1623, + "step": 3286 + }, + { + "epoch": 0.14366886664626952, + "grad_norm": 2.4375, + "learning_rate": 9.501859885229248e-05, + "loss": 1.5188, + "step": 3287 + }, + { + "epoch": 0.1437125748502994, + "grad_norm": 2.703125, + "learning_rate": 9.501560972446402e-05, + "loss": 2.3192, + "step": 3288 + }, + { + "epoch": 0.1437562830543293, + "grad_norm": 2.359375, + "learning_rate": 9.501261974712548e-05, + "loss": 1.9327, + "step": 3289 + }, + { + "epoch": 0.14379999125835918, + "grad_norm": 2.359375, + "learning_rate": 9.50096289203333e-05, + "loss": 2.3821, + "step": 3290 + }, + { + "epoch": 0.1438436994623891, + "grad_norm": 2.671875, + "learning_rate": 9.500663724414392e-05, + "loss": 1.9964, + "step": 3291 + }, + { + "epoch": 0.143887407666419, + "grad_norm": 2.296875, + "learning_rate": 9.500364471861378e-05, + "loss": 1.928, + "step": 3292 + }, + { + "epoch": 0.14393111587044888, + "grad_norm": 2.46875, + "learning_rate": 9.500065134379939e-05, + "loss": 2.0895, + "step": 3293 + }, + { + "epoch": 0.14397482407447879, + "grad_norm": 3.375, + "learning_rate": 9.49976571197572e-05, + "loss": 1.7263, + "step": 3294 + }, + { + "epoch": 0.14401853227850867, + "grad_norm": 3.0, + "learning_rate": 9.499466204654372e-05, + "loss": 2.0133, + "step": 3295 + }, + { + "epoch": 0.14406224048253857, + "grad_norm": 2.5625, + "learning_rate": 9.499166612421548e-05, + "loss": 2.2996, + "step": 3296 + }, + { + "epoch": 0.14410594868656848, + "grad_norm": 2.1875, + "learning_rate": 9.498866935282902e-05, + "loss": 2.0241, + "step": 3297 + }, + { + "epoch": 0.14414965689059836, + "grad_norm": 2.125, + "learning_rate": 9.49856717324409e-05, + "loss": 2.0614, + "step": 3298 + }, + { + "epoch": 0.14419336509462827, + "grad_norm": 2.59375, + "learning_rate": 9.498267326310768e-05, + "loss": 1.7679, + "step": 3299 + }, + { + "epoch": 0.14423707329865815, + "grad_norm": 2.53125, + "learning_rate": 9.497967394488594e-05, + "loss": 2.318, + "step": 3300 + }, + { + "epoch": 0.14428078150268805, + "grad_norm": 3.5625, + "learning_rate": 9.497667377783228e-05, + "loss": 2.6375, + "step": 3301 + }, + { + "epoch": 0.14432448970671796, + "grad_norm": 6.15625, + "learning_rate": 9.497367276200335e-05, + "loss": 1.3413, + "step": 3302 + }, + { + "epoch": 0.14436819791074784, + "grad_norm": 2.359375, + "learning_rate": 9.497067089745572e-05, + "loss": 2.1526, + "step": 3303 + }, + { + "epoch": 0.14441190611477775, + "grad_norm": 2.8125, + "learning_rate": 9.496766818424612e-05, + "loss": 2.5314, + "step": 3304 + }, + { + "epoch": 0.14445561431880763, + "grad_norm": 2.53125, + "learning_rate": 9.496466462243115e-05, + "loss": 2.562, + "step": 3305 + }, + { + "epoch": 0.14449932252283754, + "grad_norm": 2.4375, + "learning_rate": 9.496166021206753e-05, + "loss": 1.9449, + "step": 3306 + }, + { + "epoch": 0.14454303072686744, + "grad_norm": 2.140625, + "learning_rate": 9.495865495321194e-05, + "loss": 1.7941, + "step": 3307 + }, + { + "epoch": 0.14458673893089732, + "grad_norm": 2.5625, + "learning_rate": 9.495564884592109e-05, + "loss": 3.1137, + "step": 3308 + }, + { + "epoch": 0.14463044713492723, + "grad_norm": 2.25, + "learning_rate": 9.495264189025172e-05, + "loss": 1.586, + "step": 3309 + }, + { + "epoch": 0.1446741553389571, + "grad_norm": 3.078125, + "learning_rate": 9.494963408626056e-05, + "loss": 1.9154, + "step": 3310 + }, + { + "epoch": 0.14471786354298702, + "grad_norm": 2.453125, + "learning_rate": 9.49466254340044e-05, + "loss": 1.7515, + "step": 3311 + }, + { + "epoch": 0.14476157174701693, + "grad_norm": 10.5, + "learning_rate": 9.494361593354e-05, + "loss": 6.3968, + "step": 3312 + }, + { + "epoch": 0.1448052799510468, + "grad_norm": 2.625, + "learning_rate": 9.494060558492415e-05, + "loss": 2.0223, + "step": 3313 + }, + { + "epoch": 0.1448489881550767, + "grad_norm": 2.515625, + "learning_rate": 9.493759438821366e-05, + "loss": 1.9596, + "step": 3314 + }, + { + "epoch": 0.1448926963591066, + "grad_norm": 3.046875, + "learning_rate": 9.493458234346537e-05, + "loss": 1.9956, + "step": 3315 + }, + { + "epoch": 0.1449364045631365, + "grad_norm": 2.875, + "learning_rate": 9.493156945073611e-05, + "loss": 2.5605, + "step": 3316 + }, + { + "epoch": 0.1449801127671664, + "grad_norm": 2.984375, + "learning_rate": 9.492855571008275e-05, + "loss": 2.2362, + "step": 3317 + }, + { + "epoch": 0.1450238209711963, + "grad_norm": 2.625, + "learning_rate": 9.492554112156214e-05, + "loss": 1.9502, + "step": 3318 + }, + { + "epoch": 0.1450675291752262, + "grad_norm": 2.21875, + "learning_rate": 9.492252568523117e-05, + "loss": 1.599, + "step": 3319 + }, + { + "epoch": 0.14511123737925608, + "grad_norm": 2.640625, + "learning_rate": 9.491950940114678e-05, + "loss": 1.7509, + "step": 3320 + }, + { + "epoch": 0.14515494558328598, + "grad_norm": 2.84375, + "learning_rate": 9.491649226936585e-05, + "loss": 3.2139, + "step": 3321 + }, + { + "epoch": 0.1451986537873159, + "grad_norm": 4.75, + "learning_rate": 9.491347428994536e-05, + "loss": 1.9854, + "step": 3322 + }, + { + "epoch": 0.14524236199134577, + "grad_norm": 3.546875, + "learning_rate": 9.491045546294223e-05, + "loss": 2.3345, + "step": 3323 + }, + { + "epoch": 0.14528607019537568, + "grad_norm": 2.390625, + "learning_rate": 9.490743578841344e-05, + "loss": 2.7734, + "step": 3324 + }, + { + "epoch": 0.14532977839940556, + "grad_norm": 2.75, + "learning_rate": 9.490441526641599e-05, + "loss": 1.8867, + "step": 3325 + }, + { + "epoch": 0.14537348660343546, + "grad_norm": 2.546875, + "learning_rate": 9.490139389700685e-05, + "loss": 2.8756, + "step": 3326 + }, + { + "epoch": 0.14541719480746537, + "grad_norm": 2.140625, + "learning_rate": 9.489837168024307e-05, + "loss": 2.0287, + "step": 3327 + }, + { + "epoch": 0.14546090301149525, + "grad_norm": 2.640625, + "learning_rate": 9.489534861618166e-05, + "loss": 2.3343, + "step": 3328 + }, + { + "epoch": 0.14550461121552516, + "grad_norm": 2.765625, + "learning_rate": 9.48923247048797e-05, + "loss": 2.1087, + "step": 3329 + }, + { + "epoch": 0.14554831941955504, + "grad_norm": 2.15625, + "learning_rate": 9.488929994639421e-05, + "loss": 2.1728, + "step": 3330 + }, + { + "epoch": 0.14559202762358495, + "grad_norm": 2.5, + "learning_rate": 9.488627434078232e-05, + "loss": 2.136, + "step": 3331 + }, + { + "epoch": 0.14563573582761485, + "grad_norm": 2.15625, + "learning_rate": 9.488324788810108e-05, + "loss": 2.0132, + "step": 3332 + }, + { + "epoch": 0.14567944403164473, + "grad_norm": 2.5, + "learning_rate": 9.488022058840765e-05, + "loss": 1.7678, + "step": 3333 + }, + { + "epoch": 0.14572315223567464, + "grad_norm": 2.4375, + "learning_rate": 9.487719244175912e-05, + "loss": 1.7634, + "step": 3334 + }, + { + "epoch": 0.14576686043970452, + "grad_norm": 2.734375, + "learning_rate": 9.487416344821267e-05, + "loss": 1.8089, + "step": 3335 + }, + { + "epoch": 0.14581056864373443, + "grad_norm": 2.390625, + "learning_rate": 9.487113360782543e-05, + "loss": 1.9985, + "step": 3336 + }, + { + "epoch": 0.14585427684776434, + "grad_norm": 2.78125, + "learning_rate": 9.48681029206546e-05, + "loss": 1.8382, + "step": 3337 + }, + { + "epoch": 0.14589798505179422, + "grad_norm": 2.875, + "learning_rate": 9.486507138675735e-05, + "loss": 2.4906, + "step": 3338 + }, + { + "epoch": 0.14594169325582412, + "grad_norm": 2.671875, + "learning_rate": 9.486203900619092e-05, + "loss": 2.1798, + "step": 3339 + }, + { + "epoch": 0.145985401459854, + "grad_norm": 2.609375, + "learning_rate": 9.485900577901252e-05, + "loss": 2.698, + "step": 3340 + }, + { + "epoch": 0.1460291096638839, + "grad_norm": 2.25, + "learning_rate": 9.485597170527939e-05, + "loss": 2.2058, + "step": 3341 + }, + { + "epoch": 0.14607281786791382, + "grad_norm": 3.546875, + "learning_rate": 9.485293678504879e-05, + "loss": 2.5278, + "step": 3342 + }, + { + "epoch": 0.1461165260719437, + "grad_norm": 2.0625, + "learning_rate": 9.484990101837799e-05, + "loss": 1.7709, + "step": 3343 + }, + { + "epoch": 0.1461602342759736, + "grad_norm": 2.25, + "learning_rate": 9.484686440532429e-05, + "loss": 1.9356, + "step": 3344 + }, + { + "epoch": 0.14620394248000348, + "grad_norm": 3.25, + "learning_rate": 9.484382694594498e-05, + "loss": 1.9257, + "step": 3345 + }, + { + "epoch": 0.1462476506840334, + "grad_norm": 2.28125, + "learning_rate": 9.484078864029739e-05, + "loss": 1.8498, + "step": 3346 + }, + { + "epoch": 0.1462913588880633, + "grad_norm": 2.84375, + "learning_rate": 9.483774948843884e-05, + "loss": 2.0713, + "step": 3347 + }, + { + "epoch": 0.14633506709209318, + "grad_norm": 2.65625, + "learning_rate": 9.483470949042672e-05, + "loss": 1.8746, + "step": 3348 + }, + { + "epoch": 0.1463787752961231, + "grad_norm": 3.25, + "learning_rate": 9.483166864631837e-05, + "loss": 2.3901, + "step": 3349 + }, + { + "epoch": 0.14642248350015297, + "grad_norm": 3.15625, + "learning_rate": 9.482862695617119e-05, + "loss": 2.7898, + "step": 3350 + }, + { + "epoch": 0.14646619170418287, + "grad_norm": 2.1875, + "learning_rate": 9.482558442004257e-05, + "loss": 2.0902, + "step": 3351 + }, + { + "epoch": 0.14650989990821278, + "grad_norm": 2.8125, + "learning_rate": 9.482254103798993e-05, + "loss": 1.9117, + "step": 3352 + }, + { + "epoch": 0.14655360811224266, + "grad_norm": 3.1875, + "learning_rate": 9.481949681007069e-05, + "loss": 2.7748, + "step": 3353 + }, + { + "epoch": 0.14659731631627257, + "grad_norm": 2.40625, + "learning_rate": 9.481645173634234e-05, + "loss": 1.8882, + "step": 3354 + }, + { + "epoch": 0.14664102452030245, + "grad_norm": 2.65625, + "learning_rate": 9.48134058168623e-05, + "loss": 2.5552, + "step": 3355 + }, + { + "epoch": 0.14668473272433236, + "grad_norm": 2.21875, + "learning_rate": 9.481035905168808e-05, + "loss": 1.67, + "step": 3356 + }, + { + "epoch": 0.14672844092836226, + "grad_norm": 3.453125, + "learning_rate": 9.480731144087716e-05, + "loss": 1.8939, + "step": 3357 + }, + { + "epoch": 0.14677214913239214, + "grad_norm": 2.296875, + "learning_rate": 9.480426298448706e-05, + "loss": 1.988, + "step": 3358 + }, + { + "epoch": 0.14681585733642205, + "grad_norm": 2.453125, + "learning_rate": 9.480121368257531e-05, + "loss": 1.9197, + "step": 3359 + }, + { + "epoch": 0.14685956554045193, + "grad_norm": 2.421875, + "learning_rate": 9.479816353519946e-05, + "loss": 1.9942, + "step": 3360 + }, + { + "epoch": 0.14690327374448184, + "grad_norm": 3.015625, + "learning_rate": 9.479511254241704e-05, + "loss": 2.4083, + "step": 3361 + }, + { + "epoch": 0.14694698194851175, + "grad_norm": 2.296875, + "learning_rate": 9.479206070428568e-05, + "loss": 2.3581, + "step": 3362 + }, + { + "epoch": 0.14699069015254163, + "grad_norm": 2.40625, + "learning_rate": 9.478900802086292e-05, + "loss": 1.7141, + "step": 3363 + }, + { + "epoch": 0.14703439835657153, + "grad_norm": 2.453125, + "learning_rate": 9.478595449220639e-05, + "loss": 1.9283, + "step": 3364 + }, + { + "epoch": 0.1470781065606014, + "grad_norm": 2.125, + "learning_rate": 9.478290011837375e-05, + "loss": 1.9525, + "step": 3365 + }, + { + "epoch": 0.14712181476463132, + "grad_norm": 2.25, + "learning_rate": 9.477984489942258e-05, + "loss": 2.0672, + "step": 3366 + }, + { + "epoch": 0.14716552296866123, + "grad_norm": 2.15625, + "learning_rate": 9.477678883541055e-05, + "loss": 1.9176, + "step": 3367 + }, + { + "epoch": 0.1472092311726911, + "grad_norm": 2.25, + "learning_rate": 9.477373192639536e-05, + "loss": 2.0645, + "step": 3368 + }, + { + "epoch": 0.14725293937672101, + "grad_norm": 2.4375, + "learning_rate": 9.477067417243468e-05, + "loss": 1.8324, + "step": 3369 + }, + { + "epoch": 0.1472966475807509, + "grad_norm": 4.5625, + "learning_rate": 9.476761557358623e-05, + "loss": 2.603, + "step": 3370 + }, + { + "epoch": 0.1473403557847808, + "grad_norm": 3.234375, + "learning_rate": 9.476455612990771e-05, + "loss": 2.2431, + "step": 3371 + }, + { + "epoch": 0.1473840639888107, + "grad_norm": 2.171875, + "learning_rate": 9.476149584145687e-05, + "loss": 1.86, + "step": 3372 + }, + { + "epoch": 0.1474277721928406, + "grad_norm": 2.65625, + "learning_rate": 9.475843470829145e-05, + "loss": 1.998, + "step": 3373 + }, + { + "epoch": 0.1474714803968705, + "grad_norm": 2.78125, + "learning_rate": 9.475537273046922e-05, + "loss": 2.1818, + "step": 3374 + }, + { + "epoch": 0.14751518860090038, + "grad_norm": 2.359375, + "learning_rate": 9.475230990804797e-05, + "loss": 1.7166, + "step": 3375 + }, + { + "epoch": 0.14755889680493028, + "grad_norm": 2.78125, + "learning_rate": 9.47492462410855e-05, + "loss": 1.8999, + "step": 3376 + }, + { + "epoch": 0.1476026050089602, + "grad_norm": 2.046875, + "learning_rate": 9.474618172963963e-05, + "loss": 2.1685, + "step": 3377 + }, + { + "epoch": 0.14764631321299007, + "grad_norm": 2.09375, + "learning_rate": 9.474311637376818e-05, + "loss": 1.6009, + "step": 3378 + }, + { + "epoch": 0.14769002141701998, + "grad_norm": 3.0, + "learning_rate": 9.474005017352899e-05, + "loss": 2.7177, + "step": 3379 + }, + { + "epoch": 0.14773372962104986, + "grad_norm": 2.328125, + "learning_rate": 9.473698312897997e-05, + "loss": 2.1018, + "step": 3380 + }, + { + "epoch": 0.14777743782507977, + "grad_norm": 2.1875, + "learning_rate": 9.473391524017894e-05, + "loss": 2.058, + "step": 3381 + }, + { + "epoch": 0.14782114602910967, + "grad_norm": 2.296875, + "learning_rate": 9.473084650718382e-05, + "loss": 2.4579, + "step": 3382 + }, + { + "epoch": 0.14786485423313955, + "grad_norm": 2.40625, + "learning_rate": 9.472777693005254e-05, + "loss": 2.458, + "step": 3383 + }, + { + "epoch": 0.14790856243716946, + "grad_norm": 2.5, + "learning_rate": 9.4724706508843e-05, + "loss": 2.1446, + "step": 3384 + }, + { + "epoch": 0.14795227064119934, + "grad_norm": 2.171875, + "learning_rate": 9.472163524361315e-05, + "loss": 2.2017, + "step": 3385 + }, + { + "epoch": 0.14799597884522925, + "grad_norm": 2.78125, + "learning_rate": 9.471856313442098e-05, + "loss": 2.6495, + "step": 3386 + }, + { + "epoch": 0.14803968704925916, + "grad_norm": 2.265625, + "learning_rate": 9.471549018132442e-05, + "loss": 1.7599, + "step": 3387 + }, + { + "epoch": 0.14808339525328904, + "grad_norm": 2.796875, + "learning_rate": 9.471241638438148e-05, + "loss": 2.5642, + "step": 3388 + }, + { + "epoch": 0.14812710345731894, + "grad_norm": 2.578125, + "learning_rate": 9.470934174365016e-05, + "loss": 1.8783, + "step": 3389 + }, + { + "epoch": 0.14817081166134882, + "grad_norm": 5.21875, + "learning_rate": 9.470626625918851e-05, + "loss": 2.3672, + "step": 3390 + }, + { + "epoch": 0.14821451986537873, + "grad_norm": 3.6875, + "learning_rate": 9.470318993105453e-05, + "loss": 2.1336, + "step": 3391 + }, + { + "epoch": 0.14825822806940864, + "grad_norm": 2.703125, + "learning_rate": 9.47001127593063e-05, + "loss": 1.8938, + "step": 3392 + }, + { + "epoch": 0.14830193627343852, + "grad_norm": 2.578125, + "learning_rate": 9.469703474400188e-05, + "loss": 2.27, + "step": 3393 + }, + { + "epoch": 0.14834564447746842, + "grad_norm": 3.28125, + "learning_rate": 9.469395588519939e-05, + "loss": 2.4541, + "step": 3394 + }, + { + "epoch": 0.1483893526814983, + "grad_norm": 2.375, + "learning_rate": 9.469087618295687e-05, + "loss": 1.814, + "step": 3395 + }, + { + "epoch": 0.1484330608855282, + "grad_norm": 2.125, + "learning_rate": 9.468779563733248e-05, + "loss": 1.6851, + "step": 3396 + }, + { + "epoch": 0.14847676908955812, + "grad_norm": 5.375, + "learning_rate": 9.468471424838434e-05, + "loss": 1.4152, + "step": 3397 + }, + { + "epoch": 0.148520477293588, + "grad_norm": 5.53125, + "learning_rate": 9.468163201617062e-05, + "loss": 1.3591, + "step": 3398 + }, + { + "epoch": 0.1485641854976179, + "grad_norm": 3.03125, + "learning_rate": 9.467854894074945e-05, + "loss": 2.6621, + "step": 3399 + }, + { + "epoch": 0.1486078937016478, + "grad_norm": 2.34375, + "learning_rate": 9.467546502217907e-05, + "loss": 1.9949, + "step": 3400 + }, + { + "epoch": 0.1486516019056777, + "grad_norm": 2.546875, + "learning_rate": 9.467238026051762e-05, + "loss": 2.0779, + "step": 3401 + }, + { + "epoch": 0.1486953101097076, + "grad_norm": 2.203125, + "learning_rate": 9.466929465582335e-05, + "loss": 1.8255, + "step": 3402 + }, + { + "epoch": 0.14873901831373748, + "grad_norm": 2.875, + "learning_rate": 9.466620820815446e-05, + "loss": 2.032, + "step": 3403 + }, + { + "epoch": 0.1487827265177674, + "grad_norm": 2.3125, + "learning_rate": 9.466312091756922e-05, + "loss": 2.1223, + "step": 3404 + }, + { + "epoch": 0.14882643472179727, + "grad_norm": 2.1875, + "learning_rate": 9.46600327841259e-05, + "loss": 2.3961, + "step": 3405 + }, + { + "epoch": 0.14887014292582718, + "grad_norm": 2.609375, + "learning_rate": 9.465694380788274e-05, + "loss": 2.0143, + "step": 3406 + }, + { + "epoch": 0.14891385112985708, + "grad_norm": 2.484375, + "learning_rate": 9.465385398889806e-05, + "loss": 2.1623, + "step": 3407 + }, + { + "epoch": 0.14895755933388696, + "grad_norm": 3.078125, + "learning_rate": 9.465076332723017e-05, + "loss": 2.1627, + "step": 3408 + }, + { + "epoch": 0.14900126753791687, + "grad_norm": 2.921875, + "learning_rate": 9.464767182293739e-05, + "loss": 1.8762, + "step": 3409 + }, + { + "epoch": 0.14904497574194675, + "grad_norm": 2.53125, + "learning_rate": 9.464457947607805e-05, + "loss": 2.4482, + "step": 3410 + }, + { + "epoch": 0.14908868394597666, + "grad_norm": 2.40625, + "learning_rate": 9.464148628671053e-05, + "loss": 1.5772, + "step": 3411 + }, + { + "epoch": 0.14913239215000657, + "grad_norm": 2.328125, + "learning_rate": 9.46383922548932e-05, + "loss": 2.1683, + "step": 3412 + }, + { + "epoch": 0.14917610035403644, + "grad_norm": 2.09375, + "learning_rate": 9.463529738068441e-05, + "loss": 2.185, + "step": 3413 + }, + { + "epoch": 0.14921980855806635, + "grad_norm": 2.796875, + "learning_rate": 9.463220166414262e-05, + "loss": 2.1126, + "step": 3414 + }, + { + "epoch": 0.14926351676209623, + "grad_norm": 2.5, + "learning_rate": 9.462910510532621e-05, + "loss": 2.6235, + "step": 3415 + }, + { + "epoch": 0.14930722496612614, + "grad_norm": 2.484375, + "learning_rate": 9.462600770429364e-05, + "loss": 1.4827, + "step": 3416 + }, + { + "epoch": 0.14935093317015605, + "grad_norm": 2.1875, + "learning_rate": 9.462290946110335e-05, + "loss": 2.3322, + "step": 3417 + }, + { + "epoch": 0.14939464137418593, + "grad_norm": 2.96875, + "learning_rate": 9.461981037581383e-05, + "loss": 2.0577, + "step": 3418 + }, + { + "epoch": 0.14943834957821583, + "grad_norm": 2.359375, + "learning_rate": 9.461671044848352e-05, + "loss": 1.8846, + "step": 3419 + }, + { + "epoch": 0.14948205778224571, + "grad_norm": 2.328125, + "learning_rate": 9.461360967917098e-05, + "loss": 1.9623, + "step": 3420 + }, + { + "epoch": 0.14952576598627562, + "grad_norm": 2.609375, + "learning_rate": 9.461050806793468e-05, + "loss": 1.7624, + "step": 3421 + }, + { + "epoch": 0.14956947419030553, + "grad_norm": 2.25, + "learning_rate": 9.460740561483314e-05, + "loss": 2.1842, + "step": 3422 + }, + { + "epoch": 0.1496131823943354, + "grad_norm": 5.875, + "learning_rate": 9.460430231992496e-05, + "loss": 1.1398, + "step": 3423 + }, + { + "epoch": 0.14965689059836532, + "grad_norm": 2.375, + "learning_rate": 9.460119818326866e-05, + "loss": 1.8948, + "step": 3424 + }, + { + "epoch": 0.1497005988023952, + "grad_norm": 2.625, + "learning_rate": 9.459809320492286e-05, + "loss": 1.7911, + "step": 3425 + }, + { + "epoch": 0.1497443070064251, + "grad_norm": 2.6875, + "learning_rate": 9.459498738494613e-05, + "loss": 2.3553, + "step": 3426 + }, + { + "epoch": 0.149788015210455, + "grad_norm": 2.4375, + "learning_rate": 9.459188072339706e-05, + "loss": 1.7611, + "step": 3427 + }, + { + "epoch": 0.1498317234144849, + "grad_norm": 2.734375, + "learning_rate": 9.458877322033431e-05, + "loss": 2.0372, + "step": 3428 + }, + { + "epoch": 0.1498754316185148, + "grad_norm": 2.765625, + "learning_rate": 9.458566487581653e-05, + "loss": 1.9606, + "step": 3429 + }, + { + "epoch": 0.14991913982254468, + "grad_norm": 2.265625, + "learning_rate": 9.458255568990235e-05, + "loss": 2.0393, + "step": 3430 + }, + { + "epoch": 0.14996284802657459, + "grad_norm": 3.046875, + "learning_rate": 9.457944566265045e-05, + "loss": 2.1543, + "step": 3431 + }, + { + "epoch": 0.1500065562306045, + "grad_norm": 2.390625, + "learning_rate": 9.457633479411952e-05, + "loss": 2.36, + "step": 3432 + }, + { + "epoch": 0.15005026443463437, + "grad_norm": 2.4375, + "learning_rate": 9.457322308436828e-05, + "loss": 1.8532, + "step": 3433 + }, + { + "epoch": 0.15009397263866428, + "grad_norm": 2.84375, + "learning_rate": 9.457011053345547e-05, + "loss": 2.2903, + "step": 3434 + }, + { + "epoch": 0.15013768084269416, + "grad_norm": 2.53125, + "learning_rate": 9.45669971414398e-05, + "loss": 2.2657, + "step": 3435 + }, + { + "epoch": 0.15018138904672407, + "grad_norm": 3.078125, + "learning_rate": 9.456388290838e-05, + "loss": 1.8505, + "step": 3436 + }, + { + "epoch": 0.15022509725075398, + "grad_norm": 2.546875, + "learning_rate": 9.45607678343349e-05, + "loss": 1.9809, + "step": 3437 + }, + { + "epoch": 0.15026880545478385, + "grad_norm": 2.8125, + "learning_rate": 9.455765191936326e-05, + "loss": 1.7168, + "step": 3438 + }, + { + "epoch": 0.15031251365881376, + "grad_norm": 2.46875, + "learning_rate": 9.455453516352385e-05, + "loss": 2.7301, + "step": 3439 + }, + { + "epoch": 0.15035622186284364, + "grad_norm": 2.15625, + "learning_rate": 9.455141756687554e-05, + "loss": 2.3094, + "step": 3440 + }, + { + "epoch": 0.15039993006687355, + "grad_norm": 3.140625, + "learning_rate": 9.454829912947712e-05, + "loss": 1.5946, + "step": 3441 + }, + { + "epoch": 0.15044363827090346, + "grad_norm": 2.421875, + "learning_rate": 9.454517985138747e-05, + "loss": 1.9504, + "step": 3442 + }, + { + "epoch": 0.15048734647493334, + "grad_norm": 2.265625, + "learning_rate": 9.454205973266543e-05, + "loss": 2.2709, + "step": 3443 + }, + { + "epoch": 0.15053105467896324, + "grad_norm": 2.828125, + "learning_rate": 9.453893877336991e-05, + "loss": 1.9691, + "step": 3444 + }, + { + "epoch": 0.15057476288299312, + "grad_norm": 2.609375, + "learning_rate": 9.453581697355978e-05, + "loss": 2.2119, + "step": 3445 + }, + { + "epoch": 0.15061847108702303, + "grad_norm": 2.078125, + "learning_rate": 9.453269433329398e-05, + "loss": 1.9717, + "step": 3446 + }, + { + "epoch": 0.15066217929105294, + "grad_norm": 2.140625, + "learning_rate": 9.452957085263142e-05, + "loss": 1.919, + "step": 3447 + }, + { + "epoch": 0.15070588749508282, + "grad_norm": 2.53125, + "learning_rate": 9.452644653163104e-05, + "loss": 2.4109, + "step": 3448 + }, + { + "epoch": 0.15074959569911273, + "grad_norm": 2.46875, + "learning_rate": 9.452332137035181e-05, + "loss": 1.6877, + "step": 3449 + }, + { + "epoch": 0.15079330390314263, + "grad_norm": 2.484375, + "learning_rate": 9.452019536885271e-05, + "loss": 2.4079, + "step": 3450 + }, + { + "epoch": 0.1508370121071725, + "grad_norm": 2.203125, + "learning_rate": 9.451706852719273e-05, + "loss": 1.923, + "step": 3451 + }, + { + "epoch": 0.15088072031120242, + "grad_norm": 2.15625, + "learning_rate": 9.451394084543087e-05, + "loss": 1.6903, + "step": 3452 + }, + { + "epoch": 0.1509244285152323, + "grad_norm": 3.765625, + "learning_rate": 9.451081232362616e-05, + "loss": 1.7523, + "step": 3453 + }, + { + "epoch": 0.1509681367192622, + "grad_norm": 2.59375, + "learning_rate": 9.450768296183765e-05, + "loss": 2.4434, + "step": 3454 + }, + { + "epoch": 0.15101184492329212, + "grad_norm": 2.84375, + "learning_rate": 9.450455276012435e-05, + "loss": 1.9919, + "step": 3455 + }, + { + "epoch": 0.151055553127322, + "grad_norm": 3.359375, + "learning_rate": 9.45014217185454e-05, + "loss": 3.1905, + "step": 3456 + }, + { + "epoch": 0.1510992613313519, + "grad_norm": 2.25, + "learning_rate": 9.449828983715985e-05, + "loss": 1.9003, + "step": 3457 + }, + { + "epoch": 0.15114296953538178, + "grad_norm": 2.140625, + "learning_rate": 9.44951571160268e-05, + "loss": 1.8811, + "step": 3458 + }, + { + "epoch": 0.1511866777394117, + "grad_norm": 2.796875, + "learning_rate": 9.449202355520537e-05, + "loss": 2.2725, + "step": 3459 + }, + { + "epoch": 0.1512303859434416, + "grad_norm": 2.734375, + "learning_rate": 9.448888915475471e-05, + "loss": 1.641, + "step": 3460 + }, + { + "epoch": 0.15127409414747148, + "grad_norm": 4.875, + "learning_rate": 9.448575391473396e-05, + "loss": 1.6954, + "step": 3461 + }, + { + "epoch": 0.15131780235150138, + "grad_norm": 2.109375, + "learning_rate": 9.448261783520228e-05, + "loss": 1.897, + "step": 3462 + }, + { + "epoch": 0.15136151055553126, + "grad_norm": 2.828125, + "learning_rate": 9.447948091621886e-05, + "loss": 2.1477, + "step": 3463 + }, + { + "epoch": 0.15140521875956117, + "grad_norm": 2.125, + "learning_rate": 9.44763431578429e-05, + "loss": 1.9178, + "step": 3464 + }, + { + "epoch": 0.15144892696359108, + "grad_norm": 2.796875, + "learning_rate": 9.447320456013362e-05, + "loss": 2.9479, + "step": 3465 + }, + { + "epoch": 0.15149263516762096, + "grad_norm": 3.09375, + "learning_rate": 9.447006512315025e-05, + "loss": 2.2641, + "step": 3466 + }, + { + "epoch": 0.15153634337165087, + "grad_norm": 2.109375, + "learning_rate": 9.4466924846952e-05, + "loss": 1.8904, + "step": 3467 + }, + { + "epoch": 0.15158005157568075, + "grad_norm": 2.546875, + "learning_rate": 9.446378373159818e-05, + "loss": 1.7885, + "step": 3468 + }, + { + "epoch": 0.15162375977971065, + "grad_norm": 2.765625, + "learning_rate": 9.446064177714804e-05, + "loss": 3.0216, + "step": 3469 + }, + { + "epoch": 0.15166746798374056, + "grad_norm": 2.265625, + "learning_rate": 9.445749898366089e-05, + "loss": 2.1373, + "step": 3470 + }, + { + "epoch": 0.15171117618777044, + "grad_norm": 2.34375, + "learning_rate": 9.445435535119602e-05, + "loss": 1.9214, + "step": 3471 + }, + { + "epoch": 0.15175488439180035, + "grad_norm": 3.96875, + "learning_rate": 9.445121087981277e-05, + "loss": 2.1844, + "step": 3472 + }, + { + "epoch": 0.15179859259583023, + "grad_norm": 2.4375, + "learning_rate": 9.444806556957047e-05, + "loss": 1.8605, + "step": 3473 + }, + { + "epoch": 0.15184230079986014, + "grad_norm": 2.09375, + "learning_rate": 9.444491942052849e-05, + "loss": 1.8039, + "step": 3474 + }, + { + "epoch": 0.15188600900389004, + "grad_norm": 3.1875, + "learning_rate": 9.444177243274618e-05, + "loss": 2.4046, + "step": 3475 + }, + { + "epoch": 0.15192971720791992, + "grad_norm": 2.515625, + "learning_rate": 9.443862460628295e-05, + "loss": 2.4799, + "step": 3476 + }, + { + "epoch": 0.15197342541194983, + "grad_norm": 2.375, + "learning_rate": 9.44354759411982e-05, + "loss": 1.925, + "step": 3477 + }, + { + "epoch": 0.1520171336159797, + "grad_norm": 2.46875, + "learning_rate": 9.443232643755133e-05, + "loss": 2.4786, + "step": 3478 + }, + { + "epoch": 0.15206084182000962, + "grad_norm": 3.21875, + "learning_rate": 9.442917609540181e-05, + "loss": 1.7428, + "step": 3479 + }, + { + "epoch": 0.15210455002403953, + "grad_norm": 3.09375, + "learning_rate": 9.442602491480906e-05, + "loss": 2.5866, + "step": 3480 + }, + { + "epoch": 0.1521482582280694, + "grad_norm": 2.671875, + "learning_rate": 9.442287289583259e-05, + "loss": 1.6681, + "step": 3481 + }, + { + "epoch": 0.1521919664320993, + "grad_norm": 2.546875, + "learning_rate": 9.441972003853181e-05, + "loss": 2.4017, + "step": 3482 + }, + { + "epoch": 0.1522356746361292, + "grad_norm": 1.9921875, + "learning_rate": 9.44165663429663e-05, + "loss": 1.6264, + "step": 3483 + }, + { + "epoch": 0.1522793828401591, + "grad_norm": 2.3125, + "learning_rate": 9.441341180919551e-05, + "loss": 1.6506, + "step": 3484 + }, + { + "epoch": 0.152323091044189, + "grad_norm": 2.09375, + "learning_rate": 9.4410256437279e-05, + "loss": 1.826, + "step": 3485 + }, + { + "epoch": 0.1523667992482189, + "grad_norm": 2.75, + "learning_rate": 9.440710022727634e-05, + "loss": 2.2919, + "step": 3486 + }, + { + "epoch": 0.1524105074522488, + "grad_norm": 2.890625, + "learning_rate": 9.440394317924706e-05, + "loss": 1.9157, + "step": 3487 + }, + { + "epoch": 0.15245421565627867, + "grad_norm": 2.515625, + "learning_rate": 9.440078529325073e-05, + "loss": 1.9245, + "step": 3488 + }, + { + "epoch": 0.15249792386030858, + "grad_norm": 2.515625, + "learning_rate": 9.439762656934698e-05, + "loss": 1.9615, + "step": 3489 + }, + { + "epoch": 0.1525416320643385, + "grad_norm": 2.28125, + "learning_rate": 9.439446700759537e-05, + "loss": 2.1101, + "step": 3490 + }, + { + "epoch": 0.15258534026836837, + "grad_norm": 2.46875, + "learning_rate": 9.439130660805558e-05, + "loss": 2.4986, + "step": 3491 + }, + { + "epoch": 0.15262904847239828, + "grad_norm": 2.359375, + "learning_rate": 9.438814537078722e-05, + "loss": 1.9813, + "step": 3492 + }, + { + "epoch": 0.15267275667642816, + "grad_norm": 4.3125, + "learning_rate": 9.438498329584995e-05, + "loss": 1.785, + "step": 3493 + }, + { + "epoch": 0.15271646488045806, + "grad_norm": 2.125, + "learning_rate": 9.438182038330345e-05, + "loss": 2.1659, + "step": 3494 + }, + { + "epoch": 0.15276017308448797, + "grad_norm": 2.234375, + "learning_rate": 9.43786566332074e-05, + "loss": 1.9646, + "step": 3495 + }, + { + "epoch": 0.15280388128851785, + "grad_norm": 2.40625, + "learning_rate": 9.437549204562151e-05, + "loss": 1.9267, + "step": 3496 + }, + { + "epoch": 0.15284758949254776, + "grad_norm": 2.734375, + "learning_rate": 9.43723266206055e-05, + "loss": 1.9642, + "step": 3497 + }, + { + "epoch": 0.15289129769657764, + "grad_norm": 2.0, + "learning_rate": 9.43691603582191e-05, + "loss": 1.8211, + "step": 3498 + }, + { + "epoch": 0.15293500590060755, + "grad_norm": 2.515625, + "learning_rate": 9.436599325852208e-05, + "loss": 1.7812, + "step": 3499 + }, + { + "epoch": 0.15297871410463745, + "grad_norm": 2.21875, + "learning_rate": 9.436282532157419e-05, + "loss": 1.7518, + "step": 3500 + }, + { + "epoch": 0.15302242230866733, + "grad_norm": 2.9375, + "learning_rate": 9.435965654743522e-05, + "loss": 1.9779, + "step": 3501 + }, + { + "epoch": 0.15306613051269724, + "grad_norm": 19.5, + "learning_rate": 9.435648693616496e-05, + "loss": 0.4233, + "step": 3502 + }, + { + "epoch": 0.15310983871672712, + "grad_norm": 2.40625, + "learning_rate": 9.435331648782324e-05, + "loss": 2.0467, + "step": 3503 + }, + { + "epoch": 0.15315354692075703, + "grad_norm": 2.609375, + "learning_rate": 9.43501452024699e-05, + "loss": 1.829, + "step": 3504 + }, + { + "epoch": 0.15319725512478694, + "grad_norm": 2.453125, + "learning_rate": 9.434697308016475e-05, + "loss": 2.502, + "step": 3505 + }, + { + "epoch": 0.15324096332881681, + "grad_norm": 2.8125, + "learning_rate": 9.434380012096768e-05, + "loss": 2.0048, + "step": 3506 + }, + { + "epoch": 0.15328467153284672, + "grad_norm": 3.640625, + "learning_rate": 9.434062632493856e-05, + "loss": 1.4008, + "step": 3507 + }, + { + "epoch": 0.1533283797368766, + "grad_norm": 3.109375, + "learning_rate": 9.43374516921373e-05, + "loss": 2.3127, + "step": 3508 + }, + { + "epoch": 0.1533720879409065, + "grad_norm": 3.359375, + "learning_rate": 9.433427622262379e-05, + "loss": 2.145, + "step": 3509 + }, + { + "epoch": 0.15341579614493642, + "grad_norm": 2.375, + "learning_rate": 9.433109991645795e-05, + "loss": 1.7612, + "step": 3510 + }, + { + "epoch": 0.1534595043489663, + "grad_norm": 3.234375, + "learning_rate": 9.432792277369974e-05, + "loss": 2.0052, + "step": 3511 + }, + { + "epoch": 0.1535032125529962, + "grad_norm": 2.390625, + "learning_rate": 9.432474479440912e-05, + "loss": 1.7988, + "step": 3512 + }, + { + "epoch": 0.15354692075702608, + "grad_norm": 4.25, + "learning_rate": 9.432156597864604e-05, + "loss": 1.9816, + "step": 3513 + }, + { + "epoch": 0.153590628961056, + "grad_norm": 2.734375, + "learning_rate": 9.431838632647052e-05, + "loss": 1.6767, + "step": 3514 + }, + { + "epoch": 0.1536343371650859, + "grad_norm": 2.046875, + "learning_rate": 9.431520583794254e-05, + "loss": 1.8451, + "step": 3515 + }, + { + "epoch": 0.15367804536911578, + "grad_norm": 2.484375, + "learning_rate": 9.431202451312211e-05, + "loss": 2.6531, + "step": 3516 + }, + { + "epoch": 0.1537217535731457, + "grad_norm": 2.546875, + "learning_rate": 9.43088423520693e-05, + "loss": 2.4231, + "step": 3517 + }, + { + "epoch": 0.15376546177717557, + "grad_norm": 3.359375, + "learning_rate": 9.430565935484416e-05, + "loss": 2.4119, + "step": 3518 + }, + { + "epoch": 0.15380916998120547, + "grad_norm": 2.390625, + "learning_rate": 9.430247552150673e-05, + "loss": 2.4021, + "step": 3519 + }, + { + "epoch": 0.15385287818523538, + "grad_norm": 4.6875, + "learning_rate": 9.42992908521171e-05, + "loss": 1.9986, + "step": 3520 + }, + { + "epoch": 0.15389658638926526, + "grad_norm": 2.1875, + "learning_rate": 9.429610534673538e-05, + "loss": 1.8919, + "step": 3521 + }, + { + "epoch": 0.15394029459329517, + "grad_norm": 2.171875, + "learning_rate": 9.42929190054217e-05, + "loss": 1.787, + "step": 3522 + }, + { + "epoch": 0.15398400279732505, + "grad_norm": 3.015625, + "learning_rate": 9.428973182823616e-05, + "loss": 2.095, + "step": 3523 + }, + { + "epoch": 0.15402771100135496, + "grad_norm": 4.53125, + "learning_rate": 9.428654381523892e-05, + "loss": 1.0479, + "step": 3524 + }, + { + "epoch": 0.15407141920538486, + "grad_norm": 3.5, + "learning_rate": 9.428335496649014e-05, + "loss": 2.9287, + "step": 3525 + }, + { + "epoch": 0.15411512740941474, + "grad_norm": 2.515625, + "learning_rate": 9.428016528205001e-05, + "loss": 2.4471, + "step": 3526 + }, + { + "epoch": 0.15415883561344465, + "grad_norm": 2.8125, + "learning_rate": 9.42769747619787e-05, + "loss": 1.9132, + "step": 3527 + }, + { + "epoch": 0.15420254381747453, + "grad_norm": 2.46875, + "learning_rate": 9.427378340633645e-05, + "loss": 1.7993, + "step": 3528 + }, + { + "epoch": 0.15424625202150444, + "grad_norm": 3.859375, + "learning_rate": 9.427059121518346e-05, + "loss": 2.7769, + "step": 3529 + }, + { + "epoch": 0.15428996022553434, + "grad_norm": 3.625, + "learning_rate": 9.426739818857998e-05, + "loss": 1.8669, + "step": 3530 + }, + { + "epoch": 0.15433366842956422, + "grad_norm": 2.84375, + "learning_rate": 9.426420432658627e-05, + "loss": 2.2612, + "step": 3531 + }, + { + "epoch": 0.15437737663359413, + "grad_norm": 2.796875, + "learning_rate": 9.426100962926261e-05, + "loss": 1.631, + "step": 3532 + }, + { + "epoch": 0.154421084837624, + "grad_norm": 2.3125, + "learning_rate": 9.425781409666926e-05, + "loss": 1.6795, + "step": 3533 + }, + { + "epoch": 0.15446479304165392, + "grad_norm": 2.984375, + "learning_rate": 9.425461772886656e-05, + "loss": 3.0124, + "step": 3534 + }, + { + "epoch": 0.15450850124568383, + "grad_norm": 2.6875, + "learning_rate": 9.42514205259148e-05, + "loss": 1.6248, + "step": 3535 + }, + { + "epoch": 0.1545522094497137, + "grad_norm": 3.015625, + "learning_rate": 9.424822248787435e-05, + "loss": 2.2115, + "step": 3536 + }, + { + "epoch": 0.15459591765374361, + "grad_norm": 2.375, + "learning_rate": 9.424502361480552e-05, + "loss": 2.0061, + "step": 3537 + }, + { + "epoch": 0.1546396258577735, + "grad_norm": 2.265625, + "learning_rate": 9.424182390676872e-05, + "loss": 1.5383, + "step": 3538 + }, + { + "epoch": 0.1546833340618034, + "grad_norm": 2.421875, + "learning_rate": 9.423862336382429e-05, + "loss": 1.6646, + "step": 3539 + }, + { + "epoch": 0.1547270422658333, + "grad_norm": 2.015625, + "learning_rate": 9.423542198603267e-05, + "loss": 1.7199, + "step": 3540 + }, + { + "epoch": 0.1547707504698632, + "grad_norm": 2.40625, + "learning_rate": 9.423221977345424e-05, + "loss": 1.8018, + "step": 3541 + }, + { + "epoch": 0.1548144586738931, + "grad_norm": 2.734375, + "learning_rate": 9.422901672614946e-05, + "loss": 2.5242, + "step": 3542 + }, + { + "epoch": 0.15485816687792298, + "grad_norm": 2.875, + "learning_rate": 9.422581284417875e-05, + "loss": 2.0725, + "step": 3543 + }, + { + "epoch": 0.15490187508195288, + "grad_norm": 2.640625, + "learning_rate": 9.422260812760259e-05, + "loss": 2.1044, + "step": 3544 + }, + { + "epoch": 0.1549455832859828, + "grad_norm": 3.109375, + "learning_rate": 9.421940257648146e-05, + "loss": 1.7116, + "step": 3545 + }, + { + "epoch": 0.15498929149001267, + "grad_norm": 2.203125, + "learning_rate": 9.421619619087582e-05, + "loss": 1.9174, + "step": 3546 + }, + { + "epoch": 0.15503299969404258, + "grad_norm": 2.09375, + "learning_rate": 9.421298897084623e-05, + "loss": 1.6679, + "step": 3547 + }, + { + "epoch": 0.15507670789807246, + "grad_norm": 3.25, + "learning_rate": 9.420978091645318e-05, + "loss": 2.2195, + "step": 3548 + }, + { + "epoch": 0.15512041610210237, + "grad_norm": 2.296875, + "learning_rate": 9.420657202775722e-05, + "loss": 1.989, + "step": 3549 + }, + { + "epoch": 0.15516412430613227, + "grad_norm": 2.5625, + "learning_rate": 9.42033623048189e-05, + "loss": 1.9881, + "step": 3550 + }, + { + "epoch": 0.15520783251016215, + "grad_norm": 3.140625, + "learning_rate": 9.420015174769881e-05, + "loss": 2.7604, + "step": 3551 + }, + { + "epoch": 0.15525154071419206, + "grad_norm": 2.375, + "learning_rate": 9.419694035645751e-05, + "loss": 1.9209, + "step": 3552 + }, + { + "epoch": 0.15529524891822194, + "grad_norm": 7.90625, + "learning_rate": 9.419372813115563e-05, + "loss": 2.8501, + "step": 3553 + }, + { + "epoch": 0.15533895712225185, + "grad_norm": 2.421875, + "learning_rate": 9.419051507185378e-05, + "loss": 2.2907, + "step": 3554 + }, + { + "epoch": 0.15538266532628175, + "grad_norm": 2.53125, + "learning_rate": 9.418730117861259e-05, + "loss": 2.0245, + "step": 3555 + }, + { + "epoch": 0.15542637353031163, + "grad_norm": 2.34375, + "learning_rate": 9.418408645149273e-05, + "loss": 1.9516, + "step": 3556 + }, + { + "epoch": 0.15547008173434154, + "grad_norm": 3.125, + "learning_rate": 9.418087089055484e-05, + "loss": 2.2403, + "step": 3557 + }, + { + "epoch": 0.15551378993837142, + "grad_norm": 2.375, + "learning_rate": 9.417765449585961e-05, + "loss": 2.1481, + "step": 3558 + }, + { + "epoch": 0.15555749814240133, + "grad_norm": 2.234375, + "learning_rate": 9.417443726746776e-05, + "loss": 2.1226, + "step": 3559 + }, + { + "epoch": 0.15560120634643124, + "grad_norm": 2.203125, + "learning_rate": 9.417121920543996e-05, + "loss": 2.2953, + "step": 3560 + }, + { + "epoch": 0.15564491455046112, + "grad_norm": 3.515625, + "learning_rate": 9.416800030983699e-05, + "loss": 1.9059, + "step": 3561 + }, + { + "epoch": 0.15568862275449102, + "grad_norm": 2.796875, + "learning_rate": 9.416478058071956e-05, + "loss": 2.1399, + "step": 3562 + }, + { + "epoch": 0.1557323309585209, + "grad_norm": 2.1875, + "learning_rate": 9.416156001814843e-05, + "loss": 2.1246, + "step": 3563 + }, + { + "epoch": 0.1557760391625508, + "grad_norm": 2.65625, + "learning_rate": 9.41583386221844e-05, + "loss": 1.8416, + "step": 3564 + }, + { + "epoch": 0.15581974736658072, + "grad_norm": 2.984375, + "learning_rate": 9.415511639288826e-05, + "loss": 2.3951, + "step": 3565 + }, + { + "epoch": 0.1558634555706106, + "grad_norm": 2.921875, + "learning_rate": 9.41518933303208e-05, + "loss": 2.3691, + "step": 3566 + }, + { + "epoch": 0.1559071637746405, + "grad_norm": 2.609375, + "learning_rate": 9.414866943454284e-05, + "loss": 2.3471, + "step": 3567 + }, + { + "epoch": 0.15595087197867039, + "grad_norm": 3.421875, + "learning_rate": 9.414544470561524e-05, + "loss": 2.5009, + "step": 3568 + }, + { + "epoch": 0.1559945801827003, + "grad_norm": 2.640625, + "learning_rate": 9.414221914359886e-05, + "loss": 2.6902, + "step": 3569 + }, + { + "epoch": 0.1560382883867302, + "grad_norm": 3.34375, + "learning_rate": 9.413899274855454e-05, + "loss": 2.5366, + "step": 3570 + }, + { + "epoch": 0.15608199659076008, + "grad_norm": 2.375, + "learning_rate": 9.41357655205432e-05, + "loss": 1.9691, + "step": 3571 + }, + { + "epoch": 0.15612570479479, + "grad_norm": 4.6875, + "learning_rate": 9.413253745962573e-05, + "loss": 2.2237, + "step": 3572 + }, + { + "epoch": 0.15616941299881987, + "grad_norm": 2.125, + "learning_rate": 9.412930856586304e-05, + "loss": 2.151, + "step": 3573 + }, + { + "epoch": 0.15621312120284978, + "grad_norm": 2.34375, + "learning_rate": 9.412607883931607e-05, + "loss": 1.9646, + "step": 3574 + }, + { + "epoch": 0.15625682940687968, + "grad_norm": 2.71875, + "learning_rate": 9.412284828004577e-05, + "loss": 1.9428, + "step": 3575 + }, + { + "epoch": 0.15630053761090956, + "grad_norm": 2.578125, + "learning_rate": 9.41196168881131e-05, + "loss": 2.2769, + "step": 3576 + }, + { + "epoch": 0.15634424581493947, + "grad_norm": 2.078125, + "learning_rate": 9.411638466357906e-05, + "loss": 1.6582, + "step": 3577 + }, + { + "epoch": 0.15638795401896935, + "grad_norm": 2.03125, + "learning_rate": 9.411315160650462e-05, + "loss": 1.6979, + "step": 3578 + }, + { + "epoch": 0.15643166222299926, + "grad_norm": 2.1875, + "learning_rate": 9.410991771695082e-05, + "loss": 2.0957, + "step": 3579 + }, + { + "epoch": 0.15647537042702916, + "grad_norm": 2.9375, + "learning_rate": 9.410668299497864e-05, + "loss": 1.7496, + "step": 3580 + }, + { + "epoch": 0.15651907863105904, + "grad_norm": 2.25, + "learning_rate": 9.410344744064919e-05, + "loss": 1.6589, + "step": 3581 + }, + { + "epoch": 0.15656278683508895, + "grad_norm": 3.140625, + "learning_rate": 9.410021105402348e-05, + "loss": 3.1007, + "step": 3582 + }, + { + "epoch": 0.15660649503911883, + "grad_norm": 2.4375, + "learning_rate": 9.409697383516263e-05, + "loss": 2.4073, + "step": 3583 + }, + { + "epoch": 0.15665020324314874, + "grad_norm": 2.09375, + "learning_rate": 9.409373578412769e-05, + "loss": 2.0086, + "step": 3584 + }, + { + "epoch": 0.15669391144717865, + "grad_norm": 2.78125, + "learning_rate": 9.409049690097977e-05, + "loss": 1.5228, + "step": 3585 + }, + { + "epoch": 0.15673761965120853, + "grad_norm": 3.0, + "learning_rate": 9.408725718578e-05, + "loss": 2.3846, + "step": 3586 + }, + { + "epoch": 0.15678132785523843, + "grad_norm": 2.859375, + "learning_rate": 9.408401663858953e-05, + "loss": 1.5352, + "step": 3587 + }, + { + "epoch": 0.1568250360592683, + "grad_norm": 2.328125, + "learning_rate": 9.408077525946952e-05, + "loss": 2.0641, + "step": 3588 + }, + { + "epoch": 0.15686874426329822, + "grad_norm": 2.375, + "learning_rate": 9.40775330484811e-05, + "loss": 2.0999, + "step": 3589 + }, + { + "epoch": 0.15691245246732813, + "grad_norm": 2.09375, + "learning_rate": 9.407429000568549e-05, + "loss": 1.557, + "step": 3590 + }, + { + "epoch": 0.156956160671358, + "grad_norm": 2.140625, + "learning_rate": 9.407104613114388e-05, + "loss": 1.735, + "step": 3591 + }, + { + "epoch": 0.15699986887538792, + "grad_norm": 2.484375, + "learning_rate": 9.406780142491748e-05, + "loss": 1.8904, + "step": 3592 + }, + { + "epoch": 0.1570435770794178, + "grad_norm": 11.9375, + "learning_rate": 9.406455588706752e-05, + "loss": 2.161, + "step": 3593 + }, + { + "epoch": 0.1570872852834477, + "grad_norm": 2.640625, + "learning_rate": 9.406130951765529e-05, + "loss": 2.2186, + "step": 3594 + }, + { + "epoch": 0.1571309934874776, + "grad_norm": 2.171875, + "learning_rate": 9.405806231674202e-05, + "loss": 1.8745, + "step": 3595 + }, + { + "epoch": 0.1571747016915075, + "grad_norm": 2.25, + "learning_rate": 9.405481428438896e-05, + "loss": 1.8984, + "step": 3596 + }, + { + "epoch": 0.1572184098955374, + "grad_norm": 2.890625, + "learning_rate": 9.405156542065745e-05, + "loss": 1.8932, + "step": 3597 + }, + { + "epoch": 0.15726211809956728, + "grad_norm": 2.15625, + "learning_rate": 9.404831572560879e-05, + "loss": 2.3254, + "step": 3598 + }, + { + "epoch": 0.15730582630359718, + "grad_norm": 2.71875, + "learning_rate": 9.40450651993043e-05, + "loss": 2.3937, + "step": 3599 + }, + { + "epoch": 0.1573495345076271, + "grad_norm": 2.375, + "learning_rate": 9.404181384180532e-05, + "loss": 2.4118, + "step": 3600 + }, + { + "epoch": 0.15739324271165697, + "grad_norm": 2.65625, + "learning_rate": 9.403856165317321e-05, + "loss": 2.095, + "step": 3601 + }, + { + "epoch": 0.15743695091568688, + "grad_norm": 2.578125, + "learning_rate": 9.403530863346937e-05, + "loss": 2.1934, + "step": 3602 + }, + { + "epoch": 0.15748065911971676, + "grad_norm": 2.875, + "learning_rate": 9.403205478275514e-05, + "loss": 1.9076, + "step": 3603 + }, + { + "epoch": 0.15752436732374667, + "grad_norm": 2.4375, + "learning_rate": 9.402880010109196e-05, + "loss": 2.0982, + "step": 3604 + }, + { + "epoch": 0.15756807552777657, + "grad_norm": 2.359375, + "learning_rate": 9.402554458854125e-05, + "loss": 2.1694, + "step": 3605 + }, + { + "epoch": 0.15761178373180645, + "grad_norm": 2.609375, + "learning_rate": 9.402228824516442e-05, + "loss": 2.0995, + "step": 3606 + }, + { + "epoch": 0.15765549193583636, + "grad_norm": 2.546875, + "learning_rate": 9.401903107102296e-05, + "loss": 2.2792, + "step": 3607 + }, + { + "epoch": 0.15769920013986624, + "grad_norm": 2.421875, + "learning_rate": 9.40157730661783e-05, + "loss": 1.9848, + "step": 3608 + }, + { + "epoch": 0.15774290834389615, + "grad_norm": 2.453125, + "learning_rate": 9.401251423069194e-05, + "loss": 1.9047, + "step": 3609 + }, + { + "epoch": 0.15778661654792606, + "grad_norm": 2.21875, + "learning_rate": 9.400925456462539e-05, + "loss": 1.9244, + "step": 3610 + }, + { + "epoch": 0.15783032475195594, + "grad_norm": 3.375, + "learning_rate": 9.400599406804016e-05, + "loss": 2.0467, + "step": 3611 + }, + { + "epoch": 0.15787403295598584, + "grad_norm": 2.453125, + "learning_rate": 9.400273274099776e-05, + "loss": 1.9429, + "step": 3612 + }, + { + "epoch": 0.15791774116001572, + "grad_norm": 2.34375, + "learning_rate": 9.399947058355976e-05, + "loss": 1.8686, + "step": 3613 + }, + { + "epoch": 0.15796144936404563, + "grad_norm": 2.328125, + "learning_rate": 9.399620759578769e-05, + "loss": 1.9697, + "step": 3614 + }, + { + "epoch": 0.15800515756807554, + "grad_norm": 2.921875, + "learning_rate": 9.399294377774318e-05, + "loss": 2.0929, + "step": 3615 + }, + { + "epoch": 0.15804886577210542, + "grad_norm": 2.703125, + "learning_rate": 9.398967912948778e-05, + "loss": 2.5104, + "step": 3616 + }, + { + "epoch": 0.15809257397613533, + "grad_norm": 2.296875, + "learning_rate": 9.398641365108309e-05, + "loss": 2.3617, + "step": 3617 + }, + { + "epoch": 0.1581362821801652, + "grad_norm": 2.71875, + "learning_rate": 9.398314734259078e-05, + "loss": 2.0578, + "step": 3618 + }, + { + "epoch": 0.1581799903841951, + "grad_norm": 6.15625, + "learning_rate": 9.397988020407246e-05, + "loss": 2.7893, + "step": 3619 + }, + { + "epoch": 0.15822369858822502, + "grad_norm": 2.34375, + "learning_rate": 9.397661223558979e-05, + "loss": 1.8977, + "step": 3620 + }, + { + "epoch": 0.1582674067922549, + "grad_norm": 2.46875, + "learning_rate": 9.397334343720445e-05, + "loss": 1.6987, + "step": 3621 + }, + { + "epoch": 0.1583111149962848, + "grad_norm": 3.0, + "learning_rate": 9.39700738089781e-05, + "loss": 2.5893, + "step": 3622 + }, + { + "epoch": 0.1583548232003147, + "grad_norm": 2.25, + "learning_rate": 9.396680335097247e-05, + "loss": 1.7377, + "step": 3623 + }, + { + "epoch": 0.1583985314043446, + "grad_norm": 2.59375, + "learning_rate": 9.396353206324929e-05, + "loss": 2.4146, + "step": 3624 + }, + { + "epoch": 0.1584422396083745, + "grad_norm": 2.046875, + "learning_rate": 9.396025994587024e-05, + "loss": 1.7168, + "step": 3625 + }, + { + "epoch": 0.15848594781240438, + "grad_norm": 2.1875, + "learning_rate": 9.395698699889713e-05, + "loss": 1.5608, + "step": 3626 + }, + { + "epoch": 0.1585296560164343, + "grad_norm": 2.09375, + "learning_rate": 9.395371322239168e-05, + "loss": 1.6012, + "step": 3627 + }, + { + "epoch": 0.15857336422046417, + "grad_norm": 2.421875, + "learning_rate": 9.395043861641571e-05, + "loss": 2.1641, + "step": 3628 + }, + { + "epoch": 0.15861707242449408, + "grad_norm": 2.78125, + "learning_rate": 9.394716318103098e-05, + "loss": 2.3077, + "step": 3629 + }, + { + "epoch": 0.15866078062852398, + "grad_norm": 2.515625, + "learning_rate": 9.394388691629932e-05, + "loss": 1.7936, + "step": 3630 + }, + { + "epoch": 0.15870448883255386, + "grad_norm": 3.484375, + "learning_rate": 9.394060982228257e-05, + "loss": 1.8558, + "step": 3631 + }, + { + "epoch": 0.15874819703658377, + "grad_norm": 3.140625, + "learning_rate": 9.393733189904254e-05, + "loss": 1.8328, + "step": 3632 + }, + { + "epoch": 0.15879190524061365, + "grad_norm": 2.265625, + "learning_rate": 9.393405314664113e-05, + "loss": 2.028, + "step": 3633 + }, + { + "epoch": 0.15883561344464356, + "grad_norm": 2.484375, + "learning_rate": 9.393077356514018e-05, + "loss": 1.8092, + "step": 3634 + }, + { + "epoch": 0.15887932164867347, + "grad_norm": 2.453125, + "learning_rate": 9.392749315460161e-05, + "loss": 1.8547, + "step": 3635 + }, + { + "epoch": 0.15892302985270335, + "grad_norm": 2.59375, + "learning_rate": 9.392421191508729e-05, + "loss": 1.7204, + "step": 3636 + }, + { + "epoch": 0.15896673805673325, + "grad_norm": 2.65625, + "learning_rate": 9.392092984665918e-05, + "loss": 3.101, + "step": 3637 + }, + { + "epoch": 0.15901044626076313, + "grad_norm": 2.546875, + "learning_rate": 9.391764694937919e-05, + "loss": 2.1922, + "step": 3638 + }, + { + "epoch": 0.15905415446479304, + "grad_norm": 3.0625, + "learning_rate": 9.391436322330928e-05, + "loss": 1.7696, + "step": 3639 + }, + { + "epoch": 0.15909786266882295, + "grad_norm": 3.390625, + "learning_rate": 9.391107866851143e-05, + "loss": 2.4794, + "step": 3640 + }, + { + "epoch": 0.15914157087285283, + "grad_norm": 2.140625, + "learning_rate": 9.390779328504762e-05, + "loss": 2.1426, + "step": 3641 + }, + { + "epoch": 0.15918527907688274, + "grad_norm": 2.359375, + "learning_rate": 9.390450707297984e-05, + "loss": 1.9995, + "step": 3642 + }, + { + "epoch": 0.15922898728091261, + "grad_norm": 2.484375, + "learning_rate": 9.39012200323701e-05, + "loss": 2.6311, + "step": 3643 + }, + { + "epoch": 0.15927269548494252, + "grad_norm": 2.0625, + "learning_rate": 9.389793216328047e-05, + "loss": 1.9958, + "step": 3644 + }, + { + "epoch": 0.15931640368897243, + "grad_norm": 2.03125, + "learning_rate": 9.389464346577295e-05, + "loss": 1.8677, + "step": 3645 + }, + { + "epoch": 0.1593601118930023, + "grad_norm": 3.53125, + "learning_rate": 9.389135393990962e-05, + "loss": 2.6298, + "step": 3646 + }, + { + "epoch": 0.15940382009703222, + "grad_norm": 3.296875, + "learning_rate": 9.388806358575256e-05, + "loss": 2.1751, + "step": 3647 + }, + { + "epoch": 0.1594475283010621, + "grad_norm": 3.546875, + "learning_rate": 9.388477240336387e-05, + "loss": 2.0166, + "step": 3648 + }, + { + "epoch": 0.159491236505092, + "grad_norm": 3.140625, + "learning_rate": 9.388148039280566e-05, + "loss": 2.4338, + "step": 3649 + }, + { + "epoch": 0.1595349447091219, + "grad_norm": 2.65625, + "learning_rate": 9.387818755414004e-05, + "loss": 1.426, + "step": 3650 + }, + { + "epoch": 0.1595786529131518, + "grad_norm": 2.25, + "learning_rate": 9.387489388742917e-05, + "loss": 1.8721, + "step": 3651 + }, + { + "epoch": 0.1596223611171817, + "grad_norm": 2.21875, + "learning_rate": 9.387159939273518e-05, + "loss": 1.9787, + "step": 3652 + }, + { + "epoch": 0.15966606932121158, + "grad_norm": 3.4375, + "learning_rate": 9.386830407012026e-05, + "loss": 2.2221, + "step": 3653 + }, + { + "epoch": 0.1597097775252415, + "grad_norm": 4.90625, + "learning_rate": 9.386500791964661e-05, + "loss": 2.0422, + "step": 3654 + }, + { + "epoch": 0.1597534857292714, + "grad_norm": 3.140625, + "learning_rate": 9.38617109413764e-05, + "loss": 2.3686, + "step": 3655 + }, + { + "epoch": 0.15979719393330127, + "grad_norm": 3.546875, + "learning_rate": 9.385841313537187e-05, + "loss": 2.6774, + "step": 3656 + }, + { + "epoch": 0.15984090213733118, + "grad_norm": 12.375, + "learning_rate": 9.385511450169525e-05, + "loss": 2.3355, + "step": 3657 + }, + { + "epoch": 0.15988461034136106, + "grad_norm": 2.671875, + "learning_rate": 9.385181504040881e-05, + "loss": 2.1482, + "step": 3658 + }, + { + "epoch": 0.15992831854539097, + "grad_norm": 2.375, + "learning_rate": 9.384851475157477e-05, + "loss": 2.1601, + "step": 3659 + }, + { + "epoch": 0.15997202674942088, + "grad_norm": 2.453125, + "learning_rate": 9.384521363525546e-05, + "loss": 2.2922, + "step": 3660 + }, + { + "epoch": 0.16001573495345076, + "grad_norm": 2.671875, + "learning_rate": 9.384191169151313e-05, + "loss": 2.2924, + "step": 3661 + }, + { + "epoch": 0.16005944315748066, + "grad_norm": 2.6875, + "learning_rate": 9.383860892041014e-05, + "loss": 2.3593, + "step": 3662 + }, + { + "epoch": 0.16010315136151054, + "grad_norm": 2.234375, + "learning_rate": 9.383530532200879e-05, + "loss": 2.0018, + "step": 3663 + }, + { + "epoch": 0.16014685956554045, + "grad_norm": 2.671875, + "learning_rate": 9.383200089637143e-05, + "loss": 1.9835, + "step": 3664 + }, + { + "epoch": 0.16019056776957036, + "grad_norm": 2.4375, + "learning_rate": 9.382869564356043e-05, + "loss": 2.4303, + "step": 3665 + }, + { + "epoch": 0.16023427597360024, + "grad_norm": 2.578125, + "learning_rate": 9.382538956363813e-05, + "loss": 2.014, + "step": 3666 + }, + { + "epoch": 0.16027798417763015, + "grad_norm": 2.125, + "learning_rate": 9.382208265666695e-05, + "loss": 1.8586, + "step": 3667 + }, + { + "epoch": 0.16032169238166002, + "grad_norm": 2.234375, + "learning_rate": 9.38187749227093e-05, + "loss": 2.2283, + "step": 3668 + }, + { + "epoch": 0.16036540058568993, + "grad_norm": 3.3125, + "learning_rate": 9.381546636182758e-05, + "loss": 2.8335, + "step": 3669 + }, + { + "epoch": 0.16040910878971984, + "grad_norm": 3.5625, + "learning_rate": 9.381215697408426e-05, + "loss": 1.6542, + "step": 3670 + }, + { + "epoch": 0.16045281699374972, + "grad_norm": 2.90625, + "learning_rate": 9.380884675954176e-05, + "loss": 2.2729, + "step": 3671 + }, + { + "epoch": 0.16049652519777963, + "grad_norm": 2.390625, + "learning_rate": 9.380553571826256e-05, + "loss": 1.8695, + "step": 3672 + }, + { + "epoch": 0.1605402334018095, + "grad_norm": 2.3125, + "learning_rate": 9.380222385030915e-05, + "loss": 1.9721, + "step": 3673 + }, + { + "epoch": 0.16058394160583941, + "grad_norm": 2.25, + "learning_rate": 9.379891115574402e-05, + "loss": 2.1319, + "step": 3674 + }, + { + "epoch": 0.16062764980986932, + "grad_norm": 2.453125, + "learning_rate": 9.379559763462968e-05, + "loss": 2.5423, + "step": 3675 + }, + { + "epoch": 0.1606713580138992, + "grad_norm": 2.46875, + "learning_rate": 9.379228328702868e-05, + "loss": 1.8878, + "step": 3676 + }, + { + "epoch": 0.1607150662179291, + "grad_norm": 2.59375, + "learning_rate": 9.378896811300356e-05, + "loss": 2.2672, + "step": 3677 + }, + { + "epoch": 0.160758774421959, + "grad_norm": 2.671875, + "learning_rate": 9.378565211261687e-05, + "loss": 1.9762, + "step": 3678 + }, + { + "epoch": 0.1608024826259889, + "grad_norm": 2.328125, + "learning_rate": 9.378233528593121e-05, + "loss": 2.1997, + "step": 3679 + }, + { + "epoch": 0.1608461908300188, + "grad_norm": 2.375, + "learning_rate": 9.377901763300916e-05, + "loss": 1.9741, + "step": 3680 + }, + { + "epoch": 0.16088989903404868, + "grad_norm": 2.953125, + "learning_rate": 9.377569915391333e-05, + "loss": 2.4164, + "step": 3681 + }, + { + "epoch": 0.1609336072380786, + "grad_norm": 2.40625, + "learning_rate": 9.377237984870634e-05, + "loss": 2.2117, + "step": 3682 + }, + { + "epoch": 0.16097731544210847, + "grad_norm": 2.59375, + "learning_rate": 9.376905971745085e-05, + "loss": 2.0477, + "step": 3683 + }, + { + "epoch": 0.16102102364613838, + "grad_norm": 2.515625, + "learning_rate": 9.376573876020948e-05, + "loss": 1.9321, + "step": 3684 + }, + { + "epoch": 0.16106473185016829, + "grad_norm": 2.703125, + "learning_rate": 9.376241697704493e-05, + "loss": 2.2536, + "step": 3685 + }, + { + "epoch": 0.16110844005419817, + "grad_norm": 2.09375, + "learning_rate": 9.375909436801988e-05, + "loss": 1.8819, + "step": 3686 + }, + { + "epoch": 0.16115214825822807, + "grad_norm": 2.453125, + "learning_rate": 9.375577093319701e-05, + "loss": 1.461, + "step": 3687 + }, + { + "epoch": 0.16119585646225795, + "grad_norm": 3.046875, + "learning_rate": 9.37524466726391e-05, + "loss": 2.1925, + "step": 3688 + }, + { + "epoch": 0.16123956466628786, + "grad_norm": 2.453125, + "learning_rate": 9.37491215864088e-05, + "loss": 1.9754, + "step": 3689 + }, + { + "epoch": 0.16128327287031777, + "grad_norm": 2.625, + "learning_rate": 9.374579567456892e-05, + "loss": 1.9294, + "step": 3690 + }, + { + "epoch": 0.16132698107434765, + "grad_norm": 2.5, + "learning_rate": 9.374246893718221e-05, + "loss": 1.5906, + "step": 3691 + }, + { + "epoch": 0.16137068927837755, + "grad_norm": 2.734375, + "learning_rate": 9.373914137431146e-05, + "loss": 2.2835, + "step": 3692 + }, + { + "epoch": 0.16141439748240743, + "grad_norm": 2.296875, + "learning_rate": 9.373581298601943e-05, + "loss": 2.0546, + "step": 3693 + }, + { + "epoch": 0.16145810568643734, + "grad_norm": 2.453125, + "learning_rate": 9.373248377236896e-05, + "loss": 1.9919, + "step": 3694 + }, + { + "epoch": 0.16150181389046725, + "grad_norm": 2.921875, + "learning_rate": 9.372915373342288e-05, + "loss": 1.7252, + "step": 3695 + }, + { + "epoch": 0.16154552209449713, + "grad_norm": 2.046875, + "learning_rate": 9.3725822869244e-05, + "loss": 1.6739, + "step": 3696 + }, + { + "epoch": 0.16158923029852704, + "grad_norm": 2.796875, + "learning_rate": 9.372249117989523e-05, + "loss": 2.5599, + "step": 3697 + }, + { + "epoch": 0.16163293850255692, + "grad_norm": 2.921875, + "learning_rate": 9.37191586654394e-05, + "loss": 1.8535, + "step": 3698 + }, + { + "epoch": 0.16167664670658682, + "grad_norm": 2.359375, + "learning_rate": 9.371582532593943e-05, + "loss": 2.326, + "step": 3699 + }, + { + "epoch": 0.16172035491061673, + "grad_norm": 2.21875, + "learning_rate": 9.37124911614582e-05, + "loss": 1.3687, + "step": 3700 + }, + { + "epoch": 0.1617640631146466, + "grad_norm": 2.40625, + "learning_rate": 9.370915617205865e-05, + "loss": 1.9865, + "step": 3701 + }, + { + "epoch": 0.16180777131867652, + "grad_norm": 2.53125, + "learning_rate": 9.370582035780371e-05, + "loss": 2.1977, + "step": 3702 + }, + { + "epoch": 0.1618514795227064, + "grad_norm": 2.453125, + "learning_rate": 9.370248371875631e-05, + "loss": 2.3067, + "step": 3703 + }, + { + "epoch": 0.1618951877267363, + "grad_norm": 2.328125, + "learning_rate": 9.369914625497945e-05, + "loss": 1.9207, + "step": 3704 + }, + { + "epoch": 0.1619388959307662, + "grad_norm": 3.171875, + "learning_rate": 9.36958079665361e-05, + "loss": 2.5128, + "step": 3705 + }, + { + "epoch": 0.1619826041347961, + "grad_norm": 3.765625, + "learning_rate": 9.369246885348926e-05, + "loss": 2.426, + "step": 3706 + }, + { + "epoch": 0.162026312338826, + "grad_norm": 2.21875, + "learning_rate": 9.368912891590192e-05, + "loss": 1.9204, + "step": 3707 + }, + { + "epoch": 0.16207002054285588, + "grad_norm": 3.3125, + "learning_rate": 9.368578815383715e-05, + "loss": 1.9233, + "step": 3708 + }, + { + "epoch": 0.1621137287468858, + "grad_norm": 2.265625, + "learning_rate": 9.368244656735798e-05, + "loss": 1.6875, + "step": 3709 + }, + { + "epoch": 0.1621574369509157, + "grad_norm": 3.015625, + "learning_rate": 9.367910415652745e-05, + "loss": 2.2647, + "step": 3710 + }, + { + "epoch": 0.16220114515494558, + "grad_norm": 2.171875, + "learning_rate": 9.367576092140866e-05, + "loss": 2.1075, + "step": 3711 + }, + { + "epoch": 0.16224485335897548, + "grad_norm": 2.4375, + "learning_rate": 9.367241686206469e-05, + "loss": 1.6637, + "step": 3712 + }, + { + "epoch": 0.16228856156300536, + "grad_norm": 4.84375, + "learning_rate": 9.366907197855868e-05, + "loss": 2.8638, + "step": 3713 + }, + { + "epoch": 0.16233226976703527, + "grad_norm": 3.84375, + "learning_rate": 9.366572627095369e-05, + "loss": 1.9139, + "step": 3714 + }, + { + "epoch": 0.16237597797106518, + "grad_norm": 2.65625, + "learning_rate": 9.366237973931291e-05, + "loss": 2.1883, + "step": 3715 + }, + { + "epoch": 0.16241968617509506, + "grad_norm": 2.40625, + "learning_rate": 9.365903238369946e-05, + "loss": 1.3408, + "step": 3716 + }, + { + "epoch": 0.16246339437912496, + "grad_norm": 2.359375, + "learning_rate": 9.365568420417655e-05, + "loss": 2.2424, + "step": 3717 + }, + { + "epoch": 0.16250710258315484, + "grad_norm": 2.546875, + "learning_rate": 9.365233520080731e-05, + "loss": 2.0933, + "step": 3718 + }, + { + "epoch": 0.16255081078718475, + "grad_norm": 2.640625, + "learning_rate": 9.364898537365501e-05, + "loss": 1.9398, + "step": 3719 + }, + { + "epoch": 0.16259451899121466, + "grad_norm": 2.21875, + "learning_rate": 9.36456347227828e-05, + "loss": 1.7613, + "step": 3720 + }, + { + "epoch": 0.16263822719524454, + "grad_norm": 2.546875, + "learning_rate": 9.364228324825395e-05, + "loss": 1.9097, + "step": 3721 + }, + { + "epoch": 0.16268193539927445, + "grad_norm": 2.328125, + "learning_rate": 9.36389309501317e-05, + "loss": 2.2346, + "step": 3722 + }, + { + "epoch": 0.16272564360330435, + "grad_norm": 2.640625, + "learning_rate": 9.36355778284793e-05, + "loss": 2.4336, + "step": 3723 + }, + { + "epoch": 0.16276935180733423, + "grad_norm": 2.453125, + "learning_rate": 9.363222388336004e-05, + "loss": 2.0523, + "step": 3724 + }, + { + "epoch": 0.16281306001136414, + "grad_norm": 2.9375, + "learning_rate": 9.362886911483722e-05, + "loss": 1.5397, + "step": 3725 + }, + { + "epoch": 0.16285676821539402, + "grad_norm": 5.84375, + "learning_rate": 9.362551352297413e-05, + "loss": 1.9769, + "step": 3726 + }, + { + "epoch": 0.16290047641942393, + "grad_norm": 2.265625, + "learning_rate": 9.362215710783411e-05, + "loss": 1.7988, + "step": 3727 + }, + { + "epoch": 0.16294418462345384, + "grad_norm": 1.8984375, + "learning_rate": 9.36187998694805e-05, + "loss": 1.6174, + "step": 3728 + }, + { + "epoch": 0.16298789282748372, + "grad_norm": 2.296875, + "learning_rate": 9.361544180797665e-05, + "loss": 2.2354, + "step": 3729 + }, + { + "epoch": 0.16303160103151362, + "grad_norm": 2.125, + "learning_rate": 9.361208292338593e-05, + "loss": 2.0043, + "step": 3730 + }, + { + "epoch": 0.1630753092355435, + "grad_norm": 2.53125, + "learning_rate": 9.360872321577174e-05, + "loss": 2.1913, + "step": 3731 + }, + { + "epoch": 0.1631190174395734, + "grad_norm": 2.828125, + "learning_rate": 9.360536268519746e-05, + "loss": 1.9443, + "step": 3732 + }, + { + "epoch": 0.16316272564360332, + "grad_norm": 2.640625, + "learning_rate": 9.360200133172655e-05, + "loss": 2.0476, + "step": 3733 + }, + { + "epoch": 0.1632064338476332, + "grad_norm": 2.390625, + "learning_rate": 9.359863915542238e-05, + "loss": 1.656, + "step": 3734 + }, + { + "epoch": 0.1632501420516631, + "grad_norm": 2.21875, + "learning_rate": 9.359527615634844e-05, + "loss": 1.7408, + "step": 3735 + }, + { + "epoch": 0.16329385025569298, + "grad_norm": 3.328125, + "learning_rate": 9.359191233456821e-05, + "loss": 1.8783, + "step": 3736 + }, + { + "epoch": 0.1633375584597229, + "grad_norm": 4.53125, + "learning_rate": 9.358854769014513e-05, + "loss": 2.5117, + "step": 3737 + }, + { + "epoch": 0.1633812666637528, + "grad_norm": 2.59375, + "learning_rate": 9.358518222314272e-05, + "loss": 2.0793, + "step": 3738 + }, + { + "epoch": 0.16342497486778268, + "grad_norm": 2.109375, + "learning_rate": 9.35818159336245e-05, + "loss": 1.7235, + "step": 3739 + }, + { + "epoch": 0.1634686830718126, + "grad_norm": 2.375, + "learning_rate": 9.357844882165396e-05, + "loss": 2.2602, + "step": 3740 + }, + { + "epoch": 0.16351239127584247, + "grad_norm": 2.75, + "learning_rate": 9.357508088729468e-05, + "loss": 1.9349, + "step": 3741 + }, + { + "epoch": 0.16355609947987237, + "grad_norm": 2.46875, + "learning_rate": 9.357171213061021e-05, + "loss": 1.54, + "step": 3742 + }, + { + "epoch": 0.16359980768390228, + "grad_norm": 2.71875, + "learning_rate": 9.356834255166409e-05, + "loss": 2.17, + "step": 3743 + }, + { + "epoch": 0.16364351588793216, + "grad_norm": 2.34375, + "learning_rate": 9.356497215051996e-05, + "loss": 2.0412, + "step": 3744 + }, + { + "epoch": 0.16368722409196207, + "grad_norm": 2.703125, + "learning_rate": 9.356160092724138e-05, + "loss": 2.0443, + "step": 3745 + }, + { + "epoch": 0.16373093229599195, + "grad_norm": 5.4375, + "learning_rate": 9.355822888189201e-05, + "loss": 2.7347, + "step": 3746 + }, + { + "epoch": 0.16377464050002186, + "grad_norm": 2.109375, + "learning_rate": 9.355485601453545e-05, + "loss": 2.0591, + "step": 3747 + }, + { + "epoch": 0.16381834870405176, + "grad_norm": 2.75, + "learning_rate": 9.355148232523537e-05, + "loss": 1.9002, + "step": 3748 + }, + { + "epoch": 0.16386205690808164, + "grad_norm": 2.1875, + "learning_rate": 9.354810781405543e-05, + "loss": 1.8313, + "step": 3749 + }, + { + "epoch": 0.16390576511211155, + "grad_norm": 2.15625, + "learning_rate": 9.354473248105932e-05, + "loss": 1.8867, + "step": 3750 + }, + { + "epoch": 0.16394947331614143, + "grad_norm": 2.265625, + "learning_rate": 9.354135632631073e-05, + "loss": 2.0169, + "step": 3751 + }, + { + "epoch": 0.16399318152017134, + "grad_norm": 2.90625, + "learning_rate": 9.353797934987338e-05, + "loss": 1.8563, + "step": 3752 + }, + { + "epoch": 0.16403688972420125, + "grad_norm": 3.21875, + "learning_rate": 9.353460155181098e-05, + "loss": 3.3005, + "step": 3753 + }, + { + "epoch": 0.16408059792823113, + "grad_norm": 2.328125, + "learning_rate": 9.35312229321873e-05, + "loss": 1.9956, + "step": 3754 + }, + { + "epoch": 0.16412430613226103, + "grad_norm": 1.9921875, + "learning_rate": 9.352784349106608e-05, + "loss": 1.6531, + "step": 3755 + }, + { + "epoch": 0.1641680143362909, + "grad_norm": 2.40625, + "learning_rate": 9.35244632285111e-05, + "loss": 1.8427, + "step": 3756 + }, + { + "epoch": 0.16421172254032082, + "grad_norm": 2.546875, + "learning_rate": 9.352108214458616e-05, + "loss": 2.2344, + "step": 3757 + }, + { + "epoch": 0.16425543074435073, + "grad_norm": 3.078125, + "learning_rate": 9.351770023935506e-05, + "loss": 1.8037, + "step": 3758 + }, + { + "epoch": 0.1642991389483806, + "grad_norm": 3.4375, + "learning_rate": 9.35143175128816e-05, + "loss": 1.6607, + "step": 3759 + }, + { + "epoch": 0.16434284715241051, + "grad_norm": 2.828125, + "learning_rate": 9.351093396522965e-05, + "loss": 2.2265, + "step": 3760 + }, + { + "epoch": 0.1643865553564404, + "grad_norm": 2.515625, + "learning_rate": 9.350754959646306e-05, + "loss": 1.9707, + "step": 3761 + }, + { + "epoch": 0.1644302635604703, + "grad_norm": 2.484375, + "learning_rate": 9.350416440664566e-05, + "loss": 2.1641, + "step": 3762 + }, + { + "epoch": 0.1644739717645002, + "grad_norm": 2.234375, + "learning_rate": 9.350077839584138e-05, + "loss": 2.1195, + "step": 3763 + }, + { + "epoch": 0.1645176799685301, + "grad_norm": 2.40625, + "learning_rate": 9.34973915641141e-05, + "loss": 2.0303, + "step": 3764 + }, + { + "epoch": 0.16456138817256, + "grad_norm": 2.265625, + "learning_rate": 9.349400391152773e-05, + "loss": 2.0424, + "step": 3765 + }, + { + "epoch": 0.16460509637658988, + "grad_norm": 2.265625, + "learning_rate": 9.34906154381462e-05, + "loss": 1.924, + "step": 3766 + }, + { + "epoch": 0.16464880458061978, + "grad_norm": 2.25, + "learning_rate": 9.348722614403345e-05, + "loss": 2.0095, + "step": 3767 + }, + { + "epoch": 0.1646925127846497, + "grad_norm": 2.546875, + "learning_rate": 9.348383602925347e-05, + "loss": 1.8124, + "step": 3768 + }, + { + "epoch": 0.16473622098867957, + "grad_norm": 2.4375, + "learning_rate": 9.34804450938702e-05, + "loss": 1.9358, + "step": 3769 + }, + { + "epoch": 0.16477992919270948, + "grad_norm": 2.546875, + "learning_rate": 9.347705333794767e-05, + "loss": 2.5346, + "step": 3770 + }, + { + "epoch": 0.16482363739673936, + "grad_norm": 2.359375, + "learning_rate": 9.347366076154984e-05, + "loss": 1.8833, + "step": 3771 + }, + { + "epoch": 0.16486734560076927, + "grad_norm": 2.515625, + "learning_rate": 9.347026736474078e-05, + "loss": 1.9519, + "step": 3772 + }, + { + "epoch": 0.16491105380479917, + "grad_norm": 2.15625, + "learning_rate": 9.346687314758448e-05, + "loss": 2.0777, + "step": 3773 + }, + { + "epoch": 0.16495476200882905, + "grad_norm": 3.9375, + "learning_rate": 9.346347811014504e-05, + "loss": 2.6569, + "step": 3774 + }, + { + "epoch": 0.16499847021285896, + "grad_norm": 2.703125, + "learning_rate": 9.346008225248651e-05, + "loss": 2.0684, + "step": 3775 + }, + { + "epoch": 0.16504217841688884, + "grad_norm": 2.25, + "learning_rate": 9.345668557467298e-05, + "loss": 2.0878, + "step": 3776 + }, + { + "epoch": 0.16508588662091875, + "grad_norm": 2.15625, + "learning_rate": 9.345328807676854e-05, + "loss": 2.1552, + "step": 3777 + }, + { + "epoch": 0.16512959482494866, + "grad_norm": 2.625, + "learning_rate": 9.34498897588373e-05, + "loss": 1.7176, + "step": 3778 + }, + { + "epoch": 0.16517330302897854, + "grad_norm": 5.34375, + "learning_rate": 9.344649062094342e-05, + "loss": 2.4368, + "step": 3779 + }, + { + "epoch": 0.16521701123300844, + "grad_norm": 3.171875, + "learning_rate": 9.344309066315101e-05, + "loss": 2.6929, + "step": 3780 + }, + { + "epoch": 0.16526071943703832, + "grad_norm": 2.375, + "learning_rate": 9.343968988552426e-05, + "loss": 2.2669, + "step": 3781 + }, + { + "epoch": 0.16530442764106823, + "grad_norm": 2.4375, + "learning_rate": 9.343628828812734e-05, + "loss": 2.6719, + "step": 3782 + }, + { + "epoch": 0.16534813584509814, + "grad_norm": 3.703125, + "learning_rate": 9.343288587102443e-05, + "loss": 2.3055, + "step": 3783 + }, + { + "epoch": 0.16539184404912802, + "grad_norm": 11.5625, + "learning_rate": 9.342948263427977e-05, + "loss": 5.9702, + "step": 3784 + }, + { + "epoch": 0.16543555225315792, + "grad_norm": 3.828125, + "learning_rate": 9.342607857795756e-05, + "loss": 2.4608, + "step": 3785 + }, + { + "epoch": 0.1654792604571878, + "grad_norm": 2.640625, + "learning_rate": 9.342267370212203e-05, + "loss": 2.3413, + "step": 3786 + }, + { + "epoch": 0.1655229686612177, + "grad_norm": 2.59375, + "learning_rate": 9.341926800683745e-05, + "loss": 1.8671, + "step": 3787 + }, + { + "epoch": 0.16556667686524762, + "grad_norm": 2.953125, + "learning_rate": 9.34158614921681e-05, + "loss": 2.4157, + "step": 3788 + }, + { + "epoch": 0.1656103850692775, + "grad_norm": 2.640625, + "learning_rate": 9.341245415817825e-05, + "loss": 2.5063, + "step": 3789 + }, + { + "epoch": 0.1656540932733074, + "grad_norm": 2.3125, + "learning_rate": 9.34090460049322e-05, + "loss": 1.7339, + "step": 3790 + }, + { + "epoch": 0.1656978014773373, + "grad_norm": 3.234375, + "learning_rate": 9.340563703249428e-05, + "loss": 2.6966, + "step": 3791 + }, + { + "epoch": 0.1657415096813672, + "grad_norm": 2.140625, + "learning_rate": 9.34022272409288e-05, + "loss": 2.1976, + "step": 3792 + }, + { + "epoch": 0.1657852178853971, + "grad_norm": 1.953125, + "learning_rate": 9.339881663030014e-05, + "loss": 1.7816, + "step": 3793 + }, + { + "epoch": 0.16582892608942698, + "grad_norm": 2.640625, + "learning_rate": 9.339540520067265e-05, + "loss": 1.6969, + "step": 3794 + }, + { + "epoch": 0.1658726342934569, + "grad_norm": 2.046875, + "learning_rate": 9.33919929521107e-05, + "loss": 1.4362, + "step": 3795 + }, + { + "epoch": 0.16591634249748677, + "grad_norm": 2.015625, + "learning_rate": 9.338857988467868e-05, + "loss": 1.5953, + "step": 3796 + }, + { + "epoch": 0.16596005070151668, + "grad_norm": 2.484375, + "learning_rate": 9.338516599844101e-05, + "loss": 2.5578, + "step": 3797 + }, + { + "epoch": 0.16600375890554658, + "grad_norm": 2.734375, + "learning_rate": 9.338175129346213e-05, + "loss": 2.6275, + "step": 3798 + }, + { + "epoch": 0.16604746710957646, + "grad_norm": 2.140625, + "learning_rate": 9.337833576980646e-05, + "loss": 2.0663, + "step": 3799 + }, + { + "epoch": 0.16609117531360637, + "grad_norm": 2.4375, + "learning_rate": 9.337491942753845e-05, + "loss": 2.4615, + "step": 3800 + }, + { + "epoch": 0.16613488351763625, + "grad_norm": 2.578125, + "learning_rate": 9.337150226672258e-05, + "loss": 1.79, + "step": 3801 + }, + { + "epoch": 0.16617859172166616, + "grad_norm": 2.90625, + "learning_rate": 9.336808428742335e-05, + "loss": 2.3343, + "step": 3802 + }, + { + "epoch": 0.16622229992569607, + "grad_norm": 2.234375, + "learning_rate": 9.336466548970522e-05, + "loss": 2.0193, + "step": 3803 + }, + { + "epoch": 0.16626600812972595, + "grad_norm": 2.65625, + "learning_rate": 9.336124587363278e-05, + "loss": 1.7385, + "step": 3804 + }, + { + "epoch": 0.16630971633375585, + "grad_norm": 2.1875, + "learning_rate": 9.335782543927049e-05, + "loss": 1.5442, + "step": 3805 + }, + { + "epoch": 0.16635342453778573, + "grad_norm": 3.296875, + "learning_rate": 9.335440418668294e-05, + "loss": 2.1318, + "step": 3806 + }, + { + "epoch": 0.16639713274181564, + "grad_norm": 2.296875, + "learning_rate": 9.335098211593469e-05, + "loss": 1.9159, + "step": 3807 + }, + { + "epoch": 0.16644084094584555, + "grad_norm": 2.109375, + "learning_rate": 9.334755922709031e-05, + "loss": 1.7075, + "step": 3808 + }, + { + "epoch": 0.16648454914987543, + "grad_norm": 2.65625, + "learning_rate": 9.334413552021439e-05, + "loss": 2.5683, + "step": 3809 + }, + { + "epoch": 0.16652825735390533, + "grad_norm": 2.390625, + "learning_rate": 9.334071099537154e-05, + "loss": 1.9669, + "step": 3810 + }, + { + "epoch": 0.16657196555793521, + "grad_norm": 2.234375, + "learning_rate": 9.333728565262642e-05, + "loss": 2.0282, + "step": 3811 + }, + { + "epoch": 0.16661567376196512, + "grad_norm": 2.65625, + "learning_rate": 9.333385949204363e-05, + "loss": 1.8504, + "step": 3812 + }, + { + "epoch": 0.16665938196599503, + "grad_norm": 2.125, + "learning_rate": 9.333043251368784e-05, + "loss": 1.7398, + "step": 3813 + }, + { + "epoch": 0.1667030901700249, + "grad_norm": 2.203125, + "learning_rate": 9.332700471762374e-05, + "loss": 1.6821, + "step": 3814 + }, + { + "epoch": 0.16674679837405482, + "grad_norm": 2.21875, + "learning_rate": 9.332357610391598e-05, + "loss": 2.1855, + "step": 3815 + }, + { + "epoch": 0.1667905065780847, + "grad_norm": 3.46875, + "learning_rate": 9.332014667262929e-05, + "loss": 2.1317, + "step": 3816 + }, + { + "epoch": 0.1668342147821146, + "grad_norm": 2.71875, + "learning_rate": 9.33167164238284e-05, + "loss": 1.906, + "step": 3817 + }, + { + "epoch": 0.1668779229861445, + "grad_norm": 2.46875, + "learning_rate": 9.331328535757801e-05, + "loss": 1.8553, + "step": 3818 + }, + { + "epoch": 0.1669216311901744, + "grad_norm": 2.453125, + "learning_rate": 9.33098534739429e-05, + "loss": 1.9632, + "step": 3819 + }, + { + "epoch": 0.1669653393942043, + "grad_norm": 2.34375, + "learning_rate": 9.33064207729878e-05, + "loss": 2.1928, + "step": 3820 + }, + { + "epoch": 0.16700904759823418, + "grad_norm": 2.28125, + "learning_rate": 9.330298725477753e-05, + "loss": 2.4237, + "step": 3821 + }, + { + "epoch": 0.16705275580226409, + "grad_norm": 3.09375, + "learning_rate": 9.329955291937684e-05, + "loss": 3.0202, + "step": 3822 + }, + { + "epoch": 0.167096464006294, + "grad_norm": 2.578125, + "learning_rate": 9.329611776685059e-05, + "loss": 2.1526, + "step": 3823 + }, + { + "epoch": 0.16714017221032387, + "grad_norm": 2.21875, + "learning_rate": 9.329268179726359e-05, + "loss": 2.0835, + "step": 3824 + }, + { + "epoch": 0.16718388041435378, + "grad_norm": 2.453125, + "learning_rate": 9.328924501068066e-05, + "loss": 1.9372, + "step": 3825 + }, + { + "epoch": 0.16722758861838366, + "grad_norm": 2.328125, + "learning_rate": 9.328580740716666e-05, + "loss": 2.1097, + "step": 3826 + }, + { + "epoch": 0.16727129682241357, + "grad_norm": 2.515625, + "learning_rate": 9.32823689867865e-05, + "loss": 1.9521, + "step": 3827 + }, + { + "epoch": 0.16731500502644348, + "grad_norm": 2.234375, + "learning_rate": 9.327892974960503e-05, + "loss": 1.81, + "step": 3828 + }, + { + "epoch": 0.16735871323047335, + "grad_norm": 2.015625, + "learning_rate": 9.327548969568716e-05, + "loss": 1.8508, + "step": 3829 + }, + { + "epoch": 0.16740242143450326, + "grad_norm": 3.203125, + "learning_rate": 9.327204882509784e-05, + "loss": 3.2697, + "step": 3830 + }, + { + "epoch": 0.16744612963853314, + "grad_norm": 2.34375, + "learning_rate": 9.326860713790195e-05, + "loss": 1.9622, + "step": 3831 + }, + { + "epoch": 0.16748983784256305, + "grad_norm": 2.546875, + "learning_rate": 9.326516463416448e-05, + "loss": 2.7874, + "step": 3832 + }, + { + "epoch": 0.16753354604659296, + "grad_norm": 2.921875, + "learning_rate": 9.32617213139504e-05, + "loss": 1.9411, + "step": 3833 + }, + { + "epoch": 0.16757725425062284, + "grad_norm": 3.046875, + "learning_rate": 9.325827717732465e-05, + "loss": 2.0798, + "step": 3834 + }, + { + "epoch": 0.16762096245465274, + "grad_norm": 3.296875, + "learning_rate": 9.325483222435226e-05, + "loss": 2.3578, + "step": 3835 + }, + { + "epoch": 0.16766467065868262, + "grad_norm": 2.25, + "learning_rate": 9.325138645509823e-05, + "loss": 1.6908, + "step": 3836 + }, + { + "epoch": 0.16770837886271253, + "grad_norm": 2.25, + "learning_rate": 9.324793986962758e-05, + "loss": 1.8728, + "step": 3837 + }, + { + "epoch": 0.16775208706674244, + "grad_norm": 2.3125, + "learning_rate": 9.324449246800537e-05, + "loss": 1.8805, + "step": 3838 + }, + { + "epoch": 0.16779579527077232, + "grad_norm": 1.984375, + "learning_rate": 9.324104425029665e-05, + "loss": 1.7632, + "step": 3839 + }, + { + "epoch": 0.16783950347480223, + "grad_norm": 3.09375, + "learning_rate": 9.32375952165665e-05, + "loss": 2.3108, + "step": 3840 + }, + { + "epoch": 0.1678832116788321, + "grad_norm": 2.296875, + "learning_rate": 9.323414536687997e-05, + "loss": 2.0386, + "step": 3841 + }, + { + "epoch": 0.167926919882862, + "grad_norm": 2.671875, + "learning_rate": 9.323069470130221e-05, + "loss": 2.6539, + "step": 3842 + }, + { + "epoch": 0.16797062808689192, + "grad_norm": 3.171875, + "learning_rate": 9.322724321989833e-05, + "loss": 2.3568, + "step": 3843 + }, + { + "epoch": 0.1680143362909218, + "grad_norm": 2.359375, + "learning_rate": 9.322379092273345e-05, + "loss": 2.3111, + "step": 3844 + }, + { + "epoch": 0.1680580444949517, + "grad_norm": 2.21875, + "learning_rate": 9.322033780987272e-05, + "loss": 1.8544, + "step": 3845 + }, + { + "epoch": 0.1681017526989816, + "grad_norm": 2.90625, + "learning_rate": 9.321688388138132e-05, + "loss": 2.1158, + "step": 3846 + }, + { + "epoch": 0.1681454609030115, + "grad_norm": 2.53125, + "learning_rate": 9.321342913732441e-05, + "loss": 2.156, + "step": 3847 + }, + { + "epoch": 0.1681891691070414, + "grad_norm": 2.28125, + "learning_rate": 9.32099735777672e-05, + "loss": 1.8919, + "step": 3848 + }, + { + "epoch": 0.16823287731107128, + "grad_norm": 2.390625, + "learning_rate": 9.320651720277491e-05, + "loss": 1.758, + "step": 3849 + }, + { + "epoch": 0.1682765855151012, + "grad_norm": 3.125, + "learning_rate": 9.320306001241275e-05, + "loss": 2.3838, + "step": 3850 + }, + { + "epoch": 0.16832029371913107, + "grad_norm": 2.078125, + "learning_rate": 9.319960200674597e-05, + "loss": 2.0288, + "step": 3851 + }, + { + "epoch": 0.16836400192316098, + "grad_norm": 2.25, + "learning_rate": 9.319614318583982e-05, + "loss": 2.3708, + "step": 3852 + }, + { + "epoch": 0.16840771012719088, + "grad_norm": 3.171875, + "learning_rate": 9.319268354975959e-05, + "loss": 2.506, + "step": 3853 + }, + { + "epoch": 0.16845141833122076, + "grad_norm": 2.484375, + "learning_rate": 9.318922309857055e-05, + "loss": 1.9185, + "step": 3854 + }, + { + "epoch": 0.16849512653525067, + "grad_norm": 2.75, + "learning_rate": 9.3185761832338e-05, + "loss": 2.2522, + "step": 3855 + }, + { + "epoch": 0.16853883473928055, + "grad_norm": 2.640625, + "learning_rate": 9.318229975112728e-05, + "loss": 2.6273, + "step": 3856 + }, + { + "epoch": 0.16858254294331046, + "grad_norm": 2.375, + "learning_rate": 9.317883685500373e-05, + "loss": 2.1665, + "step": 3857 + }, + { + "epoch": 0.16862625114734037, + "grad_norm": 2.15625, + "learning_rate": 9.317537314403267e-05, + "loss": 1.6587, + "step": 3858 + }, + { + "epoch": 0.16866995935137025, + "grad_norm": 2.1875, + "learning_rate": 9.317190861827949e-05, + "loss": 1.9373, + "step": 3859 + }, + { + "epoch": 0.16871366755540015, + "grad_norm": 2.1875, + "learning_rate": 9.316844327780955e-05, + "loss": 1.7105, + "step": 3860 + }, + { + "epoch": 0.16875737575943003, + "grad_norm": 2.828125, + "learning_rate": 9.316497712268825e-05, + "loss": 1.3369, + "step": 3861 + }, + { + "epoch": 0.16880108396345994, + "grad_norm": 2.078125, + "learning_rate": 9.316151015298103e-05, + "loss": 1.8242, + "step": 3862 + }, + { + "epoch": 0.16884479216748985, + "grad_norm": 2.96875, + "learning_rate": 9.315804236875327e-05, + "loss": 1.7831, + "step": 3863 + }, + { + "epoch": 0.16888850037151973, + "grad_norm": 2.625, + "learning_rate": 9.315457377007046e-05, + "loss": 2.3505, + "step": 3864 + }, + { + "epoch": 0.16893220857554964, + "grad_norm": 2.546875, + "learning_rate": 9.315110435699804e-05, + "loss": 2.0785, + "step": 3865 + }, + { + "epoch": 0.16897591677957952, + "grad_norm": 2.203125, + "learning_rate": 9.314763412960144e-05, + "loss": 1.6823, + "step": 3866 + }, + { + "epoch": 0.16901962498360942, + "grad_norm": 2.171875, + "learning_rate": 9.314416308794621e-05, + "loss": 2.4492, + "step": 3867 + }, + { + "epoch": 0.16906333318763933, + "grad_norm": 4.59375, + "learning_rate": 9.314069123209784e-05, + "loss": 1.8297, + "step": 3868 + }, + { + "epoch": 0.1691070413916692, + "grad_norm": 2.3125, + "learning_rate": 9.313721856212181e-05, + "loss": 2.1315, + "step": 3869 + }, + { + "epoch": 0.16915074959569912, + "grad_norm": 2.59375, + "learning_rate": 9.313374507808371e-05, + "loss": 2.3644, + "step": 3870 + }, + { + "epoch": 0.169194457799729, + "grad_norm": 2.34375, + "learning_rate": 9.313027078004903e-05, + "loss": 2.1034, + "step": 3871 + }, + { + "epoch": 0.1692381660037589, + "grad_norm": 2.625, + "learning_rate": 9.31267956680834e-05, + "loss": 1.766, + "step": 3872 + }, + { + "epoch": 0.1692818742077888, + "grad_norm": 2.6875, + "learning_rate": 9.312331974225235e-05, + "loss": 2.1167, + "step": 3873 + }, + { + "epoch": 0.1693255824118187, + "grad_norm": 2.4375, + "learning_rate": 9.31198430026215e-05, + "loss": 2.5551, + "step": 3874 + }, + { + "epoch": 0.1693692906158486, + "grad_norm": 2.1875, + "learning_rate": 9.311636544925645e-05, + "loss": 1.8521, + "step": 3875 + }, + { + "epoch": 0.16941299881987848, + "grad_norm": 2.328125, + "learning_rate": 9.311288708222284e-05, + "loss": 2.3467, + "step": 3876 + }, + { + "epoch": 0.1694567070239084, + "grad_norm": 2.109375, + "learning_rate": 9.310940790158629e-05, + "loss": 1.8095, + "step": 3877 + }, + { + "epoch": 0.1695004152279383, + "grad_norm": 2.21875, + "learning_rate": 9.310592790741248e-05, + "loss": 2.0915, + "step": 3878 + }, + { + "epoch": 0.16954412343196817, + "grad_norm": 1.984375, + "learning_rate": 9.310244709976707e-05, + "loss": 1.654, + "step": 3879 + }, + { + "epoch": 0.16958783163599808, + "grad_norm": 2.421875, + "learning_rate": 9.309896547871576e-05, + "loss": 2.3766, + "step": 3880 + }, + { + "epoch": 0.16963153984002796, + "grad_norm": 3.5, + "learning_rate": 9.309548304432421e-05, + "loss": 2.7146, + "step": 3881 + }, + { + "epoch": 0.16967524804405787, + "grad_norm": 3.125, + "learning_rate": 9.30919997966582e-05, + "loss": 1.8983, + "step": 3882 + }, + { + "epoch": 0.16971895624808778, + "grad_norm": 2.171875, + "learning_rate": 9.308851573578344e-05, + "loss": 1.8319, + "step": 3883 + }, + { + "epoch": 0.16976266445211766, + "grad_norm": 2.265625, + "learning_rate": 9.308503086176565e-05, + "loss": 1.8928, + "step": 3884 + }, + { + "epoch": 0.16980637265614756, + "grad_norm": 3.765625, + "learning_rate": 9.308154517467065e-05, + "loss": 1.9515, + "step": 3885 + }, + { + "epoch": 0.16985008086017744, + "grad_norm": 2.84375, + "learning_rate": 9.307805867456418e-05, + "loss": 2.2746, + "step": 3886 + }, + { + "epoch": 0.16989378906420735, + "grad_norm": 2.09375, + "learning_rate": 9.307457136151204e-05, + "loss": 1.6149, + "step": 3887 + }, + { + "epoch": 0.16993749726823726, + "grad_norm": 3.234375, + "learning_rate": 9.307108323558005e-05, + "loss": 1.6674, + "step": 3888 + }, + { + "epoch": 0.16998120547226714, + "grad_norm": 4.0625, + "learning_rate": 9.306759429683404e-05, + "loss": 1.9972, + "step": 3889 + }, + { + "epoch": 0.17002491367629705, + "grad_norm": 3.921875, + "learning_rate": 9.306410454533982e-05, + "loss": 2.2144, + "step": 3890 + }, + { + "epoch": 0.17006862188032693, + "grad_norm": 2.40625, + "learning_rate": 9.306061398116331e-05, + "loss": 1.7609, + "step": 3891 + }, + { + "epoch": 0.17011233008435683, + "grad_norm": 2.90625, + "learning_rate": 9.305712260437031e-05, + "loss": 2.6109, + "step": 3892 + }, + { + "epoch": 0.17015603828838674, + "grad_norm": 2.765625, + "learning_rate": 9.305363041502675e-05, + "loss": 2.5016, + "step": 3893 + }, + { + "epoch": 0.17019974649241662, + "grad_norm": 2.46875, + "learning_rate": 9.305013741319852e-05, + "loss": 2.1102, + "step": 3894 + }, + { + "epoch": 0.17024345469644653, + "grad_norm": 3.3125, + "learning_rate": 9.304664359895155e-05, + "loss": 2.5799, + "step": 3895 + }, + { + "epoch": 0.1702871629004764, + "grad_norm": 3.171875, + "learning_rate": 9.304314897235176e-05, + "loss": 1.9134, + "step": 3896 + }, + { + "epoch": 0.17033087110450632, + "grad_norm": 2.34375, + "learning_rate": 9.303965353346508e-05, + "loss": 2.659, + "step": 3897 + }, + { + "epoch": 0.17037457930853622, + "grad_norm": 2.5, + "learning_rate": 9.303615728235753e-05, + "loss": 2.0285, + "step": 3898 + }, + { + "epoch": 0.1704182875125661, + "grad_norm": 2.5625, + "learning_rate": 9.303266021909504e-05, + "loss": 2.3058, + "step": 3899 + }, + { + "epoch": 0.170461995716596, + "grad_norm": 2.625, + "learning_rate": 9.302916234374361e-05, + "loss": 2.6567, + "step": 3900 + }, + { + "epoch": 0.1705057039206259, + "grad_norm": 2.4375, + "learning_rate": 9.302566365636928e-05, + "loss": 2.077, + "step": 3901 + }, + { + "epoch": 0.1705494121246558, + "grad_norm": 2.8125, + "learning_rate": 9.302216415703805e-05, + "loss": 2.523, + "step": 3902 + }, + { + "epoch": 0.1705931203286857, + "grad_norm": 2.859375, + "learning_rate": 9.301866384581597e-05, + "loss": 1.9372, + "step": 3903 + }, + { + "epoch": 0.17063682853271558, + "grad_norm": 2.71875, + "learning_rate": 9.301516272276907e-05, + "loss": 2.5272, + "step": 3904 + }, + { + "epoch": 0.1706805367367455, + "grad_norm": 2.484375, + "learning_rate": 9.301166078796347e-05, + "loss": 1.977, + "step": 3905 + }, + { + "epoch": 0.17072424494077537, + "grad_norm": 2.625, + "learning_rate": 9.300815804146522e-05, + "loss": 2.5269, + "step": 3906 + }, + { + "epoch": 0.17076795314480528, + "grad_norm": 2.1875, + "learning_rate": 9.300465448334044e-05, + "loss": 2.3774, + "step": 3907 + }, + { + "epoch": 0.1708116613488352, + "grad_norm": 2.578125, + "learning_rate": 9.300115011365522e-05, + "loss": 2.1276, + "step": 3908 + }, + { + "epoch": 0.17085536955286507, + "grad_norm": 2.78125, + "learning_rate": 9.299764493247574e-05, + "loss": 1.9362, + "step": 3909 + }, + { + "epoch": 0.17089907775689497, + "grad_norm": 2.203125, + "learning_rate": 9.29941389398681e-05, + "loss": 2.0708, + "step": 3910 + }, + { + "epoch": 0.17094278596092485, + "grad_norm": 2.484375, + "learning_rate": 9.299063213589849e-05, + "loss": 2.2451, + "step": 3911 + }, + { + "epoch": 0.17098649416495476, + "grad_norm": 2.234375, + "learning_rate": 9.298712452063309e-05, + "loss": 1.7547, + "step": 3912 + }, + { + "epoch": 0.17103020236898467, + "grad_norm": 2.640625, + "learning_rate": 9.298361609413805e-05, + "loss": 2.5509, + "step": 3913 + }, + { + "epoch": 0.17107391057301455, + "grad_norm": 2.75, + "learning_rate": 9.298010685647966e-05, + "loss": 1.9587, + "step": 3914 + }, + { + "epoch": 0.17111761877704446, + "grad_norm": 2.125, + "learning_rate": 9.297659680772408e-05, + "loss": 1.6686, + "step": 3915 + }, + { + "epoch": 0.17116132698107434, + "grad_norm": 2.484375, + "learning_rate": 9.297308594793756e-05, + "loss": 2.1915, + "step": 3916 + }, + { + "epoch": 0.17120503518510424, + "grad_norm": 2.125, + "learning_rate": 9.296957427718638e-05, + "loss": 2.1469, + "step": 3917 + }, + { + "epoch": 0.17124874338913415, + "grad_norm": 2.171875, + "learning_rate": 9.296606179553679e-05, + "loss": 2.1142, + "step": 3918 + }, + { + "epoch": 0.17129245159316403, + "grad_norm": 2.65625, + "learning_rate": 9.296254850305506e-05, + "loss": 1.9486, + "step": 3919 + }, + { + "epoch": 0.17133615979719394, + "grad_norm": 2.578125, + "learning_rate": 9.295903439980755e-05, + "loss": 1.8862, + "step": 3920 + }, + { + "epoch": 0.17137986800122382, + "grad_norm": 2.671875, + "learning_rate": 9.29555194858605e-05, + "loss": 3.0025, + "step": 3921 + }, + { + "epoch": 0.17142357620525372, + "grad_norm": 2.171875, + "learning_rate": 9.295200376128031e-05, + "loss": 2.0601, + "step": 3922 + }, + { + "epoch": 0.17146728440928363, + "grad_norm": 2.0625, + "learning_rate": 9.294848722613326e-05, + "loss": 1.4115, + "step": 3923 + }, + { + "epoch": 0.1715109926133135, + "grad_norm": 1.9921875, + "learning_rate": 9.294496988048578e-05, + "loss": 1.6362, + "step": 3924 + }, + { + "epoch": 0.17155470081734342, + "grad_norm": 2.390625, + "learning_rate": 9.29414517244042e-05, + "loss": 1.8668, + "step": 3925 + }, + { + "epoch": 0.1715984090213733, + "grad_norm": 2.3125, + "learning_rate": 9.293793275795492e-05, + "loss": 1.9503, + "step": 3926 + }, + { + "epoch": 0.1716421172254032, + "grad_norm": 2.0625, + "learning_rate": 9.293441298120436e-05, + "loss": 1.7978, + "step": 3927 + }, + { + "epoch": 0.17168582542943311, + "grad_norm": 2.453125, + "learning_rate": 9.293089239421895e-05, + "loss": 2.4183, + "step": 3928 + }, + { + "epoch": 0.171729533633463, + "grad_norm": 1.8828125, + "learning_rate": 9.29273709970651e-05, + "loss": 1.8488, + "step": 3929 + }, + { + "epoch": 0.1717732418374929, + "grad_norm": 2.4375, + "learning_rate": 9.29238487898093e-05, + "loss": 2.9994, + "step": 3930 + }, + { + "epoch": 0.17181695004152278, + "grad_norm": 7.125, + "learning_rate": 9.2920325772518e-05, + "loss": 2.1471, + "step": 3931 + }, + { + "epoch": 0.1718606582455527, + "grad_norm": 2.484375, + "learning_rate": 9.291680194525767e-05, + "loss": 1.7098, + "step": 3932 + }, + { + "epoch": 0.1719043664495826, + "grad_norm": 3.40625, + "learning_rate": 9.291327730809483e-05, + "loss": 1.9738, + "step": 3933 + }, + { + "epoch": 0.17194807465361248, + "grad_norm": 2.875, + "learning_rate": 9.2909751861096e-05, + "loss": 1.9738, + "step": 3934 + }, + { + "epoch": 0.17199178285764238, + "grad_norm": 2.59375, + "learning_rate": 9.29062256043277e-05, + "loss": 2.3958, + "step": 3935 + }, + { + "epoch": 0.17203549106167226, + "grad_norm": 2.984375, + "learning_rate": 9.290269853785645e-05, + "loss": 2.3367, + "step": 3936 + }, + { + "epoch": 0.17207919926570217, + "grad_norm": 5.375, + "learning_rate": 9.289917066174886e-05, + "loss": 2.0566, + "step": 3937 + }, + { + "epoch": 0.17212290746973208, + "grad_norm": 5.46875, + "learning_rate": 9.289564197607148e-05, + "loss": 1.1127, + "step": 3938 + }, + { + "epoch": 0.17216661567376196, + "grad_norm": 2.84375, + "learning_rate": 9.28921124808909e-05, + "loss": 2.359, + "step": 3939 + }, + { + "epoch": 0.17221032387779187, + "grad_norm": 2.859375, + "learning_rate": 9.288858217627374e-05, + "loss": 3.0929, + "step": 3940 + }, + { + "epoch": 0.17225403208182175, + "grad_norm": 2.625, + "learning_rate": 9.28850510622866e-05, + "loss": 1.8901, + "step": 3941 + }, + { + "epoch": 0.17229774028585165, + "grad_norm": 2.203125, + "learning_rate": 9.288151913899614e-05, + "loss": 1.8007, + "step": 3942 + }, + { + "epoch": 0.17234144848988156, + "grad_norm": 2.859375, + "learning_rate": 9.287798640646898e-05, + "loss": 3.0978, + "step": 3943 + }, + { + "epoch": 0.17238515669391144, + "grad_norm": 2.828125, + "learning_rate": 9.287445286477184e-05, + "loss": 2.0779, + "step": 3944 + }, + { + "epoch": 0.17242886489794135, + "grad_norm": 2.15625, + "learning_rate": 9.287091851397137e-05, + "loss": 1.9458, + "step": 3945 + }, + { + "epoch": 0.17247257310197123, + "grad_norm": 2.03125, + "learning_rate": 9.286738335413425e-05, + "loss": 1.902, + "step": 3946 + }, + { + "epoch": 0.17251628130600113, + "grad_norm": 3.0, + "learning_rate": 9.286384738532723e-05, + "loss": 2.942, + "step": 3947 + }, + { + "epoch": 0.17255998951003104, + "grad_norm": 4.1875, + "learning_rate": 9.286031060761703e-05, + "loss": 1.6749, + "step": 3948 + }, + { + "epoch": 0.17260369771406092, + "grad_norm": 2.6875, + "learning_rate": 9.285677302107039e-05, + "loss": 2.1474, + "step": 3949 + }, + { + "epoch": 0.17264740591809083, + "grad_norm": 2.890625, + "learning_rate": 9.285323462575406e-05, + "loss": 2.5917, + "step": 3950 + }, + { + "epoch": 0.1726911141221207, + "grad_norm": 2.84375, + "learning_rate": 9.284969542173482e-05, + "loss": 2.1621, + "step": 3951 + }, + { + "epoch": 0.17273482232615062, + "grad_norm": 2.671875, + "learning_rate": 9.284615540907947e-05, + "loss": 2.9806, + "step": 3952 + }, + { + "epoch": 0.17277853053018052, + "grad_norm": 2.34375, + "learning_rate": 9.28426145878548e-05, + "loss": 1.9889, + "step": 3953 + }, + { + "epoch": 0.1728222387342104, + "grad_norm": 2.875, + "learning_rate": 9.283907295812765e-05, + "loss": 2.4204, + "step": 3954 + }, + { + "epoch": 0.1728659469382403, + "grad_norm": 2.46875, + "learning_rate": 9.283553051996483e-05, + "loss": 2.1639, + "step": 3955 + }, + { + "epoch": 0.1729096551422702, + "grad_norm": 2.859375, + "learning_rate": 9.283198727343322e-05, + "loss": 3.293, + "step": 3956 + }, + { + "epoch": 0.1729533633463001, + "grad_norm": 2.6875, + "learning_rate": 9.282844321859965e-05, + "loss": 2.632, + "step": 3957 + }, + { + "epoch": 0.17299707155033, + "grad_norm": 2.359375, + "learning_rate": 9.282489835553106e-05, + "loss": 1.9452, + "step": 3958 + }, + { + "epoch": 0.17304077975435989, + "grad_norm": 2.96875, + "learning_rate": 9.282135268429427e-05, + "loss": 1.9549, + "step": 3959 + }, + { + "epoch": 0.1730844879583898, + "grad_norm": 2.75, + "learning_rate": 9.281780620495624e-05, + "loss": 2.5024, + "step": 3960 + }, + { + "epoch": 0.17312819616241967, + "grad_norm": 3.109375, + "learning_rate": 9.28142589175839e-05, + "loss": 3.4426, + "step": 3961 + }, + { + "epoch": 0.17317190436644958, + "grad_norm": 2.140625, + "learning_rate": 9.281071082224418e-05, + "loss": 1.8448, + "step": 3962 + }, + { + "epoch": 0.1732156125704795, + "grad_norm": 2.765625, + "learning_rate": 9.280716191900404e-05, + "loss": 1.6593, + "step": 3963 + }, + { + "epoch": 0.17325932077450937, + "grad_norm": 3.015625, + "learning_rate": 9.280361220793044e-05, + "loss": 2.7787, + "step": 3964 + }, + { + "epoch": 0.17330302897853928, + "grad_norm": 2.8125, + "learning_rate": 9.280006168909039e-05, + "loss": 2.3553, + "step": 3965 + }, + { + "epoch": 0.17334673718256915, + "grad_norm": 2.28125, + "learning_rate": 9.279651036255088e-05, + "loss": 1.7812, + "step": 3966 + }, + { + "epoch": 0.17339044538659906, + "grad_norm": 2.359375, + "learning_rate": 9.279295822837893e-05, + "loss": 2.0827, + "step": 3967 + }, + { + "epoch": 0.17343415359062897, + "grad_norm": 2.265625, + "learning_rate": 9.278940528664158e-05, + "loss": 2.0041, + "step": 3968 + }, + { + "epoch": 0.17347786179465885, + "grad_norm": 2.359375, + "learning_rate": 9.278585153740587e-05, + "loss": 2.1094, + "step": 3969 + }, + { + "epoch": 0.17352156999868876, + "grad_norm": 2.015625, + "learning_rate": 9.278229698073888e-05, + "loss": 1.5974, + "step": 3970 + }, + { + "epoch": 0.17356527820271864, + "grad_norm": 2.1875, + "learning_rate": 9.277874161670766e-05, + "loss": 2.0873, + "step": 3971 + }, + { + "epoch": 0.17360898640674854, + "grad_norm": 2.1875, + "learning_rate": 9.277518544537934e-05, + "loss": 1.7993, + "step": 3972 + }, + { + "epoch": 0.17365269461077845, + "grad_norm": 2.34375, + "learning_rate": 9.277162846682102e-05, + "loss": 1.8639, + "step": 3973 + }, + { + "epoch": 0.17369640281480833, + "grad_norm": 2.578125, + "learning_rate": 9.276807068109981e-05, + "loss": 2.4488, + "step": 3974 + }, + { + "epoch": 0.17374011101883824, + "grad_norm": 2.125, + "learning_rate": 9.276451208828285e-05, + "loss": 1.8302, + "step": 3975 + }, + { + "epoch": 0.17378381922286812, + "grad_norm": 2.390625, + "learning_rate": 9.276095268843732e-05, + "loss": 2.6344, + "step": 3976 + }, + { + "epoch": 0.17382752742689803, + "grad_norm": 2.546875, + "learning_rate": 9.275739248163037e-05, + "loss": 2.0113, + "step": 3977 + }, + { + "epoch": 0.17387123563092793, + "grad_norm": 2.609375, + "learning_rate": 9.27538314679292e-05, + "loss": 2.4077, + "step": 3978 + }, + { + "epoch": 0.1739149438349578, + "grad_norm": 2.53125, + "learning_rate": 9.275026964740101e-05, + "loss": 1.8287, + "step": 3979 + }, + { + "epoch": 0.17395865203898772, + "grad_norm": 4.4375, + "learning_rate": 9.2746707020113e-05, + "loss": 1.3134, + "step": 3980 + }, + { + "epoch": 0.1740023602430176, + "grad_norm": 2.421875, + "learning_rate": 9.274314358613241e-05, + "loss": 2.0166, + "step": 3981 + }, + { + "epoch": 0.1740460684470475, + "grad_norm": 2.28125, + "learning_rate": 9.27395793455265e-05, + "loss": 2.2178, + "step": 3982 + }, + { + "epoch": 0.17408977665107742, + "grad_norm": 2.765625, + "learning_rate": 9.273601429836253e-05, + "loss": 2.4367, + "step": 3983 + }, + { + "epoch": 0.1741334848551073, + "grad_norm": 2.4375, + "learning_rate": 9.273244844470777e-05, + "loss": 2.2315, + "step": 3984 + }, + { + "epoch": 0.1741771930591372, + "grad_norm": 2.484375, + "learning_rate": 9.27288817846295e-05, + "loss": 2.0803, + "step": 3985 + }, + { + "epoch": 0.17422090126316708, + "grad_norm": 2.515625, + "learning_rate": 9.272531431819504e-05, + "loss": 2.5731, + "step": 3986 + }, + { + "epoch": 0.174264609467197, + "grad_norm": 2.265625, + "learning_rate": 9.272174604547172e-05, + "loss": 2.1792, + "step": 3987 + }, + { + "epoch": 0.1743083176712269, + "grad_norm": 2.015625, + "learning_rate": 9.271817696652688e-05, + "loss": 1.598, + "step": 3988 + }, + { + "epoch": 0.17435202587525678, + "grad_norm": 2.140625, + "learning_rate": 9.271460708142787e-05, + "loss": 2.1333, + "step": 3989 + }, + { + "epoch": 0.17439573407928669, + "grad_norm": 2.40625, + "learning_rate": 9.271103639024204e-05, + "loss": 2.2194, + "step": 3990 + }, + { + "epoch": 0.17443944228331656, + "grad_norm": 2.421875, + "learning_rate": 9.27074648930368e-05, + "loss": 2.4873, + "step": 3991 + }, + { + "epoch": 0.17448315048734647, + "grad_norm": 2.375, + "learning_rate": 9.270389258987956e-05, + "loss": 1.8827, + "step": 3992 + }, + { + "epoch": 0.17452685869137638, + "grad_norm": 3.28125, + "learning_rate": 9.270031948083769e-05, + "loss": 2.8426, + "step": 3993 + }, + { + "epoch": 0.17457056689540626, + "grad_norm": 2.59375, + "learning_rate": 9.269674556597865e-05, + "loss": 2.0483, + "step": 3994 + }, + { + "epoch": 0.17461427509943617, + "grad_norm": 2.65625, + "learning_rate": 9.269317084536988e-05, + "loss": 2.9369, + "step": 3995 + }, + { + "epoch": 0.17465798330346605, + "grad_norm": 2.0, + "learning_rate": 9.268959531907883e-05, + "loss": 1.7993, + "step": 3996 + }, + { + "epoch": 0.17470169150749595, + "grad_norm": 2.328125, + "learning_rate": 9.268601898717299e-05, + "loss": 2.6471, + "step": 3997 + }, + { + "epoch": 0.17474539971152586, + "grad_norm": 2.0, + "learning_rate": 9.268244184971984e-05, + "loss": 1.9124, + "step": 3998 + }, + { + "epoch": 0.17478910791555574, + "grad_norm": 2.265625, + "learning_rate": 9.267886390678691e-05, + "loss": 1.875, + "step": 3999 + }, + { + "epoch": 0.17483281611958565, + "grad_norm": 4.75, + "learning_rate": 9.267528515844168e-05, + "loss": 1.9821, + "step": 4000 + }, + { + "epoch": 0.17487652432361556, + "grad_norm": 2.296875, + "learning_rate": 9.267170560475172e-05, + "loss": 1.5852, + "step": 4001 + }, + { + "epoch": 0.17492023252764544, + "grad_norm": 2.546875, + "learning_rate": 9.266812524578457e-05, + "loss": 2.142, + "step": 4002 + }, + { + "epoch": 0.17496394073167534, + "grad_norm": 2.296875, + "learning_rate": 9.266454408160779e-05, + "loss": 1.7585, + "step": 4003 + }, + { + "epoch": 0.17500764893570522, + "grad_norm": 2.234375, + "learning_rate": 9.266096211228896e-05, + "loss": 2.1145, + "step": 4004 + }, + { + "epoch": 0.17505135713973513, + "grad_norm": 2.3125, + "learning_rate": 9.265737933789571e-05, + "loss": 1.99, + "step": 4005 + }, + { + "epoch": 0.17509506534376504, + "grad_norm": 2.328125, + "learning_rate": 9.26537957584956e-05, + "loss": 2.156, + "step": 4006 + }, + { + "epoch": 0.17513877354779492, + "grad_norm": 2.84375, + "learning_rate": 9.26502113741563e-05, + "loss": 1.9717, + "step": 4007 + }, + { + "epoch": 0.17518248175182483, + "grad_norm": 2.09375, + "learning_rate": 9.264662618494544e-05, + "loss": 2.2891, + "step": 4008 + }, + { + "epoch": 0.1752261899558547, + "grad_norm": 2.5625, + "learning_rate": 9.264304019093066e-05, + "loss": 2.4795, + "step": 4009 + }, + { + "epoch": 0.1752698981598846, + "grad_norm": 2.296875, + "learning_rate": 9.263945339217967e-05, + "loss": 2.0334, + "step": 4010 + }, + { + "epoch": 0.17531360636391452, + "grad_norm": 2.09375, + "learning_rate": 9.263586578876011e-05, + "loss": 1.9267, + "step": 4011 + }, + { + "epoch": 0.1753573145679444, + "grad_norm": 1.8984375, + "learning_rate": 9.263227738073973e-05, + "loss": 1.7599, + "step": 4012 + }, + { + "epoch": 0.1754010227719743, + "grad_norm": 6.71875, + "learning_rate": 9.262868816818622e-05, + "loss": 2.0304, + "step": 4013 + }, + { + "epoch": 0.1754447309760042, + "grad_norm": 2.390625, + "learning_rate": 9.262509815116732e-05, + "loss": 1.9665, + "step": 4014 + }, + { + "epoch": 0.1754884391800341, + "grad_norm": 2.734375, + "learning_rate": 9.262150732975078e-05, + "loss": 2.1552, + "step": 4015 + }, + { + "epoch": 0.175532147384064, + "grad_norm": 2.203125, + "learning_rate": 9.261791570400436e-05, + "loss": 2.0909, + "step": 4016 + }, + { + "epoch": 0.17557585558809388, + "grad_norm": 2.609375, + "learning_rate": 9.261432327399583e-05, + "loss": 2.7963, + "step": 4017 + }, + { + "epoch": 0.1756195637921238, + "grad_norm": 2.03125, + "learning_rate": 9.261073003979303e-05, + "loss": 1.868, + "step": 4018 + }, + { + "epoch": 0.17566327199615367, + "grad_norm": 2.65625, + "learning_rate": 9.260713600146373e-05, + "loss": 2.179, + "step": 4019 + }, + { + "epoch": 0.17570698020018358, + "grad_norm": 2.296875, + "learning_rate": 9.260354115907574e-05, + "loss": 2.1183, + "step": 4020 + }, + { + "epoch": 0.17575068840421348, + "grad_norm": 2.5, + "learning_rate": 9.259994551269694e-05, + "loss": 1.9912, + "step": 4021 + }, + { + "epoch": 0.17579439660824336, + "grad_norm": 2.734375, + "learning_rate": 9.259634906239516e-05, + "loss": 1.9105, + "step": 4022 + }, + { + "epoch": 0.17583810481227327, + "grad_norm": 1.984375, + "learning_rate": 9.259275180823829e-05, + "loss": 1.8473, + "step": 4023 + }, + { + "epoch": 0.17588181301630315, + "grad_norm": 2.28125, + "learning_rate": 9.258915375029418e-05, + "loss": 1.7336, + "step": 4024 + }, + { + "epoch": 0.17592552122033306, + "grad_norm": 1.9921875, + "learning_rate": 9.258555488863078e-05, + "loss": 2.0827, + "step": 4025 + }, + { + "epoch": 0.17596922942436297, + "grad_norm": 2.4375, + "learning_rate": 9.258195522331596e-05, + "loss": 2.2803, + "step": 4026 + }, + { + "epoch": 0.17601293762839285, + "grad_norm": 3.234375, + "learning_rate": 9.257835475441768e-05, + "loss": 2.1713, + "step": 4027 + }, + { + "epoch": 0.17605664583242275, + "grad_norm": 2.65625, + "learning_rate": 9.257475348200387e-05, + "loss": 2.4612, + "step": 4028 + }, + { + "epoch": 0.17610035403645263, + "grad_norm": 2.0, + "learning_rate": 9.25711514061425e-05, + "loss": 1.716, + "step": 4029 + }, + { + "epoch": 0.17614406224048254, + "grad_norm": 2.578125, + "learning_rate": 9.256754852690152e-05, + "loss": 2.4852, + "step": 4030 + }, + { + "epoch": 0.17618777044451245, + "grad_norm": 3.703125, + "learning_rate": 9.256394484434899e-05, + "loss": 2.4831, + "step": 4031 + }, + { + "epoch": 0.17623147864854233, + "grad_norm": 2.5625, + "learning_rate": 9.256034035855283e-05, + "loss": 2.3615, + "step": 4032 + }, + { + "epoch": 0.17627518685257224, + "grad_norm": 2.578125, + "learning_rate": 9.255673506958113e-05, + "loss": 2.2626, + "step": 4033 + }, + { + "epoch": 0.17631889505660212, + "grad_norm": 2.5625, + "learning_rate": 9.255312897750189e-05, + "loss": 1.6607, + "step": 4034 + }, + { + "epoch": 0.17636260326063202, + "grad_norm": 2.78125, + "learning_rate": 9.254952208238318e-05, + "loss": 1.9265, + "step": 4035 + }, + { + "epoch": 0.17640631146466193, + "grad_norm": 2.421875, + "learning_rate": 9.254591438429306e-05, + "loss": 2.1257, + "step": 4036 + }, + { + "epoch": 0.1764500196686918, + "grad_norm": 5.40625, + "learning_rate": 9.25423058832996e-05, + "loss": 2.5334, + "step": 4037 + }, + { + "epoch": 0.17649372787272172, + "grad_norm": 3.359375, + "learning_rate": 9.253869657947092e-05, + "loss": 2.8139, + "step": 4038 + }, + { + "epoch": 0.1765374360767516, + "grad_norm": 2.125, + "learning_rate": 9.253508647287512e-05, + "loss": 1.7314, + "step": 4039 + }, + { + "epoch": 0.1765811442807815, + "grad_norm": 2.09375, + "learning_rate": 9.253147556358034e-05, + "loss": 1.8548, + "step": 4040 + }, + { + "epoch": 0.1766248524848114, + "grad_norm": 2.375, + "learning_rate": 9.252786385165471e-05, + "loss": 2.5766, + "step": 4041 + }, + { + "epoch": 0.1766685606888413, + "grad_norm": 2.625, + "learning_rate": 9.25242513371664e-05, + "loss": 1.997, + "step": 4042 + }, + { + "epoch": 0.1767122688928712, + "grad_norm": 2.578125, + "learning_rate": 9.252063802018356e-05, + "loss": 2.0024, + "step": 4043 + }, + { + "epoch": 0.17675597709690108, + "grad_norm": 2.578125, + "learning_rate": 9.251702390077441e-05, + "loss": 2.189, + "step": 4044 + }, + { + "epoch": 0.176799685300931, + "grad_norm": 2.015625, + "learning_rate": 9.251340897900713e-05, + "loss": 2.05, + "step": 4045 + }, + { + "epoch": 0.1768433935049609, + "grad_norm": 2.296875, + "learning_rate": 9.250979325494995e-05, + "loss": 1.6734, + "step": 4046 + }, + { + "epoch": 0.17688710170899077, + "grad_norm": 2.078125, + "learning_rate": 9.250617672867108e-05, + "loss": 2.1073, + "step": 4047 + }, + { + "epoch": 0.17693080991302068, + "grad_norm": 2.265625, + "learning_rate": 9.250255940023882e-05, + "loss": 1.8738, + "step": 4048 + }, + { + "epoch": 0.17697451811705056, + "grad_norm": 2.203125, + "learning_rate": 9.24989412697214e-05, + "loss": 1.796, + "step": 4049 + }, + { + "epoch": 0.17701822632108047, + "grad_norm": 7.3125, + "learning_rate": 9.24953223371871e-05, + "loss": 1.5202, + "step": 4050 + }, + { + "epoch": 0.17706193452511038, + "grad_norm": 2.828125, + "learning_rate": 9.249170260270421e-05, + "loss": 3.0731, + "step": 4051 + }, + { + "epoch": 0.17710564272914026, + "grad_norm": 2.25, + "learning_rate": 9.248808206634105e-05, + "loss": 1.5085, + "step": 4052 + }, + { + "epoch": 0.17714935093317016, + "grad_norm": 2.328125, + "learning_rate": 9.248446072816595e-05, + "loss": 1.917, + "step": 4053 + }, + { + "epoch": 0.17719305913720004, + "grad_norm": 2.359375, + "learning_rate": 9.248083858824725e-05, + "loss": 2.1638, + "step": 4054 + }, + { + "epoch": 0.17723676734122995, + "grad_norm": 2.046875, + "learning_rate": 9.247721564665329e-05, + "loss": 1.7179, + "step": 4055 + }, + { + "epoch": 0.17728047554525986, + "grad_norm": 2.3125, + "learning_rate": 9.247359190345243e-05, + "loss": 1.8676, + "step": 4056 + }, + { + "epoch": 0.17732418374928974, + "grad_norm": 2.484375, + "learning_rate": 9.24699673587131e-05, + "loss": 1.9322, + "step": 4057 + }, + { + "epoch": 0.17736789195331965, + "grad_norm": 2.09375, + "learning_rate": 9.246634201250366e-05, + "loss": 1.9405, + "step": 4058 + }, + { + "epoch": 0.17741160015734952, + "grad_norm": 2.203125, + "learning_rate": 9.246271586489255e-05, + "loss": 1.8649, + "step": 4059 + }, + { + "epoch": 0.17745530836137943, + "grad_norm": 2.515625, + "learning_rate": 9.245908891594818e-05, + "loss": 2.0666, + "step": 4060 + }, + { + "epoch": 0.17749901656540934, + "grad_norm": 2.1875, + "learning_rate": 9.245546116573901e-05, + "loss": 1.7037, + "step": 4061 + }, + { + "epoch": 0.17754272476943922, + "grad_norm": 2.21875, + "learning_rate": 9.245183261433349e-05, + "loss": 1.4171, + "step": 4062 + }, + { + "epoch": 0.17758643297346913, + "grad_norm": 2.828125, + "learning_rate": 9.244820326180011e-05, + "loss": 1.9897, + "step": 4063 + }, + { + "epoch": 0.177630141177499, + "grad_norm": 4.59375, + "learning_rate": 9.244457310820736e-05, + "loss": 1.8852, + "step": 4064 + }, + { + "epoch": 0.17767384938152891, + "grad_norm": 2.3125, + "learning_rate": 9.244094215362373e-05, + "loss": 1.8307, + "step": 4065 + }, + { + "epoch": 0.17771755758555882, + "grad_norm": 3.140625, + "learning_rate": 9.243731039811775e-05, + "loss": 3.5239, + "step": 4066 + }, + { + "epoch": 0.1777612657895887, + "grad_norm": 2.28125, + "learning_rate": 9.243367784175796e-05, + "loss": 2.6461, + "step": 4067 + }, + { + "epoch": 0.1778049739936186, + "grad_norm": 2.671875, + "learning_rate": 9.243004448461293e-05, + "loss": 2.6207, + "step": 4068 + }, + { + "epoch": 0.1778486821976485, + "grad_norm": 2.234375, + "learning_rate": 9.242641032675117e-05, + "loss": 1.9232, + "step": 4069 + }, + { + "epoch": 0.1778923904016784, + "grad_norm": 2.0625, + "learning_rate": 9.242277536824134e-05, + "loss": 1.782, + "step": 4070 + }, + { + "epoch": 0.1779360986057083, + "grad_norm": 2.859375, + "learning_rate": 9.241913960915197e-05, + "loss": 1.5369, + "step": 4071 + }, + { + "epoch": 0.17797980680973818, + "grad_norm": 5.34375, + "learning_rate": 9.241550304955168e-05, + "loss": 2.1154, + "step": 4072 + }, + { + "epoch": 0.1780235150137681, + "grad_norm": 2.3125, + "learning_rate": 9.241186568950915e-05, + "loss": 2.0227, + "step": 4073 + }, + { + "epoch": 0.17806722321779797, + "grad_norm": 2.546875, + "learning_rate": 9.240822752909298e-05, + "loss": 2.1406, + "step": 4074 + }, + { + "epoch": 0.17811093142182788, + "grad_norm": 2.140625, + "learning_rate": 9.240458856837182e-05, + "loss": 2.0179, + "step": 4075 + }, + { + "epoch": 0.17815463962585779, + "grad_norm": 2.625, + "learning_rate": 9.240094880741437e-05, + "loss": 2.2995, + "step": 4076 + }, + { + "epoch": 0.17819834782988767, + "grad_norm": 2.046875, + "learning_rate": 9.23973082462893e-05, + "loss": 1.9567, + "step": 4077 + }, + { + "epoch": 0.17824205603391757, + "grad_norm": 2.1875, + "learning_rate": 9.23936668850653e-05, + "loss": 2.1861, + "step": 4078 + }, + { + "epoch": 0.17828576423794745, + "grad_norm": 2.171875, + "learning_rate": 9.239002472381113e-05, + "loss": 1.9636, + "step": 4079 + }, + { + "epoch": 0.17832947244197736, + "grad_norm": 2.171875, + "learning_rate": 9.238638176259549e-05, + "loss": 1.7762, + "step": 4080 + }, + { + "epoch": 0.17837318064600727, + "grad_norm": 2.546875, + "learning_rate": 9.238273800148712e-05, + "loss": 1.7434, + "step": 4081 + }, + { + "epoch": 0.17841688885003715, + "grad_norm": 2.546875, + "learning_rate": 9.237909344055482e-05, + "loss": 1.8037, + "step": 4082 + }, + { + "epoch": 0.17846059705406705, + "grad_norm": 2.515625, + "learning_rate": 9.237544807986733e-05, + "loss": 2.0798, + "step": 4083 + }, + { + "epoch": 0.17850430525809693, + "grad_norm": 2.25, + "learning_rate": 9.237180191949347e-05, + "loss": 2.1024, + "step": 4084 + }, + { + "epoch": 0.17854801346212684, + "grad_norm": 2.4375, + "learning_rate": 9.236815495950204e-05, + "loss": 2.6516, + "step": 4085 + }, + { + "epoch": 0.17859172166615675, + "grad_norm": 2.40625, + "learning_rate": 9.236450719996185e-05, + "loss": 1.988, + "step": 4086 + }, + { + "epoch": 0.17863542987018663, + "grad_norm": 2.25, + "learning_rate": 9.236085864094177e-05, + "loss": 2.3926, + "step": 4087 + }, + { + "epoch": 0.17867913807421654, + "grad_norm": 2.484375, + "learning_rate": 9.235720928251063e-05, + "loss": 2.2052, + "step": 4088 + }, + { + "epoch": 0.17872284627824642, + "grad_norm": 2.375, + "learning_rate": 9.235355912473729e-05, + "loss": 1.9098, + "step": 4089 + }, + { + "epoch": 0.17876655448227632, + "grad_norm": 2.953125, + "learning_rate": 9.234990816769065e-05, + "loss": 2.6546, + "step": 4090 + }, + { + "epoch": 0.17881026268630623, + "grad_norm": 4.5625, + "learning_rate": 9.23462564114396e-05, + "loss": 2.1178, + "step": 4091 + }, + { + "epoch": 0.1788539708903361, + "grad_norm": 2.4375, + "learning_rate": 9.234260385605308e-05, + "loss": 2.0182, + "step": 4092 + }, + { + "epoch": 0.17889767909436602, + "grad_norm": 2.65625, + "learning_rate": 9.233895050159999e-05, + "loss": 2.1849, + "step": 4093 + }, + { + "epoch": 0.1789413872983959, + "grad_norm": 2.234375, + "learning_rate": 9.233529634814928e-05, + "loss": 2.3742, + "step": 4094 + }, + { + "epoch": 0.1789850955024258, + "grad_norm": 2.1875, + "learning_rate": 9.23316413957699e-05, + "loss": 2.0213, + "step": 4095 + }, + { + "epoch": 0.1790288037064557, + "grad_norm": 2.109375, + "learning_rate": 9.232798564453086e-05, + "loss": 1.5629, + "step": 4096 + }, + { + "epoch": 0.1790725119104856, + "grad_norm": 2.546875, + "learning_rate": 9.23243290945011e-05, + "loss": 1.5609, + "step": 4097 + }, + { + "epoch": 0.1791162201145155, + "grad_norm": 2.59375, + "learning_rate": 9.232067174574968e-05, + "loss": 2.1387, + "step": 4098 + }, + { + "epoch": 0.17915992831854538, + "grad_norm": 2.140625, + "learning_rate": 9.231701359834557e-05, + "loss": 2.1409, + "step": 4099 + }, + { + "epoch": 0.1792036365225753, + "grad_norm": 2.84375, + "learning_rate": 9.231335465235782e-05, + "loss": 2.0755, + "step": 4100 + }, + { + "epoch": 0.1792473447266052, + "grad_norm": 3.75, + "learning_rate": 9.230969490785549e-05, + "loss": 2.0952, + "step": 4101 + }, + { + "epoch": 0.17929105293063508, + "grad_norm": 2.359375, + "learning_rate": 9.230603436490763e-05, + "loss": 1.9135, + "step": 4102 + }, + { + "epoch": 0.17933476113466498, + "grad_norm": 2.578125, + "learning_rate": 9.230237302358336e-05, + "loss": 2.0219, + "step": 4103 + }, + { + "epoch": 0.17937846933869486, + "grad_norm": 2.640625, + "learning_rate": 9.22987108839517e-05, + "loss": 1.9435, + "step": 4104 + }, + { + "epoch": 0.17942217754272477, + "grad_norm": 3.25, + "learning_rate": 9.229504794608182e-05, + "loss": 1.6264, + "step": 4105 + }, + { + "epoch": 0.17946588574675468, + "grad_norm": 2.328125, + "learning_rate": 9.229138421004284e-05, + "loss": 2.1426, + "step": 4106 + }, + { + "epoch": 0.17950959395078456, + "grad_norm": 2.15625, + "learning_rate": 9.228771967590388e-05, + "loss": 1.7699, + "step": 4107 + }, + { + "epoch": 0.17955330215481446, + "grad_norm": 2.140625, + "learning_rate": 9.228405434373409e-05, + "loss": 2.0044, + "step": 4108 + }, + { + "epoch": 0.17959701035884434, + "grad_norm": 2.21875, + "learning_rate": 9.228038821360268e-05, + "loss": 1.7695, + "step": 4109 + }, + { + "epoch": 0.17964071856287425, + "grad_norm": 2.828125, + "learning_rate": 9.227672128557879e-05, + "loss": 2.1878, + "step": 4110 + }, + { + "epoch": 0.17968442676690416, + "grad_norm": 2.28125, + "learning_rate": 9.227305355973163e-05, + "loss": 2.2419, + "step": 4111 + }, + { + "epoch": 0.17972813497093404, + "grad_norm": 2.09375, + "learning_rate": 9.226938503613043e-05, + "loss": 1.7856, + "step": 4112 + }, + { + "epoch": 0.17977184317496395, + "grad_norm": 2.34375, + "learning_rate": 9.226571571484442e-05, + "loss": 1.968, + "step": 4113 + }, + { + "epoch": 0.17981555137899383, + "grad_norm": 2.34375, + "learning_rate": 9.226204559594284e-05, + "loss": 2.1826, + "step": 4114 + }, + { + "epoch": 0.17985925958302373, + "grad_norm": 2.234375, + "learning_rate": 9.225837467949495e-05, + "loss": 2.0379, + "step": 4115 + }, + { + "epoch": 0.17990296778705364, + "grad_norm": 2.375, + "learning_rate": 9.225470296557002e-05, + "loss": 1.8321, + "step": 4116 + }, + { + "epoch": 0.17994667599108352, + "grad_norm": 3.453125, + "learning_rate": 9.225103045423735e-05, + "loss": 1.9019, + "step": 4117 + }, + { + "epoch": 0.17999038419511343, + "grad_norm": 2.515625, + "learning_rate": 9.224735714556624e-05, + "loss": 2.7345, + "step": 4118 + }, + { + "epoch": 0.1800340923991433, + "grad_norm": 2.375, + "learning_rate": 9.2243683039626e-05, + "loss": 2.2188, + "step": 4119 + }, + { + "epoch": 0.18007780060317322, + "grad_norm": 2.234375, + "learning_rate": 9.224000813648602e-05, + "loss": 1.8773, + "step": 4120 + }, + { + "epoch": 0.18012150880720312, + "grad_norm": 2.0625, + "learning_rate": 9.223633243621556e-05, + "loss": 1.8854, + "step": 4121 + }, + { + "epoch": 0.180165217011233, + "grad_norm": 2.109375, + "learning_rate": 9.223265593888405e-05, + "loss": 1.7813, + "step": 4122 + }, + { + "epoch": 0.1802089252152629, + "grad_norm": 1.8671875, + "learning_rate": 9.222897864456088e-05, + "loss": 1.6252, + "step": 4123 + }, + { + "epoch": 0.1802526334192928, + "grad_norm": 2.84375, + "learning_rate": 9.22253005533154e-05, + "loss": 1.9214, + "step": 4124 + }, + { + "epoch": 0.1802963416233227, + "grad_norm": 2.5, + "learning_rate": 9.222162166521704e-05, + "loss": 2.6064, + "step": 4125 + }, + { + "epoch": 0.1803400498273526, + "grad_norm": 2.546875, + "learning_rate": 9.221794198033525e-05, + "loss": 1.7756, + "step": 4126 + }, + { + "epoch": 0.18038375803138249, + "grad_norm": 2.140625, + "learning_rate": 9.221426149873942e-05, + "loss": 1.7808, + "step": 4127 + }, + { + "epoch": 0.1804274662354124, + "grad_norm": 2.703125, + "learning_rate": 9.221058022049906e-05, + "loss": 2.3092, + "step": 4128 + }, + { + "epoch": 0.18047117443944227, + "grad_norm": 2.90625, + "learning_rate": 9.220689814568359e-05, + "loss": 1.8663, + "step": 4129 + }, + { + "epoch": 0.18051488264347218, + "grad_norm": 3.015625, + "learning_rate": 9.220321527436256e-05, + "loss": 2.9884, + "step": 4130 + }, + { + "epoch": 0.1805585908475021, + "grad_norm": 2.171875, + "learning_rate": 9.21995316066054e-05, + "loss": 1.7863, + "step": 4131 + }, + { + "epoch": 0.18060229905153197, + "grad_norm": 2.53125, + "learning_rate": 9.219584714248167e-05, + "loss": 1.6175, + "step": 4132 + }, + { + "epoch": 0.18064600725556187, + "grad_norm": 1.8984375, + "learning_rate": 9.21921618820609e-05, + "loss": 1.6722, + "step": 4133 + }, + { + "epoch": 0.18068971545959175, + "grad_norm": 4.21875, + "learning_rate": 9.21884758254126e-05, + "loss": 1.5489, + "step": 4134 + }, + { + "epoch": 0.18073342366362166, + "grad_norm": 2.671875, + "learning_rate": 9.21847889726064e-05, + "loss": 2.5853, + "step": 4135 + }, + { + "epoch": 0.18077713186765157, + "grad_norm": 2.46875, + "learning_rate": 9.218110132371182e-05, + "loss": 2.0443, + "step": 4136 + }, + { + "epoch": 0.18082084007168145, + "grad_norm": 3.390625, + "learning_rate": 9.217741287879846e-05, + "loss": 1.8777, + "step": 4137 + }, + { + "epoch": 0.18086454827571136, + "grad_norm": 2.5, + "learning_rate": 9.217372363793592e-05, + "loss": 1.9894, + "step": 4138 + }, + { + "epoch": 0.18090825647974124, + "grad_norm": 2.203125, + "learning_rate": 9.217003360119386e-05, + "loss": 1.6585, + "step": 4139 + }, + { + "epoch": 0.18095196468377114, + "grad_norm": 2.53125, + "learning_rate": 9.216634276864188e-05, + "loss": 2.5412, + "step": 4140 + }, + { + "epoch": 0.18099567288780105, + "grad_norm": 2.140625, + "learning_rate": 9.216265114034964e-05, + "loss": 2.1771, + "step": 4141 + }, + { + "epoch": 0.18103938109183093, + "grad_norm": 2.71875, + "learning_rate": 9.21589587163868e-05, + "loss": 1.8523, + "step": 4142 + }, + { + "epoch": 0.18108308929586084, + "grad_norm": 2.140625, + "learning_rate": 9.215526549682307e-05, + "loss": 2.1015, + "step": 4143 + }, + { + "epoch": 0.18112679749989072, + "grad_norm": 3.4375, + "learning_rate": 9.21515714817281e-05, + "loss": 2.06, + "step": 4144 + }, + { + "epoch": 0.18117050570392063, + "grad_norm": 2.828125, + "learning_rate": 9.214787667117165e-05, + "loss": 2.2972, + "step": 4145 + }, + { + "epoch": 0.18121421390795053, + "grad_norm": 2.390625, + "learning_rate": 9.214418106522342e-05, + "loss": 2.0532, + "step": 4146 + }, + { + "epoch": 0.1812579221119804, + "grad_norm": 2.390625, + "learning_rate": 9.214048466395316e-05, + "loss": 2.1041, + "step": 4147 + }, + { + "epoch": 0.18130163031601032, + "grad_norm": 2.234375, + "learning_rate": 9.21367874674306e-05, + "loss": 1.9628, + "step": 4148 + }, + { + "epoch": 0.1813453385200402, + "grad_norm": 2.28125, + "learning_rate": 9.213308947572554e-05, + "loss": 2.5201, + "step": 4149 + }, + { + "epoch": 0.1813890467240701, + "grad_norm": 2.203125, + "learning_rate": 9.212939068890778e-05, + "loss": 1.8226, + "step": 4150 + }, + { + "epoch": 0.18143275492810002, + "grad_norm": 2.5625, + "learning_rate": 9.212569110704708e-05, + "loss": 2.3515, + "step": 4151 + }, + { + "epoch": 0.1814764631321299, + "grad_norm": 2.984375, + "learning_rate": 9.212199073021329e-05, + "loss": 1.9236, + "step": 4152 + }, + { + "epoch": 0.1815201713361598, + "grad_norm": 2.125, + "learning_rate": 9.211828955847622e-05, + "loss": 1.9031, + "step": 4153 + }, + { + "epoch": 0.18156387954018968, + "grad_norm": 2.34375, + "learning_rate": 9.211458759190573e-05, + "loss": 1.7923, + "step": 4154 + }, + { + "epoch": 0.1816075877442196, + "grad_norm": 3.15625, + "learning_rate": 9.211088483057168e-05, + "loss": 2.4361, + "step": 4155 + }, + { + "epoch": 0.1816512959482495, + "grad_norm": 2.328125, + "learning_rate": 9.210718127454394e-05, + "loss": 2.2487, + "step": 4156 + }, + { + "epoch": 0.18169500415227938, + "grad_norm": 2.671875, + "learning_rate": 9.210347692389241e-05, + "loss": 1.8312, + "step": 4157 + }, + { + "epoch": 0.18173871235630928, + "grad_norm": 3.21875, + "learning_rate": 9.209977177868698e-05, + "loss": 2.6821, + "step": 4158 + }, + { + "epoch": 0.18178242056033916, + "grad_norm": 2.328125, + "learning_rate": 9.209606583899759e-05, + "loss": 1.7224, + "step": 4159 + }, + { + "epoch": 0.18182612876436907, + "grad_norm": 2.546875, + "learning_rate": 9.209235910489418e-05, + "loss": 2.8291, + "step": 4160 + }, + { + "epoch": 0.18186983696839898, + "grad_norm": 2.5, + "learning_rate": 9.208865157644668e-05, + "loss": 1.6297, + "step": 4161 + }, + { + "epoch": 0.18191354517242886, + "grad_norm": 2.5, + "learning_rate": 9.208494325372508e-05, + "loss": 2.0138, + "step": 4162 + }, + { + "epoch": 0.18195725337645877, + "grad_norm": 2.203125, + "learning_rate": 9.208123413679933e-05, + "loss": 2.2733, + "step": 4163 + }, + { + "epoch": 0.18200096158048865, + "grad_norm": 3.125, + "learning_rate": 9.207752422573946e-05, + "loss": 1.977, + "step": 4164 + }, + { + "epoch": 0.18204466978451855, + "grad_norm": 2.703125, + "learning_rate": 9.207381352061545e-05, + "loss": 2.1237, + "step": 4165 + }, + { + "epoch": 0.18208837798854846, + "grad_norm": 2.78125, + "learning_rate": 9.207010202149736e-05, + "loss": 2.2442, + "step": 4166 + }, + { + "epoch": 0.18213208619257834, + "grad_norm": 2.171875, + "learning_rate": 9.206638972845522e-05, + "loss": 1.8178, + "step": 4167 + }, + { + "epoch": 0.18217579439660825, + "grad_norm": 2.484375, + "learning_rate": 9.206267664155907e-05, + "loss": 2.3336, + "step": 4168 + }, + { + "epoch": 0.18221950260063813, + "grad_norm": 2.765625, + "learning_rate": 9.205896276087899e-05, + "loss": 1.9392, + "step": 4169 + }, + { + "epoch": 0.18226321080466804, + "grad_norm": 2.34375, + "learning_rate": 9.205524808648507e-05, + "loss": 1.5458, + "step": 4170 + }, + { + "epoch": 0.18230691900869794, + "grad_norm": 3.53125, + "learning_rate": 9.205153261844741e-05, + "loss": 2.8738, + "step": 4171 + }, + { + "epoch": 0.18235062721272782, + "grad_norm": 2.828125, + "learning_rate": 9.204781635683613e-05, + "loss": 2.0505, + "step": 4172 + }, + { + "epoch": 0.18239433541675773, + "grad_norm": 16.25, + "learning_rate": 9.204409930172135e-05, + "loss": 2.1514, + "step": 4173 + }, + { + "epoch": 0.1824380436207876, + "grad_norm": 2.84375, + "learning_rate": 9.204038145317324e-05, + "loss": 1.4737, + "step": 4174 + }, + { + "epoch": 0.18248175182481752, + "grad_norm": 2.46875, + "learning_rate": 9.203666281126193e-05, + "loss": 1.922, + "step": 4175 + }, + { + "epoch": 0.18252546002884742, + "grad_norm": 2.203125, + "learning_rate": 9.203294337605761e-05, + "loss": 2.0385, + "step": 4176 + }, + { + "epoch": 0.1825691682328773, + "grad_norm": 2.109375, + "learning_rate": 9.202922314763048e-05, + "loss": 1.6214, + "step": 4177 + }, + { + "epoch": 0.1826128764369072, + "grad_norm": 2.078125, + "learning_rate": 9.202550212605074e-05, + "loss": 1.7539, + "step": 4178 + }, + { + "epoch": 0.1826565846409371, + "grad_norm": 2.359375, + "learning_rate": 9.202178031138862e-05, + "loss": 1.9389, + "step": 4179 + }, + { + "epoch": 0.182700292844967, + "grad_norm": 2.625, + "learning_rate": 9.201805770371432e-05, + "loss": 2.0368, + "step": 4180 + }, + { + "epoch": 0.1827440010489969, + "grad_norm": 2.765625, + "learning_rate": 9.201433430309813e-05, + "loss": 1.7877, + "step": 4181 + }, + { + "epoch": 0.1827877092530268, + "grad_norm": 2.625, + "learning_rate": 9.20106101096103e-05, + "loss": 2.8653, + "step": 4182 + }, + { + "epoch": 0.1828314174570567, + "grad_norm": 2.46875, + "learning_rate": 9.200688512332111e-05, + "loss": 2.528, + "step": 4183 + }, + { + "epoch": 0.18287512566108657, + "grad_norm": 3.671875, + "learning_rate": 9.200315934430088e-05, + "loss": 1.9375, + "step": 4184 + }, + { + "epoch": 0.18291883386511648, + "grad_norm": 2.03125, + "learning_rate": 9.199943277261989e-05, + "loss": 1.5832, + "step": 4185 + }, + { + "epoch": 0.1829625420691464, + "grad_norm": 2.484375, + "learning_rate": 9.199570540834846e-05, + "loss": 2.6069, + "step": 4186 + }, + { + "epoch": 0.18300625027317627, + "grad_norm": 3.375, + "learning_rate": 9.199197725155697e-05, + "loss": 2.8072, + "step": 4187 + }, + { + "epoch": 0.18304995847720618, + "grad_norm": 2.5, + "learning_rate": 9.198824830231573e-05, + "loss": 2.2208, + "step": 4188 + }, + { + "epoch": 0.18309366668123606, + "grad_norm": 2.234375, + "learning_rate": 9.198451856069515e-05, + "loss": 2.1902, + "step": 4189 + }, + { + "epoch": 0.18313737488526596, + "grad_norm": 3.59375, + "learning_rate": 9.19807880267656e-05, + "loss": 2.2622, + "step": 4190 + }, + { + "epoch": 0.18318108308929587, + "grad_norm": 2.5625, + "learning_rate": 9.197705670059747e-05, + "loss": 2.5455, + "step": 4191 + }, + { + "epoch": 0.18322479129332575, + "grad_norm": 2.21875, + "learning_rate": 9.197332458226118e-05, + "loss": 1.8177, + "step": 4192 + }, + { + "epoch": 0.18326849949735566, + "grad_norm": 2.125, + "learning_rate": 9.196959167182719e-05, + "loss": 2.3912, + "step": 4193 + }, + { + "epoch": 0.18331220770138554, + "grad_norm": 2.3125, + "learning_rate": 9.19658579693659e-05, + "loss": 1.7873, + "step": 4194 + }, + { + "epoch": 0.18335591590541545, + "grad_norm": 2.96875, + "learning_rate": 9.196212347494781e-05, + "loss": 2.2754, + "step": 4195 + }, + { + "epoch": 0.18339962410944535, + "grad_norm": 2.515625, + "learning_rate": 9.195838818864337e-05, + "loss": 2.2531, + "step": 4196 + }, + { + "epoch": 0.18344333231347523, + "grad_norm": 2.46875, + "learning_rate": 9.195465211052306e-05, + "loss": 1.9503, + "step": 4197 + }, + { + "epoch": 0.18348704051750514, + "grad_norm": 2.015625, + "learning_rate": 9.195091524065742e-05, + "loss": 1.4529, + "step": 4198 + }, + { + "epoch": 0.18353074872153502, + "grad_norm": 1.96875, + "learning_rate": 9.194717757911694e-05, + "loss": 1.9256, + "step": 4199 + }, + { + "epoch": 0.18357445692556493, + "grad_norm": 1.953125, + "learning_rate": 9.194343912597218e-05, + "loss": 2.1218, + "step": 4200 + }, + { + "epoch": 0.18361816512959483, + "grad_norm": 2.140625, + "learning_rate": 9.193969988129367e-05, + "loss": 2.0301, + "step": 4201 + }, + { + "epoch": 0.18366187333362471, + "grad_norm": 2.3125, + "learning_rate": 9.1935959845152e-05, + "loss": 2.0649, + "step": 4202 + }, + { + "epoch": 0.18370558153765462, + "grad_norm": 5.375, + "learning_rate": 9.193221901761772e-05, + "loss": 3.2402, + "step": 4203 + }, + { + "epoch": 0.1837492897416845, + "grad_norm": 7.09375, + "learning_rate": 9.192847739876142e-05, + "loss": 2.1486, + "step": 4204 + }, + { + "epoch": 0.1837929979457144, + "grad_norm": 2.28125, + "learning_rate": 9.192473498865376e-05, + "loss": 1.6001, + "step": 4205 + }, + { + "epoch": 0.18383670614974432, + "grad_norm": 2.3125, + "learning_rate": 9.192099178736532e-05, + "loss": 1.9152, + "step": 4206 + }, + { + "epoch": 0.1838804143537742, + "grad_norm": 4.28125, + "learning_rate": 9.191724779496675e-05, + "loss": 1.463, + "step": 4207 + }, + { + "epoch": 0.1839241225578041, + "grad_norm": 2.875, + "learning_rate": 9.19135030115287e-05, + "loss": 1.755, + "step": 4208 + }, + { + "epoch": 0.18396783076183398, + "grad_norm": 2.3125, + "learning_rate": 9.190975743712184e-05, + "loss": 1.781, + "step": 4209 + }, + { + "epoch": 0.1840115389658639, + "grad_norm": 2.203125, + "learning_rate": 9.19060110718169e-05, + "loss": 2.0729, + "step": 4210 + }, + { + "epoch": 0.1840552471698938, + "grad_norm": 2.40625, + "learning_rate": 9.19022639156845e-05, + "loss": 1.388, + "step": 4211 + }, + { + "epoch": 0.18409895537392368, + "grad_norm": 2.171875, + "learning_rate": 9.18985159687954e-05, + "loss": 1.774, + "step": 4212 + }, + { + "epoch": 0.18414266357795359, + "grad_norm": 2.171875, + "learning_rate": 9.189476723122034e-05, + "loss": 1.8746, + "step": 4213 + }, + { + "epoch": 0.18418637178198347, + "grad_norm": 2.359375, + "learning_rate": 9.189101770303003e-05, + "loss": 1.9607, + "step": 4214 + }, + { + "epoch": 0.18423007998601337, + "grad_norm": 2.125, + "learning_rate": 9.188726738429526e-05, + "loss": 1.4984, + "step": 4215 + }, + { + "epoch": 0.18427378819004328, + "grad_norm": 2.046875, + "learning_rate": 9.188351627508678e-05, + "loss": 1.7226, + "step": 4216 + }, + { + "epoch": 0.18431749639407316, + "grad_norm": 2.359375, + "learning_rate": 9.187976437547538e-05, + "loss": 2.1207, + "step": 4217 + }, + { + "epoch": 0.18436120459810307, + "grad_norm": 2.578125, + "learning_rate": 9.187601168553191e-05, + "loss": 2.0887, + "step": 4218 + }, + { + "epoch": 0.18440491280213295, + "grad_norm": 2.234375, + "learning_rate": 9.187225820532712e-05, + "loss": 2.0781, + "step": 4219 + }, + { + "epoch": 0.18444862100616286, + "grad_norm": 2.1875, + "learning_rate": 9.186850393493188e-05, + "loss": 2.0583, + "step": 4220 + }, + { + "epoch": 0.18449232921019276, + "grad_norm": 2.34375, + "learning_rate": 9.186474887441704e-05, + "loss": 2.3583, + "step": 4221 + }, + { + "epoch": 0.18453603741422264, + "grad_norm": 2.515625, + "learning_rate": 9.186099302385344e-05, + "loss": 2.019, + "step": 4222 + }, + { + "epoch": 0.18457974561825255, + "grad_norm": 2.515625, + "learning_rate": 9.185723638331201e-05, + "loss": 2.1852, + "step": 4223 + }, + { + "epoch": 0.18462345382228243, + "grad_norm": 2.359375, + "learning_rate": 9.185347895286358e-05, + "loss": 1.8512, + "step": 4224 + }, + { + "epoch": 0.18466716202631234, + "grad_norm": 1.859375, + "learning_rate": 9.184972073257911e-05, + "loss": 1.7002, + "step": 4225 + }, + { + "epoch": 0.18471087023034224, + "grad_norm": 2.703125, + "learning_rate": 9.184596172252948e-05, + "loss": 1.9231, + "step": 4226 + }, + { + "epoch": 0.18475457843437212, + "grad_norm": 2.0625, + "learning_rate": 9.184220192278565e-05, + "loss": 1.7761, + "step": 4227 + }, + { + "epoch": 0.18479828663840203, + "grad_norm": 2.671875, + "learning_rate": 9.183844133341859e-05, + "loss": 1.9647, + "step": 4228 + }, + { + "epoch": 0.1848419948424319, + "grad_norm": 2.03125, + "learning_rate": 9.183467995449924e-05, + "loss": 1.9758, + "step": 4229 + }, + { + "epoch": 0.18488570304646182, + "grad_norm": 2.0625, + "learning_rate": 9.183091778609858e-05, + "loss": 2.1707, + "step": 4230 + }, + { + "epoch": 0.18492941125049173, + "grad_norm": 2.328125, + "learning_rate": 9.182715482828763e-05, + "loss": 2.0485, + "step": 4231 + }, + { + "epoch": 0.1849731194545216, + "grad_norm": 2.546875, + "learning_rate": 9.182339108113738e-05, + "loss": 1.9269, + "step": 4232 + }, + { + "epoch": 0.1850168276585515, + "grad_norm": 3.28125, + "learning_rate": 9.181962654471888e-05, + "loss": 2.4354, + "step": 4233 + }, + { + "epoch": 0.1850605358625814, + "grad_norm": 2.21875, + "learning_rate": 9.181586121910317e-05, + "loss": 2.3555, + "step": 4234 + }, + { + "epoch": 0.1851042440666113, + "grad_norm": 2.953125, + "learning_rate": 9.181209510436128e-05, + "loss": 2.3343, + "step": 4235 + }, + { + "epoch": 0.1851479522706412, + "grad_norm": 2.015625, + "learning_rate": 9.180832820056431e-05, + "loss": 1.7184, + "step": 4236 + }, + { + "epoch": 0.1851916604746711, + "grad_norm": 2.0, + "learning_rate": 9.180456050778334e-05, + "loss": 1.6113, + "step": 4237 + }, + { + "epoch": 0.185235368678701, + "grad_norm": 2.421875, + "learning_rate": 9.180079202608947e-05, + "loss": 1.749, + "step": 4238 + }, + { + "epoch": 0.18527907688273088, + "grad_norm": 2.3125, + "learning_rate": 9.179702275555382e-05, + "loss": 2.1112, + "step": 4239 + }, + { + "epoch": 0.18532278508676078, + "grad_norm": 2.453125, + "learning_rate": 9.17932526962475e-05, + "loss": 1.8853, + "step": 4240 + }, + { + "epoch": 0.1853664932907907, + "grad_norm": 2.53125, + "learning_rate": 9.178948184824168e-05, + "loss": 2.6173, + "step": 4241 + }, + { + "epoch": 0.18541020149482057, + "grad_norm": 2.9375, + "learning_rate": 9.178571021160753e-05, + "loss": 2.3146, + "step": 4242 + }, + { + "epoch": 0.18545390969885048, + "grad_norm": 3.125, + "learning_rate": 9.17819377864162e-05, + "loss": 2.4007, + "step": 4243 + }, + { + "epoch": 0.18549761790288036, + "grad_norm": 2.875, + "learning_rate": 9.17781645727389e-05, + "loss": 2.725, + "step": 4244 + }, + { + "epoch": 0.18554132610691026, + "grad_norm": 2.703125, + "learning_rate": 9.177439057064683e-05, + "loss": 2.5454, + "step": 4245 + }, + { + "epoch": 0.18558503431094017, + "grad_norm": 2.703125, + "learning_rate": 9.17706157802112e-05, + "loss": 1.9567, + "step": 4246 + }, + { + "epoch": 0.18562874251497005, + "grad_norm": 3.0, + "learning_rate": 9.176684020150326e-05, + "loss": 2.2343, + "step": 4247 + }, + { + "epoch": 0.18567245071899996, + "grad_norm": 2.6875, + "learning_rate": 9.176306383459426e-05, + "loss": 2.1333, + "step": 4248 + }, + { + "epoch": 0.18571615892302984, + "grad_norm": 2.578125, + "learning_rate": 9.175928667955546e-05, + "loss": 1.7027, + "step": 4249 + }, + { + "epoch": 0.18575986712705975, + "grad_norm": 3.125, + "learning_rate": 9.175550873645816e-05, + "loss": 2.0228, + "step": 4250 + }, + { + "epoch": 0.18580357533108965, + "grad_norm": 2.796875, + "learning_rate": 9.175173000537361e-05, + "loss": 2.6521, + "step": 4251 + }, + { + "epoch": 0.18584728353511953, + "grad_norm": 2.15625, + "learning_rate": 9.174795048637316e-05, + "loss": 2.2313, + "step": 4252 + }, + { + "epoch": 0.18589099173914944, + "grad_norm": 2.9375, + "learning_rate": 9.174417017952812e-05, + "loss": 2.1287, + "step": 4253 + }, + { + "epoch": 0.18593469994317932, + "grad_norm": 2.0625, + "learning_rate": 9.174038908490984e-05, + "loss": 1.7118, + "step": 4254 + }, + { + "epoch": 0.18597840814720923, + "grad_norm": 2.171875, + "learning_rate": 9.173660720258966e-05, + "loss": 1.589, + "step": 4255 + }, + { + "epoch": 0.18602211635123914, + "grad_norm": 2.21875, + "learning_rate": 9.173282453263897e-05, + "loss": 1.8921, + "step": 4256 + }, + { + "epoch": 0.18606582455526902, + "grad_norm": 2.546875, + "learning_rate": 9.172904107512911e-05, + "loss": 1.9779, + "step": 4257 + }, + { + "epoch": 0.18610953275929892, + "grad_norm": 2.734375, + "learning_rate": 9.172525683013155e-05, + "loss": 2.7427, + "step": 4258 + }, + { + "epoch": 0.1861532409633288, + "grad_norm": 2.734375, + "learning_rate": 9.172147179771765e-05, + "loss": 2.3742, + "step": 4259 + }, + { + "epoch": 0.1861969491673587, + "grad_norm": 2.28125, + "learning_rate": 9.171768597795885e-05, + "loss": 2.1021, + "step": 4260 + }, + { + "epoch": 0.18624065737138862, + "grad_norm": 2.703125, + "learning_rate": 9.17138993709266e-05, + "loss": 2.4938, + "step": 4261 + }, + { + "epoch": 0.1862843655754185, + "grad_norm": 20.25, + "learning_rate": 9.171011197669236e-05, + "loss": 1.6165, + "step": 4262 + }, + { + "epoch": 0.1863280737794484, + "grad_norm": 2.453125, + "learning_rate": 9.170632379532759e-05, + "loss": 1.7445, + "step": 4263 + }, + { + "epoch": 0.18637178198347829, + "grad_norm": 2.234375, + "learning_rate": 9.17025348269038e-05, + "loss": 1.8762, + "step": 4264 + }, + { + "epoch": 0.1864154901875082, + "grad_norm": 2.484375, + "learning_rate": 9.16987450714925e-05, + "loss": 2.254, + "step": 4265 + }, + { + "epoch": 0.1864591983915381, + "grad_norm": 3.203125, + "learning_rate": 9.169495452916516e-05, + "loss": 3.2139, + "step": 4266 + }, + { + "epoch": 0.18650290659556798, + "grad_norm": 2.25, + "learning_rate": 9.169116319999336e-05, + "loss": 2.2422, + "step": 4267 + }, + { + "epoch": 0.1865466147995979, + "grad_norm": 3.34375, + "learning_rate": 9.168737108404864e-05, + "loss": 1.9404, + "step": 4268 + }, + { + "epoch": 0.18659032300362777, + "grad_norm": 2.703125, + "learning_rate": 9.168357818140255e-05, + "loss": 1.9761, + "step": 4269 + }, + { + "epoch": 0.18663403120765767, + "grad_norm": 2.484375, + "learning_rate": 9.167978449212666e-05, + "loss": 2.1656, + "step": 4270 + }, + { + "epoch": 0.18667773941168758, + "grad_norm": 2.359375, + "learning_rate": 9.167599001629257e-05, + "loss": 2.4656, + "step": 4271 + }, + { + "epoch": 0.18672144761571746, + "grad_norm": 2.890625, + "learning_rate": 9.167219475397191e-05, + "loss": 2.2688, + "step": 4272 + }, + { + "epoch": 0.18676515581974737, + "grad_norm": 2.875, + "learning_rate": 9.166839870523627e-05, + "loss": 1.9322, + "step": 4273 + }, + { + "epoch": 0.18680886402377728, + "grad_norm": 2.453125, + "learning_rate": 9.16646018701573e-05, + "loss": 1.6503, + "step": 4274 + }, + { + "epoch": 0.18685257222780716, + "grad_norm": 2.328125, + "learning_rate": 9.166080424880666e-05, + "loss": 1.9766, + "step": 4275 + }, + { + "epoch": 0.18689628043183706, + "grad_norm": 2.390625, + "learning_rate": 9.165700584125601e-05, + "loss": 2.035, + "step": 4276 + }, + { + "epoch": 0.18693998863586694, + "grad_norm": 2.546875, + "learning_rate": 9.165320664757705e-05, + "loss": 2.16, + "step": 4277 + }, + { + "epoch": 0.18698369683989685, + "grad_norm": 2.453125, + "learning_rate": 9.164940666784143e-05, + "loss": 1.9903, + "step": 4278 + }, + { + "epoch": 0.18702740504392676, + "grad_norm": 2.171875, + "learning_rate": 9.164560590212088e-05, + "loss": 1.7399, + "step": 4279 + }, + { + "epoch": 0.18707111324795664, + "grad_norm": 2.640625, + "learning_rate": 9.164180435048715e-05, + "loss": 2.0793, + "step": 4280 + }, + { + "epoch": 0.18711482145198655, + "grad_norm": 6.625, + "learning_rate": 9.163800201301197e-05, + "loss": 2.8651, + "step": 4281 + }, + { + "epoch": 0.18715852965601643, + "grad_norm": 2.625, + "learning_rate": 9.163419888976708e-05, + "loss": 2.162, + "step": 4282 + }, + { + "epoch": 0.18720223786004633, + "grad_norm": 2.0625, + "learning_rate": 9.163039498082428e-05, + "loss": 1.6775, + "step": 4283 + }, + { + "epoch": 0.18724594606407624, + "grad_norm": 2.265625, + "learning_rate": 9.162659028625531e-05, + "loss": 1.9262, + "step": 4284 + }, + { + "epoch": 0.18728965426810612, + "grad_norm": 2.625, + "learning_rate": 9.162278480613203e-05, + "loss": 1.9239, + "step": 4285 + }, + { + "epoch": 0.18733336247213603, + "grad_norm": 2.40625, + "learning_rate": 9.161897854052619e-05, + "loss": 1.6802, + "step": 4286 + }, + { + "epoch": 0.1873770706761659, + "grad_norm": 2.265625, + "learning_rate": 9.161517148950967e-05, + "loss": 1.9088, + "step": 4287 + }, + { + "epoch": 0.18742077888019582, + "grad_norm": 2.171875, + "learning_rate": 9.161136365315428e-05, + "loss": 1.9722, + "step": 4288 + }, + { + "epoch": 0.18746448708422572, + "grad_norm": 2.890625, + "learning_rate": 9.160755503153192e-05, + "loss": 1.692, + "step": 4289 + }, + { + "epoch": 0.1875081952882556, + "grad_norm": 2.28125, + "learning_rate": 9.16037456247144e-05, + "loss": 1.5951, + "step": 4290 + }, + { + "epoch": 0.1875519034922855, + "grad_norm": 2.890625, + "learning_rate": 9.159993543277368e-05, + "loss": 2.0796, + "step": 4291 + }, + { + "epoch": 0.1875956116963154, + "grad_norm": 2.515625, + "learning_rate": 9.159612445578163e-05, + "loss": 1.3354, + "step": 4292 + }, + { + "epoch": 0.1876393199003453, + "grad_norm": 2.65625, + "learning_rate": 9.159231269381016e-05, + "loss": 2.0206, + "step": 4293 + }, + { + "epoch": 0.1876830281043752, + "grad_norm": 2.9375, + "learning_rate": 9.158850014693123e-05, + "loss": 1.9006, + "step": 4294 + }, + { + "epoch": 0.18772673630840508, + "grad_norm": 3.078125, + "learning_rate": 9.158468681521676e-05, + "loss": 2.1584, + "step": 4295 + }, + { + "epoch": 0.187770444512435, + "grad_norm": 2.171875, + "learning_rate": 9.158087269873871e-05, + "loss": 1.8828, + "step": 4296 + }, + { + "epoch": 0.18781415271646487, + "grad_norm": 2.296875, + "learning_rate": 9.15770577975691e-05, + "loss": 1.9622, + "step": 4297 + }, + { + "epoch": 0.18785786092049478, + "grad_norm": 2.203125, + "learning_rate": 9.157324211177991e-05, + "loss": 1.7722, + "step": 4298 + }, + { + "epoch": 0.1879015691245247, + "grad_norm": 2.84375, + "learning_rate": 9.15694256414431e-05, + "loss": 2.0762, + "step": 4299 + }, + { + "epoch": 0.18794527732855457, + "grad_norm": 2.15625, + "learning_rate": 9.156560838663076e-05, + "loss": 2.1648, + "step": 4300 + }, + { + "epoch": 0.18798898553258447, + "grad_norm": 2.15625, + "learning_rate": 9.156179034741486e-05, + "loss": 1.9935, + "step": 4301 + }, + { + "epoch": 0.18803269373661435, + "grad_norm": 2.21875, + "learning_rate": 9.155797152386752e-05, + "loss": 1.7464, + "step": 4302 + }, + { + "epoch": 0.18807640194064426, + "grad_norm": 2.921875, + "learning_rate": 9.155415191606074e-05, + "loss": 1.6097, + "step": 4303 + }, + { + "epoch": 0.18812011014467417, + "grad_norm": 2.078125, + "learning_rate": 9.155033152406665e-05, + "loss": 1.9678, + "step": 4304 + }, + { + "epoch": 0.18816381834870405, + "grad_norm": 2.265625, + "learning_rate": 9.154651034795734e-05, + "loss": 1.6927, + "step": 4305 + }, + { + "epoch": 0.18820752655273396, + "grad_norm": 3.046875, + "learning_rate": 9.154268838780489e-05, + "loss": 2.6745, + "step": 4306 + }, + { + "epoch": 0.18825123475676384, + "grad_norm": 2.375, + "learning_rate": 9.153886564368145e-05, + "loss": 2.1593, + "step": 4307 + }, + { + "epoch": 0.18829494296079374, + "grad_norm": 2.0, + "learning_rate": 9.153504211565917e-05, + "loss": 1.9622, + "step": 4308 + }, + { + "epoch": 0.18833865116482365, + "grad_norm": 2.078125, + "learning_rate": 9.15312178038102e-05, + "loss": 1.9805, + "step": 4309 + }, + { + "epoch": 0.18838235936885353, + "grad_norm": 2.265625, + "learning_rate": 9.15273927082067e-05, + "loss": 1.6238, + "step": 4310 + }, + { + "epoch": 0.18842606757288344, + "grad_norm": 2.390625, + "learning_rate": 9.152356682892085e-05, + "loss": 1.8822, + "step": 4311 + }, + { + "epoch": 0.18846977577691332, + "grad_norm": 2.515625, + "learning_rate": 9.151974016602485e-05, + "loss": 1.5081, + "step": 4312 + }, + { + "epoch": 0.18851348398094323, + "grad_norm": 2.46875, + "learning_rate": 9.151591271959094e-05, + "loss": 1.7578, + "step": 4313 + }, + { + "epoch": 0.18855719218497313, + "grad_norm": 2.171875, + "learning_rate": 9.151208448969134e-05, + "loss": 1.9479, + "step": 4314 + }, + { + "epoch": 0.188600900389003, + "grad_norm": 2.671875, + "learning_rate": 9.150825547639827e-05, + "loss": 2.4475, + "step": 4315 + }, + { + "epoch": 0.18864460859303292, + "grad_norm": 2.109375, + "learning_rate": 9.150442567978402e-05, + "loss": 1.4966, + "step": 4316 + }, + { + "epoch": 0.1886883167970628, + "grad_norm": 1.953125, + "learning_rate": 9.150059509992085e-05, + "loss": 1.818, + "step": 4317 + }, + { + "epoch": 0.1887320250010927, + "grad_norm": 2.359375, + "learning_rate": 9.149676373688105e-05, + "loss": 1.8557, + "step": 4318 + }, + { + "epoch": 0.18877573320512261, + "grad_norm": 2.203125, + "learning_rate": 9.149293159073691e-05, + "loss": 1.636, + "step": 4319 + }, + { + "epoch": 0.1888194414091525, + "grad_norm": 2.21875, + "learning_rate": 9.148909866156076e-05, + "loss": 2.0637, + "step": 4320 + }, + { + "epoch": 0.1888631496131824, + "grad_norm": 2.171875, + "learning_rate": 9.148526494942496e-05, + "loss": 1.9292, + "step": 4321 + }, + { + "epoch": 0.18890685781721228, + "grad_norm": 2.1875, + "learning_rate": 9.14814304544018e-05, + "loss": 1.9628, + "step": 4322 + }, + { + "epoch": 0.1889505660212422, + "grad_norm": 2.3125, + "learning_rate": 9.147759517656369e-05, + "loss": 2.0658, + "step": 4323 + }, + { + "epoch": 0.1889942742252721, + "grad_norm": 2.03125, + "learning_rate": 9.147375911598299e-05, + "loss": 1.3896, + "step": 4324 + }, + { + "epoch": 0.18903798242930198, + "grad_norm": 2.296875, + "learning_rate": 9.14699222727321e-05, + "loss": 2.1023, + "step": 4325 + }, + { + "epoch": 0.18908169063333188, + "grad_norm": 2.4375, + "learning_rate": 9.146608464688341e-05, + "loss": 2.0631, + "step": 4326 + }, + { + "epoch": 0.18912539883736176, + "grad_norm": 2.265625, + "learning_rate": 9.146224623850936e-05, + "loss": 2.255, + "step": 4327 + }, + { + "epoch": 0.18916910704139167, + "grad_norm": 2.640625, + "learning_rate": 9.145840704768238e-05, + "loss": 1.578, + "step": 4328 + }, + { + "epoch": 0.18921281524542158, + "grad_norm": 2.515625, + "learning_rate": 9.145456707447491e-05, + "loss": 1.5111, + "step": 4329 + }, + { + "epoch": 0.18925652344945146, + "grad_norm": 3.046875, + "learning_rate": 9.145072631895942e-05, + "loss": 1.6645, + "step": 4330 + }, + { + "epoch": 0.18930023165348137, + "grad_norm": 2.0625, + "learning_rate": 9.14468847812084e-05, + "loss": 1.8262, + "step": 4331 + }, + { + "epoch": 0.18934393985751125, + "grad_norm": 2.1875, + "learning_rate": 9.144304246129434e-05, + "loss": 1.8456, + "step": 4332 + }, + { + "epoch": 0.18938764806154115, + "grad_norm": 2.46875, + "learning_rate": 9.143919935928975e-05, + "loss": 2.0993, + "step": 4333 + }, + { + "epoch": 0.18943135626557106, + "grad_norm": 2.546875, + "learning_rate": 9.143535547526716e-05, + "loss": 2.0291, + "step": 4334 + }, + { + "epoch": 0.18947506446960094, + "grad_norm": 2.40625, + "learning_rate": 9.143151080929911e-05, + "loss": 2.1035, + "step": 4335 + }, + { + "epoch": 0.18951877267363085, + "grad_norm": 3.234375, + "learning_rate": 9.142766536145815e-05, + "loss": 2.0885, + "step": 4336 + }, + { + "epoch": 0.18956248087766073, + "grad_norm": 2.4375, + "learning_rate": 9.142381913181684e-05, + "loss": 1.8083, + "step": 4337 + }, + { + "epoch": 0.18960618908169063, + "grad_norm": 2.609375, + "learning_rate": 9.141997212044779e-05, + "loss": 1.5989, + "step": 4338 + }, + { + "epoch": 0.18964989728572054, + "grad_norm": 3.625, + "learning_rate": 9.141612432742357e-05, + "loss": 2.7321, + "step": 4339 + }, + { + "epoch": 0.18969360548975042, + "grad_norm": 2.40625, + "learning_rate": 9.14122757528168e-05, + "loss": 2.4181, + "step": 4340 + }, + { + "epoch": 0.18973731369378033, + "grad_norm": 2.3125, + "learning_rate": 9.140842639670014e-05, + "loss": 1.8314, + "step": 4341 + }, + { + "epoch": 0.1897810218978102, + "grad_norm": 2.21875, + "learning_rate": 9.140457625914618e-05, + "loss": 2.2134, + "step": 4342 + }, + { + "epoch": 0.18982473010184012, + "grad_norm": 2.21875, + "learning_rate": 9.14007253402276e-05, + "loss": 2.1502, + "step": 4343 + }, + { + "epoch": 0.18986843830587002, + "grad_norm": 2.9375, + "learning_rate": 9.13968736400171e-05, + "loss": 2.2266, + "step": 4344 + }, + { + "epoch": 0.1899121465098999, + "grad_norm": 2.4375, + "learning_rate": 9.139302115858733e-05, + "loss": 1.8241, + "step": 4345 + }, + { + "epoch": 0.1899558547139298, + "grad_norm": 2.59375, + "learning_rate": 9.138916789601102e-05, + "loss": 2.5854, + "step": 4346 + }, + { + "epoch": 0.1899995629179597, + "grad_norm": 2.71875, + "learning_rate": 9.138531385236086e-05, + "loss": 2.0389, + "step": 4347 + }, + { + "epoch": 0.1900432711219896, + "grad_norm": 2.453125, + "learning_rate": 9.13814590277096e-05, + "loss": 2.3824, + "step": 4348 + }, + { + "epoch": 0.1900869793260195, + "grad_norm": 2.296875, + "learning_rate": 9.137760342212997e-05, + "loss": 1.815, + "step": 4349 + }, + { + "epoch": 0.19013068753004939, + "grad_norm": 2.375, + "learning_rate": 9.137374703569475e-05, + "loss": 2.382, + "step": 4350 + }, + { + "epoch": 0.1901743957340793, + "grad_norm": 2.15625, + "learning_rate": 9.13698898684767e-05, + "loss": 1.7854, + "step": 4351 + }, + { + "epoch": 0.19021810393810917, + "grad_norm": 2.0, + "learning_rate": 9.136603192054862e-05, + "loss": 1.8529, + "step": 4352 + }, + { + "epoch": 0.19026181214213908, + "grad_norm": 2.015625, + "learning_rate": 9.136217319198332e-05, + "loss": 1.661, + "step": 4353 + }, + { + "epoch": 0.190305520346169, + "grad_norm": 2.34375, + "learning_rate": 9.135831368285362e-05, + "loss": 1.901, + "step": 4354 + }, + { + "epoch": 0.19034922855019887, + "grad_norm": 1.984375, + "learning_rate": 9.135445339323232e-05, + "loss": 1.9566, + "step": 4355 + }, + { + "epoch": 0.19039293675422878, + "grad_norm": 2.640625, + "learning_rate": 9.135059232319232e-05, + "loss": 2.8984, + "step": 4356 + }, + { + "epoch": 0.19043664495825866, + "grad_norm": 2.921875, + "learning_rate": 9.134673047280645e-05, + "loss": 2.7567, + "step": 4357 + }, + { + "epoch": 0.19048035316228856, + "grad_norm": 2.75, + "learning_rate": 9.134286784214759e-05, + "loss": 2.2439, + "step": 4358 + }, + { + "epoch": 0.19052406136631847, + "grad_norm": 2.328125, + "learning_rate": 9.133900443128864e-05, + "loss": 1.7812, + "step": 4359 + }, + { + "epoch": 0.19056776957034835, + "grad_norm": 2.5, + "learning_rate": 9.133514024030252e-05, + "loss": 2.3759, + "step": 4360 + }, + { + "epoch": 0.19061147777437826, + "grad_norm": 2.515625, + "learning_rate": 9.133127526926215e-05, + "loss": 2.232, + "step": 4361 + }, + { + "epoch": 0.19065518597840814, + "grad_norm": 2.5625, + "learning_rate": 9.132740951824046e-05, + "loss": 1.6692, + "step": 4362 + }, + { + "epoch": 0.19069889418243804, + "grad_norm": 2.59375, + "learning_rate": 9.13235429873104e-05, + "loss": 2.4393, + "step": 4363 + }, + { + "epoch": 0.19074260238646795, + "grad_norm": 2.5, + "learning_rate": 9.131967567654493e-05, + "loss": 2.0248, + "step": 4364 + }, + { + "epoch": 0.19078631059049783, + "grad_norm": 2.25, + "learning_rate": 9.131580758601705e-05, + "loss": 1.9735, + "step": 4365 + }, + { + "epoch": 0.19083001879452774, + "grad_norm": 2.390625, + "learning_rate": 9.131193871579975e-05, + "loss": 2.3554, + "step": 4366 + }, + { + "epoch": 0.19087372699855762, + "grad_norm": 2.65625, + "learning_rate": 9.130806906596603e-05, + "loss": 2.4335, + "step": 4367 + }, + { + "epoch": 0.19091743520258753, + "grad_norm": 1.9765625, + "learning_rate": 9.130419863658894e-05, + "loss": 1.9289, + "step": 4368 + }, + { + "epoch": 0.19096114340661743, + "grad_norm": 2.203125, + "learning_rate": 9.13003274277415e-05, + "loss": 2.2863, + "step": 4369 + }, + { + "epoch": 0.1910048516106473, + "grad_norm": 2.3125, + "learning_rate": 9.129645543949676e-05, + "loss": 2.1855, + "step": 4370 + }, + { + "epoch": 0.19104855981467722, + "grad_norm": 2.0, + "learning_rate": 9.129258267192783e-05, + "loss": 1.8109, + "step": 4371 + }, + { + "epoch": 0.1910922680187071, + "grad_norm": 2.609375, + "learning_rate": 9.128870912510774e-05, + "loss": 1.9122, + "step": 4372 + }, + { + "epoch": 0.191135976222737, + "grad_norm": 2.109375, + "learning_rate": 9.128483479910963e-05, + "loss": 1.843, + "step": 4373 + }, + { + "epoch": 0.19117968442676692, + "grad_norm": 2.5625, + "learning_rate": 9.12809596940066e-05, + "loss": 2.258, + "step": 4374 + }, + { + "epoch": 0.1912233926307968, + "grad_norm": 2.5, + "learning_rate": 9.127708380987176e-05, + "loss": 1.8263, + "step": 4375 + }, + { + "epoch": 0.1912671008348267, + "grad_norm": 2.625, + "learning_rate": 9.12732071467783e-05, + "loss": 1.9707, + "step": 4376 + }, + { + "epoch": 0.19131080903885658, + "grad_norm": 2.390625, + "learning_rate": 9.126932970479933e-05, + "loss": 2.1258, + "step": 4377 + }, + { + "epoch": 0.1913545172428865, + "grad_norm": 2.4375, + "learning_rate": 9.126545148400807e-05, + "loss": 1.8641, + "step": 4378 + }, + { + "epoch": 0.1913982254469164, + "grad_norm": 2.1875, + "learning_rate": 9.126157248447766e-05, + "loss": 1.7545, + "step": 4379 + }, + { + "epoch": 0.19144193365094628, + "grad_norm": 2.328125, + "learning_rate": 9.125769270628133e-05, + "loss": 1.8299, + "step": 4380 + }, + { + "epoch": 0.19148564185497619, + "grad_norm": 2.125, + "learning_rate": 9.125381214949229e-05, + "loss": 2.2791, + "step": 4381 + }, + { + "epoch": 0.19152935005900606, + "grad_norm": 3.890625, + "learning_rate": 9.124993081418375e-05, + "loss": 2.0627, + "step": 4382 + }, + { + "epoch": 0.19157305826303597, + "grad_norm": 2.6875, + "learning_rate": 9.124604870042901e-05, + "loss": 2.8771, + "step": 4383 + }, + { + "epoch": 0.19161676646706588, + "grad_norm": 2.46875, + "learning_rate": 9.12421658083013e-05, + "loss": 1.8637, + "step": 4384 + }, + { + "epoch": 0.19166047467109576, + "grad_norm": 1.890625, + "learning_rate": 9.123828213787389e-05, + "loss": 2.0092, + "step": 4385 + }, + { + "epoch": 0.19170418287512567, + "grad_norm": 1.8828125, + "learning_rate": 9.123439768922006e-05, + "loss": 1.5351, + "step": 4386 + }, + { + "epoch": 0.19174789107915555, + "grad_norm": 3.453125, + "learning_rate": 9.123051246241314e-05, + "loss": 2.1595, + "step": 4387 + }, + { + "epoch": 0.19179159928318545, + "grad_norm": 3.203125, + "learning_rate": 9.122662645752646e-05, + "loss": 1.3175, + "step": 4388 + }, + { + "epoch": 0.19183530748721536, + "grad_norm": 2.671875, + "learning_rate": 9.122273967463331e-05, + "loss": 1.9585, + "step": 4389 + }, + { + "epoch": 0.19187901569124524, + "grad_norm": 2.296875, + "learning_rate": 9.121885211380708e-05, + "loss": 1.9942, + "step": 4390 + }, + { + "epoch": 0.19192272389527515, + "grad_norm": 1.8515625, + "learning_rate": 9.121496377512114e-05, + "loss": 1.6896, + "step": 4391 + }, + { + "epoch": 0.19196643209930503, + "grad_norm": 2.796875, + "learning_rate": 9.121107465864882e-05, + "loss": 2.2896, + "step": 4392 + }, + { + "epoch": 0.19201014030333494, + "grad_norm": 2.15625, + "learning_rate": 9.120718476446354e-05, + "loss": 2.1606, + "step": 4393 + }, + { + "epoch": 0.19205384850736484, + "grad_norm": 2.140625, + "learning_rate": 9.120329409263871e-05, + "loss": 1.8183, + "step": 4394 + }, + { + "epoch": 0.19209755671139472, + "grad_norm": 2.359375, + "learning_rate": 9.119940264324776e-05, + "loss": 1.8949, + "step": 4395 + }, + { + "epoch": 0.19214126491542463, + "grad_norm": 3.21875, + "learning_rate": 9.119551041636412e-05, + "loss": 2.3729, + "step": 4396 + }, + { + "epoch": 0.1921849731194545, + "grad_norm": 1.9375, + "learning_rate": 9.119161741206123e-05, + "loss": 1.9106, + "step": 4397 + }, + { + "epoch": 0.19222868132348442, + "grad_norm": 2.453125, + "learning_rate": 9.11877236304126e-05, + "loss": 2.1901, + "step": 4398 + }, + { + "epoch": 0.19227238952751433, + "grad_norm": 2.25, + "learning_rate": 9.118382907149165e-05, + "loss": 2.2365, + "step": 4399 + }, + { + "epoch": 0.1923160977315442, + "grad_norm": 1.9140625, + "learning_rate": 9.11799337353719e-05, + "loss": 1.8551, + "step": 4400 + }, + { + "epoch": 0.1923598059355741, + "grad_norm": 2.015625, + "learning_rate": 9.117603762212687e-05, + "loss": 1.8847, + "step": 4401 + }, + { + "epoch": 0.192403514139604, + "grad_norm": 2.484375, + "learning_rate": 9.11721407318301e-05, + "loss": 3.003, + "step": 4402 + }, + { + "epoch": 0.1924472223436339, + "grad_norm": 2.03125, + "learning_rate": 9.116824306455509e-05, + "loss": 1.9676, + "step": 4403 + }, + { + "epoch": 0.1924909305476638, + "grad_norm": 2.125, + "learning_rate": 9.116434462037545e-05, + "loss": 1.9876, + "step": 4404 + }, + { + "epoch": 0.1925346387516937, + "grad_norm": 3.40625, + "learning_rate": 9.116044539936468e-05, + "loss": 1.6742, + "step": 4405 + }, + { + "epoch": 0.1925783469557236, + "grad_norm": 2.265625, + "learning_rate": 9.115654540159641e-05, + "loss": 2.2874, + "step": 4406 + }, + { + "epoch": 0.19262205515975347, + "grad_norm": 2.15625, + "learning_rate": 9.115264462714421e-05, + "loss": 1.6118, + "step": 4407 + }, + { + "epoch": 0.19266576336378338, + "grad_norm": 2.203125, + "learning_rate": 9.114874307608175e-05, + "loss": 1.7595, + "step": 4408 + }, + { + "epoch": 0.1927094715678133, + "grad_norm": 2.8125, + "learning_rate": 9.114484074848259e-05, + "loss": 2.242, + "step": 4409 + }, + { + "epoch": 0.19275317977184317, + "grad_norm": 2.296875, + "learning_rate": 9.11409376444204e-05, + "loss": 1.8576, + "step": 4410 + }, + { + "epoch": 0.19279688797587308, + "grad_norm": 3.46875, + "learning_rate": 9.113703376396885e-05, + "loss": 2.2897, + "step": 4411 + }, + { + "epoch": 0.19284059617990296, + "grad_norm": 2.3125, + "learning_rate": 9.11331291072016e-05, + "loss": 2.1501, + "step": 4412 + }, + { + "epoch": 0.19288430438393286, + "grad_norm": 2.125, + "learning_rate": 9.112922367419234e-05, + "loss": 2.0837, + "step": 4413 + }, + { + "epoch": 0.19292801258796277, + "grad_norm": 2.453125, + "learning_rate": 9.112531746501476e-05, + "loss": 1.7882, + "step": 4414 + }, + { + "epoch": 0.19297172079199265, + "grad_norm": 2.921875, + "learning_rate": 9.112141047974259e-05, + "loss": 2.9361, + "step": 4415 + }, + { + "epoch": 0.19301542899602256, + "grad_norm": 2.21875, + "learning_rate": 9.111750271844957e-05, + "loss": 2.1117, + "step": 4416 + }, + { + "epoch": 0.19305913720005244, + "grad_norm": 2.828125, + "learning_rate": 9.11135941812094e-05, + "loss": 2.7356, + "step": 4417 + }, + { + "epoch": 0.19310284540408235, + "grad_norm": 2.125, + "learning_rate": 9.110968486809588e-05, + "loss": 1.5091, + "step": 4418 + }, + { + "epoch": 0.19314655360811225, + "grad_norm": 2.28125, + "learning_rate": 9.110577477918279e-05, + "loss": 2.2072, + "step": 4419 + }, + { + "epoch": 0.19319026181214213, + "grad_norm": 2.265625, + "learning_rate": 9.110186391454389e-05, + "loss": 1.8448, + "step": 4420 + }, + { + "epoch": 0.19323397001617204, + "grad_norm": 2.234375, + "learning_rate": 9.1097952274253e-05, + "loss": 1.9389, + "step": 4421 + }, + { + "epoch": 0.19327767822020192, + "grad_norm": 2.140625, + "learning_rate": 9.109403985838392e-05, + "loss": 2.0713, + "step": 4422 + }, + { + "epoch": 0.19332138642423183, + "grad_norm": 2.40625, + "learning_rate": 9.109012666701051e-05, + "loss": 2.0326, + "step": 4423 + }, + { + "epoch": 0.19336509462826174, + "grad_norm": 2.5, + "learning_rate": 9.108621270020661e-05, + "loss": 2.5237, + "step": 4424 + }, + { + "epoch": 0.19340880283229162, + "grad_norm": 2.265625, + "learning_rate": 9.108229795804609e-05, + "loss": 1.8756, + "step": 4425 + }, + { + "epoch": 0.19345251103632152, + "grad_norm": 1.9921875, + "learning_rate": 9.10783824406028e-05, + "loss": 1.8804, + "step": 4426 + }, + { + "epoch": 0.1934962192403514, + "grad_norm": 3.484375, + "learning_rate": 9.107446614795063e-05, + "loss": 2.3902, + "step": 4427 + }, + { + "epoch": 0.1935399274443813, + "grad_norm": 2.171875, + "learning_rate": 9.107054908016352e-05, + "loss": 2.1118, + "step": 4428 + }, + { + "epoch": 0.19358363564841122, + "grad_norm": 2.921875, + "learning_rate": 9.106663123731539e-05, + "loss": 1.6956, + "step": 4429 + }, + { + "epoch": 0.1936273438524411, + "grad_norm": 2.984375, + "learning_rate": 9.106271261948014e-05, + "loss": 2.6526, + "step": 4430 + }, + { + "epoch": 0.193671052056471, + "grad_norm": 2.15625, + "learning_rate": 9.105879322673174e-05, + "loss": 1.9012, + "step": 4431 + }, + { + "epoch": 0.19371476026050088, + "grad_norm": 2.25, + "learning_rate": 9.105487305914416e-05, + "loss": 1.5692, + "step": 4432 + }, + { + "epoch": 0.1937584684645308, + "grad_norm": 2.140625, + "learning_rate": 9.105095211679135e-05, + "loss": 2.1344, + "step": 4433 + }, + { + "epoch": 0.1938021766685607, + "grad_norm": 2.171875, + "learning_rate": 9.104703039974736e-05, + "loss": 2.1873, + "step": 4434 + }, + { + "epoch": 0.19384588487259058, + "grad_norm": 2.421875, + "learning_rate": 9.104310790808614e-05, + "loss": 2.5386, + "step": 4435 + }, + { + "epoch": 0.1938895930766205, + "grad_norm": 2.4375, + "learning_rate": 9.103918464188175e-05, + "loss": 2.2476, + "step": 4436 + }, + { + "epoch": 0.19393330128065037, + "grad_norm": 2.265625, + "learning_rate": 9.103526060120821e-05, + "loss": 1.8868, + "step": 4437 + }, + { + "epoch": 0.19397700948468027, + "grad_norm": 2.09375, + "learning_rate": 9.103133578613959e-05, + "loss": 1.8679, + "step": 4438 + }, + { + "epoch": 0.19402071768871018, + "grad_norm": 2.34375, + "learning_rate": 9.102741019674993e-05, + "loss": 1.5364, + "step": 4439 + }, + { + "epoch": 0.19406442589274006, + "grad_norm": 2.640625, + "learning_rate": 9.102348383311334e-05, + "loss": 1.9936, + "step": 4440 + }, + { + "epoch": 0.19410813409676997, + "grad_norm": 2.484375, + "learning_rate": 9.101955669530391e-05, + "loss": 2.3603, + "step": 4441 + }, + { + "epoch": 0.19415184230079985, + "grad_norm": 2.09375, + "learning_rate": 9.101562878339572e-05, + "loss": 1.5734, + "step": 4442 + }, + { + "epoch": 0.19419555050482976, + "grad_norm": 2.125, + "learning_rate": 9.101170009746294e-05, + "loss": 1.8535, + "step": 4443 + }, + { + "epoch": 0.19423925870885966, + "grad_norm": 3.09375, + "learning_rate": 9.100777063757969e-05, + "loss": 2.6233, + "step": 4444 + }, + { + "epoch": 0.19428296691288954, + "grad_norm": 3.453125, + "learning_rate": 9.100384040382011e-05, + "loss": 3.0462, + "step": 4445 + }, + { + "epoch": 0.19432667511691945, + "grad_norm": 2.75, + "learning_rate": 9.099990939625838e-05, + "loss": 2.0341, + "step": 4446 + }, + { + "epoch": 0.19437038332094933, + "grad_norm": 2.4375, + "learning_rate": 9.099597761496869e-05, + "loss": 2.5488, + "step": 4447 + }, + { + "epoch": 0.19441409152497924, + "grad_norm": 2.09375, + "learning_rate": 9.099204506002525e-05, + "loss": 1.9444, + "step": 4448 + }, + { + "epoch": 0.19445779972900915, + "grad_norm": 2.328125, + "learning_rate": 9.098811173150225e-05, + "loss": 1.9015, + "step": 4449 + }, + { + "epoch": 0.19450150793303903, + "grad_norm": 2.625, + "learning_rate": 9.098417762947394e-05, + "loss": 1.7492, + "step": 4450 + }, + { + "epoch": 0.19454521613706893, + "grad_norm": 2.828125, + "learning_rate": 9.098024275401454e-05, + "loss": 2.0373, + "step": 4451 + }, + { + "epoch": 0.1945889243410988, + "grad_norm": 2.25, + "learning_rate": 9.097630710519829e-05, + "loss": 2.2535, + "step": 4452 + }, + { + "epoch": 0.19463263254512872, + "grad_norm": 2.578125, + "learning_rate": 9.097237068309951e-05, + "loss": 1.9008, + "step": 4453 + }, + { + "epoch": 0.19467634074915863, + "grad_norm": 2.59375, + "learning_rate": 9.096843348779247e-05, + "loss": 2.2965, + "step": 4454 + }, + { + "epoch": 0.1947200489531885, + "grad_norm": 3.046875, + "learning_rate": 9.096449551935144e-05, + "loss": 2.1173, + "step": 4455 + }, + { + "epoch": 0.19476375715721841, + "grad_norm": 2.0, + "learning_rate": 9.096055677785078e-05, + "loss": 1.7159, + "step": 4456 + }, + { + "epoch": 0.1948074653612483, + "grad_norm": 1.8671875, + "learning_rate": 9.09566172633648e-05, + "loss": 1.6598, + "step": 4457 + }, + { + "epoch": 0.1948511735652782, + "grad_norm": 2.125, + "learning_rate": 9.095267697596782e-05, + "loss": 1.8245, + "step": 4458 + }, + { + "epoch": 0.1948948817693081, + "grad_norm": 2.375, + "learning_rate": 9.094873591573423e-05, + "loss": 2.1265, + "step": 4459 + }, + { + "epoch": 0.194938589973338, + "grad_norm": 2.3125, + "learning_rate": 9.094479408273841e-05, + "loss": 2.1805, + "step": 4460 + }, + { + "epoch": 0.1949822981773679, + "grad_norm": 2.421875, + "learning_rate": 9.094085147705472e-05, + "loss": 2.1034, + "step": 4461 + }, + { + "epoch": 0.19502600638139778, + "grad_norm": 2.484375, + "learning_rate": 9.093690809875758e-05, + "loss": 1.6653, + "step": 4462 + }, + { + "epoch": 0.19506971458542768, + "grad_norm": 2.078125, + "learning_rate": 9.09329639479214e-05, + "loss": 1.5786, + "step": 4463 + }, + { + "epoch": 0.1951134227894576, + "grad_norm": 2.4375, + "learning_rate": 9.092901902462062e-05, + "loss": 1.8306, + "step": 4464 + }, + { + "epoch": 0.19515713099348747, + "grad_norm": 3.484375, + "learning_rate": 9.092507332892968e-05, + "loss": 2.9709, + "step": 4465 + }, + { + "epoch": 0.19520083919751738, + "grad_norm": 2.28125, + "learning_rate": 9.092112686092304e-05, + "loss": 2.0711, + "step": 4466 + }, + { + "epoch": 0.19524454740154726, + "grad_norm": 2.1875, + "learning_rate": 9.091717962067518e-05, + "loss": 2.3881, + "step": 4467 + }, + { + "epoch": 0.19528825560557717, + "grad_norm": 2.296875, + "learning_rate": 9.09132316082606e-05, + "loss": 2.128, + "step": 4468 + }, + { + "epoch": 0.19533196380960707, + "grad_norm": 2.53125, + "learning_rate": 9.090928282375378e-05, + "loss": 1.5185, + "step": 4469 + }, + { + "epoch": 0.19537567201363695, + "grad_norm": 2.4375, + "learning_rate": 9.090533326722924e-05, + "loss": 2.0833, + "step": 4470 + }, + { + "epoch": 0.19541938021766686, + "grad_norm": 2.578125, + "learning_rate": 9.090138293876155e-05, + "loss": 2.3409, + "step": 4471 + }, + { + "epoch": 0.19546308842169674, + "grad_norm": 2.328125, + "learning_rate": 9.089743183842523e-05, + "loss": 1.8522, + "step": 4472 + }, + { + "epoch": 0.19550679662572665, + "grad_norm": 2.0, + "learning_rate": 9.089347996629484e-05, + "loss": 1.7542, + "step": 4473 + }, + { + "epoch": 0.19555050482975656, + "grad_norm": 2.203125, + "learning_rate": 9.088952732244495e-05, + "loss": 1.8944, + "step": 4474 + }, + { + "epoch": 0.19559421303378643, + "grad_norm": 2.34375, + "learning_rate": 9.088557390695021e-05, + "loss": 1.8428, + "step": 4475 + }, + { + "epoch": 0.19563792123781634, + "grad_norm": 2.28125, + "learning_rate": 9.088161971988516e-05, + "loss": 1.6356, + "step": 4476 + }, + { + "epoch": 0.19568162944184622, + "grad_norm": 2.0625, + "learning_rate": 9.087766476132444e-05, + "loss": 1.7155, + "step": 4477 + }, + { + "epoch": 0.19572533764587613, + "grad_norm": 5.75, + "learning_rate": 9.08737090313427e-05, + "loss": 2.7836, + "step": 4478 + }, + { + "epoch": 0.19576904584990604, + "grad_norm": 2.0625, + "learning_rate": 9.086975253001457e-05, + "loss": 1.6453, + "step": 4479 + }, + { + "epoch": 0.19581275405393592, + "grad_norm": 2.15625, + "learning_rate": 9.086579525741475e-05, + "loss": 1.6787, + "step": 4480 + }, + { + "epoch": 0.19585646225796582, + "grad_norm": 2.140625, + "learning_rate": 9.086183721361787e-05, + "loss": 2.1751, + "step": 4481 + }, + { + "epoch": 0.1959001704619957, + "grad_norm": 2.359375, + "learning_rate": 9.085787839869866e-05, + "loss": 2.2441, + "step": 4482 + }, + { + "epoch": 0.1959438786660256, + "grad_norm": 2.125, + "learning_rate": 9.085391881273182e-05, + "loss": 1.4872, + "step": 4483 + }, + { + "epoch": 0.19598758687005552, + "grad_norm": 2.578125, + "learning_rate": 9.084995845579208e-05, + "loss": 1.9066, + "step": 4484 + }, + { + "epoch": 0.1960312950740854, + "grad_norm": 2.25, + "learning_rate": 9.084599732795415e-05, + "loss": 2.2245, + "step": 4485 + }, + { + "epoch": 0.1960750032781153, + "grad_norm": 2.421875, + "learning_rate": 9.084203542929282e-05, + "loss": 2.049, + "step": 4486 + }, + { + "epoch": 0.19611871148214519, + "grad_norm": 2.234375, + "learning_rate": 9.083807275988284e-05, + "loss": 2.4972, + "step": 4487 + }, + { + "epoch": 0.1961624196861751, + "grad_norm": 2.078125, + "learning_rate": 9.083410931979899e-05, + "loss": 1.5624, + "step": 4488 + }, + { + "epoch": 0.196206127890205, + "grad_norm": 2.234375, + "learning_rate": 9.083014510911604e-05, + "loss": 1.945, + "step": 4489 + }, + { + "epoch": 0.19624983609423488, + "grad_norm": 2.21875, + "learning_rate": 9.082618012790886e-05, + "loss": 2.0894, + "step": 4490 + }, + { + "epoch": 0.1962935442982648, + "grad_norm": 2.203125, + "learning_rate": 9.082221437625223e-05, + "loss": 2.0207, + "step": 4491 + }, + { + "epoch": 0.19633725250229467, + "grad_norm": 2.15625, + "learning_rate": 9.081824785422099e-05, + "loss": 1.7351, + "step": 4492 + }, + { + "epoch": 0.19638096070632458, + "grad_norm": 2.859375, + "learning_rate": 9.081428056189e-05, + "loss": 3.121, + "step": 4493 + }, + { + "epoch": 0.19642466891035448, + "grad_norm": 2.21875, + "learning_rate": 9.081031249933416e-05, + "loss": 2.1267, + "step": 4494 + }, + { + "epoch": 0.19646837711438436, + "grad_norm": 2.046875, + "learning_rate": 9.080634366662832e-05, + "loss": 1.6989, + "step": 4495 + }, + { + "epoch": 0.19651208531841427, + "grad_norm": 2.46875, + "learning_rate": 9.080237406384738e-05, + "loss": 1.9422, + "step": 4496 + }, + { + "epoch": 0.19655579352244415, + "grad_norm": 2.3125, + "learning_rate": 9.079840369106625e-05, + "loss": 2.0947, + "step": 4497 + }, + { + "epoch": 0.19659950172647406, + "grad_norm": 3.421875, + "learning_rate": 9.079443254835987e-05, + "loss": 1.5288, + "step": 4498 + }, + { + "epoch": 0.19664320993050396, + "grad_norm": 2.640625, + "learning_rate": 9.079046063580316e-05, + "loss": 1.7854, + "step": 4499 + }, + { + "epoch": 0.19668691813453384, + "grad_norm": 2.328125, + "learning_rate": 9.07864879534711e-05, + "loss": 1.9261, + "step": 4500 + }, + { + "epoch": 0.19673062633856375, + "grad_norm": 2.140625, + "learning_rate": 9.078251450143866e-05, + "loss": 1.7282, + "step": 4501 + }, + { + "epoch": 0.19677433454259363, + "grad_norm": 2.203125, + "learning_rate": 9.077854027978078e-05, + "loss": 1.9388, + "step": 4502 + }, + { + "epoch": 0.19681804274662354, + "grad_norm": 2.875, + "learning_rate": 9.077456528857254e-05, + "loss": 1.6096, + "step": 4503 + }, + { + "epoch": 0.19686175095065345, + "grad_norm": 3.625, + "learning_rate": 9.077058952788888e-05, + "loss": 1.4145, + "step": 4504 + }, + { + "epoch": 0.19690545915468333, + "grad_norm": 2.046875, + "learning_rate": 9.076661299780486e-05, + "loss": 2.0563, + "step": 4505 + }, + { + "epoch": 0.19694916735871323, + "grad_norm": 2.5625, + "learning_rate": 9.076263569839553e-05, + "loss": 1.9754, + "step": 4506 + }, + { + "epoch": 0.1969928755627431, + "grad_norm": 2.21875, + "learning_rate": 9.075865762973595e-05, + "loss": 1.9317, + "step": 4507 + }, + { + "epoch": 0.19703658376677302, + "grad_norm": 2.09375, + "learning_rate": 9.075467879190117e-05, + "loss": 1.9321, + "step": 4508 + }, + { + "epoch": 0.19708029197080293, + "grad_norm": 2.6875, + "learning_rate": 9.075069918496625e-05, + "loss": 2.1764, + "step": 4509 + }, + { + "epoch": 0.1971240001748328, + "grad_norm": 2.328125, + "learning_rate": 9.074671880900636e-05, + "loss": 1.9645, + "step": 4510 + }, + { + "epoch": 0.19716770837886272, + "grad_norm": 2.28125, + "learning_rate": 9.074273766409657e-05, + "loss": 1.7994, + "step": 4511 + }, + { + "epoch": 0.1972114165828926, + "grad_norm": 2.640625, + "learning_rate": 9.073875575031203e-05, + "loss": 2.1177, + "step": 4512 + }, + { + "epoch": 0.1972551247869225, + "grad_norm": 2.375, + "learning_rate": 9.073477306772789e-05, + "loss": 1.6883, + "step": 4513 + }, + { + "epoch": 0.1972988329909524, + "grad_norm": 2.359375, + "learning_rate": 9.073078961641928e-05, + "loss": 2.5953, + "step": 4514 + }, + { + "epoch": 0.1973425411949823, + "grad_norm": 2.421875, + "learning_rate": 9.07268053964614e-05, + "loss": 2.3189, + "step": 4515 + }, + { + "epoch": 0.1973862493990122, + "grad_norm": 2.609375, + "learning_rate": 9.072282040792939e-05, + "loss": 2.5728, + "step": 4516 + }, + { + "epoch": 0.19742995760304208, + "grad_norm": 2.75, + "learning_rate": 9.071883465089852e-05, + "loss": 2.2928, + "step": 4517 + }, + { + "epoch": 0.19747366580707199, + "grad_norm": 5.96875, + "learning_rate": 9.071484812544398e-05, + "loss": 1.8055, + "step": 4518 + }, + { + "epoch": 0.1975173740111019, + "grad_norm": 3.359375, + "learning_rate": 9.071086083164099e-05, + "loss": 3.7166, + "step": 4519 + }, + { + "epoch": 0.19756108221513177, + "grad_norm": 2.09375, + "learning_rate": 9.07068727695648e-05, + "loss": 1.6634, + "step": 4520 + }, + { + "epoch": 0.19760479041916168, + "grad_norm": 2.390625, + "learning_rate": 9.07028839392907e-05, + "loss": 2.0254, + "step": 4521 + }, + { + "epoch": 0.19764849862319156, + "grad_norm": 2.1875, + "learning_rate": 9.069889434089391e-05, + "loss": 1.9842, + "step": 4522 + }, + { + "epoch": 0.19769220682722147, + "grad_norm": 2.125, + "learning_rate": 9.069490397444975e-05, + "loss": 1.9428, + "step": 4523 + }, + { + "epoch": 0.19773591503125137, + "grad_norm": 2.578125, + "learning_rate": 9.069091284003354e-05, + "loss": 1.9117, + "step": 4524 + }, + { + "epoch": 0.19777962323528125, + "grad_norm": 2.9375, + "learning_rate": 9.068692093772058e-05, + "loss": 2.1338, + "step": 4525 + }, + { + "epoch": 0.19782333143931116, + "grad_norm": 2.015625, + "learning_rate": 9.06829282675862e-05, + "loss": 2.0547, + "step": 4526 + }, + { + "epoch": 0.19786703964334104, + "grad_norm": 2.171875, + "learning_rate": 9.067893482970575e-05, + "loss": 1.9351, + "step": 4527 + }, + { + "epoch": 0.19791074784737095, + "grad_norm": 2.046875, + "learning_rate": 9.06749406241546e-05, + "loss": 2.3361, + "step": 4528 + }, + { + "epoch": 0.19795445605140086, + "grad_norm": 2.15625, + "learning_rate": 9.067094565100811e-05, + "loss": 1.8335, + "step": 4529 + }, + { + "epoch": 0.19799816425543074, + "grad_norm": 2.59375, + "learning_rate": 9.066694991034169e-05, + "loss": 2.0857, + "step": 4530 + }, + { + "epoch": 0.19804187245946064, + "grad_norm": 2.46875, + "learning_rate": 9.066295340223073e-05, + "loss": 2.4841, + "step": 4531 + }, + { + "epoch": 0.19808558066349052, + "grad_norm": 3.109375, + "learning_rate": 9.065895612675066e-05, + "loss": 2.5671, + "step": 4532 + }, + { + "epoch": 0.19812928886752043, + "grad_norm": 2.078125, + "learning_rate": 9.065495808397693e-05, + "loss": 1.6286, + "step": 4533 + }, + { + "epoch": 0.19817299707155034, + "grad_norm": 2.09375, + "learning_rate": 9.065095927398495e-05, + "loss": 1.6137, + "step": 4534 + }, + { + "epoch": 0.19821670527558022, + "grad_norm": 2.1875, + "learning_rate": 9.064695969685021e-05, + "loss": 1.7371, + "step": 4535 + }, + { + "epoch": 0.19826041347961013, + "grad_norm": 2.5, + "learning_rate": 9.064295935264818e-05, + "loss": 1.8502, + "step": 4536 + }, + { + "epoch": 0.19830412168364, + "grad_norm": 2.484375, + "learning_rate": 9.063895824145435e-05, + "loss": 2.0257, + "step": 4537 + }, + { + "epoch": 0.1983478298876699, + "grad_norm": 2.171875, + "learning_rate": 9.063495636334423e-05, + "loss": 1.7634, + "step": 4538 + }, + { + "epoch": 0.19839153809169982, + "grad_norm": 2.703125, + "learning_rate": 9.063095371839337e-05, + "loss": 1.9927, + "step": 4539 + }, + { + "epoch": 0.1984352462957297, + "grad_norm": 2.546875, + "learning_rate": 9.062695030667724e-05, + "loss": 2.03, + "step": 4540 + }, + { + "epoch": 0.1984789544997596, + "grad_norm": 2.640625, + "learning_rate": 9.062294612827145e-05, + "loss": 1.8626, + "step": 4541 + }, + { + "epoch": 0.1985226627037895, + "grad_norm": 2.421875, + "learning_rate": 9.061894118325154e-05, + "loss": 2.0393, + "step": 4542 + }, + { + "epoch": 0.1985663709078194, + "grad_norm": 1.984375, + "learning_rate": 9.061493547169308e-05, + "loss": 1.5743, + "step": 4543 + }, + { + "epoch": 0.1986100791118493, + "grad_norm": 2.125, + "learning_rate": 9.061092899367169e-05, + "loss": 2.0798, + "step": 4544 + }, + { + "epoch": 0.19865378731587918, + "grad_norm": 8.375, + "learning_rate": 9.060692174926296e-05, + "loss": 2.4696, + "step": 4545 + }, + { + "epoch": 0.1986974955199091, + "grad_norm": 2.109375, + "learning_rate": 9.060291373854251e-05, + "loss": 1.7727, + "step": 4546 + }, + { + "epoch": 0.19874120372393897, + "grad_norm": 2.3125, + "learning_rate": 9.059890496158599e-05, + "loss": 2.1714, + "step": 4547 + }, + { + "epoch": 0.19878491192796888, + "grad_norm": 2.9375, + "learning_rate": 9.059489541846903e-05, + "loss": 1.4298, + "step": 4548 + }, + { + "epoch": 0.19882862013199878, + "grad_norm": 2.359375, + "learning_rate": 9.059088510926732e-05, + "loss": 2.2674, + "step": 4549 + }, + { + "epoch": 0.19887232833602866, + "grad_norm": 25.75, + "learning_rate": 9.058687403405653e-05, + "loss": 2.4088, + "step": 4550 + }, + { + "epoch": 0.19891603654005857, + "grad_norm": 1.90625, + "learning_rate": 9.058286219291237e-05, + "loss": 1.5939, + "step": 4551 + }, + { + "epoch": 0.19895974474408848, + "grad_norm": 2.515625, + "learning_rate": 9.057884958591052e-05, + "loss": 2.0231, + "step": 4552 + }, + { + "epoch": 0.19900345294811836, + "grad_norm": 2.34375, + "learning_rate": 9.057483621312671e-05, + "loss": 1.5278, + "step": 4553 + }, + { + "epoch": 0.19904716115214827, + "grad_norm": 2.203125, + "learning_rate": 9.05708220746367e-05, + "loss": 2.2683, + "step": 4554 + }, + { + "epoch": 0.19909086935617815, + "grad_norm": 2.171875, + "learning_rate": 9.056680717051622e-05, + "loss": 1.9358, + "step": 4555 + }, + { + "epoch": 0.19913457756020805, + "grad_norm": 2.984375, + "learning_rate": 9.056279150084106e-05, + "loss": 2.4731, + "step": 4556 + }, + { + "epoch": 0.19917828576423796, + "grad_norm": 2.0625, + "learning_rate": 9.055877506568699e-05, + "loss": 1.8777, + "step": 4557 + }, + { + "epoch": 0.19922199396826784, + "grad_norm": 1.984375, + "learning_rate": 9.05547578651298e-05, + "loss": 1.6332, + "step": 4558 + }, + { + "epoch": 0.19926570217229775, + "grad_norm": 2.421875, + "learning_rate": 9.05507398992453e-05, + "loss": 2.3121, + "step": 4559 + }, + { + "epoch": 0.19930941037632763, + "grad_norm": 2.625, + "learning_rate": 9.054672116810932e-05, + "loss": 2.1761, + "step": 4560 + }, + { + "epoch": 0.19935311858035754, + "grad_norm": 7.90625, + "learning_rate": 9.054270167179768e-05, + "loss": 2.2086, + "step": 4561 + }, + { + "epoch": 0.19939682678438744, + "grad_norm": 2.125, + "learning_rate": 9.053868141038628e-05, + "loss": 1.5267, + "step": 4562 + }, + { + "epoch": 0.19944053498841732, + "grad_norm": 2.25, + "learning_rate": 9.053466038395096e-05, + "loss": 1.4548, + "step": 4563 + }, + { + "epoch": 0.19948424319244723, + "grad_norm": 2.609375, + "learning_rate": 9.053063859256758e-05, + "loss": 1.987, + "step": 4564 + }, + { + "epoch": 0.1995279513964771, + "grad_norm": 6.9375, + "learning_rate": 9.052661603631207e-05, + "loss": 1.8233, + "step": 4565 + }, + { + "epoch": 0.19957165960050702, + "grad_norm": 6.25, + "learning_rate": 9.052259271526033e-05, + "loss": 3.0517, + "step": 4566 + }, + { + "epoch": 0.19961536780453693, + "grad_norm": 2.578125, + "learning_rate": 9.05185686294883e-05, + "loss": 1.7139, + "step": 4567 + }, + { + "epoch": 0.1996590760085668, + "grad_norm": 2.90625, + "learning_rate": 9.051454377907191e-05, + "loss": 2.6329, + "step": 4568 + }, + { + "epoch": 0.1997027842125967, + "grad_norm": 2.546875, + "learning_rate": 9.05105181640871e-05, + "loss": 2.0316, + "step": 4569 + }, + { + "epoch": 0.1997464924166266, + "grad_norm": 2.515625, + "learning_rate": 9.050649178460987e-05, + "loss": 2.1056, + "step": 4570 + }, + { + "epoch": 0.1997902006206565, + "grad_norm": 2.5625, + "learning_rate": 9.050246464071616e-05, + "loss": 2.4229, + "step": 4571 + }, + { + "epoch": 0.1998339088246864, + "grad_norm": 2.890625, + "learning_rate": 9.049843673248199e-05, + "loss": 1.7415, + "step": 4572 + }, + { + "epoch": 0.1998776170287163, + "grad_norm": 2.21875, + "learning_rate": 9.04944080599834e-05, + "loss": 2.1538, + "step": 4573 + }, + { + "epoch": 0.1999213252327462, + "grad_norm": 2.34375, + "learning_rate": 9.04903786232964e-05, + "loss": 1.956, + "step": 4574 + }, + { + "epoch": 0.19996503343677607, + "grad_norm": 3.03125, + "learning_rate": 9.048634842249699e-05, + "loss": 2.8857, + "step": 4575 + }, + { + "epoch": 0.20000874164080598, + "grad_norm": 2.109375, + "learning_rate": 9.048231745766129e-05, + "loss": 1.7488, + "step": 4576 + }, + { + "epoch": 0.2000524498448359, + "grad_norm": 2.484375, + "learning_rate": 9.047828572886532e-05, + "loss": 2.6818, + "step": 4577 + }, + { + "epoch": 0.20009615804886577, + "grad_norm": 1.84375, + "learning_rate": 9.04742532361852e-05, + "loss": 1.7045, + "step": 4578 + }, + { + "epoch": 0.20013986625289568, + "grad_norm": 2.625, + "learning_rate": 9.047021997969701e-05, + "loss": 1.9284, + "step": 4579 + }, + { + "epoch": 0.20018357445692556, + "grad_norm": 2.609375, + "learning_rate": 9.046618595947687e-05, + "loss": 2.3077, + "step": 4580 + }, + { + "epoch": 0.20022728266095546, + "grad_norm": 3.109375, + "learning_rate": 9.04621511756009e-05, + "loss": 1.7883, + "step": 4581 + }, + { + "epoch": 0.20027099086498537, + "grad_norm": 2.171875, + "learning_rate": 9.045811562814525e-05, + "loss": 1.8529, + "step": 4582 + }, + { + "epoch": 0.20031469906901525, + "grad_norm": 2.125, + "learning_rate": 9.045407931718608e-05, + "loss": 1.7938, + "step": 4583 + }, + { + "epoch": 0.20035840727304516, + "grad_norm": 2.546875, + "learning_rate": 9.045004224279954e-05, + "loss": 1.9549, + "step": 4584 + }, + { + "epoch": 0.20040211547707504, + "grad_norm": 2.703125, + "learning_rate": 9.044600440506184e-05, + "loss": 1.5256, + "step": 4585 + }, + { + "epoch": 0.20044582368110495, + "grad_norm": 2.921875, + "learning_rate": 9.044196580404917e-05, + "loss": 2.6624, + "step": 4586 + }, + { + "epoch": 0.20048953188513485, + "grad_norm": 3.03125, + "learning_rate": 9.043792643983774e-05, + "loss": 3.1266, + "step": 4587 + }, + { + "epoch": 0.20053324008916473, + "grad_norm": 2.015625, + "learning_rate": 9.04338863125038e-05, + "loss": 1.8653, + "step": 4588 + }, + { + "epoch": 0.20057694829319464, + "grad_norm": 2.1875, + "learning_rate": 9.042984542212355e-05, + "loss": 2.1311, + "step": 4589 + }, + { + "epoch": 0.20062065649722452, + "grad_norm": 2.03125, + "learning_rate": 9.042580376877329e-05, + "loss": 1.9296, + "step": 4590 + }, + { + "epoch": 0.20066436470125443, + "grad_norm": 2.421875, + "learning_rate": 9.042176135252926e-05, + "loss": 2.8345, + "step": 4591 + }, + { + "epoch": 0.20070807290528433, + "grad_norm": 1.9375, + "learning_rate": 9.041771817346778e-05, + "loss": 1.6019, + "step": 4592 + }, + { + "epoch": 0.20075178110931421, + "grad_norm": 2.15625, + "learning_rate": 9.041367423166513e-05, + "loss": 1.8668, + "step": 4593 + }, + { + "epoch": 0.20079548931334412, + "grad_norm": 2.21875, + "learning_rate": 9.040962952719763e-05, + "loss": 2.3199, + "step": 4594 + }, + { + "epoch": 0.200839197517374, + "grad_norm": 3.125, + "learning_rate": 9.040558406014161e-05, + "loss": 1.8228, + "step": 4595 + }, + { + "epoch": 0.2008829057214039, + "grad_norm": 2.21875, + "learning_rate": 9.040153783057342e-05, + "loss": 2.0666, + "step": 4596 + }, + { + "epoch": 0.20092661392543382, + "grad_norm": 3.203125, + "learning_rate": 9.039749083856938e-05, + "loss": 2.0252, + "step": 4597 + }, + { + "epoch": 0.2009703221294637, + "grad_norm": 2.515625, + "learning_rate": 9.039344308420591e-05, + "loss": 2.374, + "step": 4598 + }, + { + "epoch": 0.2010140303334936, + "grad_norm": 3.265625, + "learning_rate": 9.038939456755938e-05, + "loss": 2.3647, + "step": 4599 + }, + { + "epoch": 0.20105773853752348, + "grad_norm": 2.375, + "learning_rate": 9.038534528870618e-05, + "loss": 1.8677, + "step": 4600 + }, + { + "epoch": 0.2011014467415534, + "grad_norm": 2.40625, + "learning_rate": 9.038129524772274e-05, + "loss": 1.8486, + "step": 4601 + }, + { + "epoch": 0.2011451549455833, + "grad_norm": 1.9921875, + "learning_rate": 9.03772444446855e-05, + "loss": 1.3506, + "step": 4602 + }, + { + "epoch": 0.20118886314961318, + "grad_norm": 1.875, + "learning_rate": 9.037319287967088e-05, + "loss": 1.6946, + "step": 4603 + }, + { + "epoch": 0.20123257135364309, + "grad_norm": 1.8828125, + "learning_rate": 9.036914055275534e-05, + "loss": 1.5738, + "step": 4604 + }, + { + "epoch": 0.20127627955767297, + "grad_norm": 2.328125, + "learning_rate": 9.036508746401538e-05, + "loss": 1.9645, + "step": 4605 + }, + { + "epoch": 0.20131998776170287, + "grad_norm": 2.375, + "learning_rate": 9.036103361352746e-05, + "loss": 1.9012, + "step": 4606 + }, + { + "epoch": 0.20136369596573278, + "grad_norm": 3.3125, + "learning_rate": 9.03569790013681e-05, + "loss": 2.5034, + "step": 4607 + }, + { + "epoch": 0.20140740416976266, + "grad_norm": 3.109375, + "learning_rate": 9.035292362761381e-05, + "loss": 1.9332, + "step": 4608 + }, + { + "epoch": 0.20145111237379257, + "grad_norm": 2.1875, + "learning_rate": 9.034886749234111e-05, + "loss": 1.7966, + "step": 4609 + }, + { + "epoch": 0.20149482057782245, + "grad_norm": 2.921875, + "learning_rate": 9.034481059562657e-05, + "loss": 2.015, + "step": 4610 + }, + { + "epoch": 0.20153852878185236, + "grad_norm": 2.25, + "learning_rate": 9.034075293754672e-05, + "loss": 2.0806, + "step": 4611 + }, + { + "epoch": 0.20158223698588226, + "grad_norm": 2.375, + "learning_rate": 9.033669451817817e-05, + "loss": 2.1522, + "step": 4612 + }, + { + "epoch": 0.20162594518991214, + "grad_norm": 2.078125, + "learning_rate": 9.033263533759748e-05, + "loss": 1.8821, + "step": 4613 + }, + { + "epoch": 0.20166965339394205, + "grad_norm": 2.40625, + "learning_rate": 9.032857539588124e-05, + "loss": 2.1767, + "step": 4614 + }, + { + "epoch": 0.20171336159797193, + "grad_norm": 2.125, + "learning_rate": 9.03245146931061e-05, + "loss": 2.1203, + "step": 4615 + }, + { + "epoch": 0.20175706980200184, + "grad_norm": 2.53125, + "learning_rate": 9.032045322934868e-05, + "loss": 2.2441, + "step": 4616 + }, + { + "epoch": 0.20180077800603174, + "grad_norm": 4.71875, + "learning_rate": 9.031639100468563e-05, + "loss": 1.45, + "step": 4617 + }, + { + "epoch": 0.20184448621006162, + "grad_norm": 2.40625, + "learning_rate": 9.031232801919359e-05, + "loss": 1.8462, + "step": 4618 + }, + { + "epoch": 0.20188819441409153, + "grad_norm": 2.84375, + "learning_rate": 9.030826427294924e-05, + "loss": 2.1721, + "step": 4619 + }, + { + "epoch": 0.2019319026181214, + "grad_norm": 2.15625, + "learning_rate": 9.03041997660293e-05, + "loss": 1.9742, + "step": 4620 + }, + { + "epoch": 0.20197561082215132, + "grad_norm": 2.71875, + "learning_rate": 9.030013449851045e-05, + "loss": 1.8034, + "step": 4621 + }, + { + "epoch": 0.20201931902618123, + "grad_norm": 2.375, + "learning_rate": 9.029606847046941e-05, + "loss": 2.3671, + "step": 4622 + }, + { + "epoch": 0.2020630272302111, + "grad_norm": 2.59375, + "learning_rate": 9.029200168198289e-05, + "loss": 2.5159, + "step": 4623 + }, + { + "epoch": 0.202106735434241, + "grad_norm": 2.90625, + "learning_rate": 9.028793413312767e-05, + "loss": 2.3314, + "step": 4624 + }, + { + "epoch": 0.2021504436382709, + "grad_norm": 2.265625, + "learning_rate": 9.02838658239805e-05, + "loss": 1.9898, + "step": 4625 + }, + { + "epoch": 0.2021941518423008, + "grad_norm": 2.5, + "learning_rate": 9.027979675461814e-05, + "loss": 2.2, + "step": 4626 + }, + { + "epoch": 0.2022378600463307, + "grad_norm": 2.125, + "learning_rate": 9.027572692511739e-05, + "loss": 2.3884, + "step": 4627 + }, + { + "epoch": 0.2022815682503606, + "grad_norm": 2.59375, + "learning_rate": 9.027165633555507e-05, + "loss": 2.0405, + "step": 4628 + }, + { + "epoch": 0.2023252764543905, + "grad_norm": 2.453125, + "learning_rate": 9.026758498600797e-05, + "loss": 2.6007, + "step": 4629 + }, + { + "epoch": 0.20236898465842038, + "grad_norm": 2.5625, + "learning_rate": 9.026351287655294e-05, + "loss": 2.4496, + "step": 4630 + }, + { + "epoch": 0.20241269286245028, + "grad_norm": 2.921875, + "learning_rate": 9.02594400072668e-05, + "loss": 2.2956, + "step": 4631 + }, + { + "epoch": 0.2024564010664802, + "grad_norm": 2.265625, + "learning_rate": 9.025536637822647e-05, + "loss": 1.6348, + "step": 4632 + }, + { + "epoch": 0.20250010927051007, + "grad_norm": 2.234375, + "learning_rate": 9.025129198950877e-05, + "loss": 2.1001, + "step": 4633 + }, + { + "epoch": 0.20254381747453998, + "grad_norm": 2.53125, + "learning_rate": 9.02472168411906e-05, + "loss": 1.516, + "step": 4634 + }, + { + "epoch": 0.20258752567856986, + "grad_norm": 2.484375, + "learning_rate": 9.024314093334886e-05, + "loss": 2.2612, + "step": 4635 + }, + { + "epoch": 0.20263123388259976, + "grad_norm": 2.171875, + "learning_rate": 9.02390642660605e-05, + "loss": 1.5756, + "step": 4636 + }, + { + "epoch": 0.20267494208662967, + "grad_norm": 2.453125, + "learning_rate": 9.023498683940243e-05, + "loss": 1.7007, + "step": 4637 + }, + { + "epoch": 0.20271865029065955, + "grad_norm": 2.34375, + "learning_rate": 9.02309086534516e-05, + "loss": 1.9744, + "step": 4638 + }, + { + "epoch": 0.20276235849468946, + "grad_norm": 2.53125, + "learning_rate": 9.022682970828497e-05, + "loss": 2.396, + "step": 4639 + }, + { + "epoch": 0.20280606669871934, + "grad_norm": 1.953125, + "learning_rate": 9.022275000397951e-05, + "loss": 1.851, + "step": 4640 + }, + { + "epoch": 0.20284977490274925, + "grad_norm": 2.390625, + "learning_rate": 9.021866954061223e-05, + "loss": 1.8836, + "step": 4641 + }, + { + "epoch": 0.20289348310677915, + "grad_norm": 2.625, + "learning_rate": 9.02145883182601e-05, + "loss": 2.2614, + "step": 4642 + }, + { + "epoch": 0.20293719131080903, + "grad_norm": 3.046875, + "learning_rate": 9.021050633700018e-05, + "loss": 2.3415, + "step": 4643 + }, + { + "epoch": 0.20298089951483894, + "grad_norm": 2.75, + "learning_rate": 9.020642359690947e-05, + "loss": 1.7504, + "step": 4644 + }, + { + "epoch": 0.20302460771886882, + "grad_norm": 2.484375, + "learning_rate": 9.020234009806503e-05, + "loss": 1.7237, + "step": 4645 + }, + { + "epoch": 0.20306831592289873, + "grad_norm": 2.453125, + "learning_rate": 9.01982558405439e-05, + "loss": 2.115, + "step": 4646 + }, + { + "epoch": 0.20311202412692864, + "grad_norm": 2.875, + "learning_rate": 9.019417082442321e-05, + "loss": 2.3209, + "step": 4647 + }, + { + "epoch": 0.20315573233095852, + "grad_norm": 2.421875, + "learning_rate": 9.019008504978e-05, + "loss": 2.3012, + "step": 4648 + }, + { + "epoch": 0.20319944053498842, + "grad_norm": 2.484375, + "learning_rate": 9.01859985166914e-05, + "loss": 1.8486, + "step": 4649 + }, + { + "epoch": 0.2032431487390183, + "grad_norm": 2.171875, + "learning_rate": 9.018191122523452e-05, + "loss": 1.7963, + "step": 4650 + }, + { + "epoch": 0.2032868569430482, + "grad_norm": 2.34375, + "learning_rate": 9.017782317548649e-05, + "loss": 1.8977, + "step": 4651 + }, + { + "epoch": 0.20333056514707812, + "grad_norm": 2.234375, + "learning_rate": 9.017373436752445e-05, + "loss": 2.0669, + "step": 4652 + }, + { + "epoch": 0.203374273351108, + "grad_norm": 2.21875, + "learning_rate": 9.016964480142557e-05, + "loss": 2.3295, + "step": 4653 + }, + { + "epoch": 0.2034179815551379, + "grad_norm": 2.140625, + "learning_rate": 9.016555447726704e-05, + "loss": 1.9676, + "step": 4654 + }, + { + "epoch": 0.20346168975916779, + "grad_norm": 3.921875, + "learning_rate": 9.016146339512606e-05, + "loss": 1.961, + "step": 4655 + }, + { + "epoch": 0.2035053979631977, + "grad_norm": 2.5, + "learning_rate": 9.015737155507978e-05, + "loss": 2.3247, + "step": 4656 + }, + { + "epoch": 0.2035491061672276, + "grad_norm": 2.046875, + "learning_rate": 9.015327895720547e-05, + "loss": 1.9737, + "step": 4657 + }, + { + "epoch": 0.20359281437125748, + "grad_norm": 2.125, + "learning_rate": 9.014918560158035e-05, + "loss": 2.0781, + "step": 4658 + }, + { + "epoch": 0.2036365225752874, + "grad_norm": 2.53125, + "learning_rate": 9.014509148828164e-05, + "loss": 2.504, + "step": 4659 + }, + { + "epoch": 0.20368023077931727, + "grad_norm": 2.171875, + "learning_rate": 9.014099661738664e-05, + "loss": 1.7876, + "step": 4660 + }, + { + "epoch": 0.20372393898334717, + "grad_norm": 2.421875, + "learning_rate": 9.01369009889726e-05, + "loss": 1.7947, + "step": 4661 + }, + { + "epoch": 0.20376764718737708, + "grad_norm": 3.546875, + "learning_rate": 9.013280460311684e-05, + "loss": 3.0699, + "step": 4662 + }, + { + "epoch": 0.20381135539140696, + "grad_norm": 2.671875, + "learning_rate": 9.012870745989663e-05, + "loss": 2.0769, + "step": 4663 + }, + { + "epoch": 0.20385506359543687, + "grad_norm": 3.78125, + "learning_rate": 9.012460955938933e-05, + "loss": 1.6031, + "step": 4664 + }, + { + "epoch": 0.20389877179946675, + "grad_norm": 2.296875, + "learning_rate": 9.012051090167222e-05, + "loss": 2.1118, + "step": 4665 + }, + { + "epoch": 0.20394248000349666, + "grad_norm": 4.28125, + "learning_rate": 9.01164114868227e-05, + "loss": 1.6598, + "step": 4666 + }, + { + "epoch": 0.20398618820752656, + "grad_norm": 2.25, + "learning_rate": 9.011231131491808e-05, + "loss": 2.263, + "step": 4667 + }, + { + "epoch": 0.20402989641155644, + "grad_norm": 2.59375, + "learning_rate": 9.01082103860358e-05, + "loss": 1.7258, + "step": 4668 + }, + { + "epoch": 0.20407360461558635, + "grad_norm": 2.625, + "learning_rate": 9.010410870025319e-05, + "loss": 2.1402, + "step": 4669 + }, + { + "epoch": 0.20411731281961623, + "grad_norm": 1.9140625, + "learning_rate": 9.01000062576477e-05, + "loss": 1.9602, + "step": 4670 + }, + { + "epoch": 0.20416102102364614, + "grad_norm": 3.6875, + "learning_rate": 9.009590305829672e-05, + "loss": 2.8867, + "step": 4671 + }, + { + "epoch": 0.20420472922767605, + "grad_norm": 2.296875, + "learning_rate": 9.009179910227768e-05, + "loss": 1.8176, + "step": 4672 + }, + { + "epoch": 0.20424843743170593, + "grad_norm": 2.53125, + "learning_rate": 9.008769438966805e-05, + "loss": 2.1523, + "step": 4673 + }, + { + "epoch": 0.20429214563573583, + "grad_norm": 1.953125, + "learning_rate": 9.008358892054528e-05, + "loss": 1.8026, + "step": 4674 + }, + { + "epoch": 0.2043358538397657, + "grad_norm": 2.203125, + "learning_rate": 9.007948269498685e-05, + "loss": 1.8456, + "step": 4675 + }, + { + "epoch": 0.20437956204379562, + "grad_norm": 2.609375, + "learning_rate": 9.007537571307025e-05, + "loss": 1.8779, + "step": 4676 + }, + { + "epoch": 0.20442327024782553, + "grad_norm": 2.78125, + "learning_rate": 9.007126797487298e-05, + "loss": 2.1508, + "step": 4677 + }, + { + "epoch": 0.2044669784518554, + "grad_norm": 2.46875, + "learning_rate": 9.006715948047257e-05, + "loss": 2.1961, + "step": 4678 + }, + { + "epoch": 0.20451068665588532, + "grad_norm": 2.15625, + "learning_rate": 9.006305022994654e-05, + "loss": 1.837, + "step": 4679 + }, + { + "epoch": 0.2045543948599152, + "grad_norm": 2.984375, + "learning_rate": 9.005894022337245e-05, + "loss": 2.0313, + "step": 4680 + }, + { + "epoch": 0.2045981030639451, + "grad_norm": 6.875, + "learning_rate": 9.005482946082784e-05, + "loss": 1.9768, + "step": 4681 + }, + { + "epoch": 0.204641811267975, + "grad_norm": 2.09375, + "learning_rate": 9.00507179423903e-05, + "loss": 1.8533, + "step": 4682 + }, + { + "epoch": 0.2046855194720049, + "grad_norm": 2.234375, + "learning_rate": 9.004660566813744e-05, + "loss": 1.7265, + "step": 4683 + }, + { + "epoch": 0.2047292276760348, + "grad_norm": 2.328125, + "learning_rate": 9.004249263814683e-05, + "loss": 1.9114, + "step": 4684 + }, + { + "epoch": 0.20477293588006468, + "grad_norm": 2.03125, + "learning_rate": 9.003837885249612e-05, + "loss": 1.8118, + "step": 4685 + }, + { + "epoch": 0.20481664408409458, + "grad_norm": 4.09375, + "learning_rate": 9.003426431126291e-05, + "loss": 1.7946, + "step": 4686 + }, + { + "epoch": 0.2048603522881245, + "grad_norm": 3.328125, + "learning_rate": 9.003014901452488e-05, + "loss": 2.2468, + "step": 4687 + }, + { + "epoch": 0.20490406049215437, + "grad_norm": 2.34375, + "learning_rate": 9.002603296235967e-05, + "loss": 1.8963, + "step": 4688 + }, + { + "epoch": 0.20494776869618428, + "grad_norm": 2.203125, + "learning_rate": 9.002191615484496e-05, + "loss": 2.3479, + "step": 4689 + }, + { + "epoch": 0.20499147690021416, + "grad_norm": 1.8828125, + "learning_rate": 9.001779859205845e-05, + "loss": 1.614, + "step": 4690 + }, + { + "epoch": 0.20503518510424407, + "grad_norm": 1.9296875, + "learning_rate": 9.001368027407784e-05, + "loss": 1.6517, + "step": 4691 + }, + { + "epoch": 0.20507889330827397, + "grad_norm": 2.4375, + "learning_rate": 9.000956120098084e-05, + "loss": 2.1384, + "step": 4692 + }, + { + "epoch": 0.20512260151230385, + "grad_norm": 1.9609375, + "learning_rate": 9.000544137284519e-05, + "loss": 1.7031, + "step": 4693 + }, + { + "epoch": 0.20516630971633376, + "grad_norm": 2.015625, + "learning_rate": 9.000132078974863e-05, + "loss": 1.6414, + "step": 4694 + }, + { + "epoch": 0.20521001792036364, + "grad_norm": 2.265625, + "learning_rate": 8.999719945176894e-05, + "loss": 2.316, + "step": 4695 + }, + { + "epoch": 0.20525372612439355, + "grad_norm": 2.8125, + "learning_rate": 8.999307735898387e-05, + "loss": 1.9569, + "step": 4696 + }, + { + "epoch": 0.20529743432842346, + "grad_norm": 2.296875, + "learning_rate": 8.998895451147125e-05, + "loss": 1.6369, + "step": 4697 + }, + { + "epoch": 0.20534114253245334, + "grad_norm": 2.25, + "learning_rate": 8.998483090930883e-05, + "loss": 1.8258, + "step": 4698 + }, + { + "epoch": 0.20538485073648324, + "grad_norm": 2.375, + "learning_rate": 8.998070655257447e-05, + "loss": 2.0003, + "step": 4699 + }, + { + "epoch": 0.20542855894051312, + "grad_norm": 1.9765625, + "learning_rate": 8.997658144134598e-05, + "loss": 1.6348, + "step": 4700 + }, + { + "epoch": 0.20547226714454303, + "grad_norm": 2.15625, + "learning_rate": 8.997245557570121e-05, + "loss": 1.7787, + "step": 4701 + }, + { + "epoch": 0.20551597534857294, + "grad_norm": 2.15625, + "learning_rate": 8.996832895571803e-05, + "loss": 1.9775, + "step": 4702 + }, + { + "epoch": 0.20555968355260282, + "grad_norm": 2.296875, + "learning_rate": 8.996420158147431e-05, + "loss": 1.8802, + "step": 4703 + }, + { + "epoch": 0.20560339175663273, + "grad_norm": 29.75, + "learning_rate": 8.996007345304795e-05, + "loss": 3.9084, + "step": 4704 + }, + { + "epoch": 0.2056470999606626, + "grad_norm": 18.125, + "learning_rate": 8.995594457051684e-05, + "loss": 1.2003, + "step": 4705 + }, + { + "epoch": 0.2056908081646925, + "grad_norm": 2.75, + "learning_rate": 8.99518149339589e-05, + "loss": 2.1783, + "step": 4706 + }, + { + "epoch": 0.20573451636872242, + "grad_norm": 2.75, + "learning_rate": 8.994768454345206e-05, + "loss": 1.7874, + "step": 4707 + }, + { + "epoch": 0.2057782245727523, + "grad_norm": 2.109375, + "learning_rate": 8.994355339907429e-05, + "loss": 2.057, + "step": 4708 + }, + { + "epoch": 0.2058219327767822, + "grad_norm": 2.734375, + "learning_rate": 8.993942150090352e-05, + "loss": 2.2157, + "step": 4709 + }, + { + "epoch": 0.2058656409808121, + "grad_norm": 2.703125, + "learning_rate": 8.993528884901774e-05, + "loss": 2.5268, + "step": 4710 + }, + { + "epoch": 0.205909349184842, + "grad_norm": 2.796875, + "learning_rate": 8.993115544349493e-05, + "loss": 2.3586, + "step": 4711 + }, + { + "epoch": 0.2059530573888719, + "grad_norm": 2.28125, + "learning_rate": 8.992702128441311e-05, + "loss": 1.8512, + "step": 4712 + }, + { + "epoch": 0.20599676559290178, + "grad_norm": 2.140625, + "learning_rate": 8.992288637185028e-05, + "loss": 1.7177, + "step": 4713 + }, + { + "epoch": 0.2060404737969317, + "grad_norm": 2.21875, + "learning_rate": 8.991875070588447e-05, + "loss": 1.7863, + "step": 4714 + }, + { + "epoch": 0.20608418200096157, + "grad_norm": 2.140625, + "learning_rate": 8.991461428659375e-05, + "loss": 1.9259, + "step": 4715 + }, + { + "epoch": 0.20612789020499148, + "grad_norm": 2.515625, + "learning_rate": 8.991047711405617e-05, + "loss": 1.8682, + "step": 4716 + }, + { + "epoch": 0.20617159840902138, + "grad_norm": 2.75, + "learning_rate": 8.990633918834979e-05, + "loss": 1.8183, + "step": 4717 + }, + { + "epoch": 0.20621530661305126, + "grad_norm": 3.34375, + "learning_rate": 8.990220050955271e-05, + "loss": 2.2547, + "step": 4718 + }, + { + "epoch": 0.20625901481708117, + "grad_norm": 2.734375, + "learning_rate": 8.989806107774304e-05, + "loss": 1.9872, + "step": 4719 + }, + { + "epoch": 0.20630272302111105, + "grad_norm": 2.765625, + "learning_rate": 8.989392089299888e-05, + "loss": 1.8414, + "step": 4720 + }, + { + "epoch": 0.20634643122514096, + "grad_norm": 2.5625, + "learning_rate": 8.988977995539837e-05, + "loss": 2.3367, + "step": 4721 + }, + { + "epoch": 0.20639013942917087, + "grad_norm": 2.78125, + "learning_rate": 8.988563826501965e-05, + "loss": 2.3742, + "step": 4722 + }, + { + "epoch": 0.20643384763320075, + "grad_norm": 2.59375, + "learning_rate": 8.98814958219409e-05, + "loss": 2.4695, + "step": 4723 + }, + { + "epoch": 0.20647755583723065, + "grad_norm": 2.53125, + "learning_rate": 8.987735262624027e-05, + "loss": 2.1642, + "step": 4724 + }, + { + "epoch": 0.20652126404126053, + "grad_norm": 1.84375, + "learning_rate": 8.987320867799594e-05, + "loss": 1.6123, + "step": 4725 + }, + { + "epoch": 0.20656497224529044, + "grad_norm": 2.84375, + "learning_rate": 8.986906397728616e-05, + "loss": 1.9332, + "step": 4726 + }, + { + "epoch": 0.20660868044932035, + "grad_norm": 4.75, + "learning_rate": 8.98649185241891e-05, + "loss": 1.7614, + "step": 4727 + }, + { + "epoch": 0.20665238865335023, + "grad_norm": 3.109375, + "learning_rate": 8.9860772318783e-05, + "loss": 2.1411, + "step": 4728 + }, + { + "epoch": 0.20669609685738013, + "grad_norm": 2.265625, + "learning_rate": 8.985662536114613e-05, + "loss": 1.8783, + "step": 4729 + }, + { + "epoch": 0.20673980506141001, + "grad_norm": 2.609375, + "learning_rate": 8.985247765135672e-05, + "loss": 1.7308, + "step": 4730 + }, + { + "epoch": 0.20678351326543992, + "grad_norm": 2.515625, + "learning_rate": 8.984832918949306e-05, + "loss": 2.3303, + "step": 4731 + }, + { + "epoch": 0.20682722146946983, + "grad_norm": 1.8515625, + "learning_rate": 8.984417997563342e-05, + "loss": 1.6741, + "step": 4732 + }, + { + "epoch": 0.2068709296734997, + "grad_norm": 2.484375, + "learning_rate": 8.984003000985613e-05, + "loss": 1.9943, + "step": 4733 + }, + { + "epoch": 0.20691463787752962, + "grad_norm": 2.40625, + "learning_rate": 8.983587929223948e-05, + "loss": 2.332, + "step": 4734 + }, + { + "epoch": 0.2069583460815595, + "grad_norm": 2.53125, + "learning_rate": 8.98317278228618e-05, + "loss": 2.201, + "step": 4735 + }, + { + "epoch": 0.2070020542855894, + "grad_norm": 2.28125, + "learning_rate": 8.982757560180146e-05, + "loss": 1.8238, + "step": 4736 + }, + { + "epoch": 0.2070457624896193, + "grad_norm": 2.546875, + "learning_rate": 8.982342262913679e-05, + "loss": 1.7964, + "step": 4737 + }, + { + "epoch": 0.2070894706936492, + "grad_norm": 2.21875, + "learning_rate": 8.981926890494619e-05, + "loss": 1.4018, + "step": 4738 + }, + { + "epoch": 0.2071331788976791, + "grad_norm": 2.75, + "learning_rate": 8.981511442930802e-05, + "loss": 1.6008, + "step": 4739 + }, + { + "epoch": 0.20717688710170898, + "grad_norm": 2.375, + "learning_rate": 8.98109592023007e-05, + "loss": 2.0941, + "step": 4740 + }, + { + "epoch": 0.20722059530573889, + "grad_norm": 2.234375, + "learning_rate": 8.980680322400264e-05, + "loss": 1.9683, + "step": 4741 + }, + { + "epoch": 0.2072643035097688, + "grad_norm": 2.0, + "learning_rate": 8.980264649449225e-05, + "loss": 1.7002, + "step": 4742 + }, + { + "epoch": 0.20730801171379867, + "grad_norm": 6.375, + "learning_rate": 8.9798489013848e-05, + "loss": 1.7523, + "step": 4743 + }, + { + "epoch": 0.20735171991782858, + "grad_norm": 2.5625, + "learning_rate": 8.979433078214834e-05, + "loss": 3.0292, + "step": 4744 + }, + { + "epoch": 0.20739542812185846, + "grad_norm": 2.078125, + "learning_rate": 8.979017179947174e-05, + "loss": 2.0299, + "step": 4745 + }, + { + "epoch": 0.20743913632588837, + "grad_norm": 2.890625, + "learning_rate": 8.978601206589669e-05, + "loss": 2.5303, + "step": 4746 + }, + { + "epoch": 0.20748284452991828, + "grad_norm": 2.84375, + "learning_rate": 8.978185158150167e-05, + "loss": 1.5541, + "step": 4747 + }, + { + "epoch": 0.20752655273394816, + "grad_norm": 2.4375, + "learning_rate": 8.977769034636523e-05, + "loss": 1.9323, + "step": 4748 + }, + { + "epoch": 0.20757026093797806, + "grad_norm": 2.265625, + "learning_rate": 8.977352836056587e-05, + "loss": 1.5497, + "step": 4749 + }, + { + "epoch": 0.20761396914200794, + "grad_norm": 2.25, + "learning_rate": 8.976936562418215e-05, + "loss": 2.4203, + "step": 4750 + }, + { + "epoch": 0.20765767734603785, + "grad_norm": 2.078125, + "learning_rate": 8.97652021372926e-05, + "loss": 1.8565, + "step": 4751 + }, + { + "epoch": 0.20770138555006776, + "grad_norm": 2.015625, + "learning_rate": 8.976103789997582e-05, + "loss": 1.4862, + "step": 4752 + }, + { + "epoch": 0.20774509375409764, + "grad_norm": 5.6875, + "learning_rate": 8.975687291231041e-05, + "loss": 2.5187, + "step": 4753 + }, + { + "epoch": 0.20778880195812754, + "grad_norm": 4.03125, + "learning_rate": 8.975270717437492e-05, + "loss": 2.9507, + "step": 4754 + }, + { + "epoch": 0.20783251016215742, + "grad_norm": 3.1875, + "learning_rate": 8.974854068624798e-05, + "loss": 1.4123, + "step": 4755 + }, + { + "epoch": 0.20787621836618733, + "grad_norm": 2.640625, + "learning_rate": 8.974437344800825e-05, + "loss": 2.8158, + "step": 4756 + }, + { + "epoch": 0.20791992657021724, + "grad_norm": 2.4375, + "learning_rate": 8.974020545973433e-05, + "loss": 1.6974, + "step": 4757 + }, + { + "epoch": 0.20796363477424712, + "grad_norm": 2.609375, + "learning_rate": 8.97360367215049e-05, + "loss": 1.9125, + "step": 4758 + }, + { + "epoch": 0.20800734297827703, + "grad_norm": 2.3125, + "learning_rate": 8.973186723339863e-05, + "loss": 2.0247, + "step": 4759 + }, + { + "epoch": 0.2080510511823069, + "grad_norm": 2.796875, + "learning_rate": 8.972769699549419e-05, + "loss": 1.7753, + "step": 4760 + }, + { + "epoch": 0.20809475938633681, + "grad_norm": 2.6875, + "learning_rate": 8.972352600787028e-05, + "loss": 2.3908, + "step": 4761 + }, + { + "epoch": 0.20813846759036672, + "grad_norm": 2.4375, + "learning_rate": 8.971935427060562e-05, + "loss": 2.2623, + "step": 4762 + }, + { + "epoch": 0.2081821757943966, + "grad_norm": 2.703125, + "learning_rate": 8.971518178377895e-05, + "loss": 1.909, + "step": 4763 + }, + { + "epoch": 0.2082258839984265, + "grad_norm": 3.125, + "learning_rate": 8.9711008547469e-05, + "loss": 1.8331, + "step": 4764 + }, + { + "epoch": 0.2082695922024564, + "grad_norm": 3.140625, + "learning_rate": 8.970683456175451e-05, + "loss": 1.4373, + "step": 4765 + }, + { + "epoch": 0.2083133004064863, + "grad_norm": 2.09375, + "learning_rate": 8.970265982671427e-05, + "loss": 2.1941, + "step": 4766 + }, + { + "epoch": 0.2083570086105162, + "grad_norm": 3.3125, + "learning_rate": 8.969848434242705e-05, + "loss": 2.371, + "step": 4767 + }, + { + "epoch": 0.20840071681454608, + "grad_norm": 2.421875, + "learning_rate": 8.969430810897166e-05, + "loss": 2.0571, + "step": 4768 + }, + { + "epoch": 0.208444425018576, + "grad_norm": 2.15625, + "learning_rate": 8.969013112642689e-05, + "loss": 1.9345, + "step": 4769 + }, + { + "epoch": 0.20848813322260587, + "grad_norm": 2.3125, + "learning_rate": 8.968595339487157e-05, + "loss": 1.3695, + "step": 4770 + }, + { + "epoch": 0.20853184142663578, + "grad_norm": 2.21875, + "learning_rate": 8.968177491438457e-05, + "loss": 1.992, + "step": 4771 + }, + { + "epoch": 0.20857554963066569, + "grad_norm": 2.28125, + "learning_rate": 8.967759568504472e-05, + "loss": 2.2224, + "step": 4772 + }, + { + "epoch": 0.20861925783469557, + "grad_norm": 2.359375, + "learning_rate": 8.967341570693088e-05, + "loss": 1.8914, + "step": 4773 + }, + { + "epoch": 0.20866296603872547, + "grad_norm": 1.90625, + "learning_rate": 8.966923498012196e-05, + "loss": 1.6996, + "step": 4774 + }, + { + "epoch": 0.20870667424275535, + "grad_norm": 3.234375, + "learning_rate": 8.966505350469682e-05, + "loss": 2.8157, + "step": 4775 + }, + { + "epoch": 0.20875038244678526, + "grad_norm": 2.4375, + "learning_rate": 8.966087128073441e-05, + "loss": 1.5552, + "step": 4776 + }, + { + "epoch": 0.20879409065081517, + "grad_norm": 1.953125, + "learning_rate": 8.965668830831364e-05, + "loss": 1.7169, + "step": 4777 + }, + { + "epoch": 0.20883779885484505, + "grad_norm": 2.546875, + "learning_rate": 8.965250458751343e-05, + "loss": 2.0599, + "step": 4778 + }, + { + "epoch": 0.20888150705887495, + "grad_norm": 3.125, + "learning_rate": 8.964832011841275e-05, + "loss": 2.4304, + "step": 4779 + }, + { + "epoch": 0.20892521526290483, + "grad_norm": 2.234375, + "learning_rate": 8.964413490109055e-05, + "loss": 1.6481, + "step": 4780 + }, + { + "epoch": 0.20896892346693474, + "grad_norm": 2.203125, + "learning_rate": 8.963994893562585e-05, + "loss": 2.0656, + "step": 4781 + }, + { + "epoch": 0.20901263167096465, + "grad_norm": 2.25, + "learning_rate": 8.96357622220976e-05, + "loss": 1.93, + "step": 4782 + }, + { + "epoch": 0.20905633987499453, + "grad_norm": 2.78125, + "learning_rate": 8.963157476058485e-05, + "loss": 2.4348, + "step": 4783 + }, + { + "epoch": 0.20910004807902444, + "grad_norm": 2.453125, + "learning_rate": 8.962738655116658e-05, + "loss": 2.1908, + "step": 4784 + }, + { + "epoch": 0.20914375628305432, + "grad_norm": 2.0625, + "learning_rate": 8.962319759392188e-05, + "loss": 1.6403, + "step": 4785 + }, + { + "epoch": 0.20918746448708422, + "grad_norm": 20.5, + "learning_rate": 8.961900788892974e-05, + "loss": 1.6297, + "step": 4786 + }, + { + "epoch": 0.20923117269111413, + "grad_norm": 2.59375, + "learning_rate": 8.961481743626928e-05, + "loss": 2.1056, + "step": 4787 + }, + { + "epoch": 0.209274880895144, + "grad_norm": 2.15625, + "learning_rate": 8.961062623601955e-05, + "loss": 1.9578, + "step": 4788 + }, + { + "epoch": 0.20931858909917392, + "grad_norm": 1.9375, + "learning_rate": 8.960643428825966e-05, + "loss": 1.7978, + "step": 4789 + }, + { + "epoch": 0.2093622973032038, + "grad_norm": 2.265625, + "learning_rate": 8.96022415930687e-05, + "loss": 1.7017, + "step": 4790 + }, + { + "epoch": 0.2094060055072337, + "grad_norm": 2.0, + "learning_rate": 8.959804815052582e-05, + "loss": 1.9484, + "step": 4791 + }, + { + "epoch": 0.2094497137112636, + "grad_norm": 2.5625, + "learning_rate": 8.959385396071012e-05, + "loss": 1.9809, + "step": 4792 + }, + { + "epoch": 0.2094934219152935, + "grad_norm": 2.03125, + "learning_rate": 8.958965902370078e-05, + "loss": 1.4298, + "step": 4793 + }, + { + "epoch": 0.2095371301193234, + "grad_norm": 2.375, + "learning_rate": 8.958546333957694e-05, + "loss": 1.766, + "step": 4794 + }, + { + "epoch": 0.20958083832335328, + "grad_norm": 2.265625, + "learning_rate": 8.958126690841781e-05, + "loss": 2.1259, + "step": 4795 + }, + { + "epoch": 0.2096245465273832, + "grad_norm": 2.1875, + "learning_rate": 8.957706973030256e-05, + "loss": 1.6469, + "step": 4796 + }, + { + "epoch": 0.2096682547314131, + "grad_norm": 2.3125, + "learning_rate": 8.95728718053104e-05, + "loss": 1.6772, + "step": 4797 + }, + { + "epoch": 0.20971196293544297, + "grad_norm": 3.015625, + "learning_rate": 8.956867313352056e-05, + "loss": 2.1811, + "step": 4798 + }, + { + "epoch": 0.20975567113947288, + "grad_norm": 2.609375, + "learning_rate": 8.956447371501227e-05, + "loss": 2.0043, + "step": 4799 + }, + { + "epoch": 0.20979937934350276, + "grad_norm": 2.9375, + "learning_rate": 8.956027354986476e-05, + "loss": 2.1774, + "step": 4800 + }, + { + "epoch": 0.20984308754753267, + "grad_norm": 2.15625, + "learning_rate": 8.955607263815732e-05, + "loss": 1.8232, + "step": 4801 + }, + { + "epoch": 0.20988679575156258, + "grad_norm": 2.578125, + "learning_rate": 8.955187097996922e-05, + "loss": 2.3588, + "step": 4802 + }, + { + "epoch": 0.20993050395559246, + "grad_norm": 2.296875, + "learning_rate": 8.954766857537975e-05, + "loss": 1.8239, + "step": 4803 + }, + { + "epoch": 0.20997421215962236, + "grad_norm": 12.1875, + "learning_rate": 8.95434654244682e-05, + "loss": 1.6482, + "step": 4804 + }, + { + "epoch": 0.21001792036365224, + "grad_norm": 2.25, + "learning_rate": 8.953926152731394e-05, + "loss": 2.0697, + "step": 4805 + }, + { + "epoch": 0.21006162856768215, + "grad_norm": 2.3125, + "learning_rate": 8.953505688399624e-05, + "loss": 2.2319, + "step": 4806 + }, + { + "epoch": 0.21010533677171206, + "grad_norm": 2.1875, + "learning_rate": 8.95308514945945e-05, + "loss": 1.8125, + "step": 4807 + }, + { + "epoch": 0.21014904497574194, + "grad_norm": 5.875, + "learning_rate": 8.952664535918803e-05, + "loss": 1.6745, + "step": 4808 + }, + { + "epoch": 0.21019275317977185, + "grad_norm": 2.359375, + "learning_rate": 8.952243847785624e-05, + "loss": 2.3218, + "step": 4809 + }, + { + "epoch": 0.21023646138380173, + "grad_norm": 2.6875, + "learning_rate": 8.951823085067852e-05, + "loss": 2.0469, + "step": 4810 + }, + { + "epoch": 0.21028016958783163, + "grad_norm": 2.484375, + "learning_rate": 8.951402247773428e-05, + "loss": 2.3366, + "step": 4811 + }, + { + "epoch": 0.21032387779186154, + "grad_norm": 2.234375, + "learning_rate": 8.950981335910291e-05, + "loss": 1.7435, + "step": 4812 + }, + { + "epoch": 0.21036758599589142, + "grad_norm": 2.328125, + "learning_rate": 8.950560349486386e-05, + "loss": 1.5361, + "step": 4813 + }, + { + "epoch": 0.21041129419992133, + "grad_norm": 2.875, + "learning_rate": 8.950139288509658e-05, + "loss": 2.4291, + "step": 4814 + }, + { + "epoch": 0.2104550024039512, + "grad_norm": 2.203125, + "learning_rate": 8.949718152988051e-05, + "loss": 2.2256, + "step": 4815 + }, + { + "epoch": 0.21049871060798112, + "grad_norm": 2.25, + "learning_rate": 8.949296942929514e-05, + "loss": 1.9177, + "step": 4816 + }, + { + "epoch": 0.21054241881201102, + "grad_norm": 3.703125, + "learning_rate": 8.948875658341997e-05, + "loss": 2.6135, + "step": 4817 + }, + { + "epoch": 0.2105861270160409, + "grad_norm": 2.46875, + "learning_rate": 8.94845429923345e-05, + "loss": 2.557, + "step": 4818 + }, + { + "epoch": 0.2106298352200708, + "grad_norm": 2.625, + "learning_rate": 8.948032865611822e-05, + "loss": 2.2347, + "step": 4819 + }, + { + "epoch": 0.2106735434241007, + "grad_norm": 2.671875, + "learning_rate": 8.947611357485068e-05, + "loss": 2.5274, + "step": 4820 + }, + { + "epoch": 0.2107172516281306, + "grad_norm": 2.8125, + "learning_rate": 8.947189774861142e-05, + "loss": 1.4707, + "step": 4821 + }, + { + "epoch": 0.2107609598321605, + "grad_norm": 5.3125, + "learning_rate": 8.946768117748001e-05, + "loss": 1.3507, + "step": 4822 + }, + { + "epoch": 0.21080466803619038, + "grad_norm": 2.40625, + "learning_rate": 8.946346386153601e-05, + "loss": 2.4313, + "step": 4823 + }, + { + "epoch": 0.2108483762402203, + "grad_norm": 1.9921875, + "learning_rate": 8.945924580085901e-05, + "loss": 1.7597, + "step": 4824 + }, + { + "epoch": 0.2108920844442502, + "grad_norm": 2.71875, + "learning_rate": 8.945502699552862e-05, + "loss": 1.9471, + "step": 4825 + }, + { + "epoch": 0.21093579264828008, + "grad_norm": 2.390625, + "learning_rate": 8.945080744562442e-05, + "loss": 2.3347, + "step": 4826 + }, + { + "epoch": 0.21097950085231, + "grad_norm": 2.5625, + "learning_rate": 8.944658715122609e-05, + "loss": 2.9457, + "step": 4827 + }, + { + "epoch": 0.21102320905633987, + "grad_norm": 2.234375, + "learning_rate": 8.944236611241323e-05, + "loss": 1.7127, + "step": 4828 + }, + { + "epoch": 0.21106691726036977, + "grad_norm": 2.265625, + "learning_rate": 8.943814432926553e-05, + "loss": 2.4287, + "step": 4829 + }, + { + "epoch": 0.21111062546439968, + "grad_norm": 2.09375, + "learning_rate": 8.943392180186265e-05, + "loss": 2.2223, + "step": 4830 + }, + { + "epoch": 0.21115433366842956, + "grad_norm": 1.9765625, + "learning_rate": 8.942969853028426e-05, + "loss": 1.7299, + "step": 4831 + }, + { + "epoch": 0.21119804187245947, + "grad_norm": 3.71875, + "learning_rate": 8.942547451461008e-05, + "loss": 1.8223, + "step": 4832 + }, + { + "epoch": 0.21124175007648935, + "grad_norm": 2.265625, + "learning_rate": 8.942124975491981e-05, + "loss": 2.1694, + "step": 4833 + }, + { + "epoch": 0.21128545828051926, + "grad_norm": 2.65625, + "learning_rate": 8.94170242512932e-05, + "loss": 1.9498, + "step": 4834 + }, + { + "epoch": 0.21132916648454916, + "grad_norm": 2.734375, + "learning_rate": 8.941279800380995e-05, + "loss": 1.9834, + "step": 4835 + }, + { + "epoch": 0.21137287468857904, + "grad_norm": 2.28125, + "learning_rate": 8.940857101254985e-05, + "loss": 2.5328, + "step": 4836 + }, + { + "epoch": 0.21141658289260895, + "grad_norm": 2.59375, + "learning_rate": 8.940434327759264e-05, + "loss": 2.3951, + "step": 4837 + }, + { + "epoch": 0.21146029109663883, + "grad_norm": 2.09375, + "learning_rate": 8.940011479901816e-05, + "loss": 2.0517, + "step": 4838 + }, + { + "epoch": 0.21150399930066874, + "grad_norm": 2.25, + "learning_rate": 8.939588557690614e-05, + "loss": 2.1932, + "step": 4839 + }, + { + "epoch": 0.21154770750469865, + "grad_norm": 3.671875, + "learning_rate": 8.939165561133642e-05, + "loss": 2.1056, + "step": 4840 + }, + { + "epoch": 0.21159141570872853, + "grad_norm": 2.65625, + "learning_rate": 8.938742490238884e-05, + "loss": 2.1849, + "step": 4841 + }, + { + "epoch": 0.21163512391275843, + "grad_norm": 2.78125, + "learning_rate": 8.938319345014321e-05, + "loss": 2.4694, + "step": 4842 + }, + { + "epoch": 0.2116788321167883, + "grad_norm": 2.109375, + "learning_rate": 8.937896125467942e-05, + "loss": 1.8354, + "step": 4843 + }, + { + "epoch": 0.21172254032081822, + "grad_norm": 3.0625, + "learning_rate": 8.937472831607732e-05, + "loss": 2.216, + "step": 4844 + }, + { + "epoch": 0.21176624852484813, + "grad_norm": 2.171875, + "learning_rate": 8.937049463441678e-05, + "loss": 1.7439, + "step": 4845 + }, + { + "epoch": 0.211809956728878, + "grad_norm": 2.203125, + "learning_rate": 8.93662602097777e-05, + "loss": 2.1265, + "step": 4846 + }, + { + "epoch": 0.21185366493290791, + "grad_norm": 2.140625, + "learning_rate": 8.936202504224e-05, + "loss": 1.944, + "step": 4847 + }, + { + "epoch": 0.2118973731369378, + "grad_norm": 2.15625, + "learning_rate": 8.93577891318836e-05, + "loss": 2.1815, + "step": 4848 + }, + { + "epoch": 0.2119410813409677, + "grad_norm": 2.171875, + "learning_rate": 8.935355247878842e-05, + "loss": 2.2721, + "step": 4849 + }, + { + "epoch": 0.2119847895449976, + "grad_norm": 2.546875, + "learning_rate": 8.934931508303445e-05, + "loss": 2.313, + "step": 4850 + }, + { + "epoch": 0.2120284977490275, + "grad_norm": 2.328125, + "learning_rate": 8.934507694470162e-05, + "loss": 2.0388, + "step": 4851 + }, + { + "epoch": 0.2120722059530574, + "grad_norm": 2.171875, + "learning_rate": 8.934083806386995e-05, + "loss": 1.6507, + "step": 4852 + }, + { + "epoch": 0.21211591415708728, + "grad_norm": 1.96875, + "learning_rate": 8.933659844061938e-05, + "loss": 1.8782, + "step": 4853 + }, + { + "epoch": 0.21215962236111718, + "grad_norm": 2.59375, + "learning_rate": 8.933235807502996e-05, + "loss": 1.8664, + "step": 4854 + }, + { + "epoch": 0.2122033305651471, + "grad_norm": 2.40625, + "learning_rate": 8.932811696718169e-05, + "loss": 1.7721, + "step": 4855 + }, + { + "epoch": 0.21224703876917697, + "grad_norm": 3.09375, + "learning_rate": 8.93238751171546e-05, + "loss": 1.5299, + "step": 4856 + }, + { + "epoch": 0.21229074697320688, + "grad_norm": 1.9609375, + "learning_rate": 8.931963252502878e-05, + "loss": 1.7576, + "step": 4857 + }, + { + "epoch": 0.21233445517723676, + "grad_norm": 2.28125, + "learning_rate": 8.931538919088425e-05, + "loss": 2.125, + "step": 4858 + }, + { + "epoch": 0.21237816338126667, + "grad_norm": 2.203125, + "learning_rate": 8.931114511480112e-05, + "loss": 1.7627, + "step": 4859 + }, + { + "epoch": 0.21242187158529657, + "grad_norm": 2.25, + "learning_rate": 8.930690029685946e-05, + "loss": 1.9135, + "step": 4860 + }, + { + "epoch": 0.21246557978932645, + "grad_norm": 3.453125, + "learning_rate": 8.930265473713938e-05, + "loss": 2.209, + "step": 4861 + }, + { + "epoch": 0.21250928799335636, + "grad_norm": 3.078125, + "learning_rate": 8.9298408435721e-05, + "loss": 1.8138, + "step": 4862 + }, + { + "epoch": 0.21255299619738624, + "grad_norm": 2.59375, + "learning_rate": 8.929416139268446e-05, + "loss": 2.499, + "step": 4863 + }, + { + "epoch": 0.21259670440141615, + "grad_norm": 2.546875, + "learning_rate": 8.92899136081099e-05, + "loss": 2.1922, + "step": 4864 + }, + { + "epoch": 0.21264041260544606, + "grad_norm": 2.75, + "learning_rate": 8.92856650820775e-05, + "loss": 1.584, + "step": 4865 + }, + { + "epoch": 0.21268412080947594, + "grad_norm": 2.375, + "learning_rate": 8.928141581466742e-05, + "loss": 1.9576, + "step": 4866 + }, + { + "epoch": 0.21272782901350584, + "grad_norm": 2.09375, + "learning_rate": 8.927716580595984e-05, + "loss": 1.8619, + "step": 4867 + }, + { + "epoch": 0.21277153721753572, + "grad_norm": 2.390625, + "learning_rate": 8.9272915056035e-05, + "loss": 2.3122, + "step": 4868 + }, + { + "epoch": 0.21281524542156563, + "grad_norm": 2.65625, + "learning_rate": 8.926866356497307e-05, + "loss": 1.3723, + "step": 4869 + }, + { + "epoch": 0.21285895362559554, + "grad_norm": 2.421875, + "learning_rate": 8.92644113328543e-05, + "loss": 1.8232, + "step": 4870 + }, + { + "epoch": 0.21290266182962542, + "grad_norm": 2.171875, + "learning_rate": 8.926015835975896e-05, + "loss": 2.2606, + "step": 4871 + }, + { + "epoch": 0.21294637003365532, + "grad_norm": 2.734375, + "learning_rate": 8.925590464576727e-05, + "loss": 1.2309, + "step": 4872 + }, + { + "epoch": 0.2129900782376852, + "grad_norm": 2.09375, + "learning_rate": 8.925165019095956e-05, + "loss": 1.7613, + "step": 4873 + }, + { + "epoch": 0.2130337864417151, + "grad_norm": 2.15625, + "learning_rate": 8.924739499541606e-05, + "loss": 1.8071, + "step": 4874 + }, + { + "epoch": 0.21307749464574502, + "grad_norm": 5.375, + "learning_rate": 8.924313905921709e-05, + "loss": 2.6352, + "step": 4875 + }, + { + "epoch": 0.2131212028497749, + "grad_norm": 3.25, + "learning_rate": 8.923888238244298e-05, + "loss": 2.1339, + "step": 4876 + }, + { + "epoch": 0.2131649110538048, + "grad_norm": 3.484375, + "learning_rate": 8.923462496517405e-05, + "loss": 2.0866, + "step": 4877 + }, + { + "epoch": 0.2132086192578347, + "grad_norm": 2.1875, + "learning_rate": 8.923036680749064e-05, + "loss": 1.9792, + "step": 4878 + }, + { + "epoch": 0.2132523274618646, + "grad_norm": 3.359375, + "learning_rate": 8.922610790947311e-05, + "loss": 2.6828, + "step": 4879 + }, + { + "epoch": 0.2132960356658945, + "grad_norm": 2.875, + "learning_rate": 8.922184827120183e-05, + "loss": 1.9287, + "step": 4880 + }, + { + "epoch": 0.21333974386992438, + "grad_norm": 2.40625, + "learning_rate": 8.92175878927572e-05, + "loss": 2.0443, + "step": 4881 + }, + { + "epoch": 0.2133834520739543, + "grad_norm": 3.375, + "learning_rate": 8.921332677421961e-05, + "loss": 3.1954, + "step": 4882 + }, + { + "epoch": 0.21342716027798417, + "grad_norm": 2.453125, + "learning_rate": 8.920906491566944e-05, + "loss": 2.149, + "step": 4883 + }, + { + "epoch": 0.21347086848201408, + "grad_norm": 2.28125, + "learning_rate": 8.92048023171872e-05, + "loss": 2.1222, + "step": 4884 + }, + { + "epoch": 0.21351457668604398, + "grad_norm": 4.125, + "learning_rate": 8.920053897885325e-05, + "loss": 1.8552, + "step": 4885 + }, + { + "epoch": 0.21355828489007386, + "grad_norm": 2.34375, + "learning_rate": 8.919627490074807e-05, + "loss": 1.9297, + "step": 4886 + }, + { + "epoch": 0.21360199309410377, + "grad_norm": 2.203125, + "learning_rate": 8.919201008295214e-05, + "loss": 2.3196, + "step": 4887 + }, + { + "epoch": 0.21364570129813365, + "grad_norm": 3.0, + "learning_rate": 8.918774452554595e-05, + "loss": 2.5424, + "step": 4888 + }, + { + "epoch": 0.21368940950216356, + "grad_norm": 2.578125, + "learning_rate": 8.918347822860997e-05, + "loss": 3.2891, + "step": 4889 + }, + { + "epoch": 0.21373311770619347, + "grad_norm": 2.375, + "learning_rate": 8.917921119222474e-05, + "loss": 1.8642, + "step": 4890 + }, + { + "epoch": 0.21377682591022334, + "grad_norm": 2.46875, + "learning_rate": 8.917494341647077e-05, + "loss": 2.4109, + "step": 4891 + }, + { + "epoch": 0.21382053411425325, + "grad_norm": 2.234375, + "learning_rate": 8.917067490142858e-05, + "loss": 1.8142, + "step": 4892 + }, + { + "epoch": 0.21386424231828313, + "grad_norm": 1.8125, + "learning_rate": 8.916640564717878e-05, + "loss": 1.7431, + "step": 4893 + }, + { + "epoch": 0.21390795052231304, + "grad_norm": 2.296875, + "learning_rate": 8.916213565380188e-05, + "loss": 1.7604, + "step": 4894 + }, + { + "epoch": 0.21395165872634295, + "grad_norm": 2.1875, + "learning_rate": 8.915786492137848e-05, + "loss": 2.2038, + "step": 4895 + }, + { + "epoch": 0.21399536693037283, + "grad_norm": 1.9609375, + "learning_rate": 8.915359344998919e-05, + "loss": 1.6342, + "step": 4896 + }, + { + "epoch": 0.21403907513440273, + "grad_norm": 2.03125, + "learning_rate": 8.91493212397146e-05, + "loss": 1.4095, + "step": 4897 + }, + { + "epoch": 0.21408278333843261, + "grad_norm": 2.140625, + "learning_rate": 8.914504829063535e-05, + "loss": 1.7478, + "step": 4898 + }, + { + "epoch": 0.21412649154246252, + "grad_norm": 2.0625, + "learning_rate": 8.914077460283205e-05, + "loss": 1.538, + "step": 4899 + }, + { + "epoch": 0.21417019974649243, + "grad_norm": 2.484375, + "learning_rate": 8.913650017638537e-05, + "loss": 2.1669, + "step": 4900 + }, + { + "epoch": 0.2142139079505223, + "grad_norm": 2.4375, + "learning_rate": 8.913222501137597e-05, + "loss": 1.8726, + "step": 4901 + }, + { + "epoch": 0.21425761615455222, + "grad_norm": 2.296875, + "learning_rate": 8.912794910788453e-05, + "loss": 2.4547, + "step": 4902 + }, + { + "epoch": 0.2143013243585821, + "grad_norm": 2.4375, + "learning_rate": 8.912367246599175e-05, + "loss": 1.9188, + "step": 4903 + }, + { + "epoch": 0.214345032562612, + "grad_norm": 2.203125, + "learning_rate": 8.911939508577833e-05, + "loss": 2.116, + "step": 4904 + }, + { + "epoch": 0.2143887407666419, + "grad_norm": 2.359375, + "learning_rate": 8.911511696732498e-05, + "loss": 2.0171, + "step": 4905 + }, + { + "epoch": 0.2144324489706718, + "grad_norm": 2.078125, + "learning_rate": 8.911083811071244e-05, + "loss": 2.2515, + "step": 4906 + }, + { + "epoch": 0.2144761571747017, + "grad_norm": 2.21875, + "learning_rate": 8.910655851602146e-05, + "loss": 2.4468, + "step": 4907 + }, + { + "epoch": 0.21451986537873158, + "grad_norm": 2.640625, + "learning_rate": 8.910227818333282e-05, + "loss": 2.6137, + "step": 4908 + }, + { + "epoch": 0.21456357358276149, + "grad_norm": 2.453125, + "learning_rate": 8.909799711272727e-05, + "loss": 2.7769, + "step": 4909 + }, + { + "epoch": 0.2146072817867914, + "grad_norm": 2.0625, + "learning_rate": 8.909371530428561e-05, + "loss": 1.7773, + "step": 4910 + }, + { + "epoch": 0.21465098999082127, + "grad_norm": 2.34375, + "learning_rate": 8.908943275808866e-05, + "loss": 1.6918, + "step": 4911 + }, + { + "epoch": 0.21469469819485118, + "grad_norm": 2.25, + "learning_rate": 8.90851494742172e-05, + "loss": 2.4438, + "step": 4912 + }, + { + "epoch": 0.21473840639888106, + "grad_norm": 2.28125, + "learning_rate": 8.908086545275209e-05, + "loss": 2.461, + "step": 4913 + }, + { + "epoch": 0.21478211460291097, + "grad_norm": 3.015625, + "learning_rate": 8.907658069377418e-05, + "loss": 1.8086, + "step": 4914 + }, + { + "epoch": 0.21482582280694087, + "grad_norm": 3.59375, + "learning_rate": 8.90722951973643e-05, + "loss": 1.9816, + "step": 4915 + }, + { + "epoch": 0.21486953101097075, + "grad_norm": 2.4375, + "learning_rate": 8.906800896360336e-05, + "loss": 2.106, + "step": 4916 + }, + { + "epoch": 0.21491323921500066, + "grad_norm": 3.65625, + "learning_rate": 8.906372199257223e-05, + "loss": 1.485, + "step": 4917 + }, + { + "epoch": 0.21495694741903054, + "grad_norm": 2.921875, + "learning_rate": 8.90594342843518e-05, + "loss": 3.2665, + "step": 4918 + }, + { + "epoch": 0.21500065562306045, + "grad_norm": 2.34375, + "learning_rate": 8.9055145839023e-05, + "loss": 1.7803, + "step": 4919 + }, + { + "epoch": 0.21504436382709036, + "grad_norm": 2.4375, + "learning_rate": 8.905085665666674e-05, + "loss": 2.6265, + "step": 4920 + }, + { + "epoch": 0.21508807203112024, + "grad_norm": 2.28125, + "learning_rate": 8.9046566737364e-05, + "loss": 2.2594, + "step": 4921 + }, + { + "epoch": 0.21513178023515014, + "grad_norm": 2.625, + "learning_rate": 8.904227608119571e-05, + "loss": 1.9987, + "step": 4922 + }, + { + "epoch": 0.21517548843918002, + "grad_norm": 2.296875, + "learning_rate": 8.903798468824286e-05, + "loss": 2.3512, + "step": 4923 + }, + { + "epoch": 0.21521919664320993, + "grad_norm": 2.53125, + "learning_rate": 8.90336925585864e-05, + "loss": 2.123, + "step": 4924 + }, + { + "epoch": 0.21526290484723984, + "grad_norm": 2.515625, + "learning_rate": 8.902939969230737e-05, + "loss": 1.8981, + "step": 4925 + }, + { + "epoch": 0.21530661305126972, + "grad_norm": 2.1875, + "learning_rate": 8.902510608948676e-05, + "loss": 2.0691, + "step": 4926 + }, + { + "epoch": 0.21535032125529963, + "grad_norm": 2.4375, + "learning_rate": 8.902081175020559e-05, + "loss": 2.0388, + "step": 4927 + }, + { + "epoch": 0.2153940294593295, + "grad_norm": 2.484375, + "learning_rate": 8.901651667454492e-05, + "loss": 1.8893, + "step": 4928 + }, + { + "epoch": 0.2154377376633594, + "grad_norm": 1.984375, + "learning_rate": 8.901222086258578e-05, + "loss": 1.5117, + "step": 4929 + }, + { + "epoch": 0.21548144586738932, + "grad_norm": 2.375, + "learning_rate": 8.900792431440927e-05, + "loss": 2.1315, + "step": 4930 + }, + { + "epoch": 0.2155251540714192, + "grad_norm": 2.796875, + "learning_rate": 8.900362703009644e-05, + "loss": 1.8519, + "step": 4931 + }, + { + "epoch": 0.2155688622754491, + "grad_norm": 2.125, + "learning_rate": 8.89993290097284e-05, + "loss": 2.1304, + "step": 4932 + }, + { + "epoch": 0.215612570479479, + "grad_norm": 3.265625, + "learning_rate": 8.899503025338627e-05, + "loss": 1.9898, + "step": 4933 + }, + { + "epoch": 0.2156562786835089, + "grad_norm": 2.09375, + "learning_rate": 8.899073076115116e-05, + "loss": 1.602, + "step": 4934 + }, + { + "epoch": 0.2156999868875388, + "grad_norm": 2.125, + "learning_rate": 8.898643053310422e-05, + "loss": 1.7423, + "step": 4935 + }, + { + "epoch": 0.21574369509156868, + "grad_norm": 2.09375, + "learning_rate": 8.898212956932659e-05, + "loss": 1.9523, + "step": 4936 + }, + { + "epoch": 0.2157874032955986, + "grad_norm": 2.15625, + "learning_rate": 8.897782786989944e-05, + "loss": 1.8911, + "step": 4937 + }, + { + "epoch": 0.21583111149962847, + "grad_norm": 2.3125, + "learning_rate": 8.897352543490395e-05, + "loss": 1.5745, + "step": 4938 + }, + { + "epoch": 0.21587481970365838, + "grad_norm": 2.671875, + "learning_rate": 8.89692222644213e-05, + "loss": 2.2996, + "step": 4939 + }, + { + "epoch": 0.21591852790768828, + "grad_norm": 2.609375, + "learning_rate": 8.896491835853272e-05, + "loss": 2.591, + "step": 4940 + }, + { + "epoch": 0.21596223611171816, + "grad_norm": 2.921875, + "learning_rate": 8.896061371731943e-05, + "loss": 2.5338, + "step": 4941 + }, + { + "epoch": 0.21600594431574807, + "grad_norm": 2.15625, + "learning_rate": 8.895630834086264e-05, + "loss": 1.6313, + "step": 4942 + }, + { + "epoch": 0.21604965251977795, + "grad_norm": 2.640625, + "learning_rate": 8.89520022292436e-05, + "loss": 1.9688, + "step": 4943 + }, + { + "epoch": 0.21609336072380786, + "grad_norm": 2.296875, + "learning_rate": 8.894769538254362e-05, + "loss": 2.121, + "step": 4944 + }, + { + "epoch": 0.21613706892783777, + "grad_norm": 2.828125, + "learning_rate": 8.894338780084392e-05, + "loss": 1.9814, + "step": 4945 + }, + { + "epoch": 0.21618077713186765, + "grad_norm": 2.09375, + "learning_rate": 8.893907948422581e-05, + "loss": 1.8047, + "step": 4946 + }, + { + "epoch": 0.21622448533589755, + "grad_norm": 2.0625, + "learning_rate": 8.893477043277061e-05, + "loss": 2.0301, + "step": 4947 + }, + { + "epoch": 0.21626819353992743, + "grad_norm": 2.015625, + "learning_rate": 8.893046064655961e-05, + "loss": 1.6069, + "step": 4948 + }, + { + "epoch": 0.21631190174395734, + "grad_norm": 2.75, + "learning_rate": 8.892615012567416e-05, + "loss": 1.8555, + "step": 4949 + }, + { + "epoch": 0.21635560994798725, + "grad_norm": 3.109375, + "learning_rate": 8.892183887019562e-05, + "loss": 2.1679, + "step": 4950 + }, + { + "epoch": 0.21639931815201713, + "grad_norm": 2.796875, + "learning_rate": 8.891752688020532e-05, + "loss": 2.19, + "step": 4951 + }, + { + "epoch": 0.21644302635604704, + "grad_norm": 2.609375, + "learning_rate": 8.891321415578464e-05, + "loss": 2.3977, + "step": 4952 + }, + { + "epoch": 0.21648673456007692, + "grad_norm": 2.4375, + "learning_rate": 8.890890069701499e-05, + "loss": 1.8516, + "step": 4953 + }, + { + "epoch": 0.21653044276410682, + "grad_norm": 2.328125, + "learning_rate": 8.890458650397774e-05, + "loss": 1.684, + "step": 4954 + }, + { + "epoch": 0.21657415096813673, + "grad_norm": 2.421875, + "learning_rate": 8.890027157675432e-05, + "loss": 2.0473, + "step": 4955 + }, + { + "epoch": 0.2166178591721666, + "grad_norm": 2.265625, + "learning_rate": 8.889595591542617e-05, + "loss": 1.8999, + "step": 4956 + }, + { + "epoch": 0.21666156737619652, + "grad_norm": 2.234375, + "learning_rate": 8.889163952007471e-05, + "loss": 1.7533, + "step": 4957 + }, + { + "epoch": 0.2167052755802264, + "grad_norm": 2.078125, + "learning_rate": 8.888732239078141e-05, + "loss": 1.7618, + "step": 4958 + }, + { + "epoch": 0.2167489837842563, + "grad_norm": 3.234375, + "learning_rate": 8.888300452762774e-05, + "loss": 2.773, + "step": 4959 + }, + { + "epoch": 0.2167926919882862, + "grad_norm": 2.0, + "learning_rate": 8.887868593069519e-05, + "loss": 1.9146, + "step": 4960 + }, + { + "epoch": 0.2168364001923161, + "grad_norm": 3.125, + "learning_rate": 8.887436660006525e-05, + "loss": 1.0823, + "step": 4961 + }, + { + "epoch": 0.216880108396346, + "grad_norm": 2.578125, + "learning_rate": 8.887004653581942e-05, + "loss": 1.4368, + "step": 4962 + }, + { + "epoch": 0.21692381660037588, + "grad_norm": 2.3125, + "learning_rate": 8.886572573803926e-05, + "loss": 2.3, + "step": 4963 + }, + { + "epoch": 0.2169675248044058, + "grad_norm": 2.125, + "learning_rate": 8.886140420680627e-05, + "loss": 1.8728, + "step": 4964 + }, + { + "epoch": 0.2170112330084357, + "grad_norm": 2.8125, + "learning_rate": 8.885708194220204e-05, + "loss": 2.7391, + "step": 4965 + }, + { + "epoch": 0.21705494121246557, + "grad_norm": 2.359375, + "learning_rate": 8.88527589443081e-05, + "loss": 2.1869, + "step": 4966 + }, + { + "epoch": 0.21709864941649548, + "grad_norm": 1.8515625, + "learning_rate": 8.884843521320606e-05, + "loss": 1.664, + "step": 4967 + }, + { + "epoch": 0.21714235762052536, + "grad_norm": 2.1875, + "learning_rate": 8.884411074897751e-05, + "loss": 1.9771, + "step": 4968 + }, + { + "epoch": 0.21718606582455527, + "grad_norm": 3.765625, + "learning_rate": 8.883978555170404e-05, + "loss": 1.2271, + "step": 4969 + }, + { + "epoch": 0.21722977402858518, + "grad_norm": 2.765625, + "learning_rate": 8.883545962146731e-05, + "loss": 2.1806, + "step": 4970 + }, + { + "epoch": 0.21727348223261506, + "grad_norm": 3.4375, + "learning_rate": 8.883113295834892e-05, + "loss": 2.4585, + "step": 4971 + }, + { + "epoch": 0.21731719043664496, + "grad_norm": 2.96875, + "learning_rate": 8.882680556243054e-05, + "loss": 1.8355, + "step": 4972 + }, + { + "epoch": 0.21736089864067484, + "grad_norm": 17.5, + "learning_rate": 8.882247743379383e-05, + "loss": 4.6282, + "step": 4973 + }, + { + "epoch": 0.21740460684470475, + "grad_norm": 1.9609375, + "learning_rate": 8.881814857252046e-05, + "loss": 1.9054, + "step": 4974 + }, + { + "epoch": 0.21744831504873466, + "grad_norm": 2.796875, + "learning_rate": 8.881381897869215e-05, + "loss": 2.76, + "step": 4975 + }, + { + "epoch": 0.21749202325276454, + "grad_norm": 2.21875, + "learning_rate": 8.880948865239057e-05, + "loss": 1.9151, + "step": 4976 + }, + { + "epoch": 0.21753573145679445, + "grad_norm": 2.09375, + "learning_rate": 8.880515759369745e-05, + "loss": 1.7111, + "step": 4977 + }, + { + "epoch": 0.21757943966082433, + "grad_norm": 3.03125, + "learning_rate": 8.880082580269453e-05, + "loss": 2.0375, + "step": 4978 + }, + { + "epoch": 0.21762314786485423, + "grad_norm": 3.546875, + "learning_rate": 8.879649327946356e-05, + "loss": 2.5731, + "step": 4979 + }, + { + "epoch": 0.21766685606888414, + "grad_norm": 2.15625, + "learning_rate": 8.879216002408631e-05, + "loss": 1.8861, + "step": 4980 + }, + { + "epoch": 0.21771056427291402, + "grad_norm": 2.484375, + "learning_rate": 8.878782603664452e-05, + "loss": 1.8373, + "step": 4981 + }, + { + "epoch": 0.21775427247694393, + "grad_norm": 2.328125, + "learning_rate": 8.878349131722e-05, + "loss": 1.8506, + "step": 4982 + }, + { + "epoch": 0.2177979806809738, + "grad_norm": 2.125, + "learning_rate": 8.877915586589456e-05, + "loss": 1.7731, + "step": 4983 + }, + { + "epoch": 0.21784168888500371, + "grad_norm": 2.59375, + "learning_rate": 8.877481968275001e-05, + "loss": 2.25, + "step": 4984 + }, + { + "epoch": 0.21788539708903362, + "grad_norm": 2.21875, + "learning_rate": 8.877048276786817e-05, + "loss": 1.6804, + "step": 4985 + }, + { + "epoch": 0.2179291052930635, + "grad_norm": 2.375, + "learning_rate": 8.87661451213309e-05, + "loss": 2.3068, + "step": 4986 + }, + { + "epoch": 0.2179728134970934, + "grad_norm": 2.78125, + "learning_rate": 8.876180674322005e-05, + "loss": 2.9461, + "step": 4987 + }, + { + "epoch": 0.2180165217011233, + "grad_norm": 2.46875, + "learning_rate": 8.87574676336175e-05, + "loss": 1.837, + "step": 4988 + }, + { + "epoch": 0.2180602299051532, + "grad_norm": 11.1875, + "learning_rate": 8.87531277926051e-05, + "loss": 1.5664, + "step": 4989 + }, + { + "epoch": 0.2181039381091831, + "grad_norm": 2.34375, + "learning_rate": 8.874878722026479e-05, + "loss": 1.7533, + "step": 4990 + }, + { + "epoch": 0.21814764631321298, + "grad_norm": 2.140625, + "learning_rate": 8.874444591667848e-05, + "loss": 1.566, + "step": 4991 + }, + { + "epoch": 0.2181913545172429, + "grad_norm": 1.984375, + "learning_rate": 8.874010388192808e-05, + "loss": 1.5187, + "step": 4992 + }, + { + "epoch": 0.21823506272127277, + "grad_norm": 2.609375, + "learning_rate": 8.873576111609553e-05, + "loss": 2.2533, + "step": 4993 + }, + { + "epoch": 0.21827877092530268, + "grad_norm": 2.3125, + "learning_rate": 8.87314176192628e-05, + "loss": 2.0866, + "step": 4994 + }, + { + "epoch": 0.2183224791293326, + "grad_norm": 2.171875, + "learning_rate": 8.872707339151183e-05, + "loss": 2.0056, + "step": 4995 + }, + { + "epoch": 0.21836618733336247, + "grad_norm": 1.9921875, + "learning_rate": 8.872272843292464e-05, + "loss": 1.6985, + "step": 4996 + }, + { + "epoch": 0.21840989553739237, + "grad_norm": 2.328125, + "learning_rate": 8.871838274358319e-05, + "loss": 1.3936, + "step": 4997 + }, + { + "epoch": 0.21845360374142225, + "grad_norm": 2.5, + "learning_rate": 8.871403632356951e-05, + "loss": 1.8284, + "step": 4998 + }, + { + "epoch": 0.21849731194545216, + "grad_norm": 3.125, + "learning_rate": 8.870968917296562e-05, + "loss": 2.3903, + "step": 4999 + }, + { + "epoch": 0.21854102014948207, + "grad_norm": 2.171875, + "learning_rate": 8.870534129185357e-05, + "loss": 1.67, + "step": 5000 + }, + { + "epoch": 0.21858472835351195, + "grad_norm": 2.0625, + "learning_rate": 8.87009926803154e-05, + "loss": 1.8593, + "step": 5001 + }, + { + "epoch": 0.21862843655754186, + "grad_norm": 3.09375, + "learning_rate": 8.869664333843315e-05, + "loss": 2.3788, + "step": 5002 + }, + { + "epoch": 0.21867214476157174, + "grad_norm": 3.4375, + "learning_rate": 8.869229326628892e-05, + "loss": 1.2086, + "step": 5003 + }, + { + "epoch": 0.21871585296560164, + "grad_norm": 2.5625, + "learning_rate": 8.868794246396481e-05, + "loss": 2.5474, + "step": 5004 + }, + { + "epoch": 0.21875956116963155, + "grad_norm": 2.59375, + "learning_rate": 8.868359093154292e-05, + "loss": 2.0143, + "step": 5005 + }, + { + "epoch": 0.21880326937366143, + "grad_norm": 2.84375, + "learning_rate": 8.867923866910536e-05, + "loss": 2.6357, + "step": 5006 + }, + { + "epoch": 0.21884697757769134, + "grad_norm": 2.65625, + "learning_rate": 8.867488567673429e-05, + "loss": 1.5781, + "step": 5007 + }, + { + "epoch": 0.21889068578172122, + "grad_norm": 2.359375, + "learning_rate": 8.867053195451183e-05, + "loss": 1.9196, + "step": 5008 + }, + { + "epoch": 0.21893439398575112, + "grad_norm": 2.0625, + "learning_rate": 8.866617750252014e-05, + "loss": 1.9003, + "step": 5009 + }, + { + "epoch": 0.21897810218978103, + "grad_norm": 2.28125, + "learning_rate": 8.866182232084141e-05, + "loss": 2.0773, + "step": 5010 + }, + { + "epoch": 0.2190218103938109, + "grad_norm": 2.078125, + "learning_rate": 8.865746640955783e-05, + "loss": 2.4597, + "step": 5011 + }, + { + "epoch": 0.21906551859784082, + "grad_norm": 2.140625, + "learning_rate": 8.865310976875159e-05, + "loss": 1.7359, + "step": 5012 + }, + { + "epoch": 0.2191092268018707, + "grad_norm": 2.03125, + "learning_rate": 8.864875239850489e-05, + "loss": 1.7675, + "step": 5013 + }, + { + "epoch": 0.2191529350059006, + "grad_norm": 2.328125, + "learning_rate": 8.864439429890001e-05, + "loss": 1.7439, + "step": 5014 + }, + { + "epoch": 0.21919664320993051, + "grad_norm": 2.34375, + "learning_rate": 8.864003547001915e-05, + "loss": 1.7313, + "step": 5015 + }, + { + "epoch": 0.2192403514139604, + "grad_norm": 2.046875, + "learning_rate": 8.86356759119446e-05, + "loss": 1.7512, + "step": 5016 + }, + { + "epoch": 0.2192840596179903, + "grad_norm": 2.140625, + "learning_rate": 8.86313156247586e-05, + "loss": 1.7813, + "step": 5017 + }, + { + "epoch": 0.21932776782202018, + "grad_norm": 2.0, + "learning_rate": 8.862695460854347e-05, + "loss": 1.8943, + "step": 5018 + }, + { + "epoch": 0.2193714760260501, + "grad_norm": 2.09375, + "learning_rate": 8.862259286338145e-05, + "loss": 1.486, + "step": 5019 + }, + { + "epoch": 0.21941518423008, + "grad_norm": 2.109375, + "learning_rate": 8.861823038935493e-05, + "loss": 1.7871, + "step": 5020 + }, + { + "epoch": 0.21945889243410988, + "grad_norm": 2.0, + "learning_rate": 8.861386718654618e-05, + "loss": 1.8623, + "step": 5021 + }, + { + "epoch": 0.21950260063813978, + "grad_norm": 2.046875, + "learning_rate": 8.860950325503754e-05, + "loss": 2.0073, + "step": 5022 + }, + { + "epoch": 0.21954630884216966, + "grad_norm": 2.421875, + "learning_rate": 8.86051385949114e-05, + "loss": 1.7694, + "step": 5023 + }, + { + "epoch": 0.21959001704619957, + "grad_norm": 2.859375, + "learning_rate": 8.86007732062501e-05, + "loss": 2.0825, + "step": 5024 + }, + { + "epoch": 0.21963372525022948, + "grad_norm": 2.421875, + "learning_rate": 8.859640708913603e-05, + "loss": 2.2887, + "step": 5025 + }, + { + "epoch": 0.21967743345425936, + "grad_norm": 2.484375, + "learning_rate": 8.859204024365159e-05, + "loss": 1.7923, + "step": 5026 + }, + { + "epoch": 0.21972114165828927, + "grad_norm": 3.109375, + "learning_rate": 8.858767266987917e-05, + "loss": 2.2799, + "step": 5027 + }, + { + "epoch": 0.21976484986231914, + "grad_norm": 2.046875, + "learning_rate": 8.85833043679012e-05, + "loss": 2.2855, + "step": 5028 + }, + { + "epoch": 0.21980855806634905, + "grad_norm": 2.203125, + "learning_rate": 8.857893533780015e-05, + "loss": 2.3333, + "step": 5029 + }, + { + "epoch": 0.21985226627037896, + "grad_norm": 2.21875, + "learning_rate": 8.857456557965842e-05, + "loss": 1.4897, + "step": 5030 + }, + { + "epoch": 0.21989597447440884, + "grad_norm": 3.078125, + "learning_rate": 8.857019509355851e-05, + "loss": 2.6878, + "step": 5031 + }, + { + "epoch": 0.21993968267843875, + "grad_norm": 6.40625, + "learning_rate": 8.856582387958286e-05, + "loss": 1.858, + "step": 5032 + }, + { + "epoch": 0.21998339088246863, + "grad_norm": 2.8125, + "learning_rate": 8.8561451937814e-05, + "loss": 2.4977, + "step": 5033 + }, + { + "epoch": 0.22002709908649853, + "grad_norm": 2.0, + "learning_rate": 8.855707926833441e-05, + "loss": 1.8504, + "step": 5034 + }, + { + "epoch": 0.22007080729052844, + "grad_norm": 2.25, + "learning_rate": 8.855270587122661e-05, + "loss": 2.276, + "step": 5035 + }, + { + "epoch": 0.22011451549455832, + "grad_norm": 2.5, + "learning_rate": 8.854833174657317e-05, + "loss": 1.9943, + "step": 5036 + }, + { + "epoch": 0.22015822369858823, + "grad_norm": 2.46875, + "learning_rate": 8.854395689445658e-05, + "loss": 2.0906, + "step": 5037 + }, + { + "epoch": 0.2202019319026181, + "grad_norm": 2.28125, + "learning_rate": 8.853958131495943e-05, + "loss": 1.8033, + "step": 5038 + }, + { + "epoch": 0.22024564010664802, + "grad_norm": 1.9765625, + "learning_rate": 8.853520500816429e-05, + "loss": 1.8322, + "step": 5039 + }, + { + "epoch": 0.22028934831067792, + "grad_norm": 2.40625, + "learning_rate": 8.853082797415374e-05, + "loss": 1.8526, + "step": 5040 + }, + { + "epoch": 0.2203330565147078, + "grad_norm": 2.4375, + "learning_rate": 8.85264502130104e-05, + "loss": 2.4182, + "step": 5041 + }, + { + "epoch": 0.2203767647187377, + "grad_norm": 2.71875, + "learning_rate": 8.852207172481686e-05, + "loss": 1.7747, + "step": 5042 + }, + { + "epoch": 0.2204204729227676, + "grad_norm": 2.28125, + "learning_rate": 8.851769250965577e-05, + "loss": 2.3158, + "step": 5043 + }, + { + "epoch": 0.2204641811267975, + "grad_norm": 2.46875, + "learning_rate": 8.851331256760975e-05, + "loss": 1.5608, + "step": 5044 + }, + { + "epoch": 0.2205078893308274, + "grad_norm": 2.390625, + "learning_rate": 8.850893189876149e-05, + "loss": 2.1841, + "step": 5045 + }, + { + "epoch": 0.22055159753485729, + "grad_norm": 1.9609375, + "learning_rate": 8.850455050319361e-05, + "loss": 1.779, + "step": 5046 + }, + { + "epoch": 0.2205953057388872, + "grad_norm": 2.171875, + "learning_rate": 8.850016838098885e-05, + "loss": 1.5091, + "step": 5047 + }, + { + "epoch": 0.22063901394291707, + "grad_norm": 1.953125, + "learning_rate": 8.849578553222985e-05, + "loss": 2.0306, + "step": 5048 + }, + { + "epoch": 0.22068272214694698, + "grad_norm": 1.859375, + "learning_rate": 8.849140195699936e-05, + "loss": 1.5791, + "step": 5049 + }, + { + "epoch": 0.2207264303509769, + "grad_norm": 2.171875, + "learning_rate": 8.84870176553801e-05, + "loss": 1.8366, + "step": 5050 + }, + { + "epoch": 0.22077013855500677, + "grad_norm": 2.40625, + "learning_rate": 8.84826326274548e-05, + "loss": 2.0644, + "step": 5051 + }, + { + "epoch": 0.22081384675903667, + "grad_norm": 2.390625, + "learning_rate": 8.847824687330621e-05, + "loss": 1.9944, + "step": 5052 + }, + { + "epoch": 0.22085755496306655, + "grad_norm": 2.75, + "learning_rate": 8.84738603930171e-05, + "loss": 2.3579, + "step": 5053 + }, + { + "epoch": 0.22090126316709646, + "grad_norm": 1.9609375, + "learning_rate": 8.846947318667025e-05, + "loss": 1.8134, + "step": 5054 + }, + { + "epoch": 0.22094497137112637, + "grad_norm": 2.21875, + "learning_rate": 8.846508525434845e-05, + "loss": 2.1187, + "step": 5055 + }, + { + "epoch": 0.22098867957515625, + "grad_norm": 2.484375, + "learning_rate": 8.846069659613451e-05, + "loss": 2.0825, + "step": 5056 + }, + { + "epoch": 0.22103238777918616, + "grad_norm": 2.609375, + "learning_rate": 8.845630721211124e-05, + "loss": 2.2968, + "step": 5057 + }, + { + "epoch": 0.22107609598321604, + "grad_norm": 2.078125, + "learning_rate": 8.84519171023615e-05, + "loss": 1.684, + "step": 5058 + }, + { + "epoch": 0.22111980418724594, + "grad_norm": 2.875, + "learning_rate": 8.84475262669681e-05, + "loss": 1.9728, + "step": 5059 + }, + { + "epoch": 0.22116351239127585, + "grad_norm": 2.265625, + "learning_rate": 8.844313470601393e-05, + "loss": 2.5474, + "step": 5060 + }, + { + "epoch": 0.22120722059530573, + "grad_norm": 4.28125, + "learning_rate": 8.843874241958186e-05, + "loss": 1.8955, + "step": 5061 + }, + { + "epoch": 0.22125092879933564, + "grad_norm": 5.65625, + "learning_rate": 8.843434940775476e-05, + "loss": 1.4155, + "step": 5062 + }, + { + "epoch": 0.22129463700336552, + "grad_norm": 2.765625, + "learning_rate": 8.842995567061558e-05, + "loss": 2.2181, + "step": 5063 + }, + { + "epoch": 0.22133834520739543, + "grad_norm": 2.375, + "learning_rate": 8.842556120824719e-05, + "loss": 1.9322, + "step": 5064 + }, + { + "epoch": 0.22138205341142533, + "grad_norm": 2.078125, + "learning_rate": 8.842116602073252e-05, + "loss": 1.7316, + "step": 5065 + }, + { + "epoch": 0.2214257616154552, + "grad_norm": 2.328125, + "learning_rate": 8.841677010815455e-05, + "loss": 2.1623, + "step": 5066 + }, + { + "epoch": 0.22146946981948512, + "grad_norm": 2.6875, + "learning_rate": 8.841237347059618e-05, + "loss": 1.8282, + "step": 5067 + }, + { + "epoch": 0.221513178023515, + "grad_norm": 2.546875, + "learning_rate": 8.840797610814045e-05, + "loss": 2.1581, + "step": 5068 + }, + { + "epoch": 0.2215568862275449, + "grad_norm": 2.765625, + "learning_rate": 8.84035780208703e-05, + "loss": 2.4111, + "step": 5069 + }, + { + "epoch": 0.22160059443157482, + "grad_norm": 2.078125, + "learning_rate": 8.839917920886874e-05, + "loss": 2.1053, + "step": 5070 + }, + { + "epoch": 0.2216443026356047, + "grad_norm": 2.640625, + "learning_rate": 8.839477967221879e-05, + "loss": 1.8067, + "step": 5071 + }, + { + "epoch": 0.2216880108396346, + "grad_norm": 2.125, + "learning_rate": 8.839037941100344e-05, + "loss": 2.0825, + "step": 5072 + }, + { + "epoch": 0.22173171904366448, + "grad_norm": 2.59375, + "learning_rate": 8.838597842530578e-05, + "loss": 2.5282, + "step": 5073 + }, + { + "epoch": 0.2217754272476944, + "grad_norm": 2.28125, + "learning_rate": 8.838157671520884e-05, + "loss": 1.9524, + "step": 5074 + }, + { + "epoch": 0.2218191354517243, + "grad_norm": 2.125, + "learning_rate": 8.837717428079566e-05, + "loss": 2.0709, + "step": 5075 + }, + { + "epoch": 0.22186284365575418, + "grad_norm": 2.65625, + "learning_rate": 8.837277112214937e-05, + "loss": 1.9269, + "step": 5076 + }, + { + "epoch": 0.22190655185978408, + "grad_norm": 2.84375, + "learning_rate": 8.836836723935303e-05, + "loss": 1.8649, + "step": 5077 + }, + { + "epoch": 0.22195026006381396, + "grad_norm": 3.453125, + "learning_rate": 8.836396263248976e-05, + "loss": 3.6734, + "step": 5078 + }, + { + "epoch": 0.22199396826784387, + "grad_norm": 2.046875, + "learning_rate": 8.835955730164269e-05, + "loss": 2.067, + "step": 5079 + }, + { + "epoch": 0.22203767647187378, + "grad_norm": 2.140625, + "learning_rate": 8.835515124689494e-05, + "loss": 2.0929, + "step": 5080 + }, + { + "epoch": 0.22208138467590366, + "grad_norm": 2.0625, + "learning_rate": 8.835074446832965e-05, + "loss": 1.9263, + "step": 5081 + }, + { + "epoch": 0.22212509287993357, + "grad_norm": 1.9609375, + "learning_rate": 8.834633696603e-05, + "loss": 1.5586, + "step": 5082 + }, + { + "epoch": 0.22216880108396345, + "grad_norm": 2.03125, + "learning_rate": 8.834192874007916e-05, + "loss": 2.14, + "step": 5083 + }, + { + "epoch": 0.22221250928799335, + "grad_norm": 2.171875, + "learning_rate": 8.833751979056032e-05, + "loss": 2.221, + "step": 5084 + }, + { + "epoch": 0.22225621749202326, + "grad_norm": 2.15625, + "learning_rate": 8.833311011755668e-05, + "loss": 1.5307, + "step": 5085 + }, + { + "epoch": 0.22229992569605314, + "grad_norm": 2.140625, + "learning_rate": 8.832869972115148e-05, + "loss": 1.4644, + "step": 5086 + }, + { + "epoch": 0.22234363390008305, + "grad_norm": 1.9921875, + "learning_rate": 8.832428860142792e-05, + "loss": 1.6832, + "step": 5087 + }, + { + "epoch": 0.22238734210411293, + "grad_norm": 2.3125, + "learning_rate": 8.831987675846924e-05, + "loss": 2.2758, + "step": 5088 + }, + { + "epoch": 0.22243105030814284, + "grad_norm": 2.390625, + "learning_rate": 8.831546419235873e-05, + "loss": 1.977, + "step": 5089 + }, + { + "epoch": 0.22247475851217274, + "grad_norm": 2.609375, + "learning_rate": 8.831105090317965e-05, + "loss": 1.7981, + "step": 5090 + }, + { + "epoch": 0.22251846671620262, + "grad_norm": 2.0625, + "learning_rate": 8.830663689101529e-05, + "loss": 1.8568, + "step": 5091 + }, + { + "epoch": 0.22256217492023253, + "grad_norm": 2.234375, + "learning_rate": 8.83022221559489e-05, + "loss": 2.2899, + "step": 5092 + }, + { + "epoch": 0.2226058831242624, + "grad_norm": 2.109375, + "learning_rate": 8.829780669806387e-05, + "loss": 1.7343, + "step": 5093 + }, + { + "epoch": 0.22264959132829232, + "grad_norm": 2.46875, + "learning_rate": 8.829339051744346e-05, + "loss": 2.4551, + "step": 5094 + }, + { + "epoch": 0.22269329953232223, + "grad_norm": 2.15625, + "learning_rate": 8.828897361417106e-05, + "loss": 1.7362, + "step": 5095 + }, + { + "epoch": 0.2227370077363521, + "grad_norm": 2.53125, + "learning_rate": 8.828455598832998e-05, + "loss": 2.0821, + "step": 5096 + }, + { + "epoch": 0.222780715940382, + "grad_norm": 2.0, + "learning_rate": 8.828013764000362e-05, + "loss": 1.9559, + "step": 5097 + }, + { + "epoch": 0.2228244241444119, + "grad_norm": 2.25, + "learning_rate": 8.827571856927535e-05, + "loss": 1.6318, + "step": 5098 + }, + { + "epoch": 0.2228681323484418, + "grad_norm": 2.0625, + "learning_rate": 8.827129877622857e-05, + "loss": 1.8371, + "step": 5099 + }, + { + "epoch": 0.2229118405524717, + "grad_norm": 1.8671875, + "learning_rate": 8.826687826094666e-05, + "loss": 1.8739, + "step": 5100 + }, + { + "epoch": 0.2229555487565016, + "grad_norm": 2.21875, + "learning_rate": 8.826245702351309e-05, + "loss": 2.28, + "step": 5101 + }, + { + "epoch": 0.2229992569605315, + "grad_norm": 2.40625, + "learning_rate": 8.825803506401125e-05, + "loss": 1.8682, + "step": 5102 + }, + { + "epoch": 0.2230429651645614, + "grad_norm": 3.5625, + "learning_rate": 8.82536123825246e-05, + "loss": 1.5349, + "step": 5103 + }, + { + "epoch": 0.22308667336859128, + "grad_norm": 1.9765625, + "learning_rate": 8.824918897913661e-05, + "loss": 1.752, + "step": 5104 + }, + { + "epoch": 0.2231303815726212, + "grad_norm": 1.9921875, + "learning_rate": 8.824476485393076e-05, + "loss": 1.6159, + "step": 5105 + }, + { + "epoch": 0.22317408977665107, + "grad_norm": 2.21875, + "learning_rate": 8.824034000699055e-05, + "loss": 1.7539, + "step": 5106 + }, + { + "epoch": 0.22321779798068098, + "grad_norm": 2.0, + "learning_rate": 8.823591443839944e-05, + "loss": 1.666, + "step": 5107 + }, + { + "epoch": 0.22326150618471088, + "grad_norm": 2.609375, + "learning_rate": 8.8231488148241e-05, + "loss": 2.2305, + "step": 5108 + }, + { + "epoch": 0.22330521438874076, + "grad_norm": 2.328125, + "learning_rate": 8.822706113659872e-05, + "loss": 2.1343, + "step": 5109 + }, + { + "epoch": 0.22334892259277067, + "grad_norm": 2.234375, + "learning_rate": 8.822263340355616e-05, + "loss": 1.7487, + "step": 5110 + }, + { + "epoch": 0.22339263079680055, + "grad_norm": 2.796875, + "learning_rate": 8.82182049491969e-05, + "loss": 2.2869, + "step": 5111 + }, + { + "epoch": 0.22343633900083046, + "grad_norm": 2.0, + "learning_rate": 8.821377577360446e-05, + "loss": 2.0224, + "step": 5112 + }, + { + "epoch": 0.22348004720486037, + "grad_norm": 2.234375, + "learning_rate": 8.820934587686247e-05, + "loss": 2.3331, + "step": 5113 + }, + { + "epoch": 0.22352375540889025, + "grad_norm": 2.109375, + "learning_rate": 8.82049152590545e-05, + "loss": 1.472, + "step": 5114 + }, + { + "epoch": 0.22356746361292015, + "grad_norm": 1.9921875, + "learning_rate": 8.820048392026417e-05, + "loss": 1.8046, + "step": 5115 + }, + { + "epoch": 0.22361117181695003, + "grad_norm": 2.203125, + "learning_rate": 8.819605186057514e-05, + "loss": 1.7677, + "step": 5116 + }, + { + "epoch": 0.22365488002097994, + "grad_norm": 3.03125, + "learning_rate": 8.819161908007099e-05, + "loss": 1.8259, + "step": 5117 + }, + { + "epoch": 0.22369858822500985, + "grad_norm": 2.3125, + "learning_rate": 8.818718557883541e-05, + "loss": 1.3694, + "step": 5118 + }, + { + "epoch": 0.22374229642903973, + "grad_norm": 2.0, + "learning_rate": 8.818275135695207e-05, + "loss": 1.6956, + "step": 5119 + }, + { + "epoch": 0.22378600463306964, + "grad_norm": 2.578125, + "learning_rate": 8.817831641450462e-05, + "loss": 1.988, + "step": 5120 + }, + { + "epoch": 0.22382971283709951, + "grad_norm": 2.3125, + "learning_rate": 8.81738807515768e-05, + "loss": 1.4667, + "step": 5121 + }, + { + "epoch": 0.22387342104112942, + "grad_norm": 2.296875, + "learning_rate": 8.816944436825228e-05, + "loss": 1.899, + "step": 5122 + }, + { + "epoch": 0.22391712924515933, + "grad_norm": 2.296875, + "learning_rate": 8.816500726461478e-05, + "loss": 2.2279, + "step": 5123 + }, + { + "epoch": 0.2239608374491892, + "grad_norm": 2.265625, + "learning_rate": 8.816056944074805e-05, + "loss": 2.3851, + "step": 5124 + }, + { + "epoch": 0.22400454565321912, + "grad_norm": 2.375, + "learning_rate": 8.815613089673583e-05, + "loss": 1.4785, + "step": 5125 + }, + { + "epoch": 0.224048253857249, + "grad_norm": 2.03125, + "learning_rate": 8.81516916326619e-05, + "loss": 2.0775, + "step": 5126 + }, + { + "epoch": 0.2240919620612789, + "grad_norm": 2.640625, + "learning_rate": 8.814725164861001e-05, + "loss": 2.5111, + "step": 5127 + }, + { + "epoch": 0.2241356702653088, + "grad_norm": 2.125, + "learning_rate": 8.814281094466398e-05, + "loss": 1.7167, + "step": 5128 + }, + { + "epoch": 0.2241793784693387, + "grad_norm": 2.375, + "learning_rate": 8.813836952090758e-05, + "loss": 2.0692, + "step": 5129 + }, + { + "epoch": 0.2242230866733686, + "grad_norm": 2.109375, + "learning_rate": 8.813392737742463e-05, + "loss": 1.7218, + "step": 5130 + }, + { + "epoch": 0.22426679487739848, + "grad_norm": 2.40625, + "learning_rate": 8.812948451429898e-05, + "loss": 2.4234, + "step": 5131 + }, + { + "epoch": 0.2243105030814284, + "grad_norm": 2.59375, + "learning_rate": 8.812504093161446e-05, + "loss": 1.8171, + "step": 5132 + }, + { + "epoch": 0.2243542112854583, + "grad_norm": 2.9375, + "learning_rate": 8.812059662945494e-05, + "loss": 1.8073, + "step": 5133 + }, + { + "epoch": 0.22439791948948817, + "grad_norm": 2.046875, + "learning_rate": 8.811615160790427e-05, + "loss": 1.7733, + "step": 5134 + }, + { + "epoch": 0.22444162769351808, + "grad_norm": 2.234375, + "learning_rate": 8.811170586704633e-05, + "loss": 2.5059, + "step": 5135 + }, + { + "epoch": 0.22448533589754796, + "grad_norm": 2.421875, + "learning_rate": 8.810725940696505e-05, + "loss": 1.9448, + "step": 5136 + }, + { + "epoch": 0.22452904410157787, + "grad_norm": 3.625, + "learning_rate": 8.810281222774432e-05, + "loss": 2.2897, + "step": 5137 + }, + { + "epoch": 0.22457275230560778, + "grad_norm": 3.921875, + "learning_rate": 8.809836432946808e-05, + "loss": 1.8743, + "step": 5138 + }, + { + "epoch": 0.22461646050963766, + "grad_norm": 2.390625, + "learning_rate": 8.809391571222023e-05, + "loss": 1.9626, + "step": 5139 + }, + { + "epoch": 0.22466016871366756, + "grad_norm": 2.859375, + "learning_rate": 8.808946637608477e-05, + "loss": 1.7792, + "step": 5140 + }, + { + "epoch": 0.22470387691769744, + "grad_norm": 2.359375, + "learning_rate": 8.808501632114563e-05, + "loss": 1.9027, + "step": 5141 + }, + { + "epoch": 0.22474758512172735, + "grad_norm": 2.859375, + "learning_rate": 8.808056554748681e-05, + "loss": 2.5154, + "step": 5142 + }, + { + "epoch": 0.22479129332575726, + "grad_norm": 2.09375, + "learning_rate": 8.807611405519227e-05, + "loss": 1.9519, + "step": 5143 + }, + { + "epoch": 0.22483500152978714, + "grad_norm": 2.640625, + "learning_rate": 8.807166184434607e-05, + "loss": 2.3366, + "step": 5144 + }, + { + "epoch": 0.22487870973381704, + "grad_norm": 2.296875, + "learning_rate": 8.806720891503219e-05, + "loss": 1.8174, + "step": 5145 + }, + { + "epoch": 0.22492241793784692, + "grad_norm": 2.640625, + "learning_rate": 8.806275526733466e-05, + "loss": 1.7592, + "step": 5146 + }, + { + "epoch": 0.22496612614187683, + "grad_norm": 2.46875, + "learning_rate": 8.805830090133755e-05, + "loss": 2.2325, + "step": 5147 + }, + { + "epoch": 0.22500983434590674, + "grad_norm": 1.984375, + "learning_rate": 8.805384581712492e-05, + "loss": 1.9405, + "step": 5148 + }, + { + "epoch": 0.22505354254993662, + "grad_norm": 2.125, + "learning_rate": 8.804939001478083e-05, + "loss": 1.8371, + "step": 5149 + }, + { + "epoch": 0.22509725075396653, + "grad_norm": 1.8671875, + "learning_rate": 8.804493349438937e-05, + "loss": 1.6809, + "step": 5150 + }, + { + "epoch": 0.2251409589579964, + "grad_norm": 2.328125, + "learning_rate": 8.804047625603464e-05, + "loss": 2.0279, + "step": 5151 + }, + { + "epoch": 0.22518466716202631, + "grad_norm": 2.265625, + "learning_rate": 8.803601829980076e-05, + "loss": 2.1587, + "step": 5152 + }, + { + "epoch": 0.22522837536605622, + "grad_norm": 2.125, + "learning_rate": 8.803155962577186e-05, + "loss": 1.6492, + "step": 5153 + }, + { + "epoch": 0.2252720835700861, + "grad_norm": 2.140625, + "learning_rate": 8.802710023403208e-05, + "loss": 2.2723, + "step": 5154 + }, + { + "epoch": 0.225315791774116, + "grad_norm": 2.1875, + "learning_rate": 8.802264012466557e-05, + "loss": 2.0675, + "step": 5155 + }, + { + "epoch": 0.2253594999781459, + "grad_norm": 1.9453125, + "learning_rate": 8.801817929775649e-05, + "loss": 1.8635, + "step": 5156 + }, + { + "epoch": 0.2254032081821758, + "grad_norm": 2.09375, + "learning_rate": 8.801371775338904e-05, + "loss": 2.1729, + "step": 5157 + }, + { + "epoch": 0.2254469163862057, + "grad_norm": 2.25, + "learning_rate": 8.800925549164741e-05, + "loss": 1.7483, + "step": 5158 + }, + { + "epoch": 0.22549062459023558, + "grad_norm": 2.328125, + "learning_rate": 8.800479251261581e-05, + "loss": 1.9852, + "step": 5159 + }, + { + "epoch": 0.2255343327942655, + "grad_norm": 1.90625, + "learning_rate": 8.800032881637847e-05, + "loss": 1.364, + "step": 5160 + }, + { + "epoch": 0.22557804099829537, + "grad_norm": 2.265625, + "learning_rate": 8.799586440301961e-05, + "loss": 2.4219, + "step": 5161 + }, + { + "epoch": 0.22562174920232528, + "grad_norm": 2.078125, + "learning_rate": 8.79913992726235e-05, + "loss": 2.3594, + "step": 5162 + }, + { + "epoch": 0.22566545740635519, + "grad_norm": 2.484375, + "learning_rate": 8.798693342527438e-05, + "loss": 1.6779, + "step": 5163 + }, + { + "epoch": 0.22570916561038507, + "grad_norm": 2.828125, + "learning_rate": 8.798246686105654e-05, + "loss": 3.4992, + "step": 5164 + }, + { + "epoch": 0.22575287381441497, + "grad_norm": 2.5625, + "learning_rate": 8.797799958005426e-05, + "loss": 1.3351, + "step": 5165 + }, + { + "epoch": 0.22579658201844485, + "grad_norm": 2.0625, + "learning_rate": 8.797353158235186e-05, + "loss": 1.9769, + "step": 5166 + }, + { + "epoch": 0.22584029022247476, + "grad_norm": 2.578125, + "learning_rate": 8.796906286803365e-05, + "loss": 1.8765, + "step": 5167 + }, + { + "epoch": 0.22588399842650467, + "grad_norm": 2.21875, + "learning_rate": 8.796459343718397e-05, + "loss": 2.3319, + "step": 5168 + }, + { + "epoch": 0.22592770663053455, + "grad_norm": 2.78125, + "learning_rate": 8.796012328988716e-05, + "loss": 1.7823, + "step": 5169 + }, + { + "epoch": 0.22597141483456445, + "grad_norm": 2.4375, + "learning_rate": 8.795565242622758e-05, + "loss": 1.6866, + "step": 5170 + }, + { + "epoch": 0.22601512303859433, + "grad_norm": 2.65625, + "learning_rate": 8.795118084628959e-05, + "loss": 1.9945, + "step": 5171 + }, + { + "epoch": 0.22605883124262424, + "grad_norm": 2.5625, + "learning_rate": 8.794670855015757e-05, + "loss": 1.8617, + "step": 5172 + }, + { + "epoch": 0.22610253944665415, + "grad_norm": 2.734375, + "learning_rate": 8.794223553791595e-05, + "loss": 2.4788, + "step": 5173 + }, + { + "epoch": 0.22614624765068403, + "grad_norm": 2.390625, + "learning_rate": 8.79377618096491e-05, + "loss": 2.0072, + "step": 5174 + }, + { + "epoch": 0.22618995585471394, + "grad_norm": 2.265625, + "learning_rate": 8.793328736544149e-05, + "loss": 2.0756, + "step": 5175 + }, + { + "epoch": 0.22623366405874382, + "grad_norm": 2.09375, + "learning_rate": 8.792881220537751e-05, + "loss": 2.3657, + "step": 5176 + }, + { + "epoch": 0.22627737226277372, + "grad_norm": 2.140625, + "learning_rate": 8.792433632954167e-05, + "loss": 2.5119, + "step": 5177 + }, + { + "epoch": 0.22632108046680363, + "grad_norm": 2.46875, + "learning_rate": 8.791985973801839e-05, + "loss": 2.3355, + "step": 5178 + }, + { + "epoch": 0.2263647886708335, + "grad_norm": 4.0, + "learning_rate": 8.791538243089219e-05, + "loss": 2.1918, + "step": 5179 + }, + { + "epoch": 0.22640849687486342, + "grad_norm": 2.03125, + "learning_rate": 8.791090440824752e-05, + "loss": 1.9486, + "step": 5180 + }, + { + "epoch": 0.2264522050788933, + "grad_norm": 2.0, + "learning_rate": 8.790642567016891e-05, + "loss": 2.2691, + "step": 5181 + }, + { + "epoch": 0.2264959132829232, + "grad_norm": 2.203125, + "learning_rate": 8.790194621674088e-05, + "loss": 2.2397, + "step": 5182 + }, + { + "epoch": 0.2265396214869531, + "grad_norm": 2.46875, + "learning_rate": 8.789746604804796e-05, + "loss": 1.9892, + "step": 5183 + }, + { + "epoch": 0.226583329690983, + "grad_norm": 2.1875, + "learning_rate": 8.78929851641747e-05, + "loss": 1.6987, + "step": 5184 + }, + { + "epoch": 0.2266270378950129, + "grad_norm": 2.0625, + "learning_rate": 8.788850356520566e-05, + "loss": 1.6268, + "step": 5185 + }, + { + "epoch": 0.22667074609904278, + "grad_norm": 2.515625, + "learning_rate": 8.788402125122542e-05, + "loss": 2.4155, + "step": 5186 + }, + { + "epoch": 0.2267144543030727, + "grad_norm": 3.078125, + "learning_rate": 8.787953822231855e-05, + "loss": 2.1763, + "step": 5187 + }, + { + "epoch": 0.2267581625071026, + "grad_norm": 2.265625, + "learning_rate": 8.787505447856967e-05, + "loss": 1.7471, + "step": 5188 + }, + { + "epoch": 0.22680187071113247, + "grad_norm": 3.171875, + "learning_rate": 8.787057002006337e-05, + "loss": 2.0579, + "step": 5189 + }, + { + "epoch": 0.22684557891516238, + "grad_norm": 2.4375, + "learning_rate": 8.786608484688432e-05, + "loss": 1.7905, + "step": 5190 + }, + { + "epoch": 0.22688928711919226, + "grad_norm": 13.9375, + "learning_rate": 8.786159895911712e-05, + "loss": 2.8267, + "step": 5191 + }, + { + "epoch": 0.22693299532322217, + "grad_norm": 2.421875, + "learning_rate": 8.785711235684647e-05, + "loss": 2.1309, + "step": 5192 + }, + { + "epoch": 0.22697670352725208, + "grad_norm": 1.8671875, + "learning_rate": 8.785262504015696e-05, + "loss": 1.6625, + "step": 5193 + }, + { + "epoch": 0.22702041173128196, + "grad_norm": 1.8984375, + "learning_rate": 8.784813700913337e-05, + "loss": 1.8822, + "step": 5194 + }, + { + "epoch": 0.22706411993531186, + "grad_norm": 2.28125, + "learning_rate": 8.784364826386034e-05, + "loss": 2.059, + "step": 5195 + }, + { + "epoch": 0.22710782813934174, + "grad_norm": 2.203125, + "learning_rate": 8.783915880442257e-05, + "loss": 2.0635, + "step": 5196 + }, + { + "epoch": 0.22715153634337165, + "grad_norm": 2.015625, + "learning_rate": 8.783466863090482e-05, + "loss": 2.2928, + "step": 5197 + }, + { + "epoch": 0.22719524454740156, + "grad_norm": 2.40625, + "learning_rate": 8.78301777433918e-05, + "loss": 2.1231, + "step": 5198 + }, + { + "epoch": 0.22723895275143144, + "grad_norm": 3.9375, + "learning_rate": 8.782568614196827e-05, + "loss": 3.9269, + "step": 5199 + }, + { + "epoch": 0.22728266095546135, + "grad_norm": 2.828125, + "learning_rate": 8.782119382671898e-05, + "loss": 1.8587, + "step": 5200 + }, + { + "epoch": 0.22732636915949123, + "grad_norm": 2.03125, + "learning_rate": 8.781670079772873e-05, + "loss": 1.5783, + "step": 5201 + }, + { + "epoch": 0.22737007736352113, + "grad_norm": 2.09375, + "learning_rate": 8.781220705508229e-05, + "loss": 1.6649, + "step": 5202 + }, + { + "epoch": 0.22741378556755104, + "grad_norm": 2.078125, + "learning_rate": 8.780771259886447e-05, + "loss": 2.1052, + "step": 5203 + }, + { + "epoch": 0.22745749377158092, + "grad_norm": 2.75, + "learning_rate": 8.780321742916008e-05, + "loss": 2.1247, + "step": 5204 + }, + { + "epoch": 0.22750120197561083, + "grad_norm": 1.9140625, + "learning_rate": 8.779872154605397e-05, + "loss": 1.8571, + "step": 5205 + }, + { + "epoch": 0.2275449101796407, + "grad_norm": 2.078125, + "learning_rate": 8.779422494963096e-05, + "loss": 2.2196, + "step": 5206 + }, + { + "epoch": 0.22758861838367062, + "grad_norm": 2.21875, + "learning_rate": 8.778972763997592e-05, + "loss": 2.0445, + "step": 5207 + }, + { + "epoch": 0.22763232658770052, + "grad_norm": 3.484375, + "learning_rate": 8.778522961717373e-05, + "loss": 2.1614, + "step": 5208 + }, + { + "epoch": 0.2276760347917304, + "grad_norm": 2.5, + "learning_rate": 8.778073088130925e-05, + "loss": 2.4693, + "step": 5209 + }, + { + "epoch": 0.2277197429957603, + "grad_norm": 2.640625, + "learning_rate": 8.777623143246741e-05, + "loss": 2.0724, + "step": 5210 + }, + { + "epoch": 0.2277634511997902, + "grad_norm": 1.9921875, + "learning_rate": 8.777173127073308e-05, + "loss": 1.5359, + "step": 5211 + }, + { + "epoch": 0.2278071594038201, + "grad_norm": 2.15625, + "learning_rate": 8.776723039619121e-05, + "loss": 1.9064, + "step": 5212 + }, + { + "epoch": 0.22785086760785, + "grad_norm": 2.078125, + "learning_rate": 8.776272880892675e-05, + "loss": 1.9944, + "step": 5213 + }, + { + "epoch": 0.22789457581187988, + "grad_norm": 2.5, + "learning_rate": 8.775822650902463e-05, + "loss": 2.0785, + "step": 5214 + }, + { + "epoch": 0.2279382840159098, + "grad_norm": 4.84375, + "learning_rate": 8.775372349656981e-05, + "loss": 2.0266, + "step": 5215 + }, + { + "epoch": 0.22798199221993967, + "grad_norm": 3.796875, + "learning_rate": 8.77492197716473e-05, + "loss": 2.3301, + "step": 5216 + }, + { + "epoch": 0.22802570042396958, + "grad_norm": 2.140625, + "learning_rate": 8.774471533434206e-05, + "loss": 1.6443, + "step": 5217 + }, + { + "epoch": 0.2280694086279995, + "grad_norm": 1.828125, + "learning_rate": 8.77402101847391e-05, + "loss": 1.6125, + "step": 5218 + }, + { + "epoch": 0.22811311683202937, + "grad_norm": 1.90625, + "learning_rate": 8.773570432292344e-05, + "loss": 1.7902, + "step": 5219 + }, + { + "epoch": 0.22815682503605927, + "grad_norm": 2.234375, + "learning_rate": 8.773119774898013e-05, + "loss": 2.0626, + "step": 5220 + }, + { + "epoch": 0.22820053324008915, + "grad_norm": 2.234375, + "learning_rate": 8.77266904629942e-05, + "loss": 2.1337, + "step": 5221 + }, + { + "epoch": 0.22824424144411906, + "grad_norm": 2.5625, + "learning_rate": 8.772218246505073e-05, + "loss": 2.8284, + "step": 5222 + }, + { + "epoch": 0.22828794964814897, + "grad_norm": 1.9375, + "learning_rate": 8.771767375523475e-05, + "loss": 1.8774, + "step": 5223 + }, + { + "epoch": 0.22833165785217885, + "grad_norm": 2.15625, + "learning_rate": 8.771316433363138e-05, + "loss": 1.9789, + "step": 5224 + }, + { + "epoch": 0.22837536605620876, + "grad_norm": 2.34375, + "learning_rate": 8.770865420032571e-05, + "loss": 2.3812, + "step": 5225 + }, + { + "epoch": 0.22841907426023864, + "grad_norm": 2.40625, + "learning_rate": 8.770414335540285e-05, + "loss": 2.1789, + "step": 5226 + }, + { + "epoch": 0.22846278246426854, + "grad_norm": 2.125, + "learning_rate": 8.769963179894793e-05, + "loss": 1.7856, + "step": 5227 + }, + { + "epoch": 0.22850649066829845, + "grad_norm": 1.9296875, + "learning_rate": 8.76951195310461e-05, + "loss": 2.0435, + "step": 5228 + }, + { + "epoch": 0.22855019887232833, + "grad_norm": 3.78125, + "learning_rate": 8.76906065517825e-05, + "loss": 2.341, + "step": 5229 + }, + { + "epoch": 0.22859390707635824, + "grad_norm": 2.125, + "learning_rate": 8.768609286124228e-05, + "loss": 1.9607, + "step": 5230 + }, + { + "epoch": 0.22863761528038812, + "grad_norm": 2.203125, + "learning_rate": 8.768157845951065e-05, + "loss": 2.5474, + "step": 5231 + }, + { + "epoch": 0.22868132348441803, + "grad_norm": 2.25, + "learning_rate": 8.767706334667279e-05, + "loss": 1.9932, + "step": 5232 + }, + { + "epoch": 0.22872503168844793, + "grad_norm": 2.34375, + "learning_rate": 8.767254752281392e-05, + "loss": 1.9982, + "step": 5233 + }, + { + "epoch": 0.2287687398924778, + "grad_norm": 2.140625, + "learning_rate": 8.766803098801924e-05, + "loss": 1.7068, + "step": 5234 + }, + { + "epoch": 0.22881244809650772, + "grad_norm": 2.375, + "learning_rate": 8.766351374237398e-05, + "loss": 2.1922, + "step": 5235 + }, + { + "epoch": 0.2288561563005376, + "grad_norm": 3.328125, + "learning_rate": 8.765899578596342e-05, + "loss": 2.0894, + "step": 5236 + }, + { + "epoch": 0.2288998645045675, + "grad_norm": 2.8125, + "learning_rate": 8.765447711887278e-05, + "loss": 2.5275, + "step": 5237 + }, + { + "epoch": 0.22894357270859741, + "grad_norm": 2.46875, + "learning_rate": 8.764995774118736e-05, + "loss": 2.0998, + "step": 5238 + }, + { + "epoch": 0.2289872809126273, + "grad_norm": 2.140625, + "learning_rate": 8.764543765299245e-05, + "loss": 1.943, + "step": 5239 + }, + { + "epoch": 0.2290309891166572, + "grad_norm": 2.1875, + "learning_rate": 8.764091685437333e-05, + "loss": 1.6759, + "step": 5240 + }, + { + "epoch": 0.22907469732068708, + "grad_norm": 2.59375, + "learning_rate": 8.763639534541533e-05, + "loss": 2.0194, + "step": 5241 + }, + { + "epoch": 0.229118405524717, + "grad_norm": 2.234375, + "learning_rate": 8.763187312620377e-05, + "loss": 1.8292, + "step": 5242 + }, + { + "epoch": 0.2291621137287469, + "grad_norm": 2.21875, + "learning_rate": 8.762735019682399e-05, + "loss": 1.873, + "step": 5243 + }, + { + "epoch": 0.22920582193277678, + "grad_norm": 2.0625, + "learning_rate": 8.762282655736136e-05, + "loss": 1.5856, + "step": 5244 + }, + { + "epoch": 0.22924953013680668, + "grad_norm": 2.578125, + "learning_rate": 8.761830220790123e-05, + "loss": 2.082, + "step": 5245 + }, + { + "epoch": 0.22929323834083656, + "grad_norm": 2.890625, + "learning_rate": 8.761377714852899e-05, + "loss": 2.1958, + "step": 5246 + }, + { + "epoch": 0.22933694654486647, + "grad_norm": 2.25, + "learning_rate": 8.760925137933001e-05, + "loss": 2.0435, + "step": 5247 + }, + { + "epoch": 0.22938065474889638, + "grad_norm": 2.296875, + "learning_rate": 8.760472490038974e-05, + "loss": 2.3615, + "step": 5248 + }, + { + "epoch": 0.22942436295292626, + "grad_norm": 1.875, + "learning_rate": 8.760019771179358e-05, + "loss": 1.7286, + "step": 5249 + }, + { + "epoch": 0.22946807115695617, + "grad_norm": 2.78125, + "learning_rate": 8.759566981362694e-05, + "loss": 1.8911, + "step": 5250 + }, + { + "epoch": 0.22951177936098605, + "grad_norm": 9.9375, + "learning_rate": 8.759114120597532e-05, + "loss": 1.535, + "step": 5251 + }, + { + "epoch": 0.22955548756501595, + "grad_norm": 2.484375, + "learning_rate": 8.758661188892414e-05, + "loss": 2.203, + "step": 5252 + }, + { + "epoch": 0.22959919576904586, + "grad_norm": 3.296875, + "learning_rate": 8.75820818625589e-05, + "loss": 2.2292, + "step": 5253 + }, + { + "epoch": 0.22964290397307574, + "grad_norm": 2.578125, + "learning_rate": 8.757755112696509e-05, + "loss": 2.431, + "step": 5254 + }, + { + "epoch": 0.22968661217710565, + "grad_norm": 3.640625, + "learning_rate": 8.757301968222817e-05, + "loss": 3.2358, + "step": 5255 + }, + { + "epoch": 0.22973032038113553, + "grad_norm": 1.9296875, + "learning_rate": 8.756848752843369e-05, + "loss": 1.7219, + "step": 5256 + }, + { + "epoch": 0.22977402858516544, + "grad_norm": 2.34375, + "learning_rate": 8.756395466566719e-05, + "loss": 1.6914, + "step": 5257 + }, + { + "epoch": 0.22981773678919534, + "grad_norm": 2.46875, + "learning_rate": 8.755942109401417e-05, + "loss": 2.3572, + "step": 5258 + }, + { + "epoch": 0.22986144499322522, + "grad_norm": 2.59375, + "learning_rate": 8.755488681356022e-05, + "loss": 3.0781, + "step": 5259 + }, + { + "epoch": 0.22990515319725513, + "grad_norm": 1.9296875, + "learning_rate": 8.755035182439088e-05, + "loss": 1.7704, + "step": 5260 + }, + { + "epoch": 0.229948861401285, + "grad_norm": 1.9140625, + "learning_rate": 8.754581612659178e-05, + "loss": 1.8192, + "step": 5261 + }, + { + "epoch": 0.22999256960531492, + "grad_norm": 2.21875, + "learning_rate": 8.754127972024846e-05, + "loss": 2.4582, + "step": 5262 + }, + { + "epoch": 0.23003627780934482, + "grad_norm": 2.578125, + "learning_rate": 8.753674260544657e-05, + "loss": 2.6562, + "step": 5263 + }, + { + "epoch": 0.2300799860133747, + "grad_norm": 3.640625, + "learning_rate": 8.753220478227171e-05, + "loss": 1.5956, + "step": 5264 + }, + { + "epoch": 0.2301236942174046, + "grad_norm": 2.71875, + "learning_rate": 8.752766625080952e-05, + "loss": 1.8347, + "step": 5265 + }, + { + "epoch": 0.2301674024214345, + "grad_norm": 2.71875, + "learning_rate": 8.752312701114564e-05, + "loss": 1.8427, + "step": 5266 + }, + { + "epoch": 0.2302111106254644, + "grad_norm": 3.390625, + "learning_rate": 8.751858706336576e-05, + "loss": 3.1018, + "step": 5267 + }, + { + "epoch": 0.2302548188294943, + "grad_norm": 1.953125, + "learning_rate": 8.751404640755552e-05, + "loss": 1.7006, + "step": 5268 + }, + { + "epoch": 0.2302985270335242, + "grad_norm": 2.1875, + "learning_rate": 8.750950504380062e-05, + "loss": 1.9282, + "step": 5269 + }, + { + "epoch": 0.2303422352375541, + "grad_norm": 2.140625, + "learning_rate": 8.750496297218678e-05, + "loss": 1.5637, + "step": 5270 + }, + { + "epoch": 0.23038594344158397, + "grad_norm": 3.40625, + "learning_rate": 8.75004201927997e-05, + "loss": 1.9573, + "step": 5271 + }, + { + "epoch": 0.23042965164561388, + "grad_norm": 2.3125, + "learning_rate": 8.749587670572511e-05, + "loss": 2.4692, + "step": 5272 + }, + { + "epoch": 0.2304733598496438, + "grad_norm": 1.984375, + "learning_rate": 8.749133251104876e-05, + "loss": 1.6436, + "step": 5273 + }, + { + "epoch": 0.23051706805367367, + "grad_norm": 2.59375, + "learning_rate": 8.748678760885638e-05, + "loss": 2.428, + "step": 5274 + }, + { + "epoch": 0.23056077625770358, + "grad_norm": 2.078125, + "learning_rate": 8.748224199923378e-05, + "loss": 1.7959, + "step": 5275 + }, + { + "epoch": 0.23060448446173346, + "grad_norm": 1.9765625, + "learning_rate": 8.747769568226672e-05, + "loss": 1.7832, + "step": 5276 + }, + { + "epoch": 0.23064819266576336, + "grad_norm": 1.9921875, + "learning_rate": 8.747314865804098e-05, + "loss": 1.8904, + "step": 5277 + }, + { + "epoch": 0.23069190086979327, + "grad_norm": 1.8515625, + "learning_rate": 8.74686009266424e-05, + "loss": 1.6946, + "step": 5278 + }, + { + "epoch": 0.23073560907382315, + "grad_norm": 2.546875, + "learning_rate": 8.74640524881568e-05, + "loss": 2.447, + "step": 5279 + }, + { + "epoch": 0.23077931727785306, + "grad_norm": 2.15625, + "learning_rate": 8.745950334266997e-05, + "loss": 1.8461, + "step": 5280 + }, + { + "epoch": 0.23082302548188294, + "grad_norm": 2.25, + "learning_rate": 8.745495349026781e-05, + "loss": 1.5948, + "step": 5281 + }, + { + "epoch": 0.23086673368591284, + "grad_norm": 2.09375, + "learning_rate": 8.745040293103616e-05, + "loss": 1.9506, + "step": 5282 + }, + { + "epoch": 0.23091044188994275, + "grad_norm": 3.640625, + "learning_rate": 8.744585166506089e-05, + "loss": 1.9788, + "step": 5283 + }, + { + "epoch": 0.23095415009397263, + "grad_norm": 3.46875, + "learning_rate": 8.744129969242793e-05, + "loss": 2.6072, + "step": 5284 + }, + { + "epoch": 0.23099785829800254, + "grad_norm": 2.328125, + "learning_rate": 8.743674701322312e-05, + "loss": 1.6111, + "step": 5285 + }, + { + "epoch": 0.23104156650203242, + "grad_norm": 2.359375, + "learning_rate": 8.743219362753244e-05, + "loss": 1.7419, + "step": 5286 + }, + { + "epoch": 0.23108527470606233, + "grad_norm": 2.671875, + "learning_rate": 8.742763953544175e-05, + "loss": 2.2248, + "step": 5287 + }, + { + "epoch": 0.23112898291009223, + "grad_norm": 2.15625, + "learning_rate": 8.742308473703706e-05, + "loss": 1.6965, + "step": 5288 + }, + { + "epoch": 0.23117269111412211, + "grad_norm": 2.203125, + "learning_rate": 8.741852923240427e-05, + "loss": 1.9022, + "step": 5289 + }, + { + "epoch": 0.23121639931815202, + "grad_norm": 4.21875, + "learning_rate": 8.741397302162939e-05, + "loss": 1.2419, + "step": 5290 + }, + { + "epoch": 0.2312601075221819, + "grad_norm": 1.9375, + "learning_rate": 8.740941610479838e-05, + "loss": 1.4853, + "step": 5291 + }, + { + "epoch": 0.2313038157262118, + "grad_norm": 2.84375, + "learning_rate": 8.740485848199723e-05, + "loss": 2.8512, + "step": 5292 + }, + { + "epoch": 0.23134752393024172, + "grad_norm": 2.125, + "learning_rate": 8.740030015331198e-05, + "loss": 1.8565, + "step": 5293 + }, + { + "epoch": 0.2313912321342716, + "grad_norm": 2.4375, + "learning_rate": 8.739574111882862e-05, + "loss": 2.0053, + "step": 5294 + }, + { + "epoch": 0.2314349403383015, + "grad_norm": 2.046875, + "learning_rate": 8.73911813786332e-05, + "loss": 1.7054, + "step": 5295 + }, + { + "epoch": 0.23147864854233138, + "grad_norm": 2.03125, + "learning_rate": 8.738662093281177e-05, + "loss": 1.8426, + "step": 5296 + }, + { + "epoch": 0.2315223567463613, + "grad_norm": 2.234375, + "learning_rate": 8.738205978145038e-05, + "loss": 2.0349, + "step": 5297 + }, + { + "epoch": 0.2315660649503912, + "grad_norm": 1.984375, + "learning_rate": 8.737749792463512e-05, + "loss": 1.8219, + "step": 5298 + }, + { + "epoch": 0.23160977315442108, + "grad_norm": 2.5, + "learning_rate": 8.737293536245207e-05, + "loss": 3.0215, + "step": 5299 + }, + { + "epoch": 0.23165348135845099, + "grad_norm": 1.9140625, + "learning_rate": 8.736837209498736e-05, + "loss": 1.6017, + "step": 5300 + }, + { + "epoch": 0.23169718956248087, + "grad_norm": 2.125, + "learning_rate": 8.736380812232706e-05, + "loss": 2.0057, + "step": 5301 + }, + { + "epoch": 0.23174089776651077, + "grad_norm": 2.734375, + "learning_rate": 8.735924344455732e-05, + "loss": 2.6932, + "step": 5302 + }, + { + "epoch": 0.23178460597054068, + "grad_norm": 2.453125, + "learning_rate": 8.735467806176429e-05, + "loss": 2.12, + "step": 5303 + }, + { + "epoch": 0.23182831417457056, + "grad_norm": 3.03125, + "learning_rate": 8.735011197403411e-05, + "loss": 2.4922, + "step": 5304 + }, + { + "epoch": 0.23187202237860047, + "grad_norm": 3.546875, + "learning_rate": 8.734554518145297e-05, + "loss": 2.1247, + "step": 5305 + }, + { + "epoch": 0.23191573058263035, + "grad_norm": 2.046875, + "learning_rate": 8.734097768410703e-05, + "loss": 2.5513, + "step": 5306 + }, + { + "epoch": 0.23195943878666025, + "grad_norm": 2.671875, + "learning_rate": 8.733640948208248e-05, + "loss": 2.6246, + "step": 5307 + }, + { + "epoch": 0.23200314699069016, + "grad_norm": 2.65625, + "learning_rate": 8.733184057546558e-05, + "loss": 2.9116, + "step": 5308 + }, + { + "epoch": 0.23204685519472004, + "grad_norm": 2.40625, + "learning_rate": 8.732727096434247e-05, + "loss": 2.7499, + "step": 5309 + }, + { + "epoch": 0.23209056339874995, + "grad_norm": 2.046875, + "learning_rate": 8.732270064879947e-05, + "loss": 1.8118, + "step": 5310 + }, + { + "epoch": 0.23213427160277983, + "grad_norm": 2.71875, + "learning_rate": 8.731812962892277e-05, + "loss": 1.5574, + "step": 5311 + }, + { + "epoch": 0.23217797980680974, + "grad_norm": 2.875, + "learning_rate": 8.731355790479862e-05, + "loss": 2.3718, + "step": 5312 + }, + { + "epoch": 0.23222168801083964, + "grad_norm": 2.890625, + "learning_rate": 8.730898547651337e-05, + "loss": 1.4868, + "step": 5313 + }, + { + "epoch": 0.23226539621486952, + "grad_norm": 2.40625, + "learning_rate": 8.730441234415324e-05, + "loss": 2.3289, + "step": 5314 + }, + { + "epoch": 0.23230910441889943, + "grad_norm": 2.109375, + "learning_rate": 8.729983850780456e-05, + "loss": 1.4998, + "step": 5315 + }, + { + "epoch": 0.2323528126229293, + "grad_norm": 2.53125, + "learning_rate": 8.729526396755365e-05, + "loss": 2.767, + "step": 5316 + }, + { + "epoch": 0.23239652082695922, + "grad_norm": 2.078125, + "learning_rate": 8.729068872348681e-05, + "loss": 1.6264, + "step": 5317 + }, + { + "epoch": 0.23244022903098913, + "grad_norm": 2.265625, + "learning_rate": 8.728611277569042e-05, + "loss": 1.6083, + "step": 5318 + }, + { + "epoch": 0.232483937235019, + "grad_norm": 3.1875, + "learning_rate": 8.72815361242508e-05, + "loss": 1.8636, + "step": 5319 + }, + { + "epoch": 0.2325276454390489, + "grad_norm": 2.015625, + "learning_rate": 8.727695876925433e-05, + "loss": 1.5848, + "step": 5320 + }, + { + "epoch": 0.2325713536430788, + "grad_norm": 2.484375, + "learning_rate": 8.72723807107874e-05, + "loss": 2.8408, + "step": 5321 + }, + { + "epoch": 0.2326150618471087, + "grad_norm": 2.234375, + "learning_rate": 8.72678019489364e-05, + "loss": 1.5735, + "step": 5322 + }, + { + "epoch": 0.2326587700511386, + "grad_norm": 2.28125, + "learning_rate": 8.726322248378775e-05, + "loss": 1.8741, + "step": 5323 + }, + { + "epoch": 0.2327024782551685, + "grad_norm": 2.765625, + "learning_rate": 8.725864231542784e-05, + "loss": 1.8679, + "step": 5324 + }, + { + "epoch": 0.2327461864591984, + "grad_norm": 2.234375, + "learning_rate": 8.725406144394313e-05, + "loss": 2.4191, + "step": 5325 + }, + { + "epoch": 0.23278989466322828, + "grad_norm": 2.078125, + "learning_rate": 8.724947986942004e-05, + "loss": 1.9748, + "step": 5326 + }, + { + "epoch": 0.23283360286725818, + "grad_norm": 1.9453125, + "learning_rate": 8.724489759194507e-05, + "loss": 1.6349, + "step": 5327 + }, + { + "epoch": 0.2328773110712881, + "grad_norm": 2.4375, + "learning_rate": 8.724031461160467e-05, + "loss": 2.3706, + "step": 5328 + }, + { + "epoch": 0.23292101927531797, + "grad_norm": 1.9609375, + "learning_rate": 8.723573092848534e-05, + "loss": 1.7051, + "step": 5329 + }, + { + "epoch": 0.23296472747934788, + "grad_norm": 1.953125, + "learning_rate": 8.723114654267356e-05, + "loss": 1.9907, + "step": 5330 + }, + { + "epoch": 0.23300843568337776, + "grad_norm": 2.09375, + "learning_rate": 8.722656145425587e-05, + "loss": 1.867, + "step": 5331 + }, + { + "epoch": 0.23305214388740766, + "grad_norm": 2.15625, + "learning_rate": 8.722197566331878e-05, + "loss": 2.5761, + "step": 5332 + }, + { + "epoch": 0.23309585209143757, + "grad_norm": 2.09375, + "learning_rate": 8.721738916994883e-05, + "loss": 1.9586, + "step": 5333 + }, + { + "epoch": 0.23313956029546745, + "grad_norm": 2.453125, + "learning_rate": 8.721280197423258e-05, + "loss": 1.9132, + "step": 5334 + }, + { + "epoch": 0.23318326849949736, + "grad_norm": 2.3125, + "learning_rate": 8.720821407625661e-05, + "loss": 2.1847, + "step": 5335 + }, + { + "epoch": 0.23322697670352724, + "grad_norm": 4.625, + "learning_rate": 8.720362547610747e-05, + "loss": 2.6595, + "step": 5336 + }, + { + "epoch": 0.23327068490755715, + "grad_norm": 2.71875, + "learning_rate": 8.719903617387178e-05, + "loss": 2.1804, + "step": 5337 + }, + { + "epoch": 0.23331439311158705, + "grad_norm": 2.375, + "learning_rate": 8.719444616963613e-05, + "loss": 2.4745, + "step": 5338 + }, + { + "epoch": 0.23335810131561693, + "grad_norm": 1.8359375, + "learning_rate": 8.718985546348715e-05, + "loss": 1.5742, + "step": 5339 + }, + { + "epoch": 0.23340180951964684, + "grad_norm": 2.296875, + "learning_rate": 8.718526405551146e-05, + "loss": 2.6613, + "step": 5340 + }, + { + "epoch": 0.23344551772367672, + "grad_norm": 2.25, + "learning_rate": 8.718067194579573e-05, + "loss": 2.6931, + "step": 5341 + }, + { + "epoch": 0.23348922592770663, + "grad_norm": 1.8828125, + "learning_rate": 8.71760791344266e-05, + "loss": 1.6553, + "step": 5342 + }, + { + "epoch": 0.23353293413173654, + "grad_norm": 2.125, + "learning_rate": 8.717148562149076e-05, + "loss": 1.8587, + "step": 5343 + }, + { + "epoch": 0.23357664233576642, + "grad_norm": 2.296875, + "learning_rate": 8.716689140707488e-05, + "loss": 2.2146, + "step": 5344 + }, + { + "epoch": 0.23362035053979632, + "grad_norm": 2.125, + "learning_rate": 8.716229649126566e-05, + "loss": 1.5433, + "step": 5345 + }, + { + "epoch": 0.2336640587438262, + "grad_norm": 2.34375, + "learning_rate": 8.715770087414983e-05, + "loss": 1.7082, + "step": 5346 + }, + { + "epoch": 0.2337077669478561, + "grad_norm": 2.53125, + "learning_rate": 8.715310455581409e-05, + "loss": 1.8389, + "step": 5347 + }, + { + "epoch": 0.23375147515188602, + "grad_norm": 2.15625, + "learning_rate": 8.71485075363452e-05, + "loss": 1.7461, + "step": 5348 + }, + { + "epoch": 0.2337951833559159, + "grad_norm": 2.34375, + "learning_rate": 8.714390981582991e-05, + "loss": 1.8022, + "step": 5349 + }, + { + "epoch": 0.2338388915599458, + "grad_norm": 2.0625, + "learning_rate": 8.713931139435497e-05, + "loss": 1.8482, + "step": 5350 + }, + { + "epoch": 0.23388259976397568, + "grad_norm": 1.96875, + "learning_rate": 8.713471227200719e-05, + "loss": 1.8543, + "step": 5351 + }, + { + "epoch": 0.2339263079680056, + "grad_norm": 1.9453125, + "learning_rate": 8.713011244887331e-05, + "loss": 1.3542, + "step": 5352 + }, + { + "epoch": 0.2339700161720355, + "grad_norm": 2.890625, + "learning_rate": 8.712551192504019e-05, + "loss": 1.8759, + "step": 5353 + }, + { + "epoch": 0.23401372437606538, + "grad_norm": 2.78125, + "learning_rate": 8.712091070059463e-05, + "loss": 2.6063, + "step": 5354 + }, + { + "epoch": 0.2340574325800953, + "grad_norm": 4.8125, + "learning_rate": 8.711630877562345e-05, + "loss": 2.0466, + "step": 5355 + }, + { + "epoch": 0.23410114078412517, + "grad_norm": 3.859375, + "learning_rate": 8.71117061502135e-05, + "loss": 2.1461, + "step": 5356 + }, + { + "epoch": 0.23414484898815507, + "grad_norm": 2.75, + "learning_rate": 8.710710282445165e-05, + "loss": 2.7146, + "step": 5357 + }, + { + "epoch": 0.23418855719218498, + "grad_norm": 2.09375, + "learning_rate": 8.710249879842476e-05, + "loss": 1.7998, + "step": 5358 + }, + { + "epoch": 0.23423226539621486, + "grad_norm": 7.21875, + "learning_rate": 8.709789407221971e-05, + "loss": 1.5454, + "step": 5359 + }, + { + "epoch": 0.23427597360024477, + "grad_norm": 2.71875, + "learning_rate": 8.709328864592341e-05, + "loss": 2.0275, + "step": 5360 + }, + { + "epoch": 0.23431968180427465, + "grad_norm": 2.4375, + "learning_rate": 8.708868251962277e-05, + "loss": 2.6636, + "step": 5361 + }, + { + "epoch": 0.23436339000830456, + "grad_norm": 1.8984375, + "learning_rate": 8.70840756934047e-05, + "loss": 1.9007, + "step": 5362 + }, + { + "epoch": 0.23440709821233446, + "grad_norm": 3.53125, + "learning_rate": 8.707946816735617e-05, + "loss": 2.0047, + "step": 5363 + }, + { + "epoch": 0.23445080641636434, + "grad_norm": 2.703125, + "learning_rate": 8.70748599415641e-05, + "loss": 1.9084, + "step": 5364 + }, + { + "epoch": 0.23449451462039425, + "grad_norm": 2.28125, + "learning_rate": 8.707025101611545e-05, + "loss": 2.4486, + "step": 5365 + }, + { + "epoch": 0.23453822282442413, + "grad_norm": 2.546875, + "learning_rate": 8.706564139109722e-05, + "loss": 1.7819, + "step": 5366 + }, + { + "epoch": 0.23458193102845404, + "grad_norm": 2.1875, + "learning_rate": 8.706103106659638e-05, + "loss": 1.785, + "step": 5367 + }, + { + "epoch": 0.23462563923248395, + "grad_norm": 2.0625, + "learning_rate": 8.705642004269996e-05, + "loss": 2.4596, + "step": 5368 + }, + { + "epoch": 0.23466934743651383, + "grad_norm": 2.34375, + "learning_rate": 8.705180831949496e-05, + "loss": 2.7992, + "step": 5369 + }, + { + "epoch": 0.23471305564054373, + "grad_norm": 2.34375, + "learning_rate": 8.70471958970684e-05, + "loss": 1.4907, + "step": 5370 + }, + { + "epoch": 0.2347567638445736, + "grad_norm": 3.0625, + "learning_rate": 8.704258277550735e-05, + "loss": 1.842, + "step": 5371 + }, + { + "epoch": 0.23480047204860352, + "grad_norm": 2.484375, + "learning_rate": 8.703796895489883e-05, + "loss": 1.6824, + "step": 5372 + }, + { + "epoch": 0.23484418025263343, + "grad_norm": 2.75, + "learning_rate": 8.703335443532994e-05, + "loss": 2.8193, + "step": 5373 + }, + { + "epoch": 0.2348878884566633, + "grad_norm": 2.609375, + "learning_rate": 8.702873921688776e-05, + "loss": 2.1645, + "step": 5374 + }, + { + "epoch": 0.23493159666069321, + "grad_norm": 2.078125, + "learning_rate": 8.702412329965937e-05, + "loss": 2.1561, + "step": 5375 + }, + { + "epoch": 0.23497530486472312, + "grad_norm": 1.875, + "learning_rate": 8.701950668373189e-05, + "loss": 1.7574, + "step": 5376 + }, + { + "epoch": 0.235019013068753, + "grad_norm": 2.140625, + "learning_rate": 8.701488936919242e-05, + "loss": 2.1245, + "step": 5377 + }, + { + "epoch": 0.2350627212727829, + "grad_norm": 2.140625, + "learning_rate": 8.701027135612814e-05, + "loss": 1.968, + "step": 5378 + }, + { + "epoch": 0.2351064294768128, + "grad_norm": 2.375, + "learning_rate": 8.700565264462617e-05, + "loss": 1.6649, + "step": 5379 + }, + { + "epoch": 0.2351501376808427, + "grad_norm": 1.953125, + "learning_rate": 8.700103323477366e-05, + "loss": 1.9941, + "step": 5380 + }, + { + "epoch": 0.2351938458848726, + "grad_norm": 2.203125, + "learning_rate": 8.699641312665782e-05, + "loss": 2.0117, + "step": 5381 + }, + { + "epoch": 0.23523755408890248, + "grad_norm": 2.4375, + "learning_rate": 8.69917923203658e-05, + "loss": 1.7337, + "step": 5382 + }, + { + "epoch": 0.2352812622929324, + "grad_norm": 2.90625, + "learning_rate": 8.698717081598484e-05, + "loss": 1.8028, + "step": 5383 + }, + { + "epoch": 0.23532497049696227, + "grad_norm": 2.265625, + "learning_rate": 8.698254861360211e-05, + "loss": 1.703, + "step": 5384 + }, + { + "epoch": 0.23536867870099218, + "grad_norm": 2.734375, + "learning_rate": 8.697792571330487e-05, + "loss": 2.4212, + "step": 5385 + }, + { + "epoch": 0.2354123869050221, + "grad_norm": 2.5, + "learning_rate": 8.697330211518038e-05, + "loss": 2.0185, + "step": 5386 + }, + { + "epoch": 0.23545609510905197, + "grad_norm": 2.875, + "learning_rate": 8.696867781931584e-05, + "loss": 3.0804, + "step": 5387 + }, + { + "epoch": 0.23549980331308187, + "grad_norm": 2.375, + "learning_rate": 8.696405282579855e-05, + "loss": 1.683, + "step": 5388 + }, + { + "epoch": 0.23554351151711175, + "grad_norm": 2.015625, + "learning_rate": 8.695942713471577e-05, + "loss": 2.1921, + "step": 5389 + }, + { + "epoch": 0.23558721972114166, + "grad_norm": 3.125, + "learning_rate": 8.695480074615483e-05, + "loss": 1.9542, + "step": 5390 + }, + { + "epoch": 0.23563092792517157, + "grad_norm": 2.5, + "learning_rate": 8.695017366020301e-05, + "loss": 1.7134, + "step": 5391 + }, + { + "epoch": 0.23567463612920145, + "grad_norm": 2.203125, + "learning_rate": 8.694554587694764e-05, + "loss": 1.7987, + "step": 5392 + }, + { + "epoch": 0.23571834433323136, + "grad_norm": 3.03125, + "learning_rate": 8.694091739647602e-05, + "loss": 2.0596, + "step": 5393 + }, + { + "epoch": 0.23576205253726124, + "grad_norm": 2.3125, + "learning_rate": 8.693628821887556e-05, + "loss": 2.1124, + "step": 5394 + }, + { + "epoch": 0.23580576074129114, + "grad_norm": 2.515625, + "learning_rate": 8.693165834423355e-05, + "loss": 2.8182, + "step": 5395 + }, + { + "epoch": 0.23584946894532105, + "grad_norm": 2.265625, + "learning_rate": 8.692702777263742e-05, + "loss": 1.3823, + "step": 5396 + }, + { + "epoch": 0.23589317714935093, + "grad_norm": 2.546875, + "learning_rate": 8.692239650417452e-05, + "loss": 1.7818, + "step": 5397 + }, + { + "epoch": 0.23593688535338084, + "grad_norm": 2.734375, + "learning_rate": 8.691776453893227e-05, + "loss": 2.2617, + "step": 5398 + }, + { + "epoch": 0.23598059355741072, + "grad_norm": 2.25, + "learning_rate": 8.691313187699805e-05, + "loss": 2.4695, + "step": 5399 + }, + { + "epoch": 0.23602430176144062, + "grad_norm": 2.09375, + "learning_rate": 8.690849851845933e-05, + "loss": 1.8407, + "step": 5400 + }, + { + "epoch": 0.23606800996547053, + "grad_norm": 2.015625, + "learning_rate": 8.690386446340353e-05, + "loss": 1.6898, + "step": 5401 + }, + { + "epoch": 0.2361117181695004, + "grad_norm": 2.265625, + "learning_rate": 8.689922971191809e-05, + "loss": 1.4087, + "step": 5402 + }, + { + "epoch": 0.23615542637353032, + "grad_norm": 3.109375, + "learning_rate": 8.689459426409045e-05, + "loss": 2.578, + "step": 5403 + }, + { + "epoch": 0.2361991345775602, + "grad_norm": 2.296875, + "learning_rate": 8.688995812000815e-05, + "loss": 1.958, + "step": 5404 + }, + { + "epoch": 0.2362428427815901, + "grad_norm": 2.515625, + "learning_rate": 8.688532127975865e-05, + "loss": 2.3085, + "step": 5405 + }, + { + "epoch": 0.23628655098562001, + "grad_norm": 2.96875, + "learning_rate": 8.688068374342944e-05, + "loss": 1.7854, + "step": 5406 + }, + { + "epoch": 0.2363302591896499, + "grad_norm": 2.546875, + "learning_rate": 8.687604551110807e-05, + "loss": 2.3475, + "step": 5407 + }, + { + "epoch": 0.2363739673936798, + "grad_norm": 4.3125, + "learning_rate": 8.687140658288202e-05, + "loss": 1.7633, + "step": 5408 + }, + { + "epoch": 0.23641767559770968, + "grad_norm": 2.953125, + "learning_rate": 8.686676695883889e-05, + "loss": 2.5959, + "step": 5409 + }, + { + "epoch": 0.2364613838017396, + "grad_norm": 1.9765625, + "learning_rate": 8.68621266390662e-05, + "loss": 1.6729, + "step": 5410 + }, + { + "epoch": 0.2365050920057695, + "grad_norm": 2.421875, + "learning_rate": 8.685748562365153e-05, + "loss": 1.8097, + "step": 5411 + }, + { + "epoch": 0.23654880020979938, + "grad_norm": 4.3125, + "learning_rate": 8.685284391268247e-05, + "loss": 1.8138, + "step": 5412 + }, + { + "epoch": 0.23659250841382928, + "grad_norm": 2.03125, + "learning_rate": 8.684820150624659e-05, + "loss": 1.8495, + "step": 5413 + }, + { + "epoch": 0.23663621661785916, + "grad_norm": 2.453125, + "learning_rate": 8.684355840443155e-05, + "loss": 2.0667, + "step": 5414 + }, + { + "epoch": 0.23667992482188907, + "grad_norm": 1.828125, + "learning_rate": 8.683891460732492e-05, + "loss": 1.48, + "step": 5415 + }, + { + "epoch": 0.23672363302591898, + "grad_norm": 2.328125, + "learning_rate": 8.683427011501434e-05, + "loss": 1.7694, + "step": 5416 + }, + { + "epoch": 0.23676734122994886, + "grad_norm": 1.9765625, + "learning_rate": 8.68296249275875e-05, + "loss": 1.4667, + "step": 5417 + }, + { + "epoch": 0.23681104943397877, + "grad_norm": 2.140625, + "learning_rate": 8.682497904513203e-05, + "loss": 1.8178, + "step": 5418 + }, + { + "epoch": 0.23685475763800865, + "grad_norm": 2.203125, + "learning_rate": 8.68203324677356e-05, + "loss": 1.4938, + "step": 5419 + }, + { + "epoch": 0.23689846584203855, + "grad_norm": 2.515625, + "learning_rate": 8.681568519548591e-05, + "loss": 2.7004, + "step": 5420 + }, + { + "epoch": 0.23694217404606846, + "grad_norm": 2.5625, + "learning_rate": 8.681103722847065e-05, + "loss": 2.2187, + "step": 5421 + }, + { + "epoch": 0.23698588225009834, + "grad_norm": 2.359375, + "learning_rate": 8.680638856677754e-05, + "loss": 2.0843, + "step": 5422 + }, + { + "epoch": 0.23702959045412825, + "grad_norm": 1.9453125, + "learning_rate": 8.680173921049433e-05, + "loss": 1.6284, + "step": 5423 + }, + { + "epoch": 0.23707329865815813, + "grad_norm": 1.9609375, + "learning_rate": 8.679708915970873e-05, + "loss": 1.8602, + "step": 5424 + }, + { + "epoch": 0.23711700686218803, + "grad_norm": 2.609375, + "learning_rate": 8.679243841450848e-05, + "loss": 2.6321, + "step": 5425 + }, + { + "epoch": 0.23716071506621794, + "grad_norm": 2.671875, + "learning_rate": 8.67877869749814e-05, + "loss": 2.7493, + "step": 5426 + }, + { + "epoch": 0.23720442327024782, + "grad_norm": 2.03125, + "learning_rate": 8.678313484121522e-05, + "loss": 1.7742, + "step": 5427 + }, + { + "epoch": 0.23724813147427773, + "grad_norm": 3.03125, + "learning_rate": 8.677848201329774e-05, + "loss": 2.1923, + "step": 5428 + }, + { + "epoch": 0.2372918396783076, + "grad_norm": 2.0625, + "learning_rate": 8.677382849131682e-05, + "loss": 1.8479, + "step": 5429 + }, + { + "epoch": 0.23733554788233752, + "grad_norm": 2.546875, + "learning_rate": 8.676917427536017e-05, + "loss": 2.7202, + "step": 5430 + }, + { + "epoch": 0.23737925608636742, + "grad_norm": 2.921875, + "learning_rate": 8.676451936551573e-05, + "loss": 1.9664, + "step": 5431 + }, + { + "epoch": 0.2374229642903973, + "grad_norm": 1.8671875, + "learning_rate": 8.67598637618713e-05, + "loss": 1.6204, + "step": 5432 + }, + { + "epoch": 0.2374666724944272, + "grad_norm": 2.28125, + "learning_rate": 8.675520746451473e-05, + "loss": 1.4899, + "step": 5433 + }, + { + "epoch": 0.2375103806984571, + "grad_norm": 2.09375, + "learning_rate": 8.675055047353391e-05, + "loss": 1.8132, + "step": 5434 + }, + { + "epoch": 0.237554088902487, + "grad_norm": 3.484375, + "learning_rate": 8.67458927890167e-05, + "loss": 1.4917, + "step": 5435 + }, + { + "epoch": 0.2375977971065169, + "grad_norm": 2.890625, + "learning_rate": 8.674123441105102e-05, + "loss": 1.318, + "step": 5436 + }, + { + "epoch": 0.23764150531054679, + "grad_norm": 2.28125, + "learning_rate": 8.673657533972477e-05, + "loss": 1.9401, + "step": 5437 + }, + { + "epoch": 0.2376852135145767, + "grad_norm": 2.640625, + "learning_rate": 8.673191557512588e-05, + "loss": 2.696, + "step": 5438 + }, + { + "epoch": 0.23772892171860657, + "grad_norm": 2.03125, + "learning_rate": 8.672725511734228e-05, + "loss": 2.0011, + "step": 5439 + }, + { + "epoch": 0.23777262992263648, + "grad_norm": 2.25, + "learning_rate": 8.672259396646191e-05, + "loss": 1.6778, + "step": 5440 + }, + { + "epoch": 0.2378163381266664, + "grad_norm": 2.015625, + "learning_rate": 8.671793212257276e-05, + "loss": 2.2002, + "step": 5441 + }, + { + "epoch": 0.23786004633069627, + "grad_norm": 1.984375, + "learning_rate": 8.671326958576279e-05, + "loss": 1.8127, + "step": 5442 + }, + { + "epoch": 0.23790375453472618, + "grad_norm": 2.546875, + "learning_rate": 8.670860635611997e-05, + "loss": 2.4345, + "step": 5443 + }, + { + "epoch": 0.23794746273875605, + "grad_norm": 2.359375, + "learning_rate": 8.670394243373236e-05, + "loss": 2.1367, + "step": 5444 + }, + { + "epoch": 0.23799117094278596, + "grad_norm": 2.265625, + "learning_rate": 8.66992778186879e-05, + "loss": 1.5326, + "step": 5445 + }, + { + "epoch": 0.23803487914681587, + "grad_norm": 2.0, + "learning_rate": 8.669461251107466e-05, + "loss": 2.1858, + "step": 5446 + }, + { + "epoch": 0.23807858735084575, + "grad_norm": 2.625, + "learning_rate": 8.66899465109807e-05, + "loss": 2.0247, + "step": 5447 + }, + { + "epoch": 0.23812229555487566, + "grad_norm": 2.734375, + "learning_rate": 8.668527981849402e-05, + "loss": 2.1743, + "step": 5448 + }, + { + "epoch": 0.23816600375890554, + "grad_norm": 2.328125, + "learning_rate": 8.668061243370274e-05, + "loss": 1.9896, + "step": 5449 + }, + { + "epoch": 0.23820971196293544, + "grad_norm": 3.28125, + "learning_rate": 8.66759443566949e-05, + "loss": 2.7154, + "step": 5450 + }, + { + "epoch": 0.23825342016696535, + "grad_norm": 2.78125, + "learning_rate": 8.667127558755862e-05, + "loss": 2.4444, + "step": 5451 + }, + { + "epoch": 0.23829712837099523, + "grad_norm": 2.203125, + "learning_rate": 8.6666606126382e-05, + "loss": 2.4881, + "step": 5452 + }, + { + "epoch": 0.23834083657502514, + "grad_norm": 2.8125, + "learning_rate": 8.666193597325314e-05, + "loss": 1.6791, + "step": 5453 + }, + { + "epoch": 0.23838454477905502, + "grad_norm": 2.046875, + "learning_rate": 8.66572651282602e-05, + "loss": 2.1325, + "step": 5454 + }, + { + "epoch": 0.23842825298308493, + "grad_norm": 2.609375, + "learning_rate": 8.665259359149132e-05, + "loss": 2.0048, + "step": 5455 + }, + { + "epoch": 0.23847196118711483, + "grad_norm": 2.328125, + "learning_rate": 8.664792136303465e-05, + "loss": 2.0438, + "step": 5456 + }, + { + "epoch": 0.2385156693911447, + "grad_norm": 2.03125, + "learning_rate": 8.664324844297837e-05, + "loss": 1.7064, + "step": 5457 + }, + { + "epoch": 0.23855937759517462, + "grad_norm": 2.5625, + "learning_rate": 8.663857483141064e-05, + "loss": 2.4179, + "step": 5458 + }, + { + "epoch": 0.2386030857992045, + "grad_norm": 2.03125, + "learning_rate": 8.66339005284197e-05, + "loss": 1.6632, + "step": 5459 + }, + { + "epoch": 0.2386467940032344, + "grad_norm": 3.5, + "learning_rate": 8.662922553409373e-05, + "loss": 1.39, + "step": 5460 + }, + { + "epoch": 0.23869050220726432, + "grad_norm": 2.28125, + "learning_rate": 8.662454984852098e-05, + "loss": 1.7344, + "step": 5461 + }, + { + "epoch": 0.2387342104112942, + "grad_norm": 2.15625, + "learning_rate": 8.661987347178964e-05, + "loss": 1.8208, + "step": 5462 + }, + { + "epoch": 0.2387779186153241, + "grad_norm": 2.0625, + "learning_rate": 8.661519640398801e-05, + "loss": 1.9327, + "step": 5463 + }, + { + "epoch": 0.23882162681935398, + "grad_norm": 2.5, + "learning_rate": 8.661051864520432e-05, + "loss": 2.1962, + "step": 5464 + }, + { + "epoch": 0.2388653350233839, + "grad_norm": 2.03125, + "learning_rate": 8.660584019552687e-05, + "loss": 1.6932, + "step": 5465 + }, + { + "epoch": 0.2389090432274138, + "grad_norm": 2.21875, + "learning_rate": 8.660116105504393e-05, + "loss": 1.9655, + "step": 5466 + }, + { + "epoch": 0.23895275143144368, + "grad_norm": 2.21875, + "learning_rate": 8.659648122384382e-05, + "loss": 2.2509, + "step": 5467 + }, + { + "epoch": 0.23899645963547358, + "grad_norm": 2.1875, + "learning_rate": 8.659180070201484e-05, + "loss": 2.0024, + "step": 5468 + }, + { + "epoch": 0.23904016783950346, + "grad_norm": 2.34375, + "learning_rate": 8.658711948964533e-05, + "loss": 2.3215, + "step": 5469 + }, + { + "epoch": 0.23908387604353337, + "grad_norm": 2.0625, + "learning_rate": 8.658243758682361e-05, + "loss": 1.5055, + "step": 5470 + }, + { + "epoch": 0.23912758424756328, + "grad_norm": 2.53125, + "learning_rate": 8.657775499363806e-05, + "loss": 1.6403, + "step": 5471 + }, + { + "epoch": 0.23917129245159316, + "grad_norm": 1.9609375, + "learning_rate": 8.657307171017703e-05, + "loss": 1.699, + "step": 5472 + }, + { + "epoch": 0.23921500065562307, + "grad_norm": 1.875, + "learning_rate": 8.656838773652891e-05, + "loss": 2.1911, + "step": 5473 + }, + { + "epoch": 0.23925870885965295, + "grad_norm": 2.453125, + "learning_rate": 8.65637030727821e-05, + "loss": 2.0156, + "step": 5474 + }, + { + "epoch": 0.23930241706368285, + "grad_norm": 2.609375, + "learning_rate": 8.655901771902499e-05, + "loss": 1.7335, + "step": 5475 + }, + { + "epoch": 0.23934612526771276, + "grad_norm": 2.84375, + "learning_rate": 8.655433167534601e-05, + "loss": 2.5339, + "step": 5476 + }, + { + "epoch": 0.23938983347174264, + "grad_norm": 1.84375, + "learning_rate": 8.654964494183358e-05, + "loss": 1.634, + "step": 5477 + }, + { + "epoch": 0.23943354167577255, + "grad_norm": 2.546875, + "learning_rate": 8.654495751857616e-05, + "loss": 1.8699, + "step": 5478 + }, + { + "epoch": 0.23947724987980243, + "grad_norm": 2.328125, + "learning_rate": 8.654026940566222e-05, + "loss": 2.0441, + "step": 5479 + }, + { + "epoch": 0.23952095808383234, + "grad_norm": 2.28125, + "learning_rate": 8.653558060318018e-05, + "loss": 2.1452, + "step": 5480 + }, + { + "epoch": 0.23956466628786224, + "grad_norm": 2.546875, + "learning_rate": 8.653089111121859e-05, + "loss": 2.3463, + "step": 5481 + }, + { + "epoch": 0.23960837449189212, + "grad_norm": 2.578125, + "learning_rate": 8.65262009298659e-05, + "loss": 1.9638, + "step": 5482 + }, + { + "epoch": 0.23965208269592203, + "grad_norm": 2.203125, + "learning_rate": 8.652151005921064e-05, + "loss": 2.3107, + "step": 5483 + }, + { + "epoch": 0.2396957908999519, + "grad_norm": 2.734375, + "learning_rate": 8.651681849934134e-05, + "loss": 1.8512, + "step": 5484 + }, + { + "epoch": 0.23973949910398182, + "grad_norm": 2.09375, + "learning_rate": 8.651212625034651e-05, + "loss": 1.7393, + "step": 5485 + }, + { + "epoch": 0.23978320730801173, + "grad_norm": 1.8984375, + "learning_rate": 8.650743331231472e-05, + "loss": 1.7654, + "step": 5486 + }, + { + "epoch": 0.2398269155120416, + "grad_norm": 1.96875, + "learning_rate": 8.650273968533454e-05, + "loss": 2.1793, + "step": 5487 + }, + { + "epoch": 0.2398706237160715, + "grad_norm": 2.078125, + "learning_rate": 8.649804536949454e-05, + "loss": 1.9098, + "step": 5488 + }, + { + "epoch": 0.2399143319201014, + "grad_norm": 2.328125, + "learning_rate": 8.649335036488329e-05, + "loss": 2.5594, + "step": 5489 + }, + { + "epoch": 0.2399580401241313, + "grad_norm": 2.3125, + "learning_rate": 8.64886546715894e-05, + "loss": 2.2916, + "step": 5490 + }, + { + "epoch": 0.2400017483281612, + "grad_norm": 2.625, + "learning_rate": 8.64839582897015e-05, + "loss": 2.9065, + "step": 5491 + }, + { + "epoch": 0.2400454565321911, + "grad_norm": 2.171875, + "learning_rate": 8.647926121930821e-05, + "loss": 2.2818, + "step": 5492 + }, + { + "epoch": 0.240089164736221, + "grad_norm": 3.203125, + "learning_rate": 8.647456346049816e-05, + "loss": 1.6659, + "step": 5493 + }, + { + "epoch": 0.24013287294025087, + "grad_norm": 2.609375, + "learning_rate": 8.646986501336001e-05, + "loss": 2.4522, + "step": 5494 + }, + { + "epoch": 0.24017658114428078, + "grad_norm": 3.203125, + "learning_rate": 8.646516587798243e-05, + "loss": 2.2336, + "step": 5495 + }, + { + "epoch": 0.2402202893483107, + "grad_norm": 2.40625, + "learning_rate": 8.64604660544541e-05, + "loss": 1.8014, + "step": 5496 + }, + { + "epoch": 0.24026399755234057, + "grad_norm": 2.21875, + "learning_rate": 8.645576554286372e-05, + "loss": 1.5453, + "step": 5497 + }, + { + "epoch": 0.24030770575637048, + "grad_norm": 2.609375, + "learning_rate": 8.645106434329996e-05, + "loss": 2.1022, + "step": 5498 + }, + { + "epoch": 0.24035141396040036, + "grad_norm": 2.03125, + "learning_rate": 8.644636245585159e-05, + "loss": 1.9023, + "step": 5499 + }, + { + "epoch": 0.24039512216443026, + "grad_norm": 2.078125, + "learning_rate": 8.64416598806073e-05, + "loss": 1.5248, + "step": 5500 + }, + { + "epoch": 0.24043883036846017, + "grad_norm": 2.09375, + "learning_rate": 8.643695661765585e-05, + "loss": 1.7943, + "step": 5501 + }, + { + "epoch": 0.24048253857249005, + "grad_norm": 2.34375, + "learning_rate": 8.643225266708601e-05, + "loss": 2.9711, + "step": 5502 + }, + { + "epoch": 0.24052624677651996, + "grad_norm": 2.40625, + "learning_rate": 8.642754802898652e-05, + "loss": 1.6168, + "step": 5503 + }, + { + "epoch": 0.24056995498054984, + "grad_norm": 2.046875, + "learning_rate": 8.64228427034462e-05, + "loss": 1.6666, + "step": 5504 + }, + { + "epoch": 0.24061366318457975, + "grad_norm": 2.796875, + "learning_rate": 8.641813669055381e-05, + "loss": 2.1982, + "step": 5505 + }, + { + "epoch": 0.24065737138860965, + "grad_norm": 2.109375, + "learning_rate": 8.641342999039819e-05, + "loss": 1.7612, + "step": 5506 + }, + { + "epoch": 0.24070107959263953, + "grad_norm": 2.234375, + "learning_rate": 8.640872260306814e-05, + "loss": 1.2477, + "step": 5507 + }, + { + "epoch": 0.24074478779666944, + "grad_norm": 1.8359375, + "learning_rate": 8.64040145286525e-05, + "loss": 1.8345, + "step": 5508 + }, + { + "epoch": 0.24078849600069932, + "grad_norm": 2.015625, + "learning_rate": 8.639930576724014e-05, + "loss": 1.8334, + "step": 5509 + }, + { + "epoch": 0.24083220420472923, + "grad_norm": 2.0, + "learning_rate": 8.639459631891989e-05, + "loss": 2.0725, + "step": 5510 + }, + { + "epoch": 0.24087591240875914, + "grad_norm": 2.0, + "learning_rate": 8.638988618378063e-05, + "loss": 2.0752, + "step": 5511 + }, + { + "epoch": 0.24091962061278901, + "grad_norm": 2.625, + "learning_rate": 8.638517536191127e-05, + "loss": 2.3952, + "step": 5512 + }, + { + "epoch": 0.24096332881681892, + "grad_norm": 2.078125, + "learning_rate": 8.638046385340069e-05, + "loss": 1.9946, + "step": 5513 + }, + { + "epoch": 0.2410070370208488, + "grad_norm": 2.96875, + "learning_rate": 8.63757516583378e-05, + "loss": 1.7343, + "step": 5514 + }, + { + "epoch": 0.2410507452248787, + "grad_norm": 2.59375, + "learning_rate": 8.637103877681155e-05, + "loss": 2.0783, + "step": 5515 + }, + { + "epoch": 0.24109445342890862, + "grad_norm": 5.4375, + "learning_rate": 8.636632520891085e-05, + "loss": 2.0024, + "step": 5516 + }, + { + "epoch": 0.2411381616329385, + "grad_norm": 2.046875, + "learning_rate": 8.636161095472466e-05, + "loss": 1.9169, + "step": 5517 + }, + { + "epoch": 0.2411818698369684, + "grad_norm": 2.296875, + "learning_rate": 8.635689601434196e-05, + "loss": 2.361, + "step": 5518 + }, + { + "epoch": 0.24122557804099828, + "grad_norm": 2.03125, + "learning_rate": 8.635218038785171e-05, + "loss": 2.0534, + "step": 5519 + }, + { + "epoch": 0.2412692862450282, + "grad_norm": 2.265625, + "learning_rate": 8.634746407534292e-05, + "loss": 1.9767, + "step": 5520 + }, + { + "epoch": 0.2413129944490581, + "grad_norm": 2.0625, + "learning_rate": 8.634274707690458e-05, + "loss": 1.6981, + "step": 5521 + }, + { + "epoch": 0.24135670265308798, + "grad_norm": 2.234375, + "learning_rate": 8.63380293926257e-05, + "loss": 1.6489, + "step": 5522 + }, + { + "epoch": 0.2414004108571179, + "grad_norm": 2.234375, + "learning_rate": 8.633331102259532e-05, + "loss": 1.6282, + "step": 5523 + }, + { + "epoch": 0.24144411906114777, + "grad_norm": 1.921875, + "learning_rate": 8.632859196690249e-05, + "loss": 1.904, + "step": 5524 + }, + { + "epoch": 0.24148782726517767, + "grad_norm": 2.609375, + "learning_rate": 8.632387222563622e-05, + "loss": 2.4687, + "step": 5525 + }, + { + "epoch": 0.24153153546920758, + "grad_norm": 2.859375, + "learning_rate": 8.631915179888566e-05, + "loss": 2.9449, + "step": 5526 + }, + { + "epoch": 0.24157524367323746, + "grad_norm": 2.28125, + "learning_rate": 8.631443068673983e-05, + "loss": 1.87, + "step": 5527 + }, + { + "epoch": 0.24161895187726737, + "grad_norm": 1.9921875, + "learning_rate": 8.630970888928784e-05, + "loss": 1.7336, + "step": 5528 + }, + { + "epoch": 0.24166266008129725, + "grad_norm": 2.34375, + "learning_rate": 8.630498640661879e-05, + "loss": 2.7522, + "step": 5529 + }, + { + "epoch": 0.24170636828532716, + "grad_norm": 2.671875, + "learning_rate": 8.630026323882181e-05, + "loss": 1.2049, + "step": 5530 + }, + { + "epoch": 0.24175007648935706, + "grad_norm": 2.25, + "learning_rate": 8.629553938598603e-05, + "loss": 2.6773, + "step": 5531 + }, + { + "epoch": 0.24179378469338694, + "grad_norm": 2.203125, + "learning_rate": 8.629081484820058e-05, + "loss": 1.9445, + "step": 5532 + }, + { + "epoch": 0.24183749289741685, + "grad_norm": 2.09375, + "learning_rate": 8.628608962555467e-05, + "loss": 1.8289, + "step": 5533 + }, + { + "epoch": 0.24188120110144673, + "grad_norm": 1.8359375, + "learning_rate": 8.62813637181374e-05, + "loss": 1.6484, + "step": 5534 + }, + { + "epoch": 0.24192490930547664, + "grad_norm": 3.359375, + "learning_rate": 8.627663712603802e-05, + "loss": 1.7932, + "step": 5535 + }, + { + "epoch": 0.24196861750950655, + "grad_norm": 2.015625, + "learning_rate": 8.627190984934569e-05, + "loss": 2.0733, + "step": 5536 + }, + { + "epoch": 0.24201232571353642, + "grad_norm": 2.65625, + "learning_rate": 8.626718188814964e-05, + "loss": 2.6501, + "step": 5537 + }, + { + "epoch": 0.24205603391756633, + "grad_norm": 2.203125, + "learning_rate": 8.626245324253906e-05, + "loss": 1.7905, + "step": 5538 + }, + { + "epoch": 0.2420997421215962, + "grad_norm": 2.234375, + "learning_rate": 8.625772391260323e-05, + "loss": 1.7842, + "step": 5539 + }, + { + "epoch": 0.24214345032562612, + "grad_norm": 3.640625, + "learning_rate": 8.625299389843137e-05, + "loss": 3.0661, + "step": 5540 + }, + { + "epoch": 0.24218715852965603, + "grad_norm": 2.25, + "learning_rate": 8.624826320011275e-05, + "loss": 2.2763, + "step": 5541 + }, + { + "epoch": 0.2422308667336859, + "grad_norm": 2.734375, + "learning_rate": 8.624353181773664e-05, + "loss": 1.3571, + "step": 5542 + }, + { + "epoch": 0.24227457493771581, + "grad_norm": 2.953125, + "learning_rate": 8.623879975139235e-05, + "loss": 1.9099, + "step": 5543 + }, + { + "epoch": 0.2423182831417457, + "grad_norm": 3.15625, + "learning_rate": 8.623406700116917e-05, + "loss": 2.2006, + "step": 5544 + }, + { + "epoch": 0.2423619913457756, + "grad_norm": 2.328125, + "learning_rate": 8.622933356715639e-05, + "loss": 1.6884, + "step": 5545 + }, + { + "epoch": 0.2424056995498055, + "grad_norm": 2.390625, + "learning_rate": 8.622459944944336e-05, + "loss": 2.199, + "step": 5546 + }, + { + "epoch": 0.2424494077538354, + "grad_norm": 2.46875, + "learning_rate": 8.621986464811943e-05, + "loss": 1.8362, + "step": 5547 + }, + { + "epoch": 0.2424931159578653, + "grad_norm": 2.421875, + "learning_rate": 8.621512916327394e-05, + "loss": 2.5015, + "step": 5548 + }, + { + "epoch": 0.24253682416189518, + "grad_norm": 2.4375, + "learning_rate": 8.621039299499624e-05, + "loss": 2.1437, + "step": 5549 + }, + { + "epoch": 0.24258053236592508, + "grad_norm": 2.078125, + "learning_rate": 8.620565614337572e-05, + "loss": 1.5535, + "step": 5550 + }, + { + "epoch": 0.242624240569955, + "grad_norm": 2.015625, + "learning_rate": 8.620091860850177e-05, + "loss": 1.5493, + "step": 5551 + }, + { + "epoch": 0.24266794877398487, + "grad_norm": 1.9453125, + "learning_rate": 8.619618039046382e-05, + "loss": 1.6928, + "step": 5552 + }, + { + "epoch": 0.24271165697801478, + "grad_norm": 2.03125, + "learning_rate": 8.619144148935125e-05, + "loss": 2.0614, + "step": 5553 + }, + { + "epoch": 0.24275536518204466, + "grad_norm": 2.5625, + "learning_rate": 8.618670190525352e-05, + "loss": 1.4767, + "step": 5554 + }, + { + "epoch": 0.24279907338607457, + "grad_norm": 3.296875, + "learning_rate": 8.618196163826005e-05, + "loss": 3.169, + "step": 5555 + }, + { + "epoch": 0.24284278159010447, + "grad_norm": 2.203125, + "learning_rate": 8.617722068846028e-05, + "loss": 1.8736, + "step": 5556 + }, + { + "epoch": 0.24288648979413435, + "grad_norm": 2.3125, + "learning_rate": 8.617247905594372e-05, + "loss": 2.5414, + "step": 5557 + }, + { + "epoch": 0.24293019799816426, + "grad_norm": 2.671875, + "learning_rate": 8.616773674079985e-05, + "loss": 2.1125, + "step": 5558 + }, + { + "epoch": 0.24297390620219414, + "grad_norm": 3.171875, + "learning_rate": 8.616299374311814e-05, + "loss": 2.6338, + "step": 5559 + }, + { + "epoch": 0.24301761440622405, + "grad_norm": 2.125, + "learning_rate": 8.615825006298808e-05, + "loss": 1.9468, + "step": 5560 + }, + { + "epoch": 0.24306132261025395, + "grad_norm": 2.28125, + "learning_rate": 8.615350570049924e-05, + "loss": 1.7916, + "step": 5561 + }, + { + "epoch": 0.24310503081428383, + "grad_norm": 2.171875, + "learning_rate": 8.614876065574113e-05, + "loss": 1.6609, + "step": 5562 + }, + { + "epoch": 0.24314873901831374, + "grad_norm": 2.0625, + "learning_rate": 8.614401492880329e-05, + "loss": 2.25, + "step": 5563 + }, + { + "epoch": 0.24319244722234362, + "grad_norm": 2.1875, + "learning_rate": 8.613926851977527e-05, + "loss": 1.9206, + "step": 5564 + }, + { + "epoch": 0.24323615542637353, + "grad_norm": 2.625, + "learning_rate": 8.613452142874667e-05, + "loss": 2.1226, + "step": 5565 + }, + { + "epoch": 0.24327986363040344, + "grad_norm": 2.265625, + "learning_rate": 8.612977365580705e-05, + "loss": 1.3526, + "step": 5566 + }, + { + "epoch": 0.24332357183443332, + "grad_norm": 2.671875, + "learning_rate": 8.612502520104602e-05, + "loss": 2.4662, + "step": 5567 + }, + { + "epoch": 0.24336728003846322, + "grad_norm": 2.140625, + "learning_rate": 8.61202760645532e-05, + "loss": 1.9605, + "step": 5568 + }, + { + "epoch": 0.2434109882424931, + "grad_norm": 2.296875, + "learning_rate": 8.611552624641818e-05, + "loss": 1.9366, + "step": 5569 + }, + { + "epoch": 0.243454696446523, + "grad_norm": 2.46875, + "learning_rate": 8.611077574673063e-05, + "loss": 2.3338, + "step": 5570 + }, + { + "epoch": 0.24349840465055292, + "grad_norm": 2.796875, + "learning_rate": 8.610602456558018e-05, + "loss": 2.3373, + "step": 5571 + }, + { + "epoch": 0.2435421128545828, + "grad_norm": 2.203125, + "learning_rate": 8.61012727030565e-05, + "loss": 2.1172, + "step": 5572 + }, + { + "epoch": 0.2435858210586127, + "grad_norm": 2.390625, + "learning_rate": 8.609652015924925e-05, + "loss": 1.8434, + "step": 5573 + }, + { + "epoch": 0.24362952926264259, + "grad_norm": 2.15625, + "learning_rate": 8.609176693424814e-05, + "loss": 2.0251, + "step": 5574 + }, + { + "epoch": 0.2436732374666725, + "grad_norm": 2.28125, + "learning_rate": 8.608701302814286e-05, + "loss": 1.6486, + "step": 5575 + }, + { + "epoch": 0.2437169456707024, + "grad_norm": 3.765625, + "learning_rate": 8.60822584410231e-05, + "loss": 1.7621, + "step": 5576 + }, + { + "epoch": 0.24376065387473228, + "grad_norm": 2.65625, + "learning_rate": 8.607750317297863e-05, + "loss": 1.6235, + "step": 5577 + }, + { + "epoch": 0.2438043620787622, + "grad_norm": 2.140625, + "learning_rate": 8.607274722409918e-05, + "loss": 2.2202, + "step": 5578 + }, + { + "epoch": 0.24384807028279207, + "grad_norm": 1.9765625, + "learning_rate": 8.606799059447445e-05, + "loss": 1.8622, + "step": 5579 + }, + { + "epoch": 0.24389177848682198, + "grad_norm": 1.9296875, + "learning_rate": 8.606323328419428e-05, + "loss": 1.903, + "step": 5580 + }, + { + "epoch": 0.24393548669085188, + "grad_norm": 2.234375, + "learning_rate": 8.605847529334839e-05, + "loss": 2.0046, + "step": 5581 + }, + { + "epoch": 0.24397919489488176, + "grad_norm": 3.609375, + "learning_rate": 8.60537166220266e-05, + "loss": 1.0194, + "step": 5582 + }, + { + "epoch": 0.24402290309891167, + "grad_norm": 3.671875, + "learning_rate": 8.604895727031869e-05, + "loss": 2.1291, + "step": 5583 + }, + { + "epoch": 0.24406661130294155, + "grad_norm": 1.9765625, + "learning_rate": 8.60441972383145e-05, + "loss": 1.747, + "step": 5584 + }, + { + "epoch": 0.24411031950697146, + "grad_norm": 2.46875, + "learning_rate": 8.603943652610385e-05, + "loss": 2.0577, + "step": 5585 + }, + { + "epoch": 0.24415402771100136, + "grad_norm": 3.71875, + "learning_rate": 8.603467513377657e-05, + "loss": 1.9055, + "step": 5586 + }, + { + "epoch": 0.24419773591503124, + "grad_norm": 2.796875, + "learning_rate": 8.602991306142252e-05, + "loss": 2.5746, + "step": 5587 + }, + { + "epoch": 0.24424144411906115, + "grad_norm": 2.53125, + "learning_rate": 8.602515030913159e-05, + "loss": 2.2653, + "step": 5588 + }, + { + "epoch": 0.24428515232309103, + "grad_norm": 2.53125, + "learning_rate": 8.602038687699364e-05, + "loss": 2.5107, + "step": 5589 + }, + { + "epoch": 0.24432886052712094, + "grad_norm": 2.21875, + "learning_rate": 8.601562276509855e-05, + "loss": 1.8985, + "step": 5590 + }, + { + "epoch": 0.24437256873115085, + "grad_norm": 2.34375, + "learning_rate": 8.601085797353625e-05, + "loss": 1.976, + "step": 5591 + }, + { + "epoch": 0.24441627693518073, + "grad_norm": 2.328125, + "learning_rate": 8.600609250239664e-05, + "loss": 1.7231, + "step": 5592 + }, + { + "epoch": 0.24445998513921063, + "grad_norm": 2.15625, + "learning_rate": 8.600132635176967e-05, + "loss": 1.9487, + "step": 5593 + }, + { + "epoch": 0.2445036933432405, + "grad_norm": 1.9609375, + "learning_rate": 8.599655952174527e-05, + "loss": 2.0027, + "step": 5594 + }, + { + "epoch": 0.24454740154727042, + "grad_norm": 2.078125, + "learning_rate": 8.59917920124134e-05, + "loss": 1.7015, + "step": 5595 + }, + { + "epoch": 0.24459110975130033, + "grad_norm": 5.15625, + "learning_rate": 8.598702382386403e-05, + "loss": 2.4159, + "step": 5596 + }, + { + "epoch": 0.2446348179553302, + "grad_norm": 1.8359375, + "learning_rate": 8.598225495618715e-05, + "loss": 1.7125, + "step": 5597 + }, + { + "epoch": 0.24467852615936012, + "grad_norm": 2.265625, + "learning_rate": 8.597748540947274e-05, + "loss": 1.8233, + "step": 5598 + }, + { + "epoch": 0.24472223436339, + "grad_norm": 2.265625, + "learning_rate": 8.597271518381082e-05, + "loss": 1.7276, + "step": 5599 + }, + { + "epoch": 0.2447659425674199, + "grad_norm": 2.421875, + "learning_rate": 8.596794427929142e-05, + "loss": 2.5536, + "step": 5600 + }, + { + "epoch": 0.2448096507714498, + "grad_norm": 3.015625, + "learning_rate": 8.596317269600455e-05, + "loss": 2.0302, + "step": 5601 + }, + { + "epoch": 0.2448533589754797, + "grad_norm": 2.09375, + "learning_rate": 8.595840043404028e-05, + "loss": 1.6446, + "step": 5602 + }, + { + "epoch": 0.2448970671795096, + "grad_norm": 2.1875, + "learning_rate": 8.595362749348866e-05, + "loss": 2.3099, + "step": 5603 + }, + { + "epoch": 0.24494077538353948, + "grad_norm": 2.25, + "learning_rate": 8.594885387443974e-05, + "loss": 2.1294, + "step": 5604 + }, + { + "epoch": 0.24498448358756938, + "grad_norm": 2.1875, + "learning_rate": 8.594407957698365e-05, + "loss": 1.8185, + "step": 5605 + }, + { + "epoch": 0.2450281917915993, + "grad_norm": 2.0, + "learning_rate": 8.593930460121044e-05, + "loss": 2.2778, + "step": 5606 + }, + { + "epoch": 0.24507189999562917, + "grad_norm": 2.5625, + "learning_rate": 8.593452894721027e-05, + "loss": 1.0981, + "step": 5607 + }, + { + "epoch": 0.24511560819965908, + "grad_norm": 1.984375, + "learning_rate": 8.592975261507323e-05, + "loss": 1.7008, + "step": 5608 + }, + { + "epoch": 0.24515931640368896, + "grad_norm": 2.03125, + "learning_rate": 8.592497560488946e-05, + "loss": 1.9112, + "step": 5609 + }, + { + "epoch": 0.24520302460771887, + "grad_norm": 3.09375, + "learning_rate": 8.592019791674913e-05, + "loss": 1.4231, + "step": 5610 + }, + { + "epoch": 0.24524673281174877, + "grad_norm": 2.171875, + "learning_rate": 8.591541955074235e-05, + "loss": 2.1189, + "step": 5611 + }, + { + "epoch": 0.24529044101577865, + "grad_norm": 2.328125, + "learning_rate": 8.591064050695936e-05, + "loss": 2.0184, + "step": 5612 + }, + { + "epoch": 0.24533414921980856, + "grad_norm": 2.71875, + "learning_rate": 8.590586078549032e-05, + "loss": 2.7024, + "step": 5613 + }, + { + "epoch": 0.24537785742383844, + "grad_norm": 1.9921875, + "learning_rate": 8.590108038642541e-05, + "loss": 1.9898, + "step": 5614 + }, + { + "epoch": 0.24542156562786835, + "grad_norm": 2.09375, + "learning_rate": 8.589629930985487e-05, + "loss": 1.8514, + "step": 5615 + }, + { + "epoch": 0.24546527383189826, + "grad_norm": 2.046875, + "learning_rate": 8.589151755586892e-05, + "loss": 1.9361, + "step": 5616 + }, + { + "epoch": 0.24550898203592814, + "grad_norm": 2.0, + "learning_rate": 8.588673512455781e-05, + "loss": 1.7724, + "step": 5617 + }, + { + "epoch": 0.24555269023995804, + "grad_norm": 1.984375, + "learning_rate": 8.588195201601177e-05, + "loss": 1.6943, + "step": 5618 + }, + { + "epoch": 0.24559639844398792, + "grad_norm": 2.5625, + "learning_rate": 8.587716823032106e-05, + "loss": 1.6803, + "step": 5619 + }, + { + "epoch": 0.24564010664801783, + "grad_norm": 2.0, + "learning_rate": 8.587238376757597e-05, + "loss": 2.5287, + "step": 5620 + }, + { + "epoch": 0.24568381485204774, + "grad_norm": 1.9765625, + "learning_rate": 8.58675986278668e-05, + "loss": 2.0384, + "step": 5621 + }, + { + "epoch": 0.24572752305607762, + "grad_norm": 2.375, + "learning_rate": 8.586281281128383e-05, + "loss": 1.9584, + "step": 5622 + }, + { + "epoch": 0.24577123126010753, + "grad_norm": 2.390625, + "learning_rate": 8.585802631791739e-05, + "loss": 2.0015, + "step": 5623 + }, + { + "epoch": 0.2458149394641374, + "grad_norm": 1.9921875, + "learning_rate": 8.58532391478578e-05, + "loss": 2.0059, + "step": 5624 + }, + { + "epoch": 0.2458586476681673, + "grad_norm": 2.203125, + "learning_rate": 8.584845130119541e-05, + "loss": 1.6853, + "step": 5625 + }, + { + "epoch": 0.24590235587219722, + "grad_norm": 2.109375, + "learning_rate": 8.584366277802057e-05, + "loss": 1.7735, + "step": 5626 + }, + { + "epoch": 0.2459460640762271, + "grad_norm": 2.34375, + "learning_rate": 8.583887357842364e-05, + "loss": 2.4116, + "step": 5627 + }, + { + "epoch": 0.245989772280257, + "grad_norm": 2.765625, + "learning_rate": 8.583408370249501e-05, + "loss": 1.9734, + "step": 5628 + }, + { + "epoch": 0.2460334804842869, + "grad_norm": 1.9453125, + "learning_rate": 8.582929315032507e-05, + "loss": 1.5228, + "step": 5629 + }, + { + "epoch": 0.2460771886883168, + "grad_norm": 1.8984375, + "learning_rate": 8.582450192200421e-05, + "loss": 1.5607, + "step": 5630 + }, + { + "epoch": 0.2461208968923467, + "grad_norm": 2.796875, + "learning_rate": 8.581971001762286e-05, + "loss": 2.2077, + "step": 5631 + }, + { + "epoch": 0.24616460509637658, + "grad_norm": 2.203125, + "learning_rate": 8.581491743727146e-05, + "loss": 1.8515, + "step": 5632 + }, + { + "epoch": 0.2462083133004065, + "grad_norm": 2.015625, + "learning_rate": 8.581012418104044e-05, + "loss": 1.7113, + "step": 5633 + }, + { + "epoch": 0.24625202150443637, + "grad_norm": 2.59375, + "learning_rate": 8.580533024902024e-05, + "loss": 1.9744, + "step": 5634 + }, + { + "epoch": 0.24629572970846628, + "grad_norm": 2.375, + "learning_rate": 8.580053564130137e-05, + "loss": 2.5002, + "step": 5635 + }, + { + "epoch": 0.24633943791249618, + "grad_norm": 2.09375, + "learning_rate": 8.579574035797427e-05, + "loss": 2.1414, + "step": 5636 + }, + { + "epoch": 0.24638314611652606, + "grad_norm": 2.40625, + "learning_rate": 8.579094439912946e-05, + "loss": 1.9267, + "step": 5637 + }, + { + "epoch": 0.24642685432055597, + "grad_norm": 2.25, + "learning_rate": 8.578614776485743e-05, + "loss": 1.9097, + "step": 5638 + }, + { + "epoch": 0.24647056252458585, + "grad_norm": 2.078125, + "learning_rate": 8.578135045524872e-05, + "loss": 2.1637, + "step": 5639 + }, + { + "epoch": 0.24651427072861576, + "grad_norm": 2.046875, + "learning_rate": 8.577655247039384e-05, + "loss": 2.1543, + "step": 5640 + }, + { + "epoch": 0.24655797893264567, + "grad_norm": 2.390625, + "learning_rate": 8.577175381038337e-05, + "loss": 2.2151, + "step": 5641 + }, + { + "epoch": 0.24660168713667555, + "grad_norm": 2.015625, + "learning_rate": 8.576695447530781e-05, + "loss": 1.4585, + "step": 5642 + }, + { + "epoch": 0.24664539534070545, + "grad_norm": 1.9921875, + "learning_rate": 8.576215446525776e-05, + "loss": 2.1276, + "step": 5643 + }, + { + "epoch": 0.24668910354473533, + "grad_norm": 2.734375, + "learning_rate": 8.575735378032382e-05, + "loss": 1.7072, + "step": 5644 + }, + { + "epoch": 0.24673281174876524, + "grad_norm": 1.8671875, + "learning_rate": 8.575255242059656e-05, + "loss": 1.6291, + "step": 5645 + }, + { + "epoch": 0.24677651995279515, + "grad_norm": 2.515625, + "learning_rate": 8.574775038616662e-05, + "loss": 2.2993, + "step": 5646 + }, + { + "epoch": 0.24682022815682503, + "grad_norm": 2.0625, + "learning_rate": 8.574294767712458e-05, + "loss": 1.3124, + "step": 5647 + }, + { + "epoch": 0.24686393636085494, + "grad_norm": 2.296875, + "learning_rate": 8.573814429356113e-05, + "loss": 1.6663, + "step": 5648 + }, + { + "epoch": 0.24690764456488482, + "grad_norm": 2.125, + "learning_rate": 8.573334023556685e-05, + "loss": 1.9524, + "step": 5649 + }, + { + "epoch": 0.24695135276891472, + "grad_norm": 1.8125, + "learning_rate": 8.572853550323245e-05, + "loss": 1.5047, + "step": 5650 + }, + { + "epoch": 0.24699506097294463, + "grad_norm": 2.1875, + "learning_rate": 8.572373009664858e-05, + "loss": 2.7538, + "step": 5651 + }, + { + "epoch": 0.2470387691769745, + "grad_norm": 2.203125, + "learning_rate": 8.571892401590591e-05, + "loss": 1.896, + "step": 5652 + }, + { + "epoch": 0.24708247738100442, + "grad_norm": 2.015625, + "learning_rate": 8.571411726109519e-05, + "loss": 1.6372, + "step": 5653 + }, + { + "epoch": 0.24712618558503432, + "grad_norm": 2.015625, + "learning_rate": 8.570930983230707e-05, + "loss": 1.9993, + "step": 5654 + }, + { + "epoch": 0.2471698937890642, + "grad_norm": 2.0625, + "learning_rate": 8.570450172963232e-05, + "loss": 1.7876, + "step": 5655 + }, + { + "epoch": 0.2472136019930941, + "grad_norm": 2.40625, + "learning_rate": 8.569969295316164e-05, + "loss": 1.8467, + "step": 5656 + }, + { + "epoch": 0.247257310197124, + "grad_norm": 2.28125, + "learning_rate": 8.569488350298583e-05, + "loss": 2.269, + "step": 5657 + }, + { + "epoch": 0.2473010184011539, + "grad_norm": 2.640625, + "learning_rate": 8.569007337919558e-05, + "loss": 1.9019, + "step": 5658 + }, + { + "epoch": 0.2473447266051838, + "grad_norm": 2.171875, + "learning_rate": 8.568526258188172e-05, + "loss": 1.8272, + "step": 5659 + }, + { + "epoch": 0.2473884348092137, + "grad_norm": 2.734375, + "learning_rate": 8.568045111113501e-05, + "loss": 1.8297, + "step": 5660 + }, + { + "epoch": 0.2474321430132436, + "grad_norm": 2.34375, + "learning_rate": 8.567563896704626e-05, + "loss": 2.0034, + "step": 5661 + }, + { + "epoch": 0.24747585121727347, + "grad_norm": 2.1875, + "learning_rate": 8.567082614970627e-05, + "loss": 1.7031, + "step": 5662 + }, + { + "epoch": 0.24751955942130338, + "grad_norm": 2.546875, + "learning_rate": 8.566601265920588e-05, + "loss": 1.8154, + "step": 5663 + }, + { + "epoch": 0.2475632676253333, + "grad_norm": 2.328125, + "learning_rate": 8.566119849563594e-05, + "loss": 1.7957, + "step": 5664 + }, + { + "epoch": 0.24760697582936317, + "grad_norm": 2.90625, + "learning_rate": 8.565638365908726e-05, + "loss": 2.5451, + "step": 5665 + }, + { + "epoch": 0.24765068403339308, + "grad_norm": 2.171875, + "learning_rate": 8.565156814965074e-05, + "loss": 1.937, + "step": 5666 + }, + { + "epoch": 0.24769439223742296, + "grad_norm": 2.140625, + "learning_rate": 8.564675196741722e-05, + "loss": 1.8272, + "step": 5667 + }, + { + "epoch": 0.24773810044145286, + "grad_norm": 2.09375, + "learning_rate": 8.564193511247762e-05, + "loss": 1.7934, + "step": 5668 + }, + { + "epoch": 0.24778180864548277, + "grad_norm": 1.9921875, + "learning_rate": 8.563711758492283e-05, + "loss": 2.6983, + "step": 5669 + }, + { + "epoch": 0.24782551684951265, + "grad_norm": 2.09375, + "learning_rate": 8.563229938484376e-05, + "loss": 1.7599, + "step": 5670 + }, + { + "epoch": 0.24786922505354256, + "grad_norm": 2.03125, + "learning_rate": 8.562748051233135e-05, + "loss": 1.7215, + "step": 5671 + }, + { + "epoch": 0.24791293325757244, + "grad_norm": 2.3125, + "learning_rate": 8.562266096747653e-05, + "loss": 2.0875, + "step": 5672 + }, + { + "epoch": 0.24795664146160235, + "grad_norm": 2.625, + "learning_rate": 8.561784075037023e-05, + "loss": 2.0816, + "step": 5673 + }, + { + "epoch": 0.24800034966563225, + "grad_norm": 2.234375, + "learning_rate": 8.561301986110347e-05, + "loss": 1.7168, + "step": 5674 + }, + { + "epoch": 0.24804405786966213, + "grad_norm": 1.890625, + "learning_rate": 8.560819829976716e-05, + "loss": 1.9503, + "step": 5675 + }, + { + "epoch": 0.24808776607369204, + "grad_norm": 2.34375, + "learning_rate": 8.560337606645234e-05, + "loss": 2.4694, + "step": 5676 + }, + { + "epoch": 0.24813147427772192, + "grad_norm": 1.8515625, + "learning_rate": 8.559855316124999e-05, + "loss": 2.046, + "step": 5677 + }, + { + "epoch": 0.24817518248175183, + "grad_norm": 1.8515625, + "learning_rate": 8.559372958425113e-05, + "loss": 2.6653, + "step": 5678 + }, + { + "epoch": 0.24821889068578173, + "grad_norm": 2.1875, + "learning_rate": 8.55889053355468e-05, + "loss": 1.7663, + "step": 5679 + }, + { + "epoch": 0.24826259888981161, + "grad_norm": 2.578125, + "learning_rate": 8.558408041522801e-05, + "loss": 2.7306, + "step": 5680 + }, + { + "epoch": 0.24830630709384152, + "grad_norm": 2.40625, + "learning_rate": 8.557925482338585e-05, + "loss": 1.6388, + "step": 5681 + }, + { + "epoch": 0.2483500152978714, + "grad_norm": 1.9921875, + "learning_rate": 8.557442856011135e-05, + "loss": 1.9075, + "step": 5682 + }, + { + "epoch": 0.2483937235019013, + "grad_norm": 2.0, + "learning_rate": 8.556960162549563e-05, + "loss": 1.7638, + "step": 5683 + }, + { + "epoch": 0.24843743170593122, + "grad_norm": 2.390625, + "learning_rate": 8.556477401962974e-05, + "loss": 1.8392, + "step": 5684 + }, + { + "epoch": 0.2484811399099611, + "grad_norm": 2.15625, + "learning_rate": 8.55599457426048e-05, + "loss": 1.5737, + "step": 5685 + }, + { + "epoch": 0.248524848113991, + "grad_norm": 2.3125, + "learning_rate": 8.555511679451196e-05, + "loss": 2.7735, + "step": 5686 + }, + { + "epoch": 0.24856855631802088, + "grad_norm": 1.9765625, + "learning_rate": 8.555028717544227e-05, + "loss": 1.7439, + "step": 5687 + }, + { + "epoch": 0.2486122645220508, + "grad_norm": 3.015625, + "learning_rate": 8.554545688548696e-05, + "loss": 2.0171, + "step": 5688 + }, + { + "epoch": 0.2486559727260807, + "grad_norm": 2.203125, + "learning_rate": 8.554062592473713e-05, + "loss": 2.0584, + "step": 5689 + }, + { + "epoch": 0.24869968093011058, + "grad_norm": 2.640625, + "learning_rate": 8.553579429328395e-05, + "loss": 2.2466, + "step": 5690 + }, + { + "epoch": 0.24874338913414049, + "grad_norm": 2.4375, + "learning_rate": 8.553096199121863e-05, + "loss": 2.3407, + "step": 5691 + }, + { + "epoch": 0.24878709733817037, + "grad_norm": 2.453125, + "learning_rate": 8.552612901863233e-05, + "loss": 2.0368, + "step": 5692 + }, + { + "epoch": 0.24883080554220027, + "grad_norm": 1.96875, + "learning_rate": 8.552129537561628e-05, + "loss": 1.7908, + "step": 5693 + }, + { + "epoch": 0.24887451374623018, + "grad_norm": 3.171875, + "learning_rate": 8.551646106226169e-05, + "loss": 1.6307, + "step": 5694 + }, + { + "epoch": 0.24891822195026006, + "grad_norm": 2.140625, + "learning_rate": 8.55116260786598e-05, + "loss": 1.781, + "step": 5695 + }, + { + "epoch": 0.24896193015428997, + "grad_norm": 3.65625, + "learning_rate": 8.550679042490182e-05, + "loss": 2.1163, + "step": 5696 + }, + { + "epoch": 0.24900563835831985, + "grad_norm": 2.03125, + "learning_rate": 8.550195410107902e-05, + "loss": 1.4579, + "step": 5697 + }, + { + "epoch": 0.24904934656234975, + "grad_norm": 2.0, + "learning_rate": 8.549711710728269e-05, + "loss": 1.4266, + "step": 5698 + }, + { + "epoch": 0.24909305476637966, + "grad_norm": 2.078125, + "learning_rate": 8.54922794436041e-05, + "loss": 1.961, + "step": 5699 + }, + { + "epoch": 0.24913676297040954, + "grad_norm": 2.515625, + "learning_rate": 8.548744111013452e-05, + "loss": 2.7949, + "step": 5700 + }, + { + "epoch": 0.24918047117443945, + "grad_norm": 2.34375, + "learning_rate": 8.54826021069653e-05, + "loss": 2.0042, + "step": 5701 + }, + { + "epoch": 0.24922417937846933, + "grad_norm": 2.140625, + "learning_rate": 8.547776243418772e-05, + "loss": 1.7539, + "step": 5702 + }, + { + "epoch": 0.24926788758249924, + "grad_norm": 2.4375, + "learning_rate": 8.547292209189314e-05, + "loss": 2.0249, + "step": 5703 + }, + { + "epoch": 0.24931159578652914, + "grad_norm": 2.3125, + "learning_rate": 8.546808108017288e-05, + "loss": 2.0008, + "step": 5704 + }, + { + "epoch": 0.24935530399055902, + "grad_norm": 2.5, + "learning_rate": 8.54632393991183e-05, + "loss": 2.4508, + "step": 5705 + }, + { + "epoch": 0.24939901219458893, + "grad_norm": 2.25, + "learning_rate": 8.545839704882082e-05, + "loss": 2.152, + "step": 5706 + }, + { + "epoch": 0.2494427203986188, + "grad_norm": 2.3125, + "learning_rate": 8.545355402937175e-05, + "loss": 2.0039, + "step": 5707 + }, + { + "epoch": 0.24948642860264872, + "grad_norm": 3.46875, + "learning_rate": 8.54487103408625e-05, + "loss": 2.6012, + "step": 5708 + }, + { + "epoch": 0.24953013680667863, + "grad_norm": 2.34375, + "learning_rate": 8.544386598338452e-05, + "loss": 1.8128, + "step": 5709 + }, + { + "epoch": 0.2495738450107085, + "grad_norm": 2.171875, + "learning_rate": 8.54390209570292e-05, + "loss": 2.1566, + "step": 5710 + }, + { + "epoch": 0.2496175532147384, + "grad_norm": 2.125, + "learning_rate": 8.543417526188797e-05, + "loss": 1.9152, + "step": 5711 + }, + { + "epoch": 0.2496612614187683, + "grad_norm": 1.9921875, + "learning_rate": 8.542932889805226e-05, + "loss": 1.7658, + "step": 5712 + }, + { + "epoch": 0.2497049696227982, + "grad_norm": 2.46875, + "learning_rate": 8.542448186561359e-05, + "loss": 2.0908, + "step": 5713 + }, + { + "epoch": 0.2497486778268281, + "grad_norm": 2.421875, + "learning_rate": 8.541963416466336e-05, + "loss": 1.8004, + "step": 5714 + }, + { + "epoch": 0.249792386030858, + "grad_norm": 1.8359375, + "learning_rate": 8.541478579529308e-05, + "loss": 1.5534, + "step": 5715 + }, + { + "epoch": 0.2498360942348879, + "grad_norm": 2.296875, + "learning_rate": 8.540993675759427e-05, + "loss": 1.7388, + "step": 5716 + }, + { + "epoch": 0.24987980243891778, + "grad_norm": 1.9609375, + "learning_rate": 8.540508705165839e-05, + "loss": 1.371, + "step": 5717 + }, + { + "epoch": 0.24992351064294768, + "grad_norm": 1.7734375, + "learning_rate": 8.540023667757702e-05, + "loss": 1.7817, + "step": 5718 + }, + { + "epoch": 0.2499672188469776, + "grad_norm": 3.34375, + "learning_rate": 8.539538563544163e-05, + "loss": 1.8297, + "step": 5719 + }, + { + "epoch": 0.25001092705100747, + "grad_norm": 2.484375, + "learning_rate": 8.539053392534382e-05, + "loss": 2.3024, + "step": 5720 + }, + { + "epoch": 0.25005463525503735, + "grad_norm": 2.015625, + "learning_rate": 8.538568154737512e-05, + "loss": 1.5528, + "step": 5721 + }, + { + "epoch": 0.2500983434590673, + "grad_norm": 1.8984375, + "learning_rate": 8.53808285016271e-05, + "loss": 1.8575, + "step": 5722 + }, + { + "epoch": 0.25014205166309716, + "grad_norm": 2.140625, + "learning_rate": 8.537597478819136e-05, + "loss": 1.7515, + "step": 5723 + }, + { + "epoch": 0.25018575986712704, + "grad_norm": 2.734375, + "learning_rate": 8.537112040715948e-05, + "loss": 1.4989, + "step": 5724 + }, + { + "epoch": 0.250229468071157, + "grad_norm": 16.25, + "learning_rate": 8.536626535862309e-05, + "loss": 5.8871, + "step": 5725 + }, + { + "epoch": 0.25027317627518686, + "grad_norm": 2.09375, + "learning_rate": 8.53614096426738e-05, + "loss": 1.7411, + "step": 5726 + }, + { + "epoch": 0.25031688447921674, + "grad_norm": 2.140625, + "learning_rate": 8.535655325940324e-05, + "loss": 2.1619, + "step": 5727 + }, + { + "epoch": 0.2503605926832466, + "grad_norm": 1.8828125, + "learning_rate": 8.535169620890306e-05, + "loss": 1.8591, + "step": 5728 + }, + { + "epoch": 0.25040430088727655, + "grad_norm": 2.3125, + "learning_rate": 8.53468384912649e-05, + "loss": 1.9312, + "step": 5729 + }, + { + "epoch": 0.25044800909130643, + "grad_norm": 2.921875, + "learning_rate": 8.534198010658047e-05, + "loss": 2.3886, + "step": 5730 + }, + { + "epoch": 0.2504917172953363, + "grad_norm": 1.984375, + "learning_rate": 8.533712105494145e-05, + "loss": 1.7222, + "step": 5731 + }, + { + "epoch": 0.25053542549936625, + "grad_norm": 2.359375, + "learning_rate": 8.533226133643951e-05, + "loss": 1.8809, + "step": 5732 + }, + { + "epoch": 0.25057913370339613, + "grad_norm": 2.640625, + "learning_rate": 8.532740095116638e-05, + "loss": 1.8495, + "step": 5733 + }, + { + "epoch": 0.250622841907426, + "grad_norm": 2.796875, + "learning_rate": 8.532253989921378e-05, + "loss": 1.7284, + "step": 5734 + }, + { + "epoch": 0.25066655011145594, + "grad_norm": 2.25, + "learning_rate": 8.531767818067343e-05, + "loss": 1.6024, + "step": 5735 + }, + { + "epoch": 0.2507102583154858, + "grad_norm": 2.546875, + "learning_rate": 8.53128157956371e-05, + "loss": 1.7684, + "step": 5736 + }, + { + "epoch": 0.2507539665195157, + "grad_norm": 2.9375, + "learning_rate": 8.530795274419654e-05, + "loss": 2.55, + "step": 5737 + }, + { + "epoch": 0.2507976747235456, + "grad_norm": 2.625, + "learning_rate": 8.530308902644353e-05, + "loss": 2.0154, + "step": 5738 + }, + { + "epoch": 0.2508413829275755, + "grad_norm": 1.953125, + "learning_rate": 8.529822464246984e-05, + "loss": 1.9612, + "step": 5739 + }, + { + "epoch": 0.2508850911316054, + "grad_norm": 2.3125, + "learning_rate": 8.529335959236729e-05, + "loss": 1.4551, + "step": 5740 + }, + { + "epoch": 0.2509287993356353, + "grad_norm": 2.84375, + "learning_rate": 8.528849387622766e-05, + "loss": 1.9448, + "step": 5741 + }, + { + "epoch": 0.2509725075396652, + "grad_norm": 3.078125, + "learning_rate": 8.528362749414279e-05, + "loss": 2.8076, + "step": 5742 + }, + { + "epoch": 0.2510162157436951, + "grad_norm": 2.453125, + "learning_rate": 8.527876044620453e-05, + "loss": 2.5786, + "step": 5743 + }, + { + "epoch": 0.25105992394772497, + "grad_norm": 2.203125, + "learning_rate": 8.52738927325047e-05, + "loss": 1.8251, + "step": 5744 + }, + { + "epoch": 0.2511036321517549, + "grad_norm": 2.6875, + "learning_rate": 8.526902435313519e-05, + "loss": 2.0876, + "step": 5745 + }, + { + "epoch": 0.2511473403557848, + "grad_norm": 2.875, + "learning_rate": 8.526415530818785e-05, + "loss": 1.8256, + "step": 5746 + }, + { + "epoch": 0.25119104855981467, + "grad_norm": 2.875, + "learning_rate": 8.525928559775458e-05, + "loss": 1.682, + "step": 5747 + }, + { + "epoch": 0.25123475676384455, + "grad_norm": 2.78125, + "learning_rate": 8.525441522192727e-05, + "loss": 1.7598, + "step": 5748 + }, + { + "epoch": 0.2512784649678745, + "grad_norm": 2.90625, + "learning_rate": 8.524954418079782e-05, + "loss": 1.87, + "step": 5749 + }, + { + "epoch": 0.25132217317190436, + "grad_norm": 2.515625, + "learning_rate": 8.52446724744582e-05, + "loss": 2.6512, + "step": 5750 + }, + { + "epoch": 0.25136588137593424, + "grad_norm": 2.859375, + "learning_rate": 8.52398001030003e-05, + "loss": 2.5546, + "step": 5751 + }, + { + "epoch": 0.2514095895799642, + "grad_norm": 1.8828125, + "learning_rate": 8.523492706651607e-05, + "loss": 1.8665, + "step": 5752 + }, + { + "epoch": 0.25145329778399406, + "grad_norm": 2.140625, + "learning_rate": 8.52300533650975e-05, + "loss": 1.7692, + "step": 5753 + }, + { + "epoch": 0.25149700598802394, + "grad_norm": 2.0, + "learning_rate": 8.522517899883654e-05, + "loss": 1.8136, + "step": 5754 + }, + { + "epoch": 0.25154071419205387, + "grad_norm": 2.28125, + "learning_rate": 8.522030396782518e-05, + "loss": 1.8562, + "step": 5755 + }, + { + "epoch": 0.25158442239608375, + "grad_norm": 4.71875, + "learning_rate": 8.521542827215544e-05, + "loss": 1.8953, + "step": 5756 + }, + { + "epoch": 0.25162813060011363, + "grad_norm": 2.765625, + "learning_rate": 8.52105519119193e-05, + "loss": 1.6913, + "step": 5757 + }, + { + "epoch": 0.2516718388041435, + "grad_norm": 1.9140625, + "learning_rate": 8.52056748872088e-05, + "loss": 1.3204, + "step": 5758 + }, + { + "epoch": 0.25171554700817345, + "grad_norm": 2.171875, + "learning_rate": 8.520079719811598e-05, + "loss": 2.2793, + "step": 5759 + }, + { + "epoch": 0.2517592552122033, + "grad_norm": 2.015625, + "learning_rate": 8.519591884473288e-05, + "loss": 1.7225, + "step": 5760 + }, + { + "epoch": 0.2518029634162332, + "grad_norm": 2.390625, + "learning_rate": 8.519103982715158e-05, + "loss": 2.0289, + "step": 5761 + }, + { + "epoch": 0.25184667162026314, + "grad_norm": 2.15625, + "learning_rate": 8.518616014546413e-05, + "loss": 1.7984, + "step": 5762 + }, + { + "epoch": 0.251890379824293, + "grad_norm": 2.28125, + "learning_rate": 8.518127979976262e-05, + "loss": 1.9239, + "step": 5763 + }, + { + "epoch": 0.2519340880283229, + "grad_norm": 6.21875, + "learning_rate": 8.517639879013916e-05, + "loss": 1.6648, + "step": 5764 + }, + { + "epoch": 0.25197779623235284, + "grad_norm": 2.1875, + "learning_rate": 8.517151711668587e-05, + "loss": 1.3681, + "step": 5765 + }, + { + "epoch": 0.2520215044363827, + "grad_norm": 2.234375, + "learning_rate": 8.516663477949485e-05, + "loss": 1.8568, + "step": 5766 + }, + { + "epoch": 0.2520652126404126, + "grad_norm": 2.8125, + "learning_rate": 8.516175177865827e-05, + "loss": 1.7729, + "step": 5767 + }, + { + "epoch": 0.2521089208444425, + "grad_norm": 3.5, + "learning_rate": 8.515686811426824e-05, + "loss": 1.5158, + "step": 5768 + }, + { + "epoch": 0.2521526290484724, + "grad_norm": 2.796875, + "learning_rate": 8.515198378641694e-05, + "loss": 2.2034, + "step": 5769 + }, + { + "epoch": 0.2521963372525023, + "grad_norm": 2.234375, + "learning_rate": 8.514709879519653e-05, + "loss": 1.7099, + "step": 5770 + }, + { + "epoch": 0.25224004545653217, + "grad_norm": 1.9609375, + "learning_rate": 8.514221314069923e-05, + "loss": 2.1182, + "step": 5771 + }, + { + "epoch": 0.2522837536605621, + "grad_norm": 2.625, + "learning_rate": 8.513732682301723e-05, + "loss": 2.1552, + "step": 5772 + }, + { + "epoch": 0.252327461864592, + "grad_norm": 2.5, + "learning_rate": 8.513243984224273e-05, + "loss": 2.3363, + "step": 5773 + }, + { + "epoch": 0.25237117006862186, + "grad_norm": 2.125, + "learning_rate": 8.512755219846793e-05, + "loss": 1.8566, + "step": 5774 + }, + { + "epoch": 0.2524148782726518, + "grad_norm": 1.9375, + "learning_rate": 8.512266389178511e-05, + "loss": 1.7914, + "step": 5775 + }, + { + "epoch": 0.2524585864766817, + "grad_norm": 1.9765625, + "learning_rate": 8.511777492228651e-05, + "loss": 1.6455, + "step": 5776 + }, + { + "epoch": 0.25250229468071156, + "grad_norm": 2.125, + "learning_rate": 8.51128852900644e-05, + "loss": 1.6525, + "step": 5777 + }, + { + "epoch": 0.25254600288474144, + "grad_norm": 3.078125, + "learning_rate": 8.510799499521103e-05, + "loss": 2.3558, + "step": 5778 + }, + { + "epoch": 0.2525897110887714, + "grad_norm": 2.375, + "learning_rate": 8.510310403781867e-05, + "loss": 1.8693, + "step": 5779 + }, + { + "epoch": 0.25263341929280125, + "grad_norm": 2.390625, + "learning_rate": 8.509821241797967e-05, + "loss": 1.9012, + "step": 5780 + }, + { + "epoch": 0.25267712749683113, + "grad_norm": 1.984375, + "learning_rate": 8.509332013578632e-05, + "loss": 1.8172, + "step": 5781 + }, + { + "epoch": 0.25272083570086107, + "grad_norm": 2.71875, + "learning_rate": 8.508842719133094e-05, + "loss": 1.3929, + "step": 5782 + }, + { + "epoch": 0.25276454390489095, + "grad_norm": 2.78125, + "learning_rate": 8.508353358470586e-05, + "loss": 2.3934, + "step": 5783 + }, + { + "epoch": 0.25280825210892083, + "grad_norm": 2.203125, + "learning_rate": 8.507863931600346e-05, + "loss": 1.8745, + "step": 5784 + }, + { + "epoch": 0.25285196031295076, + "grad_norm": 1.859375, + "learning_rate": 8.507374438531607e-05, + "loss": 1.8801, + "step": 5785 + }, + { + "epoch": 0.25289566851698064, + "grad_norm": 1.984375, + "learning_rate": 8.506884879273608e-05, + "loss": 1.8671, + "step": 5786 + }, + { + "epoch": 0.2529393767210105, + "grad_norm": 2.234375, + "learning_rate": 8.506395253835586e-05, + "loss": 1.6834, + "step": 5787 + }, + { + "epoch": 0.2529830849250404, + "grad_norm": 3.6875, + "learning_rate": 8.505905562226783e-05, + "loss": 2.4975, + "step": 5788 + }, + { + "epoch": 0.25302679312907034, + "grad_norm": 3.125, + "learning_rate": 8.50541580445644e-05, + "loss": 1.4975, + "step": 5789 + }, + { + "epoch": 0.2530705013331002, + "grad_norm": 2.25, + "learning_rate": 8.504925980533797e-05, + "loss": 2.0253, + "step": 5790 + }, + { + "epoch": 0.2531142095371301, + "grad_norm": 2.375, + "learning_rate": 8.504436090468103e-05, + "loss": 1.482, + "step": 5791 + }, + { + "epoch": 0.25315791774116003, + "grad_norm": 4.6875, + "learning_rate": 8.503946134268596e-05, + "loss": 2.5413, + "step": 5792 + }, + { + "epoch": 0.2532016259451899, + "grad_norm": 2.75, + "learning_rate": 8.503456111944529e-05, + "loss": 1.794, + "step": 5793 + }, + { + "epoch": 0.2532453341492198, + "grad_norm": 2.578125, + "learning_rate": 8.502966023505143e-05, + "loss": 1.9426, + "step": 5794 + }, + { + "epoch": 0.2532890423532497, + "grad_norm": 2.15625, + "learning_rate": 8.502475868959692e-05, + "loss": 1.6883, + "step": 5795 + }, + { + "epoch": 0.2533327505572796, + "grad_norm": 2.34375, + "learning_rate": 8.501985648317423e-05, + "loss": 1.847, + "step": 5796 + }, + { + "epoch": 0.2533764587613095, + "grad_norm": 2.078125, + "learning_rate": 8.501495361587588e-05, + "loss": 1.9255, + "step": 5797 + }, + { + "epoch": 0.2534201669653394, + "grad_norm": 2.171875, + "learning_rate": 8.501005008779439e-05, + "loss": 2.6771, + "step": 5798 + }, + { + "epoch": 0.2534638751693693, + "grad_norm": 2.078125, + "learning_rate": 8.50051458990223e-05, + "loss": 1.9616, + "step": 5799 + }, + { + "epoch": 0.2535075833733992, + "grad_norm": 2.15625, + "learning_rate": 8.500024104965217e-05, + "loss": 1.777, + "step": 5800 + }, + { + "epoch": 0.25355129157742906, + "grad_norm": 4.46875, + "learning_rate": 8.499533553977654e-05, + "loss": 1.363, + "step": 5801 + }, + { + "epoch": 0.253594999781459, + "grad_norm": 2.078125, + "learning_rate": 8.4990429369488e-05, + "loss": 1.8625, + "step": 5802 + }, + { + "epoch": 0.2536387079854889, + "grad_norm": 3.53125, + "learning_rate": 8.498552253887912e-05, + "loss": 1.6072, + "step": 5803 + }, + { + "epoch": 0.25368241618951876, + "grad_norm": 2.203125, + "learning_rate": 8.498061504804253e-05, + "loss": 1.8609, + "step": 5804 + }, + { + "epoch": 0.2537261243935487, + "grad_norm": 2.09375, + "learning_rate": 8.49757068970708e-05, + "loss": 1.9611, + "step": 5805 + }, + { + "epoch": 0.25376983259757857, + "grad_norm": 1.953125, + "learning_rate": 8.497079808605659e-05, + "loss": 1.9539, + "step": 5806 + }, + { + "epoch": 0.25381354080160845, + "grad_norm": 2.296875, + "learning_rate": 8.496588861509253e-05, + "loss": 2.2939, + "step": 5807 + }, + { + "epoch": 0.2538572490056384, + "grad_norm": 2.375, + "learning_rate": 8.496097848427124e-05, + "loss": 2.2952, + "step": 5808 + }, + { + "epoch": 0.25390095720966827, + "grad_norm": 1.9921875, + "learning_rate": 8.495606769368543e-05, + "loss": 2.087, + "step": 5809 + }, + { + "epoch": 0.25394466541369815, + "grad_norm": 2.265625, + "learning_rate": 8.495115624342772e-05, + "loss": 1.9989, + "step": 5810 + }, + { + "epoch": 0.253988373617728, + "grad_norm": 2.421875, + "learning_rate": 8.494624413359083e-05, + "loss": 2.0572, + "step": 5811 + }, + { + "epoch": 0.25403208182175796, + "grad_norm": 18.25, + "learning_rate": 8.494133136426746e-05, + "loss": 3.0295, + "step": 5812 + }, + { + "epoch": 0.25407579002578784, + "grad_norm": 2.1875, + "learning_rate": 8.493641793555032e-05, + "loss": 1.7027, + "step": 5813 + }, + { + "epoch": 0.2541194982298177, + "grad_norm": 2.265625, + "learning_rate": 8.49315038475321e-05, + "loss": 2.3696, + "step": 5814 + }, + { + "epoch": 0.25416320643384765, + "grad_norm": 2.328125, + "learning_rate": 8.492658910030557e-05, + "loss": 1.951, + "step": 5815 + }, + { + "epoch": 0.25420691463787753, + "grad_norm": 2.359375, + "learning_rate": 8.492167369396349e-05, + "loss": 1.4611, + "step": 5816 + }, + { + "epoch": 0.2542506228419074, + "grad_norm": 2.578125, + "learning_rate": 8.491675762859858e-05, + "loss": 2.0407, + "step": 5817 + }, + { + "epoch": 0.25429433104593735, + "grad_norm": 2.171875, + "learning_rate": 8.491184090430364e-05, + "loss": 2.1088, + "step": 5818 + }, + { + "epoch": 0.25433803924996723, + "grad_norm": 2.109375, + "learning_rate": 8.490692352117147e-05, + "loss": 1.9155, + "step": 5819 + }, + { + "epoch": 0.2543817474539971, + "grad_norm": 1.9609375, + "learning_rate": 8.490200547929481e-05, + "loss": 1.7446, + "step": 5820 + }, + { + "epoch": 0.254425455658027, + "grad_norm": 2.171875, + "learning_rate": 8.489708677876653e-05, + "loss": 1.8726, + "step": 5821 + }, + { + "epoch": 0.2544691638620569, + "grad_norm": 2.03125, + "learning_rate": 8.489216741967945e-05, + "loss": 2.0559, + "step": 5822 + }, + { + "epoch": 0.2545128720660868, + "grad_norm": 1.953125, + "learning_rate": 8.488724740212636e-05, + "loss": 1.9535, + "step": 5823 + }, + { + "epoch": 0.2545565802701167, + "grad_norm": 2.578125, + "learning_rate": 8.488232672620015e-05, + "loss": 2.1477, + "step": 5824 + }, + { + "epoch": 0.2546002884741466, + "grad_norm": 2.6875, + "learning_rate": 8.487740539199366e-05, + "loss": 1.7686, + "step": 5825 + }, + { + "epoch": 0.2546439966781765, + "grad_norm": 2.40625, + "learning_rate": 8.487248339959976e-05, + "loss": 1.6782, + "step": 5826 + }, + { + "epoch": 0.2546877048822064, + "grad_norm": 2.03125, + "learning_rate": 8.486756074911137e-05, + "loss": 1.6152, + "step": 5827 + }, + { + "epoch": 0.2547314130862363, + "grad_norm": 2.421875, + "learning_rate": 8.486263744062134e-05, + "loss": 1.6807, + "step": 5828 + }, + { + "epoch": 0.2547751212902662, + "grad_norm": 2.34375, + "learning_rate": 8.48577134742226e-05, + "loss": 1.3611, + "step": 5829 + }, + { + "epoch": 0.2548188294942961, + "grad_norm": 2.1875, + "learning_rate": 8.485278885000808e-05, + "loss": 2.2076, + "step": 5830 + }, + { + "epoch": 0.25486253769832595, + "grad_norm": 2.375, + "learning_rate": 8.484786356807071e-05, + "loss": 1.8441, + "step": 5831 + }, + { + "epoch": 0.2549062459023559, + "grad_norm": 2.78125, + "learning_rate": 8.484293762850344e-05, + "loss": 2.0972, + "step": 5832 + }, + { + "epoch": 0.25494995410638577, + "grad_norm": 2.09375, + "learning_rate": 8.483801103139923e-05, + "loss": 1.763, + "step": 5833 + }, + { + "epoch": 0.25499366231041565, + "grad_norm": 2.1875, + "learning_rate": 8.483308377685104e-05, + "loss": 1.9297, + "step": 5834 + }, + { + "epoch": 0.2550373705144456, + "grad_norm": 3.09375, + "learning_rate": 8.482815586495184e-05, + "loss": 1.6853, + "step": 5835 + }, + { + "epoch": 0.25508107871847546, + "grad_norm": 2.75, + "learning_rate": 8.482322729579468e-05, + "loss": 2.3892, + "step": 5836 + }, + { + "epoch": 0.25512478692250534, + "grad_norm": 3.328125, + "learning_rate": 8.481829806947252e-05, + "loss": 1.6897, + "step": 5837 + }, + { + "epoch": 0.2551684951265353, + "grad_norm": 3.65625, + "learning_rate": 8.481336818607842e-05, + "loss": 1.6136, + "step": 5838 + }, + { + "epoch": 0.25521220333056516, + "grad_norm": 2.234375, + "learning_rate": 8.480843764570537e-05, + "loss": 2.6346, + "step": 5839 + }, + { + "epoch": 0.25525591153459504, + "grad_norm": 2.609375, + "learning_rate": 8.480350644844645e-05, + "loss": 1.8048, + "step": 5840 + }, + { + "epoch": 0.2552996197386249, + "grad_norm": 2.4375, + "learning_rate": 8.479857459439471e-05, + "loss": 1.9963, + "step": 5841 + }, + { + "epoch": 0.25534332794265485, + "grad_norm": 1.9765625, + "learning_rate": 8.479364208364323e-05, + "loss": 1.7453, + "step": 5842 + }, + { + "epoch": 0.25538703614668473, + "grad_norm": 2.421875, + "learning_rate": 8.478870891628507e-05, + "loss": 1.8611, + "step": 5843 + }, + { + "epoch": 0.2554307443507146, + "grad_norm": 3.984375, + "learning_rate": 8.478377509241334e-05, + "loss": 1.5761, + "step": 5844 + }, + { + "epoch": 0.25547445255474455, + "grad_norm": 3.359375, + "learning_rate": 8.477884061212115e-05, + "loss": 3.9752, + "step": 5845 + }, + { + "epoch": 0.2555181607587744, + "grad_norm": 2.40625, + "learning_rate": 8.477390547550162e-05, + "loss": 3.0617, + "step": 5846 + }, + { + "epoch": 0.2555618689628043, + "grad_norm": 2.109375, + "learning_rate": 8.47689696826479e-05, + "loss": 1.7095, + "step": 5847 + }, + { + "epoch": 0.25560557716683424, + "grad_norm": 2.40625, + "learning_rate": 8.47640332336531e-05, + "loss": 2.1366, + "step": 5848 + }, + { + "epoch": 0.2556492853708641, + "grad_norm": 2.71875, + "learning_rate": 8.47590961286104e-05, + "loss": 2.36, + "step": 5849 + }, + { + "epoch": 0.255692993574894, + "grad_norm": 4.09375, + "learning_rate": 8.475415836761295e-05, + "loss": 1.0162, + "step": 5850 + }, + { + "epoch": 0.2557367017789239, + "grad_norm": 4.25, + "learning_rate": 8.474921995075398e-05, + "loss": 1.045, + "step": 5851 + }, + { + "epoch": 0.2557804099829538, + "grad_norm": 2.203125, + "learning_rate": 8.474428087812664e-05, + "loss": 2.409, + "step": 5852 + }, + { + "epoch": 0.2558241181869837, + "grad_norm": 2.21875, + "learning_rate": 8.473934114982416e-05, + "loss": 2.0687, + "step": 5853 + }, + { + "epoch": 0.2558678263910136, + "grad_norm": 2.59375, + "learning_rate": 8.473440076593973e-05, + "loss": 2.1305, + "step": 5854 + }, + { + "epoch": 0.2559115345950435, + "grad_norm": 1.8671875, + "learning_rate": 8.472945972656662e-05, + "loss": 1.7336, + "step": 5855 + }, + { + "epoch": 0.2559552427990734, + "grad_norm": 2.046875, + "learning_rate": 8.472451803179807e-05, + "loss": 1.7277, + "step": 5856 + }, + { + "epoch": 0.25599895100310327, + "grad_norm": 2.3125, + "learning_rate": 8.47195756817273e-05, + "loss": 1.8274, + "step": 5857 + }, + { + "epoch": 0.2560426592071332, + "grad_norm": 2.390625, + "learning_rate": 8.471463267644761e-05, + "loss": 2.0623, + "step": 5858 + }, + { + "epoch": 0.2560863674111631, + "grad_norm": 2.4375, + "learning_rate": 8.47096890160523e-05, + "loss": 1.7609, + "step": 5859 + }, + { + "epoch": 0.25613007561519296, + "grad_norm": 1.8515625, + "learning_rate": 8.470474470063461e-05, + "loss": 1.7962, + "step": 5860 + }, + { + "epoch": 0.25617378381922284, + "grad_norm": 2.296875, + "learning_rate": 8.46997997302879e-05, + "loss": 2.4369, + "step": 5861 + }, + { + "epoch": 0.2562174920232528, + "grad_norm": 2.625, + "learning_rate": 8.469485410510545e-05, + "loss": 2.3581, + "step": 5862 + }, + { + "epoch": 0.25626120022728266, + "grad_norm": 2.265625, + "learning_rate": 8.468990782518063e-05, + "loss": 2.239, + "step": 5863 + }, + { + "epoch": 0.25630490843131254, + "grad_norm": 2.5, + "learning_rate": 8.468496089060674e-05, + "loss": 1.8326, + "step": 5864 + }, + { + "epoch": 0.2563486166353425, + "grad_norm": 2.203125, + "learning_rate": 8.468001330147714e-05, + "loss": 2.0595, + "step": 5865 + }, + { + "epoch": 0.25639232483937235, + "grad_norm": 2.359375, + "learning_rate": 8.467506505788525e-05, + "loss": 2.3059, + "step": 5866 + }, + { + "epoch": 0.25643603304340223, + "grad_norm": 2.25, + "learning_rate": 8.46701161599244e-05, + "loss": 1.7416, + "step": 5867 + }, + { + "epoch": 0.25647974124743217, + "grad_norm": 2.15625, + "learning_rate": 8.4665166607688e-05, + "loss": 1.5773, + "step": 5868 + }, + { + "epoch": 0.25652344945146205, + "grad_norm": 2.078125, + "learning_rate": 8.466021640126945e-05, + "loss": 2.0622, + "step": 5869 + }, + { + "epoch": 0.25656715765549193, + "grad_norm": 2.15625, + "learning_rate": 8.465526554076217e-05, + "loss": 1.6875, + "step": 5870 + }, + { + "epoch": 0.2566108658595218, + "grad_norm": 2.15625, + "learning_rate": 8.46503140262596e-05, + "loss": 1.9292, + "step": 5871 + }, + { + "epoch": 0.25665457406355174, + "grad_norm": 2.703125, + "learning_rate": 8.464536185785516e-05, + "loss": 1.3767, + "step": 5872 + }, + { + "epoch": 0.2566982822675816, + "grad_norm": 2.8125, + "learning_rate": 8.464040903564234e-05, + "loss": 2.645, + "step": 5873 + }, + { + "epoch": 0.2567419904716115, + "grad_norm": 2.375, + "learning_rate": 8.463545555971458e-05, + "loss": 1.9756, + "step": 5874 + }, + { + "epoch": 0.25678569867564144, + "grad_norm": 2.234375, + "learning_rate": 8.463050143016533e-05, + "loss": 1.9329, + "step": 5875 + }, + { + "epoch": 0.2568294068796713, + "grad_norm": 2.75, + "learning_rate": 8.462554664708815e-05, + "loss": 2.5044, + "step": 5876 + }, + { + "epoch": 0.2568731150837012, + "grad_norm": 1.9296875, + "learning_rate": 8.462059121057651e-05, + "loss": 2.1592, + "step": 5877 + }, + { + "epoch": 0.25691682328773113, + "grad_norm": 1.8828125, + "learning_rate": 8.461563512072392e-05, + "loss": 1.6989, + "step": 5878 + }, + { + "epoch": 0.256960531491761, + "grad_norm": 2.125, + "learning_rate": 8.461067837762391e-05, + "loss": 1.9492, + "step": 5879 + }, + { + "epoch": 0.2570042396957909, + "grad_norm": 1.9375, + "learning_rate": 8.460572098137e-05, + "loss": 1.9093, + "step": 5880 + }, + { + "epoch": 0.2570479478998208, + "grad_norm": 3.578125, + "learning_rate": 8.460076293205581e-05, + "loss": 1.6803, + "step": 5881 + }, + { + "epoch": 0.2570916561038507, + "grad_norm": 2.171875, + "learning_rate": 8.459580422977484e-05, + "loss": 2.7429, + "step": 5882 + }, + { + "epoch": 0.2571353643078806, + "grad_norm": 1.8515625, + "learning_rate": 8.459084487462072e-05, + "loss": 1.4708, + "step": 5883 + }, + { + "epoch": 0.25717907251191047, + "grad_norm": 2.0625, + "learning_rate": 8.4585884866687e-05, + "loss": 2.23, + "step": 5884 + }, + { + "epoch": 0.2572227807159404, + "grad_norm": 2.75, + "learning_rate": 8.458092420606727e-05, + "loss": 1.7679, + "step": 5885 + }, + { + "epoch": 0.2572664889199703, + "grad_norm": 2.0, + "learning_rate": 8.45759628928552e-05, + "loss": 1.1209, + "step": 5886 + }, + { + "epoch": 0.25731019712400016, + "grad_norm": 2.359375, + "learning_rate": 8.457100092714438e-05, + "loss": 2.1544, + "step": 5887 + }, + { + "epoch": 0.2573539053280301, + "grad_norm": 2.1875, + "learning_rate": 8.456603830902845e-05, + "loss": 1.6905, + "step": 5888 + }, + { + "epoch": 0.25739761353206, + "grad_norm": 3.125, + "learning_rate": 8.456107503860107e-05, + "loss": 1.5867, + "step": 5889 + }, + { + "epoch": 0.25744132173608986, + "grad_norm": 2.125, + "learning_rate": 8.455611111595591e-05, + "loss": 2.0275, + "step": 5890 + }, + { + "epoch": 0.25748502994011974, + "grad_norm": 2.765625, + "learning_rate": 8.455114654118663e-05, + "loss": 1.8815, + "step": 5891 + }, + { + "epoch": 0.25752873814414967, + "grad_norm": 2.21875, + "learning_rate": 8.454618131438693e-05, + "loss": 1.4841, + "step": 5892 + }, + { + "epoch": 0.25757244634817955, + "grad_norm": 1.9375, + "learning_rate": 8.454121543565052e-05, + "loss": 1.9968, + "step": 5893 + }, + { + "epoch": 0.25761615455220943, + "grad_norm": 4.6875, + "learning_rate": 8.453624890507109e-05, + "loss": 1.7452, + "step": 5894 + }, + { + "epoch": 0.25765986275623937, + "grad_norm": 3.15625, + "learning_rate": 8.453128172274238e-05, + "loss": 2.609, + "step": 5895 + }, + { + "epoch": 0.25770357096026925, + "grad_norm": 2.0, + "learning_rate": 8.452631388875814e-05, + "loss": 1.4952, + "step": 5896 + }, + { + "epoch": 0.2577472791642991, + "grad_norm": 2.9375, + "learning_rate": 8.452134540321208e-05, + "loss": 1.5478, + "step": 5897 + }, + { + "epoch": 0.25779098736832906, + "grad_norm": 2.671875, + "learning_rate": 8.451637626619801e-05, + "loss": 2.7189, + "step": 5898 + }, + { + "epoch": 0.25783469557235894, + "grad_norm": 2.265625, + "learning_rate": 8.451140647780967e-05, + "loss": 2.0582, + "step": 5899 + }, + { + "epoch": 0.2578784037763888, + "grad_norm": 1.9453125, + "learning_rate": 8.450643603814086e-05, + "loss": 1.5491, + "step": 5900 + }, + { + "epoch": 0.2579221119804187, + "grad_norm": 2.109375, + "learning_rate": 8.45014649472854e-05, + "loss": 1.8259, + "step": 5901 + }, + { + "epoch": 0.25796582018444864, + "grad_norm": 2.6875, + "learning_rate": 8.449649320533706e-05, + "loss": 2.0901, + "step": 5902 + }, + { + "epoch": 0.2580095283884785, + "grad_norm": 2.5, + "learning_rate": 8.44915208123897e-05, + "loss": 1.1867, + "step": 5903 + }, + { + "epoch": 0.2580532365925084, + "grad_norm": 2.125, + "learning_rate": 8.448654776853714e-05, + "loss": 2.0911, + "step": 5904 + }, + { + "epoch": 0.25809694479653833, + "grad_norm": 2.171875, + "learning_rate": 8.448157407387323e-05, + "loss": 1.4067, + "step": 5905 + }, + { + "epoch": 0.2581406530005682, + "grad_norm": 2.4375, + "learning_rate": 8.447659972849183e-05, + "loss": 1.938, + "step": 5906 + }, + { + "epoch": 0.2581843612045981, + "grad_norm": 2.25, + "learning_rate": 8.447162473248682e-05, + "loss": 1.7084, + "step": 5907 + }, + { + "epoch": 0.258228069408628, + "grad_norm": 2.578125, + "learning_rate": 8.446664908595207e-05, + "loss": 2.4166, + "step": 5908 + }, + { + "epoch": 0.2582717776126579, + "grad_norm": 1.9453125, + "learning_rate": 8.44616727889815e-05, + "loss": 2.0318, + "step": 5909 + }, + { + "epoch": 0.2583154858166878, + "grad_norm": 2.1875, + "learning_rate": 8.4456695841669e-05, + "loss": 1.4643, + "step": 5910 + }, + { + "epoch": 0.25835919402071766, + "grad_norm": 2.015625, + "learning_rate": 8.445171824410848e-05, + "loss": 1.8736, + "step": 5911 + }, + { + "epoch": 0.2584029022247476, + "grad_norm": 2.453125, + "learning_rate": 8.444673999639393e-05, + "loss": 2.0095, + "step": 5912 + }, + { + "epoch": 0.2584466104287775, + "grad_norm": 3.875, + "learning_rate": 8.444176109861926e-05, + "loss": 1.1565, + "step": 5913 + }, + { + "epoch": 0.25849031863280736, + "grad_norm": 2.265625, + "learning_rate": 8.44367815508784e-05, + "loss": 2.3151, + "step": 5914 + }, + { + "epoch": 0.2585340268368373, + "grad_norm": 3.296875, + "learning_rate": 8.443180135326538e-05, + "loss": 2.2622, + "step": 5915 + }, + { + "epoch": 0.2585777350408672, + "grad_norm": 2.6875, + "learning_rate": 8.442682050587414e-05, + "loss": 1.0598, + "step": 5916 + }, + { + "epoch": 0.25862144324489705, + "grad_norm": 2.53125, + "learning_rate": 8.44218390087987e-05, + "loss": 1.5134, + "step": 5917 + }, + { + "epoch": 0.258665151448927, + "grad_norm": 3.015625, + "learning_rate": 8.441685686213306e-05, + "loss": 1.896, + "step": 5918 + }, + { + "epoch": 0.25870885965295687, + "grad_norm": 2.25, + "learning_rate": 8.441187406597123e-05, + "loss": 3.0446, + "step": 5919 + }, + { + "epoch": 0.25875256785698675, + "grad_norm": 2.28125, + "learning_rate": 8.440689062040725e-05, + "loss": 1.9769, + "step": 5920 + }, + { + "epoch": 0.25879627606101663, + "grad_norm": 2.734375, + "learning_rate": 8.440190652553517e-05, + "loss": 2.0879, + "step": 5921 + }, + { + "epoch": 0.25883998426504656, + "grad_norm": 2.40625, + "learning_rate": 8.439692178144906e-05, + "loss": 2.3422, + "step": 5922 + }, + { + "epoch": 0.25888369246907644, + "grad_norm": 3.859375, + "learning_rate": 8.439193638824296e-05, + "loss": 2.7385, + "step": 5923 + }, + { + "epoch": 0.2589274006731063, + "grad_norm": 2.34375, + "learning_rate": 8.438695034601097e-05, + "loss": 1.7118, + "step": 5924 + }, + { + "epoch": 0.25897110887713626, + "grad_norm": 2.9375, + "learning_rate": 8.438196365484718e-05, + "loss": 2.2815, + "step": 5925 + }, + { + "epoch": 0.25901481708116614, + "grad_norm": 2.265625, + "learning_rate": 8.43769763148457e-05, + "loss": 1.369, + "step": 5926 + }, + { + "epoch": 0.259058525285196, + "grad_norm": 2.984375, + "learning_rate": 8.437198832610063e-05, + "loss": 2.1702, + "step": 5927 + }, + { + "epoch": 0.25910223348922595, + "grad_norm": 2.28125, + "learning_rate": 8.436699968870612e-05, + "loss": 1.6627, + "step": 5928 + }, + { + "epoch": 0.25914594169325583, + "grad_norm": 2.1875, + "learning_rate": 8.43620104027563e-05, + "loss": 1.8602, + "step": 5929 + }, + { + "epoch": 0.2591896498972857, + "grad_norm": 2.015625, + "learning_rate": 8.435702046834535e-05, + "loss": 1.3718, + "step": 5930 + }, + { + "epoch": 0.2592333581013156, + "grad_norm": 5.0625, + "learning_rate": 8.435202988556739e-05, + "loss": 2.0221, + "step": 5931 + }, + { + "epoch": 0.2592770663053455, + "grad_norm": 2.828125, + "learning_rate": 8.434703865451665e-05, + "loss": 2.0702, + "step": 5932 + }, + { + "epoch": 0.2593207745093754, + "grad_norm": 2.3125, + "learning_rate": 8.43420467752873e-05, + "loss": 1.816, + "step": 5933 + }, + { + "epoch": 0.2593644827134053, + "grad_norm": 2.5, + "learning_rate": 8.433705424797355e-05, + "loss": 2.138, + "step": 5934 + }, + { + "epoch": 0.2594081909174352, + "grad_norm": 2.609375, + "learning_rate": 8.43320610726696e-05, + "loss": 2.6959, + "step": 5935 + }, + { + "epoch": 0.2594518991214651, + "grad_norm": 1.9296875, + "learning_rate": 8.43270672494697e-05, + "loss": 1.9112, + "step": 5936 + }, + { + "epoch": 0.259495607325495, + "grad_norm": 1.9765625, + "learning_rate": 8.432207277846806e-05, + "loss": 1.7615, + "step": 5937 + }, + { + "epoch": 0.2595393155295249, + "grad_norm": 2.375, + "learning_rate": 8.431707765975898e-05, + "loss": 2.2894, + "step": 5938 + }, + { + "epoch": 0.2595830237335548, + "grad_norm": 1.90625, + "learning_rate": 8.43120818934367e-05, + "loss": 1.6917, + "step": 5939 + }, + { + "epoch": 0.2596267319375847, + "grad_norm": 2.1875, + "learning_rate": 8.430708547959547e-05, + "loss": 1.6675, + "step": 5940 + }, + { + "epoch": 0.25967044014161456, + "grad_norm": 2.46875, + "learning_rate": 8.43020884183296e-05, + "loss": 2.4967, + "step": 5941 + }, + { + "epoch": 0.2597141483456445, + "grad_norm": 2.34375, + "learning_rate": 8.429709070973342e-05, + "loss": 2.0623, + "step": 5942 + }, + { + "epoch": 0.25975785654967437, + "grad_norm": 2.609375, + "learning_rate": 8.429209235390123e-05, + "loss": 1.6179, + "step": 5943 + }, + { + "epoch": 0.25980156475370425, + "grad_norm": 2.0625, + "learning_rate": 8.428709335092733e-05, + "loss": 1.2945, + "step": 5944 + }, + { + "epoch": 0.2598452729577342, + "grad_norm": 2.484375, + "learning_rate": 8.428209370090607e-05, + "loss": 2.2274, + "step": 5945 + }, + { + "epoch": 0.25988898116176407, + "grad_norm": 3.125, + "learning_rate": 8.427709340393181e-05, + "loss": 1.5624, + "step": 5946 + }, + { + "epoch": 0.25993268936579395, + "grad_norm": 2.625, + "learning_rate": 8.427209246009893e-05, + "loss": 1.8742, + "step": 5947 + }, + { + "epoch": 0.2599763975698239, + "grad_norm": 2.21875, + "learning_rate": 8.426709086950178e-05, + "loss": 2.013, + "step": 5948 + }, + { + "epoch": 0.26002010577385376, + "grad_norm": 2.203125, + "learning_rate": 8.426208863223473e-05, + "loss": 2.0844, + "step": 5949 + }, + { + "epoch": 0.26006381397788364, + "grad_norm": 2.25, + "learning_rate": 8.425708574839222e-05, + "loss": 1.6669, + "step": 5950 + }, + { + "epoch": 0.2601075221819135, + "grad_norm": 2.359375, + "learning_rate": 8.425208221806862e-05, + "loss": 1.0572, + "step": 5951 + }, + { + "epoch": 0.26015123038594345, + "grad_norm": 2.21875, + "learning_rate": 8.42470780413584e-05, + "loss": 1.8001, + "step": 5952 + }, + { + "epoch": 0.26019493858997333, + "grad_norm": 1.9140625, + "learning_rate": 8.424207321835598e-05, + "loss": 1.5684, + "step": 5953 + }, + { + "epoch": 0.2602386467940032, + "grad_norm": 2.03125, + "learning_rate": 8.423706774915579e-05, + "loss": 1.7722, + "step": 5954 + }, + { + "epoch": 0.26028235499803315, + "grad_norm": 2.703125, + "learning_rate": 8.42320616338523e-05, + "loss": 1.5773, + "step": 5955 + }, + { + "epoch": 0.26032606320206303, + "grad_norm": 1.921875, + "learning_rate": 8.422705487253996e-05, + "loss": 1.4819, + "step": 5956 + }, + { + "epoch": 0.2603697714060929, + "grad_norm": 2.328125, + "learning_rate": 8.422204746531332e-05, + "loss": 2.35, + "step": 5957 + }, + { + "epoch": 0.26041347961012284, + "grad_norm": 2.78125, + "learning_rate": 8.421703941226682e-05, + "loss": 1.896, + "step": 5958 + }, + { + "epoch": 0.2604571878141527, + "grad_norm": 2.0625, + "learning_rate": 8.421203071349498e-05, + "loss": 1.5529, + "step": 5959 + }, + { + "epoch": 0.2605008960181826, + "grad_norm": 2.046875, + "learning_rate": 8.420702136909234e-05, + "loss": 1.3801, + "step": 5960 + }, + { + "epoch": 0.2605446042222125, + "grad_norm": 2.0625, + "learning_rate": 8.420201137915342e-05, + "loss": 1.6023, + "step": 5961 + }, + { + "epoch": 0.2605883124262424, + "grad_norm": 2.03125, + "learning_rate": 8.419700074377275e-05, + "loss": 1.5803, + "step": 5962 + }, + { + "epoch": 0.2606320206302723, + "grad_norm": 2.734375, + "learning_rate": 8.419198946304491e-05, + "loss": 1.7009, + "step": 5963 + }, + { + "epoch": 0.2606757288343022, + "grad_norm": 2.328125, + "learning_rate": 8.418697753706449e-05, + "loss": 2.4918, + "step": 5964 + }, + { + "epoch": 0.2607194370383321, + "grad_norm": 2.28125, + "learning_rate": 8.418196496592603e-05, + "loss": 2.4145, + "step": 5965 + }, + { + "epoch": 0.260763145242362, + "grad_norm": 3.609375, + "learning_rate": 8.417695174972413e-05, + "loss": 2.5977, + "step": 5966 + }, + { + "epoch": 0.2608068534463919, + "grad_norm": 2.59375, + "learning_rate": 8.41719378885534e-05, + "loss": 2.1141, + "step": 5967 + }, + { + "epoch": 0.2608505616504218, + "grad_norm": 2.125, + "learning_rate": 8.416692338250848e-05, + "loss": 2.1758, + "step": 5968 + }, + { + "epoch": 0.2608942698544517, + "grad_norm": 2.046875, + "learning_rate": 8.416190823168402e-05, + "loss": 1.8151, + "step": 5969 + }, + { + "epoch": 0.26093797805848157, + "grad_norm": 1.953125, + "learning_rate": 8.41568924361746e-05, + "loss": 1.7852, + "step": 5970 + }, + { + "epoch": 0.26098168626251145, + "grad_norm": 2.71875, + "learning_rate": 8.41518759960749e-05, + "loss": 1.7464, + "step": 5971 + }, + { + "epoch": 0.2610253944665414, + "grad_norm": 2.90625, + "learning_rate": 8.41468589114796e-05, + "loss": 2.5288, + "step": 5972 + }, + { + "epoch": 0.26106910267057126, + "grad_norm": 4.34375, + "learning_rate": 8.414184118248339e-05, + "loss": 1.4174, + "step": 5973 + }, + { + "epoch": 0.26111281087460114, + "grad_norm": 2.75, + "learning_rate": 8.413682280918093e-05, + "loss": 2.227, + "step": 5974 + }, + { + "epoch": 0.2611565190786311, + "grad_norm": 2.234375, + "learning_rate": 8.413180379166694e-05, + "loss": 2.214, + "step": 5975 + }, + { + "epoch": 0.26120022728266096, + "grad_norm": 2.390625, + "learning_rate": 8.412678413003614e-05, + "loss": 1.6059, + "step": 5976 + }, + { + "epoch": 0.26124393548669084, + "grad_norm": 3.140625, + "learning_rate": 8.412176382438326e-05, + "loss": 1.8957, + "step": 5977 + }, + { + "epoch": 0.26128764369072077, + "grad_norm": 2.03125, + "learning_rate": 8.411674287480303e-05, + "loss": 1.9365, + "step": 5978 + }, + { + "epoch": 0.26133135189475065, + "grad_norm": 2.4375, + "learning_rate": 8.411172128139021e-05, + "loss": 2.0909, + "step": 5979 + }, + { + "epoch": 0.26137506009878053, + "grad_norm": 2.484375, + "learning_rate": 8.410669904423955e-05, + "loss": 1.919, + "step": 5980 + }, + { + "epoch": 0.2614187683028104, + "grad_norm": 2.53125, + "learning_rate": 8.410167616344586e-05, + "loss": 2.6475, + "step": 5981 + }, + { + "epoch": 0.26146247650684035, + "grad_norm": 3.046875, + "learning_rate": 8.40966526391039e-05, + "loss": 1.7789, + "step": 5982 + }, + { + "epoch": 0.2615061847108702, + "grad_norm": 2.6875, + "learning_rate": 8.409162847130847e-05, + "loss": 1.9418, + "step": 5983 + }, + { + "epoch": 0.2615498929149001, + "grad_norm": 2.140625, + "learning_rate": 8.40866036601544e-05, + "loss": 1.7559, + "step": 5984 + }, + { + "epoch": 0.26159360111893004, + "grad_norm": 2.1875, + "learning_rate": 8.40815782057365e-05, + "loss": 2.1114, + "step": 5985 + }, + { + "epoch": 0.2616373093229599, + "grad_norm": 1.875, + "learning_rate": 8.407655210814962e-05, + "loss": 1.7708, + "step": 5986 + }, + { + "epoch": 0.2616810175269898, + "grad_norm": 2.0625, + "learning_rate": 8.407152536748861e-05, + "loss": 1.7084, + "step": 5987 + }, + { + "epoch": 0.26172472573101974, + "grad_norm": 2.3125, + "learning_rate": 8.406649798384834e-05, + "loss": 1.8533, + "step": 5988 + }, + { + "epoch": 0.2617684339350496, + "grad_norm": 2.21875, + "learning_rate": 8.406146995732365e-05, + "loss": 1.8596, + "step": 5989 + }, + { + "epoch": 0.2618121421390795, + "grad_norm": 1.828125, + "learning_rate": 8.405644128800945e-05, + "loss": 1.7747, + "step": 5990 + }, + { + "epoch": 0.2618558503431094, + "grad_norm": 2.71875, + "learning_rate": 8.405141197600064e-05, + "loss": 1.559, + "step": 5991 + }, + { + "epoch": 0.2618995585471393, + "grad_norm": 2.375, + "learning_rate": 8.404638202139213e-05, + "loss": 1.9718, + "step": 5992 + }, + { + "epoch": 0.2619432667511692, + "grad_norm": 2.734375, + "learning_rate": 8.404135142427884e-05, + "loss": 1.9266, + "step": 5993 + }, + { + "epoch": 0.26198697495519907, + "grad_norm": 2.9375, + "learning_rate": 8.40363201847557e-05, + "loss": 1.9553, + "step": 5994 + }, + { + "epoch": 0.262030683159229, + "grad_norm": 2.71875, + "learning_rate": 8.403128830291767e-05, + "loss": 2.5839, + "step": 5995 + }, + { + "epoch": 0.2620743913632589, + "grad_norm": 2.140625, + "learning_rate": 8.402625577885968e-05, + "loss": 2.0484, + "step": 5996 + }, + { + "epoch": 0.26211809956728876, + "grad_norm": 2.3125, + "learning_rate": 8.402122261267673e-05, + "loss": 1.694, + "step": 5997 + }, + { + "epoch": 0.2621618077713187, + "grad_norm": 2.625, + "learning_rate": 8.40161888044638e-05, + "loss": 2.2003, + "step": 5998 + }, + { + "epoch": 0.2622055159753486, + "grad_norm": 2.546875, + "learning_rate": 8.401115435431587e-05, + "loss": 2.1018, + "step": 5999 + }, + { + "epoch": 0.26224922417937846, + "grad_norm": 2.203125, + "learning_rate": 8.400611926232797e-05, + "loss": 2.1926, + "step": 6000 + }, + { + "epoch": 0.26229293238340834, + "grad_norm": 2.109375, + "learning_rate": 8.400108352859508e-05, + "loss": 2.101, + "step": 6001 + }, + { + "epoch": 0.2623366405874383, + "grad_norm": 2.078125, + "learning_rate": 8.399604715321227e-05, + "loss": 1.8448, + "step": 6002 + }, + { + "epoch": 0.26238034879146815, + "grad_norm": 2.234375, + "learning_rate": 8.399101013627458e-05, + "loss": 2.1145, + "step": 6003 + }, + { + "epoch": 0.26242405699549803, + "grad_norm": 2.125, + "learning_rate": 8.398597247787705e-05, + "loss": 1.993, + "step": 6004 + }, + { + "epoch": 0.26246776519952797, + "grad_norm": 1.984375, + "learning_rate": 8.398093417811477e-05, + "loss": 1.7932, + "step": 6005 + }, + { + "epoch": 0.26251147340355785, + "grad_norm": 1.703125, + "learning_rate": 8.397589523708278e-05, + "loss": 1.4415, + "step": 6006 + }, + { + "epoch": 0.26255518160758773, + "grad_norm": 2.0625, + "learning_rate": 8.397085565487621e-05, + "loss": 2.1253, + "step": 6007 + }, + { + "epoch": 0.26259888981161766, + "grad_norm": 2.28125, + "learning_rate": 8.396581543159017e-05, + "loss": 2.4571, + "step": 6008 + }, + { + "epoch": 0.26264259801564754, + "grad_norm": 2.953125, + "learning_rate": 8.396077456731974e-05, + "loss": 1.5223, + "step": 6009 + }, + { + "epoch": 0.2626863062196774, + "grad_norm": 1.96875, + "learning_rate": 8.395573306216005e-05, + "loss": 1.9146, + "step": 6010 + }, + { + "epoch": 0.2627300144237073, + "grad_norm": 2.15625, + "learning_rate": 8.395069091620628e-05, + "loss": 2.2909, + "step": 6011 + }, + { + "epoch": 0.26277372262773724, + "grad_norm": 2.078125, + "learning_rate": 8.394564812955355e-05, + "loss": 1.7254, + "step": 6012 + }, + { + "epoch": 0.2628174308317671, + "grad_norm": 1.875, + "learning_rate": 8.394060470229704e-05, + "loss": 2.082, + "step": 6013 + }, + { + "epoch": 0.262861139035797, + "grad_norm": 1.984375, + "learning_rate": 8.393556063453192e-05, + "loss": 1.4722, + "step": 6014 + }, + { + "epoch": 0.26290484723982693, + "grad_norm": 2.046875, + "learning_rate": 8.393051592635337e-05, + "loss": 1.8483, + "step": 6015 + }, + { + "epoch": 0.2629485554438568, + "grad_norm": 2.015625, + "learning_rate": 8.392547057785661e-05, + "loss": 1.9085, + "step": 6016 + }, + { + "epoch": 0.2629922636478867, + "grad_norm": 2.5, + "learning_rate": 8.392042458913685e-05, + "loss": 2.4344, + "step": 6017 + }, + { + "epoch": 0.26303597185191663, + "grad_norm": 2.46875, + "learning_rate": 8.39153779602893e-05, + "loss": 2.4289, + "step": 6018 + }, + { + "epoch": 0.2630796800559465, + "grad_norm": 2.28125, + "learning_rate": 8.391033069140921e-05, + "loss": 2.3144, + "step": 6019 + }, + { + "epoch": 0.2631233882599764, + "grad_norm": 1.9140625, + "learning_rate": 8.390528278259182e-05, + "loss": 1.7527, + "step": 6020 + }, + { + "epoch": 0.26316709646400627, + "grad_norm": 2.28125, + "learning_rate": 8.39002342339324e-05, + "loss": 2.7173, + "step": 6021 + }, + { + "epoch": 0.2632108046680362, + "grad_norm": 2.03125, + "learning_rate": 8.389518504552622e-05, + "loss": 1.7396, + "step": 6022 + }, + { + "epoch": 0.2632545128720661, + "grad_norm": 2.625, + "learning_rate": 8.389013521746857e-05, + "loss": 2.6851, + "step": 6023 + }, + { + "epoch": 0.26329822107609596, + "grad_norm": 2.453125, + "learning_rate": 8.388508474985474e-05, + "loss": 1.8855, + "step": 6024 + }, + { + "epoch": 0.2633419292801259, + "grad_norm": 2.8125, + "learning_rate": 8.388003364278005e-05, + "loss": 1.9291, + "step": 6025 + }, + { + "epoch": 0.2633856374841558, + "grad_norm": 2.359375, + "learning_rate": 8.387498189633979e-05, + "loss": 1.743, + "step": 6026 + }, + { + "epoch": 0.26342934568818566, + "grad_norm": 3.40625, + "learning_rate": 8.386992951062935e-05, + "loss": 2.0112, + "step": 6027 + }, + { + "epoch": 0.2634730538922156, + "grad_norm": 1.9765625, + "learning_rate": 8.386487648574403e-05, + "loss": 1.7297, + "step": 6028 + }, + { + "epoch": 0.26351676209624547, + "grad_norm": 1.9140625, + "learning_rate": 8.385982282177922e-05, + "loss": 1.628, + "step": 6029 + }, + { + "epoch": 0.26356047030027535, + "grad_norm": 2.265625, + "learning_rate": 8.385476851883025e-05, + "loss": 1.2578, + "step": 6030 + }, + { + "epoch": 0.26360417850430523, + "grad_norm": 2.671875, + "learning_rate": 8.384971357699254e-05, + "loss": 2.0018, + "step": 6031 + }, + { + "epoch": 0.26364788670833517, + "grad_norm": 3.015625, + "learning_rate": 8.384465799636145e-05, + "loss": 1.8163, + "step": 6032 + }, + { + "epoch": 0.26369159491236505, + "grad_norm": 2.25, + "learning_rate": 8.383960177703243e-05, + "loss": 1.2395, + "step": 6033 + }, + { + "epoch": 0.2637353031163949, + "grad_norm": 2.125, + "learning_rate": 8.383454491910086e-05, + "loss": 1.9073, + "step": 6034 + }, + { + "epoch": 0.26377901132042486, + "grad_norm": 2.84375, + "learning_rate": 8.38294874226622e-05, + "loss": 1.8201, + "step": 6035 + }, + { + "epoch": 0.26382271952445474, + "grad_norm": 2.5625, + "learning_rate": 8.382442928781184e-05, + "loss": 2.0795, + "step": 6036 + }, + { + "epoch": 0.2638664277284846, + "grad_norm": 2.328125, + "learning_rate": 8.38193705146453e-05, + "loss": 1.7793, + "step": 6037 + }, + { + "epoch": 0.26391013593251456, + "grad_norm": 2.0, + "learning_rate": 8.3814311103258e-05, + "loss": 1.4628, + "step": 6038 + }, + { + "epoch": 0.26395384413654444, + "grad_norm": 2.0, + "learning_rate": 8.380925105374544e-05, + "loss": 1.5489, + "step": 6039 + }, + { + "epoch": 0.2639975523405743, + "grad_norm": 2.21875, + "learning_rate": 8.380419036620312e-05, + "loss": 1.9486, + "step": 6040 + }, + { + "epoch": 0.2640412605446042, + "grad_norm": 2.0, + "learning_rate": 8.379912904072651e-05, + "loss": 1.6631, + "step": 6041 + }, + { + "epoch": 0.26408496874863413, + "grad_norm": 2.265625, + "learning_rate": 8.379406707741115e-05, + "loss": 2.0659, + "step": 6042 + }, + { + "epoch": 0.264128676952664, + "grad_norm": 2.203125, + "learning_rate": 8.378900447635256e-05, + "loss": 1.9999, + "step": 6043 + }, + { + "epoch": 0.2641723851566939, + "grad_norm": 2.234375, + "learning_rate": 8.378394123764628e-05, + "loss": 2.2885, + "step": 6044 + }, + { + "epoch": 0.2642160933607238, + "grad_norm": 2.015625, + "learning_rate": 8.377887736138786e-05, + "loss": 1.6849, + "step": 6045 + }, + { + "epoch": 0.2642598015647537, + "grad_norm": 2.03125, + "learning_rate": 8.377381284767285e-05, + "loss": 1.595, + "step": 6046 + }, + { + "epoch": 0.2643035097687836, + "grad_norm": 2.3125, + "learning_rate": 8.376874769659684e-05, + "loss": 1.8376, + "step": 6047 + }, + { + "epoch": 0.2643472179728135, + "grad_norm": 1.8828125, + "learning_rate": 8.376368190825541e-05, + "loss": 1.3498, + "step": 6048 + }, + { + "epoch": 0.2643909261768434, + "grad_norm": 1.921875, + "learning_rate": 8.375861548274418e-05, + "loss": 1.5076, + "step": 6049 + }, + { + "epoch": 0.2644346343808733, + "grad_norm": 2.296875, + "learning_rate": 8.375354842015873e-05, + "loss": 1.6232, + "step": 6050 + }, + { + "epoch": 0.26447834258490316, + "grad_norm": 2.421875, + "learning_rate": 8.374848072059469e-05, + "loss": 1.8066, + "step": 6051 + }, + { + "epoch": 0.2645220507889331, + "grad_norm": 3.21875, + "learning_rate": 8.374341238414769e-05, + "loss": 1.2586, + "step": 6052 + }, + { + "epoch": 0.264565758992963, + "grad_norm": 3.609375, + "learning_rate": 8.37383434109134e-05, + "loss": 1.9279, + "step": 6053 + }, + { + "epoch": 0.26460946719699285, + "grad_norm": 4.375, + "learning_rate": 8.373327380098748e-05, + "loss": 1.6612, + "step": 6054 + }, + { + "epoch": 0.2646531754010228, + "grad_norm": 2.234375, + "learning_rate": 8.372820355446558e-05, + "loss": 1.921, + "step": 6055 + }, + { + "epoch": 0.26469688360505267, + "grad_norm": 2.25, + "learning_rate": 8.372313267144338e-05, + "loss": 2.0424, + "step": 6056 + }, + { + "epoch": 0.26474059180908255, + "grad_norm": 2.234375, + "learning_rate": 8.37180611520166e-05, + "loss": 2.2897, + "step": 6057 + }, + { + "epoch": 0.2647843000131125, + "grad_norm": 2.15625, + "learning_rate": 8.371298899628091e-05, + "loss": 1.8176, + "step": 6058 + }, + { + "epoch": 0.26482800821714236, + "grad_norm": 2.34375, + "learning_rate": 8.370791620433206e-05, + "loss": 1.5037, + "step": 6059 + }, + { + "epoch": 0.26487171642117224, + "grad_norm": 2.359375, + "learning_rate": 8.370284277626577e-05, + "loss": 2.4598, + "step": 6060 + }, + { + "epoch": 0.2649154246252021, + "grad_norm": 2.78125, + "learning_rate": 8.369776871217781e-05, + "loss": 1.8672, + "step": 6061 + }, + { + "epoch": 0.26495913282923206, + "grad_norm": 2.1875, + "learning_rate": 8.369269401216387e-05, + "loss": 2.3062, + "step": 6062 + }, + { + "epoch": 0.26500284103326194, + "grad_norm": 3.109375, + "learning_rate": 8.368761867631978e-05, + "loss": 2.6771, + "step": 6063 + }, + { + "epoch": 0.2650465492372918, + "grad_norm": 2.484375, + "learning_rate": 8.368254270474128e-05, + "loss": 2.3499, + "step": 6064 + }, + { + "epoch": 0.26509025744132175, + "grad_norm": 2.3125, + "learning_rate": 8.367746609752419e-05, + "loss": 2.033, + "step": 6065 + }, + { + "epoch": 0.26513396564535163, + "grad_norm": 2.515625, + "learning_rate": 8.36723888547643e-05, + "loss": 1.5885, + "step": 6066 + }, + { + "epoch": 0.2651776738493815, + "grad_norm": 3.09375, + "learning_rate": 8.366731097655742e-05, + "loss": 2.0933, + "step": 6067 + }, + { + "epoch": 0.26522138205341145, + "grad_norm": 2.03125, + "learning_rate": 8.366223246299938e-05, + "loss": 1.838, + "step": 6068 + }, + { + "epoch": 0.2652650902574413, + "grad_norm": 2.46875, + "learning_rate": 8.365715331418602e-05, + "loss": 2.1014, + "step": 6069 + }, + { + "epoch": 0.2653087984614712, + "grad_norm": 2.203125, + "learning_rate": 8.36520735302132e-05, + "loss": 1.5907, + "step": 6070 + }, + { + "epoch": 0.26535250666550114, + "grad_norm": 2.015625, + "learning_rate": 8.364699311117677e-05, + "loss": 1.9376, + "step": 6071 + }, + { + "epoch": 0.265396214869531, + "grad_norm": 1.890625, + "learning_rate": 8.36419120571726e-05, + "loss": 1.7767, + "step": 6072 + }, + { + "epoch": 0.2654399230735609, + "grad_norm": 2.6875, + "learning_rate": 8.36368303682966e-05, + "loss": 1.4563, + "step": 6073 + }, + { + "epoch": 0.2654836312775908, + "grad_norm": 12.125, + "learning_rate": 8.363174804464465e-05, + "loss": 2.3386, + "step": 6074 + }, + { + "epoch": 0.2655273394816207, + "grad_norm": 2.703125, + "learning_rate": 8.362666508631264e-05, + "loss": 1.813, + "step": 6075 + }, + { + "epoch": 0.2655710476856506, + "grad_norm": 2.59375, + "learning_rate": 8.362158149339656e-05, + "loss": 1.9771, + "step": 6076 + }, + { + "epoch": 0.2656147558896805, + "grad_norm": 2.640625, + "learning_rate": 8.361649726599228e-05, + "loss": 2.1627, + "step": 6077 + }, + { + "epoch": 0.2656584640937104, + "grad_norm": 2.125, + "learning_rate": 8.361141240419578e-05, + "loss": 1.6052, + "step": 6078 + }, + { + "epoch": 0.2657021722977403, + "grad_norm": 2.140625, + "learning_rate": 8.3606326908103e-05, + "loss": 2.0792, + "step": 6079 + }, + { + "epoch": 0.26574588050177017, + "grad_norm": 1.9921875, + "learning_rate": 8.360124077780994e-05, + "loss": 1.6047, + "step": 6080 + }, + { + "epoch": 0.2657895887058001, + "grad_norm": 2.25, + "learning_rate": 8.359615401341254e-05, + "loss": 2.0099, + "step": 6081 + }, + { + "epoch": 0.26583329690983, + "grad_norm": 2.109375, + "learning_rate": 8.359106661500683e-05, + "loss": 1.9349, + "step": 6082 + }, + { + "epoch": 0.26587700511385987, + "grad_norm": 3.09375, + "learning_rate": 8.358597858268878e-05, + "loss": 1.8877, + "step": 6083 + }, + { + "epoch": 0.26592071331788975, + "grad_norm": 2.0, + "learning_rate": 8.358088991655447e-05, + "loss": 1.7101, + "step": 6084 + }, + { + "epoch": 0.2659644215219197, + "grad_norm": 2.359375, + "learning_rate": 8.357580061669985e-05, + "loss": 1.8284, + "step": 6085 + }, + { + "epoch": 0.26600812972594956, + "grad_norm": 2.5, + "learning_rate": 8.357071068322104e-05, + "loss": 2.4833, + "step": 6086 + }, + { + "epoch": 0.26605183792997944, + "grad_norm": 2.125, + "learning_rate": 8.356562011621404e-05, + "loss": 1.9072, + "step": 6087 + }, + { + "epoch": 0.2660955461340094, + "grad_norm": 2.09375, + "learning_rate": 8.356052891577494e-05, + "loss": 1.7125, + "step": 6088 + }, + { + "epoch": 0.26613925433803926, + "grad_norm": 1.8515625, + "learning_rate": 8.355543708199982e-05, + "loss": 1.396, + "step": 6089 + }, + { + "epoch": 0.26618296254206913, + "grad_norm": 1.9140625, + "learning_rate": 8.355034461498477e-05, + "loss": 1.6723, + "step": 6090 + }, + { + "epoch": 0.26622667074609907, + "grad_norm": 2.28125, + "learning_rate": 8.354525151482587e-05, + "loss": 1.8009, + "step": 6091 + }, + { + "epoch": 0.26627037895012895, + "grad_norm": 2.453125, + "learning_rate": 8.354015778161925e-05, + "loss": 2.198, + "step": 6092 + }, + { + "epoch": 0.26631408715415883, + "grad_norm": 2.03125, + "learning_rate": 8.353506341546104e-05, + "loss": 1.7902, + "step": 6093 + }, + { + "epoch": 0.2663577953581887, + "grad_norm": 2.28125, + "learning_rate": 8.352996841644741e-05, + "loss": 1.8954, + "step": 6094 + }, + { + "epoch": 0.26640150356221864, + "grad_norm": 2.015625, + "learning_rate": 8.352487278467443e-05, + "loss": 1.6802, + "step": 6095 + }, + { + "epoch": 0.2664452117662485, + "grad_norm": 2.34375, + "learning_rate": 8.351977652023833e-05, + "loss": 2.098, + "step": 6096 + }, + { + "epoch": 0.2664889199702784, + "grad_norm": 1.9140625, + "learning_rate": 8.351467962323525e-05, + "loss": 1.4421, + "step": 6097 + }, + { + "epoch": 0.26653262817430834, + "grad_norm": 2.3125, + "learning_rate": 8.350958209376138e-05, + "loss": 1.5341, + "step": 6098 + }, + { + "epoch": 0.2665763363783382, + "grad_norm": 2.109375, + "learning_rate": 8.350448393191295e-05, + "loss": 1.7144, + "step": 6099 + }, + { + "epoch": 0.2666200445823681, + "grad_norm": 3.640625, + "learning_rate": 8.349938513778613e-05, + "loss": 1.6064, + "step": 6100 + }, + { + "epoch": 0.26666375278639803, + "grad_norm": 2.1875, + "learning_rate": 8.349428571147717e-05, + "loss": 1.8313, + "step": 6101 + }, + { + "epoch": 0.2667074609904279, + "grad_norm": 2.90625, + "learning_rate": 8.348918565308226e-05, + "loss": 3.0004, + "step": 6102 + }, + { + "epoch": 0.2667511691944578, + "grad_norm": 2.15625, + "learning_rate": 8.34840849626977e-05, + "loss": 1.8625, + "step": 6103 + }, + { + "epoch": 0.2667948773984877, + "grad_norm": 3.140625, + "learning_rate": 8.347898364041973e-05, + "loss": 2.0085, + "step": 6104 + }, + { + "epoch": 0.2668385856025176, + "grad_norm": 2.296875, + "learning_rate": 8.34738816863446e-05, + "loss": 2.2929, + "step": 6105 + }, + { + "epoch": 0.2668822938065475, + "grad_norm": 2.390625, + "learning_rate": 8.34687791005686e-05, + "loss": 2.3305, + "step": 6106 + }, + { + "epoch": 0.26692600201057737, + "grad_norm": 2.0625, + "learning_rate": 8.346367588318804e-05, + "loss": 1.8152, + "step": 6107 + }, + { + "epoch": 0.2669697102146073, + "grad_norm": 2.34375, + "learning_rate": 8.345857203429919e-05, + "loss": 2.0315, + "step": 6108 + }, + { + "epoch": 0.2670134184186372, + "grad_norm": 2.546875, + "learning_rate": 8.345346755399841e-05, + "loss": 2.0062, + "step": 6109 + }, + { + "epoch": 0.26705712662266706, + "grad_norm": 2.265625, + "learning_rate": 8.344836244238199e-05, + "loss": 2.1459, + "step": 6110 + }, + { + "epoch": 0.267100834826697, + "grad_norm": 2.359375, + "learning_rate": 8.344325669954631e-05, + "loss": 1.3544, + "step": 6111 + }, + { + "epoch": 0.2671445430307269, + "grad_norm": 2.15625, + "learning_rate": 8.343815032558768e-05, + "loss": 2.2826, + "step": 6112 + }, + { + "epoch": 0.26718825123475676, + "grad_norm": 1.8828125, + "learning_rate": 8.34330433206025e-05, + "loss": 1.2367, + "step": 6113 + }, + { + "epoch": 0.26723195943878664, + "grad_norm": 2.546875, + "learning_rate": 8.342793568468713e-05, + "loss": 1.9825, + "step": 6114 + }, + { + "epoch": 0.2672756676428166, + "grad_norm": 2.15625, + "learning_rate": 8.342282741793796e-05, + "loss": 1.8248, + "step": 6115 + }, + { + "epoch": 0.26731937584684645, + "grad_norm": 2.015625, + "learning_rate": 8.34177185204514e-05, + "loss": 1.5443, + "step": 6116 + }, + { + "epoch": 0.26736308405087633, + "grad_norm": 3.0, + "learning_rate": 8.341260899232383e-05, + "loss": 2.6857, + "step": 6117 + }, + { + "epoch": 0.26740679225490627, + "grad_norm": 1.8515625, + "learning_rate": 8.340749883365174e-05, + "loss": 1.6376, + "step": 6118 + }, + { + "epoch": 0.26745050045893615, + "grad_norm": 2.265625, + "learning_rate": 8.340238804453148e-05, + "loss": 2.387, + "step": 6119 + }, + { + "epoch": 0.267494208662966, + "grad_norm": 1.9609375, + "learning_rate": 8.339727662505957e-05, + "loss": 1.637, + "step": 6120 + }, + { + "epoch": 0.26753791686699596, + "grad_norm": 2.625, + "learning_rate": 8.339216457533244e-05, + "loss": 1.8343, + "step": 6121 + }, + { + "epoch": 0.26758162507102584, + "grad_norm": 2.375, + "learning_rate": 8.338705189544655e-05, + "loss": 2.1124, + "step": 6122 + }, + { + "epoch": 0.2676253332750557, + "grad_norm": 2.328125, + "learning_rate": 8.338193858549842e-05, + "loss": 1.2943, + "step": 6123 + }, + { + "epoch": 0.2676690414790856, + "grad_norm": 1.9609375, + "learning_rate": 8.337682464558452e-05, + "loss": 1.8715, + "step": 6124 + }, + { + "epoch": 0.26771274968311554, + "grad_norm": 2.390625, + "learning_rate": 8.337171007580135e-05, + "loss": 1.9677, + "step": 6125 + }, + { + "epoch": 0.2677564578871454, + "grad_norm": 2.4375, + "learning_rate": 8.336659487624545e-05, + "loss": 2.3621, + "step": 6126 + }, + { + "epoch": 0.2678001660911753, + "grad_norm": 2.046875, + "learning_rate": 8.336147904701332e-05, + "loss": 1.705, + "step": 6127 + }, + { + "epoch": 0.26784387429520523, + "grad_norm": 1.984375, + "learning_rate": 8.335636258820155e-05, + "loss": 1.5326, + "step": 6128 + }, + { + "epoch": 0.2678875824992351, + "grad_norm": 2.734375, + "learning_rate": 8.335124549990667e-05, + "loss": 2.4204, + "step": 6129 + }, + { + "epoch": 0.267931290703265, + "grad_norm": 1.9375, + "learning_rate": 8.334612778222524e-05, + "loss": 1.5847, + "step": 6130 + }, + { + "epoch": 0.2679749989072949, + "grad_norm": 2.125, + "learning_rate": 8.334100943525385e-05, + "loss": 1.3181, + "step": 6131 + }, + { + "epoch": 0.2680187071113248, + "grad_norm": 2.046875, + "learning_rate": 8.333589045908907e-05, + "loss": 2.1265, + "step": 6132 + }, + { + "epoch": 0.2680624153153547, + "grad_norm": 2.046875, + "learning_rate": 8.333077085382752e-05, + "loss": 1.4887, + "step": 6133 + }, + { + "epoch": 0.26810612351938456, + "grad_norm": 2.4375, + "learning_rate": 8.332565061956584e-05, + "loss": 1.9686, + "step": 6134 + }, + { + "epoch": 0.2681498317234145, + "grad_norm": 2.640625, + "learning_rate": 8.332052975640061e-05, + "loss": 1.8269, + "step": 6135 + }, + { + "epoch": 0.2681935399274444, + "grad_norm": 2.0625, + "learning_rate": 8.33154082644285e-05, + "loss": 1.8044, + "step": 6136 + }, + { + "epoch": 0.26823724813147426, + "grad_norm": 2.109375, + "learning_rate": 8.331028614374612e-05, + "loss": 1.8124, + "step": 6137 + }, + { + "epoch": 0.2682809563355042, + "grad_norm": 2.484375, + "learning_rate": 8.330516339445018e-05, + "loss": 2.2855, + "step": 6138 + }, + { + "epoch": 0.2683246645395341, + "grad_norm": 2.25, + "learning_rate": 8.330004001663733e-05, + "loss": 1.9794, + "step": 6139 + }, + { + "epoch": 0.26836837274356395, + "grad_norm": 2.21875, + "learning_rate": 8.329491601040427e-05, + "loss": 2.4759, + "step": 6140 + }, + { + "epoch": 0.2684120809475939, + "grad_norm": 2.484375, + "learning_rate": 8.328979137584767e-05, + "loss": 1.6447, + "step": 6141 + }, + { + "epoch": 0.26845578915162377, + "grad_norm": 2.34375, + "learning_rate": 8.328466611306427e-05, + "loss": 2.2443, + "step": 6142 + }, + { + "epoch": 0.26849949735565365, + "grad_norm": 2.359375, + "learning_rate": 8.327954022215076e-05, + "loss": 1.942, + "step": 6143 + }, + { + "epoch": 0.26854320555968353, + "grad_norm": 2.078125, + "learning_rate": 8.32744137032039e-05, + "loss": 1.8281, + "step": 6144 + }, + { + "epoch": 0.26858691376371346, + "grad_norm": 3.6875, + "learning_rate": 8.326928655632043e-05, + "loss": 1.2905, + "step": 6145 + }, + { + "epoch": 0.26863062196774334, + "grad_norm": 2.0625, + "learning_rate": 8.32641587815971e-05, + "loss": 1.9108, + "step": 6146 + }, + { + "epoch": 0.2686743301717732, + "grad_norm": 2.34375, + "learning_rate": 8.325903037913067e-05, + "loss": 2.4243, + "step": 6147 + }, + { + "epoch": 0.26871803837580316, + "grad_norm": 2.171875, + "learning_rate": 8.325390134901794e-05, + "loss": 1.801, + "step": 6148 + }, + { + "epoch": 0.26876174657983304, + "grad_norm": 2.125, + "learning_rate": 8.324877169135569e-05, + "loss": 1.8057, + "step": 6149 + }, + { + "epoch": 0.2688054547838629, + "grad_norm": 2.078125, + "learning_rate": 8.324364140624073e-05, + "loss": 2.0626, + "step": 6150 + }, + { + "epoch": 0.26884916298789285, + "grad_norm": 2.09375, + "learning_rate": 8.323851049376987e-05, + "loss": 1.8675, + "step": 6151 + }, + { + "epoch": 0.26889287119192273, + "grad_norm": 1.9453125, + "learning_rate": 8.323337895403994e-05, + "loss": 1.7426, + "step": 6152 + }, + { + "epoch": 0.2689365793959526, + "grad_norm": 2.21875, + "learning_rate": 8.322824678714776e-05, + "loss": 1.7979, + "step": 6153 + }, + { + "epoch": 0.2689802875999825, + "grad_norm": 2.265625, + "learning_rate": 8.322311399319023e-05, + "loss": 1.97, + "step": 6154 + }, + { + "epoch": 0.26902399580401243, + "grad_norm": 1.921875, + "learning_rate": 8.321798057226417e-05, + "loss": 1.6476, + "step": 6155 + }, + { + "epoch": 0.2690677040080423, + "grad_norm": 2.0, + "learning_rate": 8.32128465244665e-05, + "loss": 1.8328, + "step": 6156 + }, + { + "epoch": 0.2691114122120722, + "grad_norm": 2.4375, + "learning_rate": 8.320771184989404e-05, + "loss": 1.6689, + "step": 6157 + }, + { + "epoch": 0.2691551204161021, + "grad_norm": 2.75, + "learning_rate": 8.320257654864374e-05, + "loss": 1.8429, + "step": 6158 + }, + { + "epoch": 0.269198828620132, + "grad_norm": 2.453125, + "learning_rate": 8.31974406208125e-05, + "loss": 1.3874, + "step": 6159 + }, + { + "epoch": 0.2692425368241619, + "grad_norm": 3.5625, + "learning_rate": 8.319230406649723e-05, + "loss": 2.851, + "step": 6160 + }, + { + "epoch": 0.2692862450281918, + "grad_norm": 1.984375, + "learning_rate": 8.318716688579488e-05, + "loss": 1.8165, + "step": 6161 + }, + { + "epoch": 0.2693299532322217, + "grad_norm": 2.4375, + "learning_rate": 8.31820290788024e-05, + "loss": 1.682, + "step": 6162 + }, + { + "epoch": 0.2693736614362516, + "grad_norm": 1.8828125, + "learning_rate": 8.317689064561671e-05, + "loss": 1.5811, + "step": 6163 + }, + { + "epoch": 0.26941736964028146, + "grad_norm": 2.078125, + "learning_rate": 8.317175158633483e-05, + "loss": 1.8027, + "step": 6164 + }, + { + "epoch": 0.2694610778443114, + "grad_norm": 2.015625, + "learning_rate": 8.316661190105371e-05, + "loss": 1.8132, + "step": 6165 + }, + { + "epoch": 0.26950478604834127, + "grad_norm": 1.9140625, + "learning_rate": 8.316147158987036e-05, + "loss": 1.6619, + "step": 6166 + }, + { + "epoch": 0.26954849425237115, + "grad_norm": 2.484375, + "learning_rate": 8.315633065288176e-05, + "loss": 1.9372, + "step": 6167 + }, + { + "epoch": 0.2695922024564011, + "grad_norm": 2.578125, + "learning_rate": 8.315118909018495e-05, + "loss": 2.3095, + "step": 6168 + }, + { + "epoch": 0.26963591066043097, + "grad_norm": 1.9453125, + "learning_rate": 8.314604690187695e-05, + "loss": 1.9157, + "step": 6169 + }, + { + "epoch": 0.26967961886446085, + "grad_norm": 2.234375, + "learning_rate": 8.314090408805482e-05, + "loss": 2.1959, + "step": 6170 + }, + { + "epoch": 0.2697233270684908, + "grad_norm": 1.9296875, + "learning_rate": 8.313576064881558e-05, + "loss": 1.7974, + "step": 6171 + }, + { + "epoch": 0.26976703527252066, + "grad_norm": 1.9765625, + "learning_rate": 8.313061658425632e-05, + "loss": 2.098, + "step": 6172 + }, + { + "epoch": 0.26981074347655054, + "grad_norm": 2.140625, + "learning_rate": 8.312547189447409e-05, + "loss": 1.4758, + "step": 6173 + }, + { + "epoch": 0.2698544516805804, + "grad_norm": 2.421875, + "learning_rate": 8.312032657956599e-05, + "loss": 2.3724, + "step": 6174 + }, + { + "epoch": 0.26989815988461036, + "grad_norm": 2.875, + "learning_rate": 8.311518063962914e-05, + "loss": 2.2493, + "step": 6175 + }, + { + "epoch": 0.26994186808864024, + "grad_norm": 2.8125, + "learning_rate": 8.311003407476064e-05, + "loss": 2.4625, + "step": 6176 + }, + { + "epoch": 0.2699855762926701, + "grad_norm": 1.90625, + "learning_rate": 8.31048868850576e-05, + "loss": 1.5971, + "step": 6177 + }, + { + "epoch": 0.27002928449670005, + "grad_norm": 2.328125, + "learning_rate": 8.309973907061715e-05, + "loss": 2.2358, + "step": 6178 + }, + { + "epoch": 0.27007299270072993, + "grad_norm": 2.25, + "learning_rate": 8.309459063153646e-05, + "loss": 1.8094, + "step": 6179 + }, + { + "epoch": 0.2701167009047598, + "grad_norm": 2.09375, + "learning_rate": 8.308944156791268e-05, + "loss": 1.8803, + "step": 6180 + }, + { + "epoch": 0.27016040910878975, + "grad_norm": 2.09375, + "learning_rate": 8.308429187984297e-05, + "loss": 1.9824, + "step": 6181 + }, + { + "epoch": 0.2702041173128196, + "grad_norm": 2.78125, + "learning_rate": 8.307914156742454e-05, + "loss": 1.7535, + "step": 6182 + }, + { + "epoch": 0.2702478255168495, + "grad_norm": 4.3125, + "learning_rate": 8.307399063075453e-05, + "loss": 1.4322, + "step": 6183 + }, + { + "epoch": 0.2702915337208794, + "grad_norm": 2.265625, + "learning_rate": 8.306883906993022e-05, + "loss": 1.9071, + "step": 6184 + }, + { + "epoch": 0.2703352419249093, + "grad_norm": 2.578125, + "learning_rate": 8.306368688504876e-05, + "loss": 2.7365, + "step": 6185 + }, + { + "epoch": 0.2703789501289392, + "grad_norm": 1.921875, + "learning_rate": 8.305853407620742e-05, + "loss": 1.8138, + "step": 6186 + }, + { + "epoch": 0.2704226583329691, + "grad_norm": 2.1875, + "learning_rate": 8.305338064350342e-05, + "loss": 2.2551, + "step": 6187 + }, + { + "epoch": 0.270466366536999, + "grad_norm": 2.03125, + "learning_rate": 8.304822658703402e-05, + "loss": 1.7694, + "step": 6188 + }, + { + "epoch": 0.2705100747410289, + "grad_norm": 2.34375, + "learning_rate": 8.304307190689649e-05, + "loss": 1.9035, + "step": 6189 + }, + { + "epoch": 0.2705537829450588, + "grad_norm": 2.796875, + "learning_rate": 8.30379166031881e-05, + "loss": 2.906, + "step": 6190 + }, + { + "epoch": 0.2705974911490887, + "grad_norm": 2.5, + "learning_rate": 8.303276067600614e-05, + "loss": 1.8274, + "step": 6191 + }, + { + "epoch": 0.2706411993531186, + "grad_norm": 4.8125, + "learning_rate": 8.30276041254479e-05, + "loss": 1.8758, + "step": 6192 + }, + { + "epoch": 0.27068490755714847, + "grad_norm": 1.9609375, + "learning_rate": 8.30224469516107e-05, + "loss": 1.784, + "step": 6193 + }, + { + "epoch": 0.27072861576117835, + "grad_norm": 2.109375, + "learning_rate": 8.301728915459188e-05, + "loss": 1.81, + "step": 6194 + }, + { + "epoch": 0.2707723239652083, + "grad_norm": 2.328125, + "learning_rate": 8.301213073448874e-05, + "loss": 1.76, + "step": 6195 + }, + { + "epoch": 0.27081603216923816, + "grad_norm": 2.3125, + "learning_rate": 8.300697169139867e-05, + "loss": 1.9981, + "step": 6196 + }, + { + "epoch": 0.27085974037326804, + "grad_norm": 2.203125, + "learning_rate": 8.300181202541899e-05, + "loss": 2.1087, + "step": 6197 + }, + { + "epoch": 0.270903448577298, + "grad_norm": 2.6875, + "learning_rate": 8.299665173664708e-05, + "loss": 2.4448, + "step": 6198 + }, + { + "epoch": 0.27094715678132786, + "grad_norm": 2.546875, + "learning_rate": 8.299149082518034e-05, + "loss": 2.4269, + "step": 6199 + }, + { + "epoch": 0.27099086498535774, + "grad_norm": 2.484375, + "learning_rate": 8.298632929111614e-05, + "loss": 2.4778, + "step": 6200 + }, + { + "epoch": 0.2710345731893877, + "grad_norm": 2.328125, + "learning_rate": 8.298116713455191e-05, + "loss": 1.7866, + "step": 6201 + }, + { + "epoch": 0.27107828139341755, + "grad_norm": 2.65625, + "learning_rate": 8.297600435558505e-05, + "loss": 1.9275, + "step": 6202 + }, + { + "epoch": 0.27112198959744743, + "grad_norm": 2.703125, + "learning_rate": 8.297084095431298e-05, + "loss": 2.4905, + "step": 6203 + }, + { + "epoch": 0.2711656978014773, + "grad_norm": 2.625, + "learning_rate": 8.296567693083317e-05, + "loss": 2.0117, + "step": 6204 + }, + { + "epoch": 0.27120940600550725, + "grad_norm": 1.859375, + "learning_rate": 8.296051228524305e-05, + "loss": 1.6825, + "step": 6205 + }, + { + "epoch": 0.2712531142095371, + "grad_norm": 2.5625, + "learning_rate": 8.29553470176401e-05, + "loss": 2.1916, + "step": 6206 + }, + { + "epoch": 0.271296822413567, + "grad_norm": 1.859375, + "learning_rate": 8.295018112812179e-05, + "loss": 1.6755, + "step": 6207 + }, + { + "epoch": 0.27134053061759694, + "grad_norm": 2.03125, + "learning_rate": 8.294501461678559e-05, + "loss": 1.9042, + "step": 6208 + }, + { + "epoch": 0.2713842388216268, + "grad_norm": 4.15625, + "learning_rate": 8.293984748372901e-05, + "loss": 2.5244, + "step": 6209 + }, + { + "epoch": 0.2714279470256567, + "grad_norm": 2.546875, + "learning_rate": 8.293467972904958e-05, + "loss": 2.1528, + "step": 6210 + }, + { + "epoch": 0.27147165522968664, + "grad_norm": 2.125, + "learning_rate": 8.292951135284481e-05, + "loss": 1.8469, + "step": 6211 + }, + { + "epoch": 0.2715153634337165, + "grad_norm": 2.15625, + "learning_rate": 8.292434235521222e-05, + "loss": 1.7206, + "step": 6212 + }, + { + "epoch": 0.2715590716377464, + "grad_norm": 2.46875, + "learning_rate": 8.291917273624939e-05, + "loss": 1.7422, + "step": 6213 + }, + { + "epoch": 0.2716027798417763, + "grad_norm": 2.3125, + "learning_rate": 8.291400249605386e-05, + "loss": 2.3142, + "step": 6214 + }, + { + "epoch": 0.2716464880458062, + "grad_norm": 2.296875, + "learning_rate": 8.290883163472318e-05, + "loss": 2.354, + "step": 6215 + }, + { + "epoch": 0.2716901962498361, + "grad_norm": 1.765625, + "learning_rate": 8.290366015235497e-05, + "loss": 1.3597, + "step": 6216 + }, + { + "epoch": 0.27173390445386597, + "grad_norm": 2.84375, + "learning_rate": 8.289848804904678e-05, + "loss": 2.4125, + "step": 6217 + }, + { + "epoch": 0.2717776126578959, + "grad_norm": 2.21875, + "learning_rate": 8.289331532489626e-05, + "loss": 1.9229, + "step": 6218 + }, + { + "epoch": 0.2718213208619258, + "grad_norm": 1.9609375, + "learning_rate": 8.2888141980001e-05, + "loss": 1.9399, + "step": 6219 + }, + { + "epoch": 0.27186502906595567, + "grad_norm": 2.65625, + "learning_rate": 8.288296801445863e-05, + "loss": 2.0839, + "step": 6220 + }, + { + "epoch": 0.2719087372699856, + "grad_norm": 2.953125, + "learning_rate": 8.287779342836682e-05, + "loss": 1.7229, + "step": 6221 + }, + { + "epoch": 0.2719524454740155, + "grad_norm": 2.359375, + "learning_rate": 8.287261822182316e-05, + "loss": 1.7666, + "step": 6222 + }, + { + "epoch": 0.27199615367804536, + "grad_norm": 2.234375, + "learning_rate": 8.286744239492537e-05, + "loss": 1.9596, + "step": 6223 + }, + { + "epoch": 0.27203986188207524, + "grad_norm": 2.265625, + "learning_rate": 8.28622659477711e-05, + "loss": 1.8435, + "step": 6224 + }, + { + "epoch": 0.2720835700861052, + "grad_norm": 2.25, + "learning_rate": 8.285708888045803e-05, + "loss": 1.8474, + "step": 6225 + }, + { + "epoch": 0.27212727829013506, + "grad_norm": 2.3125, + "learning_rate": 8.285191119308389e-05, + "loss": 2.607, + "step": 6226 + }, + { + "epoch": 0.27217098649416493, + "grad_norm": 2.296875, + "learning_rate": 8.284673288574637e-05, + "loss": 1.9533, + "step": 6227 + }, + { + "epoch": 0.27221469469819487, + "grad_norm": 2.6875, + "learning_rate": 8.284155395854318e-05, + "loss": 1.8539, + "step": 6228 + }, + { + "epoch": 0.27225840290222475, + "grad_norm": 2.203125, + "learning_rate": 8.283637441157209e-05, + "loss": 1.8384, + "step": 6229 + }, + { + "epoch": 0.27230211110625463, + "grad_norm": 1.984375, + "learning_rate": 8.283119424493082e-05, + "loss": 1.7905, + "step": 6230 + }, + { + "epoch": 0.27234581931028456, + "grad_norm": 2.0, + "learning_rate": 8.282601345871713e-05, + "loss": 2.0853, + "step": 6231 + }, + { + "epoch": 0.27238952751431444, + "grad_norm": 2.171875, + "learning_rate": 8.282083205302877e-05, + "loss": 2.1163, + "step": 6232 + }, + { + "epoch": 0.2724332357183443, + "grad_norm": 1.9375, + "learning_rate": 8.281565002796356e-05, + "loss": 1.4279, + "step": 6233 + }, + { + "epoch": 0.2724769439223742, + "grad_norm": 2.125, + "learning_rate": 8.281046738361926e-05, + "loss": 2.0867, + "step": 6234 + }, + { + "epoch": 0.27252065212640414, + "grad_norm": 2.671875, + "learning_rate": 8.28052841200937e-05, + "loss": 2.2017, + "step": 6235 + }, + { + "epoch": 0.272564360330434, + "grad_norm": 2.328125, + "learning_rate": 8.280010023748467e-05, + "loss": 2.464, + "step": 6236 + }, + { + "epoch": 0.2726080685344639, + "grad_norm": 1.8984375, + "learning_rate": 8.279491573589004e-05, + "loss": 1.7016, + "step": 6237 + }, + { + "epoch": 0.27265177673849383, + "grad_norm": 2.125, + "learning_rate": 8.278973061540758e-05, + "loss": 1.849, + "step": 6238 + }, + { + "epoch": 0.2726954849425237, + "grad_norm": 2.265625, + "learning_rate": 8.27845448761352e-05, + "loss": 2.3198, + "step": 6239 + }, + { + "epoch": 0.2727391931465536, + "grad_norm": 1.984375, + "learning_rate": 8.277935851817075e-05, + "loss": 2.0965, + "step": 6240 + }, + { + "epoch": 0.27278290135058353, + "grad_norm": 1.984375, + "learning_rate": 8.277417154161208e-05, + "loss": 1.7996, + "step": 6241 + }, + { + "epoch": 0.2728266095546134, + "grad_norm": 2.109375, + "learning_rate": 8.27689839465571e-05, + "loss": 1.7935, + "step": 6242 + }, + { + "epoch": 0.2728703177586433, + "grad_norm": 2.5, + "learning_rate": 8.27637957331037e-05, + "loss": 2.6535, + "step": 6243 + }, + { + "epoch": 0.27291402596267317, + "grad_norm": 2.5, + "learning_rate": 8.275860690134978e-05, + "loss": 1.6348, + "step": 6244 + }, + { + "epoch": 0.2729577341667031, + "grad_norm": 2.546875, + "learning_rate": 8.275341745139329e-05, + "loss": 2.5961, + "step": 6245 + }, + { + "epoch": 0.273001442370733, + "grad_norm": 2.25, + "learning_rate": 8.274822738333214e-05, + "loss": 1.988, + "step": 6246 + }, + { + "epoch": 0.27304515057476286, + "grad_norm": 1.984375, + "learning_rate": 8.274303669726426e-05, + "loss": 1.7639, + "step": 6247 + }, + { + "epoch": 0.2730888587787928, + "grad_norm": 1.890625, + "learning_rate": 8.273784539328763e-05, + "loss": 1.5711, + "step": 6248 + }, + { + "epoch": 0.2731325669828227, + "grad_norm": 2.171875, + "learning_rate": 8.273265347150021e-05, + "loss": 1.4728, + "step": 6249 + }, + { + "epoch": 0.27317627518685256, + "grad_norm": 2.09375, + "learning_rate": 8.272746093199996e-05, + "loss": 1.6939, + "step": 6250 + }, + { + "epoch": 0.2732199833908825, + "grad_norm": 2.3125, + "learning_rate": 8.272226777488492e-05, + "loss": 1.7364, + "step": 6251 + }, + { + "epoch": 0.2732636915949124, + "grad_norm": 3.109375, + "learning_rate": 8.271707400025306e-05, + "loss": 1.8905, + "step": 6252 + }, + { + "epoch": 0.27330739979894225, + "grad_norm": 2.34375, + "learning_rate": 8.271187960820237e-05, + "loss": 2.2283, + "step": 6253 + }, + { + "epoch": 0.27335110800297213, + "grad_norm": 2.09375, + "learning_rate": 8.270668459883093e-05, + "loss": 2.0876, + "step": 6254 + }, + { + "epoch": 0.27339481620700207, + "grad_norm": 2.0, + "learning_rate": 8.270148897223673e-05, + "loss": 1.9225, + "step": 6255 + }, + { + "epoch": 0.27343852441103195, + "grad_norm": 2.0, + "learning_rate": 8.269629272851785e-05, + "loss": 1.451, + "step": 6256 + }, + { + "epoch": 0.2734822326150618, + "grad_norm": 1.9453125, + "learning_rate": 8.269109586777234e-05, + "loss": 1.8739, + "step": 6257 + }, + { + "epoch": 0.27352594081909176, + "grad_norm": 1.9609375, + "learning_rate": 8.268589839009826e-05, + "loss": 1.9135, + "step": 6258 + }, + { + "epoch": 0.27356964902312164, + "grad_norm": 2.453125, + "learning_rate": 8.268070029559372e-05, + "loss": 1.818, + "step": 6259 + }, + { + "epoch": 0.2736133572271515, + "grad_norm": 2.265625, + "learning_rate": 8.267550158435679e-05, + "loss": 2.6027, + "step": 6260 + }, + { + "epoch": 0.27365706543118146, + "grad_norm": 2.578125, + "learning_rate": 8.26703022564856e-05, + "loss": 2.0903, + "step": 6261 + }, + { + "epoch": 0.27370077363521134, + "grad_norm": 2.734375, + "learning_rate": 8.266510231207824e-05, + "loss": 2.4899, + "step": 6262 + }, + { + "epoch": 0.2737444818392412, + "grad_norm": 2.265625, + "learning_rate": 8.265990175123286e-05, + "loss": 1.5422, + "step": 6263 + }, + { + "epoch": 0.2737881900432711, + "grad_norm": 2.1875, + "learning_rate": 8.265470057404761e-05, + "loss": 2.1211, + "step": 6264 + }, + { + "epoch": 0.27383189824730103, + "grad_norm": 1.921875, + "learning_rate": 8.264949878062062e-05, + "loss": 2.0299, + "step": 6265 + }, + { + "epoch": 0.2738756064513309, + "grad_norm": 2.0625, + "learning_rate": 8.264429637105009e-05, + "loss": 1.7817, + "step": 6266 + }, + { + "epoch": 0.2739193146553608, + "grad_norm": 2.0625, + "learning_rate": 8.263909334543416e-05, + "loss": 1.8252, + "step": 6267 + }, + { + "epoch": 0.2739630228593907, + "grad_norm": 2.375, + "learning_rate": 8.263388970387102e-05, + "loss": 2.439, + "step": 6268 + }, + { + "epoch": 0.2740067310634206, + "grad_norm": 2.453125, + "learning_rate": 8.262868544645892e-05, + "loss": 1.74, + "step": 6269 + }, + { + "epoch": 0.2740504392674505, + "grad_norm": 2.390625, + "learning_rate": 8.262348057329601e-05, + "loss": 1.8924, + "step": 6270 + }, + { + "epoch": 0.2740941474714804, + "grad_norm": 2.5625, + "learning_rate": 8.261827508448056e-05, + "loss": 3.097, + "step": 6271 + }, + { + "epoch": 0.2741378556755103, + "grad_norm": 2.46875, + "learning_rate": 8.261306898011077e-05, + "loss": 1.4289, + "step": 6272 + }, + { + "epoch": 0.2741815638795402, + "grad_norm": 2.578125, + "learning_rate": 8.26078622602849e-05, + "loss": 2.0246, + "step": 6273 + }, + { + "epoch": 0.27422527208357006, + "grad_norm": 2.375, + "learning_rate": 8.260265492510122e-05, + "loss": 2.4173, + "step": 6274 + }, + { + "epoch": 0.2742689802876, + "grad_norm": 2.46875, + "learning_rate": 8.259744697465799e-05, + "loss": 1.5699, + "step": 6275 + }, + { + "epoch": 0.2743126884916299, + "grad_norm": 3.5625, + "learning_rate": 8.25922384090535e-05, + "loss": 1.3334, + "step": 6276 + }, + { + "epoch": 0.27435639669565975, + "grad_norm": 2.25, + "learning_rate": 8.258702922838603e-05, + "loss": 1.9971, + "step": 6277 + }, + { + "epoch": 0.2744001048996897, + "grad_norm": 2.234375, + "learning_rate": 8.258181943275387e-05, + "loss": 1.7544, + "step": 6278 + }, + { + "epoch": 0.27444381310371957, + "grad_norm": 2.28125, + "learning_rate": 8.257660902225539e-05, + "loss": 2.1136, + "step": 6279 + }, + { + "epoch": 0.27448752130774945, + "grad_norm": 2.25, + "learning_rate": 8.257139799698886e-05, + "loss": 1.9613, + "step": 6280 + }, + { + "epoch": 0.2745312295117794, + "grad_norm": 2.953125, + "learning_rate": 8.256618635705267e-05, + "loss": 3.2578, + "step": 6281 + }, + { + "epoch": 0.27457493771580926, + "grad_norm": 6.0, + "learning_rate": 8.256097410254512e-05, + "loss": 1.0849, + "step": 6282 + }, + { + "epoch": 0.27461864591983914, + "grad_norm": 2.421875, + "learning_rate": 8.25557612335646e-05, + "loss": 2.0473, + "step": 6283 + }, + { + "epoch": 0.274662354123869, + "grad_norm": 3.125, + "learning_rate": 8.255054775020949e-05, + "loss": 1.511, + "step": 6284 + }, + { + "epoch": 0.27470606232789896, + "grad_norm": 2.203125, + "learning_rate": 8.254533365257817e-05, + "loss": 1.7947, + "step": 6285 + }, + { + "epoch": 0.27474977053192884, + "grad_norm": 2.8125, + "learning_rate": 8.254011894076904e-05, + "loss": 2.7416, + "step": 6286 + }, + { + "epoch": 0.2747934787359587, + "grad_norm": 2.359375, + "learning_rate": 8.25349036148805e-05, + "loss": 2.099, + "step": 6287 + }, + { + "epoch": 0.27483718693998865, + "grad_norm": 2.46875, + "learning_rate": 8.252968767501098e-05, + "loss": 2.0382, + "step": 6288 + }, + { + "epoch": 0.27488089514401853, + "grad_norm": 2.28125, + "learning_rate": 8.252447112125889e-05, + "loss": 1.3828, + "step": 6289 + }, + { + "epoch": 0.2749246033480484, + "grad_norm": 2.296875, + "learning_rate": 8.251925395372272e-05, + "loss": 1.9086, + "step": 6290 + }, + { + "epoch": 0.27496831155207835, + "grad_norm": 1.96875, + "learning_rate": 8.251403617250088e-05, + "loss": 1.6746, + "step": 6291 + }, + { + "epoch": 0.27501201975610823, + "grad_norm": 2.40625, + "learning_rate": 8.250881777769187e-05, + "loss": 1.8238, + "step": 6292 + }, + { + "epoch": 0.2750557279601381, + "grad_norm": 2.3125, + "learning_rate": 8.250359876939415e-05, + "loss": 2.2168, + "step": 6293 + }, + { + "epoch": 0.275099436164168, + "grad_norm": 1.9453125, + "learning_rate": 8.249837914770621e-05, + "loss": 1.8462, + "step": 6294 + }, + { + "epoch": 0.2751431443681979, + "grad_norm": 1.9453125, + "learning_rate": 8.249315891272655e-05, + "loss": 1.9032, + "step": 6295 + }, + { + "epoch": 0.2751868525722278, + "grad_norm": 2.1875, + "learning_rate": 8.24879380645537e-05, + "loss": 1.4899, + "step": 6296 + }, + { + "epoch": 0.2752305607762577, + "grad_norm": 2.078125, + "learning_rate": 8.248271660328618e-05, + "loss": 2.1009, + "step": 6297 + }, + { + "epoch": 0.2752742689802876, + "grad_norm": 2.171875, + "learning_rate": 8.247749452902251e-05, + "loss": 1.9969, + "step": 6298 + }, + { + "epoch": 0.2753179771843175, + "grad_norm": 2.0, + "learning_rate": 8.247227184186126e-05, + "loss": 2.2424, + "step": 6299 + }, + { + "epoch": 0.2753616853883474, + "grad_norm": 2.734375, + "learning_rate": 8.246704854190098e-05, + "loss": 1.373, + "step": 6300 + }, + { + "epoch": 0.2754053935923773, + "grad_norm": 2.328125, + "learning_rate": 8.246182462924023e-05, + "loss": 2.7404, + "step": 6301 + }, + { + "epoch": 0.2754491017964072, + "grad_norm": 2.078125, + "learning_rate": 8.24566001039776e-05, + "loss": 1.9392, + "step": 6302 + }, + { + "epoch": 0.27549281000043707, + "grad_norm": 2.046875, + "learning_rate": 8.245137496621169e-05, + "loss": 1.6657, + "step": 6303 + }, + { + "epoch": 0.27553651820446695, + "grad_norm": 2.1875, + "learning_rate": 8.244614921604111e-05, + "loss": 1.6642, + "step": 6304 + }, + { + "epoch": 0.2755802264084969, + "grad_norm": 2.71875, + "learning_rate": 8.244092285356446e-05, + "loss": 1.851, + "step": 6305 + }, + { + "epoch": 0.27562393461252677, + "grad_norm": 2.875, + "learning_rate": 8.24356958788804e-05, + "loss": 1.5946, + "step": 6306 + }, + { + "epoch": 0.27566764281655665, + "grad_norm": 2.453125, + "learning_rate": 8.243046829208754e-05, + "loss": 1.6248, + "step": 6307 + }, + { + "epoch": 0.2757113510205866, + "grad_norm": 2.25, + "learning_rate": 8.242524009328454e-05, + "loss": 2.4631, + "step": 6308 + }, + { + "epoch": 0.27575505922461646, + "grad_norm": 2.390625, + "learning_rate": 8.242001128257007e-05, + "loss": 1.7864, + "step": 6309 + }, + { + "epoch": 0.27579876742864634, + "grad_norm": 3.53125, + "learning_rate": 8.24147818600428e-05, + "loss": 1.8237, + "step": 6310 + }, + { + "epoch": 0.2758424756326763, + "grad_norm": 2.125, + "learning_rate": 8.240955182580143e-05, + "loss": 1.5502, + "step": 6311 + }, + { + "epoch": 0.27588618383670616, + "grad_norm": 2.078125, + "learning_rate": 8.240432117994464e-05, + "loss": 2.0371, + "step": 6312 + }, + { + "epoch": 0.27592989204073604, + "grad_norm": 2.09375, + "learning_rate": 8.239908992257115e-05, + "loss": 1.7174, + "step": 6313 + }, + { + "epoch": 0.2759736002447659, + "grad_norm": 2.40625, + "learning_rate": 8.239385805377966e-05, + "loss": 1.4633, + "step": 6314 + }, + { + "epoch": 0.27601730844879585, + "grad_norm": 3.046875, + "learning_rate": 8.238862557366893e-05, + "loss": 1.6023, + "step": 6315 + }, + { + "epoch": 0.27606101665282573, + "grad_norm": 3.65625, + "learning_rate": 8.23833924823377e-05, + "loss": 3.2681, + "step": 6316 + }, + { + "epoch": 0.2761047248568556, + "grad_norm": 1.8828125, + "learning_rate": 8.237815877988472e-05, + "loss": 1.6448, + "step": 6317 + }, + { + "epoch": 0.27614843306088555, + "grad_norm": 1.9296875, + "learning_rate": 8.237292446640877e-05, + "loss": 1.7115, + "step": 6318 + }, + { + "epoch": 0.2761921412649154, + "grad_norm": 2.125, + "learning_rate": 8.236768954200862e-05, + "loss": 1.8672, + "step": 6319 + }, + { + "epoch": 0.2762358494689453, + "grad_norm": 2.390625, + "learning_rate": 8.236245400678304e-05, + "loss": 2.6833, + "step": 6320 + }, + { + "epoch": 0.27627955767297524, + "grad_norm": 2.15625, + "learning_rate": 8.235721786083087e-05, + "loss": 2.5507, + "step": 6321 + }, + { + "epoch": 0.2763232658770051, + "grad_norm": 2.125, + "learning_rate": 8.23519811042509e-05, + "loss": 1.4608, + "step": 6322 + }, + { + "epoch": 0.276366974081035, + "grad_norm": 1.828125, + "learning_rate": 8.234674373714196e-05, + "loss": 1.4405, + "step": 6323 + }, + { + "epoch": 0.2764106822850649, + "grad_norm": 2.640625, + "learning_rate": 8.234150575960288e-05, + "loss": 1.7559, + "step": 6324 + }, + { + "epoch": 0.2764543904890948, + "grad_norm": 2.140625, + "learning_rate": 8.233626717173251e-05, + "loss": 2.0372, + "step": 6325 + }, + { + "epoch": 0.2764980986931247, + "grad_norm": 2.78125, + "learning_rate": 8.233102797362974e-05, + "loss": 2.3649, + "step": 6326 + }, + { + "epoch": 0.2765418068971546, + "grad_norm": 2.84375, + "learning_rate": 8.23257881653934e-05, + "loss": 1.9945, + "step": 6327 + }, + { + "epoch": 0.2765855151011845, + "grad_norm": 2.1875, + "learning_rate": 8.232054774712238e-05, + "loss": 1.8691, + "step": 6328 + }, + { + "epoch": 0.2766292233052144, + "grad_norm": 2.234375, + "learning_rate": 8.23153067189156e-05, + "loss": 1.6846, + "step": 6329 + }, + { + "epoch": 0.27667293150924427, + "grad_norm": 2.328125, + "learning_rate": 8.231006508087196e-05, + "loss": 1.7247, + "step": 6330 + }, + { + "epoch": 0.2767166397132742, + "grad_norm": 2.46875, + "learning_rate": 8.230482283309035e-05, + "loss": 2.3358, + "step": 6331 + }, + { + "epoch": 0.2767603479173041, + "grad_norm": 2.21875, + "learning_rate": 8.229957997566971e-05, + "loss": 1.9732, + "step": 6332 + }, + { + "epoch": 0.27680405612133396, + "grad_norm": 2.046875, + "learning_rate": 8.2294336508709e-05, + "loss": 1.8837, + "step": 6333 + }, + { + "epoch": 0.27684776432536384, + "grad_norm": 1.8515625, + "learning_rate": 8.228909243230714e-05, + "loss": 1.7376, + "step": 6334 + }, + { + "epoch": 0.2768914725293938, + "grad_norm": 2.25, + "learning_rate": 8.228384774656312e-05, + "loss": 1.9227, + "step": 6335 + }, + { + "epoch": 0.27693518073342366, + "grad_norm": 2.171875, + "learning_rate": 8.227860245157593e-05, + "loss": 1.8045, + "step": 6336 + }, + { + "epoch": 0.27697888893745354, + "grad_norm": 2.90625, + "learning_rate": 8.22733565474445e-05, + "loss": 2.3304, + "step": 6337 + }, + { + "epoch": 0.2770225971414835, + "grad_norm": 1.9296875, + "learning_rate": 8.226811003426788e-05, + "loss": 1.5063, + "step": 6338 + }, + { + "epoch": 0.27706630534551335, + "grad_norm": 2.4375, + "learning_rate": 8.226286291214504e-05, + "loss": 1.6672, + "step": 6339 + }, + { + "epoch": 0.27711001354954323, + "grad_norm": 1.9765625, + "learning_rate": 8.225761518117503e-05, + "loss": 1.7877, + "step": 6340 + }, + { + "epoch": 0.27715372175357317, + "grad_norm": 1.8671875, + "learning_rate": 8.225236684145688e-05, + "loss": 1.9904, + "step": 6341 + }, + { + "epoch": 0.27719742995760305, + "grad_norm": 2.03125, + "learning_rate": 8.224711789308963e-05, + "loss": 1.7372, + "step": 6342 + }, + { + "epoch": 0.2772411381616329, + "grad_norm": 3.390625, + "learning_rate": 8.224186833617231e-05, + "loss": 2.7278, + "step": 6343 + }, + { + "epoch": 0.27728484636566286, + "grad_norm": 3.53125, + "learning_rate": 8.223661817080403e-05, + "loss": 2.5373, + "step": 6344 + }, + { + "epoch": 0.27732855456969274, + "grad_norm": 2.125, + "learning_rate": 8.223136739708383e-05, + "loss": 1.7289, + "step": 6345 + }, + { + "epoch": 0.2773722627737226, + "grad_norm": 2.046875, + "learning_rate": 8.222611601511083e-05, + "loss": 1.7691, + "step": 6346 + }, + { + "epoch": 0.2774159709777525, + "grad_norm": 2.703125, + "learning_rate": 8.222086402498412e-05, + "loss": 2.8211, + "step": 6347 + }, + { + "epoch": 0.27745967918178244, + "grad_norm": 2.328125, + "learning_rate": 8.221561142680281e-05, + "loss": 1.4104, + "step": 6348 + }, + { + "epoch": 0.2775033873858123, + "grad_norm": 2.296875, + "learning_rate": 8.221035822066601e-05, + "loss": 2.357, + "step": 6349 + }, + { + "epoch": 0.2775470955898422, + "grad_norm": 2.125, + "learning_rate": 8.220510440667289e-05, + "loss": 1.7921, + "step": 6350 + }, + { + "epoch": 0.27759080379387213, + "grad_norm": 2.15625, + "learning_rate": 8.219984998492256e-05, + "loss": 1.8645, + "step": 6351 + }, + { + "epoch": 0.277634511997902, + "grad_norm": 1.9921875, + "learning_rate": 8.219459495551421e-05, + "loss": 1.8275, + "step": 6352 + }, + { + "epoch": 0.2776782202019319, + "grad_norm": 2.0, + "learning_rate": 8.218933931854697e-05, + "loss": 1.5563, + "step": 6353 + }, + { + "epoch": 0.2777219284059618, + "grad_norm": 2.046875, + "learning_rate": 8.218408307412006e-05, + "loss": 1.6998, + "step": 6354 + }, + { + "epoch": 0.2777656366099917, + "grad_norm": 2.125, + "learning_rate": 8.217882622233268e-05, + "loss": 1.4991, + "step": 6355 + }, + { + "epoch": 0.2778093448140216, + "grad_norm": 2.4375, + "learning_rate": 8.217356876328402e-05, + "loss": 2.0588, + "step": 6356 + }, + { + "epoch": 0.27785305301805147, + "grad_norm": 2.078125, + "learning_rate": 8.216831069707326e-05, + "loss": 2.2262, + "step": 6357 + }, + { + "epoch": 0.2778967612220814, + "grad_norm": 2.296875, + "learning_rate": 8.216305202379968e-05, + "loss": 1.8249, + "step": 6358 + }, + { + "epoch": 0.2779404694261113, + "grad_norm": 2.15625, + "learning_rate": 8.215779274356248e-05, + "loss": 1.7044, + "step": 6359 + }, + { + "epoch": 0.27798417763014116, + "grad_norm": 2.21875, + "learning_rate": 8.215253285646094e-05, + "loss": 1.7547, + "step": 6360 + }, + { + "epoch": 0.2780278858341711, + "grad_norm": 2.21875, + "learning_rate": 8.214727236259431e-05, + "loss": 1.8101, + "step": 6361 + }, + { + "epoch": 0.278071594038201, + "grad_norm": 2.21875, + "learning_rate": 8.214201126206184e-05, + "loss": 2.1843, + "step": 6362 + }, + { + "epoch": 0.27811530224223086, + "grad_norm": 2.140625, + "learning_rate": 8.213674955496287e-05, + "loss": 2.5314, + "step": 6363 + }, + { + "epoch": 0.2781590104462608, + "grad_norm": 1.9921875, + "learning_rate": 8.213148724139665e-05, + "loss": 1.6071, + "step": 6364 + }, + { + "epoch": 0.27820271865029067, + "grad_norm": 2.046875, + "learning_rate": 8.212622432146248e-05, + "loss": 1.8472, + "step": 6365 + }, + { + "epoch": 0.27824642685432055, + "grad_norm": 2.09375, + "learning_rate": 8.212096079525974e-05, + "loss": 1.2409, + "step": 6366 + }, + { + "epoch": 0.27829013505835043, + "grad_norm": 2.015625, + "learning_rate": 8.211569666288769e-05, + "loss": 1.8642, + "step": 6367 + }, + { + "epoch": 0.27833384326238036, + "grad_norm": 2.484375, + "learning_rate": 8.211043192444572e-05, + "loss": 2.009, + "step": 6368 + }, + { + "epoch": 0.27837755146641024, + "grad_norm": 3.328125, + "learning_rate": 8.210516658003316e-05, + "loss": 1.6596, + "step": 6369 + }, + { + "epoch": 0.2784212596704401, + "grad_norm": 2.234375, + "learning_rate": 8.209990062974936e-05, + "loss": 1.956, + "step": 6370 + }, + { + "epoch": 0.27846496787447006, + "grad_norm": 2.40625, + "learning_rate": 8.209463407369373e-05, + "loss": 2.223, + "step": 6371 + }, + { + "epoch": 0.27850867607849994, + "grad_norm": 2.140625, + "learning_rate": 8.208936691196565e-05, + "loss": 1.4611, + "step": 6372 + }, + { + "epoch": 0.2785523842825298, + "grad_norm": 2.03125, + "learning_rate": 8.20840991446645e-05, + "loss": 1.7562, + "step": 6373 + }, + { + "epoch": 0.27859609248655975, + "grad_norm": 2.203125, + "learning_rate": 8.207883077188971e-05, + "loss": 1.8233, + "step": 6374 + }, + { + "epoch": 0.27863980069058963, + "grad_norm": 2.359375, + "learning_rate": 8.20735617937407e-05, + "loss": 2.423, + "step": 6375 + }, + { + "epoch": 0.2786835088946195, + "grad_norm": 1.8671875, + "learning_rate": 8.20682922103169e-05, + "loss": 1.5783, + "step": 6376 + }, + { + "epoch": 0.2787272170986494, + "grad_norm": 2.25, + "learning_rate": 8.206302202171775e-05, + "loss": 1.7583, + "step": 6377 + }, + { + "epoch": 0.27877092530267933, + "grad_norm": 2.21875, + "learning_rate": 8.20577512280427e-05, + "loss": 1.9529, + "step": 6378 + }, + { + "epoch": 0.2788146335067092, + "grad_norm": 2.109375, + "learning_rate": 8.205247982939123e-05, + "loss": 2.1829, + "step": 6379 + }, + { + "epoch": 0.2788583417107391, + "grad_norm": 1.8125, + "learning_rate": 8.204720782586281e-05, + "loss": 1.5143, + "step": 6380 + }, + { + "epoch": 0.278902049914769, + "grad_norm": 2.328125, + "learning_rate": 8.204193521755694e-05, + "loss": 1.2953, + "step": 6381 + }, + { + "epoch": 0.2789457581187989, + "grad_norm": 1.9765625, + "learning_rate": 8.203666200457313e-05, + "loss": 2.1489, + "step": 6382 + }, + { + "epoch": 0.2789894663228288, + "grad_norm": 2.28125, + "learning_rate": 8.203138818701087e-05, + "loss": 2.1649, + "step": 6383 + }, + { + "epoch": 0.2790331745268587, + "grad_norm": 2.078125, + "learning_rate": 8.20261137649697e-05, + "loss": 1.788, + "step": 6384 + }, + { + "epoch": 0.2790768827308886, + "grad_norm": 1.859375, + "learning_rate": 8.202083873854914e-05, + "loss": 1.6652, + "step": 6385 + }, + { + "epoch": 0.2791205909349185, + "grad_norm": 2.703125, + "learning_rate": 8.201556310784877e-05, + "loss": 2.2077, + "step": 6386 + }, + { + "epoch": 0.27916429913894836, + "grad_norm": 2.71875, + "learning_rate": 8.20102868729681e-05, + "loss": 2.984, + "step": 6387 + }, + { + "epoch": 0.2792080073429783, + "grad_norm": 2.21875, + "learning_rate": 8.200501003400676e-05, + "loss": 2.2747, + "step": 6388 + }, + { + "epoch": 0.2792517155470082, + "grad_norm": 2.0, + "learning_rate": 8.199973259106427e-05, + "loss": 1.7683, + "step": 6389 + }, + { + "epoch": 0.27929542375103805, + "grad_norm": 1.90625, + "learning_rate": 8.199445454424026e-05, + "loss": 1.855, + "step": 6390 + }, + { + "epoch": 0.279339131955068, + "grad_norm": 2.0, + "learning_rate": 8.198917589363432e-05, + "loss": 1.5856, + "step": 6391 + }, + { + "epoch": 0.27938284015909787, + "grad_norm": 1.8125, + "learning_rate": 8.198389663934611e-05, + "loss": 1.5465, + "step": 6392 + }, + { + "epoch": 0.27942654836312775, + "grad_norm": 2.25, + "learning_rate": 8.197861678147518e-05, + "loss": 1.8482, + "step": 6393 + }, + { + "epoch": 0.2794702565671577, + "grad_norm": 2.265625, + "learning_rate": 8.197333632012123e-05, + "loss": 2.3386, + "step": 6394 + }, + { + "epoch": 0.27951396477118756, + "grad_norm": 2.3125, + "learning_rate": 8.196805525538389e-05, + "loss": 2.0954, + "step": 6395 + }, + { + "epoch": 0.27955767297521744, + "grad_norm": 1.9375, + "learning_rate": 8.19627735873628e-05, + "loss": 1.5945, + "step": 6396 + }, + { + "epoch": 0.2796013811792473, + "grad_norm": 2.140625, + "learning_rate": 8.195749131615767e-05, + "loss": 1.8132, + "step": 6397 + }, + { + "epoch": 0.27964508938327726, + "grad_norm": 2.171875, + "learning_rate": 8.195220844186817e-05, + "loss": 1.6905, + "step": 6398 + }, + { + "epoch": 0.27968879758730714, + "grad_norm": 2.25, + "learning_rate": 8.194692496459398e-05, + "loss": 2.4612, + "step": 6399 + }, + { + "epoch": 0.279732505791337, + "grad_norm": 2.0625, + "learning_rate": 8.194164088443482e-05, + "loss": 1.7586, + "step": 6400 + }, + { + "epoch": 0.27977621399536695, + "grad_norm": 5.59375, + "learning_rate": 8.19363562014904e-05, + "loss": 2.1648, + "step": 6401 + }, + { + "epoch": 0.27981992219939683, + "grad_norm": 2.078125, + "learning_rate": 8.193107091586048e-05, + "loss": 1.8121, + "step": 6402 + }, + { + "epoch": 0.2798636304034267, + "grad_norm": 3.125, + "learning_rate": 8.192578502764476e-05, + "loss": 2.687, + "step": 6403 + }, + { + "epoch": 0.27990733860745665, + "grad_norm": 2.015625, + "learning_rate": 8.192049853694301e-05, + "loss": 1.8532, + "step": 6404 + }, + { + "epoch": 0.2799510468114865, + "grad_norm": 1.90625, + "learning_rate": 8.191521144385501e-05, + "loss": 1.6634, + "step": 6405 + }, + { + "epoch": 0.2799947550155164, + "grad_norm": 1.953125, + "learning_rate": 8.190992374848052e-05, + "loss": 1.8239, + "step": 6406 + }, + { + "epoch": 0.2800384632195463, + "grad_norm": 1.9921875, + "learning_rate": 8.190463545091931e-05, + "loss": 1.9444, + "step": 6407 + }, + { + "epoch": 0.2800821714235762, + "grad_norm": 1.8203125, + "learning_rate": 8.189934655127121e-05, + "loss": 1.6436, + "step": 6408 + }, + { + "epoch": 0.2801258796276061, + "grad_norm": 1.9609375, + "learning_rate": 8.1894057049636e-05, + "loss": 1.5293, + "step": 6409 + }, + { + "epoch": 0.280169587831636, + "grad_norm": 2.25, + "learning_rate": 8.188876694611351e-05, + "loss": 2.2065, + "step": 6410 + }, + { + "epoch": 0.2802132960356659, + "grad_norm": 2.546875, + "learning_rate": 8.188347624080359e-05, + "loss": 1.4536, + "step": 6411 + }, + { + "epoch": 0.2802570042396958, + "grad_norm": 2.03125, + "learning_rate": 8.187818493380607e-05, + "loss": 1.959, + "step": 6412 + }, + { + "epoch": 0.2803007124437257, + "grad_norm": 2.359375, + "learning_rate": 8.18728930252208e-05, + "loss": 1.874, + "step": 6413 + }, + { + "epoch": 0.2803444206477556, + "grad_norm": 12.0625, + "learning_rate": 8.186760051514765e-05, + "loss": 1.8555, + "step": 6414 + }, + { + "epoch": 0.2803881288517855, + "grad_norm": 1.7421875, + "learning_rate": 8.186230740368649e-05, + "loss": 1.4012, + "step": 6415 + }, + { + "epoch": 0.28043183705581537, + "grad_norm": 2.234375, + "learning_rate": 8.185701369093722e-05, + "loss": 1.9157, + "step": 6416 + }, + { + "epoch": 0.28047554525984525, + "grad_norm": 1.90625, + "learning_rate": 8.185171937699974e-05, + "loss": 2.0665, + "step": 6417 + }, + { + "epoch": 0.2805192534638752, + "grad_norm": 2.5, + "learning_rate": 8.184642446197395e-05, + "loss": 2.6483, + "step": 6418 + }, + { + "epoch": 0.28056296166790506, + "grad_norm": 1.8984375, + "learning_rate": 8.184112894595979e-05, + "loss": 1.6593, + "step": 6419 + }, + { + "epoch": 0.28060666987193494, + "grad_norm": 2.15625, + "learning_rate": 8.183583282905717e-05, + "loss": 2.048, + "step": 6420 + }, + { + "epoch": 0.2806503780759649, + "grad_norm": 2.0625, + "learning_rate": 8.183053611136607e-05, + "loss": 1.9933, + "step": 6421 + }, + { + "epoch": 0.28069408627999476, + "grad_norm": 1.9140625, + "learning_rate": 8.18252387929864e-05, + "loss": 1.734, + "step": 6422 + }, + { + "epoch": 0.28073779448402464, + "grad_norm": 2.546875, + "learning_rate": 8.181994087401819e-05, + "loss": 2.8626, + "step": 6423 + }, + { + "epoch": 0.2807815026880546, + "grad_norm": 3.078125, + "learning_rate": 8.181464235456136e-05, + "loss": 1.482, + "step": 6424 + }, + { + "epoch": 0.28082521089208445, + "grad_norm": 2.109375, + "learning_rate": 8.180934323471592e-05, + "loss": 1.6419, + "step": 6425 + }, + { + "epoch": 0.28086891909611433, + "grad_norm": 2.609375, + "learning_rate": 8.180404351458189e-05, + "loss": 1.9609, + "step": 6426 + }, + { + "epoch": 0.2809126273001442, + "grad_norm": 1.8515625, + "learning_rate": 8.179874319425926e-05, + "loss": 1.8158, + "step": 6427 + }, + { + "epoch": 0.28095633550417415, + "grad_norm": 2.515625, + "learning_rate": 8.179344227384807e-05, + "loss": 2.282, + "step": 6428 + }, + { + "epoch": 0.28100004370820403, + "grad_norm": 1.96875, + "learning_rate": 8.178814075344836e-05, + "loss": 1.9564, + "step": 6429 + }, + { + "epoch": 0.2810437519122339, + "grad_norm": 2.046875, + "learning_rate": 8.178283863316015e-05, + "loss": 2.1462, + "step": 6430 + }, + { + "epoch": 0.28108746011626384, + "grad_norm": 2.421875, + "learning_rate": 8.177753591308352e-05, + "loss": 2.547, + "step": 6431 + }, + { + "epoch": 0.2811311683202937, + "grad_norm": 2.109375, + "learning_rate": 8.177223259331855e-05, + "loss": 1.6894, + "step": 6432 + }, + { + "epoch": 0.2811748765243236, + "grad_norm": 2.109375, + "learning_rate": 8.176692867396531e-05, + "loss": 1.7359, + "step": 6433 + }, + { + "epoch": 0.28121858472835354, + "grad_norm": 1.984375, + "learning_rate": 8.176162415512388e-05, + "loss": 1.8399, + "step": 6434 + }, + { + "epoch": 0.2812622929323834, + "grad_norm": 1.9609375, + "learning_rate": 8.175631903689438e-05, + "loss": 1.7448, + "step": 6435 + }, + { + "epoch": 0.2813060011364133, + "grad_norm": 3.109375, + "learning_rate": 8.175101331937693e-05, + "loss": 2.505, + "step": 6436 + }, + { + "epoch": 0.2813497093404432, + "grad_norm": 2.484375, + "learning_rate": 8.174570700267163e-05, + "loss": 2.1827, + "step": 6437 + }, + { + "epoch": 0.2813934175444731, + "grad_norm": 3.203125, + "learning_rate": 8.174040008687864e-05, + "loss": 1.9409, + "step": 6438 + }, + { + "epoch": 0.281437125748503, + "grad_norm": 2.015625, + "learning_rate": 8.17350925720981e-05, + "loss": 1.7053, + "step": 6439 + }, + { + "epoch": 0.28148083395253287, + "grad_norm": 2.328125, + "learning_rate": 8.17297844584302e-05, + "loss": 1.9401, + "step": 6440 + }, + { + "epoch": 0.2815245421565628, + "grad_norm": 2.75, + "learning_rate": 8.172447574597506e-05, + "loss": 2.2327, + "step": 6441 + }, + { + "epoch": 0.2815682503605927, + "grad_norm": 2.40625, + "learning_rate": 8.171916643483291e-05, + "loss": 1.9431, + "step": 6442 + }, + { + "epoch": 0.28161195856462257, + "grad_norm": 2.171875, + "learning_rate": 8.17138565251039e-05, + "loss": 2.0892, + "step": 6443 + }, + { + "epoch": 0.2816556667686525, + "grad_norm": 2.625, + "learning_rate": 8.170854601688828e-05, + "loss": 1.9925, + "step": 6444 + }, + { + "epoch": 0.2816993749726824, + "grad_norm": 2.5625, + "learning_rate": 8.170323491028624e-05, + "loss": 1.9526, + "step": 6445 + }, + { + "epoch": 0.28174308317671226, + "grad_norm": 2.140625, + "learning_rate": 8.169792320539802e-05, + "loss": 1.7848, + "step": 6446 + }, + { + "epoch": 0.28178679138074214, + "grad_norm": 2.21875, + "learning_rate": 8.169261090232385e-05, + "loss": 1.9374, + "step": 6447 + }, + { + "epoch": 0.2818304995847721, + "grad_norm": 2.125, + "learning_rate": 8.168729800116398e-05, + "loss": 2.2083, + "step": 6448 + }, + { + "epoch": 0.28187420778880196, + "grad_norm": 2.21875, + "learning_rate": 8.168198450201869e-05, + "loss": 1.8002, + "step": 6449 + }, + { + "epoch": 0.28191791599283184, + "grad_norm": 2.015625, + "learning_rate": 8.167667040498823e-05, + "loss": 1.5561, + "step": 6450 + }, + { + "epoch": 0.28196162419686177, + "grad_norm": 2.234375, + "learning_rate": 8.16713557101729e-05, + "loss": 2.1003, + "step": 6451 + }, + { + "epoch": 0.28200533240089165, + "grad_norm": 2.234375, + "learning_rate": 8.166604041767299e-05, + "loss": 2.0644, + "step": 6452 + }, + { + "epoch": 0.28204904060492153, + "grad_norm": 2.0, + "learning_rate": 8.166072452758883e-05, + "loss": 1.5807, + "step": 6453 + }, + { + "epoch": 0.28209274880895147, + "grad_norm": 2.140625, + "learning_rate": 8.16554080400207e-05, + "loss": 2.0718, + "step": 6454 + }, + { + "epoch": 0.28213645701298135, + "grad_norm": 2.09375, + "learning_rate": 8.165009095506895e-05, + "loss": 1.9122, + "step": 6455 + }, + { + "epoch": 0.2821801652170112, + "grad_norm": 2.515625, + "learning_rate": 8.164477327283391e-05, + "loss": 1.8248, + "step": 6456 + }, + { + "epoch": 0.2822238734210411, + "grad_norm": 2.65625, + "learning_rate": 8.163945499341596e-05, + "loss": 2.3692, + "step": 6457 + }, + { + "epoch": 0.28226758162507104, + "grad_norm": 2.171875, + "learning_rate": 8.163413611691544e-05, + "loss": 1.3578, + "step": 6458 + }, + { + "epoch": 0.2823112898291009, + "grad_norm": 2.109375, + "learning_rate": 8.162881664343271e-05, + "loss": 1.7039, + "step": 6459 + }, + { + "epoch": 0.2823549980331308, + "grad_norm": 2.0, + "learning_rate": 8.16234965730682e-05, + "loss": 2.7822, + "step": 6460 + }, + { + "epoch": 0.28239870623716073, + "grad_norm": 2.203125, + "learning_rate": 8.161817590592228e-05, + "loss": 1.3216, + "step": 6461 + }, + { + "epoch": 0.2824424144411906, + "grad_norm": 1.8046875, + "learning_rate": 8.161285464209537e-05, + "loss": 1.5041, + "step": 6462 + }, + { + "epoch": 0.2824861226452205, + "grad_norm": 2.25, + "learning_rate": 8.160753278168787e-05, + "loss": 1.7797, + "step": 6463 + }, + { + "epoch": 0.28252983084925043, + "grad_norm": 1.8984375, + "learning_rate": 8.160221032480021e-05, + "loss": 1.8841, + "step": 6464 + }, + { + "epoch": 0.2825735390532803, + "grad_norm": 2.28125, + "learning_rate": 8.159688727153287e-05, + "loss": 1.8538, + "step": 6465 + }, + { + "epoch": 0.2826172472573102, + "grad_norm": 1.953125, + "learning_rate": 8.159156362198628e-05, + "loss": 1.6499, + "step": 6466 + }, + { + "epoch": 0.28266095546134007, + "grad_norm": 2.234375, + "learning_rate": 8.15862393762609e-05, + "loss": 1.8777, + "step": 6467 + }, + { + "epoch": 0.28270466366537, + "grad_norm": 2.390625, + "learning_rate": 8.15809145344572e-05, + "loss": 1.7798, + "step": 6468 + }, + { + "epoch": 0.2827483718693999, + "grad_norm": 2.03125, + "learning_rate": 8.157558909667569e-05, + "loss": 1.5544, + "step": 6469 + }, + { + "epoch": 0.28279208007342976, + "grad_norm": 2.296875, + "learning_rate": 8.157026306301685e-05, + "loss": 1.9482, + "step": 6470 + }, + { + "epoch": 0.2828357882774597, + "grad_norm": 3.453125, + "learning_rate": 8.156493643358121e-05, + "loss": 1.892, + "step": 6471 + }, + { + "epoch": 0.2828794964814896, + "grad_norm": 1.90625, + "learning_rate": 8.155960920846926e-05, + "loss": 1.6708, + "step": 6472 + }, + { + "epoch": 0.28292320468551946, + "grad_norm": 2.25, + "learning_rate": 8.155428138778158e-05, + "loss": 1.9615, + "step": 6473 + }, + { + "epoch": 0.2829669128895494, + "grad_norm": 1.796875, + "learning_rate": 8.154895297161866e-05, + "loss": 1.6599, + "step": 6474 + }, + { + "epoch": 0.2830106210935793, + "grad_norm": 2.28125, + "learning_rate": 8.154362396008109e-05, + "loss": 2.2279, + "step": 6475 + }, + { + "epoch": 0.28305432929760915, + "grad_norm": 2.265625, + "learning_rate": 8.153829435326942e-05, + "loss": 1.5279, + "step": 6476 + }, + { + "epoch": 0.28309803750163903, + "grad_norm": 2.140625, + "learning_rate": 8.153296415128425e-05, + "loss": 1.9643, + "step": 6477 + }, + { + "epoch": 0.28314174570566897, + "grad_norm": 2.046875, + "learning_rate": 8.152763335422613e-05, + "loss": 1.7157, + "step": 6478 + }, + { + "epoch": 0.28318545390969885, + "grad_norm": 2.125, + "learning_rate": 8.15223019621957e-05, + "loss": 1.7339, + "step": 6479 + }, + { + "epoch": 0.2832291621137287, + "grad_norm": 2.84375, + "learning_rate": 8.151696997529354e-05, + "loss": 1.2496, + "step": 6480 + }, + { + "epoch": 0.28327287031775866, + "grad_norm": 1.8515625, + "learning_rate": 8.151163739362029e-05, + "loss": 1.8114, + "step": 6481 + }, + { + "epoch": 0.28331657852178854, + "grad_norm": 2.375, + "learning_rate": 8.150630421727659e-05, + "loss": 1.8059, + "step": 6482 + }, + { + "epoch": 0.2833602867258184, + "grad_norm": 2.296875, + "learning_rate": 8.150097044636307e-05, + "loss": 1.8667, + "step": 6483 + }, + { + "epoch": 0.28340399492984836, + "grad_norm": 1.8671875, + "learning_rate": 8.149563608098037e-05, + "loss": 1.6471, + "step": 6484 + }, + { + "epoch": 0.28344770313387824, + "grad_norm": 1.921875, + "learning_rate": 8.14903011212292e-05, + "loss": 1.4635, + "step": 6485 + }, + { + "epoch": 0.2834914113379081, + "grad_norm": 2.203125, + "learning_rate": 8.148496556721022e-05, + "loss": 1.7774, + "step": 6486 + }, + { + "epoch": 0.283535119541938, + "grad_norm": 2.09375, + "learning_rate": 8.14796294190241e-05, + "loss": 1.8177, + "step": 6487 + }, + { + "epoch": 0.28357882774596793, + "grad_norm": 1.90625, + "learning_rate": 8.147429267677156e-05, + "loss": 1.8345, + "step": 6488 + }, + { + "epoch": 0.2836225359499978, + "grad_norm": 2.265625, + "learning_rate": 8.146895534055332e-05, + "loss": 2.1024, + "step": 6489 + }, + { + "epoch": 0.2836662441540277, + "grad_norm": 2.53125, + "learning_rate": 8.146361741047006e-05, + "loss": 2.1015, + "step": 6490 + }, + { + "epoch": 0.2837099523580576, + "grad_norm": 2.046875, + "learning_rate": 8.145827888662257e-05, + "loss": 1.6696, + "step": 6491 + }, + { + "epoch": 0.2837536605620875, + "grad_norm": 1.8515625, + "learning_rate": 8.145293976911158e-05, + "loss": 1.7858, + "step": 6492 + }, + { + "epoch": 0.2837973687661174, + "grad_norm": 2.28125, + "learning_rate": 8.144760005803783e-05, + "loss": 1.94, + "step": 6493 + }, + { + "epoch": 0.2838410769701473, + "grad_norm": 2.21875, + "learning_rate": 8.14422597535021e-05, + "loss": 1.595, + "step": 6494 + }, + { + "epoch": 0.2838847851741772, + "grad_norm": 2.1875, + "learning_rate": 8.143691885560515e-05, + "loss": 2.0082, + "step": 6495 + }, + { + "epoch": 0.2839284933782071, + "grad_norm": 2.0625, + "learning_rate": 8.14315773644478e-05, + "loss": 1.6122, + "step": 6496 + }, + { + "epoch": 0.28397220158223696, + "grad_norm": 2.15625, + "learning_rate": 8.142623528013084e-05, + "loss": 1.9811, + "step": 6497 + }, + { + "epoch": 0.2840159097862669, + "grad_norm": 2.703125, + "learning_rate": 8.142089260275509e-05, + "loss": 1.9579, + "step": 6498 + }, + { + "epoch": 0.2840596179902968, + "grad_norm": 2.453125, + "learning_rate": 8.141554933242135e-05, + "loss": 2.491, + "step": 6499 + }, + { + "epoch": 0.28410332619432666, + "grad_norm": 1.9609375, + "learning_rate": 8.141020546923048e-05, + "loss": 1.4218, + "step": 6500 + }, + { + "epoch": 0.2841470343983566, + "grad_norm": 2.140625, + "learning_rate": 8.14048610132833e-05, + "loss": 1.2199, + "step": 6501 + }, + { + "epoch": 0.28419074260238647, + "grad_norm": 2.953125, + "learning_rate": 8.13995159646807e-05, + "loss": 1.6209, + "step": 6502 + }, + { + "epoch": 0.28423445080641635, + "grad_norm": 4.34375, + "learning_rate": 8.139417032352354e-05, + "loss": 3.7028, + "step": 6503 + }, + { + "epoch": 0.2842781590104463, + "grad_norm": 2.375, + "learning_rate": 8.138882408991268e-05, + "loss": 1.2836, + "step": 6504 + }, + { + "epoch": 0.28432186721447616, + "grad_norm": 2.109375, + "learning_rate": 8.138347726394904e-05, + "loss": 1.7913, + "step": 6505 + }, + { + "epoch": 0.28436557541850604, + "grad_norm": 2.96875, + "learning_rate": 8.13781298457335e-05, + "loss": 1.7721, + "step": 6506 + }, + { + "epoch": 0.2844092836225359, + "grad_norm": 2.890625, + "learning_rate": 8.137278183536699e-05, + "loss": 2.4412, + "step": 6507 + }, + { + "epoch": 0.28445299182656586, + "grad_norm": 2.609375, + "learning_rate": 8.13674332329504e-05, + "loss": 2.2241, + "step": 6508 + }, + { + "epoch": 0.28449670003059574, + "grad_norm": 3.359375, + "learning_rate": 8.136208403858472e-05, + "loss": 2.3641, + "step": 6509 + }, + { + "epoch": 0.2845404082346256, + "grad_norm": 2.34375, + "learning_rate": 8.135673425237084e-05, + "loss": 1.7108, + "step": 6510 + }, + { + "epoch": 0.28458411643865555, + "grad_norm": 1.984375, + "learning_rate": 8.135138387440977e-05, + "loss": 1.385, + "step": 6511 + }, + { + "epoch": 0.28462782464268543, + "grad_norm": 2.09375, + "learning_rate": 8.134603290480245e-05, + "loss": 2.3765, + "step": 6512 + }, + { + "epoch": 0.2846715328467153, + "grad_norm": 2.4375, + "learning_rate": 8.134068134364987e-05, + "loss": 2.4618, + "step": 6513 + }, + { + "epoch": 0.28471524105074525, + "grad_norm": 1.9375, + "learning_rate": 8.133532919105302e-05, + "loss": 1.9383, + "step": 6514 + }, + { + "epoch": 0.28475894925477513, + "grad_norm": 1.984375, + "learning_rate": 8.13299764471129e-05, + "loss": 1.7269, + "step": 6515 + }, + { + "epoch": 0.284802657458805, + "grad_norm": 2.0, + "learning_rate": 8.132462311193053e-05, + "loss": 1.7784, + "step": 6516 + }, + { + "epoch": 0.2848463656628349, + "grad_norm": 2.015625, + "learning_rate": 8.131926918560692e-05, + "loss": 1.8252, + "step": 6517 + }, + { + "epoch": 0.2848900738668648, + "grad_norm": 2.0625, + "learning_rate": 8.131391466824312e-05, + "loss": 1.5615, + "step": 6518 + }, + { + "epoch": 0.2849337820708947, + "grad_norm": 2.390625, + "learning_rate": 8.130855955994019e-05, + "loss": 2.0726, + "step": 6519 + }, + { + "epoch": 0.2849774902749246, + "grad_norm": 2.21875, + "learning_rate": 8.130320386079915e-05, + "loss": 1.6181, + "step": 6520 + }, + { + "epoch": 0.2850211984789545, + "grad_norm": 2.1875, + "learning_rate": 8.129784757092111e-05, + "loss": 1.9783, + "step": 6521 + }, + { + "epoch": 0.2850649066829844, + "grad_norm": 2.859375, + "learning_rate": 8.129249069040712e-05, + "loss": 3.0229, + "step": 6522 + }, + { + "epoch": 0.2851086148870143, + "grad_norm": 2.1875, + "learning_rate": 8.12871332193583e-05, + "loss": 1.6179, + "step": 6523 + }, + { + "epoch": 0.2851523230910442, + "grad_norm": 2.59375, + "learning_rate": 8.128177515787574e-05, + "loss": 2.1017, + "step": 6524 + }, + { + "epoch": 0.2851960312950741, + "grad_norm": 2.15625, + "learning_rate": 8.127641650606054e-05, + "loss": 1.5302, + "step": 6525 + }, + { + "epoch": 0.285239739499104, + "grad_norm": 2.15625, + "learning_rate": 8.127105726401386e-05, + "loss": 1.546, + "step": 6526 + }, + { + "epoch": 0.28528344770313385, + "grad_norm": 2.171875, + "learning_rate": 8.126569743183681e-05, + "loss": 1.6359, + "step": 6527 + }, + { + "epoch": 0.2853271559071638, + "grad_norm": 2.015625, + "learning_rate": 8.126033700963053e-05, + "loss": 1.6768, + "step": 6528 + }, + { + "epoch": 0.28537086411119367, + "grad_norm": 2.046875, + "learning_rate": 8.12549759974962e-05, + "loss": 1.6905, + "step": 6529 + }, + { + "epoch": 0.28541457231522355, + "grad_norm": 2.46875, + "learning_rate": 8.1249614395535e-05, + "loss": 1.8334, + "step": 6530 + }, + { + "epoch": 0.2854582805192535, + "grad_norm": 2.203125, + "learning_rate": 8.124425220384808e-05, + "loss": 2.195, + "step": 6531 + }, + { + "epoch": 0.28550198872328336, + "grad_norm": 2.453125, + "learning_rate": 8.123888942253666e-05, + "loss": 2.0734, + "step": 6532 + }, + { + "epoch": 0.28554569692731324, + "grad_norm": 2.078125, + "learning_rate": 8.123352605170191e-05, + "loss": 1.9242, + "step": 6533 + }, + { + "epoch": 0.2855894051313432, + "grad_norm": 1.9296875, + "learning_rate": 8.122816209144509e-05, + "loss": 2.0874, + "step": 6534 + }, + { + "epoch": 0.28563311333537306, + "grad_norm": 2.59375, + "learning_rate": 8.122279754186736e-05, + "loss": 1.4013, + "step": 6535 + }, + { + "epoch": 0.28567682153940294, + "grad_norm": 3.1875, + "learning_rate": 8.121743240307003e-05, + "loss": 1.9932, + "step": 6536 + }, + { + "epoch": 0.2857205297434328, + "grad_norm": 2.046875, + "learning_rate": 8.121206667515431e-05, + "loss": 1.5798, + "step": 6537 + }, + { + "epoch": 0.28576423794746275, + "grad_norm": 2.03125, + "learning_rate": 8.120670035822147e-05, + "loss": 2.2536, + "step": 6538 + }, + { + "epoch": 0.28580794615149263, + "grad_norm": 1.8515625, + "learning_rate": 8.120133345237278e-05, + "loss": 1.8946, + "step": 6539 + }, + { + "epoch": 0.2858516543555225, + "grad_norm": 2.3125, + "learning_rate": 8.11959659577095e-05, + "loss": 1.5591, + "step": 6540 + }, + { + "epoch": 0.28589536255955245, + "grad_norm": 1.9453125, + "learning_rate": 8.119059787433294e-05, + "loss": 1.8271, + "step": 6541 + }, + { + "epoch": 0.2859390707635823, + "grad_norm": 2.078125, + "learning_rate": 8.118522920234442e-05, + "loss": 1.7642, + "step": 6542 + }, + { + "epoch": 0.2859827789676122, + "grad_norm": 1.953125, + "learning_rate": 8.117985994184522e-05, + "loss": 1.6098, + "step": 6543 + }, + { + "epoch": 0.28602648717164214, + "grad_norm": 2.109375, + "learning_rate": 8.117449009293668e-05, + "loss": 2.3942, + "step": 6544 + }, + { + "epoch": 0.286070195375672, + "grad_norm": 2.734375, + "learning_rate": 8.116911965572016e-05, + "loss": 2.513, + "step": 6545 + }, + { + "epoch": 0.2861139035797019, + "grad_norm": 2.59375, + "learning_rate": 8.116374863029696e-05, + "loss": 2.7693, + "step": 6546 + }, + { + "epoch": 0.2861576117837318, + "grad_norm": 2.0, + "learning_rate": 8.115837701676848e-05, + "loss": 2.2292, + "step": 6547 + }, + { + "epoch": 0.2862013199877617, + "grad_norm": 2.390625, + "learning_rate": 8.115300481523609e-05, + "loss": 1.8436, + "step": 6548 + }, + { + "epoch": 0.2862450281917916, + "grad_norm": 2.640625, + "learning_rate": 8.114763202580113e-05, + "loss": 1.6627, + "step": 6549 + }, + { + "epoch": 0.2862887363958215, + "grad_norm": 6.625, + "learning_rate": 8.114225864856504e-05, + "loss": 2.6498, + "step": 6550 + }, + { + "epoch": 0.2863324445998514, + "grad_norm": 1.734375, + "learning_rate": 8.11368846836292e-05, + "loss": 1.5364, + "step": 6551 + }, + { + "epoch": 0.2863761528038813, + "grad_norm": 2.375, + "learning_rate": 8.113151013109503e-05, + "loss": 2.3289, + "step": 6552 + }, + { + "epoch": 0.28641986100791117, + "grad_norm": 2.328125, + "learning_rate": 8.112613499106396e-05, + "loss": 1.7235, + "step": 6553 + }, + { + "epoch": 0.2864635692119411, + "grad_norm": 2.140625, + "learning_rate": 8.11207592636374e-05, + "loss": 2.1643, + "step": 6554 + }, + { + "epoch": 0.286507277415971, + "grad_norm": 2.875, + "learning_rate": 8.111538294891684e-05, + "loss": 1.8422, + "step": 6555 + }, + { + "epoch": 0.28655098562000086, + "grad_norm": 2.0, + "learning_rate": 8.11100060470037e-05, + "loss": 1.6211, + "step": 6556 + }, + { + "epoch": 0.28659469382403074, + "grad_norm": 2.265625, + "learning_rate": 8.110462855799949e-05, + "loss": 1.5814, + "step": 6557 + }, + { + "epoch": 0.2866384020280607, + "grad_norm": 2.546875, + "learning_rate": 8.109925048200565e-05, + "loss": 1.3296, + "step": 6558 + }, + { + "epoch": 0.28668211023209056, + "grad_norm": 2.140625, + "learning_rate": 8.109387181912369e-05, + "loss": 2.0455, + "step": 6559 + }, + { + "epoch": 0.28672581843612044, + "grad_norm": 1.9921875, + "learning_rate": 8.108849256945513e-05, + "loss": 1.9988, + "step": 6560 + }, + { + "epoch": 0.2867695266401504, + "grad_norm": 2.1875, + "learning_rate": 8.108311273310146e-05, + "loss": 1.8771, + "step": 6561 + }, + { + "epoch": 0.28681323484418025, + "grad_norm": 2.46875, + "learning_rate": 8.10777323101642e-05, + "loss": 2.4142, + "step": 6562 + }, + { + "epoch": 0.28685694304821013, + "grad_norm": 2.078125, + "learning_rate": 8.107235130074492e-05, + "loss": 1.7797, + "step": 6563 + }, + { + "epoch": 0.28690065125224007, + "grad_norm": 2.28125, + "learning_rate": 8.106696970494514e-05, + "loss": 1.9409, + "step": 6564 + }, + { + "epoch": 0.28694435945626995, + "grad_norm": 1.953125, + "learning_rate": 8.106158752286642e-05, + "loss": 1.4793, + "step": 6565 + }, + { + "epoch": 0.28698806766029983, + "grad_norm": 1.90625, + "learning_rate": 8.105620475461033e-05, + "loss": 1.6698, + "step": 6566 + }, + { + "epoch": 0.2870317758643297, + "grad_norm": 2.09375, + "learning_rate": 8.105082140027846e-05, + "loss": 2.1729, + "step": 6567 + }, + { + "epoch": 0.28707548406835964, + "grad_norm": 2.3125, + "learning_rate": 8.104543745997242e-05, + "loss": 1.7732, + "step": 6568 + }, + { + "epoch": 0.2871191922723895, + "grad_norm": 2.109375, + "learning_rate": 8.104005293379378e-05, + "loss": 1.779, + "step": 6569 + }, + { + "epoch": 0.2871629004764194, + "grad_norm": 2.53125, + "learning_rate": 8.103466782184415e-05, + "loss": 1.9623, + "step": 6570 + }, + { + "epoch": 0.28720660868044934, + "grad_norm": 2.328125, + "learning_rate": 8.102928212422519e-05, + "loss": 2.3689, + "step": 6571 + }, + { + "epoch": 0.2872503168844792, + "grad_norm": 2.25, + "learning_rate": 8.102389584103849e-05, + "loss": 1.5722, + "step": 6572 + }, + { + "epoch": 0.2872940250885091, + "grad_norm": 2.203125, + "learning_rate": 8.101850897238574e-05, + "loss": 2.7088, + "step": 6573 + }, + { + "epoch": 0.28733773329253903, + "grad_norm": 2.671875, + "learning_rate": 8.101312151836857e-05, + "loss": 2.5903, + "step": 6574 + }, + { + "epoch": 0.2873814414965689, + "grad_norm": 2.265625, + "learning_rate": 8.100773347908868e-05, + "loss": 2.3079, + "step": 6575 + }, + { + "epoch": 0.2874251497005988, + "grad_norm": 2.171875, + "learning_rate": 8.100234485464771e-05, + "loss": 2.0066, + "step": 6576 + }, + { + "epoch": 0.28746885790462867, + "grad_norm": 2.328125, + "learning_rate": 8.099695564514737e-05, + "loss": 2.1999, + "step": 6577 + }, + { + "epoch": 0.2875125661086586, + "grad_norm": 2.046875, + "learning_rate": 8.099156585068937e-05, + "loss": 1.8645, + "step": 6578 + }, + { + "epoch": 0.2875562743126885, + "grad_norm": 3.046875, + "learning_rate": 8.098617547137541e-05, + "loss": 1.7175, + "step": 6579 + }, + { + "epoch": 0.28759998251671837, + "grad_norm": 2.109375, + "learning_rate": 8.098078450730723e-05, + "loss": 1.8844, + "step": 6580 + }, + { + "epoch": 0.2876436907207483, + "grad_norm": 2.078125, + "learning_rate": 8.097539295858656e-05, + "loss": 1.7426, + "step": 6581 + }, + { + "epoch": 0.2876873989247782, + "grad_norm": 2.3125, + "learning_rate": 8.097000082531512e-05, + "loss": 2.1152, + "step": 6582 + }, + { + "epoch": 0.28773110712880806, + "grad_norm": 1.9765625, + "learning_rate": 8.096460810759472e-05, + "loss": 1.8683, + "step": 6583 + }, + { + "epoch": 0.287774815332838, + "grad_norm": 2.0, + "learning_rate": 8.095921480552707e-05, + "loss": 1.4677, + "step": 6584 + }, + { + "epoch": 0.2878185235368679, + "grad_norm": 2.0625, + "learning_rate": 8.095382091921399e-05, + "loss": 1.6928, + "step": 6585 + }, + { + "epoch": 0.28786223174089776, + "grad_norm": 2.109375, + "learning_rate": 8.094842644875726e-05, + "loss": 1.963, + "step": 6586 + }, + { + "epoch": 0.28790593994492764, + "grad_norm": 1.8828125, + "learning_rate": 8.094303139425867e-05, + "loss": 1.5391, + "step": 6587 + }, + { + "epoch": 0.28794964814895757, + "grad_norm": 2.328125, + "learning_rate": 8.093763575582006e-05, + "loss": 2.0163, + "step": 6588 + }, + { + "epoch": 0.28799335635298745, + "grad_norm": 2.203125, + "learning_rate": 8.093223953354323e-05, + "loss": 1.5258, + "step": 6589 + }, + { + "epoch": 0.28803706455701733, + "grad_norm": 2.359375, + "learning_rate": 8.092684272753002e-05, + "loss": 2.2255, + "step": 6590 + }, + { + "epoch": 0.28808077276104727, + "grad_norm": 2.5, + "learning_rate": 8.092144533788226e-05, + "loss": 1.9112, + "step": 6591 + }, + { + "epoch": 0.28812448096507715, + "grad_norm": 2.4375, + "learning_rate": 8.091604736470184e-05, + "loss": 2.0743, + "step": 6592 + }, + { + "epoch": 0.288168189169107, + "grad_norm": 2.1875, + "learning_rate": 8.091064880809061e-05, + "loss": 1.6284, + "step": 6593 + }, + { + "epoch": 0.28821189737313696, + "grad_norm": 2.296875, + "learning_rate": 8.090524966815042e-05, + "loss": 2.0639, + "step": 6594 + }, + { + "epoch": 0.28825560557716684, + "grad_norm": 2.15625, + "learning_rate": 8.089984994498324e-05, + "loss": 1.5172, + "step": 6595 + }, + { + "epoch": 0.2882993137811967, + "grad_norm": 2.890625, + "learning_rate": 8.089444963869088e-05, + "loss": 1.8543, + "step": 6596 + }, + { + "epoch": 0.2883430219852266, + "grad_norm": 2.46875, + "learning_rate": 8.088904874937528e-05, + "loss": 2.9156, + "step": 6597 + }, + { + "epoch": 0.28838673018925653, + "grad_norm": 2.359375, + "learning_rate": 8.088364727713841e-05, + "loss": 1.9629, + "step": 6598 + }, + { + "epoch": 0.2884304383932864, + "grad_norm": 2.15625, + "learning_rate": 8.087824522208215e-05, + "loss": 2.4238, + "step": 6599 + }, + { + "epoch": 0.2884741465973163, + "grad_norm": 2.296875, + "learning_rate": 8.087284258430847e-05, + "loss": 2.4762, + "step": 6600 + }, + { + "epoch": 0.28851785480134623, + "grad_norm": 2.171875, + "learning_rate": 8.08674393639193e-05, + "loss": 1.8678, + "step": 6601 + }, + { + "epoch": 0.2885615630053761, + "grad_norm": 2.96875, + "learning_rate": 8.086203556101664e-05, + "loss": 3.2367, + "step": 6602 + }, + { + "epoch": 0.288605271209406, + "grad_norm": 2.59375, + "learning_rate": 8.085663117570243e-05, + "loss": 2.9074, + "step": 6603 + }, + { + "epoch": 0.2886489794134359, + "grad_norm": 3.1875, + "learning_rate": 8.08512262080787e-05, + "loss": 2.416, + "step": 6604 + }, + { + "epoch": 0.2886926876174658, + "grad_norm": 2.296875, + "learning_rate": 8.084582065824743e-05, + "loss": 1.755, + "step": 6605 + }, + { + "epoch": 0.2887363958214957, + "grad_norm": 1.8203125, + "learning_rate": 8.084041452631062e-05, + "loss": 1.7571, + "step": 6606 + }, + { + "epoch": 0.28878010402552556, + "grad_norm": 2.390625, + "learning_rate": 8.08350078123703e-05, + "loss": 2.1158, + "step": 6607 + }, + { + "epoch": 0.2888238122295555, + "grad_norm": 1.984375, + "learning_rate": 8.082960051652852e-05, + "loss": 1.8783, + "step": 6608 + }, + { + "epoch": 0.2888675204335854, + "grad_norm": 2.515625, + "learning_rate": 8.08241926388873e-05, + "loss": 2.3248, + "step": 6609 + }, + { + "epoch": 0.28891122863761526, + "grad_norm": 2.40625, + "learning_rate": 8.08187841795487e-05, + "loss": 1.7596, + "step": 6610 + }, + { + "epoch": 0.2889549368416452, + "grad_norm": 1.9140625, + "learning_rate": 8.081337513861478e-05, + "loss": 1.851, + "step": 6611 + }, + { + "epoch": 0.2889986450456751, + "grad_norm": 2.09375, + "learning_rate": 8.080796551618763e-05, + "loss": 1.7286, + "step": 6612 + }, + { + "epoch": 0.28904235324970495, + "grad_norm": 2.265625, + "learning_rate": 8.080255531236935e-05, + "loss": 1.975, + "step": 6613 + }, + { + "epoch": 0.2890860614537349, + "grad_norm": 2.140625, + "learning_rate": 8.079714452726199e-05, + "loss": 1.9174, + "step": 6614 + }, + { + "epoch": 0.28912976965776477, + "grad_norm": 2.953125, + "learning_rate": 8.079173316096772e-05, + "loss": 2.6944, + "step": 6615 + }, + { + "epoch": 0.28917347786179465, + "grad_norm": 2.0625, + "learning_rate": 8.07863212135886e-05, + "loss": 2.5183, + "step": 6616 + }, + { + "epoch": 0.2892171860658246, + "grad_norm": 2.5, + "learning_rate": 8.07809086852268e-05, + "loss": 2.2211, + "step": 6617 + }, + { + "epoch": 0.28926089426985446, + "grad_norm": 2.28125, + "learning_rate": 8.077549557598448e-05, + "loss": 1.8159, + "step": 6618 + }, + { + "epoch": 0.28930460247388434, + "grad_norm": 2.25, + "learning_rate": 8.077008188596375e-05, + "loss": 1.7229, + "step": 6619 + }, + { + "epoch": 0.2893483106779142, + "grad_norm": 2.34375, + "learning_rate": 8.076466761526678e-05, + "loss": 1.8865, + "step": 6620 + }, + { + "epoch": 0.28939201888194416, + "grad_norm": 1.890625, + "learning_rate": 8.075925276399576e-05, + "loss": 1.7476, + "step": 6621 + }, + { + "epoch": 0.28943572708597404, + "grad_norm": 2.015625, + "learning_rate": 8.075383733225288e-05, + "loss": 1.7712, + "step": 6622 + }, + { + "epoch": 0.2894794352900039, + "grad_norm": 2.4375, + "learning_rate": 8.074842132014034e-05, + "loss": 1.715, + "step": 6623 + }, + { + "epoch": 0.28952314349403385, + "grad_norm": 1.9453125, + "learning_rate": 8.074300472776031e-05, + "loss": 1.8801, + "step": 6624 + }, + { + "epoch": 0.28956685169806373, + "grad_norm": 2.703125, + "learning_rate": 8.073758755521505e-05, + "loss": 1.5784, + "step": 6625 + }, + { + "epoch": 0.2896105599020936, + "grad_norm": 2.765625, + "learning_rate": 8.073216980260678e-05, + "loss": 1.6752, + "step": 6626 + }, + { + "epoch": 0.28965426810612355, + "grad_norm": 3.84375, + "learning_rate": 8.072675147003773e-05, + "loss": 1.2782, + "step": 6627 + }, + { + "epoch": 0.2896979763101534, + "grad_norm": 6.90625, + "learning_rate": 8.072133255761017e-05, + "loss": 2.8148, + "step": 6628 + }, + { + "epoch": 0.2897416845141833, + "grad_norm": 1.9765625, + "learning_rate": 8.071591306542634e-05, + "loss": 1.8114, + "step": 6629 + }, + { + "epoch": 0.2897853927182132, + "grad_norm": 1.9453125, + "learning_rate": 8.071049299358853e-05, + "loss": 1.2275, + "step": 6630 + }, + { + "epoch": 0.2898291009222431, + "grad_norm": 2.34375, + "learning_rate": 8.070507234219901e-05, + "loss": 2.4481, + "step": 6631 + }, + { + "epoch": 0.289872809126273, + "grad_norm": 2.71875, + "learning_rate": 8.06996511113601e-05, + "loss": 2.0297, + "step": 6632 + }, + { + "epoch": 0.2899165173303029, + "grad_norm": 2.296875, + "learning_rate": 8.06942293011741e-05, + "loss": 1.6154, + "step": 6633 + }, + { + "epoch": 0.2899602255343328, + "grad_norm": 2.375, + "learning_rate": 8.06888069117433e-05, + "loss": 2.2716, + "step": 6634 + }, + { + "epoch": 0.2900039337383627, + "grad_norm": 2.03125, + "learning_rate": 8.068338394317005e-05, + "loss": 2.1387, + "step": 6635 + }, + { + "epoch": 0.2900476419423926, + "grad_norm": 1.78125, + "learning_rate": 8.06779603955567e-05, + "loss": 1.6537, + "step": 6636 + }, + { + "epoch": 0.2900913501464225, + "grad_norm": 2.21875, + "learning_rate": 8.067253626900558e-05, + "loss": 2.2958, + "step": 6637 + }, + { + "epoch": 0.2901350583504524, + "grad_norm": 2.734375, + "learning_rate": 8.066711156361905e-05, + "loss": 2.7007, + "step": 6638 + }, + { + "epoch": 0.29017876655448227, + "grad_norm": 3.03125, + "learning_rate": 8.066168627949952e-05, + "loss": 2.0117, + "step": 6639 + }, + { + "epoch": 0.29022247475851215, + "grad_norm": 2.140625, + "learning_rate": 8.065626041674932e-05, + "loss": 2.6418, + "step": 6640 + }, + { + "epoch": 0.2902661829625421, + "grad_norm": 1.984375, + "learning_rate": 8.065083397547086e-05, + "loss": 1.6984, + "step": 6641 + }, + { + "epoch": 0.29030989116657197, + "grad_norm": 2.0625, + "learning_rate": 8.064540695576657e-05, + "loss": 2.5482, + "step": 6642 + }, + { + "epoch": 0.29035359937060184, + "grad_norm": 2.578125, + "learning_rate": 8.063997935773885e-05, + "loss": 1.8375, + "step": 6643 + }, + { + "epoch": 0.2903973075746318, + "grad_norm": 2.15625, + "learning_rate": 8.063455118149013e-05, + "loss": 2.1303, + "step": 6644 + }, + { + "epoch": 0.29044101577866166, + "grad_norm": 1.7890625, + "learning_rate": 8.062912242712282e-05, + "loss": 1.3989, + "step": 6645 + }, + { + "epoch": 0.29048472398269154, + "grad_norm": 1.90625, + "learning_rate": 8.06236930947394e-05, + "loss": 1.541, + "step": 6646 + }, + { + "epoch": 0.2905284321867215, + "grad_norm": 2.0625, + "learning_rate": 8.061826318444232e-05, + "loss": 1.709, + "step": 6647 + }, + { + "epoch": 0.29057214039075135, + "grad_norm": 2.5625, + "learning_rate": 8.061283269633407e-05, + "loss": 1.6461, + "step": 6648 + }, + { + "epoch": 0.29061584859478123, + "grad_norm": 1.9921875, + "learning_rate": 8.06074016305171e-05, + "loss": 1.5956, + "step": 6649 + }, + { + "epoch": 0.2906595567988111, + "grad_norm": 2.078125, + "learning_rate": 8.060196998709391e-05, + "loss": 1.9987, + "step": 6650 + }, + { + "epoch": 0.29070326500284105, + "grad_norm": 2.265625, + "learning_rate": 8.0596537766167e-05, + "loss": 2.2364, + "step": 6651 + }, + { + "epoch": 0.29074697320687093, + "grad_norm": 6.1875, + "learning_rate": 8.05911049678389e-05, + "loss": 1.6748, + "step": 6652 + }, + { + "epoch": 0.2907906814109008, + "grad_norm": 14.9375, + "learning_rate": 8.058567159221213e-05, + "loss": 3.4629, + "step": 6653 + }, + { + "epoch": 0.29083438961493074, + "grad_norm": 2.1875, + "learning_rate": 8.058023763938922e-05, + "loss": 1.752, + "step": 6654 + }, + { + "epoch": 0.2908780978189606, + "grad_norm": 2.609375, + "learning_rate": 8.057480310947271e-05, + "loss": 2.5652, + "step": 6655 + }, + { + "epoch": 0.2909218060229905, + "grad_norm": 2.78125, + "learning_rate": 8.056936800256517e-05, + "loss": 2.0146, + "step": 6656 + }, + { + "epoch": 0.29096551422702044, + "grad_norm": 2.328125, + "learning_rate": 8.056393231876918e-05, + "loss": 1.7524, + "step": 6657 + }, + { + "epoch": 0.2910092224310503, + "grad_norm": 2.6875, + "learning_rate": 8.055849605818728e-05, + "loss": 1.9087, + "step": 6658 + }, + { + "epoch": 0.2910529306350802, + "grad_norm": 2.5625, + "learning_rate": 8.055305922092208e-05, + "loss": 2.1884, + "step": 6659 + }, + { + "epoch": 0.2910966388391101, + "grad_norm": 1.8515625, + "learning_rate": 8.05476218070762e-05, + "loss": 1.6765, + "step": 6660 + }, + { + "epoch": 0.29114034704314, + "grad_norm": 2.390625, + "learning_rate": 8.054218381675225e-05, + "loss": 1.5884, + "step": 6661 + }, + { + "epoch": 0.2911840552471699, + "grad_norm": 2.125, + "learning_rate": 8.053674525005282e-05, + "loss": 1.8013, + "step": 6662 + }, + { + "epoch": 0.2912277634511998, + "grad_norm": 2.28125, + "learning_rate": 8.053130610708057e-05, + "loss": 1.8213, + "step": 6663 + }, + { + "epoch": 0.2912714716552297, + "grad_norm": 2.25, + "learning_rate": 8.052586638793814e-05, + "loss": 2.4342, + "step": 6664 + }, + { + "epoch": 0.2913151798592596, + "grad_norm": 1.9375, + "learning_rate": 8.052042609272817e-05, + "loss": 1.9834, + "step": 6665 + }, + { + "epoch": 0.29135888806328947, + "grad_norm": 2.78125, + "learning_rate": 8.051498522155334e-05, + "loss": 1.7118, + "step": 6666 + }, + { + "epoch": 0.2914025962673194, + "grad_norm": 1.96875, + "learning_rate": 8.050954377451634e-05, + "loss": 1.9769, + "step": 6667 + }, + { + "epoch": 0.2914463044713493, + "grad_norm": 2.046875, + "learning_rate": 8.050410175171983e-05, + "loss": 1.5139, + "step": 6668 + }, + { + "epoch": 0.29149001267537916, + "grad_norm": 2.703125, + "learning_rate": 8.049865915326653e-05, + "loss": 2.0808, + "step": 6669 + }, + { + "epoch": 0.29153372087940904, + "grad_norm": 3.015625, + "learning_rate": 8.049321597925914e-05, + "loss": 1.6965, + "step": 6670 + }, + { + "epoch": 0.291577429083439, + "grad_norm": 2.015625, + "learning_rate": 8.04877722298004e-05, + "loss": 2.0674, + "step": 6671 + }, + { + "epoch": 0.29162113728746886, + "grad_norm": 1.96875, + "learning_rate": 8.0482327904993e-05, + "loss": 1.6745, + "step": 6672 + }, + { + "epoch": 0.29166484549149874, + "grad_norm": 2.234375, + "learning_rate": 8.047688300493972e-05, + "loss": 1.9184, + "step": 6673 + }, + { + "epoch": 0.29170855369552867, + "grad_norm": 2.796875, + "learning_rate": 8.047143752974331e-05, + "loss": 1.4173, + "step": 6674 + }, + { + "epoch": 0.29175226189955855, + "grad_norm": 3.96875, + "learning_rate": 8.046599147950651e-05, + "loss": 2.3369, + "step": 6675 + }, + { + "epoch": 0.29179597010358843, + "grad_norm": 1.890625, + "learning_rate": 8.046054485433211e-05, + "loss": 1.6808, + "step": 6676 + }, + { + "epoch": 0.29183967830761837, + "grad_norm": 3.25, + "learning_rate": 8.04550976543229e-05, + "loss": 1.5491, + "step": 6677 + }, + { + "epoch": 0.29188338651164825, + "grad_norm": 2.53125, + "learning_rate": 8.044964987958168e-05, + "loss": 1.711, + "step": 6678 + }, + { + "epoch": 0.2919270947156781, + "grad_norm": 2.390625, + "learning_rate": 8.044420153021124e-05, + "loss": 2.976, + "step": 6679 + }, + { + "epoch": 0.291970802919708, + "grad_norm": 2.03125, + "learning_rate": 8.043875260631442e-05, + "loss": 1.2881, + "step": 6680 + }, + { + "epoch": 0.29201451112373794, + "grad_norm": 2.0, + "learning_rate": 8.043330310799402e-05, + "loss": 1.7619, + "step": 6681 + }, + { + "epoch": 0.2920582193277678, + "grad_norm": 2.984375, + "learning_rate": 8.042785303535289e-05, + "loss": 2.2613, + "step": 6682 + }, + { + "epoch": 0.2921019275317977, + "grad_norm": 1.9921875, + "learning_rate": 8.04224023884939e-05, + "loss": 1.8949, + "step": 6683 + }, + { + "epoch": 0.29214563573582764, + "grad_norm": 1.8125, + "learning_rate": 8.041695116751991e-05, + "loss": 1.5188, + "step": 6684 + }, + { + "epoch": 0.2921893439398575, + "grad_norm": 2.40625, + "learning_rate": 8.041149937253378e-05, + "loss": 1.3007, + "step": 6685 + }, + { + "epoch": 0.2922330521438874, + "grad_norm": 2.484375, + "learning_rate": 8.040604700363838e-05, + "loss": 2.9892, + "step": 6686 + }, + { + "epoch": 0.29227676034791733, + "grad_norm": 2.15625, + "learning_rate": 8.040059406093662e-05, + "loss": 2.1024, + "step": 6687 + }, + { + "epoch": 0.2923204685519472, + "grad_norm": 1.9296875, + "learning_rate": 8.039514054453141e-05, + "loss": 1.5767, + "step": 6688 + }, + { + "epoch": 0.2923641767559771, + "grad_norm": 2.03125, + "learning_rate": 8.038968645452567e-05, + "loss": 1.3219, + "step": 6689 + }, + { + "epoch": 0.29240788496000697, + "grad_norm": 2.078125, + "learning_rate": 8.03842317910223e-05, + "loss": 1.9124, + "step": 6690 + }, + { + "epoch": 0.2924515931640369, + "grad_norm": 2.21875, + "learning_rate": 8.037877655412426e-05, + "loss": 1.6807, + "step": 6691 + }, + { + "epoch": 0.2924953013680668, + "grad_norm": 2.3125, + "learning_rate": 8.037332074393449e-05, + "loss": 2.4909, + "step": 6692 + }, + { + "epoch": 0.29253900957209666, + "grad_norm": 2.359375, + "learning_rate": 8.036786436055595e-05, + "loss": 1.5007, + "step": 6693 + }, + { + "epoch": 0.2925827177761266, + "grad_norm": 2.671875, + "learning_rate": 8.036240740409162e-05, + "loss": 2.127, + "step": 6694 + }, + { + "epoch": 0.2926264259801565, + "grad_norm": 2.21875, + "learning_rate": 8.035694987464446e-05, + "loss": 2.2236, + "step": 6695 + }, + { + "epoch": 0.29267013418418636, + "grad_norm": 2.21875, + "learning_rate": 8.035149177231749e-05, + "loss": 1.577, + "step": 6696 + }, + { + "epoch": 0.2927138423882163, + "grad_norm": 1.8515625, + "learning_rate": 8.034603309721368e-05, + "loss": 1.6475, + "step": 6697 + }, + { + "epoch": 0.2927575505922462, + "grad_norm": 2.671875, + "learning_rate": 8.034057384943606e-05, + "loss": 1.9149, + "step": 6698 + }, + { + "epoch": 0.29280125879627605, + "grad_norm": 2.078125, + "learning_rate": 8.033511402908767e-05, + "loss": 1.9467, + "step": 6699 + }, + { + "epoch": 0.29284496700030593, + "grad_norm": 2.84375, + "learning_rate": 8.03296536362715e-05, + "loss": 3.2761, + "step": 6700 + }, + { + "epoch": 0.29288867520433587, + "grad_norm": 2.109375, + "learning_rate": 8.032419267109066e-05, + "loss": 1.9798, + "step": 6701 + }, + { + "epoch": 0.29293238340836575, + "grad_norm": 2.578125, + "learning_rate": 8.031873113364814e-05, + "loss": 2.787, + "step": 6702 + }, + { + "epoch": 0.29297609161239563, + "grad_norm": 2.640625, + "learning_rate": 8.031326902404703e-05, + "loss": 2.9573, + "step": 6703 + }, + { + "epoch": 0.29301979981642556, + "grad_norm": 1.9453125, + "learning_rate": 8.030780634239043e-05, + "loss": 1.6533, + "step": 6704 + }, + { + "epoch": 0.29306350802045544, + "grad_norm": 2.25, + "learning_rate": 8.030234308878142e-05, + "loss": 2.3842, + "step": 6705 + }, + { + "epoch": 0.2931072162244853, + "grad_norm": 17.625, + "learning_rate": 8.02968792633231e-05, + "loss": 2.1034, + "step": 6706 + }, + { + "epoch": 0.29315092442851526, + "grad_norm": 2.4375, + "learning_rate": 8.029141486611856e-05, + "loss": 1.9293, + "step": 6707 + }, + { + "epoch": 0.29319463263254514, + "grad_norm": 2.171875, + "learning_rate": 8.028594989727092e-05, + "loss": 1.9474, + "step": 6708 + }, + { + "epoch": 0.293238340836575, + "grad_norm": 2.3125, + "learning_rate": 8.028048435688333e-05, + "loss": 1.9978, + "step": 6709 + }, + { + "epoch": 0.2932820490406049, + "grad_norm": 1.8828125, + "learning_rate": 8.027501824505895e-05, + "loss": 1.4904, + "step": 6710 + }, + { + "epoch": 0.29332575724463483, + "grad_norm": 2.140625, + "learning_rate": 8.02695515619009e-05, + "loss": 2.2193, + "step": 6711 + }, + { + "epoch": 0.2933694654486647, + "grad_norm": 2.203125, + "learning_rate": 8.026408430751235e-05, + "loss": 2.0577, + "step": 6712 + }, + { + "epoch": 0.2934131736526946, + "grad_norm": 1.9296875, + "learning_rate": 8.025861648199649e-05, + "loss": 1.9362, + "step": 6713 + }, + { + "epoch": 0.2934568818567245, + "grad_norm": 2.828125, + "learning_rate": 8.02531480854565e-05, + "loss": 1.6689, + "step": 6714 + }, + { + "epoch": 0.2935005900607544, + "grad_norm": 2.5625, + "learning_rate": 8.024767911799558e-05, + "loss": 2.1455, + "step": 6715 + }, + { + "epoch": 0.2935442982647843, + "grad_norm": 1.9921875, + "learning_rate": 8.024220957971693e-05, + "loss": 2.7358, + "step": 6716 + }, + { + "epoch": 0.2935880064688142, + "grad_norm": 2.09375, + "learning_rate": 8.023673947072376e-05, + "loss": 2.3428, + "step": 6717 + }, + { + "epoch": 0.2936317146728441, + "grad_norm": 1.921875, + "learning_rate": 8.023126879111931e-05, + "loss": 2.0114, + "step": 6718 + }, + { + "epoch": 0.293675422876874, + "grad_norm": 1.828125, + "learning_rate": 8.022579754100681e-05, + "loss": 1.714, + "step": 6719 + }, + { + "epoch": 0.29371913108090386, + "grad_norm": 1.9609375, + "learning_rate": 8.022032572048954e-05, + "loss": 1.6105, + "step": 6720 + }, + { + "epoch": 0.2937628392849338, + "grad_norm": 3.328125, + "learning_rate": 8.021485332967072e-05, + "loss": 2.8173, + "step": 6721 + }, + { + "epoch": 0.2938065474889637, + "grad_norm": 2.328125, + "learning_rate": 8.020938036865365e-05, + "loss": 2.0372, + "step": 6722 + }, + { + "epoch": 0.29385025569299356, + "grad_norm": 2.640625, + "learning_rate": 8.020390683754161e-05, + "loss": 1.6673, + "step": 6723 + }, + { + "epoch": 0.2938939638970235, + "grad_norm": 2.328125, + "learning_rate": 8.019843273643788e-05, + "loss": 1.8072, + "step": 6724 + }, + { + "epoch": 0.29393767210105337, + "grad_norm": 1.984375, + "learning_rate": 8.019295806544578e-05, + "loss": 1.9004, + "step": 6725 + }, + { + "epoch": 0.29398138030508325, + "grad_norm": 2.015625, + "learning_rate": 8.018748282466862e-05, + "loss": 1.6449, + "step": 6726 + }, + { + "epoch": 0.2940250885091132, + "grad_norm": 2.265625, + "learning_rate": 8.018200701420971e-05, + "loss": 2.4514, + "step": 6727 + }, + { + "epoch": 0.29406879671314307, + "grad_norm": 4.28125, + "learning_rate": 8.017653063417241e-05, + "loss": 1.6911, + "step": 6728 + }, + { + "epoch": 0.29411250491717295, + "grad_norm": 2.234375, + "learning_rate": 8.017105368466006e-05, + "loss": 1.3129, + "step": 6729 + }, + { + "epoch": 0.2941562131212028, + "grad_norm": 2.5, + "learning_rate": 8.016557616577601e-05, + "loss": 2.1801, + "step": 6730 + }, + { + "epoch": 0.29419992132523276, + "grad_norm": 2.078125, + "learning_rate": 8.016009807762364e-05, + "loss": 1.9602, + "step": 6731 + }, + { + "epoch": 0.29424362952926264, + "grad_norm": 2.796875, + "learning_rate": 8.015461942030631e-05, + "loss": 2.5422, + "step": 6732 + }, + { + "epoch": 0.2942873377332925, + "grad_norm": 2.421875, + "learning_rate": 8.014914019392743e-05, + "loss": 1.9976, + "step": 6733 + }, + { + "epoch": 0.29433104593732246, + "grad_norm": 2.078125, + "learning_rate": 8.01436603985904e-05, + "loss": 1.7281, + "step": 6734 + }, + { + "epoch": 0.29437475414135233, + "grad_norm": 2.265625, + "learning_rate": 8.013818003439861e-05, + "loss": 1.6319, + "step": 6735 + }, + { + "epoch": 0.2944184623453822, + "grad_norm": 2.359375, + "learning_rate": 8.013269910145552e-05, + "loss": 1.9, + "step": 6736 + }, + { + "epoch": 0.29446217054941215, + "grad_norm": 3.109375, + "learning_rate": 8.012721759986452e-05, + "loss": 2.5719, + "step": 6737 + }, + { + "epoch": 0.29450587875344203, + "grad_norm": 2.734375, + "learning_rate": 8.01217355297291e-05, + "loss": 2.2032, + "step": 6738 + }, + { + "epoch": 0.2945495869574719, + "grad_norm": 4.0, + "learning_rate": 8.011625289115267e-05, + "loss": 2.4784, + "step": 6739 + }, + { + "epoch": 0.2945932951615018, + "grad_norm": 1.875, + "learning_rate": 8.011076968423872e-05, + "loss": 1.816, + "step": 6740 + }, + { + "epoch": 0.2946370033655317, + "grad_norm": 2.15625, + "learning_rate": 8.010528590909073e-05, + "loss": 2.3117, + "step": 6741 + }, + { + "epoch": 0.2946807115695616, + "grad_norm": 2.8125, + "learning_rate": 8.009980156581217e-05, + "loss": 1.533, + "step": 6742 + }, + { + "epoch": 0.2947244197735915, + "grad_norm": 2.359375, + "learning_rate": 8.009431665450655e-05, + "loss": 2.4178, + "step": 6743 + }, + { + "epoch": 0.2947681279776214, + "grad_norm": 2.140625, + "learning_rate": 8.008883117527738e-05, + "loss": 1.5071, + "step": 6744 + }, + { + "epoch": 0.2948118361816513, + "grad_norm": 2.03125, + "learning_rate": 8.008334512822817e-05, + "loss": 1.9468, + "step": 6745 + }, + { + "epoch": 0.2948555443856812, + "grad_norm": 2.3125, + "learning_rate": 8.007785851346245e-05, + "loss": 1.5961, + "step": 6746 + }, + { + "epoch": 0.2948992525897111, + "grad_norm": 3.5625, + "learning_rate": 8.007237133108376e-05, + "loss": 2.2648, + "step": 6747 + }, + { + "epoch": 0.294942960793741, + "grad_norm": 2.1875, + "learning_rate": 8.006688358119568e-05, + "loss": 1.8828, + "step": 6748 + }, + { + "epoch": 0.2949866689977709, + "grad_norm": 2.046875, + "learning_rate": 8.006139526390172e-05, + "loss": 2.2068, + "step": 6749 + }, + { + "epoch": 0.29503037720180075, + "grad_norm": 2.203125, + "learning_rate": 8.005590637930548e-05, + "loss": 2.3664, + "step": 6750 + }, + { + "epoch": 0.2950740854058307, + "grad_norm": 2.296875, + "learning_rate": 8.005041692751055e-05, + "loss": 1.5957, + "step": 6751 + }, + { + "epoch": 0.29511779360986057, + "grad_norm": 2.84375, + "learning_rate": 8.00449269086205e-05, + "loss": 1.9969, + "step": 6752 + }, + { + "epoch": 0.29516150181389045, + "grad_norm": 3.109375, + "learning_rate": 8.003943632273898e-05, + "loss": 2.201, + "step": 6753 + }, + { + "epoch": 0.2952052100179204, + "grad_norm": 2.171875, + "learning_rate": 8.003394516996956e-05, + "loss": 2.0344, + "step": 6754 + }, + { + "epoch": 0.29524891822195026, + "grad_norm": 2.46875, + "learning_rate": 8.002845345041589e-05, + "loss": 2.9183, + "step": 6755 + }, + { + "epoch": 0.29529262642598014, + "grad_norm": 2.0625, + "learning_rate": 8.00229611641816e-05, + "loss": 1.9433, + "step": 6756 + }, + { + "epoch": 0.2953363346300101, + "grad_norm": 2.421875, + "learning_rate": 8.001746831137032e-05, + "loss": 1.548, + "step": 6757 + }, + { + "epoch": 0.29538004283403996, + "grad_norm": 2.125, + "learning_rate": 8.001197489208572e-05, + "loss": 2.0139, + "step": 6758 + }, + { + "epoch": 0.29542375103806984, + "grad_norm": 2.28125, + "learning_rate": 8.00064809064315e-05, + "loss": 1.7309, + "step": 6759 + }, + { + "epoch": 0.2954674592420997, + "grad_norm": 2.5, + "learning_rate": 8.00009863545113e-05, + "loss": 1.7731, + "step": 6760 + }, + { + "epoch": 0.29551116744612965, + "grad_norm": 1.8828125, + "learning_rate": 7.999549123642882e-05, + "loss": 1.9047, + "step": 6761 + }, + { + "epoch": 0.29555487565015953, + "grad_norm": 2.53125, + "learning_rate": 7.998999555228777e-05, + "loss": 1.9589, + "step": 6762 + }, + { + "epoch": 0.2955985838541894, + "grad_norm": 2.96875, + "learning_rate": 7.998449930219185e-05, + "loss": 3.0505, + "step": 6763 + }, + { + "epoch": 0.29564229205821935, + "grad_norm": 2.21875, + "learning_rate": 7.997900248624479e-05, + "loss": 1.55, + "step": 6764 + }, + { + "epoch": 0.2956860002622492, + "grad_norm": 2.1875, + "learning_rate": 7.997350510455032e-05, + "loss": 2.1873, + "step": 6765 + }, + { + "epoch": 0.2957297084662791, + "grad_norm": 2.140625, + "learning_rate": 7.99680071572122e-05, + "loss": 1.4288, + "step": 6766 + }, + { + "epoch": 0.29577341667030904, + "grad_norm": 2.15625, + "learning_rate": 7.996250864433415e-05, + "loss": 1.6355, + "step": 6767 + }, + { + "epoch": 0.2958171248743389, + "grad_norm": 2.265625, + "learning_rate": 7.995700956601995e-05, + "loss": 1.9442, + "step": 6768 + }, + { + "epoch": 0.2958608330783688, + "grad_norm": 2.46875, + "learning_rate": 7.995150992237339e-05, + "loss": 1.9044, + "step": 6769 + }, + { + "epoch": 0.2959045412823987, + "grad_norm": 2.1875, + "learning_rate": 7.994600971349825e-05, + "loss": 2.1128, + "step": 6770 + }, + { + "epoch": 0.2959482494864286, + "grad_norm": 2.1875, + "learning_rate": 7.994050893949832e-05, + "loss": 2.3428, + "step": 6771 + }, + { + "epoch": 0.2959919576904585, + "grad_norm": 2.109375, + "learning_rate": 7.993500760047739e-05, + "loss": 2.0457, + "step": 6772 + }, + { + "epoch": 0.2960356658944884, + "grad_norm": 2.796875, + "learning_rate": 7.992950569653932e-05, + "loss": 1.9142, + "step": 6773 + }, + { + "epoch": 0.2960793740985183, + "grad_norm": 1.921875, + "learning_rate": 7.992400322778791e-05, + "loss": 1.6232, + "step": 6774 + }, + { + "epoch": 0.2961230823025482, + "grad_norm": 2.015625, + "learning_rate": 7.9918500194327e-05, + "loss": 2.474, + "step": 6775 + }, + { + "epoch": 0.29616679050657807, + "grad_norm": 2.6875, + "learning_rate": 7.991299659626046e-05, + "loss": 1.8029, + "step": 6776 + }, + { + "epoch": 0.296210498710608, + "grad_norm": 2.15625, + "learning_rate": 7.990749243369214e-05, + "loss": 2.3675, + "step": 6777 + }, + { + "epoch": 0.2962542069146379, + "grad_norm": 2.203125, + "learning_rate": 7.99019877067259e-05, + "loss": 1.9141, + "step": 6778 + }, + { + "epoch": 0.29629791511866777, + "grad_norm": 2.171875, + "learning_rate": 7.989648241546563e-05, + "loss": 1.9746, + "step": 6779 + }, + { + "epoch": 0.29634162332269764, + "grad_norm": 2.421875, + "learning_rate": 7.989097656001524e-05, + "loss": 2.2656, + "step": 6780 + }, + { + "epoch": 0.2963853315267276, + "grad_norm": 1.875, + "learning_rate": 7.98854701404786e-05, + "loss": 1.8483, + "step": 6781 + }, + { + "epoch": 0.29642903973075746, + "grad_norm": 2.203125, + "learning_rate": 7.987996315695965e-05, + "loss": 2.1934, + "step": 6782 + }, + { + "epoch": 0.29647274793478734, + "grad_norm": 2.09375, + "learning_rate": 7.98744556095623e-05, + "loss": 1.8816, + "step": 6783 + }, + { + "epoch": 0.2965164561388173, + "grad_norm": 2.546875, + "learning_rate": 7.986894749839049e-05, + "loss": 1.7686, + "step": 6784 + }, + { + "epoch": 0.29656016434284715, + "grad_norm": 2.984375, + "learning_rate": 7.986343882354818e-05, + "loss": 2.356, + "step": 6785 + }, + { + "epoch": 0.29660387254687703, + "grad_norm": 3.21875, + "learning_rate": 7.985792958513931e-05, + "loss": 1.9254, + "step": 6786 + }, + { + "epoch": 0.29664758075090697, + "grad_norm": 2.921875, + "learning_rate": 7.985241978326786e-05, + "loss": 2.4049, + "step": 6787 + }, + { + "epoch": 0.29669128895493685, + "grad_norm": 2.84375, + "learning_rate": 7.984690941803779e-05, + "loss": 1.249, + "step": 6788 + }, + { + "epoch": 0.29673499715896673, + "grad_norm": 1.9453125, + "learning_rate": 7.984139848955309e-05, + "loss": 2.1914, + "step": 6789 + }, + { + "epoch": 0.2967787053629966, + "grad_norm": 1.9296875, + "learning_rate": 7.98358869979178e-05, + "loss": 1.8875, + "step": 6790 + }, + { + "epoch": 0.29682241356702654, + "grad_norm": 1.8046875, + "learning_rate": 7.983037494323588e-05, + "loss": 1.6059, + "step": 6791 + }, + { + "epoch": 0.2968661217710564, + "grad_norm": 1.953125, + "learning_rate": 7.982486232561138e-05, + "loss": 1.8311, + "step": 6792 + }, + { + "epoch": 0.2969098299750863, + "grad_norm": 2.140625, + "learning_rate": 7.981934914514829e-05, + "loss": 2.5336, + "step": 6793 + }, + { + "epoch": 0.29695353817911624, + "grad_norm": 2.890625, + "learning_rate": 7.98138354019507e-05, + "loss": 1.7158, + "step": 6794 + }, + { + "epoch": 0.2969972463831461, + "grad_norm": 1.9375, + "learning_rate": 7.980832109612265e-05, + "loss": 1.6328, + "step": 6795 + }, + { + "epoch": 0.297040954587176, + "grad_norm": 2.359375, + "learning_rate": 7.980280622776819e-05, + "loss": 1.5029, + "step": 6796 + }, + { + "epoch": 0.29708466279120593, + "grad_norm": 3.09375, + "learning_rate": 7.979729079699141e-05, + "loss": 1.9137, + "step": 6797 + }, + { + "epoch": 0.2971283709952358, + "grad_norm": 2.28125, + "learning_rate": 7.979177480389637e-05, + "loss": 1.9737, + "step": 6798 + }, + { + "epoch": 0.2971720791992657, + "grad_norm": 2.96875, + "learning_rate": 7.978625824858719e-05, + "loss": 2.503, + "step": 6799 + }, + { + "epoch": 0.2972157874032956, + "grad_norm": 2.515625, + "learning_rate": 7.978074113116796e-05, + "loss": 1.8997, + "step": 6800 + }, + { + "epoch": 0.2972594956073255, + "grad_norm": 2.828125, + "learning_rate": 7.977522345174281e-05, + "loss": 1.8808, + "step": 6801 + }, + { + "epoch": 0.2973032038113554, + "grad_norm": 2.515625, + "learning_rate": 7.976970521041585e-05, + "loss": 2.4508, + "step": 6802 + }, + { + "epoch": 0.29734691201538527, + "grad_norm": 2.484375, + "learning_rate": 7.976418640729122e-05, + "loss": 1.4971, + "step": 6803 + }, + { + "epoch": 0.2973906202194152, + "grad_norm": 2.0625, + "learning_rate": 7.975866704247307e-05, + "loss": 1.9584, + "step": 6804 + }, + { + "epoch": 0.2974343284234451, + "grad_norm": 2.25, + "learning_rate": 7.975314711606558e-05, + "loss": 2.1464, + "step": 6805 + }, + { + "epoch": 0.29747803662747496, + "grad_norm": 1.9453125, + "learning_rate": 7.974762662817289e-05, + "loss": 1.552, + "step": 6806 + }, + { + "epoch": 0.2975217448315049, + "grad_norm": 2.984375, + "learning_rate": 7.974210557889919e-05, + "loss": 2.6815, + "step": 6807 + }, + { + "epoch": 0.2975654530355348, + "grad_norm": 2.34375, + "learning_rate": 7.973658396834866e-05, + "loss": 2.0794, + "step": 6808 + }, + { + "epoch": 0.29760916123956466, + "grad_norm": 2.296875, + "learning_rate": 7.973106179662553e-05, + "loss": 2.1026, + "step": 6809 + }, + { + "epoch": 0.29765286944359454, + "grad_norm": 1.9453125, + "learning_rate": 7.972553906383398e-05, + "loss": 1.8778, + "step": 6810 + }, + { + "epoch": 0.29769657764762447, + "grad_norm": 2.375, + "learning_rate": 7.972001577007825e-05, + "loss": 2.0974, + "step": 6811 + }, + { + "epoch": 0.29774028585165435, + "grad_norm": 2.46875, + "learning_rate": 7.971449191546256e-05, + "loss": 2.1475, + "step": 6812 + }, + { + "epoch": 0.29778399405568423, + "grad_norm": 1.96875, + "learning_rate": 7.970896750009115e-05, + "loss": 1.5038, + "step": 6813 + }, + { + "epoch": 0.29782770225971417, + "grad_norm": 1.953125, + "learning_rate": 7.970344252406831e-05, + "loss": 2.0435, + "step": 6814 + }, + { + "epoch": 0.29787141046374405, + "grad_norm": 2.953125, + "learning_rate": 7.969791698749827e-05, + "loss": 1.9473, + "step": 6815 + }, + { + "epoch": 0.2979151186677739, + "grad_norm": 2.40625, + "learning_rate": 7.969239089048531e-05, + "loss": 2.4137, + "step": 6816 + }, + { + "epoch": 0.29795882687180386, + "grad_norm": 2.125, + "learning_rate": 7.968686423313372e-05, + "loss": 2.2156, + "step": 6817 + }, + { + "epoch": 0.29800253507583374, + "grad_norm": 2.125, + "learning_rate": 7.968133701554779e-05, + "loss": 1.4572, + "step": 6818 + }, + { + "epoch": 0.2980462432798636, + "grad_norm": 2.4375, + "learning_rate": 7.967580923783184e-05, + "loss": 2.623, + "step": 6819 + }, + { + "epoch": 0.2980899514838935, + "grad_norm": 1.890625, + "learning_rate": 7.967028090009016e-05, + "loss": 1.7034, + "step": 6820 + }, + { + "epoch": 0.29813365968792344, + "grad_norm": 2.21875, + "learning_rate": 7.966475200242713e-05, + "loss": 1.4808, + "step": 6821 + }, + { + "epoch": 0.2981773678919533, + "grad_norm": 4.71875, + "learning_rate": 7.965922254494702e-05, + "loss": 1.6554, + "step": 6822 + }, + { + "epoch": 0.2982210760959832, + "grad_norm": 2.265625, + "learning_rate": 7.965369252775424e-05, + "loss": 1.5935, + "step": 6823 + }, + { + "epoch": 0.29826478430001313, + "grad_norm": 1.8671875, + "learning_rate": 7.96481619509531e-05, + "loss": 1.3847, + "step": 6824 + }, + { + "epoch": 0.298308492504043, + "grad_norm": 2.109375, + "learning_rate": 7.9642630814648e-05, + "loss": 2.2384, + "step": 6825 + }, + { + "epoch": 0.2983522007080729, + "grad_norm": 2.09375, + "learning_rate": 7.963709911894333e-05, + "loss": 1.6818, + "step": 6826 + }, + { + "epoch": 0.2983959089121028, + "grad_norm": 1.8984375, + "learning_rate": 7.963156686394345e-05, + "loss": 1.734, + "step": 6827 + }, + { + "epoch": 0.2984396171161327, + "grad_norm": 2.375, + "learning_rate": 7.962603404975278e-05, + "loss": 2.9876, + "step": 6828 + }, + { + "epoch": 0.2984833253201626, + "grad_norm": 2.28125, + "learning_rate": 7.962050067647573e-05, + "loss": 2.0571, + "step": 6829 + }, + { + "epoch": 0.29852703352419246, + "grad_norm": 2.171875, + "learning_rate": 7.961496674421672e-05, + "loss": 1.4679, + "step": 6830 + }, + { + "epoch": 0.2985707417282224, + "grad_norm": 2.046875, + "learning_rate": 7.960943225308019e-05, + "loss": 1.9087, + "step": 6831 + }, + { + "epoch": 0.2986144499322523, + "grad_norm": 2.109375, + "learning_rate": 7.960389720317057e-05, + "loss": 1.6903, + "step": 6832 + }, + { + "epoch": 0.29865815813628216, + "grad_norm": 1.96875, + "learning_rate": 7.959836159459231e-05, + "loss": 1.7672, + "step": 6833 + }, + { + "epoch": 0.2987018663403121, + "grad_norm": 1.8984375, + "learning_rate": 7.959282542744992e-05, + "loss": 1.6609, + "step": 6834 + }, + { + "epoch": 0.298745574544342, + "grad_norm": 1.9296875, + "learning_rate": 7.958728870184782e-05, + "loss": 1.8973, + "step": 6835 + }, + { + "epoch": 0.29878928274837185, + "grad_norm": 2.359375, + "learning_rate": 7.958175141789054e-05, + "loss": 1.6439, + "step": 6836 + }, + { + "epoch": 0.2988329909524018, + "grad_norm": 2.0625, + "learning_rate": 7.957621357568254e-05, + "loss": 1.9569, + "step": 6837 + }, + { + "epoch": 0.29887669915643167, + "grad_norm": 2.96875, + "learning_rate": 7.957067517532835e-05, + "loss": 2.3333, + "step": 6838 + }, + { + "epoch": 0.29892040736046155, + "grad_norm": 2.53125, + "learning_rate": 7.956513621693248e-05, + "loss": 1.7887, + "step": 6839 + }, + { + "epoch": 0.29896411556449143, + "grad_norm": 2.21875, + "learning_rate": 7.955959670059947e-05, + "loss": 1.6348, + "step": 6840 + }, + { + "epoch": 0.29900782376852136, + "grad_norm": 2.390625, + "learning_rate": 7.955405662643384e-05, + "loss": 2.1351, + "step": 6841 + }, + { + "epoch": 0.29905153197255124, + "grad_norm": 1.984375, + "learning_rate": 7.954851599454014e-05, + "loss": 2.4259, + "step": 6842 + }, + { + "epoch": 0.2990952401765811, + "grad_norm": 1.921875, + "learning_rate": 7.954297480502293e-05, + "loss": 1.6529, + "step": 6843 + }, + { + "epoch": 0.29913894838061106, + "grad_norm": 1.953125, + "learning_rate": 7.953743305798682e-05, + "loss": 1.8452, + "step": 6844 + }, + { + "epoch": 0.29918265658464094, + "grad_norm": 1.9609375, + "learning_rate": 7.953189075353633e-05, + "loss": 1.8695, + "step": 6845 + }, + { + "epoch": 0.2992263647886708, + "grad_norm": 2.3125, + "learning_rate": 7.95263478917761e-05, + "loss": 2.2973, + "step": 6846 + }, + { + "epoch": 0.29927007299270075, + "grad_norm": 2.015625, + "learning_rate": 7.95208044728107e-05, + "loss": 1.5313, + "step": 6847 + }, + { + "epoch": 0.29931378119673063, + "grad_norm": 2.140625, + "learning_rate": 7.951526049674475e-05, + "loss": 1.9208, + "step": 6848 + }, + { + "epoch": 0.2993574894007605, + "grad_norm": 2.359375, + "learning_rate": 7.950971596368289e-05, + "loss": 1.8555, + "step": 6849 + }, + { + "epoch": 0.2994011976047904, + "grad_norm": 2.75, + "learning_rate": 7.950417087372972e-05, + "loss": 1.3852, + "step": 6850 + }, + { + "epoch": 0.2994449058088203, + "grad_norm": 2.765625, + "learning_rate": 7.949862522698992e-05, + "loss": 2.5552, + "step": 6851 + }, + { + "epoch": 0.2994886140128502, + "grad_norm": 2.0, + "learning_rate": 7.949307902356813e-05, + "loss": 1.6658, + "step": 6852 + }, + { + "epoch": 0.2995323222168801, + "grad_norm": 2.140625, + "learning_rate": 7.9487532263569e-05, + "loss": 2.0919, + "step": 6853 + }, + { + "epoch": 0.29957603042091, + "grad_norm": 2.203125, + "learning_rate": 7.948198494709724e-05, + "loss": 1.8746, + "step": 6854 + }, + { + "epoch": 0.2996197386249399, + "grad_norm": 2.125, + "learning_rate": 7.947643707425749e-05, + "loss": 1.9121, + "step": 6855 + }, + { + "epoch": 0.2996634468289698, + "grad_norm": 2.875, + "learning_rate": 7.94708886451545e-05, + "loss": 2.0367, + "step": 6856 + }, + { + "epoch": 0.2997071550329997, + "grad_norm": 2.390625, + "learning_rate": 7.946533965989293e-05, + "loss": 1.6858, + "step": 6857 + }, + { + "epoch": 0.2997508632370296, + "grad_norm": 2.1875, + "learning_rate": 7.945979011857751e-05, + "loss": 2.0082, + "step": 6858 + }, + { + "epoch": 0.2997945714410595, + "grad_norm": 1.984375, + "learning_rate": 7.945424002131298e-05, + "loss": 1.92, + "step": 6859 + }, + { + "epoch": 0.29983827964508936, + "grad_norm": 1.8828125, + "learning_rate": 7.944868936820408e-05, + "loss": 1.7948, + "step": 6860 + }, + { + "epoch": 0.2998819878491193, + "grad_norm": 1.9453125, + "learning_rate": 7.944313815935556e-05, + "loss": 1.986, + "step": 6861 + }, + { + "epoch": 0.29992569605314917, + "grad_norm": 2.015625, + "learning_rate": 7.943758639487216e-05, + "loss": 1.5364, + "step": 6862 + }, + { + "epoch": 0.29996940425717905, + "grad_norm": 2.0625, + "learning_rate": 7.943203407485864e-05, + "loss": 2.095, + "step": 6863 + }, + { + "epoch": 0.300013112461209, + "grad_norm": 3.328125, + "learning_rate": 7.942648119941982e-05, + "loss": 1.8194, + "step": 6864 + }, + { + "epoch": 0.30005682066523887, + "grad_norm": 2.03125, + "learning_rate": 7.942092776866048e-05, + "loss": 1.7115, + "step": 6865 + }, + { + "epoch": 0.30010052886926875, + "grad_norm": 2.5, + "learning_rate": 7.94153737826854e-05, + "loss": 1.4567, + "step": 6866 + }, + { + "epoch": 0.3001442370732987, + "grad_norm": 2.140625, + "learning_rate": 7.94098192415994e-05, + "loss": 2.0076, + "step": 6867 + }, + { + "epoch": 0.30018794527732856, + "grad_norm": 1.96875, + "learning_rate": 7.940426414550732e-05, + "loss": 1.7571, + "step": 6868 + }, + { + "epoch": 0.30023165348135844, + "grad_norm": 2.421875, + "learning_rate": 7.939870849451398e-05, + "loss": 1.8405, + "step": 6869 + }, + { + "epoch": 0.3002753616853883, + "grad_norm": 3.3125, + "learning_rate": 7.939315228872421e-05, + "loss": 2.0807, + "step": 6870 + }, + { + "epoch": 0.30031906988941826, + "grad_norm": 3.53125, + "learning_rate": 7.938759552824288e-05, + "loss": 2.4804, + "step": 6871 + }, + { + "epoch": 0.30036277809344814, + "grad_norm": 2.265625, + "learning_rate": 7.938203821317487e-05, + "loss": 2.3102, + "step": 6872 + }, + { + "epoch": 0.300406486297478, + "grad_norm": 1.8515625, + "learning_rate": 7.937648034362502e-05, + "loss": 1.4459, + "step": 6873 + }, + { + "epoch": 0.30045019450150795, + "grad_norm": 1.796875, + "learning_rate": 7.937092191969821e-05, + "loss": 1.3184, + "step": 6874 + }, + { + "epoch": 0.30049390270553783, + "grad_norm": 2.140625, + "learning_rate": 7.936536294149939e-05, + "loss": 1.9091, + "step": 6875 + }, + { + "epoch": 0.3005376109095677, + "grad_norm": 1.8515625, + "learning_rate": 7.935980340913342e-05, + "loss": 1.5597, + "step": 6876 + }, + { + "epoch": 0.30058131911359764, + "grad_norm": 1.8671875, + "learning_rate": 7.935424332270522e-05, + "loss": 1.9584, + "step": 6877 + }, + { + "epoch": 0.3006250273176275, + "grad_norm": 2.171875, + "learning_rate": 7.934868268231973e-05, + "loss": 1.6071, + "step": 6878 + }, + { + "epoch": 0.3006687355216574, + "grad_norm": 2.234375, + "learning_rate": 7.93431214880819e-05, + "loss": 2.0378, + "step": 6879 + }, + { + "epoch": 0.3007124437256873, + "grad_norm": 2.515625, + "learning_rate": 7.933755974009663e-05, + "loss": 2.7808, + "step": 6880 + }, + { + "epoch": 0.3007561519297172, + "grad_norm": 2.015625, + "learning_rate": 7.933199743846893e-05, + "loss": 1.7805, + "step": 6881 + }, + { + "epoch": 0.3007998601337471, + "grad_norm": 3.109375, + "learning_rate": 7.932643458330374e-05, + "loss": 2.1663, + "step": 6882 + }, + { + "epoch": 0.300843568337777, + "grad_norm": 2.25, + "learning_rate": 7.932087117470606e-05, + "loss": 1.845, + "step": 6883 + }, + { + "epoch": 0.3008872765418069, + "grad_norm": 1.984375, + "learning_rate": 7.931530721278084e-05, + "loss": 1.5432, + "step": 6884 + }, + { + "epoch": 0.3009309847458368, + "grad_norm": 2.40625, + "learning_rate": 7.930974269763313e-05, + "loss": 2.3063, + "step": 6885 + }, + { + "epoch": 0.3009746929498667, + "grad_norm": 2.171875, + "learning_rate": 7.93041776293679e-05, + "loss": 2.1164, + "step": 6886 + }, + { + "epoch": 0.3010184011538966, + "grad_norm": 2.53125, + "learning_rate": 7.929861200809021e-05, + "loss": 2.5617, + "step": 6887 + }, + { + "epoch": 0.3010621093579265, + "grad_norm": 2.078125, + "learning_rate": 7.929304583390505e-05, + "loss": 2.2743, + "step": 6888 + }, + { + "epoch": 0.30110581756195637, + "grad_norm": 2.203125, + "learning_rate": 7.92874791069175e-05, + "loss": 2.0906, + "step": 6889 + }, + { + "epoch": 0.30114952576598625, + "grad_norm": 2.0, + "learning_rate": 7.928191182723256e-05, + "loss": 1.8454, + "step": 6890 + }, + { + "epoch": 0.3011932339700162, + "grad_norm": 2.046875, + "learning_rate": 7.927634399495536e-05, + "loss": 1.8599, + "step": 6891 + }, + { + "epoch": 0.30123694217404606, + "grad_norm": 1.875, + "learning_rate": 7.927077561019092e-05, + "loss": 1.3247, + "step": 6892 + }, + { + "epoch": 0.30128065037807594, + "grad_norm": 1.8984375, + "learning_rate": 7.926520667304434e-05, + "loss": 1.5736, + "step": 6893 + }, + { + "epoch": 0.3013243585821059, + "grad_norm": 1.8984375, + "learning_rate": 7.925963718362073e-05, + "loss": 1.8178, + "step": 6894 + }, + { + "epoch": 0.30136806678613576, + "grad_norm": 2.0625, + "learning_rate": 7.925406714202517e-05, + "loss": 1.571, + "step": 6895 + }, + { + "epoch": 0.30141177499016564, + "grad_norm": 2.4375, + "learning_rate": 7.924849654836281e-05, + "loss": 2.1494, + "step": 6896 + }, + { + "epoch": 0.3014554831941956, + "grad_norm": 1.953125, + "learning_rate": 7.924292540273872e-05, + "loss": 1.5882, + "step": 6897 + }, + { + "epoch": 0.30149919139822545, + "grad_norm": 2.40625, + "learning_rate": 7.923735370525809e-05, + "loss": 1.4472, + "step": 6898 + }, + { + "epoch": 0.30154289960225533, + "grad_norm": 2.421875, + "learning_rate": 7.923178145602603e-05, + "loss": 2.0544, + "step": 6899 + }, + { + "epoch": 0.30158660780628527, + "grad_norm": 2.203125, + "learning_rate": 7.922620865514772e-05, + "loss": 2.23, + "step": 6900 + }, + { + "epoch": 0.30163031601031515, + "grad_norm": 2.34375, + "learning_rate": 7.92206353027283e-05, + "loss": 1.5023, + "step": 6901 + }, + { + "epoch": 0.301674024214345, + "grad_norm": 1.9375, + "learning_rate": 7.921506139887297e-05, + "loss": 1.8787, + "step": 6902 + }, + { + "epoch": 0.3017177324183749, + "grad_norm": 2.34375, + "learning_rate": 7.92094869436869e-05, + "loss": 1.8779, + "step": 6903 + }, + { + "epoch": 0.30176144062240484, + "grad_norm": 3.078125, + "learning_rate": 7.920391193727532e-05, + "loss": 2.8937, + "step": 6904 + }, + { + "epoch": 0.3018051488264347, + "grad_norm": 2.328125, + "learning_rate": 7.91983363797434e-05, + "loss": 2.0161, + "step": 6905 + }, + { + "epoch": 0.3018488570304646, + "grad_norm": 2.171875, + "learning_rate": 7.91927602711964e-05, + "loss": 1.6358, + "step": 6906 + }, + { + "epoch": 0.30189256523449454, + "grad_norm": 2.734375, + "learning_rate": 7.91871836117395e-05, + "loss": 1.476, + "step": 6907 + }, + { + "epoch": 0.3019362734385244, + "grad_norm": 2.21875, + "learning_rate": 7.918160640147798e-05, + "loss": 2.321, + "step": 6908 + }, + { + "epoch": 0.3019799816425543, + "grad_norm": 1.9453125, + "learning_rate": 7.917602864051706e-05, + "loss": 2.1206, + "step": 6909 + }, + { + "epoch": 0.30202368984658423, + "grad_norm": 2.046875, + "learning_rate": 7.917045032896202e-05, + "loss": 2.1465, + "step": 6910 + }, + { + "epoch": 0.3020673980506141, + "grad_norm": 2.0, + "learning_rate": 7.916487146691815e-05, + "loss": 1.9398, + "step": 6911 + }, + { + "epoch": 0.302111106254644, + "grad_norm": 2.328125, + "learning_rate": 7.915929205449069e-05, + "loss": 1.5202, + "step": 6912 + }, + { + "epoch": 0.30215481445867387, + "grad_norm": 1.75, + "learning_rate": 7.915371209178494e-05, + "loss": 1.6357, + "step": 6913 + }, + { + "epoch": 0.3021985226627038, + "grad_norm": 2.359375, + "learning_rate": 7.914813157890623e-05, + "loss": 1.6771, + "step": 6914 + }, + { + "epoch": 0.3022422308667337, + "grad_norm": 2.171875, + "learning_rate": 7.914255051595984e-05, + "loss": 2.1166, + "step": 6915 + }, + { + "epoch": 0.30228593907076357, + "grad_norm": 2.125, + "learning_rate": 7.913696890305112e-05, + "loss": 1.9647, + "step": 6916 + }, + { + "epoch": 0.3023296472747935, + "grad_norm": 1.8203125, + "learning_rate": 7.913138674028537e-05, + "loss": 1.7212, + "step": 6917 + }, + { + "epoch": 0.3023733554788234, + "grad_norm": 2.21875, + "learning_rate": 7.912580402776797e-05, + "loss": 1.934, + "step": 6918 + }, + { + "epoch": 0.30241706368285326, + "grad_norm": 2.046875, + "learning_rate": 7.912022076560426e-05, + "loss": 2.1029, + "step": 6919 + }, + { + "epoch": 0.3024607718868832, + "grad_norm": 1.96875, + "learning_rate": 7.911463695389959e-05, + "loss": 2.3138, + "step": 6920 + }, + { + "epoch": 0.3025044800909131, + "grad_norm": 3.640625, + "learning_rate": 7.910905259275936e-05, + "loss": 1.9484, + "step": 6921 + }, + { + "epoch": 0.30254818829494295, + "grad_norm": 2.6875, + "learning_rate": 7.910346768228894e-05, + "loss": 2.4706, + "step": 6922 + }, + { + "epoch": 0.30259189649897283, + "grad_norm": 2.25, + "learning_rate": 7.909788222259372e-05, + "loss": 2.3833, + "step": 6923 + }, + { + "epoch": 0.30263560470300277, + "grad_norm": 2.296875, + "learning_rate": 7.90922962137791e-05, + "loss": 1.9278, + "step": 6924 + }, + { + "epoch": 0.30267931290703265, + "grad_norm": 2.671875, + "learning_rate": 7.908670965595052e-05, + "loss": 2.091, + "step": 6925 + }, + { + "epoch": 0.30272302111106253, + "grad_norm": 1.8984375, + "learning_rate": 7.908112254921341e-05, + "loss": 1.9959, + "step": 6926 + }, + { + "epoch": 0.30276672931509246, + "grad_norm": 1.953125, + "learning_rate": 7.907553489367316e-05, + "loss": 1.8171, + "step": 6927 + }, + { + "epoch": 0.30281043751912234, + "grad_norm": 2.5, + "learning_rate": 7.906994668943528e-05, + "loss": 1.744, + "step": 6928 + }, + { + "epoch": 0.3028541457231522, + "grad_norm": 3.0, + "learning_rate": 7.906435793660519e-05, + "loss": 2.5186, + "step": 6929 + }, + { + "epoch": 0.30289785392718216, + "grad_norm": 2.4375, + "learning_rate": 7.905876863528834e-05, + "loss": 2.2941, + "step": 6930 + }, + { + "epoch": 0.30294156213121204, + "grad_norm": 2.484375, + "learning_rate": 7.905317878559026e-05, + "loss": 1.5411, + "step": 6931 + }, + { + "epoch": 0.3029852703352419, + "grad_norm": 2.3125, + "learning_rate": 7.90475883876164e-05, + "loss": 2.3315, + "step": 6932 + }, + { + "epoch": 0.3030289785392718, + "grad_norm": 3.828125, + "learning_rate": 7.904199744147228e-05, + "loss": 2.2387, + "step": 6933 + }, + { + "epoch": 0.30307268674330173, + "grad_norm": 2.828125, + "learning_rate": 7.903640594726339e-05, + "loss": 2.0719, + "step": 6934 + }, + { + "epoch": 0.3031163949473316, + "grad_norm": 2.015625, + "learning_rate": 7.903081390509525e-05, + "loss": 1.5947, + "step": 6935 + }, + { + "epoch": 0.3031601031513615, + "grad_norm": 2.03125, + "learning_rate": 7.902522131507341e-05, + "loss": 1.4249, + "step": 6936 + }, + { + "epoch": 0.30320381135539143, + "grad_norm": 2.109375, + "learning_rate": 7.901962817730341e-05, + "loss": 1.5904, + "step": 6937 + }, + { + "epoch": 0.3032475195594213, + "grad_norm": 1.921875, + "learning_rate": 7.901403449189077e-05, + "loss": 1.9359, + "step": 6938 + }, + { + "epoch": 0.3032912277634512, + "grad_norm": 2.140625, + "learning_rate": 7.900844025894109e-05, + "loss": 2.1124, + "step": 6939 + }, + { + "epoch": 0.3033349359674811, + "grad_norm": 3.015625, + "learning_rate": 7.900284547855991e-05, + "loss": 2.2528, + "step": 6940 + }, + { + "epoch": 0.303378644171511, + "grad_norm": 2.28125, + "learning_rate": 7.899725015085285e-05, + "loss": 2.0718, + "step": 6941 + }, + { + "epoch": 0.3034223523755409, + "grad_norm": 2.546875, + "learning_rate": 7.899165427592543e-05, + "loss": 2.1333, + "step": 6942 + }, + { + "epoch": 0.30346606057957076, + "grad_norm": 2.953125, + "learning_rate": 7.898605785388334e-05, + "loss": 1.8171, + "step": 6943 + }, + { + "epoch": 0.3035097687836007, + "grad_norm": 2.515625, + "learning_rate": 7.898046088483214e-05, + "loss": 2.5504, + "step": 6944 + }, + { + "epoch": 0.3035534769876306, + "grad_norm": 2.5, + "learning_rate": 7.897486336887746e-05, + "loss": 2.1916, + "step": 6945 + }, + { + "epoch": 0.30359718519166046, + "grad_norm": 1.8671875, + "learning_rate": 7.896926530612492e-05, + "loss": 1.616, + "step": 6946 + }, + { + "epoch": 0.3036408933956904, + "grad_norm": 2.28125, + "learning_rate": 7.89636666966802e-05, + "loss": 1.3841, + "step": 6947 + }, + { + "epoch": 0.30368460159972027, + "grad_norm": 2.890625, + "learning_rate": 7.895806754064893e-05, + "loss": 1.758, + "step": 6948 + }, + { + "epoch": 0.30372830980375015, + "grad_norm": 2.421875, + "learning_rate": 7.895246783813677e-05, + "loss": 2.1847, + "step": 6949 + }, + { + "epoch": 0.3037720180077801, + "grad_norm": 3.515625, + "learning_rate": 7.894686758924942e-05, + "loss": 1.5626, + "step": 6950 + }, + { + "epoch": 0.30381572621180997, + "grad_norm": 2.078125, + "learning_rate": 7.894126679409254e-05, + "loss": 1.8753, + "step": 6951 + }, + { + "epoch": 0.30385943441583985, + "grad_norm": 2.234375, + "learning_rate": 7.893566545277184e-05, + "loss": 1.9157, + "step": 6952 + }, + { + "epoch": 0.3039031426198697, + "grad_norm": 1.984375, + "learning_rate": 7.893006356539303e-05, + "loss": 1.4586, + "step": 6953 + }, + { + "epoch": 0.30394685082389966, + "grad_norm": 2.59375, + "learning_rate": 7.89244611320618e-05, + "loss": 2.8266, + "step": 6954 + }, + { + "epoch": 0.30399055902792954, + "grad_norm": 2.53125, + "learning_rate": 7.891885815288388e-05, + "loss": 2.0882, + "step": 6955 + }, + { + "epoch": 0.3040342672319594, + "grad_norm": 3.0, + "learning_rate": 7.891325462796503e-05, + "loss": 3.2572, + "step": 6956 + }, + { + "epoch": 0.30407797543598936, + "grad_norm": 2.328125, + "learning_rate": 7.890765055741098e-05, + "loss": 2.1291, + "step": 6957 + }, + { + "epoch": 0.30412168364001924, + "grad_norm": 12.75, + "learning_rate": 7.89020459413275e-05, + "loss": 2.1205, + "step": 6958 + }, + { + "epoch": 0.3041653918440491, + "grad_norm": 2.0625, + "learning_rate": 7.889644077982033e-05, + "loss": 1.6894, + "step": 6959 + }, + { + "epoch": 0.30420910004807905, + "grad_norm": 1.9921875, + "learning_rate": 7.889083507299529e-05, + "loss": 1.7101, + "step": 6960 + }, + { + "epoch": 0.30425280825210893, + "grad_norm": 2.421875, + "learning_rate": 7.888522882095813e-05, + "loss": 1.3514, + "step": 6961 + }, + { + "epoch": 0.3042965164561388, + "grad_norm": 2.421875, + "learning_rate": 7.887962202381465e-05, + "loss": 1.7874, + "step": 6962 + }, + { + "epoch": 0.3043402246601687, + "grad_norm": 2.09375, + "learning_rate": 7.887401468167068e-05, + "loss": 1.7584, + "step": 6963 + }, + { + "epoch": 0.3043839328641986, + "grad_norm": 2.234375, + "learning_rate": 7.886840679463203e-05, + "loss": 1.8778, + "step": 6964 + }, + { + "epoch": 0.3044276410682285, + "grad_norm": 2.71875, + "learning_rate": 7.886279836280454e-05, + "loss": 1.6752, + "step": 6965 + }, + { + "epoch": 0.3044713492722584, + "grad_norm": 2.484375, + "learning_rate": 7.885718938629402e-05, + "loss": 1.838, + "step": 6966 + }, + { + "epoch": 0.3045150574762883, + "grad_norm": 3.5625, + "learning_rate": 7.885157986520634e-05, + "loss": 2.1676, + "step": 6967 + }, + { + "epoch": 0.3045587656803182, + "grad_norm": 2.234375, + "learning_rate": 7.884596979964736e-05, + "loss": 1.8915, + "step": 6968 + }, + { + "epoch": 0.3046024738843481, + "grad_norm": 1.875, + "learning_rate": 7.884035918972295e-05, + "loss": 1.6817, + "step": 6969 + }, + { + "epoch": 0.304646182088378, + "grad_norm": 2.015625, + "learning_rate": 7.883474803553899e-05, + "loss": 1.9612, + "step": 6970 + }, + { + "epoch": 0.3046898902924079, + "grad_norm": 1.8984375, + "learning_rate": 7.882913633720135e-05, + "loss": 1.4549, + "step": 6971 + }, + { + "epoch": 0.3047335984964378, + "grad_norm": 2.234375, + "learning_rate": 7.882352409481597e-05, + "loss": 1.7752, + "step": 6972 + }, + { + "epoch": 0.30477730670046765, + "grad_norm": 3.9375, + "learning_rate": 7.881791130848873e-05, + "loss": 1.3633, + "step": 6973 + }, + { + "epoch": 0.3048210149044976, + "grad_norm": 2.1875, + "learning_rate": 7.881229797832554e-05, + "loss": 1.5878, + "step": 6974 + }, + { + "epoch": 0.30486472310852747, + "grad_norm": 2.15625, + "learning_rate": 7.880668410443238e-05, + "loss": 2.0573, + "step": 6975 + }, + { + "epoch": 0.30490843131255735, + "grad_norm": 2.015625, + "learning_rate": 7.880106968691517e-05, + "loss": 1.8925, + "step": 6976 + }, + { + "epoch": 0.3049521395165873, + "grad_norm": 2.078125, + "learning_rate": 7.879545472587984e-05, + "loss": 1.6229, + "step": 6977 + }, + { + "epoch": 0.30499584772061716, + "grad_norm": 2.40625, + "learning_rate": 7.878983922143237e-05, + "loss": 1.9097, + "step": 6978 + }, + { + "epoch": 0.30503955592464704, + "grad_norm": 2.515625, + "learning_rate": 7.878422317367873e-05, + "loss": 1.9204, + "step": 6979 + }, + { + "epoch": 0.305083264128677, + "grad_norm": 2.234375, + "learning_rate": 7.877860658272491e-05, + "loss": 2.0385, + "step": 6980 + }, + { + "epoch": 0.30512697233270686, + "grad_norm": 2.359375, + "learning_rate": 7.87729894486769e-05, + "loss": 1.7698, + "step": 6981 + }, + { + "epoch": 0.30517068053673674, + "grad_norm": 2.140625, + "learning_rate": 7.876737177164071e-05, + "loss": 2.0247, + "step": 6982 + }, + { + "epoch": 0.3052143887407666, + "grad_norm": 2.515625, + "learning_rate": 7.876175355172234e-05, + "loss": 2.0221, + "step": 6983 + }, + { + "epoch": 0.30525809694479655, + "grad_norm": 2.0, + "learning_rate": 7.875613478902782e-05, + "loss": 1.7424, + "step": 6984 + }, + { + "epoch": 0.30530180514882643, + "grad_norm": 2.21875, + "learning_rate": 7.875051548366316e-05, + "loss": 1.5619, + "step": 6985 + }, + { + "epoch": 0.3053455133528563, + "grad_norm": 2.078125, + "learning_rate": 7.874489563573446e-05, + "loss": 1.806, + "step": 6986 + }, + { + "epoch": 0.30538922155688625, + "grad_norm": 2.1875, + "learning_rate": 7.873927524534775e-05, + "loss": 2.3417, + "step": 6987 + }, + { + "epoch": 0.3054329297609161, + "grad_norm": 1.9296875, + "learning_rate": 7.873365431260906e-05, + "loss": 1.8671, + "step": 6988 + }, + { + "epoch": 0.305476637964946, + "grad_norm": 1.875, + "learning_rate": 7.87280328376245e-05, + "loss": 1.7924, + "step": 6989 + }, + { + "epoch": 0.30552034616897594, + "grad_norm": 1.8984375, + "learning_rate": 7.872241082050016e-05, + "loss": 1.9036, + "step": 6990 + }, + { + "epoch": 0.3055640543730058, + "grad_norm": 1.875, + "learning_rate": 7.871678826134211e-05, + "loss": 1.6127, + "step": 6991 + }, + { + "epoch": 0.3056077625770357, + "grad_norm": 2.5, + "learning_rate": 7.871116516025647e-05, + "loss": 1.8068, + "step": 6992 + }, + { + "epoch": 0.3056514707810656, + "grad_norm": 2.421875, + "learning_rate": 7.870554151734937e-05, + "loss": 2.2059, + "step": 6993 + }, + { + "epoch": 0.3056951789850955, + "grad_norm": 2.484375, + "learning_rate": 7.869991733272692e-05, + "loss": 1.8673, + "step": 6994 + }, + { + "epoch": 0.3057388871891254, + "grad_norm": 1.953125, + "learning_rate": 7.869429260649526e-05, + "loss": 1.5146, + "step": 6995 + }, + { + "epoch": 0.3057825953931553, + "grad_norm": 3.640625, + "learning_rate": 7.868866733876052e-05, + "loss": 1.1441, + "step": 6996 + }, + { + "epoch": 0.3058263035971852, + "grad_norm": 2.109375, + "learning_rate": 7.868304152962889e-05, + "loss": 1.926, + "step": 6997 + }, + { + "epoch": 0.3058700118012151, + "grad_norm": 2.59375, + "learning_rate": 7.867741517920653e-05, + "loss": 1.4671, + "step": 6998 + }, + { + "epoch": 0.30591372000524497, + "grad_norm": 2.40625, + "learning_rate": 7.867178828759958e-05, + "loss": 2.3758, + "step": 6999 + }, + { + "epoch": 0.3059574282092749, + "grad_norm": 1.9296875, + "learning_rate": 7.866616085491426e-05, + "loss": 1.7675, + "step": 7000 + }, + { + "epoch": 0.3060011364133048, + "grad_norm": 1.7578125, + "learning_rate": 7.866053288125678e-05, + "loss": 1.6345, + "step": 7001 + }, + { + "epoch": 0.30604484461733467, + "grad_norm": 2.328125, + "learning_rate": 7.865490436673331e-05, + "loss": 1.8859, + "step": 7002 + }, + { + "epoch": 0.30608855282136455, + "grad_norm": 2.421875, + "learning_rate": 7.864927531145011e-05, + "loss": 1.9555, + "step": 7003 + }, + { + "epoch": 0.3061322610253945, + "grad_norm": 2.09375, + "learning_rate": 7.864364571551337e-05, + "loss": 1.4031, + "step": 7004 + }, + { + "epoch": 0.30617596922942436, + "grad_norm": 2.46875, + "learning_rate": 7.863801557902936e-05, + "loss": 1.9749, + "step": 7005 + }, + { + "epoch": 0.30621967743345424, + "grad_norm": 2.46875, + "learning_rate": 7.863238490210432e-05, + "loss": 1.5911, + "step": 7006 + }, + { + "epoch": 0.3062633856374842, + "grad_norm": 2.984375, + "learning_rate": 7.862675368484449e-05, + "loss": 2.1096, + "step": 7007 + }, + { + "epoch": 0.30630709384151406, + "grad_norm": 2.265625, + "learning_rate": 7.862112192735616e-05, + "loss": 1.6069, + "step": 7008 + }, + { + "epoch": 0.30635080204554394, + "grad_norm": 2.65625, + "learning_rate": 7.86154896297456e-05, + "loss": 2.3773, + "step": 7009 + }, + { + "epoch": 0.30639451024957387, + "grad_norm": 2.125, + "learning_rate": 7.86098567921191e-05, + "loss": 1.9284, + "step": 7010 + }, + { + "epoch": 0.30643821845360375, + "grad_norm": 2.171875, + "learning_rate": 7.860422341458298e-05, + "loss": 1.9852, + "step": 7011 + }, + { + "epoch": 0.30648192665763363, + "grad_norm": 2.1875, + "learning_rate": 7.859858949724351e-05, + "loss": 1.8512, + "step": 7012 + }, + { + "epoch": 0.3065256348616635, + "grad_norm": 2.171875, + "learning_rate": 7.859295504020706e-05, + "loss": 2.174, + "step": 7013 + }, + { + "epoch": 0.30656934306569344, + "grad_norm": 2.140625, + "learning_rate": 7.858732004357989e-05, + "loss": 1.7676, + "step": 7014 + }, + { + "epoch": 0.3066130512697233, + "grad_norm": 2.6875, + "learning_rate": 7.85816845074684e-05, + "loss": 1.7165, + "step": 7015 + }, + { + "epoch": 0.3066567594737532, + "grad_norm": 3.40625, + "learning_rate": 7.857604843197896e-05, + "loss": 2.595, + "step": 7016 + }, + { + "epoch": 0.30670046767778314, + "grad_norm": 2.65625, + "learning_rate": 7.857041181721787e-05, + "loss": 3.0552, + "step": 7017 + }, + { + "epoch": 0.306744175881813, + "grad_norm": 1.953125, + "learning_rate": 7.856477466329152e-05, + "loss": 1.7754, + "step": 7018 + }, + { + "epoch": 0.3067878840858429, + "grad_norm": 2.359375, + "learning_rate": 7.85591369703063e-05, + "loss": 1.7161, + "step": 7019 + }, + { + "epoch": 0.30683159228987283, + "grad_norm": 2.09375, + "learning_rate": 7.855349873836862e-05, + "loss": 2.4942, + "step": 7020 + }, + { + "epoch": 0.3068753004939027, + "grad_norm": 1.875, + "learning_rate": 7.854785996758485e-05, + "loss": 1.9154, + "step": 7021 + }, + { + "epoch": 0.3069190086979326, + "grad_norm": 1.8359375, + "learning_rate": 7.854222065806141e-05, + "loss": 1.6528, + "step": 7022 + }, + { + "epoch": 0.3069627169019625, + "grad_norm": 2.234375, + "learning_rate": 7.853658080990471e-05, + "loss": 1.9866, + "step": 7023 + }, + { + "epoch": 0.3070064251059924, + "grad_norm": 1.640625, + "learning_rate": 7.853094042322121e-05, + "loss": 1.3854, + "step": 7024 + }, + { + "epoch": 0.3070501333100223, + "grad_norm": 2.34375, + "learning_rate": 7.852529949811734e-05, + "loss": 2.4263, + "step": 7025 + }, + { + "epoch": 0.30709384151405217, + "grad_norm": 2.359375, + "learning_rate": 7.851965803469956e-05, + "loss": 1.3436, + "step": 7026 + }, + { + "epoch": 0.3071375497180821, + "grad_norm": 10.875, + "learning_rate": 7.851401603307431e-05, + "loss": 5.592, + "step": 7027 + }, + { + "epoch": 0.307181257922112, + "grad_norm": 2.296875, + "learning_rate": 7.85083734933481e-05, + "loss": 1.9178, + "step": 7028 + }, + { + "epoch": 0.30722496612614186, + "grad_norm": 2.453125, + "learning_rate": 7.850273041562737e-05, + "loss": 2.4896, + "step": 7029 + }, + { + "epoch": 0.3072686743301718, + "grad_norm": 2.140625, + "learning_rate": 7.849708680001862e-05, + "loss": 1.8103, + "step": 7030 + }, + { + "epoch": 0.3073123825342017, + "grad_norm": 2.28125, + "learning_rate": 7.84914426466284e-05, + "loss": 2.4037, + "step": 7031 + }, + { + "epoch": 0.30735609073823156, + "grad_norm": 2.34375, + "learning_rate": 7.848579795556316e-05, + "loss": 1.6429, + "step": 7032 + }, + { + "epoch": 0.30739979894226144, + "grad_norm": 1.84375, + "learning_rate": 7.848015272692947e-05, + "loss": 1.5081, + "step": 7033 + }, + { + "epoch": 0.3074435071462914, + "grad_norm": 3.5625, + "learning_rate": 7.847450696083385e-05, + "loss": 2.2542, + "step": 7034 + }, + { + "epoch": 0.30748721535032125, + "grad_norm": 2.5, + "learning_rate": 7.846886065738284e-05, + "loss": 1.5479, + "step": 7035 + }, + { + "epoch": 0.30753092355435113, + "grad_norm": 2.46875, + "learning_rate": 7.846321381668298e-05, + "loss": 2.4093, + "step": 7036 + }, + { + "epoch": 0.30757463175838107, + "grad_norm": 2.046875, + "learning_rate": 7.845756643884087e-05, + "loss": 1.6931, + "step": 7037 + }, + { + "epoch": 0.30761833996241095, + "grad_norm": 2.34375, + "learning_rate": 7.845191852396305e-05, + "loss": 2.3507, + "step": 7038 + }, + { + "epoch": 0.3076620481664408, + "grad_norm": 1.8203125, + "learning_rate": 7.844627007215613e-05, + "loss": 1.7684, + "step": 7039 + }, + { + "epoch": 0.30770575637047076, + "grad_norm": 2.25, + "learning_rate": 7.844062108352668e-05, + "loss": 1.9573, + "step": 7040 + }, + { + "epoch": 0.30774946457450064, + "grad_norm": 3.28125, + "learning_rate": 7.843497155818132e-05, + "loss": 2.1727, + "step": 7041 + }, + { + "epoch": 0.3077931727785305, + "grad_norm": 2.046875, + "learning_rate": 7.842932149622666e-05, + "loss": 1.7293, + "step": 7042 + }, + { + "epoch": 0.3078368809825604, + "grad_norm": 2.328125, + "learning_rate": 7.842367089776932e-05, + "loss": 1.8516, + "step": 7043 + }, + { + "epoch": 0.30788058918659034, + "grad_norm": 2.15625, + "learning_rate": 7.841801976291595e-05, + "loss": 1.437, + "step": 7044 + }, + { + "epoch": 0.3079242973906202, + "grad_norm": 2.015625, + "learning_rate": 7.841236809177317e-05, + "loss": 1.8, + "step": 7045 + }, + { + "epoch": 0.3079680055946501, + "grad_norm": 2.09375, + "learning_rate": 7.840671588444768e-05, + "loss": 1.8159, + "step": 7046 + }, + { + "epoch": 0.30801171379868003, + "grad_norm": 2.953125, + "learning_rate": 7.84010631410461e-05, + "loss": 2.0276, + "step": 7047 + }, + { + "epoch": 0.3080554220027099, + "grad_norm": 2.25, + "learning_rate": 7.839540986167514e-05, + "loss": 1.6176, + "step": 7048 + }, + { + "epoch": 0.3080991302067398, + "grad_norm": 2.15625, + "learning_rate": 7.838975604644146e-05, + "loss": 2.1571, + "step": 7049 + }, + { + "epoch": 0.3081428384107697, + "grad_norm": 2.296875, + "learning_rate": 7.838410169545176e-05, + "loss": 1.8845, + "step": 7050 + }, + { + "epoch": 0.3081865466147996, + "grad_norm": 2.046875, + "learning_rate": 7.837844680881274e-05, + "loss": 1.7899, + "step": 7051 + }, + { + "epoch": 0.3082302548188295, + "grad_norm": 3.390625, + "learning_rate": 7.837279138663114e-05, + "loss": 2.2053, + "step": 7052 + }, + { + "epoch": 0.30827396302285937, + "grad_norm": 2.3125, + "learning_rate": 7.836713542901366e-05, + "loss": 2.0172, + "step": 7053 + }, + { + "epoch": 0.3083176712268893, + "grad_norm": 2.515625, + "learning_rate": 7.836147893606707e-05, + "loss": 2.0153, + "step": 7054 + }, + { + "epoch": 0.3083613794309192, + "grad_norm": 3.25, + "learning_rate": 7.835582190789807e-05, + "loss": 2.5625, + "step": 7055 + }, + { + "epoch": 0.30840508763494906, + "grad_norm": 10.0625, + "learning_rate": 7.835016434461345e-05, + "loss": 1.7711, + "step": 7056 + }, + { + "epoch": 0.308448795838979, + "grad_norm": 2.078125, + "learning_rate": 7.834450624631996e-05, + "loss": 1.8224, + "step": 7057 + }, + { + "epoch": 0.3084925040430089, + "grad_norm": 2.09375, + "learning_rate": 7.83388476131244e-05, + "loss": 1.7466, + "step": 7058 + }, + { + "epoch": 0.30853621224703875, + "grad_norm": 2.328125, + "learning_rate": 7.833318844513353e-05, + "loss": 2.2057, + "step": 7059 + }, + { + "epoch": 0.3085799204510687, + "grad_norm": 2.140625, + "learning_rate": 7.832752874245415e-05, + "loss": 1.783, + "step": 7060 + }, + { + "epoch": 0.30862362865509857, + "grad_norm": 1.890625, + "learning_rate": 7.83218685051931e-05, + "loss": 1.6596, + "step": 7061 + }, + { + "epoch": 0.30866733685912845, + "grad_norm": 2.09375, + "learning_rate": 7.831620773345715e-05, + "loss": 1.66, + "step": 7062 + }, + { + "epoch": 0.30871104506315833, + "grad_norm": 2.15625, + "learning_rate": 7.831054642735315e-05, + "loss": 1.8626, + "step": 7063 + }, + { + "epoch": 0.30875475326718826, + "grad_norm": 2.609375, + "learning_rate": 7.830488458698794e-05, + "loss": 1.6357, + "step": 7064 + }, + { + "epoch": 0.30879846147121814, + "grad_norm": 2.046875, + "learning_rate": 7.829922221246835e-05, + "loss": 1.7975, + "step": 7065 + }, + { + "epoch": 0.308842169675248, + "grad_norm": 2.4375, + "learning_rate": 7.829355930390125e-05, + "loss": 1.8929, + "step": 7066 + }, + { + "epoch": 0.30888587787927796, + "grad_norm": 1.7265625, + "learning_rate": 7.828789586139352e-05, + "loss": 1.6512, + "step": 7067 + }, + { + "epoch": 0.30892958608330784, + "grad_norm": 1.890625, + "learning_rate": 7.828223188505202e-05, + "loss": 2.0205, + "step": 7068 + }, + { + "epoch": 0.3089732942873377, + "grad_norm": 1.984375, + "learning_rate": 7.827656737498365e-05, + "loss": 1.6325, + "step": 7069 + }, + { + "epoch": 0.30901700249136765, + "grad_norm": 1.9921875, + "learning_rate": 7.827090233129528e-05, + "loss": 1.7217, + "step": 7070 + }, + { + "epoch": 0.30906071069539753, + "grad_norm": 1.8828125, + "learning_rate": 7.826523675409385e-05, + "loss": 1.4903, + "step": 7071 + }, + { + "epoch": 0.3091044188994274, + "grad_norm": 1.953125, + "learning_rate": 7.825957064348625e-05, + "loss": 1.8773, + "step": 7072 + }, + { + "epoch": 0.3091481271034573, + "grad_norm": 2.6875, + "learning_rate": 7.825390399957944e-05, + "loss": 2.1115, + "step": 7073 + }, + { + "epoch": 0.30919183530748723, + "grad_norm": 1.8203125, + "learning_rate": 7.824823682248033e-05, + "loss": 1.6687, + "step": 7074 + }, + { + "epoch": 0.3092355435115171, + "grad_norm": 2.109375, + "learning_rate": 7.824256911229588e-05, + "loss": 1.6351, + "step": 7075 + }, + { + "epoch": 0.309279251715547, + "grad_norm": 2.359375, + "learning_rate": 7.823690086913305e-05, + "loss": 1.7782, + "step": 7076 + }, + { + "epoch": 0.3093229599195769, + "grad_norm": 2.265625, + "learning_rate": 7.82312320930988e-05, + "loss": 1.6454, + "step": 7077 + }, + { + "epoch": 0.3093666681236068, + "grad_norm": 2.1875, + "learning_rate": 7.822556278430011e-05, + "loss": 1.8831, + "step": 7078 + }, + { + "epoch": 0.3094103763276367, + "grad_norm": 1.984375, + "learning_rate": 7.821989294284397e-05, + "loss": 2.0986, + "step": 7079 + }, + { + "epoch": 0.3094540845316666, + "grad_norm": 1.9765625, + "learning_rate": 7.821422256883736e-05, + "loss": 1.6004, + "step": 7080 + }, + { + "epoch": 0.3094977927356965, + "grad_norm": 1.796875, + "learning_rate": 7.820855166238734e-05, + "loss": 1.667, + "step": 7081 + }, + { + "epoch": 0.3095415009397264, + "grad_norm": 2.390625, + "learning_rate": 7.820288022360087e-05, + "loss": 1.7269, + "step": 7082 + }, + { + "epoch": 0.30958520914375626, + "grad_norm": 2.28125, + "learning_rate": 7.819720825258501e-05, + "loss": 2.1683, + "step": 7083 + }, + { + "epoch": 0.3096289173477862, + "grad_norm": 1.7890625, + "learning_rate": 7.81915357494468e-05, + "loss": 1.5796, + "step": 7084 + }, + { + "epoch": 0.30967262555181607, + "grad_norm": 2.015625, + "learning_rate": 7.818586271429327e-05, + "loss": 1.6388, + "step": 7085 + }, + { + "epoch": 0.30971633375584595, + "grad_norm": 2.234375, + "learning_rate": 7.818018914723149e-05, + "loss": 1.7634, + "step": 7086 + }, + { + "epoch": 0.3097600419598759, + "grad_norm": 5.40625, + "learning_rate": 7.817451504836852e-05, + "loss": 1.5701, + "step": 7087 + }, + { + "epoch": 0.30980375016390577, + "grad_norm": 2.21875, + "learning_rate": 7.816884041781148e-05, + "loss": 1.5761, + "step": 7088 + }, + { + "epoch": 0.30984745836793565, + "grad_norm": 2.203125, + "learning_rate": 7.816316525566738e-05, + "loss": 1.9034, + "step": 7089 + }, + { + "epoch": 0.3098911665719656, + "grad_norm": 1.9296875, + "learning_rate": 7.815748956204337e-05, + "loss": 1.9308, + "step": 7090 + }, + { + "epoch": 0.30993487477599546, + "grad_norm": 2.0, + "learning_rate": 7.815181333704656e-05, + "loss": 1.768, + "step": 7091 + }, + { + "epoch": 0.30997858298002534, + "grad_norm": 2.359375, + "learning_rate": 7.814613658078407e-05, + "loss": 2.7116, + "step": 7092 + }, + { + "epoch": 0.3100222911840552, + "grad_norm": 2.28125, + "learning_rate": 7.814045929336299e-05, + "loss": 1.9427, + "step": 7093 + }, + { + "epoch": 0.31006599938808516, + "grad_norm": 2.578125, + "learning_rate": 7.813478147489052e-05, + "loss": 2.0114, + "step": 7094 + }, + { + "epoch": 0.31010970759211504, + "grad_norm": 3.1875, + "learning_rate": 7.812910312547375e-05, + "loss": 2.1606, + "step": 7095 + }, + { + "epoch": 0.3101534157961449, + "grad_norm": 2.078125, + "learning_rate": 7.812342424521988e-05, + "loss": 2.1392, + "step": 7096 + }, + { + "epoch": 0.31019712400017485, + "grad_norm": 2.03125, + "learning_rate": 7.811774483423605e-05, + "loss": 1.721, + "step": 7097 + }, + { + "epoch": 0.31024083220420473, + "grad_norm": 1.9140625, + "learning_rate": 7.811206489262945e-05, + "loss": 1.7087, + "step": 7098 + }, + { + "epoch": 0.3102845404082346, + "grad_norm": 2.515625, + "learning_rate": 7.810638442050728e-05, + "loss": 1.376, + "step": 7099 + }, + { + "epoch": 0.31032824861226455, + "grad_norm": 2.171875, + "learning_rate": 7.810070341797673e-05, + "loss": 1.9291, + "step": 7100 + }, + { + "epoch": 0.3103719568162944, + "grad_norm": 2.140625, + "learning_rate": 7.8095021885145e-05, + "loss": 2.0947, + "step": 7101 + }, + { + "epoch": 0.3104156650203243, + "grad_norm": 1.9765625, + "learning_rate": 7.808933982211933e-05, + "loss": 1.6669, + "step": 7102 + }, + { + "epoch": 0.3104593732243542, + "grad_norm": 1.890625, + "learning_rate": 7.808365722900693e-05, + "loss": 1.8462, + "step": 7103 + }, + { + "epoch": 0.3105030814283841, + "grad_norm": 2.109375, + "learning_rate": 7.807797410591504e-05, + "loss": 1.6853, + "step": 7104 + }, + { + "epoch": 0.310546789632414, + "grad_norm": 2.546875, + "learning_rate": 7.80722904529509e-05, + "loss": 1.6934, + "step": 7105 + }, + { + "epoch": 0.3105904978364439, + "grad_norm": 2.28125, + "learning_rate": 7.80666062702218e-05, + "loss": 1.8148, + "step": 7106 + }, + { + "epoch": 0.3106342060404738, + "grad_norm": 2.1875, + "learning_rate": 7.806092155783497e-05, + "loss": 2.3154, + "step": 7107 + }, + { + "epoch": 0.3106779142445037, + "grad_norm": 2.203125, + "learning_rate": 7.805523631589774e-05, + "loss": 2.4815, + "step": 7108 + }, + { + "epoch": 0.3107216224485336, + "grad_norm": 2.578125, + "learning_rate": 7.804955054451735e-05, + "loss": 2.6464, + "step": 7109 + }, + { + "epoch": 0.3107653306525635, + "grad_norm": 1.96875, + "learning_rate": 7.804386424380113e-05, + "loss": 1.8508, + "step": 7110 + }, + { + "epoch": 0.3108090388565934, + "grad_norm": 1.9140625, + "learning_rate": 7.803817741385635e-05, + "loss": 1.4178, + "step": 7111 + }, + { + "epoch": 0.31085274706062327, + "grad_norm": 1.9765625, + "learning_rate": 7.803249005479037e-05, + "loss": 1.8201, + "step": 7112 + }, + { + "epoch": 0.31089645526465315, + "grad_norm": 2.140625, + "learning_rate": 7.802680216671053e-05, + "loss": 2.3145, + "step": 7113 + }, + { + "epoch": 0.3109401634686831, + "grad_norm": 3.390625, + "learning_rate": 7.80211137497241e-05, + "loss": 1.7484, + "step": 7114 + }, + { + "epoch": 0.31098387167271296, + "grad_norm": 2.171875, + "learning_rate": 7.801542480393849e-05, + "loss": 2.0886, + "step": 7115 + }, + { + "epoch": 0.31102757987674284, + "grad_norm": 1.9921875, + "learning_rate": 7.800973532946104e-05, + "loss": 1.5976, + "step": 7116 + }, + { + "epoch": 0.3110712880807728, + "grad_norm": 2.359375, + "learning_rate": 7.800404532639911e-05, + "loss": 2.2827, + "step": 7117 + }, + { + "epoch": 0.31111499628480266, + "grad_norm": 2.0, + "learning_rate": 7.799835479486008e-05, + "loss": 1.6772, + "step": 7118 + }, + { + "epoch": 0.31115870448883254, + "grad_norm": 2.4375, + "learning_rate": 7.799266373495137e-05, + "loss": 1.6108, + "step": 7119 + }, + { + "epoch": 0.3112024126928625, + "grad_norm": 2.484375, + "learning_rate": 7.798697214678032e-05, + "loss": 2.0943, + "step": 7120 + }, + { + "epoch": 0.31124612089689235, + "grad_norm": 2.0, + "learning_rate": 7.79812800304544e-05, + "loss": 1.4641, + "step": 7121 + }, + { + "epoch": 0.31128982910092223, + "grad_norm": 2.0, + "learning_rate": 7.797558738608099e-05, + "loss": 1.6022, + "step": 7122 + }, + { + "epoch": 0.3113335373049521, + "grad_norm": 2.34375, + "learning_rate": 7.796989421376755e-05, + "loss": 1.8272, + "step": 7123 + }, + { + "epoch": 0.31137724550898205, + "grad_norm": 2.34375, + "learning_rate": 7.796420051362148e-05, + "loss": 2.2746, + "step": 7124 + }, + { + "epoch": 0.3114209537130119, + "grad_norm": 2.5, + "learning_rate": 7.795850628575024e-05, + "loss": 1.6013, + "step": 7125 + }, + { + "epoch": 0.3114646619170418, + "grad_norm": 2.578125, + "learning_rate": 7.79528115302613e-05, + "loss": 2.7004, + "step": 7126 + }, + { + "epoch": 0.31150837012107174, + "grad_norm": 2.3125, + "learning_rate": 7.794711624726213e-05, + "loss": 1.7298, + "step": 7127 + }, + { + "epoch": 0.3115520783251016, + "grad_norm": 2.71875, + "learning_rate": 7.79414204368602e-05, + "loss": 1.7595, + "step": 7128 + }, + { + "epoch": 0.3115957865291315, + "grad_norm": 1.9140625, + "learning_rate": 7.7935724099163e-05, + "loss": 1.7804, + "step": 7129 + }, + { + "epoch": 0.31163949473316144, + "grad_norm": 1.9765625, + "learning_rate": 7.793002723427802e-05, + "loss": 1.7247, + "step": 7130 + }, + { + "epoch": 0.3116832029371913, + "grad_norm": 1.9140625, + "learning_rate": 7.792432984231277e-05, + "loss": 1.3776, + "step": 7131 + }, + { + "epoch": 0.3117269111412212, + "grad_norm": 2.328125, + "learning_rate": 7.791863192337479e-05, + "loss": 2.9035, + "step": 7132 + }, + { + "epoch": 0.3117706193452511, + "grad_norm": 2.234375, + "learning_rate": 7.791293347757159e-05, + "loss": 2.0015, + "step": 7133 + }, + { + "epoch": 0.311814327549281, + "grad_norm": 2.25, + "learning_rate": 7.79072345050107e-05, + "loss": 1.801, + "step": 7134 + }, + { + "epoch": 0.3118580357533109, + "grad_norm": 1.9921875, + "learning_rate": 7.790153500579968e-05, + "loss": 1.9849, + "step": 7135 + }, + { + "epoch": 0.31190174395734077, + "grad_norm": 2.5625, + "learning_rate": 7.78958349800461e-05, + "loss": 1.977, + "step": 7136 + }, + { + "epoch": 0.3119454521613707, + "grad_norm": 1.96875, + "learning_rate": 7.789013442785749e-05, + "loss": 1.6839, + "step": 7137 + }, + { + "epoch": 0.3119891603654006, + "grad_norm": 2.3125, + "learning_rate": 7.788443334934148e-05, + "loss": 1.6571, + "step": 7138 + }, + { + "epoch": 0.31203286856943047, + "grad_norm": 2.140625, + "learning_rate": 7.78787317446056e-05, + "loss": 2.0375, + "step": 7139 + }, + { + "epoch": 0.3120765767734604, + "grad_norm": 1.90625, + "learning_rate": 7.787302961375748e-05, + "loss": 1.9974, + "step": 7140 + }, + { + "epoch": 0.3121202849774903, + "grad_norm": 2.140625, + "learning_rate": 7.786732695690475e-05, + "loss": 2.2131, + "step": 7141 + }, + { + "epoch": 0.31216399318152016, + "grad_norm": 2.484375, + "learning_rate": 7.786162377415497e-05, + "loss": 2.4709, + "step": 7142 + }, + { + "epoch": 0.31220770138555004, + "grad_norm": 1.953125, + "learning_rate": 7.785592006561582e-05, + "loss": 1.3958, + "step": 7143 + }, + { + "epoch": 0.31225140958958, + "grad_norm": 1.9765625, + "learning_rate": 7.78502158313949e-05, + "loss": 1.7984, + "step": 7144 + }, + { + "epoch": 0.31229511779360986, + "grad_norm": 2.421875, + "learning_rate": 7.784451107159988e-05, + "loss": 1.6409, + "step": 7145 + }, + { + "epoch": 0.31233882599763974, + "grad_norm": 2.078125, + "learning_rate": 7.783880578633843e-05, + "loss": 2.3524, + "step": 7146 + }, + { + "epoch": 0.31238253420166967, + "grad_norm": 2.71875, + "learning_rate": 7.783309997571819e-05, + "loss": 1.5622, + "step": 7147 + }, + { + "epoch": 0.31242624240569955, + "grad_norm": 1.9921875, + "learning_rate": 7.782739363984683e-05, + "loss": 1.5084, + "step": 7148 + }, + { + "epoch": 0.31246995060972943, + "grad_norm": 2.046875, + "learning_rate": 7.782168677883206e-05, + "loss": 1.8476, + "step": 7149 + }, + { + "epoch": 0.31251365881375937, + "grad_norm": 2.296875, + "learning_rate": 7.781597939278156e-05, + "loss": 2.3668, + "step": 7150 + }, + { + "epoch": 0.31255736701778924, + "grad_norm": 2.359375, + "learning_rate": 7.781027148180304e-05, + "loss": 2.4507, + "step": 7151 + }, + { + "epoch": 0.3126010752218191, + "grad_norm": 2.015625, + "learning_rate": 7.780456304600423e-05, + "loss": 2.6835, + "step": 7152 + }, + { + "epoch": 0.312644783425849, + "grad_norm": 1.953125, + "learning_rate": 7.779885408549286e-05, + "loss": 1.8557, + "step": 7153 + }, + { + "epoch": 0.31268849162987894, + "grad_norm": 1.7890625, + "learning_rate": 7.779314460037663e-05, + "loss": 1.6768, + "step": 7154 + }, + { + "epoch": 0.3127321998339088, + "grad_norm": 2.390625, + "learning_rate": 7.778743459076333e-05, + "loss": 2.9159, + "step": 7155 + }, + { + "epoch": 0.3127759080379387, + "grad_norm": 3.375, + "learning_rate": 7.778172405676068e-05, + "loss": 1.871, + "step": 7156 + }, + { + "epoch": 0.31281961624196863, + "grad_norm": 2.15625, + "learning_rate": 7.777601299847648e-05, + "loss": 1.6686, + "step": 7157 + }, + { + "epoch": 0.3128633244459985, + "grad_norm": 2.28125, + "learning_rate": 7.777030141601848e-05, + "loss": 2.1794, + "step": 7158 + }, + { + "epoch": 0.3129070326500284, + "grad_norm": 1.96875, + "learning_rate": 7.776458930949446e-05, + "loss": 1.9874, + "step": 7159 + }, + { + "epoch": 0.31295074085405833, + "grad_norm": 1.984375, + "learning_rate": 7.775887667901225e-05, + "loss": 2.0633, + "step": 7160 + }, + { + "epoch": 0.3129944490580882, + "grad_norm": 2.078125, + "learning_rate": 7.775316352467962e-05, + "loss": 1.8188, + "step": 7161 + }, + { + "epoch": 0.3130381572621181, + "grad_norm": 2.09375, + "learning_rate": 7.774744984660442e-05, + "loss": 1.6672, + "step": 7162 + }, + { + "epoch": 0.31308186546614797, + "grad_norm": 2.171875, + "learning_rate": 7.774173564489445e-05, + "loss": 2.5734, + "step": 7163 + }, + { + "epoch": 0.3131255736701779, + "grad_norm": 2.21875, + "learning_rate": 7.773602091965754e-05, + "loss": 2.3279, + "step": 7164 + }, + { + "epoch": 0.3131692818742078, + "grad_norm": 2.1875, + "learning_rate": 7.773030567100157e-05, + "loss": 1.6927, + "step": 7165 + }, + { + "epoch": 0.31321299007823766, + "grad_norm": 2.859375, + "learning_rate": 7.772458989903437e-05, + "loss": 2.3808, + "step": 7166 + }, + { + "epoch": 0.3132566982822676, + "grad_norm": 2.75, + "learning_rate": 7.771887360386379e-05, + "loss": 2.2934, + "step": 7167 + }, + { + "epoch": 0.3133004064862975, + "grad_norm": 2.515625, + "learning_rate": 7.771315678559774e-05, + "loss": 2.1141, + "step": 7168 + }, + { + "epoch": 0.31334411469032736, + "grad_norm": 3.046875, + "learning_rate": 7.770743944434407e-05, + "loss": 1.8697, + "step": 7169 + }, + { + "epoch": 0.3133878228943573, + "grad_norm": 2.0, + "learning_rate": 7.77017215802107e-05, + "loss": 2.1161, + "step": 7170 + }, + { + "epoch": 0.3134315310983872, + "grad_norm": 2.109375, + "learning_rate": 7.769600319330552e-05, + "loss": 2.0299, + "step": 7171 + }, + { + "epoch": 0.31347523930241705, + "grad_norm": 2.5, + "learning_rate": 7.769028428373645e-05, + "loss": 2.1765, + "step": 7172 + }, + { + "epoch": 0.313518947506447, + "grad_norm": 1.9140625, + "learning_rate": 7.768456485161142e-05, + "loss": 1.7941, + "step": 7173 + }, + { + "epoch": 0.31356265571047687, + "grad_norm": 2.28125, + "learning_rate": 7.767884489703836e-05, + "loss": 2.2785, + "step": 7174 + }, + { + "epoch": 0.31360636391450675, + "grad_norm": 1.875, + "learning_rate": 7.76731244201252e-05, + "loss": 1.3323, + "step": 7175 + }, + { + "epoch": 0.3136500721185366, + "grad_norm": 2.0625, + "learning_rate": 7.766740342097992e-05, + "loss": 1.9032, + "step": 7176 + }, + { + "epoch": 0.31369378032256656, + "grad_norm": 1.734375, + "learning_rate": 7.766168189971046e-05, + "loss": 1.7193, + "step": 7177 + }, + { + "epoch": 0.31373748852659644, + "grad_norm": 2.203125, + "learning_rate": 7.765595985642483e-05, + "loss": 2.1914, + "step": 7178 + }, + { + "epoch": 0.3137811967306263, + "grad_norm": 2.09375, + "learning_rate": 7.765023729123095e-05, + "loss": 1.8275, + "step": 7179 + }, + { + "epoch": 0.31382490493465626, + "grad_norm": 2.34375, + "learning_rate": 7.764451420423687e-05, + "loss": 2.1466, + "step": 7180 + }, + { + "epoch": 0.31386861313868614, + "grad_norm": 2.25, + "learning_rate": 7.763879059555055e-05, + "loss": 2.0121, + "step": 7181 + }, + { + "epoch": 0.313912321342716, + "grad_norm": 2.21875, + "learning_rate": 7.763306646528004e-05, + "loss": 2.1565, + "step": 7182 + }, + { + "epoch": 0.31395602954674595, + "grad_norm": 2.4375, + "learning_rate": 7.762734181353335e-05, + "loss": 2.0406, + "step": 7183 + }, + { + "epoch": 0.31399973775077583, + "grad_norm": 1.921875, + "learning_rate": 7.762161664041852e-05, + "loss": 1.7566, + "step": 7184 + }, + { + "epoch": 0.3140434459548057, + "grad_norm": 2.546875, + "learning_rate": 7.761589094604357e-05, + "loss": 1.4868, + "step": 7185 + }, + { + "epoch": 0.3140871541588356, + "grad_norm": 2.265625, + "learning_rate": 7.761016473051655e-05, + "loss": 1.4814, + "step": 7186 + }, + { + "epoch": 0.3141308623628655, + "grad_norm": 2.390625, + "learning_rate": 7.760443799394557e-05, + "loss": 1.6667, + "step": 7187 + }, + { + "epoch": 0.3141745705668954, + "grad_norm": 2.578125, + "learning_rate": 7.759871073643865e-05, + "loss": 2.9772, + "step": 7188 + }, + { + "epoch": 0.3142182787709253, + "grad_norm": 2.25, + "learning_rate": 7.75929829581039e-05, + "loss": 1.7509, + "step": 7189 + }, + { + "epoch": 0.3142619869749552, + "grad_norm": 2.21875, + "learning_rate": 7.758725465904938e-05, + "loss": 1.8446, + "step": 7190 + }, + { + "epoch": 0.3143056951789851, + "grad_norm": 2.078125, + "learning_rate": 7.758152583938323e-05, + "loss": 2.0265, + "step": 7191 + }, + { + "epoch": 0.314349403383015, + "grad_norm": 1.96875, + "learning_rate": 7.757579649921354e-05, + "loss": 1.84, + "step": 7192 + }, + { + "epoch": 0.3143931115870449, + "grad_norm": 1.7890625, + "learning_rate": 7.757006663864843e-05, + "loss": 1.6102, + "step": 7193 + }, + { + "epoch": 0.3144368197910748, + "grad_norm": 2.640625, + "learning_rate": 7.756433625779604e-05, + "loss": 2.7256, + "step": 7194 + }, + { + "epoch": 0.3144805279951047, + "grad_norm": 2.25, + "learning_rate": 7.755860535676452e-05, + "loss": 2.0873, + "step": 7195 + }, + { + "epoch": 0.31452423619913455, + "grad_norm": 2.453125, + "learning_rate": 7.755287393566199e-05, + "loss": 2.017, + "step": 7196 + }, + { + "epoch": 0.3145679444031645, + "grad_norm": 2.125, + "learning_rate": 7.754714199459663e-05, + "loss": 1.8984, + "step": 7197 + }, + { + "epoch": 0.31461165260719437, + "grad_norm": 2.390625, + "learning_rate": 7.75414095336766e-05, + "loss": 1.4577, + "step": 7198 + }, + { + "epoch": 0.31465536081122425, + "grad_norm": 2.296875, + "learning_rate": 7.753567655301012e-05, + "loss": 1.9013, + "step": 7199 + }, + { + "epoch": 0.3146990690152542, + "grad_norm": 2.15625, + "learning_rate": 7.752994305270534e-05, + "loss": 1.9996, + "step": 7200 + }, + { + "epoch": 0.31474277721928406, + "grad_norm": 2.75, + "learning_rate": 7.752420903287044e-05, + "loss": 2.3113, + "step": 7201 + }, + { + "epoch": 0.31478648542331394, + "grad_norm": 2.40625, + "learning_rate": 7.751847449361367e-05, + "loss": 1.899, + "step": 7202 + }, + { + "epoch": 0.3148301936273439, + "grad_norm": 2.546875, + "learning_rate": 7.751273943504322e-05, + "loss": 3.2488, + "step": 7203 + }, + { + "epoch": 0.31487390183137376, + "grad_norm": 1.859375, + "learning_rate": 7.750700385726736e-05, + "loss": 1.6907, + "step": 7204 + }, + { + "epoch": 0.31491761003540364, + "grad_norm": 1.875, + "learning_rate": 7.75012677603943e-05, + "loss": 1.556, + "step": 7205 + }, + { + "epoch": 0.3149613182394335, + "grad_norm": 2.765625, + "learning_rate": 7.749553114453228e-05, + "loss": 2.0231, + "step": 7206 + }, + { + "epoch": 0.31500502644346345, + "grad_norm": 2.421875, + "learning_rate": 7.748979400978956e-05, + "loss": 2.0842, + "step": 7207 + }, + { + "epoch": 0.31504873464749333, + "grad_norm": 2.453125, + "learning_rate": 7.748405635627444e-05, + "loss": 1.6753, + "step": 7208 + }, + { + "epoch": 0.3150924428515232, + "grad_norm": 2.203125, + "learning_rate": 7.747831818409517e-05, + "loss": 1.8329, + "step": 7209 + }, + { + "epoch": 0.31513615105555315, + "grad_norm": 2.171875, + "learning_rate": 7.747257949336003e-05, + "loss": 1.687, + "step": 7210 + }, + { + "epoch": 0.31517985925958303, + "grad_norm": 2.609375, + "learning_rate": 7.746684028417733e-05, + "loss": 1.944, + "step": 7211 + }, + { + "epoch": 0.3152235674636129, + "grad_norm": 2.265625, + "learning_rate": 7.746110055665539e-05, + "loss": 1.9002, + "step": 7212 + }, + { + "epoch": 0.31526727566764284, + "grad_norm": 2.3125, + "learning_rate": 7.745536031090252e-05, + "loss": 1.7212, + "step": 7213 + }, + { + "epoch": 0.3153109838716727, + "grad_norm": 2.265625, + "learning_rate": 7.744961954702703e-05, + "loss": 1.6796, + "step": 7214 + }, + { + "epoch": 0.3153546920757026, + "grad_norm": 2.4375, + "learning_rate": 7.744387826513726e-05, + "loss": 2.0101, + "step": 7215 + }, + { + "epoch": 0.3153984002797325, + "grad_norm": 2.0625, + "learning_rate": 7.743813646534158e-05, + "loss": 1.7057, + "step": 7216 + }, + { + "epoch": 0.3154421084837624, + "grad_norm": 1.9765625, + "learning_rate": 7.743239414774832e-05, + "loss": 1.9942, + "step": 7217 + }, + { + "epoch": 0.3154858166877923, + "grad_norm": 1.90625, + "learning_rate": 7.742665131246587e-05, + "loss": 1.5873, + "step": 7218 + }, + { + "epoch": 0.3155295248918222, + "grad_norm": 2.234375, + "learning_rate": 7.742090795960259e-05, + "loss": 2.4747, + "step": 7219 + }, + { + "epoch": 0.3155732330958521, + "grad_norm": 1.9140625, + "learning_rate": 7.741516408926686e-05, + "loss": 1.9626, + "step": 7220 + }, + { + "epoch": 0.315616941299882, + "grad_norm": 2.203125, + "learning_rate": 7.740941970156707e-05, + "loss": 1.7351, + "step": 7221 + }, + { + "epoch": 0.31566064950391187, + "grad_norm": 1.9609375, + "learning_rate": 7.740367479661166e-05, + "loss": 1.2704, + "step": 7222 + }, + { + "epoch": 0.3157043577079418, + "grad_norm": 2.234375, + "learning_rate": 7.739792937450901e-05, + "loss": 1.7378, + "step": 7223 + }, + { + "epoch": 0.3157480659119717, + "grad_norm": 2.421875, + "learning_rate": 7.739218343536757e-05, + "loss": 2.3897, + "step": 7224 + }, + { + "epoch": 0.31579177411600157, + "grad_norm": 2.28125, + "learning_rate": 7.738643697929575e-05, + "loss": 2.4272, + "step": 7225 + }, + { + "epoch": 0.31583548232003145, + "grad_norm": 2.828125, + "learning_rate": 7.7380690006402e-05, + "loss": 2.2493, + "step": 7226 + }, + { + "epoch": 0.3158791905240614, + "grad_norm": 2.40625, + "learning_rate": 7.737494251679479e-05, + "loss": 2.0007, + "step": 7227 + }, + { + "epoch": 0.31592289872809126, + "grad_norm": 2.0625, + "learning_rate": 7.736919451058258e-05, + "loss": 1.9244, + "step": 7228 + }, + { + "epoch": 0.31596660693212114, + "grad_norm": 2.953125, + "learning_rate": 7.736344598787381e-05, + "loss": 1.9202, + "step": 7229 + }, + { + "epoch": 0.3160103151361511, + "grad_norm": 2.15625, + "learning_rate": 7.735769694877701e-05, + "loss": 1.7687, + "step": 7230 + }, + { + "epoch": 0.31605402334018096, + "grad_norm": 1.9765625, + "learning_rate": 7.735194739340064e-05, + "loss": 1.5402, + "step": 7231 + }, + { + "epoch": 0.31609773154421084, + "grad_norm": 2.0625, + "learning_rate": 7.734619732185322e-05, + "loss": 1.9211, + "step": 7232 + }, + { + "epoch": 0.31614143974824077, + "grad_norm": 2.328125, + "learning_rate": 7.734044673424325e-05, + "loss": 2.1806, + "step": 7233 + }, + { + "epoch": 0.31618514795227065, + "grad_norm": 2.046875, + "learning_rate": 7.733469563067928e-05, + "loss": 1.5808, + "step": 7234 + }, + { + "epoch": 0.31622885615630053, + "grad_norm": 2.5, + "learning_rate": 7.73289440112698e-05, + "loss": 2.3715, + "step": 7235 + }, + { + "epoch": 0.3162725643603304, + "grad_norm": 2.328125, + "learning_rate": 7.732319187612335e-05, + "loss": 2.3467, + "step": 7236 + }, + { + "epoch": 0.31631627256436035, + "grad_norm": 2.1875, + "learning_rate": 7.731743922534853e-05, + "loss": 1.9838, + "step": 7237 + }, + { + "epoch": 0.3163599807683902, + "grad_norm": 2.234375, + "learning_rate": 7.731168605905388e-05, + "loss": 2.6055, + "step": 7238 + }, + { + "epoch": 0.3164036889724201, + "grad_norm": 2.890625, + "learning_rate": 7.730593237734796e-05, + "loss": 3.3343, + "step": 7239 + }, + { + "epoch": 0.31644739717645004, + "grad_norm": 1.9921875, + "learning_rate": 7.730017818033935e-05, + "loss": 2.0111, + "step": 7240 + }, + { + "epoch": 0.3164911053804799, + "grad_norm": 2.390625, + "learning_rate": 7.729442346813662e-05, + "loss": 2.0082, + "step": 7241 + }, + { + "epoch": 0.3165348135845098, + "grad_norm": 1.9921875, + "learning_rate": 7.728866824084842e-05, + "loss": 2.1414, + "step": 7242 + }, + { + "epoch": 0.31657852178853974, + "grad_norm": 2.15625, + "learning_rate": 7.728291249858332e-05, + "loss": 1.7971, + "step": 7243 + }, + { + "epoch": 0.3166222299925696, + "grad_norm": 2.03125, + "learning_rate": 7.727715624144998e-05, + "loss": 2.4157, + "step": 7244 + }, + { + "epoch": 0.3166659381965995, + "grad_norm": 2.046875, + "learning_rate": 7.727139946955697e-05, + "loss": 1.6075, + "step": 7245 + }, + { + "epoch": 0.3167096464006294, + "grad_norm": 2.078125, + "learning_rate": 7.726564218301297e-05, + "loss": 2.1659, + "step": 7246 + }, + { + "epoch": 0.3167533546046593, + "grad_norm": 2.703125, + "learning_rate": 7.725988438192662e-05, + "loss": 1.6702, + "step": 7247 + }, + { + "epoch": 0.3167970628086892, + "grad_norm": 2.28125, + "learning_rate": 7.725412606640658e-05, + "loss": 1.9162, + "step": 7248 + }, + { + "epoch": 0.31684077101271907, + "grad_norm": 1.984375, + "learning_rate": 7.724836723656153e-05, + "loss": 1.8255, + "step": 7249 + }, + { + "epoch": 0.316884479216749, + "grad_norm": 2.15625, + "learning_rate": 7.724260789250011e-05, + "loss": 1.9185, + "step": 7250 + }, + { + "epoch": 0.3169281874207789, + "grad_norm": 2.03125, + "learning_rate": 7.723684803433102e-05, + "loss": 1.7484, + "step": 7251 + }, + { + "epoch": 0.31697189562480876, + "grad_norm": 1.9921875, + "learning_rate": 7.723108766216298e-05, + "loss": 1.7841, + "step": 7252 + }, + { + "epoch": 0.3170156038288387, + "grad_norm": 2.328125, + "learning_rate": 7.72253267761047e-05, + "loss": 3.1336, + "step": 7253 + }, + { + "epoch": 0.3170593120328686, + "grad_norm": 2.40625, + "learning_rate": 7.721956537626487e-05, + "loss": 1.9293, + "step": 7254 + }, + { + "epoch": 0.31710302023689846, + "grad_norm": 2.09375, + "learning_rate": 7.721380346275222e-05, + "loss": 1.7513, + "step": 7255 + }, + { + "epoch": 0.31714672844092834, + "grad_norm": 2.765625, + "learning_rate": 7.720804103567546e-05, + "loss": 2.3873, + "step": 7256 + }, + { + "epoch": 0.3171904366449583, + "grad_norm": 2.1875, + "learning_rate": 7.720227809514343e-05, + "loss": 1.8492, + "step": 7257 + }, + { + "epoch": 0.31723414484898815, + "grad_norm": 3.34375, + "learning_rate": 7.719651464126475e-05, + "loss": 2.1024, + "step": 7258 + }, + { + "epoch": 0.31727785305301803, + "grad_norm": 1.859375, + "learning_rate": 7.719075067414831e-05, + "loss": 1.4269, + "step": 7259 + }, + { + "epoch": 0.31732156125704797, + "grad_norm": 2.015625, + "learning_rate": 7.718498619390283e-05, + "loss": 2.1015, + "step": 7260 + }, + { + "epoch": 0.31736526946107785, + "grad_norm": 2.0625, + "learning_rate": 7.717922120063706e-05, + "loss": 1.6376, + "step": 7261 + }, + { + "epoch": 0.3174089776651077, + "grad_norm": 2.484375, + "learning_rate": 7.717345569445986e-05, + "loss": 1.0115, + "step": 7262 + }, + { + "epoch": 0.31745268586913766, + "grad_norm": 2.203125, + "learning_rate": 7.716768967547998e-05, + "loss": 2.3164, + "step": 7263 + }, + { + "epoch": 0.31749639407316754, + "grad_norm": 2.21875, + "learning_rate": 7.716192314380626e-05, + "loss": 1.6065, + "step": 7264 + }, + { + "epoch": 0.3175401022771974, + "grad_norm": 2.140625, + "learning_rate": 7.715615609954752e-05, + "loss": 2.0752, + "step": 7265 + }, + { + "epoch": 0.3175838104812273, + "grad_norm": 2.671875, + "learning_rate": 7.71503885428126e-05, + "loss": 1.8567, + "step": 7266 + }, + { + "epoch": 0.31762751868525724, + "grad_norm": 2.390625, + "learning_rate": 7.714462047371031e-05, + "loss": 2.532, + "step": 7267 + }, + { + "epoch": 0.3176712268892871, + "grad_norm": 2.125, + "learning_rate": 7.713885189234956e-05, + "loss": 1.9574, + "step": 7268 + }, + { + "epoch": 0.317714935093317, + "grad_norm": 2.453125, + "learning_rate": 7.713308279883915e-05, + "loss": 2.0208, + "step": 7269 + }, + { + "epoch": 0.31775864329734693, + "grad_norm": 2.53125, + "learning_rate": 7.712731319328798e-05, + "loss": 1.8329, + "step": 7270 + }, + { + "epoch": 0.3178023515013768, + "grad_norm": 2.40625, + "learning_rate": 7.712154307580493e-05, + "loss": 2.0871, + "step": 7271 + }, + { + "epoch": 0.3178460597054067, + "grad_norm": 2.09375, + "learning_rate": 7.711577244649888e-05, + "loss": 2.0626, + "step": 7272 + }, + { + "epoch": 0.3178897679094366, + "grad_norm": 2.546875, + "learning_rate": 7.711000130547875e-05, + "loss": 2.0854, + "step": 7273 + }, + { + "epoch": 0.3179334761134665, + "grad_norm": 1.953125, + "learning_rate": 7.710422965285344e-05, + "loss": 1.7566, + "step": 7274 + }, + { + "epoch": 0.3179771843174964, + "grad_norm": 1.8203125, + "learning_rate": 7.709845748873187e-05, + "loss": 1.6372, + "step": 7275 + }, + { + "epoch": 0.31802089252152627, + "grad_norm": 2.859375, + "learning_rate": 7.709268481322296e-05, + "loss": 1.9832, + "step": 7276 + }, + { + "epoch": 0.3180646007255562, + "grad_norm": 2.515625, + "learning_rate": 7.708691162643565e-05, + "loss": 1.6387, + "step": 7277 + }, + { + "epoch": 0.3181083089295861, + "grad_norm": 2.1875, + "learning_rate": 7.70811379284789e-05, + "loss": 2.0553, + "step": 7278 + }, + { + "epoch": 0.31815201713361596, + "grad_norm": 1.90625, + "learning_rate": 7.707536371946167e-05, + "loss": 1.6551, + "step": 7279 + }, + { + "epoch": 0.3181957253376459, + "grad_norm": 2.046875, + "learning_rate": 7.706958899949293e-05, + "loss": 1.7686, + "step": 7280 + }, + { + "epoch": 0.3182394335416758, + "grad_norm": 2.1875, + "learning_rate": 7.706381376868162e-05, + "loss": 1.6572, + "step": 7281 + }, + { + "epoch": 0.31828314174570566, + "grad_norm": 3.921875, + "learning_rate": 7.705803802713677e-05, + "loss": 2.0216, + "step": 7282 + }, + { + "epoch": 0.3183268499497356, + "grad_norm": 3.296875, + "learning_rate": 7.705226177496736e-05, + "loss": 2.2726, + "step": 7283 + }, + { + "epoch": 0.31837055815376547, + "grad_norm": 2.875, + "learning_rate": 7.70464850122824e-05, + "loss": 2.1551, + "step": 7284 + }, + { + "epoch": 0.31841426635779535, + "grad_norm": 1.8515625, + "learning_rate": 7.70407077391909e-05, + "loss": 1.7063, + "step": 7285 + }, + { + "epoch": 0.31845797456182523, + "grad_norm": 1.7890625, + "learning_rate": 7.703492995580188e-05, + "loss": 1.7317, + "step": 7286 + }, + { + "epoch": 0.31850168276585517, + "grad_norm": 2.125, + "learning_rate": 7.70291516622244e-05, + "loss": 1.7335, + "step": 7287 + }, + { + "epoch": 0.31854539096988504, + "grad_norm": 2.09375, + "learning_rate": 7.702337285856748e-05, + "loss": 1.8964, + "step": 7288 + }, + { + "epoch": 0.3185890991739149, + "grad_norm": 2.703125, + "learning_rate": 7.701759354494018e-05, + "loss": 1.7829, + "step": 7289 + }, + { + "epoch": 0.31863280737794486, + "grad_norm": 2.0625, + "learning_rate": 7.701181372145159e-05, + "loss": 1.8129, + "step": 7290 + }, + { + "epoch": 0.31867651558197474, + "grad_norm": 3.6875, + "learning_rate": 7.700603338821074e-05, + "loss": 2.084, + "step": 7291 + }, + { + "epoch": 0.3187202237860046, + "grad_norm": 2.203125, + "learning_rate": 7.700025254532673e-05, + "loss": 2.0998, + "step": 7292 + }, + { + "epoch": 0.31876393199003455, + "grad_norm": 2.140625, + "learning_rate": 7.699447119290867e-05, + "loss": 2.103, + "step": 7293 + }, + { + "epoch": 0.31880764019406443, + "grad_norm": 2.71875, + "learning_rate": 7.698868933106565e-05, + "loss": 2.3017, + "step": 7294 + }, + { + "epoch": 0.3188513483980943, + "grad_norm": 2.296875, + "learning_rate": 7.698290695990677e-05, + "loss": 2.3595, + "step": 7295 + }, + { + "epoch": 0.3188950566021242, + "grad_norm": 2.453125, + "learning_rate": 7.697712407954119e-05, + "loss": 2.0153, + "step": 7296 + }, + { + "epoch": 0.31893876480615413, + "grad_norm": 2.71875, + "learning_rate": 7.697134069007799e-05, + "loss": 2.3144, + "step": 7297 + }, + { + "epoch": 0.318982473010184, + "grad_norm": 2.5, + "learning_rate": 7.696555679162635e-05, + "loss": 2.4251, + "step": 7298 + }, + { + "epoch": 0.3190261812142139, + "grad_norm": 1.984375, + "learning_rate": 7.695977238429539e-05, + "loss": 2.1105, + "step": 7299 + }, + { + "epoch": 0.3190698894182438, + "grad_norm": 2.0625, + "learning_rate": 7.695398746819431e-05, + "loss": 2.094, + "step": 7300 + }, + { + "epoch": 0.3191135976222737, + "grad_norm": 2.078125, + "learning_rate": 7.694820204343223e-05, + "loss": 1.9124, + "step": 7301 + }, + { + "epoch": 0.3191573058263036, + "grad_norm": 2.046875, + "learning_rate": 7.694241611011838e-05, + "loss": 1.5197, + "step": 7302 + }, + { + "epoch": 0.3192010140303335, + "grad_norm": 2.734375, + "learning_rate": 7.693662966836191e-05, + "loss": 1.8983, + "step": 7303 + }, + { + "epoch": 0.3192447222343634, + "grad_norm": 2.421875, + "learning_rate": 7.693084271827205e-05, + "loss": 1.5317, + "step": 7304 + }, + { + "epoch": 0.3192884304383933, + "grad_norm": 2.296875, + "learning_rate": 7.692505525995799e-05, + "loss": 2.3226, + "step": 7305 + }, + { + "epoch": 0.31933213864242316, + "grad_norm": 2.046875, + "learning_rate": 7.691926729352894e-05, + "loss": 1.7196, + "step": 7306 + }, + { + "epoch": 0.3193758468464531, + "grad_norm": 2.703125, + "learning_rate": 7.691347881909412e-05, + "loss": 2.3532, + "step": 7307 + }, + { + "epoch": 0.319419555050483, + "grad_norm": 1.9921875, + "learning_rate": 7.690768983676281e-05, + "loss": 1.8094, + "step": 7308 + }, + { + "epoch": 0.31946326325451285, + "grad_norm": 1.90625, + "learning_rate": 7.690190034664423e-05, + "loss": 2.125, + "step": 7309 + }, + { + "epoch": 0.3195069714585428, + "grad_norm": 10.0, + "learning_rate": 7.689611034884763e-05, + "loss": 2.7774, + "step": 7310 + }, + { + "epoch": 0.31955067966257267, + "grad_norm": 2.09375, + "learning_rate": 7.689031984348227e-05, + "loss": 1.3217, + "step": 7311 + }, + { + "epoch": 0.31959438786660255, + "grad_norm": 2.546875, + "learning_rate": 7.688452883065745e-05, + "loss": 1.6019, + "step": 7312 + }, + { + "epoch": 0.3196380960706325, + "grad_norm": 2.34375, + "learning_rate": 7.687873731048245e-05, + "loss": 1.9432, + "step": 7313 + }, + { + "epoch": 0.31968180427466236, + "grad_norm": 2.390625, + "learning_rate": 7.687294528306655e-05, + "loss": 2.1253, + "step": 7314 + }, + { + "epoch": 0.31972551247869224, + "grad_norm": 2.109375, + "learning_rate": 7.686715274851906e-05, + "loss": 1.5275, + "step": 7315 + }, + { + "epoch": 0.3197692206827221, + "grad_norm": 2.09375, + "learning_rate": 7.68613597069493e-05, + "loss": 2.0593, + "step": 7316 + }, + { + "epoch": 0.31981292888675206, + "grad_norm": 1.9921875, + "learning_rate": 7.685556615846657e-05, + "loss": 1.4496, + "step": 7317 + }, + { + "epoch": 0.31985663709078194, + "grad_norm": 3.28125, + "learning_rate": 7.684977210318024e-05, + "loss": 2.216, + "step": 7318 + }, + { + "epoch": 0.3199003452948118, + "grad_norm": 1.8359375, + "learning_rate": 7.684397754119964e-05, + "loss": 1.4906, + "step": 7319 + }, + { + "epoch": 0.31994405349884175, + "grad_norm": 2.296875, + "learning_rate": 7.683818247263407e-05, + "loss": 1.724, + "step": 7320 + }, + { + "epoch": 0.31998776170287163, + "grad_norm": 2.734375, + "learning_rate": 7.683238689759298e-05, + "loss": 1.9679, + "step": 7321 + }, + { + "epoch": 0.3200314699069015, + "grad_norm": 2.140625, + "learning_rate": 7.682659081618567e-05, + "loss": 1.907, + "step": 7322 + }, + { + "epoch": 0.32007517811093145, + "grad_norm": 2.1875, + "learning_rate": 7.682079422852156e-05, + "loss": 2.0567, + "step": 7323 + }, + { + "epoch": 0.3201188863149613, + "grad_norm": 2.546875, + "learning_rate": 7.681499713471002e-05, + "loss": 1.9915, + "step": 7324 + }, + { + "epoch": 0.3201625945189912, + "grad_norm": 4.25, + "learning_rate": 7.680919953486048e-05, + "loss": 2.7251, + "step": 7325 + }, + { + "epoch": 0.3202063027230211, + "grad_norm": 5.28125, + "learning_rate": 7.680340142908231e-05, + "loss": 2.0837, + "step": 7326 + }, + { + "epoch": 0.320250010927051, + "grad_norm": 1.9921875, + "learning_rate": 7.679760281748491e-05, + "loss": 1.6771, + "step": 7327 + }, + { + "epoch": 0.3202937191310809, + "grad_norm": 2.328125, + "learning_rate": 7.67918037001778e-05, + "loss": 2.085, + "step": 7328 + }, + { + "epoch": 0.3203374273351108, + "grad_norm": 2.796875, + "learning_rate": 7.678600407727032e-05, + "loss": 1.6378, + "step": 7329 + }, + { + "epoch": 0.3203811355391407, + "grad_norm": 2.25, + "learning_rate": 7.678020394887197e-05, + "loss": 2.031, + "step": 7330 + }, + { + "epoch": 0.3204248437431706, + "grad_norm": 2.1875, + "learning_rate": 7.67744033150922e-05, + "loss": 1.9841, + "step": 7331 + }, + { + "epoch": 0.3204685519472005, + "grad_norm": 2.390625, + "learning_rate": 7.676860217604047e-05, + "loss": 2.5098, + "step": 7332 + }, + { + "epoch": 0.3205122601512304, + "grad_norm": 1.9921875, + "learning_rate": 7.676280053182626e-05, + "loss": 1.766, + "step": 7333 + }, + { + "epoch": 0.3205559683552603, + "grad_norm": 2.140625, + "learning_rate": 7.675699838255905e-05, + "loss": 2.0977, + "step": 7334 + }, + { + "epoch": 0.32059967655929017, + "grad_norm": 2.59375, + "learning_rate": 7.675119572834835e-05, + "loss": 2.1568, + "step": 7335 + }, + { + "epoch": 0.32064338476332005, + "grad_norm": 2.421875, + "learning_rate": 7.674539256930363e-05, + "loss": 1.7092, + "step": 7336 + }, + { + "epoch": 0.32068709296735, + "grad_norm": 1.8359375, + "learning_rate": 7.673958890553443e-05, + "loss": 1.755, + "step": 7337 + }, + { + "epoch": 0.32073080117137986, + "grad_norm": 1.7734375, + "learning_rate": 7.673378473715027e-05, + "loss": 1.5277, + "step": 7338 + }, + { + "epoch": 0.32077450937540974, + "grad_norm": 8.5, + "learning_rate": 7.672798006426069e-05, + "loss": 2.4689, + "step": 7339 + }, + { + "epoch": 0.3208182175794397, + "grad_norm": 2.171875, + "learning_rate": 7.672217488697522e-05, + "loss": 1.7042, + "step": 7340 + }, + { + "epoch": 0.32086192578346956, + "grad_norm": 2.125, + "learning_rate": 7.671636920540342e-05, + "loss": 2.0158, + "step": 7341 + }, + { + "epoch": 0.32090563398749944, + "grad_norm": 1.9140625, + "learning_rate": 7.671056301965484e-05, + "loss": 1.4584, + "step": 7342 + }, + { + "epoch": 0.3209493421915294, + "grad_norm": 11.25, + "learning_rate": 7.670475632983909e-05, + "loss": 1.9689, + "step": 7343 + }, + { + "epoch": 0.32099305039555925, + "grad_norm": 2.1875, + "learning_rate": 7.669894913606568e-05, + "loss": 1.9686, + "step": 7344 + }, + { + "epoch": 0.32103675859958913, + "grad_norm": 1.984375, + "learning_rate": 7.669314143844428e-05, + "loss": 1.7838, + "step": 7345 + }, + { + "epoch": 0.321080466803619, + "grad_norm": 3.078125, + "learning_rate": 7.668733323708443e-05, + "loss": 1.416, + "step": 7346 + }, + { + "epoch": 0.32112417500764895, + "grad_norm": 2.34375, + "learning_rate": 7.668152453209576e-05, + "loss": 2.0266, + "step": 7347 + }, + { + "epoch": 0.32116788321167883, + "grad_norm": 1.9765625, + "learning_rate": 7.66757153235879e-05, + "loss": 1.6594, + "step": 7348 + }, + { + "epoch": 0.3212115914157087, + "grad_norm": 1.9296875, + "learning_rate": 7.666990561167046e-05, + "loss": 1.8492, + "step": 7349 + }, + { + "epoch": 0.32125529961973864, + "grad_norm": 2.09375, + "learning_rate": 7.666409539645308e-05, + "loss": 1.8732, + "step": 7350 + }, + { + "epoch": 0.3212990078237685, + "grad_norm": 2.5, + "learning_rate": 7.665828467804542e-05, + "loss": 2.4108, + "step": 7351 + }, + { + "epoch": 0.3213427160277984, + "grad_norm": 2.09375, + "learning_rate": 7.665247345655713e-05, + "loss": 1.5882, + "step": 7352 + }, + { + "epoch": 0.32138642423182834, + "grad_norm": 2.578125, + "learning_rate": 7.664666173209787e-05, + "loss": 1.9882, + "step": 7353 + }, + { + "epoch": 0.3214301324358582, + "grad_norm": 2.203125, + "learning_rate": 7.664084950477731e-05, + "loss": 1.8138, + "step": 7354 + }, + { + "epoch": 0.3214738406398881, + "grad_norm": 2.640625, + "learning_rate": 7.663503677470516e-05, + "loss": 2.143, + "step": 7355 + }, + { + "epoch": 0.321517548843918, + "grad_norm": 2.203125, + "learning_rate": 7.66292235419911e-05, + "loss": 1.7407, + "step": 7356 + }, + { + "epoch": 0.3215612570479479, + "grad_norm": 1.9609375, + "learning_rate": 7.662340980674483e-05, + "loss": 1.5849, + "step": 7357 + }, + { + "epoch": 0.3216049652519778, + "grad_norm": 3.03125, + "learning_rate": 7.661759556907607e-05, + "loss": 2.7416, + "step": 7358 + }, + { + "epoch": 0.32164867345600767, + "grad_norm": 2.1875, + "learning_rate": 7.661178082909455e-05, + "loss": 1.7122, + "step": 7359 + }, + { + "epoch": 0.3216923816600376, + "grad_norm": 2.21875, + "learning_rate": 7.660596558690998e-05, + "loss": 2.0861, + "step": 7360 + }, + { + "epoch": 0.3217360898640675, + "grad_norm": 2.6875, + "learning_rate": 7.660014984263214e-05, + "loss": 2.1306, + "step": 7361 + }, + { + "epoch": 0.32177979806809737, + "grad_norm": 2.03125, + "learning_rate": 7.659433359637072e-05, + "loss": 1.573, + "step": 7362 + }, + { + "epoch": 0.3218235062721273, + "grad_norm": 2.53125, + "learning_rate": 7.658851684823553e-05, + "loss": 1.3277, + "step": 7363 + }, + { + "epoch": 0.3218672144761572, + "grad_norm": 2.03125, + "learning_rate": 7.658269959833635e-05, + "loss": 1.7538, + "step": 7364 + }, + { + "epoch": 0.32191092268018706, + "grad_norm": 2.4375, + "learning_rate": 7.657688184678293e-05, + "loss": 2.4755, + "step": 7365 + }, + { + "epoch": 0.32195463088421694, + "grad_norm": 2.21875, + "learning_rate": 7.657106359368507e-05, + "loss": 1.7868, + "step": 7366 + }, + { + "epoch": 0.3219983390882469, + "grad_norm": 1.90625, + "learning_rate": 7.656524483915256e-05, + "loss": 1.634, + "step": 7367 + }, + { + "epoch": 0.32204204729227676, + "grad_norm": 1.765625, + "learning_rate": 7.655942558329523e-05, + "loss": 1.6302, + "step": 7368 + }, + { + "epoch": 0.32208575549630664, + "grad_norm": 2.109375, + "learning_rate": 7.655360582622286e-05, + "loss": 1.8055, + "step": 7369 + }, + { + "epoch": 0.32212946370033657, + "grad_norm": 2.390625, + "learning_rate": 7.654778556804533e-05, + "loss": 2.1191, + "step": 7370 + }, + { + "epoch": 0.32217317190436645, + "grad_norm": 1.9296875, + "learning_rate": 7.654196480887244e-05, + "loss": 1.8245, + "step": 7371 + }, + { + "epoch": 0.32221688010839633, + "grad_norm": 2.1875, + "learning_rate": 7.653614354881402e-05, + "loss": 1.6235, + "step": 7372 + }, + { + "epoch": 0.32226058831242627, + "grad_norm": 2.578125, + "learning_rate": 7.653032178797996e-05, + "loss": 2.6314, + "step": 7373 + }, + { + "epoch": 0.32230429651645615, + "grad_norm": 3.3125, + "learning_rate": 7.652449952648013e-05, + "loss": 3.2089, + "step": 7374 + }, + { + "epoch": 0.322348004720486, + "grad_norm": 2.015625, + "learning_rate": 7.65186767644244e-05, + "loss": 1.5702, + "step": 7375 + }, + { + "epoch": 0.3223917129245159, + "grad_norm": 2.234375, + "learning_rate": 7.651285350192261e-05, + "loss": 1.951, + "step": 7376 + }, + { + "epoch": 0.32243542112854584, + "grad_norm": 2.421875, + "learning_rate": 7.650702973908471e-05, + "loss": 1.9708, + "step": 7377 + }, + { + "epoch": 0.3224791293325757, + "grad_norm": 2.046875, + "learning_rate": 7.650120547602056e-05, + "loss": 1.8858, + "step": 7378 + }, + { + "epoch": 0.3225228375366056, + "grad_norm": 1.8515625, + "learning_rate": 7.64953807128401e-05, + "loss": 1.9008, + "step": 7379 + }, + { + "epoch": 0.32256654574063554, + "grad_norm": 1.8203125, + "learning_rate": 7.648955544965326e-05, + "loss": 1.4871, + "step": 7380 + }, + { + "epoch": 0.3226102539446654, + "grad_norm": 2.0625, + "learning_rate": 7.648372968656993e-05, + "loss": 1.5375, + "step": 7381 + }, + { + "epoch": 0.3226539621486953, + "grad_norm": 2.484375, + "learning_rate": 7.647790342370009e-05, + "loss": 3.0121, + "step": 7382 + }, + { + "epoch": 0.32269767035272523, + "grad_norm": 3.15625, + "learning_rate": 7.647207666115368e-05, + "loss": 1.7063, + "step": 7383 + }, + { + "epoch": 0.3227413785567551, + "grad_norm": 2.3125, + "learning_rate": 7.646624939904064e-05, + "loss": 2.0754, + "step": 7384 + }, + { + "epoch": 0.322785086760785, + "grad_norm": 2.1875, + "learning_rate": 7.646042163747097e-05, + "loss": 1.5163, + "step": 7385 + }, + { + "epoch": 0.32282879496481487, + "grad_norm": 2.5625, + "learning_rate": 7.645459337655463e-05, + "loss": 2.555, + "step": 7386 + }, + { + "epoch": 0.3228725031688448, + "grad_norm": 2.625, + "learning_rate": 7.644876461640158e-05, + "loss": 2.0501, + "step": 7387 + }, + { + "epoch": 0.3229162113728747, + "grad_norm": 2.109375, + "learning_rate": 7.644293535712189e-05, + "loss": 1.6365, + "step": 7388 + }, + { + "epoch": 0.32295991957690456, + "grad_norm": 3.796875, + "learning_rate": 7.643710559882551e-05, + "loss": 2.3091, + "step": 7389 + }, + { + "epoch": 0.3230036277809345, + "grad_norm": 2.28125, + "learning_rate": 7.643127534162247e-05, + "loss": 2.0301, + "step": 7390 + }, + { + "epoch": 0.3230473359849644, + "grad_norm": 2.96875, + "learning_rate": 7.642544458562278e-05, + "loss": 2.1748, + "step": 7391 + }, + { + "epoch": 0.32309104418899426, + "grad_norm": 1.8828125, + "learning_rate": 7.64196133309365e-05, + "loss": 1.9963, + "step": 7392 + }, + { + "epoch": 0.3231347523930242, + "grad_norm": 2.109375, + "learning_rate": 7.641378157767368e-05, + "loss": 1.8105, + "step": 7393 + }, + { + "epoch": 0.3231784605970541, + "grad_norm": 2.59375, + "learning_rate": 7.640794932594433e-05, + "loss": 1.945, + "step": 7394 + }, + { + "epoch": 0.32322216880108395, + "grad_norm": 2.046875, + "learning_rate": 7.640211657585856e-05, + "loss": 1.8537, + "step": 7395 + }, + { + "epoch": 0.32326587700511383, + "grad_norm": 1.8984375, + "learning_rate": 7.639628332752642e-05, + "loss": 2.0916, + "step": 7396 + }, + { + "epoch": 0.32330958520914377, + "grad_norm": 1.984375, + "learning_rate": 7.639044958105799e-05, + "loss": 1.7224, + "step": 7397 + }, + { + "epoch": 0.32335329341317365, + "grad_norm": 2.390625, + "learning_rate": 7.638461533656338e-05, + "loss": 1.7498, + "step": 7398 + }, + { + "epoch": 0.32339700161720353, + "grad_norm": 2.109375, + "learning_rate": 7.637878059415266e-05, + "loss": 1.8257, + "step": 7399 + }, + { + "epoch": 0.32344070982123346, + "grad_norm": 2.328125, + "learning_rate": 7.637294535393598e-05, + "loss": 1.925, + "step": 7400 + }, + { + "epoch": 0.32348441802526334, + "grad_norm": 2.625, + "learning_rate": 7.636710961602341e-05, + "loss": 2.1079, + "step": 7401 + }, + { + "epoch": 0.3235281262292932, + "grad_norm": 1.78125, + "learning_rate": 7.636127338052512e-05, + "loss": 1.7199, + "step": 7402 + }, + { + "epoch": 0.32357183443332316, + "grad_norm": 2.8125, + "learning_rate": 7.635543664755124e-05, + "loss": 1.651, + "step": 7403 + }, + { + "epoch": 0.32361554263735304, + "grad_norm": 2.484375, + "learning_rate": 7.634959941721191e-05, + "loss": 2.3902, + "step": 7404 + }, + { + "epoch": 0.3236592508413829, + "grad_norm": 2.203125, + "learning_rate": 7.634376168961729e-05, + "loss": 2.8142, + "step": 7405 + }, + { + "epoch": 0.3237029590454128, + "grad_norm": 1.9921875, + "learning_rate": 7.633792346487754e-05, + "loss": 1.6277, + "step": 7406 + }, + { + "epoch": 0.32374666724944273, + "grad_norm": 2.265625, + "learning_rate": 7.633208474310283e-05, + "loss": 1.9859, + "step": 7407 + }, + { + "epoch": 0.3237903754534726, + "grad_norm": 2.203125, + "learning_rate": 7.632624552440337e-05, + "loss": 1.6204, + "step": 7408 + }, + { + "epoch": 0.3238340836575025, + "grad_norm": 1.75, + "learning_rate": 7.632040580888936e-05, + "loss": 1.6116, + "step": 7409 + }, + { + "epoch": 0.3238777918615324, + "grad_norm": 2.296875, + "learning_rate": 7.631456559667095e-05, + "loss": 1.6195, + "step": 7410 + }, + { + "epoch": 0.3239215000655623, + "grad_norm": 1.9140625, + "learning_rate": 7.630872488785841e-05, + "loss": 2.1594, + "step": 7411 + }, + { + "epoch": 0.3239652082695922, + "grad_norm": 1.9921875, + "learning_rate": 7.630288368256193e-05, + "loss": 1.6741, + "step": 7412 + }, + { + "epoch": 0.3240089164736221, + "grad_norm": 1.9609375, + "learning_rate": 7.629704198089175e-05, + "loss": 1.9449, + "step": 7413 + }, + { + "epoch": 0.324052624677652, + "grad_norm": 1.8671875, + "learning_rate": 7.629119978295811e-05, + "loss": 1.716, + "step": 7414 + }, + { + "epoch": 0.3240963328816819, + "grad_norm": 2.5, + "learning_rate": 7.62853570888713e-05, + "loss": 1.6255, + "step": 7415 + }, + { + "epoch": 0.32414004108571176, + "grad_norm": 1.8515625, + "learning_rate": 7.62795138987415e-05, + "loss": 1.7113, + "step": 7416 + }, + { + "epoch": 0.3241837492897417, + "grad_norm": 1.890625, + "learning_rate": 7.627367021267906e-05, + "loss": 1.3831, + "step": 7417 + }, + { + "epoch": 0.3242274574937716, + "grad_norm": 2.03125, + "learning_rate": 7.626782603079421e-05, + "loss": 1.4712, + "step": 7418 + }, + { + "epoch": 0.32427116569780146, + "grad_norm": 1.8046875, + "learning_rate": 7.626198135319724e-05, + "loss": 1.3958, + "step": 7419 + }, + { + "epoch": 0.3243148739018314, + "grad_norm": 2.125, + "learning_rate": 7.625613617999847e-05, + "loss": 1.7865, + "step": 7420 + }, + { + "epoch": 0.32435858210586127, + "grad_norm": 5.65625, + "learning_rate": 7.62502905113082e-05, + "loss": 1.3227, + "step": 7421 + }, + { + "epoch": 0.32440229030989115, + "grad_norm": 1.8125, + "learning_rate": 7.624444434723674e-05, + "loss": 1.6127, + "step": 7422 + }, + { + "epoch": 0.3244459985139211, + "grad_norm": 2.046875, + "learning_rate": 7.623859768789441e-05, + "loss": 1.8083, + "step": 7423 + }, + { + "epoch": 0.32448970671795097, + "grad_norm": 1.8515625, + "learning_rate": 7.623275053339156e-05, + "loss": 1.8015, + "step": 7424 + }, + { + "epoch": 0.32453341492198085, + "grad_norm": 3.734375, + "learning_rate": 7.622690288383853e-05, + "loss": 1.9107, + "step": 7425 + }, + { + "epoch": 0.3245771231260107, + "grad_norm": 2.234375, + "learning_rate": 7.62210547393457e-05, + "loss": 2.0738, + "step": 7426 + }, + { + "epoch": 0.32462083133004066, + "grad_norm": 2.109375, + "learning_rate": 7.621520610002335e-05, + "loss": 1.9741, + "step": 7427 + }, + { + "epoch": 0.32466453953407054, + "grad_norm": 2.21875, + "learning_rate": 7.620935696598192e-05, + "loss": 1.5479, + "step": 7428 + }, + { + "epoch": 0.3247082477381004, + "grad_norm": 2.15625, + "learning_rate": 7.620350733733179e-05, + "loss": 1.9824, + "step": 7429 + }, + { + "epoch": 0.32475195594213035, + "grad_norm": 2.078125, + "learning_rate": 7.619765721418335e-05, + "loss": 1.6375, + "step": 7430 + }, + { + "epoch": 0.32479566414616023, + "grad_norm": 2.15625, + "learning_rate": 7.619180659664698e-05, + "loss": 2.0936, + "step": 7431 + }, + { + "epoch": 0.3248393723501901, + "grad_norm": 1.984375, + "learning_rate": 7.618595548483309e-05, + "loss": 2.2806, + "step": 7432 + }, + { + "epoch": 0.32488308055422005, + "grad_norm": 2.21875, + "learning_rate": 7.61801038788521e-05, + "loss": 1.6695, + "step": 7433 + }, + { + "epoch": 0.32492678875824993, + "grad_norm": 2.09375, + "learning_rate": 7.617425177881446e-05, + "loss": 1.7873, + "step": 7434 + }, + { + "epoch": 0.3249704969622798, + "grad_norm": 2.46875, + "learning_rate": 7.616839918483061e-05, + "loss": 1.407, + "step": 7435 + }, + { + "epoch": 0.3250142051663097, + "grad_norm": 3.46875, + "learning_rate": 7.616254609701096e-05, + "loss": 2.6228, + "step": 7436 + }, + { + "epoch": 0.3250579133703396, + "grad_norm": 3.46875, + "learning_rate": 7.6156692515466e-05, + "loss": 1.7178, + "step": 7437 + }, + { + "epoch": 0.3251016215743695, + "grad_norm": 2.15625, + "learning_rate": 7.615083844030618e-05, + "loss": 2.5461, + "step": 7438 + }, + { + "epoch": 0.3251453297783994, + "grad_norm": 1.984375, + "learning_rate": 7.614498387164198e-05, + "loss": 1.7624, + "step": 7439 + }, + { + "epoch": 0.3251890379824293, + "grad_norm": 2.4375, + "learning_rate": 7.613912880958386e-05, + "loss": 1.7311, + "step": 7440 + }, + { + "epoch": 0.3252327461864592, + "grad_norm": 2.171875, + "learning_rate": 7.613327325424235e-05, + "loss": 1.9983, + "step": 7441 + }, + { + "epoch": 0.3252764543904891, + "grad_norm": 2.359375, + "learning_rate": 7.612741720572794e-05, + "loss": 1.957, + "step": 7442 + }, + { + "epoch": 0.325320162594519, + "grad_norm": 2.375, + "learning_rate": 7.612156066415113e-05, + "loss": 1.513, + "step": 7443 + }, + { + "epoch": 0.3253638707985489, + "grad_norm": 2.28125, + "learning_rate": 7.611570362962248e-05, + "loss": 2.1919, + "step": 7444 + }, + { + "epoch": 0.3254075790025788, + "grad_norm": 1.8125, + "learning_rate": 7.610984610225247e-05, + "loss": 1.4103, + "step": 7445 + }, + { + "epoch": 0.3254512872066087, + "grad_norm": 2.265625, + "learning_rate": 7.610398808215166e-05, + "loss": 2.1165, + "step": 7446 + }, + { + "epoch": 0.3254949954106386, + "grad_norm": 1.9296875, + "learning_rate": 7.609812956943063e-05, + "loss": 1.7011, + "step": 7447 + }, + { + "epoch": 0.32553870361466847, + "grad_norm": 2.203125, + "learning_rate": 7.609227056419989e-05, + "loss": 1.6094, + "step": 7448 + }, + { + "epoch": 0.32558241181869835, + "grad_norm": 2.65625, + "learning_rate": 7.608641106657001e-05, + "loss": 2.8938, + "step": 7449 + }, + { + "epoch": 0.3256261200227283, + "grad_norm": 1.9765625, + "learning_rate": 7.608055107665161e-05, + "loss": 1.9487, + "step": 7450 + }, + { + "epoch": 0.32566982822675816, + "grad_norm": 2.46875, + "learning_rate": 7.607469059455526e-05, + "loss": 1.7912, + "step": 7451 + }, + { + "epoch": 0.32571353643078804, + "grad_norm": 2.21875, + "learning_rate": 7.606882962039154e-05, + "loss": 1.9667, + "step": 7452 + }, + { + "epoch": 0.325757244634818, + "grad_norm": 1.8046875, + "learning_rate": 7.606296815427106e-05, + "loss": 1.8214, + "step": 7453 + }, + { + "epoch": 0.32580095283884786, + "grad_norm": 1.90625, + "learning_rate": 7.605710619630444e-05, + "loss": 2.1281, + "step": 7454 + }, + { + "epoch": 0.32584466104287774, + "grad_norm": 2.5, + "learning_rate": 7.605124374660231e-05, + "loss": 1.7559, + "step": 7455 + }, + { + "epoch": 0.32588836924690767, + "grad_norm": 2.40625, + "learning_rate": 7.604538080527527e-05, + "loss": 1.7102, + "step": 7456 + }, + { + "epoch": 0.32593207745093755, + "grad_norm": 2.09375, + "learning_rate": 7.603951737243402e-05, + "loss": 1.8114, + "step": 7457 + }, + { + "epoch": 0.32597578565496743, + "grad_norm": 1.8203125, + "learning_rate": 7.603365344818916e-05, + "loss": 0.9383, + "step": 7458 + }, + { + "epoch": 0.3260194938589973, + "grad_norm": 1.9765625, + "learning_rate": 7.602778903265137e-05, + "loss": 2.0806, + "step": 7459 + }, + { + "epoch": 0.32606320206302725, + "grad_norm": 2.046875, + "learning_rate": 7.602192412593132e-05, + "loss": 2.2013, + "step": 7460 + }, + { + "epoch": 0.3261069102670571, + "grad_norm": 2.078125, + "learning_rate": 7.601605872813969e-05, + "loss": 2.2884, + "step": 7461 + }, + { + "epoch": 0.326150618471087, + "grad_norm": 1.96875, + "learning_rate": 7.601019283938717e-05, + "loss": 2.3615, + "step": 7462 + }, + { + "epoch": 0.32619432667511694, + "grad_norm": 2.125, + "learning_rate": 7.600432645978444e-05, + "loss": 1.6707, + "step": 7463 + }, + { + "epoch": 0.3262380348791468, + "grad_norm": 2.359375, + "learning_rate": 7.599845958944224e-05, + "loss": 1.7099, + "step": 7464 + }, + { + "epoch": 0.3262817430831767, + "grad_norm": 2.296875, + "learning_rate": 7.599259222847127e-05, + "loss": 1.9748, + "step": 7465 + }, + { + "epoch": 0.32632545128720664, + "grad_norm": 1.9765625, + "learning_rate": 7.598672437698224e-05, + "loss": 1.7586, + "step": 7466 + }, + { + "epoch": 0.3263691594912365, + "grad_norm": 2.359375, + "learning_rate": 7.598085603508592e-05, + "loss": 2.448, + "step": 7467 + }, + { + "epoch": 0.3264128676952664, + "grad_norm": 2.15625, + "learning_rate": 7.597498720289302e-05, + "loss": 1.597, + "step": 7468 + }, + { + "epoch": 0.3264565758992963, + "grad_norm": 1.953125, + "learning_rate": 7.59691178805143e-05, + "loss": 1.6784, + "step": 7469 + }, + { + "epoch": 0.3265002841033262, + "grad_norm": 2.078125, + "learning_rate": 7.596324806806052e-05, + "loss": 1.8834, + "step": 7470 + }, + { + "epoch": 0.3265439923073561, + "grad_norm": 2.34375, + "learning_rate": 7.595737776564249e-05, + "loss": 1.5806, + "step": 7471 + }, + { + "epoch": 0.32658770051138597, + "grad_norm": 1.90625, + "learning_rate": 7.595150697337095e-05, + "loss": 1.6759, + "step": 7472 + }, + { + "epoch": 0.3266314087154159, + "grad_norm": 2.390625, + "learning_rate": 7.594563569135668e-05, + "loss": 1.974, + "step": 7473 + }, + { + "epoch": 0.3266751169194458, + "grad_norm": 2.625, + "learning_rate": 7.593976391971054e-05, + "loss": 1.97, + "step": 7474 + }, + { + "epoch": 0.32671882512347566, + "grad_norm": 2.328125, + "learning_rate": 7.593389165854329e-05, + "loss": 2.3795, + "step": 7475 + }, + { + "epoch": 0.3267625333275056, + "grad_norm": 2.171875, + "learning_rate": 7.592801890796575e-05, + "loss": 2.0789, + "step": 7476 + }, + { + "epoch": 0.3268062415315355, + "grad_norm": 2.015625, + "learning_rate": 7.592214566808877e-05, + "loss": 1.7114, + "step": 7477 + }, + { + "epoch": 0.32684994973556536, + "grad_norm": 2.296875, + "learning_rate": 7.591627193902315e-05, + "loss": 2.268, + "step": 7478 + }, + { + "epoch": 0.32689365793959524, + "grad_norm": 2.09375, + "learning_rate": 7.591039772087977e-05, + "loss": 1.96, + "step": 7479 + }, + { + "epoch": 0.3269373661436252, + "grad_norm": 3.4375, + "learning_rate": 7.59045230137695e-05, + "loss": 2.1119, + "step": 7480 + }, + { + "epoch": 0.32698107434765505, + "grad_norm": 2.25, + "learning_rate": 7.589864781780314e-05, + "loss": 1.6881, + "step": 7481 + }, + { + "epoch": 0.32702478255168493, + "grad_norm": 1.890625, + "learning_rate": 7.589277213309163e-05, + "loss": 1.9424, + "step": 7482 + }, + { + "epoch": 0.32706849075571487, + "grad_norm": 1.8203125, + "learning_rate": 7.58868959597458e-05, + "loss": 1.2206, + "step": 7483 + }, + { + "epoch": 0.32711219895974475, + "grad_norm": 2.203125, + "learning_rate": 7.588101929787658e-05, + "loss": 1.8753, + "step": 7484 + }, + { + "epoch": 0.32715590716377463, + "grad_norm": 3.640625, + "learning_rate": 7.587514214759487e-05, + "loss": 1.7069, + "step": 7485 + }, + { + "epoch": 0.32719961536780456, + "grad_norm": 3.1875, + "learning_rate": 7.586926450901155e-05, + "loss": 1.8258, + "step": 7486 + }, + { + "epoch": 0.32724332357183444, + "grad_norm": 2.375, + "learning_rate": 7.586338638223757e-05, + "loss": 1.7908, + "step": 7487 + }, + { + "epoch": 0.3272870317758643, + "grad_norm": 2.09375, + "learning_rate": 7.585750776738383e-05, + "loss": 2.2592, + "step": 7488 + }, + { + "epoch": 0.3273307399798942, + "grad_norm": 1.875, + "learning_rate": 7.58516286645613e-05, + "loss": 1.6792, + "step": 7489 + }, + { + "epoch": 0.32737444818392414, + "grad_norm": 2.609375, + "learning_rate": 7.584574907388092e-05, + "loss": 2.2359, + "step": 7490 + }, + { + "epoch": 0.327418156387954, + "grad_norm": 2.09375, + "learning_rate": 7.583986899545362e-05, + "loss": 1.7483, + "step": 7491 + }, + { + "epoch": 0.3274618645919839, + "grad_norm": 2.21875, + "learning_rate": 7.58339884293904e-05, + "loss": 1.6944, + "step": 7492 + }, + { + "epoch": 0.32750557279601383, + "grad_norm": 2.78125, + "learning_rate": 7.58281073758022e-05, + "loss": 2.4001, + "step": 7493 + }, + { + "epoch": 0.3275492810000437, + "grad_norm": 2.53125, + "learning_rate": 7.58222258348e-05, + "loss": 1.9724, + "step": 7494 + }, + { + "epoch": 0.3275929892040736, + "grad_norm": 2.34375, + "learning_rate": 7.581634380649488e-05, + "loss": 1.8521, + "step": 7495 + }, + { + "epoch": 0.3276366974081035, + "grad_norm": 2.21875, + "learning_rate": 7.581046129099773e-05, + "loss": 2.0944, + "step": 7496 + }, + { + "epoch": 0.3276804056121334, + "grad_norm": 1.9296875, + "learning_rate": 7.580457828841963e-05, + "loss": 1.59, + "step": 7497 + }, + { + "epoch": 0.3277241138161633, + "grad_norm": 2.5625, + "learning_rate": 7.579869479887158e-05, + "loss": 2.4358, + "step": 7498 + }, + { + "epoch": 0.32776782202019317, + "grad_norm": 1.9765625, + "learning_rate": 7.57928108224646e-05, + "loss": 1.5962, + "step": 7499 + }, + { + "epoch": 0.3278115302242231, + "grad_norm": 2.390625, + "learning_rate": 7.578692635930975e-05, + "loss": 1.5579, + "step": 7500 + }, + { + "epoch": 0.327855238428253, + "grad_norm": 15.0, + "learning_rate": 7.578104140951807e-05, + "loss": 5.9767, + "step": 7501 + }, + { + "epoch": 0.32789894663228286, + "grad_norm": 2.015625, + "learning_rate": 7.577515597320062e-05, + "loss": 2.0851, + "step": 7502 + }, + { + "epoch": 0.3279426548363128, + "grad_norm": 2.15625, + "learning_rate": 7.576927005046844e-05, + "loss": 2.4982, + "step": 7503 + }, + { + "epoch": 0.3279863630403427, + "grad_norm": 2.25, + "learning_rate": 7.576338364143264e-05, + "loss": 1.5677, + "step": 7504 + }, + { + "epoch": 0.32803007124437256, + "grad_norm": 1.96875, + "learning_rate": 7.575749674620431e-05, + "loss": 1.7917, + "step": 7505 + }, + { + "epoch": 0.3280737794484025, + "grad_norm": 1.9296875, + "learning_rate": 7.575160936489452e-05, + "loss": 1.7448, + "step": 7506 + }, + { + "epoch": 0.32811748765243237, + "grad_norm": 2.703125, + "learning_rate": 7.574572149761437e-05, + "loss": 1.9557, + "step": 7507 + }, + { + "epoch": 0.32816119585646225, + "grad_norm": 2.234375, + "learning_rate": 7.573983314447499e-05, + "loss": 1.9883, + "step": 7508 + }, + { + "epoch": 0.32820490406049213, + "grad_norm": 2.203125, + "learning_rate": 7.573394430558749e-05, + "loss": 1.3002, + "step": 7509 + }, + { + "epoch": 0.32824861226452207, + "grad_norm": 2.046875, + "learning_rate": 7.572805498106301e-05, + "loss": 2.2083, + "step": 7510 + }, + { + "epoch": 0.32829232046855195, + "grad_norm": 1.9453125, + "learning_rate": 7.57221651710127e-05, + "loss": 1.7499, + "step": 7511 + }, + { + "epoch": 0.3283360286725818, + "grad_norm": 1.921875, + "learning_rate": 7.571627487554769e-05, + "loss": 1.98, + "step": 7512 + }, + { + "epoch": 0.32837973687661176, + "grad_norm": 2.171875, + "learning_rate": 7.571038409477913e-05, + "loss": 2.192, + "step": 7513 + }, + { + "epoch": 0.32842344508064164, + "grad_norm": 2.75, + "learning_rate": 7.570449282881822e-05, + "loss": 1.4564, + "step": 7514 + }, + { + "epoch": 0.3284671532846715, + "grad_norm": 2.21875, + "learning_rate": 7.569860107777613e-05, + "loss": 1.8013, + "step": 7515 + }, + { + "epoch": 0.32851086148870146, + "grad_norm": 2.078125, + "learning_rate": 7.5692708841764e-05, + "loss": 2.2148, + "step": 7516 + }, + { + "epoch": 0.32855456969273134, + "grad_norm": 2.0, + "learning_rate": 7.56868161208931e-05, + "loss": 1.547, + "step": 7517 + }, + { + "epoch": 0.3285982778967612, + "grad_norm": 2.25, + "learning_rate": 7.568092291527455e-05, + "loss": 2.1837, + "step": 7518 + }, + { + "epoch": 0.3286419861007911, + "grad_norm": 2.015625, + "learning_rate": 7.567502922501963e-05, + "loss": 2.1615, + "step": 7519 + }, + { + "epoch": 0.32868569430482103, + "grad_norm": 1.8203125, + "learning_rate": 7.566913505023956e-05, + "loss": 1.499, + "step": 7520 + }, + { + "epoch": 0.3287294025088509, + "grad_norm": 2.171875, + "learning_rate": 7.566324039104553e-05, + "loss": 2.5199, + "step": 7521 + }, + { + "epoch": 0.3287731107128808, + "grad_norm": 1.8046875, + "learning_rate": 7.565734524754882e-05, + "loss": 1.5872, + "step": 7522 + }, + { + "epoch": 0.3288168189169107, + "grad_norm": 2.140625, + "learning_rate": 7.565144961986064e-05, + "loss": 1.7103, + "step": 7523 + }, + { + "epoch": 0.3288605271209406, + "grad_norm": 2.109375, + "learning_rate": 7.564555350809226e-05, + "loss": 2.1791, + "step": 7524 + }, + { + "epoch": 0.3289042353249705, + "grad_norm": 2.515625, + "learning_rate": 7.5639656912355e-05, + "loss": 1.6546, + "step": 7525 + }, + { + "epoch": 0.3289479435290004, + "grad_norm": 1.765625, + "learning_rate": 7.563375983276008e-05, + "loss": 1.3833, + "step": 7526 + }, + { + "epoch": 0.3289916517330303, + "grad_norm": 2.171875, + "learning_rate": 7.56278622694188e-05, + "loss": 2.4949, + "step": 7527 + }, + { + "epoch": 0.3290353599370602, + "grad_norm": 1.953125, + "learning_rate": 7.562196422244245e-05, + "loss": 1.7161, + "step": 7528 + }, + { + "epoch": 0.32907906814109006, + "grad_norm": 1.9375, + "learning_rate": 7.561606569194237e-05, + "loss": 1.6621, + "step": 7529 + }, + { + "epoch": 0.32912277634512, + "grad_norm": 2.171875, + "learning_rate": 7.561016667802982e-05, + "loss": 1.9472, + "step": 7530 + }, + { + "epoch": 0.3291664845491499, + "grad_norm": 2.265625, + "learning_rate": 7.560426718081617e-05, + "loss": 2.2656, + "step": 7531 + }, + { + "epoch": 0.32921019275317975, + "grad_norm": 2.375, + "learning_rate": 7.559836720041274e-05, + "loss": 1.8794, + "step": 7532 + }, + { + "epoch": 0.3292539009572097, + "grad_norm": 2.375, + "learning_rate": 7.559246673693085e-05, + "loss": 2.2259, + "step": 7533 + }, + { + "epoch": 0.32929760916123957, + "grad_norm": 2.234375, + "learning_rate": 7.558656579048185e-05, + "loss": 1.6449, + "step": 7534 + }, + { + "epoch": 0.32934131736526945, + "grad_norm": 2.21875, + "learning_rate": 7.558066436117715e-05, + "loss": 2.2579, + "step": 7535 + }, + { + "epoch": 0.3293850255692994, + "grad_norm": 4.875, + "learning_rate": 7.557476244912805e-05, + "loss": 2.4239, + "step": 7536 + }, + { + "epoch": 0.32942873377332926, + "grad_norm": 1.8984375, + "learning_rate": 7.556886005444597e-05, + "loss": 1.8257, + "step": 7537 + }, + { + "epoch": 0.32947244197735914, + "grad_norm": 2.765625, + "learning_rate": 7.55629571772423e-05, + "loss": 1.2687, + "step": 7538 + }, + { + "epoch": 0.329516150181389, + "grad_norm": 2.21875, + "learning_rate": 7.555705381762841e-05, + "loss": 2.1488, + "step": 7539 + }, + { + "epoch": 0.32955985838541896, + "grad_norm": 1.953125, + "learning_rate": 7.555114997571572e-05, + "loss": 1.7148, + "step": 7540 + }, + { + "epoch": 0.32960356658944884, + "grad_norm": 1.9296875, + "learning_rate": 7.554524565161565e-05, + "loss": 1.781, + "step": 7541 + }, + { + "epoch": 0.3296472747934787, + "grad_norm": 1.8984375, + "learning_rate": 7.553934084543961e-05, + "loss": 1.5344, + "step": 7542 + }, + { + "epoch": 0.32969098299750865, + "grad_norm": 1.796875, + "learning_rate": 7.553343555729903e-05, + "loss": 1.5419, + "step": 7543 + }, + { + "epoch": 0.32973469120153853, + "grad_norm": 3.828125, + "learning_rate": 7.552752978730536e-05, + "loss": 1.7144, + "step": 7544 + }, + { + "epoch": 0.3297783994055684, + "grad_norm": 1.9296875, + "learning_rate": 7.552162353557006e-05, + "loss": 2.0587, + "step": 7545 + }, + { + "epoch": 0.32982210760959835, + "grad_norm": 2.15625, + "learning_rate": 7.551571680220457e-05, + "loss": 1.6627, + "step": 7546 + }, + { + "epoch": 0.3298658158136282, + "grad_norm": 1.75, + "learning_rate": 7.550980958732037e-05, + "loss": 1.9505, + "step": 7547 + }, + { + "epoch": 0.3299095240176581, + "grad_norm": 2.421875, + "learning_rate": 7.550390189102894e-05, + "loss": 1.7785, + "step": 7548 + }, + { + "epoch": 0.329953232221688, + "grad_norm": 1.8828125, + "learning_rate": 7.549799371344175e-05, + "loss": 1.792, + "step": 7549 + }, + { + "epoch": 0.3299969404257179, + "grad_norm": 2.09375, + "learning_rate": 7.549208505467033e-05, + "loss": 1.8559, + "step": 7550 + }, + { + "epoch": 0.3300406486297478, + "grad_norm": 2.0, + "learning_rate": 7.548617591482614e-05, + "loss": 1.8498, + "step": 7551 + }, + { + "epoch": 0.3300843568337777, + "grad_norm": 2.0625, + "learning_rate": 7.548026629402075e-05, + "loss": 2.5397, + "step": 7552 + }, + { + "epoch": 0.3301280650378076, + "grad_norm": 2.28125, + "learning_rate": 7.547435619236562e-05, + "loss": 1.9982, + "step": 7553 + }, + { + "epoch": 0.3301717732418375, + "grad_norm": 2.734375, + "learning_rate": 7.54684456099723e-05, + "loss": 1.7053, + "step": 7554 + }, + { + "epoch": 0.3302154814458674, + "grad_norm": 2.015625, + "learning_rate": 7.546253454695237e-05, + "loss": 1.6925, + "step": 7555 + }, + { + "epoch": 0.3302591896498973, + "grad_norm": 1.9296875, + "learning_rate": 7.545662300341736e-05, + "loss": 1.8186, + "step": 7556 + }, + { + "epoch": 0.3303028978539272, + "grad_norm": 2.6875, + "learning_rate": 7.54507109794788e-05, + "loss": 2.1328, + "step": 7557 + }, + { + "epoch": 0.33034660605795707, + "grad_norm": 3.15625, + "learning_rate": 7.544479847524829e-05, + "loss": 2.549, + "step": 7558 + }, + { + "epoch": 0.33039031426198695, + "grad_norm": 2.0625, + "learning_rate": 7.54388854908374e-05, + "loss": 2.1843, + "step": 7559 + }, + { + "epoch": 0.3304340224660169, + "grad_norm": 2.0, + "learning_rate": 7.543297202635772e-05, + "loss": 2.3384, + "step": 7560 + }, + { + "epoch": 0.33047773067004677, + "grad_norm": 2.578125, + "learning_rate": 7.542705808192085e-05, + "loss": 1.8247, + "step": 7561 + }, + { + "epoch": 0.33052143887407665, + "grad_norm": 2.640625, + "learning_rate": 7.542114365763837e-05, + "loss": 2.4422, + "step": 7562 + }, + { + "epoch": 0.3305651470781066, + "grad_norm": 3.734375, + "learning_rate": 7.541522875362193e-05, + "loss": 2.1137, + "step": 7563 + }, + { + "epoch": 0.33060885528213646, + "grad_norm": 1.9453125, + "learning_rate": 7.540931336998312e-05, + "loss": 1.5262, + "step": 7564 + }, + { + "epoch": 0.33065256348616634, + "grad_norm": 2.546875, + "learning_rate": 7.540339750683358e-05, + "loss": 1.9562, + "step": 7565 + }, + { + "epoch": 0.3306962716901963, + "grad_norm": 2.09375, + "learning_rate": 7.539748116428495e-05, + "loss": 2.2275, + "step": 7566 + }, + { + "epoch": 0.33073997989422615, + "grad_norm": 2.140625, + "learning_rate": 7.539156434244892e-05, + "loss": 1.2463, + "step": 7567 + }, + { + "epoch": 0.33078368809825603, + "grad_norm": 2.671875, + "learning_rate": 7.53856470414371e-05, + "loss": 1.8308, + "step": 7568 + }, + { + "epoch": 0.3308273963022859, + "grad_norm": 2.078125, + "learning_rate": 7.537972926136115e-05, + "loss": 1.3167, + "step": 7569 + }, + { + "epoch": 0.33087110450631585, + "grad_norm": 2.046875, + "learning_rate": 7.537381100233278e-05, + "loss": 1.7028, + "step": 7570 + }, + { + "epoch": 0.33091481271034573, + "grad_norm": 1.8671875, + "learning_rate": 7.536789226446367e-05, + "loss": 1.7325, + "step": 7571 + }, + { + "epoch": 0.3309585209143756, + "grad_norm": 2.09375, + "learning_rate": 7.536197304786555e-05, + "loss": 1.6386, + "step": 7572 + }, + { + "epoch": 0.33100222911840554, + "grad_norm": 1.8984375, + "learning_rate": 7.535605335265003e-05, + "loss": 1.6623, + "step": 7573 + }, + { + "epoch": 0.3310459373224354, + "grad_norm": 2.25, + "learning_rate": 7.535013317892889e-05, + "loss": 1.4363, + "step": 7574 + }, + { + "epoch": 0.3310896455264653, + "grad_norm": 2.28125, + "learning_rate": 7.534421252681387e-05, + "loss": 2.1629, + "step": 7575 + }, + { + "epoch": 0.33113335373049524, + "grad_norm": 1.9296875, + "learning_rate": 7.533829139641664e-05, + "loss": 1.831, + "step": 7576 + }, + { + "epoch": 0.3311770619345251, + "grad_norm": 2.1875, + "learning_rate": 7.5332369787849e-05, + "loss": 1.9585, + "step": 7577 + }, + { + "epoch": 0.331220770138555, + "grad_norm": 2.578125, + "learning_rate": 7.532644770122266e-05, + "loss": 2.5307, + "step": 7578 + }, + { + "epoch": 0.3312644783425849, + "grad_norm": 1.9375, + "learning_rate": 7.532052513664939e-05, + "loss": 1.8876, + "step": 7579 + }, + { + "epoch": 0.3313081865466148, + "grad_norm": 2.0625, + "learning_rate": 7.531460209424096e-05, + "loss": 1.8074, + "step": 7580 + }, + { + "epoch": 0.3313518947506447, + "grad_norm": 1.9453125, + "learning_rate": 7.530867857410915e-05, + "loss": 1.6012, + "step": 7581 + }, + { + "epoch": 0.3313956029546746, + "grad_norm": 1.671875, + "learning_rate": 7.530275457636574e-05, + "loss": 1.5269, + "step": 7582 + }, + { + "epoch": 0.3314393111587045, + "grad_norm": 2.21875, + "learning_rate": 7.529683010112252e-05, + "loss": 1.6858, + "step": 7583 + }, + { + "epoch": 0.3314830193627344, + "grad_norm": 2.609375, + "learning_rate": 7.529090514849128e-05, + "loss": 2.4866, + "step": 7584 + }, + { + "epoch": 0.33152672756676427, + "grad_norm": 1.8203125, + "learning_rate": 7.528497971858388e-05, + "loss": 1.7441, + "step": 7585 + }, + { + "epoch": 0.3315704357707942, + "grad_norm": 2.09375, + "learning_rate": 7.52790538115121e-05, + "loss": 1.7589, + "step": 7586 + }, + { + "epoch": 0.3316141439748241, + "grad_norm": 2.234375, + "learning_rate": 7.52731274273878e-05, + "loss": 1.9113, + "step": 7587 + }, + { + "epoch": 0.33165785217885396, + "grad_norm": 2.4375, + "learning_rate": 7.526720056632277e-05, + "loss": 1.9697, + "step": 7588 + }, + { + "epoch": 0.33170156038288384, + "grad_norm": 2.421875, + "learning_rate": 7.52612732284289e-05, + "loss": 2.1639, + "step": 7589 + }, + { + "epoch": 0.3317452685869138, + "grad_norm": 1.9296875, + "learning_rate": 7.525534541381806e-05, + "loss": 1.6989, + "step": 7590 + }, + { + "epoch": 0.33178897679094366, + "grad_norm": 2.671875, + "learning_rate": 7.524941712260207e-05, + "loss": 1.7631, + "step": 7591 + }, + { + "epoch": 0.33183268499497354, + "grad_norm": 2.34375, + "learning_rate": 7.524348835489286e-05, + "loss": 1.7935, + "step": 7592 + }, + { + "epoch": 0.33187639319900347, + "grad_norm": 1.9453125, + "learning_rate": 7.523755911080226e-05, + "loss": 1.505, + "step": 7593 + }, + { + "epoch": 0.33192010140303335, + "grad_norm": 2.578125, + "learning_rate": 7.523162939044219e-05, + "loss": 2.3742, + "step": 7594 + }, + { + "epoch": 0.33196380960706323, + "grad_norm": 2.390625, + "learning_rate": 7.522569919392455e-05, + "loss": 2.1225, + "step": 7595 + }, + { + "epoch": 0.33200751781109317, + "grad_norm": 1.9296875, + "learning_rate": 7.521976852136125e-05, + "loss": 1.9097, + "step": 7596 + }, + { + "epoch": 0.33205122601512305, + "grad_norm": 2.046875, + "learning_rate": 7.521383737286423e-05, + "loss": 1.6101, + "step": 7597 + }, + { + "epoch": 0.3320949342191529, + "grad_norm": 2.1875, + "learning_rate": 7.520790574854538e-05, + "loss": 1.6832, + "step": 7598 + }, + { + "epoch": 0.3321386424231828, + "grad_norm": 2.03125, + "learning_rate": 7.520197364851667e-05, + "loss": 1.3584, + "step": 7599 + }, + { + "epoch": 0.33218235062721274, + "grad_norm": 1.7578125, + "learning_rate": 7.519604107289003e-05, + "loss": 1.5448, + "step": 7600 + }, + { + "epoch": 0.3322260588312426, + "grad_norm": 3.875, + "learning_rate": 7.519010802177744e-05, + "loss": 2.2089, + "step": 7601 + }, + { + "epoch": 0.3322697670352725, + "grad_norm": 1.890625, + "learning_rate": 7.518417449529085e-05, + "loss": 1.6202, + "step": 7602 + }, + { + "epoch": 0.33231347523930244, + "grad_norm": 2.25, + "learning_rate": 7.517824049354221e-05, + "loss": 1.6155, + "step": 7603 + }, + { + "epoch": 0.3323571834433323, + "grad_norm": 2.234375, + "learning_rate": 7.517230601664354e-05, + "loss": 2.1604, + "step": 7604 + }, + { + "epoch": 0.3324008916473622, + "grad_norm": 3.109375, + "learning_rate": 7.516637106470683e-05, + "loss": 2.687, + "step": 7605 + }, + { + "epoch": 0.33244459985139213, + "grad_norm": 2.296875, + "learning_rate": 7.516043563784405e-05, + "loss": 2.0484, + "step": 7606 + }, + { + "epoch": 0.332488308055422, + "grad_norm": 2.21875, + "learning_rate": 7.515449973616723e-05, + "loss": 1.984, + "step": 7607 + }, + { + "epoch": 0.3325320162594519, + "grad_norm": 2.3125, + "learning_rate": 7.514856335978842e-05, + "loss": 2.0978, + "step": 7608 + }, + { + "epoch": 0.33257572446348177, + "grad_norm": 2.28125, + "learning_rate": 7.514262650881958e-05, + "loss": 1.5738, + "step": 7609 + }, + { + "epoch": 0.3326194326675117, + "grad_norm": 2.203125, + "learning_rate": 7.51366891833728e-05, + "loss": 2.2538, + "step": 7610 + }, + { + "epoch": 0.3326631408715416, + "grad_norm": 2.4375, + "learning_rate": 7.513075138356012e-05, + "loss": 2.524, + "step": 7611 + }, + { + "epoch": 0.33270684907557146, + "grad_norm": 2.140625, + "learning_rate": 7.512481310949358e-05, + "loss": 1.6862, + "step": 7612 + }, + { + "epoch": 0.3327505572796014, + "grad_norm": 2.34375, + "learning_rate": 7.511887436128525e-05, + "loss": 1.6454, + "step": 7613 + }, + { + "epoch": 0.3327942654836313, + "grad_norm": 2.046875, + "learning_rate": 7.511293513904718e-05, + "loss": 2.314, + "step": 7614 + }, + { + "epoch": 0.33283797368766116, + "grad_norm": 1.8984375, + "learning_rate": 7.510699544289151e-05, + "loss": 1.6821, + "step": 7615 + }, + { + "epoch": 0.3328816818916911, + "grad_norm": 2.53125, + "learning_rate": 7.510105527293026e-05, + "loss": 1.4252, + "step": 7616 + }, + { + "epoch": 0.332925390095721, + "grad_norm": 2.28125, + "learning_rate": 7.509511462927559e-05, + "loss": 2.2525, + "step": 7617 + }, + { + "epoch": 0.33296909829975085, + "grad_norm": 2.09375, + "learning_rate": 7.508917351203957e-05, + "loss": 1.7308, + "step": 7618 + }, + { + "epoch": 0.33301280650378073, + "grad_norm": 2.46875, + "learning_rate": 7.508323192133432e-05, + "loss": 1.7358, + "step": 7619 + }, + { + "epoch": 0.33305651470781067, + "grad_norm": 2.375, + "learning_rate": 7.507728985727199e-05, + "loss": 2.3117, + "step": 7620 + }, + { + "epoch": 0.33310022291184055, + "grad_norm": 1.9453125, + "learning_rate": 7.50713473199647e-05, + "loss": 1.3512, + "step": 7621 + }, + { + "epoch": 0.33314393111587043, + "grad_norm": 2.65625, + "learning_rate": 7.506540430952461e-05, + "loss": 1.5071, + "step": 7622 + }, + { + "epoch": 0.33318763931990036, + "grad_norm": 2.53125, + "learning_rate": 7.505946082606386e-05, + "loss": 1.7641, + "step": 7623 + }, + { + "epoch": 0.33323134752393024, + "grad_norm": 2.171875, + "learning_rate": 7.505351686969457e-05, + "loss": 2.113, + "step": 7624 + }, + { + "epoch": 0.3332750557279601, + "grad_norm": 2.21875, + "learning_rate": 7.504757244052901e-05, + "loss": 1.863, + "step": 7625 + }, + { + "epoch": 0.33331876393199006, + "grad_norm": 3.75, + "learning_rate": 7.504162753867927e-05, + "loss": 2.0576, + "step": 7626 + }, + { + "epoch": 0.33336247213601994, + "grad_norm": 1.8984375, + "learning_rate": 7.503568216425757e-05, + "loss": 1.4004, + "step": 7627 + }, + { + "epoch": 0.3334061803400498, + "grad_norm": 2.078125, + "learning_rate": 7.502973631737612e-05, + "loss": 2.2958, + "step": 7628 + }, + { + "epoch": 0.3334498885440797, + "grad_norm": 1.828125, + "learning_rate": 7.50237899981471e-05, + "loss": 1.5195, + "step": 7629 + }, + { + "epoch": 0.33349359674810963, + "grad_norm": 3.46875, + "learning_rate": 7.501784320668277e-05, + "loss": 2.7067, + "step": 7630 + }, + { + "epoch": 0.3335373049521395, + "grad_norm": 2.484375, + "learning_rate": 7.501189594309531e-05, + "loss": 0.8452, + "step": 7631 + }, + { + "epoch": 0.3335810131561694, + "grad_norm": 2.3125, + "learning_rate": 7.500594820749698e-05, + "loss": 2.0745, + "step": 7632 + }, + { + "epoch": 0.33362472136019933, + "grad_norm": 1.921875, + "learning_rate": 7.500000000000001e-05, + "loss": 1.715, + "step": 7633 + }, + { + "epoch": 0.3336684295642292, + "grad_norm": 2.71875, + "learning_rate": 7.499405132071665e-05, + "loss": 1.7119, + "step": 7634 + }, + { + "epoch": 0.3337121377682591, + "grad_norm": 2.078125, + "learning_rate": 7.498810216975917e-05, + "loss": 2.5392, + "step": 7635 + }, + { + "epoch": 0.333755845972289, + "grad_norm": 1.953125, + "learning_rate": 7.498215254723982e-05, + "loss": 1.5535, + "step": 7636 + }, + { + "epoch": 0.3337995541763189, + "grad_norm": 2.203125, + "learning_rate": 7.49762024532709e-05, + "loss": 2.2684, + "step": 7637 + }, + { + "epoch": 0.3338432623803488, + "grad_norm": 2.5625, + "learning_rate": 7.497025188796469e-05, + "loss": 1.5744, + "step": 7638 + }, + { + "epoch": 0.33388697058437866, + "grad_norm": 1.828125, + "learning_rate": 7.496430085143348e-05, + "loss": 1.4891, + "step": 7639 + }, + { + "epoch": 0.3339306787884086, + "grad_norm": 2.0, + "learning_rate": 7.495834934378958e-05, + "loss": 1.5024, + "step": 7640 + }, + { + "epoch": 0.3339743869924385, + "grad_norm": 2.234375, + "learning_rate": 7.495239736514531e-05, + "loss": 1.609, + "step": 7641 + }, + { + "epoch": 0.33401809519646836, + "grad_norm": 1.9765625, + "learning_rate": 7.494644491561299e-05, + "loss": 2.082, + "step": 7642 + }, + { + "epoch": 0.3340618034004983, + "grad_norm": 1.7890625, + "learning_rate": 7.494049199530494e-05, + "loss": 1.2633, + "step": 7643 + }, + { + "epoch": 0.33410551160452817, + "grad_norm": 1.90625, + "learning_rate": 7.49345386043335e-05, + "loss": 1.6481, + "step": 7644 + }, + { + "epoch": 0.33414921980855805, + "grad_norm": 1.9375, + "learning_rate": 7.492858474281103e-05, + "loss": 1.5163, + "step": 7645 + }, + { + "epoch": 0.334192928012588, + "grad_norm": 4.1875, + "learning_rate": 7.492263041084988e-05, + "loss": 1.8762, + "step": 7646 + }, + { + "epoch": 0.33423663621661787, + "grad_norm": 4.84375, + "learning_rate": 7.491667560856242e-05, + "loss": 1.7709, + "step": 7647 + }, + { + "epoch": 0.33428034442064775, + "grad_norm": 2.140625, + "learning_rate": 7.491072033606104e-05, + "loss": 1.7079, + "step": 7648 + }, + { + "epoch": 0.3343240526246776, + "grad_norm": 2.140625, + "learning_rate": 7.49047645934581e-05, + "loss": 1.9067, + "step": 7649 + }, + { + "epoch": 0.33436776082870756, + "grad_norm": 2.640625, + "learning_rate": 7.4898808380866e-05, + "loss": 2.0671, + "step": 7650 + }, + { + "epoch": 0.33441146903273744, + "grad_norm": 2.15625, + "learning_rate": 7.489285169839717e-05, + "loss": 1.4923, + "step": 7651 + }, + { + "epoch": 0.3344551772367673, + "grad_norm": 2.546875, + "learning_rate": 7.488689454616399e-05, + "loss": 2.8122, + "step": 7652 + }, + { + "epoch": 0.33449888544079726, + "grad_norm": 1.9453125, + "learning_rate": 7.488093692427887e-05, + "loss": 2.0358, + "step": 7653 + }, + { + "epoch": 0.33454259364482714, + "grad_norm": 2.015625, + "learning_rate": 7.487497883285428e-05, + "loss": 2.0688, + "step": 7654 + }, + { + "epoch": 0.334586301848857, + "grad_norm": 2.140625, + "learning_rate": 7.486902027200263e-05, + "loss": 1.5415, + "step": 7655 + }, + { + "epoch": 0.33463001005288695, + "grad_norm": 1.9609375, + "learning_rate": 7.486306124183637e-05, + "loss": 1.4973, + "step": 7656 + }, + { + "epoch": 0.33467371825691683, + "grad_norm": 1.9609375, + "learning_rate": 7.485710174246794e-05, + "loss": 1.6159, + "step": 7657 + }, + { + "epoch": 0.3347174264609467, + "grad_norm": 2.109375, + "learning_rate": 7.485114177400984e-05, + "loss": 1.4921, + "step": 7658 + }, + { + "epoch": 0.3347611346649766, + "grad_norm": 2.359375, + "learning_rate": 7.484518133657455e-05, + "loss": 1.5661, + "step": 7659 + }, + { + "epoch": 0.3348048428690065, + "grad_norm": 4.96875, + "learning_rate": 7.483922043027448e-05, + "loss": 1.9808, + "step": 7660 + }, + { + "epoch": 0.3348485510730364, + "grad_norm": 2.046875, + "learning_rate": 7.48332590552222e-05, + "loss": 2.1961, + "step": 7661 + }, + { + "epoch": 0.3348922592770663, + "grad_norm": 2.265625, + "learning_rate": 7.482729721153016e-05, + "loss": 1.6836, + "step": 7662 + }, + { + "epoch": 0.3349359674810962, + "grad_norm": 1.828125, + "learning_rate": 7.482133489931091e-05, + "loss": 1.5001, + "step": 7663 + }, + { + "epoch": 0.3349796756851261, + "grad_norm": 1.765625, + "learning_rate": 7.481537211867693e-05, + "loss": 1.6789, + "step": 7664 + }, + { + "epoch": 0.335023383889156, + "grad_norm": 1.9921875, + "learning_rate": 7.480940886974077e-05, + "loss": 1.4361, + "step": 7665 + }, + { + "epoch": 0.3350670920931859, + "grad_norm": 2.328125, + "learning_rate": 7.480344515261495e-05, + "loss": 2.1453, + "step": 7666 + }, + { + "epoch": 0.3351108002972158, + "grad_norm": 2.171875, + "learning_rate": 7.479748096741201e-05, + "loss": 1.6566, + "step": 7667 + }, + { + "epoch": 0.3351545085012457, + "grad_norm": 2.1875, + "learning_rate": 7.479151631424453e-05, + "loss": 1.9521, + "step": 7668 + }, + { + "epoch": 0.33519821670527555, + "grad_norm": 2.828125, + "learning_rate": 7.478555119322505e-05, + "loss": 2.6565, + "step": 7669 + }, + { + "epoch": 0.3352419249093055, + "grad_norm": 2.296875, + "learning_rate": 7.477958560446613e-05, + "loss": 2.3781, + "step": 7670 + }, + { + "epoch": 0.33528563311333537, + "grad_norm": 2.15625, + "learning_rate": 7.477361954808037e-05, + "loss": 1.7341, + "step": 7671 + }, + { + "epoch": 0.33532934131736525, + "grad_norm": 2.265625, + "learning_rate": 7.476765302418037e-05, + "loss": 1.6521, + "step": 7672 + }, + { + "epoch": 0.3353730495213952, + "grad_norm": 2.265625, + "learning_rate": 7.47616860328787e-05, + "loss": 2.4899, + "step": 7673 + }, + { + "epoch": 0.33541675772542506, + "grad_norm": 2.328125, + "learning_rate": 7.475571857428797e-05, + "loss": 1.8374, + "step": 7674 + }, + { + "epoch": 0.33546046592945494, + "grad_norm": 2.03125, + "learning_rate": 7.474975064852081e-05, + "loss": 1.6771, + "step": 7675 + }, + { + "epoch": 0.3355041741334849, + "grad_norm": 2.109375, + "learning_rate": 7.474378225568983e-05, + "loss": 2.0606, + "step": 7676 + }, + { + "epoch": 0.33554788233751476, + "grad_norm": 2.25, + "learning_rate": 7.473781339590766e-05, + "loss": 1.8691, + "step": 7677 + }, + { + "epoch": 0.33559159054154464, + "grad_norm": 3.3125, + "learning_rate": 7.473184406928696e-05, + "loss": 2.2992, + "step": 7678 + }, + { + "epoch": 0.3356352987455745, + "grad_norm": 2.140625, + "learning_rate": 7.472587427594037e-05, + "loss": 1.6181, + "step": 7679 + }, + { + "epoch": 0.33567900694960445, + "grad_norm": 2.078125, + "learning_rate": 7.471990401598052e-05, + "loss": 1.5764, + "step": 7680 + }, + { + "epoch": 0.33572271515363433, + "grad_norm": 2.0, + "learning_rate": 7.471393328952012e-05, + "loss": 1.7561, + "step": 7681 + }, + { + "epoch": 0.3357664233576642, + "grad_norm": 2.0, + "learning_rate": 7.470796209667184e-05, + "loss": 1.7098, + "step": 7682 + }, + { + "epoch": 0.33581013156169415, + "grad_norm": 1.7734375, + "learning_rate": 7.470199043754833e-05, + "loss": 1.8006, + "step": 7683 + }, + { + "epoch": 0.335853839765724, + "grad_norm": 2.875, + "learning_rate": 7.469601831226233e-05, + "loss": 1.7551, + "step": 7684 + }, + { + "epoch": 0.3358975479697539, + "grad_norm": 1.8828125, + "learning_rate": 7.469004572092651e-05, + "loss": 1.9204, + "step": 7685 + }, + { + "epoch": 0.33594125617378384, + "grad_norm": 1.78125, + "learning_rate": 7.46840726636536e-05, + "loss": 1.3732, + "step": 7686 + }, + { + "epoch": 0.3359849643778137, + "grad_norm": 2.21875, + "learning_rate": 7.46780991405563e-05, + "loss": 1.8284, + "step": 7687 + }, + { + "epoch": 0.3360286725818436, + "grad_norm": 2.4375, + "learning_rate": 7.467212515174736e-05, + "loss": 1.9712, + "step": 7688 + }, + { + "epoch": 0.3360723807858735, + "grad_norm": 1.859375, + "learning_rate": 7.466615069733951e-05, + "loss": 1.4069, + "step": 7689 + }, + { + "epoch": 0.3361160889899034, + "grad_norm": 2.328125, + "learning_rate": 7.466017577744549e-05, + "loss": 2.1121, + "step": 7690 + }, + { + "epoch": 0.3361597971939333, + "grad_norm": 2.09375, + "learning_rate": 7.465420039217806e-05, + "loss": 1.5186, + "step": 7691 + }, + { + "epoch": 0.3362035053979632, + "grad_norm": 2.125, + "learning_rate": 7.464822454165e-05, + "loss": 1.6417, + "step": 7692 + }, + { + "epoch": 0.3362472136019931, + "grad_norm": 4.90625, + "learning_rate": 7.464224822597407e-05, + "loss": 1.5733, + "step": 7693 + }, + { + "epoch": 0.336290921806023, + "grad_norm": 3.0625, + "learning_rate": 7.463627144526304e-05, + "loss": 2.7127, + "step": 7694 + }, + { + "epoch": 0.33633463001005287, + "grad_norm": 1.8828125, + "learning_rate": 7.463029419962971e-05, + "loss": 1.4256, + "step": 7695 + }, + { + "epoch": 0.3363783382140828, + "grad_norm": 2.265625, + "learning_rate": 7.462431648918689e-05, + "loss": 1.8544, + "step": 7696 + }, + { + "epoch": 0.3364220464181127, + "grad_norm": 2.296875, + "learning_rate": 7.461833831404737e-05, + "loss": 2.4548, + "step": 7697 + }, + { + "epoch": 0.33646575462214257, + "grad_norm": 4.78125, + "learning_rate": 7.461235967432398e-05, + "loss": 2.2272, + "step": 7698 + }, + { + "epoch": 0.33650946282617245, + "grad_norm": 5.53125, + "learning_rate": 7.460638057012955e-05, + "loss": 2.3309, + "step": 7699 + }, + { + "epoch": 0.3365531710302024, + "grad_norm": 2.296875, + "learning_rate": 7.46004010015769e-05, + "loss": 2.1454, + "step": 7700 + }, + { + "epoch": 0.33659687923423226, + "grad_norm": 2.671875, + "learning_rate": 7.459442096877886e-05, + "loss": 1.3449, + "step": 7701 + }, + { + "epoch": 0.33664058743826214, + "grad_norm": 2.390625, + "learning_rate": 7.458844047184832e-05, + "loss": 2.5731, + "step": 7702 + }, + { + "epoch": 0.3366842956422921, + "grad_norm": 2.359375, + "learning_rate": 7.458245951089813e-05, + "loss": 1.9036, + "step": 7703 + }, + { + "epoch": 0.33672800384632195, + "grad_norm": 2.203125, + "learning_rate": 7.457647808604113e-05, + "loss": 2.333, + "step": 7704 + }, + { + "epoch": 0.33677171205035183, + "grad_norm": 1.984375, + "learning_rate": 7.457049619739022e-05, + "loss": 1.6744, + "step": 7705 + }, + { + "epoch": 0.33681542025438177, + "grad_norm": 2.28125, + "learning_rate": 7.45645138450583e-05, + "loss": 1.5279, + "step": 7706 + }, + { + "epoch": 0.33685912845841165, + "grad_norm": 1.8203125, + "learning_rate": 7.455853102915825e-05, + "loss": 1.5336, + "step": 7707 + }, + { + "epoch": 0.33690283666244153, + "grad_norm": 2.21875, + "learning_rate": 7.455254774980297e-05, + "loss": 1.969, + "step": 7708 + }, + { + "epoch": 0.3369465448664714, + "grad_norm": 2.109375, + "learning_rate": 7.45465640071054e-05, + "loss": 2.3044, + "step": 7709 + }, + { + "epoch": 0.33699025307050134, + "grad_norm": 2.671875, + "learning_rate": 7.454057980117841e-05, + "loss": 1.9823, + "step": 7710 + }, + { + "epoch": 0.3370339612745312, + "grad_norm": 1.875, + "learning_rate": 7.453459513213498e-05, + "loss": 1.6091, + "step": 7711 + }, + { + "epoch": 0.3370776694785611, + "grad_norm": 2.3125, + "learning_rate": 7.452861000008803e-05, + "loss": 2.207, + "step": 7712 + }, + { + "epoch": 0.33712137768259104, + "grad_norm": 1.859375, + "learning_rate": 7.45226244051505e-05, + "loss": 1.6165, + "step": 7713 + }, + { + "epoch": 0.3371650858866209, + "grad_norm": 2.171875, + "learning_rate": 7.451663834743537e-05, + "loss": 1.8244, + "step": 7714 + }, + { + "epoch": 0.3372087940906508, + "grad_norm": 1.9609375, + "learning_rate": 7.451065182705558e-05, + "loss": 1.6859, + "step": 7715 + }, + { + "epoch": 0.33725250229468073, + "grad_norm": 2.265625, + "learning_rate": 7.450466484412413e-05, + "loss": 2.2727, + "step": 7716 + }, + { + "epoch": 0.3372962104987106, + "grad_norm": 2.140625, + "learning_rate": 7.449867739875397e-05, + "loss": 1.7578, + "step": 7717 + }, + { + "epoch": 0.3373399187027405, + "grad_norm": 3.15625, + "learning_rate": 7.449268949105812e-05, + "loss": 1.8431, + "step": 7718 + }, + { + "epoch": 0.33738362690677043, + "grad_norm": 2.40625, + "learning_rate": 7.448670112114959e-05, + "loss": 2.004, + "step": 7719 + }, + { + "epoch": 0.3374273351108003, + "grad_norm": 2.125, + "learning_rate": 7.448071228914134e-05, + "loss": 1.9159, + "step": 7720 + }, + { + "epoch": 0.3374710433148302, + "grad_norm": 2.015625, + "learning_rate": 7.447472299514644e-05, + "loss": 1.911, + "step": 7721 + }, + { + "epoch": 0.33751475151886007, + "grad_norm": 2.03125, + "learning_rate": 7.44687332392779e-05, + "loss": 1.8569, + "step": 7722 + }, + { + "epoch": 0.33755845972289, + "grad_norm": 2.15625, + "learning_rate": 7.446274302164873e-05, + "loss": 1.5678, + "step": 7723 + }, + { + "epoch": 0.3376021679269199, + "grad_norm": 2.0, + "learning_rate": 7.445675234237202e-05, + "loss": 1.7527, + "step": 7724 + }, + { + "epoch": 0.33764587613094976, + "grad_norm": 2.046875, + "learning_rate": 7.445076120156078e-05, + "loss": 2.0955, + "step": 7725 + }, + { + "epoch": 0.3376895843349797, + "grad_norm": 2.203125, + "learning_rate": 7.44447695993281e-05, + "loss": 1.7765, + "step": 7726 + }, + { + "epoch": 0.3377332925390096, + "grad_norm": 1.671875, + "learning_rate": 7.443877753578702e-05, + "loss": 1.4968, + "step": 7727 + }, + { + "epoch": 0.33777700074303946, + "grad_norm": 3.109375, + "learning_rate": 7.443278501105065e-05, + "loss": 2.7519, + "step": 7728 + }, + { + "epoch": 0.3378207089470694, + "grad_norm": 2.03125, + "learning_rate": 7.442679202523208e-05, + "loss": 2.2485, + "step": 7729 + }, + { + "epoch": 0.33786441715109927, + "grad_norm": 2.25, + "learning_rate": 7.442079857844438e-05, + "loss": 2.299, + "step": 7730 + }, + { + "epoch": 0.33790812535512915, + "grad_norm": 2.21875, + "learning_rate": 7.441480467080066e-05, + "loss": 1.8709, + "step": 7731 + }, + { + "epoch": 0.33795183355915903, + "grad_norm": 2.296875, + "learning_rate": 7.440881030241407e-05, + "loss": 2.7728, + "step": 7732 + }, + { + "epoch": 0.33799554176318897, + "grad_norm": 2.078125, + "learning_rate": 7.44028154733977e-05, + "loss": 1.9273, + "step": 7733 + }, + { + "epoch": 0.33803924996721885, + "grad_norm": 1.953125, + "learning_rate": 7.439682018386467e-05, + "loss": 1.672, + "step": 7734 + }, + { + "epoch": 0.3380829581712487, + "grad_norm": 2.5, + "learning_rate": 7.439082443392813e-05, + "loss": 2.3186, + "step": 7735 + }, + { + "epoch": 0.33812666637527866, + "grad_norm": 1.9296875, + "learning_rate": 7.438482822370124e-05, + "loss": 1.9351, + "step": 7736 + }, + { + "epoch": 0.33817037457930854, + "grad_norm": 1.9140625, + "learning_rate": 7.437883155329715e-05, + "loss": 1.9687, + "step": 7737 + }, + { + "epoch": 0.3382140827833384, + "grad_norm": 1.921875, + "learning_rate": 7.437283442282904e-05, + "loss": 1.5673, + "step": 7738 + }, + { + "epoch": 0.33825779098736836, + "grad_norm": 2.421875, + "learning_rate": 7.436683683241006e-05, + "loss": 2.2424, + "step": 7739 + }, + { + "epoch": 0.33830149919139824, + "grad_norm": 2.09375, + "learning_rate": 7.43608387821534e-05, + "loss": 2.1129, + "step": 7740 + }, + { + "epoch": 0.3383452073954281, + "grad_norm": 1.8203125, + "learning_rate": 7.435484027217225e-05, + "loss": 1.741, + "step": 7741 + }, + { + "epoch": 0.338388915599458, + "grad_norm": 1.7734375, + "learning_rate": 7.434884130257985e-05, + "loss": 1.89, + "step": 7742 + }, + { + "epoch": 0.33843262380348793, + "grad_norm": 2.4375, + "learning_rate": 7.434284187348935e-05, + "loss": 1.9559, + "step": 7743 + }, + { + "epoch": 0.3384763320075178, + "grad_norm": 2.0, + "learning_rate": 7.4336841985014e-05, + "loss": 1.6128, + "step": 7744 + }, + { + "epoch": 0.3385200402115477, + "grad_norm": 2.078125, + "learning_rate": 7.433084163726703e-05, + "loss": 1.7827, + "step": 7745 + }, + { + "epoch": 0.3385637484155776, + "grad_norm": 2.203125, + "learning_rate": 7.432484083036165e-05, + "loss": 1.7955, + "step": 7746 + }, + { + "epoch": 0.3386074566196075, + "grad_norm": 2.21875, + "learning_rate": 7.431883956441112e-05, + "loss": 2.5427, + "step": 7747 + }, + { + "epoch": 0.3386511648236374, + "grad_norm": 2.296875, + "learning_rate": 7.431283783952872e-05, + "loss": 1.9095, + "step": 7748 + }, + { + "epoch": 0.3386948730276673, + "grad_norm": 2.609375, + "learning_rate": 7.430683565582766e-05, + "loss": 1.592, + "step": 7749 + }, + { + "epoch": 0.3387385812316972, + "grad_norm": 2.71875, + "learning_rate": 7.430083301342124e-05, + "loss": 2.0556, + "step": 7750 + }, + { + "epoch": 0.3387822894357271, + "grad_norm": 1.9765625, + "learning_rate": 7.429482991242274e-05, + "loss": 1.5385, + "step": 7751 + }, + { + "epoch": 0.33882599763975696, + "grad_norm": 1.5390625, + "learning_rate": 7.428882635294543e-05, + "loss": 1.3219, + "step": 7752 + }, + { + "epoch": 0.3388697058437869, + "grad_norm": 1.8828125, + "learning_rate": 7.428282233510262e-05, + "loss": 1.8182, + "step": 7753 + }, + { + "epoch": 0.3389134140478168, + "grad_norm": 2.546875, + "learning_rate": 7.427681785900761e-05, + "loss": 2.379, + "step": 7754 + }, + { + "epoch": 0.33895712225184665, + "grad_norm": 2.21875, + "learning_rate": 7.427081292477371e-05, + "loss": 2.1487, + "step": 7755 + }, + { + "epoch": 0.3390008304558766, + "grad_norm": 2.390625, + "learning_rate": 7.426480753251425e-05, + "loss": 2.2427, + "step": 7756 + }, + { + "epoch": 0.33904453865990647, + "grad_norm": 2.03125, + "learning_rate": 7.425880168234256e-05, + "loss": 2.2215, + "step": 7757 + }, + { + "epoch": 0.33908824686393635, + "grad_norm": 2.953125, + "learning_rate": 7.425279537437198e-05, + "loss": 1.9807, + "step": 7758 + }, + { + "epoch": 0.3391319550679663, + "grad_norm": 6.96875, + "learning_rate": 7.424678860871584e-05, + "loss": 3.9342, + "step": 7759 + }, + { + "epoch": 0.33917566327199616, + "grad_norm": 2.28125, + "learning_rate": 7.42407813854875e-05, + "loss": 2.0916, + "step": 7760 + }, + { + "epoch": 0.33921937147602604, + "grad_norm": 2.1875, + "learning_rate": 7.423477370480035e-05, + "loss": 1.8062, + "step": 7761 + }, + { + "epoch": 0.3392630796800559, + "grad_norm": 2.078125, + "learning_rate": 7.422876556676776e-05, + "loss": 1.7046, + "step": 7762 + }, + { + "epoch": 0.33930678788408586, + "grad_norm": 1.8203125, + "learning_rate": 7.422275697150308e-05, + "loss": 1.4491, + "step": 7763 + }, + { + "epoch": 0.33935049608811574, + "grad_norm": 2.15625, + "learning_rate": 7.421674791911973e-05, + "loss": 1.9007, + "step": 7764 + }, + { + "epoch": 0.3393942042921456, + "grad_norm": 1.765625, + "learning_rate": 7.42107384097311e-05, + "loss": 1.5789, + "step": 7765 + }, + { + "epoch": 0.33943791249617555, + "grad_norm": 1.875, + "learning_rate": 7.420472844345059e-05, + "loss": 1.7002, + "step": 7766 + }, + { + "epoch": 0.33948162070020543, + "grad_norm": 1.921875, + "learning_rate": 7.419871802039163e-05, + "loss": 1.5454, + "step": 7767 + }, + { + "epoch": 0.3395253289042353, + "grad_norm": 1.765625, + "learning_rate": 7.419270714066765e-05, + "loss": 1.8283, + "step": 7768 + }, + { + "epoch": 0.33956903710826525, + "grad_norm": 1.859375, + "learning_rate": 7.418669580439209e-05, + "loss": 1.5417, + "step": 7769 + }, + { + "epoch": 0.33961274531229513, + "grad_norm": 2.328125, + "learning_rate": 7.418068401167834e-05, + "loss": 1.8136, + "step": 7770 + }, + { + "epoch": 0.339656453516325, + "grad_norm": 2.1875, + "learning_rate": 7.41746717626399e-05, + "loss": 1.7942, + "step": 7771 + }, + { + "epoch": 0.3397001617203549, + "grad_norm": 2.765625, + "learning_rate": 7.416865905739024e-05, + "loss": 2.0847, + "step": 7772 + }, + { + "epoch": 0.3397438699243848, + "grad_norm": 1.7734375, + "learning_rate": 7.41626458960428e-05, + "loss": 1.6278, + "step": 7773 + }, + { + "epoch": 0.3397875781284147, + "grad_norm": 2.375, + "learning_rate": 7.415663227871106e-05, + "loss": 2.0389, + "step": 7774 + }, + { + "epoch": 0.3398312863324446, + "grad_norm": 1.8515625, + "learning_rate": 7.41506182055085e-05, + "loss": 1.7392, + "step": 7775 + }, + { + "epoch": 0.3398749945364745, + "grad_norm": 2.65625, + "learning_rate": 7.414460367654864e-05, + "loss": 2.5617, + "step": 7776 + }, + { + "epoch": 0.3399187027405044, + "grad_norm": 2.109375, + "learning_rate": 7.413858869194496e-05, + "loss": 1.6048, + "step": 7777 + }, + { + "epoch": 0.3399624109445343, + "grad_norm": 2.109375, + "learning_rate": 7.413257325181098e-05, + "loss": 1.5458, + "step": 7778 + }, + { + "epoch": 0.3400061191485642, + "grad_norm": 2.53125, + "learning_rate": 7.412655735626024e-05, + "loss": 2.3651, + "step": 7779 + }, + { + "epoch": 0.3400498273525941, + "grad_norm": 2.5625, + "learning_rate": 7.412054100540623e-05, + "loss": 2.3673, + "step": 7780 + }, + { + "epoch": 0.34009353555662397, + "grad_norm": 2.515625, + "learning_rate": 7.41145241993625e-05, + "loss": 2.4358, + "step": 7781 + }, + { + "epoch": 0.34013724376065385, + "grad_norm": 2.546875, + "learning_rate": 7.410850693824261e-05, + "loss": 1.8383, + "step": 7782 + }, + { + "epoch": 0.3401809519646838, + "grad_norm": 2.1875, + "learning_rate": 7.41024892221601e-05, + "loss": 2.3828, + "step": 7783 + }, + { + "epoch": 0.34022466016871367, + "grad_norm": 1.796875, + "learning_rate": 7.409647105122854e-05, + "loss": 1.704, + "step": 7784 + }, + { + "epoch": 0.34026836837274355, + "grad_norm": 1.8828125, + "learning_rate": 7.409045242556151e-05, + "loss": 1.6124, + "step": 7785 + }, + { + "epoch": 0.3403120765767735, + "grad_norm": 2.53125, + "learning_rate": 7.408443334527257e-05, + "loss": 1.0824, + "step": 7786 + }, + { + "epoch": 0.34035578478080336, + "grad_norm": 28.25, + "learning_rate": 7.407841381047532e-05, + "loss": 2.9776, + "step": 7787 + }, + { + "epoch": 0.34039949298483324, + "grad_norm": 2.328125, + "learning_rate": 7.407239382128336e-05, + "loss": 1.7465, + "step": 7788 + }, + { + "epoch": 0.3404432011888632, + "grad_norm": 2.25, + "learning_rate": 7.406637337781031e-05, + "loss": 2.5876, + "step": 7789 + }, + { + "epoch": 0.34048690939289306, + "grad_norm": 2.984375, + "learning_rate": 7.406035248016973e-05, + "loss": 1.9449, + "step": 7790 + }, + { + "epoch": 0.34053061759692294, + "grad_norm": 1.953125, + "learning_rate": 7.40543311284753e-05, + "loss": 1.52, + "step": 7791 + }, + { + "epoch": 0.3405743258009528, + "grad_norm": 2.03125, + "learning_rate": 7.404830932284064e-05, + "loss": 1.8827, + "step": 7792 + }, + { + "epoch": 0.34061803400498275, + "grad_norm": 2.359375, + "learning_rate": 7.404228706337937e-05, + "loss": 2.3619, + "step": 7793 + }, + { + "epoch": 0.34066174220901263, + "grad_norm": 2.515625, + "learning_rate": 7.403626435020516e-05, + "loss": 1.6467, + "step": 7794 + }, + { + "epoch": 0.3407054504130425, + "grad_norm": 2.34375, + "learning_rate": 7.403024118343167e-05, + "loss": 2.2609, + "step": 7795 + }, + { + "epoch": 0.34074915861707245, + "grad_norm": 1.8671875, + "learning_rate": 7.402421756317252e-05, + "loss": 1.1447, + "step": 7796 + }, + { + "epoch": 0.3407928668211023, + "grad_norm": 2.15625, + "learning_rate": 7.401819348954144e-05, + "loss": 1.6363, + "step": 7797 + }, + { + "epoch": 0.3408365750251322, + "grad_norm": 2.171875, + "learning_rate": 7.401216896265208e-05, + "loss": 1.7098, + "step": 7798 + }, + { + "epoch": 0.34088028322916214, + "grad_norm": 1.9453125, + "learning_rate": 7.400614398261817e-05, + "loss": 2.1047, + "step": 7799 + }, + { + "epoch": 0.340923991433192, + "grad_norm": 1.765625, + "learning_rate": 7.400011854955336e-05, + "loss": 1.68, + "step": 7800 + }, + { + "epoch": 0.3409676996372219, + "grad_norm": 1.8203125, + "learning_rate": 7.399409266357139e-05, + "loss": 1.8467, + "step": 7801 + }, + { + "epoch": 0.3410114078412518, + "grad_norm": 3.0, + "learning_rate": 7.398806632478598e-05, + "loss": 2.5555, + "step": 7802 + }, + { + "epoch": 0.3410551160452817, + "grad_norm": 2.46875, + "learning_rate": 7.398203953331083e-05, + "loss": 1.5894, + "step": 7803 + }, + { + "epoch": 0.3410988242493116, + "grad_norm": 2.15625, + "learning_rate": 7.39760122892597e-05, + "loss": 2.3018, + "step": 7804 + }, + { + "epoch": 0.3411425324533415, + "grad_norm": 2.546875, + "learning_rate": 7.396998459274632e-05, + "loss": 1.5333, + "step": 7805 + }, + { + "epoch": 0.3411862406573714, + "grad_norm": 2.625, + "learning_rate": 7.396395644388443e-05, + "loss": 2.7594, + "step": 7806 + }, + { + "epoch": 0.3412299488614013, + "grad_norm": 2.078125, + "learning_rate": 7.395792784278783e-05, + "loss": 1.6819, + "step": 7807 + }, + { + "epoch": 0.34127365706543117, + "grad_norm": 2.359375, + "learning_rate": 7.395189878957025e-05, + "loss": 2.0002, + "step": 7808 + }, + { + "epoch": 0.3413173652694611, + "grad_norm": 1.953125, + "learning_rate": 7.394586928434549e-05, + "loss": 1.7576, + "step": 7809 + }, + { + "epoch": 0.341361073473491, + "grad_norm": 2.015625, + "learning_rate": 7.39398393272273e-05, + "loss": 1.5846, + "step": 7810 + }, + { + "epoch": 0.34140478167752086, + "grad_norm": 2.328125, + "learning_rate": 7.393380891832951e-05, + "loss": 1.8085, + "step": 7811 + }, + { + "epoch": 0.34144848988155074, + "grad_norm": 3.328125, + "learning_rate": 7.392777805776592e-05, + "loss": 2.6784, + "step": 7812 + }, + { + "epoch": 0.3414921980855807, + "grad_norm": 1.8828125, + "learning_rate": 7.392174674565031e-05, + "loss": 1.4711, + "step": 7813 + }, + { + "epoch": 0.34153590628961056, + "grad_norm": 1.984375, + "learning_rate": 7.391571498209654e-05, + "loss": 1.9854, + "step": 7814 + }, + { + "epoch": 0.34157961449364044, + "grad_norm": 1.96875, + "learning_rate": 7.390968276721844e-05, + "loss": 1.5118, + "step": 7815 + }, + { + "epoch": 0.3416233226976704, + "grad_norm": 2.34375, + "learning_rate": 7.390365010112979e-05, + "loss": 2.5905, + "step": 7816 + }, + { + "epoch": 0.34166703090170025, + "grad_norm": 2.078125, + "learning_rate": 7.389761698394449e-05, + "loss": 1.9055, + "step": 7817 + }, + { + "epoch": 0.34171073910573013, + "grad_norm": 2.828125, + "learning_rate": 7.389158341577638e-05, + "loss": 1.9238, + "step": 7818 + }, + { + "epoch": 0.34175444730976007, + "grad_norm": 2.03125, + "learning_rate": 7.388554939673931e-05, + "loss": 1.9666, + "step": 7819 + }, + { + "epoch": 0.34179815551378995, + "grad_norm": 1.8515625, + "learning_rate": 7.387951492694717e-05, + "loss": 1.648, + "step": 7820 + }, + { + "epoch": 0.3418418637178198, + "grad_norm": 2.5, + "learning_rate": 7.387348000651381e-05, + "loss": 2.3185, + "step": 7821 + }, + { + "epoch": 0.3418855719218497, + "grad_norm": 2.390625, + "learning_rate": 7.386744463555316e-05, + "loss": 1.9617, + "step": 7822 + }, + { + "epoch": 0.34192928012587964, + "grad_norm": 2.4375, + "learning_rate": 7.386140881417907e-05, + "loss": 1.8481, + "step": 7823 + }, + { + "epoch": 0.3419729883299095, + "grad_norm": 2.03125, + "learning_rate": 7.385537254250549e-05, + "loss": 1.8291, + "step": 7824 + }, + { + "epoch": 0.3420166965339394, + "grad_norm": 3.734375, + "learning_rate": 7.38493358206463e-05, + "loss": 1.8099, + "step": 7825 + }, + { + "epoch": 0.34206040473796934, + "grad_norm": 2.75, + "learning_rate": 7.384329864871542e-05, + "loss": 2.5644, + "step": 7826 + }, + { + "epoch": 0.3421041129419992, + "grad_norm": 2.296875, + "learning_rate": 7.38372610268268e-05, + "loss": 1.8474, + "step": 7827 + }, + { + "epoch": 0.3421478211460291, + "grad_norm": 3.046875, + "learning_rate": 7.383122295509437e-05, + "loss": 2.8546, + "step": 7828 + }, + { + "epoch": 0.34219152935005903, + "grad_norm": 2.1875, + "learning_rate": 7.382518443363208e-05, + "loss": 2.1391, + "step": 7829 + }, + { + "epoch": 0.3422352375540889, + "grad_norm": 2.15625, + "learning_rate": 7.38191454625539e-05, + "loss": 2.2937, + "step": 7830 + }, + { + "epoch": 0.3422789457581188, + "grad_norm": 2.0625, + "learning_rate": 7.381310604197375e-05, + "loss": 1.79, + "step": 7831 + }, + { + "epoch": 0.34232265396214867, + "grad_norm": 2.65625, + "learning_rate": 7.380706617200564e-05, + "loss": 2.197, + "step": 7832 + }, + { + "epoch": 0.3423663621661786, + "grad_norm": 2.0, + "learning_rate": 7.380102585276355e-05, + "loss": 2.2665, + "step": 7833 + }, + { + "epoch": 0.3424100703702085, + "grad_norm": 1.9921875, + "learning_rate": 7.379498508436146e-05, + "loss": 1.7406, + "step": 7834 + }, + { + "epoch": 0.34245377857423837, + "grad_norm": 2.15625, + "learning_rate": 7.378894386691337e-05, + "loss": 2.1328, + "step": 7835 + }, + { + "epoch": 0.3424974867782683, + "grad_norm": 2.03125, + "learning_rate": 7.378290220053328e-05, + "loss": 1.7703, + "step": 7836 + }, + { + "epoch": 0.3425411949822982, + "grad_norm": 2.359375, + "learning_rate": 7.377686008533521e-05, + "loss": 2.2732, + "step": 7837 + }, + { + "epoch": 0.34258490318632806, + "grad_norm": 2.953125, + "learning_rate": 7.377081752143319e-05, + "loss": 1.8219, + "step": 7838 + }, + { + "epoch": 0.342628611390358, + "grad_norm": 1.921875, + "learning_rate": 7.376477450894124e-05, + "loss": 1.7202, + "step": 7839 + }, + { + "epoch": 0.3426723195943879, + "grad_norm": 1.9609375, + "learning_rate": 7.375873104797341e-05, + "loss": 1.6177, + "step": 7840 + }, + { + "epoch": 0.34271602779841775, + "grad_norm": 2.515625, + "learning_rate": 7.375268713864374e-05, + "loss": 1.2808, + "step": 7841 + }, + { + "epoch": 0.34275973600244763, + "grad_norm": 1.9765625, + "learning_rate": 7.374664278106631e-05, + "loss": 2.2372, + "step": 7842 + }, + { + "epoch": 0.34280344420647757, + "grad_norm": 1.8359375, + "learning_rate": 7.374059797535517e-05, + "loss": 2.0804, + "step": 7843 + }, + { + "epoch": 0.34284715241050745, + "grad_norm": 1.796875, + "learning_rate": 7.373455272162438e-05, + "loss": 1.7002, + "step": 7844 + }, + { + "epoch": 0.34289086061453733, + "grad_norm": 2.75, + "learning_rate": 7.372850701998803e-05, + "loss": 1.9006, + "step": 7845 + }, + { + "epoch": 0.34293456881856726, + "grad_norm": 2.15625, + "learning_rate": 7.372246087056023e-05, + "loss": 2.5089, + "step": 7846 + }, + { + "epoch": 0.34297827702259714, + "grad_norm": 1.9921875, + "learning_rate": 7.371641427345506e-05, + "loss": 1.567, + "step": 7847 + }, + { + "epoch": 0.343021985226627, + "grad_norm": 2.21875, + "learning_rate": 7.371036722878664e-05, + "loss": 2.1754, + "step": 7848 + }, + { + "epoch": 0.34306569343065696, + "grad_norm": 3.265625, + "learning_rate": 7.370431973666909e-05, + "loss": 2.6514, + "step": 7849 + }, + { + "epoch": 0.34310940163468684, + "grad_norm": 2.25, + "learning_rate": 7.369827179721651e-05, + "loss": 1.7452, + "step": 7850 + }, + { + "epoch": 0.3431531098387167, + "grad_norm": 2.078125, + "learning_rate": 7.369222341054305e-05, + "loss": 1.6974, + "step": 7851 + }, + { + "epoch": 0.3431968180427466, + "grad_norm": 2.4375, + "learning_rate": 7.368617457676286e-05, + "loss": 2.2866, + "step": 7852 + }, + { + "epoch": 0.34324052624677653, + "grad_norm": 3.578125, + "learning_rate": 7.36801252959901e-05, + "loss": 2.6568, + "step": 7853 + }, + { + "epoch": 0.3432842344508064, + "grad_norm": 3.03125, + "learning_rate": 7.367407556833887e-05, + "loss": 1.8624, + "step": 7854 + }, + { + "epoch": 0.3433279426548363, + "grad_norm": 2.234375, + "learning_rate": 7.366802539392341e-05, + "loss": 2.464, + "step": 7855 + }, + { + "epoch": 0.34337165085886623, + "grad_norm": 2.421875, + "learning_rate": 7.366197477285785e-05, + "loss": 2.198, + "step": 7856 + }, + { + "epoch": 0.3434153590628961, + "grad_norm": 1.9140625, + "learning_rate": 7.365592370525639e-05, + "loss": 2.274, + "step": 7857 + }, + { + "epoch": 0.343459067266926, + "grad_norm": 2.203125, + "learning_rate": 7.364987219123323e-05, + "loss": 1.6779, + "step": 7858 + }, + { + "epoch": 0.3435027754709559, + "grad_norm": 2.03125, + "learning_rate": 7.364382023090255e-05, + "loss": 2.1957, + "step": 7859 + }, + { + "epoch": 0.3435464836749858, + "grad_norm": 2.296875, + "learning_rate": 7.363776782437857e-05, + "loss": 1.9011, + "step": 7860 + }, + { + "epoch": 0.3435901918790157, + "grad_norm": 2.703125, + "learning_rate": 7.36317149717755e-05, + "loss": 2.4219, + "step": 7861 + }, + { + "epoch": 0.34363390008304556, + "grad_norm": 2.203125, + "learning_rate": 7.362566167320759e-05, + "loss": 1.9962, + "step": 7862 + }, + { + "epoch": 0.3436776082870755, + "grad_norm": 1.8359375, + "learning_rate": 7.361960792878906e-05, + "loss": 1.5491, + "step": 7863 + }, + { + "epoch": 0.3437213164911054, + "grad_norm": 1.6953125, + "learning_rate": 7.361355373863414e-05, + "loss": 1.6535, + "step": 7864 + }, + { + "epoch": 0.34376502469513526, + "grad_norm": 2.546875, + "learning_rate": 7.360749910285711e-05, + "loss": 1.7978, + "step": 7865 + }, + { + "epoch": 0.3438087328991652, + "grad_norm": 2.09375, + "learning_rate": 7.360144402157218e-05, + "loss": 1.3264, + "step": 7866 + }, + { + "epoch": 0.3438524411031951, + "grad_norm": 1.96875, + "learning_rate": 7.359538849489367e-05, + "loss": 1.3655, + "step": 7867 + }, + { + "epoch": 0.34389614930722495, + "grad_norm": 2.09375, + "learning_rate": 7.358933252293585e-05, + "loss": 1.5282, + "step": 7868 + }, + { + "epoch": 0.3439398575112549, + "grad_norm": 2.65625, + "learning_rate": 7.3583276105813e-05, + "loss": 2.3046, + "step": 7869 + }, + { + "epoch": 0.34398356571528477, + "grad_norm": 1.8203125, + "learning_rate": 7.357721924363937e-05, + "loss": 1.8875, + "step": 7870 + }, + { + "epoch": 0.34402727391931465, + "grad_norm": 1.859375, + "learning_rate": 7.357116193652931e-05, + "loss": 1.6185, + "step": 7871 + }, + { + "epoch": 0.3440709821233445, + "grad_norm": 2.546875, + "learning_rate": 7.356510418459714e-05, + "loss": 1.548, + "step": 7872 + }, + { + "epoch": 0.34411469032737446, + "grad_norm": 2.109375, + "learning_rate": 7.355904598795713e-05, + "loss": 1.9057, + "step": 7873 + }, + { + "epoch": 0.34415839853140434, + "grad_norm": 2.078125, + "learning_rate": 7.355298734672364e-05, + "loss": 1.7803, + "step": 7874 + }, + { + "epoch": 0.3442021067354342, + "grad_norm": 1.9921875, + "learning_rate": 7.354692826101102e-05, + "loss": 2.1805, + "step": 7875 + }, + { + "epoch": 0.34424581493946416, + "grad_norm": 3.1875, + "learning_rate": 7.354086873093356e-05, + "loss": 1.8715, + "step": 7876 + }, + { + "epoch": 0.34428952314349404, + "grad_norm": 2.0625, + "learning_rate": 7.353480875660566e-05, + "loss": 1.8297, + "step": 7877 + }, + { + "epoch": 0.3443332313475239, + "grad_norm": 2.75, + "learning_rate": 7.352874833814168e-05, + "loss": 2.1321, + "step": 7878 + }, + { + "epoch": 0.34437693955155385, + "grad_norm": 2.3125, + "learning_rate": 7.352268747565596e-05, + "loss": 2.0992, + "step": 7879 + }, + { + "epoch": 0.34442064775558373, + "grad_norm": 1.890625, + "learning_rate": 7.351662616926289e-05, + "loss": 1.477, + "step": 7880 + }, + { + "epoch": 0.3444643559596136, + "grad_norm": 2.03125, + "learning_rate": 7.351056441907687e-05, + "loss": 1.5112, + "step": 7881 + }, + { + "epoch": 0.3445080641636435, + "grad_norm": 2.0625, + "learning_rate": 7.350450222521226e-05, + "loss": 1.8943, + "step": 7882 + }, + { + "epoch": 0.3445517723676734, + "grad_norm": 2.40625, + "learning_rate": 7.34984395877835e-05, + "loss": 1.7028, + "step": 7883 + }, + { + "epoch": 0.3445954805717033, + "grad_norm": 2.0, + "learning_rate": 7.349237650690497e-05, + "loss": 1.7619, + "step": 7884 + }, + { + "epoch": 0.3446391887757332, + "grad_norm": 2.140625, + "learning_rate": 7.348631298269114e-05, + "loss": 1.6977, + "step": 7885 + }, + { + "epoch": 0.3446828969797631, + "grad_norm": 2.609375, + "learning_rate": 7.348024901525635e-05, + "loss": 2.221, + "step": 7886 + }, + { + "epoch": 0.344726605183793, + "grad_norm": 2.640625, + "learning_rate": 7.347418460471511e-05, + "loss": 1.3263, + "step": 7887 + }, + { + "epoch": 0.3447703133878229, + "grad_norm": 2.21875, + "learning_rate": 7.346811975118185e-05, + "loss": 1.9857, + "step": 7888 + }, + { + "epoch": 0.3448140215918528, + "grad_norm": 2.296875, + "learning_rate": 7.346205445477101e-05, + "loss": 1.9187, + "step": 7889 + }, + { + "epoch": 0.3448577297958827, + "grad_norm": 2.140625, + "learning_rate": 7.345598871559706e-05, + "loss": 2.4341, + "step": 7890 + }, + { + "epoch": 0.3449014379999126, + "grad_norm": 2.5625, + "learning_rate": 7.344992253377445e-05, + "loss": 1.9575, + "step": 7891 + }, + { + "epoch": 0.34494514620394245, + "grad_norm": 2.125, + "learning_rate": 7.344385590941768e-05, + "loss": 1.8408, + "step": 7892 + }, + { + "epoch": 0.3449888544079724, + "grad_norm": 2.578125, + "learning_rate": 7.343778884264123e-05, + "loss": 1.9551, + "step": 7893 + }, + { + "epoch": 0.34503256261200227, + "grad_norm": 2.078125, + "learning_rate": 7.343172133355958e-05, + "loss": 1.682, + "step": 7894 + }, + { + "epoch": 0.34507627081603215, + "grad_norm": 2.171875, + "learning_rate": 7.342565338228726e-05, + "loss": 1.4362, + "step": 7895 + }, + { + "epoch": 0.3451199790200621, + "grad_norm": 2.078125, + "learning_rate": 7.341958498893876e-05, + "loss": 1.7903, + "step": 7896 + }, + { + "epoch": 0.34516368722409196, + "grad_norm": 2.25, + "learning_rate": 7.34135161536286e-05, + "loss": 1.8678, + "step": 7897 + }, + { + "epoch": 0.34520739542812184, + "grad_norm": 1.7890625, + "learning_rate": 7.340744687647133e-05, + "loss": 1.4317, + "step": 7898 + }, + { + "epoch": 0.3452511036321518, + "grad_norm": 2.09375, + "learning_rate": 7.340137715758146e-05, + "loss": 1.769, + "step": 7899 + }, + { + "epoch": 0.34529481183618166, + "grad_norm": 2.6875, + "learning_rate": 7.339530699707354e-05, + "loss": 1.5794, + "step": 7900 + }, + { + "epoch": 0.34533852004021154, + "grad_norm": 3.25, + "learning_rate": 7.338923639506213e-05, + "loss": 2.1146, + "step": 7901 + }, + { + "epoch": 0.3453822282442414, + "grad_norm": 2.265625, + "learning_rate": 7.338316535166179e-05, + "loss": 2.3914, + "step": 7902 + }, + { + "epoch": 0.34542593644827135, + "grad_norm": 2.03125, + "learning_rate": 7.337709386698709e-05, + "loss": 1.5035, + "step": 7903 + }, + { + "epoch": 0.34546964465230123, + "grad_norm": 2.9375, + "learning_rate": 7.33710219411526e-05, + "loss": 2.1352, + "step": 7904 + }, + { + "epoch": 0.3455133528563311, + "grad_norm": 3.015625, + "learning_rate": 7.336494957427292e-05, + "loss": 1.7889, + "step": 7905 + }, + { + "epoch": 0.34555706106036105, + "grad_norm": 1.9296875, + "learning_rate": 7.335887676646263e-05, + "loss": 2.0182, + "step": 7906 + }, + { + "epoch": 0.34560076926439093, + "grad_norm": 2.140625, + "learning_rate": 7.335280351783632e-05, + "loss": 1.5878, + "step": 7907 + }, + { + "epoch": 0.3456444774684208, + "grad_norm": 2.375, + "learning_rate": 7.334672982850865e-05, + "loss": 2.4043, + "step": 7908 + }, + { + "epoch": 0.34568818567245074, + "grad_norm": 2.5, + "learning_rate": 7.334065569859419e-05, + "loss": 2.0702, + "step": 7909 + }, + { + "epoch": 0.3457318938764806, + "grad_norm": 2.09375, + "learning_rate": 7.333458112820758e-05, + "loss": 1.916, + "step": 7910 + }, + { + "epoch": 0.3457756020805105, + "grad_norm": 2.953125, + "learning_rate": 7.332850611746346e-05, + "loss": 2.2221, + "step": 7911 + }, + { + "epoch": 0.3458193102845404, + "grad_norm": 2.171875, + "learning_rate": 7.332243066647651e-05, + "loss": 2.2696, + "step": 7912 + }, + { + "epoch": 0.3458630184885703, + "grad_norm": 2.359375, + "learning_rate": 7.331635477536131e-05, + "loss": 1.8497, + "step": 7913 + }, + { + "epoch": 0.3459067266926002, + "grad_norm": 2.21875, + "learning_rate": 7.331027844423258e-05, + "loss": 1.73, + "step": 7914 + }, + { + "epoch": 0.3459504348966301, + "grad_norm": 2.46875, + "learning_rate": 7.330420167320498e-05, + "loss": 2.7669, + "step": 7915 + }, + { + "epoch": 0.34599414310066, + "grad_norm": 6.78125, + "learning_rate": 7.329812446239315e-05, + "loss": 2.766, + "step": 7916 + }, + { + "epoch": 0.3460378513046899, + "grad_norm": 1.90625, + "learning_rate": 7.329204681191183e-05, + "loss": 1.4046, + "step": 7917 + }, + { + "epoch": 0.34608155950871977, + "grad_norm": 3.59375, + "learning_rate": 7.328596872187567e-05, + "loss": 2.738, + "step": 7918 + }, + { + "epoch": 0.3461252677127497, + "grad_norm": 2.359375, + "learning_rate": 7.327989019239938e-05, + "loss": 2.2535, + "step": 7919 + }, + { + "epoch": 0.3461689759167796, + "grad_norm": 2.75, + "learning_rate": 7.32738112235977e-05, + "loss": 2.1258, + "step": 7920 + }, + { + "epoch": 0.34621268412080947, + "grad_norm": 2.390625, + "learning_rate": 7.326773181558532e-05, + "loss": 1.9796, + "step": 7921 + }, + { + "epoch": 0.34625639232483935, + "grad_norm": 2.203125, + "learning_rate": 7.326165196847697e-05, + "loss": 1.7916, + "step": 7922 + }, + { + "epoch": 0.3463001005288693, + "grad_norm": 1.84375, + "learning_rate": 7.32555716823874e-05, + "loss": 1.8068, + "step": 7923 + }, + { + "epoch": 0.34634380873289916, + "grad_norm": 3.53125, + "learning_rate": 7.324949095743134e-05, + "loss": 2.206, + "step": 7924 + }, + { + "epoch": 0.34638751693692904, + "grad_norm": 2.578125, + "learning_rate": 7.324340979372356e-05, + "loss": 2.2156, + "step": 7925 + }, + { + "epoch": 0.346431225140959, + "grad_norm": 2.015625, + "learning_rate": 7.32373281913788e-05, + "loss": 1.6678, + "step": 7926 + }, + { + "epoch": 0.34647493334498886, + "grad_norm": 2.25, + "learning_rate": 7.323124615051183e-05, + "loss": 1.9071, + "step": 7927 + }, + { + "epoch": 0.34651864154901874, + "grad_norm": 3.640625, + "learning_rate": 7.322516367123744e-05, + "loss": 2.493, + "step": 7928 + }, + { + "epoch": 0.34656234975304867, + "grad_norm": 3.515625, + "learning_rate": 7.321908075367041e-05, + "loss": 1.8614, + "step": 7929 + }, + { + "epoch": 0.34660605795707855, + "grad_norm": 2.046875, + "learning_rate": 7.321299739792552e-05, + "loss": 1.7816, + "step": 7930 + }, + { + "epoch": 0.34664976616110843, + "grad_norm": 2.21875, + "learning_rate": 7.32069136041176e-05, + "loss": 1.5655, + "step": 7931 + }, + { + "epoch": 0.3466934743651383, + "grad_norm": 3.453125, + "learning_rate": 7.320082937236144e-05, + "loss": 1.8418, + "step": 7932 + }, + { + "epoch": 0.34673718256916825, + "grad_norm": 3.59375, + "learning_rate": 7.319474470277187e-05, + "loss": 2.4229, + "step": 7933 + }, + { + "epoch": 0.3467808907731981, + "grad_norm": 1.9609375, + "learning_rate": 7.318865959546369e-05, + "loss": 1.6238, + "step": 7934 + }, + { + "epoch": 0.346824598977228, + "grad_norm": 2.21875, + "learning_rate": 7.318257405055178e-05, + "loss": 1.6683, + "step": 7935 + }, + { + "epoch": 0.34686830718125794, + "grad_norm": 2.328125, + "learning_rate": 7.317648806815094e-05, + "loss": 2.9313, + "step": 7936 + }, + { + "epoch": 0.3469120153852878, + "grad_norm": 1.9296875, + "learning_rate": 7.317040164837604e-05, + "loss": 1.7694, + "step": 7937 + }, + { + "epoch": 0.3469557235893177, + "grad_norm": 1.9375, + "learning_rate": 7.316431479134194e-05, + "loss": 1.7356, + "step": 7938 + }, + { + "epoch": 0.34699943179334763, + "grad_norm": 1.7578125, + "learning_rate": 7.31582274971635e-05, + "loss": 1.583, + "step": 7939 + }, + { + "epoch": 0.3470431399973775, + "grad_norm": 2.140625, + "learning_rate": 7.315213976595561e-05, + "loss": 1.7361, + "step": 7940 + }, + { + "epoch": 0.3470868482014074, + "grad_norm": 2.078125, + "learning_rate": 7.314605159783314e-05, + "loss": 1.5803, + "step": 7941 + }, + { + "epoch": 0.3471305564054373, + "grad_norm": 1.8984375, + "learning_rate": 7.313996299291098e-05, + "loss": 1.6847, + "step": 7942 + }, + { + "epoch": 0.3471742646094672, + "grad_norm": 2.28125, + "learning_rate": 7.313387395130406e-05, + "loss": 1.7097, + "step": 7943 + }, + { + "epoch": 0.3472179728134971, + "grad_norm": 2.171875, + "learning_rate": 7.312778447312725e-05, + "loss": 2.1191, + "step": 7944 + }, + { + "epoch": 0.34726168101752697, + "grad_norm": 2.28125, + "learning_rate": 7.312169455849551e-05, + "loss": 2.0175, + "step": 7945 + }, + { + "epoch": 0.3473053892215569, + "grad_norm": 4.1875, + "learning_rate": 7.311560420752373e-05, + "loss": 2.3162, + "step": 7946 + }, + { + "epoch": 0.3473490974255868, + "grad_norm": 2.296875, + "learning_rate": 7.310951342032684e-05, + "loss": 1.7003, + "step": 7947 + }, + { + "epoch": 0.34739280562961666, + "grad_norm": 2.578125, + "learning_rate": 7.310342219701981e-05, + "loss": 2.1897, + "step": 7948 + }, + { + "epoch": 0.3474365138336466, + "grad_norm": 2.421875, + "learning_rate": 7.309733053771758e-05, + "loss": 2.1538, + "step": 7949 + }, + { + "epoch": 0.3474802220376765, + "grad_norm": 2.390625, + "learning_rate": 7.309123844253511e-05, + "loss": 1.8306, + "step": 7950 + }, + { + "epoch": 0.34752393024170636, + "grad_norm": 1.8046875, + "learning_rate": 7.308514591158735e-05, + "loss": 1.3273, + "step": 7951 + }, + { + "epoch": 0.34756763844573624, + "grad_norm": 1.9453125, + "learning_rate": 7.307905294498929e-05, + "loss": 1.599, + "step": 7952 + }, + { + "epoch": 0.3476113466497662, + "grad_norm": 3.015625, + "learning_rate": 7.30729595428559e-05, + "loss": 2.3818, + "step": 7953 + }, + { + "epoch": 0.34765505485379605, + "grad_norm": 2.234375, + "learning_rate": 7.306686570530221e-05, + "loss": 2.0856, + "step": 7954 + }, + { + "epoch": 0.34769876305782593, + "grad_norm": 2.140625, + "learning_rate": 7.30607714324432e-05, + "loss": 2.4284, + "step": 7955 + }, + { + "epoch": 0.34774247126185587, + "grad_norm": 2.21875, + "learning_rate": 7.305467672439384e-05, + "loss": 1.9622, + "step": 7956 + }, + { + "epoch": 0.34778617946588575, + "grad_norm": 1.9609375, + "learning_rate": 7.304858158126917e-05, + "loss": 1.7486, + "step": 7957 + }, + { + "epoch": 0.3478298876699156, + "grad_norm": 2.0625, + "learning_rate": 7.304248600318425e-05, + "loss": 1.6015, + "step": 7958 + }, + { + "epoch": 0.34787359587394556, + "grad_norm": 2.171875, + "learning_rate": 7.303638999025406e-05, + "loss": 1.9826, + "step": 7959 + }, + { + "epoch": 0.34791730407797544, + "grad_norm": 3.171875, + "learning_rate": 7.303029354259367e-05, + "loss": 1.6595, + "step": 7960 + }, + { + "epoch": 0.3479610122820053, + "grad_norm": 3.46875, + "learning_rate": 7.302419666031813e-05, + "loss": 1.8291, + "step": 7961 + }, + { + "epoch": 0.3480047204860352, + "grad_norm": 2.203125, + "learning_rate": 7.301809934354248e-05, + "loss": 1.7561, + "step": 7962 + }, + { + "epoch": 0.34804842869006514, + "grad_norm": 2.359375, + "learning_rate": 7.30120015923818e-05, + "loss": 2.1202, + "step": 7963 + }, + { + "epoch": 0.348092136894095, + "grad_norm": 1.859375, + "learning_rate": 7.300590340695115e-05, + "loss": 1.761, + "step": 7964 + }, + { + "epoch": 0.3481358450981249, + "grad_norm": 2.03125, + "learning_rate": 7.299980478736564e-05, + "loss": 1.82, + "step": 7965 + }, + { + "epoch": 0.34817955330215483, + "grad_norm": 2.15625, + "learning_rate": 7.299370573374031e-05, + "loss": 1.8937, + "step": 7966 + }, + { + "epoch": 0.3482232615061847, + "grad_norm": 2.453125, + "learning_rate": 7.298760624619029e-05, + "loss": 2.0898, + "step": 7967 + }, + { + "epoch": 0.3482669697102146, + "grad_norm": 2.453125, + "learning_rate": 7.29815063248307e-05, + "loss": 2.0204, + "step": 7968 + }, + { + "epoch": 0.3483106779142445, + "grad_norm": 1.8359375, + "learning_rate": 7.297540596977662e-05, + "loss": 1.4455, + "step": 7969 + }, + { + "epoch": 0.3483543861182744, + "grad_norm": 2.09375, + "learning_rate": 7.29693051811432e-05, + "loss": 1.4206, + "step": 7970 + }, + { + "epoch": 0.3483980943223043, + "grad_norm": 2.21875, + "learning_rate": 7.296320395904556e-05, + "loss": 2.2885, + "step": 7971 + }, + { + "epoch": 0.34844180252633417, + "grad_norm": 2.4375, + "learning_rate": 7.295710230359885e-05, + "loss": 2.0993, + "step": 7972 + }, + { + "epoch": 0.3484855107303641, + "grad_norm": 2.078125, + "learning_rate": 7.295100021491818e-05, + "loss": 1.7229, + "step": 7973 + }, + { + "epoch": 0.348529218934394, + "grad_norm": 2.203125, + "learning_rate": 7.294489769311876e-05, + "loss": 1.7387, + "step": 7974 + }, + { + "epoch": 0.34857292713842386, + "grad_norm": 1.9140625, + "learning_rate": 7.293879473831572e-05, + "loss": 1.8183, + "step": 7975 + }, + { + "epoch": 0.3486166353424538, + "grad_norm": 2.09375, + "learning_rate": 7.293269135062424e-05, + "loss": 1.9519, + "step": 7976 + }, + { + "epoch": 0.3486603435464837, + "grad_norm": 1.953125, + "learning_rate": 7.292658753015948e-05, + "loss": 1.6782, + "step": 7977 + }, + { + "epoch": 0.34870405175051356, + "grad_norm": 2.53125, + "learning_rate": 7.292048327703666e-05, + "loss": 2.2993, + "step": 7978 + }, + { + "epoch": 0.3487477599545435, + "grad_norm": 1.9140625, + "learning_rate": 7.291437859137095e-05, + "loss": 1.7695, + "step": 7979 + }, + { + "epoch": 0.34879146815857337, + "grad_norm": 1.8984375, + "learning_rate": 7.290827347327758e-05, + "loss": 1.7113, + "step": 7980 + }, + { + "epoch": 0.34883517636260325, + "grad_norm": 1.7890625, + "learning_rate": 7.290216792287175e-05, + "loss": 1.5739, + "step": 7981 + }, + { + "epoch": 0.34887888456663313, + "grad_norm": 1.9140625, + "learning_rate": 7.289606194026866e-05, + "loss": 1.8195, + "step": 7982 + }, + { + "epoch": 0.34892259277066306, + "grad_norm": 1.9765625, + "learning_rate": 7.288995552558357e-05, + "loss": 1.7554, + "step": 7983 + }, + { + "epoch": 0.34896630097469294, + "grad_norm": 2.09375, + "learning_rate": 7.28838486789317e-05, + "loss": 1.7297, + "step": 7984 + }, + { + "epoch": 0.3490100091787228, + "grad_norm": 1.921875, + "learning_rate": 7.28777414004283e-05, + "loss": 2.0312, + "step": 7985 + }, + { + "epoch": 0.34905371738275276, + "grad_norm": 2.125, + "learning_rate": 7.287163369018863e-05, + "loss": 2.0484, + "step": 7986 + }, + { + "epoch": 0.34909742558678264, + "grad_norm": 2.234375, + "learning_rate": 7.286552554832793e-05, + "loss": 2.0505, + "step": 7987 + }, + { + "epoch": 0.3491411337908125, + "grad_norm": 2.125, + "learning_rate": 7.28594169749615e-05, + "loss": 1.859, + "step": 7988 + }, + { + "epoch": 0.34918484199484245, + "grad_norm": 2.03125, + "learning_rate": 7.285330797020458e-05, + "loss": 2.0386, + "step": 7989 + }, + { + "epoch": 0.34922855019887233, + "grad_norm": 1.6796875, + "learning_rate": 7.28471985341725e-05, + "loss": 1.5225, + "step": 7990 + }, + { + "epoch": 0.3492722584029022, + "grad_norm": 1.8671875, + "learning_rate": 7.284108866698051e-05, + "loss": 1.7554, + "step": 7991 + }, + { + "epoch": 0.3493159666069321, + "grad_norm": 2.765625, + "learning_rate": 7.283497836874396e-05, + "loss": 1.5883, + "step": 7992 + }, + { + "epoch": 0.34935967481096203, + "grad_norm": 1.9375, + "learning_rate": 7.282886763957812e-05, + "loss": 1.3351, + "step": 7993 + }, + { + "epoch": 0.3494033830149919, + "grad_norm": 1.9140625, + "learning_rate": 7.282275647959831e-05, + "loss": 1.6072, + "step": 7994 + }, + { + "epoch": 0.3494470912190218, + "grad_norm": 3.0, + "learning_rate": 7.281664488891988e-05, + "loss": 2.1401, + "step": 7995 + }, + { + "epoch": 0.3494907994230517, + "grad_norm": 1.9921875, + "learning_rate": 7.281053286765815e-05, + "loss": 1.2633, + "step": 7996 + }, + { + "epoch": 0.3495345076270816, + "grad_norm": 2.21875, + "learning_rate": 7.280442041592846e-05, + "loss": 2.2913, + "step": 7997 + }, + { + "epoch": 0.3495782158311115, + "grad_norm": 2.375, + "learning_rate": 7.279830753384618e-05, + "loss": 2.0087, + "step": 7998 + }, + { + "epoch": 0.3496219240351414, + "grad_norm": 2.0625, + "learning_rate": 7.279219422152666e-05, + "loss": 2.0633, + "step": 7999 + }, + { + "epoch": 0.3496656322391713, + "grad_norm": 2.09375, + "learning_rate": 7.278608047908523e-05, + "loss": 1.8298, + "step": 8000 + }, + { + "epoch": 0.3497093404432012, + "grad_norm": 2.703125, + "learning_rate": 7.277996630663734e-05, + "loss": 1.0935, + "step": 8001 + }, + { + "epoch": 0.3497530486472311, + "grad_norm": 8.4375, + "learning_rate": 7.27738517042983e-05, + "loss": 1.3579, + "step": 8002 + }, + { + "epoch": 0.349796756851261, + "grad_norm": 2.015625, + "learning_rate": 7.276773667218354e-05, + "loss": 1.889, + "step": 8003 + }, + { + "epoch": 0.3498404650552909, + "grad_norm": 1.9921875, + "learning_rate": 7.276162121040846e-05, + "loss": 1.8152, + "step": 8004 + }, + { + "epoch": 0.34988417325932075, + "grad_norm": 1.7890625, + "learning_rate": 7.275550531908846e-05, + "loss": 1.5692, + "step": 8005 + }, + { + "epoch": 0.3499278814633507, + "grad_norm": 3.8125, + "learning_rate": 7.274938899833896e-05, + "loss": 2.0927, + "step": 8006 + }, + { + "epoch": 0.34997158966738057, + "grad_norm": 3.59375, + "learning_rate": 7.274327224827535e-05, + "loss": 1.6025, + "step": 8007 + }, + { + "epoch": 0.35001529787141045, + "grad_norm": 2.0625, + "learning_rate": 7.273715506901312e-05, + "loss": 2.1043, + "step": 8008 + }, + { + "epoch": 0.3500590060754404, + "grad_norm": 2.265625, + "learning_rate": 7.273103746066767e-05, + "loss": 2.2297, + "step": 8009 + }, + { + "epoch": 0.35010271427947026, + "grad_norm": 2.328125, + "learning_rate": 7.272491942335447e-05, + "loss": 1.5105, + "step": 8010 + }, + { + "epoch": 0.35014642248350014, + "grad_norm": 2.453125, + "learning_rate": 7.271880095718895e-05, + "loss": 1.8723, + "step": 8011 + }, + { + "epoch": 0.3501901306875301, + "grad_norm": 2.09375, + "learning_rate": 7.27126820622866e-05, + "loss": 1.9394, + "step": 8012 + }, + { + "epoch": 0.35023383889155996, + "grad_norm": 1.9921875, + "learning_rate": 7.270656273876289e-05, + "loss": 1.9664, + "step": 8013 + }, + { + "epoch": 0.35027754709558984, + "grad_norm": 1.921875, + "learning_rate": 7.270044298673328e-05, + "loss": 1.7432, + "step": 8014 + }, + { + "epoch": 0.3503212552996197, + "grad_norm": 1.8671875, + "learning_rate": 7.269432280631327e-05, + "loss": 1.5492, + "step": 8015 + } + ], + "logging_steps": 1, + "max_steps": 22879, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 229, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.2123448025219072e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}