diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -2,9 +2,9 @@ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.5465717757333797, + "epoch": 0.6887693108835272, "eval_steps": 2951, - "global_step": 5904, + "global_step": 7440, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -41352,6 +41352,10758 @@ "learning_rate": 0.02, "loss": 1.565, "step": 5904 + }, + { + "epoch": 0.5466643522536596, + "grad_norm": 0.138671875, + "learning_rate": 0.02, + "loss": 1.5874, + "step": 5905 + }, + { + "epoch": 0.5467569287739397, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.595, + "step": 5906 + }, + { + "epoch": 0.5468495052942197, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5294, + "step": 5907 + }, + { + "epoch": 0.5469420818144998, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5086, + "step": 5908 + }, + { + "epoch": 0.5470346583347798, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5447, + "step": 5909 + }, + { + "epoch": 0.5471272348550599, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.5581, + "step": 5910 + }, + { + "epoch": 0.5472198113753399, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5498, + "step": 5911 + }, + { + "epoch": 0.54731238789562, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.6479, + "step": 5912 + }, + { + "epoch": 0.5474049644159, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.581, + "step": 5913 + }, + { + "epoch": 0.5474975409361801, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.569, + "step": 5914 + }, + { + "epoch": 0.5475901174564601, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.5129, + "step": 5915 + }, + { + "epoch": 0.5476826939767402, + "grad_norm": 0.1376953125, + "learning_rate": 0.02, + "loss": 1.5106, + "step": 5916 + }, + { + "epoch": 0.5477752704970202, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.543, + "step": 5917 + }, + { + "epoch": 0.5478678470173003, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.6299, + "step": 5918 + }, + { + "epoch": 0.5479604235375802, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5737, + "step": 5919 + }, + { + "epoch": 0.5480530000578603, + "grad_norm": 0.138671875, + "learning_rate": 0.02, + "loss": 1.5662, + "step": 5920 + }, + { + "epoch": 0.5481455765781403, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5858, + "step": 5921 + }, + { + "epoch": 0.5482381530984204, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5562, + "step": 5922 + }, + { + "epoch": 0.5483307296187004, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.566, + "step": 5923 + }, + { + "epoch": 0.5484233061389805, + "grad_norm": 0.1396484375, + "learning_rate": 0.02, + "loss": 1.5291, + "step": 5924 + }, + { + "epoch": 0.5485158826592605, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5176, + "step": 5925 + }, + { + "epoch": 0.5486084591795406, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.5948, + "step": 5926 + }, + { + "epoch": 0.5487010356998206, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.562, + "step": 5927 + }, + { + "epoch": 0.5487936122201007, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5255, + "step": 5928 + }, + { + "epoch": 0.5488861887403808, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.4922, + "step": 5929 + }, + { + "epoch": 0.5489787652606608, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5217, + "step": 5930 + }, + { + "epoch": 0.5490713417809409, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.5319, + "step": 5931 + }, + { + "epoch": 0.5491639183012208, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5527, + "step": 5932 + }, + { + "epoch": 0.5492564948215009, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5471, + "step": 5933 + }, + { + "epoch": 0.5493490713417809, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.545, + "step": 5934 + }, + { + "epoch": 0.549441647862061, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5043, + "step": 5935 + }, + { + "epoch": 0.549534224382341, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.5489, + "step": 5936 + }, + { + "epoch": 0.5496268009026211, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5497, + "step": 5937 + }, + { + "epoch": 0.5497193774229011, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5397, + "step": 5938 + }, + { + "epoch": 0.5498119539431812, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.519, + "step": 5939 + }, + { + "epoch": 0.5499045304634612, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.593, + "step": 5940 + }, + { + "epoch": 0.5499971069837413, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5034, + "step": 5941 + }, + { + "epoch": 0.5500896835040213, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5427, + "step": 5942 + }, + { + "epoch": 0.5501822600243014, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.5985, + "step": 5943 + }, + { + "epoch": 0.5502748365445814, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5973, + "step": 5944 + }, + { + "epoch": 0.5503674130648615, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5879, + "step": 5945 + }, + { + "epoch": 0.5504599895851414, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.5132, + "step": 5946 + }, + { + "epoch": 0.5505525661054215, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5282, + "step": 5947 + }, + { + "epoch": 0.5506451426257015, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.5109, + "step": 5948 + }, + { + "epoch": 0.5507377191459816, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5365, + "step": 5949 + }, + { + "epoch": 0.5508302956662616, + "grad_norm": 0.1640625, + "learning_rate": 0.02, + "loss": 1.5245, + "step": 5950 + }, + { + "epoch": 0.5509228721865417, + "grad_norm": 0.1591796875, + "learning_rate": 0.02, + "loss": 1.5004, + "step": 5951 + }, + { + "epoch": 0.5510154487068217, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.5682, + "step": 5952 + }, + { + "epoch": 0.5511080252271018, + "grad_norm": 0.16015625, + "learning_rate": 0.02, + "loss": 1.5883, + "step": 5953 + }, + { + "epoch": 0.5512006017473818, + "grad_norm": 0.1611328125, + "learning_rate": 0.02, + "loss": 1.5506, + "step": 5954 + }, + { + "epoch": 0.5512931782676619, + "grad_norm": 0.1591796875, + "learning_rate": 0.02, + "loss": 1.5621, + "step": 5955 + }, + { + "epoch": 0.5513857547879419, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.5737, + "step": 5956 + }, + { + "epoch": 0.551478331308222, + "grad_norm": 0.1396484375, + "learning_rate": 0.02, + "loss": 1.5726, + "step": 5957 + }, + { + "epoch": 0.551570907828502, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5612, + "step": 5958 + }, + { + "epoch": 0.551663484348782, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.448, + "step": 5959 + }, + { + "epoch": 0.551756060869062, + "grad_norm": 0.1640625, + "learning_rate": 0.02, + "loss": 1.5272, + "step": 5960 + }, + { + "epoch": 0.5518486373893421, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.5679, + "step": 5961 + }, + { + "epoch": 0.5519412139096221, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.4825, + "step": 5962 + }, + { + "epoch": 0.5520337904299022, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.5138, + "step": 5963 + }, + { + "epoch": 0.5521263669501822, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5615, + "step": 5964 + }, + { + "epoch": 0.5522189434704623, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5261, + "step": 5965 + }, + { + "epoch": 0.5523115199907424, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5633, + "step": 5966 + }, + { + "epoch": 0.5524040965110224, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.5021, + "step": 5967 + }, + { + "epoch": 0.5524966730313025, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.6194, + "step": 5968 + }, + { + "epoch": 0.5525892495515825, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5489, + "step": 5969 + }, + { + "epoch": 0.5526818260718626, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5773, + "step": 5970 + }, + { + "epoch": 0.5527744025921426, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.6275, + "step": 5971 + }, + { + "epoch": 0.5528669791124227, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5864, + "step": 5972 + }, + { + "epoch": 0.5529595556327026, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5235, + "step": 5973 + }, + { + "epoch": 0.5530521321529827, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.574, + "step": 5974 + }, + { + "epoch": 0.5531447086732627, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5496, + "step": 5975 + }, + { + "epoch": 0.5532372851935428, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.5547, + "step": 5976 + }, + { + "epoch": 0.5533298617138228, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5179, + "step": 5977 + }, + { + "epoch": 0.5534224382341029, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.6324, + "step": 5978 + }, + { + "epoch": 0.5535150147543829, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.6043, + "step": 5979 + }, + { + "epoch": 0.553607591274663, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5972, + "step": 5980 + }, + { + "epoch": 0.553700167794943, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5851, + "step": 5981 + }, + { + "epoch": 0.5537927443152231, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5137, + "step": 5982 + }, + { + "epoch": 0.5538853208355031, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5073, + "step": 5983 + }, + { + "epoch": 0.5539778973557832, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.5631, + "step": 5984 + }, + { + "epoch": 0.5540704738760632, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5708, + "step": 5985 + }, + { + "epoch": 0.5541630503963433, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.505, + "step": 5986 + }, + { + "epoch": 0.5542556269166232, + "grad_norm": 0.1640625, + "learning_rate": 0.02, + "loss": 1.513, + "step": 5987 + }, + { + "epoch": 0.5543482034369033, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5705, + "step": 5988 + }, + { + "epoch": 0.5544407799571833, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.6026, + "step": 5989 + }, + { + "epoch": 0.5545333564774634, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.505, + "step": 5990 + }, + { + "epoch": 0.5546259329977434, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5464, + "step": 5991 + }, + { + "epoch": 0.5547185095180235, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5789, + "step": 5992 + }, + { + "epoch": 0.5548110860383035, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.6512, + "step": 5993 + }, + { + "epoch": 0.5549036625585836, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.6024, + "step": 5994 + }, + { + "epoch": 0.5549962390788636, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5744, + "step": 5995 + }, + { + "epoch": 0.5550888155991437, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5413, + "step": 5996 + }, + { + "epoch": 0.5551813921194237, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5813, + "step": 5997 + }, + { + "epoch": 0.5552739686397038, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.55, + "step": 5998 + }, + { + "epoch": 0.5553665451599838, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.6071, + "step": 5999 + }, + { + "epoch": 0.5554591216802638, + "grad_norm": 0.1396484375, + "learning_rate": 0.02, + "loss": 1.6078, + "step": 6000 + }, + { + "epoch": 0.5555516982005438, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5247, + "step": 6001 + }, + { + "epoch": 0.5556442747208239, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.6026, + "step": 6002 + }, + { + "epoch": 0.555736851241104, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5895, + "step": 6003 + }, + { + "epoch": 0.555829427761384, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.4875, + "step": 6004 + }, + { + "epoch": 0.555922004281664, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5848, + "step": 6005 + }, + { + "epoch": 0.5560145808019441, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.523, + "step": 6006 + }, + { + "epoch": 0.5561071573222242, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.6117, + "step": 6007 + }, + { + "epoch": 0.5561997338425042, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5251, + "step": 6008 + }, + { + "epoch": 0.5562923103627843, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5458, + "step": 6009 + }, + { + "epoch": 0.5563848868830643, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.6022, + "step": 6010 + }, + { + "epoch": 0.5564774634033444, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5169, + "step": 6011 + }, + { + "epoch": 0.5565700399236244, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5442, + "step": 6012 + }, + { + "epoch": 0.5566626164439045, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.5905, + "step": 6013 + }, + { + "epoch": 0.5567551929641844, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5151, + "step": 6014 + }, + { + "epoch": 0.5568477694844645, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.6318, + "step": 6015 + }, + { + "epoch": 0.5569403460047445, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.507, + "step": 6016 + }, + { + "epoch": 0.5570329225250246, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5276, + "step": 6017 + }, + { + "epoch": 0.5571254990453046, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.552, + "step": 6018 + }, + { + "epoch": 0.5572180755655847, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5458, + "step": 6019 + }, + { + "epoch": 0.5573106520858647, + "grad_norm": 0.1591796875, + "learning_rate": 0.02, + "loss": 1.6208, + "step": 6020 + }, + { + "epoch": 0.5574032286061448, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5786, + "step": 6021 + }, + { + "epoch": 0.5574958051264248, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.559, + "step": 6022 + }, + { + "epoch": 0.5575883816467049, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.5314, + "step": 6023 + }, + { + "epoch": 0.5576809581669849, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.526, + "step": 6024 + }, + { + "epoch": 0.557773534687265, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.6247, + "step": 6025 + }, + { + "epoch": 0.557866111207545, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.6031, + "step": 6026 + }, + { + "epoch": 0.557958687727825, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.521, + "step": 6027 + }, + { + "epoch": 0.558051264248105, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5293, + "step": 6028 + }, + { + "epoch": 0.5581438407683851, + "grad_norm": 0.134765625, + "learning_rate": 0.02, + "loss": 1.5642, + "step": 6029 + }, + { + "epoch": 0.5582364172886651, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.6067, + "step": 6030 + }, + { + "epoch": 0.5583289938089452, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.6204, + "step": 6031 + }, + { + "epoch": 0.5584215703292252, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.5232, + "step": 6032 + }, + { + "epoch": 0.5585141468495053, + "grad_norm": 0.1328125, + "learning_rate": 0.02, + "loss": 1.6223, + "step": 6033 + }, + { + "epoch": 0.5586067233697853, + "grad_norm": 0.1396484375, + "learning_rate": 0.02, + "loss": 1.5342, + "step": 6034 + }, + { + "epoch": 0.5586992998900654, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5241, + "step": 6035 + }, + { + "epoch": 0.5587918764103454, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5571, + "step": 6036 + }, + { + "epoch": 0.5588844529306255, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5282, + "step": 6037 + }, + { + "epoch": 0.5589770294509055, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5159, + "step": 6038 + }, + { + "epoch": 0.5590696059711856, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5565, + "step": 6039 + }, + { + "epoch": 0.5591621824914657, + "grad_norm": 0.16015625, + "learning_rate": 0.02, + "loss": 1.5704, + "step": 6040 + }, + { + "epoch": 0.5592547590117456, + "grad_norm": 0.1591796875, + "learning_rate": 0.02, + "loss": 1.5175, + "step": 6041 + }, + { + "epoch": 0.5593473355320256, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.4938, + "step": 6042 + }, + { + "epoch": 0.5594399120523057, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.5651, + "step": 6043 + }, + { + "epoch": 0.5595324885725858, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5619, + "step": 6044 + }, + { + "epoch": 0.5596250650928658, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5835, + "step": 6045 + }, + { + "epoch": 0.5597176416131459, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.6105, + "step": 6046 + }, + { + "epoch": 0.5598102181334259, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5772, + "step": 6047 + }, + { + "epoch": 0.559902794653706, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5302, + "step": 6048 + }, + { + "epoch": 0.559995371173986, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5118, + "step": 6049 + }, + { + "epoch": 0.5600879476942661, + "grad_norm": 0.1396484375, + "learning_rate": 0.02, + "loss": 1.5045, + "step": 6050 + }, + { + "epoch": 0.5601805242145461, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5051, + "step": 6051 + }, + { + "epoch": 0.5602731007348262, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.6011, + "step": 6052 + }, + { + "epoch": 0.5603656772551062, + "grad_norm": 0.138671875, + "learning_rate": 0.02, + "loss": 1.5594, + "step": 6053 + }, + { + "epoch": 0.5604582537753863, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.5485, + "step": 6054 + }, + { + "epoch": 0.5605508302956662, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5687, + "step": 6055 + }, + { + "epoch": 0.5606434068159463, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5773, + "step": 6056 + }, + { + "epoch": 0.5607359833362263, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5983, + "step": 6057 + }, + { + "epoch": 0.5608285598565064, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5267, + "step": 6058 + }, + { + "epoch": 0.5609211363767864, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.6068, + "step": 6059 + }, + { + "epoch": 0.5610137128970665, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.4765, + "step": 6060 + }, + { + "epoch": 0.5611062894173465, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5653, + "step": 6061 + }, + { + "epoch": 0.5611988659376266, + "grad_norm": 0.16015625, + "learning_rate": 0.02, + "loss": 1.5231, + "step": 6062 + }, + { + "epoch": 0.5612914424579066, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5525, + "step": 6063 + }, + { + "epoch": 0.5613840189781867, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5029, + "step": 6064 + }, + { + "epoch": 0.5614765954984667, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5684, + "step": 6065 + }, + { + "epoch": 0.5615691720187468, + "grad_norm": 0.1689453125, + "learning_rate": 0.02, + "loss": 1.6028, + "step": 6066 + }, + { + "epoch": 0.5616617485390268, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5625, + "step": 6067 + }, + { + "epoch": 0.5617543250593068, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5099, + "step": 6068 + }, + { + "epoch": 0.5618469015795868, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.6117, + "step": 6069 + }, + { + "epoch": 0.5619394780998669, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.558, + "step": 6070 + }, + { + "epoch": 0.5620320546201469, + "grad_norm": 0.1337890625, + "learning_rate": 0.02, + "loss": 1.5471, + "step": 6071 + }, + { + "epoch": 0.562124631140427, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5803, + "step": 6072 + }, + { + "epoch": 0.562217207660707, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.5761, + "step": 6073 + }, + { + "epoch": 0.5623097841809871, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.6077, + "step": 6074 + }, + { + "epoch": 0.5624023607012671, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5395, + "step": 6075 + }, + { + "epoch": 0.5624949372215472, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5342, + "step": 6076 + }, + { + "epoch": 0.5625875137418272, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5355, + "step": 6077 + }, + { + "epoch": 0.5626800902621073, + "grad_norm": 0.138671875, + "learning_rate": 0.02, + "loss": 1.4983, + "step": 6078 + }, + { + "epoch": 0.5627726667823874, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5737, + "step": 6079 + }, + { + "epoch": 0.5628652433026674, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5429, + "step": 6080 + }, + { + "epoch": 0.5629578198229475, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.528, + "step": 6081 + }, + { + "epoch": 0.5630503963432274, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.562, + "step": 6082 + }, + { + "epoch": 0.5631429728635075, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5798, + "step": 6083 + }, + { + "epoch": 0.5632355493837875, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5786, + "step": 6084 + }, + { + "epoch": 0.5633281259040676, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5148, + "step": 6085 + }, + { + "epoch": 0.5634207024243476, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5807, + "step": 6086 + }, + { + "epoch": 0.5635132789446277, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5962, + "step": 6087 + }, + { + "epoch": 0.5636058554649077, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.4903, + "step": 6088 + }, + { + "epoch": 0.5636984319851878, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5515, + "step": 6089 + }, + { + "epoch": 0.5637910085054678, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5646, + "step": 6090 + }, + { + "epoch": 0.5638835850257479, + "grad_norm": 0.138671875, + "learning_rate": 0.02, + "loss": 1.4961, + "step": 6091 + }, + { + "epoch": 0.5639761615460279, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5592, + "step": 6092 + }, + { + "epoch": 0.564068738066308, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5387, + "step": 6093 + }, + { + "epoch": 0.564161314586588, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5201, + "step": 6094 + }, + { + "epoch": 0.564253891106868, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5454, + "step": 6095 + }, + { + "epoch": 0.564346467627148, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.6085, + "step": 6096 + }, + { + "epoch": 0.5644390441474281, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5805, + "step": 6097 + }, + { + "epoch": 0.5645316206677081, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5672, + "step": 6098 + }, + { + "epoch": 0.5646241971879882, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5656, + "step": 6099 + }, + { + "epoch": 0.5647167737082682, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5225, + "step": 6100 + }, + { + "epoch": 0.5648093502285483, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5923, + "step": 6101 + }, + { + "epoch": 0.5649019267488283, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5151, + "step": 6102 + }, + { + "epoch": 0.5649945032691084, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.6001, + "step": 6103 + }, + { + "epoch": 0.5650870797893884, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.6102, + "step": 6104 + }, + { + "epoch": 0.5651796563096685, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5519, + "step": 6105 + }, + { + "epoch": 0.5652722328299485, + "grad_norm": 0.12890625, + "learning_rate": 0.02, + "loss": 1.5612, + "step": 6106 + }, + { + "epoch": 0.5653648093502286, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5942, + "step": 6107 + }, + { + "epoch": 0.5654573858705086, + "grad_norm": 0.1396484375, + "learning_rate": 0.02, + "loss": 1.574, + "step": 6108 + }, + { + "epoch": 0.5655499623907886, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5701, + "step": 6109 + }, + { + "epoch": 0.5656425389110686, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5858, + "step": 6110 + }, + { + "epoch": 0.5657351154313487, + "grad_norm": 0.16015625, + "learning_rate": 0.02, + "loss": 1.5318, + "step": 6111 + }, + { + "epoch": 0.5658276919516287, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5587, + "step": 6112 + }, + { + "epoch": 0.5659202684719088, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.571, + "step": 6113 + }, + { + "epoch": 0.5660128449921888, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5667, + "step": 6114 + }, + { + "epoch": 0.5661054215124689, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.5175, + "step": 6115 + }, + { + "epoch": 0.566197998032749, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.4651, + "step": 6116 + }, + { + "epoch": 0.566290574553029, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5533, + "step": 6117 + }, + { + "epoch": 0.566383151073309, + "grad_norm": 0.1328125, + "learning_rate": 0.02, + "loss": 1.6063, + "step": 6118 + }, + { + "epoch": 0.5664757275935891, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5644, + "step": 6119 + }, + { + "epoch": 0.5665683041138692, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5103, + "step": 6120 + }, + { + "epoch": 0.5666608806341492, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.5295, + "step": 6121 + }, + { + "epoch": 0.5667534571544293, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.4538, + "step": 6122 + }, + { + "epoch": 0.5668460336747092, + "grad_norm": 0.1591796875, + "learning_rate": 0.02, + "loss": 1.5419, + "step": 6123 + }, + { + "epoch": 0.5669386101949893, + "grad_norm": 0.1318359375, + "learning_rate": 0.02, + "loss": 1.5419, + "step": 6124 + }, + { + "epoch": 0.5670311867152693, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.6051, + "step": 6125 + }, + { + "epoch": 0.5671237632355494, + "grad_norm": 0.1376953125, + "learning_rate": 0.02, + "loss": 1.5779, + "step": 6126 + }, + { + "epoch": 0.5672163397558294, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5425, + "step": 6127 + }, + { + "epoch": 0.5673089162761095, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.6006, + "step": 6128 + }, + { + "epoch": 0.5674014927963895, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5269, + "step": 6129 + }, + { + "epoch": 0.5674940693166696, + "grad_norm": 0.1376953125, + "learning_rate": 0.02, + "loss": 1.6145, + "step": 6130 + }, + { + "epoch": 0.5675866458369496, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.552, + "step": 6131 + }, + { + "epoch": 0.5676792223572297, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.4956, + "step": 6132 + }, + { + "epoch": 0.5677717988775097, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5368, + "step": 6133 + }, + { + "epoch": 0.5678643753977898, + "grad_norm": 0.134765625, + "learning_rate": 0.02, + "loss": 1.552, + "step": 6134 + }, + { + "epoch": 0.5679569519180698, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.6023, + "step": 6135 + }, + { + "epoch": 0.5680495284383498, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5595, + "step": 6136 + }, + { + "epoch": 0.5681421049586298, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.621, + "step": 6137 + }, + { + "epoch": 0.5682346814789099, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.6304, + "step": 6138 + }, + { + "epoch": 0.5683272579991899, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.5273, + "step": 6139 + }, + { + "epoch": 0.56841983451947, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.459, + "step": 6140 + }, + { + "epoch": 0.56851241103975, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5311, + "step": 6141 + }, + { + "epoch": 0.5686049875600301, + "grad_norm": 0.1376953125, + "learning_rate": 0.02, + "loss": 1.5889, + "step": 6142 + }, + { + "epoch": 0.5686975640803101, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5465, + "step": 6143 + }, + { + "epoch": 0.5687901406005902, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.5221, + "step": 6144 + }, + { + "epoch": 0.5688827171208702, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.6692, + "step": 6145 + }, + { + "epoch": 0.5689752936411503, + "grad_norm": 0.1376953125, + "learning_rate": 0.02, + "loss": 1.5939, + "step": 6146 + }, + { + "epoch": 0.5690678701614303, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.6172, + "step": 6147 + }, + { + "epoch": 0.5691604466817104, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5864, + "step": 6148 + }, + { + "epoch": 0.5692530232019904, + "grad_norm": 0.1318359375, + "learning_rate": 0.02, + "loss": 1.4831, + "step": 6149 + }, + { + "epoch": 0.5693455997222704, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5976, + "step": 6150 + }, + { + "epoch": 0.5694381762425504, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.5434, + "step": 6151 + }, + { + "epoch": 0.5695307527628305, + "grad_norm": 0.1396484375, + "learning_rate": 0.02, + "loss": 1.4685, + "step": 6152 + }, + { + "epoch": 0.5696233292831105, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5528, + "step": 6153 + }, + { + "epoch": 0.5697159058033906, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5527, + "step": 6154 + }, + { + "epoch": 0.5698084823236707, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.656, + "step": 6155 + }, + { + "epoch": 0.5699010588439507, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5508, + "step": 6156 + }, + { + "epoch": 0.5699936353642308, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5532, + "step": 6157 + }, + { + "epoch": 0.5700862118845108, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5807, + "step": 6158 + }, + { + "epoch": 0.5701787884047909, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5246, + "step": 6159 + }, + { + "epoch": 0.5702713649250709, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5369, + "step": 6160 + }, + { + "epoch": 0.570363941445351, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5218, + "step": 6161 + }, + { + "epoch": 0.570456517965631, + "grad_norm": 0.1396484375, + "learning_rate": 0.02, + "loss": 1.535, + "step": 6162 + }, + { + "epoch": 0.570549094485911, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5569, + "step": 6163 + }, + { + "epoch": 0.570641671006191, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5727, + "step": 6164 + }, + { + "epoch": 0.5707342475264711, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5428, + "step": 6165 + }, + { + "epoch": 0.5708268240467511, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5897, + "step": 6166 + }, + { + "epoch": 0.5709194005670312, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5386, + "step": 6167 + }, + { + "epoch": 0.5710119770873112, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5456, + "step": 6168 + }, + { + "epoch": 0.5711045536075913, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.54, + "step": 6169 + }, + { + "epoch": 0.5711971301278713, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.4929, + "step": 6170 + }, + { + "epoch": 0.5712897066481514, + "grad_norm": 0.1376953125, + "learning_rate": 0.02, + "loss": 1.5632, + "step": 6171 + }, + { + "epoch": 0.5713822831684314, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.6497, + "step": 6172 + }, + { + "epoch": 0.5714748596887115, + "grad_norm": 0.1357421875, + "learning_rate": 0.02, + "loss": 1.5095, + "step": 6173 + }, + { + "epoch": 0.5715674362089915, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5955, + "step": 6174 + }, + { + "epoch": 0.5716600127292716, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.577, + "step": 6175 + }, + { + "epoch": 0.5717525892495516, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5497, + "step": 6176 + }, + { + "epoch": 0.5718451657698316, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5829, + "step": 6177 + }, + { + "epoch": 0.5719377422901116, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.6288, + "step": 6178 + }, + { + "epoch": 0.5720303188103917, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.527, + "step": 6179 + }, + { + "epoch": 0.5721228953306717, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5059, + "step": 6180 + }, + { + "epoch": 0.5722154718509518, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5368, + "step": 6181 + }, + { + "epoch": 0.5723080483712318, + "grad_norm": 0.16015625, + "learning_rate": 0.02, + "loss": 1.6413, + "step": 6182 + }, + { + "epoch": 0.5724006248915119, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5558, + "step": 6183 + }, + { + "epoch": 0.5724932014117919, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.4733, + "step": 6184 + }, + { + "epoch": 0.572585777932072, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5474, + "step": 6185 + }, + { + "epoch": 0.572678354452352, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5813, + "step": 6186 + }, + { + "epoch": 0.5727709309726321, + "grad_norm": 0.162109375, + "learning_rate": 0.02, + "loss": 1.6214, + "step": 6187 + }, + { + "epoch": 0.5728635074929121, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.5323, + "step": 6188 + }, + { + "epoch": 0.5729560840131922, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5501, + "step": 6189 + }, + { + "epoch": 0.5730486605334721, + "grad_norm": 0.16015625, + "learning_rate": 0.02, + "loss": 1.5082, + "step": 6190 + }, + { + "epoch": 0.5731412370537522, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5008, + "step": 6191 + }, + { + "epoch": 0.5732338135740322, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.5606, + "step": 6192 + }, + { + "epoch": 0.5733263900943123, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5407, + "step": 6193 + }, + { + "epoch": 0.5734189666145924, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5331, + "step": 6194 + }, + { + "epoch": 0.5735115431348724, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.4957, + "step": 6195 + }, + { + "epoch": 0.5736041196551525, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5677, + "step": 6196 + }, + { + "epoch": 0.5736966961754325, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.6088, + "step": 6197 + }, + { + "epoch": 0.5737892726957126, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5389, + "step": 6198 + }, + { + "epoch": 0.5738818492159926, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5088, + "step": 6199 + }, + { + "epoch": 0.5739744257362727, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5406, + "step": 6200 + }, + { + "epoch": 0.5740670022565527, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.4772, + "step": 6201 + }, + { + "epoch": 0.5741595787768328, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5772, + "step": 6202 + }, + { + "epoch": 0.5742521552971128, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.558, + "step": 6203 + }, + { + "epoch": 0.5743447318173928, + "grad_norm": 0.1591796875, + "learning_rate": 0.02, + "loss": 1.5736, + "step": 6204 + }, + { + "epoch": 0.5744373083376728, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.4973, + "step": 6205 + }, + { + "epoch": 0.5745298848579529, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5749, + "step": 6206 + }, + { + "epoch": 0.5746224613782329, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.6057, + "step": 6207 + }, + { + "epoch": 0.574715037898513, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.6144, + "step": 6208 + }, + { + "epoch": 0.574807614418793, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5222, + "step": 6209 + }, + { + "epoch": 0.5749001909390731, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5857, + "step": 6210 + }, + { + "epoch": 0.5749927674593531, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.6274, + "step": 6211 + }, + { + "epoch": 0.5750853439796332, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.5724, + "step": 6212 + }, + { + "epoch": 0.5751779204999132, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5634, + "step": 6213 + }, + { + "epoch": 0.5752704970201933, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5589, + "step": 6214 + }, + { + "epoch": 0.5753630735404733, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5267, + "step": 6215 + }, + { + "epoch": 0.5754556500607534, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.58, + "step": 6216 + }, + { + "epoch": 0.5755482265810334, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5863, + "step": 6217 + }, + { + "epoch": 0.5756408031013134, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5335, + "step": 6218 + }, + { + "epoch": 0.5757333796215934, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5579, + "step": 6219 + }, + { + "epoch": 0.5758259561418735, + "grad_norm": 0.16015625, + "learning_rate": 0.02, + "loss": 1.5358, + "step": 6220 + }, + { + "epoch": 0.5759185326621535, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5837, + "step": 6221 + }, + { + "epoch": 0.5760111091824336, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5617, + "step": 6222 + }, + { + "epoch": 0.5761036857027136, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.6017, + "step": 6223 + }, + { + "epoch": 0.5761962622229937, + "grad_norm": 0.1591796875, + "learning_rate": 0.02, + "loss": 1.5398, + "step": 6224 + }, + { + "epoch": 0.5762888387432737, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5637, + "step": 6225 + }, + { + "epoch": 0.5763814152635538, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5417, + "step": 6226 + }, + { + "epoch": 0.5764739917838339, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5244, + "step": 6227 + }, + { + "epoch": 0.5765665683041139, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.634, + "step": 6228 + }, + { + "epoch": 0.576659144824394, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5735, + "step": 6229 + }, + { + "epoch": 0.576751721344674, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.5315, + "step": 6230 + }, + { + "epoch": 0.576844297864954, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5709, + "step": 6231 + }, + { + "epoch": 0.576936874385234, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.627, + "step": 6232 + }, + { + "epoch": 0.577029450905514, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5224, + "step": 6233 + }, + { + "epoch": 0.5771220274257941, + "grad_norm": 0.1376953125, + "learning_rate": 0.02, + "loss": 1.527, + "step": 6234 + }, + { + "epoch": 0.5772146039460742, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.593, + "step": 6235 + }, + { + "epoch": 0.5773071804663542, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.5215, + "step": 6236 + }, + { + "epoch": 0.5773997569866343, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.489, + "step": 6237 + }, + { + "epoch": 0.5774923335069143, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5513, + "step": 6238 + }, + { + "epoch": 0.5775849100271944, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.5078, + "step": 6239 + }, + { + "epoch": 0.5776774865474744, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.4865, + "step": 6240 + }, + { + "epoch": 0.5777700630677545, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.529, + "step": 6241 + }, + { + "epoch": 0.5778626395880345, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.5794, + "step": 6242 + }, + { + "epoch": 0.5779552161083146, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5606, + "step": 6243 + }, + { + "epoch": 0.5780477926285946, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.5319, + "step": 6244 + }, + { + "epoch": 0.5781403691488746, + "grad_norm": 0.16015625, + "learning_rate": 0.02, + "loss": 1.5463, + "step": 6245 + }, + { + "epoch": 0.5782329456691546, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.567, + "step": 6246 + }, + { + "epoch": 0.5783255221894347, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5152, + "step": 6247 + }, + { + "epoch": 0.5784180987097147, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5863, + "step": 6248 + }, + { + "epoch": 0.5785106752299948, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5728, + "step": 6249 + }, + { + "epoch": 0.5786032517502748, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.5268, + "step": 6250 + }, + { + "epoch": 0.5786958282705549, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.5605, + "step": 6251 + }, + { + "epoch": 0.5787884047908349, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.5085, + "step": 6252 + }, + { + "epoch": 0.578880981311115, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5225, + "step": 6253 + }, + { + "epoch": 0.578973557831395, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.5207, + "step": 6254 + }, + { + "epoch": 0.5790661343516751, + "grad_norm": 0.1396484375, + "learning_rate": 0.02, + "loss": 1.5876, + "step": 6255 + }, + { + "epoch": 0.5791587108719551, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.6537, + "step": 6256 + }, + { + "epoch": 0.5792512873922352, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.5429, + "step": 6257 + }, + { + "epoch": 0.5793438639125151, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5922, + "step": 6258 + }, + { + "epoch": 0.5794364404327952, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5318, + "step": 6259 + }, + { + "epoch": 0.5795290169530752, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5117, + "step": 6260 + }, + { + "epoch": 0.5796215934733553, + "grad_norm": 0.1640625, + "learning_rate": 0.02, + "loss": 1.5841, + "step": 6261 + }, + { + "epoch": 0.5797141699936353, + "grad_norm": 0.138671875, + "learning_rate": 0.02, + "loss": 1.541, + "step": 6262 + }, + { + "epoch": 0.5798067465139154, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.5798, + "step": 6263 + }, + { + "epoch": 0.5798993230341954, + "grad_norm": 0.1396484375, + "learning_rate": 0.02, + "loss": 1.551, + "step": 6264 + }, + { + "epoch": 0.5799918995544755, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.537, + "step": 6265 + }, + { + "epoch": 0.5800844760747556, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5439, + "step": 6266 + }, + { + "epoch": 0.5801770525950356, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.4923, + "step": 6267 + }, + { + "epoch": 0.5802696291153157, + "grad_norm": 0.1396484375, + "learning_rate": 0.02, + "loss": 1.4585, + "step": 6268 + }, + { + "epoch": 0.5803622056355957, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5752, + "step": 6269 + }, + { + "epoch": 0.5804547821558758, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.5792, + "step": 6270 + }, + { + "epoch": 0.5805473586761558, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5695, + "step": 6271 + }, + { + "epoch": 0.5806399351964358, + "grad_norm": 0.13671875, + "learning_rate": 0.02, + "loss": 1.538, + "step": 6272 + }, + { + "epoch": 0.5807325117167158, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5323, + "step": 6273 + }, + { + "epoch": 0.5808250882369959, + "grad_norm": 0.13671875, + "learning_rate": 0.02, + "loss": 1.5639, + "step": 6274 + }, + { + "epoch": 0.5809176647572759, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5445, + "step": 6275 + }, + { + "epoch": 0.581010241277556, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5044, + "step": 6276 + }, + { + "epoch": 0.581102817797836, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.4582, + "step": 6277 + }, + { + "epoch": 0.5811953943181161, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5625, + "step": 6278 + }, + { + "epoch": 0.5812879708383961, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.553, + "step": 6279 + }, + { + "epoch": 0.5813805473586762, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5976, + "step": 6280 + }, + { + "epoch": 0.5814731238789562, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5379, + "step": 6281 + }, + { + "epoch": 0.5815657003992363, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5477, + "step": 6282 + }, + { + "epoch": 0.5816582769195163, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.5659, + "step": 6283 + }, + { + "epoch": 0.5817508534397964, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.4831, + "step": 6284 + }, + { + "epoch": 0.5818434299600764, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5283, + "step": 6285 + }, + { + "epoch": 0.5819360064803564, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.518, + "step": 6286 + }, + { + "epoch": 0.5820285830006364, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5323, + "step": 6287 + }, + { + "epoch": 0.5821211595209165, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5101, + "step": 6288 + }, + { + "epoch": 0.5822137360411965, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5626, + "step": 6289 + }, + { + "epoch": 0.5823063125614766, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.548, + "step": 6290 + }, + { + "epoch": 0.5823988890817566, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.4608, + "step": 6291 + }, + { + "epoch": 0.5824914656020367, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5382, + "step": 6292 + }, + { + "epoch": 0.5825840421223167, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5461, + "step": 6293 + }, + { + "epoch": 0.5826766186425968, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5345, + "step": 6294 + }, + { + "epoch": 0.5827691951628768, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5537, + "step": 6295 + }, + { + "epoch": 0.5828617716831569, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5838, + "step": 6296 + }, + { + "epoch": 0.582954348203437, + "grad_norm": 0.16015625, + "learning_rate": 0.02, + "loss": 1.5383, + "step": 6297 + }, + { + "epoch": 0.583046924723717, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5583, + "step": 6298 + }, + { + "epoch": 0.5831395012439969, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5919, + "step": 6299 + }, + { + "epoch": 0.583232077764277, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.5756, + "step": 6300 + }, + { + "epoch": 0.583324654284557, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.591, + "step": 6301 + }, + { + "epoch": 0.5834172308048371, + "grad_norm": 0.1376953125, + "learning_rate": 0.02, + "loss": 1.4678, + "step": 6302 + }, + { + "epoch": 0.5835098073251171, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5689, + "step": 6303 + }, + { + "epoch": 0.5836023838453972, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5426, + "step": 6304 + }, + { + "epoch": 0.5836949603656773, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5045, + "step": 6305 + }, + { + "epoch": 0.5837875368859573, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.5852, + "step": 6306 + }, + { + "epoch": 0.5838801134062374, + "grad_norm": 0.1591796875, + "learning_rate": 0.02, + "loss": 1.5393, + "step": 6307 + }, + { + "epoch": 0.5839726899265174, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5541, + "step": 6308 + }, + { + "epoch": 0.5840652664467975, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5501, + "step": 6309 + }, + { + "epoch": 0.5841578429670775, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5962, + "step": 6310 + }, + { + "epoch": 0.5842504194873576, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.5288, + "step": 6311 + }, + { + "epoch": 0.5843429960076376, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.4677, + "step": 6312 + }, + { + "epoch": 0.5844355725279176, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5353, + "step": 6313 + }, + { + "epoch": 0.5845281490481976, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.4701, + "step": 6314 + }, + { + "epoch": 0.5846207255684777, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.5017, + "step": 6315 + }, + { + "epoch": 0.5847133020887577, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.4944, + "step": 6316 + }, + { + "epoch": 0.5848058786090378, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5353, + "step": 6317 + }, + { + "epoch": 0.5848984551293178, + "grad_norm": 0.1611328125, + "learning_rate": 0.02, + "loss": 1.6036, + "step": 6318 + }, + { + "epoch": 0.5849910316495979, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5741, + "step": 6319 + }, + { + "epoch": 0.5850836081698779, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5497, + "step": 6320 + }, + { + "epoch": 0.585176184690158, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5268, + "step": 6321 + }, + { + "epoch": 0.585268761210438, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5545, + "step": 6322 + }, + { + "epoch": 0.5853613377307181, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.6326, + "step": 6323 + }, + { + "epoch": 0.5854539142509981, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.572, + "step": 6324 + }, + { + "epoch": 0.5855464907712782, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.553, + "step": 6325 + }, + { + "epoch": 0.5856390672915581, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5608, + "step": 6326 + }, + { + "epoch": 0.5857316438118382, + "grad_norm": 0.162109375, + "learning_rate": 0.02, + "loss": 1.528, + "step": 6327 + }, + { + "epoch": 0.5858242203321182, + "grad_norm": 0.169921875, + "learning_rate": 0.02, + "loss": 1.5681, + "step": 6328 + }, + { + "epoch": 0.5859167968523983, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.5855, + "step": 6329 + }, + { + "epoch": 0.5860093733726783, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5622, + "step": 6330 + }, + { + "epoch": 0.5861019498929584, + "grad_norm": 0.16015625, + "learning_rate": 0.02, + "loss": 1.5444, + "step": 6331 + }, + { + "epoch": 0.5861945264132384, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5229, + "step": 6332 + }, + { + "epoch": 0.5862871029335185, + "grad_norm": 0.1611328125, + "learning_rate": 0.02, + "loss": 1.5173, + "step": 6333 + }, + { + "epoch": 0.5863796794537985, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5245, + "step": 6334 + }, + { + "epoch": 0.5864722559740786, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.4832, + "step": 6335 + }, + { + "epoch": 0.5865648324943586, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.543, + "step": 6336 + }, + { + "epoch": 0.5866574090146387, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5048, + "step": 6337 + }, + { + "epoch": 0.5867499855349187, + "grad_norm": 0.1376953125, + "learning_rate": 0.02, + "loss": 1.5277, + "step": 6338 + }, + { + "epoch": 0.5868425620551988, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5372, + "step": 6339 + }, + { + "epoch": 0.5869351385754787, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5939, + "step": 6340 + }, + { + "epoch": 0.5870277150957588, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5363, + "step": 6341 + }, + { + "epoch": 0.5871202916160388, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5968, + "step": 6342 + }, + { + "epoch": 0.5872128681363189, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.5293, + "step": 6343 + }, + { + "epoch": 0.587305444656599, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5396, + "step": 6344 + }, + { + "epoch": 0.587398021176879, + "grad_norm": 0.16015625, + "learning_rate": 0.02, + "loss": 1.5327, + "step": 6345 + }, + { + "epoch": 0.5874905976971591, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5035, + "step": 6346 + }, + { + "epoch": 0.5875831742174391, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5403, + "step": 6347 + }, + { + "epoch": 0.5876757507377192, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5447, + "step": 6348 + }, + { + "epoch": 0.5877683272579992, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5663, + "step": 6349 + }, + { + "epoch": 0.5878609037782793, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5374, + "step": 6350 + }, + { + "epoch": 0.5879534802985593, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.587, + "step": 6351 + }, + { + "epoch": 0.5880460568188394, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5679, + "step": 6352 + }, + { + "epoch": 0.5881386333391194, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.5836, + "step": 6353 + }, + { + "epoch": 0.5882312098593994, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.6134, + "step": 6354 + }, + { + "epoch": 0.5883237863796794, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.4913, + "step": 6355 + }, + { + "epoch": 0.5884163628999595, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5116, + "step": 6356 + }, + { + "epoch": 0.5885089394202395, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.4725, + "step": 6357 + }, + { + "epoch": 0.5886015159405196, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5865, + "step": 6358 + }, + { + "epoch": 0.5886940924607996, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.4726, + "step": 6359 + }, + { + "epoch": 0.5887866689810797, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.6126, + "step": 6360 + }, + { + "epoch": 0.5888792455013597, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5546, + "step": 6361 + }, + { + "epoch": 0.5889718220216398, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5451, + "step": 6362 + }, + { + "epoch": 0.5890643985419198, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5333, + "step": 6363 + }, + { + "epoch": 0.5891569750621999, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5465, + "step": 6364 + }, + { + "epoch": 0.5892495515824799, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5933, + "step": 6365 + }, + { + "epoch": 0.58934212810276, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.4986, + "step": 6366 + }, + { + "epoch": 0.5894347046230399, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.5465, + "step": 6367 + }, + { + "epoch": 0.58952728114332, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5526, + "step": 6368 + }, + { + "epoch": 0.5896198576636, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5368, + "step": 6369 + }, + { + "epoch": 0.5897124341838801, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5293, + "step": 6370 + }, + { + "epoch": 0.5898050107041601, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5985, + "step": 6371 + }, + { + "epoch": 0.5898975872244402, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5128, + "step": 6372 + }, + { + "epoch": 0.5899901637447202, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5541, + "step": 6373 + }, + { + "epoch": 0.5900827402650003, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.5625, + "step": 6374 + }, + { + "epoch": 0.5901753167852803, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5565, + "step": 6375 + }, + { + "epoch": 0.5902678933055604, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.561, + "step": 6376 + }, + { + "epoch": 0.5903604698258405, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5212, + "step": 6377 + }, + { + "epoch": 0.5904530463461205, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.5104, + "step": 6378 + }, + { + "epoch": 0.5905456228664006, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5835, + "step": 6379 + }, + { + "epoch": 0.5906381993866806, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5764, + "step": 6380 + }, + { + "epoch": 0.5907307759069605, + "grad_norm": 0.13671875, + "learning_rate": 0.02, + "loss": 1.5509, + "step": 6381 + }, + { + "epoch": 0.5908233524272406, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5665, + "step": 6382 + }, + { + "epoch": 0.5909159289475207, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.4999, + "step": 6383 + }, + { + "epoch": 0.5910085054678007, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5412, + "step": 6384 + }, + { + "epoch": 0.5911010819880808, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5162, + "step": 6385 + }, + { + "epoch": 0.5911936585083608, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.4715, + "step": 6386 + }, + { + "epoch": 0.5912862350286409, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5593, + "step": 6387 + }, + { + "epoch": 0.5913788115489209, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5979, + "step": 6388 + }, + { + "epoch": 0.591471388069201, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5641, + "step": 6389 + }, + { + "epoch": 0.591563964589481, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5768, + "step": 6390 + }, + { + "epoch": 0.5916565411097611, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5787, + "step": 6391 + }, + { + "epoch": 0.5917491176300411, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.543, + "step": 6392 + }, + { + "epoch": 0.5918416941503212, + "grad_norm": 0.13671875, + "learning_rate": 0.02, + "loss": 1.5481, + "step": 6393 + }, + { + "epoch": 0.5919342706706011, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5603, + "step": 6394 + }, + { + "epoch": 0.5920268471908812, + "grad_norm": 0.1328125, + "learning_rate": 0.02, + "loss": 1.5628, + "step": 6395 + }, + { + "epoch": 0.5921194237111612, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.5432, + "step": 6396 + }, + { + "epoch": 0.5922120002314413, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5468, + "step": 6397 + }, + { + "epoch": 0.5923045767517213, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5261, + "step": 6398 + }, + { + "epoch": 0.5923971532720014, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5705, + "step": 6399 + }, + { + "epoch": 0.5924897297922814, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5513, + "step": 6400 + }, + { + "epoch": 0.5925823063125615, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5644, + "step": 6401 + }, + { + "epoch": 0.5926748828328415, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5196, + "step": 6402 + }, + { + "epoch": 0.5927674593531216, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5728, + "step": 6403 + }, + { + "epoch": 0.5928600358734016, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.5213, + "step": 6404 + }, + { + "epoch": 0.5929526123936817, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5383, + "step": 6405 + }, + { + "epoch": 0.5930451889139617, + "grad_norm": 0.1376953125, + "learning_rate": 0.02, + "loss": 1.5566, + "step": 6406 + }, + { + "epoch": 0.5931377654342418, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.5459, + "step": 6407 + }, + { + "epoch": 0.5932303419545217, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.4937, + "step": 6408 + }, + { + "epoch": 0.5933229184748018, + "grad_norm": 0.1591796875, + "learning_rate": 0.02, + "loss": 1.6202, + "step": 6409 + }, + { + "epoch": 0.5934154949950818, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5716, + "step": 6410 + }, + { + "epoch": 0.5935080715153619, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5845, + "step": 6411 + }, + { + "epoch": 0.5936006480356419, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5689, + "step": 6412 + }, + { + "epoch": 0.593693224555922, + "grad_norm": 0.1611328125, + "learning_rate": 0.02, + "loss": 1.5358, + "step": 6413 + }, + { + "epoch": 0.593785801076202, + "grad_norm": 0.1357421875, + "learning_rate": 0.02, + "loss": 1.6157, + "step": 6414 + }, + { + "epoch": 0.5938783775964821, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5641, + "step": 6415 + }, + { + "epoch": 0.5939709541167622, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.6275, + "step": 6416 + }, + { + "epoch": 0.5940635306370422, + "grad_norm": 0.16015625, + "learning_rate": 0.02, + "loss": 1.5707, + "step": 6417 + }, + { + "epoch": 0.5941561071573223, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5931, + "step": 6418 + }, + { + "epoch": 0.5942486836776023, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5819, + "step": 6419 + }, + { + "epoch": 0.5943412601978824, + "grad_norm": 0.1611328125, + "learning_rate": 0.02, + "loss": 1.5657, + "step": 6420 + }, + { + "epoch": 0.5944338367181623, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5176, + "step": 6421 + }, + { + "epoch": 0.5945264132384424, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5463, + "step": 6422 + }, + { + "epoch": 0.5946189897587224, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5355, + "step": 6423 + }, + { + "epoch": 0.5947115662790025, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.6074, + "step": 6424 + }, + { + "epoch": 0.5948041427992825, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5958, + "step": 6425 + }, + { + "epoch": 0.5948967193195626, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5317, + "step": 6426 + }, + { + "epoch": 0.5949892958398426, + "grad_norm": 0.1630859375, + "learning_rate": 0.02, + "loss": 1.5086, + "step": 6427 + }, + { + "epoch": 0.5950818723601227, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.5121, + "step": 6428 + }, + { + "epoch": 0.5951744488804027, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5949, + "step": 6429 + }, + { + "epoch": 0.5952670254006828, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5292, + "step": 6430 + }, + { + "epoch": 0.5953596019209628, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.6059, + "step": 6431 + }, + { + "epoch": 0.5954521784412429, + "grad_norm": 0.1611328125, + "learning_rate": 0.02, + "loss": 1.5515, + "step": 6432 + }, + { + "epoch": 0.5955447549615229, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.502, + "step": 6433 + }, + { + "epoch": 0.595637331481803, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5245, + "step": 6434 + }, + { + "epoch": 0.5957299080020829, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.6093, + "step": 6435 + }, + { + "epoch": 0.595822484522363, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5204, + "step": 6436 + }, + { + "epoch": 0.595915061042643, + "grad_norm": 0.1376953125, + "learning_rate": 0.02, + "loss": 1.5549, + "step": 6437 + }, + { + "epoch": 0.5960076375629231, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.59, + "step": 6438 + }, + { + "epoch": 0.5961002140832031, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5688, + "step": 6439 + }, + { + "epoch": 0.5961927906034832, + "grad_norm": 0.1640625, + "learning_rate": 0.02, + "loss": 1.5686, + "step": 6440 + }, + { + "epoch": 0.5962853671237632, + "grad_norm": 0.166015625, + "learning_rate": 0.02, + "loss": 1.5391, + "step": 6441 + }, + { + "epoch": 0.5963779436440433, + "grad_norm": 0.1630859375, + "learning_rate": 0.02, + "loss": 1.5544, + "step": 6442 + }, + { + "epoch": 0.5964705201643233, + "grad_norm": 0.1611328125, + "learning_rate": 0.02, + "loss": 1.5486, + "step": 6443 + }, + { + "epoch": 0.5965630966846034, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5564, + "step": 6444 + }, + { + "epoch": 0.5966556732048834, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5502, + "step": 6445 + }, + { + "epoch": 0.5967482497251635, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5162, + "step": 6446 + }, + { + "epoch": 0.5968408262454435, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5785, + "step": 6447 + }, + { + "epoch": 0.5969334027657236, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5395, + "step": 6448 + }, + { + "epoch": 0.5970259792860035, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5465, + "step": 6449 + }, + { + "epoch": 0.5971185558062836, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5495, + "step": 6450 + }, + { + "epoch": 0.5972111323265636, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5932, + "step": 6451 + }, + { + "epoch": 0.5973037088468437, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5477, + "step": 6452 + }, + { + "epoch": 0.5973962853671237, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.5092, + "step": 6453 + }, + { + "epoch": 0.5974888618874038, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.5084, + "step": 6454 + }, + { + "epoch": 0.5975814384076839, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5462, + "step": 6455 + }, + { + "epoch": 0.5976740149279639, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.54, + "step": 6456 + }, + { + "epoch": 0.597766591448244, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5101, + "step": 6457 + }, + { + "epoch": 0.597859167968524, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5885, + "step": 6458 + }, + { + "epoch": 0.5979517444888041, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5022, + "step": 6459 + }, + { + "epoch": 0.5980443210090841, + "grad_norm": 0.1357421875, + "learning_rate": 0.02, + "loss": 1.6019, + "step": 6460 + }, + { + "epoch": 0.5981368975293642, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.6054, + "step": 6461 + }, + { + "epoch": 0.5982294740496441, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5466, + "step": 6462 + }, + { + "epoch": 0.5983220505699242, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.4979, + "step": 6463 + }, + { + "epoch": 0.5984146270902042, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.4881, + "step": 6464 + }, + { + "epoch": 0.5985072036104843, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5608, + "step": 6465 + }, + { + "epoch": 0.5985997801307643, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5846, + "step": 6466 + }, + { + "epoch": 0.5986923566510444, + "grad_norm": 0.16015625, + "learning_rate": 0.02, + "loss": 1.535, + "step": 6467 + }, + { + "epoch": 0.5987849331713244, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.4723, + "step": 6468 + }, + { + "epoch": 0.5988775096916045, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.6076, + "step": 6469 + }, + { + "epoch": 0.5989700862118845, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5674, + "step": 6470 + }, + { + "epoch": 0.5990626627321646, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.535, + "step": 6471 + }, + { + "epoch": 0.5991552392524446, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.6081, + "step": 6472 + }, + { + "epoch": 0.5992478157727247, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.4759, + "step": 6473 + }, + { + "epoch": 0.5993403922930047, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5287, + "step": 6474 + }, + { + "epoch": 0.5994329688132848, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.471, + "step": 6475 + }, + { + "epoch": 0.5995255453335647, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5687, + "step": 6476 + }, + { + "epoch": 0.5996181218538448, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.4771, + "step": 6477 + }, + { + "epoch": 0.5997106983741248, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5557, + "step": 6478 + }, + { + "epoch": 0.5998032748944049, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5703, + "step": 6479 + }, + { + "epoch": 0.5998958514146849, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5532, + "step": 6480 + }, + { + "epoch": 0.599988427934965, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.552, + "step": 6481 + }, + { + "epoch": 0.600081004455245, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5465, + "step": 6482 + }, + { + "epoch": 0.6001735809755251, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5658, + "step": 6483 + }, + { + "epoch": 0.6002661574958051, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5777, + "step": 6484 + }, + { + "epoch": 0.6003587340160852, + "grad_norm": 0.1669921875, + "learning_rate": 0.02, + "loss": 1.5446, + "step": 6485 + }, + { + "epoch": 0.6004513105363652, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5848, + "step": 6486 + }, + { + "epoch": 0.6005438870566453, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5498, + "step": 6487 + }, + { + "epoch": 0.6006364635769254, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5528, + "step": 6488 + }, + { + "epoch": 0.6007290400972053, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.5257, + "step": 6489 + }, + { + "epoch": 0.6008216166174853, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5368, + "step": 6490 + }, + { + "epoch": 0.6009141931377654, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.4677, + "step": 6491 + }, + { + "epoch": 0.6010067696580454, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5882, + "step": 6492 + }, + { + "epoch": 0.6010993461783255, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5558, + "step": 6493 + }, + { + "epoch": 0.6011919226986056, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.4939, + "step": 6494 + }, + { + "epoch": 0.6012844992188856, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.5687, + "step": 6495 + }, + { + "epoch": 0.6013770757391657, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.5171, + "step": 6496 + }, + { + "epoch": 0.6014696522594457, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.4823, + "step": 6497 + }, + { + "epoch": 0.6015622287797258, + "grad_norm": 0.16015625, + "learning_rate": 0.02, + "loss": 1.6103, + "step": 6498 + }, + { + "epoch": 0.6016548053000058, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5202, + "step": 6499 + }, + { + "epoch": 0.6017473818202859, + "grad_norm": 0.162109375, + "learning_rate": 0.02, + "loss": 1.519, + "step": 6500 + }, + { + "epoch": 0.6018399583405659, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.5342, + "step": 6501 + }, + { + "epoch": 0.601932534860846, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.515, + "step": 6502 + }, + { + "epoch": 0.6020251113811259, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5204, + "step": 6503 + }, + { + "epoch": 0.602117687901406, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5829, + "step": 6504 + }, + { + "epoch": 0.602210264421686, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.582, + "step": 6505 + }, + { + "epoch": 0.6023028409419661, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5545, + "step": 6506 + }, + { + "epoch": 0.6023954174622461, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.6172, + "step": 6507 + }, + { + "epoch": 0.6024879939825262, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5134, + "step": 6508 + }, + { + "epoch": 0.6025805705028062, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5258, + "step": 6509 + }, + { + "epoch": 0.6026731470230863, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5799, + "step": 6510 + }, + { + "epoch": 0.6027657235433663, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5681, + "step": 6511 + }, + { + "epoch": 0.6028583000636464, + "grad_norm": 0.13671875, + "learning_rate": 0.02, + "loss": 1.5795, + "step": 6512 + }, + { + "epoch": 0.6029508765839264, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5378, + "step": 6513 + }, + { + "epoch": 0.6030434531042065, + "grad_norm": 0.1357421875, + "learning_rate": 0.02, + "loss": 1.5804, + "step": 6514 + }, + { + "epoch": 0.6031360296244865, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.4941, + "step": 6515 + }, + { + "epoch": 0.6032286061447666, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5601, + "step": 6516 + }, + { + "epoch": 0.6033211826650465, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5465, + "step": 6517 + }, + { + "epoch": 0.6034137591853266, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.5613, + "step": 6518 + }, + { + "epoch": 0.6035063357056066, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5093, + "step": 6519 + }, + { + "epoch": 0.6035989122258867, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5122, + "step": 6520 + }, + { + "epoch": 0.6036914887461667, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.6274, + "step": 6521 + }, + { + "epoch": 0.6037840652664468, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.555, + "step": 6522 + }, + { + "epoch": 0.6038766417867268, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5507, + "step": 6523 + }, + { + "epoch": 0.6039692183070069, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5681, + "step": 6524 + }, + { + "epoch": 0.604061794827287, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.5563, + "step": 6525 + }, + { + "epoch": 0.604154371347567, + "grad_norm": 0.1689453125, + "learning_rate": 0.02, + "loss": 1.5846, + "step": 6526 + }, + { + "epoch": 0.604246947867847, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.5819, + "step": 6527 + }, + { + "epoch": 0.6043395243881271, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5792, + "step": 6528 + }, + { + "epoch": 0.6044321009084072, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5261, + "step": 6529 + }, + { + "epoch": 0.6045246774286871, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5948, + "step": 6530 + }, + { + "epoch": 0.6046172539489671, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.5082, + "step": 6531 + }, + { + "epoch": 0.6047098304692472, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5443, + "step": 6532 + }, + { + "epoch": 0.6048024069895273, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.4872, + "step": 6533 + }, + { + "epoch": 0.6048949835098073, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.6424, + "step": 6534 + }, + { + "epoch": 0.6049875600300874, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.496, + "step": 6535 + }, + { + "epoch": 0.6050801365503674, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.4944, + "step": 6536 + }, + { + "epoch": 0.6051727130706475, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.507, + "step": 6537 + }, + { + "epoch": 0.6052652895909275, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5267, + "step": 6538 + }, + { + "epoch": 0.6053578661112076, + "grad_norm": 0.16015625, + "learning_rate": 0.02, + "loss": 1.5416, + "step": 6539 + }, + { + "epoch": 0.6054504426314876, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5959, + "step": 6540 + }, + { + "epoch": 0.6055430191517677, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5292, + "step": 6541 + }, + { + "epoch": 0.6056355956720477, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.5418, + "step": 6542 + }, + { + "epoch": 0.6057281721923278, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5231, + "step": 6543 + }, + { + "epoch": 0.6058207487126077, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.5853, + "step": 6544 + }, + { + "epoch": 0.6059133252328878, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5003, + "step": 6545 + }, + { + "epoch": 0.6060059017531678, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5482, + "step": 6546 + }, + { + "epoch": 0.6060984782734479, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.6247, + "step": 6547 + }, + { + "epoch": 0.6061910547937279, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.4844, + "step": 6548 + }, + { + "epoch": 0.606283631314008, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5107, + "step": 6549 + }, + { + "epoch": 0.606376207834288, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5626, + "step": 6550 + }, + { + "epoch": 0.6064687843545681, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5299, + "step": 6551 + }, + { + "epoch": 0.6065613608748481, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.5385, + "step": 6552 + }, + { + "epoch": 0.6066539373951282, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.4735, + "step": 6553 + }, + { + "epoch": 0.6067465139154082, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5624, + "step": 6554 + }, + { + "epoch": 0.6068390904356883, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5498, + "step": 6555 + }, + { + "epoch": 0.6069316669559683, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.4612, + "step": 6556 + }, + { + "epoch": 0.6070242434762483, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5409, + "step": 6557 + }, + { + "epoch": 0.6071168199965283, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5862, + "step": 6558 + }, + { + "epoch": 0.6072093965168084, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5758, + "step": 6559 + }, + { + "epoch": 0.6073019730370884, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.557, + "step": 6560 + }, + { + "epoch": 0.6073945495573685, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5748, + "step": 6561 + }, + { + "epoch": 0.6074871260776485, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.565, + "step": 6562 + }, + { + "epoch": 0.6075797025979286, + "grad_norm": 0.134765625, + "learning_rate": 0.02, + "loss": 1.5265, + "step": 6563 + }, + { + "epoch": 0.6076722791182086, + "grad_norm": 0.1328125, + "learning_rate": 0.02, + "loss": 1.5407, + "step": 6564 + }, + { + "epoch": 0.6077648556384887, + "grad_norm": 0.1396484375, + "learning_rate": 0.02, + "loss": 1.5846, + "step": 6565 + }, + { + "epoch": 0.6078574321587688, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5542, + "step": 6566 + }, + { + "epoch": 0.6079500086790488, + "grad_norm": 0.13671875, + "learning_rate": 0.02, + "loss": 1.5203, + "step": 6567 + }, + { + "epoch": 0.6080425851993289, + "grad_norm": 0.1376953125, + "learning_rate": 0.02, + "loss": 1.5503, + "step": 6568 + }, + { + "epoch": 0.6081351617196089, + "grad_norm": 0.13671875, + "learning_rate": 0.02, + "loss": 1.5956, + "step": 6569 + }, + { + "epoch": 0.608227738239889, + "grad_norm": 0.138671875, + "learning_rate": 0.02, + "loss": 1.6191, + "step": 6570 + }, + { + "epoch": 0.6083203147601689, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5918, + "step": 6571 + }, + { + "epoch": 0.608412891280449, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5376, + "step": 6572 + }, + { + "epoch": 0.608505467800729, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5655, + "step": 6573 + }, + { + "epoch": 0.6085980443210091, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5514, + "step": 6574 + }, + { + "epoch": 0.6086906208412891, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.4902, + "step": 6575 + }, + { + "epoch": 0.6087831973615692, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.4581, + "step": 6576 + }, + { + "epoch": 0.6088757738818492, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5505, + "step": 6577 + }, + { + "epoch": 0.6089683504021293, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.4879, + "step": 6578 + }, + { + "epoch": 0.6090609269224093, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5769, + "step": 6579 + }, + { + "epoch": 0.6091535034426894, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5708, + "step": 6580 + }, + { + "epoch": 0.6092460799629694, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.4736, + "step": 6581 + }, + { + "epoch": 0.6093386564832495, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5186, + "step": 6582 + }, + { + "epoch": 0.6094312330035295, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5759, + "step": 6583 + }, + { + "epoch": 0.6095238095238096, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5327, + "step": 6584 + }, + { + "epoch": 0.6096163860440895, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5683, + "step": 6585 + }, + { + "epoch": 0.6097089625643696, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5488, + "step": 6586 + }, + { + "epoch": 0.6098015390846496, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5267, + "step": 6587 + }, + { + "epoch": 0.6098941156049297, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5775, + "step": 6588 + }, + { + "epoch": 0.6099866921252097, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5455, + "step": 6589 + }, + { + "epoch": 0.6100792686454898, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5672, + "step": 6590 + }, + { + "epoch": 0.6101718451657698, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.5174, + "step": 6591 + }, + { + "epoch": 0.6102644216860499, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5988, + "step": 6592 + }, + { + "epoch": 0.6103569982063299, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.4693, + "step": 6593 + }, + { + "epoch": 0.61044957472661, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5315, + "step": 6594 + }, + { + "epoch": 0.61054215124689, + "grad_norm": 0.134765625, + "learning_rate": 0.02, + "loss": 1.5557, + "step": 6595 + }, + { + "epoch": 0.6106347277671701, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5256, + "step": 6596 + }, + { + "epoch": 0.6107273042874501, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5602, + "step": 6597 + }, + { + "epoch": 0.6108198808077301, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5272, + "step": 6598 + }, + { + "epoch": 0.6109124573280101, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5499, + "step": 6599 + }, + { + "epoch": 0.6110050338482902, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5372, + "step": 6600 + }, + { + "epoch": 0.6110976103685702, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5684, + "step": 6601 + }, + { + "epoch": 0.6111901868888503, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.4804, + "step": 6602 + }, + { + "epoch": 0.6112827634091303, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.534, + "step": 6603 + }, + { + "epoch": 0.6113753399294104, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5451, + "step": 6604 + }, + { + "epoch": 0.6114679164496905, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.6211, + "step": 6605 + }, + { + "epoch": 0.6115604929699705, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5822, + "step": 6606 + }, + { + "epoch": 0.6116530694902506, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.547, + "step": 6607 + }, + { + "epoch": 0.6117456460105306, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.5704, + "step": 6608 + }, + { + "epoch": 0.6118382225308107, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5664, + "step": 6609 + }, + { + "epoch": 0.6119307990510907, + "grad_norm": 0.1591796875, + "learning_rate": 0.02, + "loss": 1.5201, + "step": 6610 + }, + { + "epoch": 0.6120233755713708, + "grad_norm": 0.1396484375, + "learning_rate": 0.02, + "loss": 1.4944, + "step": 6611 + }, + { + "epoch": 0.6121159520916507, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5013, + "step": 6612 + }, + { + "epoch": 0.6122085286119308, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.4498, + "step": 6613 + }, + { + "epoch": 0.6123011051322108, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.5259, + "step": 6614 + }, + { + "epoch": 0.6123936816524909, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.5289, + "step": 6615 + }, + { + "epoch": 0.6124862581727709, + "grad_norm": 0.138671875, + "learning_rate": 0.02, + "loss": 1.5128, + "step": 6616 + }, + { + "epoch": 0.612578834693051, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.5374, + "step": 6617 + }, + { + "epoch": 0.612671411213331, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5595, + "step": 6618 + }, + { + "epoch": 0.6127639877336111, + "grad_norm": 0.138671875, + "learning_rate": 0.02, + "loss": 1.4861, + "step": 6619 + }, + { + "epoch": 0.6128565642538911, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.482, + "step": 6620 + }, + { + "epoch": 0.6129491407741712, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5353, + "step": 6621 + }, + { + "epoch": 0.6130417172944512, + "grad_norm": 0.16015625, + "learning_rate": 0.02, + "loss": 1.5561, + "step": 6622 + }, + { + "epoch": 0.6131342938147313, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.6124, + "step": 6623 + }, + { + "epoch": 0.6132268703350113, + "grad_norm": 0.162109375, + "learning_rate": 0.02, + "loss": 1.5961, + "step": 6624 + }, + { + "epoch": 0.6133194468552913, + "grad_norm": 0.1640625, + "learning_rate": 0.02, + "loss": 1.5311, + "step": 6625 + }, + { + "epoch": 0.6134120233755713, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5189, + "step": 6626 + }, + { + "epoch": 0.6135045998958514, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5266, + "step": 6627 + }, + { + "epoch": 0.6135971764161314, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5757, + "step": 6628 + }, + { + "epoch": 0.6136897529364115, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.6004, + "step": 6629 + }, + { + "epoch": 0.6137823294566915, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5896, + "step": 6630 + }, + { + "epoch": 0.6138749059769716, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5633, + "step": 6631 + }, + { + "epoch": 0.6139674824972516, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.4798, + "step": 6632 + }, + { + "epoch": 0.6140600590175317, + "grad_norm": 0.16015625, + "learning_rate": 0.02, + "loss": 1.5741, + "step": 6633 + }, + { + "epoch": 0.6141526355378117, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5684, + "step": 6634 + }, + { + "epoch": 0.6142452120580918, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5334, + "step": 6635 + }, + { + "epoch": 0.6143377885783718, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.4836, + "step": 6636 + }, + { + "epoch": 0.6144303650986519, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.57, + "step": 6637 + }, + { + "epoch": 0.614522941618932, + "grad_norm": 0.13671875, + "learning_rate": 0.02, + "loss": 1.5456, + "step": 6638 + }, + { + "epoch": 0.6146155181392119, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.528, + "step": 6639 + }, + { + "epoch": 0.6147080946594919, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5032, + "step": 6640 + }, + { + "epoch": 0.614800671179772, + "grad_norm": 0.13671875, + "learning_rate": 0.02, + "loss": 1.5767, + "step": 6641 + }, + { + "epoch": 0.614893247700052, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5037, + "step": 6642 + }, + { + "epoch": 0.6149858242203321, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5108, + "step": 6643 + }, + { + "epoch": 0.6150784007406122, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5914, + "step": 6644 + }, + { + "epoch": 0.6151709772608922, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5933, + "step": 6645 + }, + { + "epoch": 0.6152635537811723, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5199, + "step": 6646 + }, + { + "epoch": 0.6153561303014523, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5621, + "step": 6647 + }, + { + "epoch": 0.6154487068217324, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.5604, + "step": 6648 + }, + { + "epoch": 0.6155412833420124, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.4952, + "step": 6649 + }, + { + "epoch": 0.6156338598622925, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5301, + "step": 6650 + }, + { + "epoch": 0.6157264363825725, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5563, + "step": 6651 + }, + { + "epoch": 0.6158190129028526, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5388, + "step": 6652 + }, + { + "epoch": 0.6159115894231325, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.6154, + "step": 6653 + }, + { + "epoch": 0.6160041659434126, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5262, + "step": 6654 + }, + { + "epoch": 0.6160967424636926, + "grad_norm": 0.1689453125, + "learning_rate": 0.02, + "loss": 1.5928, + "step": 6655 + }, + { + "epoch": 0.6161893189839727, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.5204, + "step": 6656 + }, + { + "epoch": 0.6162818955042527, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5998, + "step": 6657 + }, + { + "epoch": 0.6163744720245328, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5215, + "step": 6658 + }, + { + "epoch": 0.6164670485448128, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.4974, + "step": 6659 + }, + { + "epoch": 0.6165596250650929, + "grad_norm": 0.16015625, + "learning_rate": 0.02, + "loss": 1.5237, + "step": 6660 + }, + { + "epoch": 0.6166522015853729, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5332, + "step": 6661 + }, + { + "epoch": 0.616744778105653, + "grad_norm": 0.162109375, + "learning_rate": 0.02, + "loss": 1.582, + "step": 6662 + }, + { + "epoch": 0.616837354625933, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5228, + "step": 6663 + }, + { + "epoch": 0.6169299311462131, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5022, + "step": 6664 + }, + { + "epoch": 0.6170225076664931, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5566, + "step": 6665 + }, + { + "epoch": 0.6171150841867731, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5247, + "step": 6666 + }, + { + "epoch": 0.6172076607070531, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.546, + "step": 6667 + }, + { + "epoch": 0.6173002372273332, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5824, + "step": 6668 + }, + { + "epoch": 0.6173928137476132, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.5199, + "step": 6669 + }, + { + "epoch": 0.6174853902678933, + "grad_norm": 0.1591796875, + "learning_rate": 0.02, + "loss": 1.6074, + "step": 6670 + }, + { + "epoch": 0.6175779667881733, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.5608, + "step": 6671 + }, + { + "epoch": 0.6176705433084534, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5506, + "step": 6672 + }, + { + "epoch": 0.6177631198287334, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.4967, + "step": 6673 + }, + { + "epoch": 0.6178556963490135, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5419, + "step": 6674 + }, + { + "epoch": 0.6179482728692935, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.5208, + "step": 6675 + }, + { + "epoch": 0.6180408493895736, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.4989, + "step": 6676 + }, + { + "epoch": 0.6181334259098537, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5445, + "step": 6677 + }, + { + "epoch": 0.6182260024301337, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.5214, + "step": 6678 + }, + { + "epoch": 0.6183185789504138, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5326, + "step": 6679 + }, + { + "epoch": 0.6184111554706937, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5612, + "step": 6680 + }, + { + "epoch": 0.6185037319909737, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5966, + "step": 6681 + }, + { + "epoch": 0.6185963085112538, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5649, + "step": 6682 + }, + { + "epoch": 0.6186888850315339, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5808, + "step": 6683 + }, + { + "epoch": 0.6187814615518139, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5585, + "step": 6684 + }, + { + "epoch": 0.618874038072094, + "grad_norm": 0.16015625, + "learning_rate": 0.02, + "loss": 1.5065, + "step": 6685 + }, + { + "epoch": 0.618966614592374, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.503, + "step": 6686 + }, + { + "epoch": 0.6190591911126541, + "grad_norm": 0.1708984375, + "learning_rate": 0.02, + "loss": 1.4825, + "step": 6687 + }, + { + "epoch": 0.6191517676329341, + "grad_norm": 0.1396484375, + "learning_rate": 0.02, + "loss": 1.5768, + "step": 6688 + }, + { + "epoch": 0.6192443441532142, + "grad_norm": 0.1650390625, + "learning_rate": 0.02, + "loss": 1.5553, + "step": 6689 + }, + { + "epoch": 0.6193369206734942, + "grad_norm": 0.138671875, + "learning_rate": 0.02, + "loss": 1.6236, + "step": 6690 + }, + { + "epoch": 0.6194294971937743, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.4929, + "step": 6691 + }, + { + "epoch": 0.6195220737140543, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.4727, + "step": 6692 + }, + { + "epoch": 0.6196146502343343, + "grad_norm": 0.1591796875, + "learning_rate": 0.02, + "loss": 1.4949, + "step": 6693 + }, + { + "epoch": 0.6197072267546143, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.5192, + "step": 6694 + }, + { + "epoch": 0.6197998032748944, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.5407, + "step": 6695 + }, + { + "epoch": 0.6198923797951744, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.4978, + "step": 6696 + }, + { + "epoch": 0.6199849563154545, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5381, + "step": 6697 + }, + { + "epoch": 0.6200775328357345, + "grad_norm": 0.1396484375, + "learning_rate": 0.02, + "loss": 1.5135, + "step": 6698 + }, + { + "epoch": 0.6201701093560146, + "grad_norm": 0.1611328125, + "learning_rate": 0.02, + "loss": 1.5321, + "step": 6699 + }, + { + "epoch": 0.6202626858762946, + "grad_norm": 0.1591796875, + "learning_rate": 0.02, + "loss": 1.5232, + "step": 6700 + }, + { + "epoch": 0.6203552623965747, + "grad_norm": 0.1611328125, + "learning_rate": 0.02, + "loss": 1.495, + "step": 6701 + }, + { + "epoch": 0.6204478389168547, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5319, + "step": 6702 + }, + { + "epoch": 0.6205404154371348, + "grad_norm": 0.1611328125, + "learning_rate": 0.02, + "loss": 1.5022, + "step": 6703 + }, + { + "epoch": 0.6206329919574148, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5856, + "step": 6704 + }, + { + "epoch": 0.6207255684776949, + "grad_norm": 0.1630859375, + "learning_rate": 0.02, + "loss": 1.5209, + "step": 6705 + }, + { + "epoch": 0.6208181449979749, + "grad_norm": 0.1611328125, + "learning_rate": 0.02, + "loss": 1.5802, + "step": 6706 + }, + { + "epoch": 0.6209107215182549, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.538, + "step": 6707 + }, + { + "epoch": 0.6210032980385349, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5421, + "step": 6708 + }, + { + "epoch": 0.621095874558815, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.482, + "step": 6709 + }, + { + "epoch": 0.621188451079095, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.4862, + "step": 6710 + }, + { + "epoch": 0.6212810275993751, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5777, + "step": 6711 + }, + { + "epoch": 0.6213736041196551, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.4995, + "step": 6712 + }, + { + "epoch": 0.6214661806399352, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.6032, + "step": 6713 + }, + { + "epoch": 0.6215587571602152, + "grad_norm": 0.1396484375, + "learning_rate": 0.02, + "loss": 1.5751, + "step": 6714 + }, + { + "epoch": 0.6216513336804953, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5646, + "step": 6715 + }, + { + "epoch": 0.6217439102007754, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5763, + "step": 6716 + }, + { + "epoch": 0.6218364867210554, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5474, + "step": 6717 + }, + { + "epoch": 0.6219290632413355, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.6049, + "step": 6718 + }, + { + "epoch": 0.6220216397616155, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5419, + "step": 6719 + }, + { + "epoch": 0.6221142162818954, + "grad_norm": 0.1376953125, + "learning_rate": 0.02, + "loss": 1.5154, + "step": 6720 + }, + { + "epoch": 0.6222067928021755, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5755, + "step": 6721 + }, + { + "epoch": 0.6222993693224556, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5325, + "step": 6722 + }, + { + "epoch": 0.6223919458427356, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.6207, + "step": 6723 + }, + { + "epoch": 0.6224845223630157, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5891, + "step": 6724 + }, + { + "epoch": 0.6225770988832957, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5534, + "step": 6725 + }, + { + "epoch": 0.6226696754035758, + "grad_norm": 0.1611328125, + "learning_rate": 0.02, + "loss": 1.5288, + "step": 6726 + }, + { + "epoch": 0.6227622519238558, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.4771, + "step": 6727 + }, + { + "epoch": 0.6228548284441359, + "grad_norm": 0.162109375, + "learning_rate": 0.02, + "loss": 1.5371, + "step": 6728 + }, + { + "epoch": 0.6229474049644159, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.4744, + "step": 6729 + }, + { + "epoch": 0.623039981484696, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.4472, + "step": 6730 + }, + { + "epoch": 0.623132558004976, + "grad_norm": 0.13671875, + "learning_rate": 0.02, + "loss": 1.5206, + "step": 6731 + }, + { + "epoch": 0.6232251345252561, + "grad_norm": 0.16015625, + "learning_rate": 0.02, + "loss": 1.4971, + "step": 6732 + }, + { + "epoch": 0.6233177110455361, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5427, + "step": 6733 + }, + { + "epoch": 0.6234102875658161, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5065, + "step": 6734 + }, + { + "epoch": 0.6235028640860961, + "grad_norm": 0.1376953125, + "learning_rate": 0.02, + "loss": 1.5552, + "step": 6735 + }, + { + "epoch": 0.6235954406063762, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.4611, + "step": 6736 + }, + { + "epoch": 0.6236880171266562, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.4739, + "step": 6737 + }, + { + "epoch": 0.6237805936469363, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5504, + "step": 6738 + }, + { + "epoch": 0.6238731701672163, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.511, + "step": 6739 + }, + { + "epoch": 0.6239657466874964, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.4979, + "step": 6740 + }, + { + "epoch": 0.6240583232077764, + "grad_norm": 0.138671875, + "learning_rate": 0.02, + "loss": 1.5451, + "step": 6741 + }, + { + "epoch": 0.6241508997280565, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.5801, + "step": 6742 + }, + { + "epoch": 0.6242434762483365, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5192, + "step": 6743 + }, + { + "epoch": 0.6243360527686166, + "grad_norm": 0.1328125, + "learning_rate": 0.02, + "loss": 1.5191, + "step": 6744 + }, + { + "epoch": 0.6244286292888966, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5168, + "step": 6745 + }, + { + "epoch": 0.6245212058091767, + "grad_norm": 0.138671875, + "learning_rate": 0.02, + "loss": 1.531, + "step": 6746 + }, + { + "epoch": 0.6246137823294567, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5246, + "step": 6747 + }, + { + "epoch": 0.6247063588497367, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.6065, + "step": 6748 + }, + { + "epoch": 0.6247989353700167, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.4978, + "step": 6749 + }, + { + "epoch": 0.6248915118902968, + "grad_norm": 0.13671875, + "learning_rate": 0.02, + "loss": 1.5058, + "step": 6750 + }, + { + "epoch": 0.6249840884105768, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.4946, + "step": 6751 + }, + { + "epoch": 0.6250766649308569, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.6439, + "step": 6752 + }, + { + "epoch": 0.625169241451137, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5023, + "step": 6753 + }, + { + "epoch": 0.625261817971417, + "grad_norm": 0.134765625, + "learning_rate": 0.02, + "loss": 1.5575, + "step": 6754 + }, + { + "epoch": 0.625354394491697, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5247, + "step": 6755 + }, + { + "epoch": 0.6254469710119771, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5078, + "step": 6756 + }, + { + "epoch": 0.6255395475322572, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5392, + "step": 6757 + }, + { + "epoch": 0.6256321240525372, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5691, + "step": 6758 + }, + { + "epoch": 0.6257247005728173, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.4953, + "step": 6759 + }, + { + "epoch": 0.6258172770930973, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5074, + "step": 6760 + }, + { + "epoch": 0.6259098536133773, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5939, + "step": 6761 + }, + { + "epoch": 0.6260024301336573, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.557, + "step": 6762 + }, + { + "epoch": 0.6260950066539374, + "grad_norm": 0.1591796875, + "learning_rate": 0.02, + "loss": 1.6182, + "step": 6763 + }, + { + "epoch": 0.6261875831742174, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.6222, + "step": 6764 + }, + { + "epoch": 0.6262801596944975, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.5027, + "step": 6765 + }, + { + "epoch": 0.6263727362147775, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5019, + "step": 6766 + }, + { + "epoch": 0.6264653127350576, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5826, + "step": 6767 + }, + { + "epoch": 0.6265578892553376, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5582, + "step": 6768 + }, + { + "epoch": 0.6266504657756177, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5937, + "step": 6769 + }, + { + "epoch": 0.6267430422958977, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.6126, + "step": 6770 + }, + { + "epoch": 0.6268356188161778, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.527, + "step": 6771 + }, + { + "epoch": 0.6269281953364578, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5487, + "step": 6772 + }, + { + "epoch": 0.6270207718567379, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5721, + "step": 6773 + }, + { + "epoch": 0.6271133483770179, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5953, + "step": 6774 + }, + { + "epoch": 0.6272059248972979, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.494, + "step": 6775 + }, + { + "epoch": 0.6272985014175779, + "grad_norm": 0.138671875, + "learning_rate": 0.02, + "loss": 1.5298, + "step": 6776 + }, + { + "epoch": 0.627391077937858, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5538, + "step": 6777 + }, + { + "epoch": 0.627483654458138, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5399, + "step": 6778 + }, + { + "epoch": 0.6275762309784181, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5315, + "step": 6779 + }, + { + "epoch": 0.6276688074986981, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5445, + "step": 6780 + }, + { + "epoch": 0.6277613840189782, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5176, + "step": 6781 + }, + { + "epoch": 0.6278539605392582, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5561, + "step": 6782 + }, + { + "epoch": 0.6279465370595383, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.6013, + "step": 6783 + }, + { + "epoch": 0.6280391135798183, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5188, + "step": 6784 + }, + { + "epoch": 0.6281316901000984, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5482, + "step": 6785 + }, + { + "epoch": 0.6282242666203784, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5199, + "step": 6786 + }, + { + "epoch": 0.6283168431406585, + "grad_norm": 0.1630859375, + "learning_rate": 0.02, + "loss": 1.5347, + "step": 6787 + }, + { + "epoch": 0.6284094196609384, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.6074, + "step": 6788 + }, + { + "epoch": 0.6285019961812185, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.5224, + "step": 6789 + }, + { + "epoch": 0.6285945727014985, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.566, + "step": 6790 + }, + { + "epoch": 0.6286871492217786, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.4844, + "step": 6791 + }, + { + "epoch": 0.6287797257420586, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5236, + "step": 6792 + }, + { + "epoch": 0.6288723022623387, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5125, + "step": 6793 + }, + { + "epoch": 0.6289648787826188, + "grad_norm": 0.138671875, + "learning_rate": 0.02, + "loss": 1.5127, + "step": 6794 + }, + { + "epoch": 0.6290574553028988, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5138, + "step": 6795 + }, + { + "epoch": 0.6291500318231789, + "grad_norm": 0.13671875, + "learning_rate": 0.02, + "loss": 1.4401, + "step": 6796 + }, + { + "epoch": 0.6292426083434589, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5157, + "step": 6797 + }, + { + "epoch": 0.629335184863739, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.5402, + "step": 6798 + }, + { + "epoch": 0.629427761384019, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5397, + "step": 6799 + }, + { + "epoch": 0.6295203379042991, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.634, + "step": 6800 + }, + { + "epoch": 0.6296129144245791, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5347, + "step": 6801 + }, + { + "epoch": 0.6297054909448591, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5449, + "step": 6802 + }, + { + "epoch": 0.6297980674651391, + "grad_norm": 0.1591796875, + "learning_rate": 0.02, + "loss": 1.5431, + "step": 6803 + }, + { + "epoch": 0.6298906439854192, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5944, + "step": 6804 + }, + { + "epoch": 0.6299832205056992, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5464, + "step": 6805 + }, + { + "epoch": 0.6300757970259793, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.5359, + "step": 6806 + }, + { + "epoch": 0.6301683735462593, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5277, + "step": 6807 + }, + { + "epoch": 0.6302609500665394, + "grad_norm": 0.1611328125, + "learning_rate": 0.02, + "loss": 1.5286, + "step": 6808 + }, + { + "epoch": 0.6303535265868194, + "grad_norm": 0.138671875, + "learning_rate": 0.02, + "loss": 1.5568, + "step": 6809 + }, + { + "epoch": 0.6304461031070995, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.4955, + "step": 6810 + }, + { + "epoch": 0.6305386796273795, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.6033, + "step": 6811 + }, + { + "epoch": 0.6306312561476596, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5297, + "step": 6812 + }, + { + "epoch": 0.6307238326679396, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5824, + "step": 6813 + }, + { + "epoch": 0.6308164091882197, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.4772, + "step": 6814 + }, + { + "epoch": 0.6309089857084997, + "grad_norm": 0.134765625, + "learning_rate": 0.02, + "loss": 1.4861, + "step": 6815 + }, + { + "epoch": 0.6310015622287797, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5484, + "step": 6816 + }, + { + "epoch": 0.6310941387490597, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5725, + "step": 6817 + }, + { + "epoch": 0.6311867152693398, + "grad_norm": 0.1591796875, + "learning_rate": 0.02, + "loss": 1.5253, + "step": 6818 + }, + { + "epoch": 0.6312792917896198, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.5673, + "step": 6819 + }, + { + "epoch": 0.6313718683098999, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.518, + "step": 6820 + }, + { + "epoch": 0.6314644448301799, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5634, + "step": 6821 + }, + { + "epoch": 0.63155702135046, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5053, + "step": 6822 + }, + { + "epoch": 0.63164959787074, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.6209, + "step": 6823 + }, + { + "epoch": 0.6317421743910201, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5405, + "step": 6824 + }, + { + "epoch": 0.6318347509113001, + "grad_norm": 0.1611328125, + "learning_rate": 0.02, + "loss": 1.5051, + "step": 6825 + }, + { + "epoch": 0.6319273274315802, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.6131, + "step": 6826 + }, + { + "epoch": 0.6320199039518603, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5352, + "step": 6827 + }, + { + "epoch": 0.6321124804721403, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5225, + "step": 6828 + }, + { + "epoch": 0.6322050569924202, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.5132, + "step": 6829 + }, + { + "epoch": 0.6322976335127003, + "grad_norm": 0.1396484375, + "learning_rate": 0.02, + "loss": 1.5845, + "step": 6830 + }, + { + "epoch": 0.6323902100329803, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5778, + "step": 6831 + }, + { + "epoch": 0.6324827865532604, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5354, + "step": 6832 + }, + { + "epoch": 0.6325753630735405, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.5108, + "step": 6833 + }, + { + "epoch": 0.6326679395938205, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5193, + "step": 6834 + }, + { + "epoch": 0.6327605161141006, + "grad_norm": 0.1591796875, + "learning_rate": 0.02, + "loss": 1.5823, + "step": 6835 + }, + { + "epoch": 0.6328530926343806, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.4716, + "step": 6836 + }, + { + "epoch": 0.6329456691546607, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.531, + "step": 6837 + }, + { + "epoch": 0.6330382456749407, + "grad_norm": 0.13671875, + "learning_rate": 0.02, + "loss": 1.5429, + "step": 6838 + }, + { + "epoch": 0.6331308221952208, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5171, + "step": 6839 + }, + { + "epoch": 0.6332233987155008, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.4949, + "step": 6840 + }, + { + "epoch": 0.6333159752357809, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5696, + "step": 6841 + }, + { + "epoch": 0.6334085517560609, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5396, + "step": 6842 + }, + { + "epoch": 0.6335011282763409, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5314, + "step": 6843 + }, + { + "epoch": 0.6335937047966209, + "grad_norm": 0.16015625, + "learning_rate": 0.02, + "loss": 1.6416, + "step": 6844 + }, + { + "epoch": 0.633686281316901, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.5146, + "step": 6845 + }, + { + "epoch": 0.633778857837181, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.5109, + "step": 6846 + }, + { + "epoch": 0.6338714343574611, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5465, + "step": 6847 + }, + { + "epoch": 0.6339640108777411, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.4715, + "step": 6848 + }, + { + "epoch": 0.6340565873980212, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5156, + "step": 6849 + }, + { + "epoch": 0.6341491639183012, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.5612, + "step": 6850 + }, + { + "epoch": 0.6342417404385813, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5648, + "step": 6851 + }, + { + "epoch": 0.6343343169588613, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5548, + "step": 6852 + }, + { + "epoch": 0.6344268934791414, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5017, + "step": 6853 + }, + { + "epoch": 0.6345194699994214, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5028, + "step": 6854 + }, + { + "epoch": 0.6346120465197015, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5272, + "step": 6855 + }, + { + "epoch": 0.6347046230399814, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.5115, + "step": 6856 + }, + { + "epoch": 0.6347971995602615, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5362, + "step": 6857 + }, + { + "epoch": 0.6348897760805415, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5531, + "step": 6858 + }, + { + "epoch": 0.6349823526008216, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.5135, + "step": 6859 + }, + { + "epoch": 0.6350749291211016, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5148, + "step": 6860 + }, + { + "epoch": 0.6351675056413817, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5144, + "step": 6861 + }, + { + "epoch": 0.6352600821616617, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5273, + "step": 6862 + }, + { + "epoch": 0.6353526586819418, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5307, + "step": 6863 + }, + { + "epoch": 0.6354452352022218, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5497, + "step": 6864 + }, + { + "epoch": 0.6355378117225019, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.5532, + "step": 6865 + }, + { + "epoch": 0.635630388242782, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5032, + "step": 6866 + }, + { + "epoch": 0.635722964763062, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.484, + "step": 6867 + }, + { + "epoch": 0.6358155412833421, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5252, + "step": 6868 + }, + { + "epoch": 0.6359081178036221, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.6132, + "step": 6869 + }, + { + "epoch": 0.636000694323902, + "grad_norm": 0.1611328125, + "learning_rate": 0.02, + "loss": 1.5779, + "step": 6870 + }, + { + "epoch": 0.6360932708441821, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5395, + "step": 6871 + }, + { + "epoch": 0.6361858473644622, + "grad_norm": 0.1611328125, + "learning_rate": 0.02, + "loss": 1.4735, + "step": 6872 + }, + { + "epoch": 0.6362784238847422, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.6134, + "step": 6873 + }, + { + "epoch": 0.6363710004050223, + "grad_norm": 0.1669921875, + "learning_rate": 0.02, + "loss": 1.5436, + "step": 6874 + }, + { + "epoch": 0.6364635769253023, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.4479, + "step": 6875 + }, + { + "epoch": 0.6365561534455824, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5501, + "step": 6876 + }, + { + "epoch": 0.6366487299658624, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5602, + "step": 6877 + }, + { + "epoch": 0.6367413064861425, + "grad_norm": 0.16015625, + "learning_rate": 0.02, + "loss": 1.5308, + "step": 6878 + }, + { + "epoch": 0.6368338830064225, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5616, + "step": 6879 + }, + { + "epoch": 0.6369264595267026, + "grad_norm": 0.1591796875, + "learning_rate": 0.02, + "loss": 1.4953, + "step": 6880 + }, + { + "epoch": 0.6370190360469826, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5021, + "step": 6881 + }, + { + "epoch": 0.6371116125672627, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.5274, + "step": 6882 + }, + { + "epoch": 0.6372041890875427, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.5295, + "step": 6883 + }, + { + "epoch": 0.6372967656078227, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5061, + "step": 6884 + }, + { + "epoch": 0.6373893421281027, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5102, + "step": 6885 + }, + { + "epoch": 0.6374819186483828, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5372, + "step": 6886 + }, + { + "epoch": 0.6375744951686628, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5279, + "step": 6887 + }, + { + "epoch": 0.6376670716889429, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5596, + "step": 6888 + }, + { + "epoch": 0.6377596482092229, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5539, + "step": 6889 + }, + { + "epoch": 0.637852224729503, + "grad_norm": 0.1669921875, + "learning_rate": 0.02, + "loss": 1.5243, + "step": 6890 + }, + { + "epoch": 0.637944801249783, + "grad_norm": 0.1611328125, + "learning_rate": 0.02, + "loss": 1.559, + "step": 6891 + }, + { + "epoch": 0.6380373777700631, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.4824, + "step": 6892 + }, + { + "epoch": 0.6381299542903431, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5057, + "step": 6893 + }, + { + "epoch": 0.6382225308106232, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5187, + "step": 6894 + }, + { + "epoch": 0.6383151073309032, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5176, + "step": 6895 + }, + { + "epoch": 0.6384076838511833, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5905, + "step": 6896 + }, + { + "epoch": 0.6385002603714632, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5079, + "step": 6897 + }, + { + "epoch": 0.6385928368917433, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5887, + "step": 6898 + }, + { + "epoch": 0.6386854134120233, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5667, + "step": 6899 + }, + { + "epoch": 0.6387779899323034, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.4797, + "step": 6900 + }, + { + "epoch": 0.6388705664525834, + "grad_norm": 0.1376953125, + "learning_rate": 0.02, + "loss": 1.5768, + "step": 6901 + }, + { + "epoch": 0.6389631429728635, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5428, + "step": 6902 + }, + { + "epoch": 0.6390557194931435, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5882, + "step": 6903 + }, + { + "epoch": 0.6391482960134236, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5279, + "step": 6904 + }, + { + "epoch": 0.6392408725337037, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5541, + "step": 6905 + }, + { + "epoch": 0.6393334490539837, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.5721, + "step": 6906 + }, + { + "epoch": 0.6394260255742638, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.4988, + "step": 6907 + }, + { + "epoch": 0.6395186020945438, + "grad_norm": 0.1640625, + "learning_rate": 0.02, + "loss": 1.5483, + "step": 6908 + }, + { + "epoch": 0.6396111786148239, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5715, + "step": 6909 + }, + { + "epoch": 0.6397037551351039, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.6095, + "step": 6910 + }, + { + "epoch": 0.6397963316553839, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5479, + "step": 6911 + }, + { + "epoch": 0.6398889081756639, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.5332, + "step": 6912 + }, + { + "epoch": 0.639981484695944, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5467, + "step": 6913 + }, + { + "epoch": 0.640074061216224, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5142, + "step": 6914 + }, + { + "epoch": 0.6401666377365041, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.4879, + "step": 6915 + }, + { + "epoch": 0.6402592142567841, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.517, + "step": 6916 + }, + { + "epoch": 0.6403517907770642, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5169, + "step": 6917 + }, + { + "epoch": 0.6404443672973442, + "grad_norm": 0.1611328125, + "learning_rate": 0.02, + "loss": 1.5689, + "step": 6918 + }, + { + "epoch": 0.6405369438176243, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5503, + "step": 6919 + }, + { + "epoch": 0.6406295203379043, + "grad_norm": 0.1591796875, + "learning_rate": 0.02, + "loss": 1.5423, + "step": 6920 + }, + { + "epoch": 0.6407220968581844, + "grad_norm": 0.134765625, + "learning_rate": 0.02, + "loss": 1.5662, + "step": 6921 + }, + { + "epoch": 0.6408146733784644, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5408, + "step": 6922 + }, + { + "epoch": 0.6409072498987445, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5496, + "step": 6923 + }, + { + "epoch": 0.6409998264190244, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.4823, + "step": 6924 + }, + { + "epoch": 0.6410924029393045, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.4633, + "step": 6925 + }, + { + "epoch": 0.6411849794595845, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5403, + "step": 6926 + }, + { + "epoch": 0.6412775559798646, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5125, + "step": 6927 + }, + { + "epoch": 0.6413701325001446, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5201, + "step": 6928 + }, + { + "epoch": 0.6414627090204247, + "grad_norm": 0.12890625, + "learning_rate": 0.02, + "loss": 1.4283, + "step": 6929 + }, + { + "epoch": 0.6415552855407047, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5329, + "step": 6930 + }, + { + "epoch": 0.6416478620609848, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5394, + "step": 6931 + }, + { + "epoch": 0.6417404385812648, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5008, + "step": 6932 + }, + { + "epoch": 0.6418330151015449, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.5371, + "step": 6933 + }, + { + "epoch": 0.6419255916218249, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.565, + "step": 6934 + }, + { + "epoch": 0.642018168142105, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.5065, + "step": 6935 + }, + { + "epoch": 0.642110744662385, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5359, + "step": 6936 + }, + { + "epoch": 0.6422033211826651, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5331, + "step": 6937 + }, + { + "epoch": 0.642295897702945, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5694, + "step": 6938 + }, + { + "epoch": 0.6423884742232251, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5365, + "step": 6939 + }, + { + "epoch": 0.6424810507435051, + "grad_norm": 0.1591796875, + "learning_rate": 0.02, + "loss": 1.5305, + "step": 6940 + }, + { + "epoch": 0.6425736272637852, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5389, + "step": 6941 + }, + { + "epoch": 0.6426662037840652, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.4914, + "step": 6942 + }, + { + "epoch": 0.6427587803043453, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5213, + "step": 6943 + }, + { + "epoch": 0.6428513568246254, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5706, + "step": 6944 + }, + { + "epoch": 0.6429439333449054, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5141, + "step": 6945 + }, + { + "epoch": 0.6430365098651855, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.4908, + "step": 6946 + }, + { + "epoch": 0.6431290863854655, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5179, + "step": 6947 + }, + { + "epoch": 0.6432216629057456, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.509, + "step": 6948 + }, + { + "epoch": 0.6433142394260256, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.5446, + "step": 6949 + }, + { + "epoch": 0.6434068159463057, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.5493, + "step": 6950 + }, + { + "epoch": 0.6434993924665856, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5626, + "step": 6951 + }, + { + "epoch": 0.6435919689868657, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.5388, + "step": 6952 + }, + { + "epoch": 0.6436845455071457, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.5022, + "step": 6953 + }, + { + "epoch": 0.6437771220274258, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.532, + "step": 6954 + }, + { + "epoch": 0.6438696985477058, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5086, + "step": 6955 + }, + { + "epoch": 0.6439622750679859, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5256, + "step": 6956 + }, + { + "epoch": 0.6440548515882659, + "grad_norm": 0.1357421875, + "learning_rate": 0.02, + "loss": 1.545, + "step": 6957 + }, + { + "epoch": 0.644147428108546, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.5534, + "step": 6958 + }, + { + "epoch": 0.644240004628826, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5293, + "step": 6959 + }, + { + "epoch": 0.6443325811491061, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.4989, + "step": 6960 + }, + { + "epoch": 0.6444251576693861, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5141, + "step": 6961 + }, + { + "epoch": 0.6445177341896662, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.5295, + "step": 6962 + }, + { + "epoch": 0.6446103107099462, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.4915, + "step": 6963 + }, + { + "epoch": 0.6447028872302263, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.537, + "step": 6964 + }, + { + "epoch": 0.6447954637505062, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.6043, + "step": 6965 + }, + { + "epoch": 0.6448880402707863, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.513, + "step": 6966 + }, + { + "epoch": 0.6449806167910663, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.513, + "step": 6967 + }, + { + "epoch": 0.6450731933113464, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5594, + "step": 6968 + }, + { + "epoch": 0.6451657698316264, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.5712, + "step": 6969 + }, + { + "epoch": 0.6452583463519065, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5551, + "step": 6970 + }, + { + "epoch": 0.6453509228721865, + "grad_norm": 0.1591796875, + "learning_rate": 0.02, + "loss": 1.5477, + "step": 6971 + }, + { + "epoch": 0.6454434993924666, + "grad_norm": 0.134765625, + "learning_rate": 0.02, + "loss": 1.4643, + "step": 6972 + }, + { + "epoch": 0.6455360759127466, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.5483, + "step": 6973 + }, + { + "epoch": 0.6456286524330267, + "grad_norm": 0.138671875, + "learning_rate": 0.02, + "loss": 1.4795, + "step": 6974 + }, + { + "epoch": 0.6457212289533067, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5788, + "step": 6975 + }, + { + "epoch": 0.6458138054735868, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5272, + "step": 6976 + }, + { + "epoch": 0.6459063819938669, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5188, + "step": 6977 + }, + { + "epoch": 0.6459989585141469, + "grad_norm": 0.1396484375, + "learning_rate": 0.02, + "loss": 1.5237, + "step": 6978 + }, + { + "epoch": 0.6460915350344268, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5556, + "step": 6979 + }, + { + "epoch": 0.6461841115547069, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.6078, + "step": 6980 + }, + { + "epoch": 0.646276688074987, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5068, + "step": 6981 + }, + { + "epoch": 0.646369264595267, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5594, + "step": 6982 + }, + { + "epoch": 0.646461841115547, + "grad_norm": 0.1376953125, + "learning_rate": 0.02, + "loss": 1.5739, + "step": 6983 + }, + { + "epoch": 0.6465544176358271, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5648, + "step": 6984 + }, + { + "epoch": 0.6466469941561072, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5095, + "step": 6985 + }, + { + "epoch": 0.6467395706763872, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5381, + "step": 6986 + }, + { + "epoch": 0.6468321471966673, + "grad_norm": 0.162109375, + "learning_rate": 0.02, + "loss": 1.592, + "step": 6987 + }, + { + "epoch": 0.6469247237169473, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5577, + "step": 6988 + }, + { + "epoch": 0.6470173002372274, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.5469, + "step": 6989 + }, + { + "epoch": 0.6471098767575074, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.5236, + "step": 6990 + }, + { + "epoch": 0.6472024532777875, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.496, + "step": 6991 + }, + { + "epoch": 0.6472950297980674, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5682, + "step": 6992 + }, + { + "epoch": 0.6473876063183475, + "grad_norm": 0.1591796875, + "learning_rate": 0.02, + "loss": 1.5051, + "step": 6993 + }, + { + "epoch": 0.6474801828386275, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.4882, + "step": 6994 + }, + { + "epoch": 0.6475727593589076, + "grad_norm": 0.1591796875, + "learning_rate": 0.02, + "loss": 1.5555, + "step": 6995 + }, + { + "epoch": 0.6476653358791876, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.5144, + "step": 6996 + }, + { + "epoch": 0.6477579123994677, + "grad_norm": 0.16015625, + "learning_rate": 0.02, + "loss": 1.5738, + "step": 6997 + }, + { + "epoch": 0.6478504889197477, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5481, + "step": 6998 + }, + { + "epoch": 0.6479430654400278, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5341, + "step": 6999 + }, + { + "epoch": 0.6480356419603078, + "grad_norm": 0.1611328125, + "learning_rate": 0.02, + "loss": 1.57, + "step": 7000 + }, + { + "epoch": 0.6481282184805879, + "grad_norm": 0.1611328125, + "learning_rate": 0.02, + "loss": 1.5522, + "step": 7001 + }, + { + "epoch": 0.6482207950008679, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.4615, + "step": 7002 + }, + { + "epoch": 0.648313371521148, + "grad_norm": 0.1376953125, + "learning_rate": 0.02, + "loss": 1.4901, + "step": 7003 + }, + { + "epoch": 0.648405948041428, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5291, + "step": 7004 + }, + { + "epoch": 0.6484985245617081, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5162, + "step": 7005 + }, + { + "epoch": 0.648591101081988, + "grad_norm": 0.1328125, + "learning_rate": 0.02, + "loss": 1.55, + "step": 7006 + }, + { + "epoch": 0.6486836776022681, + "grad_norm": 0.13671875, + "learning_rate": 0.02, + "loss": 1.503, + "step": 7007 + }, + { + "epoch": 0.6487762541225481, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5728, + "step": 7008 + }, + { + "epoch": 0.6488688306428282, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5504, + "step": 7009 + }, + { + "epoch": 0.6489614071631082, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.5112, + "step": 7010 + }, + { + "epoch": 0.6490539836833883, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.498, + "step": 7011 + }, + { + "epoch": 0.6491465602036683, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5504, + "step": 7012 + }, + { + "epoch": 0.6492391367239484, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5146, + "step": 7013 + }, + { + "epoch": 0.6493317132442284, + "grad_norm": 0.134765625, + "learning_rate": 0.02, + "loss": 1.5434, + "step": 7014 + }, + { + "epoch": 0.6494242897645085, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.6207, + "step": 7015 + }, + { + "epoch": 0.6495168662847886, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5758, + "step": 7016 + }, + { + "epoch": 0.6496094428050686, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.5382, + "step": 7017 + }, + { + "epoch": 0.6497020193253487, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.51, + "step": 7018 + }, + { + "epoch": 0.6497945958456286, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5833, + "step": 7019 + }, + { + "epoch": 0.6498871723659086, + "grad_norm": 0.13671875, + "learning_rate": 0.02, + "loss": 1.5017, + "step": 7020 + }, + { + "epoch": 0.6499797488861887, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.6002, + "step": 7021 + }, + { + "epoch": 0.6500723254064688, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.5729, + "step": 7022 + }, + { + "epoch": 0.6501649019267488, + "grad_norm": 0.1611328125, + "learning_rate": 0.02, + "loss": 1.5552, + "step": 7023 + }, + { + "epoch": 0.6502574784470289, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5883, + "step": 7024 + }, + { + "epoch": 0.6503500549673089, + "grad_norm": 0.16015625, + "learning_rate": 0.02, + "loss": 1.5193, + "step": 7025 + }, + { + "epoch": 0.650442631487589, + "grad_norm": 0.1640625, + "learning_rate": 0.02, + "loss": 1.5085, + "step": 7026 + }, + { + "epoch": 0.650535208007869, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5219, + "step": 7027 + }, + { + "epoch": 0.6506277845281491, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5924, + "step": 7028 + }, + { + "epoch": 0.6507203610484291, + "grad_norm": 0.162109375, + "learning_rate": 0.02, + "loss": 1.5353, + "step": 7029 + }, + { + "epoch": 0.6508129375687092, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.498, + "step": 7030 + }, + { + "epoch": 0.6509055140889892, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5127, + "step": 7031 + }, + { + "epoch": 0.6509980906092693, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.5929, + "step": 7032 + }, + { + "epoch": 0.6510906671295492, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5752, + "step": 7033 + }, + { + "epoch": 0.6511832436498293, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5528, + "step": 7034 + }, + { + "epoch": 0.6512758201701093, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5595, + "step": 7035 + }, + { + "epoch": 0.6513683966903894, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5401, + "step": 7036 + }, + { + "epoch": 0.6514609732106694, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.6001, + "step": 7037 + }, + { + "epoch": 0.6515535497309495, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5089, + "step": 7038 + }, + { + "epoch": 0.6516461262512295, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5124, + "step": 7039 + }, + { + "epoch": 0.6517387027715096, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5675, + "step": 7040 + }, + { + "epoch": 0.6518312792917896, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.4882, + "step": 7041 + }, + { + "epoch": 0.6519238558120697, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5613, + "step": 7042 + }, + { + "epoch": 0.6520164323323497, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5192, + "step": 7043 + }, + { + "epoch": 0.6521090088526298, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5245, + "step": 7044 + }, + { + "epoch": 0.6522015853729098, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5628, + "step": 7045 + }, + { + "epoch": 0.6522941618931899, + "grad_norm": 0.1357421875, + "learning_rate": 0.02, + "loss": 1.5165, + "step": 7046 + }, + { + "epoch": 0.6523867384134698, + "grad_norm": 0.1640625, + "learning_rate": 0.02, + "loss": 1.4986, + "step": 7047 + }, + { + "epoch": 0.6524793149337499, + "grad_norm": 0.1376953125, + "learning_rate": 0.02, + "loss": 1.5836, + "step": 7048 + }, + { + "epoch": 0.6525718914540299, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5569, + "step": 7049 + }, + { + "epoch": 0.65266446797431, + "grad_norm": 0.13671875, + "learning_rate": 0.02, + "loss": 1.5142, + "step": 7050 + }, + { + "epoch": 0.65275704449459, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.6124, + "step": 7051 + }, + { + "epoch": 0.6528496210148701, + "grad_norm": 0.1357421875, + "learning_rate": 0.02, + "loss": 1.4804, + "step": 7052 + }, + { + "epoch": 0.6529421975351501, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.534, + "step": 7053 + }, + { + "epoch": 0.6530347740554302, + "grad_norm": 0.1376953125, + "learning_rate": 0.02, + "loss": 1.5432, + "step": 7054 + }, + { + "epoch": 0.6531273505757103, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.4559, + "step": 7055 + }, + { + "epoch": 0.6532199270959903, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5517, + "step": 7056 + }, + { + "epoch": 0.6533125036162704, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5942, + "step": 7057 + }, + { + "epoch": 0.6534050801365504, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.4725, + "step": 7058 + }, + { + "epoch": 0.6534976566568305, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5225, + "step": 7059 + }, + { + "epoch": 0.6535902331771104, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5129, + "step": 7060 + }, + { + "epoch": 0.6536828096973905, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5477, + "step": 7061 + }, + { + "epoch": 0.6537753862176705, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.4667, + "step": 7062 + }, + { + "epoch": 0.6538679627379506, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5612, + "step": 7063 + }, + { + "epoch": 0.6539605392582306, + "grad_norm": 0.1630859375, + "learning_rate": 0.02, + "loss": 1.524, + "step": 7064 + }, + { + "epoch": 0.6540531157785107, + "grad_norm": 0.1630859375, + "learning_rate": 0.02, + "loss": 1.5737, + "step": 7065 + }, + { + "epoch": 0.6541456922987907, + "grad_norm": 0.16015625, + "learning_rate": 0.02, + "loss": 1.5636, + "step": 7066 + }, + { + "epoch": 0.6542382688190708, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.538, + "step": 7067 + }, + { + "epoch": 0.6543308453393508, + "grad_norm": 0.1591796875, + "learning_rate": 0.02, + "loss": 1.4911, + "step": 7068 + }, + { + "epoch": 0.6544234218596309, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5825, + "step": 7069 + }, + { + "epoch": 0.6545159983799109, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.489, + "step": 7070 + }, + { + "epoch": 0.654608574900191, + "grad_norm": 0.13671875, + "learning_rate": 0.02, + "loss": 1.4667, + "step": 7071 + }, + { + "epoch": 0.654701151420471, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5188, + "step": 7072 + }, + { + "epoch": 0.6547937279407511, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.4967, + "step": 7073 + }, + { + "epoch": 0.654886304461031, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.502, + "step": 7074 + }, + { + "epoch": 0.6549788809813111, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.538, + "step": 7075 + }, + { + "epoch": 0.6550714575015911, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5159, + "step": 7076 + }, + { + "epoch": 0.6551640340218712, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5382, + "step": 7077 + }, + { + "epoch": 0.6552566105421512, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5602, + "step": 7078 + }, + { + "epoch": 0.6553491870624313, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.4543, + "step": 7079 + }, + { + "epoch": 0.6554417635827113, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.543, + "step": 7080 + }, + { + "epoch": 0.6555343401029914, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5178, + "step": 7081 + }, + { + "epoch": 0.6556269166232714, + "grad_norm": 0.162109375, + "learning_rate": 0.02, + "loss": 1.5675, + "step": 7082 + }, + { + "epoch": 0.6557194931435515, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5709, + "step": 7083 + }, + { + "epoch": 0.6558120696638315, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5332, + "step": 7084 + }, + { + "epoch": 0.6559046461841116, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.6065, + "step": 7085 + }, + { + "epoch": 0.6559972227043916, + "grad_norm": 0.13671875, + "learning_rate": 0.02, + "loss": 1.5203, + "step": 7086 + }, + { + "epoch": 0.6560897992246716, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.5032, + "step": 7087 + }, + { + "epoch": 0.6561823757449516, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5191, + "step": 7088 + }, + { + "epoch": 0.6562749522652317, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.5865, + "step": 7089 + }, + { + "epoch": 0.6563675287855117, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.5691, + "step": 7090 + }, + { + "epoch": 0.6564601053057918, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5632, + "step": 7091 + }, + { + "epoch": 0.6565526818260718, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5852, + "step": 7092 + }, + { + "epoch": 0.6566452583463519, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5442, + "step": 7093 + }, + { + "epoch": 0.656737834866632, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5066, + "step": 7094 + }, + { + "epoch": 0.656830411386912, + "grad_norm": 0.16015625, + "learning_rate": 0.02, + "loss": 1.5269, + "step": 7095 + }, + { + "epoch": 0.6569229879071921, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5695, + "step": 7096 + }, + { + "epoch": 0.6570155644274721, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5025, + "step": 7097 + }, + { + "epoch": 0.6571081409477522, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5173, + "step": 7098 + }, + { + "epoch": 0.6572007174680322, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5416, + "step": 7099 + }, + { + "epoch": 0.6572932939883123, + "grad_norm": 0.1357421875, + "learning_rate": 0.02, + "loss": 1.513, + "step": 7100 + }, + { + "epoch": 0.6573858705085922, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5878, + "step": 7101 + }, + { + "epoch": 0.6574784470288723, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5321, + "step": 7102 + }, + { + "epoch": 0.6575710235491523, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.4953, + "step": 7103 + }, + { + "epoch": 0.6576636000694324, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5587, + "step": 7104 + }, + { + "epoch": 0.6577561765897124, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.5255, + "step": 7105 + }, + { + "epoch": 0.6578487531099925, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5729, + "step": 7106 + }, + { + "epoch": 0.6579413296302725, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.4722, + "step": 7107 + }, + { + "epoch": 0.6580339061505526, + "grad_norm": 0.1396484375, + "learning_rate": 0.02, + "loss": 1.5141, + "step": 7108 + }, + { + "epoch": 0.6581264826708326, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.53, + "step": 7109 + }, + { + "epoch": 0.6582190591911127, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5049, + "step": 7110 + }, + { + "epoch": 0.6583116357113927, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.5695, + "step": 7111 + }, + { + "epoch": 0.6584042122316728, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5702, + "step": 7112 + }, + { + "epoch": 0.6584967887519528, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5384, + "step": 7113 + }, + { + "epoch": 0.6585893652722329, + "grad_norm": 0.13671875, + "learning_rate": 0.02, + "loss": 1.5777, + "step": 7114 + }, + { + "epoch": 0.6586819417925128, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5142, + "step": 7115 + }, + { + "epoch": 0.6587745183127929, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5431, + "step": 7116 + }, + { + "epoch": 0.6588670948330729, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.4982, + "step": 7117 + }, + { + "epoch": 0.658959671353353, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5792, + "step": 7118 + }, + { + "epoch": 0.659052247873633, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5506, + "step": 7119 + }, + { + "epoch": 0.6591448243939131, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.5503, + "step": 7120 + }, + { + "epoch": 0.6592374009141931, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5406, + "step": 7121 + }, + { + "epoch": 0.6593299774344732, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5276, + "step": 7122 + }, + { + "epoch": 0.6594225539547532, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5232, + "step": 7123 + }, + { + "epoch": 0.6595151304750333, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.562, + "step": 7124 + }, + { + "epoch": 0.6596077069953133, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5392, + "step": 7125 + }, + { + "epoch": 0.6597002835155934, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5357, + "step": 7126 + }, + { + "epoch": 0.6597928600358735, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5559, + "step": 7127 + }, + { + "epoch": 0.6598854365561534, + "grad_norm": 0.162109375, + "learning_rate": 0.02, + "loss": 1.4738, + "step": 7128 + }, + { + "epoch": 0.6599780130764334, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.5174, + "step": 7129 + }, + { + "epoch": 0.6600705895967135, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.4874, + "step": 7130 + }, + { + "epoch": 0.6601631661169935, + "grad_norm": 0.13671875, + "learning_rate": 0.02, + "loss": 1.5331, + "step": 7131 + }, + { + "epoch": 0.6602557426372736, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5166, + "step": 7132 + }, + { + "epoch": 0.6603483191575537, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5238, + "step": 7133 + }, + { + "epoch": 0.6604408956778337, + "grad_norm": 0.16015625, + "learning_rate": 0.02, + "loss": 1.5368, + "step": 7134 + }, + { + "epoch": 0.6605334721981138, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.4943, + "step": 7135 + }, + { + "epoch": 0.6606260487183938, + "grad_norm": 0.1591796875, + "learning_rate": 0.02, + "loss": 1.5634, + "step": 7136 + }, + { + "epoch": 0.6607186252386739, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5243, + "step": 7137 + }, + { + "epoch": 0.6608112017589539, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.4649, + "step": 7138 + }, + { + "epoch": 0.660903778279234, + "grad_norm": 0.1376953125, + "learning_rate": 0.02, + "loss": 1.5548, + "step": 7139 + }, + { + "epoch": 0.660996354799514, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.546, + "step": 7140 + }, + { + "epoch": 0.6610889313197941, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5231, + "step": 7141 + }, + { + "epoch": 0.661181507840074, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.4466, + "step": 7142 + }, + { + "epoch": 0.6612740843603541, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.6078, + "step": 7143 + }, + { + "epoch": 0.6613666608806341, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5178, + "step": 7144 + }, + { + "epoch": 0.6614592374009142, + "grad_norm": 0.138671875, + "learning_rate": 0.02, + "loss": 1.5519, + "step": 7145 + }, + { + "epoch": 0.6615518139211942, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5726, + "step": 7146 + }, + { + "epoch": 0.6616443904414743, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5071, + "step": 7147 + }, + { + "epoch": 0.6617369669617543, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5506, + "step": 7148 + }, + { + "epoch": 0.6618295434820344, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5557, + "step": 7149 + }, + { + "epoch": 0.6619221200023144, + "grad_norm": 0.1640625, + "learning_rate": 0.02, + "loss": 1.598, + "step": 7150 + }, + { + "epoch": 0.6620146965225945, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5964, + "step": 7151 + }, + { + "epoch": 0.6621072730428745, + "grad_norm": 0.166015625, + "learning_rate": 0.02, + "loss": 1.5408, + "step": 7152 + }, + { + "epoch": 0.6621998495631546, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.5969, + "step": 7153 + }, + { + "epoch": 0.6622924260834346, + "grad_norm": 0.166015625, + "learning_rate": 0.02, + "loss": 1.5948, + "step": 7154 + }, + { + "epoch": 0.6623850026037146, + "grad_norm": 0.162109375, + "learning_rate": 0.02, + "loss": 1.5563, + "step": 7155 + }, + { + "epoch": 0.6624775791239946, + "grad_norm": 0.166015625, + "learning_rate": 0.02, + "loss": 1.5378, + "step": 7156 + }, + { + "epoch": 0.6625701556442747, + "grad_norm": 0.1787109375, + "learning_rate": 0.02, + "loss": 1.5333, + "step": 7157 + }, + { + "epoch": 0.6626627321645547, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5619, + "step": 7158 + }, + { + "epoch": 0.6627553086848348, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5354, + "step": 7159 + }, + { + "epoch": 0.6628478852051148, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.5298, + "step": 7160 + }, + { + "epoch": 0.6629404617253949, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.5279, + "step": 7161 + }, + { + "epoch": 0.6630330382456749, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.5337, + "step": 7162 + }, + { + "epoch": 0.663125614765955, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.5593, + "step": 7163 + }, + { + "epoch": 0.663218191286235, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5441, + "step": 7164 + }, + { + "epoch": 0.6633107678065151, + "grad_norm": 0.162109375, + "learning_rate": 0.02, + "loss": 1.6139, + "step": 7165 + }, + { + "epoch": 0.6634033443267952, + "grad_norm": 0.1591796875, + "learning_rate": 0.02, + "loss": 1.4896, + "step": 7166 + }, + { + "epoch": 0.6634959208470752, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.6001, + "step": 7167 + }, + { + "epoch": 0.6635884973673553, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5397, + "step": 7168 + }, + { + "epoch": 0.6636810738876352, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.5601, + "step": 7169 + }, + { + "epoch": 0.6637736504079152, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5898, + "step": 7170 + }, + { + "epoch": 0.6638662269281953, + "grad_norm": 0.1611328125, + "learning_rate": 0.02, + "loss": 1.5099, + "step": 7171 + }, + { + "epoch": 0.6639588034484754, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.5629, + "step": 7172 + }, + { + "epoch": 0.6640513799687554, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.4783, + "step": 7173 + }, + { + "epoch": 0.6641439564890355, + "grad_norm": 0.138671875, + "learning_rate": 0.02, + "loss": 1.4947, + "step": 7174 + }, + { + "epoch": 0.6642365330093155, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5053, + "step": 7175 + }, + { + "epoch": 0.6643291095295956, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5149, + "step": 7176 + }, + { + "epoch": 0.6644216860498756, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5256, + "step": 7177 + }, + { + "epoch": 0.6645142625701557, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5664, + "step": 7178 + }, + { + "epoch": 0.6646068390904357, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.5148, + "step": 7179 + }, + { + "epoch": 0.6646994156107158, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5602, + "step": 7180 + }, + { + "epoch": 0.6647919921309958, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5878, + "step": 7181 + }, + { + "epoch": 0.6648845686512758, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5171, + "step": 7182 + }, + { + "epoch": 0.6649771451715558, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.5691, + "step": 7183 + }, + { + "epoch": 0.6650697216918359, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5745, + "step": 7184 + }, + { + "epoch": 0.6651622982121159, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.4759, + "step": 7185 + }, + { + "epoch": 0.665254874732396, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5892, + "step": 7186 + }, + { + "epoch": 0.665347451252676, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5543, + "step": 7187 + }, + { + "epoch": 0.6654400277729561, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.502, + "step": 7188 + }, + { + "epoch": 0.6655326042932361, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.474, + "step": 7189 + }, + { + "epoch": 0.6656251808135162, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5121, + "step": 7190 + }, + { + "epoch": 0.6657177573337962, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.4957, + "step": 7191 + }, + { + "epoch": 0.6658103338540763, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.4405, + "step": 7192 + }, + { + "epoch": 0.6659029103743563, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.5286, + "step": 7193 + }, + { + "epoch": 0.6659954868946364, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5325, + "step": 7194 + }, + { + "epoch": 0.6660880634149164, + "grad_norm": 0.1591796875, + "learning_rate": 0.02, + "loss": 1.5783, + "step": 7195 + }, + { + "epoch": 0.6661806399351964, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5453, + "step": 7196 + }, + { + "epoch": 0.6662732164554764, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5387, + "step": 7197 + }, + { + "epoch": 0.6663657929757565, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.571, + "step": 7198 + }, + { + "epoch": 0.6664583694960365, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.5366, + "step": 7199 + }, + { + "epoch": 0.6665509460163166, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5379, + "step": 7200 + }, + { + "epoch": 0.6666435225365966, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.4347, + "step": 7201 + }, + { + "epoch": 0.6667360990568767, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5078, + "step": 7202 + }, + { + "epoch": 0.6668286755771567, + "grad_norm": 0.1591796875, + "learning_rate": 0.02, + "loss": 1.4877, + "step": 7203 + }, + { + "epoch": 0.6669212520974368, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.577, + "step": 7204 + }, + { + "epoch": 0.6670138286177169, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5395, + "step": 7205 + }, + { + "epoch": 0.6671064051379969, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5582, + "step": 7206 + }, + { + "epoch": 0.667198981658277, + "grad_norm": 0.162109375, + "learning_rate": 0.02, + "loss": 1.6177, + "step": 7207 + }, + { + "epoch": 0.667291558178557, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.5635, + "step": 7208 + }, + { + "epoch": 0.6673841346988371, + "grad_norm": 0.138671875, + "learning_rate": 0.02, + "loss": 1.4763, + "step": 7209 + }, + { + "epoch": 0.667476711219117, + "grad_norm": 0.1640625, + "learning_rate": 0.02, + "loss": 1.5029, + "step": 7210 + }, + { + "epoch": 0.667569287739397, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5508, + "step": 7211 + }, + { + "epoch": 0.6676618642596771, + "grad_norm": 0.16015625, + "learning_rate": 0.02, + "loss": 1.5336, + "step": 7212 + }, + { + "epoch": 0.6677544407799572, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.4986, + "step": 7213 + }, + { + "epoch": 0.6678470173002372, + "grad_norm": 0.162109375, + "learning_rate": 0.02, + "loss": 1.5035, + "step": 7214 + }, + { + "epoch": 0.6679395938205173, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.56, + "step": 7215 + }, + { + "epoch": 0.6680321703407973, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5499, + "step": 7216 + }, + { + "epoch": 0.6681247468610774, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.6123, + "step": 7217 + }, + { + "epoch": 0.6682173233813574, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.5434, + "step": 7218 + }, + { + "epoch": 0.6683098999016375, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.4776, + "step": 7219 + }, + { + "epoch": 0.6684024764219175, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.524, + "step": 7220 + }, + { + "epoch": 0.6684950529421976, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.525, + "step": 7221 + }, + { + "epoch": 0.6685876294624776, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5257, + "step": 7222 + }, + { + "epoch": 0.6686802059827576, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5541, + "step": 7223 + }, + { + "epoch": 0.6687727825030376, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5456, + "step": 7224 + }, + { + "epoch": 0.6688653590233177, + "grad_norm": 0.1591796875, + "learning_rate": 0.02, + "loss": 1.4785, + "step": 7225 + }, + { + "epoch": 0.6689579355435977, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5146, + "step": 7226 + }, + { + "epoch": 0.6690505120638778, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.4814, + "step": 7227 + }, + { + "epoch": 0.6691430885841578, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.4842, + "step": 7228 + }, + { + "epoch": 0.6692356651044379, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.4826, + "step": 7229 + }, + { + "epoch": 0.6693282416247179, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.5381, + "step": 7230 + }, + { + "epoch": 0.669420818144998, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.6165, + "step": 7231 + }, + { + "epoch": 0.669513394665278, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.4714, + "step": 7232 + }, + { + "epoch": 0.6696059711855581, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5282, + "step": 7233 + }, + { + "epoch": 0.6696985477058381, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.6161, + "step": 7234 + }, + { + "epoch": 0.6697911242261182, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.498, + "step": 7235 + }, + { + "epoch": 0.6698837007463982, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.465, + "step": 7236 + }, + { + "epoch": 0.6699762772666782, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5868, + "step": 7237 + }, + { + "epoch": 0.6700688537869582, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5231, + "step": 7238 + }, + { + "epoch": 0.6701614303072383, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5378, + "step": 7239 + }, + { + "epoch": 0.6702540068275183, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5046, + "step": 7240 + }, + { + "epoch": 0.6703465833477984, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5005, + "step": 7241 + }, + { + "epoch": 0.6704391598680784, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.4253, + "step": 7242 + }, + { + "epoch": 0.6705317363883585, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.4826, + "step": 7243 + }, + { + "epoch": 0.6706243129086386, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.4633, + "step": 7244 + }, + { + "epoch": 0.6707168894289186, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.4599, + "step": 7245 + }, + { + "epoch": 0.6708094659491987, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5515, + "step": 7246 + }, + { + "epoch": 0.6709020424694787, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5312, + "step": 7247 + }, + { + "epoch": 0.6709946189897588, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.488, + "step": 7248 + }, + { + "epoch": 0.6710871955100388, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5267, + "step": 7249 + }, + { + "epoch": 0.6711797720303188, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.4875, + "step": 7250 + }, + { + "epoch": 0.6712723485505988, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5055, + "step": 7251 + }, + { + "epoch": 0.6713649250708789, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.5251, + "step": 7252 + }, + { + "epoch": 0.6714575015911589, + "grad_norm": 0.1591796875, + "learning_rate": 0.02, + "loss": 1.4931, + "step": 7253 + }, + { + "epoch": 0.671550078111439, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5863, + "step": 7254 + }, + { + "epoch": 0.671642654631719, + "grad_norm": 0.1611328125, + "learning_rate": 0.02, + "loss": 1.5178, + "step": 7255 + }, + { + "epoch": 0.6717352311519991, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5436, + "step": 7256 + }, + { + "epoch": 0.6718278076722791, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.4701, + "step": 7257 + }, + { + "epoch": 0.6719203841925592, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5031, + "step": 7258 + }, + { + "epoch": 0.6720129607128392, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5146, + "step": 7259 + }, + { + "epoch": 0.6721055372331193, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5435, + "step": 7260 + }, + { + "epoch": 0.6721981137533993, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.4942, + "step": 7261 + }, + { + "epoch": 0.6722906902736794, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.4986, + "step": 7262 + }, + { + "epoch": 0.6723832667939594, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.4925, + "step": 7263 + }, + { + "epoch": 0.6724758433142394, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5009, + "step": 7264 + }, + { + "epoch": 0.6725684198345194, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5492, + "step": 7265 + }, + { + "epoch": 0.6726609963547995, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.529, + "step": 7266 + }, + { + "epoch": 0.6727535728750795, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.4179, + "step": 7267 + }, + { + "epoch": 0.6728461493953596, + "grad_norm": 0.1611328125, + "learning_rate": 0.02, + "loss": 1.4933, + "step": 7268 + }, + { + "epoch": 0.6729387259156396, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5538, + "step": 7269 + }, + { + "epoch": 0.6730313024359197, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.5521, + "step": 7270 + }, + { + "epoch": 0.6731238789561997, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5315, + "step": 7271 + }, + { + "epoch": 0.6732164554764798, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5084, + "step": 7272 + }, + { + "epoch": 0.6733090319967598, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5228, + "step": 7273 + }, + { + "epoch": 0.6734016085170399, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5578, + "step": 7274 + }, + { + "epoch": 0.67349418503732, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.4632, + "step": 7275 + }, + { + "epoch": 0.6735867615576, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5971, + "step": 7276 + }, + { + "epoch": 0.67367933807788, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5007, + "step": 7277 + }, + { + "epoch": 0.67377191459816, + "grad_norm": 0.162109375, + "learning_rate": 0.02, + "loss": 1.5587, + "step": 7278 + }, + { + "epoch": 0.67386449111844, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.4848, + "step": 7279 + }, + { + "epoch": 0.6739570676387201, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.4997, + "step": 7280 + }, + { + "epoch": 0.6740496441590001, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.5154, + "step": 7281 + }, + { + "epoch": 0.6741422206792802, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5275, + "step": 7282 + }, + { + "epoch": 0.6742347971995603, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5133, + "step": 7283 + }, + { + "epoch": 0.6743273737198403, + "grad_norm": 0.162109375, + "learning_rate": 0.02, + "loss": 1.502, + "step": 7284 + }, + { + "epoch": 0.6744199502401204, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5601, + "step": 7285 + }, + { + "epoch": 0.6745125267604004, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5227, + "step": 7286 + }, + { + "epoch": 0.6746051032806805, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.4801, + "step": 7287 + }, + { + "epoch": 0.6746976798009605, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5126, + "step": 7288 + }, + { + "epoch": 0.6747902563212406, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5353, + "step": 7289 + }, + { + "epoch": 0.6748828328415206, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5646, + "step": 7290 + }, + { + "epoch": 0.6749754093618006, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.4658, + "step": 7291 + }, + { + "epoch": 0.6750679858820806, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5118, + "step": 7292 + }, + { + "epoch": 0.6751605624023607, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.596, + "step": 7293 + }, + { + "epoch": 0.6752531389226407, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5276, + "step": 7294 + }, + { + "epoch": 0.6753457154429208, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5659, + "step": 7295 + }, + { + "epoch": 0.6754382919632008, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5002, + "step": 7296 + }, + { + "epoch": 0.6755308684834809, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.5878, + "step": 7297 + }, + { + "epoch": 0.6756234450037609, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5248, + "step": 7298 + }, + { + "epoch": 0.675716021524041, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.4757, + "step": 7299 + }, + { + "epoch": 0.675808598044321, + "grad_norm": 0.140625, + "learning_rate": 0.02, + "loss": 1.477, + "step": 7300 + }, + { + "epoch": 0.6759011745646011, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5331, + "step": 7301 + }, + { + "epoch": 0.6759937510848811, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.4789, + "step": 7302 + }, + { + "epoch": 0.6760863276051612, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5093, + "step": 7303 + }, + { + "epoch": 0.6761789041254412, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5175, + "step": 7304 + }, + { + "epoch": 0.6762714806457212, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.4642, + "step": 7305 + }, + { + "epoch": 0.6763640571660012, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5742, + "step": 7306 + }, + { + "epoch": 0.6764566336862813, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.6081, + "step": 7307 + }, + { + "epoch": 0.6765492102065613, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5655, + "step": 7308 + }, + { + "epoch": 0.6766417867268414, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.5124, + "step": 7309 + }, + { + "epoch": 0.6767343632471214, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.4942, + "step": 7310 + }, + { + "epoch": 0.6768269397674015, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5616, + "step": 7311 + }, + { + "epoch": 0.6769195162876815, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5361, + "step": 7312 + }, + { + "epoch": 0.6770120928079616, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5343, + "step": 7313 + }, + { + "epoch": 0.6771046693282416, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5188, + "step": 7314 + }, + { + "epoch": 0.6771972458485217, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.56, + "step": 7315 + }, + { + "epoch": 0.6772898223688018, + "grad_norm": 0.1650390625, + "learning_rate": 0.02, + "loss": 1.5331, + "step": 7316 + }, + { + "epoch": 0.6773823988890818, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.5047, + "step": 7317 + }, + { + "epoch": 0.6774749754093617, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.4803, + "step": 7318 + }, + { + "epoch": 0.6775675519296418, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5665, + "step": 7319 + }, + { + "epoch": 0.6776601284499219, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.5727, + "step": 7320 + }, + { + "epoch": 0.6777527049702019, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5347, + "step": 7321 + }, + { + "epoch": 0.677845281490482, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.4872, + "step": 7322 + }, + { + "epoch": 0.677937858010762, + "grad_norm": 0.1591796875, + "learning_rate": 0.02, + "loss": 1.5092, + "step": 7323 + }, + { + "epoch": 0.6780304345310421, + "grad_norm": 0.1591796875, + "learning_rate": 0.02, + "loss": 1.4781, + "step": 7324 + }, + { + "epoch": 0.6781230110513221, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5573, + "step": 7325 + }, + { + "epoch": 0.6782155875716022, + "grad_norm": 0.1611328125, + "learning_rate": 0.02, + "loss": 1.4438, + "step": 7326 + }, + { + "epoch": 0.6783081640918822, + "grad_norm": 0.162109375, + "learning_rate": 0.02, + "loss": 1.552, + "step": 7327 + }, + { + "epoch": 0.6784007406121623, + "grad_norm": 0.1591796875, + "learning_rate": 0.02, + "loss": 1.5302, + "step": 7328 + }, + { + "epoch": 0.6784933171324423, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5429, + "step": 7329 + }, + { + "epoch": 0.6785858936527224, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.4792, + "step": 7330 + }, + { + "epoch": 0.6786784701730024, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.5632, + "step": 7331 + }, + { + "epoch": 0.6787710466932824, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5327, + "step": 7332 + }, + { + "epoch": 0.6788636232135624, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.5624, + "step": 7333 + }, + { + "epoch": 0.6789561997338425, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5891, + "step": 7334 + }, + { + "epoch": 0.6790487762541225, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5289, + "step": 7335 + }, + { + "epoch": 0.6791413527744026, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.4554, + "step": 7336 + }, + { + "epoch": 0.6792339292946826, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.525, + "step": 7337 + }, + { + "epoch": 0.6793265058149627, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.5914, + "step": 7338 + }, + { + "epoch": 0.6794190823352427, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5697, + "step": 7339 + }, + { + "epoch": 0.6795116588555228, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.4911, + "step": 7340 + }, + { + "epoch": 0.6796042353758028, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5078, + "step": 7341 + }, + { + "epoch": 0.6796968118960829, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5713, + "step": 7342 + }, + { + "epoch": 0.6797893884163629, + "grad_norm": 0.166015625, + "learning_rate": 0.02, + "loss": 1.4377, + "step": 7343 + }, + { + "epoch": 0.679881964936643, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.4304, + "step": 7344 + }, + { + "epoch": 0.679974541456923, + "grad_norm": 0.1611328125, + "learning_rate": 0.02, + "loss": 1.6018, + "step": 7345 + }, + { + "epoch": 0.680067117977203, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.6202, + "step": 7346 + }, + { + "epoch": 0.680159694497483, + "grad_norm": 0.16015625, + "learning_rate": 0.02, + "loss": 1.489, + "step": 7347 + }, + { + "epoch": 0.6802522710177631, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5354, + "step": 7348 + }, + { + "epoch": 0.6803448475380431, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.4541, + "step": 7349 + }, + { + "epoch": 0.6804374240583232, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5599, + "step": 7350 + }, + { + "epoch": 0.6805300005786032, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5078, + "step": 7351 + }, + { + "epoch": 0.6806225770988833, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.4506, + "step": 7352 + }, + { + "epoch": 0.6807151536191633, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5107, + "step": 7353 + }, + { + "epoch": 0.6808077301394434, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5214, + "step": 7354 + }, + { + "epoch": 0.6809003066597235, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.4983, + "step": 7355 + }, + { + "epoch": 0.6809928831800035, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.4641, + "step": 7356 + }, + { + "epoch": 0.6810854597002836, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5502, + "step": 7357 + }, + { + "epoch": 0.6811780362205636, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5399, + "step": 7358 + }, + { + "epoch": 0.6812706127408436, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5021, + "step": 7359 + }, + { + "epoch": 0.6813631892611236, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.469, + "step": 7360 + }, + { + "epoch": 0.6814557657814037, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5576, + "step": 7361 + }, + { + "epoch": 0.6815483423016837, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5761, + "step": 7362 + }, + { + "epoch": 0.6816409188219638, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.488, + "step": 7363 + }, + { + "epoch": 0.6817334953422438, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.4215, + "step": 7364 + }, + { + "epoch": 0.6818260718625239, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5353, + "step": 7365 + }, + { + "epoch": 0.6819186483828039, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5073, + "step": 7366 + }, + { + "epoch": 0.682011224903084, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.5642, + "step": 7367 + }, + { + "epoch": 0.682103801423364, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.4755, + "step": 7368 + }, + { + "epoch": 0.6821963779436441, + "grad_norm": 0.1435546875, + "learning_rate": 0.02, + "loss": 1.5806, + "step": 7369 + }, + { + "epoch": 0.6822889544639241, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5347, + "step": 7370 + }, + { + "epoch": 0.6823815309842042, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.5303, + "step": 7371 + }, + { + "epoch": 0.6824741075044842, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5137, + "step": 7372 + }, + { + "epoch": 0.6825666840247642, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5442, + "step": 7373 + }, + { + "epoch": 0.6826592605450442, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5166, + "step": 7374 + }, + { + "epoch": 0.6827518370653243, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.4808, + "step": 7375 + }, + { + "epoch": 0.6828444135856043, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.497, + "step": 7376 + }, + { + "epoch": 0.6829369901058844, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.4695, + "step": 7377 + }, + { + "epoch": 0.6830295666261644, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.521, + "step": 7378 + }, + { + "epoch": 0.6831221431464445, + "grad_norm": 0.162109375, + "learning_rate": 0.02, + "loss": 1.4887, + "step": 7379 + }, + { + "epoch": 0.6832147196667245, + "grad_norm": 0.1357421875, + "learning_rate": 0.02, + "loss": 1.4984, + "step": 7380 + }, + { + "epoch": 0.6833072961870046, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5001, + "step": 7381 + }, + { + "epoch": 0.6833998727072846, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5183, + "step": 7382 + }, + { + "epoch": 0.6834924492275647, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.495, + "step": 7383 + }, + { + "epoch": 0.6835850257478447, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5766, + "step": 7384 + }, + { + "epoch": 0.6836776022681248, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.5788, + "step": 7385 + }, + { + "epoch": 0.6837701787884047, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.5495, + "step": 7386 + }, + { + "epoch": 0.6838627553086848, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.4891, + "step": 7387 + }, + { + "epoch": 0.6839553318289648, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.5912, + "step": 7388 + }, + { + "epoch": 0.6840479083492449, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5436, + "step": 7389 + }, + { + "epoch": 0.684140484869525, + "grad_norm": 0.1611328125, + "learning_rate": 0.02, + "loss": 1.5307, + "step": 7390 + }, + { + "epoch": 0.684233061389805, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5441, + "step": 7391 + }, + { + "epoch": 0.684325637910085, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5218, + "step": 7392 + }, + { + "epoch": 0.6844182144303651, + "grad_norm": 0.1533203125, + "learning_rate": 0.02, + "loss": 1.6006, + "step": 7393 + }, + { + "epoch": 0.6845107909506452, + "grad_norm": 0.16015625, + "learning_rate": 0.02, + "loss": 1.5419, + "step": 7394 + }, + { + "epoch": 0.6846033674709252, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.4775, + "step": 7395 + }, + { + "epoch": 0.6846959439912053, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.545, + "step": 7396 + }, + { + "epoch": 0.6847885205114853, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.5628, + "step": 7397 + }, + { + "epoch": 0.6848810970317654, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5521, + "step": 7398 + }, + { + "epoch": 0.6849736735520454, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.514, + "step": 7399 + }, + { + "epoch": 0.6850662500723254, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5556, + "step": 7400 + }, + { + "epoch": 0.6851588265926054, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5129, + "step": 7401 + }, + { + "epoch": 0.6852514031128855, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.5262, + "step": 7402 + }, + { + "epoch": 0.6853439796331655, + "grad_norm": 0.1611328125, + "learning_rate": 0.02, + "loss": 1.5519, + "step": 7403 + }, + { + "epoch": 0.6854365561534456, + "grad_norm": 0.16015625, + "learning_rate": 0.02, + "loss": 1.5364, + "step": 7404 + }, + { + "epoch": 0.6855291326737256, + "grad_norm": 0.16015625, + "learning_rate": 0.02, + "loss": 1.4768, + "step": 7405 + }, + { + "epoch": 0.6856217091940057, + "grad_norm": 0.162109375, + "learning_rate": 0.02, + "loss": 1.5421, + "step": 7406 + }, + { + "epoch": 0.6857142857142857, + "grad_norm": 0.1591796875, + "learning_rate": 0.02, + "loss": 1.5112, + "step": 7407 + }, + { + "epoch": 0.6858068622345658, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5195, + "step": 7408 + }, + { + "epoch": 0.6858994387548458, + "grad_norm": 0.15625, + "learning_rate": 0.02, + "loss": 1.5407, + "step": 7409 + }, + { + "epoch": 0.6859920152751259, + "grad_norm": 0.1728515625, + "learning_rate": 0.02, + "loss": 1.4589, + "step": 7410 + }, + { + "epoch": 0.6860845917954059, + "grad_norm": 0.1455078125, + "learning_rate": 0.02, + "loss": 1.5148, + "step": 7411 + }, + { + "epoch": 0.686177168315686, + "grad_norm": 0.158203125, + "learning_rate": 0.02, + "loss": 1.4883, + "step": 7412 + }, + { + "epoch": 0.686269744835966, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.5213, + "step": 7413 + }, + { + "epoch": 0.686362321356246, + "grad_norm": 0.1572265625, + "learning_rate": 0.02, + "loss": 1.5104, + "step": 7414 + }, + { + "epoch": 0.686454897876526, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.548, + "step": 7415 + }, + { + "epoch": 0.6865474743968061, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.5317, + "step": 7416 + }, + { + "epoch": 0.6866400509170861, + "grad_norm": 0.1416015625, + "learning_rate": 0.02, + "loss": 1.516, + "step": 7417 + }, + { + "epoch": 0.6867326274373662, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5046, + "step": 7418 + }, + { + "epoch": 0.6868252039576462, + "grad_norm": 0.1484375, + "learning_rate": 0.02, + "loss": 1.4812, + "step": 7419 + }, + { + "epoch": 0.6869177804779263, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5018, + "step": 7420 + }, + { + "epoch": 0.6870103569982063, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.5047, + "step": 7421 + }, + { + "epoch": 0.6871029335184864, + "grad_norm": 0.1552734375, + "learning_rate": 0.02, + "loss": 1.5227, + "step": 7422 + }, + { + "epoch": 0.6871955100387664, + "grad_norm": 0.1474609375, + "learning_rate": 0.02, + "loss": 1.5506, + "step": 7423 + }, + { + "epoch": 0.6872880865590465, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.4786, + "step": 7424 + }, + { + "epoch": 0.6873806630793265, + "grad_norm": 0.1513671875, + "learning_rate": 0.02, + "loss": 1.4945, + "step": 7425 + }, + { + "epoch": 0.6874732395996066, + "grad_norm": 0.142578125, + "learning_rate": 0.02, + "loss": 1.4975, + "step": 7426 + }, + { + "epoch": 0.6875658161198865, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.6003, + "step": 7427 + }, + { + "epoch": 0.6876583926401666, + "grad_norm": 0.14453125, + "learning_rate": 0.02, + "loss": 1.4866, + "step": 7428 + }, + { + "epoch": 0.6877509691604466, + "grad_norm": 0.16015625, + "learning_rate": 0.02, + "loss": 1.4946, + "step": 7429 + }, + { + "epoch": 0.6878435456807267, + "grad_norm": 0.1337890625, + "learning_rate": 0.02, + "loss": 1.501, + "step": 7430 + }, + { + "epoch": 0.6879361222010068, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5101, + "step": 7431 + }, + { + "epoch": 0.6880286987212868, + "grad_norm": 0.150390625, + "learning_rate": 0.02, + "loss": 1.4444, + "step": 7432 + }, + { + "epoch": 0.6881212752415669, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.5034, + "step": 7433 + }, + { + "epoch": 0.6882138517618469, + "grad_norm": 0.146484375, + "learning_rate": 0.02, + "loss": 1.5182, + "step": 7434 + }, + { + "epoch": 0.688306428282127, + "grad_norm": 0.154296875, + "learning_rate": 0.02, + "loss": 1.6076, + "step": 7435 + }, + { + "epoch": 0.688399004802407, + "grad_norm": 0.1494140625, + "learning_rate": 0.02, + "loss": 1.4451, + "step": 7436 + }, + { + "epoch": 0.6884915813226871, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.513, + "step": 7437 + }, + { + "epoch": 0.6885841578429671, + "grad_norm": 0.15234375, + "learning_rate": 0.02, + "loss": 1.5914, + "step": 7438 + }, + { + "epoch": 0.6886767343632472, + "grad_norm": 0.1630859375, + "learning_rate": 0.02, + "loss": 1.5765, + "step": 7439 + }, + { + "epoch": 0.6887693108835272, + "grad_norm": 0.1357421875, + "learning_rate": 0.02, + "loss": 1.5209, + "step": 7440 } ], "logging_steps": 1, @@ -41365,13 +52117,13 @@ "should_epoch_stop": false, "should_evaluate": false, "should_log": false, - "should_save": true, + "should_save": false, "should_training_stop": false }, "attributes": {} } }, - "total_flos": 5.200473320974384e+18, + "total_flos": 6.554468380893837e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null