{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.030597377367654,
  "eval_steps": 500,
  "global_step": 1950,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.007770762506070908,
      "grad_norm": 0.18359375,
      "learning_rate": 0.0001,
      "loss": 0.3922,
      "step": 5
    },
    {
      "epoch": 0.015541525012141816,
      "grad_norm": 0.11865234375,
      "learning_rate": 0.0001,
      "loss": 0.2477,
      "step": 10
    },
    {
      "epoch": 0.023312287518212724,
      "grad_norm": 0.0654296875,
      "learning_rate": 0.0001,
      "loss": 0.1374,
      "step": 15
    },
    {
      "epoch": 0.03108305002428363,
      "grad_norm": 0.1083984375,
      "learning_rate": 0.0001,
      "loss": 0.1301,
      "step": 20
    },
    {
      "epoch": 0.03885381253035454,
      "grad_norm": 0.2119140625,
      "learning_rate": 0.0001,
      "loss": 0.1581,
      "step": 25
    },
    {
      "epoch": 0.04662457503642545,
      "grad_norm": 0.126953125,
      "learning_rate": 0.0001,
      "loss": 0.1942,
      "step": 30
    },
    {
      "epoch": 0.054395337542496355,
      "grad_norm": 0.251953125,
      "learning_rate": 0.0001,
      "loss": 0.2204,
      "step": 35
    },
    {
      "epoch": 0.06216610004856726,
      "grad_norm": 0.26953125,
      "learning_rate": 0.0001,
      "loss": 0.2906,
      "step": 40
    },
    {
      "epoch": 0.06993686255463817,
      "grad_norm": 0.21484375,
      "learning_rate": 0.0001,
      "loss": 0.2077,
      "step": 45
    },
    {
      "epoch": 0.07770762506070908,
      "grad_norm": 0.1708984375,
      "learning_rate": 0.0001,
      "loss": 0.2592,
      "step": 50
    },
    {
      "epoch": 0.08547838756677999,
      "grad_norm": 0.072265625,
      "learning_rate": 0.0001,
      "loss": 0.1449,
      "step": 55
    },
    {
      "epoch": 0.0932491500728509,
      "grad_norm": 0.0361328125,
      "learning_rate": 0.0001,
      "loss": 0.0462,
      "step": 60
    },
    {
      "epoch": 0.1010199125789218,
      "grad_norm": 0.054931640625,
      "learning_rate": 0.0001,
      "loss": 0.0439,
      "step": 65
    },
    {
      "epoch": 0.10879067508499271,
      "grad_norm": 0.04248046875,
      "learning_rate": 0.0001,
      "loss": 0.0318,
      "step": 70
    },
    {
      "epoch": 0.11656143759106362,
      "grad_norm": 0.09521484375,
      "learning_rate": 0.0001,
      "loss": 0.0901,
      "step": 75
    },
    {
      "epoch": 0.12433220009713453,
      "grad_norm": 0.095703125,
      "learning_rate": 0.0001,
      "loss": 0.1907,
      "step": 80
    },
    {
      "epoch": 0.13210296260320545,
      "grad_norm": 0.1279296875,
      "learning_rate": 0.0001,
      "loss": 0.1495,
      "step": 85
    },
    {
      "epoch": 0.13987372510927634,
      "grad_norm": 0.2236328125,
      "learning_rate": 0.0001,
      "loss": 0.1917,
      "step": 90
    },
    {
      "epoch": 0.14764448761534726,
      "grad_norm": 0.125,
      "learning_rate": 0.0001,
      "loss": 0.1706,
      "step": 95
    },
    {
      "epoch": 0.15541525012141816,
      "grad_norm": 0.279296875,
      "learning_rate": 0.0001,
      "loss": 0.2569,
      "step": 100
    },
    {
      "epoch": 0.16318601262748908,
      "grad_norm": 0.04052734375,
      "learning_rate": 0.0001,
      "loss": 0.1343,
      "step": 105
    },
    {
      "epoch": 0.17095677513355997,
      "grad_norm": 0.0478515625,
      "learning_rate": 0.0001,
      "loss": 0.0482,
      "step": 110
    },
    {
      "epoch": 0.1787275376396309,
      "grad_norm": 0.0281982421875,
      "learning_rate": 0.0001,
      "loss": 0.0397,
      "step": 115
    },
    {
      "epoch": 0.1864983001457018,
      "grad_norm": 0.025390625,
      "learning_rate": 0.0001,
      "loss": 0.0529,
      "step": 120
    },
    {
      "epoch": 0.1942690626517727,
      "grad_norm": 0.091796875,
      "learning_rate": 0.0001,
      "loss": 0.083,
      "step": 125
    },
    {
      "epoch": 0.2020398251578436,
      "grad_norm": 0.1123046875,
      "learning_rate": 0.0001,
      "loss": 0.1048,
      "step": 130
    },
    {
      "epoch": 0.20981058766391453,
      "grad_norm": 0.12353515625,
      "learning_rate": 0.0001,
      "loss": 0.1576,
      "step": 135
    },
    {
      "epoch": 0.21758135016998542,
      "grad_norm": 0.12353515625,
      "learning_rate": 0.0001,
      "loss": 0.1435,
      "step": 140
    },
    {
      "epoch": 0.22535211267605634,
      "grad_norm": 0.140625,
      "learning_rate": 0.0001,
      "loss": 0.1928,
      "step": 145
    },
    {
      "epoch": 0.23312287518212724,
      "grad_norm": 0.1669921875,
      "learning_rate": 0.0001,
      "loss": 0.2278,
      "step": 150
    },
    {
      "epoch": 0.24089363768819816,
      "grad_norm": 0.0439453125,
      "learning_rate": 0.0001,
      "loss": 0.0949,
      "step": 155
    },
    {
      "epoch": 0.24866440019426905,
      "grad_norm": 0.1015625,
      "learning_rate": 0.0001,
      "loss": 0.0518,
      "step": 160
    },
    {
      "epoch": 0.25643516270033995,
      "grad_norm": 0.040283203125,
      "learning_rate": 0.0001,
      "loss": 0.0378,
      "step": 165
    },
    {
      "epoch": 0.2642059252064109,
      "grad_norm": 0.041259765625,
      "learning_rate": 0.0001,
      "loss": 0.032,
      "step": 170
    },
    {
      "epoch": 0.2719766877124818,
      "grad_norm": 0.061279296875,
      "learning_rate": 0.0001,
      "loss": 0.0801,
      "step": 175
    },
    {
      "epoch": 0.2797474502185527,
      "grad_norm": 0.1025390625,
      "learning_rate": 0.0001,
      "loss": 0.1381,
      "step": 180
    },
    {
      "epoch": 0.2875182127246236,
      "grad_norm": 0.08154296875,
      "learning_rate": 0.0001,
      "loss": 0.1337,
      "step": 185
    },
    {
      "epoch": 0.29528897523069453,
      "grad_norm": 0.06494140625,
      "learning_rate": 0.0001,
      "loss": 0.1276,
      "step": 190
    },
    {
      "epoch": 0.3030597377367654,
      "grad_norm": 0.162109375,
      "learning_rate": 0.0001,
      "loss": 0.155,
      "step": 195
    },
    {
      "epoch": 0.3108305002428363,
      "grad_norm": 0.146484375,
      "learning_rate": 0.0001,
      "loss": 0.1496,
      "step": 200
    },
    {
      "epoch": 0.3186012627489072,
      "grad_norm": 0.0269775390625,
      "learning_rate": 0.0001,
      "loss": 0.1061,
      "step": 205
    },
    {
      "epoch": 0.32637202525497816,
      "grad_norm": 0.021484375,
      "learning_rate": 0.0001,
      "loss": 0.0486,
      "step": 210
    },
    {
      "epoch": 0.33414278776104905,
      "grad_norm": 0.0296630859375,
      "learning_rate": 0.0001,
      "loss": 0.0279,
      "step": 215
    },
    {
      "epoch": 0.34191355026711995,
      "grad_norm": 0.019775390625,
      "learning_rate": 0.0001,
      "loss": 0.0284,
      "step": 220
    },
    {
      "epoch": 0.34968431277319084,
      "grad_norm": 0.0791015625,
      "learning_rate": 0.0001,
      "loss": 0.0874,
      "step": 225
    },
    {
      "epoch": 0.3574550752792618,
      "grad_norm": 0.1044921875,
      "learning_rate": 0.0001,
      "loss": 0.1743,
      "step": 230
    },
    {
      "epoch": 0.3652258377853327,
      "grad_norm": 0.1435546875,
      "learning_rate": 0.0001,
      "loss": 0.1282,
      "step": 235
    },
    {
      "epoch": 0.3729966002914036,
      "grad_norm": 0.09521484375,
      "learning_rate": 0.0001,
      "loss": 0.1091,
      "step": 240
    },
    {
      "epoch": 0.38076736279747453,
      "grad_norm": 0.1728515625,
      "learning_rate": 0.0001,
      "loss": 0.1333,
      "step": 245
    },
    {
      "epoch": 0.3885381253035454,
      "grad_norm": 0.1630859375,
      "learning_rate": 0.0001,
      "loss": 0.1941,
      "step": 250
    },
    {
      "epoch": 0.3963088878096163,
      "grad_norm": 0.042236328125,
      "learning_rate": 0.0001,
      "loss": 0.1179,
      "step": 255
    },
    {
      "epoch": 0.4040796503156872,
      "grad_norm": 0.0595703125,
      "learning_rate": 0.0001,
      "loss": 0.0754,
      "step": 260
    },
    {
      "epoch": 0.41185041282175816,
      "grad_norm": 0.0546875,
      "learning_rate": 0.0001,
      "loss": 0.0429,
      "step": 265
    },
    {
      "epoch": 0.41962117532782905,
      "grad_norm": 0.047607421875,
      "learning_rate": 0.0001,
      "loss": 0.0354,
      "step": 270
    },
    {
      "epoch": 0.42739193783389995,
      "grad_norm": 0.09912109375,
      "learning_rate": 0.0001,
      "loss": 0.1228,
      "step": 275
    },
    {
      "epoch": 0.43516270033997084,
      "grad_norm": 0.087890625,
      "learning_rate": 0.0001,
      "loss": 0.1233,
      "step": 280
    },
    {
      "epoch": 0.4429334628460418,
      "grad_norm": 0.09326171875,
      "learning_rate": 0.0001,
      "loss": 0.1454,
      "step": 285
    },
    {
      "epoch": 0.4507042253521127,
      "grad_norm": 0.1337890625,
      "learning_rate": 0.0001,
      "loss": 0.1248,
      "step": 290
    },
    {
      "epoch": 0.4584749878581836,
      "grad_norm": 0.103515625,
      "learning_rate": 0.0001,
      "loss": 0.1154,
      "step": 295
    },
    {
      "epoch": 0.4662457503642545,
      "grad_norm": 0.1396484375,
      "learning_rate": 0.0001,
      "loss": 0.1596,
      "step": 300
    },
    {
      "epoch": 0.4740165128703254,
      "grad_norm": 0.048583984375,
      "learning_rate": 0.0001,
      "loss": 0.124,
      "step": 305
    },
    {
      "epoch": 0.4817872753763963,
      "grad_norm": 0.0205078125,
      "learning_rate": 0.0001,
      "loss": 0.0428,
      "step": 310
    },
    {
      "epoch": 0.4895580378824672,
      "grad_norm": 0.034423828125,
      "learning_rate": 0.0001,
      "loss": 0.016,
      "step": 315
    },
    {
      "epoch": 0.4973288003885381,
      "grad_norm": 0.0361328125,
      "learning_rate": 0.0001,
      "loss": 0.0196,
      "step": 320
    },
    {
      "epoch": 0.505099562894609,
      "grad_norm": 0.10595703125,
      "learning_rate": 0.0001,
      "loss": 0.1089,
      "step": 325
    },
    {
      "epoch": 0.5128703254006799,
      "grad_norm": 0.14453125,
      "learning_rate": 0.0001,
      "loss": 0.132,
      "step": 330
    },
    {
      "epoch": 0.5206410879067509,
      "grad_norm": 0.07861328125,
      "learning_rate": 0.0001,
      "loss": 0.1718,
      "step": 335
    },
    {
      "epoch": 0.5284118504128218,
      "grad_norm": 0.09814453125,
      "learning_rate": 0.0001,
      "loss": 0.1096,
      "step": 340
    },
    {
      "epoch": 0.5361826129188927,
      "grad_norm": 0.10986328125,
      "learning_rate": 0.0001,
      "loss": 0.1342,
      "step": 345
    },
    {
      "epoch": 0.5439533754249636,
      "grad_norm": 0.1630859375,
      "learning_rate": 0.0001,
      "loss": 0.1532,
      "step": 350
    },
    {
      "epoch": 0.5517241379310345,
      "grad_norm": 0.053955078125,
      "learning_rate": 0.0001,
      "loss": 0.112,
      "step": 355
    },
    {
      "epoch": 0.5594949004371054,
      "grad_norm": 0.051513671875,
      "learning_rate": 0.0001,
      "loss": 0.0371,
      "step": 360
    },
    {
      "epoch": 0.5672656629431763,
      "grad_norm": 0.0264892578125,
      "learning_rate": 0.0001,
      "loss": 0.0321,
      "step": 365
    },
    {
      "epoch": 0.5750364254492472,
      "grad_norm": 0.032470703125,
      "learning_rate": 0.0001,
      "loss": 0.0473,
      "step": 370
    },
    {
      "epoch": 0.5828071879553182,
      "grad_norm": 0.07568359375,
      "learning_rate": 0.0001,
      "loss": 0.0738,
      "step": 375
    },
    {
      "epoch": 0.5905779504613891,
      "grad_norm": 0.11572265625,
      "learning_rate": 0.0001,
      "loss": 0.1384,
      "step": 380
    },
    {
      "epoch": 0.59834871296746,
      "grad_norm": 0.1220703125,
      "learning_rate": 0.0001,
      "loss": 0.1191,
      "step": 385
    },
    {
      "epoch": 0.6061194754735308,
      "grad_norm": 0.14453125,
      "learning_rate": 0.0001,
      "loss": 0.1188,
      "step": 390
    },
    {
      "epoch": 0.6138902379796017,
      "grad_norm": 0.15234375,
      "learning_rate": 0.0001,
      "loss": 0.1327,
      "step": 395
    },
    {
      "epoch": 0.6216610004856726,
      "grad_norm": 0.1748046875,
      "learning_rate": 0.0001,
      "loss": 0.1695,
      "step": 400
    },
    {
      "epoch": 0.6294317629917435,
      "grad_norm": 0.046142578125,
      "learning_rate": 0.0001,
      "loss": 0.11,
      "step": 405
    },
    {
      "epoch": 0.6372025254978144,
      "grad_norm": 0.042236328125,
      "learning_rate": 0.0001,
      "loss": 0.052,
      "step": 410
    },
    {
      "epoch": 0.6449732880038854,
      "grad_norm": 0.0205078125,
      "learning_rate": 0.0001,
      "loss": 0.02,
      "step": 415
    },
    {
      "epoch": 0.6527440505099563,
      "grad_norm": 0.06982421875,
      "learning_rate": 0.0001,
      "loss": 0.0554,
      "step": 420
    },
    {
      "epoch": 0.6605148130160272,
      "grad_norm": 0.06396484375,
      "learning_rate": 0.0001,
      "loss": 0.063,
      "step": 425
    },
    {
      "epoch": 0.6682855755220981,
      "grad_norm": 0.0712890625,
      "learning_rate": 0.0001,
      "loss": 0.1229,
      "step": 430
    },
    {
      "epoch": 0.676056338028169,
      "grad_norm": 0.09619140625,
      "learning_rate": 0.0001,
      "loss": 0.1218,
      "step": 435
    },
    {
      "epoch": 0.6838271005342399,
      "grad_norm": 0.11376953125,
      "learning_rate": 0.0001,
      "loss": 0.11,
      "step": 440
    },
    {
      "epoch": 0.6915978630403108,
      "grad_norm": 0.10205078125,
      "learning_rate": 0.0001,
      "loss": 0.1338,
      "step": 445
    },
    {
      "epoch": 0.6993686255463817,
      "grad_norm": 0.216796875,
      "learning_rate": 0.0001,
      "loss": 0.2131,
      "step": 450
    },
    {
      "epoch": 0.7071393880524527,
      "grad_norm": 0.04541015625,
      "learning_rate": 0.0001,
      "loss": 0.111,
      "step": 455
    },
    {
      "epoch": 0.7149101505585236,
      "grad_norm": 0.0264892578125,
      "learning_rate": 0.0001,
      "loss": 0.0656,
      "step": 460
    },
    {
      "epoch": 0.7226809130645945,
      "grad_norm": 0.023193359375,
      "learning_rate": 0.0001,
      "loss": 0.0216,
      "step": 465
    },
    {
      "epoch": 0.7304516755706654,
      "grad_norm": 0.04296875,
      "learning_rate": 0.0001,
      "loss": 0.042,
      "step": 470
    },
    {
      "epoch": 0.7382224380767363,
      "grad_norm": 0.08056640625,
      "learning_rate": 0.0001,
      "loss": 0.0533,
      "step": 475
    },
    {
      "epoch": 0.7459932005828072,
      "grad_norm": 0.11083984375,
      "learning_rate": 0.0001,
      "loss": 0.0643,
      "step": 480
    },
    {
      "epoch": 0.753763963088878,
      "grad_norm": 0.08154296875,
      "learning_rate": 0.0001,
      "loss": 0.08,
      "step": 485
    },
    {
      "epoch": 0.7615347255949491,
      "grad_norm": 0.1171875,
      "learning_rate": 0.0001,
      "loss": 0.1347,
      "step": 490
    },
    {
      "epoch": 0.76930548810102,
      "grad_norm": 0.099609375,
      "learning_rate": 0.0001,
      "loss": 0.0792,
      "step": 495
    },
    {
      "epoch": 0.7770762506070908,
      "grad_norm": 0.2314453125,
      "learning_rate": 0.0001,
      "loss": 0.1212,
      "step": 500
    },
    {
      "epoch": 0.7848470131131617,
      "grad_norm": 0.05615234375,
      "learning_rate": 0.0001,
      "loss": 0.1122,
      "step": 505
    },
    {
      "epoch": 0.7926177756192326,
      "grad_norm": 0.006927490234375,
      "learning_rate": 0.0001,
      "loss": 0.0504,
      "step": 510
    },
    {
      "epoch": 0.8003885381253035,
      "grad_norm": 0.01495361328125,
      "learning_rate": 0.0001,
      "loss": 0.0143,
      "step": 515
    },
    {
      "epoch": 0.8081593006313744,
      "grad_norm": 0.03125,
      "learning_rate": 0.0001,
      "loss": 0.0229,
      "step": 520
    },
    {
      "epoch": 0.8159300631374453,
      "grad_norm": 0.0849609375,
      "learning_rate": 0.0001,
      "loss": 0.0618,
      "step": 525
    },
    {
      "epoch": 0.8237008256435163,
      "grad_norm": 0.1025390625,
      "learning_rate": 0.0001,
      "loss": 0.1189,
      "step": 530
    },
    {
      "epoch": 0.8314715881495872,
      "grad_norm": 0.11572265625,
      "learning_rate": 0.0001,
      "loss": 0.104,
      "step": 535
    },
    {
      "epoch": 0.8392423506556581,
      "grad_norm": 0.11376953125,
      "learning_rate": 0.0001,
      "loss": 0.0988,
      "step": 540
    },
    {
      "epoch": 0.847013113161729,
      "grad_norm": 0.13671875,
      "learning_rate": 0.0001,
      "loss": 0.1413,
      "step": 545
    },
    {
      "epoch": 0.8547838756677999,
      "grad_norm": 0.1181640625,
      "learning_rate": 0.0001,
      "loss": 0.1027,
      "step": 550
    },
    {
      "epoch": 0.8625546381738708,
      "grad_norm": 0.06787109375,
      "learning_rate": 0.0001,
      "loss": 0.1324,
      "step": 555
    },
    {
      "epoch": 0.8703254006799417,
      "grad_norm": 0.0062255859375,
      "learning_rate": 0.0001,
      "loss": 0.0265,
      "step": 560
    },
    {
      "epoch": 0.8780961631860126,
      "grad_norm": 0.0634765625,
      "learning_rate": 0.0001,
      "loss": 0.02,
      "step": 565
    },
    {
      "epoch": 0.8858669256920836,
      "grad_norm": 0.0296630859375,
      "learning_rate": 0.0001,
      "loss": 0.0326,
      "step": 570
    },
    {
      "epoch": 0.8936376881981545,
      "grad_norm": 0.0771484375,
      "learning_rate": 0.0001,
      "loss": 0.0746,
      "step": 575
    },
    {
      "epoch": 0.9014084507042254,
      "grad_norm": 0.08740234375,
      "learning_rate": 0.0001,
      "loss": 0.1296,
      "step": 580
    },
    {
      "epoch": 0.9091792132102963,
      "grad_norm": 0.11328125,
      "learning_rate": 0.0001,
      "loss": 0.1037,
      "step": 585
    },
    {
      "epoch": 0.9169499757163672,
      "grad_norm": 0.162109375,
      "learning_rate": 0.0001,
      "loss": 0.1419,
      "step": 590
    },
    {
      "epoch": 0.924720738222438,
      "grad_norm": 0.1201171875,
      "learning_rate": 0.0001,
      "loss": 0.1326,
      "step": 595
    },
    {
      "epoch": 0.932491500728509,
      "grad_norm": 0.1640625,
      "learning_rate": 0.0001,
      "loss": 0.154,
      "step": 600
    },
    {
      "epoch": 0.9402622632345798,
      "grad_norm": 0.05908203125,
      "learning_rate": 0.0001,
      "loss": 0.1029,
      "step": 605
    },
    {
      "epoch": 0.9480330257406508,
      "grad_norm": 0.032958984375,
      "learning_rate": 0.0001,
      "loss": 0.0223,
      "step": 610
    },
    {
      "epoch": 0.9558037882467217,
      "grad_norm": 0.0361328125,
      "learning_rate": 0.0001,
      "loss": 0.0326,
      "step": 615
    },
    {
      "epoch": 0.9635745507527926,
      "grad_norm": 0.054931640625,
      "learning_rate": 0.0001,
      "loss": 0.0454,
      "step": 620
    },
    {
      "epoch": 0.9713453132588635,
      "grad_norm": 0.1376953125,
      "learning_rate": 0.0001,
      "loss": 0.1382,
      "step": 625
    },
    {
      "epoch": 0.9791160757649344,
      "grad_norm": 0.09619140625,
      "learning_rate": 0.0001,
      "loss": 0.1148,
      "step": 630
    },
    {
      "epoch": 0.9868868382710053,
      "grad_norm": 0.12158203125,
      "learning_rate": 0.0001,
      "loss": 0.1029,
      "step": 635
    },
    {
      "epoch": 0.9946576007770762,
      "grad_norm": 0.09814453125,
      "learning_rate": 0.0001,
      "loss": 0.1802,
      "step": 640
    },
    {
      "epoch": 1.0024283632831472,
      "grad_norm": 0.047119140625,
      "learning_rate": 0.0001,
      "loss": 0.1323,
      "step": 645
    },
    {
      "epoch": 1.010199125789218,
      "grad_norm": 0.053955078125,
      "learning_rate": 0.0001,
      "loss": 0.0703,
      "step": 650
    },
    {
      "epoch": 1.017969888295289,
      "grad_norm": 0.00970458984375,
      "learning_rate": 0.0001,
      "loss": 0.0169,
      "step": 655
    },
    {
      "epoch": 1.0257406508013598,
      "grad_norm": 0.0311279296875,
      "learning_rate": 0.0001,
      "loss": 0.0106,
      "step": 660
    },
    {
      "epoch": 1.0335114133074308,
      "grad_norm": 0.04736328125,
      "learning_rate": 0.0001,
      "loss": 0.0357,
      "step": 665
    },
    {
      "epoch": 1.0412821758135018,
      "grad_norm": 0.083984375,
      "learning_rate": 0.0001,
      "loss": 0.0458,
      "step": 670
    },
    {
      "epoch": 1.0490529383195726,
      "grad_norm": 0.111328125,
      "learning_rate": 0.0001,
      "loss": 0.073,
      "step": 675
    },
    {
      "epoch": 1.0568237008256436,
      "grad_norm": 0.10595703125,
      "learning_rate": 0.0001,
      "loss": 0.059,
      "step": 680
    },
    {
      "epoch": 1.0645944633317144,
      "grad_norm": 0.1650390625,
      "learning_rate": 0.0001,
      "loss": 0.0613,
      "step": 685
    },
    {
      "epoch": 1.0723652258377854,
      "grad_norm": 0.162109375,
      "learning_rate": 0.0001,
      "loss": 0.0741,
      "step": 690
    },
    {
      "epoch": 1.0801359883438562,
      "grad_norm": 0.0654296875,
      "learning_rate": 0.0001,
      "loss": 0.0879,
      "step": 695
    },
    {
      "epoch": 1.0879067508499272,
      "grad_norm": 0.06005859375,
      "learning_rate": 0.0001,
      "loss": 0.0678,
      "step": 700
    },
    {
      "epoch": 1.095677513355998,
      "grad_norm": 0.0673828125,
      "learning_rate": 0.0001,
      "loss": 0.0388,
      "step": 705
    },
    {
      "epoch": 1.103448275862069,
      "grad_norm": 0.03662109375,
      "learning_rate": 0.0001,
      "loss": 0.0128,
      "step": 710
    },
    {
      "epoch": 1.11121903836814,
      "grad_norm": 0.020263671875,
      "learning_rate": 0.0001,
      "loss": 0.0178,
      "step": 715
    },
    {
      "epoch": 1.1189898008742107,
      "grad_norm": 0.0908203125,
      "learning_rate": 0.0001,
      "loss": 0.0538,
      "step": 720
    },
    {
      "epoch": 1.1267605633802817,
      "grad_norm": 0.10302734375,
      "learning_rate": 0.0001,
      "loss": 0.1117,
      "step": 725
    },
    {
      "epoch": 1.1345313258863525,
      "grad_norm": 0.126953125,
      "learning_rate": 0.0001,
      "loss": 0.0767,
      "step": 730
    },
    {
      "epoch": 1.1423020883924235,
      "grad_norm": 0.11474609375,
      "learning_rate": 0.0001,
      "loss": 0.061,
      "step": 735
    },
    {
      "epoch": 1.1500728508984945,
      "grad_norm": 0.15625,
      "learning_rate": 0.0001,
      "loss": 0.0714,
      "step": 740
    },
    {
      "epoch": 1.1578436134045653,
      "grad_norm": 0.072265625,
      "learning_rate": 0.0001,
      "loss": 0.0921,
      "step": 745
    },
    {
      "epoch": 1.1656143759106363,
      "grad_norm": 0.050537109375,
      "learning_rate": 0.0001,
      "loss": 0.0691,
      "step": 750
    },
    {
      "epoch": 1.173385138416707,
      "grad_norm": 0.028076171875,
      "learning_rate": 0.0001,
      "loss": 0.0239,
      "step": 755
    },
    {
      "epoch": 1.1811559009227781,
      "grad_norm": 0.0498046875,
      "learning_rate": 0.0001,
      "loss": 0.0202,
      "step": 760
    },
    {
      "epoch": 1.188926663428849,
      "grad_norm": 0.0654296875,
      "learning_rate": 0.0001,
      "loss": 0.0296,
      "step": 765
    },
    {
      "epoch": 1.19669742593492,
      "grad_norm": 0.0927734375,
      "learning_rate": 0.0001,
      "loss": 0.0632,
      "step": 770
    },
    {
      "epoch": 1.2044681884409907,
      "grad_norm": 0.06591796875,
      "learning_rate": 0.0001,
      "loss": 0.0675,
      "step": 775
    },
    {
      "epoch": 1.2122389509470617,
      "grad_norm": 0.11962890625,
      "learning_rate": 0.0001,
      "loss": 0.0772,
      "step": 780
    },
    {
      "epoch": 1.2200097134531327,
      "grad_norm": 0.08935546875,
      "learning_rate": 0.0001,
      "loss": 0.0628,
      "step": 785
    },
    {
      "epoch": 1.2277804759592035,
      "grad_norm": 0.09326171875,
      "learning_rate": 0.0001,
      "loss": 0.0662,
      "step": 790
    },
    {
      "epoch": 1.2355512384652745,
      "grad_norm": 0.05078125,
      "learning_rate": 0.0001,
      "loss": 0.0962,
      "step": 795
    },
    {
      "epoch": 1.2433220009713453,
      "grad_norm": 0.0302734375,
      "learning_rate": 0.0001,
      "loss": 0.0606,
      "step": 800
    },
    {
      "epoch": 1.2510927634774163,
      "grad_norm": 0.0361328125,
      "learning_rate": 0.0001,
      "loss": 0.0119,
      "step": 805
    },
    {
      "epoch": 1.258863525983487,
      "grad_norm": 0.03515625,
      "learning_rate": 0.0001,
      "loss": 0.0156,
      "step": 810
    },
    {
      "epoch": 1.266634288489558,
      "grad_norm": 0.05810546875,
      "learning_rate": 0.0001,
      "loss": 0.0393,
      "step": 815
    },
    {
      "epoch": 1.2744050509956288,
      "grad_norm": 0.1328125,
      "learning_rate": 0.0001,
      "loss": 0.1104,
      "step": 820
    },
    {
      "epoch": 1.2821758135016998,
      "grad_norm": 0.09130859375,
      "learning_rate": 0.0001,
      "loss": 0.0592,
      "step": 825
    },
    {
      "epoch": 1.2899465760077709,
      "grad_norm": 0.076171875,
      "learning_rate": 0.0001,
      "loss": 0.0604,
      "step": 830
    },
    {
      "epoch": 1.2977173385138416,
      "grad_norm": 0.11865234375,
      "learning_rate": 0.0001,
      "loss": 0.0507,
      "step": 835
    },
    {
      "epoch": 1.3054881010199126,
      "grad_norm": 0.1337890625,
      "learning_rate": 0.0001,
      "loss": 0.079,
      "step": 840
    },
    {
      "epoch": 1.3132588635259834,
      "grad_norm": 0.06396484375,
      "learning_rate": 0.0001,
      "loss": 0.1041,
      "step": 845
    },
    {
      "epoch": 1.3210296260320544,
      "grad_norm": 0.06982421875,
      "learning_rate": 0.0001,
      "loss": 0.0751,
      "step": 850
    },
    {
      "epoch": 1.3288003885381254,
      "grad_norm": 0.044921875,
      "learning_rate": 0.0001,
      "loss": 0.0338,
      "step": 855
    },
    {
      "epoch": 1.3365711510441962,
      "grad_norm": 0.0546875,
      "learning_rate": 0.0001,
      "loss": 0.0144,
      "step": 860
    },
    {
      "epoch": 1.344341913550267,
      "grad_norm": 0.07080078125,
      "learning_rate": 0.0001,
      "loss": 0.0359,
      "step": 865
    },
    {
      "epoch": 1.352112676056338,
      "grad_norm": 0.08203125,
      "learning_rate": 0.0001,
      "loss": 0.0498,
      "step": 870
    },
    {
      "epoch": 1.359883438562409,
      "grad_norm": 0.08935546875,
      "learning_rate": 0.0001,
      "loss": 0.0422,
      "step": 875
    },
    {
      "epoch": 1.3676542010684798,
      "grad_norm": 0.09326171875,
      "learning_rate": 0.0001,
      "loss": 0.046,
      "step": 880
    },
    {
      "epoch": 1.3754249635745508,
      "grad_norm": 0.146484375,
      "learning_rate": 0.0001,
      "loss": 0.059,
      "step": 885
    },
    {
      "epoch": 1.3831957260806216,
      "grad_norm": 0.1962890625,
      "learning_rate": 0.0001,
      "loss": 0.0545,
      "step": 890
    },
    {
      "epoch": 1.3909664885866926,
      "grad_norm": 0.06201171875,
      "learning_rate": 0.0001,
      "loss": 0.117,
      "step": 895
    },
    {
      "epoch": 1.3987372510927636,
      "grad_norm": 0.03857421875,
      "learning_rate": 0.0001,
      "loss": 0.0588,
      "step": 900
    },
    {
      "epoch": 1.4065080135988344,
      "grad_norm": 0.0576171875,
      "learning_rate": 0.0001,
      "loss": 0.0243,
      "step": 905
    },
    {
      "epoch": 1.4142787761049052,
      "grad_norm": 0.064453125,
      "learning_rate": 0.0001,
      "loss": 0.0204,
      "step": 910
    },
    {
      "epoch": 1.4220495386109762,
      "grad_norm": 0.040283203125,
      "learning_rate": 0.0001,
      "loss": 0.0323,
      "step": 915
    },
    {
      "epoch": 1.4298203011170472,
      "grad_norm": 0.08056640625,
      "learning_rate": 0.0001,
      "loss": 0.0851,
      "step": 920
    },
    {
      "epoch": 1.437591063623118,
      "grad_norm": 0.078125,
      "learning_rate": 0.0001,
      "loss": 0.0821,
      "step": 925
    },
    {
      "epoch": 1.445361826129189,
      "grad_norm": 0.08837890625,
      "learning_rate": 0.0001,
      "loss": 0.0597,
      "step": 930
    },
    {
      "epoch": 1.4531325886352597,
      "grad_norm": 0.1201171875,
      "learning_rate": 0.0001,
      "loss": 0.0617,
      "step": 935
    },
    {
      "epoch": 1.4609033511413307,
      "grad_norm": 0.134765625,
      "learning_rate": 0.0001,
      "loss": 0.047,
      "step": 940
    },
    {
      "epoch": 1.4686741136474017,
      "grad_norm": 0.06494140625,
      "learning_rate": 0.0001,
      "loss": 0.0853,
      "step": 945
    },
    {
      "epoch": 1.4764448761534725,
      "grad_norm": 0.059326171875,
      "learning_rate": 0.0001,
      "loss": 0.0803,
      "step": 950
    },
    {
      "epoch": 1.4842156386595435,
      "grad_norm": 0.058837890625,
      "learning_rate": 0.0001,
      "loss": 0.0302,
      "step": 955
    },
    {
      "epoch": 1.4919864011656143,
      "grad_norm": 0.0284423828125,
      "learning_rate": 0.0001,
      "loss": 0.0173,
      "step": 960
    },
    {
      "epoch": 1.4997571636716853,
      "grad_norm": 0.054443359375,
      "learning_rate": 0.0001,
      "loss": 0.0262,
      "step": 965
    },
    {
      "epoch": 1.5075279261777563,
      "grad_norm": 0.06396484375,
      "learning_rate": 0.0001,
      "loss": 0.0533,
      "step": 970
    },
    {
      "epoch": 1.515298688683827,
      "grad_norm": 0.039794921875,
      "learning_rate": 0.0001,
      "loss": 0.0651,
      "step": 975
    },
    {
      "epoch": 1.523069451189898,
      "grad_norm": 0.0791015625,
      "learning_rate": 0.0001,
      "loss": 0.0682,
      "step": 980
    },
    {
      "epoch": 1.530840213695969,
      "grad_norm": 0.1552734375,
      "learning_rate": 0.0001,
      "loss": 0.0532,
      "step": 985
    },
    {
      "epoch": 1.53861097620204,
      "grad_norm": 0.1259765625,
      "learning_rate": 0.0001,
      "loss": 0.0736,
      "step": 990
    },
    {
      "epoch": 1.5463817387081107,
      "grad_norm": 0.0693359375,
      "learning_rate": 0.0001,
      "loss": 0.0748,
      "step": 995
    },
    {
      "epoch": 1.5541525012141817,
      "grad_norm": 0.08056640625,
      "learning_rate": 0.0001,
      "loss": 0.0824,
      "step": 1000
    },
    {
      "epoch": 1.5619232637202525,
      "grad_norm": 0.016357421875,
      "learning_rate": 0.0001,
      "loss": 0.0252,
      "step": 1005
    },
    {
      "epoch": 1.5696940262263235,
      "grad_norm": 0.025390625,
      "learning_rate": 0.0001,
      "loss": 0.0179,
      "step": 1010
    },
    {
      "epoch": 1.5774647887323945,
      "grad_norm": 0.06005859375,
      "learning_rate": 0.0001,
      "loss": 0.0342,
      "step": 1015
    },
    {
      "epoch": 1.5852355512384653,
      "grad_norm": 0.134765625,
      "learning_rate": 0.0001,
      "loss": 0.0657,
      "step": 1020
    },
    {
      "epoch": 1.593006313744536,
      "grad_norm": 0.08203125,
      "learning_rate": 0.0001,
      "loss": 0.0668,
      "step": 1025
    },
    {
      "epoch": 1.600777076250607,
      "grad_norm": 0.12060546875,
      "learning_rate": 0.0001,
      "loss": 0.0769,
      "step": 1030
    },
    {
      "epoch": 1.608547838756678,
      "grad_norm": 0.08837890625,
      "learning_rate": 0.0001,
      "loss": 0.0369,
      "step": 1035
    },
    {
      "epoch": 1.616318601262749,
      "grad_norm": 0.125,
      "learning_rate": 0.0001,
      "loss": 0.0803,
      "step": 1040
    },
    {
      "epoch": 1.6240893637688198,
      "grad_norm": 0.046875,
      "learning_rate": 0.0001,
      "loss": 0.083,
      "step": 1045
    },
    {
      "epoch": 1.6318601262748906,
      "grad_norm": 0.052001953125,
      "learning_rate": 0.0001,
      "loss": 0.0768,
      "step": 1050
    },
    {
      "epoch": 1.6396308887809616,
      "grad_norm": 0.03173828125,
      "learning_rate": 0.0001,
      "loss": 0.0226,
      "step": 1055
    },
    {
      "epoch": 1.6474016512870326,
      "grad_norm": 0.037353515625,
      "learning_rate": 0.0001,
      "loss": 0.0145,
      "step": 1060
    },
    {
      "epoch": 1.6551724137931034,
      "grad_norm": 0.054443359375,
      "learning_rate": 0.0001,
      "loss": 0.022,
      "step": 1065
    },
    {
      "epoch": 1.6629431762991742,
      "grad_norm": 0.0908203125,
      "learning_rate": 0.0001,
      "loss": 0.0627,
      "step": 1070
    },
    {
      "epoch": 1.6707139388052452,
      "grad_norm": 0.0986328125,
      "learning_rate": 0.0001,
      "loss": 0.0709,
      "step": 1075
    },
    {
      "epoch": 1.6784847013113162,
      "grad_norm": 0.08984375,
      "learning_rate": 0.0001,
      "loss": 0.0613,
      "step": 1080
    },
    {
      "epoch": 1.6862554638173872,
      "grad_norm": 0.1689453125,
      "learning_rate": 0.0001,
      "loss": 0.0543,
      "step": 1085
    },
    {
      "epoch": 1.694026226323458,
      "grad_norm": 0.10302734375,
      "learning_rate": 0.0001,
      "loss": 0.064,
      "step": 1090
    },
    {
      "epoch": 1.7017969888295288,
      "grad_norm": 0.055908203125,
      "learning_rate": 0.0001,
      "loss": 0.0977,
      "step": 1095
    },
    {
      "epoch": 1.7095677513355998,
      "grad_norm": 0.062255859375,
      "learning_rate": 0.0001,
      "loss": 0.0519,
      "step": 1100
    },
    {
      "epoch": 1.7173385138416708,
      "grad_norm": 0.02734375,
      "learning_rate": 0.0001,
      "loss": 0.0124,
      "step": 1105
    },
    {
      "epoch": 1.7251092763477416,
      "grad_norm": 0.05224609375,
      "learning_rate": 0.0001,
      "loss": 0.0103,
      "step": 1110
    },
    {
      "epoch": 1.7328800388538124,
      "grad_norm": 0.09375,
      "learning_rate": 0.0001,
      "loss": 0.0276,
      "step": 1115
    },
    {
      "epoch": 1.7406508013598834,
      "grad_norm": 0.103515625,
      "learning_rate": 0.0001,
      "loss": 0.045,
      "step": 1120
    },
    {
      "epoch": 1.7484215638659544,
      "grad_norm": 0.1455078125,
      "learning_rate": 0.0001,
      "loss": 0.0708,
      "step": 1125
    },
    {
      "epoch": 1.7561923263720254,
      "grad_norm": 0.11962890625,
      "learning_rate": 0.0001,
      "loss": 0.0618,
      "step": 1130
    },
    {
      "epoch": 1.7639630888780962,
      "grad_norm": 0.1318359375,
      "learning_rate": 0.0001,
      "loss": 0.0687,
      "step": 1135
    },
    {
      "epoch": 1.771733851384167,
      "grad_norm": 0.150390625,
      "learning_rate": 0.0001,
      "loss": 0.0451,
      "step": 1140
    },
    {
      "epoch": 1.779504613890238,
      "grad_norm": 0.058349609375,
      "learning_rate": 0.0001,
      "loss": 0.1109,
      "step": 1145
    },
    {
      "epoch": 1.787275376396309,
      "grad_norm": 0.032958984375,
      "learning_rate": 0.0001,
      "loss": 0.0728,
      "step": 1150
    },
    {
      "epoch": 1.79504613890238,
      "grad_norm": 0.044921875,
      "learning_rate": 0.0001,
      "loss": 0.0254,
      "step": 1155
    },
    {
      "epoch": 1.8028169014084507,
      "grad_norm": 0.0458984375,
      "learning_rate": 0.0001,
      "loss": 0.0137,
      "step": 1160
    },
    {
      "epoch": 1.8105876639145215,
      "grad_norm": 0.042724609375,
      "learning_rate": 0.0001,
      "loss": 0.0242,
      "step": 1165
    },
    {
      "epoch": 1.8183584264205925,
      "grad_norm": 0.11328125,
      "learning_rate": 0.0001,
      "loss": 0.0717,
      "step": 1170
    },
    {
      "epoch": 1.8261291889266635,
      "grad_norm": 0.10986328125,
      "learning_rate": 0.0001,
      "loss": 0.0648,
      "step": 1175
    },
    {
      "epoch": 1.8338999514327343,
      "grad_norm": 0.1318359375,
      "learning_rate": 0.0001,
      "loss": 0.0778,
      "step": 1180
    },
    {
      "epoch": 1.841670713938805,
      "grad_norm": 0.08154296875,
      "learning_rate": 0.0001,
      "loss": 0.0435,
      "step": 1185
    },
    {
      "epoch": 1.849441476444876,
      "grad_norm": 0.1513671875,
      "learning_rate": 0.0001,
      "loss": 0.0714,
      "step": 1190
    },
    {
      "epoch": 1.8572122389509471,
      "grad_norm": 0.0576171875,
      "learning_rate": 0.0001,
      "loss": 0.0979,
      "step": 1195
    },
    {
      "epoch": 1.8649830014570181,
      "grad_norm": 0.062255859375,
      "learning_rate": 0.0001,
      "loss": 0.0785,
      "step": 1200
    },
    {
      "epoch": 1.872753763963089,
      "grad_norm": 0.057861328125,
      "learning_rate": 0.0001,
      "loss": 0.0183,
      "step": 1205
    },
    {
      "epoch": 1.8805245264691597,
      "grad_norm": 0.06787109375,
      "learning_rate": 0.0001,
      "loss": 0.0167,
      "step": 1210
    },
    {
      "epoch": 1.8882952889752307,
      "grad_norm": 0.119140625,
      "learning_rate": 0.0001,
      "loss": 0.0269,
      "step": 1215
    },
    {
      "epoch": 1.8960660514813017,
      "grad_norm": 0.06689453125,
      "learning_rate": 0.0001,
      "loss": 0.0532,
      "step": 1220
    },
    {
      "epoch": 1.9038368139873725,
      "grad_norm": 0.130859375,
      "learning_rate": 0.0001,
      "loss": 0.0654,
      "step": 1225
    },
    {
      "epoch": 1.9116075764934433,
      "grad_norm": 0.1064453125,
      "learning_rate": 0.0001,
      "loss": 0.0592,
      "step": 1230
    },
    {
      "epoch": 1.9193783389995143,
      "grad_norm": 0.16015625,
      "learning_rate": 0.0001,
      "loss": 0.0627,
      "step": 1235
    },
    {
      "epoch": 1.9271491015055853,
      "grad_norm": 0.1650390625,
      "learning_rate": 0.0001,
      "loss": 0.0654,
      "step": 1240
    },
    {
      "epoch": 1.9349198640116563,
      "grad_norm": 0.052734375,
      "learning_rate": 0.0001,
      "loss": 0.0763,
      "step": 1245
    },
    {
      "epoch": 1.942690626517727,
      "grad_norm": 0.04296875,
      "learning_rate": 0.0001,
      "loss": 0.0551,
      "step": 1250
    },
    {
      "epoch": 1.9504613890237978,
      "grad_norm": 0.0244140625,
      "learning_rate": 0.0001,
      "loss": 0.0299,
      "step": 1255
    },
    {
      "epoch": 1.9582321515298688,
      "grad_norm": 0.06591796875,
      "learning_rate": 0.0001,
      "loss": 0.0362,
      "step": 1260
    },
    {
      "epoch": 1.9660029140359399,
      "grad_norm": 0.06494140625,
      "learning_rate": 0.0001,
      "loss": 0.0285,
      "step": 1265
    },
    {
      "epoch": 1.9737736765420106,
      "grad_norm": 0.10107421875,
      "learning_rate": 0.0001,
      "loss": 0.0653,
      "step": 1270
    },
    {
      "epoch": 1.9815444390480816,
      "grad_norm": 0.0869140625,
      "learning_rate": 0.0001,
      "loss": 0.0591,
      "step": 1275
    },
    {
      "epoch": 1.9893152015541524,
      "grad_norm": 0.11083984375,
      "learning_rate": 0.0001,
      "loss": 0.0599,
      "step": 1280
    },
    {
      "epoch": 1.9970859640602234,
      "grad_norm": 0.099609375,
      "learning_rate": 0.0001,
      "loss": 0.0703,
      "step": 1285
    },
    {
      "epoch": 2.0048567265662944,
      "grad_norm": 0.0595703125,
      "learning_rate": 0.0001,
      "loss": 0.0841,
      "step": 1290
    },
    {
      "epoch": 2.0126274890723654,
      "grad_norm": 0.04345703125,
      "learning_rate": 0.0001,
      "loss": 0.0243,
      "step": 1295
    },
    {
      "epoch": 2.020398251578436,
      "grad_norm": 0.0260009765625,
      "learning_rate": 0.0001,
      "loss": 0.0117,
      "step": 1300
    },
    {
      "epoch": 2.028169014084507,
      "grad_norm": 0.08056640625,
      "learning_rate": 0.0001,
      "loss": 0.0329,
      "step": 1305
    },
    {
      "epoch": 2.035939776590578,
      "grad_norm": 0.0703125,
      "learning_rate": 0.0001,
      "loss": 0.0344,
      "step": 1310
    },
    {
      "epoch": 2.043710539096649,
      "grad_norm": 0.1328125,
      "learning_rate": 0.0001,
      "loss": 0.0517,
      "step": 1315
    },
    {
      "epoch": 2.0514813016027196,
      "grad_norm": 0.058349609375,
      "learning_rate": 0.0001,
      "loss": 0.0209,
      "step": 1320
    },
    {
      "epoch": 2.0592520641087906,
      "grad_norm": 0.103515625,
      "learning_rate": 0.0001,
      "loss": 0.0464,
      "step": 1325
    },
    {
      "epoch": 2.0670228266148616,
      "grad_norm": 0.10302734375,
      "learning_rate": 0.0001,
      "loss": 0.0241,
      "step": 1330
    },
    {
      "epoch": 2.0747935891209326,
      "grad_norm": 0.043701171875,
      "learning_rate": 0.0001,
      "loss": 0.0356,
      "step": 1335
    },
    {
      "epoch": 2.0825643516270036,
      "grad_norm": 0.107421875,
      "learning_rate": 0.0001,
      "loss": 0.068,
      "step": 1340
    },
    {
      "epoch": 2.090335114133074,
      "grad_norm": 0.03515625,
      "learning_rate": 0.0001,
      "loss": 0.0273,
      "step": 1345
    },
    {
      "epoch": 2.098105876639145,
      "grad_norm": 0.0157470703125,
      "learning_rate": 0.0001,
      "loss": 0.0066,
      "step": 1350
    },
    {
      "epoch": 2.105876639145216,
      "grad_norm": 0.05419921875,
      "learning_rate": 0.0001,
      "loss": 0.0076,
      "step": 1355
    },
    {
      "epoch": 2.113647401651287,
      "grad_norm": 0.0595703125,
      "learning_rate": 0.0001,
      "loss": 0.0128,
      "step": 1360
    },
    {
      "epoch": 2.1214181641573577,
      "grad_norm": 0.10595703125,
      "learning_rate": 0.0001,
      "loss": 0.0345,
      "step": 1365
    },
    {
      "epoch": 2.1291889266634287,
      "grad_norm": 0.087890625,
      "learning_rate": 0.0001,
      "loss": 0.0319,
      "step": 1370
    },
    {
      "epoch": 2.1369596891694997,
      "grad_norm": 0.0947265625,
      "learning_rate": 0.0001,
      "loss": 0.0345,
      "step": 1375
    },
    {
      "epoch": 2.1447304516755707,
      "grad_norm": 0.08447265625,
      "learning_rate": 0.0001,
      "loss": 0.0193,
      "step": 1380
    },
    {
      "epoch": 2.1525012141816418,
      "grad_norm": 0.11572265625,
      "learning_rate": 0.0001,
      "loss": 0.0348,
      "step": 1385
    },
    {
      "epoch": 2.1602719766877123,
      "grad_norm": 0.083984375,
      "learning_rate": 0.0001,
      "loss": 0.0704,
      "step": 1390
    },
    {
      "epoch": 2.1680427391937833,
      "grad_norm": 0.0185546875,
      "learning_rate": 0.0001,
      "loss": 0.0289,
      "step": 1395
    },
    {
      "epoch": 2.1758135016998543,
      "grad_norm": 0.03466796875,
      "learning_rate": 0.0001,
      "loss": 0.0092,
      "step": 1400
    },
    {
      "epoch": 2.1835842642059253,
      "grad_norm": 0.0400390625,
      "learning_rate": 0.0001,
      "loss": 0.0246,
      "step": 1405
    },
    {
      "epoch": 2.191355026711996,
      "grad_norm": 0.0257568359375,
      "learning_rate": 0.0001,
      "loss": 0.0137,
      "step": 1410
    },
    {
      "epoch": 2.199125789218067,
      "grad_norm": 0.1201171875,
      "learning_rate": 0.0001,
      "loss": 0.0364,
      "step": 1415
    },
    {
      "epoch": 2.206896551724138,
      "grad_norm": 0.1240234375,
      "learning_rate": 0.0001,
      "loss": 0.0496,
      "step": 1420
    },
    {
      "epoch": 2.214667314230209,
      "grad_norm": 0.06005859375,
      "learning_rate": 0.0001,
      "loss": 0.0228,
      "step": 1425
    },
    {
      "epoch": 2.22243807673628,
      "grad_norm": 0.08251953125,
      "learning_rate": 0.0001,
      "loss": 0.0163,
      "step": 1430
    },
    {
      "epoch": 2.2302088392423505,
      "grad_norm": 0.10693359375,
      "learning_rate": 0.0001,
      "loss": 0.0283,
      "step": 1435
    },
    {
      "epoch": 2.2379796017484215,
      "grad_norm": 0.078125,
      "learning_rate": 0.0001,
      "loss": 0.0626,
      "step": 1440
    },
    {
      "epoch": 2.2457503642544925,
      "grad_norm": 0.035888671875,
      "learning_rate": 0.0001,
      "loss": 0.0391,
      "step": 1445
    },
    {
      "epoch": 2.2535211267605635,
      "grad_norm": 0.034912109375,
      "learning_rate": 0.0001,
      "loss": 0.0111,
      "step": 1450
    },
    {
      "epoch": 2.2612918892666345,
      "grad_norm": 0.038818359375,
      "learning_rate": 0.0001,
      "loss": 0.0083,
      "step": 1455
    },
    {
      "epoch": 2.269062651772705,
      "grad_norm": 0.061279296875,
      "learning_rate": 0.0001,
      "loss": 0.0177,
      "step": 1460
    },
    {
      "epoch": 2.276833414278776,
      "grad_norm": 0.057373046875,
      "learning_rate": 0.0001,
      "loss": 0.0448,
      "step": 1465
    },
    {
      "epoch": 2.284604176784847,
      "grad_norm": 0.109375,
      "learning_rate": 0.0001,
      "loss": 0.0218,
      "step": 1470
    },
    {
      "epoch": 2.292374939290918,
      "grad_norm": 0.0771484375,
      "learning_rate": 0.0001,
      "loss": 0.0324,
      "step": 1475
    },
    {
      "epoch": 2.300145701796989,
      "grad_norm": 0.1201171875,
      "learning_rate": 0.0001,
      "loss": 0.0296,
      "step": 1480
    },
    {
      "epoch": 2.3079164643030596,
      "grad_norm": 0.11279296875,
      "learning_rate": 0.0001,
      "loss": 0.0303,
      "step": 1485
    },
    {
      "epoch": 2.3156872268091306,
      "grad_norm": 0.08740234375,
      "learning_rate": 0.0001,
      "loss": 0.0771,
      "step": 1490
    },
    {
      "epoch": 2.3234579893152016,
      "grad_norm": 0.03466796875,
      "learning_rate": 0.0001,
      "loss": 0.018,
      "step": 1495
    },
    {
      "epoch": 2.3312287518212726,
      "grad_norm": 0.03662109375,
      "learning_rate": 0.0001,
      "loss": 0.0117,
      "step": 1500
    },
    {
      "epoch": 2.338999514327343,
      "grad_norm": 0.052001953125,
      "learning_rate": 0.0001,
      "loss": 0.0169,
      "step": 1505
    },
    {
      "epoch": 2.346770276833414,
      "grad_norm": 0.068359375,
      "learning_rate": 0.0001,
      "loss": 0.0227,
      "step": 1510
    },
    {
      "epoch": 2.354541039339485,
      "grad_norm": 0.07861328125,
      "learning_rate": 0.0001,
      "loss": 0.034,
      "step": 1515
    },
    {
      "epoch": 2.3623118018455562,
      "grad_norm": 0.1142578125,
      "learning_rate": 0.0001,
      "loss": 0.031,
      "step": 1520
    },
    {
      "epoch": 2.370082564351627,
      "grad_norm": 0.08544921875,
      "learning_rate": 0.0001,
      "loss": 0.0278,
      "step": 1525
    },
    {
      "epoch": 2.377853326857698,
      "grad_norm": 0.1318359375,
      "learning_rate": 0.0001,
      "loss": 0.0325,
      "step": 1530
    },
    {
      "epoch": 2.385624089363769,
      "grad_norm": 0.1455078125,
      "learning_rate": 0.0001,
      "loss": 0.0277,
      "step": 1535
    },
    {
      "epoch": 2.39339485186984,
      "grad_norm": 0.0634765625,
      "learning_rate": 0.0001,
      "loss": 0.0537,
      "step": 1540
    },
    {
      "epoch": 2.401165614375911,
      "grad_norm": 0.037841796875,
      "learning_rate": 0.0001,
      "loss": 0.04,
      "step": 1545
    },
    {
      "epoch": 2.4089363768819814,
      "grad_norm": 0.01531982421875,
      "learning_rate": 0.0001,
      "loss": 0.0078,
      "step": 1550
    },
    {
      "epoch": 2.4167071393880524,
      "grad_norm": 0.024169921875,
      "learning_rate": 0.0001,
      "loss": 0.0086,
      "step": 1555
    },
    {
      "epoch": 2.4244779018941234,
      "grad_norm": 0.047119140625,
      "learning_rate": 0.0001,
      "loss": 0.0212,
      "step": 1560
    },
    {
      "epoch": 2.4322486644001944,
      "grad_norm": 0.126953125,
      "learning_rate": 0.0001,
      "loss": 0.0399,
      "step": 1565
    },
    {
      "epoch": 2.4400194269062654,
      "grad_norm": 0.10009765625,
      "learning_rate": 0.0001,
      "loss": 0.0285,
      "step": 1570
    },
    {
      "epoch": 2.447790189412336,
      "grad_norm": 0.1279296875,
      "learning_rate": 0.0001,
      "loss": 0.0261,
      "step": 1575
    },
    {
      "epoch": 2.455560951918407,
      "grad_norm": 0.0986328125,
      "learning_rate": 0.0001,
      "loss": 0.0237,
      "step": 1580
    },
    {
      "epoch": 2.463331714424478,
      "grad_norm": 0.1943359375,
      "learning_rate": 0.0001,
      "loss": 0.0401,
      "step": 1585
    },
    {
      "epoch": 2.471102476930549,
      "grad_norm": 0.08056640625,
      "learning_rate": 0.0001,
      "loss": 0.058,
      "step": 1590
    },
    {
      "epoch": 2.4788732394366195,
      "grad_norm": 0.0478515625,
      "learning_rate": 0.0001,
      "loss": 0.0378,
      "step": 1595
    },
    {
      "epoch": 2.4866440019426905,
      "grad_norm": 0.03662109375,
      "learning_rate": 0.0001,
      "loss": 0.0066,
      "step": 1600
    },
    {
      "epoch": 2.4944147644487615,
      "grad_norm": 0.0478515625,
      "learning_rate": 0.0001,
      "loss": 0.0128,
      "step": 1605
    },
    {
      "epoch": 2.5021855269548325,
      "grad_norm": 0.10205078125,
      "learning_rate": 0.0001,
      "loss": 0.0217,
      "step": 1610
    },
    {
      "epoch": 2.509956289460903,
      "grad_norm": 0.060302734375,
      "learning_rate": 0.0001,
      "loss": 0.0329,
      "step": 1615
    },
    {
      "epoch": 2.517727051966974,
      "grad_norm": 0.125,
      "learning_rate": 0.0001,
      "loss": 0.0318,
      "step": 1620
    },
    {
      "epoch": 2.525497814473045,
      "grad_norm": 0.21875,
      "learning_rate": 0.0001,
      "loss": 0.0288,
      "step": 1625
    },
    {
      "epoch": 2.533268576979116,
      "grad_norm": 0.0556640625,
      "learning_rate": 0.0001,
      "loss": 0.0258,
      "step": 1630
    },
    {
      "epoch": 2.541039339485187,
      "grad_norm": 0.08740234375,
      "learning_rate": 0.0001,
      "loss": 0.03,
      "step": 1635
    },
    {
      "epoch": 2.5488101019912577,
      "grad_norm": 0.09521484375,
      "learning_rate": 0.0001,
      "loss": 0.0755,
      "step": 1640
    },
    {
      "epoch": 2.5565808644973287,
      "grad_norm": 0.05810546875,
      "learning_rate": 0.0001,
      "loss": 0.029,
      "step": 1645
    },
    {
      "epoch": 2.5643516270033997,
      "grad_norm": 0.0400390625,
      "learning_rate": 0.0001,
      "loss": 0.0105,
      "step": 1650
    },
    {
      "epoch": 2.5721223895094707,
      "grad_norm": 0.062255859375,
      "learning_rate": 0.0001,
      "loss": 0.0113,
      "step": 1655
    },
    {
      "epoch": 2.5798931520155417,
      "grad_norm": 0.05810546875,
      "learning_rate": 0.0001,
      "loss": 0.0124,
      "step": 1660
    },
    {
      "epoch": 2.5876639145216123,
      "grad_norm": 0.064453125,
      "learning_rate": 0.0001,
      "loss": 0.0364,
      "step": 1665
    },
    {
      "epoch": 2.5954346770276833,
      "grad_norm": 0.10791015625,
      "learning_rate": 0.0001,
      "loss": 0.0363,
      "step": 1670
    },
    {
      "epoch": 2.6032054395337543,
      "grad_norm": 0.109375,
      "learning_rate": 0.0001,
      "loss": 0.0296,
      "step": 1675
    },
    {
      "epoch": 2.6109762020398253,
      "grad_norm": 0.11572265625,
      "learning_rate": 0.0001,
      "loss": 0.0218,
      "step": 1680
    },
    {
      "epoch": 2.6187469645458963,
      "grad_norm": 0.0888671875,
      "learning_rate": 0.0001,
      "loss": 0.0346,
      "step": 1685
    },
    {
      "epoch": 2.626517727051967,
      "grad_norm": 0.062255859375,
      "learning_rate": 0.0001,
      "loss": 0.0738,
      "step": 1690
    },
    {
      "epoch": 2.634288489558038,
      "grad_norm": 0.037841796875,
      "learning_rate": 0.0001,
      "loss": 0.0247,
      "step": 1695
    },
    {
      "epoch": 2.642059252064109,
      "grad_norm": 0.00836181640625,
      "learning_rate": 0.0001,
      "loss": 0.0104,
      "step": 1700
    },
    {
      "epoch": 2.64983001457018,
      "grad_norm": 0.06591796875,
      "learning_rate": 0.0001,
      "loss": 0.017,
      "step": 1705
    },
    {
      "epoch": 2.657600777076251,
      "grad_norm": 0.07421875,
      "learning_rate": 0.0001,
      "loss": 0.029,
      "step": 1710
    },
    {
      "epoch": 2.6653715395823214,
      "grad_norm": 0.0693359375,
      "learning_rate": 0.0001,
      "loss": 0.0319,
      "step": 1715
    },
    {
      "epoch": 2.6731423020883924,
      "grad_norm": 0.0498046875,
      "learning_rate": 0.0001,
      "loss": 0.0267,
      "step": 1720
    },
    {
      "epoch": 2.6809130645944634,
      "grad_norm": 0.1650390625,
      "learning_rate": 0.0001,
      "loss": 0.0352,
      "step": 1725
    },
    {
      "epoch": 2.688683827100534,
      "grad_norm": 0.154296875,
      "learning_rate": 0.0001,
      "loss": 0.0296,
      "step": 1730
    },
    {
      "epoch": 2.696454589606605,
      "grad_norm": 0.13671875,
      "learning_rate": 0.0001,
      "loss": 0.0476,
      "step": 1735
    },
    {
      "epoch": 2.704225352112676,
      "grad_norm": 0.0712890625,
      "learning_rate": 0.0001,
      "loss": 0.0693,
      "step": 1740
    },
    {
      "epoch": 2.711996114618747,
      "grad_norm": 0.01177978515625,
      "learning_rate": 0.0001,
      "loss": 0.0263,
      "step": 1745
    },
    {
      "epoch": 2.719766877124818,
      "grad_norm": 0.048095703125,
      "learning_rate": 0.0001,
      "loss": 0.0117,
      "step": 1750
    },
    {
      "epoch": 2.7275376396308886,
      "grad_norm": 0.0390625,
      "learning_rate": 0.0001,
      "loss": 0.0121,
      "step": 1755
    },
    {
      "epoch": 2.7353084021369596,
      "grad_norm": 0.07861328125,
      "learning_rate": 0.0001,
      "loss": 0.0207,
      "step": 1760
    },
    {
      "epoch": 2.7430791646430306,
      "grad_norm": 0.11083984375,
      "learning_rate": 0.0001,
      "loss": 0.0459,
      "step": 1765
    },
    {
      "epoch": 2.7508499271491016,
      "grad_norm": 0.0576171875,
      "learning_rate": 0.0001,
      "loss": 0.0356,
      "step": 1770
    },
    {
      "epoch": 2.7586206896551726,
      "grad_norm": 0.08740234375,
      "learning_rate": 0.0001,
      "loss": 0.0385,
      "step": 1775
    },
    {
      "epoch": 2.766391452161243,
      "grad_norm": 0.0888671875,
      "learning_rate": 0.0001,
      "loss": 0.0476,
      "step": 1780
    },
    {
      "epoch": 2.774162214667314,
      "grad_norm": 0.1337890625,
      "learning_rate": 0.0001,
      "loss": 0.034,
      "step": 1785
    },
    {
      "epoch": 2.781932977173385,
      "grad_norm": 0.06884765625,
      "learning_rate": 0.0001,
      "loss": 0.0657,
      "step": 1790
    },
    {
      "epoch": 2.789703739679456,
      "grad_norm": 0.07080078125,
      "learning_rate": 0.0001,
      "loss": 0.035,
      "step": 1795
    },
    {
      "epoch": 2.797474502185527,
      "grad_norm": 0.042724609375,
      "learning_rate": 0.0001,
      "loss": 0.0238,
      "step": 1800
    },
    {
      "epoch": 2.8052452646915977,
      "grad_norm": 0.0673828125,
      "learning_rate": 0.0001,
      "loss": 0.0119,
      "step": 1805
    },
    {
      "epoch": 2.8130160271976687,
      "grad_norm": 0.0703125,
      "learning_rate": 0.0001,
      "loss": 0.0143,
      "step": 1810
    },
    {
      "epoch": 2.8207867897037397,
      "grad_norm": 0.09716796875,
      "learning_rate": 0.0001,
      "loss": 0.0368,
      "step": 1815
    },
    {
      "epoch": 2.8285575522098103,
      "grad_norm": 0.0947265625,
      "learning_rate": 0.0001,
      "loss": 0.0289,
      "step": 1820
    },
    {
      "epoch": 2.8363283147158818,
      "grad_norm": 0.09619140625,
      "learning_rate": 0.0001,
      "loss": 0.0301,
      "step": 1825
    },
    {
      "epoch": 2.8440990772219523,
      "grad_norm": 0.12158203125,
      "learning_rate": 0.0001,
      "loss": 0.0344,
      "step": 1830
    },
    {
      "epoch": 2.8518698397280233,
      "grad_norm": 0.09423828125,
      "learning_rate": 0.0001,
      "loss": 0.0246,
      "step": 1835
    },
    {
      "epoch": 2.8596406022340943,
      "grad_norm": 0.062255859375,
      "learning_rate": 0.0001,
      "loss": 0.0867,
      "step": 1840
    },
    {
      "epoch": 2.867411364740165,
      "grad_norm": 0.033203125,
      "learning_rate": 0.0001,
      "loss": 0.0426,
      "step": 1845
    },
    {
      "epoch": 2.875182127246236,
      "grad_norm": 0.05810546875,
      "learning_rate": 0.0001,
      "loss": 0.0329,
      "step": 1850
    },
    {
      "epoch": 2.882952889752307,
      "grad_norm": 0.0341796875,
      "learning_rate": 0.0001,
      "loss": 0.0092,
      "step": 1855
    },
    {
      "epoch": 2.890723652258378,
      "grad_norm": 0.0400390625,
      "learning_rate": 0.0001,
      "loss": 0.0262,
      "step": 1860
    },
    {
      "epoch": 2.898494414764449,
      "grad_norm": 0.095703125,
      "learning_rate": 0.0001,
      "loss": 0.0433,
      "step": 1865
    },
    {
      "epoch": 2.9062651772705195,
      "grad_norm": 0.08349609375,
      "learning_rate": 0.0001,
      "loss": 0.0282,
      "step": 1870
    },
    {
      "epoch": 2.9140359397765905,
      "grad_norm": 0.06396484375,
      "learning_rate": 0.0001,
      "loss": 0.0286,
      "step": 1875
    },
    {
      "epoch": 2.9218067022826615,
      "grad_norm": 0.109375,
      "learning_rate": 0.0001,
      "loss": 0.0278,
      "step": 1880
    },
    {
      "epoch": 2.9295774647887325,
      "grad_norm": 0.2021484375,
      "learning_rate": 0.0001,
      "loss": 0.0216,
      "step": 1885
    },
    {
      "epoch": 2.9373482272948035,
      "grad_norm": 0.05517578125,
      "learning_rate": 0.0001,
      "loss": 0.0828,
      "step": 1890
    },
    {
      "epoch": 2.945118989800874,
      "grad_norm": 0.025390625,
      "learning_rate": 0.0001,
      "loss": 0.0286,
      "step": 1895
    },
    {
      "epoch": 2.952889752306945,
      "grad_norm": 0.01043701171875,
      "learning_rate": 0.0001,
      "loss": 0.0267,
      "step": 1900
    },
    {
      "epoch": 2.960660514813016,
      "grad_norm": 0.06201171875,
      "learning_rate": 0.0001,
      "loss": 0.0149,
      "step": 1905
    },
    {
      "epoch": 2.968431277319087,
      "grad_norm": 0.0615234375,
      "learning_rate": 0.0001,
      "loss": 0.0528,
      "step": 1910
    },
    {
      "epoch": 2.976202039825158,
      "grad_norm": 0.10693359375,
      "learning_rate": 0.0001,
      "loss": 0.0393,
      "step": 1915
    },
    {
      "epoch": 2.9839728023312286,
      "grad_norm": 0.09912109375,
      "learning_rate": 0.0001,
      "loss": 0.0304,
      "step": 1920
    },
    {
      "epoch": 2.9917435648372996,
      "grad_norm": 0.058349609375,
      "learning_rate": 0.0001,
      "loss": 0.0279,
      "step": 1925
    },
    {
      "epoch": 2.9995143273433706,
      "grad_norm": 0.318359375,
      "learning_rate": 0.0001,
      "loss": 0.0327,
      "step": 1930
    },
    {
      "epoch": 3.0072850898494417,
      "grad_norm": 0.058837890625,
      "learning_rate": 0.0001,
      "loss": 0.0456,
      "step": 1935
    },
    {
      "epoch": 3.015055852355512,
      "grad_norm": 0.041748046875,
      "learning_rate": 0.0001,
      "loss": 0.0142,
      "step": 1940
    },
    {
      "epoch": 3.022826614861583,
      "grad_norm": 0.052490234375,
      "learning_rate": 0.0001,
      "loss": 0.0105,
      "step": 1945
    },
    {
      "epoch": 3.030597377367654,
      "grad_norm": 0.072265625,
      "learning_rate": 0.0001,
      "loss": 0.0103,
      "step": 1950
    },
    {
      "epoch": 3.030597377367654,
      "step": 1950,
      "total_flos": 3.358681688807424e+17,
      "train_loss": 0.06525009815127422,
      "train_runtime": 54324.0702,
      "train_samples_per_second": 0.574,
      "train_steps_per_second": 0.036
    }
  ],
  "logging_steps": 5,
  "max_steps": 1950,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 90,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.358681688807424e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}