| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.030597377367654, | |
| "eval_steps": 500, | |
| "global_step": 1950, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.007770762506070908, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 0.0001, | |
| "loss": 1.6268, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.015541525012141816, | |
| "grad_norm": 0.140625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3484, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.023312287518212724, | |
| "grad_norm": 0.11181640625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2158, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.03108305002428363, | |
| "grad_norm": 0.10302734375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1784, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.03885381253035454, | |
| "grad_norm": 0.1328125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2267, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.04662457503642545, | |
| "grad_norm": 0.181640625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.3222, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.054395337542496355, | |
| "grad_norm": 0.255859375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2583, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.06216610004856726, | |
| "grad_norm": 0.369140625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2882, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.06993686255463817, | |
| "grad_norm": 0.275390625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2039, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.07770762506070908, | |
| "grad_norm": 0.2236328125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2636, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.08547838756677999, | |
| "grad_norm": 0.07373046875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.149, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.0932491500728509, | |
| "grad_norm": 0.0390625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0486, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.1010199125789218, | |
| "grad_norm": 0.058349609375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0444, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.10879067508499271, | |
| "grad_norm": 0.043212890625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.034, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.11656143759106362, | |
| "grad_norm": 0.09912109375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0925, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.12433220009713453, | |
| "grad_norm": 0.08837890625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1931, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.13210296260320545, | |
| "grad_norm": 0.1357421875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1508, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.13987372510927634, | |
| "grad_norm": 0.1484375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1954, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.14764448761534726, | |
| "grad_norm": 0.12060546875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1666, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.15541525012141816, | |
| "grad_norm": 0.32421875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.263, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.16318601262748908, | |
| "grad_norm": 0.04736328125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1395, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.17095677513355997, | |
| "grad_norm": 0.04296875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0486, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.1787275376396309, | |
| "grad_norm": 0.031982421875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0418, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.1864983001457018, | |
| "grad_norm": 0.034912109375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0538, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.1942690626517727, | |
| "grad_norm": 0.0927734375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0847, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.2020398251578436, | |
| "grad_norm": 0.1298828125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1045, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.20981058766391453, | |
| "grad_norm": 0.12451171875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1584, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.21758135016998542, | |
| "grad_norm": 0.1396484375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1446, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.22535211267605634, | |
| "grad_norm": 0.1865234375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1932, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.23312287518212724, | |
| "grad_norm": 0.2412109375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2388, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.24089363768819816, | |
| "grad_norm": 0.045166015625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0958, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.24866440019426905, | |
| "grad_norm": 0.04443359375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0522, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.25643516270033995, | |
| "grad_norm": 0.044921875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.039, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.2642059252064109, | |
| "grad_norm": 0.044189453125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0332, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.2719766877124818, | |
| "grad_norm": 0.0673828125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0846, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.2797474502185527, | |
| "grad_norm": 0.09423828125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1465, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.2875182127246236, | |
| "grad_norm": 0.09765625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1395, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.29528897523069453, | |
| "grad_norm": 0.06640625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1369, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.3030597377367654, | |
| "grad_norm": 0.1669921875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1556, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.3108305002428363, | |
| "grad_norm": 0.1669921875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1521, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.3186012627489072, | |
| "grad_norm": 0.0286865234375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1091, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.32637202525497816, | |
| "grad_norm": 0.0262451171875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0511, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.33414278776104905, | |
| "grad_norm": 0.033447265625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.028, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.34191355026711995, | |
| "grad_norm": 0.0218505859375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0297, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.34968431277319084, | |
| "grad_norm": 0.0791015625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0885, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.3574550752792618, | |
| "grad_norm": 0.09423828125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1735, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.3652258377853327, | |
| "grad_norm": 0.162109375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1268, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.3729966002914036, | |
| "grad_norm": 0.1015625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1057, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.38076736279747453, | |
| "grad_norm": 0.212890625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1398, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.3885381253035454, | |
| "grad_norm": 0.1611328125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1977, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.3963088878096163, | |
| "grad_norm": 0.045166015625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1193, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.4040796503156872, | |
| "grad_norm": 0.05810546875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0761, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.41185041282175816, | |
| "grad_norm": 0.055908203125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0432, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.41962117532782905, | |
| "grad_norm": 0.04248046875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0349, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.42739193783389995, | |
| "grad_norm": 0.09814453125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1242, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.43516270033997084, | |
| "grad_norm": 0.08154296875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1191, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.4429334628460418, | |
| "grad_norm": 0.0810546875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1463, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.4507042253521127, | |
| "grad_norm": 0.1259765625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1266, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.4584749878581836, | |
| "grad_norm": 0.1083984375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1161, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.4662457503642545, | |
| "grad_norm": 0.203125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1616, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.4740165128703254, | |
| "grad_norm": 0.050048828125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1255, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.4817872753763963, | |
| "grad_norm": 0.015869140625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0435, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.4895580378824672, | |
| "grad_norm": 0.032958984375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0162, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.4973288003885381, | |
| "grad_norm": 0.03662109375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0213, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.505099562894609, | |
| "grad_norm": 0.09765625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1135, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.5128703254006799, | |
| "grad_norm": 0.142578125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1382, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.5206410879067509, | |
| "grad_norm": 0.091796875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1748, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.5284118504128218, | |
| "grad_norm": 0.11669921875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.115, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.5361826129188927, | |
| "grad_norm": 0.11572265625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1357, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.5439533754249636, | |
| "grad_norm": 0.15234375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.159, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.5517241379310345, | |
| "grad_norm": 0.052734375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.112, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.5594949004371054, | |
| "grad_norm": 0.044921875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0378, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.5672656629431763, | |
| "grad_norm": 0.0269775390625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0328, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.5750364254492472, | |
| "grad_norm": 0.03369140625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0498, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.5828071879553182, | |
| "grad_norm": 0.0751953125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0752, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.5905779504613891, | |
| "grad_norm": 0.12158203125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1457, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.59834871296746, | |
| "grad_norm": 0.11474609375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1236, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.6061194754735308, | |
| "grad_norm": 0.22265625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1249, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.6138902379796017, | |
| "grad_norm": 0.1572265625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1376, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.6216610004856726, | |
| "grad_norm": 0.1474609375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1713, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.6294317629917435, | |
| "grad_norm": 0.04736328125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1155, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.6372025254978144, | |
| "grad_norm": 0.03466796875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0526, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.6449732880038854, | |
| "grad_norm": 0.0194091796875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0214, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.6527440505099563, | |
| "grad_norm": 0.06787109375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0553, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.6605148130160272, | |
| "grad_norm": 0.06201171875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0648, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.6682855755220981, | |
| "grad_norm": 0.07177734375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1258, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.676056338028169, | |
| "grad_norm": 0.09326171875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1269, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.6838271005342399, | |
| "grad_norm": 0.119140625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1127, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.6915978630403108, | |
| "grad_norm": 0.09130859375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1422, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.6993686255463817, | |
| "grad_norm": 0.255859375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.2041, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.7071393880524527, | |
| "grad_norm": 0.0458984375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1114, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.7149101505585236, | |
| "grad_norm": 0.0263671875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0674, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.7226809130645945, | |
| "grad_norm": 0.026611328125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0225, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.7304516755706654, | |
| "grad_norm": 0.043212890625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0435, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.7382224380767363, | |
| "grad_norm": 0.08984375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0567, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.7459932005828072, | |
| "grad_norm": 0.08251953125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0623, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.753763963088878, | |
| "grad_norm": 0.0869140625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0851, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.7615347255949491, | |
| "grad_norm": 0.11328125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1367, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.76930548810102, | |
| "grad_norm": 0.0908203125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0808, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.7770762506070908, | |
| "grad_norm": 0.259765625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1243, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.7848470131131617, | |
| "grad_norm": 0.048828125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1142, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.7926177756192326, | |
| "grad_norm": 0.007781982421875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0499, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.8003885381253035, | |
| "grad_norm": 0.0147705078125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0142, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.8081593006313744, | |
| "grad_norm": 0.0517578125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0259, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.8159300631374453, | |
| "grad_norm": 0.0810546875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0634, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.8237008256435163, | |
| "grad_norm": 0.11083984375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1226, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.8314715881495872, | |
| "grad_norm": 0.11572265625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1012, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.8392423506556581, | |
| "grad_norm": 0.12060546875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.103, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.847013113161729, | |
| "grad_norm": 0.1513671875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1475, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.8547838756677999, | |
| "grad_norm": 0.11181640625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1074, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.8625546381738708, | |
| "grad_norm": 0.0712890625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1353, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.8703254006799417, | |
| "grad_norm": 0.007781982421875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0262, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.8780961631860126, | |
| "grad_norm": 0.06689453125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0208, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.8858669256920836, | |
| "grad_norm": 0.026123046875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0307, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.8936376881981545, | |
| "grad_norm": 0.07421875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.075, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.9014084507042254, | |
| "grad_norm": 0.09423828125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1302, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.9091792132102963, | |
| "grad_norm": 0.1044921875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1165, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.9169499757163672, | |
| "grad_norm": 0.11474609375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1413, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.924720738222438, | |
| "grad_norm": 0.095703125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1307, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.932491500728509, | |
| "grad_norm": 0.1689453125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1533, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.9402622632345798, | |
| "grad_norm": 0.04443359375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1054, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 0.9480330257406508, | |
| "grad_norm": 0.031982421875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0223, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.9558037882467217, | |
| "grad_norm": 0.0400390625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0336, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.9635745507527926, | |
| "grad_norm": 0.05322265625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0454, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.9713453132588635, | |
| "grad_norm": 0.169921875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1449, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.9791160757649344, | |
| "grad_norm": 0.08935546875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1174, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.9868868382710053, | |
| "grad_norm": 0.11767578125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1063, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 0.9946576007770762, | |
| "grad_norm": 0.10400390625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.187, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.0024283632831472, | |
| "grad_norm": 0.046630859375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1296, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 1.010199125789218, | |
| "grad_norm": 0.0517578125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0741, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.017969888295289, | |
| "grad_norm": 0.0093994140625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0179, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 1.0257406508013598, | |
| "grad_norm": 0.027587890625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0117, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.0335114133074308, | |
| "grad_norm": 0.04736328125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0372, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 1.0412821758135018, | |
| "grad_norm": 0.07470703125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0489, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.0490529383195726, | |
| "grad_norm": 0.09423828125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0753, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 1.0568237008256436, | |
| "grad_norm": 0.11572265625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0678, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.0645944633317144, | |
| "grad_norm": 0.18359375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0719, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 1.0723652258377854, | |
| "grad_norm": 0.16796875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0758, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.0801359883438562, | |
| "grad_norm": 0.06005859375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0931, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 1.0879067508499272, | |
| "grad_norm": 0.0625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.068, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.095677513355998, | |
| "grad_norm": 0.051513671875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0374, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 1.103448275862069, | |
| "grad_norm": 0.032470703125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0129, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.11121903836814, | |
| "grad_norm": 0.0234375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0202, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 1.1189898008742107, | |
| "grad_norm": 0.08203125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0579, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.1267605633802817, | |
| "grad_norm": 0.11083984375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1094, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 1.1345313258863525, | |
| "grad_norm": 0.1298828125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.076, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.1423020883924235, | |
| "grad_norm": 0.138671875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0648, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 1.1500728508984945, | |
| "grad_norm": 0.146484375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0702, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.1578436134045653, | |
| "grad_norm": 0.0849609375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0881, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 1.1656143759106363, | |
| "grad_norm": 0.06103515625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0715, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.173385138416707, | |
| "grad_norm": 0.0225830078125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0241, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 1.1811559009227781, | |
| "grad_norm": 0.0546875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0222, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.188926663428849, | |
| "grad_norm": 0.061279296875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0319, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 1.19669742593492, | |
| "grad_norm": 0.08203125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0672, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.2044681884409907, | |
| "grad_norm": 0.061767578125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0693, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 1.2122389509470617, | |
| "grad_norm": 0.1171875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0811, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.2200097134531327, | |
| "grad_norm": 0.0966796875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0665, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 1.2277804759592035, | |
| "grad_norm": 0.08056640625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0703, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.2355512384652745, | |
| "grad_norm": 0.052734375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1042, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 1.2433220009713453, | |
| "grad_norm": 0.0257568359375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0667, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.2510927634774163, | |
| "grad_norm": 0.03564453125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0133, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 1.258863525983487, | |
| "grad_norm": 0.03369140625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0167, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.266634288489558, | |
| "grad_norm": 0.050537109375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.041, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 1.2744050509956288, | |
| "grad_norm": 0.130859375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1126, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.2821758135016998, | |
| "grad_norm": 0.08984375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.064, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 1.2899465760077709, | |
| "grad_norm": 0.09716796875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0683, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.2977173385138416, | |
| "grad_norm": 0.1259765625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0552, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 1.3054881010199126, | |
| "grad_norm": 0.1259765625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0839, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.3132588635259834, | |
| "grad_norm": 0.07275390625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1073, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 1.3210296260320544, | |
| "grad_norm": 0.08740234375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0792, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.3288003885381254, | |
| "grad_norm": 0.04638671875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0354, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 1.3365711510441962, | |
| "grad_norm": 0.05078125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0145, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.344341913550267, | |
| "grad_norm": 0.068359375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0348, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 1.352112676056338, | |
| "grad_norm": 0.08935546875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0524, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.359883438562409, | |
| "grad_norm": 0.0634765625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0427, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 1.3676542010684798, | |
| "grad_norm": 0.06591796875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0425, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.3754249635745508, | |
| "grad_norm": 0.166015625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.061, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 1.3831957260806216, | |
| "grad_norm": 0.17578125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0512, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.3909664885866926, | |
| "grad_norm": 0.059814453125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1101, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 1.3987372510927636, | |
| "grad_norm": 0.03759765625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0625, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.4065080135988344, | |
| "grad_norm": 0.053955078125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0264, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 1.4142787761049052, | |
| "grad_norm": 0.0615234375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0199, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 1.4220495386109762, | |
| "grad_norm": 0.033203125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0332, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 1.4298203011170472, | |
| "grad_norm": 0.08642578125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0886, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 1.437591063623118, | |
| "grad_norm": 0.0859375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0829, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 1.445361826129189, | |
| "grad_norm": 0.09765625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0626, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.4531325886352597, | |
| "grad_norm": 0.11181640625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0602, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 1.4609033511413307, | |
| "grad_norm": 0.16015625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0425, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 1.4686741136474017, | |
| "grad_norm": 0.056640625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.084, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 1.4764448761534725, | |
| "grad_norm": 0.052001953125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0844, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.4842156386595435, | |
| "grad_norm": 0.052978515625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0313, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 1.4919864011656143, | |
| "grad_norm": 0.0380859375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0201, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.4997571636716853, | |
| "grad_norm": 0.053955078125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0285, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 1.5075279261777563, | |
| "grad_norm": 0.061279296875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0593, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 1.515298688683827, | |
| "grad_norm": 0.0380859375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0689, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 1.523069451189898, | |
| "grad_norm": 0.080078125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0739, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 1.530840213695969, | |
| "grad_norm": 0.1513671875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0539, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 1.53861097620204, | |
| "grad_norm": 0.1240234375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0781, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 1.5463817387081107, | |
| "grad_norm": 0.064453125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0748, | |
| "step": 995 | |
| }, | |
| { | |
| "epoch": 1.5541525012141817, | |
| "grad_norm": 0.06591796875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0867, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.5619232637202525, | |
| "grad_norm": 0.01556396484375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0261, | |
| "step": 1005 | |
| }, | |
| { | |
| "epoch": 1.5696940262263235, | |
| "grad_norm": 0.0322265625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0182, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 1.5774647887323945, | |
| "grad_norm": 0.055908203125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0339, | |
| "step": 1015 | |
| }, | |
| { | |
| "epoch": 1.5852355512384653, | |
| "grad_norm": 0.10107421875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0663, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 1.593006313744536, | |
| "grad_norm": 0.076171875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0737, | |
| "step": 1025 | |
| }, | |
| { | |
| "epoch": 1.600777076250607, | |
| "grad_norm": 0.12890625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0849, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 1.608547838756678, | |
| "grad_norm": 0.150390625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0393, | |
| "step": 1035 | |
| }, | |
| { | |
| "epoch": 1.616318601262749, | |
| "grad_norm": 0.1552734375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.085, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 1.6240893637688198, | |
| "grad_norm": 0.044921875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0806, | |
| "step": 1045 | |
| }, | |
| { | |
| "epoch": 1.6318601262748906, | |
| "grad_norm": 0.05859375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0804, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 1.6396308887809616, | |
| "grad_norm": 0.0181884765625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0238, | |
| "step": 1055 | |
| }, | |
| { | |
| "epoch": 1.6474016512870326, | |
| "grad_norm": 0.060546875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0176, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 1.6551724137931034, | |
| "grad_norm": 0.049072265625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0223, | |
| "step": 1065 | |
| }, | |
| { | |
| "epoch": 1.6629431762991742, | |
| "grad_norm": 0.09619140625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0682, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 1.6707139388052452, | |
| "grad_norm": 0.08642578125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0745, | |
| "step": 1075 | |
| }, | |
| { | |
| "epoch": 1.6784847013113162, | |
| "grad_norm": 0.080078125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0643, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 1.6862554638173872, | |
| "grad_norm": 0.14453125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0538, | |
| "step": 1085 | |
| }, | |
| { | |
| "epoch": 1.694026226323458, | |
| "grad_norm": 0.091796875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0653, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 1.7017969888295288, | |
| "grad_norm": 0.06005859375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1079, | |
| "step": 1095 | |
| }, | |
| { | |
| "epoch": 1.7095677513355998, | |
| "grad_norm": 0.053955078125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0545, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.7173385138416708, | |
| "grad_norm": 0.025390625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0126, | |
| "step": 1105 | |
| }, | |
| { | |
| "epoch": 1.7251092763477416, | |
| "grad_norm": 0.0419921875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0101, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 1.7328800388538124, | |
| "grad_norm": 0.0908203125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0302, | |
| "step": 1115 | |
| }, | |
| { | |
| "epoch": 1.7406508013598834, | |
| "grad_norm": 0.0849609375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0467, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 1.7484215638659544, | |
| "grad_norm": 0.1435546875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0702, | |
| "step": 1125 | |
| }, | |
| { | |
| "epoch": 1.7561923263720254, | |
| "grad_norm": 0.11865234375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0664, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 1.7639630888780962, | |
| "grad_norm": 0.140625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0746, | |
| "step": 1135 | |
| }, | |
| { | |
| "epoch": 1.771733851384167, | |
| "grad_norm": 0.1357421875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0458, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 1.779504613890238, | |
| "grad_norm": 0.0576171875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1097, | |
| "step": 1145 | |
| }, | |
| { | |
| "epoch": 1.787275376396309, | |
| "grad_norm": 0.038818359375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0759, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 1.79504613890238, | |
| "grad_norm": 0.056396484375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0274, | |
| "step": 1155 | |
| }, | |
| { | |
| "epoch": 1.8028169014084507, | |
| "grad_norm": 0.05078125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0146, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 1.8105876639145215, | |
| "grad_norm": 0.048583984375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0251, | |
| "step": 1165 | |
| }, | |
| { | |
| "epoch": 1.8183584264205925, | |
| "grad_norm": 0.09423828125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0727, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 1.8261291889266635, | |
| "grad_norm": 0.109375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0685, | |
| "step": 1175 | |
| }, | |
| { | |
| "epoch": 1.8338999514327343, | |
| "grad_norm": 0.138671875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0829, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 1.841670713938805, | |
| "grad_norm": 0.07275390625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0445, | |
| "step": 1185 | |
| }, | |
| { | |
| "epoch": 1.849441476444876, | |
| "grad_norm": 0.2041015625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0699, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 1.8572122389509471, | |
| "grad_norm": 0.07373046875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0995, | |
| "step": 1195 | |
| }, | |
| { | |
| "epoch": 1.8649830014570181, | |
| "grad_norm": 0.0546875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0798, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.872753763963089, | |
| "grad_norm": 0.045654296875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.019, | |
| "step": 1205 | |
| }, | |
| { | |
| "epoch": 1.8805245264691597, | |
| "grad_norm": 0.0673828125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0179, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 1.8882952889752307, | |
| "grad_norm": 0.06640625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.027, | |
| "step": 1215 | |
| }, | |
| { | |
| "epoch": 1.8960660514813017, | |
| "grad_norm": 0.0830078125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.056, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 1.9038368139873725, | |
| "grad_norm": 0.1279296875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0699, | |
| "step": 1225 | |
| }, | |
| { | |
| "epoch": 1.9116075764934433, | |
| "grad_norm": 0.09912109375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.063, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 1.9193783389995143, | |
| "grad_norm": 0.1328125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0633, | |
| "step": 1235 | |
| }, | |
| { | |
| "epoch": 1.9271491015055853, | |
| "grad_norm": 0.169921875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0662, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 1.9349198640116563, | |
| "grad_norm": 0.057373046875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0728, | |
| "step": 1245 | |
| }, | |
| { | |
| "epoch": 1.942690626517727, | |
| "grad_norm": 0.056884765625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0569, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.9504613890237978, | |
| "grad_norm": 0.052490234375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0309, | |
| "step": 1255 | |
| }, | |
| { | |
| "epoch": 1.9582321515298688, | |
| "grad_norm": 0.0625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0352, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 1.9660029140359399, | |
| "grad_norm": 0.06591796875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0289, | |
| "step": 1265 | |
| }, | |
| { | |
| "epoch": 1.9737736765420106, | |
| "grad_norm": 0.09033203125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0689, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 1.9815444390480816, | |
| "grad_norm": 0.1015625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0669, | |
| "step": 1275 | |
| }, | |
| { | |
| "epoch": 1.9893152015541524, | |
| "grad_norm": 0.09814453125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.056, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 1.9970859640602234, | |
| "grad_norm": 0.111328125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0683, | |
| "step": 1285 | |
| }, | |
| { | |
| "epoch": 2.0048567265662944, | |
| "grad_norm": 0.055908203125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0879, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 2.0126274890723654, | |
| "grad_norm": 0.037109375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.026, | |
| "step": 1295 | |
| }, | |
| { | |
| "epoch": 2.020398251578436, | |
| "grad_norm": 0.0252685546875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0134, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.028169014084507, | |
| "grad_norm": 0.06640625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0336, | |
| "step": 1305 | |
| }, | |
| { | |
| "epoch": 2.035939776590578, | |
| "grad_norm": 0.06494140625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0374, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 2.043710539096649, | |
| "grad_norm": 0.091796875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0574, | |
| "step": 1315 | |
| }, | |
| { | |
| "epoch": 2.0514813016027196, | |
| "grad_norm": 0.1181640625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0282, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 2.0592520641087906, | |
| "grad_norm": 0.0947265625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0419, | |
| "step": 1325 | |
| }, | |
| { | |
| "epoch": 2.0670228266148616, | |
| "grad_norm": 0.154296875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0283, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 2.0747935891209326, | |
| "grad_norm": 0.054931640625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0379, | |
| "step": 1335 | |
| }, | |
| { | |
| "epoch": 2.0825643516270036, | |
| "grad_norm": 0.059814453125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0721, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 2.090335114133074, | |
| "grad_norm": 0.03466796875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0264, | |
| "step": 1345 | |
| }, | |
| { | |
| "epoch": 2.098105876639145, | |
| "grad_norm": 0.0299072265625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0079, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 2.105876639145216, | |
| "grad_norm": 0.06494140625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0087, | |
| "step": 1355 | |
| }, | |
| { | |
| "epoch": 2.113647401651287, | |
| "grad_norm": 0.06201171875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0131, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 2.1214181641573577, | |
| "grad_norm": 0.1005859375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.04, | |
| "step": 1365 | |
| }, | |
| { | |
| "epoch": 2.1291889266634287, | |
| "grad_norm": 0.06298828125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.035, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 2.1369596891694997, | |
| "grad_norm": 0.08544921875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0477, | |
| "step": 1375 | |
| }, | |
| { | |
| "epoch": 2.1447304516755707, | |
| "grad_norm": 0.09521484375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.023, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 2.1525012141816418, | |
| "grad_norm": 0.0810546875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0341, | |
| "step": 1385 | |
| }, | |
| { | |
| "epoch": 2.1602719766877123, | |
| "grad_norm": 0.087890625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0797, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 2.1680427391937833, | |
| "grad_norm": 0.0125732421875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0308, | |
| "step": 1395 | |
| }, | |
| { | |
| "epoch": 2.1758135016998543, | |
| "grad_norm": 0.0439453125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0105, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 2.1835842642059253, | |
| "grad_norm": 0.04541015625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0275, | |
| "step": 1405 | |
| }, | |
| { | |
| "epoch": 2.191355026711996, | |
| "grad_norm": 0.0311279296875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0155, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 2.199125789218067, | |
| "grad_norm": 0.10205078125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0386, | |
| "step": 1415 | |
| }, | |
| { | |
| "epoch": 2.206896551724138, | |
| "grad_norm": 0.2080078125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.05, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 2.214667314230209, | |
| "grad_norm": 0.0712890625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0236, | |
| "step": 1425 | |
| }, | |
| { | |
| "epoch": 2.22243807673628, | |
| "grad_norm": 0.1181640625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0198, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 2.2302088392423505, | |
| "grad_norm": 0.1337890625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0274, | |
| "step": 1435 | |
| }, | |
| { | |
| "epoch": 2.2379796017484215, | |
| "grad_norm": 0.130859375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0659, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 2.2457503642544925, | |
| "grad_norm": 0.041015625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0416, | |
| "step": 1445 | |
| }, | |
| { | |
| "epoch": 2.2535211267605635, | |
| "grad_norm": 0.02685546875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0123, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 2.2612918892666345, | |
| "grad_norm": 0.0245361328125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0078, | |
| "step": 1455 | |
| }, | |
| { | |
| "epoch": 2.269062651772705, | |
| "grad_norm": 0.04833984375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0186, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 2.276833414278776, | |
| "grad_norm": 0.07275390625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0529, | |
| "step": 1465 | |
| }, | |
| { | |
| "epoch": 2.284604176784847, | |
| "grad_norm": 0.0888671875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0243, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 2.292374939290918, | |
| "grad_norm": 0.07421875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0296, | |
| "step": 1475 | |
| }, | |
| { | |
| "epoch": 2.300145701796989, | |
| "grad_norm": 0.21875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0334, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 2.3079164643030596, | |
| "grad_norm": 0.109375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0306, | |
| "step": 1485 | |
| }, | |
| { | |
| "epoch": 2.3156872268091306, | |
| "grad_norm": 0.0810546875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0796, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 2.3234579893152016, | |
| "grad_norm": 0.05078125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.023, | |
| "step": 1495 | |
| }, | |
| { | |
| "epoch": 2.3312287518212726, | |
| "grad_norm": 0.038818359375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0128, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 2.338999514327343, | |
| "grad_norm": 0.05712890625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0166, | |
| "step": 1505 | |
| }, | |
| { | |
| "epoch": 2.346770276833414, | |
| "grad_norm": 0.06103515625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0222, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 2.354541039339485, | |
| "grad_norm": 0.09521484375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0375, | |
| "step": 1515 | |
| }, | |
| { | |
| "epoch": 2.3623118018455562, | |
| "grad_norm": 0.109375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0284, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 2.370082564351627, | |
| "grad_norm": 0.0908203125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0257, | |
| "step": 1525 | |
| }, | |
| { | |
| "epoch": 2.377853326857698, | |
| "grad_norm": 0.13671875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0423, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 2.385624089363769, | |
| "grad_norm": 0.08642578125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0283, | |
| "step": 1535 | |
| }, | |
| { | |
| "epoch": 2.39339485186984, | |
| "grad_norm": 0.053466796875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0543, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 2.401165614375911, | |
| "grad_norm": 0.045166015625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0405, | |
| "step": 1545 | |
| }, | |
| { | |
| "epoch": 2.4089363768819814, | |
| "grad_norm": 0.00860595703125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0088, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 2.4167071393880524, | |
| "grad_norm": 0.0206298828125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0092, | |
| "step": 1555 | |
| }, | |
| { | |
| "epoch": 2.4244779018941234, | |
| "grad_norm": 0.03564453125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0233, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 2.4322486644001944, | |
| "grad_norm": 0.1103515625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0386, | |
| "step": 1565 | |
| }, | |
| { | |
| "epoch": 2.4400194269062654, | |
| "grad_norm": 0.0830078125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0318, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 2.447790189412336, | |
| "grad_norm": 0.1328125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0313, | |
| "step": 1575 | |
| }, | |
| { | |
| "epoch": 2.455560951918407, | |
| "grad_norm": 0.12060546875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0258, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 2.463331714424478, | |
| "grad_norm": 0.12060546875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.039, | |
| "step": 1585 | |
| }, | |
| { | |
| "epoch": 2.471102476930549, | |
| "grad_norm": 0.072265625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0593, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 2.4788732394366195, | |
| "grad_norm": 0.05126953125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0398, | |
| "step": 1595 | |
| }, | |
| { | |
| "epoch": 2.4866440019426905, | |
| "grad_norm": 0.034912109375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0067, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 2.4944147644487615, | |
| "grad_norm": 0.0284423828125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0132, | |
| "step": 1605 | |
| }, | |
| { | |
| "epoch": 2.5021855269548325, | |
| "grad_norm": 0.064453125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0213, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 2.509956289460903, | |
| "grad_norm": 0.0732421875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0299, | |
| "step": 1615 | |
| }, | |
| { | |
| "epoch": 2.517727051966974, | |
| "grad_norm": 0.10888671875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.036, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 2.525497814473045, | |
| "grad_norm": 0.1435546875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0313, | |
| "step": 1625 | |
| }, | |
| { | |
| "epoch": 2.533268576979116, | |
| "grad_norm": 0.09375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.03, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 2.541039339485187, | |
| "grad_norm": 0.1201171875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0308, | |
| "step": 1635 | |
| }, | |
| { | |
| "epoch": 2.5488101019912577, | |
| "grad_norm": 0.109375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.077, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 2.5565808644973287, | |
| "grad_norm": 0.03466796875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0328, | |
| "step": 1645 | |
| }, | |
| { | |
| "epoch": 2.5643516270033997, | |
| "grad_norm": 0.03955078125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0118, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 2.5721223895094707, | |
| "grad_norm": 0.0712890625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0127, | |
| "step": 1655 | |
| }, | |
| { | |
| "epoch": 2.5798931520155417, | |
| "grad_norm": 0.06201171875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0154, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 2.5876639145216123, | |
| "grad_norm": 0.06982421875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0467, | |
| "step": 1665 | |
| }, | |
| { | |
| "epoch": 2.5954346770276833, | |
| "grad_norm": 0.11572265625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0477, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 2.6032054395337543, | |
| "grad_norm": 0.1142578125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0265, | |
| "step": 1675 | |
| }, | |
| { | |
| "epoch": 2.6109762020398253, | |
| "grad_norm": 0.08984375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0214, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 2.6187469645458963, | |
| "grad_norm": 0.1328125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0364, | |
| "step": 1685 | |
| }, | |
| { | |
| "epoch": 2.626517727051967, | |
| "grad_norm": 0.062255859375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0746, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 2.634288489558038, | |
| "grad_norm": 0.036376953125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.026, | |
| "step": 1695 | |
| }, | |
| { | |
| "epoch": 2.642059252064109, | |
| "grad_norm": 0.00982666015625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0106, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 2.64983001457018, | |
| "grad_norm": 0.072265625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0181, | |
| "step": 1705 | |
| }, | |
| { | |
| "epoch": 2.657600777076251, | |
| "grad_norm": 0.062255859375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0317, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 2.6653715395823214, | |
| "grad_norm": 0.0859375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0366, | |
| "step": 1715 | |
| }, | |
| { | |
| "epoch": 2.6731423020883924, | |
| "grad_norm": 0.050048828125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0273, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 2.6809130645944634, | |
| "grad_norm": 0.15234375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0385, | |
| "step": 1725 | |
| }, | |
| { | |
| "epoch": 2.688683827100534, | |
| "grad_norm": 0.1416015625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0333, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 2.696454589606605, | |
| "grad_norm": 0.1552734375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.045, | |
| "step": 1735 | |
| }, | |
| { | |
| "epoch": 2.704225352112676, | |
| "grad_norm": 0.0673828125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0696, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 2.711996114618747, | |
| "grad_norm": 0.033203125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0295, | |
| "step": 1745 | |
| }, | |
| { | |
| "epoch": 2.719766877124818, | |
| "grad_norm": 0.04150390625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.01, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 2.7275376396308886, | |
| "grad_norm": 0.04150390625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0129, | |
| "step": 1755 | |
| }, | |
| { | |
| "epoch": 2.7353084021369596, | |
| "grad_norm": 0.0693359375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0209, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 2.7430791646430306, | |
| "grad_norm": 0.1298828125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0456, | |
| "step": 1765 | |
| }, | |
| { | |
| "epoch": 2.7508499271491016, | |
| "grad_norm": 0.138671875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0398, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 2.7586206896551726, | |
| "grad_norm": 0.10205078125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0402, | |
| "step": 1775 | |
| }, | |
| { | |
| "epoch": 2.766391452161243, | |
| "grad_norm": 0.06396484375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0447, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 2.774162214667314, | |
| "grad_norm": 0.1689453125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0308, | |
| "step": 1785 | |
| }, | |
| { | |
| "epoch": 2.781932977173385, | |
| "grad_norm": 0.0712890625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0725, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 2.789703739679456, | |
| "grad_norm": 0.047607421875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0362, | |
| "step": 1795 | |
| }, | |
| { | |
| "epoch": 2.797474502185527, | |
| "grad_norm": 0.034912109375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0246, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 2.8052452646915977, | |
| "grad_norm": 0.039794921875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0129, | |
| "step": 1805 | |
| }, | |
| { | |
| "epoch": 2.8130160271976687, | |
| "grad_norm": 0.064453125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0158, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 2.8207867897037397, | |
| "grad_norm": 0.099609375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0383, | |
| "step": 1815 | |
| }, | |
| { | |
| "epoch": 2.8285575522098103, | |
| "grad_norm": 0.125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0278, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 2.8363283147158818, | |
| "grad_norm": 0.060791015625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0296, | |
| "step": 1825 | |
| }, | |
| { | |
| "epoch": 2.8440990772219523, | |
| "grad_norm": 0.11767578125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0381, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 2.8518698397280233, | |
| "grad_norm": 0.0888671875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.023, | |
| "step": 1835 | |
| }, | |
| { | |
| "epoch": 2.8596406022340943, | |
| "grad_norm": 0.06396484375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0857, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 2.867411364740165, | |
| "grad_norm": 0.027099609375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0419, | |
| "step": 1845 | |
| }, | |
| { | |
| "epoch": 2.875182127246236, | |
| "grad_norm": 0.0546875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0366, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 2.882952889752307, | |
| "grad_norm": 0.03955078125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0095, | |
| "step": 1855 | |
| }, | |
| { | |
| "epoch": 2.890723652258378, | |
| "grad_norm": 0.02978515625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.027, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 2.898494414764449, | |
| "grad_norm": 0.109375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0469, | |
| "step": 1865 | |
| }, | |
| { | |
| "epoch": 2.9062651772705195, | |
| "grad_norm": 0.08935546875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0318, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 2.9140359397765905, | |
| "grad_norm": 0.048095703125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0298, | |
| "step": 1875 | |
| }, | |
| { | |
| "epoch": 2.9218067022826615, | |
| "grad_norm": 0.1220703125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0286, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 2.9295774647887325, | |
| "grad_norm": 0.232421875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0286, | |
| "step": 1885 | |
| }, | |
| { | |
| "epoch": 2.9373482272948035, | |
| "grad_norm": 0.04443359375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0868, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 2.945118989800874, | |
| "grad_norm": 0.0240478515625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0325, | |
| "step": 1895 | |
| }, | |
| { | |
| "epoch": 2.952889752306945, | |
| "grad_norm": 0.01556396484375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0273, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 2.960660514813016, | |
| "grad_norm": 0.0517578125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.017, | |
| "step": 1905 | |
| }, | |
| { | |
| "epoch": 2.968431277319087, | |
| "grad_norm": 0.056884765625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0567, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 2.976202039825158, | |
| "grad_norm": 0.1298828125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0429, | |
| "step": 1915 | |
| }, | |
| { | |
| "epoch": 2.9839728023312286, | |
| "grad_norm": 0.07373046875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0317, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 2.9917435648372996, | |
| "grad_norm": 0.05712890625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0283, | |
| "step": 1925 | |
| }, | |
| { | |
| "epoch": 2.9995143273433706, | |
| "grad_norm": 0.15234375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0246, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 3.0072850898494417, | |
| "grad_norm": 0.062255859375, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0468, | |
| "step": 1935 | |
| }, | |
| { | |
| "epoch": 3.015055852355512, | |
| "grad_norm": 0.0272216796875, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0136, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 3.022826614861583, | |
| "grad_norm": 0.053955078125, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0105, | |
| "step": 1945 | |
| }, | |
| { | |
| "epoch": 3.030597377367654, | |
| "grad_norm": 0.10009765625, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0102, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 3.030597377367654, | |
| "step": 1950, | |
| "total_flos": 3.358681688807424e+17, | |
| "train_loss": 0.07148738998824206, | |
| "train_runtime": 38765.133, | |
| "train_samples_per_second": 0.805, | |
| "train_steps_per_second": 0.05 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 1950, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 4, | |
| "save_steps": 90, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.358681688807424e+17, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |