{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.030597377367654, "eval_steps": 500, "global_step": 1950, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007770762506070908, "grad_norm": 0.22265625, "learning_rate": 0.0001, "loss": 1.6268, "step": 5 }, { "epoch": 0.015541525012141816, "grad_norm": 0.140625, "learning_rate": 0.0001, "loss": 0.3484, "step": 10 }, { "epoch": 0.023312287518212724, "grad_norm": 0.11181640625, "learning_rate": 0.0001, "loss": 0.2158, "step": 15 }, { "epoch": 0.03108305002428363, "grad_norm": 0.10302734375, "learning_rate": 0.0001, "loss": 0.1784, "step": 20 }, { "epoch": 0.03885381253035454, "grad_norm": 0.1328125, "learning_rate": 0.0001, "loss": 0.2267, "step": 25 }, { "epoch": 0.04662457503642545, "grad_norm": 0.181640625, "learning_rate": 0.0001, "loss": 0.3222, "step": 30 }, { "epoch": 0.054395337542496355, "grad_norm": 0.255859375, "learning_rate": 0.0001, "loss": 0.2583, "step": 35 }, { "epoch": 0.06216610004856726, "grad_norm": 0.369140625, "learning_rate": 0.0001, "loss": 0.2882, "step": 40 }, { "epoch": 0.06993686255463817, "grad_norm": 0.275390625, "learning_rate": 0.0001, "loss": 0.2039, "step": 45 }, { "epoch": 0.07770762506070908, "grad_norm": 0.2236328125, "learning_rate": 0.0001, "loss": 0.2636, "step": 50 }, { "epoch": 0.08547838756677999, "grad_norm": 0.07373046875, "learning_rate": 0.0001, "loss": 0.149, "step": 55 }, { "epoch": 0.0932491500728509, "grad_norm": 0.0390625, "learning_rate": 0.0001, "loss": 0.0486, "step": 60 }, { "epoch": 0.1010199125789218, "grad_norm": 0.058349609375, "learning_rate": 0.0001, "loss": 0.0444, "step": 65 }, { "epoch": 0.10879067508499271, "grad_norm": 0.043212890625, "learning_rate": 0.0001, "loss": 0.034, "step": 70 }, { "epoch": 0.11656143759106362, "grad_norm": 0.09912109375, "learning_rate": 0.0001, "loss": 0.0925, "step": 75 }, { "epoch": 0.12433220009713453, "grad_norm": 0.08837890625, "learning_rate": 0.0001, "loss": 0.1931, "step": 80 }, { "epoch": 0.13210296260320545, "grad_norm": 0.1357421875, "learning_rate": 0.0001, "loss": 0.1508, "step": 85 }, { "epoch": 0.13987372510927634, "grad_norm": 0.1484375, "learning_rate": 0.0001, "loss": 0.1954, "step": 90 }, { "epoch": 0.14764448761534726, "grad_norm": 0.12060546875, "learning_rate": 0.0001, "loss": 0.1666, "step": 95 }, { "epoch": 0.15541525012141816, "grad_norm": 0.32421875, "learning_rate": 0.0001, "loss": 0.263, "step": 100 }, { "epoch": 0.16318601262748908, "grad_norm": 0.04736328125, "learning_rate": 0.0001, "loss": 0.1395, "step": 105 }, { "epoch": 0.17095677513355997, "grad_norm": 0.04296875, "learning_rate": 0.0001, "loss": 0.0486, "step": 110 }, { "epoch": 0.1787275376396309, "grad_norm": 0.031982421875, "learning_rate": 0.0001, "loss": 0.0418, "step": 115 }, { "epoch": 0.1864983001457018, "grad_norm": 0.034912109375, "learning_rate": 0.0001, "loss": 0.0538, "step": 120 }, { "epoch": 0.1942690626517727, "grad_norm": 0.0927734375, "learning_rate": 0.0001, "loss": 0.0847, "step": 125 }, { "epoch": 0.2020398251578436, "grad_norm": 0.1298828125, "learning_rate": 0.0001, "loss": 0.1045, "step": 130 }, { "epoch": 0.20981058766391453, "grad_norm": 0.12451171875, "learning_rate": 0.0001, "loss": 0.1584, "step": 135 }, { "epoch": 0.21758135016998542, "grad_norm": 0.1396484375, "learning_rate": 0.0001, "loss": 0.1446, "step": 140 }, { "epoch": 0.22535211267605634, "grad_norm": 0.1865234375, "learning_rate": 0.0001, "loss": 0.1932, "step": 145 }, { "epoch": 0.23312287518212724, "grad_norm": 0.2412109375, "learning_rate": 0.0001, "loss": 0.2388, "step": 150 }, { "epoch": 0.24089363768819816, "grad_norm": 0.045166015625, "learning_rate": 0.0001, "loss": 0.0958, "step": 155 }, { "epoch": 0.24866440019426905, "grad_norm": 0.04443359375, "learning_rate": 0.0001, "loss": 0.0522, "step": 160 }, { "epoch": 0.25643516270033995, "grad_norm": 0.044921875, "learning_rate": 0.0001, "loss": 0.039, "step": 165 }, { "epoch": 0.2642059252064109, "grad_norm": 0.044189453125, "learning_rate": 0.0001, "loss": 0.0332, "step": 170 }, { "epoch": 0.2719766877124818, "grad_norm": 0.0673828125, "learning_rate": 0.0001, "loss": 0.0846, "step": 175 }, { "epoch": 0.2797474502185527, "grad_norm": 0.09423828125, "learning_rate": 0.0001, "loss": 0.1465, "step": 180 }, { "epoch": 0.2875182127246236, "grad_norm": 0.09765625, "learning_rate": 0.0001, "loss": 0.1395, "step": 185 }, { "epoch": 0.29528897523069453, "grad_norm": 0.06640625, "learning_rate": 0.0001, "loss": 0.1369, "step": 190 }, { "epoch": 0.3030597377367654, "grad_norm": 0.1669921875, "learning_rate": 0.0001, "loss": 0.1556, "step": 195 }, { "epoch": 0.3108305002428363, "grad_norm": 0.1669921875, "learning_rate": 0.0001, "loss": 0.1521, "step": 200 }, { "epoch": 0.3186012627489072, "grad_norm": 0.0286865234375, "learning_rate": 0.0001, "loss": 0.1091, "step": 205 }, { "epoch": 0.32637202525497816, "grad_norm": 0.0262451171875, "learning_rate": 0.0001, "loss": 0.0511, "step": 210 }, { "epoch": 0.33414278776104905, "grad_norm": 0.033447265625, "learning_rate": 0.0001, "loss": 0.028, "step": 215 }, { "epoch": 0.34191355026711995, "grad_norm": 0.0218505859375, "learning_rate": 0.0001, "loss": 0.0297, "step": 220 }, { "epoch": 0.34968431277319084, "grad_norm": 0.0791015625, "learning_rate": 0.0001, "loss": 0.0885, "step": 225 }, { "epoch": 0.3574550752792618, "grad_norm": 0.09423828125, "learning_rate": 0.0001, "loss": 0.1735, "step": 230 }, { "epoch": 0.3652258377853327, "grad_norm": 0.162109375, "learning_rate": 0.0001, "loss": 0.1268, "step": 235 }, { "epoch": 0.3729966002914036, "grad_norm": 0.1015625, "learning_rate": 0.0001, "loss": 0.1057, "step": 240 }, { "epoch": 0.38076736279747453, "grad_norm": 0.212890625, "learning_rate": 0.0001, "loss": 0.1398, "step": 245 }, { "epoch": 0.3885381253035454, "grad_norm": 0.1611328125, "learning_rate": 0.0001, "loss": 0.1977, "step": 250 }, { "epoch": 0.3963088878096163, "grad_norm": 0.045166015625, "learning_rate": 0.0001, "loss": 0.1193, "step": 255 }, { "epoch": 0.4040796503156872, "grad_norm": 0.05810546875, "learning_rate": 0.0001, "loss": 0.0761, "step": 260 }, { "epoch": 0.41185041282175816, "grad_norm": 0.055908203125, "learning_rate": 0.0001, "loss": 0.0432, "step": 265 }, { "epoch": 0.41962117532782905, "grad_norm": 0.04248046875, "learning_rate": 0.0001, "loss": 0.0349, "step": 270 }, { "epoch": 0.42739193783389995, "grad_norm": 0.09814453125, "learning_rate": 0.0001, "loss": 0.1242, "step": 275 }, { "epoch": 0.43516270033997084, "grad_norm": 0.08154296875, "learning_rate": 0.0001, "loss": 0.1191, "step": 280 }, { "epoch": 0.4429334628460418, "grad_norm": 0.0810546875, "learning_rate": 0.0001, "loss": 0.1463, "step": 285 }, { "epoch": 0.4507042253521127, "grad_norm": 0.1259765625, "learning_rate": 0.0001, "loss": 0.1266, "step": 290 }, { "epoch": 0.4584749878581836, "grad_norm": 0.1083984375, "learning_rate": 0.0001, "loss": 0.1161, "step": 295 }, { "epoch": 0.4662457503642545, "grad_norm": 0.203125, "learning_rate": 0.0001, "loss": 0.1616, "step": 300 }, { "epoch": 0.4740165128703254, "grad_norm": 0.050048828125, "learning_rate": 0.0001, "loss": 0.1255, "step": 305 }, { "epoch": 0.4817872753763963, "grad_norm": 0.015869140625, "learning_rate": 0.0001, "loss": 0.0435, "step": 310 }, { "epoch": 0.4895580378824672, "grad_norm": 0.032958984375, "learning_rate": 0.0001, "loss": 0.0162, "step": 315 }, { "epoch": 0.4973288003885381, "grad_norm": 0.03662109375, "learning_rate": 0.0001, "loss": 0.0213, "step": 320 }, { "epoch": 0.505099562894609, "grad_norm": 0.09765625, "learning_rate": 0.0001, "loss": 0.1135, "step": 325 }, { "epoch": 0.5128703254006799, "grad_norm": 0.142578125, "learning_rate": 0.0001, "loss": 0.1382, "step": 330 }, { "epoch": 0.5206410879067509, "grad_norm": 0.091796875, "learning_rate": 0.0001, "loss": 0.1748, "step": 335 }, { "epoch": 0.5284118504128218, "grad_norm": 0.11669921875, "learning_rate": 0.0001, "loss": 0.115, "step": 340 }, { "epoch": 0.5361826129188927, "grad_norm": 0.11572265625, "learning_rate": 0.0001, "loss": 0.1357, "step": 345 }, { "epoch": 0.5439533754249636, "grad_norm": 0.15234375, "learning_rate": 0.0001, "loss": 0.159, "step": 350 }, { "epoch": 0.5517241379310345, "grad_norm": 0.052734375, "learning_rate": 0.0001, "loss": 0.112, "step": 355 }, { "epoch": 0.5594949004371054, "grad_norm": 0.044921875, "learning_rate": 0.0001, "loss": 0.0378, "step": 360 }, { "epoch": 0.5672656629431763, "grad_norm": 0.0269775390625, "learning_rate": 0.0001, "loss": 0.0328, "step": 365 }, { "epoch": 0.5750364254492472, "grad_norm": 0.03369140625, "learning_rate": 0.0001, "loss": 0.0498, "step": 370 }, { "epoch": 0.5828071879553182, "grad_norm": 0.0751953125, "learning_rate": 0.0001, "loss": 0.0752, "step": 375 }, { "epoch": 0.5905779504613891, "grad_norm": 0.12158203125, "learning_rate": 0.0001, "loss": 0.1457, "step": 380 }, { "epoch": 0.59834871296746, "grad_norm": 0.11474609375, "learning_rate": 0.0001, "loss": 0.1236, "step": 385 }, { "epoch": 0.6061194754735308, "grad_norm": 0.22265625, "learning_rate": 0.0001, "loss": 0.1249, "step": 390 }, { "epoch": 0.6138902379796017, "grad_norm": 0.1572265625, "learning_rate": 0.0001, "loss": 0.1376, "step": 395 }, { "epoch": 0.6216610004856726, "grad_norm": 0.1474609375, "learning_rate": 0.0001, "loss": 0.1713, "step": 400 }, { "epoch": 0.6294317629917435, "grad_norm": 0.04736328125, "learning_rate": 0.0001, "loss": 0.1155, "step": 405 }, { "epoch": 0.6372025254978144, "grad_norm": 0.03466796875, "learning_rate": 0.0001, "loss": 0.0526, "step": 410 }, { "epoch": 0.6449732880038854, "grad_norm": 0.0194091796875, "learning_rate": 0.0001, "loss": 0.0214, "step": 415 }, { "epoch": 0.6527440505099563, "grad_norm": 0.06787109375, "learning_rate": 0.0001, "loss": 0.0553, "step": 420 }, { "epoch": 0.6605148130160272, "grad_norm": 0.06201171875, "learning_rate": 0.0001, "loss": 0.0648, "step": 425 }, { "epoch": 0.6682855755220981, "grad_norm": 0.07177734375, "learning_rate": 0.0001, "loss": 0.1258, "step": 430 }, { "epoch": 0.676056338028169, "grad_norm": 0.09326171875, "learning_rate": 0.0001, "loss": 0.1269, "step": 435 }, { "epoch": 0.6838271005342399, "grad_norm": 0.119140625, "learning_rate": 0.0001, "loss": 0.1127, "step": 440 }, { "epoch": 0.6915978630403108, "grad_norm": 0.09130859375, "learning_rate": 0.0001, "loss": 0.1422, "step": 445 }, { "epoch": 0.6993686255463817, "grad_norm": 0.255859375, "learning_rate": 0.0001, "loss": 0.2041, "step": 450 }, { "epoch": 0.7071393880524527, "grad_norm": 0.0458984375, "learning_rate": 0.0001, "loss": 0.1114, "step": 455 }, { "epoch": 0.7149101505585236, "grad_norm": 0.0263671875, "learning_rate": 0.0001, "loss": 0.0674, "step": 460 }, { "epoch": 0.7226809130645945, "grad_norm": 0.026611328125, "learning_rate": 0.0001, "loss": 0.0225, "step": 465 }, { "epoch": 0.7304516755706654, "grad_norm": 0.043212890625, "learning_rate": 0.0001, "loss": 0.0435, "step": 470 }, { "epoch": 0.7382224380767363, "grad_norm": 0.08984375, "learning_rate": 0.0001, "loss": 0.0567, "step": 475 }, { "epoch": 0.7459932005828072, "grad_norm": 0.08251953125, "learning_rate": 0.0001, "loss": 0.0623, "step": 480 }, { "epoch": 0.753763963088878, "grad_norm": 0.0869140625, "learning_rate": 0.0001, "loss": 0.0851, "step": 485 }, { "epoch": 0.7615347255949491, "grad_norm": 0.11328125, "learning_rate": 0.0001, "loss": 0.1367, "step": 490 }, { "epoch": 0.76930548810102, "grad_norm": 0.0908203125, "learning_rate": 0.0001, "loss": 0.0808, "step": 495 }, { "epoch": 0.7770762506070908, "grad_norm": 0.259765625, "learning_rate": 0.0001, "loss": 0.1243, "step": 500 }, { "epoch": 0.7848470131131617, "grad_norm": 0.048828125, "learning_rate": 0.0001, "loss": 0.1142, "step": 505 }, { "epoch": 0.7926177756192326, "grad_norm": 0.007781982421875, "learning_rate": 0.0001, "loss": 0.0499, "step": 510 }, { "epoch": 0.8003885381253035, "grad_norm": 0.0147705078125, "learning_rate": 0.0001, "loss": 0.0142, "step": 515 }, { "epoch": 0.8081593006313744, "grad_norm": 0.0517578125, "learning_rate": 0.0001, "loss": 0.0259, "step": 520 }, { "epoch": 0.8159300631374453, "grad_norm": 0.0810546875, "learning_rate": 0.0001, "loss": 0.0634, "step": 525 }, { "epoch": 0.8237008256435163, "grad_norm": 0.11083984375, "learning_rate": 0.0001, "loss": 0.1226, "step": 530 }, { "epoch": 0.8314715881495872, "grad_norm": 0.11572265625, "learning_rate": 0.0001, "loss": 0.1012, "step": 535 }, { "epoch": 0.8392423506556581, "grad_norm": 0.12060546875, "learning_rate": 0.0001, "loss": 0.103, "step": 540 }, { "epoch": 0.847013113161729, "grad_norm": 0.1513671875, "learning_rate": 0.0001, "loss": 0.1475, "step": 545 }, { "epoch": 0.8547838756677999, "grad_norm": 0.11181640625, "learning_rate": 0.0001, "loss": 0.1074, "step": 550 }, { "epoch": 0.8625546381738708, "grad_norm": 0.0712890625, "learning_rate": 0.0001, "loss": 0.1353, "step": 555 }, { "epoch": 0.8703254006799417, "grad_norm": 0.007781982421875, "learning_rate": 0.0001, "loss": 0.0262, "step": 560 }, { "epoch": 0.8780961631860126, "grad_norm": 0.06689453125, "learning_rate": 0.0001, "loss": 0.0208, "step": 565 }, { "epoch": 0.8858669256920836, "grad_norm": 0.026123046875, "learning_rate": 0.0001, "loss": 0.0307, "step": 570 }, { "epoch": 0.8936376881981545, "grad_norm": 0.07421875, "learning_rate": 0.0001, "loss": 0.075, "step": 575 }, { "epoch": 0.9014084507042254, "grad_norm": 0.09423828125, "learning_rate": 0.0001, "loss": 0.1302, "step": 580 }, { "epoch": 0.9091792132102963, "grad_norm": 0.1044921875, "learning_rate": 0.0001, "loss": 0.1165, "step": 585 }, { "epoch": 0.9169499757163672, "grad_norm": 0.11474609375, "learning_rate": 0.0001, "loss": 0.1413, "step": 590 }, { "epoch": 0.924720738222438, "grad_norm": 0.095703125, "learning_rate": 0.0001, "loss": 0.1307, "step": 595 }, { "epoch": 0.932491500728509, "grad_norm": 0.1689453125, "learning_rate": 0.0001, "loss": 0.1533, "step": 600 }, { "epoch": 0.9402622632345798, "grad_norm": 0.04443359375, "learning_rate": 0.0001, "loss": 0.1054, "step": 605 }, { "epoch": 0.9480330257406508, "grad_norm": 0.031982421875, "learning_rate": 0.0001, "loss": 0.0223, "step": 610 }, { "epoch": 0.9558037882467217, "grad_norm": 0.0400390625, "learning_rate": 0.0001, "loss": 0.0336, "step": 615 }, { "epoch": 0.9635745507527926, "grad_norm": 0.05322265625, "learning_rate": 0.0001, "loss": 0.0454, "step": 620 }, { "epoch": 0.9713453132588635, "grad_norm": 0.169921875, "learning_rate": 0.0001, "loss": 0.1449, "step": 625 }, { "epoch": 0.9791160757649344, "grad_norm": 0.08935546875, "learning_rate": 0.0001, "loss": 0.1174, "step": 630 }, { "epoch": 0.9868868382710053, "grad_norm": 0.11767578125, "learning_rate": 0.0001, "loss": 0.1063, "step": 635 }, { "epoch": 0.9946576007770762, "grad_norm": 0.10400390625, "learning_rate": 0.0001, "loss": 0.187, "step": 640 }, { "epoch": 1.0024283632831472, "grad_norm": 0.046630859375, "learning_rate": 0.0001, "loss": 0.1296, "step": 645 }, { "epoch": 1.010199125789218, "grad_norm": 0.0517578125, "learning_rate": 0.0001, "loss": 0.0741, "step": 650 }, { "epoch": 1.017969888295289, "grad_norm": 0.0093994140625, "learning_rate": 0.0001, "loss": 0.0179, "step": 655 }, { "epoch": 1.0257406508013598, "grad_norm": 0.027587890625, "learning_rate": 0.0001, "loss": 0.0117, "step": 660 }, { "epoch": 1.0335114133074308, "grad_norm": 0.04736328125, "learning_rate": 0.0001, "loss": 0.0372, "step": 665 }, { "epoch": 1.0412821758135018, "grad_norm": 0.07470703125, "learning_rate": 0.0001, "loss": 0.0489, "step": 670 }, { "epoch": 1.0490529383195726, "grad_norm": 0.09423828125, "learning_rate": 0.0001, "loss": 0.0753, "step": 675 }, { "epoch": 1.0568237008256436, "grad_norm": 0.11572265625, "learning_rate": 0.0001, "loss": 0.0678, "step": 680 }, { "epoch": 1.0645944633317144, "grad_norm": 0.18359375, "learning_rate": 0.0001, "loss": 0.0719, "step": 685 }, { "epoch": 1.0723652258377854, "grad_norm": 0.16796875, "learning_rate": 0.0001, "loss": 0.0758, "step": 690 }, { "epoch": 1.0801359883438562, "grad_norm": 0.06005859375, "learning_rate": 0.0001, "loss": 0.0931, "step": 695 }, { "epoch": 1.0879067508499272, "grad_norm": 0.0625, "learning_rate": 0.0001, "loss": 0.068, "step": 700 }, { "epoch": 1.095677513355998, "grad_norm": 0.051513671875, "learning_rate": 0.0001, "loss": 0.0374, "step": 705 }, { "epoch": 1.103448275862069, "grad_norm": 0.032470703125, "learning_rate": 0.0001, "loss": 0.0129, "step": 710 }, { "epoch": 1.11121903836814, "grad_norm": 0.0234375, "learning_rate": 0.0001, "loss": 0.0202, "step": 715 }, { "epoch": 1.1189898008742107, "grad_norm": 0.08203125, "learning_rate": 0.0001, "loss": 0.0579, "step": 720 }, { "epoch": 1.1267605633802817, "grad_norm": 0.11083984375, "learning_rate": 0.0001, "loss": 0.1094, "step": 725 }, { "epoch": 1.1345313258863525, "grad_norm": 0.1298828125, "learning_rate": 0.0001, "loss": 0.076, "step": 730 }, { "epoch": 1.1423020883924235, "grad_norm": 0.138671875, "learning_rate": 0.0001, "loss": 0.0648, "step": 735 }, { "epoch": 1.1500728508984945, "grad_norm": 0.146484375, "learning_rate": 0.0001, "loss": 0.0702, "step": 740 }, { "epoch": 1.1578436134045653, "grad_norm": 0.0849609375, "learning_rate": 0.0001, "loss": 0.0881, "step": 745 }, { "epoch": 1.1656143759106363, "grad_norm": 0.06103515625, "learning_rate": 0.0001, "loss": 0.0715, "step": 750 }, { "epoch": 1.173385138416707, "grad_norm": 0.0225830078125, "learning_rate": 0.0001, "loss": 0.0241, "step": 755 }, { "epoch": 1.1811559009227781, "grad_norm": 0.0546875, "learning_rate": 0.0001, "loss": 0.0222, "step": 760 }, { "epoch": 1.188926663428849, "grad_norm": 0.061279296875, "learning_rate": 0.0001, "loss": 0.0319, "step": 765 }, { "epoch": 1.19669742593492, "grad_norm": 0.08203125, "learning_rate": 0.0001, "loss": 0.0672, "step": 770 }, { "epoch": 1.2044681884409907, "grad_norm": 0.061767578125, "learning_rate": 0.0001, "loss": 0.0693, "step": 775 }, { "epoch": 1.2122389509470617, "grad_norm": 0.1171875, "learning_rate": 0.0001, "loss": 0.0811, "step": 780 }, { "epoch": 1.2200097134531327, "grad_norm": 0.0966796875, "learning_rate": 0.0001, "loss": 0.0665, "step": 785 }, { "epoch": 1.2277804759592035, "grad_norm": 0.08056640625, "learning_rate": 0.0001, "loss": 0.0703, "step": 790 }, { "epoch": 1.2355512384652745, "grad_norm": 0.052734375, "learning_rate": 0.0001, "loss": 0.1042, "step": 795 }, { "epoch": 1.2433220009713453, "grad_norm": 0.0257568359375, "learning_rate": 0.0001, "loss": 0.0667, "step": 800 }, { "epoch": 1.2510927634774163, "grad_norm": 0.03564453125, "learning_rate": 0.0001, "loss": 0.0133, "step": 805 }, { "epoch": 1.258863525983487, "grad_norm": 0.03369140625, "learning_rate": 0.0001, "loss": 0.0167, "step": 810 }, { "epoch": 1.266634288489558, "grad_norm": 0.050537109375, "learning_rate": 0.0001, "loss": 0.041, "step": 815 }, { "epoch": 1.2744050509956288, "grad_norm": 0.130859375, "learning_rate": 0.0001, "loss": 0.1126, "step": 820 }, { "epoch": 1.2821758135016998, "grad_norm": 0.08984375, "learning_rate": 0.0001, "loss": 0.064, "step": 825 }, { "epoch": 1.2899465760077709, "grad_norm": 0.09716796875, "learning_rate": 0.0001, "loss": 0.0683, "step": 830 }, { "epoch": 1.2977173385138416, "grad_norm": 0.1259765625, "learning_rate": 0.0001, "loss": 0.0552, "step": 835 }, { "epoch": 1.3054881010199126, "grad_norm": 0.1259765625, "learning_rate": 0.0001, "loss": 0.0839, "step": 840 }, { "epoch": 1.3132588635259834, "grad_norm": 0.07275390625, "learning_rate": 0.0001, "loss": 0.1073, "step": 845 }, { "epoch": 1.3210296260320544, "grad_norm": 0.08740234375, "learning_rate": 0.0001, "loss": 0.0792, "step": 850 }, { "epoch": 1.3288003885381254, "grad_norm": 0.04638671875, "learning_rate": 0.0001, "loss": 0.0354, "step": 855 }, { "epoch": 1.3365711510441962, "grad_norm": 0.05078125, "learning_rate": 0.0001, "loss": 0.0145, "step": 860 }, { "epoch": 1.344341913550267, "grad_norm": 0.068359375, "learning_rate": 0.0001, "loss": 0.0348, "step": 865 }, { "epoch": 1.352112676056338, "grad_norm": 0.08935546875, "learning_rate": 0.0001, "loss": 0.0524, "step": 870 }, { "epoch": 1.359883438562409, "grad_norm": 0.0634765625, "learning_rate": 0.0001, "loss": 0.0427, "step": 875 }, { "epoch": 1.3676542010684798, "grad_norm": 0.06591796875, "learning_rate": 0.0001, "loss": 0.0425, "step": 880 }, { "epoch": 1.3754249635745508, "grad_norm": 0.166015625, "learning_rate": 0.0001, "loss": 0.061, "step": 885 }, { "epoch": 1.3831957260806216, "grad_norm": 0.17578125, "learning_rate": 0.0001, "loss": 0.0512, "step": 890 }, { "epoch": 1.3909664885866926, "grad_norm": 0.059814453125, "learning_rate": 0.0001, "loss": 0.1101, "step": 895 }, { "epoch": 1.3987372510927636, "grad_norm": 0.03759765625, "learning_rate": 0.0001, "loss": 0.0625, "step": 900 }, { "epoch": 1.4065080135988344, "grad_norm": 0.053955078125, "learning_rate": 0.0001, "loss": 0.0264, "step": 905 }, { "epoch": 1.4142787761049052, "grad_norm": 0.0615234375, "learning_rate": 0.0001, "loss": 0.0199, "step": 910 }, { "epoch": 1.4220495386109762, "grad_norm": 0.033203125, "learning_rate": 0.0001, "loss": 0.0332, "step": 915 }, { "epoch": 1.4298203011170472, "grad_norm": 0.08642578125, "learning_rate": 0.0001, "loss": 0.0886, "step": 920 }, { "epoch": 1.437591063623118, "grad_norm": 0.0859375, "learning_rate": 0.0001, "loss": 0.0829, "step": 925 }, { "epoch": 1.445361826129189, "grad_norm": 0.09765625, "learning_rate": 0.0001, "loss": 0.0626, "step": 930 }, { "epoch": 1.4531325886352597, "grad_norm": 0.11181640625, "learning_rate": 0.0001, "loss": 0.0602, "step": 935 }, { "epoch": 1.4609033511413307, "grad_norm": 0.16015625, "learning_rate": 0.0001, "loss": 0.0425, "step": 940 }, { "epoch": 1.4686741136474017, "grad_norm": 0.056640625, "learning_rate": 0.0001, "loss": 0.084, "step": 945 }, { "epoch": 1.4764448761534725, "grad_norm": 0.052001953125, "learning_rate": 0.0001, "loss": 0.0844, "step": 950 }, { "epoch": 1.4842156386595435, "grad_norm": 0.052978515625, "learning_rate": 0.0001, "loss": 0.0313, "step": 955 }, { "epoch": 1.4919864011656143, "grad_norm": 0.0380859375, "learning_rate": 0.0001, "loss": 0.0201, "step": 960 }, { "epoch": 1.4997571636716853, "grad_norm": 0.053955078125, "learning_rate": 0.0001, "loss": 0.0285, "step": 965 }, { "epoch": 1.5075279261777563, "grad_norm": 0.061279296875, "learning_rate": 0.0001, "loss": 0.0593, "step": 970 }, { "epoch": 1.515298688683827, "grad_norm": 0.0380859375, "learning_rate": 0.0001, "loss": 0.0689, "step": 975 }, { "epoch": 1.523069451189898, "grad_norm": 0.080078125, "learning_rate": 0.0001, "loss": 0.0739, "step": 980 }, { "epoch": 1.530840213695969, "grad_norm": 0.1513671875, "learning_rate": 0.0001, "loss": 0.0539, "step": 985 }, { "epoch": 1.53861097620204, "grad_norm": 0.1240234375, "learning_rate": 0.0001, "loss": 0.0781, "step": 990 }, { "epoch": 1.5463817387081107, "grad_norm": 0.064453125, "learning_rate": 0.0001, "loss": 0.0748, "step": 995 }, { "epoch": 1.5541525012141817, "grad_norm": 0.06591796875, "learning_rate": 0.0001, "loss": 0.0867, "step": 1000 }, { "epoch": 1.5619232637202525, "grad_norm": 0.01556396484375, "learning_rate": 0.0001, "loss": 0.0261, "step": 1005 }, { "epoch": 1.5696940262263235, "grad_norm": 0.0322265625, "learning_rate": 0.0001, "loss": 0.0182, "step": 1010 }, { "epoch": 1.5774647887323945, "grad_norm": 0.055908203125, "learning_rate": 0.0001, "loss": 0.0339, "step": 1015 }, { "epoch": 1.5852355512384653, "grad_norm": 0.10107421875, "learning_rate": 0.0001, "loss": 0.0663, "step": 1020 }, { "epoch": 1.593006313744536, "grad_norm": 0.076171875, "learning_rate": 0.0001, "loss": 0.0737, "step": 1025 }, { "epoch": 1.600777076250607, "grad_norm": 0.12890625, "learning_rate": 0.0001, "loss": 0.0849, "step": 1030 }, { "epoch": 1.608547838756678, "grad_norm": 0.150390625, "learning_rate": 0.0001, "loss": 0.0393, "step": 1035 }, { "epoch": 1.616318601262749, "grad_norm": 0.1552734375, "learning_rate": 0.0001, "loss": 0.085, "step": 1040 }, { "epoch": 1.6240893637688198, "grad_norm": 0.044921875, "learning_rate": 0.0001, "loss": 0.0806, "step": 1045 }, { "epoch": 1.6318601262748906, "grad_norm": 0.05859375, "learning_rate": 0.0001, "loss": 0.0804, "step": 1050 }, { "epoch": 1.6396308887809616, "grad_norm": 0.0181884765625, "learning_rate": 0.0001, "loss": 0.0238, "step": 1055 }, { "epoch": 1.6474016512870326, "grad_norm": 0.060546875, "learning_rate": 0.0001, "loss": 0.0176, "step": 1060 }, { "epoch": 1.6551724137931034, "grad_norm": 0.049072265625, "learning_rate": 0.0001, "loss": 0.0223, "step": 1065 }, { "epoch": 1.6629431762991742, "grad_norm": 0.09619140625, "learning_rate": 0.0001, "loss": 0.0682, "step": 1070 }, { "epoch": 1.6707139388052452, "grad_norm": 0.08642578125, "learning_rate": 0.0001, "loss": 0.0745, "step": 1075 }, { "epoch": 1.6784847013113162, "grad_norm": 0.080078125, "learning_rate": 0.0001, "loss": 0.0643, "step": 1080 }, { "epoch": 1.6862554638173872, "grad_norm": 0.14453125, "learning_rate": 0.0001, "loss": 0.0538, "step": 1085 }, { "epoch": 1.694026226323458, "grad_norm": 0.091796875, "learning_rate": 0.0001, "loss": 0.0653, "step": 1090 }, { "epoch": 1.7017969888295288, "grad_norm": 0.06005859375, "learning_rate": 0.0001, "loss": 0.1079, "step": 1095 }, { "epoch": 1.7095677513355998, "grad_norm": 0.053955078125, "learning_rate": 0.0001, "loss": 0.0545, "step": 1100 }, { "epoch": 1.7173385138416708, "grad_norm": 0.025390625, "learning_rate": 0.0001, "loss": 0.0126, "step": 1105 }, { "epoch": 1.7251092763477416, "grad_norm": 0.0419921875, "learning_rate": 0.0001, "loss": 0.0101, "step": 1110 }, { "epoch": 1.7328800388538124, "grad_norm": 0.0908203125, "learning_rate": 0.0001, "loss": 0.0302, "step": 1115 }, { "epoch": 1.7406508013598834, "grad_norm": 0.0849609375, "learning_rate": 0.0001, "loss": 0.0467, "step": 1120 }, { "epoch": 1.7484215638659544, "grad_norm": 0.1435546875, "learning_rate": 0.0001, "loss": 0.0702, "step": 1125 }, { "epoch": 1.7561923263720254, "grad_norm": 0.11865234375, "learning_rate": 0.0001, "loss": 0.0664, "step": 1130 }, { "epoch": 1.7639630888780962, "grad_norm": 0.140625, "learning_rate": 0.0001, "loss": 0.0746, "step": 1135 }, { "epoch": 1.771733851384167, "grad_norm": 0.1357421875, "learning_rate": 0.0001, "loss": 0.0458, "step": 1140 }, { "epoch": 1.779504613890238, "grad_norm": 0.0576171875, "learning_rate": 0.0001, "loss": 0.1097, "step": 1145 }, { "epoch": 1.787275376396309, "grad_norm": 0.038818359375, "learning_rate": 0.0001, "loss": 0.0759, "step": 1150 }, { "epoch": 1.79504613890238, "grad_norm": 0.056396484375, "learning_rate": 0.0001, "loss": 0.0274, "step": 1155 }, { "epoch": 1.8028169014084507, "grad_norm": 0.05078125, "learning_rate": 0.0001, "loss": 0.0146, "step": 1160 }, { "epoch": 1.8105876639145215, "grad_norm": 0.048583984375, "learning_rate": 0.0001, "loss": 0.0251, "step": 1165 }, { "epoch": 1.8183584264205925, "grad_norm": 0.09423828125, "learning_rate": 0.0001, "loss": 0.0727, "step": 1170 }, { "epoch": 1.8261291889266635, "grad_norm": 0.109375, "learning_rate": 0.0001, "loss": 0.0685, "step": 1175 }, { "epoch": 1.8338999514327343, "grad_norm": 0.138671875, "learning_rate": 0.0001, "loss": 0.0829, "step": 1180 }, { "epoch": 1.841670713938805, "grad_norm": 0.07275390625, "learning_rate": 0.0001, "loss": 0.0445, "step": 1185 }, { "epoch": 1.849441476444876, "grad_norm": 0.2041015625, "learning_rate": 0.0001, "loss": 0.0699, "step": 1190 }, { "epoch": 1.8572122389509471, "grad_norm": 0.07373046875, "learning_rate": 0.0001, "loss": 0.0995, "step": 1195 }, { "epoch": 1.8649830014570181, "grad_norm": 0.0546875, "learning_rate": 0.0001, "loss": 0.0798, "step": 1200 }, { "epoch": 1.872753763963089, "grad_norm": 0.045654296875, "learning_rate": 0.0001, "loss": 0.019, "step": 1205 }, { "epoch": 1.8805245264691597, "grad_norm": 0.0673828125, "learning_rate": 0.0001, "loss": 0.0179, "step": 1210 }, { "epoch": 1.8882952889752307, "grad_norm": 0.06640625, "learning_rate": 0.0001, "loss": 0.027, "step": 1215 }, { "epoch": 1.8960660514813017, "grad_norm": 0.0830078125, "learning_rate": 0.0001, "loss": 0.056, "step": 1220 }, { "epoch": 1.9038368139873725, "grad_norm": 0.1279296875, "learning_rate": 0.0001, "loss": 0.0699, "step": 1225 }, { "epoch": 1.9116075764934433, "grad_norm": 0.09912109375, "learning_rate": 0.0001, "loss": 0.063, "step": 1230 }, { "epoch": 1.9193783389995143, "grad_norm": 0.1328125, "learning_rate": 0.0001, "loss": 0.0633, "step": 1235 }, { "epoch": 1.9271491015055853, "grad_norm": 0.169921875, "learning_rate": 0.0001, "loss": 0.0662, "step": 1240 }, { "epoch": 1.9349198640116563, "grad_norm": 0.057373046875, "learning_rate": 0.0001, "loss": 0.0728, "step": 1245 }, { "epoch": 1.942690626517727, "grad_norm": 0.056884765625, "learning_rate": 0.0001, "loss": 0.0569, "step": 1250 }, { "epoch": 1.9504613890237978, "grad_norm": 0.052490234375, "learning_rate": 0.0001, "loss": 0.0309, "step": 1255 }, { "epoch": 1.9582321515298688, "grad_norm": 0.0625, "learning_rate": 0.0001, "loss": 0.0352, "step": 1260 }, { "epoch": 1.9660029140359399, "grad_norm": 0.06591796875, "learning_rate": 0.0001, "loss": 0.0289, "step": 1265 }, { "epoch": 1.9737736765420106, "grad_norm": 0.09033203125, "learning_rate": 0.0001, "loss": 0.0689, "step": 1270 }, { "epoch": 1.9815444390480816, "grad_norm": 0.1015625, "learning_rate": 0.0001, "loss": 0.0669, "step": 1275 }, { "epoch": 1.9893152015541524, "grad_norm": 0.09814453125, "learning_rate": 0.0001, "loss": 0.056, "step": 1280 }, { "epoch": 1.9970859640602234, "grad_norm": 0.111328125, "learning_rate": 0.0001, "loss": 0.0683, "step": 1285 }, { "epoch": 2.0048567265662944, "grad_norm": 0.055908203125, "learning_rate": 0.0001, "loss": 0.0879, "step": 1290 }, { "epoch": 2.0126274890723654, "grad_norm": 0.037109375, "learning_rate": 0.0001, "loss": 0.026, "step": 1295 }, { "epoch": 2.020398251578436, "grad_norm": 0.0252685546875, "learning_rate": 0.0001, "loss": 0.0134, "step": 1300 }, { "epoch": 2.028169014084507, "grad_norm": 0.06640625, "learning_rate": 0.0001, "loss": 0.0336, "step": 1305 }, { "epoch": 2.035939776590578, "grad_norm": 0.06494140625, "learning_rate": 0.0001, "loss": 0.0374, "step": 1310 }, { "epoch": 2.043710539096649, "grad_norm": 0.091796875, "learning_rate": 0.0001, "loss": 0.0574, "step": 1315 }, { "epoch": 2.0514813016027196, "grad_norm": 0.1181640625, "learning_rate": 0.0001, "loss": 0.0282, "step": 1320 }, { "epoch": 2.0592520641087906, "grad_norm": 0.0947265625, "learning_rate": 0.0001, "loss": 0.0419, "step": 1325 }, { "epoch": 2.0670228266148616, "grad_norm": 0.154296875, "learning_rate": 0.0001, "loss": 0.0283, "step": 1330 }, { "epoch": 2.0747935891209326, "grad_norm": 0.054931640625, "learning_rate": 0.0001, "loss": 0.0379, "step": 1335 }, { "epoch": 2.0825643516270036, "grad_norm": 0.059814453125, "learning_rate": 0.0001, "loss": 0.0721, "step": 1340 }, { "epoch": 2.090335114133074, "grad_norm": 0.03466796875, "learning_rate": 0.0001, "loss": 0.0264, "step": 1345 }, { "epoch": 2.098105876639145, "grad_norm": 0.0299072265625, "learning_rate": 0.0001, "loss": 0.0079, "step": 1350 }, { "epoch": 2.105876639145216, "grad_norm": 0.06494140625, "learning_rate": 0.0001, "loss": 0.0087, "step": 1355 }, { "epoch": 2.113647401651287, "grad_norm": 0.06201171875, "learning_rate": 0.0001, "loss": 0.0131, "step": 1360 }, { "epoch": 2.1214181641573577, "grad_norm": 0.1005859375, "learning_rate": 0.0001, "loss": 0.04, "step": 1365 }, { "epoch": 2.1291889266634287, "grad_norm": 0.06298828125, "learning_rate": 0.0001, "loss": 0.035, "step": 1370 }, { "epoch": 2.1369596891694997, "grad_norm": 0.08544921875, "learning_rate": 0.0001, "loss": 0.0477, "step": 1375 }, { "epoch": 2.1447304516755707, "grad_norm": 0.09521484375, "learning_rate": 0.0001, "loss": 0.023, "step": 1380 }, { "epoch": 2.1525012141816418, "grad_norm": 0.0810546875, "learning_rate": 0.0001, "loss": 0.0341, "step": 1385 }, { "epoch": 2.1602719766877123, "grad_norm": 0.087890625, "learning_rate": 0.0001, "loss": 0.0797, "step": 1390 }, { "epoch": 2.1680427391937833, "grad_norm": 0.0125732421875, "learning_rate": 0.0001, "loss": 0.0308, "step": 1395 }, { "epoch": 2.1758135016998543, "grad_norm": 0.0439453125, "learning_rate": 0.0001, "loss": 0.0105, "step": 1400 }, { "epoch": 2.1835842642059253, "grad_norm": 0.04541015625, "learning_rate": 0.0001, "loss": 0.0275, "step": 1405 }, { "epoch": 2.191355026711996, "grad_norm": 0.0311279296875, "learning_rate": 0.0001, "loss": 0.0155, "step": 1410 }, { "epoch": 2.199125789218067, "grad_norm": 0.10205078125, "learning_rate": 0.0001, "loss": 0.0386, "step": 1415 }, { "epoch": 2.206896551724138, "grad_norm": 0.2080078125, "learning_rate": 0.0001, "loss": 0.05, "step": 1420 }, { "epoch": 2.214667314230209, "grad_norm": 0.0712890625, "learning_rate": 0.0001, "loss": 0.0236, "step": 1425 }, { "epoch": 2.22243807673628, "grad_norm": 0.1181640625, "learning_rate": 0.0001, "loss": 0.0198, "step": 1430 }, { "epoch": 2.2302088392423505, "grad_norm": 0.1337890625, "learning_rate": 0.0001, "loss": 0.0274, "step": 1435 }, { "epoch": 2.2379796017484215, "grad_norm": 0.130859375, "learning_rate": 0.0001, "loss": 0.0659, "step": 1440 }, { "epoch": 2.2457503642544925, "grad_norm": 0.041015625, "learning_rate": 0.0001, "loss": 0.0416, "step": 1445 }, { "epoch": 2.2535211267605635, "grad_norm": 0.02685546875, "learning_rate": 0.0001, "loss": 0.0123, "step": 1450 }, { "epoch": 2.2612918892666345, "grad_norm": 0.0245361328125, "learning_rate": 0.0001, "loss": 0.0078, "step": 1455 }, { "epoch": 2.269062651772705, "grad_norm": 0.04833984375, "learning_rate": 0.0001, "loss": 0.0186, "step": 1460 }, { "epoch": 2.276833414278776, "grad_norm": 0.07275390625, "learning_rate": 0.0001, "loss": 0.0529, "step": 1465 }, { "epoch": 2.284604176784847, "grad_norm": 0.0888671875, "learning_rate": 0.0001, "loss": 0.0243, "step": 1470 }, { "epoch": 2.292374939290918, "grad_norm": 0.07421875, "learning_rate": 0.0001, "loss": 0.0296, "step": 1475 }, { "epoch": 2.300145701796989, "grad_norm": 0.21875, "learning_rate": 0.0001, "loss": 0.0334, "step": 1480 }, { "epoch": 2.3079164643030596, "grad_norm": 0.109375, "learning_rate": 0.0001, "loss": 0.0306, "step": 1485 }, { "epoch": 2.3156872268091306, "grad_norm": 0.0810546875, "learning_rate": 0.0001, "loss": 0.0796, "step": 1490 }, { "epoch": 2.3234579893152016, "grad_norm": 0.05078125, "learning_rate": 0.0001, "loss": 0.023, "step": 1495 }, { "epoch": 2.3312287518212726, "grad_norm": 0.038818359375, "learning_rate": 0.0001, "loss": 0.0128, "step": 1500 }, { "epoch": 2.338999514327343, "grad_norm": 0.05712890625, "learning_rate": 0.0001, "loss": 0.0166, "step": 1505 }, { "epoch": 2.346770276833414, "grad_norm": 0.06103515625, "learning_rate": 0.0001, "loss": 0.0222, "step": 1510 }, { "epoch": 2.354541039339485, "grad_norm": 0.09521484375, "learning_rate": 0.0001, "loss": 0.0375, "step": 1515 }, { "epoch": 2.3623118018455562, "grad_norm": 0.109375, "learning_rate": 0.0001, "loss": 0.0284, "step": 1520 }, { "epoch": 2.370082564351627, "grad_norm": 0.0908203125, "learning_rate": 0.0001, "loss": 0.0257, "step": 1525 }, { "epoch": 2.377853326857698, "grad_norm": 0.13671875, "learning_rate": 0.0001, "loss": 0.0423, "step": 1530 }, { "epoch": 2.385624089363769, "grad_norm": 0.08642578125, "learning_rate": 0.0001, "loss": 0.0283, "step": 1535 }, { "epoch": 2.39339485186984, "grad_norm": 0.053466796875, "learning_rate": 0.0001, "loss": 0.0543, "step": 1540 }, { "epoch": 2.401165614375911, "grad_norm": 0.045166015625, "learning_rate": 0.0001, "loss": 0.0405, "step": 1545 }, { "epoch": 2.4089363768819814, "grad_norm": 0.00860595703125, "learning_rate": 0.0001, "loss": 0.0088, "step": 1550 }, { "epoch": 2.4167071393880524, "grad_norm": 0.0206298828125, "learning_rate": 0.0001, "loss": 0.0092, "step": 1555 }, { "epoch": 2.4244779018941234, "grad_norm": 0.03564453125, "learning_rate": 0.0001, "loss": 0.0233, "step": 1560 }, { "epoch": 2.4322486644001944, "grad_norm": 0.1103515625, "learning_rate": 0.0001, "loss": 0.0386, "step": 1565 }, { "epoch": 2.4400194269062654, "grad_norm": 0.0830078125, "learning_rate": 0.0001, "loss": 0.0318, "step": 1570 }, { "epoch": 2.447790189412336, "grad_norm": 0.1328125, "learning_rate": 0.0001, "loss": 0.0313, "step": 1575 }, { "epoch": 2.455560951918407, "grad_norm": 0.12060546875, "learning_rate": 0.0001, "loss": 0.0258, "step": 1580 }, { "epoch": 2.463331714424478, "grad_norm": 0.12060546875, "learning_rate": 0.0001, "loss": 0.039, "step": 1585 }, { "epoch": 2.471102476930549, "grad_norm": 0.072265625, "learning_rate": 0.0001, "loss": 0.0593, "step": 1590 }, { "epoch": 2.4788732394366195, "grad_norm": 0.05126953125, "learning_rate": 0.0001, "loss": 0.0398, "step": 1595 }, { "epoch": 2.4866440019426905, "grad_norm": 0.034912109375, "learning_rate": 0.0001, "loss": 0.0067, "step": 1600 }, { "epoch": 2.4944147644487615, "grad_norm": 0.0284423828125, "learning_rate": 0.0001, "loss": 0.0132, "step": 1605 }, { "epoch": 2.5021855269548325, "grad_norm": 0.064453125, "learning_rate": 0.0001, "loss": 0.0213, "step": 1610 }, { "epoch": 2.509956289460903, "grad_norm": 0.0732421875, "learning_rate": 0.0001, "loss": 0.0299, "step": 1615 }, { "epoch": 2.517727051966974, "grad_norm": 0.10888671875, "learning_rate": 0.0001, "loss": 0.036, "step": 1620 }, { "epoch": 2.525497814473045, "grad_norm": 0.1435546875, "learning_rate": 0.0001, "loss": 0.0313, "step": 1625 }, { "epoch": 2.533268576979116, "grad_norm": 0.09375, "learning_rate": 0.0001, "loss": 0.03, "step": 1630 }, { "epoch": 2.541039339485187, "grad_norm": 0.1201171875, "learning_rate": 0.0001, "loss": 0.0308, "step": 1635 }, { "epoch": 2.5488101019912577, "grad_norm": 0.109375, "learning_rate": 0.0001, "loss": 0.077, "step": 1640 }, { "epoch": 2.5565808644973287, "grad_norm": 0.03466796875, "learning_rate": 0.0001, "loss": 0.0328, "step": 1645 }, { "epoch": 2.5643516270033997, "grad_norm": 0.03955078125, "learning_rate": 0.0001, "loss": 0.0118, "step": 1650 }, { "epoch": 2.5721223895094707, "grad_norm": 0.0712890625, "learning_rate": 0.0001, "loss": 0.0127, "step": 1655 }, { "epoch": 2.5798931520155417, "grad_norm": 0.06201171875, "learning_rate": 0.0001, "loss": 0.0154, "step": 1660 }, { "epoch": 2.5876639145216123, "grad_norm": 0.06982421875, "learning_rate": 0.0001, "loss": 0.0467, "step": 1665 }, { "epoch": 2.5954346770276833, "grad_norm": 0.11572265625, "learning_rate": 0.0001, "loss": 0.0477, "step": 1670 }, { "epoch": 2.6032054395337543, "grad_norm": 0.1142578125, "learning_rate": 0.0001, "loss": 0.0265, "step": 1675 }, { "epoch": 2.6109762020398253, "grad_norm": 0.08984375, "learning_rate": 0.0001, "loss": 0.0214, "step": 1680 }, { "epoch": 2.6187469645458963, "grad_norm": 0.1328125, "learning_rate": 0.0001, "loss": 0.0364, "step": 1685 }, { "epoch": 2.626517727051967, "grad_norm": 0.062255859375, "learning_rate": 0.0001, "loss": 0.0746, "step": 1690 }, { "epoch": 2.634288489558038, "grad_norm": 0.036376953125, "learning_rate": 0.0001, "loss": 0.026, "step": 1695 }, { "epoch": 2.642059252064109, "grad_norm": 0.00982666015625, "learning_rate": 0.0001, "loss": 0.0106, "step": 1700 }, { "epoch": 2.64983001457018, "grad_norm": 0.072265625, "learning_rate": 0.0001, "loss": 0.0181, "step": 1705 }, { "epoch": 2.657600777076251, "grad_norm": 0.062255859375, "learning_rate": 0.0001, "loss": 0.0317, "step": 1710 }, { "epoch": 2.6653715395823214, "grad_norm": 0.0859375, "learning_rate": 0.0001, "loss": 0.0366, "step": 1715 }, { "epoch": 2.6731423020883924, "grad_norm": 0.050048828125, "learning_rate": 0.0001, "loss": 0.0273, "step": 1720 }, { "epoch": 2.6809130645944634, "grad_norm": 0.15234375, "learning_rate": 0.0001, "loss": 0.0385, "step": 1725 }, { "epoch": 2.688683827100534, "grad_norm": 0.1416015625, "learning_rate": 0.0001, "loss": 0.0333, "step": 1730 }, { "epoch": 2.696454589606605, "grad_norm": 0.1552734375, "learning_rate": 0.0001, "loss": 0.045, "step": 1735 }, { "epoch": 2.704225352112676, "grad_norm": 0.0673828125, "learning_rate": 0.0001, "loss": 0.0696, "step": 1740 }, { "epoch": 2.711996114618747, "grad_norm": 0.033203125, "learning_rate": 0.0001, "loss": 0.0295, "step": 1745 }, { "epoch": 2.719766877124818, "grad_norm": 0.04150390625, "learning_rate": 0.0001, "loss": 0.01, "step": 1750 }, { "epoch": 2.7275376396308886, "grad_norm": 0.04150390625, "learning_rate": 0.0001, "loss": 0.0129, "step": 1755 }, { "epoch": 2.7353084021369596, "grad_norm": 0.0693359375, "learning_rate": 0.0001, "loss": 0.0209, "step": 1760 }, { "epoch": 2.7430791646430306, "grad_norm": 0.1298828125, "learning_rate": 0.0001, "loss": 0.0456, "step": 1765 }, { "epoch": 2.7508499271491016, "grad_norm": 0.138671875, "learning_rate": 0.0001, "loss": 0.0398, "step": 1770 }, { "epoch": 2.7586206896551726, "grad_norm": 0.10205078125, "learning_rate": 0.0001, "loss": 0.0402, "step": 1775 }, { "epoch": 2.766391452161243, "grad_norm": 0.06396484375, "learning_rate": 0.0001, "loss": 0.0447, "step": 1780 }, { "epoch": 2.774162214667314, "grad_norm": 0.1689453125, "learning_rate": 0.0001, "loss": 0.0308, "step": 1785 }, { "epoch": 2.781932977173385, "grad_norm": 0.0712890625, "learning_rate": 0.0001, "loss": 0.0725, "step": 1790 }, { "epoch": 2.789703739679456, "grad_norm": 0.047607421875, "learning_rate": 0.0001, "loss": 0.0362, "step": 1795 }, { "epoch": 2.797474502185527, "grad_norm": 0.034912109375, "learning_rate": 0.0001, "loss": 0.0246, "step": 1800 }, { "epoch": 2.8052452646915977, "grad_norm": 0.039794921875, "learning_rate": 0.0001, "loss": 0.0129, "step": 1805 }, { "epoch": 2.8130160271976687, "grad_norm": 0.064453125, "learning_rate": 0.0001, "loss": 0.0158, "step": 1810 }, { "epoch": 2.8207867897037397, "grad_norm": 0.099609375, "learning_rate": 0.0001, "loss": 0.0383, "step": 1815 }, { "epoch": 2.8285575522098103, "grad_norm": 0.125, "learning_rate": 0.0001, "loss": 0.0278, "step": 1820 }, { "epoch": 2.8363283147158818, "grad_norm": 0.060791015625, "learning_rate": 0.0001, "loss": 0.0296, "step": 1825 }, { "epoch": 2.8440990772219523, "grad_norm": 0.11767578125, "learning_rate": 0.0001, "loss": 0.0381, "step": 1830 }, { "epoch": 2.8518698397280233, "grad_norm": 0.0888671875, "learning_rate": 0.0001, "loss": 0.023, "step": 1835 }, { "epoch": 2.8596406022340943, "grad_norm": 0.06396484375, "learning_rate": 0.0001, "loss": 0.0857, "step": 1840 }, { "epoch": 2.867411364740165, "grad_norm": 0.027099609375, "learning_rate": 0.0001, "loss": 0.0419, "step": 1845 }, { "epoch": 2.875182127246236, "grad_norm": 0.0546875, "learning_rate": 0.0001, "loss": 0.0366, "step": 1850 }, { "epoch": 2.882952889752307, "grad_norm": 0.03955078125, "learning_rate": 0.0001, "loss": 0.0095, "step": 1855 }, { "epoch": 2.890723652258378, "grad_norm": 0.02978515625, "learning_rate": 0.0001, "loss": 0.027, "step": 1860 }, { "epoch": 2.898494414764449, "grad_norm": 0.109375, "learning_rate": 0.0001, "loss": 0.0469, "step": 1865 }, { "epoch": 2.9062651772705195, "grad_norm": 0.08935546875, "learning_rate": 0.0001, "loss": 0.0318, "step": 1870 }, { "epoch": 2.9140359397765905, "grad_norm": 0.048095703125, "learning_rate": 0.0001, "loss": 0.0298, "step": 1875 }, { "epoch": 2.9218067022826615, "grad_norm": 0.1220703125, "learning_rate": 0.0001, "loss": 0.0286, "step": 1880 }, { "epoch": 2.9295774647887325, "grad_norm": 0.232421875, "learning_rate": 0.0001, "loss": 0.0286, "step": 1885 }, { "epoch": 2.9373482272948035, "grad_norm": 0.04443359375, "learning_rate": 0.0001, "loss": 0.0868, "step": 1890 }, { "epoch": 2.945118989800874, "grad_norm": 0.0240478515625, "learning_rate": 0.0001, "loss": 0.0325, "step": 1895 }, { "epoch": 2.952889752306945, "grad_norm": 0.01556396484375, "learning_rate": 0.0001, "loss": 0.0273, "step": 1900 }, { "epoch": 2.960660514813016, "grad_norm": 0.0517578125, "learning_rate": 0.0001, "loss": 0.017, "step": 1905 }, { "epoch": 2.968431277319087, "grad_norm": 0.056884765625, "learning_rate": 0.0001, "loss": 0.0567, "step": 1910 }, { "epoch": 2.976202039825158, "grad_norm": 0.1298828125, "learning_rate": 0.0001, "loss": 0.0429, "step": 1915 }, { "epoch": 2.9839728023312286, "grad_norm": 0.07373046875, "learning_rate": 0.0001, "loss": 0.0317, "step": 1920 }, { "epoch": 2.9917435648372996, "grad_norm": 0.05712890625, "learning_rate": 0.0001, "loss": 0.0283, "step": 1925 }, { "epoch": 2.9995143273433706, "grad_norm": 0.15234375, "learning_rate": 0.0001, "loss": 0.0246, "step": 1930 }, { "epoch": 3.0072850898494417, "grad_norm": 0.062255859375, "learning_rate": 0.0001, "loss": 0.0468, "step": 1935 }, { "epoch": 3.015055852355512, "grad_norm": 0.0272216796875, "learning_rate": 0.0001, "loss": 0.0136, "step": 1940 }, { "epoch": 3.022826614861583, "grad_norm": 0.053955078125, "learning_rate": 0.0001, "loss": 0.0105, "step": 1945 }, { "epoch": 3.030597377367654, "grad_norm": 0.10009765625, "learning_rate": 0.0001, "loss": 0.0102, "step": 1950 } ], "logging_steps": 5, "max_steps": 1950, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 90, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.358681688807424e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }