{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9993075751280986,
  "eval_steps": 50.0,
  "global_step": 2706,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.007385865300281586,
      "grad_norm": 56.16927719116211,
      "learning_rate": 6.666666666666667e-07,
      "loss": 2.2986,
      "step": 10
    },
    {
      "epoch": 0.014771730600563172,
      "grad_norm": 52.235191345214844,
      "learning_rate": 1.3333333333333334e-06,
      "loss": 2.1946,
      "step": 20
    },
    {
      "epoch": 0.02215759590084476,
      "grad_norm": 14.341564178466797,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 1.9498,
      "step": 30
    },
    {
      "epoch": 0.029543461201126345,
      "grad_norm": 9.743363380432129,
      "learning_rate": 2.666666666666667e-06,
      "loss": 1.7599,
      "step": 40
    },
    {
      "epoch": 0.03692932650140793,
      "grad_norm": 10.694592475891113,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 1.77,
      "step": 50
    },
    {
      "epoch": 0.04431519180168952,
      "grad_norm": 6.704069137573242,
      "learning_rate": 4.000000000000001e-06,
      "loss": 1.5609,
      "step": 60
    },
    {
      "epoch": 0.0517010571019711,
      "grad_norm": 5.9378342628479,
      "learning_rate": 4.666666666666667e-06,
      "loss": 1.5134,
      "step": 70
    },
    {
      "epoch": 0.05908692240225269,
      "grad_norm": 5.821998119354248,
      "learning_rate": 5.333333333333334e-06,
      "loss": 1.4795,
      "step": 80
    },
    {
      "epoch": 0.06647278770253427,
      "grad_norm": 6.466773986816406,
      "learning_rate": 6e-06,
      "loss": 1.4666,
      "step": 90
    },
    {
      "epoch": 0.07385865300281585,
      "grad_norm": 5.7971625328063965,
      "learning_rate": 6.666666666666667e-06,
      "loss": 1.4187,
      "step": 100
    },
    {
      "epoch": 0.08124451830309745,
      "grad_norm": 19.75885581970215,
      "learning_rate": 7.333333333333333e-06,
      "loss": 1.4012,
      "step": 110
    },
    {
      "epoch": 0.08863038360337903,
      "grad_norm": 6.692321300506592,
      "learning_rate": 8.000000000000001e-06,
      "loss": 1.3932,
      "step": 120
    },
    {
      "epoch": 0.09601624890366062,
      "grad_norm": 8.816634178161621,
      "learning_rate": 8.666666666666668e-06,
      "loss": 1.3924,
      "step": 130
    },
    {
      "epoch": 0.1034021142039422,
      "grad_norm": 6.486945152282715,
      "learning_rate": 9.333333333333334e-06,
      "loss": 1.3117,
      "step": 140
    },
    {
      "epoch": 0.1107879795042238,
      "grad_norm": 8.362743377685547,
      "learning_rate": 1e-05,
      "loss": 1.2642,
      "step": 150
    },
    {
      "epoch": 0.11817384480450538,
      "grad_norm": 7.534619331359863,
      "learning_rate": 1.0666666666666667e-05,
      "loss": 1.2891,
      "step": 160
    },
    {
      "epoch": 0.12555971010478698,
      "grad_norm": 7.239850997924805,
      "learning_rate": 1.1333333333333334e-05,
      "loss": 1.2664,
      "step": 170
    },
    {
      "epoch": 0.13294557540506854,
      "grad_norm": 6.650047779083252,
      "learning_rate": 1.2e-05,
      "loss": 1.2494,
      "step": 180
    },
    {
      "epoch": 0.14033144070535014,
      "grad_norm": 5.859479904174805,
      "learning_rate": 1.2666666666666667e-05,
      "loss": 1.2844,
      "step": 190
    },
    {
      "epoch": 0.1477173060056317,
      "grad_norm": 7.5547027587890625,
      "learning_rate": 1.3333333333333333e-05,
      "loss": 1.2898,
      "step": 200
    },
    {
      "epoch": 0.1551031713059133,
      "grad_norm": 8.316688537597656,
      "learning_rate": 1.4e-05,
      "loss": 1.1774,
      "step": 210
    },
    {
      "epoch": 0.1624890366061949,
      "grad_norm": 7.763572692871094,
      "learning_rate": 1.4666666666666666e-05,
      "loss": 1.2881,
      "step": 220
    },
    {
      "epoch": 0.16987490190647647,
      "grad_norm": 7.132694244384766,
      "learning_rate": 1.5333333333333334e-05,
      "loss": 1.2359,
      "step": 230
    },
    {
      "epoch": 0.17726076720675807,
      "grad_norm": 6.167331218719482,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 1.1939,
      "step": 240
    },
    {
      "epoch": 0.18464663250703967,
      "grad_norm": 7.399999141693115,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 1.2213,
      "step": 250
    },
    {
      "epoch": 0.19203249780732123,
      "grad_norm": 5.161776065826416,
      "learning_rate": 1.7333333333333336e-05,
      "loss": 1.1747,
      "step": 260
    },
    {
      "epoch": 0.19941836310760283,
      "grad_norm": 9.162799835205078,
      "learning_rate": 1.8e-05,
      "loss": 1.1751,
      "step": 270
    },
    {
      "epoch": 0.2068042284078844,
      "grad_norm": 6.043332576751709,
      "learning_rate": 1.866666666666667e-05,
      "loss": 1.181,
      "step": 280
    },
    {
      "epoch": 0.214190093708166,
      "grad_norm": 5.533137798309326,
      "learning_rate": 1.9333333333333333e-05,
      "loss": 1.1727,
      "step": 290
    },
    {
      "epoch": 0.2215759590084476,
      "grad_norm": 4.7085862159729,
      "learning_rate": 2e-05,
      "loss": 1.3127,
      "step": 300
    },
    {
      "epoch": 0.22896182430872916,
      "grad_norm": 5.2254815101623535,
      "learning_rate": 1.9999147543290536e-05,
      "loss": 1.1853,
      "step": 310
    },
    {
      "epoch": 0.23634768960901076,
      "grad_norm": 5.015223503112793,
      "learning_rate": 1.999659031849863e-05,
      "loss": 1.1846,
      "step": 320
    },
    {
      "epoch": 0.24373355490929235,
      "grad_norm": 6.505156993865967,
      "learning_rate": 1.9992328761608965e-05,
      "loss": 1.1572,
      "step": 330
    },
    {
      "epoch": 0.25111942020957395,
      "grad_norm": 4.331061840057373,
      "learning_rate": 1.99863635991801e-05,
      "loss": 1.0744,
      "step": 340
    },
    {
      "epoch": 0.2585052855098555,
      "grad_norm": 6.760496616363525,
      "learning_rate": 1.997869584822058e-05,
      "loss": 1.1019,
      "step": 350
    },
    {
      "epoch": 0.2658911508101371,
      "grad_norm": 6.3948235511779785,
      "learning_rate": 1.9969326816015556e-05,
      "loss": 1.1073,
      "step": 360
    },
    {
      "epoch": 0.2732770161104187,
      "grad_norm": 5.087249279022217,
      "learning_rate": 1.9958258099903894e-05,
      "loss": 1.0751,
      "step": 370
    },
    {
      "epoch": 0.2806628814107003,
      "grad_norm": 10.829612731933594,
      "learning_rate": 1.9945491587005867e-05,
      "loss": 1.083,
      "step": 380
    },
    {
      "epoch": 0.28804874671098185,
      "grad_norm": 5.7423176765441895,
      "learning_rate": 1.9931029453901384e-05,
      "loss": 1.0639,
      "step": 390
    },
    {
      "epoch": 0.2954346120112634,
      "grad_norm": 4.613246440887451,
      "learning_rate": 1.9914874166258927e-05,
      "loss": 1.0604,
      "step": 400
    },
    {
      "epoch": 0.30282047731154504,
      "grad_norm": 4.079463005065918,
      "learning_rate": 1.9897028478415165e-05,
      "loss": 1.0017,
      "step": 410
    },
    {
      "epoch": 0.3102063426118266,
      "grad_norm": 4.641962051391602,
      "learning_rate": 1.9877495432905363e-05,
      "loss": 1.0263,
      "step": 420
    },
    {
      "epoch": 0.3175922079121082,
      "grad_norm": 6.14805269241333,
      "learning_rate": 1.9856278359944664e-05,
      "loss": 1.0451,
      "step": 430
    },
    {
      "epoch": 0.3249780732123898,
      "grad_norm": 5.665846824645996,
      "learning_rate": 1.9833380876860305e-05,
      "loss": 1.0361,
      "step": 440
    },
    {
      "epoch": 0.3323639385126714,
      "grad_norm": 7.826813220977783,
      "learning_rate": 1.9808806887474907e-05,
      "loss": 0.9795,
      "step": 450
    },
    {
      "epoch": 0.33974980381295294,
      "grad_norm": 4.955426216125488,
      "learning_rate": 1.9782560581440894e-05,
      "loss": 1.0433,
      "step": 460
    },
    {
      "epoch": 0.34713566911323457,
      "grad_norm": 5.327470302581787,
      "learning_rate": 1.97546464335262e-05,
      "loss": 0.9605,
      "step": 470
    },
    {
      "epoch": 0.35452153441351614,
      "grad_norm": 4.838713645935059,
      "learning_rate": 1.972506920285136e-05,
      "loss": 0.9935,
      "step": 480
    },
    {
      "epoch": 0.3619073997137977,
      "grad_norm": 6.030056476593018,
      "learning_rate": 1.969383393207813e-05,
      "loss": 1.0043,
      "step": 490
    },
    {
      "epoch": 0.36929326501407933,
      "grad_norm": 5.917972087860107,
      "learning_rate": 1.9660945946549727e-05,
      "loss": 0.9701,
      "step": 500
    },
    {
      "epoch": 0.3766791303143609,
      "grad_norm": 4.341779708862305,
      "learning_rate": 1.962641085338294e-05,
      "loss": 0.9913,
      "step": 510
    },
    {
      "epoch": 0.38406499561464247,
      "grad_norm": 4.399661064147949,
      "learning_rate": 1.959023454051215e-05,
      "loss": 0.9196,
      "step": 520
    },
    {
      "epoch": 0.39145086091492404,
      "grad_norm": 4.028534412384033,
      "learning_rate": 1.9552423175685478e-05,
      "loss": 0.9369,
      "step": 530
    },
    {
      "epoch": 0.39883672621520566,
      "grad_norm": 4.389466285705566,
      "learning_rate": 1.9512983205413253e-05,
      "loss": 1.0191,
      "step": 540
    },
    {
      "epoch": 0.40622259151548723,
      "grad_norm": 5.277081489562988,
      "learning_rate": 1.9471921353868932e-05,
      "loss": 0.9399,
      "step": 550
    },
    {
      "epoch": 0.4136084568157688,
      "grad_norm": 4.73630428314209,
      "learning_rate": 1.9429244621742685e-05,
      "loss": 0.9588,
      "step": 560
    },
    {
      "epoch": 0.4209943221160504,
      "grad_norm": 3.3033573627471924,
      "learning_rate": 1.938496028504784e-05,
      "loss": 0.9038,
      "step": 570
    },
    {
      "epoch": 0.428380187416332,
      "grad_norm": 7.80294942855835,
      "learning_rate": 1.9339075893880382e-05,
      "loss": 0.9403,
      "step": 580
    },
    {
      "epoch": 0.43576605271661356,
      "grad_norm": 4.098162651062012,
      "learning_rate": 1.9291599271131737e-05,
      "loss": 0.9344,
      "step": 590
    },
    {
      "epoch": 0.4431519180168952,
      "grad_norm": 3.7808070182800293,
      "learning_rate": 1.9242538511155024e-05,
      "loss": 0.8939,
      "step": 600
    },
    {
      "epoch": 0.45053778331717675,
      "grad_norm": 4.160403728485107,
      "learning_rate": 1.9191901978385048e-05,
      "loss": 0.8786,
      "step": 610
    },
    {
      "epoch": 0.4579236486174583,
      "grad_norm": 3.7800965309143066,
      "learning_rate": 1.9139698305912227e-05,
      "loss": 0.8977,
      "step": 620
    },
    {
      "epoch": 0.46530951391773995,
      "grad_norm": 3.8200621604919434,
      "learning_rate": 1.9085936394010733e-05,
      "loss": 0.8793,
      "step": 630
    },
    {
      "epoch": 0.4726953792180215,
      "grad_norm": 4.453779220581055,
      "learning_rate": 1.903062540862107e-05,
      "loss": 0.8813,
      "step": 640
    },
    {
      "epoch": 0.4800812445183031,
      "grad_norm": 5.653434753417969,
      "learning_rate": 1.897377477978736e-05,
      "loss": 0.9544,
      "step": 650
    },
    {
      "epoch": 0.4874671098185847,
      "grad_norm": 4.868826389312744,
      "learning_rate": 1.8915394200049597e-05,
      "loss": 0.8858,
      "step": 660
    },
    {
      "epoch": 0.4948529751188663,
      "grad_norm": 4.187640190124512,
      "learning_rate": 1.8855493622791163e-05,
      "loss": 0.9077,
      "step": 670
    },
    {
      "epoch": 0.5022388404191479,
      "grad_norm": 4.503122806549072,
      "learning_rate": 1.8794083260541853e-05,
      "loss": 0.9278,
      "step": 680
    },
    {
      "epoch": 0.5096247057194294,
      "grad_norm": 4.902103900909424,
      "learning_rate": 1.8731173583236737e-05,
      "loss": 0.8281,
      "step": 690
    },
    {
      "epoch": 0.517010571019711,
      "grad_norm": 4.273303031921387,
      "learning_rate": 1.8666775316431113e-05,
      "loss": 0.8054,
      "step": 700
    },
    {
      "epoch": 0.5243964363199927,
      "grad_norm": 55.874359130859375,
      "learning_rate": 1.8600899439471902e-05,
      "loss": 0.8091,
      "step": 710
    },
    {
      "epoch": 0.5317823016202742,
      "grad_norm": 4.271385192871094,
      "learning_rate": 1.8533557183625773e-05,
      "loss": 0.788,
      "step": 720
    },
    {
      "epoch": 0.5391681669205558,
      "grad_norm": 5.59772253036499,
      "learning_rate": 1.8464760030164287e-05,
      "loss": 0.7942,
      "step": 730
    },
    {
      "epoch": 0.5465540322208374,
      "grad_norm": 3.724728584289551,
      "learning_rate": 1.8394519708406454e-05,
      "loss": 0.8234,
      "step": 740
    },
    {
      "epoch": 0.5539398975211189,
      "grad_norm": 5.2906036376953125,
      "learning_rate": 1.8322848193718984e-05,
      "loss": 0.8143,
      "step": 750
    },
    {
      "epoch": 0.5613257628214006,
      "grad_norm": 5.114410877227783,
      "learning_rate": 1.82497577054746e-05,
      "loss": 0.7946,
      "step": 760
    },
    {
      "epoch": 0.5687116281216821,
      "grad_norm": 4.730770587921143,
      "learning_rate": 1.8175260704968716e-05,
      "loss": 0.7771,
      "step": 770
    },
    {
      "epoch": 0.5760974934219637,
      "grad_norm": 3.0836727619171143,
      "learning_rate": 1.809936989329492e-05,
      "loss": 0.739,
      "step": 780
    },
    {
      "epoch": 0.5834833587222453,
      "grad_norm": 2.7664663791656494,
      "learning_rate": 1.802209820917952e-05,
      "loss": 0.731,
      "step": 790
    },
    {
      "epoch": 0.5908692240225268,
      "grad_norm": 3.5617446899414062,
      "learning_rate": 1.7943458826775646e-05,
      "loss": 0.6807,
      "step": 800
    },
    {
      "epoch": 0.5982550893228085,
      "grad_norm": 7.652963638305664,
      "learning_rate": 1.786346515341712e-05,
      "loss": 0.6883,
      "step": 810
    },
    {
      "epoch": 0.6056409546230901,
      "grad_norm": 3.5472395420074463,
      "learning_rate": 1.778213082733266e-05,
      "loss": 0.6822,
      "step": 820
    },
    {
      "epoch": 0.6130268199233716,
      "grad_norm": 4.652453899383545,
      "learning_rate": 1.7699469715320663e-05,
      "loss": 0.6508,
      "step": 830
    },
    {
      "epoch": 0.6204126852236532,
      "grad_norm": 3.976405620574951,
      "learning_rate": 1.7615495910385036e-05,
      "loss": 0.6007,
      "step": 840
    },
    {
      "epoch": 0.6277985505239349,
      "grad_norm": 3.0713090896606445,
      "learning_rate": 1.7530223729332464e-05,
      "loss": 0.6174,
      "step": 850
    },
    {
      "epoch": 0.6351844158242164,
      "grad_norm": 4.036540508270264,
      "learning_rate": 1.7443667710331523e-05,
      "loss": 0.617,
      "step": 860
    },
    {
      "epoch": 0.642570281124498,
      "grad_norm": 7.731866836547852,
      "learning_rate": 1.7355842610434045e-05,
      "loss": 0.6245,
      "step": 870
    },
    {
      "epoch": 0.6499561464247796,
      "grad_norm": 4.550940036773682,
      "learning_rate": 1.7266763403059162e-05,
      "loss": 0.593,
      "step": 880
    },
    {
      "epoch": 0.6573420117250611,
      "grad_norm": 2.5473084449768066,
      "learning_rate": 1.7176445275440468e-05,
      "loss": 0.5677,
      "step": 890
    },
    {
      "epoch": 0.6647278770253428,
      "grad_norm": 2.1716835498809814,
      "learning_rate": 1.7084903626036743e-05,
      "loss": 0.5452,
      "step": 900
    },
    {
      "epoch": 0.6721137423256244,
      "grad_norm": 4.398560523986816,
      "learning_rate": 1.6992154061906637e-05,
      "loss": 0.5599,
      "step": 910
    },
    {
      "epoch": 0.6794996076259059,
      "grad_norm": 2.8742692470550537,
      "learning_rate": 1.6898212396047788e-05,
      "loss": 0.5004,
      "step": 920
    },
    {
      "epoch": 0.6868854729261875,
      "grad_norm": 3.202517032623291,
      "learning_rate": 1.6803094644700878e-05,
      "loss": 0.5079,
      "step": 930
    },
    {
      "epoch": 0.6942713382264691,
      "grad_norm": 5.449188232421875,
      "learning_rate": 1.6706817024618966e-05,
      "loss": 0.5122,
      "step": 940
    },
    {
      "epoch": 0.7016572035267507,
      "grad_norm": 5.538541316986084,
      "learning_rate": 1.6609395950302693e-05,
      "loss": 0.5241,
      "step": 950
    },
    {
      "epoch": 0.7090430688270323,
      "grad_norm": 3.380526304244995,
      "learning_rate": 1.6510848031201755e-05,
      "loss": 0.4631,
      "step": 960
    },
    {
      "epoch": 0.7164289341273139,
      "grad_norm": 3.240527629852295,
      "learning_rate": 1.6411190068883114e-05,
      "loss": 0.5214,
      "step": 970
    },
    {
      "epoch": 0.7238147994275954,
      "grad_norm": 16.668127059936523,
      "learning_rate": 1.63104390541665e-05,
      "loss": 0.5373,
      "step": 980
    },
    {
      "epoch": 0.731200664727877,
      "grad_norm": 3.9278078079223633,
      "learning_rate": 1.6208612164227605e-05,
      "loss": 0.4789,
      "step": 990
    },
    {
      "epoch": 0.7385865300281587,
      "grad_norm": 3.5258326530456543,
      "learning_rate": 1.6105726759669534e-05,
      "loss": 0.465,
      "step": 1000
    },
    {
      "epoch": 0.7459723953284402,
      "grad_norm": 2.779311418533325,
      "learning_rate": 1.600180038156298e-05,
      "loss": 0.4501,
      "step": 1010
    },
    {
      "epoch": 0.7533582606287218,
      "grad_norm": 3.857485771179199,
      "learning_rate": 1.58968507484556e-05,
      "loss": 0.4519,
      "step": 1020
    },
    {
      "epoch": 0.7607441259290034,
      "grad_norm": 2.959052324295044,
      "learning_rate": 1.579089575335117e-05,
      "loss": 0.4357,
      "step": 1030
    },
    {
      "epoch": 0.7681299912292849,
      "grad_norm": 1.8662097454071045,
      "learning_rate": 1.568395346065899e-05,
      "loss": 0.3633,
      "step": 1040
    },
    {
      "epoch": 0.7755158565295666,
      "grad_norm": 5.543001174926758,
      "learning_rate": 1.5576042103114043e-05,
      "loss": 0.4111,
      "step": 1050
    },
    {
      "epoch": 0.7829017218298481,
      "grad_norm": 6.083206653594971,
      "learning_rate": 1.5467180078668485e-05,
      "loss": 0.3764,
      "step": 1060
    },
    {
      "epoch": 0.7902875871301297,
      "grad_norm": 2.5218305587768555,
      "learning_rate": 1.5357385947354945e-05,
      "loss": 0.374,
      "step": 1070
    },
    {
      "epoch": 0.7976734524304113,
      "grad_norm": 4.317601680755615,
      "learning_rate": 1.52466784281222e-05,
      "loss": 0.3571,
      "step": 1080
    },
    {
      "epoch": 0.8050593177306928,
      "grad_norm": 2.0782041549682617,
      "learning_rate": 1.5135076395643765e-05,
      "loss": 0.3739,
      "step": 1090
    },
    {
      "epoch": 0.8124451830309745,
      "grad_norm": 2.443953037261963,
      "learning_rate": 1.5022598877099913e-05,
      "loss": 0.3607,
      "step": 1100
    },
    {
      "epoch": 0.8198310483312561,
      "grad_norm": 2.276827573776245,
      "learning_rate": 1.4909265048933716e-05,
      "loss": 0.3607,
      "step": 1110
    },
    {
      "epoch": 0.8272169136315376,
      "grad_norm": 2.808431386947632,
      "learning_rate": 1.4795094233581616e-05,
      "loss": 0.3387,
      "step": 1120
    },
    {
      "epoch": 0.8346027789318192,
      "grad_norm": 2.5325915813446045,
      "learning_rate": 1.468010589617913e-05,
      "loss": 0.3172,
      "step": 1130
    },
    {
      "epoch": 0.8419886442321008,
      "grad_norm": 2.4943833351135254,
      "learning_rate": 1.4564319641242202e-05,
      "loss": 0.3193,
      "step": 1140
    },
    {
      "epoch": 0.8493745095323824,
      "grad_norm": 2.2182066440582275,
      "learning_rate": 1.4447755209324807e-05,
      "loss": 0.3118,
      "step": 1150
    },
    {
      "epoch": 0.856760374832664,
      "grad_norm": 1.920409083366394,
      "learning_rate": 1.4330432473653369e-05,
      "loss": 0.3246,
      "step": 1160
    },
    {
      "epoch": 0.8641462401329456,
      "grad_norm": 3.2863781452178955,
      "learning_rate": 1.4212371436738518e-05,
      "loss": 0.3065,
      "step": 1170
    },
    {
      "epoch": 0.8715321054332271,
      "grad_norm": 2.6266987323760986,
      "learning_rate": 1.4093592226964863e-05,
      "loss": 0.2813,
      "step": 1180
    },
    {
      "epoch": 0.8789179707335087,
      "grad_norm": 2.526742935180664,
      "learning_rate": 1.3974115095159273e-05,
      "loss": 0.284,
      "step": 1190
    },
    {
      "epoch": 0.8863038360337904,
      "grad_norm": 2.1190872192382812,
      "learning_rate": 1.3853960411138272e-05,
      "loss": 0.2865,
      "step": 1200
    },
    {
      "epoch": 0.8936897013340719,
      "grad_norm": 3.0260584354400635,
      "learning_rate": 1.373314866023517e-05,
      "loss": 0.3019,
      "step": 1210
    },
    {
      "epoch": 0.9010755666343535,
      "grad_norm": 4.537729740142822,
      "learning_rate": 1.3611700439807503e-05,
      "loss": 0.2946,
      "step": 1220
    },
    {
      "epoch": 0.9084614319346351,
      "grad_norm": 3.150209903717041,
      "learning_rate": 1.3489636455725337e-05,
      "loss": 0.2795,
      "step": 1230
    },
    {
      "epoch": 0.9158472972349166,
      "grad_norm": 1.6362818479537964,
      "learning_rate": 1.336697751884111e-05,
      "loss": 0.2815,
      "step": 1240
    },
    {
      "epoch": 0.9232331625351983,
      "grad_norm": 1.3282984495162964,
      "learning_rate": 1.3243744541441578e-05,
      "loss": 0.2679,
      "step": 1250
    },
    {
      "epoch": 0.9306190278354799,
      "grad_norm": 4.261312961578369,
      "learning_rate": 1.3119958533682417e-05,
      "loss": 0.2634,
      "step": 1260
    },
    {
      "epoch": 0.9380048931357614,
      "grad_norm": 2.1109001636505127,
      "learning_rate": 1.2995640600006196e-05,
      "loss": 0.2566,
      "step": 1270
    },
    {
      "epoch": 0.945390758436043,
      "grad_norm": 2.4117610454559326,
      "learning_rate": 1.2870811935544252e-05,
      "loss": 0.2502,
      "step": 1280
    },
    {
      "epoch": 0.9527766237363247,
      "grad_norm": 2.0748672485351562,
      "learning_rate": 1.2745493822503096e-05,
      "loss": 0.2422,
      "step": 1290
    },
    {
      "epoch": 0.9601624890366062,
      "grad_norm": 3.0310394763946533,
      "learning_rate": 1.261970762653598e-05,
      "loss": 0.2508,
      "step": 1300
    },
    {
      "epoch": 0.9675483543368878,
      "grad_norm": 2.0341477394104004,
      "learning_rate": 1.2493474793100249e-05,
      "loss": 0.2467,
      "step": 1310
    },
    {
      "epoch": 0.9749342196371694,
      "grad_norm": 1.4582960605621338,
      "learning_rate": 1.2366816843801066e-05,
      "loss": 0.2479,
      "step": 1320
    },
    {
      "epoch": 0.9823200849374509,
      "grad_norm": 3.3330225944519043,
      "learning_rate": 1.2239755372722169e-05,
      "loss": 0.2516,
      "step": 1330
    },
    {
      "epoch": 0.9897059502377326,
      "grad_norm": 1.4349642992019653,
      "learning_rate": 1.2112312042744263e-05,
      "loss": 0.2153,
      "step": 1340
    },
    {
      "epoch": 0.9970918155380141,
      "grad_norm": 2.073673725128174,
      "learning_rate": 1.1984508581851694e-05,
      "loss": 0.1858,
      "step": 1350
    },
    {
      "epoch": 1.0051701057101972,
      "grad_norm": 4.247702598571777,
      "learning_rate": 1.1856366779428008e-05,
      "loss": 0.2183,
      "step": 1360
    },
    {
      "epoch": 1.0125559710104788,
      "grad_norm": 4.242294788360596,
      "learning_rate": 1.1727908482541048e-05,
      "loss": 0.2059,
      "step": 1370
    },
    {
      "epoch": 1.0199418363107602,
      "grad_norm": 2.2901999950408936,
      "learning_rate": 1.1599155592218234e-05,
      "loss": 0.2207,
      "step": 1380
    },
    {
      "epoch": 1.0273277016110418,
      "grad_norm": 1.7798693180084229,
      "learning_rate": 1.1470130059712607e-05,
      "loss": 0.1898,
      "step": 1390
    },
    {
      "epoch": 1.0347135669113234,
      "grad_norm": 1.9651380777359009,
      "learning_rate": 1.1340853882760343e-05,
      "loss": 0.1958,
      "step": 1400
    },
    {
      "epoch": 1.042099432211605,
      "grad_norm": 1.8335607051849365,
      "learning_rate": 1.1211349101830323e-05,
      "loss": 0.2201,
      "step": 1410
    },
    {
      "epoch": 1.0494852975118867,
      "grad_norm": 2.270725965499878,
      "learning_rate": 1.1081637796366432e-05,
      "loss": 0.1881,
      "step": 1420
    },
    {
      "epoch": 1.0568711628121683,
      "grad_norm": 3.337350368499756,
      "learning_rate": 1.0951742081023196e-05,
      "loss": 0.2176,
      "step": 1430
    },
    {
      "epoch": 1.0642570281124497,
      "grad_norm": 3.7382607460021973,
      "learning_rate": 1.0821684101895429e-05,
      "loss": 0.2043,
      "step": 1440
    },
    {
      "epoch": 1.0716428934127313,
      "grad_norm": 1.3422726392745972,
      "learning_rate": 1.0691486032742522e-05,
      "loss": 0.1908,
      "step": 1450
    },
    {
      "epoch": 1.079028758713013,
      "grad_norm": 3.4625842571258545,
      "learning_rate": 1.0561170071207987e-05,
      "loss": 0.1747,
      "step": 1460
    },
    {
      "epoch": 1.0864146240132946,
      "grad_norm": 1.8566938638687134,
      "learning_rate": 1.0430758435034985e-05,
      "loss": 0.2003,
      "step": 1470
    },
    {
      "epoch": 1.0938004893135762,
      "grad_norm": 4.041960716247559,
      "learning_rate": 1.0300273358278362e-05,
      "loss": 0.1716,
      "step": 1480
    },
    {
      "epoch": 1.1011863546138578,
      "grad_norm": 1.5447806119918823,
      "learning_rate": 1.016973708751395e-05,
      "loss": 0.1911,
      "step": 1490
    },
    {
      "epoch": 1.1085722199141392,
      "grad_norm": 1.8091706037521362,
      "learning_rate": 1.003917187804572e-05,
      "loss": 0.1687,
      "step": 1500
    },
    {
      "epoch": 1.1159580852144209,
      "grad_norm": 1.5981247425079346,
      "learning_rate": 9.908599990111438e-06,
      "loss": 0.1706,
      "step": 1510
    },
    {
      "epoch": 1.1233439505147025,
      "grad_norm": 1.5762553215026855,
      "learning_rate": 9.778043685087488e-06,
      "loss": 0.1896,
      "step": 1520
    },
    {
      "epoch": 1.130729815814984,
      "grad_norm": 1.4694616794586182,
      "learning_rate": 9.64752522169351e-06,
      "loss": 0.1718,
      "step": 1530
    },
    {
      "epoch": 1.1381156811152657,
      "grad_norm": 1.4669324159622192,
      "learning_rate": 9.517066852197469e-06,
      "loss": 0.1481,
      "step": 1540
    },
    {
      "epoch": 1.1455015464155474,
      "grad_norm": 2.1808154582977295,
      "learning_rate": 9.386690818621845e-06,
      "loss": 0.1878,
      "step": 1550
    },
    {
      "epoch": 1.1528874117158288,
      "grad_norm": 1.0794235467910767,
      "learning_rate": 9.256419348951545e-06,
      "loss": 0.1809,
      "step": 1560
    },
    {
      "epoch": 1.1602732770161104,
      "grad_norm": 1.1634767055511475,
      "learning_rate": 9.126274653344249e-06,
      "loss": 0.1558,
      "step": 1570
    },
    {
      "epoch": 1.167659142316392,
      "grad_norm": 3.980741024017334,
      "learning_rate": 8.996278920343753e-06,
      "loss": 0.1714,
      "step": 1580
    },
    {
      "epoch": 1.1750450076166736,
      "grad_norm": 1.3018531799316406,
      "learning_rate": 8.866454313097011e-06,
      "loss": 0.1476,
      "step": 1590
    },
    {
      "epoch": 1.1824308729169553,
      "grad_norm": 1.6033530235290527,
      "learning_rate": 8.736822965575526e-06,
      "loss": 0.1702,
      "step": 1600
    },
    {
      "epoch": 1.1898167382172367,
      "grad_norm": 1.6837263107299805,
      "learning_rate": 8.607406978801692e-06,
      "loss": 0.1622,
      "step": 1610
    },
    {
      "epoch": 1.1972026035175183,
      "grad_norm": 4.44855260848999,
      "learning_rate": 8.478228417080749e-06,
      "loss": 0.2111,
      "step": 1620
    },
    {
      "epoch": 1.2045884688178,
      "grad_norm": 1.133955478668213,
      "learning_rate": 8.349309304239033e-06,
      "loss": 0.1407,
      "step": 1630
    },
    {
      "epoch": 1.2119743341180815,
      "grad_norm": 2.430974006652832,
      "learning_rate": 8.22067161986909e-06,
      "loss": 0.1502,
      "step": 1640
    },
    {
      "epoch": 1.2193601994183632,
      "grad_norm": 1.0593976974487305,
      "learning_rate": 8.092337295582342e-06,
      "loss": 0.1461,
      "step": 1650
    },
    {
      "epoch": 1.2267460647186448,
      "grad_norm": 1.5466171503067017,
      "learning_rate": 7.964328211269949e-06,
      "loss": 0.1257,
      "step": 1660
    },
    {
      "epoch": 1.2341319300189264,
      "grad_norm": 3.7850043773651123,
      "learning_rate": 7.83666619137247e-06,
      "loss": 0.1237,
      "step": 1670
    },
    {
      "epoch": 1.2415177953192078,
      "grad_norm": 2.987395763397217,
      "learning_rate": 7.709373001158989e-06,
      "loss": 0.135,
      "step": 1680
    },
    {
      "epoch": 1.2489036606194894,
      "grad_norm": 1.1026815176010132,
      "learning_rate": 7.582470343016315e-06,
      "loss": 0.1339,
      "step": 1690
    },
    {
      "epoch": 1.256289525919771,
      "grad_norm": 0.8675901293754578,
      "learning_rate": 7.455979852748926e-06,
      "loss": 0.1187,
      "step": 1700
    },
    {
      "epoch": 1.2636753912200527,
      "grad_norm": 1.0071134567260742,
      "learning_rate": 7.3299230958902455e-06,
      "loss": 0.1288,
      "step": 1710
    },
    {
      "epoch": 1.2710612565203343,
      "grad_norm": 1.257807731628418,
      "learning_rate": 7.2043215640259045e-06,
      "loss": 0.1219,
      "step": 1720
    },
    {
      "epoch": 1.2784471218206157,
      "grad_norm": 1.5844953060150146,
      "learning_rate": 7.079196671129613e-06,
      "loss": 0.1293,
      "step": 1730
    },
    {
      "epoch": 1.2858329871208973,
      "grad_norm": 1.242968201637268,
      "learning_rate": 6.954569749912268e-06,
      "loss": 0.1242,
      "step": 1740
    },
    {
      "epoch": 1.293218852421179,
      "grad_norm": 6.035883903503418,
      "learning_rate": 6.8304620481849e-06,
      "loss": 0.1324,
      "step": 1750
    },
    {
      "epoch": 1.3006047177214606,
      "grad_norm": 1.1064496040344238,
      "learning_rate": 6.706894725236118e-06,
      "loss": 0.113,
      "step": 1760
    },
    {
      "epoch": 1.3079905830217422,
      "grad_norm": 3.75222110748291,
      "learning_rate": 6.583888848224628e-06,
      "loss": 0.1402,
      "step": 1770
    },
    {
      "epoch": 1.3153764483220236,
      "grad_norm": 2.064958333969116,
      "learning_rate": 6.4614653885874564e-06,
      "loss": 0.1354,
      "step": 1780
    },
    {
      "epoch": 1.3227623136223052,
      "grad_norm": 1.2012087106704712,
      "learning_rate": 6.339645218464521e-06,
      "loss": 0.1162,
      "step": 1790
    },
    {
      "epoch": 1.3301481789225869,
      "grad_norm": 3.533600330352783,
      "learning_rate": 6.218449107140093e-06,
      "loss": 0.114,
      "step": 1800
    },
    {
      "epoch": 1.3375340442228685,
      "grad_norm": 1.0663248300552368,
      "learning_rate": 6.097897717501829e-06,
      "loss": 0.1102,
      "step": 1810
    },
    {
      "epoch": 1.34491990952315,
      "grad_norm": 2.6653411388397217,
      "learning_rate": 5.978011602517908e-06,
      "loss": 0.1115,
      "step": 1820
    },
    {
      "epoch": 1.3523057748234317,
      "grad_norm": 2.8922715187072754,
      "learning_rate": 5.858811201732952e-06,
      "loss": 0.1168,
      "step": 1830
    },
    {
      "epoch": 1.3596916401237134,
      "grad_norm": 0.7805532813072205,
      "learning_rate": 5.740316837783247e-06,
      "loss": 0.0985,
      "step": 1840
    },
    {
      "epoch": 1.3670775054239948,
      "grad_norm": 1.6969873905181885,
      "learning_rate": 5.622548712931907e-06,
      "loss": 0.115,
      "step": 1850
    },
    {
      "epoch": 1.3744633707242764,
      "grad_norm": 1.0871217250823975,
      "learning_rate": 5.50552690562457e-06,
      "loss": 0.1077,
      "step": 1860
    },
    {
      "epoch": 1.381849236024558,
      "grad_norm": 1.25892174243927,
      "learning_rate": 5.389271367066193e-06,
      "loss": 0.0974,
      "step": 1870
    },
    {
      "epoch": 1.3892351013248396,
      "grad_norm": 0.6338607668876648,
      "learning_rate": 5.273801917819552e-06,
      "loss": 0.098,
      "step": 1880
    },
    {
      "epoch": 1.3966209666251213,
      "grad_norm": 0.43911364674568176,
      "learning_rate": 5.159138244425996e-06,
      "loss": 0.0965,
      "step": 1890
    },
    {
      "epoch": 1.4040068319254027,
      "grad_norm": 0.7171842455863953,
      "learning_rate": 5.045299896049063e-06,
      "loss": 0.1043,
      "step": 1900
    },
    {
      "epoch": 1.4113926972256843,
      "grad_norm": 0.7495408058166504,
      "learning_rate": 4.932306281141531e-06,
      "loss": 0.1067,
      "step": 1910
    },
    {
      "epoch": 1.418778562525966,
      "grad_norm": 0.6386808753013611,
      "learning_rate": 4.82017666413643e-06,
      "loss": 0.095,
      "step": 1920
    },
    {
      "epoch": 1.4261644278262475,
      "grad_norm": 0.4710920751094818,
      "learning_rate": 4.7089301621626285e-06,
      "loss": 0.0946,
      "step": 1930
    },
    {
      "epoch": 1.4335502931265292,
      "grad_norm": 2.0037851333618164,
      "learning_rate": 4.598585741785529e-06,
      "loss": 0.1343,
      "step": 1940
    },
    {
      "epoch": 1.4409361584268106,
      "grad_norm": 0.731887936592102,
      "learning_rate": 4.489162215773437e-06,
      "loss": 0.1021,
      "step": 1950
    },
    {
      "epoch": 1.4483220237270924,
      "grad_norm": 1.012526035308838,
      "learning_rate": 4.380678239890128e-06,
      "loss": 0.0986,
      "step": 1960
    },
    {
      "epoch": 1.4557078890273738,
      "grad_norm": 1.7591279745101929,
      "learning_rate": 4.273152309714231e-06,
      "loss": 0.0921,
      "step": 1970
    },
    {
      "epoch": 1.4630937543276554,
      "grad_norm": 0.5881451964378357,
      "learning_rate": 4.166602757485865e-06,
      "loss": 0.0889,
      "step": 1980
    },
    {
      "epoch": 1.470479619627937,
      "grad_norm": 0.6772285103797913,
      "learning_rate": 4.061047748981171e-06,
      "loss": 0.0999,
      "step": 1990
    },
    {
      "epoch": 1.4778654849282187,
      "grad_norm": 1.0633774995803833,
      "learning_rate": 3.9565052804151925e-06,
      "loss": 0.0929,
      "step": 2000
    },
    {
      "epoch": 1.4852513502285003,
      "grad_norm": 0.5887898802757263,
      "learning_rate": 3.852993175373679e-06,
      "loss": 0.0929,
      "step": 2010
    },
    {
      "epoch": 1.4926372155287817,
      "grad_norm": 0.9685658812522888,
      "learning_rate": 3.7505290817743256e-06,
      "loss": 0.0932,
      "step": 2020
    },
    {
      "epoch": 1.5000230808290633,
      "grad_norm": 3.481058120727539,
      "learning_rate": 3.6491304688579376e-06,
      "loss": 0.1034,
      "step": 2030
    },
    {
      "epoch": 1.507408946129345,
      "grad_norm": 1.2913931608200073,
      "learning_rate": 3.5488146242101018e-06,
      "loss": 0.0914,
      "step": 2040
    },
    {
      "epoch": 1.5147948114296266,
      "grad_norm": 0.49071353673934937,
      "learning_rate": 3.4495986508137847e-06,
      "loss": 0.097,
      "step": 2050
    },
    {
      "epoch": 1.5221806767299082,
      "grad_norm": 0.7845070362091064,
      "learning_rate": 3.3514994641334274e-06,
      "loss": 0.0895,
      "step": 2060
    },
    {
      "epoch": 1.5295665420301896,
      "grad_norm": 0.7540778517723083,
      "learning_rate": 3.254533789231008e-06,
      "loss": 0.094,
      "step": 2070
    },
    {
      "epoch": 1.5369524073304714,
      "grad_norm": 0.8221713900566101,
      "learning_rate": 3.158718157914559e-06,
      "loss": 0.0857,
      "step": 2080
    },
    {
      "epoch": 1.5443382726307529,
      "grad_norm": 0.458886057138443,
      "learning_rate": 3.0640689059196328e-06,
      "loss": 0.0834,
      "step": 2090
    },
    {
      "epoch": 1.5517241379310345,
      "grad_norm": 5.687739372253418,
      "learning_rate": 2.9706021701242127e-06,
      "loss": 0.0944,
      "step": 2100
    },
    {
      "epoch": 1.559110003231316,
      "grad_norm": 0.609434962272644,
      "learning_rate": 2.8783338857975087e-06,
      "loss": 0.0926,
      "step": 2110
    },
    {
      "epoch": 1.5664958685315975,
      "grad_norm": 3.346607208251953,
      "learning_rate": 2.787279783883129e-06,
      "loss": 0.087,
      "step": 2120
    },
    {
      "epoch": 1.5738817338318793,
      "grad_norm": 2.047215700149536,
      "learning_rate": 2.697455388317094e-06,
      "loss": 0.0807,
      "step": 2130
    },
    {
      "epoch": 1.5812675991321608,
      "grad_norm": 1.0655306577682495,
      "learning_rate": 2.6088760133811418e-06,
      "loss": 0.0857,
      "step": 2140
    },
    {
      "epoch": 1.5886534644324424,
      "grad_norm": 1.1660749912261963,
      "learning_rate": 2.5215567610917623e-06,
      "loss": 0.08,
      "step": 2150
    },
    {
      "epoch": 1.596039329732724,
      "grad_norm": 0.45875102281570435,
      "learning_rate": 2.4355125186254547e-06,
      "loss": 0.0931,
      "step": 2160
    },
    {
      "epoch": 1.6034251950330056,
      "grad_norm": 1.5347977876663208,
      "learning_rate": 2.3507579557805803e-06,
      "loss": 0.083,
      "step": 2170
    },
    {
      "epoch": 1.6108110603332872,
      "grad_norm": 1.1268221139907837,
      "learning_rate": 2.26730752247629e-06,
      "loss": 0.0841,
      "step": 2180
    },
    {
      "epoch": 1.6181969256335687,
      "grad_norm": 0.4492045044898987,
      "learning_rate": 2.1851754462889373e-06,
      "loss": 0.0791,
      "step": 2190
    },
    {
      "epoch": 1.6255827909338505,
      "grad_norm": 0.9329794645309448,
      "learning_rate": 2.104375730026406e-06,
      "loss": 0.0827,
      "step": 2200
    },
    {
      "epoch": 1.632968656234132,
      "grad_norm": 0.4460253119468689,
      "learning_rate": 2.024922149340748e-06,
      "loss": 0.0812,
      "step": 2210
    },
    {
      "epoch": 1.6403545215344135,
      "grad_norm": 3.0073747634887695,
      "learning_rate": 1.9468282503795465e-06,
      "loss": 0.0836,
      "step": 2220
    },
    {
      "epoch": 1.6477403868346951,
      "grad_norm": 0.7037497758865356,
      "learning_rate": 1.8701073474764342e-06,
      "loss": 0.0757,
      "step": 2230
    },
    {
      "epoch": 1.6551262521349765,
      "grad_norm": 2.326693058013916,
      "learning_rate": 1.7947725208810962e-06,
      "loss": 0.0743,
      "step": 2240
    },
    {
      "epoch": 1.6625121174352584,
      "grad_norm": 0.2990873456001282,
      "learning_rate": 1.720836614529211e-06,
      "loss": 0.0799,
      "step": 2250
    },
    {
      "epoch": 1.6698979827355398,
      "grad_norm": 0.4213595688343048,
      "learning_rate": 1.648312233852666e-06,
      "loss": 0.0802,
      "step": 2260
    },
    {
      "epoch": 1.6772838480358214,
      "grad_norm": 0.5848265290260315,
      "learning_rate": 1.5772117436304446e-06,
      "loss": 0.0795,
      "step": 2270
    },
    {
      "epoch": 1.684669713336103,
      "grad_norm": 0.6411451697349548,
      "learning_rate": 1.5075472658805301e-06,
      "loss": 0.0739,
      "step": 2280
    },
    {
      "epoch": 1.6920555786363847,
      "grad_norm": 0.8654035925865173,
      "learning_rate": 1.4393306777932192e-06,
      "loss": 0.0796,
      "step": 2290
    },
    {
      "epoch": 1.6994414439366663,
      "grad_norm": 0.7043092250823975,
      "learning_rate": 1.3725736097061537e-06,
      "loss": 0.0811,
      "step": 2300
    },
    {
      "epoch": 1.7068273092369477,
      "grad_norm": 1.6693702936172485,
      "learning_rate": 1.307287443121452e-06,
      "loss": 0.094,
      "step": 2310
    },
    {
      "epoch": 1.7142131745372293,
      "grad_norm": 0.33761119842529297,
      "learning_rate": 1.2434833087652642e-06,
      "loss": 0.0759,
      "step": 2320
    },
    {
      "epoch": 1.721599039837511,
      "grad_norm": 0.9389520883560181,
      "learning_rate": 1.181172084690072e-06,
      "loss": 0.0727,
      "step": 2330
    },
    {
      "epoch": 1.7289849051377926,
      "grad_norm": 0.2903837263584137,
      "learning_rate": 1.120364394420087e-06,
      "loss": 0.0743,
      "step": 2340
    },
    {
      "epoch": 1.7363707704380742,
      "grad_norm": 0.325009822845459,
      "learning_rate": 1.0610706051400165e-06,
      "loss": 0.0801,
      "step": 2350
    },
    {
      "epoch": 1.7437566357383556,
      "grad_norm": 0.9325069785118103,
      "learning_rate": 1.0033008259275635e-06,
      "loss": 0.0759,
      "step": 2360
    },
    {
      "epoch": 1.7511425010386374,
      "grad_norm": 1.0802961587905884,
      "learning_rate": 9.470649060299041e-07,
      "loss": 0.0779,
      "step": 2370
    },
    {
      "epoch": 1.7585283663389188,
      "grad_norm": 0.4947347939014435,
      "learning_rate": 8.923724331844875e-07,
      "loss": 0.0786,
      "step": 2380
    },
    {
      "epoch": 1.7659142316392005,
      "grad_norm": 0.47125598788261414,
      "learning_rate": 8.392327319843985e-07,
      "loss": 0.0751,
      "step": 2390
    },
    {
      "epoch": 1.773300096939482,
      "grad_norm": 0.3219301402568817,
      "learning_rate": 7.876548622886038e-07,
      "loss": 0.0702,
      "step": 2400
    },
    {
      "epoch": 1.7806859622397635,
      "grad_norm": 0.602854311466217,
      "learning_rate": 7.376476176773184e-07,
      "loss": 0.0772,
      "step": 2410
    },
    {
      "epoch": 1.7880718275400453,
      "grad_norm": 0.48326513171195984,
      "learning_rate": 6.89219523952781e-07,
      "loss": 0.0797,
      "step": 2420
    },
    {
      "epoch": 1.7954576928403267,
      "grad_norm": 0.5595663189888,
      "learning_rate": 6.423788376856765e-07,
      "loss": 0.066,
      "step": 2430
    },
    {
      "epoch": 1.8028435581406084,
      "grad_norm": 1.7976887226104736,
      "learning_rate": 5.971335448074611e-07,
      "loss": 0.0732,
      "step": 2440
    },
    {
      "epoch": 1.81022942344089,
      "grad_norm": 1.282763957977295,
      "learning_rate": 5.534913592488322e-07,
      "loss": 0.0816,
      "step": 2450
    },
    {
      "epoch": 1.8176152887411716,
      "grad_norm": 0.9589461088180542,
      "learning_rate": 5.114597216245698e-07,
      "loss": 0.0798,
      "step": 2460
    },
    {
      "epoch": 1.8250011540414532,
      "grad_norm": 0.43628719449043274,
      "learning_rate": 4.7104579796497405e-07,
      "loss": 0.0835,
      "step": 2470
    },
    {
      "epoch": 1.8323870193417346,
      "grad_norm": 0.49431607127189636,
      "learning_rate": 4.3225647849411854e-07,
      "loss": 0.074,
      "step": 2480
    },
    {
      "epoch": 1.8397728846420165,
      "grad_norm": 0.9135465025901794,
      "learning_rate": 3.9509837645513306e-07,
      "loss": 0.0736,
      "step": 2490
    },
    {
      "epoch": 1.847158749942298,
      "grad_norm": 0.6499918103218079,
      "learning_rate": 3.595778269826966e-07,
      "loss": 0.0723,
      "step": 2500
    },
    {
      "epoch": 1.8545446152425795,
      "grad_norm": 1.299659013748169,
      "learning_rate": 3.257008860229527e-07,
      "loss": 0.0735,
      "step": 2510
    },
    {
      "epoch": 1.8619304805428611,
      "grad_norm": 0.7049327492713928,
      "learning_rate": 2.9347332930102503e-07,
      "loss": 0.0713,
      "step": 2520
    },
    {
      "epoch": 1.8693163458431425,
      "grad_norm": 0.29024580121040344,
      "learning_rate": 2.6290065133630637e-07,
      "loss": 0.0774,
      "step": 2530
    },
    {
      "epoch": 1.8767022111434244,
      "grad_norm": 0.7386340498924255,
      "learning_rate": 2.3398806450568577e-07,
      "loss": 0.0739,
      "step": 2540
    },
    {
      "epoch": 1.8840880764437058,
      "grad_norm": 0.5153611898422241,
      "learning_rate": 2.067404981548915e-07,
      "loss": 0.0702,
      "step": 2550
    },
    {
      "epoch": 1.8914739417439874,
      "grad_norm": 1.2201671600341797,
      "learning_rate": 1.811625977580722e-07,
      "loss": 0.082,
      "step": 2560
    },
    {
      "epoch": 1.898859807044269,
      "grad_norm": 0.7881399989128113,
      "learning_rate": 1.5725872412579058e-07,
      "loss": 0.0677,
      "step": 2570
    },
    {
      "epoch": 1.9062456723445507,
      "grad_norm": 0.3312283456325531,
      "learning_rate": 1.3503295266153903e-07,
      "loss": 0.0756,
      "step": 2580
    },
    {
      "epoch": 1.9136315376448323,
      "grad_norm": 0.4955926239490509,
      "learning_rate": 1.14489072666919e-07,
      "loss": 0.0692,
      "step": 2590
    },
    {
      "epoch": 1.9210174029451137,
      "grad_norm": 0.45805656909942627,
      "learning_rate": 9.563058669559755e-08,
      "loss": 0.0753,
      "step": 2600
    },
    {
      "epoch": 1.9284032682453955,
      "grad_norm": 0.5555469393730164,
      "learning_rate": 7.846070995615518e-08,
      "loss": 0.0716,
      "step": 2610
    },
    {
      "epoch": 1.935789133545677,
      "grad_norm": 0.5252045392990112,
      "learning_rate": 6.298236976391537e-08,
      "loss": 0.0772,
      "step": 2620
    },
    {
      "epoch": 1.9431749988459586,
      "grad_norm": 1.8346993923187256,
      "learning_rate": 4.919820504186934e-08,
      "loss": 0.0764,
      "step": 2630
    },
    {
      "epoch": 1.9505608641462402,
      "grad_norm": 0.4004700481891632,
      "learning_rate": 3.711056587075712e-08,
      "loss": 0.0739,
      "step": 2640
    },
    {
      "epoch": 1.9579467294465216,
      "grad_norm": 1.077645182609558,
      "learning_rate": 2.672151308840243e-08,
      "loss": 0.07,
      "step": 2650
    },
    {
      "epoch": 1.9653325947468034,
      "grad_norm": 0.6247801184654236,
      "learning_rate": 1.8032817938352653e-08,
      "loss": 0.0666,
      "step": 2660
    },
    {
      "epoch": 1.9727184600470848,
      "grad_norm": 0.4016879200935364,
      "learning_rate": 1.1045961767904844e-08,
      "loss": 0.0695,
      "step": 2670
    },
    {
      "epoch": 1.9801043253473665,
      "grad_norm": 0.5175566673278809,
      "learning_rate": 5.7621357755432984e-09,
      "loss": 0.0722,
      "step": 2680
    },
    {
      "epoch": 1.987490190647648,
      "grad_norm": 0.5656958222389221,
      "learning_rate": 2.1822408078508994e-09,
      "loss": 0.0728,
      "step": 2690
    },
    {
      "epoch": 1.9948760559479295,
      "grad_norm": 0.5182742476463318,
      "learning_rate": 3.068872059253103e-10,
      "loss": 0.0727,
      "step": 2700
    }
  ],
  "logging_steps": 10,
  "max_steps": 2706,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 4.22919470739456e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}