{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 939,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03194888178913738,
      "grad_norm": 2.244253158569336,
      "learning_rate": 9.997201868901463e-06,
      "loss": 0.6628,
      "step": 10
    },
    {
      "epoch": 0.06389776357827476,
      "grad_norm": 2.3703079223632812,
      "learning_rate": 9.988810607420912e-06,
      "loss": 0.5322,
      "step": 20
    },
    {
      "epoch": 0.09584664536741214,
      "grad_norm": 2.0787246227264404,
      "learning_rate": 9.974835607498224e-06,
      "loss": 0.5513,
      "step": 30
    },
    {
      "epoch": 0.12779552715654952,
      "grad_norm": 1.8989689350128174,
      "learning_rate": 9.955292510686156e-06,
      "loss": 0.5212,
      "step": 40
    },
    {
      "epoch": 0.1597444089456869,
      "grad_norm": 2.0805258750915527,
      "learning_rate": 9.930203190643491e-06,
      "loss": 0.5511,
      "step": 50
    },
    {
      "epoch": 0.19169329073482427,
      "grad_norm": 2.0137438774108887,
      "learning_rate": 9.899595728652883e-06,
      "loss": 0.5134,
      "step": 60
    },
    {
      "epoch": 0.22364217252396165,
      "grad_norm": 2.0841853618621826,
      "learning_rate": 9.863504382190838e-06,
      "loss": 0.4956,
      "step": 70
    },
    {
      "epoch": 0.25559105431309903,
      "grad_norm": 1.5514801740646362,
      "learning_rate": 9.821969546584922e-06,
      "loss": 0.5168,
      "step": 80
    },
    {
      "epoch": 0.28753993610223644,
      "grad_norm": 2.0024027824401855,
      "learning_rate": 9.775037709801206e-06,
      "loss": 0.4878,
      "step": 90
    },
    {
      "epoch": 0.3194888178913738,
      "grad_norm": 1.877454161643982,
      "learning_rate": 9.722761400412496e-06,
      "loss": 0.5156,
      "step": 100
    },
    {
      "epoch": 0.3514376996805112,
      "grad_norm": 1.4342358112335205,
      "learning_rate": 9.6651991288056e-06,
      "loss": 0.5025,
      "step": 110
    },
    {
      "epoch": 0.38338658146964855,
      "grad_norm": 1.7300364971160889,
      "learning_rate": 9.602415321693434e-06,
      "loss": 0.5102,
      "step": 120
    },
    {
      "epoch": 0.41533546325878595,
      "grad_norm": 1.5914117097854614,
      "learning_rate": 9.534480250005263e-06,
      "loss": 0.5074,
      "step": 130
    },
    {
      "epoch": 0.4472843450479233,
      "grad_norm": 1.7477545738220215,
      "learning_rate": 9.461469950235795e-06,
      "loss": 0.5025,
      "step": 140
    },
    {
      "epoch": 0.4792332268370607,
      "grad_norm": 1.8615833520889282,
      "learning_rate": 9.38346613934115e-06,
      "loss": 0.5066,
      "step": 150
    },
    {
      "epoch": 0.5111821086261981,
      "grad_norm": 1.7704814672470093,
      "learning_rate": 9.300556123276955e-06,
      "loss": 0.4952,
      "step": 160
    },
    {
      "epoch": 0.5431309904153354,
      "grad_norm": 1.7545205354690552,
      "learning_rate": 9.212832699280942e-06,
      "loss": 0.5093,
      "step": 170
    },
    {
      "epoch": 0.5750798722044729,
      "grad_norm": 2.059382438659668,
      "learning_rate": 9.120394052009412e-06,
      "loss": 0.4938,
      "step": 180
    },
    {
      "epoch": 0.6070287539936102,
      "grad_norm": 1.4858242273330688,
      "learning_rate": 9.023343643643821e-06,
      "loss": 0.4984,
      "step": 190
    },
    {
      "epoch": 0.6389776357827476,
      "grad_norm": 1.5544804334640503,
      "learning_rate": 8.921790098090477e-06,
      "loss": 0.4969,
      "step": 200
    },
    {
      "epoch": 0.670926517571885,
      "grad_norm": 1.8466405868530273,
      "learning_rate": 8.815847079402972e-06,
      "loss": 0.5071,
      "step": 210
    },
    {
      "epoch": 0.7028753993610224,
      "grad_norm": 1.684263825416565,
      "learning_rate": 8.705633164563413e-06,
      "loss": 0.4731,
      "step": 220
    },
    {
      "epoch": 0.7348242811501597,
      "grad_norm": 1.578497052192688,
      "learning_rate": 8.591271710764839e-06,
      "loss": 0.4741,
      "step": 230
    },
    {
      "epoch": 0.7667731629392971,
      "grad_norm": 1.6205824613571167,
      "learning_rate": 8.472890717343391e-06,
      "loss": 0.4648,
      "step": 240
    },
    {
      "epoch": 0.7987220447284346,
      "grad_norm": 1.6458181142807007,
      "learning_rate": 8.350622682514735e-06,
      "loss": 0.5115,
      "step": 250
    },
    {
      "epoch": 0.8306709265175719,
      "grad_norm": 1.6952828168869019,
      "learning_rate": 8.224604455075115e-06,
      "loss": 0.4802,
      "step": 260
    },
    {
      "epoch": 0.8626198083067093,
      "grad_norm": 1.5683045387268066,
      "learning_rate": 8.094977081233006e-06,
      "loss": 0.5013,
      "step": 270
    },
    {
      "epoch": 0.8945686900958466,
      "grad_norm": 1.5408000946044922,
      "learning_rate": 7.961885646742793e-06,
      "loss": 0.4897,
      "step": 280
    },
    {
      "epoch": 0.9265175718849841,
      "grad_norm": 1.5393075942993164,
      "learning_rate": 7.825479114517197e-06,
      "loss": 0.4759,
      "step": 290
    },
    {
      "epoch": 0.9584664536741214,
      "grad_norm": 1.638293743133545,
      "learning_rate": 7.685910157900158e-06,
      "loss": 0.4782,
      "step": 300
    },
    {
      "epoch": 0.9904153354632588,
      "grad_norm": 1.3112537860870361,
      "learning_rate": 7.5433349897868445e-06,
      "loss": 0.4874,
      "step": 310
    },
    {
      "epoch": 1.0223642172523961,
      "grad_norm": 1.236051321029663,
      "learning_rate": 7.397913187781962e-06,
      "loss": 0.3528,
      "step": 320
    },
    {
      "epoch": 1.0543130990415335,
      "grad_norm": 1.8970476388931274,
      "learning_rate": 7.249807515592149e-06,
      "loss": 0.2994,
      "step": 330
    },
    {
      "epoch": 1.0862619808306708,
      "grad_norm": 1.694257378578186,
      "learning_rate": 7.099183740852296e-06,
      "loss": 0.2844,
      "step": 340
    },
    {
      "epoch": 1.1182108626198084,
      "grad_norm": 1.5203874111175537,
      "learning_rate": 6.946210449589714e-06,
      "loss": 0.2881,
      "step": 350
    },
    {
      "epoch": 1.1501597444089458,
      "grad_norm": 1.5055556297302246,
      "learning_rate": 6.791058857533814e-06,
      "loss": 0.2746,
      "step": 360
    },
    {
      "epoch": 1.182108626198083,
      "grad_norm": 2.622091293334961,
      "learning_rate": 6.633902618482484e-06,
      "loss": 0.283,
      "step": 370
    },
    {
      "epoch": 1.2140575079872205,
      "grad_norm": 1.4760558605194092,
      "learning_rate": 6.474917629939652e-06,
      "loss": 0.2775,
      "step": 380
    },
    {
      "epoch": 1.2460063897763578,
      "grad_norm": 1.5865486860275269,
      "learning_rate": 6.314281836241573e-06,
      "loss": 0.2881,
      "step": 390
    },
    {
      "epoch": 1.2779552715654952,
      "grad_norm": 1.7005813121795654,
      "learning_rate": 6.1521750293922035e-06,
      "loss": 0.2846,
      "step": 400
    },
    {
      "epoch": 1.3099041533546325,
      "grad_norm": 1.4311730861663818,
      "learning_rate": 5.988778647830554e-06,
      "loss": 0.2871,
      "step": 410
    },
    {
      "epoch": 1.34185303514377,
      "grad_norm": 1.6767691373825073,
      "learning_rate": 5.824275573355278e-06,
      "loss": 0.281,
      "step": 420
    },
    {
      "epoch": 1.3738019169329074,
      "grad_norm": 1.725629448890686,
      "learning_rate": 5.658849926433774e-06,
      "loss": 0.2917,
      "step": 430
    },
    {
      "epoch": 1.4057507987220448,
      "grad_norm": 1.8225221633911133,
      "learning_rate": 5.4926868601249e-06,
      "loss": 0.2755,
      "step": 440
    },
    {
      "epoch": 1.4376996805111821,
      "grad_norm": 1.9555097818374634,
      "learning_rate": 5.325972352845965e-06,
      "loss": 0.2813,
      "step": 450
    },
    {
      "epoch": 1.4696485623003195,
      "grad_norm": 1.4284628629684448,
      "learning_rate": 5.1588930002159255e-06,
      "loss": 0.2819,
      "step": 460
    },
    {
      "epoch": 1.5015974440894568,
      "grad_norm": 1.607223391532898,
      "learning_rate": 4.991635806207788e-06,
      "loss": 0.2734,
      "step": 470
    },
    {
      "epoch": 1.5335463258785942,
      "grad_norm": 1.7128931283950806,
      "learning_rate": 4.824387973843957e-06,
      "loss": 0.2712,
      "step": 480
    },
    {
      "epoch": 1.5654952076677318,
      "grad_norm": 1.7128102779388428,
      "learning_rate": 4.6573366956687885e-06,
      "loss": 0.2638,
      "step": 490
    },
    {
      "epoch": 1.5974440894568689,
      "grad_norm": 1.6286168098449707,
      "learning_rate": 4.4906689442328935e-06,
      "loss": 0.2761,
      "step": 500
    },
    {
      "epoch": 1.6293929712460065,
      "grad_norm": 1.7579160928726196,
      "learning_rate": 4.3245712628236356e-06,
      "loss": 0.2772,
      "step": 510
    },
    {
      "epoch": 1.6613418530351438,
      "grad_norm": 2.3854265213012695,
      "learning_rate": 4.159229556676111e-06,
      "loss": 0.2604,
      "step": 520
    },
    {
      "epoch": 1.6932907348242812,
      "grad_norm": 1.5023325681686401,
      "learning_rate": 3.994828884898267e-06,
      "loss": 0.2842,
      "step": 530
    },
    {
      "epoch": 1.7252396166134185,
      "grad_norm": 1.7911590337753296,
      "learning_rate": 3.8315532533430285e-06,
      "loss": 0.2718,
      "step": 540
    },
    {
      "epoch": 1.7571884984025559,
      "grad_norm": 1.4628454446792603,
      "learning_rate": 3.6695854086593126e-06,
      "loss": 0.28,
      "step": 550
    },
    {
      "epoch": 1.7891373801916934,
      "grad_norm": 1.6940367221832275,
      "learning_rate": 3.509106633752387e-06,
      "loss": 0.285,
      "step": 560
    },
    {
      "epoch": 1.8210862619808306,
      "grad_norm": 1.588935375213623,
      "learning_rate": 3.350296544882543e-06,
      "loss": 0.28,
      "step": 570
    },
    {
      "epoch": 1.8530351437699681,
      "grad_norm": 1.9102530479431152,
      "learning_rate": 3.19333289062915e-06,
      "loss": 0.2878,
      "step": 580
    },
    {
      "epoch": 1.8849840255591053,
      "grad_norm": 1.792137622833252,
      "learning_rate": 3.0383913529451286e-06,
      "loss": 0.2796,
      "step": 590
    },
    {
      "epoch": 1.9169329073482428,
      "grad_norm": 1.5013296604156494,
      "learning_rate": 2.8856453505245018e-06,
      "loss": 0.2599,
      "step": 600
    },
    {
      "epoch": 1.9488817891373802,
      "grad_norm": 1.6300023794174194,
      "learning_rate": 2.7352658447030882e-06,
      "loss": 0.2654,
      "step": 610
    },
    {
      "epoch": 1.9808306709265175,
      "grad_norm": 1.6086463928222656,
      "learning_rate": 2.587421148109619e-06,
      "loss": 0.2619,
      "step": 620
    },
    {
      "epoch": 2.012779552715655,
      "grad_norm": 1.1864526271820068,
      "learning_rate": 2.4422767362814045e-06,
      "loss": 0.2232,
      "step": 630
    },
    {
      "epoch": 2.0447284345047922,
      "grad_norm": 1.4812664985656738,
      "learning_rate": 2.299995062455459e-06,
      "loss": 0.1507,
      "step": 640
    },
    {
      "epoch": 2.07667731629393,
      "grad_norm": 1.800999402999878,
      "learning_rate": 2.16073537574229e-06,
      "loss": 0.1401,
      "step": 650
    },
    {
      "epoch": 2.108626198083067,
      "grad_norm": 1.3780609369277954,
      "learning_rate": 2.0246535428859652e-06,
      "loss": 0.1365,
      "step": 660
    },
    {
      "epoch": 2.1405750798722045,
      "grad_norm": 1.6491374969482422,
      "learning_rate": 1.8919018738098704e-06,
      "loss": 0.1325,
      "step": 670
    },
    {
      "epoch": 2.1725239616613417,
      "grad_norm": 1.7934684753417969,
      "learning_rate": 1.762628951143454e-06,
      "loss": 0.1374,
      "step": 680
    },
    {
      "epoch": 2.2044728434504792,
      "grad_norm": 1.9819718599319458,
      "learning_rate": 1.6369794639207626e-06,
      "loss": 0.1281,
      "step": 690
    },
    {
      "epoch": 2.236421725239617,
      "grad_norm": 1.7123699188232422,
      "learning_rate": 1.5150940456368784e-06,
      "loss": 0.1306,
      "step": 700
    },
    {
      "epoch": 2.268370607028754,
      "grad_norm": 1.4628161191940308,
      "learning_rate": 1.3971091168435463e-06,
      "loss": 0.1312,
      "step": 710
    },
    {
      "epoch": 2.3003194888178915,
      "grad_norm": 1.4270434379577637,
      "learning_rate": 1.2831567324601325e-06,
      "loss": 0.1376,
      "step": 720
    },
    {
      "epoch": 2.3322683706070286,
      "grad_norm": 1.568036675453186,
      "learning_rate": 1.173364433970835e-06,
      "loss": 0.1381,
      "step": 730
    },
    {
      "epoch": 2.364217252396166,
      "grad_norm": 1.4632725715637207,
      "learning_rate": 1.0678551066735671e-06,
      "loss": 0.1442,
      "step": 740
    },
    {
      "epoch": 2.3961661341853033,
      "grad_norm": 1.6986589431762695,
      "learning_rate": 9.66746842140287e-07,
      "loss": 0.1309,
      "step": 750
    },
    {
      "epoch": 2.428115015974441,
      "grad_norm": 1.4087769985198975,
      "learning_rate": 8.701528060427194e-07,
      "loss": 0.1366,
      "step": 760
    },
    {
      "epoch": 2.460063897763578,
      "grad_norm": 1.3985058069229126,
      "learning_rate": 7.781811114913995e-07,
      "loss": 0.1315,
      "step": 770
    },
    {
      "epoch": 2.4920127795527156,
      "grad_norm": 2.15179443359375,
      "learning_rate": 6.909346980298093e-07,
      "loss": 0.1402,
      "step": 780
    },
    {
      "epoch": 2.523961661341853,
      "grad_norm": 1.854887843132019,
      "learning_rate": 6.085112164190466e-07,
      "loss": 0.1409,
      "step": 790
    },
    {
      "epoch": 2.5559105431309903,
      "grad_norm": 1.7915081977844238,
      "learning_rate": 5.310029193419697e-07,
      "loss": 0.1208,
      "step": 800
    },
    {
      "epoch": 2.587859424920128,
      "grad_norm": 1.8391714096069336,
      "learning_rate": 4.5849655814915683e-07,
      "loss": 0.13,
      "step": 810
    },
    {
      "epoch": 2.619808306709265,
      "grad_norm": 1.8333430290222168,
      "learning_rate": 3.9107328576224736e-07,
      "loss": 0.1346,
      "step": 820
    },
    {
      "epoch": 2.6517571884984026,
      "grad_norm": 1.4059584140777588,
      "learning_rate": 3.2880856584333043e-07,
      "loss": 0.1255,
      "step": 830
    },
    {
      "epoch": 2.68370607028754,
      "grad_norm": 1.555797815322876,
      "learning_rate": 2.717720883320685e-07,
      "loss": 0.1314,
      "step": 840
    },
    {
      "epoch": 2.7156549520766773,
      "grad_norm": 1.521854281425476,
      "learning_rate": 2.2002769144504943e-07,
      "loss": 0.1293,
      "step": 850
    },
    {
      "epoch": 2.747603833865815,
      "grad_norm": 1.4914255142211914,
      "learning_rate": 1.7363329022471564e-07,
      "loss": 0.1269,
      "step": 860
    },
    {
      "epoch": 2.779552715654952,
      "grad_norm": 1.473074197769165,
      "learning_rate": 1.3264081171780797e-07,
      "loss": 0.1291,
      "step": 870
    },
    {
      "epoch": 2.8115015974440896,
      "grad_norm": 1.8382492065429688,
      "learning_rate": 9.709613685589314e-08,
      "loss": 0.1232,
      "step": 880
    },
    {
      "epoch": 2.8434504792332267,
      "grad_norm": 2.253413200378418,
      "learning_rate": 6.703904910301929e-08,
      "loss": 0.1278,
      "step": 890
    },
    {
      "epoch": 2.8753993610223643,
      "grad_norm": 1.3528082370758057,
      "learning_rate": 4.250318992797375e-08,
      "loss": 0.1279,
      "step": 900
    },
    {
      "epoch": 2.9073482428115014,
      "grad_norm": 1.2942605018615723,
      "learning_rate": 2.351602115099272e-08,
      "loss": 0.1259,
      "step": 910
    },
    {
      "epoch": 2.939297124600639,
      "grad_norm": 1.6663892269134521,
      "learning_rate": 1.0098794207047402e-08,
      "loss": 0.1332,
      "step": 920
    },
    {
      "epoch": 2.9712460063897765,
      "grad_norm": 1.7137141227722168,
      "learning_rate": 2.2665263601240328e-09,
      "loss": 0.1277,
      "step": 930
    },
    {
      "epoch": 3.0,
      "step": 939,
      "total_flos": 9.762156657969725e+17,
      "train_loss": 0.3061042113806874,
      "train_runtime": 22579.097,
      "train_samples_per_second": 2.66,
      "train_steps_per_second": 0.042
    }
  ],
  "logging_steps": 10,
  "max_steps": 939,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 50.0,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 9.762156657969725e+17,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}