{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 11.194029850746269,
  "eval_steps": 500,
  "global_step": 1500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.07462686567164178,
      "grad_norm": 2.6359663009643555,
      "learning_rate": 9e-07,
      "loss": 0.2956,
      "step": 10
    },
    {
      "epoch": 0.14925373134328357,
      "grad_norm": 4.154141426086426,
      "learning_rate": 1.9e-06,
      "loss": 0.3612,
      "step": 20
    },
    {
      "epoch": 0.22388059701492538,
      "grad_norm": 2.7230818271636963,
      "learning_rate": 2.9e-06,
      "loss": 0.2889,
      "step": 30
    },
    {
      "epoch": 0.29850746268656714,
      "grad_norm": 1.8438748121261597,
      "learning_rate": 3.9e-06,
      "loss": 0.2648,
      "step": 40
    },
    {
      "epoch": 0.373134328358209,
      "grad_norm": 1.8854578733444214,
      "learning_rate": 4.9000000000000005e-06,
      "loss": 0.2208,
      "step": 50
    },
    {
      "epoch": 0.44776119402985076,
      "grad_norm": 1.1787480115890503,
      "learning_rate": 5.9e-06,
      "loss": 0.177,
      "step": 60
    },
    {
      "epoch": 0.5223880597014925,
      "grad_norm": 0.8205132484436035,
      "learning_rate": 6.900000000000001e-06,
      "loss": 0.1031,
      "step": 70
    },
    {
      "epoch": 0.5970149253731343,
      "grad_norm": 0.567926824092865,
      "learning_rate": 7.9e-06,
      "loss": 0.088,
      "step": 80
    },
    {
      "epoch": 0.6716417910447762,
      "grad_norm": 0.570378839969635,
      "learning_rate": 8.9e-06,
      "loss": 0.0625,
      "step": 90
    },
    {
      "epoch": 0.746268656716418,
      "grad_norm": 0.31382572650909424,
      "learning_rate": 9.900000000000002e-06,
      "loss": 0.0502,
      "step": 100
    },
    {
      "epoch": 0.8208955223880597,
      "grad_norm": 0.4518411159515381,
      "learning_rate": 1.09e-05,
      "loss": 0.0448,
      "step": 110
    },
    {
      "epoch": 0.8955223880597015,
      "grad_norm": 0.3404799997806549,
      "learning_rate": 1.19e-05,
      "loss": 0.0512,
      "step": 120
    },
    {
      "epoch": 0.9701492537313433,
      "grad_norm": 0.195379838347435,
      "learning_rate": 1.29e-05,
      "loss": 0.0337,
      "step": 130
    },
    {
      "epoch": 1.044776119402985,
      "grad_norm": 0.30482611060142517,
      "learning_rate": 1.3900000000000002e-05,
      "loss": 0.0367,
      "step": 140
    },
    {
      "epoch": 1.1194029850746268,
      "grad_norm": 0.22756843268871307,
      "learning_rate": 1.49e-05,
      "loss": 0.0375,
      "step": 150
    },
    {
      "epoch": 1.1940298507462686,
      "grad_norm": 0.33363619446754456,
      "learning_rate": 1.59e-05,
      "loss": 0.0374,
      "step": 160
    },
    {
      "epoch": 1.2686567164179103,
      "grad_norm": 0.2732360064983368,
      "learning_rate": 1.69e-05,
      "loss": 0.0293,
      "step": 170
    },
    {
      "epoch": 1.3432835820895521,
      "grad_norm": 0.16427411139011383,
      "learning_rate": 1.79e-05,
      "loss": 0.0294,
      "step": 180
    },
    {
      "epoch": 1.417910447761194,
      "grad_norm": 0.21058405935764313,
      "learning_rate": 1.8900000000000002e-05,
      "loss": 0.0345,
      "step": 190
    },
    {
      "epoch": 1.4925373134328357,
      "grad_norm": 0.27761879563331604,
      "learning_rate": 1.9900000000000003e-05,
      "loss": 0.0309,
      "step": 200
    },
    {
      "epoch": 1.5671641791044775,
      "grad_norm": 0.16202031075954437,
      "learning_rate": 2.09e-05,
      "loss": 0.0306,
      "step": 210
    },
    {
      "epoch": 1.6417910447761193,
      "grad_norm": 0.2178775519132614,
      "learning_rate": 2.19e-05,
      "loss": 0.0269,
      "step": 220
    },
    {
      "epoch": 1.716417910447761,
      "grad_norm": 0.297981858253479,
      "learning_rate": 2.29e-05,
      "loss": 0.0277,
      "step": 230
    },
    {
      "epoch": 1.7910447761194028,
      "grad_norm": 0.3659513592720032,
      "learning_rate": 2.39e-05,
      "loss": 0.027,
      "step": 240
    },
    {
      "epoch": 1.8656716417910446,
      "grad_norm": 0.44367414712905884,
      "learning_rate": 2.4900000000000002e-05,
      "loss": 0.0277,
      "step": 250
    },
    {
      "epoch": 1.9402985074626866,
      "grad_norm": 0.17233391106128693,
      "learning_rate": 2.5900000000000003e-05,
      "loss": 0.0261,
      "step": 260
    },
    {
      "epoch": 2.014925373134328,
      "grad_norm": 0.6397565007209778,
      "learning_rate": 2.6900000000000003e-05,
      "loss": 0.027,
      "step": 270
    },
    {
      "epoch": 2.08955223880597,
      "grad_norm": 0.24160514771938324,
      "learning_rate": 2.7900000000000004e-05,
      "loss": 0.0277,
      "step": 280
    },
    {
      "epoch": 2.1641791044776117,
      "grad_norm": 0.20041881501674652,
      "learning_rate": 2.8899999999999998e-05,
      "loss": 0.0208,
      "step": 290
    },
    {
      "epoch": 2.2388059701492535,
      "grad_norm": 0.2287420779466629,
      "learning_rate": 2.9900000000000002e-05,
      "loss": 0.0248,
      "step": 300
    },
    {
      "epoch": 2.3134328358208958,
      "grad_norm": 0.2654382586479187,
      "learning_rate": 3.09e-05,
      "loss": 0.0235,
      "step": 310
    },
    {
      "epoch": 2.388059701492537,
      "grad_norm": 0.2787795662879944,
      "learning_rate": 3.19e-05,
      "loss": 0.0222,
      "step": 320
    },
    {
      "epoch": 2.4626865671641793,
      "grad_norm": 0.25142550468444824,
      "learning_rate": 3.29e-05,
      "loss": 0.0239,
      "step": 330
    },
    {
      "epoch": 2.5373134328358207,
      "grad_norm": 0.37863755226135254,
      "learning_rate": 3.3900000000000004e-05,
      "loss": 0.0181,
      "step": 340
    },
    {
      "epoch": 2.611940298507463,
      "grad_norm": 0.24035662412643433,
      "learning_rate": 3.49e-05,
      "loss": 0.0224,
      "step": 350
    },
    {
      "epoch": 2.6865671641791042,
      "grad_norm": 0.2062656730413437,
      "learning_rate": 3.59e-05,
      "loss": 0.024,
      "step": 360
    },
    {
      "epoch": 2.7611940298507465,
      "grad_norm": 0.2967865467071533,
      "learning_rate": 3.69e-05,
      "loss": 0.0159,
      "step": 370
    },
    {
      "epoch": 2.835820895522388,
      "grad_norm": 0.19556978344917297,
      "learning_rate": 3.79e-05,
      "loss": 0.0172,
      "step": 380
    },
    {
      "epoch": 2.91044776119403,
      "grad_norm": 0.20358707010746002,
      "learning_rate": 3.8900000000000004e-05,
      "loss": 0.0209,
      "step": 390
    },
    {
      "epoch": 2.9850746268656714,
      "grad_norm": 0.2279907613992691,
      "learning_rate": 3.99e-05,
      "loss": 0.014,
      "step": 400
    },
    {
      "epoch": 3.0597014925373136,
      "grad_norm": 0.23948746919631958,
      "learning_rate": 4.09e-05,
      "loss": 0.018,
      "step": 410
    },
    {
      "epoch": 3.1343283582089554,
      "grad_norm": 0.10130123049020767,
      "learning_rate": 4.19e-05,
      "loss": 0.0209,
      "step": 420
    },
    {
      "epoch": 3.208955223880597,
      "grad_norm": 0.21031032502651215,
      "learning_rate": 4.29e-05,
      "loss": 0.0195,
      "step": 430
    },
    {
      "epoch": 3.283582089552239,
      "grad_norm": 0.18734681606292725,
      "learning_rate": 4.39e-05,
      "loss": 0.0179,
      "step": 440
    },
    {
      "epoch": 3.3582089552238807,
      "grad_norm": 0.148833230137825,
      "learning_rate": 4.49e-05,
      "loss": 0.0172,
      "step": 450
    },
    {
      "epoch": 3.4328358208955225,
      "grad_norm": 0.17836610972881317,
      "learning_rate": 4.5900000000000004e-05,
      "loss": 0.0149,
      "step": 460
    },
    {
      "epoch": 3.5074626865671643,
      "grad_norm": 0.2201811820268631,
      "learning_rate": 4.69e-05,
      "loss": 0.0186,
      "step": 470
    },
    {
      "epoch": 3.582089552238806,
      "grad_norm": 0.14344210922718048,
      "learning_rate": 4.79e-05,
      "loss": 0.0141,
      "step": 480
    },
    {
      "epoch": 3.656716417910448,
      "grad_norm": 0.14173491299152374,
      "learning_rate": 4.89e-05,
      "loss": 0.0164,
      "step": 490
    },
    {
      "epoch": 3.7313432835820897,
      "grad_norm": 0.10802058130502701,
      "learning_rate": 4.99e-05,
      "loss": 0.0116,
      "step": 500
    },
    {
      "epoch": 3.8059701492537314,
      "grad_norm": 0.15126974880695343,
      "learning_rate": 5.0900000000000004e-05,
      "loss": 0.0141,
      "step": 510
    },
    {
      "epoch": 3.8805970149253732,
      "grad_norm": 0.23868845403194427,
      "learning_rate": 5.19e-05,
      "loss": 0.014,
      "step": 520
    },
    {
      "epoch": 3.955223880597015,
      "grad_norm": 0.3107485771179199,
      "learning_rate": 5.2900000000000005e-05,
      "loss": 0.0167,
      "step": 530
    },
    {
      "epoch": 4.029850746268656,
      "grad_norm": 0.16036587953567505,
      "learning_rate": 5.390000000000001e-05,
      "loss": 0.0172,
      "step": 540
    },
    {
      "epoch": 4.104477611940299,
      "grad_norm": 0.10766558349132538,
      "learning_rate": 5.4900000000000006e-05,
      "loss": 0.0123,
      "step": 550
    },
    {
      "epoch": 4.17910447761194,
      "grad_norm": 0.15585613250732422,
      "learning_rate": 5.590000000000001e-05,
      "loss": 0.014,
      "step": 560
    },
    {
      "epoch": 4.253731343283582,
      "grad_norm": 0.12563645839691162,
      "learning_rate": 5.69e-05,
      "loss": 0.0168,
      "step": 570
    },
    {
      "epoch": 4.3283582089552235,
      "grad_norm": 0.2879914939403534,
      "learning_rate": 5.79e-05,
      "loss": 0.0172,
      "step": 580
    },
    {
      "epoch": 4.402985074626866,
      "grad_norm": 0.3350052237510681,
      "learning_rate": 5.89e-05,
      "loss": 0.0133,
      "step": 590
    },
    {
      "epoch": 4.477611940298507,
      "grad_norm": 0.20012852549552917,
      "learning_rate": 5.99e-05,
      "loss": 0.0132,
      "step": 600
    },
    {
      "epoch": 4.552238805970149,
      "grad_norm": 0.18231847882270813,
      "learning_rate": 6.09e-05,
      "loss": 0.0183,
      "step": 610
    },
    {
      "epoch": 4.6268656716417915,
      "grad_norm": 0.1492220014333725,
      "learning_rate": 6.19e-05,
      "loss": 0.0158,
      "step": 620
    },
    {
      "epoch": 4.701492537313433,
      "grad_norm": 0.11610173434019089,
      "learning_rate": 6.29e-05,
      "loss": 0.0121,
      "step": 630
    },
    {
      "epoch": 4.776119402985074,
      "grad_norm": 0.1784248948097229,
      "learning_rate": 6.390000000000001e-05,
      "loss": 0.0118,
      "step": 640
    },
    {
      "epoch": 4.850746268656716,
      "grad_norm": 0.170172318816185,
      "learning_rate": 6.49e-05,
      "loss": 0.0122,
      "step": 650
    },
    {
      "epoch": 4.925373134328359,
      "grad_norm": 0.16932718455791473,
      "learning_rate": 6.59e-05,
      "loss": 0.0122,
      "step": 660
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.2288501113653183,
      "learning_rate": 6.690000000000001e-05,
      "loss": 0.0103,
      "step": 670
    },
    {
      "epoch": 5.074626865671641,
      "grad_norm": 0.14068059623241425,
      "learning_rate": 6.790000000000001e-05,
      "loss": 0.0147,
      "step": 680
    },
    {
      "epoch": 5.149253731343284,
      "grad_norm": 0.11700227111577988,
      "learning_rate": 6.89e-05,
      "loss": 0.009,
      "step": 690
    },
    {
      "epoch": 5.223880597014926,
      "grad_norm": 0.1194658875465393,
      "learning_rate": 6.99e-05,
      "loss": 0.0094,
      "step": 700
    },
    {
      "epoch": 5.298507462686567,
      "grad_norm": 0.13679799437522888,
      "learning_rate": 7.09e-05,
      "loss": 0.0148,
      "step": 710
    },
    {
      "epoch": 5.373134328358209,
      "grad_norm": 0.13276909291744232,
      "learning_rate": 7.19e-05,
      "loss": 0.0118,
      "step": 720
    },
    {
      "epoch": 5.447761194029851,
      "grad_norm": 0.094159796833992,
      "learning_rate": 7.29e-05,
      "loss": 0.0124,
      "step": 730
    },
    {
      "epoch": 5.522388059701493,
      "grad_norm": 0.16352857649326324,
      "learning_rate": 7.390000000000001e-05,
      "loss": 0.0111,
      "step": 740
    },
    {
      "epoch": 5.597014925373134,
      "grad_norm": 0.14447464048862457,
      "learning_rate": 7.49e-05,
      "loss": 0.0124,
      "step": 750
    },
    {
      "epoch": 5.6716417910447765,
      "grad_norm": 0.16974638402462006,
      "learning_rate": 7.59e-05,
      "loss": 0.0099,
      "step": 760
    },
    {
      "epoch": 5.746268656716418,
      "grad_norm": 0.1150907427072525,
      "learning_rate": 7.69e-05,
      "loss": 0.0104,
      "step": 770
    },
    {
      "epoch": 5.82089552238806,
      "grad_norm": 0.17046909034252167,
      "learning_rate": 7.790000000000001e-05,
      "loss": 0.0126,
      "step": 780
    },
    {
      "epoch": 5.895522388059701,
      "grad_norm": 0.10910604149103165,
      "learning_rate": 7.890000000000001e-05,
      "loss": 0.0074,
      "step": 790
    },
    {
      "epoch": 5.970149253731344,
      "grad_norm": 0.11517564207315445,
      "learning_rate": 7.99e-05,
      "loss": 0.0116,
      "step": 800
    },
    {
      "epoch": 6.044776119402985,
      "grad_norm": 0.18701259791851044,
      "learning_rate": 8.090000000000001e-05,
      "loss": 0.0072,
      "step": 810
    },
    {
      "epoch": 6.119402985074627,
      "grad_norm": 0.18763042986392975,
      "learning_rate": 8.19e-05,
      "loss": 0.0143,
      "step": 820
    },
    {
      "epoch": 6.1940298507462686,
      "grad_norm": 0.16200491786003113,
      "learning_rate": 8.29e-05,
      "loss": 0.0116,
      "step": 830
    },
    {
      "epoch": 6.268656716417911,
      "grad_norm": 0.10211509466171265,
      "learning_rate": 8.39e-05,
      "loss": 0.0094,
      "step": 840
    },
    {
      "epoch": 6.343283582089552,
      "grad_norm": 0.11517218500375748,
      "learning_rate": 8.49e-05,
      "loss": 0.01,
      "step": 850
    },
    {
      "epoch": 6.417910447761194,
      "grad_norm": 0.11806347966194153,
      "learning_rate": 8.59e-05,
      "loss": 0.0128,
      "step": 860
    },
    {
      "epoch": 6.492537313432836,
      "grad_norm": 0.1730322539806366,
      "learning_rate": 8.69e-05,
      "loss": 0.009,
      "step": 870
    },
    {
      "epoch": 6.567164179104478,
      "grad_norm": 0.20324429869651794,
      "learning_rate": 8.790000000000001e-05,
      "loss": 0.0096,
      "step": 880
    },
    {
      "epoch": 6.641791044776119,
      "grad_norm": 0.22341865301132202,
      "learning_rate": 8.89e-05,
      "loss": 0.0108,
      "step": 890
    },
    {
      "epoch": 6.7164179104477615,
      "grad_norm": 0.14523452520370483,
      "learning_rate": 8.99e-05,
      "loss": 0.0093,
      "step": 900
    },
    {
      "epoch": 6.791044776119403,
      "grad_norm": 0.2617056369781494,
      "learning_rate": 9.090000000000001e-05,
      "loss": 0.0091,
      "step": 910
    },
    {
      "epoch": 6.865671641791045,
      "grad_norm": 0.13474082946777344,
      "learning_rate": 9.190000000000001e-05,
      "loss": 0.0164,
      "step": 920
    },
    {
      "epoch": 6.940298507462686,
      "grad_norm": 0.1422346830368042,
      "learning_rate": 9.290000000000001e-05,
      "loss": 0.0144,
      "step": 930
    },
    {
      "epoch": 7.014925373134329,
      "grad_norm": 0.11779774725437164,
      "learning_rate": 9.39e-05,
      "loss": 0.0095,
      "step": 940
    },
    {
      "epoch": 7.08955223880597,
      "grad_norm": 0.07547947764396667,
      "learning_rate": 9.49e-05,
      "loss": 0.0077,
      "step": 950
    },
    {
      "epoch": 7.164179104477612,
      "grad_norm": 0.13442008197307587,
      "learning_rate": 9.59e-05,
      "loss": 0.0125,
      "step": 960
    },
    {
      "epoch": 7.2388059701492535,
      "grad_norm": 0.1359054297208786,
      "learning_rate": 9.69e-05,
      "loss": 0.0069,
      "step": 970
    },
    {
      "epoch": 7.313432835820896,
      "grad_norm": 0.15389667451381683,
      "learning_rate": 9.790000000000001e-05,
      "loss": 0.0091,
      "step": 980
    },
    {
      "epoch": 7.388059701492537,
      "grad_norm": 0.15574434399604797,
      "learning_rate": 9.89e-05,
      "loss": 0.0079,
      "step": 990
    },
    {
      "epoch": 7.462686567164179,
      "grad_norm": 0.15894819796085358,
      "learning_rate": 9.99e-05,
      "loss": 0.0108,
      "step": 1000
    },
    {
      "epoch": 7.537313432835821,
      "grad_norm": 0.10361112654209137,
      "learning_rate": 9.999994463727085e-05,
      "loss": 0.0095,
      "step": 1010
    },
    {
      "epoch": 7.611940298507463,
      "grad_norm": 0.11922885477542877,
      "learning_rate": 9.999975326009292e-05,
      "loss": 0.0068,
      "step": 1020
    },
    {
      "epoch": 7.686567164179104,
      "grad_norm": 0.10447347909212112,
      "learning_rate": 9.999942518549879e-05,
      "loss": 0.0104,
      "step": 1030
    },
    {
      "epoch": 7.7611940298507465,
      "grad_norm": 0.10702274739742279,
      "learning_rate": 9.999896041438544e-05,
      "loss": 0.0074,
      "step": 1040
    },
    {
      "epoch": 7.835820895522388,
      "grad_norm": 0.0975528284907341,
      "learning_rate": 9.999835894802353e-05,
      "loss": 0.0077,
      "step": 1050
    },
    {
      "epoch": 7.91044776119403,
      "grad_norm": 0.16120749711990356,
      "learning_rate": 9.999762078805743e-05,
      "loss": 0.0115,
      "step": 1060
    },
    {
      "epoch": 7.985074626865671,
      "grad_norm": 0.12331589311361313,
      "learning_rate": 9.999674593650526e-05,
      "loss": 0.0078,
      "step": 1070
    },
    {
      "epoch": 8.059701492537313,
      "grad_norm": 0.1256706714630127,
      "learning_rate": 9.99957343957588e-05,
      "loss": 0.008,
      "step": 1080
    },
    {
      "epoch": 8.134328358208956,
      "grad_norm": 0.12802068889141083,
      "learning_rate": 9.99945861685836e-05,
      "loss": 0.0068,
      "step": 1090
    },
    {
      "epoch": 8.208955223880597,
      "grad_norm": 0.1550874561071396,
      "learning_rate": 9.999330125811884e-05,
      "loss": 0.0104,
      "step": 1100
    },
    {
      "epoch": 8.283582089552239,
      "grad_norm": 0.10126641392707825,
      "learning_rate": 9.999187966787744e-05,
      "loss": 0.0064,
      "step": 1110
    },
    {
      "epoch": 8.35820895522388,
      "grad_norm": 0.12046301364898682,
      "learning_rate": 9.999032140174595e-05,
      "loss": 0.0072,
      "step": 1120
    },
    {
      "epoch": 8.432835820895523,
      "grad_norm": 0.21631024777889252,
      "learning_rate": 9.998862646398464e-05,
      "loss": 0.0091,
      "step": 1130
    },
    {
      "epoch": 8.507462686567164,
      "grad_norm": 0.16928455233573914,
      "learning_rate": 9.998679485922739e-05,
      "loss": 0.006,
      "step": 1140
    },
    {
      "epoch": 8.582089552238806,
      "grad_norm": 0.1493779420852661,
      "learning_rate": 9.998482659248174e-05,
      "loss": 0.009,
      "step": 1150
    },
    {
      "epoch": 8.656716417910447,
      "grad_norm": 0.15652231872081757,
      "learning_rate": 9.998272166912883e-05,
      "loss": 0.0066,
      "step": 1160
    },
    {
      "epoch": 8.73134328358209,
      "grad_norm": 0.1458636224269867,
      "learning_rate": 9.998048009492347e-05,
      "loss": 0.0079,
      "step": 1170
    },
    {
      "epoch": 8.805970149253731,
      "grad_norm": 0.11824473738670349,
      "learning_rate": 9.997810187599403e-05,
      "loss": 0.0068,
      "step": 1180
    },
    {
      "epoch": 8.880597014925373,
      "grad_norm": 0.09037546068429947,
      "learning_rate": 9.997558701884249e-05,
      "loss": 0.0089,
      "step": 1190
    },
    {
      "epoch": 8.955223880597014,
      "grad_norm": 0.1596665233373642,
      "learning_rate": 9.997293553034433e-05,
      "loss": 0.0103,
      "step": 1200
    },
    {
      "epoch": 9.029850746268657,
      "grad_norm": 0.1774168163537979,
      "learning_rate": 9.997014741774866e-05,
      "loss": 0.0098,
      "step": 1210
    },
    {
      "epoch": 9.104477611940299,
      "grad_norm": 0.17946547269821167,
      "learning_rate": 9.996722268867803e-05,
      "loss": 0.0086,
      "step": 1220
    },
    {
      "epoch": 9.17910447761194,
      "grad_norm": 0.1658417135477066,
      "learning_rate": 9.996416135112858e-05,
      "loss": 0.0073,
      "step": 1230
    },
    {
      "epoch": 9.253731343283581,
      "grad_norm": 0.09881595522165298,
      "learning_rate": 9.996096341346988e-05,
      "loss": 0.007,
      "step": 1240
    },
    {
      "epoch": 9.328358208955224,
      "grad_norm": 0.16437676548957825,
      "learning_rate": 9.995762888444495e-05,
      "loss": 0.0071,
      "step": 1250
    },
    {
      "epoch": 9.402985074626866,
      "grad_norm": 0.13102124631404877,
      "learning_rate": 9.995415777317027e-05,
      "loss": 0.0054,
      "step": 1260
    },
    {
      "epoch": 9.477611940298507,
      "grad_norm": 0.12780794501304626,
      "learning_rate": 9.995055008913574e-05,
      "loss": 0.0042,
      "step": 1270
    },
    {
      "epoch": 9.552238805970148,
      "grad_norm": 0.13209404051303864,
      "learning_rate": 9.994680584220463e-05,
      "loss": 0.0082,
      "step": 1280
    },
    {
      "epoch": 9.626865671641792,
      "grad_norm": 0.1280508041381836,
      "learning_rate": 9.994292504261355e-05,
      "loss": 0.0064,
      "step": 1290
    },
    {
      "epoch": 9.701492537313433,
      "grad_norm": 0.1415010243654251,
      "learning_rate": 9.993890770097247e-05,
      "loss": 0.0077,
      "step": 1300
    },
    {
      "epoch": 9.776119402985074,
      "grad_norm": 0.0671149417757988,
      "learning_rate": 9.993475382826467e-05,
      "loss": 0.0064,
      "step": 1310
    },
    {
      "epoch": 9.850746268656717,
      "grad_norm": 0.08181161433458328,
      "learning_rate": 9.993046343584664e-05,
      "loss": 0.0057,
      "step": 1320
    },
    {
      "epoch": 9.925373134328359,
      "grad_norm": 0.0873553529381752,
      "learning_rate": 9.992603653544816e-05,
      "loss": 0.0111,
      "step": 1330
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.1723933219909668,
      "learning_rate": 9.992147313917222e-05,
      "loss": 0.0099,
      "step": 1340
    },
    {
      "epoch": 10.074626865671641,
      "grad_norm": 0.1347159594297409,
      "learning_rate": 9.991677325949497e-05,
      "loss": 0.0092,
      "step": 1350
    },
    {
      "epoch": 10.149253731343283,
      "grad_norm": 0.12214215844869614,
      "learning_rate": 9.991193690926568e-05,
      "loss": 0.0064,
      "step": 1360
    },
    {
      "epoch": 10.223880597014926,
      "grad_norm": 0.0954289585351944,
      "learning_rate": 9.990696410170678e-05,
      "loss": 0.0115,
      "step": 1370
    },
    {
      "epoch": 10.298507462686567,
      "grad_norm": 0.12168958783149719,
      "learning_rate": 9.990185485041371e-05,
      "loss": 0.0068,
      "step": 1380
    },
    {
      "epoch": 10.373134328358208,
      "grad_norm": 0.11939054727554321,
      "learning_rate": 9.989660916935498e-05,
      "loss": 0.0062,
      "step": 1390
    },
    {
      "epoch": 10.447761194029852,
      "grad_norm": 0.09408388286828995,
      "learning_rate": 9.989122707287208e-05,
      "loss": 0.0089,
      "step": 1400
    },
    {
      "epoch": 10.522388059701493,
      "grad_norm": 0.10327029973268509,
      "learning_rate": 9.988570857567945e-05,
      "loss": 0.0129,
      "step": 1410
    },
    {
      "epoch": 10.597014925373134,
      "grad_norm": 0.17698495090007782,
      "learning_rate": 9.988005369286446e-05,
      "loss": 0.0082,
      "step": 1420
    },
    {
      "epoch": 10.671641791044776,
      "grad_norm": 0.13924898207187653,
      "learning_rate": 9.987426243988734e-05,
      "loss": 0.0092,
      "step": 1430
    },
    {
      "epoch": 10.746268656716419,
      "grad_norm": 0.12503832578659058,
      "learning_rate": 9.986833483258114e-05,
      "loss": 0.0065,
      "step": 1440
    },
    {
      "epoch": 10.82089552238806,
      "grad_norm": 0.10239670425653458,
      "learning_rate": 9.986227088715173e-05,
      "loss": 0.0067,
      "step": 1450
    },
    {
      "epoch": 10.895522388059701,
      "grad_norm": 0.09500975906848907,
      "learning_rate": 9.98560706201777e-05,
      "loss": 0.0112,
      "step": 1460
    },
    {
      "epoch": 10.970149253731343,
      "grad_norm": 0.1916763335466385,
      "learning_rate": 9.984973404861036e-05,
      "loss": 0.0051,
      "step": 1470
    },
    {
      "epoch": 11.044776119402986,
      "grad_norm": 0.15825192630290985,
      "learning_rate": 9.984326118977361e-05,
      "loss": 0.0079,
      "step": 1480
    },
    {
      "epoch": 11.119402985074627,
      "grad_norm": 0.19664017856121063,
      "learning_rate": 9.983665206136406e-05,
      "loss": 0.0052,
      "step": 1490
    },
    {
      "epoch": 11.194029850746269,
      "grad_norm": 0.12272711098194122,
      "learning_rate": 9.982990668145075e-05,
      "loss": 0.0086,
      "step": 1500
    }
  ],
  "logging_steps": 10,
  "max_steps": 20000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 150,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 3.3585337048988736e+16,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}
|
|