| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.273117937291103, | |
| "eval_steps": 500, | |
| "global_step": 2000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.006366385484641095, | |
| "grad_norm": 28.7295833201599, | |
| "learning_rate": 0.00011219390703061137, | |
| "loss": 6.6223, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.01273277096928219, | |
| "grad_norm": 15.081303427956621, | |
| "learning_rate": 0.00014596763837756142, | |
| "loss": 2.7812, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.019099156453923283, | |
| "grad_norm": 17.081283324526982, | |
| "learning_rate": 0.00016572400472495783, | |
| "loss": 2.6574, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.02546554193856438, | |
| "grad_norm": 15.201762160364256, | |
| "learning_rate": 0.00017974136972451145, | |
| "loss": 2.9039, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.031831927423205474, | |
| "grad_norm": 11.557044296106628, | |
| "learning_rate": 0.0001906140827142727, | |
| "loss": 2.9066, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.03819831290784657, | |
| "grad_norm": 14.915432761425704, | |
| "learning_rate": 0.00019949773607190786, | |
| "loss": 2.8914, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.04456469839248767, | |
| "grad_norm": 8.745902020340656, | |
| "learning_rate": 0.0002070087579637228, | |
| "loss": 2.9014, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.05093108387712876, | |
| "grad_norm": 8.857658675771324, | |
| "learning_rate": 0.0002135151010714615, | |
| "loss": 2.9293, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.057297469361769854, | |
| "grad_norm": 11.48593285000581, | |
| "learning_rate": 0.0002192541024193043, | |
| "loss": 2.8625, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.06366385484641095, | |
| "grad_norm": 7.815960040918993, | |
| "learning_rate": 0.00022438781406122275, | |
| "loss": 3.0063, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.07003024033105204, | |
| "grad_norm": 8.632620514774269, | |
| "learning_rate": 0.00022903182113161202, | |
| "loss": 2.9527, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.07639662581569313, | |
| "grad_norm": 7.955348207734164, | |
| "learning_rate": 0.00023327146741885792, | |
| "loss": 3.0082, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.08276301130033424, | |
| "grad_norm": 7.216941061016277, | |
| "learning_rate": 0.00023717156393669215, | |
| "loss": 2.7871, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.08912939678497533, | |
| "grad_norm": 5.822424121539524, | |
| "learning_rate": 0.00024078248931067283, | |
| "loss": 2.7844, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.09549578226961643, | |
| "grad_norm": 7.906464628684026, | |
| "learning_rate": 0.00024414418040861915, | |
| "loss": 2.8514, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.10186216775425752, | |
| "grad_norm": 7.514192904534888, | |
| "learning_rate": 0.00024728883241841157, | |
| "loss": 3.0086, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.10822855323889861, | |
| "grad_norm": 6.425203024033055, | |
| "learning_rate": 0.0002502427789216415, | |
| "loss": 2.8754, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.11459493872353971, | |
| "grad_norm": 7.088171053123606, | |
| "learning_rate": 0.00025302783376625435, | |
| "loss": 2.9094, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.1209613242081808, | |
| "grad_norm": 6.538392044337595, | |
| "learning_rate": 0.00025566226965097254, | |
| "loss": 2.9262, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.1273277096928219, | |
| "grad_norm": 8.672338929603391, | |
| "learning_rate": 0.0002581615454081728, | |
| "loss": 2.8391, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.133694095177463, | |
| "grad_norm": 7.204443295582082, | |
| "learning_rate": 0.00026053885565806924, | |
| "loss": 2.9121, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.14006048066210408, | |
| "grad_norm": 6.463760927732387, | |
| "learning_rate": 0.0002628055524785621, | |
| "loss": 2.9268, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.1464268661467452, | |
| "grad_norm": 6.869739493367925, | |
| "learning_rate": 0.0002649714732657648, | |
| "loss": 2.8965, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.15279325163138627, | |
| "grad_norm": 5.613452359830457, | |
| "learning_rate": 0.00026704519876580795, | |
| "loss": 2.9688, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.15915963711602737, | |
| "grad_norm": 5.7575322101372866, | |
| "learning_rate": 0.000269034258397934, | |
| "loss": 2.9008, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.16552602260066848, | |
| "grad_norm": 7.337481922102622, | |
| "learning_rate": 0.00027094529528364224, | |
| "loss": 2.9398, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.17189240808530956, | |
| "grad_norm": 6.95242288679829, | |
| "learning_rate": 0.00027278420011365073, | |
| "loss": 2.8973, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.17825879356995067, | |
| "grad_norm": 4.554710482953035, | |
| "learning_rate": 0.00027455622065762283, | |
| "loss": 2.9867, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.18462517905459175, | |
| "grad_norm": 7.670629856220385, | |
| "learning_rate": 0.00027626605204863905, | |
| "loss": 2.9191, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.19099156453923286, | |
| "grad_norm": 6.140158435864239, | |
| "learning_rate": 0.0002779179117555692, | |
| "loss": 2.8199, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.19735795002387393, | |
| "grad_norm": 6.06880330501044, | |
| "learning_rate": 0.00027951560225766885, | |
| "loss": 2.959, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.20372433550851504, | |
| "grad_norm": 5.932458703590213, | |
| "learning_rate": 0.0002810625637653616, | |
| "loss": 2.9984, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.21009072099315615, | |
| "grad_norm": 5.413884214713577, | |
| "learning_rate": 0.0002825619188259585, | |
| "loss": 2.8937, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.21645710647779723, | |
| "grad_norm": 5.6133458231383315, | |
| "learning_rate": 0.00028401651026859154, | |
| "loss": 2.8844, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.22282349196243834, | |
| "grad_norm": 6.157515185075721, | |
| "learning_rate": 0.00028542893364738413, | |
| "loss": 2.9137, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.22918987744707942, | |
| "grad_norm": 6.150435968994946, | |
| "learning_rate": 0.0002868015651132044, | |
| "loss": 2.8086, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.23555626293172052, | |
| "grad_norm": 6.1391865377467205, | |
| "learning_rate": 0.00028813658546582825, | |
| "loss": 2.9246, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.2419226484163616, | |
| "grad_norm": 4.720212170517568, | |
| "learning_rate": 0.0002894360009979226, | |
| "loss": 2.7273, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.2482890339010027, | |
| "grad_norm": 5.221438084968306, | |
| "learning_rate": 0.0002907016616310386, | |
| "loss": 2.8332, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.2546554193856438, | |
| "grad_norm": 5.397389581466363, | |
| "learning_rate": 0.00029193527675512284, | |
| "loss": 2.776, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.26102180487028487, | |
| "grad_norm": 7.525981812223918, | |
| "learning_rate": 0.0002931384291118962, | |
| "loss": 2.8727, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.267388190354926, | |
| "grad_norm": 5.7116155190893805, | |
| "learning_rate": 0.00029431258700501927, | |
| "loss": 2.9766, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.2737545758395671, | |
| "grad_norm": 7.012023587035556, | |
| "learning_rate": 0.00029545911507334384, | |
| "loss": 2.7902, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.28012096132420816, | |
| "grad_norm": 4.8865448559050435, | |
| "learning_rate": 0.0002965792838255121, | |
| "loss": 2.6873, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.2864873468088493, | |
| "grad_norm": 4.682656054807096, | |
| "learning_rate": 0.0002976742781029656, | |
| "loss": 2.8498, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.2928537322934904, | |
| "grad_norm": 4.114956894922335, | |
| "learning_rate": 0.00029874520461271485, | |
| "loss": 2.7461, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.29922011777813146, | |
| "grad_norm": 4.388435500348782, | |
| "learning_rate": 0.0002997930986499357, | |
| "loss": 2.9004, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.30558650326277254, | |
| "grad_norm": 4.06992963685517, | |
| "learning_rate": 0.0002995048337656213, | |
| "loss": 2.8203, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.31195288874741367, | |
| "grad_norm": 5.7122354924160375, | |
| "learning_rate": 0.0002987974534307946, | |
| "loss": 2.8477, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.31831927423205475, | |
| "grad_norm": 4.543743759310822, | |
| "learning_rate": 0.0002980900730959679, | |
| "loss": 2.8168, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.32468565971669583, | |
| "grad_norm": 5.2449822235737384, | |
| "learning_rate": 0.0002973826927611412, | |
| "loss": 2.7021, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.33105204520133696, | |
| "grad_norm": 6.42302117474529, | |
| "learning_rate": 0.0002966753124263145, | |
| "loss": 2.8545, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.33741843068597804, | |
| "grad_norm": 6.4116258183908945, | |
| "learning_rate": 0.0002959679320914878, | |
| "loss": 2.8436, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.3437848161706191, | |
| "grad_norm": 4.982523320381435, | |
| "learning_rate": 0.00029526055175666113, | |
| "loss": 2.7607, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.3501512016552602, | |
| "grad_norm": 4.815920229901925, | |
| "learning_rate": 0.00029455317142183443, | |
| "loss": 2.734, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.35651758713990134, | |
| "grad_norm": 9.300484207985622, | |
| "learning_rate": 0.00029384579108700774, | |
| "loss": 2.7889, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.3628839726245424, | |
| "grad_norm": 4.5559522343853285, | |
| "learning_rate": 0.00029313841075218105, | |
| "loss": 2.7531, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.3692503581091835, | |
| "grad_norm": 5.351641807589207, | |
| "learning_rate": 0.0002924310304173544, | |
| "loss": 2.7451, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.37561674359382463, | |
| "grad_norm": 5.696588702079196, | |
| "learning_rate": 0.00029172365008252766, | |
| "loss": 2.7189, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.3819831290784657, | |
| "grad_norm": 4.694068773418385, | |
| "learning_rate": 0.00029101626974770096, | |
| "loss": 2.6096, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.3883495145631068, | |
| "grad_norm": 9.013555425033168, | |
| "learning_rate": 0.0002903088894128743, | |
| "loss": 2.6605, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.39471590004774787, | |
| "grad_norm": 10.964569620769009, | |
| "learning_rate": 0.00028960150907804763, | |
| "loss": 2.7258, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.401082285532389, | |
| "grad_norm": 8.94321766980447, | |
| "learning_rate": 0.00028889412874322093, | |
| "loss": 2.6688, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.4074486710170301, | |
| "grad_norm": 10.742420289850541, | |
| "learning_rate": 0.0002881867484083942, | |
| "loss": 2.573, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.41381505650167116, | |
| "grad_norm": 8.890111021830462, | |
| "learning_rate": 0.00028747936807356755, | |
| "loss": 2.5846, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.4201814419863123, | |
| "grad_norm": 27.744833393643688, | |
| "learning_rate": 0.00028677198773874085, | |
| "loss": 2.5406, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.4265478274709534, | |
| "grad_norm": 8.283239589638123, | |
| "learning_rate": 0.00028606460740391416, | |
| "loss": 2.602, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.43291421295559446, | |
| "grad_norm": 7.738262298672388, | |
| "learning_rate": 0.00028535722706908746, | |
| "loss": 2.6947, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.43928059844023554, | |
| "grad_norm": 11.536106786052837, | |
| "learning_rate": 0.00028464984673426077, | |
| "loss": 2.6422, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.44564698392487667, | |
| "grad_norm": 7.93060742031869, | |
| "learning_rate": 0.0002839424663994341, | |
| "loss": 2.5859, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.45201336940951775, | |
| "grad_norm": 4.031662354294752, | |
| "learning_rate": 0.0002832350860646074, | |
| "loss": 2.5947, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.45837975489415883, | |
| "grad_norm": 5.258278074052536, | |
| "learning_rate": 0.0002825277057297807, | |
| "loss": 2.64, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.4647461403787999, | |
| "grad_norm": 4.323834597980534, | |
| "learning_rate": 0.000281820325394954, | |
| "loss": 2.6949, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.47111252586344105, | |
| "grad_norm": 5.0167173121086135, | |
| "learning_rate": 0.0002811129450601273, | |
| "loss": 2.6221, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.4774789113480821, | |
| "grad_norm": 5.933856548166221, | |
| "learning_rate": 0.0002804055647253006, | |
| "loss": 2.632, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.4838452968327232, | |
| "grad_norm": 5.090852518324657, | |
| "learning_rate": 0.0002796981843904739, | |
| "loss": 2.8242, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.49021168231736434, | |
| "grad_norm": 4.60199924467046, | |
| "learning_rate": 0.0002789908040556472, | |
| "loss": 2.6357, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.4965780678020054, | |
| "grad_norm": 4.62334978934682, | |
| "learning_rate": 0.0002782834237208206, | |
| "loss": 2.6455, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.5029444532866465, | |
| "grad_norm": 5.7098868621282834, | |
| "learning_rate": 0.0002775760433859938, | |
| "loss": 2.7822, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.5093108387712876, | |
| "grad_norm": 5.492909395839608, | |
| "learning_rate": 0.00027686866305116713, | |
| "loss": 2.5965, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.5156772242559287, | |
| "grad_norm": 5.007751033932614, | |
| "learning_rate": 0.00027616128271634044, | |
| "loss": 2.6355, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 0.5220436097405697, | |
| "grad_norm": 6.170648429353342, | |
| "learning_rate": 0.0002754539023815138, | |
| "loss": 2.6123, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 0.5284099952252109, | |
| "grad_norm": 4.235113061064731, | |
| "learning_rate": 0.0002747465220466871, | |
| "loss": 2.6453, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 0.534776380709852, | |
| "grad_norm": 4.538569485793573, | |
| "learning_rate": 0.00027403914171186036, | |
| "loss": 2.6713, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 0.5411427661944931, | |
| "grad_norm": 4.019460059963649, | |
| "learning_rate": 0.0002733317613770337, | |
| "loss": 2.5943, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.5475091516791342, | |
| "grad_norm": 6.254490760667432, | |
| "learning_rate": 0.000272624381042207, | |
| "loss": 2.5418, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 0.5538755371637752, | |
| "grad_norm": 4.520259916472461, | |
| "learning_rate": 0.0002719170007073803, | |
| "loss": 2.584, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 0.5602419226484163, | |
| "grad_norm": 5.065731135931019, | |
| "learning_rate": 0.0002712096203725536, | |
| "loss": 2.6723, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.5666083081330574, | |
| "grad_norm": 3.833075034277501, | |
| "learning_rate": 0.00027050224003772694, | |
| "loss": 2.5248, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 0.5729746936176986, | |
| "grad_norm": 5.507080157861899, | |
| "learning_rate": 0.00026979485970290024, | |
| "loss": 2.6379, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.5793410791023397, | |
| "grad_norm": 14.128248882331409, | |
| "learning_rate": 0.00026908747936807355, | |
| "loss": 2.5904, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 0.5857074645869808, | |
| "grad_norm": 6.1778278241357265, | |
| "learning_rate": 0.00026838009903324686, | |
| "loss": 2.5844, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 0.5920738500716218, | |
| "grad_norm": 8.064848357512728, | |
| "learning_rate": 0.00026767271869842016, | |
| "loss": 2.6055, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 0.5984402355562629, | |
| "grad_norm": 6.16438539022089, | |
| "learning_rate": 0.00026696533836359347, | |
| "loss": 2.5506, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 0.604806621040904, | |
| "grad_norm": 5.882124357483614, | |
| "learning_rate": 0.0002662579580287668, | |
| "loss": 2.4973, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.6111730065255451, | |
| "grad_norm": 8.373991467730166, | |
| "learning_rate": 0.0002655505776939401, | |
| "loss": 2.5428, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.6175393920101863, | |
| "grad_norm": 6.831051520016273, | |
| "learning_rate": 0.0002648431973591134, | |
| "loss": 2.4982, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 0.6239057774948273, | |
| "grad_norm": 5.074869694709693, | |
| "learning_rate": 0.0002641358170242867, | |
| "loss": 2.5465, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 0.6302721629794684, | |
| "grad_norm": 5.503484162830985, | |
| "learning_rate": 0.00026342843668946, | |
| "loss": 2.324, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 0.6366385484641095, | |
| "grad_norm": 4.385405886090266, | |
| "learning_rate": 0.0002627210563546333, | |
| "loss": 2.417, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.6430049339487506, | |
| "grad_norm": 5.034207848822367, | |
| "learning_rate": 0.0002620136760198066, | |
| "loss": 2.6066, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 0.6493713194333917, | |
| "grad_norm": 3.696840611162144, | |
| "learning_rate": 0.00026130629568497997, | |
| "loss": 2.5441, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 0.6557377049180327, | |
| "grad_norm": 4.6314444028768875, | |
| "learning_rate": 0.0002605989153501533, | |
| "loss": 2.4752, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 0.6621040904026739, | |
| "grad_norm": 3.4663340841035093, | |
| "learning_rate": 0.0002598915350153265, | |
| "loss": 2.4637, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 0.668470475887315, | |
| "grad_norm": 8.334602958229153, | |
| "learning_rate": 0.00025918415468049983, | |
| "loss": 2.6082, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.6748368613719561, | |
| "grad_norm": 4.575304861734687, | |
| "learning_rate": 0.0002584767743456732, | |
| "loss": 2.4842, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 0.6812032468565972, | |
| "grad_norm": 3.618603659881246, | |
| "learning_rate": 0.0002577693940108465, | |
| "loss": 2.4982, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 0.6875696323412382, | |
| "grad_norm": 4.4740750234847475, | |
| "learning_rate": 0.00025706201367601975, | |
| "loss": 2.4029, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 0.6939360178258793, | |
| "grad_norm": 4.208151138305904, | |
| "learning_rate": 0.0002563546333411931, | |
| "loss": 2.4836, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 0.7003024033105204, | |
| "grad_norm": 5.736418133529195, | |
| "learning_rate": 0.0002556472530063664, | |
| "loss": 2.6402, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.7066687887951616, | |
| "grad_norm": 3.6852138552936395, | |
| "learning_rate": 0.0002549398726715397, | |
| "loss": 2.4941, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 0.7130351742798027, | |
| "grad_norm": 4.11339792490071, | |
| "learning_rate": 0.000254232492336713, | |
| "loss": 2.4641, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 0.7194015597644438, | |
| "grad_norm": 3.7487169212790477, | |
| "learning_rate": 0.00025352511200188633, | |
| "loss": 2.3414, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 0.7257679452490848, | |
| "grad_norm": 3.442251781584789, | |
| "learning_rate": 0.00025281773166705964, | |
| "loss": 2.3932, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 0.7321343307337259, | |
| "grad_norm": 3.8470452186807846, | |
| "learning_rate": 0.00025211035133223294, | |
| "loss": 2.4334, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.738500716218367, | |
| "grad_norm": 4.506628903442333, | |
| "learning_rate": 0.00025140297099740625, | |
| "loss": 2.3195, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 0.7448671017030081, | |
| "grad_norm": 4.325389792461109, | |
| "learning_rate": 0.00025069559066257955, | |
| "loss": 2.4027, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 0.7512334871876493, | |
| "grad_norm": 3.8669860452410445, | |
| "learning_rate": 0.00024998821032775286, | |
| "loss": 2.4686, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 0.7575998726722903, | |
| "grad_norm": 3.561694549590622, | |
| "learning_rate": 0.00024928082999292617, | |
| "loss": 2.5395, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 0.7639662581569314, | |
| "grad_norm": 4.156976510059248, | |
| "learning_rate": 0.00024857344965809947, | |
| "loss": 2.4416, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.7703326436415725, | |
| "grad_norm": 5.616798309964706, | |
| "learning_rate": 0.0002478660693232728, | |
| "loss": 2.4055, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 0.7766990291262136, | |
| "grad_norm": 3.149306910194074, | |
| "learning_rate": 0.0002471586889884461, | |
| "loss": 2.4488, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 0.7830654146108547, | |
| "grad_norm": 3.0254085146949974, | |
| "learning_rate": 0.00024645130865361944, | |
| "loss": 2.44, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 0.7894318000954957, | |
| "grad_norm": 3.3553234269170815, | |
| "learning_rate": 0.0002457439283187927, | |
| "loss": 2.4689, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 0.7957981855801369, | |
| "grad_norm": 3.560968004340506, | |
| "learning_rate": 0.000245036547983966, | |
| "loss": 2.4328, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.802164571064778, | |
| "grad_norm": 3.670143268942363, | |
| "learning_rate": 0.00024432916764913936, | |
| "loss": 2.357, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 0.8085309565494191, | |
| "grad_norm": 4.603180595509743, | |
| "learning_rate": 0.00024362178731431264, | |
| "loss": 2.4336, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 0.8148973420340602, | |
| "grad_norm": 3.2173958923931845, | |
| "learning_rate": 0.00024291440697948595, | |
| "loss": 2.3859, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 0.8212637275187012, | |
| "grad_norm": 4.238015371838477, | |
| "learning_rate": 0.00024220702664465925, | |
| "loss": 2.4234, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 0.8276301130033423, | |
| "grad_norm": 3.350821041839247, | |
| "learning_rate": 0.00024149964630983258, | |
| "loss": 2.34, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.8339964984879834, | |
| "grad_norm": 3.956822835287818, | |
| "learning_rate": 0.00024079226597500586, | |
| "loss": 2.409, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 0.8403628839726246, | |
| "grad_norm": 3.206745954274851, | |
| "learning_rate": 0.00024008488564017917, | |
| "loss": 2.4766, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 0.8467292694572657, | |
| "grad_norm": 3.16798874537864, | |
| "learning_rate": 0.0002393775053053525, | |
| "loss": 2.4918, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 0.8530956549419068, | |
| "grad_norm": 3.2084119932227613, | |
| "learning_rate": 0.0002386701249705258, | |
| "loss": 2.3279, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 0.8594620404265478, | |
| "grad_norm": 2.6626163554970694, | |
| "learning_rate": 0.00023796274463569909, | |
| "loss": 2.3762, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.8658284259111889, | |
| "grad_norm": 3.543568104802714, | |
| "learning_rate": 0.00023725536430087242, | |
| "loss": 2.4406, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 0.87219481139583, | |
| "grad_norm": 3.0950338366546832, | |
| "learning_rate": 0.00023654798396604572, | |
| "loss": 2.4135, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 0.8785611968804711, | |
| "grad_norm": 3.4459117956268583, | |
| "learning_rate": 0.00023584060363121903, | |
| "loss": 2.3371, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 0.8849275823651122, | |
| "grad_norm": 3.3339357175581026, | |
| "learning_rate": 0.00023513322329639234, | |
| "loss": 2.3568, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 0.8912939678497533, | |
| "grad_norm": 2.7770602679788836, | |
| "learning_rate": 0.00023442584296156567, | |
| "loss": 2.4703, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.8976603533343944, | |
| "grad_norm": 3.1587359306963925, | |
| "learning_rate": 0.00023371846262673895, | |
| "loss": 2.3687, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 0.9040267388190355, | |
| "grad_norm": 3.6463373041615057, | |
| "learning_rate": 0.00023301108229191225, | |
| "loss": 2.5037, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 0.9103931243036766, | |
| "grad_norm": 3.8799007179484066, | |
| "learning_rate": 0.00023230370195708559, | |
| "loss": 2.3883, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 0.9167595097883177, | |
| "grad_norm": 2.650501985473451, | |
| "learning_rate": 0.0002315963216222589, | |
| "loss": 2.3818, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 0.9231258952729587, | |
| "grad_norm": 3.3575457214628353, | |
| "learning_rate": 0.00023088894128743217, | |
| "loss": 2.2014, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.9294922807575998, | |
| "grad_norm": 4.160903578127555, | |
| "learning_rate": 0.00023018156095260548, | |
| "loss": 2.3586, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 0.935858666242241, | |
| "grad_norm": 3.7275531046661805, | |
| "learning_rate": 0.0002294741806177788, | |
| "loss": 2.4684, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 0.9422250517268821, | |
| "grad_norm": 3.5804971221330515, | |
| "learning_rate": 0.00022876680028295211, | |
| "loss": 2.3984, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 0.9485914372115232, | |
| "grad_norm": 2.903402028010888, | |
| "learning_rate": 0.00022805941994812542, | |
| "loss": 2.3508, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 0.9549578226961642, | |
| "grad_norm": 3.2883720153819804, | |
| "learning_rate": 0.00022735203961329875, | |
| "loss": 2.468, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.9613242081808053, | |
| "grad_norm": 3.165068282067837, | |
| "learning_rate": 0.00022664465927847203, | |
| "loss": 2.3387, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 0.9676905936654464, | |
| "grad_norm": 2.8318847580527895, | |
| "learning_rate": 0.00022593727894364534, | |
| "loss": 2.235, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 0.9740569791500875, | |
| "grad_norm": 3.0627223359274556, | |
| "learning_rate": 0.00022522989860881867, | |
| "loss": 2.4135, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 0.9804233646347287, | |
| "grad_norm": 4.63667156924246, | |
| "learning_rate": 0.00022452251827399198, | |
| "loss": 2.3891, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 0.9867897501193698, | |
| "grad_norm": 2.7155003830350735, | |
| "learning_rate": 0.00022381513793916526, | |
| "loss": 2.2814, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.9931561356040108, | |
| "grad_norm": 3.743790594304019, | |
| "learning_rate": 0.00022310775760433856, | |
| "loss": 2.1729, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 0.9995225210886519, | |
| "grad_norm": 3.0964479659673336, | |
| "learning_rate": 0.0002224003772695119, | |
| "loss": 2.3365, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 1.005729746936177, | |
| "grad_norm": 3.9150118918426307, | |
| "learning_rate": 0.0002216929969346852, | |
| "loss": 1.9852, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 1.012096132420818, | |
| "grad_norm": 3.5380613834183614, | |
| "learning_rate": 0.0002209856165998585, | |
| "loss": 2.0525, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 1.0184625179054592, | |
| "grad_norm": 3.1487817928894613, | |
| "learning_rate": 0.00022027823626503184, | |
| "loss": 1.9064, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.0248289033901004, | |
| "grad_norm": 3.266369156342291, | |
| "learning_rate": 0.00021957085593020512, | |
| "loss": 1.8943, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 1.0311952888747413, | |
| "grad_norm": 3.4336556947723307, | |
| "learning_rate": 0.00021886347559537842, | |
| "loss": 1.7861, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 1.0375616743593825, | |
| "grad_norm": 3.832267113304745, | |
| "learning_rate": 0.00021815609526055173, | |
| "loss": 2.0424, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 1.0439280598440235, | |
| "grad_norm": 4.130087016761162, | |
| "learning_rate": 0.00021744871492572506, | |
| "loss": 1.9139, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 1.0502944453286647, | |
| "grad_norm": 2.7335284776977886, | |
| "learning_rate": 0.00021674133459089834, | |
| "loss": 2.0488, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.0566608308133056, | |
| "grad_norm": 3.329599622048459, | |
| "learning_rate": 0.00021603395425607165, | |
| "loss": 1.9137, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 1.0630272162979468, | |
| "grad_norm": 3.6314777543543313, | |
| "learning_rate": 0.00021532657392124498, | |
| "loss": 1.9768, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 1.069393601782588, | |
| "grad_norm": 4.218111606262662, | |
| "learning_rate": 0.00021461919358641828, | |
| "loss": 2.0141, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 1.075759987267229, | |
| "grad_norm": 2.895306844768361, | |
| "learning_rate": 0.0002139118132515916, | |
| "loss": 1.9783, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 1.0821263727518702, | |
| "grad_norm": 3.753588342038039, | |
| "learning_rate": 0.00021320443291676492, | |
| "loss": 2.0227, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.0884927582365111, | |
| "grad_norm": 3.250518576003523, | |
| "learning_rate": 0.0002124970525819382, | |
| "loss": 1.8848, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 1.0948591437211523, | |
| "grad_norm": 2.980988290019129, | |
| "learning_rate": 0.0002117896722471115, | |
| "loss": 1.867, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 1.1012255292057933, | |
| "grad_norm": 3.3311589155605263, | |
| "learning_rate": 0.0002110822919122848, | |
| "loss": 1.8766, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 1.1075919146904345, | |
| "grad_norm": 3.2929109952092523, | |
| "learning_rate": 0.00021037491157745815, | |
| "loss": 2.0685, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 1.1139583001750757, | |
| "grad_norm": 3.8495434375573243, | |
| "learning_rate": 0.00020966753124263143, | |
| "loss": 1.8662, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.1203246856597167, | |
| "grad_norm": 3.223669203555661, | |
| "learning_rate": 0.00020896015090780473, | |
| "loss": 1.9401, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 1.1266910711443578, | |
| "grad_norm": 3.068725890159831, | |
| "learning_rate": 0.00020825277057297806, | |
| "loss": 1.9732, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 1.1330574566289988, | |
| "grad_norm": 3.7652954958158724, | |
| "learning_rate": 0.00020754539023815137, | |
| "loss": 1.9264, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 1.13942384211364, | |
| "grad_norm": 3.283263110696873, | |
| "learning_rate": 0.00020683800990332468, | |
| "loss": 1.8805, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 1.145790227598281, | |
| "grad_norm": 3.578786877708257, | |
| "learning_rate": 0.00020613062956849795, | |
| "loss": 1.8764, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.1521566130829222, | |
| "grad_norm": 3.2073175602979402, | |
| "learning_rate": 0.0002054232492336713, | |
| "loss": 1.9123, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 1.1585229985675634, | |
| "grad_norm": 3.960679432169151, | |
| "learning_rate": 0.0002047158688988446, | |
| "loss": 1.8938, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 1.1648893840522043, | |
| "grad_norm": 3.026836467606075, | |
| "learning_rate": 0.0002040084885640179, | |
| "loss": 1.9291, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 1.1712557695368455, | |
| "grad_norm": 3.368777784279614, | |
| "learning_rate": 0.00020330110822919123, | |
| "loss": 1.9459, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 1.1776221550214865, | |
| "grad_norm": 2.843063910159576, | |
| "learning_rate": 0.0002025937278943645, | |
| "loss": 1.9449, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.1839885405061277, | |
| "grad_norm": 2.9510127362455876, | |
| "learning_rate": 0.00020188634755953782, | |
| "loss": 1.9309, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 1.1903549259907686, | |
| "grad_norm": 4.545581163313047, | |
| "learning_rate": 0.00020117896722471115, | |
| "loss": 1.8869, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 1.1967213114754098, | |
| "grad_norm": 3.301260365676958, | |
| "learning_rate": 0.00020047158688988445, | |
| "loss": 1.9613, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 1.203087696960051, | |
| "grad_norm": 3.299718394999854, | |
| "learning_rate": 0.00019976420655505776, | |
| "loss": 2.0072, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 1.209454082444692, | |
| "grad_norm": 3.5617690109472444, | |
| "learning_rate": 0.00019905682622023104, | |
| "loss": 1.9713, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.2158204679293332, | |
| "grad_norm": 3.1359097580279767, | |
| "learning_rate": 0.00019834944588540437, | |
| "loss": 1.8154, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 1.2221868534139742, | |
| "grad_norm": 3.0984711664236206, | |
| "learning_rate": 0.00019764206555057768, | |
| "loss": 1.8568, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 1.2285532388986153, | |
| "grad_norm": 2.842733095279462, | |
| "learning_rate": 0.00019693468521575098, | |
| "loss": 1.851, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 1.2349196243832563, | |
| "grad_norm": 2.935936669675825, | |
| "learning_rate": 0.00019622730488092432, | |
| "loss": 1.9604, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 1.2412860098678975, | |
| "grad_norm": 3.3716657028276096, | |
| "learning_rate": 0.0001955199245460976, | |
| "loss": 1.9171, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 1.2476523953525387, | |
| "grad_norm": 2.9604032685493133, | |
| "learning_rate": 0.0001948125442112709, | |
| "loss": 1.8604, | |
| "step": 1960 | |
| }, | |
| { | |
| "epoch": 1.2540187808371797, | |
| "grad_norm": 2.7269826434015427, | |
| "learning_rate": 0.0001941051638764442, | |
| "loss": 1.9076, | |
| "step": 1970 | |
| }, | |
| { | |
| "epoch": 1.2603851663218208, | |
| "grad_norm": 4.046544087868519, | |
| "learning_rate": 0.00019339778354161754, | |
| "loss": 1.8299, | |
| "step": 1980 | |
| }, | |
| { | |
| "epoch": 1.2667515518064618, | |
| "grad_norm": 3.12579708206849, | |
| "learning_rate": 0.00019269040320679084, | |
| "loss": 1.7545, | |
| "step": 1990 | |
| }, | |
| { | |
| "epoch": 1.273117937291103, | |
| "grad_norm": 2.7391520471543225, | |
| "learning_rate": 0.00019198302287196412, | |
| "loss": 1.8973, | |
| "step": 2000 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 4713, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |