| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 20000, |
| "global_step": 12652, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.003161555485298767, |
| "grad_norm": 90.72049713134766, |
| "learning_rate": 5.00526870389884e-07, |
| "loss": 28.6428, |
| "sparse_loss": 28.6428, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.006323110970597534, |
| "grad_norm": 192.71334838867188, |
| "learning_rate": 1.0273972602739725e-06, |
| "loss": 26.2256, |
| "sparse_loss": 26.2256, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.0094846664558963, |
| "grad_norm": 103.9129867553711, |
| "learning_rate": 1.554267650158061e-06, |
| "loss": 26.0737, |
| "sparse_loss": 26.0737, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.012646221941195067, |
| "grad_norm": 529.5110473632812, |
| "learning_rate": 2.08113804004215e-06, |
| "loss": 22.2104, |
| "sparse_loss": 22.2104, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.015807777426493835, |
| "grad_norm": 130.07557678222656, |
| "learning_rate": 2.6080084299262384e-06, |
| "loss": 17.9675, |
| "sparse_loss": 17.9675, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.0189693329117926, |
| "grad_norm": 207.0006561279297, |
| "learning_rate": 3.1348788198103265e-06, |
| "loss": 14.3645, |
| "sparse_loss": 14.3645, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.022130888397091368, |
| "grad_norm": 230.92396545410156, |
| "learning_rate": 3.661749209694415e-06, |
| "loss": 9.8382, |
| "sparse_loss": 9.8382, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.025292443882390134, |
| "grad_norm": 147.09567260742188, |
| "learning_rate": 4.188619599578504e-06, |
| "loss": 6.1231, |
| "sparse_loss": 6.1231, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.028453999367688904, |
| "grad_norm": 102.5186996459961, |
| "learning_rate": 4.715489989462593e-06, |
| "loss": 4.5984, |
| "sparse_loss": 4.5984, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.03161555485298767, |
| "grad_norm": 43.4206657409668, |
| "learning_rate": 5.242360379346681e-06, |
| "loss": 3.6251, |
| "sparse_loss": 3.6251, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.034777110338286434, |
| "grad_norm": 133.94253540039062, |
| "learning_rate": 5.76923076923077e-06, |
| "loss": 3.3179, |
| "sparse_loss": 3.3179, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.0379386658235852, |
| "grad_norm": 58.420440673828125, |
| "learning_rate": 6.296101159114858e-06, |
| "loss": 2.8121, |
| "sparse_loss": 2.8121, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.04110022130888397, |
| "grad_norm": 25.974889755249023, |
| "learning_rate": 6.822971548998947e-06, |
| "loss": 2.3196, |
| "sparse_loss": 2.3196, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.044261776794182736, |
| "grad_norm": 14.511346817016602, |
| "learning_rate": 7.349841938883036e-06, |
| "loss": 2.1138, |
| "sparse_loss": 2.1138, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.047423332279481506, |
| "grad_norm": 77.72422790527344, |
| "learning_rate": 7.876712328767124e-06, |
| "loss": 2.1616, |
| "sparse_loss": 2.1616, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.05058488776478027, |
| "grad_norm": 11.835925102233887, |
| "learning_rate": 8.403582718651212e-06, |
| "loss": 2.3001, |
| "sparse_loss": 2.3001, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.05374644325007904, |
| "grad_norm": 19.07284164428711, |
| "learning_rate": 8.930453108535302e-06, |
| "loss": 1.7455, |
| "sparse_loss": 1.7455, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.05690799873537781, |
| "grad_norm": 44.019432067871094, |
| "learning_rate": 9.457323498419388e-06, |
| "loss": 1.7734, |
| "sparse_loss": 1.7734, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.06006955422067657, |
| "grad_norm": 95.81041717529297, |
| "learning_rate": 9.984193888303478e-06, |
| "loss": 1.7507, |
| "sparse_loss": 1.7507, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.06323110970597534, |
| "grad_norm": 50.2440071105957, |
| "learning_rate": 1.0511064278187566e-05, |
| "loss": 1.8376, |
| "sparse_loss": 1.8376, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.06639266519127411, |
| "grad_norm": 11.301913261413574, |
| "learning_rate": 1.1037934668071655e-05, |
| "loss": 1.6355, |
| "sparse_loss": 1.6355, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.06955422067657287, |
| "grad_norm": 19.044418334960938, |
| "learning_rate": 1.1564805057955744e-05, |
| "loss": 1.6548, |
| "sparse_loss": 1.6548, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.07271577616187164, |
| "grad_norm": 47.92369842529297, |
| "learning_rate": 1.209167544783983e-05, |
| "loss": 1.7548, |
| "sparse_loss": 1.7548, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.0758773316471704, |
| "grad_norm": 21.29003143310547, |
| "learning_rate": 1.2618545837723922e-05, |
| "loss": 1.7677, |
| "sparse_loss": 1.7677, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.07903888713246918, |
| "grad_norm": 13.993868827819824, |
| "learning_rate": 1.3145416227608009e-05, |
| "loss": 1.7335, |
| "sparse_loss": 1.7335, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.08220044261776795, |
| "grad_norm": 14.208980560302734, |
| "learning_rate": 1.3672286617492097e-05, |
| "loss": 1.6585, |
| "sparse_loss": 1.6585, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.0853619981030667, |
| "grad_norm": 45.097537994384766, |
| "learning_rate": 1.4199157007376185e-05, |
| "loss": 1.7808, |
| "sparse_loss": 1.7808, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.08852355358836547, |
| "grad_norm": 35.2084846496582, |
| "learning_rate": 1.4726027397260275e-05, |
| "loss": 1.5633, |
| "sparse_loss": 1.5633, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.09168510907366424, |
| "grad_norm": 30.503475189208984, |
| "learning_rate": 1.5252897787144363e-05, |
| "loss": 1.5752, |
| "sparse_loss": 1.5752, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.09484666455896301, |
| "grad_norm": 21.45793342590332, |
| "learning_rate": 1.577976817702845e-05, |
| "loss": 1.6597, |
| "sparse_loss": 1.6597, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.09800822004426178, |
| "grad_norm": 10.771159172058105, |
| "learning_rate": 1.630663856691254e-05, |
| "loss": 1.4463, |
| "sparse_loss": 1.4463, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.10116977552956054, |
| "grad_norm": 19.072532653808594, |
| "learning_rate": 1.683350895679663e-05, |
| "loss": 1.6486, |
| "sparse_loss": 1.6486, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.10433133101485931, |
| "grad_norm": 30.260778427124023, |
| "learning_rate": 1.7360379346680716e-05, |
| "loss": 1.8312, |
| "sparse_loss": 1.8312, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.10749288650015808, |
| "grad_norm": 46.612857818603516, |
| "learning_rate": 1.7887249736564805e-05, |
| "loss": 1.6324, |
| "sparse_loss": 1.6324, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.11065444198545685, |
| "grad_norm": 13.51109504699707, |
| "learning_rate": 1.8414120126448895e-05, |
| "loss": 1.5031, |
| "sparse_loss": 1.5031, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.11381599747075562, |
| "grad_norm": 52.77560806274414, |
| "learning_rate": 1.894099051633298e-05, |
| "loss": 1.5043, |
| "sparse_loss": 1.5043, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.11697755295605437, |
| "grad_norm": 23.98613929748535, |
| "learning_rate": 1.946786090621707e-05, |
| "loss": 1.7519, |
| "sparse_loss": 1.7519, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.12013910844135314, |
| "grad_norm": 12.080920219421387, |
| "learning_rate": 1.999473129610116e-05, |
| "loss": 1.5368, |
| "sparse_loss": 1.5368, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.12330066392665191, |
| "grad_norm": 8.275101661682129, |
| "learning_rate": 2.0521601685985248e-05, |
| "loss": 1.5252, |
| "sparse_loss": 1.5252, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.12646221941195068, |
| "grad_norm": 39.524532318115234, |
| "learning_rate": 2.1048472075869338e-05, |
| "loss": 1.6159, |
| "sparse_loss": 1.6159, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.12962377489724944, |
| "grad_norm": 28.030683517456055, |
| "learning_rate": 2.1575342465753427e-05, |
| "loss": 1.7463, |
| "sparse_loss": 1.7463, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.13278533038254822, |
| "grad_norm": 7.208009719848633, |
| "learning_rate": 2.2102212855637514e-05, |
| "loss": 1.8495, |
| "sparse_loss": 1.8495, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.13594688586784698, |
| "grad_norm": 7.575499534606934, |
| "learning_rate": 2.2629083245521604e-05, |
| "loss": 1.7152, |
| "sparse_loss": 1.7152, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.13910844135314573, |
| "grad_norm": 9.741721153259277, |
| "learning_rate": 2.315595363540569e-05, |
| "loss": 1.6196, |
| "sparse_loss": 1.6196, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.14226999683844452, |
| "grad_norm": 9.187643051147461, |
| "learning_rate": 2.368282402528978e-05, |
| "loss": 1.5192, |
| "sparse_loss": 1.5192, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.14543155232374327, |
| "grad_norm": 25.94437026977539, |
| "learning_rate": 2.420969441517387e-05, |
| "loss": 1.7447, |
| "sparse_loss": 1.7447, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.14859310780904206, |
| "grad_norm": 14.261216163635254, |
| "learning_rate": 2.4736564805057956e-05, |
| "loss": 1.6974, |
| "sparse_loss": 1.6974, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.1517546632943408, |
| "grad_norm": 32.37978744506836, |
| "learning_rate": 2.5263435194942046e-05, |
| "loss": 1.5887, |
| "sparse_loss": 1.5887, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.15491621877963957, |
| "grad_norm": 9.615072250366211, |
| "learning_rate": 2.5790305584826136e-05, |
| "loss": 1.4764, |
| "sparse_loss": 1.4764, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.15807777426493835, |
| "grad_norm": 5.270341873168945, |
| "learning_rate": 2.6317175974710222e-05, |
| "loss": 1.4227, |
| "sparse_loss": 1.4227, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.1612393297502371, |
| "grad_norm": 30.450881958007812, |
| "learning_rate": 2.6844046364594312e-05, |
| "loss": 1.3536, |
| "sparse_loss": 1.3536, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.1644008852355359, |
| "grad_norm": 80.92005157470703, |
| "learning_rate": 2.73709167544784e-05, |
| "loss": 1.7506, |
| "sparse_loss": 1.7506, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.16756244072083465, |
| "grad_norm": 51.93912124633789, |
| "learning_rate": 2.7897787144362485e-05, |
| "loss": 1.5311, |
| "sparse_loss": 1.5311, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.1707239962061334, |
| "grad_norm": 23.142057418823242, |
| "learning_rate": 2.842465753424658e-05, |
| "loss": 1.5044, |
| "sparse_loss": 1.5044, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.1738855516914322, |
| "grad_norm": 9.527504920959473, |
| "learning_rate": 2.8951527924130668e-05, |
| "loss": 1.3364, |
| "sparse_loss": 1.3364, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.17704710717673094, |
| "grad_norm": 5.871148109436035, |
| "learning_rate": 2.9478398314014755e-05, |
| "loss": 1.4623, |
| "sparse_loss": 1.4623, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.18020866266202973, |
| "grad_norm": 50.690467834472656, |
| "learning_rate": 3.000526870389884e-05, |
| "loss": 1.4804, |
| "sparse_loss": 1.4804, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.18337021814732848, |
| "grad_norm": 5.734912395477295, |
| "learning_rate": 3.053213909378293e-05, |
| "loss": 1.702, |
| "sparse_loss": 1.702, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.18653177363262724, |
| "grad_norm": 75.72001647949219, |
| "learning_rate": 3.105900948366702e-05, |
| "loss": 1.3781, |
| "sparse_loss": 1.3781, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.18969332911792602, |
| "grad_norm": 16.198474884033203, |
| "learning_rate": 3.1585879873551104e-05, |
| "loss": 1.3378, |
| "sparse_loss": 1.3378, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.19285488460322478, |
| "grad_norm": 18.437875747680664, |
| "learning_rate": 3.21127502634352e-05, |
| "loss": 1.459, |
| "sparse_loss": 1.459, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.19601644008852356, |
| "grad_norm": 18.53944206237793, |
| "learning_rate": 3.2639620653319283e-05, |
| "loss": 1.3585, |
| "sparse_loss": 1.3585, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.19917799557382232, |
| "grad_norm": 31.09649085998535, |
| "learning_rate": 3.316649104320337e-05, |
| "loss": 1.3483, |
| "sparse_loss": 1.3483, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.20233955105912108, |
| "grad_norm": 3.9741768836975098, |
| "learning_rate": 3.369336143308746e-05, |
| "loss": 1.2617, |
| "sparse_loss": 1.2617, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.20550110654441986, |
| "grad_norm": 13.636088371276855, |
| "learning_rate": 3.4220231822971546e-05, |
| "loss": 1.3285, |
| "sparse_loss": 1.3285, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.20866266202971862, |
| "grad_norm": 23.573163986206055, |
| "learning_rate": 3.4747102212855636e-05, |
| "loss": 1.4407, |
| "sparse_loss": 1.4407, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.2118242175150174, |
| "grad_norm": 8.226530075073242, |
| "learning_rate": 3.527397260273973e-05, |
| "loss": 1.2957, |
| "sparse_loss": 1.2957, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.21498577300031615, |
| "grad_norm": 66.35002899169922, |
| "learning_rate": 3.5800842992623816e-05, |
| "loss": 1.4965, |
| "sparse_loss": 1.4965, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.2181473284856149, |
| "grad_norm": 29.577178955078125, |
| "learning_rate": 3.6327713382507905e-05, |
| "loss": 1.2716, |
| "sparse_loss": 1.2716, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.2213088839709137, |
| "grad_norm": 25.51114273071289, |
| "learning_rate": 3.6854583772391995e-05, |
| "loss": 1.305, |
| "sparse_loss": 1.305, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.22447043945621245, |
| "grad_norm": 92.59777069091797, |
| "learning_rate": 3.738145416227608e-05, |
| "loss": 1.5987, |
| "sparse_loss": 1.5987, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.22763199494151123, |
| "grad_norm": 36.93169403076172, |
| "learning_rate": 3.790832455216017e-05, |
| "loss": 1.9617, |
| "sparse_loss": 1.9617, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.23079355042681, |
| "grad_norm": 24.72101593017578, |
| "learning_rate": 3.843519494204426e-05, |
| "loss": 1.692, |
| "sparse_loss": 1.692, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.23395510591210875, |
| "grad_norm": 6.122689247131348, |
| "learning_rate": 3.896206533192835e-05, |
| "loss": 1.4688, |
| "sparse_loss": 1.4688, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.23711666139740753, |
| "grad_norm": 19.387109756469727, |
| "learning_rate": 3.948893572181244e-05, |
| "loss": 1.2138, |
| "sparse_loss": 1.2138, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.24027821688270629, |
| "grad_norm": 9.966318130493164, |
| "learning_rate": 4.001580611169653e-05, |
| "loss": 1.3798, |
| "sparse_loss": 1.3798, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.24343977236800507, |
| "grad_norm": 10.776817321777344, |
| "learning_rate": 4.054267650158061e-05, |
| "loss": 1.2668, |
| "sparse_loss": 1.2668, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.24660132785330383, |
| "grad_norm": 17.45214080810547, |
| "learning_rate": 4.10695468914647e-05, |
| "loss": 1.4434, |
| "sparse_loss": 1.4434, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.24976288333860258, |
| "grad_norm": 65.40130615234375, |
| "learning_rate": 4.159641728134879e-05, |
| "loss": 1.4343, |
| "sparse_loss": 1.4343, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.25292443882390137, |
| "grad_norm": 8.373201370239258, |
| "learning_rate": 4.212328767123288e-05, |
| "loss": 1.2343, |
| "sparse_loss": 1.2343, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.25608599430920015, |
| "grad_norm": 5.707662105560303, |
| "learning_rate": 4.265015806111697e-05, |
| "loss": 1.3365, |
| "sparse_loss": 1.3365, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.2592475497944989, |
| "grad_norm": 5.130167484283447, |
| "learning_rate": 4.317702845100105e-05, |
| "loss": 1.3023, |
| "sparse_loss": 1.3023, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.26240910527979766, |
| "grad_norm": 7.4915313720703125, |
| "learning_rate": 4.370389884088514e-05, |
| "loss": 1.4274, |
| "sparse_loss": 1.4274, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.26557066076509644, |
| "grad_norm": 81.6876220703125, |
| "learning_rate": 4.423076923076923e-05, |
| "loss": 1.3786, |
| "sparse_loss": 1.3786, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.2687322162503952, |
| "grad_norm": 154.30575561523438, |
| "learning_rate": 4.4757639620653316e-05, |
| "loss": 1.4343, |
| "sparse_loss": 1.4343, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.27189377173569396, |
| "grad_norm": 30.539709091186523, |
| "learning_rate": 4.528451001053741e-05, |
| "loss": 1.5181, |
| "sparse_loss": 1.5181, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.27505532722099274, |
| "grad_norm": 11.73371410369873, |
| "learning_rate": 4.58113804004215e-05, |
| "loss": 1.1963, |
| "sparse_loss": 1.1963, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.27821688270629147, |
| "grad_norm": 41.04491424560547, |
| "learning_rate": 4.6338250790305585e-05, |
| "loss": 1.2356, |
| "sparse_loss": 1.2356, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.28137843819159025, |
| "grad_norm": 5.058065891265869, |
| "learning_rate": 4.6865121180189675e-05, |
| "loss": 1.23, |
| "sparse_loss": 1.23, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.28453999367688904, |
| "grad_norm": 11.308867454528809, |
| "learning_rate": 4.7391991570073765e-05, |
| "loss": 1.3572, |
| "sparse_loss": 1.3572, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.2877015491621878, |
| "grad_norm": 8.532959938049316, |
| "learning_rate": 4.791886195995785e-05, |
| "loss": 1.3385, |
| "sparse_loss": 1.3385, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.29086310464748655, |
| "grad_norm": 4.430708408355713, |
| "learning_rate": 4.8445732349841945e-05, |
| "loss": 1.3498, |
| "sparse_loss": 1.3498, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.29402466013278533, |
| "grad_norm": 13.108844757080078, |
| "learning_rate": 4.8972602739726034e-05, |
| "loss": 1.2505, |
| "sparse_loss": 1.2505, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.2971862156180841, |
| "grad_norm": 14.208166122436523, |
| "learning_rate": 4.949947312961012e-05, |
| "loss": 1.3876, |
| "sparse_loss": 1.3876, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.30034777110338284, |
| "grad_norm": 6.0673017501831055, |
| "learning_rate": 4.999999893323271e-05, |
| "loss": 1.3779, |
| "sparse_loss": 1.3779, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.3035093265886816, |
| "grad_norm": 6.847878456115723, |
| "learning_rate": 4.999952955709672e-05, |
| "loss": 1.2894, |
| "sparse_loss": 1.2894, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.3066708820739804, |
| "grad_norm": 4.5629119873046875, |
| "learning_rate": 4.999820678560873e-05, |
| "loss": 1.2486, |
| "sparse_loss": 1.2486, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.30983243755927914, |
| "grad_norm": 15.195846557617188, |
| "learning_rate": 4.999603066392346e-05, |
| "loss": 1.2844, |
| "sparse_loss": 1.2844, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.3129939930445779, |
| "grad_norm": 81.70159912109375, |
| "learning_rate": 4.999300126632601e-05, |
| "loss": 1.3135, |
| "sparse_loss": 1.3135, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.3161555485298767, |
| "grad_norm": 4.467297077178955, |
| "learning_rate": 4.998911869622926e-05, |
| "loss": 1.1267, |
| "sparse_loss": 1.1267, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.3193171040151755, |
| "grad_norm": 4.738478183746338, |
| "learning_rate": 4.998438308617042e-05, |
| "loss": 1.1558, |
| "sparse_loss": 1.1558, |
| "step": 2020 |
| }, |
| { |
| "epoch": 0.3224786595004742, |
| "grad_norm": 49.64394760131836, |
| "learning_rate": 4.997879459780641e-05, |
| "loss": 1.3313, |
| "sparse_loss": 1.3313, |
| "step": 2040 |
| }, |
| { |
| "epoch": 0.325640214985773, |
| "grad_norm": 93.02969360351562, |
| "learning_rate": 4.997235342190843e-05, |
| "loss": 1.3522, |
| "sparse_loss": 1.3522, |
| "step": 2060 |
| }, |
| { |
| "epoch": 0.3288017704710718, |
| "grad_norm": 29.788345336914062, |
| "learning_rate": 4.996505977835541e-05, |
| "loss": 1.2318, |
| "sparse_loss": 1.2318, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.3319633259563705, |
| "grad_norm": 4.9465155601501465, |
| "learning_rate": 4.995691391612649e-05, |
| "loss": 1.3701, |
| "sparse_loss": 1.3701, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.3351248814416693, |
| "grad_norm": 8.436901092529297, |
| "learning_rate": 4.994791611329253e-05, |
| "loss": 1.1667, |
| "sparse_loss": 1.1667, |
| "step": 2120 |
| }, |
| { |
| "epoch": 0.3382864369269681, |
| "grad_norm": 9.941946983337402, |
| "learning_rate": 4.9938066677006644e-05, |
| "loss": 1.2692, |
| "sparse_loss": 1.2692, |
| "step": 2140 |
| }, |
| { |
| "epoch": 0.3414479924122668, |
| "grad_norm": 3.722710609436035, |
| "learning_rate": 4.9927365943493686e-05, |
| "loss": 1.2353, |
| "sparse_loss": 1.2353, |
| "step": 2160 |
| }, |
| { |
| "epoch": 0.3446095478975656, |
| "grad_norm": 4.697382926940918, |
| "learning_rate": 4.991581427803879e-05, |
| "loss": 1.0708, |
| "sparse_loss": 1.0708, |
| "step": 2180 |
| }, |
| { |
| "epoch": 0.3477711033828644, |
| "grad_norm": 10.880094528198242, |
| "learning_rate": 4.990341207497485e-05, |
| "loss": 1.2122, |
| "sparse_loss": 1.2122, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.35093265886816316, |
| "grad_norm": 3.677126169204712, |
| "learning_rate": 4.989015975766916e-05, |
| "loss": 1.1419, |
| "sparse_loss": 1.1419, |
| "step": 2220 |
| }, |
| { |
| "epoch": 0.3540942143534619, |
| "grad_norm": 7.251951217651367, |
| "learning_rate": 4.987605777850886e-05, |
| "loss": 1.2176, |
| "sparse_loss": 1.2176, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.3572557698387607, |
| "grad_norm": 5.328221321105957, |
| "learning_rate": 4.986110661888555e-05, |
| "loss": 1.2348, |
| "sparse_loss": 1.2348, |
| "step": 2260 |
| }, |
| { |
| "epoch": 0.36041732532405946, |
| "grad_norm": 58.50775146484375, |
| "learning_rate": 4.9845306789178833e-05, |
| "loss": 1.234, |
| "sparse_loss": 1.234, |
| "step": 2280 |
| }, |
| { |
| "epoch": 0.3635788808093582, |
| "grad_norm": 9.532604217529297, |
| "learning_rate": 4.982865882873893e-05, |
| "loss": 1.2236, |
| "sparse_loss": 1.2236, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.36674043629465697, |
| "grad_norm": 7.424788951873779, |
| "learning_rate": 4.9811163305868185e-05, |
| "loss": 1.1314, |
| "sparse_loss": 1.1314, |
| "step": 2320 |
| }, |
| { |
| "epoch": 0.36990199177995575, |
| "grad_norm": 6.7801361083984375, |
| "learning_rate": 4.9792820817801776e-05, |
| "loss": 1.2094, |
| "sparse_loss": 1.2094, |
| "step": 2340 |
| }, |
| { |
| "epoch": 0.3730635472652545, |
| "grad_norm": 90.10533142089844, |
| "learning_rate": 4.977363199068724e-05, |
| "loss": 1.1324, |
| "sparse_loss": 1.1324, |
| "step": 2360 |
| }, |
| { |
| "epoch": 0.37622510275055326, |
| "grad_norm": 4.0776519775390625, |
| "learning_rate": 4.9753597479563135e-05, |
| "loss": 1.1505, |
| "sparse_loss": 1.1505, |
| "step": 2380 |
| }, |
| { |
| "epoch": 0.37938665823585205, |
| "grad_norm": 4.055939674377441, |
| "learning_rate": 4.9732717968336684e-05, |
| "loss": 1.2998, |
| "sparse_loss": 1.2998, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.38254821372115083, |
| "grad_norm": 3.455716133117676, |
| "learning_rate": 4.971099416976041e-05, |
| "loss": 1.1047, |
| "sparse_loss": 1.1047, |
| "step": 2420 |
| }, |
| { |
| "epoch": 0.38570976920644956, |
| "grad_norm": 20.777507781982422, |
| "learning_rate": 4.968842682540782e-05, |
| "loss": 1.214, |
| "sparse_loss": 1.214, |
| "step": 2440 |
| }, |
| { |
| "epoch": 0.38887132469174834, |
| "grad_norm": 4.029702663421631, |
| "learning_rate": 4.966501670564807e-05, |
| "loss": 1.1673, |
| "sparse_loss": 1.1673, |
| "step": 2460 |
| }, |
| { |
| "epoch": 0.3920328801770471, |
| "grad_norm": 3.2463579177856445, |
| "learning_rate": 4.964076460961971e-05, |
| "loss": 1.0922, |
| "sparse_loss": 1.0922, |
| "step": 2480 |
| }, |
| { |
| "epoch": 0.39519443566234586, |
| "grad_norm": 24.983285903930664, |
| "learning_rate": 4.961567136520335e-05, |
| "loss": 1.1501, |
| "sparse_loss": 1.1501, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.39835599114764464, |
| "grad_norm": 7.758336067199707, |
| "learning_rate": 4.958973782899344e-05, |
| "loss": 1.1135, |
| "sparse_loss": 1.1135, |
| "step": 2520 |
| }, |
| { |
| "epoch": 0.4015175466329434, |
| "grad_norm": 5.4922404289245605, |
| "learning_rate": 4.9562964886269005e-05, |
| "loss": 1.2597, |
| "sparse_loss": 1.2597, |
| "step": 2540 |
| }, |
| { |
| "epoch": 0.40467910211824215, |
| "grad_norm": 7.335123062133789, |
| "learning_rate": 4.953535345096344e-05, |
| "loss": 1.1811, |
| "sparse_loss": 1.1811, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.40784065760354093, |
| "grad_norm": 10.459525108337402, |
| "learning_rate": 4.95069044656333e-05, |
| "loss": 1.4975, |
| "sparse_loss": 1.4975, |
| "step": 2580 |
| }, |
| { |
| "epoch": 0.4110022130888397, |
| "grad_norm": 3.796081304550171, |
| "learning_rate": 4.947761890142615e-05, |
| "loss": 1.2545, |
| "sparse_loss": 1.2545, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.4141637685741385, |
| "grad_norm": 23.229045867919922, |
| "learning_rate": 4.9447497758047354e-05, |
| "loss": 1.2265, |
| "sparse_loss": 1.2265, |
| "step": 2620 |
| }, |
| { |
| "epoch": 0.41732532405943723, |
| "grad_norm": 3.592000961303711, |
| "learning_rate": 4.941654206372602e-05, |
| "loss": 1.2435, |
| "sparse_loss": 1.2435, |
| "step": 2640 |
| }, |
| { |
| "epoch": 0.420486879544736, |
| "grad_norm": 9.876830101013184, |
| "learning_rate": 4.9384752875179876e-05, |
| "loss": 1.0913, |
| "sparse_loss": 1.0913, |
| "step": 2660 |
| }, |
| { |
| "epoch": 0.4236484350300348, |
| "grad_norm": 5.687900543212891, |
| "learning_rate": 4.9352131277579144e-05, |
| "loss": 1.1109, |
| "sparse_loss": 1.1109, |
| "step": 2680 |
| }, |
| { |
| "epoch": 0.4268099905153335, |
| "grad_norm": 6.638156890869141, |
| "learning_rate": 4.931867838450959e-05, |
| "loss": 1.1235, |
| "sparse_loss": 1.1235, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.4299715460006323, |
| "grad_norm": 86.10536193847656, |
| "learning_rate": 4.928439533793443e-05, |
| "loss": 1.2064, |
| "sparse_loss": 1.2064, |
| "step": 2720 |
| }, |
| { |
| "epoch": 0.4331331014859311, |
| "grad_norm": 5.764664173126221, |
| "learning_rate": 4.92492833081554e-05, |
| "loss": 1.2203, |
| "sparse_loss": 1.2203, |
| "step": 2740 |
| }, |
| { |
| "epoch": 0.4362946569712298, |
| "grad_norm": 3.5471487045288086, |
| "learning_rate": 4.921334349377277e-05, |
| "loss": 1.1381, |
| "sparse_loss": 1.1381, |
| "step": 2760 |
| }, |
| { |
| "epoch": 0.4394562124565286, |
| "grad_norm": 2.527348279953003, |
| "learning_rate": 4.917657712164445e-05, |
| "loss": 1.1552, |
| "sparse_loss": 1.1552, |
| "step": 2780 |
| }, |
| { |
| "epoch": 0.4426177679418274, |
| "grad_norm": 5.361288070678711, |
| "learning_rate": 4.91389854468441e-05, |
| "loss": 1.246, |
| "sparse_loss": 1.246, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.4457793234271262, |
| "grad_norm": 3.979912519454956, |
| "learning_rate": 4.910056975261829e-05, |
| "loss": 1.1758, |
| "sparse_loss": 1.1758, |
| "step": 2820 |
| }, |
| { |
| "epoch": 0.4489408789124249, |
| "grad_norm": 4.702384948730469, |
| "learning_rate": 4.906133135034269e-05, |
| "loss": 1.2303, |
| "sparse_loss": 1.2303, |
| "step": 2840 |
| }, |
| { |
| "epoch": 0.4521024343977237, |
| "grad_norm": 4.330478191375732, |
| "learning_rate": 4.902127157947732e-05, |
| "loss": 1.1303, |
| "sparse_loss": 1.1303, |
| "step": 2860 |
| }, |
| { |
| "epoch": 0.45526398988302247, |
| "grad_norm": 3.032644271850586, |
| "learning_rate": 4.898039180752079e-05, |
| "loss": 1.1296, |
| "sparse_loss": 1.1296, |
| "step": 2880 |
| }, |
| { |
| "epoch": 0.4584255453683212, |
| "grad_norm": 36.98707962036133, |
| "learning_rate": 4.893869342996367e-05, |
| "loss": 1.1419, |
| "sparse_loss": 1.1419, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.46158710085362, |
| "grad_norm": 38.45972442626953, |
| "learning_rate": 4.889617787024079e-05, |
| "loss": 1.2288, |
| "sparse_loss": 1.2288, |
| "step": 2920 |
| }, |
| { |
| "epoch": 0.46474865633891876, |
| "grad_norm": 3.52923321723938, |
| "learning_rate": 4.885284657968272e-05, |
| "loss": 1.1064, |
| "sparse_loss": 1.1064, |
| "step": 2940 |
| }, |
| { |
| "epoch": 0.4679102118242175, |
| "grad_norm": 17.003992080688477, |
| "learning_rate": 4.880870103746617e-05, |
| "loss": 1.2217, |
| "sparse_loss": 1.2217, |
| "step": 2960 |
| }, |
| { |
| "epoch": 0.4710717673095163, |
| "grad_norm": 6.5978569984436035, |
| "learning_rate": 4.8763742750563515e-05, |
| "loss": 1.1936, |
| "sparse_loss": 1.1936, |
| "step": 2980 |
| }, |
| { |
| "epoch": 0.47423332279481506, |
| "grad_norm": 375.36273193359375, |
| "learning_rate": 4.8717973253691365e-05, |
| "loss": 1.3667, |
| "sparse_loss": 1.3667, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.47739487828011384, |
| "grad_norm": 3.5779974460601807, |
| "learning_rate": 4.867139410925815e-05, |
| "loss": 1.373, |
| "sparse_loss": 1.373, |
| "step": 3020 |
| }, |
| { |
| "epoch": 0.48055643376541257, |
| "grad_norm": 12.275580406188965, |
| "learning_rate": 4.8624006907310804e-05, |
| "loss": 1.1946, |
| "sparse_loss": 1.1946, |
| "step": 3040 |
| }, |
| { |
| "epoch": 0.48371798925071136, |
| "grad_norm": 3.74411678314209, |
| "learning_rate": 4.857581326548049e-05, |
| "loss": 1.5584, |
| "sparse_loss": 1.5584, |
| "step": 3060 |
| }, |
| { |
| "epoch": 0.48687954473601014, |
| "grad_norm": 4.420019626617432, |
| "learning_rate": 4.852681482892735e-05, |
| "loss": 1.2366, |
| "sparse_loss": 1.2366, |
| "step": 3080 |
| }, |
| { |
| "epoch": 0.49004110022130887, |
| "grad_norm": 5.069537162780762, |
| "learning_rate": 4.847701327028439e-05, |
| "loss": 1.2799, |
| "sparse_loss": 1.2799, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.49320265570660765, |
| "grad_norm": 4.507127285003662, |
| "learning_rate": 4.8426410289600356e-05, |
| "loss": 1.286, |
| "sparse_loss": 1.286, |
| "step": 3120 |
| }, |
| { |
| "epoch": 0.49636421119190643, |
| "grad_norm": 536.4945678710938, |
| "learning_rate": 4.837500761428167e-05, |
| "loss": 1.1875, |
| "sparse_loss": 1.1875, |
| "step": 3140 |
| }, |
| { |
| "epoch": 0.49952576667720516, |
| "grad_norm": 4.59815788269043, |
| "learning_rate": 4.832280699903355e-05, |
| "loss": 1.1452, |
| "sparse_loss": 1.1452, |
| "step": 3160 |
| }, |
| { |
| "epoch": 0.502687322162504, |
| "grad_norm": 3.2997987270355225, |
| "learning_rate": 4.826981022580001e-05, |
| "loss": 1.2692, |
| "sparse_loss": 1.2692, |
| "step": 3180 |
| }, |
| { |
| "epoch": 0.5058488776478027, |
| "grad_norm": 3.63319993019104, |
| "learning_rate": 4.821601910370308e-05, |
| "loss": 1.1087, |
| "sparse_loss": 1.1087, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.5090104331331015, |
| "grad_norm": 3.5504705905914307, |
| "learning_rate": 4.8161435468981074e-05, |
| "loss": 1.379, |
| "sparse_loss": 1.379, |
| "step": 3220 |
| }, |
| { |
| "epoch": 0.5121719886184003, |
| "grad_norm": 132.48561096191406, |
| "learning_rate": 4.8106061184925856e-05, |
| "loss": 1.0955, |
| "sparse_loss": 1.0955, |
| "step": 3240 |
| }, |
| { |
| "epoch": 0.515333544103699, |
| "grad_norm": 9.595959663391113, |
| "learning_rate": 4.804989814181926e-05, |
| "loss": 0.9732, |
| "sparse_loss": 0.9732, |
| "step": 3260 |
| }, |
| { |
| "epoch": 0.5184950995889978, |
| "grad_norm": 1358.23193359375, |
| "learning_rate": 4.799294825686855e-05, |
| "loss": 1.3688, |
| "sparse_loss": 1.3688, |
| "step": 3280 |
| }, |
| { |
| "epoch": 0.5216566550742966, |
| "grad_norm": 5.830153465270996, |
| "learning_rate": 4.793521347414102e-05, |
| "loss": 3.7253, |
| "sparse_loss": 3.7253, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.5248182105595953, |
| "grad_norm": 3048.8115234375, |
| "learning_rate": 4.787669576449755e-05, |
| "loss": 9.729, |
| "sparse_loss": 9.729, |
| "step": 3320 |
| }, |
| { |
| "epoch": 0.527979766044894, |
| "grad_norm": 4.22714900970459, |
| "learning_rate": 4.781739712552539e-05, |
| "loss": 8.5322, |
| "sparse_loss": 8.5322, |
| "step": 3340 |
| }, |
| { |
| "epoch": 0.5311413215301929, |
| "grad_norm": 5.15207052230835, |
| "learning_rate": 4.775731958146995e-05, |
| "loss": 1.6707, |
| "sparse_loss": 1.6707, |
| "step": 3360 |
| }, |
| { |
| "epoch": 0.5343028770154916, |
| "grad_norm": 21.08283233642578, |
| "learning_rate": 4.769646518316568e-05, |
| "loss": 2.5016, |
| "sparse_loss": 2.5016, |
| "step": 3380 |
| }, |
| { |
| "epoch": 0.5374644325007903, |
| "grad_norm": 335.0862121582031, |
| "learning_rate": 4.763483600796612e-05, |
| "loss": 5.182, |
| "sparse_loss": 5.182, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.5406259879860892, |
| "grad_norm": 16.12190818786621, |
| "learning_rate": 4.757243415967291e-05, |
| "loss": 2.1036, |
| "sparse_loss": 2.1036, |
| "step": 3420 |
| }, |
| { |
| "epoch": 0.5437875434713879, |
| "grad_norm": 1093.560546875, |
| "learning_rate": 4.750926176846404e-05, |
| "loss": 1.5086, |
| "sparse_loss": 1.5086, |
| "step": 3440 |
| }, |
| { |
| "epoch": 0.5469490989566866, |
| "grad_norm": 5.97625207901001, |
| "learning_rate": 4.744532099082107e-05, |
| "loss": 1.3835, |
| "sparse_loss": 1.3835, |
| "step": 3460 |
| }, |
| { |
| "epoch": 0.5501106544419855, |
| "grad_norm": 3847.875732421875, |
| "learning_rate": 4.7380614009455595e-05, |
| "loss": 1.3316, |
| "sparse_loss": 1.3316, |
| "step": 3480 |
| }, |
| { |
| "epoch": 0.5532722099272842, |
| "grad_norm": 5.960216999053955, |
| "learning_rate": 4.7315143033234654e-05, |
| "loss": 1.0839, |
| "sparse_loss": 1.0839, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.5564337654125829, |
| "grad_norm": 6.381541728973389, |
| "learning_rate": 4.724891029710537e-05, |
| "loss": 1.1241, |
| "sparse_loss": 1.1241, |
| "step": 3520 |
| }, |
| { |
| "epoch": 0.5595953208978818, |
| "grad_norm": 5.727816104888916, |
| "learning_rate": 4.7181918062018674e-05, |
| "loss": 1.2075, |
| "sparse_loss": 1.2075, |
| "step": 3540 |
| }, |
| { |
| "epoch": 0.5627568763831805, |
| "grad_norm": 6.4741597175598145, |
| "learning_rate": 4.7114168614852064e-05, |
| "loss": 1.326, |
| "sparse_loss": 1.326, |
| "step": 3560 |
| }, |
| { |
| "epoch": 0.5659184318684793, |
| "grad_norm": 4.220773696899414, |
| "learning_rate": 4.70456642683316e-05, |
| "loss": 1.2169, |
| "sparse_loss": 1.2169, |
| "step": 3580 |
| }, |
| { |
| "epoch": 0.5690799873537781, |
| "grad_norm": 2.715653419494629, |
| "learning_rate": 4.697640736095292e-05, |
| "loss": 1.1474, |
| "sparse_loss": 1.1474, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.5722415428390768, |
| "grad_norm": 2.276413679122925, |
| "learning_rate": 4.690640025690143e-05, |
| "loss": 1.228, |
| "sparse_loss": 1.228, |
| "step": 3620 |
| }, |
| { |
| "epoch": 0.5754030983243756, |
| "grad_norm": 14.458223342895508, |
| "learning_rate": 4.683564534597159e-05, |
| "loss": 1.0549, |
| "sparse_loss": 1.0549, |
| "step": 3640 |
| }, |
| { |
| "epoch": 0.5785646538096744, |
| "grad_norm": 7.121801853179932, |
| "learning_rate": 4.676414504348533e-05, |
| "loss": 1.154, |
| "sparse_loss": 1.154, |
| "step": 3660 |
| }, |
| { |
| "epoch": 0.5817262092949731, |
| "grad_norm": 4.728893280029297, |
| "learning_rate": 4.669190179020962e-05, |
| "loss": 1.1328, |
| "sparse_loss": 1.1328, |
| "step": 3680 |
| }, |
| { |
| "epoch": 0.5848877647802719, |
| "grad_norm": 19.522703170776367, |
| "learning_rate": 4.661891805227313e-05, |
| "loss": 1.1913, |
| "sparse_loss": 1.1913, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.5880493202655707, |
| "grad_norm": 16.189546585083008, |
| "learning_rate": 4.654519632108204e-05, |
| "loss": 1.0713, |
| "sparse_loss": 1.0713, |
| "step": 3720 |
| }, |
| { |
| "epoch": 0.5912108757508694, |
| "grad_norm": 2.9793543815612793, |
| "learning_rate": 4.6470739113235026e-05, |
| "loss": 1.1421, |
| "sparse_loss": 1.1421, |
| "step": 3740 |
| }, |
| { |
| "epoch": 0.5943724312361682, |
| "grad_norm": 3.7264773845672607, |
| "learning_rate": 4.639554897043731e-05, |
| "loss": 0.9968, |
| "sparse_loss": 0.9968, |
| "step": 3760 |
| }, |
| { |
| "epoch": 0.597533986721467, |
| "grad_norm": 5.013382434844971, |
| "learning_rate": 4.6319628459413946e-05, |
| "loss": 1.0329, |
| "sparse_loss": 1.0329, |
| "step": 3780 |
| }, |
| { |
| "epoch": 0.6006955422067657, |
| "grad_norm": 7.1503705978393555, |
| "learning_rate": 4.6242980171822134e-05, |
| "loss": 1.079, |
| "sparse_loss": 1.079, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.6038570976920645, |
| "grad_norm": 3.3819286823272705, |
| "learning_rate": 4.6165606724162816e-05, |
| "loss": 1.0308, |
| "sparse_loss": 1.0308, |
| "step": 3820 |
| }, |
| { |
| "epoch": 0.6070186531773633, |
| "grad_norm": 4.328090667724609, |
| "learning_rate": 4.608751075769131e-05, |
| "loss": 1.1002, |
| "sparse_loss": 1.1002, |
| "step": 3840 |
| }, |
| { |
| "epoch": 0.610180208662662, |
| "grad_norm": 6.626924991607666, |
| "learning_rate": 4.600869493832718e-05, |
| "loss": 0.9787, |
| "sparse_loss": 0.9787, |
| "step": 3860 |
| }, |
| { |
| "epoch": 0.6133417641479608, |
| "grad_norm": 3.033689260482788, |
| "learning_rate": 4.592916195656322e-05, |
| "loss": 1.0471, |
| "sparse_loss": 1.0471, |
| "step": 3880 |
| }, |
| { |
| "epoch": 0.6165033196332595, |
| "grad_norm": 3.961258888244629, |
| "learning_rate": 4.5848914527373574e-05, |
| "loss": 1.1687, |
| "sparse_loss": 1.1687, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.6196648751185583, |
| "grad_norm": 9.87937068939209, |
| "learning_rate": 4.576795539012114e-05, |
| "loss": 2.0557, |
| "sparse_loss": 2.0557, |
| "step": 3920 |
| }, |
| { |
| "epoch": 0.6228264306038571, |
| "grad_norm": 2.485781192779541, |
| "learning_rate": 4.568628730846397e-05, |
| "loss": 1.0667, |
| "sparse_loss": 1.0667, |
| "step": 3940 |
| }, |
| { |
| "epoch": 0.6259879860891558, |
| "grad_norm": 4.919963359832764, |
| "learning_rate": 4.560391307026097e-05, |
| "loss": 1.1894, |
| "sparse_loss": 1.1894, |
| "step": 3960 |
| }, |
| { |
| "epoch": 0.6291495415744547, |
| "grad_norm": 4.169412612915039, |
| "learning_rate": 4.5520835487476753e-05, |
| "loss": 1.072, |
| "sparse_loss": 1.072, |
| "step": 3980 |
| }, |
| { |
| "epoch": 0.6323110970597534, |
| "grad_norm": 2.8163318634033203, |
| "learning_rate": 4.5437057396085584e-05, |
| "loss": 1.0059, |
| "sparse_loss": 1.0059, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.6354726525450521, |
| "grad_norm": 4.585569381713867, |
| "learning_rate": 4.535258165597465e-05, |
| "loss": 0.9931, |
| "sparse_loss": 0.9931, |
| "step": 4020 |
| }, |
| { |
| "epoch": 0.638634208030351, |
| "grad_norm": 3.607029914855957, |
| "learning_rate": 4.526741115084636e-05, |
| "loss": 1.0642, |
| "sparse_loss": 1.0642, |
| "step": 4040 |
| }, |
| { |
| "epoch": 0.6417957635156497, |
| "grad_norm": 6.397791862487793, |
| "learning_rate": 4.518154878811997e-05, |
| "loss": 1.074, |
| "sparse_loss": 1.074, |
| "step": 4060 |
| }, |
| { |
| "epoch": 0.6449573190009484, |
| "grad_norm": 3.547840118408203, |
| "learning_rate": 4.509499749883226e-05, |
| "loss": 1.9425, |
| "sparse_loss": 1.9425, |
| "step": 4080 |
| }, |
| { |
| "epoch": 0.6481188744862473, |
| "grad_norm": 5.856851577758789, |
| "learning_rate": 4.5007760237537566e-05, |
| "loss": 0.9978, |
| "sparse_loss": 0.9978, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.651280429971546, |
| "grad_norm": 22.970752716064453, |
| "learning_rate": 4.491983998220686e-05, |
| "loss": 1.087, |
| "sparse_loss": 1.087, |
| "step": 4120 |
| }, |
| { |
| "epoch": 0.6544419854568447, |
| "grad_norm": 5.967113494873047, |
| "learning_rate": 4.483123973412611e-05, |
| "loss": 1.0515, |
| "sparse_loss": 1.0515, |
| "step": 4140 |
| }, |
| { |
| "epoch": 0.6576035409421436, |
| "grad_norm": 8.511434555053711, |
| "learning_rate": 4.474196251779381e-05, |
| "loss": 1.0739, |
| "sparse_loss": 1.0739, |
| "step": 4160 |
| }, |
| { |
| "epoch": 0.6607650964274423, |
| "grad_norm": 8.526429176330566, |
| "learning_rate": 4.465201138081778e-05, |
| "loss": 1.1908, |
| "sparse_loss": 1.1908, |
| "step": 4180 |
| }, |
| { |
| "epoch": 0.663926651912741, |
| "grad_norm": 5.918643951416016, |
| "learning_rate": 4.4561389393811096e-05, |
| "loss": 1.0785, |
| "sparse_loss": 1.0785, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.6670882073980399, |
| "grad_norm": 10.329548835754395, |
| "learning_rate": 4.4470099650287255e-05, |
| "loss": 0.9379, |
| "sparse_loss": 0.9379, |
| "step": 4220 |
| }, |
| { |
| "epoch": 0.6702497628833386, |
| "grad_norm": 2.3712220191955566, |
| "learning_rate": 4.4378145266554625e-05, |
| "loss": 0.9539, |
| "sparse_loss": 0.9539, |
| "step": 4240 |
| }, |
| { |
| "epoch": 0.6734113183686373, |
| "grad_norm": 3.1307120323181152, |
| "learning_rate": 4.428552938161002e-05, |
| "loss": 1.0695, |
| "sparse_loss": 1.0695, |
| "step": 4260 |
| }, |
| { |
| "epoch": 0.6765728738539362, |
| "grad_norm": 23.584779739379883, |
| "learning_rate": 4.419225515703155e-05, |
| "loss": 0.9849, |
| "sparse_loss": 0.9849, |
| "step": 4280 |
| }, |
| { |
| "epoch": 0.6797344293392349, |
| "grad_norm": 3.50978684425354, |
| "learning_rate": 4.4098325776870734e-05, |
| "loss": 1.2731, |
| "sparse_loss": 1.2731, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.6828959848245336, |
| "grad_norm": 28.15680694580078, |
| "learning_rate": 4.400374444754376e-05, |
| "loss": 1.1422, |
| "sparse_loss": 1.1422, |
| "step": 4320 |
| }, |
| { |
| "epoch": 0.6860575403098325, |
| "grad_norm": 4.0869293212890625, |
| "learning_rate": 4.3908514397722064e-05, |
| "loss": 1.1778, |
| "sparse_loss": 1.1778, |
| "step": 4340 |
| }, |
| { |
| "epoch": 0.6892190957951312, |
| "grad_norm": 2.486721992492676, |
| "learning_rate": 4.3812638878222095e-05, |
| "loss": 1.988, |
| "sparse_loss": 1.988, |
| "step": 4360 |
| }, |
| { |
| "epoch": 0.69238065128043, |
| "grad_norm": 33.50421142578125, |
| "learning_rate": 4.371612116189434e-05, |
| "loss": 1.2742, |
| "sparse_loss": 1.2742, |
| "step": 4380 |
| }, |
| { |
| "epoch": 0.6955422067657288, |
| "grad_norm": 3.6956710815429688, |
| "learning_rate": 4.361896454351162e-05, |
| "loss": 1.1552, |
| "sparse_loss": 1.1552, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.6987037622510275, |
| "grad_norm": 2.9968864917755127, |
| "learning_rate": 4.3521172339656616e-05, |
| "loss": 1.0634, |
| "sparse_loss": 1.0634, |
| "step": 4420 |
| }, |
| { |
| "epoch": 0.7018653177363263, |
| "grad_norm": 17.745704650878906, |
| "learning_rate": 4.342274788860863e-05, |
| "loss": 1.1205, |
| "sparse_loss": 1.1205, |
| "step": 4440 |
| }, |
| { |
| "epoch": 0.705026873221625, |
| "grad_norm": 3.0485663414001465, |
| "learning_rate": 4.332369455022965e-05, |
| "loss": 1.0362, |
| "sparse_loss": 1.0362, |
| "step": 4460 |
| }, |
| { |
| "epoch": 0.7081884287069238, |
| "grad_norm": 3.4497592449188232, |
| "learning_rate": 4.322401570584965e-05, |
| "loss": 0.9509, |
| "sparse_loss": 0.9509, |
| "step": 4480 |
| }, |
| { |
| "epoch": 0.7113499841922226, |
| "grad_norm": 2.4316234588623047, |
| "learning_rate": 4.312371475815116e-05, |
| "loss": 1.0206, |
| "sparse_loss": 1.0206, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.7145115396775213, |
| "grad_norm": 10.038222312927246, |
| "learning_rate": 4.3022795131053104e-05, |
| "loss": 1.1059, |
| "sparse_loss": 1.1059, |
| "step": 4520 |
| }, |
| { |
| "epoch": 0.7176730951628201, |
| "grad_norm": 4.645579814910889, |
| "learning_rate": 4.2921260269593954e-05, |
| "loss": 1.0915, |
| "sparse_loss": 1.0915, |
| "step": 4540 |
| }, |
| { |
| "epoch": 0.7208346506481189, |
| "grad_norm": 2.9513022899627686, |
| "learning_rate": 4.281911363981407e-05, |
| "loss": 1.3803, |
| "sparse_loss": 1.3803, |
| "step": 4560 |
| }, |
| { |
| "epoch": 0.7239962061334176, |
| "grad_norm": 6.830081939697266, |
| "learning_rate": 4.271635872863744e-05, |
| "loss": 1.3414, |
| "sparse_loss": 1.3414, |
| "step": 4580 |
| }, |
| { |
| "epoch": 0.7271577616187164, |
| "grad_norm": 3.441859245300293, |
| "learning_rate": 4.261299904375261e-05, |
| "loss": 1.785, |
| "sparse_loss": 1.785, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.7303193171040152, |
| "grad_norm": 4.937995433807373, |
| "learning_rate": 4.250903811349297e-05, |
| "loss": 0.93, |
| "sparse_loss": 0.93, |
| "step": 4620 |
| }, |
| { |
| "epoch": 0.7334808725893139, |
| "grad_norm": 3.705645799636841, |
| "learning_rate": 4.240447948671628e-05, |
| "loss": 1.0316, |
| "sparse_loss": 1.0316, |
| "step": 4640 |
| }, |
| { |
| "epoch": 0.7366424280746127, |
| "grad_norm": 56.22758865356445, |
| "learning_rate": 4.2299326732683555e-05, |
| "loss": 0.9974, |
| "sparse_loss": 0.9974, |
| "step": 4660 |
| }, |
| { |
| "epoch": 0.7398039835599115, |
| "grad_norm": 122.35603332519531, |
| "learning_rate": 4.219358344093719e-05, |
| "loss": 1.7038, |
| "sparse_loss": 1.7038, |
| "step": 4680 |
| }, |
| { |
| "epoch": 0.7429655390452102, |
| "grad_norm": 243.83538818359375, |
| "learning_rate": 4.208725322117848e-05, |
| "loss": 1.4334, |
| "sparse_loss": 1.4334, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.746127094530509, |
| "grad_norm": 908.5032958984375, |
| "learning_rate": 4.1980339703144325e-05, |
| "loss": 6.8806, |
| "sparse_loss": 6.8806, |
| "step": 4720 |
| }, |
| { |
| "epoch": 0.7492886500158078, |
| "grad_norm": 21.237464904785156, |
| "learning_rate": 4.1872846536483377e-05, |
| "loss": 2.4809, |
| "sparse_loss": 2.4809, |
| "step": 4740 |
| }, |
| { |
| "epoch": 0.7524502055011065, |
| "grad_norm": 5.8883466720581055, |
| "learning_rate": 4.176477739063146e-05, |
| "loss": 1.0461, |
| "sparse_loss": 1.0461, |
| "step": 4760 |
| }, |
| { |
| "epoch": 0.7556117609864053, |
| "grad_norm": 85.04512786865234, |
| "learning_rate": 4.165613595468624e-05, |
| "loss": 1.3042, |
| "sparse_loss": 1.3042, |
| "step": 4780 |
| }, |
| { |
| "epoch": 0.7587733164717041, |
| "grad_norm": 8.59842300415039, |
| "learning_rate": 4.1546925937281376e-05, |
| "loss": 1.8298, |
| "sparse_loss": 1.8298, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.7619348719570028, |
| "grad_norm": 31.37865447998047, |
| "learning_rate": 4.143715106645986e-05, |
| "loss": 1.4291, |
| "sparse_loss": 1.4291, |
| "step": 4820 |
| }, |
| { |
| "epoch": 0.7650964274423017, |
| "grad_norm": 3.250920534133911, |
| "learning_rate": 4.13268150895468e-05, |
| "loss": 1.3777, |
| "sparse_loss": 1.3777, |
| "step": 4840 |
| }, |
| { |
| "epoch": 0.7682579829276004, |
| "grad_norm": 3.320081949234009, |
| "learning_rate": 4.121592177302147e-05, |
| "loss": 1.1557, |
| "sparse_loss": 1.1557, |
| "step": 4860 |
| }, |
| { |
| "epoch": 0.7714195384128991, |
| "grad_norm": 12.010161399841309, |
| "learning_rate": 4.1104474902388734e-05, |
| "loss": 1.181, |
| "sparse_loss": 1.181, |
| "step": 4880 |
| }, |
| { |
| "epoch": 0.774581093898198, |
| "grad_norm": 4.192005157470703, |
| "learning_rate": 4.099247828204984e-05, |
| "loss": 1.0431, |
| "sparse_loss": 1.0431, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.7777426493834967, |
| "grad_norm": 94.05577087402344, |
| "learning_rate": 4.0879935735172526e-05, |
| "loss": 0.9924, |
| "sparse_loss": 0.9924, |
| "step": 4920 |
| }, |
| { |
| "epoch": 0.7809042048687954, |
| "grad_norm": 3.2286782264709473, |
| "learning_rate": 4.076685110356057e-05, |
| "loss": 1.2762, |
| "sparse_loss": 1.2762, |
| "step": 4940 |
| }, |
| { |
| "epoch": 0.7840657603540943, |
| "grad_norm": 3.522233486175537, |
| "learning_rate": 4.0653228247522545e-05, |
| "loss": 1.3096, |
| "sparse_loss": 1.3096, |
| "step": 4960 |
| }, |
| { |
| "epoch": 0.787227315839393, |
| "grad_norm": 5.314358711242676, |
| "learning_rate": 4.053907104574016e-05, |
| "loss": 1.2653, |
| "sparse_loss": 1.2653, |
| "step": 4980 |
| }, |
| { |
| "epoch": 0.7903888713246917, |
| "grad_norm": 12.498971939086914, |
| "learning_rate": 4.042438339513573e-05, |
| "loss": 1.1159, |
| "sparse_loss": 1.1159, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.7935504268099906, |
| "grad_norm": 656.21240234375, |
| "learning_rate": 4.030916921073926e-05, |
| "loss": 1.3001, |
| "sparse_loss": 1.3001, |
| "step": 5020 |
| }, |
| { |
| "epoch": 0.7967119822952893, |
| "grad_norm": 3.7802658081054688, |
| "learning_rate": 4.019343242555474e-05, |
| "loss": 0.9852, |
| "sparse_loss": 0.9852, |
| "step": 5040 |
| }, |
| { |
| "epoch": 0.799873537780588, |
| "grad_norm": 5.901795864105225, |
| "learning_rate": 4.00771769904259e-05, |
| "loss": 1.2979, |
| "sparse_loss": 1.2979, |
| "step": 5060 |
| }, |
| { |
| "epoch": 0.8030350932658868, |
| "grad_norm": 3.6312479972839355, |
| "learning_rate": 3.9960406873901335e-05, |
| "loss": 1.123, |
| "sparse_loss": 1.123, |
| "step": 5080 |
| }, |
| { |
| "epoch": 0.8061966487511856, |
| "grad_norm": 4.615361213684082, |
| "learning_rate": 3.984312606209904e-05, |
| "loss": 1.2087, |
| "sparse_loss": 1.2087, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.8093582042364843, |
| "grad_norm": 12.550694465637207, |
| "learning_rate": 3.9725338558570335e-05, |
| "loss": 0.9877, |
| "sparse_loss": 0.9877, |
| "step": 5120 |
| }, |
| { |
| "epoch": 0.8125197597217831, |
| "grad_norm": 14.886857986450195, |
| "learning_rate": 3.960704838416321e-05, |
| "loss": 1.1369, |
| "sparse_loss": 1.1369, |
| "step": 5140 |
| }, |
| { |
| "epoch": 0.8156813152070819, |
| "grad_norm": 23.565824508666992, |
| "learning_rate": 3.948825957688506e-05, |
| "loss": 1.5903, |
| "sparse_loss": 1.5903, |
| "step": 5160 |
| }, |
| { |
| "epoch": 0.8188428706923806, |
| "grad_norm": 90.38725280761719, |
| "learning_rate": 3.9368976191764806e-05, |
| "loss": 1.4377, |
| "sparse_loss": 1.4377, |
| "step": 5180 |
| }, |
| { |
| "epoch": 0.8220044261776794, |
| "grad_norm": 167.6644287109375, |
| "learning_rate": 3.924920230071456e-05, |
| "loss": 1.0149, |
| "sparse_loss": 1.0149, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.8251659816629782, |
| "grad_norm": 3.468456268310547, |
| "learning_rate": 3.912894199239052e-05, |
| "loss": 0.9692, |
| "sparse_loss": 0.9692, |
| "step": 5220 |
| }, |
| { |
| "epoch": 0.828327537148277, |
| "grad_norm": 53.09666442871094, |
| "learning_rate": 3.900819937205348e-05, |
| "loss": 1.0828, |
| "sparse_loss": 1.0828, |
| "step": 5240 |
| }, |
| { |
| "epoch": 0.8314890926335757, |
| "grad_norm": 16.443645477294922, |
| "learning_rate": 3.888697856142861e-05, |
| "loss": 1.5313, |
| "sparse_loss": 1.5313, |
| "step": 5260 |
| }, |
| { |
| "epoch": 0.8346506481188745, |
| "grad_norm": 4.44474458694458, |
| "learning_rate": 3.876528369856486e-05, |
| "loss": 0.9266, |
| "sparse_loss": 0.9266, |
| "step": 5280 |
| }, |
| { |
| "epoch": 0.8378122036041733, |
| "grad_norm": 11.000542640686035, |
| "learning_rate": 3.864311893769361e-05, |
| "loss": 1.0082, |
| "sparse_loss": 1.0082, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.840973759089472, |
| "grad_norm": 5.926991939544678, |
| "learning_rate": 3.85204884490869e-05, |
| "loss": 1.0804, |
| "sparse_loss": 1.0804, |
| "step": 5320 |
| }, |
| { |
| "epoch": 0.8441353145747708, |
| "grad_norm": 6.9386396408081055, |
| "learning_rate": 3.839739641891506e-05, |
| "loss": 1.0393, |
| "sparse_loss": 1.0393, |
| "step": 5340 |
| }, |
| { |
| "epoch": 0.8472968700600696, |
| "grad_norm": 4.770463943481445, |
| "learning_rate": 3.8273847049103816e-05, |
| "loss": 1.0193, |
| "sparse_loss": 1.0193, |
| "step": 5360 |
| }, |
| { |
| "epoch": 0.8504584255453683, |
| "grad_norm": 15.692386627197266, |
| "learning_rate": 3.8149844557190855e-05, |
| "loss": 0.9763, |
| "sparse_loss": 0.9763, |
| "step": 5380 |
| }, |
| { |
| "epoch": 0.853619981030667, |
| "grad_norm": 9.34354019165039, |
| "learning_rate": 3.802539317618185e-05, |
| "loss": 1.7999, |
| "sparse_loss": 1.7999, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.8567815365159659, |
| "grad_norm": 22.36298942565918, |
| "learning_rate": 3.790049715440592e-05, |
| "loss": 0.9753, |
| "sparse_loss": 0.9753, |
| "step": 5420 |
| }, |
| { |
| "epoch": 0.8599430920012646, |
| "grad_norm": 6.282666206359863, |
| "learning_rate": 3.7775160755370695e-05, |
| "loss": 0.8948, |
| "sparse_loss": 0.8948, |
| "step": 5440 |
| }, |
| { |
| "epoch": 0.8631046474865633, |
| "grad_norm": 7.10538911819458, |
| "learning_rate": 3.764938825761671e-05, |
| "loss": 0.9001, |
| "sparse_loss": 0.9001, |
| "step": 5460 |
| }, |
| { |
| "epoch": 0.8662662029718622, |
| "grad_norm": 12.88032054901123, |
| "learning_rate": 3.7523183954571336e-05, |
| "loss": 1.2805, |
| "sparse_loss": 1.2805, |
| "step": 5480 |
| }, |
| { |
| "epoch": 0.8694277584571609, |
| "grad_norm": 9.392233848571777, |
| "learning_rate": 3.739655215440228e-05, |
| "loss": 0.8856, |
| "sparse_loss": 0.8856, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.8725893139424596, |
| "grad_norm": 40.612823486328125, |
| "learning_rate": 3.726949717987048e-05, |
| "loss": 0.9528, |
| "sparse_loss": 0.9528, |
| "step": 5520 |
| }, |
| { |
| "epoch": 0.8757508694277585, |
| "grad_norm": 8.556236267089844, |
| "learning_rate": 3.714202336818252e-05, |
| "loss": 1.1261, |
| "sparse_loss": 1.1261, |
| "step": 5540 |
| }, |
| { |
| "epoch": 0.8789124249130572, |
| "grad_norm": 5.332094192504883, |
| "learning_rate": 3.701413507084264e-05, |
| "loss": 1.0244, |
| "sparse_loss": 1.0244, |
| "step": 5560 |
| }, |
| { |
| "epoch": 0.8820739803983559, |
| "grad_norm": 10.452747344970703, |
| "learning_rate": 3.6885836653504124e-05, |
| "loss": 0.9389, |
| "sparse_loss": 0.9389, |
| "step": 5580 |
| }, |
| { |
| "epoch": 0.8852355358836548, |
| "grad_norm": 23.091854095458984, |
| "learning_rate": 3.675713249582031e-05, |
| "loss": 1.1378, |
| "sparse_loss": 1.1378, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.8883970913689535, |
| "grad_norm": 21.80746078491211, |
| "learning_rate": 3.662802699129508e-05, |
| "loss": 0.9005, |
| "sparse_loss": 0.9005, |
| "step": 5620 |
| }, |
| { |
| "epoch": 0.8915586468542523, |
| "grad_norm": 27.802932739257812, |
| "learning_rate": 3.649852454713286e-05, |
| "loss": 1.0643, |
| "sparse_loss": 1.0643, |
| "step": 5640 |
| }, |
| { |
| "epoch": 0.8947202023395511, |
| "grad_norm": 47.525020599365234, |
| "learning_rate": 3.636862958408818e-05, |
| "loss": 1.0409, |
| "sparse_loss": 1.0409, |
| "step": 5660 |
| }, |
| { |
| "epoch": 0.8978817578248498, |
| "grad_norm": 10.603707313537598, |
| "learning_rate": 3.6238346536314815e-05, |
| "loss": 1.1111, |
| "sparse_loss": 1.1111, |
| "step": 5680 |
| }, |
| { |
| "epoch": 0.9010433133101486, |
| "grad_norm": 4.220586776733398, |
| "learning_rate": 3.610767985121433e-05, |
| "loss": 1.527, |
| "sparse_loss": 1.527, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.9042048687954474, |
| "grad_norm": 27.324771881103516, |
| "learning_rate": 3.597663398928435e-05, |
| "loss": 1.2022, |
| "sparse_loss": 1.2022, |
| "step": 5720 |
| }, |
| { |
| "epoch": 0.9073664242807461, |
| "grad_norm": 10.594104766845703, |
| "learning_rate": 3.584521342396623e-05, |
| "loss": 1.134, |
| "sparse_loss": 1.134, |
| "step": 5740 |
| }, |
| { |
| "epoch": 0.9105279797660449, |
| "grad_norm": 9.350955963134766, |
| "learning_rate": 3.5713422641492355e-05, |
| "loss": 1.1128, |
| "sparse_loss": 1.1128, |
| "step": 5760 |
| }, |
| { |
| "epoch": 0.9136895352513437, |
| "grad_norm": 30.16716957092285, |
| "learning_rate": 3.558126614073305e-05, |
| "loss": 1.4697, |
| "sparse_loss": 1.4697, |
| "step": 5780 |
| }, |
| { |
| "epoch": 0.9168510907366424, |
| "grad_norm": 144.28836059570312, |
| "learning_rate": 3.544874843304294e-05, |
| "loss": 1.1559, |
| "sparse_loss": 1.1559, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.9200126462219412, |
| "grad_norm": 5.698886871337891, |
| "learning_rate": 3.5315874042107e-05, |
| "loss": 1.2828, |
| "sparse_loss": 1.2828, |
| "step": 5820 |
| }, |
| { |
| "epoch": 0.92317420170724, |
| "grad_norm": 10.48292350769043, |
| "learning_rate": 3.518264750378606e-05, |
| "loss": 1.2694, |
| "sparse_loss": 1.2694, |
| "step": 5840 |
| }, |
| { |
| "epoch": 0.9263357571925387, |
| "grad_norm": 4.3222575187683105, |
| "learning_rate": 3.5049073365962065e-05, |
| "loss": 1.1258, |
| "sparse_loss": 1.1258, |
| "step": 5860 |
| }, |
| { |
| "epoch": 0.9294973126778375, |
| "grad_norm": 4.344067573547363, |
| "learning_rate": 3.491515618838275e-05, |
| "loss": 1.1675, |
| "sparse_loss": 1.1675, |
| "step": 5880 |
| }, |
| { |
| "epoch": 0.9326588681631363, |
| "grad_norm": 4.376672744750977, |
| "learning_rate": 3.4780900542506e-05, |
| "loss": 1.1709, |
| "sparse_loss": 1.1709, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.935820423648435, |
| "grad_norm": 3.426212787628174, |
| "learning_rate": 3.464631101134385e-05, |
| "loss": 1.5698, |
| "sparse_loss": 1.5698, |
| "step": 5920 |
| }, |
| { |
| "epoch": 0.9389819791337338, |
| "grad_norm": 9.606375694274902, |
| "learning_rate": 3.451139218930595e-05, |
| "loss": 1.0853, |
| "sparse_loss": 1.0853, |
| "step": 5940 |
| }, |
| { |
| "epoch": 0.9421435346190326, |
| "grad_norm": 6.818150520324707, |
| "learning_rate": 3.43761486820428e-05, |
| "loss": 1.4761, |
| "sparse_loss": 1.4761, |
| "step": 5960 |
| }, |
| { |
| "epoch": 0.9453050901043313, |
| "grad_norm": 5.038766860961914, |
| "learning_rate": 3.424058510628849e-05, |
| "loss": 1.0478, |
| "sparse_loss": 1.0478, |
| "step": 5980 |
| }, |
| { |
| "epoch": 0.9484666455896301, |
| "grad_norm": 3.961850881576538, |
| "learning_rate": 3.410470608970313e-05, |
| "loss": 0.9513, |
| "sparse_loss": 0.9513, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.9516282010749288, |
| "grad_norm": 3.9220104217529297, |
| "learning_rate": 3.396851627071484e-05, |
| "loss": 0.9381, |
| "sparse_loss": 0.9381, |
| "step": 6020 |
| }, |
| { |
| "epoch": 0.9547897565602277, |
| "grad_norm": 5.577884674072266, |
| "learning_rate": 3.383202029836145e-05, |
| "loss": 1.0799, |
| "sparse_loss": 1.0799, |
| "step": 6040 |
| }, |
| { |
| "epoch": 0.9579513120455264, |
| "grad_norm": 6.933228015899658, |
| "learning_rate": 3.369522283213179e-05, |
| "loss": 1.5161, |
| "sparse_loss": 1.5161, |
| "step": 6060 |
| }, |
| { |
| "epoch": 0.9611128675308251, |
| "grad_norm": 3.897125005722046, |
| "learning_rate": 3.3558128541806586e-05, |
| "loss": 1.0702, |
| "sparse_loss": 1.0702, |
| "step": 6080 |
| }, |
| { |
| "epoch": 0.964274423016124, |
| "grad_norm": 4.546823024749756, |
| "learning_rate": 3.3420742107299117e-05, |
| "loss": 1.5374, |
| "sparse_loss": 1.5374, |
| "step": 6100 |
| }, |
| { |
| "epoch": 0.9674359785014227, |
| "grad_norm": 4.492236614227295, |
| "learning_rate": 3.328306821849542e-05, |
| "loss": 1.524, |
| "sparse_loss": 1.524, |
| "step": 6120 |
| }, |
| { |
| "epoch": 0.9705975339867214, |
| "grad_norm": 3.2623276710510254, |
| "learning_rate": 3.314511157509422e-05, |
| "loss": 1.0181, |
| "sparse_loss": 1.0181, |
| "step": 6140 |
| }, |
| { |
| "epoch": 0.9737590894720203, |
| "grad_norm": 5.898326396942139, |
| "learning_rate": 3.300687688644644e-05, |
| "loss": 1.0289, |
| "sparse_loss": 1.0289, |
| "step": 6160 |
| }, |
| { |
| "epoch": 0.976920644957319, |
| "grad_norm": 3.5350193977355957, |
| "learning_rate": 3.286836887139454e-05, |
| "loss": 1.0142, |
| "sparse_loss": 1.0142, |
| "step": 6180 |
| }, |
| { |
| "epoch": 0.9800822004426177, |
| "grad_norm": 3.6061811447143555, |
| "learning_rate": 3.272959225811132e-05, |
| "loss": 0.8989, |
| "sparse_loss": 0.8989, |
| "step": 6200 |
| }, |
| { |
| "epoch": 0.9832437559279166, |
| "grad_norm": 4.5806884765625, |
| "learning_rate": 3.259055178393859e-05, |
| "loss": 0.9607, |
| "sparse_loss": 0.9607, |
| "step": 6220 |
| }, |
| { |
| "epoch": 0.9864053114132153, |
| "grad_norm": 5.311800003051758, |
| "learning_rate": 3.2451252195225476e-05, |
| "loss": 0.8816, |
| "sparse_loss": 0.8816, |
| "step": 6240 |
| }, |
| { |
| "epoch": 0.989566866898514, |
| "grad_norm": 4.409742832183838, |
| "learning_rate": 3.231169824716628e-05, |
| "loss": 0.9233, |
| "sparse_loss": 0.9233, |
| "step": 6260 |
| }, |
| { |
| "epoch": 0.9927284223838129, |
| "grad_norm": 4.337621212005615, |
| "learning_rate": 3.2171894703638306e-05, |
| "loss": 0.8896, |
| "sparse_loss": 0.8896, |
| "step": 6280 |
| }, |
| { |
| "epoch": 0.9958899778691116, |
| "grad_norm": 3.0346786975860596, |
| "learning_rate": 3.2031846337039105e-05, |
| "loss": 1.0924, |
| "sparse_loss": 1.0924, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.9990515333544103, |
| "grad_norm": 8.614317893981934, |
| "learning_rate": 3.189155792812366e-05, |
| "loss": 0.968, |
| "sparse_loss": 0.968, |
| "step": 6320 |
| }, |
| { |
| "epoch": 1.0022130888397092, |
| "grad_norm": 4.639912128448486, |
| "learning_rate": 3.175103426584113e-05, |
| "loss": 0.909, |
| "sparse_loss": 0.909, |
| "step": 6340 |
| }, |
| { |
| "epoch": 1.005374644325008, |
| "grad_norm": 3.5092694759368896, |
| "learning_rate": 3.161028014717138e-05, |
| "loss": 0.9127, |
| "sparse_loss": 0.9127, |
| "step": 6360 |
| }, |
| { |
| "epoch": 1.0085361998103066, |
| "grad_norm": 4.135300159454346, |
| "learning_rate": 3.146930037696127e-05, |
| "loss": 0.9888, |
| "sparse_loss": 0.9888, |
| "step": 6380 |
| }, |
| { |
| "epoch": 1.0116977552956055, |
| "grad_norm": 3.9154438972473145, |
| "learning_rate": 3.1328099767760584e-05, |
| "loss": 0.9214, |
| "sparse_loss": 0.9214, |
| "step": 6400 |
| }, |
| { |
| "epoch": 1.0148593107809043, |
| "grad_norm": 10.022296905517578, |
| "learning_rate": 3.118668313965775e-05, |
| "loss": 1.0435, |
| "sparse_loss": 1.0435, |
| "step": 6420 |
| }, |
| { |
| "epoch": 1.018020866266203, |
| "grad_norm": 3.800605535507202, |
| "learning_rate": 3.1045055320115356e-05, |
| "loss": 1.0115, |
| "sparse_loss": 1.0115, |
| "step": 6440 |
| }, |
| { |
| "epoch": 1.0211824217515018, |
| "grad_norm": 4.005403518676758, |
| "learning_rate": 3.090322114380528e-05, |
| "loss": 0.9155, |
| "sparse_loss": 0.9155, |
| "step": 6460 |
| }, |
| { |
| "epoch": 1.0243439772368006, |
| "grad_norm": 8.109750747680664, |
| "learning_rate": 3.076118545244371e-05, |
| "loss": 0.7896, |
| "sparse_loss": 0.7896, |
| "step": 6480 |
| }, |
| { |
| "epoch": 1.0275055327220992, |
| "grad_norm": 5.5729875564575195, |
| "learning_rate": 3.0618953094625856e-05, |
| "loss": 0.8496, |
| "sparse_loss": 0.8496, |
| "step": 6500 |
| }, |
| { |
| "epoch": 1.030667088207398, |
| "grad_norm": 3.2701199054718018, |
| "learning_rate": 3.0476528925660382e-05, |
| "loss": 0.8769, |
| "sparse_loss": 0.8769, |
| "step": 6520 |
| }, |
| { |
| "epoch": 1.033828643692697, |
| "grad_norm": 3.7248306274414062, |
| "learning_rate": 3.033391780740374e-05, |
| "loss": 0.8401, |
| "sparse_loss": 0.8401, |
| "step": 6540 |
| }, |
| { |
| "epoch": 1.0369901991779955, |
| "grad_norm": 5.966579437255859, |
| "learning_rate": 3.019112460809415e-05, |
| "loss": 0.9762, |
| "sparse_loss": 0.9762, |
| "step": 6560 |
| }, |
| { |
| "epoch": 1.0401517546632943, |
| "grad_norm": 4.336265563964844, |
| "learning_rate": 3.0048154202185452e-05, |
| "loss": 0.8426, |
| "sparse_loss": 0.8426, |
| "step": 6580 |
| }, |
| { |
| "epoch": 1.0433133101485932, |
| "grad_norm": 3.8256428241729736, |
| "learning_rate": 2.9905011470180683e-05, |
| "loss": 0.8695, |
| "sparse_loss": 0.8695, |
| "step": 6600 |
| }, |
| { |
| "epoch": 1.0464748656338918, |
| "grad_norm": 4.434247016906738, |
| "learning_rate": 2.9761701298465465e-05, |
| "loss": 0.8763, |
| "sparse_loss": 0.8763, |
| "step": 6620 |
| }, |
| { |
| "epoch": 1.0496364211191906, |
| "grad_norm": 6.489551544189453, |
| "learning_rate": 2.9618228579141244e-05, |
| "loss": 0.9235, |
| "sparse_loss": 0.9235, |
| "step": 6640 |
| }, |
| { |
| "epoch": 1.0527979766044895, |
| "grad_norm": 4.415309906005859, |
| "learning_rate": 2.9474598209858262e-05, |
| "loss": 0.881, |
| "sparse_loss": 0.881, |
| "step": 6660 |
| }, |
| { |
| "epoch": 1.055959532089788, |
| "grad_norm": 3.765221357345581, |
| "learning_rate": 2.9330815093648344e-05, |
| "loss": 0.9031, |
| "sparse_loss": 0.9031, |
| "step": 6680 |
| }, |
| { |
| "epoch": 1.059121087575087, |
| "grad_norm": 4.988484859466553, |
| "learning_rate": 2.9186884138757596e-05, |
| "loss": 0.8607, |
| "sparse_loss": 0.8607, |
| "step": 6700 |
| }, |
| { |
| "epoch": 1.0622826430603858, |
| "grad_norm": 3.8098223209381104, |
| "learning_rate": 2.9042810258478785e-05, |
| "loss": 0.8593, |
| "sparse_loss": 0.8593, |
| "step": 6720 |
| }, |
| { |
| "epoch": 1.0654441985456844, |
| "grad_norm": 3.849292755126953, |
| "learning_rate": 2.8898598370983642e-05, |
| "loss": 0.9486, |
| "sparse_loss": 0.9486, |
| "step": 6740 |
| }, |
| { |
| "epoch": 1.0686057540309832, |
| "grad_norm": 4.708007335662842, |
| "learning_rate": 2.8754253399154995e-05, |
| "loss": 0.9008, |
| "sparse_loss": 0.9008, |
| "step": 6760 |
| }, |
| { |
| "epoch": 1.071767309516282, |
| "grad_norm": 3.5177364349365234, |
| "learning_rate": 2.8609780270418684e-05, |
| "loss": 0.8607, |
| "sparse_loss": 0.8607, |
| "step": 6780 |
| }, |
| { |
| "epoch": 1.0749288650015807, |
| "grad_norm": 4.5687785148620605, |
| "learning_rate": 2.846518391657538e-05, |
| "loss": 0.9738, |
| "sparse_loss": 0.9738, |
| "step": 6800 |
| }, |
| { |
| "epoch": 1.0780904204868795, |
| "grad_norm": 3.226118326187134, |
| "learning_rate": 2.832046927363221e-05, |
| "loss": 0.9142, |
| "sparse_loss": 0.9142, |
| "step": 6820 |
| }, |
| { |
| "epoch": 1.0812519759721784, |
| "grad_norm": 7.462769031524658, |
| "learning_rate": 2.8175641281634285e-05, |
| "loss": 0.9307, |
| "sparse_loss": 0.9307, |
| "step": 6840 |
| }, |
| { |
| "epoch": 1.084413531457477, |
| "grad_norm": 4.55466365814209, |
| "learning_rate": 2.8030704884496056e-05, |
| "loss": 0.8854, |
| "sparse_loss": 0.8854, |
| "step": 6860 |
| }, |
| { |
| "epoch": 1.0875750869427758, |
| "grad_norm": 3.169097661972046, |
| "learning_rate": 2.7885665029832515e-05, |
| "loss": 0.8043, |
| "sparse_loss": 0.8043, |
| "step": 6880 |
| }, |
| { |
| "epoch": 1.0907366424280747, |
| "grad_norm": 4.427898406982422, |
| "learning_rate": 2.7740526668790355e-05, |
| "loss": 0.8476, |
| "sparse_loss": 0.8476, |
| "step": 6900 |
| }, |
| { |
| "epoch": 1.0938981979133733, |
| "grad_norm": 3.025594711303711, |
| "learning_rate": 2.7595294755878914e-05, |
| "loss": 0.811, |
| "sparse_loss": 0.811, |
| "step": 6920 |
| }, |
| { |
| "epoch": 1.0970597533986721, |
| "grad_norm": 5.815234184265137, |
| "learning_rate": 2.744997424880107e-05, |
| "loss": 0.8351, |
| "sparse_loss": 0.8351, |
| "step": 6940 |
| }, |
| { |
| "epoch": 1.100221308883971, |
| "grad_norm": 3.9317593574523926, |
| "learning_rate": 2.7304570108283978e-05, |
| "loss": 0.8359, |
| "sparse_loss": 0.8359, |
| "step": 6960 |
| }, |
| { |
| "epoch": 1.1033828643692696, |
| "grad_norm": 2.971867799758911, |
| "learning_rate": 2.715908729790974e-05, |
| "loss": 0.859, |
| "sparse_loss": 0.859, |
| "step": 6980 |
| }, |
| { |
| "epoch": 1.1065444198545684, |
| "grad_norm": 5.231795787811279, |
| "learning_rate": 2.701353078394599e-05, |
| "loss": 0.9768, |
| "sparse_loss": 0.9768, |
| "step": 7000 |
| }, |
| { |
| "epoch": 1.1097059753398673, |
| "grad_norm": 5.067847728729248, |
| "learning_rate": 2.686790553517632e-05, |
| "loss": 0.7727, |
| "sparse_loss": 0.7727, |
| "step": 7020 |
| }, |
| { |
| "epoch": 1.112867530825166, |
| "grad_norm": 3.353957414627075, |
| "learning_rate": 2.6722216522730693e-05, |
| "loss": 0.8607, |
| "sparse_loss": 0.8607, |
| "step": 7040 |
| }, |
| { |
| "epoch": 1.1160290863104647, |
| "grad_norm": 3.633568048477173, |
| "learning_rate": 2.657646871991575e-05, |
| "loss": 0.8446, |
| "sparse_loss": 0.8446, |
| "step": 7060 |
| }, |
| { |
| "epoch": 1.1191906417957636, |
| "grad_norm": 4.36560583114624, |
| "learning_rate": 2.6430667102044994e-05, |
| "loss": 1.0285, |
| "sparse_loss": 1.0285, |
| "step": 7080 |
| }, |
| { |
| "epoch": 1.1223521972810624, |
| "grad_norm": 3.966235637664795, |
| "learning_rate": 2.628481664626901e-05, |
| "loss": 0.7571, |
| "sparse_loss": 0.7571, |
| "step": 7100 |
| }, |
| { |
| "epoch": 1.125513752766361, |
| "grad_norm": 3.6655080318450928, |
| "learning_rate": 2.6138922331405545e-05, |
| "loss": 0.7987, |
| "sparse_loss": 0.7987, |
| "step": 7120 |
| }, |
| { |
| "epoch": 1.1286753082516598, |
| "grad_norm": 4.856502056121826, |
| "learning_rate": 2.5992989137769512e-05, |
| "loss": 0.8789, |
| "sparse_loss": 0.8789, |
| "step": 7140 |
| }, |
| { |
| "epoch": 1.1318368637369587, |
| "grad_norm": 6.328119277954102, |
| "learning_rate": 2.5847022047003016e-05, |
| "loss": 0.8377, |
| "sparse_loss": 0.8377, |
| "step": 7160 |
| }, |
| { |
| "epoch": 1.1349984192222573, |
| "grad_norm": 3.454359769821167, |
| "learning_rate": 2.5701026041905306e-05, |
| "loss": 0.7203, |
| "sparse_loss": 0.7203, |
| "step": 7180 |
| }, |
| { |
| "epoch": 1.1381599747075561, |
| "grad_norm": 3.939906120300293, |
| "learning_rate": 2.555500610626264e-05, |
| "loss": 0.8824, |
| "sparse_loss": 0.8824, |
| "step": 7200 |
| }, |
| { |
| "epoch": 1.141321530192855, |
| "grad_norm": 2.77530837059021, |
| "learning_rate": 2.5408967224678203e-05, |
| "loss": 0.909, |
| "sparse_loss": 0.909, |
| "step": 7220 |
| }, |
| { |
| "epoch": 1.1444830856781536, |
| "grad_norm": 4.3749165534973145, |
| "learning_rate": 2.5262914382401908e-05, |
| "loss": 0.8797, |
| "sparse_loss": 0.8797, |
| "step": 7240 |
| }, |
| { |
| "epoch": 1.1476446411634524, |
| "grad_norm": 3.943511724472046, |
| "learning_rate": 2.5116852565160253e-05, |
| "loss": 0.7876, |
| "sparse_loss": 0.7876, |
| "step": 7260 |
| }, |
| { |
| "epoch": 1.1508061966487513, |
| "grad_norm": 4.530856132507324, |
| "learning_rate": 2.4970786758986098e-05, |
| "loss": 0.8024, |
| "sparse_loss": 0.8024, |
| "step": 7280 |
| }, |
| { |
| "epoch": 1.15396775213405, |
| "grad_norm": 2.65533185005188, |
| "learning_rate": 2.482472195004847e-05, |
| "loss": 0.8083, |
| "sparse_loss": 0.8083, |
| "step": 7300 |
| }, |
| { |
| "epoch": 1.1571293076193487, |
| "grad_norm": 4.328652858734131, |
| "learning_rate": 2.4678663124482358e-05, |
| "loss": 0.8453, |
| "sparse_loss": 0.8453, |
| "step": 7320 |
| }, |
| { |
| "epoch": 1.1602908631046476, |
| "grad_norm": 3.4754040241241455, |
| "learning_rate": 2.4532615268218503e-05, |
| "loss": 0.844, |
| "sparse_loss": 0.844, |
| "step": 7340 |
| }, |
| { |
| "epoch": 1.1634524185899462, |
| "grad_norm": 8.937326431274414, |
| "learning_rate": 2.438658336681319e-05, |
| "loss": 0.84, |
| "sparse_loss": 0.84, |
| "step": 7360 |
| }, |
| { |
| "epoch": 1.166613974075245, |
| "grad_norm": 3.833845376968384, |
| "learning_rate": 2.4240572405278065e-05, |
| "loss": 0.8231, |
| "sparse_loss": 0.8231, |
| "step": 7380 |
| }, |
| { |
| "epoch": 1.1697755295605439, |
| "grad_norm": 4.556502342224121, |
| "learning_rate": 2.4094587367909942e-05, |
| "loss": 0.9652, |
| "sparse_loss": 0.9652, |
| "step": 7400 |
| }, |
| { |
| "epoch": 1.1729370850458425, |
| "grad_norm": 5.612978458404541, |
| "learning_rate": 2.394863323812072e-05, |
| "loss": 0.8199, |
| "sparse_loss": 0.8199, |
| "step": 7420 |
| }, |
| { |
| "epoch": 1.1760986405311413, |
| "grad_norm": 2.8453385829925537, |
| "learning_rate": 2.3802714998267177e-05, |
| "loss": 0.8569, |
| "sparse_loss": 0.8569, |
| "step": 7440 |
| }, |
| { |
| "epoch": 1.1792601960164402, |
| "grad_norm": 4.4765753746032715, |
| "learning_rate": 2.365683762948094e-05, |
| "loss": 0.8032, |
| "sparse_loss": 0.8032, |
| "step": 7460 |
| }, |
| { |
| "epoch": 1.1824217515017388, |
| "grad_norm": 11.731705665588379, |
| "learning_rate": 2.3511006111498486e-05, |
| "loss": 0.7358, |
| "sparse_loss": 0.7358, |
| "step": 7480 |
| }, |
| { |
| "epoch": 1.1855833069870376, |
| "grad_norm": 4.292891025543213, |
| "learning_rate": 2.3365225422491045e-05, |
| "loss": 0.8545, |
| "sparse_loss": 0.8545, |
| "step": 7500 |
| }, |
| { |
| "epoch": 1.1887448624723365, |
| "grad_norm": 5.5326151847839355, |
| "learning_rate": 2.3219500538894796e-05, |
| "loss": 0.8115, |
| "sparse_loss": 0.8115, |
| "step": 7520 |
| }, |
| { |
| "epoch": 1.191906417957635, |
| "grad_norm": 4.541591644287109, |
| "learning_rate": 2.307383643524085e-05, |
| "loss": 0.8587, |
| "sparse_loss": 0.8587, |
| "step": 7540 |
| }, |
| { |
| "epoch": 1.195067973442934, |
| "grad_norm": 6.370118141174316, |
| "learning_rate": 2.292823808398554e-05, |
| "loss": 0.7829, |
| "sparse_loss": 0.7829, |
| "step": 7560 |
| }, |
| { |
| "epoch": 1.1982295289282328, |
| "grad_norm": 3.750469923019409, |
| "learning_rate": 2.2782710455340666e-05, |
| "loss": 0.8701, |
| "sparse_loss": 0.8701, |
| "step": 7580 |
| }, |
| { |
| "epoch": 1.2013910844135314, |
| "grad_norm": 4.589087009429932, |
| "learning_rate": 2.2637258517103754e-05, |
| "loss": 0.8066, |
| "sparse_loss": 0.8066, |
| "step": 7600 |
| }, |
| { |
| "epoch": 1.2045526398988302, |
| "grad_norm": 4.9637675285339355, |
| "learning_rate": 2.249188723448859e-05, |
| "loss": 0.8028, |
| "sparse_loss": 0.8028, |
| "step": 7620 |
| }, |
| { |
| "epoch": 1.207714195384129, |
| "grad_norm": 4.271427631378174, |
| "learning_rate": 2.2346601569955622e-05, |
| "loss": 0.8269, |
| "sparse_loss": 0.8269, |
| "step": 7640 |
| }, |
| { |
| "epoch": 1.2108757508694277, |
| "grad_norm": 4.1753411293029785, |
| "learning_rate": 2.2201406483042592e-05, |
| "loss": 0.8146, |
| "sparse_loss": 0.8146, |
| "step": 7660 |
| }, |
| { |
| "epoch": 1.2140373063547265, |
| "grad_norm": 6.351126194000244, |
| "learning_rate": 2.205630693019529e-05, |
| "loss": 0.7742, |
| "sparse_loss": 0.7742, |
| "step": 7680 |
| }, |
| { |
| "epoch": 1.2171988618400253, |
| "grad_norm": 3.95145320892334, |
| "learning_rate": 2.1911307864598253e-05, |
| "loss": 0.8023, |
| "sparse_loss": 0.8023, |
| "step": 7700 |
| }, |
| { |
| "epoch": 1.220360417325324, |
| "grad_norm": 5.053138256072998, |
| "learning_rate": 2.1766414236005795e-05, |
| "loss": 0.8261, |
| "sparse_loss": 0.8261, |
| "step": 7720 |
| }, |
| { |
| "epoch": 1.2235219728106228, |
| "grad_norm": 4.380322456359863, |
| "learning_rate": 2.162163099057295e-05, |
| "loss": 0.8389, |
| "sparse_loss": 0.8389, |
| "step": 7740 |
| }, |
| { |
| "epoch": 1.2266835282959216, |
| "grad_norm": 4.750369548797607, |
| "learning_rate": 2.1476963070686658e-05, |
| "loss": 0.8576, |
| "sparse_loss": 0.8576, |
| "step": 7760 |
| }, |
| { |
| "epoch": 1.2298450837812203, |
| "grad_norm": 2.2098541259765625, |
| "learning_rate": 2.1332415414797083e-05, |
| "loss": 0.764, |
| "sparse_loss": 0.764, |
| "step": 7780 |
| }, |
| { |
| "epoch": 1.233006639266519, |
| "grad_norm": 5.421565532684326, |
| "learning_rate": 2.1187992957248975e-05, |
| "loss": 0.9024, |
| "sparse_loss": 0.9024, |
| "step": 7800 |
| }, |
| { |
| "epoch": 1.236168194751818, |
| "grad_norm": 3.557370901107788, |
| "learning_rate": 2.1043700628113274e-05, |
| "loss": 0.8104, |
| "sparse_loss": 0.8104, |
| "step": 7820 |
| }, |
| { |
| "epoch": 1.2393297502371166, |
| "grad_norm": 4.613358020782471, |
| "learning_rate": 2.0899543353018792e-05, |
| "loss": 0.769, |
| "sparse_loss": 0.769, |
| "step": 7840 |
| }, |
| { |
| "epoch": 1.2424913057224154, |
| "grad_norm": 3.3156280517578125, |
| "learning_rate": 2.0755526052984048e-05, |
| "loss": 0.7804, |
| "sparse_loss": 0.7804, |
| "step": 7860 |
| }, |
| { |
| "epoch": 1.2456528612077142, |
| "grad_norm": 6.744318008422852, |
| "learning_rate": 2.0611653644249363e-05, |
| "loss": 0.8267, |
| "sparse_loss": 0.8267, |
| "step": 7880 |
| }, |
| { |
| "epoch": 1.2488144166930129, |
| "grad_norm": 10.97127628326416, |
| "learning_rate": 2.0467931038108933e-05, |
| "loss": 0.7403, |
| "sparse_loss": 0.7403, |
| "step": 7900 |
| }, |
| { |
| "epoch": 1.2519759721783117, |
| "grad_norm": 3.974884271621704, |
| "learning_rate": 2.032436314074326e-05, |
| "loss": 0.8371, |
| "sparse_loss": 0.8371, |
| "step": 7920 |
| }, |
| { |
| "epoch": 1.2551375276636105, |
| "grad_norm": 10.700128555297852, |
| "learning_rate": 2.01809548530516e-05, |
| "loss": 0.965, |
| "sparse_loss": 0.965, |
| "step": 7940 |
| }, |
| { |
| "epoch": 1.2582990831489091, |
| "grad_norm": 4.8021626472473145, |
| "learning_rate": 2.003771107048474e-05, |
| "loss": 0.8832, |
| "sparse_loss": 0.8832, |
| "step": 7960 |
| }, |
| { |
| "epoch": 1.261460638634208, |
| "grad_norm": 4.239758014678955, |
| "learning_rate": 1.9894636682877812e-05, |
| "loss": 0.7371, |
| "sparse_loss": 0.7371, |
| "step": 7980 |
| }, |
| { |
| "epoch": 1.2646221941195068, |
| "grad_norm": 32.94401168823242, |
| "learning_rate": 1.9751736574283416e-05, |
| "loss": 0.8073, |
| "sparse_loss": 0.8073, |
| "step": 8000 |
| }, |
| { |
| "epoch": 1.2677837496048054, |
| "grad_norm": 3.893188238143921, |
| "learning_rate": 1.96090156228049e-05, |
| "loss": 0.8241, |
| "sparse_loss": 0.8241, |
| "step": 8020 |
| }, |
| { |
| "epoch": 1.2709453050901043, |
| "grad_norm": 6.187018871307373, |
| "learning_rate": 1.9466478700429793e-05, |
| "loss": 0.7952, |
| "sparse_loss": 0.7952, |
| "step": 8040 |
| }, |
| { |
| "epoch": 1.2741068605754031, |
| "grad_norm": 9.897286415100098, |
| "learning_rate": 1.932413067286355e-05, |
| "loss": 0.7955, |
| "sparse_loss": 0.7955, |
| "step": 8060 |
| }, |
| { |
| "epoch": 1.2772684160607017, |
| "grad_norm": 5.557135581970215, |
| "learning_rate": 1.9181976399363415e-05, |
| "loss": 0.8176, |
| "sparse_loss": 0.8176, |
| "step": 8080 |
| }, |
| { |
| "epoch": 1.2804299715460006, |
| "grad_norm": 4.779278755187988, |
| "learning_rate": 1.904002073257254e-05, |
| "loss": 0.7168, |
| "sparse_loss": 0.7168, |
| "step": 8100 |
| }, |
| { |
| "epoch": 1.2835915270312994, |
| "grad_norm": 5.089160919189453, |
| "learning_rate": 1.8898268518354383e-05, |
| "loss": 0.7675, |
| "sparse_loss": 0.7675, |
| "step": 8120 |
| }, |
| { |
| "epoch": 1.286753082516598, |
| "grad_norm": 4.806196212768555, |
| "learning_rate": 1.8756724595627207e-05, |
| "loss": 0.7554, |
| "sparse_loss": 0.7554, |
| "step": 8140 |
| }, |
| { |
| "epoch": 1.2899146380018969, |
| "grad_norm": 6.0005621910095215, |
| "learning_rate": 1.861539379619899e-05, |
| "loss": 0.8476, |
| "sparse_loss": 0.8476, |
| "step": 8160 |
| }, |
| { |
| "epoch": 1.2930761934871957, |
| "grad_norm": 7.122402191162109, |
| "learning_rate": 1.84742809446024e-05, |
| "loss": 0.8156, |
| "sparse_loss": 0.8156, |
| "step": 8180 |
| }, |
| { |
| "epoch": 1.2962377489724943, |
| "grad_norm": 3.730856418609619, |
| "learning_rate": 1.8333390857930144e-05, |
| "loss": 0.7345, |
| "sparse_loss": 0.7345, |
| "step": 8200 |
| }, |
| { |
| "epoch": 1.2993993044577932, |
| "grad_norm": 3.008673906326294, |
| "learning_rate": 1.8192728345670547e-05, |
| "loss": 0.7445, |
| "sparse_loss": 0.7445, |
| "step": 8220 |
| }, |
| { |
| "epoch": 1.302560859943092, |
| "grad_norm": 3.7446534633636475, |
| "learning_rate": 1.8052298209543315e-05, |
| "loss": 0.8129, |
| "sparse_loss": 0.8129, |
| "step": 8240 |
| }, |
| { |
| "epoch": 1.3057224154283908, |
| "grad_norm": 5.062664985656738, |
| "learning_rate": 1.7912105243335687e-05, |
| "loss": 0.8321, |
| "sparse_loss": 0.8321, |
| "step": 8260 |
| }, |
| { |
| "epoch": 1.3088839709136895, |
| "grad_norm": 6.619328022003174, |
| "learning_rate": 1.7772154232738745e-05, |
| "loss": 0.777, |
| "sparse_loss": 0.777, |
| "step": 8280 |
| }, |
| { |
| "epoch": 1.3120455263989883, |
| "grad_norm": 5.793715476989746, |
| "learning_rate": 1.763244995518406e-05, |
| "loss": 0.7601, |
| "sparse_loss": 0.7601, |
| "step": 8300 |
| }, |
| { |
| "epoch": 1.3152070818842871, |
| "grad_norm": 4.310614585876465, |
| "learning_rate": 1.749299717968063e-05, |
| "loss": 0.8386, |
| "sparse_loss": 0.8386, |
| "step": 8320 |
| }, |
| { |
| "epoch": 1.3183686373695858, |
| "grad_norm": 7.3589982986450195, |
| "learning_rate": 1.7353800666652046e-05, |
| "loss": 0.8023, |
| "sparse_loss": 0.8023, |
| "step": 8340 |
| }, |
| { |
| "epoch": 1.3215301928548846, |
| "grad_norm": 6.955898761749268, |
| "learning_rate": 1.721486516777402e-05, |
| "loss": 0.734, |
| "sparse_loss": 0.734, |
| "step": 8360 |
| }, |
| { |
| "epoch": 1.3246917483401834, |
| "grad_norm": 5.73208475112915, |
| "learning_rate": 1.707619542581215e-05, |
| "loss": 0.7604, |
| "sparse_loss": 0.7604, |
| "step": 8380 |
| }, |
| { |
| "epoch": 1.327853303825482, |
| "grad_norm": 17.0853271484375, |
| "learning_rate": 1.6937796174460044e-05, |
| "loss": 0.7662, |
| "sparse_loss": 0.7662, |
| "step": 8400 |
| }, |
| { |
| "epoch": 1.331014859310781, |
| "grad_norm": 3.7031631469726562, |
| "learning_rate": 1.6799672138177726e-05, |
| "loss": 0.7875, |
| "sparse_loss": 0.7875, |
| "step": 8420 |
| }, |
| { |
| "epoch": 1.3341764147960797, |
| "grad_norm": 6.3429789543151855, |
| "learning_rate": 1.6661828032030334e-05, |
| "loss": 0.7987, |
| "sparse_loss": 0.7987, |
| "step": 8440 |
| }, |
| { |
| "epoch": 1.3373379702813786, |
| "grad_norm": 4.421934127807617, |
| "learning_rate": 1.652426856152721e-05, |
| "loss": 0.7414, |
| "sparse_loss": 0.7414, |
| "step": 8460 |
| }, |
| { |
| "epoch": 1.3404995257666772, |
| "grad_norm": 4.043078422546387, |
| "learning_rate": 1.638699842246121e-05, |
| "loss": 0.801, |
| "sparse_loss": 0.801, |
| "step": 8480 |
| }, |
| { |
| "epoch": 1.343661081251976, |
| "grad_norm": 3.617863178253174, |
| "learning_rate": 1.6250022300748486e-05, |
| "loss": 0.7287, |
| "sparse_loss": 0.7287, |
| "step": 8500 |
| }, |
| { |
| "epoch": 1.3468226367372749, |
| "grad_norm": 5.197469234466553, |
| "learning_rate": 1.611334487226842e-05, |
| "loss": 0.6786, |
| "sparse_loss": 0.6786, |
| "step": 8520 |
| }, |
| { |
| "epoch": 1.3499841922225735, |
| "grad_norm": 5.947723865509033, |
| "learning_rate": 1.5976970802704106e-05, |
| "loss": 0.7428, |
| "sparse_loss": 0.7428, |
| "step": 8540 |
| }, |
| { |
| "epoch": 1.3531457477078723, |
| "grad_norm": 3.481750011444092, |
| "learning_rate": 1.584090474738305e-05, |
| "loss": 0.7375, |
| "sparse_loss": 0.7375, |
| "step": 8560 |
| }, |
| { |
| "epoch": 1.3563073031931712, |
| "grad_norm": 6.375229835510254, |
| "learning_rate": 1.5705151351118192e-05, |
| "loss": 0.689, |
| "sparse_loss": 0.689, |
| "step": 8580 |
| }, |
| { |
| "epoch": 1.3594688586784698, |
| "grad_norm": 4.1146368980407715, |
| "learning_rate": 1.5569715248049457e-05, |
| "loss": 0.8682, |
| "sparse_loss": 0.8682, |
| "step": 8600 |
| }, |
| { |
| "epoch": 1.3626304141637686, |
| "grad_norm": 4.153675079345703, |
| "learning_rate": 1.5434601061485477e-05, |
| "loss": 0.7152, |
| "sparse_loss": 0.7152, |
| "step": 8620 |
| }, |
| { |
| "epoch": 1.3657919696490675, |
| "grad_norm": 6.575358867645264, |
| "learning_rate": 1.5299813403745777e-05, |
| "loss": 0.8519, |
| "sparse_loss": 0.8519, |
| "step": 8640 |
| }, |
| { |
| "epoch": 1.368953525134366, |
| "grad_norm": 6.450138092041016, |
| "learning_rate": 1.5165356876003395e-05, |
| "loss": 0.7737, |
| "sparse_loss": 0.7737, |
| "step": 8660 |
| }, |
| { |
| "epoch": 1.372115080619665, |
| "grad_norm": 4.407864570617676, |
| "learning_rate": 1.5031236068127701e-05, |
| "loss": 0.7976, |
| "sparse_loss": 0.7976, |
| "step": 8680 |
| }, |
| { |
| "epoch": 1.3752766361049638, |
| "grad_norm": 4.3346848487854, |
| "learning_rate": 1.4897455558527845e-05, |
| "loss": 0.7806, |
| "sparse_loss": 0.7806, |
| "step": 8700 |
| }, |
| { |
| "epoch": 1.3784381915902624, |
| "grad_norm": 5.701340675354004, |
| "learning_rate": 1.4764019913996355e-05, |
| "loss": 0.8074, |
| "sparse_loss": 0.8074, |
| "step": 8720 |
| }, |
| { |
| "epoch": 1.3815997470755612, |
| "grad_norm": 5.895680904388428, |
| "learning_rate": 1.463093368955328e-05, |
| "loss": 0.7799, |
| "sparse_loss": 0.7799, |
| "step": 8740 |
| }, |
| { |
| "epoch": 1.38476130256086, |
| "grad_norm": 5.741954803466797, |
| "learning_rate": 1.4498201428290759e-05, |
| "loss": 0.7566, |
| "sparse_loss": 0.7566, |
| "step": 8760 |
| }, |
| { |
| "epoch": 1.3879228580461587, |
| "grad_norm": 3.5622339248657227, |
| "learning_rate": 1.4365827661217815e-05, |
| "loss": 0.775, |
| "sparse_loss": 0.775, |
| "step": 8780 |
| }, |
| { |
| "epoch": 1.3910844135314575, |
| "grad_norm": 5.323927879333496, |
| "learning_rate": 1.4233816907105808e-05, |
| "loss": 0.717, |
| "sparse_loss": 0.717, |
| "step": 8800 |
| }, |
| { |
| "epoch": 1.3942459690167563, |
| "grad_norm": 3.92340350151062, |
| "learning_rate": 1.4102173672334087e-05, |
| "loss": 0.7135, |
| "sparse_loss": 0.7135, |
| "step": 8820 |
| }, |
| { |
| "epoch": 1.397407524502055, |
| "grad_norm": 7.306149959564209, |
| "learning_rate": 1.3970902450736207e-05, |
| "loss": 0.8414, |
| "sparse_loss": 0.8414, |
| "step": 8840 |
| }, |
| { |
| "epoch": 1.4005690799873538, |
| "grad_norm": 3.1595659255981445, |
| "learning_rate": 1.3840007723446497e-05, |
| "loss": 0.8132, |
| "sparse_loss": 0.8132, |
| "step": 8860 |
| }, |
| { |
| "epoch": 1.4037306354726526, |
| "grad_norm": 3.890341281890869, |
| "learning_rate": 1.3709493958747114e-05, |
| "loss": 0.712, |
| "sparse_loss": 0.712, |
| "step": 8880 |
| }, |
| { |
| "epoch": 1.4068921909579513, |
| "grad_norm": 5.3588948249816895, |
| "learning_rate": 1.3579365611915517e-05, |
| "loss": 0.7556, |
| "sparse_loss": 0.7556, |
| "step": 8900 |
| }, |
| { |
| "epoch": 1.41005374644325, |
| "grad_norm": 3.7485015392303467, |
| "learning_rate": 1.3449627125072348e-05, |
| "loss": 0.7766, |
| "sparse_loss": 0.7766, |
| "step": 8920 |
| }, |
| { |
| "epoch": 1.413215301928549, |
| "grad_norm": 7.076303005218506, |
| "learning_rate": 1.3320282927029806e-05, |
| "loss": 0.8162, |
| "sparse_loss": 0.8162, |
| "step": 8940 |
| }, |
| { |
| "epoch": 1.4163768574138476, |
| "grad_norm": 4.979154586791992, |
| "learning_rate": 1.3191337433140477e-05, |
| "loss": 0.7816, |
| "sparse_loss": 0.7816, |
| "step": 8960 |
| }, |
| { |
| "epoch": 1.4195384128991464, |
| "grad_norm": 6.925867080688477, |
| "learning_rate": 1.3062795045146586e-05, |
| "loss": 0.7431, |
| "sparse_loss": 0.7431, |
| "step": 8980 |
| }, |
| { |
| "epoch": 1.4226999683844452, |
| "grad_norm": 6.432084560394287, |
| "learning_rate": 1.2934660151029787e-05, |
| "loss": 0.7273, |
| "sparse_loss": 0.7273, |
| "step": 9000 |
| }, |
| { |
| "epoch": 1.4258615238697439, |
| "grad_norm": 4.4813385009765625, |
| "learning_rate": 1.280693712486129e-05, |
| "loss": 0.7382, |
| "sparse_loss": 0.7382, |
| "step": 9020 |
| }, |
| { |
| "epoch": 1.4290230793550427, |
| "grad_norm": 4.549583435058594, |
| "learning_rate": 1.2679630326652637e-05, |
| "loss": 0.786, |
| "sparse_loss": 0.786, |
| "step": 9040 |
| }, |
| { |
| "epoch": 1.4321846348403415, |
| "grad_norm": 4.904806613922119, |
| "learning_rate": 1.2552744102206795e-05, |
| "loss": 0.7608, |
| "sparse_loss": 0.7608, |
| "step": 9060 |
| }, |
| { |
| "epoch": 1.4353461903256401, |
| "grad_norm": 4.732978343963623, |
| "learning_rate": 1.2426282782969817e-05, |
| "loss": 0.7246, |
| "sparse_loss": 0.7246, |
| "step": 9080 |
| }, |
| { |
| "epoch": 1.438507745810939, |
| "grad_norm": 5.384263038635254, |
| "learning_rate": 1.2300250685883045e-05, |
| "loss": 0.9673, |
| "sparse_loss": 0.9673, |
| "step": 9100 |
| }, |
| { |
| "epoch": 1.4416693012962378, |
| "grad_norm": 4.50255823135376, |
| "learning_rate": 1.2174652113235651e-05, |
| "loss": 0.7476, |
| "sparse_loss": 0.7476, |
| "step": 9120 |
| }, |
| { |
| "epoch": 1.4448308567815364, |
| "grad_norm": 4.945241928100586, |
| "learning_rate": 1.2049491352517866e-05, |
| "loss": 0.7798, |
| "sparse_loss": 0.7798, |
| "step": 9140 |
| }, |
| { |
| "epoch": 1.4479924122668353, |
| "grad_norm": 6.198413372039795, |
| "learning_rate": 1.1924772676274546e-05, |
| "loss": 0.7981, |
| "sparse_loss": 0.7981, |
| "step": 9160 |
| }, |
| { |
| "epoch": 1.4511539677521341, |
| "grad_norm": 3.905620813369751, |
| "learning_rate": 1.1800500341959317e-05, |
| "loss": 1.038, |
| "sparse_loss": 1.038, |
| "step": 9180 |
| }, |
| { |
| "epoch": 1.4543155232374327, |
| "grad_norm": 5.019196510314941, |
| "learning_rate": 1.1676678591789341e-05, |
| "loss": 0.7107, |
| "sparse_loss": 0.7107, |
| "step": 9200 |
| }, |
| { |
| "epoch": 1.4574770787227316, |
| "grad_norm": 4.559060096740723, |
| "learning_rate": 1.155331165260038e-05, |
| "loss": 0.7464, |
| "sparse_loss": 0.7464, |
| "step": 9220 |
| }, |
| { |
| "epoch": 1.4606386342080304, |
| "grad_norm": 9.256589889526367, |
| "learning_rate": 1.1430403735702599e-05, |
| "loss": 0.7481, |
| "sparse_loss": 0.7481, |
| "step": 9240 |
| }, |
| { |
| "epoch": 1.463800189693329, |
| "grad_norm": 5.14002799987793, |
| "learning_rate": 1.1307959036736754e-05, |
| "loss": 0.734, |
| "sparse_loss": 0.734, |
| "step": 9260 |
| }, |
| { |
| "epoch": 1.4669617451786279, |
| "grad_norm": 10.058789253234863, |
| "learning_rate": 1.1185981735530945e-05, |
| "loss": 0.8064, |
| "sparse_loss": 0.8064, |
| "step": 9280 |
| }, |
| { |
| "epoch": 1.4701233006639267, |
| "grad_norm": 5.617347717285156, |
| "learning_rate": 1.1064475995958035e-05, |
| "loss": 0.7194, |
| "sparse_loss": 0.7194, |
| "step": 9300 |
| }, |
| { |
| "epoch": 1.4732848561492253, |
| "grad_norm": 9.134224891662598, |
| "learning_rate": 1.0943445965793391e-05, |
| "loss": 0.7925, |
| "sparse_loss": 0.7925, |
| "step": 9320 |
| }, |
| { |
| "epoch": 1.4764464116345242, |
| "grad_norm": 3.8909339904785156, |
| "learning_rate": 1.0822895776573386e-05, |
| "loss": 0.7638, |
| "sparse_loss": 0.7638, |
| "step": 9340 |
| }, |
| { |
| "epoch": 1.479607967119823, |
| "grad_norm": 5.176581859588623, |
| "learning_rate": 1.0702829543454295e-05, |
| "loss": 1.0023, |
| "sparse_loss": 1.0023, |
| "step": 9360 |
| }, |
| { |
| "epoch": 1.4827695226051216, |
| "grad_norm": 5.498968601226807, |
| "learning_rate": 1.0583251365071856e-05, |
| "loss": 0.7646, |
| "sparse_loss": 0.7646, |
| "step": 9380 |
| }, |
| { |
| "epoch": 1.4859310780904205, |
| "grad_norm": 4.207261562347412, |
| "learning_rate": 1.0464165323401348e-05, |
| "loss": 0.6717, |
| "sparse_loss": 0.6717, |
| "step": 9400 |
| }, |
| { |
| "epoch": 1.4890926335757193, |
| "grad_norm": 3.687284231185913, |
| "learning_rate": 1.0345575483618236e-05, |
| "loss": 0.7554, |
| "sparse_loss": 0.7554, |
| "step": 9420 |
| }, |
| { |
| "epoch": 1.492254189061018, |
| "grad_norm": 8.198759078979492, |
| "learning_rate": 1.022748589395944e-05, |
| "loss": 0.7571, |
| "sparse_loss": 0.7571, |
| "step": 9440 |
| }, |
| { |
| "epoch": 1.4954157445463168, |
| "grad_norm": 4.430997371673584, |
| "learning_rate": 1.0109900585585089e-05, |
| "loss": 0.692, |
| "sparse_loss": 0.692, |
| "step": 9460 |
| }, |
| { |
| "epoch": 1.4985773000316156, |
| "grad_norm": 3.510127544403076, |
| "learning_rate": 9.992823572440936e-06, |
| "loss": 0.7567, |
| "sparse_loss": 0.7567, |
| "step": 9480 |
| }, |
| { |
| "epoch": 1.5017388555169142, |
| "grad_norm": 7.045821189880371, |
| "learning_rate": 9.876258851121342e-06, |
| "loss": 0.7497, |
| "sparse_loss": 0.7497, |
| "step": 9500 |
| }, |
| { |
| "epoch": 1.504900411002213, |
| "grad_norm": 6.190208911895752, |
| "learning_rate": 9.760210400732837e-06, |
| "loss": 0.793, |
| "sparse_loss": 0.793, |
| "step": 9520 |
| }, |
| { |
| "epoch": 1.508061966487512, |
| "grad_norm": 6.944947719573975, |
| "learning_rate": 9.644682182758306e-06, |
| "loss": 0.7369, |
| "sparse_loss": 0.7369, |
| "step": 9540 |
| }, |
| { |
| "epoch": 1.5112235219728105, |
| "grad_norm": 4.1611008644104, |
| "learning_rate": 9.529678140921721e-06, |
| "loss": 0.7192, |
| "sparse_loss": 0.7192, |
| "step": 9560 |
| }, |
| { |
| "epoch": 1.5143850774581094, |
| "grad_norm": 5.603640079498291, |
| "learning_rate": 9.415202201053553e-06, |
| "loss": 0.8147, |
| "sparse_loss": 0.8147, |
| "step": 9580 |
| }, |
| { |
| "epoch": 1.5175466329434082, |
| "grad_norm": 5.05360746383667, |
| "learning_rate": 9.301258270956733e-06, |
| "loss": 1.0065, |
| "sparse_loss": 1.0065, |
| "step": 9600 |
| }, |
| { |
| "epoch": 1.5207081884287068, |
| "grad_norm": 4.60209846496582, |
| "learning_rate": 9.187850240273263e-06, |
| "loss": 0.7092, |
| "sparse_loss": 0.7092, |
| "step": 9620 |
| }, |
| { |
| "epoch": 1.5238697439140056, |
| "grad_norm": 7.637816905975342, |
| "learning_rate": 9.074981980351461e-06, |
| "loss": 0.7562, |
| "sparse_loss": 0.7562, |
| "step": 9640 |
| }, |
| { |
| "epoch": 1.5270312993993045, |
| "grad_norm": 4.913488864898682, |
| "learning_rate": 8.962657344113756e-06, |
| "loss": 0.7591, |
| "sparse_loss": 0.7591, |
| "step": 9660 |
| }, |
| { |
| "epoch": 1.530192854884603, |
| "grad_norm": 5.313370704650879, |
| "learning_rate": 8.850880165925198e-06, |
| "loss": 0.7395, |
| "sparse_loss": 0.7395, |
| "step": 9680 |
| }, |
| { |
| "epoch": 1.533354410369902, |
| "grad_norm": 3.915731191635132, |
| "learning_rate": 8.73965426146257e-06, |
| "loss": 0.973, |
| "sparse_loss": 0.973, |
| "step": 9700 |
| }, |
| { |
| "epoch": 1.5365159658552008, |
| "grad_norm": 4.924602508544922, |
| "learning_rate": 8.628983427584104e-06, |
| "loss": 0.6733, |
| "sparse_loss": 0.6733, |
| "step": 9720 |
| }, |
| { |
| "epoch": 1.5396775213404994, |
| "grad_norm": 5.523366451263428, |
| "learning_rate": 8.518871442199916e-06, |
| "loss": 0.7755, |
| "sparse_loss": 0.7755, |
| "step": 9740 |
| }, |
| { |
| "epoch": 1.5428390768257982, |
| "grad_norm": 5.189067363739014, |
| "learning_rate": 8.40932206414299e-06, |
| "loss": 0.6654, |
| "sparse_loss": 0.6654, |
| "step": 9760 |
| }, |
| { |
| "epoch": 1.546000632311097, |
| "grad_norm": 4.71336555480957, |
| "learning_rate": 8.300339033040908e-06, |
| "loss": 0.7118, |
| "sparse_loss": 0.7118, |
| "step": 9780 |
| }, |
| { |
| "epoch": 1.5491621877963957, |
| "grad_norm": 3.470552921295166, |
| "learning_rate": 8.191926069188155e-06, |
| "loss": 0.6827, |
| "sparse_loss": 0.6827, |
| "step": 9800 |
| }, |
| { |
| "epoch": 1.5523237432816948, |
| "grad_norm": 5.004092693328857, |
| "learning_rate": 8.084086873419144e-06, |
| "loss": 0.9226, |
| "sparse_loss": 0.9226, |
| "step": 9820 |
| }, |
| { |
| "epoch": 1.5554852987669934, |
| "grad_norm": 3.3111562728881836, |
| "learning_rate": 7.976825126981907e-06, |
| "loss": 0.7468, |
| "sparse_loss": 0.7468, |
| "step": 9840 |
| }, |
| { |
| "epoch": 1.558646854252292, |
| "grad_norm": 3.59743332862854, |
| "learning_rate": 7.87014449141236e-06, |
| "loss": 0.7771, |
| "sparse_loss": 0.7771, |
| "step": 9860 |
| }, |
| { |
| "epoch": 1.561808409737591, |
| "grad_norm": 3.9839119911193848, |
| "learning_rate": 7.764048608409394e-06, |
| "loss": 0.8062, |
| "sparse_loss": 0.8062, |
| "step": 9880 |
| }, |
| { |
| "epoch": 1.5649699652228897, |
| "grad_norm": 5.173106670379639, |
| "learning_rate": 7.65854109971048e-06, |
| "loss": 0.7018, |
| "sparse_loss": 0.7018, |
| "step": 9900 |
| }, |
| { |
| "epoch": 1.5681315207081883, |
| "grad_norm": 3.0103750228881836, |
| "learning_rate": 7.553625566968092e-06, |
| "loss": 0.779, |
| "sparse_loss": 0.779, |
| "step": 9920 |
| }, |
| { |
| "epoch": 1.5712930761934873, |
| "grad_norm": 4.104783058166504, |
| "learning_rate": 7.44930559162676e-06, |
| "loss": 0.7385, |
| "sparse_loss": 0.7385, |
| "step": 9940 |
| }, |
| { |
| "epoch": 1.574454631678786, |
| "grad_norm": 4.807834148406982, |
| "learning_rate": 7.345584734800764e-06, |
| "loss": 0.7734, |
| "sparse_loss": 0.7734, |
| "step": 9960 |
| }, |
| { |
| "epoch": 1.5776161871640846, |
| "grad_norm": 3.29559063911438, |
| "learning_rate": 7.242466537152639e-06, |
| "loss": 0.5872, |
| "sparse_loss": 0.5872, |
| "step": 9980 |
| }, |
| { |
| "epoch": 1.5807777426493836, |
| "grad_norm": 6.058928489685059, |
| "learning_rate": 7.139954518772227e-06, |
| "loss": 0.7984, |
| "sparse_loss": 0.7984, |
| "step": 10000 |
| }, |
| { |
| "epoch": 1.5839392981346823, |
| "grad_norm": 4.601917266845703, |
| "learning_rate": 7.038052179056573e-06, |
| "loss": 0.6556, |
| "sparse_loss": 0.6556, |
| "step": 10020 |
| }, |
| { |
| "epoch": 1.5871008536199809, |
| "grad_norm": 4.858172416687012, |
| "learning_rate": 6.936762996590482e-06, |
| "loss": 0.763, |
| "sparse_loss": 0.763, |
| "step": 10040 |
| }, |
| { |
| "epoch": 1.59026240910528, |
| "grad_norm": 3.7159643173217773, |
| "learning_rate": 6.8360904290276975e-06, |
| "loss": 0.6973, |
| "sparse_loss": 0.6973, |
| "step": 10060 |
| }, |
| { |
| "epoch": 1.5934239645905786, |
| "grad_norm": 2.954636335372925, |
| "learning_rate": 6.736037912972967e-06, |
| "loss": 0.8943, |
| "sparse_loss": 0.8943, |
| "step": 10080 |
| }, |
| { |
| "epoch": 1.5965855200758772, |
| "grad_norm": 4.900074005126953, |
| "learning_rate": 6.6366088638646154e-06, |
| "loss": 0.6099, |
| "sparse_loss": 0.6099, |
| "step": 10100 |
| }, |
| { |
| "epoch": 1.5997470755611762, |
| "grad_norm": 5.711692810058594, |
| "learning_rate": 6.537806675858066e-06, |
| "loss": 0.6872, |
| "sparse_loss": 0.6872, |
| "step": 10120 |
| }, |
| { |
| "epoch": 1.6029086310464749, |
| "grad_norm": 6.680075645446777, |
| "learning_rate": 6.439634721709905e-06, |
| "loss": 0.6117, |
| "sparse_loss": 0.6117, |
| "step": 10140 |
| }, |
| { |
| "epoch": 1.6060701865317735, |
| "grad_norm": 7.32015323638916, |
| "learning_rate": 6.34209635266276e-06, |
| "loss": 0.7191, |
| "sparse_loss": 0.7191, |
| "step": 10160 |
| }, |
| { |
| "epoch": 1.6092317420170725, |
| "grad_norm": 5.282656669616699, |
| "learning_rate": 6.245194898330933e-06, |
| "loss": 0.6835, |
| "sparse_loss": 0.6835, |
| "step": 10180 |
| }, |
| { |
| "epoch": 1.6123932975023711, |
| "grad_norm": 4.690248489379883, |
| "learning_rate": 6.148933666586693e-06, |
| "loss": 0.7652, |
| "sparse_loss": 0.7652, |
| "step": 10200 |
| }, |
| { |
| "epoch": 1.61555485298767, |
| "grad_norm": 5.6486334800720215, |
| "learning_rate": 6.0533159434473825e-06, |
| "loss": 0.6382, |
| "sparse_loss": 0.6382, |
| "step": 10220 |
| }, |
| { |
| "epoch": 1.6187164084729688, |
| "grad_norm": 4.573511600494385, |
| "learning_rate": 5.958344992963247e-06, |
| "loss": 0.7626, |
| "sparse_loss": 0.7626, |
| "step": 10240 |
| }, |
| { |
| "epoch": 1.6218779639582674, |
| "grad_norm": 10.142600059509277, |
| "learning_rate": 5.864024057105993e-06, |
| "loss": 0.6621, |
| "sparse_loss": 0.6621, |
| "step": 10260 |
| }, |
| { |
| "epoch": 1.6250395194435663, |
| "grad_norm": 3.3775432109832764, |
| "learning_rate": 5.770356355658155e-06, |
| "loss": 0.7596, |
| "sparse_loss": 0.7596, |
| "step": 10280 |
| }, |
| { |
| "epoch": 1.6282010749288651, |
| "grad_norm": 3.7121312618255615, |
| "learning_rate": 5.6773450861031365e-06, |
| "loss": 0.6681, |
| "sparse_loss": 0.6681, |
| "step": 10300 |
| }, |
| { |
| "epoch": 1.6313626304141637, |
| "grad_norm": 4.153905391693115, |
| "learning_rate": 5.584993423516088e-06, |
| "loss": 0.7242, |
| "sparse_loss": 0.7242, |
| "step": 10320 |
| }, |
| { |
| "epoch": 1.6345241858994626, |
| "grad_norm": 4.132081508636475, |
| "learning_rate": 5.49330452045552e-06, |
| "loss": 0.8251, |
| "sparse_loss": 0.8251, |
| "step": 10340 |
| }, |
| { |
| "epoch": 1.6376857413847614, |
| "grad_norm": 3.704270124435425, |
| "learning_rate": 5.402281506855672e-06, |
| "loss": 0.7695, |
| "sparse_loss": 0.7695, |
| "step": 10360 |
| }, |
| { |
| "epoch": 1.64084729687006, |
| "grad_norm": 4.198578834533691, |
| "learning_rate": 5.3119274899196965e-06, |
| "loss": 0.6834, |
| "sparse_loss": 0.6834, |
| "step": 10380 |
| }, |
| { |
| "epoch": 1.6440088523553589, |
| "grad_norm": 3.4071285724639893, |
| "learning_rate": 5.222245554013552e-06, |
| "loss": 0.9807, |
| "sparse_loss": 0.9807, |
| "step": 10400 |
| }, |
| { |
| "epoch": 1.6471704078406577, |
| "grad_norm": 6.942999839782715, |
| "learning_rate": 5.133238760560735e-06, |
| "loss": 0.664, |
| "sparse_loss": 0.664, |
| "step": 10420 |
| }, |
| { |
| "epoch": 1.6503319633259563, |
| "grad_norm": 6.275842189788818, |
| "learning_rate": 5.044910147937778e-06, |
| "loss": 0.6363, |
| "sparse_loss": 0.6363, |
| "step": 10440 |
| }, |
| { |
| "epoch": 1.6534935188112552, |
| "grad_norm": 5.802741527557373, |
| "learning_rate": 4.95726273137051e-06, |
| "loss": 0.8276, |
| "sparse_loss": 0.8276, |
| "step": 10460 |
| }, |
| { |
| "epoch": 1.656655074296554, |
| "grad_norm": 4.6166887283325195, |
| "learning_rate": 4.870299502831163e-06, |
| "loss": 0.7193, |
| "sparse_loss": 0.7193, |
| "step": 10480 |
| }, |
| { |
| "epoch": 1.6598166297818526, |
| "grad_norm": 3.928992986679077, |
| "learning_rate": 4.784023430936193e-06, |
| "loss": 0.7666, |
| "sparse_loss": 0.7666, |
| "step": 10500 |
| }, |
| { |
| "epoch": 1.6629781852671515, |
| "grad_norm": 3.6028928756713867, |
| "learning_rate": 4.698437460844976e-06, |
| "loss": 0.7701, |
| "sparse_loss": 0.7701, |
| "step": 10520 |
| }, |
| { |
| "epoch": 1.6661397407524503, |
| "grad_norm": 4.561351776123047, |
| "learning_rate": 4.613544514159246e-06, |
| "loss": 0.6138, |
| "sparse_loss": 0.6138, |
| "step": 10540 |
| }, |
| { |
| "epoch": 1.669301296237749, |
| "grad_norm": 6.210057735443115, |
| "learning_rate": 4.52934748882338e-06, |
| "loss": 0.766, |
| "sparse_loss": 0.766, |
| "step": 10560 |
| }, |
| { |
| "epoch": 1.6724628517230478, |
| "grad_norm": 4.3861494064331055, |
| "learning_rate": 4.445849259025475e-06, |
| "loss": 0.7487, |
| "sparse_loss": 0.7487, |
| "step": 10580 |
| }, |
| { |
| "epoch": 1.6756244072083466, |
| "grad_norm": 4.2435622215271, |
| "learning_rate": 4.363052675099213e-06, |
| "loss": 0.7803, |
| "sparse_loss": 0.7803, |
| "step": 10600 |
| }, |
| { |
| "epoch": 1.6787859626936452, |
| "grad_norm": 3.400221347808838, |
| "learning_rate": 4.2809605634265755e-06, |
| "loss": 0.7253, |
| "sparse_loss": 0.7253, |
| "step": 10620 |
| }, |
| { |
| "epoch": 1.681947518178944, |
| "grad_norm": 5.363790988922119, |
| "learning_rate": 4.199575726341346e-06, |
| "loss": 0.6903, |
| "sparse_loss": 0.6903, |
| "step": 10640 |
| }, |
| { |
| "epoch": 1.685109073664243, |
| "grad_norm": 3.888535499572754, |
| "learning_rate": 4.118900942033491e-06, |
| "loss": 0.7668, |
| "sparse_loss": 0.7668, |
| "step": 10660 |
| }, |
| { |
| "epoch": 1.6882706291495415, |
| "grad_norm": 3.354111433029175, |
| "learning_rate": 4.0389389644542586e-06, |
| "loss": 0.6539, |
| "sparse_loss": 0.6539, |
| "step": 10680 |
| }, |
| { |
| "epoch": 1.6914321846348404, |
| "grad_norm": 5.164332866668701, |
| "learning_rate": 3.9596925232222196e-06, |
| "loss": 0.7182, |
| "sparse_loss": 0.7182, |
| "step": 10700 |
| }, |
| { |
| "epoch": 1.6945937401201392, |
| "grad_norm": 5.303167819976807, |
| "learning_rate": 3.881164323530062e-06, |
| "loss": 0.664, |
| "sparse_loss": 0.664, |
| "step": 10720 |
| }, |
| { |
| "epoch": 1.6977552956054378, |
| "grad_norm": 6.435517311096191, |
| "learning_rate": 3.8033570460522498e-06, |
| "loss": 0.969, |
| "sparse_loss": 0.969, |
| "step": 10740 |
| }, |
| { |
| "epoch": 1.7009168510907366, |
| "grad_norm": 3.580998182296753, |
| "learning_rate": 3.7262733468535317e-06, |
| "loss": 0.658, |
| "sparse_loss": 0.658, |
| "step": 10760 |
| }, |
| { |
| "epoch": 1.7040784065760355, |
| "grad_norm": 3.7486093044281006, |
| "learning_rate": 3.649915857298242e-06, |
| "loss": 0.6847, |
| "sparse_loss": 0.6847, |
| "step": 10780 |
| }, |
| { |
| "epoch": 1.707239962061334, |
| "grad_norm": 7.520793914794922, |
| "learning_rate": 3.5742871839605006e-06, |
| "loss": 0.6823, |
| "sparse_loss": 0.6823, |
| "step": 10800 |
| }, |
| { |
| "epoch": 1.710401517546633, |
| "grad_norm": 10.77906608581543, |
| "learning_rate": 3.499389908535222e-06, |
| "loss": 0.6856, |
| "sparse_loss": 0.6856, |
| "step": 10820 |
| }, |
| { |
| "epoch": 1.7135630730319318, |
| "grad_norm": 4.748463153839111, |
| "learning_rate": 3.425226587749977e-06, |
| "loss": 0.9269, |
| "sparse_loss": 0.9269, |
| "step": 10840 |
| }, |
| { |
| "epoch": 1.7167246285172304, |
| "grad_norm": 4.665073871612549, |
| "learning_rate": 3.3517997532777485e-06, |
| "loss": 0.6424, |
| "sparse_loss": 0.6424, |
| "step": 10860 |
| }, |
| { |
| "epoch": 1.7198861840025292, |
| "grad_norm": 7.545193195343018, |
| "learning_rate": 3.2791119116504703e-06, |
| "loss": 0.7232, |
| "sparse_loss": 0.7232, |
| "step": 10880 |
| }, |
| { |
| "epoch": 1.723047739487828, |
| "grad_norm": 7.026817798614502, |
| "learning_rate": 3.207165544173482e-06, |
| "loss": 0.7651, |
| "sparse_loss": 0.7651, |
| "step": 10900 |
| }, |
| { |
| "epoch": 1.7262092949731267, |
| "grad_norm": 3.63460373878479, |
| "learning_rate": 3.1359631068408224e-06, |
| "loss": 0.7386, |
| "sparse_loss": 0.7386, |
| "step": 10920 |
| }, |
| { |
| "epoch": 1.7293708504584255, |
| "grad_norm": 3.9442801475524902, |
| "learning_rate": 3.0655070302513884e-06, |
| "loss": 0.6512, |
| "sparse_loss": 0.6512, |
| "step": 10940 |
| }, |
| { |
| "epoch": 1.7325324059437244, |
| "grad_norm": 3.9229090213775635, |
| "learning_rate": 2.9957997195259796e-06, |
| "loss": 0.6933, |
| "sparse_loss": 0.6933, |
| "step": 10960 |
| }, |
| { |
| "epoch": 1.735693961429023, |
| "grad_norm": 6.298892498016357, |
| "learning_rate": 2.926843554225167e-06, |
| "loss": 0.7798, |
| "sparse_loss": 0.7798, |
| "step": 10980 |
| }, |
| { |
| "epoch": 1.7388555169143218, |
| "grad_norm": 3.828557014465332, |
| "learning_rate": 2.8586408882680827e-06, |
| "loss": 1.0798, |
| "sparse_loss": 1.0798, |
| "step": 11000 |
| }, |
| { |
| "epoch": 1.7420170723996207, |
| "grad_norm": 4.936686038970947, |
| "learning_rate": 2.791194049852075e-06, |
| "loss": 0.6725, |
| "sparse_loss": 0.6725, |
| "step": 11020 |
| }, |
| { |
| "epoch": 1.7451786278849193, |
| "grad_norm": 4.79223108291626, |
| "learning_rate": 2.7245053413731876e-06, |
| "loss": 0.6857, |
| "sparse_loss": 0.6857, |
| "step": 11040 |
| }, |
| { |
| "epoch": 1.7483401833702181, |
| "grad_norm": 4.9190473556518555, |
| "learning_rate": 2.6585770393476288e-06, |
| "loss": 0.7417, |
| "sparse_loss": 0.7417, |
| "step": 11060 |
| }, |
| { |
| "epoch": 1.751501738855517, |
| "grad_norm": 4.649078369140625, |
| "learning_rate": 2.593411394334e-06, |
| "loss": 0.6224, |
| "sparse_loss": 0.6224, |
| "step": 11080 |
| }, |
| { |
| "epoch": 1.7546632943408156, |
| "grad_norm": 4.408849239349365, |
| "learning_rate": 2.529010630856507e-06, |
| "loss": 0.716, |
| "sparse_loss": 0.716, |
| "step": 11100 |
| }, |
| { |
| "epoch": 1.7578248498261144, |
| "grad_norm": 3.401287794113159, |
| "learning_rate": 2.465376947329015e-06, |
| "loss": 0.6733, |
| "sparse_loss": 0.6733, |
| "step": 11120 |
| }, |
| { |
| "epoch": 1.7609864053114133, |
| "grad_norm": 4.21400785446167, |
| "learning_rate": 2.402512515979974e-06, |
| "loss": 0.6824, |
| "sparse_loss": 0.6824, |
| "step": 11140 |
| }, |
| { |
| "epoch": 1.7641479607967119, |
| "grad_norm": 5.908743381500244, |
| "learning_rate": 2.3404194827783223e-06, |
| "loss": 0.6968, |
| "sparse_loss": 0.6968, |
| "step": 11160 |
| }, |
| { |
| "epoch": 1.7673095162820107, |
| "grad_norm": 4.381137847900391, |
| "learning_rate": 2.2790999673601736e-06, |
| "loss": 0.7176, |
| "sparse_loss": 0.7176, |
| "step": 11180 |
| }, |
| { |
| "epoch": 1.7704710717673096, |
| "grad_norm": 6.203781604766846, |
| "learning_rate": 2.218556062956506e-06, |
| "loss": 0.6751, |
| "sparse_loss": 0.6751, |
| "step": 11200 |
| }, |
| { |
| "epoch": 1.7736326272526082, |
| "grad_norm": 12.243690490722656, |
| "learning_rate": 2.158789836321673e-06, |
| "loss": 0.7181, |
| "sparse_loss": 0.7181, |
| "step": 11220 |
| }, |
| { |
| "epoch": 1.776794182737907, |
| "grad_norm": 4.719541549682617, |
| "learning_rate": 2.0998033276628525e-06, |
| "loss": 0.639, |
| "sparse_loss": 0.639, |
| "step": 11240 |
| }, |
| { |
| "epoch": 1.7799557382232059, |
| "grad_norm": 25.822694778442383, |
| "learning_rate": 2.0415985505704476e-06, |
| "loss": 0.6679, |
| "sparse_loss": 0.6679, |
| "step": 11260 |
| }, |
| { |
| "epoch": 1.7831172937085045, |
| "grad_norm": 9.564554214477539, |
| "learning_rate": 1.984177491949285e-06, |
| "loss": 0.8706, |
| "sparse_loss": 0.8706, |
| "step": 11280 |
| }, |
| { |
| "epoch": 1.7862788491938033, |
| "grad_norm": 6.144157886505127, |
| "learning_rate": 1.927542111950836e-06, |
| "loss": 0.6419, |
| "sparse_loss": 0.6419, |
| "step": 11300 |
| }, |
| { |
| "epoch": 1.7894404046791021, |
| "grad_norm": 4.055122375488281, |
| "learning_rate": 1.8716943439062883e-06, |
| "loss": 0.6952, |
| "sparse_loss": 0.6952, |
| "step": 11320 |
| }, |
| { |
| "epoch": 1.7926019601644008, |
| "grad_norm": 6.384298324584961, |
| "learning_rate": 1.8166360942605348e-06, |
| "loss": 0.6709, |
| "sparse_loss": 0.6709, |
| "step": 11340 |
| }, |
| { |
| "epoch": 1.7957635156496996, |
| "grad_norm": 3.9979257583618164, |
| "learning_rate": 1.7623692425071225e-06, |
| "loss": 0.6926, |
| "sparse_loss": 0.6926, |
| "step": 11360 |
| }, |
| { |
| "epoch": 1.7989250711349984, |
| "grad_norm": 3.888801336288452, |
| "learning_rate": 1.708895641124064e-06, |
| "loss": 0.7631, |
| "sparse_loss": 0.7631, |
| "step": 11380 |
| }, |
| { |
| "epoch": 1.802086626620297, |
| "grad_norm": 5.142566204071045, |
| "learning_rate": 1.656217115510636e-06, |
| "loss": 0.7042, |
| "sparse_loss": 0.7042, |
| "step": 11400 |
| }, |
| { |
| "epoch": 1.805248182105596, |
| "grad_norm": 3.104776382446289, |
| "learning_rate": 1.6043354639250301e-06, |
| "loss": 0.6538, |
| "sparse_loss": 0.6538, |
| "step": 11420 |
| }, |
| { |
| "epoch": 1.8084097375908947, |
| "grad_norm": 7.280310153961182, |
| "learning_rate": 1.553252457422985e-06, |
| "loss": 0.894, |
| "sparse_loss": 0.894, |
| "step": 11440 |
| }, |
| { |
| "epoch": 1.8115712930761934, |
| "grad_norm": 5.336974143981934, |
| "learning_rate": 1.5029698397973274e-06, |
| "loss": 0.6807, |
| "sparse_loss": 0.6807, |
| "step": 11460 |
| }, |
| { |
| "epoch": 1.8147328485614924, |
| "grad_norm": 4.305406093597412, |
| "learning_rate": 1.4534893275184397e-06, |
| "loss": 0.7875, |
| "sparse_loss": 0.7875, |
| "step": 11480 |
| }, |
| { |
| "epoch": 1.817894404046791, |
| "grad_norm": 4.251785755157471, |
| "learning_rate": 1.4048126096756847e-06, |
| "loss": 0.6582, |
| "sparse_loss": 0.6582, |
| "step": 11500 |
| }, |
| { |
| "epoch": 1.8210559595320897, |
| "grad_norm": 4.971356391906738, |
| "learning_rate": 1.3569413479197129e-06, |
| "loss": 0.7407, |
| "sparse_loss": 0.7407, |
| "step": 11520 |
| }, |
| { |
| "epoch": 1.8242175150173887, |
| "grad_norm": 4.023759365081787, |
| "learning_rate": 1.3098771764057715e-06, |
| "loss": 0.7286, |
| "sparse_loss": 0.7286, |
| "step": 11540 |
| }, |
| { |
| "epoch": 1.8273790705026873, |
| "grad_norm": 3.3543927669525146, |
| "learning_rate": 1.2636217017378992e-06, |
| "loss": 0.6443, |
| "sparse_loss": 0.6443, |
| "step": 11560 |
| }, |
| { |
| "epoch": 1.830540625987986, |
| "grad_norm": 4.27974796295166, |
| "learning_rate": 1.2181765029140868e-06, |
| "loss": 0.7002, |
| "sparse_loss": 0.7002, |
| "step": 11580 |
| }, |
| { |
| "epoch": 1.833702181473285, |
| "grad_norm": 3.6529595851898193, |
| "learning_rate": 1.173543131272395e-06, |
| "loss": 0.6918, |
| "sparse_loss": 0.6918, |
| "step": 11600 |
| }, |
| { |
| "epoch": 1.8368637369585836, |
| "grad_norm": 3.3575801849365234, |
| "learning_rate": 1.1297231104379691e-06, |
| "loss": 0.7157, |
| "sparse_loss": 0.7157, |
| "step": 11620 |
| }, |
| { |
| "epoch": 1.8400252924438822, |
| "grad_norm": 4.040219783782959, |
| "learning_rate": 1.0867179362710367e-06, |
| "loss": 0.7565, |
| "sparse_loss": 0.7565, |
| "step": 11640 |
| }, |
| { |
| "epoch": 1.8431868479291813, |
| "grad_norm": 4.253774642944336, |
| "learning_rate": 1.0445290768158561e-06, |
| "loss": 0.663, |
| "sparse_loss": 0.663, |
| "step": 11660 |
| }, |
| { |
| "epoch": 1.84634840341448, |
| "grad_norm": 4.688701629638672, |
| "learning_rate": 1.0031579722505902e-06, |
| "loss": 0.6053, |
| "sparse_loss": 0.6053, |
| "step": 11680 |
| }, |
| { |
| "epoch": 1.8495099588997785, |
| "grad_norm": 5.650112628936768, |
| "learning_rate": 9.626060348381482e-07, |
| "loss": 0.7206, |
| "sparse_loss": 0.7206, |
| "step": 11700 |
| }, |
| { |
| "epoch": 1.8526715143850776, |
| "grad_norm": 5.086851596832275, |
| "learning_rate": 9.228746488779777e-07, |
| "loss": 0.6682, |
| "sparse_loss": 0.6682, |
| "step": 11720 |
| }, |
| { |
| "epoch": 1.8558330698703762, |
| "grad_norm": 4.1554460525512695, |
| "learning_rate": 8.839651706588042e-07, |
| "loss": 0.7064, |
| "sparse_loss": 0.7064, |
| "step": 11740 |
| }, |
| { |
| "epoch": 1.8589946253556748, |
| "grad_norm": 4.700277805328369, |
| "learning_rate": 8.458789284123359e-07, |
| "loss": 0.73, |
| "sparse_loss": 0.73, |
| "step": 11760 |
| }, |
| { |
| "epoch": 1.862156180840974, |
| "grad_norm": 3.3193020820617676, |
| "learning_rate": 8.086172222679184e-07, |
| "loss": 0.7108, |
| "sparse_loss": 0.7108, |
| "step": 11780 |
| }, |
| { |
| "epoch": 1.8653177363262725, |
| "grad_norm": 3.3436214923858643, |
| "learning_rate": 7.721813242081682e-07, |
| "loss": 0.6975, |
| "sparse_loss": 0.6975, |
| "step": 11800 |
| }, |
| { |
| "epoch": 1.8684792918115711, |
| "grad_norm": 4.1823344230651855, |
| "learning_rate": 7.365724780255239e-07, |
| "loss": 0.7245, |
| "sparse_loss": 0.7245, |
| "step": 11820 |
| }, |
| { |
| "epoch": 1.8716408472968702, |
| "grad_norm": 4.967905521392822, |
| "learning_rate": 7.017918992798272e-07, |
| "loss": 0.686, |
| "sparse_loss": 0.686, |
| "step": 11840 |
| }, |
| { |
| "epoch": 1.8748024027821688, |
| "grad_norm": 5.125014781951904, |
| "learning_rate": 6.678407752567756e-07, |
| "loss": 0.6269, |
| "sparse_loss": 0.6269, |
| "step": 11860 |
| }, |
| { |
| "epoch": 1.8779639582674676, |
| "grad_norm": 3.9895036220550537, |
| "learning_rate": 6.34720264927438e-07, |
| "loss": 0.6523, |
| "sparse_loss": 0.6523, |
| "step": 11880 |
| }, |
| { |
| "epoch": 1.8811255137527665, |
| "grad_norm": 5.10162878036499, |
| "learning_rate": 6.024314989086788e-07, |
| "loss": 0.7276, |
| "sparse_loss": 0.7276, |
| "step": 11900 |
| }, |
| { |
| "epoch": 1.884287069238065, |
| "grad_norm": 4.555959701538086, |
| "learning_rate": 5.709755794245458e-07, |
| "loss": 0.695, |
| "sparse_loss": 0.695, |
| "step": 11920 |
| }, |
| { |
| "epoch": 1.887448624723364, |
| "grad_norm": 3.6415183544158936, |
| "learning_rate": 5.403535802686738e-07, |
| "loss": 0.678, |
| "sparse_loss": 0.678, |
| "step": 11940 |
| }, |
| { |
| "epoch": 1.8906101802086628, |
| "grad_norm": 4.21772575378418, |
| "learning_rate": 5.105665467675963e-07, |
| "loss": 0.6504, |
| "sparse_loss": 0.6504, |
| "step": 11960 |
| }, |
| { |
| "epoch": 1.8937717356939614, |
| "grad_norm": 5.778132915496826, |
| "learning_rate": 4.816154957450831e-07, |
| "loss": 0.5766, |
| "sparse_loss": 0.5766, |
| "step": 11980 |
| }, |
| { |
| "epoch": 1.8969332911792602, |
| "grad_norm": 5.38350248336792, |
| "learning_rate": 4.53501415487434e-07, |
| "loss": 0.6935, |
| "sparse_loss": 0.6935, |
| "step": 12000 |
| }, |
| { |
| "epoch": 1.900094846664559, |
| "grad_norm": 4.640732765197754, |
| "learning_rate": 4.2622526570972044e-07, |
| "loss": 0.6321, |
| "sparse_loss": 0.6321, |
| "step": 12020 |
| }, |
| { |
| "epoch": 1.9032564021498577, |
| "grad_norm": 3.8653504848480225, |
| "learning_rate": 3.997879775230445e-07, |
| "loss": 0.6369, |
| "sparse_loss": 0.6369, |
| "step": 12040 |
| }, |
| { |
| "epoch": 1.9064179576351565, |
| "grad_norm": 3.3370511531829834, |
| "learning_rate": 3.741904534027424e-07, |
| "loss": 0.6187, |
| "sparse_loss": 0.6187, |
| "step": 12060 |
| }, |
| { |
| "epoch": 1.9095795131204554, |
| "grad_norm": 3.8426411151885986, |
| "learning_rate": 3.494335671575755e-07, |
| "loss": 0.7079, |
| "sparse_loss": 0.7079, |
| "step": 12080 |
| }, |
| { |
| "epoch": 1.912741068605754, |
| "grad_norm": 4.085023880004883, |
| "learning_rate": 3.255181638999211e-07, |
| "loss": 0.6413, |
| "sparse_loss": 0.6413, |
| "step": 12100 |
| }, |
| { |
| "epoch": 1.9159026240910528, |
| "grad_norm": 3.8709089756011963, |
| "learning_rate": 3.0244506001689543e-07, |
| "loss": 0.639, |
| "sparse_loss": 0.639, |
| "step": 12120 |
| }, |
| { |
| "epoch": 1.9190641795763517, |
| "grad_norm": 4.617068290710449, |
| "learning_rate": 2.8021504314250934e-07, |
| "loss": 0.716, |
| "sparse_loss": 0.716, |
| "step": 12140 |
| }, |
| { |
| "epoch": 1.9222257350616503, |
| "grad_norm": 8.647622108459473, |
| "learning_rate": 2.588288721307619e-07, |
| "loss": 0.6784, |
| "sparse_loss": 0.6784, |
| "step": 12160 |
| }, |
| { |
| "epoch": 1.9253872905469491, |
| "grad_norm": 7.49450159072876, |
| "learning_rate": 2.3828727702975007e-07, |
| "loss": 0.7079, |
| "sparse_loss": 0.7079, |
| "step": 12180 |
| }, |
| { |
| "epoch": 1.928548846032248, |
| "grad_norm": 4.875016212463379, |
| "learning_rate": 2.1859095905674143e-07, |
| "loss": 0.6504, |
| "sparse_loss": 0.6504, |
| "step": 12200 |
| }, |
| { |
| "epoch": 1.9317104015175466, |
| "grad_norm": 4.579043388366699, |
| "learning_rate": 1.9974059057423223e-07, |
| "loss": 0.7201, |
| "sparse_loss": 0.7201, |
| "step": 12220 |
| }, |
| { |
| "epoch": 1.9348719570028454, |
| "grad_norm": 4.354573726654053, |
| "learning_rate": 1.8173681506701013e-07, |
| "loss": 0.7279, |
| "sparse_loss": 0.7279, |
| "step": 12240 |
| }, |
| { |
| "epoch": 1.9380335124881443, |
| "grad_norm": 5.453174114227295, |
| "learning_rate": 1.6458024712017182e-07, |
| "loss": 0.9232, |
| "sparse_loss": 0.9232, |
| "step": 12260 |
| }, |
| { |
| "epoch": 1.9411950679734429, |
| "grad_norm": 4.5288825035095215, |
| "learning_rate": 1.4827147239815097e-07, |
| "loss": 0.6213, |
| "sparse_loss": 0.6213, |
| "step": 12280 |
| }, |
| { |
| "epoch": 1.9443566234587417, |
| "grad_norm": 4.951496124267578, |
| "learning_rate": 1.328110476247285e-07, |
| "loss": 0.6959, |
| "sparse_loss": 0.6959, |
| "step": 12300 |
| }, |
| { |
| "epoch": 1.9475181789440406, |
| "grad_norm": 11.245010375976562, |
| "learning_rate": 1.181995005640174e-07, |
| "loss": 0.7559, |
| "sparse_loss": 0.7559, |
| "step": 12320 |
| }, |
| { |
| "epoch": 1.9506797344293392, |
| "grad_norm": 5.009317874908447, |
| "learning_rate": 1.0443733000246037e-07, |
| "loss": 0.7514, |
| "sparse_loss": 0.7514, |
| "step": 12340 |
| }, |
| { |
| "epoch": 1.953841289914638, |
| "grad_norm": 6.3035736083984375, |
| "learning_rate": 9.152500573179345e-08, |
| "loss": 0.6578, |
| "sparse_loss": 0.6578, |
| "step": 12360 |
| }, |
| { |
| "epoch": 1.9570028453999369, |
| "grad_norm": 3.7395153045654297, |
| "learning_rate": 7.946296853300895e-08, |
| "loss": 0.7104, |
| "sparse_loss": 0.7104, |
| "step": 12380 |
| }, |
| { |
| "epoch": 1.9601644008852355, |
| "grad_norm": 8.872085571289062, |
| "learning_rate": 6.825163016132007e-08, |
| "loss": 0.6662, |
| "sparse_loss": 0.6662, |
| "step": 12400 |
| }, |
| { |
| "epoch": 1.9633259563705343, |
| "grad_norm": 4.300417423248291, |
| "learning_rate": 5.78913733320835e-08, |
| "loss": 0.7136, |
| "sparse_loss": 0.7136, |
| "step": 12420 |
| }, |
| { |
| "epoch": 1.9664875118558331, |
| "grad_norm": 2.6573374271392822, |
| "learning_rate": 4.8382551707762403e-08, |
| "loss": 0.6415, |
| "sparse_loss": 0.6415, |
| "step": 12440 |
| }, |
| { |
| "epoch": 1.9696490673411318, |
| "grad_norm": 4.849949359893799, |
| "learning_rate": 3.972548988582792e-08, |
| "loss": 0.7226, |
| "sparse_loss": 0.7226, |
| "step": 12460 |
| }, |
| { |
| "epoch": 1.9728106228264306, |
| "grad_norm": 4.734280109405518, |
| "learning_rate": 3.192048338769293e-08, |
| "loss": 0.7787, |
| "sparse_loss": 0.7787, |
| "step": 12480 |
| }, |
| { |
| "epoch": 1.9759721783117294, |
| "grad_norm": 3.4710962772369385, |
| "learning_rate": 2.496779864862575e-08, |
| "loss": 0.6803, |
| "sparse_loss": 0.6803, |
| "step": 12500 |
| }, |
| { |
| "epoch": 1.979133733797028, |
| "grad_norm": 4.116549491882324, |
| "learning_rate": 1.886767300864345e-08, |
| "loss": 0.6908, |
| "sparse_loss": 0.6908, |
| "step": 12520 |
| }, |
| { |
| "epoch": 1.982295289282327, |
| "grad_norm": 3.8753113746643066, |
| "learning_rate": 1.362031470441838e-08, |
| "loss": 0.7203, |
| "sparse_loss": 0.7203, |
| "step": 12540 |
| }, |
| { |
| "epoch": 1.9854568447676257, |
| "grad_norm": 3.2648284435272217, |
| "learning_rate": 9.225902862172731e-09, |
| "loss": 0.6811, |
| "sparse_loss": 0.6811, |
| "step": 12560 |
| }, |
| { |
| "epoch": 1.9886184002529244, |
| "grad_norm": 3.1582114696502686, |
| "learning_rate": 5.684587491550097e-09, |
| "loss": 0.6963, |
| "sparse_loss": 0.6963, |
| "step": 12580 |
| }, |
| { |
| "epoch": 1.9917799557382232, |
| "grad_norm": 3.733403205871582, |
| "learning_rate": 2.996489480514009e-09, |
| "loss": 0.714, |
| "sparse_loss": 0.714, |
| "step": 12600 |
| }, |
| { |
| "epoch": 1.994941511223522, |
| "grad_norm": 3.784757375717163, |
| "learning_rate": 1.1617005911984668e-09, |
| "loss": 0.7004, |
| "sparse_loss": 0.7004, |
| "step": 12620 |
| }, |
| { |
| "epoch": 1.9981030667088207, |
| "grad_norm": 8.876729011535645, |
| "learning_rate": 1.8028345680209946e-10, |
| "loss": 0.6596, |
| "sparse_loss": 0.6596, |
| "step": 12640 |
| } |
| ], |
| "logging_steps": 20, |
| "max_steps": 12652, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 2000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 32, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|