| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 200, |
| "global_step": 408, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.004901960784313725, |
| "grad_norm": 9.949305642315444, |
| "learning_rate": 9.999851776425575e-06, |
| "loss": 0.5539, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.00980392156862745, |
| "grad_norm": 4.075674507588934, |
| "learning_rate": 9.999407114490384e-06, |
| "loss": 0.4213, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.014705882352941176, |
| "grad_norm": 4.867105580967725, |
| "learning_rate": 9.998666040558187e-06, |
| "loss": 0.4319, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.0196078431372549, |
| "grad_norm": 1.8297624952470446, |
| "learning_rate": 9.99762859856683e-06, |
| "loss": 0.3531, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.024509803921568627, |
| "grad_norm": 1.3398804411974734, |
| "learning_rate": 9.996294850025658e-06, |
| "loss": 0.3147, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.029411764705882353, |
| "grad_norm": 0.9826229003116685, |
| "learning_rate": 9.994664874011864e-06, |
| "loss": 0.3227, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.03431372549019608, |
| "grad_norm": 0.7586586122465202, |
| "learning_rate": 9.992738767165791e-06, |
| "loss": 0.319, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.0392156862745098, |
| "grad_norm": 0.882703555100207, |
| "learning_rate": 9.990516643685222e-06, |
| "loss": 0.2825, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.04411764705882353, |
| "grad_norm": 0.8996418859466875, |
| "learning_rate": 9.987998635318586e-06, |
| "loss": 0.3206, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.049019607843137254, |
| "grad_norm": 0.6941482280225552, |
| "learning_rate": 9.985184891357165e-06, |
| "loss": 0.3222, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.05392156862745098, |
| "grad_norm": 0.6330852150187295, |
| "learning_rate": 9.982075578626235e-06, |
| "loss": 0.3091, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.058823529411764705, |
| "grad_norm": 0.6492344466203871, |
| "learning_rate": 9.978670881475173e-06, |
| "loss": 0.2981, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.06372549019607843, |
| "grad_norm": 0.6749300990695969, |
| "learning_rate": 9.974971001766534e-06, |
| "loss": 0.2855, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.06862745098039216, |
| "grad_norm": 0.6104328540607359, |
| "learning_rate": 9.970976158864074e-06, |
| "loss": 0.2946, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.07352941176470588, |
| "grad_norm": 0.5474488618726454, |
| "learning_rate": 9.96668658961975e-06, |
| "loss": 0.2877, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.0784313725490196, |
| "grad_norm": 0.5672503974973263, |
| "learning_rate": 9.96210254835968e-06, |
| "loss": 0.2744, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.08333333333333333, |
| "grad_norm": 0.5351355676937355, |
| "learning_rate": 9.957224306869053e-06, |
| "loss": 0.2837, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.08823529411764706, |
| "grad_norm": 0.5420649578895781, |
| "learning_rate": 9.952052154376027e-06, |
| "loss": 0.2851, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.09313725490196079, |
| "grad_norm": 0.616265084827749, |
| "learning_rate": 9.946586397534572e-06, |
| "loss": 0.3106, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.09803921568627451, |
| "grad_norm": 0.5357910945074501, |
| "learning_rate": 9.940827360406297e-06, |
| "loss": 0.2866, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.10294117647058823, |
| "grad_norm": 0.5436018725783519, |
| "learning_rate": 9.93477538444123e-06, |
| "loss": 0.3129, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.10784313725490197, |
| "grad_norm": 0.5352962246240812, |
| "learning_rate": 9.92843082845757e-06, |
| "loss": 0.2911, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.11274509803921569, |
| "grad_norm": 0.512996507276675, |
| "learning_rate": 9.92179406862043e-06, |
| "loss": 0.2761, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.11764705882352941, |
| "grad_norm": 0.5316783600397025, |
| "learning_rate": 9.91486549841951e-06, |
| "loss": 0.2975, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.12254901960784313, |
| "grad_norm": 0.4789505118842049, |
| "learning_rate": 9.907645528645791e-06, |
| "loss": 0.2613, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.12745098039215685, |
| "grad_norm": 0.5113514733732404, |
| "learning_rate": 9.90013458736716e-06, |
| "loss": 0.2662, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.1323529411764706, |
| "grad_norm": 0.504304050146625, |
| "learning_rate": 9.892333119903045e-06, |
| "loss": 0.3001, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.13725490196078433, |
| "grad_norm": 0.5181939763522874, |
| "learning_rate": 9.884241588798004e-06, |
| "loss": 0.2928, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.14215686274509803, |
| "grad_norm": 0.5407134908885123, |
| "learning_rate": 9.875860473794302e-06, |
| "loss": 0.2743, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.14705882352941177, |
| "grad_norm": 0.4918389865538274, |
| "learning_rate": 9.867190271803466e-06, |
| "loss": 0.2781, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.15196078431372548, |
| "grad_norm": 0.5040301538424998, |
| "learning_rate": 9.85823149687683e-06, |
| "loss": 0.2765, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.1568627450980392, |
| "grad_norm": 0.5229000429022846, |
| "learning_rate": 9.848984680175049e-06, |
| "loss": 0.2814, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.16176470588235295, |
| "grad_norm": 0.5184957323080689, |
| "learning_rate": 9.839450369936615e-06, |
| "loss": 0.2855, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.16666666666666666, |
| "grad_norm": 0.5069951437454678, |
| "learning_rate": 9.829629131445342e-06, |
| "loss": 0.2936, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.1715686274509804, |
| "grad_norm": 0.5036566946779656, |
| "learning_rate": 9.819521546996864e-06, |
| "loss": 0.2739, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.17647058823529413, |
| "grad_norm": 0.4895005980353086, |
| "learning_rate": 9.809128215864096e-06, |
| "loss": 0.2864, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.18137254901960784, |
| "grad_norm": 0.5051322582648758, |
| "learning_rate": 9.798449754261716e-06, |
| "loss": 0.2543, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.18627450980392157, |
| "grad_norm": 0.5606615218448902, |
| "learning_rate": 9.787486795309621e-06, |
| "loss": 0.3097, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.19117647058823528, |
| "grad_norm": 0.5113656698846124, |
| "learning_rate": 9.776239988995401e-06, |
| "loss": 0.2696, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.19607843137254902, |
| "grad_norm": 0.4991653304275424, |
| "learning_rate": 9.764710002135784e-06, |
| "loss": 0.2942, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.20098039215686275, |
| "grad_norm": 0.4757517371235649, |
| "learning_rate": 9.752897518337117e-06, |
| "loss": 0.2715, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.20588235294117646, |
| "grad_norm": 0.4969728084578434, |
| "learning_rate": 9.74080323795483e-06, |
| "loss": 0.2911, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.2107843137254902, |
| "grad_norm": 0.48508387550906595, |
| "learning_rate": 9.72842787805191e-06, |
| "loss": 0.2797, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.21568627450980393, |
| "grad_norm": 0.500256000950693, |
| "learning_rate": 9.715772172356388e-06, |
| "loss": 0.2648, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.22058823529411764, |
| "grad_norm": 0.4842669080621426, |
| "learning_rate": 9.702836871217838e-06, |
| "loss": 0.2778, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.22549019607843138, |
| "grad_norm": 0.4747803990331153, |
| "learning_rate": 9.689622741562891e-06, |
| "loss": 0.2607, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.23039215686274508, |
| "grad_norm": 0.5010061541098526, |
| "learning_rate": 9.676130566849757e-06, |
| "loss": 0.2984, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.23529411764705882, |
| "grad_norm": 0.47171751321285477, |
| "learning_rate": 9.66236114702178e-06, |
| "loss": 0.2614, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.24019607843137256, |
| "grad_norm": 0.4486004421177675, |
| "learning_rate": 9.64831529846001e-06, |
| "loss": 0.251, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.24509803921568626, |
| "grad_norm": 0.4792184131399122, |
| "learning_rate": 9.633993853934803e-06, |
| "loss": 0.2763, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 0.5286196103587543, |
| "learning_rate": 9.619397662556434e-06, |
| "loss": 0.2733, |
| "step": 51 |
| }, |
| { |
| "epoch": 0.2549019607843137, |
| "grad_norm": 0.4628990001171691, |
| "learning_rate": 9.60452758972477e-06, |
| "loss": 0.2529, |
| "step": 52 |
| }, |
| { |
| "epoch": 0.25980392156862747, |
| "grad_norm": 0.4546346553824907, |
| "learning_rate": 9.589384517077945e-06, |
| "loss": 0.2575, |
| "step": 53 |
| }, |
| { |
| "epoch": 0.2647058823529412, |
| "grad_norm": 0.4644836879571118, |
| "learning_rate": 9.573969342440107e-06, |
| "loss": 0.2659, |
| "step": 54 |
| }, |
| { |
| "epoch": 0.2696078431372549, |
| "grad_norm": 0.4564926053780548, |
| "learning_rate": 9.558282979768164e-06, |
| "loss": 0.2806, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.27450980392156865, |
| "grad_norm": 0.47250787332790883, |
| "learning_rate": 9.542326359097619e-06, |
| "loss": 0.2783, |
| "step": 56 |
| }, |
| { |
| "epoch": 0.27941176470588236, |
| "grad_norm": 0.4925803264813645, |
| "learning_rate": 9.52610042648741e-06, |
| "loss": 0.2738, |
| "step": 57 |
| }, |
| { |
| "epoch": 0.28431372549019607, |
| "grad_norm": 0.5764917572941164, |
| "learning_rate": 9.509606143963832e-06, |
| "loss": 0.2946, |
| "step": 58 |
| }, |
| { |
| "epoch": 0.28921568627450983, |
| "grad_norm": 0.5027549400707922, |
| "learning_rate": 9.492844489463486e-06, |
| "loss": 0.2705, |
| "step": 59 |
| }, |
| { |
| "epoch": 0.29411764705882354, |
| "grad_norm": 0.4948958040547992, |
| "learning_rate": 9.475816456775313e-06, |
| "loss": 0.279, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.29901960784313725, |
| "grad_norm": 0.5249720085150932, |
| "learning_rate": 9.458523055481658e-06, |
| "loss": 0.2775, |
| "step": 61 |
| }, |
| { |
| "epoch": 0.30392156862745096, |
| "grad_norm": 0.5160221747233424, |
| "learning_rate": 9.440965310898425e-06, |
| "loss": 0.2749, |
| "step": 62 |
| }, |
| { |
| "epoch": 0.3088235294117647, |
| "grad_norm": 0.5043522735174615, |
| "learning_rate": 9.423144264014278e-06, |
| "loss": 0.2904, |
| "step": 63 |
| }, |
| { |
| "epoch": 0.3137254901960784, |
| "grad_norm": 0.5058021983501817, |
| "learning_rate": 9.405060971428924e-06, |
| "loss": 0.2873, |
| "step": 64 |
| }, |
| { |
| "epoch": 0.31862745098039214, |
| "grad_norm": 0.4769659728548327, |
| "learning_rate": 9.386716505290467e-06, |
| "loss": 0.2746, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.3235294117647059, |
| "grad_norm": 0.4805442594216887, |
| "learning_rate": 9.368111953231849e-06, |
| "loss": 0.274, |
| "step": 66 |
| }, |
| { |
| "epoch": 0.3284313725490196, |
| "grad_norm": 0.483159811379141, |
| "learning_rate": 9.349248418306347e-06, |
| "loss": 0.2688, |
| "step": 67 |
| }, |
| { |
| "epoch": 0.3333333333333333, |
| "grad_norm": 0.5011176419560011, |
| "learning_rate": 9.330127018922195e-06, |
| "loss": 0.3139, |
| "step": 68 |
| }, |
| { |
| "epoch": 0.3382352941176471, |
| "grad_norm": 0.5310341725773848, |
| "learning_rate": 9.310748888776254e-06, |
| "loss": 0.303, |
| "step": 69 |
| }, |
| { |
| "epoch": 0.3431372549019608, |
| "grad_norm": 0.47577451907323093, |
| "learning_rate": 9.291115176786814e-06, |
| "loss": 0.2696, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.3480392156862745, |
| "grad_norm": 0.45184435487313246, |
| "learning_rate": 9.271227047025462e-06, |
| "loss": 0.2827, |
| "step": 71 |
| }, |
| { |
| "epoch": 0.35294117647058826, |
| "grad_norm": 0.48943856307135464, |
| "learning_rate": 9.251085678648072e-06, |
| "loss": 0.2695, |
| "step": 72 |
| }, |
| { |
| "epoch": 0.35784313725490197, |
| "grad_norm": 0.4721139530533066, |
| "learning_rate": 9.230692265824888e-06, |
| "loss": 0.2555, |
| "step": 73 |
| }, |
| { |
| "epoch": 0.3627450980392157, |
| "grad_norm": 0.47202502734236734, |
| "learning_rate": 9.210048017669727e-06, |
| "loss": 0.2926, |
| "step": 74 |
| }, |
| { |
| "epoch": 0.36764705882352944, |
| "grad_norm": 0.49767797359868676, |
| "learning_rate": 9.189154158168293e-06, |
| "loss": 0.2913, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.37254901960784315, |
| "grad_norm": 0.5025843797659678, |
| "learning_rate": 9.168011926105598e-06, |
| "loss": 0.2666, |
| "step": 76 |
| }, |
| { |
| "epoch": 0.37745098039215685, |
| "grad_norm": 0.5022682553518323, |
| "learning_rate": 9.146622574992528e-06, |
| "loss": 0.2806, |
| "step": 77 |
| }, |
| { |
| "epoch": 0.38235294117647056, |
| "grad_norm": 0.45419943811853347, |
| "learning_rate": 9.124987372991512e-06, |
| "loss": 0.2907, |
| "step": 78 |
| }, |
| { |
| "epoch": 0.3872549019607843, |
| "grad_norm": 0.4595804279730611, |
| "learning_rate": 9.103107602841341e-06, |
| "loss": 0.259, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.39215686274509803, |
| "grad_norm": 0.471535607395236, |
| "learning_rate": 9.08098456178111e-06, |
| "loss": 0.2728, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.39705882352941174, |
| "grad_norm": 0.5184899058796497, |
| "learning_rate": 9.058619561473308e-06, |
| "loss": 0.2677, |
| "step": 81 |
| }, |
| { |
| "epoch": 0.4019607843137255, |
| "grad_norm": 0.47509634282362856, |
| "learning_rate": 9.036013927926049e-06, |
| "loss": 0.2742, |
| "step": 82 |
| }, |
| { |
| "epoch": 0.4068627450980392, |
| "grad_norm": 0.47503959889700315, |
| "learning_rate": 9.013169001414458e-06, |
| "loss": 0.2778, |
| "step": 83 |
| }, |
| { |
| "epoch": 0.4117647058823529, |
| "grad_norm": 0.47489487818225196, |
| "learning_rate": 8.990086136401199e-06, |
| "loss": 0.2498, |
| "step": 84 |
| }, |
| { |
| "epoch": 0.4166666666666667, |
| "grad_norm": 0.46801056645527583, |
| "learning_rate": 8.966766701456177e-06, |
| "loss": 0.2641, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.4215686274509804, |
| "grad_norm": 0.5112905724249457, |
| "learning_rate": 8.943212079175392e-06, |
| "loss": 0.2771, |
| "step": 86 |
| }, |
| { |
| "epoch": 0.4264705882352941, |
| "grad_norm": 0.4447440095916987, |
| "learning_rate": 8.91942366609897e-06, |
| "loss": 0.2701, |
| "step": 87 |
| }, |
| { |
| "epoch": 0.43137254901960786, |
| "grad_norm": 0.4734853505195237, |
| "learning_rate": 8.895402872628352e-06, |
| "loss": 0.2919, |
| "step": 88 |
| }, |
| { |
| "epoch": 0.4362745098039216, |
| "grad_norm": 0.45704855765327873, |
| "learning_rate": 8.871151122942692e-06, |
| "loss": 0.2606, |
| "step": 89 |
| }, |
| { |
| "epoch": 0.4411764705882353, |
| "grad_norm": 0.4701885669821727, |
| "learning_rate": 8.846669854914395e-06, |
| "loss": 0.2768, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.44607843137254904, |
| "grad_norm": 0.4829583138980323, |
| "learning_rate": 8.821960520023884e-06, |
| "loss": 0.2796, |
| "step": 91 |
| }, |
| { |
| "epoch": 0.45098039215686275, |
| "grad_norm": 0.487539635758669, |
| "learning_rate": 8.797024583273536e-06, |
| "loss": 0.2758, |
| "step": 92 |
| }, |
| { |
| "epoch": 0.45588235294117646, |
| "grad_norm": 0.47395890303919547, |
| "learning_rate": 8.771863523100821e-06, |
| "loss": 0.2739, |
| "step": 93 |
| }, |
| { |
| "epoch": 0.46078431372549017, |
| "grad_norm": 0.5050007846153621, |
| "learning_rate": 8.746478831290648e-06, |
| "loss": 0.2993, |
| "step": 94 |
| }, |
| { |
| "epoch": 0.46568627450980393, |
| "grad_norm": 0.5224607026899196, |
| "learning_rate": 8.720872012886918e-06, |
| "loss": 0.2863, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.47058823529411764, |
| "grad_norm": 0.47538250913835894, |
| "learning_rate": 8.695044586103297e-06, |
| "loss": 0.2912, |
| "step": 96 |
| }, |
| { |
| "epoch": 0.47549019607843135, |
| "grad_norm": 0.4548594126958497, |
| "learning_rate": 8.668998082233186e-06, |
| "loss": 0.2557, |
| "step": 97 |
| }, |
| { |
| "epoch": 0.4803921568627451, |
| "grad_norm": 0.4817593707119681, |
| "learning_rate": 8.642734045558952e-06, |
| "loss": 0.2559, |
| "step": 98 |
| }, |
| { |
| "epoch": 0.4852941176470588, |
| "grad_norm": 0.49702278125831084, |
| "learning_rate": 8.616254033260351e-06, |
| "loss": 0.2797, |
| "step": 99 |
| }, |
| { |
| "epoch": 0.49019607843137253, |
| "grad_norm": 0.46937954625053363, |
| "learning_rate": 8.58955961532221e-06, |
| "loss": 0.2543, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.4950980392156863, |
| "grad_norm": 0.5097981268395455, |
| "learning_rate": 8.56265237444135e-06, |
| "loss": 0.2805, |
| "step": 101 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 0.5147622119362961, |
| "learning_rate": 8.535533905932739e-06, |
| "loss": 0.2669, |
| "step": 102 |
| }, |
| { |
| "epoch": 0.5049019607843137, |
| "grad_norm": 0.48075492916551527, |
| "learning_rate": 8.508205817634908e-06, |
| "loss": 0.2701, |
| "step": 103 |
| }, |
| { |
| "epoch": 0.5098039215686274, |
| "grad_norm": 0.46764850077800707, |
| "learning_rate": 8.480669729814635e-06, |
| "loss": 0.2777, |
| "step": 104 |
| }, |
| { |
| "epoch": 0.5147058823529411, |
| "grad_norm": 0.4905586064810874, |
| "learning_rate": 8.452927275070858e-06, |
| "loss": 0.2752, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.5196078431372549, |
| "grad_norm": 0.5164789347154716, |
| "learning_rate": 8.424980098237904e-06, |
| "loss": 0.2929, |
| "step": 106 |
| }, |
| { |
| "epoch": 0.5245098039215687, |
| "grad_norm": 0.47222892287405277, |
| "learning_rate": 8.39682985628795e-06, |
| "loss": 0.2825, |
| "step": 107 |
| }, |
| { |
| "epoch": 0.5294117647058824, |
| "grad_norm": 0.48296479038244106, |
| "learning_rate": 8.368478218232787e-06, |
| "loss": 0.2696, |
| "step": 108 |
| }, |
| { |
| "epoch": 0.5343137254901961, |
| "grad_norm": 0.5456904576064092, |
| "learning_rate": 8.339926865024871e-06, |
| "loss": 0.2625, |
| "step": 109 |
| }, |
| { |
| "epoch": 0.5392156862745098, |
| "grad_norm": 0.46572429076517435, |
| "learning_rate": 8.311177489457653e-06, |
| "loss": 0.2682, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.5441176470588235, |
| "grad_norm": 0.4845901394041396, |
| "learning_rate": 8.282231796065215e-06, |
| "loss": 0.2661, |
| "step": 111 |
| }, |
| { |
| "epoch": 0.5490196078431373, |
| "grad_norm": 0.5338366403432033, |
| "learning_rate": 8.25309150102121e-06, |
| "loss": 0.3103, |
| "step": 112 |
| }, |
| { |
| "epoch": 0.553921568627451, |
| "grad_norm": 0.49130182412984563, |
| "learning_rate": 8.223758332037121e-06, |
| "loss": 0.2715, |
| "step": 113 |
| }, |
| { |
| "epoch": 0.5588235294117647, |
| "grad_norm": 0.4910434085279551, |
| "learning_rate": 8.194234028259806e-06, |
| "loss": 0.2951, |
| "step": 114 |
| }, |
| { |
| "epoch": 0.5637254901960784, |
| "grad_norm": 0.4837447278276799, |
| "learning_rate": 8.164520340168404e-06, |
| "loss": 0.2889, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.5686274509803921, |
| "grad_norm": 0.5235056748140671, |
| "learning_rate": 8.134619029470535e-06, |
| "loss": 0.3195, |
| "step": 116 |
| }, |
| { |
| "epoch": 0.5735294117647058, |
| "grad_norm": 0.5393547644876617, |
| "learning_rate": 8.104531868997858e-06, |
| "loss": 0.2635, |
| "step": 117 |
| }, |
| { |
| "epoch": 0.5784313725490197, |
| "grad_norm": 0.46319401757550593, |
| "learning_rate": 8.074260642600963e-06, |
| "loss": 0.2735, |
| "step": 118 |
| }, |
| { |
| "epoch": 0.5833333333333334, |
| "grad_norm": 0.5011014808072246, |
| "learning_rate": 8.043807145043604e-06, |
| "loss": 0.2834, |
| "step": 119 |
| }, |
| { |
| "epoch": 0.5882352941176471, |
| "grad_norm": 0.4885650619750526, |
| "learning_rate": 8.013173181896283e-06, |
| "loss": 0.28, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.5931372549019608, |
| "grad_norm": 0.48818735448671163, |
| "learning_rate": 7.982360569429206e-06, |
| "loss": 0.2832, |
| "step": 121 |
| }, |
| { |
| "epoch": 0.5980392156862745, |
| "grad_norm": 0.47782983621824704, |
| "learning_rate": 7.951371134504599e-06, |
| "loss": 0.2655, |
| "step": 122 |
| }, |
| { |
| "epoch": 0.6029411764705882, |
| "grad_norm": 0.5287209528766823, |
| "learning_rate": 7.920206714468383e-06, |
| "loss": 0.2852, |
| "step": 123 |
| }, |
| { |
| "epoch": 0.6078431372549019, |
| "grad_norm": 0.5115537708833304, |
| "learning_rate": 7.888869157041257e-06, |
| "loss": 0.2838, |
| "step": 124 |
| }, |
| { |
| "epoch": 0.6127450980392157, |
| "grad_norm": 0.5014284065264025, |
| "learning_rate": 7.857360320209126e-06, |
| "loss": 0.2716, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.6176470588235294, |
| "grad_norm": 0.4651603608252272, |
| "learning_rate": 7.82568207211296e-06, |
| "loss": 0.2782, |
| "step": 126 |
| }, |
| { |
| "epoch": 0.6225490196078431, |
| "grad_norm": 0.5043652198730769, |
| "learning_rate": 7.793836290938026e-06, |
| "loss": 0.2714, |
| "step": 127 |
| }, |
| { |
| "epoch": 0.6274509803921569, |
| "grad_norm": 0.5581975764740609, |
| "learning_rate": 7.76182486480253e-06, |
| "loss": 0.2936, |
| "step": 128 |
| }, |
| { |
| "epoch": 0.6323529411764706, |
| "grad_norm": 0.49460513690934, |
| "learning_rate": 7.729649691645673e-06, |
| "loss": 0.2514, |
| "step": 129 |
| }, |
| { |
| "epoch": 0.6372549019607843, |
| "grad_norm": 0.49477864005095495, |
| "learning_rate": 7.697312679115126e-06, |
| "loss": 0.3039, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.6421568627450981, |
| "grad_norm": 0.43459499281810504, |
| "learning_rate": 7.664815744453918e-06, |
| "loss": 0.258, |
| "step": 131 |
| }, |
| { |
| "epoch": 0.6470588235294118, |
| "grad_norm": 0.4662127766253258, |
| "learning_rate": 7.63216081438678e-06, |
| "loss": 0.2682, |
| "step": 132 |
| }, |
| { |
| "epoch": 0.6519607843137255, |
| "grad_norm": 0.4461887881123459, |
| "learning_rate": 7.599349825005892e-06, |
| "loss": 0.2517, |
| "step": 133 |
| }, |
| { |
| "epoch": 0.6568627450980392, |
| "grad_norm": 0.48712380156922164, |
| "learning_rate": 7.566384721656103e-06, |
| "loss": 0.2741, |
| "step": 134 |
| }, |
| { |
| "epoch": 0.6617647058823529, |
| "grad_norm": 0.4954211848354499, |
| "learning_rate": 7.533267458819597e-06, |
| "loss": 0.2772, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.6666666666666666, |
| "grad_norm": 0.4882543467157239, |
| "learning_rate": 7.500000000000001e-06, |
| "loss": 0.2693, |
| "step": 136 |
| }, |
| { |
| "epoch": 0.6715686274509803, |
| "grad_norm": 0.533534907534477, |
| "learning_rate": 7.466584317605978e-06, |
| "loss": 0.2682, |
| "step": 137 |
| }, |
| { |
| "epoch": 0.6764705882352942, |
| "grad_norm": 0.47618239404048246, |
| "learning_rate": 7.4330223928342814e-06, |
| "loss": 0.2793, |
| "step": 138 |
| }, |
| { |
| "epoch": 0.6813725490196079, |
| "grad_norm": 0.45267230623426713, |
| "learning_rate": 7.399316215552296e-06, |
| "loss": 0.2676, |
| "step": 139 |
| }, |
| { |
| "epoch": 0.6862745098039216, |
| "grad_norm": 0.484264347953501, |
| "learning_rate": 7.365467784180051e-06, |
| "loss": 0.2762, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.6911764705882353, |
| "grad_norm": 0.4811924950195945, |
| "learning_rate": 7.33147910557174e-06, |
| "loss": 0.256, |
| "step": 141 |
| }, |
| { |
| "epoch": 0.696078431372549, |
| "grad_norm": 0.4713139634213423, |
| "learning_rate": 7.297352194896738e-06, |
| "loss": 0.259, |
| "step": 142 |
| }, |
| { |
| "epoch": 0.7009803921568627, |
| "grad_norm": 0.4650406278525342, |
| "learning_rate": 7.26308907552012e-06, |
| "loss": 0.2589, |
| "step": 143 |
| }, |
| { |
| "epoch": 0.7058823529411765, |
| "grad_norm": 0.508611340986804, |
| "learning_rate": 7.2286917788826926e-06, |
| "loss": 0.2846, |
| "step": 144 |
| }, |
| { |
| "epoch": 0.7107843137254902, |
| "grad_norm": 0.49931442983079793, |
| "learning_rate": 7.194162344380561e-06, |
| "loss": 0.2847, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.7156862745098039, |
| "grad_norm": 0.4402469704937451, |
| "learning_rate": 7.159502819244206e-06, |
| "loss": 0.2513, |
| "step": 146 |
| }, |
| { |
| "epoch": 0.7205882352941176, |
| "grad_norm": 0.4338024772855486, |
| "learning_rate": 7.124715258417111e-06, |
| "loss": 0.2546, |
| "step": 147 |
| }, |
| { |
| "epoch": 0.7254901960784313, |
| "grad_norm": 0.4928083208518911, |
| "learning_rate": 7.089801724433918e-06, |
| "loss": 0.2712, |
| "step": 148 |
| }, |
| { |
| "epoch": 0.7303921568627451, |
| "grad_norm": 0.4477835861527179, |
| "learning_rate": 7.05476428729815e-06, |
| "loss": 0.262, |
| "step": 149 |
| }, |
| { |
| "epoch": 0.7352941176470589, |
| "grad_norm": 0.4874852700808163, |
| "learning_rate": 7.019605024359475e-06, |
| "loss": 0.2622, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.7401960784313726, |
| "grad_norm": 0.4555898200997557, |
| "learning_rate": 6.984326020190544e-06, |
| "loss": 0.2616, |
| "step": 151 |
| }, |
| { |
| "epoch": 0.7450980392156863, |
| "grad_norm": 0.4950149488353659, |
| "learning_rate": 6.948929366463397e-06, |
| "loss": 0.3077, |
| "step": 152 |
| }, |
| { |
| "epoch": 0.75, |
| "grad_norm": 0.5366809202495563, |
| "learning_rate": 6.913417161825449e-06, |
| "loss": 0.2834, |
| "step": 153 |
| }, |
| { |
| "epoch": 0.7549019607843137, |
| "grad_norm": 0.48159848397294547, |
| "learning_rate": 6.877791511775064e-06, |
| "loss": 0.2547, |
| "step": 154 |
| }, |
| { |
| "epoch": 0.7598039215686274, |
| "grad_norm": 0.46460905629471105, |
| "learning_rate": 6.842054528536717e-06, |
| "loss": 0.2616, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.7647058823529411, |
| "grad_norm": 0.4733666361824114, |
| "learning_rate": 6.806208330935766e-06, |
| "loss": 0.2529, |
| "step": 156 |
| }, |
| { |
| "epoch": 0.7696078431372549, |
| "grad_norm": 0.46805105182447543, |
| "learning_rate": 6.770255044272826e-06, |
| "loss": 0.2678, |
| "step": 157 |
| }, |
| { |
| "epoch": 0.7745098039215687, |
| "grad_norm": 0.47075639159563154, |
| "learning_rate": 6.734196800197763e-06, |
| "loss": 0.2554, |
| "step": 158 |
| }, |
| { |
| "epoch": 0.7794117647058824, |
| "grad_norm": 0.47216348647984946, |
| "learning_rate": 6.698035736583307e-06, |
| "loss": 0.2691, |
| "step": 159 |
| }, |
| { |
| "epoch": 0.7843137254901961, |
| "grad_norm": 0.4776754820294553, |
| "learning_rate": 6.6617739973982985e-06, |
| "loss": 0.2893, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.7892156862745098, |
| "grad_norm": 0.5056737376559055, |
| "learning_rate": 6.625413732580577e-06, |
| "loss": 0.3119, |
| "step": 161 |
| }, |
| { |
| "epoch": 0.7941176470588235, |
| "grad_norm": 0.5001505320557236, |
| "learning_rate": 6.588957097909509e-06, |
| "loss": 0.2534, |
| "step": 162 |
| }, |
| { |
| "epoch": 0.7990196078431373, |
| "grad_norm": 0.4672061035111928, |
| "learning_rate": 6.552406254878175e-06, |
| "loss": 0.2656, |
| "step": 163 |
| }, |
| { |
| "epoch": 0.803921568627451, |
| "grad_norm": 0.4554461122885562, |
| "learning_rate": 6.515763370565218e-06, |
| "loss": 0.261, |
| "step": 164 |
| }, |
| { |
| "epoch": 0.8088235294117647, |
| "grad_norm": 0.4926507470830697, |
| "learning_rate": 6.4790306175063535e-06, |
| "loss": 0.2597, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.8137254901960784, |
| "grad_norm": 0.4943296156206765, |
| "learning_rate": 6.442210173565562e-06, |
| "loss": 0.279, |
| "step": 166 |
| }, |
| { |
| "epoch": 0.8186274509803921, |
| "grad_norm": 0.4728785530345938, |
| "learning_rate": 6.405304221805972e-06, |
| "loss": 0.271, |
| "step": 167 |
| }, |
| { |
| "epoch": 0.8235294117647058, |
| "grad_norm": 0.5132288172128124, |
| "learning_rate": 6.368314950360416e-06, |
| "loss": 0.2778, |
| "step": 168 |
| }, |
| { |
| "epoch": 0.8284313725490197, |
| "grad_norm": 0.4840027829550652, |
| "learning_rate": 6.331244552301705e-06, |
| "loss": 0.28, |
| "step": 169 |
| }, |
| { |
| "epoch": 0.8333333333333334, |
| "grad_norm": 0.4753576271704974, |
| "learning_rate": 6.294095225512604e-06, |
| "loss": 0.2491, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.8382352941176471, |
| "grad_norm": 0.46179398942588656, |
| "learning_rate": 6.2568691725555144e-06, |
| "loss": 0.2736, |
| "step": 171 |
| }, |
| { |
| "epoch": 0.8431372549019608, |
| "grad_norm": 0.4552053511887567, |
| "learning_rate": 6.219568600541886e-06, |
| "loss": 0.255, |
| "step": 172 |
| }, |
| { |
| "epoch": 0.8480392156862745, |
| "grad_norm": 0.461827774670562, |
| "learning_rate": 6.182195721001366e-06, |
| "loss": 0.2699, |
| "step": 173 |
| }, |
| { |
| "epoch": 0.8529411764705882, |
| "grad_norm": 0.45609580191383764, |
| "learning_rate": 6.144752749750671e-06, |
| "loss": 0.2618, |
| "step": 174 |
| }, |
| { |
| "epoch": 0.8578431372549019, |
| "grad_norm": 0.4977220589396911, |
| "learning_rate": 6.107241906762214e-06, |
| "loss": 0.2587, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.8627450980392157, |
| "grad_norm": 0.5124222190867876, |
| "learning_rate": 6.0696654160324875e-06, |
| "loss": 0.3044, |
| "step": 176 |
| }, |
| { |
| "epoch": 0.8676470588235294, |
| "grad_norm": 0.544844105660069, |
| "learning_rate": 6.0320255054501985e-06, |
| "loss": 0.2893, |
| "step": 177 |
| }, |
| { |
| "epoch": 0.8725490196078431, |
| "grad_norm": 0.47482041593280866, |
| "learning_rate": 5.994324406664184e-06, |
| "loss": 0.2524, |
| "step": 178 |
| }, |
| { |
| "epoch": 0.8774509803921569, |
| "grad_norm": 0.5065344279012253, |
| "learning_rate": 5.956564354951091e-06, |
| "loss": 0.2656, |
| "step": 179 |
| }, |
| { |
| "epoch": 0.8823529411764706, |
| "grad_norm": 0.490249393727454, |
| "learning_rate": 5.918747589082853e-06, |
| "loss": 0.2809, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.8872549019607843, |
| "grad_norm": 0.48483179337750465, |
| "learning_rate": 5.880876351193956e-06, |
| "loss": 0.2784, |
| "step": 181 |
| }, |
| { |
| "epoch": 0.8921568627450981, |
| "grad_norm": 0.4468073287249408, |
| "learning_rate": 5.842952886648496e-06, |
| "loss": 0.2478, |
| "step": 182 |
| }, |
| { |
| "epoch": 0.8970588235294118, |
| "grad_norm": 0.5394634417319949, |
| "learning_rate": 5.804979443907065e-06, |
| "loss": 0.2715, |
| "step": 183 |
| }, |
| { |
| "epoch": 0.9019607843137255, |
| "grad_norm": 0.48798110547637624, |
| "learning_rate": 5.766958274393428e-06, |
| "loss": 0.2733, |
| "step": 184 |
| }, |
| { |
| "epoch": 0.9068627450980392, |
| "grad_norm": 0.49133168083308504, |
| "learning_rate": 5.728891632361043e-06, |
| "loss": 0.256, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.9117647058823529, |
| "grad_norm": 0.48254056420422375, |
| "learning_rate": 5.690781774759412e-06, |
| "loss": 0.281, |
| "step": 186 |
| }, |
| { |
| "epoch": 0.9166666666666666, |
| "grad_norm": 0.5159826456616268, |
| "learning_rate": 5.65263096110026e-06, |
| "loss": 0.3025, |
| "step": 187 |
| }, |
| { |
| "epoch": 0.9215686274509803, |
| "grad_norm": 0.4998585500181035, |
| "learning_rate": 5.614441453323571e-06, |
| "loss": 0.2641, |
| "step": 188 |
| }, |
| { |
| "epoch": 0.9264705882352942, |
| "grad_norm": 0.47408281336995545, |
| "learning_rate": 5.576215515663489e-06, |
| "loss": 0.2672, |
| "step": 189 |
| }, |
| { |
| "epoch": 0.9313725490196079, |
| "grad_norm": 0.5213843608163483, |
| "learning_rate": 5.537955414514058e-06, |
| "loss": 0.287, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.9362745098039216, |
| "grad_norm": 0.5225831835576515, |
| "learning_rate": 5.499663418294858e-06, |
| "loss": 0.2719, |
| "step": 191 |
| }, |
| { |
| "epoch": 0.9411764705882353, |
| "grad_norm": 0.4879552875250876, |
| "learning_rate": 5.46134179731651e-06, |
| "loss": 0.2808, |
| "step": 192 |
| }, |
| { |
| "epoch": 0.946078431372549, |
| "grad_norm": 0.4429756422592165, |
| "learning_rate": 5.4229928236460705e-06, |
| "loss": 0.2511, |
| "step": 193 |
| }, |
| { |
| "epoch": 0.9509803921568627, |
| "grad_norm": 0.4838680164008833, |
| "learning_rate": 5.3846187709723195e-06, |
| "loss": 0.2729, |
| "step": 194 |
| }, |
| { |
| "epoch": 0.9558823529411765, |
| "grad_norm": 0.45487335735058904, |
| "learning_rate": 5.346221914470959e-06, |
| "loss": 0.2639, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.9607843137254902, |
| "grad_norm": 0.47930922167465867, |
| "learning_rate": 5.3078045306697154e-06, |
| "loss": 0.2721, |
| "step": 196 |
| }, |
| { |
| "epoch": 0.9656862745098039, |
| "grad_norm": 0.48206365406271, |
| "learning_rate": 5.2693688973133675e-06, |
| "loss": 0.2844, |
| "step": 197 |
| }, |
| { |
| "epoch": 0.9705882352941176, |
| "grad_norm": 0.4426568449367021, |
| "learning_rate": 5.230917293228699e-06, |
| "loss": 0.2472, |
| "step": 198 |
| }, |
| { |
| "epoch": 0.9754901960784313, |
| "grad_norm": 0.45364504567570346, |
| "learning_rate": 5.192451998189392e-06, |
| "loss": 0.2734, |
| "step": 199 |
| }, |
| { |
| "epoch": 0.9803921568627451, |
| "grad_norm": 0.4757644973640628, |
| "learning_rate": 5.153975292780852e-06, |
| "loss": 0.2634, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.9803921568627451, |
| "eval_loss": 0.2901236116886139, |
| "eval_runtime": 4.5192, |
| "eval_samples_per_second": 14.604, |
| "eval_steps_per_second": 3.762, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.9852941176470589, |
| "grad_norm": 0.4743962403323634, |
| "learning_rate": 5.115489458265006e-06, |
| "loss": 0.2708, |
| "step": 201 |
| }, |
| { |
| "epoch": 0.9901960784313726, |
| "grad_norm": 0.4984282092923338, |
| "learning_rate": 5.0769967764450345e-06, |
| "loss": 0.2507, |
| "step": 202 |
| }, |
| { |
| "epoch": 0.9950980392156863, |
| "grad_norm": 0.4605973534637747, |
| "learning_rate": 5.038499529530094e-06, |
| "loss": 0.2604, |
| "step": 203 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.49516075604073534, |
| "learning_rate": 5e-06, |
| "loss": 0.2695, |
| "step": 204 |
| }, |
| { |
| "epoch": 1.0049019607843137, |
| "grad_norm": 0.5082618843355043, |
| "learning_rate": 4.961500470469908e-06, |
| "loss": 0.2255, |
| "step": 205 |
| }, |
| { |
| "epoch": 1.0098039215686274, |
| "grad_norm": 0.5088751533658034, |
| "learning_rate": 4.923003223554967e-06, |
| "loss": 0.2447, |
| "step": 206 |
| }, |
| { |
| "epoch": 1.0147058823529411, |
| "grad_norm": 0.49220028140675365, |
| "learning_rate": 4.8845105417349955e-06, |
| "loss": 0.2312, |
| "step": 207 |
| }, |
| { |
| "epoch": 1.0196078431372548, |
| "grad_norm": 0.4708853500743636, |
| "learning_rate": 4.846024707219149e-06, |
| "loss": 0.2198, |
| "step": 208 |
| }, |
| { |
| "epoch": 1.0245098039215685, |
| "grad_norm": 0.5298187958047372, |
| "learning_rate": 4.807548001810611e-06, |
| "loss": 0.2366, |
| "step": 209 |
| }, |
| { |
| "epoch": 1.0294117647058822, |
| "grad_norm": 0.4526141045695304, |
| "learning_rate": 4.7690827067713035e-06, |
| "loss": 0.2137, |
| "step": 210 |
| }, |
| { |
| "epoch": 1.0343137254901962, |
| "grad_norm": 0.4587936745039723, |
| "learning_rate": 4.730631102686635e-06, |
| "loss": 0.2379, |
| "step": 211 |
| }, |
| { |
| "epoch": 1.0392156862745099, |
| "grad_norm": 0.47490380719342673, |
| "learning_rate": 4.692195469330286e-06, |
| "loss": 0.2166, |
| "step": 212 |
| }, |
| { |
| "epoch": 1.0441176470588236, |
| "grad_norm": 0.4543116276180149, |
| "learning_rate": 4.653778085529043e-06, |
| "loss": 0.2269, |
| "step": 213 |
| }, |
| { |
| "epoch": 1.0490196078431373, |
| "grad_norm": 0.4789578356476451, |
| "learning_rate": 4.615381229027681e-06, |
| "loss": 0.208, |
| "step": 214 |
| }, |
| { |
| "epoch": 1.053921568627451, |
| "grad_norm": 0.5295078000919256, |
| "learning_rate": 4.577007176353931e-06, |
| "loss": 0.2184, |
| "step": 215 |
| }, |
| { |
| "epoch": 1.0588235294117647, |
| "grad_norm": 0.5848689976910358, |
| "learning_rate": 4.53865820268349e-06, |
| "loss": 0.2275, |
| "step": 216 |
| }, |
| { |
| "epoch": 1.0637254901960784, |
| "grad_norm": 0.4738691598319379, |
| "learning_rate": 4.5003365817051434e-06, |
| "loss": 0.2307, |
| "step": 217 |
| }, |
| { |
| "epoch": 1.0686274509803921, |
| "grad_norm": 0.43950765165268024, |
| "learning_rate": 4.462044585485944e-06, |
| "loss": 0.2165, |
| "step": 218 |
| }, |
| { |
| "epoch": 1.0735294117647058, |
| "grad_norm": 0.49593785245882227, |
| "learning_rate": 4.4237844843365126e-06, |
| "loss": 0.2462, |
| "step": 219 |
| }, |
| { |
| "epoch": 1.0784313725490196, |
| "grad_norm": 0.5332165363629019, |
| "learning_rate": 4.3855585466764305e-06, |
| "loss": 0.2273, |
| "step": 220 |
| }, |
| { |
| "epoch": 1.0833333333333333, |
| "grad_norm": 0.4850004626701561, |
| "learning_rate": 4.347369038899744e-06, |
| "loss": 0.2092, |
| "step": 221 |
| }, |
| { |
| "epoch": 1.088235294117647, |
| "grad_norm": 0.5203678675930979, |
| "learning_rate": 4.309218225240591e-06, |
| "loss": 0.232, |
| "step": 222 |
| }, |
| { |
| "epoch": 1.093137254901961, |
| "grad_norm": 0.5490724478431357, |
| "learning_rate": 4.271108367638959e-06, |
| "loss": 0.2261, |
| "step": 223 |
| }, |
| { |
| "epoch": 1.0980392156862746, |
| "grad_norm": 0.4752955178586657, |
| "learning_rate": 4.233041725606573e-06, |
| "loss": 0.2103, |
| "step": 224 |
| }, |
| { |
| "epoch": 1.1029411764705883, |
| "grad_norm": 0.5271708488599877, |
| "learning_rate": 4.195020556092935e-06, |
| "loss": 0.2599, |
| "step": 225 |
| }, |
| { |
| "epoch": 1.107843137254902, |
| "grad_norm": 0.48431472139133885, |
| "learning_rate": 4.157047113351504e-06, |
| "loss": 0.2081, |
| "step": 226 |
| }, |
| { |
| "epoch": 1.1127450980392157, |
| "grad_norm": 0.4799065587797215, |
| "learning_rate": 4.119123648806046e-06, |
| "loss": 0.2239, |
| "step": 227 |
| }, |
| { |
| "epoch": 1.1176470588235294, |
| "grad_norm": 0.4843432542281396, |
| "learning_rate": 4.081252410917148e-06, |
| "loss": 0.233, |
| "step": 228 |
| }, |
| { |
| "epoch": 1.1225490196078431, |
| "grad_norm": 0.5134294387550117, |
| "learning_rate": 4.043435645048911e-06, |
| "loss": 0.2347, |
| "step": 229 |
| }, |
| { |
| "epoch": 1.1274509803921569, |
| "grad_norm": 0.4674896337352177, |
| "learning_rate": 4.005675593335818e-06, |
| "loss": 0.2107, |
| "step": 230 |
| }, |
| { |
| "epoch": 1.1323529411764706, |
| "grad_norm": 0.4612970868262565, |
| "learning_rate": 3.967974494549803e-06, |
| "loss": 0.2091, |
| "step": 231 |
| }, |
| { |
| "epoch": 1.1372549019607843, |
| "grad_norm": 0.5731954137918644, |
| "learning_rate": 3.930334583967514e-06, |
| "loss": 0.2372, |
| "step": 232 |
| }, |
| { |
| "epoch": 1.142156862745098, |
| "grad_norm": 0.5075101754997032, |
| "learning_rate": 3.892758093237788e-06, |
| "loss": 0.2304, |
| "step": 233 |
| }, |
| { |
| "epoch": 1.1470588235294117, |
| "grad_norm": 0.479594961464793, |
| "learning_rate": 3.855247250249331e-06, |
| "loss": 0.2086, |
| "step": 234 |
| }, |
| { |
| "epoch": 1.1519607843137254, |
| "grad_norm": 0.5071236660718516, |
| "learning_rate": 3.8178042789986355e-06, |
| "loss": 0.2306, |
| "step": 235 |
| }, |
| { |
| "epoch": 1.156862745098039, |
| "grad_norm": 0.46823532072465995, |
| "learning_rate": 3.7804313994581143e-06, |
| "loss": 0.2123, |
| "step": 236 |
| }, |
| { |
| "epoch": 1.161764705882353, |
| "grad_norm": 0.4963955283839783, |
| "learning_rate": 3.743130827444487e-06, |
| "loss": 0.2276, |
| "step": 237 |
| }, |
| { |
| "epoch": 1.1666666666666667, |
| "grad_norm": 0.4722357144886749, |
| "learning_rate": 3.705904774487396e-06, |
| "loss": 0.225, |
| "step": 238 |
| }, |
| { |
| "epoch": 1.1715686274509804, |
| "grad_norm": 0.4712118235647605, |
| "learning_rate": 3.6687554476982954e-06, |
| "loss": 0.2171, |
| "step": 239 |
| }, |
| { |
| "epoch": 1.1764705882352942, |
| "grad_norm": 0.4839778546728081, |
| "learning_rate": 3.6316850496395863e-06, |
| "loss": 0.209, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.1813725490196079, |
| "grad_norm": 0.4855855826367996, |
| "learning_rate": 3.5946957781940296e-06, |
| "loss": 0.215, |
| "step": 241 |
| }, |
| { |
| "epoch": 1.1862745098039216, |
| "grad_norm": 0.4661121070145884, |
| "learning_rate": 3.557789826434439e-06, |
| "loss": 0.2163, |
| "step": 242 |
| }, |
| { |
| "epoch": 1.1911764705882353, |
| "grad_norm": 0.5101563310470888, |
| "learning_rate": 3.5209693824936486e-06, |
| "loss": 0.2219, |
| "step": 243 |
| }, |
| { |
| "epoch": 1.196078431372549, |
| "grad_norm": 0.45246713488863194, |
| "learning_rate": 3.484236629434783e-06, |
| "loss": 0.22, |
| "step": 244 |
| }, |
| { |
| "epoch": 1.2009803921568627, |
| "grad_norm": 0.48803317168808835, |
| "learning_rate": 3.4475937451218257e-06, |
| "loss": 0.2421, |
| "step": 245 |
| }, |
| { |
| "epoch": 1.2058823529411764, |
| "grad_norm": 0.47070215757963546, |
| "learning_rate": 3.4110429020904924e-06, |
| "loss": 0.2112, |
| "step": 246 |
| }, |
| { |
| "epoch": 1.2107843137254901, |
| "grad_norm": 0.5433683679114224, |
| "learning_rate": 3.3745862674194246e-06, |
| "loss": 0.2318, |
| "step": 247 |
| }, |
| { |
| "epoch": 1.215686274509804, |
| "grad_norm": 0.494372431441497, |
| "learning_rate": 3.3382260026017027e-06, |
| "loss": 0.2263, |
| "step": 248 |
| }, |
| { |
| "epoch": 1.2205882352941178, |
| "grad_norm": 0.48636265742721485, |
| "learning_rate": 3.301964263416693e-06, |
| "loss": 0.2114, |
| "step": 249 |
| }, |
| { |
| "epoch": 1.2254901960784315, |
| "grad_norm": 0.4739784224799503, |
| "learning_rate": 3.2658031998022368e-06, |
| "loss": 0.2043, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.2303921568627452, |
| "grad_norm": 0.4815109523313931, |
| "learning_rate": 3.2297449557271743e-06, |
| "loss": 0.2165, |
| "step": 251 |
| }, |
| { |
| "epoch": 1.2352941176470589, |
| "grad_norm": 0.45623070246733244, |
| "learning_rate": 3.1937916690642356e-06, |
| "loss": 0.2018, |
| "step": 252 |
| }, |
| { |
| "epoch": 1.2401960784313726, |
| "grad_norm": 0.5217926836177432, |
| "learning_rate": 3.1579454714632853e-06, |
| "loss": 0.242, |
| "step": 253 |
| }, |
| { |
| "epoch": 1.2450980392156863, |
| "grad_norm": 0.4853475659437706, |
| "learning_rate": 3.1222084882249375e-06, |
| "loss": 0.223, |
| "step": 254 |
| }, |
| { |
| "epoch": 1.25, |
| "grad_norm": 0.4723805429523192, |
| "learning_rate": 3.0865828381745515e-06, |
| "loss": 0.2289, |
| "step": 255 |
| }, |
| { |
| "epoch": 1.2549019607843137, |
| "grad_norm": 0.5016994760967523, |
| "learning_rate": 3.0510706335366034e-06, |
| "loss": 0.2424, |
| "step": 256 |
| }, |
| { |
| "epoch": 1.2598039215686274, |
| "grad_norm": 0.5020856006286446, |
| "learning_rate": 3.015673979809457e-06, |
| "loss": 0.2283, |
| "step": 257 |
| }, |
| { |
| "epoch": 1.2647058823529411, |
| "grad_norm": 0.48802662651093065, |
| "learning_rate": 2.980394975640526e-06, |
| "loss": 0.2101, |
| "step": 258 |
| }, |
| { |
| "epoch": 1.2696078431372548, |
| "grad_norm": 0.5011136337277475, |
| "learning_rate": 2.9452357127018516e-06, |
| "loss": 0.2159, |
| "step": 259 |
| }, |
| { |
| "epoch": 1.2745098039215685, |
| "grad_norm": 0.5400359633757877, |
| "learning_rate": 2.910198275566085e-06, |
| "loss": 0.2277, |
| "step": 260 |
| }, |
| { |
| "epoch": 1.2794117647058822, |
| "grad_norm": 0.5137240802607623, |
| "learning_rate": 2.8752847415828923e-06, |
| "loss": 0.2215, |
| "step": 261 |
| }, |
| { |
| "epoch": 1.284313725490196, |
| "grad_norm": 0.47572170016097703, |
| "learning_rate": 2.8404971807557957e-06, |
| "loss": 0.2219, |
| "step": 262 |
| }, |
| { |
| "epoch": 1.2892156862745099, |
| "grad_norm": 0.485360542980026, |
| "learning_rate": 2.80583765561944e-06, |
| "loss": 0.2301, |
| "step": 263 |
| }, |
| { |
| "epoch": 1.2941176470588236, |
| "grad_norm": 0.5083747858931846, |
| "learning_rate": 2.771308221117309e-06, |
| "loss": 0.2148, |
| "step": 264 |
| }, |
| { |
| "epoch": 1.2990196078431373, |
| "grad_norm": 0.5607090322482143, |
| "learning_rate": 2.736910924479881e-06, |
| "loss": 0.2268, |
| "step": 265 |
| }, |
| { |
| "epoch": 1.303921568627451, |
| "grad_norm": 0.4896494954982839, |
| "learning_rate": 2.7026478051032625e-06, |
| "loss": 0.2351, |
| "step": 266 |
| }, |
| { |
| "epoch": 1.3088235294117647, |
| "grad_norm": 0.4929197961015348, |
| "learning_rate": 2.668520894428259e-06, |
| "loss": 0.2126, |
| "step": 267 |
| }, |
| { |
| "epoch": 1.3137254901960784, |
| "grad_norm": 0.4539652841374491, |
| "learning_rate": 2.6345322158199503e-06, |
| "loss": 0.2094, |
| "step": 268 |
| }, |
| { |
| "epoch": 1.3186274509803921, |
| "grad_norm": 0.48162901242603456, |
| "learning_rate": 2.600683784447704e-06, |
| "loss": 0.2084, |
| "step": 269 |
| }, |
| { |
| "epoch": 1.3235294117647058, |
| "grad_norm": 0.47818141221316735, |
| "learning_rate": 2.5669776071657194e-06, |
| "loss": 0.2156, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.3284313725490196, |
| "grad_norm": 0.4928567042617787, |
| "learning_rate": 2.5334156823940237e-06, |
| "loss": 0.231, |
| "step": 271 |
| }, |
| { |
| "epoch": 1.3333333333333333, |
| "grad_norm": 0.49369116994850987, |
| "learning_rate": 2.5000000000000015e-06, |
| "loss": 0.2221, |
| "step": 272 |
| }, |
| { |
| "epoch": 1.3382352941176472, |
| "grad_norm": 0.48305909995269875, |
| "learning_rate": 2.466732541180404e-06, |
| "loss": 0.2135, |
| "step": 273 |
| }, |
| { |
| "epoch": 1.343137254901961, |
| "grad_norm": 0.4951678749675107, |
| "learning_rate": 2.4336152783438984e-06, |
| "loss": 0.2297, |
| "step": 274 |
| }, |
| { |
| "epoch": 1.3480392156862746, |
| "grad_norm": 0.4657323792291692, |
| "learning_rate": 2.4006501749941097e-06, |
| "loss": 0.2274, |
| "step": 275 |
| }, |
| { |
| "epoch": 1.3529411764705883, |
| "grad_norm": 0.5241852758039561, |
| "learning_rate": 2.3678391856132203e-06, |
| "loss": 0.2231, |
| "step": 276 |
| }, |
| { |
| "epoch": 1.357843137254902, |
| "grad_norm": 0.48935802891939345, |
| "learning_rate": 2.335184255546083e-06, |
| "loss": 0.207, |
| "step": 277 |
| }, |
| { |
| "epoch": 1.3627450980392157, |
| "grad_norm": 0.49803971224569626, |
| "learning_rate": 2.302687320884876e-06, |
| "loss": 0.2041, |
| "step": 278 |
| }, |
| { |
| "epoch": 1.3676470588235294, |
| "grad_norm": 0.522300739994449, |
| "learning_rate": 2.2703503083543288e-06, |
| "loss": 0.2322, |
| "step": 279 |
| }, |
| { |
| "epoch": 1.3725490196078431, |
| "grad_norm": 0.5175680961463683, |
| "learning_rate": 2.238175135197471e-06, |
| "loss": 0.2292, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.3774509803921569, |
| "grad_norm": 0.503707855088509, |
| "learning_rate": 2.206163709061976e-06, |
| "loss": 0.2127, |
| "step": 281 |
| }, |
| { |
| "epoch": 1.3823529411764706, |
| "grad_norm": 0.5074511535403994, |
| "learning_rate": 2.174317927887041e-06, |
| "loss": 0.229, |
| "step": 282 |
| }, |
| { |
| "epoch": 1.3872549019607843, |
| "grad_norm": 0.4925131267407971, |
| "learning_rate": 2.1426396797908764e-06, |
| "loss": 0.2165, |
| "step": 283 |
| }, |
| { |
| "epoch": 1.392156862745098, |
| "grad_norm": 0.5106602755906986, |
| "learning_rate": 2.1111308429587446e-06, |
| "loss": 0.2191, |
| "step": 284 |
| }, |
| { |
| "epoch": 1.3970588235294117, |
| "grad_norm": 0.49942574868641376, |
| "learning_rate": 2.0797932855316183e-06, |
| "loss": 0.2108, |
| "step": 285 |
| }, |
| { |
| "epoch": 1.4019607843137254, |
| "grad_norm": 0.4783779601581248, |
| "learning_rate": 2.048628865495403e-06, |
| "loss": 0.2154, |
| "step": 286 |
| }, |
| { |
| "epoch": 1.406862745098039, |
| "grad_norm": 0.4675068927861205, |
| "learning_rate": 2.017639430570794e-06, |
| "loss": 0.2132, |
| "step": 287 |
| }, |
| { |
| "epoch": 1.4117647058823528, |
| "grad_norm": 0.5045061184751272, |
| "learning_rate": 1.9868268181037186e-06, |
| "loss": 0.2307, |
| "step": 288 |
| }, |
| { |
| "epoch": 1.4166666666666667, |
| "grad_norm": 0.48658941494641167, |
| "learning_rate": 1.956192854956397e-06, |
| "loss": 0.2213, |
| "step": 289 |
| }, |
| { |
| "epoch": 1.4215686274509804, |
| "grad_norm": 0.49002558251207806, |
| "learning_rate": 1.925739357399038e-06, |
| "loss": 0.229, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.4264705882352942, |
| "grad_norm": 0.48906447602859293, |
| "learning_rate": 1.8954681310021434e-06, |
| "loss": 0.2272, |
| "step": 291 |
| }, |
| { |
| "epoch": 1.4313725490196079, |
| "grad_norm": 0.5142081803047547, |
| "learning_rate": 1.865380970529469e-06, |
| "loss": 0.2325, |
| "step": 292 |
| }, |
| { |
| "epoch": 1.4362745098039216, |
| "grad_norm": 0.4912306633034611, |
| "learning_rate": 1.8354796598315977e-06, |
| "loss": 0.2227, |
| "step": 293 |
| }, |
| { |
| "epoch": 1.4411764705882353, |
| "grad_norm": 0.5145847762913899, |
| "learning_rate": 1.8057659717401948e-06, |
| "loss": 0.2255, |
| "step": 294 |
| }, |
| { |
| "epoch": 1.446078431372549, |
| "grad_norm": 0.49819056747659707, |
| "learning_rate": 1.7762416679628792e-06, |
| "loss": 0.2176, |
| "step": 295 |
| }, |
| { |
| "epoch": 1.4509803921568627, |
| "grad_norm": 0.531159719917642, |
| "learning_rate": 1.746908498978791e-06, |
| "loss": 0.2486, |
| "step": 296 |
| }, |
| { |
| "epoch": 1.4558823529411764, |
| "grad_norm": 0.5032774150199282, |
| "learning_rate": 1.7177682039347875e-06, |
| "loss": 0.222, |
| "step": 297 |
| }, |
| { |
| "epoch": 1.4607843137254901, |
| "grad_norm": 0.5141853027734455, |
| "learning_rate": 1.6888225105423505e-06, |
| "loss": 0.2472, |
| "step": 298 |
| }, |
| { |
| "epoch": 1.465686274509804, |
| "grad_norm": 0.5159209957285815, |
| "learning_rate": 1.6600731349751303e-06, |
| "loss": 0.2137, |
| "step": 299 |
| }, |
| { |
| "epoch": 1.4705882352941178, |
| "grad_norm": 0.4548136506677179, |
| "learning_rate": 1.6315217817672142e-06, |
| "loss": 0.2028, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.4754901960784315, |
| "grad_norm": 0.5217869077832439, |
| "learning_rate": 1.6031701437120512e-06, |
| "loss": 0.2282, |
| "step": 301 |
| }, |
| { |
| "epoch": 1.4803921568627452, |
| "grad_norm": 0.4743126314542162, |
| "learning_rate": 1.575019901762097e-06, |
| "loss": 0.2214, |
| "step": 302 |
| }, |
| { |
| "epoch": 1.4852941176470589, |
| "grad_norm": 0.46199991416748565, |
| "learning_rate": 1.5470727249291423e-06, |
| "loss": 0.2362, |
| "step": 303 |
| }, |
| { |
| "epoch": 1.4901960784313726, |
| "grad_norm": 0.4880202546561932, |
| "learning_rate": 1.5193302701853674e-06, |
| "loss": 0.2274, |
| "step": 304 |
| }, |
| { |
| "epoch": 1.4950980392156863, |
| "grad_norm": 0.5157149627873591, |
| "learning_rate": 1.4917941823650917e-06, |
| "loss": 0.2443, |
| "step": 305 |
| }, |
| { |
| "epoch": 1.5, |
| "grad_norm": 0.49949954036382943, |
| "learning_rate": 1.4644660940672628e-06, |
| "loss": 0.2207, |
| "step": 306 |
| }, |
| { |
| "epoch": 1.5049019607843137, |
| "grad_norm": 0.47935960669362754, |
| "learning_rate": 1.4373476255586515e-06, |
| "loss": 0.2175, |
| "step": 307 |
| }, |
| { |
| "epoch": 1.5098039215686274, |
| "grad_norm": 0.47396201830045953, |
| "learning_rate": 1.410440384677791e-06, |
| "loss": 0.2274, |
| "step": 308 |
| }, |
| { |
| "epoch": 1.5147058823529411, |
| "grad_norm": 0.47348072824518034, |
| "learning_rate": 1.383745966739652e-06, |
| "loss": 0.2251, |
| "step": 309 |
| }, |
| { |
| "epoch": 1.5196078431372548, |
| "grad_norm": 0.49930262299209816, |
| "learning_rate": 1.3572659544410493e-06, |
| "loss": 0.2163, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.5245098039215685, |
| "grad_norm": 0.4992054804915185, |
| "learning_rate": 1.3310019177668154e-06, |
| "loss": 0.2329, |
| "step": 311 |
| }, |
| { |
| "epoch": 1.5294117647058822, |
| "grad_norm": 0.4845633753575591, |
| "learning_rate": 1.3049554138967052e-06, |
| "loss": 0.2224, |
| "step": 312 |
| }, |
| { |
| "epoch": 1.534313725490196, |
| "grad_norm": 0.4856356018612521, |
| "learning_rate": 1.2791279871130824e-06, |
| "loss": 0.2212, |
| "step": 313 |
| }, |
| { |
| "epoch": 1.5392156862745097, |
| "grad_norm": 0.5136795584316685, |
| "learning_rate": 1.2535211687093535e-06, |
| "loss": 0.2426, |
| "step": 314 |
| }, |
| { |
| "epoch": 1.5441176470588234, |
| "grad_norm": 0.4992226834558923, |
| "learning_rate": 1.2281364768991804e-06, |
| "loss": 0.2337, |
| "step": 315 |
| }, |
| { |
| "epoch": 1.5490196078431373, |
| "grad_norm": 0.46127348587510075, |
| "learning_rate": 1.202975416726464e-06, |
| "loss": 0.2206, |
| "step": 316 |
| }, |
| { |
| "epoch": 1.553921568627451, |
| "grad_norm": 0.4751601261175884, |
| "learning_rate": 1.1780394799761163e-06, |
| "loss": 0.221, |
| "step": 317 |
| }, |
| { |
| "epoch": 1.5588235294117647, |
| "grad_norm": 0.4687740072346912, |
| "learning_rate": 1.1533301450856054e-06, |
| "loss": 0.2147, |
| "step": 318 |
| }, |
| { |
| "epoch": 1.5637254901960784, |
| "grad_norm": 0.494253944161966, |
| "learning_rate": 1.1288488770573097e-06, |
| "loss": 0.2145, |
| "step": 319 |
| }, |
| { |
| "epoch": 1.5686274509803921, |
| "grad_norm": 0.45656769356436955, |
| "learning_rate": 1.1045971273716476e-06, |
| "loss": 0.2225, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.5735294117647058, |
| "grad_norm": 0.4700967866343513, |
| "learning_rate": 1.0805763339010329e-06, |
| "loss": 0.2155, |
| "step": 321 |
| }, |
| { |
| "epoch": 1.5784313725490198, |
| "grad_norm": 0.4785394705087758, |
| "learning_rate": 1.0567879208246084e-06, |
| "loss": 0.2001, |
| "step": 322 |
| }, |
| { |
| "epoch": 1.5833333333333335, |
| "grad_norm": 0.4733995988013302, |
| "learning_rate": 1.0332332985438248e-06, |
| "loss": 0.2092, |
| "step": 323 |
| }, |
| { |
| "epoch": 1.5882352941176472, |
| "grad_norm": 0.4721076608628528, |
| "learning_rate": 1.0099138635988026e-06, |
| "loss": 0.2171, |
| "step": 324 |
| }, |
| { |
| "epoch": 1.593137254901961, |
| "grad_norm": 0.4760012038103648, |
| "learning_rate": 9.868309985855446e-07, |
| "loss": 0.2285, |
| "step": 325 |
| }, |
| { |
| "epoch": 1.5980392156862746, |
| "grad_norm": 0.46040599411726224, |
| "learning_rate": 9.639860720739524e-07, |
| "loss": 0.2069, |
| "step": 326 |
| }, |
| { |
| "epoch": 1.6029411764705883, |
| "grad_norm": 0.4604293862507961, |
| "learning_rate": 9.41380438526694e-07, |
| "loss": 0.2242, |
| "step": 327 |
| }, |
| { |
| "epoch": 1.607843137254902, |
| "grad_norm": 0.4757645851924005, |
| "learning_rate": 9.190154382188921e-07, |
| "loss": 0.228, |
| "step": 328 |
| }, |
| { |
| "epoch": 1.6127450980392157, |
| "grad_norm": 0.4950460680266549, |
| "learning_rate": 8.968923971586596e-07, |
| "loss": 0.2003, |
| "step": 329 |
| }, |
| { |
| "epoch": 1.6176470588235294, |
| "grad_norm": 0.49276290384706434, |
| "learning_rate": 8.750126270084891e-07, |
| "loss": 0.2225, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.6225490196078431, |
| "grad_norm": 0.5021860257275205, |
| "learning_rate": 8.533774250074727e-07, |
| "loss": 0.2301, |
| "step": 331 |
| }, |
| { |
| "epoch": 1.6274509803921569, |
| "grad_norm": 0.5235703470648727, |
| "learning_rate": 8.31988073894403e-07, |
| "loss": 0.2459, |
| "step": 332 |
| }, |
| { |
| "epoch": 1.6323529411764706, |
| "grad_norm": 0.46964359302698966, |
| "learning_rate": 8.108458418317089e-07, |
| "loss": 0.2371, |
| "step": 333 |
| }, |
| { |
| "epoch": 1.6372549019607843, |
| "grad_norm": 0.4686413263785466, |
| "learning_rate": 7.899519823302743e-07, |
| "loss": 0.2243, |
| "step": 334 |
| }, |
| { |
| "epoch": 1.642156862745098, |
| "grad_norm": 0.44320620484465817, |
| "learning_rate": 7.693077341751138e-07, |
| "loss": 0.2046, |
| "step": 335 |
| }, |
| { |
| "epoch": 1.6470588235294117, |
| "grad_norm": 0.46592155506606175, |
| "learning_rate": 7.489143213519301e-07, |
| "loss": 0.2201, |
| "step": 336 |
| }, |
| { |
| "epoch": 1.6519607843137254, |
| "grad_norm": 0.4714629835797636, |
| "learning_rate": 7.287729529745386e-07, |
| "loss": 0.2333, |
| "step": 337 |
| }, |
| { |
| "epoch": 1.656862745098039, |
| "grad_norm": 0.5011962626617099, |
| "learning_rate": 7.088848232131862e-07, |
| "loss": 0.2286, |
| "step": 338 |
| }, |
| { |
| "epoch": 1.6617647058823528, |
| "grad_norm": 0.4840431535834992, |
| "learning_rate": 6.892511112237472e-07, |
| "loss": 0.2262, |
| "step": 339 |
| }, |
| { |
| "epoch": 1.6666666666666665, |
| "grad_norm": 0.49298475918099244, |
| "learning_rate": 6.698729810778065e-07, |
| "loss": 0.2423, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.6715686274509802, |
| "grad_norm": 0.5150625646437968, |
| "learning_rate": 6.507515816936538e-07, |
| "loss": 0.2431, |
| "step": 341 |
| }, |
| { |
| "epoch": 1.6764705882352942, |
| "grad_norm": 0.4673229541968105, |
| "learning_rate": 6.318880467681527e-07, |
| "loss": 0.197, |
| "step": 342 |
| }, |
| { |
| "epoch": 1.6813725490196079, |
| "grad_norm": 0.4852125017912043, |
| "learning_rate": 6.132834947095334e-07, |
| "loss": 0.2163, |
| "step": 343 |
| }, |
| { |
| "epoch": 1.6862745098039216, |
| "grad_norm": 0.5016719261559505, |
| "learning_rate": 5.949390285710777e-07, |
| "loss": 0.2291, |
| "step": 344 |
| }, |
| { |
| "epoch": 1.6911764705882353, |
| "grad_norm": 0.5224531030766459, |
| "learning_rate": 5.768557359857241e-07, |
| "loss": 0.2428, |
| "step": 345 |
| }, |
| { |
| "epoch": 1.696078431372549, |
| "grad_norm": 0.5022281569239445, |
| "learning_rate": 5.590346891015758e-07, |
| "loss": 0.2311, |
| "step": 346 |
| }, |
| { |
| "epoch": 1.7009803921568627, |
| "grad_norm": 0.48366634743846737, |
| "learning_rate": 5.414769445183432e-07, |
| "loss": 0.2366, |
| "step": 347 |
| }, |
| { |
| "epoch": 1.7058823529411766, |
| "grad_norm": 0.43375840975227226, |
| "learning_rate": 5.241835432246888e-07, |
| "loss": 0.193, |
| "step": 348 |
| }, |
| { |
| "epoch": 1.7107843137254903, |
| "grad_norm": 0.48263877759982, |
| "learning_rate": 5.071555105365156e-07, |
| "loss": 0.1957, |
| "step": 349 |
| }, |
| { |
| "epoch": 1.715686274509804, |
| "grad_norm": 0.49674137654837686, |
| "learning_rate": 4.903938560361698e-07, |
| "loss": 0.226, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.7205882352941178, |
| "grad_norm": 0.48486655987557653, |
| "learning_rate": 4.738995735125895e-07, |
| "loss": 0.2113, |
| "step": 351 |
| }, |
| { |
| "epoch": 1.7254901960784315, |
| "grad_norm": 0.4727249461110737, |
| "learning_rate": 4.576736409023813e-07, |
| "loss": 0.2344, |
| "step": 352 |
| }, |
| { |
| "epoch": 1.7303921568627452, |
| "grad_norm": 0.4794537140272506, |
| "learning_rate": 4.4171702023183663e-07, |
| "loss": 0.2117, |
| "step": 353 |
| }, |
| { |
| "epoch": 1.7352941176470589, |
| "grad_norm": 0.4798108438983259, |
| "learning_rate": 4.2603065755989493e-07, |
| "loss": 0.2253, |
| "step": 354 |
| }, |
| { |
| "epoch": 1.7401960784313726, |
| "grad_norm": 0.5174065293558706, |
| "learning_rate": 4.10615482922056e-07, |
| "loss": 0.2246, |
| "step": 355 |
| }, |
| { |
| "epoch": 1.7450980392156863, |
| "grad_norm": 0.4806212887917769, |
| "learning_rate": 3.9547241027523164e-07, |
| "loss": 0.2228, |
| "step": 356 |
| }, |
| { |
| "epoch": 1.75, |
| "grad_norm": 0.48418984478836974, |
| "learning_rate": 3.8060233744356634e-07, |
| "loss": 0.2197, |
| "step": 357 |
| }, |
| { |
| "epoch": 1.7549019607843137, |
| "grad_norm": 0.46333557615327836, |
| "learning_rate": 3.660061460651981e-07, |
| "loss": 0.2194, |
| "step": 358 |
| }, |
| { |
| "epoch": 1.7598039215686274, |
| "grad_norm": 0.5008257727294606, |
| "learning_rate": 3.5168470153998937e-07, |
| "loss": 0.2435, |
| "step": 359 |
| }, |
| { |
| "epoch": 1.7647058823529411, |
| "grad_norm": 0.4551834191653335, |
| "learning_rate": 3.3763885297822153e-07, |
| "loss": 0.2145, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.7696078431372548, |
| "grad_norm": 0.48282176466836263, |
| "learning_rate": 3.238694331502451e-07, |
| "loss": 0.2114, |
| "step": 361 |
| }, |
| { |
| "epoch": 1.7745098039215685, |
| "grad_norm": 0.5041514081418971, |
| "learning_rate": 3.103772584371106e-07, |
| "loss": 0.2569, |
| "step": 362 |
| }, |
| { |
| "epoch": 1.7794117647058822, |
| "grad_norm": 0.4812170684266997, |
| "learning_rate": 2.9716312878216194e-07, |
| "loss": 0.2138, |
| "step": 363 |
| }, |
| { |
| "epoch": 1.784313725490196, |
| "grad_norm": 0.458630979087121, |
| "learning_rate": 2.842278276436128e-07, |
| "loss": 0.2066, |
| "step": 364 |
| }, |
| { |
| "epoch": 1.7892156862745097, |
| "grad_norm": 0.4589648323345021, |
| "learning_rate": 2.71572121948091e-07, |
| "loss": 0.2291, |
| "step": 365 |
| }, |
| { |
| "epoch": 1.7941176470588234, |
| "grad_norm": 0.48772513974124904, |
| "learning_rate": 2.5919676204517073e-07, |
| "loss": 0.225, |
| "step": 366 |
| }, |
| { |
| "epoch": 1.7990196078431373, |
| "grad_norm": 0.47688867452378064, |
| "learning_rate": 2.471024816628836e-07, |
| "loss": 0.2117, |
| "step": 367 |
| }, |
| { |
| "epoch": 1.803921568627451, |
| "grad_norm": 0.4970666334607962, |
| "learning_rate": 2.3528999786421758e-07, |
| "loss": 0.2203, |
| "step": 368 |
| }, |
| { |
| "epoch": 1.8088235294117647, |
| "grad_norm": 0.5068554152010685, |
| "learning_rate": 2.237600110046001e-07, |
| "loss": 0.2432, |
| "step": 369 |
| }, |
| { |
| "epoch": 1.8137254901960784, |
| "grad_norm": 0.46797846957419414, |
| "learning_rate": 2.1251320469037827e-07, |
| "loss": 0.2131, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.8186274509803921, |
| "grad_norm": 0.49107452866371326, |
| "learning_rate": 2.0155024573828452e-07, |
| "loss": 0.2344, |
| "step": 371 |
| }, |
| { |
| "epoch": 1.8235294117647058, |
| "grad_norm": 0.49061343192781487, |
| "learning_rate": 1.908717841359048e-07, |
| "loss": 0.2174, |
| "step": 372 |
| }, |
| { |
| "epoch": 1.8284313725490198, |
| "grad_norm": 0.46324699413017995, |
| "learning_rate": 1.8047845300313726e-07, |
| "loss": 0.231, |
| "step": 373 |
| }, |
| { |
| "epoch": 1.8333333333333335, |
| "grad_norm": 0.4894838733242118, |
| "learning_rate": 1.7037086855465902e-07, |
| "loss": 0.2443, |
| "step": 374 |
| }, |
| { |
| "epoch": 1.8382352941176472, |
| "grad_norm": 0.4692723244866964, |
| "learning_rate": 1.6054963006338742e-07, |
| "loss": 0.2143, |
| "step": 375 |
| }, |
| { |
| "epoch": 1.843137254901961, |
| "grad_norm": 0.4698179780132864, |
| "learning_rate": 1.510153198249531e-07, |
| "loss": 0.2066, |
| "step": 376 |
| }, |
| { |
| "epoch": 1.8480392156862746, |
| "grad_norm": 0.4619932559660832, |
| "learning_rate": 1.4176850312317246e-07, |
| "loss": 0.2083, |
| "step": 377 |
| }, |
| { |
| "epoch": 1.8529411764705883, |
| "grad_norm": 0.5033482409328541, |
| "learning_rate": 1.328097281965357e-07, |
| "loss": 0.2385, |
| "step": 378 |
| }, |
| { |
| "epoch": 1.857843137254902, |
| "grad_norm": 0.49653636838509496, |
| "learning_rate": 1.241395262056999e-07, |
| "loss": 0.2259, |
| "step": 379 |
| }, |
| { |
| "epoch": 1.8627450980392157, |
| "grad_norm": 0.492657271024254, |
| "learning_rate": 1.157584112019966e-07, |
| "loss": 0.2354, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.8676470588235294, |
| "grad_norm": 0.4798776902464036, |
| "learning_rate": 1.0766688009695548e-07, |
| "loss": 0.239, |
| "step": 381 |
| }, |
| { |
| "epoch": 1.8725490196078431, |
| "grad_norm": 0.4818365919471583, |
| "learning_rate": 9.986541263284077e-08, |
| "loss": 0.2329, |
| "step": 382 |
| }, |
| { |
| "epoch": 1.8774509803921569, |
| "grad_norm": 0.4969256351294847, |
| "learning_rate": 9.235447135421127e-08, |
| "loss": 0.2253, |
| "step": 383 |
| }, |
| { |
| "epoch": 1.8823529411764706, |
| "grad_norm": 0.5001352235803699, |
| "learning_rate": 8.513450158049109e-08, |
| "loss": 0.2435, |
| "step": 384 |
| }, |
| { |
| "epoch": 1.8872549019607843, |
| "grad_norm": 0.4851147711386313, |
| "learning_rate": 7.820593137957244e-08, |
| "loss": 0.2113, |
| "step": 385 |
| }, |
| { |
| "epoch": 1.892156862745098, |
| "grad_norm": 0.49751220828120635, |
| "learning_rate": 7.156917154243048e-08, |
| "loss": 0.2326, |
| "step": 386 |
| }, |
| { |
| "epoch": 1.8970588235294117, |
| "grad_norm": 0.4377191985748351, |
| "learning_rate": 6.522461555877213e-08, |
| "loss": 0.2121, |
| "step": 387 |
| }, |
| { |
| "epoch": 1.9019607843137254, |
| "grad_norm": 0.44546214474502566, |
| "learning_rate": 5.917263959370312e-08, |
| "loss": 0.2092, |
| "step": 388 |
| }, |
| { |
| "epoch": 1.906862745098039, |
| "grad_norm": 0.49712054558929825, |
| "learning_rate": 5.341360246542804e-08, |
| "loss": 0.2166, |
| "step": 389 |
| }, |
| { |
| "epoch": 1.9117647058823528, |
| "grad_norm": 0.49011006260488194, |
| "learning_rate": 4.794784562397459e-08, |
| "loss": 0.2224, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.9166666666666665, |
| "grad_norm": 0.4816656251558823, |
| "learning_rate": 4.2775693130948094e-08, |
| "loss": 0.2199, |
| "step": 391 |
| }, |
| { |
| "epoch": 1.9215686274509802, |
| "grad_norm": 0.48846238423214716, |
| "learning_rate": 3.7897451640321326e-08, |
| "loss": 0.2388, |
| "step": 392 |
| }, |
| { |
| "epoch": 1.9264705882352942, |
| "grad_norm": 0.4666098884837087, |
| "learning_rate": 3.3313410380250157e-08, |
| "loss": 0.2037, |
| "step": 393 |
| }, |
| { |
| "epoch": 1.9313725490196079, |
| "grad_norm": 0.4682399224429251, |
| "learning_rate": 2.9023841135927822e-08, |
| "loss": 0.2142, |
| "step": 394 |
| }, |
| { |
| "epoch": 1.9362745098039216, |
| "grad_norm": 0.4736092237883573, |
| "learning_rate": 2.5028998233467272e-08, |
| "loss": 0.2228, |
| "step": 395 |
| }, |
| { |
| "epoch": 1.9411764705882353, |
| "grad_norm": 0.4697089850402861, |
| "learning_rate": 2.1329118524827662e-08, |
| "loss": 0.2315, |
| "step": 396 |
| }, |
| { |
| "epoch": 1.946078431372549, |
| "grad_norm": 0.4672227329719579, |
| "learning_rate": 1.7924421373766153e-08, |
| "loss": 0.2156, |
| "step": 397 |
| }, |
| { |
| "epoch": 1.9509803921568627, |
| "grad_norm": 0.497487935493445, |
| "learning_rate": 1.481510864283553e-08, |
| "loss": 0.2292, |
| "step": 398 |
| }, |
| { |
| "epoch": 1.9558823529411766, |
| "grad_norm": 0.46974714857662914, |
| "learning_rate": 1.200136468141544e-08, |
| "loss": 0.2111, |
| "step": 399 |
| }, |
| { |
| "epoch": 1.9607843137254903, |
| "grad_norm": 0.4879408121501933, |
| "learning_rate": 9.48335631477948e-09, |
| "loss": 0.2219, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.9607843137254903, |
| "eval_loss": 0.2928379774093628, |
| "eval_runtime": 4.5224, |
| "eval_samples_per_second": 14.594, |
| "eval_steps_per_second": 3.759, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.965686274509804, |
| "grad_norm": 0.5040773663379564, |
| "learning_rate": 7.261232834209208e-09, |
| "loss": 0.2405, |
| "step": 401 |
| }, |
| { |
| "epoch": 1.9705882352941178, |
| "grad_norm": 0.48149306121852936, |
| "learning_rate": 5.3351259881379016e-09, |
| "loss": 0.2352, |
| "step": 402 |
| }, |
| { |
| "epoch": 1.9754901960784315, |
| "grad_norm": 0.4749550695488964, |
| "learning_rate": 3.705149974342348e-09, |
| "loss": 0.2153, |
| "step": 403 |
| }, |
| { |
| "epoch": 1.9803921568627452, |
| "grad_norm": 0.4819452418309937, |
| "learning_rate": 2.371401433170495e-09, |
| "loss": 0.232, |
| "step": 404 |
| }, |
| { |
| "epoch": 1.9852941176470589, |
| "grad_norm": 0.46988250711319407, |
| "learning_rate": 1.3339594418138036e-09, |
| "loss": 0.2024, |
| "step": 405 |
| }, |
| { |
| "epoch": 1.9901960784313726, |
| "grad_norm": 0.4762711111697877, |
| "learning_rate": 5.928855096154485e-10, |
| "loss": 0.2009, |
| "step": 406 |
| }, |
| { |
| "epoch": 1.9950980392156863, |
| "grad_norm": 0.46631748299658515, |
| "learning_rate": 1.4822357442656475e-10, |
| "loss": 0.2188, |
| "step": 407 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 0.532021803787812, |
| "learning_rate": 0.0, |
| "loss": 0.2193, |
| "step": 408 |
| }, |
| { |
| "epoch": 2.0, |
| "step": 408, |
| "total_flos": 38770669780992.0, |
| "train_loss": 0.2513315852950601, |
| "train_runtime": 1293.2809, |
| "train_samples_per_second": 10.061, |
| "train_steps_per_second": 0.315 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 408, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 500000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 38770669780992.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|