{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 340, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014705882352941176, "grad_norm": 2.547836181388219, "learning_rate": 2.3529411764705885e-06, "loss": 0.693, "step": 1 }, { "epoch": 0.029411764705882353, "grad_norm": 2.5270813414533366, "learning_rate": 4.705882352941177e-06, "loss": 0.6892, "step": 2 }, { "epoch": 0.04411764705882353, "grad_norm": 2.355483907211656, "learning_rate": 7.058823529411766e-06, "loss": 0.6851, "step": 3 }, { "epoch": 0.058823529411764705, "grad_norm": 1.8130870424308736, "learning_rate": 9.411764705882354e-06, "loss": 0.6513, "step": 4 }, { "epoch": 0.07352941176470588, "grad_norm": 1.2216520407231852, "learning_rate": 1.1764705882352942e-05, "loss": 0.6231, "step": 5 }, { "epoch": 0.08823529411764706, "grad_norm": 1.503718312141216, "learning_rate": 1.4117647058823532e-05, "loss": 0.6169, "step": 6 }, { "epoch": 0.10294117647058823, "grad_norm": 1.7964660884238277, "learning_rate": 1.647058823529412e-05, "loss": 0.5687, "step": 7 }, { "epoch": 0.11764705882352941, "grad_norm": 1.7269912533351832, "learning_rate": 1.8823529411764708e-05, "loss": 0.5586, "step": 8 }, { "epoch": 0.1323529411764706, "grad_norm": 0.7251113424301481, "learning_rate": 2.1176470588235296e-05, "loss": 0.5208, "step": 9 }, { "epoch": 0.14705882352941177, "grad_norm": 1.1160352667226052, "learning_rate": 2.3529411764705884e-05, "loss": 0.5086, "step": 10 }, { "epoch": 0.16176470588235295, "grad_norm": 0.9015674485810162, "learning_rate": 2.5882352941176475e-05, "loss": 0.4903, "step": 11 }, { "epoch": 0.17647058823529413, "grad_norm": 0.7153137232697248, "learning_rate": 2.8235294117647063e-05, "loss": 0.4841, "step": 12 }, { "epoch": 0.19117647058823528, "grad_norm": 0.8088993429376143, "learning_rate": 3.0588235294117644e-05, "loss": 0.4711, "step": 13 }, { "epoch": 0.20588235294117646, "grad_norm": 0.5653053624299016, "learning_rate": 3.294117647058824e-05, "loss": 0.4567, "step": 14 }, { "epoch": 0.22058823529411764, "grad_norm": 0.5534586516972508, "learning_rate": 3.529411764705883e-05, "loss": 0.4522, "step": 15 }, { "epoch": 0.23529411764705882, "grad_norm": 0.5736228520847013, "learning_rate": 3.7647058823529415e-05, "loss": 0.4415, "step": 16 }, { "epoch": 0.25, "grad_norm": 0.4337208100637674, "learning_rate": 4e-05, "loss": 0.4327, "step": 17 }, { "epoch": 0.2647058823529412, "grad_norm": 0.4876347786470063, "learning_rate": 4.235294117647059e-05, "loss": 0.43, "step": 18 }, { "epoch": 0.27941176470588236, "grad_norm": 0.4061097022997626, "learning_rate": 4.470588235294118e-05, "loss": 0.4317, "step": 19 }, { "epoch": 0.29411764705882354, "grad_norm": 0.4103637458773689, "learning_rate": 4.705882352941177e-05, "loss": 0.43, "step": 20 }, { "epoch": 0.3088235294117647, "grad_norm": 0.3888995331551954, "learning_rate": 4.941176470588236e-05, "loss": 0.4199, "step": 21 }, { "epoch": 0.3235294117647059, "grad_norm": 0.3268371484433097, "learning_rate": 5.176470588235295e-05, "loss": 0.4185, "step": 22 }, { "epoch": 0.3382352941176471, "grad_norm": 0.39223698739269497, "learning_rate": 5.411764705882354e-05, "loss": 0.4128, "step": 23 }, { "epoch": 0.35294117647058826, "grad_norm": 0.2779761293219138, "learning_rate": 5.6470588235294126e-05, "loss": 0.4043, "step": 24 }, { "epoch": 0.36764705882352944, "grad_norm": 0.3989443078505564, "learning_rate": 5.8823529411764714e-05, "loss": 0.4089, "step": 25 }, { "epoch": 0.38235294117647056, "grad_norm": 0.38093200095401203, "learning_rate": 6.117647058823529e-05, "loss": 0.4017, "step": 26 }, { "epoch": 0.39705882352941174, "grad_norm": 0.4064320796173135, "learning_rate": 6.352941176470589e-05, "loss": 0.4011, "step": 27 }, { "epoch": 0.4117647058823529, "grad_norm": 0.4971422289820653, "learning_rate": 6.588235294117648e-05, "loss": 0.3989, "step": 28 }, { "epoch": 0.4264705882352941, "grad_norm": 0.7406572881589351, "learning_rate": 6.823529411764707e-05, "loss": 0.4051, "step": 29 }, { "epoch": 0.4411764705882353, "grad_norm": 0.8206793701783639, "learning_rate": 7.058823529411765e-05, "loss": 0.406, "step": 30 }, { "epoch": 0.45588235294117646, "grad_norm": 0.46999206766579715, "learning_rate": 7.294117647058824e-05, "loss": 0.3959, "step": 31 }, { "epoch": 0.47058823529411764, "grad_norm": 0.7832181937714577, "learning_rate": 7.529411764705883e-05, "loss": 0.3979, "step": 32 }, { "epoch": 0.4852941176470588, "grad_norm": 0.6729630572440819, "learning_rate": 7.764705882352942e-05, "loss": 0.4037, "step": 33 }, { "epoch": 0.5, "grad_norm": 0.5845843059576046, "learning_rate": 8e-05, "loss": 0.3981, "step": 34 }, { "epoch": 0.5147058823529411, "grad_norm": 0.5179039508771461, "learning_rate": 7.999789193948694e-05, "loss": 0.3899, "step": 35 }, { "epoch": 0.5294117647058824, "grad_norm": 0.8446247503414208, "learning_rate": 7.999156798014366e-05, "loss": 0.3919, "step": 36 }, { "epoch": 0.5441176470588235, "grad_norm": 0.5811147704566235, "learning_rate": 7.998102878853464e-05, "loss": 0.3842, "step": 37 }, { "epoch": 0.5588235294117647, "grad_norm": 0.7042141225204365, "learning_rate": 7.996627547552256e-05, "loss": 0.3887, "step": 38 }, { "epoch": 0.5735294117647058, "grad_norm": 0.5162453656112366, "learning_rate": 7.994730959615125e-05, "loss": 0.3801, "step": 39 }, { "epoch": 0.5882352941176471, "grad_norm": 0.6306069956051037, "learning_rate": 7.992413314948177e-05, "loss": 0.3872, "step": 40 }, { "epoch": 0.6029411764705882, "grad_norm": 0.4954620508512063, "learning_rate": 7.989674857838173e-05, "loss": 0.3799, "step": 41 }, { "epoch": 0.6176470588235294, "grad_norm": 0.4655956434769946, "learning_rate": 7.986515876926777e-05, "loss": 0.3795, "step": 42 }, { "epoch": 0.6323529411764706, "grad_norm": 0.4180599863638677, "learning_rate": 7.982936705180139e-05, "loss": 0.3797, "step": 43 }, { "epoch": 0.6470588235294118, "grad_norm": 0.3370827795593815, "learning_rate": 7.978937719853786e-05, "loss": 0.3713, "step": 44 }, { "epoch": 0.6617647058823529, "grad_norm": 0.302709681434872, "learning_rate": 7.974519342452872e-05, "loss": 0.37, "step": 45 }, { "epoch": 0.6764705882352942, "grad_norm": 0.35605679848702854, "learning_rate": 7.969682038687744e-05, "loss": 0.3706, "step": 46 }, { "epoch": 0.6911764705882353, "grad_norm": 0.3386095408702867, "learning_rate": 7.964426318424855e-05, "loss": 0.3717, "step": 47 }, { "epoch": 0.7058823529411765, "grad_norm": 0.3740443449230433, "learning_rate": 7.958752735633022e-05, "loss": 0.3678, "step": 48 }, { "epoch": 0.7205882352941176, "grad_norm": 0.291877051555131, "learning_rate": 7.952661888325038e-05, "loss": 0.3667, "step": 49 }, { "epoch": 0.7352941176470589, "grad_norm": 0.22798970572724098, "learning_rate": 7.946154418494639e-05, "loss": 0.3658, "step": 50 }, { "epoch": 0.75, "grad_norm": 0.3017794055631277, "learning_rate": 7.939231012048833e-05, "loss": 0.366, "step": 51 }, { "epoch": 0.7647058823529411, "grad_norm": 0.2183488887265105, "learning_rate": 7.931892398735608e-05, "loss": 0.364, "step": 52 }, { "epoch": 0.7794117647058824, "grad_norm": 0.2651745781496878, "learning_rate": 7.92413935206701e-05, "loss": 0.36, "step": 53 }, { "epoch": 0.7941176470588235, "grad_norm": 0.25030378975718287, "learning_rate": 7.915972689237618e-05, "loss": 0.3629, "step": 54 }, { "epoch": 0.8088235294117647, "grad_norm": 0.2531618636697317, "learning_rate": 7.907393271038403e-05, "loss": 0.3548, "step": 55 }, { "epoch": 0.8235294117647058, "grad_norm": 0.33260642234735066, "learning_rate": 7.898402001766002e-05, "loss": 0.364, "step": 56 }, { "epoch": 0.8382352941176471, "grad_norm": 0.41115101334491694, "learning_rate": 7.888999829127398e-05, "loss": 0.3578, "step": 57 }, { "epoch": 0.8529411764705882, "grad_norm": 0.44590061545961934, "learning_rate": 7.879187744140039e-05, "loss": 0.3607, "step": 58 }, { "epoch": 0.8676470588235294, "grad_norm": 0.522962751461851, "learning_rate": 7.868966781027367e-05, "loss": 0.3592, "step": 59 }, { "epoch": 0.8823529411764706, "grad_norm": 0.6601512591505266, "learning_rate": 7.858338017109822e-05, "loss": 0.3657, "step": 60 }, { "epoch": 0.8970588235294118, "grad_norm": 0.5825651454852345, "learning_rate": 7.847302572691277e-05, "loss": 0.3625, "step": 61 }, { "epoch": 0.9117647058823529, "grad_norm": 0.42428578770971376, "learning_rate": 7.835861610940965e-05, "loss": 0.3613, "step": 62 }, { "epoch": 0.9264705882352942, "grad_norm": 0.3717888932030909, "learning_rate": 7.824016337770872e-05, "loss": 0.3573, "step": 63 }, { "epoch": 0.9411764705882353, "grad_norm": 0.42447915248441553, "learning_rate": 7.811768001708627e-05, "loss": 0.362, "step": 64 }, { "epoch": 0.9558823529411765, "grad_norm": 0.3876778292466403, "learning_rate": 7.799117893765913e-05, "loss": 0.3557, "step": 65 }, { "epoch": 0.9705882352941176, "grad_norm": 0.26608755562026015, "learning_rate": 7.786067347302379e-05, "loss": 0.3545, "step": 66 }, { "epoch": 0.9852941176470589, "grad_norm": 0.3434111610187913, "learning_rate": 7.77261773788511e-05, "loss": 0.353, "step": 67 }, { "epoch": 1.0, "grad_norm": 0.3497537599784584, "learning_rate": 7.758770483143634e-05, "loss": 0.3545, "step": 68 }, { "epoch": 1.0147058823529411, "grad_norm": 0.3391085192527439, "learning_rate": 7.744527042620496e-05, "loss": 0.3419, "step": 69 }, { "epoch": 1.0294117647058822, "grad_norm": 0.5077627087325893, "learning_rate": 7.729888917617424e-05, "loss": 0.3471, "step": 70 }, { "epoch": 1.0441176470588236, "grad_norm": 0.5532598557050799, "learning_rate": 7.714857651037081e-05, "loss": 0.3412, "step": 71 }, { "epoch": 1.0588235294117647, "grad_norm": 0.5780960309028261, "learning_rate": 7.699434827220448e-05, "loss": 0.3428, "step": 72 }, { "epoch": 1.0735294117647058, "grad_norm": 0.49974865026294507, "learning_rate": 7.683622071779816e-05, "loss": 0.3364, "step": 73 }, { "epoch": 1.088235294117647, "grad_norm": 0.36747177299557693, "learning_rate": 7.667421051427453e-05, "loss": 0.3389, "step": 74 }, { "epoch": 1.1029411764705883, "grad_norm": 0.3669907335719105, "learning_rate": 7.650833473799922e-05, "loss": 0.3312, "step": 75 }, { "epoch": 1.1176470588235294, "grad_norm": 0.3922235437218937, "learning_rate": 7.633861087278095e-05, "loss": 0.3293, "step": 76 }, { "epoch": 1.1323529411764706, "grad_norm": 0.3719011690300337, "learning_rate": 7.616505680802863e-05, "loss": 0.3375, "step": 77 }, { "epoch": 1.1470588235294117, "grad_norm": 0.3377098123124342, "learning_rate": 7.598769083686582e-05, "loss": 0.3405, "step": 78 }, { "epoch": 1.161764705882353, "grad_norm": 0.29899552501628507, "learning_rate": 7.58065316542025e-05, "loss": 0.3351, "step": 79 }, { "epoch": 1.1764705882352942, "grad_norm": 0.32828082894171606, "learning_rate": 7.562159835476466e-05, "loss": 0.3389, "step": 80 }, { "epoch": 1.1911764705882353, "grad_norm": 0.3445874209931112, "learning_rate": 7.543291043108159e-05, "loss": 0.3378, "step": 81 }, { "epoch": 1.2058823529411764, "grad_norm": 0.3023978628206444, "learning_rate": 7.524048777143139e-05, "loss": 0.3387, "step": 82 }, { "epoch": 1.2205882352941178, "grad_norm": 0.25558483521205383, "learning_rate": 7.504435065774455e-05, "loss": 0.3306, "step": 83 }, { "epoch": 1.2352941176470589, "grad_norm": 0.32953601128265847, "learning_rate": 7.48445197634663e-05, "loss": 0.3349, "step": 84 }, { "epoch": 1.25, "grad_norm": 0.33756326919937646, "learning_rate": 7.464101615137756e-05, "loss": 0.3324, "step": 85 }, { "epoch": 1.2647058823529411, "grad_norm": 0.24120088917587124, "learning_rate": 7.443386127137472e-05, "loss": 0.3317, "step": 86 }, { "epoch": 1.2794117647058822, "grad_norm": 0.2914122682482861, "learning_rate": 7.422307695820893e-05, "loss": 0.3346, "step": 87 }, { "epoch": 1.2941176470588236, "grad_norm": 0.32020365300061576, "learning_rate": 7.400868542918457e-05, "loss": 0.3303, "step": 88 }, { "epoch": 1.3088235294117647, "grad_norm": 0.2516025057415832, "learning_rate": 7.379070928181747e-05, "loss": 0.3351, "step": 89 }, { "epoch": 1.3235294117647058, "grad_norm": 0.25001986331755705, "learning_rate": 7.356917149145308e-05, "loss": 0.3353, "step": 90 }, { "epoch": 1.3382352941176472, "grad_norm": 0.2792197235528977, "learning_rate": 7.334409540884479e-05, "loss": 0.3294, "step": 91 }, { "epoch": 1.3529411764705883, "grad_norm": 0.28078783596981044, "learning_rate": 7.311550475769272e-05, "loss": 0.3248, "step": 92 }, { "epoch": 1.3676470588235294, "grad_norm": 0.37967936203746205, "learning_rate": 7.288342363214313e-05, "loss": 0.3328, "step": 93 }, { "epoch": 1.3823529411764706, "grad_norm": 0.5149190442137033, "learning_rate": 7.264787649424888e-05, "loss": 0.3312, "step": 94 }, { "epoch": 1.3970588235294117, "grad_norm": 0.6122843221799549, "learning_rate": 7.240888817139094e-05, "loss": 0.3348, "step": 95 }, { "epoch": 1.4117647058823528, "grad_norm": 0.5886981731078635, "learning_rate": 7.216648385366167e-05, "loss": 0.3395, "step": 96 }, { "epoch": 1.4264705882352942, "grad_norm": 0.34409848388448383, "learning_rate": 7.192068909120959e-05, "loss": 0.3306, "step": 97 }, { "epoch": 1.4411764705882353, "grad_norm": 0.2978091494500524, "learning_rate": 7.167152979154634e-05, "loss": 0.3334, "step": 98 }, { "epoch": 1.4558823529411764, "grad_norm": 0.4400729737039225, "learning_rate": 7.141903221681595e-05, "loss": 0.3404, "step": 99 }, { "epoch": 1.4705882352941178, "grad_norm": 0.3694283042397561, "learning_rate": 7.116322298102681e-05, "loss": 0.3332, "step": 100 }, { "epoch": 1.4852941176470589, "grad_norm": 0.2639067391873521, "learning_rate": 7.090412904724636e-05, "loss": 0.3313, "step": 101 }, { "epoch": 1.5, "grad_norm": 0.37416820779021187, "learning_rate": 7.064177772475912e-05, "loss": 0.3285, "step": 102 }, { "epoch": 1.5147058823529411, "grad_norm": 0.33275488946271775, "learning_rate": 7.037619666618829e-05, "loss": 0.3361, "step": 103 }, { "epoch": 1.5294117647058822, "grad_norm": 0.23894606263596457, "learning_rate": 7.010741386458099e-05, "loss": 0.3388, "step": 104 }, { "epoch": 1.5441176470588234, "grad_norm": 0.3668325550727754, "learning_rate": 6.983545765045774e-05, "loss": 0.3311, "step": 105 }, { "epoch": 1.5588235294117647, "grad_norm": 0.2399104316842811, "learning_rate": 6.956035668882637e-05, "loss": 0.3297, "step": 106 }, { "epoch": 1.5735294117647058, "grad_norm": 0.2357871283562126, "learning_rate": 6.928213997616059e-05, "loss": 0.3318, "step": 107 }, { "epoch": 1.5882352941176472, "grad_norm": 0.2849131707817035, "learning_rate": 6.900083683734372e-05, "loss": 0.3304, "step": 108 }, { "epoch": 1.6029411764705883, "grad_norm": 0.17423798370987492, "learning_rate": 6.871647692257768e-05, "loss": 0.3276, "step": 109 }, { "epoch": 1.6176470588235294, "grad_norm": 0.2548925157793133, "learning_rate": 6.842909020425789e-05, "loss": 0.334, "step": 110 }, { "epoch": 1.6323529411764706, "grad_norm": 0.24942793454876405, "learning_rate": 6.8138706973814e-05, "loss": 0.3286, "step": 111 }, { "epoch": 1.6470588235294117, "grad_norm": 0.18689295644396056, "learning_rate": 6.784535783851708e-05, "loss": 0.3266, "step": 112 }, { "epoch": 1.6617647058823528, "grad_norm": 0.2911151755362797, "learning_rate": 6.754907371825355e-05, "loss": 0.3262, "step": 113 }, { "epoch": 1.6764705882352942, "grad_norm": 0.27837463790987516, "learning_rate": 6.724988584226616e-05, "loss": 0.3279, "step": 114 }, { "epoch": 1.6911764705882353, "grad_norm": 0.28908336337027807, "learning_rate": 6.69478257458623e-05, "loss": 0.3281, "step": 115 }, { "epoch": 1.7058823529411766, "grad_norm": 0.34403162565227147, "learning_rate": 6.664292526709001e-05, "loss": 0.3313, "step": 116 }, { "epoch": 1.7205882352941178, "grad_norm": 0.29186151217355694, "learning_rate": 6.633521654338233e-05, "loss": 0.3334, "step": 117 }, { "epoch": 1.7352941176470589, "grad_norm": 0.3213370329801203, "learning_rate": 6.602473200816969e-05, "loss": 0.3267, "step": 118 }, { "epoch": 1.75, "grad_norm": 0.30160846132752556, "learning_rate": 6.571150438746157e-05, "loss": 0.3242, "step": 119 }, { "epoch": 1.7647058823529411, "grad_norm": 0.26892209589390914, "learning_rate": 6.539556669639691e-05, "loss": 0.3244, "step": 120 }, { "epoch": 1.7794117647058822, "grad_norm": 0.2440948850515908, "learning_rate": 6.507695223576428e-05, "loss": 0.3229, "step": 121 }, { "epoch": 1.7941176470588234, "grad_norm": 0.20672290158202897, "learning_rate": 6.475569458849178e-05, "loss": 0.331, "step": 122 }, { "epoch": 1.8088235294117647, "grad_norm": 0.2459670080944092, "learning_rate": 6.443182761610752e-05, "loss": 0.3321, "step": 123 }, { "epoch": 1.8235294117647058, "grad_norm": 0.2704407966484609, "learning_rate": 6.410538545517026e-05, "loss": 0.3288, "step": 124 }, { "epoch": 1.8382352941176472, "grad_norm": 0.23662938749712678, "learning_rate": 6.377640251367148e-05, "loss": 0.3285, "step": 125 }, { "epoch": 1.8529411764705883, "grad_norm": 0.23508733004486604, "learning_rate": 6.344491346740859e-05, "loss": 0.3265, "step": 126 }, { "epoch": 1.8676470588235294, "grad_norm": 0.15905797582955938, "learning_rate": 6.311095325633006e-05, "loss": 0.3287, "step": 127 }, { "epoch": 1.8823529411764706, "grad_norm": 0.16194725388549308, "learning_rate": 6.277455708085255e-05, "loss": 0.3193, "step": 128 }, { "epoch": 1.8970588235294117, "grad_norm": 0.19431781452308503, "learning_rate": 6.24357603981508e-05, "loss": 0.3218, "step": 129 }, { "epoch": 1.9117647058823528, "grad_norm": 0.1530375931457355, "learning_rate": 6.209459891842023e-05, "loss": 0.3232, "step": 130 }, { "epoch": 1.9264705882352942, "grad_norm": 0.20552563428946305, "learning_rate": 6.175110860111307e-05, "loss": 0.3291, "step": 131 }, { "epoch": 1.9411764705882353, "grad_norm": 0.16800259989430927, "learning_rate": 6.140532565114801e-05, "loss": 0.3255, "step": 132 }, { "epoch": 1.9558823529411766, "grad_norm": 0.15539283189220082, "learning_rate": 6.105728651509424e-05, "loss": 0.3254, "step": 133 }, { "epoch": 1.9705882352941178, "grad_norm": 0.16103547056804585, "learning_rate": 6.070702787732971e-05, "loss": 0.3249, "step": 134 }, { "epoch": 1.9852941176470589, "grad_norm": 0.16239367515131709, "learning_rate": 6.0354586656174606e-05, "loss": 0.3288, "step": 135 }, { "epoch": 2.0, "grad_norm": 0.20166055674169706, "learning_rate": 6.000000000000001e-05, "loss": 0.3099, "step": 136 }, { "epoch": 2.014705882352941, "grad_norm": 0.24260127096827225, "learning_rate": 5.964330528331234e-05, "loss": 0.3056, "step": 137 }, { "epoch": 2.0294117647058822, "grad_norm": 0.33127332158017103, "learning_rate": 5.9284540102813964e-05, "loss": 0.3, "step": 138 }, { "epoch": 2.0441176470588234, "grad_norm": 0.43353541229148657, "learning_rate": 5.892374227344041e-05, "loss": 0.308, "step": 139 }, { "epoch": 2.0588235294117645, "grad_norm": 0.49315921894412995, "learning_rate": 5.856094982437454e-05, "loss": 0.3067, "step": 140 }, { "epoch": 2.073529411764706, "grad_norm": 0.450512729936066, "learning_rate": 5.819620099503818e-05, "loss": 0.3101, "step": 141 }, { "epoch": 2.088235294117647, "grad_norm": 0.343920919271517, "learning_rate": 5.782953423106154e-05, "loss": 0.3046, "step": 142 }, { "epoch": 2.1029411764705883, "grad_norm": 0.2814078808698335, "learning_rate": 5.746098818023093e-05, "loss": 0.2988, "step": 143 }, { "epoch": 2.1176470588235294, "grad_norm": 0.3529797938787429, "learning_rate": 5.709060168841524e-05, "loss": 0.3033, "step": 144 }, { "epoch": 2.1323529411764706, "grad_norm": 0.33042737999638, "learning_rate": 5.6718413795471346e-05, "loss": 0.3028, "step": 145 }, { "epoch": 2.1470588235294117, "grad_norm": 0.16519735683489525, "learning_rate": 5.634446373112926e-05, "loss": 0.3035, "step": 146 }, { "epoch": 2.161764705882353, "grad_norm": 0.23206099454265375, "learning_rate": 5.596879091085724e-05, "loss": 0.3001, "step": 147 }, { "epoch": 2.176470588235294, "grad_norm": 0.2771142062084271, "learning_rate": 5.5591434931707176e-05, "loss": 0.3005, "step": 148 }, { "epoch": 2.1911764705882355, "grad_norm": 0.2431821354495366, "learning_rate": 5.5212435568141036e-05, "loss": 0.3045, "step": 149 }, { "epoch": 2.2058823529411766, "grad_norm": 0.18676533826823546, "learning_rate": 5.4831832767838436e-05, "loss": 0.2993, "step": 150 }, { "epoch": 2.2205882352941178, "grad_norm": 0.21179962464182328, "learning_rate": 5.444966664748613e-05, "loss": 0.2967, "step": 151 }, { "epoch": 2.235294117647059, "grad_norm": 0.2277766014598405, "learning_rate": 5.406597748854947e-05, "loss": 0.2993, "step": 152 }, { "epoch": 2.25, "grad_norm": 0.19342351829324006, "learning_rate": 5.368080573302676e-05, "loss": 0.3044, "step": 153 }, { "epoch": 2.264705882352941, "grad_norm": 0.21064512781524572, "learning_rate": 5.329419197918639e-05, "loss": 0.3062, "step": 154 }, { "epoch": 2.2794117647058822, "grad_norm": 0.15702816077320908, "learning_rate": 5.29061769772878e-05, "loss": 0.2995, "step": 155 }, { "epoch": 2.2941176470588234, "grad_norm": 0.17947407344934332, "learning_rate": 5.251680162528618e-05, "loss": 0.3013, "step": 156 }, { "epoch": 2.3088235294117645, "grad_norm": 0.18466536636530687, "learning_rate": 5.212610696452174e-05, "loss": 0.3036, "step": 157 }, { "epoch": 2.323529411764706, "grad_norm": 0.1680929482668945, "learning_rate": 5.173413417539385e-05, "loss": 0.3029, "step": 158 }, { "epoch": 2.338235294117647, "grad_norm": 0.1654731382812225, "learning_rate": 5.134092457302044e-05, "loss": 0.3024, "step": 159 }, { "epoch": 2.3529411764705883, "grad_norm": 0.15258036230353525, "learning_rate": 5.0946519602883326e-05, "loss": 0.3037, "step": 160 }, { "epoch": 2.3676470588235294, "grad_norm": 0.1491955575134207, "learning_rate": 5.0550960836459674e-05, "loss": 0.3044, "step": 161 }, { "epoch": 2.3823529411764706, "grad_norm": 0.14545439455177017, "learning_rate": 5.0154289966840315e-05, "loss": 0.2954, "step": 162 }, { "epoch": 2.3970588235294117, "grad_norm": 0.13499905174566654, "learning_rate": 4.975654880433509e-05, "loss": 0.2991, "step": 163 }, { "epoch": 2.411764705882353, "grad_norm": 0.13529530394095032, "learning_rate": 4.935777927206595e-05, "loss": 0.301, "step": 164 }, { "epoch": 2.426470588235294, "grad_norm": 0.14447147193835977, "learning_rate": 4.895802340154813e-05, "loss": 0.3038, "step": 165 }, { "epoch": 2.4411764705882355, "grad_norm": 0.1368779663418461, "learning_rate": 4.85573233282599e-05, "loss": 0.3022, "step": 166 }, { "epoch": 2.4558823529411766, "grad_norm": 0.13420641137376216, "learning_rate": 4.815572128720138e-05, "loss": 0.3049, "step": 167 }, { "epoch": 2.4705882352941178, "grad_norm": 0.12163815573974876, "learning_rate": 4.7753259608442804e-05, "loss": 0.2998, "step": 168 }, { "epoch": 2.485294117647059, "grad_norm": 0.15356753682422525, "learning_rate": 4.734998071266282e-05, "loss": 0.298, "step": 169 }, { "epoch": 2.5, "grad_norm": 0.11474062387695687, "learning_rate": 4.694592710667723e-05, "loss": 0.3068, "step": 170 }, { "epoch": 2.514705882352941, "grad_norm": 0.15293267687494932, "learning_rate": 4.65411413789586e-05, "loss": 0.3005, "step": 171 }, { "epoch": 2.5294117647058822, "grad_norm": 0.13885259280209167, "learning_rate": 4.6135666195147426e-05, "loss": 0.2994, "step": 172 }, { "epoch": 2.5441176470588234, "grad_norm": 0.11212797930990655, "learning_rate": 4.572954429355487e-05, "loss": 0.3026, "step": 173 }, { "epoch": 2.5588235294117645, "grad_norm": 0.1441373090862667, "learning_rate": 4.532281848065816e-05, "loss": 0.3014, "step": 174 }, { "epoch": 2.5735294117647056, "grad_norm": 0.1438877611890079, "learning_rate": 4.491553162658857e-05, "loss": 0.3044, "step": 175 }, { "epoch": 2.588235294117647, "grad_norm": 0.12603413712308525, "learning_rate": 4.450772666061285e-05, "loss": 0.301, "step": 176 }, { "epoch": 2.6029411764705883, "grad_norm": 0.12633770238098366, "learning_rate": 4.409944656660828e-05, "loss": 0.2965, "step": 177 }, { "epoch": 2.6176470588235294, "grad_norm": 0.13789374969079662, "learning_rate": 4.369073437853208e-05, "loss": 0.3009, "step": 178 }, { "epoch": 2.6323529411764706, "grad_norm": 0.1402520516162208, "learning_rate": 4.328163317588552e-05, "loss": 0.298, "step": 179 }, { "epoch": 2.6470588235294117, "grad_norm": 0.09904937633936901, "learning_rate": 4.2872186079173106e-05, "loss": 0.3013, "step": 180 }, { "epoch": 2.661764705882353, "grad_norm": 0.1312793347901254, "learning_rate": 4.2462436245357724e-05, "loss": 0.3, "step": 181 }, { "epoch": 2.6764705882352944, "grad_norm": 0.09621059323368814, "learning_rate": 4.205242686331159e-05, "loss": 0.3029, "step": 182 }, { "epoch": 2.6911764705882355, "grad_norm": 0.10807537972219632, "learning_rate": 4.164220114926414e-05, "loss": 0.2978, "step": 183 }, { "epoch": 2.7058823529411766, "grad_norm": 0.10722934994246927, "learning_rate": 4.123180234224682e-05, "loss": 0.2998, "step": 184 }, { "epoch": 2.7205882352941178, "grad_norm": 0.1064704662992324, "learning_rate": 4.0821273699535625e-05, "loss": 0.3013, "step": 185 }, { "epoch": 2.735294117647059, "grad_norm": 0.10969792661155021, "learning_rate": 4.04106584920916e-05, "loss": 0.3006, "step": 186 }, { "epoch": 2.75, "grad_norm": 0.12146189911454472, "learning_rate": 4e-05, "loss": 0.3031, "step": 187 }, { "epoch": 2.764705882352941, "grad_norm": 0.11305408445478238, "learning_rate": 3.9589341507908415e-05, "loss": 0.3037, "step": 188 }, { "epoch": 2.7794117647058822, "grad_norm": 0.13033799624199852, "learning_rate": 3.917872630046439e-05, "loss": 0.3032, "step": 189 }, { "epoch": 2.7941176470588234, "grad_norm": 0.10937208565515465, "learning_rate": 3.8768197657753194e-05, "loss": 0.3035, "step": 190 }, { "epoch": 2.8088235294117645, "grad_norm": 0.11386446489403948, "learning_rate": 3.835779885073588e-05, "loss": 0.2985, "step": 191 }, { "epoch": 2.8235294117647056, "grad_norm": 0.10573108090134906, "learning_rate": 3.794757313668841e-05, "loss": 0.3025, "step": 192 }, { "epoch": 2.838235294117647, "grad_norm": 0.09522893522850687, "learning_rate": 3.753756375464229e-05, "loss": 0.2964, "step": 193 }, { "epoch": 2.8529411764705883, "grad_norm": 0.10763504421587497, "learning_rate": 3.71278139208269e-05, "loss": 0.2983, "step": 194 }, { "epoch": 2.8676470588235294, "grad_norm": 0.09673967344497875, "learning_rate": 3.67183668241145e-05, "loss": 0.303, "step": 195 }, { "epoch": 2.8823529411764706, "grad_norm": 0.11019580337657149, "learning_rate": 3.630926562146792e-05, "loss": 0.2977, "step": 196 }, { "epoch": 2.8970588235294117, "grad_norm": 0.09457792059050557, "learning_rate": 3.5900553433391724e-05, "loss": 0.2987, "step": 197 }, { "epoch": 2.911764705882353, "grad_norm": 0.08738325053473636, "learning_rate": 3.549227333938716e-05, "loss": 0.2986, "step": 198 }, { "epoch": 2.9264705882352944, "grad_norm": 0.09681306300823637, "learning_rate": 3.5084468373411444e-05, "loss": 0.3013, "step": 199 }, { "epoch": 2.9411764705882355, "grad_norm": 0.09249395424589109, "learning_rate": 3.467718151934187e-05, "loss": 0.2972, "step": 200 }, { "epoch": 2.9558823529411766, "grad_norm": 0.09703820251292773, "learning_rate": 3.427045570644515e-05, "loss": 0.2979, "step": 201 }, { "epoch": 2.9705882352941178, "grad_norm": 0.10540258782003765, "learning_rate": 3.386433380485258e-05, "loss": 0.2992, "step": 202 }, { "epoch": 2.985294117647059, "grad_norm": 0.08015062850340511, "learning_rate": 3.34588586210414e-05, "loss": 0.3011, "step": 203 }, { "epoch": 3.0, "grad_norm": 0.14706722768632058, "learning_rate": 3.305407289332279e-05, "loss": 0.2806, "step": 204 }, { "epoch": 3.014705882352941, "grad_norm": 0.10603496844146229, "learning_rate": 3.2650019287337184e-05, "loss": 0.2799, "step": 205 }, { "epoch": 3.0294117647058822, "grad_norm": 0.15530639149269276, "learning_rate": 3.22467403915572e-05, "loss": 0.2781, "step": 206 }, { "epoch": 3.0441176470588234, "grad_norm": 0.1421223150968687, "learning_rate": 3.184427871279863e-05, "loss": 0.282, "step": 207 }, { "epoch": 3.0588235294117645, "grad_norm": 0.14419949652005803, "learning_rate": 3.144267667174011e-05, "loss": 0.2776, "step": 208 }, { "epoch": 3.073529411764706, "grad_norm": 0.12994406412161302, "learning_rate": 3.1041976598451884e-05, "loss": 0.2781, "step": 209 }, { "epoch": 3.088235294117647, "grad_norm": 0.14614171687593527, "learning_rate": 3.064222072793407e-05, "loss": 0.2765, "step": 210 }, { "epoch": 3.1029411764705883, "grad_norm": 0.12016869114374024, "learning_rate": 3.0243451195664914e-05, "loss": 0.2783, "step": 211 }, { "epoch": 3.1176470588235294, "grad_norm": 0.1310851029344073, "learning_rate": 2.984571003315969e-05, "loss": 0.2781, "step": 212 }, { "epoch": 3.1323529411764706, "grad_norm": 0.11922587006224326, "learning_rate": 2.944903916354032e-05, "loss": 0.2795, "step": 213 }, { "epoch": 3.1470588235294117, "grad_norm": 0.12710795880675843, "learning_rate": 2.905348039711669e-05, "loss": 0.2784, "step": 214 }, { "epoch": 3.161764705882353, "grad_norm": 0.11445734817339984, "learning_rate": 2.865907542697957e-05, "loss": 0.2758, "step": 215 }, { "epoch": 3.176470588235294, "grad_norm": 0.1300689688100464, "learning_rate": 2.8265865824606165e-05, "loss": 0.2758, "step": 216 }, { "epoch": 3.1911764705882355, "grad_norm": 0.09831374725437492, "learning_rate": 2.7873893035478265e-05, "loss": 0.2748, "step": 217 }, { "epoch": 3.2058823529411766, "grad_norm": 0.1272913228663667, "learning_rate": 2.7483198374713836e-05, "loss": 0.2746, "step": 218 }, { "epoch": 3.2205882352941178, "grad_norm": 0.0993483978526809, "learning_rate": 2.7093823022712217e-05, "loss": 0.2739, "step": 219 }, { "epoch": 3.235294117647059, "grad_norm": 0.09886695511684795, "learning_rate": 2.6705808020813622e-05, "loss": 0.2832, "step": 220 }, { "epoch": 3.25, "grad_norm": 0.0998986520318052, "learning_rate": 2.6319194266973256e-05, "loss": 0.2743, "step": 221 }, { "epoch": 3.264705882352941, "grad_norm": 0.0964816428580283, "learning_rate": 2.5934022511450528e-05, "loss": 0.2762, "step": 222 }, { "epoch": 3.2794117647058822, "grad_norm": 0.09255980694012796, "learning_rate": 2.5550333352513885e-05, "loss": 0.2782, "step": 223 }, { "epoch": 3.2941176470588234, "grad_norm": 0.0988289109757557, "learning_rate": 2.5168167232161574e-05, "loss": 0.2748, "step": 224 }, { "epoch": 3.3088235294117645, "grad_norm": 0.0884234456508421, "learning_rate": 2.4787564431858977e-05, "loss": 0.2753, "step": 225 }, { "epoch": 3.323529411764706, "grad_norm": 0.09774336930093334, "learning_rate": 2.4408565068292827e-05, "loss": 0.2751, "step": 226 }, { "epoch": 3.338235294117647, "grad_norm": 0.0883002824327121, "learning_rate": 2.4031209089142773e-05, "loss": 0.2773, "step": 227 }, { "epoch": 3.3529411764705883, "grad_norm": 0.09215211229010013, "learning_rate": 2.3655536268870744e-05, "loss": 0.2752, "step": 228 }, { "epoch": 3.3676470588235294, "grad_norm": 0.08094179064778044, "learning_rate": 2.328158620452868e-05, "loss": 0.2729, "step": 229 }, { "epoch": 3.3823529411764706, "grad_norm": 0.09692135730954927, "learning_rate": 2.2909398311584775e-05, "loss": 0.2731, "step": 230 }, { "epoch": 3.3970588235294117, "grad_norm": 0.0813183018452283, "learning_rate": 2.2539011819769056e-05, "loss": 0.2782, "step": 231 }, { "epoch": 3.411764705882353, "grad_norm": 0.08149910156743696, "learning_rate": 2.2170465768938473e-05, "loss": 0.275, "step": 232 }, { "epoch": 3.426470588235294, "grad_norm": 0.08503586167349093, "learning_rate": 2.1803799004961824e-05, "loss": 0.2766, "step": 233 }, { "epoch": 3.4411764705882355, "grad_norm": 0.07599564665486494, "learning_rate": 2.1439050175625474e-05, "loss": 0.2759, "step": 234 }, { "epoch": 3.4558823529411766, "grad_norm": 0.08451424035355402, "learning_rate": 2.1076257726559603e-05, "loss": 0.2795, "step": 235 }, { "epoch": 3.4705882352941178, "grad_norm": 0.07467914550010295, "learning_rate": 2.0715459897186046e-05, "loss": 0.2767, "step": 236 }, { "epoch": 3.485294117647059, "grad_norm": 0.09085541610657201, "learning_rate": 2.0356694716687687e-05, "loss": 0.2785, "step": 237 }, { "epoch": 3.5, "grad_norm": 0.0734615712405951, "learning_rate": 2.0000000000000012e-05, "loss": 0.2739, "step": 238 }, { "epoch": 3.514705882352941, "grad_norm": 0.07717163991035296, "learning_rate": 1.964541334382541e-05, "loss": 0.2729, "step": 239 }, { "epoch": 3.5294117647058822, "grad_norm": 0.07496301816518236, "learning_rate": 1.9292972122670303e-05, "loss": 0.2752, "step": 240 }, { "epoch": 3.5441176470588234, "grad_norm": 0.08080347359960849, "learning_rate": 1.8942713484905762e-05, "loss": 0.2801, "step": 241 }, { "epoch": 3.5588235294117645, "grad_norm": 0.0738257225960406, "learning_rate": 1.8594674348851992e-05, "loss": 0.2767, "step": 242 }, { "epoch": 3.5735294117647056, "grad_norm": 0.07389506773645138, "learning_rate": 1.824889139888694e-05, "loss": 0.2773, "step": 243 }, { "epoch": 3.588235294117647, "grad_norm": 0.070294801390775, "learning_rate": 1.790540108157977e-05, "loss": 0.2763, "step": 244 }, { "epoch": 3.6029411764705883, "grad_norm": 0.068704655766595, "learning_rate": 1.756423960184922e-05, "loss": 0.2781, "step": 245 }, { "epoch": 3.6176470588235294, "grad_norm": 0.06594242125553404, "learning_rate": 1.7225442919147467e-05, "loss": 0.2757, "step": 246 }, { "epoch": 3.6323529411764706, "grad_norm": 0.06969707402390993, "learning_rate": 1.6889046743669957e-05, "loss": 0.2776, "step": 247 }, { "epoch": 3.6470588235294117, "grad_norm": 0.06492677835336866, "learning_rate": 1.6555086532591425e-05, "loss": 0.2781, "step": 248 }, { "epoch": 3.661764705882353, "grad_norm": 0.06331911499148075, "learning_rate": 1.6223597486328534e-05, "loss": 0.279, "step": 249 }, { "epoch": 3.6764705882352944, "grad_norm": 0.06722098051730128, "learning_rate": 1.589461454482975e-05, "loss": 0.2802, "step": 250 }, { "epoch": 3.6911764705882355, "grad_norm": 0.06266474278629547, "learning_rate": 1.556817238389249e-05, "loss": 0.2781, "step": 251 }, { "epoch": 3.7058823529411766, "grad_norm": 0.07000155789305507, "learning_rate": 1.5244305411508217e-05, "loss": 0.278, "step": 252 }, { "epoch": 3.7205882352941178, "grad_norm": 0.061574644987438566, "learning_rate": 1.4923047764235752e-05, "loss": 0.2767, "step": 253 }, { "epoch": 3.735294117647059, "grad_norm": 0.0682684963649869, "learning_rate": 1.4604433303603092e-05, "loss": 0.2732, "step": 254 }, { "epoch": 3.75, "grad_norm": 0.0642818898133848, "learning_rate": 1.4288495612538427e-05, "loss": 0.2743, "step": 255 }, { "epoch": 3.764705882352941, "grad_norm": 0.06713851434875447, "learning_rate": 1.3975267991830327e-05, "loss": 0.2817, "step": 256 }, { "epoch": 3.7794117647058822, "grad_norm": 0.06828899506125703, "learning_rate": 1.3664783456617703e-05, "loss": 0.2725, "step": 257 }, { "epoch": 3.7941176470588234, "grad_norm": 0.07069453573008218, "learning_rate": 1.3357074732909996e-05, "loss": 0.2775, "step": 258 }, { "epoch": 3.8088235294117645, "grad_norm": 0.06390051742059577, "learning_rate": 1.3052174254137713e-05, "loss": 0.2771, "step": 259 }, { "epoch": 3.8235294117647056, "grad_norm": 0.06129666220940109, "learning_rate": 1.275011415773383e-05, "loss": 0.2789, "step": 260 }, { "epoch": 3.838235294117647, "grad_norm": 0.065817213580326, "learning_rate": 1.2450926281746458e-05, "loss": 0.274, "step": 261 }, { "epoch": 3.8529411764705883, "grad_norm": 0.06519904576862424, "learning_rate": 1.2154642161482939e-05, "loss": 0.2771, "step": 262 }, { "epoch": 3.8676470588235294, "grad_norm": 0.06313901880567484, "learning_rate": 1.1861293026186007e-05, "loss": 0.2754, "step": 263 }, { "epoch": 3.8823529411764706, "grad_norm": 0.06296057703800705, "learning_rate": 1.1570909795742118e-05, "loss": 0.2732, "step": 264 }, { "epoch": 3.8970588235294117, "grad_norm": 0.06544715501817833, "learning_rate": 1.1283523077422327e-05, "loss": 0.2797, "step": 265 }, { "epoch": 3.911764705882353, "grad_norm": 0.0650053871447722, "learning_rate": 1.0999163162656296e-05, "loss": 0.279, "step": 266 }, { "epoch": 3.9264705882352944, "grad_norm": 0.06600064138970108, "learning_rate": 1.0717860023839424e-05, "loss": 0.276, "step": 267 }, { "epoch": 3.9411764705882355, "grad_norm": 0.06210716465473454, "learning_rate": 1.0439643311173642e-05, "loss": 0.2768, "step": 268 }, { "epoch": 3.9558823529411766, "grad_norm": 0.06401894182204264, "learning_rate": 1.0164542349542273e-05, "loss": 0.2788, "step": 269 }, { "epoch": 3.9705882352941178, "grad_norm": 0.06372281222639016, "learning_rate": 9.892586135419022e-06, "loss": 0.2777, "step": 270 }, { "epoch": 3.985294117647059, "grad_norm": 0.05892561579737352, "learning_rate": 9.623803333811713e-06, "loss": 0.2771, "step": 271 }, { "epoch": 4.0, "grad_norm": 0.12086246389399692, "learning_rate": 9.358222275240884e-06, "loss": 0.2599, "step": 272 }, { "epoch": 4.014705882352941, "grad_norm": 0.08602389508274347, "learning_rate": 9.095870952753647e-06, "loss": 0.2593, "step": 273 }, { "epoch": 4.029411764705882, "grad_norm": 0.07197077150088015, "learning_rate": 8.83677701897318e-06, "loss": 0.262, "step": 274 }, { "epoch": 4.044117647058823, "grad_norm": 0.0968580360252349, "learning_rate": 8.580967783184055e-06, "loss": 0.261, "step": 275 }, { "epoch": 4.0588235294117645, "grad_norm": 0.09014393537599821, "learning_rate": 8.328470208453683e-06, "loss": 0.2622, "step": 276 }, { "epoch": 4.073529411764706, "grad_norm": 0.0789945543394142, "learning_rate": 8.07931090879042e-06, "loss": 0.2577, "step": 277 }, { "epoch": 4.088235294117647, "grad_norm": 0.08815856516590073, "learning_rate": 7.833516146338329e-06, "loss": 0.2617, "step": 278 }, { "epoch": 4.102941176470588, "grad_norm": 0.08015266822256034, "learning_rate": 7.591111828609059e-06, "loss": 0.2641, "step": 279 }, { "epoch": 4.117647058823529, "grad_norm": 0.08154629673882077, "learning_rate": 7.3521235057511364e-06, "loss": 0.2638, "step": 280 }, { "epoch": 4.132352941176471, "grad_norm": 0.08290035992126818, "learning_rate": 7.116576367856871e-06, "loss": 0.2606, "step": 281 }, { "epoch": 4.147058823529412, "grad_norm": 0.07674741527367407, "learning_rate": 6.884495242307285e-06, "loss": 0.2613, "step": 282 }, { "epoch": 4.161764705882353, "grad_norm": 0.07292754427620451, "learning_rate": 6.655904591155224e-06, "loss": 0.2618, "step": 283 }, { "epoch": 4.176470588235294, "grad_norm": 0.0736899911159241, "learning_rate": 6.430828508546936e-06, "loss": 0.2637, "step": 284 }, { "epoch": 4.1911764705882355, "grad_norm": 0.07217050587422956, "learning_rate": 6.209290718182539e-06, "loss": 0.2615, "step": 285 }, { "epoch": 4.205882352941177, "grad_norm": 0.07553232921098052, "learning_rate": 5.991314570815441e-06, "loss": 0.265, "step": 286 }, { "epoch": 4.220588235294118, "grad_norm": 0.06886285952389541, "learning_rate": 5.776923041791076e-06, "loss": 0.2602, "step": 287 }, { "epoch": 4.235294117647059, "grad_norm": 0.06173078424810273, "learning_rate": 5.566138728625294e-06, "loss": 0.2575, "step": 288 }, { "epoch": 4.25, "grad_norm": 0.06592763687997258, "learning_rate": 5.358983848622452e-06, "loss": 0.2566, "step": 289 }, { "epoch": 4.264705882352941, "grad_norm": 0.06767164583052036, "learning_rate": 5.15548023653369e-06, "loss": 0.2595, "step": 290 }, { "epoch": 4.279411764705882, "grad_norm": 0.06893875689961178, "learning_rate": 4.955649342255462e-06, "loss": 0.2622, "step": 291 }, { "epoch": 4.294117647058823, "grad_norm": 0.0627731194283305, "learning_rate": 4.7595122285686215e-06, "loss": 0.2605, "step": 292 }, { "epoch": 4.3088235294117645, "grad_norm": 0.06183425105989223, "learning_rate": 4.567089568918403e-06, "loss": 0.262, "step": 293 }, { "epoch": 4.323529411764706, "grad_norm": 0.08304190651695828, "learning_rate": 4.3784016452353526e-06, "loss": 0.2577, "step": 294 }, { "epoch": 4.338235294117647, "grad_norm": 0.06162011756699104, "learning_rate": 4.193468345797511e-06, "loss": 0.2626, "step": 295 }, { "epoch": 4.352941176470588, "grad_norm": 0.06077784990223975, "learning_rate": 4.012309163134194e-06, "loss": 0.2631, "step": 296 }, { "epoch": 4.367647058823529, "grad_norm": 0.059344232351091784, "learning_rate": 3.8349431919713655e-06, "loss": 0.2606, "step": 297 }, { "epoch": 4.382352941176471, "grad_norm": 0.053110563118761965, "learning_rate": 3.6613891272190506e-06, "loss": 0.2584, "step": 298 }, { "epoch": 4.397058823529412, "grad_norm": 0.0550813714546073, "learning_rate": 3.49166526200079e-06, "loss": 0.2623, "step": 299 }, { "epoch": 4.411764705882353, "grad_norm": 0.05497952308193732, "learning_rate": 3.325789485725488e-06, "loss": 0.2607, "step": 300 }, { "epoch": 4.426470588235294, "grad_norm": 0.056505023992606394, "learning_rate": 3.163779282201853e-06, "loss": 0.2648, "step": 301 }, { "epoch": 4.4411764705882355, "grad_norm": 0.055167426784272444, "learning_rate": 3.0056517277955357e-06, "loss": 0.2612, "step": 302 }, { "epoch": 4.455882352941177, "grad_norm": 0.052033455176057224, "learning_rate": 2.8514234896291904e-06, "loss": 0.2617, "step": 303 }, { "epoch": 4.470588235294118, "grad_norm": 0.05049118136446942, "learning_rate": 2.7011108238257723e-06, "loss": 0.2656, "step": 304 }, { "epoch": 4.485294117647059, "grad_norm": 0.04973325645683792, "learning_rate": 2.5547295737950475e-06, "loss": 0.2651, "step": 305 }, { "epoch": 4.5, "grad_norm": 0.05470312702591553, "learning_rate": 2.4122951685636674e-06, "loss": 0.2647, "step": 306 }, { "epoch": 4.514705882352941, "grad_norm": 0.04975562581567009, "learning_rate": 2.2738226211489024e-06, "loss": 0.2588, "step": 307 }, { "epoch": 4.529411764705882, "grad_norm": 0.04990586736807332, "learning_rate": 2.1393265269762197e-06, "loss": 0.2628, "step": 308 }, { "epoch": 4.544117647058823, "grad_norm": 0.048763186017272454, "learning_rate": 2.008821062340891e-06, "loss": 0.2621, "step": 309 }, { "epoch": 4.5588235294117645, "grad_norm": 0.04772811522858683, "learning_rate": 1.8823199829137406e-06, "loss": 0.2604, "step": 310 }, { "epoch": 4.573529411764706, "grad_norm": 0.04514011191017183, "learning_rate": 1.7598366222912933e-06, "loss": 0.2626, "step": 311 }, { "epoch": 4.588235294117647, "grad_norm": 0.0447568436808381, "learning_rate": 1.6413838905903556e-06, "loss": 0.2567, "step": 312 }, { "epoch": 4.602941176470588, "grad_norm": 0.04828872191610463, "learning_rate": 1.5269742730872384e-06, "loss": 0.2618, "step": 313 }, { "epoch": 4.617647058823529, "grad_norm": 0.048070892531791296, "learning_rate": 1.4166198289017952e-06, "loss": 0.2624, "step": 314 }, { "epoch": 4.632352941176471, "grad_norm": 0.0481947096054082, "learning_rate": 1.3103321897263421e-06, "loss": 0.2624, "step": 315 }, { "epoch": 4.647058823529412, "grad_norm": 0.0498006739485343, "learning_rate": 1.2081225585996248e-06, "loss": 0.2594, "step": 316 }, { "epoch": 4.661764705882353, "grad_norm": 0.046785729335567995, "learning_rate": 1.1100017087260205e-06, "loss": 0.2622, "step": 317 }, { "epoch": 4.676470588235294, "grad_norm": 0.04518076179941004, "learning_rate": 1.015979982339994e-06, "loss": 0.2636, "step": 318 }, { "epoch": 4.6911764705882355, "grad_norm": 0.04627910929165888, "learning_rate": 9.260672896159728e-07, "loss": 0.2603, "step": 319 }, { "epoch": 4.705882352941177, "grad_norm": 0.04505892744233274, "learning_rate": 8.402731076238191e-07, "loss": 0.2606, "step": 320 }, { "epoch": 4.720588235294118, "grad_norm": 0.04552316822622753, "learning_rate": 7.586064793298998e-07, "loss": 0.2593, "step": 321 }, { "epoch": 4.735294117647059, "grad_norm": 0.045512369955118724, "learning_rate": 6.810760126439287e-07, "loss": 0.2654, "step": 322 }, { "epoch": 4.75, "grad_norm": 0.0451772973693403, "learning_rate": 6.076898795116792e-07, "loss": 0.2621, "step": 323 }, { "epoch": 4.764705882352941, "grad_norm": 0.044079852747311694, "learning_rate": 5.384558150536201e-07, "loss": 0.2615, "step": 324 }, { "epoch": 4.779411764705882, "grad_norm": 0.04455441736236055, "learning_rate": 4.7338111674962495e-07, "loss": 0.2606, "step": 325 }, { "epoch": 4.794117647058823, "grad_norm": 0.04405689466623866, "learning_rate": 4.124726436697879e-07, "loss": 0.2621, "step": 326 }, { "epoch": 4.8088235294117645, "grad_norm": 0.044529538816460816, "learning_rate": 3.557368157514596e-07, "loss": 0.2646, "step": 327 }, { "epoch": 4.823529411764706, "grad_norm": 0.04553425966459204, "learning_rate": 3.031796131225706e-07, "loss": 0.2625, "step": 328 }, { "epoch": 4.838235294117647, "grad_norm": 0.04331163075687581, "learning_rate": 2.548065754712914e-07, "loss": 0.2592, "step": 329 }, { "epoch": 4.852941176470588, "grad_norm": 0.045474788941501976, "learning_rate": 2.1062280146215252e-07, "loss": 0.2625, "step": 330 }, { "epoch": 4.867647058823529, "grad_norm": 0.04615519000321619, "learning_rate": 1.706329481986213e-07, "loss": 0.2635, "step": 331 }, { "epoch": 4.882352941176471, "grad_norm": 0.04427563364549322, "learning_rate": 1.3484123073222332e-07, "loss": 0.2637, "step": 332 }, { "epoch": 4.897058823529412, "grad_norm": 0.04265147614885937, "learning_rate": 1.0325142161827561e-07, "loss": 0.2637, "step": 333 }, { "epoch": 4.911764705882353, "grad_norm": 0.04295525693989313, "learning_rate": 7.586685051823584e-08, "loss": 0.2586, "step": 334 }, { "epoch": 4.926470588235294, "grad_norm": 0.04243683788724124, "learning_rate": 5.2690403848760785e-08, "loss": 0.2603, "step": 335 }, { "epoch": 4.9411764705882355, "grad_norm": 0.043654171470091575, "learning_rate": 3.3724524477447564e-08, "loss": 0.2622, "step": 336 }, { "epoch": 4.955882352941177, "grad_norm": 0.04272527321797204, "learning_rate": 1.897121146536396e-08, "loss": 0.2542, "step": 337 }, { "epoch": 4.970588235294118, "grad_norm": 0.04827925979026659, "learning_rate": 8.432019856345896e-09, "loss": 0.2582, "step": 338 }, { "epoch": 4.985294117647059, "grad_norm": 0.042423900810093146, "learning_rate": 2.1080605130752162e-09, "loss": 0.2564, "step": 339 }, { "epoch": 5.0, "grad_norm": 0.07128915321863381, "learning_rate": 0.0, "loss": 0.2511, "step": 340 }, { "epoch": 5.0, "step": 340, "total_flos": 1.915314895847424e+16, "train_loss": 0.3188589934040518, "train_runtime": 19815.4609, "train_samples_per_second": 8.67, "train_steps_per_second": 0.017 } ], "logging_steps": 1, "max_steps": 340, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.915314895847424e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }