{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 340,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.014705882352941176,
      "grad_norm": 2.547836181388219,
      "learning_rate": 2.3529411764705885e-06,
      "loss": 0.693,
      "step": 1
    },
    {
      "epoch": 0.029411764705882353,
      "grad_norm": 2.5270813414533366,
      "learning_rate": 4.705882352941177e-06,
      "loss": 0.6892,
      "step": 2
    },
    {
      "epoch": 0.04411764705882353,
      "grad_norm": 2.355483907211656,
      "learning_rate": 7.058823529411766e-06,
      "loss": 0.6851,
      "step": 3
    },
    {
      "epoch": 0.058823529411764705,
      "grad_norm": 1.8130870424308736,
      "learning_rate": 9.411764705882354e-06,
      "loss": 0.6513,
      "step": 4
    },
    {
      "epoch": 0.07352941176470588,
      "grad_norm": 1.2216520407231852,
      "learning_rate": 1.1764705882352942e-05,
      "loss": 0.6231,
      "step": 5
    },
    {
      "epoch": 0.08823529411764706,
      "grad_norm": 1.503718312141216,
      "learning_rate": 1.4117647058823532e-05,
      "loss": 0.6169,
      "step": 6
    },
    {
      "epoch": 0.10294117647058823,
      "grad_norm": 1.7964660884238277,
      "learning_rate": 1.647058823529412e-05,
      "loss": 0.5687,
      "step": 7
    },
    {
      "epoch": 0.11764705882352941,
      "grad_norm": 1.7269912533351832,
      "learning_rate": 1.8823529411764708e-05,
      "loss": 0.5586,
      "step": 8
    },
    {
      "epoch": 0.1323529411764706,
      "grad_norm": 0.7251113424301481,
      "learning_rate": 2.1176470588235296e-05,
      "loss": 0.5208,
      "step": 9
    },
    {
      "epoch": 0.14705882352941177,
      "grad_norm": 1.1160352667226052,
      "learning_rate": 2.3529411764705884e-05,
      "loss": 0.5086,
      "step": 10
    },
    {
      "epoch": 0.16176470588235295,
      "grad_norm": 0.9015674485810162,
      "learning_rate": 2.5882352941176475e-05,
      "loss": 0.4903,
      "step": 11
    },
    {
      "epoch": 0.17647058823529413,
      "grad_norm": 0.7153137232697248,
      "learning_rate": 2.8235294117647063e-05,
      "loss": 0.4841,
      "step": 12
    },
    {
      "epoch": 0.19117647058823528,
      "grad_norm": 0.8088993429376143,
      "learning_rate": 3.0588235294117644e-05,
      "loss": 0.4711,
      "step": 13
    },
    {
      "epoch": 0.20588235294117646,
      "grad_norm": 0.5653053624299016,
      "learning_rate": 3.294117647058824e-05,
      "loss": 0.4567,
      "step": 14
    },
    {
      "epoch": 0.22058823529411764,
      "grad_norm": 0.5534586516972508,
      "learning_rate": 3.529411764705883e-05,
      "loss": 0.4522,
      "step": 15
    },
    {
      "epoch": 0.23529411764705882,
      "grad_norm": 0.5736228520847013,
      "learning_rate": 3.7647058823529415e-05,
      "loss": 0.4415,
      "step": 16
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.4337208100637674,
      "learning_rate": 4e-05,
      "loss": 0.4327,
      "step": 17
    },
    {
      "epoch": 0.2647058823529412,
      "grad_norm": 0.4876347786470063,
      "learning_rate": 4.235294117647059e-05,
      "loss": 0.43,
      "step": 18
    },
    {
      "epoch": 0.27941176470588236,
      "grad_norm": 0.4061097022997626,
      "learning_rate": 4.470588235294118e-05,
      "loss": 0.4317,
      "step": 19
    },
    {
      "epoch": 0.29411764705882354,
      "grad_norm": 0.4103637458773689,
      "learning_rate": 4.705882352941177e-05,
      "loss": 0.43,
      "step": 20
    },
    {
      "epoch": 0.3088235294117647,
      "grad_norm": 0.3888995331551954,
      "learning_rate": 4.941176470588236e-05,
      "loss": 0.4199,
      "step": 21
    },
    {
      "epoch": 0.3235294117647059,
      "grad_norm": 0.3268371484433097,
      "learning_rate": 5.176470588235295e-05,
      "loss": 0.4185,
      "step": 22
    },
    {
      "epoch": 0.3382352941176471,
      "grad_norm": 0.39223698739269497,
      "learning_rate": 5.411764705882354e-05,
      "loss": 0.4128,
      "step": 23
    },
    {
      "epoch": 0.35294117647058826,
      "grad_norm": 0.2779761293219138,
      "learning_rate": 5.6470588235294126e-05,
      "loss": 0.4043,
      "step": 24
    },
    {
      "epoch": 0.36764705882352944,
      "grad_norm": 0.3989443078505564,
      "learning_rate": 5.8823529411764714e-05,
      "loss": 0.4089,
      "step": 25
    },
    {
      "epoch": 0.38235294117647056,
      "grad_norm": 0.38093200095401203,
      "learning_rate": 6.117647058823529e-05,
      "loss": 0.4017,
      "step": 26
    },
    {
      "epoch": 0.39705882352941174,
      "grad_norm": 0.4064320796173135,
      "learning_rate": 6.352941176470589e-05,
      "loss": 0.4011,
      "step": 27
    },
    {
      "epoch": 0.4117647058823529,
      "grad_norm": 0.4971422289820653,
      "learning_rate": 6.588235294117648e-05,
      "loss": 0.3989,
      "step": 28
    },
    {
      "epoch": 0.4264705882352941,
      "grad_norm": 0.7406572881589351,
      "learning_rate": 6.823529411764707e-05,
      "loss": 0.4051,
      "step": 29
    },
    {
      "epoch": 0.4411764705882353,
      "grad_norm": 0.8206793701783639,
      "learning_rate": 7.058823529411765e-05,
      "loss": 0.406,
      "step": 30
    },
    {
      "epoch": 0.45588235294117646,
      "grad_norm": 0.46999206766579715,
      "learning_rate": 7.294117647058824e-05,
      "loss": 0.3959,
      "step": 31
    },
    {
      "epoch": 0.47058823529411764,
      "grad_norm": 0.7832181937714577,
      "learning_rate": 7.529411764705883e-05,
      "loss": 0.3979,
      "step": 32
    },
    {
      "epoch": 0.4852941176470588,
      "grad_norm": 0.6729630572440819,
      "learning_rate": 7.764705882352942e-05,
      "loss": 0.4037,
      "step": 33
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.5845843059576046,
      "learning_rate": 8e-05,
      "loss": 0.3981,
      "step": 34
    },
    {
      "epoch": 0.5147058823529411,
      "grad_norm": 0.5179039508771461,
      "learning_rate": 7.999789193948694e-05,
      "loss": 0.3899,
      "step": 35
    },
    {
      "epoch": 0.5294117647058824,
      "grad_norm": 0.8446247503414208,
      "learning_rate": 7.999156798014366e-05,
      "loss": 0.3919,
      "step": 36
    },
    {
      "epoch": 0.5441176470588235,
      "grad_norm": 0.5811147704566235,
      "learning_rate": 7.998102878853464e-05,
      "loss": 0.3842,
      "step": 37
    },
    {
      "epoch": 0.5588235294117647,
      "grad_norm": 0.7042141225204365,
      "learning_rate": 7.996627547552256e-05,
      "loss": 0.3887,
      "step": 38
    },
    {
      "epoch": 0.5735294117647058,
      "grad_norm": 0.5162453656112366,
      "learning_rate": 7.994730959615125e-05,
      "loss": 0.3801,
      "step": 39
    },
    {
      "epoch": 0.5882352941176471,
      "grad_norm": 0.6306069956051037,
      "learning_rate": 7.992413314948177e-05,
      "loss": 0.3872,
      "step": 40
    },
    {
      "epoch": 0.6029411764705882,
      "grad_norm": 0.4954620508512063,
      "learning_rate": 7.989674857838173e-05,
      "loss": 0.3799,
      "step": 41
    },
    {
      "epoch": 0.6176470588235294,
      "grad_norm": 0.4655956434769946,
      "learning_rate": 7.986515876926777e-05,
      "loss": 0.3795,
      "step": 42
    },
    {
      "epoch": 0.6323529411764706,
      "grad_norm": 0.4180599863638677,
      "learning_rate": 7.982936705180139e-05,
      "loss": 0.3797,
      "step": 43
    },
    {
      "epoch": 0.6470588235294118,
      "grad_norm": 0.3370827795593815,
      "learning_rate": 7.978937719853786e-05,
      "loss": 0.3713,
      "step": 44
    },
    {
      "epoch": 0.6617647058823529,
      "grad_norm": 0.302709681434872,
      "learning_rate": 7.974519342452872e-05,
      "loss": 0.37,
      "step": 45
    },
    {
      "epoch": 0.6764705882352942,
      "grad_norm": 0.35605679848702854,
      "learning_rate": 7.969682038687744e-05,
      "loss": 0.3706,
      "step": 46
    },
    {
      "epoch": 0.6911764705882353,
      "grad_norm": 0.3386095408702867,
      "learning_rate": 7.964426318424855e-05,
      "loss": 0.3717,
      "step": 47
    },
    {
      "epoch": 0.7058823529411765,
      "grad_norm": 0.3740443449230433,
      "learning_rate": 7.958752735633022e-05,
      "loss": 0.3678,
      "step": 48
    },
    {
      "epoch": 0.7205882352941176,
      "grad_norm": 0.291877051555131,
      "learning_rate": 7.952661888325038e-05,
      "loss": 0.3667,
      "step": 49
    },
    {
      "epoch": 0.7352941176470589,
      "grad_norm": 0.22798970572724098,
      "learning_rate": 7.946154418494639e-05,
      "loss": 0.3658,
      "step": 50
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.3017794055631277,
      "learning_rate": 7.939231012048833e-05,
      "loss": 0.366,
      "step": 51
    },
    {
      "epoch": 0.7647058823529411,
      "grad_norm": 0.2183488887265105,
      "learning_rate": 7.931892398735608e-05,
      "loss": 0.364,
      "step": 52
    },
    {
      "epoch": 0.7794117647058824,
      "grad_norm": 0.2651745781496878,
      "learning_rate": 7.92413935206701e-05,
      "loss": 0.36,
      "step": 53
    },
    {
      "epoch": 0.7941176470588235,
      "grad_norm": 0.25030378975718287,
      "learning_rate": 7.915972689237618e-05,
      "loss": 0.3629,
      "step": 54
    },
    {
      "epoch": 0.8088235294117647,
      "grad_norm": 0.2531618636697317,
      "learning_rate": 7.907393271038403e-05,
      "loss": 0.3548,
      "step": 55
    },
    {
      "epoch": 0.8235294117647058,
      "grad_norm": 0.33260642234735066,
      "learning_rate": 7.898402001766002e-05,
      "loss": 0.364,
      "step": 56
    },
    {
      "epoch": 0.8382352941176471,
      "grad_norm": 0.41115101334491694,
      "learning_rate": 7.888999829127398e-05,
      "loss": 0.3578,
      "step": 57
    },
    {
      "epoch": 0.8529411764705882,
      "grad_norm": 0.44590061545961934,
      "learning_rate": 7.879187744140039e-05,
      "loss": 0.3607,
      "step": 58
    },
    {
      "epoch": 0.8676470588235294,
      "grad_norm": 0.522962751461851,
      "learning_rate": 7.868966781027367e-05,
      "loss": 0.3592,
      "step": 59
    },
    {
      "epoch": 0.8823529411764706,
      "grad_norm": 0.6601512591505266,
      "learning_rate": 7.858338017109822e-05,
      "loss": 0.3657,
      "step": 60
    },
    {
      "epoch": 0.8970588235294118,
      "grad_norm": 0.5825651454852345,
      "learning_rate": 7.847302572691277e-05,
      "loss": 0.3625,
      "step": 61
    },
    {
      "epoch": 0.9117647058823529,
      "grad_norm": 0.42428578770971376,
      "learning_rate": 7.835861610940965e-05,
      "loss": 0.3613,
      "step": 62
    },
    {
      "epoch": 0.9264705882352942,
      "grad_norm": 0.3717888932030909,
      "learning_rate": 7.824016337770872e-05,
      "loss": 0.3573,
      "step": 63
    },
    {
      "epoch": 0.9411764705882353,
      "grad_norm": 0.42447915248441553,
      "learning_rate": 7.811768001708627e-05,
      "loss": 0.362,
      "step": 64
    },
    {
      "epoch": 0.9558823529411765,
      "grad_norm": 0.3876778292466403,
      "learning_rate": 7.799117893765913e-05,
      "loss": 0.3557,
      "step": 65
    },
    {
      "epoch": 0.9705882352941176,
      "grad_norm": 0.26608755562026015,
      "learning_rate": 7.786067347302379e-05,
      "loss": 0.3545,
      "step": 66
    },
    {
      "epoch": 0.9852941176470589,
      "grad_norm": 0.3434111610187913,
      "learning_rate": 7.77261773788511e-05,
      "loss": 0.353,
      "step": 67
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.3497537599784584,
      "learning_rate": 7.758770483143634e-05,
      "loss": 0.3545,
      "step": 68
    },
    {
      "epoch": 1.0147058823529411,
      "grad_norm": 0.3391085192527439,
      "learning_rate": 7.744527042620496e-05,
      "loss": 0.3419,
      "step": 69
    },
    {
      "epoch": 1.0294117647058822,
      "grad_norm": 0.5077627087325893,
      "learning_rate": 7.729888917617424e-05,
      "loss": 0.3471,
      "step": 70
    },
    {
      "epoch": 1.0441176470588236,
      "grad_norm": 0.5532598557050799,
      "learning_rate": 7.714857651037081e-05,
      "loss": 0.3412,
      "step": 71
    },
    {
      "epoch": 1.0588235294117647,
      "grad_norm": 0.5780960309028261,
      "learning_rate": 7.699434827220448e-05,
      "loss": 0.3428,
      "step": 72
    },
    {
      "epoch": 1.0735294117647058,
      "grad_norm": 0.49974865026294507,
      "learning_rate": 7.683622071779816e-05,
      "loss": 0.3364,
      "step": 73
    },
    {
      "epoch": 1.088235294117647,
      "grad_norm": 0.36747177299557693,
      "learning_rate": 7.667421051427453e-05,
      "loss": 0.3389,
      "step": 74
    },
    {
      "epoch": 1.1029411764705883,
      "grad_norm": 0.3669907335719105,
      "learning_rate": 7.650833473799922e-05,
      "loss": 0.3312,
      "step": 75
    },
    {
      "epoch": 1.1176470588235294,
      "grad_norm": 0.3922235437218937,
      "learning_rate": 7.633861087278095e-05,
      "loss": 0.3293,
      "step": 76
    },
    {
      "epoch": 1.1323529411764706,
      "grad_norm": 0.3719011690300337,
      "learning_rate": 7.616505680802863e-05,
      "loss": 0.3375,
      "step": 77
    },
    {
      "epoch": 1.1470588235294117,
      "grad_norm": 0.3377098123124342,
      "learning_rate": 7.598769083686582e-05,
      "loss": 0.3405,
      "step": 78
    },
    {
      "epoch": 1.161764705882353,
      "grad_norm": 0.29899552501628507,
      "learning_rate": 7.58065316542025e-05,
      "loss": 0.3351,
      "step": 79
    },
    {
      "epoch": 1.1764705882352942,
      "grad_norm": 0.32828082894171606,
      "learning_rate": 7.562159835476466e-05,
      "loss": 0.3389,
      "step": 80
    },
    {
      "epoch": 1.1911764705882353,
      "grad_norm": 0.3445874209931112,
      "learning_rate": 7.543291043108159e-05,
      "loss": 0.3378,
      "step": 81
    },
    {
      "epoch": 1.2058823529411764,
      "grad_norm": 0.3023978628206444,
      "learning_rate": 7.524048777143139e-05,
      "loss": 0.3387,
      "step": 82
    },
    {
      "epoch": 1.2205882352941178,
      "grad_norm": 0.25558483521205383,
      "learning_rate": 7.504435065774455e-05,
      "loss": 0.3306,
      "step": 83
    },
    {
      "epoch": 1.2352941176470589,
      "grad_norm": 0.32953601128265847,
      "learning_rate": 7.48445197634663e-05,
      "loss": 0.3349,
      "step": 84
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.33756326919937646,
      "learning_rate": 7.464101615137756e-05,
      "loss": 0.3324,
      "step": 85
    },
    {
      "epoch": 1.2647058823529411,
      "grad_norm": 0.24120088917587124,
      "learning_rate": 7.443386127137472e-05,
      "loss": 0.3317,
      "step": 86
    },
    {
      "epoch": 1.2794117647058822,
      "grad_norm": 0.2914122682482861,
      "learning_rate": 7.422307695820893e-05,
      "loss": 0.3346,
      "step": 87
    },
    {
      "epoch": 1.2941176470588236,
      "grad_norm": 0.32020365300061576,
      "learning_rate": 7.400868542918457e-05,
      "loss": 0.3303,
      "step": 88
    },
    {
      "epoch": 1.3088235294117647,
      "grad_norm": 0.2516025057415832,
      "learning_rate": 7.379070928181747e-05,
      "loss": 0.3351,
      "step": 89
    },
    {
      "epoch": 1.3235294117647058,
      "grad_norm": 0.25001986331755705,
      "learning_rate": 7.356917149145308e-05,
      "loss": 0.3353,
      "step": 90
    },
    {
      "epoch": 1.3382352941176472,
      "grad_norm": 0.2792197235528977,
      "learning_rate": 7.334409540884479e-05,
      "loss": 0.3294,
      "step": 91
    },
    {
      "epoch": 1.3529411764705883,
      "grad_norm": 0.28078783596981044,
      "learning_rate": 7.311550475769272e-05,
      "loss": 0.3248,
      "step": 92
    },
    {
      "epoch": 1.3676470588235294,
      "grad_norm": 0.37967936203746205,
      "learning_rate": 7.288342363214313e-05,
      "loss": 0.3328,
      "step": 93
    },
    {
      "epoch": 1.3823529411764706,
      "grad_norm": 0.5149190442137033,
      "learning_rate": 7.264787649424888e-05,
      "loss": 0.3312,
      "step": 94
    },
    {
      "epoch": 1.3970588235294117,
      "grad_norm": 0.6122843221799549,
      "learning_rate": 7.240888817139094e-05,
      "loss": 0.3348,
      "step": 95
    },
    {
      "epoch": 1.4117647058823528,
      "grad_norm": 0.5886981731078635,
      "learning_rate": 7.216648385366167e-05,
      "loss": 0.3395,
      "step": 96
    },
    {
      "epoch": 1.4264705882352942,
      "grad_norm": 0.34409848388448383,
      "learning_rate": 7.192068909120959e-05,
      "loss": 0.3306,
      "step": 97
    },
    {
      "epoch": 1.4411764705882353,
      "grad_norm": 0.2978091494500524,
      "learning_rate": 7.167152979154634e-05,
      "loss": 0.3334,
      "step": 98
    },
    {
      "epoch": 1.4558823529411764,
      "grad_norm": 0.4400729737039225,
      "learning_rate": 7.141903221681595e-05,
      "loss": 0.3404,
      "step": 99
    },
    {
      "epoch": 1.4705882352941178,
      "grad_norm": 0.3694283042397561,
      "learning_rate": 7.116322298102681e-05,
      "loss": 0.3332,
      "step": 100
    },
    {
      "epoch": 1.4852941176470589,
      "grad_norm": 0.2639067391873521,
      "learning_rate": 7.090412904724636e-05,
      "loss": 0.3313,
      "step": 101
    },
    {
      "epoch": 1.5,
      "grad_norm": 0.37416820779021187,
      "learning_rate": 7.064177772475912e-05,
      "loss": 0.3285,
      "step": 102
    },
    {
      "epoch": 1.5147058823529411,
      "grad_norm": 0.33275488946271775,
      "learning_rate": 7.037619666618829e-05,
      "loss": 0.3361,
      "step": 103
    },
    {
      "epoch": 1.5294117647058822,
      "grad_norm": 0.23894606263596457,
      "learning_rate": 7.010741386458099e-05,
      "loss": 0.3388,
      "step": 104
    },
    {
      "epoch": 1.5441176470588234,
      "grad_norm": 0.3668325550727754,
      "learning_rate": 6.983545765045774e-05,
      "loss": 0.3311,
      "step": 105
    },
    {
      "epoch": 1.5588235294117647,
      "grad_norm": 0.2399104316842811,
      "learning_rate": 6.956035668882637e-05,
      "loss": 0.3297,
      "step": 106
    },
    {
      "epoch": 1.5735294117647058,
      "grad_norm": 0.2357871283562126,
      "learning_rate": 6.928213997616059e-05,
      "loss": 0.3318,
      "step": 107
    },
    {
      "epoch": 1.5882352941176472,
      "grad_norm": 0.2849131707817035,
      "learning_rate": 6.900083683734372e-05,
      "loss": 0.3304,
      "step": 108
    },
    {
      "epoch": 1.6029411764705883,
      "grad_norm": 0.17423798370987492,
      "learning_rate": 6.871647692257768e-05,
      "loss": 0.3276,
      "step": 109
    },
    {
      "epoch": 1.6176470588235294,
      "grad_norm": 0.2548925157793133,
      "learning_rate": 6.842909020425789e-05,
      "loss": 0.334,
      "step": 110
    },
    {
      "epoch": 1.6323529411764706,
      "grad_norm": 0.24942793454876405,
      "learning_rate": 6.8138706973814e-05,
      "loss": 0.3286,
      "step": 111
    },
    {
      "epoch": 1.6470588235294117,
      "grad_norm": 0.18689295644396056,
      "learning_rate": 6.784535783851708e-05,
      "loss": 0.3266,
      "step": 112
    },
    {
      "epoch": 1.6617647058823528,
      "grad_norm": 0.2911151755362797,
      "learning_rate": 6.754907371825355e-05,
      "loss": 0.3262,
      "step": 113
    },
    {
      "epoch": 1.6764705882352942,
      "grad_norm": 0.27837463790987516,
      "learning_rate": 6.724988584226616e-05,
      "loss": 0.3279,
      "step": 114
    },
    {
      "epoch": 1.6911764705882353,
      "grad_norm": 0.28908336337027807,
      "learning_rate": 6.69478257458623e-05,
      "loss": 0.3281,
      "step": 115
    },
    {
      "epoch": 1.7058823529411766,
      "grad_norm": 0.34403162565227147,
      "learning_rate": 6.664292526709001e-05,
      "loss": 0.3313,
      "step": 116
    },
    {
      "epoch": 1.7205882352941178,
      "grad_norm": 0.29186151217355694,
      "learning_rate": 6.633521654338233e-05,
      "loss": 0.3334,
      "step": 117
    },
    {
      "epoch": 1.7352941176470589,
      "grad_norm": 0.3213370329801203,
      "learning_rate": 6.602473200816969e-05,
      "loss": 0.3267,
      "step": 118
    },
    {
      "epoch": 1.75,
      "grad_norm": 0.30160846132752556,
      "learning_rate": 6.571150438746157e-05,
      "loss": 0.3242,
      "step": 119
    },
    {
      "epoch": 1.7647058823529411,
      "grad_norm": 0.26892209589390914,
      "learning_rate": 6.539556669639691e-05,
      "loss": 0.3244,
      "step": 120
    },
    {
      "epoch": 1.7794117647058822,
      "grad_norm": 0.2440948850515908,
      "learning_rate": 6.507695223576428e-05,
      "loss": 0.3229,
      "step": 121
    },
    {
      "epoch": 1.7941176470588234,
      "grad_norm": 0.20672290158202897,
      "learning_rate": 6.475569458849178e-05,
      "loss": 0.331,
      "step": 122
    },
    {
      "epoch": 1.8088235294117647,
      "grad_norm": 0.2459670080944092,
      "learning_rate": 6.443182761610752e-05,
      "loss": 0.3321,
      "step": 123
    },
    {
      "epoch": 1.8235294117647058,
      "grad_norm": 0.2704407966484609,
      "learning_rate": 6.410538545517026e-05,
      "loss": 0.3288,
      "step": 124
    },
    {
      "epoch": 1.8382352941176472,
      "grad_norm": 0.23662938749712678,
      "learning_rate": 6.377640251367148e-05,
      "loss": 0.3285,
      "step": 125
    },
    {
      "epoch": 1.8529411764705883,
      "grad_norm": 0.23508733004486604,
      "learning_rate": 6.344491346740859e-05,
      "loss": 0.3265,
      "step": 126
    },
    {
      "epoch": 1.8676470588235294,
      "grad_norm": 0.15905797582955938,
      "learning_rate": 6.311095325633006e-05,
      "loss": 0.3287,
      "step": 127
    },
    {
      "epoch": 1.8823529411764706,
      "grad_norm": 0.16194725388549308,
      "learning_rate": 6.277455708085255e-05,
      "loss": 0.3193,
      "step": 128
    },
    {
      "epoch": 1.8970588235294117,
      "grad_norm": 0.19431781452308503,
      "learning_rate": 6.24357603981508e-05,
      "loss": 0.3218,
      "step": 129
    },
    {
      "epoch": 1.9117647058823528,
      "grad_norm": 0.1530375931457355,
      "learning_rate": 6.209459891842023e-05,
      "loss": 0.3232,
      "step": 130
    },
    {
      "epoch": 1.9264705882352942,
      "grad_norm": 0.20552563428946305,
      "learning_rate": 6.175110860111307e-05,
      "loss": 0.3291,
      "step": 131
    },
    {
      "epoch": 1.9411764705882353,
      "grad_norm": 0.16800259989430927,
      "learning_rate": 6.140532565114801e-05,
      "loss": 0.3255,
      "step": 132
    },
    {
      "epoch": 1.9558823529411766,
      "grad_norm": 0.15539283189220082,
      "learning_rate": 6.105728651509424e-05,
      "loss": 0.3254,
      "step": 133
    },
    {
      "epoch": 1.9705882352941178,
      "grad_norm": 0.16103547056804585,
      "learning_rate": 6.070702787732971e-05,
      "loss": 0.3249,
      "step": 134
    },
    {
      "epoch": 1.9852941176470589,
      "grad_norm": 0.16239367515131709,
      "learning_rate": 6.0354586656174606e-05,
      "loss": 0.3288,
      "step": 135
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.20166055674169706,
      "learning_rate": 6.000000000000001e-05,
      "loss": 0.3099,
      "step": 136
    },
    {
      "epoch": 2.014705882352941,
      "grad_norm": 0.24260127096827225,
      "learning_rate": 5.964330528331234e-05,
      "loss": 0.3056,
      "step": 137
    },
    {
      "epoch": 2.0294117647058822,
      "grad_norm": 0.33127332158017103,
      "learning_rate": 5.9284540102813964e-05,
      "loss": 0.3,
      "step": 138
    },
    {
      "epoch": 2.0441176470588234,
      "grad_norm": 0.43353541229148657,
      "learning_rate": 5.892374227344041e-05,
      "loss": 0.308,
      "step": 139
    },
    {
      "epoch": 2.0588235294117645,
      "grad_norm": 0.49315921894412995,
      "learning_rate": 5.856094982437454e-05,
      "loss": 0.3067,
      "step": 140
    },
    {
      "epoch": 2.073529411764706,
      "grad_norm": 0.450512729936066,
      "learning_rate": 5.819620099503818e-05,
      "loss": 0.3101,
      "step": 141
    },
    {
      "epoch": 2.088235294117647,
      "grad_norm": 0.343920919271517,
      "learning_rate": 5.782953423106154e-05,
      "loss": 0.3046,
      "step": 142
    },
    {
      "epoch": 2.1029411764705883,
      "grad_norm": 0.2814078808698335,
      "learning_rate": 5.746098818023093e-05,
      "loss": 0.2988,
      "step": 143
    },
    {
      "epoch": 2.1176470588235294,
      "grad_norm": 0.3529797938787429,
      "learning_rate": 5.709060168841524e-05,
      "loss": 0.3033,
      "step": 144
    },
    {
      "epoch": 2.1323529411764706,
      "grad_norm": 0.33042737999638,
      "learning_rate": 5.6718413795471346e-05,
      "loss": 0.3028,
      "step": 145
    },
    {
      "epoch": 2.1470588235294117,
      "grad_norm": 0.16519735683489525,
      "learning_rate": 5.634446373112926e-05,
      "loss": 0.3035,
      "step": 146
    },
    {
      "epoch": 2.161764705882353,
      "grad_norm": 0.23206099454265375,
      "learning_rate": 5.596879091085724e-05,
      "loss": 0.3001,
      "step": 147
    },
    {
      "epoch": 2.176470588235294,
      "grad_norm": 0.2771142062084271,
      "learning_rate": 5.5591434931707176e-05,
      "loss": 0.3005,
      "step": 148
    },
    {
      "epoch": 2.1911764705882355,
      "grad_norm": 0.2431821354495366,
      "learning_rate": 5.5212435568141036e-05,
      "loss": 0.3045,
      "step": 149
    },
    {
      "epoch": 2.2058823529411766,
      "grad_norm": 0.18676533826823546,
      "learning_rate": 5.4831832767838436e-05,
      "loss": 0.2993,
      "step": 150
    },
    {
      "epoch": 2.2205882352941178,
      "grad_norm": 0.21179962464182328,
      "learning_rate": 5.444966664748613e-05,
      "loss": 0.2967,
      "step": 151
    },
    {
      "epoch": 2.235294117647059,
      "grad_norm": 0.2277766014598405,
      "learning_rate": 5.406597748854947e-05,
      "loss": 0.2993,
      "step": 152
    },
    {
      "epoch": 2.25,
      "grad_norm": 0.19342351829324006,
      "learning_rate": 5.368080573302676e-05,
      "loss": 0.3044,
      "step": 153
    },
    {
      "epoch": 2.264705882352941,
      "grad_norm": 0.21064512781524572,
      "learning_rate": 5.329419197918639e-05,
      "loss": 0.3062,
      "step": 154
    },
    {
      "epoch": 2.2794117647058822,
      "grad_norm": 0.15702816077320908,
      "learning_rate": 5.29061769772878e-05,
      "loss": 0.2995,
      "step": 155
    },
    {
      "epoch": 2.2941176470588234,
      "grad_norm": 0.17947407344934332,
      "learning_rate": 5.251680162528618e-05,
      "loss": 0.3013,
      "step": 156
    },
    {
      "epoch": 2.3088235294117645,
      "grad_norm": 0.18466536636530687,
      "learning_rate": 5.212610696452174e-05,
      "loss": 0.3036,
      "step": 157
    },
    {
      "epoch": 2.323529411764706,
      "grad_norm": 0.1680929482668945,
      "learning_rate": 5.173413417539385e-05,
      "loss": 0.3029,
      "step": 158
    },
    {
      "epoch": 2.338235294117647,
      "grad_norm": 0.1654731382812225,
      "learning_rate": 5.134092457302044e-05,
      "loss": 0.3024,
      "step": 159
    },
    {
      "epoch": 2.3529411764705883,
      "grad_norm": 0.15258036230353525,
      "learning_rate": 5.0946519602883326e-05,
      "loss": 0.3037,
      "step": 160
    },
    {
      "epoch": 2.3676470588235294,
      "grad_norm": 0.1491955575134207,
      "learning_rate": 5.0550960836459674e-05,
      "loss": 0.3044,
      "step": 161
    },
    {
      "epoch": 2.3823529411764706,
      "grad_norm": 0.14545439455177017,
      "learning_rate": 5.0154289966840315e-05,
      "loss": 0.2954,
      "step": 162
    },
    {
      "epoch": 2.3970588235294117,
      "grad_norm": 0.13499905174566654,
      "learning_rate": 4.975654880433509e-05,
      "loss": 0.2991,
      "step": 163
    },
    {
      "epoch": 2.411764705882353,
      "grad_norm": 0.13529530394095032,
      "learning_rate": 4.935777927206595e-05,
      "loss": 0.301,
      "step": 164
    },
    {
      "epoch": 2.426470588235294,
      "grad_norm": 0.14447147193835977,
      "learning_rate": 4.895802340154813e-05,
      "loss": 0.3038,
      "step": 165
    },
    {
      "epoch": 2.4411764705882355,
      "grad_norm": 0.1368779663418461,
      "learning_rate": 4.85573233282599e-05,
      "loss": 0.3022,
      "step": 166
    },
    {
      "epoch": 2.4558823529411766,
      "grad_norm": 0.13420641137376216,
      "learning_rate": 4.815572128720138e-05,
      "loss": 0.3049,
      "step": 167
    },
    {
      "epoch": 2.4705882352941178,
      "grad_norm": 0.12163815573974876,
      "learning_rate": 4.7753259608442804e-05,
      "loss": 0.2998,
      "step": 168
    },
    {
      "epoch": 2.485294117647059,
      "grad_norm": 0.15356753682422525,
      "learning_rate": 4.734998071266282e-05,
      "loss": 0.298,
      "step": 169
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.11474062387695687,
      "learning_rate": 4.694592710667723e-05,
      "loss": 0.3068,
      "step": 170
    },
    {
      "epoch": 2.514705882352941,
      "grad_norm": 0.15293267687494932,
      "learning_rate": 4.65411413789586e-05,
      "loss": 0.3005,
      "step": 171
    },
    {
      "epoch": 2.5294117647058822,
      "grad_norm": 0.13885259280209167,
      "learning_rate": 4.6135666195147426e-05,
      "loss": 0.2994,
      "step": 172
    },
    {
      "epoch": 2.5441176470588234,
      "grad_norm": 0.11212797930990655,
      "learning_rate": 4.572954429355487e-05,
      "loss": 0.3026,
      "step": 173
    },
    {
      "epoch": 2.5588235294117645,
      "grad_norm": 0.1441373090862667,
      "learning_rate": 4.532281848065816e-05,
      "loss": 0.3014,
      "step": 174
    },
    {
      "epoch": 2.5735294117647056,
      "grad_norm": 0.1438877611890079,
      "learning_rate": 4.491553162658857e-05,
      "loss": 0.3044,
      "step": 175
    },
    {
      "epoch": 2.588235294117647,
      "grad_norm": 0.12603413712308525,
      "learning_rate": 4.450772666061285e-05,
      "loss": 0.301,
      "step": 176
    },
    {
      "epoch": 2.6029411764705883,
      "grad_norm": 0.12633770238098366,
      "learning_rate": 4.409944656660828e-05,
      "loss": 0.2965,
      "step": 177
    },
    {
      "epoch": 2.6176470588235294,
      "grad_norm": 0.13789374969079662,
      "learning_rate": 4.369073437853208e-05,
      "loss": 0.3009,
      "step": 178
    },
    {
      "epoch": 2.6323529411764706,
      "grad_norm": 0.1402520516162208,
      "learning_rate": 4.328163317588552e-05,
      "loss": 0.298,
      "step": 179
    },
    {
      "epoch": 2.6470588235294117,
      "grad_norm": 0.09904937633936901,
      "learning_rate": 4.2872186079173106e-05,
      "loss": 0.3013,
      "step": 180
    },
    {
      "epoch": 2.661764705882353,
      "grad_norm": 0.1312793347901254,
      "learning_rate": 4.2462436245357724e-05,
      "loss": 0.3,
      "step": 181
    },
    {
      "epoch": 2.6764705882352944,
      "grad_norm": 0.09621059323368814,
      "learning_rate": 4.205242686331159e-05,
      "loss": 0.3029,
      "step": 182
    },
    {
      "epoch": 2.6911764705882355,
      "grad_norm": 0.10807537972219632,
      "learning_rate": 4.164220114926414e-05,
      "loss": 0.2978,
      "step": 183
    },
    {
      "epoch": 2.7058823529411766,
      "grad_norm": 0.10722934994246927,
      "learning_rate": 4.123180234224682e-05,
      "loss": 0.2998,
      "step": 184
    },
    {
      "epoch": 2.7205882352941178,
      "grad_norm": 0.1064704662992324,
      "learning_rate": 4.0821273699535625e-05,
      "loss": 0.3013,
      "step": 185
    },
    {
      "epoch": 2.735294117647059,
      "grad_norm": 0.10969792661155021,
      "learning_rate": 4.04106584920916e-05,
      "loss": 0.3006,
      "step": 186
    },
    {
      "epoch": 2.75,
      "grad_norm": 0.12146189911454472,
      "learning_rate": 4e-05,
      "loss": 0.3031,
      "step": 187
    },
    {
      "epoch": 2.764705882352941,
      "grad_norm": 0.11305408445478238,
      "learning_rate": 3.9589341507908415e-05,
      "loss": 0.3037,
      "step": 188
    },
    {
      "epoch": 2.7794117647058822,
      "grad_norm": 0.13033799624199852,
      "learning_rate": 3.917872630046439e-05,
      "loss": 0.3032,
      "step": 189
    },
    {
      "epoch": 2.7941176470588234,
      "grad_norm": 0.10937208565515465,
      "learning_rate": 3.8768197657753194e-05,
      "loss": 0.3035,
      "step": 190
    },
    {
      "epoch": 2.8088235294117645,
      "grad_norm": 0.11386446489403948,
      "learning_rate": 3.835779885073588e-05,
      "loss": 0.2985,
      "step": 191
    },
    {
      "epoch": 2.8235294117647056,
      "grad_norm": 0.10573108090134906,
      "learning_rate": 3.794757313668841e-05,
      "loss": 0.3025,
      "step": 192
    },
    {
      "epoch": 2.838235294117647,
      "grad_norm": 0.09522893522850687,
      "learning_rate": 3.753756375464229e-05,
      "loss": 0.2964,
      "step": 193
    },
    {
      "epoch": 2.8529411764705883,
      "grad_norm": 0.10763504421587497,
      "learning_rate": 3.71278139208269e-05,
      "loss": 0.2983,
      "step": 194
    },
    {
      "epoch": 2.8676470588235294,
      "grad_norm": 0.09673967344497875,
      "learning_rate": 3.67183668241145e-05,
      "loss": 0.303,
      "step": 195
    },
    {
      "epoch": 2.8823529411764706,
      "grad_norm": 0.11019580337657149,
      "learning_rate": 3.630926562146792e-05,
      "loss": 0.2977,
      "step": 196
    },
    {
      "epoch": 2.8970588235294117,
      "grad_norm": 0.09457792059050557,
      "learning_rate": 3.5900553433391724e-05,
      "loss": 0.2987,
      "step": 197
    },
    {
      "epoch": 2.911764705882353,
      "grad_norm": 0.08738325053473636,
      "learning_rate": 3.549227333938716e-05,
      "loss": 0.2986,
      "step": 198
    },
    {
      "epoch": 2.9264705882352944,
      "grad_norm": 0.09681306300823637,
      "learning_rate": 3.5084468373411444e-05,
      "loss": 0.3013,
      "step": 199
    },
    {
      "epoch": 2.9411764705882355,
      "grad_norm": 0.09249395424589109,
      "learning_rate": 3.467718151934187e-05,
      "loss": 0.2972,
      "step": 200
    },
    {
      "epoch": 2.9558823529411766,
      "grad_norm": 0.09703820251292773,
      "learning_rate": 3.427045570644515e-05,
      "loss": 0.2979,
      "step": 201
    },
    {
      "epoch": 2.9705882352941178,
      "grad_norm": 0.10540258782003765,
      "learning_rate": 3.386433380485258e-05,
      "loss": 0.2992,
      "step": 202
    },
    {
      "epoch": 2.985294117647059,
      "grad_norm": 0.08015062850340511,
      "learning_rate": 3.34588586210414e-05,
      "loss": 0.3011,
      "step": 203
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.14706722768632058,
      "learning_rate": 3.305407289332279e-05,
      "loss": 0.2806,
      "step": 204
    },
    {
      "epoch": 3.014705882352941,
      "grad_norm": 0.10603496844146229,
      "learning_rate": 3.2650019287337184e-05,
      "loss": 0.2799,
      "step": 205
    },
    {
      "epoch": 3.0294117647058822,
      "grad_norm": 0.15530639149269276,
      "learning_rate": 3.22467403915572e-05,
      "loss": 0.2781,
      "step": 206
    },
    {
      "epoch": 3.0441176470588234,
      "grad_norm": 0.1421223150968687,
      "learning_rate": 3.184427871279863e-05,
      "loss": 0.282,
      "step": 207
    },
    {
      "epoch": 3.0588235294117645,
      "grad_norm": 0.14419949652005803,
      "learning_rate": 3.144267667174011e-05,
      "loss": 0.2776,
      "step": 208
    },
    {
      "epoch": 3.073529411764706,
      "grad_norm": 0.12994406412161302,
      "learning_rate": 3.1041976598451884e-05,
      "loss": 0.2781,
      "step": 209
    },
    {
      "epoch": 3.088235294117647,
      "grad_norm": 0.14614171687593527,
      "learning_rate": 3.064222072793407e-05,
      "loss": 0.2765,
      "step": 210
    },
    {
      "epoch": 3.1029411764705883,
      "grad_norm": 0.12016869114374024,
      "learning_rate": 3.0243451195664914e-05,
      "loss": 0.2783,
      "step": 211
    },
    {
      "epoch": 3.1176470588235294,
      "grad_norm": 0.1310851029344073,
      "learning_rate": 2.984571003315969e-05,
      "loss": 0.2781,
      "step": 212
    },
    {
      "epoch": 3.1323529411764706,
      "grad_norm": 0.11922587006224326,
      "learning_rate": 2.944903916354032e-05,
      "loss": 0.2795,
      "step": 213
    },
    {
      "epoch": 3.1470588235294117,
      "grad_norm": 0.12710795880675843,
      "learning_rate": 2.905348039711669e-05,
      "loss": 0.2784,
      "step": 214
    },
    {
      "epoch": 3.161764705882353,
      "grad_norm": 0.11445734817339984,
      "learning_rate": 2.865907542697957e-05,
      "loss": 0.2758,
      "step": 215
    },
    {
      "epoch": 3.176470588235294,
      "grad_norm": 0.1300689688100464,
      "learning_rate": 2.8265865824606165e-05,
      "loss": 0.2758,
      "step": 216
    },
    {
      "epoch": 3.1911764705882355,
      "grad_norm": 0.09831374725437492,
      "learning_rate": 2.7873893035478265e-05,
      "loss": 0.2748,
      "step": 217
    },
    {
      "epoch": 3.2058823529411766,
      "grad_norm": 0.1272913228663667,
      "learning_rate": 2.7483198374713836e-05,
      "loss": 0.2746,
      "step": 218
    },
    {
      "epoch": 3.2205882352941178,
      "grad_norm": 0.0993483978526809,
      "learning_rate": 2.7093823022712217e-05,
      "loss": 0.2739,
      "step": 219
    },
    {
      "epoch": 3.235294117647059,
      "grad_norm": 0.09886695511684795,
      "learning_rate": 2.6705808020813622e-05,
      "loss": 0.2832,
      "step": 220
    },
    {
      "epoch": 3.25,
      "grad_norm": 0.0998986520318052,
      "learning_rate": 2.6319194266973256e-05,
      "loss": 0.2743,
      "step": 221
    },
    {
      "epoch": 3.264705882352941,
      "grad_norm": 0.0964816428580283,
      "learning_rate": 2.5934022511450528e-05,
      "loss": 0.2762,
      "step": 222
    },
    {
      "epoch": 3.2794117647058822,
      "grad_norm": 0.09255980694012796,
      "learning_rate": 2.5550333352513885e-05,
      "loss": 0.2782,
      "step": 223
    },
    {
      "epoch": 3.2941176470588234,
      "grad_norm": 0.0988289109757557,
      "learning_rate": 2.5168167232161574e-05,
      "loss": 0.2748,
      "step": 224
    },
    {
      "epoch": 3.3088235294117645,
      "grad_norm": 0.0884234456508421,
      "learning_rate": 2.4787564431858977e-05,
      "loss": 0.2753,
      "step": 225
    },
    {
      "epoch": 3.323529411764706,
      "grad_norm": 0.09774336930093334,
      "learning_rate": 2.4408565068292827e-05,
      "loss": 0.2751,
      "step": 226
    },
    {
      "epoch": 3.338235294117647,
      "grad_norm": 0.0883002824327121,
      "learning_rate": 2.4031209089142773e-05,
      "loss": 0.2773,
      "step": 227
    },
    {
      "epoch": 3.3529411764705883,
      "grad_norm": 0.09215211229010013,
      "learning_rate": 2.3655536268870744e-05,
      "loss": 0.2752,
      "step": 228
    },
    {
      "epoch": 3.3676470588235294,
      "grad_norm": 0.08094179064778044,
      "learning_rate": 2.328158620452868e-05,
      "loss": 0.2729,
      "step": 229
    },
    {
      "epoch": 3.3823529411764706,
      "grad_norm": 0.09692135730954927,
      "learning_rate": 2.2909398311584775e-05,
      "loss": 0.2731,
      "step": 230
    },
    {
      "epoch": 3.3970588235294117,
      "grad_norm": 0.0813183018452283,
      "learning_rate": 2.2539011819769056e-05,
      "loss": 0.2782,
      "step": 231
    },
    {
      "epoch": 3.411764705882353,
      "grad_norm": 0.08149910156743696,
      "learning_rate": 2.2170465768938473e-05,
      "loss": 0.275,
      "step": 232
    },
    {
      "epoch": 3.426470588235294,
      "grad_norm": 0.08503586167349093,
      "learning_rate": 2.1803799004961824e-05,
      "loss": 0.2766,
      "step": 233
    },
    {
      "epoch": 3.4411764705882355,
      "grad_norm": 0.07599564665486494,
      "learning_rate": 2.1439050175625474e-05,
      "loss": 0.2759,
      "step": 234
    },
    {
      "epoch": 3.4558823529411766,
      "grad_norm": 0.08451424035355402,
      "learning_rate": 2.1076257726559603e-05,
      "loss": 0.2795,
      "step": 235
    },
    {
      "epoch": 3.4705882352941178,
      "grad_norm": 0.07467914550010295,
      "learning_rate": 2.0715459897186046e-05,
      "loss": 0.2767,
      "step": 236
    },
    {
      "epoch": 3.485294117647059,
      "grad_norm": 0.09085541610657201,
      "learning_rate": 2.0356694716687687e-05,
      "loss": 0.2785,
      "step": 237
    },
    {
      "epoch": 3.5,
      "grad_norm": 0.0734615712405951,
      "learning_rate": 2.0000000000000012e-05,
      "loss": 0.2739,
      "step": 238
    },
    {
      "epoch": 3.514705882352941,
      "grad_norm": 0.07717163991035296,
      "learning_rate": 1.964541334382541e-05,
      "loss": 0.2729,
      "step": 239
    },
    {
      "epoch": 3.5294117647058822,
      "grad_norm": 0.07496301816518236,
      "learning_rate": 1.9292972122670303e-05,
      "loss": 0.2752,
      "step": 240
    },
    {
      "epoch": 3.5441176470588234,
      "grad_norm": 0.08080347359960849,
      "learning_rate": 1.8942713484905762e-05,
      "loss": 0.2801,
      "step": 241
    },
    {
      "epoch": 3.5588235294117645,
      "grad_norm": 0.0738257225960406,
      "learning_rate": 1.8594674348851992e-05,
      "loss": 0.2767,
      "step": 242
    },
    {
      "epoch": 3.5735294117647056,
      "grad_norm": 0.07389506773645138,
      "learning_rate": 1.824889139888694e-05,
      "loss": 0.2773,
      "step": 243
    },
    {
      "epoch": 3.588235294117647,
      "grad_norm": 0.070294801390775,
      "learning_rate": 1.790540108157977e-05,
      "loss": 0.2763,
      "step": 244
    },
    {
      "epoch": 3.6029411764705883,
      "grad_norm": 0.068704655766595,
      "learning_rate": 1.756423960184922e-05,
      "loss": 0.2781,
      "step": 245
    },
    {
      "epoch": 3.6176470588235294,
      "grad_norm": 0.06594242125553404,
      "learning_rate": 1.7225442919147467e-05,
      "loss": 0.2757,
      "step": 246
    },
    {
      "epoch": 3.6323529411764706,
      "grad_norm": 0.06969707402390993,
      "learning_rate": 1.6889046743669957e-05,
      "loss": 0.2776,
      "step": 247
    },
    {
      "epoch": 3.6470588235294117,
      "grad_norm": 0.06492677835336866,
      "learning_rate": 1.6555086532591425e-05,
      "loss": 0.2781,
      "step": 248
    },
    {
      "epoch": 3.661764705882353,
      "grad_norm": 0.06331911499148075,
      "learning_rate": 1.6223597486328534e-05,
      "loss": 0.279,
      "step": 249
    },
    {
      "epoch": 3.6764705882352944,
      "grad_norm": 0.06722098051730128,
      "learning_rate": 1.589461454482975e-05,
      "loss": 0.2802,
      "step": 250
    },
    {
      "epoch": 3.6911764705882355,
      "grad_norm": 0.06266474278629547,
      "learning_rate": 1.556817238389249e-05,
      "loss": 0.2781,
      "step": 251
    },
    {
      "epoch": 3.7058823529411766,
      "grad_norm": 0.07000155789305507,
      "learning_rate": 1.5244305411508217e-05,
      "loss": 0.278,
      "step": 252
    },
    {
      "epoch": 3.7205882352941178,
      "grad_norm": 0.061574644987438566,
      "learning_rate": 1.4923047764235752e-05,
      "loss": 0.2767,
      "step": 253
    },
    {
      "epoch": 3.735294117647059,
      "grad_norm": 0.0682684963649869,
      "learning_rate": 1.4604433303603092e-05,
      "loss": 0.2732,
      "step": 254
    },
    {
      "epoch": 3.75,
      "grad_norm": 0.0642818898133848,
      "learning_rate": 1.4288495612538427e-05,
      "loss": 0.2743,
      "step": 255
    },
    {
      "epoch": 3.764705882352941,
      "grad_norm": 0.06713851434875447,
      "learning_rate": 1.3975267991830327e-05,
      "loss": 0.2817,
      "step": 256
    },
    {
      "epoch": 3.7794117647058822,
      "grad_norm": 0.06828899506125703,
      "learning_rate": 1.3664783456617703e-05,
      "loss": 0.2725,
      "step": 257
    },
    {
      "epoch": 3.7941176470588234,
      "grad_norm": 0.07069453573008218,
      "learning_rate": 1.3357074732909996e-05,
      "loss": 0.2775,
      "step": 258
    },
    {
      "epoch": 3.8088235294117645,
      "grad_norm": 0.06390051742059577,
      "learning_rate": 1.3052174254137713e-05,
      "loss": 0.2771,
      "step": 259
    },
    {
      "epoch": 3.8235294117647056,
      "grad_norm": 0.06129666220940109,
      "learning_rate": 1.275011415773383e-05,
      "loss": 0.2789,
      "step": 260
    },
    {
      "epoch": 3.838235294117647,
      "grad_norm": 0.065817213580326,
      "learning_rate": 1.2450926281746458e-05,
      "loss": 0.274,
      "step": 261
    },
    {
      "epoch": 3.8529411764705883,
      "grad_norm": 0.06519904576862424,
      "learning_rate": 1.2154642161482939e-05,
      "loss": 0.2771,
      "step": 262
    },
    {
      "epoch": 3.8676470588235294,
      "grad_norm": 0.06313901880567484,
      "learning_rate": 1.1861293026186007e-05,
      "loss": 0.2754,
      "step": 263
    },
    {
      "epoch": 3.8823529411764706,
      "grad_norm": 0.06296057703800705,
      "learning_rate": 1.1570909795742118e-05,
      "loss": 0.2732,
      "step": 264
    },
    {
      "epoch": 3.8970588235294117,
      "grad_norm": 0.06544715501817833,
      "learning_rate": 1.1283523077422327e-05,
      "loss": 0.2797,
      "step": 265
    },
    {
      "epoch": 3.911764705882353,
      "grad_norm": 0.0650053871447722,
      "learning_rate": 1.0999163162656296e-05,
      "loss": 0.279,
      "step": 266
    },
    {
      "epoch": 3.9264705882352944,
      "grad_norm": 0.06600064138970108,
      "learning_rate": 1.0717860023839424e-05,
      "loss": 0.276,
      "step": 267
    },
    {
      "epoch": 3.9411764705882355,
      "grad_norm": 0.06210716465473454,
      "learning_rate": 1.0439643311173642e-05,
      "loss": 0.2768,
      "step": 268
    },
    {
      "epoch": 3.9558823529411766,
      "grad_norm": 0.06401894182204264,
      "learning_rate": 1.0164542349542273e-05,
      "loss": 0.2788,
      "step": 269
    },
    {
      "epoch": 3.9705882352941178,
      "grad_norm": 0.06372281222639016,
      "learning_rate": 9.892586135419022e-06,
      "loss": 0.2777,
      "step": 270
    },
    {
      "epoch": 3.985294117647059,
      "grad_norm": 0.05892561579737352,
      "learning_rate": 9.623803333811713e-06,
      "loss": 0.2771,
      "step": 271
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.12086246389399692,
      "learning_rate": 9.358222275240884e-06,
      "loss": 0.2599,
      "step": 272
    },
    {
      "epoch": 4.014705882352941,
      "grad_norm": 0.08602389508274347,
      "learning_rate": 9.095870952753647e-06,
      "loss": 0.2593,
      "step": 273
    },
    {
      "epoch": 4.029411764705882,
      "grad_norm": 0.07197077150088015,
      "learning_rate": 8.83677701897318e-06,
      "loss": 0.262,
      "step": 274
    },
    {
      "epoch": 4.044117647058823,
      "grad_norm": 0.0968580360252349,
      "learning_rate": 8.580967783184055e-06,
      "loss": 0.261,
      "step": 275
    },
    {
      "epoch": 4.0588235294117645,
      "grad_norm": 0.09014393537599821,
      "learning_rate": 8.328470208453683e-06,
      "loss": 0.2622,
      "step": 276
    },
    {
      "epoch": 4.073529411764706,
      "grad_norm": 0.0789945543394142,
      "learning_rate": 8.07931090879042e-06,
      "loss": 0.2577,
      "step": 277
    },
    {
      "epoch": 4.088235294117647,
      "grad_norm": 0.08815856516590073,
      "learning_rate": 7.833516146338329e-06,
      "loss": 0.2617,
      "step": 278
    },
    {
      "epoch": 4.102941176470588,
      "grad_norm": 0.08015266822256034,
      "learning_rate": 7.591111828609059e-06,
      "loss": 0.2641,
      "step": 279
    },
    {
      "epoch": 4.117647058823529,
      "grad_norm": 0.08154629673882077,
      "learning_rate": 7.3521235057511364e-06,
      "loss": 0.2638,
      "step": 280
    },
    {
      "epoch": 4.132352941176471,
      "grad_norm": 0.08290035992126818,
      "learning_rate": 7.116576367856871e-06,
      "loss": 0.2606,
      "step": 281
    },
    {
      "epoch": 4.147058823529412,
      "grad_norm": 0.07674741527367407,
      "learning_rate": 6.884495242307285e-06,
      "loss": 0.2613,
      "step": 282
    },
    {
      "epoch": 4.161764705882353,
      "grad_norm": 0.07292754427620451,
      "learning_rate": 6.655904591155224e-06,
      "loss": 0.2618,
      "step": 283
    },
    {
      "epoch": 4.176470588235294,
      "grad_norm": 0.0736899911159241,
      "learning_rate": 6.430828508546936e-06,
      "loss": 0.2637,
      "step": 284
    },
    {
      "epoch": 4.1911764705882355,
      "grad_norm": 0.07217050587422956,
      "learning_rate": 6.209290718182539e-06,
      "loss": 0.2615,
      "step": 285
    },
    {
      "epoch": 4.205882352941177,
      "grad_norm": 0.07553232921098052,
      "learning_rate": 5.991314570815441e-06,
      "loss": 0.265,
      "step": 286
    },
    {
      "epoch": 4.220588235294118,
      "grad_norm": 0.06886285952389541,
      "learning_rate": 5.776923041791076e-06,
      "loss": 0.2602,
      "step": 287
    },
    {
      "epoch": 4.235294117647059,
      "grad_norm": 0.06173078424810273,
      "learning_rate": 5.566138728625294e-06,
      "loss": 0.2575,
      "step": 288
    },
    {
      "epoch": 4.25,
      "grad_norm": 0.06592763687997258,
      "learning_rate": 5.358983848622452e-06,
      "loss": 0.2566,
      "step": 289
    },
    {
      "epoch": 4.264705882352941,
      "grad_norm": 0.06767164583052036,
      "learning_rate": 5.15548023653369e-06,
      "loss": 0.2595,
      "step": 290
    },
    {
      "epoch": 4.279411764705882,
      "grad_norm": 0.06893875689961178,
      "learning_rate": 4.955649342255462e-06,
      "loss": 0.2622,
      "step": 291
    },
    {
      "epoch": 4.294117647058823,
      "grad_norm": 0.0627731194283305,
      "learning_rate": 4.7595122285686215e-06,
      "loss": 0.2605,
      "step": 292
    },
    {
      "epoch": 4.3088235294117645,
      "grad_norm": 0.06183425105989223,
      "learning_rate": 4.567089568918403e-06,
      "loss": 0.262,
      "step": 293
    },
    {
      "epoch": 4.323529411764706,
      "grad_norm": 0.08304190651695828,
      "learning_rate": 4.3784016452353526e-06,
      "loss": 0.2577,
      "step": 294
    },
    {
      "epoch": 4.338235294117647,
      "grad_norm": 0.06162011756699104,
      "learning_rate": 4.193468345797511e-06,
      "loss": 0.2626,
      "step": 295
    },
    {
      "epoch": 4.352941176470588,
      "grad_norm": 0.06077784990223975,
      "learning_rate": 4.012309163134194e-06,
      "loss": 0.2631,
      "step": 296
    },
    {
      "epoch": 4.367647058823529,
      "grad_norm": 0.059344232351091784,
      "learning_rate": 3.8349431919713655e-06,
      "loss": 0.2606,
      "step": 297
    },
    {
      "epoch": 4.382352941176471,
      "grad_norm": 0.053110563118761965,
      "learning_rate": 3.6613891272190506e-06,
      "loss": 0.2584,
      "step": 298
    },
    {
      "epoch": 4.397058823529412,
      "grad_norm": 0.0550813714546073,
      "learning_rate": 3.49166526200079e-06,
      "loss": 0.2623,
      "step": 299
    },
    {
      "epoch": 4.411764705882353,
      "grad_norm": 0.05497952308193732,
      "learning_rate": 3.325789485725488e-06,
      "loss": 0.2607,
      "step": 300
    },
    {
      "epoch": 4.426470588235294,
      "grad_norm": 0.056505023992606394,
      "learning_rate": 3.163779282201853e-06,
      "loss": 0.2648,
      "step": 301
    },
    {
      "epoch": 4.4411764705882355,
      "grad_norm": 0.055167426784272444,
      "learning_rate": 3.0056517277955357e-06,
      "loss": 0.2612,
      "step": 302
    },
    {
      "epoch": 4.455882352941177,
      "grad_norm": 0.052033455176057224,
      "learning_rate": 2.8514234896291904e-06,
      "loss": 0.2617,
      "step": 303
    },
    {
      "epoch": 4.470588235294118,
      "grad_norm": 0.05049118136446942,
      "learning_rate": 2.7011108238257723e-06,
      "loss": 0.2656,
      "step": 304
    },
    {
      "epoch": 4.485294117647059,
      "grad_norm": 0.04973325645683792,
      "learning_rate": 2.5547295737950475e-06,
      "loss": 0.2651,
      "step": 305
    },
    {
      "epoch": 4.5,
      "grad_norm": 0.05470312702591553,
      "learning_rate": 2.4122951685636674e-06,
      "loss": 0.2647,
      "step": 306
    },
    {
      "epoch": 4.514705882352941,
      "grad_norm": 0.04975562581567009,
      "learning_rate": 2.2738226211489024e-06,
      "loss": 0.2588,
      "step": 307
    },
    {
      "epoch": 4.529411764705882,
      "grad_norm": 0.04990586736807332,
      "learning_rate": 2.1393265269762197e-06,
      "loss": 0.2628,
      "step": 308
    },
    {
      "epoch": 4.544117647058823,
      "grad_norm": 0.048763186017272454,
      "learning_rate": 2.008821062340891e-06,
      "loss": 0.2621,
      "step": 309
    },
    {
      "epoch": 4.5588235294117645,
      "grad_norm": 0.04772811522858683,
      "learning_rate": 1.8823199829137406e-06,
      "loss": 0.2604,
      "step": 310
    },
    {
      "epoch": 4.573529411764706,
      "grad_norm": 0.04514011191017183,
      "learning_rate": 1.7598366222912933e-06,
      "loss": 0.2626,
      "step": 311
    },
    {
      "epoch": 4.588235294117647,
      "grad_norm": 0.0447568436808381,
      "learning_rate": 1.6413838905903556e-06,
      "loss": 0.2567,
      "step": 312
    },
    {
      "epoch": 4.602941176470588,
      "grad_norm": 0.04828872191610463,
      "learning_rate": 1.5269742730872384e-06,
      "loss": 0.2618,
      "step": 313
    },
    {
      "epoch": 4.617647058823529,
      "grad_norm": 0.048070892531791296,
      "learning_rate": 1.4166198289017952e-06,
      "loss": 0.2624,
      "step": 314
    },
    {
      "epoch": 4.632352941176471,
      "grad_norm": 0.0481947096054082,
      "learning_rate": 1.3103321897263421e-06,
      "loss": 0.2624,
      "step": 315
    },
    {
      "epoch": 4.647058823529412,
      "grad_norm": 0.0498006739485343,
      "learning_rate": 1.2081225585996248e-06,
      "loss": 0.2594,
      "step": 316
    },
    {
      "epoch": 4.661764705882353,
      "grad_norm": 0.046785729335567995,
      "learning_rate": 1.1100017087260205e-06,
      "loss": 0.2622,
      "step": 317
    },
    {
      "epoch": 4.676470588235294,
      "grad_norm": 0.04518076179941004,
      "learning_rate": 1.015979982339994e-06,
      "loss": 0.2636,
      "step": 318
    },
    {
      "epoch": 4.6911764705882355,
      "grad_norm": 0.04627910929165888,
      "learning_rate": 9.260672896159728e-07,
      "loss": 0.2603,
      "step": 319
    },
    {
      "epoch": 4.705882352941177,
      "grad_norm": 0.04505892744233274,
      "learning_rate": 8.402731076238191e-07,
      "loss": 0.2606,
      "step": 320
    },
    {
      "epoch": 4.720588235294118,
      "grad_norm": 0.04552316822622753,
      "learning_rate": 7.586064793298998e-07,
      "loss": 0.2593,
      "step": 321
    },
    {
      "epoch": 4.735294117647059,
      "grad_norm": 0.045512369955118724,
      "learning_rate": 6.810760126439287e-07,
      "loss": 0.2654,
      "step": 322
    },
    {
      "epoch": 4.75,
      "grad_norm": 0.0451772973693403,
      "learning_rate": 6.076898795116792e-07,
      "loss": 0.2621,
      "step": 323
    },
    {
      "epoch": 4.764705882352941,
      "grad_norm": 0.044079852747311694,
      "learning_rate": 5.384558150536201e-07,
      "loss": 0.2615,
      "step": 324
    },
    {
      "epoch": 4.779411764705882,
      "grad_norm": 0.04455441736236055,
      "learning_rate": 4.7338111674962495e-07,
      "loss": 0.2606,
      "step": 325
    },
    {
      "epoch": 4.794117647058823,
      "grad_norm": 0.04405689466623866,
      "learning_rate": 4.124726436697879e-07,
      "loss": 0.2621,
      "step": 326
    },
    {
      "epoch": 4.8088235294117645,
      "grad_norm": 0.044529538816460816,
      "learning_rate": 3.557368157514596e-07,
      "loss": 0.2646,
      "step": 327
    },
    {
      "epoch": 4.823529411764706,
      "grad_norm": 0.04553425966459204,
      "learning_rate": 3.031796131225706e-07,
      "loss": 0.2625,
      "step": 328
    },
    {
      "epoch": 4.838235294117647,
      "grad_norm": 0.04331163075687581,
      "learning_rate": 2.548065754712914e-07,
      "loss": 0.2592,
      "step": 329
    },
    {
      "epoch": 4.852941176470588,
      "grad_norm": 0.045474788941501976,
      "learning_rate": 2.1062280146215252e-07,
      "loss": 0.2625,
      "step": 330
    },
    {
      "epoch": 4.867647058823529,
      "grad_norm": 0.04615519000321619,
      "learning_rate": 1.706329481986213e-07,
      "loss": 0.2635,
      "step": 331
    },
    {
      "epoch": 4.882352941176471,
      "grad_norm": 0.04427563364549322,
      "learning_rate": 1.3484123073222332e-07,
      "loss": 0.2637,
      "step": 332
    },
    {
      "epoch": 4.897058823529412,
      "grad_norm": 0.04265147614885937,
      "learning_rate": 1.0325142161827561e-07,
      "loss": 0.2637,
      "step": 333
    },
    {
      "epoch": 4.911764705882353,
      "grad_norm": 0.04295525693989313,
      "learning_rate": 7.586685051823584e-08,
      "loss": 0.2586,
      "step": 334
    },
    {
      "epoch": 4.926470588235294,
      "grad_norm": 0.04243683788724124,
      "learning_rate": 5.2690403848760785e-08,
      "loss": 0.2603,
      "step": 335
    },
    {
      "epoch": 4.9411764705882355,
      "grad_norm": 0.043654171470091575,
      "learning_rate": 3.3724524477447564e-08,
      "loss": 0.2622,
      "step": 336
    },
    {
      "epoch": 4.955882352941177,
      "grad_norm": 0.04272527321797204,
      "learning_rate": 1.897121146536396e-08,
      "loss": 0.2542,
      "step": 337
    },
    {
      "epoch": 4.970588235294118,
      "grad_norm": 0.04827925979026659,
      "learning_rate": 8.432019856345896e-09,
      "loss": 0.2582,
      "step": 338
    },
    {
      "epoch": 4.985294117647059,
      "grad_norm": 0.042423900810093146,
      "learning_rate": 2.1080605130752162e-09,
      "loss": 0.2564,
      "step": 339
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.07128915321863381,
      "learning_rate": 0.0,
      "loss": 0.2511,
      "step": 340
    },
    {
      "epoch": 5.0,
      "step": 340,
      "total_flos": 1.915314895847424e+16,
      "train_loss": 0.3188589934040518,
      "train_runtime": 19815.4609,
      "train_samples_per_second": 8.67,
      "train_steps_per_second": 0.017
    }
  ],
  "logging_steps": 1,
  "max_steps": 340,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.915314895847424e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}