{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.25, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005, "grad_norm": 6.1962890625, "learning_rate": 9.995e-07, "loss": -0.0, "step": 1 }, { "epoch": 0.001, "grad_norm": 6.744086742401123, "learning_rate": 9.989999999999999e-07, "loss": -0.0, "step": 2 }, { "epoch": 0.0015, "grad_norm": 6.945072174072266, "learning_rate": 9.985e-07, "loss": 0.0, "step": 3 }, { "epoch": 0.002, "grad_norm": 6.354312419891357, "learning_rate": 9.98e-07, "loss": -0.0, "step": 4 }, { "epoch": 0.0025, "grad_norm": 5.802479267120361, "learning_rate": 9.975e-07, "loss": 0.0, "step": 5 }, { "epoch": 0.003, "grad_norm": 4.5852274894714355, "learning_rate": 9.97e-07, "loss": 0.0, "step": 6 }, { "epoch": 0.0035, "grad_norm": 7.049472332000732, "learning_rate": 9.965e-07, "loss": 0.0, "step": 7 }, { "epoch": 0.004, "grad_norm": 21.362648010253906, "learning_rate": 9.959999999999999e-07, "loss": -0.0, "step": 8 }, { "epoch": 0.0045, "grad_norm": 5.594510555267334, "learning_rate": 9.955e-07, "loss": 0.0, "step": 9 }, { "epoch": 0.005, "grad_norm": 5.9653730392456055, "learning_rate": 9.95e-07, "loss": 0.0, "step": 10 }, { "epoch": 0.0055, "grad_norm": 5.095400333404541, "learning_rate": 9.945e-07, "loss": -0.0, "step": 11 }, { "epoch": 0.006, "grad_norm": 0.0, "learning_rate": 9.94e-07, "loss": 0.0, "step": 12 }, { "epoch": 0.0065, "grad_norm": 10.911425590515137, "learning_rate": 9.935e-07, "loss": -0.0, "step": 13 }, { "epoch": 0.007, "grad_norm": 9.652170181274414, "learning_rate": 9.929999999999999e-07, "loss": 0.0, "step": 14 }, { "epoch": 0.0075, "grad_norm": 6.956664562225342, "learning_rate": 9.925e-07, "loss": 0.0, "step": 15 }, { "epoch": 0.008, "grad_norm": 12.070667266845703, "learning_rate": 9.92e-07, "loss": 0.0, "step": 16 }, { "epoch": 0.0085, "grad_norm": 14.007853507995605, "learning_rate": 9.915e-07, "loss": 0.0, "step": 17 }, { "epoch": 0.009, "grad_norm": 4.017375469207764, "learning_rate": 9.91e-07, "loss": 0.0, "step": 18 }, { "epoch": 0.0095, "grad_norm": 0.0, "learning_rate": 9.905e-07, "loss": 0.0, "step": 19 }, { "epoch": 0.01, "grad_norm": 6.546974182128906, "learning_rate": 9.9e-07, "loss": 0.0, "step": 20 }, { "epoch": 0.0105, "grad_norm": 7.551206588745117, "learning_rate": 9.895e-07, "loss": -0.0, "step": 21 }, { "epoch": 0.011, "grad_norm": 0.0, "learning_rate": 9.89e-07, "loss": 0.0, "step": 22 }, { "epoch": 0.0115, "grad_norm": 6.233001232147217, "learning_rate": 9.885e-07, "loss": -0.0, "step": 23 }, { "epoch": 0.012, "grad_norm": 0.0, "learning_rate": 9.88e-07, "loss": 0.0, "step": 24 }, { "epoch": 0.0125, "grad_norm": 7.307622909545898, "learning_rate": 9.875e-07, "loss": -0.0, "step": 25 }, { "epoch": 0.013, "grad_norm": 5.898115158081055, "learning_rate": 9.87e-07, "loss": -0.0, "step": 26 }, { "epoch": 0.0135, "grad_norm": 8.286269187927246, "learning_rate": 9.865e-07, "loss": 0.0, "step": 27 }, { "epoch": 0.014, "grad_norm": 9.178420066833496, "learning_rate": 9.86e-07, "loss": 0.0, "step": 28 }, { "epoch": 0.0145, "grad_norm": 7.090274810791016, "learning_rate": 9.855e-07, "loss": 0.0, "step": 29 }, { "epoch": 0.015, "grad_norm": 10.001739501953125, "learning_rate": 9.849999999999999e-07, "loss": 0.0, "step": 30 }, { "epoch": 0.0155, "grad_norm": 8.978482246398926, "learning_rate": 9.845e-07, "loss": 0.0, "step": 31 }, { "epoch": 0.016, "grad_norm": 8.083369255065918, "learning_rate": 9.84e-07, "loss": -0.0, "step": 32 }, { "epoch": 0.0165, "grad_norm": 9.646997451782227, "learning_rate": 9.835e-07, "loss": 0.0, "step": 33 }, { "epoch": 0.017, "grad_norm": 6.892234802246094, "learning_rate": 9.83e-07, "loss": 0.0, "step": 34 }, { "epoch": 0.0175, "grad_norm": 0.0, "learning_rate": 9.825e-07, "loss": 0.0, "step": 35 }, { "epoch": 0.018, "grad_norm": 6.182197570800781, "learning_rate": 9.819999999999999e-07, "loss": 0.0, "step": 36 }, { "epoch": 0.0185, "grad_norm": 5.895266532897949, "learning_rate": 9.815e-07, "loss": -0.0, "step": 37 }, { "epoch": 0.019, "grad_norm": 11.212841033935547, "learning_rate": 9.81e-07, "loss": -0.0, "step": 38 }, { "epoch": 0.0195, "grad_norm": 7.982095241546631, "learning_rate": 9.805e-07, "loss": 0.0, "step": 39 }, { "epoch": 0.02, "grad_norm": 5.73940896987915, "learning_rate": 9.8e-07, "loss": 0.0, "step": 40 }, { "epoch": 0.0205, "grad_norm": 8.540511131286621, "learning_rate": 9.795e-07, "loss": -0.0, "step": 41 }, { "epoch": 0.021, "grad_norm": 0.0, "learning_rate": 9.789999999999999e-07, "loss": 0.0, "step": 42 }, { "epoch": 0.0215, "grad_norm": 8.709277153015137, "learning_rate": 9.785e-07, "loss": -0.0, "step": 43 }, { "epoch": 0.022, "grad_norm": 6.68982458114624, "learning_rate": 9.78e-07, "loss": 0.0, "step": 44 }, { "epoch": 0.0225, "grad_norm": 6.988176345825195, "learning_rate": 9.775e-07, "loss": 0.0, "step": 45 }, { "epoch": 0.023, "grad_norm": 7.0302910804748535, "learning_rate": 9.77e-07, "loss": 0.0, "step": 46 }, { "epoch": 0.0235, "grad_norm": 8.396454811096191, "learning_rate": 9.765e-07, "loss": -0.0, "step": 47 }, { "epoch": 0.024, "grad_norm": 4.7376227378845215, "learning_rate": 9.759999999999998e-07, "loss": -0.0, "step": 48 }, { "epoch": 0.0245, "grad_norm": 0.0, "learning_rate": 9.755e-07, "loss": 0.0, "step": 49 }, { "epoch": 0.025, "grad_norm": 6.381641387939453, "learning_rate": 9.75e-07, "loss": 0.0, "step": 50 }, { "epoch": 0.0255, "grad_norm": 0.0, "learning_rate": 9.745e-07, "loss": 0.0, "step": 51 }, { "epoch": 0.026, "grad_norm": 8.140380859375, "learning_rate": 9.74e-07, "loss": -0.0, "step": 52 }, { "epoch": 0.0265, "grad_norm": 4.727418899536133, "learning_rate": 9.735e-07, "loss": 0.0, "step": 53 }, { "epoch": 0.027, "grad_norm": 6.386085510253906, "learning_rate": 9.729999999999998e-07, "loss": -0.0, "step": 54 }, { "epoch": 0.0275, "grad_norm": 6.39836311340332, "learning_rate": 9.725e-07, "loss": 0.0, "step": 55 }, { "epoch": 0.028, "grad_norm": 5.749513149261475, "learning_rate": 9.72e-07, "loss": -0.0, "step": 56 }, { "epoch": 0.0285, "grad_norm": 4.699296474456787, "learning_rate": 9.715e-07, "loss": -0.0, "step": 57 }, { "epoch": 0.029, "grad_norm": 8.458806037902832, "learning_rate": 9.709999999999999e-07, "loss": -0.0, "step": 58 }, { "epoch": 0.0295, "grad_norm": 9.1854248046875, "learning_rate": 9.705e-07, "loss": -0.0, "step": 59 }, { "epoch": 0.03, "grad_norm": 6.844909191131592, "learning_rate": 9.7e-07, "loss": 0.0, "step": 60 }, { "epoch": 0.0305, "grad_norm": 33.0734977722168, "learning_rate": 9.695e-07, "loss": 0.0, "step": 61 }, { "epoch": 0.031, "grad_norm": 0.0, "learning_rate": 9.69e-07, "loss": 0.0, "step": 62 }, { "epoch": 0.0315, "grad_norm": 7.425229072570801, "learning_rate": 9.685e-07, "loss": 0.0, "step": 63 }, { "epoch": 0.032, "grad_norm": 9.169403076171875, "learning_rate": 9.679999999999999e-07, "loss": -0.0, "step": 64 }, { "epoch": 0.0325, "grad_norm": 13.490100860595703, "learning_rate": 9.675e-07, "loss": 0.0, "step": 65 }, { "epoch": 0.033, "grad_norm": 7.570629596710205, "learning_rate": 9.67e-07, "loss": -0.0, "step": 66 }, { "epoch": 0.0335, "grad_norm": 5.252549648284912, "learning_rate": 9.665e-07, "loss": 0.0, "step": 67 }, { "epoch": 0.034, "grad_norm": 5.543639183044434, "learning_rate": 9.66e-07, "loss": -0.0, "step": 68 }, { "epoch": 0.0345, "grad_norm": 0.0, "learning_rate": 9.655e-07, "loss": 0.0, "step": 69 }, { "epoch": 0.035, "grad_norm": 5.360587120056152, "learning_rate": 9.649999999999999e-07, "loss": 0.0, "step": 70 }, { "epoch": 0.0355, "grad_norm": 7.327621936798096, "learning_rate": 9.645e-07, "loss": 0.0, "step": 71 }, { "epoch": 0.036, "grad_norm": 9.594143867492676, "learning_rate": 9.64e-07, "loss": 0.0, "step": 72 }, { "epoch": 0.0365, "grad_norm": 5.346116065979004, "learning_rate": 9.635e-07, "loss": 0.0, "step": 73 }, { "epoch": 0.037, "grad_norm": 5.963859558105469, "learning_rate": 9.63e-07, "loss": 0.0, "step": 74 }, { "epoch": 0.0375, "grad_norm": 7.078248023986816, "learning_rate": 9.624999999999999e-07, "loss": 0.0, "step": 75 }, { "epoch": 0.038, "grad_norm": 5.854560375213623, "learning_rate": 9.619999999999999e-07, "loss": 0.0, "step": 76 }, { "epoch": 0.0385, "grad_norm": 8.13651180267334, "learning_rate": 9.615e-07, "loss": -0.0, "step": 77 }, { "epoch": 0.039, "grad_norm": 8.167058944702148, "learning_rate": 9.61e-07, "loss": -0.0, "step": 78 }, { "epoch": 0.0395, "grad_norm": 5.878276348114014, "learning_rate": 9.605e-07, "loss": 0.0, "step": 79 }, { "epoch": 0.04, "grad_norm": 12.290175437927246, "learning_rate": 9.6e-07, "loss": 0.0, "step": 80 }, { "epoch": 0.0405, "grad_norm": 4.8677496910095215, "learning_rate": 9.594999999999999e-07, "loss": 0.0, "step": 81 }, { "epoch": 0.041, "grad_norm": 9.993011474609375, "learning_rate": 9.589999999999998e-07, "loss": 0.0, "step": 82 }, { "epoch": 0.0415, "grad_norm": 7.9544477462768555, "learning_rate": 9.585e-07, "loss": 0.0, "step": 83 }, { "epoch": 0.042, "grad_norm": 8.334663391113281, "learning_rate": 9.58e-07, "loss": -0.0, "step": 84 }, { "epoch": 0.0425, "grad_norm": 21.026262283325195, "learning_rate": 9.575e-07, "loss": -0.0, "step": 85 }, { "epoch": 0.043, "grad_norm": 13.211177825927734, "learning_rate": 9.57e-07, "loss": 0.0, "step": 86 }, { "epoch": 0.0435, "grad_norm": 9.141230583190918, "learning_rate": 9.565e-07, "loss": 0.0, "step": 87 }, { "epoch": 0.044, "grad_norm": 7.934508800506592, "learning_rate": 9.559999999999998e-07, "loss": -0.0, "step": 88 }, { "epoch": 0.0445, "grad_norm": 8.56117057800293, "learning_rate": 9.555e-07, "loss": 0.0, "step": 89 }, { "epoch": 0.045, "grad_norm": 0.0, "learning_rate": 9.55e-07, "loss": 0.0, "step": 90 }, { "epoch": 0.0455, "grad_norm": 15.598448753356934, "learning_rate": 9.545e-07, "loss": 0.0, "step": 91 }, { "epoch": 0.046, "grad_norm": 9.095897674560547, "learning_rate": 9.539999999999999e-07, "loss": -0.0, "step": 92 }, { "epoch": 0.0465, "grad_norm": 4.865746974945068, "learning_rate": 9.535e-07, "loss": -0.0, "step": 93 }, { "epoch": 0.047, "grad_norm": 0.0, "learning_rate": 9.529999999999999e-07, "loss": 0.0, "step": 94 }, { "epoch": 0.0475, "grad_norm": 5.1494951248168945, "learning_rate": 9.525e-07, "loss": 0.0, "step": 95 }, { "epoch": 0.048, "grad_norm": 11.34716510772705, "learning_rate": 9.52e-07, "loss": 0.0, "step": 96 }, { "epoch": 0.0485, "grad_norm": 11.986861228942871, "learning_rate": 9.515e-07, "loss": 0.0, "step": 97 }, { "epoch": 0.049, "grad_norm": 7.944230079650879, "learning_rate": 9.509999999999999e-07, "loss": 0.0, "step": 98 }, { "epoch": 0.0495, "grad_norm": 7.5184783935546875, "learning_rate": 9.504999999999999e-07, "loss": -0.0, "step": 99 }, { "epoch": 0.05, "grad_norm": 4.20994758605957, "learning_rate": 9.499999999999999e-07, "loss": 0.0, "step": 100 }, { "epoch": 0.0505, "grad_norm": 0.0, "learning_rate": 9.495e-07, "loss": 0.0, "step": 101 }, { "epoch": 0.051, "grad_norm": 0.0, "learning_rate": 9.489999999999999e-07, "loss": 0.0, "step": 102 }, { "epoch": 0.0515, "grad_norm": 7.179519176483154, "learning_rate": 9.485e-07, "loss": -0.0, "step": 103 }, { "epoch": 0.052, "grad_norm": 8.312400817871094, "learning_rate": 9.479999999999999e-07, "loss": 0.0, "step": 104 }, { "epoch": 0.0525, "grad_norm": 0.0, "learning_rate": 9.474999999999999e-07, "loss": 0.0, "step": 105 }, { "epoch": 0.053, "grad_norm": 6.276727676391602, "learning_rate": 9.469999999999999e-07, "loss": 0.0, "step": 106 }, { "epoch": 0.0535, "grad_norm": 6.952809810638428, "learning_rate": 9.465e-07, "loss": 0.0, "step": 107 }, { "epoch": 0.054, "grad_norm": 12.95068645477295, "learning_rate": 9.459999999999999e-07, "loss": -0.0, "step": 108 }, { "epoch": 0.0545, "grad_norm": 0.0, "learning_rate": 9.455e-07, "loss": 0.0, "step": 109 }, { "epoch": 0.055, "grad_norm": 13.65576457977295, "learning_rate": 9.45e-07, "loss": 0.0, "step": 110 }, { "epoch": 0.0555, "grad_norm": 8.414222717285156, "learning_rate": 9.444999999999999e-07, "loss": 0.0, "step": 111 }, { "epoch": 0.056, "grad_norm": 7.828263759613037, "learning_rate": 9.439999999999999e-07, "loss": 0.0, "step": 112 }, { "epoch": 0.0565, "grad_norm": 0.0, "learning_rate": 9.434999999999999e-07, "loss": 0.0, "step": 113 }, { "epoch": 0.057, "grad_norm": 7.849336624145508, "learning_rate": 9.429999999999999e-07, "loss": 0.0, "step": 114 }, { "epoch": 0.0575, "grad_norm": 13.594552993774414, "learning_rate": 9.425e-07, "loss": 0.0, "step": 115 }, { "epoch": 0.058, "grad_norm": 6.633617877960205, "learning_rate": 9.419999999999999e-07, "loss": 0.0, "step": 116 }, { "epoch": 0.0585, "grad_norm": 7.893250942230225, "learning_rate": 9.415e-07, "loss": -0.0, "step": 117 }, { "epoch": 0.059, "grad_norm": 7.897842884063721, "learning_rate": 9.409999999999999e-07, "loss": -0.0, "step": 118 }, { "epoch": 0.0595, "grad_norm": 7.738225936889648, "learning_rate": 9.404999999999999e-07, "loss": 0.0, "step": 119 }, { "epoch": 0.06, "grad_norm": 10.054285049438477, "learning_rate": 9.399999999999999e-07, "loss": 0.0, "step": 120 }, { "epoch": 0.0605, "grad_norm": 6.2317328453063965, "learning_rate": 9.395e-07, "loss": 0.0, "step": 121 }, { "epoch": 0.061, "grad_norm": 7.4707207679748535, "learning_rate": 9.389999999999999e-07, "loss": -0.0, "step": 122 }, { "epoch": 0.0615, "grad_norm": 0.0, "learning_rate": 9.385e-07, "loss": 0.0, "step": 123 }, { "epoch": 0.062, "grad_norm": 6.883451461791992, "learning_rate": 9.379999999999998e-07, "loss": 0.0, "step": 124 }, { "epoch": 0.0625, "grad_norm": 5.7558274269104, "learning_rate": 9.374999999999999e-07, "loss": 0.0, "step": 125 }, { "epoch": 0.063, "grad_norm": 4.654928207397461, "learning_rate": 9.37e-07, "loss": -0.0, "step": 126 }, { "epoch": 0.0635, "grad_norm": 13.459746360778809, "learning_rate": 9.365e-07, "loss": 0.0, "step": 127 }, { "epoch": 0.064, "grad_norm": 6.189227104187012, "learning_rate": 9.36e-07, "loss": -0.0, "step": 128 }, { "epoch": 0.0645, "grad_norm": 15.807933807373047, "learning_rate": 9.355e-07, "loss": -0.0, "step": 129 }, { "epoch": 0.065, "grad_norm": 8.20335865020752, "learning_rate": 9.35e-07, "loss": -0.0, "step": 130 }, { "epoch": 0.0655, "grad_norm": 7.410068511962891, "learning_rate": 9.344999999999999e-07, "loss": 0.0, "step": 131 }, { "epoch": 0.066, "grad_norm": 5.982290744781494, "learning_rate": 9.34e-07, "loss": 0.0, "step": 132 }, { "epoch": 0.0665, "grad_norm": 7.302867889404297, "learning_rate": 9.334999999999999e-07, "loss": 0.0, "step": 133 }, { "epoch": 0.067, "grad_norm": 7.16635799407959, "learning_rate": 9.33e-07, "loss": 0.0, "step": 134 }, { "epoch": 0.0675, "grad_norm": 0.0, "learning_rate": 9.325e-07, "loss": 0.0, "step": 135 }, { "epoch": 0.068, "grad_norm": 5.66601037979126, "learning_rate": 9.32e-07, "loss": -0.0, "step": 136 }, { "epoch": 0.0685, "grad_norm": 0.0, "learning_rate": 9.315e-07, "loss": 0.0, "step": 137 }, { "epoch": 0.069, "grad_norm": 12.146499633789062, "learning_rate": 9.31e-07, "loss": -0.0, "step": 138 }, { "epoch": 0.0695, "grad_norm": 6.333805084228516, "learning_rate": 9.304999999999999e-07, "loss": 0.0, "step": 139 }, { "epoch": 0.07, "grad_norm": 17.41741943359375, "learning_rate": 9.3e-07, "loss": -0.0, "step": 140 }, { "epoch": 0.0705, "grad_norm": 0.0, "learning_rate": 9.295e-07, "loss": 0.0, "step": 141 }, { "epoch": 0.071, "grad_norm": 18.96269989013672, "learning_rate": 9.29e-07, "loss": 0.0, "step": 142 }, { "epoch": 0.0715, "grad_norm": 30.19170570373535, "learning_rate": 9.285e-07, "loss": 0.0, "step": 143 }, { "epoch": 0.072, "grad_norm": 12.67878532409668, "learning_rate": 9.28e-07, "loss": -0.0, "step": 144 }, { "epoch": 0.0725, "grad_norm": 16.92245101928711, "learning_rate": 9.274999999999999e-07, "loss": 0.0, "step": 145 }, { "epoch": 0.073, "grad_norm": 8.775379180908203, "learning_rate": 9.27e-07, "loss": 0.0, "step": 146 }, { "epoch": 0.0735, "grad_norm": 0.0, "learning_rate": 9.264999999999999e-07, "loss": 0.0, "step": 147 }, { "epoch": 0.074, "grad_norm": 12.122485160827637, "learning_rate": 9.26e-07, "loss": 0.0, "step": 148 }, { "epoch": 0.0745, "grad_norm": 41.2854118347168, "learning_rate": 9.255e-07, "loss": 0.0, "step": 149 }, { "epoch": 0.075, "grad_norm": 0.0, "learning_rate": 9.25e-07, "loss": 0.0, "step": 150 }, { "epoch": 0.0755, "grad_norm": 12.417732238769531, "learning_rate": 9.244999999999999e-07, "loss": 0.0, "step": 151 }, { "epoch": 0.076, "grad_norm": 23.242403030395508, "learning_rate": 9.24e-07, "loss": 0.0, "step": 152 }, { "epoch": 0.0765, "grad_norm": 0.0, "learning_rate": 9.234999999999999e-07, "loss": 0.0, "step": 153 }, { "epoch": 0.077, "grad_norm": 8.696711540222168, "learning_rate": 9.23e-07, "loss": -0.0, "step": 154 }, { "epoch": 0.0775, "grad_norm": 0.0, "learning_rate": 9.225e-07, "loss": 0.0, "step": 155 }, { "epoch": 0.078, "grad_norm": 0.0, "learning_rate": 9.22e-07, "loss": 0.0, "step": 156 }, { "epoch": 0.0785, "grad_norm": 12.881440162658691, "learning_rate": 9.215e-07, "loss": -0.0, "step": 157 }, { "epoch": 0.079, "grad_norm": 0.0, "learning_rate": 9.21e-07, "loss": 0.0, "step": 158 }, { "epoch": 0.0795, "grad_norm": 21.86204719543457, "learning_rate": 9.204999999999999e-07, "loss": 0.0, "step": 159 }, { "epoch": 0.08, "grad_norm": 16.32013702392578, "learning_rate": 9.2e-07, "loss": -0.0, "step": 160 }, { "epoch": 0.0805, "grad_norm": 0.0, "learning_rate": 9.194999999999999e-07, "loss": 0.0, "step": 161 }, { "epoch": 0.081, "grad_norm": 0.0, "learning_rate": 9.19e-07, "loss": 0.0, "step": 162 }, { "epoch": 0.0815, "grad_norm": 21.536087036132812, "learning_rate": 9.185e-07, "loss": 0.0, "step": 163 }, { "epoch": 0.082, "grad_norm": 15.687423706054688, "learning_rate": 9.18e-07, "loss": 0.0, "step": 164 }, { "epoch": 0.0825, "grad_norm": 0.0, "learning_rate": 9.174999999999999e-07, "loss": 0.0, "step": 165 }, { "epoch": 0.083, "grad_norm": 0.0, "learning_rate": 9.17e-07, "loss": 0.0, "step": 166 }, { "epoch": 0.0835, "grad_norm": 0.0, "learning_rate": 9.164999999999999e-07, "loss": 0.0, "step": 167 }, { "epoch": 0.084, "grad_norm": 0.0, "learning_rate": 9.16e-07, "loss": 0.0, "step": 168 }, { "epoch": 0.0845, "grad_norm": 0.0, "learning_rate": 9.155e-07, "loss": 0.0, "step": 169 }, { "epoch": 0.085, "grad_norm": 0.0, "learning_rate": 9.15e-07, "loss": 0.0, "step": 170 }, { "epoch": 0.0855, "grad_norm": 25.705774307250977, "learning_rate": 9.145e-07, "loss": -0.0, "step": 171 }, { "epoch": 0.086, "grad_norm": 21.59645652770996, "learning_rate": 9.14e-07, "loss": -0.0, "step": 172 }, { "epoch": 0.0865, "grad_norm": 10.857905387878418, "learning_rate": 9.134999999999999e-07, "loss": -0.0, "step": 173 }, { "epoch": 0.087, "grad_norm": 0.0, "learning_rate": 9.13e-07, "loss": 0.0, "step": 174 }, { "epoch": 0.0875, "grad_norm": 0.0, "learning_rate": 9.124999999999999e-07, "loss": 0.0, "step": 175 }, { "epoch": 0.088, "grad_norm": 0.0, "learning_rate": 9.12e-07, "loss": 0.0, "step": 176 }, { "epoch": 0.0885, "grad_norm": 0.0, "learning_rate": 9.115e-07, "loss": 0.0, "step": 177 }, { "epoch": 0.089, "grad_norm": 20.786745071411133, "learning_rate": 9.109999999999999e-07, "loss": 0.0, "step": 178 }, { "epoch": 0.0895, "grad_norm": 8.460957527160645, "learning_rate": 9.104999999999999e-07, "loss": -0.0, "step": 179 }, { "epoch": 0.09, "grad_norm": 0.0, "learning_rate": 9.1e-07, "loss": 0.0, "step": 180 }, { "epoch": 0.0905, "grad_norm": 0.0, "learning_rate": 9.094999999999999e-07, "loss": 0.0, "step": 181 }, { "epoch": 0.091, "grad_norm": 0.0, "learning_rate": 9.09e-07, "loss": 0.0, "step": 182 }, { "epoch": 0.0915, "grad_norm": 49.33989715576172, "learning_rate": 9.085e-07, "loss": -0.0, "step": 183 }, { "epoch": 0.092, "grad_norm": 0.0, "learning_rate": 9.08e-07, "loss": 0.0, "step": 184 }, { "epoch": 0.0925, "grad_norm": 0.0, "learning_rate": 9.074999999999999e-07, "loss": 0.0, "step": 185 }, { "epoch": 0.093, "grad_norm": 0.0, "learning_rate": 9.07e-07, "loss": 0.0, "step": 186 }, { "epoch": 0.0935, "grad_norm": 0.0, "learning_rate": 9.064999999999999e-07, "loss": 0.0, "step": 187 }, { "epoch": 0.094, "grad_norm": 16.010793685913086, "learning_rate": 9.06e-07, "loss": 0.0, "step": 188 }, { "epoch": 0.0945, "grad_norm": 17.950115203857422, "learning_rate": 9.055e-07, "loss": 0.0, "step": 189 }, { "epoch": 0.095, "grad_norm": 0.0, "learning_rate": 9.05e-07, "loss": 0.0, "step": 190 }, { "epoch": 0.0955, "grad_norm": 0.0, "learning_rate": 9.045e-07, "loss": 0.0, "step": 191 }, { "epoch": 0.096, "grad_norm": 8.419339179992676, "learning_rate": 9.039999999999999e-07, "loss": -0.0, "step": 192 }, { "epoch": 0.0965, "grad_norm": 0.0, "learning_rate": 9.034999999999999e-07, "loss": 0.0, "step": 193 }, { "epoch": 0.097, "grad_norm": 17.22492790222168, "learning_rate": 9.03e-07, "loss": -0.0, "step": 194 }, { "epoch": 0.0975, "grad_norm": 0.0, "learning_rate": 9.024999999999999e-07, "loss": 0.0, "step": 195 }, { "epoch": 0.098, "grad_norm": 15.984553337097168, "learning_rate": 9.02e-07, "loss": 0.0, "step": 196 }, { "epoch": 0.0985, "grad_norm": 0.0, "learning_rate": 9.015e-07, "loss": 0.0, "step": 197 }, { "epoch": 0.099, "grad_norm": 11.981531143188477, "learning_rate": 9.01e-07, "loss": 0.0, "step": 198 }, { "epoch": 0.0995, "grad_norm": 0.0, "learning_rate": 9.004999999999999e-07, "loss": 0.0, "step": 199 }, { "epoch": 0.1, "grad_norm": 16.9019832611084, "learning_rate": 9e-07, "loss": -0.0, "step": 200 }, { "epoch": 0.1005, "grad_norm": 0.0, "learning_rate": 8.994999999999999e-07, "loss": 0.0, "step": 201 }, { "epoch": 0.101, "grad_norm": 10.651970863342285, "learning_rate": 8.99e-07, "loss": 0.0, "step": 202 }, { "epoch": 0.1015, "grad_norm": 0.0, "learning_rate": 8.985e-07, "loss": 0.0, "step": 203 }, { "epoch": 0.102, "grad_norm": 0.0, "learning_rate": 8.98e-07, "loss": 0.0, "step": 204 }, { "epoch": 0.1025, "grad_norm": 0.0, "learning_rate": 8.974999999999999e-07, "loss": 0.0, "step": 205 }, { "epoch": 0.103, "grad_norm": 33.05813980102539, "learning_rate": 8.969999999999999e-07, "loss": 0.0, "step": 206 }, { "epoch": 0.1035, "grad_norm": 26.88140296936035, "learning_rate": 8.964999999999999e-07, "loss": 0.0, "step": 207 }, { "epoch": 0.104, "grad_norm": 18.670848846435547, "learning_rate": 8.96e-07, "loss": -0.0, "step": 208 }, { "epoch": 0.1045, "grad_norm": 18.841079711914062, "learning_rate": 8.954999999999999e-07, "loss": -0.0, "step": 209 }, { "epoch": 0.105, "grad_norm": 0.0, "learning_rate": 8.95e-07, "loss": 0.0, "step": 210 }, { "epoch": 0.1055, "grad_norm": 13.156370162963867, "learning_rate": 8.945e-07, "loss": 0.0, "step": 211 }, { "epoch": 0.106, "grad_norm": 0.0, "learning_rate": 8.939999999999999e-07, "loss": 0.0, "step": 212 }, { "epoch": 0.1065, "grad_norm": 0.0, "learning_rate": 8.934999999999999e-07, "loss": 0.0, "step": 213 }, { "epoch": 0.107, "grad_norm": 23.25225830078125, "learning_rate": 8.93e-07, "loss": 0.0, "step": 214 }, { "epoch": 0.1075, "grad_norm": 0.0, "learning_rate": 8.924999999999999e-07, "loss": 0.0, "step": 215 }, { "epoch": 0.108, "grad_norm": 0.0, "learning_rate": 8.92e-07, "loss": 0.0, "step": 216 }, { "epoch": 0.1085, "grad_norm": 0.0, "learning_rate": 8.915e-07, "loss": 0.0, "step": 217 }, { "epoch": 0.109, "grad_norm": 0.0, "learning_rate": 8.91e-07, "loss": 0.0, "step": 218 }, { "epoch": 0.1095, "grad_norm": 57.88274383544922, "learning_rate": 8.904999999999999e-07, "loss": 0.0, "step": 219 }, { "epoch": 0.11, "grad_norm": 31.124988555908203, "learning_rate": 8.9e-07, "loss": 0.0, "step": 220 }, { "epoch": 0.1105, "grad_norm": 0.0, "learning_rate": 8.894999999999999e-07, "loss": 0.0, "step": 221 }, { "epoch": 0.111, "grad_norm": 22.94927215576172, "learning_rate": 8.89e-07, "loss": -0.0, "step": 222 }, { "epoch": 0.1115, "grad_norm": 0.0, "learning_rate": 8.884999999999999e-07, "loss": 0.0, "step": 223 }, { "epoch": 0.112, "grad_norm": 0.0, "learning_rate": 8.88e-07, "loss": 0.0, "step": 224 }, { "epoch": 0.1125, "grad_norm": 22.883502960205078, "learning_rate": 8.874999999999999e-07, "loss": 0.0, "step": 225 }, { "epoch": 0.113, "grad_norm": 10.071247100830078, "learning_rate": 8.869999999999999e-07, "loss": 0.0, "step": 226 }, { "epoch": 0.1135, "grad_norm": 0.0, "learning_rate": 8.864999999999999e-07, "loss": 0.0, "step": 227 }, { "epoch": 0.114, "grad_norm": 231.0457305908203, "learning_rate": 8.86e-07, "loss": -0.0, "step": 228 }, { "epoch": 0.1145, "grad_norm": 0.0, "learning_rate": 8.854999999999999e-07, "loss": 0.0, "step": 229 }, { "epoch": 0.115, "grad_norm": 23.97252655029297, "learning_rate": 8.85e-07, "loss": 0.0, "step": 230 }, { "epoch": 0.1155, "grad_norm": 15.410896301269531, "learning_rate": 8.845e-07, "loss": 0.0, "step": 231 }, { "epoch": 0.116, "grad_norm": 39.541412353515625, "learning_rate": 8.839999999999999e-07, "loss": 0.0, "step": 232 }, { "epoch": 0.1165, "grad_norm": 13.713851928710938, "learning_rate": 8.834999999999999e-07, "loss": 0.0, "step": 233 }, { "epoch": 0.117, "grad_norm": 35.34727096557617, "learning_rate": 8.83e-07, "loss": -0.0, "step": 234 }, { "epoch": 0.1175, "grad_norm": 45.32273864746094, "learning_rate": 8.824999999999999e-07, "loss": 0.0, "step": 235 }, { "epoch": 0.118, "grad_norm": 0.0, "learning_rate": 8.82e-07, "loss": 0.0, "step": 236 }, { "epoch": 0.1185, "grad_norm": 0.0, "learning_rate": 8.814999999999999e-07, "loss": 0.0, "step": 237 }, { "epoch": 0.119, "grad_norm": 267.7450256347656, "learning_rate": 8.81e-07, "loss": 0.0, "step": 238 }, { "epoch": 0.1195, "grad_norm": 143.29161071777344, "learning_rate": 8.804999999999999e-07, "loss": -0.0, "step": 239 }, { "epoch": 0.12, "grad_norm": 52.909034729003906, "learning_rate": 8.799999999999999e-07, "loss": -0.0, "step": 240 }, { "epoch": 0.1205, "grad_norm": 0.0, "learning_rate": 8.794999999999999e-07, "loss": 0.0, "step": 241 }, { "epoch": 0.121, "grad_norm": 37.857696533203125, "learning_rate": 8.79e-07, "loss": 0.0, "step": 242 }, { "epoch": 0.1215, "grad_norm": 0.0, "learning_rate": 8.784999999999999e-07, "loss": 0.0, "step": 243 }, { "epoch": 0.122, "grad_norm": 0.0, "learning_rate": 8.78e-07, "loss": 0.0, "step": 244 }, { "epoch": 0.1225, "grad_norm": 0.0, "learning_rate": 8.774999999999999e-07, "loss": 0.0, "step": 245 }, { "epoch": 0.123, "grad_norm": 0.0, "learning_rate": 8.769999999999999e-07, "loss": 0.0, "step": 246 }, { "epoch": 0.1235, "grad_norm": 30.24044418334961, "learning_rate": 8.764999999999999e-07, "loss": 0.0, "step": 247 }, { "epoch": 0.124, "grad_norm": 0.0, "learning_rate": 8.76e-07, "loss": 0.0, "step": 248 }, { "epoch": 0.1245, "grad_norm": 33.06248092651367, "learning_rate": 8.754999999999999e-07, "loss": 0.0, "step": 249 }, { "epoch": 0.125, "grad_norm": 20.05577278137207, "learning_rate": 8.75e-07, "loss": -0.0, "step": 250 }, { "epoch": 0.1255, "grad_norm": 0.0, "learning_rate": 8.745000000000001e-07, "loss": 0.0, "step": 251 }, { "epoch": 0.126, "grad_norm": 18.56123161315918, "learning_rate": 8.739999999999999e-07, "loss": 0.0, "step": 252 }, { "epoch": 0.1265, "grad_norm": 0.0, "learning_rate": 8.735e-07, "loss": 0.0, "step": 253 }, { "epoch": 0.127, "grad_norm": 12.27500057220459, "learning_rate": 8.729999999999999e-07, "loss": 0.0, "step": 254 }, { "epoch": 0.1275, "grad_norm": 0.0, "learning_rate": 8.725e-07, "loss": 0.0, "step": 255 }, { "epoch": 0.128, "grad_norm": 53.35928726196289, "learning_rate": 8.72e-07, "loss": -0.0, "step": 256 }, { "epoch": 0.1285, "grad_norm": 0.0, "learning_rate": 8.715e-07, "loss": 0.0, "step": 257 }, { "epoch": 0.129, "grad_norm": 0.0, "learning_rate": 8.71e-07, "loss": 0.0, "step": 258 }, { "epoch": 0.1295, "grad_norm": 0.0, "learning_rate": 8.705e-07, "loss": 0.0, "step": 259 }, { "epoch": 0.13, "grad_norm": 0.0, "learning_rate": 8.699999999999999e-07, "loss": 0.0, "step": 260 }, { "epoch": 0.1305, "grad_norm": 40.95280838012695, "learning_rate": 8.695e-07, "loss": 0.0, "step": 261 }, { "epoch": 0.131, "grad_norm": 0.0, "learning_rate": 8.69e-07, "loss": 0.0, "step": 262 }, { "epoch": 0.1315, "grad_norm": 0.0, "learning_rate": 8.685e-07, "loss": 0.0, "step": 263 }, { "epoch": 0.132, "grad_norm": 0.0, "learning_rate": 8.68e-07, "loss": 0.0, "step": 264 }, { "epoch": 0.1325, "grad_norm": 0.0, "learning_rate": 8.675000000000001e-07, "loss": 0.0, "step": 265 }, { "epoch": 0.133, "grad_norm": 0.0, "learning_rate": 8.669999999999999e-07, "loss": 0.0, "step": 266 }, { "epoch": 0.1335, "grad_norm": 0.0, "learning_rate": 8.665e-07, "loss": 0.0, "step": 267 }, { "epoch": 0.134, "grad_norm": 0.0, "learning_rate": 8.659999999999999e-07, "loss": 0.0, "step": 268 }, { "epoch": 0.1345, "grad_norm": 29.156984329223633, "learning_rate": 8.655e-07, "loss": -0.0, "step": 269 }, { "epoch": 0.135, "grad_norm": 25.566734313964844, "learning_rate": 8.65e-07, "loss": 0.0, "step": 270 }, { "epoch": 0.1355, "grad_norm": 90.18716430664062, "learning_rate": 8.645e-07, "loss": 0.0, "step": 271 }, { "epoch": 0.136, "grad_norm": 0.0, "learning_rate": 8.639999999999999e-07, "loss": 0.0, "step": 272 }, { "epoch": 0.1365, "grad_norm": 0.0, "learning_rate": 8.635e-07, "loss": 0.0, "step": 273 }, { "epoch": 0.137, "grad_norm": 0.0, "learning_rate": 8.629999999999999e-07, "loss": 0.0, "step": 274 }, { "epoch": 0.1375, "grad_norm": 0.0, "learning_rate": 8.625e-07, "loss": 0.0, "step": 275 }, { "epoch": 0.138, "grad_norm": 0.0, "learning_rate": 8.62e-07, "loss": 0.0, "step": 276 }, { "epoch": 0.1385, "grad_norm": 0.0, "learning_rate": 8.615e-07, "loss": 0.0, "step": 277 }, { "epoch": 0.139, "grad_norm": 74.6231460571289, "learning_rate": 8.61e-07, "loss": 0.0, "step": 278 }, { "epoch": 0.1395, "grad_norm": 0.0, "learning_rate": 8.605e-07, "loss": 0.0, "step": 279 }, { "epoch": 0.14, "grad_norm": 0.0, "learning_rate": 8.599999999999999e-07, "loss": 0.0, "step": 280 }, { "epoch": 0.1405, "grad_norm": 0.0, "learning_rate": 8.595e-07, "loss": 0.0, "step": 281 }, { "epoch": 0.141, "grad_norm": 0.0, "learning_rate": 8.59e-07, "loss": 0.0, "step": 282 }, { "epoch": 0.1415, "grad_norm": 0.0, "learning_rate": 8.585e-07, "loss": 0.0, "step": 283 }, { "epoch": 0.142, "grad_norm": 562.8270263671875, "learning_rate": 8.58e-07, "loss": 0.0, "step": 284 }, { "epoch": 0.1425, "grad_norm": 0.0, "learning_rate": 8.575e-07, "loss": 0.0, "step": 285 }, { "epoch": 0.143, "grad_norm": 0.0, "learning_rate": 8.569999999999999e-07, "loss": 0.0, "step": 286 }, { "epoch": 0.1435, "grad_norm": 0.0, "learning_rate": 8.565e-07, "loss": 0.0, "step": 287 }, { "epoch": 0.144, "grad_norm": 0.0, "learning_rate": 8.559999999999999e-07, "loss": 0.0, "step": 288 }, { "epoch": 0.1445, "grad_norm": 0.0, "learning_rate": 8.555e-07, "loss": 0.0, "step": 289 }, { "epoch": 0.145, "grad_norm": 0.0, "learning_rate": 8.55e-07, "loss": 0.0, "step": 290 }, { "epoch": 0.1455, "grad_norm": 0.0, "learning_rate": 8.545e-07, "loss": 0.0, "step": 291 }, { "epoch": 0.146, "grad_norm": 0.0, "learning_rate": 8.539999999999999e-07, "loss": 0.0, "step": 292 }, { "epoch": 0.1465, "grad_norm": 0.0, "learning_rate": 8.535e-07, "loss": 0.0, "step": 293 }, { "epoch": 0.147, "grad_norm": 0.0, "learning_rate": 8.529999999999999e-07, "loss": 0.0, "step": 294 }, { "epoch": 0.1475, "grad_norm": 0.0, "learning_rate": 8.525e-07, "loss": 0.0, "step": 295 }, { "epoch": 0.148, "grad_norm": 0.0, "learning_rate": 8.52e-07, "loss": 0.0, "step": 296 }, { "epoch": 0.1485, "grad_norm": 0.0, "learning_rate": 8.515e-07, "loss": 0.0, "step": 297 }, { "epoch": 0.149, "grad_norm": 0.0, "learning_rate": 8.51e-07, "loss": 0.0, "step": 298 }, { "epoch": 0.1495, "grad_norm": 0.0, "learning_rate": 8.504999999999999e-07, "loss": 0.0, "step": 299 }, { "epoch": 0.15, "grad_norm": 0.0, "learning_rate": 8.499999999999999e-07, "loss": 0.0, "step": 300 }, { "epoch": 0.1505, "grad_norm": 0.0, "learning_rate": 8.495e-07, "loss": 0.0, "step": 301 }, { "epoch": 0.151, "grad_norm": 0.0, "learning_rate": 8.489999999999999e-07, "loss": 0.0, "step": 302 }, { "epoch": 0.1515, "grad_norm": 0.0, "learning_rate": 8.485e-07, "loss": 0.0, "step": 303 }, { "epoch": 0.152, "grad_norm": 0.0, "learning_rate": 8.48e-07, "loss": 0.0, "step": 304 }, { "epoch": 0.1525, "grad_norm": 0.0, "learning_rate": 8.475e-07, "loss": 0.0, "step": 305 }, { "epoch": 0.153, "grad_norm": 0.0, "learning_rate": 8.469999999999999e-07, "loss": 0.0, "step": 306 }, { "epoch": 0.1535, "grad_norm": 53.436363220214844, "learning_rate": 8.465e-07, "loss": 0.0, "step": 307 }, { "epoch": 0.154, "grad_norm": 0.0, "learning_rate": 8.459999999999999e-07, "loss": 0.0, "step": 308 }, { "epoch": 0.1545, "grad_norm": 0.0, "learning_rate": 8.455e-07, "loss": 0.0, "step": 309 }, { "epoch": 0.155, "grad_norm": 45.34641647338867, "learning_rate": 8.45e-07, "loss": -0.0, "step": 310 }, { "epoch": 0.1555, "grad_norm": 0.0, "learning_rate": 8.445e-07, "loss": 0.0, "step": 311 }, { "epoch": 0.156, "grad_norm": 0.0, "learning_rate": 8.439999999999999e-07, "loss": 0.0, "step": 312 }, { "epoch": 0.1565, "grad_norm": 0.0, "learning_rate": 8.435e-07, "loss": 0.0, "step": 313 }, { "epoch": 0.157, "grad_norm": 0.0, "learning_rate": 8.429999999999999e-07, "loss": 0.0, "step": 314 }, { "epoch": 0.1575, "grad_norm": 207.4761962890625, "learning_rate": 8.425e-07, "loss": -0.0, "step": 315 }, { "epoch": 0.158, "grad_norm": 0.0, "learning_rate": 8.419999999999999e-07, "loss": 0.0, "step": 316 }, { "epoch": 0.1585, "grad_norm": 49.840850830078125, "learning_rate": 8.415e-07, "loss": -0.0, "step": 317 }, { "epoch": 0.159, "grad_norm": 0.0, "learning_rate": 8.41e-07, "loss": 0.0, "step": 318 }, { "epoch": 0.1595, "grad_norm": 0.0, "learning_rate": 8.404999999999999e-07, "loss": 0.0, "step": 319 }, { "epoch": 0.16, "grad_norm": 0.0, "learning_rate": 8.399999999999999e-07, "loss": 0.0, "step": 320 }, { "epoch": 0.1605, "grad_norm": 42.99878692626953, "learning_rate": 8.395e-07, "loss": -0.0, "step": 321 }, { "epoch": 0.161, "grad_norm": 0.0, "learning_rate": 8.389999999999999e-07, "loss": 0.0, "step": 322 }, { "epoch": 0.1615, "grad_norm": 0.0, "learning_rate": 8.385e-07, "loss": 0.0, "step": 323 }, { "epoch": 0.162, "grad_norm": 26.691635131835938, "learning_rate": 8.38e-07, "loss": 0.0, "step": 324 }, { "epoch": 0.1625, "grad_norm": 0.0, "learning_rate": 8.375e-07, "loss": 0.0, "step": 325 }, { "epoch": 0.163, "grad_norm": 0.0, "learning_rate": 8.369999999999999e-07, "loss": 0.0, "step": 326 }, { "epoch": 0.1635, "grad_norm": 0.0, "learning_rate": 8.365e-07, "loss": 0.0, "step": 327 }, { "epoch": 0.164, "grad_norm": 0.0, "learning_rate": 8.359999999999999e-07, "loss": 0.0, "step": 328 }, { "epoch": 0.1645, "grad_norm": 0.0, "learning_rate": 8.355e-07, "loss": 0.0, "step": 329 }, { "epoch": 0.165, "grad_norm": 78.05026245117188, "learning_rate": 8.349999999999999e-07, "loss": -0.0, "step": 330 }, { "epoch": 0.1655, "grad_norm": 0.0, "learning_rate": 8.345e-07, "loss": 0.0, "step": 331 }, { "epoch": 0.166, "grad_norm": 0.0, "learning_rate": 8.34e-07, "loss": 0.0, "step": 332 }, { "epoch": 0.1665, "grad_norm": 0.0, "learning_rate": 8.334999999999999e-07, "loss": 0.0, "step": 333 }, { "epoch": 0.167, "grad_norm": 0.0, "learning_rate": 8.329999999999999e-07, "loss": 0.0, "step": 334 }, { "epoch": 0.1675, "grad_norm": 0.0, "learning_rate": 8.325e-07, "loss": 0.0, "step": 335 }, { "epoch": 0.168, "grad_norm": 0.0, "learning_rate": 8.319999999999999e-07, "loss": 0.0, "step": 336 }, { "epoch": 0.1685, "grad_norm": 0.0, "learning_rate": 8.315e-07, "loss": 0.0, "step": 337 }, { "epoch": 0.169, "grad_norm": 0.0, "learning_rate": 8.31e-07, "loss": 0.0, "step": 338 }, { "epoch": 0.1695, "grad_norm": 54.89845657348633, "learning_rate": 8.304999999999999e-07, "loss": -0.0, "step": 339 }, { "epoch": 0.17, "grad_norm": 0.0, "learning_rate": 8.299999999999999e-07, "loss": 0.0, "step": 340 }, { "epoch": 0.1705, "grad_norm": 0.0, "learning_rate": 8.295e-07, "loss": 0.0, "step": 341 }, { "epoch": 0.171, "grad_norm": 0.0, "learning_rate": 8.289999999999999e-07, "loss": 0.0, "step": 342 }, { "epoch": 0.1715, "grad_norm": 0.0, "learning_rate": 8.285e-07, "loss": 0.0, "step": 343 }, { "epoch": 0.172, "grad_norm": 0.0, "learning_rate": 8.28e-07, "loss": 0.0, "step": 344 }, { "epoch": 0.1725, "grad_norm": 0.0, "learning_rate": 8.275e-07, "loss": 0.0, "step": 345 }, { "epoch": 0.173, "grad_norm": 0.0, "learning_rate": 8.269999999999999e-07, "loss": 0.0, "step": 346 }, { "epoch": 0.1735, "grad_norm": 0.0, "learning_rate": 8.264999999999999e-07, "loss": 0.0, "step": 347 }, { "epoch": 0.174, "grad_norm": 0.0, "learning_rate": 8.259999999999999e-07, "loss": 0.0, "step": 348 }, { "epoch": 0.1745, "grad_norm": 0.0, "learning_rate": 8.255e-07, "loss": 0.0, "step": 349 }, { "epoch": 0.175, "grad_norm": 0.0, "learning_rate": 8.249999999999999e-07, "loss": 0.0, "step": 350 }, { "epoch": 0.1755, "grad_norm": 0.0, "learning_rate": 8.245e-07, "loss": 0.0, "step": 351 }, { "epoch": 0.176, "grad_norm": 0.0, "learning_rate": 8.24e-07, "loss": 0.0, "step": 352 }, { "epoch": 0.1765, "grad_norm": 0.0, "learning_rate": 8.234999999999999e-07, "loss": 0.0, "step": 353 }, { "epoch": 0.177, "grad_norm": 0.0, "learning_rate": 8.229999999999999e-07, "loss": 0.0, "step": 354 }, { "epoch": 0.1775, "grad_norm": 0.0, "learning_rate": 8.225e-07, "loss": 0.0, "step": 355 }, { "epoch": 0.178, "grad_norm": 0.0, "learning_rate": 8.219999999999999e-07, "loss": 0.0, "step": 356 }, { "epoch": 0.1785, "grad_norm": 0.0, "learning_rate": 8.215e-07, "loss": 0.0, "step": 357 }, { "epoch": 0.179, "grad_norm": 95.88402557373047, "learning_rate": 8.21e-07, "loss": 0.0, "step": 358 }, { "epoch": 0.1795, "grad_norm": 0.0, "learning_rate": 8.205e-07, "loss": 0.0, "step": 359 }, { "epoch": 0.18, "grad_norm": 0.0, "learning_rate": 8.199999999999999e-07, "loss": 0.0, "step": 360 }, { "epoch": 0.1805, "grad_norm": 0.0, "learning_rate": 8.194999999999999e-07, "loss": 0.0, "step": 361 }, { "epoch": 0.181, "grad_norm": 16.117612838745117, "learning_rate": 8.189999999999999e-07, "loss": 0.0, "step": 362 }, { "epoch": 0.1815, "grad_norm": 0.0, "learning_rate": 8.185e-07, "loss": 0.0, "step": 363 }, { "epoch": 0.182, "grad_norm": 0.0, "learning_rate": 8.179999999999999e-07, "loss": 0.0, "step": 364 }, { "epoch": 0.1825, "grad_norm": 82.06559753417969, "learning_rate": 8.175e-07, "loss": 0.0, "step": 365 }, { "epoch": 0.183, "grad_norm": 0.0, "learning_rate": 8.169999999999999e-07, "loss": 0.0, "step": 366 }, { "epoch": 0.1835, "grad_norm": 0.0, "learning_rate": 8.164999999999999e-07, "loss": 0.0, "step": 367 }, { "epoch": 0.184, "grad_norm": 0.0, "learning_rate": 8.159999999999999e-07, "loss": 0.0, "step": 368 }, { "epoch": 0.1845, "grad_norm": 134.08810424804688, "learning_rate": 8.155e-07, "loss": 0.0, "step": 369 }, { "epoch": 0.185, "grad_norm": 0.0, "learning_rate": 8.149999999999999e-07, "loss": 0.0, "step": 370 }, { "epoch": 0.1855, "grad_norm": 0.0, "learning_rate": 8.145e-07, "loss": 0.0, "step": 371 }, { "epoch": 0.186, "grad_norm": 0.0, "learning_rate": 8.14e-07, "loss": 0.0, "step": 372 }, { "epoch": 0.1865, "grad_norm": 0.0, "learning_rate": 8.134999999999999e-07, "loss": 0.0, "step": 373 }, { "epoch": 0.187, "grad_norm": 0.0, "learning_rate": 8.129999999999999e-07, "loss": 0.0, "step": 374 }, { "epoch": 0.1875, "grad_norm": 0.0, "learning_rate": 8.125e-07, "loss": 0.0, "step": 375 }, { "epoch": 0.188, "grad_norm": 0.0, "learning_rate": 8.12e-07, "loss": 0.0, "step": 376 }, { "epoch": 0.1885, "grad_norm": 0.0, "learning_rate": 8.115e-07, "loss": 0.0, "step": 377 }, { "epoch": 0.189, "grad_norm": 0.0, "learning_rate": 8.11e-07, "loss": 0.0, "step": 378 }, { "epoch": 0.1895, "grad_norm": 0.0, "learning_rate": 8.105e-07, "loss": 0.0, "step": 379 }, { "epoch": 0.19, "grad_norm": 0.0, "learning_rate": 8.1e-07, "loss": 0.0, "step": 380 }, { "epoch": 0.1905, "grad_norm": 0.0, "learning_rate": 8.094999999999999e-07, "loss": 0.0, "step": 381 }, { "epoch": 0.191, "grad_norm": 0.0, "learning_rate": 8.09e-07, "loss": 0.0, "step": 382 }, { "epoch": 0.1915, "grad_norm": 0.0, "learning_rate": 8.085e-07, "loss": 0.0, "step": 383 }, { "epoch": 0.192, "grad_norm": 0.0, "learning_rate": 8.08e-07, "loss": 0.0, "step": 384 }, { "epoch": 0.1925, "grad_norm": 0.0, "learning_rate": 8.075e-07, "loss": 0.0, "step": 385 }, { "epoch": 0.193, "grad_norm": 0.0, "learning_rate": 8.070000000000001e-07, "loss": 0.0, "step": 386 }, { "epoch": 0.1935, "grad_norm": 0.0, "learning_rate": 8.064999999999999e-07, "loss": 0.0, "step": 387 }, { "epoch": 0.194, "grad_norm": 0.0, "learning_rate": 8.06e-07, "loss": 0.0, "step": 388 }, { "epoch": 0.1945, "grad_norm": 0.0, "learning_rate": 8.055e-07, "loss": 0.0, "step": 389 }, { "epoch": 0.195, "grad_norm": 0.0, "learning_rate": 8.05e-07, "loss": 0.0, "step": 390 }, { "epoch": 0.1955, "grad_norm": 15.130922317504883, "learning_rate": 8.045e-07, "loss": 0.0, "step": 391 }, { "epoch": 0.196, "grad_norm": 0.0, "learning_rate": 8.04e-07, "loss": 0.0, "step": 392 }, { "epoch": 0.1965, "grad_norm": 0.0, "learning_rate": 8.034999999999999e-07, "loss": 0.0, "step": 393 }, { "epoch": 0.197, "grad_norm": 0.0, "learning_rate": 8.03e-07, "loss": 0.0, "step": 394 }, { "epoch": 0.1975, "grad_norm": 0.0, "learning_rate": 8.024999999999999e-07, "loss": 0.0, "step": 395 }, { "epoch": 0.198, "grad_norm": 0.0, "learning_rate": 8.02e-07, "loss": 0.0, "step": 396 }, { "epoch": 0.1985, "grad_norm": 0.0, "learning_rate": 8.015e-07, "loss": 0.0, "step": 397 }, { "epoch": 0.199, "grad_norm": 0.0, "learning_rate": 8.01e-07, "loss": 0.0, "step": 398 }, { "epoch": 0.1995, "grad_norm": 0.0, "learning_rate": 8.005e-07, "loss": 0.0, "step": 399 }, { "epoch": 0.2, "grad_norm": 0.0, "learning_rate": 8e-07, "loss": 0.0, "step": 400 }, { "epoch": 0.2005, "grad_norm": 0.0, "learning_rate": 7.994999999999999e-07, "loss": 0.0, "step": 401 }, { "epoch": 0.201, "grad_norm": 0.0, "learning_rate": 7.99e-07, "loss": 0.0, "step": 402 }, { "epoch": 0.2015, "grad_norm": 0.0, "learning_rate": 7.985e-07, "loss": 0.0, "step": 403 }, { "epoch": 0.202, "grad_norm": 0.0, "learning_rate": 7.98e-07, "loss": 0.0, "step": 404 }, { "epoch": 0.2025, "grad_norm": 0.0, "learning_rate": 7.975e-07, "loss": 0.0, "step": 405 }, { "epoch": 0.203, "grad_norm": 0.0, "learning_rate": 7.970000000000001e-07, "loss": 0.0, "step": 406 }, { "epoch": 0.2035, "grad_norm": 0.0, "learning_rate": 7.964999999999999e-07, "loss": 0.0, "step": 407 }, { "epoch": 0.204, "grad_norm": 139.8319854736328, "learning_rate": 7.96e-07, "loss": 0.0, "step": 408 }, { "epoch": 0.2045, "grad_norm": 0.0, "learning_rate": 7.954999999999999e-07, "loss": 0.0, "step": 409 }, { "epoch": 0.205, "grad_norm": 0.0, "learning_rate": 7.95e-07, "loss": 0.0, "step": 410 }, { "epoch": 0.2055, "grad_norm": 72.6037368774414, "learning_rate": 7.945e-07, "loss": 0.0, "step": 411 }, { "epoch": 0.206, "grad_norm": 0.0, "learning_rate": 7.94e-07, "loss": 0.0, "step": 412 }, { "epoch": 0.2065, "grad_norm": 0.0, "learning_rate": 7.934999999999999e-07, "loss": 0.0, "step": 413 }, { "epoch": 0.207, "grad_norm": 0.0, "learning_rate": 7.93e-07, "loss": 0.0, "step": 414 }, { "epoch": 0.2075, "grad_norm": 0.0, "learning_rate": 7.924999999999999e-07, "loss": 0.0, "step": 415 }, { "epoch": 0.208, "grad_norm": 0.0, "learning_rate": 7.92e-07, "loss": 0.0, "step": 416 }, { "epoch": 0.2085, "grad_norm": 0.0, "learning_rate": 7.915e-07, "loss": 0.0, "step": 417 }, { "epoch": 0.209, "grad_norm": 0.0, "learning_rate": 7.91e-07, "loss": 0.0, "step": 418 }, { "epoch": 0.2095, "grad_norm": 0.0, "learning_rate": 7.905e-07, "loss": 0.0, "step": 419 }, { "epoch": 0.21, "grad_norm": 0.0, "learning_rate": 7.9e-07, "loss": 0.0, "step": 420 }, { "epoch": 0.2105, "grad_norm": 0.0, "learning_rate": 7.894999999999999e-07, "loss": 0.0, "step": 421 }, { "epoch": 0.211, "grad_norm": 0.0, "learning_rate": 7.89e-07, "loss": 0.0, "step": 422 }, { "epoch": 0.2115, "grad_norm": 0.0, "learning_rate": 7.884999999999999e-07, "loss": 0.0, "step": 423 }, { "epoch": 0.212, "grad_norm": 0.0, "learning_rate": 7.88e-07, "loss": 0.0, "step": 424 }, { "epoch": 0.2125, "grad_norm": 66.85465240478516, "learning_rate": 7.875e-07, "loss": 0.0, "step": 425 }, { "epoch": 0.213, "grad_norm": 108.80921936035156, "learning_rate": 7.87e-07, "loss": -0.0, "step": 426 }, { "epoch": 0.2135, "grad_norm": 0.0, "learning_rate": 7.864999999999999e-07, "loss": 0.0, "step": 427 }, { "epoch": 0.214, "grad_norm": 0.0, "learning_rate": 7.86e-07, "loss": 0.0, "step": 428 }, { "epoch": 0.2145, "grad_norm": 0.0, "learning_rate": 7.854999999999999e-07, "loss": 0.0, "step": 429 }, { "epoch": 0.215, "grad_norm": 0.0, "learning_rate": 7.85e-07, "loss": 0.0, "step": 430 }, { "epoch": 0.2155, "grad_norm": 107.53791046142578, "learning_rate": 7.845e-07, "loss": -0.0, "step": 431 }, { "epoch": 0.216, "grad_norm": 0.0, "learning_rate": 7.84e-07, "loss": 0.0, "step": 432 }, { "epoch": 0.2165, "grad_norm": 0.0, "learning_rate": 7.834999999999999e-07, "loss": 0.0, "step": 433 }, { "epoch": 0.217, "grad_norm": 0.0, "learning_rate": 7.83e-07, "loss": 0.0, "step": 434 }, { "epoch": 0.2175, "grad_norm": 0.0, "learning_rate": 7.824999999999999e-07, "loss": 0.0, "step": 435 }, { "epoch": 0.218, "grad_norm": 184.61976623535156, "learning_rate": 7.82e-07, "loss": 0.0, "step": 436 }, { "epoch": 0.2185, "grad_norm": 0.0, "learning_rate": 7.815e-07, "loss": 0.0, "step": 437 }, { "epoch": 0.219, "grad_norm": 73.76115417480469, "learning_rate": 7.81e-07, "loss": 0.0, "step": 438 }, { "epoch": 0.2195, "grad_norm": 0.0, "learning_rate": 7.805e-07, "loss": 0.0, "step": 439 }, { "epoch": 0.22, "grad_norm": 0.0, "learning_rate": 7.799999999999999e-07, "loss": 0.0, "step": 440 }, { "epoch": 0.2205, "grad_norm": 0.0, "learning_rate": 7.794999999999999e-07, "loss": 0.0, "step": 441 }, { "epoch": 0.221, "grad_norm": 0.0, "learning_rate": 7.79e-07, "loss": 0.0, "step": 442 }, { "epoch": 0.2215, "grad_norm": 82.87494659423828, "learning_rate": 7.784999999999999e-07, "loss": 0.0, "step": 443 }, { "epoch": 0.222, "grad_norm": 0.0, "learning_rate": 7.78e-07, "loss": 0.0, "step": 444 }, { "epoch": 0.2225, "grad_norm": 126.44339752197266, "learning_rate": 7.775e-07, "loss": -0.0, "step": 445 }, { "epoch": 0.223, "grad_norm": 0.0, "learning_rate": 7.77e-07, "loss": 0.0, "step": 446 }, { "epoch": 0.2235, "grad_norm": 0.0, "learning_rate": 7.764999999999999e-07, "loss": 0.0, "step": 447 }, { "epoch": 0.224, "grad_norm": 0.0, "learning_rate": 7.76e-07, "loss": 0.0, "step": 448 }, { "epoch": 0.2245, "grad_norm": 0.0, "learning_rate": 7.754999999999999e-07, "loss": 0.0, "step": 449 }, { "epoch": 0.225, "grad_norm": 0.0, "learning_rate": 7.75e-07, "loss": 0.0, "step": 450 }, { "epoch": 0.2255, "grad_norm": 0.0, "learning_rate": 7.745e-07, "loss": 0.0, "step": 451 }, { "epoch": 0.226, "grad_norm": 0.0, "learning_rate": 7.74e-07, "loss": 0.0, "step": 452 }, { "epoch": 0.2265, "grad_norm": 0.0, "learning_rate": 7.734999999999999e-07, "loss": 0.0, "step": 453 }, { "epoch": 0.227, "grad_norm": 37.326351165771484, "learning_rate": 7.729999999999999e-07, "loss": 0.0, "step": 454 }, { "epoch": 0.2275, "grad_norm": 0.0, "learning_rate": 7.724999999999999e-07, "loss": 0.0, "step": 455 }, { "epoch": 0.228, "grad_norm": 0.0, "learning_rate": 7.72e-07, "loss": 0.0, "step": 456 }, { "epoch": 0.2285, "grad_norm": 0.0, "learning_rate": 7.714999999999999e-07, "loss": 0.0, "step": 457 }, { "epoch": 0.229, "grad_norm": 0.0, "learning_rate": 7.71e-07, "loss": 0.0, "step": 458 }, { "epoch": 0.2295, "grad_norm": 0.0, "learning_rate": 7.705e-07, "loss": 0.0, "step": 459 }, { "epoch": 0.23, "grad_norm": 0.0, "learning_rate": 7.699999999999999e-07, "loss": 0.0, "step": 460 }, { "epoch": 0.2305, "grad_norm": 0.0, "learning_rate": 7.694999999999999e-07, "loss": 0.0, "step": 461 }, { "epoch": 0.231, "grad_norm": 0.0, "learning_rate": 7.69e-07, "loss": 0.0, "step": 462 }, { "epoch": 0.2315, "grad_norm": 0.0, "learning_rate": 7.684999999999999e-07, "loss": 0.0, "step": 463 }, { "epoch": 0.232, "grad_norm": 0.0, "learning_rate": 7.68e-07, "loss": 0.0, "step": 464 }, { "epoch": 0.2325, "grad_norm": 0.0, "learning_rate": 7.675e-07, "loss": 0.0, "step": 465 }, { "epoch": 0.233, "grad_norm": 0.0, "learning_rate": 7.67e-07, "loss": 0.0, "step": 466 }, { "epoch": 0.2335, "grad_norm": 0.0, "learning_rate": 7.664999999999999e-07, "loss": 0.0, "step": 467 }, { "epoch": 0.234, "grad_norm": 0.0, "learning_rate": 7.66e-07, "loss": 0.0, "step": 468 }, { "epoch": 0.2345, "grad_norm": 0.0, "learning_rate": 7.654999999999999e-07, "loss": 0.0, "step": 469 }, { "epoch": 0.235, "grad_norm": 0.0, "learning_rate": 7.65e-07, "loss": 0.0, "step": 470 }, { "epoch": 0.2355, "grad_norm": 0.0, "learning_rate": 7.644999999999999e-07, "loss": 0.0, "step": 471 }, { "epoch": 0.236, "grad_norm": 67.02527618408203, "learning_rate": 7.64e-07, "loss": -0.0, "step": 472 }, { "epoch": 0.2365, "grad_norm": 0.0, "learning_rate": 7.635e-07, "loss": 0.0, "step": 473 }, { "epoch": 0.237, "grad_norm": 0.0, "learning_rate": 7.629999999999999e-07, "loss": 0.0, "step": 474 }, { "epoch": 0.2375, "grad_norm": 0.0, "learning_rate": 7.624999999999999e-07, "loss": 0.0, "step": 475 }, { "epoch": 0.238, "grad_norm": 0.0, "learning_rate": 7.62e-07, "loss": 0.0, "step": 476 }, { "epoch": 0.2385, "grad_norm": 0.0, "learning_rate": 7.614999999999999e-07, "loss": 0.0, "step": 477 }, { "epoch": 0.239, "grad_norm": 0.0, "learning_rate": 7.61e-07, "loss": 0.0, "step": 478 }, { "epoch": 0.2395, "grad_norm": 0.0, "learning_rate": 7.605e-07, "loss": 0.0, "step": 479 }, { "epoch": 0.24, "grad_norm": 0.0, "learning_rate": 7.599999999999999e-07, "loss": 0.0, "step": 480 }, { "epoch": 0.2405, "grad_norm": 0.0, "learning_rate": 7.594999999999999e-07, "loss": 0.0, "step": 481 }, { "epoch": 0.241, "grad_norm": 0.0, "learning_rate": 7.59e-07, "loss": 0.0, "step": 482 }, { "epoch": 0.2415, "grad_norm": 0.0, "learning_rate": 7.584999999999999e-07, "loss": 0.0, "step": 483 }, { "epoch": 0.242, "grad_norm": 0.0, "learning_rate": 7.58e-07, "loss": 0.0, "step": 484 }, { "epoch": 0.2425, "grad_norm": 0.0, "learning_rate": 7.575e-07, "loss": 0.0, "step": 485 }, { "epoch": 0.243, "grad_norm": 0.0, "learning_rate": 7.57e-07, "loss": 0.0, "step": 486 }, { "epoch": 0.2435, "grad_norm": 0.0, "learning_rate": 7.564999999999999e-07, "loss": 0.0, "step": 487 }, { "epoch": 0.244, "grad_norm": 0.0, "learning_rate": 7.559999999999999e-07, "loss": 0.0, "step": 488 }, { "epoch": 0.2445, "grad_norm": 0.0, "learning_rate": 7.554999999999999e-07, "loss": 0.0, "step": 489 }, { "epoch": 0.245, "grad_norm": 0.0, "learning_rate": 7.55e-07, "loss": 0.0, "step": 490 }, { "epoch": 0.2455, "grad_norm": 0.0, "learning_rate": 7.544999999999999e-07, "loss": 0.0, "step": 491 }, { "epoch": 0.246, "grad_norm": 0.0, "learning_rate": 7.54e-07, "loss": 0.0, "step": 492 }, { "epoch": 0.2465, "grad_norm": 0.0, "learning_rate": 7.535e-07, "loss": 0.0, "step": 493 }, { "epoch": 0.247, "grad_norm": 0.0, "learning_rate": 7.529999999999999e-07, "loss": 0.0, "step": 494 }, { "epoch": 0.2475, "grad_norm": 0.0, "learning_rate": 7.524999999999999e-07, "loss": 0.0, "step": 495 }, { "epoch": 0.248, "grad_norm": 0.0, "learning_rate": 7.52e-07, "loss": 0.0, "step": 496 }, { "epoch": 0.2485, "grad_norm": 0.0, "learning_rate": 7.514999999999999e-07, "loss": 0.0, "step": 497 }, { "epoch": 0.249, "grad_norm": 59.718631744384766, "learning_rate": 7.51e-07, "loss": 0.0, "step": 498 }, { "epoch": 0.2495, "grad_norm": 0.0, "learning_rate": 7.505e-07, "loss": 0.0, "step": 499 }, { "epoch": 0.25, "grad_norm": 0.0, "learning_rate": 7.5e-07, "loss": 0.0, "step": 500 } ], "logging_steps": 1.0, "max_steps": 2000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }