{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 500, "global_step": 288, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "learning_rate": 2.2222222222222223e-05, "loss": 2.0365, "step": 1 }, { "epoch": 0.04, "learning_rate": 4.4444444444444447e-05, "loss": 2.066, "step": 2 }, { "epoch": 0.06, "learning_rate": 6.666666666666667e-05, "loss": 2.0303, "step": 3 }, { "epoch": 0.08, "learning_rate": 8.888888888888889e-05, "loss": 2.0011, "step": 4 }, { "epoch": 0.1, "learning_rate": 0.00011111111111111112, "loss": 1.9033, "step": 5 }, { "epoch": 0.12, "learning_rate": 0.00013333333333333334, "loss": 1.8849, "step": 6 }, { "epoch": 0.15, "learning_rate": 0.00015555555555555556, "loss": 1.8357, "step": 7 }, { "epoch": 0.17, "learning_rate": 0.00017777777777777779, "loss": 1.7318, "step": 8 }, { "epoch": 0.19, "learning_rate": 0.0002, "loss": 1.77, "step": 9 }, { "epoch": 0.21, "learning_rate": 0.0001999936604744804, "loss": 1.679, "step": 10 }, { "epoch": 0.23, "learning_rate": 0.0001999746427017133, "loss": 1.6819, "step": 11 }, { "epoch": 0.25, "learning_rate": 0.0001999429490929718, "loss": 1.6999, "step": 12 }, { "epoch": 0.27, "learning_rate": 0.00019989858366670476, "loss": 1.6876, "step": 13 }, { "epoch": 0.29, "learning_rate": 0.00019984155204802714, "loss": 1.6702, "step": 14 }, { "epoch": 0.31, "learning_rate": 0.00019977186146800707, "loss": 1.6607, "step": 15 }, { "epoch": 0.33, "learning_rate": 0.00019968952076274872, "loss": 1.6519, "step": 16 }, { "epoch": 0.35, "learning_rate": 0.00019959454037227214, "loss": 1.6445, "step": 17 }, { "epoch": 0.38, "learning_rate": 0.00019948693233918952, "loss": 1.6258, "step": 18 }, { "epoch": 0.4, "learning_rate": 0.0001993667103071783, "loss": 1.5944, "step": 19 }, { "epoch": 0.42, "learning_rate": 0.00019923388951925125, "loss": 1.6484, "step": 20 }, { "epoch": 0.44, "learning_rate": 0.00019908848681582391, "loss": 1.6131, "step": 21 }, { "epoch": 0.46, "learning_rate": 0.0001989305206325792, "loss": 1.6264, "step": 22 }, { "epoch": 0.48, "learning_rate": 0.00019876001099813017, "loss": 1.6238, "step": 23 }, { "epoch": 0.5, "learning_rate": 0.00019857697953148037, "loss": 1.6249, "step": 24 }, { "epoch": 0.52, "learning_rate": 0.0001983814494392829, "loss": 1.5891, "step": 25 }, { "epoch": 0.54, "learning_rate": 0.00019817344551289795, "loss": 1.6246, "step": 26 }, { "epoch": 0.56, "learning_rate": 0.00019795299412524945, "loss": 1.615, "step": 27 }, { "epoch": 0.58, "learning_rate": 0.0001977201232274814, "loss": 1.5451, "step": 28 }, { "epoch": 0.6, "learning_rate": 0.00019747486234541383, "loss": 1.5968, "step": 29 }, { "epoch": 0.62, "learning_rate": 0.00019721724257579907, "loss": 1.5895, "step": 30 }, { "epoch": 0.65, "learning_rate": 0.00019694729658237926, "loss": 1.6084, "step": 31 }, { "epoch": 0.67, "learning_rate": 0.00019666505859174463, "loss": 1.5562, "step": 32 }, { "epoch": 0.69, "learning_rate": 0.0001963705643889941, "loss": 1.5704, "step": 33 }, { "epoch": 0.71, "learning_rate": 0.00019606385131319792, "loss": 1.5856, "step": 34 }, { "epoch": 0.73, "learning_rate": 0.00019574495825266358, "loss": 1.6082, "step": 35 }, { "epoch": 0.75, "learning_rate": 0.00019541392564000488, "loss": 1.5809, "step": 36 }, { "epoch": 0.77, "learning_rate": 0.00019507079544701583, "loss": 1.6034, "step": 37 }, { "epoch": 0.79, "learning_rate": 0.00019471561117934868, "loss": 1.5573, "step": 38 }, { "epoch": 0.81, "learning_rate": 0.00019434841787099803, "loss": 1.5286, "step": 39 }, { "epoch": 0.83, "learning_rate": 0.00019396926207859084, "loss": 1.5683, "step": 40 }, { "epoch": 0.85, "learning_rate": 0.0001935781918754836, "loss": 1.5521, "step": 41 }, { "epoch": 0.88, "learning_rate": 0.00019317525684566685, "loss": 1.5245, "step": 42 }, { "epoch": 0.9, "learning_rate": 0.0001927605080774788, "loss": 1.5501, "step": 43 }, { "epoch": 0.92, "learning_rate": 0.00019233399815712736, "loss": 1.5389, "step": 44 }, { "epoch": 0.94, "learning_rate": 0.00019189578116202307, "loss": 1.5595, "step": 45 }, { "epoch": 0.96, "learning_rate": 0.0001914459126539224, "loss": 1.5618, "step": 46 }, { "epoch": 0.98, "learning_rate": 0.00019098444967188306, "loss": 1.5576, "step": 47 }, { "epoch": 1.0, "learning_rate": 0.00019051145072503215, "loss": 1.4294, "step": 48 }, { "epoch": 1.02, "learning_rate": 0.00019002697578514747, "loss": 1.3713, "step": 49 }, { "epoch": 1.04, "learning_rate": 0.00018953108627905394, "loss": 1.4402, "step": 50 }, { "epoch": 1.06, "learning_rate": 0.00018902384508083517, "loss": 1.3995, "step": 51 }, { "epoch": 1.08, "learning_rate": 0.00018850531650386153, "loss": 1.374, "step": 52 }, { "epoch": 1.1, "learning_rate": 0.00018797556629263602, "loss": 1.3839, "step": 53 }, { "epoch": 1.12, "learning_rate": 0.00018743466161445823, "loss": 1.3465, "step": 54 }, { "epoch": 1.15, "learning_rate": 0.0001868826710509084, "loss": 1.4043, "step": 55 }, { "epoch": 1.17, "learning_rate": 0.0001863196645891518, "loss": 1.3152, "step": 56 }, { "epoch": 1.19, "learning_rate": 0.0001857457136130651, "loss": 1.3446, "step": 57 }, { "epoch": 1.21, "learning_rate": 0.00018516089089418549, "loss": 1.3554, "step": 58 }, { "epoch": 1.23, "learning_rate": 0.000184565270582484, "loss": 1.3529, "step": 59 }, { "epoch": 1.25, "learning_rate": 0.00018395892819696389, "loss": 1.3498, "step": 60 }, { "epoch": 1.27, "learning_rate": 0.00018334194061608576, "loss": 1.3836, "step": 61 }, { "epoch": 1.29, "learning_rate": 0.00018271438606801986, "loss": 1.3502, "step": 62 }, { "epoch": 1.31, "learning_rate": 0.00018207634412072764, "loss": 1.3961, "step": 63 }, { "epoch": 1.33, "learning_rate": 0.00018142789567187327, "loss": 1.319, "step": 64 }, { "epoch": 1.35, "learning_rate": 0.0001807691229385665, "loss": 1.3707, "step": 65 }, { "epoch": 1.38, "learning_rate": 0.00018010010944693848, "loss": 1.3051, "step": 66 }, { "epoch": 1.4, "learning_rate": 0.0001794209400215512, "loss": 1.3259, "step": 67 }, { "epoch": 1.42, "learning_rate": 0.00017873170077464283, "loss": 1.3874, "step": 68 }, { "epoch": 1.44, "learning_rate": 0.0001780324790952092, "loss": 1.3599, "step": 69 }, { "epoch": 1.46, "learning_rate": 0.00017732336363792395, "loss": 1.3649, "step": 70 }, { "epoch": 1.48, "learning_rate": 0.0001766044443118978, "loss": 1.3707, "step": 71 }, { "epoch": 1.5, "learning_rate": 0.0001758758122692791, "loss": 1.3653, "step": 72 }, { "epoch": 1.52, "learning_rate": 0.00017513755989369636, "loss": 1.3411, "step": 73 }, { "epoch": 1.54, "learning_rate": 0.00017438978078854512, "loss": 1.3397, "step": 74 }, { "epoch": 1.56, "learning_rate": 0.00017363256976511972, "loss": 1.3914, "step": 75 }, { "epoch": 1.58, "learning_rate": 0.00017286602283059238, "loss": 1.3515, "step": 76 }, { "epoch": 1.6, "learning_rate": 0.00017209023717584013, "loss": 1.3399, "step": 77 }, { "epoch": 1.62, "learning_rate": 0.00017130531116312203, "loss": 1.3233, "step": 78 }, { "epoch": 1.65, "learning_rate": 0.00017051134431360796, "loss": 1.3346, "step": 79 }, { "epoch": 1.67, "learning_rate": 0.00016970843729475991, "loss": 1.3604, "step": 80 }, { "epoch": 1.69, "learning_rate": 0.00016889669190756868, "loss": 1.3503, "step": 81 }, { "epoch": 1.71, "learning_rate": 0.00016807621107364613, "loss": 1.3472, "step": 82 }, { "epoch": 1.73, "learning_rate": 0.00016724709882217603, "loss": 1.3482, "step": 83 }, { "epoch": 1.75, "learning_rate": 0.00016640946027672392, "loss": 1.3414, "step": 84 }, { "epoch": 1.77, "learning_rate": 0.00016556340164190845, "loss": 1.4021, "step": 85 }, { "epoch": 1.79, "learning_rate": 0.00016470903018993578, "loss": 1.3689, "step": 86 }, { "epoch": 1.81, "learning_rate": 0.00016384645424699835, "loss": 1.3341, "step": 87 }, { "epoch": 1.83, "learning_rate": 0.00016297578317954025, "loss": 1.3414, "step": 88 }, { "epoch": 1.85, "learning_rate": 0.00016209712738039049, "loss": 1.3386, "step": 89 }, { "epoch": 1.88, "learning_rate": 0.0001612105982547663, "loss": 1.3425, "step": 90 }, { "epoch": 1.9, "learning_rate": 0.00016031630820614797, "loss": 1.3599, "step": 91 }, { "epoch": 1.92, "learning_rate": 0.0001594143706220273, "loss": 1.3706, "step": 92 }, { "epoch": 1.94, "learning_rate": 0.00015850489985953076, "loss": 1.3107, "step": 93 }, { "epoch": 1.96, "learning_rate": 0.00015758801123092066, "loss": 1.3324, "step": 94 }, { "epoch": 1.98, "learning_rate": 0.00015666382098897412, "loss": 1.3469, "step": 95 }, { "epoch": 2.0, "learning_rate": 0.00015573244631224365, "loss": 1.1865, "step": 96 }, { "epoch": 2.02, "learning_rate": 0.00015479400529019985, "loss": 1.181, "step": 97 }, { "epoch": 2.04, "learning_rate": 0.0001538486169082589, "loss": 1.1968, "step": 98 }, { "epoch": 2.06, "learning_rate": 0.00015289640103269625, "loss": 1.1365, "step": 99 }, { "epoch": 2.08, "learning_rate": 0.00015193747839544876, "loss": 1.1734, "step": 100 }, { "epoch": 2.1, "learning_rate": 0.00015097197057880706, "loss": 1.1399, "step": 101 }, { "epoch": 2.12, "learning_rate": 0.00015000000000000001, "loss": 1.0993, "step": 102 }, { "epoch": 2.15, "learning_rate": 0.00014902168989567335, "loss": 1.1613, "step": 103 }, { "epoch": 2.17, "learning_rate": 0.00014803716430626456, "loss": 1.1317, "step": 104 }, { "epoch": 2.19, "learning_rate": 0.0001470465480602756, "loss": 1.1656, "step": 105 }, { "epoch": 2.21, "learning_rate": 0.00014604996675844585, "loss": 1.1501, "step": 106 }, { "epoch": 2.23, "learning_rate": 0.0001450475467578273, "loss": 1.1233, "step": 107 }, { "epoch": 2.25, "learning_rate": 0.00014403941515576344, "loss": 1.1439, "step": 108 }, { "epoch": 2.27, "learning_rate": 0.0001430256997737746, "loss": 1.1664, "step": 109 }, { "epoch": 2.29, "learning_rate": 0.0001420065291413515, "loss": 1.1584, "step": 110 }, { "epoch": 2.31, "learning_rate": 0.00014098203247965875, "loss": 1.1161, "step": 111 }, { "epoch": 2.33, "learning_rate": 0.00013995233968515104, "loss": 1.1207, "step": 112 }, { "epoch": 2.35, "learning_rate": 0.0001389175813131033, "loss": 1.1216, "step": 113 }, { "epoch": 2.38, "learning_rate": 0.0001378778885610576, "loss": 1.1373, "step": 114 }, { "epoch": 2.4, "learning_rate": 0.00013683339325218873, "loss": 1.1292, "step": 115 }, { "epoch": 2.42, "learning_rate": 0.00013578422781858993, "loss": 1.1381, "step": 116 }, { "epoch": 2.44, "learning_rate": 0.00013473052528448201, "loss": 1.1236, "step": 117 }, { "epoch": 2.46, "learning_rate": 0.00013367241924934714, "loss": 1.1479, "step": 118 }, { "epoch": 2.48, "learning_rate": 0.0001326100438709895, "loss": 1.1445, "step": 119 }, { "epoch": 2.5, "learning_rate": 0.00013154353384852558, "loss": 1.1658, "step": 120 }, { "epoch": 2.52, "learning_rate": 0.00013047302440530537, "loss": 1.1165, "step": 121 }, { "epoch": 2.54, "learning_rate": 0.0001293986512717677, "loss": 1.1418, "step": 122 }, { "epoch": 2.56, "learning_rate": 0.00012832055066823038, "loss": 1.1195, "step": 123 }, { "epoch": 2.58, "learning_rate": 0.00012723885928761933, "loss": 1.1395, "step": 124 }, { "epoch": 2.6, "learning_rate": 0.0001261537142781367, "loss": 1.1039, "step": 125 }, { "epoch": 2.62, "learning_rate": 0.00012506525322587207, "loss": 1.1465, "step": 126 }, { "epoch": 2.65, "learning_rate": 0.00012397361413735784, "loss": 1.1199, "step": 127 }, { "epoch": 2.67, "learning_rate": 0.0001228789354220712, "loss": 1.1098, "step": 128 }, { "epoch": 2.69, "learning_rate": 0.00012178135587488515, "loss": 1.1116, "step": 129 }, { "epoch": 2.71, "learning_rate": 0.00012068101465847075, "loss": 1.1007, "step": 130 }, { "epoch": 2.73, "learning_rate": 0.00011957805128565232, "loss": 1.1688, "step": 131 }, { "epoch": 2.75, "learning_rate": 0.00011847260560171896, "loss": 1.1404, "step": 132 }, { "epoch": 2.77, "learning_rate": 0.00011736481776669306, "loss": 1.1526, "step": 133 }, { "epoch": 2.79, "learning_rate": 0.00011625482823755965, "loss": 1.1444, "step": 134 }, { "epoch": 2.81, "learning_rate": 0.00011514277775045768, "loss": 1.1252, "step": 135 }, { "epoch": 2.83, "learning_rate": 0.00011402880730283598, "loss": 1.1544, "step": 136 }, { "epoch": 2.85, "learning_rate": 0.00011291305813557615, "loss": 1.1349, "step": 137 }, { "epoch": 2.88, "learning_rate": 0.00011179567171508463, "loss": 1.1086, "step": 138 }, { "epoch": 2.9, "learning_rate": 0.00011067678971535589, "loss": 1.1274, "step": 139 }, { "epoch": 2.92, "learning_rate": 0.00010955655400000984, "loss": 1.1648, "step": 140 }, { "epoch": 2.94, "learning_rate": 0.00010843510660430447, "loss": 1.1158, "step": 141 }, { "epoch": 2.96, "learning_rate": 0.00010731258971712761, "loss": 1.1279, "step": 142 }, { "epoch": 2.98, "learning_rate": 0.0001061891456629682, "loss": 1.1101, "step": 143 }, { "epoch": 3.0, "learning_rate": 0.00010506491688387127, "loss": 1.0081, "step": 144 }, { "epoch": 3.02, "learning_rate": 0.00010394004592137757, "loss": 0.9643, "step": 145 }, { "epoch": 3.04, "learning_rate": 0.00010281467539845051, "loss": 0.9642, "step": 146 }, { "epoch": 3.06, "learning_rate": 0.0001016889480013931, "loss": 0.9563, "step": 147 }, { "epoch": 3.08, "learning_rate": 0.0001005630064617566, "loss": 0.9485, "step": 148 }, { "epoch": 3.1, "learning_rate": 9.943699353824345e-05, "loss": 0.9159, "step": 149 }, { "epoch": 3.12, "learning_rate": 9.83110519986069e-05, "loss": 0.9132, "step": 150 }, { "epoch": 3.15, "learning_rate": 9.718532460154948e-05, "loss": 0.929, "step": 151 }, { "epoch": 3.17, "learning_rate": 9.605995407862247e-05, "loss": 0.9148, "step": 152 }, { "epoch": 3.19, "learning_rate": 9.493508311612874e-05, "loss": 0.9436, "step": 153 }, { "epoch": 3.21, "learning_rate": 9.381085433703182e-05, "loss": 0.9221, "step": 154 }, { "epoch": 3.23, "learning_rate": 9.268741028287239e-05, "loss": 0.8885, "step": 155 }, { "epoch": 3.25, "learning_rate": 9.156489339569554e-05, "loss": 0.9517, "step": 156 }, { "epoch": 3.27, "learning_rate": 9.04434459999902e-05, "loss": 0.9354, "step": 157 }, { "epoch": 3.29, "learning_rate": 8.932321028464412e-05, "loss": 0.9805, "step": 158 }, { "epoch": 3.31, "learning_rate": 8.820432828491542e-05, "loss": 0.95, "step": 159 }, { "epoch": 3.33, "learning_rate": 8.708694186442388e-05, "loss": 0.953, "step": 160 }, { "epoch": 3.35, "learning_rate": 8.597119269716403e-05, "loss": 0.9426, "step": 161 }, { "epoch": 3.38, "learning_rate": 8.485722224954237e-05, "loss": 0.9284, "step": 162 }, { "epoch": 3.4, "learning_rate": 8.374517176244038e-05, "loss": 0.9287, "step": 163 }, { "epoch": 3.42, "learning_rate": 8.263518223330697e-05, "loss": 0.9112, "step": 164 }, { "epoch": 3.44, "learning_rate": 8.15273943982811e-05, "loss": 0.9438, "step": 165 }, { "epoch": 3.46, "learning_rate": 8.04219487143477e-05, "loss": 0.9362, "step": 166 }, { "epoch": 3.48, "learning_rate": 7.931898534152928e-05, "loss": 0.9414, "step": 167 }, { "epoch": 3.5, "learning_rate": 7.821864412511485e-05, "loss": 0.9463, "step": 168 }, { "epoch": 3.52, "learning_rate": 7.712106457792884e-05, "loss": 0.9643, "step": 169 }, { "epoch": 3.54, "learning_rate": 7.602638586264219e-05, "loss": 0.9276, "step": 170 }, { "epoch": 3.56, "learning_rate": 7.493474677412794e-05, "loss": 0.9255, "step": 171 }, { "epoch": 3.58, "learning_rate": 7.384628572186333e-05, "loss": 0.9094, "step": 172 }, { "epoch": 3.6, "learning_rate": 7.276114071238069e-05, "loss": 0.9376, "step": 173 }, { "epoch": 3.62, "learning_rate": 7.16794493317696e-05, "loss": 0.9424, "step": 174 }, { "epoch": 3.65, "learning_rate": 7.060134872823234e-05, "loss": 0.92, "step": 175 }, { "epoch": 3.67, "learning_rate": 6.952697559469464e-05, "loss": 0.9739, "step": 176 }, { "epoch": 3.69, "learning_rate": 6.845646615147445e-05, "loss": 0.9308, "step": 177 }, { "epoch": 3.71, "learning_rate": 6.738995612901051e-05, "loss": 0.9392, "step": 178 }, { "epoch": 3.73, "learning_rate": 6.632758075065288e-05, "loss": 0.9179, "step": 179 }, { "epoch": 3.75, "learning_rate": 6.526947471551798e-05, "loss": 0.9393, "step": 180 }, { "epoch": 3.77, "learning_rate": 6.421577218141008e-05, "loss": 0.9353, "step": 181 }, { "epoch": 3.79, "learning_rate": 6.31666067478113e-05, "loss": 0.9051, "step": 182 }, { "epoch": 3.81, "learning_rate": 6.21221114389424e-05, "loss": 0.9285, "step": 183 }, { "epoch": 3.83, "learning_rate": 6.108241868689675e-05, "loss": 0.9345, "step": 184 }, { "epoch": 3.85, "learning_rate": 6.0047660314849006e-05, "loss": 0.8936, "step": 185 }, { "epoch": 3.88, "learning_rate": 5.901796752034128e-05, "loss": 0.9324, "step": 186 }, { "epoch": 3.9, "learning_rate": 5.799347085864851e-05, "loss": 0.9514, "step": 187 }, { "epoch": 3.92, "learning_rate": 5.697430022622542e-05, "loss": 0.9571, "step": 188 }, { "epoch": 3.94, "learning_rate": 5.596058484423656e-05, "loss": 0.9109, "step": 189 }, { "epoch": 3.96, "learning_rate": 5.495245324217271e-05, "loss": 0.92, "step": 190 }, { "epoch": 3.98, "learning_rate": 5.3950033241554146e-05, "loss": 0.9387, "step": 191 }, { "epoch": 4.0, "learning_rate": 5.2953451939724454e-05, "loss": 0.8087, "step": 192 }, { "epoch": 4.02, "learning_rate": 5.19628356937355e-05, "loss": 0.8224, "step": 193 }, { "epoch": 4.04, "learning_rate": 5.097831010432666e-05, "loss": 0.7526, "step": 194 }, { "epoch": 4.06, "learning_rate": 5.000000000000002e-05, "loss": 0.7709, "step": 195 }, { "epoch": 4.08, "learning_rate": 4.902802942119293e-05, "loss": 0.7642, "step": 196 }, { "epoch": 4.1, "learning_rate": 4.806252160455125e-05, "loss": 0.7933, "step": 197 }, { "epoch": 4.12, "learning_rate": 4.710359896730379e-05, "loss": 0.789, "step": 198 }, { "epoch": 4.15, "learning_rate": 4.6151383091741115e-05, "loss": 0.7902, "step": 199 }, { "epoch": 4.17, "learning_rate": 4.520599470980015e-05, "loss": 0.7809, "step": 200 }, { "epoch": 4.19, "learning_rate": 4.426755368775637e-05, "loss": 0.75, "step": 201 }, { "epoch": 4.21, "learning_rate": 4.333617901102591e-05, "loss": 0.8145, "step": 202 }, { "epoch": 4.23, "learning_rate": 4.241198876907936e-05, "loss": 0.796, "step": 203 }, { "epoch": 4.25, "learning_rate": 4.149510014046922e-05, "loss": 0.7791, "step": 204 }, { "epoch": 4.27, "learning_rate": 4.0585629377972744e-05, "loss": 0.7962, "step": 205 }, { "epoch": 4.29, "learning_rate": 3.968369179385204e-05, "loss": 0.7876, "step": 206 }, { "epoch": 4.31, "learning_rate": 3.878940174523371e-05, "loss": 0.7861, "step": 207 }, { "epoch": 4.33, "learning_rate": 3.790287261960953e-05, "loss": 0.7772, "step": 208 }, { "epoch": 4.35, "learning_rate": 3.7024216820459756e-05, "loss": 0.7788, "step": 209 }, { "epoch": 4.38, "learning_rate": 3.615354575300166e-05, "loss": 0.8003, "step": 210 }, { "epoch": 4.4, "learning_rate": 3.5290969810064255e-05, "loss": 0.7462, "step": 211 }, { "epoch": 4.42, "learning_rate": 3.443659835809158e-05, "loss": 0.7519, "step": 212 }, { "epoch": 4.44, "learning_rate": 3.3590539723276083e-05, "loss": 0.7578, "step": 213 }, { "epoch": 4.46, "learning_rate": 3.275290117782397e-05, "loss": 0.7778, "step": 214 }, { "epoch": 4.48, "learning_rate": 3.1923788926353884e-05, "loss": 0.7937, "step": 215 }, { "epoch": 4.5, "learning_rate": 3.110330809243134e-05, "loss": 0.79, "step": 216 }, { "epoch": 4.52, "learning_rate": 3.0291562705240105e-05, "loss": 0.7717, "step": 217 }, { "epoch": 4.54, "learning_rate": 2.9488655686392086e-05, "loss": 0.7414, "step": 218 }, { "epoch": 4.56, "learning_rate": 2.869468883687798e-05, "loss": 0.7859, "step": 219 }, { "epoch": 4.58, "learning_rate": 2.790976282415989e-05, "loss": 0.7633, "step": 220 }, { "epoch": 4.6, "learning_rate": 2.713397716940763e-05, "loss": 0.7858, "step": 221 }, { "epoch": 4.62, "learning_rate": 2.6367430234880284e-05, "loss": 0.8008, "step": 222 }, { "epoch": 4.65, "learning_rate": 2.56102192114549e-05, "loss": 0.7677, "step": 223 }, { "epoch": 4.67, "learning_rate": 2.4862440106303665e-05, "loss": 0.7928, "step": 224 }, { "epoch": 4.69, "learning_rate": 2.4124187730720917e-05, "loss": 0.7949, "step": 225 }, { "epoch": 4.71, "learning_rate": 2.339555568810221e-05, "loss": 0.7621, "step": 226 }, { "epoch": 4.73, "learning_rate": 2.2676636362076076e-05, "loss": 0.7776, "step": 227 }, { "epoch": 4.75, "learning_rate": 2.1967520904790827e-05, "loss": 0.7959, "step": 228 }, { "epoch": 4.77, "learning_rate": 2.126829922535718e-05, "loss": 0.7657, "step": 229 }, { "epoch": 4.79, "learning_rate": 2.05790599784488e-05, "loss": 0.7759, "step": 230 }, { "epoch": 4.81, "learning_rate": 1.9899890553061562e-05, "loss": 0.7679, "step": 231 }, { "epoch": 4.83, "learning_rate": 1.9230877061433507e-05, "loss": 0.7726, "step": 232 }, { "epoch": 4.85, "learning_rate": 1.857210432812674e-05, "loss": 0.7841, "step": 233 }, { "epoch": 4.88, "learning_rate": 1.7923655879272393e-05, "loss": 0.762, "step": 234 }, { "epoch": 4.9, "learning_rate": 1.728561393198016e-05, "loss": 0.7579, "step": 235 }, { "epoch": 4.92, "learning_rate": 1.6658059383914248e-05, "loss": 0.7911, "step": 236 }, { "epoch": 4.94, "learning_rate": 1.60410718030361e-05, "loss": 0.7901, "step": 237 }, { "epoch": 4.96, "learning_rate": 1.5434729417516047e-05, "loss": 0.7889, "step": 238 }, { "epoch": 4.98, "learning_rate": 1.483910910581452e-05, "loss": 0.7559, "step": 239 }, { "epoch": 5.0, "learning_rate": 1.425428638693489e-05, "loss": 0.7221, "step": 240 }, { "epoch": 5.02, "learning_rate": 1.368033541084821e-05, "loss": 0.6921, "step": 241 }, { "epoch": 5.04, "learning_rate": 1.3117328949091634e-05, "loss": 0.7043, "step": 242 }, { "epoch": 5.06, "learning_rate": 1.2565338385541792e-05, "loss": 0.7169, "step": 243 }, { "epoch": 5.08, "learning_rate": 1.2024433707364002e-05, "loss": 0.7287, "step": 244 }, { "epoch": 5.1, "learning_rate": 1.1494683496138458e-05, "loss": 0.6954, "step": 245 }, { "epoch": 5.12, "learning_rate": 1.097615491916485e-05, "loss": 0.6995, "step": 246 }, { "epoch": 5.15, "learning_rate": 1.0468913720946084e-05, "loss": 0.7289, "step": 247 }, { "epoch": 5.17, "learning_rate": 9.973024214852567e-06, "loss": 0.7059, "step": 248 }, { "epoch": 5.19, "learning_rate": 9.488549274967872e-06, "loss": 0.6897, "step": 249 }, { "epoch": 5.21, "learning_rate": 9.015550328116939e-06, "loss": 0.695, "step": 250 }, { "epoch": 5.23, "learning_rate": 8.554087346077633e-06, "loss": 0.7015, "step": 251 }, { "epoch": 5.25, "learning_rate": 8.10421883797694e-06, "loss": 0.7266, "step": 252 }, { "epoch": 5.27, "learning_rate": 7.666001842872638e-06, "loss": 0.7254, "step": 253 }, { "epoch": 5.29, "learning_rate": 7.239491922521246e-06, "loss": 0.7123, "step": 254 }, { "epoch": 5.31, "learning_rate": 6.824743154333157e-06, "loss": 0.6989, "step": 255 }, { "epoch": 5.33, "learning_rate": 6.421808124516437e-06, "loss": 0.7203, "step": 256 }, { "epoch": 5.35, "learning_rate": 6.030737921409169e-06, "loss": 0.6886, "step": 257 }, { "epoch": 5.38, "learning_rate": 5.651582129001986e-06, "loss": 0.7221, "step": 258 }, { "epoch": 5.4, "learning_rate": 5.284388820651331e-06, "loss": 0.6851, "step": 259 }, { "epoch": 5.42, "learning_rate": 4.929204552984168e-06, "loss": 0.7138, "step": 260 }, { "epoch": 5.44, "learning_rate": 4.586074359995119e-06, "loss": 0.712, "step": 261 }, { "epoch": 5.46, "learning_rate": 4.255041747336452e-06, "loss": 0.713, "step": 262 }, { "epoch": 5.48, "learning_rate": 3.936148686802077e-06, "loss": 0.7207, "step": 263 }, { "epoch": 5.5, "learning_rate": 3.6294356110059157e-06, "loss": 0.6864, "step": 264 }, { "epoch": 5.52, "learning_rate": 3.3349414082553875e-06, "loss": 0.6759, "step": 265 }, { "epoch": 5.54, "learning_rate": 3.0527034176207727e-06, "loss": 0.7169, "step": 266 }, { "epoch": 5.56, "learning_rate": 2.7827574242009437e-06, "loss": 0.6766, "step": 267 }, { "epoch": 5.58, "learning_rate": 2.525137654586185e-06, "loss": 0.6986, "step": 268 }, { "epoch": 5.6, "learning_rate": 2.2798767725185853e-06, "loss": 0.707, "step": 269 }, { "epoch": 5.62, "learning_rate": 2.0470058747505516e-06, "loss": 0.7098, "step": 270 }, { "epoch": 5.65, "learning_rate": 1.8265544871020723e-06, "loss": 0.7201, "step": 271 }, { "epoch": 5.67, "learning_rate": 1.6185505607171026e-06, "loss": 0.7201, "step": 272 }, { "epoch": 5.69, "learning_rate": 1.4230204685196203e-06, "loss": 0.6995, "step": 273 }, { "epoch": 5.71, "learning_rate": 1.2399890018698347e-06, "loss": 0.6899, "step": 274 }, { "epoch": 5.73, "learning_rate": 1.0694793674208114e-06, "loss": 0.6689, "step": 275 }, { "epoch": 5.75, "learning_rate": 9.11513184176116e-07, "loss": 0.7259, "step": 276 }, { "epoch": 5.77, "learning_rate": 7.661104807487607e-07, "loss": 0.7346, "step": 277 }, { "epoch": 5.79, "learning_rate": 6.332896928217257e-07, "loss": 0.701, "step": 278 }, { "epoch": 5.81, "learning_rate": 5.130676608104845e-07, "loss": 0.6666, "step": 279 }, { "epoch": 5.83, "learning_rate": 4.054596277278666e-07, "loss": 0.6859, "step": 280 }, { "epoch": 5.85, "learning_rate": 3.104792372512821e-07, "loss": 0.6497, "step": 281 }, { "epoch": 5.88, "learning_rate": 2.2813853199292746e-07, "loss": 0.7257, "step": 282 }, { "epoch": 5.9, "learning_rate": 1.58447951972851e-07, "loss": 0.6726, "step": 283 }, { "epoch": 5.92, "learning_rate": 1.0141633329525668e-07, "loss": 0.7093, "step": 284 }, { "epoch": 5.94, "learning_rate": 5.705090702819993e-08, "loss": 0.708, "step": 285 }, { "epoch": 5.96, "learning_rate": 2.5357298286698973e-08, "loss": 0.7173, "step": 286 }, { "epoch": 5.98, "learning_rate": 6.3395255195941585e-09, "loss": 0.6994, "step": 287 }, { "epoch": 6.0, "learning_rate": 0.0, "loss": 0.6607, "step": 288 }, { "epoch": 6.0, "step": 288, "total_flos": 86917169807360.0, "train_loss": 1.0930437861631315, "train_runtime": 2990.5988, "train_samples_per_second": 18.903, "train_steps_per_second": 0.096 } ], "logging_steps": 1, "max_steps": 288, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 50000, "total_flos": 86917169807360.0, "train_batch_size": 50, "trial_name": null, "trial_params": null }