{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0172413793103448, "eval_steps": 500, "global_step": 178, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "learning_rate": 2.5e-05, "loss": 1.8823, "step": 1 }, { "epoch": 0.03, "learning_rate": 5e-05, "loss": 1.8925, "step": 2 }, { "epoch": 0.05, "learning_rate": 7.5e-05, "loss": 1.8355, "step": 3 }, { "epoch": 0.07, "learning_rate": 0.0001, "loss": 1.8682, "step": 4 }, { "epoch": 0.09, "learning_rate": 0.000125, "loss": 1.6578, "step": 5 }, { "epoch": 0.1, "learning_rate": 0.00015, "loss": 1.7746, "step": 6 }, { "epoch": 0.12, "learning_rate": 0.000175, "loss": 1.7078, "step": 7 }, { "epoch": 0.14, "learning_rate": 0.0002, "loss": 2.3556, "step": 8 }, { "epoch": 0.16, "learning_rate": 0.00022500000000000002, "loss": 1.785, "step": 9 }, { "epoch": 0.17, "learning_rate": 0.00025, "loss": 1.7491, "step": 10 }, { "epoch": 0.19, "learning_rate": 0.0002499874839708436, "loss": 1.7576, "step": 11 }, { "epoch": 0.21, "learning_rate": 0.0002499499383897902, "loss": 1.7944, "step": 12 }, { "epoch": 0.22, "learning_rate": 0.0002498873707755852, "loss": 1.8368, "step": 13 }, { "epoch": 0.24, "learning_rate": 0.0002497997936577979, "loss": 1.7509, "step": 14 }, { "epoch": 0.26, "learning_rate": 0.0002496872245743125, "loss": 1.9833, "step": 15 }, { "epoch": 0.28, "learning_rate": 0.0002495496860678158, "loss": 1.7853, "step": 16 }, { "epoch": 0.29, "learning_rate": 0.00024938720568128326, "loss": 1.7627, "step": 17 }, { "epoch": 0.31, "learning_rate": 0.0002491998159524629, "loss": 1.7721, "step": 18 }, { "epoch": 0.33, "learning_rate": 0.0002489875544073596, "loss": 1.6278, "step": 19 }, { "epoch": 0.34, "learning_rate": 0.00024875046355272046, "loss": 1.6963, "step": 20 }, { "epoch": 0.36, "learning_rate": 0.0002484885908675223, "loss": 1.7123, "step": 21 }, { "epoch": 0.38, "learning_rate": 0.0002482019887934636, "loss": 1.7719, "step": 22 }, { "epoch": 0.4, "learning_rate": 0.00024789071472446306, "loss": 1.8138, "step": 23 }, { "epoch": 0.41, "learning_rate": 0.00024755483099516584, "loss": 1.8235, "step": 24 }, { "epoch": 0.43, "learning_rate": 0.0002471944048684608, "loss": 1.7845, "step": 25 }, { "epoch": 0.45, "learning_rate": 0.0002468095085220104, "loss": 1.6665, "step": 26 }, { "epoch": 0.47, "learning_rate": 0.00024640021903379704, "loss": 1.6388, "step": 27 }, { "epoch": 0.48, "learning_rate": 0.00024596661836668736, "loss": 1.6817, "step": 28 }, { "epoch": 0.5, "learning_rate": 0.0002455087933520188, "loss": 1.7617, "step": 29 }, { "epoch": 0.52, "learning_rate": 0.0002450268356722112, "loss": 1.6921, "step": 30 }, { "epoch": 0.53, "learning_rate": 0.00024452084184240636, "loss": 1.6591, "step": 31 }, { "epoch": 0.55, "learning_rate": 0.00024399091319114082, "loss": 1.8172, "step": 32 }, { "epoch": 0.57, "learning_rate": 0.00024343715584005372, "loss": 1.7321, "step": 33 }, { "epoch": 0.59, "learning_rate": 0.00024285968068263553, "loss": 1.7617, "step": 34 }, { "epoch": 0.6, "learning_rate": 0.00024225860336202074, "loss": 1.7899, "step": 35 }, { "epoch": 0.62, "learning_rate": 0.00024163404424782967, "loss": 1.7486, "step": 36 }, { "epoch": 0.64, "learning_rate": 0.0002409861284120637, "loss": 1.7422, "step": 37 }, { "epoch": 0.66, "learning_rate": 0.0002403149856040586, "loss": 1.6344, "step": 38 }, { "epoch": 0.67, "learning_rate": 0.0002396207502245017, "loss": 1.7174, "step": 39 }, { "epoch": 0.69, "learning_rate": 0.00023890356129851697, "loss": 1.7665, "step": 40 }, { "epoch": 0.71, "learning_rate": 0.00023816356244782462, "loss": 1.7379, "step": 41 }, { "epoch": 0.72, "learning_rate": 0.0002374009018619796, "loss": 1.7206, "step": 42 }, { "epoch": 0.74, "learning_rate": 0.00023661573226869606, "loss": 1.6732, "step": 43 }, { "epoch": 0.76, "learning_rate": 0.00023580821090326233, "loss": 1.7475, "step": 44 }, { "epoch": 0.78, "learning_rate": 0.00023497849947705368, "loss": 1.7546, "step": 45 }, { "epoch": 0.79, "learning_rate": 0.00023412676414514853, "loss": 1.6588, "step": 46 }, { "epoch": 0.81, "learning_rate": 0.00023325317547305487, "loss": 1.7417, "step": 47 }, { "epoch": 0.83, "learning_rate": 0.00023235790840255328, "loss": 1.7224, "step": 48 }, { "epoch": 0.84, "learning_rate": 0.0002314411422166639, "loss": 1.6476, "step": 49 }, { "epoch": 0.86, "learning_rate": 0.00023050306050374382, "loss": 1.73, "step": 50 }, { "epoch": 0.88, "learning_rate": 0.00022954385112072202, "loss": 1.6658, "step": 51 }, { "epoch": 0.9, "learning_rate": 0.00022856370615548027, "loss": 1.7046, "step": 52 }, { "epoch": 0.91, "learning_rate": 0.00022756282188838596, "loss": 1.7037, "step": 53 }, { "epoch": 0.93, "learning_rate": 0.00022654139875298573, "loss": 1.7166, "step": 54 }, { "epoch": 0.95, "learning_rate": 0.00022549964129586758, "loss": 1.7816, "step": 55 }, { "epoch": 0.97, "learning_rate": 0.00022443775813569874, "loss": 1.7556, "step": 56 }, { "epoch": 0.98, "learning_rate": 0.00022335596192144875, "loss": 1.7044, "step": 57 }, { "epoch": 1.0, "learning_rate": 0.00022225446928980495, "loss": 1.7183, "step": 58 }, { "epoch": 1.02, "learning_rate": 0.0002211335008217896, "loss": 1.738, "step": 59 }, { "epoch": 1.01, "learning_rate": 0.00021999328099858717, "loss": 1.6217, "step": 60 }, { "epoch": 1.03, "learning_rate": 0.00021883403815659054, "loss": 1.5942, "step": 61 }, { "epoch": 1.04, "learning_rate": 0.00021765600444167505, "loss": 1.5313, "step": 62 }, { "epoch": 1.06, "learning_rate": 0.00021645941576270978, "loss": 1.6038, "step": 63 }, { "epoch": 1.08, "learning_rate": 0.00021524451174431556, "loss": 1.5116, "step": 64 }, { "epoch": 1.09, "learning_rate": 0.00021401153567887822, "loss": 1.5218, "step": 65 }, { "epoch": 1.11, "learning_rate": 0.00021276073447782786, "loss": 1.4994, "step": 66 }, { "epoch": 1.13, "learning_rate": 0.0002114923586221933, "loss": 1.5652, "step": 67 }, { "epoch": 1.15, "learning_rate": 0.00021020666211244162, "loss": 1.5574, "step": 68 }, { "epoch": 1.16, "learning_rate": 0.00020890390241761286, "loss": 1.5671, "step": 69 }, { "epoch": 1.18, "learning_rate": 0.00020758434042376017, "loss": 1.4294, "step": 70 }, { "epoch": 1.2, "learning_rate": 0.00020624824038170583, "loss": 1.5317, "step": 71 }, { "epoch": 1.22, "learning_rate": 0.0002048958698541231, "loss": 1.5671, "step": 72 }, { "epoch": 1.23, "learning_rate": 0.0002035274996619553, "loss": 1.5256, "step": 73 }, { "epoch": 1.25, "learning_rate": 0.00020214340383018192, "loss": 1.5205, "step": 74 }, { "epoch": 1.27, "learning_rate": 0.00020074385953294352, "loss": 1.493, "step": 75 }, { "epoch": 1.28, "learning_rate": 0.00019932914703803584, "loss": 1.4958, "step": 76 }, { "epoch": 1.3, "learning_rate": 0.00019789954965078417, "loss": 1.498, "step": 77 }, { "epoch": 1.32, "learning_rate": 0.00019645535365730985, "loss": 1.4321, "step": 78 }, { "epoch": 1.34, "learning_rate": 0.00019499684826719944, "loss": 1.3947, "step": 79 }, { "epoch": 1.35, "learning_rate": 0.00019352432555558874, "loss": 1.4616, "step": 80 }, { "epoch": 1.37, "learning_rate": 0.0001920380804046728, "loss": 1.4462, "step": 81 }, { "epoch": 1.39, "learning_rate": 0.00019053841044465398, "loss": 1.4964, "step": 82 }, { "epoch": 1.41, "learning_rate": 0.00018902561599413927, "loss": 1.5132, "step": 83 }, { "epoch": 1.42, "learning_rate": 0.0001875, "loss": 1.4863, "step": 84 }, { "epoch": 1.44, "learning_rate": 0.00018596186797670432, "loss": 1.475, "step": 85 }, { "epoch": 1.46, "learning_rate": 0.00018441152794513625, "loss": 1.3819, "step": 86 }, { "epoch": 1.47, "learning_rate": 0.00018284929037091238, "loss": 1.4123, "step": 87 }, { "epoch": 1.49, "learning_rate": 0.00018127546810220915, "loss": 1.4385, "step": 88 }, { "epoch": 1.51, "learning_rate": 0.00017969037630711306, "loss": 1.4143, "step": 89 }, { "epoch": 1.53, "learning_rate": 0.000178094332410506, "loss": 1.368, "step": 90 }, { "epoch": 1.54, "learning_rate": 0.0001764876560304991, "loss": 1.4466, "step": 91 }, { "epoch": 1.56, "learning_rate": 0.00017487066891442703, "loss": 1.4604, "step": 92 }, { "epoch": 1.58, "learning_rate": 0.0001732436948744161, "loss": 1.4397, "step": 93 }, { "epoch": 1.59, "learning_rate": 0.0001716070597225386, "loss": 1.4631, "step": 94 }, { "epoch": 1.61, "learning_rate": 0.000169961091205567, "loss": 1.4424, "step": 95 }, { "epoch": 1.63, "learning_rate": 0.00016830611893934048, "loss": 1.4344, "step": 96 }, { "epoch": 1.65, "learning_rate": 0.0001666424743427572, "loss": 1.3999, "step": 97 }, { "epoch": 1.66, "learning_rate": 0.00016497049057140567, "loss": 1.3305, "step": 98 }, { "epoch": 1.68, "learning_rate": 0.00016329050245084795, "loss": 1.4365, "step": 99 }, { "epoch": 1.7, "learning_rate": 0.0001616028464095688, "loss": 1.4156, "step": 100 }, { "epoch": 1.72, "learning_rate": 0.00015990786041160377, "loss": 1.4368, "step": 101 }, { "epoch": 1.73, "learning_rate": 0.00015820588388885953, "loss": 1.3383, "step": 102 }, { "epoch": 1.75, "learning_rate": 0.00015649725767314065, "loss": 1.4104, "step": 103 }, { "epoch": 1.77, "learning_rate": 0.0001547823239278956, "loss": 1.4642, "step": 104 }, { "epoch": 1.78, "learning_rate": 0.00015306142607969655, "loss": 1.349, "step": 105 }, { "epoch": 1.8, "learning_rate": 0.0001513349087494658, "loss": 1.3414, "step": 106 }, { "epoch": 1.82, "learning_rate": 0.00014960311768346318, "loss": 1.4445, "step": 107 }, { "epoch": 1.84, "learning_rate": 0.00014786639968404833, "loss": 1.3416, "step": 108 }, { "epoch": 1.85, "learning_rate": 0.00014612510254023115, "loss": 1.3636, "step": 109 }, { "epoch": 1.87, "learning_rate": 0.00014437957495802472, "loss": 1.4371, "step": 110 }, { "epoch": 1.89, "learning_rate": 0.00014263016649061492, "loss": 1.308, "step": 111 }, { "epoch": 1.91, "learning_rate": 0.00014087722746835985, "loss": 1.4221, "step": 112 }, { "epoch": 1.92, "learning_rate": 0.00013912110892863412, "loss": 1.3635, "step": 113 }, { "epoch": 1.94, "learning_rate": 0.00013736216254553124, "loss": 1.4651, "step": 114 }, { "epoch": 1.96, "learning_rate": 0.00013560074055943866, "loss": 1.4113, "step": 115 }, { "epoch": 1.97, "learning_rate": 0.00013383719570649933, "loss": 1.4427, "step": 116 }, { "epoch": 1.99, "learning_rate": 0.00013207188114797405, "loss": 1.4005, "step": 117 }, { "epoch": 2.01, "learning_rate": 0.00013030515039951854, "loss": 1.3678, "step": 118 }, { "epoch": 2.03, "learning_rate": 0.00012853735726038978, "loss": 1.3718, "step": 119 }, { "epoch": 2.02, "learning_rate": 0.00012676885574259528, "loss": 1.2768, "step": 120 }, { "epoch": 2.03, "learning_rate": 0.000125, "loss": 1.2511, "step": 121 }, { "epoch": 2.05, "learning_rate": 0.00012323114425740475, "loss": 1.2024, "step": 122 }, { "epoch": 2.07, "learning_rate": 0.00012146264273961024, "loss": 1.2752, "step": 123 }, { "epoch": 2.09, "learning_rate": 0.00011969484960048149, "loss": 1.1877, "step": 124 }, { "epoch": 2.1, "learning_rate": 0.00011792811885202594, "loss": 1.129, "step": 125 }, { "epoch": 2.12, "learning_rate": 0.00011616280429350067, "loss": 1.1975, "step": 126 }, { "epoch": 2.14, "learning_rate": 0.0001143992594405614, "loss": 1.3163, "step": 127 }, { "epoch": 2.16, "learning_rate": 0.0001126378374544688, "loss": 1.1691, "step": 128 }, { "epoch": 2.17, "learning_rate": 0.00011087889107136592, "loss": 1.2009, "step": 129 }, { "epoch": 2.19, "learning_rate": 0.00010912277253164016, "loss": 1.1294, "step": 130 }, { "epoch": 2.21, "learning_rate": 0.00010736983350938512, "loss": 1.2445, "step": 131 }, { "epoch": 2.22, "learning_rate": 0.00010562042504197526, "loss": 1.2484, "step": 132 }, { "epoch": 2.24, "learning_rate": 0.00010387489745976888, "loss": 1.1636, "step": 133 }, { "epoch": 2.26, "learning_rate": 0.00010213360031595167, "loss": 1.2341, "step": 134 }, { "epoch": 2.28, "learning_rate": 0.00010039688231653683, "loss": 1.16, "step": 135 }, { "epoch": 2.29, "learning_rate": 9.866509125053425e-05, "loss": 1.1895, "step": 136 }, { "epoch": 2.31, "learning_rate": 9.693857392030342e-05, "loss": 1.1355, "step": 137 }, { "epoch": 2.33, "learning_rate": 9.52176760721044e-05, "loss": 1.0434, "step": 138 }, { "epoch": 2.34, "learning_rate": 9.350274232685938e-05, "loss": 1.1358, "step": 139 }, { "epoch": 2.36, "learning_rate": 9.179411611114049e-05, "loss": 1.0848, "step": 140 }, { "epoch": 2.38, "learning_rate": 9.009213958839625e-05, "loss": 1.1591, "step": 141 }, { "epoch": 2.4, "learning_rate": 8.839715359043121e-05, "loss": 1.2907, "step": 142 }, { "epoch": 2.41, "learning_rate": 8.670949754915208e-05, "loss": 1.1806, "step": 143 }, { "epoch": 2.43, "learning_rate": 8.502950942859435e-05, "loss": 1.1443, "step": 144 }, { "epoch": 2.45, "learning_rate": 8.33575256572428e-05, "loss": 1.0913, "step": 145 }, { "epoch": 2.47, "learning_rate": 8.169388106065959e-05, "loss": 1.0982, "step": 146 }, { "epoch": 2.48, "learning_rate": 8.003890879443303e-05, "loss": 1.0401, "step": 147 }, { "epoch": 2.5, "learning_rate": 7.839294027746143e-05, "loss": 1.1177, "step": 148 }, { "epoch": 2.52, "learning_rate": 7.675630512558389e-05, "loss": 1.0619, "step": 149 }, { "epoch": 2.53, "learning_rate": 7.512933108557296e-05, "loss": 1.0004, "step": 150 }, { "epoch": 2.55, "learning_rate": 7.351234396950094e-05, "loss": 1.169, "step": 151 }, { "epoch": 2.57, "learning_rate": 7.190566758949402e-05, "loss": 1.0743, "step": 152 }, { "epoch": 2.59, "learning_rate": 7.030962369288702e-05, "loss": 1.1376, "step": 153 }, { "epoch": 2.6, "learning_rate": 6.872453189779081e-05, "loss": 1.1071, "step": 154 }, { "epoch": 2.62, "learning_rate": 6.715070962908766e-05, "loss": 1.053, "step": 155 }, { "epoch": 2.64, "learning_rate": 6.558847205486374e-05, "loss": 1.0751, "step": 156 }, { "epoch": 2.66, "learning_rate": 6.403813202329571e-05, "loss": 1.0028, "step": 157 }, { "epoch": 2.67, "learning_rate": 6.250000000000003e-05, "loss": 1.0816, "step": 158 }, { "epoch": 2.69, "learning_rate": 6.0974384005860736e-05, "loss": 1.1124, "step": 159 }, { "epoch": 2.71, "learning_rate": 5.9461589555346056e-05, "loss": 1.0811, "step": 160 }, { "epoch": 2.72, "learning_rate": 5.7961919595327204e-05, "loss": 1.0916, "step": 161 }, { "epoch": 2.74, "learning_rate": 5.647567444441129e-05, "loss": 1.0218, "step": 162 }, { "epoch": 2.76, "learning_rate": 5.500315173280059e-05, "loss": 1.0776, "step": 163 }, { "epoch": 2.78, "learning_rate": 5.354464634269017e-05, "loss": 1.0637, "step": 164 }, { "epoch": 2.79, "learning_rate": 5.210045034921583e-05, "loss": 1.0057, "step": 165 }, { "epoch": 2.81, "learning_rate": 5.0670852961964176e-05, "loss": 1.09, "step": 166 }, { "epoch": 2.83, "learning_rate": 4.925614046705648e-05, "loss": 1.0644, "step": 167 }, { "epoch": 2.84, "learning_rate": 4.785659616981813e-05, "loss": 0.9983, "step": 168 }, { "epoch": 2.86, "learning_rate": 4.64725003380447e-05, "loss": 1.0903, "step": 169 }, { "epoch": 2.88, "learning_rate": 4.510413014587692e-05, "loss": 1.051, "step": 170 }, { "epoch": 2.9, "learning_rate": 4.375175961829415e-05, "loss": 1.0629, "step": 171 }, { "epoch": 2.91, "learning_rate": 4.2415659576239844e-05, "loss": 1.0092, "step": 172 }, { "epoch": 2.93, "learning_rate": 4.109609758238714e-05, "loss": 1.1383, "step": 173 }, { "epoch": 2.95, "learning_rate": 3.979333788755842e-05, "loss": 1.1097, "step": 174 }, { "epoch": 2.97, "learning_rate": 3.850764137780674e-05, "loss": 1.1173, "step": 175 }, { "epoch": 2.98, "learning_rate": 3.723926552217219e-05, "loss": 1.085, "step": 176 }, { "epoch": 3.0, "learning_rate": 3.598846432112182e-05, "loss": 1.0868, "step": 177 }, { "epoch": 3.02, "learning_rate": 3.475548825568445e-05, "loss": 1.0675, "step": 178 } ], "logging_steps": 1, "max_steps": 232, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "total_flos": 1.5793187263900287e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }