{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 242, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01652892561983471, "grad_norm": 12.011792182922363, "learning_rate": 2.857142857142857e-05, "loss": 4.7518, "step": 2 }, { "epoch": 0.03305785123966942, "grad_norm": 12.689871788024902, "learning_rate": 5.714285714285714e-05, "loss": 4.687, "step": 4 }, { "epoch": 0.049586776859504134, "grad_norm": 11.848198890686035, "learning_rate": 8.571428571428571e-05, "loss": 4.4922, "step": 6 }, { "epoch": 0.06611570247933884, "grad_norm": 10.378408432006836, "learning_rate": 0.00011428571428571428, "loss": 3.4839, "step": 8 }, { "epoch": 0.08264462809917356, "grad_norm": 9.543920516967773, "learning_rate": 0.00014285714285714287, "loss": 2.4164, "step": 10 }, { "epoch": 0.09917355371900827, "grad_norm": 5.602456092834473, "learning_rate": 0.00017142857142857143, "loss": 1.838, "step": 12 }, { "epoch": 0.11570247933884298, "grad_norm": 1.7414193153381348, "learning_rate": 0.0002, "loss": 1.4404, "step": 14 }, { "epoch": 0.1322314049586777, "grad_norm": 1.0191770792007446, "learning_rate": 0.0001999910643210378, "loss": 1.3474, "step": 16 }, { "epoch": 0.1487603305785124, "grad_norm": 3.509352922439575, "learning_rate": 0.0001999642588810784, "loss": 1.3789, "step": 18 }, { "epoch": 0.1652892561983471, "grad_norm": 2.426680088043213, "learning_rate": 0.00019991958847061784, "loss": 1.2245, "step": 20 }, { "epoch": 0.18181818181818182, "grad_norm": 0.8060101270675659, "learning_rate": 0.00019985706107286514, "loss": 1.2259, "step": 22 }, { "epoch": 0.19834710743801653, "grad_norm": 1.6215940713882446, "learning_rate": 0.00019977668786231534, "loss": 1.1992, "step": 24 }, { "epoch": 0.21487603305785125, "grad_norm": 2.476259231567383, "learning_rate": 0.0001996784832027525, "loss": 1.2782, "step": 26 }, { "epoch": 0.23140495867768596, "grad_norm": 2.04018235206604, "learning_rate": 0.00019956246464468294, "loss": 1.23, "step": 28 }, { "epoch": 0.24793388429752067, "grad_norm": 1.8799017667770386, "learning_rate": 0.00019942865292219838, "loss": 1.2164, "step": 30 }, { "epoch": 0.2644628099173554, "grad_norm": 1.2413148880004883, "learning_rate": 0.00019927707194927066, "loss": 1.2924, "step": 32 }, { "epoch": 0.2809917355371901, "grad_norm": 1.8723937273025513, "learning_rate": 0.000199107748815478, "loss": 1.2302, "step": 34 }, { "epoch": 0.2975206611570248, "grad_norm": 1.393110752105713, "learning_rate": 0.00019892071378116376, "loss": 1.2276, "step": 36 }, { "epoch": 0.3140495867768595, "grad_norm": 1.1459721326828003, "learning_rate": 0.0001987160002720283, "loss": 1.1504, "step": 38 }, { "epoch": 0.3305785123966942, "grad_norm": 1.4680942296981812, "learning_rate": 0.00019849364487315558, "loss": 1.1623, "step": 40 }, { "epoch": 0.34710743801652894, "grad_norm": 1.8715866804122925, "learning_rate": 0.0001982536873224748, "loss": 1.2155, "step": 42 }, { "epoch": 0.36363636363636365, "grad_norm": 0.871064305305481, "learning_rate": 0.0001979961705036587, "loss": 1.1594, "step": 44 }, { "epoch": 0.38016528925619836, "grad_norm": 0.8239800930023193, "learning_rate": 0.00019772114043845965, "loss": 1.1501, "step": 46 }, { "epoch": 0.39669421487603307, "grad_norm": 0.9587319493293762, "learning_rate": 0.0001974286462784851, "loss": 1.1195, "step": 48 }, { "epoch": 0.4132231404958678, "grad_norm": 1.1645926237106323, "learning_rate": 0.0001971187402964132, "loss": 1.1417, "step": 50 }, { "epoch": 0.4297520661157025, "grad_norm": 0.576813817024231, "learning_rate": 0.00019679147787665126, "loss": 1.1445, "step": 52 }, { "epoch": 0.4462809917355372, "grad_norm": 1.0733133554458618, "learning_rate": 0.00019644691750543767, "loss": 1.0979, "step": 54 }, { "epoch": 0.4628099173553719, "grad_norm": 0.5801639556884766, "learning_rate": 0.00019608512076038962, "loss": 1.0977, "step": 56 }, { "epoch": 0.4793388429752066, "grad_norm": 1.6796538829803467, "learning_rate": 0.00019570615229949842, "loss": 1.1925, "step": 58 }, { "epoch": 0.49586776859504134, "grad_norm": 1.0563887357711792, "learning_rate": 0.00019531007984957408, "loss": 1.0657, "step": 60 }, { "epoch": 0.512396694214876, "grad_norm": 0.9109811186790466, "learning_rate": 0.00019489697419414182, "loss": 1.1098, "step": 62 }, { "epoch": 0.5289256198347108, "grad_norm": 0.7321667671203613, "learning_rate": 0.0001944669091607919, "loss": 1.0929, "step": 64 }, { "epoch": 0.5454545454545454, "grad_norm": 0.685366690158844, "learning_rate": 0.00019401996160798573, "loss": 1.1242, "step": 66 }, { "epoch": 0.5619834710743802, "grad_norm": 0.8959838151931763, "learning_rate": 0.0001935562114113202, "loss": 1.181, "step": 68 }, { "epoch": 0.5785123966942148, "grad_norm": 0.9717262983322144, "learning_rate": 0.00019307574144925287, "loss": 1.2295, "step": 70 }, { "epoch": 0.5950413223140496, "grad_norm": 1.0358582735061646, "learning_rate": 0.00019257863758829035, "loss": 1.1431, "step": 72 }, { "epoch": 0.6115702479338843, "grad_norm": 0.7998526096343994, "learning_rate": 0.00019206498866764288, "loss": 1.1032, "step": 74 }, { "epoch": 0.628099173553719, "grad_norm": 1.1496188640594482, "learning_rate": 0.0001915348864833476, "loss": 1.057, "step": 76 }, { "epoch": 0.6446280991735537, "grad_norm": 0.652406632900238, "learning_rate": 0.00019098842577186314, "loss": 1.146, "step": 78 }, { "epoch": 0.6611570247933884, "grad_norm": 0.9454944729804993, "learning_rate": 0.00019042570419313925, "loss": 1.1543, "step": 80 }, { "epoch": 0.6776859504132231, "grad_norm": 0.7456652522087097, "learning_rate": 0.00018984682231316333, "loss": 1.1189, "step": 82 }, { "epoch": 0.6942148760330579, "grad_norm": 0.7312512397766113, "learning_rate": 0.00018925188358598813, "loss": 1.0873, "step": 84 }, { "epoch": 0.7107438016528925, "grad_norm": 0.8474765419960022, "learning_rate": 0.000188640994335243, "loss": 1.1698, "step": 86 }, { "epoch": 0.7272727272727273, "grad_norm": 0.6979633569717407, "learning_rate": 0.0001880142637351325, "loss": 1.1417, "step": 88 }, { "epoch": 0.743801652892562, "grad_norm": 0.5989161133766174, "learning_rate": 0.00018737180379092537, "loss": 1.0479, "step": 90 }, { "epoch": 0.7603305785123967, "grad_norm": 0.5765272378921509, "learning_rate": 0.00018671372931893773, "loss": 1.1336, "step": 92 }, { "epoch": 0.7768595041322314, "grad_norm": 0.6709849834442139, "learning_rate": 0.00018604015792601396, "loss": 1.1157, "step": 94 }, { "epoch": 0.7933884297520661, "grad_norm": 0.8181343674659729, "learning_rate": 0.00018535120998850848, "loss": 1.0927, "step": 96 }, { "epoch": 0.8099173553719008, "grad_norm": 0.6146332621574402, "learning_rate": 0.00018464700863077312, "loss": 1.0739, "step": 98 }, { "epoch": 0.8264462809917356, "grad_norm": 0.9904415011405945, "learning_rate": 0.00018392767970315313, "loss": 1.0331, "step": 100 }, { "epoch": 0.8429752066115702, "grad_norm": 0.6186695694923401, "learning_rate": 0.0001831933517594957, "loss": 1.0513, "step": 102 }, { "epoch": 0.859504132231405, "grad_norm": 1.1912785768508911, "learning_rate": 0.00018244415603417603, "loss": 1.1567, "step": 104 }, { "epoch": 0.8760330578512396, "grad_norm": 1.3681318759918213, "learning_rate": 0.00018168022641864377, "loss": 1.1497, "step": 106 }, { "epoch": 0.8925619834710744, "grad_norm": 0.619476318359375, "learning_rate": 0.00018090169943749476, "loss": 1.1546, "step": 108 }, { "epoch": 0.9090909090909091, "grad_norm": 0.7421219348907471, "learning_rate": 0.00018010871422407236, "loss": 1.1458, "step": 110 }, { "epoch": 0.9256198347107438, "grad_norm": 0.6569286584854126, "learning_rate": 0.00017930141249560233, "loss": 1.12, "step": 112 }, { "epoch": 0.9421487603305785, "grad_norm": 0.4168110191822052, "learning_rate": 0.0001784799385278661, "loss": 1.1682, "step": 114 }, { "epoch": 0.9586776859504132, "grad_norm": 0.5620162487030029, "learning_rate": 0.00017764443912941672, "loss": 1.1828, "step": 116 }, { "epoch": 0.9752066115702479, "grad_norm": 0.8095484375953674, "learning_rate": 0.00017679506361534215, "loss": 1.1953, "step": 118 }, { "epoch": 0.9917355371900827, "grad_norm": 0.7646257281303406, "learning_rate": 0.0001759319637805806, "loss": 1.2148, "step": 120 }, { "epoch": 1.0082644628099173, "grad_norm": 0.5254501104354858, "learning_rate": 0.00017505529387279277, "loss": 1.1359, "step": 122 }, { "epoch": 1.024793388429752, "grad_norm": 0.6001765727996826, "learning_rate": 0.00017416521056479577, "loss": 1.1336, "step": 124 }, { "epoch": 1.0413223140495869, "grad_norm": 0.35407504439353943, "learning_rate": 0.00017326187292656333, "loss": 1.1833, "step": 126 }, { "epoch": 1.0578512396694215, "grad_norm": 0.414528489112854, "learning_rate": 0.00017234544239679806, "loss": 1.1301, "step": 128 }, { "epoch": 1.0743801652892562, "grad_norm": 0.46355852484703064, "learning_rate": 0.00017141608275408006, "loss": 1.213, "step": 130 }, { "epoch": 1.0909090909090908, "grad_norm": 0.5040593147277832, "learning_rate": 0.00017047396008759754, "loss": 1.132, "step": 132 }, { "epoch": 1.1074380165289257, "grad_norm": 0.4813704192638397, "learning_rate": 0.00016951924276746425, "loss": 1.0831, "step": 134 }, { "epoch": 1.1239669421487604, "grad_norm": 0.5174686312675476, "learning_rate": 0.00016855210141462963, "loss": 1.0514, "step": 136 }, { "epoch": 1.140495867768595, "grad_norm": 0.4712466299533844, "learning_rate": 0.00016757270887038654, "loss": 1.1334, "step": 138 }, { "epoch": 1.1570247933884297, "grad_norm": 0.5912173390388489, "learning_rate": 0.00016658124016548197, "loss": 1.1011, "step": 140 }, { "epoch": 1.1735537190082646, "grad_norm": 0.6392802000045776, "learning_rate": 0.00016557787248883696, "loss": 1.1361, "step": 142 }, { "epoch": 1.1900826446280992, "grad_norm": 0.7376368045806885, "learning_rate": 0.00016456278515588024, "loss": 1.109, "step": 144 }, { "epoch": 1.2066115702479339, "grad_norm": 0.5020875930786133, "learning_rate": 0.00016353615957650236, "loss": 1.0925, "step": 146 }, { "epoch": 1.2231404958677685, "grad_norm": 0.8081740736961365, "learning_rate": 0.00016249817922263517, "loss": 1.047, "step": 148 }, { "epoch": 1.2396694214876034, "grad_norm": 0.6371219754219055, "learning_rate": 0.00016144902959546286, "loss": 1.113, "step": 150 }, { "epoch": 1.256198347107438, "grad_norm": 0.7588189840316772, "learning_rate": 0.00016038889819227045, "loss": 1.1179, "step": 152 }, { "epoch": 1.2727272727272727, "grad_norm": 0.6286205053329468, "learning_rate": 0.00015931797447293552, "loss": 1.1209, "step": 154 }, { "epoch": 1.2892561983471074, "grad_norm": 0.797656238079071, "learning_rate": 0.00015823644982606905, "loss": 1.1698, "step": 156 }, { "epoch": 1.3057851239669422, "grad_norm": 0.5368632078170776, "learning_rate": 0.00015714451753481168, "loss": 1.1973, "step": 158 }, { "epoch": 1.322314049586777, "grad_norm": 0.4135212302207947, "learning_rate": 0.00015604237274229147, "loss": 1.1452, "step": 160 }, { "epoch": 1.3388429752066116, "grad_norm": 0.5289668440818787, "learning_rate": 0.00015493021241674918, "loss": 1.1954, "step": 162 }, { "epoch": 1.3553719008264462, "grad_norm": 0.4092061221599579, "learning_rate": 0.00015380823531633729, "loss": 1.1226, "step": 164 }, { "epoch": 1.3719008264462809, "grad_norm": 0.7049645781517029, "learning_rate": 0.00015267664195359917, "loss": 1.0948, "step": 166 }, { "epoch": 1.3884297520661157, "grad_norm": 0.47164198756217957, "learning_rate": 0.00015153563455963499, "loss": 1.0977, "step": 168 }, { "epoch": 1.4049586776859504, "grad_norm": 0.7871695160865784, "learning_rate": 0.00015038541704796003, "loss": 1.1674, "step": 170 }, { "epoch": 1.421487603305785, "grad_norm": 0.5381121635437012, "learning_rate": 0.00014922619497806277, "loss": 1.1415, "step": 172 }, { "epoch": 1.43801652892562, "grad_norm": 0.39419299364089966, "learning_rate": 0.00014805817551866838, "loss": 1.0747, "step": 174 }, { "epoch": 1.4545454545454546, "grad_norm": 0.38382914662361145, "learning_rate": 0.00014688156741071514, "loss": 1.1278, "step": 176 }, { "epoch": 1.4710743801652892, "grad_norm": 0.32674962282180786, "learning_rate": 0.00014569658093004935, "loss": 0.9774, "step": 178 }, { "epoch": 1.487603305785124, "grad_norm": 0.5443088412284851, "learning_rate": 0.00014450342784984633, "loss": 1.034, "step": 180 }, { "epoch": 1.5041322314049586, "grad_norm": 0.6682401895523071, "learning_rate": 0.00014330232140276366, "loss": 1.1732, "step": 182 }, { "epoch": 1.5206611570247934, "grad_norm": 0.5696044564247131, "learning_rate": 0.0001420934762428335, "loss": 1.0384, "step": 184 }, { "epoch": 1.537190082644628, "grad_norm": 0.6782551407814026, "learning_rate": 0.0001408771084071012, "loss": 1.1107, "step": 186 }, { "epoch": 1.553719008264463, "grad_norm": 0.8336123824119568, "learning_rate": 0.00013965343527701628, "loss": 1.0737, "step": 188 }, { "epoch": 1.5702479338842976, "grad_norm": 0.539226233959198, "learning_rate": 0.00013842267553958371, "loss": 1.1665, "step": 190 }, { "epoch": 1.5867768595041323, "grad_norm": 0.566620409488678, "learning_rate": 0.00013718504914828135, "loss": 1.1333, "step": 192 }, { "epoch": 1.603305785123967, "grad_norm": 0.4735005795955658, "learning_rate": 0.00013594077728375128, "loss": 1.1709, "step": 194 }, { "epoch": 1.6198347107438016, "grad_norm": 0.534383237361908, "learning_rate": 0.00013469008231427207, "loss": 1.0783, "step": 196 }, { "epoch": 1.6363636363636362, "grad_norm": 0.8410363793373108, "learning_rate": 0.0001334331877560182, "loss": 1.0708, "step": 198 }, { "epoch": 1.6528925619834711, "grad_norm": 0.6392219662666321, "learning_rate": 0.00013217031823311488, "loss": 1.0329, "step": 200 }, { "epoch": 1.6694214876033058, "grad_norm": 0.5770404934883118, "learning_rate": 0.00013090169943749476, "loss": 1.0404, "step": 202 }, { "epoch": 1.6859504132231407, "grad_norm": 0.6814575791358948, "learning_rate": 0.00012962755808856342, "loss": 1.0702, "step": 204 }, { "epoch": 1.7024793388429753, "grad_norm": 0.673312783241272, "learning_rate": 0.0001283481218926818, "loss": 1.0529, "step": 206 }, { "epoch": 1.71900826446281, "grad_norm": 0.6180073618888855, "learning_rate": 0.0001270636195024719, "loss": 1.0257, "step": 208 }, { "epoch": 1.7355371900826446, "grad_norm": 0.5565724968910217, "learning_rate": 0.00012577428047595344, "loss": 1.1102, "step": 210 }, { "epoch": 1.7520661157024793, "grad_norm": 0.5586270689964294, "learning_rate": 0.00012448033523551865, "loss": 1.0277, "step": 212 }, { "epoch": 1.768595041322314, "grad_norm": 0.542448878288269, "learning_rate": 0.00012318201502675285, "loss": 1.0988, "step": 214 }, { "epoch": 1.7851239669421488, "grad_norm": 0.513042151927948, "learning_rate": 0.0001218795518771075, "loss": 1.0828, "step": 216 }, { "epoch": 1.8016528925619835, "grad_norm": 0.7613060474395752, "learning_rate": 0.00012057317855443395, "loss": 1.1962, "step": 218 }, { "epoch": 1.8181818181818183, "grad_norm": 0.7522129416465759, "learning_rate": 0.00011926312852538455, "loss": 1.1339, "step": 220 }, { "epoch": 1.834710743801653, "grad_norm": 0.4655594825744629, "learning_rate": 0.00011794963591368893, "loss": 1.0967, "step": 222 }, { "epoch": 1.8512396694214877, "grad_norm": 0.5036570429801941, "learning_rate": 0.00011663293545831302, "loss": 1.0361, "step": 224 }, { "epoch": 1.8677685950413223, "grad_norm": 0.43016380071640015, "learning_rate": 0.00011531326247150803, "loss": 1.1281, "step": 226 }, { "epoch": 1.884297520661157, "grad_norm": 0.5184316635131836, "learning_rate": 0.00011399085279675687, "loss": 1.2083, "step": 228 }, { "epoch": 1.9008264462809916, "grad_norm": 0.6556355357170105, "learning_rate": 0.0001126659427666257, "loss": 1.0266, "step": 230 }, { "epoch": 1.9173553719008265, "grad_norm": 0.515681803226471, "learning_rate": 0.00011133876916052821, "loss": 1.0472, "step": 232 }, { "epoch": 1.9338842975206612, "grad_norm": 0.4592064321041107, "learning_rate": 0.00011000956916240985, "loss": 1.054, "step": 234 }, { "epoch": 1.950413223140496, "grad_norm": 0.5623230338096619, "learning_rate": 0.00010867858031835975, "loss": 1.1571, "step": 236 }, { "epoch": 1.9669421487603307, "grad_norm": 0.5241667032241821, "learning_rate": 0.00010734604049415822, "loss": 1.0985, "step": 238 }, { "epoch": 1.9834710743801653, "grad_norm": 0.54905104637146, "learning_rate": 0.00010601218783276672, "loss": 1.1088, "step": 240 }, { "epoch": 2.0, "grad_norm": 0.8823345303535461, "learning_rate": 0.00010467726071176853, "loss": 1.0991, "step": 242 } ], "logging_steps": 2, "max_steps": 484, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4578676410679296.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }