{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 354, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014144271570014143, "grad_norm": 1.1091066598892212, "learning_rate": 1.348314606741573e-06, "loss": 1.3703, "step": 5 }, { "epoch": 0.028288543140028287, "grad_norm": 1.1845866441726685, "learning_rate": 3.033707865168539e-06, "loss": 1.3488, "step": 10 }, { "epoch": 0.042432814710042434, "grad_norm": 0.6645119190216064, "learning_rate": 4.719101123595506e-06, "loss": 1.3309, "step": 15 }, { "epoch": 0.056577086280056574, "grad_norm": 0.6867905855178833, "learning_rate": 6.404494382022472e-06, "loss": 1.3852, "step": 20 }, { "epoch": 0.07072135785007072, "grad_norm": 0.8682529330253601, "learning_rate": 8.089887640449438e-06, "loss": 1.2832, "step": 25 }, { "epoch": 0.08486562942008487, "grad_norm": 0.5214539766311646, "learning_rate": 9.775280898876405e-06, "loss": 1.351, "step": 30 }, { "epoch": 0.09900990099009901, "grad_norm": 0.5842329859733582, "learning_rate": 1.146067415730337e-05, "loss": 1.199, "step": 35 }, { "epoch": 0.11315417256011315, "grad_norm": 0.5440396070480347, "learning_rate": 1.3146067415730338e-05, "loss": 1.2351, "step": 40 }, { "epoch": 0.1272984441301273, "grad_norm": 0.5273528695106506, "learning_rate": 1.4831460674157303e-05, "loss": 1.2773, "step": 45 }, { "epoch": 0.14144271570014144, "grad_norm": 0.5239897966384888, "learning_rate": 1.651685393258427e-05, "loss": 1.2557, "step": 50 }, { "epoch": 0.15558698727015557, "grad_norm": 0.5751009583473206, "learning_rate": 1.8202247191011237e-05, "loss": 1.2738, "step": 55 }, { "epoch": 0.16973125884016974, "grad_norm": 0.5215653777122498, "learning_rate": 1.98876404494382e-05, "loss": 1.2258, "step": 60 }, { "epoch": 0.18387553041018387, "grad_norm": 0.5515840649604797, "learning_rate": 2.1573033707865168e-05, "loss": 1.1885, "step": 65 }, { "epoch": 0.19801980198019803, "grad_norm": 0.5996809601783752, "learning_rate": 2.3258426966292135e-05, "loss": 1.1447, "step": 70 }, { "epoch": 0.21216407355021216, "grad_norm": 0.6658637523651123, "learning_rate": 2.4943820224719103e-05, "loss": 1.1789, "step": 75 }, { "epoch": 0.2263083451202263, "grad_norm": 0.5400289297103882, "learning_rate": 2.6629213483146066e-05, "loss": 1.1788, "step": 80 }, { "epoch": 0.24045261669024046, "grad_norm": 0.6226291060447693, "learning_rate": 2.8314606741573034e-05, "loss": 1.1713, "step": 85 }, { "epoch": 0.2545968882602546, "grad_norm": 0.6412995457649231, "learning_rate": 3e-05, "loss": 1.1693, "step": 90 }, { "epoch": 0.26874115983026875, "grad_norm": 0.6276025176048279, "learning_rate": 2.9999345118939752e-05, "loss": 1.1233, "step": 95 }, { "epoch": 0.2828854314002829, "grad_norm": 0.5257118344306946, "learning_rate": 2.9997380532941557e-05, "loss": 1.0805, "step": 100 }, { "epoch": 0.297029702970297, "grad_norm": 0.6102027893066406, "learning_rate": 2.999410641354812e-05, "loss": 1.1336, "step": 105 }, { "epoch": 0.31117397454031115, "grad_norm": 0.6746109127998352, "learning_rate": 2.9989523046647264e-05, "loss": 1.1515, "step": 110 }, { "epoch": 0.32531824611032534, "grad_norm": 0.6818143129348755, "learning_rate": 2.9983630832447017e-05, "loss": 1.0251, "step": 115 }, { "epoch": 0.33946251768033947, "grad_norm": 0.6401700973510742, "learning_rate": 2.9976430285440642e-05, "loss": 1.0817, "step": 120 }, { "epoch": 0.3536067892503536, "grad_norm": 0.6563226580619812, "learning_rate": 2.9967922034361726e-05, "loss": 1.0144, "step": 125 }, { "epoch": 0.36775106082036774, "grad_norm": 0.7533389925956726, "learning_rate": 2.995810682212926e-05, "loss": 1.0501, "step": 130 }, { "epoch": 0.38189533239038187, "grad_norm": 0.6909534335136414, "learning_rate": 2.9946985505782792e-05, "loss": 1.0284, "step": 135 }, { "epoch": 0.39603960396039606, "grad_norm": 0.639701783657074, "learning_rate": 2.993455905640758e-05, "loss": 1.0045, "step": 140 }, { "epoch": 0.4101838755304102, "grad_norm": 1.8686423301696777, "learning_rate": 2.9920828559049805e-05, "loss": 0.9764, "step": 145 }, { "epoch": 0.4243281471004243, "grad_norm": 0.7100444436073303, "learning_rate": 2.9905795212621825e-05, "loss": 1.0063, "step": 150 }, { "epoch": 0.43847241867043846, "grad_norm": 0.8259682059288025, "learning_rate": 2.9889460329797482e-05, "loss": 0.9937, "step": 155 }, { "epoch": 0.4526166902404526, "grad_norm": 0.6790755391120911, "learning_rate": 2.9871825336897495e-05, "loss": 0.9834, "step": 160 }, { "epoch": 0.4667609618104668, "grad_norm": 0.7154070734977722, "learning_rate": 2.985289177376491e-05, "loss": 0.9921, "step": 165 }, { "epoch": 0.4809052333804809, "grad_norm": 0.8498982787132263, "learning_rate": 2.983266129363065e-05, "loss": 0.9274, "step": 170 }, { "epoch": 0.49504950495049505, "grad_norm": 0.7261009216308594, "learning_rate": 2.9811135662969146e-05, "loss": 0.9445, "step": 175 }, { "epoch": 0.5091937765205092, "grad_norm": 1.0621588230133057, "learning_rate": 2.978831676134411e-05, "loss": 0.9137, "step": 180 }, { "epoch": 0.5233380480905233, "grad_norm": 0.9448386430740356, "learning_rate": 2.9764206581244415e-05, "loss": 0.9553, "step": 185 }, { "epoch": 0.5374823196605375, "grad_norm": 0.8367416858673096, "learning_rate": 2.973880722791009e-05, "loss": 0.8912, "step": 190 }, { "epoch": 0.5516265912305516, "grad_norm": 0.8499005436897278, "learning_rate": 2.971212091914854e-05, "loss": 0.9632, "step": 195 }, { "epoch": 0.5657708628005658, "grad_norm": 0.7935090065002441, "learning_rate": 2.968414998514085e-05, "loss": 0.8724, "step": 200 }, { "epoch": 0.57991513437058, "grad_norm": 0.8054782152175903, "learning_rate": 2.9654896868238334e-05, "loss": 0.8849, "step": 205 }, { "epoch": 0.594059405940594, "grad_norm": 0.9245705008506775, "learning_rate": 2.96243641227493e-05, "loss": 0.8934, "step": 210 }, { "epoch": 0.6082036775106082, "grad_norm": 0.851338803768158, "learning_rate": 2.959255441471597e-05, "loss": 0.8839, "step": 215 }, { "epoch": 0.6223479490806223, "grad_norm": 0.9376785159111023, "learning_rate": 2.9559470521681726e-05, "loss": 0.8592, "step": 220 }, { "epoch": 0.6364922206506365, "grad_norm": 0.8601529002189636, "learning_rate": 2.9525115332448557e-05, "loss": 0.8879, "step": 225 }, { "epoch": 0.6506364922206507, "grad_norm": 0.9360470175743103, "learning_rate": 2.948949184682484e-05, "loss": 0.8337, "step": 230 }, { "epoch": 0.6647807637906648, "grad_norm": 0.9480540156364441, "learning_rate": 2.9452603175363364e-05, "loss": 0.9082, "step": 235 }, { "epoch": 0.6789250353606789, "grad_norm": 0.9365689158439636, "learning_rate": 2.941445253908978e-05, "loss": 0.8773, "step": 240 }, { "epoch": 0.693069306930693, "grad_norm": 0.9173905253410339, "learning_rate": 2.9375043269221294e-05, "loss": 0.81, "step": 245 }, { "epoch": 0.7072135785007072, "grad_norm": 0.9756169319152832, "learning_rate": 2.9334378806875838e-05, "loss": 0.8046, "step": 250 }, { "epoch": 0.7213578500707214, "grad_norm": 0.9777851104736328, "learning_rate": 2.929246270277157e-05, "loss": 0.8452, "step": 255 }, { "epoch": 0.7355021216407355, "grad_norm": 0.9113903045654297, "learning_rate": 2.9249298616916856e-05, "loss": 0.8244, "step": 260 }, { "epoch": 0.7496463932107497, "grad_norm": 0.906170666217804, "learning_rate": 2.9204890318290666e-05, "loss": 0.7866, "step": 265 }, { "epoch": 0.7637906647807637, "grad_norm": 0.9952336549758911, "learning_rate": 2.915924168451349e-05, "loss": 0.817, "step": 270 }, { "epoch": 0.7779349363507779, "grad_norm": 1.1563059091567993, "learning_rate": 2.9112356701508756e-05, "loss": 0.8101, "step": 275 }, { "epoch": 0.7920792079207921, "grad_norm": 1.035338282585144, "learning_rate": 2.9064239463154782e-05, "loss": 0.7379, "step": 280 }, { "epoch": 0.8062234794908062, "grad_norm": 1.0756317377090454, "learning_rate": 2.9014894170927308e-05, "loss": 0.7709, "step": 285 }, { "epoch": 0.8203677510608204, "grad_norm": 1.1403799057006836, "learning_rate": 2.8964325133532638e-05, "loss": 0.8136, "step": 290 }, { "epoch": 0.8345120226308345, "grad_norm": 1.111162543296814, "learning_rate": 2.8912536766531424e-05, "loss": 0.7563, "step": 295 }, { "epoch": 0.8486562942008486, "grad_norm": 0.9991361498832703, "learning_rate": 2.885953359195308e-05, "loss": 0.74, "step": 300 }, { "epoch": 0.8628005657708628, "grad_norm": 0.9173119068145752, "learning_rate": 2.8805320237900965e-05, "loss": 0.7599, "step": 305 }, { "epoch": 0.8769448373408769, "grad_norm": 1.1988530158996582, "learning_rate": 2.874990143814826e-05, "loss": 0.7256, "step": 310 }, { "epoch": 0.8910891089108911, "grad_norm": 1.0119719505310059, "learning_rate": 2.8693282031724592e-05, "loss": 0.7472, "step": 315 }, { "epoch": 0.9052333804809052, "grad_norm": 1.0434908866882324, "learning_rate": 2.8635466962493564e-05, "loss": 0.7512, "step": 320 }, { "epoch": 0.9193776520509194, "grad_norm": 1.0775619745254517, "learning_rate": 2.8576461278721016e-05, "loss": 0.6982, "step": 325 }, { "epoch": 0.9335219236209336, "grad_norm": 0.9957691431045532, "learning_rate": 2.8516270132634244e-05, "loss": 0.6556, "step": 330 }, { "epoch": 0.9476661951909476, "grad_norm": 1.0233315229415894, "learning_rate": 2.845489877997213e-05, "loss": 0.7431, "step": 335 }, { "epoch": 0.9618104667609618, "grad_norm": 0.9704013466835022, "learning_rate": 2.8392352579526205e-05, "loss": 0.7121, "step": 340 }, { "epoch": 0.9759547383309759, "grad_norm": 1.0681896209716797, "learning_rate": 2.832863699267274e-05, "loss": 0.716, "step": 345 }, { "epoch": 0.9900990099009901, "grad_norm": 1.088863492012024, "learning_rate": 2.8263757582895888e-05, "loss": 0.699, "step": 350 } ], "logging_steps": 5, "max_steps": 1770, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.9088477124991386e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }