{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 354,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.014144271570014143,
      "grad_norm": 1.1091066598892212,
      "learning_rate": 1.348314606741573e-06,
      "loss": 1.3703,
      "step": 5
    },
    {
      "epoch": 0.028288543140028287,
      "grad_norm": 1.1845866441726685,
      "learning_rate": 3.033707865168539e-06,
      "loss": 1.3488,
      "step": 10
    },
    {
      "epoch": 0.042432814710042434,
      "grad_norm": 0.6645119190216064,
      "learning_rate": 4.719101123595506e-06,
      "loss": 1.3309,
      "step": 15
    },
    {
      "epoch": 0.056577086280056574,
      "grad_norm": 0.6867905855178833,
      "learning_rate": 6.404494382022472e-06,
      "loss": 1.3852,
      "step": 20
    },
    {
      "epoch": 0.07072135785007072,
      "grad_norm": 0.8682529330253601,
      "learning_rate": 8.089887640449438e-06,
      "loss": 1.2832,
      "step": 25
    },
    {
      "epoch": 0.08486562942008487,
      "grad_norm": 0.5214539766311646,
      "learning_rate": 9.775280898876405e-06,
      "loss": 1.351,
      "step": 30
    },
    {
      "epoch": 0.09900990099009901,
      "grad_norm": 0.5842329859733582,
      "learning_rate": 1.146067415730337e-05,
      "loss": 1.199,
      "step": 35
    },
    {
      "epoch": 0.11315417256011315,
      "grad_norm": 0.5440396070480347,
      "learning_rate": 1.3146067415730338e-05,
      "loss": 1.2351,
      "step": 40
    },
    {
      "epoch": 0.1272984441301273,
      "grad_norm": 0.5273528695106506,
      "learning_rate": 1.4831460674157303e-05,
      "loss": 1.2773,
      "step": 45
    },
    {
      "epoch": 0.14144271570014144,
      "grad_norm": 0.5239897966384888,
      "learning_rate": 1.651685393258427e-05,
      "loss": 1.2557,
      "step": 50
    },
    {
      "epoch": 0.15558698727015557,
      "grad_norm": 0.5751009583473206,
      "learning_rate": 1.8202247191011237e-05,
      "loss": 1.2738,
      "step": 55
    },
    {
      "epoch": 0.16973125884016974,
      "grad_norm": 0.5215653777122498,
      "learning_rate": 1.98876404494382e-05,
      "loss": 1.2258,
      "step": 60
    },
    {
      "epoch": 0.18387553041018387,
      "grad_norm": 0.5515840649604797,
      "learning_rate": 2.1573033707865168e-05,
      "loss": 1.1885,
      "step": 65
    },
    {
      "epoch": 0.19801980198019803,
      "grad_norm": 0.5996809601783752,
      "learning_rate": 2.3258426966292135e-05,
      "loss": 1.1447,
      "step": 70
    },
    {
      "epoch": 0.21216407355021216,
      "grad_norm": 0.6658637523651123,
      "learning_rate": 2.4943820224719103e-05,
      "loss": 1.1789,
      "step": 75
    },
    {
      "epoch": 0.2263083451202263,
      "grad_norm": 0.5400289297103882,
      "learning_rate": 2.6629213483146066e-05,
      "loss": 1.1788,
      "step": 80
    },
    {
      "epoch": 0.24045261669024046,
      "grad_norm": 0.6226291060447693,
      "learning_rate": 2.8314606741573034e-05,
      "loss": 1.1713,
      "step": 85
    },
    {
      "epoch": 0.2545968882602546,
      "grad_norm": 0.6412995457649231,
      "learning_rate": 3e-05,
      "loss": 1.1693,
      "step": 90
    },
    {
      "epoch": 0.26874115983026875,
      "grad_norm": 0.6276025176048279,
      "learning_rate": 2.9999345118939752e-05,
      "loss": 1.1233,
      "step": 95
    },
    {
      "epoch": 0.2828854314002829,
      "grad_norm": 0.5257118344306946,
      "learning_rate": 2.9997380532941557e-05,
      "loss": 1.0805,
      "step": 100
    },
    {
      "epoch": 0.297029702970297,
      "grad_norm": 0.6102027893066406,
      "learning_rate": 2.999410641354812e-05,
      "loss": 1.1336,
      "step": 105
    },
    {
      "epoch": 0.31117397454031115,
      "grad_norm": 0.6746109127998352,
      "learning_rate": 2.9989523046647264e-05,
      "loss": 1.1515,
      "step": 110
    },
    {
      "epoch": 0.32531824611032534,
      "grad_norm": 0.6818143129348755,
      "learning_rate": 2.9983630832447017e-05,
      "loss": 1.0251,
      "step": 115
    },
    {
      "epoch": 0.33946251768033947,
      "grad_norm": 0.6401700973510742,
      "learning_rate": 2.9976430285440642e-05,
      "loss": 1.0817,
      "step": 120
    },
    {
      "epoch": 0.3536067892503536,
      "grad_norm": 0.6563226580619812,
      "learning_rate": 2.9967922034361726e-05,
      "loss": 1.0144,
      "step": 125
    },
    {
      "epoch": 0.36775106082036774,
      "grad_norm": 0.7533389925956726,
      "learning_rate": 2.995810682212926e-05,
      "loss": 1.0501,
      "step": 130
    },
    {
      "epoch": 0.38189533239038187,
      "grad_norm": 0.6909534335136414,
      "learning_rate": 2.9946985505782792e-05,
      "loss": 1.0284,
      "step": 135
    },
    {
      "epoch": 0.39603960396039606,
      "grad_norm": 0.639701783657074,
      "learning_rate": 2.993455905640758e-05,
      "loss": 1.0045,
      "step": 140
    },
    {
      "epoch": 0.4101838755304102,
      "grad_norm": 1.8686423301696777,
      "learning_rate": 2.9920828559049805e-05,
      "loss": 0.9764,
      "step": 145
    },
    {
      "epoch": 0.4243281471004243,
      "grad_norm": 0.7100444436073303,
      "learning_rate": 2.9905795212621825e-05,
      "loss": 1.0063,
      "step": 150
    },
    {
      "epoch": 0.43847241867043846,
      "grad_norm": 0.8259682059288025,
      "learning_rate": 2.9889460329797482e-05,
      "loss": 0.9937,
      "step": 155
    },
    {
      "epoch": 0.4526166902404526,
      "grad_norm": 0.6790755391120911,
      "learning_rate": 2.9871825336897495e-05,
      "loss": 0.9834,
      "step": 160
    },
    {
      "epoch": 0.4667609618104668,
      "grad_norm": 0.7154070734977722,
      "learning_rate": 2.985289177376491e-05,
      "loss": 0.9921,
      "step": 165
    },
    {
      "epoch": 0.4809052333804809,
      "grad_norm": 0.8498982787132263,
      "learning_rate": 2.983266129363065e-05,
      "loss": 0.9274,
      "step": 170
    },
    {
      "epoch": 0.49504950495049505,
      "grad_norm": 0.7261009216308594,
      "learning_rate": 2.9811135662969146e-05,
      "loss": 0.9445,
      "step": 175
    },
    {
      "epoch": 0.5091937765205092,
      "grad_norm": 1.0621588230133057,
      "learning_rate": 2.978831676134411e-05,
      "loss": 0.9137,
      "step": 180
    },
    {
      "epoch": 0.5233380480905233,
      "grad_norm": 0.9448386430740356,
      "learning_rate": 2.9764206581244415e-05,
      "loss": 0.9553,
      "step": 185
    },
    {
      "epoch": 0.5374823196605375,
      "grad_norm": 0.8367416858673096,
      "learning_rate": 2.973880722791009e-05,
      "loss": 0.8912,
      "step": 190
    },
    {
      "epoch": 0.5516265912305516,
      "grad_norm": 0.8499005436897278,
      "learning_rate": 2.971212091914854e-05,
      "loss": 0.9632,
      "step": 195
    },
    {
      "epoch": 0.5657708628005658,
      "grad_norm": 0.7935090065002441,
      "learning_rate": 2.968414998514085e-05,
      "loss": 0.8724,
      "step": 200
    },
    {
      "epoch": 0.57991513437058,
      "grad_norm": 0.8054782152175903,
      "learning_rate": 2.9654896868238334e-05,
      "loss": 0.8849,
      "step": 205
    },
    {
      "epoch": 0.594059405940594,
      "grad_norm": 0.9245705008506775,
      "learning_rate": 2.96243641227493e-05,
      "loss": 0.8934,
      "step": 210
    },
    {
      "epoch": 0.6082036775106082,
      "grad_norm": 0.851338803768158,
      "learning_rate": 2.959255441471597e-05,
      "loss": 0.8839,
      "step": 215
    },
    {
      "epoch": 0.6223479490806223,
      "grad_norm": 0.9376785159111023,
      "learning_rate": 2.9559470521681726e-05,
      "loss": 0.8592,
      "step": 220
    },
    {
      "epoch": 0.6364922206506365,
      "grad_norm": 0.8601529002189636,
      "learning_rate": 2.9525115332448557e-05,
      "loss": 0.8879,
      "step": 225
    },
    {
      "epoch": 0.6506364922206507,
      "grad_norm": 0.9360470175743103,
      "learning_rate": 2.948949184682484e-05,
      "loss": 0.8337,
      "step": 230
    },
    {
      "epoch": 0.6647807637906648,
      "grad_norm": 0.9480540156364441,
      "learning_rate": 2.9452603175363364e-05,
      "loss": 0.9082,
      "step": 235
    },
    {
      "epoch": 0.6789250353606789,
      "grad_norm": 0.9365689158439636,
      "learning_rate": 2.941445253908978e-05,
      "loss": 0.8773,
      "step": 240
    },
    {
      "epoch": 0.693069306930693,
      "grad_norm": 0.9173905253410339,
      "learning_rate": 2.9375043269221294e-05,
      "loss": 0.81,
      "step": 245
    },
    {
      "epoch": 0.7072135785007072,
      "grad_norm": 0.9756169319152832,
      "learning_rate": 2.9334378806875838e-05,
      "loss": 0.8046,
      "step": 250
    },
    {
      "epoch": 0.7213578500707214,
      "grad_norm": 0.9777851104736328,
      "learning_rate": 2.929246270277157e-05,
      "loss": 0.8452,
      "step": 255
    },
    {
      "epoch": 0.7355021216407355,
      "grad_norm": 0.9113903045654297,
      "learning_rate": 2.9249298616916856e-05,
      "loss": 0.8244,
      "step": 260
    },
    {
      "epoch": 0.7496463932107497,
      "grad_norm": 0.906170666217804,
      "learning_rate": 2.9204890318290666e-05,
      "loss": 0.7866,
      "step": 265
    },
    {
      "epoch": 0.7637906647807637,
      "grad_norm": 0.9952336549758911,
      "learning_rate": 2.915924168451349e-05,
      "loss": 0.817,
      "step": 270
    },
    {
      "epoch": 0.7779349363507779,
      "grad_norm": 1.1563059091567993,
      "learning_rate": 2.9112356701508756e-05,
      "loss": 0.8101,
      "step": 275
    },
    {
      "epoch": 0.7920792079207921,
      "grad_norm": 1.035338282585144,
      "learning_rate": 2.9064239463154782e-05,
      "loss": 0.7379,
      "step": 280
    },
    {
      "epoch": 0.8062234794908062,
      "grad_norm": 1.0756317377090454,
      "learning_rate": 2.9014894170927308e-05,
      "loss": 0.7709,
      "step": 285
    },
    {
      "epoch": 0.8203677510608204,
      "grad_norm": 1.1403799057006836,
      "learning_rate": 2.8964325133532638e-05,
      "loss": 0.8136,
      "step": 290
    },
    {
      "epoch": 0.8345120226308345,
      "grad_norm": 1.111162543296814,
      "learning_rate": 2.8912536766531424e-05,
      "loss": 0.7563,
      "step": 295
    },
    {
      "epoch": 0.8486562942008486,
      "grad_norm": 0.9991361498832703,
      "learning_rate": 2.885953359195308e-05,
      "loss": 0.74,
      "step": 300
    },
    {
      "epoch": 0.8628005657708628,
      "grad_norm": 0.9173119068145752,
      "learning_rate": 2.8805320237900965e-05,
      "loss": 0.7599,
      "step": 305
    },
    {
      "epoch": 0.8769448373408769,
      "grad_norm": 1.1988530158996582,
      "learning_rate": 2.874990143814826e-05,
      "loss": 0.7256,
      "step": 310
    },
    {
      "epoch": 0.8910891089108911,
      "grad_norm": 1.0119719505310059,
      "learning_rate": 2.8693282031724592e-05,
      "loss": 0.7472,
      "step": 315
    },
    {
      "epoch": 0.9052333804809052,
      "grad_norm": 1.0434908866882324,
      "learning_rate": 2.8635466962493564e-05,
      "loss": 0.7512,
      "step": 320
    },
    {
      "epoch": 0.9193776520509194,
      "grad_norm": 1.0775619745254517,
      "learning_rate": 2.8576461278721016e-05,
      "loss": 0.6982,
      "step": 325
    },
    {
      "epoch": 0.9335219236209336,
      "grad_norm": 0.9957691431045532,
      "learning_rate": 2.8516270132634244e-05,
      "loss": 0.6556,
      "step": 330
    },
    {
      "epoch": 0.9476661951909476,
      "grad_norm": 1.0233315229415894,
      "learning_rate": 2.845489877997213e-05,
      "loss": 0.7431,
      "step": 335
    },
    {
      "epoch": 0.9618104667609618,
      "grad_norm": 0.9704013466835022,
      "learning_rate": 2.8392352579526205e-05,
      "loss": 0.7121,
      "step": 340
    },
    {
      "epoch": 0.9759547383309759,
      "grad_norm": 1.0681896209716797,
      "learning_rate": 2.832863699267274e-05,
      "loss": 0.716,
      "step": 345
    },
    {
      "epoch": 0.9900990099009901,
      "grad_norm": 1.088863492012024,
      "learning_rate": 2.8263757582895888e-05,
      "loss": 0.699,
      "step": 350
    }
  ],
  "logging_steps": 5,
  "max_steps": 1770,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 2000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 4.9088477124991386e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}