{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.49868766404199477,
  "eval_steps": 500,
  "global_step": 95,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005249343832020997,
      "grad_norm": 4.262586140207107,
      "learning_rate": 1.2500000000000002e-07,
      "loss": 1.2143,
      "step": 1
    },
    {
      "epoch": 0.010498687664041995,
      "grad_norm": 4.1559742669756154,
      "learning_rate": 2.5000000000000004e-07,
      "loss": 1.2307,
      "step": 2
    },
    {
      "epoch": 0.015748031496062992,
      "grad_norm": 4.2196647284049895,
      "learning_rate": 3.75e-07,
      "loss": 1.2286,
      "step": 3
    },
    {
      "epoch": 0.02099737532808399,
      "grad_norm": 4.13634077943981,
      "learning_rate": 5.000000000000001e-07,
      "loss": 1.2002,
      "step": 4
    },
    {
      "epoch": 0.026246719160104987,
      "grad_norm": 4.015668455829927,
      "learning_rate": 6.25e-07,
      "loss": 1.1672,
      "step": 5
    },
    {
      "epoch": 0.031496062992125984,
      "grad_norm": 3.832855314884781,
      "learning_rate": 7.5e-07,
      "loss": 1.1993,
      "step": 6
    },
    {
      "epoch": 0.03674540682414698,
      "grad_norm": 3.8323407788221733,
      "learning_rate": 8.75e-07,
      "loss": 1.1554,
      "step": 7
    },
    {
      "epoch": 0.04199475065616798,
      "grad_norm": 3.7465244180174917,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 1.1672,
      "step": 8
    },
    {
      "epoch": 0.047244094488188976,
      "grad_norm": 3.7827251172961986,
      "learning_rate": 1.125e-06,
      "loss": 1.1755,
      "step": 9
    },
    {
      "epoch": 0.05249343832020997,
      "grad_norm": 3.470602675526565,
      "learning_rate": 1.25e-06,
      "loss": 1.1419,
      "step": 10
    },
    {
      "epoch": 0.05774278215223097,
      "grad_norm": 3.556221853917274,
      "learning_rate": 1.3750000000000002e-06,
      "loss": 1.194,
      "step": 11
    },
    {
      "epoch": 0.06299212598425197,
      "grad_norm": 3.324934060085957,
      "learning_rate": 1.5e-06,
      "loss": 1.1336,
      "step": 12
    },
    {
      "epoch": 0.06824146981627296,
      "grad_norm": 2.965981688480075,
      "learning_rate": 1.6250000000000001e-06,
      "loss": 1.1349,
      "step": 13
    },
    {
      "epoch": 0.07349081364829396,
      "grad_norm": 2.8658973663115046,
      "learning_rate": 1.75e-06,
      "loss": 1.1776,
      "step": 14
    },
    {
      "epoch": 0.07874015748031496,
      "grad_norm": 2.720689909744523,
      "learning_rate": 1.8750000000000003e-06,
      "loss": 1.1549,
      "step": 15
    },
    {
      "epoch": 0.08398950131233596,
      "grad_norm": 2.439062154183451,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 1.141,
      "step": 16
    },
    {
      "epoch": 0.08923884514435695,
      "grad_norm": 2.1353279033918002,
      "learning_rate": 2.125e-06,
      "loss": 1.139,
      "step": 17
    },
    {
      "epoch": 0.09448818897637795,
      "grad_norm": 1.2092875477650313,
      "learning_rate": 2.25e-06,
      "loss": 1.0516,
      "step": 18
    },
    {
      "epoch": 0.09973753280839895,
      "grad_norm": 1.0763847439987342,
      "learning_rate": 2.375e-06,
      "loss": 1.0802,
      "step": 19
    },
    {
      "epoch": 0.10498687664041995,
      "grad_norm": 1.0340250902529846,
      "learning_rate": 2.5e-06,
      "loss": 1.0607,
      "step": 20
    },
    {
      "epoch": 0.11023622047244094,
      "grad_norm": 0.8630354040462489,
      "learning_rate": 2.6250000000000003e-06,
      "loss": 1.0496,
      "step": 21
    },
    {
      "epoch": 0.11548556430446194,
      "grad_norm": 0.8072735722523627,
      "learning_rate": 2.7500000000000004e-06,
      "loss": 1.08,
      "step": 22
    },
    {
      "epoch": 0.12073490813648294,
      "grad_norm": 0.7305262800316248,
      "learning_rate": 2.875e-06,
      "loss": 1.0539,
      "step": 23
    },
    {
      "epoch": 0.12598425196850394,
      "grad_norm": 0.7406394208995156,
      "learning_rate": 3e-06,
      "loss": 1.0529,
      "step": 24
    },
    {
      "epoch": 0.13123359580052493,
      "grad_norm": 0.7903255346265977,
      "learning_rate": 3.125e-06,
      "loss": 0.9914,
      "step": 25
    },
    {
      "epoch": 0.13648293963254593,
      "grad_norm": 0.8017049075586423,
      "learning_rate": 3.2500000000000002e-06,
      "loss": 1.0313,
      "step": 26
    },
    {
      "epoch": 0.14173228346456693,
      "grad_norm": 0.7764417012146556,
      "learning_rate": 3.3750000000000003e-06,
      "loss": 1.0422,
      "step": 27
    },
    {
      "epoch": 0.14698162729658792,
      "grad_norm": 0.7112571206699242,
      "learning_rate": 3.5e-06,
      "loss": 1.0187,
      "step": 28
    },
    {
      "epoch": 0.15223097112860892,
      "grad_norm": 0.6069574236656299,
      "learning_rate": 3.625e-06,
      "loss": 0.9958,
      "step": 29
    },
    {
      "epoch": 0.15748031496062992,
      "grad_norm": 0.5747553586770598,
      "learning_rate": 3.7500000000000005e-06,
      "loss": 0.9858,
      "step": 30
    },
    {
      "epoch": 0.16272965879265092,
      "grad_norm": 0.590510044443011,
      "learning_rate": 3.875e-06,
      "loss": 0.9841,
      "step": 31
    },
    {
      "epoch": 0.1679790026246719,
      "grad_norm": 0.5240510507345066,
      "learning_rate": 4.000000000000001e-06,
      "loss": 1.0171,
      "step": 32
    },
    {
      "epoch": 0.1732283464566929,
      "grad_norm": 0.4913378134702597,
      "learning_rate": 4.125e-06,
      "loss": 1.0218,
      "step": 33
    },
    {
      "epoch": 0.1784776902887139,
      "grad_norm": 0.47909247638813307,
      "learning_rate": 4.25e-06,
      "loss": 1.02,
      "step": 34
    },
    {
      "epoch": 0.1837270341207349,
      "grad_norm": 0.4949722185401137,
      "learning_rate": 4.3750000000000005e-06,
      "loss": 1.0206,
      "step": 35
    },
    {
      "epoch": 0.1889763779527559,
      "grad_norm": 0.48456911030733396,
      "learning_rate": 4.5e-06,
      "loss": 1.0321,
      "step": 36
    },
    {
      "epoch": 0.1942257217847769,
      "grad_norm": 0.49965866704889284,
      "learning_rate": 4.625000000000001e-06,
      "loss": 1.0184,
      "step": 37
    },
    {
      "epoch": 0.1994750656167979,
      "grad_norm": 0.4898360279427832,
      "learning_rate": 4.75e-06,
      "loss": 1.0165,
      "step": 38
    },
    {
      "epoch": 0.2047244094488189,
      "grad_norm": 0.5039246597121783,
      "learning_rate": 4.875e-06,
      "loss": 0.9811,
      "step": 39
    },
    {
      "epoch": 0.2099737532808399,
      "grad_norm": 0.462371012430454,
      "learning_rate": 5e-06,
      "loss": 0.9999,
      "step": 40
    },
    {
      "epoch": 0.2152230971128609,
      "grad_norm": 0.4564370676360458,
      "learning_rate": 4.99989327925842e-06,
      "loss": 1.0056,
      "step": 41
    },
    {
      "epoch": 0.2204724409448819,
      "grad_norm": 0.4838548423542603,
      "learning_rate": 4.999573126145132e-06,
      "loss": 1.0099,
      "step": 42
    },
    {
      "epoch": 0.22572178477690288,
      "grad_norm": 0.47657399375831033,
      "learning_rate": 4.999039567993719e-06,
      "loss": 1.0059,
      "step": 43
    },
    {
      "epoch": 0.23097112860892388,
      "grad_norm": 0.4609228968128241,
      "learning_rate": 4.998292650357558e-06,
      "loss": 0.9613,
      "step": 44
    },
    {
      "epoch": 0.23622047244094488,
      "grad_norm": 0.5379441335347738,
      "learning_rate": 4.997332437005932e-06,
      "loss": 0.9912,
      "step": 45
    },
    {
      "epoch": 0.24146981627296588,
      "grad_norm": 0.4488276134846175,
      "learning_rate": 4.996159009918586e-06,
      "loss": 0.9623,
      "step": 46
    },
    {
      "epoch": 0.24671916010498687,
      "grad_norm": 0.7899879742350473,
      "learning_rate": 4.994772469278726e-06,
      "loss": 0.9373,
      "step": 47
    },
    {
      "epoch": 0.25196850393700787,
      "grad_norm": 0.45351107525432893,
      "learning_rate": 4.99317293346447e-06,
      "loss": 0.9312,
      "step": 48
    },
    {
      "epoch": 0.2572178477690289,
      "grad_norm": 0.4145057356223518,
      "learning_rate": 4.991360539038737e-06,
      "loss": 0.9133,
      "step": 49
    },
    {
      "epoch": 0.26246719160104987,
      "grad_norm": 0.4371226648356658,
      "learning_rate": 4.989335440737587e-06,
      "loss": 0.9763,
      "step": 50
    },
    {
      "epoch": 0.2677165354330709,
      "grad_norm": 0.49987252416920314,
      "learning_rate": 4.987097811457015e-06,
      "loss": 0.9753,
      "step": 51
    },
    {
      "epoch": 0.27296587926509186,
      "grad_norm": 0.4737378597917066,
      "learning_rate": 4.984647842238185e-06,
      "loss": 0.9509,
      "step": 52
    },
    {
      "epoch": 0.2782152230971129,
      "grad_norm": 0.4803218242006868,
      "learning_rate": 4.981985742251123e-06,
      "loss": 1.0008,
      "step": 53
    },
    {
      "epoch": 0.28346456692913385,
      "grad_norm": 0.4238793960297473,
      "learning_rate": 4.9791117387768575e-06,
      "loss": 1.0024,
      "step": 54
    },
    {
      "epoch": 0.2887139107611549,
      "grad_norm": 0.4135698693902407,
      "learning_rate": 4.976026077188013e-06,
      "loss": 0.9208,
      "step": 55
    },
    {
      "epoch": 0.29396325459317585,
      "grad_norm": 0.49893312330659967,
      "learning_rate": 4.972729020927866e-06,
      "loss": 0.9771,
      "step": 56
    },
    {
      "epoch": 0.2992125984251969,
      "grad_norm": 0.43878390791709027,
      "learning_rate": 4.9692208514878445e-06,
      "loss": 0.937,
      "step": 57
    },
    {
      "epoch": 0.30446194225721784,
      "grad_norm": 0.42250251333050837,
      "learning_rate": 4.965501868383507e-06,
      "loss": 0.9287,
      "step": 58
    },
    {
      "epoch": 0.30971128608923887,
      "grad_norm": 0.42426620742455357,
      "learning_rate": 4.961572389128959e-06,
      "loss": 0.9374,
      "step": 59
    },
    {
      "epoch": 0.31496062992125984,
      "grad_norm": 0.4622588583575654,
      "learning_rate": 4.957432749209755e-06,
      "loss": 0.99,
      "step": 60
    },
    {
      "epoch": 0.32020997375328086,
      "grad_norm": 0.4324798534582787,
      "learning_rate": 4.953083302054247e-06,
      "loss": 1.0035,
      "step": 61
    },
    {
      "epoch": 0.32545931758530183,
      "grad_norm": 0.4303590460079348,
      "learning_rate": 4.948524419003415e-06,
      "loss": 0.9585,
      "step": 62
    },
    {
      "epoch": 0.33070866141732286,
      "grad_norm": 0.42861048906851473,
      "learning_rate": 4.943756489279164e-06,
      "loss": 0.9772,
      "step": 63
    },
    {
      "epoch": 0.3359580052493438,
      "grad_norm": 0.4115697149677722,
      "learning_rate": 4.938779919951092e-06,
      "loss": 0.9426,
      "step": 64
    },
    {
      "epoch": 0.34120734908136485,
      "grad_norm": 0.417300147661056,
      "learning_rate": 4.933595135901733e-06,
      "loss": 0.9447,
      "step": 65
    },
    {
      "epoch": 0.3464566929133858,
      "grad_norm": 0.4040601468423496,
      "learning_rate": 4.928202579790285e-06,
      "loss": 0.966,
      "step": 66
    },
    {
      "epoch": 0.35170603674540685,
      "grad_norm": 0.3677161548087925,
      "learning_rate": 4.9226027120148195e-06,
      "loss": 0.941,
      "step": 67
    },
    {
      "epoch": 0.3569553805774278,
      "grad_norm": 0.3832286188469758,
      "learning_rate": 4.916796010672969e-06,
      "loss": 0.9822,
      "step": 68
    },
    {
      "epoch": 0.36220472440944884,
      "grad_norm": 0.4345291089971557,
      "learning_rate": 4.910782971521112e-06,
      "loss": 0.9687,
      "step": 69
    },
    {
      "epoch": 0.3674540682414698,
      "grad_norm": 0.4133843702108161,
      "learning_rate": 4.904564107932048e-06,
      "loss": 0.9283,
      "step": 70
    },
    {
      "epoch": 0.37270341207349084,
      "grad_norm": 0.38465081625519515,
      "learning_rate": 4.898139950851163e-06,
      "loss": 0.9479,
      "step": 71
    },
    {
      "epoch": 0.3779527559055118,
      "grad_norm": 0.424340283488392,
      "learning_rate": 4.891511048751102e-06,
      "loss": 0.9593,
      "step": 72
    },
    {
      "epoch": 0.38320209973753283,
      "grad_norm": 0.40638719244646565,
      "learning_rate": 4.884677967584945e-06,
      "loss": 0.9264,
      "step": 73
    },
    {
      "epoch": 0.3884514435695538,
      "grad_norm": 0.4168684939745582,
      "learning_rate": 4.8776412907378845e-06,
      "loss": 0.9399,
      "step": 74
    },
    {
      "epoch": 0.3937007874015748,
      "grad_norm": 0.3928820462579945,
      "learning_rate": 4.870401618977415e-06,
      "loss": 0.9736,
      "step": 75
    },
    {
      "epoch": 0.3989501312335958,
      "grad_norm": 0.48005159652101537,
      "learning_rate": 4.86295957040205e-06,
      "loss": 0.9935,
      "step": 76
    },
    {
      "epoch": 0.4041994750656168,
      "grad_norm": 0.4229896582271501,
      "learning_rate": 4.855315780388541e-06,
      "loss": 0.9358,
      "step": 77
    },
    {
      "epoch": 0.4094488188976378,
      "grad_norm": 0.4258651800976577,
      "learning_rate": 4.847470901537642e-06,
      "loss": 0.9583,
      "step": 78
    },
    {
      "epoch": 0.4146981627296588,
      "grad_norm": 0.4069975102427097,
      "learning_rate": 4.839425603618382e-06,
      "loss": 0.9237,
      "step": 79
    },
    {
      "epoch": 0.4199475065616798,
      "grad_norm": 0.3853018091795304,
      "learning_rate": 4.83118057351089e-06,
      "loss": 0.9528,
      "step": 80
    },
    {
      "epoch": 0.4251968503937008,
      "grad_norm": 0.41640976933906576,
      "learning_rate": 4.822736515147748e-06,
      "loss": 0.9281,
      "step": 81
    },
    {
      "epoch": 0.4304461942257218,
      "grad_norm": 0.4341414623727107,
      "learning_rate": 4.814094149453891e-06,
      "loss": 0.983,
      "step": 82
    },
    {
      "epoch": 0.4356955380577428,
      "grad_norm": 0.41893835966431175,
      "learning_rate": 4.805254214285061e-06,
      "loss": 0.9691,
      "step": 83
    },
    {
      "epoch": 0.4409448818897638,
      "grad_norm": 0.3778203088676148,
      "learning_rate": 4.796217464364808e-06,
      "loss": 0.9386,
      "step": 84
    },
    {
      "epoch": 0.4461942257217848,
      "grad_norm": 0.4049971835755209,
      "learning_rate": 4.786984671220053e-06,
      "loss": 0.9604,
      "step": 85
    },
    {
      "epoch": 0.45144356955380577,
      "grad_norm": 0.39430021496671025,
      "learning_rate": 4.7775566231152216e-06,
      "loss": 0.9625,
      "step": 86
    },
    {
      "epoch": 0.4566929133858268,
      "grad_norm": 0.4303135030837173,
      "learning_rate": 4.767934124984941e-06,
      "loss": 0.9421,
      "step": 87
    },
    {
      "epoch": 0.46194225721784776,
      "grad_norm": 0.4037289734626591,
      "learning_rate": 4.7581179983653224e-06,
      "loss": 0.9395,
      "step": 88
    },
    {
      "epoch": 0.4671916010498688,
      "grad_norm": 0.389977925556389,
      "learning_rate": 4.7481090813238145e-06,
      "loss": 0.9494,
      "step": 89
    },
    {
      "epoch": 0.47244094488188976,
      "grad_norm": 0.4221791671439025,
      "learning_rate": 4.737908228387656e-06,
      "loss": 0.9236,
      "step": 90
    },
    {
      "epoch": 0.4776902887139108,
      "grad_norm": 0.38571752654505004,
      "learning_rate": 4.72751631047092e-06,
      "loss": 0.9836,
      "step": 91
    },
    {
      "epoch": 0.48293963254593175,
      "grad_norm": 0.3756683416227555,
      "learning_rate": 4.716934214800155e-06,
      "loss": 0.9847,
      "step": 92
    },
    {
      "epoch": 0.4881889763779528,
      "grad_norm": 0.43805821160946656,
      "learning_rate": 4.70616284483864e-06,
      "loss": 0.9759,
      "step": 93
    },
    {
      "epoch": 0.49343832020997375,
      "grad_norm": 0.42186430904629324,
      "learning_rate": 4.695203120209245e-06,
      "loss": 0.9381,
      "step": 94
    },
    {
      "epoch": 0.49868766404199477,
      "grad_norm": 0.3816672833085737,
      "learning_rate": 4.684055976615924e-06,
      "loss": 0.9381,
      "step": 95
    }
  ],
  "logging_steps": 1,
  "max_steps": 380,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 95,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.3333583542707814e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}