{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.03385168729503861, "eval_steps": 20, "global_step": 80, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004231460911879826, "grad_norm": 1.6363259553909302, "learning_rate": 2e-05, "loss": 2.4891, "step": 1 }, { "epoch": 0.0004231460911879826, "eval_loss": 2.7103111743927, "eval_runtime": 25.5407, "eval_samples_per_second": 77.915, "eval_steps_per_second": 19.498, "step": 1 }, { "epoch": 0.0008462921823759652, "grad_norm": 1.6588494777679443, "learning_rate": 4e-05, "loss": 2.6718, "step": 2 }, { "epoch": 0.0012694382735639479, "grad_norm": 1.7058560848236084, "learning_rate": 6e-05, "loss": 2.6474, "step": 3 }, { "epoch": 0.0016925843647519305, "grad_norm": 1.6588222980499268, "learning_rate": 8e-05, "loss": 2.6717, "step": 4 }, { "epoch": 0.002115730455939913, "grad_norm": 1.5508973598480225, "learning_rate": 0.0001, "loss": 2.5936, "step": 5 }, { "epoch": 0.0025388765471278957, "grad_norm": 1.4414312839508057, "learning_rate": 0.00012, "loss": 2.6166, "step": 6 }, { "epoch": 0.0029620226383158784, "grad_norm": 1.3356502056121826, "learning_rate": 0.00014, "loss": 2.6361, "step": 7 }, { "epoch": 0.003385168729503861, "grad_norm": 1.3982197046279907, "learning_rate": 0.00016, "loss": 2.5908, "step": 8 }, { "epoch": 0.003808314820691844, "grad_norm": 1.4515146017074585, "learning_rate": 0.00018, "loss": 2.5478, "step": 9 }, { "epoch": 0.004231460911879826, "grad_norm": 1.5462111234664917, "learning_rate": 0.0002, "loss": 2.5379, "step": 10 }, { "epoch": 0.004654607003067809, "grad_norm": 1.5184928178787231, "learning_rate": 0.00019989930665413147, "loss": 2.5466, "step": 11 }, { "epoch": 0.0050777530942557915, "grad_norm": 1.452031135559082, "learning_rate": 0.00019959742939952392, "loss": 2.4949, "step": 12 }, { "epoch": 0.005500899185443774, "grad_norm": 1.4936366081237793, "learning_rate": 0.00019909497617679348, "loss": 2.3631, "step": 13 }, { "epoch": 0.005924045276631757, "grad_norm": 1.3953323364257812, "learning_rate": 0.00019839295885986296, "loss": 2.3978, "step": 14 }, { "epoch": 0.006347191367819739, "grad_norm": 1.361305594444275, "learning_rate": 0.00019749279121818235, "loss": 2.4401, "step": 15 }, { "epoch": 0.006770337459007722, "grad_norm": 1.3678853511810303, "learning_rate": 0.00019639628606958533, "loss": 2.437, "step": 16 }, { "epoch": 0.0071934835501957055, "grad_norm": 1.3215718269348145, "learning_rate": 0.00019510565162951537, "loss": 2.389, "step": 17 }, { "epoch": 0.007616629641383688, "grad_norm": 1.285584568977356, "learning_rate": 0.00019362348706397373, "loss": 2.3646, "step": 18 }, { "epoch": 0.00803977573257167, "grad_norm": 1.2454744577407837, "learning_rate": 0.0001919527772551451, "loss": 2.2511, "step": 19 }, { "epoch": 0.008462921823759652, "grad_norm": 1.3071717023849487, "learning_rate": 0.0001900968867902419, "loss": 2.3198, "step": 20 }, { "epoch": 0.008462921823759652, "eval_loss": 2.341797351837158, "eval_runtime": 25.0254, "eval_samples_per_second": 79.519, "eval_steps_per_second": 19.9, "step": 20 }, { "epoch": 0.008886067914947636, "grad_norm": 1.375653862953186, "learning_rate": 0.0001880595531856738, "loss": 2.3413, "step": 21 }, { "epoch": 0.009309214006135618, "grad_norm": 1.3680791854858398, "learning_rate": 0.00018584487936018661, "loss": 2.4474, "step": 22 }, { "epoch": 0.009732360097323601, "grad_norm": 1.444303035736084, "learning_rate": 0.00018345732537213027, "loss": 2.4309, "step": 23 }, { "epoch": 0.010155506188511583, "grad_norm": 1.3316677808761597, "learning_rate": 0.00018090169943749476, "loss": 2.3091, "step": 24 }, { "epoch": 0.010578652279699566, "grad_norm": 1.206446647644043, "learning_rate": 0.000178183148246803, "loss": 2.325, "step": 25 }, { "epoch": 0.011001798370887548, "grad_norm": 1.2919548749923706, "learning_rate": 0.00017530714660036112, "loss": 2.2293, "step": 26 }, { "epoch": 0.011424944462075532, "grad_norm": 1.4091441631317139, "learning_rate": 0.00017227948638273916, "loss": 2.2388, "step": 27 }, { "epoch": 0.011848090553263513, "grad_norm": 1.4587730169296265, "learning_rate": 0.00016910626489868649, "loss": 2.3036, "step": 28 }, { "epoch": 0.012271236644451497, "grad_norm": 1.8615366220474243, "learning_rate": 0.00016579387259397127, "loss": 2.3368, "step": 29 }, { "epoch": 0.012694382735639479, "grad_norm": 1.4323437213897705, "learning_rate": 0.00016234898018587337, "loss": 2.3043, "step": 30 }, { "epoch": 0.013117528826827462, "grad_norm": 1.3786976337432861, "learning_rate": 0.00015877852522924732, "loss": 2.2584, "step": 31 }, { "epoch": 0.013540674918015444, "grad_norm": 1.3786307573318481, "learning_rate": 0.00015508969814521025, "loss": 2.3891, "step": 32 }, { "epoch": 0.013963821009203427, "grad_norm": 1.3032869100570679, "learning_rate": 0.00015128992774059063, "loss": 2.2714, "step": 33 }, { "epoch": 0.014386967100391411, "grad_norm": 1.7123363018035889, "learning_rate": 0.00014738686624729986, "loss": 2.2354, "step": 34 }, { "epoch": 0.014810113191579393, "grad_norm": 1.4017690420150757, "learning_rate": 0.00014338837391175582, "loss": 2.1773, "step": 35 }, { "epoch": 0.015233259282767376, "grad_norm": 1.4052975177764893, "learning_rate": 0.00013930250316539238, "loss": 2.1841, "step": 36 }, { "epoch": 0.015656405373955358, "grad_norm": 1.3908483982086182, "learning_rate": 0.0001351374824081343, "loss": 2.2708, "step": 37 }, { "epoch": 0.01607955146514334, "grad_norm": 1.3868993520736694, "learning_rate": 0.00013090169943749476, "loss": 2.2283, "step": 38 }, { "epoch": 0.016502697556331325, "grad_norm": 1.2704371213912964, "learning_rate": 0.00012660368455666752, "loss": 2.2096, "step": 39 }, { "epoch": 0.016925843647519305, "grad_norm": 1.3375109434127808, "learning_rate": 0.00012225209339563145, "loss": 2.1863, "step": 40 }, { "epoch": 0.016925843647519305, "eval_loss": 2.2113733291625977, "eval_runtime": 25.1253, "eval_samples_per_second": 79.203, "eval_steps_per_second": 19.821, "step": 40 }, { "epoch": 0.01734898973870729, "grad_norm": 1.2191954851150513, "learning_rate": 0.00011785568947986367, "loss": 2.2794, "step": 41 }, { "epoch": 0.017772135829895272, "grad_norm": 6.1313605308532715, "learning_rate": 0.00011342332658176555, "loss": 2.3205, "step": 42 }, { "epoch": 0.018195281921083255, "grad_norm": 1.338478446006775, "learning_rate": 0.00010896393089034336, "loss": 2.1809, "step": 43 }, { "epoch": 0.018618428012271235, "grad_norm": 1.3417049646377563, "learning_rate": 0.00010448648303505151, "loss": 2.2497, "step": 44 }, { "epoch": 0.01904157410345922, "grad_norm": 1.32805335521698, "learning_rate": 0.0001, "loss": 2.1696, "step": 45 }, { "epoch": 0.019464720194647202, "grad_norm": 1.2384742498397827, "learning_rate": 9.551351696494854e-05, "loss": 2.1888, "step": 46 }, { "epoch": 0.019887866285835186, "grad_norm": 1.4238135814666748, "learning_rate": 9.103606910965666e-05, "loss": 2.3193, "step": 47 }, { "epoch": 0.020311012377023166, "grad_norm": 1.3317970037460327, "learning_rate": 8.657667341823448e-05, "loss": 2.1241, "step": 48 }, { "epoch": 0.02073415846821115, "grad_norm": 5.128082275390625, "learning_rate": 8.214431052013634e-05, "loss": 2.3208, "step": 49 }, { "epoch": 0.021157304559399133, "grad_norm": 1.5404480695724487, "learning_rate": 7.774790660436858e-05, "loss": 2.4014, "step": 50 }, { "epoch": 0.021580450650587116, "grad_norm": 1.2230170965194702, "learning_rate": 7.339631544333249e-05, "loss": 2.1396, "step": 51 }, { "epoch": 0.022003596741775096, "grad_norm": 1.3236002922058105, "learning_rate": 6.909830056250527e-05, "loss": 2.1162, "step": 52 }, { "epoch": 0.02242674283296308, "grad_norm": 1.2597148418426514, "learning_rate": 6.486251759186572e-05, "loss": 2.0798, "step": 53 }, { "epoch": 0.022849888924151063, "grad_norm": 1.177053689956665, "learning_rate": 6.069749683460765e-05, "loss": 2.1683, "step": 54 }, { "epoch": 0.023273035015339047, "grad_norm": 1.136125922203064, "learning_rate": 5.6611626088244194e-05, "loss": 2.0631, "step": 55 }, { "epoch": 0.023696181106527027, "grad_norm": 1.4012868404388428, "learning_rate": 5.261313375270014e-05, "loss": 2.2295, "step": 56 }, { "epoch": 0.02411932719771501, "grad_norm": 1.1680681705474854, "learning_rate": 4.87100722594094e-05, "loss": 2.2061, "step": 57 }, { "epoch": 0.024542473288902994, "grad_norm": 1.1918954849243164, "learning_rate": 4.491030185478976e-05, "loss": 2.07, "step": 58 }, { "epoch": 0.024965619380090977, "grad_norm": 1.3730967044830322, "learning_rate": 4.12214747707527e-05, "loss": 2.1515, "step": 59 }, { "epoch": 0.025388765471278957, "grad_norm": 1.1280912160873413, "learning_rate": 3.7651019814126654e-05, "loss": 2.156, "step": 60 }, { "epoch": 0.025388765471278957, "eval_loss": 2.168715715408325, "eval_runtime": 25.4328, "eval_samples_per_second": 78.245, "eval_steps_per_second": 19.581, "step": 60 }, { "epoch": 0.02581191156246694, "grad_norm": 1.1480927467346191, "learning_rate": 3.4206127406028745e-05, "loss": 2.231, "step": 61 }, { "epoch": 0.026235057653654924, "grad_norm": 1.1314619779586792, "learning_rate": 3.089373510131354e-05, "loss": 2.1425, "step": 62 }, { "epoch": 0.026658203744842908, "grad_norm": 1.1381090879440308, "learning_rate": 2.7720513617260856e-05, "loss": 2.1525, "step": 63 }, { "epoch": 0.027081349836030888, "grad_norm": 1.2480595111846924, "learning_rate": 2.4692853399638917e-05, "loss": 2.1493, "step": 64 }, { "epoch": 0.02750449592721887, "grad_norm": 1.112952709197998, "learning_rate": 2.181685175319702e-05, "loss": 2.0714, "step": 65 }, { "epoch": 0.027927642018406855, "grad_norm": 1.2665915489196777, "learning_rate": 1.9098300562505266e-05, "loss": 2.1679, "step": 66 }, { "epoch": 0.02835078810959484, "grad_norm": 1.1438580751419067, "learning_rate": 1.6542674627869737e-05, "loss": 2.2069, "step": 67 }, { "epoch": 0.028773934200782822, "grad_norm": 1.0998873710632324, "learning_rate": 1.415512063981339e-05, "loss": 2.1404, "step": 68 }, { "epoch": 0.029197080291970802, "grad_norm": 1.0739041566848755, "learning_rate": 1.19404468143262e-05, "loss": 2.1207, "step": 69 }, { "epoch": 0.029620226383158785, "grad_norm": 1.206387996673584, "learning_rate": 9.903113209758096e-06, "loss": 2.2733, "step": 70 }, { "epoch": 0.03004337247434677, "grad_norm": 1.1807301044464111, "learning_rate": 8.047222744854943e-06, "loss": 2.1562, "step": 71 }, { "epoch": 0.030466518565534752, "grad_norm": 1.1681818962097168, "learning_rate": 6.37651293602628e-06, "loss": 2.1226, "step": 72 }, { "epoch": 0.030889664656722732, "grad_norm": 1.1383371353149414, "learning_rate": 4.8943483704846475e-06, "loss": 2.2101, "step": 73 }, { "epoch": 0.031312810747910716, "grad_norm": 1.1326333284378052, "learning_rate": 3.6037139304146762e-06, "loss": 2.0855, "step": 74 }, { "epoch": 0.031735956839098696, "grad_norm": 1.1640607118606567, "learning_rate": 2.5072087818176382e-06, "loss": 2.1586, "step": 75 }, { "epoch": 0.03215910293028668, "grad_norm": 1.2493934631347656, "learning_rate": 1.6070411401370334e-06, "loss": 2.1355, "step": 76 }, { "epoch": 0.03258224902147466, "grad_norm": 1.3628870248794556, "learning_rate": 9.0502382320653e-07, "loss": 2.1629, "step": 77 }, { "epoch": 0.03300539511266265, "grad_norm": 1.1996616125106812, "learning_rate": 4.025706004760932e-07, "loss": 2.2131, "step": 78 }, { "epoch": 0.03342854120385063, "grad_norm": 1.3560621738433838, "learning_rate": 1.0069334586854107e-07, "loss": 2.2186, "step": 79 }, { "epoch": 0.03385168729503861, "grad_norm": 1.1883161067962646, "learning_rate": 0.0, "loss": 2.2633, "step": 80 }, { "epoch": 0.03385168729503861, "eval_loss": 2.1620700359344482, "eval_runtime": 24.9252, "eval_samples_per_second": 79.839, "eval_steps_per_second": 19.98, "step": 80 } ], "logging_steps": 1, "max_steps": 80, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1476504563220480.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }