{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 470, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.10743061772605192, "grad_norm": 20.089375743938827, "learning_rate": 3.8297872340425535e-06, "loss": 2.2361, "step": 10 }, { "epoch": 0.21486123545210384, "grad_norm": 1.8136781205400963, "learning_rate": 8.085106382978723e-06, "loss": 0.5542, "step": 20 }, { "epoch": 0.3222918531781558, "grad_norm": 5.833809524442147, "learning_rate": 1.2340425531914895e-05, "loss": 0.3915, "step": 30 }, { "epoch": 0.4297224709042077, "grad_norm": 1.3551663864951546, "learning_rate": 1.6595744680851064e-05, "loss": 0.3464, "step": 40 }, { "epoch": 0.5371530886302597, "grad_norm": 1.2219998499437525, "learning_rate": 1.9998896833611603e-05, "loss": 0.3284, "step": 50 }, { "epoch": 0.6445837063563116, "grad_norm": 3.325568321975458, "learning_rate": 1.9960311560501457e-05, "loss": 0.3098, "step": 60 }, { "epoch": 0.7520143240823635, "grad_norm": 1.5025152149862429, "learning_rate": 1.986681112191161e-05, "loss": 0.2947, "step": 70 }, { "epoch": 0.8594449418084154, "grad_norm": 1.1978088609682094, "learning_rate": 1.9718911023007382e-05, "loss": 0.2772, "step": 80 }, { "epoch": 0.9668755595344674, "grad_norm": 3.7209154133323414, "learning_rate": 1.9517426695952358e-05, "loss": 0.2637, "step": 90 }, { "epoch": 1.0644583706356312, "grad_norm": 0.9321825728768071, "learning_rate": 1.926346900410604e-05, "loss": 0.2477, "step": 100 }, { "epoch": 1.1718889883616832, "grad_norm": 1.1579496686187436, "learning_rate": 1.895843811739162e-05, "loss": 0.2384, "step": 110 }, { "epoch": 1.279319606087735, "grad_norm": 1.4252514462516224, "learning_rate": 1.8604015792601395e-05, "loss": 0.2313, "step": 120 }, { "epoch": 1.386750223813787, "grad_norm": 1.158427909482145, "learning_rate": 1.8202156101201646e-05, "loss": 0.2259, "step": 130 }, { "epoch": 1.4941808415398388, "grad_norm": 0.8524241535709315, "learning_rate": 1.7755074655758174e-05, "loss": 0.2207, "step": 140 }, { "epoch": 1.6016114592658908, "grad_norm": 0.6875946112635646, "learning_rate": 1.7265236394381634e-05, "loss": 0.2117, "step": 150 }, { "epoch": 1.7090420769919428, "grad_norm": 0.8721265188917838, "learning_rate": 1.6735341990541766e-05, "loss": 0.207, "step": 160 }, { "epoch": 1.8164726947179948, "grad_norm": 0.800843100380685, "learning_rate": 1.61683129631787e-05, "loss": 0.1962, "step": 170 }, { "epoch": 1.9239033124440466, "grad_norm": 0.7963931429193268, "learning_rate": 1.5567275569205216e-05, "loss": 0.1938, "step": 180 }, { "epoch": 2.0214861235452104, "grad_norm": 0.659012311906312, "learning_rate": 1.4935543567206984e-05, "loss": 0.1954, "step": 190 }, { "epoch": 2.1289167412712624, "grad_norm": 0.7791125669343928, "learning_rate": 1.4276599947371388e-05, "loss": 0.17, "step": 200 }, { "epoch": 2.2363473589973144, "grad_norm": 0.7255530063668101, "learning_rate": 1.3594077728375129e-05, "loss": 0.1676, "step": 210 }, { "epoch": 2.3437779767233664, "grad_norm": 1.0972658675565765, "learning_rate": 1.2891739927104992e-05, "loss": 0.1634, "step": 220 }, { "epoch": 2.451208594449418, "grad_norm": 0.8782737880409247, "learning_rate": 1.217345881164667e-05, "loss": 0.1619, "step": 230 }, { "epoch": 2.55863921217547, "grad_norm": 0.8377800753430866, "learning_rate": 1.1443194551928267e-05, "loss": 0.16, "step": 240 }, { "epoch": 2.666069829901522, "grad_norm": 0.8148515538842358, "learning_rate": 1.0704973385725853e-05, "loss": 0.1545, "step": 250 }, { "epoch": 2.773500447627574, "grad_norm": 0.8382605438092161, "learning_rate": 9.962865420410702e-06, "loss": 0.157, "step": 260 }, { "epoch": 2.880931065353626, "grad_norm": 0.8418234560769987, "learning_rate": 9.22096219282597e-06, "loss": 0.1516, "step": 270 }, { "epoch": 2.9883616830796775, "grad_norm": 1.2348680954607516, "learning_rate": 8.483354111014142e-06, "loss": 0.1507, "step": 280 }, { "epoch": 3.0859444941808416, "grad_norm": 1.4296779451123045, "learning_rate": 7.75410790216802e-06, "loss": 0.1338, "step": 290 }, { "epoch": 3.1933751119068936, "grad_norm": 0.936837659064759, "learning_rate": 7.037244191143662e-06, "loss": 0.1158, "step": 300 }, { "epoch": 3.3008057296329456, "grad_norm": 0.7518008256356895, "learning_rate": 6.336715333153869e-06, "loss": 0.1141, "step": 310 }, { "epoch": 3.408236347358997, "grad_norm": 0.9067706982590781, "learning_rate": 5.656383622859418e-06, "loss": 0.1123, "step": 320 }, { "epoch": 3.515666965085049, "grad_norm": 0.8868213545464397, "learning_rate": 5.000000000000003e-06, "loss": 0.1123, "step": 330 }, { "epoch": 3.623097582811101, "grad_norm": 1.0556337500861324, "learning_rate": 4.371183368969165e-06, "loss": 0.1067, "step": 340 }, { "epoch": 3.730528200537153, "grad_norm": 0.9611508609707269, "learning_rate": 3.7734006463527695e-06, "loss": 0.1101, "step": 350 }, { "epoch": 3.837958818263205, "grad_norm": 0.7820816197402637, "learning_rate": 3.209947646436752e-06, "loss": 0.1095, "step": 360 }, { "epoch": 3.9453894359892567, "grad_norm": 1.0504063992235715, "learning_rate": 2.6839309100699975e-06, "loss": 0.1052, "step": 370 }, { "epoch": 4.042972247090421, "grad_norm": 0.6865272264261028, "learning_rate": 2.1982505770671303e-06, "loss": 0.0921, "step": 380 }, { "epoch": 4.150402864816472, "grad_norm": 0.8796386863699859, "learning_rate": 1.7555843965823992e-06, "loss": 0.076, "step": 390 }, { "epoch": 4.257833482542525, "grad_norm": 0.887963511828224, "learning_rate": 1.3583729636118359e-06, "loss": 0.0733, "step": 400 }, { "epoch": 4.365264100268576, "grad_norm": 0.9590088128005394, "learning_rate": 1.0088062630208272e-06, "loss": 0.0731, "step": 410 }, { "epoch": 4.472694717994629, "grad_norm": 1.2910374680962302, "learning_rate": 7.088115952851238e-07, "loss": 0.0718, "step": 420 }, { "epoch": 4.58012533572068, "grad_norm": 0.9177315234691451, "learning_rate": 4.6004295051554236e-07, "loss": 0.0715, "step": 430 }, { "epoch": 4.687555953446733, "grad_norm": 0.8450380472855318, "learning_rate": 2.638718893515946e-07, "loss": 0.0725, "step": 440 }, { "epoch": 4.794986571172784, "grad_norm": 0.8170638735343974, "learning_rate": 1.21379981001305e-07, "loss": 0.0701, "step": 450 }, { "epoch": 4.902417188898836, "grad_norm": 0.9592988275261085, "learning_rate": 3.335284011929951e-08, "loss": 0.0701, "step": 460 }, { "epoch": 5.0, "grad_norm": 2.4307818442462943, "learning_rate": 2.75795400255241e-10, "loss": 0.0699, "step": 470 }, { "epoch": 5.0, "step": 470, "total_flos": 1189607016235008.0, "train_loss": 0.2205800841463373, "train_runtime": 29859.2322, "train_samples_per_second": 11.969, "train_steps_per_second": 0.016 } ], "logging_steps": 10, "max_steps": 470, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1189607016235008.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }