{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9826119126896042, "eval_steps": 100, "global_step": 504, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.059193488716241215, "grad_norm": 0.19465851783752441, "learning_rate": 2e-05, "loss": 1.1117, "step": 10 }, { "epoch": 0.11838697743248243, "grad_norm": 0.21135994791984558, "learning_rate": 4e-05, "loss": 1.1022, "step": 20 }, { "epoch": 0.17758046614872364, "grad_norm": 0.2708509862422943, "learning_rate": 6e-05, "loss": 1.0546, "step": 30 }, { "epoch": 0.23677395486496486, "grad_norm": 0.21392174065113068, "learning_rate": 8e-05, "loss": 0.94, "step": 40 }, { "epoch": 0.2959674435812061, "grad_norm": 0.17460189759731293, "learning_rate": 0.0001, "loss": 0.8246, "step": 50 }, { "epoch": 0.3551609322974473, "grad_norm": 0.16554006934165955, "learning_rate": 0.00012, "loss": 0.7437, "step": 60 }, { "epoch": 0.41435442101368847, "grad_norm": 0.16454806923866272, "learning_rate": 0.00014, "loss": 0.6901, "step": 70 }, { "epoch": 0.4735479097299297, "grad_norm": 0.1758105754852295, "learning_rate": 0.00016, "loss": 0.6429, "step": 80 }, { "epoch": 0.532741398446171, "grad_norm": 0.1696159690618515, "learning_rate": 0.00018, "loss": 0.6319, "step": 90 }, { "epoch": 0.5919348871624122, "grad_norm": 0.17172178626060486, "learning_rate": 0.0002, "loss": 0.5995, "step": 100 }, { "epoch": 0.5919348871624122, "eval_loss": 0.5879706740379333, "eval_runtime": 186.3324, "eval_samples_per_second": 3.687, "eval_steps_per_second": 0.462, "step": 100 }, { "epoch": 0.6511283758786534, "grad_norm": 0.16957880556583405, "learning_rate": 0.00019504950495049505, "loss": 0.5913, "step": 110 }, { "epoch": 0.7103218645948945, "grad_norm": 0.17516390979290009, "learning_rate": 0.0001900990099009901, "loss": 0.5803, "step": 120 }, { "epoch": 0.7695153533111357, "grad_norm": 0.18869660794734955, "learning_rate": 0.00018514851485148517, "loss": 0.5631, "step": 130 }, { "epoch": 0.8287088420273769, "grad_norm": 0.18655870854854584, "learning_rate": 0.00018019801980198022, "loss": 0.5538, "step": 140 }, { "epoch": 0.8879023307436182, "grad_norm": 0.18268819153308868, "learning_rate": 0.00017524752475247526, "loss": 0.5475, "step": 150 }, { "epoch": 0.9470958194598594, "grad_norm": 0.1926167607307434, "learning_rate": 0.0001702970297029703, "loss": 0.5462, "step": 160 }, { "epoch": 1.005919348871624, "grad_norm": 0.19758427143096924, "learning_rate": 0.00016534653465346535, "loss": 0.5375, "step": 170 }, { "epoch": 1.0651128375878653, "grad_norm": 0.19379234313964844, "learning_rate": 0.00016039603960396042, "loss": 0.5234, "step": 180 }, { "epoch": 1.1243063263041067, "grad_norm": 0.19841700792312622, "learning_rate": 0.00015544554455445547, "loss": 0.5336, "step": 190 }, { "epoch": 1.1834998150203477, "grad_norm": 0.19659265875816345, "learning_rate": 0.00015049504950495051, "loss": 0.5265, "step": 200 }, { "epoch": 1.1834998150203477, "eval_loss": 0.5296523571014404, "eval_runtime": 186.0309, "eval_samples_per_second": 3.693, "eval_steps_per_second": 0.462, "step": 200 }, { "epoch": 1.242693303736589, "grad_norm": 0.2189786285161972, "learning_rate": 0.00014554455445544556, "loss": 0.5228, "step": 210 }, { "epoch": 1.3018867924528301, "grad_norm": 0.2317192554473877, "learning_rate": 0.0001405940594059406, "loss": 0.5219, "step": 220 }, { "epoch": 1.3610802811690714, "grad_norm": 0.20830461382865906, "learning_rate": 0.00013564356435643565, "loss": 0.5095, "step": 230 }, { "epoch": 1.4202737698853127, "grad_norm": 0.2214994877576828, "learning_rate": 0.0001306930693069307, "loss": 0.5192, "step": 240 }, { "epoch": 1.4794672586015538, "grad_norm": 0.21869614720344543, "learning_rate": 0.00012574257425742574, "loss": 0.5031, "step": 250 }, { "epoch": 1.538660747317795, "grad_norm": 0.21614821255207062, "learning_rate": 0.0001207920792079208, "loss": 0.5177, "step": 260 }, { "epoch": 1.5978542360340362, "grad_norm": 0.2306758016347885, "learning_rate": 0.00011584158415841584, "loss": 0.5124, "step": 270 }, { "epoch": 1.6570477247502775, "grad_norm": 0.22710908949375153, "learning_rate": 0.0001108910891089109, "loss": 0.5025, "step": 280 }, { "epoch": 1.7162412134665188, "grad_norm": 0.22501707077026367, "learning_rate": 0.00010594059405940595, "loss": 0.5048, "step": 290 }, { "epoch": 1.77543470218276, "grad_norm": 0.22507990896701813, "learning_rate": 0.00010099009900990099, "loss": 0.5009, "step": 300 }, { "epoch": 1.77543470218276, "eval_loss": 0.508837103843689, "eval_runtime": 185.3987, "eval_samples_per_second": 3.706, "eval_steps_per_second": 0.464, "step": 300 }, { "epoch": 1.834628190899001, "grad_norm": 0.22626681625843048, "learning_rate": 9.603960396039604e-05, "loss": 0.5045, "step": 310 }, { "epoch": 1.8938216796152423, "grad_norm": 0.21802489459514618, "learning_rate": 9.10891089108911e-05, "loss": 0.5087, "step": 320 }, { "epoch": 1.9530151683314836, "grad_norm": 0.2331380993127823, "learning_rate": 8.613861386138614e-05, "loss": 0.5038, "step": 330 }, { "epoch": 2.011838697743248, "grad_norm": 0.21599650382995605, "learning_rate": 8.11881188118812e-05, "loss": 0.4912, "step": 340 }, { "epoch": 2.0710321864594894, "grad_norm": 0.24118073284626007, "learning_rate": 7.623762376237625e-05, "loss": 0.4842, "step": 350 }, { "epoch": 2.1302256751757307, "grad_norm": 0.232917919754982, "learning_rate": 7.128712871287129e-05, "loss": 0.4835, "step": 360 }, { "epoch": 2.189419163891972, "grad_norm": 0.23639123141765594, "learning_rate": 6.633663366336635e-05, "loss": 0.4786, "step": 370 }, { "epoch": 2.2486126526082133, "grad_norm": 0.2632993161678314, "learning_rate": 6.13861386138614e-05, "loss": 0.4824, "step": 380 }, { "epoch": 2.307806141324454, "grad_norm": 0.2573336064815521, "learning_rate": 5.643564356435643e-05, "loss": 0.4771, "step": 390 }, { "epoch": 2.3669996300406955, "grad_norm": 0.24001090228557587, "learning_rate": 5.148514851485149e-05, "loss": 0.4848, "step": 400 }, { "epoch": 2.3669996300406955, "eval_loss": 0.4998326301574707, "eval_runtime": 186.3844, "eval_samples_per_second": 3.686, "eval_steps_per_second": 0.461, "step": 400 }, { "epoch": 2.4261931187569368, "grad_norm": 0.2617679834365845, "learning_rate": 4.653465346534654e-05, "loss": 0.4854, "step": 410 }, { "epoch": 2.485386607473178, "grad_norm": 0.2378547489643097, "learning_rate": 4.158415841584158e-05, "loss": 0.4848, "step": 420 }, { "epoch": 2.544580096189419, "grad_norm": 0.2498575747013092, "learning_rate": 3.6633663366336634e-05, "loss": 0.4768, "step": 430 }, { "epoch": 2.6037735849056602, "grad_norm": 0.2568998336791992, "learning_rate": 3.1683168316831686e-05, "loss": 0.4863, "step": 440 }, { "epoch": 2.6629670736219015, "grad_norm": 0.25008225440979004, "learning_rate": 2.6732673267326734e-05, "loss": 0.4747, "step": 450 }, { "epoch": 2.722160562338143, "grad_norm": 0.24171195924282074, "learning_rate": 2.1782178217821783e-05, "loss": 0.4736, "step": 460 }, { "epoch": 2.781354051054384, "grad_norm": 0.2468671202659607, "learning_rate": 1.6831683168316834e-05, "loss": 0.4894, "step": 470 }, { "epoch": 2.8405475397706255, "grad_norm": 0.2496197372674942, "learning_rate": 1.1881188118811881e-05, "loss": 0.4798, "step": 480 }, { "epoch": 2.8997410284868663, "grad_norm": 0.24788333475589752, "learning_rate": 6.9306930693069314e-06, "loss": 0.4739, "step": 490 }, { "epoch": 2.9589345172031076, "grad_norm": 0.2498323768377304, "learning_rate": 1.9801980198019803e-06, "loss": 0.4718, "step": 500 }, { "epoch": 2.9589345172031076, "eval_loss": 0.49539193511009216, "eval_runtime": 185.1827, "eval_samples_per_second": 3.71, "eval_steps_per_second": 0.464, "step": 500 } ], "logging_steps": 10, "max_steps": 504, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.23509345222656e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }