| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.9826119126896042, | |
| "eval_steps": 100, | |
| "global_step": 504, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.059193488716241215, | |
| "grad_norm": 0.19465851783752441, | |
| "learning_rate": 2e-05, | |
| "loss": 1.1117, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.11838697743248243, | |
| "grad_norm": 0.21135994791984558, | |
| "learning_rate": 4e-05, | |
| "loss": 1.1022, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.17758046614872364, | |
| "grad_norm": 0.2708509862422943, | |
| "learning_rate": 6e-05, | |
| "loss": 1.0546, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.23677395486496486, | |
| "grad_norm": 0.21392174065113068, | |
| "learning_rate": 8e-05, | |
| "loss": 0.94, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.2959674435812061, | |
| "grad_norm": 0.17460189759731293, | |
| "learning_rate": 0.0001, | |
| "loss": 0.8246, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.3551609322974473, | |
| "grad_norm": 0.16554006934165955, | |
| "learning_rate": 0.00012, | |
| "loss": 0.7437, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.41435442101368847, | |
| "grad_norm": 0.16454806923866272, | |
| "learning_rate": 0.00014, | |
| "loss": 0.6901, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.4735479097299297, | |
| "grad_norm": 0.1758105754852295, | |
| "learning_rate": 0.00016, | |
| "loss": 0.6429, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.532741398446171, | |
| "grad_norm": 0.1696159690618515, | |
| "learning_rate": 0.00018, | |
| "loss": 0.6319, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.5919348871624122, | |
| "grad_norm": 0.17172178626060486, | |
| "learning_rate": 0.0002, | |
| "loss": 0.5995, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.5919348871624122, | |
| "eval_loss": 0.5879706740379333, | |
| "eval_runtime": 186.3324, | |
| "eval_samples_per_second": 3.687, | |
| "eval_steps_per_second": 0.462, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.6511283758786534, | |
| "grad_norm": 0.16957880556583405, | |
| "learning_rate": 0.00019504950495049505, | |
| "loss": 0.5913, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.7103218645948945, | |
| "grad_norm": 0.17516390979290009, | |
| "learning_rate": 0.0001900990099009901, | |
| "loss": 0.5803, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.7695153533111357, | |
| "grad_norm": 0.18869660794734955, | |
| "learning_rate": 0.00018514851485148517, | |
| "loss": 0.5631, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.8287088420273769, | |
| "grad_norm": 0.18655870854854584, | |
| "learning_rate": 0.00018019801980198022, | |
| "loss": 0.5538, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.8879023307436182, | |
| "grad_norm": 0.18268819153308868, | |
| "learning_rate": 0.00017524752475247526, | |
| "loss": 0.5475, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.9470958194598594, | |
| "grad_norm": 0.1926167607307434, | |
| "learning_rate": 0.0001702970297029703, | |
| "loss": 0.5462, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 1.005919348871624, | |
| "grad_norm": 0.19758427143096924, | |
| "learning_rate": 0.00016534653465346535, | |
| "loss": 0.5375, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 1.0651128375878653, | |
| "grad_norm": 0.19379234313964844, | |
| "learning_rate": 0.00016039603960396042, | |
| "loss": 0.5234, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 1.1243063263041067, | |
| "grad_norm": 0.19841700792312622, | |
| "learning_rate": 0.00015544554455445547, | |
| "loss": 0.5336, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.1834998150203477, | |
| "grad_norm": 0.19659265875816345, | |
| "learning_rate": 0.00015049504950495051, | |
| "loss": 0.5265, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.1834998150203477, | |
| "eval_loss": 0.5296523571014404, | |
| "eval_runtime": 186.0309, | |
| "eval_samples_per_second": 3.693, | |
| "eval_steps_per_second": 0.462, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.242693303736589, | |
| "grad_norm": 0.2189786285161972, | |
| "learning_rate": 0.00014554455445544556, | |
| "loss": 0.5228, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 1.3018867924528301, | |
| "grad_norm": 0.2317192554473877, | |
| "learning_rate": 0.0001405940594059406, | |
| "loss": 0.5219, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.3610802811690714, | |
| "grad_norm": 0.20830461382865906, | |
| "learning_rate": 0.00013564356435643565, | |
| "loss": 0.5095, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.4202737698853127, | |
| "grad_norm": 0.2214994877576828, | |
| "learning_rate": 0.0001306930693069307, | |
| "loss": 0.5192, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.4794672586015538, | |
| "grad_norm": 0.21869614720344543, | |
| "learning_rate": 0.00012574257425742574, | |
| "loss": 0.5031, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.538660747317795, | |
| "grad_norm": 0.21614821255207062, | |
| "learning_rate": 0.0001207920792079208, | |
| "loss": 0.5177, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.5978542360340362, | |
| "grad_norm": 0.2306758016347885, | |
| "learning_rate": 0.00011584158415841584, | |
| "loss": 0.5124, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 1.6570477247502775, | |
| "grad_norm": 0.22710908949375153, | |
| "learning_rate": 0.0001108910891089109, | |
| "loss": 0.5025, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.7162412134665188, | |
| "grad_norm": 0.22501707077026367, | |
| "learning_rate": 0.00010594059405940595, | |
| "loss": 0.5048, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.77543470218276, | |
| "grad_norm": 0.22507990896701813, | |
| "learning_rate": 0.00010099009900990099, | |
| "loss": 0.5009, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.77543470218276, | |
| "eval_loss": 0.508837103843689, | |
| "eval_runtime": 185.3987, | |
| "eval_samples_per_second": 3.706, | |
| "eval_steps_per_second": 0.464, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.834628190899001, | |
| "grad_norm": 0.22626681625843048, | |
| "learning_rate": 9.603960396039604e-05, | |
| "loss": 0.5045, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 1.8938216796152423, | |
| "grad_norm": 0.21802489459514618, | |
| "learning_rate": 9.10891089108911e-05, | |
| "loss": 0.5087, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.9530151683314836, | |
| "grad_norm": 0.2331380993127823, | |
| "learning_rate": 8.613861386138614e-05, | |
| "loss": 0.5038, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 2.011838697743248, | |
| "grad_norm": 0.21599650382995605, | |
| "learning_rate": 8.11881188118812e-05, | |
| "loss": 0.4912, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 2.0710321864594894, | |
| "grad_norm": 0.24118073284626007, | |
| "learning_rate": 7.623762376237625e-05, | |
| "loss": 0.4842, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 2.1302256751757307, | |
| "grad_norm": 0.232917919754982, | |
| "learning_rate": 7.128712871287129e-05, | |
| "loss": 0.4835, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 2.189419163891972, | |
| "grad_norm": 0.23639123141765594, | |
| "learning_rate": 6.633663366336635e-05, | |
| "loss": 0.4786, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 2.2486126526082133, | |
| "grad_norm": 0.2632993161678314, | |
| "learning_rate": 6.13861386138614e-05, | |
| "loss": 0.4824, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 2.307806141324454, | |
| "grad_norm": 0.2573336064815521, | |
| "learning_rate": 5.643564356435643e-05, | |
| "loss": 0.4771, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 2.3669996300406955, | |
| "grad_norm": 0.24001090228557587, | |
| "learning_rate": 5.148514851485149e-05, | |
| "loss": 0.4848, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 2.3669996300406955, | |
| "eval_loss": 0.4998326301574707, | |
| "eval_runtime": 186.3844, | |
| "eval_samples_per_second": 3.686, | |
| "eval_steps_per_second": 0.461, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 2.4261931187569368, | |
| "grad_norm": 0.2617679834365845, | |
| "learning_rate": 4.653465346534654e-05, | |
| "loss": 0.4854, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 2.485386607473178, | |
| "grad_norm": 0.2378547489643097, | |
| "learning_rate": 4.158415841584158e-05, | |
| "loss": 0.4848, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 2.544580096189419, | |
| "grad_norm": 0.2498575747013092, | |
| "learning_rate": 3.6633663366336634e-05, | |
| "loss": 0.4768, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 2.6037735849056602, | |
| "grad_norm": 0.2568998336791992, | |
| "learning_rate": 3.1683168316831686e-05, | |
| "loss": 0.4863, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 2.6629670736219015, | |
| "grad_norm": 0.25008225440979004, | |
| "learning_rate": 2.6732673267326734e-05, | |
| "loss": 0.4747, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 2.722160562338143, | |
| "grad_norm": 0.24171195924282074, | |
| "learning_rate": 2.1782178217821783e-05, | |
| "loss": 0.4736, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 2.781354051054384, | |
| "grad_norm": 0.2468671202659607, | |
| "learning_rate": 1.6831683168316834e-05, | |
| "loss": 0.4894, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 2.8405475397706255, | |
| "grad_norm": 0.2496197372674942, | |
| "learning_rate": 1.1881188118811881e-05, | |
| "loss": 0.4798, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 2.8997410284868663, | |
| "grad_norm": 0.24788333475589752, | |
| "learning_rate": 6.9306930693069314e-06, | |
| "loss": 0.4739, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 2.9589345172031076, | |
| "grad_norm": 0.2498323768377304, | |
| "learning_rate": 1.9801980198019803e-06, | |
| "loss": 0.4718, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 2.9589345172031076, | |
| "eval_loss": 0.49539193511009216, | |
| "eval_runtime": 185.1827, | |
| "eval_samples_per_second": 3.71, | |
| "eval_steps_per_second": 0.464, | |
| "step": 500 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 504, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 9.23509345222656e+16, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |