| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.9961795606494745, |
| "eval_steps": 500, |
| "global_step": 783, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.038204393505253106, |
| "grad_norm": 17.106196027203886, |
| "learning_rate": 5e-06, |
| "loss": 1.0699, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.07640878701050621, |
| "grad_norm": 3.7575313843264073, |
| "learning_rate": 5e-06, |
| "loss": 0.9412, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.11461318051575932, |
| "grad_norm": 2.0783280143086302, |
| "learning_rate": 5e-06, |
| "loss": 0.8972, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.15281757402101243, |
| "grad_norm": 1.419216051491444, |
| "learning_rate": 5e-06, |
| "loss": 0.8682, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.19102196752626552, |
| "grad_norm": 1.5492013803153966, |
| "learning_rate": 5e-06, |
| "loss": 0.8507, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.22922636103151864, |
| "grad_norm": 0.9979261775157234, |
| "learning_rate": 5e-06, |
| "loss": 0.8317, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.26743075453677173, |
| "grad_norm": 1.1453499414283712, |
| "learning_rate": 5e-06, |
| "loss": 0.8257, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.30563514804202485, |
| "grad_norm": 1.0657509981340374, |
| "learning_rate": 5e-06, |
| "loss": 0.8136, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.3438395415472779, |
| "grad_norm": 0.8154629296380118, |
| "learning_rate": 5e-06, |
| "loss": 0.8012, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.38204393505253104, |
| "grad_norm": 0.7783174666894658, |
| "learning_rate": 5e-06, |
| "loss": 0.7944, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.42024832855778416, |
| "grad_norm": 0.7557295182511866, |
| "learning_rate": 5e-06, |
| "loss": 0.7976, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.4584527220630373, |
| "grad_norm": 0.7419048503084669, |
| "learning_rate": 5e-06, |
| "loss": 0.7838, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.49665711556829034, |
| "grad_norm": 0.6023078446753443, |
| "learning_rate": 5e-06, |
| "loss": 0.7861, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.5348615090735435, |
| "grad_norm": 0.8029384495653265, |
| "learning_rate": 5e-06, |
| "loss": 0.7794, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.5730659025787965, |
| "grad_norm": 0.8049936585803824, |
| "learning_rate": 5e-06, |
| "loss": 0.7824, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.6112702960840497, |
| "grad_norm": 0.8903153692892993, |
| "learning_rate": 5e-06, |
| "loss": 0.7716, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.6494746895893028, |
| "grad_norm": 0.7481918362618383, |
| "learning_rate": 5e-06, |
| "loss": 0.7781, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.6876790830945558, |
| "grad_norm": 0.8571380886679489, |
| "learning_rate": 5e-06, |
| "loss": 0.7728, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.725883476599809, |
| "grad_norm": 0.603577733589318, |
| "learning_rate": 5e-06, |
| "loss": 0.7693, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.7640878701050621, |
| "grad_norm": 0.6986000212250895, |
| "learning_rate": 5e-06, |
| "loss": 0.7708, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.8022922636103151, |
| "grad_norm": 0.6654581267839026, |
| "learning_rate": 5e-06, |
| "loss": 0.7686, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.8404966571155683, |
| "grad_norm": 0.7161329148587753, |
| "learning_rate": 5e-06, |
| "loss": 0.7639, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.8787010506208214, |
| "grad_norm": 0.7291450225202621, |
| "learning_rate": 5e-06, |
| "loss": 0.7653, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.9169054441260746, |
| "grad_norm": 0.7588585120562266, |
| "learning_rate": 5e-06, |
| "loss": 0.7633, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.9551098376313276, |
| "grad_norm": 0.8527681409489486, |
| "learning_rate": 5e-06, |
| "loss": 0.758, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.9933142311365807, |
| "grad_norm": 0.5733961547084557, |
| "learning_rate": 5e-06, |
| "loss": 0.7556, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.997134670487106, |
| "eval_loss": 0.7598350644111633, |
| "eval_runtime": 277.7141, |
| "eval_samples_per_second": 25.397, |
| "eval_steps_per_second": 0.4, |
| "step": 261 |
| }, |
| { |
| "epoch": 1.033906399235912, |
| "grad_norm": 0.8279780719142003, |
| "learning_rate": 5e-06, |
| "loss": 0.7812, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.0721107927411653, |
| "grad_norm": 0.8247306859370325, |
| "learning_rate": 5e-06, |
| "loss": 0.7055, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.1103151862464182, |
| "grad_norm": 0.6966889761138212, |
| "learning_rate": 5e-06, |
| "loss": 0.7083, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.1485195797516714, |
| "grad_norm": 0.7158212384271236, |
| "learning_rate": 5e-06, |
| "loss": 0.7028, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.1867239732569246, |
| "grad_norm": 0.6513073930120158, |
| "learning_rate": 5e-06, |
| "loss": 0.7105, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.2249283667621778, |
| "grad_norm": 0.6782643522162839, |
| "learning_rate": 5e-06, |
| "loss": 0.7092, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.2631327602674307, |
| "grad_norm": 0.7229522703779768, |
| "learning_rate": 5e-06, |
| "loss": 0.7027, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.3013371537726839, |
| "grad_norm": 0.686689277377695, |
| "learning_rate": 5e-06, |
| "loss": 0.7092, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.3395415472779368, |
| "grad_norm": 0.6686838666832164, |
| "learning_rate": 5e-06, |
| "loss": 0.7076, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.37774594078319, |
| "grad_norm": 0.8906041633095456, |
| "learning_rate": 5e-06, |
| "loss": 0.703, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.4159503342884432, |
| "grad_norm": 0.6554966723136705, |
| "learning_rate": 5e-06, |
| "loss": 0.7078, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.4541547277936964, |
| "grad_norm": 0.6044694699607192, |
| "learning_rate": 5e-06, |
| "loss": 0.7064, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.4923591212989493, |
| "grad_norm": 0.6795649168516978, |
| "learning_rate": 5e-06, |
| "loss": 0.7042, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.5305635148042025, |
| "grad_norm": 0.6518471106071317, |
| "learning_rate": 5e-06, |
| "loss": 0.7055, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.5687679083094554, |
| "grad_norm": 0.656923516770704, |
| "learning_rate": 5e-06, |
| "loss": 0.707, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.6069723018147086, |
| "grad_norm": 0.6225351538335282, |
| "learning_rate": 5e-06, |
| "loss": 0.7045, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.6451766953199618, |
| "grad_norm": 0.777993155808027, |
| "learning_rate": 5e-06, |
| "loss": 0.7056, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.683381088825215, |
| "grad_norm": 0.6859522609411438, |
| "learning_rate": 5e-06, |
| "loss": 0.7047, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.7215854823304682, |
| "grad_norm": 0.6651740116728566, |
| "learning_rate": 5e-06, |
| "loss": 0.7004, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.759789875835721, |
| "grad_norm": 0.5969267067929888, |
| "learning_rate": 5e-06, |
| "loss": 0.7012, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.797994269340974, |
| "grad_norm": 0.613011364797191, |
| "learning_rate": 5e-06, |
| "loss": 0.7073, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.8361986628462272, |
| "grad_norm": 0.6207675725240294, |
| "learning_rate": 5e-06, |
| "loss": 0.7002, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.8744030563514804, |
| "grad_norm": 0.5957548029122114, |
| "learning_rate": 5e-06, |
| "loss": 0.7063, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.9126074498567336, |
| "grad_norm": 0.6972405923078006, |
| "learning_rate": 5e-06, |
| "loss": 0.7046, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.9508118433619868, |
| "grad_norm": 0.6246007349854923, |
| "learning_rate": 5e-06, |
| "loss": 0.7074, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.9890162368672397, |
| "grad_norm": 0.6291588284508969, |
| "learning_rate": 5e-06, |
| "loss": 0.6989, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.9966571155682904, |
| "eval_loss": 0.7465963363647461, |
| "eval_runtime": 278.0536, |
| "eval_samples_per_second": 25.366, |
| "eval_steps_per_second": 0.399, |
| "step": 522 |
| }, |
| { |
| "epoch": 2.029608404966571, |
| "grad_norm": 0.9162417337081611, |
| "learning_rate": 5e-06, |
| "loss": 0.7192, |
| "step": 530 |
| }, |
| { |
| "epoch": 2.067812798471824, |
| "grad_norm": 0.6666792440916017, |
| "learning_rate": 5e-06, |
| "loss": 0.6447, |
| "step": 540 |
| }, |
| { |
| "epoch": 2.1060171919770774, |
| "grad_norm": 0.6320270044953581, |
| "learning_rate": 5e-06, |
| "loss": 0.6504, |
| "step": 550 |
| }, |
| { |
| "epoch": 2.1442215854823305, |
| "grad_norm": 0.8407791173957067, |
| "learning_rate": 5e-06, |
| "loss": 0.6486, |
| "step": 560 |
| }, |
| { |
| "epoch": 2.1824259789875837, |
| "grad_norm": 0.9821964292842589, |
| "learning_rate": 5e-06, |
| "loss": 0.6528, |
| "step": 570 |
| }, |
| { |
| "epoch": 2.2206303724928365, |
| "grad_norm": 0.9530943320810575, |
| "learning_rate": 5e-06, |
| "loss": 0.6536, |
| "step": 580 |
| }, |
| { |
| "epoch": 2.2588347659980896, |
| "grad_norm": 0.7065407366963496, |
| "learning_rate": 5e-06, |
| "loss": 0.6515, |
| "step": 590 |
| }, |
| { |
| "epoch": 2.297039159503343, |
| "grad_norm": 0.9143814042284045, |
| "learning_rate": 5e-06, |
| "loss": 0.6516, |
| "step": 600 |
| }, |
| { |
| "epoch": 2.335243553008596, |
| "grad_norm": 1.0356120889247198, |
| "learning_rate": 5e-06, |
| "loss": 0.6522, |
| "step": 610 |
| }, |
| { |
| "epoch": 2.373447946513849, |
| "grad_norm": 0.5955479754526213, |
| "learning_rate": 5e-06, |
| "loss": 0.6484, |
| "step": 620 |
| }, |
| { |
| "epoch": 2.4116523400191023, |
| "grad_norm": 0.668667957683453, |
| "learning_rate": 5e-06, |
| "loss": 0.6517, |
| "step": 630 |
| }, |
| { |
| "epoch": 2.4498567335243555, |
| "grad_norm": 0.5916712868955213, |
| "learning_rate": 5e-06, |
| "loss": 0.6492, |
| "step": 640 |
| }, |
| { |
| "epoch": 2.4880611270296082, |
| "grad_norm": 0.7078011059499841, |
| "learning_rate": 5e-06, |
| "loss": 0.6499, |
| "step": 650 |
| }, |
| { |
| "epoch": 2.5262655205348614, |
| "grad_norm": 0.7113167536030103, |
| "learning_rate": 5e-06, |
| "loss": 0.6491, |
| "step": 660 |
| }, |
| { |
| "epoch": 2.5644699140401146, |
| "grad_norm": 0.7286729281959395, |
| "learning_rate": 5e-06, |
| "loss": 0.6559, |
| "step": 670 |
| }, |
| { |
| "epoch": 2.6026743075453678, |
| "grad_norm": 0.8393175922718014, |
| "learning_rate": 5e-06, |
| "loss": 0.6537, |
| "step": 680 |
| }, |
| { |
| "epoch": 2.640878701050621, |
| "grad_norm": 0.9635082597402534, |
| "learning_rate": 5e-06, |
| "loss": 0.645, |
| "step": 690 |
| }, |
| { |
| "epoch": 2.6790830945558737, |
| "grad_norm": 0.6376260449080609, |
| "learning_rate": 5e-06, |
| "loss": 0.6516, |
| "step": 700 |
| }, |
| { |
| "epoch": 2.7172874880611273, |
| "grad_norm": 0.9042073773765085, |
| "learning_rate": 5e-06, |
| "loss": 0.6538, |
| "step": 710 |
| }, |
| { |
| "epoch": 2.75549188156638, |
| "grad_norm": 0.8795780646670239, |
| "learning_rate": 5e-06, |
| "loss": 0.6537, |
| "step": 720 |
| }, |
| { |
| "epoch": 2.793696275071633, |
| "grad_norm": 0.7101546769683508, |
| "learning_rate": 5e-06, |
| "loss": 0.6507, |
| "step": 730 |
| }, |
| { |
| "epoch": 2.8319006685768864, |
| "grad_norm": 0.6112049740364579, |
| "learning_rate": 5e-06, |
| "loss": 0.6536, |
| "step": 740 |
| }, |
| { |
| "epoch": 2.8701050620821396, |
| "grad_norm": 0.6240740305145582, |
| "learning_rate": 5e-06, |
| "loss": 0.6525, |
| "step": 750 |
| }, |
| { |
| "epoch": 2.9083094555873927, |
| "grad_norm": 0.6687610050145816, |
| "learning_rate": 5e-06, |
| "loss": 0.6569, |
| "step": 760 |
| }, |
| { |
| "epoch": 2.9465138490926455, |
| "grad_norm": 0.7981405552978358, |
| "learning_rate": 5e-06, |
| "loss": 0.655, |
| "step": 770 |
| }, |
| { |
| "epoch": 2.9847182425978986, |
| "grad_norm": 0.6901040178181519, |
| "learning_rate": 5e-06, |
| "loss": 0.6567, |
| "step": 780 |
| }, |
| { |
| "epoch": 2.9961795606494745, |
| "eval_loss": 0.7494649887084961, |
| "eval_runtime": 280.3949, |
| "eval_samples_per_second": 25.154, |
| "eval_steps_per_second": 0.396, |
| "step": 783 |
| }, |
| { |
| "epoch": 2.9961795606494745, |
| "step": 783, |
| "total_flos": 1311344783523840.0, |
| "train_loss": 0.7237713550090181, |
| "train_runtime": 46210.0208, |
| "train_samples_per_second": 8.699, |
| "train_steps_per_second": 0.017 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 783, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1311344783523840.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |