{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9954170485792853,
  "eval_steps": 500,
  "global_step": 816,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03666361136571952,
      "grad_norm": 10.920966734668648,
      "learning_rate": 5e-06,
      "loss": 1.033,
      "step": 10
    },
    {
      "epoch": 0.07332722273143905,
      "grad_norm": 2.445980198452297,
      "learning_rate": 5e-06,
      "loss": 0.9011,
      "step": 20
    },
    {
      "epoch": 0.10999083409715857,
      "grad_norm": 1.5424232386159482,
      "learning_rate": 5e-06,
      "loss": 0.8764,
      "step": 30
    },
    {
      "epoch": 0.1466544454628781,
      "grad_norm": 1.1797649838046136,
      "learning_rate": 5e-06,
      "loss": 0.8446,
      "step": 40
    },
    {
      "epoch": 0.18331805682859761,
      "grad_norm": 1.0295589020655365,
      "learning_rate": 5e-06,
      "loss": 0.8204,
      "step": 50
    },
    {
      "epoch": 0.21998166819431714,
      "grad_norm": 1.2160434357225554,
      "learning_rate": 5e-06,
      "loss": 0.8104,
      "step": 60
    },
    {
      "epoch": 0.2566452795600367,
      "grad_norm": 1.2135493715768004,
      "learning_rate": 5e-06,
      "loss": 0.7968,
      "step": 70
    },
    {
      "epoch": 0.2933088909257562,
      "grad_norm": 0.7594488712178804,
      "learning_rate": 5e-06,
      "loss": 0.7836,
      "step": 80
    },
    {
      "epoch": 0.32997250229147573,
      "grad_norm": 0.8913076302913621,
      "learning_rate": 5e-06,
      "loss": 0.7781,
      "step": 90
    },
    {
      "epoch": 0.36663611365719523,
      "grad_norm": 1.126183659145103,
      "learning_rate": 5e-06,
      "loss": 0.7732,
      "step": 100
    },
    {
      "epoch": 0.4032997250229148,
      "grad_norm": 0.7476760341544976,
      "learning_rate": 5e-06,
      "loss": 0.7711,
      "step": 110
    },
    {
      "epoch": 0.4399633363886343,
      "grad_norm": 0.828783632948725,
      "learning_rate": 5e-06,
      "loss": 0.7637,
      "step": 120
    },
    {
      "epoch": 0.4766269477543538,
      "grad_norm": 0.7005369874659794,
      "learning_rate": 5e-06,
      "loss": 0.7617,
      "step": 130
    },
    {
      "epoch": 0.5132905591200734,
      "grad_norm": 0.6781356553576761,
      "learning_rate": 5e-06,
      "loss": 0.7562,
      "step": 140
    },
    {
      "epoch": 0.5499541704857929,
      "grad_norm": 0.6643060954517749,
      "learning_rate": 5e-06,
      "loss": 0.7601,
      "step": 150
    },
    {
      "epoch": 0.5866177818515124,
      "grad_norm": 0.654862572470797,
      "learning_rate": 5e-06,
      "loss": 0.7561,
      "step": 160
    },
    {
      "epoch": 0.6232813932172319,
      "grad_norm": 0.7126834121476828,
      "learning_rate": 5e-06,
      "loss": 0.7549,
      "step": 170
    },
    {
      "epoch": 0.6599450045829515,
      "grad_norm": 0.5845932549413623,
      "learning_rate": 5e-06,
      "loss": 0.7525,
      "step": 180
    },
    {
      "epoch": 0.696608615948671,
      "grad_norm": 0.583642927450063,
      "learning_rate": 5e-06,
      "loss": 0.7507,
      "step": 190
    },
    {
      "epoch": 0.7332722273143905,
      "grad_norm": 0.5759630428428489,
      "learning_rate": 5e-06,
      "loss": 0.7492,
      "step": 200
    },
    {
      "epoch": 0.76993583868011,
      "grad_norm": 0.597809207757354,
      "learning_rate": 5e-06,
      "loss": 0.7446,
      "step": 210
    },
    {
      "epoch": 0.8065994500458296,
      "grad_norm": 0.6520665055230834,
      "learning_rate": 5e-06,
      "loss": 0.7512,
      "step": 220
    },
    {
      "epoch": 0.843263061411549,
      "grad_norm": 0.6521761800994458,
      "learning_rate": 5e-06,
      "loss": 0.744,
      "step": 230
    },
    {
      "epoch": 0.8799266727772685,
      "grad_norm": 0.6083361886529014,
      "learning_rate": 5e-06,
      "loss": 0.7431,
      "step": 240
    },
    {
      "epoch": 0.916590284142988,
      "grad_norm": 0.8966782629847545,
      "learning_rate": 5e-06,
      "loss": 0.7399,
      "step": 250
    },
    {
      "epoch": 0.9532538955087076,
      "grad_norm": 0.6584181334872885,
      "learning_rate": 5e-06,
      "loss": 0.7457,
      "step": 260
    },
    {
      "epoch": 0.9899175068744271,
      "grad_norm": 0.5614900416740534,
      "learning_rate": 5e-06,
      "loss": 0.7434,
      "step": 270
    },
    {
      "epoch": 0.997250229147571,
      "eval_loss": 0.743977963924408,
      "eval_runtime": 96.6447,
      "eval_samples_per_second": 76.052,
      "eval_steps_per_second": 0.6,
      "step": 272
    },
    {
      "epoch": 1.0284142988084326,
      "grad_norm": 0.6377526876986616,
      "learning_rate": 5e-06,
      "loss": 0.7593,
      "step": 280
    },
    {
      "epoch": 1.065077910174152,
      "grad_norm": 0.8312923337011684,
      "learning_rate": 5e-06,
      "loss": 0.6885,
      "step": 290
    },
    {
      "epoch": 1.1017415215398716,
      "grad_norm": 0.6499984381614756,
      "learning_rate": 5e-06,
      "loss": 0.6893,
      "step": 300
    },
    {
      "epoch": 1.138405132905591,
      "grad_norm": 0.658519279927457,
      "learning_rate": 5e-06,
      "loss": 0.6868,
      "step": 310
    },
    {
      "epoch": 1.1750687442713108,
      "grad_norm": 0.6307182099292118,
      "learning_rate": 5e-06,
      "loss": 0.6885,
      "step": 320
    },
    {
      "epoch": 1.2117323556370303,
      "grad_norm": 0.6191143311988347,
      "learning_rate": 5e-06,
      "loss": 0.6871,
      "step": 330
    },
    {
      "epoch": 1.2483959670027498,
      "grad_norm": 0.6735946598593434,
      "learning_rate": 5e-06,
      "loss": 0.6935,
      "step": 340
    },
    {
      "epoch": 1.2850595783684693,
      "grad_norm": 0.7213451984916242,
      "learning_rate": 5e-06,
      "loss": 0.6943,
      "step": 350
    },
    {
      "epoch": 1.3217231897341888,
      "grad_norm": 0.5841901070016948,
      "learning_rate": 5e-06,
      "loss": 0.6938,
      "step": 360
    },
    {
      "epoch": 1.3583868010999083,
      "grad_norm": 0.6609752377099979,
      "learning_rate": 5e-06,
      "loss": 0.6856,
      "step": 370
    },
    {
      "epoch": 1.3950504124656278,
      "grad_norm": 0.6004672142282963,
      "learning_rate": 5e-06,
      "loss": 0.69,
      "step": 380
    },
    {
      "epoch": 1.4317140238313475,
      "grad_norm": 0.7494020947088555,
      "learning_rate": 5e-06,
      "loss": 0.682,
      "step": 390
    },
    {
      "epoch": 1.468377635197067,
      "grad_norm": 0.6711006066177567,
      "learning_rate": 5e-06,
      "loss": 0.6917,
      "step": 400
    },
    {
      "epoch": 1.5050412465627865,
      "grad_norm": 0.6517430215570676,
      "learning_rate": 5e-06,
      "loss": 0.6871,
      "step": 410
    },
    {
      "epoch": 1.541704857928506,
      "grad_norm": 0.6180564693914907,
      "learning_rate": 5e-06,
      "loss": 0.6829,
      "step": 420
    },
    {
      "epoch": 1.5783684692942255,
      "grad_norm": 0.5764324092377354,
      "learning_rate": 5e-06,
      "loss": 0.6824,
      "step": 430
    },
    {
      "epoch": 1.615032080659945,
      "grad_norm": 0.7134204082562298,
      "learning_rate": 5e-06,
      "loss": 0.6822,
      "step": 440
    },
    {
      "epoch": 1.6516956920256645,
      "grad_norm": 0.7630512170385407,
      "learning_rate": 5e-06,
      "loss": 0.6879,
      "step": 450
    },
    {
      "epoch": 1.6883593033913842,
      "grad_norm": 0.6285437172539765,
      "learning_rate": 5e-06,
      "loss": 0.6804,
      "step": 460
    },
    {
      "epoch": 1.7250229147571035,
      "grad_norm": 0.5968789313484854,
      "learning_rate": 5e-06,
      "loss": 0.686,
      "step": 470
    },
    {
      "epoch": 1.7616865261228232,
      "grad_norm": 0.6425175740435289,
      "learning_rate": 5e-06,
      "loss": 0.6856,
      "step": 480
    },
    {
      "epoch": 1.7983501374885427,
      "grad_norm": 0.7614365266625939,
      "learning_rate": 5e-06,
      "loss": 0.6814,
      "step": 490
    },
    {
      "epoch": 1.8350137488542622,
      "grad_norm": 0.5496379357416068,
      "learning_rate": 5e-06,
      "loss": 0.6855,
      "step": 500
    },
    {
      "epoch": 1.8716773602199817,
      "grad_norm": 0.8494093367270151,
      "learning_rate": 5e-06,
      "loss": 0.6875,
      "step": 510
    },
    {
      "epoch": 1.9083409715857012,
      "grad_norm": 0.6756166103000668,
      "learning_rate": 5e-06,
      "loss": 0.6856,
      "step": 520
    },
    {
      "epoch": 1.9450045829514209,
      "grad_norm": 0.7228484772895967,
      "learning_rate": 5e-06,
      "loss": 0.6841,
      "step": 530
    },
    {
      "epoch": 1.9816681943171401,
      "grad_norm": 0.7786774729146112,
      "learning_rate": 5e-06,
      "loss": 0.6845,
      "step": 540
    },
    {
      "epoch": 1.996333638863428,
      "eval_loss": 0.7305116057395935,
      "eval_runtime": 96.1532,
      "eval_samples_per_second": 76.441,
      "eval_steps_per_second": 0.603,
      "step": 544
    },
    {
      "epoch": 2.020164986251146,
      "grad_norm": 1.0425759476709966,
      "learning_rate": 5e-06,
      "loss": 0.707,
      "step": 550
    },
    {
      "epoch": 2.056828597616865,
      "grad_norm": 0.8473344095764829,
      "learning_rate": 5e-06,
      "loss": 0.6313,
      "step": 560
    },
    {
      "epoch": 2.093492208982585,
      "grad_norm": 0.7205628261028438,
      "learning_rate": 5e-06,
      "loss": 0.6281,
      "step": 570
    },
    {
      "epoch": 2.130155820348304,
      "grad_norm": 0.6604987014823058,
      "learning_rate": 5e-06,
      "loss": 0.631,
      "step": 580
    },
    {
      "epoch": 2.166819431714024,
      "grad_norm": 0.6774961015973217,
      "learning_rate": 5e-06,
      "loss": 0.6351,
      "step": 590
    },
    {
      "epoch": 2.203483043079743,
      "grad_norm": 0.8519809292040578,
      "learning_rate": 5e-06,
      "loss": 0.634,
      "step": 600
    },
    {
      "epoch": 2.240146654445463,
      "grad_norm": 0.693823740633704,
      "learning_rate": 5e-06,
      "loss": 0.6327,
      "step": 610
    },
    {
      "epoch": 2.276810265811182,
      "grad_norm": 0.6448705487045298,
      "learning_rate": 5e-06,
      "loss": 0.6339,
      "step": 620
    },
    {
      "epoch": 2.313473877176902,
      "grad_norm": 0.5865817788059118,
      "learning_rate": 5e-06,
      "loss": 0.636,
      "step": 630
    },
    {
      "epoch": 2.3501374885426216,
      "grad_norm": 0.8116556137845999,
      "learning_rate": 5e-06,
      "loss": 0.6342,
      "step": 640
    },
    {
      "epoch": 2.386801099908341,
      "grad_norm": 0.6231657257473445,
      "learning_rate": 5e-06,
      "loss": 0.637,
      "step": 650
    },
    {
      "epoch": 2.4234647112740606,
      "grad_norm": 0.6250913266909794,
      "learning_rate": 5e-06,
      "loss": 0.63,
      "step": 660
    },
    {
      "epoch": 2.46012832263978,
      "grad_norm": 0.582068921531117,
      "learning_rate": 5e-06,
      "loss": 0.6288,
      "step": 670
    },
    {
      "epoch": 2.4967919340054996,
      "grad_norm": 0.6912367969819871,
      "learning_rate": 5e-06,
      "loss": 0.6381,
      "step": 680
    },
    {
      "epoch": 2.5334555453712193,
      "grad_norm": 0.7147652107920064,
      "learning_rate": 5e-06,
      "loss": 0.6332,
      "step": 690
    },
    {
      "epoch": 2.5701191567369386,
      "grad_norm": 0.5792260811836798,
      "learning_rate": 5e-06,
      "loss": 0.6351,
      "step": 700
    },
    {
      "epoch": 2.606782768102658,
      "grad_norm": 0.7963438662743851,
      "learning_rate": 5e-06,
      "loss": 0.6363,
      "step": 710
    },
    {
      "epoch": 2.6434463794683776,
      "grad_norm": 0.9276380358330181,
      "learning_rate": 5e-06,
      "loss": 0.6355,
      "step": 720
    },
    {
      "epoch": 2.6801099908340973,
      "grad_norm": 0.9313823270809661,
      "learning_rate": 5e-06,
      "loss": 0.6351,
      "step": 730
    },
    {
      "epoch": 2.7167736021998166,
      "grad_norm": 0.7304200587600748,
      "learning_rate": 5e-06,
      "loss": 0.638,
      "step": 740
    },
    {
      "epoch": 2.7534372135655363,
      "grad_norm": 0.6212966397528322,
      "learning_rate": 5e-06,
      "loss": 0.6388,
      "step": 750
    },
    {
      "epoch": 2.7901008249312556,
      "grad_norm": 0.6720686482466423,
      "learning_rate": 5e-06,
      "loss": 0.6364,
      "step": 760
    },
    {
      "epoch": 2.8267644362969753,
      "grad_norm": 0.6438467896193539,
      "learning_rate": 5e-06,
      "loss": 0.6421,
      "step": 770
    },
    {
      "epoch": 2.863428047662695,
      "grad_norm": 0.6043416931907646,
      "learning_rate": 5e-06,
      "loss": 0.6379,
      "step": 780
    },
    {
      "epoch": 2.9000916590284143,
      "grad_norm": 0.6496494693588303,
      "learning_rate": 5e-06,
      "loss": 0.6414,
      "step": 790
    },
    {
      "epoch": 2.936755270394134,
      "grad_norm": 0.8144443719589332,
      "learning_rate": 5e-06,
      "loss": 0.6361,
      "step": 800
    },
    {
      "epoch": 2.9734188817598532,
      "grad_norm": 0.7037764123768507,
      "learning_rate": 5e-06,
      "loss": 0.6373,
      "step": 810
    },
    {
      "epoch": 2.9954170485792853,
      "eval_loss": 0.7332214117050171,
      "eval_runtime": 94.4197,
      "eval_samples_per_second": 77.844,
      "eval_steps_per_second": 0.614,
      "step": 816
    },
    {
      "epoch": 2.9954170485792853,
      "step": 816,
      "total_flos": 1366411632967680.0,
      "train_loss": 0.7035682309491962,
      "train_runtime": 14220.3782,
      "train_samples_per_second": 29.46,
      "train_steps_per_second": 0.057
    }
  ],
  "logging_steps": 10,
  "max_steps": 816,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1366411632967680.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}