| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.9987463435018804, |
| "eval_steps": 500, |
| "global_step": 897, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.03343083994985374, |
| "grad_norm": 0.465455666565108, |
| "learning_rate": 5e-06, |
| "loss": 0.7858, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.06686167989970748, |
| "grad_norm": 0.3429798043291091, |
| "learning_rate": 5e-06, |
| "loss": 0.7098, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.10029251984956122, |
| "grad_norm": 0.250252156565685, |
| "learning_rate": 5e-06, |
| "loss": 0.6871, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.13372335979941496, |
| "grad_norm": 0.24107134988270987, |
| "learning_rate": 5e-06, |
| "loss": 0.6681, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.1671541997492687, |
| "grad_norm": 0.21806581310258916, |
| "learning_rate": 5e-06, |
| "loss": 0.6633, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.20058503969912245, |
| "grad_norm": 0.22386397438534855, |
| "learning_rate": 5e-06, |
| "loss": 0.656, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.2340158796489762, |
| "grad_norm": 0.2243183236649086, |
| "learning_rate": 5e-06, |
| "loss": 0.651, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.26744671959882993, |
| "grad_norm": 0.2326019524196413, |
| "learning_rate": 5e-06, |
| "loss": 0.6519, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.30087755954868367, |
| "grad_norm": 0.22646799402642662, |
| "learning_rate": 5e-06, |
| "loss": 0.6437, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.3343083994985374, |
| "grad_norm": 0.23209116325221912, |
| "learning_rate": 5e-06, |
| "loss": 0.6392, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.36773923944839115, |
| "grad_norm": 0.22907297757221068, |
| "learning_rate": 5e-06, |
| "loss": 0.6329, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.4011700793982449, |
| "grad_norm": 0.22166520300122122, |
| "learning_rate": 5e-06, |
| "loss": 0.632, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.43460091934809864, |
| "grad_norm": 0.24079199771015034, |
| "learning_rate": 5e-06, |
| "loss": 0.6194, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.4680317592979524, |
| "grad_norm": 0.23095015490130272, |
| "learning_rate": 5e-06, |
| "loss": 0.6172, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.5014625992478061, |
| "grad_norm": 0.22783375101823392, |
| "learning_rate": 5e-06, |
| "loss": 0.6214, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.5348934391976599, |
| "grad_norm": 0.25829648332235927, |
| "learning_rate": 5e-06, |
| "loss": 0.6194, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.5683242791475136, |
| "grad_norm": 0.23696584513088445, |
| "learning_rate": 5e-06, |
| "loss": 0.6159, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.6017551190973673, |
| "grad_norm": 0.24388808303408932, |
| "learning_rate": 5e-06, |
| "loss": 0.6177, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.6351859590472211, |
| "grad_norm": 0.2324052967281535, |
| "learning_rate": 5e-06, |
| "loss": 0.6084, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.6686167989970748, |
| "grad_norm": 0.23047425341157954, |
| "learning_rate": 5e-06, |
| "loss": 0.6074, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.7020476389469286, |
| "grad_norm": 0.23604251749457175, |
| "learning_rate": 5e-06, |
| "loss": 0.6044, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.7354784788967823, |
| "grad_norm": 0.23958101670923093, |
| "learning_rate": 5e-06, |
| "loss": 0.6061, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.768909318846636, |
| "grad_norm": 0.256749568385238, |
| "learning_rate": 5e-06, |
| "loss": 0.6052, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.8023401587964898, |
| "grad_norm": 0.2440390616960533, |
| "learning_rate": 5e-06, |
| "loss": 0.6003, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.8357709987463435, |
| "grad_norm": 0.24502595940912716, |
| "learning_rate": 5e-06, |
| "loss": 0.6122, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.8692018386961973, |
| "grad_norm": 0.2911977665906095, |
| "learning_rate": 5e-06, |
| "loss": 0.6069, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.902632678646051, |
| "grad_norm": 0.2535787166110667, |
| "learning_rate": 5e-06, |
| "loss": 0.6044, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.9360635185959048, |
| "grad_norm": 0.23815820801537682, |
| "learning_rate": 5e-06, |
| "loss": 0.6051, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.9694943585457585, |
| "grad_norm": 0.24190392037693448, |
| "learning_rate": 5e-06, |
| "loss": 0.6027, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.9995821145006268, |
| "eval_loss": 0.6069105863571167, |
| "eval_runtime": 297.1615, |
| "eval_samples_per_second": 27.117, |
| "eval_steps_per_second": 0.424, |
| "step": 299 |
| }, |
| { |
| "epoch": 1.0029251984956122, |
| "grad_norm": 0.7245726510213032, |
| "learning_rate": 5e-06, |
| "loss": 0.6553, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.036356038445466, |
| "grad_norm": 0.27066472051217805, |
| "learning_rate": 5e-06, |
| "loss": 0.5696, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.0697868783953197, |
| "grad_norm": 0.23624656831823893, |
| "learning_rate": 5e-06, |
| "loss": 0.5745, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.1032177183451735, |
| "grad_norm": 0.24234271753426892, |
| "learning_rate": 5e-06, |
| "loss": 0.5736, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.1366485582950272, |
| "grad_norm": 0.24649256824581395, |
| "learning_rate": 5e-06, |
| "loss": 0.5709, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.170079398244881, |
| "grad_norm": 0.25334180595905853, |
| "learning_rate": 5e-06, |
| "loss": 0.5644, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.2035102381947347, |
| "grad_norm": 0.26316949320762967, |
| "learning_rate": 5e-06, |
| "loss": 0.5663, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.2369410781445884, |
| "grad_norm": 0.2605173667942115, |
| "learning_rate": 5e-06, |
| "loss": 0.5722, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.2703719180944422, |
| "grad_norm": 0.2312966165635924, |
| "learning_rate": 5e-06, |
| "loss": 0.576, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.303802758044296, |
| "grad_norm": 0.23702825129111396, |
| "learning_rate": 5e-06, |
| "loss": 0.5687, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.3372335979941496, |
| "grad_norm": 0.2308522015632982, |
| "learning_rate": 5e-06, |
| "loss": 0.5641, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.3706644379440034, |
| "grad_norm": 0.24923916946383898, |
| "learning_rate": 5e-06, |
| "loss": 0.5645, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.4040952778938571, |
| "grad_norm": 0.2546839256609035, |
| "learning_rate": 5e-06, |
| "loss": 0.5688, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.4375261178437109, |
| "grad_norm": 0.2415609699087316, |
| "learning_rate": 5e-06, |
| "loss": 0.5646, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.4709569577935646, |
| "grad_norm": 0.22725529712872503, |
| "learning_rate": 5e-06, |
| "loss": 0.5678, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.5043877977434184, |
| "grad_norm": 0.26364584659408963, |
| "learning_rate": 5e-06, |
| "loss": 0.5715, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.537818637693272, |
| "grad_norm": 0.23719272373058128, |
| "learning_rate": 5e-06, |
| "loss": 0.5666, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.5712494776431258, |
| "grad_norm": 0.23886844481475153, |
| "learning_rate": 5e-06, |
| "loss": 0.5601, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.6046803175929796, |
| "grad_norm": 0.23337130900300732, |
| "learning_rate": 5e-06, |
| "loss": 0.5584, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.6381111575428333, |
| "grad_norm": 0.23593118182043984, |
| "learning_rate": 5e-06, |
| "loss": 0.5708, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.671541997492687, |
| "grad_norm": 0.247965831834247, |
| "learning_rate": 5e-06, |
| "loss": 0.5713, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.7049728374425408, |
| "grad_norm": 0.252355451082656, |
| "learning_rate": 5e-06, |
| "loss": 0.5628, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.7384036773923945, |
| "grad_norm": 0.24093218685685644, |
| "learning_rate": 5e-06, |
| "loss": 0.5655, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.7718345173422483, |
| "grad_norm": 0.2297986641883851, |
| "learning_rate": 5e-06, |
| "loss": 0.5599, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.805265357292102, |
| "grad_norm": 0.2355282077674634, |
| "learning_rate": 5e-06, |
| "loss": 0.5598, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.8386961972419558, |
| "grad_norm": 0.23565926815064536, |
| "learning_rate": 5e-06, |
| "loss": 0.5675, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.8721270371918095, |
| "grad_norm": 0.22987684078185938, |
| "learning_rate": 5e-06, |
| "loss": 0.5678, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.9055578771416632, |
| "grad_norm": 0.2391922454011988, |
| "learning_rate": 5e-06, |
| "loss": 0.5655, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.938988717091517, |
| "grad_norm": 0.2402971182270538, |
| "learning_rate": 5e-06, |
| "loss": 0.5588, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.9724195570413707, |
| "grad_norm": 0.2431684190506289, |
| "learning_rate": 5e-06, |
| "loss": 0.5543, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.9991642290012537, |
| "eval_loss": 0.5879825353622437, |
| "eval_runtime": 297.739, |
| "eval_samples_per_second": 27.064, |
| "eval_steps_per_second": 0.423, |
| "step": 598 |
| }, |
| { |
| "epoch": 2.0058503969912245, |
| "grad_norm": 0.30898456346377806, |
| "learning_rate": 5e-06, |
| "loss": 0.6129, |
| "step": 600 |
| }, |
| { |
| "epoch": 2.039281236941078, |
| "grad_norm": 0.3080648052374505, |
| "learning_rate": 5e-06, |
| "loss": 0.5387, |
| "step": 610 |
| }, |
| { |
| "epoch": 2.072712076890932, |
| "grad_norm": 0.2598126484485968, |
| "learning_rate": 5e-06, |
| "loss": 0.5284, |
| "step": 620 |
| }, |
| { |
| "epoch": 2.1061429168407857, |
| "grad_norm": 0.2409928257299915, |
| "learning_rate": 5e-06, |
| "loss": 0.5252, |
| "step": 630 |
| }, |
| { |
| "epoch": 2.1395737567906394, |
| "grad_norm": 0.2401108659858156, |
| "learning_rate": 5e-06, |
| "loss": 0.5296, |
| "step": 640 |
| }, |
| { |
| "epoch": 2.173004596740493, |
| "grad_norm": 0.23872635023154934, |
| "learning_rate": 5e-06, |
| "loss": 0.5325, |
| "step": 650 |
| }, |
| { |
| "epoch": 2.206435436690347, |
| "grad_norm": 0.24878774215286806, |
| "learning_rate": 5e-06, |
| "loss": 0.5253, |
| "step": 660 |
| }, |
| { |
| "epoch": 2.2398662766402007, |
| "grad_norm": 0.25564499539942276, |
| "learning_rate": 5e-06, |
| "loss": 0.5304, |
| "step": 670 |
| }, |
| { |
| "epoch": 2.2732971165900544, |
| "grad_norm": 0.24837362080494388, |
| "learning_rate": 5e-06, |
| "loss": 0.5253, |
| "step": 680 |
| }, |
| { |
| "epoch": 2.306727956539908, |
| "grad_norm": 0.25514616974853815, |
| "learning_rate": 5e-06, |
| "loss": 0.5293, |
| "step": 690 |
| }, |
| { |
| "epoch": 2.340158796489762, |
| "grad_norm": 0.24759745020055693, |
| "learning_rate": 5e-06, |
| "loss": 0.5423, |
| "step": 700 |
| }, |
| { |
| "epoch": 2.3735896364396156, |
| "grad_norm": 0.2621553483703068, |
| "learning_rate": 5e-06, |
| "loss": 0.5281, |
| "step": 710 |
| }, |
| { |
| "epoch": 2.4070204763894694, |
| "grad_norm": 0.25103305898665595, |
| "learning_rate": 5e-06, |
| "loss": 0.5262, |
| "step": 720 |
| }, |
| { |
| "epoch": 2.440451316339323, |
| "grad_norm": 0.24616309812116494, |
| "learning_rate": 5e-06, |
| "loss": 0.5332, |
| "step": 730 |
| }, |
| { |
| "epoch": 2.473882156289177, |
| "grad_norm": 0.26292695360317175, |
| "learning_rate": 5e-06, |
| "loss": 0.5286, |
| "step": 740 |
| }, |
| { |
| "epoch": 2.5073129962390306, |
| "grad_norm": 0.2556297707505491, |
| "learning_rate": 5e-06, |
| "loss": 0.5363, |
| "step": 750 |
| }, |
| { |
| "epoch": 2.5407438361888843, |
| "grad_norm": 0.2706578070225452, |
| "learning_rate": 5e-06, |
| "loss": 0.5382, |
| "step": 760 |
| }, |
| { |
| "epoch": 2.574174676138738, |
| "grad_norm": 0.27845630575003893, |
| "learning_rate": 5e-06, |
| "loss": 0.5289, |
| "step": 770 |
| }, |
| { |
| "epoch": 2.607605516088592, |
| "grad_norm": 0.2669132752987619, |
| "learning_rate": 5e-06, |
| "loss": 0.5233, |
| "step": 780 |
| }, |
| { |
| "epoch": 2.6410363560384456, |
| "grad_norm": 0.2586067280825215, |
| "learning_rate": 5e-06, |
| "loss": 0.5271, |
| "step": 790 |
| }, |
| { |
| "epoch": 2.6744671959882993, |
| "grad_norm": 0.2559890631197339, |
| "learning_rate": 5e-06, |
| "loss": 0.5361, |
| "step": 800 |
| }, |
| { |
| "epoch": 2.707898035938153, |
| "grad_norm": 0.24748957802636362, |
| "learning_rate": 5e-06, |
| "loss": 0.5317, |
| "step": 810 |
| }, |
| { |
| "epoch": 2.7413288758880068, |
| "grad_norm": 0.24421486038345067, |
| "learning_rate": 5e-06, |
| "loss": 0.536, |
| "step": 820 |
| }, |
| { |
| "epoch": 2.7747597158378605, |
| "grad_norm": 0.27039350090679615, |
| "learning_rate": 5e-06, |
| "loss": 0.5294, |
| "step": 830 |
| }, |
| { |
| "epoch": 2.8081905557877143, |
| "grad_norm": 0.25751063997110557, |
| "learning_rate": 5e-06, |
| "loss": 0.5326, |
| "step": 840 |
| }, |
| { |
| "epoch": 2.841621395737568, |
| "grad_norm": 0.24765515477831007, |
| "learning_rate": 5e-06, |
| "loss": 0.5226, |
| "step": 850 |
| }, |
| { |
| "epoch": 2.8750522356874217, |
| "grad_norm": 0.24426800544335395, |
| "learning_rate": 5e-06, |
| "loss": 0.5254, |
| "step": 860 |
| }, |
| { |
| "epoch": 2.9084830756372755, |
| "grad_norm": 0.24614474825906446, |
| "learning_rate": 5e-06, |
| "loss": 0.5322, |
| "step": 870 |
| }, |
| { |
| "epoch": 2.9419139155871292, |
| "grad_norm": 0.24941400328376895, |
| "learning_rate": 5e-06, |
| "loss": 0.524, |
| "step": 880 |
| }, |
| { |
| "epoch": 2.975344755536983, |
| "grad_norm": 0.2815487127839382, |
| "learning_rate": 5e-06, |
| "loss": 0.5279, |
| "step": 890 |
| }, |
| { |
| "epoch": 2.9987463435018804, |
| "eval_loss": 0.5801695585250854, |
| "eval_runtime": 299.2301, |
| "eval_samples_per_second": 26.929, |
| "eval_steps_per_second": 0.421, |
| "step": 897 |
| }, |
| { |
| "epoch": 2.9987463435018804, |
| "step": 897, |
| "total_flos": 1502299029504000.0, |
| "train_loss": 0.5779525436286543, |
| "train_runtime": 48927.2984, |
| "train_samples_per_second": 9.387, |
| "train_steps_per_second": 0.018 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 897, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1502299029504000.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |