{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 2000,
  "global_step": 873,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0,
      "eval_loss": 7.051288604736328,
      "eval_runtime": 17.1259,
      "eval_samples_per_second": 0.058,
      "eval_steps_per_second": 0.058,
      "step": 0
    },
    {
      "epoch": 0.03436426116838488,
      "grad_norm": 32.725921630859375,
      "learning_rate": 3.5000000000000004e-06,
      "loss": 7.176,
      "step": 10
    },
    {
      "epoch": 0.06872852233676977,
      "grad_norm": 10.35120677947998,
      "learning_rate": 8.500000000000002e-06,
      "loss": 6.1643,
      "step": 20
    },
    {
      "epoch": 0.10309278350515463,
      "grad_norm": 10.239090919494629,
      "learning_rate": 1.3500000000000001e-05,
      "loss": 5.8296,
      "step": 30
    },
    {
      "epoch": 0.13745704467353953,
      "grad_norm": 10.433135032653809,
      "learning_rate": 1.85e-05,
      "loss": 5.737,
      "step": 40
    },
    {
      "epoch": 0.1718213058419244,
      "grad_norm": 9.619898796081543,
      "learning_rate": 2.35e-05,
      "loss": 5.7242,
      "step": 50
    },
    {
      "epoch": 0.20618556701030927,
      "grad_norm": 9.597282409667969,
      "learning_rate": 2.8499999999999998e-05,
      "loss": 5.6166,
      "step": 60
    },
    {
      "epoch": 0.24054982817869416,
      "grad_norm": 9.673783302307129,
      "learning_rate": 3.35e-05,
      "loss": 5.5887,
      "step": 70
    },
    {
      "epoch": 0.27491408934707906,
      "grad_norm": 8.612630844116211,
      "learning_rate": 3.85e-05,
      "loss": 5.6619,
      "step": 80
    },
    {
      "epoch": 0.30927835051546393,
      "grad_norm": 8.954395294189453,
      "learning_rate": 4.35e-05,
      "loss": 5.6506,
      "step": 90
    },
    {
      "epoch": 0.3436426116838488,
      "grad_norm": 8.672375679016113,
      "learning_rate": 4.85e-05,
      "loss": 5.5449,
      "step": 100
    },
    {
      "epoch": 0.37800687285223367,
      "grad_norm": 9.550030708312988,
      "learning_rate": 4.954721862871928e-05,
      "loss": 5.6273,
      "step": 110
    },
    {
      "epoch": 0.41237113402061853,
      "grad_norm": 8.500064849853516,
      "learning_rate": 4.890038809831824e-05,
      "loss": 5.5225,
      "step": 120
    },
    {
      "epoch": 0.44673539518900346,
      "grad_norm": 8.339930534362793,
      "learning_rate": 4.8253557567917204e-05,
      "loss": 5.5552,
      "step": 130
    },
    {
      "epoch": 0.48109965635738833,
      "grad_norm": 8.283628463745117,
      "learning_rate": 4.760672703751617e-05,
      "loss": 5.3781,
      "step": 140
    },
    {
      "epoch": 0.5154639175257731,
      "grad_norm": 9.112236976623535,
      "learning_rate": 4.6959896507115136e-05,
      "loss": 5.4563,
      "step": 150
    },
    {
      "epoch": 0.5498281786941581,
      "grad_norm": 8.985079765319824,
      "learning_rate": 4.6313065976714105e-05,
      "loss": 5.5249,
      "step": 160
    },
    {
      "epoch": 0.584192439862543,
      "grad_norm": 7.019378185272217,
      "learning_rate": 4.566623544631307e-05,
      "loss": 5.4304,
      "step": 170
    },
    {
      "epoch": 0.6185567010309279,
      "grad_norm": 7.323549270629883,
      "learning_rate": 4.5019404915912036e-05,
      "loss": 5.4806,
      "step": 180
    },
    {
      "epoch": 0.6529209621993127,
      "grad_norm": 7.620777130126953,
      "learning_rate": 4.4372574385511e-05,
      "loss": 5.3499,
      "step": 190
    },
    {
      "epoch": 0.6872852233676976,
      "grad_norm": 7.8983659744262695,
      "learning_rate": 4.372574385510996e-05,
      "loss": 5.3936,
      "step": 200
    },
    {
      "epoch": 0.7216494845360825,
      "grad_norm": 6.915121555328369,
      "learning_rate": 4.307891332470893e-05,
      "loss": 5.3653,
      "step": 210
    },
    {
      "epoch": 0.7560137457044673,
      "grad_norm": 7.591084957122803,
      "learning_rate": 4.243208279430789e-05,
      "loss": 5.4473,
      "step": 220
    },
    {
      "epoch": 0.7903780068728522,
      "grad_norm": 7.623355865478516,
      "learning_rate": 4.178525226390686e-05,
      "loss": 5.4019,
      "step": 230
    },
    {
      "epoch": 0.8247422680412371,
      "grad_norm": 7.67550802230835,
      "learning_rate": 4.113842173350582e-05,
      "loss": 5.3478,
      "step": 240
    },
    {
      "epoch": 0.8591065292096219,
      "grad_norm": 7.541503429412842,
      "learning_rate": 4.049159120310479e-05,
      "loss": 5.3985,
      "step": 250
    },
    {
      "epoch": 0.8934707903780069,
      "grad_norm": 7.1159749031066895,
      "learning_rate": 3.9844760672703754e-05,
      "loss": 5.3093,
      "step": 260
    },
    {
      "epoch": 0.9278350515463918,
      "grad_norm": 7.516855716705322,
      "learning_rate": 3.9197930142302716e-05,
      "loss": 5.3952,
      "step": 270
    },
    {
      "epoch": 0.9621993127147767,
      "grad_norm": 7.933532238006592,
      "learning_rate": 3.855109961190168e-05,
      "loss": 5.2822,
      "step": 280
    },
    {
      "epoch": 0.9965635738831615,
      "grad_norm": 7.439053535461426,
      "learning_rate": 3.790426908150065e-05,
      "loss": 5.3152,
      "step": 290
    },
    {
      "epoch": 1.0309278350515463,
      "grad_norm": 9.026961326599121,
      "learning_rate": 3.7257438551099616e-05,
      "loss": 4.8395,
      "step": 300
    },
    {
      "epoch": 1.0652920962199313,
      "grad_norm": 8.444412231445312,
      "learning_rate": 3.661060802069858e-05,
      "loss": 4.7306,
      "step": 310
    },
    {
      "epoch": 1.0996563573883162,
      "grad_norm": 8.353560447692871,
      "learning_rate": 3.596377749029755e-05,
      "loss": 4.7795,
      "step": 320
    },
    {
      "epoch": 1.134020618556701,
      "grad_norm": 8.427460670471191,
      "learning_rate": 3.531694695989651e-05,
      "loss": 4.736,
      "step": 330
    },
    {
      "epoch": 1.168384879725086,
      "grad_norm": 8.1294584274292,
      "learning_rate": 3.467011642949548e-05,
      "loss": 4.8136,
      "step": 340
    },
    {
      "epoch": 1.2027491408934707,
      "grad_norm": 8.170726776123047,
      "learning_rate": 3.4023285899094434e-05,
      "loss": 4.7125,
      "step": 350
    },
    {
      "epoch": 1.2371134020618557,
      "grad_norm": 8.133183479309082,
      "learning_rate": 3.33764553686934e-05,
      "loss": 4.8097,
      "step": 360
    },
    {
      "epoch": 1.2714776632302405,
      "grad_norm": 8.4751615524292,
      "learning_rate": 3.2729624838292365e-05,
      "loss": 4.76,
      "step": 370
    },
    {
      "epoch": 1.3058419243986255,
      "grad_norm": 8.671772003173828,
      "learning_rate": 3.2082794307891334e-05,
      "loss": 4.6785,
      "step": 380
    },
    {
      "epoch": 1.3402061855670104,
      "grad_norm": 9.14929485321045,
      "learning_rate": 3.14359637774903e-05,
      "loss": 4.6647,
      "step": 390
    },
    {
      "epoch": 1.3745704467353952,
      "grad_norm": 7.652307033538818,
      "learning_rate": 3.0789133247089265e-05,
      "loss": 4.7723,
      "step": 400
    },
    {
      "epoch": 1.40893470790378,
      "grad_norm": 8.373456001281738,
      "learning_rate": 3.014230271668823e-05,
      "loss": 4.6346,
      "step": 410
    },
    {
      "epoch": 1.443298969072165,
      "grad_norm": 8.60721492767334,
      "learning_rate": 2.9495472186287193e-05,
      "loss": 4.7384,
      "step": 420
    },
    {
      "epoch": 1.47766323024055,
      "grad_norm": 8.466670036315918,
      "learning_rate": 2.8848641655886162e-05,
      "loss": 4.8056,
      "step": 430
    },
    {
      "epoch": 1.5120274914089347,
      "grad_norm": 8.389398574829102,
      "learning_rate": 2.8201811125485124e-05,
      "loss": 4.691,
      "step": 440
    },
    {
      "epoch": 1.5463917525773194,
      "grad_norm": 8.483616828918457,
      "learning_rate": 2.755498059508409e-05,
      "loss": 4.7388,
      "step": 450
    },
    {
      "epoch": 1.5807560137457046,
      "grad_norm": 8.77346134185791,
      "learning_rate": 2.6908150064683052e-05,
      "loss": 4.7401,
      "step": 460
    },
    {
      "epoch": 1.6151202749140894,
      "grad_norm": 9.041746139526367,
      "learning_rate": 2.626131953428202e-05,
      "loss": 4.7421,
      "step": 470
    },
    {
      "epoch": 1.6494845360824741,
      "grad_norm": 8.592238426208496,
      "learning_rate": 2.5614489003880986e-05,
      "loss": 4.7643,
      "step": 480
    },
    {
      "epoch": 1.6838487972508591,
      "grad_norm": 7.6761956214904785,
      "learning_rate": 2.496765847347995e-05,
      "loss": 4.7175,
      "step": 490
    },
    {
      "epoch": 1.718213058419244,
      "grad_norm": 9.062220573425293,
      "learning_rate": 2.4320827943078914e-05,
      "loss": 4.7023,
      "step": 500
    },
    {
      "epoch": 1.7525773195876289,
      "grad_norm": 8.060118675231934,
      "learning_rate": 2.367399741267788e-05,
      "loss": 4.7484,
      "step": 510
    },
    {
      "epoch": 1.7869415807560136,
      "grad_norm": 8.655328750610352,
      "learning_rate": 2.3027166882276842e-05,
      "loss": 4.7492,
      "step": 520
    },
    {
      "epoch": 1.8213058419243986,
      "grad_norm": 7.456566333770752,
      "learning_rate": 2.238033635187581e-05,
      "loss": 4.7337,
      "step": 530
    },
    {
      "epoch": 1.8556701030927836,
      "grad_norm": 8.35741138458252,
      "learning_rate": 2.1733505821474777e-05,
      "loss": 4.8222,
      "step": 540
    },
    {
      "epoch": 1.8900343642611683,
      "grad_norm": 8.883995056152344,
      "learning_rate": 2.108667529107374e-05,
      "loss": 4.7534,
      "step": 550
    },
    {
      "epoch": 1.9243986254295533,
      "grad_norm": 8.559647560119629,
      "learning_rate": 2.0439844760672704e-05,
      "loss": 4.6819,
      "step": 560
    },
    {
      "epoch": 1.9587628865979383,
      "grad_norm": 8.394923210144043,
      "learning_rate": 1.979301423027167e-05,
      "loss": 4.6777,
      "step": 570
    },
    {
      "epoch": 1.993127147766323,
      "grad_norm": 9.052810668945312,
      "learning_rate": 1.9146183699870636e-05,
      "loss": 4.7,
      "step": 580
    },
    {
      "epoch": 2.027491408934708,
      "grad_norm": 9.817968368530273,
      "learning_rate": 1.8499353169469598e-05,
      "loss": 4.0928,
      "step": 590
    },
    {
      "epoch": 2.0618556701030926,
      "grad_norm": 11.832119941711426,
      "learning_rate": 1.7852522639068563e-05,
      "loss": 3.8996,
      "step": 600
    },
    {
      "epoch": 2.0962199312714778,
      "grad_norm": 11.534992218017578,
      "learning_rate": 1.720569210866753e-05,
      "loss": 3.8028,
      "step": 610
    },
    {
      "epoch": 2.1305841924398625,
      "grad_norm": 11.591385841369629,
      "learning_rate": 1.6558861578266498e-05,
      "loss": 3.7741,
      "step": 620
    },
    {
      "epoch": 2.1649484536082473,
      "grad_norm": 11.821063995361328,
      "learning_rate": 1.591203104786546e-05,
      "loss": 3.756,
      "step": 630
    },
    {
      "epoch": 2.1993127147766325,
      "grad_norm": 12.23161792755127,
      "learning_rate": 1.5265200517464426e-05,
      "loss": 3.7957,
      "step": 640
    },
    {
      "epoch": 2.2336769759450172,
      "grad_norm": 12.386809349060059,
      "learning_rate": 1.4618369987063391e-05,
      "loss": 3.7536,
      "step": 650
    },
    {
      "epoch": 2.268041237113402,
      "grad_norm": 12.17062759399414,
      "learning_rate": 1.3971539456662355e-05,
      "loss": 3.7806,
      "step": 660
    },
    {
      "epoch": 2.3024054982817868,
      "grad_norm": 15.317811012268066,
      "learning_rate": 1.332470892626132e-05,
      "loss": 3.794,
      "step": 670
    },
    {
      "epoch": 2.336769759450172,
      "grad_norm": 11.287023544311523,
      "learning_rate": 1.2677878395860285e-05,
      "loss": 3.7329,
      "step": 680
    },
    {
      "epoch": 2.3711340206185567,
      "grad_norm": 11.834717750549316,
      "learning_rate": 1.203104786545925e-05,
      "loss": 3.8154,
      "step": 690
    },
    {
      "epoch": 2.4054982817869415,
      "grad_norm": 12.480812072753906,
      "learning_rate": 1.1384217335058216e-05,
      "loss": 3.7529,
      "step": 700
    },
    {
      "epoch": 2.4398625429553267,
      "grad_norm": 11.966567039489746,
      "learning_rate": 1.073738680465718e-05,
      "loss": 3.8161,
      "step": 710
    },
    {
      "epoch": 2.4742268041237114,
      "grad_norm": 11.70124340057373,
      "learning_rate": 1.0090556274256145e-05,
      "loss": 3.8291,
      "step": 720
    },
    {
      "epoch": 2.508591065292096,
      "grad_norm": 10.707592010498047,
      "learning_rate": 9.44372574385511e-06,
      "loss": 3.8132,
      "step": 730
    },
    {
      "epoch": 2.542955326460481,
      "grad_norm": 11.80911922454834,
      "learning_rate": 8.796895213454076e-06,
      "loss": 3.8293,
      "step": 740
    },
    {
      "epoch": 2.5773195876288657,
      "grad_norm": 11.74314022064209,
      "learning_rate": 8.15006468305304e-06,
      "loss": 3.7905,
      "step": 750
    },
    {
      "epoch": 2.611683848797251,
      "grad_norm": 12.890970230102539,
      "learning_rate": 7.503234152652006e-06,
      "loss": 3.7617,
      "step": 760
    },
    {
      "epoch": 2.6460481099656357,
      "grad_norm": 12.482461929321289,
      "learning_rate": 6.856403622250971e-06,
      "loss": 3.7767,
      "step": 770
    },
    {
      "epoch": 2.680412371134021,
      "grad_norm": 12.128081321716309,
      "learning_rate": 6.2095730918499354e-06,
      "loss": 3.8105,
      "step": 780
    },
    {
      "epoch": 2.7147766323024056,
      "grad_norm": 11.817726135253906,
      "learning_rate": 5.5627425614489e-06,
      "loss": 3.7593,
      "step": 790
    },
    {
      "epoch": 2.7491408934707904,
      "grad_norm": 13.080018043518066,
      "learning_rate": 4.915912031047866e-06,
      "loss": 3.8518,
      "step": 800
    },
    {
      "epoch": 2.783505154639175,
      "grad_norm": 11.133337020874023,
      "learning_rate": 4.2690815006468305e-06,
      "loss": 3.7509,
      "step": 810
    },
    {
      "epoch": 2.81786941580756,
      "grad_norm": 11.332104682922363,
      "learning_rate": 3.6222509702457957e-06,
      "loss": 3.76,
      "step": 820
    },
    {
      "epoch": 2.852233676975945,
      "grad_norm": 12.596721649169922,
      "learning_rate": 2.975420439844761e-06,
      "loss": 3.7612,
      "step": 830
    },
    {
      "epoch": 2.88659793814433,
      "grad_norm": 11.145185470581055,
      "learning_rate": 2.328589909443726e-06,
      "loss": 3.6734,
      "step": 840
    },
    {
      "epoch": 2.9209621993127146,
      "grad_norm": 13.382460594177246,
      "learning_rate": 1.6817593790426907e-06,
      "loss": 3.7865,
      "step": 850
    },
    {
      "epoch": 2.9553264604811,
      "grad_norm": 11.197080612182617,
      "learning_rate": 1.0349288486416561e-06,
      "loss": 3.7716,
      "step": 860
    },
    {
      "epoch": 2.9896907216494846,
      "grad_norm": 12.549001693725586,
      "learning_rate": 3.8809831824062096e-07,
      "loss": 3.7189,
      "step": 870
    },
    {
      "epoch": 3.0,
      "step": 873,
      "total_flos": 0.0,
      "train_loss": 4.699677870445645,
      "train_runtime": 1249.7659,
      "train_samples_per_second": 5.581,
      "train_steps_per_second": 0.699
    }
  ],
  "logging_steps": 10,
  "max_steps": 873,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 4000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}
|
|