{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.999346832135859,
  "eval_steps": 500,
  "global_step": 1722,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.017417809710428913,
      "grad_norm": 10.378825586435497,
      "learning_rate": 5e-06,
      "loss": 1.0838,
      "step": 10
    },
    {
      "epoch": 0.034835619420857826,
      "grad_norm": 4.271546912407876,
      "learning_rate": 5e-06,
      "loss": 0.9373,
      "step": 20
    },
    {
      "epoch": 0.05225342913128674,
      "grad_norm": 2.1390120781164823,
      "learning_rate": 5e-06,
      "loss": 0.8909,
      "step": 30
    },
    {
      "epoch": 0.06967123884171565,
      "grad_norm": 1.4309168874054254,
      "learning_rate": 5e-06,
      "loss": 0.8639,
      "step": 40
    },
    {
      "epoch": 0.08708904855214457,
      "grad_norm": 1.5406276876095664,
      "learning_rate": 5e-06,
      "loss": 0.8433,
      "step": 50
    },
    {
      "epoch": 0.10450685826257348,
      "grad_norm": 1.2501465404617618,
      "learning_rate": 5e-06,
      "loss": 0.8247,
      "step": 60
    },
    {
      "epoch": 0.1219246679730024,
      "grad_norm": 0.9239834135671998,
      "learning_rate": 5e-06,
      "loss": 0.8126,
      "step": 70
    },
    {
      "epoch": 0.1393424776834313,
      "grad_norm": 0.8433941860985329,
      "learning_rate": 5e-06,
      "loss": 0.8046,
      "step": 80
    },
    {
      "epoch": 0.15676028739386022,
      "grad_norm": 1.097647182335265,
      "learning_rate": 5e-06,
      "loss": 0.796,
      "step": 90
    },
    {
      "epoch": 0.17417809710428914,
      "grad_norm": 1.0683416519694173,
      "learning_rate": 5e-06,
      "loss": 0.7903,
      "step": 100
    },
    {
      "epoch": 0.19159590681471805,
      "grad_norm": 1.572717136046009,
      "learning_rate": 5e-06,
      "loss": 0.7846,
      "step": 110
    },
    {
      "epoch": 0.20901371652514697,
      "grad_norm": 0.9249575894994505,
      "learning_rate": 5e-06,
      "loss": 0.7806,
      "step": 120
    },
    {
      "epoch": 0.2264315262355759,
      "grad_norm": 0.8193084838390464,
      "learning_rate": 5e-06,
      "loss": 0.7783,
      "step": 130
    },
    {
      "epoch": 0.2438493359460048,
      "grad_norm": 0.8732647330500742,
      "learning_rate": 5e-06,
      "loss": 0.7725,
      "step": 140
    },
    {
      "epoch": 0.2612671456564337,
      "grad_norm": 0.7448079044745952,
      "learning_rate": 5e-06,
      "loss": 0.773,
      "step": 150
    },
    {
      "epoch": 0.2786849553668626,
      "grad_norm": 0.9898862055097742,
      "learning_rate": 5e-06,
      "loss": 0.7695,
      "step": 160
    },
    {
      "epoch": 0.29610276507729155,
      "grad_norm": 0.7272776397879028,
      "learning_rate": 5e-06,
      "loss": 0.7705,
      "step": 170
    },
    {
      "epoch": 0.31352057478772044,
      "grad_norm": 1.0826222911505594,
      "learning_rate": 5e-06,
      "loss": 0.7737,
      "step": 180
    },
    {
      "epoch": 0.3309383844981494,
      "grad_norm": 0.6748830770235253,
      "learning_rate": 5e-06,
      "loss": 0.7611,
      "step": 190
    },
    {
      "epoch": 0.3483561942085783,
      "grad_norm": 0.7359625796839507,
      "learning_rate": 5e-06,
      "loss": 0.7609,
      "step": 200
    },
    {
      "epoch": 0.36577400391900716,
      "grad_norm": 0.6962279114859433,
      "learning_rate": 5e-06,
      "loss": 0.756,
      "step": 210
    },
    {
      "epoch": 0.3831918136294361,
      "grad_norm": 0.6255149151141138,
      "learning_rate": 5e-06,
      "loss": 0.7617,
      "step": 220
    },
    {
      "epoch": 0.400609623339865,
      "grad_norm": 0.7052369366480614,
      "learning_rate": 5e-06,
      "loss": 0.7569,
      "step": 230
    },
    {
      "epoch": 0.41802743305029394,
      "grad_norm": 0.6097155207889393,
      "learning_rate": 5e-06,
      "loss": 0.7546,
      "step": 240
    },
    {
      "epoch": 0.43544524276072283,
      "grad_norm": 0.6836967619335058,
      "learning_rate": 5e-06,
      "loss": 0.7613,
      "step": 250
    },
    {
      "epoch": 0.4528630524711518,
      "grad_norm": 0.5747669710959988,
      "learning_rate": 5e-06,
      "loss": 0.7568,
      "step": 260
    },
    {
      "epoch": 0.47028086218158066,
      "grad_norm": 0.8293746415280457,
      "learning_rate": 5e-06,
      "loss": 0.755,
      "step": 270
    },
    {
      "epoch": 0.4876986718920096,
      "grad_norm": 0.6240762658588678,
      "learning_rate": 5e-06,
      "loss": 0.7512,
      "step": 280
    },
    {
      "epoch": 0.5051164816024385,
      "grad_norm": 0.604817000368227,
      "learning_rate": 5e-06,
      "loss": 0.7565,
      "step": 290
    },
    {
      "epoch": 0.5225342913128674,
      "grad_norm": 0.7975782732234052,
      "learning_rate": 5e-06,
      "loss": 0.7536,
      "step": 300
    },
    {
      "epoch": 0.5399521010232963,
      "grad_norm": 0.9097401207787359,
      "learning_rate": 5e-06,
      "loss": 0.7449,
      "step": 310
    },
    {
      "epoch": 0.5573699107337252,
      "grad_norm": 0.5850865591811083,
      "learning_rate": 5e-06,
      "loss": 0.7476,
      "step": 320
    },
    {
      "epoch": 0.5747877204441542,
      "grad_norm": 0.7135182489392279,
      "learning_rate": 5e-06,
      "loss": 0.7446,
      "step": 330
    },
    {
      "epoch": 0.5922055301545831,
      "grad_norm": 0.7289623389467091,
      "learning_rate": 5e-06,
      "loss": 0.7461,
      "step": 340
    },
    {
      "epoch": 0.6096233398650119,
      "grad_norm": 0.7209085909740518,
      "learning_rate": 5e-06,
      "loss": 0.7441,
      "step": 350
    },
    {
      "epoch": 0.6270411495754409,
      "grad_norm": 0.7124009797840823,
      "learning_rate": 5e-06,
      "loss": 0.7426,
      "step": 360
    },
    {
      "epoch": 0.6444589592858698,
      "grad_norm": 0.7812743047985142,
      "learning_rate": 5e-06,
      "loss": 0.7425,
      "step": 370
    },
    {
      "epoch": 0.6618767689962988,
      "grad_norm": 0.9348863468860681,
      "learning_rate": 5e-06,
      "loss": 0.7426,
      "step": 380
    },
    {
      "epoch": 0.6792945787067276,
      "grad_norm": 0.8887235387086234,
      "learning_rate": 5e-06,
      "loss": 0.7443,
      "step": 390
    },
    {
      "epoch": 0.6967123884171565,
      "grad_norm": 0.5644550564287659,
      "learning_rate": 5e-06,
      "loss": 0.7462,
      "step": 400
    },
    {
      "epoch": 0.7141301981275855,
      "grad_norm": 0.7133759479394518,
      "learning_rate": 5e-06,
      "loss": 0.7392,
      "step": 410
    },
    {
      "epoch": 0.7315480078380143,
      "grad_norm": 0.6322425713640688,
      "learning_rate": 5e-06,
      "loss": 0.7438,
      "step": 420
    },
    {
      "epoch": 0.7489658175484433,
      "grad_norm": 0.653356418355363,
      "learning_rate": 5e-06,
      "loss": 0.7353,
      "step": 430
    },
    {
      "epoch": 0.7663836272588722,
      "grad_norm": 0.7172550980138845,
      "learning_rate": 5e-06,
      "loss": 0.7362,
      "step": 440
    },
    {
      "epoch": 0.7838014369693012,
      "grad_norm": 0.5961758862117608,
      "learning_rate": 5e-06,
      "loss": 0.7374,
      "step": 450
    },
    {
      "epoch": 0.80121924667973,
      "grad_norm": 0.682099366244341,
      "learning_rate": 5e-06,
      "loss": 0.7361,
      "step": 460
    },
    {
      "epoch": 0.8186370563901589,
      "grad_norm": 0.6035807917106119,
      "learning_rate": 5e-06,
      "loss": 0.7394,
      "step": 470
    },
    {
      "epoch": 0.8360548661005879,
      "grad_norm": 0.6420432430762913,
      "learning_rate": 5e-06,
      "loss": 0.7427,
      "step": 480
    },
    {
      "epoch": 0.8534726758110167,
      "grad_norm": 0.6771119733640588,
      "learning_rate": 5e-06,
      "loss": 0.7423,
      "step": 490
    },
    {
      "epoch": 0.8708904855214457,
      "grad_norm": 0.7126075005956054,
      "learning_rate": 5e-06,
      "loss": 0.7383,
      "step": 500
    },
    {
      "epoch": 0.8883082952318746,
      "grad_norm": 0.7584360793951261,
      "learning_rate": 5e-06,
      "loss": 0.7321,
      "step": 510
    },
    {
      "epoch": 0.9057261049423035,
      "grad_norm": 0.7439364499749611,
      "learning_rate": 5e-06,
      "loss": 0.7371,
      "step": 520
    },
    {
      "epoch": 0.9231439146527324,
      "grad_norm": 0.6529063351558732,
      "learning_rate": 5e-06,
      "loss": 0.74,
      "step": 530
    },
    {
      "epoch": 0.9405617243631613,
      "grad_norm": 0.6579238997317206,
      "learning_rate": 5e-06,
      "loss": 0.7339,
      "step": 540
    },
    {
      "epoch": 0.9579795340735903,
      "grad_norm": 0.6529747723949795,
      "learning_rate": 5e-06,
      "loss": 0.7366,
      "step": 550
    },
    {
      "epoch": 0.9753973437840192,
      "grad_norm": 0.7465822819939318,
      "learning_rate": 5e-06,
      "loss": 0.7329,
      "step": 560
    },
    {
      "epoch": 0.992815153494448,
      "grad_norm": 0.6744574449718616,
      "learning_rate": 5e-06,
      "loss": 0.7336,
      "step": 570
    },
    {
      "epoch": 0.9997822773786197,
      "eval_loss": 0.7317857146263123,
      "eval_runtime": 402.0076,
      "eval_samples_per_second": 38.482,
      "eval_steps_per_second": 0.602,
      "step": 574
    },
    {
      "epoch": 1.010232963204877,
      "grad_norm": 0.7094322700801018,
      "learning_rate": 5e-06,
      "loss": 0.763,
      "step": 580
    },
    {
      "epoch": 1.027650772915306,
      "grad_norm": 0.619008567608486,
      "learning_rate": 5e-06,
      "loss": 0.6821,
      "step": 590
    },
    {
      "epoch": 1.0450685826257349,
      "grad_norm": 0.7720080238431899,
      "learning_rate": 5e-06,
      "loss": 0.6814,
      "step": 600
    },
    {
      "epoch": 1.0624863923361638,
      "grad_norm": 0.613074115463787,
      "learning_rate": 5e-06,
      "loss": 0.6871,
      "step": 610
    },
    {
      "epoch": 1.0799042020465925,
      "grad_norm": 0.8529977128112323,
      "learning_rate": 5e-06,
      "loss": 0.6916,
      "step": 620
    },
    {
      "epoch": 1.0973220117570215,
      "grad_norm": 0.6185916445154745,
      "learning_rate": 5e-06,
      "loss": 0.6852,
      "step": 630
    },
    {
      "epoch": 1.1147398214674504,
      "grad_norm": 0.5960552407712587,
      "learning_rate": 5e-06,
      "loss": 0.6902,
      "step": 640
    },
    {
      "epoch": 1.1321576311778794,
      "grad_norm": 0.6992739712457544,
      "learning_rate": 5e-06,
      "loss": 0.6838,
      "step": 650
    },
    {
      "epoch": 1.1495754408883083,
      "grad_norm": 0.6896093508843089,
      "learning_rate": 5e-06,
      "loss": 0.6883,
      "step": 660
    },
    {
      "epoch": 1.1669932505987373,
      "grad_norm": 0.7618446264021529,
      "learning_rate": 5e-06,
      "loss": 0.6866,
      "step": 670
    },
    {
      "epoch": 1.1844110603091662,
      "grad_norm": 0.5993982811906263,
      "learning_rate": 5e-06,
      "loss": 0.6864,
      "step": 680
    },
    {
      "epoch": 1.201828870019595,
      "grad_norm": 0.7774490221678223,
      "learning_rate": 5e-06,
      "loss": 0.6904,
      "step": 690
    },
    {
      "epoch": 1.2192466797300239,
      "grad_norm": 0.7209322592758776,
      "learning_rate": 5e-06,
      "loss": 0.6843,
      "step": 700
    },
    {
      "epoch": 1.2366644894404528,
      "grad_norm": 0.5933996592162579,
      "learning_rate": 5e-06,
      "loss": 0.6858,
      "step": 710
    },
    {
      "epoch": 1.2540822991508818,
      "grad_norm": 0.603096793298619,
      "learning_rate": 5e-06,
      "loss": 0.6895,
      "step": 720
    },
    {
      "epoch": 1.2715001088613107,
      "grad_norm": 0.6254264453133531,
      "learning_rate": 5e-06,
      "loss": 0.6848,
      "step": 730
    },
    {
      "epoch": 1.2889179185717397,
      "grad_norm": 0.6255696330381725,
      "learning_rate": 5e-06,
      "loss": 0.6846,
      "step": 740
    },
    {
      "epoch": 1.3063357282821686,
      "grad_norm": 0.5954700919121718,
      "learning_rate": 5e-06,
      "loss": 0.6857,
      "step": 750
    },
    {
      "epoch": 1.3237535379925975,
      "grad_norm": 0.605017152001749,
      "learning_rate": 5e-06,
      "loss": 0.6833,
      "step": 760
    },
    {
      "epoch": 1.3411713477030263,
      "grad_norm": 0.6374571812243335,
      "learning_rate": 5e-06,
      "loss": 0.6835,
      "step": 770
    },
    {
      "epoch": 1.3585891574134552,
      "grad_norm": 0.7034685309687484,
      "learning_rate": 5e-06,
      "loss": 0.6862,
      "step": 780
    },
    {
      "epoch": 1.3760069671238842,
      "grad_norm": 0.6478857838671078,
      "learning_rate": 5e-06,
      "loss": 0.6834,
      "step": 790
    },
    {
      "epoch": 1.393424776834313,
      "grad_norm": 0.6774648068840645,
      "learning_rate": 5e-06,
      "loss": 0.6835,
      "step": 800
    },
    {
      "epoch": 1.410842586544742,
      "grad_norm": 0.7999211706624636,
      "learning_rate": 5e-06,
      "loss": 0.6849,
      "step": 810
    },
    {
      "epoch": 1.428260396255171,
      "grad_norm": 0.7093553520320318,
      "learning_rate": 5e-06,
      "loss": 0.6874,
      "step": 820
    },
    {
      "epoch": 1.4456782059655997,
      "grad_norm": 0.6339150451411103,
      "learning_rate": 5e-06,
      "loss": 0.6817,
      "step": 830
    },
    {
      "epoch": 1.4630960156760286,
      "grad_norm": 0.6378354921425449,
      "learning_rate": 5e-06,
      "loss": 0.687,
      "step": 840
    },
    {
      "epoch": 1.4805138253864576,
      "grad_norm": 0.6060950779002315,
      "learning_rate": 5e-06,
      "loss": 0.6852,
      "step": 850
    },
    {
      "epoch": 1.4979316350968865,
      "grad_norm": 0.7027258672686724,
      "learning_rate": 5e-06,
      "loss": 0.6866,
      "step": 860
    },
    {
      "epoch": 1.5153494448073155,
      "grad_norm": 0.6241403475122037,
      "learning_rate": 5e-06,
      "loss": 0.6831,
      "step": 870
    },
    {
      "epoch": 1.5327672545177444,
      "grad_norm": 0.7090161192507007,
      "learning_rate": 5e-06,
      "loss": 0.6827,
      "step": 880
    },
    {
      "epoch": 1.5501850642281734,
      "grad_norm": 0.6858971984003965,
      "learning_rate": 5e-06,
      "loss": 0.6792,
      "step": 890
    },
    {
      "epoch": 1.5676028739386023,
      "grad_norm": 0.6546584740097385,
      "learning_rate": 5e-06,
      "loss": 0.6806,
      "step": 900
    },
    {
      "epoch": 1.5850206836490313,
      "grad_norm": 0.6020087571697199,
      "learning_rate": 5e-06,
      "loss": 0.6799,
      "step": 910
    },
    {
      "epoch": 1.6024384933594602,
      "grad_norm": 0.6501182817858244,
      "learning_rate": 5e-06,
      "loss": 0.6857,
      "step": 920
    },
    {
      "epoch": 1.619856303069889,
      "grad_norm": 0.584870256422628,
      "learning_rate": 5e-06,
      "loss": 0.6803,
      "step": 930
    },
    {
      "epoch": 1.6372741127803179,
      "grad_norm": 0.6317587237334846,
      "learning_rate": 5e-06,
      "loss": 0.6855,
      "step": 940
    },
    {
      "epoch": 1.6546919224907468,
      "grad_norm": 0.5912078658288651,
      "learning_rate": 5e-06,
      "loss": 0.6831,
      "step": 950
    },
    {
      "epoch": 1.6721097322011755,
      "grad_norm": 0.6128368243065496,
      "learning_rate": 5e-06,
      "loss": 0.6808,
      "step": 960
    },
    {
      "epoch": 1.6895275419116045,
      "grad_norm": 0.7946845578841991,
      "learning_rate": 5e-06,
      "loss": 0.6878,
      "step": 970
    },
    {
      "epoch": 1.7069453516220334,
      "grad_norm": 0.6757515701163817,
      "learning_rate": 5e-06,
      "loss": 0.6854,
      "step": 980
    },
    {
      "epoch": 1.7243631613324624,
      "grad_norm": 0.6571419210311429,
      "learning_rate": 5e-06,
      "loss": 0.6823,
      "step": 990
    },
    {
      "epoch": 1.7417809710428913,
      "grad_norm": 0.7598031996788027,
      "learning_rate": 5e-06,
      "loss": 0.6797,
      "step": 1000
    },
    {
      "epoch": 1.7591987807533203,
      "grad_norm": 0.6670752253306316,
      "learning_rate": 5e-06,
      "loss": 0.6842,
      "step": 1010
    },
    {
      "epoch": 1.7766165904637492,
      "grad_norm": 0.6401962122300333,
      "learning_rate": 5e-06,
      "loss": 0.6812,
      "step": 1020
    },
    {
      "epoch": 1.7940344001741781,
      "grad_norm": 0.7117175290423249,
      "learning_rate": 5e-06,
      "loss": 0.6824,
      "step": 1030
    },
    {
      "epoch": 1.811452209884607,
      "grad_norm": 0.5800625845659623,
      "learning_rate": 5e-06,
      "loss": 0.6803,
      "step": 1040
    },
    {
      "epoch": 1.828870019595036,
      "grad_norm": 0.7320040356270946,
      "learning_rate": 5e-06,
      "loss": 0.6803,
      "step": 1050
    },
    {
      "epoch": 1.846287829305465,
      "grad_norm": 0.6283167554926344,
      "learning_rate": 5e-06,
      "loss": 0.6839,
      "step": 1060
    },
    {
      "epoch": 1.8637056390158937,
      "grad_norm": 0.8371943589394946,
      "learning_rate": 5e-06,
      "loss": 0.6812,
      "step": 1070
    },
    {
      "epoch": 1.8811234487263226,
      "grad_norm": 0.6257083963805145,
      "learning_rate": 5e-06,
      "loss": 0.6838,
      "step": 1080
    },
    {
      "epoch": 1.8985412584367516,
      "grad_norm": 0.5986261297946587,
      "learning_rate": 5e-06,
      "loss": 0.6797,
      "step": 1090
    },
    {
      "epoch": 1.9159590681471805,
      "grad_norm": 0.6170171924536151,
      "learning_rate": 5e-06,
      "loss": 0.6797,
      "step": 1100
    },
    {
      "epoch": 1.9333768778576093,
      "grad_norm": 0.5700527826150651,
      "learning_rate": 5e-06,
      "loss": 0.681,
      "step": 1110
    },
    {
      "epoch": 1.9507946875680382,
      "grad_norm": 0.7918614747451274,
      "learning_rate": 5e-06,
      "loss": 0.6836,
      "step": 1120
    },
    {
      "epoch": 1.9682124972784671,
      "grad_norm": 0.6423488525843636,
      "learning_rate": 5e-06,
      "loss": 0.6841,
      "step": 1130
    },
    {
      "epoch": 1.985630306988896,
      "grad_norm": 0.5727071414052627,
      "learning_rate": 5e-06,
      "loss": 0.6807,
      "step": 1140
    },
    {
      "epoch": 1.9995645547572392,
      "eval_loss": 0.7203673124313354,
      "eval_runtime": 394.6464,
      "eval_samples_per_second": 39.2,
      "eval_steps_per_second": 0.613,
      "step": 1148
    },
    {
      "epoch": 2.003048116699325,
      "grad_norm": 0.9442473115477681,
      "learning_rate": 5e-06,
      "loss": 0.7309,
      "step": 1150
    },
    {
      "epoch": 2.020465926409754,
      "grad_norm": 0.7483105880106439,
      "learning_rate": 5e-06,
      "loss": 0.6325,
      "step": 1160
    },
    {
      "epoch": 2.037883736120183,
      "grad_norm": 0.7540713563935749,
      "learning_rate": 5e-06,
      "loss": 0.6307,
      "step": 1170
    },
    {
      "epoch": 2.055301545830612,
      "grad_norm": 0.6724851440947269,
      "learning_rate": 5e-06,
      "loss": 0.6337,
      "step": 1180
    },
    {
      "epoch": 2.072719355541041,
      "grad_norm": 0.8087552584080454,
      "learning_rate": 5e-06,
      "loss": 0.6291,
      "step": 1190
    },
    {
      "epoch": 2.0901371652514698,
      "grad_norm": 0.5977695571055209,
      "learning_rate": 5e-06,
      "loss": 0.6303,
      "step": 1200
    },
    {
      "epoch": 2.1075549749618987,
      "grad_norm": 0.7569591780057143,
      "learning_rate": 5e-06,
      "loss": 0.63,
      "step": 1210
    },
    {
      "epoch": 2.1249727846723276,
      "grad_norm": 0.7224022820617728,
      "learning_rate": 5e-06,
      "loss": 0.6319,
      "step": 1220
    },
    {
      "epoch": 2.142390594382756,
      "grad_norm": 0.9103041866730269,
      "learning_rate": 5e-06,
      "loss": 0.6338,
      "step": 1230
    },
    {
      "epoch": 2.159808404093185,
      "grad_norm": 0.6573119068550747,
      "learning_rate": 5e-06,
      "loss": 0.6315,
      "step": 1240
    },
    {
      "epoch": 2.177226213803614,
      "grad_norm": 0.7130793786888794,
      "learning_rate": 5e-06,
      "loss": 0.6307,
      "step": 1250
    },
    {
      "epoch": 2.194644023514043,
      "grad_norm": 0.6787527348130123,
      "learning_rate": 5e-06,
      "loss": 0.635,
      "step": 1260
    },
    {
      "epoch": 2.212061833224472,
      "grad_norm": 0.7583316967190475,
      "learning_rate": 5e-06,
      "loss": 0.6328,
      "step": 1270
    },
    {
      "epoch": 2.229479642934901,
      "grad_norm": 0.9232698200191256,
      "learning_rate": 5e-06,
      "loss": 0.6392,
      "step": 1280
    },
    {
      "epoch": 2.24689745264533,
      "grad_norm": 0.7252346361915288,
      "learning_rate": 5e-06,
      "loss": 0.6347,
      "step": 1290
    },
    {
      "epoch": 2.2643152623557588,
      "grad_norm": 0.6362752959635848,
      "learning_rate": 5e-06,
      "loss": 0.6304,
      "step": 1300
    },
    {
      "epoch": 2.2817330720661877,
      "grad_norm": 0.6085958821854244,
      "learning_rate": 5e-06,
      "loss": 0.6354,
      "step": 1310
    },
    {
      "epoch": 2.2991508817766166,
      "grad_norm": 0.6106358089516802,
      "learning_rate": 5e-06,
      "loss": 0.6319,
      "step": 1320
    },
    {
      "epoch": 2.3165686914870456,
      "grad_norm": 0.6704766246511351,
      "learning_rate": 5e-06,
      "loss": 0.6361,
      "step": 1330
    },
    {
      "epoch": 2.3339865011974745,
      "grad_norm": 0.7134352783772538,
      "learning_rate": 5e-06,
      "loss": 0.6363,
      "step": 1340
    },
    {
      "epoch": 2.3514043109079035,
      "grad_norm": 0.691476396805842,
      "learning_rate": 5e-06,
      "loss": 0.6408,
      "step": 1350
    },
    {
      "epoch": 2.3688221206183324,
      "grad_norm": 0.6591953333036876,
      "learning_rate": 5e-06,
      "loss": 0.6333,
      "step": 1360
    },
    {
      "epoch": 2.3862399303287614,
      "grad_norm": 0.6492399404200755,
      "learning_rate": 5e-06,
      "loss": 0.6328,
      "step": 1370
    },
    {
      "epoch": 2.40365774003919,
      "grad_norm": 0.615428906926297,
      "learning_rate": 5e-06,
      "loss": 0.6413,
      "step": 1380
    },
    {
      "epoch": 2.421075549749619,
      "grad_norm": 0.6178851722594154,
      "learning_rate": 5e-06,
      "loss": 0.6362,
      "step": 1390
    },
    {
      "epoch": 2.4384933594600477,
      "grad_norm": 0.6630448901048148,
      "learning_rate": 5e-06,
      "loss": 0.6354,
      "step": 1400
    },
    {
      "epoch": 2.4559111691704767,
      "grad_norm": 0.6761352769262333,
      "learning_rate": 5e-06,
      "loss": 0.6364,
      "step": 1410
    },
    {
      "epoch": 2.4733289788809056,
      "grad_norm": 0.6840809680625406,
      "learning_rate": 5e-06,
      "loss": 0.6347,
      "step": 1420
    },
    {
      "epoch": 2.4907467885913346,
      "grad_norm": 0.7752711880925182,
      "learning_rate": 5e-06,
      "loss": 0.6375,
      "step": 1430
    },
    {
      "epoch": 2.5081645983017635,
      "grad_norm": 0.671961987869029,
      "learning_rate": 5e-06,
      "loss": 0.638,
      "step": 1440
    },
    {
      "epoch": 2.5255824080121925,
      "grad_norm": 0.6066582748163826,
      "learning_rate": 5e-06,
      "loss": 0.6356,
      "step": 1450
    },
    {
      "epoch": 2.5430002177226214,
      "grad_norm": 0.6494968644059873,
      "learning_rate": 5e-06,
      "loss": 0.64,
      "step": 1460
    },
    {
      "epoch": 2.5604180274330504,
      "grad_norm": 0.6819270998889235,
      "learning_rate": 5e-06,
      "loss": 0.6392,
      "step": 1470
    },
    {
      "epoch": 2.5778358371434793,
      "grad_norm": 0.6720899123226914,
      "learning_rate": 5e-06,
      "loss": 0.638,
      "step": 1480
    },
    {
      "epoch": 2.5952536468539082,
      "grad_norm": 0.8865903590224419,
      "learning_rate": 5e-06,
      "loss": 0.6418,
      "step": 1490
    },
    {
      "epoch": 2.612671456564337,
      "grad_norm": 0.8255525182739956,
      "learning_rate": 5e-06,
      "loss": 0.637,
      "step": 1500
    },
    {
      "epoch": 2.6300892662747657,
      "grad_norm": 0.6926869297282812,
      "learning_rate": 5e-06,
      "loss": 0.6359,
      "step": 1510
    },
    {
      "epoch": 2.647507075985195,
      "grad_norm": 0.7064032548106364,
      "learning_rate": 5e-06,
      "loss": 0.6372,
      "step": 1520
    },
    {
      "epoch": 2.6649248856956236,
      "grad_norm": 0.6147984872224924,
      "learning_rate": 5e-06,
      "loss": 0.6393,
      "step": 1530
    },
    {
      "epoch": 2.6823426954060525,
      "grad_norm": 0.6504099699536218,
      "learning_rate": 5e-06,
      "loss": 0.6399,
      "step": 1540
    },
    {
      "epoch": 2.6997605051164815,
      "grad_norm": 0.563483068339733,
      "learning_rate": 5e-06,
      "loss": 0.6348,
      "step": 1550
    },
    {
      "epoch": 2.7171783148269104,
      "grad_norm": 0.730022824759867,
      "learning_rate": 5e-06,
      "loss": 0.6358,
      "step": 1560
    },
    {
      "epoch": 2.7345961245373394,
      "grad_norm": 0.7435338593643929,
      "learning_rate": 5e-06,
      "loss": 0.6416,
      "step": 1570
    },
    {
      "epoch": 2.7520139342477683,
      "grad_norm": 0.7041374525178048,
      "learning_rate": 5e-06,
      "loss": 0.6312,
      "step": 1580
    },
    {
      "epoch": 2.7694317439581972,
      "grad_norm": 0.6185370005773447,
      "learning_rate": 5e-06,
      "loss": 0.6369,
      "step": 1590
    },
    {
      "epoch": 2.786849553668626,
      "grad_norm": 0.597751813516521,
      "learning_rate": 5e-06,
      "loss": 0.6376,
      "step": 1600
    },
    {
      "epoch": 2.804267363379055,
      "grad_norm": 0.6525590591893353,
      "learning_rate": 5e-06,
      "loss": 0.638,
      "step": 1610
    },
    {
      "epoch": 2.821685173089484,
      "grad_norm": 0.6520966089133831,
      "learning_rate": 5e-06,
      "loss": 0.6363,
      "step": 1620
    },
    {
      "epoch": 2.839102982799913,
      "grad_norm": 0.6318597492523834,
      "learning_rate": 5e-06,
      "loss": 0.6392,
      "step": 1630
    },
    {
      "epoch": 2.856520792510342,
      "grad_norm": 0.7325044927855683,
      "learning_rate": 5e-06,
      "loss": 0.6369,
      "step": 1640
    },
    {
      "epoch": 2.873938602220771,
      "grad_norm": 0.6253335500365848,
      "learning_rate": 5e-06,
      "loss": 0.636,
      "step": 1650
    },
    {
      "epoch": 2.8913564119311994,
      "grad_norm": 0.6704613482673505,
      "learning_rate": 5e-06,
      "loss": 0.6376,
      "step": 1660
    },
    {
      "epoch": 2.908774221641629,
      "grad_norm": 0.7273411977567759,
      "learning_rate": 5e-06,
      "loss": 0.6386,
      "step": 1670
    },
    {
      "epoch": 2.9261920313520573,
      "grad_norm": 0.6268248741184522,
      "learning_rate": 5e-06,
      "loss": 0.6378,
      "step": 1680
    },
    {
      "epoch": 2.9436098410624862,
      "grad_norm": 0.5745167165482074,
      "learning_rate": 5e-06,
      "loss": 0.6392,
      "step": 1690
    },
    {
      "epoch": 2.961027650772915,
      "grad_norm": 0.6046462612164668,
      "learning_rate": 5e-06,
      "loss": 0.642,
      "step": 1700
    },
    {
      "epoch": 2.978445460483344,
      "grad_norm": 0.6397198084499147,
      "learning_rate": 5e-06,
      "loss": 0.6381,
      "step": 1710
    },
    {
      "epoch": 2.995863270193773,
      "grad_norm": 0.7413825248918653,
      "learning_rate": 5e-06,
      "loss": 0.638,
      "step": 1720
    },
    {
      "epoch": 2.999346832135859,
      "eval_loss": 0.7229765057563782,
      "eval_runtime": 389.254,
      "eval_samples_per_second": 39.743,
      "eval_steps_per_second": 0.622,
      "step": 1722
    },
    {
      "epoch": 2.999346832135859,
      "step": 1722,
      "total_flos": 2884204756992000.0,
      "train_loss": 0.6974312729536448,
      "train_runtime": 56418.3982,
      "train_samples_per_second": 15.629,
      "train_steps_per_second": 0.031
    }
  ],
  "logging_steps": 10,
  "max_steps": 1722,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2884204756992000.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}