{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.999346832135859, "eval_steps": 500, "global_step": 1722, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.017417809710428913, "grad_norm": 10.378825586435497, "learning_rate": 5e-06, "loss": 1.0838, "step": 10 }, { "epoch": 0.034835619420857826, "grad_norm": 4.271546912407876, "learning_rate": 5e-06, "loss": 0.9373, "step": 20 }, { "epoch": 0.05225342913128674, "grad_norm": 2.1390120781164823, "learning_rate": 5e-06, "loss": 0.8909, "step": 30 }, { "epoch": 0.06967123884171565, "grad_norm": 1.4309168874054254, "learning_rate": 5e-06, "loss": 0.8639, "step": 40 }, { "epoch": 0.08708904855214457, "grad_norm": 1.5406276876095664, "learning_rate": 5e-06, "loss": 0.8433, "step": 50 }, { "epoch": 0.10450685826257348, "grad_norm": 1.2501465404617618, "learning_rate": 5e-06, "loss": 0.8247, "step": 60 }, { "epoch": 0.1219246679730024, "grad_norm": 0.9239834135671998, "learning_rate": 5e-06, "loss": 0.8126, "step": 70 }, { "epoch": 0.1393424776834313, "grad_norm": 0.8433941860985329, "learning_rate": 5e-06, "loss": 0.8046, "step": 80 }, { "epoch": 0.15676028739386022, "grad_norm": 1.097647182335265, "learning_rate": 5e-06, "loss": 0.796, "step": 90 }, { "epoch": 0.17417809710428914, "grad_norm": 1.0683416519694173, "learning_rate": 5e-06, "loss": 0.7903, "step": 100 }, { "epoch": 0.19159590681471805, "grad_norm": 1.572717136046009, "learning_rate": 5e-06, "loss": 0.7846, "step": 110 }, { "epoch": 0.20901371652514697, "grad_norm": 0.9249575894994505, "learning_rate": 5e-06, "loss": 0.7806, "step": 120 }, { "epoch": 0.2264315262355759, "grad_norm": 0.8193084838390464, "learning_rate": 5e-06, "loss": 0.7783, "step": 130 }, { "epoch": 0.2438493359460048, "grad_norm": 0.8732647330500742, "learning_rate": 5e-06, "loss": 0.7725, "step": 140 }, { "epoch": 0.2612671456564337, "grad_norm": 0.7448079044745952, "learning_rate": 5e-06, "loss": 0.773, "step": 150 }, { "epoch": 0.2786849553668626, "grad_norm": 0.9898862055097742, "learning_rate": 5e-06, "loss": 0.7695, "step": 160 }, { "epoch": 0.29610276507729155, "grad_norm": 0.7272776397879028, "learning_rate": 5e-06, "loss": 0.7705, "step": 170 }, { "epoch": 0.31352057478772044, "grad_norm": 1.0826222911505594, "learning_rate": 5e-06, "loss": 0.7737, "step": 180 }, { "epoch": 0.3309383844981494, "grad_norm": 0.6748830770235253, "learning_rate": 5e-06, "loss": 0.7611, "step": 190 }, { "epoch": 0.3483561942085783, "grad_norm": 0.7359625796839507, "learning_rate": 5e-06, "loss": 0.7609, "step": 200 }, { "epoch": 0.36577400391900716, "grad_norm": 0.6962279114859433, "learning_rate": 5e-06, "loss": 0.756, "step": 210 }, { "epoch": 0.3831918136294361, "grad_norm": 0.6255149151141138, "learning_rate": 5e-06, "loss": 0.7617, "step": 220 }, { "epoch": 0.400609623339865, "grad_norm": 0.7052369366480614, "learning_rate": 5e-06, "loss": 0.7569, "step": 230 }, { "epoch": 0.41802743305029394, "grad_norm": 0.6097155207889393, "learning_rate": 5e-06, "loss": 0.7546, "step": 240 }, { "epoch": 0.43544524276072283, "grad_norm": 0.6836967619335058, "learning_rate": 5e-06, "loss": 0.7613, "step": 250 }, { "epoch": 0.4528630524711518, "grad_norm": 0.5747669710959988, "learning_rate": 5e-06, "loss": 0.7568, "step": 260 }, { "epoch": 0.47028086218158066, "grad_norm": 0.8293746415280457, "learning_rate": 5e-06, "loss": 0.755, "step": 270 }, { "epoch": 0.4876986718920096, "grad_norm": 0.6240762658588678, "learning_rate": 5e-06, "loss": 0.7512, "step": 280 }, { "epoch": 0.5051164816024385, "grad_norm": 0.604817000368227, "learning_rate": 5e-06, "loss": 0.7565, "step": 290 }, { "epoch": 0.5225342913128674, "grad_norm": 0.7975782732234052, "learning_rate": 5e-06, "loss": 0.7536, "step": 300 }, { "epoch": 0.5399521010232963, "grad_norm": 0.9097401207787359, "learning_rate": 5e-06, "loss": 0.7449, "step": 310 }, { "epoch": 0.5573699107337252, "grad_norm": 0.5850865591811083, "learning_rate": 5e-06, "loss": 0.7476, "step": 320 }, { "epoch": 0.5747877204441542, "grad_norm": 0.7135182489392279, "learning_rate": 5e-06, "loss": 0.7446, "step": 330 }, { "epoch": 0.5922055301545831, "grad_norm": 0.7289623389467091, "learning_rate": 5e-06, "loss": 0.7461, "step": 340 }, { "epoch": 0.6096233398650119, "grad_norm": 0.7209085909740518, "learning_rate": 5e-06, "loss": 0.7441, "step": 350 }, { "epoch": 0.6270411495754409, "grad_norm": 0.7124009797840823, "learning_rate": 5e-06, "loss": 0.7426, "step": 360 }, { "epoch": 0.6444589592858698, "grad_norm": 0.7812743047985142, "learning_rate": 5e-06, "loss": 0.7425, "step": 370 }, { "epoch": 0.6618767689962988, "grad_norm": 0.9348863468860681, "learning_rate": 5e-06, "loss": 0.7426, "step": 380 }, { "epoch": 0.6792945787067276, "grad_norm": 0.8887235387086234, "learning_rate": 5e-06, "loss": 0.7443, "step": 390 }, { "epoch": 0.6967123884171565, "grad_norm": 0.5644550564287659, "learning_rate": 5e-06, "loss": 0.7462, "step": 400 }, { "epoch": 0.7141301981275855, "grad_norm": 0.7133759479394518, "learning_rate": 5e-06, "loss": 0.7392, "step": 410 }, { "epoch": 0.7315480078380143, "grad_norm": 0.6322425713640688, "learning_rate": 5e-06, "loss": 0.7438, "step": 420 }, { "epoch": 0.7489658175484433, "grad_norm": 0.653356418355363, "learning_rate": 5e-06, "loss": 0.7353, "step": 430 }, { "epoch": 0.7663836272588722, "grad_norm": 0.7172550980138845, "learning_rate": 5e-06, "loss": 0.7362, "step": 440 }, { "epoch": 0.7838014369693012, "grad_norm": 0.5961758862117608, "learning_rate": 5e-06, "loss": 0.7374, "step": 450 }, { "epoch": 0.80121924667973, "grad_norm": 0.682099366244341, "learning_rate": 5e-06, "loss": 0.7361, "step": 460 }, { "epoch": 0.8186370563901589, "grad_norm": 0.6035807917106119, "learning_rate": 5e-06, "loss": 0.7394, "step": 470 }, { "epoch": 0.8360548661005879, "grad_norm": 0.6420432430762913, "learning_rate": 5e-06, "loss": 0.7427, "step": 480 }, { "epoch": 0.8534726758110167, "grad_norm": 0.6771119733640588, "learning_rate": 5e-06, "loss": 0.7423, "step": 490 }, { "epoch": 0.8708904855214457, "grad_norm": 0.7126075005956054, "learning_rate": 5e-06, "loss": 0.7383, "step": 500 }, { "epoch": 0.8883082952318746, "grad_norm": 0.7584360793951261, "learning_rate": 5e-06, "loss": 0.7321, "step": 510 }, { "epoch": 0.9057261049423035, "grad_norm": 0.7439364499749611, "learning_rate": 5e-06, "loss": 0.7371, "step": 520 }, { "epoch": 0.9231439146527324, "grad_norm": 0.6529063351558732, "learning_rate": 5e-06, "loss": 0.74, "step": 530 }, { "epoch": 0.9405617243631613, "grad_norm": 0.6579238997317206, "learning_rate": 5e-06, "loss": 0.7339, "step": 540 }, { "epoch": 0.9579795340735903, "grad_norm": 0.6529747723949795, "learning_rate": 5e-06, "loss": 0.7366, "step": 550 }, { "epoch": 0.9753973437840192, "grad_norm": 0.7465822819939318, "learning_rate": 5e-06, "loss": 0.7329, "step": 560 }, { "epoch": 0.992815153494448, "grad_norm": 0.6744574449718616, "learning_rate": 5e-06, "loss": 0.7336, "step": 570 }, { "epoch": 0.9997822773786197, "eval_loss": 0.7317857146263123, "eval_runtime": 402.0076, "eval_samples_per_second": 38.482, "eval_steps_per_second": 0.602, "step": 574 }, { "epoch": 1.010232963204877, "grad_norm": 0.7094322700801018, "learning_rate": 5e-06, "loss": 0.763, "step": 580 }, { "epoch": 1.027650772915306, "grad_norm": 0.619008567608486, "learning_rate": 5e-06, "loss": 0.6821, "step": 590 }, { "epoch": 1.0450685826257349, "grad_norm": 0.7720080238431899, "learning_rate": 5e-06, "loss": 0.6814, "step": 600 }, { "epoch": 1.0624863923361638, "grad_norm": 0.613074115463787, "learning_rate": 5e-06, "loss": 0.6871, "step": 610 }, { "epoch": 1.0799042020465925, "grad_norm": 0.8529977128112323, "learning_rate": 5e-06, "loss": 0.6916, "step": 620 }, { "epoch": 1.0973220117570215, "grad_norm": 0.6185916445154745, "learning_rate": 5e-06, "loss": 0.6852, "step": 630 }, { "epoch": 1.1147398214674504, "grad_norm": 0.5960552407712587, "learning_rate": 5e-06, "loss": 0.6902, "step": 640 }, { "epoch": 1.1321576311778794, "grad_norm": 0.6992739712457544, "learning_rate": 5e-06, "loss": 0.6838, "step": 650 }, { "epoch": 1.1495754408883083, "grad_norm": 0.6896093508843089, "learning_rate": 5e-06, "loss": 0.6883, "step": 660 }, { "epoch": 1.1669932505987373, "grad_norm": 0.7618446264021529, "learning_rate": 5e-06, "loss": 0.6866, "step": 670 }, { "epoch": 1.1844110603091662, "grad_norm": 0.5993982811906263, "learning_rate": 5e-06, "loss": 0.6864, "step": 680 }, { "epoch": 1.201828870019595, "grad_norm": 0.7774490221678223, "learning_rate": 5e-06, "loss": 0.6904, "step": 690 }, { "epoch": 1.2192466797300239, "grad_norm": 0.7209322592758776, "learning_rate": 5e-06, "loss": 0.6843, "step": 700 }, { "epoch": 1.2366644894404528, "grad_norm": 0.5933996592162579, "learning_rate": 5e-06, "loss": 0.6858, "step": 710 }, { "epoch": 1.2540822991508818, "grad_norm": 0.603096793298619, "learning_rate": 5e-06, "loss": 0.6895, "step": 720 }, { "epoch": 1.2715001088613107, "grad_norm": 0.6254264453133531, "learning_rate": 5e-06, "loss": 0.6848, "step": 730 }, { "epoch": 1.2889179185717397, "grad_norm": 0.6255696330381725, "learning_rate": 5e-06, "loss": 0.6846, "step": 740 }, { "epoch": 1.3063357282821686, "grad_norm": 0.5954700919121718, "learning_rate": 5e-06, "loss": 0.6857, "step": 750 }, { "epoch": 1.3237535379925975, "grad_norm": 0.605017152001749, "learning_rate": 5e-06, "loss": 0.6833, "step": 760 }, { "epoch": 1.3411713477030263, "grad_norm": 0.6374571812243335, "learning_rate": 5e-06, "loss": 0.6835, "step": 770 }, { "epoch": 1.3585891574134552, "grad_norm": 0.7034685309687484, "learning_rate": 5e-06, "loss": 0.6862, "step": 780 }, { "epoch": 1.3760069671238842, "grad_norm": 0.6478857838671078, "learning_rate": 5e-06, "loss": 0.6834, "step": 790 }, { "epoch": 1.393424776834313, "grad_norm": 0.6774648068840645, "learning_rate": 5e-06, "loss": 0.6835, "step": 800 }, { "epoch": 1.410842586544742, "grad_norm": 0.7999211706624636, "learning_rate": 5e-06, "loss": 0.6849, "step": 810 }, { "epoch": 1.428260396255171, "grad_norm": 0.7093553520320318, "learning_rate": 5e-06, "loss": 0.6874, "step": 820 }, { "epoch": 1.4456782059655997, "grad_norm": 0.6339150451411103, "learning_rate": 5e-06, "loss": 0.6817, "step": 830 }, { "epoch": 1.4630960156760286, "grad_norm": 0.6378354921425449, "learning_rate": 5e-06, "loss": 0.687, "step": 840 }, { "epoch": 1.4805138253864576, "grad_norm": 0.6060950779002315, "learning_rate": 5e-06, "loss": 0.6852, "step": 850 }, { "epoch": 1.4979316350968865, "grad_norm": 0.7027258672686724, "learning_rate": 5e-06, "loss": 0.6866, "step": 860 }, { "epoch": 1.5153494448073155, "grad_norm": 0.6241403475122037, "learning_rate": 5e-06, "loss": 0.6831, "step": 870 }, { "epoch": 1.5327672545177444, "grad_norm": 0.7090161192507007, "learning_rate": 5e-06, "loss": 0.6827, "step": 880 }, { "epoch": 1.5501850642281734, "grad_norm": 0.6858971984003965, "learning_rate": 5e-06, "loss": 0.6792, "step": 890 }, { "epoch": 1.5676028739386023, "grad_norm": 0.6546584740097385, "learning_rate": 5e-06, "loss": 0.6806, "step": 900 }, { "epoch": 1.5850206836490313, "grad_norm": 0.6020087571697199, "learning_rate": 5e-06, "loss": 0.6799, "step": 910 }, { "epoch": 1.6024384933594602, "grad_norm": 0.6501182817858244, "learning_rate": 5e-06, "loss": 0.6857, "step": 920 }, { "epoch": 1.619856303069889, "grad_norm": 0.584870256422628, "learning_rate": 5e-06, "loss": 0.6803, "step": 930 }, { "epoch": 1.6372741127803179, "grad_norm": 0.6317587237334846, "learning_rate": 5e-06, "loss": 0.6855, "step": 940 }, { "epoch": 1.6546919224907468, "grad_norm": 0.5912078658288651, "learning_rate": 5e-06, "loss": 0.6831, "step": 950 }, { "epoch": 1.6721097322011755, "grad_norm": 0.6128368243065496, "learning_rate": 5e-06, "loss": 0.6808, "step": 960 }, { "epoch": 1.6895275419116045, "grad_norm": 0.7946845578841991, "learning_rate": 5e-06, "loss": 0.6878, "step": 970 }, { "epoch": 1.7069453516220334, "grad_norm": 0.6757515701163817, "learning_rate": 5e-06, "loss": 0.6854, "step": 980 }, { "epoch": 1.7243631613324624, "grad_norm": 0.6571419210311429, "learning_rate": 5e-06, "loss": 0.6823, "step": 990 }, { "epoch": 1.7417809710428913, "grad_norm": 0.7598031996788027, "learning_rate": 5e-06, "loss": 0.6797, "step": 1000 }, { "epoch": 1.7591987807533203, "grad_norm": 0.6670752253306316, "learning_rate": 5e-06, "loss": 0.6842, "step": 1010 }, { "epoch": 1.7766165904637492, "grad_norm": 0.6401962122300333, "learning_rate": 5e-06, "loss": 0.6812, "step": 1020 }, { "epoch": 1.7940344001741781, "grad_norm": 0.7117175290423249, "learning_rate": 5e-06, "loss": 0.6824, "step": 1030 }, { "epoch": 1.811452209884607, "grad_norm": 0.5800625845659623, "learning_rate": 5e-06, "loss": 0.6803, "step": 1040 }, { "epoch": 1.828870019595036, "grad_norm": 0.7320040356270946, "learning_rate": 5e-06, "loss": 0.6803, "step": 1050 }, { "epoch": 1.846287829305465, "grad_norm": 0.6283167554926344, "learning_rate": 5e-06, "loss": 0.6839, "step": 1060 }, { "epoch": 1.8637056390158937, "grad_norm": 0.8371943589394946, "learning_rate": 5e-06, "loss": 0.6812, "step": 1070 }, { "epoch": 1.8811234487263226, "grad_norm": 0.6257083963805145, "learning_rate": 5e-06, "loss": 0.6838, "step": 1080 }, { "epoch": 1.8985412584367516, "grad_norm": 0.5986261297946587, "learning_rate": 5e-06, "loss": 0.6797, "step": 1090 }, { "epoch": 1.9159590681471805, "grad_norm": 0.6170171924536151, "learning_rate": 5e-06, "loss": 0.6797, "step": 1100 }, { "epoch": 1.9333768778576093, "grad_norm": 0.5700527826150651, "learning_rate": 5e-06, "loss": 0.681, "step": 1110 }, { "epoch": 1.9507946875680382, "grad_norm": 0.7918614747451274, "learning_rate": 5e-06, "loss": 0.6836, "step": 1120 }, { "epoch": 1.9682124972784671, "grad_norm": 0.6423488525843636, "learning_rate": 5e-06, "loss": 0.6841, "step": 1130 }, { "epoch": 1.985630306988896, "grad_norm": 0.5727071414052627, "learning_rate": 5e-06, "loss": 0.6807, "step": 1140 }, { "epoch": 1.9995645547572392, "eval_loss": 0.7203673124313354, "eval_runtime": 394.6464, "eval_samples_per_second": 39.2, "eval_steps_per_second": 0.613, "step": 1148 }, { "epoch": 2.003048116699325, "grad_norm": 0.9442473115477681, "learning_rate": 5e-06, "loss": 0.7309, "step": 1150 }, { "epoch": 2.020465926409754, "grad_norm": 0.7483105880106439, "learning_rate": 5e-06, "loss": 0.6325, "step": 1160 }, { "epoch": 2.037883736120183, "grad_norm": 0.7540713563935749, "learning_rate": 5e-06, "loss": 0.6307, "step": 1170 }, { "epoch": 2.055301545830612, "grad_norm": 0.6724851440947269, "learning_rate": 5e-06, "loss": 0.6337, "step": 1180 }, { "epoch": 2.072719355541041, "grad_norm": 0.8087552584080454, "learning_rate": 5e-06, "loss": 0.6291, "step": 1190 }, { "epoch": 2.0901371652514698, "grad_norm": 0.5977695571055209, "learning_rate": 5e-06, "loss": 0.6303, "step": 1200 }, { "epoch": 2.1075549749618987, "grad_norm": 0.7569591780057143, "learning_rate": 5e-06, "loss": 0.63, "step": 1210 }, { "epoch": 2.1249727846723276, "grad_norm": 0.7224022820617728, "learning_rate": 5e-06, "loss": 0.6319, "step": 1220 }, { "epoch": 2.142390594382756, "grad_norm": 0.9103041866730269, "learning_rate": 5e-06, "loss": 0.6338, "step": 1230 }, { "epoch": 2.159808404093185, "grad_norm": 0.6573119068550747, "learning_rate": 5e-06, "loss": 0.6315, "step": 1240 }, { "epoch": 2.177226213803614, "grad_norm": 0.7130793786888794, "learning_rate": 5e-06, "loss": 0.6307, "step": 1250 }, { "epoch": 2.194644023514043, "grad_norm": 0.6787527348130123, "learning_rate": 5e-06, "loss": 0.635, "step": 1260 }, { "epoch": 2.212061833224472, "grad_norm": 0.7583316967190475, "learning_rate": 5e-06, "loss": 0.6328, "step": 1270 }, { "epoch": 2.229479642934901, "grad_norm": 0.9232698200191256, "learning_rate": 5e-06, "loss": 0.6392, "step": 1280 }, { "epoch": 2.24689745264533, "grad_norm": 0.7252346361915288, "learning_rate": 5e-06, "loss": 0.6347, "step": 1290 }, { "epoch": 2.2643152623557588, "grad_norm": 0.6362752959635848, "learning_rate": 5e-06, "loss": 0.6304, "step": 1300 }, { "epoch": 2.2817330720661877, "grad_norm": 0.6085958821854244, "learning_rate": 5e-06, "loss": 0.6354, "step": 1310 }, { "epoch": 2.2991508817766166, "grad_norm": 0.6106358089516802, "learning_rate": 5e-06, "loss": 0.6319, "step": 1320 }, { "epoch": 2.3165686914870456, "grad_norm": 0.6704766246511351, "learning_rate": 5e-06, "loss": 0.6361, "step": 1330 }, { "epoch": 2.3339865011974745, "grad_norm": 0.7134352783772538, "learning_rate": 5e-06, "loss": 0.6363, "step": 1340 }, { "epoch": 2.3514043109079035, "grad_norm": 0.691476396805842, "learning_rate": 5e-06, "loss": 0.6408, "step": 1350 }, { "epoch": 2.3688221206183324, "grad_norm": 0.6591953333036876, "learning_rate": 5e-06, "loss": 0.6333, "step": 1360 }, { "epoch": 2.3862399303287614, "grad_norm": 0.6492399404200755, "learning_rate": 5e-06, "loss": 0.6328, "step": 1370 }, { "epoch": 2.40365774003919, "grad_norm": 0.615428906926297, "learning_rate": 5e-06, "loss": 0.6413, "step": 1380 }, { "epoch": 2.421075549749619, "grad_norm": 0.6178851722594154, "learning_rate": 5e-06, "loss": 0.6362, "step": 1390 }, { "epoch": 2.4384933594600477, "grad_norm": 0.6630448901048148, "learning_rate": 5e-06, "loss": 0.6354, "step": 1400 }, { "epoch": 2.4559111691704767, "grad_norm": 0.6761352769262333, "learning_rate": 5e-06, "loss": 0.6364, "step": 1410 }, { "epoch": 2.4733289788809056, "grad_norm": 0.6840809680625406, "learning_rate": 5e-06, "loss": 0.6347, "step": 1420 }, { "epoch": 2.4907467885913346, "grad_norm": 0.7752711880925182, "learning_rate": 5e-06, "loss": 0.6375, "step": 1430 }, { "epoch": 2.5081645983017635, "grad_norm": 0.671961987869029, "learning_rate": 5e-06, "loss": 0.638, "step": 1440 }, { "epoch": 2.5255824080121925, "grad_norm": 0.6066582748163826, "learning_rate": 5e-06, "loss": 0.6356, "step": 1450 }, { "epoch": 2.5430002177226214, "grad_norm": 0.6494968644059873, "learning_rate": 5e-06, "loss": 0.64, "step": 1460 }, { "epoch": 2.5604180274330504, "grad_norm": 0.6819270998889235, "learning_rate": 5e-06, "loss": 0.6392, "step": 1470 }, { "epoch": 2.5778358371434793, "grad_norm": 0.6720899123226914, "learning_rate": 5e-06, "loss": 0.638, "step": 1480 }, { "epoch": 2.5952536468539082, "grad_norm": 0.8865903590224419, "learning_rate": 5e-06, "loss": 0.6418, "step": 1490 }, { "epoch": 2.612671456564337, "grad_norm": 0.8255525182739956, "learning_rate": 5e-06, "loss": 0.637, "step": 1500 }, { "epoch": 2.6300892662747657, "grad_norm": 0.6926869297282812, "learning_rate": 5e-06, "loss": 0.6359, "step": 1510 }, { "epoch": 2.647507075985195, "grad_norm": 0.7064032548106364, "learning_rate": 5e-06, "loss": 0.6372, "step": 1520 }, { "epoch": 2.6649248856956236, "grad_norm": 0.6147984872224924, "learning_rate": 5e-06, "loss": 0.6393, "step": 1530 }, { "epoch": 2.6823426954060525, "grad_norm": 0.6504099699536218, "learning_rate": 5e-06, "loss": 0.6399, "step": 1540 }, { "epoch": 2.6997605051164815, "grad_norm": 0.563483068339733, "learning_rate": 5e-06, "loss": 0.6348, "step": 1550 }, { "epoch": 2.7171783148269104, "grad_norm": 0.730022824759867, "learning_rate": 5e-06, "loss": 0.6358, "step": 1560 }, { "epoch": 2.7345961245373394, "grad_norm": 0.7435338593643929, "learning_rate": 5e-06, "loss": 0.6416, "step": 1570 }, { "epoch": 2.7520139342477683, "grad_norm": 0.7041374525178048, "learning_rate": 5e-06, "loss": 0.6312, "step": 1580 }, { "epoch": 2.7694317439581972, "grad_norm": 0.6185370005773447, "learning_rate": 5e-06, "loss": 0.6369, "step": 1590 }, { "epoch": 2.786849553668626, "grad_norm": 0.597751813516521, "learning_rate": 5e-06, "loss": 0.6376, "step": 1600 }, { "epoch": 2.804267363379055, "grad_norm": 0.6525590591893353, "learning_rate": 5e-06, "loss": 0.638, "step": 1610 }, { "epoch": 2.821685173089484, "grad_norm": 0.6520966089133831, "learning_rate": 5e-06, "loss": 0.6363, "step": 1620 }, { "epoch": 2.839102982799913, "grad_norm": 0.6318597492523834, "learning_rate": 5e-06, "loss": 0.6392, "step": 1630 }, { "epoch": 2.856520792510342, "grad_norm": 0.7325044927855683, "learning_rate": 5e-06, "loss": 0.6369, "step": 1640 }, { "epoch": 2.873938602220771, "grad_norm": 0.6253335500365848, "learning_rate": 5e-06, "loss": 0.636, "step": 1650 }, { "epoch": 2.8913564119311994, "grad_norm": 0.6704613482673505, "learning_rate": 5e-06, "loss": 0.6376, "step": 1660 }, { "epoch": 2.908774221641629, "grad_norm": 0.7273411977567759, "learning_rate": 5e-06, "loss": 0.6386, "step": 1670 }, { "epoch": 2.9261920313520573, "grad_norm": 0.6268248741184522, "learning_rate": 5e-06, "loss": 0.6378, "step": 1680 }, { "epoch": 2.9436098410624862, "grad_norm": 0.5745167165482074, "learning_rate": 5e-06, "loss": 0.6392, "step": 1690 }, { "epoch": 2.961027650772915, "grad_norm": 0.6046462612164668, "learning_rate": 5e-06, "loss": 0.642, "step": 1700 }, { "epoch": 2.978445460483344, "grad_norm": 0.6397198084499147, "learning_rate": 5e-06, "loss": 0.6381, "step": 1710 }, { "epoch": 2.995863270193773, "grad_norm": 0.7413825248918653, "learning_rate": 5e-06, "loss": 0.638, "step": 1720 }, { "epoch": 2.999346832135859, "eval_loss": 0.7229765057563782, "eval_runtime": 389.254, "eval_samples_per_second": 39.743, "eval_steps_per_second": 0.622, "step": 1722 }, { "epoch": 2.999346832135859, "step": 1722, "total_flos": 2884204756992000.0, "train_loss": 0.6974312729536448, "train_runtime": 56418.3982, "train_samples_per_second": 15.629, "train_steps_per_second": 0.031 } ], "logging_steps": 10, "max_steps": 1722, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2884204756992000.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }