[ { "loss": 0.34830059051513673, "grad_norm": 0.2803570032119751, "learning_rate": 0.00013714285714285716, "num_tokens": 640109.0, "mean_token_accuracy": 0.9155976337194442, "epoch": 0.10746910263299302, "step": 25 }, { "loss": 0.14681048393249513, "grad_norm": 0.23165035247802734, "learning_rate": 0.000199780703920947, "num_tokens": 1262877.0, "mean_token_accuracy": 0.9537364545464516, "epoch": 0.21493820526598603, "step": 50 }, { "loss": 0.1139146327972412, "grad_norm": 0.16550014913082123, "learning_rate": 0.00019830242014201796, "num_tokens": 1905162.0, "mean_token_accuracy": 0.9621101367473602, "epoch": 0.32240730789897903, "step": 75 }, { "loss": 0.11038614273071289, "grad_norm": 0.18707048892974854, "learning_rate": 0.0001954504062771555, "num_tokens": 2537797.0, "mean_token_accuracy": 0.9628356519341469, "epoch": 0.42987641053197206, "step": 100 }, { "eval_loss": 0.1351652890443802, "eval_runtime": 69.5984, "eval_samples_per_second": 2.845, "eval_steps_per_second": 1.422, "eval_num_tokens": 2954175.0, "eval_mean_token_accuracy": 0.9607259185627254, "epoch": 0.4986566362170876, "step": 116 }, { "loss": 0.11258039474487305, "grad_norm": 0.20463427901268005, "learning_rate": 0.00019126451787870527, "num_tokens": 3175008.0, "mean_token_accuracy": 0.9623757800459862, "epoch": 0.537345513164965, "step": 125 }, { "loss": 0.11596426963806153, "grad_norm": 0.1986123025417328, "learning_rate": 0.00018580325076824513, "num_tokens": 3799800.0, "mean_token_accuracy": 0.961990795135498, "epoch": 0.6448146157979581, "step": 150 }, { "loss": 0.0997089672088623, "grad_norm": 0.1511112004518509, "learning_rate": 0.0001791429235849919, "num_tokens": 4438381.0, "mean_token_accuracy": 0.9641576319932937, "epoch": 0.7522837184309511, "step": 175 }, { "loss": 0.09359555244445801, "grad_norm": 0.13941654562950134, "learning_rate": 0.0001713766112687139, "num_tokens": 5075973.0, "mean_token_accuracy": 0.9686441496014595, "epoch": 0.8597528210639441, "step": 200 }, { "loss": 0.11163744926452637, "grad_norm": 0.1599922776222229, "learning_rate": 0.0001626128443812245, "num_tokens": 5723214.0, "mean_token_accuracy": 0.9624734339118004, "epoch": 0.9672219236969372, "step": 225 }, { "eval_loss": 0.12350355833768845, "eval_runtime": 69.5143, "eval_samples_per_second": 2.848, "eval_steps_per_second": 1.424, "eval_num_tokens": 5899964.0, "eval_mean_token_accuracy": 0.9637513997578862, "epoch": 0.9973132724341752, "step": 232 }, { "loss": 0.07306031227111816, "grad_norm": 0.1523396372795105, "learning_rate": 0.00015297409244282694, "num_tokens": 6364544.0, "mean_token_accuracy": 0.9745264253035415, "epoch": 1.0730789897904351, "step": 250 }, { "loss": 0.06787878513336182, "grad_norm": 0.13754014670848846, "learning_rate": 0.00014259505247837074, "num_tokens": 6996524.0, "mean_token_accuracy": 0.9755120638012886, "epoch": 1.1805480924234282, "step": 275 }, { "loss": 0.08596912384033203, "grad_norm": 0.17458459734916687, "learning_rate": 0.0001316207666896824, "num_tokens": 7621299.0, "mean_token_accuracy": 0.9696658563613891, "epoch": 1.2880171950564212, "step": 300 }, { "loss": 0.06656608581542969, "grad_norm": 0.1313866376876831, "learning_rate": 0.00012020459555901427, "num_tokens": 8267290.0, "mean_token_accuracy": 0.9760522067546844, "epoch": 1.3954862976894142, "step": 325 }, { "eval_loss": 0.1249435767531395, "eval_runtime": 69.7547, "eval_samples_per_second": 2.839, "eval_steps_per_second": 1.419, "eval_num_tokens": 8834417.0, "eval_mean_token_accuracy": 0.963239210422593, "epoch": 1.4943578721117678, "step": 348 }, { "loss": 0.07149289608001709, "grad_norm": 0.15927733480930328, "learning_rate": 0.00010850607470843656, "num_tokens": 8883001.0, "mean_token_accuracy": 0.9746167114377022, "epoch": 1.5029554003224073, "step": 350 }, { "loss": 0.0666344976425171, "grad_norm": 0.14084048569202423, "learning_rate": 9.668868546455486e-05, "num_tokens": 9525128.0, "mean_token_accuracy": 0.975580106973648, "epoch": 1.6104245029554003, "step": 375 }, { "loss": 0.07050958156585693, "grad_norm": 0.1653551161289215, "learning_rate": 8.491757028386263e-05, "num_tokens": 10162043.0, "mean_token_accuracy": 0.9742929524183274, "epoch": 1.7178936055883933, "step": 400 }, { "loss": 0.054537668228149414, "grad_norm": 0.17465578019618988, "learning_rate": 7.33572249645848e-05, "num_tokens": 10802637.0, "mean_token_accuracy": 0.9797186449170112, "epoch": 1.8253627082213864, "step": 425 }, { "loss": 0.06608867645263672, "grad_norm": 0.15113526582717896, "learning_rate": 6.216919989526651e-05, "num_tokens": 11430608.0, "mean_token_accuracy": 0.9761568233370781, "epoch": 1.9328318108543794, "step": 450 }, { "eval_loss": 0.11843688040971756, "eval_runtime": 69.9946, "eval_samples_per_second": 2.829, "eval_steps_per_second": 1.414, "eval_num_tokens": 11786762.0, "eval_mean_token_accuracy": 0.965543883015411, "epoch": 1.9930145083288555, "step": 464 }, { "loss": 0.05509011745452881, "grad_norm": 0.09923101216554642, "learning_rate": 5.1509842464076776e-05, "num_tokens": 12054154.0, "mean_token_accuracy": 0.9804502256630641, "epoch": 2.0386888769478775, "step": 475 }, { "loss": 0.031655769348144534, "grad_norm": 0.13326086103916168, "learning_rate": 4.152811217759529e-05, "num_tokens": 12697456.0, "mean_token_accuracy": 0.9891643562912941, "epoch": 2.1461579795808703, "step": 500 }, { "loss": 0.032743215560913086, "grad_norm": 0.12852540612220764, "learning_rate": 3.2363499021769526e-05, "num_tokens": 13329332.0, "mean_token_accuracy": 0.9882625249028206, "epoch": 2.2536270822138635, "step": 525 }, { "loss": 0.029418470859527587, "grad_norm": 0.12474379688501358, "learning_rate": 2.4144074154968832e-05, "num_tokens": 13972889.0, "mean_token_accuracy": 0.9891040176153183, "epoch": 2.3610961848468563, "step": 550 }, { "loss": 0.030236964225769044, "grad_norm": 0.12814833223819733, "learning_rate": 1.6984700173783175e-05, "num_tokens": 14606119.0, "mean_token_accuracy": 0.9886843663454056, "epoch": 2.4685652874798496, "step": 575 }, { "eval_loss": 0.1397247463464737, "eval_runtime": 69.3145, "eval_samples_per_second": 2.857, "eval_steps_per_second": 1.428, "eval_num_tokens": 14728454.0, "eval_mean_token_accuracy": 0.9646775379325404, "epoch": 2.4900591080064483, "step": 580 }, { "loss": 0.030723834037780763, "grad_norm": 0.12963370978832245, "learning_rate": 1.0985425962260343e-05, "num_tokens": 15237584.0, "mean_token_accuracy": 0.9888986241817475, "epoch": 2.5760343901128424, "step": 600 }, { "loss": 0.030379328727722168, "grad_norm": 0.16792573034763336, "learning_rate": 6.230088555808278e-06, "num_tokens": 15876203.0, "mean_token_accuracy": 0.9890140387415886, "epoch": 2.6835034927458357, "step": 625 }, { "loss": 0.02967998743057251, "grad_norm": 0.19673478603363037, "learning_rate": 2.7851415580571692e-06, "num_tokens": 16513501.0, "mean_token_accuracy": 0.9892704981565476, "epoch": 2.7909725953788285, "step": 650 }, { "loss": 0.030462250709533692, "grad_norm": 0.11195345222949982, "learning_rate": 6.987264830045526e-07, "num_tokens": 17145749.0, "mean_token_accuracy": 0.9888393118977546, "epoch": 2.8984416980118217, "step": 675 }, { "eval_loss": 0.13951744139194489, "eval_runtime": 69.3391, "eval_samples_per_second": 2.856, "eval_steps_per_second": 1.428, "eval_num_tokens": 17672878.0, "eval_mean_token_accuracy": 0.9649288859030213, "epoch": 2.9887157442235357, "step": 696 }, { "eval_loss": 0.13946650922298431, "eval_runtime": 69.3693, "eval_samples_per_second": 2.854, "eval_steps_per_second": 1.427, "eval_num_tokens": 17736411.0, "eval_mean_token_accuracy": 0.9649666162452313, "epoch": 3.0, "step": 699 }, { "train_runtime": 14009.2214, "train_samples_per_second": 0.797, "train_steps_per_second": 0.05, "total_flos": 8.799123393273508e+17, "train_loss": 0.07888307489550676, "epoch": 3.0, "step": 699 } ]