{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 391, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02557544757033248, "grad_norm": 5.1979091027844655, "learning_rate": 2.25e-06, "loss": 0.5442, "step": 10 }, { "epoch": 0.05115089514066496, "grad_norm": 1.555199471497413, "learning_rate": 4.75e-06, "loss": 0.4196, "step": 20 }, { "epoch": 0.07672634271099744, "grad_norm": 0.8843964610571793, "learning_rate": 7.25e-06, "loss": 0.2928, "step": 30 }, { "epoch": 0.10230179028132992, "grad_norm": 0.9254467392549297, "learning_rate": 9.75e-06, "loss": 0.2508, "step": 40 }, { "epoch": 0.1278772378516624, "grad_norm": 0.9595475930941818, "learning_rate": 9.983786540671052e-06, "loss": 0.2316, "step": 50 }, { "epoch": 0.1534526854219949, "grad_norm": 0.8734055150087038, "learning_rate": 9.927874998629714e-06, "loss": 0.2198, "step": 60 }, { "epoch": 0.17902813299232737, "grad_norm": 0.8904948291987135, "learning_rate": 9.83251270794707e-06, "loss": 0.2047, "step": 70 }, { "epoch": 0.20460358056265984, "grad_norm": 0.8941187964853939, "learning_rate": 9.698463103929542e-06, "loss": 0.2058, "step": 80 }, { "epoch": 0.23017902813299232, "grad_norm": 0.8232662082320363, "learning_rate": 9.526799338236828e-06, "loss": 0.1903, "step": 90 }, { "epoch": 0.2557544757033248, "grad_norm": 0.8507117163294342, "learning_rate": 9.318895687625752e-06, "loss": 0.1835, "step": 100 }, { "epoch": 0.2813299232736573, "grad_norm": 0.7198977204875759, "learning_rate": 9.076416551997721e-06, "loss": 0.1774, "step": 110 }, { "epoch": 0.3069053708439898, "grad_norm": 0.6072199252093429, "learning_rate": 8.801303129827352e-06, "loss": 0.175, "step": 120 }, { "epoch": 0.33248081841432225, "grad_norm": 0.552937855243384, "learning_rate": 8.495757877643857e-06, "loss": 0.1688, "step": 130 }, { "epoch": 0.35805626598465473, "grad_norm": 0.5078356984647172, "learning_rate": 8.162226877976886e-06, "loss": 0.1679, "step": 140 }, { "epoch": 0.3836317135549872, "grad_norm": 0.4569791962942034, "learning_rate": 7.803380256922495e-06, "loss": 0.1659, "step": 150 }, { "epoch": 0.4092071611253197, "grad_norm": 0.43754448097194093, "learning_rate": 7.422090808099014e-06, "loss": 0.1614, "step": 160 }, { "epoch": 0.43478260869565216, "grad_norm": 0.3965482560875204, "learning_rate": 7.021410994121525e-06, "loss": 0.1658, "step": 170 }, { "epoch": 0.46035805626598464, "grad_norm": 0.40365524431178995, "learning_rate": 6.6045485097126585e-06, "loss": 0.162, "step": 180 }, { "epoch": 0.4859335038363171, "grad_norm": 0.38726639073470187, "learning_rate": 6.1748406020824115e-06, "loss": 0.1603, "step": 190 }, { "epoch": 0.5115089514066496, "grad_norm": 0.3760876442217351, "learning_rate": 5.735727354158581e-06, "loss": 0.1607, "step": 200 }, { "epoch": 0.5370843989769821, "grad_norm": 0.3698546651087282, "learning_rate": 5.290724144552379e-06, "loss": 0.1566, "step": 210 }, { "epoch": 0.5626598465473146, "grad_norm": 0.378775457895615, "learning_rate": 4.8433935047346e-06, "loss": 0.1617, "step": 220 }, { "epoch": 0.5882352941176471, "grad_norm": 0.37700547911055976, "learning_rate": 4.397316598723385e-06, "loss": 0.1577, "step": 230 }, { "epoch": 0.6138107416879796, "grad_norm": 0.34954174520163833, "learning_rate": 3.956064553606708e-06, "loss": 0.1595, "step": 240 }, { "epoch": 0.639386189258312, "grad_norm": 0.357317340726361, "learning_rate": 3.523169870416795e-06, "loss": 0.154, "step": 250 }, { "epoch": 0.6649616368286445, "grad_norm": 0.3684051566901591, "learning_rate": 3.1020981442305187e-06, "loss": 0.1531, "step": 260 }, { "epoch": 0.690537084398977, "grad_norm": 0.3895498086957201, "learning_rate": 2.6962203198941587e-06, "loss": 0.1579, "step": 270 }, { "epoch": 0.7161125319693095, "grad_norm": 0.3731438076456078, "learning_rate": 2.308785705482982e-06, "loss": 0.1586, "step": 280 }, { "epoch": 0.7416879795396419, "grad_norm": 0.41906481058101075, "learning_rate": 1.942895959539939e-06, "loss": 0.1575, "step": 290 }, { "epoch": 0.7672634271099744, "grad_norm": 0.3896314041500002, "learning_rate": 1.6014802603420044e-06, "loss": 0.1553, "step": 300 }, { "epoch": 0.7928388746803069, "grad_norm": 0.385720584527069, "learning_rate": 1.2872718559798852e-06, "loss": 0.1532, "step": 310 }, { "epoch": 0.8184143222506394, "grad_norm": 0.32351933058141824, "learning_rate": 1.0027861829824953e-06, "loss": 0.1562, "step": 320 }, { "epoch": 0.8439897698209718, "grad_norm": 0.3670621953960731, "learning_rate": 7.50300728660407e-07, "loss": 0.155, "step": 330 }, { "epoch": 0.8695652173913043, "grad_norm": 0.3829568746421653, "learning_rate": 5.318367983829393e-07, "loss": 0.1523, "step": 340 }, { "epoch": 0.8951406649616368, "grad_norm": 0.3775945531822611, "learning_rate": 3.49143333753309e-07, "loss": 0.1521, "step": 350 }, { "epoch": 0.9207161125319693, "grad_norm": 0.36031263149371584, "learning_rate": 2.0368291122759898e-07, "loss": 0.1523, "step": 360 }, { "epoch": 0.9462915601023018, "grad_norm": 0.3348863130660584, "learning_rate": 9.662003326740166e-08, "loss": 0.1514, "step": 370 }, { "epoch": 0.9718670076726342, "grad_norm": 0.360326022397128, "learning_rate": 2.8811805762860578e-08, "loss": 0.1501, "step": 380 }, { "epoch": 0.9974424552429667, "grad_norm": 0.37241654765357973, "learning_rate": 8.010763592264381e-10, "loss": 0.152, "step": 390 }, { "epoch": 1.0, "step": 391, "total_flos": 24007129694208.0, "train_loss": 0.18847447641365364, "train_runtime": 1067.9437, "train_samples_per_second": 46.819, "train_steps_per_second": 0.366 } ], "logging_steps": 10, "max_steps": 391, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 24007129694208.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }