| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 391, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.02557544757033248, | |
| "grad_norm": 5.1979091027844655, | |
| "learning_rate": 2.25e-06, | |
| "loss": 0.5442, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.05115089514066496, | |
| "grad_norm": 1.555199471497413, | |
| "learning_rate": 4.75e-06, | |
| "loss": 0.4196, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.07672634271099744, | |
| "grad_norm": 0.8843964610571793, | |
| "learning_rate": 7.25e-06, | |
| "loss": 0.2928, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.10230179028132992, | |
| "grad_norm": 0.9254467392549297, | |
| "learning_rate": 9.75e-06, | |
| "loss": 0.2508, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.1278772378516624, | |
| "grad_norm": 0.9595475930941818, | |
| "learning_rate": 9.983786540671052e-06, | |
| "loss": 0.2316, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.1534526854219949, | |
| "grad_norm": 0.8734055150087038, | |
| "learning_rate": 9.927874998629714e-06, | |
| "loss": 0.2198, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.17902813299232737, | |
| "grad_norm": 0.8904948291987135, | |
| "learning_rate": 9.83251270794707e-06, | |
| "loss": 0.2047, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.20460358056265984, | |
| "grad_norm": 0.8941187964853939, | |
| "learning_rate": 9.698463103929542e-06, | |
| "loss": 0.2058, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.23017902813299232, | |
| "grad_norm": 0.8232662082320363, | |
| "learning_rate": 9.526799338236828e-06, | |
| "loss": 0.1903, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.2557544757033248, | |
| "grad_norm": 0.8507117163294342, | |
| "learning_rate": 9.318895687625752e-06, | |
| "loss": 0.1835, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.2813299232736573, | |
| "grad_norm": 0.7198977204875759, | |
| "learning_rate": 9.076416551997721e-06, | |
| "loss": 0.1774, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.3069053708439898, | |
| "grad_norm": 0.6072199252093429, | |
| "learning_rate": 8.801303129827352e-06, | |
| "loss": 0.175, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.33248081841432225, | |
| "grad_norm": 0.552937855243384, | |
| "learning_rate": 8.495757877643857e-06, | |
| "loss": 0.1688, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.35805626598465473, | |
| "grad_norm": 0.5078356984647172, | |
| "learning_rate": 8.162226877976886e-06, | |
| "loss": 0.1679, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.3836317135549872, | |
| "grad_norm": 0.4569791962942034, | |
| "learning_rate": 7.803380256922495e-06, | |
| "loss": 0.1659, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.4092071611253197, | |
| "grad_norm": 0.43754448097194093, | |
| "learning_rate": 7.422090808099014e-06, | |
| "loss": 0.1614, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.43478260869565216, | |
| "grad_norm": 0.3965482560875204, | |
| "learning_rate": 7.021410994121525e-06, | |
| "loss": 0.1658, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.46035805626598464, | |
| "grad_norm": 0.40365524431178995, | |
| "learning_rate": 6.6045485097126585e-06, | |
| "loss": 0.162, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.4859335038363171, | |
| "grad_norm": 0.38726639073470187, | |
| "learning_rate": 6.1748406020824115e-06, | |
| "loss": 0.1603, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.5115089514066496, | |
| "grad_norm": 0.3760876442217351, | |
| "learning_rate": 5.735727354158581e-06, | |
| "loss": 0.1607, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.5370843989769821, | |
| "grad_norm": 0.3698546651087282, | |
| "learning_rate": 5.290724144552379e-06, | |
| "loss": 0.1566, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.5626598465473146, | |
| "grad_norm": 0.378775457895615, | |
| "learning_rate": 4.8433935047346e-06, | |
| "loss": 0.1617, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.5882352941176471, | |
| "grad_norm": 0.37700547911055976, | |
| "learning_rate": 4.397316598723385e-06, | |
| "loss": 0.1577, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.6138107416879796, | |
| "grad_norm": 0.34954174520163833, | |
| "learning_rate": 3.956064553606708e-06, | |
| "loss": 0.1595, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.639386189258312, | |
| "grad_norm": 0.357317340726361, | |
| "learning_rate": 3.523169870416795e-06, | |
| "loss": 0.154, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.6649616368286445, | |
| "grad_norm": 0.3684051566901591, | |
| "learning_rate": 3.1020981442305187e-06, | |
| "loss": 0.1531, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.690537084398977, | |
| "grad_norm": 0.3895498086957201, | |
| "learning_rate": 2.6962203198941587e-06, | |
| "loss": 0.1579, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.7161125319693095, | |
| "grad_norm": 0.3731438076456078, | |
| "learning_rate": 2.308785705482982e-06, | |
| "loss": 0.1586, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.7416879795396419, | |
| "grad_norm": 0.41906481058101075, | |
| "learning_rate": 1.942895959539939e-06, | |
| "loss": 0.1575, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.7672634271099744, | |
| "grad_norm": 0.3896314041500002, | |
| "learning_rate": 1.6014802603420044e-06, | |
| "loss": 0.1553, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.7928388746803069, | |
| "grad_norm": 0.385720584527069, | |
| "learning_rate": 1.2872718559798852e-06, | |
| "loss": 0.1532, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.8184143222506394, | |
| "grad_norm": 0.32351933058141824, | |
| "learning_rate": 1.0027861829824953e-06, | |
| "loss": 0.1562, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.8439897698209718, | |
| "grad_norm": 0.3670621953960731, | |
| "learning_rate": 7.50300728660407e-07, | |
| "loss": 0.155, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.8695652173913043, | |
| "grad_norm": 0.3829568746421653, | |
| "learning_rate": 5.318367983829393e-07, | |
| "loss": 0.1523, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.8951406649616368, | |
| "grad_norm": 0.3775945531822611, | |
| "learning_rate": 3.49143333753309e-07, | |
| "loss": 0.1521, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.9207161125319693, | |
| "grad_norm": 0.36031263149371584, | |
| "learning_rate": 2.0368291122759898e-07, | |
| "loss": 0.1523, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.9462915601023018, | |
| "grad_norm": 0.3348863130660584, | |
| "learning_rate": 9.662003326740166e-08, | |
| "loss": 0.1514, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.9718670076726342, | |
| "grad_norm": 0.360326022397128, | |
| "learning_rate": 2.8811805762860578e-08, | |
| "loss": 0.1501, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.9974424552429667, | |
| "grad_norm": 0.37241654765357973, | |
| "learning_rate": 8.010763592264381e-10, | |
| "loss": 0.152, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "step": 391, | |
| "total_flos": 24007129694208.0, | |
| "train_loss": 0.18847447641365364, | |
| "train_runtime": 1067.9437, | |
| "train_samples_per_second": 46.819, | |
| "train_steps_per_second": 0.366 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 391, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 24007129694208.0, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |