{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 3878, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.025786487880350695, "grad_norm": 0.1338733434677124, "learning_rate": 6.666666666666667e-06, "loss": 2.5125, "step": 100 }, { "epoch": 0.05157297576070139, "grad_norm": 0.17936278879642487, "learning_rate": 1.3333333333333333e-05, "loss": 2.4886, "step": 200 }, { "epoch": 0.07735946364105209, "grad_norm": 0.2758166491985321, "learning_rate": 2e-05, "loss": 2.4178, "step": 300 }, { "epoch": 0.10314595152140278, "grad_norm": 0.38494062423706055, "learning_rate": 1.9961477891705203e-05, "loss": 2.4016, "step": 400 }, { "epoch": 0.12893243940175347, "grad_norm": 0.408550888299942, "learning_rate": 1.98462083573863e-05, "loss": 2.3598, "step": 500 }, { "epoch": 0.15471892728210418, "grad_norm": 0.43653976917266846, "learning_rate": 1.9655079482140115e-05, "loss": 2.3474, "step": 600 }, { "epoch": 0.18050541516245489, "grad_norm": 0.5025712251663208, "learning_rate": 1.9389563803412753e-05, "loss": 2.3225, "step": 700 }, { "epoch": 0.20629190304280556, "grad_norm": 0.565731406211853, "learning_rate": 1.9051706965950192e-05, "loss": 2.2821, "step": 800 }, { "epoch": 0.23207839092315627, "grad_norm": 0.5192058682441711, "learning_rate": 1.8644111961288605e-05, "loss": 2.3188, "step": 900 }, { "epoch": 0.25786487880350695, "grad_norm": 0.8693593740463257, "learning_rate": 1.816991907320999e-05, "loss": 2.2895, "step": 1000 }, { "epoch": 0.28365136668385765, "grad_norm": 0.6395590901374817, "learning_rate": 1.7632781683671787e-05, "loss": 2.2892, "step": 1100 }, { "epoch": 0.30943785456420836, "grad_norm": 0.7124245166778564, "learning_rate": 1.703683812561179e-05, "loss": 2.2695, "step": 1200 }, { "epoch": 0.33522434244455906, "grad_norm": 0.6226526498794556, "learning_rate": 1.6386679799486236e-05, "loss": 2.2573, "step": 1300 }, { "epoch": 0.36101083032490977, "grad_norm": 0.7176969051361084, "learning_rate": 1.568731579918468e-05, "loss": 2.2483, "step": 1400 }, { "epoch": 0.3867973182052604, "grad_norm": 0.6624855399131775, "learning_rate": 1.494413431985854e-05, "loss": 2.2084, "step": 1500 }, { "epoch": 0.4125838060856111, "grad_norm": 0.669903039932251, "learning_rate": 1.4162861144993671e-05, "loss": 2.2376, "step": 1600 }, { "epoch": 0.43837029396596183, "grad_norm": 0.7109538316726685, "learning_rate": 1.3349515532560074e-05, "loss": 2.2318, "step": 1700 }, { "epoch": 0.46415678184631254, "grad_norm": 0.7055323123931885, "learning_rate": 1.2510363840110396e-05, "loss": 2.2585, "step": 1800 }, { "epoch": 0.48994326972666324, "grad_norm": 0.8980267643928528, "learning_rate": 1.1651871246119102e-05, "loss": 2.2008, "step": 1900 }, { "epoch": 0.5157297576070139, "grad_norm": 0.774202287197113, "learning_rate": 1.0780651939521396e-05, "loss": 2.2063, "step": 2000 }, { "epoch": 0.5415162454873647, "grad_norm": 0.9350955486297607, "learning_rate": 9.903418161212732e-06, "loss": 2.2451, "step": 2100 }, { "epoch": 0.5673027333677153, "grad_norm": 0.9068596959114075, "learning_rate": 9.026928490114683e-06, "loss": 2.2293, "step": 2200 }, { "epoch": 0.5930892212480661, "grad_norm": 0.7404003739356995, "learning_rate": 8.15793577223311e-06, "loss": 2.2155, "step": 2300 }, { "epoch": 0.6188757091284167, "grad_norm": 0.8607114553451538, "learning_rate": 7.303135093885141e-06, "loss": 2.2158, "step": 2400 }, { "epoch": 0.6446621970087674, "grad_norm": 0.7401602864265442, "learning_rate": 6.469112199931131e-06, "loss": 2.1806, "step": 2500 }, { "epoch": 0.6704486848891181, "grad_norm": 0.818596363067627, "learning_rate": 5.662292754419332e-06, "loss": 2.1965, "step": 2600 }, { "epoch": 0.6962351727694688, "grad_norm": 0.9028757810592651, "learning_rate": 4.888892834560608e-06, "loss": 2.2402, "step": 2700 }, { "epoch": 0.7220216606498195, "grad_norm": 0.8473127484321594, "learning_rate": 4.154871039448561e-06, "loss": 2.2167, "step": 2800 }, { "epoch": 0.7478081485301702, "grad_norm": 0.7806875705718994, "learning_rate": 3.4658825824996036e-06, "loss": 2.2116, "step": 2900 }, { "epoch": 0.7735946364105208, "grad_norm": 0.7143692970275879, "learning_rate": 2.82723572130422e-06, "loss": 2.2068, "step": 3000 }, { "epoch": 0.7993811242908716, "grad_norm": 0.8515029549598694, "learning_rate": 2.243850860572239e-06, "loss": 2.1972, "step": 3100 }, { "epoch": 0.8251676121712223, "grad_norm": 0.770374596118927, "learning_rate": 1.7202226432601833e-06, "loss": 2.2316, "step": 3200 }, { "epoch": 0.850954100051573, "grad_norm": 0.7793726325035095, "learning_rate": 1.260385321946761e-06, "loss": 2.2046, "step": 3300 }, { "epoch": 0.8767405879319237, "grad_norm": 1.0234565734863281, "learning_rate": 8.678816772498988e-07, "loss": 2.1848, "step": 3400 }, { "epoch": 0.9025270758122743, "grad_norm": 0.8092569708824158, "learning_rate": 5.457357227510152e-07, "loss": 2.2268, "step": 3500 }, { "epoch": 0.9283135636926251, "grad_norm": 0.9072908759117126, "learning_rate": 2.964294067193008e-07, "loss": 2.2251, "step": 3600 }, { "epoch": 0.9541000515729757, "grad_norm": 0.6942015886306763, "learning_rate": 1.2188349013570356e-07, "loss": 2.2285, "step": 3700 }, { "epoch": 0.9798865394533265, "grad_norm": 0.8001830577850342, "learning_rate": 2.344274834043425e-08, "loss": 2.2278, "step": 3800 }, { "epoch": 1.0, "step": 3878, "total_flos": 7.04757648064512e+16, "train_loss": 2.263120666000264, "train_runtime": 1207.6216, "train_samples_per_second": 6.423, "train_steps_per_second": 3.211 } ], "logging_steps": 100, "max_steps": 3878, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.04757648064512e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }