{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 276, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018115942028985508, "grad_norm": 1.1163504123687744, "learning_rate": 1.7391304347826088e-06, "loss": 1.3426, "step": 5 }, { "epoch": 0.036231884057971016, "grad_norm": 0.8822958469390869, "learning_rate": 3.913043478260869e-06, "loss": 1.3299, "step": 10 }, { "epoch": 0.05434782608695652, "grad_norm": 0.6369628310203552, "learning_rate": 6.086956521739131e-06, "loss": 1.3549, "step": 15 }, { "epoch": 0.07246376811594203, "grad_norm": 0.6771602630615234, "learning_rate": 8.260869565217392e-06, "loss": 1.2763, "step": 20 }, { "epoch": 0.09057971014492754, "grad_norm": 0.6691910624504089, "learning_rate": 1.0434782608695653e-05, "loss": 1.2558, "step": 25 }, { "epoch": 0.10869565217391304, "grad_norm": 0.6733030676841736, "learning_rate": 1.2608695652173912e-05, "loss": 1.2399, "step": 30 }, { "epoch": 0.12681159420289856, "grad_norm": 0.6181926727294922, "learning_rate": 1.4782608695652174e-05, "loss": 1.223, "step": 35 }, { "epoch": 0.14492753623188406, "grad_norm": 0.5464959144592285, "learning_rate": 1.6956521739130433e-05, "loss": 1.2229, "step": 40 }, { "epoch": 0.16304347826086957, "grad_norm": 0.6214764714241028, "learning_rate": 1.9130434782608694e-05, "loss": 1.2244, "step": 45 }, { "epoch": 0.18115942028985507, "grad_norm": 0.616301417350769, "learning_rate": 2.1304347826086958e-05, "loss": 1.1924, "step": 50 }, { "epoch": 0.19927536231884058, "grad_norm": 0.5678027272224426, "learning_rate": 2.347826086956522e-05, "loss": 1.1545, "step": 55 }, { "epoch": 0.21739130434782608, "grad_norm": 0.5150833129882812, "learning_rate": 2.565217391304348e-05, "loss": 1.1455, "step": 60 }, { "epoch": 0.23550724637681159, "grad_norm": 0.5843022465705872, "learning_rate": 2.782608695652174e-05, "loss": 1.1364, "step": 65 }, { "epoch": 0.2536231884057971, "grad_norm": 0.5521677136421204, "learning_rate": 3e-05, "loss": 1.1215, "step": 70 }, { "epoch": 0.2717391304347826, "grad_norm": 0.7375545501708984, "learning_rate": 2.999892331059753e-05, "loss": 1.0701, "step": 75 }, { "epoch": 0.2898550724637681, "grad_norm": 0.615342915058136, "learning_rate": 2.999569339695812e-05, "loss": 1.1818, "step": 80 }, { "epoch": 0.3079710144927536, "grad_norm": 0.7234134078025818, "learning_rate": 2.9990310722763616e-05, "loss": 1.0776, "step": 85 }, { "epoch": 0.32608695652173914, "grad_norm": 0.6241700053215027, "learning_rate": 2.9982776060743112e-05, "loss": 1.1307, "step": 90 }, { "epoch": 0.3442028985507246, "grad_norm": 0.5964018106460571, "learning_rate": 2.9973090492562048e-05, "loss": 1.0532, "step": 95 }, { "epoch": 0.36231884057971014, "grad_norm": 0.6742740273475647, "learning_rate": 2.9961255408666903e-05, "loss": 1.0253, "step": 100 }, { "epoch": 0.3804347826086957, "grad_norm": 0.7033436894416809, "learning_rate": 2.99472725080856e-05, "loss": 1.0282, "step": 105 }, { "epoch": 0.39855072463768115, "grad_norm": 0.745764970779419, "learning_rate": 2.9931143798183588e-05, "loss": 1.0426, "step": 110 }, { "epoch": 0.4166666666666667, "grad_norm": 0.6261528730392456, "learning_rate": 2.9912871594375667e-05, "loss": 1.0102, "step": 115 }, { "epoch": 0.43478260869565216, "grad_norm": 0.7047893404960632, "learning_rate": 2.98924585197936e-05, "loss": 1.0398, "step": 120 }, { "epoch": 0.4528985507246377, "grad_norm": 0.697711706161499, "learning_rate": 2.9869907504909532e-05, "loss": 0.9616, "step": 125 }, { "epoch": 0.47101449275362317, "grad_norm": 0.6377936005592346, "learning_rate": 2.984522178711529e-05, "loss": 0.9547, "step": 130 }, { "epoch": 0.4891304347826087, "grad_norm": 0.7725468277931213, "learning_rate": 2.9818404910257645e-05, "loss": 0.9385, "step": 135 }, { "epoch": 0.5072463768115942, "grad_norm": 0.7378762364387512, "learning_rate": 2.9789460724129545e-05, "loss": 0.9456, "step": 140 }, { "epoch": 0.5253623188405797, "grad_norm": 0.7447887659072876, "learning_rate": 2.9758393383917447e-05, "loss": 0.9714, "step": 145 }, { "epoch": 0.5434782608695652, "grad_norm": 0.7833755612373352, "learning_rate": 2.9725207349604823e-05, "loss": 0.9186, "step": 150 }, { "epoch": 0.5615942028985508, "grad_norm": 0.8524132370948792, "learning_rate": 2.968990738533186e-05, "loss": 0.9178, "step": 155 }, { "epoch": 0.5797101449275363, "grad_norm": 0.7204749584197998, "learning_rate": 2.965249855871155e-05, "loss": 0.8927, "step": 160 }, { "epoch": 0.5978260869565217, "grad_norm": 0.8009870648384094, "learning_rate": 2.961298624010219e-05, "loss": 0.8965, "step": 165 }, { "epoch": 0.6159420289855072, "grad_norm": 0.8811512589454651, "learning_rate": 2.9571376101836397e-05, "loss": 0.8726, "step": 170 }, { "epoch": 0.6340579710144928, "grad_norm": 0.8356473445892334, "learning_rate": 2.9527674117406834e-05, "loss": 0.8058, "step": 175 }, { "epoch": 0.6521739130434783, "grad_norm": 0.9342594146728516, "learning_rate": 2.948188656060864e-05, "loss": 0.8383, "step": 180 }, { "epoch": 0.6702898550724637, "grad_norm": 0.9308671355247498, "learning_rate": 2.9434020004638757e-05, "loss": 0.8527, "step": 185 }, { "epoch": 0.6884057971014492, "grad_norm": 0.7293853759765625, "learning_rate": 2.9384081321152335e-05, "loss": 0.7734, "step": 190 }, { "epoch": 0.7065217391304348, "grad_norm": 1.030664324760437, "learning_rate": 2.9332077679276206e-05, "loss": 0.7932, "step": 195 }, { "epoch": 0.7246376811594203, "grad_norm": 0.9539803266525269, "learning_rate": 2.927801654457972e-05, "loss": 0.8153, "step": 200 }, { "epoch": 0.7427536231884058, "grad_norm": 0.9612506628036499, "learning_rate": 2.9221905678002982e-05, "loss": 0.7897, "step": 205 }, { "epoch": 0.7608695652173914, "grad_norm": 0.8908689022064209, "learning_rate": 2.9163753134742716e-05, "loss": 0.7576, "step": 210 }, { "epoch": 0.7789855072463768, "grad_norm": 0.9448806643486023, "learning_rate": 2.910356726309586e-05, "loss": 0.775, "step": 215 }, { "epoch": 0.7971014492753623, "grad_norm": 1.0052385330200195, "learning_rate": 2.9041356703261108e-05, "loss": 0.7431, "step": 220 }, { "epoch": 0.8152173913043478, "grad_norm": 1.0107821226119995, "learning_rate": 2.8977130386098525e-05, "loss": 0.7506, "step": 225 }, { "epoch": 0.8333333333333334, "grad_norm": 0.9508954882621765, "learning_rate": 2.8910897531847447e-05, "loss": 0.7291, "step": 230 }, { "epoch": 0.8514492753623188, "grad_norm": 1.0560728311538696, "learning_rate": 2.8842667648802847e-05, "loss": 0.766, "step": 235 }, { "epoch": 0.8695652173913043, "grad_norm": 0.9395149350166321, "learning_rate": 2.877245053195033e-05, "loss": 0.6953, "step": 240 }, { "epoch": 0.8876811594202898, "grad_norm": 1.1801338195800781, "learning_rate": 2.8700256261559962e-05, "loss": 0.7201, "step": 245 }, { "epoch": 0.9057971014492754, "grad_norm": 1.1043130159378052, "learning_rate": 2.8626095201739206e-05, "loss": 0.6939, "step": 250 }, { "epoch": 0.9239130434782609, "grad_norm": 0.933818519115448, "learning_rate": 2.8549977998945003e-05, "loss": 0.7031, "step": 255 }, { "epoch": 0.9420289855072463, "grad_norm": 1.1428236961364746, "learning_rate": 2.847191558045544e-05, "loss": 0.6895, "step": 260 }, { "epoch": 0.9601449275362319, "grad_norm": 0.9816652536392212, "learning_rate": 2.839191915280102e-05, "loss": 0.6698, "step": 265 }, { "epoch": 0.9782608695652174, "grad_norm": 1.0942076444625854, "learning_rate": 2.831000020015585e-05, "loss": 0.6648, "step": 270 }, { "epoch": 0.9963768115942029, "grad_norm": 1.0711333751678467, "learning_rate": 2.8226170482689022e-05, "loss": 0.6473, "step": 275 } ], "logging_steps": 5, "max_steps": 1380, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.610379630160118e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }