{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 276, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018148820326678767, "grad_norm": 1.0784239768981934, "learning_rate": 1.7391304347826088e-06, "loss": 1.2027, "step": 5 }, { "epoch": 0.036297640653357534, "grad_norm": 0.8809816837310791, "learning_rate": 3.913043478260869e-06, "loss": 1.2648, "step": 10 }, { "epoch": 0.0544464609800363, "grad_norm": 0.6940921545028687, "learning_rate": 6.086956521739131e-06, "loss": 1.263, "step": 15 }, { "epoch": 0.07259528130671507, "grad_norm": 0.7154361009597778, "learning_rate": 8.260869565217392e-06, "loss": 1.2225, "step": 20 }, { "epoch": 0.09074410163339383, "grad_norm": 0.6354780197143555, "learning_rate": 1.0434782608695653e-05, "loss": 1.2134, "step": 25 }, { "epoch": 0.1088929219600726, "grad_norm": 0.6585204005241394, "learning_rate": 1.2608695652173912e-05, "loss": 1.14, "step": 30 }, { "epoch": 0.12704174228675136, "grad_norm": 0.5307602286338806, "learning_rate": 1.4782608695652174e-05, "loss": 1.128, "step": 35 }, { "epoch": 0.14519056261343014, "grad_norm": 0.5485066771507263, "learning_rate": 1.6956521739130433e-05, "loss": 1.1518, "step": 40 }, { "epoch": 0.16333938294010888, "grad_norm": 0.7185664772987366, "learning_rate": 1.9130434782608694e-05, "loss": 1.1149, "step": 45 }, { "epoch": 0.18148820326678766, "grad_norm": 0.51127028465271, "learning_rate": 2.1304347826086958e-05, "loss": 1.143, "step": 50 }, { "epoch": 0.1996370235934664, "grad_norm": 0.535865843296051, "learning_rate": 2.347826086956522e-05, "loss": 1.1259, "step": 55 }, { "epoch": 0.2177858439201452, "grad_norm": 0.5300245881080627, "learning_rate": 2.565217391304348e-05, "loss": 1.0789, "step": 60 }, { "epoch": 0.23593466424682397, "grad_norm": 0.5276510715484619, "learning_rate": 2.782608695652174e-05, "loss": 1.0811, "step": 65 }, { "epoch": 0.2540834845735027, "grad_norm": 0.5264666676521301, "learning_rate": 3e-05, "loss": 1.0645, "step": 70 }, { "epoch": 0.27223230490018147, "grad_norm": 0.5153283476829529, "learning_rate": 2.999892331059753e-05, "loss": 1.1043, "step": 75 }, { "epoch": 0.29038112522686027, "grad_norm": 0.533360481262207, "learning_rate": 2.999569339695812e-05, "loss": 1.0288, "step": 80 }, { "epoch": 0.308529945553539, "grad_norm": 0.6700855493545532, "learning_rate": 2.9990310722763616e-05, "loss": 1.0037, "step": 85 }, { "epoch": 0.32667876588021777, "grad_norm": 0.6054568290710449, "learning_rate": 2.9982776060743112e-05, "loss": 1.0362, "step": 90 }, { "epoch": 0.3448275862068966, "grad_norm": 0.9136693477630615, "learning_rate": 2.9973090492562048e-05, "loss": 1.0228, "step": 95 }, { "epoch": 0.3629764065335753, "grad_norm": 0.609896183013916, "learning_rate": 2.9961255408666903e-05, "loss": 0.9895, "step": 100 }, { "epoch": 0.3811252268602541, "grad_norm": 0.657203733921051, "learning_rate": 2.99472725080856e-05, "loss": 0.9758, "step": 105 }, { "epoch": 0.3992740471869328, "grad_norm": 0.7311093211174011, "learning_rate": 2.9931143798183588e-05, "loss": 0.9626, "step": 110 }, { "epoch": 0.41742286751361163, "grad_norm": 0.6201342940330505, "learning_rate": 2.9912871594375667e-05, "loss": 0.9704, "step": 115 }, { "epoch": 0.4355716878402904, "grad_norm": 0.6121538281440735, "learning_rate": 2.98924585197936e-05, "loss": 0.931, "step": 120 }, { "epoch": 0.4537205081669691, "grad_norm": 0.6566389799118042, "learning_rate": 2.9869907504909532e-05, "loss": 0.9246, "step": 125 }, { "epoch": 0.47186932849364793, "grad_norm": 0.6150594353675842, "learning_rate": 2.984522178711529e-05, "loss": 0.9232, "step": 130 }, { "epoch": 0.4900181488203267, "grad_norm": 0.68607497215271, "learning_rate": 2.9818404910257645e-05, "loss": 0.8966, "step": 135 }, { "epoch": 0.5081669691470054, "grad_norm": 0.6849779486656189, "learning_rate": 2.9789460724129545e-05, "loss": 0.9419, "step": 140 }, { "epoch": 0.5263157894736842, "grad_norm": 0.8690723180770874, "learning_rate": 2.9758393383917447e-05, "loss": 0.8704, "step": 145 }, { "epoch": 0.5444646098003629, "grad_norm": 0.962196409702301, "learning_rate": 2.9725207349604823e-05, "loss": 0.9073, "step": 150 }, { "epoch": 0.5626134301270418, "grad_norm": 0.7416574954986572, "learning_rate": 2.968990738533186e-05, "loss": 0.9249, "step": 155 }, { "epoch": 0.5807622504537205, "grad_norm": 0.7890663146972656, "learning_rate": 2.965249855871155e-05, "loss": 0.8781, "step": 160 }, { "epoch": 0.5989110707803993, "grad_norm": 0.814849317073822, "learning_rate": 2.961298624010219e-05, "loss": 0.8271, "step": 165 }, { "epoch": 0.617059891107078, "grad_norm": 0.8292708396911621, "learning_rate": 2.9571376101836397e-05, "loss": 0.8268, "step": 170 }, { "epoch": 0.6352087114337568, "grad_norm": 0.8682757019996643, "learning_rate": 2.9527674117406834e-05, "loss": 0.7837, "step": 175 }, { "epoch": 0.6533575317604355, "grad_norm": 0.7653095126152039, "learning_rate": 2.948188656060864e-05, "loss": 0.8026, "step": 180 }, { "epoch": 0.6715063520871143, "grad_norm": 1.0673071146011353, "learning_rate": 2.9434020004638757e-05, "loss": 0.7887, "step": 185 }, { "epoch": 0.6896551724137931, "grad_norm": 0.869884192943573, "learning_rate": 2.9384081321152335e-05, "loss": 0.7971, "step": 190 }, { "epoch": 0.7078039927404719, "grad_norm": 0.926493227481842, "learning_rate": 2.9332077679276206e-05, "loss": 0.7297, "step": 195 }, { "epoch": 0.7259528130671506, "grad_norm": 0.8653711080551147, "learning_rate": 2.927801654457972e-05, "loss": 0.7314, "step": 200 }, { "epoch": 0.7441016333938294, "grad_norm": 0.8328022956848145, "learning_rate": 2.9221905678002982e-05, "loss": 0.7543, "step": 205 }, { "epoch": 0.7622504537205081, "grad_norm": 0.8467786908149719, "learning_rate": 2.9163753134742716e-05, "loss": 0.7292, "step": 210 }, { "epoch": 0.7803992740471869, "grad_norm": 0.9099471569061279, "learning_rate": 2.910356726309586e-05, "loss": 0.7632, "step": 215 }, { "epoch": 0.7985480943738656, "grad_norm": 0.8697426915168762, "learning_rate": 2.9041356703261108e-05, "loss": 0.6898, "step": 220 }, { "epoch": 0.8166969147005445, "grad_norm": 0.8971152305603027, "learning_rate": 2.8977130386098525e-05, "loss": 0.6785, "step": 225 }, { "epoch": 0.8348457350272233, "grad_norm": 0.9118236303329468, "learning_rate": 2.8910897531847447e-05, "loss": 0.6571, "step": 230 }, { "epoch": 0.852994555353902, "grad_norm": 0.8698996901512146, "learning_rate": 2.8842667648802847e-05, "loss": 0.6911, "step": 235 }, { "epoch": 0.8711433756805808, "grad_norm": 0.9623836874961853, "learning_rate": 2.877245053195033e-05, "loss": 0.6313, "step": 240 }, { "epoch": 0.8892921960072595, "grad_norm": 0.9111562371253967, "learning_rate": 2.8700256261559962e-05, "loss": 0.6695, "step": 245 }, { "epoch": 0.9074410163339383, "grad_norm": 0.8837359547615051, "learning_rate": 2.8626095201739206e-05, "loss": 0.6461, "step": 250 }, { "epoch": 0.925589836660617, "grad_norm": 1.2072163820266724, "learning_rate": 2.8549977998945003e-05, "loss": 0.6622, "step": 255 }, { "epoch": 0.9437386569872959, "grad_norm": 0.987786591053009, "learning_rate": 2.847191558045544e-05, "loss": 0.6912, "step": 260 }, { "epoch": 0.9618874773139746, "grad_norm": 0.9701903462409973, "learning_rate": 2.839191915280102e-05, "loss": 0.6322, "step": 265 }, { "epoch": 0.9800362976406534, "grad_norm": 1.2309809923171997, "learning_rate": 2.831000020015585e-05, "loss": 0.5662, "step": 270 }, { "epoch": 0.9981851179673321, "grad_norm": 1.0858169794082642, "learning_rate": 2.8226170482689022e-05, "loss": 0.6321, "step": 275 } ], "logging_steps": 5, "max_steps": 1380, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.8427147710234624e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }