{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 680, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014713996689350745, "grad_norm": 1.3101017475128174, "learning_rate": 9.86764705882353e-05, "loss": 2.9497, "step": 10 }, { "epoch": 0.02942799337870149, "grad_norm": 1.7666537761688232, "learning_rate": 9.720588235294117e-05, "loss": 2.5707, "step": 20 }, { "epoch": 0.044141990068052236, "grad_norm": 1.4447238445281982, "learning_rate": 9.573529411764707e-05, "loss": 2.281, "step": 30 }, { "epoch": 0.05885598675740298, "grad_norm": 1.039144515991211, "learning_rate": 9.426470588235294e-05, "loss": 2.3904, "step": 40 }, { "epoch": 0.07356998344675372, "grad_norm": 1.2274360656738281, "learning_rate": 9.279411764705884e-05, "loss": 2.5839, "step": 50 }, { "epoch": 0.08828398013610447, "grad_norm": 1.1105211973190308, "learning_rate": 9.13235294117647e-05, "loss": 2.2987, "step": 60 }, { "epoch": 0.10299797682545521, "grad_norm": 1.2184361219406128, "learning_rate": 8.98529411764706e-05, "loss": 2.5504, "step": 70 }, { "epoch": 0.11771197351480596, "grad_norm": 1.1057401895523071, "learning_rate": 8.838235294117647e-05, "loss": 2.4922, "step": 80 }, { "epoch": 0.1324259702041567, "grad_norm": 1.1191836595535278, "learning_rate": 8.691176470588237e-05, "loss": 2.6277, "step": 90 }, { "epoch": 0.14713996689350745, "grad_norm": 1.2518597841262817, "learning_rate": 8.544117647058823e-05, "loss": 2.1856, "step": 100 }, { "epoch": 0.1618539635828582, "grad_norm": 1.2593870162963867, "learning_rate": 8.397058823529412e-05, "loss": 2.39, "step": 110 }, { "epoch": 0.17656796027220895, "grad_norm": 1.5582458972930908, "learning_rate": 8.25e-05, "loss": 2.2159, "step": 120 }, { "epoch": 0.1912819569615597, "grad_norm": 1.1720651388168335, "learning_rate": 8.102941176470588e-05, "loss": 2.5332, "step": 130 }, { "epoch": 0.20599595365091042, "grad_norm": 1.3537250757217407, "learning_rate": 7.955882352941176e-05, "loss": 2.549, "step": 140 }, { "epoch": 0.22070995034026117, "grad_norm": 1.440487265586853, "learning_rate": 7.808823529411765e-05, "loss": 2.6141, "step": 150 }, { "epoch": 0.23542394702961192, "grad_norm": 1.2468889951705933, "learning_rate": 7.661764705882354e-05, "loss": 2.1802, "step": 160 }, { "epoch": 0.25013794371896264, "grad_norm": 1.6108534336090088, "learning_rate": 7.514705882352941e-05, "loss": 2.0655, "step": 170 }, { "epoch": 0.2648519404083134, "grad_norm": 1.454962968826294, "learning_rate": 7.367647058823531e-05, "loss": 2.3067, "step": 180 }, { "epoch": 0.27956593709766414, "grad_norm": 1.4071511030197144, "learning_rate": 7.220588235294118e-05, "loss": 2.378, "step": 190 }, { "epoch": 0.2942799337870149, "grad_norm": 1.7635990381240845, "learning_rate": 7.073529411764707e-05, "loss": 2.6275, "step": 200 }, { "epoch": 0.30899393047636564, "grad_norm": 1.4291043281555176, "learning_rate": 6.926470588235294e-05, "loss": 1.8871, "step": 210 }, { "epoch": 0.3237079271657164, "grad_norm": 1.5699518918991089, "learning_rate": 6.779411764705882e-05, "loss": 2.3781, "step": 220 }, { "epoch": 0.33842192385506714, "grad_norm": 1.5319325923919678, "learning_rate": 6.632352941176471e-05, "loss": 2.1488, "step": 230 }, { "epoch": 0.3531359205444179, "grad_norm": 1.505298376083374, "learning_rate": 6.485294117647059e-05, "loss": 2.6991, "step": 240 }, { "epoch": 0.36784991723376864, "grad_norm": 2.733734607696533, "learning_rate": 6.338235294117647e-05, "loss": 2.1978, "step": 250 }, { "epoch": 0.3825639139231194, "grad_norm": 1.6058900356292725, "learning_rate": 6.191176470588235e-05, "loss": 2.1871, "step": 260 }, { "epoch": 0.3972779106124701, "grad_norm": 2.072322130203247, "learning_rate": 6.044117647058824e-05, "loss": 2.5621, "step": 270 }, { "epoch": 0.41199190730182084, "grad_norm": 1.765065312385559, "learning_rate": 5.897058823529412e-05, "loss": 2.1595, "step": 280 }, { "epoch": 0.4267059039911716, "grad_norm": 1.5139451026916504, "learning_rate": 5.7499999999999995e-05, "loss": 2.2917, "step": 290 }, { "epoch": 0.44141990068052234, "grad_norm": 1.7041040658950806, "learning_rate": 5.6029411764705884e-05, "loss": 2.4275, "step": 300 }, { "epoch": 0.4561338973698731, "grad_norm": 1.5826152563095093, "learning_rate": 5.455882352941176e-05, "loss": 2.5011, "step": 310 }, { "epoch": 0.47084789405922384, "grad_norm": 1.692821979522705, "learning_rate": 5.308823529411765e-05, "loss": 2.3798, "step": 320 }, { "epoch": 0.4855618907485746, "grad_norm": 1.8263252973556519, "learning_rate": 5.161764705882354e-05, "loss": 2.6645, "step": 330 }, { "epoch": 0.5002758874379253, "grad_norm": 1.7093111276626587, "learning_rate": 5.0147058823529414e-05, "loss": 2.4835, "step": 340 }, { "epoch": 0.5149898841272761, "grad_norm": 1.7787758111953735, "learning_rate": 4.86764705882353e-05, "loss": 2.3779, "step": 350 }, { "epoch": 0.5297038808166268, "grad_norm": 1.7382874488830566, "learning_rate": 4.720588235294118e-05, "loss": 2.3799, "step": 360 }, { "epoch": 0.5444178775059776, "grad_norm": 1.797096848487854, "learning_rate": 4.573529411764706e-05, "loss": 2.2148, "step": 370 }, { "epoch": 0.5591318741953283, "grad_norm": 1.781445860862732, "learning_rate": 4.4264705882352944e-05, "loss": 2.0308, "step": 380 }, { "epoch": 0.5738458708846791, "grad_norm": 2.1444976329803467, "learning_rate": 4.2794117647058827e-05, "loss": 2.2169, "step": 390 }, { "epoch": 0.5885598675740298, "grad_norm": 1.8745927810668945, "learning_rate": 4.13235294117647e-05, "loss": 1.9554, "step": 400 }, { "epoch": 0.6032738642633806, "grad_norm": 1.5899605751037598, "learning_rate": 3.985294117647059e-05, "loss": 2.4028, "step": 410 }, { "epoch": 0.6179878609527313, "grad_norm": 1.9187322854995728, "learning_rate": 3.8382352941176474e-05, "loss": 2.2514, "step": 420 }, { "epoch": 0.632701857642082, "grad_norm": 1.5589262247085571, "learning_rate": 3.6911764705882356e-05, "loss": 2.343, "step": 430 }, { "epoch": 0.6474158543314328, "grad_norm": 1.5984998941421509, "learning_rate": 3.544117647058824e-05, "loss": 1.9433, "step": 440 }, { "epoch": 0.6621298510207835, "grad_norm": 1.9034677743911743, "learning_rate": 3.397058823529412e-05, "loss": 2.2025, "step": 450 }, { "epoch": 0.6768438477101343, "grad_norm": 1.7389880418777466, "learning_rate": 3.2500000000000004e-05, "loss": 2.3835, "step": 460 }, { "epoch": 0.691557844399485, "grad_norm": 1.8547667264938354, "learning_rate": 3.1029411764705886e-05, "loss": 2.2641, "step": 470 }, { "epoch": 0.7062718410888358, "grad_norm": 1.6838593482971191, "learning_rate": 2.9558823529411766e-05, "loss": 1.8679, "step": 480 }, { "epoch": 0.7209858377781865, "grad_norm": 2.003615617752075, "learning_rate": 2.8088235294117648e-05, "loss": 2.2402, "step": 490 }, { "epoch": 0.7356998344675373, "grad_norm": 1.7946232557296753, "learning_rate": 2.661764705882353e-05, "loss": 1.9735, "step": 500 }, { "epoch": 0.750413831156888, "grad_norm": 1.9260705709457397, "learning_rate": 2.5147058823529413e-05, "loss": 2.3836, "step": 510 }, { "epoch": 0.7651278278462388, "grad_norm": 1.3813334703445435, "learning_rate": 2.3676470588235295e-05, "loss": 2.4856, "step": 520 }, { "epoch": 0.7798418245355895, "grad_norm": 1.6835263967514038, "learning_rate": 2.2205882352941178e-05, "loss": 2.3702, "step": 530 }, { "epoch": 0.7945558212249402, "grad_norm": 2.234776496887207, "learning_rate": 2.073529411764706e-05, "loss": 2.6312, "step": 540 }, { "epoch": 0.809269817914291, "grad_norm": 1.4896553754806519, "learning_rate": 1.9264705882352943e-05, "loss": 2.2481, "step": 550 }, { "epoch": 0.8239838146036417, "grad_norm": 1.92662513256073, "learning_rate": 1.7794117647058825e-05, "loss": 1.9659, "step": 560 }, { "epoch": 0.8386978112929925, "grad_norm": 1.4436355829238892, "learning_rate": 1.6323529411764708e-05, "loss": 2.3563, "step": 570 }, { "epoch": 0.8534118079823432, "grad_norm": 1.8207820653915405, "learning_rate": 1.4852941176470589e-05, "loss": 2.0522, "step": 580 }, { "epoch": 0.868125804671694, "grad_norm": 1.8331910371780396, "learning_rate": 1.3382352941176471e-05, "loss": 2.4762, "step": 590 }, { "epoch": 0.8828398013610447, "grad_norm": 1.7300273180007935, "learning_rate": 1.1911764705882354e-05, "loss": 2.4929, "step": 600 }, { "epoch": 0.8975537980503955, "grad_norm": 1.7423087358474731, "learning_rate": 1.0441176470588236e-05, "loss": 2.065, "step": 610 }, { "epoch": 0.9122677947397462, "grad_norm": 1.853888988494873, "learning_rate": 8.970588235294119e-06, "loss": 2.3043, "step": 620 }, { "epoch": 0.926981791429097, "grad_norm": 1.6552138328552246, "learning_rate": 7.5e-06, "loss": 2.6556, "step": 630 }, { "epoch": 0.9416957881184477, "grad_norm": 1.6745601892471313, "learning_rate": 6.029411764705883e-06, "loss": 2.3833, "step": 640 }, { "epoch": 0.9564097848077984, "grad_norm": 1.6154053211212158, "learning_rate": 4.558823529411764e-06, "loss": 2.4859, "step": 650 }, { "epoch": 0.9711237814971492, "grad_norm": 1.7248449325561523, "learning_rate": 3.0882352941176472e-06, "loss": 2.4022, "step": 660 }, { "epoch": 0.9858377781864999, "grad_norm": 1.7172226905822754, "learning_rate": 1.6176470588235297e-06, "loss": 2.5658, "step": 670 }, { "epoch": 1.0, "grad_norm": 2.6853206157684326, "learning_rate": 1.4705882352941178e-07, "loss": 2.1013, "step": 680 } ], "logging_steps": 10, "max_steps": 680, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.941123206453658e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }