{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.976800976800977, "eval_steps": 500, "global_step": 306, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04884004884004884, "grad_norm": 0.8704025745391846, "learning_rate": 0.00019986827399037812, "loss": 1.278, "step": 5 }, { "epoch": 0.09768009768009768, "grad_norm": 0.8027306199073792, "learning_rate": 0.00019947344299634464, "loss": 1.3067, "step": 10 }, { "epoch": 0.14652014652014653, "grad_norm": 0.9125049114227295, "learning_rate": 0.00019881654720812594, "loss": 1.1604, "step": 15 }, { "epoch": 0.19536019536019536, "grad_norm": 0.8446016311645508, "learning_rate": 0.00019789931723094046, "loss": 1.1119, "step": 20 }, { "epoch": 0.2442002442002442, "grad_norm": 0.8679695129394531, "learning_rate": 0.00019672416952568416, "loss": 1.08, "step": 25 }, { "epoch": 0.29304029304029305, "grad_norm": 0.9639546275138855, "learning_rate": 0.00019529420004271567, "loss": 1.1442, "step": 30 }, { "epoch": 0.3418803418803419, "grad_norm": 0.8240556716918945, "learning_rate": 0.00019361317606551238, "loss": 1.0986, "step": 35 }, { "epoch": 0.3907203907203907, "grad_norm": 0.6859893202781677, "learning_rate": 0.00019168552628568631, "loss": 1.091, "step": 40 }, { "epoch": 0.43956043956043955, "grad_norm": 0.6568402647972107, "learning_rate": 0.00018951632913550626, "loss": 1.0377, "step": 45 }, { "epoch": 0.4884004884004884, "grad_norm": 0.8978002667427063, "learning_rate": 0.00018711129940866575, "loss": 1.0334, "step": 50 }, { "epoch": 0.5372405372405372, "grad_norm": 1.0903880596160889, "learning_rate": 0.00018447677320454367, "loss": 1.1055, "step": 55 }, { "epoch": 0.5860805860805861, "grad_norm": 0.7571815848350525, "learning_rate": 0.0001816196912356222, "loss": 1.0424, "step": 60 }, { "epoch": 0.6349206349206349, "grad_norm": 0.9039235711097717, "learning_rate": 0.00017854758054203988, "loss": 1.0732, "step": 65 }, { "epoch": 0.6837606837606838, "grad_norm": 0.8517094254493713, "learning_rate": 0.00017526853466145244, "loss": 1.0926, "step": 70 }, { "epoch": 0.7326007326007326, "grad_norm": 0.9040933847427368, "learning_rate": 0.0001717911923064442, "loss": 1.0293, "step": 75 }, { "epoch": 0.7814407814407814, "grad_norm": 1.0867003202438354, "learning_rate": 0.0001681247146056654, "loss": 1.0589, "step": 80 }, { "epoch": 0.8302808302808303, "grad_norm": 0.7861086130142212, "learning_rate": 0.00016427876096865394, "loss": 0.9705, "step": 85 }, { "epoch": 0.8791208791208791, "grad_norm": 0.8542861938476562, "learning_rate": 0.00016026346363792567, "loss": 0.9613, "step": 90 }, { "epoch": 0.927960927960928, "grad_norm": 0.87472003698349, "learning_rate": 0.000156089400995377, "loss": 0.9526, "step": 95 }, { "epoch": 0.9768009768009768, "grad_norm": 0.9949895143508911, "learning_rate": 0.00015176756969332425, "loss": 1.0577, "step": 100 }, { "epoch": 1.0195360195360195, "grad_norm": 1.0473015308380127, "learning_rate": 0.00014730935568360102, "loss": 0.9477, "step": 105 }, { "epoch": 1.0683760683760684, "grad_norm": 1.318867802619934, "learning_rate": 0.0001427265042210381, "loss": 0.7841, "step": 110 }, { "epoch": 1.1172161172161172, "grad_norm": 1.3813416957855225, "learning_rate": 0.0001380310889203526, "loss": 0.7016, "step": 115 }, { "epoch": 1.1660561660561661, "grad_norm": 1.2204864025115967, "learning_rate": 0.00013323547994796597, "loss": 0.7277, "step": 120 }, { "epoch": 1.214896214896215, "grad_norm": 0.9304312467575073, "learning_rate": 0.0001283523114325511, "loss": 0.6668, "step": 125 }, { "epoch": 1.2637362637362637, "grad_norm": 1.382764458656311, "learning_rate": 0.0001233944481801649, "loss": 0.6627, "step": 130 }, { "epoch": 1.3125763125763126, "grad_norm": 1.4755507707595825, "learning_rate": 0.00011837495178165706, "loss": 0.7239, "step": 135 }, { "epoch": 1.3614163614163615, "grad_norm": 1.235129475593567, "learning_rate": 0.00011330704620164538, "loss": 0.7067, "step": 140 }, { "epoch": 1.4102564102564101, "grad_norm": 1.1385552883148193, "learning_rate": 0.00010820408293971378, "loss": 0.6466, "step": 145 }, { "epoch": 1.459096459096459, "grad_norm": 1.374949336051941, "learning_rate": 0.00010307950585561706, "loss": 0.7069, "step": 150 }, { "epoch": 1.507936507936508, "grad_norm": 1.3680429458618164, "learning_rate": 9.794681575116097e-05, "loss": 0.6908, "step": 155 }, { "epoch": 1.5567765567765568, "grad_norm": 1.5387816429138184, "learning_rate": 9.281953480206725e-05, "loss": 0.7049, "step": 160 }, { "epoch": 1.6056166056166057, "grad_norm": 1.3777350187301636, "learning_rate": 8.77111709335286e-05, "loss": 0.6259, "step": 165 }, { "epoch": 1.6544566544566544, "grad_norm": 1.4641001224517822, "learning_rate": 8.263518223330697e-05, "loss": 0.7536, "step": 170 }, { "epoch": 1.7032967032967035, "grad_norm": 1.334525465965271, "learning_rate": 7.760494149612971e-05, "loss": 0.6977, "step": 175 }, { "epoch": 1.7521367521367521, "grad_norm": 1.5057116746902466, "learning_rate": 7.263370099279172e-05, "loss": 0.6645, "step": 180 }, { "epoch": 1.800976800976801, "grad_norm": 1.517478585243225, "learning_rate": 6.773455755678054e-05, "loss": 0.6264, "step": 185 }, { "epoch": 1.84981684981685, "grad_norm": 1.4641480445861816, "learning_rate": 6.292041808040393e-05, "loss": 0.7086, "step": 190 }, { "epoch": 1.8986568986568986, "grad_norm": 0.9789106845855713, "learning_rate": 5.82039655113217e-05, "loss": 0.6297, "step": 195 }, { "epoch": 1.9474969474969475, "grad_norm": 1.607702374458313, "learning_rate": 5.359762543906368e-05, "loss": 0.6865, "step": 200 }, { "epoch": 1.9963369963369964, "grad_norm": 1.4452511072158813, "learning_rate": 4.911353335956352e-05, "loss": 0.6567, "step": 205 }, { "epoch": 2.039072039072039, "grad_norm": 1.3248692750930786, "learning_rate": 4.476350270394942e-05, "loss": 0.4983, "step": 210 }, { "epoch": 2.087912087912088, "grad_norm": 1.4795316457748413, "learning_rate": 4.055899371582133e-05, "loss": 0.4564, "step": 215 }, { "epoch": 2.1367521367521367, "grad_norm": 1.5238746404647827, "learning_rate": 3.651108325900773e-05, "loss": 0.397, "step": 220 }, { "epoch": 2.185592185592186, "grad_norm": 1.5181987285614014, "learning_rate": 3.263043563534428e-05, "loss": 0.4685, "step": 225 }, { "epoch": 2.2344322344322345, "grad_norm": 1.6732962131500244, "learning_rate": 2.8927274489355293e-05, "loss": 0.4235, "step": 230 }, { "epoch": 2.283272283272283, "grad_norm": 1.4956883192062378, "learning_rate": 2.541135587385568e-05, "loss": 0.4308, "step": 235 }, { "epoch": 2.3321123321123323, "grad_norm": 1.635918378829956, "learning_rate": 2.2091942547432955e-05, "loss": 0.3784, "step": 240 }, { "epoch": 2.380952380952381, "grad_norm": 1.434029221534729, "learning_rate": 1.8977779571522646e-05, "loss": 0.4247, "step": 245 }, { "epoch": 2.42979242979243, "grad_norm": 1.5720264911651611, "learning_rate": 1.607707127136734e-05, "loss": 0.4322, "step": 250 }, { "epoch": 2.4786324786324787, "grad_norm": 1.7204139232635498, "learning_rate": 1.339745962155613e-05, "loss": 0.4205, "step": 255 }, { "epoch": 2.5274725274725274, "grad_norm": 1.86961030960083, "learning_rate": 1.0946004113088381e-05, "loss": 0.3895, "step": 260 }, { "epoch": 2.576312576312576, "grad_norm": 1.8931158781051636, "learning_rate": 8.729163155001974e-06, "loss": 0.4028, "step": 265 }, { "epoch": 2.625152625152625, "grad_norm": 1.743199110031128, "learning_rate": 6.75277705956443e-06, "loss": 0.4123, "step": 270 }, { "epoch": 2.6739926739926743, "grad_norm": 1.3287917375564575, "learning_rate": 5.0220526558522274e-06, "loss": 0.3504, "step": 275 }, { "epoch": 2.722832722832723, "grad_norm": 1.753952980041504, "learning_rate": 3.541549572254488e-06, "loss": 0.3485, "step": 280 }, { "epoch": 2.7716727716727716, "grad_norm": 1.628715991973877, "learning_rate": 2.315168224039932e-06, "loss": 0.4011, "step": 285 }, { "epoch": 2.8205128205128203, "grad_norm": 1.4908260107040405, "learning_rate": 1.3461395376340502e-06, "loss": 0.3763, "step": 290 }, { "epoch": 2.8693528693528694, "grad_norm": 1.6381616592407227, "learning_rate": 6.370164386782285e-07, "loss": 0.4417, "step": 295 }, { "epoch": 2.918192918192918, "grad_norm": 1.4152339696884155, "learning_rate": 1.8966712629558957e-07, "loss": 0.383, "step": 300 }, { "epoch": 2.967032967032967, "grad_norm": 1.7442331314086914, "learning_rate": 5.270151282688041e-09, "loss": 0.4011, "step": 305 }, { "epoch": 2.976800976800977, "step": 306, "total_flos": 5.308124759536435e+16, "train_loss": 0.7306966563455419, "train_runtime": 718.532, "train_samples_per_second": 6.839, "train_steps_per_second": 0.426 } ], "logging_steps": 5, "max_steps": 306, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.308124759536435e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }