{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.976800976800977,
  "eval_steps": 500,
  "global_step": 306,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.04884004884004884,
      "grad_norm": 0.8704025745391846,
      "learning_rate": 0.00019986827399037812,
      "loss": 1.278,
      "step": 5
    },
    {
      "epoch": 0.09768009768009768,
      "grad_norm": 0.8027306199073792,
      "learning_rate": 0.00019947344299634464,
      "loss": 1.3067,
      "step": 10
    },
    {
      "epoch": 0.14652014652014653,
      "grad_norm": 0.9125049114227295,
      "learning_rate": 0.00019881654720812594,
      "loss": 1.1604,
      "step": 15
    },
    {
      "epoch": 0.19536019536019536,
      "grad_norm": 0.8446016311645508,
      "learning_rate": 0.00019789931723094046,
      "loss": 1.1119,
      "step": 20
    },
    {
      "epoch": 0.2442002442002442,
      "grad_norm": 0.8679695129394531,
      "learning_rate": 0.00019672416952568416,
      "loss": 1.08,
      "step": 25
    },
    {
      "epoch": 0.29304029304029305,
      "grad_norm": 0.9639546275138855,
      "learning_rate": 0.00019529420004271567,
      "loss": 1.1442,
      "step": 30
    },
    {
      "epoch": 0.3418803418803419,
      "grad_norm": 0.8240556716918945,
      "learning_rate": 0.00019361317606551238,
      "loss": 1.0986,
      "step": 35
    },
    {
      "epoch": 0.3907203907203907,
      "grad_norm": 0.6859893202781677,
      "learning_rate": 0.00019168552628568631,
      "loss": 1.091,
      "step": 40
    },
    {
      "epoch": 0.43956043956043955,
      "grad_norm": 0.6568402647972107,
      "learning_rate": 0.00018951632913550626,
      "loss": 1.0377,
      "step": 45
    },
    {
      "epoch": 0.4884004884004884,
      "grad_norm": 0.8978002667427063,
      "learning_rate": 0.00018711129940866575,
      "loss": 1.0334,
      "step": 50
    },
    {
      "epoch": 0.5372405372405372,
      "grad_norm": 1.0903880596160889,
      "learning_rate": 0.00018447677320454367,
      "loss": 1.1055,
      "step": 55
    },
    {
      "epoch": 0.5860805860805861,
      "grad_norm": 0.7571815848350525,
      "learning_rate": 0.0001816196912356222,
      "loss": 1.0424,
      "step": 60
    },
    {
      "epoch": 0.6349206349206349,
      "grad_norm": 0.9039235711097717,
      "learning_rate": 0.00017854758054203988,
      "loss": 1.0732,
      "step": 65
    },
    {
      "epoch": 0.6837606837606838,
      "grad_norm": 0.8517094254493713,
      "learning_rate": 0.00017526853466145244,
      "loss": 1.0926,
      "step": 70
    },
    {
      "epoch": 0.7326007326007326,
      "grad_norm": 0.9040933847427368,
      "learning_rate": 0.0001717911923064442,
      "loss": 1.0293,
      "step": 75
    },
    {
      "epoch": 0.7814407814407814,
      "grad_norm": 1.0867003202438354,
      "learning_rate": 0.0001681247146056654,
      "loss": 1.0589,
      "step": 80
    },
    {
      "epoch": 0.8302808302808303,
      "grad_norm": 0.7861086130142212,
      "learning_rate": 0.00016427876096865394,
      "loss": 0.9705,
      "step": 85
    },
    {
      "epoch": 0.8791208791208791,
      "grad_norm": 0.8542861938476562,
      "learning_rate": 0.00016026346363792567,
      "loss": 0.9613,
      "step": 90
    },
    {
      "epoch": 0.927960927960928,
      "grad_norm": 0.87472003698349,
      "learning_rate": 0.000156089400995377,
      "loss": 0.9526,
      "step": 95
    },
    {
      "epoch": 0.9768009768009768,
      "grad_norm": 0.9949895143508911,
      "learning_rate": 0.00015176756969332425,
      "loss": 1.0577,
      "step": 100
    },
    {
      "epoch": 1.0195360195360195,
      "grad_norm": 1.0473015308380127,
      "learning_rate": 0.00014730935568360102,
      "loss": 0.9477,
      "step": 105
    },
    {
      "epoch": 1.0683760683760684,
      "grad_norm": 1.318867802619934,
      "learning_rate": 0.0001427265042210381,
      "loss": 0.7841,
      "step": 110
    },
    {
      "epoch": 1.1172161172161172,
      "grad_norm": 1.3813416957855225,
      "learning_rate": 0.0001380310889203526,
      "loss": 0.7016,
      "step": 115
    },
    {
      "epoch": 1.1660561660561661,
      "grad_norm": 1.2204864025115967,
      "learning_rate": 0.00013323547994796597,
      "loss": 0.7277,
      "step": 120
    },
    {
      "epoch": 1.214896214896215,
      "grad_norm": 0.9304312467575073,
      "learning_rate": 0.0001283523114325511,
      "loss": 0.6668,
      "step": 125
    },
    {
      "epoch": 1.2637362637362637,
      "grad_norm": 1.382764458656311,
      "learning_rate": 0.0001233944481801649,
      "loss": 0.6627,
      "step": 130
    },
    {
      "epoch": 1.3125763125763126,
      "grad_norm": 1.4755507707595825,
      "learning_rate": 0.00011837495178165706,
      "loss": 0.7239,
      "step": 135
    },
    {
      "epoch": 1.3614163614163615,
      "grad_norm": 1.235129475593567,
      "learning_rate": 0.00011330704620164538,
      "loss": 0.7067,
      "step": 140
    },
    {
      "epoch": 1.4102564102564101,
      "grad_norm": 1.1385552883148193,
      "learning_rate": 0.00010820408293971378,
      "loss": 0.6466,
      "step": 145
    },
    {
      "epoch": 1.459096459096459,
      "grad_norm": 1.374949336051941,
      "learning_rate": 0.00010307950585561706,
      "loss": 0.7069,
      "step": 150
    },
    {
      "epoch": 1.507936507936508,
      "grad_norm": 1.3680429458618164,
      "learning_rate": 9.794681575116097e-05,
      "loss": 0.6908,
      "step": 155
    },
    {
      "epoch": 1.5567765567765568,
      "grad_norm": 1.5387816429138184,
      "learning_rate": 9.281953480206725e-05,
      "loss": 0.7049,
      "step": 160
    },
    {
      "epoch": 1.6056166056166057,
      "grad_norm": 1.3777350187301636,
      "learning_rate": 8.77111709335286e-05,
      "loss": 0.6259,
      "step": 165
    },
    {
      "epoch": 1.6544566544566544,
      "grad_norm": 1.4641001224517822,
      "learning_rate": 8.263518223330697e-05,
      "loss": 0.7536,
      "step": 170
    },
    {
      "epoch": 1.7032967032967035,
      "grad_norm": 1.334525465965271,
      "learning_rate": 7.760494149612971e-05,
      "loss": 0.6977,
      "step": 175
    },
    {
      "epoch": 1.7521367521367521,
      "grad_norm": 1.5057116746902466,
      "learning_rate": 7.263370099279172e-05,
      "loss": 0.6645,
      "step": 180
    },
    {
      "epoch": 1.800976800976801,
      "grad_norm": 1.517478585243225,
      "learning_rate": 6.773455755678054e-05,
      "loss": 0.6264,
      "step": 185
    },
    {
      "epoch": 1.84981684981685,
      "grad_norm": 1.4641480445861816,
      "learning_rate": 6.292041808040393e-05,
      "loss": 0.7086,
      "step": 190
    },
    {
      "epoch": 1.8986568986568986,
      "grad_norm": 0.9789106845855713,
      "learning_rate": 5.82039655113217e-05,
      "loss": 0.6297,
      "step": 195
    },
    {
      "epoch": 1.9474969474969475,
      "grad_norm": 1.607702374458313,
      "learning_rate": 5.359762543906368e-05,
      "loss": 0.6865,
      "step": 200
    },
    {
      "epoch": 1.9963369963369964,
      "grad_norm": 1.4452511072158813,
      "learning_rate": 4.911353335956352e-05,
      "loss": 0.6567,
      "step": 205
    },
    {
      "epoch": 2.039072039072039,
      "grad_norm": 1.3248692750930786,
      "learning_rate": 4.476350270394942e-05,
      "loss": 0.4983,
      "step": 210
    },
    {
      "epoch": 2.087912087912088,
      "grad_norm": 1.4795316457748413,
      "learning_rate": 4.055899371582133e-05,
      "loss": 0.4564,
      "step": 215
    },
    {
      "epoch": 2.1367521367521367,
      "grad_norm": 1.5238746404647827,
      "learning_rate": 3.651108325900773e-05,
      "loss": 0.397,
      "step": 220
    },
    {
      "epoch": 2.185592185592186,
      "grad_norm": 1.5181987285614014,
      "learning_rate": 3.263043563534428e-05,
      "loss": 0.4685,
      "step": 225
    },
    {
      "epoch": 2.2344322344322345,
      "grad_norm": 1.6732962131500244,
      "learning_rate": 2.8927274489355293e-05,
      "loss": 0.4235,
      "step": 230
    },
    {
      "epoch": 2.283272283272283,
      "grad_norm": 1.4956883192062378,
      "learning_rate": 2.541135587385568e-05,
      "loss": 0.4308,
      "step": 235
    },
    {
      "epoch": 2.3321123321123323,
      "grad_norm": 1.635918378829956,
      "learning_rate": 2.2091942547432955e-05,
      "loss": 0.3784,
      "step": 240
    },
    {
      "epoch": 2.380952380952381,
      "grad_norm": 1.434029221534729,
      "learning_rate": 1.8977779571522646e-05,
      "loss": 0.4247,
      "step": 245
    },
    {
      "epoch": 2.42979242979243,
      "grad_norm": 1.5720264911651611,
      "learning_rate": 1.607707127136734e-05,
      "loss": 0.4322,
      "step": 250
    },
    {
      "epoch": 2.4786324786324787,
      "grad_norm": 1.7204139232635498,
      "learning_rate": 1.339745962155613e-05,
      "loss": 0.4205,
      "step": 255
    },
    {
      "epoch": 2.5274725274725274,
      "grad_norm": 1.86961030960083,
      "learning_rate": 1.0946004113088381e-05,
      "loss": 0.3895,
      "step": 260
    },
    {
      "epoch": 2.576312576312576,
      "grad_norm": 1.8931158781051636,
      "learning_rate": 8.729163155001974e-06,
      "loss": 0.4028,
      "step": 265
    },
    {
      "epoch": 2.625152625152625,
      "grad_norm": 1.743199110031128,
      "learning_rate": 6.75277705956443e-06,
      "loss": 0.4123,
      "step": 270
    },
    {
      "epoch": 2.6739926739926743,
      "grad_norm": 1.3287917375564575,
      "learning_rate": 5.0220526558522274e-06,
      "loss": 0.3504,
      "step": 275
    },
    {
      "epoch": 2.722832722832723,
      "grad_norm": 1.753952980041504,
      "learning_rate": 3.541549572254488e-06,
      "loss": 0.3485,
      "step": 280
    },
    {
      "epoch": 2.7716727716727716,
      "grad_norm": 1.628715991973877,
      "learning_rate": 2.315168224039932e-06,
      "loss": 0.4011,
      "step": 285
    },
    {
      "epoch": 2.8205128205128203,
      "grad_norm": 1.4908260107040405,
      "learning_rate": 1.3461395376340502e-06,
      "loss": 0.3763,
      "step": 290
    },
    {
      "epoch": 2.8693528693528694,
      "grad_norm": 1.6381616592407227,
      "learning_rate": 6.370164386782285e-07,
      "loss": 0.4417,
      "step": 295
    },
    {
      "epoch": 2.918192918192918,
      "grad_norm": 1.4152339696884155,
      "learning_rate": 1.8966712629558957e-07,
      "loss": 0.383,
      "step": 300
    },
    {
      "epoch": 2.967032967032967,
      "grad_norm": 1.7442331314086914,
      "learning_rate": 5.270151282688041e-09,
      "loss": 0.4011,
      "step": 305
    },
    {
      "epoch": 2.976800976800977,
      "step": 306,
      "total_flos": 5.308124759536435e+16,
      "train_loss": 0.7306966563455419,
      "train_runtime": 718.532,
      "train_samples_per_second": 6.839,
      "train_steps_per_second": 0.426
    }
  ],
  "logging_steps": 5,
  "max_steps": 306,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5.308124759536435e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}