| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.9971181556195967, | |
| "eval_steps": 500, | |
| "global_step": 780, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.03842459173871278, | |
| "grad_norm": 0.7461163997650146, | |
| "learning_rate": 8.333333333333334e-05, | |
| "loss": 1.2598, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.07684918347742556, | |
| "grad_norm": 0.25502100586891174, | |
| "learning_rate": 0.0001666666666666667, | |
| "loss": 0.7563, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.11527377521613832, | |
| "grad_norm": 0.14209185540676117, | |
| "learning_rate": 0.00019996891820008164, | |
| "loss": 0.63, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.15369836695485112, | |
| "grad_norm": 0.10494557023048401, | |
| "learning_rate": 0.0001997790438338385, | |
| "loss": 0.5646, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.19212295869356388, | |
| "grad_norm": 0.11090180277824402, | |
| "learning_rate": 0.0001994168902089112, | |
| "loss": 0.5158, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.23054755043227665, | |
| "grad_norm": 0.09561982750892639, | |
| "learning_rate": 0.00019888308262251285, | |
| "loss": 0.5171, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.2689721421709894, | |
| "grad_norm": 0.13890881836414337, | |
| "learning_rate": 0.0001981785427508966, | |
| "loss": 0.5013, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.30739673390970224, | |
| "grad_norm": 0.09685279428958893, | |
| "learning_rate": 0.00019730448705798239, | |
| "loss": 0.4803, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.345821325648415, | |
| "grad_norm": 0.0978529155254364, | |
| "learning_rate": 0.0001962624246950012, | |
| "loss": 0.4824, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.38424591738712777, | |
| "grad_norm": 0.09823426604270935, | |
| "learning_rate": 0.0001950541548947829, | |
| "loss": 0.4765, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.42267050912584053, | |
| "grad_norm": 0.11479681730270386, | |
| "learning_rate": 0.0001936817638651871, | |
| "loss": 0.4804, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.4610951008645533, | |
| "grad_norm": 0.1102244183421135, | |
| "learning_rate": 0.00019214762118704076, | |
| "loss": 0.4735, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.49951969260326606, | |
| "grad_norm": 0.09442220628261566, | |
| "learning_rate": 0.00019045437572280194, | |
| "loss": 0.4654, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.5379442843419788, | |
| "grad_norm": 0.0998912900686264, | |
| "learning_rate": 0.00018860495104301345, | |
| "loss": 0.4714, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.5763688760806917, | |
| "grad_norm": 0.12593407928943634, | |
| "learning_rate": 0.00018660254037844388, | |
| "loss": 0.4652, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.6147934678194045, | |
| "grad_norm": 0.10841673612594604, | |
| "learning_rate": 0.0001844506011066308, | |
| "loss": 0.4633, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.6532180595581172, | |
| "grad_norm": 0.09892784804105759, | |
| "learning_rate": 0.00018215284878234642, | |
| "loss": 0.461, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.69164265129683, | |
| "grad_norm": 0.5387171506881714, | |
| "learning_rate": 0.00017971325072229226, | |
| "loss": 0.4591, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.7300672430355427, | |
| "grad_norm": 0.11192867159843445, | |
| "learning_rate": 0.0001771360191551, | |
| "loss": 0.4592, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.7684918347742555, | |
| "grad_norm": 0.10694364458322525, | |
| "learning_rate": 0.00017442560394846516, | |
| "loss": 0.4574, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.8069164265129684, | |
| "grad_norm": 0.10873424261808395, | |
| "learning_rate": 0.00017158668492597186, | |
| "loss": 0.4492, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.8453410182516811, | |
| "grad_norm": 0.11694315820932388, | |
| "learning_rate": 0.0001686241637868734, | |
| "loss": 0.4467, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.8837656099903939, | |
| "grad_norm": 0.10100408643484116, | |
| "learning_rate": 0.000165543155642781, | |
| "loss": 0.4488, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.9221902017291066, | |
| "grad_norm": 0.10397649556398392, | |
| "learning_rate": 0.00016234898018587337, | |
| "loss": 0.447, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.9606147934678194, | |
| "grad_norm": 0.10007993876934052, | |
| "learning_rate": 0.00015904715250387498, | |
| "loss": 0.4428, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.9990393852065321, | |
| "grad_norm": 0.10865867137908936, | |
| "learning_rate": 0.00015564337355766412, | |
| "loss": 0.4452, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.037463976945245, | |
| "grad_norm": 0.10476306080818176, | |
| "learning_rate": 0.0001521435203379498, | |
| "loss": 0.4367, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 1.0758885686839577, | |
| "grad_norm": 0.10958375781774521, | |
| "learning_rate": 0.00014855363571801523, | |
| "loss": 0.4336, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.1143131604226706, | |
| "grad_norm": 0.11801016330718994, | |
| "learning_rate": 0.00014487991802004623, | |
| "loss": 0.4346, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.1527377521613833, | |
| "grad_norm": 0.11526134610176086, | |
| "learning_rate": 0.00014112871031306119, | |
| "loss": 0.4221, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.191162343900096, | |
| "grad_norm": 0.10705429315567017, | |
| "learning_rate": 0.0001373064894609194, | |
| "loss": 0.4363, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 1.229586935638809, | |
| "grad_norm": 0.09906008094549179, | |
| "learning_rate": 0.00013341985493931877, | |
| "loss": 0.4359, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.2680115273775217, | |
| "grad_norm": 0.11935935914516449, | |
| "learning_rate": 0.00012947551744109043, | |
| "loss": 0.429, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.3064361191162344, | |
| "grad_norm": 0.11651390045881271, | |
| "learning_rate": 0.0001254802872894655, | |
| "loss": 0.4295, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.344860710854947, | |
| "grad_norm": 0.13374051451683044, | |
| "learning_rate": 0.00012144106267931876, | |
| "loss": 0.43, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.38328530259366, | |
| "grad_norm": 0.10709749907255173, | |
| "learning_rate": 0.00011736481776669306, | |
| "loss": 0.4229, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.4217098943323727, | |
| "grad_norm": 0.10747699439525604, | |
| "learning_rate": 0.00011325859062716795, | |
| "loss": 0.4255, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.4601344860710854, | |
| "grad_norm": 0.1302700638771057, | |
| "learning_rate": 0.00010912947110386484, | |
| "loss": 0.4314, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.4985590778097984, | |
| "grad_norm": 0.10743537545204163, | |
| "learning_rate": 0.00010498458856606972, | |
| "loss": 0.4242, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.536983669548511, | |
| "grad_norm": 0.11519400030374527, | |
| "learning_rate": 0.00010083109959960973, | |
| "loss": 0.4216, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.5754082612872238, | |
| "grad_norm": 0.11456304788589478, | |
| "learning_rate": 9.667617565023735e-05, | |
| "loss": 0.4315, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.6138328530259365, | |
| "grad_norm": 0.10759314894676208, | |
| "learning_rate": 9.252699064135758e-05, | |
| "loss": 0.4199, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.6522574447646494, | |
| "grad_norm": 0.1024770587682724, | |
| "learning_rate": 8.839070858747697e-05, | |
| "loss": 0.4272, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.6906820365033621, | |
| "grad_norm": 0.10736548155546188, | |
| "learning_rate": 8.427447122476148e-05, | |
| "loss": 0.4232, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.729106628242075, | |
| "grad_norm": 0.1060362458229065, | |
| "learning_rate": 8.018538568006027e-05, | |
| "loss": 0.4237, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.7675312199807878, | |
| "grad_norm": 0.10463803261518478, | |
| "learning_rate": 7.613051219968623e-05, | |
| "loss": 0.4247, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.8059558117195005, | |
| "grad_norm": 0.10327400267124176, | |
| "learning_rate": 7.211685195914097e-05, | |
| "loss": 0.4196, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.8443804034582132, | |
| "grad_norm": 0.13895417749881744, | |
| "learning_rate": 6.815133497483157e-05, | |
| "loss": 0.4205, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.882804995196926, | |
| "grad_norm": 0.10684759169816971, | |
| "learning_rate": 6.424080813865138e-05, | |
| "loss": 0.4224, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.9212295869356388, | |
| "grad_norm": 0.13927388191223145, | |
| "learning_rate": 6.039202339608432e-05, | |
| "loss": 0.4196, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.9596541786743515, | |
| "grad_norm": 0.10447521507740021, | |
| "learning_rate": 5.6611626088244194e-05, | |
| "loss": 0.4225, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.9980787704130645, | |
| "grad_norm": 0.10929688066244125, | |
| "learning_rate": 5.290614347797802e-05, | |
| "loss": 0.4254, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 2.036503362151777, | |
| "grad_norm": 0.10282639414072037, | |
| "learning_rate": 4.92819734798441e-05, | |
| "loss": 0.4115, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 2.07492795389049, | |
| "grad_norm": 0.10815539956092834, | |
| "learning_rate": 4.574537361342407e-05, | |
| "loss": 0.4162, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 2.1133525456292026, | |
| "grad_norm": 0.1221628338098526, | |
| "learning_rate": 4.23024501990417e-05, | |
| "loss": 0.4056, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 2.1517771373679153, | |
| "grad_norm": 0.1123044565320015, | |
| "learning_rate": 3.89591478145437e-05, | |
| "loss": 0.4089, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 2.1902017291066285, | |
| "grad_norm": 0.11408425867557526, | |
| "learning_rate": 3.5721239031346066e-05, | |
| "loss": 0.4113, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 2.228626320845341, | |
| "grad_norm": 0.10744322091341019, | |
| "learning_rate": 3.259431444746846e-05, | |
| "loss": 0.4018, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 2.267050912584054, | |
| "grad_norm": 0.11307461559772491, | |
| "learning_rate": 2.9583773034764826e-05, | |
| "loss": 0.405, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 2.3054755043227666, | |
| "grad_norm": 0.11285313963890076, | |
| "learning_rate": 2.66948128170174e-05, | |
| "loss": 0.3995, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 2.3439000960614793, | |
| "grad_norm": 0.15918630361557007, | |
| "learning_rate": 2.3932421894989167e-05, | |
| "loss": 0.4027, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 2.382324687800192, | |
| "grad_norm": 0.11159035563468933, | |
| "learning_rate": 2.1301369833931117e-05, | |
| "loss": 0.4082, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 2.4207492795389047, | |
| "grad_norm": 0.11359286308288574, | |
| "learning_rate": 1.880619942841435e-05, | |
| "loss": 0.4054, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 2.459173871277618, | |
| "grad_norm": 0.11326448619365692, | |
| "learning_rate": 1.6451218858706374e-05, | |
| "loss": 0.4064, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 2.4975984630163306, | |
| "grad_norm": 0.11316490173339844, | |
| "learning_rate": 1.4240494252234049e-05, | |
| "loss": 0.4111, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 2.5360230547550433, | |
| "grad_norm": 0.11959807574748993, | |
| "learning_rate": 1.2177842662977135e-05, | |
| "loss": 0.4078, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 2.574447646493756, | |
| "grad_norm": 0.10980040580034256, | |
| "learning_rate": 1.0266825480913611e-05, | |
| "loss": 0.408, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 2.6128722382324687, | |
| "grad_norm": 0.11757558584213257, | |
| "learning_rate": 8.510742282896544e-06, | |
| "loss": 0.4019, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 2.6512968299711814, | |
| "grad_norm": 0.11953994631767273, | |
| "learning_rate": 6.9126251355795864e-06, | |
| "loss": 0.4037, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 2.689721421709894, | |
| "grad_norm": 0.13467754423618317, | |
| "learning_rate": 5.475233360227527e-06, | |
| "loss": 0.404, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 2.7281460134486073, | |
| "grad_norm": 0.11414311081171036, | |
| "learning_rate": 4.20104876845111e-06, | |
| "loss": 0.4043, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 2.76657060518732, | |
| "grad_norm": 0.11653583496809006, | |
| "learning_rate": 3.092271377092215e-06, | |
| "loss": 0.4036, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 2.8049951969260327, | |
| "grad_norm": 0.1183491125702858, | |
| "learning_rate": 2.150815609657875e-06, | |
| "loss": 0.4053, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 2.8434197886647454, | |
| "grad_norm": 0.142572820186615, | |
| "learning_rate": 1.378306990862177e-06, | |
| "loss": 0.4037, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 2.881844380403458, | |
| "grad_norm": 0.11191695928573608, | |
| "learning_rate": 7.760793399827937e-07, | |
| "loss": 0.4035, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 2.920268972142171, | |
| "grad_norm": 0.11203925311565399, | |
| "learning_rate": 3.451724678784518e-07, | |
| "loss": 0.409, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 2.9586935638808836, | |
| "grad_norm": 0.11161793023347855, | |
| "learning_rate": 8.633038164358454e-08, | |
| "loss": 0.3982, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 2.9971181556195967, | |
| "grad_norm": 0.11390741169452667, | |
| "learning_rate": 0.0, | |
| "loss": 0.4073, | |
| "step": 780 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 780, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5.794517495096279e+17, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |