| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 867, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.03466204506065858, | |
| "grad_norm": 1.614887252818893, | |
| "learning_rate": 2.0689655172413796e-06, | |
| "loss": 0.6065, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.06932409012131716, | |
| "grad_norm": 1.2876954446946685, | |
| "learning_rate": 4.367816091954023e-06, | |
| "loss": 0.5347, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.10398613518197573, | |
| "grad_norm": 0.4989329651688657, | |
| "learning_rate": 6.666666666666667e-06, | |
| "loss": 0.4714, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.1386481802426343, | |
| "grad_norm": 0.36809008403990967, | |
| "learning_rate": 8.965517241379312e-06, | |
| "loss": 0.4476, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.1733102253032929, | |
| "grad_norm": 0.3015275688562798, | |
| "learning_rate": 1.1264367816091955e-05, | |
| "loss": 0.4157, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.20797227036395147, | |
| "grad_norm": 0.23325709345466544, | |
| "learning_rate": 1.3563218390804598e-05, | |
| "loss": 0.4014, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.24263431542461006, | |
| "grad_norm": 0.2599261399956402, | |
| "learning_rate": 1.586206896551724e-05, | |
| "loss": 0.3918, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.2772963604852686, | |
| "grad_norm": 0.23676671921370496, | |
| "learning_rate": 1.8160919540229885e-05, | |
| "loss": 0.391, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.3119584055459272, | |
| "grad_norm": 0.24237959936856435, | |
| "learning_rate": 1.9999675557165282e-05, | |
| "loss": 0.3859, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.3466204506065858, | |
| "grad_norm": 0.4446903057333395, | |
| "learning_rate": 1.998832226832327e-05, | |
| "loss": 0.3805, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.38128249566724437, | |
| "grad_norm": 0.3211792221924684, | |
| "learning_rate": 1.9960767884236132e-05, | |
| "loss": 0.3806, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.41594454072790293, | |
| "grad_norm": 0.27403637242653317, | |
| "learning_rate": 1.9917057098215624e-05, | |
| "loss": 0.3805, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.4506065857885615, | |
| "grad_norm": 0.2508084333447016, | |
| "learning_rate": 1.985726080931651e-05, | |
| "loss": 0.3741, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.4852686308492201, | |
| "grad_norm": 0.28374356186453464, | |
| "learning_rate": 1.9781476007338058e-05, | |
| "loss": 0.3739, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.5199306759098787, | |
| "grad_norm": 0.35539697186524694, | |
| "learning_rate": 1.968982561550621e-05, | |
| "loss": 0.3678, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.5545927209705372, | |
| "grad_norm": 0.2613425020538643, | |
| "learning_rate": 1.9582458291091664e-05, | |
| "loss": 0.3647, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.5892547660311959, | |
| "grad_norm": 0.3109549671591506, | |
| "learning_rate": 1.9459548184287254e-05, | |
| "loss": 0.3638, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.6239168110918544, | |
| "grad_norm": 0.3368502474991166, | |
| "learning_rate": 1.932129465573568e-05, | |
| "loss": 0.3626, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.658578856152513, | |
| "grad_norm": 0.2670073589604095, | |
| "learning_rate": 1.9167921953165827e-05, | |
| "loss": 0.3635, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.6932409012131716, | |
| "grad_norm": 0.2916079955306212, | |
| "learning_rate": 1.8999678847662124e-05, | |
| "loss": 0.3597, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.7279029462738301, | |
| "grad_norm": 0.30166318114659063, | |
| "learning_rate": 1.881683823015694e-05, | |
| "loss": 0.3555, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.7625649913344887, | |
| "grad_norm": 0.2713616440874116, | |
| "learning_rate": 1.8619696668800494e-05, | |
| "loss": 0.3586, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.7972270363951474, | |
| "grad_norm": 0.26515561290090994, | |
| "learning_rate": 1.8408573927926225e-05, | |
| "loss": 0.3617, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.8318890814558059, | |
| "grad_norm": 0.2813509663127385, | |
| "learning_rate": 1.818381244939187e-05, | |
| "loss": 0.3556, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.8665511265164645, | |
| "grad_norm": 0.346388313968459, | |
| "learning_rate": 1.7945776797137544e-05, | |
| "loss": 0.3517, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.901213171577123, | |
| "grad_norm": 0.25559948474750327, | |
| "learning_rate": 1.769485306586166e-05, | |
| "loss": 0.3531, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.9358752166377816, | |
| "grad_norm": 0.23567303867404224, | |
| "learning_rate": 1.7431448254773943e-05, | |
| "loss": 0.354, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.9705372616984402, | |
| "grad_norm": 0.2507120585300497, | |
| "learning_rate": 1.715598960744121e-05, | |
| "loss": 0.353, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.0034662045060658, | |
| "grad_norm": 0.3235414509087519, | |
| "learning_rate": 1.6868923918796753e-05, | |
| "loss": 0.3479, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.0381282495667243, | |
| "grad_norm": 0.29249432292924354, | |
| "learning_rate": 1.657071681043731e-05, | |
| "loss": 0.3297, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.072790294627383, | |
| "grad_norm": 0.28107913308047194, | |
| "learning_rate": 1.626185197538314e-05, | |
| "loss": 0.3216, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 1.1074523396880416, | |
| "grad_norm": 0.2388850688621256, | |
| "learning_rate": 1.5942830393526176e-05, | |
| "loss": 0.3313, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.1421143847487, | |
| "grad_norm": 0.23828446739527873, | |
| "learning_rate": 1.561416951903881e-05, | |
| "loss": 0.3274, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.1767764298093588, | |
| "grad_norm": 0.24738586978364616, | |
| "learning_rate": 1.527640244106133e-05, | |
| "loss": 0.3261, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.2114384748700173, | |
| "grad_norm": 0.24976026828074924, | |
| "learning_rate": 1.4930077019029376e-05, | |
| "loss": 0.3277, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.2461005199306758, | |
| "grad_norm": 0.31665221410517264, | |
| "learning_rate": 1.4575754994043956e-05, | |
| "loss": 0.3242, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.2807625649913346, | |
| "grad_norm": 0.25301472762117483, | |
| "learning_rate": 1.4214011077725293e-05, | |
| "loss": 0.3296, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.315424610051993, | |
| "grad_norm": 0.3032496960590633, | |
| "learning_rate": 1.3845432020028511e-05, | |
| "loss": 0.3243, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.3500866551126516, | |
| "grad_norm": 0.2144539306745234, | |
| "learning_rate": 1.347061565753303e-05, | |
| "loss": 0.3223, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 1.38474870017331, | |
| "grad_norm": 0.25316588193158246, | |
| "learning_rate": 1.3090169943749475e-05, | |
| "loss": 0.3269, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.4194107452339688, | |
| "grad_norm": 0.23335401936650935, | |
| "learning_rate": 1.270471196301684e-05, | |
| "loss": 0.3233, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 1.4540727902946273, | |
| "grad_norm": 0.25740474483694903, | |
| "learning_rate": 1.2314866929589434e-05, | |
| "loss": 0.3257, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 1.4887348353552858, | |
| "grad_norm": 0.22583757323894366, | |
| "learning_rate": 1.1921267173537085e-05, | |
| "loss": 0.3258, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 1.5233968804159446, | |
| "grad_norm": 0.21072089179588632, | |
| "learning_rate": 1.1524551115103455e-05, | |
| "loss": 0.3198, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 1.558058925476603, | |
| "grad_norm": 0.2075217279484371, | |
| "learning_rate": 1.1125362229186056e-05, | |
| "loss": 0.3213, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.5927209705372616, | |
| "grad_norm": 0.20733188488510618, | |
| "learning_rate": 1.0724348001617626e-05, | |
| "loss": 0.3189, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 1.6273830155979203, | |
| "grad_norm": 0.23700844887424702, | |
| "learning_rate": 1.0322158878941733e-05, | |
| "loss": 0.3238, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 1.6620450606585788, | |
| "grad_norm": 0.2303410797551013, | |
| "learning_rate": 9.919447213386103e-06, | |
| "loss": 0.3188, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 1.6967071057192373, | |
| "grad_norm": 0.2545430511134805, | |
| "learning_rate": 9.516866204744932e-06, | |
| "loss": 0.3185, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 1.731369150779896, | |
| "grad_norm": 0.22010025072217687, | |
| "learning_rate": 9.115068840886418e-06, | |
| "loss": 0.3207, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.7660311958405546, | |
| "grad_norm": 0.20716728270454912, | |
| "learning_rate": 8.714706838604056e-06, | |
| "loss": 0.324, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.800693240901213, | |
| "grad_norm": 0.22069615326048286, | |
| "learning_rate": 8.316429586529616e-06, | |
| "loss": 0.3199, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.8353552859618718, | |
| "grad_norm": 0.23991187483045295, | |
| "learning_rate": 7.92088309182241e-06, | |
| "loss": 0.3211, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.8700173310225303, | |
| "grad_norm": 0.2151602225059441, | |
| "learning_rate": 7.5287089323433035e-06, | |
| "loss": 0.321, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.9046793760831888, | |
| "grad_norm": 0.20354266611589614, | |
| "learning_rate": 7.140543216013109e-06, | |
| "loss": 0.3169, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.9393414211438476, | |
| "grad_norm": 0.18489564276501388, | |
| "learning_rate": 6.757015549043174e-06, | |
| "loss": 0.3217, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.974003466204506, | |
| "grad_norm": 0.20059993408138063, | |
| "learning_rate": 6.378748014711834e-06, | |
| "loss": 0.3183, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 2.0069324090121317, | |
| "grad_norm": 0.20918648314950533, | |
| "learning_rate": 6.006354164343047e-06, | |
| "loss": 0.3143, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 2.0415944540727904, | |
| "grad_norm": 0.21120459555296436, | |
| "learning_rate": 5.640438022123898e-06, | |
| "loss": 0.2945, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 2.0762564991334487, | |
| "grad_norm": 0.19259124342978656, | |
| "learning_rate": 5.28159310537518e-06, | |
| "loss": 0.2964, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 2.1109185441941074, | |
| "grad_norm": 0.19456637048215897, | |
| "learning_rate": 4.930401461864099e-06, | |
| "loss": 0.2962, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 2.145580589254766, | |
| "grad_norm": 0.19317531728305584, | |
| "learning_rate": 4.587432725720687e-06, | |
| "loss": 0.2991, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 2.1802426343154244, | |
| "grad_norm": 0.1856506218049172, | |
| "learning_rate": 4.2532431934891646e-06, | |
| "loss": 0.2964, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 2.214904679376083, | |
| "grad_norm": 0.1856754928750834, | |
| "learning_rate": 3.9283749218128885e-06, | |
| "loss": 0.2995, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 2.249566724436742, | |
| "grad_norm": 0.184849964757185, | |
| "learning_rate": 3.6133548482165225e-06, | |
| "loss": 0.295, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 2.2842287694974, | |
| "grad_norm": 0.19565957222744196, | |
| "learning_rate": 3.308693936411421e-06, | |
| "loss": 0.2979, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 2.318890814558059, | |
| "grad_norm": 0.18136403678035376, | |
| "learning_rate": 3.0148863475106315e-06, | |
| "loss": 0.2979, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 2.3535528596187176, | |
| "grad_norm": 0.19360555697632642, | |
| "learning_rate": 2.73240863849777e-06, | |
| "loss": 0.2992, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 2.388214904679376, | |
| "grad_norm": 0.1775757947686451, | |
| "learning_rate": 2.4617189892498326e-06, | |
| "loss": 0.2935, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 2.4228769497400346, | |
| "grad_norm": 0.16925773266551042, | |
| "learning_rate": 2.2032564593677773e-06, | |
| "loss": 0.2954, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 2.4575389948006934, | |
| "grad_norm": 0.16916508338919892, | |
| "learning_rate": 1.9574402760202315e-06, | |
| "loss": 0.2955, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 2.4922010398613517, | |
| "grad_norm": 0.18075348143642583, | |
| "learning_rate": 1.7246691539555027e-06, | |
| "loss": 0.2922, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 2.5268630849220104, | |
| "grad_norm": 0.1850332211086371, | |
| "learning_rate": 1.5053206487847916e-06, | |
| "loss": 0.2955, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 2.561525129982669, | |
| "grad_norm": 0.16621824099266747, | |
| "learning_rate": 1.2997505445856085e-06, | |
| "loss": 0.2962, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 2.5961871750433274, | |
| "grad_norm": 0.16991443069932874, | |
| "learning_rate": 1.1082922768187098e-06, | |
| "loss": 0.2956, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 2.630849220103986, | |
| "grad_norm": 0.16292018023051463, | |
| "learning_rate": 9.312563914945461e-07, | |
| "loss": 0.2943, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 2.665511265164645, | |
| "grad_norm": 0.16599730368446816, | |
| "learning_rate": 7.689300414665124e-07, | |
| "loss": 0.2947, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 2.700173310225303, | |
| "grad_norm": 0.18732619784560842, | |
| "learning_rate": 6.215765206679569e-07, | |
| "loss": 0.2975, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 2.734835355285962, | |
| "grad_norm": 0.3651555640716956, | |
| "learning_rate": 4.894348370484648e-07, | |
| "loss": 0.2976, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 2.76949740034662, | |
| "grad_norm": 0.15975086160134086, | |
| "learning_rate": 3.7271932490209327e-07, | |
| "loss": 0.2961, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 2.804159445407279, | |
| "grad_norm": 0.16718571110994368, | |
| "learning_rate": 2.716192972163556e-07, | |
| "loss": 0.2969, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 2.8388214904679376, | |
| "grad_norm": 0.16340541384128912, | |
| "learning_rate": 1.8629873860586567e-07, | |
| "loss": 0.2958, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 2.873483535528596, | |
| "grad_norm": 0.17190266146012298, | |
| "learning_rate": 1.1689603932869664e-07, | |
| "loss": 0.3001, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 2.9081455805892547, | |
| "grad_norm": 0.1591106267026665, | |
| "learning_rate": 6.352377081687011e-08, | |
| "loss": 0.2943, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 2.9428076256499134, | |
| "grad_norm": 0.1674267163284321, | |
| "learning_rate": 2.6268503085089547e-08, | |
| "loss": 0.2982, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 2.9774696707105717, | |
| "grad_norm": 0.15742123030957314, | |
| "learning_rate": 5.190664313851068e-09, | |
| "loss": 0.2965, | |
| "step": 860 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 867, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 10000000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 5763988440743936.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |