{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 867, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03466204506065858, "grad_norm": 1.614887252818893, "learning_rate": 2.0689655172413796e-06, "loss": 0.6065, "step": 10 }, { "epoch": 0.06932409012131716, "grad_norm": 1.2876954446946685, "learning_rate": 4.367816091954023e-06, "loss": 0.5347, "step": 20 }, { "epoch": 0.10398613518197573, "grad_norm": 0.4989329651688657, "learning_rate": 6.666666666666667e-06, "loss": 0.4714, "step": 30 }, { "epoch": 0.1386481802426343, "grad_norm": 0.36809008403990967, "learning_rate": 8.965517241379312e-06, "loss": 0.4476, "step": 40 }, { "epoch": 0.1733102253032929, "grad_norm": 0.3015275688562798, "learning_rate": 1.1264367816091955e-05, "loss": 0.4157, "step": 50 }, { "epoch": 0.20797227036395147, "grad_norm": 0.23325709345466544, "learning_rate": 1.3563218390804598e-05, "loss": 0.4014, "step": 60 }, { "epoch": 0.24263431542461006, "grad_norm": 0.2599261399956402, "learning_rate": 1.586206896551724e-05, "loss": 0.3918, "step": 70 }, { "epoch": 0.2772963604852686, "grad_norm": 0.23676671921370496, "learning_rate": 1.8160919540229885e-05, "loss": 0.391, "step": 80 }, { "epoch": 0.3119584055459272, "grad_norm": 0.24237959936856435, "learning_rate": 1.9999675557165282e-05, "loss": 0.3859, "step": 90 }, { "epoch": 0.3466204506065858, "grad_norm": 0.4446903057333395, "learning_rate": 1.998832226832327e-05, "loss": 0.3805, "step": 100 }, { "epoch": 0.38128249566724437, "grad_norm": 0.3211792221924684, "learning_rate": 1.9960767884236132e-05, "loss": 0.3806, "step": 110 }, { "epoch": 0.41594454072790293, "grad_norm": 0.27403637242653317, "learning_rate": 1.9917057098215624e-05, "loss": 0.3805, "step": 120 }, { "epoch": 0.4506065857885615, "grad_norm": 0.2508084333447016, "learning_rate": 1.985726080931651e-05, "loss": 0.3741, "step": 130 }, { "epoch": 0.4852686308492201, "grad_norm": 0.28374356186453464, "learning_rate": 1.9781476007338058e-05, "loss": 0.3739, "step": 140 }, { "epoch": 0.5199306759098787, "grad_norm": 0.35539697186524694, "learning_rate": 1.968982561550621e-05, "loss": 0.3678, "step": 150 }, { "epoch": 0.5545927209705372, "grad_norm": 0.2613425020538643, "learning_rate": 1.9582458291091664e-05, "loss": 0.3647, "step": 160 }, { "epoch": 0.5892547660311959, "grad_norm": 0.3109549671591506, "learning_rate": 1.9459548184287254e-05, "loss": 0.3638, "step": 170 }, { "epoch": 0.6239168110918544, "grad_norm": 0.3368502474991166, "learning_rate": 1.932129465573568e-05, "loss": 0.3626, "step": 180 }, { "epoch": 0.658578856152513, "grad_norm": 0.2670073589604095, "learning_rate": 1.9167921953165827e-05, "loss": 0.3635, "step": 190 }, { "epoch": 0.6932409012131716, "grad_norm": 0.2916079955306212, "learning_rate": 1.8999678847662124e-05, "loss": 0.3597, "step": 200 }, { "epoch": 0.7279029462738301, "grad_norm": 0.30166318114659063, "learning_rate": 1.881683823015694e-05, "loss": 0.3555, "step": 210 }, { "epoch": 0.7625649913344887, "grad_norm": 0.2713616440874116, "learning_rate": 1.8619696668800494e-05, "loss": 0.3586, "step": 220 }, { "epoch": 0.7972270363951474, "grad_norm": 0.26515561290090994, "learning_rate": 1.8408573927926225e-05, "loss": 0.3617, "step": 230 }, { "epoch": 0.8318890814558059, "grad_norm": 0.2813509663127385, "learning_rate": 1.818381244939187e-05, "loss": 0.3556, "step": 240 }, { "epoch": 0.8665511265164645, "grad_norm": 0.346388313968459, "learning_rate": 1.7945776797137544e-05, "loss": 0.3517, "step": 250 }, { "epoch": 0.901213171577123, "grad_norm": 0.25559948474750327, "learning_rate": 1.769485306586166e-05, "loss": 0.3531, "step": 260 }, { "epoch": 0.9358752166377816, "grad_norm": 0.23567303867404224, "learning_rate": 1.7431448254773943e-05, "loss": 0.354, "step": 270 }, { "epoch": 0.9705372616984402, "grad_norm": 0.2507120585300497, "learning_rate": 1.715598960744121e-05, "loss": 0.353, "step": 280 }, { "epoch": 1.0034662045060658, "grad_norm": 0.3235414509087519, "learning_rate": 1.6868923918796753e-05, "loss": 0.3479, "step": 290 }, { "epoch": 1.0381282495667243, "grad_norm": 0.29249432292924354, "learning_rate": 1.657071681043731e-05, "loss": 0.3297, "step": 300 }, { "epoch": 1.072790294627383, "grad_norm": 0.28107913308047194, "learning_rate": 1.626185197538314e-05, "loss": 0.3216, "step": 310 }, { "epoch": 1.1074523396880416, "grad_norm": 0.2388850688621256, "learning_rate": 1.5942830393526176e-05, "loss": 0.3313, "step": 320 }, { "epoch": 1.1421143847487, "grad_norm": 0.23828446739527873, "learning_rate": 1.561416951903881e-05, "loss": 0.3274, "step": 330 }, { "epoch": 1.1767764298093588, "grad_norm": 0.24738586978364616, "learning_rate": 1.527640244106133e-05, "loss": 0.3261, "step": 340 }, { "epoch": 1.2114384748700173, "grad_norm": 0.24976026828074924, "learning_rate": 1.4930077019029376e-05, "loss": 0.3277, "step": 350 }, { "epoch": 1.2461005199306758, "grad_norm": 0.31665221410517264, "learning_rate": 1.4575754994043956e-05, "loss": 0.3242, "step": 360 }, { "epoch": 1.2807625649913346, "grad_norm": 0.25301472762117483, "learning_rate": 1.4214011077725293e-05, "loss": 0.3296, "step": 370 }, { "epoch": 1.315424610051993, "grad_norm": 0.3032496960590633, "learning_rate": 1.3845432020028511e-05, "loss": 0.3243, "step": 380 }, { "epoch": 1.3500866551126516, "grad_norm": 0.2144539306745234, "learning_rate": 1.347061565753303e-05, "loss": 0.3223, "step": 390 }, { "epoch": 1.38474870017331, "grad_norm": 0.25316588193158246, "learning_rate": 1.3090169943749475e-05, "loss": 0.3269, "step": 400 }, { "epoch": 1.4194107452339688, "grad_norm": 0.23335401936650935, "learning_rate": 1.270471196301684e-05, "loss": 0.3233, "step": 410 }, { "epoch": 1.4540727902946273, "grad_norm": 0.25740474483694903, "learning_rate": 1.2314866929589434e-05, "loss": 0.3257, "step": 420 }, { "epoch": 1.4887348353552858, "grad_norm": 0.22583757323894366, "learning_rate": 1.1921267173537085e-05, "loss": 0.3258, "step": 430 }, { "epoch": 1.5233968804159446, "grad_norm": 0.21072089179588632, "learning_rate": 1.1524551115103455e-05, "loss": 0.3198, "step": 440 }, { "epoch": 1.558058925476603, "grad_norm": 0.2075217279484371, "learning_rate": 1.1125362229186056e-05, "loss": 0.3213, "step": 450 }, { "epoch": 1.5927209705372616, "grad_norm": 0.20733188488510618, "learning_rate": 1.0724348001617626e-05, "loss": 0.3189, "step": 460 }, { "epoch": 1.6273830155979203, "grad_norm": 0.23700844887424702, "learning_rate": 1.0322158878941733e-05, "loss": 0.3238, "step": 470 }, { "epoch": 1.6620450606585788, "grad_norm": 0.2303410797551013, "learning_rate": 9.919447213386103e-06, "loss": 0.3188, "step": 480 }, { "epoch": 1.6967071057192373, "grad_norm": 0.2545430511134805, "learning_rate": 9.516866204744932e-06, "loss": 0.3185, "step": 490 }, { "epoch": 1.731369150779896, "grad_norm": 0.22010025072217687, "learning_rate": 9.115068840886418e-06, "loss": 0.3207, "step": 500 }, { "epoch": 1.7660311958405546, "grad_norm": 0.20716728270454912, "learning_rate": 8.714706838604056e-06, "loss": 0.324, "step": 510 }, { "epoch": 1.800693240901213, "grad_norm": 0.22069615326048286, "learning_rate": 8.316429586529616e-06, "loss": 0.3199, "step": 520 }, { "epoch": 1.8353552859618718, "grad_norm": 0.23991187483045295, "learning_rate": 7.92088309182241e-06, "loss": 0.3211, "step": 530 }, { "epoch": 1.8700173310225303, "grad_norm": 0.2151602225059441, "learning_rate": 7.5287089323433035e-06, "loss": 0.321, "step": 540 }, { "epoch": 1.9046793760831888, "grad_norm": 0.20354266611589614, "learning_rate": 7.140543216013109e-06, "loss": 0.3169, "step": 550 }, { "epoch": 1.9393414211438476, "grad_norm": 0.18489564276501388, "learning_rate": 6.757015549043174e-06, "loss": 0.3217, "step": 560 }, { "epoch": 1.974003466204506, "grad_norm": 0.20059993408138063, "learning_rate": 6.378748014711834e-06, "loss": 0.3183, "step": 570 }, { "epoch": 2.0069324090121317, "grad_norm": 0.20918648314950533, "learning_rate": 6.006354164343047e-06, "loss": 0.3143, "step": 580 }, { "epoch": 2.0415944540727904, "grad_norm": 0.21120459555296436, "learning_rate": 5.640438022123898e-06, "loss": 0.2945, "step": 590 }, { "epoch": 2.0762564991334487, "grad_norm": 0.19259124342978656, "learning_rate": 5.28159310537518e-06, "loss": 0.2964, "step": 600 }, { "epoch": 2.1109185441941074, "grad_norm": 0.19456637048215897, "learning_rate": 4.930401461864099e-06, "loss": 0.2962, "step": 610 }, { "epoch": 2.145580589254766, "grad_norm": 0.19317531728305584, "learning_rate": 4.587432725720687e-06, "loss": 0.2991, "step": 620 }, { "epoch": 2.1802426343154244, "grad_norm": 0.1856506218049172, "learning_rate": 4.2532431934891646e-06, "loss": 0.2964, "step": 630 }, { "epoch": 2.214904679376083, "grad_norm": 0.1856754928750834, "learning_rate": 3.9283749218128885e-06, "loss": 0.2995, "step": 640 }, { "epoch": 2.249566724436742, "grad_norm": 0.184849964757185, "learning_rate": 3.6133548482165225e-06, "loss": 0.295, "step": 650 }, { "epoch": 2.2842287694974, "grad_norm": 0.19565957222744196, "learning_rate": 3.308693936411421e-06, "loss": 0.2979, "step": 660 }, { "epoch": 2.318890814558059, "grad_norm": 0.18136403678035376, "learning_rate": 3.0148863475106315e-06, "loss": 0.2979, "step": 670 }, { "epoch": 2.3535528596187176, "grad_norm": 0.19360555697632642, "learning_rate": 2.73240863849777e-06, "loss": 0.2992, "step": 680 }, { "epoch": 2.388214904679376, "grad_norm": 0.1775757947686451, "learning_rate": 2.4617189892498326e-06, "loss": 0.2935, "step": 690 }, { "epoch": 2.4228769497400346, "grad_norm": 0.16925773266551042, "learning_rate": 2.2032564593677773e-06, "loss": 0.2954, "step": 700 }, { "epoch": 2.4575389948006934, "grad_norm": 0.16916508338919892, "learning_rate": 1.9574402760202315e-06, "loss": 0.2955, "step": 710 }, { "epoch": 2.4922010398613517, "grad_norm": 0.18075348143642583, "learning_rate": 1.7246691539555027e-06, "loss": 0.2922, "step": 720 }, { "epoch": 2.5268630849220104, "grad_norm": 0.1850332211086371, "learning_rate": 1.5053206487847916e-06, "loss": 0.2955, "step": 730 }, { "epoch": 2.561525129982669, "grad_norm": 0.16621824099266747, "learning_rate": 1.2997505445856085e-06, "loss": 0.2962, "step": 740 }, { "epoch": 2.5961871750433274, "grad_norm": 0.16991443069932874, "learning_rate": 1.1082922768187098e-06, "loss": 0.2956, "step": 750 }, { "epoch": 2.630849220103986, "grad_norm": 0.16292018023051463, "learning_rate": 9.312563914945461e-07, "loss": 0.2943, "step": 760 }, { "epoch": 2.665511265164645, "grad_norm": 0.16599730368446816, "learning_rate": 7.689300414665124e-07, "loss": 0.2947, "step": 770 }, { "epoch": 2.700173310225303, "grad_norm": 0.18732619784560842, "learning_rate": 6.215765206679569e-07, "loss": 0.2975, "step": 780 }, { "epoch": 2.734835355285962, "grad_norm": 0.3651555640716956, "learning_rate": 4.894348370484648e-07, "loss": 0.2976, "step": 790 }, { "epoch": 2.76949740034662, "grad_norm": 0.15975086160134086, "learning_rate": 3.7271932490209327e-07, "loss": 0.2961, "step": 800 }, { "epoch": 2.804159445407279, "grad_norm": 0.16718571110994368, "learning_rate": 2.716192972163556e-07, "loss": 0.2969, "step": 810 }, { "epoch": 2.8388214904679376, "grad_norm": 0.16340541384128912, "learning_rate": 1.8629873860586567e-07, "loss": 0.2958, "step": 820 }, { "epoch": 2.873483535528596, "grad_norm": 0.17190266146012298, "learning_rate": 1.1689603932869664e-07, "loss": 0.3001, "step": 830 }, { "epoch": 2.9081455805892547, "grad_norm": 0.1591106267026665, "learning_rate": 6.352377081687011e-08, "loss": 0.2943, "step": 840 }, { "epoch": 2.9428076256499134, "grad_norm": 0.1674267163284321, "learning_rate": 2.6268503085089547e-08, "loss": 0.2982, "step": 850 }, { "epoch": 2.9774696707105717, "grad_norm": 0.15742123030957314, "learning_rate": 5.190664313851068e-09, "loss": 0.2965, "step": 860 } ], "logging_steps": 10, "max_steps": 867, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 10000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5763988440743936.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }