| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 9.979036827195468, | |
| "eval_steps": 500, | |
| "global_step": 4410, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.11331444759206799, | |
| "grad_norm": 0.12750307449685053, | |
| "learning_rate": 1.977324263038549e-05, | |
| "loss": 0.2329, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.22662889518413598, | |
| "grad_norm": 0.10813549127897755, | |
| "learning_rate": 1.9546485260770977e-05, | |
| "loss": 0.0091, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.33994334277620397, | |
| "grad_norm": 0.06931579383532684, | |
| "learning_rate": 1.9319727891156463e-05, | |
| "loss": 0.0118, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.45325779036827196, | |
| "grad_norm": 0.05818309038266094, | |
| "learning_rate": 1.9092970521541953e-05, | |
| "loss": 0.004, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.56657223796034, | |
| "grad_norm": 0.04003839601543832, | |
| "learning_rate": 1.886621315192744e-05, | |
| "loss": 0.003, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.6798866855524079, | |
| "grad_norm": 0.05009081423940936, | |
| "learning_rate": 1.863945578231293e-05, | |
| "loss": 0.0025, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.7932011331444759, | |
| "grad_norm": 0.043467219593294984, | |
| "learning_rate": 1.8412698412698415e-05, | |
| "loss": 0.0022, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.9065155807365439, | |
| "grad_norm": 0.03628971874099008, | |
| "learning_rate": 1.81859410430839e-05, | |
| "loss": 0.002, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.0181303116147309, | |
| "grad_norm": 0.03627715893292353, | |
| "learning_rate": 1.795918367346939e-05, | |
| "loss": 0.0017, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.1314447592067989, | |
| "grad_norm": 0.032502169772711025, | |
| "learning_rate": 1.7732426303854877e-05, | |
| "loss": 0.0015, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.2447592067988669, | |
| "grad_norm": 0.03800243357726477, | |
| "learning_rate": 1.7505668934240366e-05, | |
| "loss": 0.0015, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.3580736543909349, | |
| "grad_norm": 0.03683426555432635, | |
| "learning_rate": 1.7278911564625852e-05, | |
| "loss": 0.0014, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.4713881019830028, | |
| "grad_norm": 0.030197812160831627, | |
| "learning_rate": 1.705215419501134e-05, | |
| "loss": 0.0013, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.5847025495750708, | |
| "grad_norm": 0.029519742432872642, | |
| "learning_rate": 1.6825396825396828e-05, | |
| "loss": 0.0013, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.6980169971671388, | |
| "grad_norm": 0.031772318618878956, | |
| "learning_rate": 1.6598639455782314e-05, | |
| "loss": 0.0012, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.8113314447592068, | |
| "grad_norm": 0.029907198061296256, | |
| "learning_rate": 1.63718820861678e-05, | |
| "loss": 0.0012, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.9246458923512748, | |
| "grad_norm": 0.0470370171523219, | |
| "learning_rate": 1.614512471655329e-05, | |
| "loss": 0.0013, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 2.0362606232294618, | |
| "grad_norm": 0.029126616132443304, | |
| "learning_rate": 1.5918367346938776e-05, | |
| "loss": 0.0011, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 2.1495750708215295, | |
| "grad_norm": 0.025826203236221357, | |
| "learning_rate": 1.5691609977324265e-05, | |
| "loss": 0.0009, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 2.2628895184135978, | |
| "grad_norm": 0.026184094780744977, | |
| "learning_rate": 1.546485260770975e-05, | |
| "loss": 0.001, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 2.376203966005666, | |
| "grad_norm": 0.02104685350387791, | |
| "learning_rate": 1.523809523809524e-05, | |
| "loss": 0.0009, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 2.4895184135977337, | |
| "grad_norm": 0.026962231904391707, | |
| "learning_rate": 1.5011337868480727e-05, | |
| "loss": 0.0009, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 2.6028328611898015, | |
| "grad_norm": 0.02386983035895265, | |
| "learning_rate": 1.4784580498866215e-05, | |
| "loss": 0.0009, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 2.7161473087818697, | |
| "grad_norm": 0.03135625507160954, | |
| "learning_rate": 1.4557823129251703e-05, | |
| "loss": 0.0009, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 2.829461756373938, | |
| "grad_norm": 0.021990829289825996, | |
| "learning_rate": 1.433106575963719e-05, | |
| "loss": 0.0008, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 2.9427762039660057, | |
| "grad_norm": 0.027975048412071702, | |
| "learning_rate": 1.4104308390022677e-05, | |
| "loss": 0.0008, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 3.0543909348441924, | |
| "grad_norm": 0.02125910446859899, | |
| "learning_rate": 1.3877551020408165e-05, | |
| "loss": 0.0008, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 3.1677053824362607, | |
| "grad_norm": 0.028731963252144437, | |
| "learning_rate": 1.3650793650793652e-05, | |
| "loss": 0.0007, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 3.2810198300283284, | |
| "grad_norm": 0.022590926398634654, | |
| "learning_rate": 1.342403628117914e-05, | |
| "loss": 0.0007, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 3.3943342776203966, | |
| "grad_norm": 0.021593995910696728, | |
| "learning_rate": 1.3197278911564626e-05, | |
| "loss": 0.0006, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 3.507648725212465, | |
| "grad_norm": 0.02666309225386757, | |
| "learning_rate": 1.2970521541950114e-05, | |
| "loss": 0.0006, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 3.6209631728045326, | |
| "grad_norm": 0.020932694755938938, | |
| "learning_rate": 1.2743764172335602e-05, | |
| "loss": 0.0006, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 3.7342776203966004, | |
| "grad_norm": 0.015749742123260534, | |
| "learning_rate": 1.251700680272109e-05, | |
| "loss": 0.0006, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 3.8475920679886686, | |
| "grad_norm": 0.02090008773458773, | |
| "learning_rate": 1.2290249433106578e-05, | |
| "loss": 0.0006, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 3.960906515580737, | |
| "grad_norm": 0.01537319079431243, | |
| "learning_rate": 1.2063492063492064e-05, | |
| "loss": 0.0006, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 4.0725212464589235, | |
| "grad_norm": 0.016456680181818843, | |
| "learning_rate": 1.1836734693877552e-05, | |
| "loss": 0.0005, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 4.185835694050992, | |
| "grad_norm": 0.014279201845786642, | |
| "learning_rate": 1.160997732426304e-05, | |
| "loss": 0.0004, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 4.299150141643059, | |
| "grad_norm": 0.013506804613582977, | |
| "learning_rate": 1.1383219954648527e-05, | |
| "loss": 0.0004, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 4.412464589235127, | |
| "grad_norm": 0.017575682502776736, | |
| "learning_rate": 1.1156462585034013e-05, | |
| "loss": 0.0004, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 4.5257790368271955, | |
| "grad_norm": 0.020440730932576385, | |
| "learning_rate": 1.0929705215419501e-05, | |
| "loss": 0.0004, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 4.639093484419264, | |
| "grad_norm": 0.01834736863465592, | |
| "learning_rate": 1.0702947845804989e-05, | |
| "loss": 0.0004, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 4.752407932011332, | |
| "grad_norm": 0.017860961767452527, | |
| "learning_rate": 1.0476190476190477e-05, | |
| "loss": 0.0004, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 4.865722379603399, | |
| "grad_norm": 0.020594689995493427, | |
| "learning_rate": 1.0249433106575966e-05, | |
| "loss": 0.0004, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 4.9790368271954675, | |
| "grad_norm": 0.019468621367640594, | |
| "learning_rate": 1.0022675736961451e-05, | |
| "loss": 0.0004, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 5.090651558073654, | |
| "grad_norm": 0.01815010750870599, | |
| "learning_rate": 9.795918367346939e-06, | |
| "loss": 0.0003, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 5.203966005665722, | |
| "grad_norm": 0.013915876229594861, | |
| "learning_rate": 9.569160997732427e-06, | |
| "loss": 0.0003, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 5.317280453257791, | |
| "grad_norm": 0.01012229778431893, | |
| "learning_rate": 9.342403628117914e-06, | |
| "loss": 0.0003, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 5.430594900849858, | |
| "grad_norm": 0.014316093650605412, | |
| "learning_rate": 9.115646258503402e-06, | |
| "loss": 0.0003, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 5.543909348441926, | |
| "grad_norm": 0.017567253599829386, | |
| "learning_rate": 8.888888888888888e-06, | |
| "loss": 0.0003, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 5.657223796033994, | |
| "grad_norm": 0.0226569072606562, | |
| "learning_rate": 8.662131519274378e-06, | |
| "loss": 0.0003, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 5.770538243626063, | |
| "grad_norm": 0.01682883062787434, | |
| "learning_rate": 8.435374149659866e-06, | |
| "loss": 0.0003, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 5.88385269121813, | |
| "grad_norm": 0.013391313567851492, | |
| "learning_rate": 8.208616780045352e-06, | |
| "loss": 0.0003, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 5.997167138810198, | |
| "grad_norm": 0.01778560595671548, | |
| "learning_rate": 7.98185941043084e-06, | |
| "loss": 0.0003, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 6.108781869688385, | |
| "grad_norm": 0.013117817816572433, | |
| "learning_rate": 7.755102040816327e-06, | |
| "loss": 0.0002, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 6.222096317280453, | |
| "grad_norm": 0.017618823564966208, | |
| "learning_rate": 7.528344671201815e-06, | |
| "loss": 0.0002, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 6.335410764872521, | |
| "grad_norm": 0.0096548743047912, | |
| "learning_rate": 7.301587301587301e-06, | |
| "loss": 0.0002, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 6.4487252124645895, | |
| "grad_norm": 0.010097830595674354, | |
| "learning_rate": 7.07482993197279e-06, | |
| "loss": 0.0002, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 6.562039660056657, | |
| "grad_norm": 0.010910197208914625, | |
| "learning_rate": 6.848072562358277e-06, | |
| "loss": 0.0002, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 6.675354107648725, | |
| "grad_norm": 0.012241078806903706, | |
| "learning_rate": 6.621315192743765e-06, | |
| "loss": 0.0002, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 6.788668555240793, | |
| "grad_norm": 0.017003802621226093, | |
| "learning_rate": 6.394557823129253e-06, | |
| "loss": 0.0002, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 6.9019830028328615, | |
| "grad_norm": 0.014847405573922268, | |
| "learning_rate": 6.16780045351474e-06, | |
| "loss": 0.0002, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 7.013597733711048, | |
| "grad_norm": 0.018305480145040905, | |
| "learning_rate": 5.9410430839002275e-06, | |
| "loss": 0.0002, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 7.126912181303116, | |
| "grad_norm": 0.012819862819578573, | |
| "learning_rate": 5.7142857142857145e-06, | |
| "loss": 0.0001, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 7.240226628895184, | |
| "grad_norm": 0.013371528432039266, | |
| "learning_rate": 5.487528344671202e-06, | |
| "loss": 0.0001, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 7.353541076487252, | |
| "grad_norm": 0.005633164169599245, | |
| "learning_rate": 5.260770975056689e-06, | |
| "loss": 0.0001, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 7.46685552407932, | |
| "grad_norm": 0.01061442292135964, | |
| "learning_rate": 5.034013605442177e-06, | |
| "loss": 0.0001, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 7.580169971671388, | |
| "grad_norm": 0.00907172882592498, | |
| "learning_rate": 4.807256235827665e-06, | |
| "loss": 0.0001, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 7.693484419263456, | |
| "grad_norm": 0.010227726380358893, | |
| "learning_rate": 4.580498866213152e-06, | |
| "loss": 0.0001, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 7.806798866855524, | |
| "grad_norm": 0.008230239860332859, | |
| "learning_rate": 4.35374149659864e-06, | |
| "loss": 0.0001, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 7.920113314447592, | |
| "grad_norm": 0.013256768674645692, | |
| "learning_rate": 4.126984126984127e-06, | |
| "loss": 0.0001, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 8.03172804532578, | |
| "grad_norm": 0.01006097891003743, | |
| "learning_rate": 3.9002267573696154e-06, | |
| "loss": 0.0001, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 8.145042492917847, | |
| "grad_norm": 0.009602466515273548, | |
| "learning_rate": 3.6734693877551024e-06, | |
| "loss": 0.0001, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 8.258356940509914, | |
| "grad_norm": 0.005795654820959573, | |
| "learning_rate": 3.44671201814059e-06, | |
| "loss": 0.0001, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 8.371671388101984, | |
| "grad_norm": 0.0034753960503702283, | |
| "learning_rate": 3.2199546485260772e-06, | |
| "loss": 0.0001, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 8.48498583569405, | |
| "grad_norm": 0.006716712372637811, | |
| "learning_rate": 2.993197278911565e-06, | |
| "loss": 0.0001, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 8.598300283286118, | |
| "grad_norm": 0.005691017397956627, | |
| "learning_rate": 2.7664399092970525e-06, | |
| "loss": 0.0001, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 8.711614730878187, | |
| "grad_norm": 0.005512521725963195, | |
| "learning_rate": 2.53968253968254e-06, | |
| "loss": 0.0001, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 8.824929178470255, | |
| "grad_norm": 0.004574835289146862, | |
| "learning_rate": 2.3129251700680273e-06, | |
| "loss": 0.0001, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 8.938243626062324, | |
| "grad_norm": 0.005810520530240234, | |
| "learning_rate": 2.086167800453515e-06, | |
| "loss": 0.0001, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 9.04985835694051, | |
| "grad_norm": 0.0032799015697558206, | |
| "learning_rate": 1.8594104308390023e-06, | |
| "loss": 0.0001, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 9.163172804532579, | |
| "grad_norm": 0.0021797625948262924, | |
| "learning_rate": 1.6326530612244897e-06, | |
| "loss": 0.0, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 9.276487252124646, | |
| "grad_norm": 0.003847390080167819, | |
| "learning_rate": 1.4058956916099775e-06, | |
| "loss": 0.0, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 9.389801699716713, | |
| "grad_norm": 0.0025440319025627875, | |
| "learning_rate": 1.179138321995465e-06, | |
| "loss": 0.0, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 9.503116147308782, | |
| "grad_norm": 0.002651261609298388, | |
| "learning_rate": 9.523809523809525e-07, | |
| "loss": 0.0, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 9.61643059490085, | |
| "grad_norm": 0.0033509868369888817, | |
| "learning_rate": 7.2562358276644e-07, | |
| "loss": 0.0, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 9.729745042492917, | |
| "grad_norm": 0.011727033108113469, | |
| "learning_rate": 4.988662131519275e-07, | |
| "loss": 0.0, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 9.843059490084986, | |
| "grad_norm": 0.005342769601421854, | |
| "learning_rate": 2.72108843537415e-07, | |
| "loss": 0.0, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 9.956373937677053, | |
| "grad_norm": 0.004383322417463679, | |
| "learning_rate": 4.53514739229025e-08, | |
| "loss": 0.0, | |
| "step": 4400 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 4410, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 10, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.446988850023629e+16, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |