| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 9.951406649616368, | |
| "eval_steps": 500, | |
| "global_step": 1950, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.05115089514066496, | |
| "grad_norm": 1.3708726167678833, | |
| "learning_rate": 9.999351124856874e-05, | |
| "loss": 2.3175, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.10230179028132992, | |
| "grad_norm": 0.3893856704235077, | |
| "learning_rate": 9.997404667843075e-05, | |
| "loss": 0.665, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.1534526854219949, | |
| "grad_norm": 0.2740652859210968, | |
| "learning_rate": 9.994161134161634e-05, | |
| "loss": 0.4255, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.20460358056265984, | |
| "grad_norm": 0.3207554221153259, | |
| "learning_rate": 9.989621365671902e-05, | |
| "loss": 0.3868, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.2557544757033248, | |
| "grad_norm": 0.4839524030685425, | |
| "learning_rate": 9.983786540671051e-05, | |
| "loss": 0.3614, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.3069053708439898, | |
| "grad_norm": 0.5630654692649841, | |
| "learning_rate": 9.976658173588244e-05, | |
| "loss": 0.3329, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.35805626598465473, | |
| "grad_norm": 0.15182480216026306, | |
| "learning_rate": 9.968238114591566e-05, | |
| "loss": 0.2914, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.4092071611253197, | |
| "grad_norm": 0.1376497447490692, | |
| "learning_rate": 9.95852854910781e-05, | |
| "loss": 0.2776, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.46035805626598464, | |
| "grad_norm": 0.1654089242219925, | |
| "learning_rate": 9.947531997255256e-05, | |
| "loss": 0.2706, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.5115089514066496, | |
| "grad_norm": 0.14509035646915436, | |
| "learning_rate": 9.935251313189564e-05, | |
| "loss": 0.2659, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.5626598465473146, | |
| "grad_norm": 0.20994672179222107, | |
| "learning_rate": 9.921689684362989e-05, | |
| "loss": 0.2699, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.6138107416879796, | |
| "grad_norm": 0.20592832565307617, | |
| "learning_rate": 9.906850630697068e-05, | |
| "loss": 0.2613, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.6649616368286445, | |
| "grad_norm": 0.23180726170539856, | |
| "learning_rate": 9.890738003669029e-05, | |
| "loss": 0.256, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.7161125319693095, | |
| "grad_norm": 0.1388358324766159, | |
| "learning_rate": 9.87335598531214e-05, | |
| "loss": 0.2539, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.7672634271099744, | |
| "grad_norm": 0.12840186059474945, | |
| "learning_rate": 9.85470908713026e-05, | |
| "loss": 0.256, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.8184143222506394, | |
| "grad_norm": 0.10598399490118027, | |
| "learning_rate": 9.834802148926882e-05, | |
| "loss": 0.2524, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.8695652173913043, | |
| "grad_norm": 0.14293241500854492, | |
| "learning_rate": 9.813640337548954e-05, | |
| "loss": 0.2529, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.9207161125319693, | |
| "grad_norm": 0.13569538295269012, | |
| "learning_rate": 9.791229145545831e-05, | |
| "loss": 0.2492, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.9718670076726342, | |
| "grad_norm": 0.13984881341457367, | |
| "learning_rate": 9.767574389743682e-05, | |
| "loss": 0.2486, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 1.020460358056266, | |
| "grad_norm": 0.142480731010437, | |
| "learning_rate": 9.742682209735727e-05, | |
| "loss": 0.2466, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 1.0716112531969308, | |
| "grad_norm": 0.13742870092391968, | |
| "learning_rate": 9.716559066288715e-05, | |
| "loss": 0.2481, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 1.1227621483375958, | |
| "grad_norm": 0.12724286317825317, | |
| "learning_rate": 9.689211739666023e-05, | |
| "loss": 0.2425, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 1.1739130434782608, | |
| "grad_norm": 0.10457868129014969, | |
| "learning_rate": 9.66064732786784e-05, | |
| "loss": 0.2477, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 1.2250639386189257, | |
| "grad_norm": 0.12810398638248444, | |
| "learning_rate": 9.630873244788883e-05, | |
| "loss": 0.2421, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 1.2762148337595907, | |
| "grad_norm": 0.15804988145828247, | |
| "learning_rate": 9.599897218294122e-05, | |
| "loss": 0.2416, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 1.3273657289002558, | |
| "grad_norm": 0.15013903379440308, | |
| "learning_rate": 9.567727288213005e-05, | |
| "loss": 0.2426, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 1.3785166240409208, | |
| "grad_norm": 0.18141338229179382, | |
| "learning_rate": 9.534371804252728e-05, | |
| "loss": 0.2432, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 1.4296675191815857, | |
| "grad_norm": 0.12825658917427063, | |
| "learning_rate": 9.49983942383106e-05, | |
| "loss": 0.2452, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 1.4808184143222507, | |
| "grad_norm": 0.13577987253665924, | |
| "learning_rate": 9.464139109829321e-05, | |
| "loss": 0.2378, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 1.5319693094629157, | |
| "grad_norm": 0.10094985365867615, | |
| "learning_rate": 9.42728012826605e-05, | |
| "loss": 0.238, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.5831202046035806, | |
| "grad_norm": 0.1287088394165039, | |
| "learning_rate": 9.389272045892024e-05, | |
| "loss": 0.2367, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 1.6342710997442456, | |
| "grad_norm": 0.13634280860424042, | |
| "learning_rate": 9.350124727707197e-05, | |
| "loss": 0.2365, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 1.6854219948849105, | |
| "grad_norm": 0.10785108059644699, | |
| "learning_rate": 9.309848334400246e-05, | |
| "loss": 0.2369, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 1.7365728900255755, | |
| "grad_norm": 0.10421392321586609, | |
| "learning_rate": 9.268453319711363e-05, | |
| "loss": 0.2351, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 1.7877237851662404, | |
| "grad_norm": 0.1302565187215805, | |
| "learning_rate": 9.225950427718975e-05, | |
| "loss": 0.2398, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 1.8388746803069054, | |
| "grad_norm": 0.11869315803050995, | |
| "learning_rate": 9.182350690051133e-05, | |
| "loss": 0.236, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 1.8900255754475703, | |
| "grad_norm": 0.12082216143608093, | |
| "learning_rate": 9.13766542302225e-05, | |
| "loss": 0.2404, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 1.9411764705882353, | |
| "grad_norm": 0.12104278802871704, | |
| "learning_rate": 9.091906224695935e-05, | |
| "loss": 0.234, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 1.9923273657289002, | |
| "grad_norm": 0.09817857295274734, | |
| "learning_rate": 9.045084971874738e-05, | |
| "loss": 0.2378, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 2.040920716112532, | |
| "grad_norm": 0.0962359830737114, | |
| "learning_rate": 8.997213817017507e-05, | |
| "loss": 0.2319, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 2.0920716112531967, | |
| "grad_norm": 0.10670891404151917, | |
| "learning_rate": 8.948305185085225e-05, | |
| "loss": 0.232, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 2.1432225063938617, | |
| "grad_norm": 0.10067127645015717, | |
| "learning_rate": 8.898371770316111e-05, | |
| "loss": 0.2333, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 2.1943734015345266, | |
| "grad_norm": 0.11361632496118546, | |
| "learning_rate": 8.847426532930831e-05, | |
| "loss": 0.2332, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 2.2455242966751916, | |
| "grad_norm": 0.10807089507579803, | |
| "learning_rate": 8.795482695768658e-05, | |
| "loss": 0.2298, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 2.296675191815857, | |
| "grad_norm": 0.10500900447368622, | |
| "learning_rate": 8.742553740855506e-05, | |
| "loss": 0.2326, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 2.3478260869565215, | |
| "grad_norm": 0.09852053970098495, | |
| "learning_rate": 8.688653405904652e-05, | |
| "loss": 0.233, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 2.398976982097187, | |
| "grad_norm": 0.09985852241516113, | |
| "learning_rate": 8.633795680751116e-05, | |
| "loss": 0.2347, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 2.4501278772378514, | |
| "grad_norm": 0.09869720041751862, | |
| "learning_rate": 8.577994803720606e-05, | |
| "loss": 0.2316, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 2.501278772378517, | |
| "grad_norm": 0.10101813822984695, | |
| "learning_rate": 8.521265257933948e-05, | |
| "loss": 0.2313, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 2.5524296675191813, | |
| "grad_norm": 0.10905516147613525, | |
| "learning_rate": 8.463621767547998e-05, | |
| "loss": 0.2322, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 2.6035805626598467, | |
| "grad_norm": 0.09563067555427551, | |
| "learning_rate": 8.405079293933986e-05, | |
| "loss": 0.232, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 2.6547314578005117, | |
| "grad_norm": 0.10619215667247772, | |
| "learning_rate": 8.345653031794292e-05, | |
| "loss": 0.2314, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 2.7058823529411766, | |
| "grad_norm": 0.09715953469276428, | |
| "learning_rate": 8.285358405218655e-05, | |
| "loss": 0.2304, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 2.7570332480818416, | |
| "grad_norm": 0.09010881185531616, | |
| "learning_rate": 8.224211063680853e-05, | |
| "loss": 0.2332, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 2.8081841432225065, | |
| "grad_norm": 0.10342929512262344, | |
| "learning_rate": 8.162226877976887e-05, | |
| "loss": 0.2299, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 2.8593350383631715, | |
| "grad_norm": 0.10652179270982742, | |
| "learning_rate": 8.099421936105702e-05, | |
| "loss": 0.23, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 2.9104859335038364, | |
| "grad_norm": 0.09231873601675034, | |
| "learning_rate": 8.035812539093557e-05, | |
| "loss": 0.2292, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 2.9616368286445014, | |
| "grad_norm": 0.12014494836330414, | |
| "learning_rate": 7.971415196763088e-05, | |
| "loss": 0.2318, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 3.010230179028133, | |
| "grad_norm": 0.11261285841464996, | |
| "learning_rate": 7.906246623448183e-05, | |
| "loss": 0.2272, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 3.061381074168798, | |
| "grad_norm": 0.10248834639787674, | |
| "learning_rate": 7.840323733655778e-05, | |
| "loss": 0.2287, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 3.112531969309463, | |
| "grad_norm": 0.09773645550012589, | |
| "learning_rate": 7.773663637675694e-05, | |
| "loss": 0.2264, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 3.163682864450128, | |
| "grad_norm": 0.11643853783607483, | |
| "learning_rate": 7.706283637139658e-05, | |
| "loss": 0.2287, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 3.214833759590793, | |
| "grad_norm": 0.09477915614843369, | |
| "learning_rate": 7.638201220530665e-05, | |
| "loss": 0.2239, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 3.265984654731458, | |
| "grad_norm": 0.09172279387712479, | |
| "learning_rate": 7.569434058643844e-05, | |
| "loss": 0.2276, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 3.317135549872123, | |
| "grad_norm": 0.10399650037288666, | |
| "learning_rate": 7.500000000000001e-05, | |
| "loss": 0.2254, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 3.368286445012788, | |
| "grad_norm": 0.11293193697929382, | |
| "learning_rate": 7.42991706621303e-05, | |
| "loss": 0.2305, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 3.419437340153453, | |
| "grad_norm": 0.0884748324751854, | |
| "learning_rate": 7.35920344731241e-05, | |
| "loss": 0.228, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 3.4705882352941178, | |
| "grad_norm": 0.08815360069274902, | |
| "learning_rate": 7.287877497021978e-05, | |
| "loss": 0.226, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 3.5217391304347827, | |
| "grad_norm": 0.10090487450361252, | |
| "learning_rate": 7.215957727996207e-05, | |
| "loss": 0.2244, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 3.5728900255754477, | |
| "grad_norm": 0.09220373630523682, | |
| "learning_rate": 7.143462807015271e-05, | |
| "loss": 0.225, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 3.6240409207161126, | |
| "grad_norm": 0.0896851196885109, | |
| "learning_rate": 7.07041155014006e-05, | |
| "loss": 0.2283, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 3.6751918158567776, | |
| "grad_norm": 0.09017707407474518, | |
| "learning_rate": 6.996822917828477e-05, | |
| "loss": 0.2259, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 3.7263427109974425, | |
| "grad_norm": 0.09904467314481735, | |
| "learning_rate": 6.922716010014255e-05, | |
| "loss": 0.2286, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 3.7774936061381075, | |
| "grad_norm": 0.09329435229301453, | |
| "learning_rate": 6.848110061149556e-05, | |
| "loss": 0.2273, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 3.8286445012787724, | |
| "grad_norm": 0.09724679589271545, | |
| "learning_rate": 6.773024435212678e-05, | |
| "loss": 0.2258, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 3.8797953964194374, | |
| "grad_norm": 0.10546013712882996, | |
| "learning_rate": 6.697478620682137e-05, | |
| "loss": 0.2258, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 3.9309462915601023, | |
| "grad_norm": 0.0924544557929039, | |
| "learning_rate": 6.621492225478414e-05, | |
| "loss": 0.2259, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 3.9820971867007673, | |
| "grad_norm": 0.09968756884336472, | |
| "learning_rate": 6.545084971874738e-05, | |
| "loss": 0.2268, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 4.030690537084399, | |
| "grad_norm": 0.09283004701137543, | |
| "learning_rate": 6.468276691378155e-05, | |
| "loss": 0.2279, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 4.081841432225064, | |
| "grad_norm": 0.10614680498838425, | |
| "learning_rate": 6.391087319582264e-05, | |
| "loss": 0.2232, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 4.132992327365729, | |
| "grad_norm": 0.10588511824607849, | |
| "learning_rate": 6.313536890992935e-05, | |
| "loss": 0.2231, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 4.1841432225063935, | |
| "grad_norm": 0.0895189642906189, | |
| "learning_rate": 6.235645533828349e-05, | |
| "loss": 0.2263, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 4.235294117647059, | |
| "grad_norm": 0.11032500118017197, | |
| "learning_rate": 6.157433464794716e-05, | |
| "loss": 0.2208, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 4.286445012787723, | |
| "grad_norm": 0.10448218137025833, | |
| "learning_rate": 6.078920983839031e-05, | |
| "loss": 0.2209, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 4.337595907928389, | |
| "grad_norm": 0.10621212422847748, | |
| "learning_rate": 6.0001284688802226e-05, | |
| "loss": 0.2239, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 4.388746803069053, | |
| "grad_norm": 0.09461436420679092, | |
| "learning_rate": 5.921076370520058e-05, | |
| "loss": 0.2253, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 4.439897698209719, | |
| "grad_norm": 0.09044703096151352, | |
| "learning_rate": 5.841785206735192e-05, | |
| "loss": 0.2234, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 4.491048593350383, | |
| "grad_norm": 0.114371657371521, | |
| "learning_rate": 5.762275557551727e-05, | |
| "loss": 0.2235, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 4.542199488491049, | |
| "grad_norm": 0.0894550010561943, | |
| "learning_rate": 5.682568059703659e-05, | |
| "loss": 0.223, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 4.593350383631714, | |
| "grad_norm": 0.09828157722949982, | |
| "learning_rate": 5.602683401276615e-05, | |
| "loss": 0.2223, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 4.6445012787723785, | |
| "grad_norm": 0.09371038526296616, | |
| "learning_rate": 5.522642316338268e-05, | |
| "loss": 0.2215, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 4.695652173913043, | |
| "grad_norm": 0.10780416429042816, | |
| "learning_rate": 5.442465579556793e-05, | |
| "loss": 0.2235, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 4.746803069053708, | |
| "grad_norm": 0.10167014598846436, | |
| "learning_rate": 5.3621740008088126e-05, | |
| "loss": 0.2227, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 4.797953964194374, | |
| "grad_norm": 0.09055500477552414, | |
| "learning_rate": 5.281788419778187e-05, | |
| "loss": 0.2228, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 4.849104859335038, | |
| "grad_norm": 0.0922444760799408, | |
| "learning_rate": 5.201329700547076e-05, | |
| "loss": 0.2233, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 4.900255754475703, | |
| "grad_norm": 0.09607625007629395, | |
| "learning_rate": 5.1208187261806615e-05, | |
| "loss": 0.2258, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 4.951406649616368, | |
| "grad_norm": 0.09127452969551086, | |
| "learning_rate": 5.0402763933069496e-05, | |
| "loss": 0.2233, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 0.17695656418800354, | |
| "learning_rate": 4.9597236066930516e-05, | |
| "loss": 0.224, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 5.051150895140665, | |
| "grad_norm": 0.10697818547487259, | |
| "learning_rate": 4.87918127381934e-05, | |
| "loss": 0.2221, | |
| "step": 990 | |
| }, | |
| { | |
| "epoch": 5.10230179028133, | |
| "grad_norm": 0.10833295434713364, | |
| "learning_rate": 4.798670299452926e-05, | |
| "loss": 0.2203, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 5.153452685421995, | |
| "grad_norm": 0.09862041473388672, | |
| "learning_rate": 4.7182115802218126e-05, | |
| "loss": 0.2216, | |
| "step": 1010 | |
| }, | |
| { | |
| "epoch": 5.20460358056266, | |
| "grad_norm": 0.10249131172895432, | |
| "learning_rate": 4.6378259991911886e-05, | |
| "loss": 0.223, | |
| "step": 1020 | |
| }, | |
| { | |
| "epoch": 5.255754475703325, | |
| "grad_norm": 0.11569483578205109, | |
| "learning_rate": 4.5575344204432084e-05, | |
| "loss": 0.2183, | |
| "step": 1030 | |
| }, | |
| { | |
| "epoch": 5.30690537084399, | |
| "grad_norm": 0.09916786849498749, | |
| "learning_rate": 4.477357683661734e-05, | |
| "loss": 0.2213, | |
| "step": 1040 | |
| }, | |
| { | |
| "epoch": 5.358056265984655, | |
| "grad_norm": 0.10792168974876404, | |
| "learning_rate": 4.397316598723385e-05, | |
| "loss": 0.2225, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 5.40920716112532, | |
| "grad_norm": 0.1023751050233841, | |
| "learning_rate": 4.317431940296343e-05, | |
| "loss": 0.2229, | |
| "step": 1060 | |
| }, | |
| { | |
| "epoch": 5.460358056265985, | |
| "grad_norm": 0.10447117686271667, | |
| "learning_rate": 4.237724442448273e-05, | |
| "loss": 0.2199, | |
| "step": 1070 | |
| }, | |
| { | |
| "epoch": 5.5115089514066495, | |
| "grad_norm": 0.10666926205158234, | |
| "learning_rate": 4.1582147932648074e-05, | |
| "loss": 0.2189, | |
| "step": 1080 | |
| }, | |
| { | |
| "epoch": 5.562659846547315, | |
| "grad_norm": 0.09775088727474213, | |
| "learning_rate": 4.078923629479943e-05, | |
| "loss": 0.2225, | |
| "step": 1090 | |
| }, | |
| { | |
| "epoch": 5.6138107416879794, | |
| "grad_norm": 0.09840010851621628, | |
| "learning_rate": 3.9998715311197785e-05, | |
| "loss": 0.2204, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 5.664961636828645, | |
| "grad_norm": 0.10039684176445007, | |
| "learning_rate": 3.92107901616097e-05, | |
| "loss": 0.2208, | |
| "step": 1110 | |
| }, | |
| { | |
| "epoch": 5.716112531969309, | |
| "grad_norm": 0.10499241203069687, | |
| "learning_rate": 3.842566535205286e-05, | |
| "loss": 0.2166, | |
| "step": 1120 | |
| }, | |
| { | |
| "epoch": 5.767263427109975, | |
| "grad_norm": 0.10286623984575272, | |
| "learning_rate": 3.764354466171652e-05, | |
| "loss": 0.2169, | |
| "step": 1130 | |
| }, | |
| { | |
| "epoch": 5.818414322250639, | |
| "grad_norm": 0.10527586936950684, | |
| "learning_rate": 3.6864631090070655e-05, | |
| "loss": 0.2206, | |
| "step": 1140 | |
| }, | |
| { | |
| "epoch": 5.869565217391305, | |
| "grad_norm": 0.09527456760406494, | |
| "learning_rate": 3.608912680417737e-05, | |
| "loss": 0.2197, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 5.920716112531969, | |
| "grad_norm": 0.09756523370742798, | |
| "learning_rate": 3.531723308621847e-05, | |
| "loss": 0.2178, | |
| "step": 1160 | |
| }, | |
| { | |
| "epoch": 5.971867007672635, | |
| "grad_norm": 0.11436358839273453, | |
| "learning_rate": 3.4549150281252636e-05, | |
| "loss": 0.2197, | |
| "step": 1170 | |
| }, | |
| { | |
| "epoch": 6.020460358056266, | |
| "grad_norm": 0.10808803141117096, | |
| "learning_rate": 3.3785077745215873e-05, | |
| "loss": 0.2204, | |
| "step": 1180 | |
| }, | |
| { | |
| "epoch": 6.071611253196931, | |
| "grad_norm": 0.10490832477807999, | |
| "learning_rate": 3.3025213793178646e-05, | |
| "loss": 0.2181, | |
| "step": 1190 | |
| }, | |
| { | |
| "epoch": 6.122762148337596, | |
| "grad_norm": 0.1026563048362732, | |
| "learning_rate": 3.226975564787322e-05, | |
| "loss": 0.2157, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 6.173913043478261, | |
| "grad_norm": 0.104311004281044, | |
| "learning_rate": 3.151889938850445e-05, | |
| "loss": 0.2185, | |
| "step": 1210 | |
| }, | |
| { | |
| "epoch": 6.225063938618926, | |
| "grad_norm": 0.10704346746206284, | |
| "learning_rate": 3.0772839899857464e-05, | |
| "loss": 0.2179, | |
| "step": 1220 | |
| }, | |
| { | |
| "epoch": 6.276214833759591, | |
| "grad_norm": 0.12297544628381729, | |
| "learning_rate": 3.003177082171523e-05, | |
| "loss": 0.216, | |
| "step": 1230 | |
| }, | |
| { | |
| "epoch": 6.327365728900256, | |
| "grad_norm": 0.11270362138748169, | |
| "learning_rate": 2.9295884498599414e-05, | |
| "loss": 0.2171, | |
| "step": 1240 | |
| }, | |
| { | |
| "epoch": 6.378516624040921, | |
| "grad_norm": 0.10795643925666809, | |
| "learning_rate": 2.8565371929847284e-05, | |
| "loss": 0.2172, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 6.429667519181586, | |
| "grad_norm": 0.10250139236450195, | |
| "learning_rate": 2.784042272003794e-05, | |
| "loss": 0.2167, | |
| "step": 1260 | |
| }, | |
| { | |
| "epoch": 6.4808184143222505, | |
| "grad_norm": 0.10449033230543137, | |
| "learning_rate": 2.712122502978024e-05, | |
| "loss": 0.2185, | |
| "step": 1270 | |
| }, | |
| { | |
| "epoch": 6.531969309462916, | |
| "grad_norm": 0.10160576552152634, | |
| "learning_rate": 2.64079655268759e-05, | |
| "loss": 0.2171, | |
| "step": 1280 | |
| }, | |
| { | |
| "epoch": 6.58312020460358, | |
| "grad_norm": 0.10214517265558243, | |
| "learning_rate": 2.57008293378697e-05, | |
| "loss": 0.2167, | |
| "step": 1290 | |
| }, | |
| { | |
| "epoch": 6.634271099744246, | |
| "grad_norm": 0.12187962979078293, | |
| "learning_rate": 2.500000000000001e-05, | |
| "loss": 0.2212, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 6.68542199488491, | |
| "grad_norm": 0.11656996607780457, | |
| "learning_rate": 2.430565941356157e-05, | |
| "loss": 0.2188, | |
| "step": 1310 | |
| }, | |
| { | |
| "epoch": 6.736572890025576, | |
| "grad_norm": 0.10884075611829758, | |
| "learning_rate": 2.361798779469336e-05, | |
| "loss": 0.2178, | |
| "step": 1320 | |
| }, | |
| { | |
| "epoch": 6.78772378516624, | |
| "grad_norm": 0.10546910017728806, | |
| "learning_rate": 2.2937163628603435e-05, | |
| "loss": 0.2163, | |
| "step": 1330 | |
| }, | |
| { | |
| "epoch": 6.838874680306906, | |
| "grad_norm": 0.10172303766012192, | |
| "learning_rate": 2.2263363623243054e-05, | |
| "loss": 0.2173, | |
| "step": 1340 | |
| }, | |
| { | |
| "epoch": 6.89002557544757, | |
| "grad_norm": 0.114231638610363, | |
| "learning_rate": 2.1596762663442218e-05, | |
| "loss": 0.2205, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 6.9411764705882355, | |
| "grad_norm": 0.1055825874209404, | |
| "learning_rate": 2.0937533765518187e-05, | |
| "loss": 0.2156, | |
| "step": 1360 | |
| }, | |
| { | |
| "epoch": 6.9923273657289, | |
| "grad_norm": 0.11378146708011627, | |
| "learning_rate": 2.0285848032369137e-05, | |
| "loss": 0.2149, | |
| "step": 1370 | |
| }, | |
| { | |
| "epoch": 7.040920716112532, | |
| "grad_norm": 0.12051114439964294, | |
| "learning_rate": 1.9641874609064443e-05, | |
| "loss": 0.2184, | |
| "step": 1380 | |
| }, | |
| { | |
| "epoch": 7.092071611253197, | |
| "grad_norm": 0.10353773832321167, | |
| "learning_rate": 1.9005780638942982e-05, | |
| "loss": 0.2143, | |
| "step": 1390 | |
| }, | |
| { | |
| "epoch": 7.143222506393862, | |
| "grad_norm": 0.11415048688650131, | |
| "learning_rate": 1.837773122023114e-05, | |
| "loss": 0.2155, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 7.194373401534527, | |
| "grad_norm": 0.10133373737335205, | |
| "learning_rate": 1.7757889363191483e-05, | |
| "loss": 0.2157, | |
| "step": 1410 | |
| }, | |
| { | |
| "epoch": 7.245524296675192, | |
| "grad_norm": 0.12083577364683151, | |
| "learning_rate": 1.714641594781347e-05, | |
| "loss": 0.2138, | |
| "step": 1420 | |
| }, | |
| { | |
| "epoch": 7.296675191815857, | |
| "grad_norm": 0.11394869536161423, | |
| "learning_rate": 1.6543469682057106e-05, | |
| "loss": 0.2142, | |
| "step": 1430 | |
| }, | |
| { | |
| "epoch": 7.3478260869565215, | |
| "grad_norm": 0.10500183701515198, | |
| "learning_rate": 1.5949207060660138e-05, | |
| "loss": 0.2156, | |
| "step": 1440 | |
| }, | |
| { | |
| "epoch": 7.398976982097187, | |
| "grad_norm": 0.11357243359088898, | |
| "learning_rate": 1.536378232452003e-05, | |
| "loss": 0.2161, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 7.450127877237851, | |
| "grad_norm": 0.11761381477117538, | |
| "learning_rate": 1.4787347420660541e-05, | |
| "loss": 0.2122, | |
| "step": 1460 | |
| }, | |
| { | |
| "epoch": 7.501278772378517, | |
| "grad_norm": 0.11960902810096741, | |
| "learning_rate": 1.422005196279395e-05, | |
| "loss": 0.2156, | |
| "step": 1470 | |
| }, | |
| { | |
| "epoch": 7.552429667519181, | |
| "grad_norm": 0.11095816642045975, | |
| "learning_rate": 1.3662043192488849e-05, | |
| "loss": 0.2141, | |
| "step": 1480 | |
| }, | |
| { | |
| "epoch": 7.603580562659847, | |
| "grad_norm": 0.12269174307584763, | |
| "learning_rate": 1.3113465940953495e-05, | |
| "loss": 0.2135, | |
| "step": 1490 | |
| }, | |
| { | |
| "epoch": 7.654731457800511, | |
| "grad_norm": 0.10242617130279541, | |
| "learning_rate": 1.257446259144494e-05, | |
| "loss": 0.2139, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 7.705882352941177, | |
| "grad_norm": 0.1224837675690651, | |
| "learning_rate": 1.204517304231343e-05, | |
| "loss": 0.2154, | |
| "step": 1510 | |
| }, | |
| { | |
| "epoch": 7.757033248081841, | |
| "grad_norm": 0.10968530178070068, | |
| "learning_rate": 1.1525734670691701e-05, | |
| "loss": 0.215, | |
| "step": 1520 | |
| }, | |
| { | |
| "epoch": 7.8081841432225065, | |
| "grad_norm": 0.11930900067090988, | |
| "learning_rate": 1.1016282296838887e-05, | |
| "loss": 0.2137, | |
| "step": 1530 | |
| }, | |
| { | |
| "epoch": 7.859335038363171, | |
| "grad_norm": 0.11679380387067795, | |
| "learning_rate": 1.0516948149147754e-05, | |
| "loss": 0.2156, | |
| "step": 1540 | |
| }, | |
| { | |
| "epoch": 7.910485933503836, | |
| "grad_norm": 0.11422905325889587, | |
| "learning_rate": 1.0027861829824952e-05, | |
| "loss": 0.2153, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 7.961636828644501, | |
| "grad_norm": 0.11838431656360626, | |
| "learning_rate": 9.549150281252633e-06, | |
| "loss": 0.2163, | |
| "step": 1560 | |
| }, | |
| { | |
| "epoch": 8.010230179028133, | |
| "grad_norm": 0.11641950905323029, | |
| "learning_rate": 9.080937753040646e-06, | |
| "loss": 0.2136, | |
| "step": 1570 | |
| }, | |
| { | |
| "epoch": 8.061381074168798, | |
| "grad_norm": 0.12204087525606155, | |
| "learning_rate": 8.623345769777514e-06, | |
| "loss": 0.2125, | |
| "step": 1580 | |
| }, | |
| { | |
| "epoch": 8.112531969309464, | |
| "grad_norm": 0.11705324053764343, | |
| "learning_rate": 8.176493099488663e-06, | |
| "loss": 0.2126, | |
| "step": 1590 | |
| }, | |
| { | |
| "epoch": 8.163682864450127, | |
| "grad_norm": 0.12480920553207397, | |
| "learning_rate": 7.740495722810271e-06, | |
| "loss": 0.2101, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 8.214833759590793, | |
| "grad_norm": 0.11484678834676743, | |
| "learning_rate": 7.315466802886401e-06, | |
| "loss": 0.2152, | |
| "step": 1610 | |
| }, | |
| { | |
| "epoch": 8.265984654731458, | |
| "grad_norm": 0.12149710208177567, | |
| "learning_rate": 6.901516655997536e-06, | |
| "loss": 0.2165, | |
| "step": 1620 | |
| }, | |
| { | |
| "epoch": 8.317135549872123, | |
| "grad_norm": 0.12600597739219666, | |
| "learning_rate": 6.498752722928042e-06, | |
| "loss": 0.2114, | |
| "step": 1630 | |
| }, | |
| { | |
| "epoch": 8.368286445012787, | |
| "grad_norm": 0.13292443752288818, | |
| "learning_rate": 6.107279541079769e-06, | |
| "loss": 0.2145, | |
| "step": 1640 | |
| }, | |
| { | |
| "epoch": 8.419437340153452, | |
| "grad_norm": 0.1156625747680664, | |
| "learning_rate": 5.727198717339511e-06, | |
| "loss": 0.2132, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 8.470588235294118, | |
| "grad_norm": 0.12792544066905975, | |
| "learning_rate": 5.358608901706802e-06, | |
| "loss": 0.213, | |
| "step": 1660 | |
| }, | |
| { | |
| "epoch": 8.521739130434783, | |
| "grad_norm": 0.1326821744441986, | |
| "learning_rate": 5.001605761689398e-06, | |
| "loss": 0.2095, | |
| "step": 1670 | |
| }, | |
| { | |
| "epoch": 8.572890025575447, | |
| "grad_norm": 0.1182461827993393, | |
| "learning_rate": 4.65628195747273e-06, | |
| "loss": 0.2123, | |
| "step": 1680 | |
| }, | |
| { | |
| "epoch": 8.624040920716112, | |
| "grad_norm": 0.11860256642103195, | |
| "learning_rate": 4.322727117869951e-06, | |
| "loss": 0.2099, | |
| "step": 1690 | |
| }, | |
| { | |
| "epoch": 8.675191815856778, | |
| "grad_norm": 0.12541323900222778, | |
| "learning_rate": 4.001027817058789e-06, | |
| "loss": 0.2133, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 8.726342710997443, | |
| "grad_norm": 0.11954925209283829, | |
| "learning_rate": 3.691267552111183e-06, | |
| "loss": 0.2141, | |
| "step": 1710 | |
| }, | |
| { | |
| "epoch": 8.777493606138107, | |
| "grad_norm": 0.11271259188652039, | |
| "learning_rate": 3.393526721321616e-06, | |
| "loss": 0.2143, | |
| "step": 1720 | |
| }, | |
| { | |
| "epoch": 8.828644501278772, | |
| "grad_norm": 0.11964483559131622, | |
| "learning_rate": 3.1078826033397843e-06, | |
| "loss": 0.2135, | |
| "step": 1730 | |
| }, | |
| { | |
| "epoch": 8.879795396419437, | |
| "grad_norm": 0.13187307119369507, | |
| "learning_rate": 2.8344093371128424e-06, | |
| "loss": 0.2123, | |
| "step": 1740 | |
| }, | |
| { | |
| "epoch": 8.930946291560103, | |
| "grad_norm": 0.13071340322494507, | |
| "learning_rate": 2.573177902642726e-06, | |
| "loss": 0.2129, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 8.982097186700766, | |
| "grad_norm": 0.1291477233171463, | |
| "learning_rate": 2.324256102563188e-06, | |
| "loss": 0.2124, | |
| "step": 1760 | |
| }, | |
| { | |
| "epoch": 9.030690537084398, | |
| "grad_norm": 0.12567947804927826, | |
| "learning_rate": 2.087708544541689e-06, | |
| "loss": 0.2144, | |
| "step": 1770 | |
| }, | |
| { | |
| "epoch": 9.081841432225064, | |
| "grad_norm": 0.12100923806428909, | |
| "learning_rate": 1.8635966245104664e-06, | |
| "loss": 0.2118, | |
| "step": 1780 | |
| }, | |
| { | |
| "epoch": 9.132992327365729, | |
| "grad_norm": 0.12870022654533386, | |
| "learning_rate": 1.6519785107311891e-06, | |
| "loss": 0.2146, | |
| "step": 1790 | |
| }, | |
| { | |
| "epoch": 9.184143222506394, | |
| "grad_norm": 0.12739935517311096, | |
| "learning_rate": 1.4529091286973995e-06, | |
| "loss": 0.2131, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 9.235294117647058, | |
| "grad_norm": 0.12040683627128601, | |
| "learning_rate": 1.2664401468786114e-06, | |
| "loss": 0.2118, | |
| "step": 1810 | |
| }, | |
| { | |
| "epoch": 9.286445012787723, | |
| "grad_norm": 0.12090786546468735, | |
| "learning_rate": 1.0926199633097157e-06, | |
| "loss": 0.2097, | |
| "step": 1820 | |
| }, | |
| { | |
| "epoch": 9.337595907928389, | |
| "grad_norm": 0.12753915786743164, | |
| "learning_rate": 9.314936930293283e-07, | |
| "loss": 0.2112, | |
| "step": 1830 | |
| }, | |
| { | |
| "epoch": 9.388746803069054, | |
| "grad_norm": 0.12532560527324677, | |
| "learning_rate": 7.83103156370113e-07, | |
| "loss": 0.2126, | |
| "step": 1840 | |
| }, | |
| { | |
| "epoch": 9.43989769820972, | |
| "grad_norm": 0.11527536064386368, | |
| "learning_rate": 6.474868681043578e-07, | |
| "loss": 0.2108, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 9.491048593350383, | |
| "grad_norm": 0.12571728229522705, | |
| "learning_rate": 5.246800274474439e-07, | |
| "loss": 0.2134, | |
| "step": 1860 | |
| }, | |
| { | |
| "epoch": 9.542199488491049, | |
| "grad_norm": 0.13035434484481812, | |
| "learning_rate": 4.1471450892189846e-07, | |
| "loss": 0.2099, | |
| "step": 1870 | |
| }, | |
| { | |
| "epoch": 9.593350383631714, | |
| "grad_norm": 0.12495685368776321, | |
| "learning_rate": 3.1761885408435054e-07, | |
| "loss": 0.2126, | |
| "step": 1880 | |
| }, | |
| { | |
| "epoch": 9.644501278772378, | |
| "grad_norm": 0.12562309205532074, | |
| "learning_rate": 2.334182641175686e-07, | |
| "loss": 0.2116, | |
| "step": 1890 | |
| }, | |
| { | |
| "epoch": 9.695652173913043, | |
| "grad_norm": 0.120052769780159, | |
| "learning_rate": 1.6213459328950352e-07, | |
| "loss": 0.212, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 9.746803069053708, | |
| "grad_norm": 0.1319722682237625, | |
| "learning_rate": 1.0378634328099269e-07, | |
| "loss": 0.2123, | |
| "step": 1910 | |
| }, | |
| { | |
| "epoch": 9.797953964194374, | |
| "grad_norm": 0.11803357303142548, | |
| "learning_rate": 5.838865838366792e-08, | |
| "loss": 0.2095, | |
| "step": 1920 | |
| }, | |
| { | |
| "epoch": 9.84910485933504, | |
| "grad_norm": 0.11567272990942001, | |
| "learning_rate": 2.595332156925534e-08, | |
| "loss": 0.2091, | |
| "step": 1930 | |
| }, | |
| { | |
| "epoch": 9.900255754475703, | |
| "grad_norm": 0.12867167592048645, | |
| "learning_rate": 6.488751431266149e-09, | |
| "loss": 0.2081, | |
| "step": 1940 | |
| }, | |
| { | |
| "epoch": 9.951406649616368, | |
| "grad_norm": 0.12587673962116241, | |
| "learning_rate": 0.0, | |
| "loss": 0.2119, | |
| "step": 1950 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1950, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 10, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.7341585074281513e+18, | |
| "train_batch_size": 64, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |